added better threads configuration estimator for the minMax, minMaxLoc,…

added better threads configuration estimator for the minMax, minMaxLoc, countNonZero functions in gpu module

added better threads configuration estimator for the minMax, minMaxLoc,…
added better threads configuration estimator for the minMax, minMaxLoc, countNonZero functions in gpu module
32a9b63d · Alexey Spizhevoy · 3fd07809 · 32a9b63d · 32a9b63d
Commit 32a9b63d authored Nov 30, 2010 by Alexey Spizhevoy
Hide whitespace changes
Inline Side-by-side

Showing with 51 additions and 51 deletions

arithm.cpp modules/gpu/src/arithm.cpp +8 -8

mathfunc.cu modules/gpu/src/cuda/mathfunc.cu +43 -43

No files found.
--- a/modules/gpu/src/arithm.cpp
+++ b/modules/gpu/src/arithm.cpp
@@ -496,7 +496,7 @@ Scalar cv::gpu::sum(const GpuMat& src)

 namespace cv { namespace gpu { namespace mathfunc { namespace minmax {

-    void get_buf_size_required(int elem_size, int& cols, int& rows);
+    void get_buf_size_required(int cols, int rows, int elem_size, int& bufcols, int& bufrows);
    
    template <typename T> 
    void min_max_caller(const DevMem2D src, double* minval, double* maxval, PtrStep buf);
@@ -551,7 +551,7 @@ void cv::gpu::minMax(const GpuMat& src, double* minVal, double* maxVal, const Gp
    double maxVal_; if (!maxVal) maxVal = &maxVal_;
    
    Size bufSize;
-    get_buf_size_required(src.elemSize(), bufSize.width, bufSize.height);
+    get_buf_size_required(src.cols, src.rows, src.elemSize(), bufSize.width, bufSize.height);
    buf.create(bufSize, CV_8U);

    if (mask.empty())
@@ -574,8 +574,8 @@ void cv::gpu::minMax(const GpuMat& src, double* minVal, double* maxVal, const Gp

 namespace cv { namespace gpu { namespace mathfunc { namespace minmaxloc {

-    void get_buf_size_required(int elem_size, int& b1cols, int& b1rows, 
-                               int& b2cols, int& b2rows);
+    void get_buf_size_required(int cols, int rows, int elem_size, int& b1cols, 
+                               int& b1rows, int& b2cols, int& b2rows);

    template <typename T> 
    void min_max_loc_caller(const DevMem2D src, double* minval, double* maxval, 
@@ -636,8 +636,8 @@ void cv::gpu::minMaxLoc(const GpuMat& src, double* minVal, double* maxVal, Point
    int maxLoc_[2];

    Size valbuf_size, locbuf_size;
-    get_buf_size_required(src.elemSize(), valbuf_size.width, valbuf_size.height, 
-                          locbuf_size.width, locbuf_size.height);
+    get_buf_size_required(src.cols, src.rows, src.elemSize(), valbuf_size.width, 
+                          valbuf_size.height, locbuf_size.width, locbuf_size.height);
    valbuf.create(valbuf_size, CV_8U);
    locbuf.create(locbuf_size, CV_8U);

@@ -663,7 +663,7 @@ void cv::gpu::minMaxLoc(const GpuMat& src, double* minVal, double* maxVal, Point

 namespace cv { namespace gpu { namespace mathfunc { namespace countnonzero {

-    void get_buf_size_required(int& cols, int& rows);
+    void get_buf_size_required(int cols, int rows, int& bufcols, int& bufrows);

    template <typename T> 
    int count_non_zero_caller(const DevMem2D src, PtrStep buf);
@@ -697,7 +697,7 @@ int cv::gpu::countNonZero(const GpuMat& src, GpuMat& buf)
    CV_Assert(src.type() != CV_64F || hasNativeDoubleSupport(getDevice()));

    Size buf_size;
-    get_buf_size_required(buf_size.width, buf_size.height);
+    get_buf_size_required(src.cols, src.rows, buf_size.width, buf_size.height);
    buf.create(buf_size, CV_8U);

    Caller caller = callers[hasAtomicsSupport(getDevice())][src.type()];

--- a/modules/gpu/src/cuda/mathfunc.cu
+++ b/modules/gpu/src/cuda/mathfunc.cu
@@ -425,25 +425,25 @@ namespace cv { namespace gpu { namespace mathfunc

    // Estimates good thread configuration
    //  - threads variable satisfies to threads.x * threads.y == 256
-    void estimate_thread_cfg(dim3& threads, dim3& grid)
+    void estimate_thread_cfg(int cols, int rows, dim3& threads, dim3& grid)
    {
-        threads = dim3(64, 4);
-        grid = dim3(6, 5);
+        threads = dim3(32, 8);
+        grid = dim3(divUp(cols, threads.x * 8), divUp(rows, threads.y * 32));
    }


    // Returns required buffer sizes
-    void get_buf_size_required(int elem_size, int& cols, int& rows)
+    void get_buf_size_required(int cols, int rows, int elem_size, int& bufcols, int& bufrows)
    {
        dim3 threads, grid;
-        estimate_thread_cfg(threads, grid);
-        cols = grid.x * grid.y * elem_size; 
-        rows = 2;
+        estimate_thread_cfg(cols, rows, threads, grid);
+        bufcols = grid.x * grid.y * elem_size; 
+        bufrows = 2;
    }


    // Estimates device constants which are used in the kernels using specified thread configuration
-    void estimate_kernel_consts(int cols, int rows, const dim3& threads, const dim3& grid)
+    void set_kernel_consts(int cols, int rows, const dim3& threads, const dim3& grid)
    {        
        int twidth = divUp(divUp(cols, grid.x), threads.x);
        int theight = divUp(divUp(rows, grid.y), threads.y);
@@ -567,8 +567,8 @@ namespace cv { namespace gpu { namespace mathfunc
    void min_max_mask_caller(const DevMem2D src, const PtrStep mask, double* minval, double* maxval, PtrStep buf)
    {
        dim3 threads, grid;
-        estimate_thread_cfg(threads, grid);
-        estimate_kernel_consts(src.cols, src.rows, threads, grid);
+        estimate_thread_cfg(src.cols, src.rows, threads, grid);
+        set_kernel_consts(src.cols, src.rows, threads, grid);

        T* minval_buf = (T*)buf.ptr(0);
        T* maxval_buf = (T*)buf.ptr(1);
@@ -596,8 +596,8 @@ namespace cv { namespace gpu { namespace mathfunc
    void min_max_caller(const DevMem2D src, double* minval, double* maxval, PtrStep buf)
    {
        dim3 threads, grid;
-        estimate_thread_cfg(threads, grid);
-        estimate_kernel_consts(src.cols, src.rows, threads, grid);
+        estimate_thread_cfg(src.cols, src.rows, threads, grid);
+        set_kernel_consts(src.cols, src.rows, threads, grid);

        T* minval_buf = (T*)buf.ptr(0);
        T* maxval_buf = (T*)buf.ptr(1);
@@ -650,8 +650,8 @@ namespace cv { namespace gpu { namespace mathfunc
    void min_max_mask_multipass_caller(const DevMem2D src, const PtrStep mask, double* minval, double* maxval, PtrStep buf)
    {
        dim3 threads, grid;
-        estimate_thread_cfg(threads, grid);
-        estimate_kernel_consts(src.cols, src.rows, threads, grid);
+        estimate_thread_cfg(src.cols, src.rows, threads, grid);
+        set_kernel_consts(src.cols, src.rows, threads, grid);

        T* minval_buf = (T*)buf.ptr(0);
        T* maxval_buf = (T*)buf.ptr(1);
@@ -679,8 +679,8 @@ namespace cv { namespace gpu { namespace mathfunc
    void min_max_multipass_caller(const DevMem2D src, double* minval, double* maxval, PtrStep buf)
    {
        dim3 threads, grid;
-        estimate_thread_cfg(threads, grid);
-        estimate_kernel_consts(src.cols, src.rows, threads, grid);
+        estimate_thread_cfg(src.cols, src.rows, threads, grid);
+        set_kernel_consts(src.cols, src.rows, threads, grid);

        T* minval_buf = (T*)buf.ptr(0);
        T* maxval_buf = (T*)buf.ptr(1);
@@ -719,19 +719,19 @@ namespace cv { namespace gpu { namespace mathfunc

    // Estimates good thread configuration
    //  - threads variable satisfies to threads.x * threads.y == 256
-    void estimate_thread_cfg(dim3& threads, dim3& grid)
+    void estimate_thread_cfg(int cols, int rows, dim3& threads, dim3& grid)
    {
-        threads = dim3(64, 4);
-        grid = dim3(6, 5);
+        threads = dim3(32, 8);
+        grid = dim3(divUp(cols, threads.x * 8), divUp(rows, threads.y * 32));
    }


    // Returns required buffer sizes
-    void get_buf_size_required(int elem_size, int& b1cols, int& b1rows, 
-                               int& b2cols, int& b2rows)
+    void get_buf_size_required(int cols, int rows, int elem_size, int& b1cols, 
+                               int& b1rows, int& b2cols, int& b2rows)
    {
        dim3 threads, grid;
-        estimate_thread_cfg(threads, grid);
+        estimate_thread_cfg(cols, rows, threads, grid);
        b1cols = grid.x * grid.y * elem_size; // For values
        b1rows = 2;
        b2cols = grid.x * grid.y * sizeof(int); // For locations
@@ -740,7 +740,7 @@ namespace cv { namespace gpu { namespace mathfunc


    // Estimates device constants which are used in the kernels using specified thread configuration
-    void estimate_kernel_consts(int cols, int rows, const dim3& threads, const dim3& grid)
+    void set_kernel_consts(int cols, int rows, const dim3& threads, const dim3& grid)
    {        
        int twidth = divUp(divUp(cols, grid.x), threads.x);
        int theight = divUp(divUp(rows, grid.y), threads.y);
@@ -886,8 +886,8 @@ namespace cv { namespace gpu { namespace mathfunc
                                 int minloc[2], int maxloc[2], PtrStep valbuf, PtrStep locbuf)
    {
        dim3 threads, grid;
-        estimate_thread_cfg(threads, grid);
-        estimate_kernel_consts(src.cols, src.rows, threads, grid);
+        estimate_thread_cfg(src.cols, src.rows, threads, grid);
+        set_kernel_consts(src.cols, src.rows, threads, grid);

        T* minval_buf = (T*)valbuf.ptr(0);
        T* maxval_buf = (T*)valbuf.ptr(1);
@@ -924,8 +924,8 @@ namespace cv { namespace gpu { namespace mathfunc
                            int minloc[2], int maxloc[2], PtrStep valbuf, PtrStep locbuf)
    {
        dim3 threads, grid;
-        estimate_thread_cfg(threads, grid);
-        estimate_kernel_consts(src.cols, src.rows, threads, grid);
+        estimate_thread_cfg(src.cols, src.rows, threads, grid);
+        set_kernel_consts(src.cols, src.rows, threads, grid);

        T* minval_buf = (T*)valbuf.ptr(0);
        T* maxval_buf = (T*)valbuf.ptr(1);
@@ -994,8 +994,8 @@ namespace cv { namespace gpu { namespace mathfunc
                                           int minloc[2], int maxloc[2], PtrStep valbuf, PtrStep locbuf)
    {
        dim3 threads, grid;
-        estimate_thread_cfg(threads, grid);
-        estimate_kernel_consts(src.cols, src.rows, threads, grid);
+        estimate_thread_cfg(src.cols, src.rows, threads, grid);
+        set_kernel_consts(src.cols, src.rows, threads, grid);

        T* minval_buf = (T*)valbuf.ptr(0);
        T* maxval_buf = (T*)valbuf.ptr(1);
@@ -1032,8 +1032,8 @@ namespace cv { namespace gpu { namespace mathfunc
                                      int minloc[2], int maxloc[2], PtrStep valbuf, PtrStep locbuf)
    {
        dim3 threads, grid;
-        estimate_thread_cfg(threads, grid);
-        estimate_kernel_consts(src.cols, src.rows, threads, grid);
+        estimate_thread_cfg(src.cols, src.rows, threads, grid);
+        set_kernel_consts(src.cols, src.rows, threads, grid);

        T* minval_buf = (T*)valbuf.ptr(0);
        T* maxval_buf = (T*)valbuf.ptr(1);
@@ -1077,23 +1077,23 @@ namespace cv { namespace gpu { namespace mathfunc

    __device__ unsigned int blocks_finished = 0;

-    void estimate_thread_cfg(dim3& threads, dim3& grid)
+    void estimate_thread_cfg(int cols, int rows, dim3& threads, dim3& grid)
    {
-        threads = dim3(64, 4);
-        grid = dim3(6, 5);
+        threads = dim3(32, 8);
+        grid = dim3(divUp(cols, threads.x * 8), divUp(rows, threads.y * 32));
    }


-    void get_buf_size_required(int& cols, int& rows)
+    void get_buf_size_required(int cols, int rows, int& bufcols, int& bufrows)
    {
        dim3 threads, grid;
-        estimate_thread_cfg(threads, grid);
-        cols = grid.x * grid.y * sizeof(int);
-        rows = 1;
+        estimate_thread_cfg(cols, rows, threads, grid);
+        bufcols = grid.x * grid.y * sizeof(int);
+        bufrows = 1;
    }


-    void estimate_kernel_consts(int cols, int rows, const dim3& threads, const dim3& grid)
+    void set_kernel_consts(int cols, int rows, const dim3& threads, const dim3& grid)
    {        
        int twidth = divUp(divUp(cols, grid.x), threads.x);
        int theight = divUp(divUp(rows, grid.y), threads.y);
@@ -1182,8 +1182,8 @@ namespace cv { namespace gpu { namespace mathfunc
    int count_non_zero_caller(const DevMem2D src, PtrStep buf)
    {
        dim3 threads, grid;
-        estimate_thread_cfg(threads, grid);
-        estimate_kernel_consts(src.cols, src.rows, threads, grid);
+        estimate_thread_cfg(src.cols, src.rows, threads, grid);
+        set_kernel_consts(src.cols, src.rows, threads, grid);

        unsigned int* count_buf = (unsigned int*)buf.ptr(0);

@@ -1226,8 +1226,8 @@ namespace cv { namespace gpu { namespace mathfunc
    int count_non_zero_multipass_caller(const DevMem2D src, PtrStep buf)
    {
        dim3 threads, grid;
-        estimate_thread_cfg(threads, grid);
-        estimate_kernel_consts(src.cols, src.rows, threads, grid);
+        estimate_thread_cfg(src.cols, src.rows, threads, grid);
+        set_kernel_consts(src.cols, src.rows, threads, grid);

        unsigned int* count_buf = (unsigned int*)buf.ptr(0);