Commit 32a9b63d authored by Alexey Spizhevoy's avatar Alexey Spizhevoy

added better threads configuration estimator for the minMax, minMaxLoc,…

added better threads configuration estimator for the minMax, minMaxLoc, countNonZero functions in gpu module
parent 3fd07809
......@@ -496,7 +496,7 @@ Scalar cv::gpu::sum(const GpuMat& src)
namespace cv { namespace gpu { namespace mathfunc { namespace minmax {
void get_buf_size_required(int elem_size, int& cols, int& rows);
void get_buf_size_required(int cols, int rows, int elem_size, int& bufcols, int& bufrows);
template <typename T>
void min_max_caller(const DevMem2D src, double* minval, double* maxval, PtrStep buf);
......@@ -551,7 +551,7 @@ void cv::gpu::minMax(const GpuMat& src, double* minVal, double* maxVal, const Gp
double maxVal_; if (!maxVal) maxVal = &maxVal_;
Size bufSize;
get_buf_size_required(src.elemSize(), bufSize.width, bufSize.height);
get_buf_size_required(src.cols, src.rows, src.elemSize(), bufSize.width, bufSize.height);
buf.create(bufSize, CV_8U);
if (mask.empty())
......@@ -574,8 +574,8 @@ void cv::gpu::minMax(const GpuMat& src, double* minVal, double* maxVal, const Gp
namespace cv { namespace gpu { namespace mathfunc { namespace minmaxloc {
void get_buf_size_required(int elem_size, int& b1cols, int& b1rows,
int& b2cols, int& b2rows);
void get_buf_size_required(int cols, int rows, int elem_size, int& b1cols,
int& b1rows, int& b2cols, int& b2rows);
template <typename T>
void min_max_loc_caller(const DevMem2D src, double* minval, double* maxval,
......@@ -636,8 +636,8 @@ void cv::gpu::minMaxLoc(const GpuMat& src, double* minVal, double* maxVal, Point
int maxLoc_[2];
Size valbuf_size, locbuf_size;
get_buf_size_required(src.elemSize(), valbuf_size.width, valbuf_size.height,
locbuf_size.width, locbuf_size.height);
get_buf_size_required(src.cols, src.rows, src.elemSize(), valbuf_size.width,
valbuf_size.height, locbuf_size.width, locbuf_size.height);
valbuf.create(valbuf_size, CV_8U);
locbuf.create(locbuf_size, CV_8U);
......@@ -663,7 +663,7 @@ void cv::gpu::minMaxLoc(const GpuMat& src, double* minVal, double* maxVal, Point
namespace cv { namespace gpu { namespace mathfunc { namespace countnonzero {
void get_buf_size_required(int& cols, int& rows);
void get_buf_size_required(int cols, int rows, int& bufcols, int& bufrows);
template <typename T>
int count_non_zero_caller(const DevMem2D src, PtrStep buf);
......@@ -697,7 +697,7 @@ int cv::gpu::countNonZero(const GpuMat& src, GpuMat& buf)
CV_Assert(src.type() != CV_64F || hasNativeDoubleSupport(getDevice()));
Size buf_size;
get_buf_size_required(buf_size.width, buf_size.height);
get_buf_size_required(src.cols, src.rows, buf_size.width, buf_size.height);
buf.create(buf_size, CV_8U);
Caller caller = callers[hasAtomicsSupport(getDevice())][src.type()];
......
......@@ -425,25 +425,25 @@ namespace cv { namespace gpu { namespace mathfunc
// Estimates good thread configuration
// - threads variable satisfies to threads.x * threads.y == 256
void estimate_thread_cfg(dim3& threads, dim3& grid)
void estimate_thread_cfg(int cols, int rows, dim3& threads, dim3& grid)
{
threads = dim3(64, 4);
grid = dim3(6, 5);
threads = dim3(32, 8);
grid = dim3(divUp(cols, threads.x * 8), divUp(rows, threads.y * 32));
}
// Returns required buffer sizes
void get_buf_size_required(int elem_size, int& cols, int& rows)
void get_buf_size_required(int cols, int rows, int elem_size, int& bufcols, int& bufrows)
{
dim3 threads, grid;
estimate_thread_cfg(threads, grid);
cols = grid.x * grid.y * elem_size;
rows = 2;
estimate_thread_cfg(cols, rows, threads, grid);
bufcols = grid.x * grid.y * elem_size;
bufrows = 2;
}
// Estimates device constants which are used in the kernels using specified thread configuration
void estimate_kernel_consts(int cols, int rows, const dim3& threads, const dim3& grid)
void set_kernel_consts(int cols, int rows, const dim3& threads, const dim3& grid)
{
int twidth = divUp(divUp(cols, grid.x), threads.x);
int theight = divUp(divUp(rows, grid.y), threads.y);
......@@ -567,8 +567,8 @@ namespace cv { namespace gpu { namespace mathfunc
void min_max_mask_caller(const DevMem2D src, const PtrStep mask, double* minval, double* maxval, PtrStep buf)
{
dim3 threads, grid;
estimate_thread_cfg(threads, grid);
estimate_kernel_consts(src.cols, src.rows, threads, grid);
estimate_thread_cfg(src.cols, src.rows, threads, grid);
set_kernel_consts(src.cols, src.rows, threads, grid);
T* minval_buf = (T*)buf.ptr(0);
T* maxval_buf = (T*)buf.ptr(1);
......@@ -596,8 +596,8 @@ namespace cv { namespace gpu { namespace mathfunc
void min_max_caller(const DevMem2D src, double* minval, double* maxval, PtrStep buf)
{
dim3 threads, grid;
estimate_thread_cfg(threads, grid);
estimate_kernel_consts(src.cols, src.rows, threads, grid);
estimate_thread_cfg(src.cols, src.rows, threads, grid);
set_kernel_consts(src.cols, src.rows, threads, grid);
T* minval_buf = (T*)buf.ptr(0);
T* maxval_buf = (T*)buf.ptr(1);
......@@ -650,8 +650,8 @@ namespace cv { namespace gpu { namespace mathfunc
void min_max_mask_multipass_caller(const DevMem2D src, const PtrStep mask, double* minval, double* maxval, PtrStep buf)
{
dim3 threads, grid;
estimate_thread_cfg(threads, grid);
estimate_kernel_consts(src.cols, src.rows, threads, grid);
estimate_thread_cfg(src.cols, src.rows, threads, grid);
set_kernel_consts(src.cols, src.rows, threads, grid);
T* minval_buf = (T*)buf.ptr(0);
T* maxval_buf = (T*)buf.ptr(1);
......@@ -679,8 +679,8 @@ namespace cv { namespace gpu { namespace mathfunc
void min_max_multipass_caller(const DevMem2D src, double* minval, double* maxval, PtrStep buf)
{
dim3 threads, grid;
estimate_thread_cfg(threads, grid);
estimate_kernel_consts(src.cols, src.rows, threads, grid);
estimate_thread_cfg(src.cols, src.rows, threads, grid);
set_kernel_consts(src.cols, src.rows, threads, grid);
T* minval_buf = (T*)buf.ptr(0);
T* maxval_buf = (T*)buf.ptr(1);
......@@ -719,19 +719,19 @@ namespace cv { namespace gpu { namespace mathfunc
// Estimates good thread configuration
// - threads variable satisfies to threads.x * threads.y == 256
void estimate_thread_cfg(dim3& threads, dim3& grid)
void estimate_thread_cfg(int cols, int rows, dim3& threads, dim3& grid)
{
threads = dim3(64, 4);
grid = dim3(6, 5);
threads = dim3(32, 8);
grid = dim3(divUp(cols, threads.x * 8), divUp(rows, threads.y * 32));
}
// Returns required buffer sizes
void get_buf_size_required(int elem_size, int& b1cols, int& b1rows,
int& b2cols, int& b2rows)
void get_buf_size_required(int cols, int rows, int elem_size, int& b1cols,
int& b1rows, int& b2cols, int& b2rows)
{
dim3 threads, grid;
estimate_thread_cfg(threads, grid);
estimate_thread_cfg(cols, rows, threads, grid);
b1cols = grid.x * grid.y * elem_size; // For values
b1rows = 2;
b2cols = grid.x * grid.y * sizeof(int); // For locations
......@@ -740,7 +740,7 @@ namespace cv { namespace gpu { namespace mathfunc
// Estimates device constants which are used in the kernels using specified thread configuration
void estimate_kernel_consts(int cols, int rows, const dim3& threads, const dim3& grid)
void set_kernel_consts(int cols, int rows, const dim3& threads, const dim3& grid)
{
int twidth = divUp(divUp(cols, grid.x), threads.x);
int theight = divUp(divUp(rows, grid.y), threads.y);
......@@ -886,8 +886,8 @@ namespace cv { namespace gpu { namespace mathfunc
int minloc[2], int maxloc[2], PtrStep valbuf, PtrStep locbuf)
{
dim3 threads, grid;
estimate_thread_cfg(threads, grid);
estimate_kernel_consts(src.cols, src.rows, threads, grid);
estimate_thread_cfg(src.cols, src.rows, threads, grid);
set_kernel_consts(src.cols, src.rows, threads, grid);
T* minval_buf = (T*)valbuf.ptr(0);
T* maxval_buf = (T*)valbuf.ptr(1);
......@@ -924,8 +924,8 @@ namespace cv { namespace gpu { namespace mathfunc
int minloc[2], int maxloc[2], PtrStep valbuf, PtrStep locbuf)
{
dim3 threads, grid;
estimate_thread_cfg(threads, grid);
estimate_kernel_consts(src.cols, src.rows, threads, grid);
estimate_thread_cfg(src.cols, src.rows, threads, grid);
set_kernel_consts(src.cols, src.rows, threads, grid);
T* minval_buf = (T*)valbuf.ptr(0);
T* maxval_buf = (T*)valbuf.ptr(1);
......@@ -994,8 +994,8 @@ namespace cv { namespace gpu { namespace mathfunc
int minloc[2], int maxloc[2], PtrStep valbuf, PtrStep locbuf)
{
dim3 threads, grid;
estimate_thread_cfg(threads, grid);
estimate_kernel_consts(src.cols, src.rows, threads, grid);
estimate_thread_cfg(src.cols, src.rows, threads, grid);
set_kernel_consts(src.cols, src.rows, threads, grid);
T* minval_buf = (T*)valbuf.ptr(0);
T* maxval_buf = (T*)valbuf.ptr(1);
......@@ -1032,8 +1032,8 @@ namespace cv { namespace gpu { namespace mathfunc
int minloc[2], int maxloc[2], PtrStep valbuf, PtrStep locbuf)
{
dim3 threads, grid;
estimate_thread_cfg(threads, grid);
estimate_kernel_consts(src.cols, src.rows, threads, grid);
estimate_thread_cfg(src.cols, src.rows, threads, grid);
set_kernel_consts(src.cols, src.rows, threads, grid);
T* minval_buf = (T*)valbuf.ptr(0);
T* maxval_buf = (T*)valbuf.ptr(1);
......@@ -1077,23 +1077,23 @@ namespace cv { namespace gpu { namespace mathfunc
__device__ unsigned int blocks_finished = 0;
void estimate_thread_cfg(dim3& threads, dim3& grid)
void estimate_thread_cfg(int cols, int rows, dim3& threads, dim3& grid)
{
threads = dim3(64, 4);
grid = dim3(6, 5);
threads = dim3(32, 8);
grid = dim3(divUp(cols, threads.x * 8), divUp(rows, threads.y * 32));
}
void get_buf_size_required(int& cols, int& rows)
void get_buf_size_required(int cols, int rows, int& bufcols, int& bufrows)
{
dim3 threads, grid;
estimate_thread_cfg(threads, grid);
cols = grid.x * grid.y * sizeof(int);
rows = 1;
estimate_thread_cfg(cols, rows, threads, grid);
bufcols = grid.x * grid.y * sizeof(int);
bufrows = 1;
}
void estimate_kernel_consts(int cols, int rows, const dim3& threads, const dim3& grid)
void set_kernel_consts(int cols, int rows, const dim3& threads, const dim3& grid)
{
int twidth = divUp(divUp(cols, grid.x), threads.x);
int theight = divUp(divUp(rows, grid.y), threads.y);
......@@ -1182,8 +1182,8 @@ namespace cv { namespace gpu { namespace mathfunc
int count_non_zero_caller(const DevMem2D src, PtrStep buf)
{
dim3 threads, grid;
estimate_thread_cfg(threads, grid);
estimate_kernel_consts(src.cols, src.rows, threads, grid);
estimate_thread_cfg(src.cols, src.rows, threads, grid);
set_kernel_consts(src.cols, src.rows, threads, grid);
unsigned int* count_buf = (unsigned int*)buf.ptr(0);
......@@ -1226,8 +1226,8 @@ namespace cv { namespace gpu { namespace mathfunc
int count_non_zero_multipass_caller(const DevMem2D src, PtrStep buf)
{
dim3 threads, grid;
estimate_thread_cfg(threads, grid);
estimate_kernel_consts(src.cols, src.rows, threads, grid);
estimate_thread_cfg(src.cols, src.rows, threads, grid);
set_kernel_consts(src.cols, src.rows, threads, grid);
unsigned int* count_buf = (unsigned int*)buf.ptr(0);
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment