Commit fa5422a2 authored by Alexey Spizhevoy

added buffer support to gpu::minMax

parent b7e9c622
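For orientation, a minimal usage sketch of the overload this commit adds (hypothetical caller code, not part of the patch): keeping one GpuMat around as a work buffer lets repeated minMax calls reuse a single device allocation instead of allocating internal buffers on every call.

    cv::gpu::GpuMat frame;               // device image, filled elsewhere
    cv::gpu::GpuMat buf;                 // reusable work buffer; minMax creates/resizes it as needed
    double minVal = 0, maxVal = 0;
    for (int i = 0; i < 100; ++i)
    {
        // ... upload or compute the next frame into `frame` ...
        cv::gpu::minMax(frame, &minVal, &maxVal, buf);   // buffer is reused after the first call
    }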
@@ -424,6 +424,9 @@ namespace cv
     //! finds global minimum and maximum array elements and returns their values
     CV_EXPORTS void minMax(const GpuMat& src, double* minVal, double* maxVal=0);
+    //! finds global minimum and maximum array elements and returns their values
+    CV_EXPORTS void minMax(const GpuMat& src, double* minVal, double* maxVal, GpuMat& buf);
     //! finds global minimum and maximum array elements and returns their values with locations
     CV_EXPORTS void minMaxLoc(const GpuMat& src, double* minVal, double* maxVal=0, Point* minLoc=0, Point* maxLoc=0);
...
@@ -66,6 +66,7 @@ double cv::gpu::norm(const GpuMat&, const GpuMat&, int) { throw_nogpu(); return
 void cv::gpu::flip(const GpuMat&, GpuMat&, int) { throw_nogpu(); }
 Scalar cv::gpu::sum(const GpuMat&) { throw_nogpu(); return Scalar(); }
 void cv::gpu::minMax(const GpuMat&, double*, double*) { throw_nogpu(); }
+void cv::gpu::minMax(const GpuMat&, double*, double*, GpuMat&) { throw_nogpu(); }
 void cv::gpu::minMaxLoc(const GpuMat&, double*, double*, Point*, Point*) { throw_nogpu(); }
 void cv::gpu::LUT(const GpuMat&, const Mat&, GpuMat&) { throw_nogpu(); }
 void cv::gpu::exp(const GpuMat&, GpuMat&) { throw_nogpu(); }
...
@@ -492,20 +493,24 @@ Scalar cv::gpu::sum(const GpuMat& src)
 namespace cv { namespace gpu { namespace mathfunc { namespace minmax {
-    void get_buf_size_required(int elem_size, int& b1cols, int& b1rows,
-                               int& b2cols, int& b2rows);
+    void get_buf_size_required(int elem_size, int& cols, int& rows);
 
     template <typename T>
-    void min_max_caller(const DevMem2D src, double* minval, double* maxval,
-                        unsigned char* minval_buf, unsigned char* maxval_buf);
+    void min_max_caller(const DevMem2D src, double* minval, double* maxval, PtrStep buf);
 
     template <typename T>
-    void min_max_caller_2steps(const DevMem2D src, double* minval, double* maxval,
-                               unsigned char* minval_buf, unsigned char* maxval_buf);
+    void min_max_caller_2steps(const DevMem2D src, double* minval, double* maxval, PtrStep buf);
 }}}}
 
 void cv::gpu::minMax(const GpuMat& src, double* minVal, double* maxVal)
+{
+    GpuMat buf;
+    minMax(src, minVal, maxVal, buf);
+}
+
+void cv::gpu::minMax(const GpuMat& src, double* minVal, double* maxVal, GpuMat& buf)
 {
     using namespace mathfunc::minmax;
@@ -513,26 +518,25 @@ void cv::gpu::minMax(const GpuMat& src, double* minVal, double* maxVal)
     if (!maxVal) maxVal = &maxVal_;
 
     GpuMat src_ = src.reshape(1);
 
-    // Allocate GPU buffers
-    Size b1size, b2size;
-    get_buf_size_required(src.elemSize(), b1size.width, b1size.height, b2size.width, b2size.height);
-    GpuMat b1(b1size, CV_8U), b2(b2size, CV_8U);
+    Size bufSize;
+    get_buf_size_required(src.elemSize(), bufSize.width, bufSize.height);
+    buf.create(bufSize, CV_8U);
 
     int major, minor;
     getComputeCapability(getDevice(), major, minor);
 
     if (major >= 1 && minor >= 1)
     {
         switch (src_.type())
         {
-        case CV_8U: min_max_caller<unsigned char>(src_, minVal, maxVal, b1.data, b2.data); break;
-        case CV_8S: min_max_caller<signed char>(src_, minVal, maxVal, b1.data, b2.data); break;
-        case CV_16U: min_max_caller<unsigned short>(src_, minVal, maxVal, b1.data, b2.data); break;
-        case CV_16S: min_max_caller<signed short>(src_, minVal, maxVal, b1.data, b2.data); break;
-        case CV_32S: min_max_caller<int>(src_, minVal, maxVal, b1.data, b2.data); break;
-        case CV_32F: min_max_caller<float>(src_, minVal, maxVal, b1.data, b2.data); break;
-        case CV_64F: min_max_caller<double>(src_, minVal, maxVal, b1.data, b2.data); break;
+        case CV_8U: min_max_caller<unsigned char>(src_, minVal, maxVal, buf); break;
+        case CV_8S: min_max_caller<signed char>(src_, minVal, maxVal, buf); break;
+        case CV_16U: min_max_caller<unsigned short>(src_, minVal, maxVal, buf); break;
+        case CV_16S: min_max_caller<signed short>(src_, minVal, maxVal, buf); break;
+        case CV_32S: min_max_caller<int>(src_, minVal, maxVal, buf); break;
+        case CV_32F: min_max_caller<float>(src_, minVal, maxVal, buf); break;
+        case CV_64F: min_max_caller<double>(src_, minVal, maxVal, buf); break;
        default: CV_Error(CV_StsBadArg, "Unsupported type");
         }
     }
@@ -540,12 +544,12 @@ void cv::gpu::minMax(const GpuMat& src, double* minVal, double* maxVal)
     {
         switch (src_.type())
         {
-        case CV_8U: min_max_caller_2steps<unsigned char>(src_, minVal, maxVal, b1.data, b2.data); break;
-        case CV_8S: min_max_caller_2steps<signed char>(src_, minVal, maxVal, b1.data, b2.data); break;
-        case CV_16U: min_max_caller_2steps<unsigned short>(src_, minVal, maxVal, b1.data, b2.data); break;
-        case CV_16S: min_max_caller_2steps<signed short>(src_, minVal, maxVal, b1.data, b2.data); break;
-        case CV_32S: min_max_caller_2steps<int>(src_, minVal, maxVal, b1.data, b2.data); break;
-        case CV_32F: min_max_caller_2steps<float>(src_, minVal, maxVal, b1.data, b2.data); break;
+        case CV_8U: min_max_caller_2steps<unsigned char>(src_, minVal, maxVal, buf); break;
+        case CV_8S: min_max_caller_2steps<signed char>(src_, minVal, maxVal, buf); break;
+        case CV_16U: min_max_caller_2steps<unsigned short>(src_, minVal, maxVal, buf); break;
+        case CV_16S: min_max_caller_2steps<signed short>(src_, minVal, maxVal, buf); break;
+        case CV_32S: min_max_caller_2steps<int>(src_, minVal, maxVal, buf); break;
+        case CV_32F: min_max_caller_2steps<float>(src_, minVal, maxVal, buf); break;
         default: CV_Error(CV_StsBadArg, "Unsupported type");
         }
     }
...
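A note on the buffer contract implied by the code above (my reading of the diff, not wording from the commit): get_buf_size_required now describes a single CV_8U buffer of two rows, each grid.x * grid.y * elem_size bytes wide, which the new overload allocates once via buf.create(bufSize, CV_8U).

    // Assumed layout of `buf` as consumed by the CUDA-side callers shown later in the diff:
    //   row 0 -> (T*)buf.ptr(0) : one partial minimum per CUDA block
    //   row 1 -> (T*)buf.ptr(1) : one partial maximum per CUDA block
    // In the (major >= 1 && minor >= 1) branch, min_max_kernel apparently reduces these partials
    // on the device (coordinated through the blocks_finished counter), so the host copies back
    // only element 0 of each row; otherwise min_max_caller_2steps launches
    // min_max_kernel_2ndstep<<<1, 1>>> to perform that final reduction before the read-back.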
@@ -401,7 +401,7 @@ namespace cv { namespace gpu { namespace mathfunc
     //////////////////////////////////////////////////////////////////////////////
     // Min max
 
-    // To avoid shared banck confilict we convert reach value into value of
+    // To avoid shared bank conflicts we convert each value into value of
     // appropriate type (32 bits minimum)
     template <typename T> struct MinMaxTypeTraits {};
     template <> struct MinMaxTypeTraits<unsigned char> { typedef int best_type; };
@@ -423,6 +423,10 @@ namespace cv { namespace gpu { namespace mathfunc
     static const unsigned int czero = 0;
 
+    // Global counter of blocks that have finished their work
+    __device__ unsigned int blocks_finished;
+
     // Estimates good thread configuration
     // - threads variable satisfies to threads.x * threads.y == 256
     void estimate_thread_cfg(dim3& threads, dim3& grid)
@@ -431,15 +435,17 @@ namespace cv { namespace gpu { namespace mathfunc
         grid = dim3(6, 5);
     }
 
     // Returns required buffer sizes
-    void get_buf_size_required(int elem_size, int& b1cols, int& b1rows, int& b2cols, int& b2rows)
+    void get_buf_size_required(int elem_size, int& cols, int& rows)
     {
         dim3 threads, grid;
         estimate_thread_cfg(threads, grid);
-        b1cols = grid.x * grid.y * elem_size; b1rows = 1;
-        b2cols = grid.x * grid.y * elem_size; b2rows = 1;
+        cols = grid.x * grid.y * elem_size;
+        rows = 2;
     }
 
     // Estimates device constants which are used in the kernels using specified thread configuration
     void estimate_kernel_consts(int cols, int rows, const dim3& threads, const dim3& grid)
     {
@@ -449,6 +455,7 @@ namespace cv { namespace gpu { namespace mathfunc
         cudaSafeCall(cudaMemcpyToSymbol(ctheight, &theight, sizeof(ctheight)));
     }
 
+
     // Does min and max in shared memory
     template <typename T>
     __device__ void merge(unsigned int tid, unsigned int offset, volatile T* minval, volatile T* maxval)
@@ -457,8 +464,6 @@ namespace cv { namespace gpu { namespace mathfunc
         maxval[tid] = max(maxval[tid], maxval[tid + offset]);
     }
 
-    // Global counter of blocks finished its work
-    __device__ unsigned int blocks_finished;
-
     template <int nthreads, typename T>
     __global__ void min_max_kernel(int cols, int rows, const PtrStep src, T* minval, T* maxval)
@@ -535,33 +540,19 @@ namespace cv { namespace gpu { namespace mathfunc
 #endif
     }
 
-    // This kernel will be used only when compute capability is 1.0
-    template <typename T>
-    __global__ void min_max_kernel_2ndstep(T* minval, T* maxval, int size)
-    {
-        T val;
-        T mymin = numeric_limits_gpu<T>::max();
-        T mymax = numeric_limits_gpu<T>::min();
-        for (unsigned int i = 0; i < size; ++i)
-        {
-            val = minval[i]; if (val < mymin) mymin = val;
-            val = maxval[i]; if (val > mymax) mymax = val;
-        }
-        minval[0] = mymin;
-        maxval[0] = mymax;
-    }
-
     template <typename T>
-    void min_max_caller(const DevMem2D src, double* minval, double* maxval,
-                        unsigned char* minval_buf, unsigned char* maxval_buf)
+    void min_max_caller(const DevMem2D src, double* minval, double* maxval, PtrStep buf)
     {
         dim3 threads, grid;
         estimate_thread_cfg(threads, grid);
         estimate_kernel_consts(src.cols, src.rows, threads, grid);
 
-        cudaSafeCall(cudaMemcpyToSymbol(blocks_finished, &czero, sizeof(blocks_finished)));
-        min_max_kernel<256, T><<<grid, threads>>>(src.cols, src.rows, src, (T*)minval_buf, (T*)maxval_buf);
+        T* minval_buf = (T*)buf.ptr(0);
+        T* maxval_buf = (T*)buf.ptr(1);
+
+        cudaSafeCall(cudaMemcpyToSymbol(blocks_finished, &czero, sizeof(blocks_finished)));
+        min_max_kernel<256, T><<<grid, threads>>>(src.cols, src.rows, src, minval_buf, maxval_buf);
         cudaSafeCall(cudaThreadSynchronize());
 
         T minval_, maxval_;
@@ -569,19 +560,47 @@ namespace cv { namespace gpu { namespace mathfunc
         cudaSafeCall(cudaMemcpy(&maxval_, maxval_buf, sizeof(T), cudaMemcpyDeviceToHost));
         *minval = minval_;
         *maxval = maxval_;
+    }
+
+    template void min_max_caller<unsigned char>(const DevMem2D, double*, double*, PtrStep);
+    template void min_max_caller<signed char>(const DevMem2D, double*, double*, PtrStep);
+    template void min_max_caller<unsigned short>(const DevMem2D, double*, double*, PtrStep);
+    template void min_max_caller<signed short>(const DevMem2D, double*, double*, PtrStep);
+    template void min_max_caller<int>(const DevMem2D, double*, double*, PtrStep);
+    template void min_max_caller<float>(const DevMem2D, double*, double*, PtrStep);
+    template void min_max_caller<double>(const DevMem2D, double*, double*, PtrStep);
+
+    // This kernel will be used only when compute capability is 1.0
+    template <typename T>
+    __global__ void min_max_kernel_2ndstep(T* minval, T* maxval, int size)
+    {
+        T val;
+        T mymin = numeric_limits_gpu<T>::max();
+        T mymax = numeric_limits_gpu<T>::min();
+        for (unsigned int i = 0; i < size; ++i)
+        {
+            val = minval[i]; if (val < mymin) mymin = val;
+            val = maxval[i]; if (val > mymax) mymax = val;
+        }
+        minval[0] = mymin;
+        maxval[0] = mymax;
     }
 
     template <typename T>
-    void min_max_caller_2steps(const DevMem2D src, double* minval, double* maxval,
-                               unsigned char* minval_buf, unsigned char* maxval_buf)
+    void min_max_caller_2steps(const DevMem2D src, double* minval, double* maxval, PtrStep buf)
     {
         dim3 threads, grid;
         estimate_thread_cfg(threads, grid);
         estimate_kernel_consts(src.cols, src.rows, threads, grid);
 
+        T* minval_buf = (T*)buf.ptr(0);
+        T* maxval_buf = (T*)buf.ptr(1);
+
         cudaSafeCall(cudaMemcpyToSymbol(blocks_finished, &czero, sizeof(blocks_finished)));
-        min_max_kernel<256, T><<<grid, threads>>>(src.cols, src.rows, src, (T*)minval_buf, (T*)maxval_buf);
-        min_max_kernel_2ndstep<T><<<1, 1>>>((T*)minval_buf, (T*)maxval_buf, grid.x * grid.y);
+        min_max_kernel<256, T><<<grid, threads>>>(src.cols, src.rows, src, minval_buf, maxval_buf);
+        min_max_kernel_2ndstep<T><<<1, 1>>>(minval_buf, maxval_buf, grid.x * grid.y);
         cudaSafeCall(cudaThreadSynchronize());
 
         T minval_, maxval_;
@@ -591,23 +610,16 @@ namespace cv { namespace gpu { namespace mathfunc
         *maxval = maxval_;
     }
 
-    template void min_max_caller<unsigned char>(const DevMem2D, double*, double*, unsigned char*, unsigned char*);
-    template void min_max_caller<signed char>(const DevMem2D, double*, double*, unsigned char*, unsigned char*);
-    template void min_max_caller<unsigned short>(const DevMem2D, double*, double*, unsigned char*, unsigned char*);
-    template void min_max_caller<signed short>(const DevMem2D, double*, double*, unsigned char*, unsigned char*);
-    template void min_max_caller<int>(const DevMem2D, double*, double*, unsigned char*, unsigned char*);
-    template void min_max_caller<float>(const DevMem2D, double*, double*, unsigned char*, unsigned char*);
-    template void min_max_caller<double>(const DevMem2D, double*, double*, unsigned char*, unsigned char*);
-    template void min_max_caller_2steps<unsigned char>(const DevMem2D, double*, double*, unsigned char*, unsigned char*);
-    template void min_max_caller_2steps<signed char>(const DevMem2D, double*, double*, unsigned char*, unsigned char*);
-    template void min_max_caller_2steps<unsigned short>(const DevMem2D, double*, double*, unsigned char*, unsigned char*);
-    template void min_max_caller_2steps<signed short>(const DevMem2D, double*, double*, unsigned char*, unsigned char*);
-    template void min_max_caller_2steps<int>(const DevMem2D, double*, double*, unsigned char*, unsigned char*);
-    template void min_max_caller_2steps<float>(const DevMem2D, double*, double*, unsigned char*, unsigned char*);
+    template void min_max_caller_2steps<unsigned char>(const DevMem2D, double*, double*, PtrStep);
+    template void min_max_caller_2steps<signed char>(const DevMem2D, double*, double*, PtrStep);
+    template void min_max_caller_2steps<unsigned short>(const DevMem2D, double*, double*, PtrStep);
+    template void min_max_caller_2steps<signed short>(const DevMem2D, double*, double*, PtrStep);
+    template void min_max_caller_2steps<int>(const DevMem2D, double*, double*, PtrStep);
+    template void min_max_caller_2steps<float>(const DevMem2D, double*, double*, PtrStep);
 
     } // namespace minmax
 
     namespace minmaxloc {
 
     template <typename T, int op> struct OptLoc {};
...
@@ -676,6 +676,8 @@ struct CV_GpuMinMaxTest: public CvTest
 {
     CV_GpuMinMaxTest(): CvTest("GPU-MinMaxTest", "minMax") {}
 
+    cv::gpu::GpuMat buf;
+
     void run(int)
     {
         int depth_end;
@@ -732,7 +734,7 @@ struct CV_GpuMinMaxTest: public CvTest
         double minVal_, maxVal_;
         cv::Point minLoc_, maxLoc_;
-        cv::gpu::minMax(cv::gpu::GpuMat(src), &minVal_, &maxVal_);
+        cv::gpu::minMax(cv::gpu::GpuMat(src), &minVal_, &maxVal_, buf);
         if (abs(minVal - minVal_) > 1e-3f)
         {
...
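Making buf a member of CV_GpuMinMaxTest rather than a local variable means the repeated minMax calls in run() share one device buffer, so the test also exercises the reuse path of the new overload instead of allocating a fresh buffer on every call.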