Commit df852937 authored by Alexey Spizhevoy

refactoring: moved gpu reduction-based functions into a separate file

parent 1922e50f
@@ -360,66 +360,17 @@ namespace cv
friend struct StreamAccessor;
};
////////////////////////////// Arithmetics ///////////////////////////////////
//! transposes the matrix
//! supports CV_8UC1, CV_8SC1, CV_8UC4, CV_8SC4, CV_16UC2, CV_16SC2, CV_32SC1, CV_32FC1 types
CV_EXPORTS void transpose(const GpuMat& src1, GpuMat& dst);
//! computes mean value and standard deviation of all or selected array elements
//! supports only CV_8UC1 type
CV_EXPORTS void meanStdDev(const GpuMat& mtx, Scalar& mean, Scalar& stddev);
//! computes norm of array
//! supports NORM_INF, NORM_L1, NORM_L2
//! supports only CV_8UC1 type
CV_EXPORTS double norm(const GpuMat& src1, int normType=NORM_L2);
//! computes norm of the difference between two arrays
//! supports NORM_INF, NORM_L1, NORM_L2
//! supports only CV_8UC1 type
CV_EXPORTS double norm(const GpuMat& src1, const GpuMat& src2, int normType=NORM_L2);
//! reverses the order of the rows, columns or both in a matrix
//! supports CV_8UC1, CV_8UC4 types
CV_EXPORTS void flip(const GpuMat& a, GpuMat& b, int flipCode);
//! computes sum of array elements
//! supports only single channel images
CV_EXPORTS Scalar sum(const GpuMat& src);
//! computes sum of array elements
//! supports only single channel images
CV_EXPORTS Scalar sum(const GpuMat& src, GpuMat& buf);
//! computes squared sum of array elements
//! supports only single channel images
CV_EXPORTS Scalar sqrSum(const GpuMat& src);
//! computes squared sum of array elements
//! supports only single channel images
CV_EXPORTS Scalar sqrSum(const GpuMat& src, GpuMat& buf);
//! finds global minimum and maximum array elements and returns their values
CV_EXPORTS void minMax(const GpuMat& src, double* minVal, double* maxVal=0, const GpuMat& mask=GpuMat());
//! finds global minimum and maximum array elements and returns their values
CV_EXPORTS void minMax(const GpuMat& src, double* minVal, double* maxVal, const GpuMat& mask, GpuMat& buf);
//! finds global minimum and maximum array elements and returns their values with locations
CV_EXPORTS void minMaxLoc(const GpuMat& src, double* minVal, double* maxVal=0, Point* minLoc=0, Point* maxLoc=0,
const GpuMat& mask=GpuMat());
//! finds global minimum and maximum array elements and returns their values with locations
CV_EXPORTS void minMaxLoc(const GpuMat& src, double* minVal, double* maxVal, Point* minLoc, Point* maxLoc,
const GpuMat& mask, GpuMat& valbuf, GpuMat& locbuf);
//! counts non-zero array elements
CV_EXPORTS int countNonZero(const GpuMat& src);
//! counts non-zero array elements
CV_EXPORTS int countNonZero(const GpuMat& src, GpuMat& buf);
//! transforms 8-bit unsigned integers using lookup table: dst(i)=lut(src(i))
//! destination array will have the same depth as lut and the same number of channels as the source
//! supports CV_8UC1, CV_8UC3 types
@@ -487,25 +438,6 @@ namespace cv
//! async version
CV_EXPORTS void polarToCart(const GpuMat& magnitude, const GpuMat& angle, GpuMat& x, GpuMat& y, bool angleInDegrees, const Stream& stream);
//! computes per-element minimum of two arrays (dst = min(src1, src2))
CV_EXPORTS void min(const GpuMat& src1, const GpuMat& src2, GpuMat& dst);
//! Async version
CV_EXPORTS void min(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const Stream& stream);
//! computes per-element minimum of array and scalar (dst = min(src1, src2))
CV_EXPORTS void min(const GpuMat& src1, double src2, GpuMat& dst);
//! Async version
CV_EXPORTS void min(const GpuMat& src1, double src2, GpuMat& dst, const Stream& stream);
//! computes per-element maximum of two arrays (dst = max(src1, src2))
CV_EXPORTS void max(const GpuMat& src1, const GpuMat& src2, GpuMat& dst);
//! Async version
CV_EXPORTS void max(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const Stream& stream);
//! computes per-element maximum of array and scalar (dst = max(src1, src2))
CV_EXPORTS void max(const GpuMat& src1, double src2, GpuMat& dst);
//! Async version
CV_EXPORTS void max(const GpuMat& src1, double src2, GpuMat& dst, const Stream& stream);
//////////////////////////// Per-element operations ////////////////////////////////////
@@ -576,6 +508,26 @@ namespace cv
//! async version
CV_EXPORTS void bitwise_xor(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask, const Stream& stream);
//! computes per-element minimum of two arrays (dst = min(src1, src2))
CV_EXPORTS void min(const GpuMat& src1, const GpuMat& src2, GpuMat& dst);
//! Async version
CV_EXPORTS void min(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const Stream& stream);
//! computes per-element minimum of array and scalar (dst = min(src1, src2))
CV_EXPORTS void min(const GpuMat& src1, double src2, GpuMat& dst);
//! Async version
CV_EXPORTS void min(const GpuMat& src1, double src2, GpuMat& dst, const Stream& stream);
//! computes per-element maximum of two arrays (dst = max(src1, src2))
CV_EXPORTS void max(const GpuMat& src1, const GpuMat& src2, GpuMat& dst);
//! Async version
CV_EXPORTS void max(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const Stream& stream);
//! computes per-element maximum of array and scalar (dst = max(src1, src2))
CV_EXPORTS void max(const GpuMat& src1, double src2, GpuMat& dst);
//! Async version
CV_EXPORTS void max(const GpuMat& src1, double src2, GpuMat& dst, const Stream& stream);
////////////////////////////// Image processing //////////////////////////////
@@ -663,15 +615,66 @@ namespace cv
//! computes Harris cornerness criteria at each image pixel
CV_EXPORTS void cornerHarris(const GpuMat& src, GpuMat& dst, int blockSize, int ksize, double k, int borderType=BORDER_REFLECT101);
//! computes minimum eigen value of 2x2 derivative covariation matrix at each pixel - the cornerness criteria
CV_EXPORTS void cornerMinEigenVal(const GpuMat& src, GpuMat& dst, int blockSize, int ksize, int borderType=BORDER_REFLECT101);
//! computes the proximity map for the raster template and the image where the template is searched for
CV_EXPORTS void matchTemplate(const GpuMat& image, const GpuMat& templ, GpuMat& result, int method);
////////////////////////////// Matrix reductions //////////////////////////////
//! computes mean value and standard deviation of all or selected array elements
//! supports only CV_8UC1 type
CV_EXPORTS void meanStdDev(const GpuMat& mtx, Scalar& mean, Scalar& stddev);
//! computes norm of array
//! supports NORM_INF, NORM_L1, NORM_L2
//! supports only CV_8UC1 type
CV_EXPORTS double norm(const GpuMat& src1, int normType=NORM_L2);
//! computes norm of the difference between two arrays
//! supports NORM_INF, NORM_L1, NORM_L2
//! supports only CV_8UC1 type
CV_EXPORTS double norm(const GpuMat& src1, const GpuMat& src2, int normType=NORM_L2);
//! computes sum of array elements
//! supports only single channel images
CV_EXPORTS Scalar sum(const GpuMat& src);
//! computes sum of array elements
//! supports only single channel images
CV_EXPORTS Scalar sum(const GpuMat& src, GpuMat& buf);
//! computes squared sum of array elements
//! supports only single channel images
CV_EXPORTS Scalar sqrSum(const GpuMat& src);
//! computes squared sum of array elements
//! supports only single channel images
CV_EXPORTS Scalar sqrSum(const GpuMat& src, GpuMat& buf);
//! finds global minimum and maximum array elements and returns their values
CV_EXPORTS void minMax(const GpuMat& src, double* minVal, double* maxVal=0, const GpuMat& mask=GpuMat());
//! finds global minimum and maximum array elements and returns their values
CV_EXPORTS void minMax(const GpuMat& src, double* minVal, double* maxVal, const GpuMat& mask, GpuMat& buf);
//! finds global minimum and maximum array elements and returns their values with locations
CV_EXPORTS void minMaxLoc(const GpuMat& src, double* minVal, double* maxVal=0, Point* minLoc=0, Point* maxLoc=0,
const GpuMat& mask=GpuMat());
//! finds global minimum and maximum array elements and returns their values with locations
CV_EXPORTS void minMaxLoc(const GpuMat& src, double* minVal, double* maxVal, Point* minLoc, Point* maxLoc,
const GpuMat& mask, GpuMat& valbuf, GpuMat& locbuf);
//! counts non-zero array elements
CV_EXPORTS int countNonZero(const GpuMat& src);
//! counts non-zero array elements
CV_EXPORTS int countNonZero(const GpuMat& src, GpuMat& buf);
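//! A minimal usage sketch (illustrative only, not part of this header): the
//! overloads taking an explicit GpuMat buffer let callers reuse the
//! intermediate reduction buffer across repeated calls, e.g. per video frame,
//! instead of reallocating it inside every call:
//!
//!     GpuMat frame, sumBuf, minMaxBuf;   // buffers are sized on first use
//!     double minVal, maxVal;
//!     Scalar s = sum(frame, sumBuf);
//!     minMax(frame, &minVal, &maxVal, GpuMat(), minMaxBuf);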
//////////////////////////////// Filter Engine ////////////////////////////////
/*!
@@ -49,20 +49,7 @@ using namespace std
#if !defined (HAVE_CUDA)
void cv::gpu::transpose(const GpuMat&, GpuMat&) { throw_nogpu(); }
void cv::gpu::meanStdDev(const GpuMat&, Scalar&, Scalar&) { throw_nogpu(); }
double cv::gpu::norm(const GpuMat&, int) { throw_nogpu(); return 0.0; }
double cv::gpu::norm(const GpuMat&, const GpuMat&, int) { throw_nogpu(); return 0.0; }
void cv::gpu::flip(const GpuMat&, GpuMat&, int) { throw_nogpu(); }
Scalar cv::gpu::sum(const GpuMat&) { throw_nogpu(); return Scalar(); }
Scalar cv::gpu::sum(const GpuMat&, GpuMat&) { throw_nogpu(); return Scalar(); }
Scalar cv::gpu::sqrSum(const GpuMat&) { throw_nogpu(); return Scalar(); }
Scalar cv::gpu::sqrSum(const GpuMat&, GpuMat&) { throw_nogpu(); return Scalar(); }
void cv::gpu::minMax(const GpuMat&, double*, double*, const GpuMat&) { throw_nogpu(); }
void cv::gpu::minMax(const GpuMat&, double*, double*, const GpuMat&, GpuMat&) { throw_nogpu(); }
void cv::gpu::minMaxLoc(const GpuMat&, double*, double*, Point*, Point*, const GpuMat&) { throw_nogpu(); }
void cv::gpu::minMaxLoc(const GpuMat&, double*, double*, Point*, Point*, const GpuMat&, GpuMat&, GpuMat&) { throw_nogpu(); }
int cv::gpu::countNonZero(const GpuMat&) { throw_nogpu(); return 0; }
int cv::gpu::countNonZero(const GpuMat&, GpuMat&) { throw_nogpu(); return 0; }
void cv::gpu::LUT(const GpuMat&, const Mat&, GpuMat&) { throw_nogpu(); }
void cv::gpu::exp(const GpuMat&, GpuMat&) { throw_nogpu(); }
void cv::gpu::log(const GpuMat&, GpuMat&) { throw_nogpu(); }
@@ -78,14 +65,6 @@ void cv::gpu::cartToPolar(const GpuMat&, const GpuMat&, GpuMat&, GpuMat&, bool)
void cv::gpu::cartToPolar(const GpuMat&, const GpuMat&, GpuMat&, GpuMat&, bool, const Stream&) { throw_nogpu(); }
void cv::gpu::polarToCart(const GpuMat&, const GpuMat&, GpuMat&, GpuMat&, bool) { throw_nogpu(); }
void cv::gpu::polarToCart(const GpuMat&, const GpuMat&, GpuMat&, GpuMat&, bool, const Stream&) { throw_nogpu(); }
void cv::gpu::min(const GpuMat&, const GpuMat&, GpuMat&) { throw_nogpu(); }
void cv::gpu::min(const GpuMat&, const GpuMat&, GpuMat&, const Stream&) { throw_nogpu(); }
void cv::gpu::min(const GpuMat&, double, GpuMat&) { throw_nogpu(); }
void cv::gpu::min(const GpuMat&, double, GpuMat&, const Stream&) { throw_nogpu(); }
void cv::gpu::max(const GpuMat&, const GpuMat&, GpuMat&) { throw_nogpu(); }
void cv::gpu::max(const GpuMat&, const GpuMat&, GpuMat&, const Stream&) { throw_nogpu(); }
void cv::gpu::max(const GpuMat&, double, GpuMat&) { throw_nogpu(); }
void cv::gpu::max(const GpuMat&, double, GpuMat&, const Stream&) { throw_nogpu(); }
#else /* !defined (HAVE_CUDA) */
@@ -118,54 +97,6 @@ void cv::gpu::transpose(const GpuMat& src, GpuMat& dst)
}
}
////////////////////////////////////////////////////////////////////////
// meanStdDev
void cv::gpu::meanStdDev(const GpuMat& src, Scalar& mean, Scalar& stddev)
{
CV_Assert(src.type() == CV_8UC1);
NppiSize sz;
sz.width = src.cols;
sz.height = src.rows;
nppSafeCall( nppiMean_StdDev_8u_C1R(src.ptr<Npp8u>(), src.step, sz, mean.val, stddev.val) );
}
////////////////////////////////////////////////////////////////////////
// norm
double cv::gpu::norm(const GpuMat& src1, int normType)
{
return norm(src1, GpuMat(src1.size(), src1.type(), Scalar::all(0.0)), normType);
}
double cv::gpu::norm(const GpuMat& src1, const GpuMat& src2, int normType)
{
CV_DbgAssert(src1.size() == src2.size() && src1.type() == src2.type());
CV_Assert(src1.type() == CV_8UC1);
CV_Assert(normType == NORM_INF || normType == NORM_L1 || normType == NORM_L2);
typedef NppStatus (*npp_norm_diff_func_t)(const Npp8u* pSrc1, int nSrcStep1, const Npp8u* pSrc2, int nSrcStep2,
NppiSize oSizeROI, Npp64f* pRetVal);
static const npp_norm_diff_func_t npp_norm_diff_func[] = {nppiNormDiff_Inf_8u_C1R, nppiNormDiff_L1_8u_C1R, nppiNormDiff_L2_8u_C1R};
NppiSize sz;
sz.width = src1.cols;
sz.height = src1.rows;
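// NORM_INF == 1, NORM_L1 == 2 and NORM_L2 == 4 in OpenCV, so shifting the
// norm type right by one maps them to table indices 0, 1 and 2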
int funcIdx = normType >> 1;
double retVal;
nppSafeCall( npp_norm_diff_func[funcIdx](src1.ptr<Npp8u>(), src1.step,
src2.ptr<Npp8u>(), src2.step,
sz, &retVal) );
return retVal;
}
////////////////////////////////////////////////////////////////////////
// flip
@@ -193,305 +124,6 @@ void cv::gpu::flip(const GpuMat& src, GpuMat& dst, int flipCode)
}
}
////////////////////////////////////////////////////////////////////////
// sum
namespace cv { namespace gpu { namespace mathfunc
{
template <typename T>
void sum_caller(const DevMem2D src, PtrStep buf, double* sum, int cn);
template <typename T>
void sum_multipass_caller(const DevMem2D src, PtrStep buf, double* sum, int cn);
template <typename T>
void sqsum_caller(const DevMem2D src, PtrStep buf, double* sum, int cn);
template <typename T>
void sqsum_multipass_caller(const DevMem2D src, PtrStep buf, double* sum, int cn);
namespace sum
{
void get_buf_size_required(int cols, int rows, int cn, int& bufcols, int& bufrows);
}
}}}
Scalar cv::gpu::sum(const GpuMat& src)
{
GpuMat buf;
return sum(src, buf);
}
Scalar cv::gpu::sum(const GpuMat& src, GpuMat& buf)
{
using namespace mathfunc;
typedef void (*Caller)(const DevMem2D, PtrStep, double*, int);
static const Caller callers[2][7] =
{ { sum_multipass_caller<unsigned char>, sum_multipass_caller<char>,
sum_multipass_caller<unsigned short>, sum_multipass_caller<short>,
sum_multipass_caller<int>, sum_multipass_caller<float>, 0 },
{ sum_caller<unsigned char>, sum_caller<char>,
sum_caller<unsigned short>, sum_caller<short>,
sum_caller<int>, sum_caller<float>, 0 } };
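// Row 0 holds the multi-pass kernels (devices without global atomics, i.e.
// compute capability < 1.1), row 1 the single-pass ones; columns are indexed
// by depth (CV_8U..CV_64F) and a zero entry marks an unsupported depth.
// The analogous tables below follow the same layout.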
Size bufSize;
sum::get_buf_size_required(src.cols, src.rows, src.channels(), bufSize.width, bufSize.height);
buf.create(bufSize, CV_8U);
Caller caller = callers[hasAtomicsSupport(getDevice())][src.depth()];
if (!caller) CV_Error(CV_StsBadArg, "sum: unsupported type");
double result[4];
caller(src, buf, result, src.channels());
return Scalar(result[0], result[1], result[2], result[3]);
}
Scalar cv::gpu::sqrSum(const GpuMat& src)
{
GpuMat buf;
return sqrSum(src, buf);
}
Scalar cv::gpu::sqrSum(const GpuMat& src, GpuMat& buf)
{
using namespace mathfunc;
typedef void (*Caller)(const DevMem2D, PtrStep, double*, int);
static const Caller callers[2][7] =
{ { sqsum_multipass_caller<unsigned char>, sqsum_multipass_caller<char>,
sqsum_multipass_caller<unsigned short>, sqsum_multipass_caller<short>,
sqsum_multipass_caller<int>, sqsum_multipass_caller<float>, 0 },
{ sqsum_caller<unsigned char>, sqsum_caller<char>,
sqsum_caller<unsigned short>, sqsum_caller<short>,
sqsum_caller<int>, sqsum_caller<float>, 0 } };
Size bufSize;
sum::get_buf_size_required(src.cols, src.rows, src.channels(), bufSize.width, bufSize.height);
buf.create(bufSize, CV_8U);
Caller caller = callers[hasAtomicsSupport(getDevice())][src.depth()];
if (!caller) CV_Error(CV_StsBadArg, "sqrSum: unsupported type");
double result[4];
caller(src, buf, result, src.channels());
return Scalar(result[0], result[1], result[2], result[3]);
}
////////////////////////////////////////////////////////////////////////
// minMax
namespace cv { namespace gpu { namespace mathfunc { namespace minmax {
void get_buf_size_required(int cols, int rows, int elem_size, int& bufcols, int& bufrows);
template <typename T>
void min_max_caller(const DevMem2D src, double* minval, double* maxval, PtrStep buf);
template <typename T>
void min_max_mask_caller(const DevMem2D src, const PtrStep mask, double* minval, double* maxval, PtrStep buf);
template <typename T>
void min_max_multipass_caller(const DevMem2D src, double* minval, double* maxval, PtrStep buf);
template <typename T>
void min_max_mask_multipass_caller(const DevMem2D src, const PtrStep mask, double* minval, double* maxval, PtrStep buf);
}}}}
void cv::gpu::minMax(const GpuMat& src, double* minVal, double* maxVal, const GpuMat& mask)
{
GpuMat buf;
minMax(src, minVal, maxVal, mask, buf);
}
void cv::gpu::minMax(const GpuMat& src, double* minVal, double* maxVal, const GpuMat& mask, GpuMat& buf)
{
using namespace mathfunc::minmax;
typedef void (*Caller)(const DevMem2D, double*, double*, PtrStep);
typedef void (*MaskedCaller)(const DevMem2D, const PtrStep, double*, double*, PtrStep);
static const Caller callers[2][7] =
{ { min_max_multipass_caller<unsigned char>, min_max_multipass_caller<char>,
min_max_multipass_caller<unsigned short>, min_max_multipass_caller<short>,
min_max_multipass_caller<int>, min_max_multipass_caller<float>, 0 },
{ min_max_caller<unsigned char>, min_max_caller<char>,
min_max_caller<unsigned short>, min_max_caller<short>,
min_max_caller<int>, min_max_caller<float>, min_max_caller<double> } };
static const MaskedCaller masked_callers[2][7] =
{ { min_max_mask_multipass_caller<unsigned char>, min_max_mask_multipass_caller<char>,
min_max_mask_multipass_caller<unsigned short>, min_max_mask_multipass_caller<short>,
min_max_mask_multipass_caller<int>, min_max_mask_multipass_caller<float>, 0 },
{ min_max_mask_caller<unsigned char>, min_max_mask_caller<char>,
min_max_mask_caller<unsigned short>, min_max_mask_caller<short>,
min_max_mask_caller<int>, min_max_mask_caller<float>,
min_max_mask_caller<double> } };
CV_Assert(src.channels() == 1);
CV_Assert(mask.empty() || (mask.type() == CV_8U && src.size() == mask.size()));
CV_Assert(src.type() != CV_64F || hasNativeDoubleSupport(getDevice()));
double minVal_; if (!minVal) minVal = &minVal_;
double maxVal_; if (!maxVal) maxVal = &maxVal_;
Size bufSize;
get_buf_size_required(src.cols, src.rows, src.elemSize(), bufSize.width, bufSize.height);
buf.create(bufSize, CV_8U);
if (mask.empty())
{
Caller caller = callers[hasAtomicsSupport(getDevice())][src.type()];
if (!caller) CV_Error(CV_StsBadArg, "minMax: unsupported type");
caller(src, minVal, maxVal, buf);
}
else
{
MaskedCaller caller = masked_callers[hasAtomicsSupport(getDevice())][src.type()];
if (!caller) CV_Error(CV_StsBadArg, "minMax: unsupported type");
caller(src, mask, minVal, maxVal, buf);
}
}
////////////////////////////////////////////////////////////////////////
// minMaxLoc
namespace cv { namespace gpu { namespace mathfunc { namespace minmaxloc {
void get_buf_size_required(int cols, int rows, int elem_size, int& b1cols,
int& b1rows, int& b2cols, int& b2rows);
template <typename T>
void min_max_loc_caller(const DevMem2D src, double* minval, double* maxval,
int minloc[2], int maxloc[2], PtrStep valbuf, PtrStep locbuf);
template <typename T>
void min_max_loc_mask_caller(const DevMem2D src, const PtrStep mask, double* minval, double* maxval,
int minloc[2], int maxloc[2], PtrStep valbuf, PtrStep locbuf);
template <typename T>
void min_max_loc_multipass_caller(const DevMem2D src, double* minval, double* maxval,
int minloc[2], int maxloc[2], PtrStep valbuf, PtrStep locbuf);
template <typename T>
void min_max_loc_mask_multipass_caller(const DevMem2D src, const PtrStep mask, double* minval, double* maxval,
int minloc[2], int maxloc[2], PtrStep valbuf, PtrStep locbuf);
}}}}
void cv::gpu::minMaxLoc(const GpuMat& src, double* minVal, double* maxVal, Point* minLoc, Point* maxLoc, const GpuMat& mask)
{
GpuMat valbuf, locbuf;
minMaxLoc(src, minVal, maxVal, minLoc, maxLoc, mask, valbuf, locbuf);
}
void cv::gpu::minMaxLoc(const GpuMat& src, double* minVal, double* maxVal, Point* minLoc, Point* maxLoc,
const GpuMat& mask, GpuMat& valbuf, GpuMat& locbuf)
{
using namespace mathfunc::minmaxloc;
typedef void (*Caller)(const DevMem2D, double*, double*, int[2], int[2], PtrStep, PtrStep);
typedef void (*MaskedCaller)(const DevMem2D, const PtrStep, double*, double*, int[2], int[2], PtrStep, PtrStep);
static const Caller callers[2][7] =
{ { min_max_loc_multipass_caller<unsigned char>, min_max_loc_multipass_caller<char>,
min_max_loc_multipass_caller<unsigned short>, min_max_loc_multipass_caller<short>,
min_max_loc_multipass_caller<int>, min_max_loc_multipass_caller<float>, 0 },
{ min_max_loc_caller<unsigned char>, min_max_loc_caller<char>,
min_max_loc_caller<unsigned short>, min_max_loc_caller<short>,
min_max_loc_caller<int>, min_max_loc_caller<float>, min_max_loc_caller<double> } };
static const MaskedCaller masked_callers[2][7] =
{ { min_max_loc_mask_multipass_caller<unsigned char>, min_max_loc_mask_multipass_caller<char>,
min_max_loc_mask_multipass_caller<unsigned short>, min_max_loc_mask_multipass_caller<short>,
min_max_loc_mask_multipass_caller<int>, min_max_loc_mask_multipass_caller<float>, 0 },
{ min_max_loc_mask_caller<unsigned char>, min_max_loc_mask_caller<char>,
min_max_loc_mask_caller<unsigned short>, min_max_loc_mask_caller<short>,
min_max_loc_mask_caller<int>, min_max_loc_mask_caller<float>, min_max_loc_mask_caller<double> } };
CV_Assert(src.channels() == 1);
CV_Assert(mask.empty() || (mask.type() == CV_8U && src.size() == mask.size()));
CV_Assert(src.type() != CV_64F || hasNativeDoubleSupport(getDevice()));
double minVal_; if (!minVal) minVal = &minVal_;
double maxVal_; if (!maxVal) maxVal = &maxVal_;
int minLoc_[2];
int maxLoc_[2];
Size valbuf_size, locbuf_size;
get_buf_size_required(src.cols, src.rows, src.elemSize(), valbuf_size.width,
valbuf_size.height, locbuf_size.width, locbuf_size.height);
valbuf.create(valbuf_size, CV_8U);
locbuf.create(locbuf_size, CV_8U);
if (mask.empty())
{
Caller caller = callers[hasAtomicsSupport(getDevice())][src.type()];
if (!caller) CV_Error(CV_StsBadArg, "minMaxLoc: unsupported type");
caller(src, minVal, maxVal, minLoc_, maxLoc_, valbuf, locbuf);
}
else
{
MaskedCaller caller = masked_callers[hasAtomicsSupport(getDevice())][src.type()];
if (!caller) CV_Error(CV_StsBadArg, "minMaxLoc: unsupported type");
caller(src, mask, minVal, maxVal, minLoc_, maxLoc_, valbuf, locbuf);
}
if (minLoc) { minLoc->x = minLoc_[0]; minLoc->y = minLoc_[1]; }
if (maxLoc) { maxLoc->x = maxLoc_[0]; maxLoc->y = maxLoc_[1]; }
}
////////////////////////////////////////////////////////////////////////
// Count non zero
namespace cv { namespace gpu { namespace mathfunc { namespace countnonzero {
void get_buf_size_required(int cols, int rows, int& bufcols, int& bufrows);
template <typename T>
int count_non_zero_caller(const DevMem2D src, PtrStep buf);
template <typename T>
int count_non_zero_multipass_caller(const DevMem2D src, PtrStep buf);
}}}}
int cv::gpu::countNonZero(const GpuMat& src)
{
GpuMat buf;
return countNonZero(src, buf);
}
int cv::gpu::countNonZero(const GpuMat& src, GpuMat& buf)
{
using namespace mathfunc::countnonzero;
typedef int (*Caller)(const DevMem2D src, PtrStep buf);
static const Caller callers[2][7] =
{ { count_non_zero_multipass_caller<unsigned char>, count_non_zero_multipass_caller<char>,
count_non_zero_multipass_caller<unsigned short>, count_non_zero_multipass_caller<short>,
count_non_zero_multipass_caller<int>, count_non_zero_multipass_caller<float>, 0},
{ count_non_zero_caller<unsigned char>, count_non_zero_caller<char>,
count_non_zero_caller<unsigned short>, count_non_zero_caller<short>,
count_non_zero_caller<int>, count_non_zero_caller<float>, count_non_zero_caller<double> } };
CV_Assert(src.channels() == 1);
CV_Assert(src.type() != CV_64F || hasNativeDoubleSupport(getDevice()));
Size buf_size;
get_buf_size_required(src.cols, src.rows, buf_size.width, buf_size.height);
buf.create(buf_size, CV_8U);
Caller caller = callers[hasAtomicsSupport(getDevice())][src.type()];
if (!caller) CV_Error(CV_StsBadArg, "countNonZero: unsupported type");
return caller(src, buf);
}
////////////////////////////////////////////////////////////////////////
// LUT
@@ -711,144 +343,4 @@ void cv::gpu::polarToCart(const GpuMat& magnitude, const GpuMat& angle, GpuMat&
}
//////////////////////////////////////////////////////////////////////////////
// min/max
namespace cv { namespace gpu { namespace mathfunc
{
template <typename T>
void min_gpu(const DevMem2D_<T>& src1, const DevMem2D_<T>& src2, const DevMem2D_<T>& dst, cudaStream_t stream);
template <typename T>
void max_gpu(const DevMem2D_<T>& src1, const DevMem2D_<T>& src2, const DevMem2D_<T>& dst, cudaStream_t stream);
template <typename T>
void min_gpu(const DevMem2D_<T>& src1, double src2, const DevMem2D_<T>& dst, cudaStream_t stream);
template <typename T>
void max_gpu(const DevMem2D_<T>& src1, double src2, const DevMem2D_<T>& dst, cudaStream_t stream);
}}}
namespace
{
template <typename T>
void min_caller(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, cudaStream_t stream)
{
CV_Assert(src1.size() == src2.size() && src1.type() == src2.type());
dst.create(src1.size(), src1.type());
mathfunc::min_gpu<T>(src1.reshape(1), src2.reshape(1), dst.reshape(1), stream);
}
template <typename T>
void min_caller(const GpuMat& src1, double src2, GpuMat& dst, cudaStream_t stream)
{
dst.create(src1.size(), src1.type());
mathfunc::min_gpu<T>(src1.reshape(1), src2, dst.reshape(1), stream);
}
template <typename T>
void max_caller(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, cudaStream_t stream)
{
CV_Assert(src1.size() == src2.size() && src1.type() == src2.type());
dst.create(src1.size(), src1.type());
mathfunc::max_gpu<T>(src1.reshape(1), src2.reshape(1), dst.reshape(1), stream);
}
template <typename T>
void max_caller(const GpuMat& src1, double src2, GpuMat& dst, cudaStream_t stream)
{
dst.create(src1.size(), src1.type());
mathfunc::max_gpu<T>(src1.reshape(1), src2, dst.reshape(1), stream);
}
}
void cv::gpu::min(const GpuMat& src1, const GpuMat& src2, GpuMat& dst)
{
typedef void (*func_t)(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, cudaStream_t stream);
static const func_t funcs[] =
{
min_caller<uchar>, min_caller<char>, min_caller<ushort>, min_caller<short>, min_caller<int>,
min_caller<float>, min_caller<double>
};
funcs[src1.depth()](src1, src2, dst, 0);
}
void cv::gpu::min(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const Stream& stream)
{
typedef void (*func_t)(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, cudaStream_t stream);
static const func_t funcs[] =
{
min_caller<uchar>, min_caller<char>, min_caller<ushort>, min_caller<short>, min_caller<int>,
min_caller<float>, min_caller<double>
};
funcs[src1.depth()](src1, src2, dst, StreamAccessor::getStream(stream));
}
void cv::gpu::min(const GpuMat& src1, double src2, GpuMat& dst)
{
typedef void (*func_t)(const GpuMat& src1, double src2, GpuMat& dst, cudaStream_t stream);
static const func_t funcs[] =
{
min_caller<uchar>, min_caller<char>, min_caller<ushort>, min_caller<short>, min_caller<int>,
min_caller<float>, min_caller<double>
};
funcs[src1.depth()](src1, src2, dst, 0);
}
void cv::gpu::min(const GpuMat& src1, double src2, GpuMat& dst, const Stream& stream)
{
typedef void (*func_t)(const GpuMat& src1, double src2, GpuMat& dst, cudaStream_t stream);
static const func_t funcs[] =
{
min_caller<uchar>, min_caller<char>, min_caller<ushort>, min_caller<short>, min_caller<int>,
min_caller<float>, min_caller<double>
};
funcs[src1.depth()](src1, src2, dst, StreamAccessor::getStream(stream));
}
void cv::gpu::max(const GpuMat& src1, const GpuMat& src2, GpuMat& dst)
{
typedef void (*func_t)(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, cudaStream_t stream);
static const func_t funcs[] =
{
max_caller<uchar>, max_caller<char>, max_caller<ushort>, max_caller<short>, max_caller<int>,
max_caller<float>, max_caller<double>
};
funcs[src1.depth()](src1, src2, dst, 0);
}
void cv::gpu::max(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const Stream& stream)
{
typedef void (*func_t)(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, cudaStream_t stream);
static const func_t funcs[] =
{
max_caller<uchar>, max_caller<char>, max_caller<ushort>, max_caller<short>, max_caller<int>,
max_caller<float>, max_caller<double>
};
funcs[src1.depth()](src1, src2, dst, StreamAccessor::getStream(stream));
}
void cv::gpu::max(const GpuMat& src1, double src2, GpuMat& dst)
{
typedef void (*func_t)(const GpuMat& src1, double src2, GpuMat& dst, cudaStream_t stream);
static const func_t funcs[] =
{
max_caller<uchar>, max_caller<char>, max_caller<ushort>, max_caller<short>, max_caller<int>,
max_caller<float>, max_caller<double>
};
funcs[src1.depth()](src1, src2, dst, 0);
}
void cv::gpu::max(const GpuMat& src1, double src2, GpuMat& dst, const Stream& stream)
{
typedef void (*func_t)(const GpuMat& src1, double src2, GpuMat& dst, cudaStream_t stream);
static const func_t funcs[] =
{
max_caller<uchar>, max_caller<char>, max_caller<ushort>, max_caller<short>, max_caller<int>,
max_caller<float>, max_caller<double>
};
funcs[src1.depth()](src1, src2, dst, StreamAccessor::getStream(stream));
}
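// A minimal usage sketch (illustrative only): the Stream overloads only
// enqueue the per-element kernels, so several operations can be chained on
// one stream and synchronized once (waitForCompletion() on Stream is assumed
// available here):
//
//     Stream stream;
//     GpuMat a, b, lo, clipped;
//     min(a, b, lo, stream);          // lo = min(a, b), asynchronous
//     max(lo, 0.0, clipped, stream);  // clamp from below on the same stream
//     stream.waitForCompletion();     // block until both kernels finish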
#endif /* !defined (HAVE_CUDA) */
@@ -345,4 +345,127 @@ namespace cv { namespace gpu { namespace mathfunc
template void bitwiseMaskXorCaller<ushort>(int, int, int, const PtrStep, const PtrStep, const PtrStep, PtrStep, cudaStream_t);
template void bitwiseMaskXorCaller<uint>(int, int, int, const PtrStep, const PtrStep, const PtrStep, PtrStep, cudaStream_t);
//////////////////////////////////////////////////////////////////////////
// min/max
struct MinOp
{
template <typename T>
__device__ T operator()(T a, T b)
{
return min(a, b);
}
__device__ float operator()(float a, float b)
{
return fmin(a, b);
}
__device__ double operator()(double a, double b)
{
return fmin(a, b);
}
};
struct MaxOp
{
template <typename T>
__device__ T operator()(T a, T b)
{
return max(a, b);
}
__device__ float operator()(float a, float b)
{
return fmax(a, b);
}
__device__ double operator()(double a, double b)
{
return fmax(a, b);
}
};
struct ScalarMinOp
{
double s;
explicit ScalarMinOp(double s_) : s(s_) {}
template <typename T>
__device__ T operator()(T a)
{
return saturate_cast<T>(fmin((double)a, s));
}
};
struct ScalarMaxOp
{
double s;
explicit ScalarMaxOp(double s_) : s(s_) {}
template <typename T>
__device__ T operator()(T a)
{
return saturate_cast<T>(fmax((double)a, s));
}
};
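// The same functor-plus-transform pattern extends to other per-element
// operations: a binary functor pairs with the two-source transform overload,
// a unary functor with the single-source one. A hypothetical absolute
// difference would follow the same shape (sketch only, not part of this
// commit):
//
//     struct AbsDiffOp
//     {
//         template <typename T>
//         __device__ T operator()(T a, T b) { return a < b ? b - a : a - b; }
//     };
//
//     template <typename T>
//     void absdiff_gpu(const DevMem2D_<T>& src1, const DevMem2D_<T>& src2,
//                      const DevMem2D_<T>& dst, cudaStream_t stream)
//     {
//         AbsDiffOp op;
//         transform(src1, src2, dst, op, stream);
//     }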
template <typename T>
void min_gpu(const DevMem2D_<T>& src1, const DevMem2D_<T>& src2, const DevMem2D_<T>& dst, cudaStream_t stream)
{
MinOp op;
transform(src1, src2, dst, op, stream);
}
template void min_gpu<uchar >(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, cudaStream_t stream);
template void min_gpu<char >(const DevMem2D_<char>& src1, const DevMem2D_<char>& src2, const DevMem2D_<char>& dst, cudaStream_t stream);
template void min_gpu<ushort>(const DevMem2D_<ushort>& src1, const DevMem2D_<ushort>& src2, const DevMem2D_<ushort>& dst, cudaStream_t stream);
template void min_gpu<short >(const DevMem2D_<short>& src1, const DevMem2D_<short>& src2, const DevMem2D_<short>& dst, cudaStream_t stream);
template void min_gpu<int >(const DevMem2D_<int>& src1, const DevMem2D_<int>& src2, const DevMem2D_<int>& dst, cudaStream_t stream);
template void min_gpu<float >(const DevMem2D_<float>& src1, const DevMem2D_<float>& src2, const DevMem2D_<float>& dst, cudaStream_t stream);
template void min_gpu<double>(const DevMem2D_<double>& src1, const DevMem2D_<double>& src2, const DevMem2D_<double>& dst, cudaStream_t stream);
template <typename T>
void max_gpu(const DevMem2D_<T>& src1, const DevMem2D_<T>& src2, const DevMem2D_<T>& dst, cudaStream_t stream)
{
MaxOp op;
transform(src1, src2, dst, op, stream);
}
template void max_gpu<uchar >(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, cudaStream_t stream);
template void max_gpu<char >(const DevMem2D_<char>& src1, const DevMem2D_<char>& src2, const DevMem2D_<char>& dst, cudaStream_t stream);
template void max_gpu<ushort>(const DevMem2D_<ushort>& src1, const DevMem2D_<ushort>& src2, const DevMem2D_<ushort>& dst, cudaStream_t stream);
template void max_gpu<short >(const DevMem2D_<short>& src1, const DevMem2D_<short>& src2, const DevMem2D_<short>& dst, cudaStream_t stream);
template void max_gpu<int >(const DevMem2D_<int>& src1, const DevMem2D_<int>& src2, const DevMem2D_<int>& dst, cudaStream_t stream);
template void max_gpu<float >(const DevMem2D_<float>& src1, const DevMem2D_<float>& src2, const DevMem2D_<float>& dst, cudaStream_t stream);
template void max_gpu<double>(const DevMem2D_<double>& src1, const DevMem2D_<double>& src2, const DevMem2D_<double>& dst, cudaStream_t stream);
template <typename T>
void min_gpu(const DevMem2D_<T>& src1, double src2, const DevMem2D_<T>& dst, cudaStream_t stream)
{
ScalarMinOp op(src2);
transform(src1, dst, op, stream);
}
template void min_gpu<uchar >(const DevMem2D& src1, double src2, const DevMem2D& dst, cudaStream_t stream);
template void min_gpu<char >(const DevMem2D_<char>& src1, double src2, const DevMem2D_<char>& dst, cudaStream_t stream);
template void min_gpu<ushort>(const DevMem2D_<ushort>& src1, double src2, const DevMem2D_<ushort>& dst, cudaStream_t stream);
template void min_gpu<short >(const DevMem2D_<short>& src1, double src2, const DevMem2D_<short>& dst, cudaStream_t stream);
template void min_gpu<int >(const DevMem2D_<int>& src1, double src2, const DevMem2D_<int>& dst, cudaStream_t stream);
template void min_gpu<float >(const DevMem2D_<float>& src1, double src2, const DevMem2D_<float>& dst, cudaStream_t stream);
template void min_gpu<double>(const DevMem2D_<double>& src1, double src2, const DevMem2D_<double>& dst, cudaStream_t stream);
template <typename T>
void max_gpu(const DevMem2D_<T>& src1, double src2, const DevMem2D_<T>& dst, cudaStream_t stream)
{
ScalarMaxOp op(src2);
transform(src1, dst, op, stream);
}
template void max_gpu<uchar >(const DevMem2D& src1, double src2, const DevMem2D& dst, cudaStream_t stream);
template void max_gpu<char >(const DevMem2D_<char>& src1, double src2, const DevMem2D_<char>& dst, cudaStream_t stream);
template void max_gpu<ushort>(const DevMem2D_<ushort>& src1, double src2, const DevMem2D_<ushort>& dst, cudaStream_t stream);
template void max_gpu<short >(const DevMem2D_<short>& src1, double src2, const DevMem2D_<short>& dst, cudaStream_t stream);
template void max_gpu<int >(const DevMem2D_<int>& src1, double src2, const DevMem2D_<int>& dst, cudaStream_t stream);
template void max_gpu<float >(const DevMem2D_<float>& src1, double src2, const DevMem2D_<float>& dst, cudaStream_t stream);
template void max_gpu<double>(const DevMem2D_<double>& src1, double src2, const DevMem2D_<double>& dst, cudaStream_t stream);
}}}
@@ -58,49 +58,6 @@ using namespace cv::gpu::device;
namespace cv { namespace gpu { namespace mathfunc
{
template <int size, typename T>
__device__ void sum_in_smem(volatile T* data, const uint tid)
{
T sum = data[tid];
if (size >= 512) { if (tid < 256) { data[tid] = sum = sum + data[tid + 256]; } __syncthreads(); }
if (size >= 256) { if (tid < 128) { data[tid] = sum = sum + data[tid + 128]; } __syncthreads(); }
if (size >= 128) { if (tid < 64) { data[tid] = sum = sum + data[tid + 64]; } __syncthreads(); }
if (tid < 32)
{
if (size >= 64) data[tid] = sum = sum + data[tid + 32];
if (size >= 32) data[tid] = sum = sum + data[tid + 16];
if (size >= 16) data[tid] = sum = sum + data[tid + 8];
if (size >= 8) data[tid] = sum = sum + data[tid + 4];
if (size >= 4) data[tid] = sum = sum + data[tid + 2];
if (size >= 2) data[tid] = sum = sum + data[tid + 1];
}
}
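// sum_in_smem above is a standard shared-memory tree reduction: each step
// halves the number of active threads, so with a 256-thread block the partial
// sums collapse 256 -> 128 -> 64 -> ... -> 1 and the block total ends up in
// data[0]. Once only the first warp remains active (tid < 32) the threads
// execute in lockstep on this generation of hardware, so no further
// __syncthreads() is needed; the volatile qualifier keeps each intermediate
// store visible to the other threads of the warp.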
struct Mask8U
{
explicit Mask8U(PtrStep mask): mask(mask) {}
__device__ bool operator()(int y, int x) const
{
return mask.ptr(y)[x];
}
PtrStep mask;
};
struct MaskTrue
{
__device__ bool operator()(int y, int x) const
{
return true;
}
};
struct Nothing
{
static __device__ void calc(int, int, float, float, float*, size_t, float)
@@ -259,1676 +216,42 @@ namespace cv { namespace gpu { namespace mathfunc
}
//////////////////////////////////////////////////////////////////////////////
// Min max
// To avoid shared memory bank conflicts we convert each value to a value of
// an appropriate type (at least 32 bits)
template <typename T> struct MinMaxTypeTraits {};
template <> struct MinMaxTypeTraits<uchar> { typedef int best_type; };
template <> struct MinMaxTypeTraits<char> { typedef int best_type; };
template <> struct MinMaxTypeTraits<ushort> { typedef int best_type; };
template <> struct MinMaxTypeTraits<short> { typedef int best_type; };
template <> struct MinMaxTypeTraits<int> { typedef int best_type; };
template <> struct MinMaxTypeTraits<float> { typedef float best_type; };
template <> struct MinMaxTypeTraits<double> { typedef double best_type; };
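// On these devices shared memory is organized as 16 banks of 32-bit words,
// so staging uchar/char/ushort/short data as int gives each value its own
// bank slot; storing the narrow types directly would pack several elements
// into one word and serialize accesses within a half-warp.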
namespace minmax
{
__constant__ int ctwidth;
__constant__ int ctheight;
// Global counter of blocks that have finished their work
__device__ uint blocks_finished = 0;
// Estimates good thread configuration
// - the threads variable satisfies threads.x * threads.y == 256
void estimate_thread_cfg(int cols, int rows, dim3& threads, dim3& grid)
{
threads = dim3(32, 8);
grid = dim3(divUp(cols, threads.x * 8), divUp(rows, threads.y * 32));
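// Capping the grid at 32x8 = 256 blocks bounds the partial-result buffer
// and matches the fixed 256-thread second reduction stage; each thread
// instead loops over several input tiles (ctwidth x ctheight elements)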
grid.x = min(grid.x, threads.x);
grid.y = min(grid.y, threads.y);
}
// Returns required buffer sizes
void get_buf_size_required(int cols, int rows, int elem_size, int& bufcols, int& bufrows)
{
dim3 threads, grid;
estimate_thread_cfg(cols, rows, threads, grid);
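// One partial result per block: row 0 of the buffer receives the per-block
// minima, row 1 the per-block maxima (see buf.ptr(0) / buf.ptr(1) in the
// callers below)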
bufcols = grid.x * grid.y * elem_size;
bufrows = 2;
}
// Computes and uploads the per-thread tile sizes (ctwidth, ctheight) used by the kernels for the specified thread configuration
void set_kernel_consts(int cols, int rows, const dim3& threads, const dim3& grid)
{
int twidth = divUp(divUp(cols, grid.x), threads.x);
int theight = divUp(divUp(rows, grid.y), threads.y);
cudaSafeCall(cudaMemcpyToSymbol(ctwidth, &twidth, sizeof(ctwidth)));
cudaSafeCall(cudaMemcpyToSymbol(ctheight, &theight, sizeof(ctheight)));
}
// Does min and max in shared memory
template <typename T>
__device__ void merge(uint tid, uint offset, volatile T* minval, volatile T* maxval)
{
minval[tid] = min(minval[tid], minval[tid + offset]);
maxval[tid] = max(maxval[tid], maxval[tid + offset]);
}
template <int size, typename T>
__device__ void find_min_max_in_smem(volatile T* minval, volatile T* maxval, const uint tid)
{
if (size >= 512) { if (tid < 256) { merge(tid, 256, minval, maxval); } __syncthreads(); }
if (size >= 256) { if (tid < 128) { merge(tid, 128, minval, maxval); } __syncthreads(); }
if (size >= 128) { if (tid < 64) { merge(tid, 64, minval, maxval); } __syncthreads(); }
if (tid < 32)
{
if (size >= 64) merge(tid, 32, minval, maxval);
if (size >= 32) merge(tid, 16, minval, maxval);
if (size >= 16) merge(tid, 8, minval, maxval);
if (size >= 8) merge(tid, 4, minval, maxval);
if (size >= 4) merge(tid, 2, minval, maxval);
if (size >= 2) merge(tid, 1, minval, maxval);
}
}
template <int nthreads, typename T, typename Mask>
__global__ void min_max_kernel(const DevMem2D src, Mask mask, T* minval, T* maxval)
{
typedef typename MinMaxTypeTraits<T>::best_type best_type;
__shared__ best_type sminval[nthreads];
__shared__ best_type smaxval[nthreads];
uint x0 = blockIdx.x * blockDim.x * ctwidth + threadIdx.x;
uint y0 = blockIdx.y * blockDim.y * ctheight + threadIdx.y;
uint tid = threadIdx.y * blockDim.x + threadIdx.x;
T mymin = numeric_limits_gpu<T>::max();
T mymax = numeric_limits_gpu<T>::is_signed ? -numeric_limits_gpu<T>::max() : numeric_limits_gpu<T>::min();
uint y_end = min(y0 + (ctheight - 1) * blockDim.y + 1, src.rows);
uint x_end = min(x0 + (ctwidth - 1) * blockDim.x + 1, src.cols);
for (uint y = y0; y < y_end; y += blockDim.y)
{
const T* src_row = (const T*)src.ptr(y);
for (uint x = x0; x < x_end; x += blockDim.x)
{
T val = src_row[x];
if (mask(y, x))
{
mymin = min(mymin, val);
mymax = max(mymax, val);
}
}
}
sminval[tid] = mymin;
smaxval[tid] = mymax;
__syncthreads();
find_min_max_in_smem<nthreads, best_type>(sminval, smaxval, tid);
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 110
__shared__ bool is_last;
if (tid == 0)
{
minval[blockIdx.y * gridDim.x + blockIdx.x] = (T)sminval[0];
maxval[blockIdx.y * gridDim.x + blockIdx.x] = (T)smaxval[0];
__threadfence();
uint ticket = atomicInc(&blocks_finished, gridDim.x * gridDim.y);
is_last = ticket == gridDim.x * gridDim.y - 1;
}
__syncthreads();
if (is_last)
{
uint idx = min(tid, gridDim.x * gridDim.y - 1);
sminval[tid] = minval[idx];
smaxval[tid] = maxval[idx];
__syncthreads();
find_min_max_in_smem<nthreads, best_type>(sminval, smaxval, tid);
if (tid == 0)
{
minval[0] = (T)sminval[0];
maxval[0] = (T)smaxval[0];
blocks_finished = 0;
}
}
#else
if (tid == 0)
{
minval[blockIdx.y * gridDim.x + blockIdx.x] = (T)sminval[0];
maxval[blockIdx.y * gridDim.x + blockIdx.x] = (T)smaxval[0];
}
#endif
}
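// The __CUDA_ARCH__ >= 110 path above performs the whole reduction in one
// launch: every block publishes its partial result and atomically takes a
// ticket from blocks_finished; the block drawing the last ticket re-reads
// all partial results, reduces them to the final min/max and resets the
// counter for the next call. Devices without global atomics stop after the
// per-block stage and finish in the separate min_max_pass2_kernel below.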
template <typename T>
void min_max_mask_caller(const DevMem2D src, const PtrStep mask, double* minval, double* maxval, PtrStep buf)
{
dim3 threads, grid;
estimate_thread_cfg(src.cols, src.rows, threads, grid);
set_kernel_consts(src.cols, src.rows, threads, grid);
T* minval_buf = (T*)buf.ptr(0);
T* maxval_buf = (T*)buf.ptr(1);
min_max_kernel<256, T, Mask8U><<<grid, threads>>>(src, Mask8U(mask), minval_buf, maxval_buf);
cudaSafeCall(cudaThreadSynchronize());
T minval_, maxval_;
cudaSafeCall(cudaMemcpy(&minval_, minval_buf, sizeof(T), cudaMemcpyDeviceToHost));
cudaSafeCall(cudaMemcpy(&maxval_, maxval_buf, sizeof(T), cudaMemcpyDeviceToHost));
*minval = minval_;
*maxval = maxval_;
}
template void min_max_mask_caller<uchar>(const DevMem2D, const PtrStep, double*, double*, PtrStep);
template void min_max_mask_caller<char>(const DevMem2D, const PtrStep, double*, double*, PtrStep);
template void min_max_mask_caller<ushort>(const DevMem2D, const PtrStep, double*, double*, PtrStep);
template void min_max_mask_caller<short>(const DevMem2D, const PtrStep, double*, double*, PtrStep);
template void min_max_mask_caller<int>(const DevMem2D, const PtrStep, double*, double*, PtrStep);
template void min_max_mask_caller<float>(const DevMem2D, const PtrStep, double*, double*, PtrStep);
template void min_max_mask_caller<double>(const DevMem2D, const PtrStep, double*, double*, PtrStep);
template <typename T>
void min_max_caller(const DevMem2D src, double* minval, double* maxval, PtrStep buf)
{
dim3 threads, grid;
estimate_thread_cfg(src.cols, src.rows, threads, grid);
set_kernel_consts(src.cols, src.rows, threads, grid);
T* minval_buf = (T*)buf.ptr(0);
T* maxval_buf = (T*)buf.ptr(1);
min_max_kernel<256, T, MaskTrue><<<grid, threads>>>(src, MaskTrue(), minval_buf, maxval_buf);
cudaSafeCall(cudaThreadSynchronize());
T minval_, maxval_;
cudaSafeCall(cudaMemcpy(&minval_, minval_buf, sizeof(T), cudaMemcpyDeviceToHost));
cudaSafeCall(cudaMemcpy(&maxval_, maxval_buf, sizeof(T), cudaMemcpyDeviceToHost));
*minval = minval_;
*maxval = maxval_;
}
template void min_max_caller<uchar>(const DevMem2D, double*, double*, PtrStep);
template void min_max_caller<char>(const DevMem2D, double*, double*, PtrStep);
template void min_max_caller<ushort>(const DevMem2D, double*, double*, PtrStep);
template void min_max_caller<short>(const DevMem2D, double*, double*, PtrStep);
template void min_max_caller<int>(const DevMem2D, double*, double*, PtrStep);
template void min_max_caller<float>(const DevMem2D, double*,double*, PtrStep);
template void min_max_caller<double>(const DevMem2D, double*, double*, PtrStep);
template <int nthreads, typename T>
__global__ void min_max_pass2_kernel(T* minval, T* maxval, int size)
{
typedef typename MinMaxTypeTraits<T>::best_type best_type;
__shared__ best_type sminval[nthreads];
__shared__ best_type smaxval[nthreads];
uint tid = threadIdx.y * blockDim.x + threadIdx.x;
uint idx = min(tid, uint(size) - 1); // clamp to the number of partial results actually written
sminval[tid] = minval[idx];
smaxval[tid] = maxval[idx];
__syncthreads();
find_min_max_in_smem<nthreads, best_type>(sminval, smaxval, tid);
if (tid == 0)
{
minval[0] = (T)sminval[0];
maxval[0] = (T)smaxval[0];
}
}
template <typename T>
void min_max_mask_multipass_caller(const DevMem2D src, const PtrStep mask, double* minval, double* maxval, PtrStep buf)
{
dim3 threads, grid;
estimate_thread_cfg(src.cols, src.rows, threads, grid);
set_kernel_consts(src.cols, src.rows, threads, grid);
T* minval_buf = (T*)buf.ptr(0);
T* maxval_buf = (T*)buf.ptr(1);
min_max_kernel<256, T, Mask8U><<<grid, threads>>>(src, Mask8U(mask), minval_buf, maxval_buf);
min_max_pass2_kernel<256, T><<<1, 256>>>(minval_buf, maxval_buf, grid.x * grid.y);
cudaSafeCall(cudaThreadSynchronize());
T minval_, maxval_;
cudaSafeCall(cudaMemcpy(&minval_, minval_buf, sizeof(T), cudaMemcpyDeviceToHost));
cudaSafeCall(cudaMemcpy(&maxval_, maxval_buf, sizeof(T), cudaMemcpyDeviceToHost));
*minval = minval_;
*maxval = maxval_;
}
template void min_max_mask_multipass_caller<uchar>(const DevMem2D, const PtrStep, double*, double*, PtrStep);
template void min_max_mask_multipass_caller<char>(const DevMem2D, const PtrStep, double*, double*, PtrStep);
template void min_max_mask_multipass_caller<ushort>(const DevMem2D, const PtrStep, double*, double*, PtrStep);
template void min_max_mask_multipass_caller<short>(const DevMem2D, const PtrStep, double*, double*, PtrStep);
template void min_max_mask_multipass_caller<int>(const DevMem2D, const PtrStep, double*, double*, PtrStep);
template void min_max_mask_multipass_caller<float>(const DevMem2D, const PtrStep, double*, double*, PtrStep);
template <typename T>
void min_max_multipass_caller(const DevMem2D src, double* minval, double* maxval, PtrStep buf)
{
dim3 threads, grid;
estimate_thread_cfg(src.cols, src.rows, threads, grid);
set_kernel_consts(src.cols, src.rows, threads, grid);
T* minval_buf = (T*)buf.ptr(0);
T* maxval_buf = (T*)buf.ptr(1);
min_max_kernel<256, T, MaskTrue><<<grid, threads>>>(src, MaskTrue(), minval_buf, maxval_buf);
min_max_pass2_kernel<256, T><<<1, 256>>>(minval_buf, maxval_buf, grid.x * grid.y);
cudaSafeCall(cudaThreadSynchronize());
T minval_, maxval_;
cudaSafeCall(cudaMemcpy(&minval_, minval_buf, sizeof(T), cudaMemcpyDeviceToHost));
cudaSafeCall(cudaMemcpy(&maxval_, maxval_buf, sizeof(T), cudaMemcpyDeviceToHost));
*minval = minval_;
*maxval = maxval_;
}
template void min_max_multipass_caller<uchar>(const DevMem2D, double*, double*, PtrStep);
template void min_max_multipass_caller<char>(const DevMem2D, double*, double*, PtrStep);
template void min_max_multipass_caller<ushort>(const DevMem2D, double*, double*, PtrStep);
template void min_max_multipass_caller<short>(const DevMem2D, double*, double*, PtrStep);
template void min_max_multipass_caller<int>(const DevMem2D, double*, double*, PtrStep);
template void min_max_multipass_caller<float>(const DevMem2D, double*, double*, PtrStep);
} // namespace minmax
///////////////////////////////////////////////////////////////////////////////
// minMaxLoc
namespace minmaxloc {
__constant__ int ctwidth;
__constant__ int ctheight;
// Global counter of blocks that have finished their work
__device__ uint blocks_finished = 0;
// Estimates good thread configuration
// - the threads variable satisfies threads.x * threads.y == 256
void estimate_thread_cfg(int cols, int rows, dim3& threads, dim3& grid)
{
threads = dim3(32, 8);
grid = dim3(divUp(cols, threads.x * 8), divUp(rows, threads.y * 32));
grid.x = min(grid.x, threads.x);
grid.y = min(grid.y, threads.y);
}
// Returns required buffer sizes
void get_buf_size_required(int cols, int rows, int elem_size, int& b1cols,
int& b1rows, int& b2cols, int& b2rows)
{
dim3 threads, grid;
estimate_thread_cfg(cols, rows, threads, grid);
b1cols = grid.x * grid.y * elem_size; // For values
b1rows = 2;
b2cols = grid.x * grid.y * sizeof(int); // For locations
b2rows = 2;
}
// Computes and uploads the per-thread tile sizes (ctwidth, ctheight) used by the kernels for the specified thread configuration
void set_kernel_consts(int cols, int rows, const dim3& threads, const dim3& grid)
{
int twidth = divUp(divUp(cols, grid.x), threads.x);
int theight = divUp(divUp(rows, grid.y), threads.y);
cudaSafeCall(cudaMemcpyToSymbol(ctwidth, &twidth, sizeof(ctwidth)));
cudaSafeCall(cudaMemcpyToSymbol(ctheight, &theight, sizeof(ctheight)));
}
template <typename T>
__device__ void merge(uint tid, uint offset, volatile T* minval, volatile T* maxval,
volatile uint* minloc, volatile uint* maxloc)
{
T val = minval[tid + offset];
if (val < minval[tid])
{
minval[tid] = val;
minloc[tid] = minloc[tid + offset];
}
val = maxval[tid + offset];
if (val > maxval[tid])
{
maxval[tid] = val;
maxloc[tid] = maxloc[tid + offset];
}
}
template <int size, typename T>
__device__ void find_min_max_loc_in_smem(volatile T* minval, volatile T* maxval, volatile uint* minloc,
volatile uint* maxloc, const uint tid)
{
if (size >= 512) { if (tid < 256) { merge(tid, 256, minval, maxval, minloc, maxloc); } __syncthreads(); }
if (size >= 256) { if (tid < 128) { merge(tid, 128, minval, maxval, minloc, maxloc); } __syncthreads(); }
if (size >= 128) { if (tid < 64) { merge(tid, 64, minval, maxval, minloc, maxloc); } __syncthreads(); }
if (tid < 32)
{
if (size >= 64) merge(tid, 32, minval, maxval, minloc, maxloc);
if (size >= 32) merge(tid, 16, minval, maxval, minloc, maxloc);
if (size >= 16) merge(tid, 8, minval, maxval, minloc, maxloc);
if (size >= 8) merge(tid, 4, minval, maxval, minloc, maxloc);
if (size >= 4) merge(tid, 2, minval, maxval, minloc, maxloc);
if (size >= 2) merge(tid, 1, minval, maxval, minloc, maxloc);
}
}
template <int nthreads, typename T, typename Mask>
__global__ void min_max_loc_kernel(const DevMem2D src, Mask mask, T* minval, T* maxval,
uint* minloc, uint* maxloc)
{
typedef typename MinMaxTypeTraits<T>::best_type best_type;
__shared__ best_type sminval[nthreads];
__shared__ best_type smaxval[nthreads];
__shared__ uint sminloc[nthreads];
__shared__ uint smaxloc[nthreads];
uint x0 = blockIdx.x * blockDim.x * ctwidth + threadIdx.x;
uint y0 = blockIdx.y * blockDim.y * ctheight + threadIdx.y;
uint tid = threadIdx.y * blockDim.x + threadIdx.x;
T mymin = numeric_limits_gpu<T>::max();
T mymax = numeric_limits_gpu<T>::is_signed ? -numeric_limits_gpu<T>::max() : numeric_limits_gpu<T>::min();
uint myminloc = 0;
uint mymaxloc = 0;
uint y_end = min(y0 + (ctheight - 1) * blockDim.y + 1, src.rows);
uint x_end = min(x0 + (ctwidth - 1) * blockDim.x + 1, src.cols);
for (uint y = y0; y < y_end; y += blockDim.y)
{
const T* ptr = (const T*)src.ptr(y);
for (uint x = x0; x < x_end; x += blockDim.x)
{
if (mask(y, x))
{
T val = ptr[x];
if (val <= mymin) { mymin = val; myminloc = y * src.cols + x; }
if (val >= mymax) { mymax = val; mymaxloc = y * src.cols + x; }
}
}
}
sminval[tid] = mymin;
smaxval[tid] = mymax;
sminloc[tid] = myminloc;
smaxloc[tid] = mymaxloc;
__syncthreads();
find_min_max_loc_in_smem<nthreads, best_type>(sminval, smaxval, sminloc, smaxloc, tid);
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 110
__shared__ bool is_last;
if (tid == 0)
{
minval[blockIdx.y * gridDim.x + blockIdx.x] = (T)sminval[0];
maxval[blockIdx.y * gridDim.x + blockIdx.x] = (T)smaxval[0];
minloc[blockIdx.y * gridDim.x + blockIdx.x] = sminloc[0];
maxloc[blockIdx.y * gridDim.x + blockIdx.x] = smaxloc[0];
__threadfence();
uint ticket = atomicInc(&blocks_finished, gridDim.x * gridDim.y);
is_last = ticket == gridDim.x * gridDim.y - 1;
}
__syncthreads();
if (is_last)
{
uint idx = min(tid, gridDim.x * gridDim.y - 1);
sminval[tid] = minval[idx];
smaxval[tid] = maxval[idx];
sminloc[tid] = minloc[idx];
smaxloc[tid] = maxloc[idx];
__syncthreads();
find_min_max_loc_in_smem<nthreads, best_type>(sminval, smaxval, sminloc, smaxloc, tid);
if (tid == 0)
{
minval[0] = (T)sminval[0];
maxval[0] = (T)smaxval[0];
minloc[0] = sminloc[0];
maxloc[0] = smaxloc[0];
blocks_finished = 0;
}
}
#else
if (tid == 0)
{
minval[blockIdx.y * gridDim.x + blockIdx.x] = (T)sminval[0];
maxval[blockIdx.y * gridDim.x + blockIdx.x] = (T)smaxval[0];
minloc[blockIdx.y * gridDim.x + blockIdx.x] = sminloc[0];
maxloc[blockIdx.y * gridDim.x + blockIdx.x] = smaxloc[0];
}
#endif
}
template <typename T>
void min_max_loc_mask_caller(const DevMem2D src, const PtrStep mask, double* minval, double* maxval,
int minloc[2], int maxloc[2], PtrStep valbuf, PtrStep locbuf)
{
dim3 threads, grid;
estimate_thread_cfg(src.cols, src.rows, threads, grid);
set_kernel_consts(src.cols, src.rows, threads, grid);
T* minval_buf = (T*)valbuf.ptr(0);
T* maxval_buf = (T*)valbuf.ptr(1);
uint* minloc_buf = (uint*)locbuf.ptr(0);
uint* maxloc_buf = (uint*)locbuf.ptr(1);
min_max_loc_kernel<256, T, Mask8U><<<grid, threads>>>(src, Mask8U(mask), minval_buf, maxval_buf, minloc_buf, maxloc_buf);
cudaSafeCall(cudaThreadSynchronize());
T minval_, maxval_;
cudaSafeCall(cudaMemcpy(&minval_, minval_buf, sizeof(T), cudaMemcpyDeviceToHost));
cudaSafeCall(cudaMemcpy(&maxval_, maxval_buf, sizeof(T), cudaMemcpyDeviceToHost));
*minval = minval_;
*maxval = maxval_;
uint minloc_, maxloc_;
cudaSafeCall(cudaMemcpy(&minloc_, minloc_buf, sizeof(int), cudaMemcpyDeviceToHost));
cudaSafeCall(cudaMemcpy(&maxloc_, maxloc_buf, sizeof(int), cudaMemcpyDeviceToHost));
minloc[1] = minloc_ / src.cols; minloc[0] = minloc_ - minloc[1] * src.cols;
maxloc[1] = maxloc_ / src.cols; maxloc[0] = maxloc_ - maxloc[1] * src.cols;
}
template void min_max_loc_mask_caller<uchar>(const DevMem2D, const PtrStep, double*, double*, int[2], int[2], PtrStep, PtrStep);
template void min_max_loc_mask_caller<char>(const DevMem2D, const PtrStep, double*, double*, int[2], int[2], PtrStep, PtrStep);
template void min_max_loc_mask_caller<ushort>(const DevMem2D, const PtrStep, double*, double*, int[2], int[2], PtrStep, PtrStep);
template void min_max_loc_mask_caller<short>(const DevMem2D, const PtrStep, double*, double*, int[2], int[2], PtrStep, PtrStep);
template void min_max_loc_mask_caller<int>(const DevMem2D, const PtrStep, double*, double*, int[2], int[2], PtrStep, PtrStep);
template void min_max_loc_mask_caller<float>(const DevMem2D, const PtrStep, double*, double*, int[2], int[2], PtrStep, PtrStep);
template void min_max_loc_mask_caller<double>(const DevMem2D, const PtrStep, double*, double*, int[2], int[2], PtrStep, PtrStep);
template <typename T>
void min_max_loc_caller(const DevMem2D src, double* minval, double* maxval,
int minloc[2], int maxloc[2], PtrStep valbuf, PtrStep locbuf)
{
dim3 threads, grid;
estimate_thread_cfg(src.cols, src.rows, threads, grid);
set_kernel_consts(src.cols, src.rows, threads, grid);
T* minval_buf = (T*)valbuf.ptr(0);
T* maxval_buf = (T*)valbuf.ptr(1);
uint* minloc_buf = (uint*)locbuf.ptr(0);
uint* maxloc_buf = (uint*)locbuf.ptr(1);
min_max_loc_kernel<256, T, MaskTrue><<<grid, threads>>>(src, MaskTrue(), minval_buf, maxval_buf, minloc_buf, maxloc_buf);
cudaSafeCall(cudaThreadSynchronize());
T minval_, maxval_;
cudaSafeCall(cudaMemcpy(&minval_, minval_buf, sizeof(T), cudaMemcpyDeviceToHost));
cudaSafeCall(cudaMemcpy(&maxval_, maxval_buf, sizeof(T), cudaMemcpyDeviceToHost));
*minval = minval_;
*maxval = maxval_;
uint minloc_, maxloc_;
cudaSafeCall(cudaMemcpy(&minloc_, minloc_buf, sizeof(int), cudaMemcpyDeviceToHost));
cudaSafeCall(cudaMemcpy(&maxloc_, maxloc_buf, sizeof(int), cudaMemcpyDeviceToHost));
minloc[1] = minloc_ / src.cols; minloc[0] = minloc_ - minloc[1] * src.cols;
maxloc[1] = maxloc_ / src.cols; maxloc[0] = maxloc_ - maxloc[1] * src.cols;
}
template void min_max_loc_caller<uchar>(const DevMem2D, double*, double*, int[2], int[2], PtrStep, PtrStep);
template void min_max_loc_caller<char>(const DevMem2D, double*, double*, int[2], int[2], PtrStep, PtrStep);
template void min_max_loc_caller<ushort>(const DevMem2D, double*, double*, int[2], int[2], PtrStep, PtrStep);
template void min_max_loc_caller<short>(const DevMem2D, double*, double*, int[2], int[2], PtrStep, PtrStep);
template void min_max_loc_caller<int>(const DevMem2D, double*, double*, int[2], int[2], PtrStep, PtrStep);
template void min_max_loc_caller<float>(const DevMem2D, double*, double*, int[2], int[2], PtrStep, PtrStep);
template void min_max_loc_caller<double>(const DevMem2D, double*, double*, int[2], int[2], PtrStep, PtrStep);
// This kernel will be used only when compute capability is 1.0
template <int nthreads, typename T>
__global__ void min_max_loc_pass2_kernel(T* minval, T* maxval, uint* minloc, uint* maxloc, int size)
{
typedef typename MinMaxTypeTraits<T>::best_type best_type;
__shared__ best_type sminval[nthreads];
__shared__ best_type smaxval[nthreads];
__shared__ uint sminloc[nthreads];
__shared__ uint smaxloc[nthreads];
uint tid = threadIdx.y * blockDim.x + threadIdx.x;
uint idx = min(tid, uint(size) - 1); // clamp to the number of partial results actually written
sminval[tid] = minval[idx];
smaxval[tid] = maxval[idx];
sminloc[tid] = minloc[idx];
smaxloc[tid] = maxloc[idx];
__syncthreads();
find_min_max_loc_in_smem<nthreads, best_type>(sminval, smaxval, sminloc, smaxloc, tid);
if (tid == 0)
{
minval[0] = (T)sminval[0];
maxval[0] = (T)smaxval[0];
minloc[0] = sminloc[0];
maxloc[0] = smaxloc[0];
}
}
template <typename T>
void min_max_loc_mask_multipass_caller(const DevMem2D src, const PtrStep mask, double* minval, double* maxval,
int minloc[2], int maxloc[2], PtrStep valbuf, PtrStep locbuf)
{
dim3 threads, grid;
estimate_thread_cfg(src.cols, src.rows, threads, grid);
set_kernel_consts(src.cols, src.rows, threads, grid);
T* minval_buf = (T*)valbuf.ptr(0);
T* maxval_buf = (T*)valbuf.ptr(1);
uint* minloc_buf = (uint*)locbuf.ptr(0);
uint* maxloc_buf = (uint*)locbuf.ptr(1);
min_max_loc_kernel<256, T, Mask8U><<<grid, threads>>>(src, Mask8U(mask), minval_buf, maxval_buf, minloc_buf, maxloc_buf);
min_max_loc_pass2_kernel<256, T><<<1, 256>>>(minval_buf, maxval_buf, minloc_buf, maxloc_buf, grid.x * grid.y);
cudaSafeCall(cudaThreadSynchronize());
T minval_, maxval_;
cudaSafeCall(cudaMemcpy(&minval_, minval_buf, sizeof(T), cudaMemcpyDeviceToHost));
cudaSafeCall(cudaMemcpy(&maxval_, maxval_buf, sizeof(T), cudaMemcpyDeviceToHost));
*minval = minval_;
*maxval = maxval_;
uint minloc_, maxloc_;
cudaSafeCall(cudaMemcpy(&minloc_, minloc_buf, sizeof(int), cudaMemcpyDeviceToHost));
cudaSafeCall(cudaMemcpy(&maxloc_, maxloc_buf, sizeof(int), cudaMemcpyDeviceToHost));
minloc[1] = minloc_ / src.cols; minloc[0] = minloc_ - minloc[1] * src.cols;
maxloc[1] = maxloc_ / src.cols; maxloc[0] = maxloc_ - maxloc[1] * src.cols;
}
template void min_max_loc_mask_multipass_caller<uchar>(const DevMem2D, const PtrStep, double*, double*, int[2], int[2], PtrStep, PtrStep);
template void min_max_loc_mask_multipass_caller<char>(const DevMem2D, const PtrStep, double*, double*, int[2], int[2], PtrStep, PtrStep);
template void min_max_loc_mask_multipass_caller<ushort>(const DevMem2D, const PtrStep, double*, double*, int[2], int[2], PtrStep, PtrStep);
template void min_max_loc_mask_multipass_caller<short>(const DevMem2D, const PtrStep, double*, double*, int[2], int[2], PtrStep, PtrStep);
template void min_max_loc_mask_multipass_caller<int>(const DevMem2D, const PtrStep, double*, double*, int[2], int[2], PtrStep, PtrStep);
template void min_max_loc_mask_multipass_caller<float>(const DevMem2D, const PtrStep, double*, double*, int[2], int[2], PtrStep, PtrStep);
template <typename T>
void min_max_loc_multipass_caller(const DevMem2D src, double* minval, double* maxval,
int minloc[2], int maxloc[2], PtrStep valbuf, PtrStep locbuf)
{
dim3 threads, grid;
estimate_thread_cfg(src.cols, src.rows, threads, grid);
set_kernel_consts(src.cols, src.rows, threads, grid);
T* minval_buf = (T*)valbuf.ptr(0);
T* maxval_buf = (T*)valbuf.ptr(1);
uint* minloc_buf = (uint*)locbuf.ptr(0);
uint* maxloc_buf = (uint*)locbuf.ptr(1);
min_max_loc_kernel<256, T, MaskTrue><<<grid, threads>>>(src, MaskTrue(), minval_buf, maxval_buf, minloc_buf, maxloc_buf);
min_max_loc_pass2_kernel<256, T><<<1, 256>>>(minval_buf, maxval_buf, minloc_buf, maxloc_buf, grid.x * grid.y);
cudaSafeCall(cudaThreadSynchronize());
T minval_, maxval_;
cudaSafeCall(cudaMemcpy(&minval_, minval_buf, sizeof(T), cudaMemcpyDeviceToHost));
cudaSafeCall(cudaMemcpy(&maxval_, maxval_buf, sizeof(T), cudaMemcpyDeviceToHost));
*minval = minval_;
*maxval = maxval_;
uint minloc_, maxloc_;
cudaSafeCall(cudaMemcpy(&minloc_, minloc_buf, sizeof(int), cudaMemcpyDeviceToHost));
cudaSafeCall(cudaMemcpy(&maxloc_, maxloc_buf, sizeof(int), cudaMemcpyDeviceToHost));
minloc[1] = minloc_ / src.cols; minloc[0] = minloc_ - minloc[1] * src.cols;
maxloc[1] = maxloc_ / src.cols; maxloc[0] = maxloc_ - maxloc[1] * src.cols;
}
template void min_max_loc_multipass_caller<uchar>(const DevMem2D, double*, double*, int[2], int[2], PtrStep, PtrStep);
template void min_max_loc_multipass_caller<char>(const DevMem2D, double*, double*, int[2], int[2], PtrStep, PtrStep);
template void min_max_loc_multipass_caller<ushort>(const DevMem2D, double*, double*, int[2], int[2], PtrStep, PtrStep);
template void min_max_loc_multipass_caller<short>(const DevMem2D, double*, double*, int[2], int[2], PtrStep, PtrStep);
template void min_max_loc_multipass_caller<int>(const DevMem2D, double*, double*, int[2], int[2], PtrStep, PtrStep);
template void min_max_loc_multipass_caller<float>(const DevMem2D, double*, double*, int[2], int[2], PtrStep, PtrStep);
} // namespace minmaxloc
//////////////////////////////////////////////////////////////////////////////////////////////////////////
// countNonZero
namespace countnonzero
{
__constant__ int ctwidth;
__constant__ int ctheight;
__device__ uint blocks_finished = 0;
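// Estimates a good thread configuration
// - the threads variable satisfies threads.x * threads.y == 256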
void estimate_thread_cfg(int cols, int rows, dim3& threads, dim3& grid)
{
threads = dim3(32, 8);
grid = dim3(divUp(cols, threads.x * 8), divUp(rows, threads.y * 32));
grid.x = min(grid.x, threads.x);
grid.y = min(grid.y, threads.y);
}
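// Returns required buffer sizes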
void get_buf_size_required(int cols, int rows, int& bufcols, int& bufrows)
{
dim3 threads, grid;
estimate_thread_cfg(cols, rows, threads, grid);
bufcols = grid.x * grid.y * sizeof(int);
bufrows = 1;
}
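// Computes the device constants used by the kernels for the specified thread configuration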
void set_kernel_consts(int cols, int rows, const dim3& threads, const dim3& grid)
{
int twidth = divUp(divUp(cols, grid.x), threads.x);
int theight = divUp(divUp(rows, grid.y), threads.y);
cudaSafeCall(cudaMemcpyToSymbol(ctwidth, &twidth, sizeof(twidth)));
cudaSafeCall(cudaMemcpyToSymbol(ctheight, &theight, sizeof(theight)));
}
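// Counts non-zero elements: each block reduces its tile to a partial count;
// on CC >= 1.1 the last block to finish also reduces the per-block counts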
template <int nthreads, typename T>
__global__ void count_non_zero_kernel(const DevMem2D src, volatile uint* count)
{
__shared__ uint scount[nthreads];
uint x0 = blockIdx.x * blockDim.x * ctwidth + threadIdx.x;
uint y0 = blockIdx.y * blockDim.y * ctheight + threadIdx.y;
uint tid = threadIdx.y * blockDim.x + threadIdx.x;
uint cnt = 0;
for (uint y = 0; y < ctheight && y0 + y * blockDim.y < src.rows; ++y)
{
const T* ptr = (const T*)src.ptr(y0 + y * blockDim.y);
for (uint x = 0; x < ctwidth && x0 + x * blockDim.x < src.cols; ++x)
cnt += ptr[x0 + x * blockDim.x] != 0;
}
scount[tid] = cnt;
__syncthreads();
sum_in_smem<nthreads, uint>(scount, tid);
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 110
__shared__ bool is_last;
if (tid == 0)
{
count[blockIdx.y * gridDim.x + blockIdx.x] = scount[0];
__threadfence();
uint ticket = atomicInc(&blocks_finished, gridDim.x * gridDim.y);
is_last = ticket == gridDim.x * gridDim.y - 1;
}
__syncthreads();
if (is_last)
{
scount[tid] = tid < gridDim.x * gridDim.y ? count[tid] : 0;
__syncthreads();
sum_in_smem<nthreads, uint>(scount, tid);
if (tid == 0)
{
count[0] = scount[0];
blocks_finished = 0;
}
}
#else
if (tid == 0) count[blockIdx.y * gridDim.x + blockIdx.x] = scount[0];
#endif
}
template <typename T>
int count_non_zero_caller(const DevMem2D src, PtrStep buf)
{
dim3 threads, grid;
estimate_thread_cfg(src.cols, src.rows, threads, grid);
set_kernel_consts(src.cols, src.rows, threads, grid);
uint* count_buf = (uint*)buf.ptr(0);
count_non_zero_kernel<256, T><<<grid, threads>>>(src, count_buf);
cudaSafeCall(cudaThreadSynchronize());
uint count;
cudaSafeCall(cudaMemcpy(&count, count_buf, sizeof(int), cudaMemcpyDeviceToHost));
return count;
}
template int count_non_zero_caller<uchar>(const DevMem2D, PtrStep);
template int count_non_zero_caller<char>(const DevMem2D, PtrStep);
template int count_non_zero_caller<ushort>(const DevMem2D, PtrStep);
template int count_non_zero_caller<short>(const DevMem2D, PtrStep);
template int count_non_zero_caller<int>(const DevMem2D, PtrStep);
template int count_non_zero_caller<float>(const DevMem2D, PtrStep);
template int count_non_zero_caller<double>(const DevMem2D, PtrStep);
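// Final reduction of the per-block counts; used on devices without global atomics (CC 1.0)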
template <int nthreads, typename T>
__global__ void count_non_zero_pass2_kernel(uint* count, int size)
{
__shared__ uint scount[nthreads];
uint tid = threadIdx.y * blockDim.x + threadIdx.x;
scount[tid] = tid < size ? count[tid] : 0;
__syncthreads();
sum_in_smem<nthreads, uint>(scount, tid);
if (tid == 0)
count[0] = scount[0];
}
template <typename T>
int count_non_zero_multipass_caller(const DevMem2D src, PtrStep buf)
{
dim3 threads, grid;
estimate_thread_cfg(src.cols, src.rows, threads, grid);
set_kernel_consts(src.cols, src.rows, threads, grid);
uint* count_buf = (uint*)buf.ptr(0);
count_non_zero_kernel<256, T><<<grid, threads>>>(src, count_buf);
count_non_zero_pass2_kernel<256, T><<<1, 256>>>(count_buf, grid.x * grid.y);
cudaSafeCall(cudaThreadSynchronize());
uint count;
cudaSafeCall(cudaMemcpy(&count, count_buf, sizeof(int), cudaMemcpyDeviceToHost));
return count;
}
template int count_non_zero_multipass_caller<uchar>(const DevMem2D, PtrStep);
template int count_non_zero_multipass_caller<char>(const DevMem2D, PtrStep);
template int count_non_zero_multipass_caller<ushort>(const DevMem2D, PtrStep);
template int count_non_zero_multipass_caller<short>(const DevMem2D, PtrStep);
template int count_non_zero_multipass_caller<int>(const DevMem2D, PtrStep);
template int count_non_zero_multipass_caller<float>(const DevMem2D, PtrStep);
} // namespace countnonzero
//////////////////////////////////////////////////////////////////////////////////////////////////////////
// transpose
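// Transposes a 32-bit matrix through a padded shared-memory tile (16 x 17 ints, to avoid bank conflicts)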
__global__ void transpose(const DevMem2Di src, PtrStepi dst)
{
__shared__ int s_mem[16 * 17];
int x = blockIdx.x * blockDim.x + threadIdx.x;
int y = blockIdx.y * blockDim.y + threadIdx.y;
int smem_idx = threadIdx.y * blockDim.x + threadIdx.x + threadIdx.y;
if (y < src.rows && x < src.cols)
{
s_mem[smem_idx] = src.ptr(y)[x];
}
__syncthreads();
smem_idx = threadIdx.x * blockDim.x + threadIdx.y + threadIdx.x;
x = blockIdx.y * blockDim.x + threadIdx.x;
y = blockIdx.x * blockDim.y + threadIdx.y;
if (y < src.cols && x < src.rows)
{
dst.ptr(y)[x] = s_mem[smem_idx];
}
}
void transpose_gpu(const DevMem2Di& src, const DevMem2Di& dst)
{
dim3 threads(16, 16, 1);
dim3 grid(divUp(src.cols, 16), divUp(src.rows, 16), 1);
transpose<<<grid, threads>>>(src, dst);
cudaSafeCall( cudaThreadSynchronize() );
}
//////////////////////////////////////////////////////////////////////////////////////////////////////////
// min/max
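// Binary functors for per-element min/max; the float and double overloads use fmin/fmax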
struct MinOp
{
template <typename T>
__device__ T operator()(T a, T b)
{
return min(a, b);
}
__device__ float operator()(float a, float b)
{
return fmin(a, b);
}
__device__ double operator()(double a, double b)
{
return fmin(a, b);
}
};
struct MaxOp
{
template <typename T>
__device__ T operator()(T a, T b)
{
return max(a, b);
}
__device__ float operator()(float a, float b)
{
return fmax(a, b);
}
__device__ double operator()(double a, double b)
{
return fmax(a, b);
}
};
struct ScalarMinOp
{
double s;
explicit ScalarMinOp(double s_) : s(s_) {}
template <typename T>
__device__ T operator()(T a)
{
return saturate_cast<T>(fmin((double)a, s));
}
};
struct ScalarMaxOp
{
double s;
explicit ScalarMaxOp(double s_) : s(s_) {}
template <typename T>
__device__ T operator()(T a)
{
return saturate_cast<T>(fmax((double)a, s));
}
};
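// Per-element minimum of two arrays, expressed through the generic transform() wrapper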
template <typename T>
void min_gpu(const DevMem2D_<T>& src1, const DevMem2D_<T>& src2, const DevMem2D_<T>& dst, cudaStream_t stream)
{
MinOp op;
transform(src1, src2, dst, op, stream);
}
template void min_gpu<uchar >(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, cudaStream_t stream);
template void min_gpu<char >(const DevMem2D_<char>& src1, const DevMem2D_<char>& src2, const DevMem2D_<char>& dst, cudaStream_t stream);
template void min_gpu<ushort>(const DevMem2D_<ushort>& src1, const DevMem2D_<ushort>& src2, const DevMem2D_<ushort>& dst, cudaStream_t stream);
template void min_gpu<short >(const DevMem2D_<short>& src1, const DevMem2D_<short>& src2, const DevMem2D_<short>& dst, cudaStream_t stream);
template void min_gpu<int >(const DevMem2D_<int>& src1, const DevMem2D_<int>& src2, const DevMem2D_<int>& dst, cudaStream_t stream);
template void min_gpu<float >(const DevMem2D_<float>& src1, const DevMem2D_<float>& src2, const DevMem2D_<float>& dst, cudaStream_t stream);
template void min_gpu<double>(const DevMem2D_<double>& src1, const DevMem2D_<double>& src2, const DevMem2D_<double>& dst, cudaStream_t stream);
template <typename T>
void max_gpu(const DevMem2D_<T>& src1, const DevMem2D_<T>& src2, const DevMem2D_<T>& dst, cudaStream_t stream)
{
MaxOp op;
transform(src1, src2, dst, op, stream);
}
template void max_gpu<uchar >(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, cudaStream_t stream);
template void max_gpu<char >(const DevMem2D_<char>& src1, const DevMem2D_<char>& src2, const DevMem2D_<char>& dst, cudaStream_t stream);
template void max_gpu<ushort>(const DevMem2D_<ushort>& src1, const DevMem2D_<ushort>& src2, const DevMem2D_<ushort>& dst, cudaStream_t stream);
template void max_gpu<short >(const DevMem2D_<short>& src1, const DevMem2D_<short>& src2, const DevMem2D_<short>& dst, cudaStream_t stream);
template void max_gpu<int >(const DevMem2D_<int>& src1, const DevMem2D_<int>& src2, const DevMem2D_<int>& dst, cudaStream_t stream);
template void max_gpu<float >(const DevMem2D_<float>& src1, const DevMem2D_<float>& src2, const DevMem2D_<float>& dst, cudaStream_t stream);
template void max_gpu<double>(const DevMem2D_<double>& src1, const DevMem2D_<double>& src2, const DevMem2D_<double>& dst, cudaStream_t stream);
template <typename T>
void min_gpu(const DevMem2D_<T>& src1, double src2, const DevMem2D_<T>& dst, cudaStream_t stream)
{
ScalarMinOp op(src2);
transform(src1, dst, op, stream);
}
template void min_gpu<uchar >(const DevMem2D& src1, double src2, const DevMem2D& dst, cudaStream_t stream);
template void min_gpu<char >(const DevMem2D_<char>& src1, double src2, const DevMem2D_<char>& dst, cudaStream_t stream);
template void min_gpu<ushort>(const DevMem2D_<ushort>& src1, double src2, const DevMem2D_<ushort>& dst, cudaStream_t stream);
template void min_gpu<short >(const DevMem2D_<short>& src1, double src2, const DevMem2D_<short>& dst, cudaStream_t stream);
template void min_gpu<int >(const DevMem2D_<int>& src1, double src2, const DevMem2D_<int>& dst, cudaStream_t stream);
template void min_gpu<float >(const DevMem2D_<float>& src1, double src2, const DevMem2D_<float>& dst, cudaStream_t stream);
template void min_gpu<double>(const DevMem2D_<double>& src1, double src2, const DevMem2D_<double>& dst, cudaStream_t stream);
template <typename T>
void max_gpu(const DevMem2D_<T>& src1, double src2, const DevMem2D_<T>& dst, cudaStream_t stream)
{
ScalarMaxOp op(src2);
transform(src1, dst, op, stream);
}
template void max_gpu<uchar >(const DevMem2D& src1, double src2, const DevMem2D& dst, cudaStream_t stream);
template void max_gpu<char >(const DevMem2D_<char>& src1, double src2, const DevMem2D_<char>& dst, cudaStream_t stream);
template void max_gpu<ushort>(const DevMem2D_<ushort>& src1, double src2, const DevMem2D_<ushort>& dst, cudaStream_t stream);
template void max_gpu<short >(const DevMem2D_<short>& src1, double src2, const DevMem2D_<short>& dst, cudaStream_t stream);
template void max_gpu<int >(const DevMem2D_<int>& src1, double src2, const DevMem2D_<int>& dst, cudaStream_t stream);
template void max_gpu<float >(const DevMem2D_<float>& src1, double src2, const DevMem2D_<float>& dst, cudaStream_t stream);
template void max_gpu<double>(const DevMem2D_<double>& src1, double src2, const DevMem2D_<double>& dst, cudaStream_t stream);
//////////////////////////////////////////////////////////////////////////////
// Sum
namespace sum
{
template <typename T> struct SumType {};
template <> struct SumType<uchar> { typedef uint R; };
template <> struct SumType<char> { typedef int R; };
template <> struct SumType<ushort> { typedef uint R; };
template <> struct SumType<short> { typedef int R; };
template <> struct SumType<int> { typedef int R; };
template <> struct SumType<float> { typedef float R; };
template <> struct SumType<double> { typedef double R; };
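// Per-element operation applied before accumulation: identity for sum(), squaring for sqrSum()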
template <typename R>
struct IdentityOp { static __device__ R call(R x) { return x; } };
template <typename R>
struct SqrOp { static __device__ R call(R x) { return x * x; } };
__constant__ int ctwidth;
__constant__ int ctheight;
__device__ uint blocks_finished = 0;
const int threads_x = 32;
const int threads_y = 8;
void estimate_thread_cfg(int cols, int rows, dim3& threads, dim3& grid)
{
threads = dim3(threads_x, threads_y);
grid = dim3(divUp(cols, threads.x * threads.y),
divUp(rows, threads.y * threads.x));
grid.x = min(grid.x, threads.x);
grid.y = min(grid.y, threads.y);
}
void get_buf_size_required(int cols, int rows, int cn, int& bufcols, int& bufrows)
{
dim3 threads, grid;
estimate_thread_cfg(cols, rows, threads, grid);
bufcols = grid.x * grid.y * sizeof(double) * cn;
bufrows = 1;
}
void set_kernel_consts(int cols, int rows, const dim3& threads, const dim3& grid)
{
int twidth = divUp(divUp(cols, grid.x), threads.x);
int theight = divUp(divUp(rows, grid.y), threads.y);
cudaSafeCall(cudaMemcpyToSymbol(ctwidth, &twidth, sizeof(twidth)));
cudaSafeCall(cudaMemcpyToSymbol(ctheight, &theight, sizeof(theight)));
}
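// Sums single-channel data: each block reduces its tile in shared memory;
// on CC >= 1.1 the last block to finish also reduces the per-block partial sums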
template <typename T, typename R, typename Op, int nthreads>
__global__ void sum_kernel(const DevMem2D src, R* result)
{
__shared__ R smem[nthreads];
const int x0 = blockIdx.x * blockDim.x * ctwidth + threadIdx.x;
const int y0 = blockIdx.y * blockDim.y * ctheight + threadIdx.y;
const int tid = threadIdx.y * blockDim.x + threadIdx.x;
const int bid = blockIdx.y * gridDim.x + blockIdx.x;
R sum = 0;
for (int y = 0; y < ctheight && y0 + y * blockDim.y < src.rows; ++y)
{
const T* ptr = (const T*)src.ptr(y0 + y * blockDim.y);
for (int x = 0; x < ctwidth && x0 + x * blockDim.x < src.cols; ++x)
sum += Op::call(ptr[x0 + x * blockDim.x]);
}
smem[tid] = sum;
__syncthreads();
sum_in_smem<nthreads, R>(smem, tid);
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 110
__shared__ bool is_last;
if (tid == 0)
{
result[bid] = smem[0];
__threadfence();
uint ticket = atomicInc(&blocks_finished, gridDim.x * gridDim.y);
is_last = (ticket == gridDim.x * gridDim.y - 1);
}
__syncthreads();
if (is_last)
{
smem[tid] = tid < gridDim.x * gridDim.y ? result[tid] : 0;
__syncthreads();
sum_in_smem<nthreads, R>(smem, tid);
if (tid == 0)
{
result[0] = smem[0];
blocks_finished = 0;
}
}
#else
if (tid == 0) result[bid] = smem[0];
#endif
}
template <typename T, typename R, int nthreads>
__global__ void sum_pass2_kernel(R* result, int size)
{
__shared__ R smem[nthreads];
int tid = threadIdx.y * blockDim.x + threadIdx.x;
smem[tid] = tid < size ? result[tid] : 0;
__syncthreads();
sum_in_smem<nthreads, R>(smem, tid);
if (tid == 0)
result[0] = smem[0];
}
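// Two-channel variant: the x and y components are reduced in separate halves of shared memory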
template <typename T, typename R, typename Op, int nthreads>
__global__ void sum_kernel_C2(const DevMem2D src, typename TypeVec<R, 2>::vec_t* result)
{
typedef typename TypeVec<T, 2>::vec_t SrcType;
typedef typename TypeVec<R, 2>::vec_t DstType;
__shared__ R smem[nthreads * 2];
const int x0 = blockIdx.x * blockDim.x * ctwidth + threadIdx.x;
const int y0 = blockIdx.y * blockDim.y * ctheight + threadIdx.y;
const int tid = threadIdx.y * blockDim.x + threadIdx.x;
const int bid = blockIdx.y * gridDim.x + blockIdx.x;
SrcType val;
DstType sum = VecTraits<DstType>::all(0);
for (int y = 0; y < ctheight && y0 + y * blockDim.y < src.rows; ++y)
{
const SrcType* ptr = (const SrcType*)src.ptr(y0 + y * blockDim.y);
for (int x = 0; x < ctwidth && x0 + x * blockDim.x < src.cols; ++x)
{
val = ptr[x0 + x * blockDim.x];
sum = sum + VecTraits<DstType>::make(Op::call(val.x), Op::call(val.y));
}
}
smem[tid] = sum.x;
smem[tid + nthreads] = sum.y;
__syncthreads();
sum_in_smem<nthreads, R>(smem, tid);
sum_in_smem<nthreads, R>(smem + nthreads, tid);
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 110
__shared__ bool is_last;
if (tid == 0)
{
DstType res;
res.x = smem[0];
res.y = smem[nthreads];
result[bid] = res;
__threadfence();
uint ticket = atomicInc(&blocks_finished, gridDim.x * gridDim.y);
is_last = (ticket == gridDim.x * gridDim.y - 1);
}
__syncthreads();
if (is_last)
{
DstType res = tid < gridDim.x * gridDim.y ? result[tid] : VecTraits<DstType>::all(0);
smem[tid] = res.x;
smem[tid + nthreads] = res.y;
__syncthreads();
sum_in_smem<nthreads, R>(smem, tid);
sum_in_smem<nthreads, R>(smem + nthreads, tid);
if (tid == 0)
{
res.x = smem[0];
res.y = smem[nthreads];
result[0] = res;
blocks_finished = 0;
}
}
#else
if (tid == 0)
{
DstType res;
res.x = smem[0];
res.y = smem[nthreads];
result[bid] = res;
}
#endif
}
template <typename T, typename R, int nthreads>
__global__ void sum_pass2_kernel_C2(typename TypeVec<R, 2>::vec_t* result, int size)
{
typedef typename TypeVec<R, 2>::vec_t DstType;
__shared__ R smem[nthreads * 2];
const int tid = threadIdx.y * blockDim.x + threadIdx.x;
DstType res = tid < size ? result[tid] : VecTraits<DstType>::all(0);
smem[tid] = res.x;
smem[tid + nthreads] = res.y;
__syncthreads();
sum_in_smem<nthreads, R>(smem, tid);
sum_in_smem<nthreads, R>(smem + nthreads, tid);
if (tid == 0)
{
res.x = smem[0];
res.y = smem[nthreads];
result[0] = res;
}
}
template <typename T, typename R, typename Op, int nthreads>
__global__ void sum_kernel_C3(const DevMem2D src, typename TypeVec<R, 3>::vec_t* result)
{
typedef typename TypeVec<T, 3>::vec_t SrcType;
typedef typename TypeVec<R, 3>::vec_t DstType;
__shared__ R smem[nthreads * 3];
const int x0 = blockIdx.x * blockDim.x * ctwidth + threadIdx.x;
const int y0 = blockIdx.y * blockDim.y * ctheight + threadIdx.y;
const int tid = threadIdx.y * blockDim.x + threadIdx.x;
const int bid = blockIdx.y * gridDim.x + blockIdx.x;
SrcType val;
DstType sum = VecTraits<DstType>::all(0);
for (int y = 0; y < ctheight && y0 + y * blockDim.y < src.rows; ++y)
{
const SrcType* ptr = (const SrcType*)src.ptr(y0 + y * blockDim.y);
for (int x = 0; x < ctwidth && x0 + x * blockDim.x < src.cols; ++x)
{
val = ptr[x0 + x * blockDim.x];
sum = sum + VecTraits<DstType>::make(Op::call(val.x), Op::call(val.y), Op::call(val.z));
}
}
smem[tid] = sum.x;
smem[tid + nthreads] = sum.y;
smem[tid + 2 * nthreads] = sum.z;
__syncthreads();
sum_in_smem<nthreads, R>(smem, tid);
sum_in_smem<nthreads, R>(smem + nthreads, tid);
sum_in_smem<nthreads, R>(smem + 2 * nthreads, tid);
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 110
__shared__ bool is_last;
if (tid == 0)
{
DstType res;
res.x = smem[0];
res.y = smem[nthreads];
res.z = smem[2 * nthreads];
result[bid] = res;
__threadfence();
uint ticket = atomicInc(&blocks_finished, gridDim.x * gridDim.y);
is_last = (ticket == gridDim.x * gridDim.y - 1);
}
__syncthreads();
if (is_last)
{
DstType res = tid < gridDim.x * gridDim.y ? result[tid] : VecTraits<DstType>::all(0);
smem[tid] = res.x;
smem[tid + nthreads] = res.y;
smem[tid + 2 * nthreads] = res.z;
__syncthreads();
sum_in_smem<nthreads, R>(smem, tid);
sum_in_smem<nthreads, R>(smem + nthreads, tid);
sum_in_smem<nthreads, R>(smem + 2 * nthreads, tid);
if (tid == 0)
{
res.x = smem[0];
res.y = smem[nthreads];
res.z = smem[2 * nthreads];
result[0] = res;
blocks_finished = 0;
}
}
#else
if (tid == 0)
{
DstType res;
res.x = smem[0];
res.y = smem[nthreads];
res.z = smem[2 * nthreads];
result[bid] = res;
}
#endif
}
template <typename T, typename R, int nthreads>
__global__ void sum_pass2_kernel_C3(typename TypeVec<R, 3>::vec_t* result, int size)
{
typedef typename TypeVec<R, 3>::vec_t DstType;
__shared__ R smem[nthreads * 3];
const int tid = threadIdx.y * blockDim.x + threadIdx.x;
DstType res = tid < size ? result[tid] : VecTraits<DstType>::all(0);
smem[tid] = res.x;
smem[tid + nthreads] = res.y;
smem[tid + 2 * nthreads] = res.z;
__syncthreads();
sum_in_smem<nthreads, R>(smem, tid);
sum_in_smem<nthreads, R>(smem + nthreads, tid);
sum_in_smem<nthreads, R>(smem + 2 * nthreads, tid);
if (tid == 0)
{
res.x = smem[0];
res.y = smem[nthreads];
res.z = smem[2 * nthreads];
result[0] = res;
}
}
template <typename T, typename R, typename Op, int nthreads>
__global__ void sum_kernel_C4(const DevMem2D src, typename TypeVec<R, 4>::vec_t* result)
{
typedef typename TypeVec<T, 4>::vec_t SrcType;
typedef typename TypeVec<R, 4>::vec_t DstType;
__shared__ R smem[nthreads * 4];
const int x0 = blockIdx.x * blockDim.x * ctwidth + threadIdx.x;
const int y0 = blockIdx.y * blockDim.y * ctheight + threadIdx.y;
const int tid = threadIdx.y * blockDim.x + threadIdx.x;
const int bid = blockIdx.y * gridDim.x + blockIdx.x;
SrcType val;
DstType sum = VecTraits<DstType>::all(0);
for (int y = 0; y < ctheight && y0 + y * blockDim.y < src.rows; ++y)
{
const SrcType* ptr = (const SrcType*)src.ptr(y0 + y * blockDim.y);
for (int x = 0; x < ctwidth && x0 + x * blockDim.x < src.cols; ++x)
{
val = ptr[x0 + x * blockDim.x];
sum = sum + VecTraits<DstType>::make(Op::call(val.x), Op::call(val.y),
Op::call(val.z), Op::call(val.w));
}
}
smem[tid] = sum.x;
smem[tid + nthreads] = sum.y;
smem[tid + 2 * nthreads] = sum.z;
smem[tid + 3 * nthreads] = sum.w;
__syncthreads();
sum_in_smem<nthreads, R>(smem, tid);
sum_in_smem<nthreads, R>(smem + nthreads, tid);
sum_in_smem<nthreads, R>(smem + 2 * nthreads, tid);
sum_in_smem<nthreads, R>(smem + 3 * nthreads, tid);
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 110
__shared__ bool is_last;
if (tid == 0)
{
DstType res;
res.x = smem[0];
res.y = smem[nthreads];
res.z = smem[2 * nthreads];
res.w = smem[3 * nthreads];
result[bid] = res;
__threadfence();
uint ticket = atomicInc(&blocks_finished, gridDim.x * gridDim.y);
is_last = (ticket == gridDim.x * gridDim.y - 1);
}
__syncthreads();
if (is_last)
{
DstType res = tid < gridDim.x * gridDim.y ? result[tid] : VecTraits<DstType>::all(0);
smem[tid] = res.x;
smem[tid + nthreads] = res.y;
smem[tid + 2 * nthreads] = res.z;
smem[tid + 3 * nthreads] = res.w;
__syncthreads();
sum_in_smem<nthreads, R>(smem, tid);
sum_in_smem<nthreads, R>(smem + nthreads, tid);
sum_in_smem<nthreads, R>(smem + 2 * nthreads, tid);
sum_in_smem<nthreads, R>(smem + 3 * nthreads, tid);
if (tid == 0)
{
res.x = smem[0];
res.y = smem[nthreads];
res.z = smem[2 * nthreads];
res.w = smem[3 * nthreads];
result[0] = res;
blocks_finished = 0;
}
}
#else
if (tid == 0)
{
DstType res;
res.x = smem[0];
res.y = smem[nthreads];
res.z = smem[2 * nthreads];
res.w = smem[3 * nthreads];
result[bid] = res;
}
#endif
}
template <typename T, typename R, int nthreads>
__global__ void sum_pass2_kernel_C4(typename TypeVec<R, 4>::vec_t* result, int size)
{
typedef typename TypeVec<R, 4>::vec_t DstType;
__shared__ R smem[nthreads * 4];
const int tid = threadIdx.y * blockDim.x + threadIdx.x;
DstType res = tid < size ? result[tid] : VecTraits<DstType>::all(0);
smem[tid] = res.x;
smem[tid + nthreads] = res.y;
smem[tid + 2 * nthreads] = res.z;
smem[tid + 3 * nthreads] = res.w;
__syncthreads();
sum_in_smem<nthreads, R>(smem, tid);
sum_in_smem<nthreads, R>(smem + nthreads, tid);
sum_in_smem<nthreads, R>(smem + 2 * nthreads, tid);
sum_in_smem<nthreads, R>(smem + 3 * nthreads, tid);
if (tid == 0)
{
res.x = smem[0];
res.y = smem[nthreads];
res.z = smem[2 * nthreads];
res.w = smem[3 * nthreads];
result[0] = res;
}
}
} // namespace sum
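// Two-pass sum: the per-block partial sums are reduced by a second kernel launch (works on CC 1.0)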
template <typename T>
void sum_multipass_caller(const DevMem2D src, PtrStep buf, double* sum, int cn)
{
using namespace sum;
typedef typename SumType<T>::R R;
dim3 threads, grid;
estimate_thread_cfg(src.cols, src.rows, threads, grid);
set_kernel_consts(src.cols, src.rows, threads, grid);
switch (cn)
{
case 1:
sum_kernel<T, R, IdentityOp<R>, threads_x * threads_y><<<grid, threads>>>(
src, (typename TypeVec<R, 1>::vec_t*)buf.ptr(0));
sum_pass2_kernel<T, R, threads_x * threads_y><<<1, threads_x * threads_y>>>(
(typename TypeVec<R, 1>::vec_t*)buf.ptr(0), grid.x * grid.y);
break;
case 2:
sum_kernel_C2<T, R, IdentityOp<R>, threads_x * threads_y><<<grid, threads>>>(
src, (typename TypeVec<R, 2>::vec_t*)buf.ptr(0));
sum_pass2_kernel_C2<T, R, threads_x * threads_y><<<1, threads_x * threads_y>>>(
(typename TypeVec<R, 2>::vec_t*)buf.ptr(0), grid.x * grid.y);
break;
case 3:
sum_kernel_C3<T, R, IdentityOp<R>, threads_x * threads_y><<<grid, threads>>>(
src, (typename TypeVec<R, 3>::vec_t*)buf.ptr(0));
sum_pass2_kernel_C3<T, R, threads_x * threads_y><<<1, threads_x * threads_y>>>(
(typename TypeVec<R, 3>::vec_t*)buf.ptr(0), grid.x * grid.y);
break;
case 4:
sum_kernel_C4<T, R, IdentityOp<R>, threads_x * threads_y><<<grid, threads>>>(
src, (typename TypeVec<R, 4>::vec_t*)buf.ptr(0));
sum_pass2_kernel_C4<T, R, threads_x * threads_y><<<1, threads_x * threads_y>>>(
(typename TypeVec<R, 4>::vec_t*)buf.ptr(0), grid.x * grid.y);
break;
}
cudaSafeCall(cudaThreadSynchronize());
R result[4] = {0, 0, 0, 0};
cudaSafeCall(cudaMemcpy(&result, buf.ptr(0), sizeof(R) * cn, cudaMemcpyDeviceToHost));
sum[0] = result[0];
sum[1] = result[1];
sum[2] = result[2];
sum[3] = result[3];
}
template void sum_multipass_caller<uchar>(const DevMem2D, PtrStep, double*, int);
template void sum_multipass_caller<char>(const DevMem2D, PtrStep, double*, int);
template void sum_multipass_caller<ushort>(const DevMem2D, PtrStep, double*, int);
template void sum_multipass_caller<short>(const DevMem2D, PtrStep, double*, int);
template void sum_multipass_caller<int>(const DevMem2D, PtrStep, double*, int);
template void sum_multipass_caller<float>(const DevMem2D, PtrStep, double*, int);
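// Single-pass sum: the final reduction happens inside sum_kernel (requires CC >= 1.1)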
template <typename T>
void sum_caller(const DevMem2D src, PtrStep buf, double* sum, int cn)
{
using namespace sum;
typedef typename SumType<T>::R R;
dim3 threads, grid;
estimate_thread_cfg(src.cols, src.rows, threads, grid);
set_kernel_consts(src.cols, src.rows, threads, grid);
switch (cn)
{
case 1:
sum_kernel<T, R, IdentityOp<R>, threads_x * threads_y><<<grid, threads>>>(
src, (typename TypeVec<R, 1>::vec_t*)buf.ptr(0));
break;
case 2:
sum_kernel_C2<T, R, IdentityOp<R>, threads_x * threads_y><<<grid, threads>>>(
src, (typename TypeVec<R, 2>::vec_t*)buf.ptr(0));
break;
case 3:
sum_kernel_C3<T, R, IdentityOp<R>, threads_x * threads_y><<<grid, threads>>>(
src, (typename TypeVec<R, 3>::vec_t*)buf.ptr(0));
break;
case 4:
sum_kernel_C4<T, R, IdentityOp<R>, threads_x * threads_y><<<grid, threads>>>(
src, (typename TypeVec<R, 4>::vec_t*)buf.ptr(0));
break;
}
cudaSafeCall(cudaThreadSynchronize());
R result[4] = {0, 0, 0, 0};
cudaSafeCall(cudaMemcpy(&result, buf.ptr(0), sizeof(R) * cn, cudaMemcpyDeviceToHost));
sum[0] = result[0];
sum[1] = result[1];
sum[2] = result[2];
sum[3] = result[3];
}
template void sum_caller<uchar>(const DevMem2D, PtrStep, double*, int);
template void sum_caller<char>(const DevMem2D, PtrStep, double*, int);
template void sum_caller<ushort>(const DevMem2D, PtrStep, double*, int);
template void sum_caller<short>(const DevMem2D, PtrStep, double*, int);
template void sum_caller<int>(const DevMem2D, PtrStep, double*, int);
template void sum_caller<float>(const DevMem2D, PtrStep, double*, int);
template <typename T>
void sqsum_multipass_caller(const DevMem2D src, PtrStep buf, double* sum, int cn)
{
using namespace sum;
typedef typename SumType<T>::R R;
dim3 threads, grid;
estimate_thread_cfg(src.cols, src.rows, threads, grid);
set_kernel_consts(src.cols, src.rows, threads, grid);
switch (cn)
{
case 1:
sum_kernel<T, R, SqrOp<R>, threads_x * threads_y><<<grid, threads>>>(
src, (typename TypeVec<R, 1>::vec_t*)buf.ptr(0));
sum_pass2_kernel<T, R, threads_x * threads_y><<<1, threads_x * threads_y>>>(
(typename TypeVec<R, 1>::vec_t*)buf.ptr(0), grid.x * grid.y);
break;
case 2:
sum_kernel_C2<T, R, SqrOp<R>, threads_x * threads_y><<<grid, threads>>>(
src, (typename TypeVec<R, 2>::vec_t*)buf.ptr(0));
sum_pass2_kernel_C2<T, R, threads_x * threads_y><<<1, threads_x * threads_y>>>(
(typename TypeVec<R, 2>::vec_t*)buf.ptr(0), grid.x * grid.y);
break;
case 3:
sum_kernel_C3<T, R, SqrOp<R>, threads_x * threads_y><<<grid, threads>>>(
src, (typename TypeVec<R, 3>::vec_t*)buf.ptr(0));
sum_pass2_kernel_C3<T, R, threads_x * threads_y><<<1, threads_x * threads_y>>>(
(typename TypeVec<R, 3>::vec_t*)buf.ptr(0), grid.x * grid.y);
break;
case 4:
sum_kernel_C4<T, R, SqrOp<R>, threads_x * threads_y><<<grid, threads>>>(
src, (typename TypeVec<R, 4>::vec_t*)buf.ptr(0));
sum_pass2_kernel_C4<T, R, threads_x * threads_y><<<1, threads_x * threads_y>>>(
(typename TypeVec<R, 4>::vec_t*)buf.ptr(0), grid.x * grid.y);
break;
}
cudaSafeCall(cudaThreadSynchronize());
R result[4] = {0, 0, 0, 0};
cudaSafeCall(cudaMemcpy(result, buf.ptr(0), sizeof(R) * cn, cudaMemcpyDeviceToHost));
sum[0] = result[0];
sum[1] = result[1];
sum[2] = result[2];
sum[3] = result[3];
}
template void sqsum_multipass_caller<uchar>(const DevMem2D, PtrStep, double*, int);
template void sqsum_multipass_caller<char>(const DevMem2D, PtrStep, double*, int);
template void sqsum_multipass_caller<ushort>(const DevMem2D, PtrStep, double*, int);
template void sqsum_multipass_caller<short>(const DevMem2D, PtrStep, double*, int);
template void sqsum_multipass_caller<int>(const DevMem2D, PtrStep, double*, int);
template void sqsum_multipass_caller<float>(const DevMem2D, PtrStep, double*, int);
template <typename T>
void sqsum_caller(const DevMem2D src, PtrStep buf, double* sum, int cn)
{
using namespace sum;
typedef typename SumType<T>::R R;
dim3 threads, grid;
estimate_thread_cfg(src.cols, src.rows, threads, grid);
set_kernel_consts(src.cols, src.rows, threads, grid);
switch (cn)
{
case 1:
sum_kernel<T, R, SqrOp<R>, threads_x * threads_y><<<grid, threads>>>(
src, (typename TypeVec<R, 1>::vec_t*)buf.ptr(0));
break;
case 2:
sum_kernel_C2<T, R, SqrOp<R>, threads_x * threads_y><<<grid, threads>>>(
src, (typename TypeVec<R, 2>::vec_t*)buf.ptr(0));
break;
case 3:
sum_kernel_C3<T, R, SqrOp<R>, threads_x * threads_y><<<grid, threads>>>(
src, (typename TypeVec<R, 3>::vec_t*)buf.ptr(0));
break;
case 4:
sum_kernel_C4<T, R, SqrOp<R>, threads_x * threads_y><<<grid, threads>>>(
src, (typename TypeVec<R, 4>::vec_t*)buf.ptr(0));
break;
}
cudaSafeCall(cudaThreadSynchronize());
R result[4] = {0, 0, 0, 0};
cudaSafeCall(cudaMemcpy(result, buf.ptr(0), sizeof(R) * cn, cudaMemcpyDeviceToHost));
sum[0] = result[0];
sum[1] = result[1];
sum[2] = result[2];
sum[3] = result[3];
}
template void sqsum_caller<uchar>(const DevMem2D, PtrStep, double*, int);
template void sqsum_caller<char>(const DevMem2D, PtrStep, double*, int);
template void sqsum_caller<ushort>(const DevMem2D, PtrStep, double*, int);
template void sqsum_caller<short>(const DevMem2D, PtrStep, double*, int);
template void sqsum_caller<int>(const DevMem2D, PtrStep, double*, int);
template void sqsum_caller<float>(const DevMem2D, PtrStep, double*, int);
}}}
......
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "opencv2/gpu/device/limits_gpu.hpp"
#include "opencv2/gpu/device/saturate_cast.hpp"
#include "opencv2/gpu/device/vecmath.hpp"
#include "transform.hpp"
#include "internal_shared.hpp"
using namespace cv::gpu;
using namespace cv::gpu::device;
namespace cv { namespace gpu { namespace mathfunc
{
// Performs reduction in shared memory
template <int size, typename T>
__device__ void sum_in_smem(volatile T* data, const uint tid)
{
T sum = data[tid];
if (size >= 512) { if (tid < 256) { data[tid] = sum = sum + data[tid + 256]; } __syncthreads(); }
if (size >= 256) { if (tid < 128) { data[tid] = sum = sum + data[tid + 128]; } __syncthreads(); }
if (size >= 128) { if (tid < 64) { data[tid] = sum = sum + data[tid + 64]; } __syncthreads(); }
if (tid < 32)
{
if (size >= 64) data[tid] = sum = sum + data[tid + 32];
if (size >= 32) data[tid] = sum = sum + data[tid + 16];
if (size >= 16) data[tid] = sum = sum + data[tid + 8];
if (size >= 8) data[tid] = sum = sum + data[tid + 4];
if (size >= 4) data[tid] = sum = sum + data[tid + 2];
if (size >= 2) data[tid] = sum = sum + data[tid + 1];
}
}
struct Mask8U
{
explicit Mask8U(PtrStep mask): mask(mask) {}
__device__ bool operator()(int y, int x) const
{
return mask.ptr(y)[x];
}
PtrStep mask;
};
struct MaskTrue
{
__device__ bool operator()(int y, int x) const
{
return true;
}
};
//////////////////////////////////////////////////////////////////////////////
// Min max
// To avoid shared-memory bank conflicts we convert each value to a type
// of appropriate size (32 bits minimum)
template <typename T> struct MinMaxTypeTraits {};
template <> struct MinMaxTypeTraits<uchar> { typedef int best_type; };
template <> struct MinMaxTypeTraits<char> { typedef int best_type; };
template <> struct MinMaxTypeTraits<ushort> { typedef int best_type; };
template <> struct MinMaxTypeTraits<short> { typedef int best_type; };
template <> struct MinMaxTypeTraits<int> { typedef int best_type; };
template <> struct MinMaxTypeTraits<float> { typedef float best_type; };
template <> struct MinMaxTypeTraits<double> { typedef double best_type; };
namespace minmax
{
__constant__ int ctwidth;
__constant__ int ctheight;
// Global counter of blocks that have finished their work
__device__ uint blocks_finished = 0;
// Estimates a good thread configuration
// - the threads variable satisfies threads.x * threads.y == 256
void estimate_thread_cfg(int cols, int rows, dim3& threads, dim3& grid)
{
threads = dim3(32, 8);
grid = dim3(divUp(cols, threads.x * 8), divUp(rows, threads.y * 32));
grid.x = min(grid.x, threads.x);
grid.y = min(grid.y, threads.y);
}
// Returns required buffer sizes
void get_buf_size_required(int cols, int rows, int elem_size, int& bufcols, int& bufrows)
{
dim3 threads, grid;
estimate_thread_cfg(cols, rows, threads, grid);
bufcols = grid.x * grid.y * elem_size;
bufrows = 2;
}
// Computes the device constants used by the kernels for the specified thread configuration
void set_kernel_consts(int cols, int rows, const dim3& threads, const dim3& grid)
{
int twidth = divUp(divUp(cols, grid.x), threads.x);
int theight = divUp(divUp(rows, grid.y), threads.y);
cudaSafeCall(cudaMemcpyToSymbol(ctwidth, &twidth, sizeof(ctwidth)));
cudaSafeCall(cudaMemcpyToSymbol(ctheight, &theight, sizeof(ctheight)));
}
// Merges the min/max candidates at tid and tid + offset in shared memory
template <typename T>
__device__ void merge(uint tid, uint offset, volatile T* minval, volatile T* maxval)
{
minval[tid] = min(minval[tid], minval[tid + offset]);
maxval[tid] = max(maxval[tid], maxval[tid + offset]);
}
template <int size, typename T>
__device__ void find_min_max_in_smem(volatile T* minval, volatile T* maxval, const uint tid)
{
if (size >= 512) { if (tid < 256) { merge(tid, 256, minval, maxval); } __syncthreads(); }
if (size >= 256) { if (tid < 128) { merge(tid, 128, minval, maxval); } __syncthreads(); }
if (size >= 128) { if (tid < 64) { merge(tid, 64, minval, maxval); } __syncthreads(); }
if (tid < 32)
{
if (size >= 64) merge(tid, 32, minval, maxval);
if (size >= 32) merge(tid, 16, minval, maxval);
if (size >= 16) merge(tid, 8, minval, maxval);
if (size >= 8) merge(tid, 4, minval, maxval);
if (size >= 4) merge(tid, 2, minval, maxval);
if (size >= 2) merge(tid, 1, minval, maxval);
}
}
template <int nthreads, typename T, typename Mask>
__global__ void min_max_kernel(const DevMem2D src, Mask mask, T* minval, T* maxval)
{
typedef typename MinMaxTypeTraits<T>::best_type best_type;
__shared__ best_type sminval[nthreads];
__shared__ best_type smaxval[nthreads];
uint x0 = blockIdx.x * blockDim.x * ctwidth + threadIdx.x;
uint y0 = blockIdx.y * blockDim.y * ctheight + threadIdx.y;
uint tid = threadIdx.y * blockDim.x + threadIdx.x;
T mymin = numeric_limits_gpu<T>::max();
T mymax = numeric_limits_gpu<T>::is_signed ? -numeric_limits_gpu<T>::max() : numeric_limits_gpu<T>::min();
uint y_end = min(y0 + (ctheight - 1) * blockDim.y + 1, src.rows);
uint x_end = min(x0 + (ctwidth - 1) * blockDim.x + 1, src.cols);
for (uint y = y0; y < y_end; y += blockDim.y)
{
const T* src_row = (const T*)src.ptr(y);
for (uint x = x0; x < x_end; x += blockDim.x)
{
T val = src_row[x];
if (mask(y, x))
{
mymin = min(mymin, val);
mymax = max(mymax, val);
}
}
}
sminval[tid] = mymin;
smaxval[tid] = mymax;
__syncthreads();
find_min_max_in_smem<nthreads, best_type>(sminval, smaxval, tid);
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 110
__shared__ bool is_last;
if (tid == 0)
{
minval[blockIdx.y * gridDim.x + blockIdx.x] = (T)sminval[0];
maxval[blockIdx.y * gridDim.x + blockIdx.x] = (T)smaxval[0];
__threadfence();
uint ticket = atomicInc(&blocks_finished, gridDim.x * gridDim.y);
is_last = ticket == gridDim.x * gridDim.y - 1;
}
__syncthreads();
if (is_last)
{
uint idx = min(tid, gridDim.x * gridDim.y - 1);
sminval[tid] = minval[idx];
smaxval[tid] = maxval[idx];
__syncthreads();
find_min_max_in_smem<nthreads, best_type>(sminval, smaxval, tid);
if (tid == 0)
{
minval[0] = (T)sminval[0];
maxval[0] = (T)smaxval[0];
blocks_finished = 0;
}
}
#else
if (tid == 0)
{
minval[blockIdx.y * gridDim.x + blockIdx.x] = (T)sminval[0];
maxval[blockIdx.y * gridDim.x + blockIdx.x] = (T)smaxval[0];
}
#endif
}
template <typename T>
void min_max_mask_caller(const DevMem2D src, const PtrStep mask, double* minval, double* maxval, PtrStep buf)
{
dim3 threads, grid;
estimate_thread_cfg(src.cols, src.rows, threads, grid);
set_kernel_consts(src.cols, src.rows, threads, grid);
T* minval_buf = (T*)buf.ptr(0);
T* maxval_buf = (T*)buf.ptr(1);
min_max_kernel<256, T, Mask8U><<<grid, threads>>>(src, Mask8U(mask), minval_buf, maxval_buf);
cudaSafeCall(cudaThreadSynchronize());
T minval_, maxval_;
cudaSafeCall(cudaMemcpy(&minval_, minval_buf, sizeof(T), cudaMemcpyDeviceToHost));
cudaSafeCall(cudaMemcpy(&maxval_, maxval_buf, sizeof(T), cudaMemcpyDeviceToHost));
*minval = minval_;
*maxval = maxval_;
}
template void min_max_mask_caller<uchar>(const DevMem2D, const PtrStep, double*, double*, PtrStep);
template void min_max_mask_caller<char>(const DevMem2D, const PtrStep, double*, double*, PtrStep);
template void min_max_mask_caller<ushort>(const DevMem2D, const PtrStep, double*, double*, PtrStep);
template void min_max_mask_caller<short>(const DevMem2D, const PtrStep, double*, double*, PtrStep);
template void min_max_mask_caller<int>(const DevMem2D, const PtrStep, double*, double*, PtrStep);
template void min_max_mask_caller<float>(const DevMem2D, const PtrStep, double*, double*, PtrStep);
template void min_max_mask_caller<double>(const DevMem2D, const PtrStep, double*, double*, PtrStep);
template <typename T>
void min_max_caller(const DevMem2D src, double* minval, double* maxval, PtrStep buf)
{
dim3 threads, grid;
estimate_thread_cfg(src.cols, src.rows, threads, grid);
set_kernel_consts(src.cols, src.rows, threads, grid);
T* minval_buf = (T*)buf.ptr(0);
T* maxval_buf = (T*)buf.ptr(1);
min_max_kernel<256, T, MaskTrue><<<grid, threads>>>(src, MaskTrue(), minval_buf, maxval_buf);
cudaSafeCall(cudaThreadSynchronize());
T minval_, maxval_;
cudaSafeCall(cudaMemcpy(&minval_, minval_buf, sizeof(T), cudaMemcpyDeviceToHost));
cudaSafeCall(cudaMemcpy(&maxval_, maxval_buf, sizeof(T), cudaMemcpyDeviceToHost));
*minval = minval_;
*maxval = maxval_;
}
template void min_max_caller<uchar>(const DevMem2D, double*, double*, PtrStep);
template void min_max_caller<char>(const DevMem2D, double*, double*, PtrStep);
template void min_max_caller<ushort>(const DevMem2D, double*, double*, PtrStep);
template void min_max_caller<short>(const DevMem2D, double*, double*, PtrStep);
template void min_max_caller<int>(const DevMem2D, double*, double*, PtrStep);
template void min_max_caller<float>(const DevMem2D, double*, double*, PtrStep);
template void min_max_caller<double>(const DevMem2D, double*, double*, PtrStep);
template <int nthreads, typename T>
__global__ void min_max_pass2_kernel(T* minval, T* maxval, int size)
{
typedef typename MinMaxTypeTraits<T>::best_type best_type;
__shared__ best_type sminval[nthreads];
__shared__ best_type smaxval[nthreads];
uint tid = threadIdx.y * blockDim.x + threadIdx.x;
uint idx = min(tid, (uint)size - 1);
sminval[tid] = minval[idx];
smaxval[tid] = maxval[idx];
__syncthreads();
find_min_max_in_smem<nthreads, best_type>(sminval, smaxval, tid);
if (tid == 0)
{
minval[0] = (T)sminval[0];
maxval[0] = (T)smaxval[0];
}
}
template <typename T>
void min_max_mask_multipass_caller(const DevMem2D src, const PtrStep mask, double* minval, double* maxval, PtrStep buf)
{
dim3 threads, grid;
estimate_thread_cfg(src.cols, src.rows, threads, grid);
set_kernel_consts(src.cols, src.rows, threads, grid);
T* minval_buf = (T*)buf.ptr(0);
T* maxval_buf = (T*)buf.ptr(1);
min_max_kernel<256, T, Mask8U><<<grid, threads>>>(src, Mask8U(mask), minval_buf, maxval_buf);
min_max_pass2_kernel<256, T><<<1, 256>>>(minval_buf, maxval_buf, grid.x * grid.y);
cudaSafeCall(cudaThreadSynchronize());
T minval_, maxval_;
cudaSafeCall(cudaMemcpy(&minval_, minval_buf, sizeof(T), cudaMemcpyDeviceToHost));
cudaSafeCall(cudaMemcpy(&maxval_, maxval_buf, sizeof(T), cudaMemcpyDeviceToHost));
*minval = minval_;
*maxval = maxval_;
}
template void min_max_mask_multipass_caller<uchar>(const DevMem2D, const PtrStep, double*, double*, PtrStep);
template void min_max_mask_multipass_caller<char>(const DevMem2D, const PtrStep, double*, double*, PtrStep);
template void min_max_mask_multipass_caller<ushort>(const DevMem2D, const PtrStep, double*, double*, PtrStep);
template void min_max_mask_multipass_caller<short>(const DevMem2D, const PtrStep, double*, double*, PtrStep);
template void min_max_mask_multipass_caller<int>(const DevMem2D, const PtrStep, double*, double*, PtrStep);
template void min_max_mask_multipass_caller<float>(const DevMem2D, const PtrStep, double*, double*, PtrStep);
template <typename T>
void min_max_multipass_caller(const DevMem2D src, double* minval, double* maxval, PtrStep buf)
{
dim3 threads, grid;
estimate_thread_cfg(src.cols, src.rows, threads, grid);
set_kernel_consts(src.cols, src.rows, threads, grid);
T* minval_buf = (T*)buf.ptr(0);
T* maxval_buf = (T*)buf.ptr(1);
min_max_kernel<256, T, MaskTrue><<<grid, threads>>>(src, MaskTrue(), minval_buf, maxval_buf);
min_max_pass2_kernel<256, T><<<1, 256>>>(minval_buf, maxval_buf, grid.x * grid.y);
cudaSafeCall(cudaThreadSynchronize());
T minval_, maxval_;
cudaSafeCall(cudaMemcpy(&minval_, minval_buf, sizeof(T), cudaMemcpyDeviceToHost));
cudaSafeCall(cudaMemcpy(&maxval_, maxval_buf, sizeof(T), cudaMemcpyDeviceToHost));
*minval = minval_;
*maxval = maxval_;
}
template void min_max_multipass_caller<uchar>(const DevMem2D, double*, double*, PtrStep);
template void min_max_multipass_caller<char>(const DevMem2D, double*, double*, PtrStep);
template void min_max_multipass_caller<ushort>(const DevMem2D, double*, double*, PtrStep);
template void min_max_multipass_caller<short>(const DevMem2D, double*, double*, PtrStep);
template void min_max_multipass_caller<int>(const DevMem2D, double*, double*, PtrStep);
template void min_max_multipass_caller<float>(const DevMem2D, double*, double*, PtrStep);
} // namespace minmax
///////////////////////////////////////////////////////////////////////////////
// minMaxLoc
namespace minmaxloc
{
__constant__ int ctwidth;
__constant__ int ctheight;
// Global counter of blocks that have finished their work
__device__ uint blocks_finished = 0;
// Estimates a good thread configuration
// - the threads variable satisfies threads.x * threads.y == 256
void estimate_thread_cfg(int cols, int rows, dim3& threads, dim3& grid)
{
threads = dim3(32, 8);
grid = dim3(divUp(cols, threads.x * 8), divUp(rows, threads.y * 32));
grid.x = min(grid.x, threads.x);
grid.y = min(grid.y, threads.y);
}
// Returns required buffer sizes
void get_buf_size_required(int cols, int rows, int elem_size, int& b1cols,
int& b1rows, int& b2cols, int& b2rows)
{
dim3 threads, grid;
estimate_thread_cfg(cols, rows, threads, grid);
b1cols = grid.x * grid.y * elem_size; // For values
b1rows = 2;
b2cols = grid.x * grid.y * sizeof(int); // For locations
b2rows = 2;
}
// Computes the device constants used by the kernels for the specified thread configuration
void set_kernel_consts(int cols, int rows, const dim3& threads, const dim3& grid)
{
int twidth = divUp(divUp(cols, grid.x), threads.x);
int theight = divUp(divUp(rows, grid.y), threads.y);
cudaSafeCall(cudaMemcpyToSymbol(ctwidth, &twidth, sizeof(ctwidth)));
cudaSafeCall(cudaMemcpyToSymbol(ctheight, &theight, sizeof(ctheight)));
}
template <typename T>
__device__ void merge(uint tid, uint offset, volatile T* minval, volatile T* maxval,
volatile uint* minloc, volatile uint* maxloc)
{
T val = minval[tid + offset];
if (val < minval[tid])
{
minval[tid] = val;
minloc[tid] = minloc[tid + offset];
}
val = maxval[tid + offset];
if (val > maxval[tid])
{
maxval[tid] = val;
maxloc[tid] = maxloc[tid + offset];
}
}
template <int size, typename T>
__device__ void find_min_max_loc_in_smem(volatile T* minval, volatile T* maxval, volatile uint* minloc,
volatile uint* maxloc, const uint tid)
{
if (size >= 512) { if (tid < 256) { merge(tid, 256, minval, maxval, minloc, maxloc); } __syncthreads(); }
if (size >= 256) { if (tid < 128) { merge(tid, 128, minval, maxval, minloc, maxloc); } __syncthreads(); }
if (size >= 128) { if (tid < 64) { merge(tid, 64, minval, maxval, minloc, maxloc); } __syncthreads(); }
if (tid < 32)
{
if (size >= 64) merge(tid, 32, minval, maxval, minloc, maxloc);
if (size >= 32) merge(tid, 16, minval, maxval, minloc, maxloc);
if (size >= 16) merge(tid, 8, minval, maxval, minloc, maxloc);
if (size >= 8) merge(tid, 4, minval, maxval, minloc, maxloc);
if (size >= 4) merge(tid, 2, minval, maxval, minloc, maxloc);
if (size >= 2) merge(tid, 1, minval, maxval, minloc, maxloc);
}
}
template <int nthreads, typename T, typename Mask>
__global__ void min_max_loc_kernel(const DevMem2D src, Mask mask, T* minval, T* maxval,
uint* minloc, uint* maxloc)
{
typedef typename MinMaxTypeTraits<T>::best_type best_type;
__shared__ best_type sminval[nthreads];
__shared__ best_type smaxval[nthreads];
__shared__ uint sminloc[nthreads];
__shared__ uint smaxloc[nthreads];
uint x0 = blockIdx.x * blockDim.x * ctwidth + threadIdx.x;
uint y0 = blockIdx.y * blockDim.y * ctheight + threadIdx.y;
uint tid = threadIdx.y * blockDim.x + threadIdx.x;
T mymin = numeric_limits_gpu<T>::max();
T mymax = numeric_limits_gpu<T>::is_signed ? -numeric_limits_gpu<T>::max() : numeric_limits_gpu<T>::min();
uint myminloc = 0;
uint mymaxloc = 0;
uint y_end = min(y0 + (ctheight - 1) * blockDim.y + 1, src.rows);
uint x_end = min(x0 + (ctwidth - 1) * blockDim.x + 1, src.cols);
for (uint y = y0; y < y_end; y += blockDim.y)
{
const T* ptr = (const T*)src.ptr(y);
for (uint x = x0; x < x_end; x += blockDim.x)
{
if (mask(y, x))
{
T val = ptr[x];
if (val <= mymin) { mymin = val; myminloc = y * src.cols + x; }
if (val >= mymax) { mymax = val; mymaxloc = y * src.cols + x; }
}
}
}
sminval[tid] = mymin;
smaxval[tid] = mymax;
sminloc[tid] = myminloc;
smaxloc[tid] = mymaxloc;
__syncthreads();
find_min_max_loc_in_smem<nthreads, best_type>(sminval, smaxval, sminloc, smaxloc, tid);
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 110
__shared__ bool is_last;
if (tid == 0)
{
minval[blockIdx.y * gridDim.x + blockIdx.x] = (T)sminval[0];
maxval[blockIdx.y * gridDim.x + blockIdx.x] = (T)smaxval[0];
minloc[blockIdx.y * gridDim.x + blockIdx.x] = sminloc[0];
maxloc[blockIdx.y * gridDim.x + blockIdx.x] = smaxloc[0];
__threadfence();
uint ticket = atomicInc(&blocks_finished, gridDim.x * gridDim.y);
is_last = ticket == gridDim.x * gridDim.y - 1;
}
__syncthreads();
if (is_last)
{
uint idx = min(tid, gridDim.x * gridDim.y - 1);
sminval[tid] = minval[idx];
smaxval[tid] = maxval[idx];
sminloc[tid] = minloc[idx];
smaxloc[tid] = maxloc[idx];
__syncthreads();
find_min_max_loc_in_smem<nthreads, best_type>(sminval, smaxval, sminloc, smaxloc, tid);
if (tid == 0)
{
minval[0] = (T)sminval[0];
maxval[0] = (T)smaxval[0];
minloc[0] = sminloc[0];
maxloc[0] = smaxloc[0];
blocks_finished = 0;
}
}
#else
if (tid == 0)
{
minval[blockIdx.y * gridDim.x + blockIdx.x] = (T)sminval[0];
maxval[blockIdx.y * gridDim.x + blockIdx.x] = (T)smaxval[0];
minloc[blockIdx.y * gridDim.x + blockIdx.x] = sminloc[0];
maxloc[blockIdx.y * gridDim.x + blockIdx.x] = smaxloc[0];
}
#endif
}
template <typename T>
void min_max_loc_mask_caller(const DevMem2D src, const PtrStep mask, double* minval, double* maxval,
int minloc[2], int maxloc[2], PtrStep valbuf, PtrStep locbuf)
{
dim3 threads, grid;
estimate_thread_cfg(src.cols, src.rows, threads, grid);
set_kernel_consts(src.cols, src.rows, threads, grid);
T* minval_buf = (T*)valbuf.ptr(0);
T* maxval_buf = (T*)valbuf.ptr(1);
uint* minloc_buf = (uint*)locbuf.ptr(0);
uint* maxloc_buf = (uint*)locbuf.ptr(1);
min_max_loc_kernel<256, T, Mask8U><<<grid, threads>>>(src, Mask8U(mask), minval_buf, maxval_buf, minloc_buf, maxloc_buf);
cudaSafeCall(cudaThreadSynchronize());
T minval_, maxval_;
cudaSafeCall(cudaMemcpy(&minval_, minval_buf, sizeof(T), cudaMemcpyDeviceToHost));
cudaSafeCall(cudaMemcpy(&maxval_, maxval_buf, sizeof(T), cudaMemcpyDeviceToHost));
*minval = minval_;
*maxval = maxval_;
uint minloc_, maxloc_;
cudaSafeCall(cudaMemcpy(&minloc_, minloc_buf, sizeof(int), cudaMemcpyDeviceToHost));
cudaSafeCall(cudaMemcpy(&maxloc_, maxloc_buf, sizeof(int), cudaMemcpyDeviceToHost));
minloc[1] = minloc_ / src.cols; minloc[0] = minloc_ - minloc[1] * src.cols;
maxloc[1] = maxloc_ / src.cols; maxloc[0] = maxloc_ - maxloc[1] * src.cols;
}
template void min_max_loc_mask_caller<uchar>(const DevMem2D, const PtrStep, double*, double*, int[2], int[2], PtrStep, PtrStep);
template void min_max_loc_mask_caller<char>(const DevMem2D, const PtrStep, double*, double*, int[2], int[2], PtrStep, PtrStep);
template void min_max_loc_mask_caller<ushort>(const DevMem2D, const PtrStep, double*, double*, int[2], int[2], PtrStep, PtrStep);
template void min_max_loc_mask_caller<short>(const DevMem2D, const PtrStep, double*, double*, int[2], int[2], PtrStep, PtrStep);
template void min_max_loc_mask_caller<int>(const DevMem2D, const PtrStep, double*, double*, int[2], int[2], PtrStep, PtrStep);
template void min_max_loc_mask_caller<float>(const DevMem2D, const PtrStep, double*, double*, int[2], int[2], PtrStep, PtrStep);
template void min_max_loc_mask_caller<double>(const DevMem2D, const PtrStep, double*, double*, int[2], int[2], PtrStep, PtrStep);
template <typename T>
void min_max_loc_caller(const DevMem2D src, double* minval, double* maxval,
int minloc[2], int maxloc[2], PtrStep valbuf, PtrStep locbuf)
{
dim3 threads, grid;
estimate_thread_cfg(src.cols, src.rows, threads, grid);
set_kernel_consts(src.cols, src.rows, threads, grid);
T* minval_buf = (T*)valbuf.ptr(0);
T* maxval_buf = (T*)valbuf.ptr(1);
uint* minloc_buf = (uint*)locbuf.ptr(0);
uint* maxloc_buf = (uint*)locbuf.ptr(1);
min_max_loc_kernel<256, T, MaskTrue><<<grid, threads>>>(src, MaskTrue(), minval_buf, maxval_buf, minloc_buf, maxloc_buf);
cudaSafeCall(cudaThreadSynchronize());
T minval_, maxval_;
cudaSafeCall(cudaMemcpy(&minval_, minval_buf, sizeof(T), cudaMemcpyDeviceToHost));
cudaSafeCall(cudaMemcpy(&maxval_, maxval_buf, sizeof(T), cudaMemcpyDeviceToHost));
*minval = minval_;
*maxval = maxval_;
uint minloc_, maxloc_;
cudaSafeCall(cudaMemcpy(&minloc_, minloc_buf, sizeof(uint), cudaMemcpyDeviceToHost));
cudaSafeCall(cudaMemcpy(&maxloc_, maxloc_buf, sizeof(uint), cudaMemcpyDeviceToHost));
minloc[1] = minloc_ / src.cols; minloc[0] = minloc_ - minloc[1] * src.cols;
maxloc[1] = maxloc_ / src.cols; maxloc[0] = maxloc_ - maxloc[1] * src.cols;
}
template void min_max_loc_caller<uchar>(const DevMem2D, double*, double*, int[2], int[2], PtrStep, PtrStep);
template void min_max_loc_caller<char>(const DevMem2D, double*, double*, int[2], int[2], PtrStep, PtrStep);
template void min_max_loc_caller<ushort>(const DevMem2D, double*, double*, int[2], int[2], PtrStep, PtrStep);
template void min_max_loc_caller<short>(const DevMem2D, double*, double*, int[2], int[2], PtrStep, PtrStep);
template void min_max_loc_caller<int>(const DevMem2D, double*, double*, int[2], int[2], PtrStep, PtrStep);
template void min_max_loc_caller<float>(const DevMem2D, double*, double*, int[2], int[2], PtrStep, PtrStep);
template void min_max_loc_caller<double>(const DevMem2D, double*, double*, int[2], int[2], PtrStep, PtrStep);
// This kernel is used only when compute capability is 1.0 (no global atomics
// for the single-pass scheme); it folds the per-block partial results.
template <int nthreads, typename T>
__global__ void min_max_loc_pass2_kernel(T* minval, T* maxval, uint* minloc, uint* maxloc, int size)
{
typedef typename MinMaxTypeTraits<T>::best_type best_type;
__shared__ best_type sminval[nthreads];
__shared__ best_type smaxval[nthreads];
__shared__ uint sminloc[nthreads];
__shared__ uint smaxloc[nthreads];
uint tid = threadIdx.y * blockDim.x + threadIdx.x;
uint idx = min(tid, (uint)size - 1); // clamp to the number of partial results; pass 2 runs in a single block
sminval[tid] = minval[idx];
smaxval[tid] = maxval[idx];
sminloc[tid] = minloc[idx];
smaxloc[tid] = maxloc[idx];
__syncthreads();
find_min_max_loc_in_smem<nthreads, best_type>(sminval, smaxval, sminloc, smaxloc, tid);
if (tid == 0)
{
minval[0] = (T)sminval[0];
maxval[0] = (T)smaxval[0];
minloc[0] = sminloc[0];
maxloc[0] = smaxloc[0];
}
}
template <typename T>
void min_max_loc_mask_multipass_caller(const DevMem2D src, const PtrStep mask, double* minval, double* maxval,
int minloc[2], int maxloc[2], PtrStep valbuf, PtrStep locbuf)
{
dim3 threads, grid;
estimate_thread_cfg(src.cols, src.rows, threads, grid);
set_kernel_consts(src.cols, src.rows, threads, grid);
T* minval_buf = (T*)valbuf.ptr(0);
T* maxval_buf = (T*)valbuf.ptr(1);
uint* minloc_buf = (uint*)locbuf.ptr(0);
uint* maxloc_buf = (uint*)locbuf.ptr(1);
min_max_loc_kernel<256, T, Mask8U><<<grid, threads>>>(src, Mask8U(mask), minval_buf, maxval_buf, minloc_buf, maxloc_buf);
min_max_loc_pass2_kernel<256, T><<<1, 256>>>(minval_buf, maxval_buf, minloc_buf, maxloc_buf, grid.x * grid.y);
cudaSafeCall(cudaThreadSynchronize());
T minval_, maxval_;
cudaSafeCall(cudaMemcpy(&minval_, minval_buf, sizeof(T), cudaMemcpyDeviceToHost));
cudaSafeCall(cudaMemcpy(&maxval_, maxval_buf, sizeof(T), cudaMemcpyDeviceToHost));
*minval = minval_;
*maxval = maxval_;
uint minloc_, maxloc_;
cudaSafeCall(cudaMemcpy(&minloc_, minloc_buf, sizeof(uint), cudaMemcpyDeviceToHost));
cudaSafeCall(cudaMemcpy(&maxloc_, maxloc_buf, sizeof(uint), cudaMemcpyDeviceToHost));
minloc[1] = minloc_ / src.cols; minloc[0] = minloc_ - minloc[1] * src.cols;
maxloc[1] = maxloc_ / src.cols; maxloc[0] = maxloc_ - maxloc[1] * src.cols;
}
template void min_max_loc_mask_multipass_caller<uchar>(const DevMem2D, const PtrStep, double*, double*, int[2], int[2], PtrStep, PtrStep);
template void min_max_loc_mask_multipass_caller<char>(const DevMem2D, const PtrStep, double*, double*, int[2], int[2], PtrStep, PtrStep);
template void min_max_loc_mask_multipass_caller<ushort>(const DevMem2D, const PtrStep, double*, double*, int[2], int[2], PtrStep, PtrStep);
template void min_max_loc_mask_multipass_caller<short>(const DevMem2D, const PtrStep, double*, double*, int[2], int[2], PtrStep, PtrStep);
template void min_max_loc_mask_multipass_caller<int>(const DevMem2D, const PtrStep, double*, double*, int[2], int[2], PtrStep, PtrStep);
template void min_max_loc_mask_multipass_caller<float>(const DevMem2D, const PtrStep, double*, double*, int[2], int[2], PtrStep, PtrStep);
template <typename T>
void min_max_loc_multipass_caller(const DevMem2D src, double* minval, double* maxval,
int minloc[2], int maxloc[2], PtrStep valbuf, PtrStep locbuf)
{
dim3 threads, grid;
estimate_thread_cfg(src.cols, src.rows, threads, grid);
set_kernel_consts(src.cols, src.rows, threads, grid);
T* minval_buf = (T*)valbuf.ptr(0);
T* maxval_buf = (T*)valbuf.ptr(1);
uint* minloc_buf = (uint*)locbuf.ptr(0);
uint* maxloc_buf = (uint*)locbuf.ptr(1);
min_max_loc_kernel<256, T, MaskTrue><<<grid, threads>>>(src, MaskTrue(), minval_buf, maxval_buf, minloc_buf, maxloc_buf);
min_max_loc_pass2_kernel<256, T><<<1, 256>>>(minval_buf, maxval_buf, minloc_buf, maxloc_buf, grid.x * grid.y);
cudaSafeCall(cudaThreadSynchronize());
T minval_, maxval_;
cudaSafeCall(cudaMemcpy(&minval_, minval_buf, sizeof(T), cudaMemcpyDeviceToHost));
cudaSafeCall(cudaMemcpy(&maxval_, maxval_buf, sizeof(T), cudaMemcpyDeviceToHost));
*minval = minval_;
*maxval = maxval_;
uint minloc_, maxloc_;
cudaSafeCall(cudaMemcpy(&minloc_, minloc_buf, sizeof(uint), cudaMemcpyDeviceToHost));
cudaSafeCall(cudaMemcpy(&maxloc_, maxloc_buf, sizeof(uint), cudaMemcpyDeviceToHost));
minloc[1] = minloc_ / src.cols; minloc[0] = minloc_ - minloc[1] * src.cols;
maxloc[1] = maxloc_ / src.cols; maxloc[0] = maxloc_ - maxloc[1] * src.cols;
}
template void min_max_loc_multipass_caller<uchar>(const DevMem2D, double*, double*, int[2], int[2], PtrStep, PtrStep);
template void min_max_loc_multipass_caller<char>(const DevMem2D, double*, double*, int[2], int[2], PtrStep, PtrStep);
template void min_max_loc_multipass_caller<ushort>(const DevMem2D, double*, double*, int[2], int[2], PtrStep, PtrStep);
template void min_max_loc_multipass_caller<short>(const DevMem2D, double*, double*, int[2], int[2], PtrStep, PtrStep);
template void min_max_loc_multipass_caller<int>(const DevMem2D, double*, double*, int[2], int[2], PtrStep, PtrStep);
template void min_max_loc_multipass_caller<float>(const DevMem2D, double*, double*, int[2], int[2], PtrStep, PtrStep);
} // namespace minmaxloc
//////////////////////////////////////////////////////////////////////////////////////////////////////////
// countNonZero
namespace countnonzero
{
__constant__ int ctwidth;
__constant__ int ctheight;
__device__ uint blocks_finished = 0;
void estimate_thread_cfg(int cols, int rows, dim3& threads, dim3& grid)
{
threads = dim3(32, 8);
grid = dim3(divUp(cols, threads.x * 8), divUp(rows, threads.y * 32));
grid.x = min(grid.x, threads.x);
grid.y = min(grid.y, threads.y);
}
void get_buf_size_required(int cols, int rows, int& bufcols, int& bufrows)
{
dim3 threads, grid;
estimate_thread_cfg(cols, rows, threads, grid);
bufcols = grid.x * grid.y * sizeof(int);
bufrows = 1;
}
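// Usage sketch (mirrors the host-side caller later in this commit): the
// intermediate buffer holds one uint per block of the first pass:
//   int bufcols, bufrows;
//   countnonzero::get_buf_size_required(src.cols, src.rows, bufcols, bufrows);
//   buf.create(Size(bufcols, bufrows), CV_8U);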
void set_kernel_consts(int cols, int rows, const dim3& threads, const dim3& grid)
{
int twidth = divUp(divUp(cols, grid.x), threads.x);
int theight = divUp(divUp(rows, grid.y), threads.y);
cudaSafeCall(cudaMemcpyToSymbol(ctwidth, &twidth, sizeof(twidth)));
cudaSafeCall(cudaMemcpyToSymbol(ctheight, &theight, sizeof(theight)));
}
template <int nthreads, typename T>
__global__ void count_non_zero_kernel(const DevMem2D src, volatile uint* count)
{
__shared__ uint scount[nthreads];
uint x0 = blockIdx.x * blockDim.x * ctwidth + threadIdx.x;
uint y0 = blockIdx.y * blockDim.y * ctheight + threadIdx.y;
uint tid = threadIdx.y * blockDim.x + threadIdx.x;
uint cnt = 0;
for (uint y = 0; y < ctheight && y0 + y * blockDim.y < src.rows; ++y)
{
const T* ptr = (const T*)src.ptr(y0 + y * blockDim.y);
for (uint x = 0; x < ctwidth && x0 + x * blockDim.x < src.cols; ++x)
cnt += ptr[x0 + x * blockDim.x] != 0;
}
scount[tid] = cnt;
__syncthreads();
sum_in_smem<nthreads, uint>(scount, tid);
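// On devices with global atomics (CC >= 1.1) the last block to finish,
// tracked via the blocks_finished ticket, folds all per-block counts, so a
// single kernel launch produces the final result.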
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 110
__shared__ bool is_last;
if (tid == 0)
{
count[blockIdx.y * gridDim.x + blockIdx.x] = scount[0];
__threadfence();
uint ticket = atomicInc(&blocks_finished, gridDim.x * gridDim.y);
is_last = ticket == gridDim.x * gridDim.y - 1;
}
__syncthreads();
if (is_last)
{
scount[tid] = tid < gridDim.x * gridDim.y ? count[tid] : 0;
__syncthreads();
sum_in_smem<nthreads, uint>(scount, tid);
if (tid == 0)
{
count[0] = scount[0];
blocks_finished = 0;
}
}
#else
if (tid == 0) count[blockIdx.y * gridDim.x + blockIdx.x] = scount[0];
#endif
}
template <typename T>
int count_non_zero_caller(const DevMem2D src, PtrStep buf)
{
dim3 threads, grid;
estimate_thread_cfg(src.cols, src.rows, threads, grid);
set_kernel_consts(src.cols, src.rows, threads, grid);
uint* count_buf = (uint*)buf.ptr(0);
count_non_zero_kernel<256, T><<<grid, threads>>>(src, count_buf);
cudaSafeCall(cudaThreadSynchronize());
uint count;
cudaSafeCall(cudaMemcpy(&count, count_buf, sizeof(uint), cudaMemcpyDeviceToHost));
return count;
}
template int count_non_zero_caller<uchar>(const DevMem2D, PtrStep);
template int count_non_zero_caller<char>(const DevMem2D, PtrStep);
template int count_non_zero_caller<ushort>(const DevMem2D, PtrStep);
template int count_non_zero_caller<short>(const DevMem2D, PtrStep);
template int count_non_zero_caller<int>(const DevMem2D, PtrStep);
template int count_non_zero_caller<float>(const DevMem2D, PtrStep);
template int count_non_zero_caller<double>(const DevMem2D, PtrStep);
template <int nthreads, typename T>
__global__ void count_non_zero_pass2_kernel(uint* count, int size)
{
__shared__ uint scount[nthreads];
uint tid = threadIdx.y * blockDim.x + threadIdx.x;
scount[tid] = tid < size ? count[tid] : 0;
__syncthreads();
sum_in_smem<nthreads, uint>(scount, tid);
if (tid == 0)
count[0] = scount[0];
}
template <typename T>
int count_non_zero_multipass_caller(const DevMem2D src, PtrStep buf)
{
dim3 threads, grid;
estimate_thread_cfg(src.cols, src.rows, threads, grid);
set_kernel_consts(src.cols, src.rows, threads, grid);
uint* count_buf = (uint*)buf.ptr(0);
count_non_zero_kernel<256, T><<<grid, threads>>>(src, count_buf);
count_non_zero_pass2_kernel<256, T><<<1, 256>>>(count_buf, grid.x * grid.y);
cudaSafeCall(cudaThreadSynchronize());
uint count;
cudaSafeCall(cudaMemcpy(&count, count_buf, sizeof(uint), cudaMemcpyDeviceToHost));
return count;
}
template int count_non_zero_multipass_caller<uchar>(const DevMem2D, PtrStep);
template int count_non_zero_multipass_caller<char>(const DevMem2D, PtrStep);
template int count_non_zero_multipass_caller<ushort>(const DevMem2D, PtrStep);
template int count_non_zero_multipass_caller<short>(const DevMem2D, PtrStep);
template int count_non_zero_multipass_caller<int>(const DevMem2D, PtrStep);
template int count_non_zero_multipass_caller<float>(const DevMem2D, PtrStep);
} // namespace countnonzero
//////////////////////////////////////////////////////////////////////////
// Sum
namespace sum
{
template <typename T> struct SumType {};
template <> struct SumType<uchar> { typedef uint R; };
template <> struct SumType<char> { typedef int R; };
template <> struct SumType<ushort> { typedef uint R; };
template <> struct SumType<short> { typedef int R; };
template <> struct SumType<int> { typedef int R; };
template <> struct SumType<float> { typedef float R; };
template <> struct SumType<double> { typedef double R; };
template <typename R>
struct IdentityOp { static __device__ R call(R x) { return x; } };
template <typename R>
struct SqrOp { static __device__ R call(R x) { return x * x; } };
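// Illustrative only (not used by the callers below): the reduction kernels
// accept any functor with a static __device__ call(R), so a hypothetical
// absolute-value sum could be plugged in the same way:
template <typename R>
struct AbsOp { static __device__ R call(R x) { return x < 0 ? -x : x; } };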
__constant__ int ctwidth;
__constant__ int ctheight;
__device__ uint blocks_finished = 0;
const int threads_x = 32;
const int threads_y = 8;
void estimate_thread_cfg(int cols, int rows, dim3& threads, dim3& grid)
{
threads = dim3(threads_x, threads_y);
grid = dim3(divUp(cols, threads.x * threads.y),
divUp(rows, threads.y * threads.x));
grid.x = min(grid.x, threads.x);
grid.y = min(grid.y, threads.y);
}
void get_buf_size_required(int cols, int rows, int cn, int& bufcols, int& bufrows)
{
dim3 threads, grid;
estimate_thread_cfg(cols, rows, threads, grid);
bufcols = grid.x * grid.y * sizeof(double) * cn;
bufrows = 1;
}
void set_kernel_consts(int cols, int rows, const dim3& threads, const dim3& grid)
{
int twidth = divUp(divUp(cols, grid.x), threads.x);
int theight = divUp(divUp(rows, grid.y), threads.y);
cudaSafeCall(cudaMemcpyToSymbol(ctwidth, &twidth, sizeof(twidth)));
cudaSafeCall(cudaMemcpyToSymbol(ctheight, &theight, sizeof(theight)));
}
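// ctwidth and ctheight are the per-thread tile extents: each thread strides
// through ctwidth columns and ctheight rows of the source, so grid * block *
// tile covers the whole image.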
template <typename T, typename R, typename Op, int nthreads>
__global__ void sum_kernel(const DevMem2D src, R* result)
{
__shared__ R smem[nthreads];
const int x0 = blockIdx.x * blockDim.x * ctwidth + threadIdx.x;
const int y0 = blockIdx.y * blockDim.y * ctheight + threadIdx.y;
const int tid = threadIdx.y * blockDim.x + threadIdx.x;
const int bid = blockIdx.y * gridDim.x + blockIdx.x;
R sum = 0;
for (int y = 0; y < ctheight && y0 + y * blockDim.y < src.rows; ++y)
{
const T* ptr = (const T*)src.ptr(y0 + y * blockDim.y);
for (int x = 0; x < ctwidth && x0 + x * blockDim.x < src.cols; ++x)
sum += Op::call(ptr[x0 + x * blockDim.x]);
}
smem[tid] = sum;
__syncthreads();
sum_in_smem<nthreads, R>(smem, tid);
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 110
__shared__ bool is_last;
if (tid == 0)
{
result[bid] = smem[0];
__threadfence();
uint ticket = atomicInc(&blocks_finished, gridDim.x * gridDim.y);
is_last = (ticket == gridDim.x * gridDim.y - 1);
}
__syncthreads();
if (is_last)
{
smem[tid] = tid < gridDim.x * gridDim.y ? result[tid] : 0;
__syncthreads();
sum_in_smem<nthreads, R>(smem, tid);
if (tid == 0)
{
result[0] = smem[0];
blocks_finished = 0;
}
}
#else
if (tid == 0) result[bid] = smem[0];
#endif
}
template <typename T, typename R, int nthreads>
__global__ void sum_pass2_kernel(R* result, int size)
{
__shared__ R smem[nthreads];
int tid = threadIdx.y * blockDim.x + threadIdx.x;
smem[tid] = tid < size ? result[tid] : 0;
__syncthreads();
sum_in_smem<nthreads, R>(smem, tid);
if (tid == 0)
result[0] = smem[0];
}
template <typename T, typename R, typename Op, int nthreads>
__global__ void sum_kernel_C2(const DevMem2D src, typename TypeVec<R, 2>::vec_t* result)
{
typedef typename TypeVec<T, 2>::vec_t SrcType;
typedef typename TypeVec<R, 2>::vec_t DstType;
__shared__ R smem[nthreads * 2];
const int x0 = blockIdx.x * blockDim.x * ctwidth + threadIdx.x;
const int y0 = blockIdx.y * blockDim.y * ctheight + threadIdx.y;
const int tid = threadIdx.y * blockDim.x + threadIdx.x;
const int bid = blockIdx.y * gridDim.x + blockIdx.x;
SrcType val;
DstType sum = VecTraits<DstType>::all(0);
for (int y = 0; y < ctheight && y0 + y * blockDim.y < src.rows; ++y)
{
const SrcType* ptr = (const SrcType*)src.ptr(y0 + y * blockDim.y);
for (int x = 0; x < ctwidth && x0 + x * blockDim.x < src.cols; ++x)
{
val = ptr[x0 + x * blockDim.x];
sum = sum + VecTraits<DstType>::make(Op::call(val.x), Op::call(val.y));
}
}
smem[tid] = sum.x;
smem[tid + nthreads] = sum.y;
__syncthreads();
sum_in_smem<nthreads, R>(smem, tid);
sum_in_smem<nthreads, R>(smem + nthreads, tid);
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 110
__shared__ bool is_last;
if (tid == 0)
{
DstType res;
res.x = smem[0];
res.y = smem[nthreads];
result[bid] = res;
__threadfence();
uint ticket = atomicInc(&blocks_finished, gridDim.x * gridDim.y);
is_last = (ticket == gridDim.x * gridDim.y - 1);
}
__syncthreads();
if (is_last)
{
DstType res = tid < gridDim.x * gridDim.y ? result[tid] : VecTraits<DstType>::all(0);
smem[tid] = res.x;
smem[tid + nthreads] = res.y;
__syncthreads();
sum_in_smem<nthreads, R>(smem, tid);
sum_in_smem<nthreads, R>(smem + nthreads, tid);
if (tid == 0)
{
res.x = smem[0];
res.y = smem[nthreads];
result[0] = res;
blocks_finished = 0;
}
}
#else
if (tid == 0)
{
DstType res;
res.x = smem[0];
res.y = smem[nthreads];
result[bid] = res;
}
#endif
}
template <typename T, typename R, int nthreads>
__global__ void sum_pass2_kernel_C2(typename TypeVec<R, 2>::vec_t* result, int size)
{
typedef typename TypeVec<R, 2>::vec_t DstType;
__shared__ R smem[nthreads * 2];
const int tid = threadIdx.y * blockDim.x + threadIdx.x;
DstType res = tid < size ? result[tid] : VecTraits<DstType>::all(0);
smem[tid] = res.x;
smem[tid + nthreads] = res.y;
__syncthreads();
sum_in_smem<nthreads, R>(smem, tid);
sum_in_smem<nthreads, R>(smem + nthreads, tid);
if (tid == 0)
{
res.x = smem[0];
res.y = smem[nthreads];
result[0] = res;
}
}
template <typename T, typename R, typename Op, int nthreads>
__global__ void sum_kernel_C3(const DevMem2D src, typename TypeVec<R, 3>::vec_t* result)
{
typedef typename TypeVec<T, 3>::vec_t SrcType;
typedef typename TypeVec<R, 3>::vec_t DstType;
__shared__ R smem[nthreads * 3];
const int x0 = blockIdx.x * blockDim.x * ctwidth + threadIdx.x;
const int y0 = blockIdx.y * blockDim.y * ctheight + threadIdx.y;
const int tid = threadIdx.y * blockDim.x + threadIdx.x;
const int bid = blockIdx.y * gridDim.x + blockIdx.x;
SrcType val;
DstType sum = VecTraits<DstType>::all(0);
for (int y = 0; y < ctheight && y0 + y * blockDim.y < src.rows; ++y)
{
const SrcType* ptr = (const SrcType*)src.ptr(y0 + y * blockDim.y);
for (int x = 0; x < ctwidth && x0 + x * blockDim.x < src.cols; ++x)
{
val = ptr[x0 + x * blockDim.x];
sum = sum + VecTraits<DstType>::make(Op::call(val.x), Op::call(val.y), Op::call(val.z));
}
}
smem[tid] = sum.x;
smem[tid + nthreads] = sum.y;
smem[tid + 2 * nthreads] = sum.z;
__syncthreads();
sum_in_smem<nthreads, R>(smem, tid);
sum_in_smem<nthreads, R>(smem + nthreads, tid);
sum_in_smem<nthreads, R>(smem + 2 * nthreads, tid);
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 110
__shared__ bool is_last;
if (tid == 0)
{
DstType res;
res.x = smem[0];
res.y = smem[nthreads];
res.z = smem[2 * nthreads];
result[bid] = res;
__threadfence();
uint ticket = atomicInc(&blocks_finished, gridDim.x * gridDim.y);
is_last = (ticket == gridDim.x * gridDim.y - 1);
}
__syncthreads();
if (is_last)
{
DstType res = tid < gridDim.x * gridDim.y ? result[tid] : VecTraits<DstType>::all(0);
smem[tid] = res.x;
smem[tid + nthreads] = res.y;
smem[tid + 2 * nthreads] = res.z;
__syncthreads();
sum_in_smem<nthreads, R>(smem, tid);
sum_in_smem<nthreads, R>(smem + nthreads, tid);
sum_in_smem<nthreads, R>(smem + 2 * nthreads, tid);
if (tid == 0)
{
res.x = smem[0];
res.y = smem[nthreads];
res.z = smem[2 * nthreads];
result[0] = res;
blocks_finished = 0;
}
}
#else
if (tid == 0)
{
DstType res;
res.x = smem[0];
res.y = smem[nthreads];
res.z = smem[2 * nthreads];
result[bid] = res;
}
#endif
}
template <typename T, typename R, int nthreads>
__global__ void sum_pass2_kernel_C3(typename TypeVec<R, 3>::vec_t* result, int size)
{
typedef typename TypeVec<R, 3>::vec_t DstType;
__shared__ R smem[nthreads * 3];
const int tid = threadIdx.y * blockDim.x + threadIdx.x;
DstType res = tid < size ? result[tid] : VecTraits<DstType>::all(0);
smem[tid] = res.x;
smem[tid + nthreads] = res.y;
smem[tid + 2 * nthreads] = res.z;
__syncthreads();
sum_in_smem<nthreads, R>(smem, tid);
sum_in_smem<nthreads, R>(smem + nthreads, tid);
sum_in_smem<nthreads, R>(smem + 2 * nthreads, tid);
if (tid == 0)
{
res.x = smem[0];
res.y = smem[nthreads];
res.z = smem[2 * nthreads];
result[0] = res;
}
}
template <typename T, typename R, typename Op, int nthreads>
__global__ void sum_kernel_C4(const DevMem2D src, typename TypeVec<R, 4>::vec_t* result)
{
typedef typename TypeVec<T, 4>::vec_t SrcType;
typedef typename TypeVec<R, 4>::vec_t DstType;
__shared__ R smem[nthreads * 4];
const int x0 = blockIdx.x * blockDim.x * ctwidth + threadIdx.x;
const int y0 = blockIdx.y * blockDim.y * ctheight + threadIdx.y;
const int tid = threadIdx.y * blockDim.x + threadIdx.x;
const int bid = blockIdx.y * gridDim.x + blockIdx.x;
SrcType val;
DstType sum = VecTraits<DstType>::all(0);
for (int y = 0; y < ctheight && y0 + y * blockDim.y < src.rows; ++y)
{
const SrcType* ptr = (const SrcType*)src.ptr(y0 + y * blockDim.y);
for (int x = 0; x < ctwidth && x0 + x * blockDim.x < src.cols; ++x)
{
val = ptr[x0 + x * blockDim.x];
sum = sum + VecTraits<DstType>::make(Op::call(val.x), Op::call(val.y),
Op::call(val.z), Op::call(val.w));
}
}
smem[tid] = sum.x;
smem[tid + nthreads] = sum.y;
smem[tid + 2 * nthreads] = sum.z;
smem[tid + 3 * nthreads] = sum.w;
__syncthreads();
sum_in_smem<nthreads, R>(smem, tid);
sum_in_smem<nthreads, R>(smem + nthreads, tid);
sum_in_smem<nthreads, R>(smem + 2 * nthreads, tid);
sum_in_smem<nthreads, R>(smem + 3 * nthreads, tid);
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 110
__shared__ bool is_last;
if (tid == 0)
{
DstType res;
res.x = smem[0];
res.y = smem[nthreads];
res.z = smem[2 * nthreads];
res.w = smem[3 * nthreads];
result[bid] = res;
__threadfence();
uint ticket = atomicInc(&blocks_finished, gridDim.x * gridDim.y);
is_last = (ticket == gridDim.x * gridDim.y - 1);
}
__syncthreads();
if (is_last)
{
DstType res = tid < gridDim.x * gridDim.y ? result[tid] : VecTraits<DstType>::all(0);
smem[tid] = res.x;
smem[tid + nthreads] = res.y;
smem[tid + 2 * nthreads] = res.z;
smem[tid + 3 * nthreads] = res.w;
__syncthreads();
sum_in_smem<nthreads, R>(smem, tid);
sum_in_smem<nthreads, R>(smem + nthreads, tid);
sum_in_smem<nthreads, R>(smem + 2 * nthreads, tid);
sum_in_smem<nthreads, R>(smem + 3 * nthreads, tid);
if (tid == 0)
{
res.x = smem[0];
res.y = smem[nthreads];
res.z = smem[2 * nthreads];
res.w = smem[3 * nthreads];
result[0] = res;
blocks_finished = 0;
}
}
#else
if (tid == 0)
{
DstType res;
res.x = smem[0];
res.y = smem[nthreads];
res.z = smem[2 * nthreads];
res.w = smem[3 * nthreads];
result[bid] = res;
}
#endif
}
template <typename T, typename R, int nthreads>
__global__ void sum_pass2_kernel_C4(typename TypeVec<R, 4>::vec_t* result, int size)
{
typedef typename TypeVec<R, 4>::vec_t DstType;
__shared__ R smem[nthreads * 4];
const int tid = threadIdx.y * blockDim.x + threadIdx.x;
DstType res = tid < size ? result[tid] : VecTraits<DstType>::all(0);
smem[tid] = res.x;
smem[tid + nthreads] = res.y;
smem[tid + 2 * nthreads] = res.z;
smem[tid + 3 * nthreads] = res.w;
__syncthreads();
sum_in_smem<nthreads, R>(smem, tid);
sum_in_smem<nthreads, R>(smem + nthreads, tid);
sum_in_smem<nthreads, R>(smem + 2 * nthreads, tid);
sum_in_smem<nthreads, R>(smem + 3 * nthreads, tid);
if (tid == 0)
{
res.x = smem[0];
res.y = smem[nthreads];
res.z = smem[2 * nthreads];
res.w = smem[3 * nthreads];
result[0] = res;
}
}
} // namespace sum
template <typename T>
void sum_multipass_caller(const DevMem2D src, PtrStep buf, double* sum, int cn)
{
using namespace sum;
typedef typename SumType<T>::R R;
dim3 threads, grid;
estimate_thread_cfg(src.cols, src.rows, threads, grid);
set_kernel_consts(src.cols, src.rows, threads, grid);
switch (cn)
{
case 1:
sum_kernel<T, R, IdentityOp<R>, threads_x * threads_y><<<grid, threads>>>(
src, (typename TypeVec<R, 1>::vec_t*)buf.ptr(0));
sum_pass2_kernel<T, R, threads_x * threads_y><<<1, threads_x * threads_y>>>(
(typename TypeVec<R, 1>::vec_t*)buf.ptr(0), grid.x * grid.y);
break;
case 2:
sum_kernel_C2<T, R, IdentityOp<R>, threads_x * threads_y><<<grid, threads>>>(
src, (typename TypeVec<R, 2>::vec_t*)buf.ptr(0));
sum_pass2_kernel_C2<T, R, threads_x * threads_y><<<1, threads_x * threads_y>>>(
(typename TypeVec<R, 2>::vec_t*)buf.ptr(0), grid.x * grid.y);
break;
case 3:
sum_kernel_C3<T, R, IdentityOp<R>, threads_x * threads_y><<<grid, threads>>>(
src, (typename TypeVec<R, 3>::vec_t*)buf.ptr(0));
sum_pass2_kernel_C3<T, R, threads_x * threads_y><<<1, threads_x * threads_y>>>(
(typename TypeVec<R, 3>::vec_t*)buf.ptr(0), grid.x * grid.y);
break;
case 4:
sum_kernel_C4<T, R, IdentityOp<R>, threads_x * threads_y><<<grid, threads>>>(
src, (typename TypeVec<R, 4>::vec_t*)buf.ptr(0));
sum_pass2_kernel_C4<T, R, threads_x * threads_y><<<1, threads_x * threads_y>>>(
(typename TypeVec<R, 4>::vec_t*)buf.ptr(0), grid.x * grid.y);
break;
}
cudaSafeCall(cudaThreadSynchronize());
R result[4] = {0, 0, 0, 0};
cudaSafeCall(cudaMemcpy(result, buf.ptr(0), sizeof(R) * cn, cudaMemcpyDeviceToHost));
sum[0] = result[0];
sum[1] = result[1];
sum[2] = result[2];
sum[3] = result[3];
}
template void sum_multipass_caller<uchar>(const DevMem2D, PtrStep, double*, int);
template void sum_multipass_caller<char>(const DevMem2D, PtrStep, double*, int);
template void sum_multipass_caller<ushort>(const DevMem2D, PtrStep, double*, int);
template void sum_multipass_caller<short>(const DevMem2D, PtrStep, double*, int);
template void sum_multipass_caller<int>(const DevMem2D, PtrStep, double*, int);
template void sum_multipass_caller<float>(const DevMem2D, PtrStep, double*, int);
template <typename T>
void sum_caller(const DevMem2D src, PtrStep buf, double* sum, int cn)
{
using namespace sum;
typedef typename SumType<T>::R R;
dim3 threads, grid;
estimate_thread_cfg(src.cols, src.rows, threads, grid);
set_kernel_consts(src.cols, src.rows, threads, grid);
switch (cn)
{
case 1:
sum_kernel<T, R, IdentityOp<R>, threads_x * threads_y><<<grid, threads>>>(
src, (typename TypeVec<R, 1>::vec_t*)buf.ptr(0));
break;
case 2:
sum_kernel_C2<T, R, IdentityOp<R>, threads_x * threads_y><<<grid, threads>>>(
src, (typename TypeVec<R, 2>::vec_t*)buf.ptr(0));
break;
case 3:
sum_kernel_C3<T, R, IdentityOp<R>, threads_x * threads_y><<<grid, threads>>>(
src, (typename TypeVec<R, 3>::vec_t*)buf.ptr(0));
break;
case 4:
sum_kernel_C4<T, R, IdentityOp<R>, threads_x * threads_y><<<grid, threads>>>(
src, (typename TypeVec<R, 4>::vec_t*)buf.ptr(0));
break;
}
cudaSafeCall(cudaThreadSynchronize());
R result[4] = {0, 0, 0, 0};
cudaSafeCall(cudaMemcpy(result, buf.ptr(0), sizeof(R) * cn, cudaMemcpyDeviceToHost));
sum[0] = result[0];
sum[1] = result[1];
sum[2] = result[2];
sum[3] = result[3];
}
template void sum_caller<uchar>(const DevMem2D, PtrStep, double*, int);
template void sum_caller<char>(const DevMem2D, PtrStep, double*, int);
template void sum_caller<ushort>(const DevMem2D, PtrStep, double*, int);
template void sum_caller<short>(const DevMem2D, PtrStep, double*, int);
template void sum_caller<int>(const DevMem2D, PtrStep, double*, int);
template void sum_caller<float>(const DevMem2D, PtrStep, double*, int);
template <typename T>
void sqsum_multipass_caller(const DevMem2D src, PtrStep buf, double* sum, int cn)
{
using namespace sum;
typedef typename SumType<T>::R R;
dim3 threads, grid;
estimate_thread_cfg(src.cols, src.rows, threads, grid);
set_kernel_consts(src.cols, src.rows, threads, grid);
switch (cn)
{
case 1:
sum_kernel<T, R, SqrOp<R>, threads_x * threads_y><<<grid, threads>>>(
src, (typename TypeVec<R, 1>::vec_t*)buf.ptr(0));
sum_pass2_kernel<T, R, threads_x * threads_y><<<1, threads_x * threads_y>>>(
(typename TypeVec<R, 1>::vec_t*)buf.ptr(0), grid.x * grid.y);
break;
case 2:
sum_kernel_C2<T, R, SqrOp<R>, threads_x * threads_y><<<grid, threads>>>(
src, (typename TypeVec<R, 2>::vec_t*)buf.ptr(0));
sum_pass2_kernel_C2<T, R, threads_x * threads_y><<<1, threads_x * threads_y>>>(
(typename TypeVec<R, 2>::vec_t*)buf.ptr(0), grid.x * grid.y);
break;
case 3:
sum_kernel_C3<T, R, SqrOp<R>, threads_x * threads_y><<<grid, threads>>>(
src, (typename TypeVec<R, 3>::vec_t*)buf.ptr(0));
sum_pass2_kernel_C3<T, R, threads_x * threads_y><<<1, threads_x * threads_y>>>(
(typename TypeVec<R, 3>::vec_t*)buf.ptr(0), grid.x * grid.y);
break;
case 4:
sum_kernel_C4<T, R, SqrOp<R>, threads_x * threads_y><<<grid, threads>>>(
src, (typename TypeVec<R, 4>::vec_t*)buf.ptr(0));
sum_pass2_kernel_C4<T, R, threads_x * threads_y><<<1, threads_x * threads_y>>>(
(typename TypeVec<R, 4>::vec_t*)buf.ptr(0), grid.x * grid.y);
break;
}
cudaSafeCall(cudaThreadSynchronize());
R result[4] = {0, 0, 0, 0};
cudaSafeCall(cudaMemcpy(result, buf.ptr(0), sizeof(R) * cn, cudaMemcpyDeviceToHost));
sum[0] = result[0];
sum[1] = result[1];
sum[2] = result[2];
sum[3] = result[3];
}
template void sqsum_multipass_caller<uchar>(const DevMem2D, PtrStep, double*, int);
template void sqsum_multipass_caller<char>(const DevMem2D, PtrStep, double*, int);
template void sqsum_multipass_caller<ushort>(const DevMem2D, PtrStep, double*, int);
template void sqsum_multipass_caller<short>(const DevMem2D, PtrStep, double*, int);
template void sqsum_multipass_caller<int>(const DevMem2D, PtrStep, double*, int);
template void sqsum_multipass_caller<float>(const DevMem2D, PtrStep, double*, int);
template <typename T>
void sqsum_caller(const DevMem2D src, PtrStep buf, double* sum, int cn)
{
using namespace sum;
typedef typename SumType<T>::R R;
dim3 threads, grid;
estimate_thread_cfg(src.cols, src.rows, threads, grid);
set_kernel_consts(src.cols, src.rows, threads, grid);
switch (cn)
{
case 1:
sum_kernel<T, R, SqrOp<R>, threads_x * threads_y><<<grid, threads>>>(
src, (typename TypeVec<R, 1>::vec_t*)buf.ptr(0));
break;
case 2:
sum_kernel_C2<T, R, SqrOp<R>, threads_x * threads_y><<<grid, threads>>>(
src, (typename TypeVec<R, 2>::vec_t*)buf.ptr(0));
break;
case 3:
sum_kernel_C3<T, R, SqrOp<R>, threads_x * threads_y><<<grid, threads>>>(
src, (typename TypeVec<R, 3>::vec_t*)buf.ptr(0));
break;
case 4:
sum_kernel_C4<T, R, SqrOp<R>, threads_x * threads_y><<<grid, threads>>>(
src, (typename TypeVec<R, 4>::vec_t*)buf.ptr(0));
break;
}
cudaSafeCall(cudaThreadSynchronize());
R result[4] = {0, 0, 0, 0};
cudaSafeCall(cudaMemcpy(result, buf.ptr(0), sizeof(R) * cn, cudaMemcpyDeviceToHost));
sum[0] = result[0];
sum[1] = result[1];
sum[2] = result[2];
sum[3] = result[3];
}
template void sqsum_caller<uchar>(const DevMem2D, PtrStep, double*, int);
template void sqsum_caller<char>(const DevMem2D, PtrStep, double*, int);
template void sqsum_caller<ushort>(const DevMem2D, PtrStep, double*, int);
template void sqsum_caller<short>(const DevMem2D, PtrStep, double*, int);
template void sqsum_caller<int>(const DevMem2D, PtrStep, double*, int);
template void sqsum_caller<float>(const DevMem2D, PtrStep, double*, int);
}}}
\ No newline at end of file
......@@ -66,10 +66,14 @@ void cv::gpu::bitwise_and(const GpuMat&, const GpuMat&, GpuMat&, const GpuMat&)
void cv::gpu::bitwise_and(const GpuMat&, const GpuMat&, GpuMat&, const GpuMat&, const Stream&) { throw_nogpu(); }
void cv::gpu::bitwise_xor(const GpuMat&, const GpuMat&, GpuMat&, const GpuMat&) { throw_nogpu(); }
void cv::gpu::bitwise_xor(const GpuMat&, const GpuMat&, GpuMat&, const GpuMat&, const Stream&) { throw_nogpu(); }
cv::gpu::GpuMat cv::gpu::operator ~ (const GpuMat&) { throw_nogpu(); return GpuMat(); }
cv::gpu::GpuMat cv::gpu::operator | (const GpuMat&, const GpuMat&) { throw_nogpu(); return GpuMat(); }
cv::gpu::GpuMat cv::gpu::operator & (const GpuMat&, const GpuMat&) { throw_nogpu(); return GpuMat(); }
cv::gpu::GpuMat cv::gpu::operator ^ (const GpuMat&, const GpuMat&) { throw_nogpu(); return GpuMat(); }
void cv::gpu::min(const GpuMat&, const GpuMat&, GpuMat&) { throw_nogpu(); }
void cv::gpu::min(const GpuMat&, const GpuMat&, GpuMat&, const Stream&) { throw_nogpu(); }
void cv::gpu::min(const GpuMat&, double, GpuMat&) { throw_nogpu(); }
void cv::gpu::min(const GpuMat&, double, GpuMat&, const Stream&) { throw_nogpu(); }
void cv::gpu::max(const GpuMat&, const GpuMat&, GpuMat&) { throw_nogpu(); }
void cv::gpu::max(const GpuMat&, const GpuMat&, GpuMat&, const Stream&) { throw_nogpu(); }
void cv::gpu::max(const GpuMat&, double, GpuMat&) { throw_nogpu(); }
void cv::gpu::max(const GpuMat&, double, GpuMat&, const Stream&) { throw_nogpu(); }
#else
......@@ -574,4 +578,144 @@ void cv::gpu::bitwise_xor(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, c
::bitwiseXorCaller(src1, src2, dst, mask, StreamAccessor::getStream(stream));
}
//////////////////////////////////////////////////////////////////////////////
// Minimum and maximum operations
namespace cv { namespace gpu { namespace mathfunc
{
template <typename T>
void min_gpu(const DevMem2D_<T>& src1, const DevMem2D_<T>& src2, const DevMem2D_<T>& dst, cudaStream_t stream);
template <typename T>
void max_gpu(const DevMem2D_<T>& src1, const DevMem2D_<T>& src2, const DevMem2D_<T>& dst, cudaStream_t stream);
template <typename T>
void min_gpu(const DevMem2D_<T>& src1, double src2, const DevMem2D_<T>& dst, cudaStream_t stream);
template <typename T>
void max_gpu(const DevMem2D_<T>& src1, double src2, const DevMem2D_<T>& dst, cudaStream_t stream);
}}}
namespace
{
template <typename T>
void min_caller(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, cudaStream_t stream)
{
CV_Assert(src1.size() == src2.size() && src1.type() == src2.type());
dst.create(src1.size(), src1.type());
mathfunc::min_gpu<T>(src1.reshape(1), src2.reshape(1), dst.reshape(1), stream);
}
template <typename T>
void min_caller(const GpuMat& src1, double src2, GpuMat& dst, cudaStream_t stream)
{
dst.create(src1.size(), src1.type());
mathfunc::min_gpu<T>(src1.reshape(1), src2, dst.reshape(1), stream);
}
template <typename T>
void max_caller(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, cudaStream_t stream)
{
CV_Assert(src1.size() == src2.size() && src1.type() == src2.type());
dst.create(src1.size(), src1.type());
mathfunc::max_gpu<T>(src1.reshape(1), src2.reshape(1), dst.reshape(1), stream);
}
template <typename T>
void max_caller(const GpuMat& src1, double src2, GpuMat& dst, cudaStream_t stream)
{
dst.create(src1.size(), src1.type());
mathfunc::max_gpu<T>(src1.reshape(1), src2, dst.reshape(1), stream);
}
}
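// Hypothetical usage sketch (assumes two device images of equal size/type
// already uploaded):
//   GpuMat a, b, c;
//   cv::gpu::min(a, b, c);   // c(y, x) = min(a(y, x), b(y, x))
//   cv::gpu::max(a, 0.5, c); // per-element maximum against a scalar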
void cv::gpu::min(const GpuMat& src1, const GpuMat& src2, GpuMat& dst)
{
typedef void (*func_t)(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, cudaStream_t stream);
static const func_t funcs[] =
{
min_caller<uchar>, min_caller<char>, min_caller<ushort>, min_caller<short>, min_caller<int>,
min_caller<float>, min_caller<double>
};
funcs[src1.depth()](src1, src2, dst, 0);
}
void cv::gpu::min(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const Stream& stream)
{
typedef void (*func_t)(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, cudaStream_t stream);
static const func_t funcs[] =
{
min_caller<uchar>, min_caller<char>, min_caller<ushort>, min_caller<short>, min_caller<int>,
min_caller<float>, min_caller<double>
};
funcs[src1.depth()](src1, src2, dst, StreamAccessor::getStream(stream));
}
void cv::gpu::min(const GpuMat& src1, double src2, GpuMat& dst)
{
typedef void (*func_t)(const GpuMat& src1, double src2, GpuMat& dst, cudaStream_t stream);
static const func_t funcs[] =
{
min_caller<uchar>, min_caller<char>, min_caller<ushort>, min_caller<short>, min_caller<int>,
min_caller<float>, min_caller<double>
};
funcs[src1.depth()](src1, src2, dst, 0);
}
void cv::gpu::min(const GpuMat& src1, double src2, GpuMat& dst, const Stream& stream)
{
typedef void (*func_t)(const GpuMat& src1, double src2, GpuMat& dst, cudaStream_t stream);
static const func_t funcs[] =
{
min_caller<uchar>, min_caller<char>, min_caller<ushort>, min_caller<short>, min_caller<int>,
min_caller<float>, min_caller<double>
};
funcs[src1.depth()](src1, src2, dst, StreamAccessor::getStream(stream));
}
void cv::gpu::max(const GpuMat& src1, const GpuMat& src2, GpuMat& dst)
{
typedef void (*func_t)(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, cudaStream_t stream);
static const func_t funcs[] =
{
max_caller<uchar>, max_caller<char>, max_caller<ushort>, max_caller<short>, max_caller<int>,
max_caller<float>, max_caller<double>
};
funcs[src1.depth()](src1, src2, dst, 0);
}
void cv::gpu::max(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const Stream& stream)
{
typedef void (*func_t)(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, cudaStream_t stream);
static const func_t funcs[] =
{
max_caller<uchar>, max_caller<char>, max_caller<ushort>, max_caller<short>, max_caller<int>,
max_caller<float>, max_caller<double>
};
funcs[src1.depth()](src1, src2, dst, StreamAccessor::getStream(stream));
}
void cv::gpu::max(const GpuMat& src1, double src2, GpuMat& dst)
{
typedef void (*func_t)(const GpuMat& src1, double src2, GpuMat& dst, cudaStream_t stream);
static const func_t funcs[] =
{
max_caller<uchar>, max_caller<char>, max_caller<ushort>, max_caller<short>, max_caller<int>,
max_caller<float>, max_caller<double>
};
funcs[src1.depth()](src1, src2, dst, 0);
}
void cv::gpu::max(const GpuMat& src1, double src2, GpuMat& dst, const Stream& stream)
{
typedef void (*func_t)(const GpuMat& src1, double src2, GpuMat& dst, cudaStream_t stream);
static const func_t funcs[] =
{
max_caller<uchar>, max_caller<char>, max_caller<ushort>, max_caller<short>, max_caller<int>,
max_caller<float>, max_caller<double>
};
funcs[src1.depth()](src1, src2, dst, StreamAccessor::getStream(stream));
}
#endif
\ No newline at end of file
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "precomp.hpp"
using namespace cv;
using namespace cv::gpu;
#if !defined (HAVE_CUDA)
void cv::gpu::meanStdDev(const GpuMat&, Scalar&, Scalar&) { throw_nogpu(); }
double cv::gpu::norm(const GpuMat&, int) { throw_nogpu(); return 0.0; }
double cv::gpu::norm(const GpuMat&, const GpuMat&, int) { throw_nogpu(); return 0.0; }
Scalar cv::gpu::sum(const GpuMat&) { throw_nogpu(); return Scalar(); }
Scalar cv::gpu::sum(const GpuMat&, GpuMat&) { throw_nogpu(); return Scalar(); }
Scalar cv::gpu::sqrSum(const GpuMat&) { throw_nogpu(); return Scalar(); }
Scalar cv::gpu::sqrSum(const GpuMat&, GpuMat&) { throw_nogpu(); return Scalar(); }
void cv::gpu::minMax(const GpuMat&, double*, double*, const GpuMat&) { throw_nogpu(); }
void cv::gpu::minMax(const GpuMat&, double*, double*, const GpuMat&, GpuMat&) { throw_nogpu(); }
void cv::gpu::minMaxLoc(const GpuMat&, double*, double*, Point*, Point*, const GpuMat&) { throw_nogpu(); }
void cv::gpu::minMaxLoc(const GpuMat&, double*, double*, Point*, Point*, const GpuMat&, GpuMat&, GpuMat&) { throw_nogpu(); }
int cv::gpu::countNonZero(const GpuMat&) { throw_nogpu(); return 0; }
int cv::gpu::countNonZero(const GpuMat&, GpuMat&) { throw_nogpu(); return 0; }
#else
////////////////////////////////////////////////////////////////////////
// meanStdDev
void cv::gpu::meanStdDev(const GpuMat& src, Scalar& mean, Scalar& stddev)
{
CV_Assert(src.type() == CV_8UC1);
NppiSize sz;
sz.width = src.cols;
sz.height = src.rows;
nppSafeCall( nppiMean_StdDev_8u_C1R(src.ptr<Npp8u>(), src.step, sz, mean.val, stddev.val) );
}
////////////////////////////////////////////////////////////////////////
// norm
double cv::gpu::norm(const GpuMat& src1, int normType)
{
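// The one-array norm is computed as the norm of the difference against an
// implicit zero matrix of the same size and type.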
return norm(src1, GpuMat(src1.size(), src1.type(), Scalar::all(0.0)), normType);
}
double cv::gpu::norm(const GpuMat& src1, const GpuMat& src2, int normType)
{
CV_DbgAssert(src1.size() == src2.size() && src1.type() == src2.type());
CV_Assert(src1.type() == CV_8UC1);
CV_Assert(normType == NORM_INF || normType == NORM_L1 || normType == NORM_L2);
typedef NppStatus (*npp_norm_diff_func_t)(const Npp8u* pSrc1, int nSrcStep1, const Npp8u* pSrc2, int nSrcStep2,
NppiSize oSizeROI, Npp64f* pRetVal);
static const npp_norm_diff_func_t npp_norm_diff_func[] = {nppiNormDiff_Inf_8u_C1R, nppiNormDiff_L1_8u_C1R, nppiNormDiff_L2_8u_C1R};
NppiSize sz;
sz.width = src1.cols;
sz.height = src1.rows;
int funcIdx = normType >> 1;
double retVal;
nppSafeCall( npp_norm_diff_func[funcIdx](src1.ptr<Npp8u>(), src1.step,
src2.ptr<Npp8u>(), src2.step,
sz, &retVal) );
return retVal;
}
////////////////////////////////////////////////////////////////////////
// Sum
namespace cv { namespace gpu { namespace mathfunc
{
template <typename T>
void sum_caller(const DevMem2D src, PtrStep buf, double* sum, int cn);
template <typename T>
void sum_multipass_caller(const DevMem2D src, PtrStep buf, double* sum, int cn);
template <typename T>
void sqsum_caller(const DevMem2D src, PtrStep buf, double* sum, int cn);
template <typename T>
void sqsum_multipass_caller(const DevMem2D src, PtrStep buf, double* sum, int cn);
namespace sum
{
void get_buf_size_required(int cols, int rows, int cn, int& bufcols, int& bufrows);
}
}}}
Scalar cv::gpu::sum(const GpuMat& src)
{
GpuMat buf;
return sum(src, buf);
}
Scalar cv::gpu::sum(const GpuMat& src, GpuMat& buf)
{
using namespace mathfunc;
typedef void (*Caller)(const DevMem2D, PtrStep, double*, int);
static const Caller callers[2][7] =
{ { sum_multipass_caller<unsigned char>, sum_multipass_caller<char>,
sum_multipass_caller<unsigned short>, sum_multipass_caller<short>,
sum_multipass_caller<int>, sum_multipass_caller<float>, 0 },
{ sum_caller<unsigned char>, sum_caller<char>,
sum_caller<unsigned short>, sum_caller<short>,
sum_caller<int>, sum_caller<float>, 0 } };
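// Row 0 (devices without global atomics, CC 1.0) selects the two-pass
// reduction; row 1 the single-pass scheme. Neither row has a double entry,
// so CV_64F sources are rejected below.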
Size bufSize;
sum::get_buf_size_required(src.cols, src.rows, src.channels(), bufSize.width, bufSize.height);
buf.create(bufSize, CV_8U);
Caller caller = callers[hasAtomicsSupport(getDevice())][src.depth()];
if (!caller) CV_Error(CV_StsBadArg, "sum: unsupported type");
double result[4];
caller(src, buf, result, src.channels());
return Scalar(result[0], result[1], result[2], result[3]);
}
Scalar cv::gpu::sqrSum(const GpuMat& src)
{
GpuMat buf;
return sqrSum(src, buf);
}
Scalar cv::gpu::sqrSum(const GpuMat& src, GpuMat& buf)
{
using namespace mathfunc;
typedef void (*Caller)(const DevMem2D, PtrStep, double*, int);
static const Caller callers[2][7] =
{ { sqsum_multipass_caller<unsigned char>, sqsum_multipass_caller<char>,
sqsum_multipass_caller<unsigned short>, sqsum_multipass_caller<short>,
sqsum_multipass_caller<int>, sqsum_multipass_caller<float>, 0 },
{ sqsum_caller<unsigned char>, sqsum_caller<char>,
sqsum_caller<unsigned short>, sqsum_caller<short>,
sqsum_caller<int>, sqsum_caller<float>, 0 } };
Size bufSize;
sum::get_buf_size_required(src.cols, src.rows, src.channels(), bufSize.width, bufSize.height);
buf.create(bufSize, CV_8U);
Caller caller = callers[hasAtomicsSupport(getDevice())][src.depth()];
if (!caller) CV_Error(CV_StsBadArg, "sqrSum: unsupported type");
double result[4];
caller(src, buf, result, src.channels());
return Scalar(result[0], result[1], result[2], result[3]);
}
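// Hypothetical follow-up computation (host side, single-channel image with
// N = src.rows * src.cols pixels): the variance could be derived as
//   double n = src.rows * (double)src.cols;
//   double mean = cv::gpu::sum(src)[0] / n;
//   double var  = cv::gpu::sqrSum(src)[0] / n - mean * mean;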
////////////////////////////////////////////////////////////////////////
// Find min or max
namespace cv { namespace gpu { namespace mathfunc { namespace minmax {
void get_buf_size_required(int cols, int rows, int elem_size, int& bufcols, int& bufrows);
template <typename T>
void min_max_caller(const DevMem2D src, double* minval, double* maxval, PtrStep buf);
template <typename T>
void min_max_mask_caller(const DevMem2D src, const PtrStep mask, double* minval, double* maxval, PtrStep buf);
template <typename T>
void min_max_multipass_caller(const DevMem2D src, double* minval, double* maxval, PtrStep buf);
template <typename T>
void min_max_mask_multipass_caller(const DevMem2D src, const PtrStep mask, double* minval, double* maxval, PtrStep buf);
}}}}
void cv::gpu::minMax(const GpuMat& src, double* minVal, double* maxVal, const GpuMat& mask)
{
GpuMat buf;
minMax(src, minVal, maxVal, mask, buf);
}
void cv::gpu::minMax(const GpuMat& src, double* minVal, double* maxVal, const GpuMat& mask, GpuMat& buf)
{
using namespace mathfunc::minmax;
typedef void (*Caller)(const DevMem2D, double*, double*, PtrStep);
typedef void (*MaskedCaller)(const DevMem2D, const PtrStep, double*, double*, PtrStep);
static const Caller callers[2][7] =
{ { min_max_multipass_caller<unsigned char>, min_max_multipass_caller<char>,
min_max_multipass_caller<unsigned short>, min_max_multipass_caller<short>,
min_max_multipass_caller<int>, min_max_multipass_caller<float>, 0 },
{ min_max_caller<unsigned char>, min_max_caller<char>,
min_max_caller<unsigned short>, min_max_caller<short>,
min_max_caller<int>, min_max_caller<float>, min_max_caller<double> } };
static const MaskedCaller masked_callers[2][7] =
{ { min_max_mask_multipass_caller<unsigned char>, min_max_mask_multipass_caller<char>,
min_max_mask_multipass_caller<unsigned short>, min_max_mask_multipass_caller<short>,
min_max_mask_multipass_caller<int>, min_max_mask_multipass_caller<float>, 0 },
{ min_max_mask_caller<unsigned char>, min_max_mask_caller<char>,
min_max_mask_caller<unsigned short>, min_max_mask_caller<short>,
min_max_mask_caller<int>, min_max_mask_caller<float>,
min_max_mask_caller<double> } };
CV_Assert(src.channels() == 1);
CV_Assert(mask.empty() || (mask.type() == CV_8U && src.size() == mask.size()));
CV_Assert(src.type() != CV_64F || hasNativeDoubleSupport(getDevice()));
double minVal_; if (!minVal) minVal = &minVal_;
double maxVal_; if (!maxVal) maxVal = &maxVal_;
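// Null output pointers are redirected to local dummies so callers may ask
// for only one of the two bounds.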
Size bufSize;
get_buf_size_required(src.cols, src.rows, src.elemSize(), bufSize.width, bufSize.height);
buf.create(bufSize, CV_8U);
if (mask.empty())
{
Caller caller = callers[hasAtomicsSupport(getDevice())][src.type()];
if (!caller) CV_Error(CV_StsBadArg, "minMax: unsupported type");
caller(src, minVal, maxVal, buf);
}
else
{
MaskedCaller caller = masked_callers[hasAtomicsSupport(getDevice())][src.type()];
if (!caller) CV_Error(CV_StsBadArg, "minMax: unsupported type");
caller(src, mask, minVal, maxVal, buf);
}
}
////////////////////////////////////////////////////////////////////////
// Locate min and max
namespace cv { namespace gpu { namespace mathfunc { namespace minmaxloc {
void get_buf_size_required(int cols, int rows, int elem_size, int& b1cols,
int& b1rows, int& b2cols, int& b2rows);
template <typename T>
void min_max_loc_caller(const DevMem2D src, double* minval, double* maxval,
int minloc[2], int maxloc[2], PtrStep valbuf, PtrStep locbuf);
template <typename T>
void min_max_loc_mask_caller(const DevMem2D src, const PtrStep mask, double* minval, double* maxval,
int minloc[2], int maxloc[2], PtrStep valbuf, PtrStep locbuf);
template <typename T>
void min_max_loc_multipass_caller(const DevMem2D src, double* minval, double* maxval,
int minloc[2], int maxloc[2], PtrStep valbuf, PtrStep locbuf);
template <typename T>
void min_max_loc_mask_multipass_caller(const DevMem2D src, const PtrStep mask, double* minval, double* maxval,
int minloc[2], int maxloc[2], PtrStep valbuf, PtrStep locbuf);
}}}}
void cv::gpu::minMaxLoc(const GpuMat& src, double* minVal, double* maxVal, Point* minLoc, Point* maxLoc, const GpuMat& mask)
{
GpuMat valbuf, locbuf;
minMaxLoc(src, minVal, maxVal, minLoc, maxLoc, mask, valbuf, locbuf);
}
void cv::gpu::minMaxLoc(const GpuMat& src, double* minVal, double* maxVal, Point* minLoc, Point* maxLoc,
const GpuMat& mask, GpuMat& valbuf, GpuMat& locbuf)
{
using namespace mathfunc::minmaxloc;
typedef void (*Caller)(const DevMem2D, double*, double*, int[2], int[2], PtrStep, PtrStep);
typedef void (*MaskedCaller)(const DevMem2D, const PtrStep, double*, double*, int[2], int[2], PtrStep, PtrStep);
static const Caller callers[2][7] =
{ { min_max_loc_multipass_caller<unsigned char>, min_max_loc_multipass_caller<char>,
min_max_loc_multipass_caller<unsigned short>, min_max_loc_multipass_caller<short>,
min_max_loc_multipass_caller<int>, min_max_loc_multipass_caller<float>, 0 },
{ min_max_loc_caller<unsigned char>, min_max_loc_caller<char>,
min_max_loc_caller<unsigned short>, min_max_loc_caller<short>,
min_max_loc_caller<int>, min_max_loc_caller<float>, min_max_loc_caller<double> } };
static const MaskedCaller masked_callers[2][7] =
{ { min_max_loc_mask_multipass_caller<unsigned char>, min_max_loc_mask_multipass_caller<char>,
min_max_loc_mask_multipass_caller<unsigned short>, min_max_loc_mask_multipass_caller<short>,
min_max_loc_mask_multipass_caller<int>, min_max_loc_mask_multipass_caller<float>, 0 },
{ min_max_loc_mask_caller<unsigned char>, min_max_loc_mask_caller<char>,
min_max_loc_mask_caller<unsigned short>, min_max_loc_mask_caller<short>,
min_max_loc_mask_caller<int>, min_max_loc_mask_caller<float>, min_max_loc_mask_caller<double> } };
CV_Assert(src.channels() == 1);
CV_Assert(mask.empty() || (mask.type() == CV_8U && src.size() == mask.size()));
CV_Assert(src.type() != CV_64F || hasNativeDoubleSupport(getDevice()));
double minVal_; if (!minVal) minVal = &minVal_;
double maxVal_; if (!maxVal) maxVal = &maxVal_;
int minLoc_[2];
int maxLoc_[2];
Size valbuf_size, locbuf_size;
get_buf_size_required(src.cols, src.rows, src.elemSize(), valbuf_size.width,
valbuf_size.height, locbuf_size.width, locbuf_size.height);
valbuf.create(valbuf_size, CV_8U);
locbuf.create(locbuf_size, CV_8U);
if (mask.empty())
{
Caller caller = callers[hasAtomicsSupport(getDevice())][src.type()];
if (!caller) CV_Error(CV_StsBadArg, "minMaxLoc: unsupported type");
caller(src, minVal, maxVal, minLoc_, maxLoc_, valbuf, locbuf);
}
else
{
MaskedCaller caller = masked_callers[hasAtomicsSupport(getDevice())][src.type()];
if (!caller) CV_Error(CV_StsBadArg, "minMaxLoc: unsupported type");
caller(src, mask, minVal, maxVal, minLoc_, maxLoc_, valbuf, locbuf);
}
if (minLoc) { minLoc->x = minLoc_[0]; minLoc->y = minLoc_[1]; }
if (maxLoc) { maxLoc->x = maxLoc_[0]; maxLoc->y = maxLoc_[1]; }
}
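// Hypothetical usage sketch (single-channel image assumed):
//   double mn, mx; Point pmin, pmax;
//   cv::gpu::minMaxLoc(img, &mn, &mx, &pmin, &pmax);
//   // pmin and pmax now hold the (x, y) positions of the extrema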
//////////////////////////////////////////////////////////////////////////////
// Count non-zero elements
namespace cv { namespace gpu { namespace mathfunc { namespace countnonzero {
void get_buf_size_required(int cols, int rows, int& bufcols, int& bufrows);
template <typename T>
int count_non_zero_caller(const DevMem2D src, PtrStep buf);
template <typename T>
int count_non_zero_multipass_caller(const DevMem2D src, PtrStep buf);
}}}}
int cv::gpu::countNonZero(const GpuMat& src)
{
GpuMat buf;
return countNonZero(src, buf);
}
int cv::gpu::countNonZero(const GpuMat& src, GpuMat& buf)
{
using namespace mathfunc::countnonzero;
typedef int (*Caller)(const DevMem2D src, PtrStep buf);
static const Caller callers[2][7] =
{ { count_non_zero_multipass_caller<unsigned char>, count_non_zero_multipass_caller<char>,
count_non_zero_multipass_caller<unsigned short>, count_non_zero_multipass_caller<short>,
count_non_zero_multipass_caller<int>, count_non_zero_multipass_caller<float>, 0},
{ count_non_zero_caller<unsigned char>, count_non_zero_caller<char>,
count_non_zero_caller<unsigned short>, count_non_zero_caller<short>,
count_non_zero_caller<int>, count_non_zero_caller<float>, count_non_zero_caller<double> } };
CV_Assert(src.channels() == 1);
CV_Assert(src.type() != CV_64F || hasNativeDoubleSupport(getDevice()));
Size buf_size;
get_buf_size_required(src.cols, src.rows, buf_size.width, buf_size.height);
buf.create(buf_size, CV_8U);
Caller caller = callers[hasAtomicsSupport(getDevice())][src.type()];
if (!caller) CV_Error(CV_StsBadArg, "countNonZero: unsupported type");
return caller(src, buf);
}
#endif