Commit e8f9762e authored by Vladislav Vinogradov

matrix reduction

parent fbf3de43
This source diff could not be displayed because it is too large.
@@ -204,34 +204,19 @@ double cv::gpu::norm(const GpuMat& src1, const GpuMat& src2, int normType)
 ////////////////////////////////////////////////////////////////////////
 // Sum

-namespace cv { namespace gpu { namespace device
-{
-    namespace matrix_reductions
-    {
-        namespace sum
-        {
-            template <typename T>
-            void sumCaller(const PtrStepSzb src, PtrStepb buf, double* sum, int cn);
-
-            template <typename T>
-            void sumMultipassCaller(const PtrStepSzb src, PtrStepb buf, double* sum, int cn);
-
-            template <typename T>
-            void absSumCaller(const PtrStepSzb src, PtrStepb buf, double* sum, int cn);
-
-            template <typename T>
-            void absSumMultipassCaller(const PtrStepSzb src, PtrStepb buf, double* sum, int cn);
-
-            template <typename T>
-            void sqrSumCaller(const PtrStepSzb src, PtrStepb buf, double* sum, int cn);
-
-            template <typename T>
-            void sqrSumMultipassCaller(const PtrStepSzb src, PtrStepb buf, double* sum, int cn);
-
-            void getBufSizeRequired(int cols, int rows, int cn, int& bufcols, int& bufrows);
-        }
-    }
-}}}
+namespace sum
+{
+    void getBufSize(int cols, int rows, int cn, int& bufcols, int& bufrows);
+
+    template <typename T, int cn>
+    void run(PtrStepSzb src, void* buf, double* sum);
+
+    template <typename T, int cn>
+    void runAbs(PtrStepSzb src, void* buf, double* sum);
+
+    template <typename T, int cn>
+    void runSqr(PtrStepSzb src, void* buf, double* sum);
+}

 Scalar cv::gpu::sum(const GpuMat& src)
 {
@@ -239,159 +224,115 @@ Scalar cv::gpu::sum(const GpuMat& src)
     return sum(src, buf);
 }

 Scalar cv::gpu::sum(const GpuMat& src, GpuMat& buf)
 {
-    using namespace cv::gpu::device::matrix_reductions::sum;
-
-    typedef void (*Caller)(const PtrStepSzb, PtrStepb, double*, int);
-
-    static Caller multipass_callers[] =
-    {
-        sumMultipassCaller<unsigned char>, sumMultipassCaller<char>,
-        sumMultipassCaller<unsigned short>, sumMultipassCaller<short>,
-        sumMultipassCaller<int>, sumMultipassCaller<float>
-    };
-
-    static Caller singlepass_callers[] = {
-        sumCaller<unsigned char>, sumCaller<char>,
-        sumCaller<unsigned short>, sumCaller<short>,
-        sumCaller<int>, sumCaller<float>
-    };
-
-    CV_Assert(src.depth() <= CV_32F);
+    typedef void (*func_t)(PtrStepSzb src, void* buf, double* sum);
+    static const func_t funcs[7][5] =
+    {
+        {0, ::sum::run<uchar , 1>, ::sum::run<uchar , 2>, ::sum::run<uchar , 3>, ::sum::run<uchar , 4>},
+        {0, ::sum::run<schar , 1>, ::sum::run<schar , 2>, ::sum::run<schar , 3>, ::sum::run<schar , 4>},
+        {0, ::sum::run<ushort, 1>, ::sum::run<ushort, 2>, ::sum::run<ushort, 3>, ::sum::run<ushort, 4>},
+        {0, ::sum::run<short , 1>, ::sum::run<short , 2>, ::sum::run<short , 3>, ::sum::run<short , 4>},
+        {0, ::sum::run<int   , 1>, ::sum::run<int   , 2>, ::sum::run<int   , 3>, ::sum::run<int   , 4>},
+        {0, ::sum::run<float , 1>, ::sum::run<float , 2>, ::sum::run<float , 3>, ::sum::run<float , 4>},
+        {0, ::sum::run<double, 1>, ::sum::run<double, 2>, ::sum::run<double, 3>, ::sum::run<double, 4>}
+    };
+
+    if (src.depth() == CV_64F)
+    {
+        if (!TargetArchs::builtWith(NATIVE_DOUBLE) || !DeviceInfo().supports(NATIVE_DOUBLE))
+            CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double");
+    }

     Size buf_size;
-    getBufSizeRequired(src.cols, src.rows, src.channels(), buf_size.width, buf_size.height);
+    ::sum::getBufSize(src.cols, src.rows, src.channels(), buf_size.width, buf_size.height);
     ensureSizeIsEnough(buf_size, CV_8U, buf);
+    buf.setTo(Scalar::all(0));

-    Caller* callers = multipass_callers;
-    if (TargetArchs::builtWith(GLOBAL_ATOMICS) && DeviceInfo().supports(GLOBAL_ATOMICS))
-        callers = singlepass_callers;
-
-    Caller caller = callers[src.depth()];
+    const func_t func = funcs[src.depth()][src.channels()];

     double result[4];
-    caller(src, buf, result, src.channels());
+    func(src, buf.data, result);

     return Scalar(result[0], result[1], result[2], result[3]);
 }
 Scalar cv::gpu::absSum(const GpuMat& src)
 {
     GpuMat buf;
     return absSum(src, buf);
 }

 Scalar cv::gpu::absSum(const GpuMat& src, GpuMat& buf)
 {
-    using namespace cv::gpu::device::matrix_reductions::sum;
-
-    typedef void (*Caller)(const PtrStepSzb, PtrStepb, double*, int);
-
-    static Caller multipass_callers[] =
-    {
-        absSumMultipassCaller<unsigned char>, absSumMultipassCaller<char>,
-        absSumMultipassCaller<unsigned short>, absSumMultipassCaller<short>,
-        absSumMultipassCaller<int>, absSumMultipassCaller<float>
-    };
-
-    static Caller singlepass_callers[] =
-    {
-        absSumCaller<unsigned char>, absSumCaller<char>,
-        absSumCaller<unsigned short>, absSumCaller<short>,
-        absSumCaller<int>, absSumCaller<float>
-    };
-
-    CV_Assert(src.depth() <= CV_32F);
+    typedef void (*func_t)(PtrStepSzb src, void* buf, double* sum);
+    static const func_t funcs[7][5] =
+    {
+        {0, ::sum::runAbs<uchar , 1>, ::sum::runAbs<uchar , 2>, ::sum::runAbs<uchar , 3>, ::sum::runAbs<uchar , 4>},
+        {0, ::sum::runAbs<schar , 1>, ::sum::runAbs<schar , 2>, ::sum::runAbs<schar , 3>, ::sum::runAbs<schar , 4>},
+        {0, ::sum::runAbs<ushort, 1>, ::sum::runAbs<ushort, 2>, ::sum::runAbs<ushort, 3>, ::sum::runAbs<ushort, 4>},
+        {0, ::sum::runAbs<short , 1>, ::sum::runAbs<short , 2>, ::sum::runAbs<short , 3>, ::sum::runAbs<short , 4>},
+        {0, ::sum::runAbs<int   , 1>, ::sum::runAbs<int   , 2>, ::sum::runAbs<int   , 3>, ::sum::runAbs<int   , 4>},
+        {0, ::sum::runAbs<float , 1>, ::sum::runAbs<float , 2>, ::sum::runAbs<float , 3>, ::sum::runAbs<float , 4>},
+        {0, ::sum::runAbs<double, 1>, ::sum::runAbs<double, 2>, ::sum::runAbs<double, 3>, ::sum::runAbs<double, 4>}
+    };

     Size buf_size;
-    getBufSizeRequired(src.cols, src.rows, src.channels(), buf_size.width, buf_size.height);
+    ::sum::getBufSize(src.cols, src.rows, src.channels(), buf_size.width, buf_size.height);
     ensureSizeIsEnough(buf_size, CV_8U, buf);
+    buf.setTo(Scalar::all(0));

-    Caller* callers = multipass_callers;
-    if (TargetArchs::builtWith(GLOBAL_ATOMICS) && DeviceInfo().supports(GLOBAL_ATOMICS))
-        callers = singlepass_callers;
-
-    Caller caller = callers[src.depth()];
+    const func_t func = funcs[src.depth()][src.channels()];

     double result[4];
-    caller(src, buf, result, src.channels());
+    func(src, buf.data, result);

     return Scalar(result[0], result[1], result[2], result[3]);
 }
 Scalar cv::gpu::sqrSum(const GpuMat& src)
 {
     GpuMat buf;
     return sqrSum(src, buf);
 }

 Scalar cv::gpu::sqrSum(const GpuMat& src, GpuMat& buf)
 {
-    using namespace cv::gpu::device::matrix_reductions::sum;
-
-    typedef void (*Caller)(const PtrStepSzb, PtrStepb, double*, int);
-
-    static Caller multipass_callers[] =
-    {
-        sqrSumMultipassCaller<unsigned char>, sqrSumMultipassCaller<char>,
-        sqrSumMultipassCaller<unsigned short>, sqrSumMultipassCaller<short>,
-        sqrSumMultipassCaller<int>, sqrSumMultipassCaller<float>
-    };
-
-    static Caller singlepass_callers[7] =
-    {
-        sqrSumCaller<unsigned char>, sqrSumCaller<char>,
-        sqrSumCaller<unsigned short>, sqrSumCaller<short>,
-        sqrSumCaller<int>, sqrSumCaller<float>
-    };
-
-    CV_Assert(src.depth() <= CV_32F);
-
-    Caller* callers = multipass_callers;
-    if (TargetArchs::builtWith(GLOBAL_ATOMICS) && DeviceInfo().supports(GLOBAL_ATOMICS))
-        callers = singlepass_callers;
+    typedef void (*func_t)(PtrStepSzb src, void* buf, double* sum);
+    static const func_t funcs[7][5] =
+    {
+        {0, ::sum::runSqr<uchar , 1>, ::sum::runSqr<uchar , 2>, ::sum::runSqr<uchar , 3>, ::sum::runSqr<uchar , 4>},
+        {0, ::sum::runSqr<schar , 1>, ::sum::runSqr<schar , 2>, ::sum::runSqr<schar , 3>, ::sum::runSqr<schar , 4>},
+        {0, ::sum::runSqr<ushort, 1>, ::sum::runSqr<ushort, 2>, ::sum::runSqr<ushort, 3>, ::sum::runSqr<ushort, 4>},
+        {0, ::sum::runSqr<short , 1>, ::sum::runSqr<short , 2>, ::sum::runSqr<short , 3>, ::sum::runSqr<short , 4>},
+        {0, ::sum::runSqr<int   , 1>, ::sum::runSqr<int   , 2>, ::sum::runSqr<int   , 3>, ::sum::runSqr<int   , 4>},
+        {0, ::sum::runSqr<float , 1>, ::sum::runSqr<float , 2>, ::sum::runSqr<float , 3>, ::sum::runSqr<float , 4>},
+        {0, ::sum::runSqr<double, 1>, ::sum::runSqr<double, 2>, ::sum::runSqr<double, 3>, ::sum::runSqr<double, 4>}
+    };

     Size buf_size;
-    getBufSizeRequired(src.cols, src.rows, src.channels(), buf_size.width, buf_size.height);
+    ::sum::getBufSize(src.cols, src.rows, src.channels(), buf_size.width, buf_size.height);
     ensureSizeIsEnough(buf_size, CV_8U, buf);
+    buf.setTo(Scalar::all(0));

-    Caller caller = callers[src.depth()];
+    const func_t func = funcs[src.depth()][src.channels()];

     double result[4];
-    caller(src, buf, result, src.channels());
+    func(src, buf.data, result);

     return Scalar(result[0], result[1], result[2], result[3]);
 }
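For context, a minimal usage sketch of the reworked buffered reductions follows; it assumes the OpenCV 2.4-era cv::gpu API shown in this diff, and the input file name is purely illustrative:

#include <opencv2/core/core.hpp>
#include <opencv2/highgui/highgui.hpp>
#include <opencv2/gpu/gpu.hpp>

int main()
{
    // Hypothetical 8-bit single-channel input image.
    cv::Mat host = cv::imread("input.png", CV_LOAD_IMAGE_GRAYSCALE);
    cv::gpu::GpuMat src(host); // upload to the device

    // One scratch buffer serves all three reductions; each call resizes it
    // through ensureSizeIsEnough() only when it is too small.
    cv::gpu::GpuMat buf;
    cv::Scalar s    = cv::gpu::sum(src, buf);    // sum of elements
    cv::Scalar sAbs = cv::gpu::absSum(src, buf); // sum of absolute values
    cv::Scalar sSqr = cv::gpu::sqrSum(src, buf); // sum of squares
    (void)s; (void)sAbs; (void)sSqr;
    return 0;
}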
 ////////////////////////////////////////////////////////////////////////
-// Find min or max
+// minMax

-namespace cv { namespace gpu { namespace device
-{
-    namespace matrix_reductions
-    {
-        namespace minmax
-        {
-            void getBufSizeRequired(int cols, int rows, int elem_size, int& bufcols, int& bufrows);
-
-            template <typename T>
-            void minMaxCaller(const PtrStepSzb src, double* minval, double* maxval, PtrStepb buf);
-
-            template <typename T>
-            void minMaxMaskCaller(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, PtrStepb buf);
-
-            template <typename T>
-            void minMaxMultipassCaller(const PtrStepSzb src, double* minval, double* maxval, PtrStepb buf);
-
-            template <typename T>
-            void minMaxMaskMultipassCaller(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, PtrStepb buf);
-        }
-    }
-}}}
+namespace minMax
+{
+    void getBufSize(int cols, int rows, int& bufcols, int& bufrows);
+
+    template <typename T>
+    void run(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, PtrStepb buf);
+}

 void cv::gpu::minMax(const GpuMat& src, double* minVal, double* maxVal, const GpuMat& mask)
 {
@@ -399,45 +340,22 @@ void cv::gpu::minMax(const GpuMat& src, double* minVal, double* maxVal, const Gp
     minMax(src, minVal, maxVal, mask, buf);
 }

 void cv::gpu::minMax(const GpuMat& src, double* minVal, double* maxVal, const GpuMat& mask, GpuMat& buf)
 {
-    using namespace ::cv::gpu::device::matrix_reductions::minmax;
-
-    typedef void (*Caller)(const PtrStepSzb, double*, double*, PtrStepb);
-    typedef void (*MaskedCaller)(const PtrStepSzb, const PtrStepb, double*, double*, PtrStepb);
-
-    static Caller multipass_callers[] =
-    {
-        minMaxMultipassCaller<unsigned char>, minMaxMultipassCaller<char>,
-        minMaxMultipassCaller<unsigned short>, minMaxMultipassCaller<short>,
-        minMaxMultipassCaller<int>, minMaxMultipassCaller<float>, 0
-    };
-
-    static Caller singlepass_callers[] =
-    {
-        minMaxCaller<unsigned char>, minMaxCaller<char>,
-        minMaxCaller<unsigned short>, minMaxCaller<short>,
-        minMaxCaller<int>, minMaxCaller<float>, minMaxCaller<double>
-    };
-
-    static MaskedCaller masked_multipass_callers[] =
-    {
-        minMaxMaskMultipassCaller<unsigned char>, minMaxMaskMultipassCaller<char>,
-        minMaxMaskMultipassCaller<unsigned short>, minMaxMaskMultipassCaller<short>,
-        minMaxMaskMultipassCaller<int>, minMaxMaskMultipassCaller<float>, 0
-    };
-
-    static MaskedCaller masked_singlepass_callers[] =
-    {
-        minMaxMaskCaller<unsigned char>, minMaxMaskCaller<char>,
-        minMaxMaskCaller<unsigned short>, minMaxMaskCaller<short>,
-        minMaxMaskCaller<int>, minMaxMaskCaller<float>, minMaxMaskCaller<double>
-    };
-
-    CV_Assert(src.depth() <= CV_64F);
-    CV_Assert(src.channels() == 1);
-    CV_Assert(mask.empty() || (mask.type() == CV_8U && src.size() == mask.size()));
+    typedef void (*func_t)(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, PtrStepb buf);
+    static const func_t funcs[] =
+    {
+        ::minMax::run<uchar>,
+        ::minMax::run<schar>,
+        ::minMax::run<ushort>,
+        ::minMax::run<short>,
+        ::minMax::run<int>,
+        ::minMax::run<float>,
+        ::minMax::run<double>
+    };
+
+    CV_Assert( src.channels() == 1 );
+    CV_Assert( mask.empty() || (mask.size() == src.size() && mask.type() == CV_8U) );

     if (src.depth() == CV_64F)
     {
@@ -445,66 +363,26 @@ void cv::gpu::minMax(const GpuMat& src, double* minVal, double* maxVal, const Gp
         CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double");
     }

-    double minVal_; if (!minVal) minVal = &minVal_;
-    double maxVal_; if (!maxVal) maxVal = &maxVal_;
-
     Size buf_size;
-    getBufSizeRequired(src.cols, src.rows, static_cast<int>(src.elemSize()), buf_size.width, buf_size.height);
+    ::minMax::getBufSize(src.cols, src.rows, buf_size.width, buf_size.height);
     ensureSizeIsEnough(buf_size, CV_8U, buf);

-    if (mask.empty())
-    {
-        Caller* callers = multipass_callers;
-        if (TargetArchs::builtWith(GLOBAL_ATOMICS) && DeviceInfo().supports(GLOBAL_ATOMICS))
-            callers = singlepass_callers;
-
-        Caller caller = callers[src.type()];
-        CV_Assert(caller != 0);
-        caller(src, minVal, maxVal, buf);
-    }
-    else
-    {
-        MaskedCaller* callers = masked_multipass_callers;
-        if (TargetArchs::builtWith(GLOBAL_ATOMICS) && DeviceInfo().supports(GLOBAL_ATOMICS))
-            callers = masked_singlepass_callers;
-
-        MaskedCaller caller = callers[src.type()];
-        CV_Assert(caller != 0);
-        caller(src, mask, minVal, maxVal, buf);
-    }
+    const func_t func = funcs[src.depth()];
+
+    double temp1, temp2;
+    func(src, mask, minVal ? minVal : &temp1, maxVal ? maxVal : &temp2, buf);
 }
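A short sketch of the new minMax call paths (same hedged 2.4-era API assumptions as above; a single ::minMax::run now handles both the masked and unmasked cases, receiving an empty PtrStepb when no mask is given):

#include <opencv2/core/core.hpp>
#include <opencv2/gpu/gpu.hpp>

void minMaxExample(const cv::gpu::GpuMat& src) // single-channel input
{
    double minVal = 0.0, maxVal = 0.0;

    // Unmasked query; either output pointer may be NULL when unwanted,
    // since the wrapper substitutes a local temporary in that case.
    cv::gpu::minMax(src, &minVal, &maxVal);

    // Masked query with an explicit scratch buffer. Per the asserts above,
    // the mask must be CV_8U and the same size as src.
    cv::gpu::GpuMat mask(src.size(), CV_8U, cv::Scalar::all(1));
    cv::gpu::GpuMat buf;
    cv::gpu::minMax(src, &minVal, NULL, mask, buf);
}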
 ////////////////////////////////////////////////////////////////////////
-// Locate min and max
+// minMaxLoc

-namespace cv { namespace gpu { namespace device
-{
-    namespace matrix_reductions
-    {
-        namespace minmaxloc
-        {
-            void getBufSizeRequired(int cols, int rows, int elem_size, int& b1cols,
-                                    int& b1rows, int& b2cols, int& b2rows);
-
-            template <typename T>
-            void minMaxLocCaller(const PtrStepSzb src, double* minval, double* maxval,
-                                 int minloc[2], int maxloc[2], PtrStepb valBuf, PtrStepb locBuf);
-
-            template <typename T>
-            void minMaxLocMaskCaller(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval,
-                                     int minloc[2], int maxloc[2], PtrStepb valBuf, PtrStepb locBuf);
-
-            template <typename T>
-            void minMaxLocMultipassCaller(const PtrStepSzb src, double* minval, double* maxval,
-                                          int minloc[2], int maxloc[2], PtrStepb valBuf, PtrStepb locBuf);
-
-            template <typename T>
-            void minMaxLocMaskMultipassCaller(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval,
-                                              int minloc[2], int maxloc[2], PtrStepb valBuf, PtrStepb locBuf);
-        }
-    }
-}}}
+namespace minMaxLoc
+{
+    void getBufSize(int cols, int rows, size_t elem_size, int& b1cols, int& b1rows, int& b2cols, int& b2rows);
+
+    template <typename T>
+    void run(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, int* minloc, int* maxloc, PtrStepb valbuf, PtrStep<unsigned int> locbuf);
+}

 void cv::gpu::minMaxLoc(const GpuMat& src, double* minVal, double* maxVal, Point* minLoc, Point* maxLoc, const GpuMat& mask)
 {
@@ -515,42 +393,20 @@ void cv::gpu::minMaxLoc(const GpuMat& src, double* minVal, double* maxVal, Point
 void cv::gpu::minMaxLoc(const GpuMat& src, double* minVal, double* maxVal, Point* minLoc, Point* maxLoc,
                         const GpuMat& mask, GpuMat& valBuf, GpuMat& locBuf)
 {
-    using namespace ::cv::gpu::device::matrix_reductions::minmaxloc;
-
-    typedef void (*Caller)(const PtrStepSzb, double*, double*, int[2], int[2], PtrStepb, PtrStepb);
-    typedef void (*MaskedCaller)(const PtrStepSzb, const PtrStepb, double*, double*, int[2], int[2], PtrStepb, PtrStepb);
-
-    static Caller multipass_callers[] =
-    {
-        minMaxLocMultipassCaller<unsigned char>, minMaxLocMultipassCaller<char>,
-        minMaxLocMultipassCaller<unsigned short>, minMaxLocMultipassCaller<short>,
-        minMaxLocMultipassCaller<int>, minMaxLocMultipassCaller<float>, 0
-    };
-
-    static Caller singlepass_callers[] =
-    {
-        minMaxLocCaller<unsigned char>, minMaxLocCaller<char>,
-        minMaxLocCaller<unsigned short>, minMaxLocCaller<short>,
-        minMaxLocCaller<int>, minMaxLocCaller<float>, minMaxLocCaller<double>
-    };
-
-    static MaskedCaller masked_multipass_callers[] =
-    {
-        minMaxLocMaskMultipassCaller<unsigned char>, minMaxLocMaskMultipassCaller<char>,
-        minMaxLocMaskMultipassCaller<unsigned short>, minMaxLocMaskMultipassCaller<short>,
-        minMaxLocMaskMultipassCaller<int>, minMaxLocMaskMultipassCaller<float>, 0
-    };
-
-    static MaskedCaller masked_singlepass_callers[] =
-    {
-        minMaxLocMaskCaller<unsigned char>, minMaxLocMaskCaller<char>,
-        minMaxLocMaskCaller<unsigned short>, minMaxLocMaskCaller<short>,
-        minMaxLocMaskCaller<int>, minMaxLocMaskCaller<float>, minMaxLocMaskCaller<double>
-    };
-
-    CV_Assert(src.depth() <= CV_64F);
-    CV_Assert(src.channels() == 1);
-    CV_Assert(mask.empty() || (mask.type() == CV_8U && src.size() == mask.size()));
+    typedef void (*func_t)(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, int* minloc, int* maxloc, PtrStepb valbuf, PtrStep<unsigned int> locbuf);
+    static const func_t funcs[] =
+    {
+        ::minMaxLoc::run<uchar>,
+        ::minMaxLoc::run<schar>,
+        ::minMaxLoc::run<ushort>,
+        ::minMaxLoc::run<short>,
+        ::minMaxLoc::run<int>,
+        ::minMaxLoc::run<float>,
+        ::minMaxLoc::run<double>
+    };
+
+    CV_Assert( src.channels() == 1 );
+    CV_Assert( mask.empty() || (mask.size() == src.size() && mask.type() == CV_8U) );

     if (src.depth() == CV_64F)
     {
@@ -558,61 +414,28 @@ void cv::gpu::minMaxLoc(const GpuMat& src, double* minVal, double* maxVal, Point
         CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double");
     }

-    double minVal_; if (!minVal) minVal = &minVal_;
-    double maxVal_; if (!maxVal) maxVal = &maxVal_;
-    int minLoc_[2];
-    int maxLoc_[2];
-
     Size valbuf_size, locbuf_size;
-    getBufSizeRequired(src.cols, src.rows, static_cast<int>(src.elemSize()), valbuf_size.width,
-                       valbuf_size.height, locbuf_size.width, locbuf_size.height);
+    ::minMaxLoc::getBufSize(src.cols, src.rows, src.elemSize(), valbuf_size.width, valbuf_size.height, locbuf_size.width, locbuf_size.height);
     ensureSizeIsEnough(valbuf_size, CV_8U, valBuf);
     ensureSizeIsEnough(locbuf_size, CV_8U, locBuf);

-    if (mask.empty())
-    {
-        Caller* callers = multipass_callers;
-        if (TargetArchs::builtWith(GLOBAL_ATOMICS) && DeviceInfo().supports(GLOBAL_ATOMICS))
-            callers = singlepass_callers;
-
-        Caller caller = callers[src.type()];
-        CV_Assert(caller != 0);
-        caller(src, minVal, maxVal, minLoc_, maxLoc_, valBuf, locBuf);
-    }
-    else
-    {
-        MaskedCaller* callers = masked_multipass_callers;
-        if (TargetArchs::builtWith(GLOBAL_ATOMICS) && DeviceInfo().supports(GLOBAL_ATOMICS))
-            callers = masked_singlepass_callers;
-
-        MaskedCaller caller = callers[src.type()];
-        CV_Assert(caller != 0);
-        caller(src, mask, minVal, maxVal, minLoc_, maxLoc_, valBuf, locBuf);
-    }
-
-    if (minLoc) { minLoc->x = minLoc_[0]; minLoc->y = minLoc_[1]; }
-    if (maxLoc) { maxLoc->x = maxLoc_[0]; maxLoc->y = maxLoc_[1]; }
+    const func_t func = funcs[src.depth()];
+
+    double temp1, temp2;
+    Point temp3, temp4;
+    func(src, mask, minVal ? minVal : &temp1, maxVal ? maxVal : &temp2, minLoc ? &minLoc->x : &temp3.x, maxLoc ? &maxLoc->x : &temp4.x, valBuf, locBuf);
 }
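Likewise for minMaxLoc, a hedged usage sketch. One design point worth noting: the new wrapper passes &minLoc->x straight through to the device code, relying on cv::Point storing x and y as two consecutive ints, which is why ::minMaxLoc::run takes an int* rather than the old int[2]:

#include <opencv2/core/core.hpp>
#include <opencv2/gpu/gpu.hpp>

void minMaxLocExample(const cv::gpu::GpuMat& src) // single-channel input
{
    double minVal = 0.0, maxVal = 0.0;
    cv::Point minLoc, maxLoc;

    // Buffered overload: valBuf/locBuf persist between calls, so repeated
    // queries avoid reallocations. An empty GpuMat means "no mask".
    cv::gpu::GpuMat valBuf, locBuf;
    cv::gpu::minMaxLoc(src, &minVal, &maxVal, &minLoc, &maxLoc,
                       cv::gpu::GpuMat(), valBuf, locBuf);
}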
 //////////////////////////////////////////////////////////////////////////////
-// Count non-zero elements
+// countNonZero

-namespace cv { namespace gpu { namespace device
-{
-    namespace matrix_reductions
-    {
-        namespace countnonzero
-        {
-            void getBufSizeRequired(int cols, int rows, int& bufcols, int& bufrows);
-
-            template <typename T>
-            int countNonZeroCaller(const PtrStepSzb src, PtrStepb buf);
-
-            template <typename T>
-            int countNonZeroMultipassCaller(const PtrStepSzb src, PtrStepb buf);
-        }
-    }
-}}}
+namespace countNonZero
+{
+    void getBufSize(int cols, int rows, int& bufcols, int& bufrows);
+
+    template <typename T>
+    int run(const PtrStepSzb src, PtrStep<unsigned int> buf);
+}

 int cv::gpu::countNonZero(const GpuMat& src)
 {
@@ -620,27 +443,20 @@ int cv::gpu::countNonZero(const GpuMat& src)
     return countNonZero(src, buf);
 }

 int cv::gpu::countNonZero(const GpuMat& src, GpuMat& buf)
 {
-    using namespace ::cv::gpu::device::matrix_reductions::countnonzero;
-
-    typedef int (*Caller)(const PtrStepSzb src, PtrStepb buf);
-
-    static Caller multipass_callers[7] =
-    {
-        countNonZeroMultipassCaller<unsigned char>, countNonZeroMultipassCaller<char>,
-        countNonZeroMultipassCaller<unsigned short>, countNonZeroMultipassCaller<short>,
-        countNonZeroMultipassCaller<int>, countNonZeroMultipassCaller<float>, 0
-    };
-
-    static Caller singlepass_callers[7] =
-    {
-        countNonZeroCaller<unsigned char>, countNonZeroCaller<char>,
-        countNonZeroCaller<unsigned short>, countNonZeroCaller<short>,
-        countNonZeroCaller<int>, countNonZeroCaller<float>, countNonZeroCaller<double>
-    };
-
-    CV_Assert(src.depth() <= CV_64F);
+    typedef int (*func_t)(const PtrStepSzb src, PtrStep<unsigned int> buf);
+    static const func_t funcs[] =
+    {
+        ::countNonZero::run<uchar>,
+        ::countNonZero::run<schar>,
+        ::countNonZero::run<ushort>,
+        ::countNonZero::run<short>,
+        ::countNonZero::run<int>,
+        ::countNonZero::run<float>,
+        ::countNonZero::run<double>
+    };
+
     CV_Assert(src.channels() == 1);

     if (src.depth() == CV_64F)
@@ -650,168 +466,190 @@ int cv::gpu::countNonZero(const GpuMat& src, GpuMat& buf)
     }

     Size buf_size;
-    getBufSizeRequired(src.cols, src.rows, buf_size.width, buf_size.height);
+    ::countNonZero::getBufSize(src.cols, src.rows, buf_size.width, buf_size.height);
     ensureSizeIsEnough(buf_size, CV_8U, buf);

-    Caller* callers = multipass_callers;
-    if (TargetArchs::builtWith(GLOBAL_ATOMICS) && DeviceInfo().supports(GLOBAL_ATOMICS))
-        callers = singlepass_callers;
-
-    Caller caller = callers[src.type()];
-    CV_Assert(caller != 0);
-    return caller(src, buf);
+    const func_t func = funcs[src.depth()];
+
+    return func(src, buf);
 }
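And the countNonZero pair; note the dispatch table is now indexed by src.depth() rather than src.type(), which only coincided for single-channel inputs anyway. A hedged sketch under the same API assumptions:

#include <opencv2/core/core.hpp>
#include <opencv2/gpu/gpu.hpp>

int countExample(const cv::gpu::GpuMat& src) // single-channel input
{
    cv::gpu::GpuMat buf; // scratch buffer, reusable across calls
    return cv::gpu::countNonZero(src, buf);
}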
 //////////////////////////////////////////////////////////////////////////////
 // reduce

-namespace cv { namespace gpu { namespace device
-{
-    namespace matrix_reductions
-    {
-        template <typename T, typename S, typename D> void reduceRows_gpu(const PtrStepSzb& src, const PtrStepSzb& dst, int reduceOp, cudaStream_t stream);
-        template <typename T, typename S, typename D> void reduceCols_gpu(const PtrStepSzb& src, int cn, const PtrStepSzb& dst, int reduceOp, cudaStream_t stream);
-    }
-}}}
+namespace reduce
+{
+    template <typename T, typename S, typename D>
+    void rows(PtrStepSzb src, void* dst, int op, cudaStream_t stream);
+
+    template <typename T, typename S, typename D>
+    void cols(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);
+}
 void cv::gpu::reduce(const GpuMat& src, GpuMat& dst, int dim, int reduceOp, int dtype, Stream& stream)
 {
-    using namespace ::cv::gpu::device::matrix_reductions;
-
-    CV_Assert(src.depth() <= CV_32F && src.channels() <= 4 && dtype <= CV_32F);
-    CV_Assert(dim == 0 || dim == 1);
-    CV_Assert(reduceOp == CV_REDUCE_SUM || reduceOp == CV_REDUCE_AVG || reduceOp == CV_REDUCE_MAX || reduceOp == CV_REDUCE_MIN);
+    CV_Assert( src.channels() <= 4 );
+    CV_Assert( dim == 0 || dim == 1 );
+    CV_Assert( reduceOp == CV_REDUCE_SUM || reduceOp == CV_REDUCE_AVG || reduceOp == CV_REDUCE_MAX || reduceOp == CV_REDUCE_MIN );

     if (dtype < 0)
         dtype = src.depth();

-    dst.create(1, dim == 0 ? src.cols : src.rows, CV_MAKETYPE(dtype, src.channels()));
+    dst.create(1, dim == 0 ? src.cols : src.rows, CV_MAKE_TYPE(CV_MAT_DEPTH(dtype), src.channels()));

     if (dim == 0)
     {
-        typedef void (*caller_t)(const PtrStepSzb& src, const PtrStepSzb& dst, int reduceOp, cudaStream_t stream);
-
-        static const caller_t callers[6][6] =
-        {
-            {
-                reduceRows_gpu<unsigned char, int, unsigned char>,
-                0/*reduceRows_gpu<unsigned char, int, signed char>*/,
-                0/*reduceRows_gpu<unsigned char, int, unsigned short>*/,
-                0/*reduceRows_gpu<unsigned char, int, short>*/,
-                reduceRows_gpu<unsigned char, int, int>,
-                reduceRows_gpu<unsigned char, int, float>
-            },
-            {
-                0/*reduceRows_gpu<signed char, int, unsigned char>*/,
-                0/*reduceRows_gpu<signed char, int, signed char>*/,
-                0/*reduceRows_gpu<signed char, int, unsigned short>*/,
-                0/*reduceRows_gpu<signed char, int, short>*/,
-                0/*reduceRows_gpu<signed char, int, int>*/,
-                0/*reduceRows_gpu<signed char, int, float>*/
-            },
-            {
-                0/*reduceRows_gpu<unsigned short, int, unsigned char>*/,
-                0/*reduceRows_gpu<unsigned short, int, signed char>*/,
-                reduceRows_gpu<unsigned short, int, unsigned short>,
-                0/*reduceRows_gpu<unsigned short, int, short>*/,
-                reduceRows_gpu<unsigned short, int, int>,
-                reduceRows_gpu<unsigned short, int, float>
-            },
-            {
-                0/*reduceRows_gpu<short, int, unsigned char>*/,
-                0/*reduceRows_gpu<short, int, signed char>*/,
-                0/*reduceRows_gpu<short, int, unsigned short>*/,
-                reduceRows_gpu<short, int, short>,
-                reduceRows_gpu<short, int, int>,
-                reduceRows_gpu<short, int, float>
-            },
-            {
-                0/*reduceRows_gpu<int, int, unsigned char>*/,
-                0/*reduceRows_gpu<int, int, signed char>*/,
-                0/*reduceRows_gpu<int, int, unsigned short>*/,
-                0/*reduceRows_gpu<int, int, short>*/,
-                reduceRows_gpu<int, int, int>,
-                reduceRows_gpu<int, int, float>
-            },
-            {
-                0/*reduceRows_gpu<float, float, unsigned char>*/,
-                0/*reduceRows_gpu<float, float, signed char>*/,
-                0/*reduceRows_gpu<float, float, unsigned short>*/,
-                0/*reduceRows_gpu<float, float, short>*/,
-                0/*reduceRows_gpu<float, float, int>*/,
-                reduceRows_gpu<float, float, float>
-            }
-        };
+        typedef void (*func_t)(PtrStepSzb src, void* dst, int op, cudaStream_t stream);
+        static const func_t funcs[7][7] =
+        {
+            {
+                ::reduce::rows<unsigned char, int, unsigned char>,
+                0/*::reduce::rows<unsigned char, int, signed char>*/,
+                0/*::reduce::rows<unsigned char, int, unsigned short>*/,
+                0/*::reduce::rows<unsigned char, int, short>*/,
+                ::reduce::rows<unsigned char, int, int>,
+                ::reduce::rows<unsigned char, float, float>,
+                ::reduce::rows<unsigned char, double, double>
+            },
+            {
+                0/*::reduce::rows<signed char, int, unsigned char>*/,
+                0/*::reduce::rows<signed char, int, signed char>*/,
+                0/*::reduce::rows<signed char, int, unsigned short>*/,
+                0/*::reduce::rows<signed char, int, short>*/,
+                0/*::reduce::rows<signed char, int, int>*/,
+                0/*::reduce::rows<signed char, float, float>*/,
+                0/*::reduce::rows<signed char, double, double>*/
+            },
+            {
+                0/*::reduce::rows<unsigned short, int, unsigned char>*/,
+                0/*::reduce::rows<unsigned short, int, signed char>*/,
+                ::reduce::rows<unsigned short, int, unsigned short>,
+                0/*::reduce::rows<unsigned short, int, short>*/,
+                ::reduce::rows<unsigned short, int, int>,
+                ::reduce::rows<unsigned short, float, float>,
+                ::reduce::rows<unsigned short, double, double>
+            },
+            {
+                0/*::reduce::rows<short, int, unsigned char>*/,
+                0/*::reduce::rows<short, int, signed char>*/,
+                0/*::reduce::rows<short, int, unsigned short>*/,
+                ::reduce::rows<short, int, short>,
+                ::reduce::rows<short, int, int>,
+                ::reduce::rows<short, float, float>,
+                ::reduce::rows<short, double, double>
+            },
+            {
+                0/*::reduce::rows<int, int, unsigned char>*/,
+                0/*::reduce::rows<int, int, signed char>*/,
+                0/*::reduce::rows<int, int, unsigned short>*/,
+                0/*::reduce::rows<int, int, short>*/,
+                ::reduce::rows<int, int, int>,
+                ::reduce::rows<int, float, float>,
+                ::reduce::rows<int, double, double>
+            },
+            {
+                0/*::reduce::rows<float, float, unsigned char>*/,
+                0/*::reduce::rows<float, float, signed char>*/,
+                0/*::reduce::rows<float, float, unsigned short>*/,
+                0/*::reduce::rows<float, float, short>*/,
+                0/*::reduce::rows<float, float, int>*/,
+                ::reduce::rows<float, float, float>,
+                ::reduce::rows<float, double, double>
+            },
+            {
+                0/*::reduce::rows<double, double, unsigned char>*/,
+                0/*::reduce::rows<double, double, signed char>*/,
+                0/*::reduce::rows<double, double, unsigned short>*/,
+                0/*::reduce::rows<double, double, short>*/,
+                0/*::reduce::rows<double, double, int>*/,
+                0/*::reduce::rows<double, double, float>*/,
+                ::reduce::rows<double, double, double>
+            }
+        };

-        const caller_t func = callers[src.depth()][dst.depth()];
+        const func_t func = funcs[src.depth()][dst.depth()];

         if (!func)
             CV_Error(CV_StsUnsupportedFormat, "Unsupported combination of input and output array formats");

-        func(src.reshape(1), dst.reshape(1), reduceOp, StreamAccessor::getStream(stream));
+        func(src.reshape(1), dst.data, reduceOp, StreamAccessor::getStream(stream));
     }
     else
     {
-        typedef void (*caller_t)(const PtrStepSzb& src, int cn, const PtrStepSzb& dst, int reduceOp, cudaStream_t stream);
-
-        static const caller_t callers[6][6] =
-        {
-            {
-                reduceCols_gpu<unsigned char, int, unsigned char>,
-                0/*reduceCols_gpu<unsigned char, int, signed char>*/,
-                0/*reduceCols_gpu<unsigned char, int, unsigned short>*/,
-                0/*reduceCols_gpu<unsigned char, int, short>*/,
-                reduceCols_gpu<unsigned char, int, int>,
-                reduceCols_gpu<unsigned char, int, float>
-            },
-            {
-                0/*reduceCols_gpu<signed char, int, unsigned char>*/,
-                0/*reduceCols_gpu<signed char, int, signed char>*/,
-                0/*reduceCols_gpu<signed char, int, unsigned short>*/,
-                0/*reduceCols_gpu<signed char, int, short>*/,
-                0/*reduceCols_gpu<signed char, int, int>*/,
-                0/*reduceCols_gpu<signed char, int, float>*/
-            },
-            {
-                0/*reduceCols_gpu<unsigned short, int, unsigned char>*/,
-                0/*reduceCols_gpu<unsigned short, int, signed char>*/,
-                reduceCols_gpu<unsigned short, int, unsigned short>,
-                0/*reduceCols_gpu<unsigned short, int, short>*/,
-                reduceCols_gpu<unsigned short, int, int>,
-                reduceCols_gpu<unsigned short, int, float>
-            },
-            {
-                0/*reduceCols_gpu<short, int, unsigned char>*/,
-                0/*reduceCols_gpu<short, int, signed char>*/,
-                0/*reduceCols_gpu<short, int, unsigned short>*/,
-                reduceCols_gpu<short, int, short>,
-                reduceCols_gpu<short, int, int>,
-                reduceCols_gpu<short, int, float>
-            },
-            {
-                0/*reduceCols_gpu<int, int, unsigned char>*/,
-                0/*reduceCols_gpu<int, int, signed char>*/,
-                0/*reduceCols_gpu<int, int, unsigned short>*/,
-                0/*reduceCols_gpu<int, int, short>*/,
-                reduceCols_gpu<int, int, int>,
-                reduceCols_gpu<int, int, float>
-            },
-            {
-                0/*reduceCols_gpu<float, unsigned char>*/,
-                0/*reduceCols_gpu<float, signed char>*/,
-                0/*reduceCols_gpu<float, unsigned short>*/,
-                0/*reduceCols_gpu<float, short>*/,
-                0/*reduceCols_gpu<float, int>*/,
-                reduceCols_gpu<float, float, float>
-            }
-        };
+        typedef void (*func_t)(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);
+        static const func_t funcs[7][7] =
+        {
+            {
+                ::reduce::cols<unsigned char, int, unsigned char>,
+                0/*::reduce::cols<unsigned char, int, signed char>*/,
+                0/*::reduce::cols<unsigned char, int, unsigned short>*/,
+                0/*::reduce::cols<unsigned char, int, short>*/,
+                ::reduce::cols<unsigned char, int, int>,
+                ::reduce::cols<unsigned char, float, float>,
+                ::reduce::cols<unsigned char, double, double>
+            },
+            {
+                0/*::reduce::cols<signed char, int, unsigned char>*/,
+                0/*::reduce::cols<signed char, int, signed char>*/,
+                0/*::reduce::cols<signed char, int, unsigned short>*/,
+                0/*::reduce::cols<signed char, int, short>*/,
+                0/*::reduce::cols<signed char, int, int>*/,
+                0/*::reduce::cols<signed char, float, float>*/,
+                0/*::reduce::cols<signed char, double, double>*/
+            },
+            {
+                0/*::reduce::cols<unsigned short, int, unsigned char>*/,
+                0/*::reduce::cols<unsigned short, int, signed char>*/,
+                ::reduce::cols<unsigned short, int, unsigned short>,
+                0/*::reduce::cols<unsigned short, int, short>*/,
+                ::reduce::cols<unsigned short, int, int>,
+                ::reduce::cols<unsigned short, float, float>,
+                ::reduce::cols<unsigned short, double, double>
+            },
+            {
+                0/*::reduce::cols<short, int, unsigned char>*/,
+                0/*::reduce::cols<short, int, signed char>*/,
+                0/*::reduce::cols<short, int, unsigned short>*/,
+                ::reduce::cols<short, int, short>,
+                ::reduce::cols<short, int, int>,
+                ::reduce::cols<short, float, float>,
+                ::reduce::cols<short, double, double>
+            },
+            {
+                0/*::reduce::cols<int, int, unsigned char>*/,
+                0/*::reduce::cols<int, int, signed char>*/,
+                0/*::reduce::cols<int, int, unsigned short>*/,
+                0/*::reduce::cols<int, int, short>*/,
+                ::reduce::cols<int, int, int>,
+                ::reduce::cols<int, float, float>,
+                ::reduce::cols<int, double, double>
+            },
+            {
+                0/*::reduce::cols<float, float, unsigned char>*/,
+                0/*::reduce::cols<float, float, signed char>*/,
+                0/*::reduce::cols<float, float, unsigned short>*/,
+                0/*::reduce::cols<float, float, short>*/,
+                0/*::reduce::cols<float, float, int>*/,
+                ::reduce::cols<float, float, float>,
+                ::reduce::cols<float, double, double>
+            },
+            {
+                0/*::reduce::cols<double, double, unsigned char>*/,
+                0/*::reduce::cols<double, double, signed char>*/,
+                0/*::reduce::cols<double, double, unsigned short>*/,
+                0/*::reduce::cols<double, double, short>*/,
+                0/*::reduce::cols<double, double, int>*/,
+                0/*::reduce::cols<double, double, float>*/,
+                ::reduce::cols<double, double, double>
+            }
+        };

-        const caller_t func = callers[src.depth()][dst.depth()];
+        const func_t func = funcs[src.depth()][dst.depth()];

         if (!func)
             CV_Error(CV_StsUnsupportedFormat, "Unsupported combination of input and output array formats");

-        func(src, src.channels(), dst, reduceOp, StreamAccessor::getStream(stream));
+        func(src, dst.data, src.channels(), reduceOp, StreamAccessor::getStream(stream));
     }
 }
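Finally, a hedged sketch of the reduce entry point. Both dim values produce a single-row destination (see the dst.create(1, ...) call above), and the funcs tables determine which source/destination depth pairs are implemented:

#include <opencv2/core/core.hpp>
#include <opencv2/gpu/gpu.hpp>

void reduceExample(const cv::gpu::GpuMat& src) // e.g. a CV_8UC3 image
{
    cv::gpu::GpuMat colSums, rowMins;

    // dim == 0: collapse rows, one result per column. CV_8U input with a
    // CV_32S destination maps to ::reduce::rows<unsigned char, int, int>.
    cv::gpu::reduce(src, colSums, 0, CV_REDUCE_SUM, CV_32S);

    // dim == 1: collapse columns, one result per row (also stored as a
    // single row). dtype < 0 keeps the source depth, always valid for
    // MIN/MAX since those table diagonals are populated.
    cv::gpu::reduce(src, rowMins, 1, CV_REDUCE_MIN, -1);
}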
...
@@ -2982,7 +2982,7 @@ TEST_P(Sum, Sqr)
 INSTANTIATE_TEST_CASE_P(GPU_Core, Sum, testing::Combine(
     ALL_DEVICES,
     DIFFERENT_SIZES,
-    TYPES(CV_8U, CV_32F, 1, 4),
+    TYPES(CV_8U, CV_64F, 1, 4),
     WHOLE_SUBMAT));

 ////////////////////////////////////////////////////////////////////////////////
@@ -3351,7 +3351,14 @@ PARAM_TEST_CASE(Reduce, cv::gpu::DeviceInfo, cv::Size, MatDepth, Channels, Reduc
         cv::gpu::setDevice(devInfo.deviceID());

         type = CV_MAKE_TYPE(depth, channels);

-        dst_depth = (reduceOp == CV_REDUCE_MAX || reduceOp == CV_REDUCE_MIN) ? depth : CV_32F;
+        if (reduceOp == CV_REDUCE_MAX || reduceOp == CV_REDUCE_MIN)
+            dst_depth = depth;
+        else if (reduceOp == CV_REDUCE_SUM)
+            dst_depth = depth == CV_8U ? CV_32S : depth < CV_64F ? CV_32F : depth;
+        else
+            dst_depth = depth < CV_32F ? CV_32F : depth;
+
         dst_type = CV_MAKE_TYPE(dst_depth, channels);
     }
@@ -3392,7 +3399,8 @@ INSTANTIATE_TEST_CASE_P(GPU_Core, Reduce, testing::Combine(
     testing::Values(MatDepth(CV_8U),
                     MatDepth(CV_16U),
                     MatDepth(CV_16S),
-                    MatDepth(CV_32F)),
+                    MatDepth(CV_32F),
+                    MatDepth(CV_64F)),
     ALL_CHANNELS,
     ALL_REDUCE_CODES,
     WHOLE_SUBMAT));
...