Commit 03b1d133 authored by Alexander Alekhin's avatar Alexander Alekhin Committed by OpenCV Buildbot

Merge pull request #2660 from arkunze:pullreq/140423-filter2D

parents e9be4865 1f8b41f3
......@@ -4379,7 +4379,7 @@ String kernelToStr(InputArray _kernel, int ddepth, const char * name)
typedef std::string (* func_t)(const Mat &);
static const func_t funcs[] = { kerToStr<uchar>, kerToStr<char>, kerToStr<ushort>, kerToStr<short>,
kerToStr<int>, kerToStr<float>, kerToStr<double>, 0 };
const func_t func = funcs[depth];
const func_t func = funcs[ddepth];
CV_Assert(func != 0);
return cv::format(" -D %s=%s", name ? name : "COEFF", func(kernel).c_str());
......
......@@ -3191,11 +3191,10 @@ static bool ocl_filter2D( InputArray _src, OutputArray _dst, int ddepth,
"BORDER_WRAP", "BORDER_REFLECT_101" };
cv::Mat kernelMat = _kernel.getMat();
std::vector<float> kernelMatDataFloat;
int kernel_size_y2_aligned = _prepareKernelFilter2D<float>(kernelMatDataFloat, kernelMat);
cv::Size sz = _src.size(), wholeSize;
size_t globalsize[2] = { sz.width, sz.height }, localsize[2] = { 0, 1 };
size_t globalsize[2] = { sz.width, sz.height };
size_t localsize_general[2] = {0, 1};
size_t* localsize = NULL;
ocl::Kernel k;
UMat src = _src.getUMat();
......@@ -3210,6 +3209,76 @@ static bool ocl_filter2D( InputArray _src, OutputArray _dst, int ddepth,
size_t tryWorkItems = maxWorkItemSizes[0];
char cvt[2][40];
// For smaller filter kernels, there is a special kernel that is more
// efficient than the general one.
UMat kernalDataUMat;
if (device.isIntel() && (device.type() & ocl::Device::TYPE_GPU) &&
((ksize.width < 5 && ksize.height < 5) ||
(ksize.width == 5 && ksize.height == 5 && cn == 1)))
{
kernelMat.reshape(0, 1);
String kerStr = ocl::kernelToStr(kernelMat, CV_32F);
int h = isolated ? sz.height : wholeSize.height;
int w = isolated ? sz.width : wholeSize.width;
if ((w < ksize.width) || (h < ksize.height))
return false;
// Figure out what vector size to use for loading the pixels.
int pxLoadNumPixels = ((cn != 1) || sz.width % 4) ? 1 : 4;
int pxLoadVecSize = cn * pxLoadNumPixels;
// Figure out how many pixels per work item to compute in X and Y
// directions. Too many and we run out of registers.
int pxPerWorkItemX = 1;
int pxPerWorkItemY = 1;
if (cn <= 2 && ksize.width <= 4 && ksize.height <= 4)
{
pxPerWorkItemX = sz.width % 8 ? sz.width % 4 ? sz.width % 2 ? 1 : 2 : 4 : 8;
pxPerWorkItemY = sz.width % 2 ? 1 : 2;
}
else if (cn < 4 || (ksize.width <= 4 && ksize.height <= 4))
{
pxPerWorkItemX = sz.width % 2 ? 1 : 2;
pxPerWorkItemY = sz.width % 2 ? 1 : 2;
}
globalsize[0] = sz.width / pxPerWorkItemX;
globalsize[1] = sz.height / pxPerWorkItemY;
// Need some padding in the private array for pixels
int privDataWidth = ROUNDUP(pxPerWorkItemX + ksize.width - 1, pxLoadNumPixels);
// Make the global size a nice round number so the runtime can pick
// from reasonable choices for the workgroup size
const int wgRound = 256;
globalsize[0] = ROUNDUP(globalsize[0], wgRound);
char build_options[1024];
sprintf(build_options, "-D cn=%d "
"-D ANCHOR_X=%d -D ANCHOR_Y=%d -D KERNEL_SIZE_X=%d -D KERNEL_SIZE_Y=%d "
"-D PX_LOAD_VEC_SIZE=%d -D PX_LOAD_NUM_PX=%d "
"-D PX_PER_WI_X=%d -D PX_PER_WI_Y=%d -D PRIV_DATA_WIDTH=%d -D %s -D %s "
"-D PX_LOAD_X_ITERATIONS=%d -D PX_LOAD_Y_ITERATIONS=%d "
"-D srcT=%s -D srcT1=%s -D dstT=%s -D dstT1=%s -D WT=%s -D WT1=%s "
"-D convertToWT=%s -D convertToDstT=%s %s",
cn, anchor.x, anchor.y, ksize.width, ksize.height,
pxLoadVecSize, pxLoadNumPixels,
pxPerWorkItemX, pxPerWorkItemY, privDataWidth, borderMap[borderType],
isolated ? "BORDER_ISOLATED" : "NO_BORDER_ISOLATED",
privDataWidth / pxLoadNumPixels, pxPerWorkItemY + ksize.height - 1,
ocl::typeToStr(type), ocl::typeToStr(sdepth), ocl::typeToStr(dtype),
ocl::typeToStr(ddepth), ocl::typeToStr(wtype), ocl::typeToStr(wdepth),
ocl::convertTypeStr(sdepth, wdepth, cn, cvt[0]),
ocl::convertTypeStr(wdepth, ddepth, cn, cvt[1]), kerStr.c_str());
cv::String errmsg;
if (!k.create("filter2DSmall", cv::ocl::imgproc::filter2DSmall_oclsrc, build_options, &errmsg))
return false;
}
else
{
localsize = localsize_general;
std::vector<float> kernelMatDataFloat;
int kernel_size_y2_aligned = _prepareKernelFilter2D<float>(kernelMatDataFloat, kernelMat);
String kerStr = ocl::kernelToStr(kernelMatDataFloat, CV_32F);
for ( ; ; )
......@@ -3217,13 +3286,13 @@ static bool ocl_filter2D( InputArray _src, OutputArray _dst, int ddepth,
size_t BLOCK_SIZE = tryWorkItems;
while (BLOCK_SIZE > 32 && BLOCK_SIZE >= (size_t)ksize.width * 2 && BLOCK_SIZE > (size_t)sz.width * 2)
BLOCK_SIZE /= 2;
#if 1 // TODO Mode with several blocks requires a much more VGPRs, so this optimization is not actual for the current devices
#if 1 // TODO Mode with several blocks requires a much more VGPRs, so this optimization is not actual for the current devices
size_t BLOCK_SIZE_Y = 1;
#else
#else
size_t BLOCK_SIZE_Y = 8; // TODO Check heuristic value on devices
while (BLOCK_SIZE_Y < BLOCK_SIZE / 8 && BLOCK_SIZE_Y * src.clCxt->getDeviceInfo().maxComputeUnits * 32 < (size_t)src.rows)
BLOCK_SIZE_Y *= 2;
#endif
#endif
if ((size_t)ksize.width > BLOCK_SIZE)
return false;
......@@ -3268,6 +3337,7 @@ static bool ocl_filter2D( InputArray _src, OutputArray _dst, int ddepth,
return false;
tryWorkItems = kernelWorkGroupSize;
}
}
_dst.create(sz, dtype);
UMat dst = _dst.getUMat();
......@@ -3688,9 +3758,20 @@ void cv::filter2D( InputArray _src, OutputArray _dst, int ddepth,
temp = dst;
else
temp.create(dst.size(), dst.type());
// crossCorr doesn't accept non-zero delta with multiple channels
if( src.channels() != 1 && delta != 0 )
{
crossCorr( src, kernel, temp, src.size(),
CV_MAKETYPE(ddepth, src.channels()),
anchor, 0, borderType );
add( temp, delta, temp );
}
else
{
crossCorr( src, kernel, temp, src.size(),
CV_MAKETYPE(ddepth, src.channels()),
anchor, delta, borderType );
}
if( temp.data != dst.data )
temp.copyTo(dst);
return;
......
This diff is collapsed.
......@@ -51,7 +51,7 @@ namespace ocl {
/////////////////////////////////////////////////////////////////////////////////////////////////
// Filter2D
PARAM_TEST_CASE(Filter2D, MatDepth, Channels, BorderType, bool, bool)
PARAM_TEST_CASE(Filter2D, MatDepth, Channels, int, int, BorderType, bool, bool)
{
static const int kernelMinSize = 2;
static const int kernelMaxSize = 10;
......@@ -60,6 +60,7 @@ PARAM_TEST_CASE(Filter2D, MatDepth, Channels, BorderType, bool, bool)
Size dsize;
Point anchor;
int borderType;
int widthMultiple;
bool useRoi;
Mat kernel;
double delta;
......@@ -70,27 +71,30 @@ PARAM_TEST_CASE(Filter2D, MatDepth, Channels, BorderType, bool, bool)
virtual void SetUp()
{
type = CV_MAKE_TYPE(GET_PARAM(0), GET_PARAM(1));
borderType = GET_PARAM(2) | (GET_PARAM(3) ? BORDER_ISOLATED : 0);
useRoi = GET_PARAM(4);
Size ksize(GET_PARAM(2), GET_PARAM(2));
widthMultiple = GET_PARAM(3);
borderType = GET_PARAM(4) | (GET_PARAM(5) ? BORDER_ISOLATED : 0);
useRoi = GET_PARAM(6);
Mat temp = randomMat(ksize, CV_MAKE_TYPE(((CV_64F == CV_MAT_DEPTH(type)) ? CV_64F : CV_32F), 1), -MAX_VALUE, MAX_VALUE);
cv::normalize(temp, kernel, 1.0, 0.0, NORM_L1);
}
void random_roi()
{
dsize = randomSize(1, MAX_VALUE);
// Make sure the width is a multiple of the requested value, and no more.
dsize.width &= ~((widthMultiple * 2) - 1);
dsize.width += widthMultiple;
Size ksize = randomSize(kernelMinSize, kernelMaxSize);
Mat temp = randomMat(ksize, CV_MAKE_TYPE(((CV_64F == CV_MAT_DEPTH(type)) ? CV_64F : CV_32F), 1), -MAX_VALUE, MAX_VALUE);
cv::normalize(temp, kernel, 1.0, 0.0, NORM_L1);
Size roiSize = randomSize(ksize.width, MAX_VALUE, ksize.height, MAX_VALUE);
Size roiSize = randomSize(kernel.size[0], MAX_VALUE, kernel.size[1], MAX_VALUE);
Border srcBorder = randomBorder(0, useRoi ? MAX_VALUE : 0);
randomSubMat(src, src_roi, roiSize, srcBorder, type, -MAX_VALUE, MAX_VALUE);
Border dstBorder = randomBorder(0, useRoi ? MAX_VALUE : 0);
randomSubMat(dst, dst_roi, dsize, dstBorder, type, -MAX_VALUE, MAX_VALUE);
anchor.x = randomInt(-1, ksize.width);
anchor.y = randomInt(-1, ksize.height);
anchor.x = randomInt(-1, kernel.size[0]);
anchor.y = randomInt(-1, kernel.size[1]);
delta = randomDouble(-100, 100);
......@@ -122,6 +126,8 @@ OCL_INSTANTIATE_TEST_CASE_P(ImageProc, Filter2D,
Combine(
Values(CV_8U, CV_16U, CV_32F),
OCL_ALL_CHANNELS,
Values(3, 5, 9), // Kernel size
Values(1, 4, 8), // Width mutiple
Values((BorderType)BORDER_CONSTANT,
(BorderType)BORDER_REPLICATE,
(BorderType)BORDER_REFLECT,
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment