Merge pull request #2660 from arkunze:pullreq/140423-filter2D

03b1d133 · Alexander Alekhin · OpenCV Buildbot · e9be4865 · 1f8b41f3 · 03b1d133
Commit 03b1d133 authored May 06, 2014 by Alexander Alekhin Committed by OpenCV Buildbot May 06, 2014
4 changed files
--- a/modules/core/src/ocl.cpp
+++ b/modules/core/src/ocl.cpp
@@ -4379,7 +4379,7 @@ String kernelToStr(InputArray _kernel, int ddepth, const char * name)
    typedef std::string (* func_t)(const Mat &);
    static const func_t funcs[] = { kerToStr<uchar>, kerToStr<char>, kerToStr<ushort>, kerToStr<short>,
                                    kerToStr<int>, kerToStr<float>, kerToStr<double>, 0 };
-    const func_t func = funcs[depth];
+    const func_t func = funcs[ddepth];
    CV_Assert(func != 0);

    return cv::format(" -D %s=%s", name ? name : "COEFF", func(kernel).c_str());

--- a/modules/imgproc/src/filter.cpp
+++ b/modules/imgproc/src/filter.cpp
@@ -3191,11 +3191,10 @@ static bool ocl_filter2D( InputArray _src, OutputArray _dst, int ddepth,
                                       "BORDER_WRAP", "BORDER_REFLECT_101" };

    cv::Mat kernelMat = _kernel.getMat();
-    std::vector<float> kernelMatDataFloat;
-    int kernel_size_y2_aligned = _prepareKernelFilter2D<float>(kernelMatDataFloat, kernelMat);
-
    cv::Size sz = _src.size(), wholeSize;
-    size_t globalsize[2] = { sz.width, sz.height }, localsize[2] = { 0, 1 };
+    size_t globalsize[2] = { sz.width, sz.height };
+    size_t localsize_general[2] = {0, 1};
+    size_t* localsize = NULL;

    ocl::Kernel k;
    UMat src = _src.getUMat();
@@ -3210,6 +3209,76 @@ static bool ocl_filter2D( InputArray _src, OutputArray _dst, int ddepth,
    size_t tryWorkItems = maxWorkItemSizes[0];
    char cvt[2][40];

+    // For smaller filter kernels, there is a special kernel that is more
+    // efficient than the general one.
+    UMat kernalDataUMat;
+    if (device.isIntel() && (device.type() & ocl::Device::TYPE_GPU) &&
+        ((ksize.width < 5 && ksize.height < 5) ||
+        (ksize.width == 5 && ksize.height == 5 && cn == 1)))
+    {
+        kernelMat.reshape(0, 1);
+        String kerStr = ocl::kernelToStr(kernelMat, CV_32F);
+        int h = isolated ? sz.height : wholeSize.height;
+        int w = isolated ? sz.width : wholeSize.width;
+
+        if ((w < ksize.width) || (h < ksize.height))
+            return false;
+
+        // Figure out what vector size to use for loading the pixels.
+        int pxLoadNumPixels = ((cn != 1) || sz.width % 4) ? 1 : 4;
+        int pxLoadVecSize = cn * pxLoadNumPixels;
+
+        // Figure out how many pixels per work item to compute in X and Y
+        // directions.  Too many and we run out of registers.
+        int pxPerWorkItemX = 1;
+        int pxPerWorkItemY = 1;
+        if (cn <= 2 && ksize.width <= 4 && ksize.height <= 4)
+        {
+            pxPerWorkItemX = sz.width % 8 ? sz.width % 4 ? sz.width % 2 ? 1 : 2 : 4 : 8;
+            pxPerWorkItemY = sz.width % 2 ? 1 : 2;
+        }
+        else if (cn < 4 || (ksize.width <= 4 && ksize.height <= 4))
+        {
+            pxPerWorkItemX = sz.width % 2 ? 1 : 2;
+            pxPerWorkItemY = sz.width % 2 ? 1 : 2;
+        }
+        globalsize[0] = sz.width / pxPerWorkItemX;
+        globalsize[1] = sz.height / pxPerWorkItemY;
+
+        // Need some padding in the private array for pixels
+        int privDataWidth = ROUNDUP(pxPerWorkItemX + ksize.width - 1, pxLoadNumPixels);
+
+        // Make the global size a nice round number so the runtime can pick
+        // from reasonable choices for the workgroup size
+        const int wgRound = 256;
+        globalsize[0] = ROUNDUP(globalsize[0], wgRound);
+
+        char build_options[1024];
+        sprintf(build_options, "-D cn=%d "
+                "-D ANCHOR_X=%d -D ANCHOR_Y=%d -D KERNEL_SIZE_X=%d -D KERNEL_SIZE_Y=%d "
+                "-D PX_LOAD_VEC_SIZE=%d -D PX_LOAD_NUM_PX=%d "
+                "-D PX_PER_WI_X=%d -D PX_PER_WI_Y=%d -D PRIV_DATA_WIDTH=%d -D %s -D %s "
+                "-D PX_LOAD_X_ITERATIONS=%d -D PX_LOAD_Y_ITERATIONS=%d "
+                "-D srcT=%s -D srcT1=%s -D dstT=%s -D dstT1=%s -D WT=%s -D WT1=%s "
+                "-D convertToWT=%s -D convertToDstT=%s %s",
+                cn, anchor.x, anchor.y, ksize.width, ksize.height,
+                pxLoadVecSize, pxLoadNumPixels,
+                pxPerWorkItemX, pxPerWorkItemY, privDataWidth, borderMap[borderType],
+                isolated ? "BORDER_ISOLATED" : "NO_BORDER_ISOLATED",
+                privDataWidth / pxLoadNumPixels, pxPerWorkItemY + ksize.height - 1,
+                ocl::typeToStr(type), ocl::typeToStr(sdepth), ocl::typeToStr(dtype),
+                ocl::typeToStr(ddepth), ocl::typeToStr(wtype), ocl::typeToStr(wdepth),
+                ocl::convertTypeStr(sdepth, wdepth, cn, cvt[0]),
+                ocl::convertTypeStr(wdepth, ddepth, cn, cvt[1]), kerStr.c_str());
+        cv::String errmsg;
+        if (!k.create("filter2DSmall", cv::ocl::imgproc::filter2DSmall_oclsrc, build_options, &errmsg))
+            return false;
+    }
+    else
+    {
+        localsize = localsize_general;
+        std::vector<float> kernelMatDataFloat;
+        int kernel_size_y2_aligned = _prepareKernelFilter2D<float>(kernelMatDataFloat, kernelMat);
        String kerStr = ocl::kernelToStr(kernelMatDataFloat, CV_32F);

        for ( ; ; )
@@ -3217,13 +3286,13 @@ static bool ocl_filter2D( InputArray _src, OutputArray _dst, int ddepth,
            size_t BLOCK_SIZE = tryWorkItems;
            while (BLOCK_SIZE > 32 && BLOCK_SIZE >= (size_t)ksize.width * 2 && BLOCK_SIZE > (size_t)sz.width * 2)
                BLOCK_SIZE /= 2;
-#if 1 // TODO Mode with several blocks requires a much more VGPRs, so this optimization is not actual for the current devices
+    #if 1 // TODO: The mode with several blocks requires many more VGPRs, so this optimization is not useful on current devices
            size_t BLOCK_SIZE_Y = 1;
-#else
+    #else
            size_t BLOCK_SIZE_Y = 8; // TODO Check heuristic value on devices
            while (BLOCK_SIZE_Y < BLOCK_SIZE / 8 && BLOCK_SIZE_Y * src.clCxt->getDeviceInfo().maxComputeUnits * 32 < (size_t)src.rows)
                BLOCK_SIZE_Y *= 2;
-#endif
+    #endif

            if ((size_t)ksize.width > BLOCK_SIZE)
                return false;
@@ -3268,6 +3337,7 @@ static bool ocl_filter2D( InputArray _src, OutputArray _dst, int ddepth,
                return false;
            tryWorkItems = kernelWorkGroupSize;
        }
+    }

    _dst.create(sz, dtype);
    UMat dst = _dst.getUMat();
@@ -3688,9 +3758,20 @@ void cv::filter2D( InputArray _src, OutputArray _dst, int ddepth,
            temp = dst;
        else
            temp.create(dst.size(), dst.type());
+        // crossCorr doesn't accept non-zero delta with multiple channels
+        if( src.channels() != 1 && delta != 0 )
+        {
+            crossCorr( src, kernel, temp, src.size(),
+                       CV_MAKETYPE(ddepth, src.channels()),
+                       anchor, 0, borderType );
+            add( temp, delta, temp );
+        }
+        else
+        {
            crossCorr( src, kernel, temp, src.size(),
                       CV_MAKETYPE(ddepth, src.channels()),
                       anchor, delta, borderType );
+        }
        if( temp.data != dst.data )
            temp.copyTo(dst);
        return;

--- a/modules/imgproc/src/opencl/filter2DSmall.cl
+++ b/modules/imgproc/src/opencl/filter2DSmall.cl
--- a/modules/imgproc/test/ocl/test_filter2d.cpp
+++ b/modules/imgproc/test/ocl/test_filter2d.cpp
@@ -51,7 +51,7 @@ namespace ocl {

 /////////////////////////////////////////////////////////////////////////////////////////////////
 // Filter2D
-PARAM_TEST_CASE(Filter2D, MatDepth, Channels, BorderType, bool, bool)
+PARAM_TEST_CASE(Filter2D, MatDepth, Channels, int, int, BorderType, bool, bool)
 {
    static const int kernelMinSize = 2;
    static const int kernelMaxSize = 10;
@@ -60,6 +60,7 @@ PARAM_TEST_CASE(Filter2D, MatDepth, Channels, BorderType, bool, bool)
    Size dsize;
    Point anchor;
    int borderType;
+    int widthMultiple;
    bool useRoi;
    Mat kernel;
    double delta;
@@ -70,27 +71,30 @@ PARAM_TEST_CASE(Filter2D, MatDepth, Channels, BorderType, bool, bool)
    virtual void SetUp()
    {
        type = CV_MAKE_TYPE(GET_PARAM(0), GET_PARAM(1));
-        borderType = GET_PARAM(2) | (GET_PARAM(3) ? BORDER_ISOLATED : 0);
-        useRoi = GET_PARAM(4);
+        Size ksize(GET_PARAM(2), GET_PARAM(2));
+        widthMultiple = GET_PARAM(3);
+        borderType = GET_PARAM(4) | (GET_PARAM(5) ? BORDER_ISOLATED : 0);
+        useRoi = GET_PARAM(6);
+        Mat temp = randomMat(ksize, CV_MAKE_TYPE(((CV_64F == CV_MAT_DEPTH(type)) ? CV_64F : CV_32F), 1), -MAX_VALUE, MAX_VALUE);
+        cv::normalize(temp, kernel, 1.0, 0.0, NORM_L1);
    }

    void random_roi()
    {
        dsize = randomSize(1, MAX_VALUE);
+        // Make sure the width is a multiple of the requested value, but not of twice that value.
+        dsize.width &= ~((widthMultiple * 2) - 1);
+        dsize.width += widthMultiple;

-        Size ksize = randomSize(kernelMinSize, kernelMaxSize);
-        Mat temp = randomMat(ksize, CV_MAKE_TYPE(((CV_64F == CV_MAT_DEPTH(type)) ? CV_64F : CV_32F), 1), -MAX_VALUE, MAX_VALUE);
-        cv::normalize(temp, kernel, 1.0, 0.0, NORM_L1);
-
-        Size roiSize = randomSize(ksize.width, MAX_VALUE, ksize.height, MAX_VALUE);
+        Size roiSize = randomSize(kernel.size[0], MAX_VALUE, kernel.size[1], MAX_VALUE);
        Border srcBorder = randomBorder(0, useRoi ? MAX_VALUE : 0);
        randomSubMat(src, src_roi, roiSize, srcBorder, type, -MAX_VALUE, MAX_VALUE);

        Border dstBorder = randomBorder(0, useRoi ? MAX_VALUE : 0);
        randomSubMat(dst, dst_roi, dsize, dstBorder, type, -MAX_VALUE, MAX_VALUE);

-        anchor.x = randomInt(-1, ksize.width);
-        anchor.y = randomInt(-1, ksize.height);
+        anchor.x = randomInt(-1, kernel.size[0]);
+        anchor.y = randomInt(-1, kernel.size[1]);

        delta = randomDouble(-100, 100);

@@ -122,6 +126,8 @@ OCL_INSTANTIATE_TEST_CASE_P(ImageProc, Filter2D,
                            Combine(
                                Values(CV_8U, CV_16U, CV_32F),
                                OCL_ALL_CHANNELS,
+                                Values(3, 5, 9),  // Kernel size
+                                Values(1, 4, 8),   // Width multiple
                                Values((BorderType)BORDER_CONSTANT,
                                       (BorderType)BORDER_REPLICATE,
                                       (BorderType)BORDER_REFLECT,