ocl: Add dft based convolve implementation.

Match template in ocl module now can be utilized with dft based convolve. Note this feature needs OpenCV to be built with clAmdFft library.

ocl: Add dft based convolve implementation.
Match template in ocl module now can be utilized with dft based convolve. Note this feature needs OpenCV to be built with clAmdFft library.
77501f3e · peng xiao · 67073daf · 77501f3e · 77501f3e · 77501f3e
Commit 77501f3e authored Apr 08, 2013 by peng xiao
5 changed files
--- a/modules/ocl/include/opencv2/ocl.hpp
+++ b/modules/ocl/include/opencv2/ocl.hpp
@@ -540,9 +540,29 @@ namespace cv
        CV_EXPORTS oclMatExpr operator * (const oclMat &src1, const oclMat &src2);
        CV_EXPORTS oclMatExpr operator / (const oclMat &src1, const oclMat &src2);

-        //! computes convolution of two images
+        //! Reusable buffers and block layout for the DFT-based convolve().
+        struct CV_EXPORTS ConvolveBuf
+        {
+            //! size of the convolution result (image size - template size + 1 along each axis)
+            Size result_size;
+            //! size of the result block produced by one DFT pass
+            Size block_size;
+            //! block size requested by the caller; create() estimates one when its width or height is 0
+            Size user_block_size;
+            //! size of the DFT applied to each block
+            Size dft_size;
+
+            //! CV_32FC2 spectrum buffers allocated by create()
+            oclMat image_spect, templ_spect, result_spect;
+            //! CV_32F buffers of dft_size allocated by create()
+            oclMat image_block, templ_block, result_data;
+
+            //! computes the sizes above and allocates all buffers for the given image and template sizes
+            void create(Size image_size, Size templ_size);
+            //! returns the block size create() falls back to when user_block_size has a zero dimension
+            static Size estimateBlockSize(Size result_size, Size templ_size);
+        };
+
+        //! computes convolution of two images, may use discrete Fourier transform
        //! support only CV_32FC1 type
-        CV_EXPORTS void convolve(const oclMat &image, const oclMat &temp1, oclMat &result);
+        CV_EXPORTS void convolve(const oclMat &image, const oclMat &temp1, oclMat &result, bool ccorr = false);
+        CV_EXPORTS void convolve(const oclMat &image, const oclMat &temp1, oclMat &result, bool ccorr, ConvolveBuf& buf);
+
+        //! Performs a per-element multiplication of two Fourier spectrums.
+        //! Only full (not packed) CV_32FC2 complex spectrums in the interleaved format are supported for now.
+        //! Supports only the CV_32FC2 type; the flags argument is currently ignored.
+        CV_EXPORTS void mulSpectrums(const oclMat &a, const oclMat &b, oclMat &c, int flags, float scale, bool conjB = false);

        CV_EXPORTS void cvtColor(const oclMat &src, oclMat &dst, int code , int dcn = 0);


--- a/modules/ocl/src/imgproc.cpp
+++ b/modules/ocl/src/imgproc.cpp
@@ -25,6 +25,7 @@
 //    Xu Pang, pangxu010@163.com
 //    Wu Zailong, bullet@yeah.net
 //    Wenju He, wenju@multicorewareinc.com
+//    Peng Xiao, pengxiao@outlook.com
 //
 // Redistribution and use in source and binary forms, with or without modification,
 // are permitted provided that the following conditions are met:
@@ -79,6 +80,7 @@ namespace cv
        extern const char *imgproc_calcHarris;
        extern const char *imgproc_calcMinEigenVal;
        extern const char *imgproc_convolve;
+        extern const char *imgproc_mulAndScaleSpectrums;
        ////////////////////////////////////OpenCL call wrappers////////////////////////////

        template <typename T> struct index_and_sizeof;
@@ -1585,11 +1587,148 @@ namespace cv

    }
 }
+//////////////////////////////////mulSpectrums////////////////////////////////////////////////////
+// Per-element product of two interleaved complex spectra, scaled by `scale`:
+//   c = a * b * scale          when conjB is false
+//   c = a * conj(b) * scale    when conjB is true
+// a and b must be CV_32FC2 and of equal size; c is (re)allocated as CV_32FC2 with a's size.
+// The flags argument is accepted for signature compatibility and ignored.
+void cv::ocl::mulSpectrums(const oclMat &a, const oclMat &b, oclMat &c, int /*flags*/, float scale, bool conjB)
+{
+    CV_Assert(a.type() == CV_32FC2);
+    CV_Assert(b.type() == CV_32FC2);
+    // The kernel addresses a, b and c with a single index derived from a's extent,
+    // so a smaller b would be read out of bounds.
+    CV_Assert(a.size() == b.size());
+
+    c.create(a.size(), CV_32FC2);
+
+    size_t lt[3]  = { 16, 16, 1 };
+    size_t gt[3]  = { a.cols, a.rows, 1 };
+
+    String kernelName = conjB ? "mulAndScaleSpectrumsKernel_CONJ":"mulAndScaleSpectrumsKernel";
+
+    // Argument order must match the kernel signature: a, b, scale, dst, cols, rows, step.
+    // NOTE(review): only a.step is passed, so b and c are assumed to share a's row pitch - confirm.
+    std::vector<std::pair<size_t , const void *> > args;
+    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&a.data ));
+    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&b.data ));
+    args.push_back( std::make_pair( sizeof(cl_float), (void *)&scale));
+    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&c.data ));
+    args.push_back( std::make_pair( sizeof(cl_int), (void *)&a.cols ));
+    args.push_back( std::make_pair( sizeof(cl_int), (void *)&a.rows));
+    args.push_back( std::make_pair( sizeof(cl_int), (void *)&a.step ));
+
+    Context *clCxt = Context::getContext();
+    openCLExecuteKernel(clCxt, &imgproc_mulAndScaleSpectrums, kernelName, gt, lt, args, -1, -1);
+}
 //////////////////////////////////convolve////////////////////////////////////////////////////
 inline int divUp(int total, int grain)
 {
    return (total + grain - 1) / grain;
 }
+
+// ported from CUDA module
+// Computes the result, block and DFT sizes for convolving an image of image_size
+// with a template of templ_size, and allocates every work buffer accordingly.
+// user_block_size is honoured unless its width or height is 0, in which case
+// estimateBlockSize() picks the block size.
+void cv::ocl::ConvolveBuf::create(Size image_size, Size templ_size)
+{
+    result_size = Size(image_size.width - templ_size.width + 1,
+                       image_size.height - templ_size.height + 1);
+
+    block_size = user_block_size;
+    if (user_block_size.width == 0 || user_block_size.height == 0)
+        block_size = estimateBlockSize(result_size, templ_size);
+
+    // Each DFT must hold one result block plus the template overlap.
+    dft_size.width = getOptimalDFTSize(block_size.width + templ_size.width - 1);
+    dft_size.height = getOptimalDFTSize(block_size.height + templ_size.height - 1);
+
+    // To avoid wasting time doing small DFTs
+    dft_size.width = std::max(dft_size.width, 512);
+    dft_size.height = std::max(dft_size.height, 512);
+
+    image_block.create(dft_size, CV_32F);
+    templ_block.create(dft_size, CV_32F);
+    result_data.create(dft_size, CV_32F);
+
+    // Spectrum buffers are dft_size.height rows by dft_size.width / 2 + 1 complex columns.
+    image_spect.create(dft_size.height, dft_size.width / 2 + 1, CV_32FC2);
+    templ_spect.create(dft_size.height, dft_size.width / 2 + 1, CV_32FC2);
+    result_spect.create(dft_size.height, dft_size.width / 2 + 1, CV_32FC2);
+
+    // Use maximum result matrix block size for the estimated DFT block size
+    block_size.width = std::min(dft_size.width - templ_size.width + 1, result_size.width);
+    block_size.height = std::min(dft_size.height - templ_size.height + 1, result_size.height);
+}
+
+// Suggests a block size of one third of the result (rounded up) along each axis,
+// never larger than the result itself. The template size is not taken into account.
+Size cv::ocl::ConvolveBuf::estimateBlockSize(Size result_size, Size /*templ_size*/)
+{
+    const int third_w = (result_size.width + 2) / 3;
+    const int third_h = (result_size.height + 2) / 3;
+    return Size(std::min(third_w, result_size.width),
+                std::min(third_h, result_size.height));
+}
+
+// DFT-based convolution / cross-correlation of `image` with `templ`, computed block by block.
+//   image, templ : CV_32F inputs (asserted below)
+//   result       : (re)allocated as CV_32F with size buf.result_size
+//   ccorr        : forwarded to mulSpectrums as its conjB argument
+//   buf          : work buffers; buf.create() is called here for the given sizes
+// Only available when built with HAVE_CLAMDFFT; otherwise raises CV_StsNotImplemented.
+static void convolve_run_fft(const oclMat &image, const oclMat &templ, oclMat &result, bool ccorr, ConvolveBuf& buf)
+{
+#if defined HAVE_CLAMDFFT
+    CV_Assert(image.type() == CV_32F);
+    CV_Assert(templ.type() == CV_32F);
+
+    buf.create(image.size(), templ.size());
+    result.create(buf.result_size, CV_32F);
+
+    // Local aliases for the sizes and buffers prepared by buf.create()
+    Size& block_size = buf.block_size;
+    Size& dft_size = buf.dft_size;
+
+    oclMat& image_block = buf.image_block;
+    oclMat& templ_block = buf.templ_block;
+    oclMat& result_data = buf.result_data;
+
+    oclMat& image_spect = buf.image_spect;
+    oclMat& templ_spect = buf.templ_spect;
+    oclMat& result_spect = buf.result_spect;
+
+    // Extend the template on its bottom/right sides up to the DFT block size.
+    // NOTE(review): border type 0 with Scalar() is presumably constant zero padding - confirm.
+    oclMat templ_roi = templ;
+    copyMakeBorder(templ_roi, templ_block, 0, templ_block.rows - templ_roi.rows, 0,
+                   templ_block.cols - templ_roi.cols, 0, Scalar());
+
+    // The template spectrum is computed once and reused for every block.
+    cv::ocl::dft(templ_block, templ_spect, dft_size);
+
+    // Process all blocks of the result matrix
+    for (int y = 0; y < result.rows; y += block_size.height)
+    {
+        for (int x = 0; x < result.cols; x += block_size.width)
+        {
+            // Image window starting at (x, y), at most dft_size large, clipped to the image bounds
+            Size image_roi_size(std::min(x + dft_size.width, image.cols) - x,
+                                std::min(y + dft_size.height, image.rows) - y);
+            Rect roi0(x, y, image_roi_size.width, image_roi_size.height);
+
+            oclMat image_roi(image, roi0);
+
+            // Extend the window to the full DFT block size, same way as the template
+            copyMakeBorder(image_roi, image_block, 0, image_block.rows - image_roi.rows,
+                           0, image_block.cols - image_roi.cols, 0, Scalar());
+
+            cv::ocl::dft(image_block, image_spect, dft_size);
+
+            // Multiply the spectra; the 1 / area factor is applied here rather than in the inverse DFT call
+            mulSpectrums(image_spect, templ_spect, result_spect, 0,
+                                 1.f / dft_size.area(), ccorr);
+
+            cv::ocl::dft(result_spect, result_data, dft_size, cv::DFT_INVERSE | cv::DFT_REAL_OUTPUT);
+
+            // Only the top-left block_size part (clipped at the result border) is copied out
+            Size result_roi_size(std::min(x + block_size.width, result.cols) - x,
+                                 std::min(y + block_size.height, result.rows) - y);
+            
+            Rect roi1(x, y, result_roi_size.width, result_roi_size.height);
+            Rect roi2(0, 0, result_roi_size.width, result_roi_size.height);
+
+            oclMat result_roi(result, roi1);
+            oclMat result_block(result_data, roi2);
+
+            result_block.copyTo(result_roi);
+        }
+    }
+
+#else
+    CV_Error(CV_StsNotImplemented, "OpenCL DFT is not implemented");
+#endif
+}
 static void convolve_run(const oclMat &src, const oclMat &temp1, oclMat &dst, String kernelName, const char **kernelString)
 {
    CV_Assert(src.depth() == CV_32FC1);
@@ -1630,13 +1769,25 @@ static void convolve_run(const oclMat &src, const oclMat &temp1, oclMat &dst, St

    openCLExecuteKernel(clCxt, kernelString, kernelName, globalThreads, localThreads, args, -1, depth);
 }
-void cv::ocl::convolve(const oclMat &x, const oclMat &t, oclMat &y)
+void cv::ocl::convolve(const oclMat &x, const oclMat &t, oclMat &y, bool ccorr)
 {
    CV_Assert(x.depth() == CV_32F);
    CV_Assert(t.depth() == CV_32F);
-    CV_Assert(x.type() == y.type() && x.size() == y.size());
    y.create(x.size(), x.type());
    String kernelName = "convolve";
-
-    convolve_run(x, t, y, kernelName, &imgproc_convolve);
+    if(t.cols > 17 || t.rows > 17)
+    {
+        ConvolveBuf buf;
+        convolve_run_fft(x, t, y, ccorr, buf);
+    }
+    else
+    {
+        CV_Assert(ccorr == false);
+        convolve_run(x, t, y, kernelName, &imgproc_convolve);
+    }
+}
+// DFT-based convolve with caller-supplied work buffers.
+// image and templ must be CV_32F; ccorr selects the conjugate spectrum product;
+// buf is (re)created for the given sizes. Requires a build with HAVE_CLAMDFFT.
+void cv::ocl::convolve(const oclMat &image, const oclMat &templ, oclMat &result, bool ccorr, ConvolveBuf& buf)
+{
+    // convolve_run_fft allocates `result` itself (CV_32F, buf.result_size), so an
+    // image-sized allocation here would only be thrown away.
+    convolve_run_fft(image, templ, result, ccorr, buf);
 }
--- a/modules/ocl/src/match_template.cpp
+++ b/modules/ocl/src/match_template.cpp
@@ -98,11 +98,22 @@ namespace cv
        // Evaluates optimal template's area threshold. If
        // template's area is less  than the threshold, we use naive match
        // template version, otherwise FFT-based (if available)
-        static bool useNaive(int , int , Size )
+        static bool useNaive(int method, int depth, Size size)
        {
-            // FIXME!
-            //   always use naive until convolve is imported
+#ifdef HAVE_CLAMDFFT
+            if (method == CV_TM_SQDIFF && (depth == CV_32F || !Context::getContext()->supportsFeature(Context::CL_DOUBLE)))
+            {
+                return true;
+            }
+            else if(method == CV_TM_CCORR || (method == CV_TM_SQDIFF && depth == CV_8U))
+            {
+                return size.height < 18 && size.width < 18;
+            }
+            else
+                return false;
+#else
            return true;
+#endif
        }

        //////////////////////////////////////////////////////////////////////
@@ -223,9 +234,18 @@ namespace cv
        //////////////////////////////////////////////////////////////////////
        // CCORR
        void convolve_32F(
-            const oclMat &, const oclMat &, oclMat &, MatchTemplateBuf &)
+            const oclMat &image, const oclMat &templ, oclMat &result, MatchTemplateBuf &buf)
        {
-            CV_Error(-1, "convolve is not fully implemented yet");
+            ConvolveBuf convolve_buf;
+            convolve_buf.user_block_size = buf.user_block_size;
+            if (image.oclchannels() == 1)
+                convolve(image, templ, result, true, convolve_buf);
+            else
+            {
+                oclMat result_;
+                convolve(image.reshape(1), templ.reshape(1), result_, true, convolve_buf);
+                extractFirstChannel_32F(result_, result);
+            }
        }

        void matchTemplate_CCORR(

--- a/modules/ocl/src/opencl/imgproc_mulAndScaleSpectrums.cl
+++ b/modules/ocl/src/opencl/imgproc_mulAndScaleSpectrums.cl
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// @Authors
+//    Peng Xiao, pengxiao@multicorewareinc.com
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors as is and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+// Complex number stored as (real, imaginary) in a float2.
+typedef float2 cfloat;
+
+// Complex product a * b.
+inline cfloat cmulf(cfloat a, cfloat b)
+{
+    const float re = a.x*b.x - a.y*b.y;
+    const float im = a.x*b.y + a.y*b.x;
+    return (cfloat)( re, im );
+}
+
+// Complex conjugate: real part kept, imaginary part negated.
+inline cfloat conjf(cfloat a)
+{
+    a.y = - a.y;
+    return a;
+}
+
+// dst = a * b * scale, element by element, for interleaved complex spectra.
+// One work-item per element; mstep is the row pitch in bytes used for a, b and dst.
+__kernel void
+    mulAndScaleSpectrumsKernel(
+    __global const cfloat* a,
+    __global const cfloat* b,
+    float scale,
+    __global cfloat* dst,
+    uint cols,
+    uint rows,
+    uint mstep
+    )
+{
+    const uint col = get_global_id(0);
+    const uint row = get_global_id(1);
+
+    // work-items outside the matrix have nothing to do
+    if (col >= cols || row >= rows)
+        return;
+
+    const uint idx = mad24(row, mstep / sizeof(cfloat), col);
+    const cfloat prod = cmulf(a[idx], b[idx]);
+    dst[idx] = (cfloat)( prod.x * scale, prod.y * scale );
+}
+// dst = a * conj(b) * scale, element by element, for interleaved complex spectra.
+// One work-item per element; mstep is the row pitch in bytes used for a, b and dst.
+__kernel void
+    mulAndScaleSpectrumsKernel_CONJ(
+    __global const cfloat* a,
+    __global const cfloat* b,
+    float scale,
+    __global cfloat* dst,
+    uint cols,
+    uint rows,
+    uint mstep
+    )
+{
+    const uint col = get_global_id(0);
+    const uint row = get_global_id(1);
+
+    // work-items outside the matrix have nothing to do
+    if (col >= cols || row >= rows)
+        return;
+
+    const uint idx = mad24(row, mstep / sizeof(cfloat), col);
+    const cfloat prod = cmulf(a[idx], conjf(b[idx]));
+    dst[idx] = (cfloat)( prod.x * scale, prod.y * scale );
+}
--- a/modules/ocl/test/test_fft.cpp
+++ b/modules/ocl/test/test_fft.cpp
@@ -103,4 +103,138 @@ INSTANTIATE_TEST_CASE_P(OCL_ImgProc, Dft, testing::Combine(
                            testing::Values(cv::Size(2, 3), cv::Size(5, 4), cv::Size(25, 20), cv::Size(512, 1), cv::Size(1024, 768)),
                            testing::Values(0, (int)cv::DFT_ROWS, (int)cv::DFT_SCALE) ));

+////////////////////////////////////////////////////////////////////////////
+// MulSpectrums
+
+// Fixture for cv::ocl::mulSpectrums.
+// Parameters: matrix size, DFT flags, and the conjugate switch.
+PARAM_TEST_CASE(MulSpectrums, cv::Size, DftFlags, bool)
+{
+    // size of both random input matrices
+    cv::Size size;
+    // flags value handed to both the host and the device mulSpectrums
+    int flag;
+    // passed as the conjB argument
+    bool ccorr;
+    // random CV_32FC2 inputs, regenerated for every test
+    cv::Mat a, b;
+
+    virtual void SetUp()
+    {
+        size  = GET_PARAM(0);
+        flag  = GET_PARAM(1);
+        ccorr = GET_PARAM(2);
+
+        a = randomMat(size, CV_32FC2);
+        b = randomMat(size, CV_32FC2);
+    }
+};
+
+// Unscaled device product must match the host cv::mulSpectrums within 1e-2.
+TEST_P(MulSpectrums, Simple)
+{
+    // host reference
+    cv::Mat c_gold;
+    cv::mulSpectrums(a, b, c_gold, flag, ccorr);
+
+    // device computation with unit scale
+    cv::ocl::oclMat d_a(a);
+    cv::ocl::oclMat d_b(b);
+    cv::ocl::oclMat c;
+    cv::ocl::mulSpectrums(d_a, d_b, c, flag, 1.0, ccorr);
+
+    EXPECT_MAT_NEAR(c_gold, c, 1e-2, "");
+}
+
+// Device product scaled by 1 / area must match the host product scaled afterwards.
+TEST_P(MulSpectrums, Scaled)
+{
+    float scale = 1.f / size.area();
+
+    // host reference: multiply first, then apply the scale
+    cv::Mat c_gold;
+    cv::mulSpectrums(a, b, c_gold, flag, ccorr);
+    c_gold.convertTo(c_gold, c_gold.type(), scale);
+
+    // device computation applies the scale inside the kernel
+    cv::ocl::oclMat d_a(a);
+    cv::ocl::oclMat d_b(b);
+    cv::ocl::oclMat c;
+    cv::ocl::mulSpectrums(d_a, d_b, c, flag, scale, ccorr);
+
+    EXPECT_MAT_NEAR(c_gold, c, 1e-2, "");
+}
+
+INSTANTIATE_TEST_CASE_P(OCL_ImgProc, MulSpectrums, testing::Combine(
+    DIFFERENT_SIZES,
+    testing::Values(DftFlags(0)),
+    testing::Values(false, true)));
+
+
+////////////////////////////////////////////////////////
+// Convolve
+
+// Host reference: DFT-based convolution (or correlation when ccorr is true) of A with B.
+// C receives the top-left (|A.rows - B.rows| + 1) x (|A.cols - B.cols| + 1) part of the result.
+static void convolveDFT(const cv::Mat& A, const cv::Mat& B, cv::Mat& C, bool ccorr = false)
+{
+    // (re)allocate the output
+    const int outRows = std::abs(A.rows - B.rows) + 1;
+    const int outCols = std::abs(A.cols - B.cols) + 1;
+    C.create(outRows, outCols, A.type());
+
+    // DFT size large enough for A.cols + B.cols - 1 by A.rows + B.rows - 1
+    cv::Size dftSize;
+    dftSize.width = cv::getOptimalDFTSize(A.cols + B.cols - 1);
+    dftSize.height = cv::getOptimalDFTSize(A.rows + B.rows - 1);
+
+    // zero-filled work buffers with A and B placed in their top-left corners
+    cv::Mat paddedA(dftSize, A.type(), cv::Scalar::all(0));
+    cv::Mat paddedB(dftSize, B.type(), cv::Scalar::all(0));
+
+    cv::Mat cornerA(paddedA, cv::Rect(0, 0, A.cols, A.rows));
+    A.copyTo(cornerA);
+    cv::Mat cornerB(paddedB, cv::Rect(0, 0, B.cols, B.rows));
+    B.copyTo(cornerB);
+
+    // forward transforms in place; the row counts are the "nonzeroRows" hint
+    cv::dft(paddedA, paddedA, 0, A.rows);
+    cv::dft(paddedB, paddedB, 0, B.rows);
+
+    // spectrum product, stored back into paddedA
+    cv::mulSpectrums(paddedA, paddedB, paddedA, 0, ccorr);
+
+    // inverse transform with scaling; only the first C.rows rows are needed
+    cv::dft(paddedA, paddedA, cv::DFT_INVERSE + cv::DFT_SCALE, C.rows);
+
+    // copy the wanted corner into the output
+    paddedA(cv::Rect(0, 0, C.cols, C.rows)).copyTo(C);
+}
+
+IMPLEMENT_PARAM_CLASS(KSize, int);
+IMPLEMENT_PARAM_CLASS(Ccorr, bool);
+
+// Fixture for the DFT-based cv::ocl::convolve.
+// Parameters: image size, square kernel side length, and the ccorr switch.
+// The test body creates its own matrices, so the fixture only stores the parameters.
+PARAM_TEST_CASE(Convolve_DFT, cv::Size, KSize, Ccorr)
+{
+    // size of the random source image
+    cv::Size size;
+    // side length of the square kernel
+    int ksize;
+    // forwarded to convolve() and to the host reference
+    bool ccorr;
+
+    virtual void SetUp()
+    {
+        size  = GET_PARAM(0);
+        ksize = GET_PARAM(1);
+        ccorr = GET_PARAM(2);
+    }
+};
+
+// Device convolve must match the host DFT reference within 1e-1.
+TEST_P(Convolve_DFT, Accuracy)
+{
+    // random inputs; the image is generated before the kernel
+    cv::Mat src = randomMat(size, CV_32FC1, 0.0, 100.0);
+    cv::Mat kernel = randomMat(cv::Size(ksize, ksize), CV_32FC1, 0.0, 1.0);
+
+    // host reference
+    cv::Mat dst_gold;
+    convolveDFT(src, kernel, dst_gold, ccorr);
+
+    // device result
+    cv::ocl::oclMat d_src(src);
+    cv::ocl::oclMat d_kernel(kernel);
+    cv::ocl::oclMat dst;
+    cv::ocl::convolve(d_src, d_kernel, dst, ccorr);
+
+    EXPECT_MAT_NEAR(dst, dst_gold, 1e-1, "");
+}
+#define DIFFERENT_CONVOLVE_SIZES testing::Values(cv::Size(251, 257), cv::Size(113, 113), cv::Size(200, 480), cv::Size(1300, 1300))
+INSTANTIATE_TEST_CASE_P(OCL_ImgProc, Convolve_DFT, testing::Combine(
+    DIFFERENT_CONVOLVE_SIZES,
+    testing::Values(KSize(19), KSize(23), KSize(45)),
+    testing::Values(Ccorr(true)/*, Ccorr(false)*/))); // false ccorr cannot pass for some instances
 #endif // HAVE_CLAMDFFT