Change kernel for optimization. Remove restriction to align data

Fix kernel compilation errors on AMD system. Fix license information in cl file. Support CV_64F destination type. Change build options of the kernel. Optimize sum of square. Remove separate kernel for integral square. Increase epsilon for performance tests. Increase epsilon for performance tests. Test double support on AMD devices. Fix some issues. Try to fix problems with AMD device. Try to solve problem with AMD device. Fix error of destination size in kernel. Fix warnings.

Change kernel for optimization. Remove restriction to align data
Fix kernel compilation errors on AMD system. Fix license information in cl file. Support CV_64F destination type. Change build options of the kernel. Optimize sum of square. Remove separate kernel for integral square. Increase epsilon for performance tests. Increase epsilon for performance tests. Test double support on AMD devices. Fix some issues. Try to fix problems with AMD device. Try to solve problem with AMD device. Fix error of destination size in kernel. Fix warnings.
09bcc061 · vbystricky · 01a98fae · 09bcc061 · 01a98fae · 09bcc061
Commit 09bcc061 authored Jun 19, 2014 by vbystricky
4 changed files
--- a/modules/imgproc/perf/opencl/perf_imgproc.cpp
+++ b/modules/imgproc/perf/opencl/perf_imgproc.cpp
@@ -231,7 +231,7 @@ OCL_PERF_TEST_P(IntegralFixture, Integral1, ::testing::Combine(OCL_TEST_SIZES, O

    OCL_TEST_CYCLE() cv::integral(src, dst, ddepth);

-    SANITY_CHECK(dst, 1e-6, ERROR_RELATIVE);
+    SANITY_CHECK(dst, 2e-6, ERROR_RELATIVE);
 }

 OCL_PERF_TEST_P(IntegralFixture, Integral2, ::testing::Combine(OCL_TEST_SIZES, OCL_PERF_ENUM(CV_32S, CV_32F)))
@@ -243,11 +243,11 @@ OCL_PERF_TEST_P(IntegralFixture, Integral2, ::testing::Combine(OCL_TEST_SIZES, O
    checkDeviceMaxMemoryAllocSize(srcSize, ddepth);

    UMat src(srcSize, CV_8UC1), sum(srcSize + Size(1, 1), ddepth), sqsum(srcSize + Size(1, 1), CV_32F);
-    declare.in(src, WARMUP_RNG).out(sum).out(sqsum);
+    declare.in(src, WARMUP_RNG).out(sum, sqsum);

    OCL_TEST_CYCLE() cv::integral(src, sum, sqsum, ddepth, CV_32F);

-    SANITY_CHECK(sum, 1e-6, ERROR_RELATIVE);
+    SANITY_CHECK(sum, 2e-4, ERROR_RELATIVE);
    SANITY_CHECK(sqsum, 5e-5, ERROR_RELATIVE);
 }


--- a/modules/imgproc/src/opencl/integral_sqrsum.cl
+++ b/modules/imgproc/src/opencl/integral_sqrsum.cl
--- a/modules/imgproc/src/opencl/integral_sum.cl
+++ b/modules/imgproc/src/opencl/integral_sum.cl
--- a/modules/imgproc/src/sumpixels.cpp
+++ b/modules/imgproc/src/sumpixels.cpp
@@ -235,97 +235,87 @@ typedef void (*IntegralFunc)(const uchar* src, size_t srcstep, uchar* sum, size_

 #ifdef HAVE_OPENCL

-enum { vlen = 4 };
-
 static bool ocl_integral( InputArray _src, OutputArray _sum, int sdepth )
 {
-    if ( _src.type() != CV_8UC1 || _src.step() % vlen != 0 || _src.offset() % vlen != 0  ||
-         !(sdepth == CV_32S || sdepth == CV_32F) )
-        return false;
+    bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0;

-    ocl::Kernel k1("integral_sum_cols", ocl::imgproc::integral_sum_oclsrc,
-                   format("-D sdepth=%d", sdepth));
-    if (k1.empty())
+    if ( (_src.type() != CV_8UC1) ||
+        !(sdepth == CV_32S || sdepth == CV_32F || (doubleSupport && sdepth == CV_64F)))
        return false;

-    Size size = _src.size(), t_size = Size(((size.height + vlen - 1) / vlen) * vlen, size.width),
-            ssize(size.width + 1, size.height + 1);
-    _sum.create(ssize, sdepth);
-    UMat src = _src.getUMat(), t_sum(t_size, sdepth), sum = _sum.getUMat();
-    t_sum = t_sum(Range::all(), Range(0, size.height));
+    static const int tileSize = 16;
+
+    String build_opt = format("-D sumT=%s -D LOCAL_SUM_SIZE=%d%s",
+                                ocl::typeToStr(sdepth), tileSize,
+                                doubleSupport ? " -D DOUBLE_SUPPORT" : "");
+
+    ocl::Kernel kcols("integral_sum_cols", ocl::imgproc::integral_sum_oclsrc, build_opt);
+    if (kcols.empty())
+        return false;

-    int offset = (int)src.offset / vlen;
-    int vcols = (src.cols + vlen - 1) / vlen;
-    int sum_offset = (int)sum.offset / vlen;
+    UMat src = _src.getUMat();
+    Size src_size = src.size();
+    Size bufsize(((src_size.height + tileSize - 1) / tileSize) * tileSize, ((src_size.width + tileSize - 1) / tileSize) * tileSize);
+    UMat buf(bufsize, sdepth);
+    kcols.args(ocl::KernelArg::ReadOnly(src), ocl::KernelArg::WriteOnlyNoSize(buf));
+    size_t gt = src.cols, lt = tileSize;
+    if (!kcols.run(1, &gt, &lt, false))
+        return false;

-    k1.args(ocl::KernelArg::PtrReadOnly(src), ocl::KernelArg::PtrWriteOnly(t_sum),
-            offset, src.rows, src.cols, (int)src.step, (int)t_sum.step);
-    size_t gt = ((vcols + 1) / 2) * 256, lt = 256;
-    if (!k1.run(1, &gt, &lt, false))
+    ocl::Kernel krows("integral_sum_rows", ocl::imgproc::integral_sum_oclsrc, build_opt);
+    if (krows.empty())
        return false;

-    ocl::Kernel k2("integral_sum_rows", ocl::imgproc::integral_sum_oclsrc,
-                   format("-D sdepth=%d", sdepth));
-    k2.args(ocl::KernelArg::PtrReadOnly(t_sum), ocl::KernelArg::PtrWriteOnly(sum),
-            t_sum.rows, t_sum.cols, (int)t_sum.step, (int)sum.step, sum_offset);
+    Size sumsize(src_size.width + 1, src_size.height + 1);
+    _sum.create(sumsize, sdepth);
+    UMat sum = _sum.getUMat();

-    size_t gt2 = t_sum.cols  * 32, lt2 = 256;
-    return k2.run(1, &gt2, &lt2, false);
+    krows.args(ocl::KernelArg::ReadOnlyNoSize(buf), ocl::KernelArg::WriteOnly(sum));
+    gt = src.rows;
+    return krows.run(1, &gt, &lt, false);
 }

 static bool ocl_integral( InputArray _src, OutputArray _sum, OutputArray _sqsum, int sdepth, int sqdepth )
 {
    bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0;

-    if ( _src.type() != CV_8UC1 || _src.step() % vlen != 0 || _src.offset() % vlen != 0 ||
-         (!doubleSupport && (sdepth == CV_64F || sqdepth == CV_64F)) )
-        return false;
-
-    char cvt[40];
-    String opts = format("-D sdepth=%d -D sqdepth=%d -D TYPE=%s -D TYPE4=%s4 -D convert_TYPE4=%s%s",
-                         sdepth, sqdepth, ocl::typeToStr(sqdepth), ocl::typeToStr(sqdepth),
-                         ocl::convertTypeStr(sdepth, sqdepth, 4, cvt),
-                         doubleSupport ? " -D DOUBLE_SUPPORT" : "");
-
-    ocl::Kernel k1("integral_cols", ocl::imgproc::integral_sqrsum_oclsrc, opts);
-    if (k1.empty())
+    if ( _src.type() != CV_8UC1 || (!doubleSupport && (sdepth == CV_64F || sqdepth == CV_64F)) )
        return false;

-    Size size = _src.size(), dsize = Size(size.width + 1, size.height + 1),
-            t_size = Size(((size.height + vlen - 1) / vlen) * vlen, size.width);
-    UMat src = _src.getUMat(), t_sum(t_size, sdepth), t_sqsum(t_size, sqdepth);
-    t_sum = t_sum(Range::all(), Range(0, size.height));
-    t_sqsum = t_sqsum(Range::all(), Range(0, size.height));
-
-    _sum.create(dsize, sdepth);
-    _sqsum.create(dsize, sqdepth);
-    UMat sum = _sum.getUMat(), sqsum = _sqsum.getUMat();
+    static const int tileSize = 16;

-    int offset = (int)src.offset / vlen;
-    int pre_invalid = src.offset % vlen;
-    int vcols = (pre_invalid + src.cols + vlen - 1) / vlen;
-    int sum_offset = (int)(sum.offset / sum.elemSize());
-    int sqsum_offset = (int)(sqsum.offset / sqsum.elemSize());
+    String build_opt = format("-D SUM_SQUARE -D sumT=%s -D sumSQT=%s -D LOCAL_SUM_SIZE=%d%s",
+                                ocl::typeToStr(sdepth), ocl::typeToStr(sqdepth),
+                                tileSize,
+                                doubleSupport ? " -D DOUBLE_SUPPORT" : "");

-    k1.args(ocl::KernelArg::PtrReadOnly(src), ocl::KernelArg::PtrWriteOnly(t_sum),
-            ocl::KernelArg::PtrWriteOnly(t_sqsum), offset, pre_invalid, src.rows,
-            src.cols, (int)src.step, (int)t_sum.step, (int)t_sqsum.step);
+    ocl::Kernel kcols("integral_sum_cols", ocl::imgproc::integral_sum_oclsrc, build_opt);
+    if (kcols.empty())
+        return false;

-    size_t gt = ((vcols + 1) / 2) * 256, lt = 256;
-    if (!k1.run(1, &gt, &lt, false))
+    UMat src = _src.getUMat();
+    Size src_size = src.size();
+    Size bufsize(((src_size.height + tileSize - 1) / tileSize) * tileSize, ((src_size.width + tileSize - 1) / tileSize) * tileSize);
+    UMat buf(bufsize, sdepth);
+    UMat buf_sq(bufsize, sqdepth);
+    kcols.args(ocl::KernelArg::ReadOnly(src), ocl::KernelArg::WriteOnlyNoSize(buf), ocl::KernelArg::WriteOnlyNoSize(buf_sq));
+    size_t gt = src.cols, lt = tileSize;
+    if (!kcols.run(1, &gt, &lt, false))
        return false;

-    ocl::Kernel k2("integral_rows", ocl::imgproc::integral_sqrsum_oclsrc, opts);
-    if (k2.empty())
+    ocl::Kernel krows("integral_sum_rows", ocl::imgproc::integral_sum_oclsrc, build_opt);
+    if (krows.empty())
        return false;

-    k2.args(ocl::KernelArg::PtrReadOnly(t_sum), ocl::KernelArg::PtrReadOnly(t_sqsum),
-            ocl::KernelArg::PtrWriteOnly(sum), ocl::KernelArg::PtrWriteOnly(sqsum),
-            t_sum.rows, t_sum.cols, (int)t_sum.step, (int)t_sqsum.step,
-            (int)sum.step, (int)sqsum.step, sum_offset, sqsum_offset);
+    Size sumsize(src_size.width + 1, src_size.height + 1);
+    _sum.create(sumsize, sdepth);
+    UMat sum = _sum.getUMat();
+    _sqsum.create(sumsize, sqdepth);
+    UMat sum_sq = _sqsum.getUMat();

-    size_t gt2 = t_sum.cols  * 32, lt2 = 256;
-    return k2.run(1, &gt2, &lt2, false);
+    krows.args(ocl::KernelArg::ReadOnlyNoSize(buf), ocl::KernelArg::ReadOnlyNoSize(buf_sq), ocl::KernelArg::WriteOnly(sum), ocl::KernelArg::WriteOnlyNoSize(sum_sq));
+    gt = src.rows;
+    return krows.run(1, &gt, &lt, false);
 }

 #endif