Merge pull request #1633 from ilya-lavrenov:ocl_imgproc

bd1a1cc0 · Andrey Pavlenko · OpenCV Buildbot · 599d51ec · 4f68f35a · bd1a1cc0
Commit bd1a1cc0, authored Oct 21, 2013 by Andrey Pavlenko; committed by OpenCV Buildbot on Oct 21, 2013
11 changed files
--- a/modules/ocl/src/columnsum.cpp
+++ b/modules/ocl/src/columnsum.cpp
@@ -52,25 +52,24 @@ using namespace cv::ocl;
 void cv::ocl::columnSum(const oclMat &src, oclMat &dst)
 {
    CV_Assert(src.type() == CV_32FC1);
    dst.create(src.size(), src.type());
-    Context *clCxt = src.clCxt;
+    int src_step = src.step / src.elemSize(), src_offset = src.offset / src.elemSize();
+    int dst_step = dst.step / dst.elemSize(), dst_offset = dst.offset / dst.elemSize();
-    const std::string kernelName = "columnSum";
    std::vector< pair<size_t, const void *> > args;
    args.push_back( make_pair( sizeof(cl_mem), (void *)&src.data));
    args.push_back( make_pair( sizeof(cl_mem), (void *)&dst.data));
    args.push_back( make_pair( sizeof(cl_int), (void *)&src.cols));
    args.push_back( make_pair( sizeof(cl_int), (void *)&src.rows));
-    args.push_back( make_pair( sizeof(cl_int), (void *)&src.step));
+    args.push_back( make_pair( sizeof(cl_int), (void *)&src_step));
-    args.push_back( make_pair( sizeof(cl_int), (void *)&dst.step));
+    args.push_back( make_pair( sizeof(cl_int), (void *)&dst_step));
+    args.push_back( make_pair( sizeof(cl_int), (void *)&src_offset));
+    args.push_back( make_pair( sizeof(cl_int), (void *)&dst_offset));
    size_t globalThreads[3] = {dst.cols, 1, 1};
    size_t localThreads[3]  = {256, 1, 1};
-    openCLExecuteKernel(clCxt, &imgproc_columnsum, kernelName, globalThreads, localThreads, args, src.channels(), src.depth());
+    openCLExecuteKernel(src.clCxt, &imgproc_columnsum, "columnSum", globalThreads, localThreads, args, src.oclchannels(), src.depth());
 }
--- a/modules/ocl/src/imgproc.cpp
+++ b/modules/ocl/src/imgproc.cpp
--- a/modules/ocl/src/opencl/imgproc_clahe.cl
+++ b/modules/ocl/src/opencl/imgproc_clahe.cl
@@ -53,12 +53,8 @@ int calc_lut(__local int* smem, int val, int tid)
    barrier(CLK_LOCAL_MEM_FENCE);
    if (tid == 0)
-    {
        for (int i = 1; i < 256; ++i)
-        {
            smem[i] += smem[i - 1];
-        }
-    }
    barrier(CLK_LOCAL_MEM_FENCE);
    return smem[tid];
@@ -71,69 +67,51 @@ void reduce(volatile __local int* smem, int val, int tid)
    barrier(CLK_LOCAL_MEM_FENCE);
    if (tid < 128)
-    {
        smem[tid] = val += smem[tid + 128];
-    }
    barrier(CLK_LOCAL_MEM_FENCE);
    if (tid < 64)
-    {
        smem[tid] = val += smem[tid + 64];
-    }
    barrier(CLK_LOCAL_MEM_FENCE);
    if (tid < 32)
-    {
        smem[tid] += smem[tid + 32];
-    }
    barrier(CLK_LOCAL_MEM_FENCE);
    if (tid < 16)
-    {
        smem[tid] += smem[tid + 16];
-    }
    barrier(CLK_LOCAL_MEM_FENCE);
    if (tid < 8)
-    {
        smem[tid] += smem[tid + 8];
-    }
    barrier(CLK_LOCAL_MEM_FENCE);
    if (tid < 4)
-    {
        smem[tid] += smem[tid + 4];
-    }
    barrier(CLK_LOCAL_MEM_FENCE);
    if (tid < 2)
-    {
        smem[tid] += smem[tid + 2];
-    }
    barrier(CLK_LOCAL_MEM_FENCE);
    if (tid < 1)
-    {
        smem[256] = smem[tid] + smem[tid + 1];
-    }
    barrier(CLK_LOCAL_MEM_FENCE);
 }
 #else
 void reduce(__local volatile int* smem, int val, int tid)
 {
    smem[tid] = val;
    barrier(CLK_LOCAL_MEM_FENCE);
    if (tid < 128)
-    {
        smem[tid] = val += smem[tid + 128];
-    }
    barrier(CLK_LOCAL_MEM_FENCE);
    if (tid < 64)
-    {
        smem[tid] = val += smem[tid + 64];
-    }
    barrier(CLK_LOCAL_MEM_FENCE);
    if (tid < 32)
@@ -141,12 +119,17 @@ void reduce(__local volatile int* smem, int val, int tid)
        smem[tid] += smem[tid + 32];
 #if WAVE_SIZE < 32
    } barrier(CLK_LOCAL_MEM_FENCE);
-    if (tid < 16) {
+    if (tid < 16)
+    {
 #endif
        smem[tid] += smem[tid + 16];
 #if WAVE_SIZE < 16
-    } barrier(CLK_LOCAL_MEM_FENCE);
+    }
-    if (tid < 8) {
+    barrier(CLK_LOCAL_MEM_FENCE);
+    if (tid < 8)
+    {
 #endif
        smem[tid] += smem[tid + 8];
        smem[tid] += smem[tid + 4];
@@ -159,7 +142,8 @@ void reduce(__local volatile int* smem, int val, int tid)
 __kernel void calcLut(__global __const uchar * src, __global uchar * lut,
                      const int srcStep, const int dstStep,
                      const int2 tileSize, const int tilesX,
-                      const int clipLimit, const float lutScale)
+                      const int clipLimit, const float lutScale,
+                      const int src_offset, const int dst_offset)
 {
    __local int smem[512];
@@ -173,25 +157,21 @@ __kernel void calcLut(__global __const uchar * src, __global uchar * lut,
    for (int i = get_local_id(1); i < tileSize.y; i += get_local_size(1))
    {
-        __global const uchar* srcPtr = src + mad24( ty * tileSize.y + i,
+        __global const uchar* srcPtr = src + mad24(ty * tileSize.y + i, srcStep, tx * tileSize.x + src_offset);
-                                                    srcStep, tx * tileSize.x );
        for (int j = get_local_id(0); j < tileSize.x; j += get_local_size(0))
        {
            const int data = srcPtr[j];
            atomic_inc(&smem[data]);
        }
    }
    barrier(CLK_LOCAL_MEM_FENCE);
    int tHistVal = smem[tid];
    barrier(CLK_LOCAL_MEM_FENCE);
    if (clipLimit > 0)
    {
        // clip histogram bar
        int clipped = 0;
        if (tHistVal > clipLimit)
        {
@@ -200,7 +180,6 @@ __kernel void calcLut(__global __const uchar * src, __global uchar * lut,
        }
        // find number of overall clipped samples
        reduce(smem, clipped, tid);
        barrier(CLK_LOCAL_MEM_FENCE);
 #ifdef CPU
@@ -229,7 +208,7 @@ __kernel void calcLut(__global __const uchar * src, __global uchar * lut,
    const int lutVal = calc_lut(smem, tHistVal, tid);
    uint ires = (uint)convert_int_rte(lutScale * lutVal);
-    lut[(ty * tilesX + tx) * dstStep + tid] =
+    lut[(ty * tilesX + tx) * dstStep + tid + dst_offset] =
        convert_uchar(clamp(ires, (uint)0, (uint)255));
 }
@@ -239,7 +218,8 @@ __kernel void transform(__global __const uchar * src,
                        const int srcStep, const int dstStep, const int lutStep,
                        const int cols, const int rows,
                        const int2 tileSize,
-                        const int tilesX, const int tilesY)
+                        const int tilesX, const int tilesY,
+                        const int src_offset, const int dst_offset, int lut_offset)
 {
    const int x = get_global_id(0);
    const int y = get_global_id(1);
@@ -261,15 +241,15 @@ __kernel void transform(__global __const uchar * src,
    tx1 = max(tx1, 0);
    tx2 = min(tx2, tilesX - 1);
-    const int srcVal = src[mad24(y, srcStep, x)];
+    const int srcVal = src[mad24(y, srcStep, x + src_offset)];
    float res = 0;
-    res += lut[mad24(ty1 * tilesX + tx1, lutStep, srcVal)] * ((1.0f - xa) * (1.0f - ya));
+    res += lut[mad24(ty1 * tilesX + tx1, lutStep, srcVal + lut_offset)] * ((1.0f - xa) * (1.0f - ya));
-    res += lut[mad24(ty1 * tilesX + tx2, lutStep, srcVal)] * ((xa) * (1.0f - ya));
+    res += lut[mad24(ty1 * tilesX + tx2, lutStep, srcVal + lut_offset)] * ((xa) * (1.0f - ya));
-    res += lut[mad24(ty2 * tilesX + tx1, lutStep, srcVal)] * ((1.0f - xa) * (ya));
+    res += lut[mad24(ty2 * tilesX + tx1, lutStep, srcVal + lut_offset)] * ((1.0f - xa) * (ya));
-    res += lut[mad24(ty2 * tilesX + tx2, lutStep, srcVal)] * ((xa) * (ya));
+    res += lut[mad24(ty2 * tilesX + tx2, lutStep, srcVal + lut_offset)] * ((xa) * (ya));
    uint ires = (uint)convert_int_rte(res);
-    dst[mad24(y, dstStep, x)] = convert_uchar(clamp(ires, (uint)0, (uint)255));
+    dst[mad24(y, dstStep, x + dst_offset)] = convert_uchar(clamp(ires, (uint)0, (uint)255));
 }
--- a/modules/ocl/src/opencl/imgproc_columnsum.cl
+++ b/modules/ocl/src/opencl/imgproc_columnsum.cl
@@ -43,38 +43,28 @@
 //
 //M*/
-#pragma OPENCL EXTENSION cl_amd_printf : enable
-#if defined (__ATI__)
-#pragma OPENCL EXTENSION cl_amd_fp64:enable
-#elif defined (__NVIDIA__)
-#pragma OPENCL EXTENSION cl_khr_fp64:enable
-#endif
 ////////////////////////////////////////////////////////////////////
 ///////////////////////// columnSum ////////////////////////////////
 ////////////////////////////////////////////////////////////////////
-/// CV_32FC1
-__kernel void columnSum_C1_D5(__global float* src,__global float* dst,int srcCols,int srcRows,int srcStep,int dstStep)
+__kernel void columnSum_C1_D5(__global float * src, __global float * dst,
+    int cols, int rows, int src_step, int dst_step, int src_offset, int dst_offset)
 {
    const int x = get_global_id(0);
-    srcStep >>= 2;
+    if (x < cols)
-    dstStep >>= 2;
-    if (x < srcCols)
    {
-        int srcIdx = x ;
+        int srcIdx = x + src_offset;
-        int dstIdx = x ;
+        int dstIdx = x + dst_offset;
        float sum = 0;
-        for (int y = 0; y < srcRows; ++y)
+        for (int y = 0; y < rows; ++y)
        {
            sum += src[srcIdx];
            dst[dstIdx] = sum;
-            srcIdx += srcStep;
+            srcIdx += src_step;
-            dstIdx += dstStep;
+            dstIdx += dst_step;
        }
    }
 }
--- a/modules/ocl/src/opencl/imgproc_convolve.cl
+++ b/modules/ocl/src/opencl/imgproc_convolve.cl
@@ -48,9 +48,12 @@
 #elif defined (__NVIDIA__)
 #pragma OPENCL EXTENSION cl_khr_fp64:enable
 #endif
 /************************************** convolve **************************************/
-__kernel void convolve_D5 (__global float *src, __global float *temp1, __global float *dst,
-                                  int rows, int cols, int src_step, int dst_step,int k_step, int kWidth, int kHeight)
+__kernel void convolve_D5(__global float *src, __global float *temp1, __global float *dst,
+                          int rows, int cols, int src_step, int dst_step,int k_step, int kWidth, int kHeight,
+                          int src_offset, int dst_offset, int koffset)
 {
    __local float smem[16 + 2 * 8][16 + 2 * 8];
@@ -65,7 +68,7 @@ __kernel void convolve_D5 (__global float *src, __global float *temp1, __global
            // 0 | 0 0 | 0
            // -----------
            // 0 | 0 0 | 0
-    smem[y][x] = src[min(max(gy - 8, 0), rows - 1)*(src_step >> 2) + min(max(gx - 8, 0), cols - 1)];
+    smem[y][x] = src[min(max(gy - 8, 0), rows - 1) * src_step + min(max(gx - 8, 0), cols - 1) + src_offset];
            // 0 | 0 x | x
            // -----------
@@ -73,7 +76,7 @@ __kernel void convolve_D5 (__global float *src, __global float *temp1, __global
            // 0 | 0 0 | 0
            // -----------
            // 0 | 0 0 | 0
-    smem[y][x + 16] = src[min(max(gy - 8, 0), rows - 1)*(src_step >> 2) + min(gx + 8, cols - 1)];
+    smem[y][x + 16] = src[min(max(gy - 8, 0), rows - 1) * src_step + min(gx + 8, cols - 1) + src_offset];
            // 0 | 0 0 | 0
            // -----------
@@ -81,7 +84,7 @@ __kernel void convolve_D5 (__global float *src, __global float *temp1, __global
            // x | x 0 | 0
            // -----------
            // x | x 0 | 0
-    smem[y + 16][x] = src[min(gy + 8, rows - 1)*(src_step >> 2) + min(max(gx - 8, 0), cols - 1)];
+    smem[y + 16][x] = src[min(gy + 8, rows - 1) * src_step + min(max(gx - 8, 0), cols - 1) + src_offset];
            // 0 | 0 0 | 0
            // -----------
@@ -89,21 +92,18 @@ __kernel void convolve_D5 (__global float *src, __global float *temp1, __global
            // 0 | 0 x | x
            // -----------
            // 0 | 0 x | x
-    smem[y + 16][x + 16] = src[min(gy + 8, rows - 1)*(src_step >> 2) + min(gx + 8, cols - 1)];
+    smem[y + 16][x + 16] = src[min(gy + 8, rows - 1) * src_step + min(gx + 8, cols - 1) + src_offset];
    barrier(CLK_LOCAL_MEM_FENCE);
    if (gx < cols && gy < rows)
    {
-       float res = 0;
+        float res = 0;
        for (int i = 0; i < kHeight; ++i)
-        {
            for (int j = 0; j < kWidth; ++j)
-            {
+                res += smem[y + 8 - kHeight / 2 + i][x + 8 - kWidth / 2 + j] * temp1[i * k_step + j + koffset];
-                res += smem[y + 8 - kHeight / 2 + i][x + 8 - kWidth / 2 + j] * temp1[i * (k_step>>2) + j];
-            }
+        dst[gy * dst_step + gx + dst_offset] = res;
-        }
+    }
-        dst[gy*(dst_step >> 2)+gx] = res;
-   }
 }
--- a/modules/ocl/src/opencl/imgproc_copymakeboder.cl
+++ b/modules/ocl/src/opencl/imgproc_copymakeboder.cl
@@ -34,6 +34,13 @@
 //
 //
+#if defined (DOUBLE_SUPPORT)
+#ifdef cl_khr_fp64
+#pragma OPENCL EXTENSION cl_khr_fp64:enable
+#elif defined (cl_amd_fp64)
+#pragma OPENCL EXTENSION cl_amd_fp64:enable
+#endif
+#endif
 #ifdef BORDER_CONSTANT
 //BORDER_CONSTANT:      iiiiii|abcdefgh|iiiiiii

--- a/modules/ocl/src/opencl/imgproc_remap.cl
+++ b/modules/ocl/src/opencl/imgproc_remap.cl
--- a/modules/ocl/test/test_filters.cpp
+++ b/modules/ocl/test/test_filters.cpp
@@ -62,8 +62,7 @@ PARAM_TEST_CASE(FilterTestBase, MatType,
                int, // border type, or iteration
                bool) // roi or not
 {
-    int type, borderType;
+    int type, borderType, ksize;
-    int ksize;
    bool useRoi;
    Mat src, dst_whole, src_roi, dst_roi;
@@ -92,8 +91,12 @@ PARAM_TEST_CASE(FilterTestBase, MatType,
    void Near(double threshold = 0.0)
    {
-        EXPECT_MAT_NEAR(dst_whole, Mat(gdst_whole), threshold);
+        Mat roi, whole;
-        EXPECT_MAT_NEAR(dst_roi, Mat(gdst_roi), threshold);
+        gdst_whole.download(whole);
+        gdst_roi.download(roi);
+        EXPECT_MAT_NEAR(dst_whole, whole, threshold);
+        EXPECT_MAT_NEAR(dst_roi, roi, threshold);
    }
 };

--- a/modules/ocl/test/test_imgproc.cpp
+++ b/modules/ocl/test/test_imgproc.cpp
--- a/modules/ocl/test/test_mean_shift.cpp
+++ b/modules/ocl/test/test_mean_shift.cpp
--- a/modules/ocl/test/test_warp.cpp
+++ b/modules/ocl/test/test_warp.cpp