Merge pull request #808 from bitwangyaoyao:2.4_mac

36028bd8 · Andrey Kamaev · OpenCV Buildbot · d2de68c1 · 719e8674 · 36028bd8
Commit 36028bd8, authored Apr 12, 2013 by Andrey Kamaev; committed by OpenCV Buildbot on Apr 12, 2013.
9 changed files
--- a/modules/nonfree/src/surf.ocl.cpp
+++ b/modules/nonfree/src/surf.ocl.cpp
@@ -77,7 +77,7 @@ namespace cv

                size_t wave_size = 0;
                queryDeviceInfo(WAVEFRONT_SIZE, &wave_size);
-                std::sprintf(pSURF_OPTIONS, " -D WAVE_SIZE=%d", static_cast<int>(wave_size));
+                std::sprintf(pSURF_OPTIONS, "-D WAVE_SIZE=%d", static_cast<int>(wave_size));
                OPTION_INIT = true;
            }
            openCLExecuteKernel(clCxt, source, kernelName, globalThreads, localThreads, args, channels, depth, SURF_OPTIONS);

--- a/modules/ocl/src/filtering.cpp
+++ b/modules/ocl/src/filtering.cpp
@@ -277,8 +277,7 @@ static void GPUErode(const oclMat &src, oclMat &dst, oclMat &mat_kernel,
    char compile_option[128];
    sprintf(compile_option, "-D RADIUSX=%d -D RADIUSY=%d -D LSIZE0=%d -D LSIZE1=%d -D ERODE %s %s", 
        anchor.x, anchor.y, (int)localThreads[0], (int)localThreads[1], 
-        rectKernel?"-D RECTKERNEL":"",
-        s);
+        s, rectKernel?"-D RECTKERNEL":"");
    vector< pair<size_t, const void *> > args;
    args.push_back(make_pair(sizeof(cl_mem), (void *)&src.data));
    args.push_back(make_pair(sizeof(cl_mem), (void *)&dst.data));

--- a/modules/ocl/src/opencl/arithm_flip.cl
+++ b/modules/ocl/src/opencl/arithm_flip.cl
@@ -330,16 +330,14 @@ __kernel void arithm_flip_cols_C1_D0 (__global uchar *src, int src_step, int src
    if (x < thread_cols && y < rows)
    {
        int src_index_0 = mad24(y, src_step, (x)           + src_offset);
-        int src_index_1 = mad24(y, src_step, (cols - x -1) + src_offset);
-
-        int dst_index_0 = mad24(y, dst_step, (x)           + dst_offset);
        int dst_index_1 = mad24(y, dst_step, (cols - x -1) + dst_offset);
-
        uchar data0 = *(src + src_index_0);
-        uchar data1 = *(src + src_index_1);
+        *(dst + dst_index_1) = data0;

+        int src_index_1 = mad24(y, src_step, (cols - x -1) + src_offset);
+        int dst_index_0 = mad24(y, dst_step, (x)           + dst_offset);
+        uchar data1 = *(src + src_index_1);
        *(dst + dst_index_0) = data1;
-        *(dst + dst_index_1) = data0;
    }
 }
 __kernel void arithm_flip_cols_C1_D1 (__global char *src, int src_step, int src_offset,

--- a/modules/ocl/src/opencl/filter_sep_row.cl
+++ b/modules/ocl/src/opencl/filter_sep_row.cl
--- a/modules/ocl/src/opencl/filtering_laplacian.cl
+++ b/modules/ocl/src/opencl/filtering_laplacian.cl
--- a/modules/ocl/src/opencl/imgproc_integral.cl
+++ b/modules/ocl/src/opencl/imgproc_integral.cl
@@ -44,7 +44,11 @@
 //M*/

 #if defined (DOUBLE_SUPPORT)
+#ifdef cl_khr_fp64
 #pragma OPENCL EXTENSION cl_khr_fp64:enable
+#elif defined (cl_amd_fp64)
+#pragma OPENCL EXTENSION cl_amd_fp64:enable
+#endif
 #endif
 #define LSIZE 256
 #define LSIZE_1 255
@@ -71,13 +75,13 @@ kernel void integral_cols(__global uchar4 *src,__global int *sum ,__global float
    gid = gid << 1;
    for(int i = 0; i < rows; i =i + LSIZE_1)
    {
-        src_t[0] = (i + lid < rows ? convert_int4(src[src_offset + (lid+i) * src_step + gid]) : 0);
-        src_t[1] = (i + lid < rows ? convert_int4(src[src_offset + (lid+i) * src_step + gid + 1]) : 0);
+        src_t[0] = (i + lid < rows ? convert_int4(src[src_offset + (lid+i) * src_step + min(gid, (uint)cols - 1)]) : 0);
+        src_t[1] = (i + lid < rows ? convert_int4(src[src_offset + (lid+i) * src_step + min(gid + 1, (uint)cols - 1)]) : 0);

        sum_t[0] = (i == 0 ? 0 : lm_sum[0][LSIZE_2 + LOG_LSIZE]);
-        sqsum_t[0] = (i == 0 ? 0 : lm_sqsum[0][LSIZE_2 + LOG_LSIZE]);
+        sqsum_t[0] = (i == 0 ? (float4)0 : lm_sqsum[0][LSIZE_2 + LOG_LSIZE]);
        sum_t[1] =  (i == 0 ? 0 : lm_sum[1][LSIZE_2 + LOG_LSIZE]);
-        sqsum_t[1] =  (i == 0 ? 0 : lm_sqsum[1][LSIZE_2 + LOG_LSIZE]);
+        sqsum_t[1] =  (i == 0 ? (float4)0 : lm_sqsum[1][LSIZE_2 + LOG_LSIZE]);
        barrier(CLK_LOCAL_MEM_FENCE);

        int bf_loc = lid + GET_CONFLICT_OFFSET(lid);
@@ -127,7 +131,8 @@ kernel void integral_cols(__global uchar4 *src,__global int *sum ,__global float
        }
        barrier(CLK_LOCAL_MEM_FENCE);
        int loc_s0 = gid * dst_step + i + lid - 1 - pre_invalid * dst_step / 4, loc_s1 = loc_s0 + dst_step ;
-        if(lid > 0 && (i+lid) <= rows){
+        if(lid > 0 && (i+lid) <= rows)
+        {
            lm_sum[0][bf_loc] += sum_t[0];
            lm_sum[1][bf_loc] += sum_t[1];
            lm_sqsum[0][bf_loc] += sqsum_t[0];
@@ -169,15 +174,15 @@ kernel void integral_rows(__global int4 *srcsum,__global float4 * srcsqsum,__glo
    src_step = src_step >> 4;
    for(int i = 0; i < rows; i =i + LSIZE_1)
    {
-        src_t[0] = i + lid < rows ? srcsum[(lid+i) * src_step + gid * 2] : 0;
-        sqsrc_t[0] = i + lid < rows ? srcsqsum[(lid+i) * src_step + gid * 2] : 0;
-        src_t[1] = i + lid < rows ? srcsum[(lid+i) * src_step + gid * 2 + 1] : 0;
-        sqsrc_t[1] = i + lid < rows ? srcsqsum[(lid+i) * src_step + gid * 2 + 1] : 0;
+        src_t[0] = i + lid < rows ? srcsum[(lid+i) * src_step + gid * 2] : (int4)0;
+        sqsrc_t[0] = i + lid < rows ? srcsqsum[(lid+i) * src_step + gid * 2] : (float4)0;
+        src_t[1] = i + lid < rows ? srcsum[(lid+i) * src_step + gid * 2 + 1] : (int4)0;
+        sqsrc_t[1] = i + lid < rows ? srcsqsum[(lid+i) * src_step + gid * 2 + 1] : (float4)0;

        sum_t[0] =  (i == 0 ? 0 : lm_sum[0][LSIZE_2 + LOG_LSIZE]);
-        sqsum_t[0] =  (i == 0 ? 0 : lm_sqsum[0][LSIZE_2 + LOG_LSIZE]);
+        sqsum_t[0] =  (i == 0 ? (float4)0 : lm_sqsum[0][LSIZE_2 + LOG_LSIZE]);
        sum_t[1] =  (i == 0 ? 0 : lm_sum[1][LSIZE_2 + LOG_LSIZE]);
-        sqsum_t[1] =  (i == 0 ? 0 : lm_sqsum[1][LSIZE_2 + LOG_LSIZE]);
+        sqsum_t[1] =  (i == 0 ? (float4)0 : lm_sqsum[1][LSIZE_2 + LOG_LSIZE]);
        barrier(CLK_LOCAL_MEM_FENCE);

        int bf_loc = lid + GET_CONFLICT_OFFSET(lid);
@@ -228,14 +233,14 @@ kernel void integral_rows(__global int4 *srcsum,__global float4 * srcsqsum,__glo
        barrier(CLK_LOCAL_MEM_FENCE);
        if(gid == 0 && (i + lid) <= rows)
        {
-           sum[sum_offset + i + lid] = 0;
-           sqsum[sqsum_offset + i + lid] = 0;
+            sum[sum_offset + i + lid] = 0;
+            sqsum[sqsum_offset + i + lid] = 0;
        }
        if(i + lid == 0)
        {
            int loc0 = gid * 2 * sum_step;
            int loc1 = gid * 2 * sqsum_step;
-            for(int k = 1;k <= 8;k++)
+            for(int k = 1; k <= 8; k++)
            {
                if(gid * 8 + k > cols) break;
                sum[sum_offset + loc0 + k * sum_step / 4] = 0;
@@ -244,7 +249,8 @@ kernel void integral_rows(__global int4 *srcsum,__global float4 * srcsqsum,__glo
        }
        int loc_s0 = sum_offset + gid * 2 * sum_step + sum_step / 4 + i + lid, loc_s1 = loc_s0 + sum_step ;
        int loc_sq0 = sqsum_offset + gid * 2 * sqsum_step + sqsum_step / 4 + i + lid, loc_sq1 = loc_sq0 + sqsum_step ;
-        if(lid > 0 && (i+lid) <= rows){
+        if(lid > 0 && (i+lid) <= rows)
+        {
            lm_sum[0][bf_loc] += sum_t[0];
            lm_sum[1][bf_loc] += sum_t[1];
            lm_sqsum[0][bf_loc] += sqsum_t[0];

--- a/modules/ocl/src/opencl/imgproc_warpAffine.cl
+++ b/modules/ocl/src/opencl/imgproc_warpAffine.cl
--- a/modules/ocl/src/opencl/imgproc_warpPerspective.cl
+++ b/modules/ocl/src/opencl/imgproc_warpPerspective.cl
--- a/modules/ocl/src/opencl/match_template.cl
+++ b/modules/ocl/src/opencl/match_template.cl
@@ -447,10 +447,10 @@ void matchTemplate_Naive_CCORR_C1_D0
            __global const uchar * tpl_ptr = tpl + mad24(i, tpl_step, tpl_offset);
            for(j = 0; j < tpl_cols; j ++)
            {
-                sum = mad24(img_ptr[j], tpl_ptr[j], sum);
+                sum = mad24(convert_int(img_ptr[j]), convert_int(tpl_ptr[j]), sum);
            }
        }
-        res[res_idx] = sum;
+        res[res_idx] = (float)sum;
    }
 }

@@ -548,7 +548,7 @@ void matchTemplate_Naive_CCORR_C4_D0
                sum   = mad24(convert_int4(img_ptr[j]), convert_int4(tpl_ptr[j]), sum);
            }
        }
-        res[res_idx] = sum.x + sum.y + sum.z + sum.w;
+        res[res_idx] = (float)(sum.x + sum.y + sum.z + sum.w);
    }
 }

@@ -633,9 +633,8 @@ void matchTemplate_Prepared_CCOFF_C1_D0

    if(gidx < res_cols && gidy < res_rows)
    {
-        float sum = (float)(
-                        (img_sums[SUMS_PTR(tpl_cols, tpl_rows)] - img_sums[SUMS_PTR(tpl_cols, 0)])
-                        - (img_sums[SUMS_PTR(0, tpl_rows)] - img_sums[SUMS_PTR(0, 0)]));
+        float sum = (float)((img_sums[SUMS_PTR(tpl_cols, tpl_rows)] - img_sums[SUMS_PTR(tpl_cols, 0)])
+                            -(img_sums[SUMS_PTR(0, tpl_rows)] - img_sums[SUMS_PTR(0, 0)]));
        res[res_idx] -= sum * tpl_sum;
    }
 }