• vbystricky's avatar
    Change kernel for optimization. Remove restriction to align data · 09bcc061
    vbystricky authored
    Fix kernel compilation errors on AMD system
    
    Fix licanse information in cl file
    
    Support CV_64F destination type
    
    Change build options of the kernel
    
    Optimize sum of square
    
    Remove separate kernel for integral square
    
    Increase epsilon for perfomance tests
    
    Increase epsilon for perfomance tests
    
    Test double support on AMD devices
    
    Fix some issues
    
    Try to fix problems with AMD device
    
    Try to solve problem with AMD device
    
    Fix error of destination size in kernel
    
    Fix warnings
    09bcc061
integral_sum.cl 6.15 KB
/*M///////////////////////////////////////////////////////////////////////////////////////
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
// Copyright (C) 2014, Itseez, Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//M*/

#ifdef DOUBLE_SUPPORT
#ifdef cl_amd_fp64
#pragma OPENCL EXTENSION cl_amd_fp64:enable
#elif defined (cl_khr_fp64)
#pragma OPENCL EXTENSION cl_khr_fp64:enable
#endif
#endif

#ifndef LOCAL_SUM_SIZE
#define LOCAL_SUM_SIZE      16
#endif

#define LOCAL_SUM_STRIDE    (LOCAL_SUM_SIZE + 1)


kernel void integral_sum_cols(__global const uchar *src_ptr, int src_step, int src_offset, int rows, int cols,
                              __global uchar *buf_ptr, int buf_step, int buf_offset
#ifdef SUM_SQUARE
                              ,__global uchar *buf_sq_ptr, int buf_sq_step, int buf_sq_offset
#endif
                              )
{
    __local sumT lm_sum[LOCAL_SUM_STRIDE * LOCAL_SUM_SIZE];
#ifdef SUM_SQUARE
    __local sumSQT lm_sum_sq[LOCAL_SUM_STRIDE * LOCAL_SUM_SIZE];
#endif
    int lid = get_local_id(0);
    int gid = get_group_id(0);

    int x = get_global_id(0);
    int src_index = x + src_offset;

    sumT accum = 0;
#ifdef SUM_SQUARE
    sumSQT accum_sq = 0;
#endif
    for (int y = 0; y < rows; y += LOCAL_SUM_SIZE)
    {
        int lsum_index = lid;
        #pragma unroll
        for (int yin = 0; yin < LOCAL_SUM_SIZE; yin++, src_index+=src_step, lsum_index += LOCAL_SUM_STRIDE)
        {
            if ((x < cols) && (y + yin < rows))
            {
                __global const uchar *src = src_ptr + src_index;
                accum += src[0];
#ifdef SUM_SQUARE
                sumSQT temp = src[0] * src[0];
                accum_sq += temp;
#endif
            }
            lm_sum[lsum_index] = accum;
#ifdef SUM_SQUARE
            lm_sum_sq[lsum_index] = accum_sq;
#endif
        }
        barrier(CLK_LOCAL_MEM_FENCE);

        //int buf_index = buf_offset + buf_step * LOCAL_SUM_COLS * gid + sizeof(sumT) * y + sizeof(sumT) * lid;
        int buf_index = mad24(buf_step, LOCAL_SUM_SIZE * gid, mad24((int)sizeof(sumT), y + lid, buf_offset));
#ifdef SUM_SQUARE
        int buf_sq_index = mad24(buf_sq_step, LOCAL_SUM_SIZE * gid, mad24((int)sizeof(sumSQT), y + lid, buf_sq_offset));
#endif

        lsum_index = LOCAL_SUM_STRIDE * lid;
        #pragma unroll
        for (int yin = 0; yin < LOCAL_SUM_SIZE; yin++, lsum_index ++)
        {
            __global sumT *buf = (__global sumT *)(buf_ptr + buf_index);
            buf[0] = lm_sum[lsum_index];
            buf_index += buf_step;
#ifdef SUM_SQUARE
            __global sumSQT *bufsq = (__global sumSQT *)(buf_sq_ptr + buf_sq_index);
            bufsq[0] = lm_sum_sq[lsum_index];
            buf_sq_index += buf_sq_step;
#endif
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
}

kernel void integral_sum_rows(__global const uchar *buf_ptr, int buf_step, int buf_offset,
#ifdef SUM_SQUARE
                              __global uchar *buf_sq_ptr, int buf_sq_step, int buf_sq_offset,
#endif
                              __global uchar *dst_ptr, int dst_step, int dst_offset, int rows, int cols
#ifdef SUM_SQUARE
                              ,__global uchar *dst_sq_ptr, int dst_sq_step, int dst_sq_offset
#endif
                              )
{
    __local sumT lm_sum[LOCAL_SUM_STRIDE * LOCAL_SUM_SIZE];
#ifdef SUM_SQUARE
    __local sumSQT lm_sum_sq[LOCAL_SUM_STRIDE * LOCAL_SUM_SIZE];
#endif
    int lid = get_local_id(0);
    int gid = get_group_id(0);

    int gs = get_global_size(0);

    int x = get_global_id(0);

    __global sumT *dst = (__global sumT *)(dst_ptr + dst_offset);
    for (int xin = x; xin < cols; xin += gs)
    {
        dst[xin] = 0;
    }
    dst_offset += dst_step;

    if (x < rows - 1)
    {
        dst = (__global sumT *)(dst_ptr + mad24(x, dst_step, dst_offset));
        dst[0] = 0;
    }

    int buf_index = mad24((int)sizeof(sumT), x, buf_offset);
    sumT accum = 0;

#ifdef SUM_SQUARE
    __global sumSQT *dst_sq = (__global sumT *)(dst_sq_ptr + dst_sq_offset);
    for (int xin = x; xin < cols; xin += gs)
    {
        dst_sq[xin] = 0;
    }
    dst_sq_offset += dst_sq_step;

    dst_sq = (__global sumSQT *)(dst_sq_ptr + mad24(x, dst_sq_step, dst_sq_offset));
    dst_sq[0] = 0;

    int buf_sq_index = mad24((int)sizeof(sumSQT), x, buf_sq_offset);
    sumSQT accum_sq = 0;
#endif

    for (int y = 1; y < cols; y += LOCAL_SUM_SIZE)
    {
        int lsum_index = lid;
        #pragma unroll
        for (int yin = 0; yin < LOCAL_SUM_SIZE; yin++, lsum_index += LOCAL_SUM_STRIDE)
        {
            __global const sumT *buf = (__global const sumT *)(buf_ptr + buf_index);
            accum += buf[0];
            lm_sum[lsum_index] = accum;
            buf_index += buf_step;
#ifdef SUM_SQUARE
            __global const sumSQT *buf_sq = (__global const sumSQT *)(buf_sq_ptr + buf_sq_index);
            accum_sq += buf_sq[0];
            lm_sum_sq[lsum_index] = accum_sq;
            buf_sq_index += buf_sq_step;
#endif
        }
        barrier(CLK_LOCAL_MEM_FENCE);

        if (y + lid < cols)
        {
            //int dst_index = dst_offset + dst_step *  LOCAL_SUM_COLS * gid + sizeof(sumT) * y + sizeof(sumT) * lid;
            int dst_index = mad24(dst_step, LOCAL_SUM_SIZE * gid, mad24((int)sizeof(sumT), y + lid, dst_offset));
#ifdef SUM_SQUARE
            int dst_sq_index = mad24(dst_sq_step, LOCAL_SUM_SIZE * gid, mad24((int)sizeof(sumSQT), y + lid, dst_sq_offset));
#endif
            lsum_index = LOCAL_SUM_STRIDE * lid;
            int yin_max = min(rows - 1 -  LOCAL_SUM_SIZE * gid, LOCAL_SUM_SIZE);
            #pragma unroll
            for (int yin = 0; yin < yin_max; yin++, lsum_index++)
            {
                dst = (__global sumT *)(dst_ptr + dst_index);
                dst[0] = lm_sum[lsum_index];
                dst_index += dst_step;
#ifdef SUM_SQUARE
                dst_sq = (__global sumSQT *)(dst_sq_ptr + dst_sq_index);
                dst_sq[0] = lm_sum_sq[lsum_index];
                dst_sq_index += dst_sq_step;
#endif
            }
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
}