// gaussianBlur5x5.cl (5.64 KB)
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.

// DIG(a) expands to "a," so a run of DIG(...) invocations forms a
// comma-separated initializer list.
// NOTE(review): KERNEL_MATRIX_X / KERNEL_MATRIX_Y are not defined in this file;
// presumably they are passed as build options and expand to DIG(...) entries —
// confirm against the host code.
#define DIG(a) a,
// Filter weights indexed by tap column; OP() reads kx[0]..kx[4].
__constant float kx[] = { KERNEL_MATRIX_X };
// Filter weights indexed by tap row; OP() reads ky[0]..ky[4].
__constant float ky[] = { KERNEL_MATRIX_Y };

// OP(y, x): contribution of filter tap (row y, column x) to the four output
// pixels handled by one work-item. arr[y * 5 + x] holds the four source bytes
// seen by that tap; they are widened to float and scaled by ky[y] * kx[x].
// Expects `arr` to be in scope at the point of use.
// The arguments are parenthesized in the index arithmetic so that expression
// arguments (e.g. OP(r + 1, c)) address the intended element.
#define OP(y, x) (convert_float4(arr[(y) * 5 + (x)]) * ky[y] * kx[x])

// FILL_ARR(s1, s2, n, e1, e2): expands window row n (line[n], 8 bytes) into the
// five uchar4 tap vectors arr[5 * n + 0 .. 5 * n + 4]. Tap k normally takes
// elements k..k+3 of line[n], i.e. a 4-wide window slid one element per tap.
//   - When row_s is set, elements s0/s1 of line[n] are not used; s1 and s2
//     (the macro arguments) are substituted for them in taps 0 and 1.
//   - When row_e is set, elements s6/s7 of line[n] are not used; e1 and e2
//     are substituted for them in taps 3 and 4.
// Expects `arr`, `line`, `row_s` and `row_e` to be in scope at the point of use.
// Wrapped in do { } while (0) so the expansion is a single statement; invoke
// it with a trailing semicolon.
#define FILL_ARR(s1, s2, n, e1, e2)                                                         \
    do {                                                                                    \
        arr[5 * (n) + 0] = row_s ? (uchar4)(s1, s2, line[n].s23) : (uchar4)(line[n].s0123); \
        arr[5 * (n) + 1] = row_s ? (uchar4)(s2, line[n].s234) : (uchar4)(line[n].s1234);    \
        arr[5 * (n) + 2] = (uchar4)(line[n].s2345);                                         \
        arr[5 * (n) + 3] = row_e ? (uchar4)(line[n].s345, e1) : (uchar4)(line[n].s3456);    \
        arr[5 * (n) + 4] = row_e ? (uchar4)(line[n].s45, e1, e2) : (uchar4)(line[n].s4567); \
    } while (0)

// 5x5 weighted blur of a single-channel 8-bit image; each work-item produces
// four horizontally adjacent output pixels (columns x..x+3 of row y) and
// stores them as one packed uint.
//
// src      - source pixels, addressed as bytes
// src_step - distance between consecutive source rows, in elements of src
// dst      - destination, written one uint (four packed pixels) per work-item
// dst_step - destination row pitch; divided by 4 to index dst in uints
// rows     - number of image rows
// cols     - number of image columns
//
// Exactly one of BORDER_CONSTANT, BORDER_REPLICATE, BORDER_REFLECT or
// BORDER_REFLECT_101 is expected to be defined at build time.
// NOTE(review): the right image edge is only detected when x + 4 == cols, so
// cols is presumably required to be a multiple of 4 — confirm with the host code.
__kernel void gaussianBlur5x5_8UC1_cols4(__global const uchar* src, int src_step,
                                         __global uint* dst, int dst_step, int rows, int cols)
{
    int x = get_global_id(0) * 4;
    int y = get_global_id(1);

    if (x >= cols || y >= rows) return;

    // line[r] holds 8 source bytes of window row r (image row y - 2 + r).
    // Element s2 corresponds to column x, so s0..s7 cover columns x-2..x+5.
    uchar8 line[5];
    int offset, src_index;

    // Index of the first byte of window row 0: row y - 2, column x - 2.
    src_index = x + (y - 2) * src_step - 2;

    // Centre row. A negative start index is clamped to 0; offset == 0 is then
    // treated as "clamped" and the data is shifted two elements right so that
    // s2 lines up with column x again. The two leading zeros are placeholders:
    // this presumably only happens for x == 0, where FILL_ARR replaces s0/s1.
    offset = max(0, src_index + 2 * src_step);
    line[2] = vload8(0, src + offset);
    if (offset == 0) line[2] = (uchar8)(0, 0, line[2].s0123, line[2].s45);

#if defined BORDER_CONSTANT || defined BORDER_REPLICATE
    // Rows outside the image start out as `tmp` (zeros for CONSTANT, the
    // centre row for REPLICATE) and are overwritten where a real row exists.
    uchar8 tmp;
#ifdef BORDER_CONSTANT
    tmp = (uchar8)0;
#elif defined BORDER_REPLICATE
    tmp = line[2];
#endif
    line[0] = line[1] = tmp;
    if (y > 1)
    {
        offset = max(0, src_index);
        line[0] = vload8(0, src + offset);
        if (offset == 0) line[0] = (uchar8)(0, 0, line[0].s0123, line[0].s45);
    }

    if (y > 0)
    {
        offset = max(0, src_index + src_step);
        line[1] = vload8(0, src + offset);
        if (offset == 0) line[1] = (uchar8)(0, 0, line[1].s0123, line[1].s45);
    }

    line[3] = (y == (rows - 1)) ? tmp : vload8(0, src + src_index + 3 * src_step);
    line[4] = (y >= (rows - 2)) ? tmp : vload8(0, src + src_index + 4 * src_step);
#if defined BORDER_REPLICATE && !defined BORDER_CONSTANT
    // Replicate must repeat the edge row, not the centre row. `tmp` is only the
    // edge row when y itself is the edge; one row in from the edge the missing
    // row is the neighbouring window row instead.
    if (y == 1) line[0] = line[1];          // row -1 -> row 0
    if (y == (rows - 2)) line[4] = line[3]; // row `rows` -> row rows - 1
#endif
#elif BORDER_REFLECT
    // Mirror including the edge row: row -1 -> 0, row -2 -> 1,
    // row `rows` -> rows - 1, row rows + 1 -> rows - 2.
    int t;
    // t is the row offset (relative to window row 0) to load for line[0]:
    // 3 when y == 0, 1 when y == 1, otherwise 0.
    t = (y <= 1) ? (abs(y - 1) - y + 2) : 0;
    offset = max(0, src_index + t * src_step);
    line[0] = vload8(0, src + offset);
    if (offset == 0) line[0] = (uchar8)(0, 0, line[0].s0123, line[0].s45);

    if (y == 0)
        line[1] = line[2];
    else
    {
        offset = max(0, src_index + 1 * src_step);
        line[1] = vload8(0, src + offset);
        // Realign using line[1]'s own bytes (previously this took s45 from
        // line[0], which has already been shifted and holds different columns).
        if (offset == 0) line[1] = (uchar8)(0, 0, line[1].s0123, line[1].s45);
    }

    line[3] = (y == (rows - 1)) ? line[2] : vload8(0, src + src_index + 3 * src_step);

    // Row offset for line[4]: 3 when y == rows - 2, 1 when y == rows - 1,
    // otherwise 4.
    t = (y >= (rows - 2)) ? (abs(y - (rows - 1)) - (y - (rows - 2)) + 2) : 4;
    line[4] = vload8(0, src + src_index + t * src_step);
#elif BORDER_REFLECT_101
    // Mirror without repeating the edge row: row -1 -> 1, row -2 -> 2,
    // row `rows` -> rows - 2, row rows + 1 -> rows - 3.
    if (y == 1)
        line[0] = line[2];
    else
    {
        offset = (y == 0) ? (src_index + 4 * src_step) : max(0, src_index);
        line[0] = vload8(0, src + offset);
        if (offset == 0) line[0] = (uchar8)(0, 0, line[0].s0123, line[0].s45);
    }

    offset = (y == 0) ? (src_index + 3 * src_step) : max(0, src_index + 1 * src_step);
    line[1] = vload8(0, src + offset);
    if (offset == 0) line[1] = (uchar8)(0, 0, line[1].s0123, line[1].s45);

    line[3] = vload8(0, src + src_index + ((y == (rows - 1)) ? 1 : 3) * src_step);
    if (y == (rows - 2))
        line[4] = line[2];
    else if (y == (rows - 1))
        // Row y + 2 mirrors to row y - 2, which line[0] already holds (aligned).
        // Previously row y - 1 was loaded here, duplicating line[3].
        line[4] = line[0];
    else
    {
        line[4] = vload8(0, src + src_index + 4 * src_step);
    }
#endif

    // row_s / row_e: this work-item touches the left / right image edge, so
    // FILL_ARR must substitute border values for the out-of-image columns.
    bool row_s = (x == 0);
    bool row_e = ((x + 4) == cols);
    // arr[5 * r + c]: the four source bytes seen by tap (r, c).
    uchar4 arr[25];
    uchar s, e;

#ifdef BORDER_CONSTANT
    s = e = 0;

    FILL_ARR(s, s, 0, e, e);
    FILL_ARR(s, s, 1, e, e);
    FILL_ARR(s, s, 2, e, e);
    FILL_ARR(s, s, 3, e, e);
    FILL_ARR(s, s, 4, e, e);
#elif defined BORDER_REPLICATE
    // s2 is column x (first column when row_s); s5 is column x + 3 (last
    // column when row_e).
    s = line[0].s2;
    e = line[0].s5;
    FILL_ARR(s, s, 0, e, e);

    s = line[1].s2;
    e = line[1].s5;
    FILL_ARR(s, s, 1, e, e);

    s = line[2].s2;
    e = line[2].s5;
    FILL_ARR(s, s, 2, e, e);

    s = line[3].s2;
    e = line[3].s5;
    FILL_ARR(s, s, 3, e, e);

    s = line[4].s2;
    e = line[4].s5;
    FILL_ARR(s, s, 4, e, e);
#elif BORDER_REFLECT
    // Left: columns -2, -1 take columns 1, 0 (s3, s2).
    // Right: columns x + 4, x + 5 take columns x + 3, x + 2 (s5, s4).
    uchar s1, s2;
    uchar e1, e2;

    s1 = line[0].s3;
    s2 = line[0].s2;
    e1 = line[0].s5;
    e2 = line[0].s4;
    FILL_ARR(s1, s2, 0, e1, e2);

    s1 = line[1].s3;
    s2 = line[1].s2;
    e1 = line[1].s5;
    e2 = line[1].s4;
    FILL_ARR(s1, s2, 1, e1, e2);

    s1 = line[2].s3;
    s2 = line[2].s2;
    e1 = line[2].s5;
    e2 = line[2].s4;
    FILL_ARR(s1, s2, 2, e1, e2);

    s1 = line[3].s3;
    s2 = line[3].s2;
    e1 = line[3].s5;
    e2 = line[3].s4;
    FILL_ARR(s1, s2, 3, e1, e2);

    s1 = line[4].s3;
    s2 = line[4].s2;
    e1 = line[4].s5;
    e2 = line[4].s4;
    FILL_ARR(s1, s2, 4, e1, e2);
#elif BORDER_REFLECT_101
    // Left: columns -2, -1 take columns 2, 1 (s4, s3).
    // Right: columns x + 4, x + 5 take columns x + 2, x + 1 (s4, s3).
    // The same pair therefore serves both edges.
    s = line[0].s4;
    e = line[0].s3;
    FILL_ARR(s, e, 0, s, e);

    s = line[1].s4;
    e = line[1].s3;
    FILL_ARR(s, e, 1, s, e);

    s = line[2].s4;
    e = line[2].s3;
    FILL_ARR(s, e, 2, s, e);

    s = line[3].s4;
    e = line[3].s3;
    FILL_ARR(s, e, 3, s, e);

    s = line[4].s4;
    e = line[4].s3;
    FILL_ARR(s, e, 4, s, e);
#endif

    // Accumulate all 25 taps for the four output pixels at once.
    float4 sum;
    sum = OP(0, 0) + OP(0, 1) + OP(0, 2) + OP(0, 3) + OP(0, 4) +
          OP(1, 0) + OP(1, 1) + OP(1, 2) + OP(1, 3) + OP(1, 4) +
          OP(2, 0) + OP(2, 1) + OP(2, 2) + OP(2, 3) + OP(2, 4) +
          OP(3, 0) + OP(3, 1) + OP(3, 2) + OP(3, 3) + OP(3, 4) +
          OP(4, 0) + OP(4, 1) + OP(4, 2) + OP(4, 3) + OP(4, 4);

    // Round to nearest even with saturation to 0..255, then store the four
    // bytes as a single uint. dst is indexed in uints, hence the divisions by 4.
    int dst_index = (x / 4) + y * (dst_step / 4);
    dst[dst_index] = as_uint(convert_uchar4_sat_rte(sum));
}