/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                           License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2010-2013, Multicoreware, Inc., all rights reserved.
// Copyright (C) 2010-2013, Advanced Micro Devices, Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// @Authors
//    Peng Xiao, pengxiao@multicorewareinc.com
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistribution's of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistribution's in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other oclMaterials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors as is and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/

//data (which is float) is aligend in 32 bytes
#define WIDTH_MULTIPLE (32 >> 2)

/////////////////////////////////////////////////////////
//------------------------------------------------------
// basicretinafilter
//////////////// _spatiotemporalLPfilter ////////////////
//_horizontalCausalFilter_addInput
kernel void horizontalCausalFilter_addInput(
    global const float * input,
    global float * output,
    const int cols,
    const int rows,
    const int elements_per_row,
    const int in_offset,
    const int out_offset,
    const float _tau,
    const float _a
)
{
    int gid = get_global_id(0);
    if(gid >= rows)
    {
        return;
    }

    global const float * iptr =
        input  + mad24(gid, elements_per_row, in_offset / 4);
    global float * optr =
        output + mad24(gid, elements_per_row, out_offset / 4);

    float res;
    float4 in_v4, out_v4, sum_v4, res_v4 = (float4)(0);
    //vectorize to increase throughput
    for(int i = 0; i < cols / 4; ++i, iptr += 4, optr += 4)
    {
        in_v4  = vload4(0, iptr);
        out_v4 = vload4(0, optr) * _tau;
        sum_v4 = in_v4 + out_v4;

        res_v4.x = sum_v4.x + _a * res_v4.w;
        res_v4.y = sum_v4.y + _a * res_v4.x;
        res_v4.z = sum_v4.z + _a * res_v4.y;
        res_v4.w = sum_v4.w + _a * res_v4.z;

        vstore4(res_v4, 0, optr);
    }

    optr = output + mad24(gid + 1, elements_per_row, -4 + out_offset / 4);
    res_v4 = (float4)(0);
    for(int i = 0; i < elements_per_row / 4; ++i, optr -= 4)
    {
        // shift left, `offset` is type `size_t` so it cannot be negative
        out_v4 = vload4(0, optr);

        res_v4.w = out_v4.w + _a * res_v4.x;
        res_v4.z = out_v4.z + _a * res_v4.w;
        res_v4.y = out_v4.y + _a * res_v4.z;
        res_v4.x = out_v4.x + _a * res_v4.y;

        vstore4(res_v4, 0, optr);
    }
}

//_verticalCausalFilter
kernel void verticalCausalFilter(
    global float * output,
    const int cols,
    const int rows,
    const int elements_per_row,
    const int out_offset,
    const float _a,
    const float _gain
)
{
    int gid = get_global_id(0) * 2;
    if(gid >= cols)
    {
        return;
    }

    global float * optr = output + gid + out_offset / 4;
    float2 input;
    float2 result = (float2)0;
    for(int i = 0; i < rows; ++i, optr += elements_per_row)
    {
        input = vload2(0, optr);
        result = input + _a * result;
        vstore2(result, 0, optr);
    }

    optr = output + (rows - 1) * elements_per_row + gid + out_offset / 4;
    result = (float2)0;
    for(int i = 0; i < rows; ++i, optr -= elements_per_row)
    {
        input = vload2(0, optr);
        result = input + _a * result;
        vstore2(_gain * result, 0, optr);
    }
}

kernel void verticalCausalFilter_multichannel(
    global float * output,
    const int cols,
    const int rows,
    const int elements_per_row,
    const int out_offset,
    const float _a,
    const float _gain
)
{
    int gid = get_global_id(0) * 2;
    if(gid >= cols)
    {
        return;
    }

    global float * optr[3];
    float2 input[3];
    float2 result[3] = { (float2)0, (float2)0, (float2)0 };

    optr[0] = output + gid + out_offset / 4;
    optr[1] = output + gid + out_offset / 4 + rows * elements_per_row;
    optr[2] = output + gid + out_offset / 4 + 2 * rows * elements_per_row;

    for(int i = 0; i < rows; ++i)
    {
        input[0] = vload2(0, optr[0]);
        input[1] = vload2(0, optr[1]);
        input[2] = vload2(0, optr[2]);

        result[0] = input[0] + _a * result[0];
        result[1] = input[1] + _a * result[1];
        result[2] = input[2] + _a * result[2];

        vstore2(result[0], 0, optr[0]);
        vstore2(result[1], 0, optr[1]);
        vstore2(result[2], 0, optr[2]);

        optr[0] += elements_per_row;
        optr[1] += elements_per_row;
        optr[2] += elements_per_row;
    }

    optr[0] = output + (rows - 1) * elements_per_row + gid + out_offset / 4;
    optr[1] = output + (rows - 1) * elements_per_row + gid + out_offset / 4 + rows * elements_per_row;
    optr[2] = output + (rows - 1) * elements_per_row + gid + out_offset / 4 + 2 * rows * elements_per_row;
    result[0] = result[1] = result[2] = (float2)0;

    for(int i = 0; i < rows; ++i)
    {
        input[0] = vload2(0, optr[0]);
        input[1] = vload2(0, optr[1]);
        input[2] = vload2(0, optr[2]);

        result[0] = input[0] + _a * result[0];
        result[1] = input[1] + _a * result[1];
        result[2] = input[2] + _a * result[2];

        vstore2(_gain * result[0], 0, optr[0]);
        vstore2(_gain * result[1], 0, optr[1]);
        vstore2(_gain * result[2], 0, optr[2]);

        optr[0] -= elements_per_row;
        optr[1] -= elements_per_row;
        optr[2] -= elements_per_row;
    }
}

//
// end of _spatiotemporalLPfilter
/////////////////////////////////////////////////////////////////////

//////////////// verticalCausalFilter_Irregular ////////////////
//////////////// verticalCausalFilter_Irregular ////////////////
kernel void verticalCausalFilter_Irregular(
    global float * output,
    global float * buffer,
    const int cols,
    const int rows,
    const int elements_per_row,
    const int out_offset,
    const int buffer_offset,
    const float gain
)
{
    int gid = get_global_id(0) * 2;
    if(gid >= cols)
    {
        return;
    }

    global float * optr[3];
    global float * bptr = buffer + gid + buffer_offset / 4;
    float2 result[3] = { (float2)0, (float2)0, (float2)0 };
    float2 grad, input[3];
    optr[0] = output + gid + out_offset / 4;
    optr[1] = output + gid + out_offset / 4 + rows * elements_per_row;
    optr[2] = output + gid + out_offset / 4 + 2 * rows * elements_per_row;
    for(int i = 0; i < rows; ++i, bptr += elements_per_row)
    {
        input[0] = vload2(0, optr[0]);
        input[1] = vload2(0, optr[1]);
        input[2] = vload2(0, optr[2]);
        grad = vload2(0, bptr);
        result[0] = input[0] + grad * result[0];
        result[1] = input[1] + grad * result[1];
        result[2] = input[2] + grad * result[2];
        vstore2(result[0], 0, optr[0]);
        vstore2(result[1], 0, optr[1]);
        vstore2(result[2], 0, optr[2]);
        optr[0] += elements_per_row;
        optr[1] += elements_per_row;
        optr[2] += elements_per_row;
    }

    int start_idx = mad24(rows - 1, elements_per_row, gid);
    optr[0] = output + start_idx + out_offset / 4;
    optr[1] = output + start_idx + out_offset / 4 + rows * elements_per_row;
    optr[2] = output + start_idx + out_offset / 4 + 2 * rows * elements_per_row;
    bptr = buffer + start_idx + buffer_offset / 4;
    result[0] = result[1] = result[2] = (float2)0;
    for(int i = 0; i < rows; ++i, bptr -= elements_per_row)
    {
        input[0] = vload2(0, optr[0]);
        input[1] = vload2(0, optr[1]);
        input[2] = vload2(0, optr[2]);
        grad = vload2(0, bptr);
        result[0] = input[0] + grad * result[0];
        result[1] = input[1] + grad * result[1];
        result[2] = input[2] + grad * result[2];
        vstore2(gain * result[0], 0, optr[0]);
        vstore2(gain * result[1], 0, optr[1]);
        vstore2(gain * result[2], 0, optr[2]);
        optr[0] -= elements_per_row;
        optr[1] -= elements_per_row;
        optr[2] -= elements_per_row;
    }
}

//////////////// _adaptiveHorizontalCausalFilter_addInput ////////////////
kernel void adaptiveHorizontalCausalFilter_addInput(
    global const float * input,
    global const float * gradient,
    global float * output,
    const int cols,
    const int rows,
    const int elements_per_row,
    const int in_offset,
    const int grad_offset,
    const int out_offset
)
{
    int gid = get_global_id(0);
    if(gid >= rows)
    {
        return;
    }

    global const float * iptr =
        input + mad24(gid, elements_per_row, in_offset / 4);
    global const float * gptr =
        gradient + mad24(gid, elements_per_row, grad_offset / 4);
    global float * optr =
        output + mad24(gid, elements_per_row, out_offset / 4);

    float4 in_v4, grad_v4, out_v4, res_v4 = (float4)(0);
    for(int i = 0; i < cols / 4; ++i, iptr += 4, gptr += 4, optr += 4)
    {
        in_v4   = vload4(0, iptr);
        grad_v4 = vload4(0, gptr);

        res_v4.x = in_v4.x + grad_v4.x * res_v4.w;
        res_v4.y = in_v4.y + grad_v4.y * res_v4.x;
        res_v4.z = in_v4.z + grad_v4.z * res_v4.y;
        res_v4.w = in_v4.w + grad_v4.w * res_v4.z;

        vstore4(res_v4, 0, optr);
    }

    optr = output + mad24(gid + 1, elements_per_row, -4 + out_offset / 4);
    gptr = gradient + mad24(gid + 1, elements_per_row, -4 + grad_offset / 4);
    res_v4 = (float4)(0);

    for(int i = 0; i < cols / 4; ++i, gptr -= 4, optr -= 4)
    {
        grad_v4 = vload4(0, gptr);
        out_v4 = vload4(0, optr);

        res_v4.w = out_v4.w + grad_v4.w * res_v4.x;
        res_v4.z = out_v4.z + grad_v4.z * res_v4.w;
        res_v4.y = out_v4.y + grad_v4.y * res_v4.z;
        res_v4.x = out_v4.x + grad_v4.x * res_v4.y;

        vstore4(res_v4, 0, optr);
    }
}

//////////////// _localLuminanceAdaptation ////////////////
// FIXME:
//  This kernel seems to have precision problem on GPU
kernel void localLuminanceAdaptation(
    global const float * luma,
    global const float * input,
    global float * output,
    const int cols,
    const int rows,
    const int elements_per_row,
    const float _localLuminanceAddon,
    const float _localLuminanceFactor,
    const float _maxInputValue
)
{
    int gidx = get_global_id(0) * 4, gidy = get_global_id(1);
    if(gidx >= cols || gidy >= rows)
    {
        return;
    }
    int offset = mad24(gidy, elements_per_row, gidx);
    float4 luma_vec = vload4(0, luma + offset);
    float4 X0 = luma_vec * _localLuminanceFactor + _localLuminanceAddon;
    float4 input_val = vload4(0, input + offset);
    // output of the following line may be different between GPU and CPU
    float4 out_vec = (_maxInputValue + X0) * input_val / (input_val + X0 + 0.00000000001f);
    vstore4(out_vec, 0, output + offset);
}
// end of basicretinafilter
//------------------------------------------------------
/////////////////////////////////////////////////////////



/////////////////////////////////////////////////////////
//------------------------------------------------------
// magno
// TODO: this kernel has too many buffer accesses, better to make it
//   vector read/write for fetch efficiency
kernel void amacrineCellsComputing(
    global const float * opl_on,
    global const float * opl_off,
    global float * prev_in_on,
    global float * prev_in_off,
    global float * out_on,
    global float * out_off,
    const int cols,
    const int rows,
    const int elements_per_row,
    const float coeff
)
{
    int gidx = get_global_id(0) * 4, gidy = get_global_id(1);
    if(gidx >= cols || gidy >= rows)
    {
        return;
    }

    int offset = mad24(gidy, elements_per_row, gidx);
    opl_on      += offset;
    opl_off     += offset;
    prev_in_on  += offset;
    prev_in_off += offset;
    out_on      += offset;
    out_off     += offset;

    float4 val_opl_on = vload4(0, opl_on);
    float4 val_opl_off = vload4(0, opl_off);

    float4 magnoXonPixelResult = coeff * (vload4(0, out_on) + val_opl_on - vload4(0, prev_in_on));
    vstore4(fmax(magnoXonPixelResult, 0), 0, out_on);
    float4 magnoXoffPixelResult = coeff * (vload4(0, out_off) + val_opl_off - vload4(0, prev_in_off));
    vstore4(fmax(magnoXoffPixelResult, 0), 0, out_off);

    vstore4(val_opl_on, 0, prev_in_on);
    vstore4(val_opl_off, 0, prev_in_off);
}

/////////////////////////////////////////////////////////
//------------------------------------------------------
// parvo
// TODO: this kernel has too many buffer accesses, needs optimization
kernel void OPL_OnOffWaysComputing(
    global float4 * photo_out,
    global float4 * horiz_out,
    global float4 * bipol_on,
    global float4 * bipol_off,
    global float4 * parvo_on,
    global float4 * parvo_off,
    const int cols,
    const int rows,
    const int elements_per_row
)
{
    int gidx = get_global_id(0), gidy = get_global_id(1);
    if(gidx * 4 >= cols || gidy >= rows)
    {
        return;
    }
    // we assume elements_per_row must be multiples of 4
    int offset = mad24(gidy, elements_per_row >> 2, gidx);
    photo_out += offset;
    horiz_out += offset;
    bipol_on  += offset;
    bipol_off += offset;
    parvo_on  += offset;
    parvo_off += offset;

    float4 diff = *photo_out - *horiz_out;
    float4 isPositive = convert_float4(abs(diff > (float4)0.0f));
    float4 res_on  = isPositive * diff;
    float4 res_off = (isPositive - (float4)(1.0f)) * diff;

    *bipol_on = res_on;
    *parvo_on = res_on;

    *bipol_off = res_off;
    *parvo_off = res_off;
}

/////////////////////////////////////////////////////////
//------------------------------------------------------
// retinacolor
inline int bayerSampleOffset(int step, int rows, int x, int y)
{
    return mad24(y, step, x) +
           ((y % 2) + (x % 2)) * rows * step;
}


/////// colorMultiplexing //////
kernel void runColorMultiplexingBayer(
    global const float * input,
    global float * output,
    const int cols,
    const int rows,
    const int elements_per_row
)
{
    int gidx = get_global_id(0) * 4, gidy = get_global_id(1);
    if(gidx >= cols || gidy >= rows)
    {
        return;
    }

    int offset = mad24(gidy, elements_per_row, gidx);
    float4 val;
    val.x = input[bayerSampleOffset(elements_per_row, rows, gidx + 0, gidy)];
    val.y = input[bayerSampleOffset(elements_per_row, rows, gidx + 1, gidy)];
    val.z = input[bayerSampleOffset(elements_per_row, rows, gidx + 2, gidy)];
    val.w = input[bayerSampleOffset(elements_per_row, rows, gidx + 3, gidy)];
    vstore4(val, 0, output + offset);
}

kernel void runColorDemultiplexingBayer(
    global const float * input,
    global float * output,
    const int cols,
    const int rows,
    const int elements_per_row
)
{
    int gidx = get_global_id(0) * 4, gidy = get_global_id(1);
    if(gidx >= cols || gidy >= rows)
    {
        return;
    }

    int offset = mad24(gidy, elements_per_row, gidx);
    float4 val = vload4(0, input + offset);
    output[bayerSampleOffset(elements_per_row, rows, gidx + 0, gidy)] = val.x;
    output[bayerSampleOffset(elements_per_row, rows, gidx + 1, gidy)] = val.y;
    output[bayerSampleOffset(elements_per_row, rows, gidx + 2, gidy)] = val.z;
    output[bayerSampleOffset(elements_per_row, rows, gidx + 3, gidy)] = val.w;
}

kernel void demultiplexAssign(
    global const float * input,
    global float * output,
    const int cols,
    const int rows,
    const int elements_per_row
)
{
    int gidx = get_global_id(0), gidy = get_global_id(1);
    if(gidx >= cols || gidy >= rows)
    {
        return;
    }

    int offset = bayerSampleOffset(elements_per_row, rows, gidx, gidy);
    output[offset] = input[offset];
}


//// normalizeGrayOutputCentredSigmoide
kernel void normalizeGrayOutputCentredSigmoide(
    global const float * input,
    global float * output,
    const int cols,
    const int rows,
    const int elements_per_row,
    const float meanval,
    const float X0
)

{
    int gidx = get_global_id(0) * 4, gidy = get_global_id(1);
    if(gidx >= cols || gidy >= rows)
    {
        return;
    }
    int offset = mad24(gidy, elements_per_row, gidx);

    float4 input_val = vload4(0, input + offset);
    input_val =  meanval + (meanval + X0) * (input_val - meanval) / (fabs(input_val - meanval) + X0);
    vstore4(input_val, 0, output + offset);
}

//// normalize by photoreceptors density
kernel void normalizePhotoDensity(
    global const float * chroma,
    global const float * colorDensity,
    global const float * multiplex,
    global float * luma,
    global float * demultiplex,
    const int cols,
    const int rows,
    const int elements_per_row,
    const float pG
)
{
    const int gidx = get_global_id(0) * 4, gidy = get_global_id(1);
    if(gidx >= cols || gidy >= rows)
    {
        return;
    }
    const int offset = mad24(gidy, elements_per_row, gidx);
    int index = offset;

    float4 Cr = vload4(0, chroma + index) * vload4(0, colorDensity + index);
    index += elements_per_row * rows;
    float4 Cg = vload4(0, chroma + index) * vload4(0, colorDensity + index);
    index += elements_per_row * rows;
    float4 Cb = vload4(0, chroma + index) * vload4(0, colorDensity + index);

    const float4 luma_res = (Cr + Cg + Cb) * pG;
    vstore4(luma_res, 0, luma + offset);
    float4 res_v4 = vload4(0, multiplex + offset) - luma_res;
    demultiplex[bayerSampleOffset(elements_per_row, rows, gidx + 0, gidy)] = res_v4.x;
    demultiplex[bayerSampleOffset(elements_per_row, rows, gidx + 1, gidy)] = res_v4.y;
    demultiplex[bayerSampleOffset(elements_per_row, rows, gidx + 2, gidy)] = res_v4.z;
    demultiplex[bayerSampleOffset(elements_per_row, rows, gidx + 3, gidy)] = res_v4.w;
}



//////// computeGradient ///////
// TODO:
// this function maybe accelerated by image2d_t or lds
kernel void computeGradient(
    global const float * luma,
    global float * gradient,
    const int cols,
    const int rows,
    const int elements_per_row
)
{
    int gidx = get_global_id(0) + 2, gidy = get_global_id(1) + 2;
    if(gidx >= cols - 2 || gidy >= rows - 2)
    {
        return;
    }
    int offset = mad24(gidy, elements_per_row, gidx);
    luma += offset;

    // horizontal and vertical local gradients
    const float v_grad = fabs(luma[elements_per_row] - luma[- elements_per_row]);
    const float h_grad = fabs(luma[1] - luma[-1]);

    // neighborhood horizontal and vertical gradients
    const float cur_val  = luma[0];
    const float v_grad_p = fabs(cur_val - luma[- 2 * elements_per_row]);
    const float h_grad_p = fabs(cur_val - luma[- 2]);
    const float v_grad_n = fabs(cur_val - luma[2 * elements_per_row]);
    const float h_grad_n = fabs(cur_val - luma[2]);

    const float horiz_grad = 0.5f * h_grad + 0.25f * (h_grad_p + h_grad_n);
    const float verti_grad = 0.5f * v_grad + 0.25f * (v_grad_p + v_grad_n);
    const bool is_vertical_greater = (horiz_grad < verti_grad) &&
                                     ((verti_grad - horiz_grad) > 1e-5);

    gradient[offset + elements_per_row * rows] = is_vertical_greater ? 0.06f : 0.57f;
    gradient[offset                          ] = is_vertical_greater ? 0.57f : 0.06f;
}


/////// substractResidual ///////
kernel void substractResidual(
    global float * input,
    const int cols,
    const int rows,
    const int elements_per_row,
    const float pR,
    const float pG,
    const float pB
)
{
    const int gidx = get_global_id(0) * 4, gidy = get_global_id(1);
    if(gidx >= cols || gidy >= rows)
    {
        return;
    }
    int indices [3] =
    {
        mad24(gidy, elements_per_row, gidx),
        mad24(gidy + rows, elements_per_row, gidx),
        mad24(gidy + 2 * rows, elements_per_row, gidx)
    };
    float4 vals[3];
    vals[0] = vload4(0, input + indices[0]);
    vals[1] = vload4(0, input + indices[1]);
    vals[2] = vload4(0, input + indices[2]);

    float4 residu = pR * vals[0] + pG * vals[1] + pB * vals[2];
    vstore4(vals[0] - residu, 0, input + indices[0]);
    vstore4(vals[1] - residu, 0, input + indices[1]);
    vstore4(vals[2] - residu, 0, input + indices[2]);
}

///// clipRGBOutput_0_maxInputValue /////
kernel void clipRGBOutput_0_maxInputValue(
    global float * input,
    const int cols,
    const int rows,
    const int elements_per_row,
    const float maxVal
)
{
    const int gidx = get_global_id(0) * 4, gidy = get_global_id(1);
    if(gidx >= cols || gidy >= rows)
    {
        return;
    }
    const int offset = mad24(gidy, elements_per_row, gidx);
    float4 val = vload4(0, input + offset);
    val = clamp(val, 0.0f, maxVal);
    vstore4(val, 0, input + offset);
}

//// normalizeGrayOutputNearZeroCentreredSigmoide ////
kernel void normalizeGrayOutputNearZeroCentreredSigmoide(
    global float * input,
    global float * output,
    const int cols,
    const int rows,
    const int elements_per_row,
    const float maxVal,
    const float X0cube
)
{
    const int gidx = get_global_id(0) * 4, gidy = get_global_id(1);
    if(gidx >= cols || gidy >= rows)
    {
        return;
    }
    const int offset = mad24(gidy, elements_per_row, gidx);
    float4 currentCubeLuminance = vload4(0, input + offset);
    currentCubeLuminance = currentCubeLuminance * currentCubeLuminance * currentCubeLuminance;
    float4 val = currentCubeLuminance * X0cube / (X0cube + currentCubeLuminance);
    vstore4(val, 0, output + offset);
}

//// centerReductImageLuminance ////
kernel void centerReductImageLuminance(
    global float * input,
    const int cols,
    const int rows,
    const int elements_per_row,
    const float mean,
    const float std_dev
)
{
    const int gidx = get_global_id(0) * 4, gidy = get_global_id(1);
    if(gidx >= cols || gidy >= rows)
    {
        return;
    }
    const int offset = mad24(gidy, elements_per_row, gidx);

    float4 val = vload4(0, input + offset);
    val = (val - mean) / std_dev;
    vstore4(val, 0, input + offset);
}

//// inverseValue ////
kernel void inverseValue(
    global float * input,
    const int cols,
    const int rows,
    const int elements_per_row
)
{
    const int gidx = get_global_id(0) * 4, gidy = get_global_id(1);
    if(gidx >= cols || gidy >= rows)
    {
        return;
    }
    const int offset = mad24(gidy, elements_per_row, gidx);
    float4 val = vload4(0, input + offset);
    val = 1.f / val;
    vstore4(val, 0, input + offset);
}

#define CV_PI 3.1415926535897932384626433832795

//// _processRetinaParvoMagnoMapping ////
kernel void processRetinaParvoMagnoMapping(
    global float * parvo,
    global float * magno,
    global float * output,
    const int cols,
    const int rows,
    const int halfCols,
    const int halfRows,
    const int elements_per_row,
    const float minDistance
)
{
    const int gidx = get_global_id(0), gidy = get_global_id(1);
    if(gidx >= cols || gidy >= rows)
    {
        return;
    }
    const int offset = mad24(gidy, elements_per_row, gidx);

    float distanceToCenter =
        sqrt(((float)(gidy - halfRows) * (gidy - halfRows) + (gidx - halfCols) * (gidx - halfCols)));

    float a = distanceToCenter < minDistance ?
              (0.5f + 0.5f * (float)cos(CV_PI * distanceToCenter / minDistance)) : 0;
    float b = 1.f - a;

    output[offset] = parvo[offset] * a + magno[offset] * b;
}