retina_kernel.cl 24.2 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27
/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                           License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2010-2013, Multicoreware, Inc., all rights reserved.
// Copyright (C) 2010-2013, Advanced Micro Devices, Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// @Authors
//    Peng Xiao, pengxiao@multicorewareinc.com
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistribution's of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistribution's in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
28
//     and/or other materials provided with the distribution.
29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors as is and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/

//data (which is float) is aligend in 32 bytes
#define WIDTH_MULTIPLE (32 >> 2)

/////////////////////////////////////////////////////////
50
//------------------------------------------------------
51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77
// basicretinafilter
//////////////// _spatiotemporalLPfilter ////////////////
//_horizontalCausalFilter_addInput
kernel void horizontalCausalFilter_addInput(
    global const float * input,
    global float * output,
    const int cols,
    const int rows,
    const int elements_per_row,
    const int in_offset,
    const int out_offset,
    const float _tau,
    const float _a
)
{
    int gid = get_global_id(0);
    if(gid >= rows)
    {
        return;
    }

    global const float * iptr =
        input  + mad24(gid, elements_per_row, in_offset / 4);
    global float * optr =
        output + mad24(gid, elements_per_row, out_offset / 4);

    float res;
78
    float4 in_v4, out_v4, sum_v4, res_v4 = (float4)(0);
79 80 81 82
    //vectorize to increase throughput
    for(int i = 0; i < cols / 4; ++i, iptr += 4, optr += 4)
    {
        in_v4  = vload4(0, iptr);
83 84
        out_v4 = vload4(0, optr) * _tau;
        sum_v4 = in_v4 + out_v4;
85

86 87 88 89
        res_v4.x = sum_v4.x + _a * res_v4.w;
        res_v4.y = sum_v4.y + _a * res_v4.x;
        res_v4.z = sum_v4.z + _a * res_v4.y;
        res_v4.w = sum_v4.w + _a * res_v4.z;
90 91 92 93

        vstore4(res_v4, 0, optr);
    }

94 95 96
    optr = output + mad24(gid + 1, elements_per_row, -4 + out_offset / 4);
    res_v4 = (float4)(0);
    for(int i = 0; i < elements_per_row / 4; ++i, optr -= 4)
97 98 99 100
    {
        // shift left, `offset` is type `size_t` so it cannot be negative
        out_v4 = vload4(0, optr);

101 102 103 104
        res_v4.w = out_v4.w + _a * res_v4.x;
        res_v4.z = out_v4.z + _a * res_v4.w;
        res_v4.y = out_v4.y + _a * res_v4.z;
        res_v4.x = out_v4.x + _a * res_v4.y;
105

106
        vstore4(res_v4, 0, optr);
107 108 109 110 111 112 113 114 115 116
    }
}

//_verticalCausalFilter
kernel void verticalCausalFilter(
    global float * output,
    const int cols,
    const int rows,
    const int elements_per_row,
    const int out_offset,
117 118
    const float _a,
    const float _gain
119 120
)
{
121
    int gid = get_global_id(0) * 2;
122 123 124 125 126 127
    if(gid >= cols)
    {
        return;
    }

    global float * optr = output + gid + out_offset / 4;
128 129
    float2 input;
    float2 result = (float2)0;
130 131
    for(int i = 0; i < rows; ++i, optr += elements_per_row)
    {
132 133 134 135 136 137 138 139 140 141 142 143
        input = vload2(0, optr);
        result = input + _a * result;
        vstore2(result, 0, optr);
    }

    optr = output + (rows - 1) * elements_per_row + gid + out_offset / 4;
    result = (float2)0;
    for(int i = 0; i < rows; ++i, optr -= elements_per_row)
    {
        input = vload2(0, optr);
        result = input + _a * result;
        vstore2(_gain * result, 0, optr);
144 145 146
    }
}

147
kernel void verticalCausalFilter_multichannel(
148 149 150 151 152 153 154 155 156
    global float * output,
    const int cols,
    const int rows,
    const int elements_per_row,
    const int out_offset,
    const float _a,
    const float _gain
)
{
157
    int gid = get_global_id(0) * 2;
158 159 160 161 162
    if(gid >= cols)
    {
        return;
    }

163 164 165
    global float * optr[3];
    float2 input[3];
    float2 result[3] = { (float2)0, (float2)0, (float2)0 };
166

167 168 169
    optr[0] = output + gid + out_offset / 4;
    optr[1] = output + gid + out_offset / 4 + rows * elements_per_row;
    optr[2] = output + gid + out_offset / 4 + 2 * rows * elements_per_row;
170

171
    for(int i = 0; i < rows; ++i)
172
    {
173 174 175 176 177 178 179 180 181 182 183 184 185 186 187
        input[0] = vload2(0, optr[0]);
        input[1] = vload2(0, optr[1]);
        input[2] = vload2(0, optr[2]);

        result[0] = input[0] + _a * result[0];
        result[1] = input[1] + _a * result[1];
        result[2] = input[2] + _a * result[2];

        vstore2(result[0], 0, optr[0]);
        vstore2(result[1], 0, optr[1]);
        vstore2(result[2], 0, optr[2]);

        optr[0] += elements_per_row;
        optr[1] += elements_per_row;
        optr[2] += elements_per_row;
188
    }
189 190 191 192 193 194 195

    optr[0] = output + (rows - 1) * elements_per_row + gid + out_offset / 4;
    optr[1] = output + (rows - 1) * elements_per_row + gid + out_offset / 4 + rows * elements_per_row;
    optr[2] = output + (rows - 1) * elements_per_row + gid + out_offset / 4 + 2 * rows * elements_per_row;
    result[0] = result[1] = result[2] = (float2)0;

    for(int i = 0; i < rows; ++i)
196
    {
197 198 199
        input[0] = vload2(0, optr[0]);
        input[1] = vload2(0, optr[1]);
        input[2] = vload2(0, optr[2]);
200

201 202 203
        result[0] = input[0] + _a * result[0];
        result[1] = input[1] + _a * result[1];
        result[2] = input[2] + _a * result[2];
204

205 206 207 208 209 210 211
        vstore2(_gain * result[0], 0, optr[0]);
        vstore2(_gain * result[1], 0, optr[1]);
        vstore2(_gain * result[2], 0, optr[2]);

        optr[0] -= elements_per_row;
        optr[1] -= elements_per_row;
        optr[2] -= elements_per_row;
212 213 214
    }
}

215 216 217 218 219
//
// end of _spatiotemporalLPfilter
/////////////////////////////////////////////////////////////////////

//////////////// verticalCausalFilter_Irregular ////////////////
220 221 222 223 224 225 226 227
//////////////// verticalCausalFilter_Irregular ////////////////
kernel void verticalCausalFilter_Irregular(
    global float * output,
    global float * buffer,
    const int cols,
    const int rows,
    const int elements_per_row,
    const int out_offset,
228 229
    const int buffer_offset,
    const float gain
230 231
)
{
232
    int gid = get_global_id(0) * 2;
233 234 235 236 237
    if(gid >= cols)
    {
        return;
    }

238
    global float * optr[3];
239
    global float * bptr = buffer + gid + buffer_offset / 4;
240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282
    float2 result[3] = { (float2)0, (float2)0, (float2)0 };
    float2 grad, input[3];
    optr[0] = output + gid + out_offset / 4;
    optr[1] = output + gid + out_offset / 4 + rows * elements_per_row;
    optr[2] = output + gid + out_offset / 4 + 2 * rows * elements_per_row;
    for(int i = 0; i < rows; ++i, bptr += elements_per_row)
    {
        input[0] = vload2(0, optr[0]);
        input[1] = vload2(0, optr[1]);
        input[2] = vload2(0, optr[2]);
        grad = vload2(0, bptr);
        result[0] = input[0] + grad * result[0];
        result[1] = input[1] + grad * result[1];
        result[2] = input[2] + grad * result[2];
        vstore2(result[0], 0, optr[0]);
        vstore2(result[1], 0, optr[1]);
        vstore2(result[2], 0, optr[2]);
        optr[0] += elements_per_row;
        optr[1] += elements_per_row;
        optr[2] += elements_per_row;
    }

    int start_idx = mad24(rows - 1, elements_per_row, gid);
    optr[0] = output + start_idx + out_offset / 4;
    optr[1] = output + start_idx + out_offset / 4 + rows * elements_per_row;
    optr[2] = output + start_idx + out_offset / 4 + 2 * rows * elements_per_row;
    bptr = buffer + start_idx + buffer_offset / 4;
    result[0] = result[1] = result[2] = (float2)0;
    for(int i = 0; i < rows; ++i, bptr -= elements_per_row)
    {
        input[0] = vload2(0, optr[0]);
        input[1] = vload2(0, optr[1]);
        input[2] = vload2(0, optr[2]);
        grad = vload2(0, bptr);
        result[0] = input[0] + grad * result[0];
        result[1] = input[1] + grad * result[1];
        result[2] = input[2] + grad * result[2];
        vstore2(gain * result[0], 0, optr[0]);
        vstore2(gain * result[1], 0, optr[1]);
        vstore2(gain * result[2], 0, optr[2]);
        optr[0] -= elements_per_row;
        optr[1] -= elements_per_row;
        optr[2] -= elements_per_row;
283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325
    }
}

//////////////// _adaptiveHorizontalCausalFilter_addInput ////////////////
kernel void adaptiveHorizontalCausalFilter_addInput(
    global const float * input,
    global const float * gradient,
    global float * output,
    const int cols,
    const int rows,
    const int elements_per_row,
    const int in_offset,
    const int grad_offset,
    const int out_offset
)
{
    int gid = get_global_id(0);
    if(gid >= rows)
    {
        return;
    }

    global const float * iptr =
        input + mad24(gid, elements_per_row, in_offset / 4);
    global const float * gptr =
        gradient + mad24(gid, elements_per_row, grad_offset / 4);
    global float * optr =
        output + mad24(gid, elements_per_row, out_offset / 4);

    float4 in_v4, grad_v4, out_v4, res_v4 = (float4)(0);
    for(int i = 0; i < cols / 4; ++i, iptr += 4, gptr += 4, optr += 4)
    {
        in_v4   = vload4(0, iptr);
        grad_v4 = vload4(0, gptr);

        res_v4.x = in_v4.x + grad_v4.x * res_v4.w;
        res_v4.y = in_v4.y + grad_v4.y * res_v4.x;
        res_v4.z = in_v4.z + grad_v4.z * res_v4.y;
        res_v4.w = in_v4.w + grad_v4.w * res_v4.z;

        vstore4(res_v4, 0, optr);
    }

326 327 328
    optr = output + mad24(gid + 1, elements_per_row, -4 + out_offset / 4);
    gptr = gradient + mad24(gid + 1, elements_per_row, -4 + grad_offset / 4);
    res_v4 = (float4)(0);
329

330 331 332 333
    for(int i = 0; i < cols / 4; ++i, gptr -= 4, optr -= 4)
    {
        grad_v4 = vload4(0, gptr);
        out_v4 = vload4(0, optr);
334

335 336 337 338
        res_v4.w = out_v4.w + grad_v4.w * res_v4.x;
        res_v4.z = out_v4.z + grad_v4.z * res_v4.w;
        res_v4.y = out_v4.y + grad_v4.y * res_v4.z;
        res_v4.x = out_v4.x + grad_v4.x * res_v4.y;
339

340
        vstore4(res_v4, 0, optr);
341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358
    }
}

//////////////// _localLuminanceAdaptation ////////////////
// FIXME:
//  This kernel seems to have precision problem on GPU
kernel void localLuminanceAdaptation(
    global const float * luma,
    global const float * input,
    global float * output,
    const int cols,
    const int rows,
    const int elements_per_row,
    const float _localLuminanceAddon,
    const float _localLuminanceFactor,
    const float _maxInputValue
)
{
359
    int gidx = get_global_id(0) * 4, gidy = get_global_id(1);
360 361 362 363 364
    if(gidx >= cols || gidy >= rows)
    {
        return;
    }
    int offset = mad24(gidy, elements_per_row, gidx);
365 366 367
    float4 luma_vec = vload4(0, luma + offset);
    float4 X0 = luma_vec * _localLuminanceFactor + _localLuminanceAddon;
    float4 input_val = vload4(0, input + offset);
368
    // output of the following line may be different between GPU and CPU
369 370
    float4 out_vec = (_maxInputValue + X0) * input_val / (input_val + X0 + 0.00000000001f);
    vstore4(out_vec, 0, output + offset);
371 372
}
// end of basicretinafilter
373
//------------------------------------------------------
374 375 376 377 378
/////////////////////////////////////////////////////////



/////////////////////////////////////////////////////////
379
//------------------------------------------------------
380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395
// magno
// TODO: this kernel has too many buffer accesses, better to make it
//   vector read/write for fetch efficiency
kernel void amacrineCellsComputing(
    global const float * opl_on,
    global const float * opl_off,
    global float * prev_in_on,
    global float * prev_in_off,
    global float * out_on,
    global float * out_off,
    const int cols,
    const int rows,
    const int elements_per_row,
    const float coeff
)
{
396
    int gidx = get_global_id(0) * 4, gidy = get_global_id(1);
397 398 399 400 401 402 403 404 405 406 407 408 409
    if(gidx >= cols || gidy >= rows)
    {
        return;
    }

    int offset = mad24(gidy, elements_per_row, gidx);
    opl_on      += offset;
    opl_off     += offset;
    prev_in_on  += offset;
    prev_in_off += offset;
    out_on      += offset;
    out_off     += offset;

410 411
    float4 val_opl_on = vload4(0, opl_on);
    float4 val_opl_off = vload4(0, opl_off);
412

413 414 415 416 417 418 419
    float4 magnoXonPixelResult = coeff * (vload4(0, out_on) + val_opl_on - vload4(0, prev_in_on));
    vstore4(fmax(magnoXonPixelResult, 0), 0, out_on);
    float4 magnoXoffPixelResult = coeff * (vload4(0, out_off) + val_opl_off - vload4(0, prev_in_off));
    vstore4(fmax(magnoXoffPixelResult, 0), 0, out_off);

    vstore4(val_opl_on, 0, prev_in_on);
    vstore4(val_opl_off, 0, prev_in_off);
420 421 422
}

/////////////////////////////////////////////////////////
423
//------------------------------------------------------
424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452
// parvo
// TODO: this kernel has too many buffer accesses, needs optimization
kernel void OPL_OnOffWaysComputing(
    global float4 * photo_out,
    global float4 * horiz_out,
    global float4 * bipol_on,
    global float4 * bipol_off,
    global float4 * parvo_on,
    global float4 * parvo_off,
    const int cols,
    const int rows,
    const int elements_per_row
)
{
    int gidx = get_global_id(0), gidy = get_global_id(1);
    if(gidx * 4 >= cols || gidy >= rows)
    {
        return;
    }
    // we assume elements_per_row must be multiples of 4
    int offset = mad24(gidy, elements_per_row >> 2, gidx);
    photo_out += offset;
    horiz_out += offset;
    bipol_on  += offset;
    bipol_off += offset;
    parvo_on  += offset;
    parvo_off += offset;

    float4 diff = *photo_out - *horiz_out;
453
    float4 isPositive = convert_float4(abs(diff > (float4)0.0f));
454 455 456 457 458 459 460 461 462 463 464
    float4 res_on  = isPositive * diff;
    float4 res_off = (isPositive - (float4)(1.0f)) * diff;

    *bipol_on = res_on;
    *parvo_on = res_on;

    *bipol_off = res_off;
    *parvo_off = res_off;
}

/////////////////////////////////////////////////////////
465
//------------------------------------------------------
466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482
// retinacolor
inline int bayerSampleOffset(int step, int rows, int x, int y)
{
    return mad24(y, step, x) +
           ((y % 2) + (x % 2)) * rows * step;
}


/////// colorMultiplexing //////
kernel void runColorMultiplexingBayer(
    global const float * input,
    global float * output,
    const int cols,
    const int rows,
    const int elements_per_row
)
{
483
    int gidx = get_global_id(0) * 4, gidy = get_global_id(1);
484 485 486 487 488 489
    if(gidx >= cols || gidy >= rows)
    {
        return;
    }

    int offset = mad24(gidy, elements_per_row, gidx);
490 491 492 493 494 495
    float4 val;
    val.x = input[bayerSampleOffset(elements_per_row, rows, gidx + 0, gidy)];
    val.y = input[bayerSampleOffset(elements_per_row, rows, gidx + 1, gidy)];
    val.z = input[bayerSampleOffset(elements_per_row, rows, gidx + 2, gidy)];
    val.w = input[bayerSampleOffset(elements_per_row, rows, gidx + 3, gidy)];
    vstore4(val, 0, output + offset);
496 497 498 499 500 501 502 503 504 505
}

kernel void runColorDemultiplexingBayer(
    global const float * input,
    global float * output,
    const int cols,
    const int rows,
    const int elements_per_row
)
{
506
    int gidx = get_global_id(0) * 4, gidy = get_global_id(1);
507 508 509 510 511 512
    if(gidx >= cols || gidy >= rows)
    {
        return;
    }

    int offset = mad24(gidy, elements_per_row, gidx);
513 514 515 516 517
    float4 val = vload4(0, input + offset);
    output[bayerSampleOffset(elements_per_row, rows, gidx + 0, gidy)] = val.x;
    output[bayerSampleOffset(elements_per_row, rows, gidx + 1, gidy)] = val.y;
    output[bayerSampleOffset(elements_per_row, rows, gidx + 2, gidy)] = val.z;
    output[bayerSampleOffset(elements_per_row, rows, gidx + 3, gidy)] = val.w;
518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550
}

kernel void demultiplexAssign(
    global const float * input,
    global float * output,
    const int cols,
    const int rows,
    const int elements_per_row
)
{
    int gidx = get_global_id(0), gidy = get_global_id(1);
    if(gidx >= cols || gidy >= rows)
    {
        return;
    }

    int offset = bayerSampleOffset(elements_per_row, rows, gidx, gidy);
    output[offset] = input[offset];
}


//// normalizeGrayOutputCentredSigmoide
kernel void normalizeGrayOutputCentredSigmoide(
    global const float * input,
    global float * output,
    const int cols,
    const int rows,
    const int elements_per_row,
    const float meanval,
    const float X0
)

{
551
    int gidx = get_global_id(0) * 4, gidy = get_global_id(1);
552 553 554 555 556 557
    if(gidx >= cols || gidy >= rows)
    {
        return;
    }
    int offset = mad24(gidy, elements_per_row, gidx);

558 559 560
    float4 input_val = vload4(0, input + offset);
    input_val =  meanval + (meanval + X0) * (input_val - meanval) / (fabs(input_val - meanval) + X0);
    vstore4(input_val, 0, output + offset);
561 562 563 564 565 566 567 568 569 570 571 572 573 574 575
}

//// normalize by photoreceptors density
kernel void normalizePhotoDensity(
    global const float * chroma,
    global const float * colorDensity,
    global const float * multiplex,
    global float * luma,
    global float * demultiplex,
    const int cols,
    const int rows,
    const int elements_per_row,
    const float pG
)
{
576
    const int gidx = get_global_id(0) * 4, gidy = get_global_id(1);
577 578 579 580 581 582 583
    if(gidx >= cols || gidy >= rows)
    {
        return;
    }
    const int offset = mad24(gidy, elements_per_row, gidx);
    int index = offset;

584
    float4 Cr = vload4(0, chroma + index) * vload4(0, colorDensity + index);
585
    index += elements_per_row * rows;
586
    float4 Cg = vload4(0, chroma + index) * vload4(0, colorDensity + index);
587
    index += elements_per_row * rows;
588 589 590 591 592 593 594 595 596
    float4 Cb = vload4(0, chroma + index) * vload4(0, colorDensity + index);

    const float4 luma_res = (Cr + Cg + Cb) * pG;
    vstore4(luma_res, 0, luma + offset);
    float4 res_v4 = vload4(0, multiplex + offset) - luma_res;
    demultiplex[bayerSampleOffset(elements_per_row, rows, gidx + 0, gidy)] = res_v4.x;
    demultiplex[bayerSampleOffset(elements_per_row, rows, gidx + 1, gidy)] = res_v4.y;
    demultiplex[bayerSampleOffset(elements_per_row, rows, gidx + 2, gidy)] = res_v4.z;
    demultiplex[bayerSampleOffset(elements_per_row, rows, gidx + 3, gidy)] = res_v4.w;
597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632
}



//////// computeGradient ///////
// TODO:
// this function maybe accelerated by image2d_t or lds
kernel void computeGradient(
    global const float * luma,
    global float * gradient,
    const int cols,
    const int rows,
    const int elements_per_row
)
{
    int gidx = get_global_id(0) + 2, gidy = get_global_id(1) + 2;
    if(gidx >= cols - 2 || gidy >= rows - 2)
    {
        return;
    }
    int offset = mad24(gidy, elements_per_row, gidx);
    luma += offset;

    // horizontal and vertical local gradients
    const float v_grad = fabs(luma[elements_per_row] - luma[- elements_per_row]);
    const float h_grad = fabs(luma[1] - luma[-1]);

    // neighborhood horizontal and vertical gradients
    const float cur_val  = luma[0];
    const float v_grad_p = fabs(cur_val - luma[- 2 * elements_per_row]);
    const float h_grad_p = fabs(cur_val - luma[- 2]);
    const float v_grad_n = fabs(cur_val - luma[2 * elements_per_row]);
    const float h_grad_n = fabs(cur_val - luma[2]);

    const float horiz_grad = 0.5f * h_grad + 0.25f * (h_grad_p + h_grad_n);
    const float verti_grad = 0.5f * v_grad + 0.25f * (v_grad_p + v_grad_n);
633 634
    const bool is_vertical_greater = (horiz_grad < verti_grad) &&
                                     ((verti_grad - horiz_grad) > 1e-5);
635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651

    gradient[offset + elements_per_row * rows] = is_vertical_greater ? 0.06f : 0.57f;
    gradient[offset                          ] = is_vertical_greater ? 0.57f : 0.06f;
}


/////// substractResidual ///////
kernel void substractResidual(
    global float * input,
    const int cols,
    const int rows,
    const int elements_per_row,
    const float pR,
    const float pG,
    const float pB
)
{
652
    const int gidx = get_global_id(0) * 4, gidy = get_global_id(1);
653 654 655 656 657 658 659 660 661 662
    if(gidx >= cols || gidy >= rows)
    {
        return;
    }
    int indices [3] =
    {
        mad24(gidy, elements_per_row, gidx),
        mad24(gidy + rows, elements_per_row, gidx),
        mad24(gidy + 2 * rows, elements_per_row, gidx)
    };
663 664 665 666 667 668 669 670 671
    float4 vals[3];
    vals[0] = vload4(0, input + indices[0]);
    vals[1] = vload4(0, input + indices[1]);
    vals[2] = vload4(0, input + indices[2]);

    float4 residu = pR * vals[0] + pG * vals[1] + pB * vals[2];
    vstore4(vals[0] - residu, 0, input + indices[0]);
    vstore4(vals[1] - residu, 0, input + indices[1]);
    vstore4(vals[2] - residu, 0, input + indices[2]);
672 673 674 675 676 677 678 679 680 681 682
}

///// clipRGBOutput_0_maxInputValue /////
kernel void clipRGBOutput_0_maxInputValue(
    global float * input,
    const int cols,
    const int rows,
    const int elements_per_row,
    const float maxVal
)
{
683
    const int gidx = get_global_id(0) * 4, gidy = get_global_id(1);
684 685 686 687 688
    if(gidx >= cols || gidy >= rows)
    {
        return;
    }
    const int offset = mad24(gidy, elements_per_row, gidx);
689
    float4 val = vload4(0, input + offset);
690
    val = clamp(val, 0.0f, maxVal);
691
    vstore4(val, 0, input + offset);
692 693 694 695 696 697 698 699 700 701 702 703 704
}

//// normalizeGrayOutputNearZeroCentreredSigmoide ////
kernel void normalizeGrayOutputNearZeroCentreredSigmoide(
    global float * input,
    global float * output,
    const int cols,
    const int rows,
    const int elements_per_row,
    const float maxVal,
    const float X0cube
)
{
705
    const int gidx = get_global_id(0) * 4, gidy = get_global_id(1);
706 707 708 709 710
    if(gidx >= cols || gidy >= rows)
    {
        return;
    }
    const int offset = mad24(gidy, elements_per_row, gidx);
711
    float4 currentCubeLuminance = vload4(0, input + offset);
712
    currentCubeLuminance = currentCubeLuminance * currentCubeLuminance * currentCubeLuminance;
713 714
    float4 val = currentCubeLuminance * X0cube / (X0cube + currentCubeLuminance);
    vstore4(val, 0, output + offset);
715 716 717 718 719 720 721 722 723 724 725 726
}

//// centerReductImageLuminance ////
kernel void centerReductImageLuminance(
    global float * input,
    const int cols,
    const int rows,
    const int elements_per_row,
    const float mean,
    const float std_dev
)
{
727
    const int gidx = get_global_id(0) * 4, gidy = get_global_id(1);
728 729 730 731 732 733
    if(gidx >= cols || gidy >= rows)
    {
        return;
    }
    const int offset = mad24(gidy, elements_per_row, gidx);

734 735 736
    float4 val = vload4(0, input + offset);
    val = (val - mean) / std_dev;
    vstore4(val, 0, input + offset);
737 738 739 740 741 742 743 744 745 746
}

//// inverseValue ////
kernel void inverseValue(
    global float * input,
    const int cols,
    const int rows,
    const int elements_per_row
)
{
747
    const int gidx = get_global_id(0) * 4, gidy = get_global_id(1);
748 749 750 751 752
    if(gidx >= cols || gidy >= rows)
    {
        return;
    }
    const int offset = mad24(gidy, elements_per_row, gidx);
753 754 755
    float4 val = vload4(0, input + offset);
    val = 1.f / val;
    vstore4(val, 0, input + offset);
756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788
}

#define CV_PI 3.1415926535897932384626433832795

//// _processRetinaParvoMagnoMapping ////
kernel void processRetinaParvoMagnoMapping(
    global float * parvo,
    global float * magno,
    global float * output,
    const int cols,
    const int rows,
    const int halfCols,
    const int halfRows,
    const int elements_per_row,
    const float minDistance
)
{
    const int gidx = get_global_id(0), gidy = get_global_id(1);
    if(gidx >= cols || gidy >= rows)
    {
        return;
    }
    const int offset = mad24(gidy, elements_per_row, gidx);

    float distanceToCenter =
        sqrt(((float)(gidy - halfRows) * (gidy - halfRows) + (gidx - halfCols) * (gidx - halfCols)));

    float a = distanceToCenter < minDistance ?
              (0.5f + 0.5f * (float)cos(CV_PI * distanceToCenter / minDistance)) : 0;
    float b = 1.f - a;

    output[offset] = parvo[offset] * a + magno[offset] * b;
}