/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                           License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// @Authors
//    Jia Haipeng, jiahaipeng95@gmail.com
//    Peng Xiao, pengxiao@multicorewareinc.com
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistribution's of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistribution's in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors as is and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/

/**************************************PUBLICFUNC*************************************/

#if depth == 0
    #define DATA_TYPE uchar
    #define MAX_NUM  255
    #define HALF_MAX_NUM 128
    #define COEFF_TYPE int
    #define SAT_CAST(num) convert_uchar_sat(num)
    #define DEPTH_0
#elif depth == 2
    #define DATA_TYPE ushort
    #define MAX_NUM  65535
    #define HALF_MAX_NUM 32768
    #define COEFF_TYPE int
    #define SAT_CAST(num) convert_ushort_sat(num)
    #define DEPTH_2
#elif depth == 5
    #define DATA_TYPE float
    #define MAX_NUM  1.0f
    #define HALF_MAX_NUM 0.5f
    #define COEFF_TYPE float
    #define SAT_CAST(num) (num)
    #define DEPTH_5
#else
    #error "invalid depth: should be 0 (CV_8U), 2 (CV_16U) or 5 (CV_32F)"
#endif

#define CV_DESCALE(x,n) (((x) + (1 << ((n)-1))) >> (n))

enum
{
    yuv_shift  = 14,
    xyz_shift  = 12,
    hsv_shift  = 12,
    R2Y        = 4899,
    G2Y        = 9617,
    B2Y        = 1868,
    BLOCK_SIZE = 256
};

//constants for conversion from/to RGB and Gray, YUV, YCrCb according to BT.601
#define B2YF 0.114f
#define G2YF 0.587f
#define R2YF 0.299f
//to YCbCr
#define YCBF 0.564f
#define YCRF 0.713f
#define YCBI 9241
#define YCRI 11682
//to YUV
#define B2UF 0.492f
#define R2VF 0.877f
#define B2UI 8061
#define R2VI 14369
//from YUV
#define U2BF 2.032f
#define U2GF -0.395f
#define V2GF -0.581f
#define V2RF 1.140f
#define U2BI 33292
#define U2GI -6472
#define V2GI -9519
#define V2RI 18678
//from YCrCb
#define CR2RF 1.403f
#define CB2GF -0.344f
#define CR2GF -0.714f
#define CB2BF 1.773f
#define CR2RI 22987
#define CB2GI -5636
#define CR2GI -11698
#define CB2BI 29049

#define scnbytes ((int)sizeof(DATA_TYPE)*scn)
#define dcnbytes ((int)sizeof(DATA_TYPE)*dcn)

#ifndef hscale
#define hscale 0
#endif

#ifndef hrange
#define hrange 0
#endif

#if bidx == 0
#define R_COMP z
#define G_COMP y
#define B_COMP x
#else
#define R_COMP x
#define G_COMP y
#define B_COMP z
#endif

#ifndef uidx
#define uidx 0
#endif

#ifndef yidx
#define yidx 0
#endif

#ifndef PIX_PER_WI_X
#define PIX_PER_WI_X 1
#endif

#define __CAT(x, y) x##y
#define CAT(x, y) __CAT(x, y)

#define DATA_TYPE_4 CAT(DATA_TYPE, 4)
#define DATA_TYPE_3 CAT(DATA_TYPE, 3)

///////////////////////////////////// RGB <-> GRAY //////////////////////////////////////

__kernel void RGB2Gray(__global const uchar * srcptr, int src_step, int src_offset,
                       __global uchar * dstptr, int dst_step, int dst_offset,
                       int rows, int cols)
{
    int x = get_global_id(0);
    int y = get_global_id(1) * PIX_PER_WI_Y;

    if (x < cols)
    {
        int src_index = mad24(y, src_step, mad24(x, scnbytes, src_offset));
        int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dst_offset));

        #pragma unroll
        for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
        {
            if (y < rows)
            {
                __global const DATA_TYPE* src = (__global const DATA_TYPE*)(srcptr + src_index);
                __global DATA_TYPE* dst = (__global DATA_TYPE*)(dstptr + dst_index);
                DATA_TYPE_3 src_pix = vload3(0, src);
#ifdef DEPTH_5
                dst[0] = fma(src_pix.B_COMP, B2YF, fma(src_pix.G_COMP, G2YF, src_pix.R_COMP * R2YF));
#else
                dst[0] = (DATA_TYPE)CV_DESCALE(mad24(src_pix.B_COMP, B2Y, mad24(src_pix.G_COMP, G2Y, mul24(src_pix.R_COMP, R2Y))), yuv_shift);
#endif
                ++y;
                src_index += src_step;
                dst_index += dst_step;
            }
        }
    }
}

__kernel void Gray2RGB(__global const uchar * srcptr, int src_step, int src_offset,
                       __global uchar * dstptr, int dst_step, int dst_offset,
                       int rows, int cols)
{
    int x = get_global_id(0);
    int y = get_global_id(1) * PIX_PER_WI_Y;

    if (x < cols)
    {
        int src_index = mad24(y, src_step, mad24(x, scnbytes, src_offset));
        int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dst_offset));

        #pragma unroll
        for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
        {
            if (y < rows)
            {
                __global const DATA_TYPE* src = (__global const DATA_TYPE*)(srcptr + src_index);
                __global DATA_TYPE* dst = (__global DATA_TYPE*)(dstptr + dst_index);
                DATA_TYPE val = src[0];
#if dcn == 3 || defined DEPTH_5
                dst[0] = dst[1] = dst[2] = val;
#if dcn == 4
                dst[3] = MAX_NUM;
#endif
#else
                *(__global DATA_TYPE_4 *)dst = (DATA_TYPE_4)(val, val, val, MAX_NUM);
#endif
                ++y;
                dst_index += dst_step;
                src_index += src_step;
            }
        }
    }
}

///////////////////////////////////// RGB <-> YUV //////////////////////////////////////

__constant float c_RGB2YUVCoeffs_f[5]  = { B2YF, G2YF, R2YF, B2UF, R2VF };
__constant int   c_RGB2YUVCoeffs_i[5]  = { B2Y, G2Y, R2Y, B2UI, R2VI };

__kernel void RGB2YUV(__global const uchar* srcptr, int src_step, int src_offset,
                      __global uchar* dstptr, int dst_step, int dt_offset,
                      int rows, int cols)
{
    int x = get_global_id(0);
    int y = get_global_id(1) * PIX_PER_WI_Y;

    if (x < cols)
    {
        int src_index = mad24(y, src_step, mad24(x, scnbytes, src_offset));
        int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dt_offset));

        #pragma unroll
        for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
        {
            if (y < rows)
            {
                __global const DATA_TYPE* src = (__global const DATA_TYPE*)(srcptr + src_index);
                __global DATA_TYPE* dst = (__global DATA_TYPE*)(dstptr + dst_index);
                DATA_TYPE_3 src_pix = vload3(0, src);
                DATA_TYPE b = src_pix.B_COMP, g = src_pix.G_COMP, r = src_pix.R_COMP;

#ifdef DEPTH_5
                __constant float * coeffs = c_RGB2YUVCoeffs_f;
                const DATA_TYPE Y = fma(b, coeffs[0], fma(g, coeffs[1], r * coeffs[2]));
                const DATA_TYPE U = fma(b - Y, coeffs[3], HALF_MAX_NUM);
                const DATA_TYPE V = fma(r - Y, coeffs[4], HALF_MAX_NUM);
#else
                __constant int * coeffs = c_RGB2YUVCoeffs_i;
                const int delta = HALF_MAX_NUM * (1 << yuv_shift);
                const int Y = CV_DESCALE(mad24(b, coeffs[0], mad24(g, coeffs[1], mul24(r, coeffs[2]))), yuv_shift);
                const int U = CV_DESCALE(mad24(b - Y, coeffs[3], delta), yuv_shift);
                const int V = CV_DESCALE(mad24(r - Y, coeffs[4], delta), yuv_shift);
#endif

                dst[0] = SAT_CAST( Y );
                dst[1] = SAT_CAST( U );
                dst[2] = SAT_CAST( V );

                ++y;
                dst_index += dst_step;
                src_index += src_step;
            }
        }
    }
}

__constant float c_YUV2RGBCoeffs_f[4] = { U2BF, U2GF, V2GF, V2RF };
__constant int   c_YUV2RGBCoeffs_i[4] = { U2BI, U2GI, V2GI, V2RI };

__kernel void YUV2RGB(__global const uchar* srcptr, int src_step, int src_offset,
                      __global uchar* dstptr, int dst_step, int dt_offset,
                      int rows, int cols)
{
    int x = get_global_id(0);
    int y = get_global_id(1) * PIX_PER_WI_Y;

    if (x < cols)
    {
        int src_index = mad24(y, src_step, mad24(x, scnbytes, src_offset));
        int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dt_offset));

        #pragma unroll
        for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
        {
            if (y < rows)
            {
                __global const DATA_TYPE* src = (__global const DATA_TYPE*)(srcptr + src_index);
                __global DATA_TYPE* dst = (__global DATA_TYPE*)(dstptr + dst_index);
                DATA_TYPE_4 src_pix = vload4(0, src);
                DATA_TYPE Y = src_pix.x, U = src_pix.y, V = src_pix.z;

#ifdef DEPTH_5
                __constant float * coeffs = c_YUV2RGBCoeffs_f;
                float r = fma(V - HALF_MAX_NUM, coeffs[3], Y);
                float g = fma(V - HALF_MAX_NUM, coeffs[2], fma(U - HALF_MAX_NUM, coeffs[1], Y));
                float b = fma(U - HALF_MAX_NUM, coeffs[0], Y);
#else
                __constant int * coeffs = c_YUV2RGBCoeffs_i;
                const int r = Y + CV_DESCALE(mul24(V - HALF_MAX_NUM, coeffs[3]), yuv_shift);
                const int g = Y + CV_DESCALE(mad24(V - HALF_MAX_NUM, coeffs[2], mul24(U - HALF_MAX_NUM, coeffs[1])), yuv_shift);
                const int b = Y + CV_DESCALE(mul24(U - HALF_MAX_NUM, coeffs[0]), yuv_shift);
#endif

                dst[bidx] = SAT_CAST( b );
                dst[1] = SAT_CAST( g );
                dst[bidx^2] = SAT_CAST( r );
#if dcn == 4
                dst[3] = MAX_NUM;
#endif
                ++y;
                dst_index += dst_step;
                src_index += src_step;
            }
        }
    }
}
__constant float c_YUV2RGBCoeffs_420[5] = { 1.163999557f, 2.017999649f, -0.390999794f,
                                            -0.812999725f, 1.5959997177f };

__kernel void YUV2RGB_NVx(__global const uchar* srcptr, int src_step, int src_offset,
                            __global uchar* dstptr, int dst_step, int dt_offset,
                            int rows, int cols)
{
    int x = get_global_id(0);
    int y = get_global_id(1) * PIX_PER_WI_Y;

    if (x < cols / 2)
    {
        #pragma unroll
        for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
        {
            if (y < rows / 2 )
            {
                __global const uchar* ysrc = srcptr + mad24(y << 1, src_step, (x << 1) + src_offset);
                __global const uchar* usrc = srcptr + mad24(rows + y, src_step, (x << 1) + src_offset);
                __global uchar*       dst1 = dstptr + mad24(y << 1, dst_step, mad24(x, dcn<<1, dt_offset));
                __global uchar*       dst2 = dst1 + dst_step;

                float Y1 = ysrc[0];
                float Y2 = ysrc[1];
                float Y3 = ysrc[src_step];
                float Y4 = ysrc[src_step + 1];

                float U  = ((float)usrc[uidx]) - HALF_MAX_NUM;
                float V  = ((float)usrc[1-uidx]) - HALF_MAX_NUM;

                __constant float* coeffs = c_YUV2RGBCoeffs_420;
                float ruv = fma(coeffs[4], V, 0.5f);
                float guv = fma(coeffs[3], V, fma(coeffs[2], U, 0.5f));
                float buv = fma(coeffs[1], U, 0.5f);

                Y1 = max(0.f, Y1 - 16.f) * coeffs[0];
                dst1[2 - bidx] = convert_uchar_sat(Y1 + ruv);
                dst1[1]        = convert_uchar_sat(Y1 + guv);
                dst1[bidx]     = convert_uchar_sat(Y1 + buv);
#if dcn == 4
                dst1[3]        = 255;
#endif

                Y2 = max(0.f, Y2 - 16.f) * coeffs[0];
                dst1[dcn + 2 - bidx] = convert_uchar_sat(Y2 + ruv);
                dst1[dcn + 1]        = convert_uchar_sat(Y2 + guv);
                dst1[dcn + bidx]     = convert_uchar_sat(Y2 + buv);
#if dcn == 4
                dst1[7]        = 255;
#endif

                Y3 = max(0.f, Y3 - 16.f) * coeffs[0];
                dst2[2 - bidx] = convert_uchar_sat(Y3 + ruv);
                dst2[1]        = convert_uchar_sat(Y3 + guv);
                dst2[bidx]     = convert_uchar_sat(Y3 + buv);
#if dcn == 4
                dst2[3]        = 255;
#endif

                Y4 = max(0.f, Y4 - 16.f) * coeffs[0];
                dst2[dcn + 2 - bidx] = convert_uchar_sat(Y4 + ruv);
                dst2[dcn + 1]        = convert_uchar_sat(Y4 + guv);
                dst2[dcn + bidx]     = convert_uchar_sat(Y4 + buv);
#if dcn == 4
                dst2[7]        = 255;
#endif
            }
            ++y;
        }
    }
}

#if uidx < 2

__kernel void YUV2RGB_YV12_IYUV(__global const uchar* srcptr, int src_step, int src_offset,
                                __global uchar* dstptr, int dst_step, int dt_offset,
                                int rows, int cols)
{
    int x = get_global_id(0);
    int y = get_global_id(1) * PIX_PER_WI_Y;

    if (x < cols / 2)
    {
        #pragma unroll
        for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
        {
            if (y < rows / 2 )
            {
                __global const uchar* ysrc = srcptr + mad24(y << 1, src_step, (x << 1) + src_offset);
                __global uchar*       dst1 = dstptr + mad24(y << 1, dst_step, x * (dcn<<1) + dt_offset);
                __global uchar*       dst2 = dst1 + dst_step;

                float Y1 = ysrc[0];
                float Y2 = ysrc[1];
                float Y3 = ysrc[src_step];
                float Y4 = ysrc[src_step + 1];

#ifdef SRC_CONT
                __global const uchar* uvsrc = srcptr + mad24(rows, src_step, src_offset);
                int u_ind = mad24(y, cols >> 1, x);
                float uv[2] = { ((float)uvsrc[u_ind]) - HALF_MAX_NUM, ((float)uvsrc[u_ind + ((rows * cols) >> 2)]) - HALF_MAX_NUM };
#else
                int vsteps[2] = { cols >> 1, src_step - (cols >> 1)};
                __global const uchar* usrc = srcptr + mad24(rows + (y>>1), src_step, src_offset + (y%2)*(cols >> 1) + x);
                __global const uchar* vsrc = usrc + mad24(rows >> 2, src_step, rows % 4 ? vsteps[y%2] : 0);
                float uv[2] = { ((float)usrc[0]) - HALF_MAX_NUM, ((float)vsrc[0]) - HALF_MAX_NUM };
#endif
                float U = uv[uidx];
                float V = uv[1-uidx];

                __constant float* coeffs = c_YUV2RGBCoeffs_420;
                float ruv = fma(coeffs[4], V, 0.5f);
                float guv = fma(coeffs[3], V, fma(coeffs[2], U, 0.5f));
                float buv = fma(coeffs[1], U, 0.5f);

                Y1 = max(0.f, Y1 - 16.f) * coeffs[0];
                dst1[2 - bidx] = convert_uchar_sat(Y1 + ruv);
                dst1[1]        = convert_uchar_sat(Y1 + guv);
                dst1[bidx]     = convert_uchar_sat(Y1 + buv);
#if dcn == 4
                dst1[3]        = 255;
#endif

                Y2 = max(0.f, Y2 - 16.f) * coeffs[0];
                dst1[dcn + 2 - bidx] = convert_uchar_sat(Y2 + ruv);
                dst1[dcn + 1]        = convert_uchar_sat(Y2 + guv);
                dst1[dcn + bidx]     = convert_uchar_sat(Y2 + buv);
#if dcn == 4
                dst1[7]        = 255;
#endif

                Y3 = max(0.f, Y3 - 16.f) * coeffs[0];
                dst2[2 - bidx] = convert_uchar_sat(Y3 + ruv);
                dst2[1]        = convert_uchar_sat(Y3 + guv);
                dst2[bidx]     = convert_uchar_sat(Y3 + buv);
#if dcn == 4
                dst2[3]        = 255;
#endif

                Y4 = max(0.f, Y4 - 16.f) * coeffs[0];
                dst2[dcn + 2 - bidx] = convert_uchar_sat(Y4 + ruv);
                dst2[dcn + 1]        = convert_uchar_sat(Y4 + guv);
                dst2[dcn + bidx]     = convert_uchar_sat(Y4 + buv);
#if dcn == 4
                dst2[7]        = 255;
#endif
            }
            ++y;
        }
    }
}

#endif

#if uidx < 2

__constant float c_RGB2YUVCoeffs_420[8] = { 0.256999969f, 0.50399971f, 0.09799957f, -0.1479988098f, -0.2909994125f,
                                            0.438999176f, -0.3679990768f, -0.0709991455f };

__kernel void RGB2YUV_YV12_IYUV(__global const uchar* srcptr, int src_step, int src_offset,
                                __global uchar* dstptr, int dst_step, int dst_offset,
                                int rows, int cols)
{
    int x = get_global_id(0) * PIX_PER_WI_X;
    int y = get_global_id(1) * PIX_PER_WI_Y;

    if (x < cols/2)
    {
        int src_index  = mad24(y << 1, src_step, mad24(x << 1, scn, src_offset));
        int ydst_index = mad24(y << 1, dst_step, (x << 1) + dst_offset);
        int y_rows = rows / 3 * 2;
        int vsteps[2] = { cols >> 1, dst_step - (cols >> 1)};
        __constant float* coeffs = c_RGB2YUVCoeffs_420;

        #pragma unroll
        for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
        {
            if (y < rows / 3)
            {
                __global const uchar* src1 = srcptr + src_index;
                __global const uchar* src2 = src1 + src_step;
                __global uchar* ydst1 = dstptr + ydst_index;
                __global uchar* ydst2 = ydst1 + dst_step;

                __global uchar* udst = dstptr + mad24(y_rows + (y>>1), dst_step, dst_offset + (y%2)*(cols >> 1) + x);
                __global uchar* vdst = udst + mad24(y_rows >> 2, dst_step, y_rows % 4 ? vsteps[y%2] : 0);

#if PIX_PER_WI_X == 2
                int s11 = *((__global const int*) src1);
                int s12 = *((__global const int*) src1 + 1);
                int s13 = *((__global const int*) src1 + 2);
#if scn == 4
                int s14 = *((__global const int*) src1 + 3);
#endif
                int s21 = *((__global const int*) src2);
                int s22 = *((__global const int*) src2 + 1);
                int s23 = *((__global const int*) src2 + 2);
#if scn == 4
                int s24 = *((__global const int*) src2 + 3);
#endif
                float src_pix1[scn * 4], src_pix2[scn * 4];

                *((float4*) src_pix1)     = convert_float4(as_uchar4(s11));
                *((float4*) src_pix1 + 1) = convert_float4(as_uchar4(s12));
                *((float4*) src_pix1 + 2) = convert_float4(as_uchar4(s13));
#if scn == 4
                *((float4*) src_pix1 + 3) = convert_float4(as_uchar4(s14));
#endif
                *((float4*) src_pix2)     = convert_float4(as_uchar4(s21));
                *((float4*) src_pix2 + 1) = convert_float4(as_uchar4(s22));
                *((float4*) src_pix2 + 2) = convert_float4(as_uchar4(s23));
#if scn == 4
                *((float4*) src_pix2 + 3) = convert_float4(as_uchar4(s24));
#endif
                uchar4 y1, y2;
                y1.x = convert_uchar_sat(fma(coeffs[0], src_pix1[      2-bidx], fma(coeffs[1], src_pix1[      1], fma(coeffs[2], src_pix1[      bidx], 16.5f))));
                y1.y = convert_uchar_sat(fma(coeffs[0], src_pix1[  scn+2-bidx], fma(coeffs[1], src_pix1[  scn+1], fma(coeffs[2], src_pix1[  scn+bidx], 16.5f))));
                y1.z = convert_uchar_sat(fma(coeffs[0], src_pix1[2*scn+2-bidx], fma(coeffs[1], src_pix1[2*scn+1], fma(coeffs[2], src_pix1[2*scn+bidx], 16.5f))));
                y1.w = convert_uchar_sat(fma(coeffs[0], src_pix1[3*scn+2-bidx], fma(coeffs[1], src_pix1[3*scn+1], fma(coeffs[2], src_pix1[3*scn+bidx], 16.5f))));
                y2.x = convert_uchar_sat(fma(coeffs[0], src_pix2[      2-bidx], fma(coeffs[1], src_pix2[      1], fma(coeffs[2], src_pix2[      bidx], 16.5f))));
                y2.y = convert_uchar_sat(fma(coeffs[0], src_pix2[  scn+2-bidx], fma(coeffs[1], src_pix2[  scn+1], fma(coeffs[2], src_pix2[  scn+bidx], 16.5f))));
                y2.z = convert_uchar_sat(fma(coeffs[0], src_pix2[2*scn+2-bidx], fma(coeffs[1], src_pix2[2*scn+1], fma(coeffs[2], src_pix2[2*scn+bidx], 16.5f))));
                y2.w = convert_uchar_sat(fma(coeffs[0], src_pix2[3*scn+2-bidx], fma(coeffs[1], src_pix2[3*scn+1], fma(coeffs[2], src_pix2[3*scn+bidx], 16.5f))));

                *((__global int*) ydst1) = as_int(y1);
                *((__global int*) ydst2) = as_int(y2);

                float uv[4] = { fma(coeffs[3], src_pix1[      2-bidx], fma(coeffs[4], src_pix1[      1], fma(coeffs[5], src_pix1[      bidx], 128.5f))),
                                fma(coeffs[5], src_pix1[      2-bidx], fma(coeffs[6], src_pix1[      1], fma(coeffs[7], src_pix1[      bidx], 128.5f))),
                                fma(coeffs[3], src_pix1[2*scn+2-bidx], fma(coeffs[4], src_pix1[2*scn+1], fma(coeffs[5], src_pix1[2*scn+bidx], 128.5f))),
                                fma(coeffs[5], src_pix1[2*scn+2-bidx], fma(coeffs[6], src_pix1[2*scn+1], fma(coeffs[7], src_pix1[2*scn+bidx], 128.5f))) };

                udst[0] = convert_uchar_sat(uv[uidx]    );
                vdst[0] = convert_uchar_sat(uv[1 - uidx]);
                udst[1] = convert_uchar_sat(uv[2 + uidx]);
                vdst[1] = convert_uchar_sat(uv[3 - uidx]);
#else
                float4 src_pix1 = convert_float4(vload4(0, src1));
                float4 src_pix2 = convert_float4(vload4(0, src1+scn));
                float4 src_pix3 = convert_float4(vload4(0, src2));
                float4 src_pix4 = convert_float4(vload4(0, src2+scn));

                ydst1[0] = convert_uchar_sat(fma(coeffs[0], src_pix1.R_COMP, fma(coeffs[1], src_pix1.G_COMP, fma(coeffs[2], src_pix1.B_COMP, 16.5f))));
                ydst1[1] = convert_uchar_sat(fma(coeffs[0], src_pix2.R_COMP, fma(coeffs[1], src_pix2.G_COMP, fma(coeffs[2], src_pix2.B_COMP, 16.5f))));
                ydst2[0] = convert_uchar_sat(fma(coeffs[0], src_pix3.R_COMP, fma(coeffs[1], src_pix3.G_COMP, fma(coeffs[2], src_pix3.B_COMP, 16.5f))));
                ydst2[1] = convert_uchar_sat(fma(coeffs[0], src_pix4.R_COMP, fma(coeffs[1], src_pix4.G_COMP, fma(coeffs[2], src_pix4.B_COMP, 16.5f))));

                float uv[2] = { fma(coeffs[3], src_pix1.R_COMP, fma(coeffs[4], src_pix1.G_COMP, fma(coeffs[5], src_pix1.B_COMP, 128.5f))),
                                fma(coeffs[5], src_pix1.R_COMP, fma(coeffs[6], src_pix1.G_COMP, fma(coeffs[7], src_pix1.B_COMP, 128.5f))) };

                udst[0] = convert_uchar_sat(uv[uidx]  );
                vdst[0] = convert_uchar_sat(uv[1-uidx]);
#endif
                ++y;
                src_index += 2*src_step;
                ydst_index += 2*dst_step;
            }
        }
    }
}

#endif

__kernel void YUV2RGB_422(__global const uchar* srcptr, int src_step, int src_offset,
                          __global uchar* dstptr, int dst_step, int dst_offset,
                          int rows, int cols)
{
    int x = get_global_id(0);
    int y = get_global_id(1) * PIX_PER_WI_Y;

    if (x < cols / 2)
    {
        __global const uchar* src = srcptr + mad24(y, src_step, (x << 2) + src_offset);
        __global uchar*       dst = dstptr + mad24(y, dst_step, mad24(x << 1, dcn, dst_offset));

        #pragma unroll
        for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
        {
            if (y < rows )
            {
                __constant float* coeffs = c_YUV2RGBCoeffs_420;

#ifndef USE_OPTIMIZED_LOAD
                float U = ((float) src[uidx]) - HALF_MAX_NUM;
                float V = ((float) src[(2 + uidx) % 4]) - HALF_MAX_NUM;
                float y00 = max(0.f, ((float) src[yidx]) - 16.f) * coeffs[0];
                float y01 = max(0.f, ((float) src[yidx + 2]) - 16.f) * coeffs[0];
#else
                int load_src = *((__global int*) src);
                float vec_src[4] = { load_src & 0xff, (load_src >> 8) & 0xff, (load_src >> 16) & 0xff, (load_src >> 24) & 0xff};
                float U = vec_src[uidx] - HALF_MAX_NUM;
                float V = vec_src[(2 + uidx) % 4] - HALF_MAX_NUM;
                float y00 = max(0.f, vec_src[yidx] - 16.f) * coeffs[0];
                float y01 = max(0.f, vec_src[yidx + 2] - 16.f) * coeffs[0];
#endif

                float ruv = fma(coeffs[4], V, 0.5f);
                float guv = fma(coeffs[3], V, fma(coeffs[2], U, 0.5f));
                float buv = fma(coeffs[1], U, 0.5f);

                dst[2 - bidx] = convert_uchar_sat(y00 + ruv);
                dst[1]        = convert_uchar_sat(y00 + guv);
                dst[bidx]     = convert_uchar_sat(y00 + buv);
#if dcn == 4
                dst[3]        = 255;
#endif

                dst[dcn + 2 - bidx] = convert_uchar_sat(y01 + ruv);
                dst[dcn + 1]        = convert_uchar_sat(y01 + guv);
                dst[dcn + bidx]     = convert_uchar_sat(y01 + buv);
#if dcn == 4
                dst[7]        = 255;
#endif
            }
            ++y;
            src += src_step;
            dst += dst_step;
        }
    }
}

///////////////////////////////////// RGB <-> YCrCb //////////////////////////////////////

__constant float c_RGB2YCrCbCoeffs_f[5] = {R2YF, G2YF, B2YF, YCRF, YCBF};
__constant int   c_RGB2YCrCbCoeffs_i[5] = {R2Y, G2Y, B2Y, YCRI, YCBI};

__kernel void RGB2YCrCb(__global const uchar* srcptr, int src_step, int src_offset,
                        __global uchar* dstptr, int dst_step, int dt_offset,
                        int rows, int cols)
{
    int x = get_global_id(0);
    int y = get_global_id(1) * PIX_PER_WI_Y;

    if (x < cols)
    {
        int src_index = mad24(y, src_step, mad24(x, scnbytes, src_offset));
        int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dt_offset));

        #pragma unroll
        for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
        {
            if (y < rows)
            {
                __global const DATA_TYPE* src = (__global const DATA_TYPE*)(srcptr + src_index);
                __global DATA_TYPE* dst = (__global DATA_TYPE*)(dstptr + dst_index);
                DATA_TYPE_4 src_pix = vload4(0, src);
                DATA_TYPE b = src_pix.B_COMP, g = src_pix.G_COMP, r = src_pix.R_COMP;

#ifdef DEPTH_5
                __constant float * coeffs = c_RGB2YCrCbCoeffs_f;
                DATA_TYPE Y = fma(b, coeffs[2], fma(g, coeffs[1], r * coeffs[0]));
                DATA_TYPE Cr = fma(r - Y, coeffs[3], HALF_MAX_NUM);
                DATA_TYPE Cb = fma(b - Y, coeffs[4], HALF_MAX_NUM);
#else
                __constant int * coeffs = c_RGB2YCrCbCoeffs_i;
                int delta = HALF_MAX_NUM * (1 << yuv_shift);
                int Y =  CV_DESCALE(mad24(b, coeffs[2], mad24(g, coeffs[1], mul24(r, coeffs[0]))), yuv_shift);
                int Cr = CV_DESCALE(mad24(r - Y, coeffs[3], delta), yuv_shift);
                int Cb = CV_DESCALE(mad24(b - Y, coeffs[4], delta), yuv_shift);
#endif

                dst[0] = SAT_CAST( Y );
                dst[1] = SAT_CAST( Cr );
                dst[2] = SAT_CAST( Cb );

                ++y;
                dst_index += dst_step;
                src_index += src_step;
            }
        }
    }
}

__constant float c_YCrCb2RGBCoeffs_f[4] = { CR2RF, CR2GF, CB2GF, CB2BF };
__constant int   c_YCrCb2RGBCoeffs_i[4] = { CR2RI, CR2GI, CB2GI, CB2BI };

__kernel void YCrCb2RGB(__global const uchar* src, int src_step, int src_offset,
                        __global uchar* dst, int dst_step, int dst_offset,
                        int rows, int cols)
{
    int x = get_global_id(0);
    int y = get_global_id(1) * PIX_PER_WI_Y;

    if (x < cols)
    {
        int src_index = mad24(y, src_step, mad24(x, scnbytes, src_offset));
        int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dst_offset));

        #pragma unroll
        for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
        {
            if (y < rows)
            {
                __global const DATA_TYPE * srcptr = (__global const DATA_TYPE*)(src + src_index);
                __global DATA_TYPE * dstptr = (__global DATA_TYPE*)(dst + dst_index);

                DATA_TYPE_4 src_pix = vload4(0, srcptr);
                DATA_TYPE yp = src_pix.x, cr = src_pix.y, cb = src_pix.z;

#ifdef DEPTH_5
                __constant float * coeff = c_YCrCb2RGBCoeffs_f;
                float r = fma(coeff[0], cr - HALF_MAX_NUM, yp);
                float g = fma(coeff[1], cr - HALF_MAX_NUM, fma(coeff[2], cb - HALF_MAX_NUM, yp));
                float b = fma(coeff[3], cb - HALF_MAX_NUM, yp);
#else
                __constant int * coeff = c_YCrCb2RGBCoeffs_i;
                int r = yp + CV_DESCALE(coeff[0] * (cr - HALF_MAX_NUM), yuv_shift);
                int g = yp + CV_DESCALE(mad24(coeff[1], cr - HALF_MAX_NUM, coeff[2] * (cb - HALF_MAX_NUM)), yuv_shift);
                int b = yp + CV_DESCALE(coeff[3] * (cb - HALF_MAX_NUM), yuv_shift);
#endif

                dstptr[(bidx^2)] = SAT_CAST(r);
                dstptr[1] = SAT_CAST(g);
                dstptr[bidx] = SAT_CAST(b);
#if dcn == 4
                dstptr[3] = MAX_NUM;
#endif

                ++y;
                dst_index += dst_step;
                src_index += src_step;
            }
        }
    }
}

///////////////////////////////////// RGB <-> XYZ //////////////////////////////////////

__kernel void RGB2XYZ(__global const uchar * srcptr, int src_step, int src_offset,
                      __global uchar * dstptr, int dst_step, int dst_offset,
                      int rows, int cols, __constant COEFF_TYPE * coeffs)
{
    int dx = get_global_id(0);
    int dy = get_global_id(1) * PIX_PER_WI_Y;

    if (dx < cols)
    {
        int src_index = mad24(dy, src_step, mad24(dx, scnbytes, src_offset));
        int dst_index = mad24(dy, dst_step, mad24(dx, dcnbytes, dst_offset));

        #pragma unroll
        for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
        {
            if (dy < rows)
            {
                __global const DATA_TYPE * src = (__global const DATA_TYPE *)(srcptr + src_index);
                __global DATA_TYPE * dst = (__global DATA_TYPE *)(dstptr + dst_index);

                DATA_TYPE_4 src_pix = vload4(0, src);
                DATA_TYPE r = src_pix.x, g = src_pix.y, b = src_pix.z;

#ifdef DEPTH_5
                float x = fma(r, coeffs[0], fma(g, coeffs[1], b * coeffs[2]));
                float y = fma(r, coeffs[3], fma(g, coeffs[4], b * coeffs[5]));
                float z = fma(r, coeffs[6], fma(g, coeffs[7], b * coeffs[8]));
#else
                int x = CV_DESCALE(mad24(r, coeffs[0], mad24(g, coeffs[1], b * coeffs[2])), xyz_shift);
                int y = CV_DESCALE(mad24(r, coeffs[3], mad24(g, coeffs[4], b * coeffs[5])), xyz_shift);
                int z = CV_DESCALE(mad24(r, coeffs[6], mad24(g, coeffs[7], b * coeffs[8])), xyz_shift);
#endif
                dst[0] = SAT_CAST(x);
                dst[1] = SAT_CAST(y);
                dst[2] = SAT_CAST(z);

                ++dy;
                dst_index += dst_step;
                src_index += src_step;
            }
        }
    }
}

__kernel void XYZ2RGB(__global const uchar * srcptr, int src_step, int src_offset,
                      __global uchar * dstptr, int dst_step, int dst_offset,
                      int rows, int cols, __constant COEFF_TYPE * coeffs)
{
    int dx = get_global_id(0);
    int dy = get_global_id(1) * PIX_PER_WI_Y;

    if (dx < cols)
    {
        int src_index = mad24(dy, src_step, mad24(dx, scnbytes, src_offset));
        int dst_index = mad24(dy, dst_step, mad24(dx, dcnbytes, dst_offset));

        #pragma unroll
        for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
        {
            if (dy < rows)
            {
                __global const DATA_TYPE * src = (__global const DATA_TYPE *)(srcptr + src_index);
                __global DATA_TYPE * dst = (__global DATA_TYPE *)(dstptr + dst_index);

                DATA_TYPE_4 src_pix = vload4(0, src);
                DATA_TYPE x = src_pix.x, y = src_pix.y, z = src_pix.z;

#ifdef DEPTH_5
                float b = fma(x, coeffs[0], fma(y, coeffs[1], z * coeffs[2]));
                float g = fma(x, coeffs[3], fma(y, coeffs[4], z * coeffs[5]));
                float r = fma(x, coeffs[6], fma(y, coeffs[7], z * coeffs[8]));
#else
                int b = CV_DESCALE(mad24(x, coeffs[0], mad24(y, coeffs[1], z * coeffs[2])), xyz_shift);
                int g = CV_DESCALE(mad24(x, coeffs[3], mad24(y, coeffs[4], z * coeffs[5])), xyz_shift);
                int r = CV_DESCALE(mad24(x, coeffs[6], mad24(y, coeffs[7], z * coeffs[8])), xyz_shift);
#endif

                DATA_TYPE dst0 = SAT_CAST(b);
                DATA_TYPE dst1 = SAT_CAST(g);
                DATA_TYPE dst2 = SAT_CAST(r);
#if dcn == 3 || defined DEPTH_5
                dst[0] = dst0;
                dst[1] = dst1;
                dst[2] = dst2;
#if dcn == 4
                dst[3] = MAX_NUM;
#endif
#else
                *(__global DATA_TYPE_4 *)dst = (DATA_TYPE_4)(dst0, dst1, dst2, MAX_NUM);
#endif

                ++dy;
                dst_index += dst_step;
                src_index += src_step;
            }
        }
    }
}

///////////////////////////////////// RGB[A] <-> BGR[A] //////////////////////////////////////

__kernel void RGB(__global const uchar* srcptr, int src_step, int src_offset,
                  __global uchar* dstptr, int dst_step, int dst_offset,
                  int rows, int cols)
{
    int x = get_global_id(0);
    int y = get_global_id(1) * PIX_PER_WI_Y;

    if (x < cols)
    {
        int src_index = mad24(y, src_step, mad24(x, scnbytes, src_offset));
        int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dst_offset));

        #pragma unroll
        for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
        {
            if (y < rows)
            {
                __global const DATA_TYPE * src = (__global const DATA_TYPE *)(srcptr + src_index);
                __global DATA_TYPE * dst = (__global DATA_TYPE *)(dstptr + dst_index);
                DATA_TYPE_4 src_pix = vload4(0, src);

#ifdef REVERSE
                dst[0] = src_pix.z;
                dst[1] = src_pix.y;
                dst[2] = src_pix.x;
#else
                dst[0] = src_pix.x;
                dst[1] = src_pix.y;
                dst[2] = src_pix.z;
#endif

#if dcn == 4
#if scn == 3
                dst[3] = MAX_NUM;
#else
                dst[3] = src[3];
#endif
#endif

                ++y;
                dst_index += dst_step;
                src_index += src_step;
            }
        }
    }
}

///////////////////////////////////// RGB5x5 <-> RGB //////////////////////////////////////

__kernel void RGB5x52RGB(__global const uchar* src, int src_step, int src_offset,
                         __global uchar* dst, int dst_step, int dst_offset,
                         int rows, int cols)
{
    int x = get_global_id(0);
    int y = get_global_id(1) * PIX_PER_WI_Y;

    if (x < cols)
    {
        int src_index = mad24(y, src_step, mad24(x, scnbytes, src_offset));
        int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dst_offset));

        #pragma unroll
        for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
        {
            if (y < rows)
            {
                ushort t = *((__global const ushort*)(src + src_index));

#if greenbits == 6
                dst[dst_index + bidx] = (uchar)(t << 3);
                dst[dst_index + 1] = (uchar)((t >> 3) & ~3);
                dst[dst_index + (bidx^2)] = (uchar)((t >> 8) & ~7);
#else
                dst[dst_index + bidx] = (uchar)(t << 3);
                dst[dst_index + 1] = (uchar)((t >> 2) & ~7);
                dst[dst_index + (bidx^2)] = (uchar)((t >> 7) & ~7);
#endif

#if dcn == 4
#if greenbits == 6
                dst[dst_index + 3] = 255;
#else
                dst[dst_index + 3] = t & 0x8000 ? 255 : 0;
#endif
#endif

                ++y;
                dst_index += dst_step;
                src_index += src_step;
            }
        }
    }
}

__kernel void RGB2RGB5x5(__global const uchar* src, int src_step, int src_offset,
                         __global uchar* dst, int dst_step, int dst_offset,
                         int rows, int cols)
{
    int x = get_global_id(0);
    int y = get_global_id(1) * PIX_PER_WI_Y;

    if (x < cols)
    {
        int src_index = mad24(y, src_step, mad24(x, scnbytes, src_offset));
        int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dst_offset));

        #pragma unroll
        for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
        {
            if (y < rows)
            {
                uchar4 src_pix = vload4(0, src + src_index);

#if greenbits == 6
                    *((__global ushort*)(dst + dst_index)) = (ushort)((src_pix.B_COMP >> 3)|((src_pix.G_COMP&~3) << 3)|((src_pix.R_COMP&~7) << 8));
#elif scn == 3
                    *((__global ushort*)(dst + dst_index)) = (ushort)((src_pix.B_COMP >> 3)|((src_pix.G_COMP&~7) << 2)|((src_pix.R_COMP&~7) << 7));
#else
                    *((__global ushort*)(dst + dst_index)) = (ushort)((src_pix.B_COMP >> 3)|((src_pix.G_COMP&~7) << 2)|
                        ((src_pix.R_COMP&~7) << 7)|(src_pix.w ? 0x8000 : 0));
#endif

                ++y;
                dst_index += dst_step;
                src_index += src_step;
            }
        }
    }
}

///////////////////////////////////// RGB5x5 <-> Gray //////////////////////////////////////

__kernel void BGR5x52Gray(__global const uchar* src, int src_step, int src_offset,
                          __global uchar* dst, int dst_step, int dst_offset,
                          int rows, int cols)
{
    int x = get_global_id(0);
    int y = get_global_id(1) * PIX_PER_WI_Y;

    if (x < cols)
    {
        int src_index = mad24(y, src_step, mad24(x, scnbytes, src_offset));
        int dst_index = mad24(y, dst_step, dst_offset + x);

        #pragma unroll
        for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
        {
            if (y < rows)
            {
                int t = *((__global const ushort*)(src + src_index));

#if greenbits == 6
                dst[dst_index] = (uchar)CV_DESCALE(mad24((t << 3) & 0xf8, B2Y, mad24((t >> 3) & 0xfc, G2Y, ((t >> 8) & 0xf8) * R2Y)), yuv_shift);
#else
                dst[dst_index] = (uchar)CV_DESCALE(mad24((t << 3) & 0xf8, B2Y, mad24((t >> 2) & 0xf8, G2Y, ((t >> 7) & 0xf8) * R2Y)), yuv_shift);
#endif
                ++y;
                dst_index += dst_step;
                src_index += src_step;
            }
        }
    }
}

__kernel void Gray2BGR5x5(__global const uchar* src, int src_step, int src_offset,
                          __global uchar* dst, int dst_step, int dst_offset,
                          int rows, int cols)
{
    int x = get_global_id(0);
    int y = get_global_id(1) * PIX_PER_WI_Y;

    if (x < cols)
    {
        int src_index = mad24(y, src_step, src_offset + x);
        int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dst_offset));

        #pragma unroll
        for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
        {
            if (y < rows)
            {
                int t = src[src_index];

#if greenbits == 6
                *((__global ushort*)(dst + dst_index)) = (ushort)((t >> 3) | ((t & ~3) << 3) | ((t & ~7) << 8));
#else
                t >>= 3;
                *((__global ushort*)(dst + dst_index)) = (ushort)(t|(t << 5)|(t << 10));
#endif
                ++y;
                dst_index += dst_step;
                src_index += src_step;
            }
        }
    }
}

//////////////////////////////////// RGB <-> HSV //////////////////////////////////////

__constant int sector_data[][3] = { { 1, 3, 0 },
                                    { 1, 0, 2 },
                                    { 3, 0, 1 },
                                    { 0, 2, 1 },
                                    { 0, 1, 3 },
                                    { 2, 1, 0 } };

#ifdef DEPTH_0

__kernel void RGB2HSV(__global const uchar* src, int src_step, int src_offset,
                      __global uchar* dst, int dst_step, int dst_offset,
                      int rows, int cols,
                      __constant int * sdiv_table, __constant int * hdiv_table)
{
    int x = get_global_id(0);
    int y = get_global_id(1) * PIX_PER_WI_Y;

    if (x < cols)
    {
        int src_index = mad24(y, src_step, mad24(x, scnbytes, src_offset));
        int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dst_offset));

        #pragma unroll
        for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
        {
            if (y < rows)
            {
                uchar4 src_pix = vload4(0, src + src_index);

                int b = src_pix.B_COMP, g = src_pix.G_COMP, r = src_pix.R_COMP;
                int h, s, v = b;
                int vmin = b, diff;
                int vr, vg;

                v = max(v, g);
                v = max(v, r);
                vmin = min(vmin, g);
                vmin = min(vmin, r);

                diff = v - vmin;
                vr = v == r ? -1 : 0;
                vg = v == g ? -1 : 0;

                s = mad24(diff, sdiv_table[v], (1 << (hsv_shift-1))) >> hsv_shift;
                h = (vr & (g - b)) +
                    (~vr & ((vg & mad24(diff, 2, b - r)) + ((~vg) & mad24(4, diff, r - g))));
                h = mad24(h, hdiv_table[diff], (1 << (hsv_shift-1))) >> hsv_shift;
                h += h < 0 ? hrange : 0;

                dst[dst_index] = convert_uchar_sat_rte(h);
                dst[dst_index + 1] = (uchar)s;
                dst[dst_index + 2] = (uchar)v;

                ++y;
                dst_index += dst_step;
                src_index += src_step;
            }
        }
    }
}

__kernel void HSV2RGB(__global const uchar* src, int src_step, int src_offset,
                      __global uchar* dst, int dst_step, int dst_offset,
                      int rows, int cols)
{
    int x = get_global_id(0);
    int y = get_global_id(1) * PIX_PER_WI_Y;

    if (x < cols)
    {
        int src_index = mad24(y, src_step, mad24(x, scnbytes, src_offset));
        int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dst_offset));

        #pragma unroll
        for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
        {
            if (y < rows)
            {
                uchar4 src_pix = vload4(0, src + src_index);

                float h = src_pix.x, s = src_pix.y*(1/255.f), v = src_pix.z*(1/255.f);
                float b, g, r;

                if (s != 0)
                {
                    float tab[4];
                    int sector;
                    h *= hscale;
                    if( h < 0 )
                        do h += 6; while( h < 0 );
                    else if( h >= 6 )
                        do h -= 6; while( h >= 6 );
                    sector = convert_int_sat_rtn(h);
                    h -= sector;
                    if( (unsigned)sector >= 6u )
                    {
                        sector = 0;
                        h = 0.f;
                    }

                    tab[0] = v;
                    tab[1] = v*(1.f - s);
                    tab[2] = v*(1.f - s*h);
                    tab[3] = v*(1.f - s*(1.f - h));

                    b = tab[sector_data[sector][0]];
                    g = tab[sector_data[sector][1]];
                    r = tab[sector_data[sector][2]];
                }
                else
                    b = g = r = v;

                dst[dst_index + bidx] = convert_uchar_sat_rte(b*255.f);
                dst[dst_index + 1] = convert_uchar_sat_rte(g*255.f);
                dst[dst_index + (bidx^2)] = convert_uchar_sat_rte(r*255.f);
#if dcn == 4
                dst[dst_index + 3] = MAX_NUM;
#endif

                ++y;
                dst_index += dst_step;
                src_index += src_step;
            }
        }
    }
}

#elif defined DEPTH_5

__kernel void RGB2HSV(__global const uchar* srcptr, int src_step, int src_offset,
                      __global uchar* dstptr, int dst_step, int dst_offset,
                      int rows, int cols)
{
    int x = get_global_id(0);
    int y = get_global_id(1) * PIX_PER_WI_Y;

    if (x < cols)
    {
        int src_index = mad24(y, src_step, mad24(x, scnbytes, src_offset));
        int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dst_offset));

        #pragma unroll
        for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
        {
            if (y < rows)
            {
                __global const float * src = (__global const float *)(srcptr + src_index);
                __global float * dst = (__global float *)(dstptr + dst_index);
                float4 src_pix = vload4(0, src);

                float b = src_pix.B_COMP, g = src_pix.G_COMP, r = src_pix.R_COMP;
                float h, s, v;

                float vmin, diff;

                v = vmin = r;
                if( v < g ) v = g;
                if( v < b ) v = b;
                if( vmin > g ) vmin = g;
                if( vmin > b ) vmin = b;

                diff = v - vmin;
                s = diff/(float)(fabs(v) + FLT_EPSILON);
                diff = (float)(60.f/(diff + FLT_EPSILON));
                if( v == r )
                    h = (g - b)*diff;
                else if( v == g )
                    h = fma(b - r, diff, 120.f);
                else
                    h = fma(r - g, diff, 240.f);

                if( h < 0 )
                    h += 360.f;

                dst[0] = h*hscale;
                dst[1] = s;
                dst[2] = v;

                ++y;
                dst_index += dst_step;
                src_index += src_step;
            }
        }
    }
}

__kernel void HSV2RGB(__global const uchar* srcptr, int src_step, int src_offset,
                      __global uchar* dstptr, int dst_step, int dst_offset,
                      int rows, int cols)
{
    int x = get_global_id(0);
    int y = get_global_id(1) * PIX_PER_WI_Y;

    if (x < cols)
    {
        int src_index = mad24(y, src_step, mad24(x, scnbytes, src_offset));
        int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dst_offset));

        #pragma unroll
        for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
        {
            if (y < rows)
            {

                __global const float * src = (__global const float *)(srcptr + src_index);
                __global float * dst = (__global float *)(dstptr + dst_index);
                float4 src_pix = vload4(0, src);

                float h = src_pix.x, s = src_pix.y, v = src_pix.z;
                float b, g, r;

                if (s != 0)
                {
                    float tab[4];
                    int sector;
                    h *= hscale;
                    if(h < 0)
                        do h += 6; while (h < 0);
                    else if (h >= 6)
                        do h -= 6; while (h >= 6);
                    sector = convert_int_sat_rtn(h);
                    h -= sector;
                    if ((unsigned)sector >= 6u)
                    {
                        sector = 0;
                        h = 0.f;
                    }

                    tab[0] = v;
                    tab[1] = v*(1.f - s);
                    tab[2] = v*(1.f - s*h);
                    tab[3] = v*(1.f - s*(1.f - h));

                    b = tab[sector_data[sector][0]];
                    g = tab[sector_data[sector][1]];
                    r = tab[sector_data[sector][2]];
                }
                else
                    b = g = r = v;

                dst[bidx] = b;
                dst[1] = g;
                dst[bidx^2] = r;
#if dcn == 4
                dst[3] = MAX_NUM;
#endif

                ++y;
                dst_index += dst_step;
                src_index += src_step;
            }
        }
    }
}

#endif

///////////////////////////////////// RGB <-> HLS //////////////////////////////////////

#ifdef DEPTH_0

__kernel void RGB2HLS(__global const uchar* src, int src_step, int src_offset,
                      __global uchar* dst, int dst_step, int dst_offset,
                      int rows, int cols)
{
    int x = get_global_id(0);
    int y = get_global_id(1) * PIX_PER_WI_Y;

    if (x < cols)
    {
        int src_index = mad24(y, src_step, mad24(x, scnbytes, src_offset));
        int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dst_offset));

        #pragma unroll
        for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
        {
            if (y < rows)
            {
                uchar4 src_pix = vload4(0, src + src_index);

                float b = src_pix.B_COMP*(1/255.f), g = src_pix.G_COMP*(1/255.f), r = src_pix.R_COMP*(1/255.f);
                float h = 0.f, s = 0.f, l;
                float vmin, vmax, diff;

                vmax = vmin = r;
                if (vmax < g) vmax = g;
                if (vmax < b) vmax = b;
                if (vmin > g) vmin = g;
                if (vmin > b) vmin = b;

                diff = vmax - vmin;
                l = (vmax + vmin)*0.5f;

                if (diff > FLT_EPSILON)
                {
                    s = l < 0.5f ? diff/(vmax + vmin) : diff/(2 - vmax - vmin);
                    diff = 60.f/diff;

                    if( vmax == r )
                        h = (g - b)*diff;
                    else if( vmax == g )
                        h = fma(b - r, diff, 120.f);
                    else
                        h = fma(r - g, diff, 240.f);

                    if( h < 0.f )
                        h += 360.f;
                }

                dst[dst_index] = convert_uchar_sat_rte(h*hscale);
                dst[dst_index + 1] = convert_uchar_sat_rte(l*255.f);
                dst[dst_index + 2] = convert_uchar_sat_rte(s*255.f);

                ++y;
                dst_index += dst_step;
                src_index += src_step;
            }
        }
    }
}

__kernel void HLS2RGB(__global const uchar* src, int src_step, int src_offset,
                      __global uchar* dst, int dst_step, int dst_offset,
                      int rows, int cols)
{
    int x = get_global_id(0);
    int y = get_global_id(1) * PIX_PER_WI_Y;

    if (x < cols)
    {
        int src_index = mad24(y, src_step, mad24(x, scnbytes, src_offset));
        int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dst_offset));

        #pragma unroll
        for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
        {
            if (y < rows)
            {
                uchar4 src_pix = vload4(0, src + src_index);

                float h = src_pix.x, l = src_pix.y*(1.f/255.f), s = src_pix.z*(1.f/255.f);
                float b, g, r;

                if (s != 0)
                {
                    float tab[4];

                    float p2 = l <= 0.5f ? l*(1 + s) : l + s - l*s;
                    float p1 = 2*l - p2;

                    h *= hscale;
                    if( h < 0 )
                        do h += 6; while( h < 0 );
                    else if( h >= 6 )
                        do h -= 6; while( h >= 6 );

                    int sector = convert_int_sat_rtn(h);
                    h -= sector;

                    tab[0] = p2;
                    tab[1] = p1;
                    tab[2] = fma(p2 - p1, 1-h, p1);
                    tab[3] = fma(p2 - p1, h, p1);

                    b = tab[sector_data[sector][0]];
                    g = tab[sector_data[sector][1]];
                    r = tab[sector_data[sector][2]];
                }
                else
                    b = g = r = l;

                dst[dst_index + bidx] = convert_uchar_sat_rte(b*255.f);
                dst[dst_index + 1] = convert_uchar_sat_rte(g*255.f);
                dst[dst_index + (bidx^2)] = convert_uchar_sat_rte(r*255.f);
#if dcn == 4
                dst[dst_index + 3] = MAX_NUM;
#endif

                ++y;
                dst_index += dst_step;
                src_index += src_step;
            }
        }
    }
}

#elif defined DEPTH_5

__kernel void RGB2HLS(__global const uchar* srcptr, int src_step, int src_offset,
                      __global uchar* dstptr, int dst_step, int dst_offset,
                      int rows, int cols)
{
    int x = get_global_id(0);
    int y = get_global_id(1) * PIX_PER_WI_Y;

    if (x < cols)
    {
        int src_index = mad24(y, src_step, mad24(x, scnbytes, src_offset));
        int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dst_offset));

        #pragma unroll
        for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
        {
            if (y < rows)
            {
                __global const float * src = (__global const float *)(srcptr + src_index);
                __global float * dst = (__global float *)(dstptr + dst_index);
                float4 src_pix = vload4(0, src);

                float b = src_pix.B_COMP, g = src_pix.G_COMP, r = src_pix.R_COMP;
                float h = 0.f, s = 0.f, l;
                float vmin, vmax, diff;

                vmax = vmin = r;
                if (vmax < g) vmax = g;
                if (vmax < b) vmax = b;
                if (vmin > g) vmin = g;
                if (vmin > b) vmin = b;

                diff = vmax - vmin;
                l = (vmax + vmin)*0.5f;

                if (diff > FLT_EPSILON)
                {
                    s = l < 0.5f ? diff/(vmax + vmin) : diff/(2 - vmax - vmin);
                    diff = 60.f/diff;

                    if( vmax == r )
                        h = (g - b)*diff;
                    else if( vmax == g )
                        h = fma(b - r, diff, 120.f);
                    else
                        h = fma(r - g, diff, 240.f);

                    if( h < 0.f ) h += 360.f;
                }

                dst[0] = h*hscale;
                dst[1] = l;
                dst[2] = s;

                ++y;
                dst_index += dst_step;
                src_index += src_step;
            }
        }
    }
}

__kernel void HLS2RGB(__global const uchar* srcptr, int src_step, int src_offset,
                      __global uchar* dstptr, int dst_step, int dst_offset,
                      int rows, int cols)
{
    int x = get_global_id(0);
    int y = get_global_id(1) * PIX_PER_WI_Y;

    if (x < cols)
    {
        int src_index = mad24(y, src_step, mad24(x, scnbytes, src_offset));
        int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dst_offset));

        #pragma unroll
        for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
        {
            if (y < rows)
            {
                __global const float * src = (__global const float *)(srcptr + src_index);
                __global float * dst = (__global float *)(dstptr + dst_index);
                float4 src_pix = vload4(0, src);

                float h = src_pix.x, l = src_pix.y, s = src_pix.z;
                float b, g, r;

                if (s != 0)
                {
                    float tab[4];
                    int sector;

                    float p2 = l <= 0.5f ? l*(1 + s) : l + s - l*s;
                    float p1 = 2*l - p2;

                    h *= hscale;
                    if( h < 0 )
                        do h += 6; while( h < 0 );
                    else if( h >= 6 )
                        do h -= 6; while( h >= 6 );

                    sector = convert_int_sat_rtn(h);
                    h -= sector;

                    tab[0] = p2;
                    tab[1] = p1;
                    tab[2] = fma(p2 - p1, 1-h, p1);
                    tab[3] = fma(p2 - p1, h, p1);

                    b = tab[sector_data[sector][0]];
                    g = tab[sector_data[sector][1]];
                    r = tab[sector_data[sector][2]];
                }
                else
                    b = g = r = l;

                dst[bidx] = b;
                dst[1] = g;
                dst[bidx^2] = r;
#if dcn == 4
                dst[3] = MAX_NUM;
#endif

                ++y;
                dst_index += dst_step;
                src_index += src_step;
            }
        }
    }
}

#endif

/////////////////////////// RGBA <-> mRGBA (alpha premultiplied) //////////////

#ifdef DEPTH_0

__kernel void RGBA2mRGBA(__global const uchar* src, int src_step, int src_offset,
                         __global uchar* dst, int dst_step, int dst_offset,
                         int rows, int cols)
{
    int x = get_global_id(0);
    int y = get_global_id(1) * PIX_PER_WI_Y;

    if (x < cols)
    {
        int src_index = mad24(y, src_step, src_offset + (x << 2));
        int dst_index = mad24(y, dst_step, dst_offset + (x << 2));

        #pragma unroll
        for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
        {
            if (y < rows)
            {
                uchar4 src_pix = *(__global const uchar4 *)(src + src_index);

                *(__global uchar4 *)(dst + dst_index) =
                    (uchar4)(mad24(src_pix.x, src_pix.w, HALF_MAX_NUM) / MAX_NUM,
                             mad24(src_pix.y, src_pix.w, HALF_MAX_NUM) / MAX_NUM,
                             mad24(src_pix.z, src_pix.w, HALF_MAX_NUM) / MAX_NUM, src_pix.w);

                ++y;
                dst_index += dst_step;
                src_index += src_step;
            }
        }
    }
}

__kernel void mRGBA2RGBA(__global const uchar* src, int src_step, int src_offset,
                         __global uchar* dst, int dst_step, int dst_offset,
                         int rows, int cols)
{
    int x = get_global_id(0);
    int y = get_global_id(1) * PIX_PER_WI_Y;

    if (x < cols)
    {
        int src_index = mad24(y, src_step, mad24(x, 4, src_offset));
        int dst_index = mad24(y, dst_step, mad24(x, 4, dst_offset));

        #pragma unroll
        for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
        {
            if (y < rows)
            {
                uchar4 src_pix = *(__global const uchar4 *)(src + src_index);
                uchar v3 = src_pix.w, v3_half = v3 / 2;

                if (v3 == 0)
                    *(__global uchar4 *)(dst + dst_index) = (uchar4)(0, 0, 0, 0);
                else
                    *(__global uchar4 *)(dst + dst_index) =
                        (uchar4)(mad24(src_pix.x, MAX_NUM, v3_half) / v3,
                                 mad24(src_pix.y, MAX_NUM, v3_half) / v3,
                                 mad24(src_pix.z, MAX_NUM, v3_half) / v3, v3);

                ++y;
                dst_index += dst_step;
                src_index += src_step;
            }
        }
    }
}

#endif

/////////////////////////////////// [l|s]RGB <-> Lab ///////////////////////////

#define lab_shift xyz_shift
#define gamma_shift 3
#define lab_shift2 (lab_shift + gamma_shift)
#define GAMMA_TAB_SIZE 1024
#define GammaTabScale (float)GAMMA_TAB_SIZE

inline float splineInterpolate(float x, __global const float * tab, int n)
{
    int ix = clamp(convert_int_sat_rtn(x), 0, n-1);
    x -= ix;
    tab += ix << 2;
    return fma(fma(fma(tab[3], x, tab[2]), x, tab[1]), x, tab[0]);
}

#ifdef DEPTH_0

__kernel void BGR2Lab(__global const uchar * src, int src_step, int src_offset,
                      __global uchar * dst, int dst_step, int dst_offset, int rows, int cols,
                      __global const ushort * gammaTab, __global ushort * LabCbrtTab_b,
                      __constant int * coeffs, int Lscale, int Lshift)
{
    int x = get_global_id(0);
    int y = get_global_id(1) * PIX_PER_WI_Y;

    if (x < cols)
    {
        int src_index = mad24(y, src_step, mad24(x, scnbytes, src_offset));
        int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dst_offset));

        #pragma unroll
        for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
        {
            if (y < rows)
            {
                __global const uchar* src_ptr = src + src_index;
                __global uchar* dst_ptr = dst + dst_index;
                uchar4 src_pix = vload4(0, src_ptr);

                int C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2],
                    C3 = coeffs[3], C4 = coeffs[4], C5 = coeffs[5],
                    C6 = coeffs[6], C7 = coeffs[7], C8 = coeffs[8];

                int R = gammaTab[src_pix.x], G = gammaTab[src_pix.y], B = gammaTab[src_pix.z];
                int fX = LabCbrtTab_b[CV_DESCALE(mad24(R, C0, mad24(G, C1, B*C2)), lab_shift)];
                int fY = LabCbrtTab_b[CV_DESCALE(mad24(R, C3, mad24(G, C4, B*C5)), lab_shift)];
                int fZ = LabCbrtTab_b[CV_DESCALE(mad24(R, C6, mad24(G, C7, B*C8)), lab_shift)];

                int L = CV_DESCALE( Lscale*fY + Lshift, lab_shift2 );
                int a = CV_DESCALE( mad24(500, fX - fY, 128*(1 << lab_shift2)), lab_shift2 );
                int b = CV_DESCALE( mad24(200, fY - fZ, 128*(1 << lab_shift2)), lab_shift2 );

                dst_ptr[0] = SAT_CAST(L);
                dst_ptr[1] = SAT_CAST(a);
                dst_ptr[2] = SAT_CAST(b);

                ++y;
                dst_index += dst_step;
                src_index += src_step;
            }
        }
    }
}

#elif defined DEPTH_5

__kernel void BGR2Lab(__global const uchar * srcptr, int src_step, int src_offset,
                      __global uchar * dstptr, int dst_step, int dst_offset, int rows, int cols,
#ifdef SRGB
                      __global const float * gammaTab,
#endif
                      __constant float * coeffs, float _1_3, float _a)
{
    int x = get_global_id(0);
    int y = get_global_id(1) * PIX_PER_WI_Y;

    if (x < cols)
    {
        int src_index = mad24(y, src_step, mad24(x, scnbytes, src_offset));
        int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dst_offset));

        #pragma unroll
        for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
        {
            if (y < rows)
            {
                __global const float * src = (__global const float *)(srcptr + src_index);
                __global float * dst = (__global float *)(dstptr + dst_index);
                float4 src_pix = vload4(0, src);

                float C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2],
                      C3 = coeffs[3], C4 = coeffs[4], C5 = coeffs[5],
                      C6 = coeffs[6], C7 = coeffs[7], C8 = coeffs[8];

                float R = clamp(src_pix.x, 0.0f, 1.0f);
                float G = clamp(src_pix.y, 0.0f, 1.0f);
                float B = clamp(src_pix.z, 0.0f, 1.0f);

#ifdef SRGB
                R = splineInterpolate(R * GammaTabScale, gammaTab, GAMMA_TAB_SIZE);
                G = splineInterpolate(G * GammaTabScale, gammaTab, GAMMA_TAB_SIZE);
                B = splineInterpolate(B * GammaTabScale, gammaTab, GAMMA_TAB_SIZE);
#endif

                // 7.787f = (29/3)^3/(29*4), 0.008856f = (6/29)^3, 903.3 = (29/3)^3
                float X = fma(R, C0, fma(G, C1, B*C2));
                float Y = fma(R, C3, fma(G, C4, B*C5));
                float Z = fma(R, C6, fma(G, C7, B*C8));

                float FX = X > 0.008856f ? rootn(X, 3) : fma(7.787f, X, _a);
                float FY = Y > 0.008856f ? rootn(Y, 3) : fma(7.787f, Y, _a);
                float FZ = Z > 0.008856f ? rootn(Z, 3) : fma(7.787f, Z, _a);

                float L = Y > 0.008856f ? fma(116.f, FY, -16.f) : (903.3f * Y);
                float a = 500.f * (FX - FY);
                float b = 200.f * (FY - FZ);

                dst[0] = L;
                dst[1] = a;
                dst[2] = b;

                ++y;
                dst_index += dst_step;
                src_index += src_step;
            }
        }
    }
}

#endif

inline void Lab2BGR_f(const float * srcbuf, float * dstbuf,
#ifdef SRGB
                      __global const float * gammaTab,
#endif
                      __constant float * coeffs, float lThresh, float fThresh)
{
    float li = srcbuf[0], ai = srcbuf[1], bi = srcbuf[2];

    float C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2],
          C3 = coeffs[3], C4 = coeffs[4], C5 = coeffs[5],
          C6 = coeffs[6], C7 = coeffs[7], C8 = coeffs[8];

    float y, fy;
    // 903.3 = (29/3)^3, 7.787 = (29/3)^3/(29*4)
    if (li <= lThresh)
    {
        y = li / 903.3f;
        fy = fma(7.787f, y, 16.0f / 116.0f);
    }
    else
    {
        fy = (li + 16.0f) / 116.0f;
        y = fy * fy * fy;
    }

    float fxz[] = { ai / 500.0f + fy, fy - bi / 200.0f };

    #pragma unroll
    for (int j = 0; j < 2; j++)
        if (fxz[j] <= fThresh)
            fxz[j] = (fxz[j] - 16.0f / 116.0f) / 7.787f;
        else
            fxz[j] = fxz[j] * fxz[j] * fxz[j];

    float x = fxz[0], z = fxz[1];
    float ro = clamp(fma(C0, x, fma(C1, y, C2 * z)), 0.0f, 1.0f);
    float go = clamp(fma(C3, x, fma(C4, y, C5 * z)), 0.0f, 1.0f);
    float bo = clamp(fma(C6, x, fma(C7, y, C8 * z)), 0.0f, 1.0f);

#ifdef SRGB
    ro = splineInterpolate(ro * GammaTabScale, gammaTab, GAMMA_TAB_SIZE);
    go = splineInterpolate(go * GammaTabScale, gammaTab, GAMMA_TAB_SIZE);
    bo = splineInterpolate(bo * GammaTabScale, gammaTab, GAMMA_TAB_SIZE);
#endif

    dstbuf[0] = ro, dstbuf[1] = go, dstbuf[2] = bo;
}

#ifdef DEPTH_0

__kernel void Lab2BGR(__global const uchar * src, int src_step, int src_offset,
                      __global uchar * dst, int dst_step, int dst_offset, int rows, int cols,
#ifdef SRGB
                      __global const float * gammaTab,
#endif
                      __constant float * coeffs, float lThresh, float fThresh)
{
    int x = get_global_id(0);
    int y = get_global_id(1) * PIX_PER_WI_Y;

    if (x < cols)
    {
        int src_index = mad24(y, src_step, mad24(x, scnbytes, src_offset));
        int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dst_offset));

        #pragma unroll
        for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
        {
            if (y < rows)
            {
                __global const uchar* src_ptr = src + src_index;
                __global uchar * dst_ptr = dst + dst_index;
                uchar4 src_pix = vload4(0, src_ptr);

                float srcbuf[3], dstbuf[3];
                srcbuf[0] = src_pix.x*(100.f/255.f);
                srcbuf[1] = convert_float(src_pix.y - 128);
                srcbuf[2] = convert_float(src_pix.z - 128);

                Lab2BGR_f(&srcbuf[0], &dstbuf[0],
#ifdef SRGB
                    gammaTab,
#endif
                    coeffs, lThresh, fThresh);

#if dcn == 3
                dst_ptr[0] = SAT_CAST(dstbuf[0] * 255.0f);
                dst_ptr[1] = SAT_CAST(dstbuf[1] * 255.0f);
                dst_ptr[2] = SAT_CAST(dstbuf[2] * 255.0f);
#else
                *(__global uchar4 *)dst_ptr = (uchar4)(SAT_CAST(dstbuf[0] * 255.0f),
                    SAT_CAST(dstbuf[1] * 255.0f), SAT_CAST(dstbuf[2] * 255.0f), MAX_NUM);
#endif
                ++y;
                dst_index += dst_step;
                src_index += src_step;
            }
        }
    }
}

#elif defined DEPTH_5

__kernel void Lab2BGR(__global const uchar * srcptr, int src_step, int src_offset,
                      __global uchar * dstptr, int dst_step, int dst_offset, int rows, int cols,
#ifdef SRGB
                      __global const float * gammaTab,
#endif
                      __constant float * coeffs, float lThresh, float fThresh)
{
    int x = get_global_id(0);
    int y = get_global_id(1) * PIX_PER_WI_Y;

    if (x < cols)
    {
        int src_index = mad24(y, src_step, mad24(x, scnbytes, src_offset));
        int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dst_offset));

        #pragma unroll
        for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
        {
            if (y < rows)
            {
                __global const float * src = (__global const float *)(srcptr + src_index);
                __global float * dst = (__global float *)(dstptr + dst_index);
                float4 src_pix = vload4(0, src);

                float srcbuf[3], dstbuf[3];
                srcbuf[0] = src_pix.x, srcbuf[1] = src_pix.y, srcbuf[2] = src_pix.z;

                Lab2BGR_f(&srcbuf[0], &dstbuf[0],
#ifdef SRGB
                    gammaTab,
#endif
                    coeffs, lThresh, fThresh);

                dst[0] = dstbuf[0], dst[1] = dstbuf[1], dst[2] = dstbuf[2];
#if dcn == 4
                dst[3] = MAX_NUM;
#endif
                ++y;
                dst_index += dst_step;
                src_index += src_step;
            }
        }
    }
}

#endif

/////////////////////////////////// [l|s]RGB <-> Luv ///////////////////////////

#define LAB_CBRT_TAB_SIZE 1024
#define LAB_CBRT_TAB_SIZE_B (256*3/2*(1<<gamma_shift))

__constant float LabCbrtTabScale = LAB_CBRT_TAB_SIZE/1.5f;

#ifdef DEPTH_5

__kernel void BGR2Luv(__global const uchar * srcptr, int src_step, int src_offset,
                      __global uchar * dstptr, int dst_step, int dst_offset, int rows, int cols,
#ifdef SRGB
                      __global const float * gammaTab,
#endif
                      __global const float * LabCbrtTab, __constant float * coeffs, float _un, float _vn)
{
    int x = get_global_id(0);
    int y = get_global_id(1) * PIX_PER_WI_Y;

    if (x < cols)
    {
        int src_index = mad24(y, src_step, mad24(x, scnbytes, src_offset));
        int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dst_offset));

        #pragma unroll
        for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
            if (y < rows)
            {
                __global const float * src = (__global const float *)(srcptr + src_index);
                __global float * dst = (__global float *)(dstptr + dst_index);

                float R = src[0], G = src[1], B = src[2];

                R = clamp(R, 0.f, 1.f);
                G = clamp(G, 0.f, 1.f);
                B = clamp(B, 0.f, 1.f);

#ifdef SRGB
                R = splineInterpolate(R*GammaTabScale, gammaTab, GAMMA_TAB_SIZE);
                G = splineInterpolate(G*GammaTabScale, gammaTab, GAMMA_TAB_SIZE);
                B = splineInterpolate(B*GammaTabScale, gammaTab, GAMMA_TAB_SIZE);
#endif
                float X = fma(R, coeffs[0], fma(G, coeffs[1], B*coeffs[2]));
                float Y = fma(R, coeffs[3], fma(G, coeffs[4], B*coeffs[5]));
                float Z = fma(R, coeffs[6], fma(G, coeffs[7], B*coeffs[8]));

                float L = splineInterpolate(Y*LabCbrtTabScale, LabCbrtTab, LAB_CBRT_TAB_SIZE);
                L = fma(116.f, L, -16.f);

                float d = 52.0f / fmax(fma(15.0f, Y, fma(3.0f, Z, X)), FLT_EPSILON);
                float u = L*fma(X, d, -_un);
                float v = L*fma(2.25f, Y*d, -_vn);

                dst[0] = L;
                dst[1] = u;
                dst[2] = v;

                ++y;
                dst_index += dst_step;
                src_index += src_step;
            }
    }
}

#elif defined DEPTH_0

__kernel void BGR2Luv(__global const uchar * src, int src_step, int src_offset,
                      __global uchar * dst, int dst_step, int dst_offset, int rows, int cols,
#ifdef SRGB
                      __global const float * gammaTab,
#endif
                      __global const float * LabCbrtTab, __constant float * coeffs, float _un, float _vn)
{
    int x = get_global_id(0);
    int y = get_global_id(1) * PIX_PER_WI_Y;

    if (x < cols)
    {
        src += mad24(y, src_step, mad24(x, scnbytes, src_offset));
        dst += mad24(y, dst_step, mad24(x, dcnbytes, dst_offset));

        #pragma unroll
        for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
            if (y < rows)
            {
                float scale = 1.0f / 255.0f;
                float R = src[0]*scale, G = src[1]*scale, B = src[2]*scale;

#ifdef SRGB
                R = splineInterpolate(R*GammaTabScale, gammaTab, GAMMA_TAB_SIZE);
                G = splineInterpolate(G*GammaTabScale, gammaTab, GAMMA_TAB_SIZE);
                B = splineInterpolate(B*GammaTabScale, gammaTab, GAMMA_TAB_SIZE);
#endif
                float X = fma(R, coeffs[0], fma(G, coeffs[1], B*coeffs[2]));
                float Y = fma(R, coeffs[3], fma(G, coeffs[4], B*coeffs[5]));
                float Z = fma(R, coeffs[6], fma(G, coeffs[7], B*coeffs[8]));

                float L = splineInterpolate(Y*LabCbrtTabScale, LabCbrtTab, LAB_CBRT_TAB_SIZE);
                L = 116.f*L - 16.f;

                float d = (4*13) / fmax(fma(15.0f, Y, fma(3.0f, Z, X)), FLT_EPSILON);
                float u = L*(X*d - _un);
                float v = L*fma(2.25f, Y*d, -_vn);

                dst[0] = SAT_CAST(L * 2.55f);
                //0.72033 = 255/(220+134), 96.525 = 134*255/(220+134)
                dst[1] = SAT_CAST(fma(u, 0.72033898305084743f, 96.525423728813564f));
                //0.9732 = 255/(140+122), 136.259 = 140*255/(140+122)
                dst[2] = SAT_CAST(fma(v, 0.9732824427480916f, 136.259541984732824f));

                ++y;
                dst += dst_step;
                src += src_step;
            }
    }
}

#endif

#ifdef DEPTH_5

__kernel void Luv2BGR(__global const uchar * srcptr, int src_step, int src_offset,
                      __global uchar * dstptr, int dst_step, int dst_offset, int rows, int cols,
#ifdef SRGB
                      __global const float * gammaTab,
#endif
                      __constant float * coeffs, float _un, float _vn)
{
    int x = get_global_id(0);
    int y = get_global_id(1) * PIX_PER_WI_Y;

    if (x < cols)
    {
        int src_index = mad24(y, src_step, mad24(x, scnbytes, src_offset));
        int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dst_offset));

        #pragma unroll
        for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
            if (y < rows)
            {
                __global const float * src = (__global const float *)(srcptr + src_index);
                __global float * dst = (__global float *)(dstptr + dst_index);

                float L = src[0], u = src[1], v = src[2], X, Y, Z;
                if(L >= 8)
                {
                    Y = fma(L, 1.f/116.f, 16.f/116.f);
                    Y = Y*Y*Y;
                }
                else
                {
                    Y = L * (1.0f/903.3f); // L*(3./29.)^3
                }
                float up = 3.f*fma(L, _un, u);
                float vp = 0.25f/fma(L, _vn, v);
                vp = clamp(vp, -0.25f, 0.25f);
                X = 3.f*Y*up*vp;
                Z = Y*fma(fma(12.f*13.f, L, -up), vp, -5.f);

                float R = fma(X, coeffs[0], fma(Y, coeffs[1], Z * coeffs[2]));
                float G = fma(X, coeffs[3], fma(Y, coeffs[4], Z * coeffs[5]));
                float B = fma(X, coeffs[6], fma(Y, coeffs[7], Z * coeffs[8]));

                R = clamp(R, 0.f, 1.f);
                G = clamp(G, 0.f, 1.f);
                B = clamp(B, 0.f, 1.f);

#ifdef SRGB
                R = splineInterpolate(R*GammaTabScale, gammaTab, GAMMA_TAB_SIZE);
                G = splineInterpolate(G*GammaTabScale, gammaTab, GAMMA_TAB_SIZE);
                B = splineInterpolate(B*GammaTabScale, gammaTab, GAMMA_TAB_SIZE);
#endif

                dst[0] = R;
                dst[1] = G;
                dst[2] = B;
#if dcn == 4
                dst[3] = MAX_NUM;
#endif
                ++y;
                dst_index += dst_step;
                src_index += src_step;
            }
    }
}

#elif defined DEPTH_0

__kernel void Luv2BGR(__global const uchar * src, int src_step, int src_offset,
                      __global uchar * dst, int dst_step, int dst_offset, int rows, int cols,
#ifdef SRGB
                      __global const float * gammaTab,
#endif
                      __constant float * coeffs, float _un, float _vn)
{
    int x = get_global_id(0);
    int y = get_global_id(1) * PIX_PER_WI_Y;

    if (x < cols)
    {
        src += mad24(y, src_step, mad24(x, scnbytes, src_offset));
        dst += mad24(y, dst_step, mad24(x, dcnbytes, dst_offset));

        #pragma unroll
        for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
            if (y < rows)
            {
                float d, X, Y, Z;
                float L = src[0]*(100.f/255.f);
                // 1.388235294117647 = (220+134)/255
                float u = fma(convert_float(src[1]), 1.388235294117647f, -134.f);
                // 1.027450980392157 = (140+122)/255
                float v = fma(convert_float(src[2]), 1.027450980392157f, - 140.f);
                if(L >= 8)
                {
                    Y = fma(L, 1.f/116.f, 16.f/116.f);
                    Y = Y*Y*Y;
                }
                else
                {
                    Y = L * (1.0f/903.3f); // L*(3./29.)^3
                }
                float up = 3.f*fma(L, _un, u);
                float vp = 0.25f/fma(L, _vn, v);
                vp = clamp(vp, -0.25f, 0.25f);
                X = 3.f*Y*up*vp;
                Z = Y*fma(fma(12.f*13.f, L, -up), vp, -5.f);

                float R = fma(X, coeffs[0], fma(Y, coeffs[1], Z * coeffs[2]));
                float G = fma(X, coeffs[3], fma(Y, coeffs[4], Z * coeffs[5]));
                float B = fma(X, coeffs[6], fma(Y, coeffs[7], Z * coeffs[8]));

                R = clamp(R, 0.f, 1.f);
                G = clamp(G, 0.f, 1.f);
                B = clamp(B, 0.f, 1.f);

#ifdef SRGB
                R = splineInterpolate(R*GammaTabScale, gammaTab, GAMMA_TAB_SIZE);
                G = splineInterpolate(G*GammaTabScale, gammaTab, GAMMA_TAB_SIZE);
                B = splineInterpolate(B*GammaTabScale, gammaTab, GAMMA_TAB_SIZE);
#endif

                uchar dst0 = SAT_CAST(R * 255.0f);
                uchar dst1 = SAT_CAST(G * 255.0f);
                uchar dst2 = SAT_CAST(B * 255.0f);

#if dcn == 4
                *(__global uchar4 *)dst = (uchar4)(dst0, dst1, dst2, MAX_NUM);
#else
                dst[0] = dst0;
                dst[1] = dst1;
                dst[2] = dst2;
#endif

                ++y;
                dst += dst_step;
                src += src_step;
            }
    }
}

#endif