Commit fd90efc9 authored by Andrey Pavlenko, committed by OpenCV Buildbot

Merge pull request #2593 from arkunze:pullreq/140319-color

parents 6bfbcf02 785acc18
...@@ -2716,6 +2716,8 @@ struct mRGBA2RGBA ...@@ -2716,6 +2716,8 @@ struct mRGBA2RGBA
#ifdef HAVE_OPENCL #ifdef HAVE_OPENCL
#define DIVUP(total, grain) (((total) + (grain) - 1) / (grain))
static bool ocl_cvtColor( InputArray _src, OutputArray _dst, int code, int dcn ) static bool ocl_cvtColor( InputArray _src, OutputArray _dst, int code, int dcn )
{ {
bool ok = false; bool ok = false;
...@@ -2729,6 +2731,17 @@ static bool ocl_cvtColor( InputArray _src, OutputArray _dst, int code, int dcn ) ...@@ -2729,6 +2731,17 @@ static bool ocl_cvtColor( InputArray _src, OutputArray _dst, int code, int dcn )
if (depth != CV_8U && depth != CV_16U && depth != CV_32F) if (depth != CV_8U && depth != CV_16U && depth != CV_32F)
return false; return false;
cv::String opts = format("-D depth=%d -D scn=%d ", depth, scn);
ocl::Device dev = ocl::Device::getDefault();
int pxPerWIy = 1;
if (dev.isIntel() && (dev.type() & ocl::Device::TYPE_GPU))
{
pxPerWIy = 4;
}
globalsize[1] = DIVUP(globalsize[1], pxPerWIy);
opts += format("-D PIX_PER_WI_Y=%d ", pxPerWIy);
switch (code) switch (code)
{ {
case COLOR_BGR2BGRA: case COLOR_RGB2BGRA: case COLOR_BGRA2BGR: case COLOR_BGR2BGRA: case COLOR_RGB2BGRA: case COLOR_BGRA2BGR:
...@@ -2738,7 +2751,7 @@ static bool ocl_cvtColor( InputArray _src, OutputArray _dst, int code, int dcn ) ...@@ -2738,7 +2751,7 @@ static bool ocl_cvtColor( InputArray _src, OutputArray _dst, int code, int dcn )
dcn = code == COLOR_BGR2BGRA || code == COLOR_RGB2BGRA || code == COLOR_BGRA2RGBA ? 4 : 3; dcn = code == COLOR_BGR2BGRA || code == COLOR_RGB2BGRA || code == COLOR_BGRA2RGBA ? 4 : 3;
bool reverse = !(code == COLOR_BGR2BGRA || code == COLOR_BGRA2BGR); bool reverse = !(code == COLOR_BGR2BGRA || code == COLOR_BGRA2BGR);
k.create("RGB", ocl::imgproc::cvtcolor_oclsrc, k.create("RGB", ocl::imgproc::cvtcolor_oclsrc,
format("-D depth=%d -D scn=%d -D dcn=%d -D bidx=0 -D %s", depth, scn, dcn, opts + format("-D dcn=%d -D bidx=0 -D %s", dcn,
reverse ? "REVERSE" : "ORDER")); reverse ? "REVERSE" : "ORDER"));
break; break;
} }
...@@ -2752,7 +2765,7 @@ static bool ocl_cvtColor( InputArray _src, OutputArray _dst, int code, int dcn ) ...@@ -2752,7 +2765,7 @@ static bool ocl_cvtColor( InputArray _src, OutputArray _dst, int code, int dcn )
int greenbits = code == COLOR_BGR5652BGR || code == COLOR_BGR5652RGB || int greenbits = code == COLOR_BGR5652BGR || code == COLOR_BGR5652RGB ||
code == COLOR_BGR5652BGRA || code == COLOR_BGR5652RGBA ? 6 : 5; code == COLOR_BGR5652BGRA || code == COLOR_BGR5652RGBA ? 6 : 5;
k.create("RGB5x52RGB", ocl::imgproc::cvtcolor_oclsrc, k.create("RGB5x52RGB", ocl::imgproc::cvtcolor_oclsrc,
format("-D depth=%d -D scn=2 -D dcn=%d -D bidx=%d -D greenbits=%d", depth, dcn, bidx, greenbits)); opts + format("-D dcn=%d -D bidx=%d -D greenbits=%d", dcn, bidx, greenbits));
break; break;
} }
case COLOR_BGR2BGR565: case COLOR_BGR2BGR555: case COLOR_RGB2BGR565: case COLOR_RGB2BGR555: case COLOR_BGR2BGR565: case COLOR_BGR2BGR555: case COLOR_RGB2BGR565: case COLOR_RGB2BGR555:
...@@ -2765,7 +2778,7 @@ static bool ocl_cvtColor( InputArray _src, OutputArray _dst, int code, int dcn ) ...@@ -2765,7 +2778,7 @@ static bool ocl_cvtColor( InputArray _src, OutputArray _dst, int code, int dcn )
code == COLOR_BGRA2BGR565 || code == COLOR_RGBA2BGR565 ? 6 : 5; code == COLOR_BGRA2BGR565 || code == COLOR_RGBA2BGR565 ? 6 : 5;
dcn = 2; dcn = 2;
k.create("RGB2RGB5x5", ocl::imgproc::cvtcolor_oclsrc, k.create("RGB2RGB5x5", ocl::imgproc::cvtcolor_oclsrc,
format("-D depth=%d -D scn=%d -D dcn=2 -D bidx=%d -D greenbits=%d", depth, scn, bidx, greenbits)); opts + format("-D dcn=2 -D bidx=%d -D greenbits=%d", bidx, greenbits));
break; break;
} }
case COLOR_BGR5652GRAY: case COLOR_BGR5552GRAY: case COLOR_BGR5652GRAY: case COLOR_BGR5552GRAY:
...@@ -2774,7 +2787,7 @@ static bool ocl_cvtColor( InputArray _src, OutputArray _dst, int code, int dcn ) ...@@ -2774,7 +2787,7 @@ static bool ocl_cvtColor( InputArray _src, OutputArray _dst, int code, int dcn )
dcn = 1; dcn = 1;
int greenbits = code == COLOR_BGR5652GRAY ? 6 : 5; int greenbits = code == COLOR_BGR5652GRAY ? 6 : 5;
k.create("BGR5x52Gray", ocl::imgproc::cvtcolor_oclsrc, k.create("BGR5x52Gray", ocl::imgproc::cvtcolor_oclsrc,
format("-D depth=%d -D scn=2 -D dcn=1 -D bidx=0 -D greenbits=%d", depth, greenbits)); opts + format("-D dcn=1 -D bidx=0 -D greenbits=%d", greenbits));
break; break;
} }
case COLOR_GRAY2BGR565: case COLOR_GRAY2BGR555: case COLOR_GRAY2BGR565: case COLOR_GRAY2BGR555:
...@@ -2783,7 +2796,7 @@ static bool ocl_cvtColor( InputArray _src, OutputArray _dst, int code, int dcn ) ...@@ -2783,7 +2796,7 @@ static bool ocl_cvtColor( InputArray _src, OutputArray _dst, int code, int dcn )
dcn = 2; dcn = 2;
int greenbits = code == COLOR_GRAY2BGR565 ? 6 : 5; int greenbits = code == COLOR_GRAY2BGR565 ? 6 : 5;
k.create("Gray2BGR5x5", ocl::imgproc::cvtcolor_oclsrc, k.create("Gray2BGR5x5", ocl::imgproc::cvtcolor_oclsrc,
format("-D depth=%d -D scn=1 -D dcn=2 -D bidx=0 -D greenbits=%d", depth, greenbits)); opts + format("-D dcn=2 -D bidx=0 -D greenbits=%d", greenbits));
break; break;
} }
case COLOR_BGR2GRAY: case COLOR_BGRA2GRAY: case COLOR_BGR2GRAY: case COLOR_BGRA2GRAY:
...@@ -2793,8 +2806,8 @@ static bool ocl_cvtColor( InputArray _src, OutputArray _dst, int code, int dcn ) ...@@ -2793,8 +2806,8 @@ static bool ocl_cvtColor( InputArray _src, OutputArray _dst, int code, int dcn )
bidx = code == COLOR_BGR2GRAY || code == COLOR_BGRA2GRAY ? 0 : 2; bidx = code == COLOR_BGR2GRAY || code == COLOR_BGRA2GRAY ? 0 : 2;
dcn = 1; dcn = 1;
k.create("RGB2Gray", ocl::imgproc::cvtcolor_oclsrc, k.create("RGB2Gray", ocl::imgproc::cvtcolor_oclsrc,
format("-D depth=%d -D scn=%d -D dcn=1 -D bidx=%d -D STRIPE_SIZE=%d", opts + format("-D dcn=1 -D bidx=%d -D STRIPE_SIZE=%d",
depth, scn, bidx, stripeSize)); bidx, stripeSize));
globalsize[0] = (src.cols + stripeSize-1)/stripeSize; globalsize[0] = (src.cols + stripeSize-1)/stripeSize;
break; break;
} }
...@@ -2804,7 +2817,7 @@ static bool ocl_cvtColor( InputArray _src, OutputArray _dst, int code, int dcn ) ...@@ -2804,7 +2817,7 @@ static bool ocl_cvtColor( InputArray _src, OutputArray _dst, int code, int dcn )
CV_Assert(scn == 1); CV_Assert(scn == 1);
dcn = code == COLOR_GRAY2BGRA ? 4 : 3; dcn = code == COLOR_GRAY2BGRA ? 4 : 3;
k.create("Gray2RGB", ocl::imgproc::cvtcolor_oclsrc, k.create("Gray2RGB", ocl::imgproc::cvtcolor_oclsrc,
format("-D depth=%d -D bidx=0 -D scn=1 -D dcn=%d", depth, dcn)); opts + format("-D bidx=0 -D dcn=%d", dcn));
break; break;
} }
case COLOR_BGR2YUV: case COLOR_BGR2YUV:
...@@ -2814,7 +2827,7 @@ static bool ocl_cvtColor( InputArray _src, OutputArray _dst, int code, int dcn ) ...@@ -2814,7 +2827,7 @@ static bool ocl_cvtColor( InputArray _src, OutputArray _dst, int code, int dcn )
bidx = code == COLOR_RGB2YUV ? 0 : 2; bidx = code == COLOR_RGB2YUV ? 0 : 2;
dcn = 3; dcn = 3;
k.create("RGB2YUV", ocl::imgproc::cvtcolor_oclsrc, k.create("RGB2YUV", ocl::imgproc::cvtcolor_oclsrc,
format("-D depth=%d -D scn=%d -D dcn=3 -D bidx=%d", depth, scn, bidx)); opts + format("-D dcn=3 -D bidx=%d", bidx));
break; break;
} }
case COLOR_YUV2BGR: case COLOR_YUV2BGR:
...@@ -2824,7 +2837,7 @@ static bool ocl_cvtColor( InputArray _src, OutputArray _dst, int code, int dcn ) ...@@ -2824,7 +2837,7 @@ static bool ocl_cvtColor( InputArray _src, OutputArray _dst, int code, int dcn )
CV_Assert(dcn == 3 || dcn == 4); CV_Assert(dcn == 3 || dcn == 4);
bidx = code == COLOR_YUV2RGB ? 0 : 2; bidx = code == COLOR_YUV2RGB ? 0 : 2;
k.create("YUV2RGB", ocl::imgproc::cvtcolor_oclsrc, k.create("YUV2RGB", ocl::imgproc::cvtcolor_oclsrc,
format("-D depth=%d -D scn=3 -D dcn=%d -D bidx=%d", depth, dcn, bidx)); opts + format("-D dcn=%d -D bidx=%d", dcn, bidx));
break; break;
} }
case COLOR_YUV2RGB_NV12: case COLOR_YUV2BGR_NV12: case COLOR_YUV2RGB_NV12: case COLOR_YUV2BGR_NV12:
...@@ -2837,7 +2850,7 @@ static bool ocl_cvtColor( InputArray _src, OutputArray _dst, int code, int dcn ) ...@@ -2837,7 +2850,7 @@ static bool ocl_cvtColor( InputArray _src, OutputArray _dst, int code, int dcn )
dstSz = Size(sz.width, sz.height * 2 / 3); dstSz = Size(sz.width, sz.height * 2 / 3);
k.create("YUV2RGB_NV12", ocl::imgproc::cvtcolor_oclsrc, k.create("YUV2RGB_NV12", ocl::imgproc::cvtcolor_oclsrc,
format("-D depth=0 -D scn=1 -D dcn=%d -D bidx=%d", dcn, bidx)); opts + format("-D dcn=%d -D bidx=%d", dcn, bidx));
break; break;
} }
case COLOR_BGR2YCrCb: case COLOR_BGR2YCrCb:
...@@ -2847,7 +2860,7 @@ static bool ocl_cvtColor( InputArray _src, OutputArray _dst, int code, int dcn ) ...@@ -2847,7 +2860,7 @@ static bool ocl_cvtColor( InputArray _src, OutputArray _dst, int code, int dcn )
bidx = code == COLOR_BGR2YCrCb ? 0 : 2; bidx = code == COLOR_BGR2YCrCb ? 0 : 2;
dcn = 3; dcn = 3;
k.create("RGB2YCrCb", ocl::imgproc::cvtcolor_oclsrc, k.create("RGB2YCrCb", ocl::imgproc::cvtcolor_oclsrc,
format("-D depth=%d -D scn=%d -D dcn=3 -D bidx=%d", depth, scn, bidx)); opts + format("-D dcn=3 -D bidx=%d", bidx));
break; break;
} }
case COLOR_YCrCb2BGR: case COLOR_YCrCb2BGR:
...@@ -2858,7 +2871,7 @@ static bool ocl_cvtColor( InputArray _src, OutputArray _dst, int code, int dcn ) ...@@ -2858,7 +2871,7 @@ static bool ocl_cvtColor( InputArray _src, OutputArray _dst, int code, int dcn )
CV_Assert(scn == 3 && (dcn == 3 || dcn == 4)); CV_Assert(scn == 3 && (dcn == 3 || dcn == 4));
bidx = code == COLOR_YCrCb2BGR ? 0 : 2; bidx = code == COLOR_YCrCb2BGR ? 0 : 2;
k.create("YCrCb2RGB", ocl::imgproc::cvtcolor_oclsrc, k.create("YCrCb2RGB", ocl::imgproc::cvtcolor_oclsrc,
format("-D depth=%d -D scn=%d -D dcn=%d -D bidx=%d", depth, scn, dcn, bidx)); opts + format("-D dcn=%d -D bidx=%d", dcn, bidx));
break; break;
} }
case COLOR_BGR2XYZ: case COLOR_RGB2XYZ: case COLOR_BGR2XYZ: case COLOR_RGB2XYZ:
...@@ -2904,7 +2917,7 @@ static bool ocl_cvtColor( InputArray _src, OutputArray _dst, int code, int dcn ) ...@@ -2904,7 +2917,7 @@ static bool ocl_cvtColor( InputArray _src, OutputArray _dst, int code, int dcn )
dst = _dst.getUMat(); dst = _dst.getUMat();
k.create("RGB2XYZ", ocl::imgproc::cvtcolor_oclsrc, k.create("RGB2XYZ", ocl::imgproc::cvtcolor_oclsrc,
format("-D depth=%d -D scn=%d -D dcn=3 -D bidx=%d", depth, scn, bidx)); opts + format("-D dcn=3 -D bidx=%d", bidx));
if (k.empty()) if (k.empty())
return false; return false;
k.args(ocl::KernelArg::ReadOnlyNoSize(src), ocl::KernelArg::WriteOnly(dst), ocl::KernelArg::PtrReadOnly(c)); k.args(ocl::KernelArg::ReadOnlyNoSize(src), ocl::KernelArg::WriteOnly(dst), ocl::KernelArg::PtrReadOnly(c));
...@@ -2955,7 +2968,7 @@ static bool ocl_cvtColor( InputArray _src, OutputArray _dst, int code, int dcn ) ...@@ -2955,7 +2968,7 @@ static bool ocl_cvtColor( InputArray _src, OutputArray _dst, int code, int dcn )
dst = _dst.getUMat(); dst = _dst.getUMat();
k.create("XYZ2RGB", ocl::imgproc::cvtcolor_oclsrc, k.create("XYZ2RGB", ocl::imgproc::cvtcolor_oclsrc,
format("-D depth=%d -D scn=3 -D dcn=%d -D bidx=%d", depth, dcn, bidx)); opts + format("-D dcn=%d -D bidx=%d", dcn, bidx));
if (k.empty()) if (k.empty())
return false; return false;
k.args(ocl::KernelArg::ReadOnlyNoSize(src), ocl::KernelArg::WriteOnly(dst), ocl::KernelArg::PtrReadOnly(c)); k.args(ocl::KernelArg::ReadOnlyNoSize(src), ocl::KernelArg::WriteOnly(dst), ocl::KernelArg::PtrReadOnly(c));
...@@ -3010,8 +3023,9 @@ static bool ocl_cvtColor( InputArray _src, OutputArray _dst, int code, int dcn ) ...@@ -3010,8 +3023,9 @@ static bool ocl_cvtColor( InputArray _src, OutputArray _dst, int code, int dcn )
_dst.create(dstSz, CV_8UC3); _dst.create(dstSz, CV_8UC3);
dst = _dst.getUMat(); dst = _dst.getUMat();
k.create("RGB2HSV", ocl::imgproc::cvtcolor_oclsrc, format("-D depth=%d -D hrange=%d -D bidx=%d -D dcn=3 -D scn=%d", k.create("RGB2HSV", ocl::imgproc::cvtcolor_oclsrc,
depth, hrange, bidx, scn)); opts + format("-D hrange=%d -D bidx=%d -D dcn=3",
hrange, bidx));
if (k.empty()) if (k.empty())
return false; return false;
...@@ -3023,7 +3037,8 @@ static bool ocl_cvtColor( InputArray _src, OutputArray _dst, int code, int dcn ) ...@@ -3023,7 +3037,8 @@ static bool ocl_cvtColor( InputArray _src, OutputArray _dst, int code, int dcn )
} }
else else
k.create(kernelName.c_str(), ocl::imgproc::cvtcolor_oclsrc, k.create(kernelName.c_str(), ocl::imgproc::cvtcolor_oclsrc,
format("-D depth=%d -D hscale=%ff -D bidx=%d -D scn=%d -D dcn=3", depth, hrange*(1.f/360.f), bidx, scn)); opts + format("-D hscale=%ff -D bidx=%d -D dcn=3",
hrange*(1.f/360.f), bidx));
break; break;
} }
case COLOR_HSV2BGR: case COLOR_HSV2RGB: case COLOR_HSV2BGR_FULL: case COLOR_HSV2RGB_FULL: case COLOR_HSV2BGR: case COLOR_HSV2RGB: case COLOR_HSV2BGR_FULL: case COLOR_HSV2RGB_FULL:
...@@ -3041,8 +3056,8 @@ static bool ocl_cvtColor( InputArray _src, OutputArray _dst, int code, int dcn ) ...@@ -3041,8 +3056,8 @@ static bool ocl_cvtColor( InputArray _src, OutputArray _dst, int code, int dcn )
String kernelName = String(is_hsv ? "HSV" : "HLS") + "2RGB"; String kernelName = String(is_hsv ? "HSV" : "HLS") + "2RGB";
k.create(kernelName.c_str(), ocl::imgproc::cvtcolor_oclsrc, k.create(kernelName.c_str(), ocl::imgproc::cvtcolor_oclsrc,
format("-D depth=%d -D dcn=%d -D scn=3 -D bidx=%d -D hrange=%d -D hscale=%ff", opts + format("-D dcn=%d -D bidx=%d -D hrange=%d -D hscale=%ff",
depth, dcn, bidx, hrange, 6.f/hrange)); dcn, bidx, hrange, 6.f/hrange));
break; break;
} }
case COLOR_RGBA2mRGBA: case COLOR_mRGBA2RGBA: case COLOR_RGBA2mRGBA: case COLOR_mRGBA2RGBA:
...@@ -3051,7 +3066,7 @@ static bool ocl_cvtColor( InputArray _src, OutputArray _dst, int code, int dcn ) ...@@ -3051,7 +3066,7 @@ static bool ocl_cvtColor( InputArray _src, OutputArray _dst, int code, int dcn )
dcn = 4; dcn = 4;
k.create(code == COLOR_RGBA2mRGBA ? "RGBA2mRGBA" : "mRGBA2RGBA", ocl::imgproc::cvtcolor_oclsrc, k.create(code == COLOR_RGBA2mRGBA ? "RGBA2mRGBA" : "mRGBA2RGBA", ocl::imgproc::cvtcolor_oclsrc,
format("-D depth=%d -D dcn=4 -D scn=4 -D bidx=3", depth)); opts + "-D dcn=4 -D bidx=3");
break; break;
} }
case CV_BGR2Lab: case CV_RGB2Lab: case CV_LBGR2Lab: case CV_LRGB2Lab: case CV_BGR2Lab: case CV_RGB2Lab: case CV_LBGR2Lab: case CV_LRGB2Lab:
...@@ -3063,8 +3078,8 @@ static bool ocl_cvtColor( InputArray _src, OutputArray _dst, int code, int dcn ) ...@@ -3063,8 +3078,8 @@ static bool ocl_cvtColor( InputArray _src, OutputArray _dst, int code, int dcn )
dcn = 3; dcn = 3;
k.create("BGR2Lab", ocl::imgproc::cvtcolor_oclsrc, k.create("BGR2Lab", ocl::imgproc::cvtcolor_oclsrc,
format("-D depth=%d -D dcn=3 -D scn=%d -D bidx=%d%s", opts + format("-D dcn=3 -D bidx=%d%s",
depth, scn, bidx, srgb ? " -D SRGB" : "")); bidx, srgb ? " -D SRGB" : ""));
if (k.empty()) if (k.empty())
return false; return false;
...@@ -3165,8 +3180,8 @@ static bool ocl_cvtColor( InputArray _src, OutputArray _dst, int code, int dcn ) ...@@ -3165,8 +3180,8 @@ static bool ocl_cvtColor( InputArray _src, OutputArray _dst, int code, int dcn )
bool srgb = code == CV_Lab2BGR || code == CV_Lab2RGB; bool srgb = code == CV_Lab2BGR || code == CV_Lab2RGB;
k.create("Lab2BGR", ocl::imgproc::cvtcolor_oclsrc, k.create("Lab2BGR", ocl::imgproc::cvtcolor_oclsrc,
format("-D depth=%d -D dcn=%d -D scn=3 -D bidx=%d%s", opts + format("-D dcn=%d -D bidx=%d%s",
depth, dcn, bidx, srgb ? " -D SRGB" : "")); dcn, bidx, srgb ? " -D SRGB" : ""));
if (k.empty()) if (k.empty())
return false; return false;
......
...@@ -99,64 +99,81 @@ enum ...@@ -99,64 +99,81 @@ enum
#define hrange 0 #define hrange 0
#endif #endif
#if bidx == 0
#define R_COMP z
#define G_COMP y
#define B_COMP x
#elif bidx == 2
#define R_COMP x
#define G_COMP y
#define B_COMP z
#elif bidx == 3
// The only kernel that uses bidx == 3 doesn't use these macros.
// But we still need to make the compiler happy.
#define R_COMP w
#define G_COMP w
#define B_COMP w
#endif
#define __CAT(x, y) x##y
#define CAT(x, y) __CAT(x, y)
#define DATA_TYPE_4 CAT(DATA_TYPE, 4)
///////////////////////////////////// RGB <-> GRAY ////////////////////////////////////// ///////////////////////////////////// RGB <-> GRAY //////////////////////////////////////
__kernel void RGB2Gray(__global const uchar* srcptr, int srcstep, int srcoffset, __kernel void RGB2Gray(__global const uchar* srcptr, int srcstep, int srcoffset,
__global uchar* dstptr, int dststep, int dstoffset, __global uchar* dstptr, int dststep, int dstoffset,
int rows, int cols) int rows, int cols)
{ {
#if 1 int x = get_global_id(0);
const int x = get_global_id(0); int y = get_global_id(1) * PIX_PER_WI_Y;
const int y = get_global_id(1);
if (y < rows && x < cols)
{
__global const DATA_TYPE* src = (__global const DATA_TYPE*)(srcptr + mad24(y, srcstep, srcoffset + x * scnbytes));
__global DATA_TYPE* dst = (__global DATA_TYPE*)(dstptr + mad24(y, dststep, dstoffset + x * dcnbytes));
#ifdef DEPTH_5
dst[0] = src[bidx] * 0.114f + src[1] * 0.587f + src[(bidx^2)] * 0.299f;
#else
dst[0] = (DATA_TYPE)CV_DESCALE((src[bidx] * B2Y + src[1] * G2Y + src[(bidx^2)] * R2Y), yuv_shift);
#endif
}
#else
const int x_min = get_global_id(0)*STRIPE_SIZE;
const int x_max = min(x_min + STRIPE_SIZE, cols);
const int y = get_global_id(1);
if( y < rows ) if (x < cols)
{ {
__global const DATA_TYPE* src = (__global const DATA_TYPE*)(srcptr + #pragma unroll
mad24(y, srcstep, srcoffset)) + x_min*scn; for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
__global DATA_TYPE* dst = (__global DATA_TYPE*)(dstptr + mad24(y, dststep, dstoffset)); {
int x; if (y < rows)
for( x = x_min; x < x_max; x++, src += scn ) {
__global const DATA_TYPE* src = (__global const DATA_TYPE*)(srcptr + mad24(y, srcstep, srcoffset + x * scnbytes));
__global DATA_TYPE* dst = (__global DATA_TYPE*)(dstptr + mad24(y, dststep, dstoffset + x * dcnbytes));
DATA_TYPE_4 src_pix = vload4(0, src);
#ifdef DEPTH_5 #ifdef DEPTH_5
dst[x] = src[bidx] * 0.114f + src[1] * 0.587f + src[(bidx^2)] * 0.299f; dst[0] = src_pix.B_COMP * 0.114f + src_pix.G_COMP * 0.587f + src_pix.R_COMP * 0.299f;
#else #else
dst[x] = (DATA_TYPE)(mad24(src[bidx], B2Y, mad24(src[1], G2Y, dst[0] = (DATA_TYPE)CV_DESCALE((src_pix.B_COMP * B2Y + src_pix.G_COMP * G2Y + src_pix.R_COMP * R2Y), yuv_shift);
mad24(src[(bidx^2)], R2Y, 1 << (yuv_shift-1)))) >> yuv_shift);
#endif #endif
}
++y;
}
} }
#endif
} }
__kernel void Gray2RGB(__global const uchar* srcptr, int srcstep, int srcoffset, __kernel void Gray2RGB(__global const uchar* srcptr, int srcstep, int srcoffset,
__global uchar* dstptr, int dststep, int dstoffset, __global uchar* dstptr, int dststep, int dstoffset,
int rows, int cols) int rows, int cols)
{ {
const int x = get_global_id(0); int x = get_global_id(0);
const int y = get_global_id(1); int y = get_global_id(1) * PIX_PER_WI_Y;
if (y < rows && x < cols) if (x < cols)
{ {
__global const DATA_TYPE* src = (__global const DATA_TYPE*)(srcptr + mad24(y, srcstep, srcoffset + x * scnbytes)); #pragma unroll
__global DATA_TYPE* dst = (__global DATA_TYPE*)(dstptr + mad24(y, dststep, dstoffset + x * dcnbytes)); for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
DATA_TYPE val = src[0]; {
dst[0] = dst[1] = dst[2] = val; if (y < rows)
{
__global const DATA_TYPE* src = (__global const DATA_TYPE*)(srcptr + mad24(y, srcstep, srcoffset + x * scnbytes));
__global DATA_TYPE* dst = (__global DATA_TYPE*)(dstptr + mad24(y, dststep, dstoffset + x * dcnbytes));
DATA_TYPE val = src[0];
dst[0] = dst[1] = dst[2] = val;
#if dcn == 4 #if dcn == 4
dst[3] = MAX_NUM; dst[3] = MAX_NUM;
#endif #endif
}
++y;
}
} }
} }
...@@ -170,30 +187,39 @@ __kernel void RGB2YUV(__global const uchar* srcptr, int srcstep, int srcoffset, ...@@ -170,30 +187,39 @@ __kernel void RGB2YUV(__global const uchar* srcptr, int srcstep, int srcoffset,
int rows, int cols) int rows, int cols)
{ {
int x = get_global_id(0); int x = get_global_id(0);
int y = get_global_id(1); int y = get_global_id(1) * PIX_PER_WI_Y;
if (y < rows && x < cols) if (x < cols)
{ {
__global const DATA_TYPE* src = (__global const DATA_TYPE*)(srcptr + mad24(y, srcstep, srcoffset + x * scnbytes)); #pragma unroll
__global DATA_TYPE* dst = (__global DATA_TYPE*)(dstptr + mad24(y, dststep, dstoffset + x * dcnbytes)); for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
DATA_TYPE b=src[bidx], g=src[1], r=src[bidx^2]; {
if (y < rows)
{
__global const DATA_TYPE* src = (__global const DATA_TYPE*)(srcptr + mad24(y, srcstep, srcoffset + x * scnbytes));
__global DATA_TYPE* dst = (__global DATA_TYPE*)(dstptr + mad24(y, dststep, dstoffset + x * dcnbytes));
DATA_TYPE_4 src_pix = vload4(0, src);
DATA_TYPE b=src_pix.B_COMP, g=src_pix.G_COMP, r=src_pix.R_COMP;
#ifdef DEPTH_5 #ifdef DEPTH_5
__constant float * coeffs = c_RGB2YUVCoeffs_f; __constant float * coeffs = c_RGB2YUVCoeffs_f;
const DATA_TYPE Y = b * coeffs[0] + g * coeffs[1] + r * coeffs[2]; const DATA_TYPE Y = b * coeffs[0] + g * coeffs[1] + r * coeffs[2];
const DATA_TYPE U = (b - Y) * coeffs[3] + HALF_MAX; const DATA_TYPE U = (b - Y) * coeffs[3] + HALF_MAX;
const DATA_TYPE V = (r - Y) * coeffs[4] + HALF_MAX; const DATA_TYPE V = (r - Y) * coeffs[4] + HALF_MAX;
#else #else
__constant int * coeffs = c_RGB2YUVCoeffs_i; __constant int * coeffs = c_RGB2YUVCoeffs_i;
const int delta = HALF_MAX * (1 << yuv_shift); const int delta = HALF_MAX * (1 << yuv_shift);
const int Y = CV_DESCALE(b * coeffs[0] + g * coeffs[1] + r * coeffs[2], yuv_shift); const int Y = CV_DESCALE(b * coeffs[0] + g * coeffs[1] + r * coeffs[2], yuv_shift);
const int U = CV_DESCALE((b - Y) * coeffs[3] + delta, yuv_shift); const int U = CV_DESCALE((b - Y) * coeffs[3] + delta, yuv_shift);
const int V = CV_DESCALE((r - Y) * coeffs[4] + delta, yuv_shift); const int V = CV_DESCALE((r - Y) * coeffs[4] + delta, yuv_shift);
#endif #endif
dst[0] = SAT_CAST( Y ); dst[0] = SAT_CAST( Y );
dst[1] = SAT_CAST( U ); dst[1] = SAT_CAST( U );
dst[2] = SAT_CAST( V ); dst[2] = SAT_CAST( V );
}
++y;
}
} }
} }
...@@ -205,32 +231,41 @@ __kernel void YUV2RGB(__global const uchar* srcptr, int srcstep, int srcoffset, ...@@ -205,32 +231,41 @@ __kernel void YUV2RGB(__global const uchar* srcptr, int srcstep, int srcoffset,
int rows, int cols) int rows, int cols)
{ {
int x = get_global_id(0); int x = get_global_id(0);
int y = get_global_id(1); int y = get_global_id(1) * PIX_PER_WI_Y;
if (y < rows && x < cols) if (x < cols)
{ {
__global const DATA_TYPE* src = (__global const DATA_TYPE*)(srcptr + mad24(y, srcstep, srcoffset + x * scnbytes)); #pragma unroll
__global DATA_TYPE* dst = (__global DATA_TYPE*)(dstptr + mad24(y, dststep, dstoffset + x * dcnbytes)); for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
DATA_TYPE Y = src[0], U = src[1], V = src[2]; {
if (y < rows)
{
__global const DATA_TYPE* src = (__global const DATA_TYPE*)(srcptr + mad24(y, srcstep, srcoffset + x * scnbytes));
__global DATA_TYPE* dst = (__global DATA_TYPE*)(dstptr + mad24(y, dststep, dstoffset + x * dcnbytes));
DATA_TYPE_4 src_pix = vload4(0, src);
DATA_TYPE Y = src_pix.x, U = src_pix.y, V = src_pix.z;
#ifdef DEPTH_5 #ifdef DEPTH_5
__constant float * coeffs = c_YUV2RGBCoeffs_f; __constant float * coeffs = c_YUV2RGBCoeffs_f;
const float r = Y + (V - HALF_MAX) * coeffs[3]; const float r = Y + (V - HALF_MAX) * coeffs[3];
const float g = Y + (V - HALF_MAX) * coeffs[2] + (U - HALF_MAX) * coeffs[1]; const float g = Y + (V - HALF_MAX) * coeffs[2] + (U - HALF_MAX) * coeffs[1];
const float b = Y + (U - HALF_MAX) * coeffs[0]; const float b = Y + (U - HALF_MAX) * coeffs[0];
#else #else
__constant int * coeffs = c_YUV2RGBCoeffs_i; __constant int * coeffs = c_YUV2RGBCoeffs_i;
const int r = Y + CV_DESCALE((V - HALF_MAX) * coeffs[3], yuv_shift); const int r = Y + CV_DESCALE((V - HALF_MAX) * coeffs[3], yuv_shift);
const int g = Y + CV_DESCALE((V - HALF_MAX) * coeffs[2] + (U - HALF_MAX) * coeffs[1], yuv_shift); const int g = Y + CV_DESCALE((V - HALF_MAX) * coeffs[2] + (U - HALF_MAX) * coeffs[1], yuv_shift);
const int b = Y + CV_DESCALE((U - HALF_MAX) * coeffs[0], yuv_shift); const int b = Y + CV_DESCALE((U - HALF_MAX) * coeffs[0], yuv_shift);
#endif #endif
dst[bidx] = SAT_CAST( b ); dst[bidx] = SAT_CAST( b );
dst[1] = SAT_CAST( g ); dst[1] = SAT_CAST( g );
dst[bidx^2] = SAT_CAST( r ); dst[bidx^2] = SAT_CAST( r );
#if dcn == 4 #if dcn == 4
dst[3] = MAX_NUM; dst[3] = MAX_NUM;
#endif #endif
}
++y;
}
} }
} }
...@@ -246,58 +281,66 @@ __kernel void YUV2RGB_NV12(__global const uchar* srcptr, int srcstep, int srcoff ...@@ -246,58 +281,66 @@ __kernel void YUV2RGB_NV12(__global const uchar* srcptr, int srcstep, int srcoff
int rows, int cols) int rows, int cols)
{ {
int x = get_global_id(0); int x = get_global_id(0);
int y = get_global_id(1); int y = get_global_id(1) * PIX_PER_WI_Y;
if (y < rows / 2 && x < cols / 2 ) if (x < cols / 2)
{ {
__global const uchar* ysrc = srcptr + mad24(y << 1, srcstep, (x << 1) + srcoffset); #pragma unroll
__global const uchar* usrc = srcptr + mad24(rows + y, srcstep, (x << 1) + srcoffset); for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
__global uchar* dst1 = dstptr + mad24(y << 1, dststep, x * (dcn<<1) + dstoffset); {
__global uchar* dst2 = dstptr + mad24((y << 1) + 1, dststep, x * (dcn<<1) + dstoffset); if (y < rows / 2 )
{
int Y1 = ysrc[0]; __global const uchar* ysrc = srcptr + mad24(y << 1, srcstep, (x << 1) + srcoffset);
int Y2 = ysrc[1]; __global const uchar* usrc = srcptr + mad24(rows + y, srcstep, (x << 1) + srcoffset);
int Y3 = ysrc[srcstep]; __global uchar* dst1 = dstptr + mad24(y << 1, dststep, x * (dcn<<1) + dstoffset);
int Y4 = ysrc[srcstep + 1]; __global uchar* dst2 = dstptr + mad24((y << 1) + 1, dststep, x * (dcn<<1) + dstoffset);
int U = usrc[0] - 128; int Y1 = ysrc[0];
int V = usrc[1] - 128; int Y2 = ysrc[1];
int Y3 = ysrc[srcstep];
int ruv = (1 << (ITUR_BT_601_SHIFT - 1)) + ITUR_BT_601_CVR * V; int Y4 = ysrc[srcstep + 1];
int guv = (1 << (ITUR_BT_601_SHIFT - 1)) - ITUR_BT_601_CVG * V - ITUR_BT_601_CUG * U;
int buv = (1 << (ITUR_BT_601_SHIFT - 1)) + ITUR_BT_601_CUB * U; int U = usrc[0] - 128;
int V = usrc[1] - 128;
Y1 = max(0, Y1 - 16) * ITUR_BT_601_CY;
dst1[2 - bidx] = convert_uchar_sat((Y1 + ruv) >> ITUR_BT_601_SHIFT); int ruv = (1 << (ITUR_BT_601_SHIFT - 1)) + ITUR_BT_601_CVR * V;
dst1[1] = convert_uchar_sat((Y1 + guv) >> ITUR_BT_601_SHIFT); int guv = (1 << (ITUR_BT_601_SHIFT - 1)) - ITUR_BT_601_CVG * V - ITUR_BT_601_CUG * U;
dst1[bidx] = convert_uchar_sat((Y1 + buv) >> ITUR_BT_601_SHIFT); int buv = (1 << (ITUR_BT_601_SHIFT - 1)) + ITUR_BT_601_CUB * U;
Y1 = max(0, Y1 - 16) * ITUR_BT_601_CY;
dst1[2 - bidx] = convert_uchar_sat((Y1 + ruv) >> ITUR_BT_601_SHIFT);
dst1[1] = convert_uchar_sat((Y1 + guv) >> ITUR_BT_601_SHIFT);
dst1[bidx] = convert_uchar_sat((Y1 + buv) >> ITUR_BT_601_SHIFT);
#if dcn == 4 #if dcn == 4
dst1[3] = 255; dst1[3] = 255;
#endif #endif
Y2 = max(0, Y2 - 16) * ITUR_BT_601_CY; Y2 = max(0, Y2 - 16) * ITUR_BT_601_CY;
dst1[dcn + 2 - bidx] = convert_uchar_sat((Y2 + ruv) >> ITUR_BT_601_SHIFT); dst1[dcn + 2 - bidx] = convert_uchar_sat((Y2 + ruv) >> ITUR_BT_601_SHIFT);
dst1[dcn + 1] = convert_uchar_sat((Y2 + guv) >> ITUR_BT_601_SHIFT); dst1[dcn + 1] = convert_uchar_sat((Y2 + guv) >> ITUR_BT_601_SHIFT);
dst1[dcn + bidx] = convert_uchar_sat((Y2 + buv) >> ITUR_BT_601_SHIFT); dst1[dcn + bidx] = convert_uchar_sat((Y2 + buv) >> ITUR_BT_601_SHIFT);
#if dcn == 4 #if dcn == 4
dst1[7] = 255; dst1[7] = 255;
#endif #endif
Y3 = max(0, Y3 - 16) * ITUR_BT_601_CY; Y3 = max(0, Y3 - 16) * ITUR_BT_601_CY;
dst2[2 - bidx] = convert_uchar_sat((Y3 + ruv) >> ITUR_BT_601_SHIFT); dst2[2 - bidx] = convert_uchar_sat((Y3 + ruv) >> ITUR_BT_601_SHIFT);
dst2[1] = convert_uchar_sat((Y3 + guv) >> ITUR_BT_601_SHIFT); dst2[1] = convert_uchar_sat((Y3 + guv) >> ITUR_BT_601_SHIFT);
dst2[bidx] = convert_uchar_sat((Y3 + buv) >> ITUR_BT_601_SHIFT); dst2[bidx] = convert_uchar_sat((Y3 + buv) >> ITUR_BT_601_SHIFT);
#if dcn == 4 #if dcn == 4
dst2[3] = 255; dst2[3] = 255;
#endif #endif
Y4 = max(0, Y4 - 16) * ITUR_BT_601_CY; Y4 = max(0, Y4 - 16) * ITUR_BT_601_CY;
dst2[dcn + 2 - bidx] = convert_uchar_sat((Y4 + ruv) >> ITUR_BT_601_SHIFT); dst2[dcn + 2 - bidx] = convert_uchar_sat((Y4 + ruv) >> ITUR_BT_601_SHIFT);
dst2[dcn + 1] = convert_uchar_sat((Y4 + guv) >> ITUR_BT_601_SHIFT); dst2[dcn + 1] = convert_uchar_sat((Y4 + guv) >> ITUR_BT_601_SHIFT);
dst2[dcn + bidx] = convert_uchar_sat((Y4 + buv) >> ITUR_BT_601_SHIFT); dst2[dcn + bidx] = convert_uchar_sat((Y4 + buv) >> ITUR_BT_601_SHIFT);
#if dcn == 4 #if dcn == 4
dst2[7] = 255; dst2[7] = 255;
#endif #endif
}
++y;
}
} }
} }
...@@ -311,30 +354,39 @@ __kernel void RGB2YCrCb(__global const uchar* srcptr, int srcstep, int srcoffset ...@@ -311,30 +354,39 @@ __kernel void RGB2YCrCb(__global const uchar* srcptr, int srcstep, int srcoffset
int rows, int cols) int rows, int cols)
{ {
int x = get_global_id(0); int x = get_global_id(0);
int y = get_global_id(1); int y = get_global_id(1) * PIX_PER_WI_Y;
if (y < rows && x < cols) if (x < cols)
{ {
__global const DATA_TYPE* src = (__global const DATA_TYPE*)(srcptr + mad24(y, srcstep, srcoffset + x * scnbytes)); #pragma unroll
__global DATA_TYPE* dst = (__global DATA_TYPE*)(dstptr + mad24(y, dststep, dstoffset + x * dcnbytes)); for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
DATA_TYPE b=src[bidx], g=src[1], r=src[bidx^2]; {
if (y < rows)
{
__global const DATA_TYPE* src = (__global const DATA_TYPE*)(srcptr + mad24(y, srcstep, srcoffset + x * scnbytes));
__global DATA_TYPE* dst = (__global DATA_TYPE*)(dstptr + mad24(y, dststep, dstoffset + x * dcnbytes));
DATA_TYPE_4 src_pix = vload4(0, src);
DATA_TYPE b=src_pix.B_COMP, g=src_pix.G_COMP, r=src_pix.R_COMP;
#ifdef DEPTH_5 #ifdef DEPTH_5
__constant float * coeffs = c_RGB2YCrCbCoeffs_f; __constant float * coeffs = c_RGB2YCrCbCoeffs_f;
DATA_TYPE Y = b * coeffs[2] + g * coeffs[1] + r * coeffs[0]; DATA_TYPE Y = b * coeffs[2] + g * coeffs[1] + r * coeffs[0];
DATA_TYPE Cr = (r - Y) * coeffs[3] + HALF_MAX; DATA_TYPE Cr = (r - Y) * coeffs[3] + HALF_MAX;
DATA_TYPE Cb = (b - Y) * coeffs[4] + HALF_MAX; DATA_TYPE Cb = (b - Y) * coeffs[4] + HALF_MAX;
#else #else
__constant int * coeffs = c_RGB2YCrCbCoeffs_i; __constant int * coeffs = c_RGB2YCrCbCoeffs_i;
int delta = HALF_MAX * (1 << yuv_shift); int delta = HALF_MAX * (1 << yuv_shift);
int Y = CV_DESCALE(b * coeffs[2] + g * coeffs[1] + r * coeffs[0], yuv_shift); int Y = CV_DESCALE(b * coeffs[2] + g * coeffs[1] + r * coeffs[0], yuv_shift);
int Cr = CV_DESCALE((r - Y) * coeffs[3] + delta, yuv_shift); int Cr = CV_DESCALE((r - Y) * coeffs[3] + delta, yuv_shift);
int Cb = CV_DESCALE((b - Y) * coeffs[4] + delta, yuv_shift); int Cb = CV_DESCALE((b - Y) * coeffs[4] + delta, yuv_shift);
#endif #endif
dst[0] = SAT_CAST( Y ); dst[0] = SAT_CAST( Y );
dst[1] = SAT_CAST( Cr ); dst[1] = SAT_CAST( Cr );
dst[2] = SAT_CAST( Cb ); dst[2] = SAT_CAST( Cb );
}
++y;
}
} }
} }
...@@ -346,35 +398,44 @@ __kernel void YCrCb2RGB(__global const uchar* src, int src_step, int src_offset, ...@@ -346,35 +398,44 @@ __kernel void YCrCb2RGB(__global const uchar* src, int src_step, int src_offset,
int rows, int cols) int rows, int cols)
{ {
int x = get_global_id(0); int x = get_global_id(0);
int y = get_global_id(1); int y = get_global_id(1) * PIX_PER_WI_Y;
if (y < rows && x < cols) if (x < cols)
{ {
int src_idx = mad24(y, src_step, src_offset + x * scnbytes); #pragma unroll
int dst_idx = mad24(y, dst_step, dst_offset + x * dcnbytes); for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
__global const DATA_TYPE * srcptr = (__global const DATA_TYPE*)(src + src_idx); {
__global DATA_TYPE * dstptr = (__global DATA_TYPE*)(dst + dst_idx); if (y < rows)
{
int src_idx = mad24(y, src_step, src_offset + x * scnbytes);
int dst_idx = mad24(y, dst_step, dst_offset + x * dcnbytes);
__global const DATA_TYPE * srcptr = (__global const DATA_TYPE*)(src + src_idx);
__global DATA_TYPE * dstptr = (__global DATA_TYPE*)(dst + dst_idx);
DATA_TYPE y = srcptr[0], cr = srcptr[1], cb = srcptr[2]; DATA_TYPE_4 src_pix = vload4(0, srcptr);
DATA_TYPE y = src_pix.x, cr = src_pix.y, cb = src_pix.z;
#ifdef DEPTH_5 #ifdef DEPTH_5
__constant float * coeff = c_YCrCb2RGBCoeffs_f; __constant float * coeff = c_YCrCb2RGBCoeffs_f;
float r = y + coeff[0] * (cr - HALF_MAX); float r = y + coeff[0] * (cr - HALF_MAX);
float g = y + coeff[1] * (cr - HALF_MAX) + coeff[2] * (cb - HALF_MAX); float g = y + coeff[1] * (cr - HALF_MAX) + coeff[2] * (cb - HALF_MAX);
float b = y + coeff[3] * (cb - HALF_MAX); float b = y + coeff[3] * (cb - HALF_MAX);
#else #else
__constant int * coeff = c_YCrCb2RGBCoeffs_i; __constant int * coeff = c_YCrCb2RGBCoeffs_i;
int r = y + CV_DESCALE(coeff[0] * (cr - HALF_MAX), yuv_shift); int r = y + CV_DESCALE(coeff[0] * (cr - HALF_MAX), yuv_shift);
int g = y + CV_DESCALE(coeff[1] * (cr - HALF_MAX) + coeff[2] * (cb - HALF_MAX), yuv_shift); int g = y + CV_DESCALE(coeff[1] * (cr - HALF_MAX) + coeff[2] * (cb - HALF_MAX), yuv_shift);
int b = y + CV_DESCALE(coeff[3] * (cb - HALF_MAX), yuv_shift); int b = y + CV_DESCALE(coeff[3] * (cb - HALF_MAX), yuv_shift);
#endif #endif
dstptr[(bidx^2)] = SAT_CAST(r); dstptr[(bidx^2)] = SAT_CAST(r);
dstptr[1] = SAT_CAST(g); dstptr[1] = SAT_CAST(g);
dstptr[bidx] = SAT_CAST(b); dstptr[bidx] = SAT_CAST(b);
#if dcn == 4 #if dcn == 4
dstptr[3] = MAX_NUM; dstptr[3] = MAX_NUM;
#endif #endif
}
++y;
}
} }
} }
...@@ -385,30 +446,39 @@ __kernel void RGB2XYZ(__global const uchar * srcptr, int src_step, int src_offse ...@@ -385,30 +446,39 @@ __kernel void RGB2XYZ(__global const uchar * srcptr, int src_step, int src_offse
int rows, int cols, __constant COEFF_TYPE * coeffs) int rows, int cols, __constant COEFF_TYPE * coeffs)
{ {
int dx = get_global_id(0); int dx = get_global_id(0);
int dy = get_global_id(1); int dy = get_global_id(1) * PIX_PER_WI_Y;
if (dy < rows && dx < cols) if (dx < cols)
{ {
int src_idx = mad24(dy, src_step, src_offset + dx * scnbytes); #pragma unroll
int dst_idx = mad24(dy, dst_step, dst_offset + dx * dcnbytes); for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
{
if (dy < rows)
{
int src_idx = mad24(dy, src_step, src_offset + dx * scnbytes);
int dst_idx = mad24(dy, dst_step, dst_offset + dx * dcnbytes);
__global const DATA_TYPE * src = (__global const DATA_TYPE *)(srcptr + src_idx); __global const DATA_TYPE * src = (__global const DATA_TYPE *)(srcptr + src_idx);
__global DATA_TYPE * dst = (__global DATA_TYPE *)(dstptr + dst_idx); __global DATA_TYPE * dst = (__global DATA_TYPE *)(dstptr + dst_idx);
DATA_TYPE r = src[0], g = src[1], b = src[2]; DATA_TYPE_4 src_pix = vload4(0, src);
DATA_TYPE r = src_pix.x, g = src_pix.y, b = src_pix.z;
#ifdef DEPTH_5 #ifdef DEPTH_5
float x = r * coeffs[0] + g * coeffs[1] + b * coeffs[2]; float x = r * coeffs[0] + g * coeffs[1] + b * coeffs[2];
float y = r * coeffs[3] + g * coeffs[4] + b * coeffs[5]; float y = r * coeffs[3] + g * coeffs[4] + b * coeffs[5];
float z = r * coeffs[6] + g * coeffs[7] + b * coeffs[8]; float z = r * coeffs[6] + g * coeffs[7] + b * coeffs[8];
#else #else
int x = CV_DESCALE(r * coeffs[0] + g * coeffs[1] + b * coeffs[2], xyz_shift); int x = CV_DESCALE(r * coeffs[0] + g * coeffs[1] + b * coeffs[2], xyz_shift);
int y = CV_DESCALE(r * coeffs[3] + g * coeffs[4] + b * coeffs[5], xyz_shift); int y = CV_DESCALE(r * coeffs[3] + g * coeffs[4] + b * coeffs[5], xyz_shift);
int z = CV_DESCALE(r * coeffs[6] + g * coeffs[7] + b * coeffs[8], xyz_shift); int z = CV_DESCALE(r * coeffs[6] + g * coeffs[7] + b * coeffs[8], xyz_shift);
#endif #endif
dst[0] = SAT_CAST(x); dst[0] = SAT_CAST(x);
dst[1] = SAT_CAST(y); dst[1] = SAT_CAST(y);
dst[2] = SAT_CAST(z); dst[2] = SAT_CAST(z);
}
++dy;
}
} }
} }
...@@ -417,33 +487,42 @@ __kernel void XYZ2RGB(__global const uchar * srcptr, int src_step, int src_offse ...@@ -417,33 +487,42 @@ __kernel void XYZ2RGB(__global const uchar * srcptr, int src_step, int src_offse
int rows, int cols, __constant COEFF_TYPE * coeffs) int rows, int cols, __constant COEFF_TYPE * coeffs)
{ {
int dx = get_global_id(0); int dx = get_global_id(0);
int dy = get_global_id(1); int dy = get_global_id(1) * PIX_PER_WI_Y;
if (dy < rows && dx < cols) if (dx < cols)
{ {
int src_idx = mad24(dy, src_step, src_offset + dx * scnbytes); #pragma unroll
int dst_idx = mad24(dy, dst_step, dst_offset + dx * dcnbytes); for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
{
if (dy < rows)
{
int src_idx = mad24(dy, src_step, src_offset + dx * scnbytes);
int dst_idx = mad24(dy, dst_step, dst_offset + dx * dcnbytes);
__global const DATA_TYPE * src = (__global const DATA_TYPE *)(srcptr + src_idx); __global const DATA_TYPE * src = (__global const DATA_TYPE *)(srcptr + src_idx);
__global DATA_TYPE * dst = (__global DATA_TYPE *)(dstptr + dst_idx); __global DATA_TYPE * dst = (__global DATA_TYPE *)(dstptr + dst_idx);
DATA_TYPE x = src[0], y = src[1], z = src[2]; DATA_TYPE_4 src_pix = vload4(0, src);
DATA_TYPE x = src_pix.x, y = src_pix.y, z = src_pix.z;
#ifdef DEPTH_5 #ifdef DEPTH_5
float b = x * coeffs[0] + y * coeffs[1] + z * coeffs[2]; float b = x * coeffs[0] + y * coeffs[1] + z * coeffs[2];
float g = x * coeffs[3] + y * coeffs[4] + z * coeffs[5]; float g = x * coeffs[3] + y * coeffs[4] + z * coeffs[5];
float r = x * coeffs[6] + y * coeffs[7] + z * coeffs[8]; float r = x * coeffs[6] + y * coeffs[7] + z * coeffs[8];
#else #else
int b = CV_DESCALE(x * coeffs[0] + y * coeffs[1] + z * coeffs[2], xyz_shift); int b = CV_DESCALE(x * coeffs[0] + y * coeffs[1] + z * coeffs[2], xyz_shift);
int g = CV_DESCALE(x * coeffs[3] + y * coeffs[4] + z * coeffs[5], xyz_shift); int g = CV_DESCALE(x * coeffs[3] + y * coeffs[4] + z * coeffs[5], xyz_shift);
int r = CV_DESCALE(x * coeffs[6] + y * coeffs[7] + z * coeffs[8], xyz_shift); int r = CV_DESCALE(x * coeffs[6] + y * coeffs[7] + z * coeffs[8], xyz_shift);
#endif #endif
dst[0] = SAT_CAST(b); dst[0] = SAT_CAST(b);
dst[1] = SAT_CAST(g); dst[1] = SAT_CAST(g);
dst[2] = SAT_CAST(r); dst[2] = SAT_CAST(r);
#if dcn == 4 #if dcn == 4
dst[3] = MAX_NUM; dst[3] = MAX_NUM;
#endif #endif
}
++dy;
}
} }
} }
...@@ -454,33 +533,42 @@ __kernel void RGB(__global const uchar* srcptr, int src_step, int src_offset, ...@@ -454,33 +533,42 @@ __kernel void RGB(__global const uchar* srcptr, int src_step, int src_offset,
int rows, int cols) int rows, int cols)
{ {
int x = get_global_id(0); int x = get_global_id(0);
int y = get_global_id(1); int y = get_global_id(1) * PIX_PER_WI_Y;
if (y < rows && x < cols) if (x < cols)
{ {
int src_idx = mad24(y, src_step, src_offset + x * scnbytes); #pragma unroll
int dst_idx = mad24(y, dst_step, dst_offset + x * dcnbytes); for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
{
if (y < rows)
{
int src_idx = mad24(y, src_step, src_offset + x * scnbytes);
int dst_idx = mad24(y, dst_step, dst_offset + x * dcnbytes);
__global const DATA_TYPE * src = (__global const DATA_TYPE *)(srcptr + src_idx); __global const DATA_TYPE * src = (__global const DATA_TYPE *)(srcptr + src_idx);
__global DATA_TYPE * dst = (__global DATA_TYPE *)(dstptr + dst_idx); __global DATA_TYPE * dst = (__global DATA_TYPE *)(dstptr + dst_idx);
DATA_TYPE_4 src_pix = vload4(0, src);
#ifdef REVERSE #ifdef REVERSE
dst[0] = src[2]; dst[0] = src_pix.z;
dst[1] = src[1]; dst[1] = src_pix.y;
dst[2] = src[0]; dst[2] = src_pix.x;
#else #else
dst[0] = src[0]; dst[0] = src_pix.x;
dst[1] = src[1]; dst[1] = src_pix.y;
dst[2] = src[2]; dst[2] = src_pix.z;
#endif #endif
#if dcn == 4 #if dcn == 4
#if scn == 3 #if scn == 3
dst[3] = MAX_NUM; dst[3] = MAX_NUM;
#else #else
dst[3] = src[3]; dst[3] = src[3];
#endif #endif
#endif #endif
}
++y;
}
} }
} }
...@@ -491,31 +579,39 @@ __kernel void RGB5x52RGB(__global const uchar* src, int src_step, int src_offset ...@@ -491,31 +579,39 @@ __kernel void RGB5x52RGB(__global const uchar* src, int src_step, int src_offset
int rows, int cols) int rows, int cols)
{ {
int x = get_global_id(0); int x = get_global_id(0);
int y = get_global_id(1); int y = get_global_id(1) * PIX_PER_WI_Y;
if (y < rows && x < cols) if (x < cols)
{ {
int src_idx = mad24(y, src_step, src_offset + x * scnbytes); #pragma unroll
int dst_idx = mad24(y, dst_step, dst_offset + x * dcnbytes); for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
ushort t = *((__global const ushort*)(src + src_idx)); {
if (y < rows)
{
int src_idx = mad24(y, src_step, src_offset + x * scnbytes);
int dst_idx = mad24(y, dst_step, dst_offset + x * dcnbytes);
ushort t = *((__global const ushort*)(src + src_idx));
#if greenbits == 6 #if greenbits == 6
dst[dst_idx + bidx] = (uchar)(t << 3); dst[dst_idx + bidx] = (uchar)(t << 3);
dst[dst_idx + 1] = (uchar)((t >> 3) & ~3); dst[dst_idx + 1] = (uchar)((t >> 3) & ~3);
dst[dst_idx + (bidx^2)] = (uchar)((t >> 8) & ~7); dst[dst_idx + (bidx^2)] = (uchar)((t >> 8) & ~7);
#else #else
dst[dst_idx + bidx] = (uchar)(t << 3); dst[dst_idx + bidx] = (uchar)(t << 3);
dst[dst_idx + 1] = (uchar)((t >> 2) & ~7); dst[dst_idx + 1] = (uchar)((t >> 2) & ~7);
dst[dst_idx + (bidx^2)] = (uchar)((t >> 7) & ~7); dst[dst_idx + (bidx^2)] = (uchar)((t >> 7) & ~7);
#endif #endif
#if dcn == 4 #if dcn == 4
#if greenbits == 6 #if greenbits == 6
dst[dst_idx + 3] = 255; dst[dst_idx + 3] = 255;
#else #else
dst[dst_idx + 3] = t & 0x8000 ? 255 : 0; dst[dst_idx + 3] = t & 0x8000 ? 255 : 0;
#endif #endif
#endif #endif
}
++y;
}
} }
} }
...@@ -524,21 +620,30 @@ __kernel void RGB2RGB5x5(__global const uchar* src, int src_step, int src_offset ...@@ -524,21 +620,30 @@ __kernel void RGB2RGB5x5(__global const uchar* src, int src_step, int src_offset
int rows, int cols) int rows, int cols)
{ {
int x = get_global_id(0); int x = get_global_id(0);
int y = get_global_id(1); int y = get_global_id(1) * PIX_PER_WI_Y;
if (y < rows && x < cols) if (x < cols)
{ {
int src_idx = mad24(y, src_step, src_offset + x * scnbytes); #pragma unroll
int dst_idx = mad24(y, dst_step, dst_offset + x * dcnbytes); for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
{
if (y < rows)
{
int src_idx = mad24(y, src_step, src_offset + x * scnbytes);
int dst_idx = mad24(y, dst_step, dst_offset + x * dcnbytes);
uchar4 src_pix = vload4(0, src + src_idx);
#if greenbits == 6 #if greenbits == 6
*((__global ushort*)(dst + dst_idx)) = (ushort)((src[src_idx + bidx] >> 3)|((src[src_idx + 1]&~3) << 3)|((src[src_idx + (bidx^2)]&~7) << 8)); *((__global ushort*)(dst + dst_idx)) = (ushort)((src_pix.B_COMP >> 3)|((src_pix.G_COMP&~3) << 3)|((src_pix.R_COMP&~7) << 8));
#elif scn == 3 #elif scn == 3
*((__global ushort*)(dst + dst_idx)) = (ushort)((src[src_idx + bidx] >> 3)|((src[src_idx + 1]&~7) << 2)|((src[src_idx + (bidx^2)]&~7) << 7)); *((__global ushort*)(dst + dst_idx)) = (ushort)((src_pix.B_COMP >> 3)|((src_pix.G_COMP&~7) << 2)|((src_pix.R_COMP&~7) << 7));
#else #else
*((__global ushort*)(dst + dst_idx)) = (ushort)((src[src_idx + bidx] >> 3)|((src[src_idx + 1]&~7) << 2)| *((__global ushort*)(dst + dst_idx)) = (ushort)((src_pix.B_COMP >> 3)|((src_pix.G_COMP&~7) << 2)|
((src[src_idx + (bidx^2)]&~7) << 7)|(src[src_idx + 3] ? 0x8000 : 0)); ((src_pix.R_COMP&~7) << 7)|(src_pix.w ? 0x8000 : 0));
#endif #endif
}
++y;
}
} }
} }
...@@ -549,23 +654,31 @@ __kernel void BGR5x52Gray(__global const uchar* src, int src_step, int src_offse ...@@ -549,23 +654,31 @@ __kernel void BGR5x52Gray(__global const uchar* src, int src_step, int src_offse
int rows, int cols) int rows, int cols)
{ {
int x = get_global_id(0); int x = get_global_id(0);
int y = get_global_id(1); int y = get_global_id(1) * PIX_PER_WI_Y;
if (y < rows && x < cols) if (x < cols)
{ {
int src_idx = mad24(y, src_step, src_offset + x * scnbytes); #pragma unroll
int dst_idx = mad24(y, dst_step, dst_offset + x); for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
int t = *((__global const ushort*)(src + src_idx)); {
if (y < rows)
{
int src_idx = mad24(y, src_step, src_offset + x * scnbytes);
int dst_idx = mad24(y, dst_step, dst_offset + x);
int t = *((__global const ushort*)(src + src_idx));
#if greenbits == 6 #if greenbits == 6
dst[dst_idx] = (uchar)CV_DESCALE(((t << 3) & 0xf8)*B2Y + dst[dst_idx] = (uchar)CV_DESCALE(((t << 3) & 0xf8)*B2Y +
((t >> 3) & 0xfc)*G2Y + ((t >> 3) & 0xfc)*G2Y +
((t >> 8) & 0xf8)*R2Y, yuv_shift); ((t >> 8) & 0xf8)*R2Y, yuv_shift);
#else #else
dst[dst_idx] = (uchar)CV_DESCALE(((t << 3) & 0xf8)*B2Y + dst[dst_idx] = (uchar)CV_DESCALE(((t << 3) & 0xf8)*B2Y +
((t >> 2) & 0xf8)*G2Y + ((t >> 2) & 0xf8)*G2Y +
((t >> 7) & 0xf8)*R2Y, yuv_shift); ((t >> 7) & 0xf8)*R2Y, yuv_shift);
#endif #endif
}
++y;
}
} }
} }
...@@ -574,20 +687,28 @@ __kernel void Gray2BGR5x5(__global const uchar* src, int src_step, int src_offse ...@@ -574,20 +687,28 @@ __kernel void Gray2BGR5x5(__global const uchar* src, int src_step, int src_offse
int rows, int cols) int rows, int cols)
{ {
int x = get_global_id(0); int x = get_global_id(0);
int y = get_global_id(1); int y = get_global_id(1) * PIX_PER_WI_Y;
if (y < rows && x < cols) if (x < cols)
{ {
int src_idx = mad24(y, src_step, src_offset + x); #pragma unroll
int dst_idx = mad24(y, dst_step, dst_offset + x * dcnbytes); for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
int t = src[src_idx]; {
if (y < rows)
{
int src_idx = mad24(y, src_step, src_offset + x);
int dst_idx = mad24(y, dst_step, dst_offset + x * dcnbytes);
int t = src[src_idx];
#if greenbits == 6 #if greenbits == 6
*((__global ushort*)(dst + dst_idx)) = (ushort)((t >> 3) | ((t & ~3) << 3) | ((t & ~7) << 8)); *((__global ushort*)(dst + dst_idx)) = (ushort)((t >> 3) | ((t & ~3) << 3) | ((t & ~7) << 8));
#else #else
t >>= 3; t >>= 3;
*((__global ushort*)(dst + dst_idx)) = (ushort)(t|(t << 5)|(t << 10)); *((__global ushort*)(dst + dst_idx)) = (ushort)(t|(t << 5)|(t << 10));
#endif #endif
}
++y;
}
} }
} }
...@@ -608,36 +729,45 @@ __kernel void RGB2HSV(__global const uchar* src, int src_step, int src_offset, ...@@ -608,36 +729,45 @@ __kernel void RGB2HSV(__global const uchar* src, int src_step, int src_offset,
__constant int * sdiv_table, __constant int * hdiv_table) __constant int * sdiv_table, __constant int * hdiv_table)
{ {
int x = get_global_id(0); int x = get_global_id(0);
int y = get_global_id(1); int y = get_global_id(1) * PIX_PER_WI_Y;
if (y < rows && x < cols) if (x < cols)
{ {
int src_idx = mad24(y, src_step, src_offset + x * scnbytes); #pragma unroll
int dst_idx = mad24(y, dst_step, dst_offset + x * dcnbytes); for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
{
int b = src[src_idx + bidx], g = src[src_idx + 1], r = src[src_idx + (bidx^2)]; if (y < rows)
int h, s, v = b; {
int vmin = b, diff; int src_idx = mad24(y, src_step, src_offset + x * scnbytes);
int vr, vg; int dst_idx = mad24(y, dst_step, dst_offset + x * dcnbytes);
uchar4 src_pix = vload4(0, src + src_idx);
v = max( v, g );
v = max( v, r ); int b = src_pix.B_COMP, g = src_pix.G_COMP, r = src_pix.R_COMP;
vmin = min( vmin, g ); int h, s, v = b;
vmin = min( vmin, r ); int vmin = b, diff;
int vr, vg;
diff = v - vmin;
vr = v == r ? -1 : 0; v = max( v, g );
vg = v == g ? -1 : 0; v = max( v, r );
vmin = min( vmin, g );
s = (diff * sdiv_table[v] + (1 << (hsv_shift-1))) >> hsv_shift; vmin = min( vmin, r );
h = (vr & (g - b)) +
(~vr & ((vg & (b - r + 2 * diff)) + ((~vg) & (r - g + 4 * diff)))); diff = v - vmin;
h = (h * hdiv_table[diff] + (1 << (hsv_shift-1))) >> hsv_shift; vr = v == r ? -1 : 0;
h += h < 0 ? hrange : 0; vg = v == g ? -1 : 0;
dst[dst_idx] = convert_uchar_sat_rte(h); s = (diff * sdiv_table[v] + (1 << (hsv_shift-1))) >> hsv_shift;
dst[dst_idx + 1] = (uchar)s; h = (vr & (g - b)) +
dst[dst_idx + 2] = (uchar)v; (~vr & ((vg & (b - r + 2 * diff)) + ((~vg) & (r - g + 4 * diff))));
h = (h * hdiv_table[diff] + (1 << (hsv_shift-1))) >> hsv_shift;
h += h < 0 ? hrange : 0;
dst[dst_idx] = convert_uchar_sat_rte(h);
dst[dst_idx + 1] = (uchar)s;
dst[dst_idx + 2] = (uchar)v;
}
++y;
}
} }
} }
...@@ -646,51 +776,60 @@ __kernel void HSV2RGB(__global const uchar* src, int src_step, int src_offset, ...@@ -646,51 +776,60 @@ __kernel void HSV2RGB(__global const uchar* src, int src_step, int src_offset,
int rows, int cols) int rows, int cols)
{ {
int x = get_global_id(0); int x = get_global_id(0);
int y = get_global_id(1); int y = get_global_id(1) * PIX_PER_WI_Y;
if (y < rows && x < cols) if (x < cols)
{ {
int src_idx = mad24(y, src_step, src_offset + x * scnbytes); #pragma unroll
int dst_idx = mad24(y, dst_step, dst_offset + x * dcnbytes); for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
float h = src[src_idx], s = src[src_idx + 1]*(1/255.f), v = src[src_idx + 2]*(1/255.f);
float b, g, r;
if (s != 0)
{ {
float tab[4]; if (y < rows)
int sector;
h *= hscale;
if( h < 0 )
do h += 6; while( h < 0 );
else if( h >= 6 )
do h -= 6; while( h >= 6 );
sector = convert_int_sat_rtn(h);
h -= sector;
if( (unsigned)sector >= 6u )
{ {
sector = 0; int src_idx = mad24(y, src_step, src_offset + x * scnbytes);
h = 0.f; int dst_idx = mad24(y, dst_step, dst_offset + x * dcnbytes);
} uchar4 src_pix = vload4(0, src + src_idx);
tab[0] = v; float h = src_pix.x, s = src_pix.y*(1/255.f), v = src_pix.z*(1/255.f);
tab[1] = v*(1.f - s); float b, g, r;
tab[2] = v*(1.f - s*h);
tab[3] = v*(1.f - s*(1.f - h)); if (s != 0)
{
b = tab[sector_data[sector][0]]; float tab[4];
g = tab[sector_data[sector][1]]; int sector;
r = tab[sector_data[sector][2]]; h *= hscale;
} if( h < 0 )
else do h += 6; while( h < 0 );
b = g = r = v; else if( h >= 6 )
do h -= 6; while( h >= 6 );
dst[dst_idx + bidx] = convert_uchar_sat_rte(b*255.f); sector = convert_int_sat_rtn(h);
dst[dst_idx + 1] = convert_uchar_sat_rte(g*255.f); h -= sector;
dst[dst_idx + (bidx^2)] = convert_uchar_sat_rte(r*255.f); if( (unsigned)sector >= 6u )
{
sector = 0;
h = 0.f;
}
tab[0] = v;
tab[1] = v*(1.f - s);
tab[2] = v*(1.f - s*h);
tab[3] = v*(1.f - s*(1.f - h));
b = tab[sector_data[sector][0]];
g = tab[sector_data[sector][1]];
r = tab[sector_data[sector][2]];
}
else
b = g = r = v;
dst[dst_idx + bidx] = convert_uchar_sat_rte(b*255.f);
dst[dst_idx + 1] = convert_uchar_sat_rte(g*255.f);
dst[dst_idx + (bidx^2)] = convert_uchar_sat_rte(r*255.f);
#if dcn == 4 #if dcn == 4
dst[dst_idx + 3] = MAX_NUM; dst[dst_idx + 3] = MAX_NUM;
#endif #endif
}
++y;
}
} }
} }
...@@ -701,42 +840,51 @@ __kernel void RGB2HSV(__global const uchar* srcptr, int src_step, int src_offset ...@@ -701,42 +840,51 @@ __kernel void RGB2HSV(__global const uchar* srcptr, int src_step, int src_offset
int rows, int cols) int rows, int cols)
{ {
int x = get_global_id(0); int x = get_global_id(0);
int y = get_global_id(1); int y = get_global_id(1) * PIX_PER_WI_Y;
if (y < rows && x < cols) if (x < cols)
{ {
int src_idx = mad24(y, src_step, src_offset + x * scnbytes); #pragma unroll
int dst_idx = mad24(y, dst_step, dst_offset + x * dcnbytes); for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
{
__global const float * src = (__global const float *)(srcptr + src_idx); if (y < rows)
__global float * dst = (__global float *)(dstptr + dst_idx); {
int src_idx = mad24(y, src_step, src_offset + x * scnbytes);
float b = src[bidx], g = src[1], r = src[bidx^2]; int dst_idx = mad24(y, dst_step, dst_offset + x * dcnbytes);
float h, s, v;
__global const float * src = (__global const float *)(srcptr + src_idx);
float vmin, diff; __global float * dst = (__global float *)(dstptr + dst_idx);
float4 src_pix = vload4(0, src);
v = vmin = r;
if( v < g ) v = g; float b = src_pix.B_COMP, g = src_pix.G_COMP, r = src_pix.R_COMP;
if( v < b ) v = b; float h, s, v;
if( vmin > g ) vmin = g;
if( vmin > b ) vmin = b; float vmin, diff;
diff = v - vmin; v = vmin = r;
s = diff/(float)(fabs(v) + FLT_EPSILON); if( v < g ) v = g;
diff = (float)(60.f/(diff + FLT_EPSILON)); if( v < b ) v = b;
if( v == r ) if( vmin > g ) vmin = g;
h = (g - b)*diff; if( vmin > b ) vmin = b;
else if( v == g )
h = (b - r)*diff + 120.f; diff = v - vmin;
else s = diff/(float)(fabs(v) + FLT_EPSILON);
h = (r - g)*diff + 240.f; diff = (float)(60.f/(diff + FLT_EPSILON));
if( v == r )
if( h < 0 ) h += 360.f; h = (g - b)*diff;
else if( v == g )
dst[0] = h*hscale; h = (b - r)*diff + 120.f;
dst[1] = s; else
dst[2] = v; h = (r - g)*diff + 240.f;
if( h < 0 ) h += 360.f;
dst[0] = h*hscale;
dst[1] = s;
dst[2] = v;
}
++y;
}
} }
} }
...@@ -745,54 +893,63 @@ __kernel void HSV2RGB(__global const uchar* srcptr, int src_step, int src_offset ...@@ -745,54 +893,63 @@ __kernel void HSV2RGB(__global const uchar* srcptr, int src_step, int src_offset
int rows, int cols) int rows, int cols)
{ {
int x = get_global_id(0); int x = get_global_id(0);
int y = get_global_id(1); int y = get_global_id(1) * PIX_PER_WI_Y;
if (y < rows && x < cols) if (x < cols)
{ {
int src_idx = mad24(y, src_step, src_offset + x * scnbytes); #pragma unroll
int dst_idx = mad24(y, dst_step, dst_offset + x * dcnbytes); for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
__global const float * src = (__global const float *)(srcptr + src_idx);
__global float * dst = (__global float *)(dstptr + dst_idx);
float h = src[0], s = src[1], v = src[2];
float b, g, r;
if (s != 0)
{ {
float tab[4]; if (y < rows)
int sector;
h *= hscale;
if(h < 0)
do h += 6; while (h < 0);
else if (h >= 6)
do h -= 6; while (h >= 6);
sector = convert_int_sat_rtn(h);
h -= sector;
if ((unsigned)sector >= 6u)
{ {
sector = 0; int src_idx = mad24(y, src_step, src_offset + x * scnbytes);
h = 0.f; int dst_idx = mad24(y, dst_step, dst_offset + x * dcnbytes);
}
__global const float * src = (__global const float *)(srcptr + src_idx);
tab[0] = v; __global float * dst = (__global float *)(dstptr + dst_idx);
tab[1] = v*(1.f - s); float4 src_pix = vload4(0, src);
tab[2] = v*(1.f - s*h);
tab[3] = v*(1.f - s*(1.f - h)); float h = src_pix.x, s = src_pix.y, v = src_pix.z;
float b, g, r;
b = tab[sector_data[sector][0]];
g = tab[sector_data[sector][1]]; if (s != 0)
r = tab[sector_data[sector][2]]; {
} float tab[4];
else int sector;
b = g = r = v; h *= hscale;
if(h < 0)
dst[bidx] = b; do h += 6; while (h < 0);
dst[1] = g; else if (h >= 6)
dst[bidx^2] = r; do h -= 6; while (h >= 6);
sector = convert_int_sat_rtn(h);
h -= sector;
if ((unsigned)sector >= 6u)
{
sector = 0;
h = 0.f;
}
tab[0] = v;
tab[1] = v*(1.f - s);
tab[2] = v*(1.f - s*h);
tab[3] = v*(1.f - s*(1.f - h));
b = tab[sector_data[sector][0]];
g = tab[sector_data[sector][1]];
r = tab[sector_data[sector][2]];
}
else
b = g = r = v;
dst[bidx] = b;
dst[1] = g;
dst[bidx^2] = r;
#if dcn == 4 #if dcn == 4
dst[3] = MAX_NUM; dst[3] = MAX_NUM;
#endif #endif
}
++y;
}
} }
} }
...@@ -807,44 +964,53 @@ __kernel void RGB2HLS(__global const uchar* src, int src_step, int src_offset, ...@@ -807,44 +964,53 @@ __kernel void RGB2HLS(__global const uchar* src, int src_step, int src_offset,
int rows, int cols) int rows, int cols)
{ {
int x = get_global_id(0); int x = get_global_id(0);
int y = get_global_id(1); int y = get_global_id(1) * PIX_PER_WI_Y;
if (y < rows && x < cols) if (x < cols)
{ {
int src_idx = mad24(y, src_step, src_offset + x * scnbytes); #pragma unroll
int dst_idx = mad24(y, dst_step, dst_offset + x * dcnbytes); for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
float b = src[src_idx + bidx]*(1/255.f), g = src[src_idx + 1]*(1/255.f), r = src[src_idx + (bidx^2)]*(1/255.f);
float h = 0.f, s = 0.f, l;
float vmin, vmax, diff;
vmax = vmin = r;
if (vmax < g) vmax = g;
if (vmax < b) vmax = b;
if (vmin > g) vmin = g;
if (vmin > b) vmin = b;
diff = vmax - vmin;
l = (vmax + vmin)*0.5f;
if (diff > FLT_EPSILON)
{ {
s = l < 0.5f ? diff/(vmax + vmin) : diff/(2 - vmax - vmin); if (y < rows)
diff = 60.f/diff; {
int src_idx = mad24(y, src_step, src_offset + x * scnbytes);
if( vmax == r ) int dst_idx = mad24(y, dst_step, dst_offset + x * dcnbytes);
h = (g - b)*diff; uchar4 src_pix = vload4(0, src + src_idx);
else if( vmax == g )
h = (b - r)*diff + 120.f; float b = src_pix.B_COMP*(1/255.f), g = src_pix.G_COMP*(1/255.f), r = src_pix.R_COMP*(1/255.f);
else float h = 0.f, s = 0.f, l;
h = (r - g)*diff + 240.f; float vmin, vmax, diff;
if( h < 0.f ) h += 360.f; vmax = vmin = r;
if (vmax < g) vmax = g;
if (vmax < b) vmax = b;
if (vmin > g) vmin = g;
if (vmin > b) vmin = b;
diff = vmax - vmin;
l = (vmax + vmin)*0.5f;
if (diff > FLT_EPSILON)
{
s = l < 0.5f ? diff/(vmax + vmin) : diff/(2 - vmax - vmin);
diff = 60.f/diff;
if( vmax == r )
h = (g - b)*diff;
else if( vmax == g )
h = (b - r)*diff + 120.f;
else
h = (r - g)*diff + 240.f;
if( h < 0.f ) h += 360.f;
}
dst[dst_idx] = convert_uchar_sat_rte(h*hscale);
dst[dst_idx + 1] = convert_uchar_sat_rte(l*255.f);
dst[dst_idx + 2] = convert_uchar_sat_rte(s*255.f);
}
++y;
} }
dst[dst_idx] = convert_uchar_sat_rte(h*hscale);
dst[dst_idx + 1] = convert_uchar_sat_rte(l*255.f);
dst[dst_idx + 2] = convert_uchar_sat_rte(s*255.f);
} }
} }
...@@ -853,50 +1019,59 @@ __kernel void HLS2RGB(__global const uchar* src, int src_step, int src_offset, ...@@ -853,50 +1019,59 @@ __kernel void HLS2RGB(__global const uchar* src, int src_step, int src_offset,
int rows, int cols) int rows, int cols)
{ {
int x = get_global_id(0); int x = get_global_id(0);
int y = get_global_id(1); int y = get_global_id(1) * PIX_PER_WI_Y;
if (y < rows && x < cols) if (x < cols)
{ {
int src_idx = mad24(y, src_step, src_offset + x * scnbytes); #pragma unroll
int dst_idx = mad24(y, dst_step, dst_offset + x * dcnbytes); for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
float h = src[src_idx], l = src[src_idx + 1]*(1.f/255.f), s = src[src_idx + 2]*(1.f/255.f);
float b, g, r;
if (s != 0)
{ {
float tab[4]; if (y < rows)
{
float p2 = l <= 0.5f ? l*(1 + s) : l + s - l*s; int src_idx = mad24(y, src_step, src_offset + x * scnbytes);
float p1 = 2*l - p2; int dst_idx = mad24(y, dst_step, dst_offset + x * dcnbytes);
uchar4 src_pix = vload4(0, src + src_idx);
h *= hscale;
if( h < 0 ) float h = src_pix.x, l = src_pix.y*(1.f/255.f), s = src_pix.z*(1.f/255.f);
do h += 6; while( h < 0 ); float b, g, r;
else if( h >= 6 )
do h -= 6; while( h >= 6 ); if (s != 0)
{
int sector = convert_int_sat_rtn(h); float tab[4];
h -= sector;
float p2 = l <= 0.5f ? l*(1 + s) : l + s - l*s;
tab[0] = p2; float p1 = 2*l - p2;
tab[1] = p1;
tab[2] = p1 + (p2 - p1)*(1-h); h *= hscale;
tab[3] = p1 + (p2 - p1)*h; if( h < 0 )
do h += 6; while( h < 0 );
b = tab[sector_data[sector][0]]; else if( h >= 6 )
g = tab[sector_data[sector][1]]; do h -= 6; while( h >= 6 );
r = tab[sector_data[sector][2]];
} int sector = convert_int_sat_rtn(h);
else h -= sector;
b = g = r = l;
tab[0] = p2;
dst[dst_idx + bidx] = convert_uchar_sat_rte(b*255.f); tab[1] = p1;
dst[dst_idx + 1] = convert_uchar_sat_rte(g*255.f); tab[2] = p1 + (p2 - p1)*(1-h);
dst[dst_idx + (bidx^2)] = convert_uchar_sat_rte(r*255.f); tab[3] = p1 + (p2 - p1)*h;
b = tab[sector_data[sector][0]];
g = tab[sector_data[sector][1]];
r = tab[sector_data[sector][2]];
}
else
b = g = r = l;
dst[dst_idx + bidx] = convert_uchar_sat_rte(b*255.f);
dst[dst_idx + 1] = convert_uchar_sat_rte(g*255.f);
dst[dst_idx + (bidx^2)] = convert_uchar_sat_rte(r*255.f);
#if dcn == 4 #if dcn == 4
dst[dst_idx + 3] = MAX_NUM; dst[dst_idx + 3] = MAX_NUM;
#endif #endif
}
++y;
}
} }
} }
...@@ -907,47 +1082,56 @@ __kernel void RGB2HLS(__global const uchar* srcptr, int src_step, int src_offset ...@@ -907,47 +1082,56 @@ __kernel void RGB2HLS(__global const uchar* srcptr, int src_step, int src_offset
int rows, int cols) int rows, int cols)
{ {
int x = get_global_id(0); int x = get_global_id(0);
int y = get_global_id(1); int y = get_global_id(1) * PIX_PER_WI_Y;
if (y < rows && x < cols) if (x < cols)
{ {
int src_idx = mad24(y, src_step, src_offset + x * scnbytes); #pragma unroll
int dst_idx = mad24(y, dst_step, dst_offset + x * dcnbytes); for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
__global const float * src = (__global const float *)(srcptr + src_idx);
__global float * dst = (__global float *)(dstptr + dst_idx);
float b = src[bidx], g = src[1], r = src[bidx^2];
float h = 0.f, s = 0.f, l;
float vmin, vmax, diff;
vmax = vmin = r;
if (vmax < g) vmax = g;
if (vmax < b) vmax = b;
if (vmin > g) vmin = g;
if (vmin > b) vmin = b;
diff = vmax - vmin;
l = (vmax + vmin)*0.5f;
if (diff > FLT_EPSILON)
{ {
s = l < 0.5f ? diff/(vmax + vmin) : diff/(2 - vmax - vmin); if (y < rows)
diff = 60.f/diff; {
int src_idx = mad24(y, src_step, src_offset + x * scnbytes);
if( vmax == r ) int dst_idx = mad24(y, dst_step, dst_offset + x * dcnbytes);
h = (g - b)*diff;
else if( vmax == g ) __global const float * src = (__global const float *)(srcptr + src_idx);
h = (b - r)*diff + 120.f; __global float * dst = (__global float *)(dstptr + dst_idx);
else float4 src_pix = vload4(0, src);
h = (r - g)*diff + 240.f;
float b = src_pix.B_COMP, g = src_pix.G_COMP, r = src_pix.R_COMP;
if( h < 0.f ) h += 360.f; float h = 0.f, s = 0.f, l;
float vmin, vmax, diff;
vmax = vmin = r;
if (vmax < g) vmax = g;
if (vmax < b) vmax = b;
if (vmin > g) vmin = g;
if (vmin > b) vmin = b;
diff = vmax - vmin;
l = (vmax + vmin)*0.5f;
if (diff > FLT_EPSILON)
{
s = l < 0.5f ? diff/(vmax + vmin) : diff/(2 - vmax - vmin);
diff = 60.f/diff;
if( vmax == r )
h = (g - b)*diff;
else if( vmax == g )
h = (b - r)*diff + 120.f;
else
h = (r - g)*diff + 240.f;
if( h < 0.f ) h += 360.f;
}
dst[0] = h*hscale;
dst[1] = l;
dst[2] = s;
}
++y;
} }
dst[0] = h*hscale;
dst[1] = l;
dst[2] = s;
} }
} }
...@@ -956,54 +1140,63 @@ __kernel void HLS2RGB(__global const uchar* srcptr, int src_step, int src_offset ...@@ -956,54 +1140,63 @@ __kernel void HLS2RGB(__global const uchar* srcptr, int src_step, int src_offset
int rows, int cols) int rows, int cols)
{ {
int x = get_global_id(0); int x = get_global_id(0);
int y = get_global_id(1); int y = get_global_id(1) * PIX_PER_WI_Y;
if (y < rows && x < cols) if (x < cols)
{ {
int src_idx = mad24(y, src_step, src_offset + x * scnbytes); #pragma unroll
int dst_idx = mad24(y, dst_step, dst_offset + x * dcnbytes); for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
__global const float * src = (__global const float *)(srcptr + src_idx);
__global float * dst = (__global float *)(dstptr + dst_idx);
float h = src[0], l = src[1], s = src[2];
float b, g, r;
if (s != 0)
{ {
float tab[4]; if (y < rows)
int sector; {
int src_idx = mad24(y, src_step, src_offset + x * scnbytes);
float p2 = l <= 0.5f ? l*(1 + s) : l + s - l*s; int dst_idx = mad24(y, dst_step, dst_offset + x * dcnbytes);
float p1 = 2*l - p2;
__global const float * src = (__global const float *)(srcptr + src_idx);
h *= hscale; __global float * dst = (__global float *)(dstptr + dst_idx);
if( h < 0 ) float4 src_pix = vload4(0, src);
do h += 6; while( h < 0 );
else if( h >= 6 ) float h = src_pix.x, l = src_pix.y, s = src_pix.z;
do h -= 6; while( h >= 6 ); float b, g, r;
sector = convert_int_sat_rtn(h); if (s != 0)
h -= sector; {
float tab[4];
tab[0] = p2; int sector;
tab[1] = p1;
tab[2] = p1 + (p2 - p1)*(1-h); float p2 = l <= 0.5f ? l*(1 + s) : l + s - l*s;
tab[3] = p1 + (p2 - p1)*h; float p1 = 2*l - p2;
b = tab[sector_data[sector][0]]; h *= hscale;
g = tab[sector_data[sector][1]]; if( h < 0 )
r = tab[sector_data[sector][2]]; do h += 6; while( h < 0 );
} else if( h >= 6 )
else do h -= 6; while( h >= 6 );
b = g = r = l;
sector = convert_int_sat_rtn(h);
dst[bidx] = b; h -= sector;
dst[1] = g;
dst[bidx^2] = r; tab[0] = p2;
tab[1] = p1;
tab[2] = p1 + (p2 - p1)*(1-h);
tab[3] = p1 + (p2 - p1)*h;
b = tab[sector_data[sector][0]];
g = tab[sector_data[sector][1]];
r = tab[sector_data[sector][2]];
}
else
b = g = r = l;
dst[bidx] = b;
dst[1] = g;
dst[bidx^2] = r;
#if dcn == 4 #if dcn == 4
dst[3] = MAX_NUM; dst[3] = MAX_NUM;
#endif #endif
}
++y;
}
} }
} }
...@@ -1018,21 +1211,29 @@ __kernel void RGBA2mRGBA(__global const uchar* src, int src_step, int src_offset ...@@ -1018,21 +1211,29 @@ __kernel void RGBA2mRGBA(__global const uchar* src, int src_step, int src_offset
int rows, int cols) int rows, int cols)
{ {
int x = get_global_id(0); int x = get_global_id(0);
int y = get_global_id(1); int y = get_global_id(1) * PIX_PER_WI_Y;
if (y < rows && x < cols) if (x < cols)
{ {
x <<= 2; #pragma unroll
int src_idx = mad24(y, src_step, src_offset + x); for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
int dst_idx = mad24(y, dst_step, dst_offset + x); {
if (y < rows)
{
int src_idx = mad24(y, src_step, src_offset + (x << 2));
int dst_idx = mad24(y, dst_step, dst_offset + (x << 2));
uchar4 src_pix = vload4(0, src + src_idx);
uchar v0 = src[src_idx], v1 = src[src_idx + 1]; uchar v0 = src_pix.x, v1 = src_pix.y;
uchar v2 = src[src_idx + 2], v3 = src[src_idx + 3]; uchar v2 = src_pix.z, v3 = src_pix.w;
dst[dst_idx] = (v0 * v3 + HALF_MAX) / MAX_NUM; dst[dst_idx] = (v0 * v3 + HALF_MAX) / MAX_NUM;
dst[dst_idx + 1] = (v1 * v3 + HALF_MAX) / MAX_NUM; dst[dst_idx + 1] = (v1 * v3 + HALF_MAX) / MAX_NUM;
dst[dst_idx + 2] = (v2 * v3 + HALF_MAX) / MAX_NUM; dst[dst_idx + 2] = (v2 * v3 + HALF_MAX) / MAX_NUM;
dst[dst_idx + 3] = v3; dst[dst_idx + 3] = v3;
}
++y;
}
} }
} }
...@@ -1041,22 +1242,30 @@ __kernel void mRGBA2RGBA(__global const uchar* src, int src_step, int src_offset ...@@ -1041,22 +1242,30 @@ __kernel void mRGBA2RGBA(__global const uchar* src, int src_step, int src_offset
int rows, int cols) int rows, int cols)
{ {
int x = get_global_id(0); int x = get_global_id(0);
int y = get_global_id(1); int y = get_global_id(1) * PIX_PER_WI_Y;
if (y < rows && x < cols) if (x < cols)
{ {
x <<= 2; #pragma unroll
int src_idx = mad24(y, src_step, src_offset + x); for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
int dst_idx = mad24(y, dst_step, dst_offset + x); {
if (y < rows)
uchar v0 = src[src_idx], v1 = src[src_idx + 1]; {
uchar v2 = src[src_idx + 2], v3 = src[src_idx + 3]; int src_idx = mad24(y, src_step, src_offset + (x << 2));
uchar v3_half = v3 / 2; int dst_idx = mad24(y, dst_step, dst_offset + (x << 2));
uchar4 src_pix = vload4(0, src + src_idx);
dst[dst_idx] = v3 == 0 ? 0 : (v0 * MAX_NUM + v3_half) / v3;
dst[dst_idx + 1] = v3 == 0 ? 0 : (v1 * MAX_NUM + v3_half) / v3; uchar v0 = src_pix.x, v1 = src_pix.y;
dst[dst_idx + 2] = v3 == 0 ? 0 : (v2 * MAX_NUM + v3_half) / v3; uchar v2 = src_pix.z, v3 = src_pix.w;
dst[dst_idx + 3] = v3; uchar v3_half = v3 / 2;
dst[dst_idx] = v3 == 0 ? 0 : (v0 * MAX_NUM + v3_half) / v3;
dst[dst_idx + 1] = v3 == 0 ? 0 : (v1 * MAX_NUM + v3_half) / v3;
dst[dst_idx + 2] = v3 == 0 ? 0 : (v2 * MAX_NUM + v3_half) / v3;
dst[dst_idx + 3] = v3;
}
++y;
}
} }
} }
...@@ -1086,32 +1295,41 @@ __kernel void BGR2Lab(__global const uchar * src, int src_step, int src_offset, ...@@ -1086,32 +1295,41 @@ __kernel void BGR2Lab(__global const uchar * src, int src_step, int src_offset,
__constant int * coeffs, int Lscale, int Lshift) __constant int * coeffs, int Lscale, int Lshift)
{ {
int x = get_global_id(0); int x = get_global_id(0);
int y = get_global_id(1); int y = get_global_id(1) * PIX_PER_WI_Y;
if (y < rows && x < cols) if (x < cols)
{ {
int src_idx = mad24(y, src_step, src_offset + x * scnbytes); #pragma unroll
int dst_idx = mad24(y, dst_step, dst_offset + x * dcnbytes); for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
{
if (y < rows)
{
int src_idx = mad24(y, src_step, src_offset + x * scnbytes);
int dst_idx = mad24(y, dst_step, dst_offset + x * dcnbytes);
src += src_idx; __global const uchar* src_ptr = src + src_idx;
dst += dst_idx; __global uchar* dst_ptr = dst + dst_idx;
uchar4 src_pix = vload4(0, src_ptr);
int C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2], int C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2],
C3 = coeffs[3], C4 = coeffs[4], C5 = coeffs[5], C3 = coeffs[3], C4 = coeffs[4], C5 = coeffs[5],
C6 = coeffs[6], C7 = coeffs[7], C8 = coeffs[8]; C6 = coeffs[6], C7 = coeffs[7], C8 = coeffs[8];
int R = gammaTab[src[0]], G = gammaTab[src[1]], B = gammaTab[src[2]]; int R = gammaTab[src_pix.x], G = gammaTab[src_pix.y], B = gammaTab[src_pix.z];
int fX = LabCbrtTab_b[CV_DESCALE(R*C0 + G*C1 + B*C2, lab_shift)]; int fX = LabCbrtTab_b[CV_DESCALE(R*C0 + G*C1 + B*C2, lab_shift)];
int fY = LabCbrtTab_b[CV_DESCALE(R*C3 + G*C4 + B*C5, lab_shift)]; int fY = LabCbrtTab_b[CV_DESCALE(R*C3 + G*C4 + B*C5, lab_shift)];
int fZ = LabCbrtTab_b[CV_DESCALE(R*C6 + G*C7 + B*C8, lab_shift)]; int fZ = LabCbrtTab_b[CV_DESCALE(R*C6 + G*C7 + B*C8, lab_shift)];
int L = CV_DESCALE( Lscale*fY + Lshift, lab_shift2 ); int L = CV_DESCALE( Lscale*fY + Lshift, lab_shift2 );
int a = CV_DESCALE( 500*(fX - fY) + 128*(1 << lab_shift2), lab_shift2 ); int a = CV_DESCALE( 500*(fX - fY) + 128*(1 << lab_shift2), lab_shift2 );
int b = CV_DESCALE( 200*(fY - fZ) + 128*(1 << lab_shift2), lab_shift2 ); int b = CV_DESCALE( 200*(fY - fZ) + 128*(1 << lab_shift2), lab_shift2 );
dst[0] = SAT_CAST(L); dst_ptr[0] = SAT_CAST(L);
dst[1] = SAT_CAST(a); dst_ptr[1] = SAT_CAST(a);
dst[2] = SAT_CAST(b); dst_ptr[2] = SAT_CAST(b);
}
++y;
}
} }
} }
...@@ -1125,45 +1343,54 @@ __kernel void BGR2Lab(__global const uchar * srcptr, int src_step, int src_offse ...@@ -1125,45 +1343,54 @@ __kernel void BGR2Lab(__global const uchar * srcptr, int src_step, int src_offse
__constant float * coeffs, float _1_3, float _a) __constant float * coeffs, float _1_3, float _a)
{ {
int x = get_global_id(0); int x = get_global_id(0);
int y = get_global_id(1); int y = get_global_id(1) * PIX_PER_WI_Y;
if (y < rows && x < cols) if (x < cols)
{ {
int src_idx = mad24(y, src_step, src_offset + x * scnbytes); #pragma unroll
int dst_idx = mad24(y, dst_step, dst_offset + x * dcnbytes); for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
{
if (y < rows)
{
int src_idx = mad24(y, src_step, src_offset + x * scnbytes);
int dst_idx = mad24(y, dst_step, dst_offset + x * dcnbytes);
__global const float * src = (__global const float *)(srcptr + src_idx); __global const float * src = (__global const float *)(srcptr + src_idx);
__global float * dst = (__global float *)(dstptr + dst_idx); __global float * dst = (__global float *)(dstptr + dst_idx);
float4 src_pix = vload4(0, src);
float C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2], float C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2],
C3 = coeffs[3], C4 = coeffs[4], C5 = coeffs[5], C3 = coeffs[3], C4 = coeffs[4], C5 = coeffs[5],
C6 = coeffs[6], C7 = coeffs[7], C8 = coeffs[8]; C6 = coeffs[6], C7 = coeffs[7], C8 = coeffs[8];
float R = clamp(src[0], 0.0f, 1.0f); float R = clamp(src_pix.x, 0.0f, 1.0f);
float G = clamp(src[1], 0.0f, 1.0f); float G = clamp(src_pix.y, 0.0f, 1.0f);
float B = clamp(src[2], 0.0f, 1.0f); float B = clamp(src_pix.z, 0.0f, 1.0f);
#ifdef SRGB #ifdef SRGB
R = splineInterpolate(R * GammaTabScale, gammaTab, GAMMA_TAB_SIZE); R = splineInterpolate(R * GammaTabScale, gammaTab, GAMMA_TAB_SIZE);
G = splineInterpolate(G * GammaTabScale, gammaTab, GAMMA_TAB_SIZE); G = splineInterpolate(G * GammaTabScale, gammaTab, GAMMA_TAB_SIZE);
B = splineInterpolate(B * GammaTabScale, gammaTab, GAMMA_TAB_SIZE); B = splineInterpolate(B * GammaTabScale, gammaTab, GAMMA_TAB_SIZE);
#endif #endif
float X = R*C0 + G*C1 + B*C2; float X = R*C0 + G*C1 + B*C2;
float Y = R*C3 + G*C4 + B*C5; float Y = R*C3 + G*C4 + B*C5;
float Z = R*C6 + G*C7 + B*C8; float Z = R*C6 + G*C7 + B*C8;
float FX = X > 0.008856f ? pow(X, _1_3) : (7.787f * X + _a); float FX = X > 0.008856f ? pow(X, _1_3) : (7.787f * X + _a);
float FY = Y > 0.008856f ? pow(Y, _1_3) : (7.787f * Y + _a); float FY = Y > 0.008856f ? pow(Y, _1_3) : (7.787f * Y + _a);
float FZ = Z > 0.008856f ? pow(Z, _1_3) : (7.787f * Z + _a); float FZ = Z > 0.008856f ? pow(Z, _1_3) : (7.787f * Z + _a);
float L = Y > 0.008856f ? (116.f * FY - 16.f) : (903.3f * Y); float L = Y > 0.008856f ? (116.f * FY - 16.f) : (903.3f * Y);
float a = 500.f * (FX - FY); float a = 500.f * (FX - FY);
float b = 200.f * (FY - FZ); float b = 200.f * (FY - FZ);
dst[0] = L; dst[0] = L;
dst[1] = a; dst[1] = a;
dst[2] = b; dst[2] = b;
}
++y;
}
} }
} }
...@@ -1225,33 +1452,42 @@ __kernel void Lab2BGR(__global const uchar * src, int src_step, int src_offset, ...@@ -1225,33 +1452,42 @@ __kernel void Lab2BGR(__global const uchar * src, int src_step, int src_offset,
__constant float * coeffs, float lThresh, float fThresh) __constant float * coeffs, float lThresh, float fThresh)
{ {
int x = get_global_id(0); int x = get_global_id(0);
int y = get_global_id(1); int y = get_global_id(1) * PIX_PER_WI_Y;
if (y < rows && x < cols) if (x < cols)
{ {
int src_idx = mad24(y, src_step, src_offset + x * scnbytes); #pragma unroll
int dst_idx = mad24(y, dst_step, dst_offset + x * dcnbytes); for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
{
if (y < rows)
{
int src_idx = mad24(y, src_step, src_offset + x * scnbytes);
int dst_idx = mad24(y, dst_step, dst_offset + x * dcnbytes);
src += src_idx; __global const uchar* src_ptr = src + src_idx;
dst += dst_idx; __global uchar* dst_ptr = dst + dst_idx;
uchar4 src_pix = vload4(0, src_ptr);
float srcbuf[3], dstbuf[3]; float srcbuf[3], dstbuf[3];
srcbuf[0] = src[0]*(100.f/255.f); srcbuf[0] = src_pix.x*(100.f/255.f);
srcbuf[1] = convert_float(src[1] - 128); srcbuf[1] = convert_float(src_pix.y - 128);
srcbuf[2] = convert_float(src[2] - 128); srcbuf[2] = convert_float(src_pix.z - 128);
Lab2BGR_f(&srcbuf[0], &dstbuf[0], Lab2BGR_f(&srcbuf[0], &dstbuf[0],
#ifdef SRGB #ifdef SRGB
gammaTab, gammaTab,
#endif #endif
coeffs, lThresh, fThresh); coeffs, lThresh, fThresh);
dst[0] = SAT_CAST(dstbuf[0] * 255.0f); dst_ptr[0] = SAT_CAST(dstbuf[0] * 255.0f);
dst[1] = SAT_CAST(dstbuf[1] * 255.0f); dst_ptr[1] = SAT_CAST(dstbuf[1] * 255.0f);
dst[2] = SAT_CAST(dstbuf[2] * 255.0f); dst_ptr[2] = SAT_CAST(dstbuf[2] * 255.0f);
#if dcn == 4 #if dcn == 4
dst[3] = MAX_NUM; dst_ptr[3] = MAX_NUM;
#endif #endif
}
++y;
}
} }
} }
...@@ -1265,29 +1501,38 @@ __kernel void Lab2BGR(__global const uchar * srcptr, int src_step, int src_offse ...@@ -1265,29 +1501,38 @@ __kernel void Lab2BGR(__global const uchar * srcptr, int src_step, int src_offse
__constant float * coeffs, float lThresh, float fThresh) __constant float * coeffs, float lThresh, float fThresh)
{ {
int x = get_global_id(0); int x = get_global_id(0);
int y = get_global_id(1); int y = get_global_id(1) * PIX_PER_WI_Y;
if (y < rows && x < cols) if (x < cols)
{ {
int src_idx = mad24(y, src_step, src_offset + x * scnbytes); #pragma unroll
int dst_idx = mad24(y, dst_step, dst_offset + x * dcnbytes); for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
{
if (y < rows)
{
int src_idx = mad24(y, src_step, src_offset + x * scnbytes);
int dst_idx = mad24(y, dst_step, dst_offset + x * dcnbytes);
__global const float * src = (__global const float *)(srcptr + src_idx); __global const float * src = (__global const float *)(srcptr + src_idx);
__global float * dst = (__global float *)(dstptr + dst_idx); __global float * dst = (__global float *)(dstptr + dst_idx);
float4 src_pix = vload4(0, src);
float srcbuf[3], dstbuf[3]; float srcbuf[3], dstbuf[3];
srcbuf[0] = src[0], srcbuf[1] = src[1], srcbuf[2] = src[2]; srcbuf[0] = src_pix.x, srcbuf[1] = src_pix.y, srcbuf[2] = src_pix.z;
Lab2BGR_f(&srcbuf[0], &dstbuf[0], Lab2BGR_f(&srcbuf[0], &dstbuf[0],
#ifdef SRGB #ifdef SRGB
gammaTab, gammaTab,
#endif #endif
coeffs, lThresh, fThresh); coeffs, lThresh, fThresh);
dst[0] = dstbuf[0], dst[1] = dstbuf[1], dst[2] = dstbuf[2]; dst[0] = dstbuf[0], dst[1] = dstbuf[1], dst[2] = dstbuf[2];
#if dcn == 4 #if dcn == 4
dst[3] = MAX_NUM; dst[3] = MAX_NUM;
#endif #endif
}
++y;
}
} }
} }
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment