Commit 554eae56 authored by Rostislav Vasilikhin, committed by Alexander Alekhin

Merge pull request #13708 from savuor:yuv42x_wide

YUV42x color conversions rewritten to wide intrinsics (#13708)

* a*b+c -> fma

* YUV420sp2RGB initially vectorized

* shorter var names

* loops by 4

* yuv420p2rgb vectorized

* yuv422toRGB vectorized

* reg arrays

* rgb2yuv420 vectorized

* warnings fixed

* try to fix align error
parent 4e66d078
......@@ -123,7 +123,7 @@ struct RGB2YCrCb_f<float>
}
v_float32 y, cr, cb;
y = b*vc0 + g*vc1 + r*vc2;
y = v_fma(b, vc0, v_fma(g, vc1, r*vc2));
if(bidx)
std::swap(r, b);
......@@ -963,22 +963,22 @@ struct YCrCb2RGB_i<ushort>
///////////////////////////////////// YUV420 -> RGB /////////////////////////////////////
const int ITUR_BT_601_CY = 1220542;
const int ITUR_BT_601_CUB = 2116026;
const int ITUR_BT_601_CUG = -409993;
const int ITUR_BT_601_CVG = -852492;
const int ITUR_BT_601_CVR = 1673527;
const int ITUR_BT_601_SHIFT = 20;
static const int ITUR_BT_601_CY = 1220542;
static const int ITUR_BT_601_CUB = 2116026;
static const int ITUR_BT_601_CUG = -409993;
static const int ITUR_BT_601_CVG = -852492;
static const int ITUR_BT_601_CVR = 1673527;
static const int ITUR_BT_601_SHIFT = 20;
// Coefficients for RGB to YUV420p conversion
const int ITUR_BT_601_CRY = 269484;
const int ITUR_BT_601_CGY = 528482;
const int ITUR_BT_601_CBY = 102760;
const int ITUR_BT_601_CRU = -155188;
const int ITUR_BT_601_CGU = -305135;
const int ITUR_BT_601_CBU = 460324;
const int ITUR_BT_601_CGV = -385875;
const int ITUR_BT_601_CBV = -74448;
static const int ITUR_BT_601_CRY = 269484;
static const int ITUR_BT_601_CGY = 528482;
static const int ITUR_BT_601_CBY = 102760;
static const int ITUR_BT_601_CRU = -155188;
static const int ITUR_BT_601_CGU = -305135;
static const int ITUR_BT_601_CBU = 460324;
static const int ITUR_BT_601_CGV = -385875;
static const int ITUR_BT_601_CBV = -74448;
//R = 1.164(Y - 16) + 1.596(V - 128)
//G = 1.164(Y - 16) - 0.813(V - 128) - 0.391(U - 128)
......@@ -988,49 +988,146 @@ const int ITUR_BT_601_CBV = -74448;
//G = (1220542(Y - 16) - 852492(V - 128) - 409993(U - 128) + (1 << 19)) >> 20
//B = (1220542(Y - 16) + 2116026(U - 128) + (1 << 19)) >> 20
// Scalar helper: computes the chroma-only part of the fixed-point YUV -> RGB
// formulas for one (u, v) pair. Each output already contains the rounding
// term (1 << (ITUR_BT_601_SHIFT - 1)), so a caller only has to add the scaled
// luma and shift right by ITUR_BT_601_SHIFT.
//   u, v          - chroma samples, re-centred here by subtracting 128
//   ruv, guv, buv - [out] chroma contribution to R, G and B respectively
static inline void uvToRGBuv(const uchar u, const uchar v, int& ruv, int& guv, int& buv)
{
    const int rounding = 1 << (ITUR_BT_601_SHIFT - 1);
    const int du = int(u) - 128;
    const int dv = int(v) - 128;

    ruv = rounding + ITUR_BT_601_CVR * dv;
    guv = rounding + ITUR_BT_601_CVG * dv + ITUR_BT_601_CUG * du;
    buv = rounding + ITUR_BT_601_CUB * du;
}
static inline void uvToRGBuv(const v_uint8& u, const v_uint8& v,
v_int32 (&ruv)[4],
v_int32 (&guv)[4],
v_int32 (&buv)[4])
{
v_uint8 v128 = vx_setall_u8(128);
v_int8 su = v_reinterpret_as_s8(v_sub_wrap(u, v128));
v_int8 sv = v_reinterpret_as_s8(v_sub_wrap(v, v128));
v_int16 uu0, uu1, vv0, vv1;
v_expand(su, uu0, uu1);
v_expand(sv, vv0, vv1);
v_int32 uu[4], vv[4];
v_expand(uu0, uu[0], uu[1]); v_expand(uu1, uu[2], uu[3]);
v_expand(vv0, vv[0], vv[1]); v_expand(vv1, vv[2], vv[3]);
v_int32 vshift = vx_setall_s32(1 << (ITUR_BT_601_SHIFT - 1));
v_int32 vr = vx_setall_s32(ITUR_BT_601_CVR);
v_int32 vg = vx_setall_s32(ITUR_BT_601_CVG);
v_int32 ug = vx_setall_s32(ITUR_BT_601_CUG);
v_int32 ub = vx_setall_s32(ITUR_BT_601_CUB);
for (int k = 0; k < 4; k++)
{
ruv[k] = vshift + vr * vv[k];
guv[k] = vshift + vg * vv[k] + ug * uu[k];
buv[k] = vshift + ub * uu[k];
}
}
// Scalar helper: combines one luma sample with the chroma terms produced by
// uvToRGBuv and writes the resulting 8-bit pixel.
//   vy            - luma sample; values below 16 are clamped to 16 before scaling
//   ruv, guv, buv - chroma contributions (rounding term already included)
//   r, g, b       - [out] colour channels, saturated to uchar
//   a             - [out] alpha, always set to 0xff
static inline void yRGBuvToRGBA(const uchar vy, const int ruv, const int guv, const int buv,
                                uchar& r, uchar& g, uchar& b, uchar& a)
{
    const int scaledLuma = std::max(0, int(vy) - 16) * ITUR_BT_601_CY;

    a = uchar(0xff);
    r = saturate_cast<uchar>((scaledLuma + ruv) >> ITUR_BT_601_SHIFT);
    g = saturate_cast<uchar>((scaledLuma + guv) >> ITUR_BT_601_SHIFT);
    b = saturate_cast<uchar>((scaledLuma + buv) >> ITUR_BT_601_SHIFT);
}
// Vector overload of yRGBuvToRGBA: combines a register of luma samples with the
// chroma terms produced by the vector uvToRGBuv and yields 8-bit R, G, B registers.
// Unlike the scalar overload it has no alpha output.
//   vy            - register of 8-bit luma samples
//   ruv, guv, buv - chroma contributions, four 32-bit registers per channel
//   rr, gg, bb    - [out] colour channels packed back to 8 bits
static inline void yRGBuvToRGBA(const v_uint8& vy,
                                const v_int32 (&ruv)[4],
                                const v_int32 (&guv)[4],
                                const v_int32 (&buv)[4],
                                v_uint8& rr, v_uint8& gg, v_uint8& bb)
{
    // 8-bit subtraction of universal intrinsics saturates at 0, which plays the
    // role of std::max(0, y - 16) in the scalar overload
    v_uint8 v16 = vx_setall_u8(16);
    v_uint8 posY = vy - v16;

    // widen 8 -> 16 -> 32 bits; the values fit into the positive int16 range,
    // so reinterpreting as signed before the second expand is safe
    v_uint16 yy0, yy1;
    v_expand(posY, yy0, yy1);
    v_int32 yy[4];
    v_expand(v_reinterpret_as_s16(yy0), yy[0], yy[1]);
    v_expand(v_reinterpret_as_s16(yy1), yy[2], yy[3]);

    v_int32 vcy = vx_setall_s32(ITUR_BT_601_CY);

    v_int32 y[4], r[4], g[4], b[4];
    for(int k = 0; k < 4; k++)
    {
        y[k] = yy[k]*vcy;
        r[k] = (y[k] + ruv[k]) >> ITUR_BT_601_SHIFT;
        g[k] = (y[k] + guv[k]) >> ITUR_BT_601_SHIFT;
        b[k] = (y[k] + buv[k]) >> ITUR_BT_601_SHIFT;
    }

    // narrow 32 -> 16 bits (signed) ...
    v_int16 r0, r1, g0, g1, b0, b1;
    r0 = v_pack(r[0], r[1]);
    r1 = v_pack(r[2], r[3]);
    g0 = v_pack(g[0], g[1]);
    g1 = v_pack(g[2], g[3]);
    b0 = v_pack(b[0], b[1]);
    b1 = v_pack(b[2], b[3]);

    // ... then 16 -> 8 bits with unsigned saturation
    rr = v_pack_u(r0, r1);
    gg = v_pack_u(g0, g1);
    bb = v_pack_u(b0, b1);
}
template<int bIdx, int dcn, bool is420>
static inline void cvtYuv42xxp2RGB8(int u, int v, int vy01, int vy11, int vy02, int vy12,
static inline void cvtYuv42xxp2RGB8(const uchar u, const uchar v,
const uchar vy01, const uchar vy11, const uchar vy02, const uchar vy12,
uchar* row1, uchar* row2)
{
u = u - 128;
v = v - 128;
int ruv, guv, buv;
uvToRGBuv(u, v, ruv, guv, buv);
int ruv = (1 << (ITUR_BT_601_SHIFT - 1)) + ITUR_BT_601_CVR * v;
int guv = (1 << (ITUR_BT_601_SHIFT - 1)) + ITUR_BT_601_CVG * v + ITUR_BT_601_CUG * u;
int buv = (1 << (ITUR_BT_601_SHIFT - 1)) + ITUR_BT_601_CUB * u;
uchar r00, g00, b00, a00;
uchar r01, g01, b01, a01;
int y00 = std::max(0, vy01 - 16) * ITUR_BT_601_CY;
row1[2-bIdx] = saturate_cast<uchar>((y00 + ruv) >> ITUR_BT_601_SHIFT);
row1[1] = saturate_cast<uchar>((y00 + guv) >> ITUR_BT_601_SHIFT);
row1[bIdx] = saturate_cast<uchar>((y00 + buv) >> ITUR_BT_601_SHIFT);
yRGBuvToRGBA(vy01, ruv, guv, buv, r00, g00, b00, a00);
yRGBuvToRGBA(vy11, ruv, guv, buv, r01, g01, b01, a01);
row1[2-bIdx] = r00;
row1[1] = g00;
row1[bIdx] = b00;
if(dcn == 4)
row1[3] = uchar(0xff);
row1[3] = a00;
int y01 = std::max(0, vy11 - 16) * ITUR_BT_601_CY;
row1[dcn+2-bIdx] = saturate_cast<uchar>((y01 + ruv) >> ITUR_BT_601_SHIFT);
row1[dcn+1] = saturate_cast<uchar>((y01 + guv) >> ITUR_BT_601_SHIFT);
row1[dcn+0+bIdx] = saturate_cast<uchar>((y01 + buv) >> ITUR_BT_601_SHIFT);
row1[dcn+2-bIdx] = r01;
row1[dcn+1] = g01;
row1[dcn+0+bIdx] = b01;
if(dcn == 4)
row1[7] = uchar(0xff);
row1[7] = a01;
if(is420)
{
int y10 = std::max(0, vy02 - 16) * ITUR_BT_601_CY;
row2[2-bIdx] = saturate_cast<uchar>((y10 + ruv) >> ITUR_BT_601_SHIFT);
row2[1] = saturate_cast<uchar>((y10 + guv) >> ITUR_BT_601_SHIFT);
row2[bIdx] = saturate_cast<uchar>((y10 + buv) >> ITUR_BT_601_SHIFT);
uchar r10, g10, b10, a10;
uchar r11, g11, b11, a11;
yRGBuvToRGBA(vy02, ruv, guv, buv, r10, g10, b10, a10);
yRGBuvToRGBA(vy12, ruv, guv, buv, r11, g11, b11, a11);
row2[2-bIdx] = r10;
row2[1] = g10;
row2[bIdx] = b10;
if(dcn == 4)
row2[3] = uchar(0xff);
row2[3] = a10;
int y11 = std::max(0, vy12 - 16) * ITUR_BT_601_CY;
row2[dcn+2-bIdx] = saturate_cast<uchar>((y11 + ruv) >> ITUR_BT_601_SHIFT);
row2[dcn+1] = saturate_cast<uchar>((y11 + guv) >> ITUR_BT_601_SHIFT);
row2[dcn+0+bIdx] = saturate_cast<uchar>((y11 + buv) >> ITUR_BT_601_SHIFT);
row2[dcn+2-bIdx] = r11;
row2[dcn+1] = g11;
row2[dcn+0+bIdx] = b11;
if(dcn == 4)
row2[7] = uchar(0xff);
row2[7] = a11;
}
}
// bIdx is 0 or 2, uIdx is 0 or 1, dcn is 3 or 4
template<int bIdx, int uIdx, int dcn>
struct YUV420sp2RGB8Invoker : ParallelLoopBody
{
......@@ -1056,15 +1153,80 @@ struct YUV420sp2RGB8Invoker : ParallelLoopBody
uchar* row2 = dst_data + dst_step * (j + 1);
const uchar* y2 = y1 + stride;
for (int i = 0; i < width; i += 2, row1 += dcn*2, row2 += dcn*2)
int i = 0;
#if CV_SIMD
const int vsize = v_uint8::nlanes;
v_uint8 a = vx_setall_u8(uchar(0xff));
for( ; i <= width - 2*vsize;
i += 2*vsize, row1 += vsize*dcn*2, row2 += vsize*dcn*2)
{
int u = int(uv[i + 0 + uIdx]);
int v = int(uv[i + 1 - uIdx]);
v_uint8 u, v;
v_load_deinterleave(uv + i, u, v);
if(uIdx)
{
swap(u, v);
}
v_uint8 vy[4];
v_load_deinterleave(y1 + i, vy[0], vy[1]);
v_load_deinterleave(y2 + i, vy[2], vy[3]);
v_int32 ruv[4], guv[4], buv[4];
uvToRGBuv(u, v, ruv, guv, buv);
int vy01 = int(y1[i]);
int vy11 = int(y1[i + 1]);
int vy02 = int(y2[i]);
int vy12 = int(y2[i + 1]);
v_uint8 r[4], g[4], b[4];
for(int k = 0; k < 4; k++)
{
yRGBuvToRGBA(vy[k], ruv, guv, buv, r[k], g[k], b[k]);
}
if(bIdx)
{
for(int k = 0; k < 4; k++)
swap(r[k], b[k]);
}
// [r0...], [r1...] => [r0, r1, r0, r1...], [r0, r1, r0, r1...]
v_uint8 r0_0, r0_1, r1_0, r1_1;
v_zip(r[0], r[1], r0_0, r0_1);
v_zip(r[2], r[3], r1_0, r1_1);
v_uint8 g0_0, g0_1, g1_0, g1_1;
v_zip(g[0], g[1], g0_0, g0_1);
v_zip(g[2], g[3], g1_0, g1_1);
v_uint8 b0_0, b0_1, b1_0, b1_1;
v_zip(b[0], b[1], b0_0, b0_1);
v_zip(b[2], b[3], b1_0, b1_1);
if(dcn == 4)
{
v_store_interleave(row1 + 0*vsize, b0_0, g0_0, r0_0, a);
v_store_interleave(row1 + 4*vsize, b0_1, g0_1, r0_1, a);
v_store_interleave(row2 + 0*vsize, b1_0, g1_0, r1_0, a);
v_store_interleave(row2 + 4*vsize, b1_1, g1_1, r1_1, a);
}
else //dcn == 3
{
v_store_interleave(row1 + 0*vsize, b0_0, g0_0, r0_0);
v_store_interleave(row1 + 3*vsize, b0_1, g0_1, r0_1);
v_store_interleave(row2 + 0*vsize, b1_0, g1_0, r1_0);
v_store_interleave(row2 + 3*vsize, b1_1, g1_1, r1_1);
}
}
vx_cleanup();
#endif
for ( ; i < width; i += 2, row1 += dcn*2, row2 += dcn*2)
{
uchar u = uv[i + 0 + uIdx];
uchar v = uv[i + 1 - uIdx];
uchar vy01 = y1[i];
uchar vy11 = y1[i + 1];
uchar vy02 = y2[i];
uchar vy12 = y2[i + 1];
cvtYuv42xxp2RGB8<bIdx, dcn, true>(u, v, vy01, vy11, vy02, vy12, row1, row2);
}
......@@ -1108,16 +1270,77 @@ struct YUV420p2RGB8Invoker : ParallelLoopBody
uchar* row1 = dst_data + dst_step * j;
uchar* row2 = dst_data + dst_step * (j + 1);
const uchar* y2 = y1 + stride;
int i = 0;
#if CV_SIMD
const int vsize = v_uint8::nlanes;
v_uint8 a = vx_setall_u8(uchar(0xff));
for( ; i <= width/2 - vsize;
i += vsize, row1 += vsize*dcn*2, row2 += vsize*dcn*2)
{
v_uint8 u, v;
u = vx_load(u1 + i);
v = vx_load(v1 + i);
v_uint8 vy[4];
v_load_deinterleave(y1 + 2*i, vy[0], vy[1]);
v_load_deinterleave(y2 + 2*i, vy[2], vy[3]);
v_int32 ruv[4], guv[4], buv[4];
uvToRGBuv(u, v, ruv, guv, buv);
v_uint8 r[4], g[4], b[4];
for(int k = 0; k < 4; k++)
{
yRGBuvToRGBA(vy[k], ruv, guv, buv, r[k], g[k], b[k]);
}
if(bIdx)
{
for(int k = 0; k < 4; k++)
swap(r[k], b[k]);
}
// [r0...], [r1...] => [r0, r1, r0, r1...], [r0, r1, r0, r1...]
v_uint8 r0_0, r0_1, r1_0, r1_1;
v_zip(r[0], r[1], r0_0, r0_1);
v_zip(r[2], r[3], r1_0, r1_1);
v_uint8 g0_0, g0_1, g1_0, g1_1;
v_zip(g[0], g[1], g0_0, g0_1);
v_zip(g[2], g[3], g1_0, g1_1);
v_uint8 b0_0, b0_1, b1_0, b1_1;
v_zip(b[0], b[1], b0_0, b0_1);
v_zip(b[2], b[3], b1_0, b1_1);
if(dcn == 4)
{
v_store_interleave(row1 + 0*vsize, b0_0, g0_0, r0_0, a);
v_store_interleave(row1 + 4*vsize, b0_1, g0_1, r0_1, a);
v_store_interleave(row2 + 0*vsize, b1_0, g1_0, r1_0, a);
v_store_interleave(row2 + 4*vsize, b1_1, g1_1, r1_1, a);
}
else //dcn == 3
{
v_store_interleave(row1 + 0*vsize, b0_0, g0_0, r0_0);
v_store_interleave(row1 + 3*vsize, b0_1, g0_1, r0_1);
for (int i = 0; i < width / 2; i += 1, row1 += dcn*2, row2 += dcn*2)
v_store_interleave(row2 + 0*vsize, b1_0, g1_0, r1_0);
v_store_interleave(row2 + 3*vsize, b1_1, g1_1, r1_1);
}
}
vx_cleanup();
#endif
for (; i < width / 2; i += 1, row1 += dcn*2, row2 += dcn*2)
{
int u = int(u1[i]);
int v = int(v1[i]);
uchar u = u1[i];
uchar v = v1[i];
int vy01 = int(y1[2 * i]);
int vy11 = int(y1[2 * i + 1]);
int vy02 = int(y2[2 * i]);
int vy12 = int(y2[2 * i + 1]);
uchar vy01 = y1[2 * i];
uchar vy11 = y1[2 * i + 1];
uchar vy02 = y2[2 * i];
uchar vy12 = y2[2 * i + 1];
cvtYuv42xxp2RGB8<bIdx, dcn, true>(u, v, vy01, vy11, vy02, vy12, row1, row2);
}
......@@ -1150,106 +1373,258 @@ inline void cvtYUV420p2RGB(uchar * dst_data, size_t dst_step, int dst_width, int
///////////////////////////////////// RGB -> YUV420p /////////////////////////////////////
// Scalar helper: converts one RGB pixel to its 8-bit luma value using the
// fixed-point ITUR_BT_601_C?Y weights, a +16 offset and round-to-nearest.
static inline uchar rgbToY42x(uchar r, uchar g, uchar b)
{
    // rounding term plus the +16 offset, both expressed in fixed point
    const int bias = (1 << (ITUR_BT_601_SHIFT - 1)) + (16 << ITUR_BT_601_SHIFT);

    const int weighted = ITUR_BT_601_CRY * r
                       + ITUR_BT_601_CGY * g
                       + ITUR_BT_601_CBY * b;

    return saturate_cast<uchar>((weighted + bias) >> ITUR_BT_601_SHIFT);
}
// Vector overload of rgbToY42x: computes the luma of every lane of the r / g / b
// registers with the same fixed-point formula as the scalar version and returns
// the results packed back into one 8-bit register.
static inline v_uint8 rgbToY42x(const v_uint8& r, const v_uint8& g, const v_uint8& b)
{
    const int roundTerm = (1 << (ITUR_BT_601_SHIFT - 1));
    const int offset16 = (16 << ITUR_BT_601_SHIFT);

    // widen 8 -> 16 bits
    v_uint16 rLo, rHi, gLo, gHi, bLo, bHi;
    v_expand(r, rLo, rHi);
    v_expand(g, gLo, gHi);
    v_expand(b, bLo, bHi);

    // widen 16 -> 32 bits
    v_uint32 red[4], green[4], blue[4];
    v_expand(rLo, red[0], red[1]);
    v_expand(rHi, red[2], red[3]);
    v_expand(gLo, green[0], green[1]);
    v_expand(gHi, green[2], green[3]);
    v_expand(bLo, blue[0], blue[1]);
    v_expand(bHi, blue[2], blue[3]);

    const v_uint32 wR = vx_setall_u32(ITUR_BT_601_CRY);
    const v_uint32 wG = vx_setall_u32(ITUR_BT_601_CGY);
    const v_uint32 wB = vx_setall_u32(ITUR_BT_601_CBY);
    const v_uint32 bias = vx_setall_u32(roundTerm + offset16);

    v_uint32 luma[4];
    for (int q = 0; q < 4; q++)
    {
        luma[q] = (red[q]*wR + green[q]*wG + blue[q]*wB + bias) >> ITUR_BT_601_SHIFT;
    }

    // narrow 32 -> 16 -> 8 bits
    const v_uint16 lumaLo = v_pack(luma[0], luma[1]);
    const v_uint16 lumaHi = v_pack(luma[2], luma[3]);
    return v_pack(lumaLo, lumaHi);
}
// Scalar helper: converts one RGB pixel to its 8-bit chroma pair using
// fixed-point weights, a +128 offset and round-to-nearest.
//   r, g, b - input pixel
//   u, v    - [out] chroma samples, saturated to uchar
static inline void rgbToUV42x(uchar r, uchar g, uchar b, uchar& u, uchar& v)
{
    // rounding term plus the +128 offset, both expressed in fixed point
    const int bias = (1 << (ITUR_BT_601_SHIFT - 1)) + (128 << ITUR_BT_601_SHIFT);

    const int uSum = ITUR_BT_601_CRU * r + ITUR_BT_601_CGU * g + ITUR_BT_601_CBU * b;
    // NOTE(review): ITUR_BT_601_CBU is also used as the red weight for V here
    // (and in the vector overload); presumably the two BT.601 coefficients are
    // numerically equal - confirm before introducing a separate constant.
    const int vSum = ITUR_BT_601_CBU * r + ITUR_BT_601_CGV * g + ITUR_BT_601_CBV * b;

    u = saturate_cast<uchar>((uSum + bias) >> ITUR_BT_601_SHIFT);
    v = saturate_cast<uchar>((vSum + bias) >> ITUR_BT_601_SHIFT);
}
static inline void rgbToUV42x(const v_uint8& r0, const v_uint8& r1, const v_uint8& g0, const v_uint8& g1,
const v_uint8& b0, const v_uint8& b1, v_uint8& u, v_uint8& v)
{
// [r0, r1, r2, r3,..] => [r0, 0, r2, 0,..]
v_int16 vlowByte = vx_setall_s16(0x00ff);
v_int16 rd0, rd1, gd0, gd1, bd0, bd1;
rd0 = v_reinterpret_as_s16(r0) & vlowByte;
rd1 = v_reinterpret_as_s16(r1) & vlowByte;
gd0 = v_reinterpret_as_s16(g0) & vlowByte;
gd1 = v_reinterpret_as_s16(g1) & vlowByte;
bd0 = v_reinterpret_as_s16(b0) & vlowByte;
bd1 = v_reinterpret_as_s16(b1) & vlowByte;
v_int32 rq[4], gq[4], bq[4];
v_expand(rd0, rq[0], rq[1]);
v_expand(rd1, rq[2], rq[3]);
v_expand(gd0, gq[0], gq[1]);
v_expand(gd1, gq[2], gq[3]);
v_expand(bd0, bq[0], bq[1]);
v_expand(bd1, bq[2], bq[3]);
const int halfShift = (1 << (ITUR_BT_601_SHIFT - 1));
const int shifted128 = (128 << ITUR_BT_601_SHIFT);
v_int32 shift = vx_setall_s32(halfShift + shifted128);
v_int32 ru, gu, bu, gv, bv;
ru = vx_setall_s32(ITUR_BT_601_CRU);
gu = vx_setall_s32(ITUR_BT_601_CGU);
gv = vx_setall_s32(ITUR_BT_601_CGV);
bu = vx_setall_s32(ITUR_BT_601_CBU);
bv = vx_setall_s32(ITUR_BT_601_CBV);
v_int32 uq[4], vq[4];
for(int k = 0; k < 4; k++)
{
uq[k] = (ru*rq[k] + gu*gq[k] + bu*bq[k] + shift) >> ITUR_BT_601_SHIFT;
vq[k] = (bu*rq[k] + gv*gq[k] + bv*bq[k] + shift) >> ITUR_BT_601_SHIFT;
}
v_int16 u0, u1, v0, v1;
u0 = v_pack(uq[0], uq[1]);
u1 = v_pack(uq[2], uq[3]);
v0 = v_pack(vq[0], vq[1]);
v1 = v_pack(vq[2], vq[3]);
u = v_pack_u(u0, u1);
v = v_pack_u(v0, v1);
}
struct RGB8toYUV420pInvoker: public ParallelLoopBody
{
RGB8toYUV420pInvoker(const uchar * _src_data, size_t _src_step,
uchar * _y_data, uchar * _uv_data, size_t _dst_step,
int _src_width, int _src_height, int _scn, bool swapBlue_, bool swapUV_, bool interleaved_)
: src_data(_src_data), src_step(_src_step),
y_data(_y_data), uv_data(_uv_data), dst_step(_dst_step),
src_width(_src_width), src_height(_src_height),
scn(_scn), swapBlue(swapBlue_), swapUV(swapUV_), interleaved(interleaved_) { }
RGB8toYUV420pInvoker(const uchar * _srcData, size_t _srcStep,
uchar * _yData, uchar * _uvData, size_t _dstStep,
int _srcWidth, int _srcHeight, int _scn, bool _swapBlue, bool _swapUV, bool _interleave)
: srcData(_srcData), srcStep(_srcStep),
yData(_yData), uvData(_uvData), dstStep(_dstStep),
srcWidth(_srcWidth), srcHeight(_srcHeight),
srcCn(_scn), swapBlue(_swapBlue), swapUV(_swapUV), interleave(_interleave) { }
void operator()(const Range& rowRange) const CV_OVERRIDE
{
const int w = src_width;
const int h = src_height;
const int cn = scn;
for( int i = rowRange.start; i < rowRange.end; i++ )
const int w = srcWidth;
const int h = srcHeight;
const int scn = srcCn;
const uchar* srcRow = (uchar*)0;
uchar* yRow = (uchar*)0, *uRow = (uchar*)0, *vRow = (uchar*)0, *uvRow = (uchar*)0;
for( int sRow = rowRange.start*2; sRow < rowRange.end*2; sRow++)
{
const uchar* brow0 = src_data + src_step * (2 * i);
const uchar* grow0 = brow0 + 1;
const uchar* rrow0 = brow0 + 2;
const uchar* brow1 = src_data + src_step * (2 * i + 1);
const uchar* grow1 = brow1 + 1;
const uchar* rrow1 = brow1 + 2;
if (swapBlue)
srcRow = srcData + srcStep*sRow;
yRow = yData + dstStep * sRow;
bool evenRow = (sRow % 2) == 0;
if(evenRow)
{
std::swap(brow0, rrow0);
std::swap(brow1, rrow1);
if (interleave)
{
uvRow = uvData + dstStep*(sRow/2);
}
else
{
uRow = uvData + dstStep * (sRow/4) + ((sRow/2) % 2) * (w/2);
vRow = uvData + dstStep * ((sRow + h)/4) + (((sRow + h)/2) % 2) * (w/2);
}
}
int i = 0;
#if CV_SIMD
const int vsize = v_uint8::nlanes;
uchar* y = y_data + dst_step * (2*i);
uchar* u;
uchar* v;
if (interleaved)
{
u = uv_data + dst_step * i;
v = uv_data + dst_step * i + 1;
}
else
for( ; i <= w/2 - vsize;
i += vsize)
{
u = uv_data + dst_step * (i/2) + (i % 2) * (w/2);
v = uv_data + dst_step * ((i + h/2)/2) + ((i + h/2) % 2) * (w/2);
}
// processing (2*vsize) pixels at once
v_uint8 b0, b1, g0, g1, r0, r1, a0, a1;
if(scn == 4)
{
v_load_deinterleave(srcRow + 2*4*i + 0*vsize, b0, g0, r0, a0);
v_load_deinterleave(srcRow + 2*4*i + 4*vsize, b1, g1, r1, a1);
}
else // scn == 3
{
v_load_deinterleave(srcRow + 2*3*i + 0*vsize, b0, g0, r0);
v_load_deinterleave(srcRow + 2*3*i + 3*vsize, b1, g1, r1);
}
if (swapUV)
{
std::swap(u, v);
}
if(swapBlue)
{
swap(b0, r0); swap(b1, r1);
}
v_uint8 y0, y1;
y0 = rgbToY42x(r0, g0, b0);
y1 = rgbToY42x(r1, g1, b1);
for( int j = 0, k = 0; j < w * cn; j += 2 * cn, k++ )
v_store(yRow + 2*i + 0*vsize, y0);
v_store(yRow + 2*i + 1*vsize, y1);
if(evenRow)
{
v_uint8 u, v;
rgbToUV42x(r0, r1, g0, g1, b0, b1, u, v);
if(swapUV)
{
swap(u, v);
}
if(interleave)
{
v_store_interleave(uvRow + 2*i, u, v);
}
else
{
v_store(uRow + i, u);
v_store(vRow + i, v);
}
}
}
vx_cleanup();
#endif
// processing two pixels at once
for( ; i < w/2; i++)
{
int r00 = rrow0[j]; int g00 = grow0[j]; int b00 = brow0[j];
int r01 = rrow0[cn + j]; int g01 = grow0[cn + j]; int b01 = brow0[cn + j];
int r10 = rrow1[j]; int g10 = grow1[j]; int b10 = brow1[j];
int r11 = rrow1[cn + j]; int g11 = grow1[cn + j]; int b11 = brow1[cn + j];
const int shifted16 = (16 << ITUR_BT_601_SHIFT);
const int halfShift = (1 << (ITUR_BT_601_SHIFT - 1));
int y00 = ITUR_BT_601_CRY * r00 + ITUR_BT_601_CGY * g00 + ITUR_BT_601_CBY * b00 + halfShift + shifted16;
int y01 = ITUR_BT_601_CRY * r01 + ITUR_BT_601_CGY * g01 + ITUR_BT_601_CBY * b01 + halfShift + shifted16;
int y10 = ITUR_BT_601_CRY * r10 + ITUR_BT_601_CGY * g10 + ITUR_BT_601_CBY * b10 + halfShift + shifted16;
int y11 = ITUR_BT_601_CRY * r11 + ITUR_BT_601_CGY * g11 + ITUR_BT_601_CBY * b11 + halfShift + shifted16;
y[2*k + 0] = saturate_cast<uchar>(y00 >> ITUR_BT_601_SHIFT);
y[2*k + 1] = saturate_cast<uchar>(y01 >> ITUR_BT_601_SHIFT);
y[2*k + dst_step + 0] = saturate_cast<uchar>(y10 >> ITUR_BT_601_SHIFT);
y[2*k + dst_step + 1] = saturate_cast<uchar>(y11 >> ITUR_BT_601_SHIFT);
const int shifted128 = (128 << ITUR_BT_601_SHIFT);
int u00 = ITUR_BT_601_CRU * r00 + ITUR_BT_601_CGU * g00 + ITUR_BT_601_CBU * b00 + halfShift + shifted128;
int v00 = ITUR_BT_601_CBU * r00 + ITUR_BT_601_CGV * g00 + ITUR_BT_601_CBV * b00 + halfShift + shifted128;
if (interleaved)
uchar b0, g0, r0;
uchar b1, g1, r1;
b0 = srcRow[(2*i+0)*scn + 0];
g0 = srcRow[(2*i+0)*scn + 1];
r0 = srcRow[(2*i+0)*scn + 2];
b1 = srcRow[(2*i+1)*scn + 0];
g1 = srcRow[(2*i+1)*scn + 1];
r1 = srcRow[(2*i+1)*scn + 2];
if(swapBlue)
{
u[k*2] = saturate_cast<uchar>(u00 >> ITUR_BT_601_SHIFT);
v[k*2] = saturate_cast<uchar>(v00 >> ITUR_BT_601_SHIFT);
swap(b0, r0); swap(b1, r1);
}
else
uchar y0 = rgbToY42x(r0, g0, b0);
uchar y1 = rgbToY42x(r1, g1, b1);
yRow[2*i+0] = y0;
yRow[2*i+1] = y1;
if(evenRow)
{
u[k] = saturate_cast<uchar>(u00 >> ITUR_BT_601_SHIFT);
v[k] = saturate_cast<uchar>(v00 >> ITUR_BT_601_SHIFT);
uchar uu, vv;
rgbToUV42x(r0, g0, b0, uu, vv);
if(swapUV)
{
swap(uu, vv);
}
if(interleave)
{
uvRow[2*i+0] = uu;
uvRow[2*i+1] = vv;
}
else
{
uRow[i] = uu;
vRow[i] = vv;
}
}
}
}
}
const uchar * src_data;
size_t src_step;
uchar *y_data, *uv_data;
size_t dst_step;
int src_width;
int src_height;
const int scn;
const uchar * srcData;
size_t srcStep;
uchar *yData, *uvData;
size_t dstStep;
int srcWidth;
int srcHeight;
const int srcCn;
bool swapBlue;
bool swapUV;
bool interleaved;
bool interleave;
};
///////////////////////////////////// YUV422 -> RGB /////////////////////////////////////
// bIdx is 0 or 2; [uIdx, yIdx] is [0, 0], [0, 1], [1, 0]; dcn is 3 or 4
template<int bIdx, int uIdx, int yIdx, int dcn>
struct YUV422toRGB8Invoker : ParallelLoopBody
{
......@@ -1269,6 +1644,10 @@ struct YUV422toRGB8Invoker : ParallelLoopBody
int rangeBegin = range.start;
int rangeEnd = range.end;
// [yIdx, uIdx] | [uidx, vidx]:
// 0, 0 | 1, 3
// 0, 1 | 3, 1
// 1, 0 | 0, 2
const int uidx = 1 - yIdx + uIdx * 2;
const int vidx = (2 + uidx) % 4;
const uchar* yuv_src = src_data + rangeBegin * src_step;
......@@ -1276,14 +1655,69 @@ struct YUV422toRGB8Invoker : ParallelLoopBody
for (int j = rangeBegin; j < rangeEnd; j++, yuv_src += src_step)
{
uchar* row = dst_data + dst_step * j;
int i = 0;
#if CV_SIMD
const int vsize = v_uint8::nlanes;
v_uint8 a = vx_setall_u8(uchar(0xff));
for(; i <= 2*width - 4*vsize;
i += 4*vsize, row += vsize*dcn*2)
{
v_uint8 u, v, vy[2];
if(yIdx == 1) // UYVY
{
v_load_deinterleave(yuv_src + i, u, vy[0], v, vy[1]);
}
else // YUYV or YVYU
{
v_load_deinterleave(yuv_src + i, vy[0], u, vy[1], v);
if(uIdx == 1) // YVYU
{
swap(u, v);
}
}
v_int32 ruv[4], guv[4], buv[4];
uvToRGBuv(u, v, ruv, guv, buv);
for (int i = 0; i < 2 * width; i += 4, row += dcn*2)
v_uint8 r[2], g[2], b[2];
yRGBuvToRGBA(vy[0], ruv, guv, buv, r[0], g[0], b[0]);
yRGBuvToRGBA(vy[1], ruv, guv, buv, r[1], g[1], b[1]);
if(bIdx)
{
swap(r[0], b[0]);
swap(r[1], b[1]);
}
// [r0...], [r1...] => [r0, r1, r0, r1...], [r0, r1, r0, r1...]
v_uint8 r0_0, r0_1;
v_zip(r[0], r[1], r0_0, r0_1);
v_uint8 g0_0, g0_1;
v_zip(g[0], g[1], g0_0, g0_1);
v_uint8 b0_0, b0_1;
v_zip(b[0], b[1], b0_0, b0_1);
if(dcn == 4)
{
v_store_interleave(row + 0*vsize, b0_0, g0_0, r0_0, a);
v_store_interleave(row + 4*vsize, b0_1, g0_1, r0_1, a);
}
else //dcn == 3
{
v_store_interleave(row + 0*vsize, b0_0, g0_0, r0_0);
v_store_interleave(row + 3*vsize, b0_1, g0_1, r0_1);
}
}
vx_cleanup();
#endif
for (; i < 2 * width; i += 4, row += dcn*2)
{
int u = int(yuv_src[i + uidx]);
int v = int(yuv_src[i + vidx]);
uchar u = yuv_src[i + uidx];
uchar v = yuv_src[i + vidx];
int vy0 = int(yuv_src[i + yIdx]);
int vy1 = int(yuv_src[i + yIdx + 2]);
uchar vy0 = yuv_src[i + yIdx];
uchar vy1 = yuv_src[i + yIdx + 2];
cvtYuv42xxp2RGB8<bIdx, dcn, false>(u, v, vy0, vy1, 0, 0, row, (uchar*)(0));
}
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment