Commit cf160cdb authored by Frank Barchard's avatar Frank Barchard

implement I444ToABGR by swapping uv and transpose matrix

U contributes to B and G.  V contributes to R and G.
By swapping U and V, they contribute to the opposite channels.  Adjust the matrix so that the U contribution sits in the matrix location where it will contribute to the
new B channel, and vice versa for V and the new R channel.
This allows ABGR versions of YUV conversion to use the same low level code as ARGB, just using a different matrix and swapping U and V pointers.

As a result the existing I444ToABGRRow functions are no longer needed and are removed.

Previously this function was only Intel AVX2 optimized for Windows.  Now it is also optimized for Arm and GCC.

ARMv7 Neon
Was LibYUVConvertTest.I444ToABGR_Opt (75971 ms)
Now LibYUVConvertTest.I444ToABGR_Opt (3672 ms)
20.6 times faster.

R=xhwang@chromium.org
BUG=libyuv:515

Review URL: https://codereview.chromium.org/1414133006 .
parent e8ee1755
Name: libyuv
URL: http://code.google.com/p/libyuv/
Version: 1524
Version: 1525
License: BSD
License File: LICENSE
......
......@@ -120,7 +120,6 @@ extern "C" {
#define HAS_I422TORGBAROW_SSSE3
#define HAS_I422TOUYVYROW_SSE2
#define HAS_I422TOYUY2ROW_SSE2
#define HAS_I444TOABGRROW_SSSE3
#define HAS_I444TOARGBROW_SSSE3
#define HAS_J400TOARGBROW_SSE2
#define HAS_J422TOABGRROW_SSSE3
......@@ -246,7 +245,6 @@ extern "C" {
#define HAS_I422TOARGB1555ROW_AVX2
#define HAS_I422TOARGB4444ROW_AVX2
#define HAS_I422TORGB565ROW_AVX2
#define HAS_I444TOABGRROW_AVX2
#define HAS_I444TOARGBROW_AVX2
#define HAS_J400TOARGBROW_AVX2
#define HAS_NV12TORGB565ROW_AVX2
......@@ -460,6 +458,7 @@ struct YuvConstants {
extern const struct YuvConstants kYuvIConstants; // BT.601
extern const struct YuvConstants kYuvJConstants; // JPeg color space
extern const struct YuvConstants kYuvHConstants; // BT.709
extern const struct YuvConstants kYvuIConstants; // YVU to BGR BT.601
#if defined(__APPLE__) || defined(__x86_64__) || defined(__llvm__)
#define OMITFP
......@@ -1035,12 +1034,6 @@ void I444ToARGBRow_C(const uint8* src_y,
uint8* dst_argb,
const struct YuvConstants* yuvconstants,
int width);
void I444ToABGRRow_C(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
uint8* dst_argb,
const struct YuvConstants* yuvconstants,
int width);
void I422ToARGBRow_C(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
......@@ -1210,30 +1203,6 @@ void I444ToARGBRow_AVX2(const uint8* src_y,
uint8* dst_argb,
const struct YuvConstants* yuvconstants,
int width);
void I444ToABGRRow_SSSE3(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
uint8* dst_abgr,
const struct YuvConstants* yuvconstants,
int width);
void I444ToABGRRow_AVX2(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
uint8* dst_abgr,
const struct YuvConstants* yuvconstants,
int width);
void I444ToABGRRow_SSSE3(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
uint8* dst_abgr,
const struct YuvConstants* yuvconstants,
int width);
void I444ToABGRRow_AVX2(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
uint8* dst_abgr,
const struct YuvConstants* yuvconstants,
int width);
void I422ToARGBRow_SSSE3(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
......@@ -1452,18 +1421,6 @@ void I444ToARGBRow_Any_AVX2(const uint8* src_y,
uint8* dst_argb,
const struct YuvConstants* yuvconstants,
int width);
void I444ToABGRRow_Any_SSSE3(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
uint8* dst_abgr,
const struct YuvConstants* yuvconstants,
int width);
void I444ToABGRRow_Any_AVX2(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
uint8* dst_abgr,
const struct YuvConstants* yuvconstants,
int width);
void I422ToARGBRow_Any_SSSE3(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
......
......@@ -11,6 +11,6 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT
#define INCLUDE_LIBYUV_VERSION_H_
#define LIBYUV_VERSION 1524
#define LIBYUV_VERSION 1525
#endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT
......@@ -129,7 +129,6 @@ int I444ToARGB(const uint8* src_y, int src_stride_y,
width, height);
}
// Convert J444 to ARGB.
LIBYUV_API
int J444ToARGB(const uint8* src_y, int src_stride_y,
......@@ -145,7 +144,6 @@ int J444ToARGB(const uint8* src_y, int src_stride_y,
width, height);
}
// Convert I444 to ABGR.
LIBYUV_API
int I444ToABGR(const uint8* src_y, int src_stride_y,
......@@ -153,66 +151,12 @@ int I444ToABGR(const uint8* src_y, int src_stride_y,
const uint8* src_v, int src_stride_v,
uint8* dst_abgr, int dst_stride_abgr,
int width, int height) {
int y;
void (*I444ToABGRRow)(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
const struct YuvConstants* yuvconstants,
int width) = I444ToABGRRow_C;
if (!src_y || !src_u || !src_v ||
!dst_abgr ||
width <= 0 || height == 0) {
return -1;
}
// Negative height means invert the image.
if (height < 0) {
height = -height;
dst_abgr = dst_abgr + (height - 1) * dst_stride_abgr;
dst_stride_abgr = -dst_stride_abgr;
}
// Coalesce rows.
if (src_stride_y == width &&
src_stride_u == width &&
src_stride_v == width &&
dst_stride_abgr == width * 4) {
width *= height;
height = 1;
src_stride_y = src_stride_u = src_stride_v = dst_stride_abgr = 0;
}
#if defined(HAS_I444TOABGRROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
I444ToABGRRow = I444ToABGRRow_Any_SSSE3;
if (IS_ALIGNED(width, 8)) {
I444ToABGRRow = I444ToABGRRow_SSSE3;
}
}
#endif
#if defined(HAS_I444TOABGRROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
I444ToABGRRow = I444ToABGRRow_Any_AVX2;
if (IS_ALIGNED(width, 16)) {
I444ToABGRRow = I444ToABGRRow_AVX2;
}
}
#endif
#if defined(HAS_I444TOABGRROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
I444ToABGRRow = I444ToABGRRow_Any_NEON;
if (IS_ALIGNED(width, 8)) {
I444ToABGRRow = I444ToABGRRow_NEON;
}
}
#endif
for (y = 0; y < height; ++y) {
I444ToABGRRow(src_y, src_u, src_v, dst_abgr, &kYuvIConstants, width);
dst_abgr += dst_stride_abgr;
src_y += src_stride_y;
src_u += src_stride_u;
src_v += src_stride_v;
}
return 0;
return I444ToARGBMatrix(src_y, src_stride_y,
src_v, src_stride_v,
src_u, src_stride_u,
dst_abgr, dst_stride_abgr,
&kYvuIConstants,
width, height);
}
// Convert I422 to ARGB.
......
......@@ -126,9 +126,6 @@ ANY31C(I422ToRGB565Row_Any_SSSE3, I422ToRGB565Row_SSSE3, 1, 0, 2, 7)
ANY31C(I422ToRGB24Row_Any_SSSE3, I422ToRGB24Row_SSSE3, 1, 0, 3, 7)
ANY31C(I422ToRAWRow_Any_SSSE3, I422ToRAWRow_SSSE3, 1, 0, 3, 7)
#endif // HAS_I444TOARGBROW_SSSE3
#ifdef HAS_I444TOABGRROW_SSSE3
ANY31C(I444ToABGRRow_Any_SSSE3, I444ToABGRRow_SSSE3, 0, 0, 4, 7)
#endif
#ifdef HAS_I422TORGB24ROW_AVX2
ANY31C(I422ToRGB24Row_Any_AVX2, I422ToRGB24Row_AVX2, 1, 0, 3, 15)
#endif
......@@ -150,9 +147,6 @@ ANY31C(I422ToABGRRow_Any_AVX2, I422ToABGRRow_AVX2, 1, 0, 4, 15)
#ifdef HAS_I444TOARGBROW_AVX2
ANY31C(I444ToARGBRow_Any_AVX2, I444ToARGBRow_AVX2, 0, 0, 4, 15)
#endif
#ifdef HAS_I444TOABGRROW_AVX2
ANY31C(I444ToABGRRow_Any_AVX2, I444ToABGRRow_AVX2, 0, 0, 4, 15)
#endif
#ifdef HAS_I411TOARGBROW_AVX2
ANY31C(I411ToARGBRow_Any_AVX2, I411ToARGBRow_AVX2, 2, 0, 4, 15)
#endif
......
......@@ -1018,8 +1018,6 @@ void J400ToARGBRow_C(const uint8* src_y, uint8* dst_argb, int width) {
// BT.601 constants for YUV to RGB.
// TODO(fbarchard): Unify these structures to be platform independent.
// TODO(fbarchard): Generate SIMD structures from float matrix.
// BT601 constants for YUV to RGB.
#if defined(__aarch64__)
const YuvConstants SIMD_ALIGNED(kYuvIConstants) = {
{ -UB, -VR, -UB, -VR, -UB, -VR, -UB, -VR },
......@@ -1029,7 +1027,6 @@ const YuvConstants SIMD_ALIGNED(kYuvIConstants) = {
{ BB, BG, BR, 0, 0, 0, 0, 0 },
{ 0x0101 * YG, 0, 0, 0 }
};
#elif defined(__arm__)
const YuvConstants SIMD_ALIGNED(kYuvIConstants) = {
{ -UB, -UB, -UB, -UB, -VR, -VR, -VR, -VR, 0, 0, 0, 0, 0, 0, 0, 0 },
......@@ -1099,6 +1096,42 @@ static __inline void YPixel(uint8 y, uint8* b, uint8* g, uint8* r) {
*r = Clamp((int32)(y1 + YGB) >> 6);
}
// BT.601 constants for YVU to BGR.
// Allows YUV TO RGB code to implement YUV to BGR by swapping UV and using this
// matrix.
#if defined(__aarch64__)
// AArch64 layout of the BT.601 YVU-to-BGR constants.
// Compared with kYuvIConstants, the coefficient order inside every pair is
// reversed (V term first: VR before UB, VG before UG) and the bias row is
// ordered BR, BG, BB instead of BB, BG, BR. Together with the caller swapping
// the U and V plane pointers, this lets the ARGB row code emit ABGR order.
// NOTE(review): the meaning of each row is fixed by struct YuvConstants, which
// is declared outside this view - confirm the row roles against that struct.
const YuvConstants SIMD_ALIGNED(kYvuIConstants) = {
// Negated V->R / U->B coefficient pairs (two identical rows).
{ -VR, -UB, -VR, -UB, -VR, -UB, -VR, -UB },
{ -VR, -UB, -VR, -UB, -VR, -UB, -VR, -UB },
// V->G / U->G coefficient pairs (two identical rows).
{ VG, UG, VG, UG, VG, UG, VG, UG },
{ VG, UG, VG, UG, VG, UG, VG, UG },
// Per-channel bias terms, R first (swapped relative to kYuvIConstants).
{ BR, BG, BB, 0, 0, 0, 0, 0 },
// Y gain replicated into both bytes of the first element.
{ 0x0101 * YG, 0, 0, 0 }
};
#elif defined(__arm__)
// 32-bit ARM layout of the BT.601 YVU-to-BGR constants.
// Compared with kYuvIConstants, the VR and UB groups (and the VG and UG
// groups) trade places, and the bias row is ordered BR, BG, BB. Used with
// swapped U and V plane pointers so the ARGB row code produces ABGR order.
// NOTE(review): the meaning of each row is fixed by struct YuvConstants, which
// is declared outside this view - confirm the row roles against that struct.
const YuvConstants SIMD_ALIGNED(kYvuIConstants) = {
// Four negated VR values followed by four negated UB values, zero padded.
{ -VR, -VR, -VR, -VR, -UB, -UB, -UB, -UB, 0, 0, 0, 0, 0, 0, 0, 0 },
// Four VG values followed by four UG values, zero padded.
{ VG, VG, VG, VG, UG, UG, UG, UG, 0, 0, 0, 0, 0, 0, 0, 0 },
// Per-channel bias terms, R first (swapped relative to kYuvIConstants).
{ BR, BG, BB, 0, 0, 0, 0, 0 },
// Y gain replicated into both bytes of the first element.
{ 0x0101 * YG, 0, 0, 0 }
};
#else
// x86 (SSSE3 / AVX2) layout of the BT.601 YVU-to-BGR constants.
// Callers pass the V plane where the row code expects U (and vice versa), so
// the first coefficient of each pair applies to V and the second to U.
// Every row is a broadcast table: all 16 lanes of a row MUST hold the same
// values, otherwise individual pixels of a vector would be converted with a
// different matrix than their neighbours.
// NOTE(review): row roles below follow the coefficient names; the field names
// live in struct YuvConstants, declared outside this view.
const YuvConstants SIMD_ALIGNED(kYvuIConstants) = {
  // First output channel: VR * V + 0 * U, identical in all 16 lanes.
  { VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0,
    VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0 },
  // Green channel: VG * V + UG * U.
  { VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG,
    VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG },
  // Third output channel: 0 * V + UB * U, identical in all 16 lanes.
  { 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB,
    0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB },
  // Per-channel bias terms, in the same swapped (R, G, B) order.
  { BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR },
  { BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG },
  { BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB },
  // Luma gain.
  { YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG }
};
#endif
#undef BB
#undef BG
#undef BR
......@@ -1279,34 +1312,6 @@ void I444ToARGBRow_C(const uint8* src_y,
rgb_buf[3] = 255;
}
}
void I444ToABGRRow_C(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
uint8* rgb_buf,
const struct YuvConstants* yuvconstants,
int width) {
int x;
for (x = 0; x < width - 1; x += 2) {
uint8 u = (src_u[0] + src_u[1] + 1) >> 1;
uint8 v = (src_v[0] + src_v[1] + 1) >> 1;
YuvPixel(src_y[0], u, v, rgb_buf + 2, rgb_buf + 1, rgb_buf + 0,
yuvconstants);
rgb_buf[3] = 255;
YuvPixel(src_y[1], u, v, rgb_buf + 6, rgb_buf + 5, rgb_buf + 4,
yuvconstants);
rgb_buf[7] = 255;
src_y += 2;
src_u += 2;
src_v += 2;
rgb_buf += 8; // Advance 2 pixels.
}
if (width & 1) {
YuvPixel(src_y[0], src_u[0], src_v[0],
rgb_buf + 2, rgb_buf + 1, rgb_buf + 0, yuvconstants);
rgb_buf[3] = 255;
}
}
#else
void I444ToARGBRow_C(const uint8* src_y,
const uint8* src_u,
......@@ -1325,24 +1330,6 @@ void I444ToARGBRow_C(const uint8* src_y,
rgb_buf += 4; // Advance 1 pixel.
}
}
void I444ToABGRRow_C(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
uint8* rgb_buf,
const struct YuvConstants* yuvconstants,
int width) {
int x;
for (x = 0; x < width; ++x) {
YuvPixel(src_y[0], src_u[0], src_v[0],
rgb_buf + 2, rgb_buf + 1, rgb_buf + 0, yuvconstants);
rgb_buf[3] = 255;
src_y += 1;
src_u += 1;
src_v += 1;
rgb_buf += 4; // Advance 1 pixel.
}
}
#endif
// Also used for 420
......
......@@ -1619,33 +1619,6 @@ void OMITFP I444ToARGBRow_SSSE3(const uint8* y_buf,
);
}
// GCC inline-assembly row converter: I444 (planar Y, U, V) to ABGR.
// The loop counter is decremented by 8 per iteration and the loop repeats
// while it stays positive, so pixels are handled in groups of 8.
// READYUV444, YUVTORGB and STOREABGR are macros defined outside this view;
// presumably they load the samples, apply the matrix in yuvconstants and
// store the packed result through dst_abgr - confirm against their
// definitions.
void OMITFP I444ToABGRRow_SSSE3(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* dst_abgr,
const struct YuvConstants* yuvconstants,
int width) {
asm volatile (
// v_buf -= u_buf: V is addressed afterwards as an offset from U.
"sub %[u_buf],%[v_buf] \n"
// xmm5 = all ones (compared equal with itself); used as the alpha source.
"pcmpeqb %%xmm5,%%xmm5 \n"
LABELALIGN
"1: \n"
READYUV444
YUVTORGB(yuvconstants)
STOREABGR
// 8 pixels done; loop while the remaining count is > 0.
"sub $0x8,%[width] \n"
"jg 1b \n"
: [y_buf]"+r"(y_buf), // %[y_buf]
[u_buf]"+r"(u_buf), // %[u_buf]
[v_buf]"+r"(v_buf), // %[v_buf]
[dst_abgr]"+r"(dst_abgr), // %[dst_abgr]
[width]"+rm"(width) // %[width]
: [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
// Clobbers: memory, flags and the six xmm registers listed below.
: "memory", "cc", NACL_R14
"xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
);
}
void OMITFP I422ToRGB24Row_SSSE3(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
......
......@@ -2267,45 +2267,6 @@ void I444ToARGBRow_AVX2(const uint8* y_buf,
}
#endif // HAS_I444TOARGBROW_AVX2
#ifdef HAS_I444TOABGRROW_AVX2
// 16 pixels
// 16 UV values with 16 Y producing 16 ABGR (64 bytes).
// MSVC naked function: there is no compiler-generated prologue, so arguments
// are read straight from the stack. Three registers are pushed first, hence
// the "+ 12" in every argument offset.
// READYUV444_AVX2, YUVTORGB_AVX2 and STOREABGR_AVX2 are macros defined outside
// this view - see their definitions for the registers they use.
__declspec(naked)
void I444ToABGRRow_AVX2(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* dst_abgr,
const struct YuvConstants* yuvconstants,
int width) {
__asm {
push esi
push edi
push ebx
mov eax, [esp + 12 + 4] // Y
mov esi, [esp + 12 + 8] // U
mov edi, [esp + 12 + 12] // V
mov edx, [esp + 12 + 16] // abgr
mov ebx, [esp + 12 + 20] // yuvconstants
mov ecx, [esp + 12 + 24] // width
// edi = V - U, so V is addressed as an offset from the U pointer.
sub edi, esi
vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha
convertloop:
READYUV444_AVX2
YUVTORGB_AVX2(ebx)
STOREABGR_AVX2
// 16 pixels per iteration; loop while the remaining count is > 0.
sub ecx, 16
jg convertloop
// Restore the saved registers in reverse push order.
pop ebx
pop edi
pop esi
// Zero the upper ymm halves before returning to non-AVX code.
vzeroupper
ret
}
}
#endif // HAS_I444TOABGRROW_AVX2
#ifdef HAS_I411TOARGBROW_AVX2
// 16 pixels
// 4 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
......@@ -2870,43 +2831,6 @@ void I444ToARGBRow_SSSE3(const uint8* y_buf,
}
}
// 8 pixels.
// 8 UV values, mixed with 8 Y producing 8 ABGR (32 bytes).
// MSVC naked function: there is no compiler-generated prologue, so arguments
// are read straight from the stack. Three registers are pushed first, hence
// the "+ 12" in every argument offset.
// READYUV444, YUVTORGB and STOREABGR are macros defined outside this view -
// see their definitions for the registers they use.
__declspec(naked)
void I444ToABGRRow_SSSE3(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* dst_abgr,
const struct YuvConstants* yuvconstants,
int width) {
__asm {
push esi
push edi
push ebx
mov eax, [esp + 12 + 4] // Y
mov esi, [esp + 12 + 8] // U
mov edi, [esp + 12 + 12] // V
mov edx, [esp + 12 + 16] // abgr
mov ebx, [esp + 12 + 20] // yuvconstants
mov ecx, [esp + 12 + 24] // width
// edi = V - U, so V is addressed as an offset from the U pointer.
sub edi, esi
pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
convertloop:
READYUV444
YUVTORGB(ebx)
STOREABGR
// 8 pixels per iteration; loop while the remaining count is > 0.
sub ecx, 8
jg convertloop
// Restore the saved registers in reverse push order.
pop ebx
pop edi
pop esi
ret
}
}
// 8 pixels.
// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RGB24 (24 bytes).
__declspec(naked)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment