Commit b86dbf24 authored by Frank Barchard

Refactor I420AlphaToABGR to use I420AlphaToARGB internally

Swap the U and V planes and use the transposed (YVU) conversion matrix so that
I420AlphaToARGB and I420AlphaToABGR share the same low-level code.

Having less code with the same performance allows more focused
optimization of future ARM versions.
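
To see why this works, here is a minimal, self-contained C sketch of the trick.
The names here (YuvMatrix, kYuv601, kYvu601, YuvPixel8, Clamp) are illustrative
only; libyuv's real YuvConstants are SIMD-friendly vector tables and its kernels
differ. A kernel that writes its channels in "B, G, R" order relative to
whatever matrix it is given will produce ABGR byte order when handed the planes
as V, U together with the U/V-transposed matrix: the swap routes the blue
coefficients to red and vice versa, while green, which uses U and V
symmetrically, is unchanged.

/* yuv_swap_demo.c -- illustrative sketch, not libyuv's internals.
 * BT.601 studio-swing constants in 10-bit fixed point:
 *   B = 1.164(Y-16) + 2.018(U-128)
 *   G = 1.164(Y-16) - 0.391(U-128) - 0.813(V-128)
 *   R = 1.164(Y-16) + 1.596(V-128)
 */
#include <stdint.h>
#include <stdio.h>

typedef struct {
  int ub;      /* U coefficient for the first ("blue") output channel */
  int ug, vg;  /* U and V coefficients for green */
  int vr;      /* V coefficient for the third ("red") output channel */
} YuvMatrix;

static const YuvMatrix kYuv601 = {2066, 400, 833, 1634};
/* Transposed matrix: the U and V coefficients exchanged. */
static const YuvMatrix kYvu601 = {1634, 833, 400, 2066};

static uint8_t Clamp(int v) {
  return (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
}

/* One shared kernel: writes three channels in "b, g, r" order
 * relative to whichever matrix it is given. */
static void YuvPixel8(uint8_t y, uint8_t u, uint8_t v,
                      uint8_t* b, uint8_t* g, uint8_t* r,
                      const YuvMatrix* m) {
  int y1 = ((int)y - 16) * 1192;  /* 1.164 * 1024 */
  *b = Clamp((y1 + m->ub * ((int)u - 128)) >> 10);
  *g = Clamp((y1 - m->ug * ((int)u - 128) - m->vg * ((int)v - 128)) >> 10);
  *r = Clamp((y1 + m->vr * ((int)v - 128)) >> 10);
}

int main(void) {
  uint8_t y = 120, u = 90, v = 200, a = 255;
  uint8_t argb[4], abgr[4];  /* ARGB memory order: B,G,R,A; ABGR: R,G,B,A */
  /* ARGB path: planes in U,V order, normal matrix. */
  YuvPixel8(y, u, v, &argb[0], &argb[1], &argb[2], &kYuv601);
  argb[3] = a;
  /* ABGR path: same kernel, planes swapped to V,U, transposed matrix. */
  YuvPixel8(y, v, u, &abgr[0], &abgr[1], &abgr[2], &kYvu601);
  abgr[3] = a;
  /* abgr equals argb with R and B exchanged: one low-level path, two formats. */
  printf("ARGB bytes: B=%u G=%u R=%u A=%u\n", argb[0], argb[1], argb[2], argb[3]);
  printf("ABGR bytes: R=%u G=%u B=%u A=%u\n", abgr[0], abgr[1], abgr[2], abgr[3]);
  return 0;
}

With the output byte order folded into the matrix like this, the per-pixel
kernel is the only thing that needs ARM/NEON tuning, which is the payoff the
paragraph above describes.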

R=harryjin@google.com
TBR=harryjin@chromium.org
BUG=libyuv:473,libyuv:516

Review URL: https://codereview.chromium.org/1422263002 .
parent cf160cdb
 Name: libyuv
 URL: http://code.google.com/p/libyuv/
-Version: 1525
+Version: 1526
 License: BSD
 License File: LICENSE
@@ -107,7 +107,6 @@ extern "C" {
 #define HAS_H422TOARGBROW_SSSE3
 #define HAS_I400TOARGBROW_SSE2
 #define HAS_I411TOARGBROW_SSSE3
-#define HAS_I422ALPHATOABGRROW_SSSE3
 #define HAS_I422ALPHATOARGBROW_SSSE3
 #define HAS_I422TOABGRROW_SSSE3
 #define HAS_I422TOARGB1555ROW_SSSE3
@@ -199,7 +198,6 @@ extern "C" {
 #define HAS_H422TOABGRROW_AVX2
 #define HAS_H422TOARGBROW_AVX2
 #define HAS_I400TOARGBROW_AVX2
-#define HAS_I422ALPHATOABGRROW_AVX2
 #define HAS_I422ALPHATOARGBROW_AVX2
 #define HAS_I422TOABGRROW_AVX2
 #define HAS_I422TOARGBROW_AVX2
@@ -254,7 +252,6 @@ extern "C" {
 // The following are also available on x64 Visual C.
 #if !defined(LIBYUV_DISABLE_X86) && defined (_M_X64) && \
     (!defined(__clang__) || defined(__SSSE3__))
-#define HAS_I422ALPHATOABGRROW_SSSE3
 #define HAS_I422ALPHATOARGBROW_SSSE3
 #define HAS_I422TOABGRROW_SSSE3
 #define HAS_I422TOARGBROW_SSSE3
@@ -1053,13 +1050,6 @@ void I422AlphaToARGBRow_C(const uint8* y_buf,
                           uint8* dst_argb,
                           const struct YuvConstants* yuvconstants,
                           int width);
-void I422AlphaToABGRRow_C(const uint8* y_buf,
-                          const uint8* u_buf,
-                          const uint8* v_buf,
-                          const uint8* a_buf,
-                          uint8* dst_argb,
-                          const struct YuvConstants* yuvconstants,
-                          int width);
 void I422ToABGRRow_C(const uint8* src_y,
                      const uint8* src_u,
                      const uint8* src_v,
@@ -1216,13 +1206,6 @@ void I422AlphaToARGBRow_SSSE3(const uint8* y_buf,
                               uint8* dst_argb,
                               const struct YuvConstants* yuvconstants,
                               int width);
-void I422AlphaToABGRRow_SSSE3(const uint8* y_buf,
-                              const uint8* u_buf,
-                              const uint8* v_buf,
-                              const uint8* a_buf,
-                              uint8* dst_argb,
-                              const struct YuvConstants* yuvconstants,
-                              int width);
 void I422AlphaToARGBRow_AVX2(const uint8* y_buf,
                              const uint8* u_buf,
                              const uint8* v_buf,
@@ -1230,13 +1213,6 @@ void I422AlphaToARGBRow_AVX2(const uint8* y_buf,
                              uint8* dst_argb,
                              const struct YuvConstants* yuvconstants,
                              int width);
-void I422AlphaToABGRRow_AVX2(const uint8* y_buf,
-                             const uint8* u_buf,
-                             const uint8* v_buf,
-                             const uint8* a_buf,
-                             uint8* dst_argb,
-                             const struct YuvConstants* yuvconstants,
-                             int width);
 void I422ToARGBRow_SSSE3(const uint8* src_y,
                          const uint8* src_u,
                          const uint8* src_v,
@@ -1434,13 +1410,6 @@ void I422AlphaToARGBRow_Any_SSSE3(const uint8* y_buf,
                                   uint8* dst_argb,
                                   const struct YuvConstants* yuvconstants,
                                   int width);
-void I422AlphaToABGRRow_Any_SSSE3(const uint8* y_buf,
-                                  const uint8* u_buf,
-                                  const uint8* v_buf,
-                                  const uint8* a_buf,
-                                  uint8* dst_abgr,
-                                  const struct YuvConstants* yuvconstants,
-                                  int width);
 void I422AlphaToARGBRow_Any_AVX2(const uint8* y_buf,
                                  const uint8* u_buf,
                                  const uint8* v_buf,
@@ -1448,13 +1417,6 @@ void I422AlphaToARGBRow_Any_AVX2(const uint8* y_buf,
                                  uint8* dst_argb,
                                  const struct YuvConstants* yuvconstants,
                                  int width);
-void I422AlphaToABGRRow_Any_AVX2(const uint8* y_buf,
-                                 const uint8* u_buf,
-                                 const uint8* v_buf,
-                                 const uint8* a_buf,
-                                 uint8* dst_abgr,
-                                 const struct YuvConstants* yuvconstants,
-                                 int width);
 void I411ToARGBRow_Any_SSSE3(const uint8* src_y,
                              const uint8* src_u,
                              const uint8* src_v,
@@ -11,6 +11,6 @@
 #ifndef INCLUDE_LIBYUV_VERSION_H_  // NOLINT
 #define INCLUDE_LIBYUV_VERSION_H_
-#define LIBYUV_VERSION 1525
+#define LIBYUV_VERSION 1526
 #endif  // INCLUDE_LIBYUV_VERSION_H_  NOLINT
@@ -45,7 +45,6 @@ int ARGBCopy(const uint8* src_argb, int src_stride_argb,
 }
 // Convert I444 to ARGB.
-LIBYUV_API
 static int I444ToARGBMatrix(const uint8* src_y, int src_stride_y,
                             const uint8* src_u, int src_stride_u,
                             const uint8* src_v, int src_stride_v,
@@ -129,33 +128,33 @@ int I444ToARGB(const uint8* src_y, int src_stride_y,
                          width, height);
 }
-// Convert J444 to ARGB.
+// Convert I444 to ABGR.
 LIBYUV_API
-int J444ToARGB(const uint8* src_y, int src_stride_y,
+int I444ToABGR(const uint8* src_y, int src_stride_y,
                const uint8* src_u, int src_stride_u,
                const uint8* src_v, int src_stride_v,
-               uint8* dst_argb, int dst_stride_argb,
+               uint8* dst_abgr, int dst_stride_abgr,
                int width, int height) {
   return I444ToARGBMatrix(src_y, src_stride_y,
+                          src_v, src_stride_v,  // Swap U and V
                           src_u, src_stride_u,
-                          src_v, src_stride_v,
-                          dst_argb, dst_stride_argb,
-                          &kYuvJConstants,
+                          dst_abgr, dst_stride_abgr,
+                          &kYvuIConstants,  // Use Yvu matrix
                           width, height);
 }
-// Convert I444 to ABGR.
+// Convert J444 to ARGB.
 LIBYUV_API
-int I444ToABGR(const uint8* src_y, int src_stride_y,
+int J444ToARGB(const uint8* src_y, int src_stride_y,
                const uint8* src_u, int src_stride_u,
                const uint8* src_v, int src_stride_v,
-               uint8* dst_abgr, int dst_stride_abgr,
+               uint8* dst_argb, int dst_stride_argb,
                int width, int height) {
   return I444ToARGBMatrix(src_y, src_stride_y,
-                          src_v, src_stride_v,
                           src_u, src_stride_u,
-                          dst_abgr, dst_stride_abgr,
-                          &kYvuIConstants,
+                          src_v, src_stride_v,
+                          dst_argb, dst_stride_argb,
+                          &kYuvJConstants,
                           width, height);
 }
@@ -307,13 +306,13 @@ int I411ToARGB(const uint8* src_y, int src_stride_y,
 }
 // Convert I420 with Alpha to preattenuated ARGB.
-LIBYUV_API
-int I420AlphaToARGB(const uint8* src_y, int src_stride_y,
-                    const uint8* src_u, int src_stride_u,
-                    const uint8* src_v, int src_stride_v,
-                    const uint8* src_a, int src_stride_a,
-                    uint8* dst_argb, int dst_stride_argb,
-                    int width, int height, int attenuate) {
+static int I420AlphaToARGBMatrix(const uint8* src_y, int src_stride_y,
+                                 const uint8* src_u, int src_stride_u,
+                                 const uint8* src_v, int src_stride_v,
+                                 const uint8* src_a, int src_stride_a,
+                                 uint8* dst_argb, int dst_stride_argb,
+                                 const struct YuvConstants* yuvconstants,
+                                 int width, int height, int attenuate) {
   int y;
   void (*I422AlphaToARGBRow)(const uint8* y_buf,
                              const uint8* u_buf,
@@ -393,7 +392,7 @@ int I420AlphaToARGB(const uint8* src_y, int src_stride_y,
 #endif
   for (y = 0; y < height; ++y) {
-    I422AlphaToARGBRow(src_y, src_u, src_v, src_a, dst_argb, &kYuvIConstants,
+    I422AlphaToARGBRow(src_y, src_u, src_v, src_a, dst_argb, yuvconstants,
                        width);
     if (attenuate) {
       ARGBAttenuateRow(dst_argb, dst_argb, width);
@@ -409,6 +408,23 @@ int I420AlphaToARGB(const uint8* src_y, int src_stride_y,
   return 0;
 }
+// Convert I420 with Alpha to preattenuated ARGB.
+LIBYUV_API
+int I420AlphaToARGB(const uint8* src_y, int src_stride_y,
+                    const uint8* src_u, int src_stride_u,
+                    const uint8* src_v, int src_stride_v,
+                    const uint8* src_a, int src_stride_a,
+                    uint8* dst_argb, int dst_stride_argb,
+                    int width, int height, int attenuate) {
+  return I420AlphaToARGBMatrix(src_y, src_stride_y,
+                               src_u, src_stride_u,
+                               src_v, src_stride_v,
+                               src_a, src_stride_a,
+                               dst_argb, dst_stride_argb,
+                               &kYuvIConstants,
+                               width, height, attenuate);
+}
 // Convert I420 with Alpha to preattenuated ABGR.
 LIBYUV_API
 int I420AlphaToABGR(const uint8* src_y, int src_stride_y,
@@ -417,99 +433,13 @@ int I420AlphaToABGR(const uint8* src_y, int src_stride_y,
                     const uint8* src_a, int src_stride_a,
                     uint8* dst_abgr, int dst_stride_abgr,
                     int width, int height, int attenuate) {
-  int y;
-  void (*I422AlphaToABGRRow)(const uint8* y_buf,
-                             const uint8* u_buf,
-                             const uint8* v_buf,
-                             const uint8* a_buf,
-                             uint8* dst_abgr,
-                             const struct YuvConstants* yuvconstants,
-                             int width) = I422AlphaToABGRRow_C;
-  void (*ARGBAttenuateRow)(const uint8* src_abgr, uint8* dst_abgr,
-                           int width) = ARGBAttenuateRow_C;
-  if (!src_y || !src_u || !src_v || !dst_abgr ||
-      width <= 0 || height == 0) {
-    return -1;
-  }
-  // Negative height means invert the image.
-  if (height < 0) {
-    height = -height;
-    dst_abgr = dst_abgr + (height - 1) * dst_stride_abgr;
-    dst_stride_abgr = -dst_stride_abgr;
-  }
-#if defined(HAS_I422ALPHATOABGRROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3)) {
-    I422AlphaToABGRRow = I422AlphaToABGRRow_Any_SSSE3;
-    if (IS_ALIGNED(width, 8)) {
-      I422AlphaToABGRRow = I422AlphaToABGRRow_SSSE3;
-    }
-  }
-#endif
-#if defined(HAS_I422ALPHATOABGRROW_AVX2)
-  if (TestCpuFlag(kCpuHasAVX2)) {
-    I422AlphaToABGRRow = I422AlphaToABGRRow_Any_AVX2;
-    if (IS_ALIGNED(width, 16)) {
-      I422AlphaToABGRRow = I422AlphaToABGRRow_AVX2;
-    }
-  }
-#endif
-#if defined(HAS_I422ALPHATOABGRROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON)) {
-    I422AlphaToABGRRow = I422AlphaToABGRRow_Any_NEON;
-    if (IS_ALIGNED(width, 8)) {
-      I422AlphaToABGRRow = I422AlphaToABGRRow_NEON;
-    }
-  }
-#endif
-#if defined(HAS_I422ALPHATOABGRROW_MIPS_DSPR2)
-  if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(width, 4) &&
-      IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) &&
-      IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) &&
-      IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2) &&
-      IS_ALIGNED(dst_abgr, 4) && IS_ALIGNED(dst_stride_abgr, 4)) {
-    I422AlphaToABGRRow = I422AlphaToABGRRow_MIPS_DSPR2;
-  }
-#endif
-#if defined(HAS_ARGBATTENUATEROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3)) {
-    ARGBAttenuateRow = ARGBAttenuateRow_Any_SSSE3;
-    if (IS_ALIGNED(width, 4)) {
-      ARGBAttenuateRow = ARGBAttenuateRow_SSSE3;
-    }
-  }
-#endif
-#if defined(HAS_ARGBATTENUATEROW_AVX2)
-  if (TestCpuFlag(kCpuHasAVX2)) {
-    ARGBAttenuateRow = ARGBAttenuateRow_Any_AVX2;
-    if (IS_ALIGNED(width, 8)) {
-      ARGBAttenuateRow = ARGBAttenuateRow_AVX2;
-    }
-  }
-#endif
-#if defined(HAS_ARGBATTENUATEROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON)) {
-    ARGBAttenuateRow = ARGBAttenuateRow_Any_NEON;
-    if (IS_ALIGNED(width, 8)) {
-      ARGBAttenuateRow = ARGBAttenuateRow_NEON;
-    }
-  }
-#endif
-  for (y = 0; y < height; ++y) {
-    I422AlphaToABGRRow(src_y, src_u, src_v, src_a, dst_abgr, &kYuvIConstants,
-                       width);
-    if (attenuate) {
-      ARGBAttenuateRow(dst_abgr, dst_abgr, width);
-    }
-    dst_abgr += dst_stride_abgr;
-    src_a += src_stride_a;
-    src_y += src_stride_y;
-    if (y & 1) {
-      src_u += src_stride_u;
-      src_v += src_stride_v;
-    }
-  }
-  return 0;
+  return I420AlphaToARGBMatrix(src_y, src_stride_y,
+                               src_v, src_stride_v,  // Swap U and V
+                               src_u, src_stride_u,
+                               src_a, src_stride_a,
+                               dst_abgr, dst_stride_abgr,
+                               &kYvuIConstants,  // Use Yvu matrix
+                               width, height, attenuate);
 }
 // Convert I400 to ARGB.
@@ -46,11 +46,9 @@ extern "C" {
 #ifdef HAS_I422ALPHATOARGBROW_SSSE3
 ANY41C(I422AlphaToARGBRow_Any_SSSE3, I422AlphaToARGBRow_SSSE3, 1, 0, 4, 7)
-ANY41C(I422AlphaToABGRRow_Any_SSSE3, I422AlphaToABGRRow_SSSE3, 1, 0, 4, 7)
 #endif
 #ifdef HAS_I422ALPHATOARGBROW_AVX2
 ANY41C(I422AlphaToARGBRow_Any_AVX2, I422AlphaToARGBRow_AVX2, 1, 0, 4, 7)
-ANY41C(I422AlphaToABGRRow_Any_AVX2, I422AlphaToABGRRow_AVX2, 1, 0, 4, 7)
 #endif
 #undef ANY41C
@@ -1413,34 +1413,6 @@ void I422ToABGRRow_C(const uint8* src_y,
   }
 }
-void I422AlphaToABGRRow_C(const uint8* src_y,
-                          const uint8* src_u,
-                          const uint8* src_v,
-                          const uint8* src_a,
-                          uint8* rgb_buf,
-                          const struct YuvConstants* yuvconstants,
-                          int width) {
-  int x;
-  for (x = 0; x < width - 1; x += 2) {
-    YuvPixel(src_y[0], src_u[0], src_v[0],
-             rgb_buf + 2, rgb_buf + 1, rgb_buf + 0, yuvconstants);
-    rgb_buf[3] = src_a[0];
-    YuvPixel(src_y[1], src_u[0], src_v[0],
-             rgb_buf + 6, rgb_buf + 5, rgb_buf + 4, yuvconstants);
-    rgb_buf[7] = src_a[1];
-    src_y += 2;
-    src_u += 1;
-    src_v += 1;
-    src_a += 2;
-    rgb_buf += 8;  // Advance 2 pixels.
-  }
-  if (width & 1) {
-    YuvPixel(src_y[0], src_u[0], src_v[0],
-             rgb_buf + 2, rgb_buf + 1, rgb_buf + 0, yuvconstants);
-    rgb_buf[3] = src_a[0];
-  }
-}
 void I422ToRGB24Row_C(const uint8* src_y,
                       const uint8* src_u,
                       const uint8* src_v,
@@ -1766,38 +1766,6 @@ void OMITFP I422AlphaToARGBRow_SSSE3(const uint8* y_buf,
   );
 }
-void OMITFP I422AlphaToABGRRow_SSSE3(const uint8* y_buf,
-                                     const uint8* u_buf,
-                                     const uint8* v_buf,
-                                     const uint8* a_buf,
-                                     uint8* dst_abgr,
-                                     const struct YuvConstants* yuvconstants,
-                                     int width) {
-  asm volatile (
-    "sub       %[u_buf],%[v_buf]               \n"
-    LABELALIGN
-  "1:                                          \n"
-    READYUVA422
-    YUVTORGB(yuvconstants)
-    STOREABGR
-    "subl      $0x8,%[width]                   \n"
-    "jg        1b                              \n"
-  : [y_buf]"+r"(y_buf),    // %[y_buf]
-    [u_buf]"+r"(u_buf),    // %[u_buf]
-    [v_buf]"+r"(v_buf),    // %[v_buf]
-    [a_buf]"+r"(a_buf),    // %[a_buf]
-    [dst_abgr]"+r"(dst_abgr),  // %[dst_abgr]
-#if defined(__i386__) && defined(__pic__)
-    [width]"+m"(width)     // %[width]
-#else
-    [width]"+rm"(width)    // %[width]
-#endif
-  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
-  : "memory", "cc", NACL_R14
-    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-  );
-}
 void OMITFP I411ToARGBRow_SSSE3(const uint8* y_buf,
                                 const uint8* u_buf,
                                 const uint8* v_buf,
@@ -2229,43 +2197,6 @@ void OMITFP I422AlphaToARGBRow_AVX2(const uint8* y_buf,
 }
 #endif  // HAS_I422ALPHATOARGBROW_AVX2
-#if defined(HAS_I422ALPHATOABGRROW_AVX2)
-// 16 pixels
-// 8 UV values upsampled to 16 UV, mixed with 16 Y and 16 A producing 16 ABGR.
-void OMITFP I422AlphaToABGRRow_AVX2(const uint8* y_buf,
-                                    const uint8* u_buf,
-                                    const uint8* v_buf,
-                                    const uint8* a_buf,
-                                    uint8* dst_abgr,
-                                    const struct YuvConstants* yuvconstants,
-                                    int width) {
-  asm volatile (
-    "sub       %[u_buf],%[v_buf]               \n"
-    LABELALIGN
-  "1:                                          \n"
-    READYUVA422_AVX2
-    YUVTORGB_AVX2(yuvconstants)
-    STOREABGR_AVX2
-    "subl      $0x10,%[width]                  \n"
-    "jg        1b                              \n"
-    "vzeroupper                                \n"
-  : [y_buf]"+r"(y_buf),    // %[y_buf]
-    [u_buf]"+r"(u_buf),    // %[u_buf]
-    [v_buf]"+r"(v_buf),    // %[v_buf]
-    [a_buf]"+r"(a_buf),    // %[a_buf]
-    [dst_abgr]"+r"(dst_abgr),  // %[dst_abgr]
-#if defined(__i386__) && defined(__pic__)
-    [width]"+m"(width)     // %[width]
-#else
-    [width]"+rm"(width)    // %[width]
-#endif
-  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
-  : "memory", "cc", NACL_R14
-    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-  );
-}
-#endif  // HAS_I422ALPHATOABGRROW_AVX2
 #if defined(HAS_I422TOABGRROW_AVX2)
 // 16 pixels
 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ABGR (64 bytes).
@@ -153,25 +153,6 @@ void I422AlphaToARGBRow_SSSE3(const uint8* y_buf,
 }
 #endif
-#if defined(HAS_I422ALPHATOABGRROW_SSSE3)
-void I422AlphaToABGRRow_SSSE3(const uint8* y_buf,
-                              const uint8* u_buf,
-                              const uint8* v_buf,
-                              const uint8* a_buf,
-                              uint8* dst_abgr,
-                              const struct YuvConstants* yuvconstants,
-                              int width) {
-  __m128i xmm0, xmm1, xmm2, xmm4, xmm5;
-  const ptrdiff_t offset = (uint8*)v_buf - (uint8*)u_buf;
-  while (width > 0) {
-    READYUVA422
-    YUVTORGB(yuvconstants)
-    STOREABGR
-    width -= 8;
-  }
-}
-#endif
 // 32 bit
 #else  // defined(_M_X64)
 #ifdef HAS_ARGBTOYROW_SSSE3
@@ -2185,49 +2166,6 @@ void I422AlphaToARGBRow_AVX2(const uint8* y_buf,
 }
 #endif  // HAS_I422ALPHATOARGBROW_AVX2
-#ifdef HAS_I422ALPHATOABGRROW_AVX2
-// 16 pixels
-// 8 UV values upsampled to 16 UV, mixed with 16 Y and 16 A producing 16 ABGR.
-__declspec(naked)
-void I422AlphaToABGRRow_AVX2(const uint8* y_buf,
-                             const uint8* u_buf,
-                             const uint8* v_buf,
-                             const uint8* a_buf,
-                             uint8* dst_abgr,
-                             const struct YuvConstants* yuvconstants,
-                             int width) {
-  __asm {
-    push       esi
-    push       edi
-    push       ebx
-    push       ebp
-    mov        eax, [esp + 16 + 4]   // Y
-    mov        esi, [esp + 16 + 8]   // U
-    mov        edi, [esp + 16 + 12]  // V
-    mov        ebp, [esp + 16 + 16]  // A
-    mov        edx, [esp + 16 + 20]  // abgr
-    mov        ebx, [esp + 16 + 24]  // yuvconstants
-    mov        ecx, [esp + 16 + 28]  // width
-    sub        edi, esi
-  convertloop:
-    READYUVA422_AVX2
-    YUVTORGB_AVX2(ebx)
-    STOREABGR_AVX2
-    sub        ecx, 16
-    jg         convertloop
-    pop        ebp
-    pop        ebx
-    pop        edi
-    pop        esi
-    vzeroupper
-    ret
-  }
-}
-#endif  // HAS_I422ALPHATOABGRROW_AVX2
 #ifdef HAS_I444TOARGBROW_AVX2
 // 16 pixels
 // 16 UV values with 16 Y producing 16 ARGB (64 bytes).
@@ -3027,46 +2965,6 @@ void I422AlphaToARGBRow_SSSE3(const uint8* y_buf,
   }
 }
-// 8 pixels.
-// 4 UV values upsampled to 8 UV, mixed with 8 Y and 8 A producing 8 ABGR.
-__declspec(naked)
-void I422AlphaToABGRRow_SSSE3(const uint8* y_buf,
-                              const uint8* u_buf,
-                              const uint8* v_buf,
-                              const uint8* a_buf,
-                              uint8* dst_abgr,
-                              const struct YuvConstants* yuvconstants,
-                              int width) {
-  __asm {
-    push       esi
-    push       edi
-    push       ebx
-    push       ebp
-    mov        eax, [esp + 16 + 4]   // Y
-    mov        esi, [esp + 16 + 8]   // U
-    mov        edi, [esp + 16 + 12]  // V
-    mov        ebp, [esp + 16 + 16]  // A
-    mov        edx, [esp + 16 + 20]  // abgr
-    mov        ebx, [esp + 16 + 24]  // yuvconstants
-    mov        ecx, [esp + 16 + 28]  // width
-    sub        edi, esi
-  convertloop:
-    READYUVA422
-    YUVTORGB(ebx)
-    STOREABGR
-    sub        ecx, 8
-    jg         convertloop
-    pop        ebp
-    pop        ebx
-    pop        edi
-    pop        esi
-    ret
-  }
-}
 // 8 pixels.
 // 2 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
 // Similar to I420 but duplicate UV once more.