Commit 284d6bdf authored by fbarchard@google.com

Port I422ToBGRA from the Windows version, which does 16 pixels at a time, for a performance improvement.

BUG=386
TESTED=nacl build
R=brucedawson@google.com, harryjin@google.com

Review URL: https://webrtc-codereview.appspot.com/36549004

git-svn-id: http://libyuv.googlecode.com/svn/trunk@1207 16f28f9a-4ce2-e073-06de-1de4eb20be90
parent 685813d6
Name: libyuv
URL: http://code.google.com/p/libyuv/
-Version: 1206
+Version: 1207
License: BSD
License File: LICENSE
@@ -11,6 +11,6 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT
#define INCLUDE_LIBYUV_VERSION_H_
-#define LIBYUV_VERSION 1206
+#define LIBYUV_VERSION 1207
#endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT
@@ -998,7 +998,7 @@ void ARGBToUVRow_AVX2(const uint8* src_argb0, int src_stride_argb,
"vpaddb %%ymm5,%%ymm0,%%ymm0 \n"
"vextractf128 $0x0,%%ymm0," MEMACCESS(1) " \n"
-VEXTOPMEM(vextractf128,1,ymm0,0x0,1,2,1) // vextractf128 1,%%ymm0,(%1,%2,1)
+VEXTOPMEM(vextractf128,1,ymm0,0x0,1,2,1) // vextractf128 $1,%%ymm0,(%1,%2,1)
"lea " MEMLEA(0x10,1) ",%1 \n"
"sub $0x20,%3 \n"
"jg 1b \n"
@@ -2066,7 +2066,7 @@ void OMITFP I422ToRGBARow_SSSE3(const uint8* y_buf,
#endif // HAS_I422TOARGBROW_SSSE3
-#if defined(HAS_I422TOBGRAROW_AVX2)
+#if defined(HAS_I422TOARGBROW_AVX2) || defined(HAS_I422TOBGRAROW_AVX2)
struct {
lvec8 kUVToB_AVX; // 0
lvec8 kUVToG_AVX; // 32
@@ -2094,180 +2094,7 @@ struct {
{ YG, YG, YG, YG, YG, YG, YG, YG,
YG, YG, YG, YG, YG, YG, YG, YG }
};
// 32 pixels
// 16 UV values upsampled to 32 UV, mixed with 32 Y producing 32 BGRA pixels.
void I422ToBGRARow_AVX2(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* dst_bgra,
int width) {
// Note: vpermq shuffles quad words (64 bit). 0xd8 = 0b11011000 in binary.
// "vpermq $0xd8, src, dst" with src qwords ABCD yields dst = ACBD. This is
// useful because vpunpck l/h works on the low/high quad words respectively.
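// Worked example (illustration): with ymm0 = [q0 q1 q2 q3], the immediate's
// 2-bit fields, read from the low bits up, give source indices {0, 2, 1, 3},
// so "vpermq $0xd8, %ymm0, %ymm0" leaves ymm0 = [q0 q2 q1 q3].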
asm volatile (
"sub %[u_buf], %[v_buf] \n"
LABELALIGN
// Compute 32 BGRA pixels each iteration in the following loop.
"1: \n"
/*
* Prepare UV contribution to RGB.
*/
"vmovdqu " MEMACCESS([u_buf]) ",%%xmm0 \n"
// ymm0 = xxxxxxxxxxxxxxxxUUUUUUUUUUUUUUUU, uint8
MEMOPREG(vmovdqu, 0x00, [u_buf], [v_buf], 1, xmm1)
// ymm1 = xxxxxxxxxxxxxxxxVVVVVVVVVVVVVVVV, uint8
"lea " MEMLEA(0x10, [u_buf]) ", %[u_buf] \n" // u_buf += 16
"vpermq $0xd8, %%ymm0, %%ymm0 \n"
// ymm0 = xxxxxxxxUUUUUUUUxxxxxxxxUUUUUUUU
"vpermq $0xd8, %%ymm1, %%ymm1 \n"
// ymm1 = xxxxxxxxVVVVVVVVxxxxxxxxVVVVVVVV
"vpunpcklbw %%ymm1, %%ymm0, %%ymm0 \n"
// ymm0 = UVUVUVUVUVUVUVUVUVUVUVUVUVUVUVUV
"vpmaddubsw " MEMACCESS([kYuvConstants]) ", %%ymm0, %%ymm2 \n"
// ymm2 (B) = int16(UB * U + VB * V), for each int16.
"vpmaddubsw " MEMACCESS2(32, [kYuvConstants]) ", %%ymm0, %%ymm1 \n"
// ymm1 (G) = int16(UG * U + VG * V), for each int16.
"vpmaddubsw " MEMACCESS2(64, [kYuvConstants]) ", %%ymm0, %%ymm0 \n"
// ymm0 (R) = int16(UR * U + VR * V), for each int16.
"vpsubw " MEMACCESS2(96, [kYuvConstants]) ", %%ymm2, %%ymm2 \n"
// ymm2 -= BB, each int16
"vpsubw " MEMACCESS2(128, [kYuvConstants]) ", %%ymm1, %%ymm1 \n"
// ymm1 -= BG, each int16
"vpsubw " MEMACCESS2(160, [kYuvConstants]) ", %%ymm0, %%ymm0 \n"
// ymm0 -= BR, each int16
// Shuffle order so that we can upsample with vpunpck l/h wd later.
"vpermq $0xd8, %%ymm0, %%ymm0 \n"
"vpermq $0xd8, %%ymm1, %%ymm1 \n"
"vpermq $0xd8, %%ymm2, %%ymm2 \n"
/*
* Prepare Y contribution to RGB.
*/
// Use ymm3 and ymm4 as temporary variables in this block.
"vmovdqu " MEMACCESS([y_buf]) ", %%ymm3 \n"
// ymm3 = YYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYY
"lea " MEMLEA(0x20, [y_buf]) ",%[y_buf] \n" // y_buf += 32
"vpermq $0xd8, %%ymm3, %%ymm3 \n"
"vpxor %%ymm4, %%ymm4, %%ymm4 \n" // ymm4 = 0x00...
"vpunpcklbw %%ymm4, %%ymm3, %%ymm6 \n"
"vpunpckhbw %%ymm4, %%ymm3, %%ymm7 \n"
// ymm6 = 0Y0Y0Y0Y0Y0Y0Y0Y0Y0Y0Y0Y0Y0Y0Y0Y (int16), pixels 0-15.
// ymm7 = 0Y0Y0Y0Y0Y0Y0Y0Y0Y0Y0Y0Y0Y0Y0Y0Y (int16), pixels 16-31.
// Upsample UV_RGB pixels 16-31.
"vpunpckhwd %%ymm2, %%ymm2, %%ymm5 \n"
"vpunpckhwd %%ymm1, %%ymm1, %%ymm4 \n"
"vpunpckhwd %%ymm0, %%ymm0, %%ymm3 \n"
// Upsample UV_RGB pixels 0-15.
"vpunpcklwd %%ymm2, %%ymm2, %%ymm2 \n"
"vpunpcklwd %%ymm1, %%ymm1, %%ymm1 \n"
"vpunpcklwd %%ymm0, %%ymm0, %%ymm0 \n"
// ymm6/7 -= BY, for each int16.
"vpsubsw " MEMACCESS2(192, [kYuvConstants]) ", %%ymm6, %%ymm6 \n"
"vpsubsw " MEMACCESS2(192, [kYuvConstants]) ", %%ymm7, %%ymm7 \n"
// ymm6/7 *= YG, for each int16.
"vpmullw " MEMACCESS2(224, [kYuvConstants]) ", %%ymm6, %%ymm6 \n"
"vpmullw " MEMACCESS2(224, [kYuvConstants]) ", %%ymm7, %%ymm7 \n"
/*
* Pixels 0-15.
*/
"vpaddsw %%ymm2, %%ymm6, %%ymm2 \n" // ymm2 (B) += ymm6 (each int16)
"vpaddsw %%ymm1, %%ymm6, %%ymm1 \n" // ymm1 (G)
"vpaddsw %%ymm0, %%ymm6, %%ymm0 \n" // ymm0 (R)
"vpsraw $6, %%ymm2, %%ymm2 \n" // ymm2 >>= 6 (each int16)
"vpsraw $6, %%ymm1, %%ymm1 \n"
"vpsraw $6, %%ymm0, %%ymm0 \n"
// Cast each int16 to uint8.
"vpackuswb %%ymm2, %%ymm2, %%ymm2 \n"
// ymm2 = xxxxxxxxBBBBBBBBxxxxxxxxBBBBBBBB
"vpackuswb %%ymm1, %%ymm1, %%ymm1 \n"
// ymm1 = xxxxxxxxGGGGGGGGxxxxxxxxGGGGGGGG
"vpackuswb %%ymm0, %%ymm0, %%ymm0 \n"
// ymm0 = xxxxxxxxRRRRRRRRxxxxxxxxRRRRRRRR
"vpunpcklbw %%ymm2, %%ymm1, %%ymm2 \n"
// ymm2 = BGBGBGBGBGBGBGBGBGBGBGBGBGBGBGBG
"vpcmpeqb %%ymm6, %%ymm6, %%ymm6 \n" // ymm6 = 0xFF..., for alpha.
"vpunpcklbw %%ymm0, %%ymm6, %%ymm0 \n"
// ymm0 = RARARARARARARARARARARARARARARARA
"vpermq $0xd8, %%ymm2, %%ymm2 \n"
"vpermq $0xd8, %%ymm0, %%ymm0 \n"
"vpunpcklwd %%ymm2, %%ymm0, %%ymm1 \n"
// ymm1 = BGRABGRABGRABGRABGRABGRABGRABGRA, pixels 0-7.
"vpunpckhwd %%ymm2, %%ymm0, %%ymm2 \n"
// ymm2 = BGRABGRABGRABGRABGRABGRABGRABGRA, pixels 8-15.
// Store pixels 0-15.
"vmovdqu %%ymm1," MEMACCESS([dst_bgra]) "\n"
"vmovdqu %%ymm2," MEMACCESS2(0x20, [dst_bgra]) "\n"
/*
* Pixels 16-31.
*/
"vpaddsw %%ymm5, %%ymm7, %%ymm5 \n" // ymm5 (B) += ymm7 (each int16)
"vpaddsw %%ymm4, %%ymm7, %%ymm4 \n" // ymm4 (G)
"vpaddsw %%ymm3, %%ymm7, %%ymm3 \n" // ymm3 (R)
"vpsraw $6, %%ymm5, %%ymm5 \n" // ymm5 >>= 6 (each int16)
"vpsraw $6, %%ymm4, %%ymm4 \n"
"vpsraw $6, %%ymm3, %%ymm3 \n"
// Cast each int16 to uint8.
"vpackuswb %%ymm5, %%ymm5, %%ymm5 \n"
// ymm5 = xxxxxxxxBBBBBBBBxxxxxxxxBBBBBBBB
"vpackuswb %%ymm4, %%ymm4, %%ymm4 \n"
// ymm4 = xxxxxxxxGGGGGGGGxxxxxxxxGGGGGGGG
"vpackuswb %%ymm3, %%ymm3, %%ymm3 \n"
// ymm3 = xxxxxxxxRRRRRRRRxxxxxxxxRRRRRRRR
"vpunpcklbw %%ymm5, %%ymm4, %%ymm5 \n"
// ymm5 = BGBGBGBGBGBGBGBGBGBGBGBGBGBGBGBG
"vpunpcklbw %%ymm3, %%ymm6, %%ymm3 \n"
// ymm3 = RARARARARARARARARARARARARARARARA
"vpermq $0xd8, %%ymm5, %%ymm5 \n"
"vpermq $0xd8, %%ymm3, %%ymm3 \n"
"vpunpcklwd %%ymm5, %%ymm3, %%ymm4 \n"
// ymm4 = BGRABGRABGRABGRABGRABGRABGRABGRA, pixels 16-23.
"vpunpckhwd %%ymm5, %%ymm3, %%ymm5 \n"
// ymm5 = BGRABGRABGRABGRABGRABGRABGRABGRA, pixels 24-31.
// Store pixels 16-31.
"vmovdqu %%ymm4," MEMACCESS2(0x40, [dst_bgra]) "\n"
"vmovdqu %%ymm5," MEMACCESS2(0x60, [dst_bgra]) "\n"
"lea " MEMLEA(0x80, [dst_bgra]) ", %[dst_bgra] \n" // dst_bgra += 128
"sub $0x20, %[width] \n" // width -= 32
"jg 1b \n"
"vzeroupper \n"
: [y_buf]"+r"(y_buf), // %[y_buf]
[u_buf]"+r"(u_buf), // %[u_buf]
[v_buf]"+r"(v_buf), // %[v_buf]
[dst_bgra]"+r"(dst_bgra), // %[dst_bgra]
[width]"+rm"(width) // %[width]
: [kYuvConstants]"r"(&kYuvConstants_AVX.kUVToB_AVX) // %[kYuvConstants]
: "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
, "r14"
#endif
#if defined(__SSE2__)
// TODO(magjed): declare ymm usage when applicable.
, "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
#endif
);
}
#endif // HAS_I422TOBGRAROW_AVX2
#endif // defined(HAS_I422TOARGBROW_AVX2) || defined(HAS_I422TOBGRAROW_AVX2)
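For reference, here is the fixed-point math that the removed 32-pixel kernel (and its 16-pixel replacement below) implements, written out per pixel in scalar C. The offsets 0/32/64/96/128/160/192/224 in the MEMACCESS2 operands above index the kYuvConstants_AVX fields in that order. The coefficient values shown are the classic BT.601 studio-swing ones, included only to make the sketch self-contained; they are assumptions for illustration, not values copied from this revision.

typedef unsigned char uint8;  // as in libyuv's basic_types.h

// Assumed BT.601 coefficients, fixed point with 6 fractional bits.
enum {
  YG = 74,             // round(1.164 * 64): luma scale
  UB = 127, VB = 0,    // B: 2.018 * 64, saturated to int8; B ignores V
  UG = -25, VG = -52,  // G: -0.391 * 64 and -0.813 * 64
  UR = 0, VR = 102,    // R: 1.596 * 64; R ignores U
  BB = UB * 128 + VB * 128,  // biases fold in the 128 chroma offset
  BG = UG * 128 + VG * 128,
  BR = UR * 128 + VR * 128,
  BY = 16              // luma black level
};

static uint8 Clamp(int v) {
  return (uint8)(v < 0 ? 0 : v > 255 ? 255 : v);  // mirrors vpackuswb
}

// One pixel: bias-corrected chroma plus scaled luma, >> 6, saturated.
// libyuv's BGRA byte order is A,R,G,B in memory, matching the AR/GB weave.
static void YuvToBgraPixel(uint8 y, uint8 u, uint8 v, uint8* dst) {
  int y1 = ((int)y - BY) * YG;  // the vpsubsw + vpmullw steps
  dst[0] = 255;
  dst[1] = Clamp((UR * u + VR * v - BR + y1) >> 6);  // R
  dst[2] = Clamp((UG * u + VG * v - BG + y1) >> 6);  // G
  dst[3] = Clamp((UB * u + VB * v - BB + y1) >> 6);  // B
}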
// Read 8 UV from 422, upsample to 16 UV.
#define READYUV422_AVX2 \
@@ -2302,6 +2129,54 @@ void I422ToBGRARow_AVX2(const uint8* y_buf,
"vpackuswb %%ymm1,%%ymm1,%%ymm1 \n" \
"vpackuswb %%ymm2,%%ymm2,%%ymm2 \n"
#if defined(HAS_I422TOBGRAROW_AVX2)
// 16 pixels
// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 BGRA (64 bytes).
void OMITFP I422ToBGRARow_AVX2(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* dst_bgra,
int width) {
asm volatile (
"sub %[u_buf],%[v_buf] \n"
"vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
"vpxor %%ymm4,%%ymm4,%%ymm4 \n"
LABELALIGN
"1: \n"
READYUV422_AVX2
YUVTORGB_AVX2
// Step 3: Weave into BGRA
"vpunpcklbw %%ymm0,%%ymm1,%%ymm1 \n" // GB
"vpermq $0xd8,%%ymm1,%%ymm1 \n"
"vpunpcklbw %%ymm2,%%ymm5,%%ymm2 \n" // AR
"vpermq $0xd8,%%ymm2,%%ymm2 \n"
"vpunpcklwd %%ymm1,%%ymm2,%%ymm0 \n" // ARGB first 8 pixels
"vpunpckhwd %%ymm1,%%ymm2,%%ymm2 \n" // ARGB next 8 pixels
"vmovdqu %%ymm0," MEMACCESS([dst_bgra]) "\n"
"vmovdqu %%ymm2," MEMACCESS2(0x20,[dst_bgra]) "\n"
"lea " MEMLEA(0x40,[dst_bgra]) ",%[dst_bgra] \n"
"sub $0x10,%[width] \n"
"jg 1b \n"
"vzeroupper \n"
: [y_buf]"+r"(y_buf), // %[y_buf]
[u_buf]"+r"(u_buf), // %[u_buf]
[v_buf]"+r"(v_buf), // %[v_buf]
[dst_bgra]"+r"(dst_bgra), // %[dst_bgra]
[width]"+rm"(width) // %[width]
: [kYuvConstants]"r"(&kYuvConstants_AVX.kUVToB_AVX) // %[kYuvConstants]
: "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
, "r14"
#endif
#if defined(__SSE2__)
, "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
);
}
#endif // HAS_I422TOBGRAROW_AVX2
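For context, a hypothetical plane-level wrapper (illustration only, not part of this CL) showing how a row kernel like this is typically driven. Real libyuv dispatch handles widths that are not a multiple of 16 via the *_Any row wrappers; this sketch assumes width % 16 == 0. Since I422 subsamples chroma horizontally only, u and v advance by their stride on every row.

static void I422ToBGRAPlane_AVX2(const uint8* src_y, int src_stride_y,
                                 const uint8* src_u, int src_stride_u,
                                 const uint8* src_v, int src_stride_v,
                                 uint8* dst_bgra, int dst_stride_bgra,
                                 int width, int height) {
  int y;
  for (y = 0; y < height; ++y) {
    I422ToBGRARow_AVX2(src_y, src_u, src_v, dst_bgra, width);
    src_y += src_stride_y;
    src_u += src_stride_u;  // full vertical chroma resolution in I422
    src_v += src_stride_v;
    dst_bgra += dst_stride_bgra;
  }
}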
#if defined(HAS_I422TOARGBROW_AVX2)
// 16 pixels
// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).