Commit 000cf89c authored by Frank Barchard

YUY2ToARGB avx2 in 1 step conversion.

Includes UYVYToARGB ssse3 fix.

Was
YUY2ToARGB_Opt (433 ms)
69.79%  libyuv_unittest  libyuv_unittest      [.] I422ToARGBRow_AVX2
20.73%  libyuv_unittest  libyuv_unittest      [.] YUY2ToUV422Row_AVX2
 6.04%  libyuv_unittest  libyuv_unittest      [.] YUY2ToYRow_AVX2
 0.77%  libyuv_unittest  libyuv_unittest      [.] YUY2ToARGBRow_AVX2

Now
YUY2ToARGB_Opt (280 ms)
95.66%  libyuv_unittest  libyuv_unittest      [.] YUY2ToARGBRow_AVX2

BUG=libyuv:494
R=harryjin@google.com

Review URL: https://codereview.chromium.org/1364813002 .
parent 16f12b58
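Background on the change: the old path (removed below) split each YUY2 row into temporary Y, U and V row buffers and then ran I422ToARGBRow over them, while the new path shuffles Y and UV straight out of the packed YUY2 bytes and feeds them to the same YUVTORGB math with no intermediate stores. A minimal scalar sketch of the fused data flow, using plain BT.601 integer coefficients; the names and math here are illustrative only, not libyuv's actual fixed-point implementation (width assumed even):

#include <stdint.h>

static uint8_t Clamp255(int v) {
  return (uint8_t)(v < 0 ? 0 : v > 255 ? 255 : v);
}

// Stand-in for the YUVTORGB step: plain BT.601 studio-swing coefficients.
static void YuvPixelToArgb(uint8_t y, uint8_t u, uint8_t v, uint8_t* argb) {
  int c = (int)y - 16, d = (int)u - 128, e = (int)v - 128;
  argb[0] = Clamp255((298 * c + 516 * d + 128) >> 8);            // B
  argb[1] = Clamp255((298 * c - 100 * d - 208 * e + 128) >> 8);  // G
  argb[2] = Clamp255((298 * c + 409 * e + 128) >> 8);            // R
  argb[3] = 255;                                                 // A (opaque)
}

// One row of packed YUY2 (Y0 U Y1 V) converted directly to ARGB: the data
// flow the new AVX2 row function implements 16 pixels at a time.
void Yuy2RowToArgbSketch(const uint8_t* src_yuy2, uint8_t* dst_argb, int width) {
  for (int x = 0; x < width; x += 2) {
    uint8_t y0 = src_yuy2[0], u = src_yuy2[1], y1 = src_yuy2[2], v = src_yuy2[3];
    YuvPixelToArgb(y0, u, v, dst_argb + 0);  // both pixels share one UV pair
    YuvPixelToArgb(y1, u, v, dst_argb + 4);
    src_yuy2 += 4;  // 2 pixels = 4 bytes of YUY2
    dst_argb += 8;  // 2 pixels = 8 bytes of ARGB
  }
}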
Name: libyuv
URL: http://code.google.com/p/libyuv/
Version: 1489
Version: 1490
License: BSD
License File: LICENSE
@@ -11,6 +11,6 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT
#define INCLUDE_LIBYUV_VERSION_H_
#define LIBYUV_VERSION 1489
#define LIBYUV_VERSION 1490
#endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT
@@ -2607,48 +2607,6 @@ void NV12ToRGB565Row_AVX2(const uint8* src_y,
}
#endif
#if defined(HAS_YUY2TOARGBROW_AVX2)
void YUY2ToARGBRow_AVX2(const uint8* src_yuy2,
                        uint8* dst_argb,
                        struct YuvConstants* yuvconstants,
                        int width) {
  // Row buffers for intermediate YUV pixels.
  SIMD_ALIGNED32(uint8 row_y[MAXTWIDTH]);
  SIMD_ALIGNED32(uint8 row_u[MAXTWIDTH / 2]);
  SIMD_ALIGNED32(uint8 row_v[MAXTWIDTH / 2]);
  while (width > 0) {
    int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
    YUY2ToUV422Row_AVX2(src_yuy2, row_u, row_v, twidth);
    YUY2ToYRow_AVX2(src_yuy2, row_y, twidth);
    I422ToARGBRow_AVX2(row_y, row_u, row_v, dst_argb, yuvconstants, twidth);
    src_yuy2 += twidth * 2;
    dst_argb += twidth * 4;
    width -= twidth;
  }
}
#endif
#if defined(HAS_UYVYTOARGBROW_AVX2)
void UYVYToARGBRow_AVX2(const uint8* src_uyvy,
                        uint8* dst_argb,
                        struct YuvConstants* yuvconstants,
                        int width) {
  // Row buffers for intermediate YUV pixels.
  SIMD_ALIGNED32(uint8 row_y[MAXTWIDTH]);
  SIMD_ALIGNED32(uint8 row_u[MAXTWIDTH / 2]);
  SIMD_ALIGNED32(uint8 row_v[MAXTWIDTH / 2]);
  while (width > 0) {
    int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
    UYVYToUV422Row_AVX2(src_uyvy, row_u, row_v, twidth);
    UYVYToYRow_AVX2(src_uyvy, row_y, twidth);
    I422ToARGBRow_AVX2(row_y, row_u, row_v, dst_argb, yuvconstants, twidth);
    src_uyvy += twidth * 2;
    dst_argb += twidth * 4;
    width -= twidth;
  }
}
#endif // !defined(LIBYUV_DISABLE_X86)
#ifdef __cplusplus
} // extern "C"
} // namespace libyuv
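The speedup in the commit message follows from the buffering these removed wrappers did: per pixel the old path wrote 1 byte of Y plus half a byte each of U and V into the row buffers (about 2 bytes), reloaded those 2 bytes in I422ToARGBRow_AVX2, and read the 2 YUY2 source bytes twice (once in YUY2ToUV422Row_AVX2 and once in YUY2ToYRow_AVX2), i.e. roughly 2 extra bytes stored, 2 re-read, and a second pass over the source, spread across three row passes. The fused row functions added below touch each source byte once and write only the ARGB output, which is why YUY2ToARGBRow_AVX2 goes from 0.77% of the profile to doing essentially all of the work.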
@@ -243,6 +243,30 @@ static const uvec8 kShuffleMaskARGBToRAW_0 = {
2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 128u, 128u, 128u, 128u, 8u, 14u, 13u, 12u
};
// YUY2 shuf 16 Y to 32 Y.
static const lvec8 kShuffleYUY2Y = {
0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14,
0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14
};
// YUY2 shuf 8 UV to 16 UV.
static const lvec8 kShuffleYUY2UV = {
1, 3, 1, 3, 5, 7, 5, 7, 9, 11, 9, 11, 13, 15, 13, 15,
1, 3, 1, 3, 5, 7, 5, 7, 9, 11, 9, 11, 13, 15, 13, 15
};
// UYVY shuf 16 Y to 32 Y.
static const lvec8 kShuffleUYVYY = {
1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15,
1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15
};
// UYVY shuf 8 UV to 16 UV.
static const lvec8 kShuffleUYVYUV = {
0, 2, 0, 2, 4, 6, 4, 6, 8, 10, 8, 10, 12, 14, 12, 14,
0, 2, 0, 2, 4, 6, 4, 6, 8, 10, 8, 10, 12, 14, 12, 14
};
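These tables are applied per 16-byte lane by vpshufb: kShuffleYUY2Y copies each even source byte (the Y samples) into two adjacent output bytes, and kShuffleYUY2UV copies each U/V byte pair twice so both pixels of a 4:2:2 macropixel get their own chroma copy; the UYVY tables do the same with the byte positions swapped. A small scalar check of what the index tables produce (standard C, one lane only; Shuffle16 is a stand-in for pshufb, and none of these indices use the zeroing bit):

#include <stdint.h>
#include <stdio.h>

// Emulate one 16-byte lane of (v)pshufb: dst[i] = src[table[i] & 15].
static void Shuffle16(const uint8_t* src, const int8_t* table, uint8_t* dst) {
  for (int i = 0; i < 16; ++i) dst[i] = src[table[i] & 15];
}

int main(void) {
  // One lane of YUY2: Y0 U0 Y1 V0 Y2 U1 Y3 V1 ... = 8 pixels in 16 bytes.
  const uint8_t yuy2[16] = {10, 200, 11, 100, 12, 201, 13, 101,
                            14, 202, 15, 102, 16, 203, 17, 103};
  const int8_t y_tab[16]  = {0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14};
  const int8_t uv_tab[16] = {1, 3, 1, 3, 5, 7, 5, 7, 9, 11, 9, 11, 13, 15, 13, 15};
  uint8_t y_out[16], uv_out[16];
  Shuffle16(yuy2, y_tab, y_out);    // 10 10 11 11 12 12 ... each Y duplicated
  Shuffle16(yuy2, uv_tab, uv_out);  // 200 100 200 100 201 101 ... each UV pair duplicated
  for (int i = 0; i < 16; ++i) printf("%3d %3d\n", y_out[i], uv_out[i]);
  return 0;
}

Duplicating each Y byte gives the same layout the other AVX2 readers produce by unpacking Y with itself, so the shuffled registers drop straight into the existing YUVTORGB_AVX2 step.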
// Duplicates gray value 3 times and fills in alpha opaque.
__declspec(naked)
void J400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
@@ -1899,6 +1923,24 @@ void RGBAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
__asm lea eax, [eax + 16] \
}
// Read 8 YUY2 with 16 Y and upsample 8 UV to 16 UV.
#define READYUY2_AVX2 __asm { \
__asm vmovdqu ymm4, [eax] /* YUY2 */ \
__asm vpshufb ymm4, ymm4, ymmword ptr kShuffleYUY2Y \
__asm vmovdqu ymm0, [eax] /* UV */ \
__asm vpshufb ymm0, ymm0, ymmword ptr kShuffleYUY2UV \
__asm lea eax, [eax + 32] \
}
// Read 8 UYVY with 16 Y and upsample 8 UV to 16 UV.
#define READUYVY_AVX2 __asm { \
__asm vmovdqu ymm4, [eax] /* UYVY */ \
__asm vpshufb ymm4, ymm4, ymmword ptr kShuffleUYVYY \
__asm vmovdqu ymm0, [eax] /* UV */ \
__asm vpshufb ymm0, ymm0, ymmword ptr kShuffleUYVYUV \
__asm lea eax, [eax + 32] \
}
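Each of these macros retires 16 pixels: 16 px x 2 bytes/px = 32 bytes, hence the lea eax, [eax + 32]. The same 32 source bytes are loaded twice and shuffled two different ways, leaving ymm4 with 32 Y bytes (each sample duplicated) and ymm0 with 32 interleaved UV bytes, so the whole read step is two loads, two shuffles and one pointer advance instead of the separate UV422 and Y row passes the old wrappers needed.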
// Convert 16 pixels: 16 UV and 16 Y.
#define YUVTORGB_AVX2(YuvConstants) __asm { \
__asm vpmaddubsw ymm2, ymm0, ymmword ptr [YuvConstants + KUVTOR] /* R UV */\
@@ -2168,6 +2210,65 @@ void NV12ToARGBRow_AVX2(const uint8* y_buf,
}
#endif // HAS_NV12TOARGBROW_AVX2
// 16 pixels.
// 8 YUY2 values with 16 Y and 8 UV producing 16 ARGB (64 bytes).
__declspec(naked)
void YUY2ToARGBRow_AVX2(const uint8* src_yuy2,
                        uint8* dst_argb,
                        struct YuvConstants* yuvconstants,
                        int width) {
  __asm {
    push       ebp
    mov        eax, [esp + 4 + 4]   // yuy2
    mov        edx, [esp + 4 + 8]   // argb
    mov        ebp, [esp + 4 + 12]  // yuvconstants
    mov        ecx, [esp + 4 + 16]  // width
    vpcmpeqb   ymm5, ymm5, ymm5     // generate 0xffffffffffffffff for alpha

 convertloop:
    READYUY2_AVX2
    YUVTORGB_AVX2(ebp)
    STOREARGB_AVX2

    sub        ecx, 16
    jg         convertloop

    pop        ebp
    vzeroupper
    ret
  }
}
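A note on the [esp + 4 + N] operands used in both new functions: __declspec(naked) means the compiler emits no prologue, so on entry [esp] holds the return address and the cdecl arguments start at [esp + 4]; the explicit push ebp then lowers esp by 4, which is where the extra "+ 4" in every operand comes from.

    // Stack after "push ebp" (32-bit cdecl):
    //   [esp + 0]         saved ebp
    //   [esp + 4]         return address
    //   [esp + 4 + 4]     src_yuy2 / src_uyvy
    //   [esp + 4 + 8]     dst_argb
    //   [esp + 4 + 12]    yuvconstants
    //   [esp + 4 + 16]    width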
// 16 pixels.
// 8 UYVY values with 16 Y and 8 UV producing 16 ARGB (64 bytes).
__declspec(naked)
void UYVYToARGBRow_AVX2(const uint8* src_uyvy,
                        uint8* dst_argb,
                        struct YuvConstants* yuvconstants,
                        int width) {
  __asm {
    push       ebp
    mov        eax, [esp + 4 + 4]   // uyvy
    mov        edx, [esp + 4 + 8]   // argb
    mov        ebp, [esp + 4 + 12]  // yuvconstants
    mov        ecx, [esp + 4 + 16]  // width
    vpcmpeqb   ymm5, ymm5, ymm5     // generate 0xffffffffffffffff for alpha

 convertloop:
    READUYVY_AVX2
    YUVTORGB_AVX2(ebp)
    STOREARGB_AVX2

    sub        ecx, 16
    jg         convertloop

    pop        ebp
    vzeroupper
    ret
  }
}
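A hedged example of how a caller might drive one of these row functions over a whole frame. It assumes the declarations from libyuv's row.h are visible and that width is a multiple of 16 (each loop iteration retires 16 pixels); real code should normally go through the public YUY2ToARGB()/UYVYToARGB() entry points, which handle odd widths and CPU dispatch:

// Hypothetical helper, not part of libyuv.
void ConvertYuy2FrameWithAvx2Row(const uint8* src_yuy2, int src_stride_yuy2,
                                 uint8* dst_argb, int dst_stride_argb,
                                 struct YuvConstants* yuvconstants,  // e.g. BT.601 constants
                                 int width, int height) {
  for (int y = 0; y < height; ++y) {
    YUY2ToARGBRow_AVX2(src_yuy2, dst_argb, yuvconstants, width);
    src_yuy2 += src_stride_yuy2;  // strides are in bytes; YUY2 is 2 bytes/pixel
    dst_argb += dst_stride_argb;  // ARGB is 4 bytes/pixel
  }
}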
#ifdef HAS_I422TOBGRAROW_AVX2
// 16 pixels
// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 BGRA (64 bytes).
@@ -2338,17 +2439,7 @@ void I422ToABGRRow_AVX2(const uint8* y_buf,
__asm lea eax, [eax + 8] \
}
// YUY2 shuf 8 Y to 16 Y.
static const vec8 kShuffleYUY2Y = {
0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14
};
// YUY2 shuf 4 UV to 8 UV.
static const vec8 kShuffleYUY2UV = {
1, 3, 1, 3, 5, 7, 5, 7, 9, 11, 9, 11, 13, 15, 13, 15
};
// Read 4 YUY2 with 8 Y and update 4 UV to 8 UV.
// Read 4 YUY2 with 8 Y and upsample 4 UV to 8 UV.
#define READYUY2 __asm { \
__asm movdqu xmm4, [eax] /* YUY2 */ \
__asm pshufb xmm4, xmmword ptr kShuffleYUY2Y \
@@ -2357,24 +2448,13 @@ static const vec8 kShuffleYUY2UV = {
__asm lea eax, [eax + 16] \
}
// UYVY shuf 8 Y to 16 Y.
static const vec8 kShuffleUYVYY = {
1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15
};
// UYVY shuf 4 UV to 8 UV.
static const vec8 kShuffleUYVYUV = {
0, 2, 0, 2, 4, 6, 4, 6, 8, 10, 8, 10, 12, 14, 12, 14
};
// Read 4 UYVY with 8 Y and update 4 UV to 8 UV.
// Read 4 UYVY with 8 Y and upsample 4 UV to 8 UV.
#define READUYVY __asm { \
__asm movdqu xmm4, [eax] /* UYVY */ \
__asm pshufb xmm4, xmmword ptr kShuffleUYVYY \
__asm movdqu xmm0, [eax] /* UV */ \
__asm pshufb xmm0, xmmword ptr kShuffleUYVYUV \
__asm lea eax, [eax + 16] \
__asm lea eax, [eax + 8] \
}
// Convert 8 pixels: 8 UV and 8 Y.