Commit 9d2cd6a3 authored by Frank Barchard, committed by Frank Barchard

H010ToAR30 optimized to 2 step conversion

Previously H010ToAR30 was done in a 3 step conversion:
H010ToH420, H420ToARGB, ARGBToAR30.
This CL merges the first 2 steps into H010ToARGB, to
improve performance.
Caveat - only 10 bit YUV is supported at this time.
Previously the low level code supported different numbers
of bits - 9, 10, 12 or 16.

Was 3 step conversion:
LibYUVConvertTest.H010ToAR30_Any (1263 ms)
LibYUVConvertTest.H010ToAR30_Unaligned (951 ms)
LibYUVConvertTest.H010ToAR30_Invert (913 ms)
LibYUVConvertTest.H010ToAR30_Opt (901 ms)

Now 2 step conversion:
LibYUVConvertTest.H010ToAR30_Any (853 ms)
LibYUVConvertTest.H010ToAR30_Unaligned (811 ms)
LibYUVConvertTest.H010ToAR30_Invert (781 ms)
LibYUVConvertTest.H010ToAR30_Opt (755 ms)

Bug: libyuv:751
Test: LibYUVConvertTest.H010ToAR30_Opt
Change-Id: Ica7574040401cd57145a4827acdf3c0e58346a2a
Reviewed-on: https://chromium-review.googlesource.com/853288
Reviewed-by: Frank Barchard <fbarchard@chromium.org>
Reviewed-by: Miguel Casas <mcasas@chromium.org>
parent 263243aa
......@@ -410,7 +410,9 @@ int H422ToABGR(const uint8* src_y,
width, height);
}
// Convert 10 bit YUV to 10 bit RGB with matrix
// Convert 10 bit YUV to ARGB with matrix
// TODO(fbarchard): Consider passing scale multiplier to I210ToARGB to
// multiply 10 bit yuv into high bits to allow any number of bits.
static int H010ToAR30Matrix(const uint16* src_y,
int src_stride_y,
const uint16* src_u,
......@@ -420,20 +422,15 @@ static int H010ToAR30Matrix(const uint16* src_y,
uint8* dst_ar30,
int dst_stride_ar30,
const struct YuvConstants* yuvconstants,
int scale, // 16384 for 10 bits
int width,
int height) {
int y;
int halfwidth = (width + 1) >> 1;
void (*Convert16To8Row)(const uint16* src_y, uint8* dst_y, int scale,
int width) = Convert16To8Row_C;
void (*I422ToARGBRow)(const uint8* y_buf, const uint8* u_buf,
const uint8* v_buf, uint8* rgb_buf,
void (*I210ToARGBRow)(const uint16* y_buf, const uint16* u_buf,
const uint16* v_buf, uint8* rgb_buf,
const struct YuvConstants* yuvconstants, int width) =
I422ToARGBRow_C;
I210ToARGBRow_C;
void (*ARGBToAR30Row)(const uint8* src_argb, uint8* dst_rgb, int width) =
ARGBToAR30Row_C;
if (!src_y || !src_u || !src_v || !dst_ar30 || width <= 0 || height == 0) {
return -1;
}
......@@ -443,20 +440,11 @@ static int H010ToAR30Matrix(const uint16* src_y,
dst_ar30 = dst_ar30 + (height - 1) * dst_stride_ar30;
dst_stride_ar30 = -dst_stride_ar30;
}
#if defined(HAS_CONVERT16TO8ROW_SSSE3)
#if defined(HAS_I210TOARGBROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
Convert16To8Row = Convert16To8Row_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
Convert16To8Row = Convert16To8Row_SSSE3;
}
}
#endif
#if defined(HAS_CONVERT16TO8ROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
Convert16To8Row = Convert16To8Row_Any_AVX2;
if (IS_ALIGNED(width, 32)) {
Convert16To8Row = Convert16To8Row_AVX2;
I210ToARGBRow = I210ToARGBRow_Any_SSSE3;
if (IS_ALIGNED(width, 8)) {
I210ToARGBRow = I210ToARGBRow_SSSE3;
}
}
#endif
......@@ -476,73 +464,25 @@ static int H010ToAR30Matrix(const uint16* src_y,
}
}
#endif
#if defined(HAS_I422TOARGBROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
I422ToARGBRow = I422ToARGBRow_Any_SSSE3;
if (IS_ALIGNED(width, 8)) {
I422ToARGBRow = I422ToARGBRow_SSSE3;
}
}
#endif
#if defined(HAS_I422TOARGBROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
I422ToARGBRow = I422ToARGBRow_Any_AVX2;
if (IS_ALIGNED(width, 16)) {
I422ToARGBRow = I422ToARGBRow_AVX2;
}
}
#endif
#if defined(HAS_I422TOARGBROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
I422ToARGBRow = I422ToARGBRow_Any_NEON;
if (IS_ALIGNED(width, 8)) {
I422ToARGBRow = I422ToARGBRow_NEON;
}
}
#endif
#if defined(HAS_I422TOARGBROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
I422ToARGBRow = I422ToARGBRow_Any_MSA;
if (IS_ALIGNED(width, 8)) {
I422ToARGBRow = I422ToARGBRow_MSA;
}
}
#endif
{
// Row buffers for 8 bit YUV and RGB.
align_buffer_64(row_buf, width + halfwidth * 2 + width * 4);
uint8* row_y = row_buf;
uint8* row_u = row_buf + width;
uint8* row_v = row_buf + width + halfwidth;
uint8* row_argb = row_buf + width + halfwidth * 2;
for (y = 0; y < height - 1; y += 2) {
Convert16To8Row(src_y, row_y, scale, width);
Convert16To8Row(src_u, row_u, scale, halfwidth);
Convert16To8Row(src_v, row_v, scale, halfwidth);
I422ToARGBRow(row_y, row_u, row_v, row_argb, yuvconstants, width);
ARGBToAR30Row(row_argb, dst_ar30, width);
align_buffer_64(row_argb, width * 4);
Convert16To8Row(src_y + src_stride_y, row_y, scale, width);
I422ToARGBRow(row_y, row_u, row_v, row_argb, yuvconstants, width);
ARGBToAR30Row(row_argb, dst_ar30 + dst_stride_ar30, width);
dst_ar30 += dst_stride_ar30 * 2;
src_y += src_stride_y * 2;
src_u += src_stride_u;
src_v += src_stride_v;
}
if (height & 1) {
Convert16To8Row(src_y, row_y, scale, width);
Convert16To8Row(src_u, row_u, scale, halfwidth);
Convert16To8Row(src_v, row_v, scale, halfwidth);
I422ToARGBRow(row_y, row_u, row_v, row_argb, yuvconstants, width);
for (y = 0; y < height; ++y) {
I210ToARGBRow(src_y, src_u, src_v, row_argb, yuvconstants, width);
ARGBToAR30Row(row_argb, dst_ar30, width);
dst_ar30 += dst_stride_ar30;
src_y += src_stride_y;
if (y & 1) {
src_u += src_stride_u;
src_v += src_stride_v;
}
}
free_aligned_buffer_64(row_buf);
free_aligned_buffer_64(row_argb);
}
return 0;
}
......@@ -560,7 +500,7 @@ int H010ToAR30(const uint16* src_y,
int height) {
return H010ToAR30Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
src_stride_v, dst_ar30, dst_stride_ar30,
&kYuvH709Constants, 16384, width, height);
&kYuvH709Constants, width, height);
}
// Convert 10 bit YUV to ARGB with matrix
......
......@@ -211,7 +211,7 @@ ANY31C(I422ToRGB565Row_Any_MSA, I422ToRGB565Row_MSA, 1, 0, 2, 7)
memcpy(temp, y_buf + n, r * SBPP); \
memcpy(temp + 16, u_buf + (n >> UVSHIFT), SS(r, UVSHIFT) * SBPP); \
memcpy(temp + 32, v_buf + (n >> UVSHIFT), SS(r, UVSHIFT) * SBPP); \
ANY_SIMD(temp, temp + 16, temp + 32, out, yuvconstants, MASK + 1); \
ANY_SIMD(temp, temp + 16, temp + 32, out, yuvconstants, MASK + 1); \
memcpy(dst_ptr + (n >> DUVSHIFT) * BPP, out, SS(r, DUVSHIFT) * BPP); \
}
......
......@@ -1625,16 +1625,18 @@ void RGBAToUVRow_SSSE3(const uint8* src_rgba0,
// Read 4 UV from 422 10 bit, upsample to 8 UV
// TODO(fbarchard): Consider shufb to replace pack/unpack
// TODO(fbarchard): Consider pmulhuw to replace psraw
// TODO(fbarchard): Consider pmullw to replace psllw and allow different bits.
#define READYUV422_10 \
"movq " MEMACCESS([u_buf]) ",%%xmm0 \n" \
MEMOPREG(movq, 0x00, [u_buf], [v_buf], 1, xmm1) \
"lea " MEMLEA(0x8, [u_buf]) ",%[u_buf] \n" \
"punpcklwd %%xmm1,%%xmm0 \n" \
"psraw $0x2,%%xmm0 \n" \
"packuswb %%xmm0,%%xmm0 \n" \
"psraw $0x2,%%xmm0 \n" \
"packuswb %%xmm0,%%xmm0 \n" \
"punpcklwd %%xmm0,%%xmm0 \n" \
"movdqu " MEMACCESS([y_buf]) ",%%xmm4 \n" \
"psllw $0x6,%%xmm4 \n" \
"psllw $0x6,%%xmm4 \n" \
"lea " MEMLEA(0x10, [y_buf]) ",%[y_buf] \n"
// Read 4 UV from 422, upsample to 8 UV. With 8 Alpha.
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment