Commit 9d2cd6a3 authored by Frank Barchard, committed by Frank Barchard

H010ToAR30 optimized to 2 step conversion

Previously H010ToAR30 was done in a 3 step conversion:
H010ToH420, H420ToARGB, ARGBToAR30.
This CL merges the first 2 steps into H010ToARGB, to
improve performance.
Caveat - only 10 bit YUV is supported at this time.
Previously the low level code supported different numbers
of bits - 9, 10, 12 or 16.

Was 3 step conversion:
LibYUVConvertTest.H010ToAR30_Any (1263 ms)
LibYUVConvertTest.H010ToAR30_Unaligned (951 ms)
LibYUVConvertTest.H010ToAR30_Invert (913 ms)
LibYUVConvertTest.H010ToAR30_Opt (901 ms)

Now 2 step conversion:
LibYUVConvertTest.H010ToAR30_Any (853 ms)
LibYUVConvertTest.H010ToAR30_Unaligned (811 ms)
LibYUVConvertTest.H010ToAR30_Invert (781 ms)
LibYUVConvertTest.H010ToAR30_Opt (755 ms)

Bug: libyuv:751
Test: LibYUVConvertTest.H010ToAR30_Opt
Change-Id: Ica7574040401cd57145a4827acdf3c0e58346a2a
Reviewed-on: https://chromium-review.googlesource.com/853288
Reviewed-by: Frank Barchard <fbarchard@chromium.org>
Reviewed-by: Miguel Casas <mcasas@chromium.org>
parent 263243aa
...@@ -410,7 +410,9 @@ int H422ToABGR(const uint8* src_y, ...@@ -410,7 +410,9 @@ int H422ToABGR(const uint8* src_y,
width, height); width, height);
} }
// Convert 10 bit YUV to 10 bit RGB with matrix // Convert 10 bit YUV to ARGB with matrix
// TODO(fbarchard): Consider passing scale multiplier to I210ToARGB to
// multiply 10 bit yuv into high bits to allow any number of bits.
static int H010ToAR30Matrix(const uint16* src_y, static int H010ToAR30Matrix(const uint16* src_y,
int src_stride_y, int src_stride_y,
const uint16* src_u, const uint16* src_u,
...@@ -420,20 +422,15 @@ static int H010ToAR30Matrix(const uint16* src_y, ...@@ -420,20 +422,15 @@ static int H010ToAR30Matrix(const uint16* src_y,
uint8* dst_ar30, uint8* dst_ar30,
int dst_stride_ar30, int dst_stride_ar30,
const struct YuvConstants* yuvconstants, const struct YuvConstants* yuvconstants,
int scale, // 16384 for 10 bits
int width, int width,
int height) { int height) {
int y; int y;
int halfwidth = (width + 1) >> 1; void (*I210ToARGBRow)(const uint16* y_buf, const uint16* u_buf,
void (*Convert16To8Row)(const uint16* src_y, uint8* dst_y, int scale, const uint16* v_buf, uint8* rgb_buf,
int width) = Convert16To8Row_C;
void (*I422ToARGBRow)(const uint8* y_buf, const uint8* u_buf,
const uint8* v_buf, uint8* rgb_buf,
const struct YuvConstants* yuvconstants, int width) = const struct YuvConstants* yuvconstants, int width) =
I422ToARGBRow_C; I210ToARGBRow_C;
void (*ARGBToAR30Row)(const uint8* src_argb, uint8* dst_rgb, int width) = void (*ARGBToAR30Row)(const uint8* src_argb, uint8* dst_rgb, int width) =
ARGBToAR30Row_C; ARGBToAR30Row_C;
if (!src_y || !src_u || !src_v || !dst_ar30 || width <= 0 || height == 0) { if (!src_y || !src_u || !src_v || !dst_ar30 || width <= 0 || height == 0) {
return -1; return -1;
} }
...@@ -443,20 +440,11 @@ static int H010ToAR30Matrix(const uint16* src_y, ...@@ -443,20 +440,11 @@ static int H010ToAR30Matrix(const uint16* src_y,
dst_ar30 = dst_ar30 + (height - 1) * dst_stride_ar30; dst_ar30 = dst_ar30 + (height - 1) * dst_stride_ar30;
dst_stride_ar30 = -dst_stride_ar30; dst_stride_ar30 = -dst_stride_ar30;
} }
#if defined(HAS_I210TOARGBROW_SSSE3)
#if defined(HAS_CONVERT16TO8ROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) { if (TestCpuFlag(kCpuHasSSSE3)) {
Convert16To8Row = Convert16To8Row_Any_SSSE3; I210ToARGBRow = I210ToARGBRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) { if (IS_ALIGNED(width, 8)) {
Convert16To8Row = Convert16To8Row_SSSE3; I210ToARGBRow = I210ToARGBRow_SSSE3;
}
}
#endif
#if defined(HAS_CONVERT16TO8ROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
Convert16To8Row = Convert16To8Row_Any_AVX2;
if (IS_ALIGNED(width, 32)) {
Convert16To8Row = Convert16To8Row_AVX2;
} }
} }
#endif #endif
...@@ -476,73 +464,25 @@ static int H010ToAR30Matrix(const uint16* src_y, ...@@ -476,73 +464,25 @@ static int H010ToAR30Matrix(const uint16* src_y,
} }
} }
#endif #endif
#if defined(HAS_I422TOARGBROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
I422ToARGBRow = I422ToARGBRow_Any_SSSE3;
if (IS_ALIGNED(width, 8)) {
I422ToARGBRow = I422ToARGBRow_SSSE3;
}
}
#endif
#if defined(HAS_I422TOARGBROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
I422ToARGBRow = I422ToARGBRow_Any_AVX2;
if (IS_ALIGNED(width, 16)) {
I422ToARGBRow = I422ToARGBRow_AVX2;
}
}
#endif
#if defined(HAS_I422TOARGBROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
I422ToARGBRow = I422ToARGBRow_Any_NEON;
if (IS_ALIGNED(width, 8)) {
I422ToARGBRow = I422ToARGBRow_NEON;
}
}
#endif
#if defined(HAS_I422TOARGBROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
I422ToARGBRow = I422ToARGBRow_Any_MSA;
if (IS_ALIGNED(width, 8)) {
I422ToARGBRow = I422ToARGBRow_MSA;
}
}
#endif
{ {
// Row buffers for 8 bit YUV and RGB. // Row buffers for 8 bit YUV and RGB.
align_buffer_64(row_buf, width + halfwidth * 2 + width * 4); align_buffer_64(row_argb, width * 4);
uint8* row_y = row_buf;
uint8* row_u = row_buf + width;
uint8* row_v = row_buf + width + halfwidth;
uint8* row_argb = row_buf + width + halfwidth * 2;
for (y = 0; y < height - 1; y += 2) {
Convert16To8Row(src_y, row_y, scale, width);
Convert16To8Row(src_u, row_u, scale, halfwidth);
Convert16To8Row(src_v, row_v, scale, halfwidth);
I422ToARGBRow(row_y, row_u, row_v, row_argb, yuvconstants, width);
ARGBToAR30Row(row_argb, dst_ar30, width);
Convert16To8Row(src_y + src_stride_y, row_y, scale, width); for (y = 0; y < height; ++y) {
I422ToARGBRow(row_y, row_u, row_v, row_argb, yuvconstants, width); I210ToARGBRow(src_y, src_u, src_v, row_argb, yuvconstants, width);
ARGBToAR30Row(row_argb, dst_ar30 + dst_stride_ar30, width);
dst_ar30 += dst_stride_ar30 * 2;
src_y += src_stride_y * 2;
src_u += src_stride_u;
src_v += src_stride_v;
}
if (height & 1) {
Convert16To8Row(src_y, row_y, scale, width);
Convert16To8Row(src_u, row_u, scale, halfwidth);
Convert16To8Row(src_v, row_v, scale, halfwidth);
I422ToARGBRow(row_y, row_u, row_v, row_argb, yuvconstants, width);
ARGBToAR30Row(row_argb, dst_ar30, width); ARGBToAR30Row(row_argb, dst_ar30, width);
dst_ar30 += dst_stride_ar30;
src_y += src_stride_y;
if (y & 1) {
src_u += src_stride_u;
src_v += src_stride_v;
}
} }
free_aligned_buffer_64(row_buf); free_aligned_buffer_64(row_argb);
} }
return 0; return 0;
} }
...@@ -560,7 +500,7 @@ int H010ToAR30(const uint16* src_y, ...@@ -560,7 +500,7 @@ int H010ToAR30(const uint16* src_y,
int height) { int height) {
return H010ToAR30Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v, return H010ToAR30Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
src_stride_v, dst_ar30, dst_stride_ar30, src_stride_v, dst_ar30, dst_stride_ar30,
&kYuvH709Constants, 16384, width, height); &kYuvH709Constants, width, height);
} }
// Convert 10 bit YUV to ARGB with matrix // Convert 10 bit YUV to ARGB with matrix
......
...@@ -211,7 +211,7 @@ ANY31C(I422ToRGB565Row_Any_MSA, I422ToRGB565Row_MSA, 1, 0, 2, 7) ...@@ -211,7 +211,7 @@ ANY31C(I422ToRGB565Row_Any_MSA, I422ToRGB565Row_MSA, 1, 0, 2, 7)
memcpy(temp, y_buf + n, r * SBPP); \ memcpy(temp, y_buf + n, r * SBPP); \
memcpy(temp + 16, u_buf + (n >> UVSHIFT), SS(r, UVSHIFT) * SBPP); \ memcpy(temp + 16, u_buf + (n >> UVSHIFT), SS(r, UVSHIFT) * SBPP); \
memcpy(temp + 32, v_buf + (n >> UVSHIFT), SS(r, UVSHIFT) * SBPP); \ memcpy(temp + 32, v_buf + (n >> UVSHIFT), SS(r, UVSHIFT) * SBPP); \
ANY_SIMD(temp, temp + 16, temp + 32, out, yuvconstants, MASK + 1); \ ANY_SIMD(temp, temp + 16, temp + 32, out, yuvconstants, MASK + 1); \
memcpy(dst_ptr + (n >> DUVSHIFT) * BPP, out, SS(r, DUVSHIFT) * BPP); \ memcpy(dst_ptr + (n >> DUVSHIFT) * BPP, out, SS(r, DUVSHIFT) * BPP); \
} }
......
...@@ -1625,16 +1625,18 @@ void RGBAToUVRow_SSSE3(const uint8* src_rgba0, ...@@ -1625,16 +1625,18 @@ void RGBAToUVRow_SSSE3(const uint8* src_rgba0,
// Read 4 UV from 422 10 bit, upsample to 8 UV // Read 4 UV from 422 10 bit, upsample to 8 UV
// TODO(fbarchard): Consider shufb to replace pack/unpack // TODO(fbarchard): Consider shufb to replace pack/unpack
// TODO(fbarchard): Consider pmulhuw to replace psraw
// TODO(fbarchard): Consider pmullw to replace psllw and allow different bits.
#define READYUV422_10 \ #define READYUV422_10 \
"movq " MEMACCESS([u_buf]) ",%%xmm0 \n" \ "movq " MEMACCESS([u_buf]) ",%%xmm0 \n" \
MEMOPREG(movq, 0x00, [u_buf], [v_buf], 1, xmm1) \ MEMOPREG(movq, 0x00, [u_buf], [v_buf], 1, xmm1) \
"lea " MEMLEA(0x8, [u_buf]) ",%[u_buf] \n" \ "lea " MEMLEA(0x8, [u_buf]) ",%[u_buf] \n" \
"punpcklwd %%xmm1,%%xmm0 \n" \ "punpcklwd %%xmm1,%%xmm0 \n" \
"psraw $0x2,%%xmm0 \n" \ "psraw $0x2,%%xmm0 \n" \
"packuswb %%xmm0,%%xmm0 \n" \ "packuswb %%xmm0,%%xmm0 \n" \
"punpcklwd %%xmm0,%%xmm0 \n" \ "punpcklwd %%xmm0,%%xmm0 \n" \
"movdqu " MEMACCESS([y_buf]) ",%%xmm4 \n" \ "movdqu " MEMACCESS([y_buf]) ",%%xmm4 \n" \
"psllw $0x6,%%xmm4 \n" \ "psllw $0x6,%%xmm4 \n" \
"lea " MEMLEA(0x10, [y_buf]) ",%[y_buf] \n" "lea " MEMLEA(0x10, [y_buf]) ",%[y_buf] \n"
// Read 4 UV from 422, upsample to 8 UV. With 8 Alpha. // Read 4 UV from 422, upsample to 8 UV. With 8 Alpha.
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment