H010ToAR30 optimized to 2 step conversion

Previously H010ToAR30 was done in a 3 step conversion: H010ToH420, H420ToARGB, ARGBToAR30. This CL merges the first 2 steps into H010ToARGB, to improve performance. Caveat - only 10 bit YUV is supported at this time. Previously the low level code supported different numbers of bits - 9, 10, 12 or 16. Was 3 step conversion: LibYUVConvertTest.H010ToAR30_Any (1263 ms) LibYUVConvertTest.H010ToAR30_Unaligned (951 ms) LibYUVConvertTest.H010ToAR30_Invert (913 ms) LibYUVConvertTest.H010ToAR30_Opt (901 ms) Now 2 step conversion: LibYUVConvertTest.H010ToAR30_Any (853 ms) LibYUVConvertTest.H010ToAR30_Unaligned (811 ms) LibYUVConvertTest.H010ToAR30_Invert (781 ms) LibYUVConvertTest.H010ToAR30_Opt (755 ms) Bug: libyuv:751 Test: LibYUVConvertTest.H010ToAR30_Opt Change-Id: Ica7574040401cd57145a4827acdf3c0e58346a2a Reviewed-on: https://chromium-review.googlesource.com/853288 Reviewed-by: Frank Barchard <fbarchard@chromium.org> Reviewed-by: Miguel Casas <mcasas@chromium.org>

H010ToAR30 optimized to 2 step conversion
Previously H010ToAR30 was done in a 3 step conversion: H010ToH420, H420ToARGB, ARGBToAR30. This CL merges the first 2 steps into H010ToARGB, to improve performance. Caveat - only 10 bit YUV is supported at this time. Previously the low level code supported different numbers of bits - 9, 10, 12 or 16. Was 3 step conversion: LibYUVConvertTest.H010ToAR30_Any (1263 ms) LibYUVConvertTest.H010ToAR30_Unaligned (951 ms) LibYUVConvertTest.H010ToAR30_Invert (913 ms) LibYUVConvertTest.H010ToAR30_Opt (901 ms) Now 2 step conversion: LibYUVConvertTest.H010ToAR30_Any (853 ms) LibYUVConvertTest.H010ToAR30_Unaligned (811 ms) LibYUVConvertTest.H010ToAR30_Invert (781 ms) LibYUVConvertTest.H010ToAR30_Opt (755 ms) Bug: libyuv:751 Test: LibYUVConvertTest.H010ToAR30_Opt Change-Id: Ica7574040401cd57145a4827acdf3c0e58346a2a Reviewed-on: https://chromium-review.googlesource.com/853288 Reviewed-by: Frank Barchard <fbarchard@chromium.org> Reviewed-by: Miguel Casas <mcasas@chromium.org>
9d2cd6a3 · Frank Barchard · Frank Barchard · 263243aa · 9d2cd6a3 · 9d2cd6a3
Commit 9d2cd6a3 authored Jan 06, 2018 by Frank Barchard Committed by Frank Barchard Jan 07, 2018
Hide whitespace changes
Inline Side-by-side

Showing with 28 additions and 86 deletions

convert_argb.cc source/convert_argb.cc +22 -82

row_any.cc source/row_any.cc +1 -1

row_gcc.cc source/row_gcc.cc +5 -3

No files found.
--- a/source/convert_argb.cc
+++ b/source/convert_argb.cc
@@ -410,7 +410,9 @@ int H422ToABGR(const uint8* src_y,
                          width, height);
 }

-// Convert 10 bit YUV to 10 bit RGB with matrix
+// Convert 10 bit YUV to ARGB with matrix
+// TODO(fbarchard): Consider passing scale multiplier to I210ToARGB to
+// multiply 10 bit yuv into high bits to allow any number of bits.
 static int H010ToAR30Matrix(const uint16* src_y,
                            int src_stride_y,
                            const uint16* src_u,
@@ -420,20 +422,15 @@ static int H010ToAR30Matrix(const uint16* src_y,
                            uint8* dst_ar30,
                            int dst_stride_ar30,
                            const struct YuvConstants* yuvconstants,
-                            int scale,  // 16384 for 10 bits
                            int width,
                            int height) {
  int y;
-  int halfwidth = (width + 1) >> 1;
-  void (*Convert16To8Row)(const uint16* src_y, uint8* dst_y, int scale,
-                          int width) = Convert16To8Row_C;
-  void (*I422ToARGBRow)(const uint8* y_buf, const uint8* u_buf,
-                        const uint8* v_buf, uint8* rgb_buf,
+  void (*I210ToARGBRow)(const uint16* y_buf, const uint16* u_buf,
+                        const uint16* v_buf, uint8* rgb_buf,
                        const struct YuvConstants* yuvconstants, int width) =
-      I422ToARGBRow_C;
+      I210ToARGBRow_C;
  void (*ARGBToAR30Row)(const uint8* src_argb, uint8* dst_rgb, int width) =
      ARGBToAR30Row_C;
-
  if (!src_y || !src_u || !src_v || !dst_ar30 || width <= 0 || height == 0) {
    return -1;
  }
@@ -443,20 +440,11 @@ static int H010ToAR30Matrix(const uint16* src_y,
    dst_ar30 = dst_ar30 + (height - 1) * dst_stride_ar30;
    dst_stride_ar30 = -dst_stride_ar30;
  }
-
-#if defined(HAS_CONVERT16TO8ROW_SSSE3)
+#if defined(HAS_I210TOARGBROW_SSSE3)
  if (TestCpuFlag(kCpuHasSSSE3)) {
-    Convert16To8Row = Convert16To8Row_Any_SSSE3;
-    if (IS_ALIGNED(width, 16)) {
-      Convert16To8Row = Convert16To8Row_SSSE3;
-    }
-  }
-#endif
-#if defined(HAS_CONVERT16TO8ROW_AVX2)
-  if (TestCpuFlag(kCpuHasAVX2)) {
-    Convert16To8Row = Convert16To8Row_Any_AVX2;
-    if (IS_ALIGNED(width, 32)) {
-      Convert16To8Row = Convert16To8Row_AVX2;
+    I210ToARGBRow = I210ToARGBRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 8)) {
+      I210ToARGBRow = I210ToARGBRow_SSSE3;
    }
  }
 #endif
@@ -476,73 +464,25 @@ static int H010ToAR30Matrix(const uint16* src_y,
    }
  }
 #endif
-#if defined(HAS_I422TOARGBROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3)) {
-    I422ToARGBRow = I422ToARGBRow_Any_SSSE3;
-    if (IS_ALIGNED(width, 8)) {
-      I422ToARGBRow = I422ToARGBRow_SSSE3;
-    }
-  }
-#endif
-#if defined(HAS_I422TOARGBROW_AVX2)
-  if (TestCpuFlag(kCpuHasAVX2)) {
-    I422ToARGBRow = I422ToARGBRow_Any_AVX2;
-    if (IS_ALIGNED(width, 16)) {
-      I422ToARGBRow = I422ToARGBRow_AVX2;
-    }
-  }
-#endif
-#if defined(HAS_I422TOARGBROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON)) {
-    I422ToARGBRow = I422ToARGBRow_Any_NEON;
-    if (IS_ALIGNED(width, 8)) {
-      I422ToARGBRow = I422ToARGBRow_NEON;
-    }
-  }
-#endif
-#if defined(HAS_I422TOARGBROW_MSA)
-  if (TestCpuFlag(kCpuHasMSA)) {
-    I422ToARGBRow = I422ToARGBRow_Any_MSA;
-    if (IS_ALIGNED(width, 8)) {
-      I422ToARGBRow = I422ToARGBRow_MSA;
-    }
-  }
-#endif

  {
    // Row buffers for 8 bit YUV and RGB.
-    align_buffer_64(row_buf, width + halfwidth * 2 + width * 4);
-    uint8* row_y = row_buf;
-    uint8* row_u = row_buf + width;
-    uint8* row_v = row_buf + width + halfwidth;
-    uint8* row_argb = row_buf + width + halfwidth * 2;
-
-    for (y = 0; y < height - 1; y += 2) {
-      Convert16To8Row(src_y, row_y, scale, width);
-      Convert16To8Row(src_u, row_u, scale, halfwidth);
-      Convert16To8Row(src_v, row_v, scale, halfwidth);
-      I422ToARGBRow(row_y, row_u, row_v, row_argb, yuvconstants, width);
-      ARGBToAR30Row(row_argb, dst_ar30, width);
+    align_buffer_64(row_argb, width * 4);

-      Convert16To8Row(src_y + src_stride_y, row_y, scale, width);
-      I422ToARGBRow(row_y, row_u, row_v, row_argb, yuvconstants, width);
-      ARGBToAR30Row(row_argb, dst_ar30 + dst_stride_ar30, width);
-      dst_ar30 += dst_stride_ar30 * 2;
-      src_y += src_stride_y * 2;
-      src_u += src_stride_u;
-      src_v += src_stride_v;
-    }
-
-    if (height & 1) {
-      Convert16To8Row(src_y, row_y, scale, width);
-      Convert16To8Row(src_u, row_u, scale, halfwidth);
-      Convert16To8Row(src_v, row_v, scale, halfwidth);
-      I422ToARGBRow(row_y, row_u, row_v, row_argb, yuvconstants, width);
+    for (y = 0; y < height; ++y) {
+      I210ToARGBRow(src_y, src_u, src_v, row_argb, yuvconstants, width);
      ARGBToAR30Row(row_argb, dst_ar30, width);
+      dst_ar30 += dst_stride_ar30;
+      src_y += src_stride_y;
+      if (y & 1) {
+        src_u += src_stride_u;
+        src_v += src_stride_v;
+      }
    }

-    free_aligned_buffer_64(row_buf);
+    free_aligned_buffer_64(row_argb);
  }
+
  return 0;
 }

@@ -560,7 +500,7 @@ int H010ToAR30(const uint16* src_y,
               int height) {
  return H010ToAR30Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
                          src_stride_v, dst_ar30, dst_stride_ar30,
-                          &kYuvH709Constants, 16384, width, height);
+                          &kYuvH709Constants, width, height);
 }

 // Convert 10 bit YUV to ARGB with matrix

--- a/source/row_any.cc
+++ b/source/row_any.cc
@@ -211,7 +211,7 @@ ANY31C(I422ToRGB565Row_Any_MSA, I422ToRGB565Row_MSA, 1, 0, 2, 7)
    memcpy(temp, y_buf + n, r * SBPP);                                         \
    memcpy(temp + 16, u_buf + (n >> UVSHIFT), SS(r, UVSHIFT) * SBPP);          \
    memcpy(temp + 32, v_buf + (n >> UVSHIFT), SS(r, UVSHIFT) * SBPP);          \
-    ANY_SIMD(temp, temp + 16, temp + 32, out, yuvconstants, MASK + 1);        \
+    ANY_SIMD(temp, temp + 16, temp + 32, out, yuvconstants, MASK + 1);         \
    memcpy(dst_ptr + (n >> DUVSHIFT) * BPP, out, SS(r, DUVSHIFT) * BPP);       \
  }


--- a/source/row_gcc.cc
+++ b/source/row_gcc.cc
@@ -1625,16 +1625,18 @@ void RGBAToUVRow_SSSE3(const uint8* src_rgba0,

 // Read 4 UV from 422 10 bit, upsample to 8 UV
 // TODO(fbarchard): Consider shufb to replace pack/unpack
+// TODO(fbarchard): Consider pmulhuw to replace psraw
+// TODO(fbarchard): Consider pmullw to replace psllw and allow different bits.
 #define READYUV422_10 \
  "movq       " MEMACCESS([u_buf]) ",%%xmm0                     \n"            \
    MEMOPREG(movq, 0x00, [u_buf], [v_buf], 1, xmm1)                            \
    "lea        " MEMLEA(0x8, [u_buf]) ",%[u_buf]               \n"            \
    "punpcklwd  %%xmm1,%%xmm0                                   \n"            \
-    "psraw      $0x2,%%xmm0                                     \n" \
-    "packuswb   %%xmm0,%%xmm0                                   \n" \
+    "psraw      $0x2,%%xmm0                                     \n"            \
+    "packuswb   %%xmm0,%%xmm0                                   \n"            \
    "punpcklwd  %%xmm0,%%xmm0                                   \n"            \
    "movdqu     " MEMACCESS([y_buf]) ",%%xmm4                   \n"            \
-    "psllw      $0x6,%%xmm4                                     \n" \
+    "psllw      $0x6,%%xmm4                                     \n"            \
    "lea        " MEMLEA(0x10, [y_buf]) ",%[y_buf]               \n"

 // Read 4 UV from 422, upsample to 8 UV.  With 8 Alpha.