I210ToARGB conversion from 10 bit YUV to RGB

SSSE3-optimized 10-bit YUV to ARGB conversion in a single step.

Bug: libyuv:751
Test: I010ToARGB
Change-Id: I234b2850e35992113ee6bd638732bafc7010a60d
Reviewed-on: https://chromium-review.googlesource.com/848238
Commit-Queue: Frank Barchard <fbarchard@chromium.org>
Reviewed-by: Frank Barchard <fbarchard@chromium.org>
Reviewed-by: richard winterton <rrwinterton@gmail.com>

I210ToARGB conversion from 10 bit YUV to RGB
SSSE3-optimized 10-bit YUV to ARGB conversion in a single step.

Bug: libyuv:751
Test: I010ToARGB
Change-Id: I234b2850e35992113ee6bd638732bafc7010a60d
Reviewed-on: https://chromium-review.googlesource.com/848238
Commit-Queue: Frank Barchard <fbarchard@chromium.org>
Reviewed-by: Frank Barchard <fbarchard@chromium.org>
Reviewed-by: richard winterton <rrwinterton@gmail.com>
a6465859 · Frank Barchard · Commit Bot · ac088b4b · a6465859 · a6465859
Commit a6465859 authored Jan 04, 2018 by Frank Barchard Committed by Commit Bot Jan 05, 2018
8 changed files
--- a/README.chromium
+++ b/README.chromium
 Name: libyuv
 URL: http://code.google.com/p/libyuv/
-Version: 1686
+Version: 1687
 License: BSD
 License File: LICENSE

--- a/include/libyuv/convert_argb.h
+++ b/include/libyuv/convert_argb.h
@@ -63,6 +63,32 @@ int I420ToABGR(const uint8* src_y,
               int width,
               int height);
+// Convert I010 to ARGB.
+LIBYUV_API
+int I010ToARGB(const uint16* src_y,
+               int src_stride_y,
+               const uint16* src_u,
+               int src_stride_u,
+               const uint16* src_v,
+               int src_stride_v,
+               uint8* dst_argb,
+               int dst_stride_argb,
+               int width,
+               int height);
+/* Convert H010 to ARGB.
+ *
+ * src_y, src_u and src_v point to planes of 16 bit samples.  The source
+ * strides are added to uint16 pointers, so they count 16 bit elements
+ * rather than bytes; dst_stride_argb is added to a uint8 pointer and
+ * counts bytes.  The chroma planes advance one row for every two luma
+ * rows.  The conversion uses kYuvH709Constants.
+ *
+ * Returns 0 on success, or -1 when any plane pointer is NULL, width <= 0
+ * or height == 0.
+ */
+LIBYUV_API
+int H010ToARGB(const uint16* src_y,
+               int src_stride_y,
+               const uint16* src_u,
+               int src_stride_u,
+               const uint16* src_v,
+               int src_stride_v,
+               uint8* dst_argb,
+               int dst_stride_argb,
+               int width,
+               int height);
 // Convert I422 to ARGB.
 LIBYUV_API
 int I422ToARGB(const uint8* src_y,

--- a/include/libyuv/row.h
+++ b/include/libyuv/row.h
@@ -265,6 +265,8 @@ extern "C" {
 #define HAS_ARGBTOAR30ROW_SSSE3
 #define HAS_CONVERT16TO8ROW_SSSE3
 #define HAS_CONVERT8TO16ROW_SSE2
+// The I210 row functions are used by H010.  The 2 means 4:2:2: one U,V
+// pair per two pixels within a row.  I is for 601 vs H for 709.
+#define HAS_I210TOARGBROW_SSSE3
 #define HAS_MERGERGBROW_SSSE3
 #define HAS_SPLITRGBROW_SSSE3
 #endif
@@ -1735,9 +1737,9 @@ void I422ToARGBRow_C(const uint8* src_y,
                     uint8* dst_argb,
                     const struct YuvConstants* yuvconstants,
                     int width);
-void I422ToARGBRow_C(const uint8* src_y,
+void I210ToARGBRow_C(const uint16* src_y,
-                     const uint8* src_u,
+                     const uint16* src_u,
-                     const uint8* src_v,
+                     const uint16* src_v,
                     uint8* dst_argb,
                     const struct YuvConstants* yuvconstants,
                     int width);
@@ -1807,12 +1809,6 @@ void I422ToARGBRow_AVX2(const uint8* src_y,
                        uint8* dst_argb,
                        const struct YuvConstants* yuvconstants,
                        int width);
-void I422ToARGBRow_AVX2(const uint8* src_y,
-                        const uint8* src_u,
-                        const uint8* src_v,
-                        uint8* dst_argb,
-                        const struct YuvConstants* yuvconstants,
-                        int width);
 void I422ToRGBARow_AVX2(const uint8* src_y,
                        const uint8* src_u,
                        const uint8* src_v,
@@ -1849,6 +1845,13 @@ void I422ToARGBRow_SSSE3(const uint8* src_y,
                         uint8* dst_argb,
                         const struct YuvConstants* yuvconstants,
                         int width);
+void I210ToARGBRow_SSSE3(const uint16* src_y,
+                         const uint16* src_u,
+                         const uint16* src_v,
+                         uint8* dst_argb,
+                         const struct YuvConstants* yuvconstants,
+                         int width);
 void I422AlphaToARGBRow_SSSE3(const uint8* y_buf,
                              const uint8* u_buf,
                              const uint8* v_buf,
@@ -1863,12 +1866,6 @@ void I422AlphaToARGBRow_AVX2(const uint8* y_buf,
                             uint8* dst_argb,
                             const struct YuvConstants* yuvconstants,
                             int width);
-void I422ToARGBRow_SSSE3(const uint8* src_y,
-                         const uint8* src_u,
-                         const uint8* src_v,
-                         uint8* dst_argb,
-                         const struct YuvConstants* yuvconstants,
-                         int width);
 void NV12ToARGBRow_SSSE3(const uint8* src_y,
                         const uint8* src_uv,
                         uint8* dst_argb,
@@ -1999,6 +1996,12 @@ void I422ToARGBRow_Any_SSSE3(const uint8* src_y,
                             uint8* dst_argb,
                             const struct YuvConstants* yuvconstants,
                             int width);
+void I210ToARGBRow_Any_SSSE3(const uint16* src_y,
+                             const uint16* src_u,
+                             const uint16* src_v,
+                             uint8* dst_argb,
+                             const struct YuvConstants* yuvconstants,
+                             int width);
 void I422AlphaToARGBRow_Any_SSSE3(const uint8* y_buf,
                                  const uint8* u_buf,
                                  const uint8* v_buf,

--- a/include/libyuv/version.h
+++ b/include/libyuv/version.h
@@ -11,6 +11,6 @@
 #ifndef INCLUDE_LIBYUV_VERSION_H_
 #define INCLUDE_LIBYUV_VERSION_H_
-#define LIBYUV_VERSION 1686
+#define LIBYUV_VERSION 1687
 #endif  // INCLUDE_LIBYUV_VERSION_H_
--- a/source/convert_argb.cc
+++ b/source/convert_argb.cc
@@ -47,7 +47,7 @@ int ARGBCopy(const uint8* src_argb,
  return 0;
 }
-// Convert I422 to ARGB with matrix
+// Convert I420 to ARGB with matrix
 static int I420ToARGBMatrix(const uint8* src_y,
                            int src_stride_y,
                            const uint8* src_u,
@@ -573,18 +573,13 @@ static int H010ToARGBMatrix(const uint16* src_y,
                            uint8* dst_argb,
                            int dst_stride_argb,
                            const struct YuvConstants* yuvconstants,
-                            int scale,  // 16384 for 10 bits
                            int width,
                            int height) {
  int y;
-  int halfwidth = (width + 1) >> 1;
+  void (*I210ToARGBRow)(const uint16* y_buf, const uint16* u_buf,
-  void (*Convert16To8Row)(const uint16* src_y, uint8* dst_y, int scale,
+                        const uint16* v_buf, uint8* rgb_buf,
-                          int width) = Convert16To8Row_C;
-  void (*I422ToARGBRow)(const uint8* y_buf, const uint8* u_buf,
-                        const uint8* v_buf, uint8* rgb_buf,
                        const struct YuvConstants* yuvconstants, int width) =
-      I422ToARGBRow_C;
+      I210ToARGBRow_C;
  if (!src_y || !src_u || !src_v || !dst_argb || width <= 0 || height == 0) {
    return -1;
  }
@@ -594,85 +589,23 @@ static int H010ToARGBMatrix(const uint16* src_y,
    dst_argb = dst_argb + (height - 1) * dst_stride_argb;
    dst_stride_argb = -dst_stride_argb;
  }
+#if defined(HAS_I210TOARGBROW_SSSE3)
-#if defined(HAS_CONVERT16TO8ROW_SSSE3)
  if (TestCpuFlag(kCpuHasSSSE3)) {
-    Convert16To8Row = Convert16To8Row_Any_SSSE3;
+    I210ToARGBRow = I210ToARGBRow_Any_SSSE3;
-    if (IS_ALIGNED(width, 16)) {
-      Convert16To8Row = Convert16To8Row_SSSE3;
-    }
-  }
-#endif
-#if defined(HAS_CONVERT16TO8ROW_AVX2)
-  if (TestCpuFlag(kCpuHasAVX2)) {
-    Convert16To8Row = Convert16To8Row_Any_AVX2;
-    if (IS_ALIGNED(width, 32)) {
-      Convert16To8Row = Convert16To8Row_AVX2;
-    }
-  }
-#endif
-#if defined(HAS_I422TOARGBROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3)) {
-    I422ToARGBRow = I422ToARGBRow_Any_SSSE3;
-    if (IS_ALIGNED(width, 8)) {
-      I422ToARGBRow = I422ToARGBRow_SSSE3;
-    }
-  }
-#endif
-#if defined(HAS_I422TOARGBROW_AVX2)
-  if (TestCpuFlag(kCpuHasAVX2)) {
-    I422ToARGBRow = I422ToARGBRow_Any_AVX2;
-    if (IS_ALIGNED(width, 16)) {
-      I422ToARGBRow = I422ToARGBRow_AVX2;
-    }
-  }
-#endif
-#if defined(HAS_I422TOARGBROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON)) {
-    I422ToARGBRow = I422ToARGBRow_Any_NEON;
-    if (IS_ALIGNED(width, 8)) {
-      I422ToARGBRow = I422ToARGBRow_NEON;
-    }
-  }
-#endif
-#if defined(HAS_I422TOARGBROW_MSA)
-  if (TestCpuFlag(kCpuHasMSA)) {
-    I422ToARGBRow = I422ToARGBRow_Any_MSA;
    if (IS_ALIGNED(width, 8)) {
-      I422ToARGBRow = I422ToARGBRow_MSA;
+      I210ToARGBRow = I210ToARGBRow_SSSE3;
    }
  }
 #endif
-  {
+  for (y = 0; y < height; ++y) {
-    // Row buffers for 8 bit YUV.
+    I210ToARGBRow(src_y, src_u, src_v, dst_argb, yuvconstants, width);
-    align_buffer_64(row_buf, width + halfwidth * 2);
+    dst_argb += dst_stride_argb;
-    uint8* row_y = row_buf;
+    src_y += src_stride_y;
-    uint8* row_u = row_buf + width;
+    if (y & 1) {
-    uint8* row_v = row_buf + width + halfwidth;
-    for (y = 0; y < height - 1; y += 2) {
-      Convert16To8Row(src_y, row_y, scale, width);
-      Convert16To8Row(src_u, row_u, scale, halfwidth);
-      Convert16To8Row(src_v, row_v, scale, halfwidth);
-      I422ToARGBRow(row_y, row_u, row_v, dst_argb, yuvconstants, width);
-      Convert16To8Row(src_y + src_stride_y, row_y, scale, width);
-      I422ToARGBRow(row_y, row_u, row_v, dst_argb + dst_stride_argb,
-                    yuvconstants, width);
-      dst_argb += dst_stride_argb * 2;
-      src_y += src_stride_y * 2;
      src_u += src_stride_u;
      src_v += src_stride_v;
    }
-    if (height & 1) {
-      Convert16To8Row(src_y, row_y, scale, width);
-      Convert16To8Row(src_u, row_u, scale, halfwidth);
-      Convert16To8Row(src_v, row_v, scale, halfwidth);
-      I422ToARGBRow(row_y, row_u, row_v, dst_argb, yuvconstants, width);
-    }
-    free_aligned_buffer_64(row_buf);
  }
  return 0;
 }
@@ -691,7 +624,7 @@ int H010ToARGB(const uint16* src_y,
               int height) {
  return H010ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
                          src_stride_v, dst_argb, dst_stride_argb,
-                          &kYuvH709Constants, 16384, width, height);
+                          &kYuvH709Constants, width, height);
 }
 // Convert I444 to ARGB with matrix

--- a/source/row_any.cc
+++ b/source/row_any.cc
@@ -194,6 +194,32 @@ ANY31C(I422ToRGB565Row_Any_MSA, I422ToRGB565Row_MSA, 1, 0, 2, 7)
 #endif
 #undef ANY31C
+/* 64 byte per row for future AVX2
+ * Any 3 planes of 16 bit to 1 with yuvconstants
+ * TODO(fbarchard): consider
+ *
+ * Generates NAMEANY, a wrapper that lets the SIMD row function ANY_SIMD
+ * handle any width.  The first (width & ~MASK) pixels are passed straight
+ * to ANY_SIMD.  The remaining (width & MASK) pixels are copied into a
+ * zeroed temporary, converted as one block of MASK + 1 pixels into a
+ * scratch output, and only the bytes for the remaining pixels are copied
+ * to dst_ptr.
+ *   UVSHIFT   right shift that turns a pixel index into a chroma index.
+ *   DUVSHIFT  right shift that turns a pixel index into a destination index.
+ *   T         element type of the three source planes.
+ *   SBPP      bytes per source element.
+ *   BPP       bytes per destination pixel.
+ *   MASK      SIMD block size in pixels, minus one.
+ * NOTE(review): SS() is defined outside this excerpt; presumably it is a
+ * shift that rounds up - confirm.
+ */
+#define ANY31CT(NAMEANY, ANY_SIMD, UVSHIFT, DUVSHIFT, T, SBPP, BPP, MASK)      \
+  void NAMEANY(const T* y_buf, const T* u_buf, const T* v_buf, uint8* dst_ptr, \
+               const struct YuvConstants* yuvconstants, int width) {           \
+    SIMD_ALIGNED(T temp[16 * 3]);                                              \
+    SIMD_ALIGNED(uint8 out[64]);                                               \
+    memset(temp, 0, 16 * 3 * SBPP); /* for YUY2 and msan */                    \
+    int r = width & MASK;                                                      \
+    int n = width & ~MASK;                                                     \
+    if (n > 0) {                                                               \
+      ANY_SIMD(y_buf, u_buf, v_buf, dst_ptr, yuvconstants, n);                 \
+    }                                                                          \
+    memcpy(temp, y_buf + n, r * SBPP);                                         \
+    memcpy(temp + 16, u_buf + (n >> UVSHIFT), SS(r, UVSHIFT) * SBPP);          \
+    memcpy(temp + 32, v_buf + (n >> UVSHIFT), SS(r, UVSHIFT) * SBPP);          \
+    ANY_SIMD(temp, temp + 16, temp + 32, out, yuvconstants, MASK + 1);        \
+    memcpy(dst_ptr + (n >> DUVSHIFT) * BPP, out, SS(r, DUVSHIFT) * BPP);       \
+  }
+#ifdef HAS_I210TOARGBROW_SSSE3
+ANY31CT(I210ToARGBRow_Any_SSSE3, I210ToARGBRow_SSSE3, 1, 0, uint16, 2, 4, 7)
+#endif
+#undef ANY31CT
 // Any 2 planes to 1.
 #define ANY21(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, SBPP2, BPP, MASK)       \
  void NAMEANY(const uint8* y_buf, const uint8* uv_buf, uint8* dst_ptr, \

--- a/source/row_common.cc
+++ b/source/row_common.cc
@@ -1295,6 +1295,51 @@ static __inline void YuvPixel(uint8 y,
  *r = Clamp((int32)(-(v * vr) + y1 + br) >> 6);
 }
+/* C reference code that mimics the YUV 10 bit assembly.
+ *
+ * Converts a single pixel.  y, u and v are 16 bit samples (10 significant
+ * bits are expected) and the resulting blue, green and red bytes are
+ * stored through b, g and r.  The coefficients and biases are read from
+ * yuvconstants, whose fields are accessed differently on aarch64, arm and
+ * all other targets, hence the three #if branches below.
+ */
+static __inline void YuvPixel10(uint16 y,
+                                uint16 u,
+                                uint16 v,
+                                uint8* b,
+                                uint8* g,
+                                uint8* r,
+                                const struct YuvConstants* yuvconstants) {
+#if defined(__aarch64__)
+  int ub = -yuvconstants->kUVToRB[0];
+  int ug = yuvconstants->kUVToG[0];
+  int vg = yuvconstants->kUVToG[1];
+  int vr = -yuvconstants->kUVToRB[1];
+  int bb = yuvconstants->kUVBiasBGR[0];
+  int bg = yuvconstants->kUVBiasBGR[1];
+  int br = yuvconstants->kUVBiasBGR[2];
+  int yg = yuvconstants->kYToRgb[0] / 0x0101;
+#elif defined(__arm__)
+  int ub = -yuvconstants->kUVToRB[0];
+  int ug = yuvconstants->kUVToG[0];
+  int vg = yuvconstants->kUVToG[4];
+  int vr = -yuvconstants->kUVToRB[4];
+  int bb = yuvconstants->kUVBiasBGR[0];
+  int bg = yuvconstants->kUVBiasBGR[1];
+  int br = yuvconstants->kUVBiasBGR[2];
+  int yg = yuvconstants->kYToRgb[0] / 0x0101;
+#else
+  int ub = yuvconstants->kUVToB[0];
+  int ug = yuvconstants->kUVToG[0];
+  int vg = yuvconstants->kUVToG[1];
+  int vr = yuvconstants->kUVToR[1];
+  int bb = yuvconstants->kUVBiasB[0];
+  int bg = yuvconstants->kUVBiasG[0];
+  int br = yuvconstants->kUVBiasR[0];
+  int yg = yuvconstants->kYToRgb[0];
+#endif
+  uint32 y1 = (uint32)((y << 6) * yg) >> 16;  // y << 6 widens 10 bits to 16; keep the upper 16 bits of the product.
+  u = clamp255(u >> 2);  // Drop the 2 low bits: 10 bit chroma to 8 bit.
+  v = clamp255(v >> 2);
+  *b = Clamp((int32)(-(u * ub) + y1 + bb) >> 6);  // Results carry 6 fraction bits.
+  *g = Clamp((int32)(-(u * ug + v * vg) + y1 + bg) >> 6);
+  *r = Clamp((int32)(-(v * vr) + y1 + br) >> 6);
+}
 // Y contribution to R,G,B.  Scale and bias.
 #define YG 18997  /* round(1.164 * 64 * 256 * 256 / 257) */
 #define YGB -1160 /* 1.164 * 64 * -16 + 64 / 2 */
@@ -1388,6 +1433,33 @@ void I422ToARGBRow_C(const uint8* src_y,
  }
 }
+/* 10 bit YUV to ARGB
+ *
+ * Converts one row of width pixels.  src_y holds one 16 bit sample per
+ * pixel, while src_u and src_v hold one sample per pair of pixels, so each
+ * loop iteration emits two pixels that share the same U and V.  Every
+ * output pixel is 4 bytes in rgb_buf: the blue, green and red bytes
+ * produced by YuvPixel10, followed by a constant 255.
+ */
+void I210ToARGBRow_C(const uint16* src_y,
+                     const uint16* src_u,
+                     const uint16* src_v,
+                     uint8* rgb_buf,
+                     const struct YuvConstants* yuvconstants,
+                     int width) {
+  int x;
+  for (x = 0; x < width - 1; x += 2) {
+    YuvPixel10(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1,
+               rgb_buf + 2, yuvconstants);
+    rgb_buf[3] = 255;
+    YuvPixel10(src_y[1], src_u[0], src_v[0], rgb_buf + 4, rgb_buf + 5,
+               rgb_buf + 6, yuvconstants);
+    rgb_buf[7] = 255;
+    src_y += 2;
+    src_u += 1;
+    src_v += 1;
+    rgb_buf += 8;  // Advance 2 pixels.
+  }
+  if (width & 1) {  // Odd width: the last pixel uses the next U and V sample on its own.
+    YuvPixel10(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1,
+               rgb_buf + 2, yuvconstants);
+    rgb_buf[3] = 255;
+  }
+}
 void I422AlphaToARGBRow_C(const uint8* src_y,
                          const uint8* src_u,
                          const uint8* src_v,

--- a/source/row_gcc.cc
+++ b/source/row_gcc.cc
@@ -1623,6 +1623,20 @@ void RGBAToUVRow_SSSE3(const uint8* src_rgba0,
    "punpcklbw  %%xmm4,%%xmm4                                   \n"            \
    "lea        " MEMLEA(0x8, [y_buf]) ",%[y_buf]               \n"
+/* Read 4 UV from 422 10 bit, upsample to 8 UV
+ * TODO(fbarchard): Consider shufb to replace pack/unpack
+ *
+ * Loads 4 U and 4 V 16 bit samples, interleaves them as words, shifts each
+ * word right by 2 to reduce 10 bits to 8, packs the words to bytes and
+ * duplicates every U,V byte pair so that xmm0 holds 8 pairs.  It then
+ * loads 8 Y 16 bit samples into xmm4 and shifts them left by 6.
+ * Each use advances u_buf by 8 bytes and y_buf by 16 bytes.
+ * NOTE(review): MEMOPREG is defined outside this excerpt; presumably it
+ * loads V from u_buf + v_buf, which needs v_buf to hold an offset relative
+ * to u_buf - confirm.
+ */
+#define READYUV422_10 \
+  "movq       " MEMACCESS([u_buf]) ",%%xmm0                     \n"            \
+    MEMOPREG(movq, 0x00, [u_buf], [v_buf], 1, xmm1)                            \
+    "lea        " MEMLEA(0x8, [u_buf]) ",%[u_buf]               \n"            \
+    "punpcklwd  %%xmm1,%%xmm0                                   \n"            \
+    "psraw      $0x2,%%xmm0                                     \n" \
+    "packuswb   %%xmm0,%%xmm0                                   \n" \
+    "punpcklwd  %%xmm0,%%xmm0                                   \n"            \
+    "movdqu     " MEMACCESS([y_buf]) ",%%xmm4                   \n"            \
+    "psllw      $0x6,%%xmm4                                     \n" \
+    "lea        " MEMLEA(0x10, [y_buf]) ",%[y_buf]               \n"
 // Read 4 UV from 422, upsample to 8 UV.  With 8 Alpha.
 #define READYUVA422 \
  "movd       " MEMACCESS([u_buf]) ",%%xmm0                     \n"            \
@@ -1862,6 +1876,36 @@ void OMITFP I422ToARGBRow_SSSE3(const uint8* y_buf,
  );
 }
+/* 10 bit YUV to ARGB
+ *
+ * Inline assembly row function.  Each pass of the loop reads samples with
+ * READYUV422_10, converts them with YUVTORGB, stores them with STOREARGB
+ * and subtracts 8 from width, repeating while width stays positive.  A
+ * width that is not a multiple of 8 therefore still runs a whole final
+ * pass.
+ * NOTE(review): YUVTORGB_SETUP, YUVTORGB and STOREARGB are defined outside
+ * this excerpt; their register usage is assumed from the clobber list.
+ */
+void OMITFP I210ToARGBRow_SSSE3(const uint16* y_buf,
+                                const uint16* u_buf,
+                                const uint16* v_buf,
+                                uint8* dst_argb,
+                                const struct YuvConstants* yuvconstants,
+                                int width) {
+  asm volatile (
+    YUVTORGB_SETUP(yuvconstants)
+    "sub       %[u_buf],%[v_buf]               \n"  // v_buf becomes an offset from u_buf.
+    "pcmpeqb   %%xmm5,%%xmm5                   \n"  // xmm5 = all ones; presumably the alpha used by STOREARGB - confirm.
+    LABELALIGN
+    "1:                                        \n"
+    READYUV422_10
+    YUVTORGB(yuvconstants)
+    STOREARGB
+    "sub       $0x8,%[width]                   \n"  // 8 pixels handled per pass.
+    "jg        1b                              \n"
+  : [y_buf]"+r"(y_buf),    // %[y_buf]
+    [u_buf]"+r"(u_buf),    // %[u_buf]
+    [v_buf]"+r"(v_buf),    // %[v_buf]
+    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
+    [width]"+rm"(width)    // %[width]
+  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
+  : "memory", "cc", NACL_R14 YUVTORGB_REGS
+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+  );
+}
 #ifdef HAS_I422ALPHATOARGBROW_SSSE3
 void OMITFP I422AlphaToARGBRow_SSSE3(const uint8* y_buf,
                                     const uint8* u_buf,