H010ToAR30 in 1 step with SSSE3 assembly

Switch YUV conversion macro to output 16 bits per channel. Add STOREAR30 macro to output AR30. [ RUN ] LibYUVConvertTest.TestH420ToARGB uniques: B 220, G, 220, R 220 [ OK ] LibYUVConvertTest.TestH420ToARGB (0 ms) [ RUN ] LibYUVConvertTest.TestH010ToARGB uniques: B 256, G, 256, R 256 [ OK ] LibYUVConvertTest.TestH010ToARGB (0 ms) [ RUN ] LibYUVConvertTest.TestH010ToAR30 uniques: B 883, G, 883, R 883 [ OK ] LibYUVConvertTest.TestH010ToAR30 (0 ms) Bug: libyuv:751 Test: LibYUVConvertTest.H010ToAR30_Opt Change-Id: I902b718e2c8b68ede69625ccafebc6519d5af70d Reviewed-on: https://chromium-review.googlesource.com/869511 Reviewed-by: Frank Barchard <fbarchard@chromium.org> Reviewed-by: Miguel Casas <mcasas@chromium.org> Reviewed-by: richard winterton <rrwinterton@gmail.com> Commit-Queue: Frank Barchard <fbarchard@chromium.org>

H010ToAR30 in 1 step with SSSE3 assembly
Switch YUV conversion macro to output 16 bits per channel. Add STOREAR30 macro to output AR30. [ RUN ] LibYUVConvertTest.TestH420ToARGB uniques: B 220, G, 220, R 220 [ OK ] LibYUVConvertTest.TestH420ToARGB (0 ms) [ RUN ] LibYUVConvertTest.TestH010ToARGB uniques: B 256, G, 256, R 256 [ OK ] LibYUVConvertTest.TestH010ToARGB (0 ms) [ RUN ] LibYUVConvertTest.TestH010ToAR30 uniques: B 883, G, 883, R 883 [ OK ] LibYUVConvertTest.TestH010ToAR30 (0 ms) Bug: libyuv:751 Test: LibYUVConvertTest.H010ToAR30_Opt Change-Id: I902b718e2c8b68ede69625ccafebc6519d5af70d Reviewed-on: https://chromium-review.googlesource.com/869511 Reviewed-by: Frank Barchard <fbarchard@chromium.org> Reviewed-by: Miguel Casas <mcasas@chromium.org> Reviewed-by: richard winterton <rrwinterton@gmail.com> Commit-Queue: Frank Barchard <fbarchard@chromium.org>
09db0c4c · Frank Barchard · Commit Bot · 37f97210 · 09db0c4c · 09db0c4c
Commit 09db0c4c authored Jan 19, 2018 by Frank Barchard Committed by Commit Bot Jan 19, 2018
8 changed files
--- a/README.chromium
+++ b/README.chromium
 Name: libyuv
 URL: http://code.google.com/p/libyuv/
-Version: 1689
+Version: 1690
 License: BSD
 License File: LICENSE


--- a/include/libyuv/convert_argb.h
+++ b/include/libyuv/convert_argb.h
@@ -420,6 +420,19 @@ int H010ToARGB(const uint16* src_y,
               int width,
               int height);

+// Convert I010 to AR30.
+LIBYUV_API
+int I010ToAR30(const uint16* src_y,
+               int src_stride_y,
+               const uint16* src_u,
+               int src_stride_u,
+               const uint16* src_v,
+               int src_stride_v,
+               uint8* dst_ar30,
+               int dst_stride_ar30,
+               int width,
+               int height);
+
 // Convert H010 to AR30.
 LIBYUV_API
 int H010ToAR30(const uint16* src_y,

--- a/include/libyuv/row.h
+++ b/include/libyuv/row.h
@@ -256,6 +256,7 @@ extern "C" {
 #define HAS_CONVERT16TO8ROW_SSSE3
 #define HAS_CONVERT8TO16ROW_SSE2
 // I210 is for H010.  2 = 422.  I for 601 vs H for 709.
+#define HAS_I210TOAR30ROW_SSSE3
 #define HAS_I210TOARGBROW_SSSE3
 #define HAS_MERGERGBROW_SSSE3
 #define HAS_SPLITRGBROW_SSSE3
@@ -1682,6 +1683,12 @@ void I422ToARGBRow_C(const uint8* src_y,
                     uint8* dst_argb,
                     const struct YuvConstants* yuvconstants,
                     int width);
+void I210ToAR30Row_C(const uint16* src_y,
+                     const uint16* src_u,
+                     const uint16* src_v,
+                     uint8* dst_ar30,
+                     const struct YuvConstants* yuvconstants,
+                     int width);
 void I210ToARGBRow_C(const uint16* src_y,
                     const uint16* src_u,
                     const uint16* src_v,
@@ -1791,6 +1798,12 @@ void I422ToARGBRow_SSSE3(const uint8* src_y,
                         const struct YuvConstants* yuvconstants,
                         int width);

+void I210ToAR30Row_SSSE3(const uint16* src_y,
+                         const uint16* src_u,
+                         const uint16* src_v,
+                         uint8* dst_ar30,
+                         const struct YuvConstants* yuvconstants,
+                         int width);
 void I210ToARGBRow_SSSE3(const uint16* src_y,
                         const uint16* src_u,
                         const uint16* src_v,
@@ -1947,6 +1960,12 @@ void I422ToARGBRow_Any_SSSE3(const uint8* src_y,
                             uint8* dst_argb,
                             const struct YuvConstants* yuvconstants,
                             int width);
+void I210ToAR30Row_Any_SSSE3(const uint16* src_y,
+                             const uint16* src_u,
+                             const uint16* src_v,
+                             uint8* dst_ar30,
+                             const struct YuvConstants* yuvconstants,
+                             int width);
 void I210ToARGBRow_Any_SSSE3(const uint16* src_y,
                             const uint16* src_u,
                             const uint16* src_v,

--- a/include/libyuv/version.h
+++ b/include/libyuv/version.h
@@ -11,6 +11,6 @@
 #ifndef INCLUDE_LIBYUV_VERSION_H_
 #define INCLUDE_LIBYUV_VERSION_H_

-#define LIBYUV_VERSION 1689
+#define LIBYUV_VERSION 1690

 #endif  // INCLUDE_LIBYUV_VERSION_H_
--- a/source/convert_argb.cc
+++ b/source/convert_argb.cc
@@ -413,7 +413,7 @@ int H422ToABGR(const uint8* src_y,
 // Convert 10 bit YUV to ARGB with matrix
 // TODO(fbarchard): Consider passing scale multiplier to I210ToARGB to
 // multiply 10 bit yuv into high bits to allow any number of bits.
-static int H010ToAR30Matrix(const uint16* src_y,
+static int I010ToAR30Matrix(const uint16* src_y,
                            int src_stride_y,
                            const uint16* src_u,
                            int src_stride_u,
@@ -425,12 +425,10 @@ static int H010ToAR30Matrix(const uint16* src_y,
                            int width,
                            int height) {
  int y;
-  void (*I210ToARGBRow)(const uint16* y_buf, const uint16* u_buf,
+  void (*I210ToAR30Row)(const uint16* y_buf, const uint16* u_buf,
                        const uint16* v_buf, uint8* rgb_buf,
                        const struct YuvConstants* yuvconstants, int width) =
-      I210ToARGBRow_C;
-  void (*ARGBToAR30Row)(const uint8* src_argb, uint8* dst_rgb, int width) =
-      ARGBToAR30Row_C;
+      I210ToAR30Row_C;
  if (!src_y || !src_u || !src_v || !dst_ar30 || width <= 0 || height == 0) {
    return -1;
  }
@@ -440,60 +438,51 @@ static int H010ToAR30Matrix(const uint16* src_y,
    dst_ar30 = dst_ar30 + (height - 1) * dst_stride_ar30;
    dst_stride_ar30 = -dst_stride_ar30;
  }
-#if defined(HAS_I210TOARGBROW_SSSE3)
+#if defined(HAS_I210TOAR30ROW_SSSE3)
  if (TestCpuFlag(kCpuHasSSSE3)) {
-    I210ToARGBRow = I210ToARGBRow_Any_SSSE3;
+    I210ToAR30Row = I210ToAR30Row_Any_SSSE3;
    if (IS_ALIGNED(width, 8)) {
-      I210ToARGBRow = I210ToARGBRow_SSSE3;
+      I210ToAR30Row = I210ToAR30Row_SSSE3;
    }
  }
 #endif
-#if defined(HAS_I210TOARGBROW_AVX2)
+#if defined(HAS_I210TOAR30ROW_AVX2)
  if (TestCpuFlag(kCpuHasAVX2)) {
-    I210ToARGBRow = I210ToARGBRow_Any_AVX2;
+    I210ToAR30Row = I210ToAR30Row_Any_AVX2;
    if (IS_ALIGNED(width, 16)) {
-      I210ToARGBRow = I210ToARGBRow_AVX2;
+      I210ToAR30Row = I210ToAR30Row_AVX2;
    }
  }
 #endif
-#if defined(HAS_ARGBTOAR30ROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3)) {
-    ARGBToAR30Row = ARGBToAR30Row_Any_SSSE3;
-    if (IS_ALIGNED(width, 4)) {
-      ARGBToAR30Row = ARGBToAR30Row_SSSE3;
-    }
-  }
-#endif
-#if defined(HAS_ARGBTOAR30ROW_AVX2)
-  if (TestCpuFlag(kCpuHasAVX2)) {
-    ARGBToAR30Row = ARGBToAR30Row_Any_AVX2;
-    if (IS_ALIGNED(width, 8)) {
-      ARGBToAR30Row = ARGBToAR30Row_AVX2;
-    }
-  }
-#endif
-
-  {
-    // Row buffers for 8 bit YUV and RGB.
-    align_buffer_64(row_argb, width * 4);
-
-    for (y = 0; y < height; ++y) {
-      I210ToARGBRow(src_y, src_u, src_v, row_argb, yuvconstants, width);
-      ARGBToAR30Row(row_argb, dst_ar30, width);
-      dst_ar30 += dst_stride_ar30;
-      src_y += src_stride_y;
-      if (y & 1) {
-        src_u += src_stride_u;
-        src_v += src_stride_v;
-      }
+  for (y = 0; y < height; ++y) {
+    I210ToAR30Row(src_y, src_u, src_v, dst_ar30, yuvconstants, width);
+    dst_ar30 += dst_stride_ar30;
+    src_y += src_stride_y;
+    if (y & 1) {
+      src_u += src_stride_u;
+      src_v += src_stride_v;
    }
-
-    free_aligned_buffer_64(row_argb);
  }
-
  return 0;
 }

+// Convert I010 to AR30 with kYuvI601Constants. Returns I010ToAR30Matrix's result.
+LIBYUV_API
+int I010ToAR30(const uint16* src_y,
+               int src_stride_y,
+               const uint16* src_u,
+               int src_stride_u,
+               const uint16* src_v,
+               int src_stride_v,
+               uint8* dst_ar30,
+               int dst_stride_ar30,
+               int width,
+               int height) {
+  return I010ToAR30Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+                          src_stride_v, dst_ar30, dst_stride_ar30,
+                          &kYuvI601Constants, width, height);
+}
+
 // Convert H010 to AR30.
 LIBYUV_API
 int H010ToAR30(const uint16* src_y,
@@ -506,7 +495,7 @@ int H010ToAR30(const uint16* src_y,
               int dst_stride_ar30,
               int width,
               int height) {
-  return H010ToAR30Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+  return I010ToAR30Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
                          src_stride_v, dst_ar30, dst_stride_ar30,
                          &kYuvH709Constants, width, height);
 }

--- a/source/row_any.cc
+++ b/source/row_any.cc
@@ -214,6 +214,9 @@ ANY31C(I422ToRGB565Row_Any_MSA, I422ToRGB565Row_MSA, 1, 0, 2, 7)
    memcpy(dst_ptr + (n >> DUVSHIFT) * BPP, out, SS(r, DUVSHIFT) * BPP);       \
  }

+#ifdef HAS_I210TOAR30ROW_SSSE3
+ANY31CT(I210ToAR30Row_Any_SSSE3, I210ToAR30Row_SSSE3, 1, 0, uint16, 2, 4, 7)
+#endif
 #ifdef HAS_I210TOARGBROW_SSSE3
 ANY31CT(I210ToARGBRow_Any_SSSE3, I210ToARGBRow_SSSE3, 1, 0, uint16, 2, 4, 7)
 #endif

--- a/source/row_common.cc
+++ b/source/row_common.cc
@@ -11,6 +11,7 @@
 #include "libyuv/row.h"

 #include <string.h>  // For memcpy and memset.
+#include <stdio.h>

 #include "libyuv/basic_types.h"

@@ -31,9 +32,8 @@ static __inline int32 clamp255(int32 v) {
  return (((255 - (v)) >> 31) | (v)) & 255;
 }

-static __inline uint32 Clamp(int32 val) {
-  int v = clamp0(val);
-  return (uint32)(clamp255(v));
+static __inline int32 clamp1023(int32 v) {
+  return (((1023 - (v)) >> 31) | (v)) & 1023;
 }

 static __inline uint32 Abs(int32 v) {
@@ -49,15 +49,23 @@ static __inline int32 clamp255(int32 v) {
  return (v > 255) ? 255 : v;
 }

-static __inline uint32 Clamp(int32 val) {
-  int v = clamp0(val);
-  return (uint32)(clamp255(v));
+static __inline int32 clamp1023(int32 v) {
+  return (v > 1023) ? 1023 : v;
 }

 static __inline uint32 Abs(int32 v) {
  return (v < 0) ? -v : v;
 }
 #endif  // USE_BRANCHLESS
+static __inline uint32 Clamp(int32 val) {
+  int v = clamp0(val);
+  return (uint32)(clamp255(v));
+}
+
+static __inline uint32 Clamp10(int32 val) {
+  int v = clamp0(val);
+  return (uint32)(clamp1023(v));
+}

 #ifdef LIBYUV_LITTLE_ENDIAN
 #define WRITEWORD(p, v) *(uint32*)(p) = v
@@ -1340,6 +1348,56 @@ static __inline void YuvPixel10(uint16 y,
  *r = Clamp((int32)(-(v * vr) + y1 + br) >> 6);
 }

+// C reference code that mimics the YUV 16 bit assembly: converts one Y, U, V
+// sample to B, G, R that are left unclamped with fraction bits still attached.
+static __inline void YuvPixel16(int16 y,
+                                int16 u,
+                                int16 v,
+                                int* b,  // out: blue
+                                int* g,  // out: green
+                                int* r,  // out: red
+                                const struct YuvConstants* yuvconstants) {
+#if defined(__aarch64__)
+  int ub = -yuvconstants->kUVToRB[0];
+  int ug = yuvconstants->kUVToG[0];
+  int vg = yuvconstants->kUVToG[1];
+  int vr = -yuvconstants->kUVToRB[1];
+  int bb = yuvconstants->kUVBiasBGR[0];
+  int bg = yuvconstants->kUVBiasBGR[1];
+  int br = yuvconstants->kUVBiasBGR[2];
+  int yg = yuvconstants->kYToRgb[0] / 0x0101;
+#elif defined(__arm__)
+  int ub = -yuvconstants->kUVToRB[0];
+  int ug = yuvconstants->kUVToG[0];
+  int vg = yuvconstants->kUVToG[4];
+  int vr = -yuvconstants->kUVToRB[4];
+  int bb = yuvconstants->kUVBiasBGR[0];
+  int bg = yuvconstants->kUVBiasBGR[1];
+  int br = yuvconstants->kUVBiasBGR[2];
+  int yg = yuvconstants->kYToRgb[0] / 0x0101;
+#else
+  int ub = yuvconstants->kUVToB[0];
+  int ug = yuvconstants->kUVToG[0];
+  int vg = yuvconstants->kUVToG[1];
+  int vr = yuvconstants->kUVToR[1];
+  int bb = yuvconstants->kUVBiasB[0];
+  int bg = yuvconstants->kUVBiasG[0];
+  int br = yuvconstants->kUVBiasR[0];
+  int yg = yuvconstants->kYToRgb[0];
+#endif
+
+  // Widen Y by 6 bits, multiply by the Y gain and keep the product's high bits.
+  uint32 y1 = (uint32)((y << 6) * yg) >> 16;
+  // Reduce U and V by 2 bits; clamp255 caps them at 255.
+  u = clamp255(u >> 2);
+  v = clamp255(v >> 2);
+  // No shift or clamp here: the caller (StoreAR30 via I210ToAR30Row_C) drops
+  // the fraction bits and clamps when it packs the pixel.
+  *b = (int)(-(u * ub) + y1 + bb);
+  *g = (int)(-(u * ug + v * vg) + y1 + bg);
+  *r = (int)(-(v * vr) + y1 + br);
+}
+
 // Y contribution to R,G,B.  Scale and bias.
 #define YG 18997  /* round(1.164 * 64 * 256 * 256 / 257) */
 #define YGB -1160 /* 1.164 * 64 * -16 + 64 / 2 */
@@ -1460,6 +1518,48 @@ void I210ToARGBRow_C(const uint16* src_y,
  }
 }

+static void StoreAR30(uint8* rgb_buf,  // receives one 4 byte AR30 pixel
+                      int b,
+                      int g,
+                      int r) {
+  uint32 ar30;
+  b = b >> 4;  // drop the low 4 (fraction) bits before clamping to 10 bits.
+  g = g >> 4;
+  r = r >> 4;
+  b = Clamp10(b);  // Clamp10 = clamp0 followed by clamp1023
+  g = Clamp10(g);
+  r = Clamp10(r);
+  ar30 = b | ((uint32)g << 10) | ((uint32)r << 20) | 0xc0000000;  // top 2 bits set
+  (*(uint32*)rgb_buf) = ar30;  // NOTE(review): native-endian, possibly unaligned store
+}
+
+// 10 bit YUV to 10 bit AR30. One U,V sample is shared by 2 horizontal pixels.
+void I210ToAR30Row_C(const uint16* src_y,
+                     const uint16* src_u,
+                     const uint16* src_v,
+                     uint8* rgb_buf,
+                     const struct YuvConstants* yuvconstants,
+                     int width) {
+  int x;
+  int b;
+  int g;
+  int r;
+  for (x = 0; x < width - 1; x += 2) {  // 2 pixels per iteration
+    YuvPixel16(src_y[0], src_u[0], src_v[0], &b, &g, &r, yuvconstants);
+    StoreAR30(rgb_buf, b, g, r);
+    YuvPixel16(src_y[1], src_u[0], src_v[0], &b, &g, &r, yuvconstants);  // same U,V
+    StoreAR30(rgb_buf + 4, b, g, r);  // 4 bytes per AR30 pixel
+    src_y += 2;
+    src_u += 1;
+    src_v += 1;
+    rgb_buf += 8;  // Advance 2 pixels.
+  }
+  if (width & 1) {  // odd width: convert the final pixel
+    YuvPixel16(src_y[0], src_u[0], src_v[0], &b, &g, &r, yuvconstants);
+    StoreAR30(rgb_buf, b, g, r);
+  }
+}
+
 void I422AlphaToARGBRow_C(const uint8* src_y,
                          const uint8* src_u,
                          const uint8* src_v,

--- a/source/row_gcc.cc
+++ b/source/row_gcc.cc
@@ -1696,7 +1696,7 @@ void RGBAToUVRow_SSSE3(const uint8* src_rgba0,
    "movdqa     160(%[yuvconstants]),%%xmm13                    \n"            \
    "movdqa     192(%[yuvconstants]),%%xmm14                    \n"
 // Convert 8 pixels: 8 UV and 8 Y
-#define YUVTORGB(yuvconstants)                                    \
+#define YUVTORGB16(yuvconstants)                                  \
  "movdqa     %%xmm0,%%xmm1                                   \n" \
  "movdqa     %%xmm0,%%xmm2                                   \n" \
  "movdqa     %%xmm0,%%xmm3                                   \n" \
@@ -1712,45 +1712,42 @@ void RGBAToUVRow_SSSE3(const uint8* src_rgba0,
  "pmulhuw    %%xmm14,%%xmm4                                  \n" \
  "paddsw     %%xmm4,%%xmm0                                   \n" \
  "paddsw     %%xmm4,%%xmm1                                   \n" \
-  "paddsw     %%xmm4,%%xmm2                                   \n" \
-  "psraw      $0x6,%%xmm0                                     \n" \
-  "psraw      $0x6,%%xmm1                                     \n" \
-  "psraw      $0x6,%%xmm2                                     \n" \
-  "packuswb   %%xmm0,%%xmm0                                   \n" \
-  "packuswb   %%xmm1,%%xmm1                                   \n" \
-  "packuswb   %%xmm2,%%xmm2                                   \n"
+  "paddsw     %%xmm4,%%xmm2                                   \n"
 #define YUVTORGB_REGS \
  "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14",

 #else
 #define YUVTORGB_SETUP(yuvconstants)
 // Convert 8 pixels: 8 UV and 8 Y
-#define YUVTORGB(yuvconstants) \
-  "movdqa     %%xmm0,%%xmm1                                     \n"            \
-    "movdqa     %%xmm0,%%xmm2                                   \n"            \
-    "movdqa     %%xmm0,%%xmm3                                   \n"            \
-    "movdqa     96(%[yuvconstants]),%%xmm0                      \n"            \
-    "pmaddubsw  (%[yuvconstants]),%%xmm1                        \n"            \
-    "psubw      %%xmm1,%%xmm0                                   \n"            \
-    "movdqa     128(%[yuvconstants]),%%xmm1                     \n"            \
-    "pmaddubsw  32(%[yuvconstants]),%%xmm2                      \n"            \
-    "psubw      %%xmm2,%%xmm1                                   \n"            \
-    "movdqa     160(%[yuvconstants]),%%xmm2                     \n"            \
-    "pmaddubsw  64(%[yuvconstants]),%%xmm3                      \n"            \
-    "psubw      %%xmm3,%%xmm2                                   \n"            \
-    "pmulhuw    192(%[yuvconstants]),%%xmm4                     \n"            \
-    "paddsw     %%xmm4,%%xmm0                                   \n"            \
-    "paddsw     %%xmm4,%%xmm1                                   \n"            \
-    "paddsw     %%xmm4,%%xmm2                                   \n"            \
-    "psraw      $0x6,%%xmm0                                     \n"            \
-    "psraw      $0x6,%%xmm1                                     \n"            \
-    "psraw      $0x6,%%xmm2                                     \n"            \
-    "packuswb   %%xmm0,%%xmm0                                   \n"            \
-    "packuswb   %%xmm1,%%xmm1                                   \n"            \
-    "packuswb   %%xmm2,%%xmm2                                   \n"
+#define YUVTORGB16(yuvconstants) \
+  "movdqa     %%xmm0,%%xmm1                                   \n"            \
+  "movdqa     %%xmm0,%%xmm2                                   \n"            \
+  "movdqa     %%xmm0,%%xmm3                                   \n"            \
+  "movdqa     96(%[yuvconstants]),%%xmm0                      \n"            \
+  "pmaddubsw  (%[yuvconstants]),%%xmm1                        \n"            \
+  "psubw      %%xmm1,%%xmm0                                   \n"            \
+  "movdqa     128(%[yuvconstants]),%%xmm1                     \n"            \
+  "pmaddubsw  32(%[yuvconstants]),%%xmm2                      \n"            \
+  "psubw      %%xmm2,%%xmm1                                   \n"            \
+  "movdqa     160(%[yuvconstants]),%%xmm2                     \n"            \
+  "pmaddubsw  64(%[yuvconstants]),%%xmm3                      \n"            \
+  "psubw      %%xmm3,%%xmm2                                   \n"            \
+  "pmulhuw    192(%[yuvconstants]),%%xmm4                     \n"            \
+  "paddsw     %%xmm4,%%xmm0                                   \n"            \
+  "paddsw     %%xmm4,%%xmm1                                   \n"            \
+  "paddsw     %%xmm4,%%xmm2                                   \n"
 #define YUVTORGB_REGS
 #endif

+#define YUVTORGB(yuvconstants) \
+    YUVTORGB16(yuvconstants)                                      \
+  "psraw      $0x6,%%xmm0                                     \n" \
+  "psraw      $0x6,%%xmm1                                     \n" \
+  "psraw      $0x6,%%xmm2                                     \n" \
+  "packuswb   %%xmm0,%%xmm0                                   \n" \
+  "packuswb   %%xmm1,%%xmm1                                   \n" \
+  "packuswb   %%xmm2,%%xmm2                                   \n"
+
 // Store 8 ARGB values.
 #define STOREARGB \
  "punpcklbw  %%xmm1,%%xmm0                                      \n"           \
@@ -1774,6 +1771,32 @@ void RGBAToUVRow_SSSE3(const uint8* src_rgba0,
    "movdqu    %%xmm0,0x10(%[dst_rgba])                          \n"           \
    "lea       0x20(%[dst_rgba]),%[dst_rgba]                     \n"

+// Store 8 AR30 values. Needs xmm5 = 0x30 words (alpha), xmm6 = 0, xmm7 = 1023.
+#define STOREAR30 \
+  "psraw      $0x4,%%xmm0                                      \n" \
+  "psraw      $0x4,%%xmm1                                      \n" \
+  "psraw      $0x4,%%xmm2                                      \n" \
+  "pminsw     %%xmm7,%%xmm0                                    \n" \
+  "pminsw     %%xmm7,%%xmm1                                    \n" \
+  "pminsw     %%xmm7,%%xmm2                                    \n" \
+  "pmaxsw     %%xmm6,%%xmm0                                    \n" \
+  "pmaxsw     %%xmm6,%%xmm1                                    \n" \
+  "pmaxsw     %%xmm6,%%xmm2                                    \n" \
+  "psllw      $0x4,%%xmm2                                      \n" \
+  "movdqa     %%xmm0,%%xmm3                                    \n" \
+  "punpcklwd  %%xmm2,%%xmm0                                    \n" \
+  "punpckhwd  %%xmm2,%%xmm3                                    \n" \
+  "movdqa     %%xmm1,%%xmm2                                    \n" \
+  "punpcklwd  %%xmm5,%%xmm1                                    \n" \
+  "punpckhwd  %%xmm5,%%xmm2                                    \n" \
+  "pslld      $0xa,%%xmm1                                      \n" \
+  "pslld      $0xa,%%xmm2                                      \n" \
+  "por        %%xmm1,%%xmm0                                    \n" \
+  "por        %%xmm2,%%xmm3                                    \n" \
+  "movdqu     %%xmm0,(%[dst_ar30])                             \n" \
+  "movdqu     %%xmm3,0x10(%[dst_ar30])                         \n" \
+  "lea        0x20(%[dst_ar30]), %[dst_ar30]                   \n"
+
 void OMITFP I444ToARGBRow_SSSE3(const uint8* y_buf,
                                const uint8* u_buf,
                                const uint8* v_buf,
@@ -1827,9 +1850,9 @@ void OMITFP I422ToRGB24Row_SSSE3(const uint8* y_buf,
    "pshufb    %%xmm5,%%xmm0                   \n"
    "pshufb    %%xmm6,%%xmm1                   \n"
    "palignr   $0xc,%%xmm0,%%xmm1              \n"
-    "movq      %%xmm0,(%[dst_rgb24])            \n"
-    "movdqu    %%xmm1,0x8(%[dst_rgb24])              \n"
-    "lea       0x18(%[dst_rgb24]),%[dst_rgb24]           \n"
+    "movq      %%xmm0,(%[dst_rgb24])           \n"
+    "movdqu    %%xmm1,0x8(%[dst_rgb24])        \n"
+    "lea       0x18(%[dst_rgb24]),%[dst_rgb24] \n"
    "subl      $0x8,%[width]                   \n"
    "jg        1b                              \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
@@ -1908,6 +1931,41 @@ void OMITFP I210ToARGBRow_SSSE3(const uint16* y_buf,
  );
 }

+// 10 bit YUV to AR30, 8 pixels per loop iteration.
+void OMITFP I210ToAR30Row_SSSE3(const uint16* y_buf,
+                                const uint16* u_buf,
+                                const uint16* v_buf,
+                                uint8* dst_ar30,
+                                const struct YuvConstants* yuvconstants,
+                                int width) {
+  asm volatile (
+    YUVTORGB_SETUP(yuvconstants)
+    "sub       %[u_buf],%[v_buf]               \n"  // v_buf -= u_buf
+    "pcmpeqb   %%xmm5,%%xmm5                   \n"  // all ones
+    "psrlw     $14,%%xmm5                      \n"  // each word = 3
+    "psllw     $4,%%xmm5                       \n"  // each word = 0x30: 2 alpha bits
+    "pxor      %%xmm6,%%xmm6                   \n"  // 0, lower clamp for pmaxsw
+    "pcmpeqb   %%xmm7,%%xmm7                   \n"  // all ones
+    "psrlw     $6,%%xmm7                       \n"  // 1023, upper clamp for pminsw
+
+    LABELALIGN
+    "1:                                        \n"
+    READYUV210
+    YUVTORGB16(yuvconstants)
+    STOREAR30
+    "sub       $0x8,%[width]                   \n"  // 8 pixels done
+    "jg        1b                              \n"
+  : [y_buf]"+r"(y_buf),    // %[y_buf]
+    [u_buf]"+r"(u_buf),    // %[u_buf]
+    [v_buf]"+r"(v_buf),    // %[v_buf]
+    [dst_ar30]"+r"(dst_ar30),  // %[dst_ar30]
+    [width]"+rm"(width)    // %[width]
+  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
+  : "memory", "cc", YUVTORGB_REGS
+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+  );
+}
+
 #ifdef HAS_I422ALPHATOARGBROW_SSSE3
 void OMITFP I422AlphaToARGBRow_SSSE3(const uint8* y_buf,
                                     const uint8* u_buf,