I420Alpha row function in 1 pass.

API change - I420AlphaToARGB takes flag indicating if RGB should be premultiplied by alpha. This version implements an efficient SSSE3 version for Windows. C version done in 2 steps. Was libyuvTest.I420AlphaToARGB_Any (1136 ms) libyuvTest.I420AlphaToARGB_Unaligned (1210 ms) libyuvTest.I420AlphaToARGB_Invert (966 ms) libyuvTest.I420AlphaToARGB_Opt (1031 ms) libyuvTest.I420AlphaToABGR_Any (1020 ms) libyuvTest.I420AlphaToABGR_Unaligned (1359 ms) libyuvTest.I420AlphaToABGR_Invert (1082 ms) libyuvTest.I420AlphaToABGR_Opt (986 ms) R=harryjin@google.com BUG=libyuv:496 Review URL: https://codereview.chromium.org/1367093002 .

I420Alpha row function in 1 pass.
API change - I420AlphaToARGB takes flag indicating if RGB should be premultiplied by alpha. This version implements an efficient SSSE3 version for Windows. C version done in 2 steps. Was libyuvTest.I420AlphaToARGB_Any (1136 ms) libyuvTest.I420AlphaToARGB_Unaligned (1210 ms) libyuvTest.I420AlphaToARGB_Invert (966 ms) libyuvTest.I420AlphaToARGB_Opt (1031 ms) libyuvTest.I420AlphaToABGR_Any (1020 ms) libyuvTest.I420AlphaToABGR_Unaligned (1359 ms) libyuvTest.I420AlphaToABGR_Invert (1082 ms) libyuvTest.I420AlphaToABGR_Opt (986 ms) R=harryjin@google.com BUG=libyuv:496 Review URL: https://codereview.chromium.org/1367093002 .
e365cdde · Frank Barchard · d4594bee · e365cdde · e365cdde · e365cdde
Commit e365cdde authored Sep 25, 2015 by Frank Barchard
9 changed files
--- a/README.chromium
+++ b/README.chromium
 Name: libyuv
 URL: http://code.google.com/p/libyuv/
-Version: 1492
+Version: 1493
 License: BSD
 License File: LICENSE


--- a/include/libyuv/convert_argb.h
+++ b/include/libyuv/convert_argb.h
@@ -83,7 +83,7 @@ int I420AlphaToARGB(const uint8* src_y, int src_stride_y,
                    const uint8* src_v, int src_stride_v,
                    const uint8* src_a, int src_stride_a,
                    uint8* dst_argb, int dst_stride_argb,
-                    int width, int height);
+                    int width, int height, int attenuate);

 // Convert I420 with Alpha to preattenuated ABGR.
 LIBYUV_API
@@ -92,7 +92,7 @@ int I420AlphaToABGR(const uint8* src_y, int src_stride_y,
                    const uint8* src_v, int src_stride_v,
                    const uint8* src_a, int src_stride_a,
                    uint8* dst_abgr, int dst_stride_abgr,
-                    int width, int height);
+                    int width, int height, int attenuate);

 // Convert I400 (grey) to ARGB.  Reverse of ARGBToI400.
 LIBYUV_API

--- a/include/libyuv/row.h
+++ b/include/libyuv/row.h
@@ -187,6 +187,14 @@ extern "C" {
 #define HAS_I422TOABGRROW_SSSE3
 #endif

+
+// I422Alpha row functions: built for 32 bit x86 with Visual C and clangcl.
+// TODO(fbarchard): Port to gcc.
+#if defined(_M_IX86) && !defined(LIBYUV_DISABLE_X86)
+#define HAS_I422ALPHATOABGRROW_SSSE3
+#define HAS_I422ALPHATOARGBROW_SSSE3
+#endif
+
 // The following are available for AVX2 Visual C and clangcl 32 bit:
 // TODO(fbarchard): Port to gcc.
 #if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && \
@@ -257,6 +265,7 @@ extern "C" {
 #endif

 // The following are disabled when SSSE3 is available:
+// TODO(fbarchard): remove sse2.  ssse3 is faster and well supported.
 #if !defined(LIBYUV_DISABLE_X86) && \
    (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__)) && \
    !defined(LIBYUV_SSSE3_ONLY)
@@ -1045,6 +1054,20 @@ void I422ToARGBRow_C(const uint8* src_y,
                     uint8* dst_argb,
                     struct YuvConstants* yuvconstants,
                     int width);
+void I422AlphaToARGBRow_C(const uint8* y_buf,
+                          const uint8* u_buf,
+                          const uint8* v_buf,
+                          const uint8* a_buf,
+                          uint8* dst_argb,
+                          struct YuvConstants* yuvconstants,
+                          int width);
+// Portable C row function: I422 plus a separate alpha plane to ABGR.
+void I422AlphaToABGRRow_C(const uint8* y_buf,
+                          const uint8* u_buf,
+                          const uint8* v_buf,
+                          const uint8* a_buf,
+                          uint8* dst_abgr,
+                          struct YuvConstants* yuvconstants,
+                          int width);
 void I422ToABGRRow_C(const uint8* src_y,
                     const uint8* src_u,
                     const uint8* src_v,
@@ -1213,6 +1236,20 @@ void I422ToARGBRow_SSSE3(const uint8* src_y,
                         uint8* dst_argb,
                         struct YuvConstants* yuvconstants,
                         int width);
+void I422AlphaToARGBRow_SSSE3(const uint8* y_buf,
+                              const uint8* u_buf,
+                              const uint8* v_buf,
+                              const uint8* a_buf,
+                              uint8* dst_argb,
+                              struct YuvConstants* yuvconstants,
+                              int width);
+// SSSE3 row function: I422 plus a separate alpha plane to ABGR.
+void I422AlphaToABGRRow_SSSE3(const uint8* y_buf,
+                              const uint8* u_buf,
+                              const uint8* v_buf,
+                              const uint8* a_buf,
+                              uint8* dst_abgr,
+                              struct YuvConstants* yuvconstants,
+                              int width);
 void I422ToARGBRow_SSSE3(const uint8* src_y,
                         const uint8* src_u,
                         const uint8* src_v,
@@ -1405,6 +1442,20 @@ void I422ToARGBRow_Any_SSSE3(const uint8* src_y,
                             uint8* dst_argb,
                             struct YuvConstants* yuvconstants,
                             int width);
+void I422AlphaToARGBRow_Any_SSSE3(const uint8* y_buf,
+                                  const uint8* u_buf,
+                                  const uint8* v_buf,
+                                  const uint8* a_buf,
+                                  uint8* dst_argb,
+                                  struct YuvConstants* yuvconstants,
+                                  int width);
+void I422AlphaToABGRRow_Any_SSSE3(const uint8* y_buf,
+                                  const uint8* u_buf,
+                                  const uint8* v_buf,
+                                  const uint8* a_buf,
+                                  uint8* dst_abgr,
+                                  struct YuvConstants* yuvconstants,
+                                  int width);
 void I411ToARGBRow_Any_SSSE3(const uint8* src_y,
                             const uint8* src_u,
                             const uint8* src_v,

--- a/include/libyuv/version.h
+++ b/include/libyuv/version.h
@@ -11,6 +11,6 @@
 #ifndef INCLUDE_LIBYUV_VERSION_H_  // NOLINT
 #define INCLUDE_LIBYUV_VERSION_H_

-#define LIBYUV_VERSION 1492
+#define LIBYUV_VERSION 1493

 #endif  // INCLUDE_LIBYUV_VERSION_H_  NOLINT
--- a/source/convert_argb.cc
+++ b/source/convert_argb.cc
@@ -336,16 +336,15 @@ int I420AlphaToARGB(const uint8* src_y, int src_stride_y,
                    const uint8* src_v, int src_stride_v,
                    const uint8* src_a, int src_stride_a,
                    uint8* dst_argb, int dst_stride_argb,
-                    int width, int height) {
+                    int width, int height, int attenuate) {
  int y;
-  void (*I422ToARGBRow)(const uint8* y_buf,
-                        const uint8* u_buf,
-                        const uint8* v_buf,
-                        uint8* rgb_buf,
-                        struct YuvConstants* yuvconstants,
-                        int width) = I422ToARGBRow_C;
-  void (*ARGBCopyYToAlphaRow)(const uint8* src_y, uint8* dst_argb, int width) =
-      ARGBCopyYToAlphaRow_C;
+  void (*I422AlphaToARGBRow)(const uint8* y_buf,
+                             const uint8* u_buf,
+                             const uint8* v_buf,
+                             const uint8* a_buf,
+                             uint8* dst_argb,
+                             struct YuvConstants* yuvconstants,
+                             int width) = I422AlphaToARGBRow_C;
  void (*ARGBAttenuateRow)(const uint8* src_argb, uint8* dst_argb,
                           int width) = ARGBAttenuateRow_C;
  if (!src_y || !src_u || !src_v || !dst_argb ||
@@ -358,53 +357,37 @@ int I420AlphaToARGB(const uint8* src_y, int src_stride_y,
    dst_argb = dst_argb + (height - 1) * dst_stride_argb;
    dst_stride_argb = -dst_stride_argb;
  }
-#if defined(HAS_I422TOARGBROW_SSSE3)
+#if defined(HAS_I422ALPHATOARGBROW_SSSE3)
  if (TestCpuFlag(kCpuHasSSSE3)) {
-    I422ToARGBRow = I422ToARGBRow_Any_SSSE3;
+    I422AlphaToARGBRow = I422AlphaToARGBRow_Any_SSSE3;
    if (IS_ALIGNED(width, 8)) {
-      I422ToARGBRow = I422ToARGBRow_SSSE3;
+      I422AlphaToARGBRow = I422AlphaToARGBRow_SSSE3;
    }
  }
 #endif
-#if defined(HAS_I422TOARGBROW_AVX2)
+#if defined(HAS_I422ALPHATOARGBROW_AVX2)
  if (TestCpuFlag(kCpuHasAVX2)) {
-    I422ToARGBRow = I422ToARGBRow_Any_AVX2;
+    I422AlphaToARGBRow = I422AlphaToARGBRow_Any_AVX2;
    if (IS_ALIGNED(width, 16)) {
-      I422ToARGBRow = I422ToARGBRow_AVX2;
+      I422AlphaToARGBRow = I422AlphaToARGBRow_AVX2;
    }
  }
 #endif
-#if defined(HAS_I422TOARGBROW_NEON)
+#if defined(HAS_I422ALPHATOARGBROW_NEON)
  if (TestCpuFlag(kCpuHasNEON)) {
-    I422ToARGBRow = I422ToARGBRow_Any_NEON;
+    I422AlphaToARGBRow = I422AlphaToARGBRow_Any_NEON;
    if (IS_ALIGNED(width, 8)) {
-      I422ToARGBRow = I422ToARGBRow_NEON;
+      I422AlphaToARGBRow = I422AlphaToARGBRow_NEON;
    }
  }
 #endif
-#if defined(HAS_I422TOARGBROW_MIPS_DSPR2)
+#if defined(HAS_I422ALPHATOARGBROW_MIPS_DSPR2)
  if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(width, 4) &&
      IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) &&
      IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) &&
      IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2) &&
      IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride_argb, 4)) {
-    I422ToARGBRow = I422ToARGBRow_MIPS_DSPR2;
-  }
-#endif
-#if defined(HAS_ARGBCOPYYTOALPHAROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2)) {
-    ARGBCopyYToAlphaRow = ARGBCopyYToAlphaRow_Any_SSE2;
-    if (IS_ALIGNED(width, 8)) {
-      ARGBCopyYToAlphaRow = ARGBCopyYToAlphaRow_SSE2;
-    }
-  }
-#endif
-#if defined(HAS_ARGBCOPYYTOALPHAROW_AVX2)
-  if (TestCpuFlag(kCpuHasAVX2)) {
-    ARGBCopyYToAlphaRow = ARGBCopyYToAlphaRow_Any_AVX2;
-    if (IS_ALIGNED(width, 16)) {
-      ARGBCopyYToAlphaRow = ARGBCopyYToAlphaRow_AVX2;
-    }
+    I422AlphaToARGBRow = I422AlphaToARGBRow_MIPS_DSPR2;
  }
 #endif
 #if defined(HAS_ARGBATTENUATEROW_SSE2)
@@ -441,9 +424,10 @@ int I420AlphaToARGB(const uint8* src_y, int src_stride_y,
 #endif

  for (y = 0; y < height; ++y) {
-    I422ToARGBRow(src_y, src_u, src_v, dst_argb, &kYuvConstants, width);
-    ARGBCopyYToAlphaRow(src_a, dst_argb, width);
-    ARGBAttenuateRow(dst_argb, dst_argb, width);
+    I422AlphaToARGBRow(src_y, src_u, src_v, src_a, dst_argb, &kYuvConstants, width);
+    if (attenuate) {
+      ARGBAttenuateRow(dst_argb, dst_argb, width);
+    }
    dst_argb += dst_stride_argb;
    src_a += src_stride_a;
    src_y += src_stride_y;
@@ -454,24 +438,24 @@ int I420AlphaToARGB(const uint8* src_y, int src_stride_y,
  }
  return 0;
 }
-// Convert I420 with Alpha to preattenuated ABGR.
+
+// Convert I420 with Alpha to ABGR, optionally premultiplied by alpha.
 LIBYUV_API
 int I420AlphaToABGR(const uint8* src_y, int src_stride_y,
                    const uint8* src_u, int src_stride_u,
                    const uint8* src_v, int src_stride_v,
                    const uint8* src_a, int src_stride_a,
                    uint8* dst_abgr, int dst_stride_abgr,
-                    int width, int height) {
+                    int width, int height, int attenuate) {
  int y;
-  void (*I422ToABGRRow)(const uint8* y_buf,
-                        const uint8* u_buf,
-                        const uint8* v_buf,
-                        uint8* rgb_buf,
-                        struct YuvConstants* yuvconstants,
-                        int width) = I422ToABGRRow_C;
-  void (*ARGBCopyYToAlphaRow)(const uint8* src_y, uint8* dst_argb, int width) =
-      ARGBCopyYToAlphaRow_C;
-  void (*ARGBAttenuateRow)(const uint8* src_argb, uint8* dst_argb,
+  void (*I422AlphaToABGRRow)(const uint8* y_buf,
+                             const uint8* u_buf,
+                             const uint8* v_buf,
+                             const uint8* a_buf,
+                             uint8* dst_abgr,
+                             struct YuvConstants* yuvconstants,
+                             int width) = I422AlphaToABGRRow_C;
+  void (*ARGBAttenuateRow)(const uint8* src_abgr, uint8* dst_abgr,
                           int width) = ARGBAttenuateRow_C;
  if (!src_y || !src_u || !src_v || !dst_abgr ||
      width <= 0 || height == 0) {
@@ -483,53 +467,37 @@ int I420AlphaToABGR(const uint8* src_y, int src_stride_y,
    dst_abgr = dst_abgr + (height - 1) * dst_stride_abgr;
    dst_stride_abgr = -dst_stride_abgr;
  }
-#if defined(HAS_I422TOABGRROW_SSSE3)
+#if defined(HAS_I422ALPHATOABGRROW_SSSE3)
  if (TestCpuFlag(kCpuHasSSSE3)) {
-    I422ToABGRRow = I422ToABGRRow_Any_SSSE3;
+    I422AlphaToABGRRow = I422AlphaToABGRRow_Any_SSSE3;
    if (IS_ALIGNED(width, 8)) {
-      I422ToABGRRow = I422ToABGRRow_SSSE3;
+      I422AlphaToABGRRow = I422AlphaToABGRRow_SSSE3;
    }
  }
 #endif
-#if defined(HAS_I422TOABGRROW_AVX2)
+#if defined(HAS_I422ALPHATOABGRROW_AVX2)
  if (TestCpuFlag(kCpuHasAVX2)) {
-    I422ToABGRRow = I422ToABGRRow_Any_AVX2;
+    I422AlphaToABGRRow = I422AlphaToABGRRow_Any_AVX2;
    if (IS_ALIGNED(width, 16)) {
-      I422ToABGRRow = I422ToABGRRow_AVX2;
+      I422AlphaToABGRRow = I422AlphaToABGRRow_AVX2;
    }
  }
 #endif
-#if defined(HAS_I422TOABGRROW_NEON)
+#if defined(HAS_I422ALPHATOABGRROW_NEON)
  if (TestCpuFlag(kCpuHasNEON)) {
-    I422ToABGRRow = I422ToABGRRow_Any_NEON;
+    I422AlphaToABGRRow = I422AlphaToABGRRow_Any_NEON;
    if (IS_ALIGNED(width, 8)) {
-      I422ToABGRRow = I422ToABGRRow_NEON;
+      I422AlphaToABGRRow = I422AlphaToABGRRow_NEON;
    }
  }
 #endif
-#if defined(HAS_I422TOABGRROW_MIPS_DSPR2)
+#if defined(HAS_I422ALPHATOABGRROW_MIPS_DSPR2)
  if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(width, 4) &&
      IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) &&
      IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) &&
      IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2) &&
      IS_ALIGNED(dst_abgr, 4) && IS_ALIGNED(dst_stride_abgr, 4)) {
-    I422ToABGRRow = I422ToABGRRow_MIPS_DSPR2;
-  }
-#endif
-#if defined(HAS_ARGBCOPYYTOALPHAROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2)) {
-    ARGBCopyYToAlphaRow = ARGBCopyYToAlphaRow_Any_SSE2;
-    if (IS_ALIGNED(width, 8)) {
-      ARGBCopyYToAlphaRow = ARGBCopyYToAlphaRow_SSE2;
-    }
-  }
-#endif
-#if defined(HAS_ARGBCOPYYTOALPHAROW_AVX2)
-  if (TestCpuFlag(kCpuHasAVX2)) {
-    ARGBCopyYToAlphaRow = ARGBCopyYToAlphaRow_Any_AVX2;
-    if (IS_ALIGNED(width, 16)) {
-      ARGBCopyYToAlphaRow = ARGBCopyYToAlphaRow_AVX2;
-    }
+    I422AlphaToABGRRow = I422AlphaToABGRRow_MIPS_DSPR2;
  }
 #endif
 #if defined(HAS_ARGBATTENUATEROW_SSE2)
@@ -566,9 +534,10 @@ int I420AlphaToABGR(const uint8* src_y, int src_stride_y,
 #endif

  for (y = 0; y < height; ++y) {
-    I422ToABGRRow(src_y, src_u, src_v, dst_abgr, &kYuvConstants, width);
-    ARGBCopyYToAlphaRow(src_a, dst_abgr, width);
-    ARGBAttenuateRow(dst_abgr, dst_abgr, width);
+    I422AlphaToABGRRow(src_y, src_u, src_v, src_a, dst_abgr, &kYuvConstants, width);
+    if (attenuate) {
+      ARGBAttenuateRow(dst_abgr, dst_abgr, width);
+    }
    dst_abgr += dst_stride_abgr;
    src_a += src_stride_a;
    src_y += src_stride_y;

--- a/source/row_any.cc
+++ b/source/row_any.cc
@@ -22,6 +22,34 @@ extern "C" {
 // Subsampled source needs to be increase by 1 of not even.
 #define SS(width, shift) (((width) + (1 << (shift)) - 1) >> (shift))

+// Any 4 planes to 1 with yuvconstants
+// Wraps a SIMD row function (ANY_SIMD) so that it accepts any width:
+//   - n = width & ~MASK pixels are passed straight to ANY_SIMD (if n > 0);
+//   - r = width & MASK leftover pixels are copied into a scratch buffer,
+//     converted as one block of MASK + 1 pixels, and only the valid part of
+//     the result is copied back to dst_ptr.
+// Scratch layout, 64 bytes per region: Y at 0, U at 64, V at 128, A at 192,
+// converted output at 256.  Only the 4 input regions are zeroed up front.
+// UVSHIFT is the shift applied to pixel counts/offsets for the U and V
+// planes, DUVSHIFT the shift applied for the destination, and BPP the
+// number of destination bytes per (shifted) unit.
+#define ANY41C(NAMEANY, ANY_SIMD, UVSHIFT, DUVSHIFT, BPP, MASK)                \
+    void NAMEANY(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf,   \
+                 const uint8* a_buf, uint8* dst_ptr,                           \
+                 struct YuvConstants* yuvconstants,  int width) {              \
+      SIMD_ALIGNED(uint8 temp[64 * 5]);                                        \
+      memset(temp, 0, 64 * 4);  /* for msan */                                 \
+      int r = width & MASK;                                                    \
+      int n = width & ~MASK;                                                   \
+      if (n > 0) {                                                             \
+        ANY_SIMD(y_buf, u_buf, v_buf, a_buf, dst_ptr, yuvconstants, n);        \
+      }                                                                        \
+      memcpy(temp, y_buf + n, r);                                              \
+      memcpy(temp + 64, u_buf + (n >> UVSHIFT), SS(r, UVSHIFT));               \
+      memcpy(temp + 128, v_buf + (n >> UVSHIFT), SS(r, UVSHIFT));              \
+      memcpy(temp + 192, a_buf + n, r);                                        \
+      ANY_SIMD(temp, temp + 64, temp + 128, temp + 192, temp + 256,            \
+               yuvconstants, MASK + 1);                                        \
+      memcpy(dst_ptr + (n >> DUVSHIFT) * BPP, temp + 256,                      \
+             SS(r, DUVSHIFT) * BPP);                                           \
+    }
+
+// Both SSSE3 rows use UVSHIFT 1, DUVSHIFT 0, 4 bytes per pixel and MASK 7
+// (blocks of 8 pixels).
+#ifdef HAS_I422ALPHATOARGBROW_SSSE3
+ANY41C(I422AlphaToARGBRow_Any_SSSE3, I422AlphaToARGBRow_SSSE3, 1, 0, 4, 7)
+ANY41C(I422AlphaToABGRRow_Any_SSSE3, I422AlphaToABGRRow_SSSE3, 1, 0, 4, 7)
+#endif
+#undef ANY41C
+
 // Any 3 planes to 1.
 #define ANY31(NAMEANY, ANY_SIMD, UVSHIFT, DUVSHIFT, BPP, MASK)                 \
    void NAMEANY(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf,   \
@@ -50,7 +78,7 @@ ANY31(I422ToYUY2Row_Any_NEON, I422ToYUY2Row_NEON, 1, 1, 4, 15)
 #ifdef HAS_I422TOUYVYROW_NEON
 ANY31(I422ToUYVYRow_Any_NEON, I422ToUYVYRow_NEON, 1, 1, 4, 15)
 #endif
-#undef ANY31C
+#undef ANY31

 // Any 3 planes to 1 with yuvconstants
 #define ANY31C(NAMEANY, ANY_SIMD, UVSHIFT, DUVSHIFT, BPP, MASK)                \

--- a/source/row_common.cc
+++ b/source/row_common.cc
@@ -2412,6 +2412,29 @@ void I422ToRGB565Row_SSSE3(const uint8* src_y,
 }
 #endif

+// Convert one row of I422 with a separate alpha plane to ARGB.
+// Done in 2 steps: I422ToARGBRow_C converts the Y, U and V rows using the
+// caller supplied yuvconstants, then ARGBCopyYToAlphaRow_C is applied with
+// src_a as its source and dst_argb as its destination.
+// No attenuation is done here; callers apply it separately when wanted.
+void I422AlphaToARGBRow_C(const uint8* src_y,
+                          const uint8* src_u,
+                          const uint8* src_v,
+                          const uint8* src_a,
+                          uint8* dst_argb,
+                          struct YuvConstants* yuvconstants,
+                          int width) {
+  // Honor the yuvconstants argument instead of the global kYuvConstants.
+  I422ToARGBRow_C(src_y, src_u, src_v, dst_argb, yuvconstants, width);
+  ARGBCopyYToAlphaRow_C(src_a, dst_argb, width);
+}
+
+// Convert one row of I422 with a separate alpha plane to ABGR.
+// Done in 2 steps: I422ToABGRRow_C converts the Y, U and V rows using the
+// caller supplied yuvconstants, then ARGBCopyYToAlphaRow_C is applied with
+// src_a as its source and dst_abgr as its destination.
+// No attenuation is done here; callers apply it separately when wanted.
+void I422AlphaToABGRRow_C(const uint8* src_y,
+                          const uint8* src_u,
+                          const uint8* src_v,
+                          const uint8* src_a,
+                          uint8* dst_abgr,
+                          struct YuvConstants* yuvconstants,
+                          int width) {
+  // Honor the yuvconstants argument instead of the global kYuvConstants.
+  I422ToABGRRow_C(src_y, src_u, src_v, dst_abgr, yuvconstants, width);
+  ARGBCopyYToAlphaRow_C(src_a, dst_abgr, width);
+}
+
 #if defined(HAS_I422TOARGB1555ROW_SSSE3)
 void I422ToARGB1555Row_SSSE3(const uint8* src_y,
                             const uint8* src_u,

--- a/source/row_win.cc
+++ b/source/row_win.cc
@@ -2416,6 +2416,20 @@ void I422ToABGRRow_AVX2(const uint8* y_buf,
    __asm lea        eax, [eax + 8]                                            \
  }

+// Read 4 UV from 422, upsample to 8 UV.  With 8 Alpha.
+// Inputs:  eax = Y pointer, esi = U pointer, [esi + edi] addresses V,
+//          ebp = alpha pointer.
+// Outputs: xmm0 = interleaved UV with each pair duplicated, xmm4 = 8 Y bytes
+//          each duplicated, xmm5 = 8 alpha bytes.  xmm1 is scratch.
+// esi advances by 4; eax and ebp advance by 8.  edi is left unchanged.
+#define READYUVA422 __asm {                                                    \
+    __asm movd       xmm0, [esi]          /* U */                              \
+    __asm movd       xmm1, [esi + edi]    /* V */                              \
+    __asm lea        esi,  [esi + 4]                                           \
+    __asm punpcklbw  xmm0, xmm1           /* UV */                             \
+    __asm punpcklwd  xmm0, xmm0           /* UVUV (upsample) */                \
+    __asm movq       xmm4, qword ptr [eax]   /* Y */                           \
+    __asm punpcklbw  xmm4, xmm4                                                \
+    __asm lea        eax, [eax + 8]                                            \
+    __asm movq       xmm5, qword ptr [ebp]   /* A */                           \
+    __asm lea        ebp, [ebp + 8]                                            \
+  }
+
 // Read 2 UV from 411, upsample to 8 UV.
 #define READYUV411 __asm {                                                     \
    __asm pinsrw     xmm0, [esi], 0        /* U */                             \
@@ -2833,6 +2847,88 @@ void I422ToARGBRow_SSSE3(const uint8* y_buf,
  }
 }

+// 8 pixels.
+// 4 UV values upsampled to 8 UV, mixed with 8 Y and 8 A producing 8 ARGB
+// (32 bytes).
+// Naked function: arguments are read from the stack.  The 16 in each offset
+// accounts for the 4 registers pushed on entry.
+__declspec(naked)
+void I422AlphaToARGBRow_SSSE3(const uint8* y_buf,
+                              const uint8* u_buf,
+                              const uint8* v_buf,
+                              const uint8* a_buf,
+                              uint8* dst_argb,
+                              struct YuvConstants* yuvconstants,
+                              int width) {
+  __asm {
+    push       esi
+    push       edi
+    push       ebx
+    push       ebp
+    mov        eax, [esp + 16 + 4]   // Y
+    mov        esi, [esp + 16 + 8]   // U
+    mov        edi, [esp + 16 + 12]  // V
+    mov        ebp, [esp + 16 + 16]  // A
+    mov        edx, [esp + 16 + 20]  // argb
+    mov        ebx, [esp + 16 + 24]  // yuvconstants
+    mov        ecx, [esp + 16 + 28]  // width
+    sub        edi, esi              // edi = V - U, so V is at [esi + edi]
+    pcmpeqb    xmm5, xmm5           // unused: READYUVA422 reloads xmm5 with A
+
+ convertloop:
+    READYUVA422
+    YUVTORGB(ebx)
+    STOREARGB
+
+    sub        ecx, 8                // 8 pixels consumed per iteration
+    jg         convertloop
+
+    pop        ebp
+    pop        ebx
+    pop        edi
+    pop        esi
+    ret
+  }
+}
+
+// 8 pixels.
+// 4 UV values upsampled to 8 UV, mixed with 8 Y and 8 A producing 8 ABGR
+// (32 bytes).
+// Naked function: arguments are read from the stack.  The 16 in each offset
+// accounts for the 4 registers pushed on entry.
+__declspec(naked)
+void I422AlphaToABGRRow_SSSE3(const uint8* y_buf,
+                              const uint8* u_buf,
+                              const uint8* v_buf,
+                              const uint8* a_buf,
+                              uint8* dst_abgr,
+                              struct YuvConstants* yuvconstants,
+                              int width) {
+  __asm {
+    push       esi
+    push       edi
+    push       ebx
+    push       ebp
+    mov        eax, [esp + 16 + 4]   // Y
+    mov        esi, [esp + 16 + 8]   // U
+    mov        edi, [esp + 16 + 12]  // V
+    mov        ebp, [esp + 16 + 16]  // A
+    mov        edx, [esp + 16 + 20]  // abgr
+    mov        ebx, [esp + 16 + 24]  // yuvconstants
+    mov        ecx, [esp + 16 + 28]  // width
+    sub        edi, esi              // edi = V - U, so V is at [esi + edi]
+    pcmpeqb    xmm5, xmm5           // unused: READYUVA422 reloads xmm5 with A
+
+ convertloop:
+    READYUVA422
+    YUVTORGB(ebx)
+    STOREABGR
+
+    sub        ecx, 8                // 8 pixels consumed per iteration
+    jg         convertloop
+
+    pop        ebp
+    pop        ebx
+    pop        edi
+    pop        esi
+    ret
+  }
+}
+
 // 8 pixels.
 // 2 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
 // Similar to I420 but duplicate UV once more.

--- a/unit_test/convert_test.cc
+++ b/unit_test/convert_test.cc
@@ -518,7 +518,7 @@ TESTPLANARTOB(I420, 2, 2, I400, 1, 1, 1, 0, ARGB, 4)
 TESTPLANARTOB(J420, 2, 2, J400, 1, 1, 1, 0, ARGB, 4)

 #define TESTQPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
-                       YALIGN, W1280, DIFF, N, NEG, OFF)                       \
+                       YALIGN, W1280, DIFF, N, NEG, OFF, ATTEN)                \
 TEST_F(libyuvTest, FMT_PLANAR##To##FMT_B##N) {                                 \
  const int kWidth = ((W1280) > 0) ? (W1280) : 1;                              \
  const int kHeight = ALIGNINT(benchmark_height_, YALIGN);                     \
@@ -547,7 +547,7 @@ TEST_F(libyuvTest, FMT_PLANAR##To##FMT_B##N) {                                 \
                        src_v + OFF, SUBSAMPLE(kWidth, SUBSAMP_X),             \
                        src_a + OFF, kWidth,                                   \
                        dst_argb_c + OFF, kStrideB,                            \
-                        kWidth, NEG kHeight);                                  \
+                        kWidth, NEG kHeight, ATTEN);                           \
  MaskCpuFlags(-1);                                                            \
  for (int i = 0; i < benchmark_iterations_; ++i) {                            \
    FMT_PLANAR##To##FMT_B(src_y + OFF, kWidth,                                 \
@@ -555,7 +555,7 @@ TEST_F(libyuvTest, FMT_PLANAR##To##FMT_B##N) {                                 \
                          src_v + OFF, SUBSAMPLE(kWidth, SUBSAMP_X),           \
                          src_a + OFF, kWidth,                                 \
                          dst_argb_opt + OFF, kStrideB,                        \
-                          kWidth, NEG kHeight);                                \
+                          kWidth, NEG kHeight, ATTEN);                         \
  }                                                                            \
  int max_diff = 0;                                                            \
  for (int i = 0; i < kWidth * BPP_B * kHeight; ++i) {                         \
@@ -578,13 +578,15 @@ TEST_F(libyuvTest, FMT_PLANAR##To##FMT_B##N) {                                 \
 #define TESTQPLANARTOB(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN,  \
                       YALIGN, DIFF)                                           \
    TESTQPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN,     \
-        YALIGN, benchmark_width_ - 4, DIFF, _Any, +, 0)                        \
+        YALIGN, benchmark_width_ - 4, DIFF, _Any, +, 0, 0)                     \
+    TESTQPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN,     \
+        YALIGN, benchmark_width_, DIFF, _Unaligned, +, 1, 0)                   \
    TESTQPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN,     \
-        YALIGN, benchmark_width_, DIFF, _Unaligned, +, 1)                      \
+        YALIGN, benchmark_width_, DIFF, _Invert, -, 0, 0)                      \
    TESTQPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN,     \
-        YALIGN, benchmark_width_, DIFF, _Invert, -, 0)                         \
+        YALIGN, benchmark_width_, DIFF, _Opt, +, 0, 0)                         \
    TESTQPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN,     \
-        YALIGN, benchmark_width_, DIFF, _Opt, +, 0)
+        YALIGN, benchmark_width_, DIFF, _Premult, +, 0, 1)

 TESTQPLANARTOB(I420Alpha, 2, 2, ARGB, 4, 4, 1, 2)
 TESTQPLANARTOB(I420Alpha, 2, 2, ABGR, 4, 4, 1, 2)