I411ToARGB doing 2 UV values with 8 Y values

BUG=40
TEST=planar_test
Review URL: https://webrtc-codereview.appspot.com/637005

git-svn-id: http://libyuv.googlecode.com/svn/trunk@277 16f28f9a-4ce2-e073-06de-1de4eb20be90

I411ToARGB doing 2 UV values with 8 Y values
BUG=40 TEST=planar_test Review URL: https://webrtc-codereview.appspot.com/637005 git-svn-id: http://libyuv.googlecode.com/svn/trunk@277 16f28f9a-4ce2-e073-06de-1de4eb20be90
e214fe3f · fbarchard@google.com · 6d6b7709 · e214fe3f · e214fe3f · e214fe3f
Commit e214fe3f authored Jun 04, 2012 by fbarchard@google.com
10 changed files
--- a/include/libyuv/planar_functions.h
+++ b/include/libyuv/planar_functions.h
@@ -31,6 +31,13 @@ void CopyPlane(const uint8* src_y, int src_stride_y,
               uint8* dst_y, int dst_stride_y,
               int width, int height);
+// Convert I420 to I400 by copying Y only.  (calls CopyPlane ignoring u/v)
+int I420ToI400(const uint8* src_y, int src_stride_y,  // Returns 0 or -1.
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_u, int dst_stride_u,  // Ignored by the definition.
+               uint8* dst_v, int dst_stride_v,  // Ignored by the definition.
+               int width, int height);  // Negative height flips the image.
 // I420 mirror.
 int I420Mirror(const uint8* src_y, int src_stride_y,
               const uint8* src_u, int src_stride_u,
@@ -62,6 +69,13 @@ int UYVYToARGB(const uint8* src_uyvy, int src_stride_uyvy,
               uint8* dst_argb, int dst_stride_argb,
               int width, int height);
+// Convert I444 to ARGB.  Returns 0 on success.
+int I444ToARGB(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
+               uint8* dst_argb, int dst_stride_argb,
+               int width, int height);  // Negative height flips the image.
 // Convert I422 to ARGB.
 int I422ToARGB(const uint8* src_y, int src_stride_y,
               const uint8* src_u, int src_stride_u,
@@ -69,8 +83,8 @@ int I422ToARGB(const uint8* src_y, int src_stride_y,
               uint8* dst_argb, int dst_stride_argb,
               int width, int height);
-// Convert I444 to ARGB.
+// Convert I411 to ARGB.
-int I444ToARGB(const uint8* src_y, int src_stride_y,
+int I411ToARGB(const uint8* src_y, int src_stride_y,
               const uint8* src_u, int src_stride_u,
               const uint8* src_v, int src_stride_v,
               uint8* dst_argb, int dst_stride_argb,

--- a/source/convert_from.cc
+++ b/source/convert_from.cc
@@ -660,32 +660,32 @@ int I420ToARGB(const uint8* src_y, int src_stride_y,
    dst_argb = dst_argb + (height - 1) * dst_stride_argb;
    dst_stride_argb = -dst_stride_argb;
  }
-  void (*I420ToARGBRow)(const uint8* y_buf,
+  void (*I422ToARGBRow)(const uint8* y_buf,
                        const uint8* u_buf,
                        const uint8* v_buf,
                        uint8* rgb_buf,
-                        int width) = I420ToARGBRow_C;
+                        int width) = I422ToARGBRow_C;
-#if defined(HAS_I420TOARGBROW_NEON)
+#if defined(HAS_I422TOARGBROW_NEON)
  if (TestCpuFlag(kCpuHasNEON)) {
-    I420ToARGBRow = I420ToARGBRow_Any_NEON;
+    I422ToARGBRow = I422ToARGBRow_Any_NEON;
    if (IS_ALIGNED(width, 16)) {
-      I420ToARGBRow = I420ToARGBRow_NEON;
+      I422ToARGBRow = I422ToARGBRow_NEON;
    }
  }
-#elif defined(HAS_I420TOARGBROW_SSSE3)
+#elif defined(HAS_I422TOARGBROW_SSSE3)
  if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
-    I420ToARGBRow = I420ToARGBRow_Any_SSSE3;
+    I422ToARGBRow = I422ToARGBRow_Any_SSSE3;
    if (IS_ALIGNED(width, 8)) {
-      I420ToARGBRow = I420ToARGBRow_Unaligned_SSSE3;
+      I422ToARGBRow = I422ToARGBRow_Unaligned_SSSE3;
      if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
-        I420ToARGBRow = I420ToARGBRow_SSSE3;
+        I422ToARGBRow = I422ToARGBRow_SSSE3;
      }
    }
  }
 #endif
  for (int y = 0; y < height; ++y) {
-    I420ToARGBRow(src_y, src_u, src_v, dst_argb, width);
+    I422ToARGBRow(src_y, src_u, src_v, dst_argb, width);
    dst_argb += dst_stride_argb;
    src_y += src_stride_y;
    if (y & 1) {
@@ -708,32 +708,32 @@ int I420ToBGRA(const uint8* src_y, int src_stride_y,
    dst_bgra = dst_bgra + (height - 1) * dst_stride_bgra;
    dst_stride_bgra = -dst_stride_bgra;
  }
-  void (*I420ToBGRARow)(const uint8* y_buf,
+  void (*I422ToBGRARow)(const uint8* y_buf,
                        const uint8* u_buf,
                        const uint8* v_buf,
                        uint8* rgb_buf,
-                        int width) = I420ToBGRARow_C;
+                        int width) = I422ToBGRARow_C;
-#if defined(HAS_I420TOBGRAROW_NEON)
+#if defined(HAS_I422TOBGRAROW_NEON)
  if (TestCpuFlag(kCpuHasNEON)) {
-    I420ToBGRARow = I420ToBGRARow_Any_NEON;
+    I422ToBGRARow = I422ToBGRARow_Any_NEON;
    if (IS_ALIGNED(width, 16)) {
-      I420ToBGRARow = I420ToBGRARow_NEON;
+      I422ToBGRARow = I422ToBGRARow_NEON;
    }
  }
-#elif defined(HAS_I420TOBGRAROW_SSSE3)
+#elif defined(HAS_I422TOBGRAROW_SSSE3)
  if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
-    I420ToBGRARow = I420ToBGRARow_Any_SSSE3;
+    I422ToBGRARow = I422ToBGRARow_Any_SSSE3;
    if (IS_ALIGNED(width, 8)) {
-      I420ToBGRARow = I420ToBGRARow_Unaligned_SSSE3;
+      I422ToBGRARow = I422ToBGRARow_Unaligned_SSSE3;
      if (IS_ALIGNED(dst_bgra, 16) && IS_ALIGNED(dst_stride_bgra, 16)) {
-        I420ToBGRARow = I420ToBGRARow_SSSE3;
+        I422ToBGRARow = I422ToBGRARow_SSSE3;
      }
    }
  }
 #endif
  for (int y = 0; y < height; ++y) {
-    I420ToBGRARow(src_y, src_u, src_v, dst_bgra, width);
+    I422ToBGRARow(src_y, src_u, src_v, dst_bgra, width);
    dst_bgra += dst_stride_bgra;
    src_y += src_stride_y;
    if (y & 1) {
@@ -756,32 +756,32 @@ int I420ToABGR(const uint8* src_y, int src_stride_y,
    dst_abgr = dst_abgr + (height - 1) * dst_stride_abgr;
    dst_stride_abgr = -dst_stride_abgr;
  }
-  void (*I420ToABGRRow)(const uint8* y_buf,
+  void (*I422ToABGRRow)(const uint8* y_buf,
                        const uint8* u_buf,
                        const uint8* v_buf,
                        uint8* rgb_buf,
-                        int width) = I420ToABGRRow_C;
+                        int width) = I422ToABGRRow_C;
-#if defined(HAS_I420TOABGRROW_NEON)
+#if defined(HAS_I422TOABGRROW_NEON)
  if (TestCpuFlag(kCpuHasNEON)) {
-    I420ToABGRRow = I420ToABGRRow_Any_NEON;
+    I422ToABGRRow = I422ToABGRRow_Any_NEON;
    if (IS_ALIGNED(width, 16)) {
-      I420ToABGRRow = I420ToABGRRow_NEON;
+      I422ToABGRRow = I422ToABGRRow_NEON;
    }
  }
-#elif defined(HAS_I420TOABGRROW_SSSE3)
+#elif defined(HAS_I422TOABGRROW_SSSE3)
  if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
-    I420ToABGRRow = I420ToABGRRow_Any_SSSE3;
+    I422ToABGRRow = I422ToABGRRow_Any_SSSE3;
    if (IS_ALIGNED(width, 8)) {
-      I420ToABGRRow = I420ToABGRRow_Unaligned_SSSE3;
+      I422ToABGRRow = I422ToABGRRow_Unaligned_SSSE3;
      if (IS_ALIGNED(dst_abgr, 16) && IS_ALIGNED(dst_stride_abgr, 16)) {
-        I420ToABGRRow = I420ToABGRRow_SSSE3;
+        I422ToABGRRow = I422ToABGRRow_SSSE3;
      }
    }
  }
 #endif
  for (int y = 0; y < height; ++y) {
-    I420ToABGRRow(src_y, src_u, src_v, dst_abgr, width);
+    I422ToABGRRow(src_y, src_u, src_v, dst_abgr, width);
    dst_abgr += dst_stride_abgr;
    src_y += src_stride_y;
    if (y & 1) {
@@ -804,18 +804,18 @@ int I420ToRGB24(const uint8* src_y, int src_stride_y,
    dst_argb = dst_argb + (height - 1) * dst_stride_argb;
    dst_stride_argb = -dst_stride_argb;
  }
-  void (*I420ToARGBRow)(const uint8* y_buf,
+  void (*I422ToARGBRow)(const uint8* y_buf,
                        const uint8* u_buf,
                        const uint8* v_buf,
                        uint8* rgb_buf,
-                        int width) = I420ToARGBRow_C;
+                        int width) = I422ToARGBRow_C;
-#if defined(HAS_I420TOARGBROW_NEON)
+#if defined(HAS_I422TOARGBROW_NEON)
  if (TestCpuFlag(kCpuHasNEON)) {
-    I420ToARGBRow = I420ToARGBRow_NEON;
+    I422ToARGBRow = I422ToARGBRow_NEON;
  }
-#elif defined(HAS_I420TOARGBROW_SSSE3)
+#elif defined(HAS_I422TOARGBROW_SSSE3)
  if (TestCpuFlag(kCpuHasSSSE3)) {
-    I420ToARGBRow = I420ToARGBRow_SSSE3;
+    I422ToARGBRow = I422ToARGBRow_SSSE3;
  }
 #endif
@@ -835,7 +835,7 @@ int I420ToRGB24(const uint8* src_y, int src_stride_y,
 #endif
  for (int y = 0; y < height; ++y) {
-    I420ToARGBRow(src_y, src_u, src_v, row, width);
+    I422ToARGBRow(src_y, src_u, src_v, row, width);
    ARGBToRGB24Row(row, dst_argb, width);
    dst_argb += dst_stride_argb;
    src_y += src_stride_y;
@@ -859,18 +859,18 @@ int I420ToRAW(const uint8* src_y, int src_stride_y,
    dst_argb = dst_argb + (height - 1) * dst_stride_argb;
    dst_stride_argb = -dst_stride_argb;
  }
-  void (*I420ToARGBRow)(const uint8* y_buf,
+  void (*I422ToARGBRow)(const uint8* y_buf,
                        const uint8* u_buf,
                        const uint8* v_buf,
                        uint8* rgb_buf,
-                        int width) = I420ToARGBRow_C;
+                        int width) = I422ToARGBRow_C;
-#if defined(HAS_I420TOARGBROW_NEON)
+#if defined(HAS_I422TOARGBROW_NEON)
  if (TestCpuFlag(kCpuHasNEON)) {
-    I420ToARGBRow = I420ToARGBRow_NEON;
+    I422ToARGBRow = I422ToARGBRow_NEON;
  }
-#elif defined(HAS_I420TOARGBROW_SSSE3)
+#elif defined(HAS_I422TOARGBROW_SSSE3)
  if (TestCpuFlag(kCpuHasSSSE3)) {
-    I420ToARGBRow = I420ToARGBRow_SSSE3;
+    I422ToARGBRow = I422ToARGBRow_SSSE3;
  }
 #endif
@@ -890,7 +890,7 @@ int I420ToRAW(const uint8* src_y, int src_stride_y,
 #endif
  for (int y = 0; y < height; ++y) {
-    I420ToARGBRow(src_y, src_u, src_v, row, width);
+    I422ToARGBRow(src_y, src_u, src_v, row, width);
    ARGBToRAWRow(row, dst_argb, width);
    dst_argb += dst_stride_argb;
    src_y += src_stride_y;
@@ -914,18 +914,18 @@ int I420ToRGB565(const uint8* src_y, int src_stride_y,
    dst_rgb = dst_rgb + (height - 1) * dst_stride_rgb;
    dst_stride_rgb = -dst_stride_rgb;
  }
-  void (*I420ToARGBRow)(const uint8* y_buf,
+  void (*I422ToARGBRow)(const uint8* y_buf,
                        const uint8* u_buf,
                        const uint8* v_buf,
                        uint8* rgb_buf,
-                        int width) = I420ToARGBRow_C;
+                        int width) = I422ToARGBRow_C;
-#if defined(HAS_I420TOARGBROW_NEON)
+#if defined(HAS_I422TOARGBROW_NEON)
  if (TestCpuFlag(kCpuHasNEON)) {
-    I420ToARGBRow = I420ToARGBRow_NEON;
+    I422ToARGBRow = I422ToARGBRow_NEON;
  }
-#elif defined(HAS_I420TOARGBROW_SSSE3)
+#elif defined(HAS_I422TOARGBROW_SSSE3)
  if (TestCpuFlag(kCpuHasSSSE3)) {
-    I420ToARGBRow = I420ToARGBRow_SSSE3;
+    I422ToARGBRow = I422ToARGBRow_SSSE3;
  }
 #endif
@@ -944,7 +944,7 @@ int I420ToRGB565(const uint8* src_y, int src_stride_y,
 #endif
  for (int y = 0; y < height; ++y) {
-    I420ToARGBRow(src_y, src_u, src_v, row, width);
+    I422ToARGBRow(src_y, src_u, src_v, row, width);
    ARGBToRGB565Row(row, dst_rgb, width);
    dst_rgb += dst_stride_rgb;
    src_y += src_stride_y;
@@ -968,18 +968,18 @@ int I420ToARGB1555(const uint8* src_y, int src_stride_y,
    dst_argb = dst_argb + (height - 1) * dst_stride_argb;
    dst_stride_argb = -dst_stride_argb;
  }
-  void (*I420ToARGBRow)(const uint8* y_buf,
+  void (*I422ToARGBRow)(const uint8* y_buf,
                        const uint8* u_buf,
                        const uint8* v_buf,
                        uint8* rgb_buf,
-                        int width) = I420ToARGBRow_C;
+                        int width) = I422ToARGBRow_C;
-#if defined(HAS_I420TOARGBROW_NEON)
+#if defined(HAS_I422TOARGBROW_NEON)
  if (TestCpuFlag(kCpuHasNEON)) {
-    I420ToARGBRow = I420ToARGBRow_NEON;
+    I422ToARGBRow = I422ToARGBRow_NEON;
  }
-#elif defined(HAS_I420TOARGBROW_SSSE3)
+#elif defined(HAS_I422TOARGBROW_SSSE3)
  if (TestCpuFlag(kCpuHasSSSE3)) {
-    I420ToARGBRow = I420ToARGBRow_SSSE3;
+    I422ToARGBRow = I422ToARGBRow_SSSE3;
  }
 #endif
@@ -998,7 +998,7 @@ int I420ToARGB1555(const uint8* src_y, int src_stride_y,
 #endif
  for (int y = 0; y < height; ++y) {
-    I420ToARGBRow(src_y, src_u, src_v, row, width);
+    I422ToARGBRow(src_y, src_u, src_v, row, width);
    ARGBToARGB1555Row(row, dst_argb, width);
    dst_argb += dst_stride_argb;
    src_y += src_stride_y;
@@ -1022,18 +1022,18 @@ int I420ToARGB4444(const uint8* src_y, int src_stride_y,
    dst_argb = dst_argb + (height - 1) * dst_stride_argb;
    dst_stride_argb = -dst_stride_argb;
  }
-  void (*I420ToARGBRow)(const uint8* y_buf,
+  void (*I422ToARGBRow)(const uint8* y_buf,
                        const uint8* u_buf,
                        const uint8* v_buf,
                        uint8* rgb_buf,
-                        int width) = I420ToARGBRow_C;
+                        int width) = I422ToARGBRow_C;
-#if defined(HAS_I420TOARGBROW_NEON)
+#if defined(HAS_I422TOARGBROW_NEON)
  if (TestCpuFlag(kCpuHasNEON)) {
-    I420ToARGBRow = I420ToARGBRow_NEON;
+    I422ToARGBRow = I422ToARGBRow_NEON;
  }
-#elif defined(HAS_I420TOARGBROW_SSSE3)
+#elif defined(HAS_I422TOARGBROW_SSSE3)
  if (TestCpuFlag(kCpuHasSSSE3)) {
-    I420ToARGBRow = I420ToARGBRow_SSSE3;
+    I422ToARGBRow = I422ToARGBRow_SSSE3;
  }
 #endif
@@ -1052,7 +1052,7 @@ int I420ToARGB4444(const uint8* src_y, int src_stride_y,
 #endif
  for (int y = 0; y < height; ++y) {
-    I420ToARGBRow(src_y, src_u, src_v, row, width);
+    I422ToARGBRow(src_y, src_u, src_v, row, width);
    ARGBToARGB4444Row(row, dst_argb, width);
    dst_argb += dst_stride_argb;
    src_y += src_stride_y;

--- a/source/format_conversion.cc
+++ b/source/format_conversion.cc
@@ -446,18 +446,18 @@ int I420ToBayer(const uint8* src_y, int src_stride_y,
    src_stride_u = -src_stride_u;
    src_stride_v = -src_stride_v;
  }
-  void (*I420ToARGBRow)(const uint8* y_buf,
+  void (*I422ToARGBRow)(const uint8* y_buf,
                        const uint8* u_buf,
                        const uint8* v_buf,
                        uint8* rgb_buf,
-                        int width) = I420ToARGBRow_C;
+                        int width) = I422ToARGBRow_C;
-#if defined(HAS_I420TOARGBROW_NEON)
+#if defined(HAS_I422TOARGBROW_NEON)
  if (TestCpuFlag(kCpuHasNEON)) {
-    I420ToARGBRow = I420ToARGBRow_NEON;
+    I422ToARGBRow = I422ToARGBRow_NEON;
  }
-#elif defined(HAS_I420TOARGBROW_SSSE3)
+#elif defined(HAS_I422TOARGBROW_SSSE3)
  if (TestCpuFlag(kCpuHasSSSE3)) {
-    I420ToARGBRow = I420ToARGBRow_SSSE3;
+    I422ToARGBRow = I422ToARGBRow_SSSE3;
  }
 #endif
  SIMD_ALIGNED(uint8 row[kMaxStride]);
@@ -478,7 +478,7 @@ int I420ToBayer(const uint8* src_y, int src_stride_y,
  }
  for (int y = 0; y < height; ++y) {
-    I420ToARGBRow(src_y, src_u, src_v, row, width);
+    I422ToARGBRow(src_y, src_u, src_v, row, width);
    ARGBToBayerRow(row, dst_bayer, index_map[y & 1], width);
    dst_bayer += dst_stride_bayer;
    src_y += src_stride_y;

--- a/source/planar_functions.cc
+++ b/source/planar_functions.cc
@@ -51,6 +51,26 @@ void CopyPlane(const uint8* src_y, int src_stride_y,
  }
 }
+// Convert I420 to I400: copies Y only.  (calls CopyPlane ignoring u/v)
+int I420ToI400(const uint8* src_y, int src_stride_y,
+               uint8* dst_y, int dst_stride_y,
+               uint8*, int,  // Unnamed and unused (dst_u pair in the header).
+               uint8*, int,  // Unnamed and unused (dst_v pair in the header).
+               int width, int height) {
+  if (!src_y || !dst_y ||
+      width <= 0 || height == 0) {
+    return -1;  // Null Y plane or empty dimensions.
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    src_y = src_y + (height - 1) * src_stride_y;  // Start at the last row.
+    src_stride_y = -src_stride_y;  // Then step upward through the source.
+  }
+  CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+  return 0;  // Success.
+}
 // Mirror a plane of data
 void MirrorPlane(const uint8* src_y, int src_stride_y,
                 uint8* dst_y, int dst_stride_y,
@@ -202,6 +222,45 @@ int ARGBBlend(const uint8* src_argb0, int src_stride_argb0,
  return 0;
 }
+// Convert I444 to ARGB.
+// Converts one row at a time; the Y, U and V planes each advance by their
+// own stride on every row.
+// Returns 0 on success, or -1 if any plane pointer is NULL, width <= 0 or
+// height == 0.  A negative height writes the output rows bottom-up.
+int I444ToARGB(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
+               uint8* dst_argb, int dst_stride_argb,
+               int width, int height) {
+  // Reject bad arguments up front, matching I420ToI400's convention, so a
+  // NULL plane never reaches the row functions.
+  if (!src_y || !src_u || !src_v || !dst_argb ||
+      width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    // Start at the last destination row and step backwards.
+    dst_argb = dst_argb + (height - 1) * dst_stride_argb;
+    dst_stride_argb = -dst_stride_argb;
+  }
+  // Default to the C row function; upgrade to a SIMD variant when possible.
+  void (*I444ToARGBRow)(const uint8* y_buf,
+                        const uint8* u_buf,
+                        const uint8* v_buf,
+                        uint8* rgb_buf,
+                        int width) = I444ToARGBRow_C;
+#if defined(HAS_I444TOARGBROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
+    // Any width of at least 8 pixels.
+    I444ToARGBRow = I444ToARGBRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 8)) {
+      // Width is a multiple of 8; destination alignment not required.
+      I444ToARGBRow = I444ToARGBRow_Unaligned_SSSE3;
+      if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+        // Destination pointer and stride are both 16-byte aligned.
+        I444ToARGBRow = I444ToARGBRow_SSSE3;
+      }
+    }
+  }
+#endif
+  for (int y = 0; y < height; ++y) {
+    I444ToARGBRow(src_y, src_u, src_v, dst_argb, width);
+    dst_argb += dst_stride_argb;
+    src_y += src_stride_y;
+    src_u += src_stride_u;
+    src_v += src_stride_v;
+  }
+  return 0;
+}
 // Convert I422 to ARGB.
 int I422ToARGB(const uint8* src_y, int src_stride_y,
               const uint8* src_u, int src_stride_u,
@@ -214,30 +273,32 @@ int I422ToARGB(const uint8* src_y, int src_stride_y,
    dst_argb = dst_argb + (height - 1) * dst_stride_argb;
    dst_stride_argb = -dst_stride_argb;
  }
-  void (*I420ToARGBRow)(const uint8* y_buf,
+  void (*I422ToARGBRow)(const uint8* y_buf,
                        const uint8* u_buf,
                        const uint8* v_buf,
                        uint8* rgb_buf,
-                        int width) = I420ToARGBRow_C;
+                        int width) = I422ToARGBRow_C;
-#if defined(HAS_I420TOARGBROW_NEON)
+#if defined(HAS_I422TOARGBROW_NEON)
  if (TestCpuFlag(kCpuHasNEON)) {
-    I420ToARGBRow = I420ToARGBRow_Any_NEON;
+    I422ToARGBRow = I422ToARGBRow_Any_NEON;
    if (IS_ALIGNED(width, 16)) {
-      I420ToARGBRow = I420ToARGBRow_NEON;
+      I422ToARGBRow = I422ToARGBRow_NEON;
    }
  }
-#elif defined(HAS_I420TOARGBROW_SSSE3)
+#elif defined(HAS_I422TOARGBROW_SSSE3)
  if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
-    I420ToARGBRow = I420ToARGBRow_Any_SSSE3;
+    I422ToARGBRow = I422ToARGBRow_Any_SSSE3;
-    if (IS_ALIGNED(width, 8) &&
+    if (IS_ALIGNED(width, 8)) {
-        IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+      I422ToARGBRow = I422ToARGBRow_Unaligned_SSSE3;
-      I420ToARGBRow = I420ToARGBRow_SSSE3;
+      if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+        I422ToARGBRow = I422ToARGBRow_SSSE3;
+      }
    }
  }
 #endif
  for (int y = 0; y < height; ++y) {
-    I420ToARGBRow(src_y, src_u, src_v, dst_argb, width);
+    I422ToARGBRow(src_y, src_u, src_v, dst_argb, width);
    dst_argb += dst_stride_argb;
    src_y += src_stride_y;
    src_u += src_stride_u;
@@ -246,8 +307,8 @@ int I422ToARGB(const uint8* src_y, int src_stride_y,
  return 0;
 }
-// Convert I444 to ARGB.
+// Convert I411 to ARGB.
-int I444ToARGB(const uint8* src_y, int src_stride_y,
+int I411ToARGB(const uint8* src_y, int src_stride_y,
               const uint8* src_u, int src_stride_u,
               const uint8* src_v, int src_stride_v,
               uint8* dst_argb, int dst_stride_argb,
@@ -258,21 +319,25 @@ int I444ToARGB(const uint8* src_y, int src_stride_y,
    dst_argb = dst_argb + (height - 1) * dst_stride_argb;
    dst_stride_argb = -dst_stride_argb;
  }
-  void (*I444ToARGBRow)(const uint8* y_buf,
+  void (*I411ToARGBRow)(const uint8* y_buf,
                        const uint8* u_buf,
                        const uint8* v_buf,
                        uint8* rgb_buf,
-                        int width) = I444ToARGBRow_C;
+                        int width) = I411ToARGBRow_C;
-#if defined(HAS_I444TOARGBROW_SSSE3)
+#if defined(HAS_I411TOARGBROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3) &&
+  if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
-      IS_ALIGNED(width, 8) &&
+    I411ToARGBRow = I411ToARGBRow_Any_SSSE3;
-      IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+    if (IS_ALIGNED(width, 8)) {
-    I444ToARGBRow = I444ToARGBRow_SSSE3;
+      I411ToARGBRow = I411ToARGBRow_Unaligned_SSSE3;
+      if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+        I411ToARGBRow = I411ToARGBRow_SSSE3;
+      }
+    }
  }
 #endif
  for (int y = 0; y < height; ++y) {
-    I444ToARGBRow(src_y, src_u, src_v, dst_argb, width);
+    I411ToARGBRow(src_y, src_u, src_v, dst_argb, width);
    dst_argb += dst_stride_argb;
    src_y += src_stride_y;
    src_u += src_stride_u;
@@ -281,6 +346,7 @@ int I444ToARGB(const uint8* src_y, int src_stride_y,
  return 0;
 }
 // Convert I400 to ARGB.
 int I400ToARGB_Reference(const uint8* src_y, int src_stride_y,
                         uint8* dst_argb, int dst_stride_argb,
@@ -724,24 +790,24 @@ int NV12ToARGB(const uint8* src_y, int src_stride_y,
    dst_argb = dst_argb + (height - 1) * dst_stride_argb;
    dst_stride_argb = -dst_stride_argb;
  }
-  void (*I420ToARGBRow)(const uint8* y_buf,
+  void (*I422ToARGBRow)(const uint8* y_buf,
                        const uint8* u_buf,
                        const uint8* v_buf,
                        uint8* argb_buf,
-                        int width) = I420ToARGBRow_C;
+                        int width) = I422ToARGBRow_C;
-#if defined(HAS_I420TOARGBROW_NEON)
+#if defined(HAS_I422TOARGBROW_NEON)
  if (TestCpuFlag(kCpuHasNEON)) {
-    I420ToARGBRow = I420ToARGBRow_Any_NEON;
+    I422ToARGBRow = I422ToARGBRow_Any_NEON;
    if (IS_ALIGNED(width, 16)) {
-      I420ToARGBRow = I420ToARGBRow_NEON;
+      I422ToARGBRow = I422ToARGBRow_NEON;
    }
  }
-#elif defined(HAS_I420TOARGBROW_SSSE3)
+#elif defined(HAS_I422TOARGBROW_SSSE3)
  if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
-    I420ToARGBRow = I420ToARGBRow_Any_SSSE3;
+    I422ToARGBRow = I422ToARGBRow_Any_SSSE3;
    if (IS_ALIGNED(width, 8) &&
        IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
-      I420ToARGBRow = I420ToARGBRow_SSSE3;
+      I422ToARGBRow = I422ToARGBRow_SSSE3;
    }
  }
 #endif
@@ -766,7 +832,7 @@ int NV12ToARGB(const uint8* src_y, int src_stride_y,
      SplitUV(src_uv, rowuv, rowuv + kMaxStride, halfwidth);
      src_uv += src_stride_uv;
    }
-    I420ToARGBRow(src_y, rowuv, rowuv + kMaxStride, dst_argb, width);
+    I422ToARGBRow(src_y, rowuv, rowuv + kMaxStride, dst_argb, width);
    dst_argb += dst_stride_argb;
    src_y += src_stride_y;
  }
@@ -803,24 +869,24 @@ int YUY2ToARGB(const uint8* src_yuy2, int src_stride_yuy2,
    }
  }
 #endif
-  void (*I420ToARGBRow)(const uint8* y_buf,
+  void (*I422ToARGBRow)(const uint8* y_buf,
                        const uint8* u_buf,
                        const uint8* v_buf,
                        uint8* argb_buf,
-                        int width) = I420ToARGBRow_C;
+                        int width) = I422ToARGBRow_C;
-#if defined(HAS_I420TOARGBROW_NEON)
+#if defined(HAS_I422TOARGBROW_NEON)
  if (TestCpuFlag(kCpuHasNEON)) {
-    I420ToARGBRow = I420ToARGBRow_Any_NEON;
+    I422ToARGBRow = I422ToARGBRow_Any_NEON;
    if (IS_ALIGNED(width, 16)) {
-      I420ToARGBRow = I420ToARGBRow_NEON;
+      I422ToARGBRow = I422ToARGBRow_NEON;
    }
  }
-#elif defined(HAS_I420TOARGBROW_SSSE3)
+#elif defined(HAS_I422TOARGBROW_SSSE3)
  if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
-    I420ToARGBRow = I420ToARGBRow_Any_SSSE3;
+    I422ToARGBRow = I422ToARGBRow_Any_SSSE3;
    if (IS_ALIGNED(width, 8) &&
        IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
-      I420ToARGBRow = I420ToARGBRow_SSSE3;
+      I422ToARGBRow = I422ToARGBRow_SSSE3;
    }
  }
 #endif
@@ -832,7 +898,7 @@ int YUY2ToARGB(const uint8* src_yuy2, int src_stride_yuy2,
  for (int y = 0; y < height; ++y) {
    YUY2ToUVRow(src_yuy2, src_stride_yuy2, rowu, rowv, width);
    YUY2ToYRow(src_yuy2, rowy, width);
-    I420ToARGBRow(rowy, rowu, rowv, dst_argb, width);
+    I422ToARGBRow(rowy, rowu, rowv, dst_argb, width);
    src_yuy2 += src_stride_yuy2;
    dst_argb += dst_stride_argb;
  }
@@ -869,24 +935,24 @@ int UYVYToARGB(const uint8* src_uyvy, int src_stride_uyvy,
    }
  }
 #endif
-  void (*I420ToARGBRow)(const uint8* y_buf,
+  void (*I422ToARGBRow)(const uint8* y_buf,
                        const uint8* u_buf,
                        const uint8* v_buf,
                        uint8* argb_buf,
-                        int width) = I420ToARGBRow_C;
+                        int width) = I422ToARGBRow_C;
-#if defined(HAS_I420TOARGBROW_NEON)
+#if defined(HAS_I422TOARGBROW_NEON)
  if (TestCpuFlag(kCpuHasNEON)) {
-    I420ToARGBRow = I420ToARGBRow_Any_NEON;
+    I422ToARGBRow = I422ToARGBRow_Any_NEON;
    if (IS_ALIGNED(width, 16)) {
-      I420ToARGBRow = I420ToARGBRow_NEON;
+      I422ToARGBRow = I422ToARGBRow_NEON;
    }
  }
-#elif defined(HAS_I420TOARGBROW_SSSE3)
+#elif defined(HAS_I422TOARGBROW_SSSE3)
  if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
-    I420ToARGBRow = I420ToARGBRow_Any_SSSE3;
+    I422ToARGBRow = I422ToARGBRow_Any_SSSE3;
    if (IS_ALIGNED(width, 8) &&
        IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
-      I420ToARGBRow = I420ToARGBRow_SSSE3;
+      I422ToARGBRow = I422ToARGBRow_SSSE3;
    }
  }
 #endif
@@ -898,7 +964,7 @@ int UYVYToARGB(const uint8* src_uyvy, int src_stride_uyvy,
  for (int y = 0; y < height; ++y) {
    UYVYToUVRow(src_uyvy, src_stride_uyvy, rowu, rowv, width);
    UYVYToYRow(src_uyvy, rowy, width);
-    I420ToARGBRow(rowy, rowu, rowv, dst_argb, width);
+    I422ToARGBRow(rowy, rowu, rowv, dst_argb, width);
    src_uyvy += src_stride_uyvy;
    dst_argb += dst_stride_argb;
  }
@@ -916,18 +982,18 @@ int NV12ToRGB565(const uint8* src_y, int src_stride_y,
    dst_rgb = dst_rgb + (height - 1) * dst_stride_rgb;
    dst_stride_rgb = -dst_stride_rgb;
  }
-  void (*I420ToARGBRow)(const uint8* y_buf,
+  void (*I422ToARGBRow)(const uint8* y_buf,
                        const uint8* u_buf,
                        const uint8* v_buf,
                        uint8* rgb_buf,
-                        int width) = I420ToARGBRow_C;
+                        int width) = I422ToARGBRow_C;
-#if defined(HAS_I420TOARGBROW_NEON)
+#if defined(HAS_I422TOARGBROW_NEON)
  if (TestCpuFlag(kCpuHasNEON)) {
-    I420ToARGBRow = I420ToARGBRow_NEON;
+    I422ToARGBRow = I422ToARGBRow_NEON;
  }
-#elif defined(HAS_I420TOARGBROW_SSSE3)
+#elif defined(HAS_I422TOARGBROW_SSSE3)
  if (TestCpuFlag(kCpuHasSSSE3)) {
-    I420ToARGBRow = I420ToARGBRow_SSSE3;
+    I422ToARGBRow = I422ToARGBRow_SSSE3;
  }
 #endif
  SIMD_ALIGNED(uint8 row[kMaxStride]);
@@ -960,7 +1026,7 @@ int NV12ToRGB565(const uint8* src_y, int src_stride_y,
      SplitUV(src_uv, rowuv, rowuv + kMaxStride, halfwidth);
      src_uv += src_stride_uv;
    }
-    I420ToARGBRow(src_y, rowuv, rowuv + kMaxStride, row, width);
+    I422ToARGBRow(src_y, rowuv, rowuv + kMaxStride, row, width);
    ARGBToRGB565Row(row, dst_rgb, width);
    dst_rgb += dst_stride_rgb;
    src_y += src_stride_y;

--- a/source/row.h
+++ b/source/row.h
@@ -30,7 +30,7 @@ extern "C" {
 #define LIBYUV_SSSE3_ONLY
 #endif
-// The following are available on all x86 platforms
+// The following are available on all x86 platforms:
 #if !defined(YUV_DISABLE_ASM) && \
    (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__))
 #define HAS_ABGRTOARGBROW_SSSE3
@@ -55,10 +55,11 @@ extern "C" {
 #define HAS_COPYROW_SSE2
 #define HAS_COPYROW_X86
 #define HAS_I400TOARGBROW_SSE2
-#define HAS_I420TOABGRROW_SSSE3
+#define HAS_I422TOABGRROW_SSSE3
-#define HAS_I420TOARGBROW_SSSE3
+#define HAS_I422TOARGBROW_SSSE3
-#define HAS_I420TOBGRAROW_SSSE3
+#define HAS_I422TOBGRAROW_SSSE3
 #define HAS_I444TOARGBROW_SSSE3
+#define HAS_I411TOARGBROW_SSSE3
 #define HAS_MIRRORROW_SSSE3
 #define HAS_MIRRORROWUV_SSSE3
 #define HAS_ADDROW_SSE2
@@ -75,7 +76,7 @@ extern "C" {
 #define HAS_ARGBSEPIAROW_SSSE3
 #endif
-// The following are available only useful when SSSE3 is unavailable.
+// The following are disabled when SSSE3 is available:
 #if !defined(YUV_DISABLE_ASM) && \
    (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__)) && \
    !defined(LIBYUV_SSSE3_ONLY)
@@ -91,9 +92,9 @@ extern "C" {
 #define HAS_MIRRORROWUV_NEON
 #define HAS_SPLITUV_NEON
 #define HAS_COPYROW_NEON
-#define HAS_I420TOARGBROW_NEON
+#define HAS_I422TOARGBROW_NEON
-#define HAS_I420TOBGRAROW_NEON
+#define HAS_I422TOBGRAROW_NEON
-#define HAS_I420TOABGRROW_NEON
+#define HAS_I422TOABGRROW_NEON
 #endif
 #if defined(_MSC_VER)
@@ -118,17 +119,17 @@ typedef uint32 __attribute__((vector_size(16))) uvec32;
 #define OMITFP __attribute__((optimize("omit-frame-pointer")))
 #endif
-void I420ToARGBRow_NEON(const uint8* y_buf,
+void I422ToARGBRow_NEON(const uint8* y_buf,
                        const uint8* u_buf,
                        const uint8* v_buf,
                        uint8* rgb_buf,
                        int width);
-void I420ToBGRARow_NEON(const uint8* y_buf,
+void I422ToBGRARow_NEON(const uint8* y_buf,
                        const uint8* u_buf,
                        const uint8* v_buf,
                        uint8* rgb_buf,
                        int width);
-void I420ToABGRRow_NEON(const uint8* y_buf,
+void I422ToABGRRow_NEON(const uint8* y_buf,
                        const uint8* u_buf,
                        const uint8* v_buf,
                        uint8* rgb_buf,
@@ -219,19 +220,19 @@ void ARGBToARGB4444Row_C(const uint8* src_argb, uint8* dst_rgb, int pix);
 void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix);
 void I400ToARGBRow_C(const uint8* src_y, uint8* dst_argb, int pix);
-void I420ToARGBRow_C(const uint8* y_buf,
+void I422ToARGBRow_C(const uint8* y_buf,
                     const uint8* u_buf,
                     const uint8* v_buf,
                     uint8* rgb_buf,
                     int width);
-void I420ToBGRARow_C(const uint8* y_buf,
+void I422ToBGRARow_C(const uint8* y_buf,
                     const uint8* u_buf,
                     const uint8* v_buf,
                     uint8* rgb_buf,
                     int width);
-void I420ToABGRRow_C(const uint8* y_buf,
+void I422ToABGRRow_C(const uint8* y_buf,
                     const uint8* u_buf,
                     const uint8* v_buf,
                     uint8* rgb_buf,
@@ -243,54 +244,78 @@ void I444ToARGBRow_C(const uint8* y_buf,
                     uint8* rgb_buf,
                     int width);
+void I411ToARGBRow_C(const uint8* y_buf,
+                     const uint8* u_buf,
+                     const uint8* v_buf,
+                     uint8* rgb_buf,
+                     int width);
 void YToARGBRow_C(const uint8* y_buf,
                  uint8* rgb_buf,
                  int width);
-void I420ToARGBRow_SSSE3(const uint8* y_buf,
+void I444ToARGBRow_SSSE3(const uint8* y_buf,
                         const uint8* u_buf,
                         const uint8* v_buf,
-                         uint8* rgb_buf,
+                         uint8* argb_buf,
                         int width);
-void I420ToBGRARow_SSSE3(const uint8* y_buf,
+void I422ToARGBRow_SSSE3(const uint8* y_buf,
                         const uint8* u_buf,
                         const uint8* v_buf,
-                         uint8* rgb_buf,
+                         uint8* argb_buf,
                         int width);
-void I420ToABGRRow_SSSE3(const uint8* y_buf,
+void I411ToARGBRow_SSSE3(const uint8* y_buf,
                         const uint8* u_buf,
                         const uint8* v_buf,
-                         uint8* rgb_buf,
+                         uint8* argb_buf,
                         int width);
-void I420ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
+void I422ToBGRARow_SSSE3(const uint8* y_buf,
                         const uint8* u_buf,
                         const uint8* v_buf,
-                                   uint8* rgb_buf,
+                         uint8* bgra_buf,
                         int width);
-void I420ToBGRARow_Unaligned_SSSE3(const uint8* y_buf,
+void I422ToABGRRow_SSSE3(const uint8* y_buf,
                         const uint8* u_buf,
                         const uint8* v_buf,
-                                   uint8* rgb_buf,
+                         uint8* abgr_buf,
                         int width);
-void I420ToABGRRow_Unaligned_SSSE3(const uint8* y_buf,
+void I444ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
                                   const uint8* u_buf,
                                   const uint8* v_buf,
-                                   uint8* rgb_buf,
+                                   uint8* argb_buf,
                                   int width);
-void I444ToARGBRow_SSSE3(const uint8* y_buf,
+void I422ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
                                   const uint8* u_buf,
                                   const uint8* v_buf,
-                         uint8* rgb_buf,
+                                   uint8* argb_buf,
+                                   int width);
+void I411ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
+                                   const uint8* u_buf,
+                                   const uint8* v_buf,
+                                   uint8* argb_buf,
+                                   int width);
+void I422ToBGRARow_Unaligned_SSSE3(const uint8* y_buf,
+                                   const uint8* u_buf,
+                                   const uint8* v_buf,
+                                   uint8* bgra_buf,
+                                   int width);
+void I422ToABGRRow_Unaligned_SSSE3(const uint8* y_buf,
+                                   const uint8* u_buf,
+                                   const uint8* v_buf,
+                                   uint8* abgr_buf,
                                   int width);
 void YToARGBRow_SSE2(const uint8* y_buf,
-                     uint8* rgb_buf,
+                     uint8* argb_buf,
                     int width);
 // ARGB preattenuated alpha blend.
@@ -310,24 +335,37 @@ void ARGBBlendRow_C(const uint8* src_argb0, const uint8* src_argb1,
                    uint8* dst_argb, int width);
 // 'Any' functions handle any size and alignment.
-void I420ToARGBRow_Any_SSSE3(const uint8* y_buf,
+void I444ToARGBRow_Any_SSSE3(const uint8* y_buf,
                             const uint8* u_buf,
                             const uint8* v_buf,
                             uint8* rgb_buf,
                             int width);
-void I420ToBGRARow_Any_SSSE3(const uint8* y_buf,
+void I422ToARGBRow_Any_SSSE3(const uint8* y_buf,
                             const uint8* u_buf,
                             const uint8* v_buf,
                             uint8* rgb_buf,
                             int width);
-void I420ToABGRRow_Any_SSSE3(const uint8* y_buf,
+void I411ToARGBRow_Any_SSSE3(const uint8* y_buf,
                             const uint8* u_buf,
                             const uint8* v_buf,
                             uint8* rgb_buf,
                             int width);
+void I422ToBGRARow_Any_SSSE3(const uint8* y_buf,
+                             const uint8* u_buf,
+                             const uint8* v_buf,
+                             uint8* rgb_buf,
+                             int width);
+void I422ToABGRRow_Any_SSSE3(const uint8* y_buf,
+                             const uint8* u_buf,
+                             const uint8* v_buf,
+                             uint8* rgb_buf,
+                             int width);
 void ARGBToRGB24Row_Any_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix);
 void ARGBToRAWRow_Any_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix);
 void ARGBToRGB565Row_Any_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix);
@@ -344,19 +382,19 @@ void BGRAToUVRow_Any_SSSE3(const uint8* src_argb0, int src_stride_argb,
 void ABGRToUVRow_Any_SSSE3(const uint8* src_argb0, int src_stride_argb,
                           uint8* dst_u, uint8* dst_v, int width);
-void I420ToARGBRow_Any_NEON(const uint8* y_buf,
+void I422ToARGBRow_Any_NEON(const uint8* y_buf,
                            const uint8* u_buf,
                            const uint8* v_buf,
                            uint8* rgb_buf,
                            int width);
-void I420ToBGRARow_Any_NEON(const uint8* y_buf,
+void I422ToBGRARow_Any_NEON(const uint8* y_buf,
                            const uint8* u_buf,
                            const uint8* v_buf,
                            uint8* rgb_buf,
                            int width);
-void I420ToABGRRow_Any_NEON(const uint8* y_buf,
+void I422ToABGRRow_Any_NEON(const uint8* y_buf,
                            const uint8* u_buf,
                            const uint8* v_buf,
                            uint8* rgb_buf,

--- a/source/row_common.cc
+++ b/source/row_common.cc
@@ -359,7 +359,8 @@ static __inline void YuvPixel(uint8 y, uint8 u, uint8 v, uint8* rgb_buf,
                                        (255u << ashift);
 }
-void I420ToARGBRow_C(const uint8* y_buf,
+// Also used for I420 rows, which have the same one-U/V-per-2-Y row layout.
+void I422ToARGBRow_C(const uint8* y_buf,
                     const uint8* u_buf,
                     const uint8* v_buf,
                     uint8* rgb_buf,
@@ -377,7 +378,7 @@ void I420ToARGBRow_C(const uint8* y_buf,
  }
 }
-void I420ToBGRARow_C(const uint8* y_buf,
+void I422ToBGRARow_C(const uint8* y_buf,
                     const uint8* u_buf,
                     const uint8* v_buf,
                     uint8* rgb_buf,
@@ -395,7 +396,7 @@ void I420ToBGRARow_C(const uint8* y_buf,
  }
 }
-void I420ToABGRRow_C(const uint8* y_buf,
+void I422ToABGRRow_C(const uint8* y_buf,
                     const uint8* u_buf,
                     const uint8* v_buf,
                     uint8* rgb_buf,
@@ -427,6 +428,32 @@ void I444ToARGBRow_C(const uint8* y_buf,
  }
 }
+void I411ToARGBRow_C(const uint8* y_buf,  // One Y sample per pixel.
+                     const uint8* u_buf,  // One U sample per 4 pixels.
+                     const uint8* v_buf,  // One V sample per 4 pixels.
+                     uint8* rgb_buf,  // Output; advanced 4 bytes per pixel.
+                     int width) {  // Number of pixels to convert.
+  for (int x = 0; x < width - 3; x += 4) {  // Full groups of 4 pixels.
+    YuvPixel(y_buf[0], u_buf[0], v_buf[0], rgb_buf + 0, 24, 16, 8, 0);
+    YuvPixel(y_buf[1], u_buf[0], v_buf[0], rgb_buf + 4, 24, 16, 8, 0);
+    YuvPixel(y_buf[2], u_buf[0], v_buf[0], rgb_buf + 8, 24, 16, 8, 0);
+    YuvPixel(y_buf[3], u_buf[0], v_buf[0], rgb_buf + 12, 24, 16, 8, 0);
+    y_buf += 4;
+    u_buf += 1;  // The 4 pixels above all shared this one U/V pair.
+    v_buf += 1;
+    rgb_buf += 16;  // Advance 4 pixels.
+  }
+  if (width & 2) {  // 2 or 3 pixels remain: convert two of them.
+    YuvPixel(y_buf[0], u_buf[0], v_buf[0], rgb_buf + 0, 24, 16, 8, 0);
+    YuvPixel(y_buf[1], u_buf[0], v_buf[0], rgb_buf + 4, 24, 16, 8, 0);
+    y_buf += 2;  // u_buf/v_buf are not advanced; a final pixel reuses them.
+    rgb_buf += 8;  // Advance 2 pixels.
+  }
+  if (width & 1) {  // Odd width: convert the last pixel.
+    YuvPixel(y_buf[0], u_buf[0], v_buf[0], rgb_buf + 0, 24, 16, 8, 0);
+  }
+}
 void YToARGBRow_C(const uint8* y_buf, uint8* rgb_buf, int width) {
  for (int x = 0; x < width; ++x) {
    YuvPixel(y_buf[0], 128, 128, rgb_buf, 24, 16, 8, 0);
@@ -686,8 +713,8 @@ void ARGBBlendRow_Any_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
 }
 #endif  // HAS_ARGBBLENDROW_SSSE3
-// Wrappers to handle odd sizes/alignments
+// Wrappers to handle odd width
-#define YUVANY(NAMEANY, I420TORGB_SSE, I420TORGB_C)                            \
+#define YANY(NAMEANY, I420TORGB_SSE, I420TORGB_C, UV_SHIFT)                    \
    void NAMEANY(const uint8* y_buf,                                           \
                 const uint8* u_buf,                                           \
                 const uint8* v_buf,                                           \
@@ -696,22 +723,24 @@ void ARGBBlendRow_Any_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
      int n = width & ~7;                                                      \
      I420TORGB_SSE(y_buf, u_buf, v_buf, rgb_buf, n);                          \
      I420TORGB_C(y_buf + n,                                                   \
-                   u_buf + (n >> 1),                                           \
+                  u_buf + (n >> UV_SHIFT),                                     \
-                   v_buf + (n >> 1),                                           \
+                  v_buf + (n >> UV_SHIFT),                                     \
                  rgb_buf + n * 4, width & 7);                                 \
    }
-#if defined(HAS_I420TOARGBROW_SSSE3)
+#if defined(HAS_I422TOARGBROW_SSSE3)
-YUVANY(I420ToARGBRow_Any_SSSE3, I420ToARGBRow_Unaligned_SSSE3, I420ToARGBRow_C)
+YANY(I444ToARGBRow_Any_SSSE3, I444ToARGBRow_Unaligned_SSSE3, I444ToARGBRow_C, 0)
-YUVANY(I420ToBGRARow_Any_SSSE3, I420ToBGRARow_Unaligned_SSSE3, I420ToBGRARow_C)
+YANY(I422ToARGBRow_Any_SSSE3, I422ToARGBRow_Unaligned_SSSE3, I422ToARGBRow_C, 1)
-YUVANY(I420ToABGRRow_Any_SSSE3, I420ToABGRRow_Unaligned_SSSE3, I420ToABGRRow_C)
+YANY(I411ToARGBRow_Any_SSSE3, I411ToARGBRow_Unaligned_SSSE3, I411ToARGBRow_C, 2)
+YANY(I422ToBGRARow_Any_SSSE3, I422ToBGRARow_Unaligned_SSSE3, I422ToBGRARow_C, 1)
+YANY(I422ToABGRRow_Any_SSSE3, I422ToABGRRow_Unaligned_SSSE3, I422ToABGRRow_C, 1)
 #endif
-#if defined(HAS_I420TOARGBROW_NEON)
+#if defined(HAS_I422TOARGBROW_NEON)
+// YANY takes UV_SHIFT as its 4th argument; I422 rows hold one U/V sample per
+// 2 Y samples, so the remainder's chroma offset is n >> 1.
-YUVANY(I420ToARGBRow_Any_NEON, I420ToARGBRow_NEON, I420ToARGBRow_C)
+YANY(I422ToARGBRow_Any_NEON, I422ToARGBRow_NEON, I422ToARGBRow_C, 1)
-YUVANY(I420ToBGRARow_Any_NEON, I420ToBGRARow_NEON, I420ToBGRARow_C)
+YANY(I422ToBGRARow_Any_NEON, I422ToBGRARow_NEON, I422ToBGRARow_C, 1)
-YUVANY(I420ToABGRRow_Any_NEON, I420ToABGRRow_NEON, I420ToABGRRow_C)
+YANY(I422ToABGRRow_Any_NEON, I422ToABGRRow_NEON, I422ToABGRRow_C, 1)
 #endif
-#undef YUVANY
+#undef YANY
 #define RGBANY(NAMEANY, ARGBTORGB, BPP)                                        \
    void NAMEANY(const uint8* argb_buf,                                        \

--- a/source/row_neon.cc
+++ b/source/row_neon.cc
@@ -18,7 +18,7 @@ extern "C" {
 // This module is for GCC Neon
 #if !defined(YUV_DISABLE_ASM) && defined(__ARM_NEON__)
-#define YUVTORGB                                                               \
+#define YUV422TORGB                                                            \
    "vld1.u8    {d0}, [%0]!                    \n"                             \
    "vld1.u32   {d2[0]}, [%1]!                 \n"                             \
    "vld1.u32   {d2[1]}, [%2]!                 \n"                             \
@@ -46,17 +46,17 @@ extern "C" {
    "vtrn.u8    d22, d23                       \n"                             \
    "vtrn.u8    d16, d17                       \n"                             \
-#if defined(HAS_I420TOARGBROW_NEON) || \
+#if defined(HAS_I422TOARGBROW_NEON) || \
-    defined(HAS_I420TOBGRAROW_NEON) || \
+    defined(HAS_I422TOBGRAROW_NEON) || \
-    defined(HAS_I420TOABGRROW_NEON)
+    defined(HAS_I422TOABGRROW_NEON)
 static const vec8 kUVToRB  = { 127, 127, 127, 127, 102, 102, 102, 102,
                               0, 0, 0, 0, 0, 0, 0, 0 };
 static const vec8 kUVToG = { -25, -25, -25, -25, -52, -52, -52, -52,
                             0, 0, 0, 0, 0, 0, 0, 0 };
 #endif
-#ifdef HAS_I420TOARGBROW_NEON
+#ifdef HAS_I422TOARGBROW_NEON
-void I420ToARGBRow_NEON(const uint8* y_buf,
+void I422ToARGBRow_NEON(const uint8* y_buf,
                        const uint8* u_buf,
                        const uint8* v_buf,
                        uint8* rgb_buf,
@@ -68,7 +68,7 @@ void I420ToARGBRow_NEON(const uint8* y_buf,
    "vmov.u16   q14, #74                       \n"
    "vmov.u16   q15, #16                       \n"
  "1:                                          \n"
-YUVTORGB
+    YUV422TORGB
    "vmov.u8    d21, d16                       \n"
    "vmov.u8    d23, #255                      \n"
    "vst4.u8    {d20, d21, d22, d23}, [%3]!    \n"
@@ -85,10 +85,10 @@ YUVTORGB
                      "q10", "q11", "q12", "q13", "q14", "q15"
  );
 }
-#endif
+#endif  // HAS_I422TOARGBROW_NEON
-#ifdef HAS_I420TOBGRAROW_NEON
+#ifdef HAS_I422TOBGRAROW_NEON
-void I420ToBGRARow_NEON(const uint8* y_buf,
+void I422ToBGRARow_NEON(const uint8* y_buf,
                        const uint8* u_buf,
                        const uint8* v_buf,
                        uint8* rgb_buf,
@@ -100,7 +100,7 @@ void I420ToBGRARow_NEON(const uint8* y_buf,
    "vmov.u16   q14, #74                       \n"
    "vmov.u16   q15, #16                       \n"
  "1:                                          \n"
-YUVTORGB
+    YUV422TORGB
    "vswp.u8    d20, d22                       \n"
    "vmov.u8    d21, d16                       \n"
    "vmov.u8    d19, #255                      \n"
@@ -118,10 +118,10 @@ YUVTORGB
                      "q10", "q11", "q12", "q13", "q14", "q15"
  );
 }
-#endif
+#endif  // HAS_I422TOBGRAROW_NEON
-#ifdef HAS_I420TOABGRROW_NEON
+#ifdef HAS_I422TOABGRROW_NEON
-void I420ToABGRRow_NEON(const uint8* y_buf,
+void I422ToABGRRow_NEON(const uint8* y_buf,
                        const uint8* u_buf,
                        const uint8* v_buf,
                        uint8* rgb_buf,
@@ -133,7 +133,7 @@ void I420ToABGRRow_NEON(const uint8* y_buf,
    "vmov.u16   q14, #74                       \n"
    "vmov.u16   q15, #16                       \n"
  "1:                                          \n"
-YUVTORGB
+    YUV422TORGB
    "vswp.u8    d20, d22                       \n"
    "vmov.u8    d21, d16                       \n"
    "vmov.u8    d23, #255                      \n"
@@ -151,7 +151,7 @@ YUVTORGB
                      "q10", "q11", "q12", "q13", "q14", "q15"
  );
 }
-#endif
+#endif  // HAS_I422TOABGRROW_NEON
 #ifdef HAS_SPLITUV_NEON
 // Reads 16 pairs of UV and write even values to dst_u and odd to dst_v
@@ -172,7 +172,7 @@ void SplitUV_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int width) {
    : "memory", "cc", "q0", "q1" // Clobber List
  );
 }
-#endif
+#endif  // HAS_SPLITUV_NEON
 #ifdef HAS_COPYROW_NEON
 // Copy multiple of 64
@@ -266,7 +266,7 @@ void MirrorRow_NEON(const uint8* src, uint8* dst, int width) {
    : "memory", "cc", "r3", "q0"
  );
 }
-#endif
+#endif  // HAS_MIRRORROW_NEON
 #ifdef HAS_MIRRORROWUV_NEON
 void MirrorRowUV_NEON(const uint8* src, uint8* dst_a, uint8* dst_b, int width) {
@@ -325,7 +325,7 @@ void MirrorRowUV_NEON(const uint8* src, uint8* dst_a, uint8* dst_b, int width) {
    : "memory", "cc", "r12", "q0"
  );
 }
-#endif
+#endif  // HAS_MIRRORROWUV_NEON
 #endif  // __ARM_NEON__

--- a/source/row_posix.cc
+++ b/source/row_posix.cc
@@ -1215,7 +1215,7 @@ void ABGRToUVRow_Unaligned_SSSE3(const uint8* src_abgr0, int src_stride_abgr,
 #endif  // HAS_ARGBTOYROW_SSSE3
-#ifdef HAS_I420TOARGBROW_SSSE3
+#ifdef HAS_I422TOARGBROW_SSSE3
 #define UB 127 /* min(63,static_cast<int8>(2.018 * 64)) */
 #define UG -25 /* static_cast<int8>(-0.391 * 64 - 0.5) */
 #define UR 0
@@ -1251,8 +1251,37 @@ struct {
  { YG, YG, YG, YG, YG, YG, YG, YG }
 };
-// Convert 8 pixels
+// Convert 8 pixels: reads 8 U, 8 V and 8 Y bytes (no chroma subsampling).
-#define YUVTORGB                                                               \
+#define YUV444TORGB                                                            \
+    "movq       (%1),%%xmm0                    \n"                             \
+    "movq       (%1,%2,1),%%xmm1               \n"                             \
+    "lea        0x8(%1),%1                     \n"                             \
+    "punpcklbw  %%xmm1,%%xmm0                  \n"                             \
+    "movdqa     %%xmm0,%%xmm1                  \n"                             \
+    "movdqa     %%xmm0,%%xmm2                  \n"                             \
+    "pmaddubsw  (%5),%%xmm0                    \n"                             \
+    "pmaddubsw  16(%5),%%xmm1                  \n"                             \
+    "pmaddubsw  32(%5),%%xmm2                  \n"                             \
+    "psubw      48(%5),%%xmm0                  \n"                             \
+    "psubw      64(%5),%%xmm1                  \n"                             \
+    "psubw      80(%5),%%xmm2                  \n"                             \
+    "movq       (%0),%%xmm3                    \n"                             \
+    "lea        0x8(%0),%0                     \n"                             \
+    "punpcklbw  %%xmm4,%%xmm3                  \n"                             \
+    "psubsw     96(%5),%%xmm3                  \n"                             \
+    "pmullw     112(%5),%%xmm3                 \n"                             \
+    "paddsw     %%xmm3,%%xmm0                  \n"                             \
+    "paddsw     %%xmm3,%%xmm1                  \n"                             \
+    "paddsw     %%xmm3,%%xmm2                  \n"                             \
+    "psraw      $0x6,%%xmm0                    \n"                             \
+    "psraw      $0x6,%%xmm1                    \n"                             \
+    "psraw      $0x6,%%xmm2                    \n"                             \
+    "packuswb   %%xmm0,%%xmm0                  \n"                             \
+    "packuswb   %%xmm1,%%xmm1                  \n"                             \
+    "packuswb   %%xmm2,%%xmm2                  \n"
+// Convert 8 pixels: 4 UV and 8 Y
+#define YUV422TORGB                                                            \
    "movd       (%1),%%xmm0                    \n"                             \
    "movd       (%1,%2,1),%%xmm1               \n"                             \
    "lea        0x4(%1),%1                     \n"                             \
@@ -1281,10 +1310,41 @@ struct {
    "packuswb   %%xmm1,%%xmm1                  \n"                             \
    "packuswb   %%xmm2,%%xmm2                  \n"
-void OMITFP I420ToARGBRow_SSSE3(const uint8* y_buf,
+// Convert 8 pixels: 2 UV and 8 Y
+// Expects %0 = Y pointer, %1 = U pointer, %2 = V offset from U, %5 = constants
+// table, and xmm4 = 0.  Leaves three packed channel results in xmm0/xmm1/xmm2.
+// Only 2 bytes of U and 2 bytes of V are consumed per expansion, so they are
+// loaded with 16-bit pinsrw rather than 32-bit movd: a movd would read 2 bytes
+// past the end of each chroma row on the last group of 8 pixels.  The upper
+// lanes that pinsrw leaves untouched are discarded by the unpacks that follow,
+// which replicate only the low two UV pairs across the register.
+#define YUV411TORGB                                                            \
+    "pinsrw     $0x0,(%1),%%xmm0               \n"                             \
+    "pinsrw     $0x0,(%1,%2,1),%%xmm1          \n"                             \
+    "lea        0x2(%1),%1                     \n"                             \
+    "punpcklbw  %%xmm1,%%xmm0                  \n"                             \
+    "punpcklwd  %%xmm0,%%xmm0                  \n"                             \
+    "punpckldq  %%xmm0,%%xmm0                  \n"                             \
+    "movdqa     %%xmm0,%%xmm1                  \n"                             \
+    "movdqa     %%xmm0,%%xmm2                  \n"                             \
+    "pmaddubsw  (%5),%%xmm0                    \n"                             \
+    "pmaddubsw  16(%5),%%xmm1                  \n"                             \
+    "pmaddubsw  32(%5),%%xmm2                  \n"                             \
+    "psubw      48(%5),%%xmm0                  \n"                             \
+    "psubw      64(%5),%%xmm1                  \n"                             \
+    "psubw      80(%5),%%xmm2                  \n"                             \
+    "movq       (%0),%%xmm3                    \n"                             \
+    "lea        0x8(%0),%0                     \n"                             \
+    "punpcklbw  %%xmm4,%%xmm3                  \n"                             \
+    "psubsw     96(%5),%%xmm3                  \n"                             \
+    "pmullw     112(%5),%%xmm3                 \n"                             \
+    "paddsw     %%xmm3,%%xmm0                  \n"                             \
+    "paddsw     %%xmm3,%%xmm1                  \n"                             \
+    "paddsw     %%xmm3,%%xmm2                  \n"                             \
+    "psraw      $0x6,%%xmm0                    \n"                             \
+    "psraw      $0x6,%%xmm1                    \n"                             \
+    "psraw      $0x6,%%xmm2                    \n"                             \
+    "packuswb   %%xmm0,%%xmm0                  \n"                             \
+    "packuswb   %%xmm1,%%xmm1                  \n"                             \
+    "packuswb   %%xmm2,%%xmm2                  \n"
+void OMITFP I444ToARGBRow_SSSE3(const uint8* y_buf,
                                const uint8* u_buf,
                                const uint8* v_buf,
-                                uint8* rgb_buf,
+                                uint8* argb_buf,
                                int width) {
  asm volatile (
    "sub       %1,%2                           \n"
@@ -1292,7 +1352,7 @@ void OMITFP I420ToARGBRow_SSSE3(const uint8* y_buf,
    "pxor      %%xmm4,%%xmm4                   \n"
    ".p2align  4                               \n"
  "1:                                          \n"
-    YUVTORGB
+    YUV444TORGB
    "punpcklbw %%xmm1,%%xmm0                   \n"
    "punpcklbw %%xmm5,%%xmm2                   \n"
    "movdqa    %%xmm0,%%xmm1                   \n"
@@ -1306,7 +1366,7 @@ void OMITFP I420ToARGBRow_SSSE3(const uint8* y_buf,
  : "+r"(y_buf),    // %0
    "+r"(u_buf),    // %1
    "+r"(v_buf),    // %2
-    "+r"(rgb_buf),  // %3
+    "+r"(argb_buf),  // %3
    "+rm"(width)    // %4
  : "r"(&kYuvConstants.kUVToB) // %5
  : "memory", "cc"
@@ -1316,10 +1376,10 @@ void OMITFP I420ToARGBRow_SSSE3(const uint8* y_buf,
  );
 }
-void OMITFP I420ToBGRARow_SSSE3(const uint8* y_buf,
+void OMITFP I422ToARGBRow_SSSE3(const uint8* y_buf,
                                const uint8* u_buf,
                                const uint8* v_buf,
-                                uint8* rgb_buf,
+                                uint8* argb_buf,
                                int width) {
  asm volatile (
    "sub       %1,%2                           \n"
@@ -1327,22 +1387,21 @@ void OMITFP I420ToBGRARow_SSSE3(const uint8* y_buf,
    "pxor      %%xmm4,%%xmm4                   \n"
    ".p2align  4                               \n"
  "1:                                          \n"
-    YUVTORGB
+    YUV422TORGB
-    "pcmpeqb   %%xmm5,%%xmm5                   \n"
+    "punpcklbw %%xmm1,%%xmm0                   \n"
-    "punpcklbw %%xmm0,%%xmm1                   \n"
+    "punpcklbw %%xmm5,%%xmm2                   \n"
-    "punpcklbw %%xmm2,%%xmm5                   \n"
+    "movdqa    %%xmm0,%%xmm1                   \n"
-    "movdqa    %%xmm5,%%xmm0                   \n"
+    "punpcklwd %%xmm2,%%xmm0                   \n"
-    "punpcklwd %%xmm1,%%xmm5                   \n"
+    "punpckhwd %%xmm2,%%xmm1                   \n"
-    "punpckhwd %%xmm1,%%xmm0                   \n"
+    "movdqa    %%xmm0,(%3)                     \n"
-    "movdqa    %%xmm5,(%3)                     \n"
+    "movdqa    %%xmm1,0x10(%3)                 \n"
-    "movdqa    %%xmm0,0x10(%3)                 \n"
    "lea       0x20(%3),%3                     \n"
    "sub       $0x8,%4                         \n"
    "jg        1b                              \n"
  : "+r"(y_buf),    // %0
    "+r"(u_buf),    // %1
    "+r"(v_buf),    // %2
-    "+r"(rgb_buf),  // %3
+    "+r"(argb_buf),  // %3
    "+rm"(width)    // %4
  : "r"(&kYuvConstants.kUVToB) // %5
  : "memory", "cc"
@@ -1352,10 +1411,10 @@ void OMITFP I420ToBGRARow_SSSE3(const uint8* y_buf,
  );
 }
-void OMITFP I420ToABGRRow_SSSE3(const uint8* y_buf,
+void OMITFP I411ToARGBRow_SSSE3(const uint8* y_buf,
                                const uint8* u_buf,
                                const uint8* v_buf,
-                                uint8* rgb_buf,
+                                uint8* argb_buf,
                                int width) {
  asm volatile (
    "sub       %1,%2                           \n"
@@ -1363,13 +1422,13 @@ void OMITFP I420ToABGRRow_SSSE3(const uint8* y_buf,
    "pxor      %%xmm4,%%xmm4                   \n"
    ".p2align  4                               \n"
  "1:                                          \n"
-    YUVTORGB
+    YUV411TORGB
-    "punpcklbw %%xmm1,%%xmm2                   \n"
+    "punpcklbw %%xmm1,%%xmm0                   \n"
-    "punpcklbw %%xmm5,%%xmm0                   \n"
+    "punpcklbw %%xmm5,%%xmm2                   \n"
-    "movdqa    %%xmm2,%%xmm1                   \n"
+    "movdqa    %%xmm0,%%xmm1                   \n"
-    "punpcklwd %%xmm0,%%xmm2                   \n"
+    "punpcklwd %%xmm2,%%xmm0                   \n"
-    "punpckhwd %%xmm0,%%xmm1                   \n"
+    "punpckhwd %%xmm2,%%xmm1                   \n"
-    "movdqa    %%xmm2,(%3)                     \n"
+    "movdqa    %%xmm0,(%3)                     \n"
    "movdqa    %%xmm1,0x10(%3)                 \n"
    "lea       0x20(%3),%3                     \n"
    "sub       $0x8,%4                         \n"
@@ -1377,7 +1436,7 @@ void OMITFP I420ToABGRRow_SSSE3(const uint8* y_buf,
  : "+r"(y_buf),    // %0
    "+r"(u_buf),    // %1
    "+r"(v_buf),    // %2
-    "+r"(rgb_buf),  // %3
+    "+r"(argb_buf),  // %3
    "+rm"(width)    // %4
  : "r"(&kYuvConstants.kUVToB) // %5
  : "memory", "cc"
@@ -1387,10 +1446,10 @@ void OMITFP I420ToABGRRow_SSSE3(const uint8* y_buf,
  );
 }
-void OMITFP I420ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
+void OMITFP I444ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
                                          const uint8* u_buf,
                                          const uint8* v_buf,
-                                          uint8* rgb_buf,
+                                          uint8* argb_buf,
                                          int width) {
  asm volatile (
    "sub       %1,%2                           \n"
@@ -1398,7 +1457,7 @@ void OMITFP I420ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
    "pxor      %%xmm4,%%xmm4                   \n"
    ".p2align  4                               \n"
  "1:                                          \n"
-    YUVTORGB
+    YUV444TORGB
    "punpcklbw %%xmm1,%%xmm0                   \n"
    "punpcklbw %%xmm5,%%xmm2                   \n"
    "movdqa    %%xmm0,%%xmm1                   \n"
@@ -1412,7 +1471,7 @@ void OMITFP I420ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
  : "+r"(y_buf),    // %0
    "+r"(u_buf),    // %1
    "+r"(v_buf),    // %2
-    "+r"(rgb_buf),  // %3
+    "+r"(argb_buf),  // %3
    "+rm"(width)    // %4
  : "r"(&kYuvConstants.kUVToB) // %5
  : "memory", "cc"
@@ -1422,10 +1481,10 @@ void OMITFP I420ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
  );
 }
-void OMITFP I420ToBGRARow_Unaligned_SSSE3(const uint8* y_buf,
+void OMITFP I422ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
                                          const uint8* u_buf,
                                          const uint8* v_buf,
-                                          uint8* rgb_buf,
+                                          uint8* argb_buf,
                                          int width) {
  asm volatile (
    "sub       %1,%2                           \n"
@@ -1433,22 +1492,92 @@ void OMITFP I420ToBGRARow_Unaligned_SSSE3(const uint8* y_buf,
    "pxor      %%xmm4,%%xmm4                   \n"
    ".p2align  4                               \n"
  "1:                                          \n"
-    YUVTORGB
+    YUV422TORGB
+    "punpcklbw %%xmm1,%%xmm0                   \n"
+    "punpcklbw %%xmm5,%%xmm2                   \n"
+    "movdqa    %%xmm0,%%xmm1                   \n"
+    "punpcklwd %%xmm2,%%xmm0                   \n"
+    "punpckhwd %%xmm2,%%xmm1                   \n"
+    "movdqu    %%xmm0,(%3)                     \n"
+    "movdqu    %%xmm1,0x10(%3)                 \n"
+    "lea       0x20(%3),%3                     \n"
+    "sub       $0x8,%4                         \n"
+    "jg        1b                              \n"
+  : "+r"(y_buf),    // %0
+    "+r"(u_buf),    // %1
+    "+r"(v_buf),    // %2
+    "+r"(argb_buf),  // %3
+    "+rm"(width)    // %4
+  : "r"(&kYuvConstants.kUVToB) // %5
+  : "memory", "cc"
+#if defined(__SSE2__)
+    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+#endif
+  );
+}
+void OMITFP I411ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,  // I411 row to ARGB; stores use movdqu.
+                                          const uint8* u_buf,
+                                          const uint8* v_buf,
+                                          uint8* argb_buf,
+                                          int width) {  // Consumed 8 pixels per loop iteration.
+  asm volatile (
+    "sub       %1,%2                           \n"  // %2 = v_buf - u_buf, so (%1,%2,1) addresses V.
+    "pcmpeqb   %%xmm5,%%xmm5                   \n"  // xmm5 = all ones (0xff in every byte).
+    "pxor      %%xmm4,%%xmm4                   \n"  // xmm4 = 0; the macro uses it to widen Y bytes.
+    ".p2align  4                               \n"
+  "1:                                          \n"
+    YUV411TORGB  // Leaves packed channel bytes in xmm0, xmm1, xmm2.
+    "punpcklbw %%xmm1,%%xmm0                   \n"  // Interleave xmm0 and xmm1 bytes.
+    "punpcklbw %%xmm5,%%xmm2                   \n"  // Interleave xmm2 bytes with 0xff.
+    "movdqa    %%xmm0,%%xmm1                   \n"
+    "punpcklwd %%xmm2,%%xmm0                   \n"  // First 4 output pixels.
+    "punpckhwd %%xmm2,%%xmm1                   \n"  // Next 4 output pixels.
+    "movdqu    %%xmm0,(%3)                     \n"  // Unaligned 16-byte store.
+    "movdqu    %%xmm1,0x10(%3)                 \n"
+    "lea       0x20(%3),%3                     \n"  // Advance output by 32 bytes.
+    "sub       $0x8,%4                         \n"  // 8 pixels done.
+    "jg        1b                              \n"  // Loop while width remains > 0.
+  : "+r"(y_buf),    // %0
+    "+r"(u_buf),    // %1
+    "+r"(v_buf),    // %2
+    "+r"(argb_buf),  // %3
+    "+rm"(width)    // %4
+  : "r"(&kYuvConstants.kUVToB) // %5
+  : "memory", "cc"
+#if defined(__SSE2__)
+    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+#endif
+  );
+}
+void OMITFP I422ToBGRARow_SSSE3(const uint8* y_buf,
+                                const uint8* u_buf,
+                                const uint8* v_buf,
+                                uint8* bgra_buf,
+                                int width) {
+  asm volatile (
+    "sub       %1,%2                           \n"
+    "pcmpeqb   %%xmm5,%%xmm5                   \n"
+    "pxor      %%xmm4,%%xmm4                   \n"
+    ".p2align  4                               \n"
+  "1:                                          \n"
+    YUV422TORGB
    "pcmpeqb   %%xmm5,%%xmm5                   \n"
    "punpcklbw %%xmm0,%%xmm1                   \n"
    "punpcklbw %%xmm2,%%xmm5                   \n"
    "movdqa    %%xmm5,%%xmm0                   \n"
    "punpcklwd %%xmm1,%%xmm5                   \n"
    "punpckhwd %%xmm1,%%xmm0                   \n"
-    "movdqu    %%xmm5,(%3)                     \n"
+    "movdqa    %%xmm5,(%3)                     \n"
-    "movdqu    %%xmm0,0x10(%3)                 \n"
+    "movdqa    %%xmm0,0x10(%3)                 \n"
    "lea       0x20(%3),%3                     \n"
    "sub       $0x8,%4                         \n"
    "jg        1b                              \n"
  : "+r"(y_buf),    // %0
    "+r"(u_buf),    // %1
    "+r"(v_buf),    // %2
-    "+r"(rgb_buf),  // %3
+    "+r"(bgra_buf),  // %3
    "+rm"(width)    // %4
  : "r"(&kYuvConstants.kUVToB) // %5
  : "memory", "cc"
@@ -1458,10 +1587,10 @@ void OMITFP I420ToBGRARow_Unaligned_SSSE3(const uint8* y_buf,
  );
 }
-void OMITFP I420ToABGRRow_Unaligned_SSSE3(const uint8* y_buf,
+void OMITFP I422ToABGRRow_SSSE3(const uint8* y_buf,
                                const uint8* u_buf,
                                const uint8* v_buf,
-                                          uint8* rgb_buf,
+                                uint8* abgr_buf,
                                int width) {
  asm volatile (
    "sub       %1,%2                           \n"
@@ -1469,21 +1598,21 @@ void OMITFP I420ToABGRRow_Unaligned_SSSE3(const uint8* y_buf,
    "pxor      %%xmm4,%%xmm4                   \n"
    ".p2align  4                               \n"
  "1:                                          \n"
-    YUVTORGB
+    YUV422TORGB
    "punpcklbw %%xmm1,%%xmm2                   \n"
    "punpcklbw %%xmm5,%%xmm0                   \n"
    "movdqa    %%xmm2,%%xmm1                   \n"
    "punpcklwd %%xmm0,%%xmm2                   \n"
    "punpckhwd %%xmm0,%%xmm1                   \n"
-    "movdqu    %%xmm2,(%3)                     \n"
+    "movdqa    %%xmm2,(%3)                     \n"
-    "movdqu    %%xmm1,0x10(%3)                 \n"
+    "movdqa    %%xmm1,0x10(%3)                 \n"
    "lea       0x20(%3),%3                     \n"
    "sub       $0x8,%4                         \n"
    "jg        1b                              \n"
  : "+r"(y_buf),    // %0
    "+r"(u_buf),    // %1
    "+r"(v_buf),    // %2
-    "+r"(rgb_buf),  // %3
+    "+r"(abgr_buf),  // %3
    "+rm"(width)    // %4
  : "r"(&kYuvConstants.kUVToB) // %5
  : "memory", "cc"
@@ -1493,10 +1622,10 @@ void OMITFP I420ToABGRRow_Unaligned_SSSE3(const uint8* y_buf,
  );
 }
-void OMITFP I444ToARGBRow_SSSE3(const uint8* y_buf,
+void OMITFP I422ToBGRARow_Unaligned_SSSE3(const uint8* y_buf,
                                          const uint8* u_buf,
                                          const uint8* v_buf,
-                                uint8* rgb_buf,
+                                          uint8* bgra_buf,  // stored with movdqu: any alignment
                                          int width) {
  asm volatile (
    "sub       %1,%2                           \n"
@@ -1504,43 +1633,22 @@ void OMITFP I444ToARGBRow_SSSE3(const uint8* y_buf,
    "pxor      %%xmm4,%%xmm4                   \n"
    ".p2align  4                               \n"
  "1:                                          \n"
-    "movd      (%1),%%xmm0                     \n"
+    YUV422TORGB
-    "movd      (%1,%2,1),%%xmm1                \n"
+    "pcmpeqb   %%xmm5,%%xmm5                   \n"  // all-ones; redone every pass because the weave below overwrites xmm5
-    "lea       0x4(%1),%1                      \n"
+    "punpcklbw %%xmm0,%%xmm1                   \n"
-    "punpcklbw %%xmm1,%%xmm0                   \n"
+    "punpcklbw %%xmm2,%%xmm5                   \n"
-    "movdqa    %%xmm0,%%xmm1                   \n"
+    "movdqa    %%xmm5,%%xmm0                   \n"
-    "movdqa    %%xmm0,%%xmm2                   \n"
+    "punpcklwd %%xmm1,%%xmm5                   \n"
-    "pmaddubsw (%5),%%xmm0                     \n"
+    "punpckhwd %%xmm1,%%xmm0                   \n"
-    "pmaddubsw 16(%5),%%xmm1                   \n"
+    "movdqu    %%xmm5,(%3)                     \n"  // unaligned store, bytes 0..15
-    "pmaddubsw 32(%5),%%xmm2                   \n"
+    "movdqu    %%xmm0,0x10(%3)                 \n"  // unaligned store, bytes 16..31
-    "psubw     48(%5),%%xmm0                   \n"
+    "lea       0x20(%3),%3                     \n"  // dest += 32 bytes
-    "psubw     64(%5),%%xmm1                   \n"
+    "sub       $0x8,%4                         \n"  // width -= 8; flags drive the jg below
-    "psubw     80(%5),%%xmm2                   \n"
-    "movd      (%0),%%xmm3                     \n"
-    "lea       0x4(%0),%0                      \n"
-    "punpcklbw %%xmm4,%%xmm3                   \n"
-    "psubsw    96(%5),%%xmm3                   \n"
-    "pmullw    112(%5),%%xmm3                  \n"
-    "paddsw    %%xmm3,%%xmm0                   \n"
-    "paddsw    %%xmm3,%%xmm1                   \n"
-    "paddsw    %%xmm3,%%xmm2                   \n"
-    "psraw     $0x6,%%xmm0                     \n"
-    "psraw     $0x6,%%xmm1                     \n"
-    "psraw     $0x6,%%xmm2                     \n"
-    "packuswb  %%xmm0,%%xmm0                   \n"
-    "packuswb  %%xmm1,%%xmm1                   \n"
-    "packuswb  %%xmm2,%%xmm2                   \n"
-    "punpcklbw %%xmm1,%%xmm0                   \n"
-    "punpcklbw %%xmm5,%%xmm2                   \n"
-    "punpcklwd %%xmm2,%%xmm0                   \n"
-    "sub       $0x4,%4                         \n"
-    "movdqa    %%xmm0,(%3)                     \n"
-    "lea       0x10(%3),%3                     \n"
    "jg        1b                              \n"
  : "+r"(y_buf),    // %0
    "+r"(u_buf),    // %1
    "+r"(v_buf),    // %2
-    "+r"(rgb_buf),  // %3
+    "+r"(bgra_buf),  // %3
    "+rm"(width)    // %4
  : "r"(&kYuvConstants.kUVToB) // %5
  : "memory", "cc"
@@ -1549,7 +1657,43 @@ void OMITFP I444ToARGBRow_SSSE3(const uint8* y_buf,
 #endif
  );
 }
+void OMITFP I422ToABGRRow_Unaligned_SSSE3(const uint8* y_buf,  // unaligned-destination variant
+                                          const uint8* u_buf,
+                                          const uint8* v_buf,
+                                          uint8* abgr_buf,  // stored with movdqu: any alignment
+                                          int width) {  // consumed 8 pixels per loop iteration
+  asm volatile (
+    "sub       %1,%2                           \n"  // %2 becomes v_buf - u_buf
+    "pcmpeqb   %%xmm5,%%xmm5                   \n"  // xmm5 = all ones (0xff bytes)
+    "pxor      %%xmm4,%%xmm4                   \n"  // xmm4 = 0
+    ".p2align  4                               \n"
+  "1:                                          \n"
+    YUV422TORGB  // defined outside this view; presumably leaves B,G,R in xmm0,xmm1,xmm2 as in row_win.cc - confirm
+    "punpcklbw %%xmm1,%%xmm2                   \n"
+    "punpcklbw %%xmm5,%%xmm0                   \n"  // interleave with the 0xff bytes from xmm5
+    "movdqa    %%xmm2,%%xmm1                   \n"  // register copy, no memory access
+    "punpcklwd %%xmm0,%%xmm2                   \n"
+    "punpckhwd %%xmm0,%%xmm1                   \n"
+    "movdqu    %%xmm2,(%3)                     \n"  // unaligned store, bytes 0..15
+    "movdqu    %%xmm1,0x10(%3)                 \n"  // unaligned store, bytes 16..31
+    "lea       0x20(%3),%3                     \n"  // dest += 32 bytes
+    "sub       $0x8,%4                         \n"  // width -= 8
+    "jg        1b                              \n"  // loop while width > 0
+  : "+r"(y_buf),    // %0
+    "+r"(u_buf),    // %1
+    "+r"(v_buf),    // %2
+    "+r"(abgr_buf),  // %3
+    "+rm"(width)    // %4
+  : "r"(&kYuvConstants.kUVToB) // %5
+  : "memory", "cc"
+#if defined(__SSE2__)
+    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
 #endif
+  );
+}
+#endif  // HAS_I422TOARGBROW_SSSE3
 #ifdef HAS_YTOARGBROW_SSE2
 void YToARGBRow_SSE2(const uint8* y_buf,

--- a/source/row_win.cc
+++ b/source/row_win.cc
@@ -1200,7 +1200,7 @@ __asm {
  }
 }
-#ifdef HAS_I420TOARGBROW_SSSE3
+#ifdef HAS_I422TOARGBROW_SSSE3
 #define YG 74 /* static_cast<int8>(1.164 * 64 + 0.5) */
@@ -1235,7 +1235,42 @@ static const vec16 kUVBiasB = { BB, BB, BB, BB, BB, BB, BB, BB };
 static const vec16 kUVBiasG = { BG, BG, BG, BG, BG, BG, BG, BG };
 static const vec16 kUVBiasR = { BR, BR, BR, BR, BR, BR, BR, BR };
-#define YUVTORGB __asm {                                                       \
+// TODO(fbarchard): NV12/NV21 fetch UV and use directly.
+// Convert 8 pixels: 8 UV and 8 Y.  In: eax=Y, esi=U, esi+edi=V (both pointers advance by 8); xmm4 is unpacked against Y so callers zero it.  Out: B/G/R bytes in xmm0/xmm1/xmm2.
+#define YUV444TORGB __asm {                                                    \
+    /* Step 1: Find 8 UV contributions to 8 R,G,B values */                    \
+    __asm movq       xmm0, qword ptr [esi]          /* U */                    \
+    __asm movq       xmm1, qword ptr [esi + edi]    /* V */                    \
+    __asm lea        esi,  [esi + 8]                                           \
+    __asm punpcklbw  xmm0, xmm1           /* UV */                             \
+    __asm movdqa     xmm1, xmm0                                                \
+    __asm movdqa     xmm2, xmm0                                                \
+    __asm pmaddubsw  xmm0, kUVToB        /* scale B UV */                      \
+    __asm pmaddubsw  xmm1, kUVToG        /* scale G UV */                      \
+    __asm pmaddubsw  xmm2, kUVToR        /* scale R UV */                      \
+    __asm psubw      xmm0, kUVBiasB      /* unbias back to signed */           \
+    __asm psubw      xmm1, kUVBiasG                                            \
+    __asm psubw      xmm2, kUVBiasR                                            \
+    /* Step 2: Find Y contribution to 8 R,G,B values */                        \
+    __asm movq       xmm3, qword ptr [eax]                        /* NOLINT */ \
+    __asm lea        eax, [eax + 8]                                            \
+    __asm punpcklbw  xmm3, xmm4                                                \
+    __asm psubsw     xmm3, kYSub16                                             \
+    __asm pmullw     xmm3, kYToRgb                                             \
+    __asm paddsw     xmm0, xmm3           /* B += Y */                         \
+    __asm paddsw     xmm1, xmm3           /* G += Y */                         \
+    __asm paddsw     xmm2, xmm3           /* R += Y */                         \
+    __asm psraw      xmm0, 6                                                   \
+    __asm psraw      xmm1, 6                                                   \
+    __asm psraw      xmm2, 6                                                   \
+    __asm packuswb   xmm0, xmm0           /* B */                              \
+    __asm packuswb   xmm1, xmm1           /* G */                              \
+    __asm packuswb   xmm2, xmm2           /* R */                              \
+  }
+// Convert 8 pixels: 4 UV and 8 Y
+#define YUV422TORGB __asm {                                                    \
    /* Step 1: Find 4 UV contributions to 8 R,G,B values */                    \
    __asm movd       xmm0, [esi]          /* U */                              \
    __asm movd       xmm1, [esi + edi]    /* V */                              \
@@ -1267,11 +1302,47 @@ static const vec16 kUVBiasR = { BR, BR, BR, BR, BR, BR, BR, BR };
    __asm packuswb   xmm2, xmm2           /* R */                              \
  }
+// Convert 8 pixels: 2 UV and 8 Y.  In: eax=Y, esi=U, esi+edi=V (esi advances by 2, eax by 8); xmm4 is unpacked against Y so callers zero it.  Out: B/G/R bytes in xmm0/xmm1/xmm2.
+#define YUV411TORGB __asm {                                                    \
+    /* Step 1: Find 2 UV contributions to 8 R,G,B values */                    \
+    __asm movd       xmm0, [esi]          /* U: 4 bytes read, 2 used */        \
+    __asm movd       xmm1, [esi + edi]    /* V: 4 bytes read, 2 used */        \
+    __asm lea        esi,  [esi + 2]                                           \
+    __asm punpcklbw  xmm0, xmm1           /* UV */                             \
+    __asm punpcklwd  xmm0, xmm0           /* UVUV (upsample x2) */             \
+    __asm punpckldq  xmm0, xmm0           /* UVUVUVUV (upsample x4) */         \
+    __asm movdqa     xmm1, xmm0                                                \
+    __asm movdqa     xmm2, xmm0                                                \
+    __asm pmaddubsw  xmm0, kUVToB        /* scale B UV */                      \
+    __asm pmaddubsw  xmm1, kUVToG        /* scale G UV */                      \
+    __asm pmaddubsw  xmm2, kUVToR        /* scale R UV */                      \
+    __asm psubw      xmm0, kUVBiasB      /* unbias back to signed */           \
+    __asm psubw      xmm1, kUVBiasG                                            \
+    __asm psubw      xmm2, kUVBiasR                                            \
+    /* Step 2: Find Y contribution to 8 R,G,B values */                        \
+    __asm movq       xmm3, qword ptr [eax]                        /* NOLINT */ \
+    __asm lea        eax, [eax + 8]                                            \
+    __asm punpcklbw  xmm3, xmm4                                                \
+    __asm psubsw     xmm3, kYSub16                                             \
+    __asm pmullw     xmm3, kYToRgb                                             \
+    __asm paddsw     xmm0, xmm3           /* B += Y */                         \
+    __asm paddsw     xmm1, xmm3           /* G += Y */                         \
+    __asm paddsw     xmm2, xmm3           /* R += Y */                         \
+    __asm psraw      xmm0, 6                                                   \
+    __asm psraw      xmm1, 6                                                   \
+    __asm psraw      xmm2, 6                                                   \
+    __asm packuswb   xmm0, xmm0           /* B */                              \
+    __asm packuswb   xmm1, xmm1           /* G */                              \
+    __asm packuswb   xmm2, xmm2           /* R */                              \
+  }
+// 8 pixels per loop iteration, dest aligned 16.
+// 8 UV values, mixed with 8 Y producing 8 ARGB (32 bytes)
 __declspec(naked) __declspec(align(16))
-void I420ToARGBRow_SSSE3(const uint8* y_buf,
+void I444ToARGBRow_SSSE3(const uint8* y_buf,
                         const uint8* u_buf,
                         const uint8* v_buf,
-                         uint8* rgb_buf,
+                         uint8* argb_buf,
                         int width) {
  __asm {
    push       esi
@@ -1279,7 +1350,7 @@ void I420ToARGBRow_SSSE3(const uint8* y_buf,
    mov        eax, [esp + 8 + 4]   // Y
    mov        esi, [esp + 8 + 8]   // U
    mov        edi, [esp + 8 + 12]  // V
-    mov        edx, [esp + 8 + 16]  // rgb
+    mov        edx, [esp + 8 + 16]  // argb (destination row)
    mov        ecx, [esp + 8 + 20]  // width
    sub        edi, esi
    pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
@@ -1287,7 +1358,7 @@ void I420ToARGBRow_SSSE3(const uint8* y_buf,
    align      16
 convertloop:
-    YUVTORGB
+    YUV444TORGB                     // B, G, R bytes -> xmm0, xmm1, xmm2; advances eax and esi by 8
    // Step 3: Weave into ARGB
    punpcklbw  xmm0, xmm1           // BG
@@ -1307,11 +1378,13 @@ void I420ToARGBRow_SSSE3(const uint8* y_buf,
  }
 }
+// 8 pixels per loop iteration, dest aligned 16.
+// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes)
 __declspec(naked) __declspec(align(16))
-void I420ToBGRARow_SSSE3(const uint8* y_buf,
+void I422ToARGBRow_SSSE3(const uint8* y_buf,
                         const uint8* u_buf,
                         const uint8* v_buf,
-                         uint8* rgb_buf,
+                         uint8* argb_buf,
                         int width) {
  __asm {
    push       esi
@@ -1319,24 +1392,67 @@ void I420ToBGRARow_SSSE3(const uint8* y_buf,
    mov        eax, [esp + 8 + 4]   // Y
    mov        esi, [esp + 8 + 8]   // U
    mov        edi, [esp + 8 + 12]  // V
-    mov        edx, [esp + 8 + 16]  // rgb
+    mov        edx, [esp + 8 + 16]  // argb (destination row)
    mov        ecx, [esp + 8 + 20]  // width
    sub        edi, esi
+    pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
    pxor       xmm4, xmm4
    align      16
 convertloop:
-    YUVTORGB
+    YUV422TORGB                     // B, G, R bytes -> xmm0, xmm1, xmm2
-    // Step 3: Weave into BGRA
+    // Step 3: Weave into ARGB
+    punpcklbw  xmm0, xmm1           // BG
+    punpcklbw  xmm2, xmm5           // RA
+    movdqa     xmm1, xmm0           // register copy so both halves can be woven
+    punpcklwd  xmm0, xmm2           // BGRA first 4 pixels
+    punpckhwd  xmm1, xmm2           // BGRA next 4 pixels
+    movdqa     [edx], xmm0          // aligned store: dest must be 16-byte aligned
+    movdqa     [edx + 16], xmm1     // aligned store, second 16 bytes
+    lea        edx,  [edx + 32]     // dest += 32 bytes
+    sub        ecx, 8               // width -= 8
+    jg         convertloop          // loop while width > 0
+    pop        edi
+    pop        esi
+    ret
+  }
+}
+// 8 pixels per loop iteration, dest aligned 16.
+// 2 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes)
+// Similar to I420 but duplicate UV once more.
+__declspec(naked) __declspec(align(16))
+void I411ToARGBRow_SSSE3(const uint8* y_buf,
+                         const uint8* u_buf,
+                         const uint8* v_buf,
+                         uint8* argb_buf,
+                         int width) {
+  __asm {
+    push       esi
+    push       edi
+    mov        eax, [esp + 8 + 4]   // Y
+    mov        esi, [esp + 8 + 8]   // U
+    mov        edi, [esp + 8 + 12]  // V
+    mov        edx, [esp + 8 + 16]  // argb (destination row)
+    mov        ecx, [esp + 8 + 20]  // width
+    sub        edi, esi             // edi = V - U so the macro can address V as [esi + edi]
    pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
-    punpcklbw  xmm1, xmm0           // GB
+    pxor       xmm4, xmm4           // zero, unpacked against Y bytes inside the macro
-    punpcklbw  xmm5, xmm2           // AR
-    movdqa     xmm0, xmm5
+    align      16
-    punpcklwd  xmm5, xmm1           // BGRA first 4 pixels
+ convertloop:
-    punpckhwd  xmm0, xmm1           // BGRA next 4 pixels
+    YUV411TORGB                     // B, G, R bytes -> xmm0, xmm1, xmm2; esi advances by 2, eax by 8
-    movdqa     [edx], xmm5
-    movdqa     [edx + 16], xmm0
+    // Step 3: Weave into ARGB
+    punpcklbw  xmm0, xmm1           // BG
+    punpcklbw  xmm2, xmm5           // RA
+    movdqa     xmm1, xmm0           // register copy so both halves can be woven
+    punpcklwd  xmm0, xmm2           // BGRA first 4 pixels
+    punpckhwd  xmm1, xmm2           // BGRA next 4 pixels
+    movdqa     [edx], xmm0          // aligned store: dest must be 16-byte aligned
+    movdqa     [edx + 16], xmm1     // aligned store, second 16 bytes
    lea        edx,  [edx + 32]
    sub        ecx, 8
    jg         convertloop
@@ -1347,11 +1463,13 @@ void I420ToBGRARow_SSSE3(const uint8* y_buf,
  }
 }
+// 8 pixels per loop iteration; dest may have any alignment (movdqu stores).
+// 8 UV values, mixed with 8 Y producing 8 ARGB (32 bytes)
 __declspec(naked) __declspec(align(16))
-void I420ToABGRRow_SSSE3(const uint8* y_buf,
+void I444ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
                                   const uint8* u_buf,
                                   const uint8* v_buf,
-                         uint8* rgb_buf,
+                                   uint8* argb_buf,
                                   int width) {
  __asm {
    push       esi
@@ -1359,7 +1477,7 @@ void I420ToABGRRow_SSSE3(const uint8* y_buf,
    mov        eax, [esp + 8 + 4]   // Y
    mov        esi, [esp + 8 + 8]   // U
    mov        edi, [esp + 8 + 12]  // V
-    mov        edx, [esp + 8 + 16]  // rgb
+    mov        edx, [esp + 8 + 16]  // argb (destination row)
    mov        ecx, [esp + 8 + 20]  // width
    sub        edi, esi
    pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
@@ -1367,15 +1485,15 @@ void I420ToABGRRow_SSSE3(const uint8* y_buf,
    align      16
 convertloop:
-    YUVTORGB
+    YUV444TORGB                     // B, G, R bytes -> xmm0, xmm1, xmm2
    // Step 3: Weave into ARGB
-    punpcklbw  xmm2, xmm1           // RG
+    punpcklbw  xmm0, xmm1           // BG
-    punpcklbw  xmm0, xmm5           // BA
+    punpcklbw  xmm2, xmm5           // RA
-    movdqa     xmm1, xmm2
+    movdqa     xmm1, xmm0           // register copy, no alignment requirement
-    punpcklwd  xmm2, xmm0           // RGBA first 4 pixels
+    punpcklwd  xmm0, xmm2           // BGRA first 4 pixels
-    punpckhwd  xmm1, xmm0           // RGBA next 4 pixels
+    punpckhwd  xmm1, xmm2           // BGRA next 4 pixels
-    movdqa     [edx], xmm2
+    movdqu     [edx], xmm0          // unaligned store: movdqa would fault on unaligned dest
-    movdqa     [edx + 16], xmm1
+    movdqu     [edx + 16], xmm1     // unaligned store, second 16 bytes
    lea        edx,  [edx + 32]
    sub        ecx, 8
@@ -1387,11 +1505,13 @@ void I420ToABGRRow_SSSE3(const uint8* y_buf,
  }
 }
+// 8 pixels per loop iteration; dest may have any alignment (movdqu stores).
+// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes)
 __declspec(naked) __declspec(align(16))
-void I420ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
+void I422ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
                                   const uint8* u_buf,
                                   const uint8* v_buf,
-                                   uint8* rgb_buf,
+                                   uint8* argb_buf,
                                   int width) {
  __asm {
    push       esi
@@ -1399,7 +1519,7 @@ void I420ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
    mov        eax, [esp + 8 + 4]   // Y
    mov        esi, [esp + 8 + 8]   // U
    mov        edi, [esp + 8 + 12]  // V
-    mov        edx, [esp + 8 + 16]  // rgb
+    mov        edx, [esp + 8 + 16]  // argb (destination row)
    mov        ecx, [esp + 8 + 20]  // width
    sub        edi, esi
    pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
@@ -1407,7 +1527,7 @@ void I420ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
    align      16
 convertloop:
-    YUVTORGB
+    YUV422TORGB                     // B, G, R bytes -> xmm0, xmm1, xmm2
    // Step 3: Weave into ARGB
    punpcklbw  xmm0, xmm1           // BG
@@ -1415,8 +1535,8 @@ void I420ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
    movdqa     xmm1, xmm0
    punpcklwd  xmm0, xmm2           // BGRA first 4 pixels
    punpckhwd  xmm1, xmm2           // BGRA next 4 pixels
-    movdqu     [edx], xmm0
+    movdqu     [edx], xmm0          // unaligned store: movdqa would fault on unaligned dest
-    movdqu     [edx + 16], xmm1
+    movdqu     [edx + 16], xmm1     // unaligned store, second 16 bytes
    lea        edx,  [edx + 32]
    sub        ecx, 8
    jg         convertloop
@@ -1427,11 +1547,14 @@ void I420ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
  }
 }
+// 8 pixels per loop iteration; dest may have any alignment (movdqu stores).
+// 2 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes)
+// Similar to I420 but duplicate UV once more.
 __declspec(naked) __declspec(align(16))
-void I420ToBGRARow_Unaligned_SSSE3(const uint8* y_buf,
+void I411ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
                                   const uint8* u_buf,
                                   const uint8* v_buf,
-                                   uint8* rgb_buf,
+                                   uint8* argb_buf,
                                   int width) {
  __asm {
    push       esi
@@ -1439,14 +1562,54 @@ void I420ToBGRARow_Unaligned_SSSE3(const uint8* y_buf,
    mov        eax, [esp + 8 + 4]   // Y
    mov        esi, [esp + 8 + 8]   // U
    mov        edi, [esp + 8 + 12]  // V
-    mov        edx, [esp + 8 + 16]  // rgb
+    mov        edx, [esp + 8 + 16]  // argb (destination row)
    mov        ecx, [esp + 8 + 20]  // width
    sub        edi, esi
+    pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
    pxor       xmm4, xmm4
    align      16
 convertloop:
-    YUVTORGB
+    YUV411TORGB                     // B, G, R bytes -> xmm0, xmm1, xmm2
+    // Step 3: Weave into ARGB
+    punpcklbw  xmm0, xmm1           // BG
+    punpcklbw  xmm2, xmm5           // RA
+    movdqa     xmm1, xmm0           // register copy, no alignment requirement
+    punpcklwd  xmm0, xmm2           // BGRA first 4 pixels
+    punpckhwd  xmm1, xmm2           // BGRA next 4 pixels
+    movdqu     [edx], xmm0          // unaligned store: movdqa would fault on unaligned dest
+    movdqu     [edx + 16], xmm1     // unaligned store, second 16 bytes
+    lea        edx,  [edx + 32]     // dest += 32 bytes
+    sub        ecx, 8               // width -= 8
+    jg         convertloop          // loop while width > 0
+    pop        edi
+    pop        esi
+    ret
+  }
+}
+__declspec(naked) __declspec(align(16))  // I422 -> BGRA row, 8 pixels per loop, aligned (movdqa) stores
+void I422ToBGRARow_SSSE3(const uint8* y_buf,
+                         const uint8* u_buf,
+                         const uint8* v_buf,
+                         uint8* bgra_buf,
+                         int width) {
+  __asm {
+    push       esi
+    push       edi
+    mov        eax, [esp + 8 + 4]   // Y
+    mov        esi, [esp + 8 + 8]   // U
+    mov        edi, [esp + 8 + 12]  // V
+    mov        edx, [esp + 8 + 16]  // bgra
+    mov        ecx, [esp + 8 + 20]  // width
+    sub        edi, esi             // edi = V - U so the macro can address V as [esi + edi]
+    pxor       xmm4, xmm4           // zero, unpacked against Y bytes inside the macro
+    align      16
+ convertloop:
+    YUV422TORGB                     // B, G, R bytes -> xmm0, xmm1, xmm2
    // Step 3: Weave into BGRA
    pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
@@ -1455,8 +1618,8 @@ void I420ToBGRARow_Unaligned_SSSE3(const uint8* y_buf,
    movdqa     xmm0, xmm5
    punpcklwd  xmm5, xmm1           // BGRA first 4 pixels
    punpckhwd  xmm0, xmm1           // BGRA next 4 pixels
-    movdqu     [edx], xmm5
+    movdqa     [edx], xmm5          // aligned store: dest must be 16-byte aligned
-    movdqu     [edx + 16], xmm0
+    movdqa     [edx + 16], xmm0     // aligned store, second 16 bytes
    lea        edx,  [edx + 32]
    sub        ecx, 8
    jg         convertloop
@@ -1468,10 +1631,10 @@ void I420ToBGRARow_Unaligned_SSSE3(const uint8* y_buf,
 }
 __declspec(naked) __declspec(align(16))
-void I420ToABGRRow_Unaligned_SSSE3(const uint8* y_buf,
+void I422ToABGRRow_SSSE3(const uint8* y_buf,  // aligned-destination variant (movdqa stores)
                         const uint8* u_buf,
                         const uint8* v_buf,
-                                   uint8* rgb_buf,
+                         uint8* abgr_buf,
                         int width) {
  __asm {
    push       esi
@@ -1479,7 +1642,7 @@ void I420ToABGRRow_Unaligned_SSSE3(const uint8* y_buf,
    mov        eax, [esp + 8 + 4]   // Y
    mov        esi, [esp + 8 + 8]   // U
    mov        edi, [esp + 8 + 12]  // V
-    mov        edx, [esp + 8 + 16]  // rgb
+    mov        edx, [esp + 8 + 16]  // abgr (destination row)
    mov        ecx, [esp + 8 + 20]  // width
    sub        edi, esi
    pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
@@ -1487,7 +1650,7 @@ void I420ToABGRRow_Unaligned_SSSE3(const uint8* y_buf,
    align      16
 convertloop:
-    YUVTORGB
+    YUV422TORGB                     // B, G, R bytes -> xmm0, xmm1, xmm2
    // Step 3: Weave into ARGB
    punpcklbw  xmm2, xmm1           // RG
@@ -1495,8 +1658,8 @@ void I420ToABGRRow_Unaligned_SSSE3(const uint8* y_buf,
    movdqa     xmm1, xmm2
    punpcklwd  xmm2, xmm0           // RGBA first 4 pixels
    punpckhwd  xmm1, xmm0           // RGBA next 4 pixels
-    movdqu     [edx], xmm2
+    movdqa     [edx], xmm2          // aligned store: dest must be 16-byte aligned
-    movdqu     [edx + 16], xmm1
+    movdqa     [edx + 16], xmm1     // aligned store, second 16 bytes
    lea        edx,  [edx + 32]
    sub        ecx, 8
    jg         convertloop
@@ -1508,10 +1671,10 @@ void I420ToABGRRow_Unaligned_SSSE3(const uint8* y_buf,
 }
 __declspec(naked) __declspec(align(16))
-void I444ToARGBRow_SSSE3(const uint8* y_buf,
+void I422ToBGRARow_Unaligned_SSSE3(const uint8* y_buf,  // unaligned-destination variant
                                   const uint8* u_buf,
                                   const uint8* v_buf,
-                         uint8* rgb_buf,
+                                   uint8* bgra_buf,
                                   int width) {
  __asm {
    push       esi
@@ -1519,7 +1682,47 @@ void I444ToARGBRow_SSSE3(const uint8* y_buf,
    mov        eax, [esp + 8 + 4]   // Y
    mov        esi, [esp + 8 + 8]   // U
    mov        edi, [esp + 8 + 12]  // V
-    mov        edx, [esp + 8 + 16]  // rgb
+    mov        edx, [esp + 8 + 16]  // bgra
+    mov        ecx, [esp + 8 + 20]  // width
+    sub        edi, esi             // edi = V - U so the macro can address V as [esi + edi]
+    pxor       xmm4, xmm4           // zero, unpacked against Y bytes inside the macro
+    align      16
+ convertloop:
+    YUV422TORGB                     // B, G, R bytes -> xmm0, xmm1, xmm2
+    // Step 3: Weave into BGRA
+    pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha (redone each pass: xmm5 is overwritten below)
+    punpcklbw  xmm1, xmm0           // GB
+    punpcklbw  xmm5, xmm2           // AR
+    movdqa     xmm0, xmm5           // register copy so both halves can be woven
+    punpcklwd  xmm5, xmm1           // BGRA first 4 pixels
+    punpckhwd  xmm0, xmm1           // BGRA next 4 pixels
+    movdqu     [edx], xmm5          // unaligned store, first 16 bytes
+    movdqu     [edx + 16], xmm0     // unaligned store, second 16 bytes
+    lea        edx,  [edx + 32]     // dest += 32 bytes
+    sub        ecx, 8               // width -= 8
+    jg         convertloop          // loop while width > 0
+    pop        edi
+    pop        esi
+    ret
+  }
+}
+__declspec(naked) __declspec(align(16))  // I422 -> ABGR row, 8 pixels per loop, unaligned (movdqu) stores
+void I422ToABGRRow_Unaligned_SSSE3(const uint8* y_buf,
+                                   const uint8* u_buf,
+                                   const uint8* v_buf,
+                                   uint8* abgr_buf,
+                                   int width) {
+  __asm {
+    push       esi
+    push       edi
+    mov        eax, [esp + 8 + 4]   // Y
+    mov        esi, [esp + 8 + 8]   // U
+    mov        edi, [esp + 8 + 12]  // V
+    mov        edx, [esp + 8 + 16]  // abgr
    mov        ecx, [esp + 8 + 20]  // width
    sub        edi, esi
    pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
@@ -1527,43 +1730,18 @@ void I444ToARGBRow_SSSE3(const uint8* y_buf,
    align      16
 convertloop:
-    // Step 1: Find 4 UV contributions to 4 R,G,B values
+    YUV422TORGB                     // B, G, R bytes -> xmm0, xmm1, xmm2
-    movd       xmm0, [esi]          // U
-    movd       xmm1, [esi + edi]    // V
-    lea        esi,  [esi + 4]
-    punpcklbw  xmm0, xmm1           // UV
-    movdqa     xmm1, xmm0
-    movdqa     xmm2, xmm0
-    pmaddubsw  xmm0, kUVToB        // scale B UV
-    pmaddubsw  xmm1, kUVToG        // scale G UV
-    pmaddubsw  xmm2, kUVToR        // scale R UV
-    psubw      xmm0, kUVBiasB      // unbias back to signed
-    psubw      xmm1, kUVBiasG
-    psubw      xmm2, kUVBiasR
-    // Step 2: Find Y contribution to 4 R,G,B values
-    movd       xmm3, [eax]
-    lea        eax, [eax + 4]
-    punpcklbw  xmm3, xmm4
-    psubsw     xmm3, kYSub16
-    pmullw     xmm3, kYToRgb
-    paddsw     xmm0, xmm3           // B += Y
-    paddsw     xmm1, xmm3           // G += Y
-    paddsw     xmm2, xmm3           // R += Y
-    psraw      xmm0, 6
-    psraw      xmm1, 6
-    psraw      xmm2, 6
-    packuswb   xmm0, xmm0           // B
-    packuswb   xmm1, xmm1           // G
-    packuswb   xmm2, xmm2           // R
    // Step 3: Weave into ARGB
-    punpcklbw  xmm0, xmm1           // BG
+    punpcklbw  xmm2, xmm1           // RG
-    punpcklbw  xmm2, xmm5           // RA
+    punpcklbw  xmm0, xmm5           // BA
-    punpcklwd  xmm0, xmm2           // BGRA 4 pixels
+    movdqa     xmm1, xmm2           // register copy so both halves can be woven
-    movdqa     [edx], xmm0
+    punpcklwd  xmm2, xmm0           // RGBA first 4 pixels
-    lea        edx,  [edx + 16]
+    punpckhwd  xmm1, xmm0           // RGBA next 4 pixels
-    sub        ecx, 4
+    movdqu     [edx], xmm2          // unaligned store, first 16 bytes
+    movdqu     [edx + 16], xmm1     // unaligned store, second 16 bytes
+    lea        edx,  [edx + 32]     // dest += 32 bytes
+    sub        ecx, 8               // width -= 8
    jg         convertloop
    pop        edi
@@ -1571,7 +1749,7 @@ void I444ToARGBRow_SSSE3(const uint8* y_buf,
    ret
  }
 }
-#endif
+#endif  // HAS_I422TOARGBROW_SSSE3
 #ifdef HAS_YTOARGBROW_SSE2
 __declspec(naked) __declspec(align(16))
@@ -1617,7 +1795,7 @@ void YToARGBRow_SSE2(const uint8* y_buf,
    ret
  }
 }
-#endif
+#endif  // HAS_YTOARGBROW_SSE2
 #endif
 #ifdef HAS_MIRRORROW_SSSE3

--- a/unit_test/planar_test.cc
+++ b/unit_test/planar_test.cc
@@ -25,88 +25,44 @@
 namespace libyuv {
-TEST_F(libyuvTest, BenchmarkI420ToARGB_C) {
+// Generates a test that converts random planar FMT_A data to FMT_B once with
+// CPU flags masked and repeatedly with them unmasked, then compares the outputs.
+// SUBSAMP_X / SUBSAMP_Y: horizontal / vertical chroma subsampling of FMT_A.
+// BPP_B: bytes per pixel of FMT_B.
+#define TESTPLANARTOB(FMT_A, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B)               \
-  align_buffer_16(src_y, benchmark_width_ * benchmark_height_);
+TEST_F(libyuvTest, FMT_A##To##FMT_B##_CvsOPT) {                                \
-  align_buffer_16(src_u, (benchmark_width_ * benchmark_height_) >> 2);
-  align_buffer_16(src_v, (benchmark_width_ * benchmark_height_) >> 2);
-  align_buffer_16(dst_argb, (benchmark_width_ << 2) * benchmark_height_);
-  MaskCpuFlags(kCpuInitialized);
-  for (int i = 0; i < benchmark_iterations_; ++i)
-    I420ToARGB(src_y, benchmark_width_,
-               src_u, benchmark_width_ >> 1,
-               src_v, benchmark_width_ >> 1,
-               dst_argb, benchmark_width_ << 2,
-               benchmark_width_, benchmark_height_);
-  MaskCpuFlags(-1);
-  EXPECT_EQ(0, 0);
-  free_aligned_buffer_16(src_y)
-  free_aligned_buffer_16(src_u)
-  free_aligned_buffer_16(src_v)
-  free_aligned_buffer_16(dst_argb)
-}
-TEST_F(libyuvTest, BenchmarkI420ToARGB_OPT) {
-  align_buffer_16(src_y, benchmark_width_ * benchmark_height_);
-  align_buffer_16(src_u, (benchmark_width_ * benchmark_height_) >> 2);
-  align_buffer_16(src_v, (benchmark_width_ * benchmark_height_) >> 2);
-  align_buffer_16(dst_argb, (benchmark_width_ << 2) * benchmark_height_);
-  for (int i = 0; i < benchmark_iterations_; ++i)
-    I420ToARGB(src_y, benchmark_width_,
-               src_u, benchmark_width_ >> 1,
-               src_v, benchmark_width_ >> 1,
-               dst_argb, benchmark_width_ << 2,
-               benchmark_width_, benchmark_height_);
-  free_aligned_buffer_16(src_y)
-  free_aligned_buffer_16(src_u)
-  free_aligned_buffer_16(src_v)
-  free_aligned_buffer_16(dst_argb)
-}
-#define TESTI420TO(FMT, BPP)                                                   \
-TEST_F(libyuvTest, I420To##FMT##_CvsOPT) {                                     \
  const int src_width = 1280;                                                  \
  const int src_height = 720;                                                  \
  align_buffer_16(src_y, src_width * src_height);                              \
-  align_buffer_16(src_u, (src_width * src_height) >> 2);                       \
+  align_buffer_16(src_u, src_width / SUBSAMP_X * src_height / SUBSAMP_Y);      \
-  align_buffer_16(src_v, (src_width * src_height) >> 2);                       \
+  align_buffer_16(src_v, src_width / SUBSAMP_X * src_height / SUBSAMP_Y);      \
-  align_buffer_16(dst_rgb_c, (src_width * BPP) * src_height);                  \
+  align_buffer_16(dst_rgb_c, (src_width * BPP_B) * src_height);                \
-  align_buffer_16(dst_rgb_opt, (src_width * BPP) * src_height);                \
+  align_buffer_16(dst_rgb_opt, (src_width * BPP_B) * src_height);              \
  srandom(time(NULL));                                                         \
  for (int i = 0; i < src_height; ++i)                                         \
    for (int j = 0; j < src_width; ++j)                                        \
      src_y[(i * src_width) + j] = (random() & 0xff);                          \
-  for (int i = 0; i < src_height >> 1; ++i)                                    \
+  for (int i = 0; i < src_height / SUBSAMP_Y; ++i)                             \
-    for (int j = 0; j < src_width >> 1; ++j) {                                 \
+    for (int j = 0; j < src_width / SUBSAMP_X; ++j) {                          \
-      src_u[(i * src_width >> 1) + j] = (random() & 0xff);                     \
+      src_u[(i * src_width / SUBSAMP_X) + j] = (random() & 0xff);              \
-      src_v[(i * src_width >> 1) + j] = (random() & 0xff);                     \
+      src_v[(i * src_width / SUBSAMP_X) + j] = (random() & 0xff);              \
    }                                                                          \
  MaskCpuFlags(kCpuInitialized);                                               \
-  I420To##FMT(src_y, src_width,                                                \
+  FMT_A##To##FMT_B(src_y, src_width,                                           \
-              src_u, src_width >> 1,                                           \
+              src_u, src_width / SUBSAMP_X,                                    \
-              src_v, src_width >> 1,                                           \
+              src_v, src_width / SUBSAMP_X,                                    \
-              dst_rgb_c, src_width * BPP,                                      \
+              dst_rgb_c, src_width * BPP_B,                                    \
              src_width, src_height);                                          \
  MaskCpuFlags(-1);                                                            \
  const int runs = 1000;                                                       \
  for (int i = 0; i < runs; ++i) {                                             \
-    I420To##FMT(src_y, src_width,                                              \
+    FMT_A##To##FMT_B(src_y, src_width,                                         \
-                src_u, src_width >> 1,                                         \
+                src_u, src_width / SUBSAMP_X,                                  \
-                src_v, src_width >> 1,                                         \
+                src_v, src_width / SUBSAMP_X,                                  \
-                dst_rgb_opt, src_width * BPP,                                  \
+                dst_rgb_opt, src_width * BPP_B,                                \
                src_width, src_height);                                        \
  }                                                                            \
  int err = 0;                                                                 \
  for (int i = 0; i < src_height; ++i) {                                       \
-    for (int j = 0; j < src_width * BPP; ++j) {                                \
+    for (int j = 0; j < src_width * BPP_B; ++j) {                              \
-      int diff = static_cast<int>(dst_rgb_c[i * src_width * BPP + j]) -        \
+      int diff = static_cast<int>(dst_rgb_c[i * src_width * BPP_B + j]) -      \
-                 static_cast<int>(dst_rgb_opt[i * src_width * BPP + j]);       \
+                 static_cast<int>(dst_rgb_opt[i * src_width * BPP_B + j]);     \
      if (abs(diff) > 2)                                                       \
        err++;                                                                 \
    }                                                                          \
@@ -119,14 +75,17 @@ TEST_F(libyuvTest, I420To##FMT##_CvsOPT) {                                     \
  free_aligned_buffer_16(dst_rgb_opt)                                          \
 }
-TESTI420TO(ARGB, 4)
+TESTPLANARTOB(I420, 2, 2, ARGB, 4)
-TESTI420TO(BGRA, 4)
+TESTPLANARTOB(I420, 2, 2, BGRA, 4)
-TESTI420TO(ABGR, 4)
+TESTPLANARTOB(I420, 2, 2, ABGR, 4)
-TESTI420TO(RAW, 3)
+TESTPLANARTOB(I420, 2, 2, RAW, 3)
-TESTI420TO(RGB24, 3)
+TESTPLANARTOB(I420, 2, 2, RGB24, 3)
-TESTI420TO(RGB565, 2)
+TESTPLANARTOB(I420, 2, 2, RGB565, 2)
-TESTI420TO(ARGB1555, 2)
+TESTPLANARTOB(I420, 2, 2, ARGB1555, 2)
-TESTI420TO(ARGB4444, 2)
+TESTPLANARTOB(I420, 2, 2, ARGB4444, 2)
+TESTPLANARTOB(I411, 4, 1, ARGB, 4)
+TESTPLANARTOB(I422, 2, 1, ARGB, 4)
+TESTPLANARTOB(I444, 1, 1, ARGB, 4)
 #define TESTATOB(FMT_A, BPP_A, FMT_B, BPP_B)                                   \
 TEST_F(libyuvTest, ##FMT_A##To##FMT_B##_CvsOPT) {                              \