Commit 2d9fe082 authored by fbarchard@google.com

direct conversion from NV12 to ARGB

BUG=none
TEST=none
Review URL: https://webrtc-codereview.appspot.com/645004

git-svn-id: http://libyuv.googlecode.com/svn/trunk@281 16f28f9a-4ce2-e073-06de-1de4eb20be90
parent 7c8e16f8
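The headline change: NV12 and NV21 frames now convert straight to ARGB instead of being split to I420 planes first. A minimal sketch of calling the new direct path (the umbrella include and buffer layout here are assumptions for illustration, not part of the commit):

    #include "libyuv.h"  // umbrella header; exact path is an assumption

    // Convert one 640x360 NV12 frame to ARGB with the API added here (sketch).
    void ConvertFrame(const uint8* nv12, uint8* argb) {
      const int width = 640;
      const int height = 360;
      const uint8* src_y = nv12;                    // width x height luma plane
      const uint8* src_uv = nv12 + width * height;  // interleaved UV, half height
      libyuv::NV12ToARGB(src_y, width,
                         src_uv, width,   // UV stride: width/2 pairs * 2 bytes
                         argb, width * 4,
                         width, height);
    }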
 Name: libyuv
 URL: http://code.google.com/p/libyuv/
-Version: 280
+Version: 281
 License: BSD
 License File: LICENSE
...
@@ -47,16 +47,33 @@ int I420Mirror(const uint8* src_y, int src_stride_y,
                uint8* dst_v, int dst_stride_v,
                int width, int height);

-// Convert NV12 to ARGB. Also used for NV21.
+// Convert NV12 to ARGB.
 int NV12ToARGB(const uint8* src_y, int src_stride_y,
                const uint8* src_uv, int src_stride_uv,
                uint8* dst_frame, int dst_stride_frame,
                int width, int height);

-// Convert NV12 to RGB565. Also used for NV21.
+// Convert NV21 to ARGB.
+int NV21ToARGB(const uint8* src_y, int src_stride_y,
+               const uint8* src_vu, int src_stride_vu,
+               uint8* dst_argb, int dst_stride_argb,
+               int width, int height);
+
+// Convert M420 to ARGB.
+int M420ToARGB(const uint8* src_m420, int src_stride_m420,
+               uint8* dst_argb, int dst_stride_argb,
+               int width, int height);
+
+// Convert NV12 to RGB565.
 int NV12ToRGB565(const uint8* src_y, int src_stride_y,
                  const uint8* src_uv, int src_stride_uv,
-                 uint8* dst_frame, int dst_stride_frame,
+                 uint8* dst_rgb565, int dst_stride_rgb565,
                  int width, int height);

+// Convert NV21 to RGB565.
+int NV21ToRGB565(const uint8* src_y, int src_stride_y,
+                 const uint8* src_uv, int src_stride_uv,
+                 uint8* dst_rgb565, int dst_stride_rgb565,
+                 int width, int height);
+
 // Convert YUY2 to ARGB.
...
@@ -11,7 +11,7 @@
 #ifndef INCLUDE_LIBYUV_VERSION_H_
 #define INCLUDE_LIBYUV_VERSION_H_
-#define LIBYUV_VERSION 280
+#define LIBYUV_VERSION 281
 #endif  // INCLUDE_LIBYUV_VERSION_H_
@@ -367,7 +367,7 @@ static void CopyPlane2(const uint8* src, int src_stride_0, int src_stride_1,
 // Useful for bandwidth constrained transports like USB 1.0 and 2.0 and for
 // easy conversion to I420.
 // M420 format description:
-// M420 is row biplanar 420: 2 rows of Y and 1 row of VU.
+// M420 is row biplanar 420: 2 rows of Y and 1 row of UV.
 // Chroma is half width / half height. (420)
 // src_stride_m420 is row planar. Normally this will be the width in pixels.
 //   The UV plane is half width, but 2 values, so src_stride_m420 applies to
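In other words, the format repeats in groups of three stride-sized rows. A sketch of the resulting row addressing (this mirrors the M420ToARGB loop added later in this commit):

    // Row addressing implied by the description above (a sketch):
    // for picture row y with stride s:
    //   group  = y / 2;                               // 2 Y rows + 1 UV row per group
    //   y_row  = src_m420 + (group * 3 + (y & 1)) * s;
    //   uv_row = src_m420 + (group * 3 + 2) * s;      // interleaved UV, half width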
...
@@ -839,51 +839,191 @@ int NV12ToARGB(const uint8* src_y, int src_stride_y,
     dst_argb = dst_argb + (height - 1) * dst_stride_argb;
     dst_stride_argb = -dst_stride_argb;
   }
-  void (*I422ToARGBRow)(const uint8* y_buf,
-                        const uint8* u_buf,
-                        const uint8* v_buf,
-                        uint8* argb_buf,
-                        int width) = I422ToARGBRow_C;
-#if defined(HAS_I422TOARGBROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON)) {
-    I422ToARGBRow = I422ToARGBRow_Any_NEON;
-    if (IS_ALIGNED(width, 16)) {
-      I422ToARGBRow = I422ToARGBRow_NEON;
-    }
-  }
-#elif defined(HAS_I422TOARGBROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
-    I422ToARGBRow = I422ToARGBRow_Any_SSSE3;
-    if (IS_ALIGNED(width, 8) &&
-        IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
-      I422ToARGBRow = I422ToARGBRow_SSSE3;
-    }
-  }
-#endif
-  int halfwidth = (width + 1) >> 1;
-  void (*SplitUV)(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) =
-      SplitUV_C;
-#if defined(HAS_SPLITUV_NEON)
-  if (TestCpuFlag(kCpuHasNEON)) {
-    SplitUV = SplitUV_NEON;
-  }
-#elif defined(HAS_SPLITUV_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2) &&
-      IS_ALIGNED(src_uv, 16) && IS_ALIGNED(src_stride_uv, 16)) {
-    SplitUV = SplitUV_SSE2;
-  }
-#endif
-  SIMD_ALIGNED(uint8 rowuv[kMaxStride * 2]);
-  for (int y = 0; y < height; ++y) {
-    if ((y & 1) == 0) {
-      // Copy a row of UV.
-      SplitUV(src_uv, rowuv, rowuv + kMaxStride, halfwidth);
-      src_uv += src_stride_uv;
-    }
-    I422ToARGBRow(src_y, rowuv, rowuv + kMaxStride, dst_argb, width);
-    dst_argb += dst_stride_argb;
-    src_y += src_stride_y;
-  }
+  void (*NV12ToARGBRow)(const uint8* y_buf,
+                        const uint8* uv_buf,
+                        uint8* rgb_buf,
+                        int width) = NV12ToARGBRow_C;
+#if defined(HAS_NV12TOARGBROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
+    NV12ToARGBRow = NV12ToARGBRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 8)) {
+      NV12ToARGBRow = NV12ToARGBRow_Unaligned_SSSE3;
+      if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+        NV12ToARGBRow = NV12ToARGBRow_SSSE3;
+      }
+    }
+  }
+#endif
+  for (int y = 0; y < height; ++y) {
+    NV12ToARGBRow(src_y, src_uv, dst_argb, width);
+    dst_argb += dst_stride_argb;
+    src_y += src_stride_y;
+    if (y & 1) {
+      src_uv += src_stride_uv;
+    }
+  }
+  return 0;
+}
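The loop above advances src_uv only after odd rows because NV12 chroma is subsampled 2x vertically, so each UV row is shared by a pair of Y rows. The same traversal in index form (a sketch, not the committed code):

    // Equivalent indexing form of the NV12ToARGB loop (sketch):
    for (int y = 0; y < height; ++y) {
      const uint8* uv_row = src_uv + (y >> 1) * src_stride_uv;  // row pair shares UV
      NV12ToARGBRow(src_y + y * src_stride_y, uv_row,
                    dst_argb + y * dst_stride_argb, width);
    }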
+
+// Convert NV21 to ARGB.
+int NV21ToARGB(const uint8* src_y, int src_stride_y,
+               const uint8* src_vu, int src_stride_vu,
+               uint8* dst_argb, int dst_stride_argb,
+               int width, int height) {
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    dst_argb = dst_argb + (height - 1) * dst_stride_argb;
+    dst_stride_argb = -dst_stride_argb;
+  }
+  void (*NV21ToARGBRow)(const uint8* y_buf,
+                        const uint8* vu_buf,
+                        uint8* rgb_buf,
+                        int width) = NV21ToARGBRow_C;
+#if defined(HAS_NV21TOARGBROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
+    NV21ToARGBRow = NV21ToARGBRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 8)) {
+      NV21ToARGBRow = NV21ToARGBRow_Unaligned_SSSE3;
+      if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+        NV21ToARGBRow = NV21ToARGBRow_SSSE3;
+      }
+    }
+  }
+#endif
+  for (int y = 0; y < height; ++y) {
+    NV21ToARGBRow(src_y, src_vu, dst_argb, width);
+    dst_argb += dst_stride_argb;
+    src_y += src_stride_y;
+    if (y & 1) {
+      src_vu += src_stride_vu;
+    }
+  }
+  return 0;
+}
+
+// Convert M420 to ARGB.
+int M420ToARGB(const uint8* src_m420, int src_stride_m420,
+               uint8* dst_argb, int dst_stride_argb,
+               int width, int height) {
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    dst_argb = dst_argb + (height - 1) * dst_stride_argb;
+    dst_stride_argb = -dst_stride_argb;
+  }
+  void (*NV12ToARGBRow)(const uint8* y_buf,
+                        const uint8* uv_buf,
+                        uint8* rgb_buf,
+                        int width) = NV12ToARGBRow_C;
+#if defined(HAS_NV12TOARGBROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
+    NV12ToARGBRow = NV12ToARGBRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 8)) {
+      NV12ToARGBRow = NV12ToARGBRow_Unaligned_SSSE3;
+      if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+        NV12ToARGBRow = NV12ToARGBRow_SSSE3;
+      }
+    }
+  }
+#endif
+  for (int y = 0; y < height - 1; y += 2) {
+    NV12ToARGBRow(src_m420, src_m420 + src_stride_m420 * 2, dst_argb, width);
+    NV12ToARGBRow(src_m420 + src_stride_m420, src_m420 + src_stride_m420 * 2,
+                  dst_argb + dst_stride_argb, width);
+    dst_argb += dst_stride_argb * 2;
+    src_m420 += src_stride_m420 * 3;
+  }
+  if (height & 1) {
+    NV12ToARGBRow(src_m420, src_m420 + src_stride_m420 * 2, dst_argb, width);
+  }
+  return 0;
+}
+
+// Convert NV12 to RGB565.
+// TODO(fbarchard): (Re) Optimize for Neon.
+int NV12ToRGB565(const uint8* src_y, int src_stride_y,
+                 const uint8* src_uv, int src_stride_uv,
+                 uint8* dst_rgb565, int dst_stride_rgb565,
+                 int width, int height) {
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    dst_rgb565 = dst_rgb565 + (height - 1) * dst_stride_rgb565;
+    dst_stride_rgb565 = -dst_stride_rgb565;
+  }
+  void (*NV12ToARGBRow)(const uint8* y_buf,
+                        const uint8* uv_buf,
+                        uint8* rgb_buf,
+                        int width) = NV12ToARGBRow_C;
+#if defined(HAS_NV12TOARGBROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3) && width * 4 <= kMaxStride) {
+    NV12ToARGBRow = NV12ToARGBRow_SSSE3;
+  }
+#endif
+  SIMD_ALIGNED(uint8 row[kMaxStride]);
+  void (*ARGBToRGB565Row)(const uint8* src_argb, uint8* dst_rgb, int pix) =
+      ARGBToRGB565Row_C;
+#if defined(HAS_ARGBTORGB565ROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 4)) {
+    ARGBToRGB565Row = ARGBToRGB565Row_SSE2;
+  }
+#endif
+  for (int y = 0; y < height; ++y) {
+    NV12ToARGBRow(src_y, src_uv, row, width);
+    ARGBToRGB565Row(row, dst_rgb565, width);
+    dst_rgb565 += dst_stride_rgb565;
+    src_y += src_stride_y;
+    if (y & 1) {
+      src_uv += src_stride_uv;
+    }
+  }
+  return 0;
+}
+
+// Convert NV21 to RGB565.
+int NV21ToRGB565(const uint8* src_y, int src_stride_y,
+                 const uint8* src_vu, int src_stride_vu,
+                 uint8* dst_rgb565, int dst_stride_rgb565,
+                 int width, int height) {
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    dst_rgb565 = dst_rgb565 + (height - 1) * dst_stride_rgb565;
+    dst_stride_rgb565 = -dst_stride_rgb565;
+  }
+  void (*NV21ToARGBRow)(const uint8* y_buf,
+                        const uint8* uv_buf,
+                        uint8* rgb_buf,
+                        int width) = NV21ToARGBRow_C;
+#if defined(HAS_NV21TOARGBROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3) && width * 4 <= kMaxStride) {
+    NV21ToARGBRow = NV21ToARGBRow_SSSE3;
+  }
+#endif
+  SIMD_ALIGNED(uint8 row[kMaxStride]);
+  void (*ARGBToRGB565Row)(const uint8* src_argb, uint8* dst_rgb, int pix) =
+      ARGBToRGB565Row_C;
+#if defined(HAS_ARGBTORGB565ROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 4)) {
+    ARGBToRGB565Row = ARGBToRGB565Row_SSE2;
+  }
+#endif
+  for (int y = 0; y < height; ++y) {
+    NV21ToARGBRow(src_y, src_vu, row, width);
+    ARGBToRGB565Row(row, dst_rgb565, width);
+    dst_rgb565 += dst_stride_rgb565;
+    src_y += src_stride_y;
+    if (y & 1) {
+      src_vu += src_stride_vu;
+    }
+  }
   return 0;
 }
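Both RGB565 paths convert a row to ARGB in a scratch buffer and then pack it down with ARGBToRGB565Row. The packing is the usual 5:6:5 truncation; a scalar sketch (ARGBToRGB565Row_C's own body is outside this diff):

    // Pack one ARGB pixel (B,G,R byte order in memory) to RGB565 (a sketch):
    static inline uint16 PackRGB565(uint8 b, uint8 g, uint8 r) {
      return static_cast<uint16>((b >> 3) | ((g >> 2) << 5) | ((r >> 3) << 11));
    }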
@@ -1020,69 +1160,6 @@ int UYVYToARGB(const uint8* src_uyvy, int src_stride_uyvy,
   return 0;
 }
-
-// Convert NV12 to RGB565.
-int NV12ToRGB565(const uint8* src_y, int src_stride_y,
-                 const uint8* src_uv, int src_stride_uv,
-                 uint8* dst_rgb, int dst_stride_rgb,
-                 int width, int height) {
-  // Negative height means invert the image.
-  if (height < 0) {
-    height = -height;
-    dst_rgb = dst_rgb + (height - 1) * dst_stride_rgb;
-    dst_stride_rgb = -dst_stride_rgb;
-  }
-  void (*I422ToARGBRow)(const uint8* y_buf,
-                        const uint8* u_buf,
-                        const uint8* v_buf,
-                        uint8* rgb_buf,
-                        int width) = I422ToARGBRow_C;
-#if defined(HAS_I422TOARGBROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON)) {
-    I422ToARGBRow = I422ToARGBRow_NEON;
-  }
-#elif defined(HAS_I422TOARGBROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3)) {
-    I422ToARGBRow = I422ToARGBRow_SSSE3;
-  }
-#endif
-  SIMD_ALIGNED(uint8 row[kMaxStride]);
-  void (*ARGBToRGB565Row)(const uint8* src_argb, uint8* dst_rgb, int pix) =
-      ARGBToRGB565Row_C;
-#if defined(HAS_ARGBTORGB565ROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 4)) {
-    ARGBToRGB565Row = ARGBToRGB565Row_SSE2;
-  }
-#endif
-  int halfwidth = (width + 1) >> 1;
-  void (*SplitUV)(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) =
-      SplitUV_C;
-#if defined(HAS_SPLITUV_NEON)
-  if (TestCpuFlag(kCpuHasNEON)) {
-    SplitUV = SplitUV_NEON;
-  }
-#elif defined(HAS_SPLITUV_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2) &&
-      IS_ALIGNED(src_uv, 16) && IS_ALIGNED(src_stride_uv, 16)) {
-    SplitUV = SplitUV_SSE2;
-  }
-#endif
-  SIMD_ALIGNED(uint8 rowuv[kMaxStride * 2]);
-  for (int y = 0; y < height; ++y) {
-    if ((y & 1) == 0) {
-      // Copy a row of UV.
-      SplitUV(src_uv, rowuv, rowuv + kMaxStride, halfwidth);
-      src_uv += src_stride_uv;
-    }
-    I422ToARGBRow(src_y, rowuv, rowuv + kMaxStride, row, width);
-    ARGBToRGB565Row(row, dst_rgb, width);
-    dst_rgb += dst_stride_rgb;
-    src_y += src_stride_y;
-  }
-  return 0;
-}
 // SetRow8 writes 'count' bytes using a 32 bit value repeated
 // SetRow32 writes 'count' words using a 32 bit value repeated
...
@@ -54,12 +54,14 @@ extern "C" {
 #define HAS_BGRATOYROW_SSSE3
 #define HAS_COPYROW_SSE2
 #define HAS_COPYROW_X86
-#define HAS_I400TOARGBROW_SSE2
-#define HAS_I422TOABGRROW_SSSE3
-#define HAS_I422TOARGBROW_SSSE3
-#define HAS_I422TOBGRAROW_SSSE3
 #define HAS_I444TOARGBROW_SSSE3
+#define HAS_I422TOARGBROW_SSSE3
 #define HAS_I411TOARGBROW_SSSE3
+#define HAS_NV12TOARGBROW_SSSE3
+#define HAS_NV21TOARGBROW_SSSE3
+#define HAS_I422TOBGRAROW_SSSE3
+#define HAS_I422TOABGRROW_SSSE3
+#define HAS_I400TOARGBROW_SSE2
 #define HAS_MIRRORROW_SSSE3
 #define HAS_MIRRORROWUV_SSSE3
 #define HAS_ADDROW_SSE2
@@ -220,34 +222,44 @@ void ARGBToARGB4444Row_C(const uint8* src_argb, uint8* dst_rgb, int pix);
 void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix);
 void I400ToARGBRow_C(const uint8* src_y, uint8* dst_argb, int pix);
-void I422ToARGBRow_C(const uint8* y_buf,
-                     const uint8* u_buf,
-                     const uint8* v_buf,
-                     uint8* rgb_buf,
-                     int width);
-void I422ToBGRARow_C(const uint8* y_buf,
-                     const uint8* u_buf,
-                     const uint8* v_buf,
-                     uint8* rgb_buf,
-                     int width);
-void I422ToABGRRow_C(const uint8* y_buf,
-                     const uint8* u_buf,
-                     const uint8* v_buf,
-                     uint8* rgb_buf,
-                     int width);
-void I444ToARGBRow_C(const uint8* y_buf,
-                     const uint8* u_buf,
-                     const uint8* v_buf,
-                     uint8* rgb_buf,
-                     int width);
-void I411ToARGBRow_C(const uint8* y_buf,
-                     const uint8* u_buf,
-                     const uint8* v_buf,
-                     uint8* rgb_buf,
-                     int width);
+void I444ToARGBRow_C(const uint8* y_buf,
+                     const uint8* u_buf,
+                     const uint8* v_buf,
+                     uint8* argb_buf,
+                     int width);
+void I422ToARGBRow_C(const uint8* y_buf,
+                     const uint8* u_buf,
+                     const uint8* v_buf,
+                     uint8* argb_buf,
+                     int width);
+void I411ToARGBRow_C(const uint8* y_buf,
+                     const uint8* u_buf,
+                     const uint8* v_buf,
+                     uint8* rgb_buf,
+                     int width);
+void NV12ToARGBRow_C(const uint8* y_buf,
+                     const uint8* uv_buf,
+                     uint8* argb_buf,
+                     int width);
+void NV21ToARGBRow_C(const uint8* y_buf,
+                     const uint8* vu_buf,
+                     uint8* argb_buf,
+                     int width);
+void I422ToBGRARow_C(const uint8* y_buf,
+                     const uint8* u_buf,
+                     const uint8* v_buf,
+                     uint8* bgra_buf,
+                     int width);
+void I422ToABGRRow_C(const uint8* y_buf,
+                     const uint8* u_buf,
+                     const uint8* v_buf,
+                     uint8* abgr_buf,
+                     int width);
 void YToARGBRow_C(const uint8* y_buf,
@@ -269,6 +281,16 @@ void I422ToARGBRow_SSSE3(const uint8* y_buf,
 void I411ToARGBRow_SSSE3(const uint8* y_buf,
                          const uint8* u_buf,
                          const uint8* v_buf,
                          uint8* rgb_buf,
                          int width);
+void NV12ToARGBRow_SSSE3(const uint8* y_buf,
+                         const uint8* uv_buf,
+                         uint8* argb_buf,
+                         int width);
+void NV21ToARGBRow_SSSE3(const uint8* y_buf,
+                         const uint8* vu_buf,
+                         uint8* argb_buf,
+                         int width);
@@ -299,6 +321,16 @@ void I422ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
 void I411ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
                                    const uint8* u_buf,
                                    const uint8* v_buf,
                                    uint8* rgb_buf,
                                    int width);
+void NV12ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
+                                   const uint8* uv_buf,
+                                   uint8* argb_buf,
+                                   int width);
+void NV21ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
+                                   const uint8* vu_buf,
+                                   uint8* argb_buf,
+                                   int width);
@@ -314,37 +346,16 @@ void I422ToABGRRow_Unaligned_SSSE3(const uint8* y_buf,
                                    uint8* abgr_buf,
                                    int width);

-void YToARGBRow_SSE2(const uint8* y_buf,
-                     uint8* argb_buf,
-                     int width);
-
-// ARGB preattenuated alpha blend.
-void ARGBBlendRow_Aligned_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
-                                uint8* dst_argb, int width);
-void ARGBBlendRow_Aligned_SSE2(const uint8* src_argb0, const uint8* src_argb1,
-                               uint8* dst_argb, int width);
-void ARGBBlendRow1_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
-                         uint8* dst_argb, int width);
-void ARGBBlendRow1_SSE2(const uint8* src_argb0, const uint8* src_argb1,
-                        uint8* dst_argb, int width);
-void ARGBBlendRow_Any_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
-                            uint8* dst_argb, int width);
-void ARGBBlendRow_Any_SSE2(const uint8* src_argb0, const uint8* src_argb1,
-                           uint8* dst_argb, int width);
-void ARGBBlendRow_C(const uint8* src_argb0, const uint8* src_argb1,
-                    uint8* dst_argb, int width);
-
-// 'Any' functions handle any size and alignment.
 void I444ToARGBRow_Any_SSSE3(const uint8* y_buf,
                              const uint8* u_buf,
                              const uint8* v_buf,
-                             uint8* rgb_buf,
+                             uint8* argb_buf,
                              int width);
 void I422ToARGBRow_Any_SSSE3(const uint8* y_buf,
                              const uint8* u_buf,
                              const uint8* v_buf,
-                             uint8* rgb_buf,
+                             uint8* argb_buf,
                              int width);
 void I411ToARGBRow_Any_SSSE3(const uint8* y_buf,
@@ -353,18 +364,47 @@ void I411ToARGBRow_Any_SSSE3(const uint8* y_buf,
                              uint8* rgb_buf,
                              int width);
+void NV12ToARGBRow_Any_SSSE3(const uint8* y_buf,
+                             const uint8* uv_buf,
+                             uint8* argb_buf,
+                             int width);
+void NV21ToARGBRow_Any_SSSE3(const uint8* y_buf,
+                             const uint8* vu_buf,
+                             uint8* argb_buf,
+                             int width);
 void I422ToBGRARow_Any_SSSE3(const uint8* y_buf,
                              const uint8* u_buf,
                              const uint8* v_buf,
-                             uint8* rgb_buf,
+                             uint8* bgra_buf,
                              int width);
 void I422ToABGRRow_Any_SSSE3(const uint8* y_buf,
                              const uint8* u_buf,
                              const uint8* v_buf,
-                             uint8* rgb_buf,
+                             uint8* abgr_buf,
                              int width);
+
+void YToARGBRow_SSE2(const uint8* y_buf,
+                     uint8* argb_buf,
+                     int width);
+
+// ARGB preattenuated alpha blend.
+void ARGBBlendRow_Aligned_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
+                                uint8* dst_argb, int width);
+void ARGBBlendRow_Aligned_SSE2(const uint8* src_argb0, const uint8* src_argb1,
+                               uint8* dst_argb, int width);
+void ARGBBlendRow1_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
+                         uint8* dst_argb, int width);
+void ARGBBlendRow1_SSE2(const uint8* src_argb0, const uint8* src_argb1,
+                        uint8* dst_argb, int width);
+void ARGBBlendRow_Any_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
+                            uint8* dst_argb, int width);
+void ARGBBlendRow_Any_SSE2(const uint8* src_argb0, const uint8* src_argb1,
+                           uint8* dst_argb, int width);
+void ARGBBlendRow_C(const uint8* src_argb0, const uint8* src_argb1,
+                    uint8* dst_argb, int width);

 void ARGBToRGB24Row_Any_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix);
 void ARGBToRAWRow_Any_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix);
...
@@ -359,6 +359,20 @@ static __inline void YuvPixel(uint8 y, uint8 u, uint8 v, uint8* rgb_buf,
       (255u << ashift);
 }

+void I444ToARGBRow_C(const uint8* y_buf,
+                     const uint8* u_buf,
+                     const uint8* v_buf,
+                     uint8* rgb_buf,
+                     int width) {
+  for (int x = 0; x < width; ++x) {
+    YuvPixel(y_buf[0], u_buf[0], v_buf[0], rgb_buf, 24, 16, 8, 0);
+    y_buf += 1;
+    u_buf += 1;
+    v_buf += 1;
+    rgb_buf += 4;  // Advance 1 pixel.
+  }
+}
+
 // Also used for 420
 void I422ToARGBRow_C(const uint8* y_buf,
                      const uint8* u_buf,
@@ -378,79 +392,97 @@ void I422ToARGBRow_C(const uint8* y_buf,
   }
 }
-void I422ToBGRARow_C(const uint8* y_buf,
-                     const uint8* u_buf,
-                     const uint8* v_buf,
-                     uint8* rgb_buf,
-                     int width) {
-  for (int x = 0; x < width - 1; x += 2) {
-    YuvPixel(y_buf[0], u_buf[0], v_buf[0], rgb_buf + 0, 0, 8, 16, 24);
-    YuvPixel(y_buf[1], u_buf[0], v_buf[0], rgb_buf + 4, 0, 8, 16, 24);
-    y_buf += 2;
-    u_buf += 1;
-    v_buf += 1;
-    rgb_buf += 8;  // Advance 2 pixels.
-  }
-  if (width & 1) {
-    YuvPixel(y_buf[0], u_buf[0], v_buf[0], rgb_buf, 0, 8, 16, 24);
-  }
-}
-
-void I422ToABGRRow_C(const uint8* y_buf,
-                     const uint8* u_buf,
-                     const uint8* v_buf,
-                     uint8* rgb_buf,
-                     int width) {
-  for (int x = 0; x < width - 1; x += 2) {
-    YuvPixel(y_buf[0], u_buf[0], v_buf[0], rgb_buf + 0, 24, 0, 8, 16);
-    YuvPixel(y_buf[1], u_buf[0], v_buf[0], rgb_buf + 4, 24, 0, 8, 16);
-    y_buf += 2;
-    u_buf += 1;
-    v_buf += 1;
-    rgb_buf += 8;  // Advance 2 pixels.
-  }
-  if (width & 1) {
-    YuvPixel(y_buf[0], u_buf[0], v_buf[0], rgb_buf + 0, 24, 0, 8, 16);
-  }
-}
-
-void I444ToARGBRow_C(const uint8* y_buf,
-                     const uint8* u_buf,
-                     const uint8* v_buf,
-                     uint8* rgb_buf,
-                     int width) {
-  for (int x = 0; x < width; ++x) {
-    YuvPixel(y_buf[0], u_buf[0], v_buf[0], rgb_buf, 24, 16, 8, 0);
-    y_buf += 1;
-    u_buf += 1;
-    v_buf += 1;
-    rgb_buf += 4;  // Advance 1 pixel.
-  }
-}
-
-void I411ToARGBRow_C(const uint8* y_buf,
-                     const uint8* u_buf,
-                     const uint8* v_buf,
-                     uint8* rgb_buf,
-                     int width) {
-  for (int x = 0; x < width - 3; x += 4) {
-    YuvPixel(y_buf[0], u_buf[0], v_buf[0], rgb_buf + 0, 24, 16, 8, 0);
-    YuvPixel(y_buf[1], u_buf[0], v_buf[0], rgb_buf + 4, 24, 16, 8, 0);
-    YuvPixel(y_buf[2], u_buf[0], v_buf[0], rgb_buf + 8, 24, 16, 8, 0);
-    YuvPixel(y_buf[3], u_buf[0], v_buf[0], rgb_buf + 12, 24, 16, 8, 0);
-    y_buf += 4;
-    u_buf += 1;
-    v_buf += 1;
-    rgb_buf += 16;  // Advance 4 pixels.
-  }
-  if (width & 2) {
-    YuvPixel(y_buf[0], u_buf[0], v_buf[0], rgb_buf + 0, 24, 16, 8, 0);
-    YuvPixel(y_buf[1], u_buf[0], v_buf[0], rgb_buf + 4, 24, 16, 8, 0);
-    y_buf += 2;
-    rgb_buf += 8;  // Advance 2 pixels.
-  }
-  if (width & 1) {
-    YuvPixel(y_buf[0], u_buf[0], v_buf[0], rgb_buf + 0, 24, 16, 8, 0);
-  }
-}
+void I411ToARGBRow_C(const uint8* y_buf,
+                     const uint8* u_buf,
+                     const uint8* v_buf,
+                     uint8* rgb_buf,
+                     int width) {
+  for (int x = 0; x < width - 3; x += 4) {
+    YuvPixel(y_buf[0], u_buf[0], v_buf[0], rgb_buf + 0, 24, 16, 8, 0);
+    YuvPixel(y_buf[1], u_buf[0], v_buf[0], rgb_buf + 4, 24, 16, 8, 0);
+    YuvPixel(y_buf[2], u_buf[0], v_buf[0], rgb_buf + 8, 24, 16, 8, 0);
+    YuvPixel(y_buf[3], u_buf[0], v_buf[0], rgb_buf + 12, 24, 16, 8, 0);
+    y_buf += 4;
+    u_buf += 1;
+    v_buf += 1;
+    rgb_buf += 16;  // Advance 4 pixels.
+  }
+  if (width & 2) {
+    YuvPixel(y_buf[0], u_buf[0], v_buf[0], rgb_buf + 0, 24, 16, 8, 0);
+    YuvPixel(y_buf[1], u_buf[0], v_buf[0], rgb_buf + 4, 24, 16, 8, 0);
+    y_buf += 2;
+    rgb_buf += 8;  // Advance 2 pixels.
+  }
+  if (width & 1) {
+    YuvPixel(y_buf[0], u_buf[0], v_buf[0], rgb_buf + 0, 24, 16, 8, 0);
+  }
+}
+
+void NV12ToARGBRow_C(const uint8* y_buf,
+                     const uint8* uv_buf,
+                     uint8* rgb_buf,
+                     int width) {
+  for (int x = 0; x < width - 1; x += 2) {
+    YuvPixel(y_buf[0], uv_buf[0], uv_buf[1], rgb_buf + 0, 24, 16, 8, 0);
+    YuvPixel(y_buf[1], uv_buf[0], uv_buf[1], rgb_buf + 4, 24, 16, 8, 0);
+    y_buf += 2;
+    uv_buf += 2;
+    rgb_buf += 8;  // Advance 2 pixels.
+  }
+  if (width & 1) {
+    YuvPixel(y_buf[0], uv_buf[0], uv_buf[1], rgb_buf + 0, 24, 16, 8, 0);
+  }
+}
+
+void NV21ToARGBRow_C(const uint8* y_buf,
+                     const uint8* vu_buf,
+                     uint8* rgb_buf,
+                     int width) {
+  for (int x = 0; x < width - 1; x += 2) {
+    YuvPixel(y_buf[0], vu_buf[1], vu_buf[0], rgb_buf + 0, 24, 16, 8, 0);
+    YuvPixel(y_buf[1], vu_buf[1], vu_buf[0], rgb_buf + 4, 24, 16, 8, 0);
+    y_buf += 2;
+    vu_buf += 2;
+    rgb_buf += 8;  // Advance 2 pixels.
+  }
+  if (width & 1) {
+    YuvPixel(y_buf[0], vu_buf[1], vu_buf[0], rgb_buf + 0, 24, 16, 8, 0);
+  }
+}
+
+void I422ToBGRARow_C(const uint8* y_buf,
+                     const uint8* u_buf,
+                     const uint8* v_buf,
+                     uint8* rgb_buf,
+                     int width) {
+  for (int x = 0; x < width - 1; x += 2) {
+    YuvPixel(y_buf[0], u_buf[0], v_buf[0], rgb_buf + 0, 0, 8, 16, 24);
+    YuvPixel(y_buf[1], u_buf[0], v_buf[0], rgb_buf + 4, 0, 8, 16, 24);
+    y_buf += 2;
+    u_buf += 1;
+    v_buf += 1;
+    rgb_buf += 8;  // Advance 2 pixels.
+  }
+  if (width & 1) {
+    YuvPixel(y_buf[0], u_buf[0], v_buf[0], rgb_buf, 0, 8, 16, 24);
+  }
+}
+
+void I422ToABGRRow_C(const uint8* y_buf,
+                     const uint8* u_buf,
+                     const uint8* v_buf,
+                     uint8* rgb_buf,
+                     int width) {
+  for (int x = 0; x < width - 1; x += 2) {
+    YuvPixel(y_buf[0], u_buf[0], v_buf[0], rgb_buf + 0, 24, 0, 8, 16);
+    YuvPixel(y_buf[1], u_buf[0], v_buf[0], rgb_buf + 4, 24, 0, 8, 16);
+    y_buf += 2;
+    u_buf += 1;
+    v_buf += 1;
+    rgb_buf += 8;  // Advance 2 pixels.
+  }
+  if (width & 1) {
+    YuvPixel(y_buf[0], u_buf[0], v_buf[0], rgb_buf + 0, 24, 0, 8, 16);
+  }
+}
@@ -728,10 +760,26 @@ void ARGBBlendRow_Any_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
               rgb_buf + n * 4, width & 7); \
   }

+// Wrappers to handle odd width
+#define Y2NY(NAMEANY, NV12TORGB_SSE, NV12TORGB_C, UV_SHIFT) \
+    void NAMEANY(const uint8* y_buf, \
+                 const uint8* uv_buf, \
+                 uint8* rgb_buf, \
+                 int width) { \
+      int n = width & ~7; \
+      NV12TORGB_SSE(y_buf, uv_buf, rgb_buf, n); \
+      NV12TORGB_C(y_buf + n, \
+                  uv_buf + (n >> UV_SHIFT), \
+                  rgb_buf + n * 4, width & 7); \
+    }
+
 #if defined(HAS_I422TOARGBROW_SSSE3)
 YANY(I444ToARGBRow_Any_SSSE3, I444ToARGBRow_Unaligned_SSSE3, I444ToARGBRow_C, 0)
 YANY(I422ToARGBRow_Any_SSSE3, I422ToARGBRow_Unaligned_SSSE3, I422ToARGBRow_C, 1)
 YANY(I411ToARGBRow_Any_SSSE3, I411ToARGBRow_Unaligned_SSSE3, I411ToARGBRow_C, 2)
+Y2NY(NV12ToARGBRow_Any_SSSE3, NV12ToARGBRow_Unaligned_SSSE3, NV12ToARGBRow_C, 0)
+Y2NY(NV21ToARGBRow_Any_SSSE3, NV21ToARGBRow_Unaligned_SSSE3, NV21ToARGBRow_C, 0)
 YANY(I422ToBGRARow_Any_SSSE3, I422ToBGRARow_Unaligned_SSSE3, I422ToBGRARow_C, 1)
 YANY(I422ToABGRRow_Any_SSSE3, I422ToABGRRow_Unaligned_SSSE3, I422ToABGRRow_C, 1)
 #endif
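Y2NY differs from YANY in taking one interleaved UV pointer instead of separate U and V pointers. Expanded by hand for the NV12 case (UV_SHIFT == 0: two UV bytes cover two pixels, so the C tail starts n bytes into the UV row), the macro produces roughly:

    // Hand expansion of Y2NY(NV12ToARGBRow_Any_SSSE3, ...) above (a sketch):
    void NV12ToARGBRow_Any_SSSE3(const uint8* y_buf,
                                 const uint8* uv_buf,
                                 uint8* rgb_buf,
                                 int width) {
      int n = width & ~7;                            // multiple-of-8 prefix for SSSE3
      NV12ToARGBRow_Unaligned_SSSE3(y_buf, uv_buf, rgb_buf, n);
      NV12ToARGBRow_C(y_buf + n, uv_buf + (n >> 0),  // UV advances 1 byte per pixel
                      rgb_buf + n * 4, width & 7);   // C kernel covers the remainder
    }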
...
@@ -1230,6 +1230,18 @@ static const vec8 kUVToG = {
   UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG
 };

+static const vec8 kVUToB = {
+  VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB,
+};
+
+static const vec8 kVUToR = {
+  VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR,
+};
+
+static const vec8 kVUToG = {
+  VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG,
+};
+
 static const vec16 kYToRgb = { YG, YG, YG, YG, YG, YG, YG, YG };
 static const vec16 kYSub16 = { 16, 16, 16, 16, 16, 16, 16, 16 };
 static const vec16 kUVBiasB = { BB, BB, BB, BB, BB, BB, BB, BB };
@@ -1265,6 +1277,13 @@ static const vec16 kUVBiasR = { BR, BR, BR, BR, BR, BR, BR, BR };
     __asm punpckldq  xmm0, xmm0            /* UVUV (upsample) */ \
   }

+// Read 4 UV from NV12, upsample to 8 UV
+#define READNV12 __asm { \
+    __asm movq       xmm0, qword ptr [esi] /* UV */ \
+    __asm lea        esi,  [esi + 8] \
+    __asm punpcklwd  xmm0, xmm0            /* UVUV (upsample) */ \
+  }
+
 // Convert 8 pixels: 8 UV and 8 Y
 #define YUVTORGB __asm { \
     /* Step 1: Find 4 UV contributions to 8 R,G,B values */ \
@@ -1293,6 +1312,34 @@ static const vec16 kUVBiasR = { BR, BR, BR, BR, BR, BR, BR, BR };
     __asm packuswb   xmm2, xmm2            /* R */ \
   }
+// Convert 8 pixels: 8 VU and 8 Y
+#define YVUTORGB __asm { \
+    /* Step 1: Find 4 UV contributions to 8 R,G,B values */ \
+    __asm movdqa     xmm1, xmm0 \
+    __asm movdqa     xmm2, xmm0 \
+    __asm pmaddubsw  xmm0, kVUToB          /* scale B UV */ \
+    __asm pmaddubsw  xmm1, kVUToG          /* scale G UV */ \
+    __asm pmaddubsw  xmm2, kVUToR          /* scale R UV */ \
+    __asm psubw      xmm0, kUVBiasB        /* unbias back to signed */ \
+    __asm psubw      xmm1, kUVBiasG \
+    __asm psubw      xmm2, kUVBiasR \
+    /* Step 2: Find Y contribution to 8 R,G,B values */ \
+    __asm movq       xmm3, qword ptr [eax] /* NOLINT */ \
+    __asm lea        eax, [eax + 8] \
+    __asm punpcklbw  xmm3, xmm4 \
+    __asm psubsw     xmm3, kYSub16 \
+    __asm pmullw     xmm3, kYToRgb \
+    __asm paddsw     xmm0, xmm3            /* B += Y */ \
+    __asm paddsw     xmm1, xmm3            /* G += Y */ \
+    __asm paddsw     xmm2, xmm3            /* R += Y */ \
+    __asm psraw      xmm0, 6 \
+    __asm psraw      xmm1, 6 \
+    __asm psraw      xmm2, 6 \
+    __asm packuswb   xmm0, xmm0            /* B */ \
+    __asm packuswb   xmm1, xmm1            /* G */ \
+    __asm packuswb   xmm2, xmm2            /* R */ \
+  }
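YVUTORGB is YUVTORGB with the VU-ordered constants substituted, so pmaddubsw can consume V,U byte pairs as they arrive in NV21 without an extra shuffle. The per-channel fixed-point arithmetic both macros implement, written out (a sketch; the coefficient values are defined earlier in this file):

    // B = clamp8(((Y - 16) * YG + (U * UB + V * VB - BB)) >> 6)
    // G = clamp8(((Y - 16) * YG + (U * UG + V * VG - BG)) >> 6)
    // R = clamp8(((Y - 16) * YG + (U * UR + V * VR - BR)) >> 6)
    // kYSub16 supplies the -16, pmaddubsw/pmullw the products, kUVBias* the
    // bias terms, psraw the >> 6, and packuswb the unsigned clamp to 0..255.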
 // 8 pixels, dest aligned 16.
 // 8 UV values, mixed with 8 Y producing 8 ARGB (32 bytes)
 __declspec(naked) __declspec(align(16))
@@ -1423,6 +1470,82 @@ void I411ToARGBRow_SSSE3(const uint8* y_buf,
   }
 }
+// 8 pixels, dest aligned 16.
+// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes)
+__declspec(naked) __declspec(align(16))
+void NV12ToARGBRow_SSSE3(const uint8* y_buf,
+                         const uint8* uv_buf,
+                         uint8* argb_buf,
+                         int width) {
+  __asm {
+    push       esi
+    mov        eax, [esp + 4 + 4]   // Y
+    mov        esi, [esp + 4 + 8]   // UV
+    mov        edx, [esp + 4 + 12]  // argb
+    mov        ecx, [esp + 4 + 16]  // width
+    pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
+    pxor       xmm4, xmm4
+
+    align      16
+  convertloop:
+    READNV12
+    YUVTORGB
+
+    // Step 3: Weave into ARGB
+    punpcklbw  xmm0, xmm1           // BG
+    punpcklbw  xmm2, xmm5           // RA
+    movdqa     xmm1, xmm0
+    punpcklwd  xmm0, xmm2           // BGRA first 4 pixels
+    punpckhwd  xmm1, xmm2           // BGRA next 4 pixels
+    movdqa     [edx], xmm0
+    movdqa     [edx + 16], xmm1
+    lea        edx, [edx + 32]
+    sub        ecx, 8
+    jg         convertloop
+
+    pop        esi
+    ret
+  }
+}
+
+// 8 pixels, dest aligned 16.
+// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes)
+__declspec(naked) __declspec(align(16))
+void NV21ToARGBRow_SSSE3(const uint8* y_buf,
+                         const uint8* uv_buf,
+                         uint8* argb_buf,
+                         int width) {
+  __asm {
+    push       esi
+    mov        eax, [esp + 4 + 4]   // Y
+    mov        esi, [esp + 4 + 8]   // VU
+    mov        edx, [esp + 4 + 12]  // argb
+    mov        ecx, [esp + 4 + 16]  // width
+    pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
+    pxor       xmm4, xmm4
+
+    align      16
+  convertloop:
+    READNV12
+    YVUTORGB
+
+    // Step 3: Weave into ARGB
+    punpcklbw  xmm0, xmm1           // BG
+    punpcklbw  xmm2, xmm5           // RA
+    movdqa     xmm1, xmm0
+    punpcklwd  xmm0, xmm2           // BGRA first 4 pixels
+    punpckhwd  xmm1, xmm2           // BGRA next 4 pixels
+    movdqa     [edx], xmm0
+    movdqa     [edx + 16], xmm1
+    lea        edx, [edx + 32]
+    sub        ecx, 8
+    jg         convertloop
+
+    pop        esi
+    ret
+  }
+}
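After YUVTORGB/YVUTORGB, the eight B, G and R results sit in separate registers with xmm5 holding 0xFF alpha; the two punpck stages interleave them into BGRA memory order. The weave step in scalar form (a sketch):

    for (int i = 0; i < 8; ++i) {
      argb_buf[4 * i + 0] = b[i];  // punpcklbw xmm0, xmm1 pairs B with G
      argb_buf[4 * i + 1] = g[i];
      argb_buf[4 * i + 2] = r[i];  // punpcklbw xmm2, xmm5 pairs R with alpha
      argb_buf[4 * i + 3] = 0xFF;  // then punpcklwd/punpckhwd produce BGRA
    }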
 // 8 pixels, unaligned.
 // 8 UV values, mixed with 8 Y producing 8 ARGB (32 bytes)
 __declspec(naked) __declspec(align(16))
@@ -1553,6 +1676,83 @@ void I411ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
   }
 }
+// 8 pixels, dest aligned 16.
+// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes)
+__declspec(naked) __declspec(align(16))
+void NV12ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
+                                   const uint8* uv_buf,
+                                   uint8* argb_buf,
+                                   int width) {
+  __asm {
+    push       esi
+    mov        eax, [esp + 4 + 4]   // Y
+    mov        esi, [esp + 4 + 8]   // UV
+    mov        edx, [esp + 4 + 12]  // argb
+    mov        ecx, [esp + 4 + 16]  // width
+    pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
+    pxor       xmm4, xmm4
+
+    align      16
+  convertloop:
+    READNV12
+    YUVTORGB
+
+    // Step 3: Weave into ARGB
+    punpcklbw  xmm0, xmm1           // BG
+    punpcklbw  xmm2, xmm5           // RA
+    movdqa     xmm1, xmm0
+    punpcklwd  xmm0, xmm2           // BGRA first 4 pixels
+    punpckhwd  xmm1, xmm2           // BGRA next 4 pixels
+    movdqu     [edx], xmm0
+    movdqu     [edx + 16], xmm1
+    lea        edx, [edx + 32]
+    sub        ecx, 8
+    jg         convertloop
+
+    pop        esi
+    ret
+  }
+}
+
+// 8 pixels, dest aligned 16.
+// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes)
+__declspec(naked) __declspec(align(16))
+void NV21ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
+                                   const uint8* uv_buf,
+                                   uint8* argb_buf,
+                                   int width) {
+  __asm {
+    push       esi
+    mov        eax, [esp + 4 + 4]   // Y
+    mov        esi, [esp + 4 + 8]   // VU
+    mov        edx, [esp + 4 + 12]  // argb
+    mov        ecx, [esp + 4 + 16]  // width
+    pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
+    pxor       xmm4, xmm4
+
+    align      16
+  convertloop:
+    READNV12
+    YVUTORGB
+
+    // Step 3: Weave into ARGB
+    punpcklbw  xmm0, xmm1           // BG
+    punpcklbw  xmm2, xmm5           // RA
+    movdqa     xmm1, xmm0
+    punpcklwd  xmm0, xmm2           // BGRA first 4 pixels
+    punpckhwd  xmm1, xmm2           // BGRA next 4 pixels
+    movdqu     [edx], xmm0
+    movdqu     [edx + 16], xmm1
+    lea        edx, [edx + 32]
+    sub        ecx, 8
+    jg         convertloop
+
+    pop        esi
+    ret
+  }
+}
 __declspec(naked) __declspec(align(16))
 void I422ToBGRARow_SSSE3(const uint8* y_buf,
                          const uint8* u_buf,
...
@@ -26,7 +26,7 @@
 namespace libyuv {

 #define TESTPLANARTOB(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B) \
-TEST_F(libyuvTest, ##FMT_PLANAR##To##FMT_B##_CvsOPT) { \
+TEST_F(libyuvTest, ##FMT_PLANAR##To##FMT_B##_OptVsC) { \
   const int kWidth = 1280; \
   const int kHeight = 720; \
   align_buffer_16(src_y, kWidth * kHeight); \
@@ -88,8 +88,60 @@ TESTPLANARTOB(I411, 4, 1, ARGB, 4)
 TESTPLANARTOB(I422, 2, 1, ARGB, 4)
 TESTPLANARTOB(I444, 1, 1, ARGB, 4)
+#define TESTBIPLANARTOB(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B) \
+TEST_F(libyuvTest, ##FMT_PLANAR##To##FMT_B##_OptVsC) { \
+  const int kWidth = 1280; \
+  const int kHeight = 720; \
+  align_buffer_16(src_y, kWidth * kHeight); \
+  align_buffer_16(src_uv, kWidth / SUBSAMP_X * kHeight / SUBSAMP_Y * 2); \
+  align_buffer_16(dst_argb_c, (kWidth * BPP_B) * kHeight); \
+  align_buffer_16(dst_argb_opt, (kWidth * BPP_B) * kHeight); \
+  srandom(time(NULL)); \
+  for (int i = 0; i < kHeight; ++i) \
+    for (int j = 0; j < kWidth; ++j) \
+      src_y[(i * kWidth) + j] = (random() & 0xff); \
+  for (int i = 0; i < kHeight / SUBSAMP_X; ++i) \
+    for (int j = 0; j < kWidth / SUBSAMP_Y * 2; ++j) { \
+      src_uv[(i * kWidth / SUBSAMP_X) * 2 + j] = (random() & 0xff); \
+    } \
+  MaskCpuFlags(kCpuInitialized); \
+  ##FMT_PLANAR##To##FMT_B(src_y, kWidth, \
+                          src_uv, kWidth / SUBSAMP_X * 2, \
+                          dst_argb_c, kWidth * BPP_B, \
+                          kWidth, kHeight); \
+  MaskCpuFlags(-1); \
+  const int runs = 1000; \
+  for (int i = 0; i < runs; ++i) { \
+    ##FMT_PLANAR##To##FMT_B(src_y, kWidth, \
+                            src_uv, kWidth / SUBSAMP_X * 2, \
+                            dst_argb_opt, kWidth * BPP_B, \
+                            kWidth, kHeight); \
+  } \
+  int err = 0; \
+  for (int i = 0; i < kHeight; ++i) { \
+    for (int j = 0; j < kWidth * BPP_B; ++j) { \
+      int diff = static_cast<int>(dst_argb_c[i * kWidth * BPP_B + j]) - \
+                 static_cast<int>(dst_argb_opt[i * kWidth * BPP_B + j]); \
+      if (abs(diff) > 2) { \
+        ++err; \
+      } \
+    } \
+  } \
+  EXPECT_EQ(err, 0); \
+  free_aligned_buffer_16(src_y) \
+  free_aligned_buffer_16(src_uv) \
+  free_aligned_buffer_16(dst_argb_c) \
+  free_aligned_buffer_16(dst_argb_opt) \
+}
+
+TESTBIPLANARTOB(NV12, 2, 2, ARGB, 4)
+TESTBIPLANARTOB(NV21, 2, 2, ARGB, 4)
+TESTBIPLANARTOB(NV12, 2, 2, RGB565, 2)
+TESTBIPLANARTOB(NV21, 2, 2, RGB565, 2)
+
 #define TESTATOPLANAR(FMT_A, BPP_A, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y) \
-TEST_F(libyuvTest, ##FMT_A##To##FMT_PLANAR##_CvsOPT) { \
+TEST_F(libyuvTest, ##FMT_A##To##FMT_PLANAR##_OptVsC) { \
   const int kWidth = 1280; \
   const int kHeight = 720; \
   align_buffer_16(src_argb, (kWidth * BPP_A) * kHeight); \
@@ -171,36 +223,34 @@ TESTATOPLANAR(ARGB, 4, I422, 2, 1)
 //TESTATOPLANAR(ARGB, 4, I444, 1, 1)
 // TODO(fbarchard): Implement and test 411 and 444

-#define TESTATOB(FMT_A, BPP_A, FMT_B, BPP_B) \
-TEST_F(libyuvTest, ##FMT_A##To##FMT_B##_CvsOPT) { \
+#define TESTATOB(FMT_A, BPP_A, STRIDE_A, FMT_B, BPP_B) \
+TEST_F(libyuvTest, ##FMT_A##To##FMT_B##_OptVsC) { \
   const int kWidth = 1280; \
   const int kHeight = 720; \
-  align_buffer_16(src_argb, kWidth * kHeight * BPP_A); \
+  align_buffer_16(src_argb, (kWidth * BPP_A) * kHeight); \
   align_buffer_16(dst_argb_c, (kWidth * BPP_B) * kHeight); \
   align_buffer_16(dst_argb_opt, (kWidth * BPP_B) * kHeight); \
   srandom(time(NULL)); \
-  for (int i = 0; i < kHeight; ++i) \
-    for (int j = 0; j < kWidth * BPP_A; ++j) \
-      src_argb[(i * kWidth * BPP_A) + j] = (random() & 0xff); \
+  for (int i = 0; i < kHeight * kWidth * BPP_A; ++i) { \
+    src_argb[i] = (random() & 0xff); \
+  } \
   MaskCpuFlags(kCpuInitialized); \
-  ##FMT_A##To##FMT_B(src_argb, kWidth * BPP_A, \
+  ##FMT_A##To##FMT_B(src_argb, kWidth * STRIDE_A, \
                      dst_argb_c, kWidth * BPP_B, \
                      kWidth, kHeight); \
   MaskCpuFlags(-1); \
   const int runs = 1000; \
   for (int i = 0; i < runs; ++i) { \
-    ##FMT_A##To##FMT_B(src_argb, kWidth * BPP_A, \
+    ##FMT_A##To##FMT_B(src_argb, kWidth * STRIDE_A, \
                        dst_argb_opt, kWidth * BPP_B, \
                        kWidth, kHeight); \
   } \
   int err = 0; \
-  for (int i = 0; i < kHeight; ++i) { \
-    for (int j = 0; j < kWidth * BPP_B; ++j) { \
-      int diff = static_cast<int>(dst_argb_c[i * kWidth * BPP_B + j]) - \
-                 static_cast<int>(dst_argb_opt[i * kWidth * BPP_B + j]); \
-      if (abs(diff) > 2) \
-        err++; \
-    } \
+  for (int i = 0; i < kHeight * kWidth * BPP_B; ++i) { \
+    int diff = static_cast<int>(dst_argb_c[i]) - \
+               static_cast<int>(dst_argb_opt[i]); \
+    if (abs(diff) > 2) \
+      err++; \
   } \
   EXPECT_EQ(err, 0); \
   free_aligned_buffer_16(src_argb) \
@@ -208,25 +258,26 @@ TEST_F(libyuvTest, ##FMT_A##To##FMT_B##_CvsOPT) { \
   free_aligned_buffer_16(dst_argb_opt) \
 }
-TESTATOB(ARGB, 4, ARGB, 4)
-TESTATOB(ARGB, 4, BGRA, 4)
-TESTATOB(ARGB, 4, ABGR, 4)
-TESTATOB(ARGB, 4, RAW, 3)
-TESTATOB(ARGB, 4, RGB24, 3)
-TESTATOB(ARGB, 4, RGB565, 2)
-TESTATOB(ARGB, 4, ARGB1555, 2)
-TESTATOB(ARGB, 4, ARGB4444, 2)
-TESTATOB(BGRA, 4, ARGB, 4)
-TESTATOB(ABGR, 4, ARGB, 4)
-TESTATOB(RAW, 3, ARGB, 4)
-TESTATOB(RGB24, 3, ARGB, 4)
-TESTATOB(RGB565, 2, ARGB, 4)
-TESTATOB(ARGB1555, 2, ARGB, 4)
-TESTATOB(ARGB4444, 2, ARGB, 4)
-TESTATOB(YUY2, 2, ARGB, 4)
-TESTATOB(UYVY, 2, ARGB, 4)
+TESTATOB(ARGB, 4, 4, ARGB, 4)
+TESTATOB(ARGB, 4, 4, BGRA, 4)
+TESTATOB(ARGB, 4, 4, ABGR, 4)
+TESTATOB(ARGB, 4, 4, RAW, 3)
+TESTATOB(ARGB, 4, 4, RGB24, 3)
+TESTATOB(ARGB, 4, 4, RGB565, 2)
+TESTATOB(ARGB, 4, 4, ARGB1555, 2)
+TESTATOB(ARGB, 4, 4, ARGB4444, 2)
+TESTATOB(BGRA, 4, 4, ARGB, 4)
+TESTATOB(ABGR, 4, 4, ARGB, 4)
+TESTATOB(RAW, 3, 3, ARGB, 4)
+TESTATOB(RGB24, 3, 3, ARGB, 4)
+TESTATOB(RGB565, 2, 2, ARGB, 4)
+TESTATOB(ARGB1555, 2, 2, ARGB, 4)
+TESTATOB(ARGB4444, 2, 2, ARGB, 4)
+TESTATOB(YUY2, 2, 2, ARGB, 4)
+TESTATOB(UYVY, 2, 2, ARGB, 4)
+TESTATOB(M420, 3 / 2, 1, ARGB, 4)
 TEST_F(libyuvTest, TestAttenuate) {
   SIMD_ALIGNED(uint8 orig_pixels[256][4]);
...