Commit c704f789 authored by fbarchard@google.com

YUY2ToARGB and UYVYToARGB use specialized row functions that do not subsample.

BUG=76
TEST=unit tests for YUY2ToARGB and UYVYToARGB
Review URL: https://webrtc-codereview.appspot.com/763006

git-svn-id: http://libyuv.googlecode.com/svn/trunk@334 16f28f9a-4ce2-e073-06de-1de4eb20be90
parent 6343f22b
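
Note on the change: the 420 path (YUY2ToUVRow) averages chroma across two adjacent YUY2 rows, while the new 422 path (YUY2ToUV422Row) copies chroma from a single row, so the ARGB output keeps full vertical chroma resolution. A minimal scalar sketch of the two strategies, mirroring the C row functions added in this patch (illustrative helper names, not part of the patch):

// YUY2 packs 2 pixels into 4 bytes: Y0 U Y1 V.
void ToUV420(const uint8* s, int stride, uint8* u, uint8* v, int width) {
  for (int x = 0; x < width; x += 2, s += 4) {
    *u++ = (s[1] + s[stride + 1] + 1) >> 1;  // average U over 2 rows
    *v++ = (s[3] + s[stride + 3] + 1) >> 1;  // average V over 2 rows
  }
}
void ToUV422(const uint8* s, uint8* u, uint8* v, int width) {
  for (int x = 0; x < width; x += 2, s += 4) {
    *u++ = s[1];  // copy U unfiltered
    *v++ = s[3];  // copy V unfiltered
  }
}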
......@@ -618,21 +618,21 @@ int YUY2ToARGB(const uint8* src_yuy2, int src_stride_yuy2,
src_yuy2 = src_yuy2 + (height - 1) * src_stride_yuy2;
src_stride_yuy2 = -src_stride_yuy2;
}
void (*YUY2ToUVRow)(const uint8* src_yuy2, int src_stride_yuy2,
uint8* dst_u, uint8* dst_v, int pix) = YUY2ToUVRow_C;
void (*YUY2ToUV422Row)(const uint8* src_yuy2, uint8* dst_u, uint8* dst_v,
int pix) = YUY2ToUV422Row_C;
void (*YUY2ToYRow)(const uint8* src_yuy2,
uint8* dst_y, int pix) = YUY2ToYRow_C;
#if defined(HAS_YUY2TOYROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) {
if (width > 16) {
YUY2ToUVRow = YUY2ToUVRow_Any_SSE2;
YUY2ToUV422Row = YUY2ToUV422Row_Any_SSE2;
YUY2ToYRow = YUY2ToYRow_Any_SSE2;
}
if (IS_ALIGNED(width, 16)) {
YUY2ToUVRow = YUY2ToUVRow_Unaligned_SSE2;
YUY2ToUV422Row = YUY2ToUV422Row_Unaligned_SSE2;
YUY2ToYRow = YUY2ToYRow_Unaligned_SSE2;
if (IS_ALIGNED(src_yuy2, 16) && IS_ALIGNED(src_stride_yuy2, 16)) {
YUY2ToUVRow = YUY2ToUVRow_SSE2;
YUY2ToUV422Row = YUY2ToUV422Row_SSE2;
YUY2ToYRow = YUY2ToYRow_SSE2;
}
}
......@@ -665,7 +665,7 @@ int YUY2ToARGB(const uint8* src_yuy2, int src_stride_yuy2,
SIMD_ALIGNED(uint8 rowv[kMaxStride]);
for (int y = 0; y < height; ++y) {
YUY2ToUVRow(src_yuy2, 0, rowu, rowv, width);
YUY2ToUV422Row(src_yuy2, rowu, rowv, width);
YUY2ToYRow(src_yuy2, rowy, width);
I422ToARGBRow(rowy, rowu, rowv, dst_argb, width);
src_yuy2 += src_stride_yuy2;
......@@ -688,21 +688,21 @@ int UYVYToARGB(const uint8* src_uyvy, int src_stride_uyvy,
src_uyvy = src_uyvy + (height - 1) * src_stride_uyvy;
src_stride_uyvy = -src_stride_uyvy;
}
void (*UYVYToUVRow)(const uint8* src_uyvy, int src_stride_uyvy,
uint8* dst_u, uint8* dst_v, int pix) = UYVYToUVRow_C;
void (*UYVYToUV422Row)(const uint8* src_uyvy, uint8* dst_u, uint8* dst_v,
int pix) = UYVYToUV422Row_C;
void (*UYVYToYRow)(const uint8* src_uyvy,
uint8* dst_y, int pix) = UYVYToYRow_C;
#if defined(HAS_UYVYTOYROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) {
if (width > 16) {
UYVYToUVRow = UYVYToUVRow_Any_SSE2;
UYVYToUV422Row = UYVYToUV422Row_Any_SSE2;
UYVYToYRow = UYVYToYRow_Any_SSE2;
}
if (IS_ALIGNED(width, 16)) {
UYVYToUVRow = UYVYToUVRow_Unaligned_SSE2;
UYVYToUV422Row = UYVYToUV422Row_Unaligned_SSE2;
UYVYToYRow = UYVYToYRow_Unaligned_SSE2;
if (IS_ALIGNED(src_uyvy, 16) && IS_ALIGNED(src_stride_uyvy, 16)) {
UYVYToUVRow = UYVYToUVRow_SSE2;
UYVYToUV422Row = UYVYToUV422Row_SSE2;
UYVYToYRow = UYVYToYRow_SSE2;
}
}
......@@ -733,8 +733,9 @@ int UYVYToARGB(const uint8* src_uyvy, int src_stride_uyvy,
SIMD_ALIGNED(uint8 rowy[kMaxStride]);
SIMD_ALIGNED(uint8 rowu[kMaxStride]);
SIMD_ALIGNED(uint8 rowv[kMaxStride]);
for (int y = 0; y < height; ++y) {
UYVYToUVRow(src_uyvy, 0, rowu, rowv, width);
UYVYToUV422Row(src_uyvy, rowu, rowv, width);
UYVYToYRow(src_uyvy, rowy, width);
I422ToARGBRow(rowy, rowu, rowv, dst_argb, width);
src_uyvy += src_stride_uyvy;
......
......@@ -70,7 +70,6 @@ extern "C" {
#define HAS_UYVYTOUVROW_SSE2
#define HAS_UYVYTOYROW_SSE2
#define HAS_YTOARGBROW_SSE2
#define HAS_YUY2TOUVROW_SSE2
#define HAS_YUY2TOYROW_SSE2
#define HAS_ARGBGRAYROW_SSSE3
#define HAS_ARGBSEPIAROW_SSSE3
......@@ -455,33 +454,48 @@ void I422ToABGRRow_Any_NEON(const uint8* y_buf,
void YUY2ToYRow_SSE2(const uint8* src_yuy2, uint8* dst_y, int pix);
void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2,
uint8* dst_u, uint8* dst_y, int pix);
uint8* dst_u, uint8* dst_v, int pix);
void YUY2ToUV422Row_SSE2(const uint8* src_yuy2,
uint8* dst_u, uint8* dst_v, int pix);
void YUY2ToYRow_Unaligned_SSE2(const uint8* src_yuy2,
uint8* dst_y, int pix);
void YUY2ToUVRow_Unaligned_SSE2(const uint8* src_yuy2, int stride_yuy2,
uint8* dst_u, uint8* dst_y, int pix);
uint8* dst_u, uint8* dst_v, int pix);
void YUY2ToUV422Row_Unaligned_SSE2(const uint8* src_yuy2,
uint8* dst_u, uint8* dst_v, int pix);
void YUY2ToYRow_C(const uint8* src_yuy2, uint8* dst_y, int pix);
void YUY2ToUVRow_C(const uint8* src_yuy2, int stride_yuy2,
uint8* dst_u, uint8* dst_v, int pix);
void YUY2ToUV422Row_C(const uint8* src_yuy2,
uint8* dst_u, uint8* dst_v, int pix);
void YUY2ToYRow_Any_SSE2(const uint8* src_yuy2, uint8* dst_y, int pix);
void YUY2ToUVRow_Any_SSE2(const uint8* src_yuy2, int stride_yuy2,
uint8* dst_u, uint8* dst_v, int pix);
void YUY2ToUV422Row_Any_SSE2(const uint8* src_yuy2,
uint8* dst_u, uint8* dst_v, int pix);
void UYVYToYRow_SSE2(const uint8* src_uyvy, uint8* dst_y, int pix);
void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy,
uint8* dst_u, uint8* dst_y, int pix);
uint8* dst_u, uint8* dst_v, int pix);
void UYVYToUV422Row_SSE2(const uint8* src_uyvy,
uint8* dst_u, uint8* dst_v, int pix);
void UYVYToYRow_Unaligned_SSE2(const uint8* src_uyvy,
uint8* dst_y, int pix);
void UYVYToUVRow_Unaligned_SSE2(const uint8* src_uyvy, int stride_uyvy,
uint8* dst_u, uint8* dst_y, int pix);
void YUY2ToUVRow_C(const uint8* src_yuy2, int src_stride_yuy2,
uint8* dst_u, uint8* dst_v, int pix);
void YUY2ToYRow_C(const uint8* src_yuy2, uint8* dst_y, int pix);
void UYVYToUVRow_C(const uint8* src_uyvy, int src_stride_uyvy,
uint8* dst_u, uint8* dst_v, int pix);
uint8* dst_u, uint8* dst_v, int pix);
void UYVYToUV422Row_Unaligned_SSE2(const uint8* src_uyvy,
uint8* dst_u, uint8* dst_v, int pix);
void UYVYToYRow_C(const uint8* src_uyvy, uint8* dst_y, int pix);
void YUY2ToUVRow_Any_SSE2(const uint8* src_yuy2, int src_stride_yuy2,
uint8* dst_u, uint8* dst_v, int pix);
void YUY2ToYRow_Any_SSE2(const uint8* src_yuy2, uint8* dst_y, int pix);
void UYVYToUVRow_Any_SSE2(const uint8* src_uyvy, int src_stride_uyvy,
uint8* dst_u, uint8* dst_v, int pix);
void UYVYToUVRow_C(const uint8* src_uyvy, int stride_uyvy,
uint8* dst_u, uint8* dst_v, int pix);
void UYVYToUV422Row_C(const uint8* src_uyvy,
uint8* dst_u, uint8* dst_v, int pix);
void UYVYToYRow_Any_SSE2(const uint8* src_uyvy, uint8* dst_y, int pix);
void UYVYToUVRow_Any_SSE2(const uint8* src_uyvy, int stride_uyvy,
uint8* dst_u, uint8* dst_v, int pix);
void UYVYToUV422Row_Any_SSE2(const uint8* src_uyvy,
uint8* dst_u, uint8* dst_v, int pix);
void ARGBAttenuateRow_C(const uint8* src_argb, uint8* dst_argb, int width);
void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width);
......
......@@ -621,10 +621,10 @@ void CopyRow_C(const uint8* src, uint8* dst, int count) {
memcpy(dst, src, count);
}
// Filter 2 rows of YUY2 UV's (422) into U and V (420)
// Filter 2 rows of YUY2 UV's (422) into U and V (420).
void YUY2ToUVRow_C(const uint8* src_yuy2, int src_stride_yuy2,
uint8* dst_u, uint8* dst_v, int width) {
// Output a row of UV values, filtering 2 rows of YUY2
// Output a row of UV values, filtering 2 rows of YUY2.
for (int x = 0; x < width; x += 2) {
dst_u[0] = (src_yuy2[1] + src_yuy2[src_stride_yuy2 + 1] + 1) >> 1;
dst_v[0] = (src_yuy2[3] + src_yuy2[src_stride_yuy2 + 3] + 1) >> 1;
......@@ -634,8 +634,22 @@ void YUY2ToUVRow_C(const uint8* src_yuy2, int src_stride_yuy2,
}
}
// Copy row of YUY2 UV's (422) into U and V (422).
void YUY2ToUV422Row_C(const uint8* src_yuy2,
uint8* dst_u, uint8* dst_v, int width) {
// Output a row of UV values.
for (int x = 0; x < width; x += 2) {
dst_u[0] = src_yuy2[1];
dst_v[0] = src_yuy2[3];
src_yuy2 += 4;
dst_u += 1;
dst_v += 1;
}
}
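
A concrete check of the copy path above, with a hypothetical 4-pixel row (values illustrative):

uint8 row[8] = {16, 128, 17, 64, 18, 130, 19, 66};  // Y U Y V  Y U Y V
uint8 u[2], v[2];
YUY2ToUV422Row_C(row, u, v, 4);  // u = {128, 130}, v = {64, 66}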
// Copy row of YUY2 Y's (422) into Y (420/422).
void YUY2ToYRow_C(const uint8* src_yuy2, uint8* dst_y, int width) {
// Copy a row of yuy2 Y values
// Output a row of Y values.
for (int x = 0; x < width - 1; x += 2) {
dst_y[x] = src_yuy2[0];
dst_y[x + 1] = src_yuy2[2];
......@@ -646,9 +660,10 @@ void YUY2ToYRow_C(const uint8* src_yuy2, uint8* dst_y, int width) {
}
}
// Filter 2 rows of UYVY UV's (422) into U and V (420).
void UYVYToUVRow_C(const uint8* src_uyvy, int src_stride_uyvy,
uint8* dst_u, uint8* dst_v, int width) {
// Copy a row of uyvy UV values
// Output a row of UV values.
for (int x = 0; x < width; x += 2) {
dst_u[0] = (src_uyvy[0] + src_uyvy[src_stride_uyvy + 0] + 1) >> 1;
dst_v[0] = (src_uyvy[2] + src_uyvy[src_stride_uyvy + 2] + 1) >> 1;
......@@ -658,15 +673,29 @@ void UYVYToUVRow_C(const uint8* src_uyvy, int src_stride_uyvy,
}
}
void UYVYToYRow_C(const uint8* src_yuy2, uint8* dst_y, int width) {
// Copy a row of uyvy Y values
// Copy row of UYVY UV's (422) into U and V (422).
void UYVYToUV422Row_C(const uint8* src_uyvy,
uint8* dst_u, uint8* dst_v, int width) {
// Output a row of UV values.
for (int x = 0; x < width; x += 2) {
dst_u[0] = src_uyvy[0];
dst_v[0] = src_uyvy[2];
src_uyvy += 4;
dst_u += 1;
dst_v += 1;
}
}
// Copy row of UYVY Y's (422) into Y (420/422).
void UYVYToYRow_C(const uint8* src_uyvy, uint8* dst_y, int width) {
// Output a row of Y values.
for (int x = 0; x < width - 1; x += 2) {
dst_y[x] = src_yuy2[1];
dst_y[x + 1] = src_yuy2[3];
src_yuy2 += 4;
dst_y[x] = src_uyvy[1];
dst_y[x + 1] = src_uyvy[3];
src_uyvy += 4;
}
if (width & 1) {
dst_y[width - 1] = src_yuy2[1];
dst_y[width - 1] = src_uyvy[1];
}
}
......@@ -910,12 +939,12 @@ YANY(YUY2ToYRow_Any_SSE2, YUY2ToYRow_Unaligned_SSE2, 2)
YANY(UYVYToYRow_Any_SSE2, UYVYToYRow_Unaligned_SSE2, 2)
#undef YANY
#define UVANY(NAMEANY, ARGBTOUV_SSE, ARGBTOUV_C, BPP) \
#define UVANY(NAMEANY, ANYTOUV_SSE, ANYTOUV_C, BPP) \
void NAMEANY(const uint8* src_argb, int src_stride_argb, \
uint8* dst_u, uint8* dst_v, int width) { \
int n = width & ~15; \
ARGBTOUV_SSE(src_argb, src_stride_argb, dst_u, dst_v, n); \
ARGBTOUV_C(src_argb + n * BPP, src_stride_argb, \
ANYTOUV_SSE(src_argb, src_stride_argb, dst_u, dst_v, n); \
ANYTOUV_C(src_argb + n * BPP, src_stride_argb, \
dst_u + (n >> 1), \
dst_v + (n >> 1), \
width & 15); \
......@@ -927,6 +956,24 @@ UVANY(ABGRToUVRow_Any_SSSE3, ABGRToUVRow_Unaligned_SSSE3, ABGRToUVRow_C, 4)
UVANY(YUY2ToUVRow_Any_SSE2, YUY2ToUVRow_Unaligned_SSE2, YUY2ToUVRow_C, 2)
UVANY(UYVYToUVRow_Any_SSE2, UYVYToUVRow_Unaligned_SSE2, UYVYToUVRow_C, 2)
#undef UVANY
#define UV422ANY(NAMEANY, ANYTOUV_SSE, ANYTOUV_C, BPP) \
void NAMEANY(const uint8* src_argb, \
uint8* dst_u, uint8* dst_v, int width) { \
int n = width & ~15; \
ANYTOUV_SSE(src_argb, dst_u, dst_v, n); \
ANYTOUV_C(src_argb + n * BPP, \
dst_u + (n >> 1), \
dst_v + (n >> 1), \
width & 15); \
}
UV422ANY(YUY2ToUV422Row_Any_SSE2, YUY2ToUV422Row_Unaligned_SSE2, \
YUY2ToUV422Row_C, 2)
UV422ANY(UYVYToUV422Row_Any_SSE2, UYVYToUV422Row_Unaligned_SSE2, \
UYVYToUV422Row_C, 2)
#undef UV422ANY
#endif
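
UV422ANY mirrors UVANY minus the stride argument: run the SIMD kernel over the largest multiple of 16 pixels, then let the C row finish the tail. Hand-expanding UV422ANY(YUY2ToUV422Row_Any_SSE2, YUY2ToUV422Row_Unaligned_SSE2, YUY2ToUV422Row_C, 2) for illustration:

void YUY2ToUV422Row_Any_SSE2(const uint8* src_yuy2,
                             uint8* dst_u, uint8* dst_v, int width) {
  int n = width & ~15;                        // largest multiple of 16
  YUY2ToUV422Row_Unaligned_SSE2(src_yuy2, dst_u, dst_v, n);
  YUY2ToUV422Row_C(src_yuy2 + n * 2,          // BPP = 2 bytes per YUY2 pixel
                   dst_u + (n >> 1),          // one U and one V per 2 pixels
                   dst_v + (n >> 1),
                   width & 15);               // remaining 0..15 pixels
}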
void ComputeCumulativeSumRow_C(const uint8* row, int32* cumsum,
......
......@@ -2114,7 +2114,7 @@ void YUY2ToYRow_SSE2(const uint8* src_yuy2, uint8* dst_y, int pix) {
}
void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2,
uint8* dst_u, uint8* dst_y, int pix) {
uint8* dst_u, uint8* dst_v, int pix) {
asm volatile (
"pcmpeqb %%xmm5,%%xmm5 \n"
"psrlw $0x8,%%xmm5 \n"
......@@ -2143,7 +2143,7 @@ void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2,
"jg 1b \n"
: "+r"(src_yuy2), // %0
"+r"(dst_u), // %1
"+r"(dst_y), // %2
"+r"(dst_v), // %2
"+r"(pix) // %3
: "r"(static_cast<intptr_t>(stride_yuy2)) // %4
: "memory", "cc"
......@@ -2153,6 +2153,41 @@ void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2,
);
}
void YUY2ToUV422Row_SSE2(const uint8* src_yuy2,
uint8* dst_u, uint8* dst_v, int pix) {
asm volatile (
"pcmpeqb %%xmm5,%%xmm5 \n"
"psrlw $0x8,%%xmm5 \n"
"sub %1,%2 \n"
".p2align 4 \n"
"1: \n"
"movdqa (%0),%%xmm0 \n"
"movdqa 0x10(%0),%%xmm1 \n"
"lea 0x20(%0),%0 \n"
"psrlw $0x8,%%xmm0 \n"
"psrlw $0x8,%%xmm1 \n"
"packuswb %%xmm1,%%xmm0 \n"
"movdqa %%xmm0,%%xmm1 \n"
"pand %%xmm5,%%xmm0 \n"
"packuswb %%xmm0,%%xmm0 \n"
"psrlw $0x8,%%xmm1 \n"
"packuswb %%xmm1,%%xmm1 \n"
"movq %%xmm0,(%1) \n"
"movq %%xmm1,(%1,%2) \n"
"lea 0x8(%1),%1 \n"
"sub $0x10,%3 \n"
"jg 1b \n"
: "+r"(src_yuy2), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
"+r"(pix) // %3
:
: "memory", "cc"
#if defined(__SSE2__)
, "xmm0", "xmm1", "xmm5"
#endif
);
}
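
Reading the kernel: psrlw $0x8 keeps the high (chroma) byte of each 16-bit Y/U or Y/V pair, packuswb narrows two registers of words back to bytes, and the pand / psrlw pair then splits even (U) from odd (V) lanes; sub %1,%2 pre-biases dst_v so movq %%xmm1,(%1,%2) can address it off dst_u with a single advancing register. A scalar model of one 16-pixel iteration (illustrative only, not part of the patch):

// Consumes 32 bytes of YUY2, produces 8 U and 8 V bytes.
void Yuy2ToUv422Block16(const uint8* src, uint8* u, uint8* v) {
  uint8 uv[16];
  for (int i = 0; i < 16; ++i)
    uv[i] = src[2 * i + 1];  // psrlw $8 + packuswb: drop Y, keep UVUV stream
  for (int i = 0; i < 8; ++i) {
    u[i] = uv[2 * i];        // pand 0x00ff + packuswb: even lanes -> U
    v[i] = uv[2 * i + 1];    // psrlw $8 + packuswb: odd lanes -> V
  }
}

The UYVY variants below differ only in the first step: chroma sits in the even bytes, so they use pand with the same 0x00ff mask instead of the shift.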
void YUY2ToYRow_Unaligned_SSE2(const uint8* src_yuy2,
uint8* dst_y, int pix) {
......@@ -2214,7 +2249,7 @@ void YUY2ToUVRow_Unaligned_SSE2(const uint8* src_yuy2,
"jg 1b \n"
: "+r"(src_yuy2), // %0
"+r"(dst_u), // %1
"+r"(dst_y), // %2
"+r"(dst_v), // %2
"+r"(pix) // %3
: "r"(static_cast<intptr_t>(stride_yuy2)) // %4
: "memory", "cc"
......@@ -2224,6 +2259,42 @@ void YUY2ToUVRow_Unaligned_SSE2(const uint8* src_yuy2,
);
}
void YUY2ToUV422Row_Unaligned_SSE2(const uint8* src_yuy2,
uint8* dst_u, uint8* dst_v, int pix) {
asm volatile (
"pcmpeqb %%xmm5,%%xmm5 \n"
"psrlw $0x8,%%xmm5 \n"
"sub %1,%2 \n"
".p2align 4 \n"
"1: \n"
"movdqu (%0),%%xmm0 \n"
"movdqu 0x10(%0),%%xmm1 \n"
"lea 0x20(%0),%0 \n"
"psrlw $0x8,%%xmm0 \n"
"psrlw $0x8,%%xmm1 \n"
"packuswb %%xmm1,%%xmm0 \n"
"movdqa %%xmm0,%%xmm1 \n"
"pand %%xmm5,%%xmm0 \n"
"packuswb %%xmm0,%%xmm0 \n"
"psrlw $0x8,%%xmm1 \n"
"packuswb %%xmm1,%%xmm1 \n"
"movq %%xmm0,(%1) \n"
"movq %%xmm1,(%1,%2) \n"
"lea 0x8(%1),%1 \n"
"sub $0x10,%3 \n"
"jg 1b \n"
: "+r"(src_yuy2), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
"+r"(pix) // %3
:
: "memory", "cc"
#if defined(__SSE2__)
, "xmm0", "xmm1", "xmm5"
#endif
);
}
void UYVYToYRow_SSE2(const uint8* src_uyvy, uint8* dst_y, int pix) {
asm volatile (
".p2align 4 \n"
......@@ -2250,7 +2321,7 @@ void UYVYToYRow_SSE2(const uint8* src_uyvy, uint8* dst_y, int pix) {
}
void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy,
uint8* dst_u, uint8* dst_y, int pix) {
uint8* dst_u, uint8* dst_v, int pix) {
asm volatile (
"pcmpeqb %%xmm5,%%xmm5 \n"
"psrlw $0x8,%%xmm5 \n"
......@@ -2279,7 +2350,7 @@ void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy,
"jg 1b \n"
: "+r"(src_uyvy), // %0
"+r"(dst_u), // %1
"+r"(dst_y), // %2
"+r"(dst_v), // %2
"+r"(pix) // %3
: "r"(static_cast<intptr_t>(stride_uyvy)) // %4
: "memory", "cc"
......@@ -2289,6 +2360,42 @@ void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy,
);
}
void UYVYToUV422Row_SSE2(const uint8* src_uyvy,
uint8* dst_u, uint8* dst_v, int pix) {
asm volatile (
"pcmpeqb %%xmm5,%%xmm5 \n"
"psrlw $0x8,%%xmm5 \n"
"sub %1,%2 \n"
".p2align 4 \n"
"1: \n"
"movdqa (%0),%%xmm0 \n"
"movdqa 0x10(%0),%%xmm1 \n"
"lea 0x20(%0),%0 \n"
"pand %%xmm5,%%xmm0 \n"
"pand %%xmm5,%%xmm1 \n"
"packuswb %%xmm1,%%xmm0 \n"
"movdqa %%xmm0,%%xmm1 \n"
"pand %%xmm5,%%xmm0 \n"
"packuswb %%xmm0,%%xmm0 \n"
"psrlw $0x8,%%xmm1 \n"
"packuswb %%xmm1,%%xmm1 \n"
"movq %%xmm0,(%1) \n"
"movq %%xmm1,(%1,%2) \n"
"lea 0x8(%1),%1 \n"
"sub $0x10,%3 \n"
"jg 1b \n"
: "+r"(src_uyvy), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
"+r"(pix) // %3
:
: "memory", "cc"
#if defined(__SSE2__)
, "xmm0", "xmm1", "xmm5"
#endif
);
}
void UYVYToYRow_Unaligned_SSE2(const uint8* src_uyvy,
uint8* dst_y, int pix) {
asm volatile (
......@@ -2316,7 +2423,7 @@ void UYVYToYRow_Unaligned_SSE2(const uint8* src_uyvy,
}
void UYVYToUVRow_Unaligned_SSE2(const uint8* src_uyvy, int stride_uyvy,
uint8* dst_u, uint8* dst_y, int pix) {
uint8* dst_u, uint8* dst_v, int pix) {
asm volatile (
"pcmpeqb %%xmm5,%%xmm5 \n"
"psrlw $0x8,%%xmm5 \n"
......@@ -2345,7 +2452,7 @@ void UYVYToUVRow_Unaligned_SSE2(const uint8* src_uyvy, int stride_uyvy,
"jg 1b \n"
: "+r"(src_uyvy), // %0
"+r"(dst_u), // %1
"+r"(dst_y), // %2
"+r"(dst_v), // %2
"+r"(pix) // %3
: "r"(static_cast<intptr_t>(stride_uyvy)) // %4
: "memory", "cc"
......@@ -2354,6 +2461,42 @@ void UYVYToUVRow_Unaligned_SSE2(const uint8* src_uyvy, int stride_uyvy,
#endif
);
}
void UYVYToUV422Row_Unaligned_SSE2(const uint8* src_uyvy,
uint8* dst_u, uint8* dst_v, int pix) {
asm volatile (
"pcmpeqb %%xmm5,%%xmm5 \n"
"psrlw $0x8,%%xmm5 \n"
"sub %1,%2 \n"
".p2align 4 \n"
"1: \n"
"movdqu (%0),%%xmm0 \n"
"movdqu 0x10(%0),%%xmm1 \n"
"lea 0x20(%0),%0 \n"
"pand %%xmm5,%%xmm0 \n"
"pand %%xmm5,%%xmm1 \n"
"packuswb %%xmm1,%%xmm0 \n"
"movdqa %%xmm0,%%xmm1 \n"
"pand %%xmm5,%%xmm0 \n"
"packuswb %%xmm0,%%xmm0 \n"
"psrlw $0x8,%%xmm1 \n"
"packuswb %%xmm1,%%xmm1 \n"
"movq %%xmm0,(%1) \n"
"movq %%xmm1,(%1,%2) \n"
"lea 0x8(%1),%1 \n"
"sub $0x10,%3 \n"
"jg 1b \n"
: "+r"(src_uyvy), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
"+r"(pix) // %3
:
: "memory", "cc"
#if defined(__SSE2__)
, "xmm0", "xmm1", "xmm5"
#endif
);
}
#endif // HAS_YUY2TOYROW_SSE2
#ifdef HAS_ARGBBLENDROW_SSE2
......
......@@ -2198,7 +2198,7 @@ void YUY2ToYRow_SSE2(const uint8* src_yuy2,
__declspec(naked) __declspec(align(16))
void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2,
uint8* dst_u, uint8* dst_y, int pix) {
uint8* dst_u, uint8* dst_v, int pix) {
__asm {
push esi
push edi
......@@ -2240,6 +2240,43 @@ void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2,
}
}
__declspec(naked) __declspec(align(16))
void YUY2ToUV422Row_SSE2(const uint8* src_yuy2,
uint8* dst_u, uint8* dst_v, int pix) {
__asm {
push edi
mov eax, [esp + 4 + 4] // src_yuy2
mov edx, [esp + 4 + 8] // dst_u
mov edi, [esp + 4 + 12] // dst_v
mov ecx, [esp + 4 + 16] // pix
pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
psrlw xmm5, 8
sub edi, edx
align 16
convertloop:
movdqa xmm0, [eax]
movdqa xmm1, [eax + 16]
lea eax, [eax + 32]
psrlw xmm0, 8 // YUYV -> UVUV
psrlw xmm1, 8
packuswb xmm0, xmm1
movdqa xmm1, xmm0
pand xmm0, xmm5 // U
packuswb xmm0, xmm0
psrlw xmm1, 8 // V
packuswb xmm1, xmm1
movq qword ptr [edx], xmm0
movq qword ptr [edx + edi], xmm1
lea edx, [edx + 8]
sub ecx, 16
jg convertloop
pop edi
ret
}
}
__declspec(naked) __declspec(align(16))
void YUY2ToYRow_Unaligned_SSE2(const uint8* src_yuy2,
uint8* dst_y, int pix) {
......@@ -2268,7 +2305,7 @@ void YUY2ToYRow_Unaligned_SSE2(const uint8* src_yuy2,
__declspec(naked) __declspec(align(16))
void YUY2ToUVRow_Unaligned_SSE2(const uint8* src_yuy2, int stride_yuy2,
uint8* dst_u, uint8* dst_y, int pix) {
uint8* dst_u, uint8* dst_v, int pix) {
__asm {
push esi
push edi
......@@ -2310,6 +2347,43 @@ void YUY2ToUVRow_Unaligned_SSE2(const uint8* src_yuy2, int stride_yuy2,
}
}
__declspec(naked) __declspec(align(16))
void YUY2ToUV422Row_Unaligned_SSE2(const uint8* src_yuy2,
uint8* dst_u, uint8* dst_v, int pix) {
__asm {
push edi
mov eax, [esp + 4 + 4] // src_yuy2
mov edx, [esp + 4 + 8] // dst_u
mov edi, [esp + 4 + 12] // dst_v
mov ecx, [esp + 4 + 16] // pix
pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
psrlw xmm5, 8
sub edi, edx
align 16
convertloop:
movdqu xmm0, [eax]
movdqu xmm1, [eax + 16]
lea eax, [eax + 32]
psrlw xmm0, 8 // YUYV -> UVUV
psrlw xmm1, 8
packuswb xmm0, xmm1
movdqa xmm1, xmm0
pand xmm0, xmm5 // U
packuswb xmm0, xmm0
psrlw xmm1, 8 // V
packuswb xmm1, xmm1
movq qword ptr [edx], xmm0
movq qword ptr [edx + edi], xmm1
lea edx, [edx + 8]
sub ecx, 16
jg convertloop
pop edi
ret
}
}
__declspec(naked) __declspec(align(16))
void UYVYToYRow_SSE2(const uint8* src_uyvy,
uint8* dst_y, int pix) {
......@@ -2336,7 +2410,7 @@ void UYVYToYRow_SSE2(const uint8* src_uyvy,
__declspec(naked) __declspec(align(16))
void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy,
uint8* dst_u, uint8* dst_y, int pix) {
uint8* dst_u, uint8* dst_v, int pix) {
__asm {
push esi
push edi
......@@ -2378,6 +2452,43 @@ void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy,
}
}
__declspec(naked) __declspec(align(16))
void UYVYToUV422Row_SSE2(const uint8* src_uyvy,
uint8* dst_u, uint8* dst_v, int pix) {
__asm {
push edi
mov eax, [esp + 4 + 4] // src_uyvy
mov edx, [esp + 4 + 8] // dst_u
mov edi, [esp + 4 + 12] // dst_v
mov ecx, [esp + 4 + 16] // pix
pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
psrlw xmm5, 8
sub edi, edx
align 16
convertloop:
movdqa xmm0, [eax]
movdqa xmm1, [eax + 16]
lea eax, [eax + 32]
pand xmm0, xmm5 // UYVY -> UVUV
pand xmm1, xmm5
packuswb xmm0, xmm1
movdqa xmm1, xmm0
pand xmm0, xmm5 // U
packuswb xmm0, xmm0
psrlw xmm1, 8 // V
packuswb xmm1, xmm1
movq qword ptr [edx], xmm0
movq qword ptr [edx + edi], xmm1
lea edx, [edx + 8]
sub ecx, 16
jg convertloop
pop edi
ret
}
}
__declspec(naked) __declspec(align(16))
void UYVYToYRow_Unaligned_SSE2(const uint8* src_uyvy,
uint8* dst_y, int pix) {
......@@ -2404,7 +2515,7 @@ void UYVYToYRow_Unaligned_SSE2(const uint8* src_uyvy,
__declspec(naked) __declspec(align(16))
void UYVYToUVRow_Unaligned_SSE2(const uint8* src_uyvy, int stride_uyvy,
uint8* dst_u, uint8* dst_y, int pix) {
uint8* dst_u, uint8* dst_v, int pix) {
__asm {
push esi
push edi
......@@ -2445,6 +2556,43 @@ void UYVYToUVRow_Unaligned_SSE2(const uint8* src_uyvy, int stride_uyvy,
ret
}
}
__declspec(naked) __declspec(align(16))
void UYVYToUV422Row_Unaligned_SSE2(const uint8* src_uyvy,
uint8* dst_u, uint8* dst_v, int pix) {
__asm {
push edi
mov eax, [esp + 4 + 4] // src_uyvy
mov edx, [esp + 4 + 8] // dst_u
mov edi, [esp + 4 + 12] // dst_v
mov ecx, [esp + 4 + 16] // pix
pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
psrlw xmm5, 8
sub edi, edx
align 16
convertloop:
movdqu xmm0, [eax]
movdqu xmm1, [eax + 16]
lea eax, [eax + 32]
pand xmm0, xmm5 // UYVY -> UVUV
pand xmm1, xmm5
packuswb xmm0, xmm1
movdqa xmm1, xmm0
pand xmm0, xmm5 // U
packuswb xmm0, xmm0
psrlw xmm1, 8 // V
packuswb xmm1, xmm1
movq qword ptr [edx], xmm0
movq qword ptr [edx + edi], xmm1
lea edx, [edx + 8]
sub ecx, 16
jg convertloop
pop edi
ret
}
}
#endif // HAS_YUY2TOYROW_SSE2
#ifdef HAS_ARGBBLENDROW_SSE2
......
......@@ -31,11 +31,12 @@ namespace libyuv {
TEST_F(libyuvTest, FMT_PLANAR##To##FMT_B##N##_OptVsC) { \
const int kWidth = 1280; \
const int kHeight = 720; \
const int kStride = (kWidth * 8 * BPP_B + 7) / 8; \
align_buffer_16(src_y, kWidth * kHeight); \
align_buffer_16(src_u, kWidth / SUBSAMP_X * kHeight / SUBSAMP_Y); \
align_buffer_16(src_v, kWidth / SUBSAMP_X * kHeight / SUBSAMP_Y); \
align_buffer_16(dst_argb_c, (kWidth * BPP_B) * kHeight); \
align_buffer_16(dst_argb_opt, (kWidth * BPP_B) * kHeight); \
align_buffer_16(dst_argb_c, kStride * kHeight); \
align_buffer_16(dst_argb_opt, kStride * kHeight); \
srandom(time(NULL)); \
for (int i = 0; i < kHeight; ++i) \
for (int j = 0; j < kWidth; ++j) \
......@@ -49,16 +50,16 @@ TEST_F(libyuvTest, FMT_PLANAR##To##FMT_B##N##_OptVsC) { \
FMT_PLANAR##To##FMT_B(src_y, kWidth, \
src_u, kWidth / SUBSAMP_X, \
src_v, kWidth / SUBSAMP_X, \
dst_argb_c, kWidth * BPP_B, \
kWidth, NEG##kHeight); \
dst_argb_c, kStride, \
kWidth, NEG kHeight); \
MaskCpuFlags(-1); \
const int runs = 1000; \
for (int i = 0; i < runs; ++i) { \
FMT_PLANAR##To##FMT_B(src_y, kWidth, \
src_u, kWidth / SUBSAMP_X, \
src_v, kWidth / SUBSAMP_X, \
dst_argb_opt, kWidth * BPP_B, \
kWidth, NEG##kHeight); \
dst_argb_opt, kStride, \
kWidth, NEG kHeight); \
} \
int max_diff = 0; \
for (int i = 0; i < kHeight; ++i) { \
......@@ -96,7 +97,7 @@ TESTPLANARTOB(I422, 2, 1, ARGB, 4)
TESTPLANARTOB(I444, 1, 1, ARGB, 4)
TESTPLANARTOB(I420, 2, 2, YUY2, 2)
TESTPLANARTOB(I420, 2, 2, UYVY, 2)
TESTPLANARTOB(I420, 2, 2, V210, 45 / 16)
TESTPLANARTOB(I420, 2, 2, V210, 16 / 6)
TESTPLANARTOB(I420, 2, 2, I400, 1)
TESTPLANARTOB(I420, 2, 2, BayerBGGR, 1)
TESTPLANARTOB(I420, 2, 2, BayerRGGB, 1)
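
The kStride expression introduced above pairs with this V210 change: with BPP_B written textually as 16 / 6 (V210 packs 6 pixels into 16 bytes), (kWidth * 8 * BPP_B + 7) / 8 expands to (kWidth * 8 * 16 / 6 + 7) / 8, which multiplies before dividing and rounds the row size up instead of truncating it. A quick check of the arithmetic at the test's kWidth (standalone sketch, values as in the test):

#include <stdio.h>
int main(void) {
  const int kWidth = 1280;
  int kStride = (kWidth * 8 * 16 / 6 + 7) / 8;  // 163840/6 = 27306; (27306+7)/8 = 3414
  int naive = kWidth * 16 / 6;                  // 20480/6 = 3413: truncates a partial byte
  printf("kStride=%d naive=%d\n", kStride, naive);
  return 0;
}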
......@@ -124,14 +125,14 @@ TEST_F(libyuvTest, FMT_PLANAR##To##FMT_B##N##_OptVsC) { \
FMT_PLANAR##To##FMT_B(src_y, kWidth, \
src_uv, kWidth / SUBSAMP_X * 2, \
dst_argb_c, kWidth * BPP_B, \
kWidth, NEG##kHeight); \
kWidth, NEG kHeight); \
MaskCpuFlags(-1); \
const int runs = 1000; \
for (int i = 0; i < runs; ++i) { \
FMT_PLANAR##To##FMT_B(src_y, kWidth, \
src_uv, kWidth / SUBSAMP_X * 2, \
dst_argb_opt, kWidth * BPP_B, \
kWidth, NEG##kHeight); \
kWidth, NEG kHeight); \
} \
int max_diff = 0; \
for (int i = 0; i < kHeight; ++i) { \
......@@ -164,7 +165,8 @@ TESTBIPLANARTOB(NV21, 2, 2, RGB565, 2)
TEST_F(libyuvTest, FMT_A##To##FMT_PLANAR##N##_OptVsC) { \
const int kWidth = 1280; \
const int kHeight = 720; \
align_buffer_16(src_argb, (kWidth * BPP_A) * kHeight); \
const int kStride = (kWidth * 8 * BPP_A + 7) / 8; \
align_buffer_16(src_argb, kStride * kHeight); \
align_buffer_16(dst_y_c, kWidth * kHeight); \
align_buffer_16(dst_u_c, kWidth / SUBSAMP_X * kHeight / SUBSAMP_Y); \
align_buffer_16(dst_v_c, kWidth / SUBSAMP_X * kHeight / SUBSAMP_Y); \
......@@ -173,22 +175,22 @@ TEST_F(libyuvTest, FMT_A##To##FMT_PLANAR##N##_OptVsC) { \
align_buffer_16(dst_v_opt, kWidth / SUBSAMP_X * kHeight / SUBSAMP_Y); \
srandom(time(NULL)); \
for (int i = 0; i < kHeight; ++i) \
for (int j = 0; j < kWidth * BPP_A; ++j) \
src_argb[(i * kWidth * BPP_A) + j] = (random() & 0xff); \
for (int j = 0; j < kStride; ++j) \
src_argb[(i * kStride) + j] = (random() & 0xff); \
MaskCpuFlags(kCpuInitialized); \
FMT_A##To##FMT_PLANAR(src_argb, kWidth * BPP_A, \
FMT_A##To##FMT_PLANAR(src_argb, kStride, \
dst_y_c, kWidth, \
dst_u_c, kWidth / SUBSAMP_X, \
dst_v_c, kWidth / SUBSAMP_X, \
kWidth, NEG##kHeight); \
kWidth, NEG kHeight); \
MaskCpuFlags(-1); \
const int runs = 1000; \
for (int i = 0; i < runs; ++i) { \
FMT_A##To##FMT_PLANAR(src_argb, kWidth * BPP_A, \
FMT_A##To##FMT_PLANAR(src_argb, kStride, \
dst_y_opt, kWidth, \
dst_u_opt, kWidth / SUBSAMP_X, \
dst_v_opt, kWidth / SUBSAMP_X, \
kWidth, NEG##kHeight); \
kWidth, NEG kHeight); \
} \
int max_diff = 0; \
for (int i = 0; i < kHeight; ++i) { \
......@@ -251,7 +253,7 @@ TESTATOPLANAR(ARGB, 4, I422, 2, 1)
// TODO(fbarchard): Implement and test 411 and 444
TESTATOPLANAR(YUY2, 2, I420, 2, 2)
TESTATOPLANAR(UYVY, 2, I420, 2, 2)
TESTATOPLANAR(V210, 45 / 16, I420, 2, 2)
TESTATOPLANAR(V210, 16 / 6, I420, 2, 2)
TESTATOPLANAR(I400, 1, I420, 2, 2)
TESTATOPLANAR(BayerBGGR, 1, I420, 2, 2)
TESTATOPLANAR(BayerRGGB, 1, I420, 2, 2)
......@@ -272,13 +274,13 @@ TEST_F(libyuvTest, FMT_A##To##FMT_B##N##_OptVsC) { \
MaskCpuFlags(kCpuInitialized); \
FMT_A##To##FMT_B(src_argb, kWidth * STRIDE_A, \
dst_argb_c, kWidth * BPP_B, \
kWidth, NEG##kHeight); \
kWidth, NEG kHeight); \
MaskCpuFlags(-1); \
const int runs = 1000; \
for (int i = 0; i < runs; ++i) { \
FMT_A##To##FMT_B(src_argb, kWidth * STRIDE_A, \
dst_argb_opt, kWidth * BPP_B, \
kWidth, NEG##kHeight); \
kWidth, NEG kHeight); \
} \
int max_diff = 0; \
for (int i = 0; i < kHeight * kWidth * BPP_B; ++i) { \
......