ARGBToI400 and ARGBToI411 using AVX2. YUY2ToI420 and UVYVToI420 use AVX2.…

ARGBToI400 and ARGBToI411 using AVX2. YUY2ToI420 and UVYVToI420 use AVX2. CopyPlane use rep movsb for AVX2. CopyPlane2 use rep movsb for AVX2 and CopyPlane if strides match AVX2, which will do a single rep movsb for entire image if stride == width. MergeUV for I420ToNV12. BUG=181 TESTED=unittests pass Review URL: https://webrtc-codereview.appspot.com/1103007 git-svn-id: http://libyuv.googlecode.com/svn/trunk@569 16f28f9a-4ce2-e073-06de-1de4eb20be90

ARGBToI400 and ARGBToI411 using AVX2. YUY2ToI420 and UVYVToI420 use AVX2.…
ARGBToI400 and ARGBToI411 using AVX2. YUY2ToI420 and UVYVToI420 use AVX2. CopyPlane use rep movsb for AVX2. CopyPlane2 use rep movsb for AVX2 and CopyPlane if strides match AVX2, which will do a single rep movsb for entire image if stride == width. MergeUV for I420ToNV12. BUG=181 TESTED=unittests pass Review URL: https://webrtc-codereview.appspot.com/1103007 git-svn-id: http://libyuv.googlecode.com/svn/trunk@569 16f28f9a-4ce2-e073-06de-1de4eb20be90
b444bae8 · fbarchard@google.com · c9562334 · b444bae8 · b444bae8 · b444bae8
Commit b444bae8 authored Feb 11, 2013 by fbarchard@google.com
10 changed files
--- a/README.chromium
+++ b/README.chromium
 Name: libyuv
 URL: http://code.google.com/p/libyuv/
-Version: 568
+Version: 569
 License: BSD
 License File: LICENSE


--- a/include/libyuv/row.h
+++ b/include/libyuv/row.h
@@ -123,12 +123,20 @@ extern "C" {
 // TODO(fbarchard): Port to gcc.
 #if !defined(YUV_DISABLE_ASM) && defined(_M_IX86)
 #define HAS_ARGBCOLORTABLEROW_X86
+#define HAS_COPYROW_AVX2
 // Visual C 2012 required for AVX2.
 #if _MSC_VER >= 1700
 // TODO(fbarchard): Hook these up to all functions. e.g. format conversion.
 #define HAS_ARGBTOYROW_AVX2
 #define HAS_ARGBTOUVROW_AVX2
 #define HAS_SPLITUVROW_AVX2
+#define HAS_MERGEUVROW_AVX2
+#define HAS_YUY2TOUV422ROW_AVX2
+#define HAS_YUY2TOUVROW_AVX2
+#define HAS_YUY2TOYROW_AVX2
+#define HAS_UYVYTOUV422ROW_AVX2
+#define HAS_UYVYTOUVROW_AVX2
+#define HAS_UYVYTOYROW_AVX2
 #endif
 #endif

@@ -375,7 +383,6 @@ void UYVYToARGBRow_NEON(const uint8* src_uyvy,
                        int width);

 void ARGBToYRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix);
-void ARGBToYRow_Unaligned_AVX2(const uint8* src_argb, uint8* dst_y, int pix);
 void ARGBToYRow_Any_AVX2(const uint8* src_argb, uint8* dst_y, int pix);
 void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix);
 void BGRAToYRow_SSSE3(const uint8* src_bgra, uint8* dst_y, int pix);
@@ -449,8 +456,6 @@ void ARGB4444ToYRow_Any_NEON(const uint8* src_argb4444, uint8* dst_y, int pix);

 void ARGBToUVRow_AVX2(const uint8* src_argb, int src_stride_argb,
                      uint8* dst_u, uint8* dst_v, int width);
-void ARGBToUVRow_Unaligned_AVX2(const uint8* src_argb, int src_stride_argb,
-                                uint8* dst_u, uint8* dst_v, int width);
 void ARGBToUVRow_Any_AVX2(const uint8* src_argb, int src_stride_argb,
                          uint8* dst_u, uint8* dst_v, int width);
 void ARGBToUVRow_SSSE3(const uint8* src_argb, int src_stride_argb,
@@ -570,8 +575,6 @@ void SplitUVRow_MIPS_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
                           int pix);
 void SplitUVRow_Unaligned_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
                               int pix);
-void SplitUVRow_Unaligned_AVX2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
-                               int pix);
 void SplitUVRow_Unaligned_MIPS_DSPR2(const uint8* src_uv, uint8* dst_u,
                                     uint8* dst_v, int pix);
 void SplitUVRow_Any_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
@@ -593,8 +596,6 @@ void MergeUVRow_NEON(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
                     int width);
 void MergeUVRow_Unaligned_SSE2(const uint8* src_u, const uint8* src_v,
                               uint8* dst_uv, int width);
-void MergeUVRow_Unaligned_AVX2(const uint8* src_u, const uint8* src_v,
-                               uint8* dst_uv, int width);
 void MergeUVRow_Any_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
                         int width);
 void MergeUVRow_Any_AVX2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
@@ -603,6 +604,7 @@ void MergeUVRow_Any_NEON(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
                         int width);

 void CopyRow_SSE2(const uint8* src, uint8* dst, int count);
+void CopyRow_AVX2(const uint8* src, uint8* dst, int count);
 void CopyRow_X86(const uint8* src, uint8* dst, int count);
 void CopyRow_NEON(const uint8* src, uint8* dst, int count);
 void CopyRow_MIPS(const uint8* src, uint8* dst, int count);
@@ -1154,6 +1156,11 @@ void I422ToABGRRow_MIPS_DSPR2(const uint8* src_y,
                              uint8* dst_argb,
                              int width);

+void YUY2ToYRow_AVX2(const uint8* src_yuy2, uint8* dst_y, int pix);
+void YUY2ToUVRow_AVX2(const uint8* src_yuy2, int stride_yuy2,
+                      uint8* dst_u, uint8* dst_v, int pix);
+void YUY2ToUV422Row_AVX2(const uint8* src_yuy2,
+                         uint8* dst_u, uint8* dst_v, int pix);
 void YUY2ToYRow_SSE2(const uint8* src_yuy2, uint8* dst_y, int pix);
 void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2,
                      uint8* dst_u, uint8* dst_v, int pix);
@@ -1175,6 +1182,11 @@ void YUY2ToUVRow_C(const uint8* src_yuy2, int stride_yuy2,
                   uint8* dst_u, uint8* dst_v, int pix);
 void YUY2ToUV422Row_C(const uint8* src_yuy2,
                      uint8* dst_u, uint8* dst_v, int pix);
+void YUY2ToYRow_Any_AVX2(const uint8* src_yuy2, uint8* dst_y, int pix);
+void YUY2ToUVRow_Any_AVX2(const uint8* src_yuy2, int stride_yuy2,
+                          uint8* dst_u, uint8* dst_v, int pix);
+void YUY2ToUV422Row_Any_AVX2(const uint8* src_yuy2,
+                             uint8* dst_u, uint8* dst_v, int pix);
 void YUY2ToYRow_Any_SSE2(const uint8* src_yuy2, uint8* dst_y, int pix);
 void YUY2ToUVRow_Any_SSE2(const uint8* src_yuy2, int stride_yuy2,
                          uint8* dst_u, uint8* dst_v, int pix);
@@ -1185,7 +1197,11 @@ void YUY2ToUVRow_Any_NEON(const uint8* src_yuy2, int stride_yuy2,
                          uint8* dst_u, uint8* dst_v, int pix);
 void YUY2ToUV422Row_Any_NEON(const uint8* src_yuy2,
                             uint8* dst_u, uint8* dst_v, int pix);
-
+void UYVYToYRow_AVX2(const uint8* src_uyvy, uint8* dst_y, int pix);
+void UYVYToUVRow_AVX2(const uint8* src_uyvy, int stride_uyvy,
+                      uint8* dst_u, uint8* dst_v, int pix);
+void UYVYToUV422Row_AVX2(const uint8* src_uyvy,
+                         uint8* dst_u, uint8* dst_v, int pix);
 void UYVYToYRow_SSE2(const uint8* src_uyvy, uint8* dst_y, int pix);
 void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy,
                      uint8* dst_u, uint8* dst_v, int pix);
@@ -1197,6 +1213,11 @@ void UYVYToUVRow_Unaligned_SSE2(const uint8* src_uyvy, int stride_uyvy,
                                uint8* dst_u, uint8* dst_v, int pix);
 void UYVYToUV422Row_Unaligned_SSE2(const uint8* src_uyvy,
                                   uint8* dst_u, uint8* dst_v, int pix);
+void UYVYToYRow_AVX2(const uint8* src_uyvy, uint8* dst_y, int pix);
+void UYVYToUVRow_AVX2(const uint8* src_uyvy, int stride_uyvy,
+                      uint8* dst_u, uint8* dst_v, int pix);
+void UYVYToUV422Row_AVX2(const uint8* src_uyvy,
+                         uint8* dst_u, uint8* dst_v, int pix);
 void UYVYToYRow_NEON(const uint8* src_uyvy, uint8* dst_y, int pix);
 void UYVYToUVRow_NEON(const uint8* src_uyvy, int stride_uyvy,
                      uint8* dst_u, uint8* dst_v, int pix);
@@ -1208,6 +1229,11 @@ void UYVYToUVRow_C(const uint8* src_uyvy, int stride_uyvy,
                   uint8* dst_u, uint8* dst_v, int pix);
 void UYVYToUV422Row_C(const uint8* src_uyvy,
                      uint8* dst_u, uint8* dst_v, int pix);
+void UYVYToYRow_Any_AVX2(const uint8* src_uyvy, uint8* dst_y, int pix);
+void UYVYToUVRow_Any_AVX2(const uint8* src_uyvy, int stride_uyvy,
+                          uint8* dst_u, uint8* dst_v, int pix);
+void UYVYToUV422Row_Any_AVX2(const uint8* src_uyvy,
+                             uint8* dst_u, uint8* dst_v, int pix);
 void UYVYToYRow_Any_SSE2(const uint8* src_uyvy, uint8* dst_y, int pix);
 void UYVYToUVRow_Any_SSE2(const uint8* src_uyvy, int stride_uyvy,
                          uint8* dst_u, uint8* dst_v, int pix);

--- a/include/libyuv/version.h
+++ b/include/libyuv/version.h
@@ -11,6 +11,6 @@
 #ifndef INCLUDE_LIBYUV_VERSION_H_  // NOLINT
 #define INCLUDE_LIBYUV_VERSION_H_

-#define LIBYUV_VERSION 568
+#define LIBYUV_VERSION 569

 #endif  // INCLUDE_LIBYUV_VERSION_H_  NOLINT
--- a/source/convert.cc
+++ b/source/convert.cc
@@ -290,24 +290,31 @@ int I400ToI420(const uint8* src_y, int src_stride_y,
 }

 static void CopyPlane2(const uint8* src, int src_stride_0, int src_stride_1,
-                       uint8* dst, int dst_stride_frame,
+                       uint8* dst, int dst_stride,
                       int width, int height) {
  void (*CopyRow)(const uint8* src, uint8* dst, int width) = CopyRow_C;
-#if defined(HAS_COPYROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 32)) {
-    CopyRow = CopyRow_NEON;
-  }
-#elif defined(HAS_COPYROW_X86)
-  if (IS_ALIGNED(width, 4)) {
+#if defined(HAS_COPYROW_X86)
+  if (TestCpuFlag(kCpuHasX86) && IS_ALIGNED(width, 4)) {
    CopyRow = CopyRow_X86;
+  }
+#endif
 #if defined(HAS_COPYROW_SSE2)
-    if (TestCpuFlag(kCpuHasSSE2) &&
-        IS_ALIGNED(width, 32) && IS_ALIGNED(src, 16) &&
-        IS_ALIGNED(src_stride_0, 16) && IS_ALIGNED(src_stride_1, 16) &&
-        IS_ALIGNED(dst, 16) && IS_ALIGNED(dst_stride_frame, 16)) {
-      CopyRow = CopyRow_SSE2;
-    }
+  if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 32) &&
+      IS_ALIGNED(src, 16) &&
+      IS_ALIGNED(src_stride_0, 16) && IS_ALIGNED(src_stride_1, 16) &&
+      IS_ALIGNED(dst, 16) && IS_ALIGNED(dst_stride, 16)) {
+    CopyRow = CopyRow_SSE2;
+  }
 #endif
+#if defined(HAS_COPYROW_AVX2)
+  // TODO(fbarchard): Detect Fast String support.
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    CopyRow = CopyRow_AVX2;
+  }
+#endif
+#if defined(HAS_COPYROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 32)) {
+    CopyRow = CopyRow_NEON;
  }
 #endif
 #if defined(HAS_COPYROW_MIPS)
@@ -319,9 +326,9 @@ static void CopyPlane2(const uint8* src, int src_stride_0, int src_stride_1,
  // Copy plane
  for (int y = 0; y < height - 1; y += 2) {
    CopyRow(src, dst, width);
-    CopyRow(src + src_stride_0, dst + dst_stride_frame, width);
+    CopyRow(src + src_stride_0, dst + dst_stride, width);
    src += src_stride_0 + src_stride_1;
-    dst += dst_stride_frame * 2;
+    dst += dst_stride * 2;
  }
  if (height & 1) {
    CopyRow(src, dst, width);
@@ -381,12 +388,7 @@ static int X420ToI420(const uint8* src_y,
  if (TestCpuFlag(kCpuHasAVX2) && halfwidth >= 32) {
    SplitUVRow = SplitUVRow_Any_AVX2;
    if (IS_ALIGNED(halfwidth, 32)) {
-      SplitUVRow = SplitUVRow_Unaligned_AVX2;
-      if (IS_ALIGNED(src_uv, 32) && IS_ALIGNED(src_stride_uv, 32) &&
-          IS_ALIGNED(dst_u, 32) && IS_ALIGNED(dst_stride_u, 32) &&
-          IS_ALIGNED(dst_v, 32) && IS_ALIGNED(dst_stride_v, 32)) {
-        SplitUVRow = SplitUVRow_AVX2;
-      }
+      SplitUVRow = SplitUVRow_AVX2;
    }
  }
 #endif
@@ -413,8 +415,12 @@ static int X420ToI420(const uint8* src_y,
 #endif

  if (dst_y) {
-    CopyPlane2(src_y, src_stride_y0, src_stride_y1, dst_y, dst_stride_y,
-               width, height);
+    if (src_stride_y0 == src_stride_y1) {
+      CopyPlane(src_y, src_stride_y0, dst_y, dst_stride_y, width, height);
+    } else {
+      CopyPlane2(src_y, src_stride_y0, src_stride_y1, dst_y, dst_stride_y,
+                 width, height);
+    }
  }

  int halfheight = (height + 1) >> 1;
@@ -519,6 +525,11 @@ int Q420ToI420(const uint8* src_y, int src_stride_y,
    CopyRow = CopyRow_SSE2;
  }
 #endif
+#if defined(HAS_COPYROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    CopyRow = CopyRow_AVX2;
+  }
+#endif
 #if defined(HAS_COPYROW_MIPS)
  if (TestCpuFlag(kCpuHasMIPS)) {
    CopyRow = CopyRow_MIPS;
@@ -544,7 +555,20 @@ int Q420ToI420(const uint8* src_y, int src_stride_y,
      }
    }
  }
-#elif defined(HAS_YUY2TOYROW_NEON)
+#endif
+#if defined(HAS_YUY2TOYROW_AVX2)
+  bool clear = false;
+  if (TestCpuFlag(kCpuHasAVX2) && width >= 32) {
+    clear = true;
+    YUY2ToUV422Row = YUY2ToUV422Row_Any_AVX2;
+    YUY2ToYRow = YUY2ToYRow_Any_AVX2;
+    if (IS_ALIGNED(width, 32)) {
+      YUY2ToUV422Row = YUY2ToUV422Row_AVX2;
+      YUY2ToYRow = YUY2ToYRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_YUY2TOYROW_NEON)
  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
    YUY2ToYRow = YUY2ToYRow_Any_NEON;
    if (width >= 16) {
@@ -573,6 +597,11 @@ int Q420ToI420(const uint8* src_y, int src_stride_y,
    CopyRow(src_y, dst_y, width);
    YUY2ToUV422Row(src_yuy2, dst_u, dst_v, width);
  }
+#if defined(HAS_YUY2TOYROW_AVX2)
+  if (clear) {
+    __asm vzeroupper;
+  }
+#endif
  return 0;
 }

@@ -610,7 +639,20 @@ int YUY2ToI420(const uint8* src_yuy2, int src_stride_yuy2,
      }
    }
  }
-#elif defined(HAS_YUY2TOYROW_NEON)
+#endif
+#if defined(HAS_YUY2TOYROW_AVX2)
+  bool clear = false;
+  if (TestCpuFlag(kCpuHasAVX2) && width >= 32) {
+    bool clear = true;
+    YUY2ToUVRow = YUY2ToUVRow_Any_AVX2;
+    YUY2ToYRow = YUY2ToYRow_Any_AVX2;
+    if (IS_ALIGNED(width, 32)) {
+      YUY2ToUVRow = YUY2ToUVRow_AVX2;
+      YUY2ToYRow = YUY2ToYRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_YUY2TOYROW_NEON)
  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
    YUY2ToYRow = YUY2ToYRow_Any_NEON;
    if (width >= 16) {
@@ -636,6 +678,12 @@ int YUY2ToI420(const uint8* src_yuy2, int src_stride_yuy2,
    YUY2ToUVRow(src_yuy2, 0, dst_u, dst_v, width);
    YUY2ToYRow(src_yuy2, dst_y, width);
  }
+
+#if defined(HAS_YUY2TOYROW_AVX2)
+  if (clear) {
+    __asm vzeroupper;
+  }
+#endif
  return 0;
 }

@@ -673,7 +721,20 @@ int UYVYToI420(const uint8* src_uyvy, int src_stride_uyvy,
      }
    }
  }
-#elif defined(HAS_UYVYTOYROW_NEON)
+#endif
+#if defined(HAS_UYVYTOYROW_AVX2)
+  bool clear = false;
+  if (TestCpuFlag(kCpuHasAVX2) && width >= 32) {
+    bool clear = true;
+    UYVYToUVRow = UYVYToUVRow_Any_AVX2;
+    UYVYToYRow = UYVYToYRow_Any_AVX2;
+    if (IS_ALIGNED(width, 32)) {
+      UYVYToUVRow = UYVYToUVRow_AVX2;
+      UYVYToYRow = UYVYToYRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_UYVYTOYROW_NEON)
  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
    UYVYToYRow = UYVYToYRow_Any_NEON;
    if (width >= 16) {
@@ -699,6 +760,12 @@ int UYVYToI420(const uint8* src_uyvy, int src_stride_uyvy,
    UYVYToUVRow(src_uyvy, 0, dst_u, dst_v, width);
    UYVYToYRow(src_uyvy, dst_y, width);
  }
+
+#if defined(HAS_UYVYTOYROW_AVX2)
+  if (clear) {
+    __asm vzeroupper;
+  }
+#endif
  return 0;
 }

@@ -747,14 +814,8 @@ int ARGBToI420(const uint8* src_argb, int src_stride_argb,
    ARGBToUVRow = ARGBToUVRow_Any_AVX2;
    ARGBToYRow = ARGBToYRow_Any_AVX2;
    if (IS_ALIGNED(width, 32)) {
-      ARGBToUVRow = ARGBToUVRow_Unaligned_AVX2;
-      ARGBToYRow = ARGBToYRow_Unaligned_AVX2;
-      if (IS_ALIGNED(src_argb, 32) && IS_ALIGNED(src_stride_argb, 32)) {
-       ARGBToUVRow = ARGBToUVRow_AVX2;
-        if (IS_ALIGNED(dst_y, 32) && IS_ALIGNED(dst_stride_y, 32)) {
-         ARGBToYRow = ARGBToYRow_AVX2;
-        }
-      }
+      ARGBToUVRow = ARGBToUVRow_AVX2;
+      ARGBToYRow = ARGBToYRow_AVX2;
    }
  }
 #endif

--- a/source/convert_from.cc
+++ b/source/convert_from.cc
@@ -451,15 +451,12 @@ int I420ToNV12(const uint8* src_y, int src_stride_y,
  }
 #endif
 #if defined(HAS_MERGEUVROW_AVX2)
+  bool clear = false;
  if (TestCpuFlag(kCpuHasAVX2) && halfwidth >= 32) {
+    clear = true;
    MergeUVRow_ = MergeUVRow_Any_AVX2;
    if (IS_ALIGNED(halfwidth, 32)) {
-      MergeUVRow_ = MergeUVRow_Unaligned_AVX2;
-      if (IS_ALIGNED(src_u, 32) && IS_ALIGNED(src_stride_u, 32) &&
-          IS_ALIGNED(src_v, 32) && IS_ALIGNED(src_stride_v, 32) &&
-          IS_ALIGNED(dst_uv, 32) && IS_ALIGNED(dst_stride_uv, 32)) {
-        MergeUVRow_ = MergeUVRow_AVX2;
-      }
+      MergeUVRow_ = MergeUVRow_AVX2;
    }
  }
 #endif
@@ -481,6 +478,12 @@ int I420ToNV12(const uint8* src_y, int src_stride_y,
    src_v += src_stride_v;
    dst_uv += dst_stride_uv;
  }
+#if defined(HAS_MERGEUVROW_AVX2)
+  if (clear) {
+    __asm vzeroupper;
+  }
+#endif
+
  return 0;
 }


--- a/source/convert_from_argb.cc
+++ b/source/convert_from_argb.cc
@@ -183,7 +183,18 @@ int ARGBToI411(const uint8* src_argb, int src_stride_argb,
      }
    }
  }
-#elif defined(HAS_ARGBTOYROW_NEON)
+#endif
+#if defined(HAS_ARGBTOYROW_AVX2)
+  bool clear = false;
+  if (TestCpuFlag(kCpuHasAVX2) && width >= 32) {
+    clear = true;
+    ARGBToYRow = ARGBToYRow_Any_AVX2;
+    if (IS_ALIGNED(width, 32)) {
+      ARGBToYRow = ARGBToYRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOYROW_NEON)
  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
    ARGBToYRow = ARGBToYRow_Any_NEON;
    if (IS_ALIGNED(width, 8)) {
@@ -206,6 +217,11 @@ int ARGBToI411(const uint8* src_argb, int src_stride_argb,
    dst_u += dst_stride_u;
    dst_v += dst_stride_v;
  }
+#if defined(HAS_ARGBTOYROW_AVX2)
+  if (clear) {
+    __asm vzeroupper;
+  }
+#endif
  return 0;
 }

@@ -277,10 +293,7 @@ int ARGBToNV12(const uint8* src_argb, int src_stride_argb,
  if (TestCpuFlag(kCpuHasAVX2) && halfwidth >= 32) {
    MergeUVRow_ = MergeUVRow_Any_AVX2;
    if (IS_ALIGNED(halfwidth, 32)) {
-      MergeUVRow_ = MergeUVRow_Unaligned_AVX2;
-      if (IS_ALIGNED(dst_uv, 32) && IS_ALIGNED(dst_stride_uv, 32)) {
-        MergeUVRow_ = MergeUVRow_AVX2;
-      }
+      MergeUVRow_ = MergeUVRow_AVX2;
    }
  }
 #endif
@@ -383,10 +396,7 @@ int ARGBToNV21(const uint8* src_argb, int src_stride_argb,
  if (TestCpuFlag(kCpuHasAVX2) && halfwidth >= 32) {
    MergeUVRow_ = MergeUVRow_Any_AVX2;
    if (IS_ALIGNED(halfwidth, 32)) {
-      MergeUVRow_ = MergeUVRow_Unaligned_AVX2;
-      if (IS_ALIGNED(dst_uv, 32) && IS_ALIGNED(dst_stride_uv, 32)) {
-        MergeUVRow_ = MergeUVRow_AVX2;
-      }
+      MergeUVRow_ = MergeUVRow_AVX2;
    }
  }
 #endif
@@ -624,7 +634,18 @@ int ARGBToI400(const uint8* src_argb, int src_stride_argb,
      }
    }
  }
-#elif defined(HAS_ARGBTOYROW_NEON)
+#endif
+#if defined(HAS_ARGBTOYROW_AVX2)
+  bool clear = false;
+  if (TestCpuFlag(kCpuHasAVX2) && width >= 32) {
+    clear = true;
+    ARGBToYRow = ARGBToYRow_Any_AVX2;
+    if (IS_ALIGNED(width, 32)) {
+      ARGBToYRow = ARGBToYRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOYROW_NEON)
  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
    ARGBToYRow = ARGBToYRow_Any_NEON;
    if (IS_ALIGNED(width, 8)) {
@@ -638,6 +659,11 @@ int ARGBToI400(const uint8* src_argb, int src_stride_argb,
    src_argb += src_stride_argb;
    dst_y += dst_stride_y;
  }
+#if defined(HAS_ARGBTOYROW_AVX2)
+  if (clear) {
+    __asm vzeroupper;
+  }
+#endif
  return 0;
 }


--- a/source/planar_functions.cc
+++ b/source/planar_functions.cc
@@ -34,11 +34,6 @@ void CopyPlane(const uint8* src_y, int src_stride_y,
  }

  void (*CopyRow)(const uint8* src, uint8* dst, int width) = CopyRow_C;
-#if defined(HAS_COPYROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 32)) {
-    CopyRow = CopyRow_NEON;
-  }
-#endif
 #if defined(HAS_COPYROW_X86)
  if (TestCpuFlag(kCpuHasX86) && IS_ALIGNED(width, 4)) {
    CopyRow = CopyRow_X86;
@@ -51,6 +46,17 @@ void CopyPlane(const uint8* src_y, int src_stride_y,
    CopyRow = CopyRow_SSE2;
  }
 #endif
+#if defined(HAS_COPYROW_AVX2)
+  // TODO(fbarchard): Detect Fast String support.
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    CopyRow = CopyRow_AVX2;
+  }
+#endif
+#if defined(HAS_COPYROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 32)) {
+    CopyRow = CopyRow_NEON;
+  }
+#endif
 #if defined(HAS_COPYROW_MIPS)
  if (TestCpuFlag(kCpuHasMIPS)) {
    CopyRow = CopyRow_MIPS;
@@ -119,11 +125,11 @@ void MirrorPlane(const uint8* src_y, int src_stride_y,
  }
 #endif
 #if defined(HAS_MIRRORROW_SSSE3)
-    if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 16) &&
-        IS_ALIGNED(src_y, 16) && IS_ALIGNED(src_stride_y, 16) &&
-        IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
-      MirrorRow = MirrorRow_SSSE3;
-    }
+  if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 16) &&
+      IS_ALIGNED(src_y, 16) && IS_ALIGNED(src_stride_y, 16) &&
+      IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
+    MirrorRow = MirrorRow_SSSE3;
+  }
 #endif

  // Mirror plane
@@ -148,17 +154,15 @@ int YUY2ToI422(const uint8* src_yuy2, int src_stride_yuy2,
    src_stride_yuy2 = -src_stride_yuy2;
  }
  void (*YUY2ToUV422Row)(const uint8* src_yuy2,
-                      uint8* dst_u, uint8* dst_v, int pix);
+                         uint8* dst_u, uint8* dst_v, int pix);
  void (*YUY2ToYRow)(const uint8* src_yuy2,
                     uint8* dst_y, int pix);
  YUY2ToYRow = YUY2ToYRow_C;
  YUY2ToUV422Row = YUY2ToUV422Row_C;
 #if defined(HAS_YUY2TOYROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2)) {
-    if (width > 16) {
-      YUY2ToUV422Row = YUY2ToUV422Row_Any_SSE2;
-      YUY2ToYRow = YUY2ToYRow_Any_SSE2;
-    }
+  if (TestCpuFlag(kCpuHasSSE2) && width >= 16) {
+    YUY2ToUV422Row = YUY2ToUV422Row_Any_SSE2;
+    YUY2ToYRow = YUY2ToYRow_Any_SSE2;
    if (IS_ALIGNED(width, 16)) {
      YUY2ToUV422Row = YUY2ToUV422Row_Unaligned_SSE2;
      YUY2ToYRow = YUY2ToYRow_Unaligned_SSE2;
@@ -170,13 +174,24 @@ int YUY2ToI422(const uint8* src_yuy2, int src_stride_yuy2,
      }
    }
  }
-#elif defined(HAS_YUY2TOYROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON)) {
-    if (width > 8) {
-      YUY2ToYRow = YUY2ToYRow_Any_NEON;
-      if (width > 16) {
-        YUY2ToUV422Row = YUY2ToUV422Row_Any_NEON;
-      }
+#endif
+#if defined(HAS_YUY2TOYROW_AVX2)
+  bool clear = false;
+  if (TestCpuFlag(kCpuHasAVX2) && width >= 32) {
+    bool clear = true;
+    YUY2ToUV422Row = YUY2ToUV422Row_Any_AVX2;
+    YUY2ToYRow = YUY2ToYRow_Any_AVX2;
+    if (IS_ALIGNED(width, 32)) {
+      YUY2ToUV422Row = YUY2ToUV422Row_AVX2;
+      YUY2ToYRow = YUY2ToYRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_YUY2TOYROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+    YUY2ToYRow = YUY2ToYRow_Any_NEON;
+    if (width >= 16) {
+      YUY2ToUV422Row = YUY2ToUV422Row_Any_NEON;
    }
    if (IS_ALIGNED(width, 16)) {
      YUY2ToYRow = YUY2ToYRow_NEON;
@@ -193,6 +208,12 @@ int YUY2ToI422(const uint8* src_yuy2, int src_stride_yuy2,
    dst_u += dst_stride_u;
    dst_v += dst_stride_v;
  }
+
+#if defined(HAS_YUY2TOYROW_AVX2)
+  if (clear) {
+    __asm vzeroupper;
+  }
+#endif
  return 0;
 }

@@ -210,17 +231,15 @@ int UYVYToI422(const uint8* src_uyvy, int src_stride_uyvy,
    src_stride_uyvy = -src_stride_uyvy;
  }
  void (*UYVYToUV422Row)(const uint8* src_uyvy,
-                      uint8* dst_u, uint8* dst_v, int pix);
+                         uint8* dst_u, uint8* dst_v, int pix);
  void (*UYVYToYRow)(const uint8* src_uyvy,
                     uint8* dst_y, int pix);
  UYVYToYRow = UYVYToYRow_C;
  UYVYToUV422Row = UYVYToUV422Row_C;
 #if defined(HAS_UYVYTOYROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2)) {
-    if (width > 16) {
-      UYVYToUV422Row = UYVYToUV422Row_Any_SSE2;
-      UYVYToYRow = UYVYToYRow_Any_SSE2;
-    }
+  if (TestCpuFlag(kCpuHasSSE2) && width >= 16) {
+    UYVYToUV422Row = UYVYToUV422Row_Any_SSE2;
+    UYVYToYRow = UYVYToYRow_Any_SSE2;
    if (IS_ALIGNED(width, 16)) {
      UYVYToUV422Row = UYVYToUV422Row_Unaligned_SSE2;
      UYVYToYRow = UYVYToYRow_Unaligned_SSE2;
@@ -232,13 +251,24 @@ int UYVYToI422(const uint8* src_uyvy, int src_stride_uyvy,
      }
    }
  }
-#elif defined(HAS_UYVYTOYROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON)) {
-    if (width > 8) {
-      UYVYToYRow = UYVYToYRow_Any_NEON;
-      if (width > 16) {
-        UYVYToUV422Row = UYVYToUV422Row_Any_NEON;
-      }
+#endif
+#if defined(HAS_UYVYTOYROW_AVX2)
+  bool clear = false;
+  if (TestCpuFlag(kCpuHasAVX2) && width >= 32) {
+    bool clear = true;
+    UYVYToUV422Row = UYVYToUV422Row_Any_AVX2;
+    UYVYToYRow = UYVYToYRow_Any_AVX2;
+    if (IS_ALIGNED(width, 32)) {
+      UYVYToUV422Row = UYVYToUV422Row_AVX2;
+      UYVYToYRow = UYVYToYRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_UYVYTOYROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+    UYVYToYRow = UYVYToYRow_Any_NEON;
+    if (width >= 16) {
+      UYVYToUV422Row = UYVYToUV422Row_Any_NEON;
    }
    if (IS_ALIGNED(width, 16)) {
      UYVYToYRow = UYVYToYRow_NEON;
@@ -255,6 +285,12 @@ int UYVYToI422(const uint8* src_uyvy, int src_stride_uyvy,
    dst_u += dst_stride_u;
    dst_v += dst_stride_v;
  }
+
+#if defined(HAS_UYVYTOYROW_AVX2)
+  if (clear) {
+    __asm vzeroupper;
+  }
+#endif
  return 0;
 }


--- a/source/row_any.cc
+++ b/source/row_any.cc
@@ -196,7 +196,9 @@ BAYERANY(ARGBToBayerRow_Any_NEON, ARGBToBayerRow_NEON, ARGBToBayerRow_C,
    }

 #ifdef HAS_ARGBTOYROW_AVX2
-YANY(ARGBToYRow_Any_AVX2, ARGBToYRow_Unaligned_AVX2, 4, 1, 32)
+YANY(ARGBToYRow_Any_AVX2, ARGBToYRow_AVX2, 4, 1, 32)
+YANY(YUY2ToYRow_Any_AVX2, YUY2ToYRow_AVX2, 2, 1, 32)
+YANY(UYVYToYRow_Any_AVX2, UYVYToYRow_AVX2, 2, 1, 32)
 #endif
 #ifdef HAS_ARGBTOYROW_SSSE3
 YANY(ARGBToYRow_Any_SSSE3, ARGBToYRow_Unaligned_SSSE3, 4, 1, 16)
@@ -266,7 +268,9 @@ YANY(ARGBAttenuateRow_Any_NEON, ARGBAttenuateRow_NEON, ARGBAttenuateRow_C,
    }

 #ifdef HAS_ARGBTOYROW_AVX2
-UVANY(ARGBToUVRow_Any_AVX2, ARGBToUVRow_Unaligned_AVX2, ARGBToUVRow_C, 4, 31)
+UVANY(ARGBToUVRow_Any_AVX2, ARGBToUVRow_AVX2, ARGBToUVRow_C, 4, 31)
+UVANY(YUY2ToUVRow_Any_AVX2, YUY2ToUVRow_AVX2, YUY2ToUVRow_C, 2, 31)
+UVANY(UYVYToUVRow_Any_AVX2, UYVYToUVRow_AVX2, UYVYToUVRow_C, 2, 31)
 #endif
 #ifdef HAS_ARGBTOUVROW_SSSE3
 UVANY(ARGBToUVRow_Any_SSSE3, ARGBToUVRow_Unaligned_SSSE3, ARGBToUVRow_C, 4, 15)
@@ -306,6 +310,12 @@ UVANY(UYVYToUVRow_Any_NEON, UYVYToUVRow_NEON, UYVYToUVRow_C, 2, 15)
 UV422ANY(ARGBToUV444Row_Any_SSSE3, ARGBToUV444Row_Unaligned_SSSE3,
         ARGBToUV444Row_C, 4, 15, 0)
 #endif
+#ifdef HAS_YUY2TOUV422ROW_AVX2
+UV422ANY(YUY2ToUV422Row_Any_AVX2, YUY2ToUV422Row_AVX2,
+         YUY2ToUV422Row_C, 2, 31, 1)
+UV422ANY(UYVYToUV422Row_Any_AVX2, UYVYToUV422Row_AVX2,
+         UYVYToUV422Row_C, 2, 31, 1)
+#endif
 #ifdef HAS_ARGBTOUVROW_SSSE3
 UV422ANY(ARGBToUV422Row_Any_SSSE3, ARGBToUV422Row_Unaligned_SSSE3,
         ARGBToUV422Row_C, 4, 15, 1)
@@ -343,7 +353,7 @@ UV422ANY(UYVYToUV422Row_Any_NEON, UYVYToUV422Row_NEON,
 SPLITUVROWANY(SplitUVRow_Any_SSE2, SplitUVRow_Unaligned_SSE2, SplitUVRow_C, 15)
 #endif
 #ifdef HAS_SPLITUVROW_AVX2
-SPLITUVROWANY(SplitUVRow_Any_AVX2, SplitUVRow_Unaligned_AVX2, SplitUVRow_C, 31)
+SPLITUVROWANY(SplitUVRow_Any_AVX2, SplitUVRow_AVX2, SplitUVRow_C, 31)
 #endif
 #ifdef HAS_SPLITUVROW_NEON
 SPLITUVROWANY(SplitUVRow_Any_NEON, SplitUVRow_NEON, SplitUVRow_C, 15)
@@ -369,7 +379,7 @@ SPLITUVROWANY(SplitUVRow_Any_MIPS_DSPR2, SplitUVRow_Unaligned_MIPS_DSPR2,
 MERGEUVROW_ANY(MergeUVRow_Any_SSE2, MergeUVRow_Unaligned_SSE2, MergeUVRow_C, 15)
 #endif
 #ifdef HAS_MERGEUVROW_AVX2
-MERGEUVROW_ANY(MergeUVRow_Any_AVX2, MergeUVRow_Unaligned_AVX2, MergeUVRow_C, 31)
+MERGEUVROW_ANY(MergeUVRow_Any_AVX2, MergeUVRow_AVX2, MergeUVRow_C, 31)
 #endif
 #ifdef HAS_MERGEUVROW_NEON
 MERGEUVROW_ANY(MergeUVRow_Any_NEON, MergeUVRow_NEON, MergeUVRow_C, 15)

--- a/source/row_win.cc
+++ b/source/row_win.cc
--- a/source/row_x86.asm
+++ b/source/row_x86.asm
@@ -64,9 +64,7 @@ YUY2TOYROW UYVY,a,
 YUY2TOYROW UYVY,u,_Unaligned
 INIT_YMM AVX2
 YUY2TOYROW YUY2,a,
-YUY2TOYROW YUY2,u,_Unaligned
 YUY2TOYROW UYVY,a,
-YUY2TOYROW UYVY,u,_Unaligned

 ; void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix)

@@ -107,7 +105,6 @@ SplitUVRow a,
 SplitUVRow u,_Unaligned
 INIT_YMM AVX2
 SplitUVRow a,
-SplitUVRow u,_Unaligned

 ; void MergeUVRow_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
 ;                      int width);
@@ -121,11 +118,17 @@ cglobal MergeUVRow_%2, 4, 4, 3, src_u, src_v, dst_uv, pix
    mov%1      m0, [src_uq]
    mov%1      m1, [src_vq]
    lea        src_uq, [src_uq + mmsize]
-    mova       m2, m0
-    punpcklbw  m0, m0, m1       // first 8 UV pairs
-    punpckhbw  m2, m2, m1       // next 8 UV pairs
-    mov%1      [dst_uvq], m0
+    punpcklbw  m2, m0, m1       // first 8 UV pairs
+    punpckhbw  m0, m0, m1       // next 8 UV pairs
+%if cpuflag(AVX2)
+    vperm2i128 m1, m2, m0, 0x20  // low 128 of ymm2 and low 128 of ymm0
+    vperm2i128 m2, m2, m0, 0x31  // high 128 of ymm2 and high 128 of ymm0
+    mov%1      [dst_uvq], m1
    mov%1      [dst_uvq + mmsize], m2
+%else
+    mov%1      [dst_uvq], m2
+    mov%1      [dst_uvq + mmsize], m0
+%endif
    lea        dst_uvq, [dst_uvq + mmsize * 2]
    sub        pixd, mmsize
    jg         .convertloop
@@ -140,4 +143,4 @@ MergeUVRow_ a,
 MergeUVRow_ u,_Unaligned
 INIT_YMM AVX2
 MergeUVRow_ a,
-MergeUVRow_ u,_Unaligned
+