ARGBShuffle AVX2

BUG=196 TESTED=BGRAToARGB* Review URL: https://webrtc-codereview.appspot.com/1171006 git-svn-id: http://libyuv.googlecode.com/svn/trunk@596 16f28f9a-4ce2-e073-06de-1de4eb20be90

ARGBShuffle AVX2
BUG=196 TESTED=BGRAToARGB* Review URL: https://webrtc-codereview.appspot.com/1171006 git-svn-id: http://libyuv.googlecode.com/svn/trunk@596 16f28f9a-4ce2-e073-06de-1de4eb20be90
1096543e · fbarchard@google.com · 304a611d · 1096543e · 1096543e · 1096543e
Commit 1096543e authored Mar 08, 2013 by fbarchard@google.com
12 changed files
--- a/README.chromium
+++ b/README.chromium
 Name: libyuv
 URL: http://code.google.com/p/libyuv/
-Version: 595
+Version: 596
 License: BSD
 License File: LICENSE

--- a/include/libyuv/planar_functions.h
+++ b/include/libyuv/planar_functions.h
@@ -340,6 +340,13 @@ void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
 #define HAS_ARGBAFFINEROW_SSE2
 #endif  // LIBYUV_DISABLE_X86
+// Shuffle ARGB channel order.  e.g. BGRA to ARGB.
+// shuffler is 16 bytes and must be aligned.
+LIBYUV_API
+int ARGBShuffle(const uint8* src_bgra, int src_stride_bgra,
+                uint8* dst_argb, int dst_stride_argb,
+                const uint8* shuffler, int width, int height);
 #ifdef __cplusplus
 }  // extern "C"
 }  // namespace libyuv

--- a/include/libyuv/row.h
+++ b/include/libyuv/row.h
@@ -35,8 +35,6 @@ extern "C" {
 #if !defined(LIBYUV_DISABLE_X86) && \
    (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__))
 // Conversions.
-#define HAS_ABGRTOARGBROW_SSSE3
-#define HAS_ABGRTOARGBROW_SSSE3
 #define HAS_ABGRTOUVROW_SSSE3
 #define HAS_ABGRTOYROW_SSSE3
 #define HAS_ARGB1555TOARGBROW_SSE2
@@ -47,12 +45,10 @@ extern "C" {
 #define HAS_ARGBTORAWROW_SSSE3
 #define HAS_ARGBTORGB24ROW_SSSE3
 #define HAS_ARGBTORGB565ROW_SSE2
-#define HAS_ARGBTORGBAROW_SSSE3
 #define HAS_ARGBTOUV422ROW_SSSE3
 #define HAS_ARGBTOUV444ROW_SSSE3
 #define HAS_ARGBTOUVROW_SSSE3
 #define HAS_ARGBTOYROW_SSSE3
-#define HAS_BGRATOARGBROW_SSSE3
 #define HAS_BGRATOUVROW_SSSE3
 #define HAS_BGRATOYROW_SSSE3
 #define HAS_COPYROW_SSE2
@@ -84,7 +80,6 @@ extern "C" {
 #define HAS_RGB24TOARGBROW_SSSE3
 #define HAS_RGB24TOYROW_SSSE3
 #define HAS_RGB565TOARGBROW_SSE2
-#define HAS_RGBATOARGBROW_SSSE3
 #define HAS_RGBATOUVROW_SSSE3
 #define HAS_RGBATOYROW_SSSE3
 #define HAS_SETROW_X86
@@ -98,6 +93,7 @@ extern "C" {
 #define HAS_YUY2TOUV422ROW_SSE2
 #define HAS_YUY2TOUVROW_SSE2
 #define HAS_YUY2TOYROW_SSE2
+#define HAS_ARGBSHUFFLEROW_SSSE3
 // Effects
 #define HAS_ARGBADDROW_SSE2
@@ -140,6 +136,7 @@ extern "C" {
 #define HAS_HALFROW_AVX2
 #define HAS_MIRRORROW_AVX2
 #define HAS_ARGBMIRRORROW_AVX2
+#define HAS_ARGBSHUFFLEROW_AVX2
 // Effects
 #define HAS_ARGBATTENUATEROW_AVX2
@@ -177,7 +174,6 @@ extern "C" {
 // The following are available on Neon platforms
 #if !defined(LIBYUV_DISABLE_NEON) && \
    (defined(__ARM_NEON__) || defined(LIBYUV_NEON))
-#define HAS_ABGRTOARGBROW_NEON
 #define HAS_ABGRTOUVROW_NEON
 #define HAS_ABGRTOYROW_NEON
 #define HAS_ARGB1555TOARGBROW_NEON
@@ -192,13 +188,11 @@ extern "C" {
 #define HAS_ARGBTORAWROW_NEON
 #define HAS_ARGBTORGB24ROW_NEON
 #define HAS_ARGBTORGB565ROW_NEON
-#define HAS_ARGBTORGBAROW_NEON
 #define HAS_ARGBTOUV411ROW_NEON
 #define HAS_ARGBTOUV422ROW_NEON
 #define HAS_ARGBTOUV444ROW_NEON
 #define HAS_ARGBTOUVROW_NEON
 #define HAS_ARGBTOYROW_NEON
-#define HAS_BGRATOARGBROW_NEON
 #define HAS_BGRATOUVROW_NEON
 #define HAS_BGRATOYROW_NEON
 #define HAS_COPYROW_NEON
@@ -233,7 +227,6 @@ extern "C" {
 #define HAS_RGB565TOARGBROW_NEON
 #define HAS_RGB565TOUVROW_NEON
 #define HAS_RGB565TOYROW_NEON
-#define HAS_RGBATOARGBROW_NEON
 #define HAS_RGBATOUVROW_NEON
 #define HAS_RGBATOYROW_NEON
 #define HAS_SETROW_NEON
@@ -633,11 +626,24 @@ void SetRow_C(uint8* dst, uint32 v32, int count);
 void ARGBSetRows_C(uint8* dst, uint32 v32, int width, int dst_stride,
                   int height);
-void BGRAToARGBRow_SSSE3(const uint8* src_bgra, uint8* dst_argb, int pix);
+// ARGBShufflers for BGRAToARGB etc.
-void ABGRToARGBRow_SSSE3(const uint8* src_abgr, uint8* dst_argb, int pix);
+void ARGBShuffleRow_C(const uint8* src_argb, uint8* dst_argb,
-void ABGRToARGBRow_Unaligned_SSSE3(const uint8* src_abgr, uint8* dst_argb,
+                      const uint8* shuffler, int pix);
-                                   int pix);
+void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
-void RGBAToARGBRow_SSSE3(const uint8* src_rgba, uint8* dst_argb, int pix);
+                          const uint8* shuffler, int pix);
+void ARGBShuffleRow_AVX2(const uint8* src_argb, uint8* dst_argb,
+                         const uint8* shuffler, int pix);
+void ARGBShuffleRow_NEON(const uint8* src_argb, uint8* dst_argb,
+                         const uint8* shuffler, int pix);
+void ARGBShuffleRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_argb,
+                                    const uint8* shuffler, int pix);
+void ARGBShuffleRow_Any_SSSE3(const uint8* src_argb, uint8* dst_argb,
+                              const uint8* shuffler, int pix);
+void ARGBShuffleRow_Any_AVX2(const uint8* src_argb, uint8* dst_argb,
+                             const uint8* shuffler, int pix);
+void ARGBShuffleRow_Any_NEON(const uint8* src_argb, uint8* dst_argb,
+                             const uint8* shuffler, int pix);
 void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix);
 void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, int pix);
 void RGB565ToARGBRow_SSE2(const uint8* src_rgb565, uint8* dst_argb, int pix);
@@ -645,9 +651,7 @@ void ARGB1555ToARGBRow_SSE2(const uint8* src_argb1555, uint8* dst_argb,
                            int pix);
 void ARGB4444ToARGBRow_SSE2(const uint8* src_argb4444, uint8* dst_argb,
                            int pix);
-void BGRAToARGBRow_NEON(const uint8* src_bgra, uint8* dst_argb, int pix);
-void ABGRToARGBRow_NEON(const uint8* src_abgr, uint8* dst_argb, int pix);
-void RGBAToARGBRow_NEON(const uint8* src_rgba, uint8* dst_argb, int pix);
 void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int pix);
 void RAWToARGBRow_NEON(const uint8* src_raw, uint8* dst_argb, int pix);
 void RGB565ToARGBRow_NEON(const uint8* src_rgb565, uint8* dst_argb, int pix);
@@ -655,9 +659,6 @@ void ARGB1555ToARGBRow_NEON(const uint8* src_argb1555, uint8* dst_argb,
                            int pix);
 void ARGB4444ToARGBRow_NEON(const uint8* src_argb4444, uint8* dst_argb,
                            int pix);
-void BGRAToARGBRow_C(const uint8* src_bgra, uint8* dst_argb, int pix);
-void ABGRToARGBRow_C(const uint8* src_abgr, uint8* dst_argb, int pix);
-void RGBAToARGBRow_C(const uint8* src_rgba, uint8* dst_argb, int pix);
 void RGB24ToARGBRow_C(const uint8* src_rgb24, uint8* dst_argb, int pix);
 void RAWToARGBRow_C(const uint8* src_raw, uint8* dst_argb, int pix);
 void RGB565ToARGBRow_C(const uint8* src_rgb, uint8* dst_argb, int pix);
@@ -680,14 +681,12 @@ void ARGB1555ToARGBRow_Any_NEON(const uint8* src_argb1555, uint8* dst_argb,
 void ARGB4444ToARGBRow_Any_NEON(const uint8* src_argb4444, uint8* dst_argb,
                                int pix);
-void ARGBToRGBARow_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix);
 void ARGBToRGB24Row_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix);
 void ARGBToRAWRow_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix);
 void ARGBToRGB565Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix);
 void ARGBToARGB1555Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix);
 void ARGBToARGB4444Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix);
-void ARGBToRGBARow_NEON(const uint8* src_argb, uint8* dst_rgb, int pix);
 void ARGBToRGB24Row_NEON(const uint8* src_argb, uint8* dst_rgb, int pix);
 void ARGBToRAWRow_NEON(const uint8* src_argb, uint8* dst_rgb, int pix);
 void ARGBToRGB565Row_NEON(const uint8* src_argb, uint8* dst_rgb, int pix);

--- a/include/libyuv/version.h
+++ b/include/libyuv/version.h
@@ -11,6 +11,6 @@
 #ifndef INCLUDE_LIBYUV_VERSION_H_  // NOLINT
 #define INCLUDE_LIBYUV_VERSION_H_
-#define LIBYUV_VERSION 595
+#define LIBYUV_VERSION 596
 #endif  // INCLUDE_LIBYUV_VERSION_H_  NOLINT
--- a/source/convert_argb.cc
+++ b/source/convert_argb.cc
@@ -15,6 +15,7 @@
 #ifdef HAVE_JPEG
 #include "libyuv/mjpeg_decoder.h"
 #endif
+#include "libyuv/planar_functions.h"
 #include "libyuv/rotate_argb.h"
 #include "libyuv/row.h"
 #include "libyuv/video_common.h"
@@ -295,41 +296,30 @@ int I400ToARGB(const uint8* src_y, int src_stride_y,
  return 0;
 }
+// Shuffle table for converting BGRA to ARGB.
+static const uvec8 kShuffleMaskBGRAToARGB = {
+  3u, 2u, 1u, 0u, 7u, 6u, 5u, 4u, 11u, 10u, 9u, 8u, 15u, 14u, 13u, 12u
+};
+// Shuffle table for converting ABGR to ARGB.
+static const uvec8 kShuffleMaskABGRToARGB = {
+  2u, 1u, 0u, 3u, 6u, 5u, 4u, 7u, 10u, 9u, 8u, 11u, 14u, 13u, 12u, 15u
+};
+// Shuffle table for converting RGBA to ARGB.
+static const uvec8 kShuffleMaskRGBAToARGB = {
+  1u, 2u, 3u, 0u, 5u, 6u, 7u, 4u, 9u, 10u, 11u, 8u, 13u, 14u, 15u, 12u
+};
 // Convert BGRA to ARGB.
 LIBYUV_API
 int BGRAToARGB(const uint8* src_bgra, int src_stride_bgra,
               uint8* dst_argb, int dst_stride_argb,
               int width, int height) {
-  if (!src_bgra || !dst_argb ||
+  return ARGBShuffle(src_bgra, src_stride_bgra,
-      width <= 0 || height == 0) {
+                     dst_argb, dst_stride_argb,
-    return -1;
+                     reinterpret_cast<const uint8*>(&kShuffleMaskBGRAToARGB),
-  }
+                     width, height);
-  // Negative height means invert the image.
-  if (height < 0) {
-    height = -height;
-    src_bgra = src_bgra + (height - 1) * src_stride_bgra;
-    src_stride_bgra = -src_stride_bgra;
-  }
-  void (*BGRAToARGBRow)(const uint8* src_bgra, uint8* dst_argb, int pix) =
-      BGRAToARGBRow_C;
-#if defined(HAS_BGRATOARGBROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 4) &&
-      IS_ALIGNED(src_bgra, 16) && IS_ALIGNED(src_stride_bgra, 16) &&
-      IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
-    BGRAToARGBRow = BGRAToARGBRow_SSSE3;
-  }
-#elif defined(HAS_BGRATOARGBROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) {
-    BGRAToARGBRow = BGRAToARGBRow_NEON;
-  }
-#endif
-  for (int y = 0; y < height; ++y) {
-    BGRAToARGBRow(src_bgra, dst_argb, width);
-    src_bgra += src_stride_bgra;
-    dst_argb += dst_stride_argb;
-  }
-  return 0;
 }
 // Convert ABGR to ARGB.
@@ -337,40 +327,10 @@ LIBYUV_API
 int ABGRToARGB(const uint8* src_abgr, int src_stride_abgr,
               uint8* dst_argb, int dst_stride_argb,
               int width, int height) {
-  if (!src_abgr || !dst_argb ||
+  return ARGBShuffle(src_abgr, src_stride_abgr,
-      width <= 0 || height == 0) {
+                     dst_argb, dst_stride_argb,
-    return -1;
+                     reinterpret_cast<const uint8*>(&kShuffleMaskABGRToARGB),
-  }
+                     width, height);
-  // Negative height means invert the image.
-  if (height < 0) {
-    height = -height;
-    src_abgr = src_abgr + (height - 1) * src_stride_abgr;
-    src_stride_abgr = -src_stride_abgr;
-  }
-  void (*ABGRToARGBRow)(const uint8* src_abgr, uint8* dst_argb, int pix) =
-      ABGRToARGBRow_C;
-#if defined(HAS_ABGRTOARGBROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 4)) {
-    // TODO(fbarchard): Port to posix.
-#if defined(_M_IX86)
-      ABGRToARGBRow = ABGRToARGBRow_Unaligned_SSSE3;
-#endif
-      if (IS_ALIGNED(src_abgr, 16) && IS_ALIGNED(src_stride_abgr, 16) &&
-          IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16))
-        ABGRToARGBRow = ABGRToARGBRow_SSSE3;
-  }
-#elif defined(HAS_ABGRTOARGBROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) {
-    ABGRToARGBRow = ABGRToARGBRow_NEON;
-  }
-#endif
-  for (int y = 0; y < height; ++y) {
-    ABGRToARGBRow(src_abgr, dst_argb, width);
-    src_abgr += src_stride_abgr;
-    dst_argb += dst_stride_argb;
-  }
-  return 0;
 }
 // Convert RGBA to ARGB.
@@ -378,36 +338,10 @@ LIBYUV_API
 int RGBAToARGB(const uint8* src_rgba, int src_stride_rgba,
               uint8* dst_argb, int dst_stride_argb,
               int width, int height) {
-  if (!src_rgba || !dst_argb ||
+  return ARGBShuffle(src_rgba, src_stride_rgba,
-      width <= 0 || height == 0) {
+                     dst_argb, dst_stride_argb,
-    return -1;
+                     reinterpret_cast<const uint8*>(&kShuffleMaskRGBAToARGB),
-  }
+                     width, height);
-  // Negative height means invert the image.
-  if (height < 0) {
-    height = -height;
-    src_rgba = src_rgba + (height - 1) * src_stride_rgba;
-    src_stride_rgba = -src_stride_rgba;
-  }
-  void (*RGBAToARGBRow)(const uint8* src_rgba, uint8* dst_argb, int pix) =
-      RGBAToARGBRow_C;
-#if defined(HAS_RGBATOARGBROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 4) &&
-      IS_ALIGNED(src_rgba, 16) && IS_ALIGNED(src_stride_rgba, 16) &&
-      IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
-    RGBAToARGBRow = RGBAToARGBRow_SSSE3;
-  }
-#elif defined(HAS_RGBATOARGBROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) {
-    RGBAToARGBRow = RGBAToARGBRow_NEON;
-  }
-#endif
-  for (int y = 0; y < height; ++y) {
-    RGBAToARGBRow(src_rgba, dst_argb, width);
-    src_rgba += src_stride_rgba;
-    dst_argb += dst_stride_argb;
-  }
-  return 0;
 }
 // Convert RGB24 to ARGB.

--- a/source/convert_from_argb.cc
+++ b/source/convert_from_argb.cc
@@ -667,42 +667,20 @@ int ARGBToI400(const uint8* src_argb, int src_stride_argb,
  return 0;
 }
+// Shuffle table for converting ARGB to RGBA.
+static const uvec8 kShuffleMaskARGBToRGBA = {
+  3u, 0u, 1u, 2u, 7u, 4u, 5u, 6u, 11u, 8u, 9u, 10u, 15u, 12u, 13u, 14u
+};
 // Convert ARGB to RGBA.
 LIBYUV_API
 int ARGBToRGBA(const uint8* src_argb, int src_stride_argb,
               uint8* dst_rgba, int dst_stride_rgba,
               int width, int height) {
-  if (!src_argb || !dst_rgba ||
+  return ARGBShuffle(src_argb, src_stride_argb,
-      width <= 0 || height == 0) {
+                     dst_rgba, dst_stride_rgba,
-    return -1;
+                     reinterpret_cast<const uint8*>(&kShuffleMaskARGBToRGBA),
-  }
+                     width, height);
-  // Negative height means invert the image.
-  if (height < 0) {
-    height = -height;
-    src_argb = src_argb + (height - 1) * src_stride_argb;
-    src_stride_argb = -src_stride_argb;
-  }
-  void (*ARGBToRGBARow)(const uint8* src_argb, uint8* dst_rgba, int pix) =
-      ARGBToRGBARow_C;
-#if defined(HAS_ARGBTORGBAROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3) &&
-      IS_ALIGNED(width, 4) &&
-      IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) &&
-      IS_ALIGNED(dst_rgba, 16) && IS_ALIGNED(dst_stride_rgba, 16)) {
-    ARGBToRGBARow = ARGBToRGBARow_SSSE3;
-  }
-#elif defined(HAS_ARGBTORGBAROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) {
-    ARGBToRGBARow = ARGBToRGBARow_NEON;
-  }
-#endif
-  for (int y = 0; y < height; ++y) {
-    ARGBToRGBARow(src_argb, dst_rgba, width);
-    src_argb += src_stride_argb;
-    dst_rgba += dst_stride_rgba;
-  }
-  return 0;
 }
 // Convert ARGB To RGB24.

--- a/source/planar_functions.cc
+++ b/source/planar_functions.cc
@@ -1603,6 +1603,71 @@ int ARGBInterpolate(const uint8* src_argb0, int src_stride_argb0,
  return 0;
 }
+// Shuffle ARGB channel order.  e.g. BGRA to ARGB.
+LIBYUV_API
+int ARGBShuffle(const uint8* src_bgra, int src_stride_bgra,
+                uint8* dst_argb, int dst_stride_argb,
+                const uint8* shuffler, int width, int height) {
+  if (!src_bgra || !dst_argb ||
+      width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    src_bgra = src_bgra + (height - 1) * src_stride_bgra;
+    src_stride_bgra = -src_stride_bgra;
+  }
+  // Coalesce contiguous rows.
+  if (src_stride_bgra == width * 4 && dst_stride_argb == width * 4) {
+    return ARGBShuffle(src_bgra, 0, dst_argb, 0, shuffler, width * height, 1);
+  }
+  void (*ARGBShuffleRow)(const uint8* src_bgra, uint8* dst_argb,
+                         const uint8* shuffler, int pix) = ARGBShuffleRow_C;
+#if defined(HAS_ARGBSHUFFLEROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
+    ARGBShuffleRow = ARGBShuffleRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBShuffleRow = ARGBShuffleRow_Unaligned_SSSE3;
+      if (IS_ALIGNED(src_bgra, 16) && IS_ALIGNED(src_stride_bgra, 16) &&
+          IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+        ARGBShuffleRow = ARGBShuffleRow_SSSE3;
+      }
+    }
+  }
+#endif
+#if defined(HAS_ARGBSHUFFLEROW_AVX2)
+  bool clear = false;
+  if (TestCpuFlag(kCpuHasAVX2) && width >= 16) {
+    clear = true;
+    ARGBShuffleRow = ARGBShuffleRow_Any_AVX2;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBShuffleRow = ARGBShuffleRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_ARGBSHUFFLEROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && width >= 4) {
+    ARGBShuffleRow = ARGBShuffleRow_Any_NEON;
+    if (IS_ALIGNED(width, 4)) {
+      ARGBShuffleRow = ARGBShuffleRow_NEON;
+    }
+  }
+#endif
+  for (int y = 0; y < height; ++y) {
+    ARGBShuffleRow(src_bgra, dst_argb, shuffler, width);
+    src_bgra += src_stride_bgra;
+    dst_argb += dst_stride_argb;
+  }
+#if defined(HAS_ARGBSHUFFLEROW_AVX2)
+  if (clear) {
+    __asm vzeroupper;
+  }
+#endif
+  return 0;
+}
 #ifdef __cplusplus
 }  // extern "C"
 }  // namespace libyuv

--- a/source/row_any.cc
+++ b/source/row_any.cc
@@ -266,6 +266,7 @@ YANY(ARGBUnattenuateRow_Any_AVX2, ARGBUnattenuateRow_AVX2, ARGBUnattenuateRow_C,
 YANY(ARGBAttenuateRow_Any_NEON, ARGBAttenuateRow_NEON, ARGBAttenuateRow_C,
     4, 4, 7)
 #endif
+#undef YANY
 // RGB/YUV to UV does multiple of 16 with SIMD and remainder with C.
 #define UVANY(NAMEANY, ANYTOUV_SIMD, ANYTOUV_C, BPP, MASK)                     \
@@ -444,6 +445,30 @@ MATHROW_ANY(ARGBSubtractRow_Any_NEON, ARGBSubtractRow_NEON, ARGBSubtractRow_C,
 #endif
 #undef MATHROW_ANY
+// Shuffle may want to work in place, so last16 method can not be used.
+#define YANY(NAMEANY, ARGBTOY_SIMD, ARGBTOY_C, SBPP, BPP, MASK)                \
+    void NAMEANY(const uint8* src_argb, uint8* dst_argb,                       \
+                 const uint8* shuffler, int width) {                           \
+      int n = width & ~MASK;                                                   \
+      ARGBTOY_SIMD(src_argb, dst_argb, shuffler, n);                           \
+      ARGBTOY_C(src_argb + n * SBPP,                                           \
+                dst_argb  + n * BPP, shuffler, width & MASK);                  \
+    }
+#ifdef HAS_ARGBSHUFFLEROW_SSSE3
+YANY(ARGBShuffleRow_Any_SSSE3, ARGBShuffleRow_Unaligned_SSSE3,
+     ARGBShuffleRow_C, 4, 4, 7)
+#endif
+#ifdef HAS_ARGBSHUFFLEROW_AVX2
+YANY(ARGBShuffleRow_Any_AVX2, ARGBShuffleRow_AVX2,
+     ARGBShuffleRow_C, 4, 4, 15)
+#endif
+#ifdef HAS_ARGBSHUFFLEROW_NEON
+YANY(ARGBShuffleRow_Any_NEON, ARGBShuffleRow_NEON,
+     ARGBShuffleRow_C, 4, 4, 3)
+#endif
+#undef YANY
 #ifdef __cplusplus
 }  // extern "C"
 }  // namespace libyuv

--- a/source/row_common.cc
+++ b/source/row_common.cc
@@ -30,54 +30,6 @@ static inline void WRITEWORD(uint8* p, uint32 v) {
 }
 #endif
-void BGRAToARGBRow_C(const uint8* src_bgra, uint8* dst_argb, int width) {
-  for (int x = 0; x < width; ++x) {
-    // To support in-place conversion.
-    uint8 a = src_bgra[0];
-    uint8 r = src_bgra[1];
-    uint8 g = src_bgra[2];
-    uint8 b = src_bgra[3];
-    dst_argb[0] = b;
-    dst_argb[1] = g;
-    dst_argb[2] = r;
-    dst_argb[3] = a;
-    dst_argb += 4;
-    src_bgra += 4;
-  }
-}
-void ABGRToARGBRow_C(const uint8* src_abgr, uint8* dst_argb, int width) {
-  for (int x = 0; x < width; ++x) {
-    // To support in-place conversion.
-    uint8 r = src_abgr[0];
-    uint8 g = src_abgr[1];
-    uint8 b = src_abgr[2];
-    uint8 a = src_abgr[3];
-    dst_argb[0] = b;
-    dst_argb[1] = g;
-    dst_argb[2] = r;
-    dst_argb[3] = a;
-    dst_argb += 4;
-    src_abgr += 4;
-  }
-}
-void RGBAToARGBRow_C(const uint8* src_abgr, uint8* dst_argb, int width) {
-  for (int x = 0; x < width; ++x) {
-    // To support in-place conversion.
-    uint8 a = src_abgr[0];
-    uint8 b = src_abgr[1];
-    uint8 g = src_abgr[2];
-    uint8 r = src_abgr[3];
-    dst_argb[0] = b;
-    dst_argb[1] = g;
-    dst_argb[2] = r;
-    dst_argb[3] = a;
-    dst_argb += 4;
-    src_abgr += 4;
-  }
-}
 void RGB24ToARGBRow_C(const uint8* src_rgb24, uint8* dst_argb, int width) {
  for (int x = 0; x < width; ++x) {
    uint8 b = src_rgb24[0];
@@ -152,21 +104,6 @@ void ARGB4444ToARGBRow_C(const uint8* src_argb4444, uint8* dst_argb,
  }
 }
-void ARGBToRGBARow_C(const uint8* src_argb, uint8* dst_rgb, int width) {
-  for (int x = 0; x < width; ++x) {
-    uint8 b = src_argb[0];
-    uint8 g = src_argb[1];
-    uint8 r = src_argb[2];
-    uint8 a = src_argb[3];
-    dst_rgb[0] = a;
-    dst_rgb[1] = b;
-    dst_rgb[2] = g;
-    dst_rgb[3] = r;
-    dst_rgb += 4;
-    src_argb += 4;
-  }
-}
 void ARGBToRGB24Row_C(const uint8* src_argb, uint8* dst_rgb, int width) {
  for (int x = 0; x < width; ++x) {
    uint8 b = src_argb[0];
@@ -1569,7 +1506,7 @@ void ARGBUnattenuateRow_C(const uint8* src_argb, uint8* dst_argb, int width) {
    uint32 g = src_argb[1];
    uint32 r = src_argb[2];
    const uint32 a = src_argb[3];
-    const uint32 ia = fixed_invtbl8[a] & 0xffff;  // 8.16 fixed point
+    const uint32 ia = fixed_invtbl8[a] & 0xffff;  // 8.8 fixed point
    b = (b * ia) >> 8;
    g = (g * ia) >> 8;
    r = (r * ia) >> 8;
@@ -1689,6 +1626,29 @@ void ARGBToBayerRow_C(const uint8* src_argb,
  }
 }
+// Use first 4 shuffler values to reorder ARGB channels.
+void ARGBShuffleRow_C(const uint8* src_argb, uint8* dst_argb,
+                      const uint8* shuffler, int pix) {
+  int index0 = shuffler[0];
+  int index1 = shuffler[1];
+  int index2 = shuffler[2];
+  int index3 = shuffler[3];
+  // Shuffle a row of ARGB.
+  for (int x = 0; x < pix; ++x) {
+    // To support in-place conversion.
+    uint8 b = src_argb[index0];
+    uint8 g = src_argb[index1];
+    uint8 r = src_argb[index2];
+    uint8 a = src_argb[index3];
+    dst_argb[0] = b;
+    dst_argb[1] = g;
+    dst_argb[2] = r;
+    dst_argb[3] = a;
+    src_argb += 4;
+    dst_argb += 4;
+  }
+}
 void I422ToYUY2Row_C(const uint8* src_y,
                     const uint8* src_u,
                     const uint8* src_v,

--- a/source/row_neon.cc
+++ b/source/row_neon.cc
@@ -860,58 +860,6 @@ void ARGBMirrorRow_NEON(const uint8* src, uint8* dst, int width) {
  );
 }
-void BGRAToARGBRow_NEON(const uint8* src_bgra, uint8* dst_argb, int pix) {
-  asm volatile (
-    ".p2align  2                               \n"
-  "1:                                          \n"
-    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 8 pixels of BGRA.
-    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
-    "vswp.u8    d1, d2                         \n"  // swap G, R
-    "vswp.u8    d0, d3                         \n"  // swap B, A
-    "vst4.8     {d0, d1, d2, d3}, [%1]!        \n"  // store 8 pixels of ARGB.
-    "bgt        1b                             \n"
-  : "+r"(src_bgra),  // %0
-    "+r"(dst_argb),  // %1
-    "+r"(pix)        // %2
-  :
-  : "cc", "memory", "d0", "d1", "d2", "d3"  // Clobber List
-  );
-}
-void ABGRToARGBRow_NEON(const uint8* src_abgr, uint8* dst_argb, int pix) {
-  asm volatile (
-    ".p2align  2                               \n"
-  "1:                                          \n"
-    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 8 pixels of ABGR.
-    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
-    "vswp.u8    d0, d2                         \n"  // swap R, B
-    "vst4.8     {d0, d1, d2, d3}, [%1]!        \n"  // store 8 pixels of ARGB.
-    "bgt        1b                             \n"
-  : "+r"(src_abgr),  // %0
-    "+r"(dst_argb),  // %1
-    "+r"(pix)        // %2
-  :
-  : "cc", "memory", "d0", "d1", "d2", "d3"  // Clobber List
-  );
-}
-void RGBAToARGBRow_NEON(const uint8* src_rgba, uint8* dst_argb, int pix) {
-  asm volatile (
-    ".p2align  2                                \n"
-  "1:                                           \n"
-    "vld4.8     {d0, d1, d2, d3}, [%0]!         \n"  // load 8 pixels of RGBA.
-    "subs       %2, %2, #8                      \n"  // 8 processed per loop.
-    "vmov.u8    d4, d0                          \n"  // move A after RGB
-    "vst4.8     {d1, d2, d3, d4}, [%1]!         \n"  // store 8 pixels of ARGB.
-    "bgt        1b                              \n"
-  : "+r"(src_rgba),  // %0
-    "+r"(dst_argb),  // %1
-    "+r"(pix)        // %2
-  :
-  : "cc", "memory", "d0", "d1", "d2", "d3", "d4"  // Clobber List
-  );
-}
 void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int pix) {
  asm volatile (
    "vmov.u8    d4, #255                       \n"  // Alpha
@@ -1052,23 +1000,6 @@ void ARGB4444ToARGBRow_NEON(const uint8* src_argb4444, uint8* dst_argb,
  );
 }
-void ARGBToRGBARow_NEON(const uint8* src_argb, uint8* dst_rgba, int pix) {
-  asm volatile (
-    ".p2align  2                               \n"
-  "1:                                          \n"
-    "vld4.8     {d1, d2, d3, d4}, [%0]!        \n"  // load 8 pixels of ARGB.
-    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
-    "vmov.u8    d0, d4                         \n"  // move A before RGB.
-    "vst4.8     {d0, d1, d2, d3}, [%1]!        \n"  // store 8 pixels of RGBA.
-    "bgt        1b                             \n"
-  : "+r"(src_argb),  // %0
-    "+r"(dst_rgba),  // %1
-    "+r"(pix)        // %2
-  :
-  : "cc", "memory", "d0", "d1", "d2", "d3", "d4"  // Clobber List
-  );
-}
 void ARGBToRGB24Row_NEON(const uint8* src_argb, uint8* dst_rgb24, int pix) {
  asm volatile (
    ".p2align  2                               \n"
@@ -1242,22 +1173,41 @@ void HalfRow_NEON(const uint8* src_uv, int src_uv_stride,
 }
 // Select 2 channels from ARGB on alternating pixels.  e.g.  BGBGBGBG
-void ARGBToBayerRow_NEON(const uint8* src_argb,
+void ARGBToBayerRow_NEON(const uint8* src_argb, uint8* dst_bayer,
-                         uint8* dst_bayer, uint32 selector, int pix) {
+                         uint32 selector, int pix) {
  asm volatile (
-    "vmov.u32   d2[0], %2                      \n"  // selector
+    "vmov.u32   d2[0], %3                      \n"  // selector
  "1:                                          \n"
    "vld1.u8    {q0}, [%0]!                    \n"  // load row 4 pixels.
-    "subs       %3, %3, #4                     \n"  // 4 processed per loop
+    "subs       %2, %2, #4                     \n"  // 4 processed per loop
    "vtbl.8     d3, {d0, d1}, d2               \n"  // look up 4 pixels
    "vst1.u32   {d3[0]}, [%1]!                 \n"  // store 4.
    "bgt        1b                             \n"
  : "+r"(src_argb),         // %0
    "+r"(dst_bayer),        // %1
-    "+r"(selector),         // %2
+    "+r"(pix)               // %2
-    "+r"(pix)               // %3
+  : "r"(selector),          // %3
-  :
+  : "cc", "memory", "q0", "d2"  // Clobber List
-  : "cc", "memory", "q0", "q1"  // Clobber List
+  );
+}
+// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
+void ARGBShuffleRow_NEON(const uint8* src_argb, uint8* dst_argb,
+                         const uint8* shuffler, int pix) {
+  asm volatile (
+    "vld1.u8    {q2}, [%3]                     \n"  // shuffler
+  "1:                                          \n"
+    "vld1.u8    {q0}, [%0]!                    \n"  // load 4 pixels.
+    "subs       %2, %2, #4                     \n"  // 4 processed per loop
+    "vtbl.8     d2, {d0, d1}, d4               \n"  // look up 2 first pixels
+    "vtbl.8     d3, {d0, d1}, d5               \n"  // look up 2 next pixels
+    "vst1.u8    {q1}, [%1]!                    \n"  // store 4.
+    "bgt        1b                             \n"
+  : "+r"(src_argb),  // %0
+    "+r"(dst_argb),  // %1
+    "+r"(pix)        // %2
+  : "r"(shuffler)    // %3
+  : "cc", "memory", "q0", "q1", "q2"  // Clobber List
  );
 }

--- a/source/row_posix.cc
+++ b/source/row_posix.cc
@@ -101,26 +101,6 @@ CONST uvec8 kShuffleMaskRAWToARGB = {
  2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u
 };
-// Shuffle table for converting ABGR to ARGB.
-CONST uvec8 kShuffleMaskABGRToARGB = {
-  2u, 1u, 0u, 3u, 6u, 5u, 4u, 7u, 10u, 9u, 8u, 11u, 14u, 13u, 12u, 15u
-};
-// Shuffle table for converting BGRA to ARGB.
-CONST uvec8 kShuffleMaskBGRAToARGB = {
-  3u, 2u, 1u, 0u, 7u, 6u, 5u, 4u, 11u, 10u, 9u, 8u, 15u, 14u, 13u, 12u
-};
-// Shuffle table for converting RGBA to ARGB.
-CONST uvec8 kShuffleMaskRGBAToARGB = {
-  1u, 2u, 3u, 0u, 5u, 6u, 7u, 4u, 9u, 10u, 11u, 8u, 13u, 14u, 15u, 12u
-};
-// Shuffle table for converting ARGB to RGBA.
-CONST uvec8 kShuffleMaskARGBToRGBA = {
-  3u, 0u, 1u, 2u, 7u, 4u, 5u, 6u, 11u, 8u, 9u, 10u, 15u, 12u, 13u, 14u
-};
 // Shuffle table for converting ARGB to RGB24.
 CONST uvec8 kShuffleMaskARGBToRGB24 = {
  0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u, 14u, 128u, 128u, 128u, 128u
@@ -202,101 +182,6 @@ void I400ToARGBRow_Unaligned_SSE2(const uint8* src_y, uint8* dst_argb,
  );
 }
-void ABGRToARGBRow_SSSE3(const uint8* src_abgr, uint8* dst_argb, int pix) {
-  asm volatile (
-    "movdqa    %3,%%xmm5                       \n"
-    "sub       %0,%1                           \n"
-    ".p2align  4                               \n"
-  "1:                                          \n"
-    "movdqa    (%0),%%xmm0                     \n"
-    "pshufb    %%xmm5,%%xmm0                   \n"
-    "sub       $0x4,%2                         \n"
-    "movdqa    %%xmm0,(%0,%1,1)                \n"
-    "lea       0x10(%0),%0                     \n"
-    "jg        1b                              \n"
-  : "+r"(src_abgr),  // %0
-    "+r"(dst_argb),  // %1
-    "+r"(pix)        // %2
-  : "m"(kShuffleMaskABGRToARGB)  // %3
-  : "memory", "cc"
-#if defined(__SSE2__)
-    , "xmm0", "xmm5"
-#endif
-  );
-}
-void BGRAToARGBRow_SSSE3(const uint8* src_bgra, uint8* dst_argb, int pix) {
-  asm volatile (
-    "movdqa    %3,%%xmm5                       \n"
-    "sub       %0,%1                           \n"
-    ".p2align  4                               \n"
-  "1:                                          \n"
-    "movdqa    (%0),%%xmm0                     \n"
-    "pshufb    %%xmm5,%%xmm0                   \n"
-    "sub       $0x4,%2                         \n"
-    "movdqa    %%xmm0,(%0,%1,1)                \n"
-    "lea       0x10(%0),%0                     \n"
-    "jg        1b                              \n"
-  : "+r"(src_bgra),  // %0
-    "+r"(dst_argb),  // %1
-    "+r"(pix)        // %2
-  : "m"(kShuffleMaskBGRAToARGB)  // %3
-  : "memory", "cc"
-#if defined(__SSE2__)
-    , "xmm0", "xmm5"
-#endif
-  );
-}
-void RGBAToARGBRow_SSSE3(const uint8* src_rgba, uint8* dst_argb, int pix) {
-  asm volatile (
-    "movdqa    %3,%%xmm5                       \n"
-    "sub       %0,%1                           \n"
-    ".p2align  4                               \n"
-  "1:                                          \n"
-    "movdqa    (%0),%%xmm0                     \n"
-    "pshufb    %%xmm5,%%xmm0                   \n"
-    "sub       $0x4,%2                         \n"
-    "movdqa    %%xmm0,(%0,%1,1)                \n"
-    "lea       0x10(%0),%0                     \n"
-    "jg        1b                              \n"
-  : "+r"(src_rgba),  // %0
-    "+r"(dst_argb),  // %1
-    "+r"(pix)        // %2
-  : "m"(kShuffleMaskRGBAToARGB)  // %3
-  : "memory", "cc"
-#if defined(__SSE2__)
-    , "xmm0", "xmm5"
-#endif
-  );
-}
-void ARGBToRGBARow_SSSE3(const uint8* src_argb, uint8* dst_rgba, int pix) {
-  asm volatile (
-    "movdqa    %3,%%xmm5                       \n"
-    "sub       %0,%1                           \n"
-    ".p2align  4                               \n"
-  "1:                                          \n"
-    "movdqa    (%0),%%xmm0                     \n"
-    "pshufb    %%xmm5,%%xmm0                   \n"
-    "sub       $0x4,%2                         \n"
-    "movdqa    %%xmm0,(%0,%1,1)                \n"
-    "lea       0x10(%0),%0                     \n"
-    "jg        1b                              \n"
-  : "+r"(src_argb),  // %0
-    "+r"(dst_rgba),  // %1
-    "+r"(pix)        // %2
-  : "m"(kShuffleMaskARGBToRGBA)  // %3
-  : "memory", "cc"
-#if defined(__SSE2__)
-    , "xmm0", "xmm5"
-#endif
-  );
-}
 void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix) {
  asm volatile (
    "pcmpeqb   %%xmm5,%%xmm5                   \n"  // generate mask 0xff000000
@@ -4684,15 +4569,15 @@ void ARGBInterpolateRow_SSE2(uint8* dst_argb, const uint8* src_argb,
 void HalfRow_SSE2(const uint8* src_uv, int src_uv_stride,
                  uint8* dst_uv, int pix) {
  asm volatile (
-    "sub        %0,%1                            \n"
+    "sub       %0,%1                           \n"
-    ".p2align  4                                 \n"
+    ".p2align  4                               \n"
-  "1:                                            \n"
+  "1:                                          \n"
-    "movdqa     (%0),%%xmm0                      \n"
+    "movdqa    (%0),%%xmm0                     \n"
-    "pavgb      (%0,%3),%%xmm0                   \n"
+    "pavgb     (%0,%3),%%xmm0                  \n"
-    "sub        $0x10,%2                         \n"
+    "sub       $0x10,%2                        \n"
-    "movdqa     %%xmm0,(%0,%1)                   \n"
+    "movdqa    %%xmm0,(%0,%1)                  \n"
-    "lea        0x10(%0),%0                      \n"
+    "lea       0x10(%0),%0                     \n"
-    "jg         1b                               \n"
+    "jg        1b                              \n"
  : "+r"(src_uv),  // %0
    "+r"(dst_uv),  // %1
    "+r"(pix)      // %2
@@ -4707,17 +4592,17 @@ void HalfRow_SSE2(const uint8* src_uv, int src_uv_stride,
 void ARGBToBayerRow_SSSE3(const uint8* src_argb, uint8* dst_bayer,
                          uint32 selector, int pix) {
  asm volatile (
-    "movd   %3,%%xmm5                          \n"
+    "movd      %3,%%xmm5                       \n"
-    "pshufd $0x0,%%xmm5,%%xmm5                 \n"
+    "pshufd    $0x0,%%xmm5,%%xmm5              \n"
    ".p2align  4                               \n"
  "1:                                          \n"
-    "movdqa (%0),%%xmm0                        \n"
+    "movdqa    (%0),%%xmm0                     \n"
-    "lea    0x10(%0),%0                        \n"
+    "lea       0x10(%0),%0                     \n"
-    "pshufb %%xmm5,%%xmm0                      \n"
+    "pshufb    %%xmm5,%%xmm0                   \n"
-    "sub    $0x4,%2                            \n"
+    "sub       $0x4,%2                         \n"
-    "movd   %%xmm0,(%1)                        \n"
+    "movd      %%xmm0,(%1)                     \n"
-    "lea    0x4(%1),%1                         \n"
+    "lea       0x4(%1),%1                      \n"
-    "jg     1b                                 \n"
+    "jg        1b                              \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_bayer), // %1
    "+r"(pix)        // %2
@@ -4729,12 +4614,67 @@ void ARGBToBayerRow_SSSE3(const uint8* src_argb, uint8* dst_bayer,
  );
 }
+// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
+void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
+                          const uint8* shuffler, int pix) {
+  asm volatile (
+    "movdqa    (%3),%%xmm5                     \n"
+    ".p2align  4                               \n"
+  "1:                                          \n"
+    "movdqa    (%0),%%xmm0                     \n"
+    "movdqa    0x10(%0),%%xmm1                 \n"
+    "lea       0x20(%0),%0                     \n"
+    "pshufb    %%xmm5,%%xmm0                   \n"
+    "pshufb    %%xmm5,%%xmm1                   \n"
+    "sub       $0x8,%2                         \n"
+    "movdqa    %%xmm0,(%1)                     \n"
+    "movdqa    %%xmm1,0x10(%1)                 \n"
+    "lea       0x20(%1),%1                     \n"
+    "jg        1b                              \n"
+  : "+r"(src_argb),  // %0
+    "+r"(dst_argb),  // %1
+    "+r"(pix)        // %2
+  : "r"(shuffler)    // %3
+  : "memory", "cc"
+#if defined(__SSE2__)
+    , "xmm0", "xmm1", "xmm5"
+#endif
+  );
+}
+void ARGBShuffleRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_argb,
+                                    const uint8* shuffler, int pix) {
+  asm volatile (
+    "movdqa    (%3),%%xmm5                     \n"
+    ".p2align  4                               \n"
+  "1:                                          \n"
+    "movdqu    (%0),%%xmm0                     \n"
+    "movdqu    0x10(%0),%%xmm1                 \n"
+    "lea       0x20(%0),%0                     \n"
+    "pshufb    %%xmm5,%%xmm0                   \n"
+    "pshufb    %%xmm5,%%xmm1                   \n"
+    "sub       $0x8,%2                         \n"
+    "movdqu    %%xmm0,(%1)                     \n"
+    "movdqu    %%xmm1,0x10(%1)                 \n"
+    "lea       0x20(%1),%1                     \n"
+    "jg        1b                              \n"
+  : "+r"(src_argb),  // %0
+    "+r"(dst_argb),  // %1
+    "+r"(pix)        // %2
+  : "r"(shuffler)    // %3
+  : "memory", "cc"
+#if defined(__SSE2__)
+    , "xmm0", "xmm1", "xmm5"
+#endif
+  );
+}
 void I422ToYUY2Row_SSE2(const uint8* src_y,
                        const uint8* src_u,
                        const uint8* src_v,
                        uint8* dst_frame, int width) {
 asm volatile (
-    "sub        %1,%2                            \n"
+    "sub       %1,%2                             \n"
    ".p2align  4                                 \n"
  "1:                                            \n"
    "movq      (%1),%%xmm2                       \n"

--- a/source/row_win.cc
+++ b/source/row_win.cc
@@ -132,26 +132,6 @@ static const uvec8 kShuffleMaskRAWToARGB = {
  2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u
 };
-// Shuffle table for converting BGRA to ARGB.
-static const uvec8 kShuffleMaskBGRAToARGB = {
-  3u, 2u, 1u, 0u, 7u, 6u, 5u, 4u, 11u, 10u, 9u, 8u, 15u, 14u, 13u, 12u
-};
-// Shuffle table for converting ABGR to ARGB.
-static const uvec8 kShuffleMaskABGRToARGB = {
-  2u, 1u, 0u, 3u, 6u, 5u, 4u, 7u, 10u, 9u, 8u, 11u, 14u, 13u, 12u, 15u
-};
-// Shuffle table for converting RGBA to ARGB.
-static const uvec8 kShuffleMaskRGBAToARGB = {
-  1u, 2u, 3u, 0u, 5u, 6u, 7u, 4u, 9u, 10u, 11u, 8u, 13u, 14u, 15u, 12u
-};
-// Shuffle table for converting ARGB to RGBA.
-static const uvec8 kShuffleMaskARGBToRGBA = {
-  3u, 0u, 1u, 2u, 7u, 4u, 5u, 6u, 11u, 8u, 9u, 10u, 15u, 12u, 13u, 14u
-};
 // Shuffle table for converting ARGB to RGB24.
 static const uvec8 kShuffleMaskARGBToRGB24 = {
  0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u, 14u, 128u, 128u, 128u, 128u
@@ -230,112 +210,6 @@ void I400ToARGBRow_Unaligned_SSE2(const uint8* src_y, uint8* dst_argb,
  }
 }
-__declspec(naked) __declspec(align(16))
-void BGRAToARGBRow_SSSE3(const uint8* src_bgra, uint8* dst_argb, int pix) {
-  __asm {
-    mov       eax, [esp + 4]   // src_bgra
-    mov       edx, [esp + 8]   // dst_argb
-    mov       ecx, [esp + 12]  // pix
-    movdqa    xmm5, kShuffleMaskBGRAToARGB
-    sub       edx, eax
-    align      16
- convertloop:
-    movdqa    xmm0, [eax]
-    pshufb    xmm0, xmm5
-    sub       ecx, 4
-    movdqa    [eax + edx], xmm0
-    lea       eax, [eax + 16]
-    jg        convertloop
-    ret
-  }
-}
-__declspec(naked) __declspec(align(16))
-void ABGRToARGBRow_Unaligned_SSSE3(const uint8* src_abgr, uint8* dst_argb,
-                                   int pix) {
-__asm {
-    mov       eax, [esp + 4]   // src_abgr
-    mov       edx, [esp + 8]   // dst_argb
-    mov       ecx, [esp + 12]  // pix
-    movdqa    xmm5, kShuffleMaskABGRToARGB
-    sub       edx, eax
-    align      16
- convertloop:
-    movdqu    xmm0, [eax]
-    pshufb    xmm0, xmm5
-    sub       ecx, 4
-    movdqu    [eax + edx], xmm0
-    lea       eax, [eax + 16]
-    jg        convertloop
-    ret
-  }
-}
-__declspec(naked) __declspec(align(16))
-void ABGRToARGBRow_SSSE3(const uint8* src_abgr, uint8* dst_argb, int pix) {
-  __asm {
-    mov       eax, [esp + 4]   // src_abgr
-    mov       edx, [esp + 8]   // dst_argb
-    mov       ecx, [esp + 12]  // pix
-    movdqa    xmm5, kShuffleMaskABGRToARGB
-    sub       edx, eax
-    align      16
- convertloop:
-    movdqa    xmm0, [eax]
-    pshufb    xmm0, xmm5
-    sub       ecx, 4
-    movdqa    [eax + edx], xmm0
-    lea       eax, [eax + 16]
-    jg        convertloop
-    ret
-  }
-}
-__declspec(naked) __declspec(align(16))
-void RGBAToARGBRow_SSSE3(const uint8* src_rgba, uint8* dst_argb, int pix) {
-  __asm {
-    mov       eax, [esp + 4]   // src_rgba
-    mov       edx, [esp + 8]   // dst_argb
-    mov       ecx, [esp + 12]  // pix
-    movdqa    xmm5, kShuffleMaskRGBAToARGB
-    sub       edx, eax
-    align      16
- convertloop:
-    movdqa    xmm0, [eax]
-    pshufb    xmm0, xmm5
-    sub       ecx, 4
-    movdqa    [eax + edx], xmm0
-    lea       eax, [eax + 16]
-    jg        convertloop
-    ret
-  }
-}
-__declspec(naked) __declspec(align(16))
-void ARGBToRGBARow_SSSE3(const uint8* src_argb, uint8* dst_rgba, int pix) {
-  __asm {
-    mov       eax, [esp + 4]   // src_argb
-    mov       edx, [esp + 8]   // dst_rgba
-    mov       ecx, [esp + 12]  // pix
-    movdqa    xmm5, kShuffleMaskARGBToRGBA
-    sub       edx, eax
-    align      16
- convertloop:
-    movdqa    xmm0, [eax]
-    pshufb    xmm0, xmm5
-    sub       ecx, 4
-    movdqa    [eax + edx], xmm0
-    lea       eax, [eax + 16]
-    jg        convertloop
-    ret
-  }
-}
 __declspec(naked) __declspec(align(16))
 void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix) {
  __asm {
@@ -5635,8 +5509,8 @@ void HalfRow_AVX2(const uint8* src_uv, int src_uv_stride,
 #endif  // HAS_HALFROW_AVX2
 __declspec(naked) __declspec(align(16))
-void ARGBToBayerRow_SSSE3(const uint8* src_argb,
+void ARGBToBayerRow_SSSE3(const uint8* src_argb, uint8* dst_bayer,
-                          uint8* dst_bayer, uint32 selector, int pix) {
+                          uint32 selector, int pix) {
  __asm {
    mov        eax, [esp + 4]    // src_argb
    mov        edx, [esp + 8]    // dst_bayer
@@ -5657,6 +5531,88 @@ void ARGBToBayerRow_SSSE3(const uint8* src_argb,
  }
 }
+// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
+__declspec(naked) __declspec(align(16))
+void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
+                          const uint8* shuffler, int pix) {
+  __asm {
+    mov        eax, [esp + 4]    // src_argb
+    mov        edx, [esp + 8]    // dst_bayer
+    mov        ecx, [esp + 12]   // shuffler
+    movdqa     xmm5, [ecx]
+    mov        ecx, [esp + 16]   // pix
+    align      16
+  wloop:
+    movdqa     xmm0, [eax]
+    movdqa     xmm1, [eax + 16]
+    lea        eax, [eax + 32]
+    pshufb     xmm0, xmm5
+    pshufb     xmm1, xmm5
+    sub        ecx, 8
+    movdqa     [edx], xmm0
+    movdqa     [edx + 16], xmm1
+    lea        edx, [edx + 32]
+    jg         wloop
+    ret
+  }
+}
+__declspec(naked) __declspec(align(16))
+void ARGBShuffleRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_argb,
+                                    const uint8* shuffler, int pix) {
+  __asm {
+    mov        eax, [esp + 4]    // src_argb
+    mov        edx, [esp + 8]    // dst_bayer
+    mov        ecx, [esp + 12]   // shuffler
+    movdqa     xmm5, [ecx]
+    mov        ecx, [esp + 16]   // pix
+    align      16
+  wloop:
+    movdqu     xmm0, [eax]
+    movdqu     xmm1, [eax + 16]
+    lea        eax, [eax + 32]
+    pshufb     xmm0, xmm5
+    pshufb     xmm1, xmm5
+    sub        ecx, 8
+    movdqu     [edx], xmm0
+    movdqu     [edx + 16], xmm1
+    lea        edx, [edx + 32]
+    jg         wloop
+    ret
+  }
+}
+#ifdef HAS_ARGBSHUFFLEROW_AVX2
+__declspec(naked) __declspec(align(16))
+void ARGBShuffleRow_AVX2(const uint8* src_argb, uint8* dst_argb,
+                         const uint8* shuffler, int pix) {
+  __asm {
+    mov        eax, [esp + 4]     // src_argb
+    mov        edx, [esp + 8]     // dst_bayer
+    mov        ecx, [esp + 12]    // shuffler
+    vmovdqa    xmm5, [ecx]
+    vpermq     ymm5, ymm5, 0x44   // same shuffle in high as low.
+    mov        ecx, [esp + 16]    // pix
+    align      16
+  wloop:
+    vmovdqu    ymm0, [eax]
+    vmovdqu    ymm1, [eax + 32]
+    lea        eax, [eax + 64]
+    vpshufb    ymm0, ymm0, ymm5
+    vpshufb    ymm1, ymm1, ymm5
+    sub        ecx, 16
+    vmovdqu    [edx], ymm0
+    vmovdqu    [edx + 32], ymm1
+    lea        edx, [edx + 64]
+    jg         wloop
+    ret
+  }
+}
+#endif
 // YUY2 - Macro-pixel = 2 image pixels
 // Y0U0Y1V0....Y2U2Y3V2...Y4U4Y5V4....