Commit 42831e0a authored by fbarchard@google.com

Mirror a plane at a time so each can check cpu/alignment independently

BUG=none
TEST=none
Review URL: https://webrtc-codereview.appspot.com/370001

git-svn-id: http://libyuv.googlecode.com/svn/trunk@148 16f28f9a-4ce2-e073-06de-1de4eb20be90
parent ba03e4d9
Name: libyuv
URL: http://code.google.com/p/libyuv/
Version: 146
Version: 147
License: BSD
License File: LICENSE
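For context, here is a minimal usage sketch of the per-plane mirroring this change introduces. It is not part of the commit; it assumes the libyuv headers export I420Mirror (and that MirrorPlane is visible with the signature added in this change, which may not be the case in the public header at this revision), and the frame dimensions are purely illustrative.

// Minimal sketch (not part of this commit). Assumes libyuv/planar_functions.h
// declares I420Mirror, and that MirrorPlane is visible with the signature
// added in this change; dimensions and strides below are illustrative.
#include <cstdint>
#include <vector>
#include "libyuv/planar_functions.h"

void MirrorExample() {
  const int width = 64, height = 48;
  const int halfwidth = (width + 1) >> 1;
  const int halfheight = (height + 1) >> 1;

  std::vector<uint8_t> src_y(width * height), dst_y(width * height);
  std::vector<uint8_t> src_u(halfwidth * halfheight), dst_u(halfwidth * halfheight);
  std::vector<uint8_t> src_v(halfwidth * halfheight), dst_v(halfwidth * halfheight);

  // Mirror an I420 frame horizontally; each plane now picks its row function
  // independently based on CPU features and that plane's width/alignment.
  libyuv::I420Mirror(src_y.data(), width,
                     src_u.data(), halfwidth,
                     src_v.data(), halfwidth,
                     dst_y.data(), width,
                     dst_u.data(), halfwidth,
                     dst_v.data(), halfwidth,
                     width, height);

  // Or mirror a single plane directly.
  libyuv::MirrorPlane(src_y.data(), width, dst_y.data(), width, width, height);
}

The point of the change is visible in MirrorPlane in the diff below: the NEON/SSSE3/SSE2 row variants are selected per call, so the chroma planes no longer inherit the luma plane's CPU and alignment decision.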
@@ -278,27 +278,40 @@ int I420Copy(const uint8* src_y, int src_stride_y,
return 0;
}
// Copy ARGB with optional flipping
int ARGBCopy(const uint8* src_argb, int src_stride_argb,
uint8* dst_argb, int dst_stride_argb,
int width, int height) {
if (!src_argb ||
!dst_argb ||
width <= 0 || height == 0) {
return -1;
}
// Negative height means invert the image.
if (height < 0) {
height = -height;
src_argb = src_argb + (height - 1) * src_stride_argb;
src_stride_argb = -src_stride_argb;
// Mirror a plane of data
void MirrorPlane(const uint8* src_y, int src_stride_y,
uint8* dst_y, int dst_stride_y,
int width, int height) {
void (*MirrorRow)(const uint8* src, uint8* dst, int width);
#if defined(HAS_MIRRORROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 16)) {
MirrorRow = MirrorRow_NEON;
} else
#endif
#if defined(HAS_MIRRORROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 16) &&
IS_ALIGNED(src_y, 16) && IS_ALIGNED(src_stride_y, 16)) {
MirrorRow = MirrorRow_SSSE3;
} else
#endif
#if defined(HAS_MIRRORROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 16)) {
MirrorRow = MirrorRow_SSE2;
} else
#endif
{
MirrorRow = MirrorRow_C;
}
CopyPlane(src_argb, src_stride_argb, dst_argb, dst_stride_argb,
width * 4, height);
return 0;
// Mirror plane
for (int y = 0; y < height; ++y) {
MirrorRow(src_y, dst_y, width);
src_y += src_stride_y;
dst_y += dst_stride_y;
}
}
// Mirror I420 with optional flipping
int I420Mirror(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
@@ -311,13 +324,10 @@ int I420Mirror(const uint8* src_y, int src_stride_y,
width <= 0 || height == 0) {
return -1;
}
int halfwidth = (width + 1) >> 1;
int halfheight = (height + 1) >> 1;
// Negative height means invert the image.
if (height < 0) {
height = -height;
halfheight = (height + 1) >> 1;
int halfheight = (height + 1) >> 1;
src_y = src_y + (height - 1) * src_stride_y;
src_u = src_u + (halfheight - 1) * src_stride_u;
src_v = src_v + (halfheight - 1) * src_stride_v;
@@ -325,60 +335,35 @@ int I420Mirror(const uint8* src_y, int src_stride_y,
src_stride_u = -src_stride_u;
src_stride_v = -src_stride_v;
}
void (*ReverseRow)(const uint8* src, uint8* dst, int width);
#if defined(HAS_REVERSE_ROW_NEON)
if (TestCpuFlag(kCpuHasNEON) &&
IS_ALIGNED(width, 32)) {
ReverseRow = ReverseRow_NEON;
} else
#endif
#if defined(HAS_REVERSE_ROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3) &&
IS_ALIGNED(width, 32) &&
IS_ALIGNED(src_y, 16) && IS_ALIGNED(src_stride_y, 16) &&
IS_ALIGNED(src_u, 16) && IS_ALIGNED(src_stride_u, 16) &&
IS_ALIGNED(src_v, 16) && IS_ALIGNED(src_stride_v, 16) &&
IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16) &&
IS_ALIGNED(dst_u, 16) && IS_ALIGNED(dst_stride_u, 16) &&
IS_ALIGNED(dst_v, 16) && IS_ALIGNED(dst_stride_v, 16)) {
ReverseRow = ReverseRow_SSSE3;
} else
#endif
#if defined(HAS_REVERSE_ROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2) &&
IS_ALIGNED(width, 32) &&
IS_ALIGNED(src_y, 16) && IS_ALIGNED(src_stride_y, 16) &&
IS_ALIGNED(src_u, 16) && IS_ALIGNED(src_stride_u, 16) &&
IS_ALIGNED(src_v, 16) && IS_ALIGNED(src_stride_v, 16) &&
IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16) &&
IS_ALIGNED(dst_u, 16) && IS_ALIGNED(dst_stride_u, 16) &&
IS_ALIGNED(dst_v, 16) && IS_ALIGNED(dst_stride_v, 16)) {
ReverseRow = ReverseRow_SSE2;
} else
#endif
{
ReverseRow = ReverseRow_C;
}
// Y Plane
int y;
for (y = 0; y < height; ++y) {
ReverseRow(src_y, dst_y, width);
src_y += src_stride_y;
dst_y += dst_stride_y;
int halfwidth = (width + 1) >> 1;
int halfheight = (height + 1) >> 1;
if (dst_y) {
MirrorPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
}
// U Plane
for (y = 0; y < halfheight; ++y) {
ReverseRow(src_u, dst_u, halfwidth);
src_u += src_stride_u;
dst_u += dst_stride_u;
MirrorPlane(src_u, src_stride_u, dst_u, dst_stride_u, halfwidth, halfheight);
MirrorPlane(src_v, src_stride_v, dst_v, dst_stride_v, halfwidth, halfheight);
return 0;
}
// Copy ARGB with optional flipping
int ARGBCopy(const uint8* src_argb, int src_stride_argb,
uint8* dst_argb, int dst_stride_argb,
int width, int height) {
if (!src_argb ||
!dst_argb ||
width <= 0 || height == 0) {
return -1;
}
// V Plane
for (y = 0; y < halfheight; ++y) {
ReverseRow(src_v, dst_v, halfwidth);
src_v += src_stride_v;
dst_v += dst_stride_v;
// Negative height means invert the image.
if (height < 0) {
height = -height;
src_argb = src_argb + (height - 1) * src_stride_argb;
src_stride_argb = -src_stride_argb;
}
CopyPlane(src_argb, src_stride_argb, dst_argb, dst_stride_argb,
width * 4, height);
return 0;
}
@@ -24,7 +24,7 @@ extern "C" {
!defined(YUV_DISABLE_ASM)
// Note static const preferred, but gives internal compiler error on gcc 4.2
// Shuffle table for reversing the bytes of UV channels.
uvec8 kShuffleReverseUV = {
uvec8 kShuffleMirrorUV = {
14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u
};
@@ -47,7 +47,7 @@ uvec8 kShuffleReverseUV = {
#endif
#endif
typedef void (*reverse_uv_func)(const uint8*, uint8*, uint8*, int);
typedef void (*mirror_uv_func)(const uint8*, uint8*, uint8*, int);
typedef void (*rotate_uv_wx8_func)(const uint8*, int,
uint8*, int,
uint8*, int, int);
@@ -58,10 +58,10 @@ typedef void (*rotate_wx8_func)(const uint8*, int, uint8*, int, int);
typedef void (*rotate_wxh_func)(const uint8*, int, uint8*, int, int, int);
#ifdef __ARM_NEON__
#define HAS_REVERSE_ROW_NEON
void ReverseRow_NEON(const uint8* src, uint8* dst, int width);
#define HAS_REVERSE_ROW_UV_NEON
void ReverseRowUV_NEON(const uint8* src,
#define HAS_MIRRORROW_NEON
void MirrorRow_NEON(const uint8* src, uint8* dst, int width);
#define HAS_MIRRORROW_UV_NEON
void MirrorRowUV_NEON(const uint8* src,
uint8* dst_a, uint8* dst_b,
int width);
#define HAS_TRANSPOSE_WX8_NEON
@@ -852,37 +852,37 @@ void RotatePlane270(const uint8* src, int src_stride,
void RotatePlane180(const uint8* src, int src_stride,
uint8* dst, int dst_stride,
int width, int height) {
void (*ReverseRow)(const uint8* src, uint8* dst, int width);
#if defined(HAS_REVERSE_ROW_NEON)
void (*MirrorRow)(const uint8* src, uint8* dst, int width);
#if defined(HAS_MIRRORROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
ReverseRow = ReverseRow_NEON;
MirrorRow = MirrorRow_NEON;
} else
#endif
#if defined(HAS_REVERSE_ROW_SSSE3)
#if defined(HAS_MIRRORROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3) &&
IS_ALIGNED(width, 16) &&
IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16) &&
IS_ALIGNED(dst, 16) && IS_ALIGNED(dst_stride, 16)) {
ReverseRow = ReverseRow_SSSE3;
MirrorRow = MirrorRow_SSSE3;
} else
#endif
#if defined(HAS_REVERSE_ROW_SSE2)
#if defined(HAS_MIRRORROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2) &&
IS_ALIGNED(width, 16) &&
IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16) &&
IS_ALIGNED(dst, 16) && IS_ALIGNED(dst_stride, 16)) {
ReverseRow = ReverseRow_SSE2;
MirrorRow = MirrorRow_SSE2;
} else
#endif
{
ReverseRow = ReverseRow_C;
MirrorRow = MirrorRow_C;
}
// Rotate by 180 is a mirror and vertical flip
src += src_stride * (height - 1);
for (int y = 0; y < height; ++y) {
ReverseRow(src, dst, width);
MirrorRow(src, dst, width);
src -= src_stride;
dst += dst_stride;
}
@@ -1004,9 +1004,9 @@ void RotateUV270(const uint8* src, int src_stride,
}
#if defined(_M_IX86) && !defined(YUV_DISABLE_ASM)
#define HAS_REVERSE_ROW_UV_SSSE3
#define HAS_MIRRORROW_UV_SSSE3
__declspec(naked)
void ReverseRowUV_SSSE3(const uint8* src,
void MirrorRowUV_SSSE3(const uint8* src,
uint8* dst_a, uint8* dst_b,
int width) {
__asm {
@@ -1015,7 +1015,7 @@ __asm {
mov edx, [esp + 4 + 8] // dst_a
mov edi, [esp + 4 + 12] // dst_b
mov ecx, [esp + 4 + 16] // width
movdqa xmm5, kShuffleReverseUV
movdqa xmm5, kShuffleMirrorUV
lea eax, [eax + ecx * 2 - 16]
convertloop:
@@ -1035,8 +1035,8 @@ __asm {
#elif (defined(__i386__) || defined(__x86_64__)) && \
!defined(YUV_DISABLE_ASM)
#define HAS_REVERSE_ROW_UV_SSSE3
void ReverseRowUV_SSSE3(const uint8* src,
#define HAS_MIRRORROW_UV_SSSE3
void MirrorRowUV_SSSE3(const uint8* src,
uint8* dst_a, uint8* dst_b,
int width) {
intptr_t temp_width = static_cast<intptr_t>(width);
@@ -1057,7 +1057,7 @@ void ReverseRowUV_SSSE3(const uint8* src,
"+r"(dst_a), // %1
"+r"(dst_b), // %2
"+r"(temp_width) // %3
: "m"(kShuffleReverseUV) // %4
: "m"(kShuffleMirrorUV) // %4
: "memory", "cc"
#if defined(__SSE2__)
, "xmm0", "xmm5"
@@ -1066,7 +1066,7 @@ void ReverseRowUV_SSSE3(const uint8* src,
}
#endif
static void ReverseRowUV_C(const uint8* src,
static void MirrorRowUV_C(const uint8* src,
uint8* dst_a, uint8* dst_b,
int width) {
int i;
@@ -1083,29 +1083,29 @@ void RotateUV180(const uint8* src, int src_stride,
uint8* dst_b, int dst_stride_b,
int width, int height) {
int i;
reverse_uv_func ReverseRow;
mirror_uv_func MirrorRow;
#if defined(HAS_REVERSE_ROW_UV_NEON)
#if defined(HAS_MIRRORROW_UV_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
ReverseRow = ReverseRowUV_NEON;
MirrorRow = MirrorRowUV_NEON;
} else
#endif
#if defined(HAS_REVERSE_ROW_UV_SSSE3)
#if defined(HAS_MIRRORROW_UV_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3) &&
IS_ALIGNED(width, 16) &&
IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16)) {
ReverseRow = ReverseRowUV_SSSE3;
MirrorRow = MirrorRowUV_SSSE3;
} else
#endif
{
ReverseRow = ReverseRowUV_C;
MirrorRow = MirrorRowUV_C;
}
dst_a += dst_stride_a * (height - 1);
dst_b += dst_stride_b * (height - 1);
for (i = 0; i < height; ++i) {
ReverseRow(src, dst_a, dst_b, width);
MirrorRow(src, dst_a, dst_b, width);
src += src_stride; // down one line at a time
dst_a -= dst_stride_a; // nominally up one line at a time
@@ -19,7 +19,7 @@ extern "C" {
#if defined(__ARM_NEON__) && !defined(YUV_DISABLE_ASM)
void ReverseRow_NEON(const uint8* src, uint8* dst, int width) {
void MirrorRow_NEON(const uint8* src, uint8* dst, int width) {
asm volatile (
// compute where to start writing destination
"add %1, %2 \n"
@@ -38,7 +38,7 @@ void ReverseRow_NEON(const uint8* src, uint8* dst, int width) {
"beq 2f \n"
// back of destination by the size of the register that is
// going to be reversed
// going to be mirrored
"sub %1, #16 \n"
// the loop needs to run on blocks of 16. what will be left
@@ -50,12 +50,12 @@ void ReverseRow_NEON(const uint8* src, uint8* dst, int width) {
"1: \n"
"vld1.8 {q0}, [%0]! \n" // src += 16
// reverse the bytes in the 64 bit segments. unable to reverse
// mirror the bytes in the 64 bit segments. unable to mirror
// the bytes in the entire 128 bits in one go.
"vrev64.8 q0, q0 \n"
// because of the inability to reverse the entire 128 bits
// reverse the writing out of the two 64 bit segments.
// because of the inability to mirror the entire 128 bits
// mirror the writing out of the two 64 bit segments.
"vst1.8 {d1}, [%1]! \n"
"vst1.8 {d0}, [%1], r3 \n" // dst -= 16
@@ -272,7 +272,7 @@ void TransposeWx8_NEON(const uint8* src, int src_stride,
);
}
void ReverseRowUV_NEON(const uint8* src,
void MirrorRowUV_NEON(const uint8* src,
uint8* dst_a, uint8* dst_b,
int width) {
asm volatile (
@@ -291,7 +291,7 @@ void ReverseRowUV_NEON(const uint8* src,
"mov r12, #-8 \n"
// back of destination by the size of the register that is
// going to be reversed
// going to be mirrored
"sub %1, #8 \n"
"sub %2, #8 \n"
@@ -304,7 +304,7 @@ void ReverseRowUV_NEON(const uint8* src,
"1: \n"
"vld2.8 {d0, d1}, [%0]! \n" // src += 16
// reverse the bytes in the 64 bit segments
// mirror the bytes in the 64 bit segments
"vrev64.8 q0, q0 \n"
"vst1.8 {d0}, [%1], r12 \n" // dst_a -= 8
@@ -39,8 +39,8 @@
#define HAS_FASTCONVERTYUVTOBGRAROW_SSSE3
#define HAS_FASTCONVERTYUVTOABGRROW_SSSE3
#define HAS_FASTCONVERTYUV444TOARGBROW_SSSE3
#define HAS_REVERSE_ROW_SSSE3
#define HAS_REVERSE_ROW_SSE2
#define HAS_MIRRORROW_SSSE3
#define HAS_MIRRORROW_SSE2
#endif
// The following are available on Windows platforms
@@ -58,7 +58,7 @@
// The following are available on Neon platforms
#if defined(__ARM_NEON__) && !defined(YUV_DISABLE_ASM)
#define HAS_REVERSE_ROW_NEON
#define HAS_MIRRORROW_NEON
#define HAS_FASTCONVERTYUVTOARGBROW_NEON
#define HAS_FASTCONVERTYUVTOBGRAROW_NEON
#define HAS_FASTCONVERTYUVTOABGRROW_NEON
@@ -107,10 +107,10 @@ void BGRAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
void ABGRToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int width);
void ReverseRow_SSSE3(const uint8* src, uint8* dst, int width);
void ReverseRow_SSE2(const uint8* src, uint8* dst, int width);
void ReverseRow_NEON(const uint8* src, uint8* dst, int width);
void ReverseRow_C(const uint8* src, uint8* dst, int width);
void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width);
void MirrorRow_SSE2(const uint8* src, uint8* dst, int width);
void MirrorRow_NEON(const uint8* src, uint8* dst, int width);
void MirrorRow_C(const uint8* src, uint8* dst, int width);
void ARGBToYRow_C(const uint8* src_argb, uint8* dst_y, int pix);
void BGRAToYRow_C(const uint8* src_argb, uint8* dst_y, int pix);
@@ -360,7 +360,7 @@ void FastConvertYToARGBRow_C(const uint8* y_buf,
}
}
void ReverseRow_C(const uint8* src, uint8* dst, int width) {
void MirrorRow_C(const uint8* src, uint8* dst, int width) {
src += width - 1;
for (int i = 0; i < width; ++i) {
dst[i] = src[0];
@@ -644,14 +644,14 @@ void BGRAToUVRow_SSSE3(const uint8* src_argb, int src_stride_argb,
}
#endif
#ifdef HAS_REVERSE_ROW_SSSE3
#ifdef HAS_MIRRORROW_SSSE3
// Shuffle table for reversing the bytes.
CONST uvec8 kShuffleReverse = {
CONST uvec8 kShuffleMirror = {
15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u
};
void ReverseRow_SSSE3(const uint8* src, uint8* dst, int width) {
void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
intptr_t temp_width = static_cast<intptr_t>(width);
asm volatile (
"movdqa %3,%%xmm5 \n"
@@ -666,7 +666,7 @@ void ReverseRow_SSSE3(const uint8* src, uint8* dst, int width) {
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(temp_width) // %2
: "m"(kShuffleReverse) // %3
: "m"(kShuffleMirror) // %3
: "memory", "cc"
#if defined(__SSE2__)
, "xmm0", "xmm5"
@@ -675,15 +675,15 @@ void ReverseRow_SSSE3(const uint8* src, uint8* dst, int width) {
}
#endif
#ifdef HAS_REVERSE_ROW_SSE2
#ifdef HAS_MIRRORROW_SSE2
void ReverseRow_SSE2(const uint8* src, uint8* dst, int width) {
void MirrorRow_SSE2(const uint8* src, uint8* dst, int width) {
intptr_t temp_width = static_cast<intptr_t>(width);
asm volatile (
"lea -0x10(%0),%0 \n"
"1: \n"
"movdqa (%0,%2),%%xmm0 \n"
"movdqa %%xmm0,%%xmm1 \n"
"movdqu (%0,%2),%%xmm0 \n"
"movdqu %%xmm0,%%xmm1 \n"
"psllw $0x8,%%xmm0 \n"
"psrlw $0x8,%%xmm1 \n"
"por %%xmm1,%%xmm0 \n"
@@ -691,7 +691,7 @@ void ReverseRow_SSE2(const uint8* src, uint8* dst, int width) {
"pshufhw $0x1b,%%xmm0,%%xmm0 \n"
"pshufd $0x4e,%%xmm0,%%xmm0 \n"
"sub $0x10,%2 \n"
"movdqa %%xmm0,(%1) \n"
"movdqu %%xmm0,(%1) \n"
"lea 0x10(%1),%1 \n"
"ja 1b \n"
: "+r"(src), // %0
@@ -1169,20 +1169,20 @@ void FastConvertYToARGBRow_SSE2(const uint8* y_buf,
#endif
#endif
#ifdef HAS_REVERSE_ROW_SSSE3
#ifdef HAS_MIRRORROW_SSSE3
// Shuffle table for reversing the bytes.
static const uvec8 kShuffleReverse = {
static const uvec8 kShuffleMirror = {
15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u
};
__declspec(naked)
void ReverseRow_SSSE3(const uint8* src, uint8* dst, int width) {
void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
__asm {
mov eax, [esp + 4] // src
mov edx, [esp + 8] // dst
mov ecx, [esp + 12] // width
movdqa xmm5, kShuffleReverse
movdqa xmm5, kShuffleMirror
lea eax, [eax - 16]
convertloop:
movdqa xmm0, [eax + ecx]
@@ -1196,18 +1196,20 @@ __asm {
}
#endif
#ifdef HAS_REVERSE_ROW_SSE2
#ifdef HAS_MIRRORROW_SSE2
// SSE2 version has movdqu so it can be used on misaligned buffers when SSSE3
// version cannot.
__declspec(naked)
void ReverseRow_SSE2(const uint8* src, uint8* dst, int width) {
void MirrorRow_SSE2(const uint8* src, uint8* dst, int width) {
__asm {
mov eax, [esp + 4] // src
mov edx, [esp + 8] // dst
mov ecx, [esp + 12] // width
lea eax, [eax - 16]
convertloop:
movdqa xmm0, [eax + ecx]
movdqa xmm1, xmm0 // swap bytes
movdqu xmm0, [eax + ecx]
movdqu xmm1, xmm0 // swap bytes
psllw xmm0, 8
psrlw xmm1, 8
por xmm0, xmm1
@@ -1215,7 +1217,7 @@ __asm {
pshufhw xmm0, xmm0, 0x1b
pshufd xmm0, xmm0, 0x4e // swap qwords
sub ecx, 16
movdqa [edx], xmm0
movdqu [edx], xmm0
lea edx, [edx + 16]
ja convertloop
ret