Commit 16a96645 authored by fbarchard@google.com's avatar fbarchard@google.com

SplitUV and MirrorUV in row use 2 pixels at a time in C

BUG=none
TEST=none
Review URL: https://webrtc-codereview.appspot.com/432006

git-svn-id: http://libyuv.googlecode.com/svn/trunk@201 16f28f9a-4ce2-e073-06de-1de4eb20be90
parent f69e90a1
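
For reference, a minimal standalone sketch of the two-pixels-at-a-time pattern this change applies to the C row functions, with a one-pixel tail for odd widths (hypothetical harness; plain unsigned char rather than libyuv's uint8 typedef):

#include <assert.h>

// Deinterleave a packed UV row (U0 V0 U1 V1 ...) into planar U and V,
// two pixels per loop iteration, as SplitUV_C does after this change.
static void SplitUVSketch(const unsigned char* src_uv,
                          unsigned char* dst_u, unsigned char* dst_v,
                          int width) {
  for (int x = 0; x < width - 1; x += 2) {
    dst_u[x] = src_uv[0];
    dst_u[x + 1] = src_uv[2];
    dst_v[x] = src_uv[1];
    dst_v[x + 1] = src_uv[3];
    src_uv += 4;
  }
  if (width & 1) {  // odd width: one trailing pixel remains
    dst_u[width - 1] = src_uv[0];
    dst_v[width - 1] = src_uv[1];
  }
}

int main(void) {
  const unsigned char uv[6] = {10, 20, 11, 21, 12, 22};  // 3 UV pixels
  unsigned char u[3], v[3];
  SplitUVSketch(uv, u, v, 3);
  assert(u[0] == 10 && u[1] == 11 && u[2] == 12);
  assert(v[0] == 20 && v[1] == 21 && v[2] == 22);
  return 0;
}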
Name: libyuv
URL: http://code.google.com/p/libyuv/
Version: 200
Version: 201
License: BSD
License File: LICENSE
@@ -11,7 +11,7 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_
#define INCLUDE_LIBYUV_VERSION_H_
#define LIBYUV_VERSION 200
#define LIBYUV_VERSION 201
#endif // INCLUDE_LIBYUV_VERSION_H_
@@ -23,12 +23,6 @@ extern "C" {
#if (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__)) && \
!defined(YUV_DISABLE_ASM)
// Note: static const is preferred, but it causes an internal compiler error on gcc 4.2
// Shuffle table for reversing the bytes of UV channels.
uvec8 kShuffleMirrorUV = {
14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u
};
#if defined(__APPLE__) && defined(__i386__)
#define DECLARE_FUNCTION(name) \
".text \n" \
@@ -759,8 +753,7 @@ static void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
static void TransposeWx8_C(const uint8* src, int src_stride,
uint8* dst, int dst_stride,
int w) {
int i;
for (i = 0; i < w; ++i) {
for (int i = 0; i < w; ++i) {
dst[0] = src[0 * src_stride];
dst[1] = src[1 * src_stride];
dst[2] = src[2 * src_stride];
@@ -777,9 +770,8 @@ static void TransposeWx8_C(const uint8* src, int src_stride,
static void TransposeWxH_C(const uint8* src, int src_stride,
uint8* dst, int dst_stride,
int width, int height) {
int i, j;
for (i = 0; i < width; ++i)
for (j = 0; j < height; ++j)
for (int i = 0; i < width; ++i)
for (int j = 0; j < height; ++j)
dst[i * dst_stride + j] = src[j * src_stride + i];
}
@@ -1005,79 +997,6 @@ void RotateUV270(const uint8* src, int src_stride,
width, height);
}
#if defined(_M_IX86) && !defined(YUV_DISABLE_ASM)
#define HAS_MIRRORROW_UV_SSSE3
__declspec(naked)
void MirrorRowUV_SSSE3(const uint8* src, uint8* dst_a, uint8* dst_b,
int width) {
__asm {
push edi
mov eax, [esp + 4 + 4] // src
mov edx, [esp + 4 + 8] // dst_a
mov edi, [esp + 4 + 12] // dst_b
mov ecx, [esp + 4 + 16] // width
movdqa xmm1, kShuffleMirrorUV
lea eax, [eax + ecx * 2 - 16]
sub edi, edx
convertloop:
movdqa xmm0, [eax]
lea eax, [eax - 16]
pshufb xmm0, xmm1
sub ecx, 8
movlpd qword ptr [edx], xmm0
movhpd qword ptr [edx + edi], xmm0
lea edx, [edx + 8]
ja convertloop
pop edi
ret
}
}
#elif (defined(__i386__) || defined(__x86_64__)) && \
!defined(YUV_DISABLE_ASM)
#define HAS_MIRRORROW_UV_SSSE3
void MirrorRowUV_SSSE3(const uint8* src, uint8* dst_a, uint8* dst_b,
int width) {
intptr_t temp_width = static_cast<intptr_t>(width);
asm volatile (
"movdqa %4,%%xmm1 \n"
"lea -16(%0,%3,2),%0 \n"
"sub %1,%2 \n"
"1: \n"
"movdqa (%0),%%xmm0 \n"
"lea -16(%0),%0 \n"
"pshufb %%xmm1,%%xmm0 \n"
"sub $8,%3 \n"
"movlpd %%xmm0,(%1) \n"
"movhpd %%xmm0,(%1,%2) \n"
"lea 8(%1),%1 \n"
"ja 1b \n"
: "+r"(src), // %0
"+r"(dst_a), // %1
"+r"(dst_b), // %2
"+r"(temp_width) // %3
: "m"(kShuffleMirrorUV) // %4
: "memory", "cc"
#if defined(__SSE2__)
, "xmm0", "xmm1"
#endif
);
}
#endif
static void MirrorRowUV_C(const uint8* src,
uint8* dst_a, uint8* dst_b,
int width) {
src += (width << 1) - 2;
for (int i = 0; i < width; ++i) {
dst_a[i] = src[0];
dst_b[i] = src[1];
src -= 2;
}
}
void RotateUV180(const uint8* src, int src_stride,
uint8* dst_a, int dst_stride_a,
uint8* dst_b, int dst_stride_b,
@@ -19,94 +19,6 @@ extern "C" {
#if defined(__ARM_NEON__) && !defined(YUV_DISABLE_ASM)
void MirrorRow_NEON(const uint8* src, uint8* dst, int width) {
asm volatile (
// compute where to start writing destination
"add %1, %2 \n"
// work on segments that are multiples of 16
"lsrs r3, %2, #4 \n"
// the output is written in two blocks: 8 bytes followed
// by another 8. reading is done sequentially, from left to
// right. writing is done from right to left in 16 byte blocks.
// %1, the destination pointer, is incremented after writing
// the first of the two blocks, so subtract that 8 off
// along with 16 to get the next location.
"mov r3, #-24 \n"
"beq 2f \n"
// back up the destination by the size of the register that is
// going to be mirrored
"sub %1, #16 \n"
// the loop needs to run on blocks of 16. what will be left
// over is either a negative number, the residuals that need
// to be done, or 0. if this isn't subtracted off here the
// loop will run one extra time.
"sub %2, #16 \n"
"1: \n"
"vld1.8 {q0}, [%0]! \n" // src += 16
// mirror the bytes in each 64 bit segment. it is not
// possible to mirror the entire 128 bits in one go.
"vrev64.8 q0, q0 \n"
// because the entire 128 bits cannot be mirrored in one go,
// swap the order in which the two 64 bit segments are written.
"vst1.8 {d1}, [%1]! \n"
"vst1.8 {d0}, [%1], r3 \n" // dst -= 16
"subs %2, #16 \n"
"bge 1b \n"
// add 16 back to the counter. if the result is 0 there are no
// residuals so jump past
"adds %2, #16 \n"
"beq 5f \n"
"add %1, #16 \n"
"2: \n"
"mov r3, #-3 \n"
"sub %1, #2 \n"
"subs %2, #2 \n"
// check for 16*n+1 scenarios where segments_of_2 should not
// be run, but there is something left over.
"blt 4f \n"
// do this in neon registers as per
// http://blogs.arm.com/software-enablement/196-coding-for-neon-part-2-dealing-with-leftovers/
"3: \n"
"vld2.8 {d0[0], d1[0]}, [%0]! \n" // src += 2
"vst1.8 {d1[0]}, [%1]! \n"
"vst1.8 {d0[0]}, [%1], r3 \n" // dst -= 2
"subs %2, #2 \n"
"bge 3b \n"
"adds %2, #2 \n"
"beq 5f \n"
"4: \n"
"add %1, #1 \n"
"vld1.8 {d0[0]}, [%0] \n"
"vst1.8 {d0[0]}, [%1] \n"
"5: \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2
:
: "memory", "cc", "r3", "q0"
);
}
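
The comments above follow the ARM "dealing with leftovers" approach: a wide loop on 16-byte blocks, then 2-byte pairs, then a final single byte. A rough scalar sketch of that control flow for the byte mirror (illustration only, not the shipped code):

static void MirrorRowSketch(const unsigned char* src, unsigned char* dst,
                            int width) {
  dst += width;               // start writing at the right edge of the row
  while (width >= 16) {       // main loop: whole 16-byte blocks
    for (int i = 0; i < 16; ++i) {
      dst[-1 - i] = src[i];   // copy the block reversed
    }
    src += 16;
    dst -= 16;
    width -= 16;
  }
  while (width >= 2) {        // leftovers: 2-byte pairs
    dst[-1] = src[0];
    dst[-2] = src[1];
    src += 2;
    dst -= 2;
    width -= 2;
  }
  if (width) {                // a final odd byte, if any
    dst[-1] = src[0];
  }
}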
static const uvec8 vtbl_4x4_transpose =
{ 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15 };
@@ -272,80 +184,6 @@ void TransposeWx8_NEON(const uint8* src, int src_stride,
);
}
void MirrorRowUV_NEON(const uint8* src,
uint8* dst_a, uint8* dst_b,
int width) {
asm volatile (
// compute where to start writing destination
"add %1, %3 \n" // dst_a + width
"add %2, %3 \n" // dst_b + width
// work on input segments that are multiples of 16, but the
// width that has been passed counts output pixels, which are
// half the size of the input.
"lsrs r12, %3, #3 \n"
"beq 2f \n"
// the output is written into two blocks.
"mov r12, #-8 \n"
// back up the destination by the size of the register that is
// going to be mirrored
"sub %1, #8 \n"
"sub %2, #8 \n"
// the loop needs to run on blocks of 8. what will be left
// over is either a negative number, the residuals that need
// to be done, or 0. if this isn't subtracted off here the
// loop will run one extra time.
"sub %3, #8 \n"
"1: \n"
"vld2.8 {d0, d1}, [%0]! \n" // src += 16
// mirror the bytes in the 64 bit segments
"vrev64.8 q0, q0 \n"
"vst1.8 {d0}, [%1], r12 \n" // dst_a -= 8
"vst1.8 {d1}, [%2], r12 \n" // dst_b -= 8
"subs %3, #8 \n"
"bge 1b \n"
// add 8 back to the counter. if the result is 0 there are no
// residuals so return
"adds %3, #8 \n"
"beq 4f \n"
"add %1, #8 \n"
"add %2, #8 \n"
"2: \n"
"mov r12, #-1 \n"
"sub %1, #1 \n"
"sub %2, #1 \n"
"3: \n"
"vld2.8 {d0[0], d1[0]}, [%0]! \n" // src += 2
"vst1.8 {d0[0]}, [%1], r12 \n" // dst_a -= 1
"vst1.8 {d1[0]}, [%2], r12 \n" // dst_b -= 1
"subs %3, %3, #1 \n"
"bgt 3b \n"
"4: \n"
: "+r"(src), // %0
"+r"(dst_a), // %1
"+r"(dst_b), // %2
"+r"(width) // %3
:
: "memory", "cc", "r12", "q0"
);
}
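
In the main loop above, vld2.8 deinterleaves on load (U bytes to d0, V bytes to d1) and vrev64.8 then reverses the bytes within each 64-bit register, so each store writes 8 already-mirrored samples. A scalar model of one iteration (illustration only):

// One iteration: read 8 UV pairs, emit 8 mirrored U and 8 mirrored V bytes.
static void MirrorUVBlock8(const unsigned char* src_uv,
                           unsigned char* dst_u, unsigned char* dst_v) {
  unsigned char d0[8], d1[8];
  for (int i = 0; i < 8; ++i) {  // vld2.8: deinterleave into d0/d1
    d0[i] = src_uv[2 * i];
    d1[i] = src_uv[2 * i + 1];
  }
  for (int i = 0; i < 8; ++i) {  // vrev64.8 + vst1.8: store reversed
    dst_u[i] = d0[7 - i];
    dst_v[i] = d1[7 - i];
  }
}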
static const uvec8 vtbl_4x4_transpose_di =
{ 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15 };
@@ -54,6 +54,7 @@ extern "C" {
#define HAS_I444TOARGBROW_SSSE3
#define HAS_MIRRORROW_SSSE3
#define HAS_MIRRORROW_SSE2
#define HAS_MIRRORROW_UV_SSSE3
#define HAS_SPLITUV_SSE2
#define HAS_COPYROW_SSE2
#define HAS_COPYROW_X86
@@ -66,6 +67,7 @@ extern "C" {
// The following are available on Neon platforms
#if defined(__ARM_NEON__) && !defined(YUV_DISABLE_ASM)
#define HAS_MIRRORROW_NEON
#define HAS_MIRRORROW_UV_NEON
#define HAS_SPLITUV_NEON
#define HAS_COPYROW_NEON
#define HAS_I420TOARGBROW_NEON
@@ -126,6 +128,10 @@ void MirrorRow_SSE2(const uint8* src, uint8* dst, int width);
void MirrorRow_NEON(const uint8* src, uint8* dst, int width);
void MirrorRow_C(const uint8* src, uint8* dst, int width);
void MirrorRowUV_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v, int width);
void MirrorRowUV_NEON(const uint8* src, uint8* dst_u, uint8* dst_v, int width);
void MirrorRowUV_C(const uint8* src, uint8* dst_u, uint8* dst_v, int width);
void SplitUV_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix);
void SplitUV_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix);
void SplitUV_C(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix);
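
Callers select among these variants per CPU at runtime via a function pointer. A sketch of that dispatch shape (MirrorUVPlane is a hypothetical wrapper; TestCpuFlag and IS_ALIGNED are libyuv's CPU-flag and alignment helpers, and the exact checks at each call site may differ):

void MirrorUVPlane(const uint8* src, uint8* dst_u, uint8* dst_v, int width) {
  void (*MirrorRowUV)(const uint8*, uint8*, uint8*, int) = MirrorRowUV_C;
#if defined(HAS_MIRRORROW_UV_NEON)
  if (TestCpuFlag(kCpuHasNEON)) {
    MirrorRowUV = MirrorRowUV_NEON;
  }
#endif
#if defined(HAS_MIRRORROW_UV_SSSE3)
  // SSSE3 version handles 8 UV pairs per loop from an aligned 16-byte load.
  if (TestCpuFlag(kCpuHasSSSE3) &&
      IS_ALIGNED(width, 8) && IS_ALIGNED(src, 16)) {
    MirrorRowUV = MirrorRowUV_SSSE3;
  }
#endif
  MirrorRowUV(src, dst_u, dst_v, width);
}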
@@ -18,8 +18,8 @@ namespace libyuv {
extern "C" {
#endif
void ABGRToARGBRow_C(const uint8* src_abgr, uint8* dst_argb, int pix) {
for (int x = 0; x < pix; ++x) {
void ABGRToARGBRow_C(const uint8* src_abgr, uint8* dst_argb, int width) {
for (int x = 0; x < width; ++x) {
// To support in-place conversion.
uint8 r = src_abgr[0];
uint8 g = src_abgr[1];
@@ -34,8 +34,8 @@ void ABGRToARGBRow_C(const uint8* src_abgr, uint8* dst_argb, int pix) {
}
}
void BGRAToARGBRow_C(const uint8* src_bgra, uint8* dst_argb, int pix) {
for (int x = 0; x < pix; ++x) {
void BGRAToARGBRow_C(const uint8* src_bgra, uint8* dst_argb, int width) {
for (int x = 0; x < width; ++x) {
// To support in-place conversion.
uint8 a = src_bgra[0];
uint8 r = src_bgra[1];
@@ -50,8 +50,8 @@ void BGRAToARGBRow_C(const uint8* src_bgra, uint8* dst_argb, int pix) {
}
}
void RGB24ToARGBRow_C(const uint8* src_rgb24, uint8* dst_argb, int pix) {
for (int x = 0; x < pix; ++x) {
void RGB24ToARGBRow_C(const uint8* src_rgb24, uint8* dst_argb, int width) {
for (int x = 0; x < width; ++x) {
uint8 b = src_rgb24[0];
uint8 g = src_rgb24[1];
uint8 r = src_rgb24[2];
@@ -64,8 +64,8 @@ void RGB24ToARGBRow_C(const uint8* src_rgb24, uint8* dst_argb, int pix) {
}
}
void RAWToARGBRow_C(const uint8* src_raw, uint8* dst_argb, int pix) {
for (int x = 0; x < pix; ++x) {
void RAWToARGBRow_C(const uint8* src_raw, uint8* dst_argb, int width) {
for (int x = 0; x < width; ++x) {
uint8 r = src_raw[0];
uint8 g = src_raw[1];
uint8 b = src_raw[2];
@@ -78,8 +78,8 @@ void RAWToARGBRow_C(const uint8* src_raw, uint8* dst_argb, int pix) {
}
}
void RGB565ToARGBRow_C(const uint8* src_rgb, uint8* dst_argb, int pix) {
for (int x = 0; x < pix; ++x) {
void RGB565ToARGBRow_C(const uint8* src_rgb, uint8* dst_argb, int width) {
for (int x = 0; x < width; ++x) {
uint8 b = src_rgb[0] & 0x1f;
uint8 g = (src_rgb[0] >> 5) | ((src_rgb[1] & 0x07) << 3);
uint8 r = src_rgb[1] >> 3;
@@ -92,8 +92,8 @@ void RGB565ToARGBRow_C(const uint8* src_rgb, uint8* dst_argb, int pix) {
}
}
void ARGB1555ToARGBRow_C(const uint8* src_rgb, uint8* dst_argb, int pix) {
for (int x = 0; x < pix; ++x) {
void ARGB1555ToARGBRow_C(const uint8* src_rgb, uint8* dst_argb, int width) {
for (int x = 0; x < width; ++x) {
uint8 b = src_rgb[0] & 0x1f;
uint8 g = (src_rgb[0] >> 5) | ((src_rgb[1] & 0x03) << 3);
uint8 r = (src_rgb[1] & 0x7c) >> 2;
@@ -107,8 +107,8 @@ void ARGB1555ToARGBRow_C(const uint8* src_rgb, uint8* dst_argb, int pix) {
}
}
void ARGB4444ToARGBRow_C(const uint8* src_rgb, uint8* dst_argb, int pix) {
for (int x = 0; x < pix; ++x) {
void ARGB4444ToARGBRow_C(const uint8* src_rgb, uint8* dst_argb, int width) {
for (int x = 0; x < width; ++x) {
uint8 a = src_rgb[1] >> 4;
uint8 r = src_rgb[1] & 0x0f;
uint8 g = src_rgb[0] >> 4;
@@ -122,8 +122,8 @@ void ARGB4444ToARGBRow_C(const uint8* src_rgb, uint8* dst_argb, int pix) {
}
}
void ARGBToRGB24Row_C(const uint8* src_argb, uint8* dst_rgb, int pix) {
for (int x = 0; x < pix; ++x) {
void ARGBToRGB24Row_C(const uint8* src_argb, uint8* dst_rgb, int width) {
for (int x = 0; x < width; ++x) {
uint8 b = src_argb[0];
uint8 g = src_argb[1];
uint8 r = src_argb[2];
@@ -135,8 +135,8 @@ void ARGBToRGB24Row_C(const uint8* src_argb, uint8* dst_rgb, int pix) {
}
}
void ARGBToRAWRow_C(const uint8* src_argb, uint8* dst_rgb, int pix) {
for (int x = 0; x < pix; ++x) {
void ARGBToRAWRow_C(const uint8* src_argb, uint8* dst_rgb, int width) {
for (int x = 0; x < width; ++x) {
uint8 b = src_argb[0];
uint8 g = src_argb[1];
uint8 r = src_argb[2];
@@ -149,8 +149,8 @@ void ARGBToRAWRow_C(const uint8* src_argb, uint8* dst_rgb, int pix) {
}
// TODO(fbarchard): support big endian CPU
void ARGBToRGB565Row_C(const uint8* src_argb, uint8* dst_rgb, int pix) {
for (int x = 0; x < pix; ++x) {
void ARGBToRGB565Row_C(const uint8* src_argb, uint8* dst_rgb, int width) {
for (int x = 0; x < width; ++x) {
uint8 b = src_argb[0] >> 3;
uint8 g = src_argb[1] >> 2;
uint8 r = src_argb[2] >> 3;
@@ -160,8 +160,8 @@ void ARGBToRGB565Row_C(const uint8* src_argb, uint8* dst_rgb, int pix) {
}
}
void ARGBToARGB1555Row_C(const uint8* src_argb, uint8* dst_rgb, int pix) {
for (int x = 0; x < pix; ++x) {
void ARGBToARGB1555Row_C(const uint8* src_argb, uint8* dst_rgb, int width) {
for (int x = 0; x < width; ++x) {
uint8 b = src_argb[0] >> 3;
uint8 g = src_argb[1] >> 3;
uint8 r = src_argb[2] >> 3;
@@ -172,8 +172,8 @@ void ARGBToARGB1555Row_C(const uint8* src_argb, uint8* dst_rgb, int pix) {
}
}
void ARGBToARGB4444Row_C(const uint8* src_argb, uint8* dst_rgb, int pix) {
for (int x = 0; x < pix; ++x) {
void ARGBToARGB4444Row_C(const uint8* src_argb, uint8* dst_rgb, int width) {
for (int x = 0; x < width; ++x) {
uint8 b = src_argb[0] >> 4;
uint8 g = src_argb[1] >> 4;
uint8 r = src_argb[2] >> 4;
@@ -233,9 +233,9 @@ MAKEROWY(ARGB,2,1,0)
MAKEROWY(BGRA,1,2,3)
MAKEROWY(ABGR,0,1,2)
void I400ToARGBRow_C(const uint8* src_y, uint8* dst_argb, int pix) {
void I400ToARGBRow_C(const uint8* src_y, uint8* dst_argb, int width) {
// Copy a Y to RGB.
for (int x = 0; x < pix; ++x) {
for (int x = 0; x < width; ++x) {
uint8 y = src_y[0];
dst_argb[2] = dst_argb[1] = dst_argb[0] = y;
dst_argb[3] = 255u;
@@ -360,20 +360,42 @@ void YToARGBRow_C(const uint8* y_buf, uint8* rgb_buf, int width) {
void MirrorRow_C(const uint8* src, uint8* dst, int width) {
src += width - 1;
for (int i = 0; i < width; ++i) {
dst[i] = src[0];
--src;
for (int x = 0; x < width - 1; x += 2) {
dst[x] = src[0];
dst[x + 1] = src[-1];
src -= 2;
}
if (width & 1) {
dst[width - 1] = src[0];
}
}
void SplitUV_C(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {
// Copy a row of UV.
for (int x = 0; x < pix; ++x) {
dst_u[0] = src_uv[0];
dst_v[0] = src_uv[1];
src_uv += 2;
dst_u += 1;
dst_v += 1;
void MirrorRowUV_C(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int width) {
src_uv += (width - 1) << 1;
for (int x = 0; x < width - 1; x += 2) {
dst_u[x] = src_uv[0];
dst_u[x + 1] = src_uv[-2];
dst_v[x] = src_uv[1];
dst_v[x + 1] = src_uv[-2 + 1];
src_uv -= 4;
}
if (width & 1) {
dst_u[width - 1] = src_uv[0];
dst_v[width - 1] = src_uv[1];
}
}
void SplitUV_C(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int width) {
for (int x = 0; x < width - 1; x += 2) {
dst_u[x] = src_uv[0];
dst_u[x + 1] = src_uv[2];
dst_v[x] = src_uv[1];
dst_v[x + 1] = src_uv[3];
src_uv += 4;
}
if (width & 1) {
dst_u[width - 1] = src_uv[0];
dst_v[width - 1] = src_uv[1];
}
}
@@ -383,9 +405,9 @@ void CopyRow_C(const uint8* src, uint8* dst, int count) {
// Filter 2 rows of YUY2 UV's (422) into U and V (420)
void YUY2ToUVRow_C(const uint8* src_yuy2, int src_stride_yuy2,
uint8* dst_u, uint8* dst_v, int pix) {
uint8* dst_u, uint8* dst_v, int width) {
// Output a row of UV values, filtering 2 rows of YUY2
for (int x = 0; x < pix; x += 2) {
for (int x = 0; x < width; x += 2) {
dst_u[0] = (src_yuy2[1] + src_yuy2[src_stride_yuy2 + 1] + 1) >> 1;
dst_v[0] = (src_yuy2[3] + src_yuy2[src_stride_yuy2 + 3] + 1) >> 1;
src_yuy2 += 4;
@@ -394,20 +416,22 @@ void YUY2ToUVRow_C(const uint8* src_yuy2, int src_stride_yuy2,
}
}
void YUY2ToYRow_C(const uint8* src_yuy2,
uint8* dst_y, int pix) {
void YUY2ToYRow_C(const uint8* src_yuy2, uint8* dst_y, int width) {
// Copy a row of yuy2 Y values
for (int x = 0; x < pix; ++x) {
dst_y[0] = src_yuy2[0];
src_yuy2 += 2;
dst_y += 1;
for (int x = 0; x < width - 1; x += 2) {
dst_y[x] = src_yuy2[0];
dst_y[x + 1] = src_yuy2[2];
src_yuy2 += 4;
}
if (width & 1) {
dst_y[width - 1] = src_yuy2[0];
}
}
void UYVYToUVRow_C(const uint8* src_uyvy, int src_stride_uyvy,
uint8* dst_u, uint8* dst_v, int pix) {
uint8* dst_u, uint8* dst_v, int width) {
// Output a row of UV values, filtering 2 rows of UYVY
for (int x = 0; x < pix; x += 2) {
for (int x = 0; x < width; x += 2) {
dst_u[0] = (src_uyvy[0] + src_uyvy[src_stride_uyvy + 0] + 1) >> 1;
dst_v[0] = (src_uyvy[2] + src_uyvy[src_stride_uyvy + 2] + 1) >> 1;
src_uyvy += 4;
@@ -416,13 +440,15 @@ void UYVYToUVRow_C(const uint8* src_uyvy, int src_stride_uyvy,
}
}
void UYVYToYRow_C(const uint8* src_uyvy,
uint8* dst_y, int pix) {
void UYVYToYRow_C(const uint8* src_uyvy, uint8* dst_y, int width) {
// Copy a row of uyvy Y values
for (int x = 0; x < pix; ++x) {
dst_y[0] = src_uyvy[1];
src_uyvy += 2;
dst_y += 1;
for (int x = 0; x < width - 1; x += 2) {
dst_y[x] = src_uyvy[1];
dst_y[x + 1] = src_uyvy[3];
src_uyvy += 4;
}
if (width & 1) {
dst_y[width - 1] = src_uyvy[1];
}
}
@@ -1493,7 +1493,6 @@ void YToARGBRow_SSE2(const uint8* y_buf,
#endif
#ifdef HAS_MIRRORROW_SSSE3
// Shuffle table for reversing the bytes.
CONST uvec8 kShuffleMirror = {
15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u
@@ -1524,7 +1523,6 @@ void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
#endif
#ifdef HAS_MIRRORROW_SSE2
void MirrorRow_SSE2(const uint8* src, uint8* dst, int width) {
intptr_t temp_width = static_cast<intptr_t>(width);
asm volatile (
@@ -1554,6 +1552,40 @@ void MirrorRow_SSE2(const uint8* src, uint8* dst, int width) {
}
#endif
#ifdef HAS_MIRRORROW_UV_SSSE3
// Shuffle table for reversing the bytes of UV channels.
CONST uvec8 kShuffleMirrorUV = {
14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u
};
void MirrorRowUV_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v,
int width) {
intptr_t temp_width = static_cast<intptr_t>(width);
asm volatile (
"movdqa %4,%%xmm1 \n"
"lea -16(%0,%3,2),%0 \n"
"sub %1,%2 \n"
"1: \n"
"movdqa (%0),%%xmm0 \n"
"lea -16(%0),%0 \n"
"pshufb %%xmm1,%%xmm0 \n"
"sub $8,%3 \n"
"movlpd %%xmm0,(%1) \n"
"movhpd %%xmm0,(%1,%2) \n"
"lea 8(%1),%1 \n"
"ja 1b \n"
: "+r"(src), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
"+r"(temp_width) // %3
: "m"(kShuffleMirrorUV) // %4
: "memory", "cc"
#if defined(__SSE2__)
, "xmm0", "xmm1"
#endif
);
}
#endif
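
The kShuffleMirrorUV table above does double duty: a single pshufb both mirrors the 8 UV pairs and deinterleaves them, leaving mirrored U in the low 8 bytes and mirrored V in the high 8, which is why one movlpd/movhpd pair can store to the two planes. A scalar model of the shuffle (ignoring pshufb's zeroing when bit 7 of a mask byte is set, which is unused here):

// dst[i] = src[mask[i] & 15] for each of the 16 byte lanes.
static void PshufbModel(const unsigned char src[16],
                        const unsigned char mask[16],
                        unsigned char dst[16]) {
  for (int i = 0; i < 16; ++i) {
    dst[i] = src[mask[i] & 15];
  }
}
// With kShuffleMirrorUV, lanes 0-7 pick bytes 14,12,...,0 (U mirrored) and
// lanes 8-15 pick bytes 15,13,...,1 (V mirrored) from the 16 loaded bytes.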
#ifdef HAS_SPLITUV_SSE2
void SplitUV_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {
asm volatile (
@@ -1501,7 +1501,6 @@ __asm {
#endif
#ifdef HAS_MIRRORROW_SSE2
// SSE2 version has movdqu so it can be used on unaligned buffers when the
// SSSE3 version cannot.
__declspec(naked)
@@ -1529,6 +1528,41 @@ __asm {
}
#endif
#ifdef HAS_MIRRORROW_UV_SSSE3
// Shuffle table for reversing the bytes of UV channels.
static const uvec8 kShuffleMirrorUV = {
14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u
};
__declspec(naked)
void MirrorRowUV_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v,
int width) {
__asm {
push edi
mov eax, [esp + 4 + 4] // src
mov edx, [esp + 4 + 8] // dst_u
mov edi, [esp + 4 + 12] // dst_v
mov ecx, [esp + 4 + 16] // width
movdqa xmm1, kShuffleMirrorUV
lea eax, [eax + ecx * 2 - 16]
sub edi, edx
convertloop:
movdqa xmm0, [eax]
lea eax, [eax - 16]
pshufb xmm0, xmm1
sub ecx, 8
movlpd qword ptr [edx], xmm0
movhpd qword ptr [edx + edi], xmm0
lea edx, [edx + 8]
ja convertloop
pop edi
ret
}
}
#endif
#ifdef HAS_SPLITUV_SSE2
__declspec(naked)
void SplitUV_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {