CopyRow instead of memcpy for copying planes

BUG=none TEST=none Review URL: https://webrtc-codereview.appspot.com/396008 git-svn-id: http://libyuv.googlecode.com/svn/trunk@175 16f28f9a-4ce2-e073-06de-1de4eb20be90

CopyRow instead of memcpy for copying planes
BUG=none TEST=none Review URL: https://webrtc-codereview.appspot.com/396008 git-svn-id: http://libyuv.googlecode.com/svn/trunk@175 16f28f9a-4ce2-e073-06de-1de4eb20be90
19932f8d · fbarchard@google.com · 2d11d43a · 19932f8d · 19932f8d · 19932f8d
Commit 19932f8d authored Feb 16, 2012 by fbarchard@google.com
12 changed files
--- a/include/libyuv/convert.h
+++ b/include/libyuv/convert.h
@@ -12,6 +12,8 @@
 #define INCLUDE_LIBYUV_CONVERT_H_
 #include "libyuv/basic_types.h"
+// TODO(fbarchard): Remove the following header includes
+#include "libyuv/convert_from.h"
 #include "libyuv/planar_functions.h"
 #include "libyuv/rotate.h"

--- a/include/libyuv/planar_functions.h
+++ b/include/libyuv/planar_functions.h
@@ -13,6 +13,10 @@
 #include "libyuv/basic_types.h"
+// TODO(fbarchard): Remove the following header includes
+#include "libyuv/convert.h"
+#include "libyuv/planar_functions.h"
 #ifdef __cplusplus
 namespace libyuv {
 extern "C" {

--- a/include/libyuv/version.h
+++ b/include/libyuv/version.h
@@ -11,7 +11,7 @@
 #ifndef INCLUDE_LIBYUV_VERSION_H_
 #define INCLUDE_LIBYUV_VERSION_H_
-#define LIBYUV_VERSION 174
+#define LIBYUV_VERSION 175
 #endif  // INCLUDE_LIBYUV_VERSION_H_
--- a/source/convert.cc
+++ b/source/convert.cc
@@ -10,8 +10,6 @@
 #include "libyuv/convert.h"
-#include <string.h>  // For memcpy()
 #include "libyuv/basic_types.h"
 #include "libyuv/cpu_id.h"
 #include "libyuv/format_conversion.h"
@@ -283,14 +281,34 @@ int I400ToI420(const uint8* src_y, int src_stride_y,
 static void CopyPlane2(const uint8* src, int src_stride_0, int src_stride_1,
                       uint8* dst, int dst_stride_frame,
                       int width, int height) {
+  void (*CopyRow)(const uint8* src, uint8* dst, int width) = CopyRow_C;  // Default; replaced below when a faster variant qualifies.
+#if defined(HAS_COPYROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 64)) {  // NEON variant copies 64 bytes per loop.
+    CopyRow = CopyRow_NEON;
+  }
+#elif defined(HAS_COPYROW_X86)
+  if (IS_ALIGNED(width, 4)) {  // X86 variant copies whole 4-byte words.
+    CopyRow = CopyRow_X86;
+#if defined(HAS_COPYROW_SSE2)
+    if (TestCpuFlag(kCpuHasSSE2) &&
+        IS_ALIGNED(width, 32) && IS_ALIGNED(src, 16) &&
+        IS_ALIGNED(src_stride_0, 16) && IS_ALIGNED(src_stride_1, 16) &&
+        IS_ALIGNED(dst, 16) && IS_ALIGNED(dst_stride_frame, 16)) {  // Both source strides are checked because rows alternate between them.
+      CopyRow = CopyRow_SSE2;
+    }
+#endif
+  }
+#endif
  // Copy plane
-  for (int y = 0; y < height; y += 2) {
+  for (int y = 0; y < height - 1; y += 2) {  // Full row pairs only; an odd last row is handled after the loop.
-    memcpy(dst, src, width);
+    CopyRow(src, dst, width);
-    src += src_stride_0;
+    CopyRow(src + src_stride_0, dst + dst_stride_frame, width);  // Second row of the pair.
-    dst += dst_stride_frame;
+    src += src_stride_0 + src_stride_1;  // Advance past both source rows.
-    memcpy(dst, src, width);
+    dst += dst_stride_frame * 2;
-    src += src_stride_1;
+  }
-    dst += dst_stride_frame;
+  if (height & 1) {  // Odd height: one unpaired row remains.
+    CopyRow(src, dst, width);
  }
 }
@@ -514,6 +532,24 @@ int Q420ToI420(const uint8* src_y, int src_stride_y,
    dst_stride_u = -dst_stride_u;
    dst_stride_v = -dst_stride_v;
  }
+  void (*CopyRow)(const uint8* src, uint8* dst, int width) = CopyRow_C;
+#if defined(HAS_COPYROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 64)) {
+    CopyRow = CopyRow_NEON;
+  }
+#elif defined(HAS_COPYROW_X86)
+  if (IS_ALIGNED(width, 4)) {
+    CopyRow = CopyRow_X86;
+#if defined(HAS_COPYROW_SSE2)
+    if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 32) &&
+        IS_ALIGNED(src_y, 16) && IS_ALIGNED(src_stride_y, 16) &&
+        IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
+      CopyRow = CopyRow_SSE2;
+    }
+#endif
+  }
+#endif
  void (*SplitYUY2)(const uint8* src_yuy2,
                    uint8* dst_y, uint8* dst_u, uint8* dst_v, int pix);
 #if defined(HAS_SPLITYUY2_SSE2)
@@ -528,7 +564,7 @@ int Q420ToI420(const uint8* src_y, int src_stride_y,
    SplitYUY2 = SplitYUY2_C;
  }
  for (int y = 0; y < height; y += 2) {
-    memcpy(dst_y, src_y, width);
+    CopyRow(src_y, dst_y, width);
    dst_y += dst_stride_y;
    src_y += src_stride_y;

--- a/source/convert_from.cc
+++ b/source/convert_from.cc
@@ -10,8 +10,6 @@
 #include "libyuv/convert_from.h"
-#include <string.h>  // For memcpy()
 #include "libyuv/basic_types.h"
 #include "libyuv/convert.h"  // For I420Copy
 #include "libyuv/cpu_id.h"
@@ -43,33 +41,53 @@ int I420ToI422(const uint8* src_y, int src_stride_y,
    dst_stride_u = -dst_stride_u;
    dst_stride_v = -dst_stride_v;
  }
+  int halfwidth = (width + 1) >> 1;
+  void (*CopyRow)(const uint8* src, uint8* dst, int width) = CopyRow_C;
+#if defined(HAS_COPYROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(halfwidth, 64)) {
+    CopyRow = CopyRow_NEON;
+  }
+#elif defined(HAS_COPYROW_X86)
+  if (IS_ALIGNED(halfwidth, 4)) {
+    CopyRow = CopyRow_X86;
+#if defined(HAS_COPYROW_SSE2)
+    if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(halfwidth, 32) &&
+        IS_ALIGNED(src_u, 16) && IS_ALIGNED(src_stride_u, 16) &&
+        IS_ALIGNED(src_v, 16) && IS_ALIGNED(src_stride_v, 16) &&
+        IS_ALIGNED(dst_u, 16) && IS_ALIGNED(dst_stride_u, 16) &&
+        IS_ALIGNED(dst_v, 16) && IS_ALIGNED(dst_stride_v, 16)) {
+      CopyRow = CopyRow_SSE2;
+    }
+#endif
+  }
+#endif
  // Copy Y plane
  if (dst_y) {
    CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
  }
-  int halfwidth = (width + 1) >> 1;
  // UpSample U plane.
  int y;
  for (y = 0; y < height - 1; y += 2) {
-    memcpy(dst_u, src_u, halfwidth);
+    CopyRow(src_u, dst_u, halfwidth);
-    memcpy(dst_u + dst_stride_u, src_u, halfwidth);
+    CopyRow(src_u, dst_u + dst_stride_u, halfwidth);
    src_u += src_stride_u;
    dst_u += dst_stride_u * 2;
  }
  if (height & 1) {
-    memcpy(dst_u, src_u, halfwidth);
+    CopyRow(src_u, dst_u, halfwidth);
  }
  // UpSample V plane.
  for (y = 0; y < height - 1; y += 2) {
-    memcpy(dst_v, src_v, halfwidth);
+    CopyRow(src_v, dst_v, halfwidth);
-    memcpy(dst_v + dst_stride_v, src_v, halfwidth);
+    CopyRow(src_v, dst_v + dst_stride_v, halfwidth);
    src_v += src_stride_v;
    dst_v += dst_stride_v * 2;
  }
  if (height & 1) {
-    memcpy(dst_v, src_v, halfwidth);
+    CopyRow(src_v, dst_v, halfwidth);
  }
  return 0;
 }

--- a/source/convertfrom.cc
+++ b/source/convertfrom.cc
+// TODO(fbarchard): Remove once builds have switched to convert_from
+#include "convert_from.cc"
--- a/source/planar_functions.cc
+++ b/source/planar_functions.cc
@@ -10,8 +10,6 @@
 #include "libyuv/planar_functions.h"
-#include <string.h>  // For memcpy()
 #include "libyuv/cpu_id.h"
 #include "row.h"
@@ -20,110 +18,28 @@ namespace libyuv {
 extern "C" {
 #endif
-// CopyRows copys 'count' bytes using a 16 byte load/store, 64 bytes at time
-#if defined(_M_IX86) && !defined(YUV_DISABLE_ASM)
-#define HAS_COPYROW_SSE2
-__declspec(naked)
-void CopyRow_SSE2(const uint8* src, uint8* dst, int count) {
-  __asm {
-    mov        eax, [esp + 4]   // src
-    mov        edx, [esp + 8]   // dst
-    mov        ecx, [esp + 12]  // count
-    sub        edx, eax
-  convertloop:
-    movdqa     xmm0, [eax]
-    movdqa     xmm1, [eax + 16]
-    movdqa     [eax + edx], xmm0
-    movdqa     [eax + edx + 16], xmm1
-    lea        eax, [eax + 32]
-    sub        ecx, 32
-    ja         convertloop
-    ret
-  }
-}
-#define HAS_COPYROW_X86
-__declspec(naked)
-void CopyRow_X86(const uint8* src, uint8* dst, int count) {
-  __asm {
-    mov        eax, esi
-    mov        edx, edi
-    mov        esi, [esp + 4]   // src
-    mov        edi, [esp + 8]   // dst
-    mov        ecx, [esp + 12]  // count
-    shr        ecx, 2
-    rep movsd
-    mov        edi, edx
-    mov        esi, eax
-    ret
-  }
-}
-#elif (defined(__x86_64__) || defined(__i386__)) && !defined(YUV_DISABLE_ASM)
-#define HAS_COPYROW_SSE2
-void CopyRow_SSE2(const uint8* src, uint8* dst, int count) {
-  asm volatile (
-  "sub        %0,%1                            \n"
-  "1:                                          \n"
-    "movdqa    (%0),%%xmm0                     \n"
-    "movdqa    0x10(%0),%%xmm1                 \n"
-    "movdqa    %%xmm0,(%0,%1)                  \n"
-    "movdqa    %%xmm1,0x10(%0,%1)              \n"
-    "lea       0x20(%0),%0                     \n"
-    "sub       $0x20,%2                        \n"
-    "ja        1b                              \n"
-  : "+r"(src),   // %0
-    "+r"(dst),   // %1
-    "+r"(count)  // %2
-  :
-  : "memory", "cc"
-#if defined(__SSE2__)
-    , "xmm0", "xmm1"
-#endif
-  );
-}
-#define HAS_COPYROW_X86
-void CopyRow_X86(const uint8* src, uint8* dst, int width) {
-  size_t width_tmp = static_cast<size_t>(width);
-  asm volatile (
-    "shr       $0x2,%2                         \n"
-    "rep movsl                                 \n"
-  : "+S"(src),  // %0
-    "+D"(dst),  // %1
-    "+c"(width_tmp) // %2
-  :
-  : "memory", "cc"
-  );
-}
-#endif
-void CopyRow_C(const uint8* src, uint8* dst, int count) {
-  memcpy(dst, src, count);
-}
 // Copy a plane of data
 void CopyPlane(const uint8* src_y, int src_stride_y,
               uint8* dst_y, int dst_stride_y,
               int width, int height) {
-  void (*CopyRow)(const uint8* src, uint8* dst, int width);
+  void (*CopyRow)(const uint8* src, uint8* dst, int width) = CopyRow_C;
-#if defined(HAS_COPYROW_SSE2)
+#if defined(HAS_COPYROW_NEON)
-  if (TestCpuFlag(kCpuHasSSE2) &&
+  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 64)) {
-      IS_ALIGNED(width, 32) &&
+    CopyRow = CopyRow_NEON;
-      IS_ALIGNED(src_y, 16) && IS_ALIGNED(src_stride_y, 16) &&
+  }
-      IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
+#elif defined(HAS_COPYROW_X86)
-    CopyRow = CopyRow_SSE2;
+  if (IS_ALIGNED(width, 4)) {
-  } else
-#endif
-#if defined(HAS_COPYROW_X86)
-  if (IS_ALIGNED(width, 4) &&
-      IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) &&
-      IS_ALIGNED(dst_y, 4) && IS_ALIGNED(dst_stride_y, 4)) {
    CopyRow = CopyRow_X86;
-  } else
+#if defined(HAS_COPYROW_SSE2)
+    if (TestCpuFlag(kCpuHasSSE2) &&
+        IS_ALIGNED(width, 32) &&
+        IS_ALIGNED(src_y, 16) && IS_ALIGNED(src_stride_y, 16) &&
+        IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
+      CopyRow = CopyRow_SSE2;
+    }
 #endif
-  {
-    CopyRow = CopyRow_C;
  }
+#endif
  // Copy plane
  for (int y = 0; y < height; ++y) {

--- a/source/row.h
+++ b/source/row.h
@@ -47,6 +47,8 @@ extern "C" {
 #define HAS_MIRRORROW_SSSE3
 #define HAS_MIRRORROW_SSE2
 #define HAS_SPLITUV_SSE2
+#define HAS_COPYROW_SSE2
+#define HAS_COPYROW_X86
 #define HAS_YUY2TOYROW_SSE2
 #define HAS_UYVYTOYROW_SSE2
 #define HAS_YUY2TOUVROW_SSE2
@@ -69,6 +71,7 @@ extern "C" {
 #if defined(__ARM_NEON__) && !defined(YUV_DISABLE_ASM)
 #define HAS_MIRRORROW_NEON
 #define HAS_SPLITUV_NEON
+#define HAS_COPYROW_NEON
 #define HAS_I420TOARGBROW_NEON
 #define HAS_I420TOBGRAROW_NEON
 #define HAS_I420TOABGRROW_NEON
@@ -131,6 +134,11 @@ void SplitUV_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix);
 void SplitUV_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix);
 void SplitUV_C(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix);
+void CopyRow_SSE2(const uint8* src, uint8* dst, int count);
+void CopyRow_X86(const uint8* src, uint8* dst, int count);
+void CopyRow_NEON(const uint8* src, uint8* dst, int count);
+void CopyRow_C(const uint8* src, uint8* dst, int count);
 void ARGBToYRow_C(const uint8* src_argb, uint8* dst_y, int pix);
 void BGRAToYRow_C(const uint8* src_argb, uint8* dst_y, int pix);
 void ABGRToYRow_C(const uint8* src_argb, uint8* dst_y, int pix);

--- a/source/row_common.cc
+++ b/source/row_common.cc
@@ -377,6 +377,10 @@ void SplitUV_C(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {
  }
 }
+void CopyRow_C(const uint8* src, uint8* dst, int count) {  // Portable fallback: copies count bytes with memcpy.
+  memcpy(dst, src, count);
+}
 // Filter 2 rows of YUY2 UV's (422) into U and V (420)
 void YUY2ToUVRow_C(const uint8* src_yuy2, int src_stride_yuy2,
                   uint8* dst_u, uint8* dst_v, int pix) {
@@ -423,36 +427,36 @@ void UYVYToYRow_C(const uint8* src_uyvy,
 }
 // Wrappers to handle odd sizes/alignments
-#define MAKEYUVANY(NAMEANY, NAME)                                              \
+#define MAKEYUVANY(NAMEANY, NAME, COPYROW)                                     \
-void NAMEANY(const uint8* y_buf,                                               \
+    void NAMEANY(const uint8* y_buf,                                           \
-             const uint8* u_buf,                                               \
+                 const uint8* u_buf,                                           \
-             const uint8* v_buf,                                               \
+                 const uint8* v_buf,                                           \
-             uint8* rgb_buf,                                                   \
+                 uint8* rgb_buf,                                               \
-             int width) {                                                      \
+                 int width) {                                                  \
-  SIMD_ALIGNED(uint8 row[kMaxStride]);                                         \
+      SIMD_ALIGNED(uint8 row[kMaxStride]);                                     \
-  NAME(y_buf, u_buf, v_buf, row, width);                                       \
+      NAME(y_buf, u_buf, v_buf, row, width);                                   \
-  memcpy(rgb_buf, row, width << 2);                                            \
+      COPYROW(row, rgb_buf, width << 2);                                       \
-}
+    }
 #if defined(HAS_I420TOARGBROW_SSSE3)
-MAKEYUVANY(I420ToARGBRow_Any_SSSE3, I420ToARGBRow_SSSE3)
+MAKEYUVANY(I420ToARGBRow_Any_SSSE3, I420ToARGBRow_SSSE3, CopyRow_X86)
-MAKEYUVANY(I420ToBGRARow_Any_SSSE3, I420ToBGRARow_SSSE3)
+MAKEYUVANY(I420ToBGRARow_Any_SSSE3, I420ToBGRARow_SSSE3, CopyRow_X86)
-MAKEYUVANY(I420ToABGRRow_Any_SSSE3, I420ToABGRRow_SSSE3)
+MAKEYUVANY(I420ToABGRRow_Any_SSSE3, I420ToABGRRow_SSSE3, CopyRow_X86)
 #endif
 #if defined(HAS_I420TOARGBROW_NEON)
-MAKEYUVANY(I420ToARGBRow_Any_NEON, I420ToARGBRow_NEON)
+MAKEYUVANY(I420ToARGBRow_Any_NEON, I420ToARGBRow_NEON, CopyRow_C)
-MAKEYUVANY(I420ToBGRARow_Any_NEON, I420ToBGRARow_NEON)
+MAKEYUVANY(I420ToBGRARow_Any_NEON, I420ToBGRARow_NEON, CopyRow_C)
-MAKEYUVANY(I420ToABGRRow_Any_NEON, I420ToABGRRow_NEON)
+MAKEYUVANY(I420ToABGRRow_Any_NEON, I420ToABGRRow_NEON, CopyRow_C)
 #endif
 #define MAKEYUVANYRGB(NAMEANY, ARGBTORGB, BPP)                                 \
-void NAMEANY(const uint8* argb_buf,                                            \
+    void NAMEANY(const uint8* argb_buf,                                        \
-             uint8* rgb_buf,                                                   \
+                 uint8* rgb_buf,                                               \
-             int width) {                                                      \
+                 int width) {                                                  \
-  SIMD_ALIGNED(uint8 row[kMaxStride]);                                         \
+      SIMD_ALIGNED(uint8 row[kMaxStride]);                                     \
-  ARGBTORGB(argb_buf, row, width);                                             \
+      ARGBTORGB(argb_buf, row, width);                                         \
-  memcpy(rgb_buf, row, width * BPP);                                           \
+      memcpy(rgb_buf, row, width * BPP);                                       \
-}
+    }
 #if defined(HAS_ARGBTORGB24ROW_SSSE3)
 MAKEYUVANYRGB(ARGBToRGB24Row_Any_SSSE3, ARGBToRGB24Row_SSSE3, 3)

--- a/source/row_neon.cc
+++ b/source/row_neon.cc
@@ -184,7 +184,27 @@ void SplitUV_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {
 }
 #endif
-#endif // __ARM_NEON__
+#if defined(HAS_COPYROW_NEON)
+// TODO(fbarchard): Test with and without pld
+//  "pld        [%0, #0xC0]                    \n"  // preload
+// Copies count bytes from src to dst, 64 per loop; count is expected to be a multiple of 64.
+void CopyRow_NEON(const uint8* src, uint8* dst, int count) {
+  asm volatile (
+  "1:                                          \n"
+    "vld1.u8    {q0,q1,q2,q3}, [%0]!           \n"  // load 64
+    "subs       %2, %2, #64                    \n"  // 64 processed per loop
+    "vst1.u8    {q0,q1,q2,q3}, [%1]!           \n"  // store 64
+    "bhi        1b                             \n"  // loop while bytes remain
+    : "+r"(src),            // %0
+      "+r"(dst),            // %1
+      "+r"(count)           // Output registers
+    :                       // Input registers
+    : "memory", "cc", "q0", "q1", "q2", "q3" // Clobber List
+  );
+}
+#endif  // HAS_COPYROW_NEON
+#endif  // __ARM_NEON__
 #ifdef __cplusplus
 }  // extern "C"

--- a/source/row_posix.cc
+++ b/source/row_posix.cc
@@ -848,7 +848,6 @@ void SplitUV_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {
    "pcmpeqb    %%xmm5,%%xmm5                    \n"
    "psrlw      $0x8,%%xmm5                      \n"
    "sub        %1,%2                            \n"
  "1:                                            \n"
    "movdqa     (%0),%%xmm0                      \n"
    "movdqa     0x10(%0),%%xmm1                  \n"
@@ -879,6 +878,45 @@ void SplitUV_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {
 }
 #endif
+#ifdef HAS_COPYROW_SSE2
+void CopyRow_SSE2(const uint8* src, uint8* dst, int count) {  // Copies count bytes, 32 per loop, using movdqa loads/stores.
+  asm volatile (
+  "sub        %0,%1                            \n"  // %1 = dst - src, so (%0,%1) addresses dst
+  "1:                                          \n"
+    "movdqa    (%0),%%xmm0                     \n"  // load 32 bytes from src
+    "movdqa    0x10(%0),%%xmm1                 \n"
+    "movdqa    %%xmm0,(%0,%1)                  \n"  // store 32 bytes to dst
+    "movdqa    %%xmm1,0x10(%0,%1)              \n"
+    "lea       0x20(%0),%0                     \n"  // src += 32
+    "sub       $0x20,%2                        \n"  // count -= 32
+    "ja        1b                              \n"  // loop while bytes remain
+  : "+r"(src),   // %0
+    "+r"(dst),   // %1
+    "+r"(count)  // %2
+  :
+  : "memory", "cc"
+#if defined(__SSE2__)
+    , "xmm0", "xmm1"
+#endif
+  );
+}
+#endif  // HAS_COPYROW_SSE2
+#ifdef HAS_COPYROW_X86
+void CopyRow_X86(const uint8* src, uint8* dst, int width) {  // Copies width / 4 dwords; callers select it only when width is a multiple of 4.
+  size_t width_tmp = static_cast<size_t>(width);
+  asm volatile (
+    "shr       $0x2,%2                         \n"  // bytes -> 4-byte dwords
+    "rep movsl                                 \n"  // copy that many dwords from src to dst
+  : "+S"(src),  // %0
+    "+D"(dst),  // %1
+    "+c"(width_tmp) // %2
+  :
+  : "memory", "cc"
+  );
+}
+#endif
 #ifdef HAS_YUY2TOYROW_SSE2
 void YUY2ToYRow_SSE2(const uint8* src_yuy2, uint8* dst_y, int pix) {
  asm volatile (

--- a/source/row_win.cc
+++ b/source/row_win.cc
@@ -1569,6 +1569,46 @@ void SplitUV_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {
 }
 #endif
+#ifdef HAS_COPYROW_SSE2
+// CopyRow copies 'count' bytes using a 16 byte load/store, 32 bytes at a time
+__declspec(naked)
+void CopyRow_SSE2(const uint8* src, uint8* dst, int count) {
+  __asm {
+    mov        eax, [esp + 4]   // src
+    mov        edx, [esp + 8]   // dst
+    mov        ecx, [esp + 12]  // count
+    sub        edx, eax         // edx = dst - src, so [eax + edx] addresses dst
+  convertloop:
+    movdqa     xmm0, [eax]
+    movdqa     xmm1, [eax + 16]
+    movdqa     [eax + edx], xmm0
+    movdqa     [eax + edx + 16], xmm1
+    lea        eax, [eax + 32]  // src += 32
+    sub        ecx, 32          // count -= 32
+    ja         convertloop
+    ret
+  }
+}
+#endif  // HAS_COPYROW_SSE2
+#ifdef HAS_COPYROW_X86
+__declspec(naked)
+void CopyRow_X86(const uint8* src, uint8* dst, int count) {  // Copies count / 4 dwords with rep movsd.
+  __asm {
+    mov        eax, esi         // save esi
+    mov        edx, edi         // save edi
+    mov        esi, [esp + 4]   // src
+    mov        edi, [esp + 8]   // dst
+    mov        ecx, [esp + 12]  // count
+    shr        ecx, 2           // bytes -> dwords
+    rep movsd
+    mov        edi, edx         // restore edi
+    mov        esi, eax         // restore esi
+    ret
+  }
+}
+#endif
 #ifdef HAS_YUY2TOYROW_SSE2
 __declspec(naked)
 void YUY2ToYRow_SSE2(const uint8* src_yuy2,