Remove MMX functions

BUG=none
TEST=builds
Review URL: http://webrtc-codereview.appspot.com/269010
git-svn-id: http://libyuv.googlecode.com/svn/trunk@77 16f28f9a-4ce2-e073-06de-1de4eb20be90

Remove MMX functions
BUG=none
TEST=builds
Review URL: http://webrtc-codereview.appspot.com/269010
git-svn-id: http://libyuv.googlecode.com/svn/trunk@77 16f28f9a-4ce2-e073-06de-1de4eb20be90
eaedc1d7 · fbarchard@google.com · c82af4a5 · eaedc1d7 · eaedc1d7 · eaedc1d7
Commit eaedc1d7 authored Nov 11, 2011 by fbarchard@google.com
7 changed files
--- a/README.chromium
+++ b/README.chromium
 Name: libyuv
 URL: http://code.google.com/p/libyuv/
-Version: 69
+Version: 77
 License: BSD
 License File: LICENSE

--- a/source/planar_functions.cc
+++ b/source/planar_functions.cc
@@ -1149,11 +1149,6 @@ int I420ToARGB(const uint8* src_y, int src_stride_y,
      (width % 2 == 0)) {
    FastConvertYUVToARGBRow = FastConvertYUVToARGBRow_SSE2;
  } else
-#endif
-#if defined(HAS_FASTCONVERTYUVTOARGBROW_MMX)
-  if (width % 2 == 0) {
-    FastConvertYUVToARGBRow = FastConvertYUVToARGBRow_MMX;
-  } else
 #endif
  {
    FastConvertYUVToARGBRow = FastConvertYUVToARGBRow_C;
@@ -1167,8 +1162,6 @@ int I420ToARGB(const uint8* src_y, int src_stride_y,
      src_v += src_stride_v;
    }
  }
-  // MMX used for FastConvertYUVToARGBRow requires an emms instruction.
-  EMMS();
  return 0;
 }
@@ -1201,11 +1194,6 @@ int I420ToBGRA(const uint8* src_y, int src_stride_y,
      (width % 2 == 0)) {
    FastConvertYUVToBGRARow = FastConvertYUVToBGRARow_SSE2;
  } else
-#endif
-#if defined(HAS_FASTCONVERTYUVTOBGRAROW_MMX)
-  if (width % 2 == 0) {
-    FastConvertYUVToBGRARow = FastConvertYUVToBGRARow_MMX;
-  } else
 #endif
  {
    FastConvertYUVToBGRARow = FastConvertYUVToBGRARow_C;
@@ -1219,7 +1207,6 @@ int I420ToBGRA(const uint8* src_y, int src_stride_y,
      src_v += src_stride_v;
    }
  }
-  EMMS();
  return 0;
 }
@@ -1252,11 +1239,6 @@ int I420ToABGR(const uint8* src_y, int src_stride_y,
      (width % 2 == 0)) {
    FastConvertYUVToABGRRow = FastConvertYUVToABGRRow_SSE2;
  } else
-#endif
-#if defined(HAS_FASTCONVERTYUVTOABGRROW_MMX)
-  if (width % 2 == 0) {
-    FastConvertYUVToABGRRow = FastConvertYUVToABGRRow_MMX;
-  } else
 #endif
  {
    FastConvertYUVToABGRRow = FastConvertYUVToABGRRow_C;
@@ -1270,7 +1252,6 @@ int I420ToABGR(const uint8* src_y, int src_stride_y,
      src_v += src_stride_v;
    }
  }
-  EMMS();
  return 0;
 }
@@ -1303,11 +1284,6 @@ int I422ToARGB(const uint8* src_y, int src_stride_y,
      (width % 2 == 0)) {
    FastConvertYUVToARGBRow = FastConvertYUVToARGBRow_SSE2;
  } else
-#endif
-#if defined(HAS_FASTCONVERTYUVTOARGBROW_MMX)
-  if (width % 2 == 0) {
-    FastConvertYUVToARGBRow = FastConvertYUVToARGBRow_MMX;
-  } else
 #endif
  {
    FastConvertYUVToARGBRow = FastConvertYUVToARGBRow_C;
@@ -1319,8 +1295,6 @@ int I422ToARGB(const uint8* src_y, int src_stride_y,
    src_u += src_stride_u;
    src_v += src_stride_v;
  }
-  // MMX used for FastConvertYUVToARGBRow requires an emms instruction.
-  EMMS();
  return 0;
 }
@@ -1353,13 +1327,9 @@ int I444ToARGB(const uint8* src_y, int src_stride_y,
    FastConvertYUV444ToARGBRow = FastConvertYUV444ToARGBRow_SSE2;
  } else
 #endif
-#if defined(HAS_FASTCONVERTYUVTOARGBROW_MMX)
-    FastConvertYUV444ToARGBRow = FastConvertYUV444ToARGBRow_MMX;
-#else
  {
    FastConvertYUV444ToARGBRow = FastConvertYUV444ToARGBRow_C;
  }
-#endif
  for (int y = 0; y < height; ++y) {
    FastConvertYUV444ToARGBRow(src_y, src_u, src_v, dst_argb, width);
    dst_argb += dst_stride_argb;
@@ -1367,8 +1337,6 @@ int I444ToARGB(const uint8* src_y, int src_stride_y,
    src_u += src_stride_u;
    src_v += src_stride_v;
  }
-  // MMX used for FastConvertYUVToARGBRow requires an emms instruction.
-  EMMS();
  return 0;
 }
@@ -1391,11 +1359,6 @@ int I400ToARGB_Reference(const uint8* src_y, int src_stride_y,
      IS_ALIGNED(dst_argb, 16) && (dst_stride_argb % 16 == 0)) {
    FastConvertYToARGBRow = FastConvertYToARGBRow_SSE2;
  } else
-#endif
-#if defined(HAS_FASTCONVERTYUVTOARGBROW_MMX)
-  if (width % 2 == 0) {
-    FastConvertYToARGBRow = FastConvertYToARGBRow_MMX;
-  } else
 #endif
  {
    FastConvertYToARGBRow = FastConvertYToARGBRow_C;
@@ -1405,8 +1368,6 @@ int I400ToARGB_Reference(const uint8* src_y, int src_stride_y,
    dst_argb += dst_stride_argb;
    src_y += src_stride_y;
  }
-  // MMX used for FastConvertYUVToARGBRow requires an emms instruction.
-  EMMS();
  return 0;
 }

--- a/source/rotate.cc
+++ b/source/rotate.cc
@@ -13,21 +13,19 @@
 #include "libyuv/cpu_id.h"
 #include "libyuv/planar_functions.h"
 #include "rotate_priv.h"
+#include "row.h"
 namespace libyuv {
-#if (defined(WIN32) || defined(__x86_64__) || defined(__i386__)) \
+#if (defined(WIN32) || defined(__x86_64__) || defined(__i386__)) && \
-    && !defined(COVERAGE_ENABLED) && !defined(TARGET_IPHONE_SIMULATOR)
+    !defined(__APPLE__) && \
-#if defined(_MSC_VER)
+    !defined(COVERAGE_ENABLED) && !defined(TARGET_IPHONE_SIMULATOR)
-#define TALIGN16(t, var) static __declspec(align(16)) t _ ## var
-#else
-#define TALIGN16(t, var) t var __attribute__((aligned(16)))
-#endif
 // Shuffle table for reversing the bytes.
-extern "C" TALIGN16(const uint8, kShuffleReverse[16]) =
+static const uvec8 kShuffleReverse =
  { 15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u };
 // Shuffle table for reversing the bytes of UV channels.
-extern "C" TALIGN16(const uint8, kShuffleReverseUV[16]) =
+static const uvec8 kShuffleReverseUV =
  { 14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u };
 #endif
@@ -73,7 +71,7 @@ __asm {
    mov       edx, [esp + 12 + 12]  // dst
    mov       esi, [esp + 12 + 16]  // dst_stride
    mov       ecx, [esp + 12 + 20]  // width
- convertloop :
+ convertloop:
    // Read in the data from the source pointer.
    // First round of bit swap.
    movq      xmm0, qword ptr [eax]
@@ -172,7 +170,7 @@ __asm {
    and       esp, ~15
    mov       [esp + 16], ecx
    mov       ecx, [ecx + 16 + 28]  // w
- convertloop :
+ convertloop:
    // Read in the data from the source pointer.
    // First round of bit swap.
    movdqa    xmm0, [eax]
@@ -863,9 +861,9 @@ __asm {
    mov       eax, [esp + 4]   // src
    mov       edx, [esp + 8]   // dst
    mov       ecx, [esp + 12]  // width
-    movdqa    xmm5, _kShuffleReverse
+    movdqa    xmm5, kShuffleReverse
    lea       eax, [eax + ecx - 16]
- convertloop :
+ convertloop:
    movdqa    xmm0, [eax]
    lea       eax, [eax - 16]
    pshufb    xmm0, xmm5
@@ -878,12 +876,16 @@ __asm {
 }
 #elif (defined(__i386__) || defined(__x86_64__)) && \
+    !defined(__APPLE__) && \
    !defined(COVERAGE_ENABLED) && !defined(TARGET_IPHONE_SIMULATOR)
 #define HAS_REVERSE_LINE_SSSE3
 static void ReverseLine_SSSE3(const uint8* src, uint8* dst, int width) {
  intptr_t temp_width = static_cast<intptr_t>(width);
  asm volatile (
-  "movdqa     (%3),%%xmm5                      \n"
+  "movdqa     %0,%%xmm5                        \n"
+  :: "m"(kShuffleReverse)
+  );
+  asm volatile (
  "lea        -0x10(%0,%2,1),%0                \n"
 "1:                                            \n"
  "movdqa     (%0),%%xmm0                      \n"
@@ -896,12 +898,12 @@ static void ReverseLine_SSSE3(const uint8* src, uint8* dst, int width) {
  : "+r"(src),  // %0
    "+r"(dst),  // %1
    "+r"(temp_width)  // %2
-  : "r"(kShuffleReverse)  // %3
+  :
  : "memory", "cc"
 #if defined(__SSE2__)
    , "xmm0", "xmm5"
 #endif
-);
+  );
 }
 #endif
@@ -1066,10 +1068,10 @@ __asm {
    mov       edx, [esp + 4 + 8]   // dst_a
    mov       edi, [esp + 4 + 12]  // dst_b
    mov       ecx, [esp + 4 + 16]  // width
-    movdqa    xmm5, _kShuffleReverseUV
+    movdqa    xmm5, kShuffleReverseUV
    lea       eax, [eax + ecx * 2 - 16]
- convertloop :
+ convertloop:
    movdqa    xmm0, [eax]
    lea       eax, [eax - 16]
    pshufb    xmm0, xmm5
@@ -1085,6 +1087,7 @@ __asm {
 }
 #elif (defined(__i386__) || defined(__x86_64__)) && \
+    !defined(__APPLE__) && \
    !defined(COVERAGE_ENABLED) && !defined(TARGET_IPHONE_SIMULATOR)
 #define HAS_REVERSE_LINE_UV_SSSE3
 void ReverseLineUV_SSSE3(const uint8* src,
@@ -1092,28 +1095,31 @@ void ReverseLineUV_SSSE3(const uint8* src,
                         int width) {
  intptr_t temp_width = static_cast<intptr_t>(width);
  asm volatile (
-  "movdqa     (%4),%%xmm5                      \n"
+  "movdqa     %0,%%xmm5                        \n"
-  "lea        -0x10(%0,%3,2),%0                \n"
+  :: "m"(kShuffleReverseUV)
+  );
+  asm volatile (
+  "lea        -16(%0,%3,2),%0                  \n"
 "1:                                            \n"
  "movdqa     (%0),%%xmm0                      \n"
-  "lea        -0x10(%0),%0                     \n"
+  "lea        -16(%0),%0                       \n"
  "pshufb     %%xmm5,%%xmm0                    \n"
  "movlpd     %%xmm0,(%1)                      \n"
-  "lea        0x8(%1),%1                       \n"
+  "lea        8(%1),%1                         \n"
  "movhpd     %%xmm0,(%2)                      \n"
-  "lea        0x8(%2),%2                       \n"
+  "lea        8(%2),%2                         \n"
-  "sub        $0x8,%3                          \n"
+  "sub        $8,%3                            \n"
  "ja         1b                               \n"
  : "+r"(src),      // %0
    "+r"(dst_a),    // %1
    "+r"(dst_b),    // %2
    "+r"(temp_width)  // %3
-  : "r"(kShuffleReverseUV)  // %4
+  :
  : "memory", "cc"
 #if defined(__SSE2__)
    , "xmm0", "xmm5"
 #endif
-);
+  );
 }
 #endif

--- a/source/row.h
+++ b/source/row.h
@@ -51,15 +51,6 @@
 #define HAS_FASTCONVERTYTOARGBROW_SSE2
 #endif
-// The following are available on Windows and GCC 32 bit
-#if (defined(WIN32) || \
-    defined(__i386__)) && \
-    !defined(LIBYUV_DISABLE_ASM)
-#define HAS_FASTCONVERTYUVTOARGBROW_MMX
-#define HAS_FASTCONVERTYUVTOBGRAROW_MMX
-#define HAS_FASTCONVERTYUVTOABGRROW_MMX
-#endif
 // The following are available on Windows
 #if defined(WIN32) && \
    !defined(LIBYUV_DISABLE_ASM)
@@ -128,12 +119,14 @@ void I400ToARGBRow_C(const uint8* src_y, uint8* dst_argb, int pix);
 #if defined(_MSC_VER)
 #define SIMD_ALIGNED(var) __declspec(align(16)) var
-#define TALIGN16(t, var) static __declspec(align(16)) t _ ## var
+typedef __declspec(align(16)) signed char vec8[16];
+typedef __declspec(align(16)) unsigned char uvec8[16];
+typedef __declspec(align(16)) signed short vec16[8];
 #else // __GNUC__
 #define SIMD_ALIGNED(var) var __attribute__((aligned(16)))
-#define TALIGN16(t, var) t var __attribute__((aligned(16)))
 typedef signed char __attribute__((vector_size(16))) vec8;
 typedef unsigned char __attribute__((vector_size(16))) uvec8;
+typedef signed short __attribute__((vector_size(16))) vec16;
 #endif
 extern "C" SIMD_ALIGNED(const int16 kCoefficientsRgbY[768][4]);
@@ -204,36 +197,6 @@ void FastConvertYToARGBRow_SSE2(const uint8* y_buf,
                                int width);
 #endif
-#ifdef HAS_FASTCONVERTYUVTOARGBROW_MMX
-void FastConvertYUVToARGBRow_MMX(const uint8* y_buf,
-                                 const uint8* u_buf,
-                                 const uint8* v_buf,
-                                 uint8* rgb_buf,
-                                 int width);
-void FastConvertYUVToBGRARow_MMX(const uint8* y_buf,
-                                 const uint8* u_buf,
-                                 const uint8* v_buf,
-                                 uint8* rgb_buf,
-                                 int width);
-void FastConvertYUVToABGRRow_MMX(const uint8* y_buf,
-                                 const uint8* u_buf,
-                                 const uint8* v_buf,
-                                 uint8* rgb_buf,
-                                 int width);
-void FastConvertYUV444ToARGBRow_MMX(const uint8* y_buf,
-                                    const uint8* u_buf,
-                                    const uint8* v_buf,
-                                    uint8* rgb_buf,
-                                    int width);
-void FastConvertYToARGBRow_MMX(const uint8* y_buf,
-                               uint8* rgb_buf,
-                               int width);
-#endif
 #ifdef HAS_FASTCONVERTYUVTOARGBROW_SSSE3
 void FastConvertYUVToARGBRow_SSSE3(const uint8* y_buf,
                                   const uint8* u_buf,
@@ -268,42 +231,6 @@ void FastConvertYToARGBRow_SSE2(const uint8* y_buf,
 #endif
-// Method to force C version.
-//#define USE_MMX 0
-//#define USE_SSE2 0
-#if !defined(USE_MMX)
-// Windows, Mac and Linux use MMX
-#if defined(__i386__) || defined(_MSC_VER)
-#define USE_MMX 1
-#else
-#define USE_MMX 0
-#endif
-#endif
-#if !defined(USE_SSE2)
-#if defined(__SSE2__) || defined(ARCH_CPU_X86_64) || _M_IX86_FP==2
-#define USE_SSE2 1
-#else
-#define USE_SSE2 0
-#endif
-#endif
-// x64 uses MMX2 (SSE) so emms is not required.
-// Warning C4799: function has no EMMS instruction.
-// EMMS() is slow and should be called by the calling function once per image.
-#if USE_MMX && !defined(ARCH_CPU_X86_64)
-#if defined(_MSC_VER)
-#define EMMS() __asm emms
-#pragma warning(disable: 4799)
-#else
-#define EMMS() asm("emms")
-#endif
-#else
-#define EMMS()
-#endif
 }  // extern "C"
 #endif  // LIBYUV_SOURCE_ROW_H_
--- a/source/row_posix.cc
+++ b/source/row_posix.cc
@@ -542,231 +542,6 @@ void FastConvertYToARGBRow_SSE2(const uint8* y_buf,  // rdi
 #endif
 );
 }
-#endif
-#ifdef HAS_FASTCONVERTYUVTOARGBROW_MMX
-// 32 bit mmx gcc version
-#ifdef OSX
-#define UNDERSCORE "_"
-#else
-#define UNDERSCORE ""
-#endif
-void FastConvertYUVToARGBRow_MMX(const uint8* y_buf,
-                                 const uint8* u_buf,
-                                 const uint8* v_buf,
-                                 uint8* rgb_buf,
-                                 int width);
-  asm(
-  ".text                                       \n"
-#if defined(OSX) || defined(IOS)
-  ".globl _FastConvertYUVToARGBRow_MMX         \n"
-"_FastConvertYUVToARGBRow_MMX:                 \n"
-#else
-  ".global FastConvertYUVToARGBRow_MMX         \n"
-"FastConvertYUVToARGBRow_MMX:                  \n"
-#endif
-  "pusha                                       \n"
-  "mov    0x24(%esp),%edx                      \n"
-  "mov    0x28(%esp),%edi                      \n"
-  "mov    0x2c(%esp),%esi                      \n"
-  "mov    0x30(%esp),%ebp                      \n"
-  "mov    0x34(%esp),%ecx                      \n"
-"1:                                            \n"
-  "movzbl (%edi),%eax                          \n"
-  "lea    1(%edi),%edi                         \n"
-  "movzbl (%esi),%ebx                          \n"
-  "lea    1(%esi),%esi                         \n"
-  "movq   " UNDERSCORE "kCoefficientsRgbY+2048(,%eax,8),%mm0\n"
-  "movzbl (%edx),%eax                          \n"
-  "paddsw " UNDERSCORE "kCoefficientsRgbY+4096(,%ebx,8),%mm0\n"
-  "movzbl 0x1(%edx),%ebx                       \n"
-  "movq   " UNDERSCORE "kCoefficientsRgbY(,%eax,8),%mm1\n"
-  "lea    2(%edx),%edx                         \n"
-  "movq   " UNDERSCORE "kCoefficientsRgbY(,%ebx,8),%mm2\n"
-  "paddsw %mm0,%mm1                            \n"
-  "paddsw %mm0,%mm2                            \n"
-  "psraw  $0x6,%mm1                            \n"
-  "psraw  $0x6,%mm2                            \n"
-  "packuswb %mm2,%mm1                          \n"
-  "movq   %mm1,0x0(%ebp)                       \n"
-  "lea    8(%ebp),%ebp                         \n"
-  "sub    $0x2,%ecx                            \n"
-  "ja     1b                                   \n"
-  "popa                                        \n"
-  "ret                                         \n"
-);
-void FastConvertYUVToBGRARow_MMX(const uint8* y_buf,
-                                 const uint8* u_buf,
-                                 const uint8* v_buf,
-                                 uint8* rgb_buf,
-                                 int width);
-  asm(
-  ".text                                       \n"
-#if defined(OSX) || defined(IOS)
-  ".globl _FastConvertYUVToBGRARow_MMX         \n"
-"_FastConvertYUVToBGRARow_MMX:                 \n"
-#else
-  ".global FastConvertYUVToBGRARow_MMX         \n"
-"FastConvertYUVToBGRARow_MMX:                  \n"
-#endif
-  "pusha                                       \n"
-  "mov    0x24(%esp),%edx                      \n"
-  "mov    0x28(%esp),%edi                      \n"
-  "mov    0x2c(%esp),%esi                      \n"
-  "mov    0x30(%esp),%ebp                      \n"
-  "mov    0x34(%esp),%ecx                      \n"
-"1:                                            \n"
-  "movzbl (%edi),%eax                          \n"
-  "lea    1(%edi),%edi                         \n"
-  "movzbl (%esi),%ebx                          \n"
-  "lea    1(%esi),%esi                         \n"
-  "movq   " UNDERSCORE "kCoefficientsBgraY+2048(,%eax,8),%mm0\n"
-  "movzbl (%edx),%eax                          \n"
-  "paddsw " UNDERSCORE "kCoefficientsBgraY+4096(,%ebx,8),%mm0\n"
-  "movzbl 0x1(%edx),%ebx                       \n"
-  "movq   " UNDERSCORE "kCoefficientsBgraY(,%eax,8),%mm1\n"
-  "lea    2(%edx),%edx                         \n"
-  "movq   " UNDERSCORE "kCoefficientsBgraY(,%ebx,8),%mm2\n"
-  "paddsw %mm0,%mm1                            \n"
-  "paddsw %mm0,%mm2                            \n"
-  "psraw  $0x6,%mm1                            \n"
-  "psraw  $0x6,%mm2                            \n"
-  "packuswb %mm2,%mm1                          \n"
-  "movq   %mm1,0x0(%ebp)                       \n"
-  "lea    8(%ebp),%ebp                         \n"
-  "sub    $0x2,%ecx                            \n"
-  "ja     1b                                   \n"
-  "popa                                        \n"
-  "ret                                         \n"
-);
-void FastConvertYUVToABGRRow_MMX(const uint8* y_buf,
-                                 const uint8* u_buf,
-                                 const uint8* v_buf,
-                                 uint8* rgb_buf,
-                                 int width);
-  asm(
-  ".text                                       \n"
-#if defined(OSX) || defined(IOS)
-  ".globl _FastConvertYUVToABGRRow_MMX         \n"
-"_FastConvertYUVToABGRRow_MMX:                 \n"
-#else
-  ".global FastConvertYUVToABGRRow_MMX         \n"
-"FastConvertYUVToABGRRow_MMX:                  \n"
-#endif
-  "pusha                                       \n"
-  "mov    0x24(%esp),%edx                      \n"
-  "mov    0x28(%esp),%edi                      \n"
-  "mov    0x2c(%esp),%esi                      \n"
-  "mov    0x30(%esp),%ebp                      \n"
-  "mov    0x34(%esp),%ecx                      \n"
-"1:                                            \n"
-  "movzbl (%edi),%eax                          \n"
-  "lea    1(%edi),%edi                         \n"
-  "movzbl (%esi),%ebx                          \n"
-  "lea    1(%esi),%esi                         \n"
-  "movq   " UNDERSCORE "kCoefficientsAbgrY+2048(,%eax,8),%mm0\n"
-  "movzbl (%edx),%eax                          \n"
-  "paddsw " UNDERSCORE "kCoefficientsAbgrY+4096(,%ebx,8),%mm0\n"
-  "movzbl 0x1(%edx),%ebx                       \n"
-  "movq   " UNDERSCORE "kCoefficientsAbgrY(,%eax,8),%mm1\n"
-  "lea    2(%edx),%edx                         \n"
-  "movq   " UNDERSCORE "kCoefficientsAbgrY(,%ebx,8),%mm2\n"
-  "paddsw %mm0,%mm1                            \n"
-  "paddsw %mm0,%mm2                            \n"
-  "psraw  $0x6,%mm1                            \n"
-  "psraw  $0x6,%mm2                            \n"
-  "packuswb %mm2,%mm1                          \n"
-  "movq   %mm1,0x0(%ebp)                       \n"
-  "lea    8(%ebp),%ebp                         \n"
-  "sub    $0x2,%ecx                            \n"
-  "ja     1b                                   \n"
-  "popa                                        \n"
-  "ret                                         \n"
-);
-void FastConvertYUV444ToARGBRow_MMX(const uint8* y_buf,
-                                    const uint8* u_buf,
-                                    const uint8* v_buf,
-                                    uint8* rgb_buf,
-                                    int width);
-  asm(
-  ".text                                       \n"
-#if defined(OSX) || defined(IOS)
-  ".globl _FastConvertYUV444ToARGBRow_MMX      \n"
-"_FastConvertYUV444ToARGBRow_MMX:              \n"
-#else
-  ".global FastConvertYUV444ToARGBRow_MMX      \n"
-"FastConvertYUV444ToARGBRow_MMX:               \n"
-#endif
-  "pusha                                       \n"
-  "mov    0x24(%esp),%edx                      \n"
-  "mov    0x28(%esp),%edi                      \n"
-  "mov    0x2c(%esp),%esi                      \n"
-  "mov    0x30(%esp),%ebp                      \n"
-  "mov    0x34(%esp),%ecx                      \n"
-"1:                                            \n"
-  "movzbl (%edi),%eax                          \n"
-  "lea    1(%edi),%edi                         \n"
-  "movzbl (%esi),%ebx                          \n"
-  "lea    1(%esi),%esi                         \n"
-  "movq   " UNDERSCORE "kCoefficientsRgbY+2048(,%eax,8),%mm0\n"
-  "movzbl (%edx),%eax                          \n"
-  "paddsw " UNDERSCORE "kCoefficientsRgbY+4096(,%ebx,8),%mm0\n"
-  "lea    1(%edx),%edx                         \n"
-  "paddsw " UNDERSCORE "kCoefficientsRgbY(,%eax,8),%mm0\n"
-  "psraw  $0x6,%mm0                            \n"
-  "packuswb %mm0,%mm0                          \n"
-  "movd   %mm0,0x0(%ebp)                       \n"
-  "lea    4(%ebp),%ebp                         \n"
-  "sub    $0x1,%ecx                            \n"
-  "ja     1b                                   \n"
-  "popa                                        \n"
-  "ret                                         \n"
-);
-void FastConvertYToARGBRow_MMX(const uint8* y_buf,
-                               uint8* rgb_buf,
-                               int width);
-  asm(
-  ".text                                       \n"
-#if defined(OSX) || defined(IOS)
-  ".globl _FastConvertYToARGBRow_MMX           \n"
-"_FastConvertYToARGBRow_MMX:                   \n"
-#else
-  ".global FastConvertYToARGBRow_MMX           \n"
-"FastConvertYToARGBRow_MMX:                    \n"
-#endif
-  "push   %ebx                                 \n"
-  "mov    0x8(%esp),%eax                       \n"
-  "mov    0xc(%esp),%edx                       \n"
-  "mov    0x10(%esp),%ecx                      \n"
-"1:                                            \n"
-  "movzbl (%eax),%ebx                          \n"
-  "movq   " UNDERSCORE "kCoefficientsRgbY(,%ebx,8),%mm0\n"
-  "psraw  $0x6,%mm0                            \n"
-  "movzbl 0x1(%eax),%ebx                       \n"
-  "movq   " UNDERSCORE "kCoefficientsRgbY(,%ebx,8),%mm1\n"
-  "psraw  $0x6,%mm1                            \n"
-  "packuswb %mm1,%mm0                          \n"
-  "lea    0x2(%eax),%eax                       \n"
-  "movq   %mm0,(%edx)                          \n"
-  "lea    0x8(%edx),%edx                       \n"
-  "sub    $0x2,%ecx                            \n"
-  "ja     1b                                   \n"
-  "pop    %ebx                                 \n"
-  "ret                                         \n"
-);
 #endif
 #ifdef HAS_ARGBTOYROW_SSSE3

--- a/source/row_win.cc
+++ b/source/row_win.cc
--- a/source/scale.cc
+++ b/source/scale.cc
@@ -14,6 +14,7 @@
 #include <string.h>
 #include "libyuv/cpu_id.h"
+#include "row.h"
 #if defined(_MSC_VER)
 #define ALIGN16(var) __declspec(align(16)) var
@@ -21,6 +22,7 @@
 #define ALIGN16(var) var __attribute__((aligned(16)))
 #endif
 // Note: A Neon reference manual
 // http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.dui0204j/CJAJIIGG.html
 // Note: Some SSE2 reference manuals