Remove code alignment declspec from Visual C versions for vs2014 compatibility.

BUG=422 TESTED=local vs2013 build still passes. Review URL: https://webrtc-codereview.appspot.com/45959004 git-svn-id: http://libyuv.googlecode.com/svn/trunk@1365 16f28f9a-4ce2-e073-06de-1de4eb20be90

Remove code alignment declspec from Visual C versions for vs2014 compatibility.
BUG=422 TESTED=local vs2013 build still passes. Review URL: https://webrtc-codereview.appspot.com/45959004 git-svn-id: http://libyuv.googlecode.com/svn/trunk@1365 16f28f9a-4ce2-e073-06de-1de4eb20be90
c7161d1c · fbarchard@google.com · 1eb51bcf · c7161d1c · c7161d1c · c7161d1c
Commit c7161d1c authored Apr 12, 2015 by fbarchard@google.com
7 changed files
--- a/README.chromium
+++ b/README.chromium
 Name: libyuv
 URL: http://code.google.com/p/libyuv/
-Version: 1364
+Version: 1365
 License: BSD
 License File: LICENSE

--- a/include/libyuv/version.h
+++ b/include/libyuv/version.h
@@ -11,6 +11,6 @@
 #ifndef INCLUDE_LIBYUV_VERSION_H_  // NOLINT
 #define INCLUDE_LIBYUV_VERSION_H_
-#define LIBYUV_VERSION 1364
+#define LIBYUV_VERSION 1365
 #endif  // INCLUDE_LIBYUV_VERSION_H_  NOLINT
--- a/source/compare_win.cc
+++ b/source/compare_win.cc
@@ -19,7 +19,7 @@ extern "C" {
 #if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && \
    defined(_MSC_VER) && !defined(__clang__)
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count) {
  __asm {
    mov        eax, [esp + 4]    // src_a
@@ -60,7 +60,7 @@ uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count) {
 #if _MSC_VER >= 1700
 // C4752: found Intel(R) Advanced Vector Extensions; consider using /arch:AVX.
 #pragma warning(disable: 4752)
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 uint32 SumSquareError_AVX2(const uint8* src_a, const uint8* src_b, int count) {
  __asm {
    mov        eax, [esp + 4]    // src_a
@@ -134,7 +134,7 @@ static uvec32 kHashMul3 = {
 #define pmulld(reg) _asm _emit 0x66 _asm _emit 0x0F _asm _emit 0x38 \
    _asm _emit 0x40 _asm _emit reg
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed) {
  __asm {
    mov        eax, [esp + 4]    // src
@@ -185,7 +185,7 @@ uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed) {
 // Visual C 2012 required for AVX2.
 #if _MSC_VER >= 1700
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 uint32 HashDjb2_AVX2(const uint8* src, int count, uint32 seed) {
  __asm {
    mov        eax, [esp + 4]    // src

--- a/source/mjpeg_validate.cc
+++ b/source/mjpeg_validate.cc
@@ -23,7 +23,7 @@ extern "C" {
 #ifdef ENABLE_SCASB
 // Multiple of 1.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 const uint8* ScanRow_ERMS(const uint8* src, uint32 val, int count) {
  __asm {
    mov        edx, edi

--- a/source/rotate.cc
+++ b/source/rotate.cc
@@ -73,7 +73,7 @@ void TransposeUVWx8_MIPS_DSPR2(const uint8* src, int src_stride,
 #if !defined(LIBYUV_DISABLE_X86) && \
    defined(_M_IX86) && defined(_MSC_VER) && !defined(__clang__)
 #define HAS_TRANSPOSE_WX8_SSSE3
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 static void TransposeWx8_SSSE3(const uint8* src, int src_stride,
                               uint8* dst, int dst_stride, int width) {
  __asm {
@@ -165,7 +165,7 @@ static void TransposeWx8_SSSE3(const uint8* src, int src_stride,
 }
 #define HAS_TRANSPOSE_UVWX8_SSE2
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 static void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
                                uint8* dst_a, int dst_stride_a,
                                uint8* dst_b, int dst_stride_b,

--- a/source/row_win.cc
+++ b/source/row_win.cc
@@ -147,8 +147,6 @@ static YuvConstants SIMD_ALIGNED(kYuvJConstants) = {
 // 64 bit
 #if defined(_M_X64)
-__declspec(align(16))
 void I422ToARGBRow_SSSE3(const uint8* y_buf,
                         const uint8* u_buf,
                         const uint8* v_buf,
@@ -198,10 +196,8 @@ void I422ToARGBRow_SSSE3(const uint8* y_buf,
    width -= 8;
  }
 }
 // 32 bit
 #else  // defined(_M_X64)
 #ifdef HAS_ARGBTOYROW_SSSE3
 // Constants for ARGB.
@@ -324,7 +320,7 @@ static const uvec8 kShuffleMaskARGBToRAW_0 = {
 };
 // Duplicates gray value 3 times and fills in alpha opaque.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void J400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
  __asm {
    mov        eax, [esp + 4]        // src_y
@@ -353,7 +349,7 @@ void J400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
 #ifdef HAS_J400TOARGBROW_AVX2
 // Duplicates gray value 3 times and fills in alpha opaque.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void J400ToARGBRow_AVX2(const uint8* src_y, uint8* dst_argb, int pix) {
  __asm {
    mov         eax, [esp + 4]        // src_y
@@ -383,7 +379,7 @@ void J400ToARGBRow_AVX2(const uint8* src_y, uint8* dst_argb, int pix) {
 }
 #endif  // HAS_J400TOARGBROW_AVX2
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix) {
  __asm {
    mov       eax, [esp + 4]   // src_rgb24
@@ -421,7 +417,7 @@ void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix) {
  }
 }
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb,
                        int pix) {
  __asm {
@@ -467,7 +463,7 @@ void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb,
 // v * (256 + 8)
 // G shift of 5 is incorporated, so shift is 5 + 8 and 5 + 3
 // 20 instructions.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void RGB565ToARGBRow_SSE2(const uint8* src_rgb565, uint8* dst_argb,
                          int pix) {
  __asm {
@@ -523,7 +519,7 @@ void RGB565ToARGBRow_SSE2(const uint8* src_rgb565, uint8* dst_argb,
 // v * 256 + v * 8
 // v * (256 + 8)
 // G shift of 5 is incorporated, so shift is 5 + 8 and 5 + 3
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void RGB565ToARGBRow_AVX2(const uint8* src_rgb565, uint8* dst_argb,
                          int pix) {
  __asm {
@@ -574,7 +570,7 @@ void RGB565ToARGBRow_AVX2(const uint8* src_rgb565, uint8* dst_argb,
 #endif  // HAS_RGB565TOARGBROW_AVX2
 #ifdef HAS_ARGB1555TOARGBROW_AVX2
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ARGB1555ToARGBRow_AVX2(const uint8* src_argb1555, uint8* dst_argb,
                            int pix) {
  __asm {
@@ -624,7 +620,7 @@ void ARGB1555ToARGBRow_AVX2(const uint8* src_argb1555, uint8* dst_argb,
 #endif  // HAS_ARGB1555TOARGBROW_AVX2
 #ifdef HAS_ARGB4444TOARGBROW_AVX2
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ARGB4444ToARGBRow_AVX2(const uint8* src_argb4444, uint8* dst_argb,
                            int pix) {
  __asm {
@@ -660,7 +656,7 @@ void ARGB4444ToARGBRow_AVX2(const uint8* src_argb4444, uint8* dst_argb,
 #endif  // HAS_ARGB4444TOARGBROW_AVX2
 // 24 instructions
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ARGB1555ToARGBRow_SSE2(const uint8* src_argb1555, uint8* dst_argb,
                            int pix) {
  __asm {
@@ -713,7 +709,7 @@ void ARGB1555ToARGBRow_SSE2(const uint8* src_argb1555, uint8* dst_argb,
 }
 // 18 instructions.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ARGB4444ToARGBRow_SSE2(const uint8* src_argb4444, uint8* dst_argb,
                            int pix) {
  __asm {
@@ -751,7 +747,7 @@ void ARGB4444ToARGBRow_SSE2(const uint8* src_argb4444, uint8* dst_argb,
  }
 }
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ARGBToRGB24Row_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix) {
  __asm {
    mov       eax, [esp + 4]   // src_argb
@@ -789,7 +785,7 @@ void ARGBToRGB24Row_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix) {
  }
 }
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ARGBToRAWRow_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix) {
  __asm {
    mov       eax, [esp + 4]   // src_argb
@@ -828,7 +824,7 @@ void ARGBToRAWRow_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix) {
 }
 // 4 pixels
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ARGBToRGB565Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) {
  __asm {
    mov       eax, [esp + 4]   // src_argb
@@ -866,7 +862,7 @@ void ARGBToRGB565Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) {
 }
 // 8 pixels
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ARGBToRGB565DitherRow_SSE2(const uint8* src_argb, uint8* dst_rgb,
                                const uint32 dither4, int pix) {
  __asm {
@@ -912,7 +908,7 @@ void ARGBToRGB565DitherRow_SSE2(const uint8* src_argb, uint8* dst_rgb,
 }
 #ifdef HAS_ARGBTORGB565DITHERROW_AVX2
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ARGBToRGB565DitherRow_AVX2(const uint8* src_argb, uint8* dst_rgb,
                                const uint32 dither4, int pix) {
  __asm {
@@ -955,7 +951,7 @@ void ARGBToRGB565DitherRow_AVX2(const uint8* src_argb, uint8* dst_rgb,
 #endif  // HAS_ARGBTORGB565DITHERROW_AVX2
 // TODO(fbarchard): Improve sign extension/packing.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ARGBToARGB1555Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) {
  __asm {
    mov       eax, [esp + 4]   // src_argb
@@ -996,7 +992,7 @@ void ARGBToARGB1555Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) {
  }
 }
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ARGBToARGB4444Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) {
  __asm {
    mov       eax, [esp + 4]   // src_argb
@@ -1026,7 +1022,7 @@ void ARGBToARGB4444Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) {
 }
 #ifdef HAS_ARGBTORGB565ROW_AVX2
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ARGBToRGB565Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix) {
  __asm {
    mov        eax, [esp + 4]      // src_argb
@@ -1063,7 +1059,7 @@ void ARGBToRGB565Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix) {
 #endif  // HAS_ARGBTORGB565ROW_AVX2
 #ifdef HAS_ARGBTOARGB1555ROW_AVX2
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ARGBToARGB1555Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix) {
  __asm {
    mov        eax, [esp + 4]      // src_argb
@@ -1103,7 +1099,7 @@ void ARGBToARGB1555Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix) {
 #endif  // HAS_ARGBTOARGB1555ROW_AVX2
 #ifdef HAS_ARGBTOARGB4444ROW_AVX2
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ARGBToARGB4444Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix) {
  __asm {
    mov        eax, [esp + 4]   // src_argb
@@ -1134,7 +1130,7 @@ void ARGBToARGB4444Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix) {
 #endif  // HAS_ARGBTOARGB4444ROW_AVX2
 // Convert 16 ARGB pixels (64 bytes) to 16 Y values.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
  __asm {
    mov        eax, [esp + 4]   /* src_argb */
@@ -1169,7 +1165,7 @@ void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
 // Convert 16 ARGB pixels (64 bytes) to 16 YJ values.
 // Same as ARGBToYRow but different coefficients, no add 16, but do rounding.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ARGBToYJRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
  __asm {
    mov        eax, [esp + 4]   /* src_argb */
@@ -1288,7 +1284,7 @@ void ARGBToYJRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) {
 }
 #endif  //  HAS_ARGBTOYJROW_AVX2
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void BGRAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
  __asm {
    mov        eax, [esp + 4]   /* src_argb */
@@ -1321,7 +1317,7 @@ void BGRAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
  }
 }
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ABGRToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
  __asm {
    mov        eax, [esp + 4]   /* src_argb */
@@ -1354,7 +1350,7 @@ void ABGRToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
  }
 }
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void RGBAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
  __asm {
    mov        eax, [esp + 4]   /* src_argb */
@@ -1387,7 +1383,7 @@ void RGBAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
  }
 }
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
                       uint8* dst_u, uint8* dst_v, int width) {
  __asm {
@@ -1457,7 +1453,7 @@ void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
  }
 }
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ARGBToUVJRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
                        uint8* dst_u, uint8* dst_v, int width) {
  __asm {
@@ -1594,7 +1590,7 @@ void ARGBToUVRow_AVX2(const uint8* src_argb0, int src_stride_argb,
 }
 #endif  // HAS_ARGBTOUVROW_AVX2
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ARGBToUV444Row_SSSE3(const uint8* src_argb0,
                          uint8* dst_u, uint8* dst_v, int width) {
  __asm {
@@ -1651,7 +1647,7 @@ void ARGBToUV444Row_SSSE3(const uint8* src_argb0,
  }
 }
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ARGBToUV422Row_SSSE3(const uint8* src_argb0,
                          uint8* dst_u, uint8* dst_v, int width) {
  __asm {
@@ -1709,7 +1705,7 @@ void ARGBToUV422Row_SSSE3(const uint8* src_argb0,
  }
 }
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void BGRAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
                       uint8* dst_u, uint8* dst_v, int width) {
  __asm {
@@ -1779,7 +1775,7 @@ void BGRAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
  }
 }
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ABGRToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
                       uint8* dst_u, uint8* dst_v, int width) {
  __asm {
@@ -1849,7 +1845,7 @@ void ABGRToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
  }
 }
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void RGBAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
                       uint8* dst_u, uint8* dst_v, int width) {
  __asm {
@@ -2005,7 +2001,7 @@ void RGBAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
 #ifdef HAS_I422TOARGBROW_AVX2
 // 16 pixels
 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void I422ToARGBRow_AVX2(const uint8* y_buf,
                        const uint8* u_buf,
                        const uint8* v_buf,
@@ -2041,7 +2037,7 @@ void I422ToARGBRow_AVX2(const uint8* y_buf,
 #ifdef HAS_J422TOARGBROW_AVX2
 // 16 pixels
 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void J422ToARGBRow_AVX2(const uint8* y_buf,
                        const uint8* u_buf,
                        const uint8* v_buf,
@@ -2077,7 +2073,7 @@ void J422ToARGBRow_AVX2(const uint8* y_buf,
 #ifdef HAS_I444TOARGBROW_AVX2
 // 16 pixels
 // 16 UV values with 16 Y producing 16 ARGB (64 bytes).
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void I444ToARGBRow_AVX2(const uint8* y_buf,
                        const uint8* u_buf,
                        const uint8* v_buf,
@@ -2113,7 +2109,7 @@ void I444ToARGBRow_AVX2(const uint8* y_buf,
 #ifdef HAS_I411TOARGBROW_AVX2
 // 16 pixels
 // 4 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void I411ToARGBRow_AVX2(const uint8* y_buf,
                        const uint8* u_buf,
                        const uint8* v_buf,
@@ -2149,7 +2145,7 @@ void I411ToARGBRow_AVX2(const uint8* y_buf,
 #ifdef HAS_NV12TOARGBROW_AVX2
 // 16 pixels.
 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void NV12ToARGBRow_AVX2(const uint8* y_buf,
                        const uint8* uv_buf,
                        uint8* dst_argb,
@@ -2180,7 +2176,7 @@ void NV12ToARGBRow_AVX2(const uint8* y_buf,
 #ifdef HAS_NV21TOARGBROW_AVX2
 // 16 pixels.
 // 8 VU values upsampled to 16 VU, mixed with 16 Y producing 16 ARGB (64 bytes).
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void NV21ToARGBRow_AVX2(const uint8* y_buf,
                        const uint8* uv_buf,
                        uint8* dst_argb,
@@ -2212,7 +2208,7 @@ void NV21ToARGBRow_AVX2(const uint8* y_buf,
 // 16 pixels
 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 BGRA (64 bytes).
 // TODO(fbarchard): Use macros to reduce duplicate code.  See SSSE3.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void I422ToBGRARow_AVX2(const uint8* y_buf,
                        const uint8* u_buf,
                        const uint8* v_buf,
@@ -2258,7 +2254,7 @@ void I422ToBGRARow_AVX2(const uint8* y_buf,
 // 16 pixels
 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 RGBA (64 bytes).
 // TODO(fbarchard): Use macros to reduce duplicate code.  See SSSE3.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void I422ToRGBARow_AVX2(const uint8* y_buf,
                        const uint8* u_buf,
                        const uint8* v_buf,
@@ -2304,7 +2300,7 @@ void I422ToRGBARow_AVX2(const uint8* y_buf,
 // 16 pixels
 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ABGR (64 bytes).
 // TODO(fbarchard): Use macros to reduce duplicate code.  See SSSE3.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void I422ToABGRRow_AVX2(const uint8* y_buf,
                        const uint8* u_buf,
                        const uint8* v_buf,
@@ -2542,7 +2538,7 @@ void I422ToABGRRow_AVX2(const uint8* y_buf,
 // 8 pixels.
 // 8 UV values, mixed with 8 Y producing 8 ARGB (32 bytes).
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void I444ToARGBRow_SSSE3(const uint8* y_buf,
                         const uint8* u_buf,
                         const uint8* v_buf,
@@ -2575,7 +2571,7 @@ void I444ToARGBRow_SSSE3(const uint8* y_buf,
 // 8 pixels.
 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RGB24 (24 bytes).
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void I422ToRGB24Row_SSSE3(const uint8* y_buf,
                          const uint8* u_buf,
                          const uint8* v_buf,
@@ -2609,7 +2605,7 @@ void I422ToRGB24Row_SSSE3(const uint8* y_buf,
 // 8 pixels.
 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RAW (24 bytes).
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void I422ToRAWRow_SSSE3(const uint8* y_buf,
                        const uint8* u_buf,
                        const uint8* v_buf,
@@ -2643,7 +2639,7 @@ void I422ToRAWRow_SSSE3(const uint8* y_buf,
 // 8 pixels
 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RGB565 (16 bytes).
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void I422ToRGB565Row_SSSE3(const uint8* y_buf,
                           const uint8* u_buf,
                           const uint8* v_buf,
@@ -2682,7 +2678,7 @@ void I422ToRGB565Row_SSSE3(const uint8* y_buf,
 // 8 pixels.
 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void I422ToARGBRow_SSSE3(const uint8* y_buf,
                         const uint8* u_buf,
                         const uint8* v_buf,
@@ -2716,7 +2712,7 @@ void I422ToARGBRow_SSSE3(const uint8* y_buf,
 // 8 pixels.
 // JPeg color space version of I422ToARGB
 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void J422ToARGBRow_SSSE3(const uint8* y_buf,
                         const uint8* u_buf,
                         const uint8* v_buf,
@@ -2750,7 +2746,7 @@ void J422ToARGBRow_SSSE3(const uint8* y_buf,
 // 8 pixels.
 // 2 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
 // Similar to I420 but duplicate UV once more.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void I411ToARGBRow_SSSE3(const uint8* y_buf,
                         const uint8* u_buf,
                         const uint8* v_buf,
@@ -2785,7 +2781,7 @@ void I411ToARGBRow_SSSE3(const uint8* y_buf,
 // 8 pixels.
 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void NV12ToARGBRow_SSSE3(const uint8* y_buf,
                         const uint8* uv_buf,
                         uint8* dst_argb,
@@ -2813,7 +2809,7 @@ void NV12ToARGBRow_SSSE3(const uint8* y_buf,
 // 8 pixels.
 // 4 VU values upsampled to 8 VU, mixed with 8 Y producing 8 ARGB (32 bytes).
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void NV21ToARGBRow_SSSE3(const uint8* y_buf,
                         const uint8* uv_buf,
                         uint8* dst_argb,
@@ -2839,7 +2835,7 @@ void NV21ToARGBRow_SSSE3(const uint8* y_buf,
  }
 }
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void I422ToBGRARow_SSSE3(const uint8* y_buf,
                         const uint8* u_buf,
                         const uint8* v_buf,
@@ -2869,7 +2865,7 @@ void I422ToBGRARow_SSSE3(const uint8* y_buf,
  }
 }
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void I422ToABGRRow_SSSE3(const uint8* y_buf,
                         const uint8* u_buf,
                         const uint8* v_buf,
@@ -2900,7 +2896,7 @@ void I422ToABGRRow_SSSE3(const uint8* y_buf,
  }
 }
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void I422ToRGBARow_SSSE3(const uint8* y_buf,
                         const uint8* u_buf,
                         const uint8* v_buf,
@@ -2934,7 +2930,7 @@ void I422ToRGBARow_SSSE3(const uint8* y_buf,
 #ifdef HAS_I400TOARGBROW_SSE2
 // 8 pixels of Y converted to 8 pixels of ARGB (32 bytes).
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void I400ToARGBRow_SSE2(const uint8* y_buf,
                        uint8* rgb_buf,
                        int width) {
@@ -2982,7 +2978,7 @@ void I400ToARGBRow_SSE2(const uint8* y_buf,
 #ifdef HAS_I400TOARGBROW_AVX2
 // 16 pixels of Y converted to 16 pixels of ARGB (64 bytes).
 // note: vpunpcklbw mutates and vpackuswb unmutates.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void I400ToARGBRow_AVX2(const uint8* y_buf,
                        uint8* rgb_buf,
                        int width) {
@@ -3037,7 +3033,7 @@ static const uvec8 kShuffleMirror = {
 };
 // TODO(fbarchard): Replace lea with -16 offset.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
  __asm {
    mov       eax, [esp + 4]   // src
@@ -3058,7 +3054,7 @@ void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
 #endif  // HAS_MIRRORROW_SSSE3
 #ifdef HAS_MIRRORROW_AVX2
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void MirrorRow_AVX2(const uint8* src, uint8* dst, int width) {
  __asm {
    mov       eax, [esp + 4]   // src
@@ -3081,7 +3077,7 @@ void MirrorRow_AVX2(const uint8* src, uint8* dst, int width) {
 #endif  // HAS_MIRRORROW_AVX2
 #ifdef HAS_MIRRORROW_SSE2
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void MirrorRow_SSE2(const uint8* src, uint8* dst, int width) {
  __asm {
    mov       eax, [esp + 4]   // src
@@ -3112,7 +3108,7 @@ static const uvec8 kShuffleMirrorUV = {
  14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u
 };
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void MirrorUVRow_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v,
                       int width) {
  __asm {
@@ -3142,7 +3138,7 @@ void MirrorUVRow_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v,
 #endif  // HAS_MIRRORROW_UV_SSSE3
 #ifdef HAS_ARGBMIRRORROW_SSE2
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ARGBMirrorRow_SSE2(const uint8* src, uint8* dst, int width) {
  __asm {
    mov       eax, [esp + 4]   // src
@@ -3169,7 +3165,7 @@ static const ulvec32 kARGBShuffleMirror_AVX2 = {
  7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u
 };
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ARGBMirrorRow_AVX2(const uint8* src, uint8* dst, int width) {
  __asm {
    mov       eax, [esp + 4]   // src
@@ -3190,7 +3186,7 @@ void ARGBMirrorRow_AVX2(const uint8* src, uint8* dst, int width) {
 #endif  // HAS_ARGBMIRRORROW_AVX2
 #ifdef HAS_SPLITUVROW_SSE2
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {
  __asm {
    push       edi
@@ -3228,7 +3224,7 @@ void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {
 #endif  // HAS_SPLITUVROW_SSE2
 #ifdef HAS_SPLITUVROW_AVX2
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void SplitUVRow_AVX2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {
  __asm {
    push       edi
@@ -3266,7 +3262,7 @@ void SplitUVRow_AVX2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {
 #endif  // HAS_SPLITUVROW_AVX2
 #ifdef HAS_MERGEUVROW_SSE2
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void MergeUVRow_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
                     int width) {
  __asm {
@@ -3297,7 +3293,7 @@ void MergeUVRow_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
 #endif  //  HAS_MERGEUVROW_SSE2
 #ifdef HAS_MERGEUVROW_AVX2
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void MergeUVRow_AVX2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
                     int width) {
  __asm {
@@ -3331,7 +3327,7 @@ void MergeUVRow_AVX2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
 #ifdef HAS_COPYROW_SSE2
 // CopyRow copys 'count' bytes using a 16 byte load/store, 32 bytes at time.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void CopyRow_SSE2(const uint8* src, uint8* dst, int count) {
  __asm {
    mov        eax, [esp + 4]   // src
@@ -3354,7 +3350,7 @@ void CopyRow_SSE2(const uint8* src, uint8* dst, int count) {
 #ifdef HAS_COPYROW_AVX
 // CopyRow copys 'count' bytes using a 32 byte load/store, 64 bytes at time.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void CopyRow_AVX(const uint8* src, uint8* dst, int count) {
  __asm {
    mov        eax, [esp + 4]   // src
@@ -3378,7 +3374,7 @@ void CopyRow_AVX(const uint8* src, uint8* dst, int count) {
 #endif  // HAS_COPYROW_AVX
 // Multiple of 1.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void CopyRow_ERMS(const uint8* src, uint8* dst, int count) {
  __asm {
    mov        eax, esi
@@ -3395,7 +3391,7 @@ void CopyRow_ERMS(const uint8* src, uint8* dst, int count) {
 #ifdef HAS_ARGBCOPYALPHAROW_SSE2
 // width in pixels
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ARGBCopyAlphaRow_SSE2(const uint8* src, uint8* dst, int width) {
  __asm {
    mov        eax, [esp + 4]   // src
@@ -3431,7 +3427,7 @@ void ARGBCopyAlphaRow_SSE2(const uint8* src, uint8* dst, int width) {
 #ifdef HAS_ARGBCOPYALPHAROW_AVX2
 // width in pixels
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ARGBCopyAlphaRow_AVX2(const uint8* src, uint8* dst, int width) {
  __asm {
    mov        eax, [esp + 4]   // src
@@ -3460,7 +3456,7 @@ void ARGBCopyAlphaRow_AVX2(const uint8* src, uint8* dst, int width) {
 #ifdef HAS_ARGBCOPYYTOALPHAROW_SSE2
 // width in pixels
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ARGBCopyYToAlphaRow_SSE2(const uint8* src, uint8* dst, int width) {
  __asm {
    mov        eax, [esp + 4]   // src
@@ -3498,7 +3494,7 @@ void ARGBCopyYToAlphaRow_SSE2(const uint8* src, uint8* dst, int width) {
 #ifdef HAS_ARGBCOPYYTOALPHAROW_AVX2
 // width in pixels
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ARGBCopyYToAlphaRow_AVX2(const uint8* src, uint8* dst, int width) {
  __asm {
    mov        eax, [esp + 4]   // src
@@ -3530,7 +3526,7 @@ void ARGBCopyYToAlphaRow_AVX2(const uint8* src, uint8* dst, int width) {
 #ifdef HAS_SETROW_X86
 // Write 'count' bytes using an 8 bit value repeated.
 // Count should be multiple of 4.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void SetRow_X86(uint8* dst, uint8 v8, int count) {
  __asm {
    movzx      eax, byte ptr [esp + 8]    // v8
@@ -3547,7 +3543,7 @@ void SetRow_X86(uint8* dst, uint8 v8, int count) {
 }
 // Write 'count' bytes using an 8 bit value repeated.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void SetRow_ERMS(uint8* dst, uint8 v8, int count) {
  __asm {
    mov        edx, edi
@@ -3561,7 +3557,7 @@ void SetRow_ERMS(uint8* dst, uint8 v8, int count) {
 }
 // Write 'count' 32 bit values.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ARGBSetRow_X86(uint8* dst_argb, uint32 v32, int count) {
  __asm {
    mov        edx, edi
@@ -3576,7 +3572,7 @@ void ARGBSetRow_X86(uint8* dst_argb, uint32 v32, int count) {
 #endif  // HAS_SETROW_X86
 #ifdef HAS_YUY2TOYROW_AVX2
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void YUY2ToYRow_AVX2(const uint8* src_yuy2,
                     uint8* dst_y, int pix) {
  __asm {
@@ -3603,7 +3599,7 @@ void YUY2ToYRow_AVX2(const uint8* src_yuy2,
  }
 }
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void YUY2ToUVRow_AVX2(const uint8* src_yuy2, int stride_yuy2,
                      uint8* dst_u, uint8* dst_v, int pix) {
  __asm {
@@ -3647,7 +3643,7 @@ void YUY2ToUVRow_AVX2(const uint8* src_yuy2, int stride_yuy2,
  }
 }
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void YUY2ToUV422Row_AVX2(const uint8* src_yuy2,
                         uint8* dst_u, uint8* dst_v, int pix) {
  __asm {
@@ -3686,7 +3682,7 @@ void YUY2ToUV422Row_AVX2(const uint8* src_yuy2,
  }
 }
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void UYVYToYRow_AVX2(const uint8* src_uyvy,
                     uint8* dst_y, int pix) {
  __asm {
@@ -3711,7 +3707,7 @@ void UYVYToYRow_AVX2(const uint8* src_uyvy,
  }
 }
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void UYVYToUVRow_AVX2(const uint8* src_uyvy, int stride_uyvy,
                      uint8* dst_u, uint8* dst_v, int pix) {
  __asm {
@@ -3755,7 +3751,7 @@ void UYVYToUVRow_AVX2(const uint8* src_uyvy, int stride_uyvy,
  }
 }
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void UYVYToUV422Row_AVX2(const uint8* src_uyvy,
                         uint8* dst_u, uint8* dst_v, int pix) {
  __asm {
@@ -3796,7 +3792,7 @@ void UYVYToUV422Row_AVX2(const uint8* src_uyvy,
 #endif  // HAS_YUY2TOYROW_AVX2
 #ifdef HAS_YUY2TOYROW_SSE2
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void YUY2ToYRow_SSE2(const uint8* src_yuy2,
                     uint8* dst_y, int pix) {
  __asm {
@@ -3821,7 +3817,7 @@ void YUY2ToYRow_SSE2(const uint8* src_yuy2,
  }
 }
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2,
                      uint8* dst_u, uint8* dst_v, int pix) {
  __asm {
@@ -3864,7 +3860,7 @@ void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2,
  }
 }
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void YUY2ToUV422Row_SSE2(const uint8* src_yuy2,
                         uint8* dst_u, uint8* dst_v, int pix) {
  __asm {
@@ -3900,7 +3896,7 @@ void YUY2ToUV422Row_SSE2(const uint8* src_yuy2,
  }
 }
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void UYVYToYRow_SSE2(const uint8* src_uyvy,
                     uint8* dst_y, int pix) {
  __asm {
@@ -3923,7 +3919,7 @@ void UYVYToYRow_SSE2(const uint8* src_uyvy,
  }
 }
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy,
                      uint8* dst_u, uint8* dst_v, int pix) {
  __asm {
@@ -3966,7 +3962,7 @@ void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy,
  }
 }
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void UYVYToUV422Row_SSE2(const uint8* src_uyvy,
                         uint8* dst_u, uint8* dst_v, int pix) {
  __asm {
@@ -4005,7 +4001,7 @@ void UYVYToUV422Row_SSE2(const uint8* src_uyvy,
 #ifdef HAS_ARGBBLENDROW_SSE2
 // Blend 8 pixels at a time.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ARGBBlendRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
                       uint8* dst_argb, int width) {
  __asm {
@@ -4139,7 +4135,7 @@ static const uvec8 kShuffleAlpha = {
 //    pshufb     xmm3, kShuffleAlpha // alpha
 // Blend 8 pixels at a time.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
                        uint8* dst_argb, int width) {
  __asm {
@@ -4255,7 +4251,7 @@ void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
 #ifdef HAS_ARGBATTENUATEROW_SSE2
 // Attenuate 4 pixels at a time.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) {
  __asm {
    mov        eax, [esp + 4]   // src_argb0
@@ -4304,7 +4300,7 @@ static const uvec8 kShuffleAlpha1 = {
  11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u,
  15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u,
 };
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
  __asm {
    mov        eax, [esp + 4]   // src_argb0
@@ -4348,7 +4344,7 @@ void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
 static const uvec8 kShuffleAlpha_AVX2 = {
  6u, 7u, 6u, 7u, 6u, 7u, 128u, 128u, 14u, 15u, 14u, 15u, 14u, 15u, 128u, 128u
 };
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ARGBAttenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width) {
  __asm {
    mov        eax, [esp + 4]   // src_argb0
@@ -4385,7 +4381,7 @@ void ARGBAttenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width) {
 #ifdef HAS_ARGBUNATTENUATEROW_SSE2
 // Unattenuate 4 pixels at a time.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb,
                             int width) {
  __asm {
@@ -4439,7 +4435,7 @@ static const uvec8 kUnattenShuffleAlpha_AVX2 = {
 // TODO(fbarchard): Enable USE_GATHER for future hardware if faster.
 // USE_GATHER is not on by default, due to being a slow instruction.
 #ifdef USE_GATHER
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb,
                             int width) {
  __asm {
@@ -4473,7 +4469,7 @@ void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb,
  }
 }
 #else  // USE_GATHER
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb,
                             int width) {
  __asm {
@@ -4540,7 +4536,7 @@ void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb,
 #ifdef HAS_ARGBGRAYROW_SSSE3
 // Convert 8 ARGB pixels (64 bytes) to 8 Gray ARGB pixels.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
  __asm {
    mov        eax, [esp + 4]   /* src_argb */
@@ -4599,7 +4595,7 @@ static const vec8 kARGBToSepiaR = {
 };
 // Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) {
  __asm {
    mov        eax, [esp + 4]   /* dst_argb */
@@ -4656,7 +4652,7 @@ void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) {
 // Same as Sepia except matrix is provided.
 // TODO(fbarchard): packuswbs only use half of the reg. To make RGBA, combine R
 // and B into a high and low, then G/A, unpackl/hbw and then unpckl/hwd.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ARGBColorMatrixRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
                              const int8* matrix_argb, int width) {
  __asm {
@@ -4717,7 +4713,7 @@ void ARGBColorMatrixRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
 #ifdef HAS_ARGBQUANTIZEROW_SSE2
 // Quantize 4 ARGB pixels (16 bytes).
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size,
                          int interval_offset, int width) {
  __asm {
@@ -4762,7 +4758,7 @@ void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size,
 #ifdef HAS_ARGBSHADEROW_SSE2
 // Shade 4 pixels at a time by specified value.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width,
                       uint32 value) {
  __asm {
@@ -4796,7 +4792,7 @@ void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width,
 #ifdef HAS_ARGBMULTIPLYROW_SSE2
 // Multiply 2 rows of ARGB pixels together, 4 pixels at a time.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ARGBMultiplyRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
                          uint8* dst_argb, int width) {
  __asm {
@@ -4835,7 +4831,7 @@ void ARGBMultiplyRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
 #ifdef HAS_ARGBADDROW_SSE2
 // Add 2 rows of ARGB pixels together, 4 pixels at a time.
 // TODO(fbarchard): Port this to posix, neon and other math functions.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ARGBAddRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
                     uint8* dst_argb, int width) {
  __asm {
@@ -4883,7 +4879,7 @@ void ARGBAddRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
 #ifdef HAS_ARGBSUBTRACTROW_SSE2
 // Subtract 2 rows of ARGB pixels together, 4 pixels at a time.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ARGBSubtractRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
                          uint8* dst_argb, int width) {
  __asm {
@@ -4912,7 +4908,7 @@ void ARGBSubtractRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
 #ifdef HAS_ARGBMULTIPLYROW_AVX2
 // Multiply 2 rows of ARGB pixels together, 8 pixels at a time.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ARGBMultiplyRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
                          uint8* dst_argb, int width) {
  __asm {
@@ -4949,7 +4945,7 @@ void ARGBMultiplyRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
 #ifdef HAS_ARGBADDROW_AVX2
 // Add 2 rows of ARGB pixels together, 8 pixels at a time.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ARGBAddRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
                     uint8* dst_argb, int width) {
  __asm {
@@ -4978,7 +4974,7 @@ void ARGBAddRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
 #ifdef HAS_ARGBSUBTRACTROW_AVX2
 // Subtract 2 rows of ARGB pixels together, 8 pixels at a time.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ARGBSubtractRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
                          uint8* dst_argb, int width) {
  __asm {
@@ -5010,7 +5006,7 @@ void ARGBSubtractRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
 // -1  0  1
 // -2  0  2
 // -1  0  1
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void SobelXRow_SSE2(const uint8* src_y0, const uint8* src_y1,
                    const uint8* src_y2, uint8* dst_sobelx, int width) {
  __asm {
@@ -5066,7 +5062,7 @@ void SobelXRow_SSE2(const uint8* src_y0, const uint8* src_y1,
 // -1 -2 -1
 //  0  0  0
 //  1  2  1
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void SobelYRow_SSE2(const uint8* src_y0, const uint8* src_y1,
                    uint8* dst_sobely, int width) {
  __asm {
@@ -5119,7 +5115,7 @@ void SobelYRow_SSE2(const uint8* src_y0, const uint8* src_y1,
 // R = Sobel
 // G = Sobel
 // B = Sobel
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void SobelRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
                   uint8* dst_argb, int width) {
  __asm {
@@ -5166,7 +5162,7 @@ void SobelRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
 #ifdef HAS_SOBELTOPLANEROW_SSE2
 // Adds Sobel X and Sobel Y and stores Sobel into a plane.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void SobelToPlaneRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
                          uint8* dst_y, int width) {
  __asm {
@@ -5199,7 +5195,7 @@ void SobelToPlaneRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
 // R = Sobel X
 // G = Sobel
 // B = Sobel Y
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void SobelXYRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
                     uint8* dst_argb, int width) {
  __asm {
@@ -5486,7 +5482,7 @@ void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum,
 #ifdef HAS_ARGBAFFINEROW_SSE2
 // Copy ARGB pixels from source image with slope to a row of destination.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 LIBYUV_API
 void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
                        uint8* dst_argb, const float* uv_dudv, int width) {
@@ -5571,7 +5567,7 @@ void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
 #ifdef HAS_INTERPOLATEROW_AVX2
 // Bilinear filter 32x2 -> 32x1
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr,
                         ptrdiff_t src_stride, int dst_width,
                         int source_y_fraction) {
@@ -5668,7 +5664,7 @@ void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr,
 #endif  // HAS_INTERPOLATEROW_AVX2
 // Bilinear filter 16x2 -> 16x1
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
                          ptrdiff_t src_stride, int dst_width,
                          int source_y_fraction) {
@@ -5769,7 +5765,7 @@ void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
 #ifdef HAS_INTERPOLATEROW_SSE2
 // Bilinear filter 16x2 -> 16x1
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void InterpolateRow_SSE2(uint8* dst_ptr, const uint8* src_ptr,
                         ptrdiff_t src_stride, int dst_width,
                         int source_y_fraction) {
@@ -5876,7 +5872,7 @@ void InterpolateRow_SSE2(uint8* dst_ptr, const uint8* src_ptr,
 #endif  // HAS_INTERPOLATEROW_SSE2
 // Specialized ARGB to Bayer that just isolates G channel.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ARGBToBayerGGRow_SSE2(const uint8* src_argb, uint8* dst_bayer,
                           uint32 selector, int pix) {
  __asm {
@@ -5906,7 +5902,7 @@ void ARGBToBayerGGRow_SSE2(const uint8* src_argb, uint8* dst_bayer,
 }
 // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
                          const uint8* shuffler, int pix) {
  __asm {
@@ -5932,7 +5928,7 @@ void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
 }
 #ifdef HAS_ARGBSHUFFLEROW_AVX2
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ARGBShuffleRow_AVX2(const uint8* src_argb, uint8* dst_argb,
                         const uint8* shuffler, int pix) {
  __asm {
@@ -5960,7 +5956,7 @@ void ARGBShuffleRow_AVX2(const uint8* src_argb, uint8* dst_argb,
 }
 #endif  // HAS_ARGBSHUFFLEROW_AVX2
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb,
                         const uint8* shuffler, int pix) {
  __asm {
@@ -6082,7 +6078,7 @@ void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb,
 // UYVY - Macro-pixel = 2 image pixels
 // U0Y0V0Y1
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void I422ToYUY2Row_SSE2(const uint8* src_y,
                        const uint8* src_u,
                        const uint8* src_v,
@@ -6119,7 +6115,7 @@ void I422ToYUY2Row_SSE2(const uint8* src_y,
  }
 }
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void I422ToUYVYRow_SSE2(const uint8* src_y,
                        const uint8* src_u,
                        const uint8* src_v,
@@ -6157,7 +6153,7 @@ void I422ToUYVYRow_SSE2(const uint8* src_y,
 }
 #ifdef HAS_ARGBPOLYNOMIALROW_SSE2
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ARGBPolynomialRow_SSE2(const uint8* src_argb,
                            uint8* dst_argb, const float* poly,
                            int width) {
@@ -6216,7 +6212,7 @@ void ARGBPolynomialRow_SSE2(const uint8* src_argb,
 #endif  // HAS_ARGBPOLYNOMIALROW_SSE2
 #ifdef HAS_ARGBPOLYNOMIALROW_AVX2
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ARGBPolynomialRow_AVX2(const uint8* src_argb,
                            uint8* dst_argb, const float* poly,
                            int width) {
@@ -6256,7 +6252,7 @@ void ARGBPolynomialRow_AVX2(const uint8* src_argb,
 #ifdef HAS_ARGBCOLORTABLEROW_X86
 // Tranform ARGB pixels with color table.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ARGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb,
                           int width) {
  __asm {
@@ -6290,7 +6286,7 @@ void ARGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb,
 #ifdef HAS_RGBCOLORTABLEROW_X86
 // Tranform RGB pixels with color table.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void RGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, int width) {
  __asm {
    push       esi
@@ -6321,7 +6317,7 @@ void RGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, int width) {
 #ifdef HAS_ARGBLUMACOLORTABLEROW_SSSE3
 // Tranform RGB pixels with luma table.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ARGBLumaColorTableRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
                                 int width,
                                 const uint8* luma, uint32 lumacoeff) {

--- a/source/scale_win.cc
+++ b/source/scale_win.cc
@@ -95,7 +95,7 @@ static uvec16 kScaleAb2 =
  { 65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3, 65536 / 3, 65536 / 2, 0, 0 };
 // Reads 32 pixels, throws half away and writes 16 pixels.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ScaleRowDown2_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
                        uint8* dst_ptr, int dst_width) {
  __asm {
@@ -121,7 +121,7 @@ void ScaleRowDown2_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
 }
 // Blends 32x1 rectangle to 16x1.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ScaleRowDown2Linear_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
                              uint8* dst_ptr, int dst_width) {
  __asm {
@@ -157,7 +157,7 @@ void ScaleRowDown2Linear_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
 }
 // Blends 32x2 rectangle to 16x1.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ScaleRowDown2Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
                           uint8* dst_ptr, int dst_width) {
  __asm {
@@ -200,7 +200,7 @@ void ScaleRowDown2Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
 #ifdef HAS_SCALEROWDOWN2_AVX2
 // Reads 64 pixels, throws half away and writes 32 pixels.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ScaleRowDown2_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
                        uint8* dst_ptr, int dst_width) {
  __asm {
@@ -228,7 +228,7 @@ void ScaleRowDown2_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
 }
 // Blends 64x1 rectangle to 32x1.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ScaleRowDown2Linear_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
                              uint8* dst_ptr, int dst_width) {
  __asm {
@@ -265,7 +265,7 @@ void ScaleRowDown2Linear_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
 }
 // Blends 64x2 rectangle to 32x1.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ScaleRowDown2Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
                           uint8* dst_ptr, int dst_width) {
  __asm {
@@ -307,7 +307,7 @@ void ScaleRowDown2Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
 #endif  // HAS_SCALEROWDOWN2_AVX2
 // Point samples 32 pixels to 8 pixels.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ScaleRowDown4_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
                        uint8* dst_ptr, int dst_width) {
  __asm {
@@ -338,7 +338,7 @@ void ScaleRowDown4_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
 }
 // Blends 32x4 rectangle to 8x1.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ScaleRowDown4Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
                           uint8* dst_ptr, int dst_width) {
  __asm {
@@ -401,7 +401,7 @@ void ScaleRowDown4Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
 // Then shuffled to do the scaling.
 // Note that movdqa+palign may be better than movdqu.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ScaleRowDown34_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
                          uint8* dst_ptr, int dst_width) {
  __asm {
@@ -448,7 +448,7 @@ void ScaleRowDown34_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
 // xmm7 kRound34
 // Note that movdqa+palign may be better than movdqu.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr,
                                ptrdiff_t src_stride,
                                uint8* dst_ptr, int dst_width) {
@@ -505,7 +505,7 @@ void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr,
 }
 // Note that movdqa+palign may be better than movdqu.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr,
                                ptrdiff_t src_stride,
                                uint8* dst_ptr, int dst_width) {
@@ -567,7 +567,7 @@ void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr,
 // 3/8 point sampler
 // Scale 32 pixels to 12
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ScaleRowDown38_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
                          uint8* dst_ptr, int dst_width) {
  __asm {
@@ -598,7 +598,7 @@ void ScaleRowDown38_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
 }
 // Scale 16x3 pixels to 6x1 with interpolation
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr,
                                ptrdiff_t src_stride,
                                uint8* dst_ptr, int dst_width) {
@@ -663,7 +663,7 @@ void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr,
 }
 // Scale 16x2 pixels to 6x1 with interpolation
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr,
                                ptrdiff_t src_stride,
                                uint8* dst_ptr, int dst_width) {
@@ -709,7 +709,7 @@ void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr,
 // Reads 16xN bytes and produces 16 shorts at a time.
 // TODO(fbarchard): Make this handle 4xN bytes for any width ARGB.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
                       uint16* dst_ptr, int src_width,
                       int src_height) {
@@ -775,7 +775,7 @@ void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
 // when drmemory bug fixed.
 // https://code.google.com/p/drmemory/issues/detail?id=1396
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
                           int dst_width, int x, int dx) {
  __asm {
@@ -852,7 +852,7 @@ void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
 }
 // Reads 16 pixels, duplicates them and writes 32 pixels.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ScaleColsUp2_SSE2(uint8* dst_ptr, const uint8* src_ptr,
                       int dst_width, int x, int dx) {
  __asm {
@@ -877,7 +877,7 @@ void ScaleColsUp2_SSE2(uint8* dst_ptr, const uint8* src_ptr,
 }
 // Reads 8 pixels, throws half away and writes 4 even pixels (0, 2, 4, 6)
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ScaleARGBRowDown2_SSE2(const uint8* src_argb,
                            ptrdiff_t src_stride,
                            uint8* dst_argb, int dst_width) {
@@ -902,7 +902,7 @@ void ScaleARGBRowDown2_SSE2(const uint8* src_argb,
 }
 // Blends 8x1 rectangle to 4x1.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ScaleARGBRowDown2Linear_SSE2(const uint8* src_argb,
                                  ptrdiff_t src_stride,
                                  uint8* dst_argb, int dst_width) {
@@ -930,7 +930,7 @@ void ScaleARGBRowDown2Linear_SSE2(const uint8* src_argb,
 }
 // Blends 8x2 rectangle to 4x1.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb,
                               ptrdiff_t src_stride,
                               uint8* dst_argb, int dst_width) {
@@ -964,7 +964,7 @@ void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb,
 }
 // Reads 4 pixels at a time.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
                               int src_stepx,
                               uint8* dst_argb, int dst_width) {
@@ -1000,7 +1000,7 @@ void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
 }
 // Blends four 2x2 to 4x1.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb,
                                  ptrdiff_t src_stride,
                                  int src_stepx,
@@ -1048,7 +1048,7 @@ void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb,
 }
 // Column scaling unfiltered. SSE2 version.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ScaleARGBCols_SSE2(uint8* dst_argb, const uint8* src_argb,
                        int dst_width, int x, int dx) {
  __asm {
@@ -1139,7 +1139,7 @@ static uvec8 kShuffleFractions = {
  0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, 4u,
 };
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb,
                               int dst_width, int x, int dx) {
  __asm {
@@ -1210,7 +1210,7 @@ void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb,
 }
 // Reads 4 pixels, duplicates them and writes 8 pixels.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ScaleARGBColsUp2_SSE2(uint8* dst_argb, const uint8* src_argb,
                           int dst_width, int x, int dx) {
  __asm {
@@ -1235,7 +1235,7 @@ void ScaleARGBColsUp2_SSE2(uint8* dst_argb, const uint8* src_argb,
 }
 // Divide num by div and return as 16.16 fixed point result.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 int FixedDiv_X86(int num, int div) {
  __asm {
    mov        eax, [esp + 4]    // num
@@ -1248,7 +1248,7 @@ int FixedDiv_X86(int num, int div) {
 }
 // Divide num by div and return as 16.16 fixed point result.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 int FixedDiv1_X86(int num, int div) {
  __asm {
    mov        eax, [esp + 4]    // num