Commit d2f4413d authored by fbarchard@google.com

Remove old alpha blend, expose GetARGBBlend, fix ComputeSumSquareErrorPlane on SSE2

BUG=29
TEST=none
Review URL: https://webrtc-codereview.appspot.com/469005

git-svn-id: http://libyuv.googlecode.com/svn/trunk@234 16f28f9a-4ce2-e073-06de-1de4eb20be90
parent c757f308
Name: libyuv
URL: http://code.google.com/p/libyuv/
Version: 233
Version: 234
License: BSD
License File: LICENSE
......
......@@ -133,24 +133,19 @@ int ARGBCopy(const uint8* src_argb, int src_stride_argb,
uint8* dst_argb, int dst_stride_argb,
int width, int height);
// Alpha Blend ARGB row of pixels.
void ARGBBlendRow(const uint8* src_argb, uint8* dst_argb, int width);
typedef void (*ARGBBlendRow)(const uint8* src_argb0,
const uint8* src_argb1,
uint8* dst_argb, int width);
// Alpha Blend 2 rows of ARGB pixels and store to destination.
void ARGBBlend2Row(const uint8* src_argb0, const uint8* src_argb1,
uint8* dst_argb, int width);
// Get function to Alpha Blend ARGB pixels and store to destination.
ARGBBlendRow GetARGBBlend(uint8* dst_argb, int dst_stride_argb, int width);
// Alpha Blend ARGB.
int ARGBBlend(const uint8* src_argb, int src_stride_argb,
// Alpha Blend ARGB images and store to destination.
int ARGBBlend(const uint8* src_argb0, int src_stride_argb0,
const uint8* src_argb1, int src_stride_argb1,
uint8* dst_argb, int dst_stride_argb,
int width, int height);
// Alpha Blend 2 ARGB images and store to destination.
int ARGB2Blend(const uint8* src_argb0, int src_stride_argb0,
const uint8* src_argb1, int src_stride_argb1,
uint8* dst_argb, int dst_stride_argb,
int width, int height);
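Aside (illustration only, not part of this diff): a minimal sketch of how a caller might use the new two-source API declared above, assuming these declarations live in libyuv's planar_functions.h and that uint8/uint32 are the usual libyuv typedefs. This mirrors what the new ARGBBlend() does internally.
// Hypothetical caller: blend src_argb0 over src_argb1 into dst_argb,
// picking the best row blender once up front via GetARGBBlend.
void BlendExample(const uint8* src_argb0, int src_stride_argb0,
                  const uint8* src_argb1, int src_stride_argb1,
                  uint8* dst_argb, int dst_stride_argb,
                  int width, int height) {
  ARGBBlendRow blend_row = GetARGBBlend(dst_argb, dst_stride_argb, width);
  for (int y = 0; y < height; ++y) {
    blend_row(src_argb0, src_argb1, dst_argb, width);
    src_argb0 += src_stride_argb0;
    src_argb1 += src_stride_argb1;
    dst_argb += dst_stride_argb;
  }
}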
// Convert I422 to YUY2.
int I422ToYUY2(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
......
......@@ -20,7 +20,7 @@ extern "C" {
// Supported rotation
enum RotationMode {
kRotate0 = 0, // No rotation
kRotate90 = 90, // Rotate 90 degrees clockwise
kRotate180 = 180, // Rotate 180 degrees
kRotate270 = 270, // Rotate 270 degrees clockwise
......
......@@ -11,7 +11,7 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_
#define INCLUDE_LIBYUV_VERSION_H_
#define INCLUDE_LIBYUV_VERSION 233
#define LIBYUV_VERSION 234
#endif // INCLUDE_LIBYUV_VERSION_H_
......@@ -25,18 +25,37 @@ namespace libyuv {
extern "C" {
#endif
// hash seed of 5381 recommended.
uint32 HashDjb2(const uint8* src, uint64 count, uint32 seed) {
// Internal C version of HashDjb2 with int sized count for efficiency.
static uint32 HashDjb2_C(const uint8* src, int count, uint32 seed) {
uint32 hash = seed;
if (count > 0) {
do {
hash = hash * 33 + *src++;
} while (--count);
for (int i = 0; i < count; ++i) {
hash += (hash << 5) + src[i];
}
return hash;
}
#if defined(__ARM_NEON__) && !defined(YUV_DISABLE_ASM)
// hash seed of 5381 recommended.
uint32 HashDjb2(const uint8* src, uint64 count, uint32 seed) {
const int kBlockSize = 1 << 15; // 32768;
while (count >= static_cast<uint64>(kBlockSize)) {
seed = HashDjb2_C(src, kBlockSize, seed);
src += kBlockSize;
count -= kBlockSize;
}
int remainder = static_cast<int>(count) & ~15;
if (remainder) {
seed = HashDjb2_C(src, remainder, seed);
src += remainder;
count -= remainder;
}
remainder = static_cast<int>(count) & 15;
if (remainder) {
seed = HashDjb2_C(src, remainder, seed);
}
return seed;
}
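Aside (illustration only, not part of this diff): HashDjb2 now splits its 64-bit count into 32 KB blocks handled by the int-count helper above; a typical call simply fingerprints a buffer with the recommended seed.
// Sketch: compare a frame against a reference by djb2 hash (seed 5381).
bool FrameMatches(const uint8* frame, uint64 size_in_bytes,
                  uint32 expected_hash) {
  return HashDjb2(frame, size_in_bytes, 5381u) == expected_hash;
}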
#if !defined(YUV_DISABLE_ASM) && defined(__ARM_NEON__)
#define HAS_SUMSQUAREERROR_NEON
static uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b,
......@@ -75,9 +94,9 @@ static uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b,
return sse;
}
#elif defined(_M_IX86) && !defined(YUV_DISABLE_ASM)
#elif !defined(YUV_DISABLE_ASM) && defined(_M_IX86)
#define HAS_SUMSQUAREERROR_SSE2
__declspec(naked)
__declspec(naked) __declspec(align(16))
static uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b,
int count) {
__asm {
......@@ -94,7 +113,7 @@ static uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b,
movdqa xmm2, [eax + edx]
lea eax, [eax + 16]
sub ecx, 16
movdqa xmm3, xmm1
movdqa xmm3, xmm1 // abs trick
psubusb xmm1, xmm2
psubusb xmm2, xmm3
por xmm1, xmm2
......@@ -116,7 +135,7 @@ static uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b,
}
}
#elif defined(__x86_64__) || defined(__i386__) && !defined(YUV_DISABLE_ASM)
#elif !defined(YUV_DISABLE_ASM) && (defined(__x86_64__) || defined(__i386__))
#define HAS_SUMSQUAREERROR_SSE2
static uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b,
int count) {
......@@ -167,11 +186,9 @@ static uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b,
static uint32 SumSquareError_C(const uint8* src_a, const uint8* src_b,
int count) {
uint32 sse = 0u;
for (int x = 0; x < count; ++x) {
int diff = src_a[0] - src_b[0];
for (int i = 0; i < count; ++i) {
int diff = src_a[i] - src_b[i];
sse += static_cast<uint32>(diff * diff);
src_a += 1;
src_b += 1;
}
return sse;
}
......@@ -187,6 +204,7 @@ uint64 ComputeSumSquareError(const uint8* src_a, const uint8* src_b,
#elif defined(HAS_SUMSQUAREERROR_SSE2)
if (TestCpuFlag(kCpuHasSSE2) &&
IS_ALIGNED(src_a, 16) && IS_ALIGNED(src_b, 16)) {
// Note only used for multiples of 16 so count is not checked.
SumSquareError = SumSquareError_SSE2;
}
#endif
......@@ -225,8 +243,9 @@ uint64 ComputeSumSquareErrorPlane(const uint8* src_a, int stride_a,
SumSquareError = SumSquareError_NEON;
}
#elif defined(HAS_SUMSQUAREERROR_SSE2)
if (TestCpuFlag(kCpuHasSSE2) &&
IS_ALIGNED(src_a, 16) && IS_ALIGNED(src_b, 16)) {
if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 16) &&
IS_ALIGNED(src_a, 16) && IS_ALIGNED(stride_a, 16) &&
IS_ALIGNED(src_b, 16) && IS_ALIGNED(stride_b, 16)) {
SumSquareError = SumSquareError_SSE2;
}
#endif
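Aside (illustration only, not an API at this revision): the plane SSE that this change fixes for SSE2 is typically turned into a PSNR figure with the usual formula; the sketch below assumes the parameter order shown in the hunk above.
#include <math.h>

// Sketch: PSNR of two planes from ComputeSumSquareErrorPlane.
double PlanePsnr(const uint8* src_a, int stride_a,
                 const uint8* src_b, int stride_b,
                 int width, int height) {
  uint64 sse = ComputeSumSquareErrorPlane(src_a, stride_a,
                                          src_b, stride_b, width, height);
  if (sse == 0) {
    return 128.0;  // planes are identical; clamp to a large finite value
  }
  double mse = static_cast<double>(sse) /
               (static_cast<double>(width) * static_cast<double>(height));
  return 10.0 * log10(255.0 * 255.0 / mse);
}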
......
......@@ -61,9 +61,9 @@ int I420Copy(const uint8* src_y, int src_stride_y,
return 0;
}
#if defined(_M_IX86) && !defined(YUV_DISABLE_ASM)
#if !defined(YUV_DISABLE_ASM) && defined(_M_IX86)
#define HAS_HALFROW_SSE2
__declspec(naked)
__declspec(naked) __declspec(align(16))
static void HalfRow_SSE2(const uint8* src_uv, int src_uv_stride,
uint8* dst_uv, int pix) {
__asm {
......@@ -86,7 +86,7 @@ static void HalfRow_SSE2(const uint8* src_uv, int src_uv_stride,
}
}
#elif defined(__x86_64__) || defined(__i386__) && !defined(YUV_DISABLE_ASM)
#elif !defined(YUV_DISABLE_ASM) && (defined(__x86_64__) || defined(__i386__))
#define HAS_HALFROW_SSE2
static void HalfRow_SSE2(const uint8* src_uv, int src_uv_stride,
uint8* dst_uv, int pix) {
......@@ -179,12 +179,13 @@ int I422ToI420(const uint8* src_y, int src_stride_y,
// Blends 32x2 pixels to 16x1
// source in scale.cc
#if defined(__ARM_NEON__) && !defined(YUV_DISABLE_ASM)
#if !defined(YUV_DISABLE_ASM) && defined(__ARM_NEON__)
#define HAS_SCALEROWDOWN2_NEON
void ScaleRowDown2Int_NEON(const uint8* src_ptr, int src_stride,
uint8* dst, int dst_width);
#elif defined(_M_IX86) || defined(__x86_64__) || defined(__i386__) && \
!defined(YUV_DISABLE_ASM)
#elif !defined(YUV_DISABLE_ASM) && \
(defined(_M_IX86) || defined(__x86_64__) || defined(__i386__))
void ScaleRowDown2Int_SSE2(const uint8* src_ptr, int src_stride,
uint8* dst_ptr, int dst_width);
#endif
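Aside (reference only, not part of this diff): the ScaleRowDown2Int kernels declared here compute a rounded 2x2 box average; a plain C equivalent (mirroring ScaleRowDown2Int_C in scale.cc) looks like this.
static void ScaleRowDown2Int_Ref(const uint8* src_ptr, int src_stride,
                                 uint8* dst, int dst_width) {
  const uint8* src_row1 = src_ptr + src_stride;  // second source row
  for (int x = 0; x < dst_width; ++x) {
    dst[x] = (src_ptr[2 * x] + src_ptr[2 * x + 1] +
              src_row1[2 * x] + src_row1[2 * x + 1] + 2) >> 2;  // rounded average
  }
}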
......@@ -450,9 +451,9 @@ int M420ToI420(const uint8* src_m420, int src_stride_m420,
width, height);
}
#if defined(_M_IX86) && !defined(YUV_DISABLE_ASM)
#if !defined(YUV_DISABLE_ASM) && defined(_M_IX86)
#define HAS_SPLITYUY2_SSE2
__declspec(naked)
__declspec(naked) __declspec(align(16))
static void SplitYUY2_SSE2(const uint8* src_yuy2,
uint8* dst_y, uint8* dst_u, uint8* dst_v, int pix) {
__asm {
......@@ -498,7 +499,7 @@ static void SplitYUY2_SSE2(const uint8* src_yuy2,
}
}
#elif defined(__x86_64__) || defined(__i386__) && !defined(YUV_DISABLE_ASM)
#elif !defined(YUV_DISABLE_ASM) && (defined(__x86_64__) || defined(__i386__))
#define HAS_SPLITYUY2_SSE2
static void SplitYUY2_SSE2(const uint8* src_yuy2, uint8* dst_y,
uint8* dst_u, uint8* dst_v, int pix) {
......
......@@ -205,9 +205,9 @@ int I400Copy(const uint8* src_y, int src_stride_y,
// UYVY - Macro-pixel = 2 image pixels
// U0Y0V0Y1
#if defined(_M_IX86) && !defined(YUV_DISABLE_ASM)
#if !defined(YUV_DISABLE_ASM) && defined(_M_IX86)
#define HAS_I42XTOYUY2ROW_SSE2
__declspec(naked)
__declspec(naked) __declspec(align(16))
static void I42xToYUY2Row_SSE2(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
......@@ -246,7 +246,7 @@ static void I42xToYUY2Row_SSE2(const uint8* src_y,
}
#define HAS_I42XTOUYVYROW_SSE2
__declspec(naked)
__declspec(naked) __declspec(align(16))
static void I42xToUYVYRow_SSE2(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
......@@ -283,7 +283,7 @@ static void I42xToUYVYRow_SSE2(const uint8* src_y,
ret
}
}
#elif defined(__x86_64__) || defined(__i386__) && !defined(YUV_DISABLE_ASM)
#elif !defined(YUV_DISABLE_ASM) && (defined(__x86_64__) || defined(__i386__))
#define HAS_I42XTOYUY2ROW_SSE2
static void I42xToYUY2Row_SSE2(const uint8* src_y,
const uint8* src_u,
......
......@@ -24,9 +24,9 @@ extern "C" {
// and vst would select which 2 components to write. The low level would need
// to be ARGBToBG, ARGBToGB, ARGBToRG, ARGBToGR
#if defined(_M_IX86) && !defined(YUV_DISABLE_ASM)
#if !defined(YUV_DISABLE_ASM) && defined(_M_IX86)
#define HAS_ARGBTOBAYERROW_SSSE3
__declspec(naked)
__declspec(naked) __declspec(align(16))
static void ARGBToBayerRow_SSSE3(const uint8* src_argb,
uint8* dst_bayer, uint32 selector, int pix) {
__asm {
......@@ -36,6 +36,7 @@ static void ARGBToBayerRow_SSSE3(const uint8* src_argb,
mov ecx, [esp + 16] // pix
pshufd xmm5, xmm5, 0
align 16
wloop:
movdqa xmm0, [eax]
lea eax, [eax + 16]
......@@ -48,7 +49,7 @@ static void ARGBToBayerRow_SSSE3(const uint8* src_argb,
}
}
#elif defined(__x86_64__) || defined(__i386__) && !defined(YUV_DISABLE_ASM)
#elif !defined(YUV_DISABLE_ASM) && (defined(__x86_64__) || defined(__i386__))
#define HAS_ARGBTOBAYERROW_SSSE3
static void ARGBToBayerRow_SSSE3(const uint8* src_argb, uint8* dst_bayer,
......
......@@ -137,87 +137,38 @@ int ARGBCopy(const uint8* src_argb, int src_stride_argb,
return 0;
}
// Alpha Blend ARGB
void ARGBBlendRow(const uint8* src_argb, uint8* dst_argb, int width) {
#if defined(HAS_ARGBBLENDROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
ARGBBlendRow_SSSE3(src_argb, dst_argb, width);
return;
}
#endif
#if defined(HAS_ARGBBLENDROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) {
ARGBBlendRow_SSE2(src_argb, dst_argb, width);
return;
}
#endif
ARGBBlendRow_C(src_argb, dst_argb, width);
}
// Alpha Blend 2 rows of ARGB pixels and store to destination.
void ARGBBlend2Row(const uint8* src_argb0, const uint8* src_argb1,
uint8* dst_argb, int width) {
#if defined(HAS_ARGBBLENDROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
ARGBBlend2Row_SSSE3(src_argb0, src_argb1, dst_argb, width);
return;
}
#endif
#if defined(HAS_ARGBBLENDROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) {
ARGBBlend2Row_SSE2(src_argb0, src_argb1, dst_argb, width);
return;
}
#endif
ARGBBlend2Row_C(src_argb0, src_argb1, dst_argb, width);
}
// Alpha Blend ARGB
// TODO(fbarchard): Call 3 pointer low levels to reduce code size.
int ARGBBlend(const uint8* src_argb, int src_stride_argb,
uint8* dst_argb, int dst_stride_argb,
int width, int height) {
if (!src_argb || !dst_argb || width <= 0 || height == 0) {
return -1;
}
// Negative height means invert the image.
if (height < 0) {
height = -height;
src_argb = src_argb + (height - 1) * src_stride_argb;
src_stride_argb = -src_stride_argb;
}
void (*ARGBBlendRow)(const uint8* src_argb, uint8* dst_argb, int width) =
ARGBBlendRow_C;
// Get a blender that is optimized for the CPU, alignment and pixel count.
// As there are 6 blenders to choose from, the caller should try to use
// the same blend function for all pixels if possible.
ARGBBlendRow GetARGBBlend(uint8* dst_argb, int dst_stride_argb, int width) {
void (*ARGBBlendRow)(const uint8* src_argb, const uint8* src_argb1,
uint8* dst_argb, int width) = ARGBBlendRow_C;
#if defined(HAS_ARGBBLENDROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) {
ARGBBlendRow = ARGBBlendRow_SSE2;
if (IS_ALIGNED(width, 4) &&
IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
ARGBBlendRow = ARGBBlendRow_Aligned_SSE2;
ARGBBlendRow = ARGBBlendRow1_SSE2;
if (width >= 4) {
ARGBBlendRow = ARGBBlendRow_Any_SSE2;
if (IS_ALIGNED(width, 4) &&
IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
ARGBBlendRow = ARGBBlendRow_Aligned_SSE2;
}
}
}
#endif
#if defined(HAS_ARGBBLENDROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
ARGBBlendRow = ARGBBlendRow_SSSE3;
if (TestCpuFlag(kCpuHasSSSE3) && width >= 4) {
ARGBBlendRow = ARGBBlendRow_Any_SSSE3;
if (IS_ALIGNED(width, 4) &&
IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
ARGBBlendRow = ARGBBlendRow_Aligned_SSSE3;
}
}
#endif
for (int y = 0; y < height; ++y) {
ARGBBlendRow(src_argb, dst_argb, width);
src_argb += src_stride_argb;
dst_argb += dst_stride_argb;
}
return 0;
return ARGBBlendRow;
}
// Alpha Blend 2 ARGB images and store to destination.
int ARGB2Blend(const uint8* src_argb0, int src_stride_argb0,
int ARGBBlend(const uint8* src_argb0, int src_stride_argb0,
const uint8* src_argb1, int src_stride_argb1,
uint8* dst_argb, int dst_stride_argb,
int width, int height) {
......@@ -230,30 +181,12 @@ int ARGB2Blend(const uint8* src_argb0, int src_stride_argb0,
dst_argb = dst_argb + (height - 1) * dst_stride_argb;
dst_stride_argb = -dst_stride_argb;
}
void (*ARGBBlend2Row)(const uint8* src_argb, const uint8* src_argb1,
uint8* dst_argb, int width) = ARGBBlend2Row_C;
#if defined(HAS_ARGBBLENDROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) {
ARGBBlend2Row = ARGBBlend2Row_SSE2;
if (IS_ALIGNED(width, 4) &&
IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
ARGBBlend2Row = ARGBBlend2Row_Aligned_SSE2;
}
}
#endif
#if defined(HAS_ARGBBLENDROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
ARGBBlend2Row = ARGBBlend2Row_SSSE3;
if (IS_ALIGNED(width, 4) &&
IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
ARGBBlend2Row = ARGBBlend2Row_Aligned_SSSE3;
}
}
#endif
void (*ARGBBlendRow)(const uint8* src_argb, const uint8* src_argb1,
uint8* dst_argb, int width) =
GetARGBBlend(dst_argb, dst_stride_argb, width);
for (int y = 0; y < height; ++y) {
ARGBBlend2Row(src_argb0, src_argb1, dst_argb, width);
ARGBBlendRow(src_argb0, src_argb1, dst_argb, width);
src_argb0 += src_stride_argb0;
src_argb1 += src_stride_argb1;
dst_argb += dst_stride_argb;
......@@ -725,7 +658,7 @@ int NV12ToRGB565(const uint8* src_y, int src_stride_y,
// SetRow8 writes 'count' bytes using a 32 bit value repeated
// SetRow32 writes 'count' words using a 32 bit value repeated
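Aside (reference only, not part of this diff): a plain C version of the SetRows32 behaviour described above, assuming the same parameter order as the X86/NEON kernels below (width in 32-bit words, stride in bytes).
static void SetRows32_Ref(uint8* dst, uint32 v32, int width,
                          int dst_stride, int height) {
  for (int y = 0; y < height; ++y) {
    uint32* d = reinterpret_cast<uint32*>(dst);
    for (int x = 0; x < width; ++x) {
      d[x] = v32;  // repeat the 32-bit value 'width' times per row
    }
    dst += dst_stride;  // advance by the byte stride
  }
}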
#if defined(__ARM_NEON__) && !defined(YUV_DISABLE_ASM)
#if !defined(YUV_DISABLE_ASM) && defined(__ARM_NEON__)
#define HAS_SETROW_NEON
static void SetRow8_NEON(uint8* dst, uint32 v32, int count) {
asm volatile (
......@@ -749,9 +682,9 @@ static void SetRows32_NEON(uint8* dst, uint32 v32, int width,
}
}
#elif defined(_M_IX86) && !defined(YUV_DISABLE_ASM)
#elif !defined(YUV_DISABLE_ASM) && defined(_M_IX86)
#define HAS_SETROW_X86
__declspec(naked)
__declspec(naked) __declspec(align(16))
static void SetRow8_X86(uint8* dst, uint32 v32, int count) {
__asm {
mov edx, edi
......@@ -765,7 +698,7 @@ static void SetRow8_X86(uint8* dst, uint32 v32, int count) {
}
}
__declspec(naked)
__declspec(naked) __declspec(align(16))
static void SetRows32_X86(uint8* dst, uint32 v32, int width,
int dst_stride, int height) {
__asm {
......@@ -793,7 +726,7 @@ static void SetRows32_X86(uint8* dst, uint32 v32, int width,
}
}
#elif defined(__x86_64__) || defined(__i386__) && !defined(YUV_DISABLE_ASM)
#elif !defined(YUV_DISABLE_ASM) && (defined(__x86_64__) || defined(__i386__))
#define HAS_SETROW_X86
static void SetRow8_X86(uint8* dst, uint32 v32, int width) {
size_t width_tmp = static_cast<size_t>(width);
......@@ -903,6 +836,7 @@ int I420Rect(uint8* dst_y, int dst_stride_y,
return 0;
}
// TODO(fbarchard): Add TestCpuFlag(kCpuHasX86) to allow C code to be tested.
// Draw a rectangle into ARGB
int ARGBRect(uint8* dst_argb, int dst_stride_argb,
int dst_x, int dst_y,
......@@ -916,12 +850,14 @@ int ARGBRect(uint8* dst_argb, int dst_stride_argb,
uint8* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4;
#if defined(HAS_SETROW_X86)
SetRows32_X86(dst, value, width, dst_stride_argb, height);
#elif defined(HAS_SETROW_NEON)
#else
#if defined(HAS_SETROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 16) &&
IS_ALIGNED(dst, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
SetRows32_NEON(dst, value, width, dst_stride_argb, height);
return 0;
}
#endif
SetRows32_C(dst, value, width, dst_stride_argb, height);
#endif
return 0;
......
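Aside (usage sketch, not part of this diff; the trailing ARGBRect parameters are assumed from the call sites above to be width, height and a 32-bit ARGB value): filling a rectangle of an ARGB frame with a solid colour.
void FillRedBox(uint8* frame_argb, int stride_argb) {
  const uint32 kOpaqueRed = 0xFFFF0000;  // A=FF R=FF G=00 B=00
  ARGBRect(frame_argb, stride_argb,
           16, 8,         // dst_x, dst_y
           64, 32,        // width, height of the rectangle
           kOpaqueRed);   // 32-bit ARGB value repeated per pixel
}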
......@@ -21,8 +21,8 @@ namespace libyuv {
extern "C" {
#endif
#if (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__)) && \
!defined(YUV_DISABLE_ASM)
#if !defined(YUV_DISABLE_ASM) && \
(defined(_M_IX86) || defined(__x86_64__) || defined(__i386__))
#if defined(__APPLE__) && defined(__i386__)
#define DECLARE_FUNCTION(name) \
".text \n" \
......@@ -59,9 +59,9 @@ void TransposeUVWx8_NEON(const uint8* src, int src_stride,
int width);
#endif
#if defined(_M_IX86) && !defined(YUV_DISABLE_ASM)
#if !defined(YUV_DISABLE_ASM) && defined(_M_IX86)
#define HAS_TRANSPOSE_WX8_SSSE3
__declspec(naked)
__declspec(naked) __declspec(align(16))
static void TransposeWx8_SSSE3(const uint8* src, int src_stride,
uint8* dst, int dst_stride, int width) {
__asm {
......@@ -153,7 +153,7 @@ static void TransposeWx8_SSSE3(const uint8* src, int src_stride,
}
#define HAS_TRANSPOSE_UVWX8_SSE2
__declspec(naked)
__declspec(naked) __declspec(align(16))
static void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
uint8* dst_a, int dst_stride_a,
uint8* dst_b, int dst_stride_b,
......@@ -281,7 +281,7 @@ static void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
ret
}
}
#elif defined(__i386__) || defined(__x86_64__) && !defined(YUV_DISABLE_ASM)
#elif !defined(YUV_DISABLE_ASM) && (defined(__i386__) || defined(__x86_64__))
#define HAS_TRANSPOSE_WX8_SSSE3
static void TransposeWx8_SSSE3(const uint8* src, int src_stride,
uint8* dst, int dst_stride, int width) {
......@@ -369,7 +369,7 @@ static void TransposeWx8_SSSE3(const uint8* src, int src_stride,
);
}
#if defined (__i386__)
#if !defined(YUV_DISABLE_ASM) && defined (__i386__)
#define HAS_TRANSPOSE_UVWX8_SSE2
extern "C" void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
uint8* dst_a, int dst_stride_a,
......@@ -491,7 +491,7 @@ extern "C" void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
"pop %ebx \n"
"ret \n"
);
#elif defined(__x86_64__)
#elif !defined(YUV_DISABLE_ASM) && defined(__x86_64__)
// 64 bit version has enough registers to do 16x8 to 8x16 at a time.
#define HAS_TRANSPOSE_WX8_FAST_SSSE3
static void TransposeWx8_FAST_SSSE3(const uint8* src, int src_stride,
......
......@@ -17,7 +17,7 @@ namespace libyuv {
extern "C" {
#endif
#if defined(__ARM_NEON__) && !defined(YUV_DISABLE_ASM)
#if !defined(YUV_DISABLE_ASM) && defined(__ARM_NEON__)
static const uvec8 vtbl_4x4_transpose =
{ 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15 };
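Aside (reference only, not part of this diff): the TransposeWx8 kernels in this file implement the operation below, shown as plain C (the role TransposeWx8_C plays in rotate.cc): read an 8-row by width-column tile and write it transposed, so dst[x][y] = src[y][x].
static void TransposeWx8_Ref(const uint8* src, int src_stride,
                             uint8* dst, int dst_stride, int width) {
  for (int x = 0; x < width; ++x) {
    for (int y = 0; y < 8; ++y) {
      dst[x * dst_stride + y] = src[y * src_stride + x];
    }
  }
}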
......
......@@ -18,6 +18,7 @@ namespace libyuv {
extern "C" {
#endif
// TODO(fbarchard): Remove kMaxStride
#define kMaxStride (2560 * 4)
#define IS_ALIGNED(p, a) (!((uintptr_t)(p) & ((a) - 1)))
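Aside (illustration only): IS_ALIGNED relies on the alignment being a power of two, so the low-bit mask is zero exactly when the pointer (or count) is a multiple of it.
#include <assert.h>
#include <stdint.h>

static void IsAlignedExamples() {
  assert(IS_ALIGNED(reinterpret_cast<void*>(0x1000), 16));   // 16-byte aligned
  assert(!IS_ALIGNED(reinterpret_cast<void*>(0x1004), 16));  // off by 4 bytes
  assert(IS_ALIGNED(64, 16));   // widths are tested the same way
  assert(!IS_ALIGNED(20, 16));
}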
......@@ -26,8 +27,9 @@ extern "C" {
#endif
// The following are available on all x86 platforms
#if (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__)) && \
!defined(YUV_DISABLE_ASM)
#if !defined(YUV_DISABLE_ASM) && \
(defined(_M_IX86) || defined(__x86_64__) || defined(__i386__))
#define HAS_ABGRTOARGBROW_SSSE3
#define HAS_BGRATOARGBROW_SSSE3
#define HAS_RGB24TOARGBROW_SSSE3
......@@ -66,7 +68,7 @@ extern "C" {
#endif
// The following are available on Neon platforms
#if defined(__ARM_NEON__) && !defined(YUV_DISABLE_ASM)
#if !defined(YUV_DISABLE_ASM) && defined(__ARM_NEON__)
#define HAS_MIRRORROW_NEON
#define HAS_MIRRORROWUV_NEON
#define HAS_SPLITUV_NEON
......@@ -78,7 +80,7 @@ extern "C" {
// The following are only available on Win32
// TODO(fbarchard): Port to GCC
#if defined(_M_IX86) && !defined(YUV_DISABLE_ASM)
#if !defined(YUV_DISABLE_ASM) && defined(_M_IX86)
#define HAS_ARGBBLENDROW_SSSE3
#endif
......@@ -265,25 +267,18 @@ void YToARGBRow_SSE2(const uint8* y_buf,
int width);
// ARGB preattenuated alpha blend.
void ARGBBlendRow_Aligned_SSSE3(const uint8* src_argb, uint8* dst_argb,
int width);
void ARGBBlendRow_Aligned_SSE2(const uint8* src_argb, uint8* dst_argb,
int width);
void ARGBBlendRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width);
void ARGBBlendRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width);
void ARGBBlendRow_C(const uint8* src_argb, uint8* dst_argb, int width);
// ARGB preattenuated alpha blend with 2 sources and a destination.
void ARGBBlend2Row_Aligned_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
uint8* dst_argb, int width);
void ARGBBlend2Row_Aligned_SSE2(const uint8* src_argb0, const uint8* src_argb1,
void ARGBBlendRow_Aligned_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
uint8* dst_argb, int width);
void ARGBBlend2Row_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
uint8* dst_argb, int width);
void ARGBBlend2Row_SSE2(const uint8* src_argb0, const uint8* src_argb1,
void ARGBBlendRow_Aligned_SSE2(const uint8* src_argb0, const uint8* src_argb1,
uint8* dst_argb, int width);
void ARGBBlendRow1_SSE2(const uint8* src_argb0, const uint8* src_argb1,
uint8* dst_argb, int width);
void ARGBBlend2Row_C(const uint8* src_argb0, const uint8* src_argb1,
uint8* dst_argb, int width);
void ARGBBlendRow_Any_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
uint8* dst_argb, int width);
void ARGBBlendRow_Any_SSE2(const uint8* src_argb0, const uint8* src_argb1,
uint8* dst_argb, int width);
void ARGBBlendRow_C(const uint8* src_argb0, const uint8* src_argb1,
uint8* dst_argb, int width);
// 'Any' functions handle any size and alignment.
void I420ToARGBRow_Any_SSSE3(const uint8* y_buf,
......
......@@ -454,73 +454,10 @@ void UYVYToYRow_C(const uint8* src_yuy2, uint8* dst_y, int width) {
}
#define BLENDER(f, b, a) (((256 - a) * b) >> 8) + f
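Aside (worked example, not part of this diff): BLENDER assumes the foreground channel f is already premultiplied by alpha, so the result is f + b*(256 - a)/256, with >> 8 standing in for a divide by 255.
#include <assert.h>

static unsigned BlendChannel(unsigned f, unsigned b, unsigned a) {
  return (((256 - a) * b) >> 8) + f;  // same arithmetic as BLENDER(f, b, a)
}

static void BlenderExamples() {
  assert(BlendChannel(200, 90, 255) == 200);  // opaque: background ignored
  assert(BlendChannel(0, 90, 0) == 90);       // transparent: background kept
  assert(BlendChannel(100, 80, 128) == 140);  // half alpha: 100 + 80*128/256
}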
void ARGBBlendRow_C(const uint8* src_argb, uint8* dst_argb, int width) {
for (int x = 0; x < width - 1; x += 2) {
uint32 a = src_argb[3];
if (a) {
if (a < 255) {
const uint32 fb = src_argb[0];
const uint32 fg = src_argb[1];
const uint32 fr = src_argb[2];
const uint32 bb = dst_argb[0];
const uint32 bg = dst_argb[1];
const uint32 br = dst_argb[2];
dst_argb[0] = BLENDER(fb, bb, a);
dst_argb[1] = BLENDER(fg, bg, a);
dst_argb[2] = BLENDER(fr, br, a);
dst_argb[3] = 255u;
} else {
*reinterpret_cast<uint32*>(dst_argb) =
*reinterpret_cast<const uint32*>(src_argb);
}
}
a = src_argb[4 + 3];
if (a) {
if (a < 255) {
const uint32 fb = src_argb[4 + 0];
const uint32 fg = src_argb[4 + 1];
const uint32 fr = src_argb[4 + 2];
const uint32 bb = dst_argb[4 + 0];
const uint32 bg = dst_argb[4 + 1];
const uint32 br = dst_argb[4 + 2];
dst_argb[4 + 0] = BLENDER(fb, bb, a);
dst_argb[4 + 1] = BLENDER(fg, bg, a);
dst_argb[4 + 2] = BLENDER(fr, br, a);
dst_argb[4 + 3] = 255u;
} else {
*reinterpret_cast<uint32*>(dst_argb + 4) =
*reinterpret_cast<const uint32*>(src_argb + 4);
}
}
src_argb += 8;
dst_argb += 8;
}
if (width & 1) {
const uint32 a = src_argb[3];
if (a) {
if (a < 255) {
const uint32 fb = src_argb[0];
const uint32 fg = src_argb[1];
const uint32 fr = src_argb[2];
const uint32 bb = dst_argb[0];
const uint32 bg = dst_argb[1];
const uint32 br = dst_argb[2];
dst_argb[0] = BLENDER(fb, bb, a);
dst_argb[1] = BLENDER(fg, bg, a);
dst_argb[2] = BLENDER(fr, br, a);
dst_argb[3] = 255u;
} else {
*reinterpret_cast<uint32*>(dst_argb) =
*reinterpret_cast<const uint32*>(src_argb);
}
}
}
}
// Blend src_argb0 over src_argb1 and store to dst_argb.
// dst_argb may be src_argb0 or src_argb1.
void ARGBBlend2Row_C(const uint8* src_argb0, const uint8* src_argb1,
void ARGBBlendRow_C(const uint8* src_argb0, const uint8* src_argb1,
uint8* dst_argb, int width) {
for (int x = 0; x < width - 1; x += 2) {
uint32 a = src_argb0[3];
......
......@@ -16,7 +16,7 @@ extern "C" {
#endif
// This module is for GCC Neon
#if defined(__ARM_NEON__) && !defined(YUV_DISABLE_ASM)
#if !defined(YUV_DISABLE_ASM) && defined(__ARM_NEON__)
#define YUVTORGB \
"vld1.u8 {d0}, [%0]! \n" \
......
......@@ -18,7 +18,7 @@ extern "C" {
#endif
// This module is for GCC x86 and x64
#if (defined(__x86_64__) || defined(__i386__)) && !defined(YUV_DISABLE_ASM)
#if !defined(YUV_DISABLE_ASM) && (defined(__x86_64__) || defined(__i386__))
// GCC 4.2 on OSX has link error when passing static or const to inline.
// TODO(fbarchard): Use static const when gcc 4.2 support is dropped.
......@@ -2031,162 +2031,7 @@ void UYVYToUVRow_Unaligned_SSE2(const uint8* src_uyvy, int stride_uyvy,
#ifdef HAS_ARGBBLENDROW_SSE2
// Blend 8 pixels at a time
// Destination aligned to 16 bytes, multiple of 4 pixels
void ARGBBlendRow_Aligned_SSE2(const uint8* src_argb, uint8* dst_argb,
int width) {
asm volatile (
"pcmpeqb %%xmm7,%%xmm7 \n"
"psrlw $0xf,%%xmm7 \n"
"pcmpeqb %%xmm6,%%xmm6 \n"
"psrlw $0x8,%%xmm6 \n"
"pcmpeqb %%xmm5,%%xmm5 \n"
"psllw $0x8,%%xmm5 \n"
"pcmpeqb %%xmm4,%%xmm4 \n"
"pslld $0x18,%%xmm4 \n"
// 8 pixel loop
"1: \n"
"movdqu (%0),%%xmm3 \n" // first 4 pixels
"movdqa %%xmm3,%%xmm0 \n"
"pxor %%xmm4,%%xmm3 \n"
"movdqa (%1),%%xmm2 \n"
"psrlw $0x8,%%xmm3 \n"
"pshufhw $0xf5,%%xmm3,%%xmm3 \n"
"pshuflw $0xf5,%%xmm3,%%xmm3 \n"
"pand %%xmm6,%%xmm2 \n"
"paddw %%xmm7,%%xmm3 \n"
"pmullw %%xmm3,%%xmm2 \n"
"movdqa (%1),%%xmm1 \n"
"psrlw $0x8,%%xmm1 \n"
"por %%xmm4,%%xmm0 \n"
"pmullw %%xmm3,%%xmm1 \n"
"movdqu 0x10(%0),%%xmm3 \n"
"lea 0x20(%0),%0 \n"
"psrlw $0x8,%%xmm2 \n"
"paddusb %%xmm2,%%xmm0 \n"
"pand %%xmm5,%%xmm1 \n"
"paddusb %%xmm1,%%xmm0 \n"
"sub $0x4,%2 \n"
"movdqa %%xmm0,(%1) \n"
"jle 9f \n"
"movdqa %%xmm3,%%xmm0 \n" // next 4 pixels
"pxor %%xmm4,%%xmm3 \n"
"movdqa 0x10(%1),%%xmm2 \n"
"psrlw $0x8,%%xmm3 \n"
"pshufhw $0xf5,%%xmm3,%%xmm3 \n"
"pshuflw $0xf5,%%xmm3,%%xmm3 \n"
"pand %%xmm6,%%xmm2 \n"
"paddw %%xmm7,%%xmm3 \n"
"pmullw %%xmm3,%%xmm2 \n"
"movdqa 0x10(%1),%%xmm1 \n"
"psrlw $0x8,%%xmm1 \n"
"por %%xmm4,%%xmm0 \n"
"pmullw %%xmm3,%%xmm1 \n"
"psrlw $0x8,%%xmm2 \n"
"paddusb %%xmm2,%%xmm0 \n"
"pand %%xmm5,%%xmm1 \n"
"paddusb %%xmm1,%%xmm0 \n"
"sub $0x4,%2 \n"
"movdqa %%xmm0,0x10(%1) \n"
"lea 0x20(%1),%1 \n"
"jg 1b \n"
"9: \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
:
: "memory", "cc"
#if defined(__SSE2__)
, "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
#endif
);
}
// Blend 1 pixel at a time, unaligned
void ARGBBlendRow1_SSE2(const uint8* src_argb, uint8* dst_argb, int width) {
asm volatile (
"pcmpeqb %%xmm7,%%xmm7 \n"
"psrlw $0xf,%%xmm7 \n"
"pcmpeqb %%xmm6,%%xmm6 \n"
"psrlw $0x8,%%xmm6 \n"
"pcmpeqb %%xmm5,%%xmm5 \n"
"psllw $0x8,%%xmm5 \n"
"pcmpeqb %%xmm4,%%xmm4 \n"
"pslld $0x18,%%xmm4 \n"
// 1 pixel loop
"1: \n"
"movd (%0),%%xmm3 \n"
"lea 0x4(%0),%0 \n"
"movdqa %%xmm3,%%xmm0 \n"
"pxor %%xmm4,%%xmm3 \n"
"movd (%1),%%xmm2 \n"
"psrlw $0x8,%%xmm3 \n"
"pshufhw $0xf5,%%xmm3,%%xmm3 \n"
"pshuflw $0xf5,%%xmm3,%%xmm3 \n"
"pand %%xmm6,%%xmm2 \n"
"paddw %%xmm7,%%xmm3 \n"
"pmullw %%xmm3,%%xmm2 \n"
"movd (%1),%%xmm1 \n"
"psrlw $0x8,%%xmm1 \n"
"por %%xmm4,%%xmm0 \n"
"pmullw %%xmm3,%%xmm1 \n"
"psrlw $0x8,%%xmm2 \n"
"paddusb %%xmm2,%%xmm0 \n"
"pand %%xmm5,%%xmm1 \n"
"paddusb %%xmm1,%%xmm0 \n"
"sub $0x1,%2 \n"
"movd %%xmm0,(%1) \n"
"lea 0x4(%1),%1 \n"
"jg 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
:
: "memory", "cc"
#if defined(__SSE2__)
, "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
#endif
);
}
void ARGBBlendRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) {
// Do 1 to 3 pixels to get destination aligned.
if ((uintptr_t)(dst_argb) & 15) {
int count = width;
if (count > 4 && ((intptr_t)(dst_argb) & 3) == 0) {
count = (-(intptr_t)(dst_argb) >> 2) & 3;
}
ARGBBlendRow1_SSE2(src_argb, dst_argb, count);
src_argb += count * 4;
dst_argb += count * 4;
width -= count;
}
// Do multiple of 4 pixels
if (width & ~3) {
ARGBBlendRow_Aligned_SSE2(src_argb, dst_argb, width & ~3);
}
// Do remaining 1 to 3 pixels
if (width & 3) {
src_argb += (width & ~3) * 4;
dst_argb += (width & ~3) * 4;
width &= 3;
ARGBBlendRow1_SSE2(src_argb, dst_argb, width);
}
}
#endif // HAS_ARGBBLENDROW_SSE2
#ifdef HAS_ARGBBLENDROW_SSE2
// Blend 8 pixels at a time
// Destination aligned to 16 bytes, multiple of 4 pixels
void ARGBBlend2Row_Aligned_SSE2(const uint8* src_argb0, const uint8* src_argb1,
void ARGBBlendRow_Aligned_SSE2(const uint8* src_argb0, const uint8* src_argb1,
uint8* dst_argb, int width) {
asm volatile (
"pcmpeqb %%xmm7,%%xmm7 \n"
......@@ -2259,7 +2104,7 @@ void ARGBBlend2Row_Aligned_SSE2(const uint8* src_argb0, const uint8* src_argb1,
}
// Blend 1 pixel at a time, unaligned
void ARGBBlend2Row1_SSE2(const uint8* src_argb0, const uint8* src_argb1,
void ARGBBlendRow1_SSE2(const uint8* src_argb0, const uint8* src_argb1,
uint8* dst_argb, int width) {
asm volatile (
"pcmpeqb %%xmm7,%%xmm7 \n"
......@@ -2309,15 +2154,15 @@ void ARGBBlend2Row1_SSE2(const uint8* src_argb0, const uint8* src_argb1,
);
}
void ARGBBlend2Row_SSE2(const uint8* src_argb0, const uint8* src_argb1,
uint8* dst_argb, int width) {
void ARGBBlendRow_Any_SSE2(const uint8* src_argb0, const uint8* src_argb1,
uint8* dst_argb, int width) {
// Do 1 to 3 pixels to get destination aligned.
if ((uintptr_t)(dst_argb) & 15) {
int count = width;
if (count > 4 && ((intptr_t)(dst_argb) & 3) == 0) {
count = (-(intptr_t)(dst_argb) >> 2) & 3;
}
ARGBBlend2Row1_SSE2(src_argb0, src_argb1, dst_argb, count);
ARGBBlendRow1_SSE2(src_argb0, src_argb1, dst_argb, count);
src_argb0 += count * 4;
src_argb1 += count * 4;
dst_argb += count * 4;
......@@ -2325,7 +2170,7 @@ void ARGBBlend2Row_SSE2(const uint8* src_argb0, const uint8* src_argb1,
}
// Do multiple of 4 pixels
if (width & ~3) {
ARGBBlend2Row_Aligned_SSE2(src_argb0, src_argb1, dst_argb, width & ~3);
ARGBBlendRow_Aligned_SSE2(src_argb0, src_argb1, dst_argb, width & ~3);
}
// Do remaining 1 to 3 pixels
if (width & 3) {
......@@ -2333,19 +2178,11 @@ void ARGBBlend2Row_SSE2(const uint8* src_argb0, const uint8* src_argb1,
src_argb1 += (width & ~3) * 4;
dst_argb += (width & ~3) * 4;
width &= 3;
ARGBBlend2Row1_SSE2(src_argb0, src_argb1, dst_argb, width);
ARGBBlendRow1_SSE2(src_argb0, src_argb1, dst_argb, width);
}
}
#endif // HAS_ARGBBLENDROW_SSE2
#endif // defined(__x86_64__) || defined(__i386__)
#ifdef __cplusplus
......
......@@ -55,7 +55,7 @@ void SetUseReferenceImpl(bool use) {
*
*/
#if defined(__ARM_NEON__) && !defined(YUV_DISABLE_ASM)
#if !defined(YUV_DISABLE_ASM) && defined(__ARM_NEON__)
#define HAS_SCALEROWDOWN2_NEON
void ScaleRowDown2_NEON(const uint8* src_ptr, int /* src_stride */,
uint8* dst, int dst_width) {
......@@ -566,12 +566,13 @@ static void ScaleFilterRows_NEON(uint8* dst_ptr,
*/
// Constants for SSE2 code
#elif defined(_M_IX86) || defined(__i386__) || defined(__x86_64__) && \
!defined(YUV_DISABLE_ASM)
#elif !defined(YUV_DISABLE_ASM) && \
(defined(_M_IX86) || defined(__i386__) || defined(__x86_64__))
#if defined(_MSC_VER)
#define TALIGN16(t, var) __declspec(align(16)) t _ ## var
#elif defined(__APPLE__) || defined(__MINGW32__) || defined(__CYGWIN__) && \
defined(__i386__)
#elif defined(__i386__) && \
(defined(__APPLE__) || defined(__MINGW32__) || defined(__CYGWIN__))
#define TALIGN16(t, var) t var __attribute__((aligned(16)))
#else
#define TALIGN16(t, var) t _ ## var __attribute__((aligned(16)))
......@@ -670,12 +671,12 @@ extern "C" TALIGN16(const uint16, scaleab2[8]) =
{ 65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3, 65536 / 3, 65536 / 2, 0, 0 };
#endif
#if defined(_M_IX86) && !defined(YUV_DISABLE_ASM)
#if !defined(YUV_DISABLE_ASM) && defined(_M_IX86)
#define HAS_SCALEROWDOWN2_SSE2
// Reads 32 pixels, throws half away and writes 16 pixels.
// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned.
__declspec(naked)
__declspec(naked) __declspec(align(16))
static void ScaleRowDown2_SSE2(const uint8* src_ptr, int src_stride,
uint8* dst_ptr, int dst_width) {
__asm {
......@@ -704,7 +705,7 @@ static void ScaleRowDown2_SSE2(const uint8* src_ptr, int src_stride,
}
// Blends 32x2 rectangle to 16x1.
// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned.
__declspec(naked)
__declspec(naked) __declspec(align(16))
void ScaleRowDown2Int_SSE2(const uint8* src_ptr, int src_stride,
uint8* dst_ptr, int dst_width) {
__asm {
......@@ -749,7 +750,7 @@ void ScaleRowDown2Int_SSE2(const uint8* src_ptr, int src_stride,
#define HAS_SCALEROWDOWN4_SSE2
// Point samples 32 pixels to 8 pixels.
// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
__declspec(naked)
__declspec(naked) __declspec(align(16))
static void ScaleRowDown4_SSE2(const uint8* src_ptr, int src_stride,
uint8* dst_ptr, int dst_width) {
__asm {
......@@ -780,7 +781,7 @@ static void ScaleRowDown4_SSE2(const uint8* src_ptr, int src_stride,
// Blends 32x4 rectangle to 8x1.
// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
__declspec(naked)
__declspec(naked) __declspec(align(16))
static void ScaleRowDown4Int_SSE2(const uint8* src_ptr, int src_stride,
uint8* dst_ptr, int dst_width) {
__asm {
......@@ -842,7 +843,7 @@ static void ScaleRowDown4Int_SSE2(const uint8* src_ptr, int src_stride,
#define HAS_SCALEROWDOWN8_SSE2
// Point samples 32 pixels to 4 pixels.
// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 4 byte aligned.
__declspec(naked)
__declspec(naked) __declspec(align(16))
static void ScaleRowDown8_SSE2(const uint8* src_ptr, int src_stride,
uint8* dst_ptr, int dst_width) {
__asm {
......@@ -874,7 +875,7 @@ static void ScaleRowDown8_SSE2(const uint8* src_ptr, int src_stride,
// Blends 32x8 rectangle to 4x1.
// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 4 byte aligned.
__declspec(naked)
__declspec(naked) __declspec(align(16))
static void ScaleRowDown8Int_SSE2(const uint8* src_ptr, int src_stride,
uint8* dst_ptr, int dst_width) {
__asm {
......@@ -952,7 +953,7 @@ static void ScaleRowDown8Int_SSE2(const uint8* src_ptr, int src_stride,
// Note that movdqa+palign may be better than movdqu.
// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
__declspec(naked)
__declspec(naked) __declspec(align(16))
static void ScaleRowDown34_SSSE3(const uint8* src_ptr, int src_stride,
uint8* dst_ptr, int dst_width) {
__asm {
......@@ -1001,7 +1002,7 @@ static void ScaleRowDown34_SSSE3(const uint8* src_ptr, int src_stride,
// Note that movdqa+palign may be better than movdqu.
// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
__declspec(naked)
__declspec(naked) __declspec(align(16))
static void ScaleRowDown34_1_Int_SSSE3(const uint8* src_ptr, int src_stride,
uint8* dst_ptr, int dst_width) {
__asm {
......@@ -1059,7 +1060,7 @@ static void ScaleRowDown34_1_Int_SSSE3(const uint8* src_ptr, int src_stride,
// Note that movdqa+palign may be better than movdqu.
// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
__declspec(naked)
__declspec(naked) __declspec(align(16))
static void ScaleRowDown34_0_Int_SSSE3(const uint8* src_ptr, int src_stride,
uint8* dst_ptr, int dst_width) {
__asm {
......@@ -1122,7 +1123,7 @@ static void ScaleRowDown34_0_Int_SSSE3(const uint8* src_ptr, int src_stride,
// 3/8 point sampler
// Scale 32 pixels to 12
__declspec(naked)
__declspec(naked) __declspec(align(16))
static void ScaleRowDown38_SSSE3(const uint8* src_ptr, int src_stride,
uint8* dst_ptr, int dst_width) {
__asm {
......@@ -1154,7 +1155,7 @@ static void ScaleRowDown38_SSSE3(const uint8* src_ptr, int src_stride,
}
// Scale 16x3 pixels to 6x1 with interpolation
__declspec(naked)
__declspec(naked) __declspec(align(16))
static void ScaleRowDown38_3_Int_SSSE3(const uint8* src_ptr, int src_stride,
uint8* dst_ptr, int dst_width) {
__asm {
......@@ -1221,7 +1222,7 @@ static void ScaleRowDown38_3_Int_SSSE3(const uint8* src_ptr, int src_stride,
}
// Scale 16x2 pixels to 6x1 with interpolation
__declspec(naked)
__declspec(naked) __declspec(align(16))
static void ScaleRowDown38_2_Int_SSSE3(const uint8* src_ptr, int src_stride,
uint8* dst_ptr, int dst_width) {
__asm {
......@@ -1269,7 +1270,7 @@ static void ScaleRowDown38_2_Int_SSSE3(const uint8* src_ptr, int src_stride,
#define HAS_SCALEADDROWS_SSE2
// Reads 16xN bytes and produces 16 shorts at a time.
__declspec(naked)
__declspec(naked) __declspec(align(16))
static void ScaleAddRows_SSE2(const uint8* src_ptr, int src_stride,
uint16* dst_ptr, int src_width,
int src_height) {
......@@ -1329,7 +1330,7 @@ static void ScaleAddRows_SSE2(const uint8* src_ptr, int src_stride,
// Bilinear row filtering combines 16x2 -> 16x1. SSE2 version.
#define HAS_SCALEFILTERROWS_SSE2
__declspec(naked)
__declspec(naked) __declspec(align(16))
static void ScaleFilterRows_SSE2(uint8* dst_ptr, const uint8* src_ptr,
int src_stride, int dst_width,
int source_y_fraction) {
......@@ -1420,7 +1421,7 @@ static void ScaleFilterRows_SSE2(uint8* dst_ptr, const uint8* src_ptr,
// Bilinear row filtering combines 16x2 -> 16x1. SSSE3 version.
#define HAS_SCALEFILTERROWS_SSSE3
__declspec(naked)
__declspec(naked) __declspec(align(16))
static void ScaleFilterRows_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
int src_stride, int dst_width,
int source_y_fraction) {
......@@ -1501,7 +1502,7 @@ static void ScaleFilterRows_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
// Note that movdqa+palign may be better than movdqu.
// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
__declspec(naked)
__declspec(naked) __declspec(align(16))
static void ScaleFilterCols34_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
int dst_width) {
__asm {
......@@ -1547,7 +1548,7 @@ static void ScaleFilterCols34_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
}
}
#elif defined(__x86_64__) || defined(__i386__) && !defined(YUV_DISABLE_ASM)
#elif !defined(YUV_DISABLE_ASM) && (defined(__x86_64__) || defined(__i386__))
// GCC versions of row functions are verbatim conversions from Visual C.
// Generated using gcc disassembly on Visual C object file:
......@@ -1766,7 +1767,7 @@ static void ScaleAddRows_SSE2(const uint8* src_ptr, int src_stride,
);
}
#if defined(__i386__)
#if !defined(YUV_DISABLE_ASM) && defined(__i386__)
extern "C" void ScaleRowDown8Int_SSE2(const uint8* src_ptr, int src_stride,
uint8* dst_ptr, int dst_width);
asm(
......@@ -2260,7 +2261,7 @@ extern "C" void ScaleFilterRows_SSSE3(uint8* dst_ptr,
"ret \n"
);
#elif defined(__x86_64__)
#elif !defined(YUV_DISABLE_ASM) && defined(__x86_64__)
static void ScaleRowDown8Int_SSE2(const uint8* src_ptr, int src_stride,
uint8* dst_ptr, int dst_width) {
asm volatile (
......