Commit 044f914c authored by fbarchard@google.com

Change scale to unaligned movdqu.

BUG=365
TESTED=scale unittests
R=tpsiaki@google.com

Review URL: https://webrtc-codereview.appspot.com/22879004

git-svn-id: http://libyuv.googlecode.com/svn/trunk@1101 16f28f9a-4ce2-e073-06de-1de4eb20be90
parent 9c4c8218
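
Note on the change: this commit switches the SSE2/SSSE3 scale and interpolate row functions from aligned loads and stores (movdqa) to unaligned ones (movdqu) and deletes the now-redundant *_Unaligned_* variants, so the callers no longer need pointer and stride alignment checks. A minimal sketch of the resulting dispatch shape (the ChooseInterpolateRow helper below is hypothetical, for illustration only; the real selection code is inlined in the functions shown in the diff):

typedef void (*InterpolateRowFn)(uint8* dst_ptr, const uint8* src_ptr,
                                 ptrdiff_t src_stride, int width,
                                 int source_y_fraction);

InterpolateRowFn ChooseInterpolateRow(int width) {
  InterpolateRowFn fn = InterpolateRow_C;  // portable fallback
#if defined(HAS_INTERPOLATEROW_SSE2)
  if (TestCpuFlag(kCpuHasSSE2) && width >= 4) {
    fn = InterpolateRow_Any_SSE2;          // handles ragged widths
    if (IS_ALIGNED(width, 4)) {
      fn = InterpolateRow_SSE2;            // full-vector movdqu kernel
    }
  }
#endif
  return fn;
}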
 Name: libyuv
 URL: http://code.google.com/p/libyuv/
-Version: 1099
+Version: 1101
 License: BSD
 License File: LICENSE

@@ -1748,12 +1748,6 @@ void InterpolateRow_NEON(uint8* dst_ptr, const uint8* src_ptr,
 void InterpolateRows_MIPS_DSPR2(uint8* dst_ptr, const uint8* src_ptr,
                                 ptrdiff_t src_stride_ptr, int width,
                                 int source_y_fraction);
-void InterpolateRow_Unaligned_SSE2(uint8* dst_ptr, const uint8* src_ptr,
-                                   ptrdiff_t src_stride_ptr, int width,
-                                   int source_y_fraction);
-void InterpolateRow_Unaligned_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
-                                    ptrdiff_t src_stride_ptr, int width,
-                                    int source_y_fraction);
 void InterpolateRow_Any_NEON(uint8* dst_ptr, const uint8* src_ptr,
                              ptrdiff_t src_stride_ptr, int width,
                              int source_y_fraction);

@@ -11,6 +11,6 @@
 #ifndef INCLUDE_LIBYUV_VERSION_H_  // NOLINT
 #define INCLUDE_LIBYUV_VERSION_H_
-#define LIBYUV_VERSION 1099
+#define LIBYUV_VERSION 1101
 #endif  // INCLUDE_LIBYUV_VERSION_H_  NOLINT

@@ -1798,27 +1798,17 @@ int ARGBInterpolate(const uint8* src_argb0, int src_stride_argb0,
   if (TestCpuFlag(kCpuHasSSE2) && width >= 4) {
     InterpolateRow = InterpolateRow_Any_SSE2;
     if (IS_ALIGNED(width, 4)) {
-      InterpolateRow = InterpolateRow_Unaligned_SSE2;
-      if (IS_ALIGNED(src_argb0, 16) && IS_ALIGNED(src_stride_argb0, 16) &&
-          IS_ALIGNED(src_argb1, 16) && IS_ALIGNED(src_stride_argb1, 16) &&
-          IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
-        InterpolateRow = InterpolateRow_SSE2;
-      }
+      InterpolateRow = InterpolateRow_SSE2;
     }
   }
 #endif
 #if defined(HAS_INTERPOLATEROW_SSSE3)
   if (TestCpuFlag(kCpuHasSSSE3) && width >= 4) {
     InterpolateRow = InterpolateRow_Any_SSSE3;
     if (IS_ALIGNED(width, 4)) {
-      InterpolateRow = InterpolateRow_Unaligned_SSSE3;
-      if (IS_ALIGNED(src_argb0, 16) && IS_ALIGNED(src_stride_argb0, 16) &&
-          IS_ALIGNED(src_argb1, 16) && IS_ALIGNED(src_stride_argb1, 16) &&
-          IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
-        InterpolateRow = InterpolateRow_SSSE3;
-      }
+      InterpolateRow = InterpolateRow_SSSE3;
     }
   }
 #endif
 #if defined(HAS_INTERPOLATEROW_AVX2)
   if (TestCpuFlag(kCpuHasAVX2) && width >= 8) {

@@ -579,11 +579,11 @@ NANY(InterpolateRow_Any_AVX2, InterpolateRow_AVX2,
      InterpolateRow_C, 1, 1, 32)
 #endif
 #ifdef HAS_INTERPOLATEROW_SSSE3
-NANY(InterpolateRow_Any_SSSE3, InterpolateRow_Unaligned_SSSE3,
+NANY(InterpolateRow_Any_SSSE3, InterpolateRow_SSSE3,
      InterpolateRow_C, 1, 1, 15)
 #endif
 #ifdef HAS_INTERPOLATEROW_SSE2
-NANY(InterpolateRow_Any_SSE2, InterpolateRow_Unaligned_SSE2,
+NANY(InterpolateRow_Any_SSE2, InterpolateRow_SSE2,
      InterpolateRow_C, 1, 1, 15)
 #endif
 #ifdef HAS_INTERPOLATEROW_NEON
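
The NANY macro above generates the *_Any_* wrappers, which after this commit delegate directly to the movdqu kernels. A hedged sketch of the code shape that NANY(InterpolateRow_Any_SSE2, InterpolateRow_SSE2, InterpolateRow_C, 1, 1, 15) expands to (illustrative only; the real macro in row_any.cc also parameterizes bytes per pixel):

void InterpolateRow_Any_SSE2(uint8* dst_ptr, const uint8* src_ptr,
                             ptrdiff_t src_stride, int width,
                             int source_y_fraction) {
  int n = width & ~15;  // bulk of the row, a multiple of 16 (mask 15)
  if (n > 0) {
    InterpolateRow_SSE2(dst_ptr, src_ptr, src_stride, n, source_y_fraction);
  }
  // Finish the 1..15 leftover pixels with the portable C row function.
  InterpolateRow_C(dst_ptr + n, src_ptr + n, src_stride,
                   width & 15, source_y_fraction);
}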
@@ -6322,7 +6322,6 @@ void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr,
 }
 #endif  // HAS_INTERPOLATEROW_AVX2
 
-#ifdef HAS_INTERPOLATEROW_SSSE3
 // Bilinear filter 16x2 -> 16x1
 __declspec(naked) __declspec(align(16))
 void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,

@@ -6356,225 +6355,6 @@ void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
     punpcklwd  xmm5, xmm5
     pshufd     xmm5, xmm5, 0
 
-    align      4
-  xloop:
-    movdqa     xmm0, [esi]
-    movdqa     xmm2, [esi + edx]
-    movdqa     xmm1, xmm0
-    punpcklbw  xmm0, xmm2
-    punpckhbw  xmm1, xmm2
-    pmaddubsw  xmm0, xmm5
-    pmaddubsw  xmm1, xmm5
-    psrlw      xmm0, 7
-    psrlw      xmm1, 7
-    packuswb   xmm0, xmm1
-    sub        ecx, 16
-    movdqa     [esi + edi], xmm0
-    lea        esi, [esi + 16]
-    jg         xloop
-    jmp        xloop99
-
-    // Blend 25 / 75.
-    align      4
-  xloop25:
-    movdqa     xmm0, [esi]
-    movdqa     xmm1, [esi + edx]
-    pavgb      xmm0, xmm1
-    pavgb      xmm0, xmm1
-    sub        ecx, 16
-    movdqa     [esi + edi], xmm0
-    lea        esi, [esi + 16]
-    jg         xloop25
-    jmp        xloop99
-
-    // Blend 50 / 50.
-    align      4
-  xloop50:
-    movdqa     xmm0, [esi]
-    movdqa     xmm1, [esi + edx]
-    pavgb      xmm0, xmm1
-    sub        ecx, 16
-    movdqa     [esi + edi], xmm0
-    lea        esi, [esi + 16]
-    jg         xloop50
-    jmp        xloop99
-
-    // Blend 75 / 25.
-    align      4
-  xloop75:
-    movdqa     xmm1, [esi]
-    movdqa     xmm0, [esi + edx]
-    pavgb      xmm0, xmm1
-    pavgb      xmm0, xmm1
-    sub        ecx, 16
-    movdqa     [esi + edi], xmm0
-    lea        esi, [esi + 16]
-    jg         xloop75
-    jmp        xloop99
-
-    // Blend 100 / 0 - Copy row unchanged.
-    align      4
-  xloop100:
-    movdqa     xmm0, [esi]
-    sub        ecx, 16
-    movdqa     [esi + edi], xmm0
-    lea        esi, [esi + 16]
-    jg         xloop100
-
-  xloop99:
-    pop        edi
-    pop        esi
-    ret
-  }
-}
-#endif  // HAS_INTERPOLATEROW_SSSE3
-
-#ifdef HAS_INTERPOLATEROW_SSE2
-// Bilinear filter 16x2 -> 16x1
-__declspec(naked) __declspec(align(16))
-void InterpolateRow_SSE2(uint8* dst_ptr, const uint8* src_ptr,
-                         ptrdiff_t src_stride, int dst_width,
-                         int source_y_fraction) {
-  __asm {
-    push       esi
-    push       edi
-    mov        edi, [esp + 8 + 4]   // dst_ptr
-    mov        esi, [esp + 8 + 8]   // src_ptr
-    mov        edx, [esp + 8 + 12]  // src_stride
-    mov        ecx, [esp + 8 + 16]  // dst_width
-    mov        eax, [esp + 8 + 20]  // source_y_fraction (0..255)
-    sub        edi, esi
-    // Dispatch to specialized filters if applicable.
-    cmp        eax, 0
-    je         xloop100  // 0 / 256.  Blend 100 / 0.
-    cmp        eax, 64
-    je         xloop75   // 64 / 256 is 0.25.  Blend 75 / 25.
-    cmp        eax, 128
-    je         xloop50   // 128 / 256 is 0.50.  Blend 50 / 50.
-    cmp        eax, 192
-    je         xloop25   // 192 / 256 is 0.75.  Blend 25 / 75.
-
-    movd       xmm5, eax            // xmm5 = y fraction
-    punpcklbw  xmm5, xmm5
-    psrlw      xmm5, 1
-    punpcklwd  xmm5, xmm5
-    punpckldq  xmm5, xmm5
-    punpcklqdq xmm5, xmm5
-    pxor       xmm4, xmm4
-
-    align      4
-  xloop:
-    movdqa     xmm0, [esi]        // row0
-    movdqa     xmm2, [esi + edx]  // row1
-    movdqa     xmm1, xmm0
-    movdqa     xmm3, xmm2
-    punpcklbw  xmm2, xmm4
-    punpckhbw  xmm3, xmm4
-    punpcklbw  xmm0, xmm4
-    punpckhbw  xmm1, xmm4
-    psubw      xmm2, xmm0         // row1 - row0
-    psubw      xmm3, xmm1
-    paddw      xmm2, xmm2         // 9 bits * 15 bits = 8.16
-    paddw      xmm3, xmm3
-    pmulhw     xmm2, xmm5         // scale diff
-    pmulhw     xmm3, xmm5
-    paddw      xmm0, xmm2         // sum rows
-    paddw      xmm1, xmm3
-    packuswb   xmm0, xmm1
-    sub        ecx, 16
-    movdqa     [esi + edi], xmm0
-    lea        esi, [esi + 16]
-    jg         xloop
-    jmp        xloop99
-
-    // Blend 25 / 75.
-    align      4
-  xloop25:
-    movdqa     xmm0, [esi]
-    movdqa     xmm1, [esi + edx]
-    pavgb      xmm0, xmm1
-    pavgb      xmm0, xmm1
-    sub        ecx, 16
-    movdqa     [esi + edi], xmm0
-    lea        esi, [esi + 16]
-    jg         xloop25
-    jmp        xloop99
-
-    // Blend 50 / 50.
-    align      4
-  xloop50:
-    movdqa     xmm0, [esi]
-    movdqa     xmm1, [esi + edx]
-    pavgb      xmm0, xmm1
-    sub        ecx, 16
-    movdqa     [esi + edi], xmm0
-    lea        esi, [esi + 16]
-    jg         xloop50
-    jmp        xloop99
-
-    // Blend 75 / 25.
-    align      4
-  xloop75:
-    movdqa     xmm1, [esi]
-    movdqa     xmm0, [esi + edx]
-    pavgb      xmm0, xmm1
-    pavgb      xmm0, xmm1
-    sub        ecx, 16
-    movdqa     [esi + edi], xmm0
-    lea        esi, [esi + 16]
-    jg         xloop75
-    jmp        xloop99
-
-    // Blend 100 / 0 - Copy row unchanged.
-    align      4
-  xloop100:
-    movdqa     xmm0, [esi]
-    sub        ecx, 16
-    movdqa     [esi + edi], xmm0
-    lea        esi, [esi + 16]
-    jg         xloop100
-
-  xloop99:
-    pop        edi
-    pop        esi
-    ret
-  }
-}
-#endif  // HAS_INTERPOLATEROW_SSE2
-
-// Bilinear filter 16x2 -> 16x1
-__declspec(naked) __declspec(align(16))
-void InterpolateRow_Unaligned_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
-                                    ptrdiff_t src_stride, int dst_width,
-                                    int source_y_fraction) {
-  __asm {
-    push       esi
-    push       edi
-    mov        edi, [esp + 8 + 4]   // dst_ptr
-    mov        esi, [esp + 8 + 8]   // src_ptr
-    mov        edx, [esp + 8 + 12]  // src_stride
-    mov        ecx, [esp + 8 + 16]  // dst_width
-    mov        eax, [esp + 8 + 20]  // source_y_fraction (0..255)
-    sub        edi, esi
-    shr        eax, 1
-    // Dispatch to specialized filters if applicable.
-    cmp        eax, 0
-    je         xloop100  // 0 / 128.  Blend 100 / 0.
-    cmp        eax, 32
-    je         xloop75   // 32 / 128 is 0.25.  Blend 75 / 25.
-    cmp        eax, 64
-    je         xloop50   // 64 / 128 is 0.50.  Blend 50 / 50.
-    cmp        eax, 96
-    je         xloop25   // 96 / 128 is 0.75.  Blend 25 / 75.
-
-    movd       xmm0, eax  // high fraction 0..127
-    neg        eax
-    add        eax, 128
-    movd       xmm5, eax  // low fraction 128..1
-    punpcklbw  xmm5, xmm0
-    punpcklwd  xmm5, xmm5
-    pshufd     xmm5, xmm5, 0
 
     align      4
   xloop:
     movdqu     xmm0, [esi]

@@ -6650,7 +6430,7 @@ void InterpolateRow_Unaligned_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
 #ifdef HAS_INTERPOLATEROW_SSE2
 // Bilinear filter 16x2 -> 16x1
 __declspec(naked) __declspec(align(16))
-void InterpolateRow_Unaligned_SSE2(uint8* dst_ptr, const uint8* src_ptr,
+void InterpolateRow_SSE2(uint8* dst_ptr, const uint8* src_ptr,
                          ptrdiff_t src_stride, int dst_width,
                          int source_y_fraction) {
   __asm {
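
For reference, the whole commit reduces to the difference between two SSE2 memory forms. In intrinsics terms (illustration only, not libyuv code): _mm_load_si128 (movdqa) faults when the address is not 16-byte aligned, while _mm_loadu_si128 (movdqu) accepts any address. On the CPUs these kernels originally targeted, movdqu was slower even on aligned data, which is why the separate aligned variants existed; on newer cores the penalty is small enough to drop them.

#include <emmintrin.h>  // SSE2
#include <stdint.h>

static inline __m128i LoadRow16(const uint8_t* p) {  // hypothetical helper
  return _mm_loadu_si128((const __m128i*)p);  // movdqu: safe for any p
}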
@@ -59,17 +59,10 @@ static void ScalePlaneDown2(int src_width, int src_height,
   }
 #elif defined(HAS_SCALEROWDOWN2_SSE2)
   if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 16)) {
-    ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_Unaligned_SSE2 :
-        (filtering == kFilterLinear ? ScaleRowDown2Linear_Unaligned_SSE2 :
-        ScaleRowDown2Box_Unaligned_SSE2);
-    if (IS_ALIGNED(src_ptr, 16) &&
-        IS_ALIGNED(src_stride, 16) && IS_ALIGNED(row_stride, 16) &&
-        IS_ALIGNED(dst_ptr, 16) && IS_ALIGNED(dst_stride, 16)) {
-      ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_SSE2 :
-          (filtering == kFilterLinear ? ScaleRowDown2Linear_SSE2 :
-          ScaleRowDown2Box_SSE2);
-    }
+    ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_SSE2 :
+        (filtering == kFilterLinear ? ScaleRowDown2Linear_SSE2 :
+        ScaleRowDown2Box_SSE2);
   }
 #elif defined(HAS_SCALEROWDOWN2_MIPS_DSPR2)
   if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(src_ptr, 4) &&
       IS_ALIGNED(src_stride, 4) && IS_ALIGNED(row_stride, 4) &&

@@ -114,18 +107,10 @@ static void ScalePlaneDown2_16(int src_width, int src_height,
   }
 #elif defined(HAS_SCALEROWDOWN2_16_SSE2)
   if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 16)) {
-    ScaleRowDown2 = filtering == kFilterNone ?
-        ScaleRowDown2_Unaligned_16_SSE2 :
-        (filtering == kFilterLinear ? ScaleRowDown2Linear_Unaligned_16_SSE2 :
-        ScaleRowDown2Box_Unaligned_16_SSE2);
-    if (IS_ALIGNED(src_ptr, 16) &&
-        IS_ALIGNED(src_stride, 16) && IS_ALIGNED(row_stride, 16) &&
-        IS_ALIGNED(dst_ptr, 16) && IS_ALIGNED(dst_stride, 16)) {
-      ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_16_SSE2 :
-          (filtering == kFilterLinear ? ScaleRowDown2Linear_16_SSE2 :
-          ScaleRowDown2Box_16_SSE2);
-    }
+    ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_16_SSE2 :
+        (filtering == kFilterLinear ? ScaleRowDown2Linear_16_SSE2 :
+        ScaleRowDown2Box_16_SSE2);
   }
 #elif defined(HAS_SCALEROWDOWN2_16_MIPS_DSPR2)
   if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(src_ptr, 4) &&
       IS_ALIGNED(src_stride, 4) && IS_ALIGNED(row_stride, 4) &&

@@ -889,23 +874,17 @@ void ScalePlaneBilinearDown(int src_width, int src_height,
   if (TestCpuFlag(kCpuHasSSE2) && src_width >= 16) {
     InterpolateRow = InterpolateRow_Any_SSE2;
     if (IS_ALIGNED(src_width, 16)) {
-      InterpolateRow = InterpolateRow_Unaligned_SSE2;
-      if (IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16)) {
-        InterpolateRow = InterpolateRow_SSE2;
-      }
+      InterpolateRow = InterpolateRow_SSE2;
     }
   }
 #endif
 #if defined(HAS_INTERPOLATEROW_SSSE3)
   if (TestCpuFlag(kCpuHasSSSE3) && src_width >= 16) {
     InterpolateRow = InterpolateRow_Any_SSSE3;
     if (IS_ALIGNED(src_width, 16)) {
-      InterpolateRow = InterpolateRow_Unaligned_SSSE3;
-      if (IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16)) {
-        InterpolateRow = InterpolateRow_SSSE3;
-      }
+      InterpolateRow = InterpolateRow_SSSE3;
     }
   }
 #endif
 #if defined(HAS_INTERPOLATEROW_AVX2)
   if (TestCpuFlag(kCpuHasAVX2) && src_width >= 32) {

@@ -991,23 +970,17 @@ void ScalePlaneBilinearDown_16(int src_width, int src_height,
   if (TestCpuFlag(kCpuHasSSE2) && src_width >= 16) {
     InterpolateRow = InterpolateRow_Any_16_SSE2;
     if (IS_ALIGNED(src_width, 16)) {
-      InterpolateRow = InterpolateRow_Unaligned_16_SSE2;
-      if (IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16)) {
-        InterpolateRow = InterpolateRow_16_SSE2;
-      }
+      InterpolateRow = InterpolateRow_16_SSE2;
     }
   }
 #endif
 #if defined(HAS_INTERPOLATEROW_16_SSSE3)
   if (TestCpuFlag(kCpuHasSSSE3) && src_width >= 16) {
     InterpolateRow = InterpolateRow_Any_16_SSSE3;
     if (IS_ALIGNED(src_width, 16)) {
-      InterpolateRow = InterpolateRow_Unaligned_16_SSSE3;
-      if (IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16)) {
-        InterpolateRow = InterpolateRow_16_SSSE3;
-      }
+      InterpolateRow = InterpolateRow_16_SSSE3;
     }
   }
 #endif
 #if defined(HAS_INTERPOLATEROW_16_AVX2)
   if (TestCpuFlag(kCpuHasAVX2) && src_width >= 32) {

@@ -1090,23 +1063,17 @@ void ScalePlaneBilinearUp(int src_width, int src_height,
   if (TestCpuFlag(kCpuHasSSE2) && dst_width >= 16) {
     InterpolateRow = InterpolateRow_Any_SSE2;
     if (IS_ALIGNED(dst_width, 16)) {
-      InterpolateRow = InterpolateRow_Unaligned_SSE2;
-      if (IS_ALIGNED(dst_ptr, 16) && IS_ALIGNED(dst_stride, 16)) {
-        InterpolateRow = InterpolateRow_SSE2;
-      }
+      InterpolateRow = InterpolateRow_SSE2;
     }
   }
 #endif
 #if defined(HAS_INTERPOLATEROW_SSSE3)
   if (TestCpuFlag(kCpuHasSSSE3) && dst_width >= 16) {
     InterpolateRow = InterpolateRow_Any_SSSE3;
     if (IS_ALIGNED(dst_width, 16)) {
-      InterpolateRow = InterpolateRow_Unaligned_SSSE3;
-      if (IS_ALIGNED(dst_ptr, 16) && IS_ALIGNED(dst_stride, 16)) {
-        InterpolateRow = InterpolateRow_SSSE3;
-      }
+      InterpolateRow = InterpolateRow_SSSE3;
     }
   }
 #endif
 #if defined(HAS_INTERPOLATEROW_AVX2)
   if (TestCpuFlag(kCpuHasAVX2) && dst_width >= 32) {

@@ -1229,23 +1196,17 @@ void ScalePlaneBilinearUp_16(int src_width, int src_height,
   if (TestCpuFlag(kCpuHasSSE2) && dst_width >= 16) {
     InterpolateRow = InterpolateRow_Any_16_SSE2;
     if (IS_ALIGNED(dst_width, 16)) {
-      InterpolateRow = InterpolateRow_Unaligned_16_SSE2;
-      if (IS_ALIGNED(dst_ptr, 16) && IS_ALIGNED(dst_stride, 16)) {
-        InterpolateRow = InterpolateRow_16_SSE2;
-      }
+      InterpolateRow = InterpolateRow_16_SSE2;
     }
   }
 #endif
 #if defined(HAS_INTERPOLATEROW_16_SSSE3)
   if (TestCpuFlag(kCpuHasSSSE3) && dst_width >= 16) {
     InterpolateRow = InterpolateRow_Any_16_SSSE3;
     if (IS_ALIGNED(dst_width, 16)) {
-      InterpolateRow = InterpolateRow_Unaligned_16_SSSE3;
-      if (IS_ALIGNED(dst_ptr, 16) && IS_ALIGNED(dst_stride, 16)) {
-        InterpolateRow = InterpolateRow_16_SSSE3;
-      }
+      InterpolateRow = InterpolateRow_16_SSSE3;
     }
   }
 #endif
 #if defined(HAS_INTERPOLATEROW_16_AVX2)
   if (TestCpuFlag(kCpuHasAVX2) && dst_width >= 32) {

@@ -193,23 +193,17 @@ static void ScaleARGBBilinearDown(int src_width, int src_height,
   if (TestCpuFlag(kCpuHasSSE2) && clip_src_width >= 16) {
     InterpolateRow = InterpolateRow_Any_SSE2;
     if (IS_ALIGNED(clip_src_width, 16)) {
-      InterpolateRow = InterpolateRow_Unaligned_SSE2;
-      if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride, 16)) {
-        InterpolateRow = InterpolateRow_SSE2;
-      }
+      InterpolateRow = InterpolateRow_SSE2;
     }
   }
 #endif
 #if defined(HAS_INTERPOLATEROW_SSSE3)
   if (TestCpuFlag(kCpuHasSSSE3) && clip_src_width >= 16) {
     InterpolateRow = InterpolateRow_Any_SSSE3;
     if (IS_ALIGNED(clip_src_width, 16)) {
-      InterpolateRow = InterpolateRow_Unaligned_SSSE3;
-      if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride, 16)) {
-        InterpolateRow = InterpolateRow_SSSE3;
-      }
+      InterpolateRow = InterpolateRow_SSSE3;
     }
   }
 #endif
 #if defined(HAS_INTERPOLATEROW_AVX2)
   if (TestCpuFlag(kCpuHasAVX2) && clip_src_width >= 32) {

@@ -289,23 +283,17 @@ static void ScaleARGBBilinearUp(int src_width, int src_height,
   if (TestCpuFlag(kCpuHasSSE2) && dst_width >= 4) {
     InterpolateRow = InterpolateRow_Any_SSE2;
     if (IS_ALIGNED(dst_width, 4)) {
-      InterpolateRow = InterpolateRow_Unaligned_SSE2;
-      if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride, 16)) {
-        InterpolateRow = InterpolateRow_SSE2;
-      }
+      InterpolateRow = InterpolateRow_SSE2;
     }
   }
 #endif
 #if defined(HAS_INTERPOLATEROW_SSSE3)
   if (TestCpuFlag(kCpuHasSSSE3) && dst_width >= 4) {
     InterpolateRow = InterpolateRow_Any_SSSE3;
     if (IS_ALIGNED(dst_width, 4)) {
-      InterpolateRow = InterpolateRow_Unaligned_SSSE3;
-      if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride, 16)) {
-        InterpolateRow = InterpolateRow_SSSE3;
-      }
+      InterpolateRow = InterpolateRow_SSSE3;
     }
   }
 #endif
 #if defined(HAS_INTERPOLATEROW_AVX2)
   if (TestCpuFlag(kCpuHasAVX2) && dst_width >= 8) {

@@ -430,12 +418,9 @@ static void ScaleYUVToARGBBilinearUp(int src_width, int src_height,
   if (TestCpuFlag(kCpuHasSSSE3) && src_width >= 8) {
     I422ToARGBRow = I422ToARGBRow_Any_SSSE3;
     if (IS_ALIGNED(src_width, 8)) {
-      I422ToARGBRow = I422ToARGBRow_Unaligned_SSSE3;
-      if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
-        I422ToARGBRow = I422ToARGBRow_SSSE3;
-      }
+      I422ToARGBRow = I422ToARGBRow_SSSE3;
     }
   }
 #endif
 #if defined(HAS_I422TOARGBROW_AVX2)
   if (TestCpuFlag(kCpuHasAVX2) && src_width >= 16) {

@@ -470,23 +455,17 @@ static void ScaleYUVToARGBBilinearUp(int src_width, int src_height,
   if (TestCpuFlag(kCpuHasSSE2) && dst_width >= 4) {
     InterpolateRow = InterpolateRow_Any_SSE2;
     if (IS_ALIGNED(dst_width, 4)) {
-      InterpolateRow = InterpolateRow_Unaligned_SSE2;
-      if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
-        InterpolateRow = InterpolateRow_SSE2;
-      }
+      InterpolateRow = InterpolateRow_SSE2;
     }
   }
 #endif
 #if defined(HAS_INTERPOLATEROW_SSSE3)
   if (TestCpuFlag(kCpuHasSSSE3) && dst_width >= 4) {
     InterpolateRow = InterpolateRow_Any_SSSE3;
     if (IS_ALIGNED(dst_width, 4)) {
-      InterpolateRow = InterpolateRow_Unaligned_SSSE3;
-      if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
-        InterpolateRow = InterpolateRow_SSSE3;
-      }
+      InterpolateRow = InterpolateRow_SSSE3;
     }
   }
 #endif
 #if defined(HAS_INTERPOLATEROW_AVX2)
   if (TestCpuFlag(kCpuHasAVX2) && dst_width >= 8) {

@@ -888,25 +888,17 @@ void ScalePlaneVertical(int src_height,
   if (TestCpuFlag(kCpuHasSSE2) && dst_width_bytes >= 16) {
     InterpolateRow = InterpolateRow_Any_SSE2;
     if (IS_ALIGNED(dst_width_bytes, 16)) {
-      InterpolateRow = InterpolateRow_Unaligned_SSE2;
-      if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride, 16) &&
-          IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride, 16)) {
-        InterpolateRow = InterpolateRow_SSE2;
-      }
+      InterpolateRow = InterpolateRow_SSE2;
     }
   }
 #endif
 #if defined(HAS_INTERPOLATEROW_SSSE3)
   if (TestCpuFlag(kCpuHasSSSE3) && dst_width_bytes >= 16) {
     InterpolateRow = InterpolateRow_Any_SSSE3;
     if (IS_ALIGNED(dst_width_bytes, 16)) {
-      InterpolateRow = InterpolateRow_Unaligned_SSSE3;
-      if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride, 16) &&
-          IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride, 16)) {
-        InterpolateRow = InterpolateRow_SSSE3;
-      }
+      InterpolateRow = InterpolateRow_SSSE3;
     }
   }
 #endif
 #if defined(HAS_INTERPOLATEROW_AVX2)
   if (TestCpuFlag(kCpuHasAVX2) && dst_width_bytes >= 32) {

@@ -970,25 +962,17 @@ void ScalePlaneVertical_16(int src_height,
   if (TestCpuFlag(kCpuHasSSE2) && dst_width_bytes >= 16) {
     InterpolateRow = InterpolateRow_Any_16_SSE2;
     if (IS_ALIGNED(dst_width_bytes, 16)) {
-      InterpolateRow = InterpolateRow_Unaligned_16_SSE2;
-      if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride, 16) &&
-          IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride, 16)) {
-        InterpolateRow = InterpolateRow_16_SSE2;
-      }
+      InterpolateRow = InterpolateRow_16_SSE2;
     }
   }
 #endif
 #if defined(HAS_INTERPOLATEROW_16_SSSE3)
   if (TestCpuFlag(kCpuHasSSSE3) && dst_width_bytes >= 16) {
     InterpolateRow = InterpolateRow_Any_16_SSSE3;
     if (IS_ALIGNED(dst_width_bytes, 16)) {
-      InterpolateRow = InterpolateRow_Unaligned_16_SSSE3;
-      if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride, 16) &&
-          IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride, 16)) {
-        InterpolateRow = InterpolateRow_16_SSSE3;
-      }
+      InterpolateRow = InterpolateRow_16_SSSE3;
     }
   }
 #endif
 #if defined(HAS_INTERPOLATEROW_16_AVX2)
   if (TestCpuFlag(kCpuHasAVX2) && dst_width_bytes >= 32) {

@@ -100,110 +100,6 @@ void ScaleRowDown2_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
                         uint8* dst_ptr, int dst_width) {
   asm volatile (
     LABELALIGN
-  "1:                                          \n"
-    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
-    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
-    "lea       " MEMLEA(0x20,0) ",%0           \n"
-    "psrlw     $0x8,%%xmm0                     \n"
-    "psrlw     $0x8,%%xmm1                     \n"
-    "packuswb  %%xmm1,%%xmm0                   \n"
-    "movdqa    %%xmm0," MEMACCESS(1) "         \n"
-    "lea       " MEMLEA(0x10,1) ",%1           \n"
-    "sub       $0x10,%2                        \n"
-    "jg        1b                              \n"
-  : "+r"(src_ptr),   // %0
-    "+r"(dst_ptr),   // %1
-    "+r"(dst_width)  // %2
-  :
-  : "memory", "cc"
-#if defined(__SSE2__)
-    , "xmm0", "xmm1"
-#endif
-  );
-}
-
-void ScaleRowDown2Linear_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
-                              uint8* dst_ptr, int dst_width) {
-  asm volatile (
-    "pcmpeqb   %%xmm5,%%xmm5                   \n"
-    "psrlw     $0x8,%%xmm5                     \n"
-
-    LABELALIGN
-  "1:                                          \n"
-    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
-    "movdqa    " MEMACCESS2(0x10, 0) ",%%xmm1  \n"
-    "lea       " MEMLEA(0x20,0) ",%0           \n"
-    "movdqa    %%xmm0,%%xmm2                   \n"
-    "psrlw     $0x8,%%xmm0                     \n"
-    "movdqa    %%xmm1,%%xmm3                   \n"
-    "psrlw     $0x8,%%xmm1                     \n"
-    "pand      %%xmm5,%%xmm2                   \n"
-    "pand      %%xmm5,%%xmm3                   \n"
-    "pavgw     %%xmm2,%%xmm0                   \n"
-    "pavgw     %%xmm3,%%xmm1                   \n"
-    "packuswb  %%xmm1,%%xmm0                   \n"
-    "movdqa    %%xmm0," MEMACCESS(1) "         \n"
-    "lea       " MEMLEA(0x10,1) ",%1           \n"
-    "sub       $0x10,%2                        \n"
-    "jg        1b                              \n"
-  : "+r"(src_ptr),   // %0
-    "+r"(dst_ptr),   // %1
-    "+r"(dst_width)  // %2
-  :
-  : "memory", "cc"
-#if defined(__SSE2__)
-    , "xmm0", "xmm1", "xmm5"
-#endif
-  );
-}
-
-void ScaleRowDown2Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
-                           uint8* dst_ptr, int dst_width) {
-  asm volatile (
-    "pcmpeqb   %%xmm5,%%xmm5                   \n"
-    "psrlw     $0x8,%%xmm5                     \n"
-
-    LABELALIGN
-  "1:                                          \n"
-    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
-    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
-    MEMOPREG(movdqa,0x00,0,3,1,xmm2)           //  movdqa  (%0,%3,1),%%xmm2
-    BUNDLEALIGN
-    MEMOPREG(movdqa,0x10,0,3,1,xmm3)           //  movdqa  0x10(%0,%3,1),%%xmm3
-    "lea       " MEMLEA(0x20,0) ",%0           \n"
-    "pavgb     %%xmm2,%%xmm0                   \n"
-    "pavgb     %%xmm3,%%xmm1                   \n"
-    "movdqa    %%xmm0,%%xmm2                   \n"
-    "psrlw     $0x8,%%xmm0                     \n"
-    "movdqa    %%xmm1,%%xmm3                   \n"
-    "psrlw     $0x8,%%xmm1                     \n"
-    "pand      %%xmm5,%%xmm2                   \n"
-    "pand      %%xmm5,%%xmm3                   \n"
-    "pavgw     %%xmm2,%%xmm0                   \n"
-    "pavgw     %%xmm3,%%xmm1                   \n"
-    "packuswb  %%xmm1,%%xmm0                   \n"
-    "movdqa    %%xmm0," MEMACCESS(1) "         \n"
-    "lea       " MEMLEA(0x10,1) ",%1           \n"
-    "sub       $0x10,%2                        \n"
-    "jg        1b                              \n"
-  : "+r"(src_ptr),    // %0
-    "+r"(dst_ptr),    // %1
-    "+r"(dst_width)   // %2
-  : "r"((intptr_t)(src_stride))   // %3
-  : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
-    , "r14"
-#endif
-#if defined(__SSE2__)
-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
-#endif
-  );
-}
-
-void ScaleRowDown2_Unaligned_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
-                                  uint8* dst_ptr, int dst_width) {
-  asm volatile (
-    LABELALIGN
   "1:                                          \n"
     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
     "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"

@@ -226,8 +122,7 @@ void ScaleRowDown2_Unaligned_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
   );
 }
 
-void ScaleRowDown2Linear_Unaligned_SSE2(const uint8* src_ptr,
-                                        ptrdiff_t src_stride,
+void ScaleRowDown2Linear_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
                               uint8* dst_ptr, int dst_width) {
   asm volatile (
     "pcmpeqb   %%xmm5,%%xmm5                   \n"

@@ -236,7 +131,7 @@ void ScaleRowDown2Linear_Unaligned_SSE2(const uint8* src_ptr,
     LABELALIGN
   "1:                                          \n"
     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
-    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
+    "movdqu    " MEMACCESS2(0x10, 0) ",%%xmm1  \n"
     "lea       " MEMLEA(0x20,0) ",%0           \n"
     "movdqa    %%xmm0,%%xmm2                   \n"
     "psrlw     $0x8,%%xmm0                     \n"

@@ -262,8 +157,7 @@ void ScaleRowDown2Linear_Unaligned_SSE2(const uint8* src_ptr,
   );
 }
 
-void ScaleRowDown2Box_Unaligned_SSE2(const uint8* src_ptr,
-                                     ptrdiff_t src_stride,
+void ScaleRowDown2Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
                            uint8* dst_ptr, int dst_width) {
   asm volatile (
     "pcmpeqb   %%xmm5,%%xmm5                   \n"

@@ -273,9 +167,9 @@ void ScaleRowDown2Box_Unaligned_SSE2(const uint8* src_ptr,
   "1:                                          \n"
     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1    \n"
-    MEMOPREG(movdqu,0x00,0,3,1,xmm2)           //  movdqu  (%0,%3,1),%%xmm2
+    MEMOPREG(movdqa,0x00,0,3,1,xmm2)           //  movdqa  (%0,%3,1),%%xmm2
     BUNDLEALIGN
-    MEMOPREG(movdqu,0x10,0,3,1,xmm3)           //  movdqu  0x10(%0,%3,1),%%xmm3
+    MEMOPREG(movdqa,0x10,0,3,1,xmm3)           //  movdqa  0x10(%0,%3,1),%%xmm3
     "lea       " MEMLEA(0x20,0) ",%0           \n"
     "pavgb     %%xmm2,%%xmm0                   \n"
     "pavgb     %%xmm3,%%xmm1                   \n"

@@ -315,8 +209,8 @@ void ScaleRowDown4_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
     LABELALIGN
   "1:                                          \n"
-    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
-    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
+    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
     "lea       " MEMLEA(0x20,0) ",%0           \n"
     "pand      %%xmm5,%%xmm0                   \n"
     "pand      %%xmm5,%%xmm1                   \n"

@@ -348,8 +242,8 @@ void ScaleRowDown4Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
     LABELALIGN
   "1:                                          \n"
-    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
-    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
+    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
     MEMOPREG(movdqa,0x00,0,4,1,xmm2)           //  movdqa  (%0,%4,1),%%xmm2
     BUNDLEALIGN
     MEMOPREG(movdqa,0x10,0,4,1,xmm3)           //  movdqa  0x10(%0,%4,1),%%xmm3

@@ -412,8 +306,8 @@ void ScaleRowDown34_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
   asm volatile (
     LABELALIGN
   "1:                                          \n"
-    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
-    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm2   \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
+    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm2   \n"
     "lea       " MEMLEA(0x20,0) ",%0           \n"
     "movdqa    %%xmm2,%%xmm1                   \n"
     "palignr   $0x8,%%xmm0,%%xmm1              \n"

@@ -461,7 +355,7 @@ void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr,
   asm volatile (
     LABELALIGN
   "1:                                          \n"
-    "movdqa    " MEMACCESS(0) ",%%xmm6         \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm6         \n"
     MEMOPREG(movdqa,0x00,0,3,1,xmm7)           //  movdqa  (%0,%3),%%xmm7
     "pavgb     %%xmm7,%%xmm6                   \n"
     "pshufb    %%xmm2,%%xmm6                   \n"

@@ -479,7 +373,7 @@ void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr,
     "psrlw     $0x2,%%xmm6                     \n"
     "packuswb  %%xmm6,%%xmm6                   \n"
     "movq      %%xmm6," MEMACCESS2(0x8,1) "    \n"
-    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm6   \n"
+    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm6   \n"
     BUNDLEALIGN
     MEMOPREG(movdqa,0x10,0,3,1,xmm7)           //  movdqa  0x10(%0,%3),%%xmm7
     "lea       " MEMLEA(0x20,0) ",%0           \n"

@@ -533,7 +427,7 @@ void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr,
   asm volatile (
     LABELALIGN
   "1:                                          \n"
-    "movdqa    " MEMACCESS(0) ",%%xmm6         \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm6         \n"
     MEMOPREG(movdqa,0x00,0,3,1,xmm7)           //  movdqa  (%0,%3,1),%%xmm7
     "pavgb     %%xmm6,%%xmm7                   \n"
     "pavgb     %%xmm7,%%xmm6                   \n"

@@ -553,7 +447,7 @@ void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr,
     "psrlw     $0x2,%%xmm6                     \n"
     "packuswb  %%xmm6,%%xmm6                   \n"
     "movq      %%xmm6," MEMACCESS2(0x8,1) "    \n"
-    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm6   \n"
+    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm6   \n"
     MEMOPREG(movdqa,0x10,0,3,1,xmm7)           //  movdqa  0x10(%0,%3,1),%%xmm7
     "lea       " MEMLEA(0x20,0) ",%0           \n"
     "pavgb     %%xmm6,%%xmm7                   \n"

@@ -590,8 +484,8 @@ void ScaleRowDown38_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
     LABELALIGN
   "1:                                          \n"
-    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
-    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
+    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
     "lea       " MEMLEA(0x20,0) ",%0           \n"
     "pshufb    %%xmm4,%%xmm0                   \n"
     "pshufb    %%xmm5,%%xmm1                   \n"

@@ -631,7 +525,7 @@ void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr,
   asm volatile (
     LABELALIGN
   "1:                                          \n"
-    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
     MEMOPREG(pavgb,0x00,0,3,1,xmm0)            //  pavgb   (%0,%3,1),%%xmm0
     "lea       " MEMLEA(0x10,0) ",%0           \n"
     "movdqa    %%xmm0,%%xmm1                   \n"

@@ -679,7 +573,7 @@ void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr,
   asm volatile (
     LABELALIGN
   "1:                                          \n"
-    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
     MEMOPREG(movdqa,0x00,0,3,1,xmm6)           //  movdqa  (%0,%3,1),%%xmm6
     "movhlps   %%xmm0,%%xmm1                   \n"
     "movhlps   %%xmm6,%%xmm7                   \n"

@@ -741,7 +635,7 @@ void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
     LABELALIGN
   "1:                                          \n"
-    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
     "mov       %0,%3                           \n"
     "add       %6,%0                           \n"
     "movdqa    %%xmm0,%%xmm1                   \n"

@@ -753,7 +647,7 @@ void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
     LABELALIGN
   "2:                                          \n"
-    "movdqa    " MEMACCESS(0) ",%%xmm2         \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm2         \n"
     "add       %6,%0                           \n"
     "movdqa    %%xmm2,%%xmm3                   \n"
     "punpcklbw %%xmm4,%%xmm2                   \n"

@@ -765,8 +659,8 @@ void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
     LABELALIGN
   "3:                                          \n"
-    "movdqa    %%xmm0," MEMACCESS(1) "         \n"
-    "movdqa    %%xmm1," MEMACCESS2(0x10,1) "   \n"
+    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
+    "movdqu    %%xmm1," MEMACCESS2(0x10,1) "   \n"
     "lea       " MEMLEA(0x10,3) ",%0           \n"
     "lea       " MEMLEA(0x20,1) ",%1           \n"
     "sub       $0x10,%4                        \n"

@@ -870,14 +764,14 @@ void ScaleColsUp2_SSE2(uint8* dst_ptr, const uint8* src_ptr,
   asm volatile (
     LABELALIGN
   "1:                                          \n"
-    "movdqa    " MEMACCESS(1) ",%%xmm0         \n"
+    "movdqu    " MEMACCESS(1) ",%%xmm0         \n"
     "lea       " MEMLEA(0x10,1) ",%1           \n"
     "movdqa    %%xmm0,%%xmm1                   \n"
     "punpcklbw %%xmm0,%%xmm0                   \n"
     "punpckhbw %%xmm1,%%xmm1                   \n"
     "sub       $0x20,%2                        \n"
-    "movdqa    %%xmm0," MEMACCESS(0) "         \n"
-    "movdqa    %%xmm1," MEMACCESS2(0x10,0) "   \n"
+    "movdqu    %%xmm0," MEMACCESS(0) "         \n"
+    "movdqu    %%xmm1," MEMACCESS2(0x10,0) "   \n"
     "lea       " MEMLEA(0x20,0) ",%0           \n"
     "jg        1b                              \n"

@@ -898,12 +792,12 @@ void ScaleARGBRowDown2_SSE2(const uint8* src_argb,
   asm volatile (
     LABELALIGN
   "1:                                          \n"
-    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
-    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
+    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
     "lea       " MEMLEA(0x20,0) ",%0           \n"
     "shufps    $0xdd,%%xmm1,%%xmm0             \n"
     "sub       $0x4,%2                         \n"
-    "movdqa    %%xmm0," MEMACCESS(1) "         \n"
+    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
     "lea       " MEMLEA(0x10,1) ",%1           \n"
     "jg        1b                              \n"
   : "+r"(src_argb),   // %0

@@ -923,15 +817,15 @@ void ScaleARGBRowDown2Linear_SSE2(const uint8* src_argb,
   asm volatile (
     LABELALIGN
   "1:                                          \n"
-    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
-    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
+    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
     "lea       " MEMLEA(0x20,0) ",%0           \n"
     "movdqa    %%xmm0,%%xmm2                   \n"
     "shufps    $0x88,%%xmm1,%%xmm0             \n"
     "shufps    $0xdd,%%xmm1,%%xmm2             \n"
     "pavgb     %%xmm2,%%xmm0                   \n"
     "sub       $0x4,%2                         \n"
-    "movdqa    %%xmm0," MEMACCESS(1) "         \n"
+    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
     "lea       " MEMLEA(0x10,1) ",%1           \n"
     "jg        1b                              \n"
   : "+r"(src_argb),   // %0

@@ -951,8 +845,8 @@ void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb,
   asm volatile (
     LABELALIGN
   "1:                                          \n"
-    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
-    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
+    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
     BUNDLEALIGN
     MEMOPREG(movdqa,0x00,0,3,1,xmm2)           //  movdqa  (%0,%3,1),%%xmm2
     MEMOPREG(movdqa,0x10,0,3,1,xmm3)           //  movdqa  0x10(%0,%3,1),%%xmm3

@@ -964,7 +858,7 @@ void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb,
     "shufps    $0xdd,%%xmm1,%%xmm2             \n"
     "pavgb     %%xmm2,%%xmm0                   \n"
     "sub       $0x4,%2                         \n"
-    "movdqa    %%xmm0," MEMACCESS(1) "         \n"
+    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
     "lea       " MEMLEA(0x10,1) ",%1           \n"
     "jg        1b                              \n"
   : "+r"(src_argb),   // %0

@@ -1003,7 +897,7 @@ void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
     "punpckldq %%xmm3,%%xmm2                   \n"
     "punpcklqdq %%xmm2,%%xmm0                  \n"
     "sub       $0x4,%3                         \n"
-    "movdqa    %%xmm0," MEMACCESS(2) "         \n"
+    "movdqu    %%xmm0," MEMACCESS(2) "         \n"
     "lea       " MEMLEA(0x10,2) ",%2           \n"
     "jg        1b                              \n"
   : "+r"(src_argb),   // %0

@@ -1056,7 +950,7 @@ void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb,
     "shufps    $0xdd,%%xmm1,%%xmm2             \n"
     "pavgb     %%xmm2,%%xmm0                   \n"
     "sub       $0x4,%3                         \n"
-    "movdqa    %%xmm0," MEMACCESS(2) "         \n"
+    "movdqu    %%xmm0," MEMACCESS(2) "         \n"
     "lea       " MEMLEA(0x10,2) ",%2           \n"
     "jg        1b                              \n"
   : "+r"(src_argb),   // %0

@@ -1156,14 +1050,14 @@ void ScaleARGBColsUp2_SSE2(uint8* dst_argb, const uint8* src_argb,
   asm volatile (
     LABELALIGN
   "1:                                          \n"
-    "movdqa    " MEMACCESS(1) ",%%xmm0         \n"
+    "movdqu    " MEMACCESS(1) ",%%xmm0         \n"
     "lea       " MEMLEA(0x10,1) ",%1           \n"
     "movdqa    %%xmm0,%%xmm1                   \n"
     "punpckldq %%xmm0,%%xmm0                   \n"
     "punpckhdq %%xmm1,%%xmm1                   \n"
     "sub       $0x8,%2                         \n"
-    "movdqa    %%xmm0," MEMACCESS(0) "         \n"
-    "movdqa    %%xmm1," MEMACCESS2(0x10,0) "   \n"
+    "movdqu    %%xmm0," MEMACCESS(0) "         \n"
+    "movdqu    %%xmm1," MEMACCESS2(0x10,0) "   \n"
     "lea       " MEMLEA(0x20,0) ",%0           \n"
     "jg        1b                              \n"
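
The kernels above keep their arithmetic; only the memory operations change. As a cross-check of what ScaleRowDown2 computes, a hedged intrinsics rendering of the same 32-bytes-in, 16-bytes-out step (illustration only; libyuv ships the asm versions shown in this diff):

#include <emmintrin.h>
#include <stdint.h>

// Keep every second pixel: psrlw 8 moves the odd byte of each 16-bit lane
// into the low byte, and packuswb packs two word registers back to bytes.
void ScaleRowDown2_Sketch(const uint8_t* src_ptr, uint8_t* dst_ptr,
                          int dst_width) {  // assumes dst_width % 16 == 0
  for (int x = 0; x < dst_width; x += 16) {
    __m128i a = _mm_loadu_si128((const __m128i*)(src_ptr + 2 * x));  // movdqu
    __m128i b = _mm_loadu_si128((const __m128i*)(src_ptr + 2 * x + 16));
    a = _mm_srli_epi16(a, 8);  // isolate odd-indexed pixels
    b = _mm_srli_epi16(b, 8);
    _mm_storeu_si128((__m128i*)(dst_ptr + x), _mm_packus_epi16(a, b));
  }
}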
@@ -103,117 +103,6 @@ void ScaleRowDown2_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
     mov        edx, [esp + 12]       // dst_ptr
     mov        ecx, [esp + 16]       // dst_width
 
-    align      4
-  wloop:
-    movdqa     xmm0, [eax]
-    movdqa     xmm1, [eax + 16]
-    lea        eax,  [eax + 32]
-    psrlw      xmm0, 8               // isolate odd pixels.
-    psrlw      xmm1, 8
-    packuswb   xmm0, xmm1
-    sub        ecx, 16
-    movdqa     [edx], xmm0
-    lea        edx, [edx + 16]
-    jg         wloop
-
-    ret
-  }
-}
-
-// Blends 32x1 rectangle to 16x1.
-// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned.
-__declspec(naked) __declspec(align(16))
-void ScaleRowDown2Linear_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
-                              uint8* dst_ptr, int dst_width) {
-  __asm {
-    mov        eax, [esp + 4]        // src_ptr
-                                     // src_stride
-    mov        edx, [esp + 12]       // dst_ptr
-    mov        ecx, [esp + 16]       // dst_width
-    pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
-    psrlw      xmm5, 8
-
-    align      4
-  wloop:
-    movdqa     xmm0, [eax]
-    movdqa     xmm1, [eax + 16]
-    lea        eax,  [eax + 32]
-    movdqa     xmm2, xmm0            // average columns (32 to 16 pixels)
-    psrlw      xmm0, 8
-    movdqa     xmm3, xmm1
-    psrlw      xmm1, 8
-    pand       xmm2, xmm5
-    pand       xmm3, xmm5
-    pavgw      xmm0, xmm2
-    pavgw      xmm1, xmm3
-    packuswb   xmm0, xmm1
-    sub        ecx, 16
-    movdqa     [edx], xmm0
-    lea        edx, [edx + 16]
-    jg         wloop
-
-    ret
-  }
-}
-
-// Blends 32x2 rectangle to 16x1.
-// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned.
-__declspec(naked) __declspec(align(16))
-void ScaleRowDown2Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
-                           uint8* dst_ptr, int dst_width) {
-  __asm {
-    push       esi
-    mov        eax, [esp + 4 + 4]    // src_ptr
-    mov        esi, [esp + 4 + 8]    // src_stride
-    mov        edx, [esp + 4 + 12]   // dst_ptr
-    mov        ecx, [esp + 4 + 16]   // dst_width
-    pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
-    psrlw      xmm5, 8
-
-    align      4
-  wloop:
-    movdqa     xmm0, [eax]
-    movdqa     xmm1, [eax + 16]
-    movdqa     xmm2, [eax + esi]
-    movdqa     xmm3, [eax + esi + 16]
-    lea        eax,  [eax + 32]
-    pavgb      xmm0, xmm2            // average rows
-    pavgb      xmm1, xmm3
-    movdqa     xmm2, xmm0            // average columns (32 to 16 pixels)
-    psrlw      xmm0, 8
-    movdqa     xmm3, xmm1
-    psrlw      xmm1, 8
-    pand       xmm2, xmm5
-    pand       xmm3, xmm5
-    pavgw      xmm0, xmm2
-    pavgw      xmm1, xmm3
-    packuswb   xmm0, xmm1
-    sub        ecx, 16
-    movdqa     [edx], xmm0
-    lea        edx, [edx + 16]
-    jg         wloop
-
-    pop        esi
-    ret
-  }
-}
-
-// Reads 32 pixels, throws half away and writes 16 pixels.
-// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned.
-__declspec(naked) __declspec(align(16))
-void ScaleRowDown2_Unaligned_SSE2(const uint8* src_ptr,
-                                  ptrdiff_t src_stride,
-                                  uint8* dst_ptr, int dst_width) {
-  __asm {
-    mov        eax, [esp + 4]        // src_ptr
-                                     // src_stride ignored
-    mov        edx, [esp + 12]       // dst_ptr
-    mov        ecx, [esp + 16]       // dst_width
-
     align      4
   wloop:
     movdqu     xmm0, [eax]

@@ -234,8 +123,7 @@ void ScaleRowDown2_Unaligned_SSE2(const uint8* src_ptr,
 // Blends 32x1 rectangle to 16x1.
 // Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned.
 __declspec(naked) __declspec(align(16))
-void ScaleRowDown2Linear_Unaligned_SSE2(const uint8* src_ptr,
-                                        ptrdiff_t src_stride,
+void ScaleRowDown2Linear_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
                               uint8* dst_ptr, int dst_width) {
   __asm {
     mov        eax, [esp + 4]        // src_ptr

@@ -273,8 +161,7 @@ void ScaleRowDown2Linear_Unaligned_SSE2(const uint8* src_ptr,
 // Blends 32x2 rectangle to 16x1.
 // Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned.
 __declspec(naked) __declspec(align(16))
-void ScaleRowDown2Box_Unaligned_SSE2(const uint8* src_ptr,
-                                     ptrdiff_t src_stride,
+void ScaleRowDown2Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
                            uint8* dst_ptr, int dst_width) {
   __asm {
     push       esi

@@ -331,8 +218,8 @@ void ScaleRowDown4_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
     align      4
   wloop:
-    movdqa     xmm0, [eax]
-    movdqa     xmm1, [eax + 16]
+    movdqu     xmm0, [eax]
+    movdqu     xmm1, [eax + 16]
     lea        eax,  [eax + 32]
     pand       xmm0, xmm5
     pand       xmm1, xmm5

@@ -366,16 +253,16 @@ void ScaleRowDown4Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
     align      4
   wloop:
-    movdqa     xmm0, [eax]
-    movdqa     xmm1, [eax + 16]
-    movdqa     xmm2, [eax + esi]
-    movdqa     xmm3, [eax + esi + 16]
+    movdqu     xmm0, [eax]
+    movdqu     xmm1, [eax + 16]
+    movdqu     xmm2, [eax + esi]
+    movdqu     xmm3, [eax + esi + 16]
     pavgb      xmm0, xmm2            // average rows
     pavgb      xmm1, xmm3
-    movdqa     xmm2, [eax + esi * 2]
-    movdqa     xmm3, [eax + esi * 2 + 16]
-    movdqa     xmm4, [eax + edi]
-    movdqa     xmm5, [eax + edi + 16]
+    movdqu     xmm2, [eax + esi * 2]
+    movdqu     xmm3, [eax + esi * 2 + 16]
+    movdqu     xmm4, [eax + edi]
+    movdqu     xmm5, [eax + edi + 16]
     lea        eax,  [eax + 32]
     pavgb      xmm2, xmm4
     pavgb      xmm3, xmm5

@@ -429,8 +316,8 @@ void ScaleRowDown34_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
     align      4
   wloop:
-    movdqa     xmm0, [eax]
-    movdqa     xmm1, [eax + 16]
+    movdqu     xmm0, [eax]
+    movdqu     xmm1, [eax + 16]
     lea        eax,  [eax + 32]
     movdqa     xmm2, xmm1
     palignr    xmm1, xmm0, 8

@@ -483,8 +370,8 @@ void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr,
     align      4
   wloop:
-    movdqa     xmm0, [eax]           // pixels 0..7
-    movdqa     xmm1, [eax + esi]
+    movdqu     xmm0, [eax]           // pixels 0..7
+    movdqu     xmm1, [eax + esi]
     pavgb      xmm0, xmm1
     pshufb     xmm0, xmm2
     pmaddubsw  xmm0, xmm5

@@ -501,8 +388,8 @@ void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr,
     psrlw      xmm0, 2
     packuswb   xmm0, xmm0
     movq       qword ptr [edx + 8], xmm0
-    movdqa     xmm0, [eax + 16]      // pixels 16..23
-    movdqa     xmm1, [eax + esi + 16]
+    movdqu     xmm0, [eax + 16]      // pixels 16..23
+    movdqu     xmm1, [eax + esi + 16]
     lea        eax,  [eax + 32]
     pavgb      xmm0, xmm1
     pshufb     xmm0, xmm4

@@ -542,8 +429,8 @@ void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr,
     align      4
   wloop:
-    movdqa     xmm0, [eax]           // pixels 0..7
-    movdqa     xmm1, [eax + esi]
+    movdqu     xmm0, [eax]           // pixels 0..7
+    movdqu     xmm1, [eax + esi]
     pavgb      xmm1, xmm0
     pavgb      xmm0, xmm1
     pshufb     xmm0, xmm2

@@ -562,8 +449,8 @@ void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr,
     psrlw      xmm0, 2
     packuswb   xmm0, xmm0
     movq       qword ptr [edx + 8], xmm0
-    movdqa     xmm0, [eax + 16]      // pixels 16..23
-    movdqa     xmm1, [eax + esi + 16]
+    movdqu     xmm0, [eax + 16]      // pixels 16..23
+    movdqu     xmm1, [eax + esi + 16]
     lea        eax,  [eax + 32]
     pavgb      xmm1, xmm0
     pavgb      xmm0, xmm1

@@ -599,8 +486,8 @@ void ScaleRowDown38_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
     align      4
   xloop:
-    movdqa     xmm0, [eax]           // 16 pixels -> 0,1,2,3,4,5
-    movdqa     xmm1, [eax + 16]      // 16 pixels -> 6,7,8,9,10,11
+    movdqu     xmm0, [eax]           // 16 pixels -> 0,1,2,3,4,5
+    movdqu     xmm1, [eax + 16]      // 16 pixels -> 6,7,8,9,10,11
     lea        eax,  [eax + 32]
     pshufb     xmm0, xmm4
     pshufb     xmm1, xmm5

@@ -635,8 +522,8 @@ void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr,
     align      4
   xloop:
-    movdqa     xmm0, [eax]           // sum up 3 rows into xmm0/1
-    movdqa     xmm6, [eax + esi]
+    movdqu     xmm0, [eax]           // sum up 3 rows into xmm0/1
+    movdqu     xmm6, [eax + esi]
     movhlps    xmm1, xmm0
     movhlps    xmm7, xmm6
     punpcklbw  xmm0, xmm5

@@ -645,7 +532,7 @@ void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr,
     punpcklbw  xmm7, xmm5
     paddusw    xmm0, xmm6
     paddusw    xmm1, xmm7
-    movdqa     xmm6, [eax + esi * 2]
+    movdqu     xmm6, [eax + esi * 2]
     lea        eax,  [eax + 16]
     movhlps    xmm7, xmm6
     punpcklbw  xmm6, xmm5

@@ -701,7 +588,7 @@ void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr,
     align      4
   xloop:
-    movdqa     xmm0, [eax]           // average 2 rows into xmm0
+    movdqu     xmm0, [eax]           // average 2 rows into xmm0
     pavgb      xmm0, [eax + esi]
     lea        eax,  [eax + 16]

@@ -750,7 +637,7 @@ void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
     align      4
   xloop:
     // first row
-    movdqa     xmm0, [esi]
+    movdqu     xmm0, [esi]
     lea        eax,  [esi + edx]
     movdqa     xmm1, xmm0
     punpcklbw  xmm0, xmm4

@@ -763,7 +650,7 @@ void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
     // sum remaining rows
     align      4
   yloop:
-    movdqa     xmm2, [eax]           // read 16 pixels
+    movdqu     xmm2, [eax]           // read 16 pixels
     lea        eax,  [eax + edx]     // advance to next row
     movdqa     xmm3, xmm2
     punpcklbw  xmm2, xmm4

@@ -775,8 +662,8 @@ void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
     align      4
   ydone:
-    movdqa     [edi], xmm0
-    movdqa     [edi + 16], xmm1
+    movdqu     [edi], xmm0
+    movdqu     [edi + 16], xmm1
     lea        edi, [edi + 32]
     sub        ecx, 16

@@ -891,14 +778,14 @@ void ScaleColsUp2_SSE2(uint8* dst_ptr, const uint8* src_ptr,
     align      4
   wloop:
-    movdqa     xmm0, [eax]
+    movdqu     xmm0, [eax]
     lea        eax,  [eax + 16]
     movdqa     xmm1, xmm0
     punpcklbw  xmm0, xmm0
     punpckhbw  xmm1, xmm1
     sub        ecx, 32
-    movdqa     [edx], xmm0
-    movdqa     [edx + 16], xmm1
+    movdqu     [edx], xmm0
+    movdqu     [edx + 16], xmm1
     lea        edx, [edx + 32]
     jg         wloop

@@ -920,12 +807,12 @@ void ScaleARGBRowDown2_SSE2(const uint8* src_argb,
     align      4
   wloop:
-    movdqa     xmm0, [eax]
-    movdqa     xmm1, [eax + 16]
+    movdqu     xmm0, [eax]
+    movdqu     xmm1, [eax + 16]
     lea        eax,  [eax + 32]
     shufps     xmm0, xmm1, 0xdd
     sub        ecx, 4
-    movdqa     [edx], xmm0
+    movdqu     [edx], xmm0
     lea        edx, [edx + 16]
     jg         wloop

@@ -947,15 +834,15 @@ void ScaleARGBRowDown2Linear_SSE2(const uint8* src_argb,
     align      4
   wloop:
-    movdqa     xmm0, [eax]
-    movdqa     xmm1, [eax + 16]
+    movdqu     xmm0, [eax]
+    movdqu     xmm1, [eax + 16]
     lea        eax,  [eax + 32]
     movdqa     xmm2, xmm0
     shufps     xmm0, xmm1, 0x88      // even pixels
     shufps     xmm2, xmm1, 0xdd      // odd pixels
     pavgb      xmm0, xmm2
     sub        ecx, 4
-    movdqa     [edx], xmm0
+    movdqu     [edx], xmm0
     lea        edx, [edx + 16]
     jg         wloop

@@ -978,10 +865,10 @@ void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb,
     align      4
   wloop:
-    movdqa     xmm0, [eax]
-    movdqa     xmm1, [eax + 16]
-    movdqa     xmm2, [eax + esi]
-    movdqa     xmm3, [eax + esi + 16]
+    movdqu     xmm0, [eax]
+    movdqu     xmm1, [eax + 16]
+    movdqu     xmm2, [eax + esi]
+    movdqu     xmm3, [eax + esi + 16]
     lea        eax,  [eax + 32]
     pavgb      xmm0, xmm2            // average rows
     pavgb      xmm1, xmm3

@@ -990,7 +877,7 @@ void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb,
     shufps     xmm2, xmm1, 0xdd      // odd pixels
     pavgb      xmm0, xmm2
     sub        ecx, 4
-    movdqa     [edx], xmm0
+    movdqu     [edx], xmm0
     lea        edx, [edx + 16]
     jg         wloop

@@ -1027,7 +914,7 @@ void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
     punpckldq  xmm2, xmm3
     punpcklqdq xmm0, xmm2
     sub        ecx, 4
-    movdqa     [edx], xmm0
+    movdqu     [edx], xmm0
     lea        edx, [edx + 16]
     jg         wloop

@@ -1076,7 +963,7 @@ void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb,
     shufps     xmm2, xmm1, 0xdd      // odd pixels
     pavgb      xmm0, xmm2
     sub        ecx, 4
-    movdqa     [edx], xmm0
+    movdqu     [edx], xmm0
     lea        edx, [edx + 16]
     jg         wloop

@@ -1267,14 +1154,14 @@ void ScaleARGBColsUp2_SSE2(uint8* dst_argb, const uint8* src_argb,
     align      4
   wloop:
-    movdqa     xmm0, [eax]
+    movdqu     xmm0, [eax]
     lea        eax,  [eax + 16]
     movdqa     xmm1, xmm0
     punpckldq  xmm0, xmm0
     punpckhdq  xmm1, xmm1
     sub        ecx, 8
-    movdqa     [edx], xmm0
-    movdqa     [edx + 16], xmm1
+    movdqu     [edx], xmm0
+    movdqu     [edx + 16], xmm1
     lea        edx, [edx + 32]
     jg         wloop