Commit 04f40278 authored by fbarchard@google.com

yasm ALIGN uppercase

BUG=none
TEST=untested
R=tpsiaki@google.com

Review URL: https://webrtc-codereview.appspot.com/4769005

git-svn-id: http://libyuv.googlecode.com/svn/trunk@885 16f28f9a-4ce2-e073-06de-1de4eb20be90
parent 545a51c1
 Name: libyuv
 URL: http://code.google.com/p/libyuv/
-Version: 884
+Version: 885
 License: BSD
 License File: LICENSE
...
/*
 * Copyright 2012 The LibYuv Project Authors. All rights reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */

#ifndef INCLUDE_LIBYUV_CONVERT_FROM_ARGB_H_  // NOLINT
#define INCLUDE_LIBYUV_CONVERT_FROM_ARGB_H_

#include "libyuv/basic_types.h"

#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif

// Copy ARGB to ARGB.
#define ARGBToARGB ARGBCopy
LIBYUV_API
int ARGBCopy(const uint8* src_argb, int src_stride_argb,
             uint8* dst_argb, int dst_stride_argb,
             int width, int height);

// Convert ARGB To BGRA. (alias)
#define ARGBToBGRA BGRAToARGB
LIBYUV_API
int BGRAToARGB(const uint8* src_frame, int src_stride_frame,
               uint8* dst_argb, int dst_stride_argb,
               int width, int height);

// Convert ARGB To ABGR. (alias)
#define ARGBToABGR ABGRToARGB
LIBYUV_API
int ABGRToARGB(const uint8* src_frame, int src_stride_frame,
               uint8* dst_argb, int dst_stride_argb,
               int width, int height);

// Convert ARGB To RGBA.
LIBYUV_API
int ARGBToRGBA(const uint8* src_frame, int src_stride_frame,
               uint8* dst_argb, int dst_stride_argb,
               int width, int height);

// Convert ARGB To RGB24.
LIBYUV_API
int ARGBToRGB24(const uint8* src_argb, int src_stride_argb,
                uint8* dst_rgb24, int dst_stride_rgb24,
                int width, int height);

// Convert ARGB To RAW.
LIBYUV_API
int ARGBToRAW(const uint8* src_argb, int src_stride_argb,
              uint8* dst_rgb, int dst_stride_rgb,
              int width, int height);

// Convert ARGB To RGB565.
LIBYUV_API
int ARGBToRGB565(const uint8* src_argb, int src_stride_argb,
                 uint8* dst_rgb565, int dst_stride_rgb565,
                 int width, int height);

// Convert ARGB To ARGB1555.
LIBYUV_API
int ARGBToARGB1555(const uint8* src_argb, int src_stride_argb,
                   uint8* dst_argb1555, int dst_stride_argb1555,
                   int width, int height);

// Convert ARGB To ARGB4444.
LIBYUV_API
int ARGBToARGB4444(const uint8* src_argb, int src_stride_argb,
                   uint8* dst_argb4444, int dst_stride_argb4444,
                   int width, int height);

// Convert ARGB To I444.
LIBYUV_API
int ARGBToI444(const uint8* src_argb, int src_stride_argb,
               uint8* dst_y, int dst_stride_y,
               uint8* dst_u, int dst_stride_u,
               uint8* dst_v, int dst_stride_v,
               int width, int height);

// Convert ARGB To I422.
LIBYUV_API
int ARGBToI422(const uint8* src_argb, int src_stride_argb,
               uint8* dst_y, int dst_stride_y,
               uint8* dst_u, int dst_stride_u,
               uint8* dst_v, int dst_stride_v,
               int width, int height);

// Convert ARGB To I420. (also in convert.h)
LIBYUV_API
int ARGBToI420(const uint8* src_argb, int src_stride_argb,
               uint8* dst_y, int dst_stride_y,
               uint8* dst_u, int dst_stride_u,
               uint8* dst_v, int dst_stride_v,
               int width, int height);

// Convert ARGB to J420. (JPeg full range I420).
LIBYUV_API
int ARGBToJ420(const uint8* src_argb, int src_stride_argb,
               uint8* dst_yj, int dst_stride_yj,
               uint8* dst_u, int dst_stride_u,
               uint8* dst_v, int dst_stride_v,
               int width, int height);

// Convert ARGB To I411.
LIBYUV_API
int ARGBToI411(const uint8* src_argb, int src_stride_argb,
               uint8* dst_y, int dst_stride_y,
               uint8* dst_u, int dst_stride_u,
               uint8* dst_v, int dst_stride_v,
               int width, int height);

// Convert ARGB to J400. (JPeg full range).
LIBYUV_API
int ARGBToJ400(const uint8* src_argb, int src_stride_argb,
               uint8* dst_yj, int dst_stride_yj,
               int width, int height);

// Convert ARGB to I400.
LIBYUV_API
int ARGBToI400(const uint8* src_argb, int src_stride_argb,
               uint8* dst_y, int dst_stride_y,
               int width, int height);

// Convert ARGB To NV12.
LIBYUV_API
int ARGBToNV12(const uint8* src_argb, int src_stride_argb,
               uint8* dst_y, int dst_stride_y,
               uint8* dst_uv, int dst_stride_uv,
               int width, int height);

// Convert ARGB To NV21.
LIBYUV_API
int ARGBToNV21(const uint8* src_argb, int src_stride_argb,
               uint8* dst_y, int dst_stride_y,
               uint8* dst_vu, int dst_stride_vu,
               int width, int height);
// Convert ARGB To YUY2.
LIBYUV_API
int ARGBToYUY2(const uint8* src_argb, int src_stride_argb,
               uint8* dst_yuy2, int dst_stride_yuy2,
               int width, int height);

// Convert ARGB To UYVY.
LIBYUV_API
int ARGBToUYVY(const uint8* src_argb, int src_stride_argb,
               uint8* dst_uyvy, int dst_stride_uyvy,
               int width, int height);

#ifdef __cplusplus
}  // extern "C"
}  // namespace libyuv
#endif

#endif  // INCLUDE_LIBYUV_CONVERT_FROM_ARGB_H_  NOLINT
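For orientation, here is a minimal caller for one of the declarations above, converting a 64x48 ARGB frame to I420. This is an illustrative sketch, not code from this commit; buffer sizing follows the usual I420 layout (full-size Y plane, quarter-size U and V planes).

#include <vector>
#include "libyuv/convert_from_argb.h"

int main() {
  const int width = 64, height = 48;
  std::vector<uint8> argb(width * height * 4);       // 4 bytes per ARGB pixel
  std::vector<uint8> y(width * height);
  std::vector<uint8> u((width / 2) * (height / 2));  // 2x2 subsampled chroma
  std::vector<uint8> v((width / 2) * (height / 2));
  // Strides are bytes per row of each plane; returns 0 on success.
  return libyuv::ARGBToI420(argb.data(), width * 4,
                            y.data(), width,
                            u.data(), width / 2,
                            v.data(), width / 2,
                            width, height);
}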
@@ -11,6 +11,6 @@
 #ifndef INCLUDE_LIBYUV_VERSION_H_  // NOLINT
 #define INCLUDE_LIBYUV_VERSION_H_

-#define LIBYUV_VERSION 884
+#define LIBYUV_VERSION 885

 #endif  // INCLUDE_LIBYUV_VERSION_H_  NOLINT
/*
 * Copyright 2012 The LibYuv Project Authors. All rights reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */

#include "libyuv/basic_types.h"

#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif

uint32 SumSquareError_C(const uint8* src_a, const uint8* src_b, int count) {
  uint32 sse = 0u;
  for (int i = 0; i < count; ++i) {
    int diff = src_a[i] - src_b[i];
    sse += static_cast<uint32>(diff * diff);
  }
  return sse;
}

// hash seed of 5381 recommended.
// Internal C version of HashDjb2 with int sized count for efficiency.
uint32 HashDjb2_C(const uint8* src, int count, uint32 seed) {
  uint32 hash = seed;
  for (int i = 0; i < count; ++i) {
    hash += (hash << 5) + src[i];
  }
  return hash;
}

#ifdef __cplusplus
}  // extern "C"
}  // namespace libyuv
#endif
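The update hash += (hash << 5) + src[i] is the classic djb2 step: hash << 5 is hash * 32, so each byte folds in as hash = hash * 33 + src[i], reduced mod 2^32 by uint32 arithmetic. A standalone sketch (mine, not libyuv code) checking that the two spellings agree:

#include <cassert>
#include <cstdint>

static uint32_t djb2_shift(const uint8_t* p, int n, uint32_t h) {
  for (int i = 0; i < n; ++i) h += (h << 5) + p[i];  // h*32 + h + p[i]
  return h;
}

static uint32_t djb2_mul(const uint8_t* p, int n, uint32_t h) {
  for (int i = 0; i < n; ++i) h = h * 33 + p[i];
  return h;
}

int main() {
  const uint8_t data[] = {1, 2, 3, 250, 251, 252};
  assert(djb2_shift(data, 6, 5381) == djb2_mul(data, 6, 5381));
  return 0;
}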
/*
 * Copyright 2012 The LibYuv Project Authors. All rights reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */

#include "libyuv/basic_types.h"

#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif

#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__)

uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count) {
  volatile uint32 sse;
  asm volatile (
    "vmov.u8    q8, #0                         \n"
    "vmov.u8    q10, #0                        \n"
    "vmov.u8    q9, #0                         \n"
    "vmov.u8    q11, #0                        \n"

    ".p2align   2                              \n"
  "1:                                          \n"
    "vld1.8     {q0}, [%0]!                    \n"
    "vld1.8     {q1}, [%1]!                    \n"
    "subs       %2, %2, #16                    \n"
    "vsubl.u8   q2, d0, d2                     \n"
    "vsubl.u8   q3, d1, d3                     \n"
    "vmlal.s16  q8, d4, d4                     \n"
    "vmlal.s16  q9, d6, d6                     \n"
    "vmlal.s16  q10, d5, d5                    \n"
    "vmlal.s16  q11, d7, d7                    \n"
    "bgt        1b                             \n"

    "vadd.u32   q8, q8, q9                     \n"
    "vadd.u32   q10, q10, q11                  \n"
    "vadd.u32   q11, q8, q10                   \n"
    "vpaddl.u32 q1, q11                        \n"
    "vadd.u64   d0, d2, d3                     \n"
    "vmov.32    %3, d0[0]                      \n"
    : "+r"(src_a),
      "+r"(src_b),
      "+r"(count),
      "=r"(sse)
    :
    : "memory", "cc", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11");
  return sse;
}

#endif  // __ARM_NEON__

#ifdef __cplusplus
}  // extern "C"
}  // namespace libyuv
#endif
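The NEON loop keeps four independent accumulators (q8..q11) so consecutive vmlal multiply-accumulates do not all serialize on a single destination register. A scalar sketch of the same idea (my illustration, not libyuv code):

#include <cassert>
#include <cstdint>

// Sum of squared differences with four independent accumulators,
// mirroring the q8..q11 split above.  Assumes count % 4 == 0.
static uint32_t sse_4acc(const uint8_t* a, const uint8_t* b, int count) {
  uint32_t s0 = 0, s1 = 0, s2 = 0, s3 = 0;
  for (int i = 0; i < count; i += 4) {
    int d0 = a[i] - b[i], d1 = a[i + 1] - b[i + 1];
    int d2 = a[i + 2] - b[i + 2], d3 = a[i + 3] - b[i + 3];
    s0 += d0 * d0;  // four independent dependency chains
    s1 += d1 * d1;
    s2 += d2 * d2;
    s3 += d3 * d3;
  }
  return s0 + s1 + s2 + s3;  // fold, like the vadd.u32/vpaddl.u32 tail
}

int main() {
  const uint8_t a[4] = {10, 0, 255, 7}, b[4] = {7, 3, 250, 7};
  assert(sse_4acc(a, b, 4) == 9u + 9u + 25u + 0u);
  return 0;
}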
/*
 * Copyright 2012 The LibYuv Project Authors. All rights reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */

#include "libyuv/basic_types.h"
#include "libyuv/row.h"

#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif

#if !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__))

#if defined(__native_client__) && defined(__x86_64__)
#define MEMACCESS(base) "%%nacl:(%%r15,%q" #base ")"
#define MEMLEA(offset, base) #offset "(%q" #base ")"
#else
#define MEMACCESS(base) "(%" #base ")"
#define MEMLEA(offset, base) #offset "(%" #base ")"
#endif
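The Native Client variants route every memory operand through the %r15 sandbox base, while the plain variants paste ordinary (%reg) and disp(%reg) operands into the inline-asm templates below. A tiny standalone sketch (mine, not libyuv code) printing what the non-NaCl macros concatenate at compile time:

#include <cstdio>

// Non-NaCl variants copied from above; adjacent string literals merge,
// so "movdqa " MEMACCESS(0) ",%%xmm1" becomes one asm template string.
#define MEMACCESS(base) "(%" #base ")"
#define MEMLEA(offset, base) #offset "(%" #base ")"

int main() {
  std::puts("movdqa " MEMACCESS(0) ",%%xmm1");  // movdqa (%0),%%xmm1
  std::puts("lea " MEMLEA(0x10, 0) ",%0");      // lea 0x10(%0),%0
  return 0;
}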
uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count) {
  uint32 sse;
  asm volatile (  // NOLINT
    "pxor      %%xmm0,%%xmm0                   \n"
    "pxor      %%xmm5,%%xmm5                   \n"
    ".p2align  2                               \n"
  "1:                                          \n"
    "movdqa    " MEMACCESS(0) ",%%xmm1         \n"
    "lea       " MEMLEA(0x10, 0) ",%0          \n"
    "movdqa    " MEMACCESS(1) ",%%xmm2         \n"
    "lea       " MEMLEA(0x10, 1) ",%1          \n"
    "sub       $0x10,%2                        \n"
    "movdqa    %%xmm1,%%xmm3                   \n"
    "psubusb   %%xmm2,%%xmm1                   \n"
    "psubusb   %%xmm3,%%xmm2                   \n"
    "por       %%xmm2,%%xmm1                   \n"
    "movdqa    %%xmm1,%%xmm2                   \n"
    "punpcklbw %%xmm5,%%xmm1                   \n"
    "punpckhbw %%xmm5,%%xmm2                   \n"
    "pmaddwd   %%xmm1,%%xmm1                   \n"
    "pmaddwd   %%xmm2,%%xmm2                   \n"
    "paddd     %%xmm1,%%xmm0                   \n"
    "paddd     %%xmm2,%%xmm0                   \n"
    "jg        1b                              \n"

    "pshufd    $0xee,%%xmm0,%%xmm1             \n"
    "paddd     %%xmm1,%%xmm0                   \n"
    "pshufd    $0x1,%%xmm0,%%xmm1              \n"
    "paddd     %%xmm1,%%xmm0                   \n"
    "movd      %%xmm0,%3                       \n"
    : "+r"(src_a),      // %0
      "+r"(src_b),      // %1
      "+r"(count),      // %2
      "=g"(sse)         // %3
    :
    : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
#endif
  );  // NOLINT
  return sse;
}

#endif  // defined(__x86_64__) || defined(__i386__)
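SSE2 has no per-byte absolute-difference instruction that leaves the individual bytes available for squaring, so the loop builds |a-b| from two saturating subtractions ORed together: psubusb clamps a negative result to zero, so exactly one direction survives. The identity, checked exhaustively in scalar form (my sketch, not libyuv code):

#include <cassert>
#include <cstdint>

// Saturating unsigned subtract, as psubusb does for each byte lane.
static uint8_t subus(uint8_t a, uint8_t b) {
  return a > b ? static_cast<uint8_t>(a - b) : 0;
}

int main() {
  // For every byte pair, subus(a,b) | subus(b,a) == |a - b|:
  // one direction underflows to 0, the other holds the magnitude.
  for (int a = 0; a < 256; ++a) {
    for (int b = 0; b < 256; ++b) {
      uint8_t absdiff = subus(a, b) | subus(b, a);
      assert(absdiff == (a > b ? a - b : b - a));
    }
  }
  return 0;
}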
#if !defined(LIBYUV_DISABLE_X86) && \
    (defined(__x86_64__) || (defined(__i386__) && !defined(__pic__)))
#define HAS_HASHDJB2_SSE41
static uvec32 kHash16x33 = { 0x92d9e201, 0, 0, 0 };  // 33 ^ 16
static uvec32 kHashMul0 = {
  0x0c3525e1,  // 33 ^ 15
  0xa3476dc1,  // 33 ^ 14
  0x3b4039a1,  // 33 ^ 13
  0x4f5f0981,  // 33 ^ 12
};
static uvec32 kHashMul1 = {
  0x30f35d61,  // 33 ^ 11
  0x855cb541,  // 33 ^ 10
  0x040a9121,  // 33 ^ 9
  0x747c7101,  // 33 ^ 8
};
static uvec32 kHashMul2 = {
  0xec41d4e1,  // 33 ^ 7
  0x4cfa3cc1,  // 33 ^ 6
  0x025528a1,  // 33 ^ 5
  0x00121881,  // 33 ^ 4
};
static uvec32 kHashMul3 = {
  0x00008c61,  // 33 ^ 3
  0x00000441,  // 33 ^ 2
  0x00000021,  // 33 ^ 1
  0x00000001,  // 33 ^ 0
};
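These tables unroll djb2 sixteen bytes at a time: one block step computes hash = hash * 33^16 + sum over i of src[i] * 33^(15-i), all mod 2^32, which is what the pmulld/paddd sequence below evaluates four dwords per table. A quick standalone check (mine, not libyuv code) that the entries are powers of 33 reduced mod 2^32:

#include <cassert>
#include <cstdint>
#include <cstdio>

int main() {
  // powers[k] = 33^k mod 2^32; compare against the kHashMul tables above.
  uint32_t p = 1;
  uint32_t powers[17];
  for (int k = 0; k <= 16; ++k) {
    powers[k] = p;
    p *= 33;  // uint32_t arithmetic reduces mod 2^32 automatically
  }
  assert(powers[1] == 0x00000021u);   // kHashMul3, 33 ^ 1
  assert(powers[4] == 0x00121881u);   // kHashMul2, 33 ^ 4
  assert(powers[8] == 0x747c7101u);   // kHashMul1, 33 ^ 8
  assert(powers[15] == 0x0c3525e1u);  // kHashMul0, 33 ^ 15
  assert(powers[16] == 0x92d9e201u);  // kHash16x33
  std::puts("constants match 33^k mod 2^32");
  return 0;
}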
uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed) {
  uint32 hash;
  asm volatile (  // NOLINT
    "movd      %2,%%xmm0                       \n"
    "pxor      %%xmm7,%%xmm7                   \n"
    "movdqa    %4,%%xmm6                       \n"
    ".p2align  2                               \n"
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm1         \n"
    "lea       " MEMLEA(0x10, 0) ",%0          \n"
    "pmulld    %%xmm6,%%xmm0                   \n"
    "movdqa    %5,%%xmm5                       \n"
    "movdqa    %%xmm1,%%xmm2                   \n"
    "punpcklbw %%xmm7,%%xmm2                   \n"
    "movdqa    %%xmm2,%%xmm3                   \n"
    "punpcklwd %%xmm7,%%xmm3                   \n"
    "pmulld    %%xmm5,%%xmm3                   \n"
    "movdqa    %6,%%xmm5                       \n"
    "movdqa    %%xmm2,%%xmm4                   \n"
    "punpckhwd %%xmm7,%%xmm4                   \n"
    "pmulld    %%xmm5,%%xmm4                   \n"
    "movdqa    %7,%%xmm5                       \n"
    "punpckhbw %%xmm7,%%xmm1                   \n"
    "movdqa    %%xmm1,%%xmm2                   \n"
    "punpcklwd %%xmm7,%%xmm2                   \n"
    "pmulld    %%xmm5,%%xmm2                   \n"
    "movdqa    %8,%%xmm5                       \n"
    "punpckhwd %%xmm7,%%xmm1                   \n"
    "pmulld    %%xmm5,%%xmm1                   \n"
    "paddd     %%xmm4,%%xmm3                   \n"
    "paddd     %%xmm2,%%xmm1                   \n"
    "sub       $0x10,%1                        \n"
    "paddd     %%xmm3,%%xmm1                   \n"
    "pshufd    $0xe,%%xmm1,%%xmm2              \n"
    "paddd     %%xmm2,%%xmm1                   \n"
    "pshufd    $0x1,%%xmm1,%%xmm2              \n"
    "paddd     %%xmm2,%%xmm1                   \n"
    "paddd     %%xmm1,%%xmm0                   \n"
    "jg        1b                              \n"
    "movd      %%xmm0,%3                       \n"
    : "+r"(src),          // %0
      "+r"(count),        // %1
      "+rm"(seed),        // %2
      "=g"(hash)          // %3
    : "m"(kHash16x33),    // %4
      "m"(kHashMul0),     // %5
      "m"(kHashMul1),     // %6
      "m"(kHashMul2),     // %7
      "m"(kHashMul3)      // %8
    : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
#endif
  );  // NOLINT
  return hash;
}
#endif  // defined(__x86_64__) || (defined(__i386__) && !defined(__pic__)))

#ifdef __cplusplus
}  // extern "C"
}  // namespace libyuv
#endif
/*
 * Copyright 2012 The LibYuv Project Authors. All rights reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */

#include "libyuv/basic_types.h"
#include "libyuv/row.h"

#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif

#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)

__declspec(naked) __declspec(align(16))
uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count) {
  __asm {
    mov        eax, [esp + 4]    // src_a
    mov        edx, [esp + 8]    // src_b
    mov        ecx, [esp + 12]   // count
    pxor       xmm0, xmm0
    pxor       xmm5, xmm5

    align      4
  wloop:
    movdqa     xmm1, [eax]
    lea        eax,  [eax + 16]
    movdqa     xmm2, [edx]
    lea        edx,  [edx + 16]
    sub        ecx, 16
    movdqa     xmm3, xmm1  // abs trick
    psubusb    xmm1, xmm2
    psubusb    xmm2, xmm3
    por        xmm1, xmm2
    movdqa     xmm2, xmm1
    punpcklbw  xmm1, xmm5
    punpckhbw  xmm2, xmm5
    pmaddwd    xmm1, xmm1
    pmaddwd    xmm2, xmm2
    paddd      xmm0, xmm1
    paddd      xmm0, xmm2
    jg         wloop

    pshufd     xmm1, xmm0, 0xee
    paddd      xmm0, xmm1
    pshufd     xmm1, xmm0, 0x01
    paddd      xmm0, xmm1
    movd       eax, xmm0
    ret
  }
}

// Visual C 2012 required for AVX2.
#if _MSC_VER >= 1700
// C4752: found Intel(R) Advanced Vector Extensions; consider using /arch:AVX.
#pragma warning(disable: 4752)
__declspec(naked) __declspec(align(16))
uint32 SumSquareError_AVX2(const uint8* src_a, const uint8* src_b, int count) {
  __asm {
    mov        eax, [esp + 4]    // src_a
    mov        edx, [esp + 8]    // src_b
    mov        ecx, [esp + 12]   // count
    vpxor      ymm0, ymm0, ymm0  // sum
    vpxor      ymm5, ymm5, ymm5  // constant 0 for unpck
    sub        edx, eax

    align      4
  wloop:
    vmovdqu    ymm1, [eax]
    vmovdqu    ymm2, [eax + edx]
    lea        eax,  [eax + 32]
    sub        ecx, 32
    vpsubusb   ymm3, ymm1, ymm2  // abs difference trick
    vpsubusb   ymm2, ymm2, ymm1
    vpor       ymm1, ymm2, ymm3
    vpunpcklbw ymm2, ymm1, ymm5  // u16.  mutates order.
    vpunpckhbw ymm1, ymm1, ymm5
    vpmaddwd   ymm2, ymm2, ymm2  // square + hadd to u32.
    vpmaddwd   ymm1, ymm1, ymm1
    vpaddd     ymm0, ymm0, ymm1
    vpaddd     ymm0, ymm0, ymm2
    jg         wloop

    vpshufd    ymm1, ymm0, 0xee  // 3, 2 + 1, 0 both lanes.
    vpaddd     ymm0, ymm0, ymm1
    vpshufd    ymm1, ymm0, 0x01  // 1 + 0 both lanes.
    vpaddd     ymm0, ymm0, ymm1
    vpermq     ymm1, ymm0, 0x02  // high + low lane.
    vpaddd     ymm0, ymm0, ymm1
    vmovd      eax, xmm0
    vzeroupper
    ret
  }
}
#endif  // _MSC_VER >= 1700
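After the loop, ymm0 holds eight partial dword sums; the vpshufd 0xee / vpshufd 0x01 / vpermq 0x02 tail halves the active width each step until element 0 holds the total. The same logarithmic fold in scalar form (my sketch; the asm folds within lanes before crossing lanes, but addition commutes, so the order does not matter):

#include <cassert>
#include <cstdint>

// Fold 8 partial dword sums by repeated halving, adding the top half
// of the active range onto the bottom half each step.
static uint32_t fold8(uint32_t v[8]) {
  for (int i = 0; i < 4; ++i) v[i] += v[i + 4];  // high lane onto low (vpermq)
  v[0] += v[2]; v[1] += v[3];                    // upper pair onto lower (0xee)
  v[0] += v[1];                                  // element 1 onto 0 (0x01)
  return v[0];
}

int main() {
  uint32_t v[8] = {1, 2, 3, 4, 5, 6, 7, 8};
  assert(fold8(v) == 36u);
  return 0;
}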
#define HAS_HASHDJB2_SSE41
static uvec32 kHash16x33 = { 0x92d9e201, 0, 0, 0 };  // 33 ^ 16
static uvec32 kHashMul0 = {
  0x0c3525e1,  // 33 ^ 15
  0xa3476dc1,  // 33 ^ 14
  0x3b4039a1,  // 33 ^ 13
  0x4f5f0981,  // 33 ^ 12
};
static uvec32 kHashMul1 = {
  0x30f35d61,  // 33 ^ 11
  0x855cb541,  // 33 ^ 10
  0x040a9121,  // 33 ^ 9
  0x747c7101,  // 33 ^ 8
};
static uvec32 kHashMul2 = {
  0xec41d4e1,  // 33 ^ 7
  0x4cfa3cc1,  // 33 ^ 6
  0x025528a1,  // 33 ^ 5
  0x00121881,  // 33 ^ 4
};
static uvec32 kHashMul3 = {
  0x00008c61,  // 33 ^ 3
  0x00000441,  // 33 ^ 2
  0x00000021,  // 33 ^ 1
  0x00000001,  // 33 ^ 0
};

// 27: 66 0F 38 40 C6     pmulld      xmm0,xmm6
// 44: 66 0F 38 40 DD     pmulld      xmm3,xmm5
// 59: 66 0F 38 40 E5     pmulld      xmm4,xmm5
// 72: 66 0F 38 40 D5     pmulld      xmm2,xmm5
// 83: 66 0F 38 40 CD     pmulld      xmm1,xmm5
#define pmulld(reg) _asm _emit 0x66 _asm _emit 0x0F _asm _emit 0x38 \
    _asm _emit 0x40 _asm _emit reg
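The _emit macro hand-assembles pmulld because this era of MSVC inline asm does not know SSE4.1 mnemonics: 66 0F 38 40 is the fixed opcode, and the final byte is the ModRM byte, 0xC0 | (dst << 3) | src for register-register forms. A standalone check (mine, not libyuv code) reproducing the bytes listed in the comment above:

#include <cassert>
#include <cstdint>

// ModRM for a register-register SSE op: mod=11b, reg=dst, rm=src.
constexpr uint8_t modrm(int dst_xmm, int src_xmm) {
  return static_cast<uint8_t>(0xC0 | (dst_xmm << 3) | src_xmm);
}

int main() {
  assert(modrm(0, 6) == 0xC6);  // pmulld xmm0, xmm6
  assert(modrm(3, 5) == 0xDD);  // pmulld xmm3, xmm5
  assert(modrm(4, 5) == 0xE5);  // pmulld xmm4, xmm5
  assert(modrm(2, 5) == 0xD5);  // pmulld xmm2, xmm5
  assert(modrm(1, 5) == 0xCD);  // pmulld xmm1, xmm5
  return 0;
}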
__declspec(naked) __declspec(align(16))
uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed) {
  __asm {
    mov        eax, [esp + 4]    // src
    mov        ecx, [esp + 8]    // count
    movd       xmm0, [esp + 12]  // seed
    pxor       xmm7, xmm7        // constant 0 for unpck
    movdqa     xmm6, kHash16x33

    align      4
  wloop:
    movdqu     xmm1, [eax]       // src[0-15]
    lea        eax, [eax + 16]
    pmulld(0xc6)                 // pmulld     xmm0,xmm6  hash *= 33 ^ 16
    movdqa     xmm5, kHashMul0
    movdqa     xmm2, xmm1
    punpcklbw  xmm2, xmm7        // src[0-7]
    movdqa     xmm3, xmm2
    punpcklwd  xmm3, xmm7        // src[0-3]
    pmulld(0xdd)                 // pmulld     xmm3, xmm5
    movdqa     xmm5, kHashMul1
    movdqa     xmm4, xmm2
    punpckhwd  xmm4, xmm7        // src[4-7]
    pmulld(0xe5)                 // pmulld     xmm4, xmm5
    movdqa     xmm5, kHashMul2
    punpckhbw  xmm1, xmm7        // src[8-15]
    movdqa     xmm2, xmm1
    punpcklwd  xmm2, xmm7        // src[8-11]
    pmulld(0xd5)                 // pmulld     xmm2, xmm5
    movdqa     xmm5, kHashMul3
    punpckhwd  xmm1, xmm7        // src[12-15]
    pmulld(0xcd)                 // pmulld     xmm1, xmm5
    paddd      xmm3, xmm4        // add 16 results
    paddd      xmm1, xmm2
    sub        ecx, 16
    paddd      xmm1, xmm3
    pshufd     xmm2, xmm1, 0x0e  // upper 2 dwords
    paddd      xmm1, xmm2
    pshufd     xmm2, xmm1, 0x01
    paddd      xmm1, xmm2
    paddd      xmm0, xmm1
    jg         wloop
    movd       eax, xmm0         // return hash
    ret
  }
}

// Visual C 2012 required for AVX2.
#if _MSC_VER >= 1700
__declspec(naked) __declspec(align(16))
uint32 HashDjb2_AVX2(const uint8* src, int count, uint32 seed) {
  __asm {
    mov        eax, [esp + 4]    // src
    mov        ecx, [esp + 8]    // count
    movd       xmm0, [esp + 12]  // seed
    movdqa     xmm6, kHash16x33

    align      4
  wloop:
    vpmovzxbd  xmm3, dword ptr [eax]        // src[0-3]
    pmulld     xmm0, xmm6                   // hash *= 33 ^ 16
    vpmovzxbd  xmm4, dword ptr [eax + 4]    // src[4-7]
    pmulld     xmm3, kHashMul0
    vpmovzxbd  xmm2, dword ptr [eax + 8]    // src[8-11]
    pmulld     xmm4, kHashMul1
    vpmovzxbd  xmm1, dword ptr [eax + 12]   // src[12-15]
    pmulld     xmm2, kHashMul2
    lea        eax, [eax + 16]
    pmulld     xmm1, kHashMul3
    paddd      xmm3, xmm4        // add 16 results
    paddd      xmm1, xmm2
    sub        ecx, 16
    paddd      xmm1, xmm3
    pshufd     xmm2, xmm1, 0x0e  // upper 2 dwords
    paddd      xmm1, xmm2
    pshufd     xmm2, xmm1, 0x01
    paddd      xmm1, xmm2
    paddd      xmm0, xmm1
    jg         wloop
    movd       eax, xmm0         // return hash
    ret
  }
}
#endif  // _MSC_VER >= 1700

#endif  // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)

#ifdef __cplusplus
}  // extern "C"
}  // namespace libyuv
#endif
@@ -49,17 +49,17 @@ int ARGBToI444(const uint8* src_argb, int src_stride_argb,
       ARGBToYRow_C;
   void (*ARGBToUV444Row)(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
                          int pix) = ARGBToUV444Row_C;
 #if defined(HAS_ARGBTOUV444ROW_SSSE3)
   if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
     ARGBToUV444Row = ARGBToUV444Row_Any_SSSE3;
     if (IS_ALIGNED(width, 16)) {
       ARGBToUV444Row = ARGBToUV444Row_Unaligned_SSSE3;
       if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) {
         ARGBToUV444Row = ARGBToUV444Row_SSSE3;
       }
     }
   }
 #endif
 #if defined(HAS_ARGBTOYROW_SSSE3)
   if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
     ARGBToYRow = ARGBToYRow_Any_SSSE3;
...
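This hunk shows libyuv's row-dispatch idiom: start from the portable C row function, then step up to a faster variant as each requirement is met (CPU feature, width a multiple of 16, then 16-byte pointer and stride alignment). A sketch of the pattern with hypothetical names (mine, not the real libyuv symbols):

#include <cstdint>

// Hypothetical row functions illustrating the dispatch ladder.
using RowFn = void (*)(const uint8_t* src, uint8_t* dst, int pix);
static void Row_C(const uint8_t*, uint8_t*, int) {}               // fallback
static void Row_Any_SIMD(const uint8_t*, uint8_t*, int) {}        // any width
static void Row_Unaligned_SIMD(const uint8_t*, uint8_t*, int) {}  // width%16==0
static void Row_Aligned_SIMD(const uint8_t*, uint8_t*, int) {}    // 16B aligned

// Stand-in for TestCpuFlag(kCpuHasSSSE3).
static bool cpu_has_simd() { return false; }

static RowFn PickRow(const uint8_t* src, int stride, int width) {
  RowFn row = Row_C;  // always-correct default
  if (cpu_has_simd() && width >= 16) {
    row = Row_Any_SIMD;                        // handles ragged widths
    if (width % 16 == 0) {
      row = Row_Unaligned_SIMD;                // full vectors, unaligned loads
      if (reinterpret_cast<uintptr_t>(src) % 16 == 0 && stride % 16 == 0) {
        row = Row_Aligned_SIMD;                // aligned-load fast path
      }
    }
  }
  return row;
}

int main() {
  uint8_t buf[64] = {};
  RowFn f = PickRow(buf, 64, 64);
  f(buf, buf, 64);
  return 0;
}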
@@ -28,7 +28,7 @@ cglobal %1ToYRow%3, 3, 3, 3, src_yuy2, dst_y, pix
     psrlw  m2, m2, 8
 %endif

-    align  4
+    ALIGN  4
 .convertloop:
     mov%2  m0, [src_yuy2q]
     mov%2  m1, [src_yuy2q + mmsize]

@@ -74,7 +74,7 @@ cglobal SplitUVRow%2, 4, 4, 5, src_uv, dst_u, dst_v, pix
     psrlw  m4, m4, 8
     sub    dst_vq, dst_uq

-    align  4
+    ALIGN  4
 .convertloop:
     mov%1  m0, [src_uvq]
     mov%1  m1, [src_uvq + mmsize]

@@ -113,7 +113,7 @@ SplitUVRow a,
 cglobal MergeUVRow_%2, 4, 4, 3, src_u, src_v, dst_uv, pix
     sub    src_vq, src_uq

-    align  4
+    ALIGN  4
 .convertloop:
     mov%1  m0, [src_uq]
     mov%1  m1, [src_vq]
...