Commit aec76f2e authored by fbarchard@google.com

Add stride to pointer in C and pass it as a register to inline asm.

BUG=357
TESTED=clang on ios
R=tpsiaki@google.com

Review URL: https://webrtc-codereview.appspot.com/29489004

git-svn-id: http://libyuv.googlecode.com/svn/trunk@1086 16f28f9a-4ce2-e073-06de-1de4eb20be90
parent f7d9b9fb
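In outline, the change computes the second-row pointer in C (pointer plus stride) and hands it to the inline asm as its own register operand, instead of passing the stride and doing the sign-extending add inside the asm block, which the targeted clang/iOS arm64 build apparently mishandled. A minimal sketch of the pattern, with hypothetical names (not libyuv's actual kernels), assuming an AArch64 GCC-style toolchain:

#include <stdint.h>

// Interleave 8-byte chunks of two adjacent rows; n must be a positive
// multiple of 8. The row-2 pointer is formed in C, so no
// "add %x1, %x0, %w1, sxtw" is needed inside the asm block.
void copy_two_rows(const uint8_t* src, int stride, uint8_t* dst, int n) {
  const uint8_t* src2 = src + stride;  // stride folded into a pointer in C
  __asm__ volatile(
    "1:                                        \n"
    "ld1 {v0.8b}, [%0], #8                     \n"  // 8 bytes from row 1
    "ld1 {v1.8b}, [%1], #8                     \n"  // 8 bytes from row 2
    "subs %w3, %w3, #8                         \n"
    "st1 {v0.8b, v1.8b}, [%2], #16             \n"
    "b.gt 1b                                   \n"
    : "+r"(src), "+r"(src2), "+r"(dst), "+r"(n)
    :
    : "v0", "v1", "memory", "cc");
}

The pre-change code made the stride operand do double duty (integer in, row pointer out); giving the derived pointer its own "+r" operand sidesteps that.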
Name: libyuv
URL: http://code.google.com/p/libyuv/
-Version: 1084
+Version: 1086
License: BSD
License File: LICENSE
......
......@@ -58,6 +58,13 @@ extern "C" {
#if defined(__native_client__) && defined(__arm__) && PPAPI_RELEASE < 37
#define LIBYUV_DISABLE_NEON
#endif
+// clang >= 3.5.0 required for Arm64.
+#if defined(__clang__) && defined(__aarch64__) && !defined(LIBYUV_DISABLE_NEON)
+#if (__clang_major__ < 3) || (__clang_major__ == 3 && (__clang_minor__ < 5))
+#define LIBYUV_DISABLE_NEON
+#endif // clang >= 3.5
+#endif // __clang__
// The following are available on all x86 platforms:
#if !defined(LIBYUV_DISABLE_X86) && \
......
......@@ -11,6 +11,6 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT
#define INCLUDE_LIBYUV_VERSION_H_
-#define LIBYUV_VERSION 1084
+#define LIBYUV_VERSION 1086
#endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT
......@@ -25,7 +25,7 @@
'conditions': [
['(target_arch == "armv7" or target_arch == "armv7s" or \
(target_arch == "arm" and arm_version >= 7) or target_arch == "arm64")\
-and target_subarch != 64 and (arm_neon == 1 or arm_neon_optional == 1)',
+and (arm_neon == 1 or arm_neon_optional == 1)',
{
'build_neon': 1,
}],
......@@ -47,11 +47,6 @@
'-mfpu=vfpv3-d16',
],
'conditions': [
-['target_arch != "arm64"', {
-'cflags': [
-'-mfpu=neon',
-],
-}],
# Disable LTO in libyuv_neon target due to gcc 4.9 compiler bug.
['use_lto == 1', {
'cflags!': [
......@@ -60,6 +55,9 @@
],
}],
],
+'cflags': [
+'-mfpu=neon',
+],
'include_dirs': [
'include',
'.',
......@@ -93,11 +91,6 @@
# Allows libyuv.a redistributable library without external dependencies.
'standalone_static_library': 1,
'conditions': [
-['OS == "ios" and target_subarch == 64', {
-'defines': [
-'LIBYUV_DISABLE_NEON'
-],
-}],
['OS != "ios" and libyuv_disable_jpeg != 1', {
'defines': [
'HAVE_JPEG'
......@@ -126,15 +119,6 @@
'dependencies': [
'libyuv_neon',
],
-'conditions': [
-# TODO LIBYUV_NEON is temporary disabled. When all arm64 port has
-# been done, enable it.
-['target_arch !="arm64"', {
-'defines': [
-'LIBYUV_NEON',
-]
-}],
-],
}],
# MemorySanitizer does not support assembly code yet.
# http://crbug.com/344505
......@@ -151,6 +135,7 @@
# 'LIBYUV_DISABLE_MIPS',
# Enable the following macro to build libyuv as a shared library (dll).
# 'LIBYUV_USING_SHARED_LIBRARY',
+# TODO(fbarchard): Make these into gyp defines.
],
'include_dirs': [
'include',
......
......@@ -15,7 +15,8 @@
#endif
#if !defined(__pnacl__) && !defined(__CLR_VER) && \
!defined(__native_client__) && \
-defined(_MSC_VER) && (_MSC_FULL_VER >= 160040219)
+defined(_MSC_VER) && (_MSC_FULL_VER >= 160040219) && \
+(defined(_M_IX86) || defined(_M_X64))
#include <immintrin.h> // For _xgetbv()
#endif
......
......@@ -1297,8 +1297,8 @@ void UYVYToUV422Row_NEON(const uint8* src_uyvy, uint8* dst_u, uint8* dst_v,
#ifdef HAS_YUY2TOUVROW_NEON
void YUY2ToUVRow_NEON(const uint8* src_yuy2, int stride_yuy2,
uint8* dst_u, uint8* dst_v, int pix) {
+const uint8* src_yuy2b = src_yuy2 + stride_yuy2;
asm volatile (
-"add %x1, %x0, %w1, sxtw \n" // stride + src_yuy2
".p2align 2 \n"
"1: \n"
MEMACCESS(0)
......@@ -1314,7 +1314,7 @@ void YUY2ToUVRow_NEON(const uint8* src_yuy2, int stride_yuy2,
"st1 {v3.8b}, [%3], #8 \n" // store 8 V.
"b.gt 1b \n"
: "+r"(src_yuy2), // %0
-"+r"(stride_yuy2), // %1
+"+r"(src_yuy2b), // %1
"+r"(dst_u), // %2
"+r"(dst_v), // %3
"+r"(pix) // %4
......@@ -1327,8 +1327,8 @@ void YUY2ToUVRow_NEON(const uint8* src_yuy2, int stride_yuy2,
#ifdef HAS_UYVYTOUVROW_NEON
void UYVYToUVRow_NEON(const uint8* src_uyvy, int stride_uyvy,
uint8* dst_u, uint8* dst_v, int pix) {
+const uint8* src_uyvyb = src_uyvy + stride_uyvy;
asm volatile (
-"add %x1, %x0, %w1, sxtw \n" // stride + src_uyvy
".p2align 2 \n"
"1: \n"
MEMACCESS(0)
......@@ -1344,7 +1344,7 @@ void UYVYToUVRow_NEON(const uint8* src_uyvy, int stride_uyvy,
"st1 {v2.8b}, [%3], #8 \n" // store 8 V.
"b.gt 1b \n"
: "+r"(src_uyvy), // %0
-"+r"(stride_uyvy), // %1
+"+r"(src_uyvyb), // %1
"+r"(dst_u), // %2
"+r"(dst_v), // %3
"+r"(pix) // %4
......@@ -1357,9 +1357,9 @@ void UYVYToUVRow_NEON(const uint8* src_uyvy, int stride_uyvy,
#ifdef HAS_HALFROW_NEON
void HalfRow_NEON(const uint8* src_uv, int src_uv_stride,
uint8* dst_uv, int pix) {
+const uint8* src_uvb = src_uv + src_uv_stride;
asm volatile (
// change the stride to row 2 pointer
-"add %x1, %x0, %w1, sxtw \n"
"1: \n"
MEMACCESS(0)
"ld1 {v0.16b}, [%0], #16 \n" // load row 1 16 pixels.
......@@ -1371,7 +1371,7 @@ void HalfRow_NEON(const uint8* src_uv, int src_uv_stride,
"st1 {v0.16b}, [%2], #16 \n"
"b.gt 1b \n"
: "+r"(src_uv), // %0
-"+r"(src_uv_stride), // %1
+"+r"(src_uvb), // %1
"+r"(dst_uv), // %2
"+r"(pix) // %3
:
......@@ -1682,11 +1682,11 @@ void ARGBToUV444Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
void ARGBToUV422Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
int pix) {
asm volatile (
-"movi v20.8h, #112 / 2 \n" // UB / VR 0.875 coefficient
-"movi v21.8h, #74 / 2 \n" // UG -0.5781 coefficient
-"movi v22.8h, #38 / 2 \n" // UR -0.2969 coefficient
-"movi v23.8h, #18 / 2 \n" // VB -0.1406 coefficient
-"movi v24.8h, #94 / 2 \n" // VG -0.7344 coefficient
+"movi v20.8h, #56 \n" // UB / VR 0.875 / 2 coefficient
+"movi v21.8h, #37 \n" // UG -0.5781 / 2 coefficient
+"movi v22.8h, #19 \n" // UR -0.2969 / 2 coefficient
+"movi v23.8h, #9 \n" // VB -0.1406 / 2 coefficient
+"movi v24.8h, #47 \n" // VG -0.7344 / 2 coefficient
"movi v25.16b, #0x80 \n" // 128.5
".p2align 2 \n"
"1: \n"
......@@ -1732,11 +1732,11 @@ void ARGBToUV422Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
void ARGBToUV411Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
int pix) {
asm volatile (
-"movi v20.8h, #112 / 2 \n" // UB / VR 0.875 coefficient
-"movi v21.8h, #74 / 2 \n" // UG -0.5781 coefficient
-"movi v22.8h, #38 / 2 \n" // UR -0.2969 coefficient
-"movi v23.8h, #18 / 2 \n" // VB -0.1406 coefficient
-"movi v24.8h, #94 / 2 \n" // VG -0.7344 coefficient
+"movi v20.8h, #56 \n" // UB / VR 0.875 / 2 coefficient
+"movi v21.8h, #37 \n" // UG -0.5781 / 2 coefficient
+"movi v22.8h, #19 \n" // UR -0.2969 / 2 coefficient
+"movi v23.8h, #9 \n" // VB -0.1406 / 2 coefficient
+"movi v24.8h, #47 \n" // VG -0.7344 / 2 coefficient
"movi v25.16b, #0x80 \n" // 128.5
".p2align 2 \n"
"1: \n"
......@@ -1800,16 +1800,18 @@ void ARGBToUV411Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
"vqshrn.u16 d1, q9, #8 \n" /* 16 bit to 8 bit V */
+// TODO(fbarchard): Consider vhadd vertical, then vpaddl horizontal, avoid shr.
+// TODO(fbarchard): consider ptrdiff_t for all strides.
#ifdef HAS_ARGBTOUVROW_NEON
void ARGBToUVRow_NEON(const uint8* src_argb, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int pix) {
asm volatile (
"add %1, %0, %1 \n" // src_stride + src_argb
-"vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient
-"vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
-"vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
-"vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
-"vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
+"vmov.s16 q10, #56 \n" // UB / VR 0.875 coefficient
+"vmov.s16 q11, #37 \n" // UG -0.5781 coefficient
+"vmov.s16 q12, #19 \n" // UR -0.2969 coefficient
+"vmov.s16 q13, #9 \n" // VB -0.1406 coefficient
+"vmov.s16 q14, #47 \n" // VG -0.7344 coefficient
"vmov.u16 q15, #0x8080 \n" // 128.5
".p2align 2 \n"
"1: \n"
......@@ -1908,11 +1910,11 @@ void BGRAToUVRow_NEON(const uint8* src_bgra, int src_stride_bgra,
uint8* dst_u, uint8* dst_v, int pix) {
asm volatile (
"add %1, %0, %1 \n" // src_stride + src_bgra
-"vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient
-"vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
-"vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
-"vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
-"vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
+"vmov.s16 q10, #56 \n" // UB / VR 0.875 coefficient
+"vmov.s16 q11, #37 \n" // UG -0.5781 coefficient
+"vmov.s16 q12, #19 \n" // UR -0.2969 coefficient
+"vmov.s16 q13, #9 \n" // VB -0.1406 coefficient
+"vmov.s16 q14, #47 \n" // VG -0.7344 coefficient
"vmov.u16 q15, #0x8080 \n" // 128.5
".p2align 2 \n"
"1: \n"
......@@ -1959,11 +1961,11 @@ void ABGRToUVRow_NEON(const uint8* src_abgr, int src_stride_abgr,
uint8* dst_u, uint8* dst_v, int pix) {
asm volatile (
"add %1, %0, %1 \n" // src_stride + src_abgr
-"vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient
-"vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
-"vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
-"vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
-"vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
+"vmov.s16 q10, #56 \n" // UB / VR 0.875 coefficient
+"vmov.s16 q11, #37 \n" // UG -0.5781 coefficient
+"vmov.s16 q12, #19 \n" // UR -0.2969 coefficient
+"vmov.s16 q13, #9 \n" // VB -0.1406 coefficient
+"vmov.s16 q14, #47 \n" // VG -0.7344 coefficient
"vmov.u16 q15, #0x8080 \n" // 128.5
".p2align 2 \n"
"1: \n"
......@@ -2010,11 +2012,11 @@ void RGBAToUVRow_NEON(const uint8* src_rgba, int src_stride_rgba,
uint8* dst_u, uint8* dst_v, int pix) {
asm volatile (
"add %1, %0, %1 \n" // src_stride + src_rgba
-"vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient
-"vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
-"vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
-"vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
-"vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
+"vmov.s16 q10, #56 \n" // UB / VR 0.875 coefficient
+"vmov.s16 q11, #37 \n" // UG -0.5781 coefficient
+"vmov.s16 q12, #19 \n" // UR -0.2969 coefficient
+"vmov.s16 q13, #9 \n" // VB -0.1406 coefficient
+"vmov.s16 q14, #47 \n" // VG -0.7344 coefficient
"vmov.u16 q15, #0x8080 \n" // 128.5
".p2align 2 \n"
"1: \n"
......@@ -2061,11 +2063,11 @@ void RGB24ToUVRow_NEON(const uint8* src_rgb24, int src_stride_rgb24,
uint8* dst_u, uint8* dst_v, int pix) {
asm volatile (
"add %1, %0, %1 \n" // src_stride + src_rgb24
-"vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient
-"vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
-"vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
-"vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
-"vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
+"vmov.s16 q10, #56 \n" // UB / VR 0.875 coefficient
+"vmov.s16 q11, #37 \n" // UG -0.5781 coefficient
+"vmov.s16 q12, #19 \n" // UR -0.2969 coefficient
+"vmov.s16 q13, #9 \n" // VB -0.1406 coefficient
+"vmov.s16 q14, #47 \n" // VG -0.7344 coefficient
"vmov.u16 q15, #0x8080 \n" // 128.5
".p2align 2 \n"
"1: \n"
......@@ -2112,11 +2114,11 @@ void RAWToUVRow_NEON(const uint8* src_raw, int src_stride_raw,
uint8* dst_u, uint8* dst_v, int pix) {
asm volatile (
"add %1, %0, %1 \n" // src_stride + src_raw
-"vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient
-"vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
-"vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
-"vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
-"vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
+"vmov.s16 q10, #56 \n" // UB / VR 0.875 coefficient
+"vmov.s16 q11, #37 \n" // UG -0.5781 coefficient
+"vmov.s16 q12, #19 \n" // UR -0.2969 coefficient
+"vmov.s16 q13, #9 \n" // VB -0.1406 coefficient
+"vmov.s16 q14, #47 \n" // VG -0.7344 coefficient
"vmov.u16 q15, #0x8080 \n" // 128.5
".p2align 2 \n"
"1: \n"
......@@ -2164,11 +2166,11 @@ void RGB565ToUVRow_NEON(const uint8* src_rgb565, int src_stride_rgb565,
uint8* dst_u, uint8* dst_v, int pix) {
asm volatile (
"add %1, %0, %1 \n" // src_stride + src_argb
-"vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient
-"vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
-"vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
-"vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
-"vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
+"vmov.s16 q10, #56 \n" // UB / VR 0.875 coefficient
+"vmov.s16 q11, #37 \n" // UG -0.5781 coefficient
+"vmov.s16 q12, #19 \n" // UR -0.2969 coefficient
+"vmov.s16 q13, #9 \n" // VB -0.1406 coefficient
+"vmov.s16 q14, #47 \n" // VG -0.7344 coefficient
"vmov.u16 q15, #0x8080 \n" // 128.5
".p2align 2 \n"
"1: \n"
......@@ -2236,11 +2238,11 @@ void ARGB1555ToUVRow_NEON(const uint8* src_argb1555, int src_stride_argb1555,
uint8* dst_u, uint8* dst_v, int pix) {
asm volatile (
"add %1, %0, %1 \n" // src_stride + src_argb
-"vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient
-"vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
-"vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
-"vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
-"vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
+"vmov.s16 q10, #56 \n" // UB / VR 0.875 coefficient
+"vmov.s16 q11, #37 \n" // UG -0.5781 coefficient
+"vmov.s16 q12, #19 \n" // UR -0.2969 coefficient
+"vmov.s16 q13, #9 \n" // VB -0.1406 coefficient
+"vmov.s16 q14, #47 \n" // VG -0.7344 coefficient
"vmov.u16 q15, #0x8080 \n" // 128.5
".p2align 2 \n"
"1: \n"
......@@ -2308,11 +2310,11 @@ void ARGB4444ToUVRow_NEON(const uint8* src_argb4444, int src_stride_argb4444,
uint8* dst_u, uint8* dst_v, int pix) {
asm volatile (
"add %1, %0, %1 \n" // src_stride + src_argb
-"vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient
-"vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
-"vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
-"vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
-"vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
+"vmov.s16 q10, #56 \n" // UB / VR 0.875 coefficient
+"vmov.s16 q11, #37 \n" // UG -0.5781 coefficient
+"vmov.s16 q12, #19 \n" // UR -0.2969 coefficient
+"vmov.s16 q13, #9 \n" // VB -0.1406 coefficient
+"vmov.s16 q14, #47 \n" // VG -0.7344 coefficient
"vmov.u16 q15, #0x8080 \n" // 128.5
".p2align 2 \n"
"1: \n"
......@@ -2748,9 +2750,9 @@ void ARGBBlendRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
// Blend 1 pixels.
"1: \n"
MEMACCESS(0)
-"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}[0], [%0], #4 \n" // load 1 pixel ARGB0.
+"ld4 {v0.b,v1.b,v2.b,v3.b}[0], [%0], #4 \n" // load 1 pixel ARGB0.
MEMACCESS(1)
-"ld4 {v4.8b,v5.8b,v6.8b,v7.8b}[0], [%1], #4 \n" // load 1 pixel ARGB1.
+"ld4 {v4.b,v5.b,v6.b,v7.b}[0], [%1], #4 \n" // load 1 pixel ARGB1.
"subs %3, %3, #1 \n" // 1 processed per loop.
"umull v16.8h, v4.8b, v3.8b \n" // db * a
"umull v17.8h, v5.8b, v3.8b \n" // dg * a
......@@ -2766,7 +2768,7 @@ void ARGBBlendRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
"uqadd v2.8b, v2.8b, v6.8b \n" // + sr
"movi v3.8b, #255 \n" // a = 255
MEMACCESS(2)
-"st4 {v0.8b,v1.8b,v2.8b,v3.8b}[0], [%2], #4 \n" // store 1 pixel.
+"st4 {v0.b,v1.b,v2.b,v3.b}[0], [%2], #4 \n" // store 1 pixel.
"b.ge 1b \n"
"99: \n"
......
......@@ -21,7 +21,8 @@ extern "C" {
#endif
// This module is for Visual C.
-#if !defined(LIBYUV_DISABLE_X86) && defined(_MSC_VER)
+#if !defined(LIBYUV_DISABLE_X86) && defined(_MSC_VER) && \
+(defined(_M_IX86) || defined(_M_X64))
#define YG 74 /* (int8)(1.164 * 64 + 0.5) */
......
......@@ -105,12 +105,12 @@ asm volatile (
MEMACCESS(0)
"ld1 {v0.16b}, [%0], #16 \n" // load up 16x4
MEMACCESS(3)
-"ld1 {v1.16b}, [%3], #16 \n"
+"ld1 {v1.16b}, [%2], #16 \n"
MEMACCESS(4)
-"ld1 {v2.16b}, [%4], #16 \n"
+"ld1 {v2.16b}, [%3], #16 \n"
MEMACCESS(5)
-"ld1 {v3.16b}, [%5], #16 \n"
-"subs %2, %2, #4 \n"
+"ld1 {v3.16b}, [%4], #16 \n"
+"subs %5, %5, #4 \n"
"uaddlp v0.8h, v0.16b \n"
"uadalp v0.8h, v1.16b \n"
"uadalp v0.8h, v2.16b \n"
......@@ -122,10 +122,10 @@ asm volatile (
"b.gt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
-"+r"(dst_width), // %2
-"+r"(src_ptr1), // %3
-"+r"(src_ptr2), // %4
-"+r"(src_ptr3) // %5
+"+r"(src_ptr1), // %2
+"+r"(src_ptr2), // %3
+"+r"(src_ptr3), // %4
+"+r"(dst_width) // %5
:
: "v0", "v1", "v2", "v3", "memory", "cc"
);
......@@ -144,7 +144,7 @@ void ScaleRowDown34_NEON(const uint8* src_ptr,
MEMACCESS(0)
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0
"subs %2, %2, #24 \n"
-"mov v2.8b, v3.8b \n" // order v0, v1, v2
+"orr v2.16b, v3.16b, v3.16b \n" // order v0, v1, v2
MEMACCESS(1)
"st3 {v0.8b,v1.8b,v2.8b}, [%1], #24 \n"
"b.gt 1b \n"
......@@ -309,6 +309,7 @@ void OMITFP ScaleRowDown38_3_Box_NEON(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
const uint8* src_ptr1 = src_ptr + src_stride * 2;
+ptrdiff_t tmp_src_stride = src_stride;
asm volatile (
MEMACCESS(5)
......@@ -317,7 +318,7 @@ void OMITFP ScaleRowDown38_3_Box_NEON(const uint8* src_ptr,
"ld1 {v30.16b}, [%6] \n"
MEMACCESS(7)
"ld1 {v31.8h}, [%7] \n"
-"add %3, %3, %0 \n"
+"add %2, %2, %0 \n"
"1: \n"
// 00 40 01 41 02 42 03 43
......@@ -327,10 +328,10 @@ void OMITFP ScaleRowDown38_3_Box_NEON(const uint8* src_ptr,
MEMACCESS(0)
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"
MEMACCESS(3)
-"ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%3], #32 \n"
+"ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%2], #32 \n"
MEMACCESS(4)
-"ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%4], #32 \n"
-"subs %2, %2, #12 \n"
+"ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%3], #32 \n"
+"subs %4, %4, #12 \n"
// Shuffle the input data around to get align the data
// so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7
......@@ -420,9 +421,9 @@ void OMITFP ScaleRowDown38_3_Box_NEON(const uint8* src_ptr,
"b.gt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
-"+r"(dst_width), // %2
-"+r"(src_stride), // %3
-"+r"(src_ptr1) // %4
+"+r"(tmp_src_stride), // %2
+"+r"(src_ptr1), // %3
+"+r"(dst_width) // %4
: "r"(&kMult38_Div6), // %5
"r"(&kShuf38_2), // %6
"r"(&kMult38_Div9) // %7
......@@ -438,12 +439,14 @@ void OMITFP ScaleRowDown38_3_Box_NEON(const uint8* src_ptr,
void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
+// TODO(fbarchard): use src_stride directly for clang 3.5+.
+ptrdiff_t tmp_src_stride = src_stride;
asm volatile (
MEMACCESS(4)
"ld1 {v30.8h}, [%4] \n"
MEMACCESS(5)
"ld1 {v31.16b}, [%5] \n"
-"add %3, %3, %0 \n"
+"add %2, %2, %0 \n"
"1: \n"
// 00 40 01 41 02 42 03 43
......@@ -454,7 +457,7 @@ void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr,
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"
MEMACCESS(3)
"ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%3], #32 \n"
-"subs %2, %2, #12 \n"
+"subs %3, %3, #12 \n"
// Shuffle the input data around to get align the data
// so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7
......@@ -528,12 +531,12 @@ void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr,
MEMACCESS(1)
"st1 {v3.s}[2], [%1], #4 \n"
"b.gt 1b \n"
-: "+r"(src_ptr), // %0
-"+r"(dst_ptr), // %1
-"+r"(dst_width), // %2
-"+r"(src_stride) // %3
-: "r"(&kMult38_Div6), // %4
-"r"(&kShuf38_2) // %5
+: "+r"(src_ptr), // %0
+"+r"(dst_ptr), // %1
+"+r"(tmp_src_stride), // %2
+"+r"(dst_width) // %3
+: "r"(&kMult38_Div6), // %4
+"r"(&kShuf38_2) // %5
: "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17",
"v18", "v19", "v30", "v31", "memory", "cc"
);
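The tmp_src_stride copy above follows the same idea as the row kernels: the asm rewrites its %2 operand into a row-2 pointer, so the function copies the stride into a scratch variable rather than clobbering the parameter (the TODO notes clang 3.5+ could take src_stride directly). The operand reshuffle in these hunks likewise gives every pointer the asm advances its own early slot, with the loop count moved to the end. A minimal sketch of the copy idiom, with hypothetical names, AArch64 assumed:

#include <stddef.h>
#include <stdint.h>

// Average two rows into one; dst_width must be a positive multiple of 8.
void average_rows(const uint8_t* src, ptrdiff_t src_stride,
                  uint8_t* dst, int dst_width) {
  ptrdiff_t tmp_src_stride = src_stride;  // scratch copy; see TODO above
  __asm__ volatile(
    "add %2, %2, %0                            \n"  // %2 = src + stride
    "1:                                        \n"
    "ld1 {v0.8b}, [%0], #8                     \n"  // row 1
    "ld1 {v1.8b}, [%2], #8                     \n"  // row 2
    "urhadd v0.8b, v0.8b, v1.8b                \n"  // rounding average
    "subs %w3, %w3, #8                         \n"
    "st1 {v0.8b}, [%1], #8                     \n"
    "b.gt 1b                                   \n"
    : "+r"(src), "+r"(dst), "+r"(tmp_src_stride), "+r"(dst_width)
    :
    : "v0", "v1", "memory", "cc");
}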
......