Add stride to pointer in C and pass it as a register operand to the inline assembly.

BUG=357
TESTED=clang on iOS
R=tpsiaki@google.com
Review URL: https://webrtc-codereview.appspot.com/29489004
git-svn-id: http://libyuv.googlecode.com/svn/trunk@1086 16f28f9a-4ce2-e073-06de-1de4eb20be90

Add stride to pointer in C and pass it as a register operand to the inline assembly.

BUG=357
TESTED=clang on iOS
R=tpsiaki@google.com
Review URL: https://webrtc-codereview.appspot.com/29489004
git-svn-id: http://libyuv.googlecode.com/svn/trunk@1086 16f28f9a-4ce2-e073-06de-1de4eb20be90
aec76f2e · fbarchard@google.com · f7d9b9fb · aec76f2e · aec76f2e · aec76f2e
Commit aec76f2e authored Sep 19, 2014 by fbarchard@google.com
8 changed files
--- a/README.chromium
+++ b/README.chromium
 Name: libyuv
 URL: http://code.google.com/p/libyuv/
-Version: 1084
+Version: 1086
 License: BSD
 License File: LICENSE

--- a/include/libyuv/row.h
+++ b/include/libyuv/row.h
@@ -58,6 +58,13 @@ extern "C" {
 #if defined(__native_client__) && defined(__arm__) && PPAPI_RELEASE < 37
 #define LIBYUV_DISABLE_NEON
 #endif
+// clang >= 3.5.0 required for Arm64.
+#if defined(__clang__) && defined(__aarch64__) && !defined(LIBYUV_DISABLE_NEON)
+#if (__clang_major__ < 3) || (__clang_major__ == 3 && (__clang_minor__ < 5))
+#define LIBYUV_DISABLE_NEON
+#endif  // clang >= 3.5
+#endif  // __clang__
 // The following are available on all x86 platforms:
 #if !defined(LIBYUV_DISABLE_X86) && \

--- a/include/libyuv/version.h
+++ b/include/libyuv/version.h
@@ -11,6 +11,6 @@
 #ifndef INCLUDE_LIBYUV_VERSION_H_  // NOLINT
 #define INCLUDE_LIBYUV_VERSION_H_
-#define LIBYUV_VERSION 1084
+#define LIBYUV_VERSION 1086
 #endif  // INCLUDE_LIBYUV_VERSION_H_  NOLINT
--- a/libyuv.gyp
+++ b/libyuv.gyp
@@ -25,7 +25,7 @@
    'conditions': [
       ['(target_arch == "armv7" or target_arch == "armv7s" or \
       (target_arch == "arm" and arm_version >= 7) or target_arch == "arm64")\
-       and target_subarch != 64 and (arm_neon == 1 or arm_neon_optional == 1)',
+       and (arm_neon == 1 or arm_neon_optional == 1)',
       {
         'build_neon': 1,
       }],
@@ -47,11 +47,6 @@
            '-mfpu=vfpv3-d16',
          ],
          'conditions': [
-            ['target_arch != "arm64"', {
-              'cflags': [
-                '-mfpu=neon',
-               ],
-            }],
            # Disable LTO in libyuv_neon target due to gcc 4.9 compiler bug.
            ['use_lto == 1', {
              'cflags!': [
@@ -60,6 +55,9 @@
              ],
            }],
          ],
+          'cflags': [
+            '-mfpu=neon',
+          ],
          'include_dirs': [
            'include',
            '.',
@@ -93,11 +91,6 @@
      # Allows libyuv.a redistributable library without external dependencies.
      'standalone_static_library': 1,
      'conditions': [
-        ['OS == "ios" and target_subarch == 64', {
-          'defines': [
-            'LIBYUV_DISABLE_NEON'
-          ],
-        }],
        ['OS != "ios" and libyuv_disable_jpeg != 1', {
          'defines': [
            'HAVE_JPEG'
@@ -126,15 +119,6 @@
          'dependencies': [
            'libyuv_neon',
          ],
-          'conditions': [
-          # TODO LIBYUV_NEON is temporary disabled. When all arm64 port has
-          # been done, enable it.
-          ['target_arch !="arm64"', {
-          'defines': [
-            'LIBYUV_NEON',
-          ]
-          }],
-          ],
        }],
        # MemorySanitizer does not support assembly code yet.
        # http://crbug.com/344505
@@ -151,6 +135,7 @@
        # 'LIBYUV_DISABLE_MIPS',
        # Enable the following macro to build libyuv as a shared library (dll).
        # 'LIBYUV_USING_SHARED_LIBRARY',
+	# TODO(fbarchard): Make these into gyp defines.
      ],
      'include_dirs': [
        'include',

--- a/source/cpu_id.cc
+++ b/source/cpu_id.cc
@@ -15,7 +15,8 @@
 #endif
 #if !defined(__pnacl__) && !defined(__CLR_VER) && \
    !defined(__native_client__)  && \
-    defined(_MSC_VER) && (_MSC_FULL_VER >= 160040219)
+    defined(_MSC_VER) && (_MSC_FULL_VER >= 160040219) && \
+    (defined(_M_IX86) || defined(_M_X64))
 #include <immintrin.h>  // For _xgetbv()
 #endif

--- a/source/row_neon64.cc
+++ b/source/row_neon64.cc
--- a/source/row_win.cc
+++ b/source/row_win.cc
@@ -21,7 +21,8 @@ extern "C" {
 #endif
 // This module is for Visual C.
-#if !defined(LIBYUV_DISABLE_X86) && defined(_MSC_VER)
+#if !defined(LIBYUV_DISABLE_X86) && defined(_MSC_VER) && \
+    (defined(_M_IX86) || defined(_M_X64))
 #define YG 74  /* (int8)(1.164 * 64 + 0.5) */

--- a/source/scale_neon64.cc
+++ b/source/scale_neon64.cc
@@ -105,12 +105,12 @@ asm volatile (
    MEMACCESS(0)
    "ld1     {v0.16b}, [%0], #16               \n"   // load up 16x4
    MEMACCESS(3)
-    "ld1     {v1.16b}, [%3], #16               \n"
+    "ld1     {v1.16b}, [%2], #16               \n"
    MEMACCESS(4)
-    "ld1     {v2.16b}, [%4], #16               \n"
+    "ld1     {v2.16b}, [%3], #16               \n"
    MEMACCESS(5)
-    "ld1     {v3.16b}, [%5], #16               \n"
+    "ld1     {v3.16b}, [%4], #16               \n"
-    "subs       %2, %2, #4                     \n"
+    "subs    %5, %5, #4                        \n"
    "uaddlp  v0.8h, v0.16b                     \n"
    "uadalp  v0.8h, v1.16b                     \n"
    "uadalp  v0.8h, v2.16b                     \n"
@@ -122,10 +122,10 @@ asm volatile (
    "b.gt       1b                             \n"
  : "+r"(src_ptr),   // %0
    "+r"(dst_ptr),   // %1
-    "+r"(dst_width), // %2
+    "+r"(src_ptr1),  // %2
-    "+r"(src_ptr1),  // %3
+    "+r"(src_ptr2),  // %3
-    "+r"(src_ptr2),  // %4
+    "+r"(src_ptr3),  // %4
-    "+r"(src_ptr3)   // %5
+    "+r"(dst_width)  // %5
  :
  : "v0", "v1", "v2", "v3", "memory", "cc"
  );
@@ -144,7 +144,7 @@ void ScaleRowDown34_NEON(const uint8* src_ptr,
    MEMACCESS(0)
    "ld4       {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32                \n"  // src line 0
    "subs      %2, %2, #24                             \n"
-    "mov       v2.8b, v3.8b                            \n"  // order v0, v1, v2
+    "orr       v2.16b, v3.16b, v3.16b                  \n"  // order v0, v1, v2
    MEMACCESS(1)
    "st3       {v0.8b,v1.8b,v2.8b}, [%1], #24                \n"
    "b.gt      1b                                      \n"
@@ -309,6 +309,7 @@ void OMITFP ScaleRowDown38_3_Box_NEON(const uint8* src_ptr,
                                      ptrdiff_t src_stride,
                                      uint8* dst_ptr, int dst_width) {
  const uint8* src_ptr1 = src_ptr + src_stride * 2;
+  ptrdiff_t tmp_src_stride = src_stride;
  asm volatile (
    MEMACCESS(5)
@@ -317,7 +318,7 @@ void OMITFP ScaleRowDown38_3_Box_NEON(const uint8* src_ptr,
    "ld1       {v30.16b}, [%6]                         \n"
    MEMACCESS(7)
    "ld1       {v31.8h}, [%7]                          \n"
-    "add       %3, %3, %0                              \n"
+    "add       %2, %2, %0                              \n"
  "1:                                                  \n"
    // 00 40 01 41 02 42 03 43
@@ -327,10 +328,10 @@ void OMITFP ScaleRowDown38_3_Box_NEON(const uint8* src_ptr,
    MEMACCESS(0)
    "ld4       {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32                \n"
    MEMACCESS(3)
-    "ld4       {v4.8b,v5.8b,v6.8b,v7.8b}, [%3], #32                \n"
+    "ld4       {v4.8b,v5.8b,v6.8b,v7.8b}, [%2], #32                \n"
    MEMACCESS(4)
-    "ld4       {v16.8b,v17.8b,v18.8b,v19.8b}, [%4], #32              \n"
+    "ld4       {v16.8b,v17.8b,v18.8b,v19.8b}, [%3], #32              \n"
-    "subs      %2, %2, #12                             \n"
+    "subs      %4, %4, #12                             \n"
    // Shuffle the input data around to get align the data
    //  so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7
@@ -420,9 +421,9 @@ void OMITFP ScaleRowDown38_3_Box_NEON(const uint8* src_ptr,
    "b.gt      1b                                      \n"
  : "+r"(src_ptr),          // %0
    "+r"(dst_ptr),          // %1
-    "+r"(dst_width),        // %2
+    "+r"(tmp_src_stride),   // %2
-    "+r"(src_stride),       // %3
+    "+r"(src_ptr1),         // %3
-    "+r"(src_ptr1)          // %4
+    "+r"(dst_width)         // %4
  : "r"(&kMult38_Div6),     // %5
    "r"(&kShuf38_2),        // %6
    "r"(&kMult38_Div9)      // %7
@@ -438,12 +439,14 @@ void OMITFP ScaleRowDown38_3_Box_NEON(const uint8* src_ptr,
 void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr,
                               ptrdiff_t src_stride,
                               uint8* dst_ptr, int dst_width) {
+  // TODO(fbarchard): use src_stride directly for clang 3.5+.
+  ptrdiff_t tmp_src_stride = src_stride;
  asm volatile (
    MEMACCESS(4)
    "ld1       {v30.8h}, [%4]                          \n"
    MEMACCESS(5)
    "ld1       {v31.16b}, [%5]                         \n"
-    "add       %3, %3, %0                              \n"
+    "add       %2, %2, %0                              \n"
  "1:                                                  \n"
    // 00 40 01 41 02 42 03 43
@@ -454,7 +457,7 @@ void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr,
    "ld4       {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32                \n"
    MEMACCESS(3)
    "ld4       {v4.8b,v5.8b,v6.8b,v7.8b}, [%3], #32                \n"
-    "subs      %2, %2, #12                             \n"
+    "subs      %3, %3, #12                             \n"
    // Shuffle the input data around to get align the data
    //  so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7
@@ -528,12 +531,12 @@ void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr,
    MEMACCESS(1)
    "st1       {v3.s}[2], [%1], #4                     \n"
    "b.gt      1b                                      \n"
-  : "+r"(src_ptr),       // %0
+  : "+r"(src_ptr),         // %0
-    "+r"(dst_ptr),       // %1
+    "+r"(dst_ptr),         // %1
-    "+r"(dst_width),     // %2
+    "+r"(tmp_src_stride),  // %2
-    "+r"(src_stride)     // %3
+    "+r"(dst_width)        // %3
-  : "r"(&kMult38_Div6),  // %4
+  : "r"(&kMult38_Div6),    // %4
-    "r"(&kShuf38_2)      // %5
+    "r"(&kShuf38_2)        // %5
  : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17",
    "v18", "v19", "v30", "v31", "memory", "cc"
  );