Add nacl macros to ScaleFilterCols_NEON on ARM32/64 platform

Add the NaCl macros to the ARM functions. Without them, a bunch of code fails to validate. BUG=319 TESTED=libyuvTest.* on ARM32/64 with Android R=fbarchard@google.com Change-Id: I7a36434f18e0de8b8f8a9fe01167bfe50cff8962 Review URL: https://webrtc-codereview.appspot.com/47739004 git-svn-id: http://libyuv.googlecode.com/svn/trunk@1343 16f28f9a-4ce2-e073-06de-1de4eb20be90

Add nacl macros to ScaleFilterCols_NEON on ARM32/64 platform
Add the NaCl macros to the ARM functions. Without them, a bunch of code fails to validate. BUG=319 TESTED=libyuvTest.* on ARM32/64 with Android R=fbarchard@google.com Change-Id: I7a36434f18e0de8b8f8a9fe01167bfe50cff8962 Review URL: https://webrtc-codereview.appspot.com/47739004 git-svn-id: http://libyuv.googlecode.com/svn/trunk@1343 16f28f9a-4ce2-e073-06de-1de4eb20be90
0d3bfab6 · yang.zhang@arm.com · d28cd77f · 0d3bfab6 · 0d3bfab6
Commit 0d3bfab6 authored Mar 24, 2015 by yang.zhang@arm.com
Hide whitespace changes
Inline Side-by-side

Showing with 30 additions and 20 deletions

scale_neon.cc source/scale_neon.cc +15 -10

scale_neon64.cc source/scale_neon64.cc +15 -10

No files found.
--- a/source/scale_neon.cc
+++ b/source/scale_neon.cc
@@ -579,13 +579,16 @@ void ScaleAddRows_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
 // the x/dx stepping
 #define LOAD2_DATA8_LANE(n)                                    \
    "lsr        %5, %3, #16                    \n"             \
-    "add        r12, %1, %5                    \n"             \
+    "add        %6, %1, %5                     \n"             \
    "add        %3, %3, %4                     \n"             \
-    "vld2.8     {d6["#n"], d7["#n"]}, [r12]    \n"
+    MEMACCESS(6)                                               \
+    "vld2.8     {d6["#n"], d7["#n"]}, [%6]     \n"

 void ScaleFilterCols_NEON(uint8* dst_ptr, const uint8* src_ptr,
                          int dst_width, int x, int dx) {
-  int tmp[4] = {0, 1, 2, 3};
+  int dx_offset[4] = {0, 1, 2, 3};
+  int *tmp = dx_offset;
+  const uint8* src_tmp = src_ptr;
  asm volatile (
    ".p2align   2                              \n"
    "vdup.32    q0, %3                         \n"  // x
@@ -629,13 +632,15 @@ void ScaleFilterCols_NEON(uint8* dst_ptr, const uint8* src_ptr,
    "vadd.s32   q2, q2, q0                     \n"
    "subs       %2, %2, #8                     \n"  // 8 processed per loop
    "bgt        1b                             \n"
-  : "+r"(dst_ptr)          // %0
-  : "r"(src_ptr),          // %1
-    "r"(dst_width),        // %2
-    "r"(x),                // %3
-    "r"(dx),               // %4
-    "r"(tmp)               // %5
-  : "memory", "cc", "r12", "q0", "q1", "q2", "q3",
+  : "+r"(dst_ptr),          // %0
+    "+r"(src_ptr),          // %1
+    "+r"(dst_width),        // %2
+    "+r"(x),                // %3
+    "+r"(dx),               // %4
+    "+r"(tmp),              // %5
+    "+r"(src_tmp)           // %6
+  :
+  : "memory", "cc", "q0", "q1", "q2", "q3",
    "q8", "q9", "q10", "q11", "q12", "q13"
  );
 }

--- a/source/scale_neon64.cc
+++ b/source/scale_neon64.cc
@@ -582,13 +582,16 @@ void ScaleAddRows_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
 // the x/dx stepping
 #define LOAD2_DATA8_LANE(n)                                    \
    "lsr        %5, %3, #16                    \n"             \
-    "add        x12, %1, %5                    \n"             \
+    "add        %6, %1, %5                    \n"              \
    "add        %3, %3, %4                     \n"             \
-    "ld2        {v4.b, v5.b}["#n"], [x12]      \n"
+    MEMACCESS(6)                                               \
+    "ld2        {v4.b, v5.b}["#n"], [%6]      \n"

 void ScaleFilterCols_NEON(uint8* dst_ptr, const uint8* src_ptr,
                          int dst_width, int x, int dx) {
-  int tmp[4] = {0, 1, 2, 3};
+  int dx_offset[4] = {0, 1, 2, 3};
+  int *tmp = dx_offset;
+  const uint8* src_tmp = src_ptr;
  asm volatile (
    "dup        v0.4s, %w3                     \n"  // x
    "dup        v1.4s, %w4                     \n"  // dx
@@ -631,13 +634,15 @@ void ScaleFilterCols_NEON(uint8* dst_ptr, const uint8* src_ptr,
    "add       v2.4s, v2.4s, v0.4s             \n"
    "subs      %2, %2, #8                      \n"  // 8 processed per loop
    "b.gt      1b                              \n"
-  : "+r"(dst_ptr)          // %0
-  : "r"(src_ptr),          // %1
-    "r"(dst_width),        // %2
-    "r"(static_cast<ptrdiff_t>(x)),                // %3
-    "r"(static_cast<ptrdiff_t>(dx)),               // %4
-    "r"(tmp)               // %5
-  : "memory", "cc", "x12", "v0", "v1", "v2", "v3",
+  : "+r"(dst_ptr),          // %0
+    "+r"(src_ptr),          // %1
+    "+r"(dst_width),        // %2
+    "+r"(x),                // %3
+    "+r"(dx),               // %4
+    "+r"(tmp),              // %5
+    "+r"(src_tmp)           // %6
+  :
+  : "memory", "cc", "v0", "v1", "v2", "v3",
    "v4", "v5", "v6", "v7", "v16", "v17"
  );
 }