Commit cf101116 authored by Frank Barchard

Remove initialization to zero of output variables for inline assembly.

Inline assembly that uses temporary variables currently initializes
them to 0 and passes them in with the read-write output constraint
"+r". This CL replaces that constraint with "=&r" for most of them,
meaning a write-only output with early clobber (the register may be
written before all inputs have been consumed). This allows the
initialize-to-zero step to be removed, saving 1 instruction.

BUG=libyuv:580
TESTED=local libyuv build on gcc/linux and try bots
R=harryjin@google.com

Review URL: https://codereview.chromium.org/1895743008 .
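To illustrate the constraint change, here is a minimal standalone
sketch. It is not from the libyuv sources: the single x86 "movl"
instruction, the function names, and the printed values are
placeholders chosen only to show the "+r" vs "=&r" difference.

#include <stdio.h>

// Old pattern: "+r" is a read-write operand, so the compiler treats
// tmp as an input as well as an output, and tmp must be initialized
// before the asm block -- the "tmp = 0" costs one extra instruction.
static int old_style(void) {
  int tmp = 0;
  asm volatile("movl $42, %0" : "+r"(tmp));
  return tmp;
}

// New pattern: "=&r" is a write-only, early-clobber output. The
// compiler assumes nothing about tmp's initial value and will not
// place any input operand in the same register, so the
// zero-initialization can be dropped.
static int new_style(void) {
  int tmp;  // intentionally uninitialized; the asm writes it
  asm volatile("movl $42, %0" : "=&r"(tmp));
  return tmp;
}

int main(void) {
  printf("%d %d\n", old_style(), new_style());  // prints "42 42"
  return 0;
}

The early-clobber marker "&" is what makes this safe: with a plain
"=r" the compiler may assign an input to the same register as the
output, which breaks asm that writes the output before its last read
of the inputs.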
parent f160ce90
 Name: libyuv
 URL: http://code.google.com/p/libyuv/
-Version: 1584
+Version: 1585
 License: BSD
 License File: LICENSE
@@ -11,6 +11,6 @@
 #ifndef INCLUDE_LIBYUV_VERSION_H_  // NOLINT
 #define INCLUDE_LIBYUV_VERSION_H_
-#define LIBYUV_VERSION 1584
+#define LIBYUV_VERSION 1585
 #endif  // INCLUDE_LIBYUV_VERSION_H_  NOLINT
@@ -27,7 +27,7 @@ static uvec8 kVTbl4x4Transpose =
 void TransposeWx8_NEON(const uint8* src, int src_stride,
                        uint8* dst, int dst_stride,
                        int width) {
-  const uint8* src_temp = NULL;
+  const uint8* src_temp;
   asm volatile (
     // loops are on blocks of 8. loop will stop when
     // counter gets to or below 0. starting the counter
@@ -229,7 +229,7 @@ void TransposeWx8_NEON(const uint8* src, int src_stride,
     "4:                                        \n"
-  : "+r"(src_temp),   // %0
+  : "=&r"(src_temp),  // %0
     "+r"(src),        // %1
     "+r"(src_stride), // %2
     "+r"(dst),        // %3
@@ -247,7 +247,7 @@ void TransposeUVWx8_NEON(const uint8* src, int src_stride,
                          uint8* dst_a, int dst_stride_a,
                          uint8* dst_b, int dst_stride_b,
                          int width) {
-  const uint8* src_temp = NULL;
+  const uint8* src_temp;
   asm volatile (
     // loops are on blocks of 8. loop will stop when
     // counter gets to or below 0. starting the counter
@@ -512,7 +512,7 @@ void TransposeUVWx8_NEON(const uint8* src, int src_stride,
     "4:                                        \n"
-  : "+r"(src_temp),   // %0
+  : "=&r"(src_temp),  // %0
     "+r"(src),        // %1
     "+r"(src_stride), // %2
     "+r"(dst_a),      // %3
@@ -26,7 +26,7 @@ static uvec8 kVTbl4x4Transpose =
 void TransposeWx8_NEON(const uint8* src, int src_stride,
                        uint8* dst, int dst_stride, int width) {
-  const uint8* src_temp = NULL;
+  const uint8* src_temp;
   int64 width64 = (int64) width;  // Work around clang 3.4 warning.
   asm volatile (
     // loops are on blocks of 8. loop will stop when
@@ -235,7 +235,7 @@ void TransposeWx8_NEON(const uint8* src, int src_stride,
     "4:                                        \n"
-  : "+r"(src_temp),  // %0
+  : "=&r"(src_temp), // %0
     "+r"(src),       // %1
     "+r"(dst),       // %2
     "+r"(width64)    // %3
@@ -255,7 +255,7 @@ void TransposeUVWx8_NEON(const uint8* src, int src_stride,
                          uint8* dst_a, int dst_stride_a,
                          uint8* dst_b, int dst_stride_b,
                          int width) {
-  const uint8* src_temp = NULL;
+  const uint8* src_temp;
   int64 width64 = (int64) width;  // Work around clang 3.4 warning.
   asm volatile (
     // loops are on blocks of 8. loop will stop when
@@ -520,7 +520,7 @@ void TransposeUVWx8_NEON(const uint8* src, int src_stride,
     "4:                                        \n"
-  : "+r"(src_temp),  // %0
+  : "=&r"(src_temp), // %0
     "+r"(src),       // %1
     "+r"(dst_a),     // %2
     "+r"(dst_b),     // %3
@@ -1811,7 +1811,7 @@ void OMITFP I411ToARGBRow_SSSE3(const uint8* y_buf,
                                 uint8* dst_argb,
                                 const struct YuvConstants* yuvconstants,
                                 int width) {
-  int temp = 0;
+  int temp;
   asm volatile (
     YUVTORGB_SETUP(yuvconstants)
     "sub        %[u_buf],%[v_buf]              \n"
@@ -1823,15 +1823,15 @@ void OMITFP I411ToARGBRow_SSSE3(const uint8* y_buf,
     STOREARGB
     "subl       $0x8,%[width]                  \n"
     "jg         1b                             \n"
-  : [y_buf]"+r"(y_buf),    // %[y_buf]
-    [u_buf]"+r"(u_buf),    // %[u_buf]
-    [v_buf]"+r"(v_buf),    // %[v_buf]
+  : [y_buf]"+r"(y_buf),     // %[y_buf]
+    [u_buf]"+r"(u_buf),     // %[u_buf]
+    [v_buf]"+r"(v_buf),     // %[v_buf]
     [dst_argb]"+r"(dst_argb),  // %[dst_argb]
-    [temp]"+r"(temp),      // %[temp]
+    [temp]"=&r"(temp),      // %[temp]
 #if defined(__i386__) && defined(__pic__)
-    [width]"+m"(width)     // %[width]
+    [width]"+m"(width)      // %[width]
 #else
-    [width]"+rm"(width)    // %[width]
+    [width]"+rm"(width)     // %[width]
 #endif
   : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
   : "memory", "cc", NACL_R14 YUVTORGB_REGS
@@ -3732,7 +3732,7 @@ void ARGBAttenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width) {
 // Unattenuate 4 pixels at a time.
 void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb,
                              int width) {
-  uintptr_t alpha = 0;
+  uintptr_t alpha;
   asm volatile (
     // 4 pixel loop.
     LABELALIGN
@@ -3763,10 +3763,10 @@ void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb,
     "lea        " MEMLEA(0x10,1) ",%1          \n"
     "sub        $0x4,%2                        \n"
     "jg         1b                             \n"
-  : "+r"(src_argb),    // %0
-    "+r"(dst_argb),    // %1
-    "+r"(width),       // %2
-    "+r"(alpha)        // %3
+  : "+r"(src_argb),     // %0
+    "+r"(dst_argb),     // %1
+    "+r"(width),        // %2
+    "=&r"(alpha)        // %3
   : "r"(fixed_invtbl8)  // %4
   : "memory", "cc", NACL_R14
     "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
@@ -3782,7 +3782,7 @@ static const uvec8 kUnattenShuffleAlpha_AVX2 = {
 // Unattenuate 8 pixels at a time.
 void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb,
                              int width) {
-  uintptr_t alpha = 0;
+  uintptr_t alpha;
   asm volatile (
     "sub        %0,%1                          \n"
     "vbroadcastf128 %5,%%ymm5                  \n"
@@ -3831,10 +3831,10 @@ void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb,
     "sub        $0x8,%2                        \n"
     "jg         1b                             \n"
     "vzeroupper                                \n"
-  : "+r"(src_argb),    // %0
-    "+r"(dst_argb),    // %1
-    "+r"(width),       // %2
-    "+r"(alpha)        // %3
+  : "+r"(src_argb),     // %0
+    "+r"(dst_argb),     // %1
+    "+r"(width),        // %2
+    "=&r"(alpha)        // %3
   : "r"(fixed_invtbl8),  // %4
     "m"(kUnattenShuffleAlpha_AVX2)  // %5
   : "memory", "cc", NACL_R14
@@ -4759,7 +4759,7 @@ LIBYUV_API
 void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
                         uint8* dst_argb, const float* src_dudv, int width) {
   intptr_t src_argb_stride_temp = src_argb_stride;
-  intptr_t temp = 0;
+  intptr_t temp;
   asm volatile (
     "movq       " MEMACCESS(3) ",%%xmm2        \n"
     "movq       " MEMACCESS2(0x08,3) ",%%xmm7  \n"
@@ -4831,7 +4831,7 @@ void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
     "+r"(dst_argb),  // %2
     "+r"(src_dudv),  // %3
     "+rm"(width),    // %4
-    "+r"(temp)       // %5
+    "=&r"(temp)      // %5
   :
   : "memory", "cc", NACL_R14
     "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
@@ -5057,7 +5057,7 @@ void ARGBShuffleRow_AVX2(const uint8* src_argb, uint8* dst_argb,
 // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
 void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb,
                          const uint8* shuffler, int width) {
-  uintptr_t pixel_temp = 0u;
+  uintptr_t pixel_temp;
   asm volatile (
     "pxor       %%xmm5,%%xmm5                  \n"
     "mov        " MEMACCESS(4) ",%k2           \n"
@@ -5162,11 +5162,11 @@ void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb,
     "jg         3012b                          \n"
     "99:                                       \n"
-  : "+r"(src_argb),    // %0
-    "+r"(dst_argb),    // %1
-    "+d"(pixel_temp),  // %2
+  : "+r"(src_argb),     // %0
+    "+r"(dst_argb),     // %1
+    "=&d"(pixel_temp),  // %2
     "+r"(width)         // %3
-  : "r"(shuffler)      // %4
+  : "r"(shuffler)       // %4
   : "memory", "cc", NACL_R14
     "xmm0", "xmm1", "xmm5"
   );
@@ -5343,7 +5343,7 @@ void ARGBPolynomialRow_AVX2(const uint8* src_argb,
 // Tranform ARGB pixels with color table.
 void ARGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb,
                            int width) {
-  uintptr_t pixel_temp = 0u;
+  uintptr_t pixel_temp;
   asm volatile (
     // 1 pixel loop.
     LABELALIGN
@@ -5363,10 +5363,10 @@ void ARGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb,
     "mov        %b1," MEMACCESS2(-0x1,0) "     \n"
     "dec        %2                             \n"
     "jg         1b                             \n"
-  : "+r"(dst_argb),    // %0
-    "+d"(pixel_temp),  // %1
-    "+r"(width)        // %2
-  : "r"(table_argb)    // %3
+  : "+r"(dst_argb),     // %0
+    "=&d"(pixel_temp),  // %1
+    "+r"(width)         // %2
+  : "r"(table_argb)     // %3
   : "memory", "cc");
 }
 #endif  // HAS_ARGBCOLORTABLEROW_X86
@@ -5374,7 +5374,7 @@ void ARGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb,
 #ifdef HAS_RGBCOLORTABLEROW_X86
 // Tranform RGB pixels with color table.
 void RGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, int width) {
-  uintptr_t pixel_temp = 0u;
+  uintptr_t pixel_temp;
   asm volatile (
     // 1 pixel loop.
     LABELALIGN
@@ -5391,10 +5391,10 @@ void RGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, int width) {
     "mov        %b1," MEMACCESS2(-0x2,0) "     \n"
     "dec        %2                             \n"
     "jg         1b                             \n"
-  : "+r"(dst_argb),    // %0
-    "+d"(pixel_temp),  // %1
-    "+r"(width)        // %2
-  : "r"(table_argb)    // %3
+  : "+r"(dst_argb),     // %0
+    "=&d"(pixel_temp),  // %1
+    "+r"(width)         // %2
+  : "r"(table_argb)     // %3
   : "memory", "cc");
 }
 #endif  // HAS_RGBCOLORTABLEROW_X86
@@ -5404,8 +5404,8 @@ void RGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, int width) {
 void ARGBLumaColorTableRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
                                  int width,
                                  const uint8* luma, uint32 lumacoeff) {
-  uintptr_t pixel_temp = 0u;
-  uintptr_t table_temp = 0u;
+  uintptr_t pixel_temp;
+  uintptr_t table_temp;
   asm volatile (
     "movd       %6,%%xmm3                      \n"
     "pshufd     $0x0,%%xmm3,%%xmm3             \n"
@@ -5487,13 +5487,13 @@ void ARGBLumaColorTableRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
     "lea        " MEMLEA(0x10,3) ",%3          \n"
     "sub        $0x4,%4                        \n"
     "jg         1b                             \n"
-  : "+d"(pixel_temp),  // %0
-    "+a"(table_temp),  // %1
-    "+r"(src_argb),    // %2
-    "+r"(dst_argb),    // %3
-    "+rm"(width)       // %4
-  : "r"(luma),         // %5
-    "rm"(lumacoeff)    // %6
+  : "=&d"(pixel_temp),  // %0
+    "=&a"(table_temp),  // %1
+    "+r"(src_argb),     // %2
+    "+r"(dst_argb),     // %3
+    "+rm"(width)        // %4
+  : "r"(luma),          // %5
+    "rm"(lumacoeff)     // %6
   : "memory", "cc", "xmm0", "xmm3", "xmm4", "xmm5"
   );
 }
@@ -498,8 +498,8 @@ void MirrorRow_DSPR2(const uint8* src, uint8* dst, int width) {
 void MirrorUVRow_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
                        int width) {
-  int x = 0;
-  int y = 0;
+  int x;
+  int y;
   __asm__ __volatile__ (
     ".set push                                 \n"
     ".set noreorder                            \n"
@@ -579,7 +579,7 @@ void MirrorUVRow_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
       [dst_u] "+r" (dst_u),
       [dst_v] "+r" (dst_v),
       [x] "=&r" (x),
-      [y] "+r" (y)
+      [y] "=&r" (y)
     : [width] "r" (width)
     : "t0", "t1", "t2", "t3", "t4",
      "t5", "t7", "t8", "t9"
@@ -316,7 +316,7 @@ void ScaleRowDown4_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
 void ScaleRowDown4Box_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
                             uint8* dst_ptr, int dst_width) {
-  intptr_t stridex3 = 0;
+  intptr_t stridex3;
   asm volatile (
     "pcmpeqb    %%xmm4,%%xmm4                  \n"
     "psrlw      $0xf,%%xmm4                    \n"
@@ -361,7 +361,7 @@ void ScaleRowDown4Box_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
   : "+r"(src_ptr),   // %0
     "+r"(dst_ptr),   // %1
     "+r"(dst_width), // %2
-    "+r"(stridex3)   // %3
+    "=&r"(stridex3)  // %3
   : "r"((intptr_t)(src_stride))  // %4
   : "memory", "cc", NACL_R14
     "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
@@ -824,7 +824,7 @@ void ScaleAddRow_AVX2(const uint8* src_ptr, uint16* dst_ptr, int src_width) {
 // Bilinear column filtering. SSSE3 version.
 void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
                            int dst_width, int x, int dx) {
-  intptr_t x0 = 0, x1 = 0, temp_pixel = 0;
+  intptr_t x0, x1, temp_pixel;
   asm volatile (
     "movd       %6,%%xmm2                      \n"
     "movd       %7,%%xmm3                      \n"
@@ -880,14 +880,14 @@ void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
     "movd       %%xmm0,%k2                     \n"
     "mov        %b2," MEMACCESS(0) "           \n"
     "99:                                       \n"
-  : "+r"(dst_ptr),     // %0
-    "+r"(src_ptr),     // %1
-    "+a"(temp_pixel),  // %2
-    "+r"(x0),          // %3
-    "+r"(x1),          // %4
-    "+rm"(dst_width)   // %5
-  : "rm"(x),           // %6
-    "rm"(dx)           // %7
+  : "+r"(dst_ptr),      // %0
+    "+r"(src_ptr),      // %1
+    "=&a"(temp_pixel),  // %2
+    "=&r"(x0),          // %3
+    "=&r"(x1),          // %4
+    "+rm"(dst_width)    // %5
+  : "rm"(x),            // %6
+    "rm"(dx)            // %7
   : "memory", "cc", NACL_R14
     "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
   );
@@ -998,7 +998,7 @@ void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb,
 void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
                                int src_stepx, uint8* dst_argb, int dst_width) {
   intptr_t src_stepx_x4 = (intptr_t)(src_stepx);
-  intptr_t src_stepx_x12 = 0;
+  intptr_t src_stepx_x12;
   asm volatile (
     "lea        " MEMLEA3(0x00,1,4) ",%1       \n"
     "lea        " MEMLEA4(0x00,1,1,2) ",%4     \n"
@@ -1016,11 +1016,11 @@ void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
     "lea        " MEMLEA(0x10,2) ",%2          \n"
     "sub        $0x4,%3                        \n"
     "jg         1b                             \n"
-  : "+r"(src_argb),       // %0
-    "+r"(src_stepx_x4),   // %1
-    "+r"(dst_argb),       // %2
-    "+r"(dst_width),      // %3
-    "+r"(src_stepx_x12)   // %4
+  : "+r"(src_argb),       // %0
+    "+r"(src_stepx_x4),   // %1
+    "+r"(dst_argb),       // %2
+    "+r"(dst_width),      // %3
+    "=&r"(src_stepx_x12)  // %4
   :: "memory", "cc", NACL_R14
     "xmm0", "xmm1", "xmm2", "xmm3"
   );
@@ -1032,7 +1032,7 @@ void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb,
                                   ptrdiff_t src_stride, int src_stepx,
                                   uint8* dst_argb, int dst_width) {
   intptr_t src_stepx_x4 = (intptr_t)(src_stepx);
-  intptr_t src_stepx_x12 = 0;
+  intptr_t src_stepx_x12;
   intptr_t row1 = (intptr_t)(src_stride);
   asm volatile (
     "lea        " MEMLEA3(0x00,1,4) ",%1       \n"
@@ -1061,12 +1061,12 @@ void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb,
     "lea        " MEMLEA(0x10,2) ",%2          \n"
     "sub        $0x4,%3                        \n"
     "jg         1b                             \n"
-  : "+r"(src_argb),        // %0
-    "+r"(src_stepx_x4),    // %1
-    "+r"(dst_argb),        // %2
-    "+rm"(dst_width),      // %3
-    "+r"(src_stepx_x12),   // %4
-    "+r"(row1)             // %5
+  : "+r"(src_argb),        // %0
+    "+r"(src_stepx_x4),    // %1
+    "+r"(dst_argb),        // %2
+    "+rm"(dst_width),      // %3
+    "=&r"(src_stepx_x12),  // %4
+    "+r"(row1)             // %5
   :: "memory", "cc", NACL_R14
     "xmm0", "xmm1", "xmm2", "xmm3"
   );
@@ -1074,7 +1074,7 @@ void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb,
 void ScaleARGBCols_SSE2(uint8* dst_argb, const uint8* src_argb,
                         int dst_width, int x, int dx) {
-  intptr_t x0 = 0, x1 = 0;
+  intptr_t x0, x1;
   asm volatile (
     "movd       %5,%%xmm2                      \n"
     "movd       %6,%%xmm3                      \n"
@@ -1127,8 +1127,8 @@ void ScaleARGBCols_SSE2(uint8* dst_argb, const uint8* src_argb,
     MEMOPREG(movd,0x00,3,0,4,xmm0)             //  movd      (%3,%0,4),%%xmm0
     "movd       %%xmm0," MEMACCESS(2) "        \n"
     "99:                                       \n"
-  : "+a"(x0),        // %0
-    "+d"(x1),        // %1
+  : "=&a"(x0),       // %0
+    "=&d"(x1),       // %1
     "+r"(dst_argb),  // %2
     "+r"(src_argb),  // %3
     "+r"(dst_width)  // %4
@@ -1179,7 +1179,7 @@ static uvec8 kShuffleFractions = {
 // Bilinear row filtering combines 4x2 -> 4x1. SSSE3 version
 void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb,
                                int dst_width, int x, int dx) {
-  intptr_t x0 = 0, x1 = 0;
+  intptr_t x0, x1;
   asm volatile (
     "movdqa     %0,%%xmm4                      \n"
     "movdqa     %1,%%xmm5                      \n"
@@ -1242,8 +1242,8 @@ void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb,
   : "+r"(dst_argb),    // %0
     "+r"(src_argb),    // %1
     "+rm"(dst_width),  // %2
-    "+r"(x0),          // %3
-    "+r"(x1)           // %4
+    "=&r"(x0),         // %3
+    "=&r"(x1)          // %4
   : "rm"(x),           // %5
     "rm"(dx)           // %6
   : "memory", "cc", NACL_R14
@@ -532,7 +532,7 @@ void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr,
 void ScaleAddRows_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
                        uint16* dst_ptr, int src_width, int src_height) {
-  const uint8* src_tmp = NULL;
+  const uint8* src_tmp;
   asm volatile (
     "1:                                        \n"
     "mov        %0, %1                         \n"
@@ -552,12 +552,12 @@ void ScaleAddRows_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
     "add        %1, %1, #16                    \n"
     "subs       %4, %4, #16                    \n"  // 16 processed per loop
     "bgt        1b                             \n"
-  : "+r"(src_tmp),     // %0
-    "+r"(src_ptr),     // %1
-    "+r"(dst_ptr),     // %2
-    "+r"(src_stride),  // %3
-    "+r"(src_width),   // %4
-    "+r"(src_height)   // %5
+  : "=&r"(src_tmp),    // %0
+    "+r"(src_ptr),     // %1
+    "+r"(dst_ptr),     // %2
+    "+r"(src_stride),  // %3
+    "+r"(src_width),   // %4
+    "+r"(src_height)   // %5
   :
   : "memory", "cc", "r12", "q0", "q1", "q2", "q3"  // Clobber List
   );
@@ -909,7 +909,7 @@ void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb, ptrdiff_t src_stride,
 void ScaleARGBCols_NEON(uint8* dst_argb, const uint8* src_argb,
                         int dst_width, int x, int dx) {
-  int tmp = 0;
+  int tmp;
   const uint8* src_tmp = src_argb;
   asm volatile (
     "1:                                        \n"
@@ -926,13 +926,13 @@ void ScaleARGBCols_NEON(uint8* dst_argb, const uint8* src_argb,
     "vst1.32    {q0, q1}, [%0]!                \n"  // store pixels
     "subs       %2, %2, #8                     \n"  // 8 processed per loop
     "bgt        1b                             \n"
-  : "+r"(dst_argb),   // %0
-    "+r"(src_argb),   // %1
-    "+r"(dst_width),  // %2
-    "+r"(x),          // %3
-    "+r"(dx),         // %4
-    "+r"(tmp),        // %5
-    "+r"(src_tmp)     // %6
+  : "+r"(dst_argb),   // %0
+    "+r"(src_argb),   // %1
+    "+r"(dst_width),  // %2
+    "+r"(x),          // %3
+    "+r"(dx),         // %4
+    "=&r"(tmp),       // %5
+    "+r"(src_tmp)     // %6
   :
   : "memory", "cc", "q0", "q1"
   );
@@ -547,7 +547,7 @@ void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr,
 void ScaleAddRows_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
                        uint16* dst_ptr, int src_width, int src_height) {
-  const uint8* src_tmp = NULL;
+  const uint8* src_tmp;
   asm volatile (
     "1:                                        \n"
     "mov        %0, %1                         \n"
@@ -567,12 +567,12 @@ void ScaleAddRows_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
     "add        %1, %1, #16                    \n"
     "subs       %w4, %w4, #16                  \n"  // 16 processed per loop
     "b.gt       1b                             \n"
-  : "+r"(src_tmp),     // %0
-    "+r"(src_ptr),     // %1
-    "+r"(dst_ptr),     // %2
-    "+r"(src_stride),  // %3
-    "+r"(src_width),   // %4
-    "+r"(src_height)   // %5
+  : "=&r"(src_tmp),    // %0
+    "+r"(src_ptr),     // %1
+    "+r"(dst_ptr),     // %2
+    "+r"(src_stride),  // %3
+    "+r"(src_width),   // %4
+    "+r"(src_height)   // %5
  :
   : "memory", "cc", "w12", "v0", "v1", "v2", "v3"  // Clobber List
   );
@@ -931,7 +931,7 @@ void ScaleARGBCols_NEON(uint8* dst_argb, const uint8* src_argb,
   int64 dst_width64 = (int64) dst_width;  // Work around ios 64 bit warning.
   int64 x64 = (int64) x;
   int64 dx64 = (int64) dx;
-  int64 tmp64 = 0;
+  int64 tmp64;
   asm volatile (
     "1:                                        \n"
     LOAD1_DATA32_LANE(v0, 0)
@@ -947,13 +947,13 @@ void ScaleARGBCols_NEON(uint8* dst_argb, const uint8* src_argb,
     "st1        {v0.4s, v1.4s}, [%0], #32      \n"  // store pixels
     "subs       %w2, %w2, #8                   \n"  // 8 processed per loop
     "b.gt       1b                             \n"
-  : "+r"(dst_argb),     // %0
-    "+r"(src_argb),     // %1
-    "+r"(dst_width64),  // %2
-    "+r"(x64),          // %3
-    "+r"(dx64),         // %4
-    "+r"(tmp64),        // %5
-    "+r"(src_tmp)       // %6
+  : "+r"(dst_argb),     // %0
+    "+r"(src_argb),     // %1
+    "+r"(dst_width64),  // %2
+    "+r"(x64),          // %3
+    "+r"(dx64),         // %4
+    "=&r"(tmp64),       // %5
+    "+r"(src_tmp)       // %6
   :
   : "memory", "cc", "v0", "v1"
  );