diff --git a/source/rotate_neon64.cc b/source/rotate_neon64.cc
index f52b0ed01543fa5b1fd28521ef1ec98a608c4e8e..e09719b67ac4ca305756f26f5e1aa4932e142e7a 100644
--- a/source/rotate_neon64.cc
+++ b/source/rotate_neon64.cc
@@ -30,12 +30,11 @@ void TransposeWx8_NEON(const uint8* src,
                        int dst_stride,
                        int width) {
   const uint8* src_temp;
-  int64 width64 = (int64)width;  // Work around clang 3.4 warning.
   asm volatile (
     // loops are on blocks of 8. loop will stop when
     // counter gets to or below 0. starting the counter
     // at w-8 allow for this
-    "sub %3, %3, #8                              \n"
+    "sub %w3, %w3, #8                            \n"

     // handle 8x8 blocks. this should be the majority of the plane
     "1:                                          \n"
@@ -106,19 +105,19 @@ void TransposeWx8_NEON(const uint8* src,

     "add %1, %1, #8                              \n"  // src += 8
     "add %2, %2, %6, lsl #3                      \n"  // dst += 8 * dst_stride
-    "subs %3, %3, #8                             \n"  // w -= 8
+    "subs %w3, %w3, #8                           \n"  // w -= 8
     "b.ge 1b                                     \n"

     // add 8 back to counter. if the result is 0 there are
     // no residuals.
-    "adds %3, %3, #8                             \n"
+    "adds %w3, %w3, #8                           \n"
     "b.eq 4f                                     \n"

     // some residual, so between 1 and 7 lines left to transpose
-    "cmp %3, #2                                  \n"
+    "cmp %w3, #2                                 \n"
     "b.lt 3f                                     \n"

-    "cmp %3, #4                                  \n"
+    "cmp %w3, #4                                 \n"
     "b.lt 2f                                     \n"

     // 4x8 block
@@ -171,12 +170,12 @@ void TransposeWx8_NEON(const uint8* src,

     "add %1, %1, #4                              \n"  // src += 4
     "add %2, %2, %6, lsl #2                      \n"  // dst += 4 * dst_stride
-    "subs %3, %3, #4                             \n"  // w -= 4
+    "subs %w3, %w3, #4                           \n"  // w -= 4
     "b.eq 4f                                     \n"

     // some residual, check to see if it includes a 2x8 block,
     // or less
-    "cmp %3, #2                                  \n"
+    "cmp %w3, #2                                 \n"
     "b.lt 3f                                     \n"

     // 2x8 block
@@ -211,7 +210,7 @@ void TransposeWx8_NEON(const uint8* src,

     "add %1, %1, #2                              \n"  // src += 2
     "add %2, %2, %6, lsl #1                      \n"  // dst += 2 * dst_stride
-    "subs %3, %3, #2                             \n"  // w -= 2
+    "subs %w3, %w3, #2                           \n"  // w -= 2
     "b.eq 4f                                     \n"

     // 1x8 block
@@ -241,7 +240,7 @@ void TransposeWx8_NEON(const uint8* src,
   : "=&r"(src_temp),                            // %0
     "+r"(src),                                  // %1
     "+r"(dst),                                  // %2
-    "+r"(width64)                               // %3
+    "+r"(width)                                 // %3
   : "r"(&kVTbl4x4Transpose),                    // %4
     "r"(static_cast<ptrdiff_t>(src_stride)),    // %5
     "r"(static_cast<ptrdiff_t>(dst_stride))     // %6
@@ -262,12 +261,11 @@ void TransposeUVWx8_NEON(const uint8* src,
                          int dst_stride_b,
                          int width) {
   const uint8* src_temp;
-  int64 width64 = (int64)width;  // Work around clang 3.4 warning.
   asm volatile (
     // loops are on blocks of 8. loop will stop when
     // counter gets to or below 0. starting the counter
     // at w-8 allow for this
-    "sub %4, %4, #8                              \n"
+    "sub %w4, %w4, #8                            \n"

     // handle 8x8 blocks. this should be the majority of the plane
     "1:                                          \n"
@@ -358,19 +356,19 @@ void TransposeUVWx8_NEON(const uint8* src,
     "add %1, %1, #16                             \n"  // src += 8*2
     "add %2, %2, %6, lsl #3                      \n"  // dst_a += 8 * dst_stride_a
     "add %3, %3, %7, lsl #3                      \n"  // dst_b += 8 * dst_stride_b
-    "subs %4, %4, #8                             \n"  // w -= 8
+    "subs %w4, %w4, #8                           \n"  // w -= 8
     "b.ge 1b                                     \n"

     // add 8 back to counter. if the result is 0 there are
     // no residuals.
-    "adds %4, %4, #8                             \n"
+    "adds %w4, %w4, #8                           \n"
     "b.eq 4f                                     \n"

     // some residual, so between 1 and 7 lines left to transpose
-    "cmp %4, #2                                  \n"
+    "cmp %w4, #2                                 \n"
     "b.lt 3f                                     \n"

-    "cmp %4, #4                                  \n"
+    "cmp %w4, #4                                 \n"
     "b.lt 2f                                     \n"

     // TODO(frkoenig): Clean this up
@@ -447,12 +445,12 @@ void TransposeUVWx8_NEON(const uint8* src,
     "add %1, %1, #8                              \n"  // src += 4 * 2
     "add %2, %2, %6, lsl #2                      \n"  // dst_a += 4 * dst_stride_a
     "add %3, %3, %7, lsl #2                      \n"  // dst_b += 4 * dst_stride_b
-    "subs %4, %4, #4                             \n"  // w -= 4
+    "subs %w4, %w4, #4                           \n"  // w -= 4
     "b.eq 4f                                     \n"

     // some residual, check to see if it includes a 2x8 block,
     // or less
-    "cmp %4, #2                                  \n"
+    "cmp %w4, #2                                 \n"
     "b.lt 3f                                     \n"

     // 2x8 block
@@ -497,7 +495,7 @@ void TransposeUVWx8_NEON(const uint8* src,
     "add %1, %1, #4                              \n"  // src += 2 * 2
     "add %2, %2, %6, lsl #1                      \n"  // dst_a += 2 * dst_stride_a
     "add %3, %3, %7, lsl #1                      \n"  // dst_b += 2 * dst_stride_b
-    "subs %4, %4, #2                             \n"  // w -= 2
+    "subs %w4, %w4, #2                           \n"  // w -= 2
     "b.eq 4f                                     \n"

     // 1x8 block
@@ -530,7 +528,7 @@ void TransposeUVWx8_NEON(const uint8* src,
     "+r"(src),                                   // %1
     "+r"(dst_a),                                 // %2
     "+r"(dst_b),                                 // %3
-    "+r"(width64)                                // %4
+    "+r"(width)                                  // %4
   : "r"(static_cast<ptrdiff_t>(src_stride)),     // %5
     "r"(static_cast<ptrdiff_t>(dst_stride_a)),   // %6
     "r"(static_cast<ptrdiff_t>(dst_stride_b)),   // %7
diff --git a/source/scale_neon64.cc b/source/scale_neon64.cc
index 1ff5f2bfe5e32c16b4ed20092d28314749835173..d2513ef6f83560874f721fe192ece79f53481c72 100644
--- a/source/scale_neon64.cc
+++ b/source/scale_neon64.cc
@@ -622,7 +622,6 @@ void ScaleFilterCols_NEON(uint8* dst_ptr,
   int dx_offset[4] = {0, 1, 2, 3};
   int* tmp = dx_offset;
   const uint8* src_tmp = src_ptr;
-  int64 dst_width64 = (int64)dst_width;  // Work around ios 64 bit warning.
   int64 x64 = (int64)x;
   int64 dx64 = (int64)dx;
   asm volatile (
@@ -669,7 +668,7 @@ void ScaleFilterCols_NEON(uint8* dst_ptr,
     "b.gt 1b                                     \n"
   : "+r"(dst_ptr),          // %0
     "+r"(src_ptr),          // %1
-    "+r"(dst_width64),      // %2
+    "+r"(dst_width),        // %2
     "+r"(x64),              // %3
     "+r"(dx64),             // %4
     "+r"(tmp),              // %5
@@ -970,7 +969,7 @@ void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb,
   "add %6, %1, %5, lsl #2 \n"                    \
   "add %3, %3, %4 \n"                            \
   MEMACCESS(6)                                   \
-  "ld1 {" #vn ".s}[" #n "], [%6] \n"
+  "ld1 {" #vn ".s}[" #n "], [%6] \n"
 // clang-format on

 void ScaleARGBCols_NEON(uint8* dst_argb,
@@ -979,7 +978,6 @@ void ScaleARGBCols_NEON(uint8* dst_argb,
                         int x,
                         int dx) {
   const uint8* src_tmp = src_argb;
-  int64 dst_width64 = (int64)dst_width;  // Work around ios 64 bit warning.
   int64 x64 = (int64)x;
   int64 dx64 = (int64)dx;
   int64 tmp64;
@@ -1000,7 +998,7 @@ void ScaleARGBCols_NEON(uint8* dst_argb,
     "b.gt 1b                                     \n"
   : "+r"(dst_argb),         // %0
     "+r"(src_argb),         // %1
-    "+r"(dst_width64),      // %2
+    "+r"(dst_width),        // %2
     "+r"(x64),              // %3
     "+r"(dx64),             // %4
     "=&r"(tmp64),           // %5
@@ -1031,7 +1029,6 @@ void ScaleARGBFilterCols_NEON(uint8* dst_argb,
   int dx_offset[4] = {0, 1, 2, 3};
   int* tmp = dx_offset;
   const uint8* src_tmp = src_argb;
-  int64 dst_width64 = (int64)dst_width;  // Work around ios 64 bit warning.
   int64 x64 = (int64)x;
   int64 dx64 = (int64)dx;
   asm volatile (
@@ -1077,7 +1074,7 @@ void ScaleARGBFilterCols_NEON(uint8* dst_argb,
     "b.gt 1b                                     \n"
   : "+r"(dst_argb),         // %0
     "+r"(src_argb),         // %1
-    "+r"(dst_width64),      // %2
+    "+r"(dst_width),        // %2
     "+r"(x64),              // %3
     "+r"(dx64),             // %4
     "+r"(tmp),              // %5