Commit 6c94ad13 authored by Frank Barchard, committed by Commit Bot

Remove ARM NaCL macros from source

NaCL has been disabled for a while, so the code
will still build, but only with the C versions.
This change removes the MEMACCESS() macros from
the Neon and Neon64 sources.

BUG=libyuv:702
TEST=try bots build for arm.
R=kjellander@chromium.org

Change-Id: Id581a5c8ff71e18cc69595e7fee9337f97c44a19
Reviewed-on: https://chromium-review.googlesource.com/528332
Reviewed-by: Cheng Wang <wangcheng@google.com>
Commit-Queue: Frank Barchard <fbarchard@google.com>
parent 5f94a33e
...@@ -625,15 +625,6 @@ extern const struct YuvConstants SIMD_ALIGNED(kYvuH709Constants); // BT.709 ...@@ -625,15 +625,6 @@ extern const struct YuvConstants SIMD_ALIGNED(kYvuH709Constants); // BT.709
#op " $" #sel ",%%" #reg "," #offset "(%" #base ",%" #index "," #scale ")\n" #op " $" #sel ",%%" #reg "," #offset "(%" #base ",%" #index "," #scale ")\n"
#endif // defined(__native_client__) && defined(__x86_64__) #endif // defined(__native_client__) && defined(__x86_64__)
#if defined(__arm__) || defined(__aarch64__)
#undef MEMACCESS
#if defined(__native_client__)
#define MEMACCESS(base) ".p2align 3\nbic %" #base ", #0xc0000000\n"
#else
#define MEMACCESS(base)
#endif
#endif
// Intel Code Analyzer markers. Insert IACA_START IACA_END around code to be // Intel Code Analyzer markers. Insert IACA_START IACA_END around code to be
// measured and then run with iaca -64 libyuv_unittest. // measured and then run with iaca -64 libyuv_unittest.
// IACA_ASM_START and IACA_ASM_END are equivalents that can be used within // IACA_ASM_START and IACA_ASM_END are equivalents that can be used within
......
...@@ -64,9 +64,7 @@ uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count) { ...@@ -64,9 +64,7 @@ uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count) {
"vmov.u8 q11, #0 \n" "vmov.u8 q11, #0 \n"
"1: \n" "1: \n"
MEMACCESS(0)
"vld1.8 {q0}, [%0]! \n" "vld1.8 {q0}, [%0]! \n"
MEMACCESS(1)
"vld1.8 {q1}, [%1]! \n" "vld1.8 {q1}, [%1]! \n"
"subs %2, %2, #16 \n" "subs %2, %2, #16 \n"
"vsubl.u8 q2, d0, d2 \n" "vsubl.u8 q2, d0, d2 \n"
......
...@@ -59,9 +59,7 @@ uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count) { ...@@ -59,9 +59,7 @@ uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count) {
"eor v19.16b, v19.16b, v19.16b \n" "eor v19.16b, v19.16b, v19.16b \n"
"1: \n" "1: \n"
MEMACCESS(0)
"ld1 {v0.16b}, [%0], #16 \n" "ld1 {v0.16b}, [%0], #16 \n"
MEMACCESS(1)
"ld1 {v1.16b}, [%1], #16 \n" "ld1 {v1.16b}, [%1], #16 \n"
"subs %w2, %w2, #16 \n" "subs %w2, %w2, #16 \n"
"usubl v2.8h, v0.8b, v1.8b \n" "usubl v2.8h, v0.8b, v1.8b \n"
......
...@@ -30,7 +30,7 @@ void TransposeWx8_NEON(const uint8* src, ...@@ -30,7 +30,7 @@ void TransposeWx8_NEON(const uint8* src,
int dst_stride, int dst_stride,
int width) { int width) {
const uint8* src_temp; const uint8* src_temp;
asm volatile ( asm volatile(
// loops are on blocks of 8. loop will stop when // loops are on blocks of 8. loop will stop when
// counter gets to or below 0. starting the counter // counter gets to or below 0. starting the counter
// at w-8 allow for this // at w-8 allow for this
...@@ -40,21 +40,13 @@ void TransposeWx8_NEON(const uint8* src, ...@@ -40,21 +40,13 @@ void TransposeWx8_NEON(const uint8* src,
"1: \n" "1: \n"
"mov %0, %1 \n" "mov %0, %1 \n"
MEMACCESS(0)
"vld1.8 {d0}, [%0], %2 \n" "vld1.8 {d0}, [%0], %2 \n"
MEMACCESS(0)
"vld1.8 {d1}, [%0], %2 \n" "vld1.8 {d1}, [%0], %2 \n"
MEMACCESS(0)
"vld1.8 {d2}, [%0], %2 \n" "vld1.8 {d2}, [%0], %2 \n"
MEMACCESS(0)
"vld1.8 {d3}, [%0], %2 \n" "vld1.8 {d3}, [%0], %2 \n"
MEMACCESS(0)
"vld1.8 {d4}, [%0], %2 \n" "vld1.8 {d4}, [%0], %2 \n"
MEMACCESS(0)
"vld1.8 {d5}, [%0], %2 \n" "vld1.8 {d5}, [%0], %2 \n"
MEMACCESS(0)
"vld1.8 {d6}, [%0], %2 \n" "vld1.8 {d6}, [%0], %2 \n"
MEMACCESS(0)
"vld1.8 {d7}, [%0] \n" "vld1.8 {d7}, [%0] \n"
"vtrn.8 d1, d0 \n" "vtrn.8 d1, d0 \n"
...@@ -79,21 +71,13 @@ void TransposeWx8_NEON(const uint8* src, ...@@ -79,21 +71,13 @@ void TransposeWx8_NEON(const uint8* src,
"mov %0, %3 \n" "mov %0, %3 \n"
MEMACCESS(0)
"vst1.8 {d1}, [%0], %4 \n" "vst1.8 {d1}, [%0], %4 \n"
MEMACCESS(0)
"vst1.8 {d0}, [%0], %4 \n" "vst1.8 {d0}, [%0], %4 \n"
MEMACCESS(0)
"vst1.8 {d3}, [%0], %4 \n" "vst1.8 {d3}, [%0], %4 \n"
MEMACCESS(0)
"vst1.8 {d2}, [%0], %4 \n" "vst1.8 {d2}, [%0], %4 \n"
MEMACCESS(0)
"vst1.8 {d5}, [%0], %4 \n" "vst1.8 {d5}, [%0], %4 \n"
MEMACCESS(0)
"vst1.8 {d4}, [%0], %4 \n" "vst1.8 {d4}, [%0], %4 \n"
MEMACCESS(0)
"vst1.8 {d7}, [%0], %4 \n" "vst1.8 {d7}, [%0], %4 \n"
MEMACCESS(0)
"vst1.8 {d6}, [%0] \n" "vst1.8 {d6}, [%0] \n"
"add %1, #8 \n" // src += 8 "add %1, #8 \n" // src += 8
...@@ -115,26 +99,17 @@ void TransposeWx8_NEON(const uint8* src, ...@@ -115,26 +99,17 @@ void TransposeWx8_NEON(const uint8* src,
// 4x8 block // 4x8 block
"mov %0, %1 \n" "mov %0, %1 \n"
MEMACCESS(0)
"vld1.32 {d0[0]}, [%0], %2 \n" "vld1.32 {d0[0]}, [%0], %2 \n"
MEMACCESS(0)
"vld1.32 {d0[1]}, [%0], %2 \n" "vld1.32 {d0[1]}, [%0], %2 \n"
MEMACCESS(0)
"vld1.32 {d1[0]}, [%0], %2 \n" "vld1.32 {d1[0]}, [%0], %2 \n"
MEMACCESS(0)
"vld1.32 {d1[1]}, [%0], %2 \n" "vld1.32 {d1[1]}, [%0], %2 \n"
MEMACCESS(0)
"vld1.32 {d2[0]}, [%0], %2 \n" "vld1.32 {d2[0]}, [%0], %2 \n"
MEMACCESS(0)
"vld1.32 {d2[1]}, [%0], %2 \n" "vld1.32 {d2[1]}, [%0], %2 \n"
MEMACCESS(0)
"vld1.32 {d3[0]}, [%0], %2 \n" "vld1.32 {d3[0]}, [%0], %2 \n"
MEMACCESS(0)
"vld1.32 {d3[1]}, [%0] \n" "vld1.32 {d3[1]}, [%0] \n"
"mov %0, %3 \n" "mov %0, %3 \n"
MEMACCESS(6)
"vld1.8 {q3}, [%6] \n" "vld1.8 {q3}, [%6] \n"
"vtbl.8 d4, {d0, d1}, d6 \n" "vtbl.8 d4, {d0, d1}, d6 \n"
...@@ -144,23 +119,15 @@ void TransposeWx8_NEON(const uint8* src, ...@@ -144,23 +119,15 @@ void TransposeWx8_NEON(const uint8* src,
// TODO(frkoenig): Rework shuffle above to // TODO(frkoenig): Rework shuffle above to
// write out with 4 instead of 8 writes. // write out with 4 instead of 8 writes.
MEMACCESS(0)
"vst1.32 {d4[0]}, [%0], %4 \n" "vst1.32 {d4[0]}, [%0], %4 \n"
MEMACCESS(0)
"vst1.32 {d4[1]}, [%0], %4 \n" "vst1.32 {d4[1]}, [%0], %4 \n"
MEMACCESS(0)
"vst1.32 {d5[0]}, [%0], %4 \n" "vst1.32 {d5[0]}, [%0], %4 \n"
MEMACCESS(0)
"vst1.32 {d5[1]}, [%0] \n" "vst1.32 {d5[1]}, [%0] \n"
"add %0, %3, #4 \n" "add %0, %3, #4 \n"
MEMACCESS(0)
"vst1.32 {d0[0]}, [%0], %4 \n" "vst1.32 {d0[0]}, [%0], %4 \n"
MEMACCESS(0)
"vst1.32 {d0[1]}, [%0], %4 \n" "vst1.32 {d0[1]}, [%0], %4 \n"
MEMACCESS(0)
"vst1.32 {d1[0]}, [%0], %4 \n" "vst1.32 {d1[0]}, [%0], %4 \n"
MEMACCESS(0)
"vst1.32 {d1[1]}, [%0] \n" "vst1.32 {d1[1]}, [%0] \n"
"add %1, #4 \n" // src += 4 "add %1, #4 \n" // src += 4
...@@ -176,30 +143,20 @@ void TransposeWx8_NEON(const uint8* src, ...@@ -176,30 +143,20 @@ void TransposeWx8_NEON(const uint8* src,
// 2x8 block // 2x8 block
"2: \n" "2: \n"
"mov %0, %1 \n" "mov %0, %1 \n"
MEMACCESS(0)
"vld1.16 {d0[0]}, [%0], %2 \n" "vld1.16 {d0[0]}, [%0], %2 \n"
MEMACCESS(0)
"vld1.16 {d1[0]}, [%0], %2 \n" "vld1.16 {d1[0]}, [%0], %2 \n"
MEMACCESS(0)
"vld1.16 {d0[1]}, [%0], %2 \n" "vld1.16 {d0[1]}, [%0], %2 \n"
MEMACCESS(0)
"vld1.16 {d1[1]}, [%0], %2 \n" "vld1.16 {d1[1]}, [%0], %2 \n"
MEMACCESS(0)
"vld1.16 {d0[2]}, [%0], %2 \n" "vld1.16 {d0[2]}, [%0], %2 \n"
MEMACCESS(0)
"vld1.16 {d1[2]}, [%0], %2 \n" "vld1.16 {d1[2]}, [%0], %2 \n"
MEMACCESS(0)
"vld1.16 {d0[3]}, [%0], %2 \n" "vld1.16 {d0[3]}, [%0], %2 \n"
MEMACCESS(0)
"vld1.16 {d1[3]}, [%0] \n" "vld1.16 {d1[3]}, [%0] \n"
"vtrn.8 d0, d1 \n" "vtrn.8 d0, d1 \n"
"mov %0, %3 \n" "mov %0, %3 \n"
MEMACCESS(0)
"vst1.64 {d0}, [%0], %4 \n" "vst1.64 {d0}, [%0], %4 \n"
MEMACCESS(0)
"vst1.64 {d1}, [%0] \n" "vst1.64 {d1}, [%0] \n"
"add %1, #2 \n" // src += 2 "add %1, #2 \n" // src += 2
...@@ -209,24 +166,15 @@ void TransposeWx8_NEON(const uint8* src, ...@@ -209,24 +166,15 @@ void TransposeWx8_NEON(const uint8* src,
// 1x8 block // 1x8 block
"3: \n" "3: \n"
MEMACCESS(1)
"vld1.8 {d0[0]}, [%1], %2 \n" "vld1.8 {d0[0]}, [%1], %2 \n"
MEMACCESS(1)
"vld1.8 {d0[1]}, [%1], %2 \n" "vld1.8 {d0[1]}, [%1], %2 \n"
MEMACCESS(1)
"vld1.8 {d0[2]}, [%1], %2 \n" "vld1.8 {d0[2]}, [%1], %2 \n"
MEMACCESS(1)
"vld1.8 {d0[3]}, [%1], %2 \n" "vld1.8 {d0[3]}, [%1], %2 \n"
MEMACCESS(1)
"vld1.8 {d0[4]}, [%1], %2 \n" "vld1.8 {d0[4]}, [%1], %2 \n"
MEMACCESS(1)
"vld1.8 {d0[5]}, [%1], %2 \n" "vld1.8 {d0[5]}, [%1], %2 \n"
MEMACCESS(1)
"vld1.8 {d0[6]}, [%1], %2 \n" "vld1.8 {d0[6]}, [%1], %2 \n"
MEMACCESS(1)
"vld1.8 {d0[7]}, [%1] \n" "vld1.8 {d0[7]}, [%1] \n"
MEMACCESS(3)
"vst1.64 {d0}, [%3] \n" "vst1.64 {d0}, [%3] \n"
"4: \n" "4: \n"
...@@ -238,8 +186,7 @@ void TransposeWx8_NEON(const uint8* src, ...@@ -238,8 +186,7 @@ void TransposeWx8_NEON(const uint8* src,
"+r"(dst_stride), // %4 "+r"(dst_stride), // %4
"+r"(width) // %5 "+r"(width) // %5
: "r"(&kVTbl4x4Transpose) // %6 : "r"(&kVTbl4x4Transpose) // %6
: "memory", "cc", "q0", "q1", "q2", "q3" : "memory", "cc", "q0", "q1", "q2", "q3");
);
} }
static uvec8 kVTbl4x4TransposeDi = {0, 8, 1, 9, 2, 10, 3, 11, static uvec8 kVTbl4x4TransposeDi = {0, 8, 1, 9, 2, 10, 3, 11,
...@@ -253,7 +200,7 @@ void TransposeUVWx8_NEON(const uint8* src, ...@@ -253,7 +200,7 @@ void TransposeUVWx8_NEON(const uint8* src,
int dst_stride_b, int dst_stride_b,
int width) { int width) {
const uint8* src_temp; const uint8* src_temp;
asm volatile ( asm volatile(
// loops are on blocks of 8. loop will stop when // loops are on blocks of 8. loop will stop when
// counter gets to or below 0. starting the counter // counter gets to or below 0. starting the counter
// at w-8 allow for this // at w-8 allow for this
...@@ -263,21 +210,13 @@ void TransposeUVWx8_NEON(const uint8* src, ...@@ -263,21 +210,13 @@ void TransposeUVWx8_NEON(const uint8* src,
"1: \n" "1: \n"
"mov %0, %1 \n" "mov %0, %1 \n"
MEMACCESS(0)
"vld2.8 {d0, d1}, [%0], %2 \n" "vld2.8 {d0, d1}, [%0], %2 \n"
MEMACCESS(0)
"vld2.8 {d2, d3}, [%0], %2 \n" "vld2.8 {d2, d3}, [%0], %2 \n"
MEMACCESS(0)
"vld2.8 {d4, d5}, [%0], %2 \n" "vld2.8 {d4, d5}, [%0], %2 \n"
MEMACCESS(0)
"vld2.8 {d6, d7}, [%0], %2 \n" "vld2.8 {d6, d7}, [%0], %2 \n"
MEMACCESS(0)
"vld2.8 {d16, d17}, [%0], %2 \n" "vld2.8 {d16, d17}, [%0], %2 \n"
MEMACCESS(0)
"vld2.8 {d18, d19}, [%0], %2 \n" "vld2.8 {d18, d19}, [%0], %2 \n"
MEMACCESS(0)
"vld2.8 {d20, d21}, [%0], %2 \n" "vld2.8 {d20, d21}, [%0], %2 \n"
MEMACCESS(0)
"vld2.8 {d22, d23}, [%0] \n" "vld2.8 {d22, d23}, [%0] \n"
"vtrn.8 q1, q0 \n" "vtrn.8 q1, q0 \n"
...@@ -306,40 +245,24 @@ void TransposeUVWx8_NEON(const uint8* src, ...@@ -306,40 +245,24 @@ void TransposeUVWx8_NEON(const uint8* src,
"mov %0, %3 \n" "mov %0, %3 \n"
MEMACCESS(0)
"vst1.8 {d2}, [%0], %4 \n" "vst1.8 {d2}, [%0], %4 \n"
MEMACCESS(0)
"vst1.8 {d0}, [%0], %4 \n" "vst1.8 {d0}, [%0], %4 \n"
MEMACCESS(0)
"vst1.8 {d6}, [%0], %4 \n" "vst1.8 {d6}, [%0], %4 \n"
MEMACCESS(0)
"vst1.8 {d4}, [%0], %4 \n" "vst1.8 {d4}, [%0], %4 \n"
MEMACCESS(0)
"vst1.8 {d18}, [%0], %4 \n" "vst1.8 {d18}, [%0], %4 \n"
MEMACCESS(0)
"vst1.8 {d16}, [%0], %4 \n" "vst1.8 {d16}, [%0], %4 \n"
MEMACCESS(0)
"vst1.8 {d22}, [%0], %4 \n" "vst1.8 {d22}, [%0], %4 \n"
MEMACCESS(0)
"vst1.8 {d20}, [%0] \n" "vst1.8 {d20}, [%0] \n"
"mov %0, %5 \n" "mov %0, %5 \n"
MEMACCESS(0)
"vst1.8 {d3}, [%0], %6 \n" "vst1.8 {d3}, [%0], %6 \n"
MEMACCESS(0)
"vst1.8 {d1}, [%0], %6 \n" "vst1.8 {d1}, [%0], %6 \n"
MEMACCESS(0)
"vst1.8 {d7}, [%0], %6 \n" "vst1.8 {d7}, [%0], %6 \n"
MEMACCESS(0)
"vst1.8 {d5}, [%0], %6 \n" "vst1.8 {d5}, [%0], %6 \n"
MEMACCESS(0)
"vst1.8 {d19}, [%0], %6 \n" "vst1.8 {d19}, [%0], %6 \n"
MEMACCESS(0)
"vst1.8 {d17}, [%0], %6 \n" "vst1.8 {d17}, [%0], %6 \n"
MEMACCESS(0)
"vst1.8 {d23}, [%0], %6 \n" "vst1.8 {d23}, [%0], %6 \n"
MEMACCESS(0)
"vst1.8 {d21}, [%0] \n" "vst1.8 {d21}, [%0] \n"
"add %1, #8*2 \n" // src += 8*2 "add %1, #8*2 \n" // src += 8*2
...@@ -363,24 +286,15 @@ void TransposeUVWx8_NEON(const uint8* src, ...@@ -363,24 +286,15 @@ void TransposeUVWx8_NEON(const uint8* src,
// TODO(frkoenig): Clean this up // TODO(frkoenig): Clean this up
// 4x8 block // 4x8 block
"mov %0, %1 \n" "mov %0, %1 \n"
MEMACCESS(0)
"vld1.64 {d0}, [%0], %2 \n" "vld1.64 {d0}, [%0], %2 \n"
MEMACCESS(0)
"vld1.64 {d1}, [%0], %2 \n" "vld1.64 {d1}, [%0], %2 \n"
MEMACCESS(0)
"vld1.64 {d2}, [%0], %2 \n" "vld1.64 {d2}, [%0], %2 \n"
MEMACCESS(0)
"vld1.64 {d3}, [%0], %2 \n" "vld1.64 {d3}, [%0], %2 \n"
MEMACCESS(0)
"vld1.64 {d4}, [%0], %2 \n" "vld1.64 {d4}, [%0], %2 \n"
MEMACCESS(0)
"vld1.64 {d5}, [%0], %2 \n" "vld1.64 {d5}, [%0], %2 \n"
MEMACCESS(0)
"vld1.64 {d6}, [%0], %2 \n" "vld1.64 {d6}, [%0], %2 \n"
MEMACCESS(0)
"vld1.64 {d7}, [%0] \n" "vld1.64 {d7}, [%0] \n"
MEMACCESS(8)
"vld1.8 {q15}, [%8] \n" "vld1.8 {q15}, [%8] \n"
"vtrn.8 q0, q1 \n" "vtrn.8 q0, q1 \n"
...@@ -397,49 +311,35 @@ void TransposeUVWx8_NEON(const uint8* src, ...@@ -397,49 +311,35 @@ void TransposeUVWx8_NEON(const uint8* src,
"mov %0, %3 \n" "mov %0, %3 \n"
MEMACCESS(0)
"vst1.32 {d16[0]}, [%0], %4 \n" "vst1.32 {d16[0]}, [%0], %4 \n"
MEMACCESS(0)
"vst1.32 {d16[1]}, [%0], %4 \n" "vst1.32 {d16[1]}, [%0], %4 \n"
MEMACCESS(0)
"vst1.32 {d17[0]}, [%0], %4 \n" "vst1.32 {d17[0]}, [%0], %4 \n"
MEMACCESS(0)
"vst1.32 {d17[1]}, [%0], %4 \n" "vst1.32 {d17[1]}, [%0], %4 \n"
"add %0, %3, #4 \n" "add %0, %3, #4 \n"
MEMACCESS(0)
"vst1.32 {d20[0]}, [%0], %4 \n" "vst1.32 {d20[0]}, [%0], %4 \n"
MEMACCESS(0)
"vst1.32 {d20[1]}, [%0], %4 \n" "vst1.32 {d20[1]}, [%0], %4 \n"
MEMACCESS(0)
"vst1.32 {d21[0]}, [%0], %4 \n" "vst1.32 {d21[0]}, [%0], %4 \n"
MEMACCESS(0)
"vst1.32 {d21[1]}, [%0] \n" "vst1.32 {d21[1]}, [%0] \n"
"mov %0, %5 \n" "mov %0, %5 \n"
MEMACCESS(0)
"vst1.32 {d18[0]}, [%0], %6 \n" "vst1.32 {d18[0]}, [%0], %6 \n"
MEMACCESS(0)
"vst1.32 {d18[1]}, [%0], %6 \n" "vst1.32 {d18[1]}, [%0], %6 \n"
MEMACCESS(0)
"vst1.32 {d19[0]}, [%0], %6 \n" "vst1.32 {d19[0]}, [%0], %6 \n"
MEMACCESS(0)
"vst1.32 {d19[1]}, [%0], %6 \n" "vst1.32 {d19[1]}, [%0], %6 \n"
"add %0, %5, #4 \n" "add %0, %5, #4 \n"
MEMACCESS(0)
"vst1.32 {d22[0]}, [%0], %6 \n" "vst1.32 {d22[0]}, [%0], %6 \n"
MEMACCESS(0)
"vst1.32 {d22[1]}, [%0], %6 \n" "vst1.32 {d22[1]}, [%0], %6 \n"
MEMACCESS(0)
"vst1.32 {d23[0]}, [%0], %6 \n" "vst1.32 {d23[0]}, [%0], %6 \n"
MEMACCESS(0)
"vst1.32 {d23[1]}, [%0] \n" "vst1.32 {d23[1]}, [%0] \n"
"add %1, #4*2 \n" // src += 4 * 2 "add %1, #4*2 \n" // src += 4 * 2
"add %3, %3, %4, lsl #2 \n" // dst_a += 4 * dst_stride_a "add %3, %3, %4, lsl #2 \n" // dst_a += 4 *
"add %5, %5, %6, lsl #2 \n" // dst_b += 4 * dst_stride_b // dst_stride_a
"add %5, %5, %6, lsl #2 \n" // dst_b += 4 *
// dst_stride_b
"subs %7, #4 \n" // w -= 4 "subs %7, #4 \n" // w -= 4
"beq 4f \n" "beq 4f \n"
...@@ -451,21 +351,13 @@ void TransposeUVWx8_NEON(const uint8* src, ...@@ -451,21 +351,13 @@ void TransposeUVWx8_NEON(const uint8* src,
// 2x8 block // 2x8 block
"2: \n" "2: \n"
"mov %0, %1 \n" "mov %0, %1 \n"
MEMACCESS(0)
"vld2.16 {d0[0], d2[0]}, [%0], %2 \n" "vld2.16 {d0[0], d2[0]}, [%0], %2 \n"
MEMACCESS(0)
"vld2.16 {d1[0], d3[0]}, [%0], %2 \n" "vld2.16 {d1[0], d3[0]}, [%0], %2 \n"
MEMACCESS(0)
"vld2.16 {d0[1], d2[1]}, [%0], %2 \n" "vld2.16 {d0[1], d2[1]}, [%0], %2 \n"
MEMACCESS(0)
"vld2.16 {d1[1], d3[1]}, [%0], %2 \n" "vld2.16 {d1[1], d3[1]}, [%0], %2 \n"
MEMACCESS(0)
"vld2.16 {d0[2], d2[2]}, [%0], %2 \n" "vld2.16 {d0[2], d2[2]}, [%0], %2 \n"
MEMACCESS(0)
"vld2.16 {d1[2], d3[2]}, [%0], %2 \n" "vld2.16 {d1[2], d3[2]}, [%0], %2 \n"
MEMACCESS(0)
"vld2.16 {d0[3], d2[3]}, [%0], %2 \n" "vld2.16 {d0[3], d2[3]}, [%0], %2 \n"
MEMACCESS(0)
"vld2.16 {d1[3], d3[3]}, [%0] \n" "vld2.16 {d1[3], d3[3]}, [%0] \n"
"vtrn.8 d0, d1 \n" "vtrn.8 d0, d1 \n"
...@@ -473,46 +365,34 @@ void TransposeUVWx8_NEON(const uint8* src, ...@@ -473,46 +365,34 @@ void TransposeUVWx8_NEON(const uint8* src,
"mov %0, %3 \n" "mov %0, %3 \n"
MEMACCESS(0)
"vst1.64 {d0}, [%0], %4 \n" "vst1.64 {d0}, [%0], %4 \n"
MEMACCESS(0)
"vst1.64 {d2}, [%0] \n" "vst1.64 {d2}, [%0] \n"
"mov %0, %5 \n" "mov %0, %5 \n"
MEMACCESS(0)
"vst1.64 {d1}, [%0], %6 \n" "vst1.64 {d1}, [%0], %6 \n"
MEMACCESS(0)
"vst1.64 {d3}, [%0] \n" "vst1.64 {d3}, [%0] \n"
"add %1, #2*2 \n" // src += 2 * 2 "add %1, #2*2 \n" // src += 2 * 2
"add %3, %3, %4, lsl #1 \n" // dst_a += 2 * dst_stride_a "add %3, %3, %4, lsl #1 \n" // dst_a += 2 *
"add %5, %5, %6, lsl #1 \n" // dst_b += 2 * dst_stride_b // dst_stride_a
"add %5, %5, %6, lsl #1 \n" // dst_b += 2 *
// dst_stride_b
"subs %7, #2 \n" // w -= 2 "subs %7, #2 \n" // w -= 2
"beq 4f \n" "beq 4f \n"
// 1x8 block // 1x8 block
"3: \n" "3: \n"
MEMACCESS(1)
"vld2.8 {d0[0], d1[0]}, [%1], %2 \n" "vld2.8 {d0[0], d1[0]}, [%1], %2 \n"
MEMACCESS(1)
"vld2.8 {d0[1], d1[1]}, [%1], %2 \n" "vld2.8 {d0[1], d1[1]}, [%1], %2 \n"
MEMACCESS(1)
"vld2.8 {d0[2], d1[2]}, [%1], %2 \n" "vld2.8 {d0[2], d1[2]}, [%1], %2 \n"
MEMACCESS(1)
"vld2.8 {d0[3], d1[3]}, [%1], %2 \n" "vld2.8 {d0[3], d1[3]}, [%1], %2 \n"
MEMACCESS(1)
"vld2.8 {d0[4], d1[4]}, [%1], %2 \n" "vld2.8 {d0[4], d1[4]}, [%1], %2 \n"
MEMACCESS(1)
"vld2.8 {d0[5], d1[5]}, [%1], %2 \n" "vld2.8 {d0[5], d1[5]}, [%1], %2 \n"
MEMACCESS(1)
"vld2.8 {d0[6], d1[6]}, [%1], %2 \n" "vld2.8 {d0[6], d1[6]}, [%1], %2 \n"
MEMACCESS(1)
"vld2.8 {d0[7], d1[7]}, [%1] \n" "vld2.8 {d0[7], d1[7]}, [%1] \n"
MEMACCESS(3)
"vst1.64 {d0}, [%3] \n" "vst1.64 {d0}, [%3] \n"
MEMACCESS(5)
"vst1.64 {d1}, [%5] \n" "vst1.64 {d1}, [%5] \n"
"4: \n" "4: \n"
...@@ -526,9 +406,7 @@ void TransposeUVWx8_NEON(const uint8* src, ...@@ -526,9 +406,7 @@ void TransposeUVWx8_NEON(const uint8* src,
"+r"(dst_stride_b), // %6 "+r"(dst_stride_b), // %6
"+r"(width) // %7 "+r"(width) // %7
: "r"(&kVTbl4x4TransposeDi) // %8 : "r"(&kVTbl4x4TransposeDi) // %8
: "memory", "cc", : "memory", "cc", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11");
"q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11"
);
} }
#endif // defined(__ARM_NEON__) && !defined(__aarch64__) #endif // defined(__ARM_NEON__) && !defined(__aarch64__)
......
...@@ -40,21 +40,13 @@ void TransposeWx8_NEON(const uint8* src, ...@@ -40,21 +40,13 @@ void TransposeWx8_NEON(const uint8* src,
"1: \n" "1: \n"
"mov %0, %1 \n" "mov %0, %1 \n"
MEMACCESS(0)
"ld1 {v0.8b}, [%0], %5 \n" "ld1 {v0.8b}, [%0], %5 \n"
MEMACCESS(0)
"ld1 {v1.8b}, [%0], %5 \n" "ld1 {v1.8b}, [%0], %5 \n"
MEMACCESS(0)
"ld1 {v2.8b}, [%0], %5 \n" "ld1 {v2.8b}, [%0], %5 \n"
MEMACCESS(0)
"ld1 {v3.8b}, [%0], %5 \n" "ld1 {v3.8b}, [%0], %5 \n"
MEMACCESS(0)
"ld1 {v4.8b}, [%0], %5 \n" "ld1 {v4.8b}, [%0], %5 \n"
MEMACCESS(0)
"ld1 {v5.8b}, [%0], %5 \n" "ld1 {v5.8b}, [%0], %5 \n"
MEMACCESS(0)
"ld1 {v6.8b}, [%0], %5 \n" "ld1 {v6.8b}, [%0], %5 \n"
MEMACCESS(0)
"ld1 {v7.8b}, [%0] \n" "ld1 {v7.8b}, [%0] \n"
"trn2 v16.8b, v0.8b, v1.8b \n" "trn2 v16.8b, v0.8b, v1.8b \n"
...@@ -86,21 +78,13 @@ void TransposeWx8_NEON(const uint8* src, ...@@ -86,21 +78,13 @@ void TransposeWx8_NEON(const uint8* src,
"mov %0, %2 \n" "mov %0, %2 \n"
MEMACCESS(0)
"st1 {v17.8b}, [%0], %6 \n" "st1 {v17.8b}, [%0], %6 \n"
MEMACCESS(0)
"st1 {v16.8b}, [%0], %6 \n" "st1 {v16.8b}, [%0], %6 \n"
MEMACCESS(0)
"st1 {v19.8b}, [%0], %6 \n" "st1 {v19.8b}, [%0], %6 \n"
MEMACCESS(0)
"st1 {v18.8b}, [%0], %6 \n" "st1 {v18.8b}, [%0], %6 \n"
MEMACCESS(0)
"st1 {v21.8b}, [%0], %6 \n" "st1 {v21.8b}, [%0], %6 \n"
MEMACCESS(0)
"st1 {v20.8b}, [%0], %6 \n" "st1 {v20.8b}, [%0], %6 \n"
MEMACCESS(0)
"st1 {v23.8b}, [%0], %6 \n" "st1 {v23.8b}, [%0], %6 \n"
MEMACCESS(0)
"st1 {v22.8b}, [%0] \n" "st1 {v22.8b}, [%0] \n"
"add %1, %1, #8 \n" // src += 8 "add %1, %1, #8 \n" // src += 8
...@@ -122,26 +106,17 @@ void TransposeWx8_NEON(const uint8* src, ...@@ -122,26 +106,17 @@ void TransposeWx8_NEON(const uint8* src,
// 4x8 block // 4x8 block
"mov %0, %1 \n" "mov %0, %1 \n"
MEMACCESS(0)
"ld1 {v0.s}[0], [%0], %5 \n" "ld1 {v0.s}[0], [%0], %5 \n"
MEMACCESS(0)
"ld1 {v0.s}[1], [%0], %5 \n" "ld1 {v0.s}[1], [%0], %5 \n"
MEMACCESS(0)
"ld1 {v0.s}[2], [%0], %5 \n" "ld1 {v0.s}[2], [%0], %5 \n"
MEMACCESS(0)
"ld1 {v0.s}[3], [%0], %5 \n" "ld1 {v0.s}[3], [%0], %5 \n"
MEMACCESS(0)
"ld1 {v1.s}[0], [%0], %5 \n" "ld1 {v1.s}[0], [%0], %5 \n"
MEMACCESS(0)
"ld1 {v1.s}[1], [%0], %5 \n" "ld1 {v1.s}[1], [%0], %5 \n"
MEMACCESS(0)
"ld1 {v1.s}[2], [%0], %5 \n" "ld1 {v1.s}[2], [%0], %5 \n"
MEMACCESS(0)
"ld1 {v1.s}[3], [%0] \n" "ld1 {v1.s}[3], [%0] \n"
"mov %0, %2 \n" "mov %0, %2 \n"
MEMACCESS(4)
"ld1 {v2.16b}, [%4] \n" "ld1 {v2.16b}, [%4] \n"
"tbl v3.16b, {v0.16b}, v2.16b \n" "tbl v3.16b, {v0.16b}, v2.16b \n"
...@@ -149,23 +124,15 @@ void TransposeWx8_NEON(const uint8* src, ...@@ -149,23 +124,15 @@ void TransposeWx8_NEON(const uint8* src,
// TODO(frkoenig): Rework shuffle above to // TODO(frkoenig): Rework shuffle above to
// write out with 4 instead of 8 writes. // write out with 4 instead of 8 writes.
MEMACCESS(0)
"st1 {v3.s}[0], [%0], %6 \n" "st1 {v3.s}[0], [%0], %6 \n"
MEMACCESS(0)
"st1 {v3.s}[1], [%0], %6 \n" "st1 {v3.s}[1], [%0], %6 \n"
MEMACCESS(0)
"st1 {v3.s}[2], [%0], %6 \n" "st1 {v3.s}[2], [%0], %6 \n"
MEMACCESS(0)
"st1 {v3.s}[3], [%0] \n" "st1 {v3.s}[3], [%0] \n"
"add %0, %2, #4 \n" "add %0, %2, #4 \n"
MEMACCESS(0)
"st1 {v0.s}[0], [%0], %6 \n" "st1 {v0.s}[0], [%0], %6 \n"
MEMACCESS(0)
"st1 {v0.s}[1], [%0], %6 \n" "st1 {v0.s}[1], [%0], %6 \n"
MEMACCESS(0)
"st1 {v0.s}[2], [%0], %6 \n" "st1 {v0.s}[2], [%0], %6 \n"
MEMACCESS(0)
"st1 {v0.s}[3], [%0] \n" "st1 {v0.s}[3], [%0] \n"
"add %1, %1, #4 \n" // src += 4 "add %1, %1, #4 \n" // src += 4
...@@ -181,21 +148,13 @@ void TransposeWx8_NEON(const uint8* src, ...@@ -181,21 +148,13 @@ void TransposeWx8_NEON(const uint8* src,
// 2x8 block // 2x8 block
"2: \n" "2: \n"
"mov %0, %1 \n" "mov %0, %1 \n"
MEMACCESS(0)
"ld1 {v0.h}[0], [%0], %5 \n" "ld1 {v0.h}[0], [%0], %5 \n"
MEMACCESS(0)
"ld1 {v1.h}[0], [%0], %5 \n" "ld1 {v1.h}[0], [%0], %5 \n"
MEMACCESS(0)
"ld1 {v0.h}[1], [%0], %5 \n" "ld1 {v0.h}[1], [%0], %5 \n"
MEMACCESS(0)
"ld1 {v1.h}[1], [%0], %5 \n" "ld1 {v1.h}[1], [%0], %5 \n"
MEMACCESS(0)
"ld1 {v0.h}[2], [%0], %5 \n" "ld1 {v0.h}[2], [%0], %5 \n"
MEMACCESS(0)
"ld1 {v1.h}[2], [%0], %5 \n" "ld1 {v1.h}[2], [%0], %5 \n"
MEMACCESS(0)
"ld1 {v0.h}[3], [%0], %5 \n" "ld1 {v0.h}[3], [%0], %5 \n"
MEMACCESS(0)
"ld1 {v1.h}[3], [%0] \n" "ld1 {v1.h}[3], [%0] \n"
"trn2 v2.8b, v0.8b, v1.8b \n" "trn2 v2.8b, v0.8b, v1.8b \n"
...@@ -203,9 +162,7 @@ void TransposeWx8_NEON(const uint8* src, ...@@ -203,9 +162,7 @@ void TransposeWx8_NEON(const uint8* src,
"mov %0, %2 \n" "mov %0, %2 \n"
MEMACCESS(0)
"st1 {v3.8b}, [%0], %6 \n" "st1 {v3.8b}, [%0], %6 \n"
MEMACCESS(0)
"st1 {v2.8b}, [%0] \n" "st1 {v2.8b}, [%0] \n"
"add %1, %1, #2 \n" // src += 2 "add %1, %1, #2 \n" // src += 2
...@@ -215,24 +172,15 @@ void TransposeWx8_NEON(const uint8* src, ...@@ -215,24 +172,15 @@ void TransposeWx8_NEON(const uint8* src,
// 1x8 block // 1x8 block
"3: \n" "3: \n"
MEMACCESS(1)
"ld1 {v0.b}[0], [%1], %5 \n" "ld1 {v0.b}[0], [%1], %5 \n"
MEMACCESS(1)
"ld1 {v0.b}[1], [%1], %5 \n" "ld1 {v0.b}[1], [%1], %5 \n"
MEMACCESS(1)
"ld1 {v0.b}[2], [%1], %5 \n" "ld1 {v0.b}[2], [%1], %5 \n"
MEMACCESS(1)
"ld1 {v0.b}[3], [%1], %5 \n" "ld1 {v0.b}[3], [%1], %5 \n"
MEMACCESS(1)
"ld1 {v0.b}[4], [%1], %5 \n" "ld1 {v0.b}[4], [%1], %5 \n"
MEMACCESS(1)
"ld1 {v0.b}[5], [%1], %5 \n" "ld1 {v0.b}[5], [%1], %5 \n"
MEMACCESS(1)
"ld1 {v0.b}[6], [%1], %5 \n" "ld1 {v0.b}[6], [%1], %5 \n"
MEMACCESS(1)
"ld1 {v0.b}[7], [%1] \n" "ld1 {v0.b}[7], [%1] \n"
MEMACCESS(2)
"st1 {v0.8b}, [%2] \n" "st1 {v0.8b}, [%2] \n"
"4: \n" "4: \n"
...@@ -271,21 +219,13 @@ void TransposeUVWx8_NEON(const uint8* src, ...@@ -271,21 +219,13 @@ void TransposeUVWx8_NEON(const uint8* src,
"1: \n" "1: \n"
"mov %0, %1 \n" "mov %0, %1 \n"
MEMACCESS(0)
"ld1 {v0.16b}, [%0], %5 \n" "ld1 {v0.16b}, [%0], %5 \n"
MEMACCESS(0)
"ld1 {v1.16b}, [%0], %5 \n" "ld1 {v1.16b}, [%0], %5 \n"
MEMACCESS(0)
"ld1 {v2.16b}, [%0], %5 \n" "ld1 {v2.16b}, [%0], %5 \n"
MEMACCESS(0)
"ld1 {v3.16b}, [%0], %5 \n" "ld1 {v3.16b}, [%0], %5 \n"
MEMACCESS(0)
"ld1 {v4.16b}, [%0], %5 \n" "ld1 {v4.16b}, [%0], %5 \n"
MEMACCESS(0)
"ld1 {v5.16b}, [%0], %5 \n" "ld1 {v5.16b}, [%0], %5 \n"
MEMACCESS(0)
"ld1 {v6.16b}, [%0], %5 \n" "ld1 {v6.16b}, [%0], %5 \n"
MEMACCESS(0)
"ld1 {v7.16b}, [%0] \n" "ld1 {v7.16b}, [%0] \n"
"trn1 v16.16b, v0.16b, v1.16b \n" "trn1 v16.16b, v0.16b, v1.16b \n"
...@@ -317,40 +257,24 @@ void TransposeUVWx8_NEON(const uint8* src, ...@@ -317,40 +257,24 @@ void TransposeUVWx8_NEON(const uint8* src,
"mov %0, %2 \n" "mov %0, %2 \n"
MEMACCESS(0)
"st1 {v16.d}[0], [%0], %6 \n" "st1 {v16.d}[0], [%0], %6 \n"
MEMACCESS(0)
"st1 {v18.d}[0], [%0], %6 \n" "st1 {v18.d}[0], [%0], %6 \n"
MEMACCESS(0)
"st1 {v17.d}[0], [%0], %6 \n" "st1 {v17.d}[0], [%0], %6 \n"
MEMACCESS(0)
"st1 {v19.d}[0], [%0], %6 \n" "st1 {v19.d}[0], [%0], %6 \n"
MEMACCESS(0)
"st1 {v16.d}[1], [%0], %6 \n" "st1 {v16.d}[1], [%0], %6 \n"
MEMACCESS(0)
"st1 {v18.d}[1], [%0], %6 \n" "st1 {v18.d}[1], [%0], %6 \n"
MEMACCESS(0)
"st1 {v17.d}[1], [%0], %6 \n" "st1 {v17.d}[1], [%0], %6 \n"
MEMACCESS(0)
"st1 {v19.d}[1], [%0] \n" "st1 {v19.d}[1], [%0] \n"
"mov %0, %3 \n" "mov %0, %3 \n"
MEMACCESS(0)
"st1 {v20.d}[0], [%0], %7 \n" "st1 {v20.d}[0], [%0], %7 \n"
MEMACCESS(0)
"st1 {v22.d}[0], [%0], %7 \n" "st1 {v22.d}[0], [%0], %7 \n"
MEMACCESS(0)
"st1 {v21.d}[0], [%0], %7 \n" "st1 {v21.d}[0], [%0], %7 \n"
MEMACCESS(0)
"st1 {v23.d}[0], [%0], %7 \n" "st1 {v23.d}[0], [%0], %7 \n"
MEMACCESS(0)
"st1 {v20.d}[1], [%0], %7 \n" "st1 {v20.d}[1], [%0], %7 \n"
MEMACCESS(0)
"st1 {v22.d}[1], [%0], %7 \n" "st1 {v22.d}[1], [%0], %7 \n"
MEMACCESS(0)
"st1 {v21.d}[1], [%0], %7 \n" "st1 {v21.d}[1], [%0], %7 \n"
MEMACCESS(0)
"st1 {v23.d}[1], [%0] \n" "st1 {v23.d}[1], [%0] \n"
"add %1, %1, #16 \n" // src += 8*2 "add %1, %1, #16 \n" // src += 8*2
...@@ -374,24 +298,15 @@ void TransposeUVWx8_NEON(const uint8* src, ...@@ -374,24 +298,15 @@ void TransposeUVWx8_NEON(const uint8* src,
// TODO(frkoenig): Clean this up // TODO(frkoenig): Clean this up
// 4x8 block // 4x8 block
"mov %0, %1 \n" "mov %0, %1 \n"
MEMACCESS(0)
"ld1 {v0.8b}, [%0], %5 \n" "ld1 {v0.8b}, [%0], %5 \n"
MEMACCESS(0)
"ld1 {v1.8b}, [%0], %5 \n" "ld1 {v1.8b}, [%0], %5 \n"
MEMACCESS(0)
"ld1 {v2.8b}, [%0], %5 \n" "ld1 {v2.8b}, [%0], %5 \n"
MEMACCESS(0)
"ld1 {v3.8b}, [%0], %5 \n" "ld1 {v3.8b}, [%0], %5 \n"
MEMACCESS(0)
"ld1 {v4.8b}, [%0], %5 \n" "ld1 {v4.8b}, [%0], %5 \n"
MEMACCESS(0)
"ld1 {v5.8b}, [%0], %5 \n" "ld1 {v5.8b}, [%0], %5 \n"
MEMACCESS(0)
"ld1 {v6.8b}, [%0], %5 \n" "ld1 {v6.8b}, [%0], %5 \n"
MEMACCESS(0)
"ld1 {v7.8b}, [%0] \n" "ld1 {v7.8b}, [%0] \n"
MEMACCESS(8)
"ld1 {v30.16b}, [%8], #16 \n" "ld1 {v30.16b}, [%8], #16 \n"
"ld1 {v31.16b}, [%8] \n" "ld1 {v31.16b}, [%8] \n"
...@@ -402,44 +317,28 @@ void TransposeUVWx8_NEON(const uint8* src, ...@@ -402,44 +317,28 @@ void TransposeUVWx8_NEON(const uint8* src,
"mov %0, %2 \n" "mov %0, %2 \n"
MEMACCESS(0)
"st1 {v16.s}[0], [%0], %6 \n" "st1 {v16.s}[0], [%0], %6 \n"
MEMACCESS(0)
"st1 {v16.s}[1], [%0], %6 \n" "st1 {v16.s}[1], [%0], %6 \n"
MEMACCESS(0)
"st1 {v16.s}[2], [%0], %6 \n" "st1 {v16.s}[2], [%0], %6 \n"
MEMACCESS(0)
"st1 {v16.s}[3], [%0], %6 \n" "st1 {v16.s}[3], [%0], %6 \n"
"add %0, %2, #4 \n" "add %0, %2, #4 \n"
MEMACCESS(0)
"st1 {v18.s}[0], [%0], %6 \n" "st1 {v18.s}[0], [%0], %6 \n"
MEMACCESS(0)
"st1 {v18.s}[1], [%0], %6 \n" "st1 {v18.s}[1], [%0], %6 \n"
MEMACCESS(0)
"st1 {v18.s}[2], [%0], %6 \n" "st1 {v18.s}[2], [%0], %6 \n"
MEMACCESS(0)
"st1 {v18.s}[3], [%0] \n" "st1 {v18.s}[3], [%0] \n"
"mov %0, %3 \n" "mov %0, %3 \n"
MEMACCESS(0)
"st1 {v17.s}[0], [%0], %7 \n" "st1 {v17.s}[0], [%0], %7 \n"
MEMACCESS(0)
"st1 {v17.s}[1], [%0], %7 \n" "st1 {v17.s}[1], [%0], %7 \n"
MEMACCESS(0)
"st1 {v17.s}[2], [%0], %7 \n" "st1 {v17.s}[2], [%0], %7 \n"
MEMACCESS(0)
"st1 {v17.s}[3], [%0], %7 \n" "st1 {v17.s}[3], [%0], %7 \n"
"add %0, %3, #4 \n" "add %0, %3, #4 \n"
MEMACCESS(0)
"st1 {v19.s}[0], [%0], %7 \n" "st1 {v19.s}[0], [%0], %7 \n"
MEMACCESS(0)
"st1 {v19.s}[1], [%0], %7 \n" "st1 {v19.s}[1], [%0], %7 \n"
MEMACCESS(0)
"st1 {v19.s}[2], [%0], %7 \n" "st1 {v19.s}[2], [%0], %7 \n"
MEMACCESS(0)
"st1 {v19.s}[3], [%0] \n" "st1 {v19.s}[3], [%0] \n"
"add %1, %1, #8 \n" // src += 4 * 2 "add %1, %1, #8 \n" // src += 4 * 2
...@@ -456,21 +355,13 @@ void TransposeUVWx8_NEON(const uint8* src, ...@@ -456,21 +355,13 @@ void TransposeUVWx8_NEON(const uint8* src,
// 2x8 block // 2x8 block
"2: \n" "2: \n"
"mov %0, %1 \n" "mov %0, %1 \n"
MEMACCESS(0)
"ld2 {v0.h, v1.h}[0], [%0], %5 \n" "ld2 {v0.h, v1.h}[0], [%0], %5 \n"
MEMACCESS(0)
"ld2 {v2.h, v3.h}[0], [%0], %5 \n" "ld2 {v2.h, v3.h}[0], [%0], %5 \n"
MEMACCESS(0)
"ld2 {v0.h, v1.h}[1], [%0], %5 \n" "ld2 {v0.h, v1.h}[1], [%0], %5 \n"
MEMACCESS(0)
"ld2 {v2.h, v3.h}[1], [%0], %5 \n" "ld2 {v2.h, v3.h}[1], [%0], %5 \n"
MEMACCESS(0)
"ld2 {v0.h, v1.h}[2], [%0], %5 \n" "ld2 {v0.h, v1.h}[2], [%0], %5 \n"
MEMACCESS(0)
"ld2 {v2.h, v3.h}[2], [%0], %5 \n" "ld2 {v2.h, v3.h}[2], [%0], %5 \n"
MEMACCESS(0)
"ld2 {v0.h, v1.h}[3], [%0], %5 \n" "ld2 {v0.h, v1.h}[3], [%0], %5 \n"
MEMACCESS(0)
"ld2 {v2.h, v3.h}[3], [%0] \n" "ld2 {v2.h, v3.h}[3], [%0] \n"
"trn1 v4.8b, v0.8b, v2.8b \n" "trn1 v4.8b, v0.8b, v2.8b \n"
...@@ -480,16 +371,12 @@ void TransposeUVWx8_NEON(const uint8* src, ...@@ -480,16 +371,12 @@ void TransposeUVWx8_NEON(const uint8* src,
"mov %0, %2 \n" "mov %0, %2 \n"
MEMACCESS(0)
"st1 {v4.d}[0], [%0], %6 \n" "st1 {v4.d}[0], [%0], %6 \n"
MEMACCESS(0)
"st1 {v6.d}[0], [%0] \n" "st1 {v6.d}[0], [%0] \n"
"mov %0, %3 \n" "mov %0, %3 \n"
MEMACCESS(0)
"st1 {v5.d}[0], [%0], %7 \n" "st1 {v5.d}[0], [%0], %7 \n"
MEMACCESS(0)
"st1 {v7.d}[0], [%0] \n" "st1 {v7.d}[0], [%0] \n"
"add %1, %1, #4 \n" // src += 2 * 2 "add %1, %1, #4 \n" // src += 2 * 2
...@@ -500,26 +387,16 @@ void TransposeUVWx8_NEON(const uint8* src, ...@@ -500,26 +387,16 @@ void TransposeUVWx8_NEON(const uint8* src,
// 1x8 block // 1x8 block
"3: \n" "3: \n"
MEMACCESS(1)
"ld2 {v0.b, v1.b}[0], [%1], %5 \n" "ld2 {v0.b, v1.b}[0], [%1], %5 \n"
MEMACCESS(1)
"ld2 {v0.b, v1.b}[1], [%1], %5 \n" "ld2 {v0.b, v1.b}[1], [%1], %5 \n"
MEMACCESS(1)
"ld2 {v0.b, v1.b}[2], [%1], %5 \n" "ld2 {v0.b, v1.b}[2], [%1], %5 \n"
MEMACCESS(1)
"ld2 {v0.b, v1.b}[3], [%1], %5 \n" "ld2 {v0.b, v1.b}[3], [%1], %5 \n"
MEMACCESS(1)
"ld2 {v0.b, v1.b}[4], [%1], %5 \n" "ld2 {v0.b, v1.b}[4], [%1], %5 \n"
MEMACCESS(1)
"ld2 {v0.b, v1.b}[5], [%1], %5 \n" "ld2 {v0.b, v1.b}[5], [%1], %5 \n"
MEMACCESS(1)
"ld2 {v0.b, v1.b}[6], [%1], %5 \n" "ld2 {v0.b, v1.b}[6], [%1], %5 \n"
MEMACCESS(1)
"ld2 {v0.b, v1.b}[7], [%1] \n" "ld2 {v0.b, v1.b}[7], [%1] \n"
MEMACCESS(2)
"st1 {v0.d}[0], [%2] \n" "st1 {v0.d}[0], [%2] \n"
MEMACCESS(3)
"st1 {v1.d}[0], [%3] \n" "st1 {v1.d}[0], [%3] \n"
"4: \n" "4: \n"
......
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
...@@ -28,13 +28,11 @@ void ScaleRowDown2_NEON(const uint8* src_ptr, ...@@ -28,13 +28,11 @@ void ScaleRowDown2_NEON(const uint8* src_ptr,
uint8* dst, uint8* dst,
int dst_width) { int dst_width) {
(void)src_stride; (void)src_stride;
asm volatile ( asm volatile(
"1: \n" "1: \n"
// load even pixels into q0, odd into q1 // load even pixels into q0, odd into q1
MEMACCESS(0)
"vld2.8 {q0, q1}, [%0]! \n" "vld2.8 {q0, q1}, [%0]! \n"
"subs %2, %2, #16 \n" // 16 processed per loop "subs %2, %2, #16 \n" // 16 processed per loop
MEMACCESS(1)
"vst1.8 {q1}, [%1]! \n" // store odd pixels "vst1.8 {q1}, [%1]! \n" // store odd pixels
"bgt 1b \n" "bgt 1b \n"
: "+r"(src_ptr), // %0 : "+r"(src_ptr), // %0
...@@ -51,16 +49,16 @@ void ScaleRowDown2Linear_NEON(const uint8* src_ptr, ...@@ -51,16 +49,16 @@ void ScaleRowDown2Linear_NEON(const uint8* src_ptr,
uint8* dst, uint8* dst,
int dst_width) { int dst_width) {
(void)src_stride; (void)src_stride;
asm volatile ( asm volatile(
"1: \n" "1: \n"
MEMACCESS(0) "vld1.8 {q0, q1}, [%0]! \n" // load pixels and post
"vld1.8 {q0, q1}, [%0]! \n" // load pixels and post inc // inc
"subs %2, %2, #16 \n" // 16 processed per loop "subs %2, %2, #16 \n" // 16 processed per loop
"vpaddl.u8 q0, q0 \n" // add adjacent "vpaddl.u8 q0, q0 \n" // add adjacent
"vpaddl.u8 q1, q1 \n" "vpaddl.u8 q1, q1 \n"
"vrshrn.u16 d0, q0, #1 \n" // downshift, round and pack "vrshrn.u16 d0, q0, #1 \n" // downshift, round and
// pack
"vrshrn.u16 d1, q1, #1 \n" "vrshrn.u16 d1, q1, #1 \n"
MEMACCESS(1)
"vst1.8 {q0}, [%1]! \n" "vst1.8 {q0}, [%1]! \n"
"bgt 1b \n" "bgt 1b \n"
: "+r"(src_ptr), // %0 : "+r"(src_ptr), // %0
...@@ -76,22 +74,21 @@ void ScaleRowDown2Box_NEON(const uint8* src_ptr, ...@@ -76,22 +74,21 @@ void ScaleRowDown2Box_NEON(const uint8* src_ptr,
ptrdiff_t src_stride, ptrdiff_t src_stride,
uint8* dst, uint8* dst,
int dst_width) { int dst_width) {
asm volatile ( asm volatile(
// change the stride to row 2 pointer // change the stride to row 2 pointer
"add %1, %0 \n" "add %1, %0 \n"
"1: \n" "1: \n"
MEMACCESS(0)
"vld1.8 {q0, q1}, [%0]! \n" // load row 1 and post inc "vld1.8 {q0, q1}, [%0]! \n" // load row 1 and post inc
MEMACCESS(1)
"vld1.8 {q2, q3}, [%1]! \n" // load row 2 and post inc "vld1.8 {q2, q3}, [%1]! \n" // load row 2 and post inc
"subs %3, %3, #16 \n" // 16 processed per loop "subs %3, %3, #16 \n" // 16 processed per loop
"vpaddl.u8 q0, q0 \n" // row 1 add adjacent "vpaddl.u8 q0, q0 \n" // row 1 add adjacent
"vpaddl.u8 q1, q1 \n" "vpaddl.u8 q1, q1 \n"
"vpadal.u8 q0, q2 \n" // row 2 add adjacent + row1 "vpadal.u8 q0, q2 \n" // row 2 add adjacent +
// row1
"vpadal.u8 q1, q3 \n" "vpadal.u8 q1, q3 \n"
"vrshrn.u16 d0, q0, #2 \n" // downshift, round and pack "vrshrn.u16 d0, q0, #2 \n" // downshift, round and
// pack
"vrshrn.u16 d1, q1, #2 \n" "vrshrn.u16 d1, q1, #2 \n"
MEMACCESS(2)
"vst1.8 {q0}, [%2]! \n" "vst1.8 {q0}, [%2]! \n"
"bgt 1b \n" "bgt 1b \n"
: "+r"(src_ptr), // %0 : "+r"(src_ptr), // %0
...@@ -108,20 +105,17 @@ void ScaleRowDown4_NEON(const uint8* src_ptr, ...@@ -108,20 +105,17 @@ void ScaleRowDown4_NEON(const uint8* src_ptr,
uint8* dst_ptr, uint8* dst_ptr,
int dst_width) { int dst_width) {
(void)src_stride; (void)src_stride;
asm volatile ( asm volatile(
"1: \n" "1: \n"
MEMACCESS(0)
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0 "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
"subs %2, %2, #8 \n" // 8 processed per loop "subs %2, %2, #8 \n" // 8 processed per loop
MEMACCESS(1)
"vst1.8 {d2}, [%1]! \n" "vst1.8 {d2}, [%1]! \n"
"bgt 1b \n" "bgt 1b \n"
: "+r"(src_ptr), // %0 : "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1 "+r"(dst_ptr), // %1
"+r"(dst_width) // %2 "+r"(dst_width) // %2
: :
: "q0", "q1", "memory", "cc" : "q0", "q1", "memory", "cc");
);
} }
void ScaleRowDown4Box_NEON(const uint8* src_ptr, void ScaleRowDown4Box_NEON(const uint8* src_ptr,
...@@ -131,15 +125,11 @@ void ScaleRowDown4Box_NEON(const uint8* src_ptr, ...@@ -131,15 +125,11 @@ void ScaleRowDown4Box_NEON(const uint8* src_ptr,
const uint8* src_ptr1 = src_ptr + src_stride; const uint8* src_ptr1 = src_ptr + src_stride;
const uint8* src_ptr2 = src_ptr + src_stride * 2; const uint8* src_ptr2 = src_ptr + src_stride * 2;
const uint8* src_ptr3 = src_ptr + src_stride * 3; const uint8* src_ptr3 = src_ptr + src_stride * 3;
asm volatile ( asm volatile(
"1: \n" "1: \n"
MEMACCESS(0)
"vld1.8 {q0}, [%0]! \n" // load up 16x4 "vld1.8 {q0}, [%0]! \n" // load up 16x4
MEMACCESS(3)
"vld1.8 {q1}, [%3]! \n" "vld1.8 {q1}, [%3]! \n"
MEMACCESS(4)
"vld1.8 {q2}, [%4]! \n" "vld1.8 {q2}, [%4]! \n"
MEMACCESS(5)
"vld1.8 {q3}, [%5]! \n" "vld1.8 {q3}, [%5]! \n"
"subs %2, %2, #4 \n" "subs %2, %2, #4 \n"
"vpaddl.u8 q0, q0 \n" "vpaddl.u8 q0, q0 \n"
...@@ -149,7 +139,6 @@ void ScaleRowDown4Box_NEON(const uint8* src_ptr, ...@@ -149,7 +139,6 @@ void ScaleRowDown4Box_NEON(const uint8* src_ptr,
"vpaddl.u16 q0, q0 \n" "vpaddl.u16 q0, q0 \n"
"vrshrn.u32 d0, q0, #4 \n" // divide by 16 w/rounding "vrshrn.u32 d0, q0, #4 \n" // divide by 16 w/rounding
"vmovn.u16 d0, q0 \n" "vmovn.u16 d0, q0 \n"
MEMACCESS(1)
"vst1.32 {d0[0]}, [%1]! \n" "vst1.32 {d0[0]}, [%1]! \n"
"bgt 1b \n" "bgt 1b \n"
: "+r"(src_ptr), // %0 : "+r"(src_ptr), // %0
...@@ -159,8 +148,7 @@ void ScaleRowDown4Box_NEON(const uint8* src_ptr, ...@@ -159,8 +148,7 @@ void ScaleRowDown4Box_NEON(const uint8* src_ptr,
"+r"(src_ptr2), // %4 "+r"(src_ptr2), // %4
"+r"(src_ptr3) // %5 "+r"(src_ptr3) // %5
: :
: "q0", "q1", "q2", "q3", "memory", "cc" : "q0", "q1", "q2", "q3", "memory", "cc");
);
} }
// Down scale from 4 to 3 pixels. Use the neon multilane read/write // Down scale from 4 to 3 pixels. Use the neon multilane read/write
...@@ -171,34 +159,29 @@ void ScaleRowDown34_NEON(const uint8* src_ptr, ...@@ -171,34 +159,29 @@ void ScaleRowDown34_NEON(const uint8* src_ptr,
uint8* dst_ptr, uint8* dst_ptr,
int dst_width) { int dst_width) {
(void)src_stride; (void)src_stride;
asm volatile ( asm volatile(
"1: \n" "1: \n"
MEMACCESS(0)
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0 "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
"subs %2, %2, #24 \n" "subs %2, %2, #24 \n"
"vmov d2, d3 \n" // order d0, d1, d2 "vmov d2, d3 \n" // order d0, d1, d2
MEMACCESS(1)
"vst3.8 {d0, d1, d2}, [%1]! \n" "vst3.8 {d0, d1, d2}, [%1]! \n"
"bgt 1b \n" "bgt 1b \n"
: "+r"(src_ptr), // %0 : "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1 "+r"(dst_ptr), // %1
"+r"(dst_width) // %2 "+r"(dst_width) // %2
: :
: "d0", "d1", "d2", "d3", "memory", "cc" : "d0", "d1", "d2", "d3", "memory", "cc");
);
} }
void ScaleRowDown34_0_Box_NEON(const uint8* src_ptr, void ScaleRowDown34_0_Box_NEON(const uint8* src_ptr,
ptrdiff_t src_stride, ptrdiff_t src_stride,
uint8* dst_ptr, uint8* dst_ptr,
int dst_width) { int dst_width) {
asm volatile ( asm volatile(
"vmov.u8 d24, #3 \n" "vmov.u8 d24, #3 \n"
"add %3, %0 \n" "add %3, %0 \n"
"1: \n" "1: \n"
MEMACCESS(0)
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0 "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
MEMACCESS(3)
"vld4.8 {d4, d5, d6, d7}, [%3]! \n" // src line 1 "vld4.8 {d4, d5, d6, d7}, [%3]! \n" // src line 1
"subs %2, %2, #24 \n" "subs %2, %2, #24 \n"
...@@ -235,7 +218,6 @@ void ScaleRowDown34_0_Box_NEON(const uint8* src_ptr, ...@@ -235,7 +218,6 @@ void ScaleRowDown34_0_Box_NEON(const uint8* src_ptr,
"vmlal.u8 q8, d3, d24 \n" "vmlal.u8 q8, d3, d24 \n"
"vqrshrn.u16 d2, q8, #2 \n" "vqrshrn.u16 d2, q8, #2 \n"
MEMACCESS(1)
"vst3.8 {d0, d1, d2}, [%1]! \n" "vst3.8 {d0, d1, d2}, [%1]! \n"
"bgt 1b \n" "bgt 1b \n"
...@@ -244,21 +226,19 @@ void ScaleRowDown34_0_Box_NEON(const uint8* src_ptr, ...@@ -244,21 +226,19 @@ void ScaleRowDown34_0_Box_NEON(const uint8* src_ptr,
"+r"(dst_width), // %2 "+r"(dst_width), // %2
"+r"(src_stride) // %3 "+r"(src_stride) // %3
: :
: "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "d24", "memory", "cc" : "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "d24", "memory",
); "cc");
} }
void ScaleRowDown34_1_Box_NEON(const uint8* src_ptr, void ScaleRowDown34_1_Box_NEON(const uint8* src_ptr,
ptrdiff_t src_stride, ptrdiff_t src_stride,
uint8* dst_ptr, uint8* dst_ptr,
int dst_width) { int dst_width) {
asm volatile ( asm volatile(
"vmov.u8 d24, #3 \n" "vmov.u8 d24, #3 \n"
"add %3, %0 \n" "add %3, %0 \n"
"1: \n" "1: \n"
MEMACCESS(0)
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0 "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
MEMACCESS(3)
"vld4.8 {d4, d5, d6, d7}, [%3]! \n" // src line 1 "vld4.8 {d4, d5, d6, d7}, [%3]! \n" // src line 1
"subs %2, %2, #24 \n" "subs %2, %2, #24 \n"
// average src line 0 with src line 1 // average src line 0 with src line 1
...@@ -278,7 +258,6 @@ void ScaleRowDown34_1_Box_NEON(const uint8* src_ptr, ...@@ -278,7 +258,6 @@ void ScaleRowDown34_1_Box_NEON(const uint8* src_ptr,
"vmlal.u8 q3, d3, d24 \n" "vmlal.u8 q3, d3, d24 \n"
"vqrshrn.u16 d2, q3, #2 \n" "vqrshrn.u16 d2, q3, #2 \n"
MEMACCESS(1)
"vst3.8 {d0, d1, d2}, [%1]! \n" "vst3.8 {d0, d1, d2}, [%1]! \n"
"bgt 1b \n" "bgt 1b \n"
: "+r"(src_ptr), // %0 : "+r"(src_ptr), // %0
...@@ -286,8 +265,7 @@ void ScaleRowDown34_1_Box_NEON(const uint8* src_ptr, ...@@ -286,8 +265,7 @@ void ScaleRowDown34_1_Box_NEON(const uint8* src_ptr,
"+r"(dst_width), // %2 "+r"(dst_width), // %2
"+r"(src_stride) // %3 "+r"(src_stride) // %3
: :
: "r4", "q0", "q1", "q2", "q3", "d24", "memory", "cc" : "r4", "q0", "q1", "q2", "q3", "d24", "memory", "cc");
);
} }
#define HAS_SCALEROWDOWN38_NEON #define HAS_SCALEROWDOWN38_NEON
...@@ -305,26 +283,21 @@ void ScaleRowDown38_NEON(const uint8* src_ptr, ...@@ -305,26 +283,21 @@ void ScaleRowDown38_NEON(const uint8* src_ptr,
uint8* dst_ptr, uint8* dst_ptr,
int dst_width) { int dst_width) {
(void)src_stride; (void)src_stride;
asm volatile ( asm volatile(
MEMACCESS(3)
"vld1.8 {q3}, [%3] \n" "vld1.8 {q3}, [%3] \n"
"1: \n" "1: \n"
MEMACCESS(0)
"vld1.8 {d0, d1, d2, d3}, [%0]! \n" "vld1.8 {d0, d1, d2, d3}, [%0]! \n"
"subs %2, %2, #12 \n" "subs %2, %2, #12 \n"
"vtbl.u8 d4, {d0, d1, d2, d3}, d6 \n" "vtbl.u8 d4, {d0, d1, d2, d3}, d6 \n"
"vtbl.u8 d5, {d0, d1, d2, d3}, d7 \n" "vtbl.u8 d5, {d0, d1, d2, d3}, d7 \n"
MEMACCESS(1)
"vst1.8 {d4}, [%1]! \n" "vst1.8 {d4}, [%1]! \n"
MEMACCESS(1)
"vst1.32 {d5[0]}, [%1]! \n" "vst1.32 {d5[0]}, [%1]! \n"
"bgt 1b \n" "bgt 1b \n"
: "+r"(src_ptr), // %0 : "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1 "+r"(dst_ptr), // %1
"+r"(dst_width) // %2 "+r"(dst_width) // %2
: "r"(&kShuf38) // %3 : "r"(&kShuf38) // %3
: "d0", "d1", "d2", "d3", "d4", "d5", "memory", "cc" : "d0", "d1", "d2", "d3", "d4", "d5", "memory", "cc");
);
} }
// 32x3 -> 12x1 // 32x3 -> 12x1
...@@ -334,12 +307,9 @@ void OMITFP ScaleRowDown38_3_Box_NEON(const uint8* src_ptr, ...@@ -334,12 +307,9 @@ void OMITFP ScaleRowDown38_3_Box_NEON(const uint8* src_ptr,
int dst_width) { int dst_width) {
const uint8* src_ptr1 = src_ptr + src_stride * 2; const uint8* src_ptr1 = src_ptr + src_stride * 2;
asm volatile ( asm volatile(
MEMACCESS(5)
"vld1.16 {q13}, [%5] \n" "vld1.16 {q13}, [%5] \n"
MEMACCESS(6)
"vld1.8 {q14}, [%6] \n" "vld1.8 {q14}, [%6] \n"
MEMACCESS(7)
"vld1.8 {q15}, [%7] \n" "vld1.8 {q15}, [%7] \n"
"add %3, %0 \n" "add %3, %0 \n"
"1: \n" "1: \n"
...@@ -348,11 +318,8 @@ void OMITFP ScaleRowDown38_3_Box_NEON(const uint8* src_ptr, ...@@ -348,11 +318,8 @@ void OMITFP ScaleRowDown38_3_Box_NEON(const uint8* src_ptr,
// d1 = 10 50 11 51 12 52 13 53 // d1 = 10 50 11 51 12 52 13 53
// d2 = 20 60 21 61 22 62 23 63 // d2 = 20 60 21 61 22 62 23 63
// d3 = 30 70 31 71 32 72 33 73 // d3 = 30 70 31 71 32 72 33 73
MEMACCESS(0)
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" "vld4.8 {d0, d1, d2, d3}, [%0]! \n"
MEMACCESS(3)
"vld4.8 {d4, d5, d6, d7}, [%3]! \n" "vld4.8 {d4, d5, d6, d7}, [%3]! \n"
MEMACCESS(4)
"vld4.8 {d16, d17, d18, d19}, [%4]! \n" "vld4.8 {d16, d17, d18, d19}, [%4]! \n"
"subs %2, %2, #12 \n" "subs %2, %2, #12 \n"
...@@ -430,9 +397,7 @@ void OMITFP ScaleRowDown38_3_Box_NEON(const uint8* src_ptr, ...@@ -430,9 +397,7 @@ void OMITFP ScaleRowDown38_3_Box_NEON(const uint8* src_ptr,
"vtbl.u8 d3, {d0, d1, d2}, d28 \n" "vtbl.u8 d3, {d0, d1, d2}, d28 \n"
"vtbl.u8 d4, {d0, d1, d2}, d29 \n" "vtbl.u8 d4, {d0, d1, d2}, d29 \n"
MEMACCESS(1)
"vst1.8 {d3}, [%1]! \n" "vst1.8 {d3}, [%1]! \n"
MEMACCESS(1)
"vst1.32 {d4[0]}, [%1]! \n" "vst1.32 {d4[0]}, [%1]! \n"
"bgt 1b \n" "bgt 1b \n"
: "+r"(src_ptr), // %0 : "+r"(src_ptr), // %0
...@@ -443,8 +408,8 @@ void OMITFP ScaleRowDown38_3_Box_NEON(const uint8* src_ptr, ...@@ -443,8 +408,8 @@ void OMITFP ScaleRowDown38_3_Box_NEON(const uint8* src_ptr,
: "r"(&kMult38_Div6), // %5 : "r"(&kMult38_Div6), // %5
"r"(&kShuf38_2), // %6 "r"(&kShuf38_2), // %6
"r"(&kMult38_Div9) // %7 "r"(&kMult38_Div9) // %7
: "q0", "q1", "q2", "q3", "q8", "q9", "q13", "q14", "q15", "memory", "cc" : "q0", "q1", "q2", "q3", "q8", "q9", "q13", "q14", "q15", "memory",
); "cc");
} }
// 32x2 -> 12x1 // 32x2 -> 12x1
...@@ -452,10 +417,8 @@ void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr, ...@@ -452,10 +417,8 @@ void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr,
ptrdiff_t src_stride, ptrdiff_t src_stride,
uint8* dst_ptr, uint8* dst_ptr,
int dst_width) { int dst_width) {
asm volatile ( asm volatile(
MEMACCESS(4)
"vld1.16 {q13}, [%4] \n" "vld1.16 {q13}, [%4] \n"
MEMACCESS(5)
"vld1.8 {q14}, [%5] \n" "vld1.8 {q14}, [%5] \n"
"add %3, %0 \n" "add %3, %0 \n"
"1: \n" "1: \n"
...@@ -464,9 +427,7 @@ void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr, ...@@ -464,9 +427,7 @@ void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr,
// d1 = 10 50 11 51 12 52 13 53 // d1 = 10 50 11 51 12 52 13 53
// d2 = 20 60 21 61 22 62 23 63 // d2 = 20 60 21 61 22 62 23 63
// d3 = 30 70 31 71 32 72 33 73 // d3 = 30 70 31 71 32 72 33 73
MEMACCESS(0)
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" "vld4.8 {d0, d1, d2, d3}, [%0]! \n"
MEMACCESS(3)
"vld4.8 {d4, d5, d6, d7}, [%3]! \n" "vld4.8 {d4, d5, d6, d7}, [%3]! \n"
"subs %2, %2, #12 \n" "subs %2, %2, #12 \n"
...@@ -533,9 +494,7 @@ void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr, ...@@ -533,9 +494,7 @@ void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr,
"vtbl.u8 d3, {d0, d1, d2}, d28 \n" "vtbl.u8 d3, {d0, d1, d2}, d28 \n"
"vtbl.u8 d4, {d0, d1, d2}, d29 \n" "vtbl.u8 d4, {d0, d1, d2}, d29 \n"
MEMACCESS(1)
"vst1.8 {d3}, [%1]! \n" "vst1.8 {d3}, [%1]! \n"
MEMACCESS(1)
"vst1.32 {d4[0]}, [%1]! \n" "vst1.32 {d4[0]}, [%1]! \n"
"bgt 1b \n" "bgt 1b \n"
: "+r"(src_ptr), // %0 : "+r"(src_ptr), // %0
...@@ -544,8 +503,7 @@ void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr, ...@@ -544,8 +503,7 @@ void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr,
"+r"(src_stride) // %3 "+r"(src_stride) // %3
: "r"(&kMult38_Div6), // %4 : "r"(&kMult38_Div6), // %4
"r"(&kShuf38_2) // %5 "r"(&kShuf38_2) // %5
: "q0", "q1", "q2", "q3", "q13", "q14", "memory", "cc" : "q0", "q1", "q2", "q3", "q13", "q14", "memory", "cc");
);
} }
void ScaleAddRows_NEON(const uint8* src_ptr, void ScaleAddRows_NEON(const uint8* src_ptr,
...@@ -554,7 +512,7 @@ void ScaleAddRows_NEON(const uint8* src_ptr, ...@@ -554,7 +512,7 @@ void ScaleAddRows_NEON(const uint8* src_ptr,
int src_width, int src_width,
int src_height) { int src_height) {
const uint8* src_tmp; const uint8* src_tmp;
asm volatile ( asm volatile(
"1: \n" "1: \n"
"mov %0, %1 \n" "mov %0, %1 \n"
"mov r12, %5 \n" "mov r12, %5 \n"
...@@ -562,13 +520,11 @@ void ScaleAddRows_NEON(const uint8* src_ptr, ...@@ -562,13 +520,11 @@ void ScaleAddRows_NEON(const uint8* src_ptr,
"veor q3, q3, q3 \n" "veor q3, q3, q3 \n"
"2: \n" "2: \n"
// load 16 pixels into q0 // load 16 pixels into q0
MEMACCESS(0)
"vld1.8 {q0}, [%0], %3 \n" "vld1.8 {q0}, [%0], %3 \n"
"vaddw.u8 q3, q3, d1 \n" "vaddw.u8 q3, q3, d1 \n"
"vaddw.u8 q2, q2, d0 \n" "vaddw.u8 q2, q2, d0 \n"
"subs r12, r12, #1 \n" "subs r12, r12, #1 \n"
"bgt 2b \n" "bgt 2b \n"
MEMACCESS(2)
"vst1.16 {q2, q3}, [%2]! \n" // store pixels "vst1.16 {q2, q3}, [%2]! \n" // store pixels
"add %1, %1, #16 \n" "add %1, %1, #16 \n"
"subs %4, %4, #16 \n" // 16 processed per loop "subs %4, %4, #16 \n" // 16 processed per loop
...@@ -591,7 +547,6 @@ void ScaleAddRows_NEON(const uint8* src_ptr, ...@@ -591,7 +547,6 @@ void ScaleAddRows_NEON(const uint8* src_ptr,
"lsr %5, %3, #16 \n" \ "lsr %5, %3, #16 \n" \
"add %6, %1, %5 \n" \ "add %6, %1, %5 \n" \
"add %3, %3, %4 \n" \ "add %3, %3, %4 \n" \
MEMACCESS(6) \
"vld2.8 {d6["#n"], d7["#n"]}, [%6] \n" "vld2.8 {d6["#n"], d7["#n"]}, [%6] \n"
// clang-format on // clang-format on
...@@ -643,7 +598,6 @@ void ScaleFilterCols_NEON(uint8* dst_ptr, ...@@ -643,7 +598,6 @@ void ScaleFilterCols_NEON(uint8* dst_ptr,
"vadd.s16 q8, q8, q9 \n" "vadd.s16 q8, q8, q9 \n"
"vmovn.s16 d6, q8 \n" "vmovn.s16 d6, q8 \n"
MEMACCESS(0)
"vst1.8 {d6}, [%0]! \n" // store pixels "vst1.8 {d6}, [%0]! \n" // store pixels
"vadd.s32 q1, q1, q0 \n" "vadd.s32 q1, q1, q0 \n"
"vadd.s32 q2, q2, q0 \n" "vadd.s32 q2, q2, q0 \n"
...@@ -670,7 +624,7 @@ void ScaleFilterRows_NEON(uint8* dst_ptr, ...@@ -670,7 +624,7 @@ void ScaleFilterRows_NEON(uint8* dst_ptr,
ptrdiff_t src_stride, ptrdiff_t src_stride,
int dst_width, int dst_width,
int source_y_fraction) { int source_y_fraction) {
asm volatile ( asm volatile(
"cmp %4, #0 \n" "cmp %4, #0 \n"
"beq 100f \n" "beq 100f \n"
"add %2, %1 \n" "add %2, %1 \n"
...@@ -686,9 +640,7 @@ void ScaleFilterRows_NEON(uint8* dst_ptr, ...@@ -686,9 +640,7 @@ void ScaleFilterRows_NEON(uint8* dst_ptr,
"vdup.8 d4, %4 \n" "vdup.8 d4, %4 \n"
// General purpose row blend. // General purpose row blend.
"1: \n" "1: \n"
MEMACCESS(1)
"vld1.8 {q0}, [%1]! \n" "vld1.8 {q0}, [%1]! \n"
MEMACCESS(2)
"vld1.8 {q1}, [%2]! \n" "vld1.8 {q1}, [%2]! \n"
"subs %3, %3, #16 \n" "subs %3, %3, #16 \n"
"vmull.u8 q13, d0, d4 \n" "vmull.u8 q13, d0, d4 \n"
...@@ -697,63 +649,50 @@ void ScaleFilterRows_NEON(uint8* dst_ptr, ...@@ -697,63 +649,50 @@ void ScaleFilterRows_NEON(uint8* dst_ptr,
"vmlal.u8 q14, d3, d5 \n" "vmlal.u8 q14, d3, d5 \n"
"vrshrn.u16 d0, q13, #8 \n" "vrshrn.u16 d0, q13, #8 \n"
"vrshrn.u16 d1, q14, #8 \n" "vrshrn.u16 d1, q14, #8 \n"
MEMACCESS(0)
"vst1.8 {q0}, [%0]! \n" "vst1.8 {q0}, [%0]! \n"
"bgt 1b \n" "bgt 1b \n"
"b 99f \n" "b 99f \n"
// Blend 25 / 75. // Blend 25 / 75.
"25: \n" "25: \n"
MEMACCESS(1)
"vld1.8 {q0}, [%1]! \n" "vld1.8 {q0}, [%1]! \n"
MEMACCESS(2)
"vld1.8 {q1}, [%2]! \n" "vld1.8 {q1}, [%2]! \n"
"subs %3, %3, #16 \n" "subs %3, %3, #16 \n"
"vrhadd.u8 q0, q1 \n" "vrhadd.u8 q0, q1 \n"
"vrhadd.u8 q0, q1 \n" "vrhadd.u8 q0, q1 \n"
MEMACCESS(0)
"vst1.8 {q0}, [%0]! \n" "vst1.8 {q0}, [%0]! \n"
"bgt 25b \n" "bgt 25b \n"
"b 99f \n" "b 99f \n"
// Blend 50 / 50. // Blend 50 / 50.
"50: \n" "50: \n"
MEMACCESS(1)
"vld1.8 {q0}, [%1]! \n" "vld1.8 {q0}, [%1]! \n"
MEMACCESS(2)
"vld1.8 {q1}, [%2]! \n" "vld1.8 {q1}, [%2]! \n"
"subs %3, %3, #16 \n" "subs %3, %3, #16 \n"
"vrhadd.u8 q0, q1 \n" "vrhadd.u8 q0, q1 \n"
MEMACCESS(0)
"vst1.8 {q0}, [%0]! \n" "vst1.8 {q0}, [%0]! \n"
"bgt 50b \n" "bgt 50b \n"
"b 99f \n" "b 99f \n"
// Blend 75 / 25. // Blend 75 / 25.
"75: \n" "75: \n"
MEMACCESS(1)
"vld1.8 {q1}, [%1]! \n" "vld1.8 {q1}, [%1]! \n"
MEMACCESS(2)
"vld1.8 {q0}, [%2]! \n" "vld1.8 {q0}, [%2]! \n"
"subs %3, %3, #16 \n" "subs %3, %3, #16 \n"
"vrhadd.u8 q0, q1 \n" "vrhadd.u8 q0, q1 \n"
"vrhadd.u8 q0, q1 \n" "vrhadd.u8 q0, q1 \n"
MEMACCESS(0)
"vst1.8 {q0}, [%0]! \n" "vst1.8 {q0}, [%0]! \n"
"bgt 75b \n" "bgt 75b \n"
"b 99f \n" "b 99f \n"
// Blend 100 / 0 - Copy row unchanged. // Blend 100 / 0 - Copy row unchanged.
"100: \n" "100: \n"
MEMACCESS(1)
"vld1.8 {q0}, [%1]! \n" "vld1.8 {q0}, [%1]! \n"
"subs %3, %3, #16 \n" "subs %3, %3, #16 \n"
MEMACCESS(0)
"vst1.8 {q0}, [%0]! \n" "vst1.8 {q0}, [%0]! \n"
"bgt 100b \n" "bgt 100b \n"
"99: \n" "99: \n"
MEMACCESS(0)
"vst1.8 {d1[7]}, [%0] \n" "vst1.8 {d1[7]}, [%0] \n"
: "+r"(dst_ptr), // %0 : "+r"(dst_ptr), // %0
"+r"(src_ptr), // %1 "+r"(src_ptr), // %1
...@@ -761,8 +700,7 @@ void ScaleFilterRows_NEON(uint8* dst_ptr, ...@@ -761,8 +700,7 @@ void ScaleFilterRows_NEON(uint8* dst_ptr,
"+r"(dst_width), // %3 "+r"(dst_width), // %3
"+r"(source_y_fraction) // %4 "+r"(source_y_fraction) // %4
: :
: "q0", "q1", "d4", "d5", "q13", "q14", "memory", "cc" : "q0", "q1", "d4", "d5", "q13", "q14", "memory", "cc");
);
} }
void ScaleARGBRowDown2_NEON(const uint8* src_ptr, void ScaleARGBRowDown2_NEON(const uint8* src_ptr,
...@@ -770,17 +708,13 @@ void ScaleARGBRowDown2_NEON(const uint8* src_ptr, ...@@ -770,17 +708,13 @@ void ScaleARGBRowDown2_NEON(const uint8* src_ptr,
uint8* dst, uint8* dst,
int dst_width) { int dst_width) {
(void)src_stride; (void)src_stride;
asm volatile ( asm volatile(
"1: \n" "1: \n"
// load even pixels into q0, odd into q1 // load even pixels into q0, odd into q1
MEMACCESS(0)
"vld2.32 {q0, q1}, [%0]! \n" "vld2.32 {q0, q1}, [%0]! \n"
MEMACCESS(0)
"vld2.32 {q2, q3}, [%0]! \n" "vld2.32 {q2, q3}, [%0]! \n"
"subs %2, %2, #8 \n" // 8 processed per loop "subs %2, %2, #8 \n" // 8 processed per loop
MEMACCESS(1)
"vst1.8 {q1}, [%1]! \n" // store odd pixels "vst1.8 {q1}, [%1]! \n" // store odd pixels
MEMACCESS(1)
"vst1.8 {q3}, [%1]! \n" "vst1.8 {q3}, [%1]! \n"
"bgt 1b \n" "bgt 1b \n"
: "+r"(src_ptr), // %0 : "+r"(src_ptr), // %0
...@@ -796,22 +730,21 @@ void ScaleARGBRowDown2Linear_NEON(const uint8* src_argb, ...@@ -796,22 +730,21 @@ void ScaleARGBRowDown2Linear_NEON(const uint8* src_argb,
uint8* dst_argb, uint8* dst_argb,
int dst_width) { int dst_width) {
(void)src_stride; (void)src_stride;
asm volatile ( asm volatile(
"1: \n" "1: \n"
MEMACCESS(0)
"vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
MEMACCESS(0) "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB
"vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels. // pixels.
"subs %2, %2, #8 \n" // 8 processed per loop "subs %2, %2, #8 \n" // 8 processed per loop
"vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts. "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts.
"vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts. "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
"vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts. "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts.
"vpaddl.u8 q3, q3 \n" // A 16 bytes -> 8 shorts. "vpaddl.u8 q3, q3 \n" // A 16 bytes -> 8 shorts.
"vrshrn.u16 d0, q0, #1 \n" // downshift, round and pack "vrshrn.u16 d0, q0, #1 \n" // downshift, round and
// pack
"vrshrn.u16 d1, q1, #1 \n" "vrshrn.u16 d1, q1, #1 \n"
"vrshrn.u16 d2, q2, #1 \n" "vrshrn.u16 d2, q2, #1 \n"
"vrshrn.u16 d3, q3, #1 \n" "vrshrn.u16 d3, q3, #1 \n"
MEMACCESS(1)
"vst4.8 {d0, d1, d2, d3}, [%1]! \n" "vst4.8 {d0, d1, d2, d3}, [%1]! \n"
"bgt 1b \n" "bgt 1b \n"
: "+r"(src_argb), // %0 : "+r"(src_argb), // %0
...@@ -826,32 +759,31 @@ void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr, ...@@ -826,32 +759,31 @@ void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr,
ptrdiff_t src_stride, ptrdiff_t src_stride,
uint8* dst, uint8* dst,
int dst_width) { int dst_width) {
asm volatile ( asm volatile(
// change the stride to row 2 pointer // change the stride to row 2 pointer
"add %1, %1, %0 \n" "add %1, %1, %0 \n"
"1: \n" "1: \n"
MEMACCESS(0)
"vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
MEMACCESS(0) "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB
"vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels. // pixels.
"subs %3, %3, #8 \n" // 8 processed per loop. "subs %3, %3, #8 \n" // 8 processed per loop.
"vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts. "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts.
"vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts. "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
"vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts. "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts.
"vpaddl.u8 q3, q3 \n" // A 16 bytes -> 8 shorts. "vpaddl.u8 q3, q3 \n" // A 16 bytes -> 8 shorts.
MEMACCESS(1) "vld4.8 {d16, d18, d20, d22}, [%1]! \n" // load 8 more ARGB
"vld4.8 {d16, d18, d20, d22}, [%1]! \n" // load 8 more ARGB pixels. // pixels.
MEMACCESS(1) "vld4.8 {d17, d19, d21, d23}, [%1]! \n" // load last 8 ARGB
"vld4.8 {d17, d19, d21, d23}, [%1]! \n" // load last 8 ARGB pixels. // pixels.
"vpadal.u8 q0, q8 \n" // B 16 bytes -> 8 shorts. "vpadal.u8 q0, q8 \n" // B 16 bytes -> 8 shorts.
"vpadal.u8 q1, q9 \n" // G 16 bytes -> 8 shorts. "vpadal.u8 q1, q9 \n" // G 16 bytes -> 8 shorts.
"vpadal.u8 q2, q10 \n" // R 16 bytes -> 8 shorts. "vpadal.u8 q2, q10 \n" // R 16 bytes -> 8 shorts.
"vpadal.u8 q3, q11 \n" // A 16 bytes -> 8 shorts. "vpadal.u8 q3, q11 \n" // A 16 bytes -> 8 shorts.
"vrshrn.u16 d0, q0, #2 \n" // downshift, round and pack "vrshrn.u16 d0, q0, #2 \n" // downshift, round and
// pack
"vrshrn.u16 d1, q1, #2 \n" "vrshrn.u16 d1, q1, #2 \n"
"vrshrn.u16 d2, q2, #2 \n" "vrshrn.u16 d2, q2, #2 \n"
"vrshrn.u16 d3, q3, #2 \n" "vrshrn.u16 d3, q3, #2 \n"
MEMACCESS(2)
"vst4.8 {d0, d1, d2, d3}, [%2]! \n" "vst4.8 {d0, d1, d2, d3}, [%2]! \n"
"bgt 1b \n" "bgt 1b \n"
: "+r"(src_ptr), // %0 : "+r"(src_ptr), // %0
...@@ -859,8 +791,7 @@ void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr, ...@@ -859,8 +791,7 @@ void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr,
"+r"(dst), // %2 "+r"(dst), // %2
"+r"(dst_width) // %3 "+r"(dst_width) // %3
: :
: "memory", "cc", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11" : "memory", "cc", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11");
);
} }
// Reads 4 pixels at a time. // Reads 4 pixels at a time.
...@@ -871,27 +802,21 @@ void ScaleARGBRowDownEven_NEON(const uint8* src_argb, ...@@ -871,27 +802,21 @@ void ScaleARGBRowDownEven_NEON(const uint8* src_argb,
uint8* dst_argb, uint8* dst_argb,
int dst_width) { int dst_width) {
(void)src_stride; (void)src_stride;
asm volatile ( asm volatile(
"mov r12, %3, lsl #2 \n" "mov r12, %3, lsl #2 \n"
"1: \n" "1: \n"
MEMACCESS(0)
"vld1.32 {d0[0]}, [%0], r12 \n" "vld1.32 {d0[0]}, [%0], r12 \n"
MEMACCESS(0)
"vld1.32 {d0[1]}, [%0], r12 \n" "vld1.32 {d0[1]}, [%0], r12 \n"
MEMACCESS(0)
"vld1.32 {d1[0]}, [%0], r12 \n" "vld1.32 {d1[0]}, [%0], r12 \n"
MEMACCESS(0)
"vld1.32 {d1[1]}, [%0], r12 \n" "vld1.32 {d1[1]}, [%0], r12 \n"
"subs %2, %2, #4 \n" // 4 pixels per loop. "subs %2, %2, #4 \n" // 4 pixels per loop.
MEMACCESS(1)
"vst1.8 {q0}, [%1]! \n" "vst1.8 {q0}, [%1]! \n"
"bgt 1b \n" "bgt 1b \n"
: "+r"(src_argb), // %0 : "+r"(src_argb), // %0
"+r"(dst_argb), // %1 "+r"(dst_argb), // %1
"+r"(dst_width) // %2 "+r"(dst_width) // %2
: "r"(src_stepx) // %3 : "r"(src_stepx) // %3
: "memory", "cc", "r12", "q0" : "memory", "cc", "r12", "q0");
);
} }
// Reads 4 pixels at a time. // Reads 4 pixels at a time.
...@@ -901,25 +826,18 @@ void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb, ...@@ -901,25 +826,18 @@ void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb,
int src_stepx, int src_stepx,
uint8* dst_argb, uint8* dst_argb,
int dst_width) { int dst_width) {
asm volatile ( asm volatile(
"mov r12, %4, lsl #2 \n" "mov r12, %4, lsl #2 \n"
"add %1, %1, %0 \n" "add %1, %1, %0 \n"
"1: \n" "1: \n"
MEMACCESS(0) "vld1.8 {d0}, [%0], r12 \n" // Read 4 2x2 blocks ->
"vld1.8 {d0}, [%0], r12 \n" // Read 4 2x2 blocks -> 2x1 // 2x1
MEMACCESS(1)
"vld1.8 {d1}, [%1], r12 \n" "vld1.8 {d1}, [%1], r12 \n"
MEMACCESS(0)
"vld1.8 {d2}, [%0], r12 \n" "vld1.8 {d2}, [%0], r12 \n"
MEMACCESS(1)
"vld1.8 {d3}, [%1], r12 \n" "vld1.8 {d3}, [%1], r12 \n"
MEMACCESS(0)
"vld1.8 {d4}, [%0], r12 \n" "vld1.8 {d4}, [%0], r12 \n"
MEMACCESS(1)
"vld1.8 {d5}, [%1], r12 \n" "vld1.8 {d5}, [%1], r12 \n"
MEMACCESS(0)
"vld1.8 {d6}, [%0], r12 \n" "vld1.8 {d6}, [%0], r12 \n"
MEMACCESS(1)
"vld1.8 {d7}, [%1], r12 \n" "vld1.8 {d7}, [%1], r12 \n"
"vaddl.u8 q0, d0, d1 \n" "vaddl.u8 q0, d0, d1 \n"
"vaddl.u8 q1, d2, d3 \n" "vaddl.u8 q1, d2, d3 \n"
...@@ -932,7 +850,6 @@ void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb, ...@@ -932,7 +850,6 @@ void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb,
"vrshrn.u16 d0, q0, #2 \n" // first 2 pixels. "vrshrn.u16 d0, q0, #2 \n" // first 2 pixels.
"vrshrn.u16 d1, q2, #2 \n" // next 2 pixels. "vrshrn.u16 d1, q2, #2 \n" // next 2 pixels.
"subs %3, %3, #4 \n" // 4 pixels per loop. "subs %3, %3, #4 \n" // 4 pixels per loop.
MEMACCESS(2)
"vst1.8 {q0}, [%2]! \n" "vst1.8 {q0}, [%2]! \n"
"bgt 1b \n" "bgt 1b \n"
: "+r"(src_argb), // %0 : "+r"(src_argb), // %0
...@@ -940,8 +857,7 @@ void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb, ...@@ -940,8 +857,7 @@ void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb,
"+r"(dst_argb), // %2 "+r"(dst_argb), // %2
"+r"(dst_width) // %3 "+r"(dst_width) // %3
: "r"(src_stepx) // %4 : "r"(src_stepx) // %4
: "memory", "cc", "r12", "q0", "q1", "q2", "q3" : "memory", "cc", "r12", "q0", "q1", "q2", "q3");
);
} }
// clang-format off // clang-format off
...@@ -951,7 +867,6 @@ void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb, ...@@ -951,7 +867,6 @@ void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb,
"lsr %5, %3, #16 \n" \ "lsr %5, %3, #16 \n" \
"add %6, %1, %5, lsl #2 \n" \ "add %6, %1, %5, lsl #2 \n" \
"add %3, %3, %4 \n" \ "add %3, %3, %4 \n" \
MEMACCESS(6) \
"vld1.32 {" #dn "[" #n "]}, [%6] \n" "vld1.32 {" #dn "[" #n "]}, [%6] \n"
// clang-format on // clang-format on
...@@ -962,20 +877,15 @@ void ScaleARGBCols_NEON(uint8* dst_argb, ...@@ -962,20 +877,15 @@ void ScaleARGBCols_NEON(uint8* dst_argb,
int dx) { int dx) {
int tmp; int tmp;
const uint8* src_tmp = src_argb; const uint8* src_tmp = src_argb;
asm volatile ( asm volatile(
"1: \n" "1: \n" LOAD1_DATA32_LANE(
LOAD1_DATA32_LANE(d0, 0) d0, 0) LOAD1_DATA32_LANE(d0, 1) LOAD1_DATA32_LANE(d1, 0)
LOAD1_DATA32_LANE(d0, 1) LOAD1_DATA32_LANE(d1, 1) LOAD1_DATA32_LANE(d2, 0) LOAD1_DATA32_LANE(
LOAD1_DATA32_LANE(d1, 0) d2, 1) LOAD1_DATA32_LANE(d3, 0) LOAD1_DATA32_LANE(d3, 1)
LOAD1_DATA32_LANE(d1, 1)
LOAD1_DATA32_LANE(d2, 0)
LOAD1_DATA32_LANE(d2, 1)
LOAD1_DATA32_LANE(d3, 0)
LOAD1_DATA32_LANE(d3, 1)
MEMACCESS(0)
"vst1.32 {q0, q1}, [%0]! \n" // store pixels "vst1.32 {q0, q1}, [%0]! \n" // store pixels
"subs %2, %2, #8 \n" // 8 processed per loop "subs %2, %2, #8 \n" // 8 processed per
// loop
"bgt 1b \n" "bgt 1b \n"
: "+r"(dst_argb), // %0 : "+r"(dst_argb), // %0
"+r"(src_argb), // %1 "+r"(src_argb), // %1
...@@ -985,8 +895,7 @@ void ScaleARGBCols_NEON(uint8* dst_argb, ...@@ -985,8 +895,7 @@ void ScaleARGBCols_NEON(uint8* dst_argb,
"=&r"(tmp), // %5 "=&r"(tmp), // %5
"+r"(src_tmp) // %6 "+r"(src_tmp) // %6
: :
: "memory", "cc", "q0", "q1" : "memory", "cc", "q0", "q1");
);
} }
#undef LOAD1_DATA32_LANE #undef LOAD1_DATA32_LANE
...@@ -998,7 +907,6 @@ void ScaleARGBCols_NEON(uint8* dst_argb, ...@@ -998,7 +907,6 @@ void ScaleARGBCols_NEON(uint8* dst_argb,
"lsr %5, %3, #16 \n" \ "lsr %5, %3, #16 \n" \
"add %6, %1, %5, lsl #2 \n" \ "add %6, %1, %5, lsl #2 \n" \
"add %3, %3, %4 \n" \ "add %3, %3, %4 \n" \
MEMACCESS(6) \
"vld2.32 {" #dn1 "[" #n "], " #dn2 "[" #n "]}, [%6] \n" "vld2.32 {" #dn1 "[" #n "], " #dn2 "[" #n "]}, [%6] \n"
// clang-format on // clang-format on
...@@ -1045,7 +953,6 @@ void ScaleARGBFilterCols_NEON(uint8* dst_argb, ...@@ -1045,7 +953,6 @@ void ScaleARGBFilterCols_NEON(uint8* dst_argb,
"vshrn.i16 d0, q11, #7 \n" "vshrn.i16 d0, q11, #7 \n"
"vshrn.i16 d1, q12, #7 \n" "vshrn.i16 d1, q12, #7 \n"
MEMACCESS(0)
"vst1.32 {d0, d1}, [%0]! \n" // store pixels "vst1.32 {d0, d1}, [%0]! \n" // store pixels
"vadd.s32 q8, q8, q9 \n" "vadd.s32 q8, q8, q9 \n"
"subs %2, %2, #4 \n" // 4 processed per loop "subs %2, %2, #4 \n" // 4 processed per loop
......
...@@ -29,10 +29,8 @@ void ScaleRowDown2_NEON(const uint8* src_ptr, ...@@ -29,10 +29,8 @@ void ScaleRowDown2_NEON(const uint8* src_ptr,
asm volatile ( asm volatile (
"1: \n" "1: \n"
// load even pixels into v0, odd into v1 // load even pixels into v0, odd into v1
MEMACCESS(0)
"ld2 {v0.16b,v1.16b}, [%0], #32 \n" "ld2 {v0.16b,v1.16b}, [%0], #32 \n"
"subs %w2, %w2, #16 \n" // 16 processed per loop "subs %w2, %w2, #16 \n" // 16 processed per loop
MEMACCESS(1)
"st1 {v1.16b}, [%1], #16 \n" // store odd pixels "st1 {v1.16b}, [%1], #16 \n" // store odd pixels
"b.gt 1b \n" "b.gt 1b \n"
: "+r"(src_ptr), // %0 : "+r"(src_ptr), // %0
...@@ -51,14 +49,12 @@ void ScaleRowDown2Linear_NEON(const uint8* src_ptr, ...@@ -51,14 +49,12 @@ void ScaleRowDown2Linear_NEON(const uint8* src_ptr,
(void)src_stride; (void)src_stride;
asm volatile ( asm volatile (
"1: \n" "1: \n"
MEMACCESS(0)
"ld1 {v0.16b,v1.16b}, [%0], #32 \n" // load pixels and post inc "ld1 {v0.16b,v1.16b}, [%0], #32 \n" // load pixels and post inc
"subs %w2, %w2, #16 \n" // 16 processed per loop "subs %w2, %w2, #16 \n" // 16 processed per loop
"uaddlp v0.8h, v0.16b \n" // add adjacent "uaddlp v0.8h, v0.16b \n" // add adjacent
"uaddlp v1.8h, v1.16b \n" "uaddlp v1.8h, v1.16b \n"
"rshrn v0.8b, v0.8h, #1 \n" // downshift, round and pack "rshrn v0.8b, v0.8h, #1 \n" // downshift, round and pack
"rshrn2 v0.16b, v1.8h, #1 \n" "rshrn2 v0.16b, v1.8h, #1 \n"
MEMACCESS(1)
"st1 {v0.16b}, [%1], #16 \n" "st1 {v0.16b}, [%1], #16 \n"
"b.gt 1b \n" "b.gt 1b \n"
: "+r"(src_ptr), // %0 : "+r"(src_ptr), // %0
...@@ -78,9 +74,7 @@ void ScaleRowDown2Box_NEON(const uint8* src_ptr, ...@@ -78,9 +74,7 @@ void ScaleRowDown2Box_NEON(const uint8* src_ptr,
// change the stride to row 2 pointer // change the stride to row 2 pointer
"add %1, %1, %0 \n" "add %1, %1, %0 \n"
"1: \n" "1: \n"
MEMACCESS(0) "ld1 {v0.16b, v1.16b}, [%0], #32 \n" // load row 1 and post inc
"ld1 {v0.16b,v1.16b}, [%0], #32 \n" // load row 1 and post inc
MEMACCESS(1)
"ld1 {v2.16b, v3.16b}, [%1], #32 \n" // load row 2 and post inc "ld1 {v2.16b, v3.16b}, [%1], #32 \n" // load row 2 and post inc
"subs %w3, %w3, #16 \n" // 16 processed per loop "subs %w3, %w3, #16 \n" // 16 processed per loop
"uaddlp v0.8h, v0.16b \n" // row 1 add adjacent "uaddlp v0.8h, v0.16b \n" // row 1 add adjacent
...@@ -89,7 +83,6 @@ void ScaleRowDown2Box_NEON(const uint8* src_ptr, ...@@ -89,7 +83,6 @@ void ScaleRowDown2Box_NEON(const uint8* src_ptr,
"uadalp v1.8h, v3.16b \n" "uadalp v1.8h, v3.16b \n"
"rshrn v0.8b, v0.8h, #2 \n" // downshift, round and pack "rshrn v0.8b, v0.8h, #2 \n" // downshift, round and pack
"rshrn2 v0.16b, v1.8h, #2 \n" "rshrn2 v0.16b, v1.8h, #2 \n"
MEMACCESS(2)
"st1 {v0.16b}, [%2], #16 \n" "st1 {v0.16b}, [%2], #16 \n"
"b.gt 1b \n" "b.gt 1b \n"
: "+r"(src_ptr), // %0 : "+r"(src_ptr), // %0
...@@ -108,10 +101,8 @@ void ScaleRowDown4_NEON(const uint8* src_ptr, ...@@ -108,10 +101,8 @@ void ScaleRowDown4_NEON(const uint8* src_ptr,
(void)src_stride; (void)src_stride;
asm volatile ( asm volatile (
"1: \n" "1: \n"
MEMACCESS(0)
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0
"subs %w2, %w2, #8 \n" // 8 processed per loop "subs %w2, %w2, #8 \n" // 8 processed per loop
MEMACCESS(1)
"st1 {v2.8b}, [%1], #8 \n" "st1 {v2.8b}, [%1], #8 \n"
"b.gt 1b \n" "b.gt 1b \n"
: "+r"(src_ptr), // %0 : "+r"(src_ptr), // %0
...@@ -131,13 +122,9 @@ void ScaleRowDown4Box_NEON(const uint8* src_ptr, ...@@ -131,13 +122,9 @@ void ScaleRowDown4Box_NEON(const uint8* src_ptr,
const uint8* src_ptr3 = src_ptr + src_stride * 3; const uint8* src_ptr3 = src_ptr + src_stride * 3;
asm volatile ( asm volatile (
"1: \n" "1: \n"
MEMACCESS(0)
"ld1 {v0.16b}, [%0], #16 \n" // load up 16x4 "ld1 {v0.16b}, [%0], #16 \n" // load up 16x4
MEMACCESS(3)
"ld1 {v1.16b}, [%2], #16 \n" "ld1 {v1.16b}, [%2], #16 \n"
MEMACCESS(4)
"ld1 {v2.16b}, [%3], #16 \n" "ld1 {v2.16b}, [%3], #16 \n"
MEMACCESS(5)
"ld1 {v3.16b}, [%4], #16 \n" "ld1 {v3.16b}, [%4], #16 \n"
"subs %w5, %w5, #4 \n" "subs %w5, %w5, #4 \n"
"uaddlp v0.8h, v0.16b \n" "uaddlp v0.8h, v0.16b \n"
...@@ -146,7 +133,6 @@ void ScaleRowDown4Box_NEON(const uint8* src_ptr, ...@@ -146,7 +133,6 @@ void ScaleRowDown4Box_NEON(const uint8* src_ptr,
"uadalp v0.8h, v3.16b \n" "uadalp v0.8h, v3.16b \n"
"addp v0.8h, v0.8h, v0.8h \n" "addp v0.8h, v0.8h, v0.8h \n"
"rshrn v0.8b, v0.8h, #4 \n" // divide by 16 w/rounding "rshrn v0.8b, v0.8h, #4 \n" // divide by 16 w/rounding
MEMACCESS(1)
"st1 {v0.s}[0], [%1], #4 \n" "st1 {v0.s}[0], [%1], #4 \n"
"b.gt 1b \n" "b.gt 1b \n"
: "+r"(src_ptr), // %0 : "+r"(src_ptr), // %0
...@@ -170,11 +156,9 @@ void ScaleRowDown34_NEON(const uint8* src_ptr, ...@@ -170,11 +156,9 @@ void ScaleRowDown34_NEON(const uint8* src_ptr,
(void)src_stride; (void)src_stride;
asm volatile ( asm volatile (
"1: \n" "1: \n"
MEMACCESS(0)
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0
"subs %w2, %w2, #24 \n" "subs %w2, %w2, #24 \n"
"orr v2.16b, v3.16b, v3.16b \n" // order v0, v1, v2 "orr v2.16b, v3.16b, v3.16b \n" // order v0, v1, v2
MEMACCESS(1)
"st3 {v0.8b,v1.8b,v2.8b}, [%1], #24 \n" "st3 {v0.8b,v1.8b,v2.8b}, [%1], #24 \n"
"b.gt 1b \n" "b.gt 1b \n"
: "+r"(src_ptr), // %0 : "+r"(src_ptr), // %0
...@@ -193,9 +177,7 @@ void ScaleRowDown34_0_Box_NEON(const uint8* src_ptr, ...@@ -193,9 +177,7 @@ void ScaleRowDown34_0_Box_NEON(const uint8* src_ptr,
"movi v20.8b, #3 \n" "movi v20.8b, #3 \n"
"add %3, %3, %0 \n" "add %3, %3, %0 \n"
"1: \n" "1: \n"
MEMACCESS(0)
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0
MEMACCESS(3)
"ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%3], #32 \n" // src line 1 "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%3], #32 \n" // src line 1
"subs %w2, %w2, #24 \n" "subs %w2, %w2, #24 \n"
...@@ -232,7 +214,6 @@ void ScaleRowDown34_0_Box_NEON(const uint8* src_ptr, ...@@ -232,7 +214,6 @@ void ScaleRowDown34_0_Box_NEON(const uint8* src_ptr,
"umlal v16.8h, v3.8b, v20.8b \n" "umlal v16.8h, v3.8b, v20.8b \n"
"uqrshrn v2.8b, v16.8h, #2 \n" "uqrshrn v2.8b, v16.8h, #2 \n"
MEMACCESS(1)
"st3 {v0.8b,v1.8b,v2.8b}, [%1], #24 \n" "st3 {v0.8b,v1.8b,v2.8b}, [%1], #24 \n"
"b.gt 1b \n" "b.gt 1b \n"
...@@ -254,9 +235,7 @@ void ScaleRowDown34_1_Box_NEON(const uint8* src_ptr, ...@@ -254,9 +235,7 @@ void ScaleRowDown34_1_Box_NEON(const uint8* src_ptr,
"movi v20.8b, #3 \n" "movi v20.8b, #3 \n"
"add %3, %3, %0 \n" "add %3, %3, %0 \n"
"1: \n" "1: \n"
MEMACCESS(0)
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0
MEMACCESS(3)
"ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%3], #32 \n" // src line 1 "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%3], #32 \n" // src line 1
"subs %w2, %w2, #24 \n" "subs %w2, %w2, #24 \n"
// average src line 0 with src line 1 // average src line 0 with src line 1
...@@ -278,7 +257,6 @@ void ScaleRowDown34_1_Box_NEON(const uint8* src_ptr, ...@@ -278,7 +257,6 @@ void ScaleRowDown34_1_Box_NEON(const uint8* src_ptr,
"umlal v4.8h, v3.8b, v20.8b \n" "umlal v4.8h, v3.8b, v20.8b \n"
"uqrshrn v2.8b, v4.8h, #2 \n" "uqrshrn v2.8b, v4.8h, #2 \n"
MEMACCESS(1)
"st3 {v0.8b,v1.8b,v2.8b}, [%1], #24 \n" "st3 {v0.8b,v1.8b,v2.8b}, [%1], #24 \n"
"b.gt 1b \n" "b.gt 1b \n"
: "+r"(src_ptr), // %0 : "+r"(src_ptr), // %0
...@@ -305,16 +283,12 @@ void ScaleRowDown38_NEON(const uint8* src_ptr, ...@@ -305,16 +283,12 @@ void ScaleRowDown38_NEON(const uint8* src_ptr,
int dst_width) { int dst_width) {
(void)src_stride; (void)src_stride;
asm volatile ( asm volatile (
MEMACCESS(3)
"ld1 {v3.16b}, [%3] \n" "ld1 {v3.16b}, [%3] \n"
"1: \n" "1: \n"
MEMACCESS(0)
"ld1 {v0.16b,v1.16b}, [%0], #32 \n" "ld1 {v0.16b,v1.16b}, [%0], #32 \n"
"subs %w2, %w2, #12 \n" "subs %w2, %w2, #12 \n"
"tbl v2.16b, {v0.16b,v1.16b}, v3.16b \n" "tbl v2.16b, {v0.16b,v1.16b}, v3.16b \n"
MEMACCESS(1)
"st1 {v2.8b}, [%1], #8 \n" "st1 {v2.8b}, [%1], #8 \n"
MEMACCESS(1)
"st1 {v2.s}[2], [%1], #4 \n" "st1 {v2.s}[2], [%1], #4 \n"
"b.gt 1b \n" "b.gt 1b \n"
: "+r"(src_ptr), // %0 : "+r"(src_ptr), // %0
...@@ -334,11 +308,8 @@ void OMITFP ScaleRowDown38_3_Box_NEON(const uint8* src_ptr, ...@@ -334,11 +308,8 @@ void OMITFP ScaleRowDown38_3_Box_NEON(const uint8* src_ptr,
ptrdiff_t tmp_src_stride = src_stride; ptrdiff_t tmp_src_stride = src_stride;
asm volatile ( asm volatile (
MEMACCESS(5)
"ld1 {v29.8h}, [%5] \n" "ld1 {v29.8h}, [%5] \n"
MEMACCESS(6)
"ld1 {v30.16b}, [%6] \n" "ld1 {v30.16b}, [%6] \n"
MEMACCESS(7)
"ld1 {v31.8h}, [%7] \n" "ld1 {v31.8h}, [%7] \n"
"add %2, %2, %0 \n" "add %2, %2, %0 \n"
"1: \n" "1: \n"
...@@ -347,11 +318,8 @@ void OMITFP ScaleRowDown38_3_Box_NEON(const uint8* src_ptr, ...@@ -347,11 +318,8 @@ void OMITFP ScaleRowDown38_3_Box_NEON(const uint8* src_ptr,
// 10 50 11 51 12 52 13 53 // 10 50 11 51 12 52 13 53
// 20 60 21 61 22 62 23 63 // 20 60 21 61 22 62 23 63
// 30 70 31 71 32 72 33 73 // 30 70 31 71 32 72 33 73
MEMACCESS(0)
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"
MEMACCESS(3)
"ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%2], #32 \n" "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%2], #32 \n"
MEMACCESS(4)
"ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%3], #32 \n" "ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%3], #32 \n"
"subs %w4, %w4, #12 \n" "subs %w4, %w4, #12 \n"
...@@ -436,9 +404,7 @@ void OMITFP ScaleRowDown38_3_Box_NEON(const uint8* src_ptr, ...@@ -436,9 +404,7 @@ void OMITFP ScaleRowDown38_3_Box_NEON(const uint8* src_ptr,
// be adjacent // be adjacent
"tbl v3.16b, {v0.16b, v1.16b, v2.16b}, v30.16b \n" "tbl v3.16b, {v0.16b, v1.16b, v2.16b}, v30.16b \n"
MEMACCESS(1)
"st1 {v3.8b}, [%1], #8 \n" "st1 {v3.8b}, [%1], #8 \n"
MEMACCESS(1)
"st1 {v3.s}[2], [%1], #4 \n" "st1 {v3.s}[2], [%1], #4 \n"
"b.gt 1b \n" "b.gt 1b \n"
: "+r"(src_ptr), // %0 : "+r"(src_ptr), // %0
...@@ -463,9 +429,7 @@ void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr, ...@@ -463,9 +429,7 @@ void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr,
// TODO(fbarchard): use src_stride directly for clang 3.5+. // TODO(fbarchard): use src_stride directly for clang 3.5+.
ptrdiff_t tmp_src_stride = src_stride; ptrdiff_t tmp_src_stride = src_stride;
asm volatile ( asm volatile (
MEMACCESS(4)
"ld1 {v30.8h}, [%4] \n" "ld1 {v30.8h}, [%4] \n"
MEMACCESS(5)
"ld1 {v31.16b}, [%5] \n" "ld1 {v31.16b}, [%5] \n"
"add %2, %2, %0 \n" "add %2, %2, %0 \n"
"1: \n" "1: \n"
...@@ -474,9 +438,7 @@ void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr, ...@@ -474,9 +438,7 @@ void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr,
// 10 50 11 51 12 52 13 53 // 10 50 11 51 12 52 13 53
// 20 60 21 61 22 62 23 63 // 20 60 21 61 22 62 23 63
// 30 70 31 71 32 72 33 73 // 30 70 31 71 32 72 33 73
MEMACCESS(0)
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"
MEMACCESS(3)
"ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%2], #32 \n" "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%2], #32 \n"
"subs %w3, %w3, #12 \n" "subs %w3, %w3, #12 \n"
...@@ -547,9 +509,7 @@ void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr, ...@@ -547,9 +509,7 @@ void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr,
"tbl v3.16b, {v0.16b, v1.16b, v2.16b}, v31.16b \n" "tbl v3.16b, {v0.16b, v1.16b, v2.16b}, v31.16b \n"
MEMACCESS(1)
"st1 {v3.8b}, [%1], #8 \n" "st1 {v3.8b}, [%1], #8 \n"
MEMACCESS(1)
"st1 {v3.s}[2], [%1], #4 \n" "st1 {v3.s}[2], [%1], #4 \n"
"b.gt 1b \n" "b.gt 1b \n"
: "+r"(src_ptr), // %0 : "+r"(src_ptr), // %0
...@@ -577,13 +537,11 @@ void ScaleAddRows_NEON(const uint8* src_ptr, ...@@ -577,13 +537,11 @@ void ScaleAddRows_NEON(const uint8* src_ptr,
"eor v3.16b, v3.16b, v3.16b \n" "eor v3.16b, v3.16b, v3.16b \n"
"2: \n" "2: \n"
// load 16 pixels into q0 // load 16 pixels into q0
MEMACCESS(0)
"ld1 {v0.16b}, [%0], %3 \n" "ld1 {v0.16b}, [%0], %3 \n"
"uaddw2 v3.8h, v3.8h, v0.16b \n" "uaddw2 v3.8h, v3.8h, v0.16b \n"
"uaddw v2.8h, v2.8h, v0.8b \n" "uaddw v2.8h, v2.8h, v0.8b \n"
"subs w12, w12, #1 \n" "subs w12, w12, #1 \n"
"b.gt 2b \n" "b.gt 2b \n"
MEMACCESS(2)
"st1 {v2.8h, v3.8h}, [%2], #32 \n" // store pixels "st1 {v2.8h, v3.8h}, [%2], #32 \n" // store pixels
"add %1, %1, #16 \n" "add %1, %1, #16 \n"
"subs %w4, %w4, #16 \n" // 16 processed per loop "subs %w4, %w4, #16 \n" // 16 processed per loop
...@@ -606,7 +564,6 @@ void ScaleAddRows_NEON(const uint8* src_ptr, ...@@ -606,7 +564,6 @@ void ScaleAddRows_NEON(const uint8* src_ptr,
"lsr %5, %3, #16 \n" \ "lsr %5, %3, #16 \n" \
"add %6, %1, %5 \n" \ "add %6, %1, %5 \n" \
"add %3, %3, %4 \n" \ "add %3, %3, %4 \n" \
MEMACCESS(6) \
"ld2 {v4.b, v5.b}[" #n "], [%6] \n" "ld2 {v4.b, v5.b}[" #n "], [%6] \n"
// clang-format on // clang-format on
...@@ -660,7 +617,6 @@ void ScaleFilterCols_NEON(uint8* dst_ptr, ...@@ -660,7 +617,6 @@ void ScaleFilterCols_NEON(uint8* dst_ptr,
"add v4.8h, v4.8h, v6.8h \n" "add v4.8h, v4.8h, v6.8h \n"
"xtn v4.8b, v4.8h \n" "xtn v4.8b, v4.8h \n"
MEMACCESS(0)
"st1 {v4.8b}, [%0], #8 \n" // store pixels "st1 {v4.8b}, [%0], #8 \n" // store pixels
"add v1.4s, v1.4s, v0.4s \n" "add v1.4s, v1.4s, v0.4s \n"
"add v2.4s, v2.4s, v0.4s \n" "add v2.4s, v2.4s, v0.4s \n"
...@@ -703,9 +659,7 @@ void ScaleFilterRows_NEON(uint8* dst_ptr, ...@@ -703,9 +659,7 @@ void ScaleFilterRows_NEON(uint8* dst_ptr,
"dup v4.8b, %w5 \n" "dup v4.8b, %w5 \n"
// General purpose row blend. // General purpose row blend.
"1: \n" "1: \n"
MEMACCESS(1)
"ld1 {v0.16b}, [%1], #16 \n" "ld1 {v0.16b}, [%1], #16 \n"
MEMACCESS(2)
"ld1 {v1.16b}, [%2], #16 \n" "ld1 {v1.16b}, [%2], #16 \n"
"subs %w3, %w3, #16 \n" "subs %w3, %w3, #16 \n"
"umull v6.8h, v0.8b, v4.8b \n" "umull v6.8h, v0.8b, v4.8b \n"
...@@ -714,63 +668,50 @@ void ScaleFilterRows_NEON(uint8* dst_ptr, ...@@ -714,63 +668,50 @@ void ScaleFilterRows_NEON(uint8* dst_ptr,
"umlal2 v7.8h, v1.16b, v5.16b \n" "umlal2 v7.8h, v1.16b, v5.16b \n"
"rshrn v0.8b, v6.8h, #8 \n" "rshrn v0.8b, v6.8h, #8 \n"
"rshrn2 v0.16b, v7.8h, #8 \n" "rshrn2 v0.16b, v7.8h, #8 \n"
MEMACCESS(0)
"st1 {v0.16b}, [%0], #16 \n" "st1 {v0.16b}, [%0], #16 \n"
"b.gt 1b \n" "b.gt 1b \n"
"b 99f \n" "b 99f \n"
// Blend 25 / 75. // Blend 25 / 75.
"25: \n" "25: \n"
MEMACCESS(1)
"ld1 {v0.16b}, [%1], #16 \n" "ld1 {v0.16b}, [%1], #16 \n"
MEMACCESS(2)
"ld1 {v1.16b}, [%2], #16 \n" "ld1 {v1.16b}, [%2], #16 \n"
"subs %w3, %w3, #16 \n" "subs %w3, %w3, #16 \n"
"urhadd v0.16b, v0.16b, v1.16b \n" "urhadd v0.16b, v0.16b, v1.16b \n"
"urhadd v0.16b, v0.16b, v1.16b \n" "urhadd v0.16b, v0.16b, v1.16b \n"
MEMACCESS(0)
"st1 {v0.16b}, [%0], #16 \n" "st1 {v0.16b}, [%0], #16 \n"
"b.gt 25b \n" "b.gt 25b \n"
"b 99f \n" "b 99f \n"
// Blend 50 / 50. // Blend 50 / 50.
"50: \n" "50: \n"
MEMACCESS(1)
"ld1 {v0.16b}, [%1], #16 \n" "ld1 {v0.16b}, [%1], #16 \n"
MEMACCESS(2)
"ld1 {v1.16b}, [%2], #16 \n" "ld1 {v1.16b}, [%2], #16 \n"
"subs %w3, %w3, #16 \n" "subs %w3, %w3, #16 \n"
"urhadd v0.16b, v0.16b, v1.16b \n" "urhadd v0.16b, v0.16b, v1.16b \n"
MEMACCESS(0)
"st1 {v0.16b}, [%0], #16 \n" "st1 {v0.16b}, [%0], #16 \n"
"b.gt 50b \n" "b.gt 50b \n"
"b 99f \n" "b 99f \n"
// Blend 75 / 25. // Blend 75 / 25.
"75: \n" "75: \n"
MEMACCESS(1)
"ld1 {v1.16b}, [%1], #16 \n" "ld1 {v1.16b}, [%1], #16 \n"
MEMACCESS(2)
"ld1 {v0.16b}, [%2], #16 \n" "ld1 {v0.16b}, [%2], #16 \n"
"subs %w3, %w3, #16 \n" "subs %w3, %w3, #16 \n"
"urhadd v0.16b, v0.16b, v1.16b \n" "urhadd v0.16b, v0.16b, v1.16b \n"
"urhadd v0.16b, v0.16b, v1.16b \n" "urhadd v0.16b, v0.16b, v1.16b \n"
MEMACCESS(0)
"st1 {v0.16b}, [%0], #16 \n" "st1 {v0.16b}, [%0], #16 \n"
"b.gt 75b \n" "b.gt 75b \n"
"b 99f \n" "b 99f \n"
// Blend 100 / 0 - Copy row unchanged. // Blend 100 / 0 - Copy row unchanged.
"100: \n" "100: \n"
MEMACCESS(1)
"ld1 {v0.16b}, [%1], #16 \n" "ld1 {v0.16b}, [%1], #16 \n"
"subs %w3, %w3, #16 \n" "subs %w3, %w3, #16 \n"
MEMACCESS(0)
"st1 {v0.16b}, [%0], #16 \n" "st1 {v0.16b}, [%0], #16 \n"
"b.gt 100b \n" "b.gt 100b \n"
"99: \n" "99: \n"
MEMACCESS(0)
"st1 {v0.b}[15], [%0] \n" "st1 {v0.b}[15], [%0] \n"
: "+r"(dst_ptr), // %0 : "+r"(dst_ptr), // %0
"+r"(src_ptr), // %1 "+r"(src_ptr), // %1
...@@ -791,14 +732,10 @@ void ScaleARGBRowDown2_NEON(const uint8* src_ptr, ...@@ -791,14 +732,10 @@ void ScaleARGBRowDown2_NEON(const uint8* src_ptr,
asm volatile ( asm volatile (
"1: \n" "1: \n"
// load even pixels into q0, odd into q1 // load even pixels into q0, odd into q1
MEMACCESS (0)
"ld2 {v0.4s, v1.4s}, [%0], #32 \n" "ld2 {v0.4s, v1.4s}, [%0], #32 \n"
MEMACCESS (0)
"ld2 {v2.4s, v3.4s}, [%0], #32 \n" "ld2 {v2.4s, v3.4s}, [%0], #32 \n"
"subs %w2, %w2, #8 \n" // 8 processed per loop "subs %w2, %w2, #8 \n" // 8 processed per loop
MEMACCESS (1)
"st1 {v1.16b}, [%1], #16 \n" // store odd pixels "st1 {v1.16b}, [%1], #16 \n" // store odd pixels
MEMACCESS (1)
"st1 {v3.16b}, [%1], #16 \n" "st1 {v3.16b}, [%1], #16 \n"
"b.gt 1b \n" "b.gt 1b \n"
: "+r" (src_ptr), // %0 : "+r" (src_ptr), // %0
...@@ -816,7 +753,6 @@ void ScaleARGBRowDown2Linear_NEON(const uint8* src_argb, ...@@ -816,7 +753,6 @@ void ScaleARGBRowDown2Linear_NEON(const uint8* src_argb,
(void)src_stride; (void)src_stride;
asm volatile ( asm volatile (
"1: \n" "1: \n"
MEMACCESS (0)
// load 8 ARGB pixels. // load 8 ARGB pixels.
"ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n"
"subs %w2, %w2, #8 \n" // 8 processed per loop. "subs %w2, %w2, #8 \n" // 8 processed per loop.
...@@ -828,7 +764,6 @@ void ScaleARGBRowDown2Linear_NEON(const uint8* src_argb, ...@@ -828,7 +764,6 @@ void ScaleARGBRowDown2Linear_NEON(const uint8* src_argb,
"rshrn v1.8b, v1.8h, #1 \n" "rshrn v1.8b, v1.8h, #1 \n"
"rshrn v2.8b, v2.8h, #1 \n" "rshrn v2.8b, v2.8h, #1 \n"
"rshrn v3.8b, v3.8h, #1 \n" "rshrn v3.8b, v3.8h, #1 \n"
MEMACCESS (1)
"st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n"
"b.gt 1b \n" "b.gt 1b \n"
: "+r"(src_argb), // %0 : "+r"(src_argb), // %0
...@@ -847,14 +782,12 @@ void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr, ...@@ -847,14 +782,12 @@ void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr,
// change the stride to row 2 pointer // change the stride to row 2 pointer
"add %1, %1, %0 \n" "add %1, %1, %0 \n"
"1: \n" "1: \n"
MEMACCESS (0)
"ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 8 ARGB pixels. "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 8 ARGB pixels.
"subs %w3, %w3, #8 \n" // 8 processed per loop. "subs %w3, %w3, #8 \n" // 8 processed per loop.
"uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts. "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts.
"uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts. "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
"uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts. "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts.
"uaddlp v3.8h, v3.16b \n" // A 16 bytes -> 8 shorts. "uaddlp v3.8h, v3.16b \n" // A 16 bytes -> 8 shorts.
MEMACCESS (1)
"ld4 {v16.16b,v17.16b,v18.16b,v19.16b}, [%1], #64 \n" // load 8 more ARGB pixels. "ld4 {v16.16b,v17.16b,v18.16b,v19.16b}, [%1], #64 \n" // load 8 more ARGB pixels.
"uadalp v0.8h, v16.16b \n" // B 16 bytes -> 8 shorts. "uadalp v0.8h, v16.16b \n" // B 16 bytes -> 8 shorts.
"uadalp v1.8h, v17.16b \n" // G 16 bytes -> 8 shorts. "uadalp v1.8h, v17.16b \n" // G 16 bytes -> 8 shorts.
...@@ -864,7 +797,6 @@ void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr, ...@@ -864,7 +797,6 @@ void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr,
"rshrn v1.8b, v1.8h, #2 \n" "rshrn v1.8b, v1.8h, #2 \n"
"rshrn v2.8b, v2.8h, #2 \n" "rshrn v2.8b, v2.8h, #2 \n"
"rshrn v3.8b, v3.8h, #2 \n" "rshrn v3.8b, v3.8h, #2 \n"
MEMACCESS (2)
"st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n"
"b.gt 1b \n" "b.gt 1b \n"
: "+r" (src_ptr), // %0 : "+r" (src_ptr), // %0
...@@ -886,16 +818,11 @@ void ScaleARGBRowDownEven_NEON(const uint8* src_argb, ...@@ -886,16 +818,11 @@ void ScaleARGBRowDownEven_NEON(const uint8* src_argb,
(void)src_stride; (void)src_stride;
asm volatile ( asm volatile (
"1: \n" "1: \n"
MEMACCESS(0)
"ld1 {v0.s}[0], [%0], %3 \n" "ld1 {v0.s}[0], [%0], %3 \n"
MEMACCESS(0)
"ld1 {v0.s}[1], [%0], %3 \n" "ld1 {v0.s}[1], [%0], %3 \n"
MEMACCESS(0)
"ld1 {v0.s}[2], [%0], %3 \n" "ld1 {v0.s}[2], [%0], %3 \n"
MEMACCESS(0)
"ld1 {v0.s}[3], [%0], %3 \n" "ld1 {v0.s}[3], [%0], %3 \n"
"subs %w2, %w2, #4 \n" // 4 pixels per loop. "subs %w2, %w2, #4 \n" // 4 pixels per loop.
MEMACCESS(1)
"st1 {v0.16b}, [%1], #16 \n" "st1 {v0.16b}, [%1], #16 \n"
"b.gt 1b \n" "b.gt 1b \n"
: "+r"(src_argb), // %0 : "+r"(src_argb), // %0
...@@ -918,21 +845,13 @@ void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb, ...@@ -918,21 +845,13 @@ void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb,
asm volatile ( asm volatile (
"add %1, %1, %0 \n" "add %1, %1, %0 \n"
"1: \n" "1: \n"
MEMACCESS(0)
"ld1 {v0.8b}, [%0], %4 \n" // Read 4 2x2 blocks -> 2x1 "ld1 {v0.8b}, [%0], %4 \n" // Read 4 2x2 blocks -> 2x1
MEMACCESS(1)
"ld1 {v1.8b}, [%1], %4 \n" "ld1 {v1.8b}, [%1], %4 \n"
MEMACCESS(0)
"ld1 {v2.8b}, [%0], %4 \n" "ld1 {v2.8b}, [%0], %4 \n"
MEMACCESS(1)
"ld1 {v3.8b}, [%1], %4 \n" "ld1 {v3.8b}, [%1], %4 \n"
MEMACCESS(0)
"ld1 {v4.8b}, [%0], %4 \n" "ld1 {v4.8b}, [%0], %4 \n"
MEMACCESS(1)
"ld1 {v5.8b}, [%1], %4 \n" "ld1 {v5.8b}, [%1], %4 \n"
MEMACCESS(0)
"ld1 {v6.8b}, [%0], %4 \n" "ld1 {v6.8b}, [%0], %4 \n"
MEMACCESS(1)
"ld1 {v7.8b}, [%1], %4 \n" "ld1 {v7.8b}, [%1], %4 \n"
"uaddl v0.8h, v0.8b, v1.8b \n" "uaddl v0.8h, v0.8b, v1.8b \n"
"uaddl v2.8h, v2.8b, v3.8b \n" "uaddl v2.8h, v2.8b, v3.8b \n"
...@@ -949,7 +868,6 @@ void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb, ...@@ -949,7 +868,6 @@ void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb,
"rshrn v0.8b, v0.8h, #2 \n" // first 2 pixels. "rshrn v0.8b, v0.8h, #2 \n" // first 2 pixels.
"rshrn2 v0.16b, v4.8h, #2 \n" // next 2 pixels. "rshrn2 v0.16b, v4.8h, #2 \n" // next 2 pixels.
"subs %w3, %w3, #4 \n" // 4 pixels per loop. "subs %w3, %w3, #4 \n" // 4 pixels per loop.
MEMACCESS(2)
"st1 {v0.16b}, [%2], #16 \n" "st1 {v0.16b}, [%2], #16 \n"
"b.gt 1b \n" "b.gt 1b \n"
: "+r"(src_argb), // %0 : "+r"(src_argb), // %0
...@@ -968,7 +886,6 @@ void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb, ...@@ -968,7 +886,6 @@ void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb,
"lsr %5, %3, #16 \n" \ "lsr %5, %3, #16 \n" \
"add %6, %1, %5, lsl #2 \n" \ "add %6, %1, %5, lsl #2 \n" \
"add %3, %3, %4 \n" \ "add %3, %3, %4 \n" \
MEMACCESS(6) \
"ld1 {" #vn ".s}[" #n "], [%6] \n" "ld1 {" #vn ".s}[" #n "], [%6] \n"
// clang-format on // clang-format on
...@@ -992,7 +909,6 @@ void ScaleARGBCols_NEON(uint8* dst_argb, ...@@ -992,7 +909,6 @@ void ScaleARGBCols_NEON(uint8* dst_argb,
LOAD1_DATA32_LANE(v1, 2) LOAD1_DATA32_LANE(v1, 2)
LOAD1_DATA32_LANE(v1, 3) LOAD1_DATA32_LANE(v1, 3)
MEMACCESS(0)
"st1 {v0.4s, v1.4s}, [%0], #32 \n" // store pixels "st1 {v0.4s, v1.4s}, [%0], #32 \n" // store pixels
"subs %w2, %w2, #8 \n" // 8 processed per loop "subs %w2, %w2, #8 \n" // 8 processed per loop
"b.gt 1b \n" "b.gt 1b \n"
...@@ -1017,7 +933,6 @@ void ScaleARGBCols_NEON(uint8* dst_argb, ...@@ -1017,7 +933,6 @@ void ScaleARGBCols_NEON(uint8* dst_argb,
"lsr %5, %3, #16 \n" \ "lsr %5, %3, #16 \n" \
"add %6, %1, %5, lsl #2 \n" \ "add %6, %1, %5, lsl #2 \n" \
"add %3, %3, %4 \n" \ "add %3, %3, %4 \n" \
MEMACCESS(6) \
"ld2 {" #vn1 ".s, " #vn2 ".s}[" #n "], [%6] \n" "ld2 {" #vn1 ".s, " #vn2 ".s}[" #n "], [%6] \n"
// clang-format on // clang-format on
...@@ -1067,7 +982,6 @@ void ScaleARGBFilterCols_NEON(uint8* dst_argb, ...@@ -1067,7 +982,6 @@ void ScaleARGBFilterCols_NEON(uint8* dst_argb,
"shrn v0.8b, v16.8h, #7 \n" "shrn v0.8b, v16.8h, #7 \n"
"shrn2 v0.16b, v17.8h, #7 \n" "shrn2 v0.16b, v17.8h, #7 \n"
MEMACCESS(0)
"st1 {v0.4s}, [%0], #16 \n" // store pixels "st1 {v0.4s}, [%0], #16 \n" // store pixels
"add v5.4s, v5.4s, v6.4s \n" "add v5.4s, v5.4s, v6.4s \n"
"subs %w2, %w2, #4 \n" // 4 processed per loop "subs %w2, %w2, #4 \n" // 4 processed per loop
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment