Commit 6c94ad13 authored by Frank Barchard, committed by Commit Bot

Remove ARM NaCL macros from source

NaCl has been disabled for a while; under NaCl the code
will still build, but only with the C versions.
This change removes the MEMACCESS() macros from
Neon and Neon64 source.

BUG=libyuv:702
TEST=try bots build for arm.
R=kjellander@chromium.org

Change-Id: Id581a5c8ff71e18cc69595e7fee9337f97c44a19
Reviewed-on: https://chromium-review.googlesource.com/528332
Reviewed-by: Cheng Wang <wangcheng@google.com>
Commit-Queue: Frank Barchard <fbarchard@google.com>
parent 5f94a33e
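For context: the MEMACCESS() macro deleted throughout this change only did anything under Native Client. A minimal sketch of its effect, mirroring the definition removed in the header hunk below (illustration only, not part of the commit):

  // Under NaCl, mask the base address register into the sandbox before
  // each NEON load/store; on all other ARM builds, expand to nothing.
  #if defined(__native_client__)
  #define MEMACCESS(base) ".p2align 3\nbic %" #base ", #0xc0000000\n"
  #else
  #define MEMACCESS(base)
  #endif

So a call site such as
  MEMACCESS(0)
  "vld1.8 {q0}, [%0]! \n"
simply becomes
  "vld1.8 {q0}, [%0]! \n"
which generates identical code on the non-NaCl builds the library now targets.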
...@@ -625,15 +625,6 @@ extern const struct YuvConstants SIMD_ALIGNED(kYvuH709Constants); // BT.709 ...@@ -625,15 +625,6 @@ extern const struct YuvConstants SIMD_ALIGNED(kYvuH709Constants); // BT.709
#op " $" #sel ",%%" #reg "," #offset "(%" #base ",%" #index "," #scale ")\n" #op " $" #sel ",%%" #reg "," #offset "(%" #base ",%" #index "," #scale ")\n"
#endif // defined(__native_client__) && defined(__x86_64__) #endif // defined(__native_client__) && defined(__x86_64__)
#if defined(__arm__) || defined(__aarch64__)
#undef MEMACCESS
#if defined(__native_client__)
#define MEMACCESS(base) ".p2align 3\nbic %" #base ", #0xc0000000\n"
#else
#define MEMACCESS(base)
#endif
#endif
// Intel Code Analizer markers. Insert IACA_START IACA_END around code to be // Intel Code Analizer markers. Insert IACA_START IACA_END around code to be
// measured and then run with iaca -64 libyuv_unittest. // measured and then run with iaca -64 libyuv_unittest.
// IACA_ASM_START amd IACA_ASM_END are equivalents that can be used within // IACA_ASM_START amd IACA_ASM_END are equivalents that can be used within
......
...@@ -64,9 +64,7 @@ uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count) {
      "vmov.u8 q11, #0 \n"
      "1: \n"
-     MEMACCESS(0)
      "vld1.8 {q0}, [%0]! \n"
-     MEMACCESS(1)
      "vld1.8 {q1}, [%1]! \n"
      "subs %2, %2, #16 \n"
      "vsubl.u8 q2, d0, d2 \n"
......
...@@ -59,9 +59,7 @@ uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count) {
      "eor v19.16b, v19.16b, v19.16b \n"
      "1: \n"
-     MEMACCESS(0)
      "ld1 {v0.16b}, [%0], #16 \n"
-     MEMACCESS(1)
      "ld1 {v1.16b}, [%1], #16 \n"
      "subs %w2, %w2, #16 \n"
      "usubl v2.8h, v0.8b, v1.8b \n"
......
...@@ -30,31 +30,23 @@ void TransposeWx8_NEON(const uint8* src,
                       int dst_stride,
                       int width) {
  const uint8* src_temp;
  asm volatile(
      // loops are on blocks of 8. loop will stop when
      // counter gets to or below 0. starting the counter
      // at w-8 allow for this
      "sub %5, #8 \n"

      // handle 8x8 blocks. this should be the majority of the plane
      "1: \n"
      "mov %0, %1 \n"
-     MEMACCESS(0)
      "vld1.8 {d0}, [%0], %2 \n"
-     MEMACCESS(0)
      "vld1.8 {d1}, [%0], %2 \n"
-     MEMACCESS(0)
      "vld1.8 {d2}, [%0], %2 \n"
-     MEMACCESS(0)
      "vld1.8 {d3}, [%0], %2 \n"
-     MEMACCESS(0)
      "vld1.8 {d4}, [%0], %2 \n"
-     MEMACCESS(0)
      "vld1.8 {d5}, [%0], %2 \n"
-     MEMACCESS(0)
      "vld1.8 {d6}, [%0], %2 \n"
-     MEMACCESS(0)
      "vld1.8 {d7}, [%0] \n"

      "vtrn.8 d1, d0 \n"
...@@ -79,21 +71,13 @@ void TransposeWx8_NEON(const uint8* src,
      "mov %0, %3 \n"

-     MEMACCESS(0)
      "vst1.8 {d1}, [%0], %4 \n"
-     MEMACCESS(0)
      "vst1.8 {d0}, [%0], %4 \n"
-     MEMACCESS(0)
      "vst1.8 {d3}, [%0], %4 \n"
-     MEMACCESS(0)
      "vst1.8 {d2}, [%0], %4 \n"
-     MEMACCESS(0)
      "vst1.8 {d5}, [%0], %4 \n"
-     MEMACCESS(0)
      "vst1.8 {d4}, [%0], %4 \n"
-     MEMACCESS(0)
      "vst1.8 {d7}, [%0], %4 \n"
-     MEMACCESS(0)
      "vst1.8 {d6}, [%0] \n"

      "add %1, #8 \n"  // src += 8
...@@ -101,145 +85,108 @@ void TransposeWx8_NEON(const uint8* src,
      "subs %5, #8 \n"  // w -= 8
      "bge 1b \n"

      // add 8 back to counter. if the result is 0 there are
      // no residuals.
      "adds %5, #8 \n"
      "beq 4f \n"

      // some residual, so between 1 and 7 lines left to transpose
      "cmp %5, #2 \n"
      "blt 3f \n"

      "cmp %5, #4 \n"
      "blt 2f \n"

      // 4x8 block
      "mov %0, %1 \n"
-     MEMACCESS(0)
      "vld1.32 {d0[0]}, [%0], %2 \n"
-     MEMACCESS(0)
      "vld1.32 {d0[1]}, [%0], %2 \n"
-     MEMACCESS(0)
      "vld1.32 {d1[0]}, [%0], %2 \n"
-     MEMACCESS(0)
      "vld1.32 {d1[1]}, [%0], %2 \n"
-     MEMACCESS(0)
      "vld1.32 {d2[0]}, [%0], %2 \n"
-     MEMACCESS(0)
      "vld1.32 {d2[1]}, [%0], %2 \n"
-     MEMACCESS(0)
      "vld1.32 {d3[0]}, [%0], %2 \n"
-     MEMACCESS(0)
      "vld1.32 {d3[1]}, [%0] \n"

      "mov %0, %3 \n"

-     MEMACCESS(6)
      "vld1.8 {q3}, [%6] \n"

      "vtbl.8 d4, {d0, d1}, d6 \n"
      "vtbl.8 d5, {d0, d1}, d7 \n"
      "vtbl.8 d0, {d2, d3}, d6 \n"
      "vtbl.8 d1, {d2, d3}, d7 \n"

      // TODO(frkoenig): Rework shuffle above to
      // write out with 4 instead of 8 writes.
-     MEMACCESS(0)
      "vst1.32 {d4[0]}, [%0], %4 \n"
-     MEMACCESS(0)
      "vst1.32 {d4[1]}, [%0], %4 \n"
-     MEMACCESS(0)
      "vst1.32 {d5[0]}, [%0], %4 \n"
-     MEMACCESS(0)
      "vst1.32 {d5[1]}, [%0] \n"

      "add %0, %3, #4 \n"
-     MEMACCESS(0)
      "vst1.32 {d0[0]}, [%0], %4 \n"
-     MEMACCESS(0)
      "vst1.32 {d0[1]}, [%0], %4 \n"
-     MEMACCESS(0)
      "vst1.32 {d1[0]}, [%0], %4 \n"
-     MEMACCESS(0)
      "vst1.32 {d1[1]}, [%0] \n"

      "add %1, #4 \n"  // src += 4
      "add %3, %3, %4, lsl #2 \n"  // dst += 4 * dst_stride
      "subs %5, #4 \n"  // w -= 4
      "beq 4f \n"

      // some residual, check to see if it includes a 2x8 block,
      // or less
      "cmp %5, #2 \n"
      "blt 3f \n"

      // 2x8 block
      "2: \n"
      "mov %0, %1 \n"
-     MEMACCESS(0)
      "vld1.16 {d0[0]}, [%0], %2 \n"
-     MEMACCESS(0)
      "vld1.16 {d1[0]}, [%0], %2 \n"
-     MEMACCESS(0)
      "vld1.16 {d0[1]}, [%0], %2 \n"
-     MEMACCESS(0)
      "vld1.16 {d1[1]}, [%0], %2 \n"
-     MEMACCESS(0)
      "vld1.16 {d0[2]}, [%0], %2 \n"
-     MEMACCESS(0)
      "vld1.16 {d1[2]}, [%0], %2 \n"
-     MEMACCESS(0)
      "vld1.16 {d0[3]}, [%0], %2 \n"
-     MEMACCESS(0)
      "vld1.16 {d1[3]}, [%0] \n"

      "vtrn.8 d0, d1 \n"

      "mov %0, %3 \n"

-     MEMACCESS(0)
      "vst1.64 {d0}, [%0], %4 \n"
-     MEMACCESS(0)
      "vst1.64 {d1}, [%0] \n"

      "add %1, #2 \n"  // src += 2
      "add %3, %3, %4, lsl #1 \n"  // dst += 2 * dst_stride
      "subs %5, #2 \n"  // w -= 2
      "beq 4f \n"

      // 1x8 block
      "3: \n"
-     MEMACCESS(1)
      "vld1.8 {d0[0]}, [%1], %2 \n"
-     MEMACCESS(1)
      "vld1.8 {d0[1]}, [%1], %2 \n"
-     MEMACCESS(1)
      "vld1.8 {d0[2]}, [%1], %2 \n"
-     MEMACCESS(1)
      "vld1.8 {d0[3]}, [%1], %2 \n"
-     MEMACCESS(1)
      "vld1.8 {d0[4]}, [%1], %2 \n"
-     MEMACCESS(1)
      "vld1.8 {d0[5]}, [%1], %2 \n"
-     MEMACCESS(1)
      "vld1.8 {d0[6]}, [%1], %2 \n"
-     MEMACCESS(1)
      "vld1.8 {d0[7]}, [%1] \n"

-     MEMACCESS(3)
      "vst1.64 {d0}, [%3] \n"

      "4: \n"

      : "=&r"(src_temp),   // %0
        "+r"(src),         // %1
        "+r"(src_stride),  // %2
        "+r"(dst),         // %3
        "+r"(dst_stride),  // %4
        "+r"(width)        // %5
      : "r"(&kVTbl4x4Transpose)  // %6
      : "memory", "cc", "q0", "q1", "q2", "q3");
}

static uvec8 kVTbl4x4TransposeDi = {0, 8, 1, 9, 2, 10, 3, 11,
...@@ -253,31 +200,23 @@ void TransposeUVWx8_NEON(const uint8* src, ...@@ -253,31 +200,23 @@ void TransposeUVWx8_NEON(const uint8* src,
int dst_stride_b, int dst_stride_b,
int width) { int width) {
const uint8* src_temp; const uint8* src_temp;
asm volatile ( asm volatile(
// loops are on blocks of 8. loop will stop when // loops are on blocks of 8. loop will stop when
// counter gets to or below 0. starting the counter // counter gets to or below 0. starting the counter
// at w-8 allow for this // at w-8 allow for this
"sub %7, #8 \n" "sub %7, #8 \n"
// handle 8x8 blocks. this should be the majority of the plane // handle 8x8 blocks. this should be the majority of the plane
"1: \n" "1: \n"
"mov %0, %1 \n" "mov %0, %1 \n"
MEMACCESS(0)
"vld2.8 {d0, d1}, [%0], %2 \n" "vld2.8 {d0, d1}, [%0], %2 \n"
MEMACCESS(0)
"vld2.8 {d2, d3}, [%0], %2 \n" "vld2.8 {d2, d3}, [%0], %2 \n"
MEMACCESS(0)
"vld2.8 {d4, d5}, [%0], %2 \n" "vld2.8 {d4, d5}, [%0], %2 \n"
MEMACCESS(0)
"vld2.8 {d6, d7}, [%0], %2 \n" "vld2.8 {d6, d7}, [%0], %2 \n"
MEMACCESS(0)
"vld2.8 {d16, d17}, [%0], %2 \n" "vld2.8 {d16, d17}, [%0], %2 \n"
MEMACCESS(0)
"vld2.8 {d18, d19}, [%0], %2 \n" "vld2.8 {d18, d19}, [%0], %2 \n"
MEMACCESS(0)
"vld2.8 {d20, d21}, [%0], %2 \n" "vld2.8 {d20, d21}, [%0], %2 \n"
MEMACCESS(0)
"vld2.8 {d22, d23}, [%0] \n" "vld2.8 {d22, d23}, [%0] \n"
"vtrn.8 q1, q0 \n" "vtrn.8 q1, q0 \n"
...@@ -306,40 +245,24 @@ void TransposeUVWx8_NEON(const uint8* src, ...@@ -306,40 +245,24 @@ void TransposeUVWx8_NEON(const uint8* src,
"mov %0, %3 \n" "mov %0, %3 \n"
MEMACCESS(0)
"vst1.8 {d2}, [%0], %4 \n" "vst1.8 {d2}, [%0], %4 \n"
MEMACCESS(0)
"vst1.8 {d0}, [%0], %4 \n" "vst1.8 {d0}, [%0], %4 \n"
MEMACCESS(0)
"vst1.8 {d6}, [%0], %4 \n" "vst1.8 {d6}, [%0], %4 \n"
MEMACCESS(0)
"vst1.8 {d4}, [%0], %4 \n" "vst1.8 {d4}, [%0], %4 \n"
MEMACCESS(0)
"vst1.8 {d18}, [%0], %4 \n" "vst1.8 {d18}, [%0], %4 \n"
MEMACCESS(0)
"vst1.8 {d16}, [%0], %4 \n" "vst1.8 {d16}, [%0], %4 \n"
MEMACCESS(0)
"vst1.8 {d22}, [%0], %4 \n" "vst1.8 {d22}, [%0], %4 \n"
MEMACCESS(0)
"vst1.8 {d20}, [%0] \n" "vst1.8 {d20}, [%0] \n"
"mov %0, %5 \n" "mov %0, %5 \n"
MEMACCESS(0)
"vst1.8 {d3}, [%0], %6 \n" "vst1.8 {d3}, [%0], %6 \n"
MEMACCESS(0)
"vst1.8 {d1}, [%0], %6 \n" "vst1.8 {d1}, [%0], %6 \n"
MEMACCESS(0)
"vst1.8 {d7}, [%0], %6 \n" "vst1.8 {d7}, [%0], %6 \n"
MEMACCESS(0)
"vst1.8 {d5}, [%0], %6 \n" "vst1.8 {d5}, [%0], %6 \n"
MEMACCESS(0)
"vst1.8 {d19}, [%0], %6 \n" "vst1.8 {d19}, [%0], %6 \n"
MEMACCESS(0)
"vst1.8 {d17}, [%0], %6 \n" "vst1.8 {d17}, [%0], %6 \n"
MEMACCESS(0)
"vst1.8 {d23}, [%0], %6 \n" "vst1.8 {d23}, [%0], %6 \n"
MEMACCESS(0)
"vst1.8 {d21}, [%0] \n" "vst1.8 {d21}, [%0] \n"
"add %1, #8*2 \n" // src += 8*2 "add %1, #8*2 \n" // src += 8*2
...@@ -348,187 +271,142 @@ void TransposeUVWx8_NEON(const uint8* src, ...@@ -348,187 +271,142 @@ void TransposeUVWx8_NEON(const uint8* src,
"subs %7, #8 \n" // w -= 8 "subs %7, #8 \n" // w -= 8
"bge 1b \n" "bge 1b \n"
// add 8 back to counter. if the result is 0 there are // add 8 back to counter. if the result is 0 there are
// no residuals. // no residuals.
"adds %7, #8 \n" "adds %7, #8 \n"
"beq 4f \n" "beq 4f \n"
// some residual, so between 1 and 7 lines left to transpose // some residual, so between 1 and 7 lines left to transpose
"cmp %7, #2 \n" "cmp %7, #2 \n"
"blt 3f \n" "blt 3f \n"
"cmp %7, #4 \n" "cmp %7, #4 \n"
"blt 2f \n" "blt 2f \n"
// TODO(frkoenig): Clean this up // TODO(frkoenig): Clean this up
// 4x8 block // 4x8 block
"mov %0, %1 \n" "mov %0, %1 \n"
MEMACCESS(0) "vld1.64 {d0}, [%0], %2 \n"
"vld1.64 {d0}, [%0], %2 \n" "vld1.64 {d1}, [%0], %2 \n"
MEMACCESS(0) "vld1.64 {d2}, [%0], %2 \n"
"vld1.64 {d1}, [%0], %2 \n" "vld1.64 {d3}, [%0], %2 \n"
MEMACCESS(0) "vld1.64 {d4}, [%0], %2 \n"
"vld1.64 {d2}, [%0], %2 \n" "vld1.64 {d5}, [%0], %2 \n"
MEMACCESS(0) "vld1.64 {d6}, [%0], %2 \n"
"vld1.64 {d3}, [%0], %2 \n" "vld1.64 {d7}, [%0] \n"
MEMACCESS(0)
"vld1.64 {d4}, [%0], %2 \n" "vld1.8 {q15}, [%8] \n"
MEMACCESS(0)
"vld1.64 {d5}, [%0], %2 \n" "vtrn.8 q0, q1 \n"
MEMACCESS(0) "vtrn.8 q2, q3 \n"
"vld1.64 {d6}, [%0], %2 \n"
MEMACCESS(0) "vtbl.8 d16, {d0, d1}, d30 \n"
"vld1.64 {d7}, [%0] \n" "vtbl.8 d17, {d0, d1}, d31 \n"
"vtbl.8 d18, {d2, d3}, d30 \n"
MEMACCESS(8) "vtbl.8 d19, {d2, d3}, d31 \n"
"vld1.8 {q15}, [%8] \n" "vtbl.8 d20, {d4, d5}, d30 \n"
"vtbl.8 d21, {d4, d5}, d31 \n"
"vtrn.8 q0, q1 \n" "vtbl.8 d22, {d6, d7}, d30 \n"
"vtrn.8 q2, q3 \n" "vtbl.8 d23, {d6, d7}, d31 \n"
"vtbl.8 d16, {d0, d1}, d30 \n" "mov %0, %3 \n"
"vtbl.8 d17, {d0, d1}, d31 \n"
"vtbl.8 d18, {d2, d3}, d30 \n" "vst1.32 {d16[0]}, [%0], %4 \n"
"vtbl.8 d19, {d2, d3}, d31 \n" "vst1.32 {d16[1]}, [%0], %4 \n"
"vtbl.8 d20, {d4, d5}, d30 \n" "vst1.32 {d17[0]}, [%0], %4 \n"
"vtbl.8 d21, {d4, d5}, d31 \n" "vst1.32 {d17[1]}, [%0], %4 \n"
"vtbl.8 d22, {d6, d7}, d30 \n"
"vtbl.8 d23, {d6, d7}, d31 \n" "add %0, %3, #4 \n"
"vst1.32 {d20[0]}, [%0], %4 \n"
"mov %0, %3 \n" "vst1.32 {d20[1]}, [%0], %4 \n"
"vst1.32 {d21[0]}, [%0], %4 \n"
MEMACCESS(0) "vst1.32 {d21[1]}, [%0] \n"
"vst1.32 {d16[0]}, [%0], %4 \n"
MEMACCESS(0) "mov %0, %5 \n"
"vst1.32 {d16[1]}, [%0], %4 \n"
MEMACCESS(0) "vst1.32 {d18[0]}, [%0], %6 \n"
"vst1.32 {d17[0]}, [%0], %4 \n" "vst1.32 {d18[1]}, [%0], %6 \n"
MEMACCESS(0) "vst1.32 {d19[0]}, [%0], %6 \n"
"vst1.32 {d17[1]}, [%0], %4 \n" "vst1.32 {d19[1]}, [%0], %6 \n"
"add %0, %3, #4 \n" "add %0, %5, #4 \n"
MEMACCESS(0) "vst1.32 {d22[0]}, [%0], %6 \n"
"vst1.32 {d20[0]}, [%0], %4 \n" "vst1.32 {d22[1]}, [%0], %6 \n"
MEMACCESS(0) "vst1.32 {d23[0]}, [%0], %6 \n"
"vst1.32 {d20[1]}, [%0], %4 \n" "vst1.32 {d23[1]}, [%0] \n"
MEMACCESS(0)
"vst1.32 {d21[0]}, [%0], %4 \n" "add %1, #4*2 \n" // src += 4 * 2
MEMACCESS(0) "add %3, %3, %4, lsl #2 \n" // dst_a += 4 *
"vst1.32 {d21[1]}, [%0] \n" // dst_stride_a
"add %5, %5, %6, lsl #2 \n" // dst_b += 4 *
"mov %0, %5 \n" // dst_stride_b
"subs %7, #4 \n" // w -= 4
MEMACCESS(0) "beq 4f \n"
"vst1.32 {d18[0]}, [%0], %6 \n"
MEMACCESS(0) // some residual, check to see if it includes a 2x8 block,
"vst1.32 {d18[1]}, [%0], %6 \n" // or less
MEMACCESS(0) "cmp %7, #2 \n"
"vst1.32 {d19[0]}, [%0], %6 \n" "blt 3f \n"
MEMACCESS(0)
"vst1.32 {d19[1]}, [%0], %6 \n" // 2x8 block
"2: \n"
"add %0, %5, #4 \n" "mov %0, %1 \n"
MEMACCESS(0) "vld2.16 {d0[0], d2[0]}, [%0], %2 \n"
"vst1.32 {d22[0]}, [%0], %6 \n" "vld2.16 {d1[0], d3[0]}, [%0], %2 \n"
MEMACCESS(0) "vld2.16 {d0[1], d2[1]}, [%0], %2 \n"
"vst1.32 {d22[1]}, [%0], %6 \n" "vld2.16 {d1[1], d3[1]}, [%0], %2 \n"
MEMACCESS(0) "vld2.16 {d0[2], d2[2]}, [%0], %2 \n"
"vst1.32 {d23[0]}, [%0], %6 \n" "vld2.16 {d1[2], d3[2]}, [%0], %2 \n"
MEMACCESS(0) "vld2.16 {d0[3], d2[3]}, [%0], %2 \n"
"vst1.32 {d23[1]}, [%0] \n" "vld2.16 {d1[3], d3[3]}, [%0] \n"
"add %1, #4*2 \n" // src += 4 * 2 "vtrn.8 d0, d1 \n"
"add %3, %3, %4, lsl #2 \n" // dst_a += 4 * dst_stride_a "vtrn.8 d2, d3 \n"
"add %5, %5, %6, lsl #2 \n" // dst_b += 4 * dst_stride_b
"subs %7, #4 \n" // w -= 4 "mov %0, %3 \n"
"beq 4f \n"
"vst1.64 {d0}, [%0], %4 \n"
// some residual, check to see if it includes a 2x8 block, "vst1.64 {d2}, [%0] \n"
// or less
"cmp %7, #2 \n" "mov %0, %5 \n"
"blt 3f \n"
"vst1.64 {d1}, [%0], %6 \n"
// 2x8 block "vst1.64 {d3}, [%0] \n"
"2: \n"
"mov %0, %1 \n" "add %1, #2*2 \n" // src += 2 * 2
MEMACCESS(0) "add %3, %3, %4, lsl #1 \n" // dst_a += 2 *
"vld2.16 {d0[0], d2[0]}, [%0], %2 \n" // dst_stride_a
MEMACCESS(0) "add %5, %5, %6, lsl #1 \n" // dst_b += 2 *
"vld2.16 {d1[0], d3[0]}, [%0], %2 \n" // dst_stride_b
MEMACCESS(0) "subs %7, #2 \n" // w -= 2
"vld2.16 {d0[1], d2[1]}, [%0], %2 \n" "beq 4f \n"
MEMACCESS(0)
"vld2.16 {d1[1], d3[1]}, [%0], %2 \n" // 1x8 block
MEMACCESS(0) "3: \n"
"vld2.16 {d0[2], d2[2]}, [%0], %2 \n" "vld2.8 {d0[0], d1[0]}, [%1], %2 \n"
MEMACCESS(0) "vld2.8 {d0[1], d1[1]}, [%1], %2 \n"
"vld2.16 {d1[2], d3[2]}, [%0], %2 \n" "vld2.8 {d0[2], d1[2]}, [%1], %2 \n"
MEMACCESS(0) "vld2.8 {d0[3], d1[3]}, [%1], %2 \n"
"vld2.16 {d0[3], d2[3]}, [%0], %2 \n" "vld2.8 {d0[4], d1[4]}, [%1], %2 \n"
MEMACCESS(0) "vld2.8 {d0[5], d1[5]}, [%1], %2 \n"
"vld2.16 {d1[3], d3[3]}, [%0] \n" "vld2.8 {d0[6], d1[6]}, [%1], %2 \n"
"vld2.8 {d0[7], d1[7]}, [%1] \n"
"vtrn.8 d0, d1 \n"
"vtrn.8 d2, d3 \n" "vst1.64 {d0}, [%3] \n"
"vst1.64 {d1}, [%5] \n"
"mov %0, %3 \n"
"4: \n"
MEMACCESS(0)
"vst1.64 {d0}, [%0], %4 \n" : "=&r"(src_temp), // %0
MEMACCESS(0) "+r"(src), // %1
"vst1.64 {d2}, [%0] \n" "+r"(src_stride), // %2
"+r"(dst_a), // %3
"mov %0, %5 \n" "+r"(dst_stride_a), // %4
"+r"(dst_b), // %5
MEMACCESS(0) "+r"(dst_stride_b), // %6
"vst1.64 {d1}, [%0], %6 \n" "+r"(width) // %7
MEMACCESS(0) : "r"(&kVTbl4x4TransposeDi) // %8
"vst1.64 {d3}, [%0] \n" : "memory", "cc", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11");
"add %1, #2*2 \n" // src += 2 * 2
"add %3, %3, %4, lsl #1 \n" // dst_a += 2 * dst_stride_a
"add %5, %5, %6, lsl #1 \n" // dst_b += 2 * dst_stride_b
"subs %7, #2 \n" // w -= 2
"beq 4f \n"
// 1x8 block
"3: \n"
MEMACCESS(1)
"vld2.8 {d0[0], d1[0]}, [%1], %2 \n"
MEMACCESS(1)
"vld2.8 {d0[1], d1[1]}, [%1], %2 \n"
MEMACCESS(1)
"vld2.8 {d0[2], d1[2]}, [%1], %2 \n"
MEMACCESS(1)
"vld2.8 {d0[3], d1[3]}, [%1], %2 \n"
MEMACCESS(1)
"vld2.8 {d0[4], d1[4]}, [%1], %2 \n"
MEMACCESS(1)
"vld2.8 {d0[5], d1[5]}, [%1], %2 \n"
MEMACCESS(1)
"vld2.8 {d0[6], d1[6]}, [%1], %2 \n"
MEMACCESS(1)
"vld2.8 {d0[7], d1[7]}, [%1] \n"
MEMACCESS(3)
"vst1.64 {d0}, [%3] \n"
MEMACCESS(5)
"vst1.64 {d1}, [%5] \n"
"4: \n"
: "=&r"(src_temp), // %0
"+r"(src), // %1
"+r"(src_stride), // %2
"+r"(dst_a), // %3
"+r"(dst_stride_a), // %4
"+r"(dst_b), // %5
"+r"(dst_stride_b), // %6
"+r"(width) // %7
: "r"(&kVTbl4x4TransposeDi) // %8
: "memory", "cc",
"q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11"
);
} }
#endif // defined(__ARM_NEON__) && !defined(__aarch64__) #endif // defined(__ARM_NEON__) && !defined(__aarch64__)
......
...@@ -34,27 +34,19 @@ void TransposeWx8_NEON(const uint8* src, ...@@ -34,27 +34,19 @@ void TransposeWx8_NEON(const uint8* src,
// loops are on blocks of 8. loop will stop when // loops are on blocks of 8. loop will stop when
// counter gets to or below 0. starting the counter // counter gets to or below 0. starting the counter
// at w-8 allow for this // at w-8 allow for this
"sub %w3, %w3, #8 \n" "sub %w3, %w3, #8 \n"
// handle 8x8 blocks. this should be the majority of the plane // handle 8x8 blocks. this should be the majority of the plane
"1: \n" "1: \n"
"mov %0, %1 \n" "mov %0, %1 \n"
MEMACCESS(0)
"ld1 {v0.8b}, [%0], %5 \n" "ld1 {v0.8b}, [%0], %5 \n"
MEMACCESS(0)
"ld1 {v1.8b}, [%0], %5 \n" "ld1 {v1.8b}, [%0], %5 \n"
MEMACCESS(0)
"ld1 {v2.8b}, [%0], %5 \n" "ld1 {v2.8b}, [%0], %5 \n"
MEMACCESS(0)
"ld1 {v3.8b}, [%0], %5 \n" "ld1 {v3.8b}, [%0], %5 \n"
MEMACCESS(0)
"ld1 {v4.8b}, [%0], %5 \n" "ld1 {v4.8b}, [%0], %5 \n"
MEMACCESS(0)
"ld1 {v5.8b}, [%0], %5 \n" "ld1 {v5.8b}, [%0], %5 \n"
MEMACCESS(0)
"ld1 {v6.8b}, [%0], %5 \n" "ld1 {v6.8b}, [%0], %5 \n"
MEMACCESS(0)
"ld1 {v7.8b}, [%0] \n" "ld1 {v7.8b}, [%0] \n"
"trn2 v16.8b, v0.8b, v1.8b \n" "trn2 v16.8b, v0.8b, v1.8b \n"
...@@ -86,31 +78,23 @@ void TransposeWx8_NEON(const uint8* src, ...@@ -86,31 +78,23 @@ void TransposeWx8_NEON(const uint8* src,
"mov %0, %2 \n" "mov %0, %2 \n"
MEMACCESS(0)
"st1 {v17.8b}, [%0], %6 \n" "st1 {v17.8b}, [%0], %6 \n"
MEMACCESS(0)
"st1 {v16.8b}, [%0], %6 \n" "st1 {v16.8b}, [%0], %6 \n"
MEMACCESS(0)
"st1 {v19.8b}, [%0], %6 \n" "st1 {v19.8b}, [%0], %6 \n"
MEMACCESS(0)
"st1 {v18.8b}, [%0], %6 \n" "st1 {v18.8b}, [%0], %6 \n"
MEMACCESS(0)
"st1 {v21.8b}, [%0], %6 \n" "st1 {v21.8b}, [%0], %6 \n"
MEMACCESS(0)
"st1 {v20.8b}, [%0], %6 \n" "st1 {v20.8b}, [%0], %6 \n"
MEMACCESS(0)
"st1 {v23.8b}, [%0], %6 \n" "st1 {v23.8b}, [%0], %6 \n"
MEMACCESS(0)
"st1 {v22.8b}, [%0] \n" "st1 {v22.8b}, [%0] \n"
"add %1, %1, #8 \n" // src += 8 "add %1, %1, #8 \n" // src += 8
"add %2, %2, %6, lsl #3 \n" // dst += 8 * dst_stride "add %2, %2, %6, lsl #3 \n" // dst += 8 * dst_stride
"subs %w3, %w3, #8 \n" // w -= 8 "subs %w3, %w3, #8 \n" // w -= 8
"b.ge 1b \n" "b.ge 1b \n"
// add 8 back to counter. if the result is 0 there are // add 8 back to counter. if the result is 0 there are
// no residuals. // no residuals.
"adds %w3, %w3, #8 \n" "adds %w3, %w3, #8 \n"
"b.eq 4f \n" "b.eq 4f \n"
// some residual, so between 1 and 7 lines left to transpose // some residual, so between 1 and 7 lines left to transpose
...@@ -122,26 +106,17 @@ void TransposeWx8_NEON(const uint8* src, ...@@ -122,26 +106,17 @@ void TransposeWx8_NEON(const uint8* src,
// 4x8 block // 4x8 block
"mov %0, %1 \n" "mov %0, %1 \n"
MEMACCESS(0)
"ld1 {v0.s}[0], [%0], %5 \n" "ld1 {v0.s}[0], [%0], %5 \n"
MEMACCESS(0)
"ld1 {v0.s}[1], [%0], %5 \n" "ld1 {v0.s}[1], [%0], %5 \n"
MEMACCESS(0)
"ld1 {v0.s}[2], [%0], %5 \n" "ld1 {v0.s}[2], [%0], %5 \n"
MEMACCESS(0)
"ld1 {v0.s}[3], [%0], %5 \n" "ld1 {v0.s}[3], [%0], %5 \n"
MEMACCESS(0)
"ld1 {v1.s}[0], [%0], %5 \n" "ld1 {v1.s}[0], [%0], %5 \n"
MEMACCESS(0)
"ld1 {v1.s}[1], [%0], %5 \n" "ld1 {v1.s}[1], [%0], %5 \n"
MEMACCESS(0)
"ld1 {v1.s}[2], [%0], %5 \n" "ld1 {v1.s}[2], [%0], %5 \n"
MEMACCESS(0)
"ld1 {v1.s}[3], [%0] \n" "ld1 {v1.s}[3], [%0] \n"
"mov %0, %2 \n" "mov %0, %2 \n"
MEMACCESS(4)
"ld1 {v2.16b}, [%4] \n" "ld1 {v2.16b}, [%4] \n"
"tbl v3.16b, {v0.16b}, v2.16b \n" "tbl v3.16b, {v0.16b}, v2.16b \n"
...@@ -149,53 +124,37 @@ void TransposeWx8_NEON(const uint8* src, ...@@ -149,53 +124,37 @@ void TransposeWx8_NEON(const uint8* src,
// TODO(frkoenig): Rework shuffle above to // TODO(frkoenig): Rework shuffle above to
// write out with 4 instead of 8 writes. // write out with 4 instead of 8 writes.
MEMACCESS(0)
"st1 {v3.s}[0], [%0], %6 \n" "st1 {v3.s}[0], [%0], %6 \n"
MEMACCESS(0)
"st1 {v3.s}[1], [%0], %6 \n" "st1 {v3.s}[1], [%0], %6 \n"
MEMACCESS(0)
"st1 {v3.s}[2], [%0], %6 \n" "st1 {v3.s}[2], [%0], %6 \n"
MEMACCESS(0)
"st1 {v3.s}[3], [%0] \n" "st1 {v3.s}[3], [%0] \n"
"add %0, %2, #4 \n" "add %0, %2, #4 \n"
MEMACCESS(0)
"st1 {v0.s}[0], [%0], %6 \n" "st1 {v0.s}[0], [%0], %6 \n"
MEMACCESS(0)
"st1 {v0.s}[1], [%0], %6 \n" "st1 {v0.s}[1], [%0], %6 \n"
MEMACCESS(0)
"st1 {v0.s}[2], [%0], %6 \n" "st1 {v0.s}[2], [%0], %6 \n"
MEMACCESS(0)
"st1 {v0.s}[3], [%0] \n" "st1 {v0.s}[3], [%0] \n"
"add %1, %1, #4 \n" // src += 4 "add %1, %1, #4 \n" // src += 4
"add %2, %2, %6, lsl #2 \n" // dst += 4 * dst_stride "add %2, %2, %6, lsl #2 \n" // dst += 4 * dst_stride
"subs %w3, %w3, #4 \n" // w -= 4 "subs %w3, %w3, #4 \n" // w -= 4
"b.eq 4f \n" "b.eq 4f \n"
// some residual, check to see if it includes a 2x8 block, // some residual, check to see if it includes a 2x8 block,
// or less // or less
"cmp %w3, #2 \n" "cmp %w3, #2 \n"
"b.lt 3f \n" "b.lt 3f \n"
// 2x8 block // 2x8 block
"2: \n" "2: \n"
"mov %0, %1 \n" "mov %0, %1 \n"
MEMACCESS(0)
"ld1 {v0.h}[0], [%0], %5 \n" "ld1 {v0.h}[0], [%0], %5 \n"
MEMACCESS(0)
"ld1 {v1.h}[0], [%0], %5 \n" "ld1 {v1.h}[0], [%0], %5 \n"
MEMACCESS(0)
"ld1 {v0.h}[1], [%0], %5 \n" "ld1 {v0.h}[1], [%0], %5 \n"
MEMACCESS(0)
"ld1 {v1.h}[1], [%0], %5 \n" "ld1 {v1.h}[1], [%0], %5 \n"
MEMACCESS(0)
"ld1 {v0.h}[2], [%0], %5 \n" "ld1 {v0.h}[2], [%0], %5 \n"
MEMACCESS(0)
"ld1 {v1.h}[2], [%0], %5 \n" "ld1 {v1.h}[2], [%0], %5 \n"
MEMACCESS(0)
"ld1 {v0.h}[3], [%0], %5 \n" "ld1 {v0.h}[3], [%0], %5 \n"
MEMACCESS(0)
"ld1 {v1.h}[3], [%0] \n" "ld1 {v1.h}[3], [%0] \n"
"trn2 v2.8b, v0.8b, v1.8b \n" "trn2 v2.8b, v0.8b, v1.8b \n"
...@@ -203,36 +162,25 @@ void TransposeWx8_NEON(const uint8* src, ...@@ -203,36 +162,25 @@ void TransposeWx8_NEON(const uint8* src,
"mov %0, %2 \n" "mov %0, %2 \n"
MEMACCESS(0)
"st1 {v3.8b}, [%0], %6 \n" "st1 {v3.8b}, [%0], %6 \n"
MEMACCESS(0)
"st1 {v2.8b}, [%0] \n" "st1 {v2.8b}, [%0] \n"
"add %1, %1, #2 \n" // src += 2 "add %1, %1, #2 \n" // src += 2
"add %2, %2, %6, lsl #1 \n" // dst += 2 * dst_stride "add %2, %2, %6, lsl #1 \n" // dst += 2 * dst_stride
"subs %w3, %w3, #2 \n" // w -= 2 "subs %w3, %w3, #2 \n" // w -= 2
"b.eq 4f \n" "b.eq 4f \n"
// 1x8 block // 1x8 block
"3: \n" "3: \n"
MEMACCESS(1)
"ld1 {v0.b}[0], [%1], %5 \n" "ld1 {v0.b}[0], [%1], %5 \n"
MEMACCESS(1)
"ld1 {v0.b}[1], [%1], %5 \n" "ld1 {v0.b}[1], [%1], %5 \n"
MEMACCESS(1)
"ld1 {v0.b}[2], [%1], %5 \n" "ld1 {v0.b}[2], [%1], %5 \n"
MEMACCESS(1)
"ld1 {v0.b}[3], [%1], %5 \n" "ld1 {v0.b}[3], [%1], %5 \n"
MEMACCESS(1)
"ld1 {v0.b}[4], [%1], %5 \n" "ld1 {v0.b}[4], [%1], %5 \n"
MEMACCESS(1)
"ld1 {v0.b}[5], [%1], %5 \n" "ld1 {v0.b}[5], [%1], %5 \n"
MEMACCESS(1)
"ld1 {v0.b}[6], [%1], %5 \n" "ld1 {v0.b}[6], [%1], %5 \n"
MEMACCESS(1)
"ld1 {v0.b}[7], [%1] \n" "ld1 {v0.b}[7], [%1] \n"
MEMACCESS(2)
"st1 {v0.8b}, [%2] \n" "st1 {v0.8b}, [%2] \n"
"4: \n" "4: \n"
...@@ -265,27 +213,19 @@ void TransposeUVWx8_NEON(const uint8* src, ...@@ -265,27 +213,19 @@ void TransposeUVWx8_NEON(const uint8* src,
// loops are on blocks of 8. loop will stop when // loops are on blocks of 8. loop will stop when
// counter gets to or below 0. starting the counter // counter gets to or below 0. starting the counter
// at w-8 allow for this // at w-8 allow for this
"sub %w4, %w4, #8 \n" "sub %w4, %w4, #8 \n"
// handle 8x8 blocks. this should be the majority of the plane // handle 8x8 blocks. this should be the majority of the plane
"1: \n" "1: \n"
"mov %0, %1 \n" "mov %0, %1 \n"
MEMACCESS(0)
"ld1 {v0.16b}, [%0], %5 \n" "ld1 {v0.16b}, [%0], %5 \n"
MEMACCESS(0)
"ld1 {v1.16b}, [%0], %5 \n" "ld1 {v1.16b}, [%0], %5 \n"
MEMACCESS(0)
"ld1 {v2.16b}, [%0], %5 \n" "ld1 {v2.16b}, [%0], %5 \n"
MEMACCESS(0)
"ld1 {v3.16b}, [%0], %5 \n" "ld1 {v3.16b}, [%0], %5 \n"
MEMACCESS(0)
"ld1 {v4.16b}, [%0], %5 \n" "ld1 {v4.16b}, [%0], %5 \n"
MEMACCESS(0)
"ld1 {v5.16b}, [%0], %5 \n" "ld1 {v5.16b}, [%0], %5 \n"
MEMACCESS(0)
"ld1 {v6.16b}, [%0], %5 \n" "ld1 {v6.16b}, [%0], %5 \n"
MEMACCESS(0)
"ld1 {v7.16b}, [%0] \n" "ld1 {v7.16b}, [%0] \n"
"trn1 v16.16b, v0.16b, v1.16b \n" "trn1 v16.16b, v0.16b, v1.16b \n"
...@@ -317,81 +257,56 @@ void TransposeUVWx8_NEON(const uint8* src, ...@@ -317,81 +257,56 @@ void TransposeUVWx8_NEON(const uint8* src,
"mov %0, %2 \n" "mov %0, %2 \n"
MEMACCESS(0)
"st1 {v16.d}[0], [%0], %6 \n" "st1 {v16.d}[0], [%0], %6 \n"
MEMACCESS(0)
"st1 {v18.d}[0], [%0], %6 \n" "st1 {v18.d}[0], [%0], %6 \n"
MEMACCESS(0)
"st1 {v17.d}[0], [%0], %6 \n" "st1 {v17.d}[0], [%0], %6 \n"
MEMACCESS(0)
"st1 {v19.d}[0], [%0], %6 \n" "st1 {v19.d}[0], [%0], %6 \n"
MEMACCESS(0)
"st1 {v16.d}[1], [%0], %6 \n" "st1 {v16.d}[1], [%0], %6 \n"
MEMACCESS(0)
"st1 {v18.d}[1], [%0], %6 \n" "st1 {v18.d}[1], [%0], %6 \n"
MEMACCESS(0)
"st1 {v17.d}[1], [%0], %6 \n" "st1 {v17.d}[1], [%0], %6 \n"
MEMACCESS(0)
"st1 {v19.d}[1], [%0] \n" "st1 {v19.d}[1], [%0] \n"
"mov %0, %3 \n" "mov %0, %3 \n"
MEMACCESS(0)
"st1 {v20.d}[0], [%0], %7 \n" "st1 {v20.d}[0], [%0], %7 \n"
MEMACCESS(0)
"st1 {v22.d}[0], [%0], %7 \n" "st1 {v22.d}[0], [%0], %7 \n"
MEMACCESS(0)
"st1 {v21.d}[0], [%0], %7 \n" "st1 {v21.d}[0], [%0], %7 \n"
MEMACCESS(0)
"st1 {v23.d}[0], [%0], %7 \n" "st1 {v23.d}[0], [%0], %7 \n"
MEMACCESS(0)
"st1 {v20.d}[1], [%0], %7 \n" "st1 {v20.d}[1], [%0], %7 \n"
MEMACCESS(0)
"st1 {v22.d}[1], [%0], %7 \n" "st1 {v22.d}[1], [%0], %7 \n"
MEMACCESS(0)
"st1 {v21.d}[1], [%0], %7 \n" "st1 {v21.d}[1], [%0], %7 \n"
MEMACCESS(0)
"st1 {v23.d}[1], [%0] \n" "st1 {v23.d}[1], [%0] \n"
"add %1, %1, #16 \n" // src += 8*2 "add %1, %1, #16 \n" // src += 8*2
"add %2, %2, %6, lsl #3 \n" // dst_a += 8 * dst_stride_a "add %2, %2, %6, lsl #3 \n" // dst_a += 8 * dst_stride_a
"add %3, %3, %7, lsl #3 \n" // dst_b += 8 * dst_stride_b "add %3, %3, %7, lsl #3 \n" // dst_b += 8 * dst_stride_b
"subs %w4, %w4, #8 \n" // w -= 8 "subs %w4, %w4, #8 \n" // w -= 8
"b.ge 1b \n" "b.ge 1b \n"
// add 8 back to counter. if the result is 0 there are // add 8 back to counter. if the result is 0 there are
// no residuals. // no residuals.
"adds %w4, %w4, #8 \n" "adds %w4, %w4, #8 \n"
"b.eq 4f \n" "b.eq 4f \n"
// some residual, so between 1 and 7 lines left to transpose // some residual, so between 1 and 7 lines left to transpose
"cmp %w4, #2 \n" "cmp %w4, #2 \n"
"b.lt 3f \n" "b.lt 3f \n"
"cmp %w4, #4 \n" "cmp %w4, #4 \n"
"b.lt 2f \n" "b.lt 2f \n"
// TODO(frkoenig): Clean this up // TODO(frkoenig): Clean this up
// 4x8 block // 4x8 block
"mov %0, %1 \n" "mov %0, %1 \n"
MEMACCESS(0)
"ld1 {v0.8b}, [%0], %5 \n" "ld1 {v0.8b}, [%0], %5 \n"
MEMACCESS(0)
"ld1 {v1.8b}, [%0], %5 \n" "ld1 {v1.8b}, [%0], %5 \n"
MEMACCESS(0)
"ld1 {v2.8b}, [%0], %5 \n" "ld1 {v2.8b}, [%0], %5 \n"
MEMACCESS(0)
"ld1 {v3.8b}, [%0], %5 \n" "ld1 {v3.8b}, [%0], %5 \n"
MEMACCESS(0)
"ld1 {v4.8b}, [%0], %5 \n" "ld1 {v4.8b}, [%0], %5 \n"
MEMACCESS(0)
"ld1 {v5.8b}, [%0], %5 \n" "ld1 {v5.8b}, [%0], %5 \n"
MEMACCESS(0)
"ld1 {v6.8b}, [%0], %5 \n" "ld1 {v6.8b}, [%0], %5 \n"
MEMACCESS(0)
"ld1 {v7.8b}, [%0] \n" "ld1 {v7.8b}, [%0] \n"
MEMACCESS(8)
"ld1 {v30.16b}, [%8], #16 \n" "ld1 {v30.16b}, [%8], #16 \n"
"ld1 {v31.16b}, [%8] \n" "ld1 {v31.16b}, [%8] \n"
...@@ -402,75 +317,51 @@ void TransposeUVWx8_NEON(const uint8* src, ...@@ -402,75 +317,51 @@ void TransposeUVWx8_NEON(const uint8* src,
"mov %0, %2 \n" "mov %0, %2 \n"
MEMACCESS(0)
"st1 {v16.s}[0], [%0], %6 \n" "st1 {v16.s}[0], [%0], %6 \n"
MEMACCESS(0)
"st1 {v16.s}[1], [%0], %6 \n" "st1 {v16.s}[1], [%0], %6 \n"
MEMACCESS(0)
"st1 {v16.s}[2], [%0], %6 \n" "st1 {v16.s}[2], [%0], %6 \n"
MEMACCESS(0)
"st1 {v16.s}[3], [%0], %6 \n" "st1 {v16.s}[3], [%0], %6 \n"
"add %0, %2, #4 \n" "add %0, %2, #4 \n"
MEMACCESS(0)
"st1 {v18.s}[0], [%0], %6 \n" "st1 {v18.s}[0], [%0], %6 \n"
MEMACCESS(0)
"st1 {v18.s}[1], [%0], %6 \n" "st1 {v18.s}[1], [%0], %6 \n"
MEMACCESS(0)
"st1 {v18.s}[2], [%0], %6 \n" "st1 {v18.s}[2], [%0], %6 \n"
MEMACCESS(0)
"st1 {v18.s}[3], [%0] \n" "st1 {v18.s}[3], [%0] \n"
"mov %0, %3 \n" "mov %0, %3 \n"
MEMACCESS(0)
"st1 {v17.s}[0], [%0], %7 \n" "st1 {v17.s}[0], [%0], %7 \n"
MEMACCESS(0)
"st1 {v17.s}[1], [%0], %7 \n" "st1 {v17.s}[1], [%0], %7 \n"
MEMACCESS(0)
"st1 {v17.s}[2], [%0], %7 \n" "st1 {v17.s}[2], [%0], %7 \n"
MEMACCESS(0)
"st1 {v17.s}[3], [%0], %7 \n" "st1 {v17.s}[3], [%0], %7 \n"
"add %0, %3, #4 \n" "add %0, %3, #4 \n"
MEMACCESS(0)
"st1 {v19.s}[0], [%0], %7 \n" "st1 {v19.s}[0], [%0], %7 \n"
MEMACCESS(0)
"st1 {v19.s}[1], [%0], %7 \n" "st1 {v19.s}[1], [%0], %7 \n"
MEMACCESS(0)
"st1 {v19.s}[2], [%0], %7 \n" "st1 {v19.s}[2], [%0], %7 \n"
MEMACCESS(0)
"st1 {v19.s}[3], [%0] \n" "st1 {v19.s}[3], [%0] \n"
"add %1, %1, #8 \n" // src += 4 * 2 "add %1, %1, #8 \n" // src += 4 * 2
"add %2, %2, %6, lsl #2 \n" // dst_a += 4 * dst_stride_a "add %2, %2, %6, lsl #2 \n" // dst_a += 4 * dst_stride_a
"add %3, %3, %7, lsl #2 \n" // dst_b += 4 * dst_stride_b "add %3, %3, %7, lsl #2 \n" // dst_b += 4 * dst_stride_b
"subs %w4, %w4, #4 \n" // w -= 4 "subs %w4, %w4, #4 \n" // w -= 4
"b.eq 4f \n" "b.eq 4f \n"
// some residual, check to see if it includes a 2x8 block, // some residual, check to see if it includes a 2x8 block,
// or less // or less
"cmp %w4, #2 \n" "cmp %w4, #2 \n"
"b.lt 3f \n" "b.lt 3f \n"
// 2x8 block // 2x8 block
"2: \n" "2: \n"
"mov %0, %1 \n" "mov %0, %1 \n"
MEMACCESS(0)
"ld2 {v0.h, v1.h}[0], [%0], %5 \n" "ld2 {v0.h, v1.h}[0], [%0], %5 \n"
MEMACCESS(0)
"ld2 {v2.h, v3.h}[0], [%0], %5 \n" "ld2 {v2.h, v3.h}[0], [%0], %5 \n"
MEMACCESS(0)
"ld2 {v0.h, v1.h}[1], [%0], %5 \n" "ld2 {v0.h, v1.h}[1], [%0], %5 \n"
MEMACCESS(0)
"ld2 {v2.h, v3.h}[1], [%0], %5 \n" "ld2 {v2.h, v3.h}[1], [%0], %5 \n"
MEMACCESS(0)
"ld2 {v0.h, v1.h}[2], [%0], %5 \n" "ld2 {v0.h, v1.h}[2], [%0], %5 \n"
MEMACCESS(0)
"ld2 {v2.h, v3.h}[2], [%0], %5 \n" "ld2 {v2.h, v3.h}[2], [%0], %5 \n"
MEMACCESS(0)
"ld2 {v0.h, v1.h}[3], [%0], %5 \n" "ld2 {v0.h, v1.h}[3], [%0], %5 \n"
MEMACCESS(0)
"ld2 {v2.h, v3.h}[3], [%0] \n" "ld2 {v2.h, v3.h}[3], [%0] \n"
"trn1 v4.8b, v0.8b, v2.8b \n" "trn1 v4.8b, v0.8b, v2.8b \n"
...@@ -480,46 +371,32 @@ void TransposeUVWx8_NEON(const uint8* src, ...@@ -480,46 +371,32 @@ void TransposeUVWx8_NEON(const uint8* src,
"mov %0, %2 \n" "mov %0, %2 \n"
MEMACCESS(0)
"st1 {v4.d}[0], [%0], %6 \n" "st1 {v4.d}[0], [%0], %6 \n"
MEMACCESS(0)
"st1 {v6.d}[0], [%0] \n" "st1 {v6.d}[0], [%0] \n"
"mov %0, %3 \n" "mov %0, %3 \n"
MEMACCESS(0)
"st1 {v5.d}[0], [%0], %7 \n" "st1 {v5.d}[0], [%0], %7 \n"
MEMACCESS(0)
"st1 {v7.d}[0], [%0] \n" "st1 {v7.d}[0], [%0] \n"
"add %1, %1, #4 \n" // src += 2 * 2 "add %1, %1, #4 \n" // src += 2 * 2
"add %2, %2, %6, lsl #1 \n" // dst_a += 2 * dst_stride_a "add %2, %2, %6, lsl #1 \n" // dst_a += 2 * dst_stride_a
"add %3, %3, %7, lsl #1 \n" // dst_b += 2 * dst_stride_b "add %3, %3, %7, lsl #1 \n" // dst_b += 2 * dst_stride_b
"subs %w4, %w4, #2 \n" // w -= 2 "subs %w4, %w4, #2 \n" // w -= 2
"b.eq 4f \n" "b.eq 4f \n"
// 1x8 block // 1x8 block
"3: \n" "3: \n"
MEMACCESS(1)
"ld2 {v0.b, v1.b}[0], [%1], %5 \n" "ld2 {v0.b, v1.b}[0], [%1], %5 \n"
MEMACCESS(1)
"ld2 {v0.b, v1.b}[1], [%1], %5 \n" "ld2 {v0.b, v1.b}[1], [%1], %5 \n"
MEMACCESS(1)
"ld2 {v0.b, v1.b}[2], [%1], %5 \n" "ld2 {v0.b, v1.b}[2], [%1], %5 \n"
MEMACCESS(1)
"ld2 {v0.b, v1.b}[3], [%1], %5 \n" "ld2 {v0.b, v1.b}[3], [%1], %5 \n"
MEMACCESS(1)
"ld2 {v0.b, v1.b}[4], [%1], %5 \n" "ld2 {v0.b, v1.b}[4], [%1], %5 \n"
MEMACCESS(1)
"ld2 {v0.b, v1.b}[5], [%1], %5 \n" "ld2 {v0.b, v1.b}[5], [%1], %5 \n"
MEMACCESS(1)
"ld2 {v0.b, v1.b}[6], [%1], %5 \n" "ld2 {v0.b, v1.b}[6], [%1], %5 \n"
MEMACCESS(1)
"ld2 {v0.b, v1.b}[7], [%1] \n" "ld2 {v0.b, v1.b}[7], [%1] \n"
MEMACCESS(2)
"st1 {v0.d}[0], [%2] \n" "st1 {v0.d}[0], [%2] \n"
MEMACCESS(3)
"st1 {v1.d}[0], [%3] \n" "st1 {v1.d}[0], [%3] \n"
"4: \n" "4: \n"
......
(Two additional source diffs in this change were too large to display.)
...@@ -50,7 +50,7 @@ CANY(ScaleARGBFilterCols_Any_NEON,
 #define SDANY(NAMEANY, SCALEROWDOWN_SIMD, SCALEROWDOWN_C, FACTOR, BPP, MASK) \
   void NAMEANY(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst_ptr, \
                int dst_width) { \
     int r = (int)((unsigned int)dst_width % (MASK + 1)); /* NOLINT */ \
     int n = dst_width - r; \
     if (n > 0) { \
       SCALEROWDOWN_SIMD(src_ptr, src_stride, dst_ptr, n); \
...@@ -65,7 +65,7 @@ CANY(ScaleARGBFilterCols_Any_NEON,
 #define SDODD(NAMEANY, SCALEROWDOWN_SIMD, SCALEROWDOWN_C, FACTOR, BPP, MASK) \
   void NAMEANY(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst_ptr, \
                int dst_width) { \
     int r = (int)((unsigned int)(dst_width - 1) % (MASK + 1)); /* NOLINT */ \
     int n = (dst_width - 1) - r; \
     if (n > 0) { \
       SCALEROWDOWN_SIMD(src_ptr, src_stride, dst_ptr, n); \
......
...@@ -28,21 +28,19 @@ void ScaleRowDown2_NEON(const uint8* src_ptr,
                        uint8* dst,
                        int dst_width) {
  (void)src_stride;
  asm volatile(
      "1: \n"
      // load even pixels into q0, odd into q1
-     MEMACCESS(0)
      "vld2.8 {q0, q1}, [%0]! \n"
      "subs %2, %2, #16 \n"  // 16 processed per loop
-     MEMACCESS(1)
      "vst1.8 {q1}, [%1]! \n"  // store odd pixels
      "bgt 1b \n"
      : "+r"(src_ptr),   // %0
        "+r"(dst),       // %1
        "+r"(dst_width)  // %2
      :
      : "q0", "q1"  // Clobber List
  );
}

// Read 32x1 average down and write 16x1.
...@@ -51,24 +49,24 @@ void ScaleRowDown2Linear_NEON(const uint8* src_ptr,
                              uint8* dst,
                              int dst_width) {
  (void)src_stride;
  asm volatile(
      "1: \n"
-     MEMACCESS(0)
      "vld1.8 {q0, q1}, [%0]! \n"  // load pixels and post inc
      "subs %2, %2, #16 \n"  // 16 processed per loop
      "vpaddl.u8 q0, q0 \n"  // add adjacent
      "vpaddl.u8 q1, q1 \n"
      "vrshrn.u16 d0, q0, #1 \n"  // downshift, round and pack
      "vrshrn.u16 d1, q1, #1 \n"
-     MEMACCESS(1)
      "vst1.8 {q0}, [%1]! \n"
      "bgt 1b \n"
      : "+r"(src_ptr),   // %0
        "+r"(dst),       // %1
        "+r"(dst_width)  // %2
      :
      : "q0", "q1"  // Clobber List
  );
}

// Read 32x2 average down and write 16x1.
...@@ -76,31 +74,30 @@ void ScaleRowDown2Box_NEON(const uint8* src_ptr,
                           ptrdiff_t src_stride,
                           uint8* dst,
                           int dst_width) {
  asm volatile(
      // change the stride to row 2 pointer
      "add %1, %0 \n"
      "1: \n"
-     MEMACCESS(0)
      "vld1.8 {q0, q1}, [%0]! \n"  // load row 1 and post inc
-     MEMACCESS(1)
      "vld1.8 {q2, q3}, [%1]! \n"  // load row 2 and post inc
      "subs %3, %3, #16 \n"  // 16 processed per loop
      "vpaddl.u8 q0, q0 \n"  // row 1 add adjacent
      "vpaddl.u8 q1, q1 \n"
      "vpadal.u8 q0, q2 \n"  // row 2 add adjacent + row1
      "vpadal.u8 q1, q3 \n"
      "vrshrn.u16 d0, q0, #2 \n"  // downshift, round and pack
      "vrshrn.u16 d1, q1, #2 \n"
-     MEMACCESS(2)
      "vst1.8 {q0}, [%2]! \n"
      "bgt 1b \n"
      : "+r"(src_ptr),     // %0
        "+r"(src_stride),  // %1
        "+r"(dst),         // %2
        "+r"(dst_width)    // %3
      :
      : "q0", "q1", "q2", "q3"  // Clobber List
  );
}

void ScaleRowDown4_NEON(const uint8* src_ptr,
...@@ -108,20 +105,17 @@ void ScaleRowDown4_NEON(const uint8* src_ptr,
                        uint8* dst_ptr,
                        int dst_width) {
  (void)src_stride;
  asm volatile(
      "1: \n"
-     MEMACCESS(0)
      "vld4.8 {d0, d1, d2, d3}, [%0]! \n"  // src line 0
      "subs %2, %2, #8 \n"  // 8 processed per loop
-     MEMACCESS(1)
      "vst1.8 {d2}, [%1]! \n"
      "bgt 1b \n"
      : "+r"(src_ptr),   // %0
        "+r"(dst_ptr),   // %1
        "+r"(dst_width)  // %2
      :
      : "q0", "q1", "memory", "cc");
}

void ScaleRowDown4Box_NEON(const uint8* src_ptr,
...@@ -131,36 +125,30 @@ void ScaleRowDown4Box_NEON(const uint8* src_ptr,
  const uint8* src_ptr1 = src_ptr + src_stride;
  const uint8* src_ptr2 = src_ptr + src_stride * 2;
  const uint8* src_ptr3 = src_ptr + src_stride * 3;
  asm volatile(
      "1: \n"
-     MEMACCESS(0)
      "vld1.8 {q0}, [%0]! \n"  // load up 16x4
-     MEMACCESS(3)
      "vld1.8 {q1}, [%3]! \n"
-     MEMACCESS(4)
      "vld1.8 {q2}, [%4]! \n"
-     MEMACCESS(5)
      "vld1.8 {q3}, [%5]! \n"
      "subs %2, %2, #4 \n"
      "vpaddl.u8 q0, q0 \n"
      "vpadal.u8 q0, q1 \n"
      "vpadal.u8 q0, q2 \n"
      "vpadal.u8 q0, q3 \n"
      "vpaddl.u16 q0, q0 \n"
      "vrshrn.u32 d0, q0, #4 \n"  // divide by 16 w/rounding
      "vmovn.u16 d0, q0 \n"
-     MEMACCESS(1)
      "vst1.32 {d0[0]}, [%1]! \n"
      "bgt 1b \n"
      : "+r"(src_ptr),    // %0
        "+r"(dst_ptr),    // %1
        "+r"(dst_width),  // %2
        "+r"(src_ptr1),   // %3
        "+r"(src_ptr2),   // %4
        "+r"(src_ptr3)    // %5
      :
      : "q0", "q1", "q2", "q3", "memory", "cc");
}

// Down scale from 4 to 3 pixels. Use the neon multilane read/write
...@@ -171,123 +159,113 @@ void ScaleRowDown34_NEON(const uint8* src_ptr, ...@@ -171,123 +159,113 @@ void ScaleRowDown34_NEON(const uint8* src_ptr,
uint8* dst_ptr, uint8* dst_ptr,
int dst_width) { int dst_width) {
(void)src_stride; (void)src_stride;
asm volatile ( asm volatile(
"1: \n" "1: \n"
MEMACCESS(0) "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0 "subs %2, %2, #24 \n"
"subs %2, %2, #24 \n" "vmov d2, d3 \n" // order d0, d1, d2
"vmov d2, d3 \n" // order d0, d1, d2 "vst3.8 {d0, d1, d2}, [%1]! \n"
MEMACCESS(1) "bgt 1b \n"
"vst3.8 {d0, d1, d2}, [%1]! \n" : "+r"(src_ptr), // %0
"bgt 1b \n" "+r"(dst_ptr), // %1
: "+r"(src_ptr), // %0 "+r"(dst_width) // %2
"+r"(dst_ptr), // %1 :
"+r"(dst_width) // %2 : "d0", "d1", "d2", "d3", "memory", "cc");
:
: "d0", "d1", "d2", "d3", "memory", "cc"
);
} }
void ScaleRowDown34_0_Box_NEON(const uint8* src_ptr, void ScaleRowDown34_0_Box_NEON(const uint8* src_ptr,
ptrdiff_t src_stride, ptrdiff_t src_stride,
uint8* dst_ptr, uint8* dst_ptr,
int dst_width) { int dst_width) {
asm volatile ( asm volatile(
"vmov.u8 d24, #3 \n" "vmov.u8 d24, #3 \n"
"add %3, %0 \n" "add %3, %0 \n"
"1: \n" "1: \n"
MEMACCESS(0) "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0 "vld4.8 {d4, d5, d6, d7}, [%3]! \n" // src line 1
MEMACCESS(3) "subs %2, %2, #24 \n"
"vld4.8 {d4, d5, d6, d7}, [%3]! \n" // src line 1
"subs %2, %2, #24 \n" // filter src line 0 with src line 1
// expand chars to shorts to allow for room
// filter src line 0 with src line 1 // when adding lines together
// expand chars to shorts to allow for room "vmovl.u8 q8, d4 \n"
// when adding lines together "vmovl.u8 q9, d5 \n"
"vmovl.u8 q8, d4 \n" "vmovl.u8 q10, d6 \n"
"vmovl.u8 q9, d5 \n" "vmovl.u8 q11, d7 \n"
"vmovl.u8 q10, d6 \n"
"vmovl.u8 q11, d7 \n" // 3 * line_0 + line_1
"vmlal.u8 q8, d0, d24 \n"
// 3 * line_0 + line_1 "vmlal.u8 q9, d1, d24 \n"
"vmlal.u8 q8, d0, d24 \n" "vmlal.u8 q10, d2, d24 \n"
"vmlal.u8 q9, d1, d24 \n" "vmlal.u8 q11, d3, d24 \n"
"vmlal.u8 q10, d2, d24 \n"
"vmlal.u8 q11, d3, d24 \n" // (3 * line_0 + line_1) >> 2
"vqrshrn.u16 d0, q8, #2 \n"
// (3 * line_0 + line_1) >> 2 "vqrshrn.u16 d1, q9, #2 \n"
"vqrshrn.u16 d0, q8, #2 \n" "vqrshrn.u16 d2, q10, #2 \n"
"vqrshrn.u16 d1, q9, #2 \n" "vqrshrn.u16 d3, q11, #2 \n"
"vqrshrn.u16 d2, q10, #2 \n"
"vqrshrn.u16 d3, q11, #2 \n" // a0 = (src[0] * 3 + s[1] * 1) >> 2
"vmovl.u8 q8, d1 \n"
// a0 = (src[0] * 3 + s[1] * 1) >> 2 "vmlal.u8 q8, d0, d24 \n"
"vmovl.u8 q8, d1 \n" "vqrshrn.u16 d0, q8, #2 \n"
"vmlal.u8 q8, d0, d24 \n"
"vqrshrn.u16 d0, q8, #2 \n" // a1 = (src[1] * 1 + s[2] * 1) >> 1
"vrhadd.u8 d1, d1, d2 \n"
// a1 = (src[1] * 1 + s[2] * 1) >> 1
"vrhadd.u8 d1, d1, d2 \n" // a2 = (src[2] * 1 + s[3] * 3) >> 2
"vmovl.u8 q8, d2 \n"
// a2 = (src[2] * 1 + s[3] * 3) >> 2 "vmlal.u8 q8, d3, d24 \n"
"vmovl.u8 q8, d2 \n" "vqrshrn.u16 d2, q8, #2 \n"
"vmlal.u8 q8, d3, d24 \n"
"vqrshrn.u16 d2, q8, #2 \n" "vst3.8 {d0, d1, d2}, [%1]! \n"
MEMACCESS(1) "bgt 1b \n"
"vst3.8 {d0, d1, d2}, [%1]! \n" : "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"bgt 1b \n" "+r"(dst_width), // %2
: "+r"(src_ptr), // %0 "+r"(src_stride) // %3
"+r"(dst_ptr), // %1 :
"+r"(dst_width), // %2 : "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "d24", "memory",
"+r"(src_stride) // %3 "cc");
:
: "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "d24", "memory", "cc"
);
} }
void ScaleRowDown34_1_Box_NEON(const uint8* src_ptr, void ScaleRowDown34_1_Box_NEON(const uint8* src_ptr,
ptrdiff_t src_stride, ptrdiff_t src_stride,
uint8* dst_ptr, uint8* dst_ptr,
int dst_width) { int dst_width) {
asm volatile ( asm volatile(
"vmov.u8 d24, #3 \n" "vmov.u8 d24, #3 \n"
"add %3, %0 \n" "add %3, %0 \n"
"1: \n" "1: \n"
MEMACCESS(0) "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0 "vld4.8 {d4, d5, d6, d7}, [%3]! \n" // src line 1
MEMACCESS(3) "subs %2, %2, #24 \n"
"vld4.8 {d4, d5, d6, d7}, [%3]! \n" // src line 1 // average src line 0 with src line 1
"subs %2, %2, #24 \n" "vrhadd.u8 q0, q0, q2 \n"
// average src line 0 with src line 1 "vrhadd.u8 q1, q1, q3 \n"
"vrhadd.u8 q0, q0, q2 \n"
"vrhadd.u8 q1, q1, q3 \n" // a0 = (src[0] * 3 + s[1] * 1) >> 2
"vmovl.u8 q3, d1 \n"
// a0 = (src[0] * 3 + s[1] * 1) >> 2 "vmlal.u8 q3, d0, d24 \n"
"vmovl.u8 q3, d1 \n" "vqrshrn.u16 d0, q3, #2 \n"
"vmlal.u8 q3, d0, d24 \n"
"vqrshrn.u16 d0, q3, #2 \n" // a1 = (src[1] * 1 + s[2] * 1) >> 1
"vrhadd.u8 d1, d1, d2 \n"
// a1 = (src[1] * 1 + s[2] * 1) >> 1
"vrhadd.u8 d1, d1, d2 \n" // a2 = (src[2] * 1 + s[3] * 3) >> 2
"vmovl.u8 q3, d2 \n"
// a2 = (src[2] * 1 + s[3] * 3) >> 2 "vmlal.u8 q3, d3, d24 \n"
"vmovl.u8 q3, d2 \n" "vqrshrn.u16 d2, q3, #2 \n"
"vmlal.u8 q3, d3, d24 \n"
"vqrshrn.u16 d2, q3, #2 \n" "vst3.8 {d0, d1, d2}, [%1]! \n"
"bgt 1b \n"
MEMACCESS(1) : "+r"(src_ptr), // %0
"vst3.8 {d0, d1, d2}, [%1]! \n" "+r"(dst_ptr), // %1
"bgt 1b \n" "+r"(dst_width), // %2
: "+r"(src_ptr), // %0 "+r"(src_stride) // %3
"+r"(dst_ptr), // %1 :
"+r"(dst_width), // %2 : "r4", "q0", "q1", "q2", "q3", "d24", "memory", "cc");
"+r"(src_stride) // %3
:
: "r4", "q0", "q1", "q2", "q3", "d24", "memory", "cc"
);
} }
#define HAS_SCALEROWDOWN38_NEON
...@@ -305,26 +283,21 @@ void ScaleRowDown38_NEON(const uint8* src_ptr,
                         uint8* dst_ptr,
                         int dst_width) {
  (void)src_stride;
  asm volatile(
-     MEMACCESS(3)
      "vld1.8 {q3}, [%3] \n"
      "1: \n"
-     MEMACCESS(0)
      "vld1.8 {d0, d1, d2, d3}, [%0]! \n"
      "subs %2, %2, #12 \n"
      "vtbl.u8 d4, {d0, d1, d2, d3}, d6 \n"
      "vtbl.u8 d5, {d0, d1, d2, d3}, d7 \n"
-     MEMACCESS(1)
      "vst1.8 {d4}, [%1]! \n"
-     MEMACCESS(1)
      "vst1.32 {d5[0]}, [%1]! \n"
      "bgt 1b \n"
      : "+r"(src_ptr),   // %0
        "+r"(dst_ptr),   // %1
        "+r"(dst_width)  // %2
      : "r"(&kShuf38)    // %3
      : "d0", "d1", "d2", "d3", "d4", "d5", "memory", "cc");
}

// 32x3 -> 12x1
...@@ -334,117 +307,109 @@ void OMITFP ScaleRowDown38_3_Box_NEON(const uint8* src_ptr, ...@@ -334,117 +307,109 @@ void OMITFP ScaleRowDown38_3_Box_NEON(const uint8* src_ptr,
int dst_width) { int dst_width) {
const uint8* src_ptr1 = src_ptr + src_stride * 2; const uint8* src_ptr1 = src_ptr + src_stride * 2;
asm volatile ( asm volatile(
MEMACCESS(5) "vld1.16 {q13}, [%5] \n"
"vld1.16 {q13}, [%5] \n" "vld1.8 {q14}, [%6] \n"
MEMACCESS(6) "vld1.8 {q15}, [%7] \n"
"vld1.8 {q14}, [%6] \n" "add %3, %0 \n"
MEMACCESS(7) "1: \n"
"vld1.8 {q15}, [%7] \n"
"add %3, %0 \n" // d0 = 00 40 01 41 02 42 03 43
"1: \n" // d1 = 10 50 11 51 12 52 13 53
// d2 = 20 60 21 61 22 62 23 63
// d0 = 00 40 01 41 02 42 03 43 // d3 = 30 70 31 71 32 72 33 73
// d1 = 10 50 11 51 12 52 13 53 "vld4.8 {d0, d1, d2, d3}, [%0]! \n"
// d2 = 20 60 21 61 22 62 23 63 "vld4.8 {d4, d5, d6, d7}, [%3]! \n"
// d3 = 30 70 31 71 32 72 33 73 "vld4.8 {d16, d17, d18, d19}, [%4]! \n"
MEMACCESS(0) "subs %2, %2, #12 \n"
"vld4.8 {d0, d1, d2, d3}, [%0]! \n"
MEMACCESS(3) // Shuffle the input data around to get align the data
"vld4.8 {d4, d5, d6, d7}, [%3]! \n" // so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7
MEMACCESS(4) // d0 = 00 10 01 11 02 12 03 13
"vld4.8 {d16, d17, d18, d19}, [%4]! \n" // d1 = 40 50 41 51 42 52 43 53
"subs %2, %2, #12 \n" "vtrn.u8 d0, d1 \n"
"vtrn.u8 d4, d5 \n"
// Shuffle the input data around to get align the data "vtrn.u8 d16, d17 \n"
// so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7
// d0 = 00 10 01 11 02 12 03 13 // d2 = 20 30 21 31 22 32 23 33
// d1 = 40 50 41 51 42 52 43 53 // d3 = 60 70 61 71 62 72 63 73
"vtrn.u8 d0, d1 \n" "vtrn.u8 d2, d3 \n"
"vtrn.u8 d4, d5 \n" "vtrn.u8 d6, d7 \n"
"vtrn.u8 d16, d17 \n" "vtrn.u8 d18, d19 \n"
// d2 = 20 30 21 31 22 32 23 33 // d0 = 00+10 01+11 02+12 03+13
// d3 = 60 70 61 71 62 72 63 73 // d2 = 40+50 41+51 42+52 43+53
"vtrn.u8 d2, d3 \n" "vpaddl.u8 q0, q0 \n"
"vtrn.u8 d6, d7 \n" "vpaddl.u8 q2, q2 \n"
"vtrn.u8 d18, d19 \n" "vpaddl.u8 q8, q8 \n"
// d0 = 00+10 01+11 02+12 03+13 // d3 = 60+70 61+71 62+72 63+73
// d2 = 40+50 41+51 42+52 43+53 "vpaddl.u8 d3, d3 \n"
"vpaddl.u8 q0, q0 \n" "vpaddl.u8 d7, d7 \n"
"vpaddl.u8 q2, q2 \n" "vpaddl.u8 d19, d19 \n"
"vpaddl.u8 q8, q8 \n"
// combine source lines
// d3 = 60+70 61+71 62+72 63+73 "vadd.u16 q0, q2 \n"
"vpaddl.u8 d3, d3 \n" "vadd.u16 q0, q8 \n"
"vpaddl.u8 d7, d7 \n" "vadd.u16 d4, d3, d7 \n"
"vpaddl.u8 d19, d19 \n" "vadd.u16 d4, d19 \n"
// combine source lines // dst_ptr[3] = (s[6 + st * 0] + s[7 + st * 0]
"vadd.u16 q0, q2 \n" // + s[6 + st * 1] + s[7 + st * 1]
"vadd.u16 q0, q8 \n" // + s[6 + st * 2] + s[7 + st * 2]) / 6
"vadd.u16 d4, d3, d7 \n" "vqrdmulh.s16 q2, q2, q13 \n"
"vadd.u16 d4, d19 \n" "vmovn.u16 d4, q2 \n"
// dst_ptr[3] = (s[6 + st * 0] + s[7 + st * 0] // Shuffle 2,3 reg around so that 2 can be added to the
// + s[6 + st * 1] + s[7 + st * 1] // 0,1 reg and 3 can be added to the 4,5 reg. This
// + s[6 + st * 2] + s[7 + st * 2]) / 6 // requires expanding from u8 to u16 as the 0,1 and 4,5
"vqrdmulh.s16 q2, q2, q13 \n" // registers are already expanded. Then do transposes
"vmovn.u16 d4, q2 \n" // to get aligned.
// q2 = xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33
// Shuffle 2,3 reg around so that 2 can be added to the "vmovl.u8 q1, d2 \n"
// 0,1 reg and 3 can be added to the 4,5 reg. This "vmovl.u8 q3, d6 \n"
// requires expanding from u8 to u16 as the 0,1 and 4,5 "vmovl.u8 q9, d18 \n"
// registers are already expanded. Then do transposes
// to get aligned. // combine source lines
// q2 = xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33 "vadd.u16 q1, q3 \n"
"vmovl.u8 q1, d2 \n" "vadd.u16 q1, q9 \n"
"vmovl.u8 q3, d6 \n"
"vmovl.u8 q9, d18 \n" // d4 = xx 20 xx 30 xx 22 xx 32
// d5 = xx 21 xx 31 xx 23 xx 33
// combine source lines "vtrn.u32 d2, d3 \n"
"vadd.u16 q1, q3 \n"
"vadd.u16 q1, q9 \n" // d4 = xx 20 xx 21 xx 22 xx 23
// d5 = xx 30 xx 31 xx 32 xx 33
// d4 = xx 20 xx 30 xx 22 xx 32 "vtrn.u16 d2, d3 \n"
// d5 = xx 21 xx 31 xx 23 xx 33
"vtrn.u32 d2, d3 \n" // 0+1+2, 3+4+5
"vadd.u16 q0, q1 \n"
// d4 = xx 20 xx 21 xx 22 xx 23
// d5 = xx 30 xx 31 xx 32 xx 33 // Need to divide, but can't downshift as the the value
"vtrn.u16 d2, d3 \n" // isn't a power of 2. So multiply by 65536 / n
// and take the upper 16 bits.
// 0+1+2, 3+4+5 "vqrdmulh.s16 q0, q0, q15 \n"
"vadd.u16 q0, q1 \n"
// Align for table lookup, vtbl requires registers to
// Need to divide, but can't downshift as the the value // be adjacent
// isn't a power of 2. So multiply by 65536 / n "vmov.u8 d2, d4 \n"
// and take the upper 16 bits.
"vqrdmulh.s16 q0, q0, q15 \n" "vtbl.u8 d3, {d0, d1, d2}, d28 \n"
"vtbl.u8 d4, {d0, d1, d2}, d29 \n"
// Align for table lookup, vtbl requires registers to
// be adjacent "vst1.8 {d3}, [%1]! \n"
"vmov.u8 d2, d4 \n" "vst1.32 {d4[0]}, [%1]! \n"
"bgt 1b \n"
"vtbl.u8 d3, {d0, d1, d2}, d28 \n" : "+r"(src_ptr), // %0
"vtbl.u8 d4, {d0, d1, d2}, d29 \n" "+r"(dst_ptr), // %1
"+r"(dst_width), // %2
MEMACCESS(1) "+r"(src_stride), // %3
"vst1.8 {d3}, [%1]! \n" "+r"(src_ptr1) // %4
MEMACCESS(1) : "r"(&kMult38_Div6), // %5
"vst1.32 {d4[0]}, [%1]! \n" "r"(&kShuf38_2), // %6
"bgt 1b \n" "r"(&kMult38_Div9) // %7
: "+r"(src_ptr), // %0 : "q0", "q1", "q2", "q3", "q8", "q9", "q13", "q14", "q15", "memory",
"+r"(dst_ptr), // %1 "cc");
"+r"(dst_width), // %2
"+r"(src_stride), // %3
"+r"(src_ptr1) // %4
: "r"(&kMult38_Div6), // %5
"r"(&kShuf38_2), // %6
"r"(&kMult38_Div9) // %7
: "q0", "q1", "q2", "q3", "q8", "q9", "q13", "q14", "q15", "memory", "cc"
);
} }
// 32x2 -> 12x1 // 32x2 -> 12x1
...@@ -452,100 +417,93 @@ void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr, ...@@ -452,100 +417,93 @@ void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr,
ptrdiff_t src_stride, ptrdiff_t src_stride,
uint8* dst_ptr, uint8* dst_ptr,
int dst_width) { int dst_width) {
  asm volatile (
    MEMACCESS(4)
    "vld1.16 {q13}, [%4] \n"
    MEMACCESS(5)
    "vld1.8 {q14}, [%5] \n"
    "add %3, %0 \n"
  "1: \n"

    // d0 = 00 40 01 41 02 42 03 43
    // d1 = 10 50 11 51 12 52 13 53
    // d2 = 20 60 21 61 22 62 23 63
    // d3 = 30 70 31 71 32 72 33 73
    MEMACCESS(0)
    "vld4.8 {d0, d1, d2, d3}, [%0]! \n"
    MEMACCESS(3)
    "vld4.8 {d4, d5, d6, d7}, [%3]! \n"
    "subs %2, %2, #12 \n"

    // Shuffle the input data around to get align the data
    //  so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7
    // d0 = 00 10 01 11 02 12 03 13
    // d1 = 40 50 41 51 42 52 43 53
    "vtrn.u8 d0, d1 \n"
    "vtrn.u8 d4, d5 \n"

    // d2 = 20 30 21 31 22 32 23 33
    // d3 = 60 70 61 71 62 72 63 73
    "vtrn.u8 d2, d3 \n"
    "vtrn.u8 d6, d7 \n"

    // d0 = 00+10 01+11 02+12 03+13
    // d2 = 40+50 41+51 42+52 43+53
    "vpaddl.u8 q0, q0 \n"
    "vpaddl.u8 q2, q2 \n"

    // d3 = 60+70 61+71 62+72 63+73
    "vpaddl.u8 d3, d3 \n"
    "vpaddl.u8 d7, d7 \n"

    // combine source lines
    "vadd.u16 q0, q2 \n"
    "vadd.u16 d4, d3, d7 \n"

    // dst_ptr[3] = (s[6] + s[7] + s[6+st] + s[7+st]) / 4
    "vqrshrn.u16 d4, q2, #2 \n"

    // Shuffle 2,3 reg around so that 2 can be added to the
    //  0,1 reg and 3 can be added to the 4,5 reg. This
    //  requires expanding from u8 to u16 as the 0,1 and 4,5
    //  registers are already expanded. Then do transposes
    //  to get aligned.
    // q2 = xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33
    "vmovl.u8 q1, d2 \n"
    "vmovl.u8 q3, d6 \n"

    // combine source lines
    "vadd.u16 q1, q3 \n"

    // d4 = xx 20 xx 30 xx 22 xx 32
    // d5 = xx 21 xx 31 xx 23 xx 33
    "vtrn.u32 d2, d3 \n"

    // d4 = xx 20 xx 21 xx 22 xx 23
    // d5 = xx 30 xx 31 xx 32 xx 33
    "vtrn.u16 d2, d3 \n"

    // 0+1+2, 3+4+5
    "vadd.u16 q0, q1 \n"

    // Need to divide, but can't downshift as the value
    // isn't a power of 2. So multiply by 65536 / n
    // and take the upper 16 bits.
    "vqrdmulh.s16 q0, q0, q13 \n"

    // Align for table lookup, vtbl requires registers to
    // be adjacent
    "vmov.u8 d2, d4 \n"

    "vtbl.u8 d3, {d0, d1, d2}, d28 \n"
    "vtbl.u8 d4, {d0, d1, d2}, d29 \n"

    MEMACCESS(1)
    "vst1.8 {d3}, [%1]! \n"
    MEMACCESS(1)
    "vst1.32 {d4[0]}, [%1]! \n"
    "bgt 1b \n"
  : "+r"(src_ptr),       // %0
    "+r"(dst_ptr),       // %1
    "+r"(dst_width),     // %2
    "+r"(src_stride)     // %3
  : "r"(&kMult38_Div6),  // %4
    "r"(&kShuf38_2)      // %5
  : "q0", "q1", "q2", "q3", "q13", "q14", "memory", "cc"
  );
}
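As a reading aid, a plain-C sketch of what this 32x2 -> 12x1 loop produces per group of 8 adjacent source bytes (the NEON loop handles four such groups at once). The helper name is made up and the arithmetic is modeled on the library's C reference path, not taken from this change:

```c
#include <stdint.h>

/* s = current row, t = row below; dst[0..1] average 3x2 blocks,
   dst[2] averages the remaining 2x2 block. */
static void ScaleRowDown38_2_Box_Sketch(const uint8_t* s, const uint8_t* t,
                                        uint8_t* dst, int dst_width) {
  int i;
  for (i = 0; i < dst_width; i += 3) {
    dst[0] = (uint8_t)((s[0] + s[1] + s[2] + t[0] + t[1] + t[2]) *
                       (65536 / 6) >> 16);
    dst[1] = (uint8_t)((s[3] + s[4] + s[5] + t[3] + t[4] + t[5]) *
                       (65536 / 6) >> 16);
    dst[2] = (uint8_t)((s[6] + s[7] + t[6] + t[7]) * (65536 / 4) >> 16);
    s += 8;
    t += 8;
    dst += 3;
  }
}
```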
void ScaleAddRows_NEON(const uint8* src_ptr, void ScaleAddRows_NEON(const uint8* src_ptr,
...@@ -554,34 +512,32 @@ void ScaleAddRows_NEON(const uint8* src_ptr, ...@@ -554,34 +512,32 @@ void ScaleAddRows_NEON(const uint8* src_ptr,
int src_width, int src_width,
int src_height) { int src_height) {
const uint8* src_tmp; const uint8* src_tmp;
  asm volatile (
  "1: \n"
    "mov %0, %1 \n"
    "mov r12, %5 \n"
    "veor q2, q2, q2 \n"
    "veor q3, q3, q3 \n"
  "2: \n"
    // load 16 pixels into q0
    MEMACCESS(0)
    "vld1.8 {q0}, [%0], %3 \n"
    "vaddw.u8 q3, q3, d1 \n"
    "vaddw.u8 q2, q2, d0 \n"
    "subs r12, r12, #1 \n"
    "bgt 2b \n"
    MEMACCESS(2)
    "vst1.16 {q2, q3}, [%2]! \n"  // store pixels
    "add %1, %1, #16 \n"
    "subs %4, %4, #16 \n"  // 16 processed per loop
    "bgt 1b \n"
  : "=&r"(src_tmp),    // %0
    "+r"(src_ptr),     // %1
    "+r"(dst_ptr),     // %2
    "+r"(src_stride),  // %3
    "+r"(src_width),   // %4
    "+r"(src_height)   // %5
  :
  : "memory", "cc", "r12", "q0", "q1", "q2", "q3"  // Clobber List
  );
}
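The loop above sums a column of `src_height` source rows into 16-bit totals, 16 columns at a time. A scalar equivalent, one column at a time (hypothetical helper name):

```c
#include <stddef.h>
#include <stdint.h>

/* Sum src_height rows of src_width bytes into 16-bit column sums. */
static void ScaleAddRows_Sketch(const uint8_t* src_ptr, ptrdiff_t src_stride,
                                uint16_t* dst_ptr, int src_width,
                                int src_height) {
  int x, y;
  for (x = 0; x < src_width; ++x) {
    const uint8_t* s = src_ptr + x;
    uint16_t sum = 0;
    for (y = 0; y < src_height; ++y) {
      sum = (uint16_t)(sum + *s);
      s += src_stride;  /* step down one row in the same column */
    }
    dst_ptr[x] = sum;
  }
}
```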
// clang-format off // clang-format off
...@@ -591,7 +547,6 @@ void ScaleAddRows_NEON(const uint8* src_ptr, ...@@ -591,7 +547,6 @@ void ScaleAddRows_NEON(const uint8* src_ptr,
"lsr %5, %3, #16 \n" \ "lsr %5, %3, #16 \n" \
"add %6, %1, %5 \n" \ "add %6, %1, %5 \n" \
"add %3, %3, %4 \n" \ "add %3, %3, %4 \n" \
MEMACCESS(6) \
"vld2.8 {d6["#n"], d7["#n"]}, [%6] \n" "vld2.8 {d6["#n"], d7["#n"]}, [%6] \n"
// clang-format on // clang-format on
...@@ -643,7 +598,6 @@ void ScaleFilterCols_NEON(uint8* dst_ptr, ...@@ -643,7 +598,6 @@ void ScaleFilterCols_NEON(uint8* dst_ptr,
"vadd.s16 q8, q8, q9 \n" "vadd.s16 q8, q8, q9 \n"
"vmovn.s16 d6, q8 \n" "vmovn.s16 d6, q8 \n"
MEMACCESS(0)
"vst1.8 {d6}, [%0]! \n" // store pixels "vst1.8 {d6}, [%0]! \n" // store pixels
"vadd.s32 q1, q1, q0 \n" "vadd.s32 q1, q1, q0 \n"
"vadd.s32 q2, q2, q0 \n" "vadd.s32 q2, q2, q0 \n"
...@@ -670,99 +624,83 @@ void ScaleFilterRows_NEON(uint8* dst_ptr, ...@@ -670,99 +624,83 @@ void ScaleFilterRows_NEON(uint8* dst_ptr,
ptrdiff_t src_stride, ptrdiff_t src_stride,
int dst_width, int dst_width,
int source_y_fraction) { int source_y_fraction) {
  asm volatile (
    "cmp %4, #0 \n"
    "beq 100f \n"
    "add %2, %1 \n"
    "cmp %4, #64 \n"
    "beq 75f \n"
    "cmp %4, #128 \n"
    "beq 50f \n"
    "cmp %4, #192 \n"
    "beq 25f \n"

    "vdup.8 d5, %4 \n"
    "rsb %4, #256 \n"
    "vdup.8 d4, %4 \n"

    // General purpose row blend.
  "1: \n"
    MEMACCESS(1)
    "vld1.8 {q0}, [%1]! \n"
    MEMACCESS(2)
    "vld1.8 {q1}, [%2]! \n"
    "subs %3, %3, #16 \n"
    "vmull.u8 q13, d0, d4 \n"
    "vmull.u8 q14, d1, d4 \n"
    "vmlal.u8 q13, d2, d5 \n"
    "vmlal.u8 q14, d3, d5 \n"
    "vrshrn.u16 d0, q13, #8 \n"
    "vrshrn.u16 d1, q14, #8 \n"
    MEMACCESS(0)
    "vst1.8 {q0}, [%0]! \n"
    "bgt 1b \n"
    "b 99f \n"

    // Blend 25 / 75.
  "25: \n"
    MEMACCESS(1)
    "vld1.8 {q0}, [%1]! \n"
    MEMACCESS(2)
    "vld1.8 {q1}, [%2]! \n"
    "subs %3, %3, #16 \n"
    "vrhadd.u8 q0, q1 \n"
    "vrhadd.u8 q0, q1 \n"
    MEMACCESS(0)
    "vst1.8 {q0}, [%0]! \n"
    "bgt 25b \n"
    "b 99f \n"

    // Blend 50 / 50.
  "50: \n"
    MEMACCESS(1)
    "vld1.8 {q0}, [%1]! \n"
    MEMACCESS(2)
    "vld1.8 {q1}, [%2]! \n"
    "subs %3, %3, #16 \n"
    "vrhadd.u8 q0, q1 \n"
    MEMACCESS(0)
    "vst1.8 {q0}, [%0]! \n"
    "bgt 50b \n"
    "b 99f \n"

    // Blend 75 / 25.
  "75: \n"
    MEMACCESS(1)
    "vld1.8 {q1}, [%1]! \n"
    MEMACCESS(2)
    "vld1.8 {q0}, [%2]! \n"
    "subs %3, %3, #16 \n"
    "vrhadd.u8 q0, q1 \n"
    "vrhadd.u8 q0, q1 \n"
    MEMACCESS(0)
    "vst1.8 {q0}, [%0]! \n"
    "bgt 75b \n"
    "b 99f \n"

    // Blend 100 / 0 - Copy row unchanged.
  "100: \n"
    MEMACCESS(1)
    "vld1.8 {q0}, [%1]! \n"
    "subs %3, %3, #16 \n"
    MEMACCESS(0)
    "vst1.8 {q0}, [%0]! \n"
    "bgt 100b \n"

  "99: \n"
    MEMACCESS(0)
    "vst1.8 {d1[7]}, [%0] \n"
  : "+r"(dst_ptr),           // %0
    "+r"(src_ptr),           // %1
    "+r"(src_stride),        // %2
    "+r"(dst_width),         // %3
    "+r"(source_y_fraction)  // %4
  :
  : "q0", "q1", "d4", "d5", "q13", "q14", "memory", "cc"
  );
}
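The general path above computes dst = (row0 * (256 - f) + row1 * f + 128) >> 8, with fast branches for f = 0, 64, 128 and 192. A scalar sketch of the general path (helper name is made up):

```c
#include <stdint.h>

/* f is source_y_fraction in [0,255]; 0 keeps row 0, 128 is a 50/50 average. */
static void ScaleFilterRows_Sketch(uint8_t* dst, const uint8_t* s0,
                                   const uint8_t* s1, int width, int f) {
  const int f1 = 256 - f; /* weight of row 0, cf. "rsb %4, #256" above */
  int x;
  for (x = 0; x < width; ++x) {
    dst[x] = (uint8_t)((s0[x] * f1 + s1[x] * f + 128) >> 8);
  }
}
```

The 25/75 and 75/25 branches approximate the same result with two vrhadd (rounding halving add) steps, and the 50/50 branch with one, so the common fractions need no multiplies.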
void ScaleARGBRowDown2_NEON(const uint8* src_ptr, void ScaleARGBRowDown2_NEON(const uint8* src_ptr,
...@@ -770,25 +708,21 @@ void ScaleARGBRowDown2_NEON(const uint8* src_ptr, ...@@ -770,25 +708,21 @@ void ScaleARGBRowDown2_NEON(const uint8* src_ptr,
uint8* dst, uint8* dst,
int dst_width) { int dst_width) {
(void)src_stride; (void)src_stride;
  asm volatile (
  "1: \n"
    // load even pixels into q0, odd into q1
    MEMACCESS(0)
    "vld2.32 {q0, q1}, [%0]! \n"
    MEMACCESS(0)
    "vld2.32 {q2, q3}, [%0]! \n"
    "subs %2, %2, #8 \n"  // 8 processed per loop
    MEMACCESS(1)
    "vst1.8 {q1}, [%1]! \n"  // store odd pixels
    MEMACCESS(1)
    "vst1.8 {q3}, [%1]! \n"
    "bgt 1b \n"
  : "+r"(src_ptr),   // %0
    "+r"(dst),       // %1
    "+r"(dst_width)  // %2
  :
  : "memory", "cc", "q0", "q1", "q2", "q3"  // Clobber List
  );
}
void ScaleARGBRowDown2Linear_NEON(const uint8* src_argb, void ScaleARGBRowDown2Linear_NEON(const uint8* src_argb,
...@@ -796,71 +730,68 @@ void ScaleARGBRowDown2Linear_NEON(const uint8* src_argb, ...@@ -796,71 +730,68 @@ void ScaleARGBRowDown2Linear_NEON(const uint8* src_argb,
uint8* dst_argb, uint8* dst_argb,
int dst_width) { int dst_width) {
(void)src_stride; (void)src_stride;
  asm volatile (
  "1: \n"
    MEMACCESS(0)
    "vld4.8 {d0, d2, d4, d6}, [%0]! \n"  // load 8 ARGB pixels.
    MEMACCESS(0)
    "vld4.8 {d1, d3, d5, d7}, [%0]! \n"  // load next 8 ARGB pixels.
    "subs %2, %2, #8 \n"  // 8 processed per loop
    "vpaddl.u8 q0, q0 \n"  // B 16 bytes -> 8 shorts.
    "vpaddl.u8 q1, q1 \n"  // G 16 bytes -> 8 shorts.
    "vpaddl.u8 q2, q2 \n"  // R 16 bytes -> 8 shorts.
    "vpaddl.u8 q3, q3 \n"  // A 16 bytes -> 8 shorts.
    "vrshrn.u16 d0, q0, #1 \n"  // downshift, round and pack
    "vrshrn.u16 d1, q1, #1 \n"
    "vrshrn.u16 d2, q2, #1 \n"
    "vrshrn.u16 d3, q3, #1 \n"
    MEMACCESS(1)
    "vst4.8 {d0, d1, d2, d3}, [%1]! \n"
    "bgt 1b \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_argb),  // %1
    "+r"(dst_width)  // %2
  :
  : "memory", "cc", "q0", "q1", "q2", "q3"  // Clobber List
  );
}
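A scalar view of the pairwise-add / round / pack sequence above: each output channel is the rounded average of two horizontally adjacent source pixels (hypothetical helper):

```c
#include <stdint.h>

static void ScaleARGBRowDown2Linear_Sketch(const uint8_t* src, uint8_t* dst,
                                           int dst_width) {
  int x, c;
  for (x = 0; x < dst_width; ++x) {
    for (c = 0; c < 4; ++c) { /* B, G, R, A */
      dst[c] = (uint8_t)((src[c] + src[4 + c] + 1) >> 1);
    }
    src += 8; /* consume two source pixels */
    dst += 4; /* produce one output pixel */
  }
}
```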
void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr, void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr,
ptrdiff_t src_stride, ptrdiff_t src_stride,
uint8* dst, uint8* dst,
int dst_width) { int dst_width) {
  asm volatile (
    // change the stride to row 2 pointer
    "add %1, %1, %0 \n"
  "1: \n"
    MEMACCESS(0)
    "vld4.8 {d0, d2, d4, d6}, [%0]! \n"  // load 8 ARGB pixels.
    MEMACCESS(0)
    "vld4.8 {d1, d3, d5, d7}, [%0]! \n"  // load next 8 ARGB pixels.
    "subs %3, %3, #8 \n"  // 8 processed per loop.
    "vpaddl.u8 q0, q0 \n"  // B 16 bytes -> 8 shorts.
    "vpaddl.u8 q1, q1 \n"  // G 16 bytes -> 8 shorts.
    "vpaddl.u8 q2, q2 \n"  // R 16 bytes -> 8 shorts.
    "vpaddl.u8 q3, q3 \n"  // A 16 bytes -> 8 shorts.
    MEMACCESS(1)
    "vld4.8 {d16, d18, d20, d22}, [%1]! \n"  // load 8 more ARGB pixels.
    MEMACCESS(1)
    "vld4.8 {d17, d19, d21, d23}, [%1]! \n"  // load last 8 ARGB pixels.
    "vpadal.u8 q0, q8 \n"  // B 16 bytes -> 8 shorts.
    "vpadal.u8 q1, q9 \n"  // G 16 bytes -> 8 shorts.
    "vpadal.u8 q2, q10 \n"  // R 16 bytes -> 8 shorts.
    "vpadal.u8 q3, q11 \n"  // A 16 bytes -> 8 shorts.
    "vrshrn.u16 d0, q0, #2 \n"  // downshift, round and pack
    "vrshrn.u16 d1, q1, #2 \n"
    "vrshrn.u16 d2, q2, #2 \n"
    "vrshrn.u16 d3, q3, #2 \n"
    MEMACCESS(2)
    "vst4.8 {d0, d1, d2, d3}, [%2]! \n"
    "bgt 1b \n"
  : "+r"(src_ptr),     // %0
    "+r"(src_stride),  // %1
    "+r"(dst),         // %2
    "+r"(dst_width)    // %3
  :
  : "memory", "cc", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11"
  );
}
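Same idea with the second row folded in via `vpadal`: each output channel becomes a rounded 2x2 box average. A scalar sketch (hypothetical helper):

```c
#include <stdint.h>

/* r0 = current row, r1 = row below; one ARGB output per 2x2 block. */
static void ScaleARGBRowDown2Box_Sketch(const uint8_t* r0, const uint8_t* r1,
                                        uint8_t* dst, int dst_width) {
  int x, c;
  for (x = 0; x < dst_width; ++x) {
    for (c = 0; c < 4; ++c) { /* B, G, R, A */
      dst[c] = (uint8_t)((r0[c] + r0[4 + c] + r1[c] + r1[4 + c] + 2) >> 2);
    }
    r0 += 8;
    r1 += 8;
    dst += 4;
  }
}
```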
// Reads 4 pixels at a time. // Reads 4 pixels at a time.
...@@ -871,27 +802,21 @@ void ScaleARGBRowDownEven_NEON(const uint8* src_argb, ...@@ -871,27 +802,21 @@ void ScaleARGBRowDownEven_NEON(const uint8* src_argb,
uint8* dst_argb, uint8* dst_argb,
int dst_width) { int dst_width) {
(void)src_stride; (void)src_stride;
  asm volatile (
    "mov r12, %3, lsl #2 \n"
  "1: \n"
    MEMACCESS(0)
    "vld1.32 {d0[0]}, [%0], r12 \n"
    MEMACCESS(0)
    "vld1.32 {d0[1]}, [%0], r12 \n"
    MEMACCESS(0)
    "vld1.32 {d1[0]}, [%0], r12 \n"
    MEMACCESS(0)
    "vld1.32 {d1[1]}, [%0], r12 \n"
    "subs %2, %2, #4 \n"  // 4 pixels per loop.
    MEMACCESS(1)
    "vst1.8 {q0}, [%1]! \n"
    "bgt 1b \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_argb),  // %1
    "+r"(dst_width)  // %2
  : "r"(src_stepx)   // %3
  : "memory", "cc", "r12", "q0"
  );
}
// Reads 4 pixels at a time. // Reads 4 pixels at a time.
...@@ -901,47 +826,38 @@ void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb, ...@@ -901,47 +826,38 @@ void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb,
int src_stepx, int src_stepx,
uint8* dst_argb, uint8* dst_argb,
int dst_width) { int dst_width) {
  asm volatile (
    "mov r12, %4, lsl #2 \n"
    "add %1, %1, %0 \n"
  "1: \n"
    MEMACCESS(0)
    "vld1.8 {d0}, [%0], r12 \n"  // Read 4 2x2 blocks -> 2x1
    MEMACCESS(1)
    "vld1.8 {d1}, [%1], r12 \n"
    MEMACCESS(0)
    "vld1.8 {d2}, [%0], r12 \n"
    MEMACCESS(1)
    "vld1.8 {d3}, [%1], r12 \n"
    MEMACCESS(0)
    "vld1.8 {d4}, [%0], r12 \n"
    MEMACCESS(1)
    "vld1.8 {d5}, [%1], r12 \n"
    MEMACCESS(0)
    "vld1.8 {d6}, [%0], r12 \n"
    MEMACCESS(1)
    "vld1.8 {d7}, [%1], r12 \n"
    "vaddl.u8 q0, d0, d1 \n"
    "vaddl.u8 q1, d2, d3 \n"
    "vaddl.u8 q2, d4, d5 \n"
    "vaddl.u8 q3, d6, d7 \n"
    "vswp.8 d1, d2 \n"  // ab_cd -> ac_bd
    "vswp.8 d5, d6 \n"  // ef_gh -> eg_fh
    "vadd.u16 q0, q0, q1 \n"  // (a+b)_(c+d)
    "vadd.u16 q2, q2, q3 \n"  // (e+f)_(g+h)
    "vrshrn.u16 d0, q0, #2 \n"  // first 2 pixels.
    "vrshrn.u16 d1, q2, #2 \n"  // next 2 pixels.
    "subs %3, %3, #4 \n"  // 4 pixels per loop.
    MEMACCESS(2)
    "vst1.8 {q0}, [%2]! \n"
    "bgt 1b \n"
  : "+r"(src_argb),    // %0
    "+r"(src_stride),  // %1
    "+r"(dst_argb),    // %2
    "+r"(dst_width)    // %3
  : "r"(src_stepx)     // %4
  : "memory", "cc", "r12", "q0", "q1", "q2", "q3"
  );
}
// clang-format off // clang-format off
...@@ -951,7 +867,6 @@ void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb, ...@@ -951,7 +867,6 @@ void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb,
"lsr %5, %3, #16 \n" \ "lsr %5, %3, #16 \n" \
"add %6, %1, %5, lsl #2 \n" \ "add %6, %1, %5, lsl #2 \n" \
"add %3, %3, %4 \n" \ "add %3, %3, %4 \n" \
MEMACCESS(6) \
"vld1.32 {" #dn "[" #n "]}, [%6] \n" "vld1.32 {" #dn "[" #n "]}, [%6] \n"
// clang-format on // clang-format on
...@@ -962,31 +877,25 @@ void ScaleARGBCols_NEON(uint8* dst_argb, ...@@ -962,31 +877,25 @@ void ScaleARGBCols_NEON(uint8* dst_argb,
int dx) { int dx) {
int tmp; int tmp;
const uint8* src_tmp = src_argb; const uint8* src_tmp = src_argb;
  asm volatile (
  "1: \n"
    LOAD1_DATA32_LANE(d0, 0)
    LOAD1_DATA32_LANE(d0, 1)
    LOAD1_DATA32_LANE(d1, 0)
    LOAD1_DATA32_LANE(d1, 1)
    LOAD1_DATA32_LANE(d2, 0)
    LOAD1_DATA32_LANE(d2, 1)
    LOAD1_DATA32_LANE(d3, 0)
    LOAD1_DATA32_LANE(d3, 1)

    MEMACCESS(0)
    "vst1.32 {q0, q1}, [%0]! \n"  // store pixels
    "subs %2, %2, #8 \n"  // 8 processed per loop
    "bgt 1b \n"
  : "+r"(dst_argb),   // %0
    "+r"(src_argb),   // %1
    "+r"(dst_width),  // %2
    "+r"(x),          // %3
    "+r"(dx),         // %4
    "=&r"(tmp),       // %5
    "+r"(src_tmp)     // %6
  :
  : "memory", "cc", "q0", "q1"
  );
}
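The LOAD1_DATA32_LANE macro steps a 16.16 fixed-point position x by dx and fetches one 32-bit pixel per lane. A scalar equivalent of that stepping (hypothetical helper):

```c
#include <stdint.h>

/* Nearest-neighbour column selection with a 16.16 fixed-point accumulator. */
static void ScaleARGBCols_Sketch(uint32_t* dst, const uint32_t* src,
                                 int dst_width, int x, int dx) {
  int i;
  for (i = 0; i < dst_width; ++i) {
    dst[i] = src[x >> 16]; /* "lsr %5, %3, #16" takes the integer part */
    x += dx;               /* "add %3, %3, %4" accumulates the step */
  }
}
```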
#undef LOAD1_DATA32_LANE #undef LOAD1_DATA32_LANE
...@@ -998,7 +907,6 @@ void ScaleARGBCols_NEON(uint8* dst_argb, ...@@ -998,7 +907,6 @@ void ScaleARGBCols_NEON(uint8* dst_argb,
"lsr %5, %3, #16 \n" \ "lsr %5, %3, #16 \n" \
"add %6, %1, %5, lsl #2 \n" \ "add %6, %1, %5, lsl #2 \n" \
"add %3, %3, %4 \n" \ "add %3, %3, %4 \n" \
MEMACCESS(6) \
"vld2.32 {" #dn1 "[" #n "], " #dn2 "[" #n "]}, [%6] \n" "vld2.32 {" #dn1 "[" #n "], " #dn2 "[" #n "]}, [%6] \n"
// clang-format on // clang-format on
...@@ -1045,7 +953,6 @@ void ScaleARGBFilterCols_NEON(uint8* dst_argb, ...@@ -1045,7 +953,6 @@ void ScaleARGBFilterCols_NEON(uint8* dst_argb,
"vshrn.i16 d0, q11, #7 \n" "vshrn.i16 d0, q11, #7 \n"
"vshrn.i16 d1, q12, #7 \n" "vshrn.i16 d1, q12, #7 \n"
MEMACCESS(0)
"vst1.32 {d0, d1}, [%0]! \n" // store pixels "vst1.32 {d0, d1}, [%0]! \n" // store pixels
"vadd.s32 q8, q8, q9 \n" "vadd.s32 q8, q8, q9 \n"
"subs %2, %2, #4 \n" // 4 processed per loop "subs %2, %2, #4 \n" // 4 processed per loop
......
...@@ -29,10 +29,8 @@ void ScaleRowDown2_NEON(const uint8* src_ptr, ...@@ -29,10 +29,8 @@ void ScaleRowDown2_NEON(const uint8* src_ptr,
asm volatile ( asm volatile (
"1: \n" "1: \n"
// load even pixels into v0, odd into v1 // load even pixels into v0, odd into v1
MEMACCESS(0)
"ld2 {v0.16b,v1.16b}, [%0], #32 \n" "ld2 {v0.16b,v1.16b}, [%0], #32 \n"
"subs %w2, %w2, #16 \n" // 16 processed per loop "subs %w2, %w2, #16 \n" // 16 processed per loop
MEMACCESS(1)
"st1 {v1.16b}, [%1], #16 \n" // store odd pixels "st1 {v1.16b}, [%1], #16 \n" // store odd pixels
"b.gt 1b \n" "b.gt 1b \n"
: "+r"(src_ptr), // %0 : "+r"(src_ptr), // %0
...@@ -51,14 +49,12 @@ void ScaleRowDown2Linear_NEON(const uint8* src_ptr, ...@@ -51,14 +49,12 @@ void ScaleRowDown2Linear_NEON(const uint8* src_ptr,
(void)src_stride; (void)src_stride;
asm volatile ( asm volatile (
"1: \n" "1: \n"
MEMACCESS(0)
"ld1 {v0.16b,v1.16b}, [%0], #32 \n" // load pixels and post inc "ld1 {v0.16b,v1.16b}, [%0], #32 \n" // load pixels and post inc
"subs %w2, %w2, #16 \n" // 16 processed per loop "subs %w2, %w2, #16 \n" // 16 processed per loop
"uaddlp v0.8h, v0.16b \n" // add adjacent "uaddlp v0.8h, v0.16b \n" // add adjacent
"uaddlp v1.8h, v1.16b \n" "uaddlp v1.8h, v1.16b \n"
"rshrn v0.8b, v0.8h, #1 \n" // downshift, round and pack "rshrn v0.8b, v0.8h, #1 \n" // downshift, round and pack
"rshrn2 v0.16b, v1.8h, #1 \n" "rshrn2 v0.16b, v1.8h, #1 \n"
MEMACCESS(1)
"st1 {v0.16b}, [%1], #16 \n" "st1 {v0.16b}, [%1], #16 \n"
"b.gt 1b \n" "b.gt 1b \n"
: "+r"(src_ptr), // %0 : "+r"(src_ptr), // %0
...@@ -78,9 +74,7 @@ void ScaleRowDown2Box_NEON(const uint8* src_ptr, ...@@ -78,9 +74,7 @@ void ScaleRowDown2Box_NEON(const uint8* src_ptr,
// change the stride to row 2 pointer // change the stride to row 2 pointer
"add %1, %1, %0 \n" "add %1, %1, %0 \n"
"1: \n" "1: \n"
MEMACCESS(0) "ld1 {v0.16b, v1.16b}, [%0], #32 \n" // load row 1 and post inc
"ld1 {v0.16b,v1.16b}, [%0], #32 \n" // load row 1 and post inc
MEMACCESS(1)
"ld1 {v2.16b, v3.16b}, [%1], #32 \n" // load row 2 and post inc "ld1 {v2.16b, v3.16b}, [%1], #32 \n" // load row 2 and post inc
"subs %w3, %w3, #16 \n" // 16 processed per loop "subs %w3, %w3, #16 \n" // 16 processed per loop
"uaddlp v0.8h, v0.16b \n" // row 1 add adjacent "uaddlp v0.8h, v0.16b \n" // row 1 add adjacent
...@@ -89,7 +83,6 @@ void ScaleRowDown2Box_NEON(const uint8* src_ptr, ...@@ -89,7 +83,6 @@ void ScaleRowDown2Box_NEON(const uint8* src_ptr,
"uadalp v1.8h, v3.16b \n" "uadalp v1.8h, v3.16b \n"
"rshrn v0.8b, v0.8h, #2 \n" // downshift, round and pack "rshrn v0.8b, v0.8h, #2 \n" // downshift, round and pack
"rshrn2 v0.16b, v1.8h, #2 \n" "rshrn2 v0.16b, v1.8h, #2 \n"
MEMACCESS(2)
"st1 {v0.16b}, [%2], #16 \n" "st1 {v0.16b}, [%2], #16 \n"
"b.gt 1b \n" "b.gt 1b \n"
: "+r"(src_ptr), // %0 : "+r"(src_ptr), // %0
...@@ -108,10 +101,8 @@ void ScaleRowDown4_NEON(const uint8* src_ptr, ...@@ -108,10 +101,8 @@ void ScaleRowDown4_NEON(const uint8* src_ptr,
(void)src_stride; (void)src_stride;
asm volatile ( asm volatile (
"1: \n" "1: \n"
MEMACCESS(0)
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0
"subs %w2, %w2, #8 \n" // 8 processed per loop "subs %w2, %w2, #8 \n" // 8 processed per loop
MEMACCESS(1)
"st1 {v2.8b}, [%1], #8 \n" "st1 {v2.8b}, [%1], #8 \n"
"b.gt 1b \n" "b.gt 1b \n"
: "+r"(src_ptr), // %0 : "+r"(src_ptr), // %0
...@@ -131,13 +122,9 @@ void ScaleRowDown4Box_NEON(const uint8* src_ptr, ...@@ -131,13 +122,9 @@ void ScaleRowDown4Box_NEON(const uint8* src_ptr,
const uint8* src_ptr3 = src_ptr + src_stride * 3; const uint8* src_ptr3 = src_ptr + src_stride * 3;
asm volatile ( asm volatile (
"1: \n" "1: \n"
MEMACCESS(0)
"ld1 {v0.16b}, [%0], #16 \n" // load up 16x4 "ld1 {v0.16b}, [%0], #16 \n" // load up 16x4
MEMACCESS(3)
"ld1 {v1.16b}, [%2], #16 \n" "ld1 {v1.16b}, [%2], #16 \n"
MEMACCESS(4)
"ld1 {v2.16b}, [%3], #16 \n" "ld1 {v2.16b}, [%3], #16 \n"
MEMACCESS(5)
"ld1 {v3.16b}, [%4], #16 \n" "ld1 {v3.16b}, [%4], #16 \n"
"subs %w5, %w5, #4 \n" "subs %w5, %w5, #4 \n"
"uaddlp v0.8h, v0.16b \n" "uaddlp v0.8h, v0.16b \n"
...@@ -146,7 +133,6 @@ void ScaleRowDown4Box_NEON(const uint8* src_ptr, ...@@ -146,7 +133,6 @@ void ScaleRowDown4Box_NEON(const uint8* src_ptr,
"uadalp v0.8h, v3.16b \n" "uadalp v0.8h, v3.16b \n"
"addp v0.8h, v0.8h, v0.8h \n" "addp v0.8h, v0.8h, v0.8h \n"
"rshrn v0.8b, v0.8h, #4 \n" // divide by 16 w/rounding "rshrn v0.8b, v0.8h, #4 \n" // divide by 16 w/rounding
MEMACCESS(1)
"st1 {v0.s}[0], [%1], #4 \n" "st1 {v0.s}[0], [%1], #4 \n"
"b.gt 1b \n" "b.gt 1b \n"
: "+r"(src_ptr), // %0 : "+r"(src_ptr), // %0
...@@ -170,11 +156,9 @@ void ScaleRowDown34_NEON(const uint8* src_ptr, ...@@ -170,11 +156,9 @@ void ScaleRowDown34_NEON(const uint8* src_ptr,
(void)src_stride; (void)src_stride;
asm volatile ( asm volatile (
"1: \n" "1: \n"
MEMACCESS(0)
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0
"subs %w2, %w2, #24 \n" "subs %w2, %w2, #24 \n"
"orr v2.16b, v3.16b, v3.16b \n" // order v0, v1, v2 "orr v2.16b, v3.16b, v3.16b \n" // order v0, v1, v2
MEMACCESS(1)
"st3 {v0.8b,v1.8b,v2.8b}, [%1], #24 \n" "st3 {v0.8b,v1.8b,v2.8b}, [%1], #24 \n"
"b.gt 1b \n" "b.gt 1b \n"
: "+r"(src_ptr), // %0 : "+r"(src_ptr), // %0
...@@ -193,9 +177,7 @@ void ScaleRowDown34_0_Box_NEON(const uint8* src_ptr, ...@@ -193,9 +177,7 @@ void ScaleRowDown34_0_Box_NEON(const uint8* src_ptr,
"movi v20.8b, #3 \n" "movi v20.8b, #3 \n"
"add %3, %3, %0 \n" "add %3, %3, %0 \n"
"1: \n" "1: \n"
MEMACCESS(0)
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0
MEMACCESS(3)
"ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%3], #32 \n" // src line 1 "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%3], #32 \n" // src line 1
"subs %w2, %w2, #24 \n" "subs %w2, %w2, #24 \n"
...@@ -232,8 +214,7 @@ void ScaleRowDown34_0_Box_NEON(const uint8* src_ptr, ...@@ -232,8 +214,7 @@ void ScaleRowDown34_0_Box_NEON(const uint8* src_ptr,
"umlal v16.8h, v3.8b, v20.8b \n" "umlal v16.8h, v3.8b, v20.8b \n"
"uqrshrn v2.8b, v16.8h, #2 \n" "uqrshrn v2.8b, v16.8h, #2 \n"
MEMACCESS(1) "st3 {v0.8b,v1.8b,v2.8b}, [%1], #24 \n"
"st3 {v0.8b,v1.8b,v2.8b}, [%1], #24 \n"
"b.gt 1b \n" "b.gt 1b \n"
: "+r"(src_ptr), // %0 : "+r"(src_ptr), // %0
...@@ -254,10 +235,8 @@ void ScaleRowDown34_1_Box_NEON(const uint8* src_ptr, ...@@ -254,10 +235,8 @@ void ScaleRowDown34_1_Box_NEON(const uint8* src_ptr,
"movi v20.8b, #3 \n" "movi v20.8b, #3 \n"
"add %3, %3, %0 \n" "add %3, %3, %0 \n"
"1: \n" "1: \n"
MEMACCESS(0) "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0 "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%3], #32 \n" // src line 1
MEMACCESS(3)
"ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%3], #32 \n" // src line 1
"subs %w2, %w2, #24 \n" "subs %w2, %w2, #24 \n"
// average src line 0 with src line 1 // average src line 0 with src line 1
"urhadd v0.8b, v0.8b, v4.8b \n" "urhadd v0.8b, v0.8b, v4.8b \n"
...@@ -278,8 +257,7 @@ void ScaleRowDown34_1_Box_NEON(const uint8* src_ptr, ...@@ -278,8 +257,7 @@ void ScaleRowDown34_1_Box_NEON(const uint8* src_ptr,
"umlal v4.8h, v3.8b, v20.8b \n" "umlal v4.8h, v3.8b, v20.8b \n"
"uqrshrn v2.8b, v4.8h, #2 \n" "uqrshrn v2.8b, v4.8h, #2 \n"
MEMACCESS(1) "st3 {v0.8b,v1.8b,v2.8b}, [%1], #24 \n"
"st3 {v0.8b,v1.8b,v2.8b}, [%1], #24 \n"
"b.gt 1b \n" "b.gt 1b \n"
: "+r"(src_ptr), // %0 : "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1 "+r"(dst_ptr), // %1
...@@ -305,16 +283,12 @@ void ScaleRowDown38_NEON(const uint8* src_ptr, ...@@ -305,16 +283,12 @@ void ScaleRowDown38_NEON(const uint8* src_ptr,
int dst_width) { int dst_width) {
(void)src_stride; (void)src_stride;
asm volatile ( asm volatile (
MEMACCESS(3)
"ld1 {v3.16b}, [%3] \n" "ld1 {v3.16b}, [%3] \n"
"1: \n" "1: \n"
MEMACCESS(0)
"ld1 {v0.16b,v1.16b}, [%0], #32 \n" "ld1 {v0.16b,v1.16b}, [%0], #32 \n"
"subs %w2, %w2, #12 \n" "subs %w2, %w2, #12 \n"
"tbl v2.16b, {v0.16b,v1.16b}, v3.16b \n" "tbl v2.16b, {v0.16b,v1.16b}, v3.16b \n"
MEMACCESS(1)
"st1 {v2.8b}, [%1], #8 \n" "st1 {v2.8b}, [%1], #8 \n"
MEMACCESS(1)
"st1 {v2.s}[2], [%1], #4 \n" "st1 {v2.s}[2], [%1], #4 \n"
"b.gt 1b \n" "b.gt 1b \n"
: "+r"(src_ptr), // %0 : "+r"(src_ptr), // %0
...@@ -334,11 +308,8 @@ void OMITFP ScaleRowDown38_3_Box_NEON(const uint8* src_ptr, ...@@ -334,11 +308,8 @@ void OMITFP ScaleRowDown38_3_Box_NEON(const uint8* src_ptr,
ptrdiff_t tmp_src_stride = src_stride; ptrdiff_t tmp_src_stride = src_stride;
asm volatile ( asm volatile (
MEMACCESS(5)
"ld1 {v29.8h}, [%5] \n" "ld1 {v29.8h}, [%5] \n"
MEMACCESS(6)
"ld1 {v30.16b}, [%6] \n" "ld1 {v30.16b}, [%6] \n"
MEMACCESS(7)
"ld1 {v31.8h}, [%7] \n" "ld1 {v31.8h}, [%7] \n"
"add %2, %2, %0 \n" "add %2, %2, %0 \n"
"1: \n" "1: \n"
...@@ -347,12 +318,9 @@ void OMITFP ScaleRowDown38_3_Box_NEON(const uint8* src_ptr, ...@@ -347,12 +318,9 @@ void OMITFP ScaleRowDown38_3_Box_NEON(const uint8* src_ptr,
// 10 50 11 51 12 52 13 53 // 10 50 11 51 12 52 13 53
// 20 60 21 61 22 62 23 63 // 20 60 21 61 22 62 23 63
// 30 70 31 71 32 72 33 73 // 30 70 31 71 32 72 33 73
MEMACCESS(0) "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%2], #32 \n"
MEMACCESS(3) "ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%3], #32 \n"
"ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%2], #32 \n"
MEMACCESS(4)
"ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%3], #32 \n"
"subs %w4, %w4, #12 \n" "subs %w4, %w4, #12 \n"
// Shuffle the input data around to get align the data // Shuffle the input data around to get align the data
...@@ -436,9 +404,7 @@ void OMITFP ScaleRowDown38_3_Box_NEON(const uint8* src_ptr, ...@@ -436,9 +404,7 @@ void OMITFP ScaleRowDown38_3_Box_NEON(const uint8* src_ptr,
// be adjacent // be adjacent
"tbl v3.16b, {v0.16b, v1.16b, v2.16b}, v30.16b \n" "tbl v3.16b, {v0.16b, v1.16b, v2.16b}, v30.16b \n"
MEMACCESS(1)
"st1 {v3.8b}, [%1], #8 \n" "st1 {v3.8b}, [%1], #8 \n"
MEMACCESS(1)
"st1 {v3.s}[2], [%1], #4 \n" "st1 {v3.s}[2], [%1], #4 \n"
"b.gt 1b \n" "b.gt 1b \n"
: "+r"(src_ptr), // %0 : "+r"(src_ptr), // %0
...@@ -463,9 +429,7 @@ void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr, ...@@ -463,9 +429,7 @@ void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr,
// TODO(fbarchard): use src_stride directly for clang 3.5+. // TODO(fbarchard): use src_stride directly for clang 3.5+.
ptrdiff_t tmp_src_stride = src_stride; ptrdiff_t tmp_src_stride = src_stride;
asm volatile ( asm volatile (
MEMACCESS(4)
"ld1 {v30.8h}, [%4] \n" "ld1 {v30.8h}, [%4] \n"
MEMACCESS(5)
"ld1 {v31.16b}, [%5] \n" "ld1 {v31.16b}, [%5] \n"
"add %2, %2, %0 \n" "add %2, %2, %0 \n"
"1: \n" "1: \n"
...@@ -474,10 +438,8 @@ void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr, ...@@ -474,10 +438,8 @@ void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr,
// 10 50 11 51 12 52 13 53 // 10 50 11 51 12 52 13 53
// 20 60 21 61 22 62 23 63 // 20 60 21 61 22 62 23 63
// 30 70 31 71 32 72 33 73 // 30 70 31 71 32 72 33 73
MEMACCESS(0) "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%2], #32 \n"
MEMACCESS(3)
"ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%2], #32 \n"
"subs %w3, %w3, #12 \n" "subs %w3, %w3, #12 \n"
// Shuffle the input data around to get align the data // Shuffle the input data around to get align the data
...@@ -547,9 +509,7 @@ void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr, ...@@ -547,9 +509,7 @@ void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr,
"tbl v3.16b, {v0.16b, v1.16b, v2.16b}, v31.16b \n" "tbl v3.16b, {v0.16b, v1.16b, v2.16b}, v31.16b \n"
MEMACCESS(1)
"st1 {v3.8b}, [%1], #8 \n" "st1 {v3.8b}, [%1], #8 \n"
MEMACCESS(1)
"st1 {v3.s}[2], [%1], #4 \n" "st1 {v3.s}[2], [%1], #4 \n"
"b.gt 1b \n" "b.gt 1b \n"
: "+r"(src_ptr), // %0 : "+r"(src_ptr), // %0
...@@ -577,13 +537,11 @@ void ScaleAddRows_NEON(const uint8* src_ptr, ...@@ -577,13 +537,11 @@ void ScaleAddRows_NEON(const uint8* src_ptr,
"eor v3.16b, v3.16b, v3.16b \n" "eor v3.16b, v3.16b, v3.16b \n"
"2: \n" "2: \n"
// load 16 pixels into q0 // load 16 pixels into q0
MEMACCESS(0)
"ld1 {v0.16b}, [%0], %3 \n" "ld1 {v0.16b}, [%0], %3 \n"
"uaddw2 v3.8h, v3.8h, v0.16b \n" "uaddw2 v3.8h, v3.8h, v0.16b \n"
"uaddw v2.8h, v2.8h, v0.8b \n" "uaddw v2.8h, v2.8h, v0.8b \n"
"subs w12, w12, #1 \n" "subs w12, w12, #1 \n"
"b.gt 2b \n" "b.gt 2b \n"
MEMACCESS(2)
"st1 {v2.8h, v3.8h}, [%2], #32 \n" // store pixels "st1 {v2.8h, v3.8h}, [%2], #32 \n" // store pixels
"add %1, %1, #16 \n" "add %1, %1, #16 \n"
"subs %w4, %w4, #16 \n" // 16 processed per loop "subs %w4, %w4, #16 \n" // 16 processed per loop
...@@ -606,7 +564,6 @@ void ScaleAddRows_NEON(const uint8* src_ptr, ...@@ -606,7 +564,6 @@ void ScaleAddRows_NEON(const uint8* src_ptr,
"lsr %5, %3, #16 \n" \ "lsr %5, %3, #16 \n" \
"add %6, %1, %5 \n" \ "add %6, %1, %5 \n" \
"add %3, %3, %4 \n" \ "add %3, %3, %4 \n" \
MEMACCESS(6) \
"ld2 {v4.b, v5.b}[" #n "], [%6] \n" "ld2 {v4.b, v5.b}[" #n "], [%6] \n"
// clang-format on // clang-format on
...@@ -660,7 +617,6 @@ void ScaleFilterCols_NEON(uint8* dst_ptr, ...@@ -660,7 +617,6 @@ void ScaleFilterCols_NEON(uint8* dst_ptr,
"add v4.8h, v4.8h, v6.8h \n" "add v4.8h, v4.8h, v6.8h \n"
"xtn v4.8b, v4.8h \n" "xtn v4.8b, v4.8h \n"
MEMACCESS(0)
"st1 {v4.8b}, [%0], #8 \n" // store pixels "st1 {v4.8b}, [%0], #8 \n" // store pixels
"add v1.4s, v1.4s, v0.4s \n" "add v1.4s, v1.4s, v0.4s \n"
"add v2.4s, v2.4s, v0.4s \n" "add v2.4s, v2.4s, v0.4s \n"
...@@ -703,9 +659,7 @@ void ScaleFilterRows_NEON(uint8* dst_ptr, ...@@ -703,9 +659,7 @@ void ScaleFilterRows_NEON(uint8* dst_ptr,
"dup v4.8b, %w5 \n" "dup v4.8b, %w5 \n"
// General purpose row blend. // General purpose row blend.
"1: \n" "1: \n"
MEMACCESS(1)
"ld1 {v0.16b}, [%1], #16 \n" "ld1 {v0.16b}, [%1], #16 \n"
MEMACCESS(2)
"ld1 {v1.16b}, [%2], #16 \n" "ld1 {v1.16b}, [%2], #16 \n"
"subs %w3, %w3, #16 \n" "subs %w3, %w3, #16 \n"
"umull v6.8h, v0.8b, v4.8b \n" "umull v6.8h, v0.8b, v4.8b \n"
...@@ -714,63 +668,50 @@ void ScaleFilterRows_NEON(uint8* dst_ptr, ...@@ -714,63 +668,50 @@ void ScaleFilterRows_NEON(uint8* dst_ptr,
"umlal2 v7.8h, v1.16b, v5.16b \n" "umlal2 v7.8h, v1.16b, v5.16b \n"
"rshrn v0.8b, v6.8h, #8 \n" "rshrn v0.8b, v6.8h, #8 \n"
"rshrn2 v0.16b, v7.8h, #8 \n" "rshrn2 v0.16b, v7.8h, #8 \n"
MEMACCESS(0)
"st1 {v0.16b}, [%0], #16 \n" "st1 {v0.16b}, [%0], #16 \n"
"b.gt 1b \n" "b.gt 1b \n"
"b 99f \n" "b 99f \n"
// Blend 25 / 75. // Blend 25 / 75.
"25: \n" "25: \n"
MEMACCESS(1)
"ld1 {v0.16b}, [%1], #16 \n" "ld1 {v0.16b}, [%1], #16 \n"
MEMACCESS(2)
"ld1 {v1.16b}, [%2], #16 \n" "ld1 {v1.16b}, [%2], #16 \n"
"subs %w3, %w3, #16 \n" "subs %w3, %w3, #16 \n"
"urhadd v0.16b, v0.16b, v1.16b \n" "urhadd v0.16b, v0.16b, v1.16b \n"
"urhadd v0.16b, v0.16b, v1.16b \n" "urhadd v0.16b, v0.16b, v1.16b \n"
MEMACCESS(0)
"st1 {v0.16b}, [%0], #16 \n" "st1 {v0.16b}, [%0], #16 \n"
"b.gt 25b \n" "b.gt 25b \n"
"b 99f \n" "b 99f \n"
// Blend 50 / 50. // Blend 50 / 50.
"50: \n" "50: \n"
MEMACCESS(1)
"ld1 {v0.16b}, [%1], #16 \n" "ld1 {v0.16b}, [%1], #16 \n"
MEMACCESS(2)
"ld1 {v1.16b}, [%2], #16 \n" "ld1 {v1.16b}, [%2], #16 \n"
"subs %w3, %w3, #16 \n" "subs %w3, %w3, #16 \n"
"urhadd v0.16b, v0.16b, v1.16b \n" "urhadd v0.16b, v0.16b, v1.16b \n"
MEMACCESS(0)
"st1 {v0.16b}, [%0], #16 \n" "st1 {v0.16b}, [%0], #16 \n"
"b.gt 50b \n" "b.gt 50b \n"
"b 99f \n" "b 99f \n"
// Blend 75 / 25. // Blend 75 / 25.
"75: \n" "75: \n"
MEMACCESS(1)
"ld1 {v1.16b}, [%1], #16 \n" "ld1 {v1.16b}, [%1], #16 \n"
MEMACCESS(2)
"ld1 {v0.16b}, [%2], #16 \n" "ld1 {v0.16b}, [%2], #16 \n"
"subs %w3, %w3, #16 \n" "subs %w3, %w3, #16 \n"
"urhadd v0.16b, v0.16b, v1.16b \n" "urhadd v0.16b, v0.16b, v1.16b \n"
"urhadd v0.16b, v0.16b, v1.16b \n" "urhadd v0.16b, v0.16b, v1.16b \n"
MEMACCESS(0)
"st1 {v0.16b}, [%0], #16 \n" "st1 {v0.16b}, [%0], #16 \n"
"b.gt 75b \n" "b.gt 75b \n"
"b 99f \n" "b 99f \n"
// Blend 100 / 0 - Copy row unchanged. // Blend 100 / 0 - Copy row unchanged.
"100: \n" "100: \n"
MEMACCESS(1)
"ld1 {v0.16b}, [%1], #16 \n" "ld1 {v0.16b}, [%1], #16 \n"
"subs %w3, %w3, #16 \n" "subs %w3, %w3, #16 \n"
MEMACCESS(0)
"st1 {v0.16b}, [%0], #16 \n" "st1 {v0.16b}, [%0], #16 \n"
"b.gt 100b \n" "b.gt 100b \n"
"99: \n" "99: \n"
MEMACCESS(0)
"st1 {v0.b}[15], [%0] \n" "st1 {v0.b}[15], [%0] \n"
: "+r"(dst_ptr), // %0 : "+r"(dst_ptr), // %0
"+r"(src_ptr), // %1 "+r"(src_ptr), // %1
...@@ -791,14 +732,10 @@ void ScaleARGBRowDown2_NEON(const uint8* src_ptr, ...@@ -791,14 +732,10 @@ void ScaleARGBRowDown2_NEON(const uint8* src_ptr,
asm volatile ( asm volatile (
"1: \n" "1: \n"
// load even pixels into q0, odd into q1 // load even pixels into q0, odd into q1
MEMACCESS (0)
"ld2 {v0.4s, v1.4s}, [%0], #32 \n" "ld2 {v0.4s, v1.4s}, [%0], #32 \n"
MEMACCESS (0)
"ld2 {v2.4s, v3.4s}, [%0], #32 \n" "ld2 {v2.4s, v3.4s}, [%0], #32 \n"
"subs %w2, %w2, #8 \n" // 8 processed per loop "subs %w2, %w2, #8 \n" // 8 processed per loop
MEMACCESS (1)
"st1 {v1.16b}, [%1], #16 \n" // store odd pixels "st1 {v1.16b}, [%1], #16 \n" // store odd pixels
MEMACCESS (1)
"st1 {v3.16b}, [%1], #16 \n" "st1 {v3.16b}, [%1], #16 \n"
"b.gt 1b \n" "b.gt 1b \n"
: "+r" (src_ptr), // %0 : "+r" (src_ptr), // %0
...@@ -816,7 +753,6 @@ void ScaleARGBRowDown2Linear_NEON(const uint8* src_argb, ...@@ -816,7 +753,6 @@ void ScaleARGBRowDown2Linear_NEON(const uint8* src_argb,
(void)src_stride; (void)src_stride;
asm volatile ( asm volatile (
"1: \n" "1: \n"
MEMACCESS (0)
// load 8 ARGB pixels. // load 8 ARGB pixels.
"ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n"
"subs %w2, %w2, #8 \n" // 8 processed per loop. "subs %w2, %w2, #8 \n" // 8 processed per loop.
...@@ -828,7 +764,6 @@ void ScaleARGBRowDown2Linear_NEON(const uint8* src_argb, ...@@ -828,7 +764,6 @@ void ScaleARGBRowDown2Linear_NEON(const uint8* src_argb,
"rshrn v1.8b, v1.8h, #1 \n" "rshrn v1.8b, v1.8h, #1 \n"
"rshrn v2.8b, v2.8h, #1 \n" "rshrn v2.8b, v2.8h, #1 \n"
"rshrn v3.8b, v3.8h, #1 \n" "rshrn v3.8b, v3.8h, #1 \n"
MEMACCESS (1)
"st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n"
"b.gt 1b \n" "b.gt 1b \n"
: "+r"(src_argb), // %0 : "+r"(src_argb), // %0
...@@ -847,14 +782,12 @@ void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr, ...@@ -847,14 +782,12 @@ void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr,
// change the stride to row 2 pointer // change the stride to row 2 pointer
"add %1, %1, %0 \n" "add %1, %1, %0 \n"
"1: \n" "1: \n"
MEMACCESS (0)
"ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 8 ARGB pixels. "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 8 ARGB pixels.
"subs %w3, %w3, #8 \n" // 8 processed per loop. "subs %w3, %w3, #8 \n" // 8 processed per loop.
"uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts. "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts.
"uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts. "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
"uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts. "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts.
"uaddlp v3.8h, v3.16b \n" // A 16 bytes -> 8 shorts. "uaddlp v3.8h, v3.16b \n" // A 16 bytes -> 8 shorts.
MEMACCESS (1)
"ld4 {v16.16b,v17.16b,v18.16b,v19.16b}, [%1], #64 \n" // load 8 more ARGB pixels. "ld4 {v16.16b,v17.16b,v18.16b,v19.16b}, [%1], #64 \n" // load 8 more ARGB pixels.
"uadalp v0.8h, v16.16b \n" // B 16 bytes -> 8 shorts. "uadalp v0.8h, v16.16b \n" // B 16 bytes -> 8 shorts.
"uadalp v1.8h, v17.16b \n" // G 16 bytes -> 8 shorts. "uadalp v1.8h, v17.16b \n" // G 16 bytes -> 8 shorts.
...@@ -864,7 +797,6 @@ void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr, ...@@ -864,7 +797,6 @@ void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr,
"rshrn v1.8b, v1.8h, #2 \n" "rshrn v1.8b, v1.8h, #2 \n"
"rshrn v2.8b, v2.8h, #2 \n" "rshrn v2.8b, v2.8h, #2 \n"
"rshrn v3.8b, v3.8h, #2 \n" "rshrn v3.8b, v3.8h, #2 \n"
MEMACCESS (2)
"st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n"
"b.gt 1b \n" "b.gt 1b \n"
: "+r" (src_ptr), // %0 : "+r" (src_ptr), // %0
...@@ -886,16 +818,11 @@ void ScaleARGBRowDownEven_NEON(const uint8* src_argb, ...@@ -886,16 +818,11 @@ void ScaleARGBRowDownEven_NEON(const uint8* src_argb,
(void)src_stride; (void)src_stride;
asm volatile ( asm volatile (
"1: \n" "1: \n"
MEMACCESS(0)
"ld1 {v0.s}[0], [%0], %3 \n" "ld1 {v0.s}[0], [%0], %3 \n"
MEMACCESS(0)
"ld1 {v0.s}[1], [%0], %3 \n" "ld1 {v0.s}[1], [%0], %3 \n"
MEMACCESS(0)
"ld1 {v0.s}[2], [%0], %3 \n" "ld1 {v0.s}[2], [%0], %3 \n"
MEMACCESS(0)
"ld1 {v0.s}[3], [%0], %3 \n" "ld1 {v0.s}[3], [%0], %3 \n"
"subs %w2, %w2, #4 \n" // 4 pixels per loop. "subs %w2, %w2, #4 \n" // 4 pixels per loop.
MEMACCESS(1)
"st1 {v0.16b}, [%1], #16 \n" "st1 {v0.16b}, [%1], #16 \n"
"b.gt 1b \n" "b.gt 1b \n"
: "+r"(src_argb), // %0 : "+r"(src_argb), // %0
...@@ -918,21 +845,13 @@ void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb, ...@@ -918,21 +845,13 @@ void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb,
asm volatile ( asm volatile (
"add %1, %1, %0 \n" "add %1, %1, %0 \n"
"1: \n" "1: \n"
MEMACCESS(0)
"ld1 {v0.8b}, [%0], %4 \n" // Read 4 2x2 blocks -> 2x1 "ld1 {v0.8b}, [%0], %4 \n" // Read 4 2x2 blocks -> 2x1
MEMACCESS(1)
"ld1 {v1.8b}, [%1], %4 \n" "ld1 {v1.8b}, [%1], %4 \n"
MEMACCESS(0)
"ld1 {v2.8b}, [%0], %4 \n" "ld1 {v2.8b}, [%0], %4 \n"
MEMACCESS(1)
"ld1 {v3.8b}, [%1], %4 \n" "ld1 {v3.8b}, [%1], %4 \n"
MEMACCESS(0)
"ld1 {v4.8b}, [%0], %4 \n" "ld1 {v4.8b}, [%0], %4 \n"
MEMACCESS(1)
"ld1 {v5.8b}, [%1], %4 \n" "ld1 {v5.8b}, [%1], %4 \n"
MEMACCESS(0)
"ld1 {v6.8b}, [%0], %4 \n" "ld1 {v6.8b}, [%0], %4 \n"
MEMACCESS(1)
"ld1 {v7.8b}, [%1], %4 \n" "ld1 {v7.8b}, [%1], %4 \n"
"uaddl v0.8h, v0.8b, v1.8b \n" "uaddl v0.8h, v0.8b, v1.8b \n"
"uaddl v2.8h, v2.8b, v3.8b \n" "uaddl v2.8h, v2.8b, v3.8b \n"
...@@ -949,7 +868,6 @@ void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb, ...@@ -949,7 +868,6 @@ void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb,
"rshrn v0.8b, v0.8h, #2 \n" // first 2 pixels. "rshrn v0.8b, v0.8h, #2 \n" // first 2 pixels.
"rshrn2 v0.16b, v4.8h, #2 \n" // next 2 pixels. "rshrn2 v0.16b, v4.8h, #2 \n" // next 2 pixels.
"subs %w3, %w3, #4 \n" // 4 pixels per loop. "subs %w3, %w3, #4 \n" // 4 pixels per loop.
MEMACCESS(2)
"st1 {v0.16b}, [%2], #16 \n" "st1 {v0.16b}, [%2], #16 \n"
"b.gt 1b \n" "b.gt 1b \n"
: "+r"(src_argb), // %0 : "+r"(src_argb), // %0
...@@ -968,7 +886,6 @@ void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb, ...@@ -968,7 +886,6 @@ void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb,
"lsr %5, %3, #16 \n" \ "lsr %5, %3, #16 \n" \
"add %6, %1, %5, lsl #2 \n" \ "add %6, %1, %5, lsl #2 \n" \
"add %3, %3, %4 \n" \ "add %3, %3, %4 \n" \
MEMACCESS(6) \
"ld1 {" #vn ".s}[" #n "], [%6] \n" "ld1 {" #vn ".s}[" #n "], [%6] \n"
// clang-format on // clang-format on
...@@ -992,10 +909,9 @@ void ScaleARGBCols_NEON(uint8* dst_argb, ...@@ -992,10 +909,9 @@ void ScaleARGBCols_NEON(uint8* dst_argb,
LOAD1_DATA32_LANE(v1, 2) LOAD1_DATA32_LANE(v1, 2)
LOAD1_DATA32_LANE(v1, 3) LOAD1_DATA32_LANE(v1, 3)
MEMACCESS(0)
"st1 {v0.4s, v1.4s}, [%0], #32 \n" // store pixels "st1 {v0.4s, v1.4s}, [%0], #32 \n" // store pixels
"subs %w2, %w2, #8 \n" // 8 processed per loop "subs %w2, %w2, #8 \n" // 8 processed per loop
"b.gt 1b \n" "b.gt 1b \n"
: "+r"(dst_argb), // %0 : "+r"(dst_argb), // %0
"+r"(src_argb), // %1 "+r"(src_argb), // %1
"+r"(dst_width), // %2 "+r"(dst_width), // %2
...@@ -1017,7 +933,6 @@ void ScaleARGBCols_NEON(uint8* dst_argb, ...@@ -1017,7 +933,6 @@ void ScaleARGBCols_NEON(uint8* dst_argb,
"lsr %5, %3, #16 \n" \ "lsr %5, %3, #16 \n" \
"add %6, %1, %5, lsl #2 \n" \ "add %6, %1, %5, lsl #2 \n" \
"add %3, %3, %4 \n" \ "add %3, %3, %4 \n" \
MEMACCESS(6) \
"ld2 {" #vn1 ".s, " #vn2 ".s}[" #n "], [%6] \n" "ld2 {" #vn1 ".s, " #vn2 ".s}[" #n "], [%6] \n"
// clang-format on // clang-format on
...@@ -1067,7 +982,6 @@ void ScaleARGBFilterCols_NEON(uint8* dst_argb, ...@@ -1067,7 +982,6 @@ void ScaleARGBFilterCols_NEON(uint8* dst_argb,
"shrn v0.8b, v16.8h, #7 \n" "shrn v0.8b, v16.8h, #7 \n"
"shrn2 v0.16b, v17.8h, #7 \n" "shrn2 v0.16b, v17.8h, #7 \n"
MEMACCESS(0)
"st1 {v0.4s}, [%0], #16 \n" // store pixels "st1 {v0.4s}, [%0], #16 \n" // store pixels
"add v5.4s, v5.4s, v6.4s \n" "add v5.4s, v5.4s, v6.4s \n"
"subs %w2, %w2, #4 \n" // 4 processed per loop "subs %w2, %w2, #4 \n" // 4 processed per loop
......