Commit 0acc6771 authored by Frank Barchard

clang format / lint cleanup for arm scale functions

TBR=kjellander@chromium.org
BUG=libyuv:725
TEST=lint

Change-Id: I76f777427f9b1458faba12796fb0011d8e3228d5
Reviewed-on: https://chromium-review.googlesource.com/646586
Reviewed-by: Cheng Wang <wangcheng@google.com>
parent a826dd71
Name: libyuv Name: libyuv
URL: http://code.google.com/p/libyuv/ URL: http://code.google.com/p/libyuv/
Version: 1666 Version: 1667
License: BSD License: BSD
License File: LICENSE License File: LICENSE
......
...@@ -11,6 +11,6 @@ ...@@ -11,6 +11,6 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_ #ifndef INCLUDE_LIBYUV_VERSION_H_
#define INCLUDE_LIBYUV_VERSION_H_ #define INCLUDE_LIBYUV_VERSION_H_
#define LIBYUV_VERSION 1666 #define LIBYUV_VERSION 1667
#endif // INCLUDE_LIBYUV_VERSION_H_ #endif // INCLUDE_LIBYUV_VERSION_H_
...@@ -29,7 +29,7 @@ void ScaleRowDown2_NEON(const uint8* src_ptr, ...@@ -29,7 +29,7 @@ void ScaleRowDown2_NEON(const uint8* src_ptr,
int dst_width) { int dst_width) {
(void)src_stride; (void)src_stride;
asm volatile( asm volatile(
"1: \n" "1: \n"
// load even pixels into q0, odd into q1 // load even pixels into q0, odd into q1
"vld2.8 {q0, q1}, [%0]! \n" "vld2.8 {q0, q1}, [%0]! \n"
"subs %2, %2, #16 \n" // 16 processed per loop "subs %2, %2, #16 \n" // 16 processed per loop
...@@ -50,7 +50,7 @@ void ScaleRowDown2Linear_NEON(const uint8* src_ptr, ...@@ -50,7 +50,7 @@ void ScaleRowDown2Linear_NEON(const uint8* src_ptr,
int dst_width) { int dst_width) {
(void)src_stride; (void)src_stride;
asm volatile( asm volatile(
"1: \n" "1: \n"
"vld2.8 {q0, q1}, [%0]! \n" // load 32 pixels "vld2.8 {q0, q1}, [%0]! \n" // load 32 pixels
"subs %2, %2, #16 \n" // 16 processed per loop "subs %2, %2, #16 \n" // 16 processed per loop
"vrhadd.u8 q0, q0, q1 \n" // rounding half add "vrhadd.u8 q0, q0, q1 \n" // rounding half add
...@@ -72,7 +72,7 @@ void ScaleRowDown2Box_NEON(const uint8* src_ptr, ...@@ -72,7 +72,7 @@ void ScaleRowDown2Box_NEON(const uint8* src_ptr,
asm volatile( asm volatile(
// change the stride to row 2 pointer // change the stride to row 2 pointer
"add %1, %0 \n" "add %1, %0 \n"
"1: \n" "1: \n"
"vld1.8 {q0, q1}, [%0]! \n" // load row 1 and post inc "vld1.8 {q0, q1}, [%0]! \n" // load row 1 and post inc
"vld1.8 {q2, q3}, [%1]! \n" // load row 2 and post inc "vld1.8 {q2, q3}, [%1]! \n" // load row 2 and post inc
"subs %3, %3, #16 \n" // 16 processed per loop "subs %3, %3, #16 \n" // 16 processed per loop
...@@ -101,7 +101,7 @@ void ScaleRowDown4_NEON(const uint8* src_ptr, ...@@ -101,7 +101,7 @@ void ScaleRowDown4_NEON(const uint8* src_ptr,
int dst_width) { int dst_width) {
(void)src_stride; (void)src_stride;
asm volatile( asm volatile(
"1: \n" "1: \n"
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0 "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
"subs %2, %2, #8 \n" // 8 processed per loop "subs %2, %2, #8 \n" // 8 processed per loop
"vst1.8 {d2}, [%1]! \n" "vst1.8 {d2}, [%1]! \n"
...@@ -121,7 +121,7 @@ void ScaleRowDown4Box_NEON(const uint8* src_ptr, ...@@ -121,7 +121,7 @@ void ScaleRowDown4Box_NEON(const uint8* src_ptr,
const uint8* src_ptr2 = src_ptr + src_stride * 2; const uint8* src_ptr2 = src_ptr + src_stride * 2;
const uint8* src_ptr3 = src_ptr + src_stride * 3; const uint8* src_ptr3 = src_ptr + src_stride * 3;
asm volatile( asm volatile(
"1: \n" "1: \n"
"vld1.8 {q0}, [%0]! \n" // load up 16x4 "vld1.8 {q0}, [%0]! \n" // load up 16x4
"vld1.8 {q1}, [%3]! \n" "vld1.8 {q1}, [%3]! \n"
"vld1.8 {q2}, [%4]! \n" "vld1.8 {q2}, [%4]! \n"
...@@ -155,12 +155,12 @@ void ScaleRowDown34_NEON(const uint8* src_ptr, ...@@ -155,12 +155,12 @@ void ScaleRowDown34_NEON(const uint8* src_ptr,
int dst_width) { int dst_width) {
(void)src_stride; (void)src_stride;
asm volatile( asm volatile(
"1: \n" "1: \n"
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0 "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
"subs %2, %2, #24 \n" "subs %2, %2, #24 \n"
"vmov d2, d3 \n" // order d0, d1, d2 "vmov d2, d3 \n" // order d0, d1, d2
"vst3.8 {d0, d1, d2}, [%1]! \n" "vst3.8 {d0, d1, d2}, [%1]! \n"
"bgt 1b \n" "bgt 1b \n"
: "+r"(src_ptr), // %0 : "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1 "+r"(dst_ptr), // %1
"+r"(dst_width) // %2 "+r"(dst_width) // %2
...@@ -175,7 +175,7 @@ void ScaleRowDown34_0_Box_NEON(const uint8* src_ptr, ...@@ -175,7 +175,7 @@ void ScaleRowDown34_0_Box_NEON(const uint8* src_ptr,
asm volatile( asm volatile(
"vmov.u8 d24, #3 \n" "vmov.u8 d24, #3 \n"
"add %3, %0 \n" "add %3, %0 \n"
"1: \n" "1: \n"
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0 "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
"vld4.8 {d4, d5, d6, d7}, [%3]! \n" // src line 1 "vld4.8 {d4, d5, d6, d7}, [%3]! \n" // src line 1
"subs %2, %2, #24 \n" "subs %2, %2, #24 \n"
...@@ -232,7 +232,7 @@ void ScaleRowDown34_1_Box_NEON(const uint8* src_ptr, ...@@ -232,7 +232,7 @@ void ScaleRowDown34_1_Box_NEON(const uint8* src_ptr,
asm volatile( asm volatile(
"vmov.u8 d24, #3 \n" "vmov.u8 d24, #3 \n"
"add %3, %0 \n" "add %3, %0 \n"
"1: \n" "1: \n"
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0 "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
"vld4.8 {d4, d5, d6, d7}, [%3]! \n" // src line 1 "vld4.8 {d4, d5, d6, d7}, [%3]! \n" // src line 1
"subs %2, %2, #24 \n" "subs %2, %2, #24 \n"
...@@ -280,7 +280,7 @@ void ScaleRowDown38_NEON(const uint8* src_ptr, ...@@ -280,7 +280,7 @@ void ScaleRowDown38_NEON(const uint8* src_ptr,
(void)src_stride; (void)src_stride;
asm volatile( asm volatile(
"vld1.8 {q3}, [%3] \n" "vld1.8 {q3}, [%3] \n"
"1: \n" "1: \n"
"vld1.8 {d0, d1, d2, d3}, [%0]! \n" "vld1.8 {d0, d1, d2, d3}, [%0]! \n"
"subs %2, %2, #12 \n" "subs %2, %2, #12 \n"
"vtbl.u8 d4, {d0, d1, d2, d3}, d6 \n" "vtbl.u8 d4, {d0, d1, d2, d3}, d6 \n"
...@@ -307,7 +307,7 @@ void OMITFP ScaleRowDown38_3_Box_NEON(const uint8* src_ptr, ...@@ -307,7 +307,7 @@ void OMITFP ScaleRowDown38_3_Box_NEON(const uint8* src_ptr,
"vld1.8 {q14}, [%6] \n" "vld1.8 {q14}, [%6] \n"
"vld1.8 {q15}, [%7] \n" "vld1.8 {q15}, [%7] \n"
"add %3, %0 \n" "add %3, %0 \n"
"1: \n" "1: \n"
// d0 = 00 40 01 41 02 42 03 43 // d0 = 00 40 01 41 02 42 03 43
// d1 = 10 50 11 51 12 52 13 53 // d1 = 10 50 11 51 12 52 13 53
...@@ -416,7 +416,7 @@ void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr, ...@@ -416,7 +416,7 @@ void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr,
"vld1.16 {q13}, [%4] \n" "vld1.16 {q13}, [%4] \n"
"vld1.8 {q14}, [%5] \n" "vld1.8 {q14}, [%5] \n"
"add %3, %0 \n" "add %3, %0 \n"
"1: \n" "1: \n"
// d0 = 00 40 01 41 02 42 03 43 // d0 = 00 40 01 41 02 42 03 43
// d1 = 10 50 11 51 12 52 13 53 // d1 = 10 50 11 51 12 52 13 53
...@@ -508,12 +508,12 @@ void ScaleAddRows_NEON(const uint8* src_ptr, ...@@ -508,12 +508,12 @@ void ScaleAddRows_NEON(const uint8* src_ptr,
int src_height) { int src_height) {
const uint8* src_tmp; const uint8* src_tmp;
asm volatile( asm volatile(
"1: \n" "1: \n"
"mov %0, %1 \n" "mov %0, %1 \n"
"mov r12, %5 \n" "mov r12, %5 \n"
"veor q2, q2, q2 \n" "veor q2, q2, q2 \n"
"veor q3, q3, q3 \n" "veor q3, q3, q3 \n"
"2: \n" "2: \n"
// load 16 pixels into q0 // load 16 pixels into q0
"vld1.8 {q0}, [%0], %3 \n" "vld1.8 {q0}, [%0], %3 \n"
"vaddw.u8 q3, q3, d1 \n" "vaddw.u8 q3, q3, d1 \n"
...@@ -535,15 +535,13 @@ void ScaleAddRows_NEON(const uint8* src_ptr, ...@@ -535,15 +535,13 @@ void ScaleAddRows_NEON(const uint8* src_ptr,
); );
} }
// clang-format off
// TODO(Yang Zhang): Investigate less load instructions for // TODO(Yang Zhang): Investigate less load instructions for
// the x/dx stepping // the x/dx stepping
#define LOAD2_DATA8_LANE(n) \ #define LOAD2_DATA8_LANE(n) \
"lsr %5, %3, #16 \n" \ "lsr %5, %3, #16 \n" \
"add %6, %1, %5 \n" \ "add %6, %1, %5 \n" \
"add %3, %3, %4 \n" \ "add %3, %3, %4 \n" \
"vld2.8 {d6["#n"], d7["#n"]}, [%6] \n" "vld2.8 {d6[" #n "], d7[" #n "]}, [%6] \n"
// clang-format on
// The NEON version mimics this formula (from row_common.cc): // The NEON version mimics this formula (from row_common.cc):
// #define BLENDER(a, b, f) (uint8)((int)(a) + // #define BLENDER(a, b, f) (uint8)((int)(a) +
...@@ -634,7 +632,7 @@ void ScaleFilterRows_NEON(uint8* dst_ptr, ...@@ -634,7 +632,7 @@ void ScaleFilterRows_NEON(uint8* dst_ptr,
"rsb %4, #256 \n" "rsb %4, #256 \n"
"vdup.8 d4, %4 \n" "vdup.8 d4, %4 \n"
// General purpose row blend. // General purpose row blend.
"1: \n" "1: \n"
"vld1.8 {q0}, [%1]! \n" "vld1.8 {q0}, [%1]! \n"
"vld1.8 {q1}, [%2]! \n" "vld1.8 {q1}, [%2]! \n"
"subs %3, %3, #16 \n" "subs %3, %3, #16 \n"
...@@ -649,7 +647,7 @@ void ScaleFilterRows_NEON(uint8* dst_ptr, ...@@ -649,7 +647,7 @@ void ScaleFilterRows_NEON(uint8* dst_ptr,
"b 99f \n" "b 99f \n"
// Blend 25 / 75. // Blend 25 / 75.
"25: \n" "25: \n"
"vld1.8 {q0}, [%1]! \n" "vld1.8 {q0}, [%1]! \n"
"vld1.8 {q1}, [%2]! \n" "vld1.8 {q1}, [%2]! \n"
"subs %3, %3, #16 \n" "subs %3, %3, #16 \n"
...@@ -660,7 +658,7 @@ void ScaleFilterRows_NEON(uint8* dst_ptr, ...@@ -660,7 +658,7 @@ void ScaleFilterRows_NEON(uint8* dst_ptr,
"b 99f \n" "b 99f \n"
// Blend 50 / 50. // Blend 50 / 50.
"50: \n" "50: \n"
"vld1.8 {q0}, [%1]! \n" "vld1.8 {q0}, [%1]! \n"
"vld1.8 {q1}, [%2]! \n" "vld1.8 {q1}, [%2]! \n"
"subs %3, %3, #16 \n" "subs %3, %3, #16 \n"
...@@ -670,7 +668,7 @@ void ScaleFilterRows_NEON(uint8* dst_ptr, ...@@ -670,7 +668,7 @@ void ScaleFilterRows_NEON(uint8* dst_ptr,
"b 99f \n" "b 99f \n"
// Blend 75 / 25. // Blend 75 / 25.
"75: \n" "75: \n"
"vld1.8 {q1}, [%1]! \n" "vld1.8 {q1}, [%1]! \n"
"vld1.8 {q0}, [%2]! \n" "vld1.8 {q0}, [%2]! \n"
"subs %3, %3, #16 \n" "subs %3, %3, #16 \n"
...@@ -681,13 +679,13 @@ void ScaleFilterRows_NEON(uint8* dst_ptr, ...@@ -681,13 +679,13 @@ void ScaleFilterRows_NEON(uint8* dst_ptr,
"b 99f \n" "b 99f \n"
// Blend 100 / 0 - Copy row unchanged. // Blend 100 / 0 - Copy row unchanged.
"100: \n" "100: \n"
"vld1.8 {q0}, [%1]! \n" "vld1.8 {q0}, [%1]! \n"
"subs %3, %3, #16 \n" "subs %3, %3, #16 \n"
"vst1.8 {q0}, [%0]! \n" "vst1.8 {q0}, [%0]! \n"
"bgt 100b \n" "bgt 100b \n"
"99: \n" "99: \n"
"vst1.8 {d1[7]}, [%0] \n" "vst1.8 {d1[7]}, [%0] \n"
: "+r"(dst_ptr), // %0 : "+r"(dst_ptr), // %0
"+r"(src_ptr), // %1 "+r"(src_ptr), // %1
...@@ -719,7 +717,6 @@ void ScaleARGBRowDown2_NEON(const uint8* src_ptr, ...@@ -719,7 +717,6 @@ void ScaleARGBRowDown2_NEON(const uint8* src_ptr,
); );
} }
// 46: f964 018d vld4.32 {d16,d18,d20,d22}, [r4]! // 46: f964 018d vld4.32 {d16,d18,d20,d22}, [r4]!
// 4a: 3e04 subs r6, #4 // 4a: 3e04 subs r6, #4
// 4c: f964 118d vld4.32 {d17,d19,d21,d23}, [r4]! // 4c: f964 118d vld4.32 {d17,d19,d21,d23}, [r4]!
...@@ -727,14 +724,13 @@ void ScaleARGBRowDown2_NEON(const uint8* src_ptr, ...@@ -727,14 +724,13 @@ void ScaleARGBRowDown2_NEON(const uint8* src_ptr,
// 54: f942 038d vst2.32 {d16-d19}, [r2]! // 54: f942 038d vst2.32 {d16-d19}, [r2]!
// 58: d1f5 bne.n 46 <ScaleARGBRowDown2_C+0x46> // 58: d1f5 bne.n 46 <ScaleARGBRowDown2_C+0x46>
void ScaleARGBRowDown2Linear_NEON(const uint8* src_argb, void ScaleARGBRowDown2Linear_NEON(const uint8* src_argb,
ptrdiff_t src_stride, ptrdiff_t src_stride,
uint8* dst_argb, uint8* dst_argb,
int dst_width) { int dst_width) {
(void)src_stride; (void)src_stride;
asm volatile( asm volatile(
"1: \n" "1: \n"
"vld4.32 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. "vld4.32 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
"vld4.32 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB "vld4.32 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB
"subs %2, %2, #8 \n" // 8 processed per loop "subs %2, %2, #8 \n" // 8 processed per loop
...@@ -757,25 +753,21 @@ void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr, ...@@ -757,25 +753,21 @@ void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr,
asm volatile( asm volatile(
// change the stride to row 2 pointer // change the stride to row 2 pointer
"add %1, %1, %0 \n" "add %1, %1, %0 \n"
"1: \n" "1: \n"
"vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
"vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB
// pixels.
"subs %3, %3, #8 \n" // 8 processed per loop. "subs %3, %3, #8 \n" // 8 processed per loop.
"vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts. "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts.
"vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts. "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
"vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts. "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts.
"vpaddl.u8 q3, q3 \n" // A 16 bytes -> 8 shorts. "vpaddl.u8 q3, q3 \n" // A 16 bytes -> 8 shorts.
"vld4.8 {d16, d18, d20, d22}, [%1]! \n" // load 8 more ARGB "vld4.8 {d16, d18, d20, d22}, [%1]! \n" // load 8 more ARGB
// pixels.
"vld4.8 {d17, d19, d21, d23}, [%1]! \n" // load last 8 ARGB "vld4.8 {d17, d19, d21, d23}, [%1]! \n" // load last 8 ARGB
// pixels.
"vpadal.u8 q0, q8 \n" // B 16 bytes -> 8 shorts. "vpadal.u8 q0, q8 \n" // B 16 bytes -> 8 shorts.
"vpadal.u8 q1, q9 \n" // G 16 bytes -> 8 shorts. "vpadal.u8 q1, q9 \n" // G 16 bytes -> 8 shorts.
"vpadal.u8 q2, q10 \n" // R 16 bytes -> 8 shorts. "vpadal.u8 q2, q10 \n" // R 16 bytes -> 8 shorts.
"vpadal.u8 q3, q11 \n" // A 16 bytes -> 8 shorts. "vpadal.u8 q3, q11 \n" // A 16 bytes -> 8 shorts.
"vrshrn.u16 d0, q0, #2 \n" // downshift, round and "vrshrn.u16 d0, q0, #2 \n" // round and pack to bytes
// pack
"vrshrn.u16 d1, q1, #2 \n" "vrshrn.u16 d1, q1, #2 \n"
"vrshrn.u16 d2, q2, #2 \n" "vrshrn.u16 d2, q2, #2 \n"
"vrshrn.u16 d3, q3, #2 \n" "vrshrn.u16 d3, q3, #2 \n"
...@@ -799,7 +791,7 @@ void ScaleARGBRowDownEven_NEON(const uint8* src_argb, ...@@ -799,7 +791,7 @@ void ScaleARGBRowDownEven_NEON(const uint8* src_argb,
(void)src_stride; (void)src_stride;
asm volatile( asm volatile(
"mov r12, %3, lsl #2 \n" "mov r12, %3, lsl #2 \n"
"1: \n" "1: \n"
"vld1.32 {d0[0]}, [%0], r12 \n" "vld1.32 {d0[0]}, [%0], r12 \n"
"vld1.32 {d0[1]}, [%0], r12 \n" "vld1.32 {d0[1]}, [%0], r12 \n"
"vld1.32 {d1[0]}, [%0], r12 \n" "vld1.32 {d1[0]}, [%0], r12 \n"
...@@ -824,9 +816,8 @@ void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb, ...@@ -824,9 +816,8 @@ void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb,
asm volatile( asm volatile(
"mov r12, %4, lsl #2 \n" "mov r12, %4, lsl #2 \n"
"add %1, %1, %0 \n" "add %1, %1, %0 \n"
"1: \n" "1: \n"
"vld1.8 {d0}, [%0], r12 \n" // Read 4 2x2 blocks -> "vld1.8 {d0}, [%0], r12 \n" // 4 2x2 blocks -> 2x1
// 2x1
"vld1.8 {d1}, [%1], r12 \n" "vld1.8 {d1}, [%1], r12 \n"
"vld1.8 {d2}, [%0], r12 \n" "vld1.8 {d2}, [%0], r12 \n"
"vld1.8 {d3}, [%1], r12 \n" "vld1.8 {d3}, [%1], r12 \n"
...@@ -855,15 +846,13 @@ void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb, ...@@ -855,15 +846,13 @@ void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb,
: "memory", "cc", "r12", "q0", "q1", "q2", "q3"); : "memory", "cc", "r12", "q0", "q1", "q2", "q3");
} }
// clang-format off
// TODO(Yang Zhang): Investigate less load instructions for // TODO(Yang Zhang): Investigate less load instructions for
// the x/dx stepping // the x/dx stepping
#define LOAD1_DATA32_LANE(dn, n) \ #define LOAD1_DATA32_LANE(dn, n) \
"lsr %5, %3, #16 \n" \ "lsr %5, %3, #16 \n" \
"add %6, %1, %5, lsl #2 \n" \ "add %6, %1, %5, lsl #2 \n" \
"add %3, %3, %4 \n" \ "add %3, %3, %4 \n" \
"vld1.32 {" #dn "[" #n "]}, [%6] \n" "vld1.32 {" #dn "[" #n "]}, [%6] \n"
// clang-format on
void ScaleARGBCols_NEON(uint8* dst_argb, void ScaleARGBCols_NEON(uint8* dst_argb,
const uint8* src_argb, const uint8* src_argb,
...@@ -873,15 +862,20 @@ void ScaleARGBCols_NEON(uint8* dst_argb, ...@@ -873,15 +862,20 @@ void ScaleARGBCols_NEON(uint8* dst_argb,
int tmp; int tmp;
const uint8* src_tmp = src_argb; const uint8* src_tmp = src_argb;
asm volatile( asm volatile(
"1: \n" LOAD1_DATA32_LANE( "1: \n"
d0, 0) LOAD1_DATA32_LANE(d0, 1) LOAD1_DATA32_LANE(d1, 0) // clang-format off
LOAD1_DATA32_LANE(d1, 1) LOAD1_DATA32_LANE(d2, 0) LOAD1_DATA32_LANE( LOAD1_DATA32_LANE(d0, 0)
d2, 1) LOAD1_DATA32_LANE(d3, 0) LOAD1_DATA32_LANE(d3, 1) LOAD1_DATA32_LANE(d0, 1)
LOAD1_DATA32_LANE(d1, 0)
"vst1.32 {q0, q1}, [%0]! \n" // store pixels LOAD1_DATA32_LANE(d1, 1)
"subs %2, %2, #8 \n" // 8 processed per LOAD1_DATA32_LANE(d2, 0)
// loop LOAD1_DATA32_LANE(d2, 1)
"bgt 1b \n" LOAD1_DATA32_LANE(d3, 0)
LOAD1_DATA32_LANE(d3, 1)
// clang-format on
"vst1.32 {q0, q1}, [%0]! \n" // store pixels
"subs %2, %2, #8 \n" // 8 processed per loop
"bgt 1b \n"
: "+r"(dst_argb), // %0 : "+r"(dst_argb), // %0
"+r"(src_argb), // %1 "+r"(src_argb), // %1
"+r"(dst_width), // %2 "+r"(dst_width), // %2
...@@ -895,15 +889,13 @@ void ScaleARGBCols_NEON(uint8* dst_argb, ...@@ -895,15 +889,13 @@ void ScaleARGBCols_NEON(uint8* dst_argb,
#undef LOAD1_DATA32_LANE #undef LOAD1_DATA32_LANE
// clang-format off
// TODO(Yang Zhang): Investigate less load instructions for // TODO(Yang Zhang): Investigate less load instructions for
// the x/dx stepping // the x/dx stepping
#define LOAD2_DATA32_LANE(dn1, dn2, n) \ #define LOAD2_DATA32_LANE(dn1, dn2, n) \
"lsr %5, %3, #16 \n" \ "lsr %5, %3, #16 \n" \
"add %6, %1, %5, lsl #2 \n" \ "add %6, %1, %5, lsl #2 \n" \
"add %3, %3, %4 \n" \ "add %3, %3, %4 \n" \
"vld2.32 {" #dn1 "[" #n "], " #dn2 "[" #n "]}, [%6] \n" "vld2.32 {" #dn1 "[" #n "], " #dn2 "[" #n "]}, [%6] \n"
// clang-format on
void ScaleARGBFilterCols_NEON(uint8* dst_argb, void ScaleARGBFilterCols_NEON(uint8* dst_argb,
const uint8* src_argb, const uint8* src_argb,
......
...@@ -27,7 +27,7 @@ void ScaleRowDown2_NEON(const uint8* src_ptr, ...@@ -27,7 +27,7 @@ void ScaleRowDown2_NEON(const uint8* src_ptr,
int dst_width) { int dst_width) {
(void)src_stride; (void)src_stride;
asm volatile( asm volatile(
"1: \n" "1: \n"
// load even pixels into v0, odd into v1 // load even pixels into v0, odd into v1
"ld2 {v0.16b,v1.16b}, [%0], #32 \n" "ld2 {v0.16b,v1.16b}, [%0], #32 \n"
"subs %w2, %w2, #16 \n" // 16 processed per loop "subs %w2, %w2, #16 \n" // 16 processed per loop
...@@ -48,11 +48,11 @@ void ScaleRowDown2Linear_NEON(const uint8* src_ptr, ...@@ -48,11 +48,11 @@ void ScaleRowDown2Linear_NEON(const uint8* src_ptr,
int dst_width) { int dst_width) {
(void)src_stride; (void)src_stride;
asm volatile( asm volatile(
"1: \n" "1: \n"
// load even pixels into v0, odd into v1 // load even pixels into v0, odd into v1
"ld2 {v0.16b,v1.16b}, [%0], #32 \n" "ld2 {v0.16b,v1.16b}, [%0], #32 \n"
"urhadd v0.16b, v0.16b, v1.16b \n" // rounding half add
"subs %w2, %w2, #16 \n" // 16 processed per loop "subs %w2, %w2, #16 \n" // 16 processed per loop
"urhadd v0.16b, v0.16b, v1.16b \n" // rounding half add
"st1 {v0.16b}, [%1], #16 \n" "st1 {v0.16b}, [%1], #16 \n"
"b.gt 1b \n" "b.gt 1b \n"
: "+r"(src_ptr), // %0 : "+r"(src_ptr), // %0
...@@ -71,7 +71,7 @@ void ScaleRowDown2Box_NEON(const uint8* src_ptr, ...@@ -71,7 +71,7 @@ void ScaleRowDown2Box_NEON(const uint8* src_ptr,
asm volatile( asm volatile(
// change the stride to row 2 pointer // change the stride to row 2 pointer
"add %1, %1, %0 \n" "add %1, %1, %0 \n"
"1: \n" "1: \n"
"ld1 {v0.16b, v1.16b}, [%0], #32 \n" // load row 1 and post inc "ld1 {v0.16b, v1.16b}, [%0], #32 \n" // load row 1 and post inc
"ld1 {v2.16b, v3.16b}, [%1], #32 \n" // load row 2 and post inc "ld1 {v2.16b, v3.16b}, [%1], #32 \n" // load row 2 and post inc
"subs %w3, %w3, #16 \n" // 16 processed per loop "subs %w3, %w3, #16 \n" // 16 processed per loop
...@@ -100,7 +100,7 @@ void ScaleRowDown4_NEON(const uint8* src_ptr, ...@@ -100,7 +100,7 @@ void ScaleRowDown4_NEON(const uint8* src_ptr,
int dst_width) { int dst_width) {
(void)src_stride; (void)src_stride;
asm volatile( asm volatile(
"1: \n" "1: \n"
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0
"subs %w2, %w2, #8 \n" // 8 processed per loop "subs %w2, %w2, #8 \n" // 8 processed per loop
"st1 {v2.8b}, [%1], #8 \n" "st1 {v2.8b}, [%1], #8 \n"
...@@ -120,7 +120,7 @@ void ScaleRowDown4Box_NEON(const uint8* src_ptr, ...@@ -120,7 +120,7 @@ void ScaleRowDown4Box_NEON(const uint8* src_ptr,
const uint8* src_ptr2 = src_ptr + src_stride * 2; const uint8* src_ptr2 = src_ptr + src_stride * 2;
const uint8* src_ptr3 = src_ptr + src_stride * 3; const uint8* src_ptr3 = src_ptr + src_stride * 3;
asm volatile( asm volatile(
"1: \n" "1: \n"
"ld1 {v0.16b}, [%0], #16 \n" // load up 16x4 "ld1 {v0.16b}, [%0], #16 \n" // load up 16x4
"ld1 {v1.16b}, [%2], #16 \n" "ld1 {v1.16b}, [%2], #16 \n"
"ld1 {v2.16b}, [%3], #16 \n" "ld1 {v2.16b}, [%3], #16 \n"
...@@ -153,11 +153,10 @@ void ScaleRowDown34_NEON(const uint8* src_ptr, ...@@ -153,11 +153,10 @@ void ScaleRowDown34_NEON(const uint8* src_ptr,
int dst_width) { int dst_width) {
(void)src_stride; (void)src_stride;
asm volatile( asm volatile(
"1: \n" "1: \n"
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0
"subs %w2, %w2, #24 \n" "subs %w2, %w2, #24 \n"
"orr v2.16b, v3.16b, v3.16b \n" // order v0, v1, "orr v2.16b, v3.16b, v3.16b \n" // order v0,v1,v2
// v2
"st3 {v0.8b,v1.8b,v2.8b}, [%1], #24 \n" "st3 {v0.8b,v1.8b,v2.8b}, [%1], #24 \n"
"b.gt 1b \n" "b.gt 1b \n"
: "+r"(src_ptr), // %0 : "+r"(src_ptr), // %0
...@@ -174,7 +173,7 @@ void ScaleRowDown34_0_Box_NEON(const uint8* src_ptr, ...@@ -174,7 +173,7 @@ void ScaleRowDown34_0_Box_NEON(const uint8* src_ptr,
asm volatile( asm volatile(
"movi v20.8b, #3 \n" "movi v20.8b, #3 \n"
"add %3, %3, %0 \n" "add %3, %3, %0 \n"
"1: \n" "1: \n"
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0
"ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%3], #32 \n" // src line 1 "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%3], #32 \n" // src line 1
"subs %w2, %w2, #24 \n" "subs %w2, %w2, #24 \n"
...@@ -307,14 +306,14 @@ void OMITFP ScaleRowDown38_3_Box_NEON(const uint8* src_ptr, ...@@ -307,14 +306,14 @@ void OMITFP ScaleRowDown38_3_Box_NEON(const uint8* src_ptr,
"ld1 {v30.16b}, [%6] \n" "ld1 {v30.16b}, [%6] \n"
"ld1 {v31.8h}, [%7] \n" "ld1 {v31.8h}, [%7] \n"
"add %2, %2, %0 \n" "add %2, %2, %0 \n"
"1: \n" "1: \n"
// 00 40 01 41 02 42 03 43 // 00 40 01 41 02 42 03 43
// 10 50 11 51 12 52 13 53 // 10 50 11 51 12 52 13 53
// 20 60 21 61 22 62 23 63 // 20 60 21 61 22 62 23 63
// 30 70 31 71 32 72 33 73 // 30 70 31 71 32 72 33 73
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"
"ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%2], #32 \n" "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%2], #32 \n"
"ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%3], #32 \n" "ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%3], #32 \n"
"subs %w4, %w4, #12 \n" "subs %w4, %w4, #12 \n"
...@@ -426,7 +425,7 @@ void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr, ...@@ -426,7 +425,7 @@ void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr,
"ld1 {v30.8h}, [%4] \n" "ld1 {v30.8h}, [%4] \n"
"ld1 {v31.16b}, [%5] \n" "ld1 {v31.16b}, [%5] \n"
"add %2, %2, %0 \n" "add %2, %2, %0 \n"
"1: \n" "1: \n"
// 00 40 01 41 02 42 03 43 // 00 40 01 41 02 42 03 43
// 10 50 11 51 12 52 13 53 // 10 50 11 51 12 52 13 53
...@@ -523,12 +522,12 @@ void ScaleAddRows_NEON(const uint8* src_ptr, ...@@ -523,12 +522,12 @@ void ScaleAddRows_NEON(const uint8* src_ptr,
int src_height) { int src_height) {
const uint8* src_tmp; const uint8* src_tmp;
asm volatile( asm volatile(
"1: \n" "1: \n"
"mov %0, %1 \n" "mov %0, %1 \n"
"mov w12, %w5 \n" "mov w12, %w5 \n"
"eor v2.16b, v2.16b, v2.16b \n" "eor v2.16b, v2.16b, v2.16b \n"
"eor v3.16b, v3.16b, v3.16b \n" "eor v3.16b, v3.16b, v3.16b \n"
"2: \n" "2: \n"
// load 16 pixels into q0 // load 16 pixels into q0
"ld1 {v0.16b}, [%0], %3 \n" "ld1 {v0.16b}, [%0], %3 \n"
"uaddw2 v3.8h, v3.8h, v0.16b \n" "uaddw2 v3.8h, v3.8h, v0.16b \n"
...@@ -550,15 +549,13 @@ void ScaleAddRows_NEON(const uint8* src_ptr, ...@@ -550,15 +549,13 @@ void ScaleAddRows_NEON(const uint8* src_ptr,
); );
} }
// clang-format off
// TODO(Yang Zhang): Investigate less load instructions for // TODO(Yang Zhang): Investigate less load instructions for
// the x/dx stepping // the x/dx stepping
#define LOAD2_DATA8_LANE(n) \ #define LOAD2_DATA8_LANE(n) \
"lsr %5, %3, #16 \n" \ "lsr %5, %3, #16 \n" \
"add %6, %1, %5 \n" \ "add %6, %1, %5 \n" \
"add %3, %3, %4 \n" \ "add %3, %3, %4 \n" \
"ld2 {v4.b, v5.b}[" #n "], [%6] \n" "ld2 {v4.b, v5.b}[" #n "], [%6] \n"
// clang-format on
// The NEON version mimics this formula (from row_common.cc): // The NEON version mimics this formula (from row_common.cc):
// #define BLENDER(a, b, f) (uint8)((int)(a) + // #define BLENDER(a, b, f) (uint8)((int)(a) +
...@@ -572,8 +569,8 @@ void ScaleFilterCols_NEON(uint8* dst_ptr, ...@@ -572,8 +569,8 @@ void ScaleFilterCols_NEON(uint8* dst_ptr,
int dx_offset[4] = {0, 1, 2, 3}; int dx_offset[4] = {0, 1, 2, 3};
int* tmp = dx_offset; int* tmp = dx_offset;
const uint8* src_tmp = src_ptr; const uint8* src_tmp = src_ptr;
int64 x64 = (int64)x; int64 x64 = (int64)x; // NOLINT
int64 dx64 = (int64)dx; int64 dx64 = (int64)dx; // NOLINT
asm volatile ( asm volatile (
"dup v0.4s, %w3 \n" // x "dup v0.4s, %w3 \n" // x
"dup v1.4s, %w4 \n" // dx "dup v1.4s, %w4 \n" // dx
...@@ -651,7 +648,7 @@ void ScaleFilterRows_NEON(uint8* dst_ptr, ...@@ -651,7 +648,7 @@ void ScaleFilterRows_NEON(uint8* dst_ptr,
"dup v5.8b, %w4 \n" "dup v5.8b, %w4 \n"
"dup v4.8b, %w5 \n" "dup v4.8b, %w5 \n"
// General purpose row blend. // General purpose row blend.
"1: \n" "1: \n"
"ld1 {v0.16b}, [%1], #16 \n" "ld1 {v0.16b}, [%1], #16 \n"
"ld1 {v1.16b}, [%2], #16 \n" "ld1 {v1.16b}, [%2], #16 \n"
"subs %w3, %w3, #16 \n" "subs %w3, %w3, #16 \n"
...@@ -666,7 +663,7 @@ void ScaleFilterRows_NEON(uint8* dst_ptr, ...@@ -666,7 +663,7 @@ void ScaleFilterRows_NEON(uint8* dst_ptr,
"b 99f \n" "b 99f \n"
// Blend 25 / 75. // Blend 25 / 75.
"25: \n" "25: \n"
"ld1 {v0.16b}, [%1], #16 \n" "ld1 {v0.16b}, [%1], #16 \n"
"ld1 {v1.16b}, [%2], #16 \n" "ld1 {v1.16b}, [%2], #16 \n"
"subs %w3, %w3, #16 \n" "subs %w3, %w3, #16 \n"
...@@ -677,7 +674,7 @@ void ScaleFilterRows_NEON(uint8* dst_ptr, ...@@ -677,7 +674,7 @@ void ScaleFilterRows_NEON(uint8* dst_ptr,
"b 99f \n" "b 99f \n"
// Blend 50 / 50. // Blend 50 / 50.
"50: \n" "50: \n"
"ld1 {v0.16b}, [%1], #16 \n" "ld1 {v0.16b}, [%1], #16 \n"
"ld1 {v1.16b}, [%2], #16 \n" "ld1 {v1.16b}, [%2], #16 \n"
"subs %w3, %w3, #16 \n" "subs %w3, %w3, #16 \n"
...@@ -687,7 +684,7 @@ void ScaleFilterRows_NEON(uint8* dst_ptr, ...@@ -687,7 +684,7 @@ void ScaleFilterRows_NEON(uint8* dst_ptr,
"b 99f \n" "b 99f \n"
// Blend 75 / 25. // Blend 75 / 25.
"75: \n" "75: \n"
"ld1 {v1.16b}, [%1], #16 \n" "ld1 {v1.16b}, [%1], #16 \n"
"ld1 {v0.16b}, [%2], #16 \n" "ld1 {v0.16b}, [%2], #16 \n"
"subs %w3, %w3, #16 \n" "subs %w3, %w3, #16 \n"
...@@ -698,13 +695,13 @@ void ScaleFilterRows_NEON(uint8* dst_ptr, ...@@ -698,13 +695,13 @@ void ScaleFilterRows_NEON(uint8* dst_ptr,
"b 99f \n" "b 99f \n"
// Blend 100 / 0 - Copy row unchanged. // Blend 100 / 0 - Copy row unchanged.
"100: \n" "100: \n"
"ld1 {v0.16b}, [%1], #16 \n" "ld1 {v0.16b}, [%1], #16 \n"
"subs %w3, %w3, #16 \n" "subs %w3, %w3, #16 \n"
"st1 {v0.16b}, [%0], #16 \n" "st1 {v0.16b}, [%0], #16 \n"
"b.gt 100b \n" "b.gt 100b \n"
"99: \n" "99: \n"
"st1 {v0.b}[15], [%0] \n" "st1 {v0.b}[15], [%0] \n"
: "+r"(dst_ptr), // %0 : "+r"(dst_ptr), // %0
"+r"(src_ptr), // %1 "+r"(src_ptr), // %1
...@@ -724,7 +721,7 @@ void ScaleARGBRowDown2_NEON(const uint8* src_ptr, ...@@ -724,7 +721,7 @@ void ScaleARGBRowDown2_NEON(const uint8* src_ptr,
asm volatile( asm volatile(
"1: \n" "1: \n"
// load 16 ARGB pixels with even pixels into q0/q2, odd into q1/q3 // load 16 ARGB pixels with even pixels into q0/q2, odd into q1/q3
"ld4 {v0.4s,v1.4s,v2.4s,v3.4s}, [%0], #64 \n" "ld4 {v0.4s,v1.4s,v2.4s,v3.4s}, [%0], #64 \n"
"subs %w2, %w2, #8 \n" // 8 processed per loop "subs %w2, %w2, #8 \n" // 8 processed per loop
"mov v2.16b, v3.16b \n" "mov v2.16b, v3.16b \n"
"st2 {v1.4s,v2.4s}, [%1], #32 \n" // store 8 odd pixels "st2 {v1.4s,v2.4s}, [%1], #32 \n" // store 8 odd pixels
...@@ -745,7 +742,7 @@ void ScaleARGBRowDown2Linear_NEON(const uint8* src_argb, ...@@ -745,7 +742,7 @@ void ScaleARGBRowDown2Linear_NEON(const uint8* src_argb,
asm volatile( asm volatile(
"1: \n" "1: \n"
// load 16 ARGB pixels with even pixels into q0/q2, odd into q1/q3 // load 16 ARGB pixels with even pixels into q0/q2, odd into q1/q3
"ld4 {v0.4s,v1.4s,v2.4s,v3.4s}, [%0], #64 \n" "ld4 {v0.4s,v1.4s,v2.4s,v3.4s}, [%0], #64 \n"
"subs %w2, %w2, #8 \n" // 8 processed per loop "subs %w2, %w2, #8 \n" // 8 processed per loop
"urhadd v0.16b, v0.16b, v1.16b \n" // rounding half add "urhadd v0.16b, v0.16b, v1.16b \n" // rounding half add
...@@ -767,17 +764,14 @@ void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr, ...@@ -767,17 +764,14 @@ void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr,
asm volatile( asm volatile(
// change the stride to row 2 pointer // change the stride to row 2 pointer
"add %1, %1, %0 \n" "add %1, %1, %0 \n"
"1: \n" "1: \n"
"ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 8 ARGB "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 8 ARGB
// pixels.
"subs %w3, %w3, #8 \n" // 8 processed per loop. "subs %w3, %w3, #8 \n" // 8 processed per loop.
"uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts. "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts.
"uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts. "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
"uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts. "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts.
"uaddlp v3.8h, v3.16b \n" // A 16 bytes -> 8 shorts. "uaddlp v3.8h, v3.16b \n" // A 16 bytes -> 8 shorts.
"ld4 {v16.16b,v17.16b,v18.16b,v19.16b}, [%1], #64 \n" // load 8 "ld4 {v16.16b,v17.16b,v18.16b,v19.16b}, [%1], #64 \n" // load 8
// more ARGB
// pixels.
"uadalp v0.8h, v16.16b \n" // B 16 bytes -> 8 shorts. "uadalp v0.8h, v16.16b \n" // B 16 bytes -> 8 shorts.
"uadalp v1.8h, v17.16b \n" // G 16 bytes -> 8 shorts. "uadalp v1.8h, v17.16b \n" // G 16 bytes -> 8 shorts.
"uadalp v2.8h, v18.16b \n" // R 16 bytes -> 8 shorts. "uadalp v2.8h, v18.16b \n" // R 16 bytes -> 8 shorts.
...@@ -806,7 +800,7 @@ void ScaleARGBRowDownEven_NEON(const uint8* src_argb, ...@@ -806,7 +800,7 @@ void ScaleARGBRowDownEven_NEON(const uint8* src_argb,
int dst_width) { int dst_width) {
(void)src_stride; (void)src_stride;
asm volatile( asm volatile(
"1: \n" "1: \n"
"ld1 {v0.s}[0], [%0], %3 \n" "ld1 {v0.s}[0], [%0], %3 \n"
"ld1 {v0.s}[1], [%0], %3 \n" "ld1 {v0.s}[1], [%0], %3 \n"
"ld1 {v0.s}[2], [%0], %3 \n" "ld1 {v0.s}[2], [%0], %3 \n"
...@@ -832,7 +826,7 @@ void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb, ...@@ -832,7 +826,7 @@ void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb,
int dst_width) { int dst_width) {
asm volatile( asm volatile(
"add %1, %1, %0 \n" "add %1, %1, %0 \n"
"1: \n" "1: \n"
"ld1 {v0.8b}, [%0], %4 \n" // Read 4 2x2 blocks -> "ld1 {v0.8b}, [%0], %4 \n" // Read 4 2x2 blocks ->
// 2x1 // 2x1
"ld1 {v1.8b}, [%1], %4 \n" "ld1 {v1.8b}, [%1], %4 \n"
...@@ -867,15 +861,13 @@ void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb, ...@@ -867,15 +861,13 @@ void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb,
: "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"); : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16");
} }
// clang-format off
// TODO(Yang Zhang): Investigate less load instructions for // TODO(Yang Zhang): Investigate less load instructions for
// the x/dx stepping // the x/dx stepping
#define LOAD1_DATA32_LANE(vn, n) \ #define LOAD1_DATA32_LANE(vn, n) \
"lsr %5, %3, #16 \n" \ "lsr %5, %3, #16 \n" \
"add %6, %1, %5, lsl #2 \n" \ "add %6, %1, %5, lsl #2 \n" \
"add %3, %3, %4 \n" \ "add %3, %3, %4 \n" \
"ld1 {" #vn ".s}[" #n "], [%6] \n" "ld1 {" #vn ".s}[" #n "], [%6] \n"
// clang-format on
void ScaleARGBCols_NEON(uint8* dst_argb, void ScaleARGBCols_NEON(uint8* dst_argb,
const uint8* src_argb, const uint8* src_argb,
...@@ -883,19 +875,25 @@ void ScaleARGBCols_NEON(uint8* dst_argb, ...@@ -883,19 +875,25 @@ void ScaleARGBCols_NEON(uint8* dst_argb,
int x, int x,
int dx) { int dx) {
const uint8* src_tmp = src_argb; const uint8* src_tmp = src_argb;
int64 x64 = (int64)x; int64 x64 = (int64)x; // NOLINT
int64 dx64 = (int64)dx; int64 dx64 = (int64)dx; // NOLINT
int64 tmp64; int64 tmp64;
asm volatile( asm volatile(
"1: \n" LOAD1_DATA32_LANE( "1: \n"
v0, 0) LOAD1_DATA32_LANE(v0, 1) LOAD1_DATA32_LANE(v0, 2) // clang-format off
LOAD1_DATA32_LANE(v0, 3) LOAD1_DATA32_LANE(v1, 0) LOAD1_DATA32_LANE( LOAD1_DATA32_LANE(v0, 0)
v1, 1) LOAD1_DATA32_LANE(v1, 2) LOAD1_DATA32_LANE(v1, 3) LOAD1_DATA32_LANE(v0, 1)
LOAD1_DATA32_LANE(v0, 2)
"st1 {v0.4s, v1.4s}, [%0], #32 \n" // store pixels LOAD1_DATA32_LANE(v0, 3)
"subs %w2, %w2, #8 \n" // 8 processed per LOAD1_DATA32_LANE(v1, 0)
// loop LOAD1_DATA32_LANE(v1, 1)
"b.gt 1b \n" LOAD1_DATA32_LANE(v1, 2)
LOAD1_DATA32_LANE(v1, 3)
// clang-format on
"st1 {v0.4s, v1.4s}, [%0], #32 \n" // store pixels
"subs %w2, %w2, #8 \n" // 8 processed per
// loop
"b.gt 1b \n"
: "+r"(dst_argb), // %0 : "+r"(dst_argb), // %0
"+r"(src_argb), // %1 "+r"(src_argb), // %1
"+r"(dst_width), // %2 "+r"(dst_width), // %2
...@@ -909,15 +907,13 @@ void ScaleARGBCols_NEON(uint8* dst_argb, ...@@ -909,15 +907,13 @@ void ScaleARGBCols_NEON(uint8* dst_argb,
#undef LOAD1_DATA32_LANE #undef LOAD1_DATA32_LANE
// clang-format off
// TODO(Yang Zhang): Investigate less load instructions for // TODO(Yang Zhang): Investigate less load instructions for
// the x/dx stepping // the x/dx stepping
#define LOAD2_DATA32_LANE(vn1, vn2, n) \ #define LOAD2_DATA32_LANE(vn1, vn2, n) \
"lsr %5, %3, #16 \n" \ "lsr %5, %3, #16 \n" \
"add %6, %1, %5, lsl #2 \n" \ "add %6, %1, %5, lsl #2 \n" \
"add %3, %3, %4 \n" \ "add %3, %3, %4 \n" \
"ld2 {" #vn1 ".s, " #vn2 ".s}[" #n "], [%6] \n" "ld2 {" #vn1 ".s, " #vn2 ".s}[" #n "], [%6] \n"
// clang-format on
void ScaleARGBFilterCols_NEON(uint8* dst_argb, void ScaleARGBFilterCols_NEON(uint8* dst_argb,
const uint8* src_argb, const uint8* src_argb,
...@@ -927,8 +923,8 @@ void ScaleARGBFilterCols_NEON(uint8* dst_argb, ...@@ -927,8 +923,8 @@ void ScaleARGBFilterCols_NEON(uint8* dst_argb,
int dx_offset[4] = {0, 1, 2, 3}; int dx_offset[4] = {0, 1, 2, 3};
int* tmp = dx_offset; int* tmp = dx_offset;
const uint8* src_tmp = src_argb; const uint8* src_tmp = src_argb;
int64 x64 = (int64)x; int64 x64 = (int64)x; // NOLINT
int64 dx64 = (int64)dx; int64 dx64 = (int64)dx; // NOLINT
asm volatile ( asm volatile (
"dup v0.4s, %w3 \n" // x "dup v0.4s, %w3 \n" // x
"dup v1.4s, %w4 \n" // dx "dup v1.4s, %w4 \n" // dx
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment