Commit 0a3d23c8 authored by Frank Barchard

fix clang-format-ing for row arm functions

TBR=kjellander@chromium.org
BUG=None
TEST=git cl lint

Change-Id: I45ecd7f8279981ba037dc051f521f6b6d5506f64
Reviewed-on: https://chromium-review.googlesource.com/664345
Commit-Queue: Frank Barchard <fbarchard@google.com>
Reviewed-by: Frank Barchard <fbarchard@google.com>
parent 753a91cb
...@@ -385,8 +385,7 @@ void NV12ToARGBRow_NEON(const uint8* src_y, ...@@ -385,8 +385,7 @@ void NV12ToARGBRow_NEON(const uint8* src_y,
uint8* dst_argb, uint8* dst_argb,
const struct YuvConstants* yuvconstants, const struct YuvConstants* yuvconstants,
int width) { int width) {
asm volatile( asm volatile(YUVTORGB_SETUP
YUVTORGB_SETUP
"vmov.u8 d23, #255 \n" "vmov.u8 d23, #255 \n"
"1: \n" READNV12 YUVTORGB "1: \n" READNV12 YUVTORGB
"subs %3, %3, #8 \n" "subs %3, %3, #8 \n"
...@@ -400,8 +399,8 @@ void NV12ToARGBRow_NEON(const uint8* src_y, ...@@ -400,8 +399,8 @@ void NV12ToARGBRow_NEON(const uint8* src_y,
[kUVToG] "r"(&yuvconstants->kUVToG), [kUVToG] "r"(&yuvconstants->kUVToG),
[kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR), [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
[kYToRgb] "r"(&yuvconstants->kYToRgb) [kYToRgb] "r"(&yuvconstants->kYToRgb)
: "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11", : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9",
"q12", "q13", "q14", "q15"); "q10", "q11", "q12", "q13", "q14", "q15");
} }
void NV21ToARGBRow_NEON(const uint8* src_y, void NV21ToARGBRow_NEON(const uint8* src_y,
...@@ -409,8 +408,7 @@ void NV21ToARGBRow_NEON(const uint8* src_y, ...@@ -409,8 +408,7 @@ void NV21ToARGBRow_NEON(const uint8* src_y,
uint8* dst_argb, uint8* dst_argb,
const struct YuvConstants* yuvconstants, const struct YuvConstants* yuvconstants,
int width) { int width) {
asm volatile( asm volatile(YUVTORGB_SETUP
YUVTORGB_SETUP
"vmov.u8 d23, #255 \n" "vmov.u8 d23, #255 \n"
"1: \n" READNV21 YUVTORGB "1: \n" READNV21 YUVTORGB
"subs %3, %3, #8 \n" "subs %3, %3, #8 \n"
...@@ -424,8 +422,8 @@ void NV21ToARGBRow_NEON(const uint8* src_y, ...@@ -424,8 +422,8 @@ void NV21ToARGBRow_NEON(const uint8* src_y,
[kUVToG] "r"(&yuvconstants->kUVToG), [kUVToG] "r"(&yuvconstants->kUVToG),
[kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR), [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
[kYToRgb] "r"(&yuvconstants->kYToRgb) [kYToRgb] "r"(&yuvconstants->kYToRgb)
: "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11", : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9",
"q12", "q13", "q14", "q15"); "q10", "q11", "q12", "q13", "q14", "q15");
} }
void NV12ToRGB565Row_NEON(const uint8* src_y, void NV12ToRGB565Row_NEON(const uint8* src_y,
...@@ -455,8 +453,7 @@ void YUY2ToARGBRow_NEON(const uint8* src_yuy2, ...@@ -455,8 +453,7 @@ void YUY2ToARGBRow_NEON(const uint8* src_yuy2,
uint8* dst_argb, uint8* dst_argb,
const struct YuvConstants* yuvconstants, const struct YuvConstants* yuvconstants,
int width) { int width) {
asm volatile( asm volatile(YUVTORGB_SETUP
YUVTORGB_SETUP
"vmov.u8 d23, #255 \n" "vmov.u8 d23, #255 \n"
"1: \n" READYUY2 YUVTORGB "1: \n" READYUY2 YUVTORGB
"subs %2, %2, #8 \n" "subs %2, %2, #8 \n"
...@@ -469,16 +466,15 @@ void YUY2ToARGBRow_NEON(const uint8* src_yuy2, ...@@ -469,16 +466,15 @@ void YUY2ToARGBRow_NEON(const uint8* src_yuy2,
[kUVToG] "r"(&yuvconstants->kUVToG), [kUVToG] "r"(&yuvconstants->kUVToG),
[kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR), [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
[kYToRgb] "r"(&yuvconstants->kYToRgb) [kYToRgb] "r"(&yuvconstants->kYToRgb)
: "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11", : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9",
"q12", "q13", "q14", "q15"); "q10", "q11", "q12", "q13", "q14", "q15");
} }
void UYVYToARGBRow_NEON(const uint8* src_uyvy, void UYVYToARGBRow_NEON(const uint8* src_uyvy,
uint8* dst_argb, uint8* dst_argb,
const struct YuvConstants* yuvconstants, const struct YuvConstants* yuvconstants,
int width) { int width) {
asm volatile( asm volatile(YUVTORGB_SETUP
YUVTORGB_SETUP
"vmov.u8 d23, #255 \n" "vmov.u8 d23, #255 \n"
"1: \n" READUYVY YUVTORGB "1: \n" READUYVY YUVTORGB
"subs %2, %2, #8 \n" "subs %2, %2, #8 \n"
...@@ -491,8 +487,8 @@ void UYVYToARGBRow_NEON(const uint8* src_uyvy, ...@@ -491,8 +487,8 @@ void UYVYToARGBRow_NEON(const uint8* src_uyvy,
[kUVToG] "r"(&yuvconstants->kUVToG), [kUVToG] "r"(&yuvconstants->kUVToG),
[kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR), [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
[kYToRgb] "r"(&yuvconstants->kYToRgb) [kYToRgb] "r"(&yuvconstants->kYToRgb)
: "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11", : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9",
"q12", "q13", "q14", "q15"); "q10", "q11", "q12", "q13", "q14", "q15");
} }
// Reads 16 pairs of UV and write even values to dst_u and odd to dst_v. // Reads 16 pairs of UV and write even values to dst_u and odd to dst_v.
...@@ -1100,8 +1096,9 @@ void ARGBToRGB565DitherRow_NEON(const uint8* src_argb, ...@@ -1100,8 +1096,9 @@ void ARGBToRGB565DitherRow_NEON(const uint8* src_argb,
"subs %3, %3, #8 \n" // 8 processed per loop. "subs %3, %3, #8 \n" // 8 processed per loop.
"vqadd.u8 d20, d20, d2 \n" "vqadd.u8 d20, d20, d2 \n"
"vqadd.u8 d21, d21, d2 \n" "vqadd.u8 d21, d21, d2 \n"
"vqadd.u8 d22, d22, d2 \n" ARGBTORGB565 "vqadd.u8 d22, d22, d2 \n" // add for dither
"vst1.8 {q0}, [%0]! \n" // store 8 pixels RGB565. ARGBTORGB565
"vst1.8 {q0}, [%0]! \n" // store 8 RGB565.
"bgt 1b \n" "bgt 1b \n"
: "+r"(dst_rgb) // %0 : "+r"(dst_rgb) // %0
: "r"(src_argb), // %1 : "r"(src_argb), // %1
...@@ -1118,8 +1115,7 @@ void ARGBToARGB1555Row_NEON(const uint8* src_argb, ...@@ -1118,8 +1115,7 @@ void ARGBToARGB1555Row_NEON(const uint8* src_argb,
"vld4.8 {d20, d21, d22, d23}, [%0]! \n" // load 8 pixels of ARGB. "vld4.8 {d20, d21, d22, d23}, [%0]! \n" // load 8 pixels of ARGB.
"subs %2, %2, #8 \n" // 8 processed per loop. "subs %2, %2, #8 \n" // 8 processed per loop.
ARGBTOARGB1555 ARGBTOARGB1555
"vst1.8 {q0}, [%1]! \n" // store 8 pixels "vst1.8 {q0}, [%1]! \n" // store 8 ARGB1555.
// ARGB1555.
"bgt 1b \n" "bgt 1b \n"
: "+r"(src_argb), // %0 : "+r"(src_argb), // %0
"+r"(dst_argb1555), // %1 "+r"(dst_argb1555), // %1
...@@ -1138,8 +1134,7 @@ void ARGBToARGB4444Row_NEON(const uint8* src_argb, ...@@ -1138,8 +1134,7 @@ void ARGBToARGB4444Row_NEON(const uint8* src_argb,
"vld4.8 {d20, d21, d22, d23}, [%0]! \n" // load 8 pixels of ARGB. "vld4.8 {d20, d21, d22, d23}, [%0]! \n" // load 8 pixels of ARGB.
"subs %2, %2, #8 \n" // 8 processed per loop. "subs %2, %2, #8 \n" // 8 processed per loop.
ARGBTOARGB4444 ARGBTOARGB4444
"vst1.8 {q0}, [%1]! \n" // store 8 pixels "vst1.8 {q0}, [%1]! \n" // store 8 ARGB4444.
// ARGB4444.
"bgt 1b \n" "bgt 1b \n"
: "+r"(src_argb), // %0 : "+r"(src_argb), // %0
"+r"(dst_argb4444), // %1 "+r"(dst_argb4444), // %1
...@@ -1249,24 +1244,20 @@ void ARGBToUV444Row_NEON(const uint8* src_argb, ...@@ -1249,24 +1244,20 @@ void ARGBToUV444Row_NEON(const uint8* src_argb,
"q15"); "q15");
} }
// clang-format off
// 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16. // 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16.
#define RGBTOUV(QB, QG, QR) \ #define RGBTOUV(QB, QG, QR) \
"vmul.s16 q8, " #QB \ "vmul.s16 q8, " #QB ", q10 \n" /* B */ \
", q10 \n" /* B */ \ "vmls.s16 q8, " #QG ", q11 \n" /* G */ \
"vmls.s16 q8, " #QG \ "vmls.s16 q8, " #QR ", q12 \n" /* R */ \
", q11 \n" /* G */ \
"vmls.s16 q8, " #QR \
", q12 \n" /* R */ \
"vadd.u16 q8, q8, q15 \n" /* +128 -> unsigned */ \ "vadd.u16 q8, q8, q15 \n" /* +128 -> unsigned */ \
"vmul.s16 q9, " #QR \ "vmul.s16 q9, " #QR ", q10 \n" /* R */ \
", q10 \n" /* R */ \ "vmls.s16 q9, " #QG ", q14 \n" /* G */ \
"vmls.s16 q9, " #QG \ "vmls.s16 q9, " #QB ", q13 \n" /* B */ \
", q14 \n" /* G */ \
"vmls.s16 q9, " #QB \
", q13 \n" /* B */ \
"vadd.u16 q9, q9, q15 \n" /* +128 -> unsigned */ \ "vadd.u16 q9, q9, q15 \n" /* +128 -> unsigned */ \
"vqshrn.u16 d0, q8, #8 \n" /* 16 bit to 8 bit U */ \ "vqshrn.u16 d0, q8, #8 \n" /* 16 bit to 8 bit U */ \
"vqshrn.u16 d1, q9, #8 \n" /* 16 bit to 8 bit V */ "vqshrn.u16 d1, q9, #8 \n" /* 16 bit to 8 bit V */
// clang-format on
// TODO(fbarchard): Consider vhadd vertical, then vpaddl horizontal, avoid shr. // TODO(fbarchard): Consider vhadd vertical, then vpaddl horizontal, avoid shr.
void ARGBToUVRow_NEON(const uint8* src_argb, void ARGBToUVRow_NEON(const uint8* src_argb,
...@@ -2326,7 +2317,6 @@ void ARGBMultiplyRow_NEON(const uint8* src_argb0, ...@@ -2326,7 +2317,6 @@ void ARGBMultiplyRow_NEON(const uint8* src_argb0,
"1: \n" "1: \n"
"vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
"vld4.8 {d1, d3, d5, d7}, [%1]! \n" // load 8 more ARGB "vld4.8 {d1, d3, d5, d7}, [%1]! \n" // load 8 more ARGB
// pixels.
"subs %3, %3, #8 \n" // 8 processed per loop. "subs %3, %3, #8 \n" // 8 processed per loop.
"vmull.u8 q0, d0, d1 \n" // multiply B "vmull.u8 q0, d0, d1 \n" // multiply B
"vmull.u8 q1, d2, d3 \n" // multiply G "vmull.u8 q1, d2, d3 \n" // multiply G
...@@ -2338,7 +2328,6 @@ void ARGBMultiplyRow_NEON(const uint8* src_argb0, ...@@ -2338,7 +2328,6 @@ void ARGBMultiplyRow_NEON(const uint8* src_argb0,
"vrshrn.u16 d3, q3, #8 \n" // 16 bit to 8 bit A "vrshrn.u16 d3, q3, #8 \n" // 16 bit to 8 bit A
"vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels. "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels.
"bgt 1b \n" "bgt 1b \n"
: "+r"(src_argb0), // %0 : "+r"(src_argb0), // %0
"+r"(src_argb1), // %1 "+r"(src_argb1), // %1
"+r"(dst_argb), // %2 "+r"(dst_argb), // %2
...@@ -2357,13 +2346,11 @@ void ARGBAddRow_NEON(const uint8* src_argb0, ...@@ -2357,13 +2346,11 @@ void ARGBAddRow_NEON(const uint8* src_argb0,
"1: \n" "1: \n"
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
"vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load 8 more ARGB "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load 8 more ARGB
// pixels.
"subs %3, %3, #8 \n" // 8 processed per loop. "subs %3, %3, #8 \n" // 8 processed per loop.
"vqadd.u8 q0, q0, q2 \n" // add B, G "vqadd.u8 q0, q0, q2 \n" // add B, G
"vqadd.u8 q1, q1, q3 \n" // add R, A "vqadd.u8 q1, q1, q3 \n" // add R, A
"vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels. "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels.
"bgt 1b \n" "bgt 1b \n"
: "+r"(src_argb0), // %0 : "+r"(src_argb0), // %0
"+r"(src_argb1), // %1 "+r"(src_argb1), // %1
"+r"(dst_argb), // %2 "+r"(dst_argb), // %2
...@@ -2382,13 +2369,11 @@ void ARGBSubtractRow_NEON(const uint8* src_argb0, ...@@ -2382,13 +2369,11 @@ void ARGBSubtractRow_NEON(const uint8* src_argb0,
"1: \n" "1: \n"
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
"vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load 8 more ARGB "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load 8 more ARGB
// pixels.
"subs %3, %3, #8 \n" // 8 processed per loop. "subs %3, %3, #8 \n" // 8 processed per loop.
"vqsub.u8 q0, q0, q2 \n" // subtract B, G "vqsub.u8 q0, q0, q2 \n" // subtract B, G
"vqsub.u8 q1, q1, q3 \n" // subtract R, A "vqsub.u8 q1, q1, q3 \n" // subtract R, A
"vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels. "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels.
"bgt 1b \n" "bgt 1b \n"
: "+r"(src_argb0), // %0 : "+r"(src_argb0), // %0
"+r"(src_argb1), // %1 "+r"(src_argb1), // %1
"+r"(dst_argb), // %2 "+r"(dst_argb), // %2
......
...@@ -740,7 +740,6 @@ void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int width) { ...@@ -740,7 +740,6 @@ void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int width) {
"ld3 {v1.8b,v2.8b,v3.8b}, [%0], #24 \n" // load 8 pixels of RGB24. "ld3 {v1.8b,v2.8b,v3.8b}, [%0], #24 \n" // load 8 pixels of RGB24.
"subs %w2, %w2, #8 \n" // 8 processed per loop. "subs %w2, %w2, #8 \n" // 8 processed per loop.
"st4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%1], #32 \n" // store 8 ARGB "st4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%1], #32 \n" // store 8 ARGB
// pixels
"b.gt 1b \n" "b.gt 1b \n"
: "+r"(src_rgb24), // %0 : "+r"(src_rgb24), // %0
"+r"(dst_argb), // %1 "+r"(dst_argb), // %1
...@@ -806,7 +805,6 @@ void RGB565ToARGBRow_NEON(const uint8* src_rgb565, uint8* dst_argb, int width) { ...@@ -806,7 +805,6 @@ void RGB565ToARGBRow_NEON(const uint8* src_rgb565, uint8* dst_argb, int width) {
"subs %w2, %w2, #8 \n" // 8 processed per loop. "subs %w2, %w2, #8 \n" // 8 processed per loop.
RGB565TOARGB RGB565TOARGB
"st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB
// pixels
"b.gt 1b \n" "b.gt 1b \n"
: "+r"(src_rgb565), // %0 : "+r"(src_rgb565), // %0
"+r"(dst_argb), // %1 "+r"(dst_argb), // %1
...@@ -908,7 +906,6 @@ void ARGBToRGB24Row_NEON(const uint8* src_argb, uint8* dst_rgb24, int width) { ...@@ -908,7 +906,6 @@ void ARGBToRGB24Row_NEON(const uint8* src_argb, uint8* dst_rgb24, int width) {
asm volatile( asm volatile(
"1: \n" "1: \n"
"ld4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n" // load 8 ARGB "ld4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n" // load 8 ARGB
// pixels
"subs %w2, %w2, #8 \n" // 8 processed per loop. "subs %w2, %w2, #8 \n" // 8 processed per loop.
"st3 {v1.8b,v2.8b,v3.8b}, [%1], #24 \n" // store 8 pixels of "st3 {v1.8b,v2.8b,v3.8b}, [%1], #24 \n" // store 8 pixels of
// RGB24. // RGB24.
...@@ -975,7 +972,6 @@ void YUY2ToUV422Row_NEON(const uint8* src_yuy2, ...@@ -975,7 +972,6 @@ void YUY2ToUV422Row_NEON(const uint8* src_yuy2,
asm volatile( asm volatile(
"1: \n" "1: \n"
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 YUY2 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 YUY2
// pixels
"subs %w3, %w3, #16 \n" // 16 pixels = 8 UVs. "subs %w3, %w3, #16 \n" // 16 pixels = 8 UVs.
"st1 {v1.8b}, [%1], #8 \n" // store 8 U. "st1 {v1.8b}, [%1], #8 \n" // store 8 U.
"st1 {v3.8b}, [%2], #8 \n" // store 8 V. "st1 {v3.8b}, [%2], #8 \n" // store 8 V.
...@@ -996,7 +992,6 @@ void UYVYToUV422Row_NEON(const uint8* src_uyvy, ...@@ -996,7 +992,6 @@ void UYVYToUV422Row_NEON(const uint8* src_uyvy,
asm volatile( asm volatile(
"1: \n" "1: \n"
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 UYVY "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 UYVY
// pixels
"subs %w3, %w3, #16 \n" // 16 pixels = 8 UVs. "subs %w3, %w3, #16 \n" // 16 pixels = 8 UVs.
"st1 {v0.8b}, [%1], #8 \n" // store 8 U. "st1 {v0.8b}, [%1], #8 \n" // store 8 U.
"st1 {v2.8b}, [%2], #8 \n" // store 8 V. "st1 {v2.8b}, [%2], #8 \n" // store 8 V.
...@@ -1213,7 +1208,6 @@ void ARGBToYRow_NEON(const uint8* src_argb, uint8* dst_y, int width) { ...@@ -1213,7 +1208,6 @@ void ARGBToYRow_NEON(const uint8* src_argb, uint8* dst_y, int width) {
"movi v7.8b, #16 \n" // Add 16 constant "movi v7.8b, #16 \n" // Add 16 constant
"1: \n" "1: \n"
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB
// pixels.
"subs %w2, %w2, #8 \n" // 8 processed per loop. "subs %w2, %w2, #8 \n" // 8 processed per loop.
"umull v3.8h, v0.8b, v4.8b \n" // B "umull v3.8h, v0.8b, v4.8b \n" // B
"umlal v3.8h, v1.8b, v5.8b \n" // G "umlal v3.8h, v1.8b, v5.8b \n" // G
...@@ -1252,7 +1246,6 @@ void ARGBToYJRow_NEON(const uint8* src_argb, uint8* dst_y, int width) { ...@@ -1252,7 +1246,6 @@ void ARGBToYJRow_NEON(const uint8* src_argb, uint8* dst_y, int width) {
"movi v6.8b, #38 \n" // R * 0.29900 coefficient "movi v6.8b, #38 \n" // R * 0.29900 coefficient
"1: \n" "1: \n"
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB
// pixels.
"subs %w2, %w2, #8 \n" // 8 processed per loop. "subs %w2, %w2, #8 \n" // 8 processed per loop.
"umull v3.8h, v0.8b, v4.8b \n" // B "umull v3.8h, v0.8b, v4.8b \n" // B
"umlal v3.8h, v1.8b, v5.8b \n" // G "umlal v3.8h, v1.8b, v5.8b \n" // G
...@@ -1318,23 +1311,19 @@ void ARGBToUV444Row_NEON(const uint8* src_argb, ...@@ -1318,23 +1311,19 @@ void ARGBToUV444Row_NEON(const uint8* src_argb,
"movi v25.16b, #0x80 \n" /* 128.5 (0x8080 in 16-bit) */ "movi v25.16b, #0x80 \n" /* 128.5 (0x8080 in 16-bit) */
// 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16. // 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16.
// clang-format off
#define RGBTOUV(QB, QG, QR) \ #define RGBTOUV(QB, QG, QR) \
"mul v3.8h, " #QB \ "mul v3.8h, " #QB ",v20.8h \n" /* B */ \
",v20.8h \n" /* B */ \ "mul v4.8h, " #QR ",v20.8h \n" /* R */ \
"mul v4.8h, " #QR \ "mls v3.8h, " #QG ",v21.8h \n" /* G */ \
",v20.8h \n" /* R */ \ "mls v4.8h, " #QG ",v24.8h \n" /* G */ \
"mls v3.8h, " #QG \ "mls v3.8h, " #QR ",v22.8h \n" /* R */ \
",v21.8h \n" /* G */ \ "mls v4.8h, " #QB ",v23.8h \n" /* B */ \
"mls v4.8h, " #QG \
",v24.8h \n" /* G */ \
"mls v3.8h, " #QR \
",v22.8h \n" /* R */ \
"mls v4.8h, " #QB \
",v23.8h \n" /* B */ \
"add v3.8h, v3.8h, v25.8h \n" /* +128 -> unsigned */ \ "add v3.8h, v3.8h, v25.8h \n" /* +128 -> unsigned */ \
"add v4.8h, v4.8h, v25.8h \n" /* +128 -> unsigned */ \ "add v4.8h, v4.8h, v25.8h \n" /* +128 -> unsigned */ \
"uqshrn v0.8b, v3.8h, #8 \n" /* 16 bit to 8 bit U */ \ "uqshrn v0.8b, v3.8h, #8 \n" /* 16 bit to 8 bit U */ \
"uqshrn v1.8b, v4.8h, #8 \n" /* 16 bit to 8 bit V */ "uqshrn v1.8b, v4.8h, #8 \n" /* 16 bit to 8 bit V */
// clang-format on
// TODO(fbarchard): Consider vhadd vertical, then vpaddl horizontal, avoid shr. // TODO(fbarchard): Consider vhadd vertical, then vpaddl horizontal, avoid shr.
// TODO(fbarchard): consider ptrdiff_t for all strides. // TODO(fbarchard): consider ptrdiff_t for all strides.
...@@ -1626,8 +1615,7 @@ void RGB565ToUVRow_NEON(const uint8* src_rgb565, ...@@ -1626,8 +1615,7 @@ void RGB565ToUVRow_NEON(const uint8* src_rgb565,
"movi v24.8h, #19, lsl #0 \n" // UR coeff (-0.2969) / 2 "movi v24.8h, #19, lsl #0 \n" // UR coeff (-0.2969) / 2
"movi v25.8h, #9 , lsl #0 \n" // VB coeff (-0.1406) / 2 "movi v25.8h, #9 , lsl #0 \n" // VB coeff (-0.1406) / 2
"movi v26.8h, #47, lsl #0 \n" // VG coeff (-0.7344) / 2 "movi v26.8h, #47, lsl #0 \n" // VG coeff (-0.7344) / 2
"movi v27.16b, #0x80 \n" // 128.5 (0x8080 in "movi v27.16b, #0x80 \n" // 128.5 0x8080 in 16bit
// 16-bit)
"1: \n" "1: \n"
"ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels. "ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels.
RGB565TOARGB RGB565TOARGB
...@@ -2138,7 +2126,6 @@ void ARGBAttenuateRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) { ...@@ -2138,7 +2126,6 @@ void ARGBAttenuateRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) {
// Attenuate 8 pixels. // Attenuate 8 pixels.
"1: \n" "1: \n"
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB
// pixels
"subs %w2, %w2, #8 \n" // 8 processed per loop. "subs %w2, %w2, #8 \n" // 8 processed per loop.
"umull v4.8h, v0.8b, v3.8b \n" // b * a "umull v4.8h, v0.8b, v3.8b \n" // b * a
"umull v5.8h, v1.8b, v3.8b \n" // g * a "umull v5.8h, v1.8b, v3.8b \n" // g * a
...@@ -2171,8 +2158,7 @@ void ARGBQuantizeRow_NEON(uint8* dst_argb, ...@@ -2171,8 +2158,7 @@ void ARGBQuantizeRow_NEON(uint8* dst_argb,
// 8 pixel loop. // 8 pixel loop.
"1: \n" "1: \n"
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0] \n" // load 8 pixels of "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0] \n" // load 8 ARGB.
// ARGB.
"subs %w1, %w1, #8 \n" // 8 processed per loop. "subs %w1, %w1, #8 \n" // 8 processed per loop.
"uxtl v0.8h, v0.8b \n" // b (0 .. 255) "uxtl v0.8h, v0.8b \n" // b (0 .. 255)
"uxtl v1.8h, v1.8b \n" "uxtl v1.8h, v1.8b \n"
...@@ -2190,7 +2176,6 @@ void ARGBQuantizeRow_NEON(uint8* dst_argb, ...@@ -2190,7 +2176,6 @@ void ARGBQuantizeRow_NEON(uint8* dst_argb,
"uqxtn v1.8b, v1.8h \n" "uqxtn v1.8b, v1.8h \n"
"uqxtn v2.8b, v2.8h \n" "uqxtn v2.8b, v2.8h \n"
"st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // store 8 ARGB "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // store 8 ARGB
// pixels
"b.gt 1b \n" "b.gt 1b \n"
: "+r"(dst_argb), // %0 : "+r"(dst_argb), // %0
"+r"(width) // %1 "+r"(width) // %1
...@@ -2215,7 +2200,6 @@ void ARGBShadeRow_NEON(const uint8* src_argb, ...@@ -2215,7 +2200,6 @@ void ARGBShadeRow_NEON(const uint8* src_argb,
// 8 pixel loop. // 8 pixel loop.
"1: \n" "1: \n"
"ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%0], #32 \n" // load 8 ARGB "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%0], #32 \n" // load 8 ARGB
// pixels.
"subs %w2, %w2, #8 \n" // 8 processed per loop. "subs %w2, %w2, #8 \n" // 8 processed per loop.
"uxtl v4.8h, v4.8b \n" // b (0 .. 255) "uxtl v4.8h, v4.8b \n" // b (0 .. 255)
"uxtl v5.8h, v5.8b \n" "uxtl v5.8h, v5.8b \n"
...@@ -2230,7 +2214,6 @@ void ARGBShadeRow_NEON(const uint8* src_argb, ...@@ -2230,7 +2214,6 @@ void ARGBShadeRow_NEON(const uint8* src_argb,
"uqxtn v6.8b, v6.8h \n" "uqxtn v6.8b, v6.8h \n"
"uqxtn v7.8b, v7.8h \n" "uqxtn v7.8b, v7.8h \n"
"st4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // store 8 ARGB "st4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // store 8 ARGB
// pixels
"b.gt 1b \n" "b.gt 1b \n"
: "+r"(src_argb), // %0 : "+r"(src_argb), // %0
"+r"(dst_argb), // %1 "+r"(dst_argb), // %1
...@@ -2249,7 +2232,6 @@ void ARGBGrayRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) { ...@@ -2249,7 +2232,6 @@ void ARGBGrayRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) {
"movi v26.8b, #38 \n" // R * 0.29900 coefficient "movi v26.8b, #38 \n" // R * 0.29900 coefficient
"1: \n" "1: \n"
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB
// pixels.
"subs %w2, %w2, #8 \n" // 8 processed per loop. "subs %w2, %w2, #8 \n" // 8 processed per loop.
"umull v4.8h, v0.8b, v24.8b \n" // B "umull v4.8h, v0.8b, v24.8b \n" // B
"umlal v4.8h, v1.8b, v25.8b \n" // G "umlal v4.8h, v1.8b, v25.8b \n" // G
...@@ -2319,8 +2301,7 @@ void ARGBColorMatrixRow_NEON(const uint8* src_argb, ...@@ -2319,8 +2301,7 @@ void ARGBColorMatrixRow_NEON(const uint8* src_argb,
"sxtl2 v1.8h, v2.16b \n" // R,A coefficients s16. "sxtl2 v1.8h, v2.16b \n" // R,A coefficients s16.
"1: \n" "1: \n"
"ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%0], #32 \n" // load 8 "ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%0], #32 \n" // load 8 ARGB
// pixels.
"subs %w2, %w2, #8 \n" // 8 processed per loop. "subs %w2, %w2, #8 \n" // 8 processed per loop.
"uxtl v16.8h, v16.8b \n" // b (0 .. 255) 16 bit "uxtl v16.8h, v16.8b \n" // b (0 .. 255) 16 bit
"uxtl v17.8h, v17.8b \n" // g "uxtl v17.8h, v17.8b \n" // g
...@@ -2358,8 +2339,7 @@ void ARGBColorMatrixRow_NEON(const uint8* src_argb, ...@@ -2358,8 +2339,7 @@ void ARGBColorMatrixRow_NEON(const uint8* src_argb,
"sqshrun v17.8b, v23.8h, #6 \n" // 16 bit to 8 bit G "sqshrun v17.8b, v23.8h, #6 \n" // 16 bit to 8 bit G
"sqshrun v18.8b, v24.8h, #6 \n" // 16 bit to 8 bit R "sqshrun v18.8b, v24.8h, #6 \n" // 16 bit to 8 bit R
"sqshrun v19.8b, v25.8h, #6 \n" // 16 bit to 8 bit A "sqshrun v19.8b, v25.8h, #6 \n" // 16 bit to 8 bit A
"st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%1], #32 \n" // store 8 "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%1], #32 \n" // store 8 ARGB
// pixels.
"b.gt 1b \n" "b.gt 1b \n"
: "+r"(src_argb), // %0 : "+r"(src_argb), // %0
"+r"(dst_argb), // %1 "+r"(dst_argb), // %1
...@@ -2379,9 +2359,7 @@ void ARGBMultiplyRow_NEON(const uint8* src_argb0, ...@@ -2379,9 +2359,7 @@ void ARGBMultiplyRow_NEON(const uint8* src_argb0,
// 8 pixel loop. // 8 pixel loop.
"1: \n" "1: \n"
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB
// pixels.
"ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more
// pixels.
"subs %w3, %w3, #8 \n" // 8 processed per loop. "subs %w3, %w3, #8 \n" // 8 processed per loop.
"umull v0.8h, v0.8b, v4.8b \n" // multiply B "umull v0.8h, v0.8b, v4.8b \n" // multiply B
"umull v1.8h, v1.8b, v5.8b \n" // multiply G "umull v1.8h, v1.8b, v5.8b \n" // multiply G
...@@ -2392,9 +2370,7 @@ void ARGBMultiplyRow_NEON(const uint8* src_argb0, ...@@ -2392,9 +2370,7 @@ void ARGBMultiplyRow_NEON(const uint8* src_argb0,
"rshrn v2.8b, v2.8h, #8 \n" // 16 bit to 8 bit R "rshrn v2.8b, v2.8h, #8 \n" // 16 bit to 8 bit R
"rshrn v3.8b, v3.8h, #8 \n" // 16 bit to 8 bit A "rshrn v3.8b, v3.8h, #8 \n" // 16 bit to 8 bit A
"st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB
// pixels
"b.gt 1b \n" "b.gt 1b \n"
: "+r"(src_argb0), // %0 : "+r"(src_argb0), // %0
"+r"(src_argb1), // %1 "+r"(src_argb1), // %1
"+r"(dst_argb), // %2 "+r"(dst_argb), // %2
...@@ -2412,18 +2388,14 @@ void ARGBAddRow_NEON(const uint8* src_argb0, ...@@ -2412,18 +2388,14 @@ void ARGBAddRow_NEON(const uint8* src_argb0,
// 8 pixel loop. // 8 pixel loop.
"1: \n" "1: \n"
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB
// pixels.
"ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more
// pixels.
"subs %w3, %w3, #8 \n" // 8 processed per loop. "subs %w3, %w3, #8 \n" // 8 processed per loop.
"uqadd v0.8b, v0.8b, v4.8b \n" "uqadd v0.8b, v0.8b, v4.8b \n"
"uqadd v1.8b, v1.8b, v5.8b \n" "uqadd v1.8b, v1.8b, v5.8b \n"
"uqadd v2.8b, v2.8b, v6.8b \n" "uqadd v2.8b, v2.8b, v6.8b \n"
"uqadd v3.8b, v3.8b, v7.8b \n" "uqadd v3.8b, v3.8b, v7.8b \n"
"st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB
// pixels
"b.gt 1b \n" "b.gt 1b \n"
: "+r"(src_argb0), // %0 : "+r"(src_argb0), // %0
"+r"(src_argb1), // %1 "+r"(src_argb1), // %1
"+r"(dst_argb), // %2 "+r"(dst_argb), // %2
...@@ -2441,18 +2413,14 @@ void ARGBSubtractRow_NEON(const uint8* src_argb0, ...@@ -2441,18 +2413,14 @@ void ARGBSubtractRow_NEON(const uint8* src_argb0,
// 8 pixel loop. // 8 pixel loop.
"1: \n" "1: \n"
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB
// pixels.
"ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more
// pixels.
"subs %w3, %w3, #8 \n" // 8 processed per loop. "subs %w3, %w3, #8 \n" // 8 processed per loop.
"uqsub v0.8b, v0.8b, v4.8b \n" "uqsub v0.8b, v0.8b, v4.8b \n"
"uqsub v1.8b, v1.8b, v5.8b \n" "uqsub v1.8b, v1.8b, v5.8b \n"
"uqsub v2.8b, v2.8b, v6.8b \n" "uqsub v2.8b, v2.8b, v6.8b \n"
"uqsub v3.8b, v3.8b, v7.8b \n" "uqsub v3.8b, v3.8b, v7.8b \n"
"st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB
// pixels
"b.gt 1b \n" "b.gt 1b \n"
: "+r"(src_argb0), // %0 : "+r"(src_argb0), // %0
"+r"(src_argb1), // %1 "+r"(src_argb1), // %1
"+r"(dst_argb), // %2 "+r"(dst_argb), // %2
...@@ -2481,7 +2449,6 @@ void SobelRow_NEON(const uint8* src_sobelx, ...@@ -2481,7 +2449,6 @@ void SobelRow_NEON(const uint8* src_sobelx,
"orr v1.8b, v0.8b, v0.8b \n" "orr v1.8b, v0.8b, v0.8b \n"
"orr v2.8b, v0.8b, v0.8b \n" "orr v2.8b, v0.8b, v0.8b \n"
"st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB
// pixels
"b.gt 1b \n" "b.gt 1b \n"
: "+r"(src_sobelx), // %0 : "+r"(src_sobelx), // %0
"+r"(src_sobely), // %1 "+r"(src_sobely), // %1
...@@ -2531,7 +2498,6 @@ void SobelXYRow_NEON(const uint8* src_sobelx, ...@@ -2531,7 +2498,6 @@ void SobelXYRow_NEON(const uint8* src_sobelx,
"subs %w3, %w3, #8 \n" // 8 processed per loop. "subs %w3, %w3, #8 \n" // 8 processed per loop.
"uqadd v1.8b, v0.8b, v2.8b \n" // add "uqadd v1.8b, v0.8b, v2.8b \n" // add
"st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB
// pixels
"b.gt 1b \n" "b.gt 1b \n"
: "+r"(src_sobelx), // %0 : "+r"(src_sobelx), // %0
"+r"(src_sobely), // %1 "+r"(src_sobely), // %1
...@@ -2680,7 +2646,6 @@ float ScaleMaxSamples_NEON(const float* src, ...@@ -2680,7 +2646,6 @@ float ScaleMaxSamples_NEON(const float* src,
"b.gt 1b \n" "b.gt 1b \n"
"fmax v5.4s, v5.4s, v6.4s \n" // max "fmax v5.4s, v5.4s, v6.4s \n" // max
"fmaxv %s3, v5.4s \n" // signed max acculator "fmaxv %s3, v5.4s \n" // signed max acculator
: "+r"(src), // %0 : "+r"(src), // %0
"+r"(dst), // %1 "+r"(dst), // %1
"+r"(width), // %2 "+r"(width), // %2
...@@ -2707,12 +2672,10 @@ float ScaleSumSamples_NEON(const float* src, ...@@ -2707,12 +2672,10 @@ float ScaleSumSamples_NEON(const float* src,
"fmla v5.4s, v1.4s, v1.4s \n" // sum of squares "fmla v5.4s, v1.4s, v1.4s \n" // sum of squares
"fmla v6.4s, v2.4s, v2.4s \n" "fmla v6.4s, v2.4s, v2.4s \n"
"st1 {v3.4s, v4.4s}, [%1], #32 \n" // store 8 samples "st1 {v3.4s, v4.4s}, [%1], #32 \n" // store 8 samples
"b.gt 1b \n" "b.gt 1b \n"
"faddp v5.4s, v5.4s, v6.4s \n" "faddp v5.4s, v5.4s, v6.4s \n"
"faddp v5.4s, v5.4s, v5.4s \n" "faddp v5.4s, v5.4s, v5.4s \n"
"faddp %3.4s, v5.4s, v5.4s \n" // sum "faddp %3.4s, v5.4s, v5.4s \n" // sum
: "+r"(src), // %0 : "+r"(src), // %0
"+r"(dst), // %1 "+r"(dst), // %1
"+r"(width), // %2 "+r"(width), // %2
...@@ -2731,7 +2694,6 @@ void ScaleSamples_NEON(const float* src, float* dst, float scale, int width) { ...@@ -2731,7 +2694,6 @@ void ScaleSamples_NEON(const float* src, float* dst, float scale, int width) {
"fmul v2.4s, v2.4s, %3.s[0] \n" // scale "fmul v2.4s, v2.4s, %3.s[0] \n" // scale
"st1 {v1.4s, v2.4s}, [%1], #32 \n" // store 8 samples "st1 {v1.4s, v2.4s}, [%1], #32 \n" // store 8 samples
"b.gt 1b \n" "b.gt 1b \n"
: "+r"(src), // %0 : "+r"(src), // %0
"+r"(dst), // %1 "+r"(dst), // %1
"+r"(width) // %2 "+r"(width) // %2
...@@ -2768,7 +2730,6 @@ void GaussCol_NEON(const uint16* src0, ...@@ -2768,7 +2730,6 @@ void GaussCol_NEON(const uint16* src0,
"subs %w6, %w6, #8 \n" // 8 processed per loop "subs %w6, %w6, #8 \n" // 8 processed per loop
"st1 {v0.4s,v1.4s}, [%5], #32 \n" // store 8 samples "st1 {v0.4s,v1.4s}, [%5], #32 \n" // store 8 samples
"b.gt 1b \n" "b.gt 1b \n"
: "+r"(src0), // %0 : "+r"(src0), // %0
"+r"(src1), // %1 "+r"(src1), // %1
"+r"(src2), // %2 "+r"(src2), // %2
...@@ -2807,7 +2768,6 @@ void GaussRow_NEON(const uint32* src, uint16* dst, int width) { ...@@ -2807,7 +2768,6 @@ void GaussRow_NEON(const uint32* src, uint16* dst, int width) {
"uqrshrn2 v0.8h, v1.4s, #8 \n" "uqrshrn2 v0.8h, v1.4s, #8 \n"
"st1 {v0.8h}, [%4], #16 \n" // store 8 samples "st1 {v0.8h}, [%4], #16 \n" // store 8 samples
"b.gt 1b \n" "b.gt 1b \n"
: "+r"(src), // %0 : "+r"(src), // %0
"+r"(src1), // %1 "+r"(src1), // %1
"+r"(src2), // %2 "+r"(src2), // %2
......
...@@ -77,11 +77,9 @@ void ScaleRowDown2Box_NEON(const uint8* src_ptr, ...@@ -77,11 +77,9 @@ void ScaleRowDown2Box_NEON(const uint8* src_ptr,
"subs %w3, %w3, #16 \n" // 16 processed per loop "subs %w3, %w3, #16 \n" // 16 processed per loop
"uaddlp v0.8h, v0.16b \n" // row 1 add adjacent "uaddlp v0.8h, v0.16b \n" // row 1 add adjacent
"uaddlp v1.8h, v1.16b \n" "uaddlp v1.8h, v1.16b \n"
"uadalp v0.8h, v2.16b \n" // row 2 add adjacent + "uadalp v0.8h, v2.16b \n" // += row 2 add adjacent
// row1
"uadalp v1.8h, v3.16b \n" "uadalp v1.8h, v3.16b \n"
"rshrn v0.8b, v0.8h, #2 \n" // downshift, round and "rshrn v0.8b, v0.8h, #2 \n" // round and pack
// pack
"rshrn2 v0.16b, v1.8h, #2 \n" "rshrn2 v0.16b, v1.8h, #2 \n"
"st1 {v0.16b}, [%2], #16 \n" "st1 {v0.16b}, [%2], #16 \n"
"b.gt 1b \n" "b.gt 1b \n"
...@@ -394,8 +392,7 @@ void OMITFP ScaleRowDown38_3_Box_NEON(const uint8* src_ptr, ...@@ -394,8 +392,7 @@ void OMITFP ScaleRowDown38_3_Box_NEON(const uint8* src_ptr,
"sqrdmulh v0.8h, v20.8h, v31.8h \n" "sqrdmulh v0.8h, v20.8h, v31.8h \n"
"sqrdmulh v1.8h, v21.8h, v31.8h \n" "sqrdmulh v1.8h, v21.8h, v31.8h \n"
// Align for table lookup, vtbl requires registers to // Align for table lookup, vtbl requires registers to be adjacent
// be adjacent
"tbl v3.16b, {v0.16b, v1.16b, v2.16b}, v30.16b \n" "tbl v3.16b, {v0.16b, v1.16b, v2.16b}, v30.16b \n"
"st1 {v3.8b}, [%1], #8 \n" "st1 {v3.8b}, [%1], #8 \n"
...@@ -776,8 +773,7 @@ void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr, ...@@ -776,8 +773,7 @@ void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr,
"uadalp v1.8h, v17.16b \n" // G 16 bytes -> 8 shorts. "uadalp v1.8h, v17.16b \n" // G 16 bytes -> 8 shorts.
"uadalp v2.8h, v18.16b \n" // R 16 bytes -> 8 shorts. "uadalp v2.8h, v18.16b \n" // R 16 bytes -> 8 shorts.
"uadalp v3.8h, v19.16b \n" // A 16 bytes -> 8 shorts. "uadalp v3.8h, v19.16b \n" // A 16 bytes -> 8 shorts.
"rshrn v0.8b, v0.8h, #2 \n" // downshift, round and "rshrn v0.8b, v0.8h, #2 \n" // round and pack
// pack
"rshrn v1.8b, v1.8h, #2 \n" "rshrn v1.8b, v1.8h, #2 \n"
"rshrn v2.8b, v2.8h, #2 \n" "rshrn v2.8b, v2.8h, #2 \n"
"rshrn v3.8b, v3.8h, #2 \n" "rshrn v3.8b, v3.8h, #2 \n"
...@@ -827,8 +823,7 @@ void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb, ...@@ -827,8 +823,7 @@ void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb,
asm volatile( asm volatile(
"add %1, %1, %0 \n" "add %1, %1, %0 \n"
"1: \n" "1: \n"
"ld1 {v0.8b}, [%0], %4 \n" // Read 4 2x2 blocks -> "ld1 {v0.8b}, [%0], %4 \n" // Read 4 2x2 -> 2x1
// 2x1
"ld1 {v1.8b}, [%1], %4 \n" "ld1 {v1.8b}, [%1], %4 \n"
"ld1 {v2.8b}, [%0], %4 \n" "ld1 {v2.8b}, [%0], %4 \n"
"ld1 {v3.8b}, [%1], %4 \n" "ld1 {v3.8b}, [%1], %4 \n"
...@@ -891,8 +886,7 @@ void ScaleARGBCols_NEON(uint8* dst_argb, ...@@ -891,8 +886,7 @@ void ScaleARGBCols_NEON(uint8* dst_argb,
LOAD1_DATA32_LANE(v1, 3) LOAD1_DATA32_LANE(v1, 3)
// clang-format on // clang-format on
"st1 {v0.4s, v1.4s}, [%0], #32 \n" // store pixels "st1 {v0.4s, v1.4s}, [%0], #32 \n" // store pixels
"subs %w2, %w2, #8 \n" // 8 processed per "subs %w2, %w2, #8 \n" // 8 processed per loop
// loop
"b.gt 1b \n" "b.gt 1b \n"
: "+r"(dst_argb), // %0 : "+r"(dst_argb), // %0
"+r"(src_argb), // %1 "+r"(src_argb), // %1
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment