Commit 0a3d23c8 authored by Frank Barchard

fix clang-format-ing for row arm functions

TBR=kjellander@chromium.org
BUG=None
TEST=git cl lint

Change-Id: I45ecd7f8279981ba037dc051f521f6b6d5506f64
Reviewed-on: https://chromium-review.googlesource.com/664345
Commit-Queue: Frank Barchard <fbarchard@google.com>
Reviewed-by: Frank Barchard <fbarchard@google.com>
parent 753a91cb
......@@ -385,8 +385,7 @@ void NV12ToARGBRow_NEON(const uint8* src_y,
uint8* dst_argb,
const struct YuvConstants* yuvconstants,
int width) {
asm volatile(
YUVTORGB_SETUP
asm volatile(YUVTORGB_SETUP
"vmov.u8 d23, #255 \n"
"1: \n" READNV12 YUVTORGB
"subs %3, %3, #8 \n"
......@@ -400,8 +399,8 @@ void NV12ToARGBRow_NEON(const uint8* src_y,
[kUVToG] "r"(&yuvconstants->kUVToG),
[kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
[kYToRgb] "r"(&yuvconstants->kYToRgb)
: "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11",
"q12", "q13", "q14", "q15");
: "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9",
"q10", "q11", "q12", "q13", "q14", "q15");
}
void NV21ToARGBRow_NEON(const uint8* src_y,
......@@ -409,8 +408,7 @@ void NV21ToARGBRow_NEON(const uint8* src_y,
uint8* dst_argb,
const struct YuvConstants* yuvconstants,
int width) {
asm volatile(
YUVTORGB_SETUP
asm volatile(YUVTORGB_SETUP
"vmov.u8 d23, #255 \n"
"1: \n" READNV21 YUVTORGB
"subs %3, %3, #8 \n"
......@@ -424,8 +422,8 @@ void NV21ToARGBRow_NEON(const uint8* src_y,
[kUVToG] "r"(&yuvconstants->kUVToG),
[kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
[kYToRgb] "r"(&yuvconstants->kYToRgb)
: "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11",
"q12", "q13", "q14", "q15");
: "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9",
"q10", "q11", "q12", "q13", "q14", "q15");
}
void NV12ToRGB565Row_NEON(const uint8* src_y,
......@@ -455,8 +453,7 @@ void YUY2ToARGBRow_NEON(const uint8* src_yuy2,
uint8* dst_argb,
const struct YuvConstants* yuvconstants,
int width) {
asm volatile(
YUVTORGB_SETUP
asm volatile(YUVTORGB_SETUP
"vmov.u8 d23, #255 \n"
"1: \n" READYUY2 YUVTORGB
"subs %2, %2, #8 \n"
......@@ -469,16 +466,15 @@ void YUY2ToARGBRow_NEON(const uint8* src_yuy2,
[kUVToG] "r"(&yuvconstants->kUVToG),
[kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
[kYToRgb] "r"(&yuvconstants->kYToRgb)
: "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11",
"q12", "q13", "q14", "q15");
: "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9",
"q10", "q11", "q12", "q13", "q14", "q15");
}
void UYVYToARGBRow_NEON(const uint8* src_uyvy,
uint8* dst_argb,
const struct YuvConstants* yuvconstants,
int width) {
asm volatile(
YUVTORGB_SETUP
asm volatile(YUVTORGB_SETUP
"vmov.u8 d23, #255 \n"
"1: \n" READUYVY YUVTORGB
"subs %2, %2, #8 \n"
......@@ -491,8 +487,8 @@ void UYVYToARGBRow_NEON(const uint8* src_uyvy,
[kUVToG] "r"(&yuvconstants->kUVToG),
[kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
[kYToRgb] "r"(&yuvconstants->kYToRgb)
: "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11",
"q12", "q13", "q14", "q15");
: "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9",
"q10", "q11", "q12", "q13", "q14", "q15");
}
// Reads 16 pairs of UV and write even values to dst_u and odd to dst_v.
......@@ -1100,8 +1096,9 @@ void ARGBToRGB565DitherRow_NEON(const uint8* src_argb,
"subs %3, %3, #8 \n" // 8 processed per loop.
"vqadd.u8 d20, d20, d2 \n"
"vqadd.u8 d21, d21, d2 \n"
"vqadd.u8 d22, d22, d2 \n" ARGBTORGB565
"vst1.8 {q0}, [%0]! \n" // store 8 pixels RGB565.
"vqadd.u8 d22, d22, d2 \n" // add for dither
ARGBTORGB565
"vst1.8 {q0}, [%0]! \n" // store 8 RGB565.
"bgt 1b \n"
: "+r"(dst_rgb) // %0
: "r"(src_argb), // %1
......@@ -1118,8 +1115,7 @@ void ARGBToARGB1555Row_NEON(const uint8* src_argb,
"vld4.8 {d20, d21, d22, d23}, [%0]! \n" // load 8 pixels of ARGB.
"subs %2, %2, #8 \n" // 8 processed per loop.
ARGBTOARGB1555
"vst1.8 {q0}, [%1]! \n" // store 8 pixels
// ARGB1555.
"vst1.8 {q0}, [%1]! \n" // store 8 ARGB1555.
"bgt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb1555), // %1
......@@ -1138,8 +1134,7 @@ void ARGBToARGB4444Row_NEON(const uint8* src_argb,
"vld4.8 {d20, d21, d22, d23}, [%0]! \n" // load 8 pixels of ARGB.
"subs %2, %2, #8 \n" // 8 processed per loop.
ARGBTOARGB4444
"vst1.8 {q0}, [%1]! \n" // store 8 pixels
// ARGB4444.
"vst1.8 {q0}, [%1]! \n" // store 8 ARGB4444.
"bgt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb4444), // %1
......@@ -1249,24 +1244,20 @@ void ARGBToUV444Row_NEON(const uint8* src_argb,
"q15");
}
// clang-format off
// 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16.
#define RGBTOUV(QB, QG, QR) \
"vmul.s16 q8, " #QB \
", q10 \n" /* B */ \
"vmls.s16 q8, " #QG \
", q11 \n" /* G */ \
"vmls.s16 q8, " #QR \
", q12 \n" /* R */ \
"vmul.s16 q8, " #QB ", q10 \n" /* B */ \
"vmls.s16 q8, " #QG ", q11 \n" /* G */ \
"vmls.s16 q8, " #QR ", q12 \n" /* R */ \
"vadd.u16 q8, q8, q15 \n" /* +128 -> unsigned */ \
"vmul.s16 q9, " #QR \
", q10 \n" /* R */ \
"vmls.s16 q9, " #QG \
", q14 \n" /* G */ \
"vmls.s16 q9, " #QB \
", q13 \n" /* B */ \
"vmul.s16 q9, " #QR ", q10 \n" /* R */ \
"vmls.s16 q9, " #QG ", q14 \n" /* G */ \
"vmls.s16 q9, " #QB ", q13 \n" /* B */ \
"vadd.u16 q9, q9, q15 \n" /* +128 -> unsigned */ \
"vqshrn.u16 d0, q8, #8 \n" /* 16 bit to 8 bit U */ \
"vqshrn.u16 d1, q9, #8 \n" /* 16 bit to 8 bit V */
// clang-format on
// TODO(fbarchard): Consider vhadd vertical, then vpaddl horizontal, avoid shr.
void ARGBToUVRow_NEON(const uint8* src_argb,
......@@ -2326,7 +2317,6 @@ void ARGBMultiplyRow_NEON(const uint8* src_argb0,
"1: \n"
"vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
"vld4.8 {d1, d3, d5, d7}, [%1]! \n" // load 8 more ARGB
// pixels.
"subs %3, %3, #8 \n" // 8 processed per loop.
"vmull.u8 q0, d0, d1 \n" // multiply B
"vmull.u8 q1, d2, d3 \n" // multiply G
......@@ -2338,7 +2328,6 @@ void ARGBMultiplyRow_NEON(const uint8* src_argb0,
"vrshrn.u16 d3, q3, #8 \n" // 16 bit to 8 bit A
"vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels.
"bgt 1b \n"
: "+r"(src_argb0), // %0
"+r"(src_argb1), // %1
"+r"(dst_argb), // %2
......@@ -2357,13 +2346,11 @@ void ARGBAddRow_NEON(const uint8* src_argb0,
"1: \n"
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
"vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load 8 more ARGB
// pixels.
"subs %3, %3, #8 \n" // 8 processed per loop.
"vqadd.u8 q0, q0, q2 \n" // add B, G
"vqadd.u8 q1, q1, q3 \n" // add R, A
"vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels.
"bgt 1b \n"
: "+r"(src_argb0), // %0
"+r"(src_argb1), // %1
"+r"(dst_argb), // %2
......@@ -2382,13 +2369,11 @@ void ARGBSubtractRow_NEON(const uint8* src_argb0,
"1: \n"
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
"vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load 8 more ARGB
// pixels.
"subs %3, %3, #8 \n" // 8 processed per loop.
"vqsub.u8 q0, q0, q2 \n" // subtract B, G
"vqsub.u8 q1, q1, q3 \n" // subtract R, A
"vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels.
"bgt 1b \n"
: "+r"(src_argb0), // %0
"+r"(src_argb1), // %1
"+r"(dst_argb), // %2
......
......@@ -740,7 +740,6 @@ void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int width) {
"ld3 {v1.8b,v2.8b,v3.8b}, [%0], #24 \n" // load 8 pixels of RGB24.
"subs %w2, %w2, #8 \n" // 8 processed per loop.
"st4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%1], #32 \n" // store 8 ARGB
// pixels
"b.gt 1b \n"
: "+r"(src_rgb24), // %0
"+r"(dst_argb), // %1
......@@ -806,7 +805,6 @@ void RGB565ToARGBRow_NEON(const uint8* src_rgb565, uint8* dst_argb, int width) {
"subs %w2, %w2, #8 \n" // 8 processed per loop.
RGB565TOARGB
"st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB
// pixels
"b.gt 1b \n"
: "+r"(src_rgb565), // %0
"+r"(dst_argb), // %1
......@@ -908,7 +906,6 @@ void ARGBToRGB24Row_NEON(const uint8* src_argb, uint8* dst_rgb24, int width) {
asm volatile(
"1: \n"
"ld4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n" // load 8 ARGB
// pixels
"subs %w2, %w2, #8 \n" // 8 processed per loop.
"st3 {v1.8b,v2.8b,v3.8b}, [%1], #24 \n" // store 8 pixels of
// RGB24.
......@@ -975,7 +972,6 @@ void YUY2ToUV422Row_NEON(const uint8* src_yuy2,
asm volatile(
"1: \n"
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 YUY2
// pixels
"subs %w3, %w3, #16 \n" // 16 pixels = 8 UVs.
"st1 {v1.8b}, [%1], #8 \n" // store 8 U.
"st1 {v3.8b}, [%2], #8 \n" // store 8 V.
......@@ -996,7 +992,6 @@ void UYVYToUV422Row_NEON(const uint8* src_uyvy,
asm volatile(
"1: \n"
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 UYVY
// pixels
"subs %w3, %w3, #16 \n" // 16 pixels = 8 UVs.
"st1 {v0.8b}, [%1], #8 \n" // store 8 U.
"st1 {v2.8b}, [%2], #8 \n" // store 8 V.
......@@ -1213,7 +1208,6 @@ void ARGBToYRow_NEON(const uint8* src_argb, uint8* dst_y, int width) {
"movi v7.8b, #16 \n" // Add 16 constant
"1: \n"
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB
// pixels.
"subs %w2, %w2, #8 \n" // 8 processed per loop.
"umull v3.8h, v0.8b, v4.8b \n" // B
"umlal v3.8h, v1.8b, v5.8b \n" // G
......@@ -1252,7 +1246,6 @@ void ARGBToYJRow_NEON(const uint8* src_argb, uint8* dst_y, int width) {
"movi v6.8b, #38 \n" // R * 0.29900 coefficient
"1: \n"
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB
// pixels.
"subs %w2, %w2, #8 \n" // 8 processed per loop.
"umull v3.8h, v0.8b, v4.8b \n" // B
"umlal v3.8h, v1.8b, v5.8b \n" // G
......@@ -1318,23 +1311,19 @@ void ARGBToUV444Row_NEON(const uint8* src_argb,
"movi v25.16b, #0x80 \n" /* 128.5 (0x8080 in 16-bit) */
// 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16.
// clang-format off
#define RGBTOUV(QB, QG, QR) \
"mul v3.8h, " #QB \
",v20.8h \n" /* B */ \
"mul v4.8h, " #QR \
",v20.8h \n" /* R */ \
"mls v3.8h, " #QG \
",v21.8h \n" /* G */ \
"mls v4.8h, " #QG \
",v24.8h \n" /* G */ \
"mls v3.8h, " #QR \
",v22.8h \n" /* R */ \
"mls v4.8h, " #QB \
",v23.8h \n" /* B */ \
"mul v3.8h, " #QB ",v20.8h \n" /* B */ \
"mul v4.8h, " #QR ",v20.8h \n" /* R */ \
"mls v3.8h, " #QG ",v21.8h \n" /* G */ \
"mls v4.8h, " #QG ",v24.8h \n" /* G */ \
"mls v3.8h, " #QR ",v22.8h \n" /* R */ \
"mls v4.8h, " #QB ",v23.8h \n" /* B */ \
"add v3.8h, v3.8h, v25.8h \n" /* +128 -> unsigned */ \
"add v4.8h, v4.8h, v25.8h \n" /* +128 -> unsigned */ \
"uqshrn v0.8b, v3.8h, #8 \n" /* 16 bit to 8 bit U */ \
"uqshrn v1.8b, v4.8h, #8 \n" /* 16 bit to 8 bit V */
// clang-format on
// TODO(fbarchard): Consider vhadd vertical, then vpaddl horizontal, avoid shr.
// TODO(fbarchard): consider ptrdiff_t for all strides.
......@@ -1626,8 +1615,7 @@ void RGB565ToUVRow_NEON(const uint8* src_rgb565,
"movi v24.8h, #19, lsl #0 \n" // UR coeff (-0.2969) / 2
"movi v25.8h, #9 , lsl #0 \n" // VB coeff (-0.1406) / 2
"movi v26.8h, #47, lsl #0 \n" // VG coeff (-0.7344) / 2
"movi v27.16b, #0x80 \n" // 128.5 (0x8080 in
// 16-bit)
"movi v27.16b, #0x80 \n" // 128.5 0x8080 in 16bit
"1: \n"
"ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels.
RGB565TOARGB
......@@ -2138,7 +2126,6 @@ void ARGBAttenuateRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) {
// Attenuate 8 pixels.
"1: \n"
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB
// pixels
"subs %w2, %w2, #8 \n" // 8 processed per loop.
"umull v4.8h, v0.8b, v3.8b \n" // b * a
"umull v5.8h, v1.8b, v3.8b \n" // g * a
......@@ -2171,8 +2158,7 @@ void ARGBQuantizeRow_NEON(uint8* dst_argb,
// 8 pixel loop.
"1: \n"
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0] \n" // load 8 pixels of
// ARGB.
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0] \n" // load 8 ARGB.
"subs %w1, %w1, #8 \n" // 8 processed per loop.
"uxtl v0.8h, v0.8b \n" // b (0 .. 255)
"uxtl v1.8h, v1.8b \n"
......@@ -2190,7 +2176,6 @@ void ARGBQuantizeRow_NEON(uint8* dst_argb,
"uqxtn v1.8b, v1.8h \n"
"uqxtn v2.8b, v2.8h \n"
"st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // store 8 ARGB
// pixels
"b.gt 1b \n"
: "+r"(dst_argb), // %0
"+r"(width) // %1
......@@ -2215,7 +2200,6 @@ void ARGBShadeRow_NEON(const uint8* src_argb,
// 8 pixel loop.
"1: \n"
"ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%0], #32 \n" // load 8 ARGB
// pixels.
"subs %w2, %w2, #8 \n" // 8 processed per loop.
"uxtl v4.8h, v4.8b \n" // b (0 .. 255)
"uxtl v5.8h, v5.8b \n"
......@@ -2230,7 +2214,6 @@ void ARGBShadeRow_NEON(const uint8* src_argb,
"uqxtn v6.8b, v6.8h \n"
"uqxtn v7.8b, v7.8h \n"
"st4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // store 8 ARGB
// pixels
"b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
......@@ -2249,7 +2232,6 @@ void ARGBGrayRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) {
"movi v26.8b, #38 \n" // R * 0.29900 coefficient
"1: \n"
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB
// pixels.
"subs %w2, %w2, #8 \n" // 8 processed per loop.
"umull v4.8h, v0.8b, v24.8b \n" // B
"umlal v4.8h, v1.8b, v25.8b \n" // G
......@@ -2319,8 +2301,7 @@ void ARGBColorMatrixRow_NEON(const uint8* src_argb,
"sxtl2 v1.8h, v2.16b \n" // R,A coefficients s16.
"1: \n"
"ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%0], #32 \n" // load 8
// pixels.
"ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%0], #32 \n" // load 8 ARGB
"subs %w2, %w2, #8 \n" // 8 processed per loop.
"uxtl v16.8h, v16.8b \n" // b (0 .. 255) 16 bit
"uxtl v17.8h, v17.8b \n" // g
......@@ -2358,8 +2339,7 @@ void ARGBColorMatrixRow_NEON(const uint8* src_argb,
"sqshrun v17.8b, v23.8h, #6 \n" // 16 bit to 8 bit G
"sqshrun v18.8b, v24.8h, #6 \n" // 16 bit to 8 bit R
"sqshrun v19.8b, v25.8h, #6 \n" // 16 bit to 8 bit A
"st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%1], #32 \n" // store 8
// pixels.
"st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%1], #32 \n" // store 8 ARGB
"b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
......@@ -2379,9 +2359,7 @@ void ARGBMultiplyRow_NEON(const uint8* src_argb0,
// 8 pixel loop.
"1: \n"
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB
// pixels.
"ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more
// pixels.
"subs %w3, %w3, #8 \n" // 8 processed per loop.
"umull v0.8h, v0.8b, v4.8b \n" // multiply B
"umull v1.8h, v1.8b, v5.8b \n" // multiply G
......@@ -2392,9 +2370,7 @@ void ARGBMultiplyRow_NEON(const uint8* src_argb0,
"rshrn v2.8b, v2.8h, #8 \n" // 16 bit to 8 bit R
"rshrn v3.8b, v3.8h, #8 \n" // 16 bit to 8 bit A
"st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB
// pixels
"b.gt 1b \n"
: "+r"(src_argb0), // %0
"+r"(src_argb1), // %1
"+r"(dst_argb), // %2
......@@ -2412,18 +2388,14 @@ void ARGBAddRow_NEON(const uint8* src_argb0,
// 8 pixel loop.
"1: \n"
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB
// pixels.
"ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more
// pixels.
"subs %w3, %w3, #8 \n" // 8 processed per loop.
"uqadd v0.8b, v0.8b, v4.8b \n"
"uqadd v1.8b, v1.8b, v5.8b \n"
"uqadd v2.8b, v2.8b, v6.8b \n"
"uqadd v3.8b, v3.8b, v7.8b \n"
"st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB
// pixels
"b.gt 1b \n"
: "+r"(src_argb0), // %0
"+r"(src_argb1), // %1
"+r"(dst_argb), // %2
......@@ -2441,18 +2413,14 @@ void ARGBSubtractRow_NEON(const uint8* src_argb0,
// 8 pixel loop.
"1: \n"
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB
// pixels.
"ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more
// pixels.
"subs %w3, %w3, #8 \n" // 8 processed per loop.
"uqsub v0.8b, v0.8b, v4.8b \n"
"uqsub v1.8b, v1.8b, v5.8b \n"
"uqsub v2.8b, v2.8b, v6.8b \n"
"uqsub v3.8b, v3.8b, v7.8b \n"
"st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB
// pixels
"b.gt 1b \n"
: "+r"(src_argb0), // %0
"+r"(src_argb1), // %1
"+r"(dst_argb), // %2
......@@ -2481,7 +2449,6 @@ void SobelRow_NEON(const uint8* src_sobelx,
"orr v1.8b, v0.8b, v0.8b \n"
"orr v2.8b, v0.8b, v0.8b \n"
"st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB
// pixels
"b.gt 1b \n"
: "+r"(src_sobelx), // %0
"+r"(src_sobely), // %1
......@@ -2531,7 +2498,6 @@ void SobelXYRow_NEON(const uint8* src_sobelx,
"subs %w3, %w3, #8 \n" // 8 processed per loop.
"uqadd v1.8b, v0.8b, v2.8b \n" // add
"st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB
// pixels
"b.gt 1b \n"
: "+r"(src_sobelx), // %0
"+r"(src_sobely), // %1
......@@ -2680,7 +2646,6 @@ float ScaleMaxSamples_NEON(const float* src,
"b.gt 1b \n"
"fmax v5.4s, v5.4s, v6.4s \n" // max
"fmaxv %s3, v5.4s \n" // signed max acculator
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width), // %2
......@@ -2707,12 +2672,10 @@ float ScaleSumSamples_NEON(const float* src,
"fmla v5.4s, v1.4s, v1.4s \n" // sum of squares
"fmla v6.4s, v2.4s, v2.4s \n"
"st1 {v3.4s, v4.4s}, [%1], #32 \n" // store 8 samples
"b.gt 1b \n"
"faddp v5.4s, v5.4s, v6.4s \n"
"faddp v5.4s, v5.4s, v5.4s \n"
"faddp %3.4s, v5.4s, v5.4s \n" // sum
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width), // %2
......@@ -2731,7 +2694,6 @@ void ScaleSamples_NEON(const float* src, float* dst, float scale, int width) {
"fmul v2.4s, v2.4s, %3.s[0] \n" // scale
"st1 {v1.4s, v2.4s}, [%1], #32 \n" // store 8 samples
"b.gt 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2
......@@ -2768,7 +2730,6 @@ void GaussCol_NEON(const uint16* src0,
"subs %w6, %w6, #8 \n" // 8 processed per loop
"st1 {v0.4s,v1.4s}, [%5], #32 \n" // store 8 samples
"b.gt 1b \n"
: "+r"(src0), // %0
"+r"(src1), // %1
"+r"(src2), // %2
......@@ -2807,7 +2768,6 @@ void GaussRow_NEON(const uint32* src, uint16* dst, int width) {
"uqrshrn2 v0.8h, v1.4s, #8 \n"
"st1 {v0.8h}, [%4], #16 \n" // store 8 samples
"b.gt 1b \n"
: "+r"(src), // %0
"+r"(src1), // %1
"+r"(src2), // %2
......
......@@ -77,11 +77,9 @@ void ScaleRowDown2Box_NEON(const uint8* src_ptr,
"subs %w3, %w3, #16 \n" // 16 processed per loop
"uaddlp v0.8h, v0.16b \n" // row 1 add adjacent
"uaddlp v1.8h, v1.16b \n"
"uadalp v0.8h, v2.16b \n" // row 2 add adjacent +
// row1
"uadalp v0.8h, v2.16b \n" // += row 2 add adjacent
"uadalp v1.8h, v3.16b \n"
"rshrn v0.8b, v0.8h, #2 \n" // downshift, round and
// pack
"rshrn v0.8b, v0.8h, #2 \n" // round and pack
"rshrn2 v0.16b, v1.8h, #2 \n"
"st1 {v0.16b}, [%2], #16 \n"
"b.gt 1b \n"
......@@ -394,8 +392,7 @@ void OMITFP ScaleRowDown38_3_Box_NEON(const uint8* src_ptr,
"sqrdmulh v0.8h, v20.8h, v31.8h \n"
"sqrdmulh v1.8h, v21.8h, v31.8h \n"
// Align for table lookup, vtbl requires registers to
// be adjacent
// Align for table lookup, vtbl requires registers to be adjacent
"tbl v3.16b, {v0.16b, v1.16b, v2.16b}, v30.16b \n"
"st1 {v3.8b}, [%1], #8 \n"
......@@ -776,8 +773,7 @@ void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr,
"uadalp v1.8h, v17.16b \n" // G 16 bytes -> 8 shorts.
"uadalp v2.8h, v18.16b \n" // R 16 bytes -> 8 shorts.
"uadalp v3.8h, v19.16b \n" // A 16 bytes -> 8 shorts.
"rshrn v0.8b, v0.8h, #2 \n" // downshift, round and
// pack
"rshrn v0.8b, v0.8h, #2 \n" // round and pack
"rshrn v1.8b, v1.8h, #2 \n"
"rshrn v2.8b, v2.8h, #2 \n"
"rshrn v3.8b, v3.8h, #2 \n"
......@@ -827,8 +823,7 @@ void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb,
asm volatile(
"add %1, %1, %0 \n"
"1: \n"
"ld1 {v0.8b}, [%0], %4 \n" // Read 4 2x2 blocks ->
// 2x1
"ld1 {v0.8b}, [%0], %4 \n" // Read 4 2x2 -> 2x1
"ld1 {v1.8b}, [%1], %4 \n"
"ld1 {v2.8b}, [%0], %4 \n"
"ld1 {v3.8b}, [%1], %4 \n"
......@@ -891,8 +886,7 @@ void ScaleARGBCols_NEON(uint8* dst_argb,
LOAD1_DATA32_LANE(v1, 3)
// clang-format on
"st1 {v0.4s, v1.4s}, [%0], #32 \n" // store pixels
"subs %w2, %w2, #8 \n" // 8 processed per
// loop
"subs %w2, %w2, #8 \n" // 8 processed per loop
"b.gt 1b \n"
: "+r"(dst_argb), // %0
"+r"(src_argb), // %1
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment