Commit 0a3d23c8 authored by Frank Barchard

fix clang-format-ing for row arm functions

TBR=kjellander@chromium.org
BUG=None
TEST=git cl lint

Change-Id: I45ecd7f8279981ba037dc051f521f6b6d5506f64
Reviewed-on: https://chromium-review.googlesource.com/664345
Commit-Queue: Frank Barchard <fbarchard@google.com>
Reviewed-by: Frank Barchard <fbarchard@google.com>
parent 753a91cb
......@@ -29,7 +29,7 @@ uint32 HammingDistance_NEON(const uint8* src_a, const uint8* src_b, int count) {
asm volatile(
"vmov.u16 q4, #0 \n" // accumulator
"1: \n"
"1: \n"
"vld1.8 {q0, q1}, [%0]! \n"
"vld1.8 {q2, q3}, [%1]! \n"
"veor.32 q0, q0, q2 \n"
......@@ -60,7 +60,7 @@ uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count) {
"vmov.u8 q9, #0 \n"
"vmov.u8 q11, #0 \n"
"1: \n"
"1: \n"
"vld1.8 {q0}, [%0]! \n"
"vld1.8 {q1}, [%1]! \n"
"subs %2, %2, #16 \n"
......
......@@ -27,7 +27,7 @@ uint32 HammingDistance_NEON(const uint8* src_a, const uint8* src_b, int count) {
asm volatile(
"movi v4.8h, #0 \n"
"1: \n"
"1: \n"
"ld1 {v0.16b, v1.16b}, [%0], #32 \n"
"ld1 {v2.16b, v3.16b}, [%1], #32 \n"
"eor v0.16b, v0.16b, v2.16b \n"
......@@ -55,7 +55,7 @@ uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count) {
"eor v17.16b, v17.16b, v17.16b \n"
"eor v19.16b, v19.16b, v19.16b \n"
"1: \n"
"1: \n"
"ld1 {v0.16b}, [%0], #16 \n"
"ld1 {v1.16b}, [%1], #16 \n"
"subs %w2, %w2, #16 \n"
......
......@@ -115,7 +115,7 @@ void I444ToARGBRow_NEON(const uint8* src_y,
asm volatile(
YUVTORGB_SETUP
"vmov.u8 d23, #255 \n"
"1: \n" READYUV444 YUVTORGB
"1: \n" READYUV444 YUVTORGB
"subs %4, %4, #8 \n"
"vst4.8 {d20, d21, d22, d23}, [%3]! \n"
"bgt 1b \n"
......@@ -141,7 +141,7 @@ void I422ToARGBRow_NEON(const uint8* src_y,
asm volatile(
YUVTORGB_SETUP
"vmov.u8 d23, #255 \n"
"1: \n" READYUV422 YUVTORGB
"1: \n" READYUV422 YUVTORGB
"subs %4, %4, #8 \n"
"vst4.8 {d20, d21, d22, d23}, [%3]! \n"
"bgt 1b \n"
......@@ -167,7 +167,7 @@ void I422AlphaToARGBRow_NEON(const uint8* src_y,
int width) {
asm volatile(
YUVTORGB_SETUP
"1: \n" READYUV422 YUVTORGB
"1: \n" READYUV422 YUVTORGB
"subs %5, %5, #8 \n"
"vld1.8 {d23}, [%3]! \n"
"vst4.8 {d20, d21, d22, d23}, [%4]! \n"
......@@ -194,7 +194,7 @@ void I422ToRGBARow_NEON(const uint8* src_y,
int width) {
asm volatile(
YUVTORGB_SETUP
"1: \n" READYUV422 YUVTORGB
"1: \n" READYUV422 YUVTORGB
"subs %4, %4, #8 \n"
"vmov.u8 d19, #255 \n" // d19 modified by
// YUVTORGB
......@@ -221,7 +221,7 @@ void I422ToRGB24Row_NEON(const uint8* src_y,
int width) {
asm volatile(
YUVTORGB_SETUP
"1: \n" READYUV422 YUVTORGB
"1: \n" READYUV422 YUVTORGB
"subs %4, %4, #8 \n"
"vst3.8 {d20, d21, d22}, [%3]! \n"
"bgt 1b \n"
......@@ -253,7 +253,7 @@ void I422ToRGB565Row_NEON(const uint8* src_y,
int width) {
asm volatile(
YUVTORGB_SETUP
"1: \n" READYUV422 YUVTORGB
"1: \n" READYUV422 YUVTORGB
"subs %4, %4, #8 \n" ARGBTORGB565
"vst1.8 {q0}, [%3]! \n" // store 8 pixels RGB565.
"bgt 1b \n"
......@@ -287,7 +287,7 @@ void I422ToARGB1555Row_NEON(const uint8* src_y,
int width) {
asm volatile(
YUVTORGB_SETUP
"1: \n" READYUV422 YUVTORGB
"1: \n" READYUV422 YUVTORGB
"subs %4, %4, #8 \n"
"vmov.u8 d23, #255 \n" ARGBTOARGB1555
"vst1.8 {q0}, [%3]! \n" // store 8 pixels
......@@ -325,7 +325,7 @@ void I422ToARGB4444Row_NEON(const uint8* src_y,
YUVTORGB_SETUP
"vmov.u8 d4, #0x0f \n" // bits to clear with
// vbic.
"1: \n" READYUV422 YUVTORGB
"1: \n" READYUV422 YUVTORGB
"subs %4, %4, #8 \n"
"vmov.u8 d23, #255 \n" ARGBTOARGB4444
"vst1.8 {q0}, [%3]! \n" // store 8 pixels
......@@ -348,7 +348,7 @@ void I400ToARGBRow_NEON(const uint8* src_y, uint8* dst_argb, int width) {
asm volatile(
YUVTORGB_SETUP
"vmov.u8 d23, #255 \n"
"1: \n" READYUV400 YUVTORGB
"1: \n" READYUV400 YUVTORGB
"subs %2, %2, #8 \n"
"vst4.8 {d20, d21, d22, d23}, [%1]! \n"
"bgt 1b \n"
......@@ -366,7 +366,7 @@ void I400ToARGBRow_NEON(const uint8* src_y, uint8* dst_argb, int width) {
void J400ToARGBRow_NEON(const uint8* src_y, uint8* dst_argb, int width) {
asm volatile(
"vmov.u8 d23, #255 \n"
"1: \n"
"1: \n"
"vld1.8 {d20}, [%0]! \n"
"vmov d21, d20 \n"
"vmov d22, d20 \n"
......@@ -385,23 +385,22 @@ void NV12ToARGBRow_NEON(const uint8* src_y,
uint8* dst_argb,
const struct YuvConstants* yuvconstants,
int width) {
asm volatile(
YUVTORGB_SETUP
"vmov.u8 d23, #255 \n"
"1: \n" READNV12 YUVTORGB
"subs %3, %3, #8 \n"
"vst4.8 {d20, d21, d22, d23}, [%2]! \n"
"bgt 1b \n"
: "+r"(src_y), // %0
"+r"(src_uv), // %1
"+r"(dst_argb), // %2
"+r"(width) // %3
: [kUVToRB] "r"(&yuvconstants->kUVToRB),
[kUVToG] "r"(&yuvconstants->kUVToG),
[kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
[kYToRgb] "r"(&yuvconstants->kYToRgb)
: "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11",
"q12", "q13", "q14", "q15");
asm volatile(YUVTORGB_SETUP
"vmov.u8 d23, #255 \n"
"1: \n" READNV12 YUVTORGB
"subs %3, %3, #8 \n"
"vst4.8 {d20, d21, d22, d23}, [%2]! \n"
"bgt 1b \n"
: "+r"(src_y), // %0
"+r"(src_uv), // %1
"+r"(dst_argb), // %2
"+r"(width) // %3
: [kUVToRB] "r"(&yuvconstants->kUVToRB),
[kUVToG] "r"(&yuvconstants->kUVToG),
[kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
[kYToRgb] "r"(&yuvconstants->kYToRgb)
: "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9",
"q10", "q11", "q12", "q13", "q14", "q15");
}
void NV21ToARGBRow_NEON(const uint8* src_y,
......@@ -409,23 +408,22 @@ void NV21ToARGBRow_NEON(const uint8* src_y,
uint8* dst_argb,
const struct YuvConstants* yuvconstants,
int width) {
asm volatile(
YUVTORGB_SETUP
"vmov.u8 d23, #255 \n"
"1: \n" READNV21 YUVTORGB
"subs %3, %3, #8 \n"
"vst4.8 {d20, d21, d22, d23}, [%2]! \n"
"bgt 1b \n"
: "+r"(src_y), // %0
"+r"(src_vu), // %1
"+r"(dst_argb), // %2
"+r"(width) // %3
: [kUVToRB] "r"(&yuvconstants->kUVToRB),
[kUVToG] "r"(&yuvconstants->kUVToG),
[kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
[kYToRgb] "r"(&yuvconstants->kYToRgb)
: "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11",
"q12", "q13", "q14", "q15");
asm volatile(YUVTORGB_SETUP
"vmov.u8 d23, #255 \n"
"1: \n" READNV21 YUVTORGB
"subs %3, %3, #8 \n"
"vst4.8 {d20, d21, d22, d23}, [%2]! \n"
"bgt 1b \n"
: "+r"(src_y), // %0
"+r"(src_vu), // %1
"+r"(dst_argb), // %2
"+r"(width) // %3
: [kUVToRB] "r"(&yuvconstants->kUVToRB),
[kUVToG] "r"(&yuvconstants->kUVToG),
[kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
[kYToRgb] "r"(&yuvconstants->kYToRgb)
: "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9",
"q10", "q11", "q12", "q13", "q14", "q15");
}
void NV12ToRGB565Row_NEON(const uint8* src_y,
......@@ -435,7 +433,7 @@ void NV12ToRGB565Row_NEON(const uint8* src_y,
int width) {
asm volatile(
YUVTORGB_SETUP
"1: \n" READNV12 YUVTORGB
"1: \n" READNV12 YUVTORGB
"subs %3, %3, #8 \n" ARGBTORGB565
"vst1.8 {q0}, [%2]! \n" // store 8 pixels RGB565.
"bgt 1b \n"
......@@ -455,44 +453,42 @@ void YUY2ToARGBRow_NEON(const uint8* src_yuy2,
uint8* dst_argb,
const struct YuvConstants* yuvconstants,
int width) {
asm volatile(
YUVTORGB_SETUP
"vmov.u8 d23, #255 \n"
"1: \n" READYUY2 YUVTORGB
"subs %2, %2, #8 \n"
"vst4.8 {d20, d21, d22, d23}, [%1]! \n"
"bgt 1b \n"
: "+r"(src_yuy2), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
: [kUVToRB] "r"(&yuvconstants->kUVToRB),
[kUVToG] "r"(&yuvconstants->kUVToG),
[kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
[kYToRgb] "r"(&yuvconstants->kYToRgb)
: "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11",
"q12", "q13", "q14", "q15");
asm volatile(YUVTORGB_SETUP
"vmov.u8 d23, #255 \n"
"1: \n" READYUY2 YUVTORGB
"subs %2, %2, #8 \n"
"vst4.8 {d20, d21, d22, d23}, [%1]! \n"
"bgt 1b \n"
: "+r"(src_yuy2), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
: [kUVToRB] "r"(&yuvconstants->kUVToRB),
[kUVToG] "r"(&yuvconstants->kUVToG),
[kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
[kYToRgb] "r"(&yuvconstants->kYToRgb)
: "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9",
"q10", "q11", "q12", "q13", "q14", "q15");
}
void UYVYToARGBRow_NEON(const uint8* src_uyvy,
uint8* dst_argb,
const struct YuvConstants* yuvconstants,
int width) {
asm volatile(
YUVTORGB_SETUP
"vmov.u8 d23, #255 \n"
"1: \n" READUYVY YUVTORGB
"subs %2, %2, #8 \n"
"vst4.8 {d20, d21, d22, d23}, [%1]! \n"
"bgt 1b \n"
: "+r"(src_uyvy), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
: [kUVToRB] "r"(&yuvconstants->kUVToRB),
[kUVToG] "r"(&yuvconstants->kUVToG),
[kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
[kYToRgb] "r"(&yuvconstants->kYToRgb)
: "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11",
"q12", "q13", "q14", "q15");
asm volatile(YUVTORGB_SETUP
"vmov.u8 d23, #255 \n"
"1: \n" READUYVY YUVTORGB
"subs %2, %2, #8 \n"
"vst4.8 {d20, d21, d22, d23}, [%1]! \n"
"bgt 1b \n"
: "+r"(src_uyvy), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
: [kUVToRB] "r"(&yuvconstants->kUVToRB),
[kUVToG] "r"(&yuvconstants->kUVToG),
[kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
[kYToRgb] "r"(&yuvconstants->kYToRgb)
: "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9",
"q10", "q11", "q12", "q13", "q14", "q15");
}
// Reads 16 pairs of UV and write even values to dst_u and odd to dst_v.
......@@ -501,7 +497,7 @@ void SplitUVRow_NEON(const uint8* src_uv,
uint8* dst_v,
int width) {
asm volatile(
"1: \n"
"1: \n"
"vld2.8 {q0, q1}, [%0]! \n" // load 16 pairs of UV
"subs %3, %3, #16 \n" // 16 processed per loop
"vst1.8 {q0}, [%1]! \n" // store U
......@@ -522,7 +518,7 @@ void MergeUVRow_NEON(const uint8* src_u,
uint8* dst_uv,
int width) {
asm volatile(
"1: \n"
"1: \n"
"vld1.8 {q0}, [%0]! \n" // load U
"vld1.8 {q1}, [%1]! \n" // load V
"subs %3, %3, #16 \n" // 16 processed per loop
......@@ -590,7 +586,7 @@ void MergeRGBRow_NEON(const uint8* src_r,
// Copy multiple of 32. vld4.8 allow unaligned and is fastest on a15.
void CopyRow_NEON(const uint8* src, uint8* dst, int count) {
asm volatile(
"1: \n"
"1: \n"
"vld1.8 {d0, d1, d2, d3}, [%0]! \n" // load 32
"subs %2, %2, #32 \n" // 32 processed per loop
"vst1.8 {d0, d1, d2, d3}, [%1]! \n" // store 32
......@@ -607,7 +603,7 @@ void CopyRow_NEON(const uint8* src, uint8* dst, int count) {
void SetRow_NEON(uint8* dst, uint8 v8, int count) {
asm volatile(
"vdup.8 q0, %2 \n" // duplicate 16 bytes
"1: \n"
"1: \n"
"subs %1, %1, #16 \n" // 16 bytes per loop
"vst1.8 {q0}, [%0]! \n" // store
"bgt 1b \n"
......@@ -621,7 +617,7 @@ void SetRow_NEON(uint8* dst, uint8 v8, int count) {
void ARGBSetRow_NEON(uint8* dst, uint32 v32, int count) {
asm volatile(
"vdup.u32 q0, %2 \n" // duplicate 4 ints
"1: \n"
"1: \n"
"subs %1, %1, #4 \n" // 4 pixels per loop
"vst1.8 {q0}, [%0]! \n" // store
"bgt 1b \n"
......@@ -638,7 +634,7 @@ void MirrorRow_NEON(const uint8* src, uint8* dst, int width) {
"add %0, %0, %2 \n"
"sub %0, #16 \n"
"1: \n"
"1: \n"
"vld1.8 {q0}, [%0], r3 \n" // src -= 16
"subs %2, #16 \n" // 16 pixels per loop.
"vrev64.8 q0, q0 \n"
......@@ -662,7 +658,7 @@ void MirrorUVRow_NEON(const uint8* src_uv,
"add %0, %0, %3, lsl #1 \n"
"sub %0, #16 \n"
"1: \n"
"1: \n"
"vld2.8 {d0, d1}, [%0], r12 \n" // src -= 16
"subs %3, #8 \n" // 8 pixels per loop.
"vrev64.8 q0, q0 \n"
......@@ -684,7 +680,7 @@ void ARGBMirrorRow_NEON(const uint8* src, uint8* dst, int width) {
"add %0, %0, %2, lsl #2 \n"
"sub %0, #16 \n"
"1: \n"
"1: \n"
"vld1.8 {q0}, [%0], r3 \n" // src -= 16
"subs %2, #4 \n" // 4 pixels per loop.
"vrev64.32 q0, q0 \n"
......@@ -701,7 +697,7 @@ void ARGBMirrorRow_NEON(const uint8* src, uint8* dst, int width) {
void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int width) {
asm volatile(
"vmov.u8 d4, #255 \n" // Alpha
"1: \n"
"1: \n"
"vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RGB24.
"subs %2, %2, #8 \n" // 8 processed per loop.
"vst4.8 {d1, d2, d3, d4}, [%1]! \n" // store 8 pixels of ARGB.
......@@ -717,7 +713,7 @@ void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int width) {
void RAWToARGBRow_NEON(const uint8* src_raw, uint8* dst_argb, int width) {
asm volatile(
"vmov.u8 d4, #255 \n" // Alpha
"1: \n"
"1: \n"
"vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RAW.
"subs %2, %2, #8 \n" // 8 processed per loop.
"vswp.u8 d1, d3 \n" // swap R, B
......@@ -733,7 +729,7 @@ void RAWToARGBRow_NEON(const uint8* src_raw, uint8* dst_argb, int width) {
void RAWToRGB24Row_NEON(const uint8* src_raw, uint8* dst_rgb24, int width) {
asm volatile(
"1: \n"
"1: \n"
"vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RAW.
"subs %2, %2, #8 \n" // 8 processed per loop.
"vswp.u8 d1, d3 \n" // swap R, B
......@@ -763,7 +759,7 @@ void RAWToRGB24Row_NEON(const uint8* src_raw, uint8* dst_rgb24, int width) {
void RGB565ToARGBRow_NEON(const uint8* src_rgb565, uint8* dst_argb, int width) {
asm volatile(
"vmov.u8 d3, #255 \n" // Alpha
"1: \n"
"1: \n"
"vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels.
"subs %2, %2, #8 \n" // 8 processed per loop.
RGB565TOARGB
......@@ -809,7 +805,7 @@ void ARGB1555ToARGBRow_NEON(const uint8* src_argb1555,
int width) {
asm volatile(
"vmov.u8 d3, #255 \n" // Alpha
"1: \n"
"1: \n"
"vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels.
"subs %2, %2, #8 \n" // 8 processed per loop.
ARGB1555TOARGB
......@@ -838,7 +834,7 @@ void ARGB4444ToARGBRow_NEON(const uint8* src_argb4444,
int width) {
asm volatile(
"vmov.u8 d3, #255 \n" // Alpha
"1: \n"
"1: \n"
"vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels.
"subs %2, %2, #8 \n" // 8 processed per loop.
ARGB4444TOARGB
......@@ -854,7 +850,7 @@ void ARGB4444ToARGBRow_NEON(const uint8* src_argb4444,
void ARGBToRGB24Row_NEON(const uint8* src_argb, uint8* dst_rgb24, int width) {
asm volatile(
"1: \n"
"1: \n"
"vld4.8 {d1, d2, d3, d4}, [%0]! \n" // load 8 pixels of ARGB.
"subs %2, %2, #8 \n" // 8 processed per loop.
"vst3.8 {d1, d2, d3}, [%1]! \n" // store 8 pixels of
......@@ -870,7 +866,7 @@ void ARGBToRGB24Row_NEON(const uint8* src_argb, uint8* dst_rgb24, int width) {
void ARGBToRAWRow_NEON(const uint8* src_argb, uint8* dst_raw, int width) {
asm volatile(
"1: \n"
"1: \n"
"vld4.8 {d1, d2, d3, d4}, [%0]! \n" // load 8 pixels of ARGB.
"subs %2, %2, #8 \n" // 8 processed per loop.
"vswp.u8 d1, d3 \n" // swap R, B
......@@ -886,7 +882,7 @@ void ARGBToRAWRow_NEON(const uint8* src_argb, uint8* dst_raw, int width) {
void YUY2ToYRow_NEON(const uint8* src_yuy2, uint8* dst_y, int width) {
asm volatile(
"1: \n"
"1: \n"
"vld2.8 {q0, q1}, [%0]! \n" // load 16 pixels of YUY2.
"subs %2, %2, #16 \n" // 16 processed per loop.
"vst1.8 {q0}, [%1]! \n" // store 16 pixels of Y.
......@@ -901,7 +897,7 @@ void YUY2ToYRow_NEON(const uint8* src_yuy2, uint8* dst_y, int width) {
void UYVYToYRow_NEON(const uint8* src_uyvy, uint8* dst_y, int width) {
asm volatile(
"1: \n"
"1: \n"
"vld2.8 {q0, q1}, [%0]! \n" // load 16 pixels of UYVY.
"subs %2, %2, #16 \n" // 16 processed per loop.
"vst1.8 {q1}, [%1]! \n" // store 16 pixels of Y.
......@@ -919,7 +915,7 @@ void YUY2ToUV422Row_NEON(const uint8* src_yuy2,
uint8* dst_v,
int width) {
asm volatile(
"1: \n"
"1: \n"
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of YUY2.
"subs %3, %3, #16 \n" // 16 pixels = 8 UVs.
"vst1.8 {d1}, [%1]! \n" // store 8 U.
......@@ -939,7 +935,7 @@ void UYVYToUV422Row_NEON(const uint8* src_uyvy,
uint8* dst_v,
int width) {
asm volatile(
"1: \n"
"1: \n"
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of UYVY.
"subs %3, %3, #16 \n" // 16 pixels = 8 UVs.
"vst1.8 {d0}, [%1]! \n" // store 8 U.
......@@ -961,7 +957,7 @@ void YUY2ToUVRow_NEON(const uint8* src_yuy2,
int width) {
asm volatile(
"add %1, %0, %1 \n" // stride + src_yuy2
"1: \n"
"1: \n"
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of YUY2.
"subs %4, %4, #16 \n" // 16 pixels = 8 UVs.
"vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load next row YUY2.
......@@ -988,7 +984,7 @@ void UYVYToUVRow_NEON(const uint8* src_uyvy,
int width) {
asm volatile(
"add %1, %0, %1 \n" // stride + src_uyvy
"1: \n"
"1: \n"
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of UYVY.
"subs %4, %4, #16 \n" // 16 pixels = 8 UVs.
"vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load next row UYVY.
......@@ -1015,7 +1011,7 @@ void ARGBShuffleRow_NEON(const uint8* src_argb,
int width) {
asm volatile(
"vld1.8 {q2}, [%3] \n" // shuffler
"1: \n"
"1: \n"
"vld1.8 {q0}, [%0]! \n" // load 4 pixels.
"subs %2, %2, #4 \n" // 4 processed per loop
"vtbl.8 d2, {d0, d1}, d4 \n" // look up 2 first pixels
......@@ -1036,7 +1032,7 @@ void I422ToYUY2Row_NEON(const uint8* src_y,
uint8* dst_yuy2,
int width) {
asm volatile(
"1: \n"
"1: \n"
"vld2.8 {d0, d2}, [%0]! \n" // load 16 Ys
"vld1.8 {d1}, [%1]! \n" // load 8 Us
"vld1.8 {d3}, [%2]! \n" // load 8 Vs
......@@ -1058,7 +1054,7 @@ void I422ToUYVYRow_NEON(const uint8* src_y,
uint8* dst_uyvy,
int width) {
asm volatile(
"1: \n"
"1: \n"
"vld2.8 {d1, d3}, [%0]! \n" // load 16 Ys
"vld1.8 {d0}, [%1]! \n" // load 8 Us
"vld1.8 {d2}, [%2]! \n" // load 8 Vs
......@@ -1076,7 +1072,7 @@ void I422ToUYVYRow_NEON(const uint8* src_y,
void ARGBToRGB565Row_NEON(const uint8* src_argb, uint8* dst_rgb565, int width) {
asm volatile(
"1: \n"
"1: \n"
"vld4.8 {d20, d21, d22, d23}, [%0]! \n" // load 8 pixels of ARGB.
"subs %2, %2, #8 \n" // 8 processed per loop.
ARGBTORGB565
......@@ -1095,13 +1091,14 @@ void ARGBToRGB565DitherRow_NEON(const uint8* src_argb,
int width) {
asm volatile(
"vdup.32 d2, %2 \n" // dither4
"1: \n"
"1: \n"
"vld4.8 {d20, d21, d22, d23}, [%1]! \n" // load 8 pixels of ARGB.
"subs %3, %3, #8 \n" // 8 processed per loop.
"vqadd.u8 d20, d20, d2 \n"
"vqadd.u8 d21, d21, d2 \n"
"vqadd.u8 d22, d22, d2 \n" ARGBTORGB565
"vst1.8 {q0}, [%0]! \n" // store 8 pixels RGB565.
"vqadd.u8 d22, d22, d2 \n" // add for dither
ARGBTORGB565
"vst1.8 {q0}, [%0]! \n" // store 8 RGB565.
"bgt 1b \n"
: "+r"(dst_rgb) // %0
: "r"(src_argb), // %1
......@@ -1114,12 +1111,11 @@ void ARGBToARGB1555Row_NEON(const uint8* src_argb,
uint8* dst_argb1555,
int width) {
asm volatile(
"1: \n"
"1: \n"
"vld4.8 {d20, d21, d22, d23}, [%0]! \n" // load 8 pixels of ARGB.
"subs %2, %2, #8 \n" // 8 processed per loop.
ARGBTOARGB1555
"vst1.8 {q0}, [%1]! \n" // store 8 pixels
// ARGB1555.
"vst1.8 {q0}, [%1]! \n" // store 8 ARGB1555.
"bgt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb1555), // %1
......@@ -1134,12 +1130,11 @@ void ARGBToARGB4444Row_NEON(const uint8* src_argb,
asm volatile(
"vmov.u8 d4, #0x0f \n" // bits to clear with
// vbic.
"1: \n"
"1: \n"
"vld4.8 {d20, d21, d22, d23}, [%0]! \n" // load 8 pixels of ARGB.
"subs %2, %2, #8 \n" // 8 processed per loop.
ARGBTOARGB4444
"vst1.8 {q0}, [%1]! \n" // store 8 pixels
// ARGB4444.
"vst1.8 {q0}, [%1]! \n" // store 8 ARGB4444.
"bgt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb4444), // %1
......@@ -1154,7 +1149,7 @@ void ARGBToYRow_NEON(const uint8* src_argb, uint8* dst_y, int width) {
"vmov.u8 d25, #65 \n" // G * 0.5078 coefficient
"vmov.u8 d26, #33 \n" // R * 0.2578 coefficient
"vmov.u8 d27, #16 \n" // Add 16 constant
"1: \n"
"1: \n"
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
"subs %2, %2, #8 \n" // 8 processed per loop.
"vmull.u8 q2, d0, d24 \n" // B
......@@ -1173,7 +1168,7 @@ void ARGBToYRow_NEON(const uint8* src_argb, uint8* dst_y, int width) {
void ARGBExtractAlphaRow_NEON(const uint8* src_argb, uint8* dst_a, int width) {
asm volatile(
"1: \n"
"1: \n"
"vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels
"vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels
"subs %2, %2, #16 \n" // 16 processed per loop
......@@ -1192,7 +1187,7 @@ void ARGBToYJRow_NEON(const uint8* src_argb, uint8* dst_y, int width) {
"vmov.u8 d24, #15 \n" // B * 0.11400 coefficient
"vmov.u8 d25, #75 \n" // G * 0.58700 coefficient
"vmov.u8 d26, #38 \n" // R * 0.29900 coefficient
"1: \n"
"1: \n"
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
"subs %2, %2, #8 \n" // 8 processed per loop.
"vmull.u8 q2, d0, d24 \n" // B
......@@ -1221,7 +1216,7 @@ void ARGBToUV444Row_NEON(const uint8* src_argb,
"vmov.u8 d27, #18 \n" // VB -0.1406 coefficient
"vmov.u8 d28, #94 \n" // VG -0.7344 coefficient
"vmov.u16 q15, #0x8080 \n" // 128.5
"1: \n"
"1: \n"
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
"subs %3, %3, #8 \n" // 8 processed per loop.
"vmull.u8 q2, d0, d24 \n" // B
......@@ -1249,24 +1244,20 @@ void ARGBToUV444Row_NEON(const uint8* src_argb,
"q15");
}
// clang-format off
// 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16.
#define RGBTOUV(QB, QG, QR) \
"vmul.s16 q8, " #QB \
", q10 \n" /* B */ \
"vmls.s16 q8, " #QG \
", q11 \n" /* G */ \
"vmls.s16 q8, " #QR \
", q12 \n" /* R */ \
"vmul.s16 q8, " #QB ", q10 \n" /* B */ \
"vmls.s16 q8, " #QG ", q11 \n" /* G */ \
"vmls.s16 q8, " #QR ", q12 \n" /* R */ \
"vadd.u16 q8, q8, q15 \n" /* +128 -> unsigned */ \
"vmul.s16 q9, " #QR \
", q10 \n" /* R */ \
"vmls.s16 q9, " #QG \
", q14 \n" /* G */ \
"vmls.s16 q9, " #QB \
", q13 \n" /* B */ \
"vmul.s16 q9, " #QR ", q10 \n" /* R */ \
"vmls.s16 q9, " #QG ", q14 \n" /* G */ \
"vmls.s16 q9, " #QB ", q13 \n" /* B */ \
"vadd.u16 q9, q9, q15 \n" /* +128 -> unsigned */ \
"vqshrn.u16 d0, q8, #8 \n" /* 16 bit to 8 bit U */ \
"vqshrn.u16 d1, q9, #8 \n" /* 16 bit to 8 bit V */
// clang-format on
// TODO(fbarchard): Consider vhadd vertical, then vpaddl horizontal, avoid shr.
void ARGBToUVRow_NEON(const uint8* src_argb,
......@@ -1282,7 +1273,7 @@ void ARGBToUVRow_NEON(const uint8* src_argb,
"vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
"vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
"vmov.u16 q15, #0x8080 \n" // 128.5
"1: \n"
"1: \n"
"vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
"vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels.
"vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts.
......@@ -1328,7 +1319,7 @@ void ARGBToUVJRow_NEON(const uint8* src_argb,
"vmov.s16 q13, #20 / 2 \n" // VB -0.08131 coefficient
"vmov.s16 q14, #107 / 2 \n" // VG -0.41869 coefficient
"vmov.u16 q15, #0x8080 \n" // 128.5
"1: \n"
"1: \n"
"vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
"vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels.
"vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts.
......@@ -1373,7 +1364,7 @@ void BGRAToUVRow_NEON(const uint8* src_bgra,
"vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
"vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
"vmov.u16 q15, #0x8080 \n" // 128.5
"1: \n"
"1: \n"
"vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 BGRA pixels.
"vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 BGRA pixels.
"vpaddl.u8 q3, q3 \n" // B 16 bytes -> 8 shorts.
......@@ -1418,7 +1409,7 @@ void ABGRToUVRow_NEON(const uint8* src_abgr,
"vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
"vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
"vmov.u16 q15, #0x8080 \n" // 128.5
"1: \n"
"1: \n"
"vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ABGR pixels.
"vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ABGR pixels.
"vpaddl.u8 q2, q2 \n" // B 16 bytes -> 8 shorts.
......@@ -1463,7 +1454,7 @@ void RGBAToUVRow_NEON(const uint8* src_rgba,
"vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
"vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
"vmov.u16 q15, #0x8080 \n" // 128.5
"1: \n"
"1: \n"
"vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 RGBA pixels.
"vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 RGBA pixels.
"vpaddl.u8 q0, q1 \n" // B 16 bytes -> 8 shorts.
......@@ -1508,7 +1499,7 @@ void RGB24ToUVRow_NEON(const uint8* src_rgb24,
"vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
"vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
"vmov.u16 q15, #0x8080 \n" // 128.5
"1: \n"
"1: \n"
"vld3.8 {d0, d2, d4}, [%0]! \n" // load 8 RGB24 pixels.
"vld3.8 {d1, d3, d5}, [%0]! \n" // load next 8 RGB24 pixels.
"vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts.
......@@ -1553,7 +1544,7 @@ void RAWToUVRow_NEON(const uint8* src_raw,
"vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
"vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
"vmov.u16 q15, #0x8080 \n" // 128.5
"1: \n"
"1: \n"
"vld3.8 {d0, d2, d4}, [%0]! \n" // load 8 RAW pixels.
"vld3.8 {d1, d3, d5}, [%0]! \n" // load next 8 RAW pixels.
"vpaddl.u8 q2, q2 \n" // B 16 bytes -> 8 shorts.
......@@ -1600,7 +1591,7 @@ void RGB565ToUVRow_NEON(const uint8* src_rgb565,
"vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
"vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
"vmov.u16 q15, #0x8080 \n" // 128.5
"1: \n"
"1: \n"
"vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels.
RGB565TOARGB
"vpaddl.u8 d8, d0 \n" // B 8 bytes -> 4 shorts.
......@@ -1666,7 +1657,7 @@ void ARGB1555ToUVRow_NEON(const uint8* src_argb1555,
"vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
"vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
"vmov.u16 q15, #0x8080 \n" // 128.5
"1: \n"
"1: \n"
"vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels.
RGB555TOARGB
"vpaddl.u8 d8, d0 \n" // B 8 bytes -> 4 shorts.
......@@ -1732,7 +1723,7 @@ void ARGB4444ToUVRow_NEON(const uint8* src_argb4444,
"vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
"vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
"vmov.u16 q15, #0x8080 \n" // 128.5
"1: \n"
"1: \n"
"vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels.
ARGB4444TOARGB
"vpaddl.u8 d8, d0 \n" // B 8 bytes -> 4 shorts.
......@@ -1789,7 +1780,7 @@ void RGB565ToYRow_NEON(const uint8* src_rgb565, uint8* dst_y, int width) {
"vmov.u8 d25, #65 \n" // G * 0.5078 coefficient
"vmov.u8 d26, #33 \n" // R * 0.2578 coefficient
"vmov.u8 d27, #16 \n" // Add 16 constant
"1: \n"
"1: \n"
"vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels.
"subs %2, %2, #8 \n" // 8 processed per loop.
RGB565TOARGB
......@@ -1813,7 +1804,7 @@ void ARGB1555ToYRow_NEON(const uint8* src_argb1555, uint8* dst_y, int width) {
"vmov.u8 d25, #65 \n" // G * 0.5078 coefficient
"vmov.u8 d26, #33 \n" // R * 0.2578 coefficient
"vmov.u8 d27, #16 \n" // Add 16 constant
"1: \n"
"1: \n"
"vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels.
"subs %2, %2, #8 \n" // 8 processed per loop.
ARGB1555TOARGB
......@@ -1837,7 +1828,7 @@ void ARGB4444ToYRow_NEON(const uint8* src_argb4444, uint8* dst_y, int width) {
"vmov.u8 d25, #65 \n" // G * 0.5078 coefficient
"vmov.u8 d26, #33 \n" // R * 0.2578 coefficient
"vmov.u8 d27, #16 \n" // Add 16 constant
"1: \n"
"1: \n"
"vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels.
"subs %2, %2, #8 \n" // 8 processed per loop.
ARGB4444TOARGB
......@@ -1861,7 +1852,7 @@ void BGRAToYRow_NEON(const uint8* src_bgra, uint8* dst_y, int width) {
"vmov.u8 d5, #65 \n" // G * 0.5078 coefficient
"vmov.u8 d6, #13 \n" // B * 0.1016 coefficient
"vmov.u8 d7, #16 \n" // Add 16 constant
"1: \n"
"1: \n"
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of BGRA.
"subs %2, %2, #8 \n" // 8 processed per loop.
"vmull.u8 q8, d1, d4 \n" // R
......@@ -1884,7 +1875,7 @@ void ABGRToYRow_NEON(const uint8* src_abgr, uint8* dst_y, int width) {
"vmov.u8 d5, #65 \n" // G * 0.5078 coefficient
"vmov.u8 d6, #13 \n" // B * 0.1016 coefficient
"vmov.u8 d7, #16 \n" // Add 16 constant
"1: \n"
"1: \n"
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of ABGR.
"subs %2, %2, #8 \n" // 8 processed per loop.
"vmull.u8 q8, d0, d4 \n" // R
......@@ -1907,7 +1898,7 @@ void RGBAToYRow_NEON(const uint8* src_rgba, uint8* dst_y, int width) {
"vmov.u8 d5, #65 \n" // G * 0.5078 coefficient
"vmov.u8 d6, #33 \n" // R * 0.2578 coefficient
"vmov.u8 d7, #16 \n" // Add 16 constant
"1: \n"
"1: \n"
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of RGBA.
"subs %2, %2, #8 \n" // 8 processed per loop.
"vmull.u8 q8, d1, d4 \n" // B
......@@ -1930,7 +1921,7 @@ void RGB24ToYRow_NEON(const uint8* src_rgb24, uint8* dst_y, int width) {
"vmov.u8 d5, #65 \n" // G * 0.5078 coefficient
"vmov.u8 d6, #33 \n" // R * 0.2578 coefficient
"vmov.u8 d7, #16 \n" // Add 16 constant
"1: \n"
"1: \n"
"vld3.8 {d0, d1, d2}, [%0]! \n" // load 8 pixels of RGB24.
"subs %2, %2, #8 \n" // 8 processed per loop.
"vmull.u8 q8, d0, d4 \n" // B
......@@ -1953,7 +1944,7 @@ void RAWToYRow_NEON(const uint8* src_raw, uint8* dst_y, int width) {
"vmov.u8 d5, #65 \n" // G * 0.5078 coefficient
"vmov.u8 d6, #13 \n" // B * 0.1016 coefficient
"vmov.u8 d7, #16 \n" // Add 16 constant
"1: \n"
"1: \n"
"vld3.8 {d0, d1, d2}, [%0]! \n" // load 8 pixels of RAW.
"subs %2, %2, #8 \n" // 8 processed per loop.
"vmull.u8 q8, d0, d4 \n" // B
......@@ -1988,7 +1979,7 @@ void InterpolateRow_NEON(uint8* dst_ptr,
"rsb %4, #256 \n"
"vdup.8 d4, %4 \n"
// General purpose row blend.
"1: \n"
"1: \n"
"vld1.8 {q0}, [%1]! \n"
"vld1.8 {q1}, [%2]! \n"
"subs %3, %3, #16 \n"
......@@ -2003,7 +1994,7 @@ void InterpolateRow_NEON(uint8* dst_ptr,
"b 99f \n"
// Blend 50 / 50.
"50: \n"
"50: \n"
"vld1.8 {q0}, [%1]! \n"
"vld1.8 {q1}, [%2]! \n"
"subs %3, %3, #16 \n"
......@@ -2013,13 +2004,13 @@ void InterpolateRow_NEON(uint8* dst_ptr,
"b 99f \n"
// Blend 100 / 0 - Copy row unchanged.
"100: \n"
"100: \n"
"vld1.8 {q0}, [%1]! \n"
"subs %3, %3, #16 \n"
"vst1.8 {q0}, [%0]! \n"
"bgt 100b \n"
"99: \n"
"99: \n"
: "+r"(dst_ptr), // %0
"+r"(src_ptr), // %1
"+r"(src_stride), // %2
......@@ -2038,7 +2029,7 @@ void ARGBBlendRow_NEON(const uint8* src_argb0,
"subs %3, #8 \n"
"blt 89f \n"
// Blend 8 pixels.
"8: \n"
"8: \n"
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of ARGB0.
"vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load 8 pixels of ARGB1.
"subs %3, %3, #8 \n" // 8 processed per loop.
......@@ -2056,12 +2047,12 @@ void ARGBBlendRow_NEON(const uint8* src_argb0,
"vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 pixels of ARGB.
"bge 8b \n"
"89: \n"
"89: \n"
"adds %3, #8-1 \n"
"blt 99f \n"
// Blend 1 pixels.
"1: \n"
"1: \n"
"vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [%0]! \n" // load 1 pixel ARGB0.
"vld4.8 {d4[0],d5[0],d6[0],d7[0]}, [%1]! \n" // load 1 pixel ARGB1.
"subs %3, %3, #1 \n" // 1 processed per loop.
......@@ -2093,7 +2084,7 @@ void ARGBBlendRow_NEON(const uint8* src_argb0,
void ARGBAttenuateRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) {
asm volatile(
// Attenuate 8 pixels.
"1: \n"
"1: \n"
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of ARGB.
"subs %2, %2, #8 \n" // 8 processed per loop.
"vmull.u8 q10, d0, d3 \n" // b * a
......@@ -2125,7 +2116,7 @@ void ARGBQuantizeRow_NEON(uint8* dst_argb,
"vdup.u16 q10, %4 \n" // interval add
// 8 pixel loop.
"1: \n"
"1: \n"
"vld4.8 {d0, d2, d4, d6}, [%0] \n" // load 8 pixels of ARGB.
"subs %1, %1, #8 \n" // 8 processed per loop.
"vmovl.u8 q0, d0 \n" // b (0 .. 255)
......@@ -2166,7 +2157,7 @@ void ARGBShadeRow_NEON(const uint8* src_argb,
"vshr.u16 q0, q0, #1 \n" // scale / 2.
// 8 pixel loop.
"1: \n"
"1: \n"
"vld4.8 {d20, d22, d24, d26}, [%0]! \n" // load 8 pixels of ARGB.
"subs %2, %2, #8 \n" // 8 processed per loop.
"vmovl.u8 q10, d20 \n" // b (0 .. 255)
......@@ -2198,7 +2189,7 @@ void ARGBGrayRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) {
"vmov.u8 d24, #15 \n" // B * 0.11400 coefficient
"vmov.u8 d25, #75 \n" // G * 0.58700 coefficient
"vmov.u8 d26, #38 \n" // R * 0.29900 coefficient
"1: \n"
"1: \n"
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
"subs %2, %2, #8 \n" // 8 processed per loop.
"vmull.u8 q2, d0, d24 \n" // B
......@@ -2231,7 +2222,7 @@ void ARGBSepiaRow_NEON(uint8* dst_argb, int width) {
"vmov.u8 d28, #24 \n" // BB coefficient
"vmov.u8 d29, #98 \n" // BG coefficient
"vmov.u8 d30, #50 \n" // BR coefficient
"1: \n"
"1: \n"
"vld4.8 {d0, d1, d2, d3}, [%0] \n" // load 8 ARGB pixels.
"subs %1, %1, #8 \n" // 8 processed per loop.
"vmull.u8 q2, d0, d20 \n" // B to Sepia B
......@@ -2267,7 +2258,7 @@ void ARGBColorMatrixRow_NEON(const uint8* src_argb,
"vmovl.s8 q0, d4 \n" // B,G coefficients s16.
"vmovl.s8 q1, d5 \n" // R,A coefficients s16.
"1: \n"
"1: \n"
"vld4.8 {d16, d18, d20, d22}, [%0]! \n" // load 8 ARGB pixels.
"subs %2, %2, #8 \n" // 8 processed per loop.
"vmovl.u8 q8, d16 \n" // b (0 .. 255) 16 bit
......@@ -2323,10 +2314,9 @@ void ARGBMultiplyRow_NEON(const uint8* src_argb0,
int width) {
asm volatile(
// 8 pixel loop.
"1: \n"
"1: \n"
"vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
"vld4.8 {d1, d3, d5, d7}, [%1]! \n" // load 8 more ARGB
// pixels.
"subs %3, %3, #8 \n" // 8 processed per loop.
"vmull.u8 q0, d0, d1 \n" // multiply B
"vmull.u8 q1, d2, d3 \n" // multiply G
......@@ -2338,7 +2328,6 @@ void ARGBMultiplyRow_NEON(const uint8* src_argb0,
"vrshrn.u16 d3, q3, #8 \n" // 16 bit to 8 bit A
"vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels.
"bgt 1b \n"
: "+r"(src_argb0), // %0
"+r"(src_argb1), // %1
"+r"(dst_argb), // %2
......@@ -2354,16 +2343,14 @@ void ARGBAddRow_NEON(const uint8* src_argb0,
int width) {
asm volatile(
// 8 pixel loop.
"1: \n"
"1: \n"
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
"vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load 8 more ARGB
// pixels.
"subs %3, %3, #8 \n" // 8 processed per loop.
"vqadd.u8 q0, q0, q2 \n" // add B, G
"vqadd.u8 q1, q1, q3 \n" // add R, A
"vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels.
"bgt 1b \n"
: "+r"(src_argb0), // %0
"+r"(src_argb1), // %1
"+r"(dst_argb), // %2
......@@ -2379,16 +2366,14 @@ void ARGBSubtractRow_NEON(const uint8* src_argb0,
int width) {
asm volatile(
// 8 pixel loop.
"1: \n"
"1: \n"
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
"vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load 8 more ARGB
// pixels.
"subs %3, %3, #8 \n" // 8 processed per loop.
"vqsub.u8 q0, q0, q2 \n" // subtract B, G
"vqsub.u8 q1, q1, q3 \n" // subtract R, A
"vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels.
"bgt 1b \n"
: "+r"(src_argb0), // %0
"+r"(src_argb1), // %1
"+r"(dst_argb), // %2
......@@ -2409,7 +2394,7 @@ void SobelRow_NEON(const uint8* src_sobelx,
asm volatile(
"vmov.u8 d3, #255 \n" // alpha
// 8 pixel loop.
"1: \n"
"1: \n"
"vld1.8 {d0}, [%0]! \n" // load 8 sobelx.
"vld1.8 {d1}, [%1]! \n" // load 8 sobely.
"subs %3, %3, #8 \n" // 8 processed per loop.
......@@ -2433,7 +2418,7 @@ void SobelToPlaneRow_NEON(const uint8* src_sobelx,
int width) {
asm volatile(
// 16 pixel loop.
"1: \n"
"1: \n"
"vld1.8 {q0}, [%0]! \n" // load 16 sobelx.
"vld1.8 {q1}, [%1]! \n" // load 16 sobely.
"subs %3, %3, #16 \n" // 16 processed per loop.
......@@ -2460,7 +2445,7 @@ void SobelXYRow_NEON(const uint8* src_sobelx,
asm volatile(
"vmov.u8 d3, #255 \n" // alpha
// 8 pixel loop.
"1: \n"
"1: \n"
"vld1.8 {d2}, [%0]! \n" // load 8 sobelx.
"vld1.8 {d0}, [%1]! \n" // load 8 sobely.
"subs %3, %3, #8 \n" // 8 processed per loop.
......@@ -2485,7 +2470,7 @@ void SobelXRow_NEON(const uint8* src_y0,
uint8* dst_sobelx,
int width) {
asm volatile(
"1: \n"
"1: \n"
"vld1.8 {d0}, [%0],%5 \n" // top
"vld1.8 {d1}, [%0],%6 \n"
"vsubl.u8 q0, d0, d1 \n"
......@@ -2523,7 +2508,7 @@ void SobelYRow_NEON(const uint8* src_y0,
uint8* dst_sobely,
int width) {
asm volatile(
"1: \n"
"1: \n"
"vld1.8 {d0}, [%0],%4 \n" // left
"vld1.8 {d1}, [%1],%4 \n"
"vsubl.u8 q0, d0, d1 \n"
......@@ -2555,7 +2540,7 @@ void HalfFloat1Row_NEON(const uint16* src, uint16* dst, float, int width) {
asm volatile(
"vdup.32 q0, %3 \n"
"1: \n"
"1: \n"
"vld1.8 {q1}, [%0]! \n" // load 8 shorts
"subs %2, %2, #8 \n" // 8 pixels per loop
"vmovl.u16 q2, d2 \n" // 8 int's
......@@ -2580,7 +2565,7 @@ void HalfFloatRow_NEON(const uint16* src, uint16* dst, float scale, int width) {
asm volatile(
"vdup.32 q0, %3 \n"
"1: \n"
"1: \n"
"vld1.8 {q1}, [%0]! \n" // load 8 shorts
"subs %2, %2, #8 \n" // 8 pixels per loop
"vmovl.u16 q2, d2 \n" // 8 int's
......
......@@ -273,7 +273,7 @@ void I422ToRGB565Row_NEON(const uint8* src_y,
int width) {
asm volatile(
YUVTORGB_SETUP
"1: \n" READYUV422 YUVTORGB(
"1: \n" READYUV422 YUVTORGB(
v22, v21,
v20) "subs %w4, %w4, #8 \n" ARGBTORGB565
"st1 {v0.8h}, [%3], #16 \n" // store 8 pixels
......@@ -310,7 +310,7 @@ void I422ToARGB1555Row_NEON(const uint8* src_y,
asm volatile(
YUVTORGB_SETUP
"movi v23.8b, #255 \n"
"1: \n" READYUV422 YUVTORGB(
"1: \n" READYUV422 YUVTORGB(
v22, v21,
v20) "subs %w4, %w4, #8 \n" ARGBTOARGB1555
"st1 {v0.8h}, [%3], #16 \n" // store 8 pixels
......@@ -395,7 +395,7 @@ void I400ToARGBRow_NEON(const uint8* src_y, uint8* dst_argb, int width) {
void J400ToARGBRow_NEON(const uint8* src_y, uint8* dst_argb, int width) {
asm volatile(
"movi v23.8b, #255 \n"
"1: \n"
"1: \n"
"ld1 {v20.8b}, [%0], #8 \n"
"orr v21.8b, v20.8b, v20.8b \n"
"orr v22.8b, v20.8b, v20.8b \n"
......@@ -470,7 +470,7 @@ void NV12ToRGB565Row_NEON(const uint8* src_y,
int width) {
asm volatile(
YUVTORGB_SETUP
"1: \n" READNV12 YUVTORGB(
"1: \n" READNV12 YUVTORGB(
v22, v21,
v20) "subs %w3, %w3, #8 \n" ARGBTORGB565
"st1 {v0.8h}, [%2], 16 \n" // store 8 pixels
......@@ -544,7 +544,7 @@ void SplitUVRow_NEON(const uint8* src_uv,
uint8* dst_v,
int width) {
asm volatile(
"1: \n"
"1: \n"
"ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pairs of UV
"subs %w3, %w3, #16 \n" // 16 processed per loop
"st1 {v0.16b}, [%1], #16 \n" // store U
......@@ -565,7 +565,7 @@ void MergeUVRow_NEON(const uint8* src_u,
uint8* dst_uv,
int width) {
asm volatile(
"1: \n"
"1: \n"
"ld1 {v0.16b}, [%0], #16 \n" // load U
"ld1 {v1.16b}, [%1], #16 \n" // load V
"subs %w3, %w3, #16 \n" // 16 processed per loop
......@@ -631,7 +631,7 @@ void MergeRGBRow_NEON(const uint8* src_r,
// Copy multiple of 32. vld4.8 allows unaligned access and is fastest on a15.
void CopyRow_NEON(const uint8* src, uint8* dst, int count) {
asm volatile(
"1: \n"
"1: \n"
"ld1 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 32
"subs %w2, %w2, #32 \n" // 32 processed per loop
"st1 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 32
......@@ -648,7 +648,7 @@ void CopyRow_NEON(const uint8* src, uint8* dst, int count) {
void SetRow_NEON(uint8* dst, uint8 v8, int count) {
asm volatile(
"dup v0.16b, %w2 \n" // duplicate 16 bytes
"1: \n"
"1: \n"
"subs %w1, %w1, #16 \n" // 16 bytes per loop
"st1 {v0.16b}, [%0], #16 \n" // store
"b.gt 1b \n"
......@@ -661,7 +661,7 @@ void SetRow_NEON(uint8* dst, uint8 v8, int count) {
void ARGBSetRow_NEON(uint8* dst, uint32 v32, int count) {
asm volatile(
"dup v0.4s, %w2 \n" // duplicate 4 ints
"1: \n"
"1: \n"
"subs %w1, %w1, #4 \n" // 4 ints per loop
"st1 {v0.16b}, [%0], #16 \n" // store
"b.gt 1b \n"
......@@ -676,7 +676,7 @@ void MirrorRow_NEON(const uint8* src, uint8* dst, int width) {
// Start at end of source row.
"add %0, %0, %w2, sxtw \n"
"sub %0, %0, #16 \n"
"1: \n"
"1: \n"
"ld1 {v0.16b}, [%0], %3 \n" // src -= 16
"subs %w2, %w2, #16 \n" // 16 pixels per loop.
"rev64 v0.16b, v0.16b \n"
......@@ -698,7 +698,7 @@ void MirrorUVRow_NEON(const uint8* src_uv,
// Start at end of source row.
"add %0, %0, %w3, sxtw #1 \n"
"sub %0, %0, #16 \n"
"1: \n"
"1: \n"
"ld2 {v0.8b, v1.8b}, [%0], %4 \n" // src -= 16
"subs %w3, %w3, #8 \n" // 8 pixels per loop.
"rev64 v0.8b, v0.8b \n"
......@@ -719,7 +719,7 @@ void ARGBMirrorRow_NEON(const uint8* src, uint8* dst, int width) {
// Start at end of source row.
"add %0, %0, %w2, sxtw #2 \n"
"sub %0, %0, #16 \n"
"1: \n"
"1: \n"
"ld1 {v0.16b}, [%0], %3 \n" // src -= 16
"subs %w2, %w2, #4 \n" // 4 pixels per loop.
"rev64 v0.4s, v0.4s \n"
......@@ -736,11 +736,10 @@ void ARGBMirrorRow_NEON(const uint8* src, uint8* dst, int width) {
void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int width) {
asm volatile(
"movi v4.8b, #255 \n" // Alpha
"1: \n"
"1: \n"
"ld3 {v1.8b,v2.8b,v3.8b}, [%0], #24 \n" // load 8 pixels of RGB24.
"subs %w2, %w2, #8 \n" // 8 processed per loop.
"st4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%1], #32 \n" // store 8 ARGB
// pixels
"b.gt 1b \n"
: "+r"(src_rgb24), // %0
"+r"(dst_argb), // %1
......@@ -753,7 +752,7 @@ void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int width) {
void RAWToARGBRow_NEON(const uint8* src_raw, uint8* dst_argb, int width) {
asm volatile(
"movi v5.8b, #255 \n" // Alpha
"1: \n"
"1: \n"
"ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // read r g b
"subs %w2, %w2, #8 \n" // 8 processed per loop.
"orr v3.8b, v1.8b, v1.8b \n" // move g
......@@ -770,7 +769,7 @@ void RAWToARGBRow_NEON(const uint8* src_raw, uint8* dst_argb, int width) {
void RAWToRGB24Row_NEON(const uint8* src_raw, uint8* dst_rgb24, int width) {
asm volatile(
"1: \n"
"1: \n"
"ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // read r g b
"subs %w2, %w2, #8 \n" // 8 processed per loop.
"orr v3.8b, v1.8b, v1.8b \n" // move g
......@@ -801,12 +800,11 @@ void RAWToRGB24Row_NEON(const uint8* src_raw, uint8* dst_rgb24, int width) {
void RGB565ToARGBRow_NEON(const uint8* src_rgb565, uint8* dst_argb, int width) {
asm volatile(
"movi v3.8b, #255 \n" // Alpha
"1: \n"
"1: \n"
"ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels.
"subs %w2, %w2, #8 \n" // 8 processed per loop.
RGB565TOARGB
"st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB
// pixels
"b.gt 1b \n"
: "+r"(src_rgb565), // %0
"+r"(dst_argb), // %1
......@@ -858,7 +856,7 @@ void ARGB1555ToARGBRow_NEON(const uint8* src_argb1555,
int width) {
asm volatile(
"movi v3.8b, #255 \n" // Alpha
"1: \n"
"1: \n"
"ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels.
"subs %w2, %w2, #8 \n" // 8 processed per loop.
ARGB1555TOARGB
......@@ -889,7 +887,7 @@ void ARGB4444ToARGBRow_NEON(const uint8* src_argb4444,
uint8* dst_argb,
int width) {
asm volatile(
"1: \n"
"1: \n"
"ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels.
"subs %w2, %w2, #8 \n" // 8 processed per loop.
ARGB4444TOARGB
......@@ -906,9 +904,8 @@ void ARGB4444ToARGBRow_NEON(const uint8* src_argb4444,
void ARGBToRGB24Row_NEON(const uint8* src_argb, uint8* dst_rgb24, int width) {
asm volatile(
"1: \n"
"1: \n"
"ld4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n" // load 8 ARGB
// pixels
"subs %w2, %w2, #8 \n" // 8 processed per loop.
"st3 {v1.8b,v2.8b,v3.8b}, [%1], #24 \n" // store 8 pixels of
// RGB24.
......@@ -923,7 +920,7 @@ void ARGBToRGB24Row_NEON(const uint8* src_argb, uint8* dst_rgb24, int width) {
void ARGBToRAWRow_NEON(const uint8* src_argb, uint8* dst_raw, int width) {
asm volatile(
"1: \n"
"1: \n"
"ld4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n" // load b g r a
"subs %w2, %w2, #8 \n" // 8 processed per loop.
"orr v4.8b, v2.8b, v2.8b \n" // mov g
......@@ -940,7 +937,7 @@ void ARGBToRAWRow_NEON(const uint8* src_argb, uint8* dst_raw, int width) {
void YUY2ToYRow_NEON(const uint8* src_yuy2, uint8* dst_y, int width) {
asm volatile(
"1: \n"
"1: \n"
"ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pixels of YUY2.
"subs %w2, %w2, #16 \n" // 16 processed per loop.
"st1 {v0.16b}, [%1], #16 \n" // store 16 pixels of Y.
......@@ -955,7 +952,7 @@ void YUY2ToYRow_NEON(const uint8* src_yuy2, uint8* dst_y, int width) {
void UYVYToYRow_NEON(const uint8* src_uyvy, uint8* dst_y, int width) {
asm volatile(
"1: \n"
"1: \n"
"ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pixels of UYVY.
"subs %w2, %w2, #16 \n" // 16 processed per loop.
"st1 {v1.16b}, [%1], #16 \n" // store 16 pixels of Y.
......@@ -973,9 +970,8 @@ void YUY2ToUV422Row_NEON(const uint8* src_yuy2,
uint8* dst_v,
int width) {
asm volatile(
"1: \n"
"1: \n"
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 YUY2
// pixels
"subs %w3, %w3, #16 \n" // 16 pixels = 8 UVs.
"st1 {v1.8b}, [%1], #8 \n" // store 8 U.
"st1 {v3.8b}, [%2], #8 \n" // store 8 V.
......@@ -994,9 +990,8 @@ void UYVYToUV422Row_NEON(const uint8* src_uyvy,
uint8* dst_v,
int width) {
asm volatile(
"1: \n"
"1: \n"
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 UYVY
// pixels
"subs %w3, %w3, #16 \n" // 16 pixels = 8 UVs.
"st1 {v0.8b}, [%1], #8 \n" // store 8 U.
"st1 {v2.8b}, [%2], #8 \n" // store 8 V.
......@@ -1017,7 +1012,7 @@ void YUY2ToUVRow_NEON(const uint8* src_yuy2,
int width) {
const uint8* src_yuy2b = src_yuy2 + stride_yuy2;
asm volatile(
"1: \n"
"1: \n"
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 pixels
"subs %w4, %w4, #16 \n" // 16 pixels = 8 UVs.
"ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load next row
......@@ -1044,7 +1039,7 @@ void UYVYToUVRow_NEON(const uint8* src_uyvy,
int width) {
const uint8* src_uyvyb = src_uyvy + stride_uyvy;
asm volatile(
"1: \n"
"1: \n"
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 pixels
"subs %w4, %w4, #16 \n" // 16 pixels = 8 UVs.
"ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load next row
......@@ -1071,7 +1066,7 @@ void ARGBShuffleRow_NEON(const uint8* src_argb,
int width) {
asm volatile(
"ld1 {v2.16b}, [%3] \n" // shuffler
"1: \n"
"1: \n"
"ld1 {v0.16b}, [%0], #16 \n" // load 4 pixels.
"subs %w2, %w2, #4 \n" // 4 processed per loop
"tbl v1.16b, {v0.16b}, v2.16b \n" // look up 4 pixels
......@@ -1091,7 +1086,7 @@ void I422ToYUY2Row_NEON(const uint8* src_y,
uint8* dst_yuy2,
int width) {
asm volatile(
"1: \n"
"1: \n"
"ld2 {v0.8b, v1.8b}, [%0], #16 \n" // load 16 Ys
"orr v2.8b, v1.8b, v1.8b \n"
"ld1 {v1.8b}, [%1], #8 \n" // load 8 Us
......@@ -1114,7 +1109,7 @@ void I422ToUYVYRow_NEON(const uint8* src_y,
uint8* dst_uyvy,
int width) {
asm volatile(
"1: \n"
"1: \n"
"ld2 {v1.8b,v2.8b}, [%0], #16 \n" // load 16 Ys
"orr v3.8b, v2.8b, v2.8b \n"
"ld1 {v0.8b}, [%1], #8 \n" // load 8 Us
......@@ -1133,7 +1128,7 @@ void I422ToUYVYRow_NEON(const uint8* src_y,
void ARGBToRGB565Row_NEON(const uint8* src_argb, uint8* dst_rgb565, int width) {
asm volatile(
"1: \n"
"1: \n"
"ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8 pixels
"subs %w2, %w2, #8 \n" // 8 processed per loop.
ARGBTORGB565
......@@ -1152,7 +1147,7 @@ void ARGBToRGB565DitherRow_NEON(const uint8* src_argb,
int width) {
asm volatile(
"dup v1.4s, %w2 \n" // dither4
"1: \n"
"1: \n"
"ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n" // load 8 pixels
"subs %w3, %w3, #8 \n" // 8 processed per loop.
"uqadd v20.8b, v20.8b, v1.8b \n"
......@@ -1171,7 +1166,7 @@ void ARGBToARGB1555Row_NEON(const uint8* src_argb,
uint8* dst_argb1555,
int width) {
asm volatile(
"1: \n"
"1: \n"
"ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8 pixels
"subs %w2, %w2, #8 \n" // 8 processed per loop.
ARGBTOARGB1555
......@@ -1191,7 +1186,7 @@ void ARGBToARGB4444Row_NEON(const uint8* src_argb,
asm volatile(
"movi v4.16b, #0x0f \n" // bits to clear with
// vbic.
"1: \n"
"1: \n"
"ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8 pixels
"subs %w2, %w2, #8 \n" // 8 processed per loop.
ARGBTOARGB4444
......@@ -1211,9 +1206,8 @@ void ARGBToYRow_NEON(const uint8* src_argb, uint8* dst_y, int width) {
"movi v5.8b, #65 \n" // G * 0.5078 coefficient
"movi v6.8b, #33 \n" // R * 0.2578 coefficient
"movi v7.8b, #16 \n" // Add 16 constant
"1: \n"
"1: \n"
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB
// pixels.
"subs %w2, %w2, #8 \n" // 8 processed per loop.
"umull v3.8h, v0.8b, v4.8b \n" // B
"umlal v3.8h, v1.8b, v5.8b \n" // G
......@@ -1231,7 +1225,7 @@ void ARGBToYRow_NEON(const uint8* src_argb, uint8* dst_y, int width) {
void ARGBExtractAlphaRow_NEON(const uint8* src_argb, uint8* dst_a, int width) {
asm volatile(
"1: \n"
"1: \n"
"ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load row 16
// pixels
"subs %w2, %w2, #16 \n" // 16 processed per loop
......@@ -1250,9 +1244,8 @@ void ARGBToYJRow_NEON(const uint8* src_argb, uint8* dst_y, int width) {
"movi v4.8b, #15 \n" // B * 0.11400 coefficient
"movi v5.8b, #75 \n" // G * 0.58700 coefficient
"movi v6.8b, #38 \n" // R * 0.29900 coefficient
"1: \n"
"1: \n"
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB
// pixels.
"subs %w2, %w2, #8 \n" // 8 processed per loop.
"umull v3.8h, v0.8b, v4.8b \n" // B
"umlal v3.8h, v1.8b, v5.8b \n" // G
......@@ -1280,7 +1273,7 @@ void ARGBToUV444Row_NEON(const uint8* src_argb,
"movi v27.8b, #18 \n" // VB -0.1406 coefficient
"movi v28.8b, #94 \n" // VG -0.7344 coefficient
"movi v29.16b,#0x80 \n" // 128.5
"1: \n"
"1: \n"
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB
// pixels.
"subs %w3, %w3, #8 \n" // 8 processed per loop.
......@@ -1318,23 +1311,19 @@ void ARGBToUV444Row_NEON(const uint8* src_argb,
"movi v25.16b, #0x80 \n" /* 128.5 (0x8080 in 16-bit) */
// 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16.
// clang-format off
#define RGBTOUV(QB, QG, QR) \
"mul v3.8h, " #QB \
",v20.8h \n" /* B */ \
"mul v4.8h, " #QR \
",v20.8h \n" /* R */ \
"mls v3.8h, " #QG \
",v21.8h \n" /* G */ \
"mls v4.8h, " #QG \
",v24.8h \n" /* G */ \
"mls v3.8h, " #QR \
",v22.8h \n" /* R */ \
"mls v4.8h, " #QB \
",v23.8h \n" /* B */ \
"mul v3.8h, " #QB ",v20.8h \n" /* B */ \
"mul v4.8h, " #QR ",v20.8h \n" /* R */ \
"mls v3.8h, " #QG ",v21.8h \n" /* G */ \
"mls v4.8h, " #QG ",v24.8h \n" /* G */ \
"mls v3.8h, " #QR ",v22.8h \n" /* R */ \
"mls v4.8h, " #QB ",v23.8h \n" /* B */ \
"add v3.8h, v3.8h, v25.8h \n" /* +128 -> unsigned */ \
"add v4.8h, v4.8h, v25.8h \n" /* +128 -> unsigned */ \
"uqshrn v0.8b, v3.8h, #8 \n" /* 16 bit to 8 bit U */ \
"uqshrn v1.8b, v4.8h, #8 \n" /* 16 bit to 8 bit V */
// clang-format on
// TODO(fbarchard): Consider vhadd vertical, then vpaddl horizontal, avoid shr.
// TODO(fbarchard): consider ptrdiff_t for all strides.
......@@ -1626,9 +1615,8 @@ void RGB565ToUVRow_NEON(const uint8* src_rgb565,
"movi v24.8h, #19, lsl #0 \n" // UR coeff (-0.2969) / 2
"movi v25.8h, #9 , lsl #0 \n" // VB coeff (-0.1406) / 2
"movi v26.8h, #47, lsl #0 \n" // VG coeff (-0.7344) / 2
"movi v27.16b, #0x80 \n" // 128.5 (0x8080 in
// 16-bit)
"1: \n"
"movi v27.16b, #0x80 \n" // 128.5 0x8080 in 16bit
"1: \n"
"ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels.
RGB565TOARGB
"uaddlp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
......@@ -1693,7 +1681,7 @@ void ARGB1555ToUVRow_NEON(const uint8* src_argb1555,
const uint8* src_argb1555_1 = src_argb1555 + src_stride_argb1555;
asm volatile(
RGBTOUV_SETUP_REG
"1: \n"
"1: \n"
"ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels.
RGB555TOARGB
"uaddlp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
......@@ -1758,7 +1746,7 @@ void ARGB4444ToUVRow_NEON(const uint8* src_argb4444,
const uint8* src_argb4444_1 = src_argb4444 + src_stride_argb4444;
asm volatile(
RGBTOUV_SETUP_REG
"1: \n"
"1: \n"
"ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels.
ARGB4444TOARGB
"uaddlp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
......@@ -1822,7 +1810,7 @@ void RGB565ToYRow_NEON(const uint8* src_rgb565, uint8* dst_y, int width) {
"movi v25.8b, #65 \n" // G * 0.5078 coefficient
"movi v26.8b, #33 \n" // R * 0.2578 coefficient
"movi v27.8b, #16 \n" // Add 16 constant
"1: \n"
"1: \n"
"ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels.
"subs %w2, %w2, #8 \n" // 8 processed per loop.
RGB565TOARGB
......@@ -1847,7 +1835,7 @@ void ARGB1555ToYRow_NEON(const uint8* src_argb1555, uint8* dst_y, int width) {
"movi v5.8b, #65 \n" // G * 0.5078 coefficient
"movi v6.8b, #33 \n" // R * 0.2578 coefficient
"movi v7.8b, #16 \n" // Add 16 constant
"1: \n"
"1: \n"
"ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels.
"subs %w2, %w2, #8 \n" // 8 processed per loop.
ARGB1555TOARGB
......@@ -1871,7 +1859,7 @@ void ARGB4444ToYRow_NEON(const uint8* src_argb4444, uint8* dst_y, int width) {
"movi v25.8b, #65 \n" // G * 0.5078 coefficient
"movi v26.8b, #33 \n" // R * 0.2578 coefficient
"movi v27.8b, #16 \n" // Add 16 constant
"1: \n"
"1: \n"
"ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels.
"subs %w2, %w2, #8 \n" // 8 processed per loop.
ARGB4444TOARGB
......@@ -1895,7 +1883,7 @@ void BGRAToYRow_NEON(const uint8* src_bgra, uint8* dst_y, int width) {
"movi v5.8b, #65 \n" // G * 0.5078 coefficient
"movi v6.8b, #13 \n" // B * 0.1016 coefficient
"movi v7.8b, #16 \n" // Add 16 constant
"1: \n"
"1: \n"
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels.
"subs %w2, %w2, #8 \n" // 8 processed per loop.
"umull v16.8h, v1.8b, v4.8b \n" // R
......@@ -1918,7 +1906,7 @@ void ABGRToYRow_NEON(const uint8* src_abgr, uint8* dst_y, int width) {
"movi v5.8b, #65 \n" // G * 0.5078 coefficient
"movi v6.8b, #13 \n" // B * 0.1016 coefficient
"movi v7.8b, #16 \n" // Add 16 constant
"1: \n"
"1: \n"
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels.
"subs %w2, %w2, #8 \n" // 8 processed per loop.
"umull v16.8h, v0.8b, v4.8b \n" // R
......@@ -1941,7 +1929,7 @@ void RGBAToYRow_NEON(const uint8* src_rgba, uint8* dst_y, int width) {
"movi v5.8b, #65 \n" // G * 0.5078 coefficient
"movi v6.8b, #33 \n" // R * 0.2578 coefficient
"movi v7.8b, #16 \n" // Add 16 constant
"1: \n"
"1: \n"
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels.
"subs %w2, %w2, #8 \n" // 8 processed per loop.
"umull v16.8h, v1.8b, v4.8b \n" // B
......@@ -1964,7 +1952,7 @@ void RGB24ToYRow_NEON(const uint8* src_rgb24, uint8* dst_y, int width) {
"movi v5.8b, #65 \n" // G * 0.5078 coefficient
"movi v6.8b, #33 \n" // R * 0.2578 coefficient
"movi v7.8b, #16 \n" // Add 16 constant
"1: \n"
"1: \n"
"ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // load 8 pixels.
"subs %w2, %w2, #8 \n" // 8 processed per loop.
"umull v16.8h, v0.8b, v4.8b \n" // B
......@@ -1987,7 +1975,7 @@ void RAWToYRow_NEON(const uint8* src_raw, uint8* dst_y, int width) {
"movi v5.8b, #65 \n" // G * 0.5078 coefficient
"movi v6.8b, #13 \n" // B * 0.1016 coefficient
"movi v7.8b, #16 \n" // Add 16 constant
"1: \n"
"1: \n"
"ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // load 8 pixels.
"subs %w2, %w2, #8 \n" // 8 processed per loop.
"umull v16.8h, v0.8b, v4.8b \n" // B
......@@ -2022,7 +2010,7 @@ void InterpolateRow_NEON(uint8* dst_ptr,
"dup v5.16b, %w4 \n"
"dup v4.16b, %w5 \n"
// General purpose row blend.
"1: \n"
"1: \n"
"ld1 {v0.16b}, [%1], #16 \n"
"ld1 {v1.16b}, [%2], #16 \n"
"subs %w3, %w3, #16 \n"
......@@ -2037,7 +2025,7 @@ void InterpolateRow_NEON(uint8* dst_ptr,
"b 99f \n"
// Blend 50 / 50.
"50: \n"
"50: \n"
"ld1 {v0.16b}, [%1], #16 \n"
"ld1 {v1.16b}, [%2], #16 \n"
"subs %w3, %w3, #16 \n"
......@@ -2047,13 +2035,13 @@ void InterpolateRow_NEON(uint8* dst_ptr,
"b 99f \n"
// Blend 100 / 0 - Copy row unchanged.
"100: \n"
"100: \n"
"ld1 {v0.16b}, [%1], #16 \n"
"subs %w3, %w3, #16 \n"
"st1 {v0.16b}, [%0], #16 \n"
"b.gt 100b \n"
"99: \n"
"99: \n"
: "+r"(dst_ptr), // %0
"+r"(src_ptr), // %1
"+r"(src_ptr1), // %2
......@@ -2073,7 +2061,7 @@ void ARGBBlendRow_NEON(const uint8* src_argb0,
"subs %w3, %w3, #8 \n"
"b.lt 89f \n"
// Blend 8 pixels.
"8: \n"
"8: \n"
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB0
// pixels
"ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 ARGB1
......@@ -2096,12 +2084,12 @@ void ARGBBlendRow_NEON(const uint8* src_argb0,
// pixels
"b.ge 8b \n"
"89: \n"
"89: \n"
"adds %w3, %w3, #8-1 \n"
"b.lt 99f \n"
// Blend 1 pixels.
"1: \n"
"1: \n"
"ld4 {v0.b,v1.b,v2.b,v3.b}[0], [%0], #4 \n" // load 1 pixel ARGB0.
"ld4 {v4.b,v5.b,v6.b,v7.b}[0], [%1], #4 \n" // load 1 pixel ARGB1.
"subs %w3, %w3, #1 \n" // 1 processed per loop.
......@@ -2121,7 +2109,7 @@ void ARGBBlendRow_NEON(const uint8* src_argb0,
"st4 {v0.b,v1.b,v2.b,v3.b}[0], [%2], #4 \n" // store 1 pixel.
"b.ge 1b \n"
"99: \n"
"99: \n"
: "+r"(src_argb0), // %0
"+r"(src_argb1), // %1
......@@ -2136,9 +2124,8 @@ void ARGBBlendRow_NEON(const uint8* src_argb0,
void ARGBAttenuateRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) {
asm volatile(
// Attenuate 8 pixels.
"1: \n"
"1: \n"
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB
// pixels
"subs %w2, %w2, #8 \n" // 8 processed per loop.
"umull v4.8h, v0.8b, v3.8b \n" // b * a
"umull v5.8h, v1.8b, v3.8b \n" // g * a
......@@ -2170,9 +2157,8 @@ void ARGBQuantizeRow_NEON(uint8* dst_argb,
"dup v6.8h, %w4 \n" // interval add
// 8 pixel loop.
"1: \n"
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0] \n" // load 8 pixels of
// ARGB.
"1: \n"
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0] \n" // load 8 ARGB.
"subs %w1, %w1, #8 \n" // 8 processed per loop.
"uxtl v0.8h, v0.8b \n" // b (0 .. 255)
"uxtl v1.8h, v1.8b \n"
......@@ -2190,7 +2176,6 @@ void ARGBQuantizeRow_NEON(uint8* dst_argb,
"uqxtn v1.8b, v1.8h \n"
"uqxtn v2.8b, v2.8h \n"
"st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // store 8 ARGB
// pixels
"b.gt 1b \n"
: "+r"(dst_argb), // %0
"+r"(width) // %1
......@@ -2213,9 +2198,8 @@ void ARGBShadeRow_NEON(const uint8* src_argb,
"ushr v0.8h, v0.8h, #1 \n" // scale / 2.
// 8 pixel loop.
"1: \n"
"1: \n"
"ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%0], #32 \n" // load 8 ARGB
// pixels.
"subs %w2, %w2, #8 \n" // 8 processed per loop.
"uxtl v4.8h, v4.8b \n" // b (0 .. 255)
"uxtl v5.8h, v5.8b \n"
......@@ -2230,7 +2214,6 @@ void ARGBShadeRow_NEON(const uint8* src_argb,
"uqxtn v6.8b, v6.8h \n"
"uqxtn v7.8b, v7.8h \n"
"st4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // store 8 ARGB
// pixels
"b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
......@@ -2247,9 +2230,8 @@ void ARGBGrayRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) {
"movi v24.8b, #15 \n" // B * 0.11400 coefficient
"movi v25.8b, #75 \n" // G * 0.58700 coefficient
"movi v26.8b, #38 \n" // R * 0.29900 coefficient
"1: \n"
"1: \n"
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB
// pixels.
"subs %w2, %w2, #8 \n" // 8 processed per loop.
"umull v4.8h, v0.8b, v24.8b \n" // B
"umlal v4.8h, v1.8b, v25.8b \n" // G
......@@ -2282,7 +2264,7 @@ void ARGBSepiaRow_NEON(uint8* dst_argb, int width) {
"movi v28.8b, #24 \n" // BB coefficient
"movi v29.8b, #98 \n" // BG coefficient
"movi v30.8b, #50 \n" // BR coefficient
"1: \n"
"1: \n"
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0] \n" // load 8 ARGB pixels.
"subs %w1, %w1, #8 \n" // 8 processed per loop.
"umull v4.8h, v0.8b, v20.8b \n" // B to Sepia B
......@@ -2318,9 +2300,8 @@ void ARGBColorMatrixRow_NEON(const uint8* src_argb,
"sxtl v0.8h, v2.8b \n" // B,G coefficients s16.
"sxtl2 v1.8h, v2.16b \n" // R,A coefficients s16.
"1: \n"
"ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%0], #32 \n" // load 8
// pixels.
"1: \n"
"ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%0], #32 \n" // load 8 ARGB
"subs %w2, %w2, #8 \n" // 8 processed per loop.
"uxtl v16.8h, v16.8b \n" // b (0 .. 255) 16 bit
"uxtl v17.8h, v17.8b \n" // g
......@@ -2358,8 +2339,7 @@ void ARGBColorMatrixRow_NEON(const uint8* src_argb,
"sqshrun v17.8b, v23.8h, #6 \n" // 16 bit to 8 bit G
"sqshrun v18.8b, v24.8h, #6 \n" // 16 bit to 8 bit R
"sqshrun v19.8b, v25.8h, #6 \n" // 16 bit to 8 bit A
"st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%1], #32 \n" // store 8
// pixels.
"st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%1], #32 \n" // store 8 ARGB
"b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
......@@ -2377,11 +2357,9 @@ void ARGBMultiplyRow_NEON(const uint8* src_argb0,
int width) {
asm volatile(
// 8 pixel loop.
"1: \n"
"1: \n"
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB
// pixels.
"ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more
// pixels.
"subs %w3, %w3, #8 \n" // 8 processed per loop.
"umull v0.8h, v0.8b, v4.8b \n" // multiply B
"umull v1.8h, v1.8b, v5.8b \n" // multiply G
......@@ -2392,9 +2370,7 @@ void ARGBMultiplyRow_NEON(const uint8* src_argb0,
"rshrn v2.8b, v2.8h, #8 \n" // 16 bit to 8 bit R
"rshrn v3.8b, v3.8h, #8 \n" // 16 bit to 8 bit A
"st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB
// pixels
"b.gt 1b \n"
: "+r"(src_argb0), // %0
"+r"(src_argb1), // %1
"+r"(dst_argb), // %2
......@@ -2410,20 +2386,16 @@ void ARGBAddRow_NEON(const uint8* src_argb0,
int width) {
asm volatile(
// 8 pixel loop.
"1: \n"
"1: \n"
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB
// pixels.
"ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more
// pixels.
"subs %w3, %w3, #8 \n" // 8 processed per loop.
"uqadd v0.8b, v0.8b, v4.8b \n"
"uqadd v1.8b, v1.8b, v5.8b \n"
"uqadd v2.8b, v2.8b, v6.8b \n"
"uqadd v3.8b, v3.8b, v7.8b \n"
"st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB
// pixels
"b.gt 1b \n"
: "+r"(src_argb0), // %0
"+r"(src_argb1), // %1
"+r"(dst_argb), // %2
......@@ -2439,20 +2411,16 @@ void ARGBSubtractRow_NEON(const uint8* src_argb0,
int width) {
asm volatile(
// 8 pixel loop.
"1: \n"
"1: \n"
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB
// pixels.
"ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more
// pixels.
"subs %w3, %w3, #8 \n" // 8 processed per loop.
"uqsub v0.8b, v0.8b, v4.8b \n"
"uqsub v1.8b, v1.8b, v5.8b \n"
"uqsub v2.8b, v2.8b, v6.8b \n"
"uqsub v3.8b, v3.8b, v7.8b \n"
"st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB
// pixels
"b.gt 1b \n"
: "+r"(src_argb0), // %0
"+r"(src_argb1), // %1
"+r"(dst_argb), // %2
......@@ -2473,7 +2441,7 @@ void SobelRow_NEON(const uint8* src_sobelx,
asm volatile(
"movi v3.8b, #255 \n" // alpha
// 8 pixel loop.
"1: \n"
"1: \n"
"ld1 {v0.8b}, [%0], #8 \n" // load 8 sobelx.
"ld1 {v1.8b}, [%1], #8 \n" // load 8 sobely.
"subs %w3, %w3, #8 \n" // 8 processed per loop.
......@@ -2481,7 +2449,6 @@ void SobelRow_NEON(const uint8* src_sobelx,
"orr v1.8b, v0.8b, v0.8b \n"
"orr v2.8b, v0.8b, v0.8b \n"
"st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB
// pixels
"b.gt 1b \n"
: "+r"(src_sobelx), // %0
"+r"(src_sobely), // %1
......@@ -2498,7 +2465,7 @@ void SobelToPlaneRow_NEON(const uint8* src_sobelx,
int width) {
asm volatile(
// 16 pixel loop.
"1: \n"
"1: \n"
"ld1 {v0.16b}, [%0], #16 \n" // load 16 sobelx.
"ld1 {v1.16b}, [%1], #16 \n" // load 16 sobely.
"subs %w3, %w3, #16 \n" // 16 processed per loop.
......@@ -2525,13 +2492,12 @@ void SobelXYRow_NEON(const uint8* src_sobelx,
asm volatile(
"movi v3.8b, #255 \n" // alpha
// 8 pixel loop.
"1: \n"
"1: \n"
"ld1 {v2.8b}, [%0], #8 \n" // load 8 sobelx.
"ld1 {v0.8b}, [%1], #8 \n" // load 8 sobely.
"subs %w3, %w3, #8 \n" // 8 processed per loop.
"uqadd v1.8b, v0.8b, v2.8b \n" // add
"st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB
// pixels
"b.gt 1b \n"
: "+r"(src_sobelx), // %0
"+r"(src_sobely), // %1
......@@ -2551,7 +2517,7 @@ void SobelXRow_NEON(const uint8* src_y0,
uint8* dst_sobelx,
int width) {
asm volatile(
"1: \n"
"1: \n"
"ld1 {v0.8b}, [%0],%5 \n" // top
"ld1 {v1.8b}, [%0],%6 \n"
"usubl v0.8h, v0.8b, v1.8b \n"
......@@ -2589,7 +2555,7 @@ void SobelYRow_NEON(const uint8* src_y0,
uint8* dst_sobely,
int width) {
asm volatile(
"1: \n"
"1: \n"
"ld1 {v0.8b}, [%0],%4 \n" // left
"ld1 {v1.8b}, [%1],%4 \n"
"usubl v0.8h, v0.8b, v1.8b \n"
......@@ -2620,7 +2586,7 @@ void SobelYRow_NEON(const uint8* src_y0,
// Caveat - rounds float to half float whereas scaling version truncates.
void HalfFloat1Row_NEON(const uint16* src, uint16* dst, float, int width) {
asm volatile(
"1: \n"
"1: \n"
"ld1 {v1.16b}, [%0], #16 \n" // load 8 shorts
"subs %w2, %w2, #8 \n" // 8 pixels per loop
"uxtl v2.4s, v1.4h \n" // 8 int's
......@@ -2640,7 +2606,7 @@ void HalfFloat1Row_NEON(const uint16* src, uint16* dst, float, int width) {
void HalfFloatRow_NEON(const uint16* src, uint16* dst, float scale, int width) {
asm volatile(
"1: \n"
"1: \n"
"ld1 {v1.16b}, [%0], #16 \n" // load 8 shorts
"subs %w2, %w2, #8 \n" // 8 pixels per loop
"uxtl v2.4s, v1.4h \n" // 8 int's
......@@ -2680,7 +2646,6 @@ float ScaleMaxSamples_NEON(const float* src,
"b.gt 1b \n"
"fmax v5.4s, v5.4s, v6.4s \n" // max
"fmaxv %s3, v5.4s \n" // signed max acculator
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width), // %2
......@@ -2707,12 +2672,10 @@ float ScaleSumSamples_NEON(const float* src,
"fmla v5.4s, v1.4s, v1.4s \n" // sum of squares
"fmla v6.4s, v2.4s, v2.4s \n"
"st1 {v3.4s, v4.4s}, [%1], #32 \n" // store 8 samples
"b.gt 1b \n"
"faddp v5.4s, v5.4s, v6.4s \n"
"faddp v5.4s, v5.4s, v5.4s \n"
"faddp %3.4s, v5.4s, v5.4s \n" // sum
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width), // %2
......@@ -2731,7 +2694,6 @@ void ScaleSamples_NEON(const float* src, float* dst, float scale, int width) {
"fmul v2.4s, v2.4s, %3.s[0] \n" // scale
"st1 {v1.4s, v2.4s}, [%1], #32 \n" // store 8 samples
"b.gt 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2
......@@ -2768,7 +2730,6 @@ void GaussCol_NEON(const uint16* src0,
"subs %w6, %w6, #8 \n" // 8 processed per loop
"st1 {v0.4s,v1.4s}, [%5], #32 \n" // store 8 samples
"b.gt 1b \n"
: "+r"(src0), // %0
"+r"(src1), // %1
"+r"(src2), // %2
......@@ -2807,7 +2768,6 @@ void GaussRow_NEON(const uint32* src, uint16* dst, int width) {
"uqrshrn2 v0.8h, v1.4s, #8 \n"
"st1 {v0.8h}, [%4], #16 \n" // store 8 samples
"b.gt 1b \n"
: "+r"(src), // %0
"+r"(src1), // %1
"+r"(src2), // %2
......
......@@ -77,11 +77,9 @@ void ScaleRowDown2Box_NEON(const uint8* src_ptr,
"subs %w3, %w3, #16 \n" // 16 processed per loop
"uaddlp v0.8h, v0.16b \n" // row 1 add adjacent
"uaddlp v1.8h, v1.16b \n"
"uadalp v0.8h, v2.16b \n" // row 2 add adjacent +
// row1
"uadalp v0.8h, v2.16b \n" // += row 2 add adjacent
"uadalp v1.8h, v3.16b \n"
"rshrn v0.8b, v0.8h, #2 \n" // downshift, round and
// pack
"rshrn v0.8b, v0.8h, #2 \n" // round and pack
"rshrn2 v0.16b, v1.8h, #2 \n"
"st1 {v0.16b}, [%2], #16 \n"
"b.gt 1b \n"
......@@ -101,7 +99,7 @@ void ScaleRowDown4_NEON(const uint8* src_ptr,
(void)src_stride;
asm volatile(
"1: \n"
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0
"subs %w2, %w2, #8 \n" // 8 processed per loop
"st1 {v2.8b}, [%1], #8 \n"
"b.gt 1b \n"
......@@ -230,7 +228,7 @@ void ScaleRowDown34_1_Box_NEON(const uint8* src_ptr,
asm volatile(
"movi v20.8b, #3 \n"
"add %3, %3, %0 \n"
"1: \n"
"1: \n"
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0
"ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%3], #32 \n" // src line 1
"subs %w2, %w2, #24 \n"
......@@ -279,7 +277,7 @@ void ScaleRowDown38_NEON(const uint8* src_ptr,
(void)src_stride;
asm volatile(
"ld1 {v3.16b}, [%3] \n"
"1: \n"
"1: \n"
"ld1 {v0.16b,v1.16b}, [%0], #32 \n"
"subs %w2, %w2, #12 \n"
"tbl v2.16b, {v0.16b,v1.16b}, v3.16b \n"
......@@ -394,8 +392,7 @@ void OMITFP ScaleRowDown38_3_Box_NEON(const uint8* src_ptr,
"sqrdmulh v0.8h, v20.8h, v31.8h \n"
"sqrdmulh v1.8h, v21.8h, v31.8h \n"
// Align for table lookup, vtbl requires registers to
// be adjacent
// Align for table lookup, vtbl requires registers to be adjacent
"tbl v3.16b, {v0.16b, v1.16b, v2.16b}, v30.16b \n"
"st1 {v3.8b}, [%1], #8 \n"
......@@ -776,8 +773,7 @@ void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr,
"uadalp v1.8h, v17.16b \n" // G 16 bytes -> 8 shorts.
"uadalp v2.8h, v18.16b \n" // R 16 bytes -> 8 shorts.
"uadalp v3.8h, v19.16b \n" // A 16 bytes -> 8 shorts.
"rshrn v0.8b, v0.8h, #2 \n" // downshift, round and
// pack
"rshrn v0.8b, v0.8h, #2 \n" // round and pack
"rshrn v1.8b, v1.8h, #2 \n"
"rshrn v2.8b, v2.8h, #2 \n"
"rshrn v3.8b, v3.8h, #2 \n"
......@@ -827,8 +823,7 @@ void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb,
asm volatile(
"add %1, %1, %0 \n"
"1: \n"
"ld1 {v0.8b}, [%0], %4 \n" // Read 4 2x2 blocks ->
// 2x1
"ld1 {v0.8b}, [%0], %4 \n" // Read 4 2x2 -> 2x1
"ld1 {v1.8b}, [%1], %4 \n"
"ld1 {v2.8b}, [%0], %4 \n"
"ld1 {v3.8b}, [%1], %4 \n"
......@@ -891,8 +886,7 @@ void ScaleARGBCols_NEON(uint8* dst_argb,
LOAD1_DATA32_LANE(v1, 3)
// clang-format on
"st1 {v0.4s, v1.4s}, [%0], #32 \n" // store pixels
"subs %w2, %w2, #8 \n" // 8 processed per
// loop
"subs %w2, %w2, #8 \n" // 8 processed per loop
"b.gt 1b \n"
: "+r"(dst_argb), // %0
"+r"(src_argb), // %1
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment