Commit 0a3d23c8 authored by Frank Barchard

fix clang-format-ing for row arm functions

TBR=kjellander@chromium.org
BUG=None
TEST=git cl lint

Change-Id: I45ecd7f8279981ba037dc051f521f6b6d5506f64
Reviewed-on: https://chromium-review.googlesource.com/664345
Commit-Queue: Frank Barchard <fbarchard@google.com>
Reviewed-by: Frank Barchard <fbarchard@google.com>
parent 753a91cb
...@@ -29,7 +29,7 @@ uint32 HammingDistance_NEON(const uint8* src_a, const uint8* src_b, int count) { ...@@ -29,7 +29,7 @@ uint32 HammingDistance_NEON(const uint8* src_a, const uint8* src_b, int count) {
asm volatile( asm volatile(
"vmov.u16 q4, #0 \n" // accumulator "vmov.u16 q4, #0 \n" // accumulator
"1: \n" "1: \n"
"vld1.8 {q0, q1}, [%0]! \n" "vld1.8 {q0, q1}, [%0]! \n"
"vld1.8 {q2, q3}, [%1]! \n" "vld1.8 {q2, q3}, [%1]! \n"
"veor.32 q0, q0, q2 \n" "veor.32 q0, q0, q2 \n"
...@@ -60,7 +60,7 @@ uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count) { ...@@ -60,7 +60,7 @@ uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count) {
"vmov.u8 q9, #0 \n" "vmov.u8 q9, #0 \n"
"vmov.u8 q11, #0 \n" "vmov.u8 q11, #0 \n"
"1: \n" "1: \n"
"vld1.8 {q0}, [%0]! \n" "vld1.8 {q0}, [%0]! \n"
"vld1.8 {q1}, [%1]! \n" "vld1.8 {q1}, [%1]! \n"
"subs %2, %2, #16 \n" "subs %2, %2, #16 \n"
......
...@@ -27,7 +27,7 @@ uint32 HammingDistance_NEON(const uint8* src_a, const uint8* src_b, int count) { ...@@ -27,7 +27,7 @@ uint32 HammingDistance_NEON(const uint8* src_a, const uint8* src_b, int count) {
asm volatile( asm volatile(
"movi v4.8h, #0 \n" "movi v4.8h, #0 \n"
"1: \n" "1: \n"
"ld1 {v0.16b, v1.16b}, [%0], #32 \n" "ld1 {v0.16b, v1.16b}, [%0], #32 \n"
"ld1 {v2.16b, v3.16b}, [%1], #32 \n" "ld1 {v2.16b, v3.16b}, [%1], #32 \n"
"eor v0.16b, v0.16b, v2.16b \n" "eor v0.16b, v0.16b, v2.16b \n"
...@@ -55,7 +55,7 @@ uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count) { ...@@ -55,7 +55,7 @@ uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count) {
"eor v17.16b, v17.16b, v17.16b \n" "eor v17.16b, v17.16b, v17.16b \n"
"eor v19.16b, v19.16b, v19.16b \n" "eor v19.16b, v19.16b, v19.16b \n"
"1: \n" "1: \n"
"ld1 {v0.16b}, [%0], #16 \n" "ld1 {v0.16b}, [%0], #16 \n"
"ld1 {v1.16b}, [%1], #16 \n" "ld1 {v1.16b}, [%1], #16 \n"
"subs %w2, %w2, #16 \n" "subs %w2, %w2, #16 \n"
......
...@@ -115,7 +115,7 @@ void I444ToARGBRow_NEON(const uint8* src_y, ...@@ -115,7 +115,7 @@ void I444ToARGBRow_NEON(const uint8* src_y,
asm volatile( asm volatile(
YUVTORGB_SETUP YUVTORGB_SETUP
"vmov.u8 d23, #255 \n" "vmov.u8 d23, #255 \n"
"1: \n" READYUV444 YUVTORGB "1: \n" READYUV444 YUVTORGB
"subs %4, %4, #8 \n" "subs %4, %4, #8 \n"
"vst4.8 {d20, d21, d22, d23}, [%3]! \n" "vst4.8 {d20, d21, d22, d23}, [%3]! \n"
"bgt 1b \n" "bgt 1b \n"
...@@ -141,7 +141,7 @@ void I422ToARGBRow_NEON(const uint8* src_y, ...@@ -141,7 +141,7 @@ void I422ToARGBRow_NEON(const uint8* src_y,
asm volatile( asm volatile(
YUVTORGB_SETUP YUVTORGB_SETUP
"vmov.u8 d23, #255 \n" "vmov.u8 d23, #255 \n"
"1: \n" READYUV422 YUVTORGB "1: \n" READYUV422 YUVTORGB
"subs %4, %4, #8 \n" "subs %4, %4, #8 \n"
"vst4.8 {d20, d21, d22, d23}, [%3]! \n" "vst4.8 {d20, d21, d22, d23}, [%3]! \n"
"bgt 1b \n" "bgt 1b \n"
...@@ -167,7 +167,7 @@ void I422AlphaToARGBRow_NEON(const uint8* src_y, ...@@ -167,7 +167,7 @@ void I422AlphaToARGBRow_NEON(const uint8* src_y,
int width) { int width) {
asm volatile( asm volatile(
YUVTORGB_SETUP YUVTORGB_SETUP
"1: \n" READYUV422 YUVTORGB "1: \n" READYUV422 YUVTORGB
"subs %5, %5, #8 \n" "subs %5, %5, #8 \n"
"vld1.8 {d23}, [%3]! \n" "vld1.8 {d23}, [%3]! \n"
"vst4.8 {d20, d21, d22, d23}, [%4]! \n" "vst4.8 {d20, d21, d22, d23}, [%4]! \n"
...@@ -194,7 +194,7 @@ void I422ToRGBARow_NEON(const uint8* src_y, ...@@ -194,7 +194,7 @@ void I422ToRGBARow_NEON(const uint8* src_y,
int width) { int width) {
asm volatile( asm volatile(
YUVTORGB_SETUP YUVTORGB_SETUP
"1: \n" READYUV422 YUVTORGB "1: \n" READYUV422 YUVTORGB
"subs %4, %4, #8 \n" "subs %4, %4, #8 \n"
"vmov.u8 d19, #255 \n" // d19 modified by "vmov.u8 d19, #255 \n" // d19 modified by
// YUVTORGB // YUVTORGB
...@@ -221,7 +221,7 @@ void I422ToRGB24Row_NEON(const uint8* src_y, ...@@ -221,7 +221,7 @@ void I422ToRGB24Row_NEON(const uint8* src_y,
int width) { int width) {
asm volatile( asm volatile(
YUVTORGB_SETUP YUVTORGB_SETUP
"1: \n" READYUV422 YUVTORGB "1: \n" READYUV422 YUVTORGB
"subs %4, %4, #8 \n" "subs %4, %4, #8 \n"
"vst3.8 {d20, d21, d22}, [%3]! \n" "vst3.8 {d20, d21, d22}, [%3]! \n"
"bgt 1b \n" "bgt 1b \n"
...@@ -253,7 +253,7 @@ void I422ToRGB565Row_NEON(const uint8* src_y, ...@@ -253,7 +253,7 @@ void I422ToRGB565Row_NEON(const uint8* src_y,
int width) { int width) {
asm volatile( asm volatile(
YUVTORGB_SETUP YUVTORGB_SETUP
"1: \n" READYUV422 YUVTORGB "1: \n" READYUV422 YUVTORGB
"subs %4, %4, #8 \n" ARGBTORGB565 "subs %4, %4, #8 \n" ARGBTORGB565
"vst1.8 {q0}, [%3]! \n" // store 8 pixels RGB565. "vst1.8 {q0}, [%3]! \n" // store 8 pixels RGB565.
"bgt 1b \n" "bgt 1b \n"
...@@ -287,7 +287,7 @@ void I422ToARGB1555Row_NEON(const uint8* src_y, ...@@ -287,7 +287,7 @@ void I422ToARGB1555Row_NEON(const uint8* src_y,
int width) { int width) {
asm volatile( asm volatile(
YUVTORGB_SETUP YUVTORGB_SETUP
"1: \n" READYUV422 YUVTORGB "1: \n" READYUV422 YUVTORGB
"subs %4, %4, #8 \n" "subs %4, %4, #8 \n"
"vmov.u8 d23, #255 \n" ARGBTOARGB1555 "vmov.u8 d23, #255 \n" ARGBTOARGB1555
"vst1.8 {q0}, [%3]! \n" // store 8 pixels "vst1.8 {q0}, [%3]! \n" // store 8 pixels
...@@ -325,7 +325,7 @@ void I422ToARGB4444Row_NEON(const uint8* src_y, ...@@ -325,7 +325,7 @@ void I422ToARGB4444Row_NEON(const uint8* src_y,
YUVTORGB_SETUP YUVTORGB_SETUP
"vmov.u8 d4, #0x0f \n" // bits to clear with "vmov.u8 d4, #0x0f \n" // bits to clear with
// vbic. // vbic.
"1: \n" READYUV422 YUVTORGB "1: \n" READYUV422 YUVTORGB
"subs %4, %4, #8 \n" "subs %4, %4, #8 \n"
"vmov.u8 d23, #255 \n" ARGBTOARGB4444 "vmov.u8 d23, #255 \n" ARGBTOARGB4444
"vst1.8 {q0}, [%3]! \n" // store 8 pixels "vst1.8 {q0}, [%3]! \n" // store 8 pixels
...@@ -348,7 +348,7 @@ void I400ToARGBRow_NEON(const uint8* src_y, uint8* dst_argb, int width) { ...@@ -348,7 +348,7 @@ void I400ToARGBRow_NEON(const uint8* src_y, uint8* dst_argb, int width) {
asm volatile( asm volatile(
YUVTORGB_SETUP YUVTORGB_SETUP
"vmov.u8 d23, #255 \n" "vmov.u8 d23, #255 \n"
"1: \n" READYUV400 YUVTORGB "1: \n" READYUV400 YUVTORGB
"subs %2, %2, #8 \n" "subs %2, %2, #8 \n"
"vst4.8 {d20, d21, d22, d23}, [%1]! \n" "vst4.8 {d20, d21, d22, d23}, [%1]! \n"
"bgt 1b \n" "bgt 1b \n"
...@@ -366,7 +366,7 @@ void I400ToARGBRow_NEON(const uint8* src_y, uint8* dst_argb, int width) { ...@@ -366,7 +366,7 @@ void I400ToARGBRow_NEON(const uint8* src_y, uint8* dst_argb, int width) {
void J400ToARGBRow_NEON(const uint8* src_y, uint8* dst_argb, int width) { void J400ToARGBRow_NEON(const uint8* src_y, uint8* dst_argb, int width) {
asm volatile( asm volatile(
"vmov.u8 d23, #255 \n" "vmov.u8 d23, #255 \n"
"1: \n" "1: \n"
"vld1.8 {d20}, [%0]! \n" "vld1.8 {d20}, [%0]! \n"
"vmov d21, d20 \n" "vmov d21, d20 \n"
"vmov d22, d20 \n" "vmov d22, d20 \n"
...@@ -385,23 +385,22 @@ void NV12ToARGBRow_NEON(const uint8* src_y, ...@@ -385,23 +385,22 @@ void NV12ToARGBRow_NEON(const uint8* src_y,
uint8* dst_argb, uint8* dst_argb,
const struct YuvConstants* yuvconstants, const struct YuvConstants* yuvconstants,
int width) { int width) {
asm volatile( asm volatile(YUVTORGB_SETUP
YUVTORGB_SETUP "vmov.u8 d23, #255 \n"
"vmov.u8 d23, #255 \n" "1: \n" READNV12 YUVTORGB
"1: \n" READNV12 YUVTORGB "subs %3, %3, #8 \n"
"subs %3, %3, #8 \n" "vst4.8 {d20, d21, d22, d23}, [%2]! \n"
"vst4.8 {d20, d21, d22, d23}, [%2]! \n" "bgt 1b \n"
"bgt 1b \n" : "+r"(src_y), // %0
: "+r"(src_y), // %0 "+r"(src_uv), // %1
"+r"(src_uv), // %1 "+r"(dst_argb), // %2
"+r"(dst_argb), // %2 "+r"(width) // %3
"+r"(width) // %3 : [kUVToRB] "r"(&yuvconstants->kUVToRB),
: [kUVToRB] "r"(&yuvconstants->kUVToRB), [kUVToG] "r"(&yuvconstants->kUVToG),
[kUVToG] "r"(&yuvconstants->kUVToG), [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
[kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR), [kYToRgb] "r"(&yuvconstants->kYToRgb)
[kYToRgb] "r"(&yuvconstants->kYToRgb) : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9",
: "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11", "q10", "q11", "q12", "q13", "q14", "q15");
"q12", "q13", "q14", "q15");
} }
void NV21ToARGBRow_NEON(const uint8* src_y, void NV21ToARGBRow_NEON(const uint8* src_y,
...@@ -409,23 +408,22 @@ void NV21ToARGBRow_NEON(const uint8* src_y, ...@@ -409,23 +408,22 @@ void NV21ToARGBRow_NEON(const uint8* src_y,
uint8* dst_argb, uint8* dst_argb,
const struct YuvConstants* yuvconstants, const struct YuvConstants* yuvconstants,
int width) { int width) {
asm volatile( asm volatile(YUVTORGB_SETUP
YUVTORGB_SETUP "vmov.u8 d23, #255 \n"
"vmov.u8 d23, #255 \n" "1: \n" READNV21 YUVTORGB
"1: \n" READNV21 YUVTORGB "subs %3, %3, #8 \n"
"subs %3, %3, #8 \n" "vst4.8 {d20, d21, d22, d23}, [%2]! \n"
"vst4.8 {d20, d21, d22, d23}, [%2]! \n" "bgt 1b \n"
"bgt 1b \n" : "+r"(src_y), // %0
: "+r"(src_y), // %0 "+r"(src_vu), // %1
"+r"(src_vu), // %1 "+r"(dst_argb), // %2
"+r"(dst_argb), // %2 "+r"(width) // %3
"+r"(width) // %3 : [kUVToRB] "r"(&yuvconstants->kUVToRB),
: [kUVToRB] "r"(&yuvconstants->kUVToRB), [kUVToG] "r"(&yuvconstants->kUVToG),
[kUVToG] "r"(&yuvconstants->kUVToG), [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
[kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR), [kYToRgb] "r"(&yuvconstants->kYToRgb)
[kYToRgb] "r"(&yuvconstants->kYToRgb) : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9",
: "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11", "q10", "q11", "q12", "q13", "q14", "q15");
"q12", "q13", "q14", "q15");
} }
void NV12ToRGB565Row_NEON(const uint8* src_y, void NV12ToRGB565Row_NEON(const uint8* src_y,
...@@ -435,7 +433,7 @@ void NV12ToRGB565Row_NEON(const uint8* src_y, ...@@ -435,7 +433,7 @@ void NV12ToRGB565Row_NEON(const uint8* src_y,
int width) { int width) {
asm volatile( asm volatile(
YUVTORGB_SETUP YUVTORGB_SETUP
"1: \n" READNV12 YUVTORGB "1: \n" READNV12 YUVTORGB
"subs %3, %3, #8 \n" ARGBTORGB565 "subs %3, %3, #8 \n" ARGBTORGB565
"vst1.8 {q0}, [%2]! \n" // store 8 pixels RGB565. "vst1.8 {q0}, [%2]! \n" // store 8 pixels RGB565.
"bgt 1b \n" "bgt 1b \n"
...@@ -455,44 +453,42 @@ void YUY2ToARGBRow_NEON(const uint8* src_yuy2, ...@@ -455,44 +453,42 @@ void YUY2ToARGBRow_NEON(const uint8* src_yuy2,
uint8* dst_argb, uint8* dst_argb,
const struct YuvConstants* yuvconstants, const struct YuvConstants* yuvconstants,
int width) { int width) {
asm volatile( asm volatile(YUVTORGB_SETUP
YUVTORGB_SETUP "vmov.u8 d23, #255 \n"
"vmov.u8 d23, #255 \n" "1: \n" READYUY2 YUVTORGB
"1: \n" READYUY2 YUVTORGB "subs %2, %2, #8 \n"
"subs %2, %2, #8 \n" "vst4.8 {d20, d21, d22, d23}, [%1]! \n"
"vst4.8 {d20, d21, d22, d23}, [%1]! \n" "bgt 1b \n"
"bgt 1b \n" : "+r"(src_yuy2), // %0
: "+r"(src_yuy2), // %0 "+r"(dst_argb), // %1
"+r"(dst_argb), // %1 "+r"(width) // %2
"+r"(width) // %2 : [kUVToRB] "r"(&yuvconstants->kUVToRB),
: [kUVToRB] "r"(&yuvconstants->kUVToRB), [kUVToG] "r"(&yuvconstants->kUVToG),
[kUVToG] "r"(&yuvconstants->kUVToG), [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
[kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR), [kYToRgb] "r"(&yuvconstants->kYToRgb)
[kYToRgb] "r"(&yuvconstants->kYToRgb) : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9",
: "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11", "q10", "q11", "q12", "q13", "q14", "q15");
"q12", "q13", "q14", "q15");
} }
void UYVYToARGBRow_NEON(const uint8* src_uyvy, void UYVYToARGBRow_NEON(const uint8* src_uyvy,
uint8* dst_argb, uint8* dst_argb,
const struct YuvConstants* yuvconstants, const struct YuvConstants* yuvconstants,
int width) { int width) {
asm volatile( asm volatile(YUVTORGB_SETUP
YUVTORGB_SETUP "vmov.u8 d23, #255 \n"
"vmov.u8 d23, #255 \n" "1: \n" READUYVY YUVTORGB
"1: \n" READUYVY YUVTORGB "subs %2, %2, #8 \n"
"subs %2, %2, #8 \n" "vst4.8 {d20, d21, d22, d23}, [%1]! \n"
"vst4.8 {d20, d21, d22, d23}, [%1]! \n" "bgt 1b \n"
"bgt 1b \n" : "+r"(src_uyvy), // %0
: "+r"(src_uyvy), // %0 "+r"(dst_argb), // %1
"+r"(dst_argb), // %1 "+r"(width) // %2
"+r"(width) // %2 : [kUVToRB] "r"(&yuvconstants->kUVToRB),
: [kUVToRB] "r"(&yuvconstants->kUVToRB), [kUVToG] "r"(&yuvconstants->kUVToG),
[kUVToG] "r"(&yuvconstants->kUVToG), [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
[kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR), [kYToRgb] "r"(&yuvconstants->kYToRgb)
[kYToRgb] "r"(&yuvconstants->kYToRgb) : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9",
: "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11", "q10", "q11", "q12", "q13", "q14", "q15");
"q12", "q13", "q14", "q15");
} }
// Reads 16 pairs of UV and write even values to dst_u and odd to dst_v. // Reads 16 pairs of UV and write even values to dst_u and odd to dst_v.
...@@ -501,7 +497,7 @@ void SplitUVRow_NEON(const uint8* src_uv, ...@@ -501,7 +497,7 @@ void SplitUVRow_NEON(const uint8* src_uv,
uint8* dst_v, uint8* dst_v,
int width) { int width) {
asm volatile( asm volatile(
"1: \n" "1: \n"
"vld2.8 {q0, q1}, [%0]! \n" // load 16 pairs of UV "vld2.8 {q0, q1}, [%0]! \n" // load 16 pairs of UV
"subs %3, %3, #16 \n" // 16 processed per loop "subs %3, %3, #16 \n" // 16 processed per loop
"vst1.8 {q0}, [%1]! \n" // store U "vst1.8 {q0}, [%1]! \n" // store U
...@@ -522,7 +518,7 @@ void MergeUVRow_NEON(const uint8* src_u, ...@@ -522,7 +518,7 @@ void MergeUVRow_NEON(const uint8* src_u,
uint8* dst_uv, uint8* dst_uv,
int width) { int width) {
asm volatile( asm volatile(
"1: \n" "1: \n"
"vld1.8 {q0}, [%0]! \n" // load U "vld1.8 {q0}, [%0]! \n" // load U
"vld1.8 {q1}, [%1]! \n" // load V "vld1.8 {q1}, [%1]! \n" // load V
"subs %3, %3, #16 \n" // 16 processed per loop "subs %3, %3, #16 \n" // 16 processed per loop
...@@ -590,7 +586,7 @@ void MergeRGBRow_NEON(const uint8* src_r, ...@@ -590,7 +586,7 @@ void MergeRGBRow_NEON(const uint8* src_r,
// Copy multiple of 32. vld4.8 allow unaligned and is fastest on a15. // Copy multiple of 32. vld4.8 allow unaligned and is fastest on a15.
void CopyRow_NEON(const uint8* src, uint8* dst, int count) { void CopyRow_NEON(const uint8* src, uint8* dst, int count) {
asm volatile( asm volatile(
"1: \n" "1: \n"
"vld1.8 {d0, d1, d2, d3}, [%0]! \n" // load 32 "vld1.8 {d0, d1, d2, d3}, [%0]! \n" // load 32
"subs %2, %2, #32 \n" // 32 processed per loop "subs %2, %2, #32 \n" // 32 processed per loop
"vst1.8 {d0, d1, d2, d3}, [%1]! \n" // store 32 "vst1.8 {d0, d1, d2, d3}, [%1]! \n" // store 32
...@@ -607,7 +603,7 @@ void CopyRow_NEON(const uint8* src, uint8* dst, int count) { ...@@ -607,7 +603,7 @@ void CopyRow_NEON(const uint8* src, uint8* dst, int count) {
void SetRow_NEON(uint8* dst, uint8 v8, int count) { void SetRow_NEON(uint8* dst, uint8 v8, int count) {
asm volatile( asm volatile(
"vdup.8 q0, %2 \n" // duplicate 16 bytes "vdup.8 q0, %2 \n" // duplicate 16 bytes
"1: \n" "1: \n"
"subs %1, %1, #16 \n" // 16 bytes per loop "subs %1, %1, #16 \n" // 16 bytes per loop
"vst1.8 {q0}, [%0]! \n" // store "vst1.8 {q0}, [%0]! \n" // store
"bgt 1b \n" "bgt 1b \n"
...@@ -621,7 +617,7 @@ void SetRow_NEON(uint8* dst, uint8 v8, int count) { ...@@ -621,7 +617,7 @@ void SetRow_NEON(uint8* dst, uint8 v8, int count) {
void ARGBSetRow_NEON(uint8* dst, uint32 v32, int count) { void ARGBSetRow_NEON(uint8* dst, uint32 v32, int count) {
asm volatile( asm volatile(
"vdup.u32 q0, %2 \n" // duplicate 4 ints "vdup.u32 q0, %2 \n" // duplicate 4 ints
"1: \n" "1: \n"
"subs %1, %1, #4 \n" // 4 pixels per loop "subs %1, %1, #4 \n" // 4 pixels per loop
"vst1.8 {q0}, [%0]! \n" // store "vst1.8 {q0}, [%0]! \n" // store
"bgt 1b \n" "bgt 1b \n"
...@@ -638,7 +634,7 @@ void MirrorRow_NEON(const uint8* src, uint8* dst, int width) { ...@@ -638,7 +634,7 @@ void MirrorRow_NEON(const uint8* src, uint8* dst, int width) {
"add %0, %0, %2 \n" "add %0, %0, %2 \n"
"sub %0, #16 \n" "sub %0, #16 \n"
"1: \n" "1: \n"
"vld1.8 {q0}, [%0], r3 \n" // src -= 16 "vld1.8 {q0}, [%0], r3 \n" // src -= 16
"subs %2, #16 \n" // 16 pixels per loop. "subs %2, #16 \n" // 16 pixels per loop.
"vrev64.8 q0, q0 \n" "vrev64.8 q0, q0 \n"
...@@ -662,7 +658,7 @@ void MirrorUVRow_NEON(const uint8* src_uv, ...@@ -662,7 +658,7 @@ void MirrorUVRow_NEON(const uint8* src_uv,
"add %0, %0, %3, lsl #1 \n" "add %0, %0, %3, lsl #1 \n"
"sub %0, #16 \n" "sub %0, #16 \n"
"1: \n" "1: \n"
"vld2.8 {d0, d1}, [%0], r12 \n" // src -= 16 "vld2.8 {d0, d1}, [%0], r12 \n" // src -= 16
"subs %3, #8 \n" // 8 pixels per loop. "subs %3, #8 \n" // 8 pixels per loop.
"vrev64.8 q0, q0 \n" "vrev64.8 q0, q0 \n"
...@@ -684,7 +680,7 @@ void ARGBMirrorRow_NEON(const uint8* src, uint8* dst, int width) { ...@@ -684,7 +680,7 @@ void ARGBMirrorRow_NEON(const uint8* src, uint8* dst, int width) {
"add %0, %0, %2, lsl #2 \n" "add %0, %0, %2, lsl #2 \n"
"sub %0, #16 \n" "sub %0, #16 \n"
"1: \n" "1: \n"
"vld1.8 {q0}, [%0], r3 \n" // src -= 16 "vld1.8 {q0}, [%0], r3 \n" // src -= 16
"subs %2, #4 \n" // 4 pixels per loop. "subs %2, #4 \n" // 4 pixels per loop.
"vrev64.32 q0, q0 \n" "vrev64.32 q0, q0 \n"
...@@ -701,7 +697,7 @@ void ARGBMirrorRow_NEON(const uint8* src, uint8* dst, int width) { ...@@ -701,7 +697,7 @@ void ARGBMirrorRow_NEON(const uint8* src, uint8* dst, int width) {
void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int width) { void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int width) {
asm volatile( asm volatile(
"vmov.u8 d4, #255 \n" // Alpha "vmov.u8 d4, #255 \n" // Alpha
"1: \n" "1: \n"
"vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RGB24. "vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RGB24.
"subs %2, %2, #8 \n" // 8 processed per loop. "subs %2, %2, #8 \n" // 8 processed per loop.
"vst4.8 {d1, d2, d3, d4}, [%1]! \n" // store 8 pixels of ARGB. "vst4.8 {d1, d2, d3, d4}, [%1]! \n" // store 8 pixels of ARGB.
...@@ -717,7 +713,7 @@ void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int width) { ...@@ -717,7 +713,7 @@ void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int width) {
void RAWToARGBRow_NEON(const uint8* src_raw, uint8* dst_argb, int width) { void RAWToARGBRow_NEON(const uint8* src_raw, uint8* dst_argb, int width) {
asm volatile( asm volatile(
"vmov.u8 d4, #255 \n" // Alpha "vmov.u8 d4, #255 \n" // Alpha
"1: \n" "1: \n"
"vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RAW. "vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RAW.
"subs %2, %2, #8 \n" // 8 processed per loop. "subs %2, %2, #8 \n" // 8 processed per loop.
"vswp.u8 d1, d3 \n" // swap R, B "vswp.u8 d1, d3 \n" // swap R, B
...@@ -733,7 +729,7 @@ void RAWToARGBRow_NEON(const uint8* src_raw, uint8* dst_argb, int width) { ...@@ -733,7 +729,7 @@ void RAWToARGBRow_NEON(const uint8* src_raw, uint8* dst_argb, int width) {
void RAWToRGB24Row_NEON(const uint8* src_raw, uint8* dst_rgb24, int width) { void RAWToRGB24Row_NEON(const uint8* src_raw, uint8* dst_rgb24, int width) {
asm volatile( asm volatile(
"1: \n" "1: \n"
"vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RAW. "vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RAW.
"subs %2, %2, #8 \n" // 8 processed per loop. "subs %2, %2, #8 \n" // 8 processed per loop.
"vswp.u8 d1, d3 \n" // swap R, B "vswp.u8 d1, d3 \n" // swap R, B
...@@ -763,7 +759,7 @@ void RAWToRGB24Row_NEON(const uint8* src_raw, uint8* dst_rgb24, int width) { ...@@ -763,7 +759,7 @@ void RAWToRGB24Row_NEON(const uint8* src_raw, uint8* dst_rgb24, int width) {
void RGB565ToARGBRow_NEON(const uint8* src_rgb565, uint8* dst_argb, int width) { void RGB565ToARGBRow_NEON(const uint8* src_rgb565, uint8* dst_argb, int width) {
asm volatile( asm volatile(
"vmov.u8 d3, #255 \n" // Alpha "vmov.u8 d3, #255 \n" // Alpha
"1: \n" "1: \n"
"vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels. "vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels.
"subs %2, %2, #8 \n" // 8 processed per loop. "subs %2, %2, #8 \n" // 8 processed per loop.
RGB565TOARGB RGB565TOARGB
...@@ -809,7 +805,7 @@ void ARGB1555ToARGBRow_NEON(const uint8* src_argb1555, ...@@ -809,7 +805,7 @@ void ARGB1555ToARGBRow_NEON(const uint8* src_argb1555,
int width) { int width) {
asm volatile( asm volatile(
"vmov.u8 d3, #255 \n" // Alpha "vmov.u8 d3, #255 \n" // Alpha
"1: \n" "1: \n"
"vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels. "vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels.
"subs %2, %2, #8 \n" // 8 processed per loop. "subs %2, %2, #8 \n" // 8 processed per loop.
ARGB1555TOARGB ARGB1555TOARGB
...@@ -838,7 +834,7 @@ void ARGB4444ToARGBRow_NEON(const uint8* src_argb4444, ...@@ -838,7 +834,7 @@ void ARGB4444ToARGBRow_NEON(const uint8* src_argb4444,
int width) { int width) {
asm volatile( asm volatile(
"vmov.u8 d3, #255 \n" // Alpha "vmov.u8 d3, #255 \n" // Alpha
"1: \n" "1: \n"
"vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels. "vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels.
"subs %2, %2, #8 \n" // 8 processed per loop. "subs %2, %2, #8 \n" // 8 processed per loop.
ARGB4444TOARGB ARGB4444TOARGB
...@@ -854,7 +850,7 @@ void ARGB4444ToARGBRow_NEON(const uint8* src_argb4444, ...@@ -854,7 +850,7 @@ void ARGB4444ToARGBRow_NEON(const uint8* src_argb4444,
void ARGBToRGB24Row_NEON(const uint8* src_argb, uint8* dst_rgb24, int width) { void ARGBToRGB24Row_NEON(const uint8* src_argb, uint8* dst_rgb24, int width) {
asm volatile( asm volatile(
"1: \n" "1: \n"
"vld4.8 {d1, d2, d3, d4}, [%0]! \n" // load 8 pixels of ARGB. "vld4.8 {d1, d2, d3, d4}, [%0]! \n" // load 8 pixels of ARGB.
"subs %2, %2, #8 \n" // 8 processed per loop. "subs %2, %2, #8 \n" // 8 processed per loop.
"vst3.8 {d1, d2, d3}, [%1]! \n" // store 8 pixels of "vst3.8 {d1, d2, d3}, [%1]! \n" // store 8 pixels of
...@@ -870,7 +866,7 @@ void ARGBToRGB24Row_NEON(const uint8* src_argb, uint8* dst_rgb24, int width) { ...@@ -870,7 +866,7 @@ void ARGBToRGB24Row_NEON(const uint8* src_argb, uint8* dst_rgb24, int width) {
void ARGBToRAWRow_NEON(const uint8* src_argb, uint8* dst_raw, int width) { void ARGBToRAWRow_NEON(const uint8* src_argb, uint8* dst_raw, int width) {
asm volatile( asm volatile(
"1: \n" "1: \n"
"vld4.8 {d1, d2, d3, d4}, [%0]! \n" // load 8 pixels of ARGB. "vld4.8 {d1, d2, d3, d4}, [%0]! \n" // load 8 pixels of ARGB.
"subs %2, %2, #8 \n" // 8 processed per loop. "subs %2, %2, #8 \n" // 8 processed per loop.
"vswp.u8 d1, d3 \n" // swap R, B "vswp.u8 d1, d3 \n" // swap R, B
...@@ -886,7 +882,7 @@ void ARGBToRAWRow_NEON(const uint8* src_argb, uint8* dst_raw, int width) { ...@@ -886,7 +882,7 @@ void ARGBToRAWRow_NEON(const uint8* src_argb, uint8* dst_raw, int width) {
void YUY2ToYRow_NEON(const uint8* src_yuy2, uint8* dst_y, int width) { void YUY2ToYRow_NEON(const uint8* src_yuy2, uint8* dst_y, int width) {
asm volatile( asm volatile(
"1: \n" "1: \n"
"vld2.8 {q0, q1}, [%0]! \n" // load 16 pixels of YUY2. "vld2.8 {q0, q1}, [%0]! \n" // load 16 pixels of YUY2.
"subs %2, %2, #16 \n" // 16 processed per loop. "subs %2, %2, #16 \n" // 16 processed per loop.
"vst1.8 {q0}, [%1]! \n" // store 16 pixels of Y. "vst1.8 {q0}, [%1]! \n" // store 16 pixels of Y.
...@@ -901,7 +897,7 @@ void YUY2ToYRow_NEON(const uint8* src_yuy2, uint8* dst_y, int width) { ...@@ -901,7 +897,7 @@ void YUY2ToYRow_NEON(const uint8* src_yuy2, uint8* dst_y, int width) {
void UYVYToYRow_NEON(const uint8* src_uyvy, uint8* dst_y, int width) { void UYVYToYRow_NEON(const uint8* src_uyvy, uint8* dst_y, int width) {
asm volatile( asm volatile(
"1: \n" "1: \n"
"vld2.8 {q0, q1}, [%0]! \n" // load 16 pixels of UYVY. "vld2.8 {q0, q1}, [%0]! \n" // load 16 pixels of UYVY.
"subs %2, %2, #16 \n" // 16 processed per loop. "subs %2, %2, #16 \n" // 16 processed per loop.
"vst1.8 {q1}, [%1]! \n" // store 16 pixels of Y. "vst1.8 {q1}, [%1]! \n" // store 16 pixels of Y.
...@@ -919,7 +915,7 @@ void YUY2ToUV422Row_NEON(const uint8* src_yuy2, ...@@ -919,7 +915,7 @@ void YUY2ToUV422Row_NEON(const uint8* src_yuy2,
uint8* dst_v, uint8* dst_v,
int width) { int width) {
asm volatile( asm volatile(
"1: \n" "1: \n"
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of YUY2. "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of YUY2.
"subs %3, %3, #16 \n" // 16 pixels = 8 UVs. "subs %3, %3, #16 \n" // 16 pixels = 8 UVs.
"vst1.8 {d1}, [%1]! \n" // store 8 U. "vst1.8 {d1}, [%1]! \n" // store 8 U.
...@@ -939,7 +935,7 @@ void UYVYToUV422Row_NEON(const uint8* src_uyvy, ...@@ -939,7 +935,7 @@ void UYVYToUV422Row_NEON(const uint8* src_uyvy,
uint8* dst_v, uint8* dst_v,
int width) { int width) {
asm volatile( asm volatile(
"1: \n" "1: \n"
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of UYVY. "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of UYVY.
"subs %3, %3, #16 \n" // 16 pixels = 8 UVs. "subs %3, %3, #16 \n" // 16 pixels = 8 UVs.
"vst1.8 {d0}, [%1]! \n" // store 8 U. "vst1.8 {d0}, [%1]! \n" // store 8 U.
...@@ -961,7 +957,7 @@ void YUY2ToUVRow_NEON(const uint8* src_yuy2, ...@@ -961,7 +957,7 @@ void YUY2ToUVRow_NEON(const uint8* src_yuy2,
int width) { int width) {
asm volatile( asm volatile(
"add %1, %0, %1 \n" // stride + src_yuy2 "add %1, %0, %1 \n" // stride + src_yuy2
"1: \n" "1: \n"
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of YUY2. "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of YUY2.
"subs %4, %4, #16 \n" // 16 pixels = 8 UVs. "subs %4, %4, #16 \n" // 16 pixels = 8 UVs.
"vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load next row YUY2. "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load next row YUY2.
...@@ -988,7 +984,7 @@ void UYVYToUVRow_NEON(const uint8* src_uyvy, ...@@ -988,7 +984,7 @@ void UYVYToUVRow_NEON(const uint8* src_uyvy,
int width) { int width) {
asm volatile( asm volatile(
"add %1, %0, %1 \n" // stride + src_uyvy "add %1, %0, %1 \n" // stride + src_uyvy
"1: \n" "1: \n"
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of UYVY. "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of UYVY.
"subs %4, %4, #16 \n" // 16 pixels = 8 UVs. "subs %4, %4, #16 \n" // 16 pixels = 8 UVs.
"vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load next row UYVY. "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load next row UYVY.
...@@ -1015,7 +1011,7 @@ void ARGBShuffleRow_NEON(const uint8* src_argb, ...@@ -1015,7 +1011,7 @@ void ARGBShuffleRow_NEON(const uint8* src_argb,
int width) { int width) {
asm volatile( asm volatile(
"vld1.8 {q2}, [%3] \n" // shuffler "vld1.8 {q2}, [%3] \n" // shuffler
"1: \n" "1: \n"
"vld1.8 {q0}, [%0]! \n" // load 4 pixels. "vld1.8 {q0}, [%0]! \n" // load 4 pixels.
"subs %2, %2, #4 \n" // 4 processed per loop "subs %2, %2, #4 \n" // 4 processed per loop
"vtbl.8 d2, {d0, d1}, d4 \n" // look up 2 first pixels "vtbl.8 d2, {d0, d1}, d4 \n" // look up 2 first pixels
...@@ -1036,7 +1032,7 @@ void I422ToYUY2Row_NEON(const uint8* src_y, ...@@ -1036,7 +1032,7 @@ void I422ToYUY2Row_NEON(const uint8* src_y,
uint8* dst_yuy2, uint8* dst_yuy2,
int width) { int width) {
asm volatile( asm volatile(
"1: \n" "1: \n"
"vld2.8 {d0, d2}, [%0]! \n" // load 16 Ys "vld2.8 {d0, d2}, [%0]! \n" // load 16 Ys
"vld1.8 {d1}, [%1]! \n" // load 8 Us "vld1.8 {d1}, [%1]! \n" // load 8 Us
"vld1.8 {d3}, [%2]! \n" // load 8 Vs "vld1.8 {d3}, [%2]! \n" // load 8 Vs
...@@ -1058,7 +1054,7 @@ void I422ToUYVYRow_NEON(const uint8* src_y, ...@@ -1058,7 +1054,7 @@ void I422ToUYVYRow_NEON(const uint8* src_y,
uint8* dst_uyvy, uint8* dst_uyvy,
int width) { int width) {
asm volatile( asm volatile(
"1: \n" "1: \n"
"vld2.8 {d1, d3}, [%0]! \n" // load 16 Ys "vld2.8 {d1, d3}, [%0]! \n" // load 16 Ys
"vld1.8 {d0}, [%1]! \n" // load 8 Us "vld1.8 {d0}, [%1]! \n" // load 8 Us
"vld1.8 {d2}, [%2]! \n" // load 8 Vs "vld1.8 {d2}, [%2]! \n" // load 8 Vs
...@@ -1076,7 +1072,7 @@ void I422ToUYVYRow_NEON(const uint8* src_y, ...@@ -1076,7 +1072,7 @@ void I422ToUYVYRow_NEON(const uint8* src_y,
void ARGBToRGB565Row_NEON(const uint8* src_argb, uint8* dst_rgb565, int width) { void ARGBToRGB565Row_NEON(const uint8* src_argb, uint8* dst_rgb565, int width) {
asm volatile( asm volatile(
"1: \n" "1: \n"
"vld4.8 {d20, d21, d22, d23}, [%0]! \n" // load 8 pixels of ARGB. "vld4.8 {d20, d21, d22, d23}, [%0]! \n" // load 8 pixels of ARGB.
"subs %2, %2, #8 \n" // 8 processed per loop. "subs %2, %2, #8 \n" // 8 processed per loop.
ARGBTORGB565 ARGBTORGB565
...@@ -1095,13 +1091,14 @@ void ARGBToRGB565DitherRow_NEON(const uint8* src_argb, ...@@ -1095,13 +1091,14 @@ void ARGBToRGB565DitherRow_NEON(const uint8* src_argb,
int width) { int width) {
asm volatile( asm volatile(
"vdup.32 d2, %2 \n" // dither4 "vdup.32 d2, %2 \n" // dither4
"1: \n" "1: \n"
"vld4.8 {d20, d21, d22, d23}, [%1]! \n" // load 8 pixels of ARGB. "vld4.8 {d20, d21, d22, d23}, [%1]! \n" // load 8 pixels of ARGB.
"subs %3, %3, #8 \n" // 8 processed per loop. "subs %3, %3, #8 \n" // 8 processed per loop.
"vqadd.u8 d20, d20, d2 \n" "vqadd.u8 d20, d20, d2 \n"
"vqadd.u8 d21, d21, d2 \n" "vqadd.u8 d21, d21, d2 \n"
"vqadd.u8 d22, d22, d2 \n" ARGBTORGB565 "vqadd.u8 d22, d22, d2 \n" // add for dither
"vst1.8 {q0}, [%0]! \n" // store 8 pixels RGB565. ARGBTORGB565
"vst1.8 {q0}, [%0]! \n" // store 8 RGB565.
"bgt 1b \n" "bgt 1b \n"
: "+r"(dst_rgb) // %0 : "+r"(dst_rgb) // %0
: "r"(src_argb), // %1 : "r"(src_argb), // %1
...@@ -1114,12 +1111,11 @@ void ARGBToARGB1555Row_NEON(const uint8* src_argb, ...@@ -1114,12 +1111,11 @@ void ARGBToARGB1555Row_NEON(const uint8* src_argb,
uint8* dst_argb1555, uint8* dst_argb1555,
int width) { int width) {
asm volatile( asm volatile(
"1: \n" "1: \n"
"vld4.8 {d20, d21, d22, d23}, [%0]! \n" // load 8 pixels of ARGB. "vld4.8 {d20, d21, d22, d23}, [%0]! \n" // load 8 pixels of ARGB.
"subs %2, %2, #8 \n" // 8 processed per loop. "subs %2, %2, #8 \n" // 8 processed per loop.
ARGBTOARGB1555 ARGBTOARGB1555
"vst1.8 {q0}, [%1]! \n" // store 8 pixels "vst1.8 {q0}, [%1]! \n" // store 8 ARGB1555.
// ARGB1555.
"bgt 1b \n" "bgt 1b \n"
: "+r"(src_argb), // %0 : "+r"(src_argb), // %0
"+r"(dst_argb1555), // %1 "+r"(dst_argb1555), // %1
...@@ -1134,12 +1130,11 @@ void ARGBToARGB4444Row_NEON(const uint8* src_argb, ...@@ -1134,12 +1130,11 @@ void ARGBToARGB4444Row_NEON(const uint8* src_argb,
asm volatile( asm volatile(
"vmov.u8 d4, #0x0f \n" // bits to clear with "vmov.u8 d4, #0x0f \n" // bits to clear with
// vbic. // vbic.
"1: \n" "1: \n"
"vld4.8 {d20, d21, d22, d23}, [%0]! \n" // load 8 pixels of ARGB. "vld4.8 {d20, d21, d22, d23}, [%0]! \n" // load 8 pixels of ARGB.
"subs %2, %2, #8 \n" // 8 processed per loop. "subs %2, %2, #8 \n" // 8 processed per loop.
ARGBTOARGB4444 ARGBTOARGB4444
"vst1.8 {q0}, [%1]! \n" // store 8 pixels "vst1.8 {q0}, [%1]! \n" // store 8 ARGB4444.
// ARGB4444.
"bgt 1b \n" "bgt 1b \n"
: "+r"(src_argb), // %0 : "+r"(src_argb), // %0
"+r"(dst_argb4444), // %1 "+r"(dst_argb4444), // %1
...@@ -1154,7 +1149,7 @@ void ARGBToYRow_NEON(const uint8* src_argb, uint8* dst_y, int width) { ...@@ -1154,7 +1149,7 @@ void ARGBToYRow_NEON(const uint8* src_argb, uint8* dst_y, int width) {
"vmov.u8 d25, #65 \n" // G * 0.5078 coefficient "vmov.u8 d25, #65 \n" // G * 0.5078 coefficient
"vmov.u8 d26, #33 \n" // R * 0.2578 coefficient "vmov.u8 d26, #33 \n" // R * 0.2578 coefficient
"vmov.u8 d27, #16 \n" // Add 16 constant "vmov.u8 d27, #16 \n" // Add 16 constant
"1: \n" "1: \n"
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
"subs %2, %2, #8 \n" // 8 processed per loop. "subs %2, %2, #8 \n" // 8 processed per loop.
"vmull.u8 q2, d0, d24 \n" // B "vmull.u8 q2, d0, d24 \n" // B
...@@ -1173,7 +1168,7 @@ void ARGBToYRow_NEON(const uint8* src_argb, uint8* dst_y, int width) { ...@@ -1173,7 +1168,7 @@ void ARGBToYRow_NEON(const uint8* src_argb, uint8* dst_y, int width) {
void ARGBExtractAlphaRow_NEON(const uint8* src_argb, uint8* dst_a, int width) { void ARGBExtractAlphaRow_NEON(const uint8* src_argb, uint8* dst_a, int width) {
asm volatile( asm volatile(
"1: \n" "1: \n"
"vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels
"vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels
"subs %2, %2, #16 \n" // 16 processed per loop "subs %2, %2, #16 \n" // 16 processed per loop
...@@ -1192,7 +1187,7 @@ void ARGBToYJRow_NEON(const uint8* src_argb, uint8* dst_y, int width) { ...@@ -1192,7 +1187,7 @@ void ARGBToYJRow_NEON(const uint8* src_argb, uint8* dst_y, int width) {
"vmov.u8 d24, #15 \n" // B * 0.11400 coefficient "vmov.u8 d24, #15 \n" // B * 0.11400 coefficient
"vmov.u8 d25, #75 \n" // G * 0.58700 coefficient "vmov.u8 d25, #75 \n" // G * 0.58700 coefficient
"vmov.u8 d26, #38 \n" // R * 0.29900 coefficient "vmov.u8 d26, #38 \n" // R * 0.29900 coefficient
"1: \n" "1: \n"
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
"subs %2, %2, #8 \n" // 8 processed per loop. "subs %2, %2, #8 \n" // 8 processed per loop.
"vmull.u8 q2, d0, d24 \n" // B "vmull.u8 q2, d0, d24 \n" // B
...@@ -1221,7 +1216,7 @@ void ARGBToUV444Row_NEON(const uint8* src_argb, ...@@ -1221,7 +1216,7 @@ void ARGBToUV444Row_NEON(const uint8* src_argb,
"vmov.u8 d27, #18 \n" // VB -0.1406 coefficient "vmov.u8 d27, #18 \n" // VB -0.1406 coefficient
"vmov.u8 d28, #94 \n" // VG -0.7344 coefficient "vmov.u8 d28, #94 \n" // VG -0.7344 coefficient
"vmov.u16 q15, #0x8080 \n" // 128.5 "vmov.u16 q15, #0x8080 \n" // 128.5
"1: \n" "1: \n"
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
"subs %3, %3, #8 \n" // 8 processed per loop. "subs %3, %3, #8 \n" // 8 processed per loop.
"vmull.u8 q2, d0, d24 \n" // B "vmull.u8 q2, d0, d24 \n" // B
...@@ -1249,24 +1244,20 @@ void ARGBToUV444Row_NEON(const uint8* src_argb, ...@@ -1249,24 +1244,20 @@ void ARGBToUV444Row_NEON(const uint8* src_argb,
"q15"); "q15");
} }
// clang-format off
// 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16. // 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16.
// RGBTOUV(QB, QG, QR) expands to a run of NEON assembly string literals.
// QB, QG and QR are stringized (#) into the instruction text, so they must be
// the names of the q registers holding the 16-bit-lane B, G and R inputs.
//   U: q8 = QB * q10 - QG * q11 - QR * q12 + q15, then a saturating narrowing
//      shift right by 8 (vqshrn) writes the 8-bit result to d0.
//   V: q9 = QR * q10 - QG * q14 - QB * q13 + q15, then the same narrowing
//      shift writes the 8-bit result to d1.
// q10-q15 are only read here, so the enclosing asm block has to load the
// coefficients and the bias before using the macro; q8, q9, d0 and d1 are
// overwritten. Because d0/d1 alias q0, the result replaces whatever q0 held.
// NOTE(review): each line of this listing shows two renderings of the macro
// body (operands split across continuation lines vs. kept on one line); the
// emitted instruction sequence appears to be the same in both - confirm
// against the real source file.
#define RGBTOUV(QB, QG, QR) \ #define RGBTOUV(QB, QG, QR) \
"vmul.s16 q8, " #QB \ "vmul.s16 q8, " #QB ", q10 \n" /* B */ \
", q10 \n" /* B */ \ "vmls.s16 q8, " #QG ", q11 \n" /* G */ \
"vmls.s16 q8, " #QG \ "vmls.s16 q8, " #QR ", q12 \n" /* R */ \
", q11 \n" /* G */ \
"vmls.s16 q8, " #QR \
", q12 \n" /* R */ \
"vadd.u16 q8, q8, q15 \n" /* +128 -> unsigned */ \ "vadd.u16 q8, q8, q15 \n" /* +128 -> unsigned */ \
"vmul.s16 q9, " #QR \ "vmul.s16 q9, " #QR ", q10 \n" /* R */ \
", q10 \n" /* R */ \ "vmls.s16 q9, " #QG ", q14 \n" /* G */ \
"vmls.s16 q9, " #QG \ "vmls.s16 q9, " #QB ", q13 \n" /* B */ \
", q14 \n" /* G */ \
"vmls.s16 q9, " #QB \
", q13 \n" /* B */ \
"vadd.u16 q9, q9, q15 \n" /* +128 -> unsigned */ \ "vadd.u16 q9, q9, q15 \n" /* +128 -> unsigned */ \
"vqshrn.u16 d0, q8, #8 \n" /* 16 bit to 8 bit U */ \ "vqshrn.u16 d0, q8, #8 \n" /* 16 bit to 8 bit U */ \
"vqshrn.u16 d1, q9, #8 \n" /* 16 bit to 8 bit V */ "vqshrn.u16 d1, q9, #8 \n" /* 16 bit to 8 bit V */
// clang-format on
// TODO(fbarchard): Consider vhadd vertical, then vpaddl horizontal, avoid shr. // TODO(fbarchard): Consider vhadd vertical, then vpaddl horizontal, avoid shr.
void ARGBToUVRow_NEON(const uint8* src_argb, void ARGBToUVRow_NEON(const uint8* src_argb,
...@@ -1282,7 +1273,7 @@ void ARGBToUVRow_NEON(const uint8* src_argb, ...@@ -1282,7 +1273,7 @@ void ARGBToUVRow_NEON(const uint8* src_argb,
"vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
"vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
"vmov.u16 q15, #0x8080 \n" // 128.5 "vmov.u16 q15, #0x8080 \n" // 128.5
"1: \n" "1: \n"
"vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
"vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels. "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels.
"vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts. "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts.
...@@ -1328,7 +1319,7 @@ void ARGBToUVJRow_NEON(const uint8* src_argb, ...@@ -1328,7 +1319,7 @@ void ARGBToUVJRow_NEON(const uint8* src_argb,
"vmov.s16 q13, #20 / 2 \n" // VB -0.08131 coefficient "vmov.s16 q13, #20 / 2 \n" // VB -0.08131 coefficient
"vmov.s16 q14, #107 / 2 \n" // VG -0.41869 coefficient "vmov.s16 q14, #107 / 2 \n" // VG -0.41869 coefficient
"vmov.u16 q15, #0x8080 \n" // 128.5 "vmov.u16 q15, #0x8080 \n" // 128.5
"1: \n" "1: \n"
"vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
"vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels. "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels.
"vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts. "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts.
...@@ -1373,7 +1364,7 @@ void BGRAToUVRow_NEON(const uint8* src_bgra, ...@@ -1373,7 +1364,7 @@ void BGRAToUVRow_NEON(const uint8* src_bgra,
"vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
"vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
"vmov.u16 q15, #0x8080 \n" // 128.5 "vmov.u16 q15, #0x8080 \n" // 128.5
"1: \n" "1: \n"
"vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 BGRA pixels. "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 BGRA pixels.
"vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 BGRA pixels. "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 BGRA pixels.
"vpaddl.u8 q3, q3 \n" // B 16 bytes -> 8 shorts. "vpaddl.u8 q3, q3 \n" // B 16 bytes -> 8 shorts.
...@@ -1418,7 +1409,7 @@ void ABGRToUVRow_NEON(const uint8* src_abgr, ...@@ -1418,7 +1409,7 @@ void ABGRToUVRow_NEON(const uint8* src_abgr,
"vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
"vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
"vmov.u16 q15, #0x8080 \n" // 128.5 "vmov.u16 q15, #0x8080 \n" // 128.5
"1: \n" "1: \n"
"vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ABGR pixels. "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ABGR pixels.
"vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ABGR pixels. "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ABGR pixels.
"vpaddl.u8 q2, q2 \n" // B 16 bytes -> 8 shorts. "vpaddl.u8 q2, q2 \n" // B 16 bytes -> 8 shorts.
...@@ -1463,7 +1454,7 @@ void RGBAToUVRow_NEON(const uint8* src_rgba, ...@@ -1463,7 +1454,7 @@ void RGBAToUVRow_NEON(const uint8* src_rgba,
"vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
"vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
"vmov.u16 q15, #0x8080 \n" // 128.5 "vmov.u16 q15, #0x8080 \n" // 128.5
"1: \n" "1: \n"
"vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 RGBA pixels. "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 RGBA pixels.
"vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 RGBA pixels. "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 RGBA pixels.
"vpaddl.u8 q0, q1 \n" // B 16 bytes -> 8 shorts. "vpaddl.u8 q0, q1 \n" // B 16 bytes -> 8 shorts.
...@@ -1508,7 +1499,7 @@ void RGB24ToUVRow_NEON(const uint8* src_rgb24, ...@@ -1508,7 +1499,7 @@ void RGB24ToUVRow_NEON(const uint8* src_rgb24,
"vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
"vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
"vmov.u16 q15, #0x8080 \n" // 128.5 "vmov.u16 q15, #0x8080 \n" // 128.5
"1: \n" "1: \n"
"vld3.8 {d0, d2, d4}, [%0]! \n" // load 8 RGB24 pixels. "vld3.8 {d0, d2, d4}, [%0]! \n" // load 8 RGB24 pixels.
"vld3.8 {d1, d3, d5}, [%0]! \n" // load next 8 RGB24 pixels. "vld3.8 {d1, d3, d5}, [%0]! \n" // load next 8 RGB24 pixels.
"vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts. "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts.
...@@ -1553,7 +1544,7 @@ void RAWToUVRow_NEON(const uint8* src_raw, ...@@ -1553,7 +1544,7 @@ void RAWToUVRow_NEON(const uint8* src_raw,
"vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
"vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
"vmov.u16 q15, #0x8080 \n" // 128.5 "vmov.u16 q15, #0x8080 \n" // 128.5
"1: \n" "1: \n"
"vld3.8 {d0, d2, d4}, [%0]! \n" // load 8 RAW pixels. "vld3.8 {d0, d2, d4}, [%0]! \n" // load 8 RAW pixels.
"vld3.8 {d1, d3, d5}, [%0]! \n" // load next 8 RAW pixels. "vld3.8 {d1, d3, d5}, [%0]! \n" // load next 8 RAW pixels.
"vpaddl.u8 q2, q2 \n" // B 16 bytes -> 8 shorts. "vpaddl.u8 q2, q2 \n" // B 16 bytes -> 8 shorts.
...@@ -1600,7 +1591,7 @@ void RGB565ToUVRow_NEON(const uint8* src_rgb565, ...@@ -1600,7 +1591,7 @@ void RGB565ToUVRow_NEON(const uint8* src_rgb565,
"vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
"vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
"vmov.u16 q15, #0x8080 \n" // 128.5 "vmov.u16 q15, #0x8080 \n" // 128.5
"1: \n" "1: \n"
"vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels. "vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels.
RGB565TOARGB RGB565TOARGB
"vpaddl.u8 d8, d0 \n" // B 8 bytes -> 4 shorts. "vpaddl.u8 d8, d0 \n" // B 8 bytes -> 4 shorts.
...@@ -1666,7 +1657,7 @@ void ARGB1555ToUVRow_NEON(const uint8* src_argb1555, ...@@ -1666,7 +1657,7 @@ void ARGB1555ToUVRow_NEON(const uint8* src_argb1555,
"vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
"vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
"vmov.u16 q15, #0x8080 \n" // 128.5 "vmov.u16 q15, #0x8080 \n" // 128.5
"1: \n" "1: \n"
"vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels. "vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels.
RGB555TOARGB RGB555TOARGB
"vpaddl.u8 d8, d0 \n" // B 8 bytes -> 4 shorts. "vpaddl.u8 d8, d0 \n" // B 8 bytes -> 4 shorts.
...@@ -1732,7 +1723,7 @@ void ARGB4444ToUVRow_NEON(const uint8* src_argb4444, ...@@ -1732,7 +1723,7 @@ void ARGB4444ToUVRow_NEON(const uint8* src_argb4444,
"vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
"vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
"vmov.u16 q15, #0x8080 \n" // 128.5 "vmov.u16 q15, #0x8080 \n" // 128.5
"1: \n" "1: \n"
"vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels. "vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels.
ARGB4444TOARGB ARGB4444TOARGB
"vpaddl.u8 d8, d0 \n" // B 8 bytes -> 4 shorts. "vpaddl.u8 d8, d0 \n" // B 8 bytes -> 4 shorts.
...@@ -1789,7 +1780,7 @@ void RGB565ToYRow_NEON(const uint8* src_rgb565, uint8* dst_y, int width) { ...@@ -1789,7 +1780,7 @@ void RGB565ToYRow_NEON(const uint8* src_rgb565, uint8* dst_y, int width) {
"vmov.u8 d25, #65 \n" // G * 0.5078 coefficient "vmov.u8 d25, #65 \n" // G * 0.5078 coefficient
"vmov.u8 d26, #33 \n" // R * 0.2578 coefficient "vmov.u8 d26, #33 \n" // R * 0.2578 coefficient
"vmov.u8 d27, #16 \n" // Add 16 constant "vmov.u8 d27, #16 \n" // Add 16 constant
"1: \n" "1: \n"
"vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels. "vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels.
"subs %2, %2, #8 \n" // 8 processed per loop. "subs %2, %2, #8 \n" // 8 processed per loop.
RGB565TOARGB RGB565TOARGB
...@@ -1813,7 +1804,7 @@ void ARGB1555ToYRow_NEON(const uint8* src_argb1555, uint8* dst_y, int width) { ...@@ -1813,7 +1804,7 @@ void ARGB1555ToYRow_NEON(const uint8* src_argb1555, uint8* dst_y, int width) {
"vmov.u8 d25, #65 \n" // G * 0.5078 coefficient "vmov.u8 d25, #65 \n" // G * 0.5078 coefficient
"vmov.u8 d26, #33 \n" // R * 0.2578 coefficient "vmov.u8 d26, #33 \n" // R * 0.2578 coefficient
"vmov.u8 d27, #16 \n" // Add 16 constant "vmov.u8 d27, #16 \n" // Add 16 constant
"1: \n" "1: \n"
"vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels. "vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels.
"subs %2, %2, #8 \n" // 8 processed per loop. "subs %2, %2, #8 \n" // 8 processed per loop.
ARGB1555TOARGB ARGB1555TOARGB
...@@ -1837,7 +1828,7 @@ void ARGB4444ToYRow_NEON(const uint8* src_argb4444, uint8* dst_y, int width) { ...@@ -1837,7 +1828,7 @@ void ARGB4444ToYRow_NEON(const uint8* src_argb4444, uint8* dst_y, int width) {
"vmov.u8 d25, #65 \n" // G * 0.5078 coefficient "vmov.u8 d25, #65 \n" // G * 0.5078 coefficient
"vmov.u8 d26, #33 \n" // R * 0.2578 coefficient "vmov.u8 d26, #33 \n" // R * 0.2578 coefficient
"vmov.u8 d27, #16 \n" // Add 16 constant "vmov.u8 d27, #16 \n" // Add 16 constant
"1: \n" "1: \n"
"vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels. "vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels.
"subs %2, %2, #8 \n" // 8 processed per loop. "subs %2, %2, #8 \n" // 8 processed per loop.
ARGB4444TOARGB ARGB4444TOARGB
...@@ -1861,7 +1852,7 @@ void BGRAToYRow_NEON(const uint8* src_bgra, uint8* dst_y, int width) { ...@@ -1861,7 +1852,7 @@ void BGRAToYRow_NEON(const uint8* src_bgra, uint8* dst_y, int width) {
"vmov.u8 d5, #65 \n" // G * 0.5078 coefficient "vmov.u8 d5, #65 \n" // G * 0.5078 coefficient
"vmov.u8 d6, #13 \n" // B * 0.1016 coefficient "vmov.u8 d6, #13 \n" // B * 0.1016 coefficient
"vmov.u8 d7, #16 \n" // Add 16 constant "vmov.u8 d7, #16 \n" // Add 16 constant
"1: \n" "1: \n"
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of BGRA. "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of BGRA.
"subs %2, %2, #8 \n" // 8 processed per loop. "subs %2, %2, #8 \n" // 8 processed per loop.
"vmull.u8 q8, d1, d4 \n" // R "vmull.u8 q8, d1, d4 \n" // R
...@@ -1884,7 +1875,7 @@ void ABGRToYRow_NEON(const uint8* src_abgr, uint8* dst_y, int width) { ...@@ -1884,7 +1875,7 @@ void ABGRToYRow_NEON(const uint8* src_abgr, uint8* dst_y, int width) {
"vmov.u8 d5, #65 \n" // G * 0.5078 coefficient "vmov.u8 d5, #65 \n" // G * 0.5078 coefficient
"vmov.u8 d6, #13 \n" // B * 0.1016 coefficient "vmov.u8 d6, #13 \n" // B * 0.1016 coefficient
"vmov.u8 d7, #16 \n" // Add 16 constant "vmov.u8 d7, #16 \n" // Add 16 constant
"1: \n" "1: \n"
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of ABGR. "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of ABGR.
"subs %2, %2, #8 \n" // 8 processed per loop. "subs %2, %2, #8 \n" // 8 processed per loop.
"vmull.u8 q8, d0, d4 \n" // R "vmull.u8 q8, d0, d4 \n" // R
...@@ -1907,7 +1898,7 @@ void RGBAToYRow_NEON(const uint8* src_rgba, uint8* dst_y, int width) { ...@@ -1907,7 +1898,7 @@ void RGBAToYRow_NEON(const uint8* src_rgba, uint8* dst_y, int width) {
"vmov.u8 d5, #65 \n" // G * 0.5078 coefficient "vmov.u8 d5, #65 \n" // G * 0.5078 coefficient
"vmov.u8 d6, #33 \n" // R * 0.2578 coefficient "vmov.u8 d6, #33 \n" // R * 0.2578 coefficient
"vmov.u8 d7, #16 \n" // Add 16 constant "vmov.u8 d7, #16 \n" // Add 16 constant
"1: \n" "1: \n"
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of RGBA. "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of RGBA.
"subs %2, %2, #8 \n" // 8 processed per loop. "subs %2, %2, #8 \n" // 8 processed per loop.
"vmull.u8 q8, d1, d4 \n" // B "vmull.u8 q8, d1, d4 \n" // B
...@@ -1930,7 +1921,7 @@ void RGB24ToYRow_NEON(const uint8* src_rgb24, uint8* dst_y, int width) { ...@@ -1930,7 +1921,7 @@ void RGB24ToYRow_NEON(const uint8* src_rgb24, uint8* dst_y, int width) {
"vmov.u8 d5, #65 \n" // G * 0.5078 coefficient "vmov.u8 d5, #65 \n" // G * 0.5078 coefficient
"vmov.u8 d6, #33 \n" // R * 0.2578 coefficient "vmov.u8 d6, #33 \n" // R * 0.2578 coefficient
"vmov.u8 d7, #16 \n" // Add 16 constant "vmov.u8 d7, #16 \n" // Add 16 constant
"1: \n" "1: \n"
"vld3.8 {d0, d1, d2}, [%0]! \n" // load 8 pixels of RGB24. "vld3.8 {d0, d1, d2}, [%0]! \n" // load 8 pixels of RGB24.
"subs %2, %2, #8 \n" // 8 processed per loop. "subs %2, %2, #8 \n" // 8 processed per loop.
"vmull.u8 q8, d0, d4 \n" // B "vmull.u8 q8, d0, d4 \n" // B
...@@ -1953,7 +1944,7 @@ void RAWToYRow_NEON(const uint8* src_raw, uint8* dst_y, int width) { ...@@ -1953,7 +1944,7 @@ void RAWToYRow_NEON(const uint8* src_raw, uint8* dst_y, int width) {
"vmov.u8 d5, #65 \n" // G * 0.5078 coefficient "vmov.u8 d5, #65 \n" // G * 0.5078 coefficient
"vmov.u8 d6, #13 \n" // B * 0.1016 coefficient "vmov.u8 d6, #13 \n" // B * 0.1016 coefficient
"vmov.u8 d7, #16 \n" // Add 16 constant "vmov.u8 d7, #16 \n" // Add 16 constant
"1: \n" "1: \n"
"vld3.8 {d0, d1, d2}, [%0]! \n" // load 8 pixels of RAW. "vld3.8 {d0, d1, d2}, [%0]! \n" // load 8 pixels of RAW.
"subs %2, %2, #8 \n" // 8 processed per loop. "subs %2, %2, #8 \n" // 8 processed per loop.
"vmull.u8 q8, d0, d4 \n" // B "vmull.u8 q8, d0, d4 \n" // B
...@@ -1988,7 +1979,7 @@ void InterpolateRow_NEON(uint8* dst_ptr, ...@@ -1988,7 +1979,7 @@ void InterpolateRow_NEON(uint8* dst_ptr,
"rsb %4, #256 \n" "rsb %4, #256 \n"
"vdup.8 d4, %4 \n" "vdup.8 d4, %4 \n"
// General purpose row blend. // General purpose row blend.
"1: \n" "1: \n"
"vld1.8 {q0}, [%1]! \n" "vld1.8 {q0}, [%1]! \n"
"vld1.8 {q1}, [%2]! \n" "vld1.8 {q1}, [%2]! \n"
"subs %3, %3, #16 \n" "subs %3, %3, #16 \n"
...@@ -2003,7 +1994,7 @@ void InterpolateRow_NEON(uint8* dst_ptr, ...@@ -2003,7 +1994,7 @@ void InterpolateRow_NEON(uint8* dst_ptr,
"b 99f \n" "b 99f \n"
// Blend 50 / 50. // Blend 50 / 50.
"50: \n" "50: \n"
"vld1.8 {q0}, [%1]! \n" "vld1.8 {q0}, [%1]! \n"
"vld1.8 {q1}, [%2]! \n" "vld1.8 {q1}, [%2]! \n"
"subs %3, %3, #16 \n" "subs %3, %3, #16 \n"
...@@ -2013,13 +2004,13 @@ void InterpolateRow_NEON(uint8* dst_ptr, ...@@ -2013,13 +2004,13 @@ void InterpolateRow_NEON(uint8* dst_ptr,
"b 99f \n" "b 99f \n"
// Blend 100 / 0 - Copy row unchanged. // Blend 100 / 0 - Copy row unchanged.
"100: \n" "100: \n"
"vld1.8 {q0}, [%1]! \n" "vld1.8 {q0}, [%1]! \n"
"subs %3, %3, #16 \n" "subs %3, %3, #16 \n"
"vst1.8 {q0}, [%0]! \n" "vst1.8 {q0}, [%0]! \n"
"bgt 100b \n" "bgt 100b \n"
"99: \n" "99: \n"
: "+r"(dst_ptr), // %0 : "+r"(dst_ptr), // %0
"+r"(src_ptr), // %1 "+r"(src_ptr), // %1
"+r"(src_stride), // %2 "+r"(src_stride), // %2
...@@ -2038,7 +2029,7 @@ void ARGBBlendRow_NEON(const uint8* src_argb0, ...@@ -2038,7 +2029,7 @@ void ARGBBlendRow_NEON(const uint8* src_argb0,
"subs %3, #8 \n" "subs %3, #8 \n"
"blt 89f \n" "blt 89f \n"
// Blend 8 pixels. // Blend 8 pixels.
"8: \n" "8: \n"
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of ARGB0. "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of ARGB0.
"vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load 8 pixels of ARGB1. "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load 8 pixels of ARGB1.
"subs %3, %3, #8 \n" // 8 processed per loop. "subs %3, %3, #8 \n" // 8 processed per loop.
...@@ -2056,12 +2047,12 @@ void ARGBBlendRow_NEON(const uint8* src_argb0, ...@@ -2056,12 +2047,12 @@ void ARGBBlendRow_NEON(const uint8* src_argb0,
"vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 pixels of ARGB. "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 pixels of ARGB.
"bge 8b \n" "bge 8b \n"
"89: \n" "89: \n"
"adds %3, #8-1 \n" "adds %3, #8-1 \n"
"blt 99f \n" "blt 99f \n"
// Blend 1 pixels. // Blend 1 pixels.
"1: \n" "1: \n"
"vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [%0]! \n" // load 1 pixel ARGB0. "vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [%0]! \n" // load 1 pixel ARGB0.
"vld4.8 {d4[0],d5[0],d6[0],d7[0]}, [%1]! \n" // load 1 pixel ARGB1. "vld4.8 {d4[0],d5[0],d6[0],d7[0]}, [%1]! \n" // load 1 pixel ARGB1.
"subs %3, %3, #1 \n" // 1 processed per loop. "subs %3, %3, #1 \n" // 1 processed per loop.
...@@ -2093,7 +2084,7 @@ void ARGBBlendRow_NEON(const uint8* src_argb0, ...@@ -2093,7 +2084,7 @@ void ARGBBlendRow_NEON(const uint8* src_argb0,
void ARGBAttenuateRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) { void ARGBAttenuateRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) {
asm volatile( asm volatile(
// Attenuate 8 pixels. // Attenuate 8 pixels.
"1: \n" "1: \n"
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of ARGB. "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of ARGB.
"subs %2, %2, #8 \n" // 8 processed per loop. "subs %2, %2, #8 \n" // 8 processed per loop.
"vmull.u8 q10, d0, d3 \n" // b * a "vmull.u8 q10, d0, d3 \n" // b * a
...@@ -2125,7 +2116,7 @@ void ARGBQuantizeRow_NEON(uint8* dst_argb, ...@@ -2125,7 +2116,7 @@ void ARGBQuantizeRow_NEON(uint8* dst_argb,
"vdup.u16 q10, %4 \n" // interval add "vdup.u16 q10, %4 \n" // interval add
// 8 pixel loop. // 8 pixel loop.
"1: \n" "1: \n"
"vld4.8 {d0, d2, d4, d6}, [%0] \n" // load 8 pixels of ARGB. "vld4.8 {d0, d2, d4, d6}, [%0] \n" // load 8 pixels of ARGB.
"subs %1, %1, #8 \n" // 8 processed per loop. "subs %1, %1, #8 \n" // 8 processed per loop.
"vmovl.u8 q0, d0 \n" // b (0 .. 255) "vmovl.u8 q0, d0 \n" // b (0 .. 255)
...@@ -2166,7 +2157,7 @@ void ARGBShadeRow_NEON(const uint8* src_argb, ...@@ -2166,7 +2157,7 @@ void ARGBShadeRow_NEON(const uint8* src_argb,
"vshr.u16 q0, q0, #1 \n" // scale / 2. "vshr.u16 q0, q0, #1 \n" // scale / 2.
// 8 pixel loop. // 8 pixel loop.
"1: \n" "1: \n"
"vld4.8 {d20, d22, d24, d26}, [%0]! \n" // load 8 pixels of ARGB. "vld4.8 {d20, d22, d24, d26}, [%0]! \n" // load 8 pixels of ARGB.
"subs %2, %2, #8 \n" // 8 processed per loop. "subs %2, %2, #8 \n" // 8 processed per loop.
"vmovl.u8 q10, d20 \n" // b (0 .. 255) "vmovl.u8 q10, d20 \n" // b (0 .. 255)
...@@ -2198,7 +2189,7 @@ void ARGBGrayRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) { ...@@ -2198,7 +2189,7 @@ void ARGBGrayRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) {
"vmov.u8 d24, #15 \n" // B * 0.11400 coefficient "vmov.u8 d24, #15 \n" // B * 0.11400 coefficient
"vmov.u8 d25, #75 \n" // G * 0.58700 coefficient "vmov.u8 d25, #75 \n" // G * 0.58700 coefficient
"vmov.u8 d26, #38 \n" // R * 0.29900 coefficient "vmov.u8 d26, #38 \n" // R * 0.29900 coefficient
"1: \n" "1: \n"
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
"subs %2, %2, #8 \n" // 8 processed per loop. "subs %2, %2, #8 \n" // 8 processed per loop.
"vmull.u8 q2, d0, d24 \n" // B "vmull.u8 q2, d0, d24 \n" // B
...@@ -2231,7 +2222,7 @@ void ARGBSepiaRow_NEON(uint8* dst_argb, int width) { ...@@ -2231,7 +2222,7 @@ void ARGBSepiaRow_NEON(uint8* dst_argb, int width) {
"vmov.u8 d28, #24 \n" // BB coefficient "vmov.u8 d28, #24 \n" // BB coefficient
"vmov.u8 d29, #98 \n" // BG coefficient "vmov.u8 d29, #98 \n" // BG coefficient
"vmov.u8 d30, #50 \n" // BR coefficient "vmov.u8 d30, #50 \n" // BR coefficient
"1: \n" "1: \n"
"vld4.8 {d0, d1, d2, d3}, [%0] \n" // load 8 ARGB pixels. "vld4.8 {d0, d1, d2, d3}, [%0] \n" // load 8 ARGB pixels.
"subs %1, %1, #8 \n" // 8 processed per loop. "subs %1, %1, #8 \n" // 8 processed per loop.
"vmull.u8 q2, d0, d20 \n" // B to Sepia B "vmull.u8 q2, d0, d20 \n" // B to Sepia B
...@@ -2267,7 +2258,7 @@ void ARGBColorMatrixRow_NEON(const uint8* src_argb, ...@@ -2267,7 +2258,7 @@ void ARGBColorMatrixRow_NEON(const uint8* src_argb,
"vmovl.s8 q0, d4 \n" // B,G coefficients s16. "vmovl.s8 q0, d4 \n" // B,G coefficients s16.
"vmovl.s8 q1, d5 \n" // R,A coefficients s16. "vmovl.s8 q1, d5 \n" // R,A coefficients s16.
"1: \n" "1: \n"
"vld4.8 {d16, d18, d20, d22}, [%0]! \n" // load 8 ARGB pixels. "vld4.8 {d16, d18, d20, d22}, [%0]! \n" // load 8 ARGB pixels.
"subs %2, %2, #8 \n" // 8 processed per loop. "subs %2, %2, #8 \n" // 8 processed per loop.
"vmovl.u8 q8, d16 \n" // b (0 .. 255) 16 bit "vmovl.u8 q8, d16 \n" // b (0 .. 255) 16 bit
...@@ -2323,10 +2314,9 @@ void ARGBMultiplyRow_NEON(const uint8* src_argb0, ...@@ -2323,10 +2314,9 @@ void ARGBMultiplyRow_NEON(const uint8* src_argb0,
int width) { int width) {
asm volatile( asm volatile(
// 8 pixel loop. // 8 pixel loop.
"1: \n" "1: \n"
"vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
"vld4.8 {d1, d3, d5, d7}, [%1]! \n" // load 8 more ARGB "vld4.8 {d1, d3, d5, d7}, [%1]! \n" // load 8 more ARGB
// pixels.
"subs %3, %3, #8 \n" // 8 processed per loop. "subs %3, %3, #8 \n" // 8 processed per loop.
"vmull.u8 q0, d0, d1 \n" // multiply B "vmull.u8 q0, d0, d1 \n" // multiply B
"vmull.u8 q1, d2, d3 \n" // multiply G "vmull.u8 q1, d2, d3 \n" // multiply G
...@@ -2338,7 +2328,6 @@ void ARGBMultiplyRow_NEON(const uint8* src_argb0, ...@@ -2338,7 +2328,6 @@ void ARGBMultiplyRow_NEON(const uint8* src_argb0,
"vrshrn.u16 d3, q3, #8 \n" // 16 bit to 8 bit A "vrshrn.u16 d3, q3, #8 \n" // 16 bit to 8 bit A
"vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels. "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels.
"bgt 1b \n" "bgt 1b \n"
: "+r"(src_argb0), // %0 : "+r"(src_argb0), // %0
"+r"(src_argb1), // %1 "+r"(src_argb1), // %1
"+r"(dst_argb), // %2 "+r"(dst_argb), // %2
...@@ -2354,16 +2343,14 @@ void ARGBAddRow_NEON(const uint8* src_argb0, ...@@ -2354,16 +2343,14 @@ void ARGBAddRow_NEON(const uint8* src_argb0,
int width) { int width) {
asm volatile( asm volatile(
// 8 pixel loop. // 8 pixel loop.
"1: \n" "1: \n"
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
"vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load 8 more ARGB "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load 8 more ARGB
// pixels.
"subs %3, %3, #8 \n" // 8 processed per loop. "subs %3, %3, #8 \n" // 8 processed per loop.
"vqadd.u8 q0, q0, q2 \n" // add B, G "vqadd.u8 q0, q0, q2 \n" // add B, G
"vqadd.u8 q1, q1, q3 \n" // add R, A "vqadd.u8 q1, q1, q3 \n" // add R, A
"vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels. "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels.
"bgt 1b \n" "bgt 1b \n"
: "+r"(src_argb0), // %0 : "+r"(src_argb0), // %0
"+r"(src_argb1), // %1 "+r"(src_argb1), // %1
"+r"(dst_argb), // %2 "+r"(dst_argb), // %2
...@@ -2379,16 +2366,14 @@ void ARGBSubtractRow_NEON(const uint8* src_argb0, ...@@ -2379,16 +2366,14 @@ void ARGBSubtractRow_NEON(const uint8* src_argb0,
int width) { int width) {
asm volatile( asm volatile(
// 8 pixel loop. // 8 pixel loop.
"1: \n" "1: \n"
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
"vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load 8 more ARGB "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load 8 more ARGB
// pixels.
"subs %3, %3, #8 \n" // 8 processed per loop. "subs %3, %3, #8 \n" // 8 processed per loop.
"vqsub.u8 q0, q0, q2 \n" // subtract B, G "vqsub.u8 q0, q0, q2 \n" // subtract B, G
"vqsub.u8 q1, q1, q3 \n" // subtract R, A "vqsub.u8 q1, q1, q3 \n" // subtract R, A
"vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels. "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels.
"bgt 1b \n" "bgt 1b \n"
: "+r"(src_argb0), // %0 : "+r"(src_argb0), // %0
"+r"(src_argb1), // %1 "+r"(src_argb1), // %1
"+r"(dst_argb), // %2 "+r"(dst_argb), // %2
...@@ -2409,7 +2394,7 @@ void SobelRow_NEON(const uint8* src_sobelx, ...@@ -2409,7 +2394,7 @@ void SobelRow_NEON(const uint8* src_sobelx,
asm volatile( asm volatile(
"vmov.u8 d3, #255 \n" // alpha "vmov.u8 d3, #255 \n" // alpha
// 8 pixel loop. // 8 pixel loop.
"1: \n" "1: \n"
"vld1.8 {d0}, [%0]! \n" // load 8 sobelx. "vld1.8 {d0}, [%0]! \n" // load 8 sobelx.
"vld1.8 {d1}, [%1]! \n" // load 8 sobely. "vld1.8 {d1}, [%1]! \n" // load 8 sobely.
"subs %3, %3, #8 \n" // 8 processed per loop. "subs %3, %3, #8 \n" // 8 processed per loop.
...@@ -2433,7 +2418,7 @@ void SobelToPlaneRow_NEON(const uint8* src_sobelx, ...@@ -2433,7 +2418,7 @@ void SobelToPlaneRow_NEON(const uint8* src_sobelx,
int width) { int width) {
asm volatile( asm volatile(
// 16 pixel loop. // 16 pixel loop.
"1: \n" "1: \n"
"vld1.8 {q0}, [%0]! \n" // load 16 sobelx. "vld1.8 {q0}, [%0]! \n" // load 16 sobelx.
"vld1.8 {q1}, [%1]! \n" // load 16 sobely. "vld1.8 {q1}, [%1]! \n" // load 16 sobely.
"subs %3, %3, #16 \n" // 16 processed per loop. "subs %3, %3, #16 \n" // 16 processed per loop.
...@@ -2460,7 +2445,7 @@ void SobelXYRow_NEON(const uint8* src_sobelx, ...@@ -2460,7 +2445,7 @@ void SobelXYRow_NEON(const uint8* src_sobelx,
asm volatile( asm volatile(
"vmov.u8 d3, #255 \n" // alpha "vmov.u8 d3, #255 \n" // alpha
// 8 pixel loop. // 8 pixel loop.
"1: \n" "1: \n"
"vld1.8 {d2}, [%0]! \n" // load 8 sobelx. "vld1.8 {d2}, [%0]! \n" // load 8 sobelx.
"vld1.8 {d0}, [%1]! \n" // load 8 sobely. "vld1.8 {d0}, [%1]! \n" // load 8 sobely.
"subs %3, %3, #8 \n" // 8 processed per loop. "subs %3, %3, #8 \n" // 8 processed per loop.
...@@ -2485,7 +2470,7 @@ void SobelXRow_NEON(const uint8* src_y0, ...@@ -2485,7 +2470,7 @@ void SobelXRow_NEON(const uint8* src_y0,
uint8* dst_sobelx, uint8* dst_sobelx,
int width) { int width) {
asm volatile( asm volatile(
"1: \n" "1: \n"
"vld1.8 {d0}, [%0],%5 \n" // top "vld1.8 {d0}, [%0],%5 \n" // top
"vld1.8 {d1}, [%0],%6 \n" "vld1.8 {d1}, [%0],%6 \n"
"vsubl.u8 q0, d0, d1 \n" "vsubl.u8 q0, d0, d1 \n"
...@@ -2523,7 +2508,7 @@ void SobelYRow_NEON(const uint8* src_y0, ...@@ -2523,7 +2508,7 @@ void SobelYRow_NEON(const uint8* src_y0,
uint8* dst_sobely, uint8* dst_sobely,
int width) { int width) {
asm volatile( asm volatile(
"1: \n" "1: \n"
"vld1.8 {d0}, [%0],%4 \n" // left "vld1.8 {d0}, [%0],%4 \n" // left
"vld1.8 {d1}, [%1],%4 \n" "vld1.8 {d1}, [%1],%4 \n"
"vsubl.u8 q0, d0, d1 \n" "vsubl.u8 q0, d0, d1 \n"
...@@ -2555,7 +2540,7 @@ void HalfFloat1Row_NEON(const uint16* src, uint16* dst, float, int width) { ...@@ -2555,7 +2540,7 @@ void HalfFloat1Row_NEON(const uint16* src, uint16* dst, float, int width) {
asm volatile( asm volatile(
"vdup.32 q0, %3 \n" "vdup.32 q0, %3 \n"
"1: \n" "1: \n"
"vld1.8 {q1}, [%0]! \n" // load 8 shorts "vld1.8 {q1}, [%0]! \n" // load 8 shorts
"subs %2, %2, #8 \n" // 8 pixels per loop "subs %2, %2, #8 \n" // 8 pixels per loop
"vmovl.u16 q2, d2 \n" // 8 int's "vmovl.u16 q2, d2 \n" // 8 int's
...@@ -2580,7 +2565,7 @@ void HalfFloatRow_NEON(const uint16* src, uint16* dst, float scale, int width) { ...@@ -2580,7 +2565,7 @@ void HalfFloatRow_NEON(const uint16* src, uint16* dst, float scale, int width) {
asm volatile( asm volatile(
"vdup.32 q0, %3 \n" "vdup.32 q0, %3 \n"
"1: \n" "1: \n"
"vld1.8 {q1}, [%0]! \n" // load 8 shorts "vld1.8 {q1}, [%0]! \n" // load 8 shorts
"subs %2, %2, #8 \n" // 8 pixels per loop "subs %2, %2, #8 \n" // 8 pixels per loop
"vmovl.u16 q2, d2 \n" // 8 int's "vmovl.u16 q2, d2 \n" // 8 int's
......
...@@ -273,7 +273,7 @@ void I422ToRGB565Row_NEON(const uint8* src_y, ...@@ -273,7 +273,7 @@ void I422ToRGB565Row_NEON(const uint8* src_y,
int width) { int width) {
asm volatile( asm volatile(
YUVTORGB_SETUP YUVTORGB_SETUP
"1: \n" READYUV422 YUVTORGB( "1: \n" READYUV422 YUVTORGB(
v22, v21, v22, v21,
v20) "subs %w4, %w4, #8 \n" ARGBTORGB565 v20) "subs %w4, %w4, #8 \n" ARGBTORGB565
"st1 {v0.8h}, [%3], #16 \n" // store 8 pixels "st1 {v0.8h}, [%3], #16 \n" // store 8 pixels
...@@ -310,7 +310,7 @@ void I422ToARGB1555Row_NEON(const uint8* src_y, ...@@ -310,7 +310,7 @@ void I422ToARGB1555Row_NEON(const uint8* src_y,
asm volatile( asm volatile(
YUVTORGB_SETUP YUVTORGB_SETUP
"movi v23.8b, #255 \n" "movi v23.8b, #255 \n"
"1: \n" READYUV422 YUVTORGB( "1: \n" READYUV422 YUVTORGB(
v22, v21, v22, v21,
v20) "subs %w4, %w4, #8 \n" ARGBTOARGB1555 v20) "subs %w4, %w4, #8 \n" ARGBTOARGB1555
"st1 {v0.8h}, [%3], #16 \n" // store 8 pixels "st1 {v0.8h}, [%3], #16 \n" // store 8 pixels
...@@ -395,7 +395,7 @@ void I400ToARGBRow_NEON(const uint8* src_y, uint8* dst_argb, int width) { ...@@ -395,7 +395,7 @@ void I400ToARGBRow_NEON(const uint8* src_y, uint8* dst_argb, int width) {
void J400ToARGBRow_NEON(const uint8* src_y, uint8* dst_argb, int width) { void J400ToARGBRow_NEON(const uint8* src_y, uint8* dst_argb, int width) {
asm volatile( asm volatile(
"movi v23.8b, #255 \n" "movi v23.8b, #255 \n"
"1: \n" "1: \n"
"ld1 {v20.8b}, [%0], #8 \n" "ld1 {v20.8b}, [%0], #8 \n"
"orr v21.8b, v20.8b, v20.8b \n" "orr v21.8b, v20.8b, v20.8b \n"
"orr v22.8b, v20.8b, v20.8b \n" "orr v22.8b, v20.8b, v20.8b \n"
...@@ -470,7 +470,7 @@ void NV12ToRGB565Row_NEON(const uint8* src_y, ...@@ -470,7 +470,7 @@ void NV12ToRGB565Row_NEON(const uint8* src_y,
int width) { int width) {
asm volatile( asm volatile(
YUVTORGB_SETUP YUVTORGB_SETUP
"1: \n" READNV12 YUVTORGB( "1: \n" READNV12 YUVTORGB(
v22, v21, v22, v21,
v20) "subs %w3, %w3, #8 \n" ARGBTORGB565 v20) "subs %w3, %w3, #8 \n" ARGBTORGB565
"st1 {v0.8h}, [%2], 16 \n" // store 8 pixels "st1 {v0.8h}, [%2], 16 \n" // store 8 pixels
...@@ -544,7 +544,7 @@ void SplitUVRow_NEON(const uint8* src_uv, ...@@ -544,7 +544,7 @@ void SplitUVRow_NEON(const uint8* src_uv,
uint8* dst_v, uint8* dst_v,
int width) { int width) {
asm volatile( asm volatile(
"1: \n" "1: \n"
"ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pairs of UV "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pairs of UV
"subs %w3, %w3, #16 \n" // 16 processed per loop "subs %w3, %w3, #16 \n" // 16 processed per loop
"st1 {v0.16b}, [%1], #16 \n" // store U "st1 {v0.16b}, [%1], #16 \n" // store U
...@@ -565,7 +565,7 @@ void MergeUVRow_NEON(const uint8* src_u, ...@@ -565,7 +565,7 @@ void MergeUVRow_NEON(const uint8* src_u,
uint8* dst_uv, uint8* dst_uv,
int width) { int width) {
asm volatile( asm volatile(
"1: \n" "1: \n"
"ld1 {v0.16b}, [%0], #16 \n" // load U "ld1 {v0.16b}, [%0], #16 \n" // load U
"ld1 {v1.16b}, [%1], #16 \n" // load V "ld1 {v1.16b}, [%1], #16 \n" // load V
"subs %w3, %w3, #16 \n" // 16 processed per loop "subs %w3, %w3, #16 \n" // 16 processed per loop
...@@ -631,7 +631,7 @@ void MergeRGBRow_NEON(const uint8* src_r, ...@@ -631,7 +631,7 @@ void MergeRGBRow_NEON(const uint8* src_r,
// Copy multiple of 32. vld4.8 allow unaligned and is fastest on a15. // Copy multiple of 32. vld4.8 allow unaligned and is fastest on a15.
void CopyRow_NEON(const uint8* src, uint8* dst, int count) { void CopyRow_NEON(const uint8* src, uint8* dst, int count) {
asm volatile( asm volatile(
"1: \n" "1: \n"
"ld1 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 32 "ld1 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 32
"subs %w2, %w2, #32 \n" // 32 processed per loop "subs %w2, %w2, #32 \n" // 32 processed per loop
"st1 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 32 "st1 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 32
...@@ -648,7 +648,7 @@ void CopyRow_NEON(const uint8* src, uint8* dst, int count) { ...@@ -648,7 +648,7 @@ void CopyRow_NEON(const uint8* src, uint8* dst, int count) {
void SetRow_NEON(uint8* dst, uint8 v8, int count) { void SetRow_NEON(uint8* dst, uint8 v8, int count) {
asm volatile( asm volatile(
"dup v0.16b, %w2 \n" // duplicate 16 bytes "dup v0.16b, %w2 \n" // duplicate 16 bytes
"1: \n" "1: \n"
"subs %w1, %w1, #16 \n" // 16 bytes per loop "subs %w1, %w1, #16 \n" // 16 bytes per loop
"st1 {v0.16b}, [%0], #16 \n" // store "st1 {v0.16b}, [%0], #16 \n" // store
"b.gt 1b \n" "b.gt 1b \n"
...@@ -661,7 +661,7 @@ void SetRow_NEON(uint8* dst, uint8 v8, int count) { ...@@ -661,7 +661,7 @@ void SetRow_NEON(uint8* dst, uint8 v8, int count) {
void ARGBSetRow_NEON(uint8* dst, uint32 v32, int count) { void ARGBSetRow_NEON(uint8* dst, uint32 v32, int count) {
asm volatile( asm volatile(
"dup v0.4s, %w2 \n" // duplicate 4 ints "dup v0.4s, %w2 \n" // duplicate 4 ints
"1: \n" "1: \n"
"subs %w1, %w1, #4 \n" // 4 ints per loop "subs %w1, %w1, #4 \n" // 4 ints per loop
"st1 {v0.16b}, [%0], #16 \n" // store "st1 {v0.16b}, [%0], #16 \n" // store
"b.gt 1b \n" "b.gt 1b \n"
...@@ -676,7 +676,7 @@ void MirrorRow_NEON(const uint8* src, uint8* dst, int width) { ...@@ -676,7 +676,7 @@ void MirrorRow_NEON(const uint8* src, uint8* dst, int width) {
// Start at end of source row. // Start at end of source row.
"add %0, %0, %w2, sxtw \n" "add %0, %0, %w2, sxtw \n"
"sub %0, %0, #16 \n" "sub %0, %0, #16 \n"
"1: \n" "1: \n"
"ld1 {v0.16b}, [%0], %3 \n" // src -= 16 "ld1 {v0.16b}, [%0], %3 \n" // src -= 16
"subs %w2, %w2, #16 \n" // 16 pixels per loop. "subs %w2, %w2, #16 \n" // 16 pixels per loop.
"rev64 v0.16b, v0.16b \n" "rev64 v0.16b, v0.16b \n"
...@@ -698,7 +698,7 @@ void MirrorUVRow_NEON(const uint8* src_uv, ...@@ -698,7 +698,7 @@ void MirrorUVRow_NEON(const uint8* src_uv,
// Start at end of source row. // Start at end of source row.
"add %0, %0, %w3, sxtw #1 \n" "add %0, %0, %w3, sxtw #1 \n"
"sub %0, %0, #16 \n" "sub %0, %0, #16 \n"
"1: \n" "1: \n"
"ld2 {v0.8b, v1.8b}, [%0], %4 \n" // src -= 16 "ld2 {v0.8b, v1.8b}, [%0], %4 \n" // src -= 16
"subs %w3, %w3, #8 \n" // 8 pixels per loop. "subs %w3, %w3, #8 \n" // 8 pixels per loop.
"rev64 v0.8b, v0.8b \n" "rev64 v0.8b, v0.8b \n"
...@@ -719,7 +719,7 @@ void ARGBMirrorRow_NEON(const uint8* src, uint8* dst, int width) { ...@@ -719,7 +719,7 @@ void ARGBMirrorRow_NEON(const uint8* src, uint8* dst, int width) {
// Start at end of source row. // Start at end of source row.
"add %0, %0, %w2, sxtw #2 \n" "add %0, %0, %w2, sxtw #2 \n"
"sub %0, %0, #16 \n" "sub %0, %0, #16 \n"
"1: \n" "1: \n"
"ld1 {v0.16b}, [%0], %3 \n" // src -= 16 "ld1 {v0.16b}, [%0], %3 \n" // src -= 16
"subs %w2, %w2, #4 \n" // 4 pixels per loop. "subs %w2, %w2, #4 \n" // 4 pixels per loop.
"rev64 v0.4s, v0.4s \n" "rev64 v0.4s, v0.4s \n"
...@@ -736,11 +736,10 @@ void ARGBMirrorRow_NEON(const uint8* src, uint8* dst, int width) { ...@@ -736,11 +736,10 @@ void ARGBMirrorRow_NEON(const uint8* src, uint8* dst, int width) {
void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int width) { void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int width) {
asm volatile( asm volatile(
"movi v4.8b, #255 \n" // Alpha "movi v4.8b, #255 \n" // Alpha
"1: \n" "1: \n"
"ld3 {v1.8b,v2.8b,v3.8b}, [%0], #24 \n" // load 8 pixels of RGB24. "ld3 {v1.8b,v2.8b,v3.8b}, [%0], #24 \n" // load 8 pixels of RGB24.
"subs %w2, %w2, #8 \n" // 8 processed per loop. "subs %w2, %w2, #8 \n" // 8 processed per loop.
"st4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%1], #32 \n" // store 8 ARGB "st4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%1], #32 \n" // store 8 ARGB
// pixels
"b.gt 1b \n" "b.gt 1b \n"
: "+r"(src_rgb24), // %0 : "+r"(src_rgb24), // %0
"+r"(dst_argb), // %1 "+r"(dst_argb), // %1
...@@ -753,7 +752,7 @@ void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int width) { ...@@ -753,7 +752,7 @@ void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int width) {
void RAWToARGBRow_NEON(const uint8* src_raw, uint8* dst_argb, int width) { void RAWToARGBRow_NEON(const uint8* src_raw, uint8* dst_argb, int width) {
asm volatile( asm volatile(
"movi v5.8b, #255 \n" // Alpha "movi v5.8b, #255 \n" // Alpha
"1: \n" "1: \n"
"ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // read r g b "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // read r g b
"subs %w2, %w2, #8 \n" // 8 processed per loop. "subs %w2, %w2, #8 \n" // 8 processed per loop.
"orr v3.8b, v1.8b, v1.8b \n" // move g "orr v3.8b, v1.8b, v1.8b \n" // move g
...@@ -770,7 +769,7 @@ void RAWToARGBRow_NEON(const uint8* src_raw, uint8* dst_argb, int width) { ...@@ -770,7 +769,7 @@ void RAWToARGBRow_NEON(const uint8* src_raw, uint8* dst_argb, int width) {
void RAWToRGB24Row_NEON(const uint8* src_raw, uint8* dst_rgb24, int width) { void RAWToRGB24Row_NEON(const uint8* src_raw, uint8* dst_rgb24, int width) {
asm volatile( asm volatile(
"1: \n" "1: \n"
"ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // read r g b "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // read r g b
"subs %w2, %w2, #8 \n" // 8 processed per loop. "subs %w2, %w2, #8 \n" // 8 processed per loop.
"orr v3.8b, v1.8b, v1.8b \n" // move g "orr v3.8b, v1.8b, v1.8b \n" // move g
...@@ -801,12 +800,11 @@ void RAWToRGB24Row_NEON(const uint8* src_raw, uint8* dst_rgb24, int width) { ...@@ -801,12 +800,11 @@ void RAWToRGB24Row_NEON(const uint8* src_raw, uint8* dst_rgb24, int width) {
void RGB565ToARGBRow_NEON(const uint8* src_rgb565, uint8* dst_argb, int width) { void RGB565ToARGBRow_NEON(const uint8* src_rgb565, uint8* dst_argb, int width) {
asm volatile( asm volatile(
"movi v3.8b, #255 \n" // Alpha "movi v3.8b, #255 \n" // Alpha
"1: \n" "1: \n"
"ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels. "ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels.
"subs %w2, %w2, #8 \n" // 8 processed per loop. "subs %w2, %w2, #8 \n" // 8 processed per loop.
RGB565TOARGB RGB565TOARGB
"st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB
// pixels
"b.gt 1b \n" "b.gt 1b \n"
: "+r"(src_rgb565), // %0 : "+r"(src_rgb565), // %0
"+r"(dst_argb), // %1 "+r"(dst_argb), // %1
...@@ -858,7 +856,7 @@ void ARGB1555ToARGBRow_NEON(const uint8* src_argb1555, ...@@ -858,7 +856,7 @@ void ARGB1555ToARGBRow_NEON(const uint8* src_argb1555,
int width) { int width) {
asm volatile( asm volatile(
"movi v3.8b, #255 \n" // Alpha "movi v3.8b, #255 \n" // Alpha
"1: \n" "1: \n"
"ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels. "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels.
"subs %w2, %w2, #8 \n" // 8 processed per loop. "subs %w2, %w2, #8 \n" // 8 processed per loop.
ARGB1555TOARGB ARGB1555TOARGB
...@@ -889,7 +887,7 @@ void ARGB4444ToARGBRow_NEON(const uint8* src_argb4444, ...@@ -889,7 +887,7 @@ void ARGB4444ToARGBRow_NEON(const uint8* src_argb4444,
uint8* dst_argb, uint8* dst_argb,
int width) { int width) {
asm volatile( asm volatile(
"1: \n" "1: \n"
"ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels. "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels.
"subs %w2, %w2, #8 \n" // 8 processed per loop. "subs %w2, %w2, #8 \n" // 8 processed per loop.
ARGB4444TOARGB ARGB4444TOARGB
...@@ -906,9 +904,8 @@ void ARGB4444ToARGBRow_NEON(const uint8* src_argb4444, ...@@ -906,9 +904,8 @@ void ARGB4444ToARGBRow_NEON(const uint8* src_argb4444,
void ARGBToRGB24Row_NEON(const uint8* src_argb, uint8* dst_rgb24, int width) { void ARGBToRGB24Row_NEON(const uint8* src_argb, uint8* dst_rgb24, int width) {
asm volatile( asm volatile(
"1: \n" "1: \n"
"ld4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n" // load 8 ARGB "ld4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n" // load 8 ARGB
// pixels
"subs %w2, %w2, #8 \n" // 8 processed per loop. "subs %w2, %w2, #8 \n" // 8 processed per loop.
"st3 {v1.8b,v2.8b,v3.8b}, [%1], #24 \n" // store 8 pixels of "st3 {v1.8b,v2.8b,v3.8b}, [%1], #24 \n" // store 8 pixels of
// RGB24. // RGB24.
...@@ -923,7 +920,7 @@ void ARGBToRGB24Row_NEON(const uint8* src_argb, uint8* dst_rgb24, int width) { ...@@ -923,7 +920,7 @@ void ARGBToRGB24Row_NEON(const uint8* src_argb, uint8* dst_rgb24, int width) {
void ARGBToRAWRow_NEON(const uint8* src_argb, uint8* dst_raw, int width) { void ARGBToRAWRow_NEON(const uint8* src_argb, uint8* dst_raw, int width) {
asm volatile( asm volatile(
"1: \n" "1: \n"
"ld4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n" // load b g r a "ld4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n" // load b g r a
"subs %w2, %w2, #8 \n" // 8 processed per loop. "subs %w2, %w2, #8 \n" // 8 processed per loop.
"orr v4.8b, v2.8b, v2.8b \n" // mov g "orr v4.8b, v2.8b, v2.8b \n" // mov g
...@@ -940,7 +937,7 @@ void ARGBToRAWRow_NEON(const uint8* src_argb, uint8* dst_raw, int width) { ...@@ -940,7 +937,7 @@ void ARGBToRAWRow_NEON(const uint8* src_argb, uint8* dst_raw, int width) {
void YUY2ToYRow_NEON(const uint8* src_yuy2, uint8* dst_y, int width) { void YUY2ToYRow_NEON(const uint8* src_yuy2, uint8* dst_y, int width) {
asm volatile( asm volatile(
"1: \n" "1: \n"
"ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pixels of YUY2. "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pixels of YUY2.
"subs %w2, %w2, #16 \n" // 16 processed per loop. "subs %w2, %w2, #16 \n" // 16 processed per loop.
"st1 {v0.16b}, [%1], #16 \n" // store 16 pixels of Y. "st1 {v0.16b}, [%1], #16 \n" // store 16 pixels of Y.
...@@ -955,7 +952,7 @@ void YUY2ToYRow_NEON(const uint8* src_yuy2, uint8* dst_y, int width) { ...@@ -955,7 +952,7 @@ void YUY2ToYRow_NEON(const uint8* src_yuy2, uint8* dst_y, int width) {
void UYVYToYRow_NEON(const uint8* src_uyvy, uint8* dst_y, int width) { void UYVYToYRow_NEON(const uint8* src_uyvy, uint8* dst_y, int width) {
asm volatile( asm volatile(
"1: \n" "1: \n"
"ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pixels of UYVY. "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pixels of UYVY.
"subs %w2, %w2, #16 \n" // 16 processed per loop. "subs %w2, %w2, #16 \n" // 16 processed per loop.
"st1 {v1.16b}, [%1], #16 \n" // store 16 pixels of Y. "st1 {v1.16b}, [%1], #16 \n" // store 16 pixels of Y.
...@@ -973,9 +970,8 @@ void YUY2ToUV422Row_NEON(const uint8* src_yuy2, ...@@ -973,9 +970,8 @@ void YUY2ToUV422Row_NEON(const uint8* src_yuy2,
uint8* dst_v, uint8* dst_v,
int width) { int width) {
asm volatile( asm volatile(
"1: \n" "1: \n"
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 YUY2 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 YUY2
// pixels
"subs %w3, %w3, #16 \n" // 16 pixels = 8 UVs. "subs %w3, %w3, #16 \n" // 16 pixels = 8 UVs.
"st1 {v1.8b}, [%1], #8 \n" // store 8 U. "st1 {v1.8b}, [%1], #8 \n" // store 8 U.
"st1 {v3.8b}, [%2], #8 \n" // store 8 V. "st1 {v3.8b}, [%2], #8 \n" // store 8 V.
...@@ -994,9 +990,8 @@ void UYVYToUV422Row_NEON(const uint8* src_uyvy, ...@@ -994,9 +990,8 @@ void UYVYToUV422Row_NEON(const uint8* src_uyvy,
uint8* dst_v, uint8* dst_v,
int width) { int width) {
asm volatile( asm volatile(
"1: \n" "1: \n"
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 UYVY "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 UYVY
// pixels
"subs %w3, %w3, #16 \n" // 16 pixels = 8 UVs. "subs %w3, %w3, #16 \n" // 16 pixels = 8 UVs.
"st1 {v0.8b}, [%1], #8 \n" // store 8 U. "st1 {v0.8b}, [%1], #8 \n" // store 8 U.
"st1 {v2.8b}, [%2], #8 \n" // store 8 V. "st1 {v2.8b}, [%2], #8 \n" // store 8 V.
...@@ -1017,7 +1012,7 @@ void YUY2ToUVRow_NEON(const uint8* src_yuy2, ...@@ -1017,7 +1012,7 @@ void YUY2ToUVRow_NEON(const uint8* src_yuy2,
int width) { int width) {
const uint8* src_yuy2b = src_yuy2 + stride_yuy2; const uint8* src_yuy2b = src_yuy2 + stride_yuy2;
asm volatile( asm volatile(
"1: \n" "1: \n"
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 pixels "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 pixels
"subs %w4, %w4, #16 \n" // 16 pixels = 8 UVs. "subs %w4, %w4, #16 \n" // 16 pixels = 8 UVs.
"ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load next row "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load next row
...@@ -1044,7 +1039,7 @@ void UYVYToUVRow_NEON(const uint8* src_uyvy, ...@@ -1044,7 +1039,7 @@ void UYVYToUVRow_NEON(const uint8* src_uyvy,
int width) { int width) {
const uint8* src_uyvyb = src_uyvy + stride_uyvy; const uint8* src_uyvyb = src_uyvy + stride_uyvy;
asm volatile( asm volatile(
"1: \n" "1: \n"
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 pixels "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 pixels
"subs %w4, %w4, #16 \n" // 16 pixels = 8 UVs. "subs %w4, %w4, #16 \n" // 16 pixels = 8 UVs.
"ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load next row "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load next row
...@@ -1071,7 +1066,7 @@ void ARGBShuffleRow_NEON(const uint8* src_argb, ...@@ -1071,7 +1066,7 @@ void ARGBShuffleRow_NEON(const uint8* src_argb,
int width) { int width) {
asm volatile( asm volatile(
"ld1 {v2.16b}, [%3] \n" // shuffler "ld1 {v2.16b}, [%3] \n" // shuffler
"1: \n" "1: \n"
"ld1 {v0.16b}, [%0], #16 \n" // load 4 pixels. "ld1 {v0.16b}, [%0], #16 \n" // load 4 pixels.
"subs %w2, %w2, #4 \n" // 4 processed per loop "subs %w2, %w2, #4 \n" // 4 processed per loop
"tbl v1.16b, {v0.16b}, v2.16b \n" // look up 4 pixels "tbl v1.16b, {v0.16b}, v2.16b \n" // look up 4 pixels
...@@ -1091,7 +1086,7 @@ void I422ToYUY2Row_NEON(const uint8* src_y, ...@@ -1091,7 +1086,7 @@ void I422ToYUY2Row_NEON(const uint8* src_y,
uint8* dst_yuy2, uint8* dst_yuy2,
int width) { int width) {
asm volatile( asm volatile(
"1: \n" "1: \n"
"ld2 {v0.8b, v1.8b}, [%0], #16 \n" // load 16 Ys "ld2 {v0.8b, v1.8b}, [%0], #16 \n" // load 16 Ys
"orr v2.8b, v1.8b, v1.8b \n" "orr v2.8b, v1.8b, v1.8b \n"
"ld1 {v1.8b}, [%1], #8 \n" // load 8 Us "ld1 {v1.8b}, [%1], #8 \n" // load 8 Us
...@@ -1114,7 +1109,7 @@ void I422ToUYVYRow_NEON(const uint8* src_y, ...@@ -1114,7 +1109,7 @@ void I422ToUYVYRow_NEON(const uint8* src_y,
uint8* dst_uyvy, uint8* dst_uyvy,
int width) { int width) {
asm volatile( asm volatile(
"1: \n" "1: \n"
"ld2 {v1.8b,v2.8b}, [%0], #16 \n" // load 16 Ys "ld2 {v1.8b,v2.8b}, [%0], #16 \n" // load 16 Ys
"orr v3.8b, v2.8b, v2.8b \n" "orr v3.8b, v2.8b, v2.8b \n"
"ld1 {v0.8b}, [%1], #8 \n" // load 8 Us "ld1 {v0.8b}, [%1], #8 \n" // load 8 Us
...@@ -1133,7 +1128,7 @@ void I422ToUYVYRow_NEON(const uint8* src_y, ...@@ -1133,7 +1128,7 @@ void I422ToUYVYRow_NEON(const uint8* src_y,
void ARGBToRGB565Row_NEON(const uint8* src_argb, uint8* dst_rgb565, int width) { void ARGBToRGB565Row_NEON(const uint8* src_argb, uint8* dst_rgb565, int width) {
asm volatile( asm volatile(
"1: \n" "1: \n"
"ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8 pixels "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8 pixels
"subs %w2, %w2, #8 \n" // 8 processed per loop. "subs %w2, %w2, #8 \n" // 8 processed per loop.
ARGBTORGB565 ARGBTORGB565
...@@ -1152,7 +1147,7 @@ void ARGBToRGB565DitherRow_NEON(const uint8* src_argb, ...@@ -1152,7 +1147,7 @@ void ARGBToRGB565DitherRow_NEON(const uint8* src_argb,
int width) { int width) {
asm volatile( asm volatile(
"dup v1.4s, %w2 \n" // dither4 "dup v1.4s, %w2 \n" // dither4
"1: \n" "1: \n"
"ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n" // load 8 pixels "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n" // load 8 pixels
"subs %w3, %w3, #8 \n" // 8 processed per loop. "subs %w3, %w3, #8 \n" // 8 processed per loop.
"uqadd v20.8b, v20.8b, v1.8b \n" "uqadd v20.8b, v20.8b, v1.8b \n"
...@@ -1171,7 +1166,7 @@ void ARGBToARGB1555Row_NEON(const uint8* src_argb, ...@@ -1171,7 +1166,7 @@ void ARGBToARGB1555Row_NEON(const uint8* src_argb,
uint8* dst_argb1555, uint8* dst_argb1555,
int width) { int width) {
asm volatile( asm volatile(
"1: \n" "1: \n"
"ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8 pixels "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8 pixels
"subs %w2, %w2, #8 \n" // 8 processed per loop. "subs %w2, %w2, #8 \n" // 8 processed per loop.
ARGBTOARGB1555 ARGBTOARGB1555
...@@ -1191,7 +1186,7 @@ void ARGBToARGB4444Row_NEON(const uint8* src_argb, ...@@ -1191,7 +1186,7 @@ void ARGBToARGB4444Row_NEON(const uint8* src_argb,
asm volatile( asm volatile(
"movi v4.16b, #0x0f \n" // bits to clear with "movi v4.16b, #0x0f \n" // bits to clear with
// vbic. // vbic.
"1: \n" "1: \n"
"ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8 pixels "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8 pixels
"subs %w2, %w2, #8 \n" // 8 processed per loop. "subs %w2, %w2, #8 \n" // 8 processed per loop.
ARGBTOARGB4444 ARGBTOARGB4444
...@@ -1211,9 +1206,8 @@ void ARGBToYRow_NEON(const uint8* src_argb, uint8* dst_y, int width) { ...@@ -1211,9 +1206,8 @@ void ARGBToYRow_NEON(const uint8* src_argb, uint8* dst_y, int width) {
"movi v5.8b, #65 \n" // G * 0.5078 coefficient "movi v5.8b, #65 \n" // G * 0.5078 coefficient
"movi v6.8b, #33 \n" // R * 0.2578 coefficient "movi v6.8b, #33 \n" // R * 0.2578 coefficient
"movi v7.8b, #16 \n" // Add 16 constant "movi v7.8b, #16 \n" // Add 16 constant
"1: \n" "1: \n"
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB
// pixels.
"subs %w2, %w2, #8 \n" // 8 processed per loop. "subs %w2, %w2, #8 \n" // 8 processed per loop.
"umull v3.8h, v0.8b, v4.8b \n" // B "umull v3.8h, v0.8b, v4.8b \n" // B
"umlal v3.8h, v1.8b, v5.8b \n" // G "umlal v3.8h, v1.8b, v5.8b \n" // G
...@@ -1231,7 +1225,7 @@ void ARGBToYRow_NEON(const uint8* src_argb, uint8* dst_y, int width) { ...@@ -1231,7 +1225,7 @@ void ARGBToYRow_NEON(const uint8* src_argb, uint8* dst_y, int width) {
void ARGBExtractAlphaRow_NEON(const uint8* src_argb, uint8* dst_a, int width) { void ARGBExtractAlphaRow_NEON(const uint8* src_argb, uint8* dst_a, int width) {
asm volatile( asm volatile(
"1: \n" "1: \n"
"ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load row 16 "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load row 16
// pixels // pixels
"subs %w2, %w2, #16 \n" // 16 processed per loop "subs %w2, %w2, #16 \n" // 16 processed per loop
...@@ -1250,9 +1244,8 @@ void ARGBToYJRow_NEON(const uint8* src_argb, uint8* dst_y, int width) { ...@@ -1250,9 +1244,8 @@ void ARGBToYJRow_NEON(const uint8* src_argb, uint8* dst_y, int width) {
"movi v4.8b, #15 \n" // B * 0.11400 coefficient "movi v4.8b, #15 \n" // B * 0.11400 coefficient
"movi v5.8b, #75 \n" // G * 0.58700 coefficient "movi v5.8b, #75 \n" // G * 0.58700 coefficient
"movi v6.8b, #38 \n" // R * 0.29900 coefficient "movi v6.8b, #38 \n" // R * 0.29900 coefficient
"1: \n" "1: \n"
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB
// pixels.
"subs %w2, %w2, #8 \n" // 8 processed per loop. "subs %w2, %w2, #8 \n" // 8 processed per loop.
"umull v3.8h, v0.8b, v4.8b \n" // B "umull v3.8h, v0.8b, v4.8b \n" // B
"umlal v3.8h, v1.8b, v5.8b \n" // G "umlal v3.8h, v1.8b, v5.8b \n" // G
...@@ -1280,7 +1273,7 @@ void ARGBToUV444Row_NEON(const uint8* src_argb, ...@@ -1280,7 +1273,7 @@ void ARGBToUV444Row_NEON(const uint8* src_argb,
"movi v27.8b, #18 \n" // VB -0.1406 coefficient "movi v27.8b, #18 \n" // VB -0.1406 coefficient
"movi v28.8b, #94 \n" // VG -0.7344 coefficient "movi v28.8b, #94 \n" // VG -0.7344 coefficient
"movi v29.16b,#0x80 \n" // 128.5 "movi v29.16b,#0x80 \n" // 128.5
"1: \n" "1: \n"
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB
// pixels. // pixels.
"subs %w3, %w3, #8 \n" // 8 processed per loop. "subs %w3, %w3, #8 \n" // 8 processed per loop.
...@@ -1318,23 +1311,19 @@ void ARGBToUV444Row_NEON(const uint8* src_argb, ...@@ -1318,23 +1311,19 @@ void ARGBToUV444Row_NEON(const uint8* src_argb,
"movi v25.16b, #0x80 \n" /* 128.5 (0x8080 in 16-bit) */ "movi v25.16b, #0x80 \n" /* 128.5 (0x8080 in 16-bit) */
// 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16. // 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16.
// clang-format off
#define RGBTOUV(QB, QG, QR) \ #define RGBTOUV(QB, QG, QR) \
"mul v3.8h, " #QB \ "mul v3.8h, " #QB ",v20.8h \n" /* B */ \
",v20.8h \n" /* B */ \ "mul v4.8h, " #QR ",v20.8h \n" /* R */ \
"mul v4.8h, " #QR \ "mls v3.8h, " #QG ",v21.8h \n" /* G */ \
",v20.8h \n" /* R */ \ "mls v4.8h, " #QG ",v24.8h \n" /* G */ \
"mls v3.8h, " #QG \ "mls v3.8h, " #QR ",v22.8h \n" /* R */ \
",v21.8h \n" /* G */ \ "mls v4.8h, " #QB ",v23.8h \n" /* B */ \
"mls v4.8h, " #QG \
",v24.8h \n" /* G */ \
"mls v3.8h, " #QR \
",v22.8h \n" /* R */ \
"mls v4.8h, " #QB \
",v23.8h \n" /* B */ \
"add v3.8h, v3.8h, v25.8h \n" /* +128 -> unsigned */ \ "add v3.8h, v3.8h, v25.8h \n" /* +128 -> unsigned */ \
"add v4.8h, v4.8h, v25.8h \n" /* +128 -> unsigned */ \ "add v4.8h, v4.8h, v25.8h \n" /* +128 -> unsigned */ \
"uqshrn v0.8b, v3.8h, #8 \n" /* 16 bit to 8 bit U */ \ "uqshrn v0.8b, v3.8h, #8 \n" /* 16 bit to 8 bit U */ \
"uqshrn v1.8b, v4.8h, #8 \n" /* 16 bit to 8 bit V */ "uqshrn v1.8b, v4.8h, #8 \n" /* 16 bit to 8 bit V */
// clang-format on
// TODO(fbarchard): Consider vhadd vertical, then vpaddl horizontal, avoid shr. // TODO(fbarchard): Consider vhadd vertical, then vpaddl horizontal, avoid shr.
// TODO(fbarchard): consider ptrdiff_t for all strides. // TODO(fbarchard): consider ptrdiff_t for all strides.
...@@ -1626,9 +1615,8 @@ void RGB565ToUVRow_NEON(const uint8* src_rgb565, ...@@ -1626,9 +1615,8 @@ void RGB565ToUVRow_NEON(const uint8* src_rgb565,
"movi v24.8h, #19, lsl #0 \n" // UR coeff (-0.2969) / 2 "movi v24.8h, #19, lsl #0 \n" // UR coeff (-0.2969) / 2
"movi v25.8h, #9 , lsl #0 \n" // VB coeff (-0.1406) / 2 "movi v25.8h, #9 , lsl #0 \n" // VB coeff (-0.1406) / 2
"movi v26.8h, #47, lsl #0 \n" // VG coeff (-0.7344) / 2 "movi v26.8h, #47, lsl #0 \n" // VG coeff (-0.7344) / 2
"movi v27.16b, #0x80 \n" // 128.5 (0x8080 in "movi v27.16b, #0x80 \n" // 128.5 0x8080 in 16bit
// 16-bit) "1: \n"
"1: \n"
"ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels. "ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels.
RGB565TOARGB RGB565TOARGB
"uaddlp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts. "uaddlp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
...@@ -1693,7 +1681,7 @@ void ARGB1555ToUVRow_NEON(const uint8* src_argb1555, ...@@ -1693,7 +1681,7 @@ void ARGB1555ToUVRow_NEON(const uint8* src_argb1555,
const uint8* src_argb1555_1 = src_argb1555 + src_stride_argb1555; const uint8* src_argb1555_1 = src_argb1555 + src_stride_argb1555;
asm volatile( asm volatile(
RGBTOUV_SETUP_REG RGBTOUV_SETUP_REG
"1: \n" "1: \n"
"ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels. "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels.
RGB555TOARGB RGB555TOARGB
"uaddlp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts. "uaddlp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
...@@ -1758,7 +1746,7 @@ void ARGB4444ToUVRow_NEON(const uint8* src_argb4444, ...@@ -1758,7 +1746,7 @@ void ARGB4444ToUVRow_NEON(const uint8* src_argb4444,
const uint8* src_argb4444_1 = src_argb4444 + src_stride_argb4444; const uint8* src_argb4444_1 = src_argb4444 + src_stride_argb4444;
asm volatile( asm volatile(
RGBTOUV_SETUP_REG RGBTOUV_SETUP_REG
"1: \n" "1: \n"
"ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels. "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels.
ARGB4444TOARGB ARGB4444TOARGB
"uaddlp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts. "uaddlp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
...@@ -1822,7 +1810,7 @@ void RGB565ToYRow_NEON(const uint8* src_rgb565, uint8* dst_y, int width) { ...@@ -1822,7 +1810,7 @@ void RGB565ToYRow_NEON(const uint8* src_rgb565, uint8* dst_y, int width) {
"movi v25.8b, #65 \n" // G * 0.5078 coefficient "movi v25.8b, #65 \n" // G * 0.5078 coefficient
"movi v26.8b, #33 \n" // R * 0.2578 coefficient "movi v26.8b, #33 \n" // R * 0.2578 coefficient
"movi v27.8b, #16 \n" // Add 16 constant "movi v27.8b, #16 \n" // Add 16 constant
"1: \n" "1: \n"
"ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels. "ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels.
"subs %w2, %w2, #8 \n" // 8 processed per loop. "subs %w2, %w2, #8 \n" // 8 processed per loop.
RGB565TOARGB RGB565TOARGB
...@@ -1847,7 +1835,7 @@ void ARGB1555ToYRow_NEON(const uint8* src_argb1555, uint8* dst_y, int width) { ...@@ -1847,7 +1835,7 @@ void ARGB1555ToYRow_NEON(const uint8* src_argb1555, uint8* dst_y, int width) {
"movi v5.8b, #65 \n" // G * 0.5078 coefficient "movi v5.8b, #65 \n" // G * 0.5078 coefficient
"movi v6.8b, #33 \n" // R * 0.2578 coefficient "movi v6.8b, #33 \n" // R * 0.2578 coefficient
"movi v7.8b, #16 \n" // Add 16 constant "movi v7.8b, #16 \n" // Add 16 constant
"1: \n" "1: \n"
"ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels. "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels.
"subs %w2, %w2, #8 \n" // 8 processed per loop. "subs %w2, %w2, #8 \n" // 8 processed per loop.
ARGB1555TOARGB ARGB1555TOARGB
...@@ -1871,7 +1859,7 @@ void ARGB4444ToYRow_NEON(const uint8* src_argb4444, uint8* dst_y, int width) { ...@@ -1871,7 +1859,7 @@ void ARGB4444ToYRow_NEON(const uint8* src_argb4444, uint8* dst_y, int width) {
"movi v25.8b, #65 \n" // G * 0.5078 coefficient "movi v25.8b, #65 \n" // G * 0.5078 coefficient
"movi v26.8b, #33 \n" // R * 0.2578 coefficient "movi v26.8b, #33 \n" // R * 0.2578 coefficient
"movi v27.8b, #16 \n" // Add 16 constant "movi v27.8b, #16 \n" // Add 16 constant
"1: \n" "1: \n"
"ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels. "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels.
"subs %w2, %w2, #8 \n" // 8 processed per loop. "subs %w2, %w2, #8 \n" // 8 processed per loop.
ARGB4444TOARGB ARGB4444TOARGB
...@@ -1895,7 +1883,7 @@ void BGRAToYRow_NEON(const uint8* src_bgra, uint8* dst_y, int width) { ...@@ -1895,7 +1883,7 @@ void BGRAToYRow_NEON(const uint8* src_bgra, uint8* dst_y, int width) {
"movi v5.8b, #65 \n" // G * 0.5078 coefficient "movi v5.8b, #65 \n" // G * 0.5078 coefficient
"movi v6.8b, #13 \n" // B * 0.1016 coefficient "movi v6.8b, #13 \n" // B * 0.1016 coefficient
"movi v7.8b, #16 \n" // Add 16 constant "movi v7.8b, #16 \n" // Add 16 constant
"1: \n" "1: \n"
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels. "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels.
"subs %w2, %w2, #8 \n" // 8 processed per loop. "subs %w2, %w2, #8 \n" // 8 processed per loop.
"umull v16.8h, v1.8b, v4.8b \n" // R "umull v16.8h, v1.8b, v4.8b \n" // R
...@@ -1918,7 +1906,7 @@ void ABGRToYRow_NEON(const uint8* src_abgr, uint8* dst_y, int width) { ...@@ -1918,7 +1906,7 @@ void ABGRToYRow_NEON(const uint8* src_abgr, uint8* dst_y, int width) {
"movi v5.8b, #65 \n" // G * 0.5078 coefficient "movi v5.8b, #65 \n" // G * 0.5078 coefficient
"movi v6.8b, #13 \n" // B * 0.1016 coefficient "movi v6.8b, #13 \n" // B * 0.1016 coefficient
"movi v7.8b, #16 \n" // Add 16 constant "movi v7.8b, #16 \n" // Add 16 constant
"1: \n" "1: \n"
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels. "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels.
"subs %w2, %w2, #8 \n" // 8 processed per loop. "subs %w2, %w2, #8 \n" // 8 processed per loop.
"umull v16.8h, v0.8b, v4.8b \n" // R "umull v16.8h, v0.8b, v4.8b \n" // R
...@@ -1941,7 +1929,7 @@ void RGBAToYRow_NEON(const uint8* src_rgba, uint8* dst_y, int width) { ...@@ -1941,7 +1929,7 @@ void RGBAToYRow_NEON(const uint8* src_rgba, uint8* dst_y, int width) {
"movi v5.8b, #65 \n" // G * 0.5078 coefficient "movi v5.8b, #65 \n" // G * 0.5078 coefficient
"movi v6.8b, #33 \n" // R * 0.2578 coefficient "movi v6.8b, #33 \n" // R * 0.2578 coefficient
"movi v7.8b, #16 \n" // Add 16 constant "movi v7.8b, #16 \n" // Add 16 constant
"1: \n" "1: \n"
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels. "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels.
"subs %w2, %w2, #8 \n" // 8 processed per loop. "subs %w2, %w2, #8 \n" // 8 processed per loop.
"umull v16.8h, v1.8b, v4.8b \n" // B "umull v16.8h, v1.8b, v4.8b \n" // B
...@@ -1964,7 +1952,7 @@ void RGB24ToYRow_NEON(const uint8* src_rgb24, uint8* dst_y, int width) { ...@@ -1964,7 +1952,7 @@ void RGB24ToYRow_NEON(const uint8* src_rgb24, uint8* dst_y, int width) {
"movi v5.8b, #65 \n" // G * 0.5078 coefficient "movi v5.8b, #65 \n" // G * 0.5078 coefficient
"movi v6.8b, #33 \n" // R * 0.2578 coefficient "movi v6.8b, #33 \n" // R * 0.2578 coefficient
"movi v7.8b, #16 \n" // Add 16 constant "movi v7.8b, #16 \n" // Add 16 constant
"1: \n" "1: \n"
"ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // load 8 pixels. "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // load 8 pixels.
"subs %w2, %w2, #8 \n" // 8 processed per loop. "subs %w2, %w2, #8 \n" // 8 processed per loop.
"umull v16.8h, v0.8b, v4.8b \n" // B "umull v16.8h, v0.8b, v4.8b \n" // B
...@@ -1987,7 +1975,7 @@ void RAWToYRow_NEON(const uint8* src_raw, uint8* dst_y, int width) { ...@@ -1987,7 +1975,7 @@ void RAWToYRow_NEON(const uint8* src_raw, uint8* dst_y, int width) {
"movi v5.8b, #65 \n" // G * 0.5078 coefficient "movi v5.8b, #65 \n" // G * 0.5078 coefficient
"movi v6.8b, #13 \n" // B * 0.1016 coefficient "movi v6.8b, #13 \n" // B * 0.1016 coefficient
"movi v7.8b, #16 \n" // Add 16 constant "movi v7.8b, #16 \n" // Add 16 constant
"1: \n" "1: \n"
"ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // load 8 pixels. "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // load 8 pixels.
"subs %w2, %w2, #8 \n" // 8 processed per loop. "subs %w2, %w2, #8 \n" // 8 processed per loop.
"umull v16.8h, v0.8b, v4.8b \n" // B "umull v16.8h, v0.8b, v4.8b \n" // B
...@@ -2022,7 +2010,7 @@ void InterpolateRow_NEON(uint8* dst_ptr, ...@@ -2022,7 +2010,7 @@ void InterpolateRow_NEON(uint8* dst_ptr,
"dup v5.16b, %w4 \n" "dup v5.16b, %w4 \n"
"dup v4.16b, %w5 \n" "dup v4.16b, %w5 \n"
// General purpose row blend. // General purpose row blend.
"1: \n" "1: \n"
"ld1 {v0.16b}, [%1], #16 \n" "ld1 {v0.16b}, [%1], #16 \n"
"ld1 {v1.16b}, [%2], #16 \n" "ld1 {v1.16b}, [%2], #16 \n"
"subs %w3, %w3, #16 \n" "subs %w3, %w3, #16 \n"
...@@ -2037,7 +2025,7 @@ void InterpolateRow_NEON(uint8* dst_ptr, ...@@ -2037,7 +2025,7 @@ void InterpolateRow_NEON(uint8* dst_ptr,
"b 99f \n" "b 99f \n"
// Blend 50 / 50. // Blend 50 / 50.
"50: \n" "50: \n"
"ld1 {v0.16b}, [%1], #16 \n" "ld1 {v0.16b}, [%1], #16 \n"
"ld1 {v1.16b}, [%2], #16 \n" "ld1 {v1.16b}, [%2], #16 \n"
"subs %w3, %w3, #16 \n" "subs %w3, %w3, #16 \n"
...@@ -2047,13 +2035,13 @@ void InterpolateRow_NEON(uint8* dst_ptr, ...@@ -2047,13 +2035,13 @@ void InterpolateRow_NEON(uint8* dst_ptr,
"b 99f \n" "b 99f \n"
// Blend 100 / 0 - Copy row unchanged. // Blend 100 / 0 - Copy row unchanged.
"100: \n" "100: \n"
"ld1 {v0.16b}, [%1], #16 \n" "ld1 {v0.16b}, [%1], #16 \n"
"subs %w3, %w3, #16 \n" "subs %w3, %w3, #16 \n"
"st1 {v0.16b}, [%0], #16 \n" "st1 {v0.16b}, [%0], #16 \n"
"b.gt 100b \n" "b.gt 100b \n"
"99: \n" "99: \n"
: "+r"(dst_ptr), // %0 : "+r"(dst_ptr), // %0
"+r"(src_ptr), // %1 "+r"(src_ptr), // %1
"+r"(src_ptr1), // %2 "+r"(src_ptr1), // %2
...@@ -2073,7 +2061,7 @@ void ARGBBlendRow_NEON(const uint8* src_argb0, ...@@ -2073,7 +2061,7 @@ void ARGBBlendRow_NEON(const uint8* src_argb0,
"subs %w3, %w3, #8 \n" "subs %w3, %w3, #8 \n"
"b.lt 89f \n" "b.lt 89f \n"
// Blend 8 pixels. // Blend 8 pixels.
"8: \n" "8: \n"
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB0 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB0
// pixels // pixels
"ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 ARGB1 "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 ARGB1
...@@ -2096,12 +2084,12 @@ void ARGBBlendRow_NEON(const uint8* src_argb0, ...@@ -2096,12 +2084,12 @@ void ARGBBlendRow_NEON(const uint8* src_argb0,
// pixels // pixels
"b.ge 8b \n" "b.ge 8b \n"
"89: \n" "89: \n"
"adds %w3, %w3, #8-1 \n" "adds %w3, %w3, #8-1 \n"
"b.lt 99f \n" "b.lt 99f \n"
// Blend 1 pixels. // Blend 1 pixels.
"1: \n" "1: \n"
"ld4 {v0.b,v1.b,v2.b,v3.b}[0], [%0], #4 \n" // load 1 pixel ARGB0. "ld4 {v0.b,v1.b,v2.b,v3.b}[0], [%0], #4 \n" // load 1 pixel ARGB0.
"ld4 {v4.b,v5.b,v6.b,v7.b}[0], [%1], #4 \n" // load 1 pixel ARGB1. "ld4 {v4.b,v5.b,v6.b,v7.b}[0], [%1], #4 \n" // load 1 pixel ARGB1.
"subs %w3, %w3, #1 \n" // 1 processed per loop. "subs %w3, %w3, #1 \n" // 1 processed per loop.
...@@ -2121,7 +2109,7 @@ void ARGBBlendRow_NEON(const uint8* src_argb0, ...@@ -2121,7 +2109,7 @@ void ARGBBlendRow_NEON(const uint8* src_argb0,
"st4 {v0.b,v1.b,v2.b,v3.b}[0], [%2], #4 \n" // store 1 pixel. "st4 {v0.b,v1.b,v2.b,v3.b}[0], [%2], #4 \n" // store 1 pixel.
"b.ge 1b \n" "b.ge 1b \n"
"99: \n" "99: \n"
: "+r"(src_argb0), // %0 : "+r"(src_argb0), // %0
"+r"(src_argb1), // %1 "+r"(src_argb1), // %1
...@@ -2136,9 +2124,8 @@ void ARGBBlendRow_NEON(const uint8* src_argb0, ...@@ -2136,9 +2124,8 @@ void ARGBBlendRow_NEON(const uint8* src_argb0,
void ARGBAttenuateRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) { void ARGBAttenuateRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) {
asm volatile( asm volatile(
// Attenuate 8 pixels. // Attenuate 8 pixels.
"1: \n" "1: \n"
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB
// pixels
"subs %w2, %w2, #8 \n" // 8 processed per loop. "subs %w2, %w2, #8 \n" // 8 processed per loop.
"umull v4.8h, v0.8b, v3.8b \n" // b * a "umull v4.8h, v0.8b, v3.8b \n" // b * a
"umull v5.8h, v1.8b, v3.8b \n" // g * a "umull v5.8h, v1.8b, v3.8b \n" // g * a
...@@ -2170,9 +2157,8 @@ void ARGBQuantizeRow_NEON(uint8* dst_argb, ...@@ -2170,9 +2157,8 @@ void ARGBQuantizeRow_NEON(uint8* dst_argb,
"dup v6.8h, %w4 \n" // interval add "dup v6.8h, %w4 \n" // interval add
// 8 pixel loop. // 8 pixel loop.
"1: \n" "1: \n"
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0] \n" // load 8 pixels of "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0] \n" // load 8 ARGB.
// ARGB.
"subs %w1, %w1, #8 \n" // 8 processed per loop. "subs %w1, %w1, #8 \n" // 8 processed per loop.
"uxtl v0.8h, v0.8b \n" // b (0 .. 255) "uxtl v0.8h, v0.8b \n" // b (0 .. 255)
"uxtl v1.8h, v1.8b \n" "uxtl v1.8h, v1.8b \n"
...@@ -2190,7 +2176,6 @@ void ARGBQuantizeRow_NEON(uint8* dst_argb, ...@@ -2190,7 +2176,6 @@ void ARGBQuantizeRow_NEON(uint8* dst_argb,
"uqxtn v1.8b, v1.8h \n" "uqxtn v1.8b, v1.8h \n"
"uqxtn v2.8b, v2.8h \n" "uqxtn v2.8b, v2.8h \n"
"st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // store 8 ARGB "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // store 8 ARGB
// pixels
"b.gt 1b \n" "b.gt 1b \n"
: "+r"(dst_argb), // %0 : "+r"(dst_argb), // %0
"+r"(width) // %1 "+r"(width) // %1
...@@ -2213,9 +2198,8 @@ void ARGBShadeRow_NEON(const uint8* src_argb, ...@@ -2213,9 +2198,8 @@ void ARGBShadeRow_NEON(const uint8* src_argb,
"ushr v0.8h, v0.8h, #1 \n" // scale / 2. "ushr v0.8h, v0.8h, #1 \n" // scale / 2.
// 8 pixel loop. // 8 pixel loop.
"1: \n" "1: \n"
"ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%0], #32 \n" // load 8 ARGB "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%0], #32 \n" // load 8 ARGB
// pixels.
"subs %w2, %w2, #8 \n" // 8 processed per loop. "subs %w2, %w2, #8 \n" // 8 processed per loop.
"uxtl v4.8h, v4.8b \n" // b (0 .. 255) "uxtl v4.8h, v4.8b \n" // b (0 .. 255)
"uxtl v5.8h, v5.8b \n" "uxtl v5.8h, v5.8b \n"
...@@ -2230,7 +2214,6 @@ void ARGBShadeRow_NEON(const uint8* src_argb, ...@@ -2230,7 +2214,6 @@ void ARGBShadeRow_NEON(const uint8* src_argb,
"uqxtn v6.8b, v6.8h \n" "uqxtn v6.8b, v6.8h \n"
"uqxtn v7.8b, v7.8h \n" "uqxtn v7.8b, v7.8h \n"
"st4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // store 8 ARGB "st4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // store 8 ARGB
// pixels
"b.gt 1b \n" "b.gt 1b \n"
: "+r"(src_argb), // %0 : "+r"(src_argb), // %0
"+r"(dst_argb), // %1 "+r"(dst_argb), // %1
...@@ -2247,9 +2230,8 @@ void ARGBGrayRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) { ...@@ -2247,9 +2230,8 @@ void ARGBGrayRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) {
"movi v24.8b, #15 \n" // B * 0.11400 coefficient "movi v24.8b, #15 \n" // B * 0.11400 coefficient
"movi v25.8b, #75 \n" // G * 0.58700 coefficient "movi v25.8b, #75 \n" // G * 0.58700 coefficient
"movi v26.8b, #38 \n" // R * 0.29900 coefficient "movi v26.8b, #38 \n" // R * 0.29900 coefficient
"1: \n" "1: \n"
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB
// pixels.
"subs %w2, %w2, #8 \n" // 8 processed per loop. "subs %w2, %w2, #8 \n" // 8 processed per loop.
"umull v4.8h, v0.8b, v24.8b \n" // B "umull v4.8h, v0.8b, v24.8b \n" // B
"umlal v4.8h, v1.8b, v25.8b \n" // G "umlal v4.8h, v1.8b, v25.8b \n" // G
...@@ -2282,7 +2264,7 @@ void ARGBSepiaRow_NEON(uint8* dst_argb, int width) { ...@@ -2282,7 +2264,7 @@ void ARGBSepiaRow_NEON(uint8* dst_argb, int width) {
"movi v28.8b, #24 \n" // BB coefficient "movi v28.8b, #24 \n" // BB coefficient
"movi v29.8b, #98 \n" // BG coefficient "movi v29.8b, #98 \n" // BG coefficient
"movi v30.8b, #50 \n" // BR coefficient "movi v30.8b, #50 \n" // BR coefficient
"1: \n" "1: \n"
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0] \n" // load 8 ARGB pixels. "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0] \n" // load 8 ARGB pixels.
"subs %w1, %w1, #8 \n" // 8 processed per loop. "subs %w1, %w1, #8 \n" // 8 processed per loop.
"umull v4.8h, v0.8b, v20.8b \n" // B to Sepia B "umull v4.8h, v0.8b, v20.8b \n" // B to Sepia B
...@@ -2318,9 +2300,8 @@ void ARGBColorMatrixRow_NEON(const uint8* src_argb, ...@@ -2318,9 +2300,8 @@ void ARGBColorMatrixRow_NEON(const uint8* src_argb,
"sxtl v0.8h, v2.8b \n" // B,G coefficients s16. "sxtl v0.8h, v2.8b \n" // B,G coefficients s16.
"sxtl2 v1.8h, v2.16b \n" // R,A coefficients s16. "sxtl2 v1.8h, v2.16b \n" // R,A coefficients s16.
"1: \n" "1: \n"
"ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%0], #32 \n" // load 8 "ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%0], #32 \n" // load 8 ARGB
// pixels.
"subs %w2, %w2, #8 \n" // 8 processed per loop. "subs %w2, %w2, #8 \n" // 8 processed per loop.
"uxtl v16.8h, v16.8b \n" // b (0 .. 255) 16 bit "uxtl v16.8h, v16.8b \n" // b (0 .. 255) 16 bit
"uxtl v17.8h, v17.8b \n" // g "uxtl v17.8h, v17.8b \n" // g
...@@ -2358,8 +2339,7 @@ void ARGBColorMatrixRow_NEON(const uint8* src_argb, ...@@ -2358,8 +2339,7 @@ void ARGBColorMatrixRow_NEON(const uint8* src_argb,
"sqshrun v17.8b, v23.8h, #6 \n" // 16 bit to 8 bit G "sqshrun v17.8b, v23.8h, #6 \n" // 16 bit to 8 bit G
"sqshrun v18.8b, v24.8h, #6 \n" // 16 bit to 8 bit R "sqshrun v18.8b, v24.8h, #6 \n" // 16 bit to 8 bit R
"sqshrun v19.8b, v25.8h, #6 \n" // 16 bit to 8 bit A "sqshrun v19.8b, v25.8h, #6 \n" // 16 bit to 8 bit A
"st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%1], #32 \n" // store 8 "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%1], #32 \n" // store 8 ARGB
// pixels.
"b.gt 1b \n" "b.gt 1b \n"
: "+r"(src_argb), // %0 : "+r"(src_argb), // %0
"+r"(dst_argb), // %1 "+r"(dst_argb), // %1
...@@ -2377,11 +2357,9 @@ void ARGBMultiplyRow_NEON(const uint8* src_argb0, ...@@ -2377,11 +2357,9 @@ void ARGBMultiplyRow_NEON(const uint8* src_argb0,
int width) { int width) {
asm volatile( asm volatile(
// 8 pixel loop. // 8 pixel loop.
"1: \n" "1: \n"
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB
// pixels.
"ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more
// pixels.
"subs %w3, %w3, #8 \n" // 8 processed per loop. "subs %w3, %w3, #8 \n" // 8 processed per loop.
"umull v0.8h, v0.8b, v4.8b \n" // multiply B "umull v0.8h, v0.8b, v4.8b \n" // multiply B
"umull v1.8h, v1.8b, v5.8b \n" // multiply G "umull v1.8h, v1.8b, v5.8b \n" // multiply G
...@@ -2392,9 +2370,7 @@ void ARGBMultiplyRow_NEON(const uint8* src_argb0, ...@@ -2392,9 +2370,7 @@ void ARGBMultiplyRow_NEON(const uint8* src_argb0,
"rshrn v2.8b, v2.8h, #8 \n" // 16 bit to 8 bit R "rshrn v2.8b, v2.8h, #8 \n" // 16 bit to 8 bit R
"rshrn v3.8b, v3.8h, #8 \n" // 16 bit to 8 bit A "rshrn v3.8b, v3.8h, #8 \n" // 16 bit to 8 bit A
"st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB
// pixels
"b.gt 1b \n" "b.gt 1b \n"
: "+r"(src_argb0), // %0 : "+r"(src_argb0), // %0
"+r"(src_argb1), // %1 "+r"(src_argb1), // %1
"+r"(dst_argb), // %2 "+r"(dst_argb), // %2
...@@ -2410,20 +2386,16 @@ void ARGBAddRow_NEON(const uint8* src_argb0, ...@@ -2410,20 +2386,16 @@ void ARGBAddRow_NEON(const uint8* src_argb0,
int width) { int width) {
asm volatile( asm volatile(
// 8 pixel loop. // 8 pixel loop.
"1: \n" "1: \n"
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB
// pixels.
"ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more
// pixels.
"subs %w3, %w3, #8 \n" // 8 processed per loop. "subs %w3, %w3, #8 \n" // 8 processed per loop.
"uqadd v0.8b, v0.8b, v4.8b \n" "uqadd v0.8b, v0.8b, v4.8b \n"
"uqadd v1.8b, v1.8b, v5.8b \n" "uqadd v1.8b, v1.8b, v5.8b \n"
"uqadd v2.8b, v2.8b, v6.8b \n" "uqadd v2.8b, v2.8b, v6.8b \n"
"uqadd v3.8b, v3.8b, v7.8b \n" "uqadd v3.8b, v3.8b, v7.8b \n"
"st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB
// pixels
"b.gt 1b \n" "b.gt 1b \n"
: "+r"(src_argb0), // %0 : "+r"(src_argb0), // %0
"+r"(src_argb1), // %1 "+r"(src_argb1), // %1
"+r"(dst_argb), // %2 "+r"(dst_argb), // %2
...@@ -2439,20 +2411,16 @@ void ARGBSubtractRow_NEON(const uint8* src_argb0, ...@@ -2439,20 +2411,16 @@ void ARGBSubtractRow_NEON(const uint8* src_argb0,
int width) { int width) {
asm volatile( asm volatile(
// 8 pixel loop. // 8 pixel loop.
"1: \n" "1: \n"
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB
// pixels.
"ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more
// pixels.
"subs %w3, %w3, #8 \n" // 8 processed per loop. "subs %w3, %w3, #8 \n" // 8 processed per loop.
"uqsub v0.8b, v0.8b, v4.8b \n" "uqsub v0.8b, v0.8b, v4.8b \n"
"uqsub v1.8b, v1.8b, v5.8b \n" "uqsub v1.8b, v1.8b, v5.8b \n"
"uqsub v2.8b, v2.8b, v6.8b \n" "uqsub v2.8b, v2.8b, v6.8b \n"
"uqsub v3.8b, v3.8b, v7.8b \n" "uqsub v3.8b, v3.8b, v7.8b \n"
"st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB
// pixels
"b.gt 1b \n" "b.gt 1b \n"
: "+r"(src_argb0), // %0 : "+r"(src_argb0), // %0
"+r"(src_argb1), // %1 "+r"(src_argb1), // %1
"+r"(dst_argb), // %2 "+r"(dst_argb), // %2
...@@ -2473,7 +2441,7 @@ void SobelRow_NEON(const uint8* src_sobelx, ...@@ -2473,7 +2441,7 @@ void SobelRow_NEON(const uint8* src_sobelx,
asm volatile( asm volatile(
"movi v3.8b, #255 \n" // alpha "movi v3.8b, #255 \n" // alpha
// 8 pixel loop. // 8 pixel loop.
"1: \n" "1: \n"
"ld1 {v0.8b}, [%0], #8 \n" // load 8 sobelx. "ld1 {v0.8b}, [%0], #8 \n" // load 8 sobelx.
"ld1 {v1.8b}, [%1], #8 \n" // load 8 sobely. "ld1 {v1.8b}, [%1], #8 \n" // load 8 sobely.
"subs %w3, %w3, #8 \n" // 8 processed per loop. "subs %w3, %w3, #8 \n" // 8 processed per loop.
...@@ -2481,7 +2449,6 @@ void SobelRow_NEON(const uint8* src_sobelx, ...@@ -2481,7 +2449,6 @@ void SobelRow_NEON(const uint8* src_sobelx,
"orr v1.8b, v0.8b, v0.8b \n" "orr v1.8b, v0.8b, v0.8b \n"
"orr v2.8b, v0.8b, v0.8b \n" "orr v2.8b, v0.8b, v0.8b \n"
"st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB
// pixels
"b.gt 1b \n" "b.gt 1b \n"
: "+r"(src_sobelx), // %0 : "+r"(src_sobelx), // %0
"+r"(src_sobely), // %1 "+r"(src_sobely), // %1
...@@ -2498,7 +2465,7 @@ void SobelToPlaneRow_NEON(const uint8* src_sobelx, ...@@ -2498,7 +2465,7 @@ void SobelToPlaneRow_NEON(const uint8* src_sobelx,
int width) { int width) {
asm volatile( asm volatile(
// 16 pixel loop. // 16 pixel loop.
"1: \n" "1: \n"
"ld1 {v0.16b}, [%0], #16 \n" // load 16 sobelx. "ld1 {v0.16b}, [%0], #16 \n" // load 16 sobelx.
"ld1 {v1.16b}, [%1], #16 \n" // load 16 sobely. "ld1 {v1.16b}, [%1], #16 \n" // load 16 sobely.
"subs %w3, %w3, #16 \n" // 16 processed per loop. "subs %w3, %w3, #16 \n" // 16 processed per loop.
...@@ -2525,13 +2492,12 @@ void SobelXYRow_NEON(const uint8* src_sobelx, ...@@ -2525,13 +2492,12 @@ void SobelXYRow_NEON(const uint8* src_sobelx,
asm volatile( asm volatile(
"movi v3.8b, #255 \n" // alpha "movi v3.8b, #255 \n" // alpha
// 8 pixel loop. // 8 pixel loop.
"1: \n" "1: \n"
"ld1 {v2.8b}, [%0], #8 \n" // load 8 sobelx. "ld1 {v2.8b}, [%0], #8 \n" // load 8 sobelx.
"ld1 {v0.8b}, [%1], #8 \n" // load 8 sobely. "ld1 {v0.8b}, [%1], #8 \n" // load 8 sobely.
"subs %w3, %w3, #8 \n" // 8 processed per loop. "subs %w3, %w3, #8 \n" // 8 processed per loop.
"uqadd v1.8b, v0.8b, v2.8b \n" // add "uqadd v1.8b, v0.8b, v2.8b \n" // add
"st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB
// pixels
"b.gt 1b \n" "b.gt 1b \n"
: "+r"(src_sobelx), // %0 : "+r"(src_sobelx), // %0
"+r"(src_sobely), // %1 "+r"(src_sobely), // %1
...@@ -2551,7 +2517,7 @@ void SobelXRow_NEON(const uint8* src_y0, ...@@ -2551,7 +2517,7 @@ void SobelXRow_NEON(const uint8* src_y0,
uint8* dst_sobelx, uint8* dst_sobelx,
int width) { int width) {
asm volatile( asm volatile(
"1: \n" "1: \n"
"ld1 {v0.8b}, [%0],%5 \n" // top "ld1 {v0.8b}, [%0],%5 \n" // top
"ld1 {v1.8b}, [%0],%6 \n" "ld1 {v1.8b}, [%0],%6 \n"
"usubl v0.8h, v0.8b, v1.8b \n" "usubl v0.8h, v0.8b, v1.8b \n"
...@@ -2589,7 +2555,7 @@ void SobelYRow_NEON(const uint8* src_y0, ...@@ -2589,7 +2555,7 @@ void SobelYRow_NEON(const uint8* src_y0,
uint8* dst_sobely, uint8* dst_sobely,
int width) { int width) {
asm volatile( asm volatile(
"1: \n" "1: \n"
"ld1 {v0.8b}, [%0],%4 \n" // left "ld1 {v0.8b}, [%0],%4 \n" // left
"ld1 {v1.8b}, [%1],%4 \n" "ld1 {v1.8b}, [%1],%4 \n"
"usubl v0.8h, v0.8b, v1.8b \n" "usubl v0.8h, v0.8b, v1.8b \n"
...@@ -2620,7 +2586,7 @@ void SobelYRow_NEON(const uint8* src_y0, ...@@ -2620,7 +2586,7 @@ void SobelYRow_NEON(const uint8* src_y0,
// Caveat - rounds float to half float whereas scaling version truncates. // Caveat - rounds float to half float whereas scaling version truncates.
void HalfFloat1Row_NEON(const uint16* src, uint16* dst, float, int width) { void HalfFloat1Row_NEON(const uint16* src, uint16* dst, float, int width) {
asm volatile( asm volatile(
"1: \n" "1: \n"
"ld1 {v1.16b}, [%0], #16 \n" // load 8 shorts "ld1 {v1.16b}, [%0], #16 \n" // load 8 shorts
"subs %w2, %w2, #8 \n" // 8 pixels per loop "subs %w2, %w2, #8 \n" // 8 pixels per loop
"uxtl v2.4s, v1.4h \n" // 8 int's "uxtl v2.4s, v1.4h \n" // 8 int's
...@@ -2640,7 +2606,7 @@ void HalfFloat1Row_NEON(const uint16* src, uint16* dst, float, int width) { ...@@ -2640,7 +2606,7 @@ void HalfFloat1Row_NEON(const uint16* src, uint16* dst, float, int width) {
void HalfFloatRow_NEON(const uint16* src, uint16* dst, float scale, int width) { void HalfFloatRow_NEON(const uint16* src, uint16* dst, float scale, int width) {
asm volatile( asm volatile(
"1: \n" "1: \n"
"ld1 {v1.16b}, [%0], #16 \n" // load 8 shorts "ld1 {v1.16b}, [%0], #16 \n" // load 8 shorts
"subs %w2, %w2, #8 \n" // 8 pixels per loop "subs %w2, %w2, #8 \n" // 8 pixels per loop
"uxtl v2.4s, v1.4h \n" // 8 int's "uxtl v2.4s, v1.4h \n" // 8 int's
...@@ -2680,7 +2646,6 @@ float ScaleMaxSamples_NEON(const float* src, ...@@ -2680,7 +2646,6 @@ float ScaleMaxSamples_NEON(const float* src,
"b.gt 1b \n" "b.gt 1b \n"
"fmax v5.4s, v5.4s, v6.4s \n" // max "fmax v5.4s, v5.4s, v6.4s \n" // max
"fmaxv %s3, v5.4s \n" // signed max acculator "fmaxv %s3, v5.4s \n" // signed max acculator
: "+r"(src), // %0 : "+r"(src), // %0
"+r"(dst), // %1 "+r"(dst), // %1
"+r"(width), // %2 "+r"(width), // %2
...@@ -2707,12 +2672,10 @@ float ScaleSumSamples_NEON(const float* src, ...@@ -2707,12 +2672,10 @@ float ScaleSumSamples_NEON(const float* src,
"fmla v5.4s, v1.4s, v1.4s \n" // sum of squares "fmla v5.4s, v1.4s, v1.4s \n" // sum of squares
"fmla v6.4s, v2.4s, v2.4s \n" "fmla v6.4s, v2.4s, v2.4s \n"
"st1 {v3.4s, v4.4s}, [%1], #32 \n" // store 8 samples "st1 {v3.4s, v4.4s}, [%1], #32 \n" // store 8 samples
"b.gt 1b \n" "b.gt 1b \n"
"faddp v5.4s, v5.4s, v6.4s \n" "faddp v5.4s, v5.4s, v6.4s \n"
"faddp v5.4s, v5.4s, v5.4s \n" "faddp v5.4s, v5.4s, v5.4s \n"
"faddp %3.4s, v5.4s, v5.4s \n" // sum "faddp %3.4s, v5.4s, v5.4s \n" // sum
: "+r"(src), // %0 : "+r"(src), // %0
"+r"(dst), // %1 "+r"(dst), // %1
"+r"(width), // %2 "+r"(width), // %2
...@@ -2731,7 +2694,6 @@ void ScaleSamples_NEON(const float* src, float* dst, float scale, int width) { ...@@ -2731,7 +2694,6 @@ void ScaleSamples_NEON(const float* src, float* dst, float scale, int width) {
"fmul v2.4s, v2.4s, %3.s[0] \n" // scale "fmul v2.4s, v2.4s, %3.s[0] \n" // scale
"st1 {v1.4s, v2.4s}, [%1], #32 \n" // store 8 samples "st1 {v1.4s, v2.4s}, [%1], #32 \n" // store 8 samples
"b.gt 1b \n" "b.gt 1b \n"
: "+r"(src), // %0 : "+r"(src), // %0
"+r"(dst), // %1 "+r"(dst), // %1
"+r"(width) // %2 "+r"(width) // %2
...@@ -2768,7 +2730,6 @@ void GaussCol_NEON(const uint16* src0, ...@@ -2768,7 +2730,6 @@ void GaussCol_NEON(const uint16* src0,
"subs %w6, %w6, #8 \n" // 8 processed per loop "subs %w6, %w6, #8 \n" // 8 processed per loop
"st1 {v0.4s,v1.4s}, [%5], #32 \n" // store 8 samples "st1 {v0.4s,v1.4s}, [%5], #32 \n" // store 8 samples
"b.gt 1b \n" "b.gt 1b \n"
: "+r"(src0), // %0 : "+r"(src0), // %0
"+r"(src1), // %1 "+r"(src1), // %1
"+r"(src2), // %2 "+r"(src2), // %2
...@@ -2807,7 +2768,6 @@ void GaussRow_NEON(const uint32* src, uint16* dst, int width) { ...@@ -2807,7 +2768,6 @@ void GaussRow_NEON(const uint32* src, uint16* dst, int width) {
"uqrshrn2 v0.8h, v1.4s, #8 \n" "uqrshrn2 v0.8h, v1.4s, #8 \n"
"st1 {v0.8h}, [%4], #16 \n" // store 8 samples "st1 {v0.8h}, [%4], #16 \n" // store 8 samples
"b.gt 1b \n" "b.gt 1b \n"
: "+r"(src), // %0 : "+r"(src), // %0
"+r"(src1), // %1 "+r"(src1), // %1
"+r"(src2), // %2 "+r"(src2), // %2
......
...@@ -77,11 +77,9 @@ void ScaleRowDown2Box_NEON(const uint8* src_ptr, ...@@ -77,11 +77,9 @@ void ScaleRowDown2Box_NEON(const uint8* src_ptr,
"subs %w3, %w3, #16 \n" // 16 processed per loop "subs %w3, %w3, #16 \n" // 16 processed per loop
"uaddlp v0.8h, v0.16b \n" // row 1 add adjacent "uaddlp v0.8h, v0.16b \n" // row 1 add adjacent
"uaddlp v1.8h, v1.16b \n" "uaddlp v1.8h, v1.16b \n"
"uadalp v0.8h, v2.16b \n" // row 2 add adjacent + "uadalp v0.8h, v2.16b \n" // += row 2 add adjacent
// row1
"uadalp v1.8h, v3.16b \n" "uadalp v1.8h, v3.16b \n"
"rshrn v0.8b, v0.8h, #2 \n" // downshift, round and "rshrn v0.8b, v0.8h, #2 \n" // round and pack
// pack
"rshrn2 v0.16b, v1.8h, #2 \n" "rshrn2 v0.16b, v1.8h, #2 \n"
"st1 {v0.16b}, [%2], #16 \n" "st1 {v0.16b}, [%2], #16 \n"
"b.gt 1b \n" "b.gt 1b \n"
...@@ -101,7 +99,7 @@ void ScaleRowDown4_NEON(const uint8* src_ptr, ...@@ -101,7 +99,7 @@ void ScaleRowDown4_NEON(const uint8* src_ptr,
(void)src_stride; (void)src_stride;
asm volatile( asm volatile(
"1: \n" "1: \n"
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0
"subs %w2, %w2, #8 \n" // 8 processed per loop "subs %w2, %w2, #8 \n" // 8 processed per loop
"st1 {v2.8b}, [%1], #8 \n" "st1 {v2.8b}, [%1], #8 \n"
"b.gt 1b \n" "b.gt 1b \n"
...@@ -230,7 +228,7 @@ void ScaleRowDown34_1_Box_NEON(const uint8* src_ptr, ...@@ -230,7 +228,7 @@ void ScaleRowDown34_1_Box_NEON(const uint8* src_ptr,
asm volatile( asm volatile(
"movi v20.8b, #3 \n" "movi v20.8b, #3 \n"
"add %3, %3, %0 \n" "add %3, %3, %0 \n"
"1: \n" "1: \n"
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0
"ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%3], #32 \n" // src line 1 "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%3], #32 \n" // src line 1
"subs %w2, %w2, #24 \n" "subs %w2, %w2, #24 \n"
...@@ -279,7 +277,7 @@ void ScaleRowDown38_NEON(const uint8* src_ptr, ...@@ -279,7 +277,7 @@ void ScaleRowDown38_NEON(const uint8* src_ptr,
(void)src_stride; (void)src_stride;
asm volatile( asm volatile(
"ld1 {v3.16b}, [%3] \n" "ld1 {v3.16b}, [%3] \n"
"1: \n" "1: \n"
"ld1 {v0.16b,v1.16b}, [%0], #32 \n" "ld1 {v0.16b,v1.16b}, [%0], #32 \n"
"subs %w2, %w2, #12 \n" "subs %w2, %w2, #12 \n"
"tbl v2.16b, {v0.16b,v1.16b}, v3.16b \n" "tbl v2.16b, {v0.16b,v1.16b}, v3.16b \n"
...@@ -394,8 +392,7 @@ void OMITFP ScaleRowDown38_3_Box_NEON(const uint8* src_ptr, ...@@ -394,8 +392,7 @@ void OMITFP ScaleRowDown38_3_Box_NEON(const uint8* src_ptr,
"sqrdmulh v0.8h, v20.8h, v31.8h \n" "sqrdmulh v0.8h, v20.8h, v31.8h \n"
"sqrdmulh v1.8h, v21.8h, v31.8h \n" "sqrdmulh v1.8h, v21.8h, v31.8h \n"
// Align for table lookup, vtbl requires registers to // Align for table lookup, vtbl requires registers to be adjacent
// be adjacent
"tbl v3.16b, {v0.16b, v1.16b, v2.16b}, v30.16b \n" "tbl v3.16b, {v0.16b, v1.16b, v2.16b}, v30.16b \n"
"st1 {v3.8b}, [%1], #8 \n" "st1 {v3.8b}, [%1], #8 \n"
...@@ -776,8 +773,7 @@ void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr, ...@@ -776,8 +773,7 @@ void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr,
"uadalp v1.8h, v17.16b \n" // G 16 bytes -> 8 shorts. "uadalp v1.8h, v17.16b \n" // G 16 bytes -> 8 shorts.
"uadalp v2.8h, v18.16b \n" // R 16 bytes -> 8 shorts. "uadalp v2.8h, v18.16b \n" // R 16 bytes -> 8 shorts.
"uadalp v3.8h, v19.16b \n" // A 16 bytes -> 8 shorts. "uadalp v3.8h, v19.16b \n" // A 16 bytes -> 8 shorts.
"rshrn v0.8b, v0.8h, #2 \n" // downshift, round and "rshrn v0.8b, v0.8h, #2 \n" // round and pack
// pack
"rshrn v1.8b, v1.8h, #2 \n" "rshrn v1.8b, v1.8h, #2 \n"
"rshrn v2.8b, v2.8h, #2 \n" "rshrn v2.8b, v2.8h, #2 \n"
"rshrn v3.8b, v3.8h, #2 \n" "rshrn v3.8b, v3.8h, #2 \n"
...@@ -827,8 +823,7 @@ void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb, ...@@ -827,8 +823,7 @@ void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb,
asm volatile( asm volatile(
"add %1, %1, %0 \n" "add %1, %1, %0 \n"
"1: \n" "1: \n"
"ld1 {v0.8b}, [%0], %4 \n" // Read 4 2x2 blocks -> "ld1 {v0.8b}, [%0], %4 \n" // Read 4 2x2 -> 2x1
// 2x1
"ld1 {v1.8b}, [%1], %4 \n" "ld1 {v1.8b}, [%1], %4 \n"
"ld1 {v2.8b}, [%0], %4 \n" "ld1 {v2.8b}, [%0], %4 \n"
"ld1 {v3.8b}, [%1], %4 \n" "ld1 {v3.8b}, [%1], %4 \n"
...@@ -891,8 +886,7 @@ void ScaleARGBCols_NEON(uint8* dst_argb, ...@@ -891,8 +886,7 @@ void ScaleARGBCols_NEON(uint8* dst_argb,
LOAD1_DATA32_LANE(v1, 3) LOAD1_DATA32_LANE(v1, 3)
// clang-format on // clang-format on
"st1 {v0.4s, v1.4s}, [%0], #32 \n" // store pixels "st1 {v0.4s, v1.4s}, [%0], #32 \n" // store pixels
"subs %w2, %w2, #8 \n" // 8 processed per "subs %w2, %w2, #8 \n" // 8 processed per loop
// loop
"b.gt 1b \n" "b.gt 1b \n"
: "+r"(dst_argb), // %0 : "+r"(dst_argb), // %0
"+r"(src_argb), // %1 "+r"(src_argb), // %1
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment