Commit a20e2c62 authored by fbarchard@google.com

row_neon64 fix for warning on iOS where int width doesn't match %2 size which is…

row_neon64: fix a warning on iOS where the int width operand doesn't match the size of %2, which is 64-bit by default. Change the operand size to be explicitly 32-bit with %w2.
BUG=437
TESTED=try bots
R=bcornell@google.com

Review URL: https://webrtc-codereview.appspot.com/47119004

git-svn-id: http://libyuv.googlecode.com/svn/trunk@1399 16f28f9a-4ce2-e073-06de-1de4eb20be90
parent 6d555466
...@@ -178,7 +178,7 @@ void I444ToARGBRow_NEON(const uint8* src_y, ...@@ -178,7 +178,7 @@ void I444ToARGBRow_NEON(const uint8* src_y,
"1: \n" "1: \n"
READYUV444 READYUV444
YUV422TORGB(v22, v21, v20) YUV422TORGB(v22, v21, v20)
"subs %4, %4, #8 \n" "subs %w4, %w4, #8 \n"
"movi v23.8b, #255 \n" /* A */ "movi v23.8b, #255 \n" /* A */
MEMACCESS(3) MEMACCESS(3)
"st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n" "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n"
...@@ -207,7 +207,7 @@ void I422ToARGBRow_NEON(const uint8* src_y, ...@@ -207,7 +207,7 @@ void I422ToARGBRow_NEON(const uint8* src_y,
"1: \n" "1: \n"
READYUV422 READYUV422
YUV422TORGB(v22, v21, v20) YUV422TORGB(v22, v21, v20)
"subs %4, %4, #8 \n" "subs %w4, %w4, #8 \n"
"movi v23.8b, #255 \n" /* A */ "movi v23.8b, #255 \n" /* A */
MEMACCESS(3) MEMACCESS(3)
"st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n" "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n"
...@@ -236,7 +236,7 @@ void I411ToARGBRow_NEON(const uint8* src_y, ...@@ -236,7 +236,7 @@ void I411ToARGBRow_NEON(const uint8* src_y,
"1: \n" "1: \n"
READYUV411 READYUV411
YUV422TORGB(v22, v21, v20) YUV422TORGB(v22, v21, v20)
"subs %4, %4, #8 \n" "subs %w4, %w4, #8 \n"
"movi v23.8b, #255 \n" /* A */ "movi v23.8b, #255 \n" /* A */
MEMACCESS(3) MEMACCESS(3)
"st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n" "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n"
...@@ -265,7 +265,7 @@ void I422ToBGRARow_NEON(const uint8* src_y, ...@@ -265,7 +265,7 @@ void I422ToBGRARow_NEON(const uint8* src_y,
"1: \n" "1: \n"
READYUV422 READYUV422
YUV422TORGB(v21, v22, v23) YUV422TORGB(v21, v22, v23)
"subs %4, %4, #8 \n" "subs %w4, %w4, #8 \n"
"movi v20.8b, #255 \n" /* A */ "movi v20.8b, #255 \n" /* A */
MEMACCESS(3) MEMACCESS(3)
"st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n" "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n"
...@@ -294,7 +294,7 @@ void I422ToABGRRow_NEON(const uint8* src_y, ...@@ -294,7 +294,7 @@ void I422ToABGRRow_NEON(const uint8* src_y,
"1: \n" "1: \n"
READYUV422 READYUV422
YUV422TORGB(v20, v21, v22) YUV422TORGB(v20, v21, v22)
"subs %4, %4, #8 \n" "subs %w4, %w4, #8 \n"
"movi v23.8b, #255 \n" /* A */ "movi v23.8b, #255 \n" /* A */
MEMACCESS(3) MEMACCESS(3)
"st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n" "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n"
...@@ -323,7 +323,7 @@ void I422ToRGBARow_NEON(const uint8* src_y, ...@@ -323,7 +323,7 @@ void I422ToRGBARow_NEON(const uint8* src_y,
"1: \n" "1: \n"
READYUV422 READYUV422
YUV422TORGB(v23, v22, v21) YUV422TORGB(v23, v22, v21)
"subs %4, %4, #8 \n" "subs %w4, %w4, #8 \n"
"movi v20.8b, #255 \n" /* A */ "movi v20.8b, #255 \n" /* A */
MEMACCESS(3) MEMACCESS(3)
"st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n" "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n"
...@@ -352,7 +352,7 @@ void I422ToRGB24Row_NEON(const uint8* src_y, ...@@ -352,7 +352,7 @@ void I422ToRGB24Row_NEON(const uint8* src_y,
"1: \n" "1: \n"
READYUV422 READYUV422
YUV422TORGB(v22, v21, v20) YUV422TORGB(v22, v21, v20)
"subs %4, %4, #8 \n" "subs %w4, %w4, #8 \n"
MEMACCESS(3) MEMACCESS(3)
"st3 {v20.8b,v21.8b,v22.8b}, [%3], #24 \n" "st3 {v20.8b,v21.8b,v22.8b}, [%3], #24 \n"
"b.gt 1b \n" "b.gt 1b \n"
...@@ -380,7 +380,7 @@ void I422ToRAWRow_NEON(const uint8* src_y, ...@@ -380,7 +380,7 @@ void I422ToRAWRow_NEON(const uint8* src_y,
"1: \n" "1: \n"
READYUV422 READYUV422
YUV422TORGB(v20, v21, v22) YUV422TORGB(v20, v21, v22)
"subs %4, %4, #8 \n" "subs %w4, %w4, #8 \n"
MEMACCESS(3) MEMACCESS(3)
"st3 {v20.8b,v21.8b,v22.8b}, [%3], #24 \n" "st3 {v20.8b,v21.8b,v22.8b}, [%3], #24 \n"
"b.gt 1b \n" "b.gt 1b \n"
...@@ -415,7 +415,7 @@ void I422ToRGB565Row_NEON(const uint8* src_y, ...@@ -415,7 +415,7 @@ void I422ToRGB565Row_NEON(const uint8* src_y,
"1: \n" "1: \n"
READYUV422 READYUV422
YUV422TORGB(v22, v21, v20) YUV422TORGB(v22, v21, v20)
"subs %4, %4, #8 \n" "subs %w4, %w4, #8 \n"
ARGBTORGB565 ARGBTORGB565
MEMACCESS(3) MEMACCESS(3)
"st1 {v0.8h}, [%3], #16 \n" // store 8 pixels RGB565. "st1 {v0.8h}, [%3], #16 \n" // store 8 pixels RGB565.
...@@ -453,7 +453,7 @@ void I422ToARGB1555Row_NEON(const uint8* src_y, ...@@ -453,7 +453,7 @@ void I422ToARGB1555Row_NEON(const uint8* src_y,
"1: \n" "1: \n"
READYUV422 READYUV422
YUV422TORGB(v22, v21, v20) YUV422TORGB(v22, v21, v20)
"subs %4, %4, #8 \n" "subs %w4, %w4, #8 \n"
"movi v23.8b, #255 \n" "movi v23.8b, #255 \n"
ARGBTOARGB1555 ARGBTOARGB1555
MEMACCESS(3) MEMACCESS(3)
...@@ -494,7 +494,7 @@ void I422ToARGB4444Row_NEON(const uint8* src_y, ...@@ -494,7 +494,7 @@ void I422ToARGB4444Row_NEON(const uint8* src_y,
"1: \n" "1: \n"
READYUV422 READYUV422
YUV422TORGB(v22, v21, v20) YUV422TORGB(v22, v21, v20)
"subs %4, %4, #8 \n" "subs %w4, %w4, #8 \n"
"movi v23.8b, #255 \n" "movi v23.8b, #255 \n"
ARGBTOARGB4444 ARGBTOARGB4444
MEMACCESS(3) MEMACCESS(3)
...@@ -517,13 +517,13 @@ void I422ToARGB4444Row_NEON(const uint8* src_y, ...@@ -517,13 +517,13 @@ void I422ToARGB4444Row_NEON(const uint8* src_y,
void I400ToARGBRow_NEON(const uint8* src_y, void I400ToARGBRow_NEON(const uint8* src_y,
uint8* dst_argb, uint8* dst_argb,
int width) { int width) {
int64 width64 = (int64)(width); int64 width64 = (int64)(width);
asm volatile ( asm volatile (
YUV422TORGB_SETUP_REG YUV422TORGB_SETUP_REG
"1: \n" "1: \n"
READYUV400 READYUV400
YUV422TORGB(v22, v21, v20) YUV422TORGB(v22, v21, v20)
"subs %2, %2, #8 \n" "subs %w2, %w2, #8 \n"
"movi v23.8b, #255 \n" "movi v23.8b, #255 \n"
MEMACCESS(1) MEMACCESS(1)
"st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n" "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n"
...@@ -550,7 +550,7 @@ void J400ToARGBRow_NEON(const uint8* src_y, ...@@ -550,7 +550,7 @@ void J400ToARGBRow_NEON(const uint8* src_y,
"ld1 {v20.8b}, [%0], #8 \n" "ld1 {v20.8b}, [%0], #8 \n"
"orr v21.8b, v20.8b, v20.8b \n" "orr v21.8b, v20.8b, v20.8b \n"
"orr v22.8b, v20.8b, v20.8b \n" "orr v22.8b, v20.8b, v20.8b \n"
"subs %2, %2, #8 \n" "subs %w2, %w2, #8 \n"
MEMACCESS(1) MEMACCESS(1)
"st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n" "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n"
"b.gt 1b \n" "b.gt 1b \n"
...@@ -573,7 +573,7 @@ void NV12ToARGBRow_NEON(const uint8* src_y, ...@@ -573,7 +573,7 @@ void NV12ToARGBRow_NEON(const uint8* src_y,
"1: \n" "1: \n"
READNV12 READNV12
YUV422TORGB(v22, v21, v20) YUV422TORGB(v22, v21, v20)
"subs %3, %3, #8 \n" "subs %w3, %w3, #8 \n"
"movi v23.8b, #255 \n" "movi v23.8b, #255 \n"
MEMACCESS(2) MEMACCESS(2)
"st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%2], #32 \n" "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%2], #32 \n"
...@@ -600,7 +600,7 @@ void NV21ToARGBRow_NEON(const uint8* src_y, ...@@ -600,7 +600,7 @@ void NV21ToARGBRow_NEON(const uint8* src_y,
"1: \n" "1: \n"
READNV21 READNV21
YUV422TORGB(v22, v21, v20) YUV422TORGB(v22, v21, v20)
"subs %3, %3, #8 \n" "subs %w3, %w3, #8 \n"
"movi v23.8b, #255 \n" "movi v23.8b, #255 \n"
MEMACCESS(2) MEMACCESS(2)
"st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%2], #32 \n" "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%2], #32 \n"
...@@ -627,7 +627,7 @@ void NV12ToRGB565Row_NEON(const uint8* src_y, ...@@ -627,7 +627,7 @@ void NV12ToRGB565Row_NEON(const uint8* src_y,
"1: \n" "1: \n"
READNV12 READNV12
YUV422TORGB(v22, v21, v20) YUV422TORGB(v22, v21, v20)
"subs %3, %3, #8 \n" "subs %w3, %w3, #8 \n"
ARGBTORGB565 ARGBTORGB565
MEMACCESS(2) MEMACCESS(2)
"st1 {v0.8h}, [%2], 16 \n" // store 8 pixels RGB565. "st1 {v0.8h}, [%2], 16 \n" // store 8 pixels RGB565.
...@@ -654,7 +654,7 @@ void NV21ToRGB565Row_NEON(const uint8* src_y, ...@@ -654,7 +654,7 @@ void NV21ToRGB565Row_NEON(const uint8* src_y,
"1: \n" "1: \n"
READNV21 READNV21
YUV422TORGB(v22, v21, v20) YUV422TORGB(v22, v21, v20)
"subs %3, %3, #8 \n" "subs %w3, %w3, #8 \n"
ARGBTORGB565 ARGBTORGB565
MEMACCESS(2) MEMACCESS(2)
"st1 {v0.8h}, [%2], 16 \n" // store 8 pixels RGB565. "st1 {v0.8h}, [%2], 16 \n" // store 8 pixels RGB565.
...@@ -675,13 +675,13 @@ void NV21ToRGB565Row_NEON(const uint8* src_y, ...@@ -675,13 +675,13 @@ void NV21ToRGB565Row_NEON(const uint8* src_y,
void YUY2ToARGBRow_NEON(const uint8* src_yuy2, void YUY2ToARGBRow_NEON(const uint8* src_yuy2,
uint8* dst_argb, uint8* dst_argb,
int width) { int width) {
int64 width64 = (int64)(width); int64 width64 = (int64)(width);
asm volatile ( asm volatile (
YUV422TORGB_SETUP_REG YUV422TORGB_SETUP_REG
"1: \n" "1: \n"
READYUY2 READYUY2
YUV422TORGB(v22, v21, v20) YUV422TORGB(v22, v21, v20)
"subs %2, %2, #8 \n" "subs %w2, %w2, #8 \n"
"movi v23.8b, #255 \n" "movi v23.8b, #255 \n"
MEMACCESS(1) MEMACCESS(1)
"st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n" "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n"
...@@ -701,13 +701,13 @@ void YUY2ToARGBRow_NEON(const uint8* src_yuy2, ...@@ -701,13 +701,13 @@ void YUY2ToARGBRow_NEON(const uint8* src_yuy2,
void UYVYToARGBRow_NEON(const uint8* src_uyvy, void UYVYToARGBRow_NEON(const uint8* src_uyvy,
uint8* dst_argb, uint8* dst_argb,
int width) { int width) {
int64 width64 = (int64)(width); int64 width64 = (int64)(width);
asm volatile ( asm volatile (
YUV422TORGB_SETUP_REG YUV422TORGB_SETUP_REG
"1: \n" "1: \n"
READUYVY READUYVY
YUV422TORGB(v22, v21, v20) YUV422TORGB(v22, v21, v20)
"subs %2, %2, #8 \n" "subs %w2, %w2, #8 \n"
"movi v23.8b, #255 \n" "movi v23.8b, #255 \n"
MEMACCESS(1) MEMACCESS(1)
"st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], 32 \n" "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], 32 \n"
...@@ -731,7 +731,7 @@ void SplitUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v, ...@@ -731,7 +731,7 @@ void SplitUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pairs of UV "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pairs of UV
"subs %3, %3, #16 \n" // 16 processed per loop "subs %w3, %w3, #16 \n" // 16 processed per loop
MEMACCESS(1) MEMACCESS(1)
"st1 {v0.16b}, [%1], #16 \n" // store U "st1 {v0.16b}, [%1], #16 \n" // store U
MEMACCESS(2) MEMACCESS(2)
...@@ -757,7 +757,7 @@ void MergeUVRow_NEON(const uint8* src_u, const uint8* src_v, uint8* dst_uv, ...@@ -757,7 +757,7 @@ void MergeUVRow_NEON(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
"ld1 {v0.16b}, [%0], #16 \n" // load U "ld1 {v0.16b}, [%0], #16 \n" // load U
MEMACCESS(1) MEMACCESS(1)
"ld1 {v1.16b}, [%1], #16 \n" // load V "ld1 {v1.16b}, [%1], #16 \n" // load V
"subs %3, %3, #16 \n" // 16 processed per loop "subs %w3, %w3, #16 \n" // 16 processed per loop
MEMACCESS(2) MEMACCESS(2)
"st2 {v0.16b,v1.16b}, [%2], #32 \n" // store 16 pairs of UV "st2 {v0.16b,v1.16b}, [%2], #32 \n" // store 16 pairs of UV
"b.gt 1b \n" "b.gt 1b \n"
...@@ -779,7 +779,7 @@ void CopyRow_NEON(const uint8* src, uint8* dst, int count) { ...@@ -779,7 +779,7 @@ void CopyRow_NEON(const uint8* src, uint8* dst, int count) {
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"ld1 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 32 "ld1 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 32
"subs %2, %2, #32 \n" // 32 processed per loop "subs %w2, %w2, #32 \n" // 32 processed per loop
MEMACCESS(1) MEMACCESS(1)
"st1 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 32 "st1 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 32
"b.gt 1b \n" "b.gt 1b \n"
...@@ -797,7 +797,7 @@ void SetRow_NEON(uint8* dst, uint8 v8, int count) { ...@@ -797,7 +797,7 @@ void SetRow_NEON(uint8* dst, uint8 v8, int count) {
asm volatile ( asm volatile (
"dup v0.16b, %w2 \n" // duplicate 16 bytes "dup v0.16b, %w2 \n" // duplicate 16 bytes
"1: \n" "1: \n"
"subs %1, %1, #16 \n" // 16 bytes per loop "subs %w1, %w1, #16 \n" // 16 bytes per loop
MEMACCESS(0) MEMACCESS(0)
"st1 {v0.16b}, [%0], #16 \n" // store "st1 {v0.16b}, [%0], #16 \n" // store
"b.gt 1b \n" "b.gt 1b \n"
...@@ -812,7 +812,7 @@ void ARGBSetRow_NEON(uint8* dst, uint32 v32, int count) { ...@@ -812,7 +812,7 @@ void ARGBSetRow_NEON(uint8* dst, uint32 v32, int count) {
asm volatile ( asm volatile (
"dup v0.4s, %w2 \n" // duplicate 4 ints "dup v0.4s, %w2 \n" // duplicate 4 ints
"1: \n" "1: \n"
"subs %1, %1, #4 \n" // 4 ints per loop "subs %w1, %w1, #4 \n" // 4 ints per loop
MEMACCESS(0) MEMACCESS(0)
"st1 {v0.16b}, [%0], #16 \n" // store "st1 {v0.16b}, [%0], #16 \n" // store
"b.gt 1b \n" "b.gt 1b \n"
...@@ -833,7 +833,7 @@ void MirrorRow_NEON(const uint8* src, uint8* dst, int width) { ...@@ -833,7 +833,7 @@ void MirrorRow_NEON(const uint8* src, uint8* dst, int width) {
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"ld1 {v0.16b}, [%0], %3 \n" // src -= 16 "ld1 {v0.16b}, [%0], %3 \n" // src -= 16
"subs %2, %2, #16 \n" // 16 pixels per loop. "subs %w2, %w2, #16 \n" // 16 pixels per loop.
"rev64 v0.16b, v0.16b \n" "rev64 v0.16b, v0.16b \n"
MEMACCESS(1) MEMACCESS(1)
"st1 {v0.D}[1], [%1], #8 \n" // dst += 16 "st1 {v0.D}[1], [%1], #8 \n" // dst += 16
...@@ -860,7 +860,7 @@ void MirrorUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v, ...@@ -860,7 +860,7 @@ void MirrorUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"ld2 {v0.8b, v1.8b}, [%0], %4 \n" // src -= 16 "ld2 {v0.8b, v1.8b}, [%0], %4 \n" // src -= 16
"subs %3, %3, #8 \n" // 8 pixels per loop. "subs %w3, %w3, #8 \n" // 8 pixels per loop.
"rev64 v0.8b, v0.8b \n" "rev64 v0.8b, v0.8b \n"
"rev64 v1.8b, v1.8b \n" "rev64 v1.8b, v1.8b \n"
MEMACCESS(1) MEMACCESS(1)
...@@ -888,7 +888,7 @@ void ARGBMirrorRow_NEON(const uint8* src, uint8* dst, int width) { ...@@ -888,7 +888,7 @@ void ARGBMirrorRow_NEON(const uint8* src, uint8* dst, int width) {
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"ld1 {v0.16b}, [%0], %3 \n" // src -= 16 "ld1 {v0.16b}, [%0], %3 \n" // src -= 16
"subs %2, %2, #4 \n" // 4 pixels per loop. "subs %w2, %w2, #4 \n" // 4 pixels per loop.
"rev64 v0.4s, v0.4s \n" "rev64 v0.4s, v0.4s \n"
MEMACCESS(1) MEMACCESS(1)
"st1 {v0.D}[1], [%1], #8 \n" // dst += 16 "st1 {v0.D}[1], [%1], #8 \n" // dst += 16
...@@ -911,7 +911,7 @@ void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int pix) { ...@@ -911,7 +911,7 @@ void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int pix) {
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"ld3 {v1.8b,v2.8b,v3.8b}, [%0], #24 \n" // load 8 pixels of RGB24. "ld3 {v1.8b,v2.8b,v3.8b}, [%0], #24 \n" // load 8 pixels of RGB24.
"subs %2, %2, #8 \n" // 8 processed per loop. "subs %w2, %w2, #8 \n" // 8 processed per loop.
MEMACCESS(1) MEMACCESS(1)
"st4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%1], #32 \n" // store 8 ARGB pixels "st4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%1], #32 \n" // store 8 ARGB pixels
"b.gt 1b \n" "b.gt 1b \n"
...@@ -931,7 +931,7 @@ void RAWToARGBRow_NEON(const uint8* src_raw, uint8* dst_argb, int pix) { ...@@ -931,7 +931,7 @@ void RAWToARGBRow_NEON(const uint8* src_raw, uint8* dst_argb, int pix) {
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // read r g b "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // read r g b
"subs %2, %2, #8 \n" // 8 processed per loop. "subs %w2, %w2, #8 \n" // 8 processed per loop.
"orr v3.8b, v1.8b, v1.8b \n" // move g "orr v3.8b, v1.8b, v1.8b \n" // move g
"orr v4.8b, v0.8b, v0.8b \n" // move r "orr v4.8b, v0.8b, v0.8b \n" // move r
MEMACCESS(1) MEMACCESS(1)
...@@ -966,7 +966,7 @@ void RGB565ToARGBRow_NEON(const uint8* src_rgb565, uint8* dst_argb, int pix) { ...@@ -966,7 +966,7 @@ void RGB565ToARGBRow_NEON(const uint8* src_rgb565, uint8* dst_argb, int pix) {
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels. "ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels.
"subs %2, %2, #8 \n" // 8 processed per loop. "subs %w2, %w2, #8 \n" // 8 processed per loop.
RGB565TOARGB RGB565TOARGB
MEMACCESS(1) MEMACCESS(1)
"st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB pixels "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB pixels
...@@ -1025,7 +1025,7 @@ void ARGB1555ToARGBRow_NEON(const uint8* src_argb1555, uint8* dst_argb, ...@@ -1025,7 +1025,7 @@ void ARGB1555ToARGBRow_NEON(const uint8* src_argb1555, uint8* dst_argb,
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels. "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels.
"subs %2, %2, #8 \n" // 8 processed per loop. "subs %w2, %w2, #8 \n" // 8 processed per loop.
ARGB1555TOARGB ARGB1555TOARGB
MEMACCESS(1) MEMACCESS(1)
"st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB pixels "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB pixels
...@@ -1058,7 +1058,7 @@ void ARGB4444ToARGBRow_NEON(const uint8* src_argb4444, uint8* dst_argb, ...@@ -1058,7 +1058,7 @@ void ARGB4444ToARGBRow_NEON(const uint8* src_argb4444, uint8* dst_argb,
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels. "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels.
"subs %2, %2, #8 \n" // 8 processed per loop. "subs %w2, %w2, #8 \n" // 8 processed per loop.
ARGB4444TOARGB ARGB4444TOARGB
MEMACCESS(1) MEMACCESS(1)
"st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB pixels "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB pixels
...@@ -1078,7 +1078,7 @@ void ARGBToRGB24Row_NEON(const uint8* src_argb, uint8* dst_rgb24, int pix) { ...@@ -1078,7 +1078,7 @@ void ARGBToRGB24Row_NEON(const uint8* src_argb, uint8* dst_rgb24, int pix) {
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"ld4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n" // load 8 ARGB pixels "ld4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n" // load 8 ARGB pixels
"subs %2, %2, #8 \n" // 8 processed per loop. "subs %w2, %w2, #8 \n" // 8 processed per loop.
MEMACCESS(1) MEMACCESS(1)
"st3 {v1.8b,v2.8b,v3.8b}, [%1], #24 \n" // store 8 pixels of RGB24. "st3 {v1.8b,v2.8b,v3.8b}, [%1], #24 \n" // store 8 pixels of RGB24.
"b.gt 1b \n" "b.gt 1b \n"
...@@ -1097,7 +1097,7 @@ void ARGBToRAWRow_NEON(const uint8* src_argb, uint8* dst_raw, int pix) { ...@@ -1097,7 +1097,7 @@ void ARGBToRAWRow_NEON(const uint8* src_argb, uint8* dst_raw, int pix) {
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"ld4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n" // load b g r a "ld4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n" // load b g r a
"subs %2, %2, #8 \n" // 8 processed per loop. "subs %w2, %w2, #8 \n" // 8 processed per loop.
"orr v4.8b, v2.8b, v2.8b \n" // mov g "orr v4.8b, v2.8b, v2.8b \n" // mov g
"orr v5.8b, v1.8b, v1.8b \n" // mov b "orr v5.8b, v1.8b, v1.8b \n" // mov b
MEMACCESS(1) MEMACCESS(1)
...@@ -1118,7 +1118,7 @@ void YUY2ToYRow_NEON(const uint8* src_yuy2, uint8* dst_y, int pix) { ...@@ -1118,7 +1118,7 @@ void YUY2ToYRow_NEON(const uint8* src_yuy2, uint8* dst_y, int pix) {
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pixels of YUY2. "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pixels of YUY2.
"subs %2, %2, #16 \n" // 16 processed per loop. "subs %w2, %w2, #16 \n" // 16 processed per loop.
MEMACCESS(1) MEMACCESS(1)
"st1 {v0.16b}, [%1], #16 \n" // store 16 pixels of Y. "st1 {v0.16b}, [%1], #16 \n" // store 16 pixels of Y.
"b.gt 1b \n" "b.gt 1b \n"
...@@ -1137,7 +1137,7 @@ void UYVYToYRow_NEON(const uint8* src_uyvy, uint8* dst_y, int pix) { ...@@ -1137,7 +1137,7 @@ void UYVYToYRow_NEON(const uint8* src_uyvy, uint8* dst_y, int pix) {
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pixels of UYVY. "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pixels of UYVY.
"subs %2, %2, #16 \n" // 16 processed per loop. "subs %w2, %w2, #16 \n" // 16 processed per loop.
MEMACCESS(1) MEMACCESS(1)
"st1 {v1.16b}, [%1], #16 \n" // store 16 pixels of Y. "st1 {v1.16b}, [%1], #16 \n" // store 16 pixels of Y.
"b.gt 1b \n" "b.gt 1b \n"
...@@ -1157,7 +1157,7 @@ void YUY2ToUV422Row_NEON(const uint8* src_yuy2, uint8* dst_u, uint8* dst_v, ...@@ -1157,7 +1157,7 @@ void YUY2ToUV422Row_NEON(const uint8* src_yuy2, uint8* dst_u, uint8* dst_v,
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 YUY2 pixels "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 YUY2 pixels
"subs %3, %3, #16 \n" // 16 pixels = 8 UVs. "subs %w3, %w3, #16 \n" // 16 pixels = 8 UVs.
MEMACCESS(1) MEMACCESS(1)
"st1 {v1.8b}, [%1], #8 \n" // store 8 U. "st1 {v1.8b}, [%1], #8 \n" // store 8 U.
MEMACCESS(2) MEMACCESS(2)
...@@ -1180,7 +1180,7 @@ void UYVYToUV422Row_NEON(const uint8* src_uyvy, uint8* dst_u, uint8* dst_v, ...@@ -1180,7 +1180,7 @@ void UYVYToUV422Row_NEON(const uint8* src_uyvy, uint8* dst_u, uint8* dst_v,
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 UYVY pixels "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 UYVY pixels
"subs %3, %3, #16 \n" // 16 pixels = 8 UVs. "subs %w3, %w3, #16 \n" // 16 pixels = 8 UVs.
MEMACCESS(1) MEMACCESS(1)
"st1 {v0.8b}, [%1], #8 \n" // store 8 U. "st1 {v0.8b}, [%1], #8 \n" // store 8 U.
MEMACCESS(2) MEMACCESS(2)
...@@ -1204,7 +1204,7 @@ void YUY2ToUVRow_NEON(const uint8* src_yuy2, int stride_yuy2, ...@@ -1204,7 +1204,7 @@ void YUY2ToUVRow_NEON(const uint8* src_yuy2, int stride_yuy2,
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 pixels "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 pixels
"subs %4, %4, #16 \n" // 16 pixels = 8 UVs. "subs %w4, %w4, #16 \n" // 16 pixels = 8 UVs.
MEMACCESS(1) MEMACCESS(1)
"ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load next row "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load next row
"urhadd v1.8b, v1.8b, v5.8b \n" // average rows of U "urhadd v1.8b, v1.8b, v5.8b \n" // average rows of U
...@@ -1234,7 +1234,7 @@ void UYVYToUVRow_NEON(const uint8* src_uyvy, int stride_uyvy, ...@@ -1234,7 +1234,7 @@ void UYVYToUVRow_NEON(const uint8* src_uyvy, int stride_uyvy,
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 pixels "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 pixels
"subs %4, %4, #16 \n" // 16 pixels = 8 UVs. "subs %w4, %w4, #16 \n" // 16 pixels = 8 UVs.
MEMACCESS(1) MEMACCESS(1)
"ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load next row "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load next row
"urhadd v0.8b, v0.8b, v4.8b \n" // average rows of U "urhadd v0.8b, v0.8b, v4.8b \n" // average rows of U
...@@ -1264,7 +1264,7 @@ void ARGBToBayerGGRow_NEON(const uint8* src_argb, uint8* dst_bayer, ...@@ -1264,7 +1264,7 @@ void ARGBToBayerGGRow_NEON(const uint8* src_argb, uint8* dst_bayer,
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load row 8 pixels "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load row 8 pixels
"subs %2, %2, #8 \n" // 8 processed per loop "subs %w2, %w2, #8 \n" // 8 processed per loop
MEMACCESS(1) MEMACCESS(1)
"st1 {v1.8b}, [%1], #8 \n" // store 8 G's. "st1 {v1.8b}, [%1], #8 \n" // store 8 G's.
"b.gt 1b \n" "b.gt 1b \n"
...@@ -1287,7 +1287,7 @@ void ARGBShuffleRow_NEON(const uint8* src_argb, uint8* dst_argb, ...@@ -1287,7 +1287,7 @@ void ARGBShuffleRow_NEON(const uint8* src_argb, uint8* dst_argb,
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"ld1 {v0.16b}, [%0], #16 \n" // load 4 pixels. "ld1 {v0.16b}, [%0], #16 \n" // load 4 pixels.
"subs %2, %2, #4 \n" // 4 processed per loop "subs %w2, %w2, #4 \n" // 4 processed per loop
"tbl v1.16b, {v0.16b}, v2.16b \n" // look up 4 pixels "tbl v1.16b, {v0.16b}, v2.16b \n" // look up 4 pixels
MEMACCESS(1) MEMACCESS(1)
"st1 {v1.16b}, [%1], #16 \n" // store 4. "st1 {v1.16b}, [%1], #16 \n" // store 4.
...@@ -1315,7 +1315,7 @@ void I422ToYUY2Row_NEON(const uint8* src_y, ...@@ -1315,7 +1315,7 @@ void I422ToYUY2Row_NEON(const uint8* src_y,
"ld1 {v1.8b}, [%1], #8 \n" // load 8 Us "ld1 {v1.8b}, [%1], #8 \n" // load 8 Us
MEMACCESS(2) MEMACCESS(2)
"ld1 {v3.8b}, [%2], #8 \n" // load 8 Vs "ld1 {v3.8b}, [%2], #8 \n" // load 8 Vs
"subs %4, %4, #16 \n" // 16 pixels "subs %w4, %w4, #16 \n" // 16 pixels
MEMACCESS(3) MEMACCESS(3)
"st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%3], #32 \n" // Store 16 pixels. "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%3], #32 \n" // Store 16 pixels.
"b.gt 1b \n" "b.gt 1b \n"
...@@ -1344,7 +1344,7 @@ void I422ToUYVYRow_NEON(const uint8* src_y, ...@@ -1344,7 +1344,7 @@ void I422ToUYVYRow_NEON(const uint8* src_y,
"ld1 {v0.8b}, [%1], #8 \n" // load 8 Us "ld1 {v0.8b}, [%1], #8 \n" // load 8 Us
MEMACCESS(2) MEMACCESS(2)
"ld1 {v2.8b}, [%2], #8 \n" // load 8 Vs "ld1 {v2.8b}, [%2], #8 \n" // load 8 Vs
"subs %4, %4, #16 \n" // 16 pixels "subs %w4, %w4, #16 \n" // 16 pixels
MEMACCESS(3) MEMACCESS(3)
"st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%3], #32 \n" // Store 16 pixels. "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%3], #32 \n" // Store 16 pixels.
"b.gt 1b \n" "b.gt 1b \n"
...@@ -1365,7 +1365,7 @@ void ARGBToRGB565Row_NEON(const uint8* src_argb, uint8* dst_rgb565, int pix) { ...@@ -1365,7 +1365,7 @@ void ARGBToRGB565Row_NEON(const uint8* src_argb, uint8* dst_rgb565, int pix) {
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8 pixels "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8 pixels
"subs %2, %2, #8 \n" // 8 processed per loop. "subs %w2, %w2, #8 \n" // 8 processed per loop.
ARGBTORGB565 ARGBTORGB565
MEMACCESS(1) MEMACCESS(1)
"st1 {v0.16b}, [%1], #16 \n" // store 8 pixels RGB565. "st1 {v0.16b}, [%1], #16 \n" // store 8 pixels RGB565.
...@@ -1411,7 +1411,7 @@ void ARGBToARGB1555Row_NEON(const uint8* src_argb, uint8* dst_argb1555, ...@@ -1411,7 +1411,7 @@ void ARGBToARGB1555Row_NEON(const uint8* src_argb, uint8* dst_argb1555,
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8 pixels "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8 pixels
"subs %2, %2, #8 \n" // 8 processed per loop. "subs %w2, %w2, #8 \n" // 8 processed per loop.
ARGBTOARGB1555 ARGBTOARGB1555
MEMACCESS(1) MEMACCESS(1)
"st1 {v0.16b}, [%1], #16 \n" // store 8 pixels ARGB1555. "st1 {v0.16b}, [%1], #16 \n" // store 8 pixels ARGB1555.
...@@ -1433,7 +1433,7 @@ void ARGBToARGB4444Row_NEON(const uint8* src_argb, uint8* dst_argb4444, ...@@ -1433,7 +1433,7 @@ void ARGBToARGB4444Row_NEON(const uint8* src_argb, uint8* dst_argb4444,
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8 pixels "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8 pixels
"subs %2, %2, #8 \n" // 8 processed per loop. "subs %w2, %w2, #8 \n" // 8 processed per loop.
ARGBTOARGB4444 ARGBTOARGB4444
MEMACCESS(1) MEMACCESS(1)
"st1 {v0.16b}, [%1], #16 \n" // store 8 pixels ARGB4444. "st1 {v0.16b}, [%1], #16 \n" // store 8 pixels ARGB4444.
...@@ -1457,7 +1457,7 @@ void ARGBToYRow_NEON(const uint8* src_argb, uint8* dst_y, int pix) { ...@@ -1457,7 +1457,7 @@ void ARGBToYRow_NEON(const uint8* src_argb, uint8* dst_y, int pix) {
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels. "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels.
"subs %2, %2, #8 \n" // 8 processed per loop. "subs %w2, %w2, #8 \n" // 8 processed per loop.
"umull v3.8h, v0.8b, v4.8b \n" // B "umull v3.8h, v0.8b, v4.8b \n" // B
"umlal v3.8h, v1.8b, v5.8b \n" // G "umlal v3.8h, v1.8b, v5.8b \n" // G
"umlal v3.8h, v2.8b, v6.8b \n" // R "umlal v3.8h, v2.8b, v6.8b \n" // R
...@@ -1484,7 +1484,7 @@ void ARGBToYJRow_NEON(const uint8* src_argb, uint8* dst_y, int pix) { ...@@ -1484,7 +1484,7 @@ void ARGBToYJRow_NEON(const uint8* src_argb, uint8* dst_y, int pix) {
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels. "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels.
"subs %2, %2, #8 \n" // 8 processed per loop. "subs %w2, %w2, #8 \n" // 8 processed per loop.
"umull v3.8h, v0.8b, v4.8b \n" // B "umull v3.8h, v0.8b, v4.8b \n" // B
"umlal v3.8h, v1.8b, v5.8b \n" // G "umlal v3.8h, v1.8b, v5.8b \n" // G
"umlal v3.8h, v2.8b, v6.8b \n" // R "umlal v3.8h, v2.8b, v6.8b \n" // R
...@@ -1515,7 +1515,7 @@ void ARGBToUV444Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v, ...@@ -1515,7 +1515,7 @@ void ARGBToUV444Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels. "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels.
"subs %3, %3, #8 \n" // 8 processed per loop. "subs %w3, %w3, #8 \n" // 8 processed per loop.
"umull v4.8h, v0.8b, v24.8b \n" // B "umull v4.8h, v0.8b, v24.8b \n" // B
"umlsl v4.8h, v1.8b, v25.8b \n" // G "umlsl v4.8h, v1.8b, v25.8b \n" // G
"umlsl v4.8h, v2.8b, v26.8b \n" // R "umlsl v4.8h, v2.8b, v26.8b \n" // R
...@@ -1559,7 +1559,7 @@ void ARGBToUV422Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v, ...@@ -1559,7 +1559,7 @@ void ARGBToUV422Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
"uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts. "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
"uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts. "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts.
"subs %3, %3, #16 \n" // 16 processed per loop. "subs %w3, %w3, #16 \n" // 16 processed per loop.
"mul v3.8h, v0.8h, v20.8h \n" // B "mul v3.8h, v0.8h, v20.8h \n" // B
"mls v3.8h, v1.8h, v21.8h \n" // G "mls v3.8h, v1.8h, v21.8h \n" // G
"mls v3.8h, v2.8h, v22.8h \n" // R "mls v3.8h, v2.8h, v22.8h \n" // R
...@@ -1615,7 +1615,7 @@ void ARGBToUV411Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v, ...@@ -1615,7 +1615,7 @@ void ARGBToUV411Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
"urshr v1.8h, v1.8h, #1 \n" "urshr v1.8h, v1.8h, #1 \n"
"urshr v2.8h, v2.8h, #1 \n" "urshr v2.8h, v2.8h, #1 \n"
"subs %3, %3, #32 \n" // 32 processed per loop. "subs %w3, %w3, #32 \n" // 32 processed per loop.
"mul v3.8h, v0.8h, v20.8h \n" // B "mul v3.8h, v0.8h, v20.8h \n" // B
"mls v3.8h, v1.8h, v21.8h \n" // G "mls v3.8h, v1.8h, v21.8h \n" // G
"mls v3.8h, v2.8h, v22.8h \n" // R "mls v3.8h, v2.8h, v22.8h \n" // R
...@@ -1681,7 +1681,7 @@ void ARGBToUVRow_NEON(const uint8* src_argb, int src_stride_argb, ...@@ -1681,7 +1681,7 @@ void ARGBToUVRow_NEON(const uint8* src_argb, int src_stride_argb,
"urshr v1.8h, v1.8h, #1 \n" "urshr v1.8h, v1.8h, #1 \n"
"urshr v2.8h, v2.8h, #1 \n" "urshr v2.8h, v2.8h, #1 \n"
"subs %4, %4, #16 \n" // 32 processed per loop. "subs %w4, %w4, #16 \n" // 32 processed per loop.
RGBTOUV(v0.8h, v1.8h, v2.8h) RGBTOUV(v0.8h, v1.8h, v2.8h)
MEMACCESS(2) MEMACCESS(2)
"st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
...@@ -1728,7 +1728,7 @@ void ARGBToUVJRow_NEON(const uint8* src_argb, int src_stride_argb, ...@@ -1728,7 +1728,7 @@ void ARGBToUVJRow_NEON(const uint8* src_argb, int src_stride_argb,
"urshr v1.8h, v1.8h, #1 \n" "urshr v1.8h, v1.8h, #1 \n"
"urshr v2.8h, v2.8h, #1 \n" "urshr v2.8h, v2.8h, #1 \n"
"subs %4, %4, #16 \n" // 32 processed per loop. "subs %w4, %w4, #16 \n" // 32 processed per loop.
RGBTOUV(v0.8h, v1.8h, v2.8h) RGBTOUV(v0.8h, v1.8h, v2.8h)
MEMACCESS(2) MEMACCESS(2)
"st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
...@@ -1769,7 +1769,7 @@ void BGRAToUVRow_NEON(const uint8* src_bgra, int src_stride_bgra, ...@@ -1769,7 +1769,7 @@ void BGRAToUVRow_NEON(const uint8* src_bgra, int src_stride_bgra,
"urshr v1.8h, v3.8h, #1 \n" "urshr v1.8h, v3.8h, #1 \n"
"urshr v2.8h, v2.8h, #1 \n" "urshr v2.8h, v2.8h, #1 \n"
"subs %4, %4, #16 \n" // 32 processed per loop. "subs %w4, %w4, #16 \n" // 32 processed per loop.
RGBTOUV(v0.8h, v1.8h, v2.8h) RGBTOUV(v0.8h, v1.8h, v2.8h)
MEMACCESS(2) MEMACCESS(2)
"st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
...@@ -1810,7 +1810,7 @@ void ABGRToUVRow_NEON(const uint8* src_abgr, int src_stride_abgr, ...@@ -1810,7 +1810,7 @@ void ABGRToUVRow_NEON(const uint8* src_abgr, int src_stride_abgr,
"urshr v2.8h, v2.8h, #1 \n" "urshr v2.8h, v2.8h, #1 \n"
"urshr v1.8h, v1.8h, #1 \n" "urshr v1.8h, v1.8h, #1 \n"
"subs %4, %4, #16 \n" // 32 processed per loop. "subs %w4, %w4, #16 \n" // 32 processed per loop.
RGBTOUV(v0.8h, v2.8h, v1.8h) RGBTOUV(v0.8h, v2.8h, v1.8h)
MEMACCESS(2) MEMACCESS(2)
"st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
...@@ -1851,7 +1851,7 @@ void RGBAToUVRow_NEON(const uint8* src_rgba, int src_stride_rgba, ...@@ -1851,7 +1851,7 @@ void RGBAToUVRow_NEON(const uint8* src_rgba, int src_stride_rgba,
"urshr v1.8h, v1.8h, #1 \n" "urshr v1.8h, v1.8h, #1 \n"
"urshr v2.8h, v2.8h, #1 \n" "urshr v2.8h, v2.8h, #1 \n"
"subs %4, %4, #16 \n" // 32 processed per loop. "subs %w4, %w4, #16 \n" // 32 processed per loop.
RGBTOUV(v0.8h, v1.8h, v2.8h) RGBTOUV(v0.8h, v1.8h, v2.8h)
MEMACCESS(2) MEMACCESS(2)
"st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
...@@ -1892,7 +1892,7 @@ void RGB24ToUVRow_NEON(const uint8* src_rgb24, int src_stride_rgb24, ...@@ -1892,7 +1892,7 @@ void RGB24ToUVRow_NEON(const uint8* src_rgb24, int src_stride_rgb24,
"urshr v1.8h, v1.8h, #1 \n" "urshr v1.8h, v1.8h, #1 \n"
"urshr v2.8h, v2.8h, #1 \n" "urshr v2.8h, v2.8h, #1 \n"
"subs %4, %4, #16 \n" // 32 processed per loop. "subs %w4, %w4, #16 \n" // 32 processed per loop.
RGBTOUV(v0.8h, v1.8h, v2.8h) RGBTOUV(v0.8h, v1.8h, v2.8h)
MEMACCESS(2) MEMACCESS(2)
"st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
...@@ -1933,7 +1933,7 @@ void RAWToUVRow_NEON(const uint8* src_raw, int src_stride_raw, ...@@ -1933,7 +1933,7 @@ void RAWToUVRow_NEON(const uint8* src_raw, int src_stride_raw,
"urshr v1.8h, v1.8h, #1 \n" "urshr v1.8h, v1.8h, #1 \n"
"urshr v0.8h, v0.8h, #1 \n" "urshr v0.8h, v0.8h, #1 \n"
"subs %4, %4, #16 \n" // 32 processed per loop. "subs %w4, %w4, #16 \n" // 32 processed per loop.
RGBTOUV(v2.8h, v1.8h, v0.8h) RGBTOUV(v2.8h, v1.8h, v0.8h)
MEMACCESS(2) MEMACCESS(2)
"st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
...@@ -1999,7 +1999,7 @@ void RGB565ToUVRow_NEON(const uint8* src_rgb565, int src_stride_rgb565, ...@@ -1999,7 +1999,7 @@ void RGB565ToUVRow_NEON(const uint8* src_rgb565, int src_stride_rgb565,
"urshr v5.8h, v18.8h, #1 \n" "urshr v5.8h, v18.8h, #1 \n"
"urshr v6.8h, v20.8h, #1 \n" "urshr v6.8h, v20.8h, #1 \n"
"subs %4, %4, #16 \n" // 16 processed per loop. "subs %w4, %w4, #16 \n" // 16 processed per loop.
"mul v16.8h, v4.8h, v22.8h \n" // B "mul v16.8h, v4.8h, v22.8h \n" // B
"mls v16.8h, v5.8h, v23.8h \n" // G "mls v16.8h, v5.8h, v23.8h \n" // G
"mls v16.8h, v6.8h, v24.8h \n" // R "mls v16.8h, v6.8h, v24.8h \n" // R
...@@ -2070,7 +2070,7 @@ void ARGB1555ToUVRow_NEON(const uint8* src_argb1555, int src_stride_argb1555, ...@@ -2070,7 +2070,7 @@ void ARGB1555ToUVRow_NEON(const uint8* src_argb1555, int src_stride_argb1555,
"urshr v5.8h, v17.8h, #1 \n" "urshr v5.8h, v17.8h, #1 \n"
"urshr v6.8h, v18.8h, #1 \n" "urshr v6.8h, v18.8h, #1 \n"
"subs %4, %4, #16 \n" // 16 processed per loop. "subs %w4, %w4, #16 \n" // 16 processed per loop.
"mul v2.8h, v4.8h, v20.8h \n" // B "mul v2.8h, v4.8h, v20.8h \n" // B
"mls v2.8h, v5.8h, v21.8h \n" // G "mls v2.8h, v5.8h, v21.8h \n" // G
"mls v2.8h, v6.8h, v22.8h \n" // R "mls v2.8h, v6.8h, v22.8h \n" // R
...@@ -2141,7 +2141,7 @@ void ARGB4444ToUVRow_NEON(const uint8* src_argb4444, int src_stride_argb4444, ...@@ -2141,7 +2141,7 @@ void ARGB4444ToUVRow_NEON(const uint8* src_argb4444, int src_stride_argb4444,
"urshr v5.8h, v17.8h, #1 \n" "urshr v5.8h, v17.8h, #1 \n"
"urshr v6.8h, v18.8h, #1 \n" "urshr v6.8h, v18.8h, #1 \n"
"subs %4, %4, #16 \n" // 16 processed per loop. "subs %w4, %w4, #16 \n" // 16 processed per loop.
"mul v2.8h, v4.8h, v20.8h \n" // B "mul v2.8h, v4.8h, v20.8h \n" // B
"mls v2.8h, v5.8h, v21.8h \n" // G "mls v2.8h, v5.8h, v21.8h \n" // G
"mls v2.8h, v6.8h, v22.8h \n" // R "mls v2.8h, v6.8h, v22.8h \n" // R
...@@ -2181,7 +2181,7 @@ void RGB565ToYRow_NEON(const uint8* src_rgb565, uint8* dst_y, int pix) { ...@@ -2181,7 +2181,7 @@ void RGB565ToYRow_NEON(const uint8* src_rgb565, uint8* dst_y, int pix) {
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels. "ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels.
"subs %2, %2, #8 \n" // 8 processed per loop. "subs %w2, %w2, #8 \n" // 8 processed per loop.
RGB565TOARGB RGB565TOARGB
"umull v3.8h, v0.8b, v24.8b \n" // B "umull v3.8h, v0.8b, v24.8b \n" // B
"umlal v3.8h, v1.8b, v25.8b \n" // G "umlal v3.8h, v1.8b, v25.8b \n" // G
...@@ -2211,7 +2211,7 @@ void ARGB1555ToYRow_NEON(const uint8* src_argb1555, uint8* dst_y, int pix) { ...@@ -2211,7 +2211,7 @@ void ARGB1555ToYRow_NEON(const uint8* src_argb1555, uint8* dst_y, int pix) {
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels. "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels.
"subs %2, %2, #8 \n" // 8 processed per loop. "subs %w2, %w2, #8 \n" // 8 processed per loop.
ARGB1555TOARGB ARGB1555TOARGB
"umull v3.8h, v0.8b, v4.8b \n" // B "umull v3.8h, v0.8b, v4.8b \n" // B
"umlal v3.8h, v1.8b, v5.8b \n" // G "umlal v3.8h, v1.8b, v5.8b \n" // G
...@@ -2240,7 +2240,7 @@ void ARGB4444ToYRow_NEON(const uint8* src_argb4444, uint8* dst_y, int pix) { ...@@ -2240,7 +2240,7 @@ void ARGB4444ToYRow_NEON(const uint8* src_argb4444, uint8* dst_y, int pix) {
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels. "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels.
"subs %2, %2, #8 \n" // 8 processed per loop. "subs %w2, %w2, #8 \n" // 8 processed per loop.
ARGB4444TOARGB ARGB4444TOARGB
"umull v3.8h, v0.8b, v24.8b \n" // B "umull v3.8h, v0.8b, v24.8b \n" // B
"umlal v3.8h, v1.8b, v25.8b \n" // G "umlal v3.8h, v1.8b, v25.8b \n" // G
...@@ -2269,7 +2269,7 @@ void BGRAToYRow_NEON(const uint8* src_bgra, uint8* dst_y, int pix) { ...@@ -2269,7 +2269,7 @@ void BGRAToYRow_NEON(const uint8* src_bgra, uint8* dst_y, int pix) {
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels. "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels.
"subs %2, %2, #8 \n" // 8 processed per loop. "subs %w2, %w2, #8 \n" // 8 processed per loop.
"umull v16.8h, v1.8b, v4.8b \n" // R "umull v16.8h, v1.8b, v4.8b \n" // R
"umlal v16.8h, v2.8b, v5.8b \n" // G "umlal v16.8h, v2.8b, v5.8b \n" // G
"umlal v16.8h, v3.8b, v6.8b \n" // B "umlal v16.8h, v3.8b, v6.8b \n" // B
...@@ -2297,7 +2297,7 @@ void ABGRToYRow_NEON(const uint8* src_abgr, uint8* dst_y, int pix) { ...@@ -2297,7 +2297,7 @@ void ABGRToYRow_NEON(const uint8* src_abgr, uint8* dst_y, int pix) {
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels. "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels.
"subs %2, %2, #8 \n" // 8 processed per loop. "subs %w2, %w2, #8 \n" // 8 processed per loop.
"umull v16.8h, v0.8b, v4.8b \n" // R "umull v16.8h, v0.8b, v4.8b \n" // R
"umlal v16.8h, v1.8b, v5.8b \n" // G "umlal v16.8h, v1.8b, v5.8b \n" // G
"umlal v16.8h, v2.8b, v6.8b \n" // B "umlal v16.8h, v2.8b, v6.8b \n" // B
...@@ -2325,7 +2325,7 @@ void RGBAToYRow_NEON(const uint8* src_rgba, uint8* dst_y, int pix) { ...@@ -2325,7 +2325,7 @@ void RGBAToYRow_NEON(const uint8* src_rgba, uint8* dst_y, int pix) {
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels. "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels.
"subs %2, %2, #8 \n" // 8 processed per loop. "subs %w2, %w2, #8 \n" // 8 processed per loop.
"umull v16.8h, v1.8b, v4.8b \n" // B "umull v16.8h, v1.8b, v4.8b \n" // B
"umlal v16.8h, v2.8b, v5.8b \n" // G "umlal v16.8h, v2.8b, v5.8b \n" // G
"umlal v16.8h, v3.8b, v6.8b \n" // R "umlal v16.8h, v3.8b, v6.8b \n" // R
...@@ -2353,7 +2353,7 @@ void RGB24ToYRow_NEON(const uint8* src_rgb24, uint8* dst_y, int pix) { ...@@ -2353,7 +2353,7 @@ void RGB24ToYRow_NEON(const uint8* src_rgb24, uint8* dst_y, int pix) {
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // load 8 pixels. "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // load 8 pixels.
"subs %2, %2, #8 \n" // 8 processed per loop. "subs %w2, %w2, #8 \n" // 8 processed per loop.
"umull v16.8h, v0.8b, v4.8b \n" // B "umull v16.8h, v0.8b, v4.8b \n" // B
"umlal v16.8h, v1.8b, v5.8b \n" // G "umlal v16.8h, v1.8b, v5.8b \n" // G
"umlal v16.8h, v2.8b, v6.8b \n" // R "umlal v16.8h, v2.8b, v6.8b \n" // R
...@@ -2381,7 +2381,7 @@ void RAWToYRow_NEON(const uint8* src_raw, uint8* dst_y, int pix) { ...@@ -2381,7 +2381,7 @@ void RAWToYRow_NEON(const uint8* src_raw, uint8* dst_y, int pix) {
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // load 8 pixels. "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // load 8 pixels.
"subs %2, %2, #8 \n" // 8 processed per loop. "subs %w2, %w2, #8 \n" // 8 processed per loop.
"umull v16.8h, v0.8b, v4.8b \n" // B "umull v16.8h, v0.8b, v4.8b \n" // B
"umlal v16.8h, v1.8b, v5.8b \n" // G "umlal v16.8h, v1.8b, v5.8b \n" // G
"umlal v16.8h, v2.8b, v6.8b \n" // R "umlal v16.8h, v2.8b, v6.8b \n" // R
...@@ -2425,7 +2425,7 @@ void InterpolateRow_NEON(uint8* dst_ptr, ...@@ -2425,7 +2425,7 @@ void InterpolateRow_NEON(uint8* dst_ptr,
"ld1 {v0.16b}, [%1], #16 \n" "ld1 {v0.16b}, [%1], #16 \n"
MEMACCESS(2) MEMACCESS(2)
"ld1 {v1.16b}, [%2], #16 \n" "ld1 {v1.16b}, [%2], #16 \n"
"subs %3, %3, #16 \n" "subs %w3, %w3, #16 \n"
"umull v2.8h, v0.8b, v4.8b \n" "umull v2.8h, v0.8b, v4.8b \n"
"umull2 v3.8h, v0.16b, v4.16b \n" "umull2 v3.8h, v0.16b, v4.16b \n"
"umlal v2.8h, v1.8b, v5.8b \n" "umlal v2.8h, v1.8b, v5.8b \n"
...@@ -2443,7 +2443,7 @@ void InterpolateRow_NEON(uint8* dst_ptr, ...@@ -2443,7 +2443,7 @@ void InterpolateRow_NEON(uint8* dst_ptr,
"ld1 {v0.16b}, [%1], #16 \n" "ld1 {v0.16b}, [%1], #16 \n"
MEMACCESS(2) MEMACCESS(2)
"ld1 {v1.16b}, [%2], #16 \n" "ld1 {v1.16b}, [%2], #16 \n"
"subs %3, %3, #16 \n" "subs %w3, %w3, #16 \n"
"urhadd v0.16b, v0.16b, v1.16b \n" "urhadd v0.16b, v0.16b, v1.16b \n"
"urhadd v0.16b, v0.16b, v1.16b \n" "urhadd v0.16b, v0.16b, v1.16b \n"
MEMACCESS(0) MEMACCESS(0)
...@@ -2457,7 +2457,7 @@ void InterpolateRow_NEON(uint8* dst_ptr, ...@@ -2457,7 +2457,7 @@ void InterpolateRow_NEON(uint8* dst_ptr,
"ld1 {v0.16b}, [%1], #16 \n" "ld1 {v0.16b}, [%1], #16 \n"
MEMACCESS(2) MEMACCESS(2)
"ld1 {v1.16b}, [%2], #16 \n" "ld1 {v1.16b}, [%2], #16 \n"
"subs %3, %3, #16 \n" "subs %w3, %w3, #16 \n"
"urhadd v0.16b, v0.16b, v1.16b \n" "urhadd v0.16b, v0.16b, v1.16b \n"
MEMACCESS(0) MEMACCESS(0)
"st1 {v0.16b}, [%0], #16 \n" "st1 {v0.16b}, [%0], #16 \n"
...@@ -2470,7 +2470,7 @@ void InterpolateRow_NEON(uint8* dst_ptr, ...@@ -2470,7 +2470,7 @@ void InterpolateRow_NEON(uint8* dst_ptr,
"ld1 {v1.16b}, [%1], #16 \n" "ld1 {v1.16b}, [%1], #16 \n"
MEMACCESS(2) MEMACCESS(2)
"ld1 {v0.16b}, [%2], #16 \n" "ld1 {v0.16b}, [%2], #16 \n"
"subs %3, %3, #16 \n" "subs %w3, %w3, #16 \n"
"urhadd v0.16b, v0.16b, v1.16b \n" "urhadd v0.16b, v0.16b, v1.16b \n"
"urhadd v0.16b, v0.16b, v1.16b \n" "urhadd v0.16b, v0.16b, v1.16b \n"
MEMACCESS(0) MEMACCESS(0)
...@@ -2482,7 +2482,7 @@ void InterpolateRow_NEON(uint8* dst_ptr, ...@@ -2482,7 +2482,7 @@ void InterpolateRow_NEON(uint8* dst_ptr,
"100: \n" "100: \n"
MEMACCESS(1) MEMACCESS(1)
"ld1 {v0.16b}, [%1], #16 \n" "ld1 {v0.16b}, [%1], #16 \n"
"subs %3, %3, #16 \n" "subs %w3, %w3, #16 \n"
MEMACCESS(0) MEMACCESS(0)
"st1 {v0.16b}, [%0], #16 \n" "st1 {v0.16b}, [%0], #16 \n"
"b.gt 100b \n" "b.gt 100b \n"
...@@ -2505,7 +2505,7 @@ void InterpolateRow_NEON(uint8* dst_ptr, ...@@ -2505,7 +2505,7 @@ void InterpolateRow_NEON(uint8* dst_ptr,
void ARGBBlendRow_NEON(const uint8* src_argb0, const uint8* src_argb1, void ARGBBlendRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
uint8* dst_argb, int width) { uint8* dst_argb, int width) {
asm volatile ( asm volatile (
"subs %3, %3, #8 \n" "subs %w3, %w3, #8 \n"
"b.lt 89f \n" "b.lt 89f \n"
// Blend 8 pixels. // Blend 8 pixels.
"8: \n" "8: \n"
...@@ -2513,7 +2513,7 @@ void ARGBBlendRow_NEON(const uint8* src_argb0, const uint8* src_argb1, ...@@ -2513,7 +2513,7 @@ void ARGBBlendRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB0 pixels "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB0 pixels
MEMACCESS(1) MEMACCESS(1)
"ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 ARGB1 pixels "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 ARGB1 pixels
"subs %3, %3, #8 \n" // 8 processed per loop. "subs %w3, %w3, #8 \n" // 8 processed per loop.
"umull v16.8h, v4.8b, v3.8b \n" // db * a "umull v16.8h, v4.8b, v3.8b \n" // db * a
"umull v17.8h, v5.8b, v3.8b \n" // dg * a "umull v17.8h, v5.8b, v3.8b \n" // dg * a
"umull v18.8h, v6.8b, v3.8b \n" // dr * a "umull v18.8h, v6.8b, v3.8b \n" // dr * a
...@@ -2541,7 +2541,7 @@ void ARGBBlendRow_NEON(const uint8* src_argb0, const uint8* src_argb1, ...@@ -2541,7 +2541,7 @@ void ARGBBlendRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
"ld4 {v0.b,v1.b,v2.b,v3.b}[0], [%0], #4 \n" // load 1 pixel ARGB0. "ld4 {v0.b,v1.b,v2.b,v3.b}[0], [%0], #4 \n" // load 1 pixel ARGB0.
MEMACCESS(1) MEMACCESS(1)
"ld4 {v4.b,v5.b,v6.b,v7.b}[0], [%1], #4 \n" // load 1 pixel ARGB1. "ld4 {v4.b,v5.b,v6.b,v7.b}[0], [%1], #4 \n" // load 1 pixel ARGB1.
"subs %3, %3, #1 \n" // 1 processed per loop. "subs %w3, %w3, #1 \n" // 1 processed per loop.
"umull v16.8h, v4.8b, v3.8b \n" // db * a "umull v16.8h, v4.8b, v3.8b \n" // db * a
"umull v17.8h, v5.8b, v3.8b \n" // dg * a "umull v17.8h, v5.8b, v3.8b \n" // dg * a
"umull v18.8h, v6.8b, v3.8b \n" // dr * a "umull v18.8h, v6.8b, v3.8b \n" // dr * a
...@@ -2580,7 +2580,7 @@ void ARGBAttenuateRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) { ...@@ -2580,7 +2580,7 @@ void ARGBAttenuateRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) {
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels
"subs %2, %2, #8 \n" // 8 processed per loop. "subs %w2, %w2, #8 \n" // 8 processed per loop.
"umull v4.8h, v0.8b, v3.8b \n" // b * a "umull v4.8h, v0.8b, v3.8b \n" // b * a
"umull v5.8h, v1.8b, v3.8b \n" // g * a "umull v5.8h, v1.8b, v3.8b \n" // g * a
"umull v6.8h, v2.8b, v3.8b \n" // r * a "umull v6.8h, v2.8b, v3.8b \n" // r * a
...@@ -2614,7 +2614,7 @@ void ARGBQuantizeRow_NEON(uint8* dst_argb, int scale, int interval_size, ...@@ -2614,7 +2614,7 @@ void ARGBQuantizeRow_NEON(uint8* dst_argb, int scale, int interval_size,
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0] \n" // load 8 pixels of ARGB. "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0] \n" // load 8 pixels of ARGB.
"subs %1, %1, #8 \n" // 8 processed per loop. "subs %w1, %w1, #8 \n" // 8 processed per loop.
"uxtl v0.8h, v0.8b \n" // b (0 .. 255) "uxtl v0.8h, v0.8b \n" // b (0 .. 255)
"uxtl v1.8h, v1.8b \n" "uxtl v1.8h, v1.8b \n"
"uxtl v2.8h, v2.8b \n" "uxtl v2.8h, v2.8b \n"
...@@ -2658,7 +2658,7 @@ void ARGBShadeRow_NEON(const uint8* src_argb, uint8* dst_argb, int width, ...@@ -2658,7 +2658,7 @@ void ARGBShadeRow_NEON(const uint8* src_argb, uint8* dst_argb, int width,
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%0], #32 \n" // load 8 ARGB pixels. "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%0], #32 \n" // load 8 ARGB pixels.
"subs %2, %2, #8 \n" // 8 processed per loop. "subs %w2, %w2, #8 \n" // 8 processed per loop.
"uxtl v4.8h, v4.8b \n" // b (0 .. 255) "uxtl v4.8h, v4.8b \n" // b (0 .. 255)
"uxtl v5.8h, v5.8b \n" "uxtl v5.8h, v5.8b \n"
"uxtl v6.8h, v6.8b \n" "uxtl v6.8h, v6.8b \n"
...@@ -2695,7 +2695,7 @@ void ARGBGrayRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) { ...@@ -2695,7 +2695,7 @@ void ARGBGrayRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) {
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels. "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels.
"subs %2, %2, #8 \n" // 8 processed per loop. "subs %w2, %w2, #8 \n" // 8 processed per loop.
"umull v4.8h, v0.8b, v24.8b \n" // B "umull v4.8h, v0.8b, v24.8b \n" // B
"umlal v4.8h, v1.8b, v25.8b \n" // G "umlal v4.8h, v1.8b, v25.8b \n" // G
"umlal v4.8h, v2.8b, v26.8b \n" // R "umlal v4.8h, v2.8b, v26.8b \n" // R
...@@ -2734,7 +2734,7 @@ void ARGBSepiaRow_NEON(uint8* dst_argb, int width) { ...@@ -2734,7 +2734,7 @@ void ARGBSepiaRow_NEON(uint8* dst_argb, int width) {
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0] \n" // load 8 ARGB pixels. "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0] \n" // load 8 ARGB pixels.
"subs %1, %1, #8 \n" // 8 processed per loop. "subs %w1, %w1, #8 \n" // 8 processed per loop.
"umull v4.8h, v0.8b, v20.8b \n" // B to Sepia B "umull v4.8h, v0.8b, v20.8b \n" // B to Sepia B
"umlal v4.8h, v1.8b, v21.8b \n" // G "umlal v4.8h, v1.8b, v21.8b \n" // G
"umlal v4.8h, v2.8b, v22.8b \n" // R "umlal v4.8h, v2.8b, v22.8b \n" // R
...@@ -2774,7 +2774,7 @@ void ARGBColorMatrixRow_NEON(const uint8* src_argb, uint8* dst_argb, ...@@ -2774,7 +2774,7 @@ void ARGBColorMatrixRow_NEON(const uint8* src_argb, uint8* dst_argb,
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%0], #32 \n" // load 8 pixels. "ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%0], #32 \n" // load 8 pixels.
"subs %2, %2, #8 \n" // 8 processed per loop. "subs %w2, %w2, #8 \n" // 8 processed per loop.
"uxtl v16.8h, v16.8b \n" // b (0 .. 255) 16 bit "uxtl v16.8h, v16.8b \n" // b (0 .. 255) 16 bit
"uxtl v17.8h, v17.8b \n" // g "uxtl v17.8h, v17.8b \n" // g
"uxtl v18.8h, v18.8b \n" // r "uxtl v18.8h, v18.8b \n" // r
...@@ -2836,7 +2836,7 @@ void ARGBMultiplyRow_NEON(const uint8* src_argb0, const uint8* src_argb1, ...@@ -2836,7 +2836,7 @@ void ARGBMultiplyRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels. "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels.
MEMACCESS(1) MEMACCESS(1)
"ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more pixels. "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more pixels.
"subs %3, %3, #8 \n" // 8 processed per loop. "subs %w3, %w3, #8 \n" // 8 processed per loop.
"umull v0.8h, v0.8b, v4.8b \n" // multiply B "umull v0.8h, v0.8b, v4.8b \n" // multiply B
"umull v1.8h, v1.8b, v5.8b \n" // multiply G "umull v1.8h, v1.8b, v5.8b \n" // multiply G
"umull v2.8h, v2.8b, v6.8b \n" // multiply R "umull v2.8h, v2.8b, v6.8b \n" // multiply R
...@@ -2870,7 +2870,7 @@ void ARGBAddRow_NEON(const uint8* src_argb0, const uint8* src_argb1, ...@@ -2870,7 +2870,7 @@ void ARGBAddRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels. "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels.
MEMACCESS(1) MEMACCESS(1)
"ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more pixels. "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more pixels.
"subs %3, %3, #8 \n" // 8 processed per loop. "subs %w3, %w3, #8 \n" // 8 processed per loop.
"uqadd v0.8b, v0.8b, v4.8b \n" "uqadd v0.8b, v0.8b, v4.8b \n"
"uqadd v1.8b, v1.8b, v5.8b \n" "uqadd v1.8b, v1.8b, v5.8b \n"
"uqadd v2.8b, v2.8b, v6.8b \n" "uqadd v2.8b, v2.8b, v6.8b \n"
...@@ -2900,7 +2900,7 @@ void ARGBSubtractRow_NEON(const uint8* src_argb0, const uint8* src_argb1, ...@@ -2900,7 +2900,7 @@ void ARGBSubtractRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels. "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels.
MEMACCESS(1) MEMACCESS(1)
"ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more pixels. "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more pixels.
"subs %3, %3, #8 \n" // 8 processed per loop. "subs %w3, %w3, #8 \n" // 8 processed per loop.
"uqsub v0.8b, v0.8b, v4.8b \n" "uqsub v0.8b, v0.8b, v4.8b \n"
"uqsub v1.8b, v1.8b, v5.8b \n" "uqsub v1.8b, v1.8b, v5.8b \n"
"uqsub v2.8b, v2.8b, v6.8b \n" "uqsub v2.8b, v2.8b, v6.8b \n"
...@@ -2935,7 +2935,7 @@ void SobelRow_NEON(const uint8* src_sobelx, const uint8* src_sobely, ...@@ -2935,7 +2935,7 @@ void SobelRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
"ld1 {v0.8b}, [%0], #8 \n" // load 8 sobelx. "ld1 {v0.8b}, [%0], #8 \n" // load 8 sobelx.
MEMACCESS(1) MEMACCESS(1)
"ld1 {v1.8b}, [%1], #8 \n" // load 8 sobely. "ld1 {v1.8b}, [%1], #8 \n" // load 8 sobely.
"subs %3, %3, #8 \n" // 8 processed per loop. "subs %w3, %w3, #8 \n" // 8 processed per loop.
"uqadd v0.8b, v0.8b, v1.8b \n" // add "uqadd v0.8b, v0.8b, v1.8b \n" // add
"orr v1.8b, v0.8b, v0.8b \n" "orr v1.8b, v0.8b, v0.8b \n"
"orr v2.8b, v0.8b, v0.8b \n" "orr v2.8b, v0.8b, v0.8b \n"
...@@ -2963,7 +2963,7 @@ void SobelToPlaneRow_NEON(const uint8* src_sobelx, const uint8* src_sobely, ...@@ -2963,7 +2963,7 @@ void SobelToPlaneRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
"ld1 {v0.16b}, [%0], #16 \n" // load 16 sobelx. "ld1 {v0.16b}, [%0], #16 \n" // load 16 sobelx.
MEMACCESS(1) MEMACCESS(1)
"ld1 {v1.16b}, [%1], #16 \n" // load 16 sobely. "ld1 {v1.16b}, [%1], #16 \n" // load 16 sobely.
"subs %3, %3, #16 \n" // 16 processed per loop. "subs %w3, %w3, #16 \n" // 16 processed per loop.
"uqadd v0.16b, v0.16b, v1.16b \n" // add "uqadd v0.16b, v0.16b, v1.16b \n" // add
MEMACCESS(2) MEMACCESS(2)
"st1 {v0.16b}, [%2], #16 \n" // store 16 pixels. "st1 {v0.16b}, [%2], #16 \n" // store 16 pixels.
...@@ -2994,7 +2994,7 @@ void SobelXYRow_NEON(const uint8* src_sobelx, const uint8* src_sobely, ...@@ -2994,7 +2994,7 @@ void SobelXYRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
"ld1 {v2.8b}, [%0], #8 \n" // load 8 sobelx. "ld1 {v2.8b}, [%0], #8 \n" // load 8 sobelx.
MEMACCESS(1) MEMACCESS(1)
"ld1 {v0.8b}, [%1], #8 \n" // load 8 sobely. "ld1 {v0.8b}, [%1], #8 \n" // load 8 sobely.
"subs %3, %3, #8 \n" // 8 processed per loop. "subs %w3, %w3, #8 \n" // 8 processed per loop.
"uqadd v1.8b, v0.8b, v2.8b \n" // add "uqadd v1.8b, v0.8b, v2.8b \n" // add
MEMACCESS(2) MEMACCESS(2)
"st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB pixels "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB pixels
...@@ -3034,7 +3034,7 @@ void SobelXRow_NEON(const uint8* src_y0, const uint8* src_y1, ...@@ -3034,7 +3034,7 @@ void SobelXRow_NEON(const uint8* src_y0, const uint8* src_y1,
"ld1 {v2.8b}, [%2],%5 \n" // bottom "ld1 {v2.8b}, [%2],%5 \n" // bottom
MEMACCESS(2) MEMACCESS(2)
"ld1 {v3.8b}, [%2],%6 \n" "ld1 {v3.8b}, [%2],%6 \n"
"subs %4, %4, #8 \n" // 8 pixels "subs %w4, %w4, #8 \n" // 8 pixels
"usubl v1.8h, v2.8b, v3.8b \n" "usubl v1.8h, v2.8b, v3.8b \n"
"add v0.8h, v0.8h, v1.8h \n" "add v0.8h, v0.8h, v1.8h \n"
"abs v0.8h, v0.8h \n" "abs v0.8h, v0.8h \n"
...@@ -3079,7 +3079,7 @@ void SobelYRow_NEON(const uint8* src_y0, const uint8* src_y1, ...@@ -3079,7 +3079,7 @@ void SobelYRow_NEON(const uint8* src_y0, const uint8* src_y1,
"ld1 {v2.8b}, [%0],%5 \n" // right "ld1 {v2.8b}, [%0],%5 \n" // right
MEMACCESS(1) MEMACCESS(1)
"ld1 {v3.8b}, [%1],%5 \n" "ld1 {v3.8b}, [%1],%5 \n"
"subs %3, %3, #8 \n" // 8 pixels "subs %w3, %w3, #8 \n" // 8 pixels
"usubl v1.8h, v2.8b, v3.8b \n" "usubl v1.8h, v2.8b, v3.8b \n"
"add v0.8h, v0.8h, v1.8h \n" "add v0.8h, v0.8h, v1.8h \n"
"abs v0.8h, v0.8h \n" "abs v0.8h, v0.8h \n"
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment