Commit a62a97f1 authored by fbarchard@google.com

Change conditional-branch notation to the clang-compatible "b.<cc>" form (e.g. bgt -> b.gt, bge -> b.ge, beq -> b.eq, blt -> b.lt)

BUG=357
TESTED=local ios a64 build
R=yunqingwang@google.com

Review URL: https://webrtc-codereview.appspot.com/25549004

git-svn-id: http://libyuv.googlecode.com/svn/trunk@1084 16f28f9a-4ce2-e073-06de-1de4eb20be90
parent 8cbfc5d4
...@@ -80,7 +80,7 @@ uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count) { ...@@ -80,7 +80,7 @@ uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count) {
"smlal v17.4s, v3.4h, v3.4h \n" "smlal v17.4s, v3.4h, v3.4h \n"
"smlal2 v18.4s, v2.8h, v2.8h \n" "smlal2 v18.4s, v2.8h, v2.8h \n"
"smlal2 v19.4s, v3.8h, v3.8h \n" "smlal2 v19.4s, v3.8h, v3.8h \n"
"bgt 1b \n" "b.gt 1b \n"
"add v16.4s, v16.4s, v17.4s \n" "add v16.4s, v16.4s, v17.4s \n"
"add v18.4s, v18.4s, v19.4s \n" "add v18.4s, v18.4s, v19.4s \n"
......
...@@ -104,19 +104,19 @@ void TransposeWx8_NEON(const uint8* src, int src_stride, ...@@ -104,19 +104,19 @@ void TransposeWx8_NEON(const uint8* src, int src_stride,
"add %1, %1, #8 \n" // src += 8 "add %1, %1, #8 \n" // src += 8
"add %2, %2, %6, lsl #3 \n" // dst += 8 * dst_stride "add %2, %2, %6, lsl #3 \n" // dst += 8 * dst_stride
"subs %3, %3, #8 \n" // w -= 8 "subs %3, %3, #8 \n" // w -= 8
"bge 1b \n" "b.ge 1b \n"
// add 8 back to counter. if the result is 0 there are // add 8 back to counter. if the result is 0 there are
// no residuals. // no residuals.
"adds %3, %3, #8 \n" "adds %3, %3, #8 \n"
"beq 4f \n" "b.eq 4f \n"
// some residual, so between 1 and 7 lines left to transpose // some residual, so between 1 and 7 lines left to transpose
"cmp %3, #2 \n" "cmp %3, #2 \n"
"blt 3f \n" "b.lt 3f \n"
"cmp %3, #4 \n" "cmp %3, #4 \n"
"blt 2f \n" "b.lt 2f \n"
// 4x8 block // 4x8 block
"mov %0, %1 \n" "mov %0, %1 \n"
...@@ -169,12 +169,12 @@ void TransposeWx8_NEON(const uint8* src, int src_stride, ...@@ -169,12 +169,12 @@ void TransposeWx8_NEON(const uint8* src, int src_stride,
"add %1, %1, #4 \n" // src += 4 "add %1, %1, #4 \n" // src += 4
"add %2, %2, %6, lsl #2 \n" // dst += 4 * dst_stride "add %2, %2, %6, lsl #2 \n" // dst += 4 * dst_stride
"subs %3, %3, #4 \n" // w -= 4 "subs %3, %3, #4 \n" // w -= 4
"beq 4f \n" "b.eq 4f \n"
// some residual, check to see if it includes a 2x8 block, // some residual, check to see if it includes a 2x8 block,
// or less // or less
"cmp %3, #2 \n" "cmp %3, #2 \n"
"blt 3f \n" "b.lt 3f \n"
// 2x8 block // 2x8 block
"2: \n" "2: \n"
...@@ -209,7 +209,7 @@ void TransposeWx8_NEON(const uint8* src, int src_stride, ...@@ -209,7 +209,7 @@ void TransposeWx8_NEON(const uint8* src, int src_stride,
"add %1, %1, #2 \n" // src += 2 "add %1, %1, #2 \n" // src += 2
"add %2, %2, %6, lsl #1 \n" // dst += 2 * dst_stride "add %2, %2, %6, lsl #1 \n" // dst += 2 * dst_stride
"subs %3, %3, #2 \n" // w -= 2 "subs %3, %3, #2 \n" // w -= 2
"beq 4f \n" "b.eq 4f \n"
// 1x8 block // 1x8 block
"3: \n" "3: \n"
...@@ -352,19 +352,19 @@ void TransposeUVWx8_NEON(const uint8* src, int src_stride, ...@@ -352,19 +352,19 @@ void TransposeUVWx8_NEON(const uint8* src, int src_stride,
"add %2, %2, %6, lsl #3 \n" // dst_a += 8 * dst_stride_a "add %2, %2, %6, lsl #3 \n" // dst_a += 8 * dst_stride_a
"add %3, %3, %7, lsl #3 \n" // dst_b += 8 * dst_stride_b "add %3, %3, %7, lsl #3 \n" // dst_b += 8 * dst_stride_b
"subs %4, %4, #8 \n" // w -= 8 "subs %4, %4, #8 \n" // w -= 8
"bge 1b \n" "b.ge 1b \n"
// add 8 back to counter. if the result is 0 there are // add 8 back to counter. if the result is 0 there are
// no residuals. // no residuals.
"adds %4, %4, #8 \n" "adds %4, %4, #8 \n"
"beq 4f \n" "b.eq 4f \n"
// some residual, so between 1 and 7 lines left to transpose // some residual, so between 1 and 7 lines left to transpose
"cmp %4, #2 \n" "cmp %4, #2 \n"
"blt 3f \n" "b.lt 3f \n"
"cmp %4, #4 \n" "cmp %4, #4 \n"
"blt 2f \n" "b.lt 2f \n"
// TODO(frkoenig): Clean this up // TODO(frkoenig): Clean this up
// 4x8 block // 4x8 block
...@@ -441,12 +441,12 @@ void TransposeUVWx8_NEON(const uint8* src, int src_stride, ...@@ -441,12 +441,12 @@ void TransposeUVWx8_NEON(const uint8* src, int src_stride,
"add %2, %2, %6, lsl #2 \n" // dst_a += 4 * dst_stride_a "add %2, %2, %6, lsl #2 \n" // dst_a += 4 * dst_stride_a
"add %3, %3, %7, lsl #2 \n" // dst_b += 4 * dst_stride_b "add %3, %3, %7, lsl #2 \n" // dst_b += 4 * dst_stride_b
"subs %4, %4, #4 \n" // w -= 4 "subs %4, %4, #4 \n" // w -= 4
"beq 4f \n" "b.eq 4f \n"
// some residual, check to see if it includes a 2x8 block, // some residual, check to see if it includes a 2x8 block,
// or less // or less
"cmp %4, #2 \n" "cmp %4, #2 \n"
"blt 3f \n" "b.lt 3f \n"
// 2x8 block // 2x8 block
"2: \n" "2: \n"
...@@ -491,7 +491,7 @@ void TransposeUVWx8_NEON(const uint8* src, int src_stride, ...@@ -491,7 +491,7 @@ void TransposeUVWx8_NEON(const uint8* src, int src_stride,
"add %2, %2, %6, lsl #1 \n" // dst_a += 2 * dst_stride_a "add %2, %2, %6, lsl #1 \n" // dst_a += 2 * dst_stride_a
"add %3, %3, %7, lsl #1 \n" // dst_b += 2 * dst_stride_b "add %3, %3, %7, lsl #1 \n" // dst_b += 2 * dst_stride_b
"subs %4, %4, #2 \n" // w -= 2 "subs %4, %4, #2 \n" // w -= 2
"beq 4f \n" "b.eq 4f \n"
// 1x8 block // 1x8 block
"3: \n" "3: \n"
......
...@@ -146,7 +146,7 @@ void I444ToARGBRow_NEON(const uint8* src_y, ...@@ -146,7 +146,7 @@ void I444ToARGBRow_NEON(const uint8* src_y,
"vmov.u8 d23, #255 \n" "vmov.u8 d23, #255 \n"
MEMACCESS(3) MEMACCESS(3)
"vst4.8 {d20, d21, d22, d23}, [%3]! \n" "vst4.8 {d20, d21, d22, d23}, [%3]! \n"
"bgt 1b \n" "b.gt 1b \n"
: "+r"(src_y), // %0 : "+r"(src_y), // %0
"+r"(src_u), // %1 "+r"(src_u), // %1
"+r"(src_v), // %2 "+r"(src_v), // %2
...@@ -182,7 +182,7 @@ void I422ToARGBRow_NEON(const uint8* src_y, ...@@ -182,7 +182,7 @@ void I422ToARGBRow_NEON(const uint8* src_y,
"vmov.u8 d23, #255 \n" "vmov.u8 d23, #255 \n"
MEMACCESS(3) MEMACCESS(3)
"vst4.8 {d20, d21, d22, d23}, [%3]! \n" "vst4.8 {d20, d21, d22, d23}, [%3]! \n"
"bgt 1b \n" "b.gt 1b \n"
: "+r"(src_y), // %0 : "+r"(src_y), // %0
"+r"(src_u), // %1 "+r"(src_u), // %1
"+r"(src_v), // %2 "+r"(src_v), // %2
...@@ -218,7 +218,7 @@ void I411ToARGBRow_NEON(const uint8* src_y, ...@@ -218,7 +218,7 @@ void I411ToARGBRow_NEON(const uint8* src_y,
"vmov.u8 d23, #255 \n" "vmov.u8 d23, #255 \n"
MEMACCESS(3) MEMACCESS(3)
"vst4.8 {d20, d21, d22, d23}, [%3]! \n" "vst4.8 {d20, d21, d22, d23}, [%3]! \n"
"bgt 1b \n" "b.gt 1b \n"
: "+r"(src_y), // %0 : "+r"(src_y), // %0
"+r"(src_u), // %1 "+r"(src_u), // %1
"+r"(src_v), // %2 "+r"(src_v), // %2
...@@ -255,7 +255,7 @@ void I422ToBGRARow_NEON(const uint8* src_y, ...@@ -255,7 +255,7 @@ void I422ToBGRARow_NEON(const uint8* src_y,
"vmov.u8 d19, #255 \n" "vmov.u8 d19, #255 \n"
MEMACCESS(3) MEMACCESS(3)
"vst4.8 {d19, d20, d21, d22}, [%3]! \n" "vst4.8 {d19, d20, d21, d22}, [%3]! \n"
"bgt 1b \n" "b.gt 1b \n"
: "+r"(src_y), // %0 : "+r"(src_y), // %0
"+r"(src_u), // %1 "+r"(src_u), // %1
"+r"(src_v), // %2 "+r"(src_v), // %2
...@@ -292,7 +292,7 @@ void I422ToABGRRow_NEON(const uint8* src_y, ...@@ -292,7 +292,7 @@ void I422ToABGRRow_NEON(const uint8* src_y,
"vmov.u8 d23, #255 \n" "vmov.u8 d23, #255 \n"
MEMACCESS(3) MEMACCESS(3)
"vst4.8 {d20, d21, d22, d23}, [%3]! \n" "vst4.8 {d20, d21, d22, d23}, [%3]! \n"
"bgt 1b \n" "b.gt 1b \n"
: "+r"(src_y), // %0 : "+r"(src_y), // %0
"+r"(src_u), // %1 "+r"(src_u), // %1
"+r"(src_v), // %2 "+r"(src_v), // %2
...@@ -328,7 +328,7 @@ void I422ToRGBARow_NEON(const uint8* src_y, ...@@ -328,7 +328,7 @@ void I422ToRGBARow_NEON(const uint8* src_y,
"vmov.u8 d19, #255 \n" "vmov.u8 d19, #255 \n"
MEMACCESS(3) MEMACCESS(3)
"vst4.8 {d19, d20, d21, d22}, [%3]! \n" "vst4.8 {d19, d20, d21, d22}, [%3]! \n"
"bgt 1b \n" "b.gt 1b \n"
: "+r"(src_y), // %0 : "+r"(src_y), // %0
"+r"(src_u), // %1 "+r"(src_u), // %1
"+r"(src_v), // %2 "+r"(src_v), // %2
...@@ -363,7 +363,7 @@ void I422ToRGB24Row_NEON(const uint8* src_y, ...@@ -363,7 +363,7 @@ void I422ToRGB24Row_NEON(const uint8* src_y,
"subs %4, %4, #8 \n" "subs %4, %4, #8 \n"
MEMACCESS(3) MEMACCESS(3)
"vst3.8 {d20, d21, d22}, [%3]! \n" "vst3.8 {d20, d21, d22}, [%3]! \n"
"bgt 1b \n" "b.gt 1b \n"
: "+r"(src_y), // %0 : "+r"(src_y), // %0
"+r"(src_u), // %1 "+r"(src_u), // %1
"+r"(src_v), // %2 "+r"(src_v), // %2
...@@ -399,7 +399,7 @@ void I422ToRAWRow_NEON(const uint8* src_y, ...@@ -399,7 +399,7 @@ void I422ToRAWRow_NEON(const uint8* src_y,
"vswp.u8 d20, d22 \n" "vswp.u8 d20, d22 \n"
MEMACCESS(3) MEMACCESS(3)
"vst3.8 {d20, d21, d22}, [%3]! \n" "vst3.8 {d20, d21, d22}, [%3]! \n"
"bgt 1b \n" "b.gt 1b \n"
: "+r"(src_y), // %0 : "+r"(src_y), // %0
"+r"(src_u), // %1 "+r"(src_u), // %1
"+r"(src_v), // %2 "+r"(src_v), // %2
...@@ -447,7 +447,7 @@ void I422ToRGB565Row_NEON(const uint8* src_y, ...@@ -447,7 +447,7 @@ void I422ToRGB565Row_NEON(const uint8* src_y,
ARGBTORGB565 ARGBTORGB565
MEMACCESS(3) MEMACCESS(3)
"vst1.8 {q0}, [%3]! \n" // store 8 pixels RGB565. "vst1.8 {q0}, [%3]! \n" // store 8 pixels RGB565.
"bgt 1b \n" "b.gt 1b \n"
: "+r"(src_y), // %0 : "+r"(src_y), // %0
"+r"(src_u), // %1 "+r"(src_u), // %1
"+r"(src_v), // %2 "+r"(src_v), // %2
...@@ -499,7 +499,7 @@ void I422ToARGB1555Row_NEON(const uint8* src_y, ...@@ -499,7 +499,7 @@ void I422ToARGB1555Row_NEON(const uint8* src_y,
ARGBTOARGB1555 ARGBTOARGB1555
MEMACCESS(3) MEMACCESS(3)
"vst1.8 {q0}, [%3]! \n" // store 8 pixels ARGB1555. "vst1.8 {q0}, [%3]! \n" // store 8 pixels ARGB1555.
"bgt 1b \n" "b.gt 1b \n"
: "+r"(src_y), // %0 : "+r"(src_y), // %0
"+r"(src_u), // %1 "+r"(src_u), // %1
"+r"(src_v), // %2 "+r"(src_v), // %2
...@@ -546,7 +546,7 @@ void I422ToARGB4444Row_NEON(const uint8* src_y, ...@@ -546,7 +546,7 @@ void I422ToARGB4444Row_NEON(const uint8* src_y,
ARGBTOARGB4444 ARGBTOARGB4444
MEMACCESS(3) MEMACCESS(3)
"vst1.8 {q0}, [%3]! \n" // store 8 pixels ARGB4444. "vst1.8 {q0}, [%3]! \n" // store 8 pixels ARGB4444.
"bgt 1b \n" "b.gt 1b \n"
: "+r"(src_y), // %0 : "+r"(src_y), // %0
"+r"(src_u), // %1 "+r"(src_u), // %1
"+r"(src_v), // %2 "+r"(src_v), // %2
...@@ -580,7 +580,7 @@ void YToARGBRow_NEON(const uint8* src_y, ...@@ -580,7 +580,7 @@ void YToARGBRow_NEON(const uint8* src_y,
"vmov.u8 d23, #255 \n" "vmov.u8 d23, #255 \n"
MEMACCESS(1) MEMACCESS(1)
"vst4.8 {d20, d21, d22, d23}, [%1]! \n" "vst4.8 {d20, d21, d22, d23}, [%1]! \n"
"bgt 1b \n" "b.gt 1b \n"
: "+r"(src_y), // %0 : "+r"(src_y), // %0
"+r"(dst_argb), // %1 "+r"(dst_argb), // %1
"+r"(width) // %2 "+r"(width) // %2
...@@ -607,7 +607,7 @@ void I400ToARGBRow_NEON(const uint8* src_y, ...@@ -607,7 +607,7 @@ void I400ToARGBRow_NEON(const uint8* src_y,
"subs %2, %2, #8 \n" "subs %2, %2, #8 \n"
MEMACCESS(1) MEMACCESS(1)
"vst4.8 {d20, d21, d22, d23}, [%1]! \n" "vst4.8 {d20, d21, d22, d23}, [%1]! \n"
"bgt 1b \n" "b.gt 1b \n"
: "+r"(src_y), // %0 : "+r"(src_y), // %0
"+r"(dst_argb), // %1 "+r"(dst_argb), // %1
"+r"(width) // %2 "+r"(width) // %2
...@@ -638,7 +638,7 @@ void NV12ToARGBRow_NEON(const uint8* src_y, ...@@ -638,7 +638,7 @@ void NV12ToARGBRow_NEON(const uint8* src_y,
"vmov.u8 d23, #255 \n" "vmov.u8 d23, #255 \n"
MEMACCESS(2) MEMACCESS(2)
"vst4.8 {d20, d21, d22, d23}, [%2]! \n" "vst4.8 {d20, d21, d22, d23}, [%2]! \n"
"bgt 1b \n" "b.gt 1b \n"
: "+r"(src_y), // %0 : "+r"(src_y), // %0
"+r"(src_uv), // %1 "+r"(src_uv), // %1
"+r"(dst_argb), // %2 "+r"(dst_argb), // %2
...@@ -672,7 +672,7 @@ void NV21ToARGBRow_NEON(const uint8* src_y, ...@@ -672,7 +672,7 @@ void NV21ToARGBRow_NEON(const uint8* src_y,
"vmov.u8 d23, #255 \n" "vmov.u8 d23, #255 \n"
MEMACCESS(2) MEMACCESS(2)
"vst4.8 {d20, d21, d22, d23}, [%2]! \n" "vst4.8 {d20, d21, d22, d23}, [%2]! \n"
"bgt 1b \n" "b.gt 1b \n"
: "+r"(src_y), // %0 : "+r"(src_y), // %0
"+r"(src_uv), // %1 "+r"(src_uv), // %1
"+r"(dst_argb), // %2 "+r"(dst_argb), // %2
...@@ -706,7 +706,7 @@ void NV12ToRGB565Row_NEON(const uint8* src_y, ...@@ -706,7 +706,7 @@ void NV12ToRGB565Row_NEON(const uint8* src_y,
ARGBTORGB565 ARGBTORGB565
MEMACCESS(2) MEMACCESS(2)
"vst1.8 {q0}, [%2]! \n" // store 8 pixels RGB565. "vst1.8 {q0}, [%2]! \n" // store 8 pixels RGB565.
"bgt 1b \n" "b.gt 1b \n"
: "+r"(src_y), // %0 : "+r"(src_y), // %0
"+r"(src_uv), // %1 "+r"(src_uv), // %1
"+r"(dst_rgb565), // %2 "+r"(dst_rgb565), // %2
...@@ -740,7 +740,7 @@ void NV21ToRGB565Row_NEON(const uint8* src_y, ...@@ -740,7 +740,7 @@ void NV21ToRGB565Row_NEON(const uint8* src_y,
ARGBTORGB565 ARGBTORGB565
MEMACCESS(2) MEMACCESS(2)
"vst1.8 {q0}, [%2]! \n" // store 8 pixels RGB565. "vst1.8 {q0}, [%2]! \n" // store 8 pixels RGB565.
"bgt 1b \n" "b.gt 1b \n"
: "+r"(src_y), // %0 : "+r"(src_y), // %0
"+r"(src_uv), // %1 "+r"(src_uv), // %1
"+r"(dst_rgb565), // %2 "+r"(dst_rgb565), // %2
...@@ -773,7 +773,7 @@ void YUY2ToARGBRow_NEON(const uint8* src_yuy2, ...@@ -773,7 +773,7 @@ void YUY2ToARGBRow_NEON(const uint8* src_yuy2,
"vmov.u8 d23, #255 \n" "vmov.u8 d23, #255 \n"
MEMACCESS(1) MEMACCESS(1)
"vst4.8 {d20, d21, d22, d23}, [%1]! \n" "vst4.8 {d20, d21, d22, d23}, [%1]! \n"
"bgt 1b \n" "b.gt 1b \n"
: "+r"(src_yuy2), // %0 : "+r"(src_yuy2), // %0
"+r"(dst_argb), // %1 "+r"(dst_argb), // %1
"+r"(width) // %2 "+r"(width) // %2
...@@ -805,7 +805,7 @@ void UYVYToARGBRow_NEON(const uint8* src_uyvy, ...@@ -805,7 +805,7 @@ void UYVYToARGBRow_NEON(const uint8* src_uyvy,
"vmov.u8 d23, #255 \n" "vmov.u8 d23, #255 \n"
MEMACCESS(1) MEMACCESS(1)
"vst4.8 {d20, d21, d22, d23}, [%1]! \n" "vst4.8 {d20, d21, d22, d23}, [%1]! \n"
"bgt 1b \n" "b.gt 1b \n"
: "+r"(src_uyvy), // %0 : "+r"(src_uyvy), // %0
"+r"(dst_argb), // %1 "+r"(dst_argb), // %1
"+r"(width) // %2 "+r"(width) // %2
...@@ -831,7 +831,7 @@ void SplitUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v, ...@@ -831,7 +831,7 @@ void SplitUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
"st1 {v0.16b}, [%1], #16 \n" // store U "st1 {v0.16b}, [%1], #16 \n" // store U
MEMACCESS(2) MEMACCESS(2)
"st1 {v1.16b}, [%2], #16 \n" // store V "st1 {v1.16b}, [%2], #16 \n" // store V
"bgt 1b \n" "b.gt 1b \n"
: "+r"(src_uv), // %0 : "+r"(src_uv), // %0
"+r"(dst_u), // %1 "+r"(dst_u), // %1
"+r"(dst_v), // %2 "+r"(dst_v), // %2
...@@ -856,7 +856,7 @@ void MergeUVRow_NEON(const uint8* src_u, const uint8* src_v, uint8* dst_uv, ...@@ -856,7 +856,7 @@ void MergeUVRow_NEON(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
"subs %3, %3, #16 \n" // 16 processed per loop "subs %3, %3, #16 \n" // 16 processed per loop
MEMACCESS(2) MEMACCESS(2)
"st2 {v0.16b, v1.16b}, [%2], #32 \n" // store 16 pairs of UV "st2 {v0.16b, v1.16b}, [%2], #32 \n" // store 16 pairs of UV
"bgt 1b \n" "b.gt 1b \n"
: :
"+r"(src_u), // %0 "+r"(src_u), // %0
"+r"(src_v), // %1 "+r"(src_v), // %1
...@@ -879,7 +879,7 @@ void CopyRow_NEON(const uint8* src, uint8* dst, int count) { ...@@ -879,7 +879,7 @@ void CopyRow_NEON(const uint8* src, uint8* dst, int count) {
"subs %2, %2, #32 \n" // 32 processed per loop "subs %2, %2, #32 \n" // 32 processed per loop
MEMACCESS(1) MEMACCESS(1)
"st1 {v0.8b-v3.8b}, [%1], #32 \n" // store 32 "st1 {v0.8b-v3.8b}, [%1], #32 \n" // store 32
"bgt 1b \n" "b.gt 1b \n"
: "+r"(src), // %0 : "+r"(src), // %0
"+r"(dst), // %1 "+r"(dst), // %1
"+r"(count) // %2 // Output registers "+r"(count) // %2 // Output registers
...@@ -898,7 +898,7 @@ void SetRow_NEON(uint8* dst, uint32 v32, int count) { ...@@ -898,7 +898,7 @@ void SetRow_NEON(uint8* dst, uint32 v32, int count) {
"subs %1, %1, #16 \n" // 16 bytes per loop "subs %1, %1, #16 \n" // 16 bytes per loop
MEMACCESS(0) MEMACCESS(0)
"st1 {v0.16b}, [%0], #16 \n" // store "st1 {v0.16b}, [%0], #16 \n" // store
"bgt 1b \n" "b.gt 1b \n"
: "+r"(dst), // %0 : "+r"(dst), // %0
"+r"(count) // %1 "+r"(count) // %1
: "r"(v32) // %2 : "r"(v32) // %2
...@@ -936,7 +936,7 @@ void MirrorRow_NEON(const uint8* src, uint8* dst, int width) { ...@@ -936,7 +936,7 @@ void MirrorRow_NEON(const uint8* src, uint8* dst, int width) {
"st1 {v0.D}[1], [%1], #8 \n" // dst += 16 "st1 {v0.D}[1], [%1], #8 \n" // dst += 16
MEMACCESS(1) MEMACCESS(1)
"st1 {v0.D}[0], [%1], #8 \n" "st1 {v0.D}[0], [%1], #8 \n"
"bgt 1b \n" "b.gt 1b \n"
: "+r"(src), // %0 : "+r"(src), // %0
"+r"(dst), // %1 "+r"(dst), // %1
"+r"(width) // %2 "+r"(width) // %2
...@@ -965,7 +965,7 @@ void MirrorUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v, ...@@ -965,7 +965,7 @@ void MirrorUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
"st1 {v0.8b}, [%1], #8 \n" // dst += 8 "st1 {v0.8b}, [%1], #8 \n" // dst += 8
MEMACCESS(2) MEMACCESS(2)
"st1 {v1.8b}, [%2], #8 \n" "st1 {v1.8b}, [%2], #8 \n"
"bgt 1b \n" "b.gt 1b \n"
: "+r"(src_uv), // %0 : "+r"(src_uv), // %0
"+r"(dst_u), // %1 "+r"(dst_u), // %1
"+r"(dst_v), // %2 "+r"(dst_v), // %2
...@@ -993,7 +993,7 @@ void ARGBMirrorRow_NEON(const uint8* src, uint8* dst, int width) { ...@@ -993,7 +993,7 @@ void ARGBMirrorRow_NEON(const uint8* src, uint8* dst, int width) {
"st1 {v0.D}[1], [%1], #8 \n" // dst += 16 "st1 {v0.D}[1], [%1], #8 \n" // dst += 16
MEMACCESS(1) MEMACCESS(1)
"st1 {v0.D}[0], [%1], #8 \n" "st1 {v0.D}[0], [%1], #8 \n"
"bgt 1b \n" "b.gt 1b \n"
: "+r"(src), // %0 : "+r"(src), // %0
"+r"(dst), // %1 "+r"(dst), // %1
"+r"(width) // %2 "+r"(width) // %2
...@@ -1014,7 +1014,7 @@ void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int pix) { ...@@ -1014,7 +1014,7 @@ void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int pix) {
"subs %2, %2, #8 \n" // 8 processed per loop. "subs %2, %2, #8 \n" // 8 processed per loop.
MEMACCESS(1) MEMACCESS(1)
"st4 {v1.8b-v4.8b}, [%1], #32 \n" // store 8 pixels of ARGB. "st4 {v1.8b-v4.8b}, [%1], #32 \n" // store 8 pixels of ARGB.
"bgt 1b \n" "b.gt 1b \n"
: "+r"(src_rgb24), // %0 : "+r"(src_rgb24), // %0
"+r"(dst_argb), // %1 "+r"(dst_argb), // %1
"+r"(pix) // %2 "+r"(pix) // %2
...@@ -1037,7 +1037,7 @@ void RAWToARGBRow_NEON(const uint8* src_raw, uint8* dst_argb, int pix) { ...@@ -1037,7 +1037,7 @@ void RAWToARGBRow_NEON(const uint8* src_raw, uint8* dst_argb, int pix) {
"mov v4.8b, v0.8b \n" // move r "mov v4.8b, v0.8b \n" // move r
MEMACCESS(1) MEMACCESS(1)
"st4 {v2.8b-v5.8b}, [%1], #32 \n" // store b g r a "st4 {v2.8b-v5.8b}, [%1], #32 \n" // store b g r a
"bgt 1b \n" "b.gt 1b \n"
: "+r"(src_raw), // %0 : "+r"(src_raw), // %0
"+r"(dst_argb), // %1 "+r"(dst_argb), // %1
"+r"(pix) // %2 "+r"(pix) // %2
...@@ -1071,7 +1071,7 @@ void RGB565ToARGBRow_NEON(const uint8* src_rgb565, uint8* dst_argb, int pix) { ...@@ -1071,7 +1071,7 @@ void RGB565ToARGBRow_NEON(const uint8* src_rgb565, uint8* dst_argb, int pix) {
RGB565TOARGB RGB565TOARGB
MEMACCESS(1) MEMACCESS(1)
"vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB. "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB.
"bgt 1b \n" "b.gt 1b \n"
: "+r"(src_rgb565), // %0 : "+r"(src_rgb565), // %0
"+r"(dst_argb), // %1 "+r"(dst_argb), // %1
"+r"(pix) // %2 "+r"(pix) // %2
...@@ -1121,7 +1121,7 @@ void ARGB1555ToARGBRow_NEON(const uint8* src_argb1555, uint8* dst_argb, ...@@ -1121,7 +1121,7 @@ void ARGB1555ToARGBRow_NEON(const uint8* src_argb1555, uint8* dst_argb,
ARGB1555TOARGB ARGB1555TOARGB
MEMACCESS(1) MEMACCESS(1)
"vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB. "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB.
"bgt 1b \n" "b.gt 1b \n"
: "+r"(src_argb1555), // %0 : "+r"(src_argb1555), // %0
"+r"(dst_argb), // %1 "+r"(dst_argb), // %1
"+r"(pix) // %2 "+r"(pix) // %2
...@@ -1154,7 +1154,7 @@ void ARGB4444ToARGBRow_NEON(const uint8* src_argb4444, uint8* dst_argb, ...@@ -1154,7 +1154,7 @@ void ARGB4444ToARGBRow_NEON(const uint8* src_argb4444, uint8* dst_argb,
ARGB4444TOARGB ARGB4444TOARGB
MEMACCESS(1) MEMACCESS(1)
"vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB. "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB.
"bgt 1b \n" "b.gt 1b \n"
: "+r"(src_argb4444), // %0 : "+r"(src_argb4444), // %0
"+r"(dst_argb), // %1 "+r"(dst_argb), // %1
"+r"(pix) // %2 "+r"(pix) // %2
...@@ -1174,7 +1174,7 @@ void ARGBToRGB24Row_NEON(const uint8* src_argb, uint8* dst_rgb24, int pix) { ...@@ -1174,7 +1174,7 @@ void ARGBToRGB24Row_NEON(const uint8* src_argb, uint8* dst_rgb24, int pix) {
"subs %2, %2, #8 \n" // 8 processed per loop. "subs %2, %2, #8 \n" // 8 processed per loop.
MEMACCESS(1) MEMACCESS(1)
"st3 {v1.8b-v3.8b}, [%1], #24 \n" // store 8 pixels of RGB24. "st3 {v1.8b-v3.8b}, [%1], #24 \n" // store 8 pixels of RGB24.
"bgt 1b \n" "b.gt 1b \n"
: "+r"(src_argb), // %0 : "+r"(src_argb), // %0
"+r"(dst_rgb24), // %1 "+r"(dst_rgb24), // %1
"+r"(pix) // %2 "+r"(pix) // %2
...@@ -1196,7 +1196,7 @@ void ARGBToRAWRow_NEON(const uint8* src_argb, uint8* dst_raw, int pix) { ...@@ -1196,7 +1196,7 @@ void ARGBToRAWRow_NEON(const uint8* src_argb, uint8* dst_raw, int pix) {
"mov v5.8b, v1.8b \n" // mov b "mov v5.8b, v1.8b \n" // mov b
MEMACCESS(1) MEMACCESS(1)
"st3 {v3.8b-v5.8b}, [%1], #24 \n" // store r g b "st3 {v3.8b-v5.8b}, [%1], #24 \n" // store r g b
"bgt 1b \n" "b.gt 1b \n"
: "+r"(src_argb), // %0 : "+r"(src_argb), // %0
"+r"(dst_raw), // %1 "+r"(dst_raw), // %1
"+r"(pix) // %2 "+r"(pix) // %2
...@@ -1216,7 +1216,7 @@ void YUY2ToYRow_NEON(const uint8* src_yuy2, uint8* dst_y, int pix) { ...@@ -1216,7 +1216,7 @@ void YUY2ToYRow_NEON(const uint8* src_yuy2, uint8* dst_y, int pix) {
"subs %2, %2, #16 \n" // 16 processed per loop. "subs %2, %2, #16 \n" // 16 processed per loop.
MEMACCESS(1) MEMACCESS(1)
"st1 {v0.16b}, [%1], #16 \n" // store 16 pixels of Y. "st1 {v0.16b}, [%1], #16 \n" // store 16 pixels of Y.
"bgt 1b \n" "b.gt 1b \n"
: "+r"(src_yuy2), // %0 : "+r"(src_yuy2), // %0
"+r"(dst_y), // %1 "+r"(dst_y), // %1
"+r"(pix) // %2 "+r"(pix) // %2
...@@ -1236,7 +1236,7 @@ void UYVYToYRow_NEON(const uint8* src_uyvy, uint8* dst_y, int pix) { ...@@ -1236,7 +1236,7 @@ void UYVYToYRow_NEON(const uint8* src_uyvy, uint8* dst_y, int pix) {
"subs %2, %2, #16 \n" // 16 processed per loop. "subs %2, %2, #16 \n" // 16 processed per loop.
MEMACCESS(1) MEMACCESS(1)
"st1 {v1.16b}, [%1], #16 \n" // store 16 pixels of Y. "st1 {v1.16b}, [%1], #16 \n" // store 16 pixels of Y.
"bgt 1b \n" "b.gt 1b \n"
: "+r"(src_uyvy), // %0 : "+r"(src_uyvy), // %0
"+r"(dst_y), // %1 "+r"(dst_y), // %1
"+r"(pix) // %2 "+r"(pix) // %2
...@@ -1259,7 +1259,7 @@ void YUY2ToUV422Row_NEON(const uint8* src_yuy2, uint8* dst_u, uint8* dst_v, ...@@ -1259,7 +1259,7 @@ void YUY2ToUV422Row_NEON(const uint8* src_yuy2, uint8* dst_u, uint8* dst_v,
"st1 {v1.8b}, [%1], #8 \n" // store 8 U. "st1 {v1.8b}, [%1], #8 \n" // store 8 U.
MEMACCESS(2) MEMACCESS(2)
"st1 {v3.8b}, [%2], #8 \n" // store 8 V. "st1 {v3.8b}, [%2], #8 \n" // store 8 V.
"bgt 1b \n" "b.gt 1b \n"
: "+r"(src_yuy2), // %0 : "+r"(src_yuy2), // %0
"+r"(dst_u), // %1 "+r"(dst_u), // %1
"+r"(dst_v), // %2 "+r"(dst_v), // %2
...@@ -1283,7 +1283,7 @@ void UYVYToUV422Row_NEON(const uint8* src_uyvy, uint8* dst_u, uint8* dst_v, ...@@ -1283,7 +1283,7 @@ void UYVYToUV422Row_NEON(const uint8* src_uyvy, uint8* dst_u, uint8* dst_v,
"st1 {v0.8b}, [%1], #8 \n" // store 8 U. "st1 {v0.8b}, [%1], #8 \n" // store 8 U.
MEMACCESS(2) MEMACCESS(2)
"st1 {v2.8b}, [%2], #8 \n" // store 8 V. "st1 {v2.8b}, [%2], #8 \n" // store 8 V.
"bgt 1b \n" "b.gt 1b \n"
: "+r"(src_uyvy), // %0 : "+r"(src_uyvy), // %0
"+r"(dst_u), // %1 "+r"(dst_u), // %1
"+r"(dst_v), // %2 "+r"(dst_v), // %2
...@@ -1312,7 +1312,7 @@ void YUY2ToUVRow_NEON(const uint8* src_yuy2, int stride_yuy2, ...@@ -1312,7 +1312,7 @@ void YUY2ToUVRow_NEON(const uint8* src_yuy2, int stride_yuy2,
"st1 {v1.8b}, [%2], #8 \n" // store 8 U. "st1 {v1.8b}, [%2], #8 \n" // store 8 U.
MEMACCESS(3) MEMACCESS(3)
"st1 {v3.8b}, [%3], #8 \n" // store 8 V. "st1 {v3.8b}, [%3], #8 \n" // store 8 V.
"bgt 1b \n" "b.gt 1b \n"
: "+r"(src_yuy2), // %0 : "+r"(src_yuy2), // %0
"+r"(stride_yuy2), // %1 "+r"(stride_yuy2), // %1
"+r"(dst_u), // %2 "+r"(dst_u), // %2
...@@ -1342,7 +1342,7 @@ void UYVYToUVRow_NEON(const uint8* src_uyvy, int stride_uyvy, ...@@ -1342,7 +1342,7 @@ void UYVYToUVRow_NEON(const uint8* src_uyvy, int stride_uyvy,
"st1 {v0.8b}, [%2], #8 \n" // store 8 U. "st1 {v0.8b}, [%2], #8 \n" // store 8 U.
MEMACCESS(3) MEMACCESS(3)
"st1 {v2.8b}, [%3], #8 \n" // store 8 V. "st1 {v2.8b}, [%3], #8 \n" // store 8 V.
"bgt 1b \n" "b.gt 1b \n"
: "+r"(src_uyvy), // %0 : "+r"(src_uyvy), // %0
"+r"(stride_uyvy), // %1 "+r"(stride_uyvy), // %1
"+r"(dst_u), // %2 "+r"(dst_u), // %2
...@@ -1369,7 +1369,7 @@ void HalfRow_NEON(const uint8* src_uv, int src_uv_stride, ...@@ -1369,7 +1369,7 @@ void HalfRow_NEON(const uint8* src_uv, int src_uv_stride,
"urhadd v0.16b, v0.16b, v1.16b \n" // average row 1 and 2 "urhadd v0.16b, v0.16b, v1.16b \n" // average row 1 and 2
MEMACCESS(2) MEMACCESS(2)
"st1 {v0.16b}, [%2], #16 \n" "st1 {v0.16b}, [%2], #16 \n"
"bgt 1b \n" "b.gt 1b \n"
: "+r"(src_uv), // %0 : "+r"(src_uv), // %0
"+r"(src_uv_stride), // %1 "+r"(src_uv_stride), // %1
"+r"(dst_uv), // %2 "+r"(dst_uv), // %2
...@@ -1395,7 +1395,7 @@ void ARGBToBayerRow_NEON(const uint8* src_argb, uint8* dst_bayer, ...@@ -1395,7 +1395,7 @@ void ARGBToBayerRow_NEON(const uint8* src_argb, uint8* dst_bayer,
"trn1 v4.4s, v4.4s, v5.4s \n" // combine 8 pixels "trn1 v4.4s, v4.4s, v5.4s \n" // combine 8 pixels
MEMACCESS(1) MEMACCESS(1)
"st1 {v4.8b}, [%1], #8 \n" // store 8. "st1 {v4.8b}, [%1], #8 \n" // store 8.
"bgt 1b \n" "b.gt 1b \n"
: "+r"(src_argb), // %0 : "+r"(src_argb), // %0
"+r"(dst_bayer), // %1 "+r"(dst_bayer), // %1
"+r"(pix) // %2 "+r"(pix) // %2
...@@ -1416,7 +1416,7 @@ void ARGBToBayerGGRow_NEON(const uint8* src_argb, uint8* dst_bayer, ...@@ -1416,7 +1416,7 @@ void ARGBToBayerGGRow_NEON(const uint8* src_argb, uint8* dst_bayer,
"subs %2, %2, #8 \n" // 8 processed per loop "subs %2, %2, #8 \n" // 8 processed per loop
MEMACCESS(1) MEMACCESS(1)
"st1 {v1.8b}, [%1], #8 \n" // store 8 G's. "st1 {v1.8b}, [%1], #8 \n" // store 8 G's.
"bgt 1b \n" "b.gt 1b \n"
: "+r"(src_argb), // %0 : "+r"(src_argb), // %0
"+r"(dst_bayer), // %1 "+r"(dst_bayer), // %1
"+r"(pix) // %2 "+r"(pix) // %2
...@@ -1440,7 +1440,7 @@ void ARGBShuffleRow_NEON(const uint8* src_argb, uint8* dst_argb, ...@@ -1440,7 +1440,7 @@ void ARGBShuffleRow_NEON(const uint8* src_argb, uint8* dst_argb,
"tbl v1.16b, {v0.16b}, v2.16b \n" // look up 4 pixels "tbl v1.16b, {v0.16b}, v2.16b \n" // look up 4 pixels
MEMACCESS(1) MEMACCESS(1)
"st1 {v1.16b}, [%1], #16 \n" // store 4. "st1 {v1.16b}, [%1], #16 \n" // store 4.
"bgt 1b \n" "b.gt 1b \n"
: "+r"(src_argb), // %0 : "+r"(src_argb), // %0
"+r"(dst_argb), // %1 "+r"(dst_argb), // %1
"+r"(pix) // %2 "+r"(pix) // %2
...@@ -1468,7 +1468,7 @@ void I422ToYUY2Row_NEON(const uint8* src_y, ...@@ -1468,7 +1468,7 @@ void I422ToYUY2Row_NEON(const uint8* src_y,
"subs %4, %4, #16 \n" // 16 pixels "subs %4, %4, #16 \n" // 16 pixels
MEMACCESS(3) MEMACCESS(3)
"st4 {v0.8b-v3.8b}, [%3], #32 \n" // Store 8 YUY2/16 pixels. "st4 {v0.8b-v3.8b}, [%3], #32 \n" // Store 8 YUY2/16 pixels.
"bgt 1b \n" "b.gt 1b \n"
: "+r"(src_y), // %0 : "+r"(src_y), // %0
"+r"(src_u), // %1 "+r"(src_u), // %1
"+r"(src_v), // %2 "+r"(src_v), // %2
...@@ -1498,7 +1498,7 @@ void I422ToUYVYRow_NEON(const uint8* src_y, ...@@ -1498,7 +1498,7 @@ void I422ToUYVYRow_NEON(const uint8* src_y,
"subs %4, %4, #16 \n" // 16 pixels "subs %4, %4, #16 \n" // 16 pixels
MEMACCESS(3) MEMACCESS(3)
"st4 {v0.8b-v3.8b}, [%3], #32 \n" // Store 8 UYVY/16 pixels. "st4 {v0.8b-v3.8b}, [%3], #32 \n" // Store 8 UYVY/16 pixels.
"bgt 1b \n" "b.gt 1b \n"
: "+r"(src_y), // %0 : "+r"(src_y), // %0
"+r"(src_u), // %1 "+r"(src_u), // %1
"+r"(src_v), // %2 "+r"(src_v), // %2
...@@ -1521,7 +1521,7 @@ void ARGBToRGB565Row_NEON(const uint8* src_argb, uint8* dst_rgb565, int pix) { ...@@ -1521,7 +1521,7 @@ void ARGBToRGB565Row_NEON(const uint8* src_argb, uint8* dst_rgb565, int pix) {
ARGBTORGB565 ARGBTORGB565
MEMACCESS(1) MEMACCESS(1)
"vst1.8 {q0}, [%1]! \n" // store 8 pixels RGB565. "vst1.8 {q0}, [%1]! \n" // store 8 pixels RGB565.
"bgt 1b \n" "b.gt 1b \n"
: "+r"(src_argb), // %0 : "+r"(src_argb), // %0
"+r"(dst_rgb565), // %1 "+r"(dst_rgb565), // %1
"+r"(pix) // %2 "+r"(pix) // %2
...@@ -1543,7 +1543,7 @@ void ARGBToARGB1555Row_NEON(const uint8* src_argb, uint8* dst_argb1555, ...@@ -1543,7 +1543,7 @@ void ARGBToARGB1555Row_NEON(const uint8* src_argb, uint8* dst_argb1555,
ARGBTOARGB1555 ARGBTOARGB1555
MEMACCESS(1) MEMACCESS(1)
"vst1.8 {q0}, [%1]! \n" // store 8 pixels ARGB1555. "vst1.8 {q0}, [%1]! \n" // store 8 pixels ARGB1555.
"bgt 1b \n" "b.gt 1b \n"
: "+r"(src_argb), // %0 : "+r"(src_argb), // %0
"+r"(dst_argb1555), // %1 "+r"(dst_argb1555), // %1
"+r"(pix) // %2 "+r"(pix) // %2
...@@ -1566,7 +1566,7 @@ void ARGBToARGB4444Row_NEON(const uint8* src_argb, uint8* dst_argb4444, ...@@ -1566,7 +1566,7 @@ void ARGBToARGB4444Row_NEON(const uint8* src_argb, uint8* dst_argb4444,
ARGBTOARGB4444 ARGBTOARGB4444
MEMACCESS(1) MEMACCESS(1)
"vst1.8 {q0}, [%1]! \n" // store 8 pixels ARGB4444. "vst1.8 {q0}, [%1]! \n" // store 8 pixels ARGB4444.
"bgt 1b \n" "b.gt 1b \n"
: "+r"(src_argb), // %0 : "+r"(src_argb), // %0
"+r"(dst_argb4444), // %1 "+r"(dst_argb4444), // %1
"+r"(pix) // %2 "+r"(pix) // %2
...@@ -1595,7 +1595,7 @@ void ARGBToYRow_NEON(const uint8* src_argb, uint8* dst_y, int pix) { ...@@ -1595,7 +1595,7 @@ void ARGBToYRow_NEON(const uint8* src_argb, uint8* dst_y, int pix) {
"uqadd v0.8b, v0.8b, v7.8b \n" "uqadd v0.8b, v0.8b, v7.8b \n"
MEMACCESS(1) MEMACCESS(1)
"st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
"bgt 1b \n" "b.gt 1b \n"
: "+r"(src_argb), // %0 : "+r"(src_argb), // %0
"+r"(dst_y), // %1 "+r"(dst_y), // %1
"+r"(pix) // %2 "+r"(pix) // %2
...@@ -1622,7 +1622,7 @@ void ARGBToYJRow_NEON(const uint8* src_argb, uint8* dst_y, int pix) { ...@@ -1622,7 +1622,7 @@ void ARGBToYJRow_NEON(const uint8* src_argb, uint8* dst_y, int pix) {
"sqrshrun v0.8b, v3.8h, #7 \n" // 15 bit to 8 bit Y "sqrshrun v0.8b, v3.8h, #7 \n" // 15 bit to 8 bit Y
MEMACCESS(1) MEMACCESS(1)
"st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
"bgt 1b \n" "b.gt 1b \n"
: "+r"(src_argb), // %0 : "+r"(src_argb), // %0
"+r"(dst_y), // %1 "+r"(dst_y), // %1
"+r"(pix) // %2 "+r"(pix) // %2
...@@ -1665,7 +1665,7 @@ void ARGBToUV444Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v, ...@@ -1665,7 +1665,7 @@ void ARGBToUV444Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
"st1 {v0.8b}, [%1], #8 \n" // store 8 pixels U. "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels U.
MEMACCESS(2) MEMACCESS(2)
"st1 {v1.8b}, [%2], #8 \n" // store 8 pixels V. "st1 {v1.8b}, [%2], #8 \n" // store 8 pixels V.
"bgt 1b \n" "b.gt 1b \n"
: "+r"(src_argb), // %0 : "+r"(src_argb), // %0
"+r"(dst_u), // %1 "+r"(dst_u), // %1
"+r"(dst_v), // %2 "+r"(dst_v), // %2
...@@ -1715,7 +1715,7 @@ void ARGBToUV422Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v, ...@@ -1715,7 +1715,7 @@ void ARGBToUV422Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
"st1 {v0.8b}, [%1], #8 \n" // store 8 pixels U. "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels U.
MEMACCESS(2) MEMACCESS(2)
"st1 {v1.8b}, [%2], #8 \n" // store 8 pixels V. "st1 {v1.8b}, [%2], #8 \n" // store 8 pixels V.
"bgt 1b \n" "b.gt 1b \n"
: "+r"(src_argb), // %0 : "+r"(src_argb), // %0
"+r"(dst_u), // %1 "+r"(dst_u), // %1
"+r"(dst_v), // %2 "+r"(dst_v), // %2
...@@ -1774,7 +1774,7 @@ void ARGBToUV411Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v, ...@@ -1774,7 +1774,7 @@ void ARGBToUV411Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
"st1 {v0.8b}, [%1], #8 \n" // store 8 pixels U. "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels U.
MEMACCESS(2) MEMACCESS(2)
"st1 {v1.8b}, [%2], #8 \n" // store 8 pixels V. "st1 {v1.8b}, [%2], #8 \n" // store 8 pixels V.
"bgt 1b \n" "b.gt 1b \n"
: "+r"(src_argb), // %0 : "+r"(src_argb), // %0
"+r"(dst_u), // %1 "+r"(dst_u), // %1
"+r"(dst_v), // %2 "+r"(dst_v), // %2
...@@ -1838,7 +1838,7 @@ void ARGBToUVRow_NEON(const uint8* src_argb, int src_stride_argb, ...@@ -1838,7 +1838,7 @@ void ARGBToUVRow_NEON(const uint8* src_argb, int src_stride_argb,
"vst1.8 {d0}, [%2]! \n" // store 8 pixels U. "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
MEMACCESS(3) MEMACCESS(3)
"vst1.8 {d1}, [%3]! \n" // store 8 pixels V. "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
"bgt 1b \n" "b.gt 1b \n"
: "+r"(src_argb), // %0 : "+r"(src_argb), // %0
"+r"(src_stride_argb), // %1 "+r"(src_stride_argb), // %1
"+r"(dst_u), // %2 "+r"(dst_u), // %2
...@@ -1890,7 +1890,7 @@ void ARGBToUVJRow_NEON(const uint8* src_argb, int src_stride_argb, ...@@ -1890,7 +1890,7 @@ void ARGBToUVJRow_NEON(const uint8* src_argb, int src_stride_argb,
"vst1.8 {d0}, [%2]! \n" // store 8 pixels U. "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
MEMACCESS(3) MEMACCESS(3)
"vst1.8 {d1}, [%3]! \n" // store 8 pixels V. "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
"bgt 1b \n" "b.gt 1b \n"
: "+r"(src_argb), // %0 : "+r"(src_argb), // %0
"+r"(src_stride_argb), // %1 "+r"(src_stride_argb), // %1
"+r"(dst_u), // %2 "+r"(dst_u), // %2
...@@ -1941,7 +1941,7 @@ void BGRAToUVRow_NEON(const uint8* src_bgra, int src_stride_bgra, ...@@ -1941,7 +1941,7 @@ void BGRAToUVRow_NEON(const uint8* src_bgra, int src_stride_bgra,
"vst1.8 {d0}, [%2]! \n" // store 8 pixels U. "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
MEMACCESS(3) MEMACCESS(3)
"vst1.8 {d1}, [%3]! \n" // store 8 pixels V. "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
"bgt 1b \n" "b.gt 1b \n"
: "+r"(src_bgra), // %0 : "+r"(src_bgra), // %0
"+r"(src_stride_bgra), // %1 "+r"(src_stride_bgra), // %1
"+r"(dst_u), // %2 "+r"(dst_u), // %2
...@@ -1992,7 +1992,7 @@ void ABGRToUVRow_NEON(const uint8* src_abgr, int src_stride_abgr, ...@@ -1992,7 +1992,7 @@ void ABGRToUVRow_NEON(const uint8* src_abgr, int src_stride_abgr,
"vst1.8 {d0}, [%2]! \n" // store 8 pixels U. "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
MEMACCESS(3) MEMACCESS(3)
"vst1.8 {d1}, [%3]! \n" // store 8 pixels V. "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
"bgt 1b \n" "b.gt 1b \n"
: "+r"(src_abgr), // %0 : "+r"(src_abgr), // %0
"+r"(src_stride_abgr), // %1 "+r"(src_stride_abgr), // %1
"+r"(dst_u), // %2 "+r"(dst_u), // %2
...@@ -2043,7 +2043,7 @@ void RGBAToUVRow_NEON(const uint8* src_rgba, int src_stride_rgba, ...@@ -2043,7 +2043,7 @@ void RGBAToUVRow_NEON(const uint8* src_rgba, int src_stride_rgba,
"vst1.8 {d0}, [%2]! \n" // store 8 pixels U. "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
MEMACCESS(3) MEMACCESS(3)
"vst1.8 {d1}, [%3]! \n" // store 8 pixels V. "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
"bgt 1b \n" "b.gt 1b \n"
: "+r"(src_rgba), // %0 : "+r"(src_rgba), // %0
"+r"(src_stride_rgba), // %1 "+r"(src_stride_rgba), // %1
"+r"(dst_u), // %2 "+r"(dst_u), // %2
...@@ -2094,7 +2094,7 @@ void RGB24ToUVRow_NEON(const uint8* src_rgb24, int src_stride_rgb24, ...@@ -2094,7 +2094,7 @@ void RGB24ToUVRow_NEON(const uint8* src_rgb24, int src_stride_rgb24,
"vst1.8 {d0}, [%2]! \n" // store 8 pixels U. "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
MEMACCESS(3) MEMACCESS(3)
"vst1.8 {d1}, [%3]! \n" // store 8 pixels V. "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
"bgt 1b \n" "b.gt 1b \n"
: "+r"(src_rgb24), // %0 : "+r"(src_rgb24), // %0
"+r"(src_stride_rgb24), // %1 "+r"(src_stride_rgb24), // %1
"+r"(dst_u), // %2 "+r"(dst_u), // %2
...@@ -2145,7 +2145,7 @@ void RAWToUVRow_NEON(const uint8* src_raw, int src_stride_raw, ...@@ -2145,7 +2145,7 @@ void RAWToUVRow_NEON(const uint8* src_raw, int src_stride_raw,
"vst1.8 {d0}, [%2]! \n" // store 8 pixels U. "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
MEMACCESS(3) MEMACCESS(3)
"vst1.8 {d1}, [%3]! \n" // store 8 pixels V. "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
"bgt 1b \n" "b.gt 1b \n"
: "+r"(src_raw), // %0 : "+r"(src_raw), // %0
"+r"(src_stride_raw), // %1 "+r"(src_stride_raw), // %1
"+r"(dst_u), // %2 "+r"(dst_u), // %2
...@@ -2217,7 +2217,7 @@ void RGB565ToUVRow_NEON(const uint8* src_rgb565, int src_stride_rgb565, ...@@ -2217,7 +2217,7 @@ void RGB565ToUVRow_NEON(const uint8* src_rgb565, int src_stride_rgb565,
"vst1.8 {d0}, [%2]! \n" // store 8 pixels U. "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
MEMACCESS(3) MEMACCESS(3)
"vst1.8 {d1}, [%3]! \n" // store 8 pixels V. "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
"bgt 1b \n" "b.gt 1b \n"
: "+r"(src_rgb565), // %0 : "+r"(src_rgb565), // %0
"+r"(src_stride_rgb565), // %1 "+r"(src_stride_rgb565), // %1
"+r"(dst_u), // %2 "+r"(dst_u), // %2
...@@ -2289,7 +2289,7 @@ void ARGB1555ToUVRow_NEON(const uint8* src_argb1555, int src_stride_argb1555, ...@@ -2289,7 +2289,7 @@ void ARGB1555ToUVRow_NEON(const uint8* src_argb1555, int src_stride_argb1555,
"vst1.8 {d0}, [%2]! \n" // store 8 pixels U. "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
MEMACCESS(3) MEMACCESS(3)
"vst1.8 {d1}, [%3]! \n" // store 8 pixels V. "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
"bgt 1b \n" "b.gt 1b \n"
: "+r"(src_argb1555), // %0 : "+r"(src_argb1555), // %0
"+r"(src_stride_argb1555), // %1 "+r"(src_stride_argb1555), // %1
"+r"(dst_u), // %2 "+r"(dst_u), // %2
...@@ -2361,7 +2361,7 @@ void ARGB4444ToUVRow_NEON(const uint8* src_argb4444, int src_stride_argb4444, ...@@ -2361,7 +2361,7 @@ void ARGB4444ToUVRow_NEON(const uint8* src_argb4444, int src_stride_argb4444,
"vst1.8 {d0}, [%2]! \n" // store 8 pixels U. "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
MEMACCESS(3) MEMACCESS(3)
"vst1.8 {d1}, [%3]! \n" // store 8 pixels V. "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
"bgt 1b \n" "b.gt 1b \n"
: "+r"(src_argb4444), // %0 : "+r"(src_argb4444), // %0
"+r"(src_stride_argb4444), // %1 "+r"(src_stride_argb4444), // %1
"+r"(dst_u), // %2 "+r"(dst_u), // %2
...@@ -2394,7 +2394,7 @@ void RGB565ToYRow_NEON(const uint8* src_rgb565, uint8* dst_y, int pix) { ...@@ -2394,7 +2394,7 @@ void RGB565ToYRow_NEON(const uint8* src_rgb565, uint8* dst_y, int pix) {
"vqadd.u8 d0, d27 \n" "vqadd.u8 d0, d27 \n"
MEMACCESS(1) MEMACCESS(1)
"vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
"bgt 1b \n" "b.gt 1b \n"
: "+r"(src_rgb565), // %0 : "+r"(src_rgb565), // %0
"+r"(dst_y), // %1 "+r"(dst_y), // %1
"+r"(pix) // %2 "+r"(pix) // %2
...@@ -2424,7 +2424,7 @@ void ARGB1555ToYRow_NEON(const uint8* src_argb1555, uint8* dst_y, int pix) { ...@@ -2424,7 +2424,7 @@ void ARGB1555ToYRow_NEON(const uint8* src_argb1555, uint8* dst_y, int pix) {
"vqadd.u8 d0, d27 \n" "vqadd.u8 d0, d27 \n"
MEMACCESS(1) MEMACCESS(1)
"vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
"bgt 1b \n" "b.gt 1b \n"
: "+r"(src_argb1555), // %0 : "+r"(src_argb1555), // %0
"+r"(dst_y), // %1 "+r"(dst_y), // %1
"+r"(pix) // %2 "+r"(pix) // %2
...@@ -2454,7 +2454,7 @@ void ARGB4444ToYRow_NEON(const uint8* src_argb4444, uint8* dst_y, int pix) { ...@@ -2454,7 +2454,7 @@ void ARGB4444ToYRow_NEON(const uint8* src_argb4444, uint8* dst_y, int pix) {
"vqadd.u8 d0, d27 \n" "vqadd.u8 d0, d27 \n"
MEMACCESS(1) MEMACCESS(1)
"vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
"bgt 1b \n" "b.gt 1b \n"
: "+r"(src_argb4444), // %0 : "+r"(src_argb4444), // %0
"+r"(dst_y), // %1 "+r"(dst_y), // %1
"+r"(pix) // %2 "+r"(pix) // %2
...@@ -2483,7 +2483,7 @@ void BGRAToYRow_NEON(const uint8* src_bgra, uint8* dst_y, int pix) { ...@@ -2483,7 +2483,7 @@ void BGRAToYRow_NEON(const uint8* src_bgra, uint8* dst_y, int pix) {
"uqadd v0.8b, v0.8b, v7.8b \n" "uqadd v0.8b, v0.8b, v7.8b \n"
MEMACCESS(1) MEMACCESS(1)
"st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
"bgt 1b \n" "b.gt 1b \n"
: "+r"(src_bgra), // %0 : "+r"(src_bgra), // %0
"+r"(dst_y), // %1 "+r"(dst_y), // %1
"+r"(pix) // %2 "+r"(pix) // %2
...@@ -2512,7 +2512,7 @@ void ABGRToYRow_NEON(const uint8* src_abgr, uint8* dst_y, int pix) { ...@@ -2512,7 +2512,7 @@ void ABGRToYRow_NEON(const uint8* src_abgr, uint8* dst_y, int pix) {
"uqadd v0.8b, v0.8b, v7.8b \n" "uqadd v0.8b, v0.8b, v7.8b \n"
MEMACCESS(1) MEMACCESS(1)
"st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
"bgt 1b \n" "b.gt 1b \n"
: "+r"(src_abgr), // %0 : "+r"(src_abgr), // %0
"+r"(dst_y), // %1 "+r"(dst_y), // %1
"+r"(pix) // %2 "+r"(pix) // %2
...@@ -2541,7 +2541,7 @@ void RGBAToYRow_NEON(const uint8* src_rgba, uint8* dst_y, int pix) { ...@@ -2541,7 +2541,7 @@ void RGBAToYRow_NEON(const uint8* src_rgba, uint8* dst_y, int pix) {
"uqadd v0.8b, v0.8b, v7.8b \n" "uqadd v0.8b, v0.8b, v7.8b \n"
MEMACCESS(1) MEMACCESS(1)
"st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
"bgt 1b \n" "b.gt 1b \n"
: "+r"(src_rgba), // %0 : "+r"(src_rgba), // %0
"+r"(dst_y), // %1 "+r"(dst_y), // %1
"+r"(pix) // %2 "+r"(pix) // %2
...@@ -2570,7 +2570,7 @@ void RGB24ToYRow_NEON(const uint8* src_rgb24, uint8* dst_y, int pix) { ...@@ -2570,7 +2570,7 @@ void RGB24ToYRow_NEON(const uint8* src_rgb24, uint8* dst_y, int pix) {
"uqadd v0.8b, v0.8b, v7.8b \n" "uqadd v0.8b, v0.8b, v7.8b \n"
MEMACCESS(1) MEMACCESS(1)
"st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
"bgt 1b \n" "b.gt 1b \n"
: "+r"(src_rgb24), // %0 : "+r"(src_rgb24), // %0
"+r"(dst_y), // %1 "+r"(dst_y), // %1
"+r"(pix) // %2 "+r"(pix) // %2
...@@ -2599,7 +2599,7 @@ void RAWToYRow_NEON(const uint8* src_raw, uint8* dst_y, int pix) { ...@@ -2599,7 +2599,7 @@ void RAWToYRow_NEON(const uint8* src_raw, uint8* dst_y, int pix) {
"uqadd v0.8b, v0.8b, v7.8b \n" "uqadd v0.8b, v0.8b, v7.8b \n"
MEMACCESS(1) MEMACCESS(1)
"st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
"bgt 1b \n" "b.gt 1b \n"
: "+r"(src_raw), // %0 : "+r"(src_raw), // %0
"+r"(dst_y), // %1 "+r"(dst_y), // %1
"+r"(pix) // %2 "+r"(pix) // %2
...@@ -2619,13 +2619,13 @@ void InterpolateRow_NEON(uint8* dst_ptr, ...@@ -2619,13 +2619,13 @@ void InterpolateRow_NEON(uint8* dst_ptr,
const uint8* src_ptr1 = src_ptr + src_stride; const uint8* src_ptr1 = src_ptr + src_stride;
asm volatile ( asm volatile (
"cmp %4, #0 \n" "cmp %4, #0 \n"
"beq 100f \n" "b.eq 100f \n"
"cmp %4, #64 \n" "cmp %4, #64 \n"
"beq 75f \n" "b.eq 75f \n"
"cmp %4, #128 \n" "cmp %4, #128 \n"
"beq 50f \n" "b.eq 50f \n"
"cmp %4, #192 \n" "cmp %4, #192 \n"
"beq 25f \n" "b.eq 25f \n"
"dup v5.16b, %w4 \n" "dup v5.16b, %w4 \n"
"dup v4.16b, %w5 \n" "dup v4.16b, %w5 \n"
...@@ -2644,7 +2644,7 @@ void InterpolateRow_NEON(uint8* dst_ptr, ...@@ -2644,7 +2644,7 @@ void InterpolateRow_NEON(uint8* dst_ptr,
"rshrn2 v0.16b, v3.8h, #8 \n" "rshrn2 v0.16b, v3.8h, #8 \n"
MEMACCESS(0) MEMACCESS(0)
"st1 {v0.16b}, [%0], #16 \n" "st1 {v0.16b}, [%0], #16 \n"
"bgt 1b \n" "b.gt 1b \n"
"b 99f \n" "b 99f \n"
// Blend 25 / 75. // Blend 25 / 75.
...@@ -2658,7 +2658,7 @@ void InterpolateRow_NEON(uint8* dst_ptr, ...@@ -2658,7 +2658,7 @@ void InterpolateRow_NEON(uint8* dst_ptr,
"urhadd v0.16b, v0.16b, v1.16b \n" "urhadd v0.16b, v0.16b, v1.16b \n"
MEMACCESS(0) MEMACCESS(0)
"st1 {v0.16b}, [%0], #16 \n" "st1 {v0.16b}, [%0], #16 \n"
"bgt 25b \n" "b.gt 25b \n"
"b 99f \n" "b 99f \n"
// Blend 50 / 50. // Blend 50 / 50.
...@@ -2671,7 +2671,7 @@ void InterpolateRow_NEON(uint8* dst_ptr, ...@@ -2671,7 +2671,7 @@ void InterpolateRow_NEON(uint8* dst_ptr,
"urhadd v0.16b, v0.16b, v1.16b \n" "urhadd v0.16b, v0.16b, v1.16b \n"
MEMACCESS(0) MEMACCESS(0)
"st1 {v0.16b}, [%0], #16 \n" "st1 {v0.16b}, [%0], #16 \n"
"bgt 50b \n" "b.gt 50b \n"
"b 99f \n" "b 99f \n"
// Blend 75 / 25. // Blend 75 / 25.
...@@ -2685,7 +2685,7 @@ void InterpolateRow_NEON(uint8* dst_ptr, ...@@ -2685,7 +2685,7 @@ void InterpolateRow_NEON(uint8* dst_ptr,
"urhadd v0.16b, v0.16b, v1.16b \n" "urhadd v0.16b, v0.16b, v1.16b \n"
MEMACCESS(0) MEMACCESS(0)
"st1 {v0.16b}, [%0], #16 \n" "st1 {v0.16b}, [%0], #16 \n"
"bgt 75b \n" "b.gt 75b \n"
"b 99f \n" "b 99f \n"
// Blend 100 / 0 - Copy row unchanged. // Blend 100 / 0 - Copy row unchanged.
...@@ -2695,7 +2695,7 @@ void InterpolateRow_NEON(uint8* dst_ptr, ...@@ -2695,7 +2695,7 @@ void InterpolateRow_NEON(uint8* dst_ptr,
"subs %3, %3, #16 \n" "subs %3, %3, #16 \n"
MEMACCESS(0) MEMACCESS(0)
"st1 {v0.16b}, [%0], #16 \n" "st1 {v0.16b}, [%0], #16 \n"
"bgt 100b \n" "b.gt 100b \n"
"99: \n" "99: \n"
: "+r"(dst_ptr), // %0 : "+r"(dst_ptr), // %0
...@@ -2716,7 +2716,7 @@ void ARGBBlendRow_NEON(const uint8* src_argb0, const uint8* src_argb1, ...@@ -2716,7 +2716,7 @@ void ARGBBlendRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
uint8* dst_argb, int width) { uint8* dst_argb, int width) {
asm volatile ( asm volatile (
"subs %3, %3, #8 \n" "subs %3, %3, #8 \n"
"blt 89f \n" "b.lt 89f \n"
// Blend 8 pixels. // Blend 8 pixels.
"8: \n" "8: \n"
MEMACCESS(0) MEMACCESS(0)
...@@ -2739,11 +2739,11 @@ void ARGBBlendRow_NEON(const uint8* src_argb0, const uint8* src_argb1, ...@@ -2739,11 +2739,11 @@ void ARGBBlendRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
"movi v3.8b, #255 \n" // a = 255 "movi v3.8b, #255 \n" // a = 255
MEMACCESS(2) MEMACCESS(2)
"st4 {v0.8b-v3.8b}, [%2], #32 \n" // store 8 pixels of ARGB. "st4 {v0.8b-v3.8b}, [%2], #32 \n" // store 8 pixels of ARGB.
"bge 8b \n" "b.ge 8b \n"
"89: \n" "89: \n"
"adds %3, %3, #8-1 \n" "adds %3, %3, #8-1 \n"
"blt 99f \n" "b.lt 99f \n"
// Blend 1 pixels. // Blend 1 pixels.
"1: \n" "1: \n"
...@@ -2767,7 +2767,7 @@ void ARGBBlendRow_NEON(const uint8* src_argb0, const uint8* src_argb1, ...@@ -2767,7 +2767,7 @@ void ARGBBlendRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
"movi v3.8b, #255 \n" // a = 255 "movi v3.8b, #255 \n" // a = 255
MEMACCESS(2) MEMACCESS(2)
"st4 {v0.b-v3.b}[0], [%2], #4 \n" // store 1 pixel. "st4 {v0.b-v3.b}[0], [%2], #4 \n" // store 1 pixel.
"bge 1b \n" "b.ge 1b \n"
"99: \n" "99: \n"
...@@ -2799,7 +2799,7 @@ void ARGBAttenuateRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) { ...@@ -2799,7 +2799,7 @@ void ARGBAttenuateRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) {
"uqrshrn v2.8b, v6.8h, #8 \n" // r >>= 8 "uqrshrn v2.8b, v6.8h, #8 \n" // r >>= 8
MEMACCESS(1) MEMACCESS(1)
"st4 {v0.8b-v3.8b}, [%1], #32 \n" // store 8 pixels of ARGB. "st4 {v0.8b-v3.8b}, [%1], #32 \n" // store 8 pixels of ARGB.
"bgt 1b \n" "b.gt 1b \n"
: "+r"(src_argb), // %0 : "+r"(src_argb), // %0
"+r"(dst_argb), // %1 "+r"(dst_argb), // %1
"+r"(width) // %2 "+r"(width) // %2
...@@ -2843,7 +2843,7 @@ void ARGBQuantizeRow_NEON(uint8* dst_argb, int scale, int interval_size, ...@@ -2843,7 +2843,7 @@ void ARGBQuantizeRow_NEON(uint8* dst_argb, int scale, int interval_size,
"uqxtn v2.8b, v2.8h \n" "uqxtn v2.8b, v2.8h \n"
MEMACCESS(0) MEMACCESS(0)
"st4 {v0.8b-v3.8b}, [%0], #32 \n" // store 8 pixels of ARGB. "st4 {v0.8b-v3.8b}, [%0], #32 \n" // store 8 pixels of ARGB.
"bgt 1b \n" "b.gt 1b \n"
: "+r"(dst_argb), // %0 : "+r"(dst_argb), // %0
"+r"(width) // %1 "+r"(width) // %1
: "r"(scale), // %2 : "r"(scale), // %2
...@@ -2885,7 +2885,7 @@ void ARGBShadeRow_NEON(const uint8* src_argb, uint8* dst_argb, int width, ...@@ -2885,7 +2885,7 @@ void ARGBShadeRow_NEON(const uint8* src_argb, uint8* dst_argb, int width,
"uqxtn v7.8b, v7.8h \n" "uqxtn v7.8b, v7.8h \n"
MEMACCESS(1) MEMACCESS(1)
"st4 {v4.8b-v7.8b}, [%1], #32 \n" // store 8 pixels of ARGB. "st4 {v4.8b-v7.8b}, [%1], #32 \n" // store 8 pixels of ARGB.
"bgt 1b \n" "b.gt 1b \n"
: "+r"(src_argb), // %0 : "+r"(src_argb), // %0
"+r"(dst_argb), // %1 "+r"(dst_argb), // %1
"+r"(width) // %2 "+r"(width) // %2
...@@ -2917,7 +2917,7 @@ void ARGBGrayRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) { ...@@ -2917,7 +2917,7 @@ void ARGBGrayRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) {
"mov v2.8b, v0.8b \n" // R "mov v2.8b, v0.8b \n" // R
MEMACCESS(1) MEMACCESS(1)
"st4 {v0.8b-v3.8b}, [%1], #32 \n" // store 8 ARGB pixels. "st4 {v0.8b-v3.8b}, [%1], #32 \n" // store 8 ARGB pixels.
"bgt 1b \n" "b.gt 1b \n"
: "+r"(src_argb), // %0 : "+r"(src_argb), // %0
"+r"(dst_argb), // %1 "+r"(dst_argb), // %1
"+r"(width) // %2 "+r"(width) // %2
...@@ -2963,7 +2963,7 @@ void ARGBSepiaRow_NEON(uint8* dst_argb, int width) { ...@@ -2963,7 +2963,7 @@ void ARGBSepiaRow_NEON(uint8* dst_argb, int width) {
"uqshrn v2.8b, v6.8h, #7 \n" // 16 bit to 8 bit R "uqshrn v2.8b, v6.8h, #7 \n" // 16 bit to 8 bit R
MEMACCESS(0) MEMACCESS(0)
"st4 {v0.8b-v3.8b}, [%0], #32 \n" // store 8 ARGB pixels. "st4 {v0.8b-v3.8b}, [%0], #32 \n" // store 8 ARGB pixels.
"bgt 1b \n" "b.gt 1b \n"
: "+r"(dst_argb), // %0 : "+r"(dst_argb), // %0
"+r"(width) // %1 "+r"(width) // %1
: :
...@@ -3028,7 +3028,7 @@ void ARGBColorMatrixRow_NEON(const uint8* src_argb, uint8* dst_argb, ...@@ -3028,7 +3028,7 @@ void ARGBColorMatrixRow_NEON(const uint8* src_argb, uint8* dst_argb,
"sqshrun v19.8b, v25.8h, #6 \n" // 16 bit to 8 bit A "sqshrun v19.8b, v25.8h, #6 \n" // 16 bit to 8 bit A
MEMACCESS(1) MEMACCESS(1)
"st4 {v16.8b-v19.8b}, [%1], #32 \n" // store 8 ARGB pixels. "st4 {v16.8b-v19.8b}, [%1], #32 \n" // store 8 ARGB pixels.
"bgt 1b \n" "b.gt 1b \n"
: "+r"(src_argb), // %0 : "+r"(src_argb), // %0
"+r"(dst_argb), // %1 "+r"(dst_argb), // %1
"+r"(width) // %2 "+r"(width) // %2
...@@ -3063,7 +3063,7 @@ void ARGBMultiplyRow_NEON(const uint8* src_argb0, const uint8* src_argb1, ...@@ -3063,7 +3063,7 @@ void ARGBMultiplyRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
"rshrn v3.8b, v3.8h, #8 \n" // 16 bit to 8 bit A "rshrn v3.8b, v3.8h, #8 \n" // 16 bit to 8 bit A
MEMACCESS(2) MEMACCESS(2)
"st4 {v0.8b-v3.8b}, [%2], #32 \n" // store 8 ARGB pixels. "st4 {v0.8b-v3.8b}, [%2], #32 \n" // store 8 ARGB pixels.
"bgt 1b \n" "b.gt 1b \n"
: "+r"(src_argb0), // %0 : "+r"(src_argb0), // %0
"+r"(src_argb1), // %1 "+r"(src_argb1), // %1
...@@ -3094,7 +3094,7 @@ void ARGBAddRow_NEON(const uint8* src_argb0, const uint8* src_argb1, ...@@ -3094,7 +3094,7 @@ void ARGBAddRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
"uqadd v3.8b, v3.8b, v7.8b \n" "uqadd v3.8b, v3.8b, v7.8b \n"
MEMACCESS(2) MEMACCESS(2)
"st4 {v0.8b-v3.8b}, [%2], #32 \n" // store 8 ARGB pixels. "st4 {v0.8b-v3.8b}, [%2], #32 \n" // store 8 ARGB pixels.
"bgt 1b \n" "b.gt 1b \n"
: "+r"(src_argb0), // %0 : "+r"(src_argb0), // %0
"+r"(src_argb1), // %1 "+r"(src_argb1), // %1
...@@ -3125,7 +3125,7 @@ void ARGBSubtractRow_NEON(const uint8* src_argb0, const uint8* src_argb1, ...@@ -3125,7 +3125,7 @@ void ARGBSubtractRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
"uqsub v3.8b, v3.8b, v7.8b \n" "uqsub v3.8b, v3.8b, v7.8b \n"
MEMACCESS(2) MEMACCESS(2)
"st4 {v0.8b-v3.8b}, [%2], #32 \n" // store 8 ARGB pixels. "st4 {v0.8b-v3.8b}, [%2], #32 \n" // store 8 ARGB pixels.
"bgt 1b \n" "b.gt 1b \n"
: "+r"(src_argb0), // %0 : "+r"(src_argb0), // %0
"+r"(src_argb1), // %1 "+r"(src_argb1), // %1
...@@ -3160,7 +3160,7 @@ void SobelRow_NEON(const uint8* src_sobelx, const uint8* src_sobely, ...@@ -3160,7 +3160,7 @@ void SobelRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
"mov v2.8b, v0.8b \n" "mov v2.8b, v0.8b \n"
MEMACCESS(2) MEMACCESS(2)
"st4 {v0.8b-v3.8b}, [%2], #32 \n" // store 8 ARGB pixels. "st4 {v0.8b-v3.8b}, [%2], #32 \n" // store 8 ARGB pixels.
"bgt 1b \n" "b.gt 1b \n"
: "+r"(src_sobelx), // %0 : "+r"(src_sobelx), // %0
"+r"(src_sobely), // %1 "+r"(src_sobely), // %1
"+r"(dst_argb), // %2 "+r"(dst_argb), // %2
...@@ -3187,7 +3187,7 @@ void SobelToPlaneRow_NEON(const uint8* src_sobelx, const uint8* src_sobely, ...@@ -3187,7 +3187,7 @@ void SobelToPlaneRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
"uqadd v0.16b, v0.16b, v1.16b \n" // add "uqadd v0.16b, v0.16b, v1.16b \n" // add
MEMACCESS(2) MEMACCESS(2)
"st1 {v0.16b}, [%2], #16 \n" // store 16 pixels. "st1 {v0.16b}, [%2], #16 \n" // store 16 pixels.
"bgt 1b \n" "b.gt 1b \n"
: "+r"(src_sobelx), // %0 : "+r"(src_sobelx), // %0
"+r"(src_sobely), // %1 "+r"(src_sobely), // %1
"+r"(dst_y), // %2 "+r"(dst_y), // %2
...@@ -3219,7 +3219,7 @@ void SobelXYRow_NEON(const uint8* src_sobelx, const uint8* src_sobely, ...@@ -3219,7 +3219,7 @@ void SobelXYRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
"uqadd v1.8b, v0.8b, v2.8b \n" // add "uqadd v1.8b, v0.8b, v2.8b \n" // add
MEMACCESS(2) MEMACCESS(2)
"st4 {v0.8b-v3.8b}, [%2], #32 \n" // store 8 ARGB pixels. "st4 {v0.8b-v3.8b}, [%2], #32 \n" // store 8 ARGB pixels.
"bgt 1b \n" "b.gt 1b \n"
: "+r"(src_sobelx), // %0 : "+r"(src_sobelx), // %0
"+r"(src_sobely), // %1 "+r"(src_sobely), // %1
"+r"(dst_argb), // %2 "+r"(dst_argb), // %2
...@@ -3263,7 +3263,7 @@ void SobelXRow_NEON(const uint8* src_y0, const uint8* src_y1, ...@@ -3263,7 +3263,7 @@ void SobelXRow_NEON(const uint8* src_y0, const uint8* src_y1,
"uqxtn v0.8b, v0.8h \n" "uqxtn v0.8b, v0.8h \n"
MEMACCESS(3) MEMACCESS(3)
"st1 {v0.8b}, [%3], #8 \n" // store 8 sobelx "st1 {v0.8b}, [%3], #8 \n" // store 8 sobelx
"bgt 1b \n" "b.gt 1b \n"
: "+r"(src_y0), // %0 : "+r"(src_y0), // %0
"+r"(src_y1), // %1 "+r"(src_y1), // %1
"+r"(src_y2), // %2 "+r"(src_y2), // %2
...@@ -3309,7 +3309,7 @@ void SobelYRow_NEON(const uint8* src_y0, const uint8* src_y1, ...@@ -3309,7 +3309,7 @@ void SobelYRow_NEON(const uint8* src_y0, const uint8* src_y1,
"uqxtn v0.8b, v0.8h \n" "uqxtn v0.8b, v0.8h \n"
MEMACCESS(2) MEMACCESS(2)
"st1 {v0.8b}, [%2], #8 \n" // store 8 sobely "st1 {v0.8b}, [%2], #8 \n" // store 8 sobely
"bgt 1b \n" "b.gt 1b \n"
: "+r"(src_y0), // %0 : "+r"(src_y0), // %0
"+r"(src_y1), // %1 "+r"(src_y1), // %1
"+r"(dst_sobely), // %2 "+r"(dst_sobely), // %2
......
...@@ -32,7 +32,7 @@ void ScaleRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride, ...@@ -32,7 +32,7 @@ void ScaleRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
"subs %2, %2, #16 \n" // 16 processed per loop "subs %2, %2, #16 \n" // 16 processed per loop
MEMACCESS(1) MEMACCESS(1)
"st1 {v1.16b}, [%1], #16 \n" // store odd pixels "st1 {v1.16b}, [%1], #16 \n" // store odd pixels
"bgt 1b \n" "b.gt 1b \n"
: "+r"(src_ptr), // %0 : "+r"(src_ptr), // %0
"+r"(dst), // %1 "+r"(dst), // %1
"+r"(dst_width) // %2 "+r"(dst_width) // %2
...@@ -63,7 +63,7 @@ void ScaleRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride, ...@@ -63,7 +63,7 @@ void ScaleRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
"rshrn2 v0.16b, v1.8h, #2 \n" "rshrn2 v0.16b, v1.8h, #2 \n"
MEMACCESS(2) MEMACCESS(2)
"st1 {v0.16b}, [%2], #16 \n" "st1 {v0.16b}, [%2], #16 \n"
"bgt 1b \n" "b.gt 1b \n"
: "+r"(src_ptr), // %0 : "+r"(src_ptr), // %0
"+r"(src_stride), // %1 "+r"(src_stride), // %1
"+r"(dst), // %2 "+r"(dst), // %2
...@@ -84,7 +84,7 @@ void ScaleRowDown4_NEON(const uint8* src_ptr, ptrdiff_t src_stride, ...@@ -84,7 +84,7 @@ void ScaleRowDown4_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
"subs %2, %2, #8 \n" // 8 processed per loop "subs %2, %2, #8 \n" // 8 processed per loop
MEMACCESS(1) MEMACCESS(1)
"st1 {v2.8b}, [%1], #8 \n" "st1 {v2.8b}, [%1], #8 \n"
"bgt 1b \n" "b.gt 1b \n"
: "+r"(src_ptr), // %0 : "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1 "+r"(dst_ptr), // %1
"+r"(dst_width) // %2 "+r"(dst_width) // %2
...@@ -119,7 +119,7 @@ asm volatile ( ...@@ -119,7 +119,7 @@ asm volatile (
"rshrn v0.8b, v0.8h, #4 \n" // divide by 16 w/rounding "rshrn v0.8b, v0.8h, #4 \n" // divide by 16 w/rounding
MEMACCESS(1) MEMACCESS(1)
"st1 {v0.s}[0], [%1], #4 \n" "st1 {v0.s}[0], [%1], #4 \n"
"bgt 1b \n" "b.gt 1b \n"
: "+r"(src_ptr), // %0 : "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1 "+r"(dst_ptr), // %1
"+r"(dst_width), // %2 "+r"(dst_width), // %2
...@@ -147,7 +147,7 @@ void ScaleRowDown34_NEON(const uint8* src_ptr, ...@@ -147,7 +147,7 @@ void ScaleRowDown34_NEON(const uint8* src_ptr,
"mov v2.8b, v3.8b \n" // order v0, v1, v2 "mov v2.8b, v3.8b \n" // order v0, v1, v2
MEMACCESS(1) MEMACCESS(1)
"st3 {v0.8b-v2.8b}, [%1], #24 \n" "st3 {v0.8b-v2.8b}, [%1], #24 \n"
"bgt 1b \n" "b.gt 1b \n"
: "+r"(src_ptr), // %0 : "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1 "+r"(dst_ptr), // %1
"+r"(dst_width) // %2 "+r"(dst_width) // %2
...@@ -207,7 +207,7 @@ void ScaleRowDown34_0_Box_NEON(const uint8* src_ptr, ...@@ -207,7 +207,7 @@ void ScaleRowDown34_0_Box_NEON(const uint8* src_ptr,
MEMACCESS(1) MEMACCESS(1)
"st3 {v0.8b-v2.8b}, [%1], #24 \n" "st3 {v0.8b-v2.8b}, [%1], #24 \n"
"bgt 1b \n" "b.gt 1b \n"
: "+r"(src_ptr), // %0 : "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1 "+r"(dst_ptr), // %1
"+r"(dst_width), // %2 "+r"(dst_width), // %2
...@@ -253,7 +253,7 @@ void ScaleRowDown34_1_Box_NEON(const uint8* src_ptr, ...@@ -253,7 +253,7 @@ void ScaleRowDown34_1_Box_NEON(const uint8* src_ptr,
MEMACCESS(1) MEMACCESS(1)
"st3 {v0.8b-v2.8b}, [%1], #24 \n" "st3 {v0.8b-v2.8b}, [%1], #24 \n"
"bgt 1b \n" "b.gt 1b \n"
: "+r"(src_ptr), // %0 : "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1 "+r"(dst_ptr), // %1
"+r"(dst_width), // %2 "+r"(dst_width), // %2
...@@ -292,7 +292,7 @@ void ScaleRowDown38_NEON(const uint8* src_ptr, ...@@ -292,7 +292,7 @@ void ScaleRowDown38_NEON(const uint8* src_ptr,
"st1 {v2.8b}, [%1], #8 \n" "st1 {v2.8b}, [%1], #8 \n"
MEMACCESS(1) MEMACCESS(1)
"st1 {v2.s}[2], [%1], #4 \n" "st1 {v2.s}[2], [%1], #4 \n"
"bgt 1b \n" "b.gt 1b \n"
: "+r"(src_ptr), // %0 : "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1 "+r"(dst_ptr), // %1
"+r"(dst_width) // %2 "+r"(dst_width) // %2
...@@ -417,7 +417,7 @@ void OMITFP ScaleRowDown38_3_Box_NEON(const uint8* src_ptr, ...@@ -417,7 +417,7 @@ void OMITFP ScaleRowDown38_3_Box_NEON(const uint8* src_ptr,
"st1 {v3.8b}, [%1], #8 \n" "st1 {v3.8b}, [%1], #8 \n"
MEMACCESS(1) MEMACCESS(1)
"st1 {v3.s}[2], [%1], #4 \n" "st1 {v3.s}[2], [%1], #4 \n"
"bgt 1b \n" "b.gt 1b \n"
: "+r"(src_ptr), // %0 : "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1 "+r"(dst_ptr), // %1
"+r"(dst_width), // %2 "+r"(dst_width), // %2
...@@ -527,7 +527,7 @@ void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr, ...@@ -527,7 +527,7 @@ void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr,
"st1 {v3.8b}, [%1], #8 \n" "st1 {v3.8b}, [%1], #8 \n"
MEMACCESS(1) MEMACCESS(1)
"st1 {v3.s}[2], [%1], #4 \n" "st1 {v3.s}[2], [%1], #4 \n"
"bgt 1b \n" "b.gt 1b \n"
: "+r"(src_ptr), // %0 : "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1 "+r"(dst_ptr), // %1
"+r"(dst_width), // %2 "+r"(dst_width), // %2
...@@ -547,14 +547,14 @@ void ScaleFilterRows_NEON(uint8* dst_ptr, ...@@ -547,14 +547,14 @@ void ScaleFilterRows_NEON(uint8* dst_ptr,
int y_fraction = 256 - source_y_fraction; int y_fraction = 256 - source_y_fraction;
asm volatile ( asm volatile (
"cmp %4, #0 \n" "cmp %4, #0 \n"
"beq 100f \n" "b.eq 100f \n"
"add %2, %2, %1 \n" "add %2, %2, %1 \n"
"cmp %4, #64 \n" "cmp %4, #64 \n"
"beq 75f \n" "b.eq 75f \n"
"cmp %4, #128 \n" "cmp %4, #128 \n"
"beq 50f \n" "b.eq 50f \n"
"cmp %4, #192 \n" "cmp %4, #192 \n"
"beq 25f \n" "b.eq 25f \n"
"dup v5.8b, %w4 \n" "dup v5.8b, %w4 \n"
"dup v4.8b, %w5 \n" "dup v4.8b, %w5 \n"
...@@ -573,7 +573,7 @@ void ScaleFilterRows_NEON(uint8* dst_ptr, ...@@ -573,7 +573,7 @@ void ScaleFilterRows_NEON(uint8* dst_ptr,
"rshrn2 v0.16b, v7.8h, #8 \n" "rshrn2 v0.16b, v7.8h, #8 \n"
MEMACCESS(0) MEMACCESS(0)
"st1 {v0.16b}, [%0], #16 \n" "st1 {v0.16b}, [%0], #16 \n"
"bgt 1b \n" "b.gt 1b \n"
"b 99f \n" "b 99f \n"
// Blend 25 / 75. // Blend 25 / 75.
...@@ -587,7 +587,7 @@ void ScaleFilterRows_NEON(uint8* dst_ptr, ...@@ -587,7 +587,7 @@ void ScaleFilterRows_NEON(uint8* dst_ptr,
"urhadd v0.16b, v0.16b, v1.16b \n" "urhadd v0.16b, v0.16b, v1.16b \n"
MEMACCESS(0) MEMACCESS(0)
"st1 {v0.16b}, [%0], #16 \n" "st1 {v0.16b}, [%0], #16 \n"
"bgt 25b \n" "b.gt 25b \n"
"b 99f \n" "b 99f \n"
// Blend 50 / 50. // Blend 50 / 50.
...@@ -600,7 +600,7 @@ void ScaleFilterRows_NEON(uint8* dst_ptr, ...@@ -600,7 +600,7 @@ void ScaleFilterRows_NEON(uint8* dst_ptr,
"urhadd v0.16b, v0.16b, v1.16b \n" "urhadd v0.16b, v0.16b, v1.16b \n"
MEMACCESS(0) MEMACCESS(0)
"st1 {v0.16b}, [%0], #16 \n" "st1 {v0.16b}, [%0], #16 \n"
"bgt 50b \n" "b.gt 50b \n"
"b 99f \n" "b 99f \n"
// Blend 75 / 25. // Blend 75 / 25.
...@@ -614,7 +614,7 @@ void ScaleFilterRows_NEON(uint8* dst_ptr, ...@@ -614,7 +614,7 @@ void ScaleFilterRows_NEON(uint8* dst_ptr,
"urhadd v0.16b, v0.16b, v1.16b \n" "urhadd v0.16b, v0.16b, v1.16b \n"
MEMACCESS(0) MEMACCESS(0)
"st1 {v0.16b}, [%0], #16 \n" "st1 {v0.16b}, [%0], #16 \n"
"bgt 75b \n" "b.gt 75b \n"
"b 99f \n" "b 99f \n"
// Blend 100 / 0 - Copy row unchanged. // Blend 100 / 0 - Copy row unchanged.
...@@ -624,7 +624,7 @@ void ScaleFilterRows_NEON(uint8* dst_ptr, ...@@ -624,7 +624,7 @@ void ScaleFilterRows_NEON(uint8* dst_ptr,
"subs %3, %3, #16 \n" "subs %3, %3, #16 \n"
MEMACCESS(0) MEMACCESS(0)
"st1 {v0.16b}, [%0], #16 \n" "st1 {v0.16b}, [%0], #16 \n"
"bgt 100b \n" "b.gt 100b \n"
"99: \n" "99: \n"
MEMACCESS(0) MEMACCESS(0)
...@@ -655,7 +655,7 @@ void ScaleARGBRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride, ...@@ -655,7 +655,7 @@ void ScaleARGBRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
"st1 {v1.16b}, [%1], #16 \n" // store odd pixels "st1 {v1.16b}, [%1], #16 \n" // store odd pixels
MEMACCESS (1) MEMACCESS (1)
"st1 {v3.16b}, [%1], #16 \n" "st1 {v3.16b}, [%1], #16 \n"
"bgt 1b \n" "b.gt 1b \n"
: "+r" (src_ptr), // %0 : "+r" (src_ptr), // %0
"+r" (dst), // %1 "+r" (dst), // %1
"+r" (dst_width) // %2 "+r" (dst_width) // %2
...@@ -691,7 +691,7 @@ void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride, ...@@ -691,7 +691,7 @@ void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
"rshrn v3.8b, v3.8h, #2 \n" "rshrn v3.8b, v3.8h, #2 \n"
MEMACCESS (2) MEMACCESS (2)
"st4 {v0.8b - v3.8b}, [%2], #32 \n" "st4 {v0.8b - v3.8b}, [%2], #32 \n"
"bgt 1b \n" "b.gt 1b \n"
: "+r" (src_ptr), // %0 : "+r" (src_ptr), // %0
"+r" (src_stride), // %1 "+r" (src_stride), // %1
"+r" (dst), // %2 "+r" (dst), // %2
...@@ -720,7 +720,7 @@ void ScaleARGBRowDownEven_NEON(const uint8* src_argb, ptrdiff_t src_stride, ...@@ -720,7 +720,7 @@ void ScaleARGBRowDownEven_NEON(const uint8* src_argb, ptrdiff_t src_stride,
"subs %2, %2, #4 \n" // 4 pixels per loop. "subs %2, %2, #4 \n" // 4 pixels per loop.
MEMACCESS(1) MEMACCESS(1)
"st1 {v0.16b}, [%1], #16 \n" "st1 {v0.16b}, [%1], #16 \n"
"bgt 1b \n" "b.gt 1b \n"
: "+r"(src_argb), // %0 : "+r"(src_argb), // %0
"+r"(dst_argb), // %1 "+r"(dst_argb), // %1
"+r"(dst_width) // %2 "+r"(dst_width) // %2
...@@ -774,7 +774,7 @@ void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb, ptrdiff_t src_stride, ...@@ -774,7 +774,7 @@ void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb, ptrdiff_t src_stride,
"subs %3, %3, #4 \n" // 4 pixels per loop. "subs %3, %3, #4 \n" // 4 pixels per loop.
MEMACCESS(2) MEMACCESS(2)
"st1 {v0.16b}, [%2], #16 \n" "st1 {v0.16b}, [%2], #16 \n"
"bgt 1b \n" "b.gt 1b \n"
: "+r"(src_argb), // %0 : "+r"(src_argb), // %0
"+r"(src_stride), // %1 "+r"(src_stride), // %1
"+r"(dst_argb), // %2 "+r"(dst_argb), // %2
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment