Commit 2c4e3993 authored by fbarchard@google.com's avatar fbarchard@google.com

Change vld and vst to specify only elements and size not sign for better arm…

Change vld and vst to specify only elements and size not sign for better arm compiler compatability.
BUG=none
TEST=none
R=kma@webrtc.org, mflodman@webrtc.org

Review URL: https://webrtc-codereview.appspot.com/1643004

git-svn-id: http://libyuv.googlecode.com/svn/trunk@721 16f28f9a-4ce2-e073-06de-1de4eb20be90
parent ae67c900
Name: libyuv
URL: http://code.google.com/p/libyuv/
Version: 720
Version: 721
License: BSD
License File: LICENSE
......
......@@ -11,6 +11,6 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT
#define INCLUDE_LIBYUV_VERSION_H_
#define LIBYUV_VERSION 720
#define LIBYUV_VERSION 721
#endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT
......@@ -27,8 +27,8 @@ uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count) {
".p2align 2 \n"
"1: \n"
"vld1.u8 {q0}, [%0]! \n"
"vld1.u8 {q1}, [%1]! \n"
"vld1.8 {q0}, [%0]! \n"
"vld1.8 {q1}, [%1]! \n"
"subs %2, %2, #16 \n"
"vsubl.u8 q2, d0, d2 \n"
"vsubl.u8 q3, d1, d3 \n"
......
......@@ -20,57 +20,57 @@ extern "C" {
// Read 8 Y, 4 U and 4 V from 422
#define READYUV422 \
"vld1.u8 {d0}, [%0]! \n" \
"vld1.u32 {d2[0]}, [%1]! \n" \
"vld1.u32 {d2[1]}, [%2]! \n"
"vld1.8 {d0}, [%0]! \n" \
"vld1.32 {d2[0]}, [%1]! \n" \
"vld1.32 {d2[1]}, [%2]! \n"
// Read 8 Y, 2 U and 2 V from 422
#define READYUV411 \
"vld1.u8 {d0}, [%0]! \n" \
"vld1.u16 {d2[0]}, [%1]! \n" \
"vld1.u16 {d2[1]}, [%2]! \n" \
"vld1.8 {d0}, [%0]! \n" \
"vld1.16 {d2[0]}, [%1]! \n" \
"vld1.16 {d2[1]}, [%2]! \n" \
"vmov.u8 d3, d2 \n" \
"vzip.u8 d2, d3 \n"
// Read 8 Y, 8 U and 8 V from 444
#define READYUV444 \
"vld1.u8 {d0}, [%0]! \n" \
"vld1.u8 {d2}, [%1]! \n" \
"vld1.u8 {d3}, [%2]! \n" \
"vld1.8 {d0}, [%0]! \n" \
"vld1.8 {d2}, [%1]! \n" \
"vld1.8 {d3}, [%2]! \n" \
"vpaddl.u8 q1, q1 \n" \
"vrshrn.u16 d2, q1, #1 \n"
// Read 8 Y, and set 4 U and 4 V to 128
#define READYUV400 \
"vld1.u8 {d0}, [%0]! \n" \
"vld1.8 {d0}, [%0]! \n" \
"vmov.u8 d2, #128 \n"
// Read 8 Y and 4 UV from NV12
#define READNV12 \
"vld1.u8 {d0}, [%0]! \n" \
"vld1.u8 {d2}, [%1]! \n" \
"vld1.8 {d0}, [%0]! \n" \
"vld1.8 {d2}, [%1]! \n" \
"vmov.u8 d3, d2 \n"/* split odd/even uv apart */\
"vuzp.u8 d2, d3 \n" \
"vtrn.u32 d2, d3 \n"
// Read 8 Y and 4 VU from NV21
#define READNV21 \
"vld1.u8 {d0}, [%0]! \n" \
"vld1.u8 {d2}, [%1]! \n" \
"vld1.8 {d0}, [%0]! \n" \
"vld1.8 {d2}, [%1]! \n" \
"vmov.u8 d3, d2 \n"/* split odd/even uv apart */\
"vuzp.u8 d3, d2 \n" \
"vtrn.u32 d2, d3 \n"
// Read 8 YUY2
#define READYUY2 \
"vld2.u8 {d0, d2}, [%0]! \n" \
"vld2.8 {d0, d2}, [%0]! \n" \
"vmov.u8 d3, d2 \n" \
"vuzp.u8 d2, d3 \n" \
"vtrn.u32 d2, d3 \n"
// Read 8 UYVY
#define READUYVY \
"vld2.u8 {d2, d3}, [%0]! \n" \
"vld2.8 {d2, d3}, [%0]! \n" \
"vmov.u8 d0, d3 \n" \
"vmov.u8 d3, d2 \n" \
"vuzp.u8 d2, d3 \n" \
......@@ -113,8 +113,8 @@ void I444ToARGBRow_NEON(const uint8* src_y,
uint8* dst_argb,
int width) {
asm volatile (
"vld1.u8 {d24}, [%5] \n"
"vld1.u8 {d25}, [%6] \n"
"vld1.8 {d24}, [%5] \n"
"vld1.8 {d25}, [%6] \n"
"vmov.u8 d26, #128 \n"
"vmov.u16 q14, #74 \n"
"vmov.u16 q15, #16 \n"
......@@ -144,8 +144,8 @@ void I422ToARGBRow_NEON(const uint8* src_y,
uint8* dst_argb,
int width) {
asm volatile (
"vld1.u8 {d24}, [%5] \n"
"vld1.u8 {d25}, [%6] \n"
"vld1.8 {d24}, [%5] \n"
"vld1.8 {d25}, [%6] \n"
"vmov.u8 d26, #128 \n"
"vmov.u16 q14, #74 \n"
"vmov.u16 q15, #16 \n"
......@@ -175,8 +175,8 @@ void I411ToARGBRow_NEON(const uint8* src_y,
uint8* dst_argb,
int width) {
asm volatile (
"vld1.u8 {d24}, [%5] \n"
"vld1.u8 {d25}, [%6] \n"
"vld1.8 {d24}, [%5] \n"
"vld1.8 {d25}, [%6] \n"
"vmov.u8 d26, #128 \n"
"vmov.u16 q14, #74 \n"
"vmov.u16 q15, #16 \n"
......@@ -206,8 +206,8 @@ void I422ToBGRARow_NEON(const uint8* src_y,
uint8* dst_bgra,
int width) {
asm volatile (
"vld1.u8 {d24}, [%5] \n"
"vld1.u8 {d25}, [%6] \n"
"vld1.8 {d24}, [%5] \n"
"vld1.8 {d25}, [%6] \n"
"vmov.u8 d26, #128 \n"
"vmov.u16 q14, #74 \n"
"vmov.u16 q15, #16 \n"
......@@ -238,8 +238,8 @@ void I422ToABGRRow_NEON(const uint8* src_y,
uint8* dst_abgr,
int width) {
asm volatile (
"vld1.u8 {d24}, [%5] \n"
"vld1.u8 {d25}, [%6] \n"
"vld1.8 {d24}, [%5] \n"
"vld1.8 {d25}, [%6] \n"
"vmov.u8 d26, #128 \n"
"vmov.u16 q14, #74 \n"
"vmov.u16 q15, #16 \n"
......@@ -270,8 +270,8 @@ void I422ToRGBARow_NEON(const uint8* src_y,
uint8* dst_rgba,
int width) {
asm volatile (
"vld1.u8 {d24}, [%5] \n"
"vld1.u8 {d25}, [%6] \n"
"vld1.8 {d24}, [%5] \n"
"vld1.8 {d25}, [%6] \n"
"vmov.u8 d26, #128 \n"
"vmov.u16 q14, #74 \n"
"vmov.u16 q15, #16 \n"
......@@ -301,8 +301,8 @@ void I422ToRGB24Row_NEON(const uint8* src_y,
uint8* dst_rgb24,
int width) {
asm volatile (
"vld1.u8 {d24}, [%5] \n"
"vld1.u8 {d25}, [%6] \n"
"vld1.8 {d24}, [%5] \n"
"vld1.8 {d25}, [%6] \n"
"vmov.u8 d26, #128 \n"
"vmov.u16 q14, #74 \n"
"vmov.u16 q15, #16 \n"
......@@ -331,8 +331,8 @@ void I422ToRAWRow_NEON(const uint8* src_y,
uint8* dst_raw,
int width) {
asm volatile (
"vld1.u8 {d24}, [%5] \n"
"vld1.u8 {d25}, [%6] \n"
"vld1.8 {d24}, [%5] \n"
"vld1.8 {d25}, [%6] \n"
"vmov.u8 d26, #128 \n"
"vmov.u16 q14, #74 \n"
"vmov.u16 q15, #16 \n"
......@@ -374,8 +374,8 @@ void I422ToRGB565Row_NEON(const uint8* src_y,
uint8* dst_rgb565,
int width) {
asm volatile (
"vld1.u8 {d24}, [%5] \n"
"vld1.u8 {d25}, [%6] \n"
"vld1.8 {d24}, [%5] \n"
"vld1.8 {d25}, [%6] \n"
"vmov.u8 d26, #128 \n"
"vmov.u16 q14, #74 \n"
"vmov.u16 q15, #16 \n"
......@@ -420,8 +420,8 @@ void I422ToARGB1555Row_NEON(const uint8* src_y,
uint8* dst_argb1555,
int width) {
asm volatile (
"vld1.u8 {d24}, [%5] \n"
"vld1.u8 {d25}, [%6] \n"
"vld1.8 {d24}, [%5] \n"
"vld1.8 {d25}, [%6] \n"
"vmov.u8 d26, #128 \n"
"vmov.u16 q14, #74 \n"
"vmov.u16 q15, #16 \n"
......@@ -461,8 +461,8 @@ void I422ToARGB4444Row_NEON(const uint8* src_y,
uint8* dst_argb4444,
int width) {
asm volatile (
"vld1.u8 {d24}, [%5] \n"
"vld1.u8 {d25}, [%6] \n"
"vld1.8 {d24}, [%5] \n"
"vld1.8 {d25}, [%6] \n"
"vmov.u8 d26, #128 \n"
"vmov.u16 q14, #74 \n"
"vmov.u16 q15, #16 \n"
......@@ -492,8 +492,8 @@ void YToARGBRow_NEON(const uint8* src_y,
uint8* dst_argb,
int width) {
asm volatile (
"vld1.u8 {d24}, [%3] \n"
"vld1.u8 {d25}, [%4] \n"
"vld1.8 {d24}, [%3] \n"
"vld1.8 {d25}, [%4] \n"
"vmov.u8 d26, #128 \n"
"vmov.u16 q14, #74 \n"
"vmov.u16 q15, #16 \n"
......@@ -522,7 +522,7 @@ void I400ToARGBRow_NEON(const uint8* src_y,
".p2align 2 \n"
"vmov.u8 d23, #255 \n"
"1: \n"
"vld1.u8 {d20}, [%0]! \n"
"vld1.8 {d20}, [%0]! \n"
"vmov d21, d20 \n"
"vmov d22, d20 \n"
"subs %2, %2, #8 \n"
......@@ -541,8 +541,8 @@ void NV12ToARGBRow_NEON(const uint8* src_y,
uint8* dst_argb,
int width) {
asm volatile (
"vld1.u8 {d24}, [%4] \n"
"vld1.u8 {d25}, [%5] \n"
"vld1.8 {d24}, [%4] \n"
"vld1.8 {d25}, [%5] \n"
"vmov.u8 d26, #128 \n"
"vmov.u16 q14, #74 \n"
"vmov.u16 q15, #16 \n"
......@@ -570,8 +570,8 @@ void NV21ToARGBRow_NEON(const uint8* src_y,
uint8* dst_argb,
int width) {
asm volatile (
"vld1.u8 {d24}, [%4] \n"
"vld1.u8 {d25}, [%5] \n"
"vld1.8 {d24}, [%4] \n"
"vld1.8 {d25}, [%5] \n"
"vmov.u8 d26, #128 \n"
"vmov.u16 q14, #74 \n"
"vmov.u16 q15, #16 \n"
......@@ -599,8 +599,8 @@ void NV12ToRGB565Row_NEON(const uint8* src_y,
uint8* dst_rgb565,
int width) {
asm volatile (
"vld1.u8 {d24}, [%4] \n"
"vld1.u8 {d25}, [%5] \n"
"vld1.8 {d24}, [%4] \n"
"vld1.8 {d25}, [%5] \n"
"vmov.u8 d26, #128 \n"
"vmov.u16 q14, #74 \n"
"vmov.u16 q15, #16 \n"
......@@ -628,8 +628,8 @@ void NV21ToRGB565Row_NEON(const uint8* src_y,
uint8* dst_rgb565,
int width) {
asm volatile (
"vld1.u8 {d24}, [%4] \n"
"vld1.u8 {d25}, [%5] \n"
"vld1.8 {d24}, [%4] \n"
"vld1.8 {d25}, [%5] \n"
"vmov.u8 d26, #128 \n"
"vmov.u16 q14, #74 \n"
"vmov.u16 q15, #16 \n"
......@@ -656,8 +656,8 @@ void YUY2ToARGBRow_NEON(const uint8* src_yuy2,
uint8* dst_argb,
int width) {
asm volatile (
"vld1.u8 {d24}, [%3] \n"
"vld1.u8 {d25}, [%4] \n"
"vld1.8 {d24}, [%3] \n"
"vld1.8 {d25}, [%4] \n"
"vmov.u8 d26, #128 \n"
"vmov.u16 q14, #74 \n"
"vmov.u16 q15, #16 \n"
......@@ -683,8 +683,8 @@ void UYVYToARGBRow_NEON(const uint8* src_uyvy,
uint8* dst_argb,
int width) {
asm volatile (
"vld1.u8 {d24}, [%3] \n"
"vld1.u8 {d25}, [%4] \n"
"vld1.8 {d24}, [%3] \n"
"vld1.8 {d25}, [%4] \n"
"vmov.u8 d26, #128 \n"
"vmov.u16 q14, #74 \n"
"vmov.u16 q15, #16 \n"
......@@ -712,10 +712,10 @@ void SplitUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
asm volatile (
".p2align 2 \n"
"1: \n"
"vld2.u8 {q0, q1}, [%0]! \n" // load 16 pairs of UV
"vld2.8 {q0, q1}, [%0]! \n" // load 16 pairs of UV
"subs %3, %3, #16 \n" // 16 processed per loop
"vst1.u8 {q0}, [%1]! \n" // store U
"vst1.u8 {q1}, [%2]! \n" // store V
"vst1.8 {q0}, [%1]! \n" // store U
"vst1.8 {q1}, [%2]! \n" // store V
"bgt 1b \n"
: "+r"(src_uv), // %0
"+r"(dst_u), // %1
......@@ -732,8 +732,8 @@ void MergeUVRow_NEON(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
asm volatile (
".p2align 2 \n"
"1: \n"
"vld1.u8 {q0}, [%0]! \n" // load U
"vld1.u8 {q1}, [%1]! \n" // load V
"vld1.8 {q0}, [%0]! \n" // load U
"vld1.8 {q1}, [%1]! \n" // load V
"subs %3, %3, #16 \n" // 16 processed per loop
"vst2.u8 {q0, q1}, [%2]! \n" // store 16 pairs of UV
"bgt 1b \n"
......@@ -747,14 +747,14 @@ void MergeUVRow_NEON(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
);
}
// Copy multiple of 32. vld4.u8 allow unaligned and is fastest on a15.
// Copy multiple of 32. vld4.8 allow unaligned and is fastest on a15.
void CopyRow_NEON(const uint8* src, uint8* dst, int count) {
asm volatile (
".p2align 2 \n"
"1: \n"
"vld1.u8 {d0, d1, d2, d3}, [%0]! \n" // load 32
"vld1.8 {d0, d1, d2, d3}, [%0]! \n" // load 32
"subs %2, %2, #32 \n" // 32 processed per loop
"vst1.u8 {d0, d1, d2, d3}, [%1]! \n" // store 32
"vst1.8 {d0, d1, d2, d3}, [%1]! \n" // store 32
"bgt 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
......@@ -770,7 +770,7 @@ void SetRow_NEON(uint8* dst, uint32 v32, int count) {
"vdup.u32 q0, %2 \n" // duplicate 4 ints
"1: \n"
"subs %1, %1, #16 \n" // 16 bytes per loop
"vst1.u8 {q0}, [%0]! \n" // store
"vst1.8 {q0}, [%0]! \n" // store
"bgt 1b \n"
: "+r"(dst), // %0
"+r"(count) // %1
......@@ -1037,9 +1037,9 @@ void YUY2ToYRow_NEON(const uint8* src_yuy2, uint8* dst_y, int pix) {
asm volatile (
".p2align 2 \n"
"1: \n"
"vld2.u8 {q0, q1}, [%0]! \n" // load 16 pixels of YUY2.
"vld2.8 {q0, q1}, [%0]! \n" // load 16 pixels of YUY2.
"subs %2, %2, #16 \n" // 16 processed per loop.
"vst1.u8 {q0}, [%1]! \n" // store 16 pixels of Y.
"vst1.8 {q0}, [%1]! \n" // store 16 pixels of Y.
"bgt 1b \n"
: "+r"(src_yuy2), // %0
"+r"(dst_y), // %1
......@@ -1053,9 +1053,9 @@ void UYVYToYRow_NEON(const uint8* src_uyvy, uint8* dst_y, int pix) {
asm volatile (
".p2align 2 \n"
"1: \n"
"vld2.u8 {q0, q1}, [%0]! \n" // load 16 pixels of UYVY.
"vld2.8 {q0, q1}, [%0]! \n" // load 16 pixels of UYVY.
"subs %2, %2, #16 \n" // 16 processed per loop.
"vst1.u8 {q1}, [%1]! \n" // store 16 pixels of Y.
"vst1.8 {q1}, [%1]! \n" // store 16 pixels of Y.
"bgt 1b \n"
: "+r"(src_uyvy), // %0
"+r"(dst_y), // %1
......@@ -1072,8 +1072,8 @@ void YUY2ToUV422Row_NEON(const uint8* src_yuy2, uint8* dst_u, uint8* dst_v,
"1: \n"
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of YUY2.
"subs %3, %3, #16 \n" // 16 pixels = 8 UVs.
"vst1.u8 {d1}, [%1]! \n" // store 8 U.
"vst1.u8 {d3}, [%2]! \n" // store 8 V.
"vst1.8 {d1}, [%1]! \n" // store 8 U.
"vst1.8 {d3}, [%2]! \n" // store 8 V.
"bgt 1b \n"
: "+r"(src_yuy2), // %0
"+r"(dst_u), // %1
......@@ -1091,8 +1091,8 @@ void UYVYToUV422Row_NEON(const uint8* src_uyvy, uint8* dst_u, uint8* dst_v,
"1: \n"
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of UYVY.
"subs %3, %3, #16 \n" // 16 pixels = 8 UVs.
"vst1.u8 {d0}, [%1]! \n" // store 8 U.
"vst1.u8 {d2}, [%2]! \n" // store 8 V.
"vst1.8 {d0}, [%1]! \n" // store 8 U.
"vst1.8 {d2}, [%2]! \n" // store 8 V.
"bgt 1b \n"
: "+r"(src_uyvy), // %0
"+r"(dst_u), // %1
......@@ -1114,8 +1114,8 @@ void YUY2ToUVRow_NEON(const uint8* src_yuy2, int stride_yuy2,
"vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load next row YUY2.
"vrhadd.u8 d1, d1, d5 \n" // average rows of U
"vrhadd.u8 d3, d3, d7 \n" // average rows of V
"vst1.u8 {d1}, [%2]! \n" // store 8 U.
"vst1.u8 {d3}, [%3]! \n" // store 8 V.
"vst1.8 {d1}, [%2]! \n" // store 8 U.
"vst1.8 {d3}, [%3]! \n" // store 8 V.
"bgt 1b \n"
: "+r"(src_yuy2), // %0
"+r"(stride_yuy2), // %1
......@@ -1138,8 +1138,8 @@ void UYVYToUVRow_NEON(const uint8* src_uyvy, int stride_uyvy,
"vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load next row UYVY.
"vrhadd.u8 d0, d0, d4 \n" // average rows of U
"vrhadd.u8 d2, d2, d6 \n" // average rows of V
"vst1.u8 {d0}, [%2]! \n" // store 8 U.
"vst1.u8 {d2}, [%3]! \n" // store 8 V.
"vst1.8 {d0}, [%2]! \n" // store 8 U.
"vst1.8 {d2}, [%3]! \n" // store 8 V.
"bgt 1b \n"
: "+r"(src_uyvy), // %0
"+r"(stride_uyvy), // %1
......@@ -1157,11 +1157,11 @@ void HalfRow_NEON(const uint8* src_uv, int src_uv_stride,
// change the stride to row 2 pointer
"add %1, %0 \n"
"1: \n"
"vld1.u8 {q0}, [%0]! \n" // load row 1 16 pixels.
"vld1.8 {q0}, [%0]! \n" // load row 1 16 pixels.
"subs %3, %3, #16 \n" // 16 processed per loop
"vld1.u8 {q1}, [%1]! \n" // load row 2 16 pixels.
"vld1.8 {q1}, [%1]! \n" // load row 2 16 pixels.
"vrhadd.u8 q0, q1 \n" // average row 1 and 2
"vst1.u8 {q0}, [%2]! \n"
"vst1.8 {q0}, [%2]! \n"
"bgt 1b \n"
: "+r"(src_uv), // %0
"+r"(src_uv_stride), // %1
......@@ -1178,12 +1178,12 @@ void ARGBToBayerRow_NEON(const uint8* src_argb, uint8* dst_bayer,
asm volatile (
"vmov.u32 d6[0], %3 \n" // selector
"1: \n"
"vld1.u8 {q0, q1}, [%0]! \n" // load row 8 pixels.
"vld1.8 {q0, q1}, [%0]! \n" // load row 8 pixels.
"subs %2, %2, #8 \n" // 8 processed per loop
"vtbl.8 d4, {d0, d1}, d6 \n" // look up 4 pixels
"vtbl.8 d5, {d2, d3}, d6 \n" // look up 4 pixels
"vtrn.u32 d4, d5 \n" // combine 8 pixels
"vst1.u8 {d4}, [%1]! \n" // store 8.
"vst1.8 {d4}, [%1]! \n" // store 8.
"bgt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_bayer), // %1
......@@ -1197,13 +1197,13 @@ void ARGBToBayerRow_NEON(const uint8* src_argb, uint8* dst_bayer,
void ARGBShuffleRow_NEON(const uint8* src_argb, uint8* dst_argb,
const uint8* shuffler, int pix) {
asm volatile (
"vld1.u8 {q2}, [%3] \n" // shuffler
"vld1.8 {q2}, [%3] \n" // shuffler
"1: \n"
"vld1.u8 {q0}, [%0]! \n" // load 4 pixels.
"vld1.8 {q0}, [%0]! \n" // load 4 pixels.
"subs %2, %2, #4 \n" // 4 processed per loop
"vtbl.8 d2, {d0, d1}, d4 \n" // look up 2 first pixels
"vtbl.8 d3, {d0, d1}, d5 \n" // look up 2 next pixels
"vst1.u8 {q1}, [%1]! \n" // store 4.
"vst1.8 {q1}, [%1]! \n" // store 4.
"bgt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
......@@ -1224,7 +1224,7 @@ void I422ToYUY2Row_NEON(const uint8* src_y,
"vld1.8 {d1}, [%1]! \n" // load 8 Us
"vld1.8 {d3}, [%2]! \n" // load 8 Vs
"subs %4, %4, #16 \n" // 16 pixels
"vst4.u8 {d0, d1, d2, d3}, [%3]! \n" // Store 8 YUY2/16 pixels.
"vst4.8 {d0, d1, d2, d3}, [%3]! \n" // Store 8 YUY2/16 pixels.
"bgt 1b \n"
: "+r"(src_y), // %0
"+r"(src_u), // %1
......@@ -1247,7 +1247,7 @@ void I422ToUYVYRow_NEON(const uint8* src_y,
"vld1.8 {d0}, [%1]! \n" // load 8 Us
"vld1.8 {d2}, [%2]! \n" // load 8 Vs
"subs %4, %4, #16 \n" // 16 pixels
"vst4.u8 {d0, d1, d2, d3}, [%3]! \n" // Store 8 UYVY/16 pixels.
"vst4.8 {d0, d1, d2, d3}, [%3]! \n" // Store 8 UYVY/16 pixels.
"bgt 1b \n"
: "+r"(src_y), // %0
"+r"(src_u), // %1
......@@ -2181,8 +2181,8 @@ void InterpolateRow_NEON(uint8* dst_ptr,
"vdup.8 d4, %4 \n"
// General purpose row blend.
"1: \n"
"vld1.u8 {q0}, [%1]! \n"
"vld1.u8 {q1}, [%2]! \n"
"vld1.8 {q0}, [%1]! \n"
"vld1.8 {q1}, [%2]! \n"
"subs %3, %3, #16 \n"
"vmull.u8 q13, d0, d4 \n"
"vmull.u8 q14, d1, d4 \n"
......@@ -2190,47 +2190,47 @@ void InterpolateRow_NEON(uint8* dst_ptr,
"vmlal.u8 q14, d3, d5 \n"
"vrshrn.u16 d0, q13, #8 \n"
"vrshrn.u16 d1, q14, #8 \n"
"vst1.u8 {q0}, [%0]! \n"
"vst1.8 {q0}, [%0]! \n"
"bgt 1b \n"
"b 99f \n"
// Blend 25 / 75.
"25: \n"
"vld1.u8 {q0}, [%1]! \n"
"vld1.u8 {q1}, [%2]! \n"
"vld1.8 {q0}, [%1]! \n"
"vld1.8 {q1}, [%2]! \n"
"subs %3, %3, #16 \n"
"vrhadd.u8 q0, q1 \n"
"vrhadd.u8 q0, q1 \n"
"vst1.u8 {q0}, [%0]! \n"
"vst1.8 {q0}, [%0]! \n"
"bgt 25b \n"
"b 99f \n"
// Blend 50 / 50.
"50: \n"
"vld1.u8 {q0}, [%1]! \n"
"vld1.u8 {q1}, [%2]! \n"
"vld1.8 {q0}, [%1]! \n"
"vld1.8 {q1}, [%2]! \n"
"subs %3, %3, #16 \n"
"vrhadd.u8 q0, q1 \n"
"vst1.u8 {q0}, [%0]! \n"
"vst1.8 {q0}, [%0]! \n"
"bgt 50b \n"
"b 99f \n"
// Blend 75 / 25.
"75: \n"
"vld1.u8 {q1}, [%1]! \n"
"vld1.u8 {q0}, [%2]! \n"
"vld1.8 {q1}, [%1]! \n"
"vld1.8 {q0}, [%2]! \n"
"subs %3, %3, #16 \n"
"vrhadd.u8 q0, q1 \n"
"vrhadd.u8 q0, q1 \n"
"vst1.u8 {q0}, [%0]! \n"
"vst1.8 {q0}, [%0]! \n"
"bgt 75b \n"
"b 99f \n"
// Blend 100 / 0 - Copy row unchanged.
"100: \n"
"vld1.u8 {q0}, [%1]! \n"
"vld1.8 {q0}, [%1]! \n"
"subs %3, %3, #16 \n"
"vst1.u8 {q0}, [%0]! \n"
"vst1.8 {q0}, [%0]! \n"
"bgt 100b \n"
"99: \n"
......@@ -2478,7 +2478,7 @@ void ARGBSepiaRow_NEON(uint8* dst_argb, int width) {
void ARGBColorMatrixRow_NEON(uint8* dst_argb, const int8* matrix_argb,
int width) {
asm volatile (
"vld1.u8 {q2}, [%2] \n" // load 3 ARGB vectors.
"vld1.8 {q2}, [%2] \n" // load 3 ARGB vectors.
"vmovl.s8 q0, d4 \n" // B,G coefficients s16.
"vmovl.s8 q1, d5 \n" // R coefficients s16.
......@@ -2670,22 +2670,22 @@ void SobelXRow_NEON(const uint8* src_y0, const uint8* src_y1,
asm volatile (
".p2align 2 \n"
"1: \n"
"vld1.u8 {d0}, [%0],%5 \n" // top
"vld1.u8 {d1}, [%0],%6 \n"
"vld1.8 {d0}, [%0],%5 \n" // top
"vld1.8 {d1}, [%0],%6 \n"
"vsubl.u8 q0, d0, d1 \n"
"vld1.u8 {d2}, [%1],%5 \n" // center * 2
"vld1.u8 {d3}, [%1],%6 \n"
"vld1.8 {d2}, [%1],%5 \n" // center * 2
"vld1.8 {d3}, [%1],%6 \n"
"vsubl.u8 q1, d2, d3 \n"
"vadd.s16 q0, q0, q1 \n"
"vadd.s16 q0, q0, q1 \n"
"vld1.u8 {d2}, [%2],%5 \n" // bottom
"vld1.u8 {d3}, [%2],%6 \n"
"vld1.8 {d2}, [%2],%5 \n" // bottom
"vld1.8 {d3}, [%2],%6 \n"
"subs %4, %4, #8 \n" // 8 pixels
"vsubl.u8 q1, d2, d3 \n"
"vadd.s16 q0, q0, q1 \n"
"vabs.s16 q0, q0 \n"
"vqmovn.u16 d0, q0 \n"
"vst1.u8 {d0}, [%3]! \n" // store 8 sobelx
"vst1.8 {d0}, [%3]! \n" // store 8 sobelx
"bgt 1b \n"
: "+r"(src_y0), // %0
"+r"(src_y1), // %1
......@@ -2707,22 +2707,22 @@ void SobelYRow_NEON(const uint8* src_y0, const uint8* src_y1,
asm volatile (
".p2align 2 \n"
"1: \n"
"vld1.u8 {d0}, [%0],%4 \n" // left
"vld1.u8 {d1}, [%1],%4 \n"
"vld1.8 {d0}, [%0],%4 \n" // left
"vld1.8 {d1}, [%1],%4 \n"
"vsubl.u8 q0, d0, d1 \n"
"vld1.u8 {d2}, [%0],%4 \n" // center * 2
"vld1.u8 {d3}, [%1],%4 \n"
"vld1.8 {d2}, [%0],%4 \n" // center * 2
"vld1.8 {d3}, [%1],%4 \n"
"vsubl.u8 q1, d2, d3 \n"
"vadd.s16 q0, q0, q1 \n"
"vadd.s16 q0, q0, q1 \n"
"vld1.u8 {d2}, [%0],%5 \n" // right
"vld1.u8 {d3}, [%1],%5 \n"
"vld1.8 {d2}, [%0],%5 \n" // right
"vld1.8 {d3}, [%1],%5 \n"
"subs %3, %3, #8 \n" // 8 pixels
"vsubl.u8 q1, d2, d3 \n"
"vadd.s16 q0, q0, q1 \n"
"vabs.s16 q0, q0 \n"
"vqmovn.u16 d0, q0 \n"
"vst1.u8 {d0}, [%2]! \n" // store 8 sobely
"vst1.8 {d0}, [%2]! \n" // store 8 sobely
"bgt 1b \n"
: "+r"(src_y0), // %0
"+r"(src_y1), // %1
......
......@@ -24,11 +24,11 @@ void ScaleARGBRowDown2_NEON(const uint8* src_ptr, ptrdiff_t /* src_stride */,
asm volatile (
"1: \n"
// load even pixels into q0, odd into q1
"vld2.u32 {q0, q1}, [%0]! \n"
"vld2.u32 {q2, q3}, [%0]! \n"
"vld2.32 {q0, q1}, [%0]! \n"
"vld2.32 {q2, q3}, [%0]! \n"
"subs %2, %2, #8 \n" // 8 processed per loop
"vst1.u8 {q1}, [%1]! \n" // store odd pixels
"vst1.u8 {q3}, [%1]! \n"
"vst1.8 {q1}, [%1]! \n" // store odd pixels
"vst1.8 {q3}, [%1]! \n"
"bgt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst), // %1
......@@ -61,7 +61,7 @@ void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
"vrshrn.u16 d1, q1, #2 \n"
"vrshrn.u16 d2, q2, #2 \n"
"vrshrn.u16 d3, q3, #2 \n"
"vst4.u8 {d0, d1, d2, d3}, [%2]! \n"
"vst4.8 {d0, d1, d2, d3}, [%2]! \n"
"bgt 1b \n"
: "+r"(src_ptr), // %0
"+r"(src_stride), // %1
......
......@@ -27,9 +27,9 @@ void ScaleRowDown2_NEON(const uint8* src_ptr, ptrdiff_t /* src_stride */,
asm volatile (
"1: \n"
// load even pixels into q0, odd into q1
"vld2.u8 {q0, q1}, [%0]! \n"
"vld2.8 {q0, q1}, [%0]! \n"
"subs %2, %2, #16 \n" // 16 processed per loop
"vst1.u8 {q1}, [%1]! \n" // store odd pixels
"vst1.8 {q1}, [%1]! \n" // store odd pixels
"bgt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst), // %1
......@@ -45,8 +45,8 @@ void ScaleRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
// change the stride to row 2 pointer
"add %1, %0 \n"
"1: \n"
"vld1.u8 {q0, q1}, [%0]! \n" // load row 1 and post inc
"vld1.u8 {q2, q3}, [%1]! \n" // load row 2 and post inc
"vld1.8 {q0, q1}, [%0]! \n" // load row 1 and post inc
"vld1.8 {q2, q3}, [%1]! \n" // load row 2 and post inc
"subs %3, %3, #16 \n" // 16 processed per loop
"vpaddl.u8 q0, q0 \n" // row 1 add adjacent
"vpaddl.u8 q1, q1 \n"
......@@ -54,7 +54,7 @@ void ScaleRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
"vpadal.u8 q1, q3 \n"
"vrshrn.u16 d0, q0, #2 \n" // downshift, round and pack
"vrshrn.u16 d1, q1, #2 \n"
"vst1.u8 {q0}, [%2]! \n"
"vst1.8 {q0}, [%2]! \n"
"bgt 1b \n"
: "+r"(src_ptr), // %0
"+r"(src_stride), // %1
......@@ -69,9 +69,9 @@ void ScaleRowDown4_NEON(const uint8* src_ptr, ptrdiff_t /* src_stride */,
uint8* dst_ptr, int dst_width) {
asm volatile (
"1: \n"
"vld4.u8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
"subs %2, %2, #8 \n" // 8 processed per loop
"vst1.u8 {d2}, [%1]! \n"
"vst1.8 {d2}, [%1]! \n"
"bgt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
......@@ -88,10 +88,10 @@ void ScaleRowDown4Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
"add r5, r4, %3 \n"
"add %3, r5, %3 \n"
"1: \n"
"vld1.u8 {q0}, [%0]! \n" // load up 16x4
"vld1.u8 {q1}, [r4]! \n"
"vld1.u8 {q2}, [r5]! \n"
"vld1.u8 {q3}, [%3]! \n"
"vld1.8 {q0}, [%0]! \n" // load up 16x4
"vld1.8 {q1}, [r4]! \n"
"vld1.8 {q2}, [r5]! \n"
"vld1.8 {q3}, [%3]! \n"
"subs %2, %2, #4 \n"
"vpaddl.u8 q0, q0 \n"
"vpadal.u8 q0, q1 \n"
......@@ -100,7 +100,7 @@ void ScaleRowDown4Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
"vpaddl.u16 q0, q0 \n"
"vrshrn.u32 d0, q0, #4 \n" // divide by 16 w/rounding
"vmovn.u16 d0, q0 \n"
"vst1.u32 {d0[0]}, [%1]! \n"
"vst1.32 {d0[0]}, [%1]! \n"
"bgt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
......@@ -118,10 +118,10 @@ void ScaleRowDown34_NEON(const uint8* src_ptr,
uint8* dst_ptr, int dst_width) {
asm volatile (
"1: \n"
"vld4.u8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
"subs %2, %2, #24 \n"
"vmov d2, d3 \n" // order d0, d1, d2
"vst3.u8 {d0, d1, d2}, [%1]! \n"
"vst3.8 {d0, d1, d2}, [%1]! \n"
"bgt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
......@@ -138,8 +138,8 @@ void ScaleRowDown34_0_Box_NEON(const uint8* src_ptr,
"vmov.u8 d24, #3 \n"
"add %3, %0 \n"
"1: \n"
"vld4.u8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
"vld4.u8 {d4, d5, d6, d7}, [%3]! \n" // src line 1
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
"vld4.8 {d4, d5, d6, d7}, [%3]! \n" // src line 1
"subs %2, %2, #24 \n"
// filter src line 0 with src line 1
......@@ -175,7 +175,7 @@ void ScaleRowDown34_0_Box_NEON(const uint8* src_ptr,
"vmlal.u8 q8, d3, d24 \n"
"vqrshrn.u16 d2, q8, #2 \n"
"vst3.u8 {d0, d1, d2}, [%1]! \n"
"vst3.8 {d0, d1, d2}, [%1]! \n"
"bgt 1b \n"
: "+r"(src_ptr), // %0
......@@ -194,8 +194,8 @@ void ScaleRowDown34_1_Box_NEON(const uint8* src_ptr,
"vmov.u8 d24, #3 \n"
"add %3, %0 \n"
"1: \n"
"vld4.u8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
"vld4.u8 {d4, d5, d6, d7}, [%3]! \n" // src line 1
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
"vld4.8 {d4, d5, d6, d7}, [%3]! \n" // src line 1
"subs %2, %2, #24 \n"
// average src line 0 with src line 1
"vrhadd.u8 q0, q0, q2 \n"
......@@ -214,7 +214,7 @@ void ScaleRowDown34_1_Box_NEON(const uint8* src_ptr,
"vmlal.u8 q3, d3, d24 \n"
"vqrshrn.u16 d2, q3, #2 \n"
"vst3.u8 {d0, d1, d2}, [%1]! \n"
"vst3.8 {d0, d1, d2}, [%1]! \n"
"bgt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
......@@ -242,14 +242,14 @@ void ScaleRowDown38_NEON(const uint8* src_ptr,
ptrdiff_t /* src_stride */,
uint8* dst_ptr, int dst_width) {
asm volatile (
"vld1.u8 {q3}, [%3] \n"
"vld1.8 {q3}, [%3] \n"
"1: \n"
"vld1.u8 {d0, d1, d2, d3}, [%0]! \n"
"vld1.8 {d0, d1, d2, d3}, [%0]! \n"
"subs %2, %2, #12 \n"
"vtbl.u8 d4, {d0, d1, d2, d3}, d6 \n"
"vtbl.u8 d5, {d0, d1, d2, d3}, d7 \n"
"vst1.u8 {d4}, [%1]! \n"
"vst1.u32 {d5[0]}, [%1]! \n"
"vst1.8 {d4}, [%1]! \n"
"vst1.32 {d5[0]}, [%1]! \n"
"bgt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
......@@ -264,9 +264,9 @@ void OMITFP ScaleRowDown38_3_Box_NEON(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
asm volatile (
"vld1.u16 {q13}, [%4] \n"
"vld1.u8 {q14}, [%5] \n"
"vld1.u8 {q15}, [%6] \n"
"vld1.16 {q13}, [%4] \n"
"vld1.8 {q14}, [%5] \n"
"vld1.8 {q15}, [%6] \n"
"add r4, %0, %3, lsl #1 \n"
"add %3, %0 \n"
"1: \n"
......@@ -275,9 +275,9 @@ void OMITFP ScaleRowDown38_3_Box_NEON(const uint8* src_ptr,
// d1 = 10 50 11 51 12 52 13 53
// d2 = 20 60 21 61 22 62 23 63
// d3 = 30 70 31 71 32 72 33 73
"vld4.u8 {d0, d1, d2, d3}, [%0]! \n"
"vld4.u8 {d4, d5, d6, d7}, [%3]! \n"
"vld4.u8 {d16, d17, d18, d19}, [r4]! \n"
"vld4.8 {d0, d1, d2, d3}, [%0]! \n"
"vld4.8 {d4, d5, d6, d7}, [%3]! \n"
"vld4.8 {d16, d17, d18, d19}, [r4]! \n"
"subs %2, %2, #12 \n"
// Shuffle the input data around to get align the data
......@@ -354,8 +354,8 @@ void OMITFP ScaleRowDown38_3_Box_NEON(const uint8* src_ptr,
"vtbl.u8 d3, {d0, d1, d2}, d28 \n"
"vtbl.u8 d4, {d0, d1, d2}, d29 \n"
"vst1.u8 {d3}, [%1]! \n"
"vst1.u32 {d4[0]}, [%1]! \n"
"vst1.8 {d3}, [%1]! \n"
"vst1.32 {d4[0]}, [%1]! \n"
"bgt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
......@@ -374,8 +374,8 @@ void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
asm volatile (
"vld1.u16 {q13}, [%4] \n"
"vld1.u8 {q14}, [%5] \n"
"vld1.16 {q13}, [%4] \n"
"vld1.8 {q14}, [%5] \n"
"add %3, %0 \n"
"1: \n"
......@@ -383,8 +383,8 @@ void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr,
// d1 = 10 50 11 51 12 52 13 53
// d2 = 20 60 21 61 22 62 23 63
// d3 = 30 70 31 71 32 72 33 73
"vld4.u8 {d0, d1, d2, d3}, [%0]! \n"
"vld4.u8 {d4, d5, d6, d7}, [%3]! \n"
"vld4.8 {d0, d1, d2, d3}, [%0]! \n"
"vld4.8 {d4, d5, d6, d7}, [%3]! \n"
"subs %2, %2, #12 \n"
// Shuffle the input data around to get align the data
......@@ -450,8 +450,8 @@ void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr,
"vtbl.u8 d3, {d0, d1, d2}, d28 \n"
"vtbl.u8 d4, {d0, d1, d2}, d29 \n"
"vst1.u8 {d3}, [%1]! \n"
"vst1.u32 {d4[0]}, [%1]! \n"
"vst1.8 {d3}, [%1]! \n"
"vst1.32 {d4[0]}, [%1]! \n"
"bgt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
......@@ -483,8 +483,8 @@ void ScaleFilterRows_NEON(uint8* dst_ptr,
"vdup.8 d4, %4 \n"
// General purpose row blend.
"1: \n"
"vld1.u8 {q0}, [%1]! \n"
"vld1.u8 {q1}, [%2]! \n"
"vld1.8 {q0}, [%1]! \n"
"vld1.8 {q1}, [%2]! \n"
"subs %3, %3, #16 \n"
"vmull.u8 q13, d0, d4 \n"
"vmull.u8 q14, d1, d4 \n"
......@@ -492,51 +492,51 @@ void ScaleFilterRows_NEON(uint8* dst_ptr,
"vmlal.u8 q14, d3, d5 \n"
"vrshrn.u16 d0, q13, #8 \n"
"vrshrn.u16 d1, q14, #8 \n"
"vst1.u8 {q0}, [%0]! \n"
"vst1.8 {q0}, [%0]! \n"
"bgt 1b \n"
"b 99f \n"
// Blend 25 / 75.
"25: \n"
"vld1.u8 {q0}, [%1]! \n"
"vld1.u8 {q1}, [%2]! \n"
"vld1.8 {q0}, [%1]! \n"
"vld1.8 {q1}, [%2]! \n"
"subs %3, %3, #16 \n"
"vrhadd.u8 q0, q1 \n"
"vrhadd.u8 q0, q1 \n"
"vst1.u8 {q0}, [%0]! \n"
"vst1.8 {q0}, [%0]! \n"
"bgt 25b \n"
"b 99f \n"
// Blend 50 / 50.
"50: \n"
"vld1.u8 {q0}, [%1]! \n"
"vld1.u8 {q1}, [%2]! \n"
"vld1.8 {q0}, [%1]! \n"
"vld1.8 {q1}, [%2]! \n"
"subs %3, %3, #16 \n"
"vrhadd.u8 q0, q1 \n"
"vst1.u8 {q0}, [%0]! \n"
"vst1.8 {q0}, [%0]! \n"
"bgt 50b \n"
"b 99f \n"
// Blend 75 / 25.
"75: \n"
"vld1.u8 {q1}, [%1]! \n"
"vld1.u8 {q0}, [%2]! \n"
"vld1.8 {q1}, [%1]! \n"
"vld1.8 {q0}, [%2]! \n"
"subs %3, %3, #16 \n"
"vrhadd.u8 q0, q1 \n"
"vrhadd.u8 q0, q1 \n"
"vst1.u8 {q0}, [%0]! \n"
"vst1.8 {q0}, [%0]! \n"
"bgt 75b \n"
"b 99f \n"
// Blend 100 / 0 - Copy row unchanged.
"100: \n"
"vld1.u8 {q0}, [%1]! \n"
"vld1.8 {q0}, [%1]! \n"
"subs %3, %3, #16 \n"
"vst1.u8 {q0}, [%0]! \n"
"vst1.8 {q0}, [%0]! \n"
"bgt 100b \n"
"99: \n"
"vst1.u8 {d1[7]}, [%0] \n"
"vst1.8 {d1[7]}, [%0] \n"
: "+r"(dst_ptr), // %0
"+r"(src_ptr), // %1
"+r"(src_stride), // %2
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment