Commit 2c4e3993 authored by fbarchard@google.com's avatar fbarchard@google.com

Change vld and vst to specify only elements and size not sign for better arm…

Change vld and vst to specify only elements and size, not sign, for better ARM compiler compatibility.
BUG=none
TEST=none
R=kma@webrtc.org, mflodman@webrtc.org

Review URL: https://webrtc-codereview.appspot.com/1643004

git-svn-id: http://libyuv.googlecode.com/svn/trunk@721 16f28f9a-4ce2-e073-06de-1de4eb20be90
parent ae67c900
Name: libyuv Name: libyuv
URL: http://code.google.com/p/libyuv/ URL: http://code.google.com/p/libyuv/
Version: 720 Version: 721
License: BSD License: BSD
License File: LICENSE License File: LICENSE
......
...@@ -11,6 +11,6 @@ ...@@ -11,6 +11,6 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT #ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT
#define INCLUDE_LIBYUV_VERSION_H_ #define INCLUDE_LIBYUV_VERSION_H_
#define LIBYUV_VERSION 720 #define LIBYUV_VERSION 721
#endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT #endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT
...@@ -27,8 +27,8 @@ uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count) { ...@@ -27,8 +27,8 @@ uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count) {
".p2align 2 \n" ".p2align 2 \n"
"1: \n" "1: \n"
"vld1.u8 {q0}, [%0]! \n" "vld1.8 {q0}, [%0]! \n"
"vld1.u8 {q1}, [%1]! \n" "vld1.8 {q1}, [%1]! \n"
"subs %2, %2, #16 \n" "subs %2, %2, #16 \n"
"vsubl.u8 q2, d0, d2 \n" "vsubl.u8 q2, d0, d2 \n"
"vsubl.u8 q3, d1, d3 \n" "vsubl.u8 q3, d1, d3 \n"
......
...@@ -20,57 +20,57 @@ extern "C" { ...@@ -20,57 +20,57 @@ extern "C" {
// Read 8 Y, 4 U and 4 V from 422 // Read 8 Y, 4 U and 4 V from 422
#define READYUV422 \ #define READYUV422 \
"vld1.u8 {d0}, [%0]! \n" \ "vld1.8 {d0}, [%0]! \n" \
"vld1.u32 {d2[0]}, [%1]! \n" \ "vld1.32 {d2[0]}, [%1]! \n" \
"vld1.u32 {d2[1]}, [%2]! \n" "vld1.32 {d2[1]}, [%2]! \n"
// Read 8 Y, 2 U and 2 V from 422 // Read 8 Y, 2 U and 2 V from 422
#define READYUV411 \ #define READYUV411 \
"vld1.u8 {d0}, [%0]! \n" \ "vld1.8 {d0}, [%0]! \n" \
"vld1.u16 {d2[0]}, [%1]! \n" \ "vld1.16 {d2[0]}, [%1]! \n" \
"vld1.u16 {d2[1]}, [%2]! \n" \ "vld1.16 {d2[1]}, [%2]! \n" \
"vmov.u8 d3, d2 \n" \ "vmov.u8 d3, d2 \n" \
"vzip.u8 d2, d3 \n" "vzip.u8 d2, d3 \n"
// Read 8 Y, 8 U and 8 V from 444 // Read 8 Y, 8 U and 8 V from 444
#define READYUV444 \ #define READYUV444 \
"vld1.u8 {d0}, [%0]! \n" \ "vld1.8 {d0}, [%0]! \n" \
"vld1.u8 {d2}, [%1]! \n" \ "vld1.8 {d2}, [%1]! \n" \
"vld1.u8 {d3}, [%2]! \n" \ "vld1.8 {d3}, [%2]! \n" \
"vpaddl.u8 q1, q1 \n" \ "vpaddl.u8 q1, q1 \n" \
"vrshrn.u16 d2, q1, #1 \n" "vrshrn.u16 d2, q1, #1 \n"
// Read 8 Y, and set 4 U and 4 V to 128 // Read 8 Y, and set 4 U and 4 V to 128
#define READYUV400 \ #define READYUV400 \
"vld1.u8 {d0}, [%0]! \n" \ "vld1.8 {d0}, [%0]! \n" \
"vmov.u8 d2, #128 \n" "vmov.u8 d2, #128 \n"
// Read 8 Y and 4 UV from NV12 // Read 8 Y and 4 UV from NV12
#define READNV12 \ #define READNV12 \
"vld1.u8 {d0}, [%0]! \n" \ "vld1.8 {d0}, [%0]! \n" \
"vld1.u8 {d2}, [%1]! \n" \ "vld1.8 {d2}, [%1]! \n" \
"vmov.u8 d3, d2 \n"/* split odd/even uv apart */\ "vmov.u8 d3, d2 \n"/* split odd/even uv apart */\
"vuzp.u8 d2, d3 \n" \ "vuzp.u8 d2, d3 \n" \
"vtrn.u32 d2, d3 \n" "vtrn.u32 d2, d3 \n"
// Read 8 Y and 4 VU from NV21 // Read 8 Y and 4 VU from NV21
#define READNV21 \ #define READNV21 \
"vld1.u8 {d0}, [%0]! \n" \ "vld1.8 {d0}, [%0]! \n" \
"vld1.u8 {d2}, [%1]! \n" \ "vld1.8 {d2}, [%1]! \n" \
"vmov.u8 d3, d2 \n"/* split odd/even uv apart */\ "vmov.u8 d3, d2 \n"/* split odd/even uv apart */\
"vuzp.u8 d3, d2 \n" \ "vuzp.u8 d3, d2 \n" \
"vtrn.u32 d2, d3 \n" "vtrn.u32 d2, d3 \n"
// Read 8 YUY2 // Read 8 YUY2
#define READYUY2 \ #define READYUY2 \
"vld2.u8 {d0, d2}, [%0]! \n" \ "vld2.8 {d0, d2}, [%0]! \n" \
"vmov.u8 d3, d2 \n" \ "vmov.u8 d3, d2 \n" \
"vuzp.u8 d2, d3 \n" \ "vuzp.u8 d2, d3 \n" \
"vtrn.u32 d2, d3 \n" "vtrn.u32 d2, d3 \n"
// Read 8 UYVY // Read 8 UYVY
#define READUYVY \ #define READUYVY \
"vld2.u8 {d2, d3}, [%0]! \n" \ "vld2.8 {d2, d3}, [%0]! \n" \
"vmov.u8 d0, d3 \n" \ "vmov.u8 d0, d3 \n" \
"vmov.u8 d3, d2 \n" \ "vmov.u8 d3, d2 \n" \
"vuzp.u8 d2, d3 \n" \ "vuzp.u8 d2, d3 \n" \
...@@ -113,8 +113,8 @@ void I444ToARGBRow_NEON(const uint8* src_y, ...@@ -113,8 +113,8 @@ void I444ToARGBRow_NEON(const uint8* src_y,
uint8* dst_argb, uint8* dst_argb,
int width) { int width) {
asm volatile ( asm volatile (
"vld1.u8 {d24}, [%5] \n" "vld1.8 {d24}, [%5] \n"
"vld1.u8 {d25}, [%6] \n" "vld1.8 {d25}, [%6] \n"
"vmov.u8 d26, #128 \n" "vmov.u8 d26, #128 \n"
"vmov.u16 q14, #74 \n" "vmov.u16 q14, #74 \n"
"vmov.u16 q15, #16 \n" "vmov.u16 q15, #16 \n"
...@@ -144,8 +144,8 @@ void I422ToARGBRow_NEON(const uint8* src_y, ...@@ -144,8 +144,8 @@ void I422ToARGBRow_NEON(const uint8* src_y,
uint8* dst_argb, uint8* dst_argb,
int width) { int width) {
asm volatile ( asm volatile (
"vld1.u8 {d24}, [%5] \n" "vld1.8 {d24}, [%5] \n"
"vld1.u8 {d25}, [%6] \n" "vld1.8 {d25}, [%6] \n"
"vmov.u8 d26, #128 \n" "vmov.u8 d26, #128 \n"
"vmov.u16 q14, #74 \n" "vmov.u16 q14, #74 \n"
"vmov.u16 q15, #16 \n" "vmov.u16 q15, #16 \n"
...@@ -175,8 +175,8 @@ void I411ToARGBRow_NEON(const uint8* src_y, ...@@ -175,8 +175,8 @@ void I411ToARGBRow_NEON(const uint8* src_y,
uint8* dst_argb, uint8* dst_argb,
int width) { int width) {
asm volatile ( asm volatile (
"vld1.u8 {d24}, [%5] \n" "vld1.8 {d24}, [%5] \n"
"vld1.u8 {d25}, [%6] \n" "vld1.8 {d25}, [%6] \n"
"vmov.u8 d26, #128 \n" "vmov.u8 d26, #128 \n"
"vmov.u16 q14, #74 \n" "vmov.u16 q14, #74 \n"
"vmov.u16 q15, #16 \n" "vmov.u16 q15, #16 \n"
...@@ -206,8 +206,8 @@ void I422ToBGRARow_NEON(const uint8* src_y, ...@@ -206,8 +206,8 @@ void I422ToBGRARow_NEON(const uint8* src_y,
uint8* dst_bgra, uint8* dst_bgra,
int width) { int width) {
asm volatile ( asm volatile (
"vld1.u8 {d24}, [%5] \n" "vld1.8 {d24}, [%5] \n"
"vld1.u8 {d25}, [%6] \n" "vld1.8 {d25}, [%6] \n"
"vmov.u8 d26, #128 \n" "vmov.u8 d26, #128 \n"
"vmov.u16 q14, #74 \n" "vmov.u16 q14, #74 \n"
"vmov.u16 q15, #16 \n" "vmov.u16 q15, #16 \n"
...@@ -238,8 +238,8 @@ void I422ToABGRRow_NEON(const uint8* src_y, ...@@ -238,8 +238,8 @@ void I422ToABGRRow_NEON(const uint8* src_y,
uint8* dst_abgr, uint8* dst_abgr,
int width) { int width) {
asm volatile ( asm volatile (
"vld1.u8 {d24}, [%5] \n" "vld1.8 {d24}, [%5] \n"
"vld1.u8 {d25}, [%6] \n" "vld1.8 {d25}, [%6] \n"
"vmov.u8 d26, #128 \n" "vmov.u8 d26, #128 \n"
"vmov.u16 q14, #74 \n" "vmov.u16 q14, #74 \n"
"vmov.u16 q15, #16 \n" "vmov.u16 q15, #16 \n"
...@@ -270,8 +270,8 @@ void I422ToRGBARow_NEON(const uint8* src_y, ...@@ -270,8 +270,8 @@ void I422ToRGBARow_NEON(const uint8* src_y,
uint8* dst_rgba, uint8* dst_rgba,
int width) { int width) {
asm volatile ( asm volatile (
"vld1.u8 {d24}, [%5] \n" "vld1.8 {d24}, [%5] \n"
"vld1.u8 {d25}, [%6] \n" "vld1.8 {d25}, [%6] \n"
"vmov.u8 d26, #128 \n" "vmov.u8 d26, #128 \n"
"vmov.u16 q14, #74 \n" "vmov.u16 q14, #74 \n"
"vmov.u16 q15, #16 \n" "vmov.u16 q15, #16 \n"
...@@ -301,8 +301,8 @@ void I422ToRGB24Row_NEON(const uint8* src_y, ...@@ -301,8 +301,8 @@ void I422ToRGB24Row_NEON(const uint8* src_y,
uint8* dst_rgb24, uint8* dst_rgb24,
int width) { int width) {
asm volatile ( asm volatile (
"vld1.u8 {d24}, [%5] \n" "vld1.8 {d24}, [%5] \n"
"vld1.u8 {d25}, [%6] \n" "vld1.8 {d25}, [%6] \n"
"vmov.u8 d26, #128 \n" "vmov.u8 d26, #128 \n"
"vmov.u16 q14, #74 \n" "vmov.u16 q14, #74 \n"
"vmov.u16 q15, #16 \n" "vmov.u16 q15, #16 \n"
...@@ -331,8 +331,8 @@ void I422ToRAWRow_NEON(const uint8* src_y, ...@@ -331,8 +331,8 @@ void I422ToRAWRow_NEON(const uint8* src_y,
uint8* dst_raw, uint8* dst_raw,
int width) { int width) {
asm volatile ( asm volatile (
"vld1.u8 {d24}, [%5] \n" "vld1.8 {d24}, [%5] \n"
"vld1.u8 {d25}, [%6] \n" "vld1.8 {d25}, [%6] \n"
"vmov.u8 d26, #128 \n" "vmov.u8 d26, #128 \n"
"vmov.u16 q14, #74 \n" "vmov.u16 q14, #74 \n"
"vmov.u16 q15, #16 \n" "vmov.u16 q15, #16 \n"
...@@ -374,8 +374,8 @@ void I422ToRGB565Row_NEON(const uint8* src_y, ...@@ -374,8 +374,8 @@ void I422ToRGB565Row_NEON(const uint8* src_y,
uint8* dst_rgb565, uint8* dst_rgb565,
int width) { int width) {
asm volatile ( asm volatile (
"vld1.u8 {d24}, [%5] \n" "vld1.8 {d24}, [%5] \n"
"vld1.u8 {d25}, [%6] \n" "vld1.8 {d25}, [%6] \n"
"vmov.u8 d26, #128 \n" "vmov.u8 d26, #128 \n"
"vmov.u16 q14, #74 \n" "vmov.u16 q14, #74 \n"
"vmov.u16 q15, #16 \n" "vmov.u16 q15, #16 \n"
...@@ -420,8 +420,8 @@ void I422ToARGB1555Row_NEON(const uint8* src_y, ...@@ -420,8 +420,8 @@ void I422ToARGB1555Row_NEON(const uint8* src_y,
uint8* dst_argb1555, uint8* dst_argb1555,
int width) { int width) {
asm volatile ( asm volatile (
"vld1.u8 {d24}, [%5] \n" "vld1.8 {d24}, [%5] \n"
"vld1.u8 {d25}, [%6] \n" "vld1.8 {d25}, [%6] \n"
"vmov.u8 d26, #128 \n" "vmov.u8 d26, #128 \n"
"vmov.u16 q14, #74 \n" "vmov.u16 q14, #74 \n"
"vmov.u16 q15, #16 \n" "vmov.u16 q15, #16 \n"
...@@ -461,8 +461,8 @@ void I422ToARGB4444Row_NEON(const uint8* src_y, ...@@ -461,8 +461,8 @@ void I422ToARGB4444Row_NEON(const uint8* src_y,
uint8* dst_argb4444, uint8* dst_argb4444,
int width) { int width) {
asm volatile ( asm volatile (
"vld1.u8 {d24}, [%5] \n" "vld1.8 {d24}, [%5] \n"
"vld1.u8 {d25}, [%6] \n" "vld1.8 {d25}, [%6] \n"
"vmov.u8 d26, #128 \n" "vmov.u8 d26, #128 \n"
"vmov.u16 q14, #74 \n" "vmov.u16 q14, #74 \n"
"vmov.u16 q15, #16 \n" "vmov.u16 q15, #16 \n"
...@@ -492,8 +492,8 @@ void YToARGBRow_NEON(const uint8* src_y, ...@@ -492,8 +492,8 @@ void YToARGBRow_NEON(const uint8* src_y,
uint8* dst_argb, uint8* dst_argb,
int width) { int width) {
asm volatile ( asm volatile (
"vld1.u8 {d24}, [%3] \n" "vld1.8 {d24}, [%3] \n"
"vld1.u8 {d25}, [%4] \n" "vld1.8 {d25}, [%4] \n"
"vmov.u8 d26, #128 \n" "vmov.u8 d26, #128 \n"
"vmov.u16 q14, #74 \n" "vmov.u16 q14, #74 \n"
"vmov.u16 q15, #16 \n" "vmov.u16 q15, #16 \n"
...@@ -522,7 +522,7 @@ void I400ToARGBRow_NEON(const uint8* src_y, ...@@ -522,7 +522,7 @@ void I400ToARGBRow_NEON(const uint8* src_y,
".p2align 2 \n" ".p2align 2 \n"
"vmov.u8 d23, #255 \n" "vmov.u8 d23, #255 \n"
"1: \n" "1: \n"
"vld1.u8 {d20}, [%0]! \n" "vld1.8 {d20}, [%0]! \n"
"vmov d21, d20 \n" "vmov d21, d20 \n"
"vmov d22, d20 \n" "vmov d22, d20 \n"
"subs %2, %2, #8 \n" "subs %2, %2, #8 \n"
...@@ -541,8 +541,8 @@ void NV12ToARGBRow_NEON(const uint8* src_y, ...@@ -541,8 +541,8 @@ void NV12ToARGBRow_NEON(const uint8* src_y,
uint8* dst_argb, uint8* dst_argb,
int width) { int width) {
asm volatile ( asm volatile (
"vld1.u8 {d24}, [%4] \n" "vld1.8 {d24}, [%4] \n"
"vld1.u8 {d25}, [%5] \n" "vld1.8 {d25}, [%5] \n"
"vmov.u8 d26, #128 \n" "vmov.u8 d26, #128 \n"
"vmov.u16 q14, #74 \n" "vmov.u16 q14, #74 \n"
"vmov.u16 q15, #16 \n" "vmov.u16 q15, #16 \n"
...@@ -570,8 +570,8 @@ void NV21ToARGBRow_NEON(const uint8* src_y, ...@@ -570,8 +570,8 @@ void NV21ToARGBRow_NEON(const uint8* src_y,
uint8* dst_argb, uint8* dst_argb,
int width) { int width) {
asm volatile ( asm volatile (
"vld1.u8 {d24}, [%4] \n" "vld1.8 {d24}, [%4] \n"
"vld1.u8 {d25}, [%5] \n" "vld1.8 {d25}, [%5] \n"
"vmov.u8 d26, #128 \n" "vmov.u8 d26, #128 \n"
"vmov.u16 q14, #74 \n" "vmov.u16 q14, #74 \n"
"vmov.u16 q15, #16 \n" "vmov.u16 q15, #16 \n"
...@@ -599,8 +599,8 @@ void NV12ToRGB565Row_NEON(const uint8* src_y, ...@@ -599,8 +599,8 @@ void NV12ToRGB565Row_NEON(const uint8* src_y,
uint8* dst_rgb565, uint8* dst_rgb565,
int width) { int width) {
asm volatile ( asm volatile (
"vld1.u8 {d24}, [%4] \n" "vld1.8 {d24}, [%4] \n"
"vld1.u8 {d25}, [%5] \n" "vld1.8 {d25}, [%5] \n"
"vmov.u8 d26, #128 \n" "vmov.u8 d26, #128 \n"
"vmov.u16 q14, #74 \n" "vmov.u16 q14, #74 \n"
"vmov.u16 q15, #16 \n" "vmov.u16 q15, #16 \n"
...@@ -628,8 +628,8 @@ void NV21ToRGB565Row_NEON(const uint8* src_y, ...@@ -628,8 +628,8 @@ void NV21ToRGB565Row_NEON(const uint8* src_y,
uint8* dst_rgb565, uint8* dst_rgb565,
int width) { int width) {
asm volatile ( asm volatile (
"vld1.u8 {d24}, [%4] \n" "vld1.8 {d24}, [%4] \n"
"vld1.u8 {d25}, [%5] \n" "vld1.8 {d25}, [%5] \n"
"vmov.u8 d26, #128 \n" "vmov.u8 d26, #128 \n"
"vmov.u16 q14, #74 \n" "vmov.u16 q14, #74 \n"
"vmov.u16 q15, #16 \n" "vmov.u16 q15, #16 \n"
...@@ -656,8 +656,8 @@ void YUY2ToARGBRow_NEON(const uint8* src_yuy2, ...@@ -656,8 +656,8 @@ void YUY2ToARGBRow_NEON(const uint8* src_yuy2,
uint8* dst_argb, uint8* dst_argb,
int width) { int width) {
asm volatile ( asm volatile (
"vld1.u8 {d24}, [%3] \n" "vld1.8 {d24}, [%3] \n"
"vld1.u8 {d25}, [%4] \n" "vld1.8 {d25}, [%4] \n"
"vmov.u8 d26, #128 \n" "vmov.u8 d26, #128 \n"
"vmov.u16 q14, #74 \n" "vmov.u16 q14, #74 \n"
"vmov.u16 q15, #16 \n" "vmov.u16 q15, #16 \n"
...@@ -683,8 +683,8 @@ void UYVYToARGBRow_NEON(const uint8* src_uyvy, ...@@ -683,8 +683,8 @@ void UYVYToARGBRow_NEON(const uint8* src_uyvy,
uint8* dst_argb, uint8* dst_argb,
int width) { int width) {
asm volatile ( asm volatile (
"vld1.u8 {d24}, [%3] \n" "vld1.8 {d24}, [%3] \n"
"vld1.u8 {d25}, [%4] \n" "vld1.8 {d25}, [%4] \n"
"vmov.u8 d26, #128 \n" "vmov.u8 d26, #128 \n"
"vmov.u16 q14, #74 \n" "vmov.u16 q14, #74 \n"
"vmov.u16 q15, #16 \n" "vmov.u16 q15, #16 \n"
...@@ -712,10 +712,10 @@ void SplitUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v, ...@@ -712,10 +712,10 @@ void SplitUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
asm volatile ( asm volatile (
".p2align 2 \n" ".p2align 2 \n"
"1: \n" "1: \n"
"vld2.u8 {q0, q1}, [%0]! \n" // load 16 pairs of UV "vld2.8 {q0, q1}, [%0]! \n" // load 16 pairs of UV
"subs %3, %3, #16 \n" // 16 processed per loop "subs %3, %3, #16 \n" // 16 processed per loop
"vst1.u8 {q0}, [%1]! \n" // store U "vst1.8 {q0}, [%1]! \n" // store U
"vst1.u8 {q1}, [%2]! \n" // store V "vst1.8 {q1}, [%2]! \n" // store V
"bgt 1b \n" "bgt 1b \n"
: "+r"(src_uv), // %0 : "+r"(src_uv), // %0
"+r"(dst_u), // %1 "+r"(dst_u), // %1
...@@ -732,8 +732,8 @@ void MergeUVRow_NEON(const uint8* src_u, const uint8* src_v, uint8* dst_uv, ...@@ -732,8 +732,8 @@ void MergeUVRow_NEON(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
asm volatile ( asm volatile (
".p2align 2 \n" ".p2align 2 \n"
"1: \n" "1: \n"
"vld1.u8 {q0}, [%0]! \n" // load U "vld1.8 {q0}, [%0]! \n" // load U
"vld1.u8 {q1}, [%1]! \n" // load V "vld1.8 {q1}, [%1]! \n" // load V
"subs %3, %3, #16 \n" // 16 processed per loop "subs %3, %3, #16 \n" // 16 processed per loop
"vst2.u8 {q0, q1}, [%2]! \n" // store 16 pairs of UV "vst2.u8 {q0, q1}, [%2]! \n" // store 16 pairs of UV
"bgt 1b \n" "bgt 1b \n"
...@@ -747,14 +747,14 @@ void MergeUVRow_NEON(const uint8* src_u, const uint8* src_v, uint8* dst_uv, ...@@ -747,14 +747,14 @@ void MergeUVRow_NEON(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
); );
} }
// Copy multiple of 32. vld4.u8 allow unaligned and is fastest on a15. // Copy multiple of 32. vld4.8 allow unaligned and is fastest on a15.
void CopyRow_NEON(const uint8* src, uint8* dst, int count) { void CopyRow_NEON(const uint8* src, uint8* dst, int count) {
asm volatile ( asm volatile (
".p2align 2 \n" ".p2align 2 \n"
"1: \n" "1: \n"
"vld1.u8 {d0, d1, d2, d3}, [%0]! \n" // load 32 "vld1.8 {d0, d1, d2, d3}, [%0]! \n" // load 32
"subs %2, %2, #32 \n" // 32 processed per loop "subs %2, %2, #32 \n" // 32 processed per loop
"vst1.u8 {d0, d1, d2, d3}, [%1]! \n" // store 32 "vst1.8 {d0, d1, d2, d3}, [%1]! \n" // store 32
"bgt 1b \n" "bgt 1b \n"
: "+r"(src), // %0 : "+r"(src), // %0
"+r"(dst), // %1 "+r"(dst), // %1
...@@ -770,7 +770,7 @@ void SetRow_NEON(uint8* dst, uint32 v32, int count) { ...@@ -770,7 +770,7 @@ void SetRow_NEON(uint8* dst, uint32 v32, int count) {
"vdup.u32 q0, %2 \n" // duplicate 4 ints "vdup.u32 q0, %2 \n" // duplicate 4 ints
"1: \n" "1: \n"
"subs %1, %1, #16 \n" // 16 bytes per loop "subs %1, %1, #16 \n" // 16 bytes per loop
"vst1.u8 {q0}, [%0]! \n" // store "vst1.8 {q0}, [%0]! \n" // store
"bgt 1b \n" "bgt 1b \n"
: "+r"(dst), // %0 : "+r"(dst), // %0
"+r"(count) // %1 "+r"(count) // %1
...@@ -1037,9 +1037,9 @@ void YUY2ToYRow_NEON(const uint8* src_yuy2, uint8* dst_y, int pix) { ...@@ -1037,9 +1037,9 @@ void YUY2ToYRow_NEON(const uint8* src_yuy2, uint8* dst_y, int pix) {
asm volatile ( asm volatile (
".p2align 2 \n" ".p2align 2 \n"
"1: \n" "1: \n"
"vld2.u8 {q0, q1}, [%0]! \n" // load 16 pixels of YUY2. "vld2.8 {q0, q1}, [%0]! \n" // load 16 pixels of YUY2.
"subs %2, %2, #16 \n" // 16 processed per loop. "subs %2, %2, #16 \n" // 16 processed per loop.
"vst1.u8 {q0}, [%1]! \n" // store 16 pixels of Y. "vst1.8 {q0}, [%1]! \n" // store 16 pixels of Y.
"bgt 1b \n" "bgt 1b \n"
: "+r"(src_yuy2), // %0 : "+r"(src_yuy2), // %0
"+r"(dst_y), // %1 "+r"(dst_y), // %1
...@@ -1053,9 +1053,9 @@ void UYVYToYRow_NEON(const uint8* src_uyvy, uint8* dst_y, int pix) { ...@@ -1053,9 +1053,9 @@ void UYVYToYRow_NEON(const uint8* src_uyvy, uint8* dst_y, int pix) {
asm volatile ( asm volatile (
".p2align 2 \n" ".p2align 2 \n"
"1: \n" "1: \n"
"vld2.u8 {q0, q1}, [%0]! \n" // load 16 pixels of UYVY. "vld2.8 {q0, q1}, [%0]! \n" // load 16 pixels of UYVY.
"subs %2, %2, #16 \n" // 16 processed per loop. "subs %2, %2, #16 \n" // 16 processed per loop.
"vst1.u8 {q1}, [%1]! \n" // store 16 pixels of Y. "vst1.8 {q1}, [%1]! \n" // store 16 pixels of Y.
"bgt 1b \n" "bgt 1b \n"
: "+r"(src_uyvy), // %0 : "+r"(src_uyvy), // %0
"+r"(dst_y), // %1 "+r"(dst_y), // %1
...@@ -1072,8 +1072,8 @@ void YUY2ToUV422Row_NEON(const uint8* src_yuy2, uint8* dst_u, uint8* dst_v, ...@@ -1072,8 +1072,8 @@ void YUY2ToUV422Row_NEON(const uint8* src_yuy2, uint8* dst_u, uint8* dst_v,
"1: \n" "1: \n"
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of YUY2. "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of YUY2.
"subs %3, %3, #16 \n" // 16 pixels = 8 UVs. "subs %3, %3, #16 \n" // 16 pixels = 8 UVs.
"vst1.u8 {d1}, [%1]! \n" // store 8 U. "vst1.8 {d1}, [%1]! \n" // store 8 U.
"vst1.u8 {d3}, [%2]! \n" // store 8 V. "vst1.8 {d3}, [%2]! \n" // store 8 V.
"bgt 1b \n" "bgt 1b \n"
: "+r"(src_yuy2), // %0 : "+r"(src_yuy2), // %0
"+r"(dst_u), // %1 "+r"(dst_u), // %1
...@@ -1091,8 +1091,8 @@ void UYVYToUV422Row_NEON(const uint8* src_uyvy, uint8* dst_u, uint8* dst_v, ...@@ -1091,8 +1091,8 @@ void UYVYToUV422Row_NEON(const uint8* src_uyvy, uint8* dst_u, uint8* dst_v,
"1: \n" "1: \n"
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of UYVY. "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of UYVY.
"subs %3, %3, #16 \n" // 16 pixels = 8 UVs. "subs %3, %3, #16 \n" // 16 pixels = 8 UVs.
"vst1.u8 {d0}, [%1]! \n" // store 8 U. "vst1.8 {d0}, [%1]! \n" // store 8 U.
"vst1.u8 {d2}, [%2]! \n" // store 8 V. "vst1.8 {d2}, [%2]! \n" // store 8 V.
"bgt 1b \n" "bgt 1b \n"
: "+r"(src_uyvy), // %0 : "+r"(src_uyvy), // %0
"+r"(dst_u), // %1 "+r"(dst_u), // %1
...@@ -1114,8 +1114,8 @@ void YUY2ToUVRow_NEON(const uint8* src_yuy2, int stride_yuy2, ...@@ -1114,8 +1114,8 @@ void YUY2ToUVRow_NEON(const uint8* src_yuy2, int stride_yuy2,
"vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load next row YUY2. "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load next row YUY2.
"vrhadd.u8 d1, d1, d5 \n" // average rows of U "vrhadd.u8 d1, d1, d5 \n" // average rows of U
"vrhadd.u8 d3, d3, d7 \n" // average rows of V "vrhadd.u8 d3, d3, d7 \n" // average rows of V
"vst1.u8 {d1}, [%2]! \n" // store 8 U. "vst1.8 {d1}, [%2]! \n" // store 8 U.
"vst1.u8 {d3}, [%3]! \n" // store 8 V. "vst1.8 {d3}, [%3]! \n" // store 8 V.
"bgt 1b \n" "bgt 1b \n"
: "+r"(src_yuy2), // %0 : "+r"(src_yuy2), // %0
"+r"(stride_yuy2), // %1 "+r"(stride_yuy2), // %1
...@@ -1138,8 +1138,8 @@ void UYVYToUVRow_NEON(const uint8* src_uyvy, int stride_uyvy, ...@@ -1138,8 +1138,8 @@ void UYVYToUVRow_NEON(const uint8* src_uyvy, int stride_uyvy,
"vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load next row UYVY. "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load next row UYVY.
"vrhadd.u8 d0, d0, d4 \n" // average rows of U "vrhadd.u8 d0, d0, d4 \n" // average rows of U
"vrhadd.u8 d2, d2, d6 \n" // average rows of V "vrhadd.u8 d2, d2, d6 \n" // average rows of V
"vst1.u8 {d0}, [%2]! \n" // store 8 U. "vst1.8 {d0}, [%2]! \n" // store 8 U.
"vst1.u8 {d2}, [%3]! \n" // store 8 V. "vst1.8 {d2}, [%3]! \n" // store 8 V.
"bgt 1b \n" "bgt 1b \n"
: "+r"(src_uyvy), // %0 : "+r"(src_uyvy), // %0
"+r"(stride_uyvy), // %1 "+r"(stride_uyvy), // %1
...@@ -1157,11 +1157,11 @@ void HalfRow_NEON(const uint8* src_uv, int src_uv_stride, ...@@ -1157,11 +1157,11 @@ void HalfRow_NEON(const uint8* src_uv, int src_uv_stride,
// change the stride to row 2 pointer // change the stride to row 2 pointer
"add %1, %0 \n" "add %1, %0 \n"
"1: \n" "1: \n"
"vld1.u8 {q0}, [%0]! \n" // load row 1 16 pixels. "vld1.8 {q0}, [%0]! \n" // load row 1 16 pixels.
"subs %3, %3, #16 \n" // 16 processed per loop "subs %3, %3, #16 \n" // 16 processed per loop
"vld1.u8 {q1}, [%1]! \n" // load row 2 16 pixels. "vld1.8 {q1}, [%1]! \n" // load row 2 16 pixels.
"vrhadd.u8 q0, q1 \n" // average row 1 and 2 "vrhadd.u8 q0, q1 \n" // average row 1 and 2
"vst1.u8 {q0}, [%2]! \n" "vst1.8 {q0}, [%2]! \n"
"bgt 1b \n" "bgt 1b \n"
: "+r"(src_uv), // %0 : "+r"(src_uv), // %0
"+r"(src_uv_stride), // %1 "+r"(src_uv_stride), // %1
...@@ -1178,12 +1178,12 @@ void ARGBToBayerRow_NEON(const uint8* src_argb, uint8* dst_bayer, ...@@ -1178,12 +1178,12 @@ void ARGBToBayerRow_NEON(const uint8* src_argb, uint8* dst_bayer,
asm volatile ( asm volatile (
"vmov.u32 d6[0], %3 \n" // selector "vmov.u32 d6[0], %3 \n" // selector
"1: \n" "1: \n"
"vld1.u8 {q0, q1}, [%0]! \n" // load row 8 pixels. "vld1.8 {q0, q1}, [%0]! \n" // load row 8 pixels.
"subs %2, %2, #8 \n" // 8 processed per loop "subs %2, %2, #8 \n" // 8 processed per loop
"vtbl.8 d4, {d0, d1}, d6 \n" // look up 4 pixels "vtbl.8 d4, {d0, d1}, d6 \n" // look up 4 pixels
"vtbl.8 d5, {d2, d3}, d6 \n" // look up 4 pixels "vtbl.8 d5, {d2, d3}, d6 \n" // look up 4 pixels
"vtrn.u32 d4, d5 \n" // combine 8 pixels "vtrn.u32 d4, d5 \n" // combine 8 pixels
"vst1.u8 {d4}, [%1]! \n" // store 8. "vst1.8 {d4}, [%1]! \n" // store 8.
"bgt 1b \n" "bgt 1b \n"
: "+r"(src_argb), // %0 : "+r"(src_argb), // %0
"+r"(dst_bayer), // %1 "+r"(dst_bayer), // %1
...@@ -1197,13 +1197,13 @@ void ARGBToBayerRow_NEON(const uint8* src_argb, uint8* dst_bayer, ...@@ -1197,13 +1197,13 @@ void ARGBToBayerRow_NEON(const uint8* src_argb, uint8* dst_bayer,
void ARGBShuffleRow_NEON(const uint8* src_argb, uint8* dst_argb, void ARGBShuffleRow_NEON(const uint8* src_argb, uint8* dst_argb,
const uint8* shuffler, int pix) { const uint8* shuffler, int pix) {
asm volatile ( asm volatile (
"vld1.u8 {q2}, [%3] \n" // shuffler "vld1.8 {q2}, [%3] \n" // shuffler
"1: \n" "1: \n"
"vld1.u8 {q0}, [%0]! \n" // load 4 pixels. "vld1.8 {q0}, [%0]! \n" // load 4 pixels.
"subs %2, %2, #4 \n" // 4 processed per loop "subs %2, %2, #4 \n" // 4 processed per loop
"vtbl.8 d2, {d0, d1}, d4 \n" // look up 2 first pixels "vtbl.8 d2, {d0, d1}, d4 \n" // look up 2 first pixels
"vtbl.8 d3, {d0, d1}, d5 \n" // look up 2 next pixels "vtbl.8 d3, {d0, d1}, d5 \n" // look up 2 next pixels
"vst1.u8 {q1}, [%1]! \n" // store 4. "vst1.8 {q1}, [%1]! \n" // store 4.
"bgt 1b \n" "bgt 1b \n"
: "+r"(src_argb), // %0 : "+r"(src_argb), // %0
"+r"(dst_argb), // %1 "+r"(dst_argb), // %1
...@@ -1224,7 +1224,7 @@ void I422ToYUY2Row_NEON(const uint8* src_y, ...@@ -1224,7 +1224,7 @@ void I422ToYUY2Row_NEON(const uint8* src_y,
"vld1.8 {d1}, [%1]! \n" // load 8 Us "vld1.8 {d1}, [%1]! \n" // load 8 Us
"vld1.8 {d3}, [%2]! \n" // load 8 Vs "vld1.8 {d3}, [%2]! \n" // load 8 Vs
"subs %4, %4, #16 \n" // 16 pixels "subs %4, %4, #16 \n" // 16 pixels
"vst4.u8 {d0, d1, d2, d3}, [%3]! \n" // Store 8 YUY2/16 pixels. "vst4.8 {d0, d1, d2, d3}, [%3]! \n" // Store 8 YUY2/16 pixels.
"bgt 1b \n" "bgt 1b \n"
: "+r"(src_y), // %0 : "+r"(src_y), // %0
"+r"(src_u), // %1 "+r"(src_u), // %1
...@@ -1247,7 +1247,7 @@ void I422ToUYVYRow_NEON(const uint8* src_y, ...@@ -1247,7 +1247,7 @@ void I422ToUYVYRow_NEON(const uint8* src_y,
"vld1.8 {d0}, [%1]! \n" // load 8 Us "vld1.8 {d0}, [%1]! \n" // load 8 Us
"vld1.8 {d2}, [%2]! \n" // load 8 Vs "vld1.8 {d2}, [%2]! \n" // load 8 Vs
"subs %4, %4, #16 \n" // 16 pixels "subs %4, %4, #16 \n" // 16 pixels
"vst4.u8 {d0, d1, d2, d3}, [%3]! \n" // Store 8 UYVY/16 pixels. "vst4.8 {d0, d1, d2, d3}, [%3]! \n" // Store 8 UYVY/16 pixels.
"bgt 1b \n" "bgt 1b \n"
: "+r"(src_y), // %0 : "+r"(src_y), // %0
"+r"(src_u), // %1 "+r"(src_u), // %1
...@@ -2181,8 +2181,8 @@ void InterpolateRow_NEON(uint8* dst_ptr, ...@@ -2181,8 +2181,8 @@ void InterpolateRow_NEON(uint8* dst_ptr,
"vdup.8 d4, %4 \n" "vdup.8 d4, %4 \n"
// General purpose row blend. // General purpose row blend.
"1: \n" "1: \n"
"vld1.u8 {q0}, [%1]! \n" "vld1.8 {q0}, [%1]! \n"
"vld1.u8 {q1}, [%2]! \n" "vld1.8 {q1}, [%2]! \n"
"subs %3, %3, #16 \n" "subs %3, %3, #16 \n"
"vmull.u8 q13, d0, d4 \n" "vmull.u8 q13, d0, d4 \n"
"vmull.u8 q14, d1, d4 \n" "vmull.u8 q14, d1, d4 \n"
...@@ -2190,47 +2190,47 @@ void InterpolateRow_NEON(uint8* dst_ptr, ...@@ -2190,47 +2190,47 @@ void InterpolateRow_NEON(uint8* dst_ptr,
"vmlal.u8 q14, d3, d5 \n" "vmlal.u8 q14, d3, d5 \n"
"vrshrn.u16 d0, q13, #8 \n" "vrshrn.u16 d0, q13, #8 \n"
"vrshrn.u16 d1, q14, #8 \n" "vrshrn.u16 d1, q14, #8 \n"
"vst1.u8 {q0}, [%0]! \n" "vst1.8 {q0}, [%0]! \n"
"bgt 1b \n" "bgt 1b \n"
"b 99f \n" "b 99f \n"
// Blend 25 / 75. // Blend 25 / 75.
"25: \n" "25: \n"
"vld1.u8 {q0}, [%1]! \n" "vld1.8 {q0}, [%1]! \n"
"vld1.u8 {q1}, [%2]! \n" "vld1.8 {q1}, [%2]! \n"
"subs %3, %3, #16 \n" "subs %3, %3, #16 \n"
"vrhadd.u8 q0, q1 \n" "vrhadd.u8 q0, q1 \n"
"vrhadd.u8 q0, q1 \n" "vrhadd.u8 q0, q1 \n"
"vst1.u8 {q0}, [%0]! \n" "vst1.8 {q0}, [%0]! \n"
"bgt 25b \n" "bgt 25b \n"
"b 99f \n" "b 99f \n"
// Blend 50 / 50. // Blend 50 / 50.
"50: \n" "50: \n"
"vld1.u8 {q0}, [%1]! \n" "vld1.8 {q0}, [%1]! \n"
"vld1.u8 {q1}, [%2]! \n" "vld1.8 {q1}, [%2]! \n"
"subs %3, %3, #16 \n" "subs %3, %3, #16 \n"
"vrhadd.u8 q0, q1 \n" "vrhadd.u8 q0, q1 \n"
"vst1.u8 {q0}, [%0]! \n" "vst1.8 {q0}, [%0]! \n"
"bgt 50b \n" "bgt 50b \n"
"b 99f \n" "b 99f \n"
// Blend 75 / 25. // Blend 75 / 25.
"75: \n" "75: \n"
"vld1.u8 {q1}, [%1]! \n" "vld1.8 {q1}, [%1]! \n"
"vld1.u8 {q0}, [%2]! \n" "vld1.8 {q0}, [%2]! \n"
"subs %3, %3, #16 \n" "subs %3, %3, #16 \n"
"vrhadd.u8 q0, q1 \n" "vrhadd.u8 q0, q1 \n"
"vrhadd.u8 q0, q1 \n" "vrhadd.u8 q0, q1 \n"
"vst1.u8 {q0}, [%0]! \n" "vst1.8 {q0}, [%0]! \n"
"bgt 75b \n" "bgt 75b \n"
"b 99f \n" "b 99f \n"
// Blend 100 / 0 - Copy row unchanged. // Blend 100 / 0 - Copy row unchanged.
"100: \n" "100: \n"
"vld1.u8 {q0}, [%1]! \n" "vld1.8 {q0}, [%1]! \n"
"subs %3, %3, #16 \n" "subs %3, %3, #16 \n"
"vst1.u8 {q0}, [%0]! \n" "vst1.8 {q0}, [%0]! \n"
"bgt 100b \n" "bgt 100b \n"
"99: \n" "99: \n"
...@@ -2478,7 +2478,7 @@ void ARGBSepiaRow_NEON(uint8* dst_argb, int width) { ...@@ -2478,7 +2478,7 @@ void ARGBSepiaRow_NEON(uint8* dst_argb, int width) {
void ARGBColorMatrixRow_NEON(uint8* dst_argb, const int8* matrix_argb, void ARGBColorMatrixRow_NEON(uint8* dst_argb, const int8* matrix_argb,
int width) { int width) {
asm volatile ( asm volatile (
"vld1.u8 {q2}, [%2] \n" // load 3 ARGB vectors. "vld1.8 {q2}, [%2] \n" // load 3 ARGB vectors.
"vmovl.s8 q0, d4 \n" // B,G coefficients s16. "vmovl.s8 q0, d4 \n" // B,G coefficients s16.
"vmovl.s8 q1, d5 \n" // R coefficients s16. "vmovl.s8 q1, d5 \n" // R coefficients s16.
...@@ -2670,22 +2670,22 @@ void SobelXRow_NEON(const uint8* src_y0, const uint8* src_y1, ...@@ -2670,22 +2670,22 @@ void SobelXRow_NEON(const uint8* src_y0, const uint8* src_y1,
asm volatile ( asm volatile (
".p2align 2 \n" ".p2align 2 \n"
"1: \n" "1: \n"
"vld1.u8 {d0}, [%0],%5 \n" // top "vld1.8 {d0}, [%0],%5 \n" // top
"vld1.u8 {d1}, [%0],%6 \n" "vld1.8 {d1}, [%0],%6 \n"
"vsubl.u8 q0, d0, d1 \n" "vsubl.u8 q0, d0, d1 \n"
"vld1.u8 {d2}, [%1],%5 \n" // center * 2 "vld1.8 {d2}, [%1],%5 \n" // center * 2
"vld1.u8 {d3}, [%1],%6 \n" "vld1.8 {d3}, [%1],%6 \n"
"vsubl.u8 q1, d2, d3 \n" "vsubl.u8 q1, d2, d3 \n"
"vadd.s16 q0, q0, q1 \n" "vadd.s16 q0, q0, q1 \n"
"vadd.s16 q0, q0, q1 \n" "vadd.s16 q0, q0, q1 \n"
"vld1.u8 {d2}, [%2],%5 \n" // bottom "vld1.8 {d2}, [%2],%5 \n" // bottom
"vld1.u8 {d3}, [%2],%6 \n" "vld1.8 {d3}, [%2],%6 \n"
"subs %4, %4, #8 \n" // 8 pixels "subs %4, %4, #8 \n" // 8 pixels
"vsubl.u8 q1, d2, d3 \n" "vsubl.u8 q1, d2, d3 \n"
"vadd.s16 q0, q0, q1 \n" "vadd.s16 q0, q0, q1 \n"
"vabs.s16 q0, q0 \n" "vabs.s16 q0, q0 \n"
"vqmovn.u16 d0, q0 \n" "vqmovn.u16 d0, q0 \n"
"vst1.u8 {d0}, [%3]! \n" // store 8 sobelx "vst1.8 {d0}, [%3]! \n" // store 8 sobelx
"bgt 1b \n" "bgt 1b \n"
: "+r"(src_y0), // %0 : "+r"(src_y0), // %0
"+r"(src_y1), // %1 "+r"(src_y1), // %1
...@@ -2707,22 +2707,22 @@ void SobelYRow_NEON(const uint8* src_y0, const uint8* src_y1, ...@@ -2707,22 +2707,22 @@ void SobelYRow_NEON(const uint8* src_y0, const uint8* src_y1,
asm volatile ( asm volatile (
".p2align 2 \n" ".p2align 2 \n"
"1: \n" "1: \n"
"vld1.u8 {d0}, [%0],%4 \n" // left "vld1.8 {d0}, [%0],%4 \n" // left
"vld1.u8 {d1}, [%1],%4 \n" "vld1.8 {d1}, [%1],%4 \n"
"vsubl.u8 q0, d0, d1 \n" "vsubl.u8 q0, d0, d1 \n"
"vld1.u8 {d2}, [%0],%4 \n" // center * 2 "vld1.8 {d2}, [%0],%4 \n" // center * 2
"vld1.u8 {d3}, [%1],%4 \n" "vld1.8 {d3}, [%1],%4 \n"
"vsubl.u8 q1, d2, d3 \n" "vsubl.u8 q1, d2, d3 \n"
"vadd.s16 q0, q0, q1 \n" "vadd.s16 q0, q0, q1 \n"
"vadd.s16 q0, q0, q1 \n" "vadd.s16 q0, q0, q1 \n"
"vld1.u8 {d2}, [%0],%5 \n" // right "vld1.8 {d2}, [%0],%5 \n" // right
"vld1.u8 {d3}, [%1],%5 \n" "vld1.8 {d3}, [%1],%5 \n"
"subs %3, %3, #8 \n" // 8 pixels "subs %3, %3, #8 \n" // 8 pixels
"vsubl.u8 q1, d2, d3 \n" "vsubl.u8 q1, d2, d3 \n"
"vadd.s16 q0, q0, q1 \n" "vadd.s16 q0, q0, q1 \n"
"vabs.s16 q0, q0 \n" "vabs.s16 q0, q0 \n"
"vqmovn.u16 d0, q0 \n" "vqmovn.u16 d0, q0 \n"
"vst1.u8 {d0}, [%2]! \n" // store 8 sobely "vst1.8 {d0}, [%2]! \n" // store 8 sobely
"bgt 1b \n" "bgt 1b \n"
: "+r"(src_y0), // %0 : "+r"(src_y0), // %0
"+r"(src_y1), // %1 "+r"(src_y1), // %1
......
...@@ -24,11 +24,11 @@ void ScaleARGBRowDown2_NEON(const uint8* src_ptr, ptrdiff_t /* src_stride */, ...@@ -24,11 +24,11 @@ void ScaleARGBRowDown2_NEON(const uint8* src_ptr, ptrdiff_t /* src_stride */,
asm volatile ( asm volatile (
"1: \n" "1: \n"
// load even pixels into q0, odd into q1 // load even pixels into q0, odd into q1
"vld2.u32 {q0, q1}, [%0]! \n" "vld2.32 {q0, q1}, [%0]! \n"
"vld2.u32 {q2, q3}, [%0]! \n" "vld2.32 {q2, q3}, [%0]! \n"
"subs %2, %2, #8 \n" // 8 processed per loop "subs %2, %2, #8 \n" // 8 processed per loop
"vst1.u8 {q1}, [%1]! \n" // store odd pixels "vst1.8 {q1}, [%1]! \n" // store odd pixels
"vst1.u8 {q3}, [%1]! \n" "vst1.8 {q3}, [%1]! \n"
"bgt 1b \n" "bgt 1b \n"
: "+r"(src_ptr), // %0 : "+r"(src_ptr), // %0
"+r"(dst), // %1 "+r"(dst), // %1
...@@ -61,7 +61,7 @@ void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride, ...@@ -61,7 +61,7 @@ void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
"vrshrn.u16 d1, q1, #2 \n" "vrshrn.u16 d1, q1, #2 \n"
"vrshrn.u16 d2, q2, #2 \n" "vrshrn.u16 d2, q2, #2 \n"
"vrshrn.u16 d3, q3, #2 \n" "vrshrn.u16 d3, q3, #2 \n"
"vst4.u8 {d0, d1, d2, d3}, [%2]! \n" "vst4.8 {d0, d1, d2, d3}, [%2]! \n"
"bgt 1b \n" "bgt 1b \n"
: "+r"(src_ptr), // %0 : "+r"(src_ptr), // %0
"+r"(src_stride), // %1 "+r"(src_stride), // %1
......
...@@ -27,9 +27,9 @@ void ScaleRowDown2_NEON(const uint8* src_ptr, ptrdiff_t /* src_stride */, ...@@ -27,9 +27,9 @@ void ScaleRowDown2_NEON(const uint8* src_ptr, ptrdiff_t /* src_stride */,
asm volatile ( asm volatile (
"1: \n" "1: \n"
// load even pixels into q0, odd into q1 // load even pixels into q0, odd into q1
"vld2.u8 {q0, q1}, [%0]! \n" "vld2.8 {q0, q1}, [%0]! \n"
"subs %2, %2, #16 \n" // 16 processed per loop "subs %2, %2, #16 \n" // 16 processed per loop
"vst1.u8 {q1}, [%1]! \n" // store odd pixels "vst1.8 {q1}, [%1]! \n" // store odd pixels
"bgt 1b \n" "bgt 1b \n"
: "+r"(src_ptr), // %0 : "+r"(src_ptr), // %0
"+r"(dst), // %1 "+r"(dst), // %1
...@@ -45,8 +45,8 @@ void ScaleRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride, ...@@ -45,8 +45,8 @@ void ScaleRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
// change the stride to row 2 pointer // change the stride to row 2 pointer
"add %1, %0 \n" "add %1, %0 \n"
"1: \n" "1: \n"
"vld1.u8 {q0, q1}, [%0]! \n" // load row 1 and post inc "vld1.8 {q0, q1}, [%0]! \n" // load row 1 and post inc
"vld1.u8 {q2, q3}, [%1]! \n" // load row 2 and post inc "vld1.8 {q2, q3}, [%1]! \n" // load row 2 and post inc
"subs %3, %3, #16 \n" // 16 processed per loop "subs %3, %3, #16 \n" // 16 processed per loop
"vpaddl.u8 q0, q0 \n" // row 1 add adjacent "vpaddl.u8 q0, q0 \n" // row 1 add adjacent
"vpaddl.u8 q1, q1 \n" "vpaddl.u8 q1, q1 \n"
...@@ -54,7 +54,7 @@ void ScaleRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride, ...@@ -54,7 +54,7 @@ void ScaleRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
"vpadal.u8 q1, q3 \n" "vpadal.u8 q1, q3 \n"
"vrshrn.u16 d0, q0, #2 \n" // downshift, round and pack "vrshrn.u16 d0, q0, #2 \n" // downshift, round and pack
"vrshrn.u16 d1, q1, #2 \n" "vrshrn.u16 d1, q1, #2 \n"
"vst1.u8 {q0}, [%2]! \n" "vst1.8 {q0}, [%2]! \n"
"bgt 1b \n" "bgt 1b \n"
: "+r"(src_ptr), // %0 : "+r"(src_ptr), // %0
"+r"(src_stride), // %1 "+r"(src_stride), // %1
...@@ -69,9 +69,9 @@ void ScaleRowDown4_NEON(const uint8* src_ptr, ptrdiff_t /* src_stride */, ...@@ -69,9 +69,9 @@ void ScaleRowDown4_NEON(const uint8* src_ptr, ptrdiff_t /* src_stride */,
uint8* dst_ptr, int dst_width) { uint8* dst_ptr, int dst_width) {
asm volatile ( asm volatile (
"1: \n" "1: \n"
"vld4.u8 {d0, d1, d2, d3}, [%0]! \n" // src line 0 "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
"subs %2, %2, #8 \n" // 8 processed per loop "subs %2, %2, #8 \n" // 8 processed per loop
"vst1.u8 {d2}, [%1]! \n" "vst1.8 {d2}, [%1]! \n"
"bgt 1b \n" "bgt 1b \n"
: "+r"(src_ptr), // %0 : "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1 "+r"(dst_ptr), // %1
...@@ -88,10 +88,10 @@ void ScaleRowDown4Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride, ...@@ -88,10 +88,10 @@ void ScaleRowDown4Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
"add r5, r4, %3 \n" "add r5, r4, %3 \n"
"add %3, r5, %3 \n" "add %3, r5, %3 \n"
"1: \n" "1: \n"
"vld1.u8 {q0}, [%0]! \n" // load up 16x4 "vld1.8 {q0}, [%0]! \n" // load up 16x4
"vld1.u8 {q1}, [r4]! \n" "vld1.8 {q1}, [r4]! \n"
"vld1.u8 {q2}, [r5]! \n" "vld1.8 {q2}, [r5]! \n"
"vld1.u8 {q3}, [%3]! \n" "vld1.8 {q3}, [%3]! \n"
"subs %2, %2, #4 \n" "subs %2, %2, #4 \n"
"vpaddl.u8 q0, q0 \n" "vpaddl.u8 q0, q0 \n"
"vpadal.u8 q0, q1 \n" "vpadal.u8 q0, q1 \n"
...@@ -100,7 +100,7 @@ void ScaleRowDown4Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride, ...@@ -100,7 +100,7 @@ void ScaleRowDown4Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
"vpaddl.u16 q0, q0 \n" "vpaddl.u16 q0, q0 \n"
"vrshrn.u32 d0, q0, #4 \n" // divide by 16 w/rounding "vrshrn.u32 d0, q0, #4 \n" // divide by 16 w/rounding
"vmovn.u16 d0, q0 \n" "vmovn.u16 d0, q0 \n"
"vst1.u32 {d0[0]}, [%1]! \n" "vst1.32 {d0[0]}, [%1]! \n"
"bgt 1b \n" "bgt 1b \n"
: "+r"(src_ptr), // %0 : "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1 "+r"(dst_ptr), // %1
...@@ -118,10 +118,10 @@ void ScaleRowDown34_NEON(const uint8* src_ptr, ...@@ -118,10 +118,10 @@ void ScaleRowDown34_NEON(const uint8* src_ptr,
uint8* dst_ptr, int dst_width) { uint8* dst_ptr, int dst_width) {
asm volatile ( asm volatile (
"1: \n" "1: \n"
"vld4.u8 {d0, d1, d2, d3}, [%0]! \n" // src line 0 "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
"subs %2, %2, #24 \n" "subs %2, %2, #24 \n"
"vmov d2, d3 \n" // order d0, d1, d2 "vmov d2, d3 \n" // order d0, d1, d2
"vst3.u8 {d0, d1, d2}, [%1]! \n" "vst3.8 {d0, d1, d2}, [%1]! \n"
"bgt 1b \n" "bgt 1b \n"
: "+r"(src_ptr), // %0 : "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1 "+r"(dst_ptr), // %1
...@@ -138,8 +138,8 @@ void ScaleRowDown34_0_Box_NEON(const uint8* src_ptr, ...@@ -138,8 +138,8 @@ void ScaleRowDown34_0_Box_NEON(const uint8* src_ptr,
"vmov.u8 d24, #3 \n" "vmov.u8 d24, #3 \n"
"add %3, %0 \n" "add %3, %0 \n"
"1: \n" "1: \n"
"vld4.u8 {d0, d1, d2, d3}, [%0]! \n" // src line 0 "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
"vld4.u8 {d4, d5, d6, d7}, [%3]! \n" // src line 1 "vld4.8 {d4, d5, d6, d7}, [%3]! \n" // src line 1
"subs %2, %2, #24 \n" "subs %2, %2, #24 \n"
// filter src line 0 with src line 1 // filter src line 0 with src line 1
...@@ -175,7 +175,7 @@ void ScaleRowDown34_0_Box_NEON(const uint8* src_ptr, ...@@ -175,7 +175,7 @@ void ScaleRowDown34_0_Box_NEON(const uint8* src_ptr,
"vmlal.u8 q8, d3, d24 \n" "vmlal.u8 q8, d3, d24 \n"
"vqrshrn.u16 d2, q8, #2 \n" "vqrshrn.u16 d2, q8, #2 \n"
"vst3.u8 {d0, d1, d2}, [%1]! \n" "vst3.8 {d0, d1, d2}, [%1]! \n"
"bgt 1b \n" "bgt 1b \n"
: "+r"(src_ptr), // %0 : "+r"(src_ptr), // %0
...@@ -194,8 +194,8 @@ void ScaleRowDown34_1_Box_NEON(const uint8* src_ptr, ...@@ -194,8 +194,8 @@ void ScaleRowDown34_1_Box_NEON(const uint8* src_ptr,
"vmov.u8 d24, #3 \n" "vmov.u8 d24, #3 \n"
"add %3, %0 \n" "add %3, %0 \n"
"1: \n" "1: \n"
"vld4.u8 {d0, d1, d2, d3}, [%0]! \n" // src line 0 "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
"vld4.u8 {d4, d5, d6, d7}, [%3]! \n" // src line 1 "vld4.8 {d4, d5, d6, d7}, [%3]! \n" // src line 1
"subs %2, %2, #24 \n" "subs %2, %2, #24 \n"
// average src line 0 with src line 1 // average src line 0 with src line 1
"vrhadd.u8 q0, q0, q2 \n" "vrhadd.u8 q0, q0, q2 \n"
...@@ -214,7 +214,7 @@ void ScaleRowDown34_1_Box_NEON(const uint8* src_ptr, ...@@ -214,7 +214,7 @@ void ScaleRowDown34_1_Box_NEON(const uint8* src_ptr,
"vmlal.u8 q3, d3, d24 \n" "vmlal.u8 q3, d3, d24 \n"
"vqrshrn.u16 d2, q3, #2 \n" "vqrshrn.u16 d2, q3, #2 \n"
"vst3.u8 {d0, d1, d2}, [%1]! \n" "vst3.8 {d0, d1, d2}, [%1]! \n"
"bgt 1b \n" "bgt 1b \n"
: "+r"(src_ptr), // %0 : "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1 "+r"(dst_ptr), // %1
...@@ -242,14 +242,14 @@ void ScaleRowDown38_NEON(const uint8* src_ptr, ...@@ -242,14 +242,14 @@ void ScaleRowDown38_NEON(const uint8* src_ptr,
ptrdiff_t /* src_stride */, ptrdiff_t /* src_stride */,
uint8* dst_ptr, int dst_width) { uint8* dst_ptr, int dst_width) {
asm volatile ( asm volatile (
"vld1.u8 {q3}, [%3] \n" "vld1.8 {q3}, [%3] \n"
"1: \n" "1: \n"
"vld1.u8 {d0, d1, d2, d3}, [%0]! \n" "vld1.8 {d0, d1, d2, d3}, [%0]! \n"
"subs %2, %2, #12 \n" "subs %2, %2, #12 \n"
"vtbl.u8 d4, {d0, d1, d2, d3}, d6 \n" "vtbl.u8 d4, {d0, d1, d2, d3}, d6 \n"
"vtbl.u8 d5, {d0, d1, d2, d3}, d7 \n" "vtbl.u8 d5, {d0, d1, d2, d3}, d7 \n"
"vst1.u8 {d4}, [%1]! \n" "vst1.8 {d4}, [%1]! \n"
"vst1.u32 {d5[0]}, [%1]! \n" "vst1.32 {d5[0]}, [%1]! \n"
"bgt 1b \n" "bgt 1b \n"
: "+r"(src_ptr), // %0 : "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1 "+r"(dst_ptr), // %1
...@@ -264,9 +264,9 @@ void OMITFP ScaleRowDown38_3_Box_NEON(const uint8* src_ptr, ...@@ -264,9 +264,9 @@ void OMITFP ScaleRowDown38_3_Box_NEON(const uint8* src_ptr,
ptrdiff_t src_stride, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) { uint8* dst_ptr, int dst_width) {
asm volatile ( asm volatile (
"vld1.u16 {q13}, [%4] \n" "vld1.16 {q13}, [%4] \n"
"vld1.u8 {q14}, [%5] \n" "vld1.8 {q14}, [%5] \n"
"vld1.u8 {q15}, [%6] \n" "vld1.8 {q15}, [%6] \n"
"add r4, %0, %3, lsl #1 \n" "add r4, %0, %3, lsl #1 \n"
"add %3, %0 \n" "add %3, %0 \n"
"1: \n" "1: \n"
...@@ -275,9 +275,9 @@ void OMITFP ScaleRowDown38_3_Box_NEON(const uint8* src_ptr, ...@@ -275,9 +275,9 @@ void OMITFP ScaleRowDown38_3_Box_NEON(const uint8* src_ptr,
// d1 = 10 50 11 51 12 52 13 53 // d1 = 10 50 11 51 12 52 13 53
// d2 = 20 60 21 61 22 62 23 63 // d2 = 20 60 21 61 22 62 23 63
// d3 = 30 70 31 71 32 72 33 73 // d3 = 30 70 31 71 32 72 33 73
"vld4.u8 {d0, d1, d2, d3}, [%0]! \n" "vld4.8 {d0, d1, d2, d3}, [%0]! \n"
"vld4.u8 {d4, d5, d6, d7}, [%3]! \n" "vld4.8 {d4, d5, d6, d7}, [%3]! \n"
"vld4.u8 {d16, d17, d18, d19}, [r4]! \n" "vld4.8 {d16, d17, d18, d19}, [r4]! \n"
"subs %2, %2, #12 \n" "subs %2, %2, #12 \n"
// Shuffle the input data around to get align the data // Shuffle the input data around to get align the data
...@@ -354,8 +354,8 @@ void OMITFP ScaleRowDown38_3_Box_NEON(const uint8* src_ptr, ...@@ -354,8 +354,8 @@ void OMITFP ScaleRowDown38_3_Box_NEON(const uint8* src_ptr,
"vtbl.u8 d3, {d0, d1, d2}, d28 \n" "vtbl.u8 d3, {d0, d1, d2}, d28 \n"
"vtbl.u8 d4, {d0, d1, d2}, d29 \n" "vtbl.u8 d4, {d0, d1, d2}, d29 \n"
"vst1.u8 {d3}, [%1]! \n" "vst1.8 {d3}, [%1]! \n"
"vst1.u32 {d4[0]}, [%1]! \n" "vst1.32 {d4[0]}, [%1]! \n"
"bgt 1b \n" "bgt 1b \n"
: "+r"(src_ptr), // %0 : "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1 "+r"(dst_ptr), // %1
...@@ -374,8 +374,8 @@ void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr, ...@@ -374,8 +374,8 @@ void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr,
ptrdiff_t src_stride, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) { uint8* dst_ptr, int dst_width) {
asm volatile ( asm volatile (
"vld1.u16 {q13}, [%4] \n" "vld1.16 {q13}, [%4] \n"
"vld1.u8 {q14}, [%5] \n" "vld1.8 {q14}, [%5] \n"
"add %3, %0 \n" "add %3, %0 \n"
"1: \n" "1: \n"
...@@ -383,8 +383,8 @@ void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr, ...@@ -383,8 +383,8 @@ void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr,
// d1 = 10 50 11 51 12 52 13 53 // d1 = 10 50 11 51 12 52 13 53
// d2 = 20 60 21 61 22 62 23 63 // d2 = 20 60 21 61 22 62 23 63
// d3 = 30 70 31 71 32 72 33 73 // d3 = 30 70 31 71 32 72 33 73
"vld4.u8 {d0, d1, d2, d3}, [%0]! \n" "vld4.8 {d0, d1, d2, d3}, [%0]! \n"
"vld4.u8 {d4, d5, d6, d7}, [%3]! \n" "vld4.8 {d4, d5, d6, d7}, [%3]! \n"
"subs %2, %2, #12 \n" "subs %2, %2, #12 \n"
// Shuffle the input data around to get align the data // Shuffle the input data around to get align the data
...@@ -450,8 +450,8 @@ void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr, ...@@ -450,8 +450,8 @@ void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr,
"vtbl.u8 d3, {d0, d1, d2}, d28 \n" "vtbl.u8 d3, {d0, d1, d2}, d28 \n"
"vtbl.u8 d4, {d0, d1, d2}, d29 \n" "vtbl.u8 d4, {d0, d1, d2}, d29 \n"
"vst1.u8 {d3}, [%1]! \n" "vst1.8 {d3}, [%1]! \n"
"vst1.u32 {d4[0]}, [%1]! \n" "vst1.32 {d4[0]}, [%1]! \n"
"bgt 1b \n" "bgt 1b \n"
: "+r"(src_ptr), // %0 : "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1 "+r"(dst_ptr), // %1
...@@ -483,8 +483,8 @@ void ScaleFilterRows_NEON(uint8* dst_ptr, ...@@ -483,8 +483,8 @@ void ScaleFilterRows_NEON(uint8* dst_ptr,
"vdup.8 d4, %4 \n" "vdup.8 d4, %4 \n"
// General purpose row blend. // General purpose row blend.
"1: \n" "1: \n"
"vld1.u8 {q0}, [%1]! \n" "vld1.8 {q0}, [%1]! \n"
"vld1.u8 {q1}, [%2]! \n" "vld1.8 {q1}, [%2]! \n"
"subs %3, %3, #16 \n" "subs %3, %3, #16 \n"
"vmull.u8 q13, d0, d4 \n" "vmull.u8 q13, d0, d4 \n"
"vmull.u8 q14, d1, d4 \n" "vmull.u8 q14, d1, d4 \n"
...@@ -492,51 +492,51 @@ void ScaleFilterRows_NEON(uint8* dst_ptr, ...@@ -492,51 +492,51 @@ void ScaleFilterRows_NEON(uint8* dst_ptr,
"vmlal.u8 q14, d3, d5 \n" "vmlal.u8 q14, d3, d5 \n"
"vrshrn.u16 d0, q13, #8 \n" "vrshrn.u16 d0, q13, #8 \n"
"vrshrn.u16 d1, q14, #8 \n" "vrshrn.u16 d1, q14, #8 \n"
"vst1.u8 {q0}, [%0]! \n" "vst1.8 {q0}, [%0]! \n"
"bgt 1b \n" "bgt 1b \n"
"b 99f \n" "b 99f \n"
// Blend 25 / 75. // Blend 25 / 75.
"25: \n" "25: \n"
"vld1.u8 {q0}, [%1]! \n" "vld1.8 {q0}, [%1]! \n"
"vld1.u8 {q1}, [%2]! \n" "vld1.8 {q1}, [%2]! \n"
"subs %3, %3, #16 \n" "subs %3, %3, #16 \n"
"vrhadd.u8 q0, q1 \n" "vrhadd.u8 q0, q1 \n"
"vrhadd.u8 q0, q1 \n" "vrhadd.u8 q0, q1 \n"
"vst1.u8 {q0}, [%0]! \n" "vst1.8 {q0}, [%0]! \n"
"bgt 25b \n" "bgt 25b \n"
"b 99f \n" "b 99f \n"
// Blend 50 / 50. // Blend 50 / 50.
"50: \n" "50: \n"
"vld1.u8 {q0}, [%1]! \n" "vld1.8 {q0}, [%1]! \n"
"vld1.u8 {q1}, [%2]! \n" "vld1.8 {q1}, [%2]! \n"
"subs %3, %3, #16 \n" "subs %3, %3, #16 \n"
"vrhadd.u8 q0, q1 \n" "vrhadd.u8 q0, q1 \n"
"vst1.u8 {q0}, [%0]! \n" "vst1.8 {q0}, [%0]! \n"
"bgt 50b \n" "bgt 50b \n"
"b 99f \n" "b 99f \n"
// Blend 75 / 25. // Blend 75 / 25.
"75: \n" "75: \n"
"vld1.u8 {q1}, [%1]! \n" "vld1.8 {q1}, [%1]! \n"
"vld1.u8 {q0}, [%2]! \n" "vld1.8 {q0}, [%2]! \n"
"subs %3, %3, #16 \n" "subs %3, %3, #16 \n"
"vrhadd.u8 q0, q1 \n" "vrhadd.u8 q0, q1 \n"
"vrhadd.u8 q0, q1 \n" "vrhadd.u8 q0, q1 \n"
"vst1.u8 {q0}, [%0]! \n" "vst1.8 {q0}, [%0]! \n"
"bgt 75b \n" "bgt 75b \n"
"b 99f \n" "b 99f \n"
// Blend 100 / 0 - Copy row unchanged. // Blend 100 / 0 - Copy row unchanged.
"100: \n" "100: \n"
"vld1.u8 {q0}, [%1]! \n" "vld1.8 {q0}, [%1]! \n"
"subs %3, %3, #16 \n" "subs %3, %3, #16 \n"
"vst1.u8 {q0}, [%0]! \n" "vst1.8 {q0}, [%0]! \n"
"bgt 100b \n" "bgt 100b \n"
"99: \n" "99: \n"
"vst1.u8 {d1[7]}, [%0] \n" "vst1.8 {d1[7]}, [%0] \n"
: "+r"(dst_ptr), // %0 : "+r"(dst_ptr), // %0
"+r"(src_ptr), // %1 "+r"(src_ptr), // %1
"+r"(src_stride), // %2 "+r"(src_stride), // %2
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment