Commit 0bb310eb authored by fbarchard@google.com

Add bic instructions before each load or store for nacl

BUG=333
TESTED=validator
R=jfb@chromium.org

Review URL: https://webrtc-codereview.appspot.com/13669004

git-svn-id: http://libyuv.googlecode.com/svn/trunk@1013 16f28f9a-4ce2-e073-06de-1de4eb20be90
parent bf3b1111
Name: libyuv Name: libyuv
URL: http://code.google.com/p/libyuv/ URL: http://code.google.com/p/libyuv/
Version: 1012 Version: 1013
License: BSD License: BSD
License File: LICENSE License File: LICENSE
......
...@@ -449,7 +449,7 @@ typedef uint8 uvec8[16]; ...@@ -449,7 +449,7 @@ typedef uint8 uvec8[16];
#if defined(__arm__) #if defined(__arm__)
#undef MEMACCESS #undef MEMACCESS
#if defined(__native_client__) #if defined(__native_client__)
#define MEMACCESS(base) "bic %" #base ", #0xc0000000\n" #define MEMACCESS(base) ".p2align 3\nbic %" #base ", #0xc0000000\n"
#else #else
#define MEMACCESS(base) "\n" #define MEMACCESS(base) "\n"
#endif #endif
......
...@@ -11,6 +11,6 @@ ...@@ -11,6 +11,6 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT #ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT
#define INCLUDE_LIBYUV_VERSION_H_ #define INCLUDE_LIBYUV_VERSION_H_
#define LIBYUV_VERSION 1012 #define LIBYUV_VERSION 1013
#endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT #endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT
...@@ -16,39 +16,50 @@ extern "C" { ...@@ -16,39 +16,50 @@ extern "C" {
#endif #endif
// This module is for GCC Neon // This module is for GCC Neon
#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) && \ #if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__)
!defined(__native_client__)
// Read 8 Y, 4 U and 4 V from 422 // Read 8 Y, 4 U and 4 V from 422
#define READYUV422 \ #define READYUV422 \
MEMACCESS(0) \
"vld1.8 {d0}, [%0]! \n" \ "vld1.8 {d0}, [%0]! \n" \
MEMACCESS(1) \
"vld1.32 {d2[0]}, [%1]! \n" \ "vld1.32 {d2[0]}, [%1]! \n" \
MEMACCESS(2) \
"vld1.32 {d2[1]}, [%2]! \n" "vld1.32 {d2[1]}, [%2]! \n"
// Read 8 Y, 2 U and 2 V from 411 // Read 8 Y, 2 U and 2 V from 411
#define READYUV411 \ #define READYUV411 \
MEMACCESS(0) \
"vld1.8 {d0}, [%0]! \n" \ "vld1.8 {d0}, [%0]! \n" \
MEMACCESS(1) \
"vld1.16 {d2[0]}, [%1]! \n" \ "vld1.16 {d2[0]}, [%1]! \n" \
MEMACCESS(2) \
"vld1.16 {d2[1]}, [%2]! \n" \ "vld1.16 {d2[1]}, [%2]! \n" \
"vmov.u8 d3, d2 \n" \ "vmov.u8 d3, d2 \n" \
"vzip.u8 d2, d3 \n" "vzip.u8 d2, d3 \n"
// Read 8 Y, 8 U and 8 V from 444 // Read 8 Y, 8 U and 8 V from 444
#define READYUV444 \ #define READYUV444 \
MEMACCESS(0) \
"vld1.8 {d0}, [%0]! \n" \ "vld1.8 {d0}, [%0]! \n" \
MEMACCESS(1) \
"vld1.8 {d2}, [%1]! \n" \ "vld1.8 {d2}, [%1]! \n" \
MEMACCESS(2) \
"vld1.8 {d3}, [%2]! \n" \ "vld1.8 {d3}, [%2]! \n" \
"vpaddl.u8 q1, q1 \n" \ "vpaddl.u8 q1, q1 \n" \
"vrshrn.u16 d2, q1, #1 \n" "vrshrn.u16 d2, q1, #1 \n"
// Read 8 Y, and set 4 U and 4 V to 128 // Read 8 Y, and set 4 U and 4 V to 128
#define READYUV400 \ #define READYUV400 \
MEMACCESS(0) \
"vld1.8 {d0}, [%0]! \n" \ "vld1.8 {d0}, [%0]! \n" \
"vmov.u8 d2, #128 \n" "vmov.u8 d2, #128 \n"
// Read 8 Y and 4 UV from NV12 // Read 8 Y and 4 UV from NV12
#define READNV12 \ #define READNV12 \
MEMACCESS(0) \
"vld1.8 {d0}, [%0]! \n" \ "vld1.8 {d0}, [%0]! \n" \
MEMACCESS(1) \
"vld1.8 {d2}, [%1]! \n" \ "vld1.8 {d2}, [%1]! \n" \
"vmov.u8 d3, d2 \n"/* split odd/even uv apart */\ "vmov.u8 d3, d2 \n"/* split odd/even uv apart */\
"vuzp.u8 d2, d3 \n" \ "vuzp.u8 d2, d3 \n" \
...@@ -56,7 +67,9 @@ extern "C" { ...@@ -56,7 +67,9 @@ extern "C" {
// Read 8 Y and 4 VU from NV21 // Read 8 Y and 4 VU from NV21
#define READNV21 \ #define READNV21 \
MEMACCESS(0) \
"vld1.8 {d0}, [%0]! \n" \ "vld1.8 {d0}, [%0]! \n" \
MEMACCESS(1) \
"vld1.8 {d2}, [%1]! \n" \ "vld1.8 {d2}, [%1]! \n" \
"vmov.u8 d3, d2 \n"/* split odd/even uv apart */\ "vmov.u8 d3, d2 \n"/* split odd/even uv apart */\
"vuzp.u8 d3, d2 \n" \ "vuzp.u8 d3, d2 \n" \
...@@ -64,6 +77,7 @@ extern "C" { ...@@ -64,6 +77,7 @@ extern "C" {
// Read 8 YUY2 // Read 8 YUY2
#define READYUY2 \ #define READYUY2 \
MEMACCESS(0) \
"vld2.8 {d0, d2}, [%0]! \n" \ "vld2.8 {d0, d2}, [%0]! \n" \
"vmov.u8 d3, d2 \n" \ "vmov.u8 d3, d2 \n" \
"vuzp.u8 d2, d3 \n" \ "vuzp.u8 d2, d3 \n" \
...@@ -71,6 +85,7 @@ extern "C" { ...@@ -71,6 +85,7 @@ extern "C" {
// Read 8 UYVY // Read 8 UYVY
#define READUYVY \ #define READUYVY \
MEMACCESS(0) \
"vld2.8 {d2, d3}, [%0]! \n" \ "vld2.8 {d2, d3}, [%0]! \n" \
"vmov.u8 d0, d3 \n" \ "vmov.u8 d0, d3 \n" \
"vmov.u8 d3, d2 \n" \ "vmov.u8 d3, d2 \n" \
...@@ -114,7 +129,9 @@ void I444ToARGBRow_NEON(const uint8* src_y, ...@@ -114,7 +129,9 @@ void I444ToARGBRow_NEON(const uint8* src_y,
uint8* dst_argb, uint8* dst_argb,
int width) { int width) {
asm volatile ( asm volatile (
MEMACCESS(5)
"vld1.8 {d24}, [%5] \n" "vld1.8 {d24}, [%5] \n"
MEMACCESS(6)
"vld1.8 {d25}, [%6] \n" "vld1.8 {d25}, [%6] \n"
"vmov.u8 d26, #128 \n" "vmov.u8 d26, #128 \n"
"vmov.u16 q14, #74 \n" "vmov.u16 q14, #74 \n"
...@@ -125,6 +142,7 @@ void I444ToARGBRow_NEON(const uint8* src_y, ...@@ -125,6 +142,7 @@ void I444ToARGBRow_NEON(const uint8* src_y,
YUV422TORGB YUV422TORGB
"subs %4, %4, #8 \n" "subs %4, %4, #8 \n"
"vmov.u8 d23, #255 \n" "vmov.u8 d23, #255 \n"
MEMACCESS(3)
"vst4.8 {d20, d21, d22, d23}, [%3]! \n" "vst4.8 {d20, d21, d22, d23}, [%3]! \n"
"bgt 1b \n" "bgt 1b \n"
: "+r"(src_y), // %0 : "+r"(src_y), // %0
...@@ -145,7 +163,9 @@ void I422ToARGBRow_NEON(const uint8* src_y, ...@@ -145,7 +163,9 @@ void I422ToARGBRow_NEON(const uint8* src_y,
uint8* dst_argb, uint8* dst_argb,
int width) { int width) {
asm volatile ( asm volatile (
MEMACCESS(5)
"vld1.8 {d24}, [%5] \n" "vld1.8 {d24}, [%5] \n"
MEMACCESS(6)
"vld1.8 {d25}, [%6] \n" "vld1.8 {d25}, [%6] \n"
"vmov.u8 d26, #128 \n" "vmov.u8 d26, #128 \n"
"vmov.u16 q14, #74 \n" "vmov.u16 q14, #74 \n"
...@@ -156,6 +176,7 @@ void I422ToARGBRow_NEON(const uint8* src_y, ...@@ -156,6 +176,7 @@ void I422ToARGBRow_NEON(const uint8* src_y,
YUV422TORGB YUV422TORGB
"subs %4, %4, #8 \n" "subs %4, %4, #8 \n"
"vmov.u8 d23, #255 \n" "vmov.u8 d23, #255 \n"
MEMACCESS(3)
"vst4.8 {d20, d21, d22, d23}, [%3]! \n" "vst4.8 {d20, d21, d22, d23}, [%3]! \n"
"bgt 1b \n" "bgt 1b \n"
: "+r"(src_y), // %0 : "+r"(src_y), // %0
...@@ -176,7 +197,9 @@ void I411ToARGBRow_NEON(const uint8* src_y, ...@@ -176,7 +197,9 @@ void I411ToARGBRow_NEON(const uint8* src_y,
uint8* dst_argb, uint8* dst_argb,
int width) { int width) {
asm volatile ( asm volatile (
MEMACCESS(5)
"vld1.8 {d24}, [%5] \n" "vld1.8 {d24}, [%5] \n"
MEMACCESS(6)
"vld1.8 {d25}, [%6] \n" "vld1.8 {d25}, [%6] \n"
"vmov.u8 d26, #128 \n" "vmov.u8 d26, #128 \n"
"vmov.u16 q14, #74 \n" "vmov.u16 q14, #74 \n"
...@@ -187,6 +210,7 @@ void I411ToARGBRow_NEON(const uint8* src_y, ...@@ -187,6 +210,7 @@ void I411ToARGBRow_NEON(const uint8* src_y,
YUV422TORGB YUV422TORGB
"subs %4, %4, #8 \n" "subs %4, %4, #8 \n"
"vmov.u8 d23, #255 \n" "vmov.u8 d23, #255 \n"
MEMACCESS(3)
"vst4.8 {d20, d21, d22, d23}, [%3]! \n" "vst4.8 {d20, d21, d22, d23}, [%3]! \n"
"bgt 1b \n" "bgt 1b \n"
: "+r"(src_y), // %0 : "+r"(src_y), // %0
...@@ -207,7 +231,9 @@ void I422ToBGRARow_NEON(const uint8* src_y, ...@@ -207,7 +231,9 @@ void I422ToBGRARow_NEON(const uint8* src_y,
uint8* dst_bgra, uint8* dst_bgra,
int width) { int width) {
asm volatile ( asm volatile (
MEMACCESS(5)
"vld1.8 {d24}, [%5] \n" "vld1.8 {d24}, [%5] \n"
MEMACCESS(6)
"vld1.8 {d25}, [%6] \n" "vld1.8 {d25}, [%6] \n"
"vmov.u8 d26, #128 \n" "vmov.u8 d26, #128 \n"
"vmov.u16 q14, #74 \n" "vmov.u16 q14, #74 \n"
...@@ -219,6 +245,7 @@ void I422ToBGRARow_NEON(const uint8* src_y, ...@@ -219,6 +245,7 @@ void I422ToBGRARow_NEON(const uint8* src_y,
"subs %4, %4, #8 \n" "subs %4, %4, #8 \n"
"vswp.u8 d20, d22 \n" "vswp.u8 d20, d22 \n"
"vmov.u8 d19, #255 \n" "vmov.u8 d19, #255 \n"
MEMACCESS(3)
"vst4.8 {d19, d20, d21, d22}, [%3]! \n" "vst4.8 {d19, d20, d21, d22}, [%3]! \n"
"bgt 1b \n" "bgt 1b \n"
: "+r"(src_y), // %0 : "+r"(src_y), // %0
...@@ -239,7 +266,9 @@ void I422ToABGRRow_NEON(const uint8* src_y, ...@@ -239,7 +266,9 @@ void I422ToABGRRow_NEON(const uint8* src_y,
uint8* dst_abgr, uint8* dst_abgr,
int width) { int width) {
asm volatile ( asm volatile (
MEMACCESS(5)
"vld1.8 {d24}, [%5] \n" "vld1.8 {d24}, [%5] \n"
MEMACCESS(6)
"vld1.8 {d25}, [%6] \n" "vld1.8 {d25}, [%6] \n"
"vmov.u8 d26, #128 \n" "vmov.u8 d26, #128 \n"
"vmov.u16 q14, #74 \n" "vmov.u16 q14, #74 \n"
...@@ -251,6 +280,7 @@ void I422ToABGRRow_NEON(const uint8* src_y, ...@@ -251,6 +280,7 @@ void I422ToABGRRow_NEON(const uint8* src_y,
"subs %4, %4, #8 \n" "subs %4, %4, #8 \n"
"vswp.u8 d20, d22 \n" "vswp.u8 d20, d22 \n"
"vmov.u8 d23, #255 \n" "vmov.u8 d23, #255 \n"
MEMACCESS(3)
"vst4.8 {d20, d21, d22, d23}, [%3]! \n" "vst4.8 {d20, d21, d22, d23}, [%3]! \n"
"bgt 1b \n" "bgt 1b \n"
: "+r"(src_y), // %0 : "+r"(src_y), // %0
...@@ -271,7 +301,9 @@ void I422ToRGBARow_NEON(const uint8* src_y, ...@@ -271,7 +301,9 @@ void I422ToRGBARow_NEON(const uint8* src_y,
uint8* dst_rgba, uint8* dst_rgba,
int width) { int width) {
asm volatile ( asm volatile (
MEMACCESS(5)
"vld1.8 {d24}, [%5] \n" "vld1.8 {d24}, [%5] \n"
MEMACCESS(6)
"vld1.8 {d25}, [%6] \n" "vld1.8 {d25}, [%6] \n"
"vmov.u8 d26, #128 \n" "vmov.u8 d26, #128 \n"
"vmov.u16 q14, #74 \n" "vmov.u16 q14, #74 \n"
...@@ -282,6 +314,7 @@ void I422ToRGBARow_NEON(const uint8* src_y, ...@@ -282,6 +314,7 @@ void I422ToRGBARow_NEON(const uint8* src_y,
YUV422TORGB YUV422TORGB
"subs %4, %4, #8 \n" "subs %4, %4, #8 \n"
"vmov.u8 d19, #255 \n" "vmov.u8 d19, #255 \n"
MEMACCESS(3)
"vst4.8 {d19, d20, d21, d22}, [%3]! \n" "vst4.8 {d19, d20, d21, d22}, [%3]! \n"
"bgt 1b \n" "bgt 1b \n"
: "+r"(src_y), // %0 : "+r"(src_y), // %0
...@@ -302,7 +335,9 @@ void I422ToRGB24Row_NEON(const uint8* src_y, ...@@ -302,7 +335,9 @@ void I422ToRGB24Row_NEON(const uint8* src_y,
uint8* dst_rgb24, uint8* dst_rgb24,
int width) { int width) {
asm volatile ( asm volatile (
MEMACCESS(5)
"vld1.8 {d24}, [%5] \n" "vld1.8 {d24}, [%5] \n"
MEMACCESS(6)
"vld1.8 {d25}, [%6] \n" "vld1.8 {d25}, [%6] \n"
"vmov.u8 d26, #128 \n" "vmov.u8 d26, #128 \n"
"vmov.u16 q14, #74 \n" "vmov.u16 q14, #74 \n"
...@@ -312,6 +347,7 @@ void I422ToRGB24Row_NEON(const uint8* src_y, ...@@ -312,6 +347,7 @@ void I422ToRGB24Row_NEON(const uint8* src_y,
READYUV422 READYUV422
YUV422TORGB YUV422TORGB
"subs %4, %4, #8 \n" "subs %4, %4, #8 \n"
MEMACCESS(3)
"vst3.8 {d20, d21, d22}, [%3]! \n" "vst3.8 {d20, d21, d22}, [%3]! \n"
"bgt 1b \n" "bgt 1b \n"
: "+r"(src_y), // %0 : "+r"(src_y), // %0
...@@ -332,7 +368,9 @@ void I422ToRAWRow_NEON(const uint8* src_y, ...@@ -332,7 +368,9 @@ void I422ToRAWRow_NEON(const uint8* src_y,
uint8* dst_raw, uint8* dst_raw,
int width) { int width) {
asm volatile ( asm volatile (
MEMACCESS(5)
"vld1.8 {d24}, [%5] \n" "vld1.8 {d24}, [%5] \n"
MEMACCESS(6)
"vld1.8 {d25}, [%6] \n" "vld1.8 {d25}, [%6] \n"
"vmov.u8 d26, #128 \n" "vmov.u8 d26, #128 \n"
"vmov.u16 q14, #74 \n" "vmov.u16 q14, #74 \n"
...@@ -343,6 +381,7 @@ void I422ToRAWRow_NEON(const uint8* src_y, ...@@ -343,6 +381,7 @@ void I422ToRAWRow_NEON(const uint8* src_y,
YUV422TORGB YUV422TORGB
"subs %4, %4, #8 \n" "subs %4, %4, #8 \n"
"vswp.u8 d20, d22 \n" "vswp.u8 d20, d22 \n"
MEMACCESS(3)
"vst3.8 {d20, d21, d22}, [%3]! \n" "vst3.8 {d20, d21, d22}, [%3]! \n"
"bgt 1b \n" "bgt 1b \n"
: "+r"(src_y), // %0 : "+r"(src_y), // %0
...@@ -375,7 +414,9 @@ void I422ToRGB565Row_NEON(const uint8* src_y, ...@@ -375,7 +414,9 @@ void I422ToRGB565Row_NEON(const uint8* src_y,
uint8* dst_rgb565, uint8* dst_rgb565,
int width) { int width) {
asm volatile ( asm volatile (
MEMACCESS(5)
"vld1.8 {d24}, [%5] \n" "vld1.8 {d24}, [%5] \n"
MEMACCESS(6)
"vld1.8 {d25}, [%6] \n" "vld1.8 {d25}, [%6] \n"
"vmov.u8 d26, #128 \n" "vmov.u8 d26, #128 \n"
"vmov.u16 q14, #74 \n" "vmov.u16 q14, #74 \n"
...@@ -386,6 +427,7 @@ void I422ToRGB565Row_NEON(const uint8* src_y, ...@@ -386,6 +427,7 @@ void I422ToRGB565Row_NEON(const uint8* src_y,
YUV422TORGB YUV422TORGB
"subs %4, %4, #8 \n" "subs %4, %4, #8 \n"
ARGBTORGB565 ARGBTORGB565
MEMACCESS(3)
"vst1.8 {q0}, [%3]! \n" // store 8 pixels RGB565. "vst1.8 {q0}, [%3]! \n" // store 8 pixels RGB565.
"bgt 1b \n" "bgt 1b \n"
: "+r"(src_y), // %0 : "+r"(src_y), // %0
...@@ -421,7 +463,9 @@ void I422ToARGB1555Row_NEON(const uint8* src_y, ...@@ -421,7 +463,9 @@ void I422ToARGB1555Row_NEON(const uint8* src_y,
uint8* dst_argb1555, uint8* dst_argb1555,
int width) { int width) {
asm volatile ( asm volatile (
MEMACCESS(5)
"vld1.8 {d24}, [%5] \n" "vld1.8 {d24}, [%5] \n"
MEMACCESS(6)
"vld1.8 {d25}, [%6] \n" "vld1.8 {d25}, [%6] \n"
"vmov.u8 d26, #128 \n" "vmov.u8 d26, #128 \n"
"vmov.u16 q14, #74 \n" "vmov.u16 q14, #74 \n"
...@@ -433,6 +477,7 @@ void I422ToARGB1555Row_NEON(const uint8* src_y, ...@@ -433,6 +477,7 @@ void I422ToARGB1555Row_NEON(const uint8* src_y,
"subs %4, %4, #8 \n" "subs %4, %4, #8 \n"
"vmov.u8 d23, #255 \n" "vmov.u8 d23, #255 \n"
ARGBTOARGB1555 ARGBTOARGB1555
MEMACCESS(3)
"vst1.8 {q0}, [%3]! \n" // store 8 pixels ARGB1555. "vst1.8 {q0}, [%3]! \n" // store 8 pixels ARGB1555.
"bgt 1b \n" "bgt 1b \n"
: "+r"(src_y), // %0 : "+r"(src_y), // %0
...@@ -462,7 +507,9 @@ void I422ToARGB4444Row_NEON(const uint8* src_y, ...@@ -462,7 +507,9 @@ void I422ToARGB4444Row_NEON(const uint8* src_y,
uint8* dst_argb4444, uint8* dst_argb4444,
int width) { int width) {
asm volatile ( asm volatile (
MEMACCESS(5)
"vld1.8 {d24}, [%5] \n" "vld1.8 {d24}, [%5] \n"
MEMACCESS(6)
"vld1.8 {d25}, [%6] \n" "vld1.8 {d25}, [%6] \n"
"vmov.u8 d26, #128 \n" "vmov.u8 d26, #128 \n"
"vmov.u16 q14, #74 \n" "vmov.u16 q14, #74 \n"
...@@ -475,6 +522,7 @@ void I422ToARGB4444Row_NEON(const uint8* src_y, ...@@ -475,6 +522,7 @@ void I422ToARGB4444Row_NEON(const uint8* src_y,
"subs %4, %4, #8 \n" "subs %4, %4, #8 \n"
"vmov.u8 d23, #255 \n" "vmov.u8 d23, #255 \n"
ARGBTOARGB4444 ARGBTOARGB4444
MEMACCESS(3)
"vst1.8 {q0}, [%3]! \n" // store 8 pixels ARGB4444. "vst1.8 {q0}, [%3]! \n" // store 8 pixels ARGB4444.
"bgt 1b \n" "bgt 1b \n"
: "+r"(src_y), // %0 : "+r"(src_y), // %0
...@@ -493,7 +541,10 @@ void YToARGBRow_NEON(const uint8* src_y, ...@@ -493,7 +541,10 @@ void YToARGBRow_NEON(const uint8* src_y,
uint8* dst_argb, uint8* dst_argb,
int width) { int width) {
asm volatile ( asm volatile (
MEMACCESS(3)
MEMACCESS(3)
"vld1.8 {d24}, [%3] \n" "vld1.8 {d24}, [%3] \n"
MEMACCESS(4)
"vld1.8 {d25}, [%4] \n" "vld1.8 {d25}, [%4] \n"
"vmov.u8 d26, #128 \n" "vmov.u8 d26, #128 \n"
"vmov.u16 q14, #74 \n" "vmov.u16 q14, #74 \n"
...@@ -504,6 +555,7 @@ void YToARGBRow_NEON(const uint8* src_y, ...@@ -504,6 +555,7 @@ void YToARGBRow_NEON(const uint8* src_y,
YUV422TORGB YUV422TORGB
"subs %2, %2, #8 \n" "subs %2, %2, #8 \n"
"vmov.u8 d23, #255 \n" "vmov.u8 d23, #255 \n"
MEMACCESS(1)
"vst4.8 {d20, d21, d22, d23}, [%1]! \n" "vst4.8 {d20, d21, d22, d23}, [%1]! \n"
"bgt 1b \n" "bgt 1b \n"
: "+r"(src_y), // %0 : "+r"(src_y), // %0
...@@ -523,10 +575,12 @@ void I400ToARGBRow_NEON(const uint8* src_y, ...@@ -523,10 +575,12 @@ void I400ToARGBRow_NEON(const uint8* src_y,
".p2align 2 \n" ".p2align 2 \n"
"vmov.u8 d23, #255 \n" "vmov.u8 d23, #255 \n"
"1: \n" "1: \n"
MEMACCESS(0)
"vld1.8 {d20}, [%0]! \n" "vld1.8 {d20}, [%0]! \n"
"vmov d21, d20 \n" "vmov d21, d20 \n"
"vmov d22, d20 \n" "vmov d22, d20 \n"
"subs %2, %2, #8 \n" "subs %2, %2, #8 \n"
MEMACCESS(1)
"vst4.8 {d20, d21, d22, d23}, [%1]! \n" "vst4.8 {d20, d21, d22, d23}, [%1]! \n"
"bgt 1b \n" "bgt 1b \n"
: "+r"(src_y), // %0 : "+r"(src_y), // %0
...@@ -542,7 +596,9 @@ void NV12ToARGBRow_NEON(const uint8* src_y, ...@@ -542,7 +596,9 @@ void NV12ToARGBRow_NEON(const uint8* src_y,
uint8* dst_argb, uint8* dst_argb,
int width) { int width) {
asm volatile ( asm volatile (
MEMACCESS(4)
"vld1.8 {d24}, [%4] \n" "vld1.8 {d24}, [%4] \n"
MEMACCESS(5)
"vld1.8 {d25}, [%5] \n" "vld1.8 {d25}, [%5] \n"
"vmov.u8 d26, #128 \n" "vmov.u8 d26, #128 \n"
"vmov.u16 q14, #74 \n" "vmov.u16 q14, #74 \n"
...@@ -553,6 +609,7 @@ void NV12ToARGBRow_NEON(const uint8* src_y, ...@@ -553,6 +609,7 @@ void NV12ToARGBRow_NEON(const uint8* src_y,
YUV422TORGB YUV422TORGB
"subs %3, %3, #8 \n" "subs %3, %3, #8 \n"
"vmov.u8 d23, #255 \n" "vmov.u8 d23, #255 \n"
MEMACCESS(2)
"vst4.8 {d20, d21, d22, d23}, [%2]! \n" "vst4.8 {d20, d21, d22, d23}, [%2]! \n"
"bgt 1b \n" "bgt 1b \n"
: "+r"(src_y), // %0 : "+r"(src_y), // %0
...@@ -571,7 +628,9 @@ void NV21ToARGBRow_NEON(const uint8* src_y, ...@@ -571,7 +628,9 @@ void NV21ToARGBRow_NEON(const uint8* src_y,
uint8* dst_argb, uint8* dst_argb,
int width) { int width) {
asm volatile ( asm volatile (
MEMACCESS(4)
"vld1.8 {d24}, [%4] \n" "vld1.8 {d24}, [%4] \n"
MEMACCESS(5)
"vld1.8 {d25}, [%5] \n" "vld1.8 {d25}, [%5] \n"
"vmov.u8 d26, #128 \n" "vmov.u8 d26, #128 \n"
"vmov.u16 q14, #74 \n" "vmov.u16 q14, #74 \n"
...@@ -582,6 +641,7 @@ void NV21ToARGBRow_NEON(const uint8* src_y, ...@@ -582,6 +641,7 @@ void NV21ToARGBRow_NEON(const uint8* src_y,
YUV422TORGB YUV422TORGB
"subs %3, %3, #8 \n" "subs %3, %3, #8 \n"
"vmov.u8 d23, #255 \n" "vmov.u8 d23, #255 \n"
MEMACCESS(2)
"vst4.8 {d20, d21, d22, d23}, [%2]! \n" "vst4.8 {d20, d21, d22, d23}, [%2]! \n"
"bgt 1b \n" "bgt 1b \n"
: "+r"(src_y), // %0 : "+r"(src_y), // %0
...@@ -600,7 +660,9 @@ void NV12ToRGB565Row_NEON(const uint8* src_y, ...@@ -600,7 +660,9 @@ void NV12ToRGB565Row_NEON(const uint8* src_y,
uint8* dst_rgb565, uint8* dst_rgb565,
int width) { int width) {
asm volatile ( asm volatile (
MEMACCESS(4)
"vld1.8 {d24}, [%4] \n" "vld1.8 {d24}, [%4] \n"
MEMACCESS(5)
"vld1.8 {d25}, [%5] \n" "vld1.8 {d25}, [%5] \n"
"vmov.u8 d26, #128 \n" "vmov.u8 d26, #128 \n"
"vmov.u16 q14, #74 \n" "vmov.u16 q14, #74 \n"
...@@ -611,6 +673,7 @@ void NV12ToRGB565Row_NEON(const uint8* src_y, ...@@ -611,6 +673,7 @@ void NV12ToRGB565Row_NEON(const uint8* src_y,
YUV422TORGB YUV422TORGB
"subs %3, %3, #8 \n" "subs %3, %3, #8 \n"
ARGBTORGB565 ARGBTORGB565
MEMACCESS(2)
"vst1.8 {q0}, [%2]! \n" // store 8 pixels RGB565. "vst1.8 {q0}, [%2]! \n" // store 8 pixels RGB565.
"bgt 1b \n" "bgt 1b \n"
: "+r"(src_y), // %0 : "+r"(src_y), // %0
...@@ -629,7 +692,9 @@ void NV21ToRGB565Row_NEON(const uint8* src_y, ...@@ -629,7 +692,9 @@ void NV21ToRGB565Row_NEON(const uint8* src_y,
uint8* dst_rgb565, uint8* dst_rgb565,
int width) { int width) {
asm volatile ( asm volatile (
MEMACCESS(4)
"vld1.8 {d24}, [%4] \n" "vld1.8 {d24}, [%4] \n"
MEMACCESS(5)
"vld1.8 {d25}, [%5] \n" "vld1.8 {d25}, [%5] \n"
"vmov.u8 d26, #128 \n" "vmov.u8 d26, #128 \n"
"vmov.u16 q14, #74 \n" "vmov.u16 q14, #74 \n"
...@@ -640,6 +705,7 @@ void NV21ToRGB565Row_NEON(const uint8* src_y, ...@@ -640,6 +705,7 @@ void NV21ToRGB565Row_NEON(const uint8* src_y,
YUV422TORGB YUV422TORGB
"subs %3, %3, #8 \n" "subs %3, %3, #8 \n"
ARGBTORGB565 ARGBTORGB565
MEMACCESS(2)
"vst1.8 {q0}, [%2]! \n" // store 8 pixels RGB565. "vst1.8 {q0}, [%2]! \n" // store 8 pixels RGB565.
"bgt 1b \n" "bgt 1b \n"
: "+r"(src_y), // %0 : "+r"(src_y), // %0
...@@ -657,7 +723,9 @@ void YUY2ToARGBRow_NEON(const uint8* src_yuy2, ...@@ -657,7 +723,9 @@ void YUY2ToARGBRow_NEON(const uint8* src_yuy2,
uint8* dst_argb, uint8* dst_argb,
int width) { int width) {
asm volatile ( asm volatile (
MEMACCESS(3)
"vld1.8 {d24}, [%3] \n" "vld1.8 {d24}, [%3] \n"
MEMACCESS(4)
"vld1.8 {d25}, [%4] \n" "vld1.8 {d25}, [%4] \n"
"vmov.u8 d26, #128 \n" "vmov.u8 d26, #128 \n"
"vmov.u16 q14, #74 \n" "vmov.u16 q14, #74 \n"
...@@ -668,6 +736,7 @@ void YUY2ToARGBRow_NEON(const uint8* src_yuy2, ...@@ -668,6 +736,7 @@ void YUY2ToARGBRow_NEON(const uint8* src_yuy2,
YUV422TORGB YUV422TORGB
"subs %2, %2, #8 \n" "subs %2, %2, #8 \n"
"vmov.u8 d23, #255 \n" "vmov.u8 d23, #255 \n"
MEMACCESS(1)
"vst4.8 {d20, d21, d22, d23}, [%1]! \n" "vst4.8 {d20, d21, d22, d23}, [%1]! \n"
"bgt 1b \n" "bgt 1b \n"
: "+r"(src_yuy2), // %0 : "+r"(src_yuy2), // %0
...@@ -684,7 +753,9 @@ void UYVYToARGBRow_NEON(const uint8* src_uyvy, ...@@ -684,7 +753,9 @@ void UYVYToARGBRow_NEON(const uint8* src_uyvy,
uint8* dst_argb, uint8* dst_argb,
int width) { int width) {
asm volatile ( asm volatile (
MEMACCESS(3)
"vld1.8 {d24}, [%3] \n" "vld1.8 {d24}, [%3] \n"
MEMACCESS(4)
"vld1.8 {d25}, [%4] \n" "vld1.8 {d25}, [%4] \n"
"vmov.u8 d26, #128 \n" "vmov.u8 d26, #128 \n"
"vmov.u16 q14, #74 \n" "vmov.u16 q14, #74 \n"
...@@ -695,6 +766,7 @@ void UYVYToARGBRow_NEON(const uint8* src_uyvy, ...@@ -695,6 +766,7 @@ void UYVYToARGBRow_NEON(const uint8* src_uyvy,
YUV422TORGB YUV422TORGB
"subs %2, %2, #8 \n" "subs %2, %2, #8 \n"
"vmov.u8 d23, #255 \n" "vmov.u8 d23, #255 \n"
MEMACCESS(1)
"vst4.8 {d20, d21, d22, d23}, [%1]! \n" "vst4.8 {d20, d21, d22, d23}, [%1]! \n"
"bgt 1b \n" "bgt 1b \n"
: "+r"(src_uyvy), // %0 : "+r"(src_uyvy), // %0
...@@ -713,9 +785,12 @@ void SplitUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v, ...@@ -713,9 +785,12 @@ void SplitUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
asm volatile ( asm volatile (
".p2align 2 \n" ".p2align 2 \n"
"1: \n" "1: \n"
MEMACCESS(0)
"vld2.8 {q0, q1}, [%0]! \n" // load 16 pairs of UV "vld2.8 {q0, q1}, [%0]! \n" // load 16 pairs of UV
"subs %3, %3, #16 \n" // 16 processed per loop "subs %3, %3, #16 \n" // 16 processed per loop
MEMACCESS(1)
"vst1.8 {q0}, [%1]! \n" // store U "vst1.8 {q0}, [%1]! \n" // store U
MEMACCESS(2)
"vst1.8 {q1}, [%2]! \n" // store V "vst1.8 {q1}, [%2]! \n" // store V
"bgt 1b \n" "bgt 1b \n"
: "+r"(src_uv), // %0 : "+r"(src_uv), // %0
...@@ -733,9 +808,12 @@ void MergeUVRow_NEON(const uint8* src_u, const uint8* src_v, uint8* dst_uv, ...@@ -733,9 +808,12 @@ void MergeUVRow_NEON(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
asm volatile ( asm volatile (
".p2align 2 \n" ".p2align 2 \n"
"1: \n" "1: \n"
MEMACCESS(0)
"vld1.8 {q0}, [%0]! \n" // load U "vld1.8 {q0}, [%0]! \n" // load U
MEMACCESS(1)
"vld1.8 {q1}, [%1]! \n" // load V "vld1.8 {q1}, [%1]! \n" // load V
"subs %3, %3, #16 \n" // 16 processed per loop "subs %3, %3, #16 \n" // 16 processed per loop
MEMACCESS(2)
"vst2.u8 {q0, q1}, [%2]! \n" // store 16 pairs of UV "vst2.u8 {q0, q1}, [%2]! \n" // store 16 pairs of UV
"bgt 1b \n" "bgt 1b \n"
: :
...@@ -753,8 +831,10 @@ void CopyRow_NEON(const uint8* src, uint8* dst, int count) { ...@@ -753,8 +831,10 @@ void CopyRow_NEON(const uint8* src, uint8* dst, int count) {
asm volatile ( asm volatile (
".p2align 2 \n" ".p2align 2 \n"
"1: \n" "1: \n"
MEMACCESS(0)
"vld1.8 {d0, d1, d2, d3}, [%0]! \n" // load 32 "vld1.8 {d0, d1, d2, d3}, [%0]! \n" // load 32
"subs %2, %2, #32 \n" // 32 processed per loop "subs %2, %2, #32 \n" // 32 processed per loop
MEMACCESS(1)
"vst1.8 {d0, d1, d2, d3}, [%1]! \n" // store 32 "vst1.8 {d0, d1, d2, d3}, [%1]! \n" // store 32
"bgt 1b \n" "bgt 1b \n"
: "+r"(src), // %0 : "+r"(src), // %0
...@@ -771,6 +851,7 @@ void SetRow_NEON(uint8* dst, uint32 v32, int count) { ...@@ -771,6 +851,7 @@ void SetRow_NEON(uint8* dst, uint32 v32, int count) {
"vdup.u32 q0, %2 \n" // duplicate 4 ints "vdup.u32 q0, %2 \n" // duplicate 4 ints
"1: \n" "1: \n"
"subs %1, %1, #16 \n" // 16 bytes per loop "subs %1, %1, #16 \n" // 16 bytes per loop
MEMACCESS(0)
"vst1.8 {q0}, [%0]! \n" // store "vst1.8 {q0}, [%0]! \n" // store
"bgt 1b \n" "bgt 1b \n"
: "+r"(dst), // %0 : "+r"(dst), // %0
...@@ -799,10 +880,13 @@ void MirrorRow_NEON(const uint8* src, uint8* dst, int width) { ...@@ -799,10 +880,13 @@ void MirrorRow_NEON(const uint8* src, uint8* dst, int width) {
".p2align 2 \n" ".p2align 2 \n"
"1: \n" "1: \n"
MEMACCESS(0)
"vld1.8 {q0}, [%0], r3 \n" // src -= 16 "vld1.8 {q0}, [%0], r3 \n" // src -= 16
"subs %2, #16 \n" // 16 pixels per loop. "subs %2, #16 \n" // 16 pixels per loop.
"vrev64.8 q0, q0 \n" "vrev64.8 q0, q0 \n"
MEMACCESS(1)
"vst1.8 {d1}, [%1]! \n" // dst += 16 "vst1.8 {d1}, [%1]! \n" // dst += 16
MEMACCESS(1)
"vst1.8 {d0}, [%1]! \n" "vst1.8 {d0}, [%1]! \n"
"bgt 1b \n" "bgt 1b \n"
: "+r"(src), // %0 : "+r"(src), // %0
...@@ -823,10 +907,13 @@ void MirrorUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v, ...@@ -823,10 +907,13 @@ void MirrorUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
".p2align 2 \n" ".p2align 2 \n"
"1: \n" "1: \n"
MEMACCESS(0)
"vld2.8 {d0, d1}, [%0], r12 \n" // src -= 16 "vld2.8 {d0, d1}, [%0], r12 \n" // src -= 16
"subs %3, #8 \n" // 8 pixels per loop. "subs %3, #8 \n" // 8 pixels per loop.
"vrev64.8 q0, q0 \n" "vrev64.8 q0, q0 \n"
MEMACCESS(1)
"vst1.8 {d0}, [%1]! \n" // dst += 8 "vst1.8 {d0}, [%1]! \n" // dst += 8
MEMACCESS(2)
"vst1.8 {d1}, [%2]! \n" "vst1.8 {d1}, [%2]! \n"
"bgt 1b \n" "bgt 1b \n"
: "+r"(src_uv), // %0 : "+r"(src_uv), // %0
...@@ -847,10 +934,13 @@ void ARGBMirrorRow_NEON(const uint8* src, uint8* dst, int width) { ...@@ -847,10 +934,13 @@ void ARGBMirrorRow_NEON(const uint8* src, uint8* dst, int width) {
".p2align 2 \n" ".p2align 2 \n"
"1: \n" "1: \n"
MEMACCESS(0)
"vld1.8 {q0}, [%0], r3 \n" // src -= 16 "vld1.8 {q0}, [%0], r3 \n" // src -= 16
"subs %2, #4 \n" // 4 pixels per loop. "subs %2, #4 \n" // 4 pixels per loop.
"vrev64.32 q0, q0 \n" "vrev64.32 q0, q0 \n"
MEMACCESS(1)
"vst1.8 {d1}, [%1]! \n" // dst += 16 "vst1.8 {d1}, [%1]! \n" // dst += 16
MEMACCESS(1)
"vst1.8 {d0}, [%1]! \n" "vst1.8 {d0}, [%1]! \n"
"bgt 1b \n" "bgt 1b \n"
: "+r"(src), // %0 : "+r"(src), // %0
...@@ -866,8 +956,10 @@ void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int pix) { ...@@ -866,8 +956,10 @@ void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int pix) {
"vmov.u8 d4, #255 \n" // Alpha "vmov.u8 d4, #255 \n" // Alpha
".p2align 2 \n" ".p2align 2 \n"
"1: \n" "1: \n"
MEMACCESS(0)
"vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RGB24. "vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RGB24.
"subs %2, %2, #8 \n" // 8 processed per loop. "subs %2, %2, #8 \n" // 8 processed per loop.
MEMACCESS(1)
"vst4.8 {d1, d2, d3, d4}, [%1]! \n" // store 8 pixels of ARGB. "vst4.8 {d1, d2, d3, d4}, [%1]! \n" // store 8 pixels of ARGB.
"bgt 1b \n" "bgt 1b \n"
: "+r"(src_rgb24), // %0 : "+r"(src_rgb24), // %0
...@@ -883,9 +975,11 @@ void RAWToARGBRow_NEON(const uint8* src_raw, uint8* dst_argb, int pix) { ...@@ -883,9 +975,11 @@ void RAWToARGBRow_NEON(const uint8* src_raw, uint8* dst_argb, int pix) {
"vmov.u8 d4, #255 \n" // Alpha "vmov.u8 d4, #255 \n" // Alpha
".p2align 2 \n" ".p2align 2 \n"
"1: \n" "1: \n"
MEMACCESS(0)
"vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RAW. "vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RAW.
"subs %2, %2, #8 \n" // 8 processed per loop. "subs %2, %2, #8 \n" // 8 processed per loop.
"vswp.u8 d1, d3 \n" // swap R, B "vswp.u8 d1, d3 \n" // swap R, B
MEMACCESS(1)
"vst4.8 {d1, d2, d3, d4}, [%1]! \n" // store 8 pixels of ARGB. "vst4.8 {d1, d2, d3, d4}, [%1]! \n" // store 8 pixels of ARGB.
"bgt 1b \n" "bgt 1b \n"
: "+r"(src_raw), // %0 : "+r"(src_raw), // %0
...@@ -913,9 +1007,11 @@ void RGB565ToARGBRow_NEON(const uint8* src_rgb565, uint8* dst_argb, int pix) { ...@@ -913,9 +1007,11 @@ void RGB565ToARGBRow_NEON(const uint8* src_rgb565, uint8* dst_argb, int pix) {
"vmov.u8 d3, #255 \n" // Alpha "vmov.u8 d3, #255 \n" // Alpha
".p2align 2 \n" ".p2align 2 \n"
"1: \n" "1: \n"
MEMACCESS(0)
"vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels. "vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels.
"subs %2, %2, #8 \n" // 8 processed per loop. "subs %2, %2, #8 \n" // 8 processed per loop.
RGB565TOARGB RGB565TOARGB
MEMACCESS(1)
"vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB. "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB.
"bgt 1b \n" "bgt 1b \n"
: "+r"(src_rgb565), // %0 : "+r"(src_rgb565), // %0
...@@ -959,9 +1055,11 @@ void ARGB1555ToARGBRow_NEON(const uint8* src_argb1555, uint8* dst_argb, ...@@ -959,9 +1055,11 @@ void ARGB1555ToARGBRow_NEON(const uint8* src_argb1555, uint8* dst_argb,
"vmov.u8 d3, #255 \n" // Alpha "vmov.u8 d3, #255 \n" // Alpha
".p2align 2 \n" ".p2align 2 \n"
"1: \n" "1: \n"
MEMACCESS(0)
"vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels. "vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels.
"subs %2, %2, #8 \n" // 8 processed per loop. "subs %2, %2, #8 \n" // 8 processed per loop.
ARGB1555TOARGB ARGB1555TOARGB
MEMACCESS(1)
"vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB. "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB.
"bgt 1b \n" "bgt 1b \n"
: "+r"(src_argb1555), // %0 : "+r"(src_argb1555), // %0
...@@ -988,9 +1086,11 @@ void ARGB4444ToARGBRow_NEON(const uint8* src_argb4444, uint8* dst_argb, ...@@ -988,9 +1086,11 @@ void ARGB4444ToARGBRow_NEON(const uint8* src_argb4444, uint8* dst_argb,
"vmov.u8 d3, #255 \n" // Alpha "vmov.u8 d3, #255 \n" // Alpha
".p2align 2 \n" ".p2align 2 \n"
"1: \n" "1: \n"
MEMACCESS(0)
"vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels. "vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels.
"subs %2, %2, #8 \n" // 8 processed per loop. "subs %2, %2, #8 \n" // 8 processed per loop.
ARGB4444TOARGB ARGB4444TOARGB
MEMACCESS(1)
"vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB. "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB.
"bgt 1b \n" "bgt 1b \n"
: "+r"(src_argb4444), // %0 : "+r"(src_argb4444), // %0
...@@ -1005,8 +1105,10 @@ void ARGBToRGB24Row_NEON(const uint8* src_argb, uint8* dst_rgb24, int pix) { ...@@ -1005,8 +1105,10 @@ void ARGBToRGB24Row_NEON(const uint8* src_argb, uint8* dst_rgb24, int pix) {
asm volatile ( asm volatile (
".p2align 2 \n" ".p2align 2 \n"
"1: \n" "1: \n"
MEMACCESS(0)
"vld4.8 {d1, d2, d3, d4}, [%0]! \n" // load 8 pixels of ARGB. "vld4.8 {d1, d2, d3, d4}, [%0]! \n" // load 8 pixels of ARGB.
"subs %2, %2, #8 \n" // 8 processed per loop. "subs %2, %2, #8 \n" // 8 processed per loop.
MEMACCESS(1)
"vst3.8 {d1, d2, d3}, [%1]! \n" // store 8 pixels of RGB24. "vst3.8 {d1, d2, d3}, [%1]! \n" // store 8 pixels of RGB24.
"bgt 1b \n" "bgt 1b \n"
: "+r"(src_argb), // %0 : "+r"(src_argb), // %0
...@@ -1021,9 +1123,11 @@ void ARGBToRAWRow_NEON(const uint8* src_argb, uint8* dst_raw, int pix) { ...@@ -1021,9 +1123,11 @@ void ARGBToRAWRow_NEON(const uint8* src_argb, uint8* dst_raw, int pix) {
asm volatile ( asm volatile (
".p2align 2 \n" ".p2align 2 \n"
"1: \n" "1: \n"
MEMACCESS(0)
"vld4.8 {d1, d2, d3, d4}, [%0]! \n" // load 8 pixels of ARGB. "vld4.8 {d1, d2, d3, d4}, [%0]! \n" // load 8 pixels of ARGB.
"subs %2, %2, #8 \n" // 8 processed per loop. "subs %2, %2, #8 \n" // 8 processed per loop.
"vswp.u8 d1, d3 \n" // swap R, B "vswp.u8 d1, d3 \n" // swap R, B
MEMACCESS(1)
"vst3.8 {d1, d2, d3}, [%1]! \n" // store 8 pixels of RAW. "vst3.8 {d1, d2, d3}, [%1]! \n" // store 8 pixels of RAW.
"bgt 1b \n" "bgt 1b \n"
: "+r"(src_argb), // %0 : "+r"(src_argb), // %0
...@@ -1038,8 +1142,10 @@ void YUY2ToYRow_NEON(const uint8* src_yuy2, uint8* dst_y, int pix) { ...@@ -1038,8 +1142,10 @@ void YUY2ToYRow_NEON(const uint8* src_yuy2, uint8* dst_y, int pix) {
asm volatile ( asm volatile (
".p2align 2 \n" ".p2align 2 \n"
"1: \n" "1: \n"
MEMACCESS(0)
"vld2.8 {q0, q1}, [%0]! \n" // load 16 pixels of YUY2. "vld2.8 {q0, q1}, [%0]! \n" // load 16 pixels of YUY2.
"subs %2, %2, #16 \n" // 16 processed per loop. "subs %2, %2, #16 \n" // 16 processed per loop.
MEMACCESS(1)
"vst1.8 {q0}, [%1]! \n" // store 16 pixels of Y. "vst1.8 {q0}, [%1]! \n" // store 16 pixels of Y.
"bgt 1b \n" "bgt 1b \n"
: "+r"(src_yuy2), // %0 : "+r"(src_yuy2), // %0
...@@ -1054,8 +1160,10 @@ void UYVYToYRow_NEON(const uint8* src_uyvy, uint8* dst_y, int pix) { ...@@ -1054,8 +1160,10 @@ void UYVYToYRow_NEON(const uint8* src_uyvy, uint8* dst_y, int pix) {
asm volatile ( asm volatile (
".p2align 2 \n" ".p2align 2 \n"
"1: \n" "1: \n"
MEMACCESS(0)
"vld2.8 {q0, q1}, [%0]! \n" // load 16 pixels of UYVY. "vld2.8 {q0, q1}, [%0]! \n" // load 16 pixels of UYVY.
"subs %2, %2, #16 \n" // 16 processed per loop. "subs %2, %2, #16 \n" // 16 processed per loop.
MEMACCESS(1)
"vst1.8 {q1}, [%1]! \n" // store 16 pixels of Y. "vst1.8 {q1}, [%1]! \n" // store 16 pixels of Y.
"bgt 1b \n" "bgt 1b \n"
: "+r"(src_uyvy), // %0 : "+r"(src_uyvy), // %0
...@@ -1071,9 +1179,12 @@ void YUY2ToUV422Row_NEON(const uint8* src_yuy2, uint8* dst_u, uint8* dst_v, ...@@ -1071,9 +1179,12 @@ void YUY2ToUV422Row_NEON(const uint8* src_yuy2, uint8* dst_u, uint8* dst_v,
asm volatile ( asm volatile (
".p2align 2 \n" ".p2align 2 \n"
"1: \n" "1: \n"
MEMACCESS(0)
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of YUY2. "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of YUY2.
"subs %3, %3, #16 \n" // 16 pixels = 8 UVs. "subs %3, %3, #16 \n" // 16 pixels = 8 UVs.
MEMACCESS(1)
"vst1.8 {d1}, [%1]! \n" // store 8 U. "vst1.8 {d1}, [%1]! \n" // store 8 U.
MEMACCESS(2)
"vst1.8 {d3}, [%2]! \n" // store 8 V. "vst1.8 {d3}, [%2]! \n" // store 8 V.
"bgt 1b \n" "bgt 1b \n"
: "+r"(src_yuy2), // %0 : "+r"(src_yuy2), // %0
...@@ -1090,9 +1201,12 @@ void UYVYToUV422Row_NEON(const uint8* src_uyvy, uint8* dst_u, uint8* dst_v, ...@@ -1090,9 +1201,12 @@ void UYVYToUV422Row_NEON(const uint8* src_uyvy, uint8* dst_u, uint8* dst_v,
asm volatile ( asm volatile (
".p2align 2 \n" ".p2align 2 \n"
"1: \n" "1: \n"
MEMACCESS(0)
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of UYVY. "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of UYVY.
"subs %3, %3, #16 \n" // 16 pixels = 8 UVs. "subs %3, %3, #16 \n" // 16 pixels = 8 UVs.
MEMACCESS(1)
"vst1.8 {d0}, [%1]! \n" // store 8 U. "vst1.8 {d0}, [%1]! \n" // store 8 U.
MEMACCESS(2)
"vst1.8 {d2}, [%2]! \n" // store 8 V. "vst1.8 {d2}, [%2]! \n" // store 8 V.
"bgt 1b \n" "bgt 1b \n"
: "+r"(src_uyvy), // %0 : "+r"(src_uyvy), // %0
...@@ -1110,12 +1224,16 @@ void YUY2ToUVRow_NEON(const uint8* src_yuy2, int stride_yuy2, ...@@ -1110,12 +1224,16 @@ void YUY2ToUVRow_NEON(const uint8* src_yuy2, int stride_yuy2,
"add %1, %0, %1 \n" // stride + src_yuy2 "add %1, %0, %1 \n" // stride + src_yuy2
".p2align 2 \n" ".p2align 2 \n"
"1: \n" "1: \n"
MEMACCESS(0)
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of YUY2. "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of YUY2.
"subs %4, %4, #16 \n" // 16 pixels = 8 UVs. "subs %4, %4, #16 \n" // 16 pixels = 8 UVs.
MEMACCESS(1)
"vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load next row YUY2. "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load next row YUY2.
"vrhadd.u8 d1, d1, d5 \n" // average rows of U "vrhadd.u8 d1, d1, d5 \n" // average rows of U
"vrhadd.u8 d3, d3, d7 \n" // average rows of V "vrhadd.u8 d3, d3, d7 \n" // average rows of V
MEMACCESS(2)
"vst1.8 {d1}, [%2]! \n" // store 8 U. "vst1.8 {d1}, [%2]! \n" // store 8 U.
MEMACCESS(3)
"vst1.8 {d3}, [%3]! \n" // store 8 V. "vst1.8 {d3}, [%3]! \n" // store 8 V.
"bgt 1b \n" "bgt 1b \n"
: "+r"(src_yuy2), // %0 : "+r"(src_yuy2), // %0
...@@ -1134,12 +1252,16 @@ void UYVYToUVRow_NEON(const uint8* src_uyvy, int stride_uyvy, ...@@ -1134,12 +1252,16 @@ void UYVYToUVRow_NEON(const uint8* src_uyvy, int stride_uyvy,
"add %1, %0, %1 \n" // stride + src_uyvy "add %1, %0, %1 \n" // stride + src_uyvy
".p2align 2 \n" ".p2align 2 \n"
"1: \n" "1: \n"
MEMACCESS(0)
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of UYVY. "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of UYVY.
"subs %4, %4, #16 \n" // 16 pixels = 8 UVs. "subs %4, %4, #16 \n" // 16 pixels = 8 UVs.
MEMACCESS(1)
"vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load next row UYVY. "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load next row UYVY.
"vrhadd.u8 d0, d0, d4 \n" // average rows of U "vrhadd.u8 d0, d0, d4 \n" // average rows of U
"vrhadd.u8 d2, d2, d6 \n" // average rows of V "vrhadd.u8 d2, d2, d6 \n" // average rows of V
MEMACCESS(2)
"vst1.8 {d0}, [%2]! \n" // store 8 U. "vst1.8 {d0}, [%2]! \n" // store 8 U.
MEMACCESS(3)
"vst1.8 {d2}, [%3]! \n" // store 8 V. "vst1.8 {d2}, [%3]! \n" // store 8 V.
"bgt 1b \n" "bgt 1b \n"
: "+r"(src_uyvy), // %0 : "+r"(src_uyvy), // %0
...@@ -1158,10 +1280,13 @@ void HalfRow_NEON(const uint8* src_uv, int src_uv_stride, ...@@ -1158,10 +1280,13 @@ void HalfRow_NEON(const uint8* src_uv, int src_uv_stride,
// change the stride to row 2 pointer // change the stride to row 2 pointer
"add %1, %0 \n" "add %1, %0 \n"
"1: \n" "1: \n"
MEMACCESS(0)
"vld1.8 {q0}, [%0]! \n" // load row 1 16 pixels. "vld1.8 {q0}, [%0]! \n" // load row 1 16 pixels.
"subs %3, %3, #16 \n" // 16 processed per loop "subs %3, %3, #16 \n" // 16 processed per loop
MEMACCESS(1)
"vld1.8 {q1}, [%1]! \n" // load row 2 16 pixels. "vld1.8 {q1}, [%1]! \n" // load row 2 16 pixels.
"vrhadd.u8 q0, q1 \n" // average row 1 and 2 "vrhadd.u8 q0, q1 \n" // average row 1 and 2
MEMACCESS(2)
"vst1.8 {q0}, [%2]! \n" "vst1.8 {q0}, [%2]! \n"
"bgt 1b \n" "bgt 1b \n"
: "+r"(src_uv), // %0 : "+r"(src_uv), // %0
...@@ -1179,11 +1304,13 @@ void ARGBToBayerRow_NEON(const uint8* src_argb, uint8* dst_bayer, ...@@ -1179,11 +1304,13 @@ void ARGBToBayerRow_NEON(const uint8* src_argb, uint8* dst_bayer,
asm volatile ( asm volatile (
"vmov.u32 d6[0], %3 \n" // selector "vmov.u32 d6[0], %3 \n" // selector
"1: \n" "1: \n"
MEMACCESS(0)
"vld1.8 {q0, q1}, [%0]! \n" // load row 8 pixels. "vld1.8 {q0, q1}, [%0]! \n" // load row 8 pixels.
"subs %2, %2, #8 \n" // 8 processed per loop "subs %2, %2, #8 \n" // 8 processed per loop
"vtbl.8 d4, {d0, d1}, d6 \n" // look up 4 pixels "vtbl.8 d4, {d0, d1}, d6 \n" // look up 4 pixels
"vtbl.8 d5, {d2, d3}, d6 \n" // look up 4 pixels "vtbl.8 d5, {d2, d3}, d6 \n" // look up 4 pixels
"vtrn.u32 d4, d5 \n" // combine 8 pixels "vtrn.u32 d4, d5 \n" // combine 8 pixels
MEMACCESS(1)
"vst1.8 {d4}, [%1]! \n" // store 8. "vst1.8 {d4}, [%1]! \n" // store 8.
"bgt 1b \n" "bgt 1b \n"
: "+r"(src_argb), // %0 : "+r"(src_argb), // %0
...@@ -1199,8 +1326,10 @@ void ARGBToBayerGGRow_NEON(const uint8* src_argb, uint8* dst_bayer, ...@@ -1199,8 +1326,10 @@ void ARGBToBayerGGRow_NEON(const uint8* src_argb, uint8* dst_bayer,
uint32 /*selector*/, int pix) { uint32 /*selector*/, int pix) {
asm volatile ( asm volatile (
"1: \n" "1: \n"
MEMACCESS(0)
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load row 8 pixels. "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load row 8 pixels.
"subs %2, %2, #8 \n" // 8 processed per loop "subs %2, %2, #8 \n" // 8 processed per loop
MEMACCESS(1)
"vst1.8 {d1}, [%1]! \n" // store 8 G's. "vst1.8 {d1}, [%1]! \n" // store 8 G's.
"bgt 1b \n" "bgt 1b \n"
: "+r"(src_argb), // %0 : "+r"(src_argb), // %0
...@@ -1215,12 +1344,15 @@ void ARGBToBayerGGRow_NEON(const uint8* src_argb, uint8* dst_bayer, ...@@ -1215,12 +1344,15 @@ void ARGBToBayerGGRow_NEON(const uint8* src_argb, uint8* dst_bayer,
void ARGBShuffleRow_NEON(const uint8* src_argb, uint8* dst_argb, void ARGBShuffleRow_NEON(const uint8* src_argb, uint8* dst_argb,
const uint8* shuffler, int pix) { const uint8* shuffler, int pix) {
asm volatile ( asm volatile (
MEMACCESS(3)
"vld1.8 {q2}, [%3] \n" // shuffler "vld1.8 {q2}, [%3] \n" // shuffler
"1: \n" "1: \n"
MEMACCESS(0)
"vld1.8 {q0}, [%0]! \n" // load 4 pixels. "vld1.8 {q0}, [%0]! \n" // load 4 pixels.
"subs %2, %2, #4 \n" // 4 processed per loop "subs %2, %2, #4 \n" // 4 processed per loop
"vtbl.8 d2, {d0, d1}, d4 \n" // look up 2 first pixels "vtbl.8 d2, {d0, d1}, d4 \n" // look up 2 first pixels
"vtbl.8 d3, {d0, d1}, d5 \n" // look up 2 next pixels "vtbl.8 d3, {d0, d1}, d5 \n" // look up 2 next pixels
MEMACCESS(1)
"vst1.8 {q1}, [%1]! \n" // store 4. "vst1.8 {q1}, [%1]! \n" // store 4.
"bgt 1b \n" "bgt 1b \n"
: "+r"(src_argb), // %0 : "+r"(src_argb), // %0
...@@ -1238,10 +1370,14 @@ void I422ToYUY2Row_NEON(const uint8* src_y, ...@@ -1238,10 +1370,14 @@ void I422ToYUY2Row_NEON(const uint8* src_y,
asm volatile ( asm volatile (
".p2align 2 \n" ".p2align 2 \n"
"1: \n" "1: \n"
MEMACCESS(0)
"vld2.8 {d0, d2}, [%0]! \n" // load 16 Ys "vld2.8 {d0, d2}, [%0]! \n" // load 16 Ys
MEMACCESS(1)
"vld1.8 {d1}, [%1]! \n" // load 8 Us "vld1.8 {d1}, [%1]! \n" // load 8 Us
MEMACCESS(2)
"vld1.8 {d3}, [%2]! \n" // load 8 Vs "vld1.8 {d3}, [%2]! \n" // load 8 Vs
"subs %4, %4, #16 \n" // 16 pixels "subs %4, %4, #16 \n" // 16 pixels
MEMACCESS(3)
"vst4.8 {d0, d1, d2, d3}, [%3]! \n" // Store 8 YUY2/16 pixels. "vst4.8 {d0, d1, d2, d3}, [%3]! \n" // Store 8 YUY2/16 pixels.
"bgt 1b \n" "bgt 1b \n"
: "+r"(src_y), // %0 : "+r"(src_y), // %0
...@@ -1261,10 +1397,14 @@ void I422ToUYVYRow_NEON(const uint8* src_y, ...@@ -1261,10 +1397,14 @@ void I422ToUYVYRow_NEON(const uint8* src_y,
asm volatile ( asm volatile (
".p2align 2 \n" ".p2align 2 \n"
"1: \n" "1: \n"
MEMACCESS(0)
"vld2.8 {d1, d3}, [%0]! \n" // load 16 Ys "vld2.8 {d1, d3}, [%0]! \n" // load 16 Ys
MEMACCESS(1)
"vld1.8 {d0}, [%1]! \n" // load 8 Us "vld1.8 {d0}, [%1]! \n" // load 8 Us
MEMACCESS(2)
"vld1.8 {d2}, [%2]! \n" // load 8 Vs "vld1.8 {d2}, [%2]! \n" // load 8 Vs
"subs %4, %4, #16 \n" // 16 pixels "subs %4, %4, #16 \n" // 16 pixels
MEMACCESS(3)
"vst4.8 {d0, d1, d2, d3}, [%3]! \n" // Store 8 UYVY/16 pixels. "vst4.8 {d0, d1, d2, d3}, [%3]! \n" // Store 8 UYVY/16 pixels.
"bgt 1b \n" "bgt 1b \n"
: "+r"(src_y), // %0 : "+r"(src_y), // %0
...@@ -1281,9 +1421,11 @@ void ARGBToRGB565Row_NEON(const uint8* src_argb, uint8* dst_rgb565, int pix) { ...@@ -1281,9 +1421,11 @@ void ARGBToRGB565Row_NEON(const uint8* src_argb, uint8* dst_rgb565, int pix) {
asm volatile ( asm volatile (
".p2align 2 \n" ".p2align 2 \n"
"1: \n" "1: \n"
MEMACCESS(0)
"vld4.8 {d20, d21, d22, d23}, [%0]! \n" // load 8 pixels of ARGB. "vld4.8 {d20, d21, d22, d23}, [%0]! \n" // load 8 pixels of ARGB.
"subs %2, %2, #8 \n" // 8 processed per loop. "subs %2, %2, #8 \n" // 8 processed per loop.
ARGBTORGB565 ARGBTORGB565
MEMACCESS(1)
"vst1.8 {q0}, [%1]! \n" // store 8 pixels RGB565. "vst1.8 {q0}, [%1]! \n" // store 8 pixels RGB565.
"bgt 1b \n" "bgt 1b \n"
: "+r"(src_argb), // %0 : "+r"(src_argb), // %0
...@@ -1299,9 +1441,11 @@ void ARGBToARGB1555Row_NEON(const uint8* src_argb, uint8* dst_argb1555, ...@@ -1299,9 +1441,11 @@ void ARGBToARGB1555Row_NEON(const uint8* src_argb, uint8* dst_argb1555,
asm volatile ( asm volatile (
".p2align 2 \n" ".p2align 2 \n"
"1: \n" "1: \n"
MEMACCESS(0)
"vld4.8 {d20, d21, d22, d23}, [%0]! \n" // load 8 pixels of ARGB. "vld4.8 {d20, d21, d22, d23}, [%0]! \n" // load 8 pixels of ARGB.
"subs %2, %2, #8 \n" // 8 processed per loop. "subs %2, %2, #8 \n" // 8 processed per loop.
ARGBTOARGB1555 ARGBTOARGB1555
MEMACCESS(1)
"vst1.8 {q0}, [%1]! \n" // store 8 pixels ARGB1555. "vst1.8 {q0}, [%1]! \n" // store 8 pixels ARGB1555.
"bgt 1b \n" "bgt 1b \n"
: "+r"(src_argb), // %0 : "+r"(src_argb), // %0
...@@ -1318,9 +1462,11 @@ void ARGBToARGB4444Row_NEON(const uint8* src_argb, uint8* dst_argb4444, ...@@ -1318,9 +1462,11 @@ void ARGBToARGB4444Row_NEON(const uint8* src_argb, uint8* dst_argb4444,
"vmov.u8 d4, #0x0f \n" // bits to clear with vbic. "vmov.u8 d4, #0x0f \n" // bits to clear with vbic.
".p2align 2 \n" ".p2align 2 \n"
"1: \n" "1: \n"
MEMACCESS(0)
"vld4.8 {d20, d21, d22, d23}, [%0]! \n" // load 8 pixels of ARGB. "vld4.8 {d20, d21, d22, d23}, [%0]! \n" // load 8 pixels of ARGB.
"subs %2, %2, #8 \n" // 8 processed per loop. "subs %2, %2, #8 \n" // 8 processed per loop.
ARGBTOARGB4444 ARGBTOARGB4444
MEMACCESS(1)
"vst1.8 {q0}, [%1]! \n" // store 8 pixels ARGB4444. "vst1.8 {q0}, [%1]! \n" // store 8 pixels ARGB4444.
"bgt 1b \n" "bgt 1b \n"
: "+r"(src_argb), // %0 : "+r"(src_argb), // %0
...@@ -1339,6 +1485,7 @@ void ARGBToYRow_NEON(const uint8* src_argb, uint8* dst_y, int pix) { ...@@ -1339,6 +1485,7 @@ void ARGBToYRow_NEON(const uint8* src_argb, uint8* dst_y, int pix) {
"vmov.u8 d27, #16 \n" // Add 16 constant "vmov.u8 d27, #16 \n" // Add 16 constant
".p2align 2 \n" ".p2align 2 \n"
"1: \n" "1: \n"
MEMACCESS(0)
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
"subs %2, %2, #8 \n" // 8 processed per loop. "subs %2, %2, #8 \n" // 8 processed per loop.
"vmull.u8 q2, d0, d24 \n" // B "vmull.u8 q2, d0, d24 \n" // B
...@@ -1346,6 +1493,7 @@ void ARGBToYRow_NEON(const uint8* src_argb, uint8* dst_y, int pix) { ...@@ -1346,6 +1493,7 @@ void ARGBToYRow_NEON(const uint8* src_argb, uint8* dst_y, int pix) {
"vmlal.u8 q2, d2, d26 \n" // R "vmlal.u8 q2, d2, d26 \n" // R
"vqrshrun.s16 d0, q2, #7 \n" // 16 bit to 8 bit Y "vqrshrun.s16 d0, q2, #7 \n" // 16 bit to 8 bit Y
"vqadd.u8 d0, d27 \n" "vqadd.u8 d0, d27 \n"
MEMACCESS(1)
"vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
"bgt 1b \n" "bgt 1b \n"
: "+r"(src_argb), // %0 : "+r"(src_argb), // %0
...@@ -1363,12 +1511,14 @@ void ARGBToYJRow_NEON(const uint8* src_argb, uint8* dst_y, int pix) { ...@@ -1363,12 +1511,14 @@ void ARGBToYJRow_NEON(const uint8* src_argb, uint8* dst_y, int pix) {
"vmov.u8 d26, #38 \n" // R * 0.29900 coefficient "vmov.u8 d26, #38 \n" // R * 0.29900 coefficient
".p2align 2 \n" ".p2align 2 \n"
"1: \n" "1: \n"
MEMACCESS(0)
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
"subs %2, %2, #8 \n" // 8 processed per loop. "subs %2, %2, #8 \n" // 8 processed per loop.
"vmull.u8 q2, d0, d24 \n" // B "vmull.u8 q2, d0, d24 \n" // B
"vmlal.u8 q2, d1, d25 \n" // G "vmlal.u8 q2, d1, d25 \n" // G
"vmlal.u8 q2, d2, d26 \n" // R "vmlal.u8 q2, d2, d26 \n" // R
"vqrshrun.s16 d0, q2, #7 \n" // 15 bit to 8 bit Y "vqrshrun.s16 d0, q2, #7 \n" // 15 bit to 8 bit Y
MEMACCESS(1)
"vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
"bgt 1b \n" "bgt 1b \n"
: "+r"(src_argb), // %0 : "+r"(src_argb), // %0
...@@ -1391,6 +1541,7 @@ void ARGBToUV444Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v, ...@@ -1391,6 +1541,7 @@ void ARGBToUV444Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
"vmov.u16 q15, #0x8080 \n" // 128.5 "vmov.u16 q15, #0x8080 \n" // 128.5
".p2align 2 \n" ".p2align 2 \n"
"1: \n" "1: \n"
MEMACCESS(0)
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
"subs %3, %3, #8 \n" // 8 processed per loop. "subs %3, %3, #8 \n" // 8 processed per loop.
"vmull.u8 q2, d0, d24 \n" // B "vmull.u8 q2, d0, d24 \n" // B
...@@ -1406,7 +1557,9 @@ void ARGBToUV444Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v, ...@@ -1406,7 +1557,9 @@ void ARGBToUV444Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
"vqshrn.u16 d0, q2, #8 \n" // 16 bit to 8 bit U "vqshrn.u16 d0, q2, #8 \n" // 16 bit to 8 bit U
"vqshrn.u16 d1, q3, #8 \n" // 16 bit to 8 bit V "vqshrn.u16 d1, q3, #8 \n" // 16 bit to 8 bit V
MEMACCESS(1)
"vst1.8 {d0}, [%1]! \n" // store 8 pixels U. "vst1.8 {d0}, [%1]! \n" // store 8 pixels U.
MEMACCESS(2)
"vst1.8 {d1}, [%2]! \n" // store 8 pixels V. "vst1.8 {d1}, [%2]! \n" // store 8 pixels V.
"bgt 1b \n" "bgt 1b \n"
: "+r"(src_argb), // %0 : "+r"(src_argb), // %0
...@@ -1430,7 +1583,9 @@ void ARGBToUV422Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v, ...@@ -1430,7 +1583,9 @@ void ARGBToUV422Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
"vmov.u16 q15, #0x8080 \n" // 128.5 "vmov.u16 q15, #0x8080 \n" // 128.5
".p2align 2 \n" ".p2align 2 \n"
"1: \n" "1: \n"
MEMACCESS(0)
"vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
MEMACCESS(0)
"vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels. "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels.
"vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts. "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts.
...@@ -1451,7 +1606,9 @@ void ARGBToUV422Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v, ...@@ -1451,7 +1606,9 @@ void ARGBToUV422Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
"vqshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit U "vqshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit U
"vqshrn.u16 d1, q9, #8 \n" // 16 bit to 8 bit V "vqshrn.u16 d1, q9, #8 \n" // 16 bit to 8 bit V
MEMACCESS(1)
"vst1.8 {d0}, [%1]! \n" // store 8 pixels U. "vst1.8 {d0}, [%1]! \n" // store 8 pixels U.
MEMACCESS(2)
"vst1.8 {d1}, [%2]! \n" // store 8 pixels V. "vst1.8 {d1}, [%2]! \n" // store 8 pixels V.
"bgt 1b \n" "bgt 1b \n"
: "+r"(src_argb), // %0 : "+r"(src_argb), // %0
...@@ -1476,12 +1633,16 @@ void ARGBToUV411Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v, ...@@ -1476,12 +1633,16 @@ void ARGBToUV411Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
"vmov.u16 q15, #0x8080 \n" // 128.5 "vmov.u16 q15, #0x8080 \n" // 128.5
".p2align 2 \n" ".p2align 2 \n"
"1: \n" "1: \n"
MEMACCESS(0)
"vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
MEMACCESS(0)
"vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels. "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels.
"vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts. "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts.
"vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts. "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
"vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts. "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts.
MEMACCESS(0)
"vld4.8 {d8, d10, d12, d14}, [%0]! \n" // load 8 more ARGB pixels. "vld4.8 {d8, d10, d12, d14}, [%0]! \n" // load 8 more ARGB pixels.
MEMACCESS(0)
"vld4.8 {d9, d11, d13, d15}, [%0]! \n" // load last 8 ARGB pixels. "vld4.8 {d9, d11, d13, d15}, [%0]! \n" // load last 8 ARGB pixels.
"vpaddl.u8 q4, q4 \n" // B 16 bytes -> 8 shorts. "vpaddl.u8 q4, q4 \n" // B 16 bytes -> 8 shorts.
"vpaddl.u8 q5, q5 \n" // G 16 bytes -> 8 shorts. "vpaddl.u8 q5, q5 \n" // G 16 bytes -> 8 shorts.
...@@ -1509,7 +1670,9 @@ void ARGBToUV411Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v, ...@@ -1509,7 +1670,9 @@ void ARGBToUV411Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
"vadd.u16 q9, q9, q15 \n" // +128 -> unsigned "vadd.u16 q9, q9, q15 \n" // +128 -> unsigned
"vqshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit U "vqshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit U
"vqshrn.u16 d1, q9, #8 \n" // 16 bit to 8 bit V "vqshrn.u16 d1, q9, #8 \n" // 16 bit to 8 bit V
MEMACCESS(1)
"vst1.8 {d0}, [%1]! \n" // store 8 pixels U. "vst1.8 {d0}, [%1]! \n" // store 8 pixels U.
MEMACCESS(2)
"vst1.8 {d1}, [%2]! \n" // store 8 pixels V. "vst1.8 {d1}, [%2]! \n" // store 8 pixels V.
"bgt 1b \n" "bgt 1b \n"
: "+r"(src_argb), // %0 : "+r"(src_argb), // %0
...@@ -1548,12 +1711,16 @@ void ARGBToUVRow_NEON(const uint8* src_argb, int src_stride_argb, ...@@ -1548,12 +1711,16 @@ void ARGBToUVRow_NEON(const uint8* src_argb, int src_stride_argb,
"vmov.u16 q15, #0x8080 \n" // 128.5 "vmov.u16 q15, #0x8080 \n" // 128.5
".p2align 2 \n" ".p2align 2 \n"
"1: \n" "1: \n"
MEMACCESS(0)
"vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
MEMACCESS(0)
"vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels. "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels.
"vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts. "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts.
"vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts. "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
"vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts. "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts.
MEMACCESS(1)
"vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more ARGB pixels. "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more ARGB pixels.
MEMACCESS(1)
"vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 ARGB pixels. "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 ARGB pixels.
"vpadal.u8 q0, q4 \n" // B 16 bytes -> 8 shorts. "vpadal.u8 q0, q4 \n" // B 16 bytes -> 8 shorts.
"vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts. "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts.
...@@ -1565,7 +1732,9 @@ void ARGBToUVRow_NEON(const uint8* src_argb, int src_stride_argb, ...@@ -1565,7 +1732,9 @@ void ARGBToUVRow_NEON(const uint8* src_argb, int src_stride_argb,
"subs %4, %4, #16 \n" // 32 processed per loop. "subs %4, %4, #16 \n" // 32 processed per loop.
RGBTOUV(q0, q1, q2) RGBTOUV(q0, q1, q2)
MEMACCESS(2)
"vst1.8 {d0}, [%2]! \n" // store 8 pixels U. "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
MEMACCESS(3)
"vst1.8 {d1}, [%3]! \n" // store 8 pixels V. "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
"bgt 1b \n" "bgt 1b \n"
: "+r"(src_argb), // %0 : "+r"(src_argb), // %0
...@@ -1592,12 +1761,16 @@ void ARGBToUVJRow_NEON(const uint8* src_argb, int src_stride_argb, ...@@ -1592,12 +1761,16 @@ void ARGBToUVJRow_NEON(const uint8* src_argb, int src_stride_argb,
"vmov.u16 q15, #0x8080 \n" // 128.5 "vmov.u16 q15, #0x8080 \n" // 128.5
".p2align 2 \n" ".p2align 2 \n"
"1: \n" "1: \n"
MEMACCESS(0)
"vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
MEMACCESS(0)
"vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels. "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels.
"vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts. "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts.
"vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts. "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
"vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts. "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts.
MEMACCESS(1)
"vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more ARGB pixels. "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more ARGB pixels.
MEMACCESS(1)
"vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 ARGB pixels. "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 ARGB pixels.
"vpadal.u8 q0, q4 \n" // B 16 bytes -> 8 shorts. "vpadal.u8 q0, q4 \n" // B 16 bytes -> 8 shorts.
"vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts. "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts.
...@@ -1609,7 +1782,9 @@ void ARGBToUVJRow_NEON(const uint8* src_argb, int src_stride_argb, ...@@ -1609,7 +1782,9 @@ void ARGBToUVJRow_NEON(const uint8* src_argb, int src_stride_argb,
"subs %4, %4, #16 \n" // 32 processed per loop. "subs %4, %4, #16 \n" // 32 processed per loop.
RGBTOUV(q0, q1, q2) RGBTOUV(q0, q1, q2)
MEMACCESS(2)
"vst1.8 {d0}, [%2]! \n" // store 8 pixels U. "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
MEMACCESS(3)
"vst1.8 {d1}, [%3]! \n" // store 8 pixels V. "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
"bgt 1b \n" "bgt 1b \n"
: "+r"(src_argb), // %0 : "+r"(src_argb), // %0
...@@ -1635,12 +1810,16 @@ void BGRAToUVRow_NEON(const uint8* src_bgra, int src_stride_bgra, ...@@ -1635,12 +1810,16 @@ void BGRAToUVRow_NEON(const uint8* src_bgra, int src_stride_bgra,
"vmov.u16 q15, #0x8080 \n" // 128.5 "vmov.u16 q15, #0x8080 \n" // 128.5
".p2align 2 \n" ".p2align 2 \n"
"1: \n" "1: \n"
MEMACCESS(0)
"vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 BGRA pixels. "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 BGRA pixels.
MEMACCESS(0)
"vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 BGRA pixels. "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 BGRA pixels.
"vpaddl.u8 q3, q3 \n" // B 16 bytes -> 8 shorts. "vpaddl.u8 q3, q3 \n" // B 16 bytes -> 8 shorts.
"vpaddl.u8 q2, q2 \n" // G 16 bytes -> 8 shorts. "vpaddl.u8 q2, q2 \n" // G 16 bytes -> 8 shorts.
"vpaddl.u8 q1, q1 \n" // R 16 bytes -> 8 shorts. "vpaddl.u8 q1, q1 \n" // R 16 bytes -> 8 shorts.
MEMACCESS(1)
"vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more BGRA pixels. "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more BGRA pixels.
MEMACCESS(1)
"vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 BGRA pixels. "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 BGRA pixels.
"vpadal.u8 q3, q7 \n" // B 16 bytes -> 8 shorts. "vpadal.u8 q3, q7 \n" // B 16 bytes -> 8 shorts.
"vpadal.u8 q2, q6 \n" // G 16 bytes -> 8 shorts. "vpadal.u8 q2, q6 \n" // G 16 bytes -> 8 shorts.
...@@ -1652,7 +1831,9 @@ void BGRAToUVRow_NEON(const uint8* src_bgra, int src_stride_bgra, ...@@ -1652,7 +1831,9 @@ void BGRAToUVRow_NEON(const uint8* src_bgra, int src_stride_bgra,
"subs %4, %4, #16 \n" // 32 processed per loop. "subs %4, %4, #16 \n" // 32 processed per loop.
RGBTOUV(q3, q2, q1) RGBTOUV(q3, q2, q1)
MEMACCESS(2)
"vst1.8 {d0}, [%2]! \n" // store 8 pixels U. "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
MEMACCESS(3)
"vst1.8 {d1}, [%3]! \n" // store 8 pixels V. "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
"bgt 1b \n" "bgt 1b \n"
: "+r"(src_bgra), // %0 : "+r"(src_bgra), // %0
...@@ -1678,12 +1859,16 @@ void ABGRToUVRow_NEON(const uint8* src_abgr, int src_stride_abgr, ...@@ -1678,12 +1859,16 @@ void ABGRToUVRow_NEON(const uint8* src_abgr, int src_stride_abgr,
"vmov.u16 q15, #0x8080 \n" // 128.5 "vmov.u16 q15, #0x8080 \n" // 128.5
".p2align 2 \n" ".p2align 2 \n"
"1: \n" "1: \n"
MEMACCESS(0)
"vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ABGR pixels. "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ABGR pixels.
MEMACCESS(0)
"vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ABGR pixels. "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ABGR pixels.
"vpaddl.u8 q2, q2 \n" // B 16 bytes -> 8 shorts. "vpaddl.u8 q2, q2 \n" // B 16 bytes -> 8 shorts.
"vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts. "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
"vpaddl.u8 q0, q0 \n" // R 16 bytes -> 8 shorts. "vpaddl.u8 q0, q0 \n" // R 16 bytes -> 8 shorts.
MEMACCESS(1)
"vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more ABGR pixels. "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more ABGR pixels.
MEMACCESS(1)
"vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 ABGR pixels. "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 ABGR pixels.
"vpadal.u8 q2, q6 \n" // B 16 bytes -> 8 shorts. "vpadal.u8 q2, q6 \n" // B 16 bytes -> 8 shorts.
"vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts. "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts.
...@@ -1695,7 +1880,9 @@ void ABGRToUVRow_NEON(const uint8* src_abgr, int src_stride_abgr, ...@@ -1695,7 +1880,9 @@ void ABGRToUVRow_NEON(const uint8* src_abgr, int src_stride_abgr,
"subs %4, %4, #16 \n" // 32 processed per loop. "subs %4, %4, #16 \n" // 32 processed per loop.
RGBTOUV(q2, q1, q0) RGBTOUV(q2, q1, q0)
MEMACCESS(2)
"vst1.8 {d0}, [%2]! \n" // store 8 pixels U. "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
MEMACCESS(3)
"vst1.8 {d1}, [%3]! \n" // store 8 pixels V. "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
"bgt 1b \n" "bgt 1b \n"
: "+r"(src_abgr), // %0 : "+r"(src_abgr), // %0
...@@ -1721,12 +1908,16 @@ void RGBAToUVRow_NEON(const uint8* src_rgba, int src_stride_rgba, ...@@ -1721,12 +1908,16 @@ void RGBAToUVRow_NEON(const uint8* src_rgba, int src_stride_rgba,
"vmov.u16 q15, #0x8080 \n" // 128.5 "vmov.u16 q15, #0x8080 \n" // 128.5
".p2align 2 \n" ".p2align 2 \n"
"1: \n" "1: \n"
MEMACCESS(0)
"vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 RGBA pixels. "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 RGBA pixels.
MEMACCESS(0)
"vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 RGBA pixels. "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 RGBA pixels.
"vpaddl.u8 q0, q1 \n" // B 16 bytes -> 8 shorts. "vpaddl.u8 q0, q1 \n" // B 16 bytes -> 8 shorts.
"vpaddl.u8 q1, q2 \n" // G 16 bytes -> 8 shorts. "vpaddl.u8 q1, q2 \n" // G 16 bytes -> 8 shorts.
"vpaddl.u8 q2, q3 \n" // R 16 bytes -> 8 shorts. "vpaddl.u8 q2, q3 \n" // R 16 bytes -> 8 shorts.
MEMACCESS(1)
"vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more RGBA pixels. "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more RGBA pixels.
MEMACCESS(1)
"vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 RGBA pixels. "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 RGBA pixels.
"vpadal.u8 q0, q5 \n" // B 16 bytes -> 8 shorts. "vpadal.u8 q0, q5 \n" // B 16 bytes -> 8 shorts.
"vpadal.u8 q1, q6 \n" // G 16 bytes -> 8 shorts. "vpadal.u8 q1, q6 \n" // G 16 bytes -> 8 shorts.
...@@ -1738,7 +1929,9 @@ void RGBAToUVRow_NEON(const uint8* src_rgba, int src_stride_rgba, ...@@ -1738,7 +1929,9 @@ void RGBAToUVRow_NEON(const uint8* src_rgba, int src_stride_rgba,
"subs %4, %4, #16 \n" // 32 processed per loop. "subs %4, %4, #16 \n" // 32 processed per loop.
RGBTOUV(q0, q1, q2) RGBTOUV(q0, q1, q2)
MEMACCESS(2)
"vst1.8 {d0}, [%2]! \n" // store 8 pixels U. "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
MEMACCESS(3)
"vst1.8 {d1}, [%3]! \n" // store 8 pixels V. "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
"bgt 1b \n" "bgt 1b \n"
: "+r"(src_rgba), // %0 : "+r"(src_rgba), // %0
...@@ -1764,12 +1957,16 @@ void RGB24ToUVRow_NEON(const uint8* src_rgb24, int src_stride_rgb24, ...@@ -1764,12 +1957,16 @@ void RGB24ToUVRow_NEON(const uint8* src_rgb24, int src_stride_rgb24,
"vmov.u16 q15, #0x8080 \n" // 128.5 "vmov.u16 q15, #0x8080 \n" // 128.5
".p2align 2 \n" ".p2align 2 \n"
"1: \n" "1: \n"
MEMACCESS(0)
"vld3.8 {d0, d2, d4}, [%0]! \n" // load 8 RGB24 pixels. "vld3.8 {d0, d2, d4}, [%0]! \n" // load 8 RGB24 pixels.
MEMACCESS(0)
"vld3.8 {d1, d3, d5}, [%0]! \n" // load next 8 RGB24 pixels. "vld3.8 {d1, d3, d5}, [%0]! \n" // load next 8 RGB24 pixels.
"vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts. "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts.
"vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts. "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
"vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts. "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts.
MEMACCESS(1)
"vld3.8 {d8, d10, d12}, [%1]! \n" // load 8 more RGB24 pixels. "vld3.8 {d8, d10, d12}, [%1]! \n" // load 8 more RGB24 pixels.
MEMACCESS(1)
"vld3.8 {d9, d11, d13}, [%1]! \n" // load last 8 RGB24 pixels. "vld3.8 {d9, d11, d13}, [%1]! \n" // load last 8 RGB24 pixels.
"vpadal.u8 q0, q4 \n" // B 16 bytes -> 8 shorts. "vpadal.u8 q0, q4 \n" // B 16 bytes -> 8 shorts.
"vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts. "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts.
...@@ -1781,7 +1978,9 @@ void RGB24ToUVRow_NEON(const uint8* src_rgb24, int src_stride_rgb24, ...@@ -1781,7 +1978,9 @@ void RGB24ToUVRow_NEON(const uint8* src_rgb24, int src_stride_rgb24,
"subs %4, %4, #16 \n" // 32 processed per loop. "subs %4, %4, #16 \n" // 32 processed per loop.
RGBTOUV(q0, q1, q2) RGBTOUV(q0, q1, q2)
MEMACCESS(2)
"vst1.8 {d0}, [%2]! \n" // store 8 pixels U. "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
MEMACCESS(3)
"vst1.8 {d1}, [%3]! \n" // store 8 pixels V. "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
"bgt 1b \n" "bgt 1b \n"
: "+r"(src_rgb24), // %0 : "+r"(src_rgb24), // %0
...@@ -1807,12 +2006,16 @@ void RAWToUVRow_NEON(const uint8* src_raw, int src_stride_raw, ...@@ -1807,12 +2006,16 @@ void RAWToUVRow_NEON(const uint8* src_raw, int src_stride_raw,
"vmov.u16 q15, #0x8080 \n" // 128.5 "vmov.u16 q15, #0x8080 \n" // 128.5
".p2align 2 \n" ".p2align 2 \n"
"1: \n" "1: \n"
MEMACCESS(0)
"vld3.8 {d0, d2, d4}, [%0]! \n" // load 8 RAW pixels. "vld3.8 {d0, d2, d4}, [%0]! \n" // load 8 RAW pixels.
MEMACCESS(0)
"vld3.8 {d1, d3, d5}, [%0]! \n" // load next 8 RAW pixels. "vld3.8 {d1, d3, d5}, [%0]! \n" // load next 8 RAW pixels.
"vpaddl.u8 q2, q2 \n" // B 16 bytes -> 8 shorts. "vpaddl.u8 q2, q2 \n" // B 16 bytes -> 8 shorts.
"vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts. "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
"vpaddl.u8 q0, q0 \n" // R 16 bytes -> 8 shorts. "vpaddl.u8 q0, q0 \n" // R 16 bytes -> 8 shorts.
MEMACCESS(1)
"vld3.8 {d8, d10, d12}, [%1]! \n" // load 8 more RAW pixels. "vld3.8 {d8, d10, d12}, [%1]! \n" // load 8 more RAW pixels.
MEMACCESS(1)
"vld3.8 {d9, d11, d13}, [%1]! \n" // load last 8 RAW pixels. "vld3.8 {d9, d11, d13}, [%1]! \n" // load last 8 RAW pixels.
"vpadal.u8 q2, q6 \n" // B 16 bytes -> 8 shorts. "vpadal.u8 q2, q6 \n" // B 16 bytes -> 8 shorts.
"vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts. "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts.
...@@ -1824,7 +2027,9 @@ void RAWToUVRow_NEON(const uint8* src_raw, int src_stride_raw, ...@@ -1824,7 +2027,9 @@ void RAWToUVRow_NEON(const uint8* src_raw, int src_stride_raw,
"subs %4, %4, #16 \n" // 32 processed per loop. "subs %4, %4, #16 \n" // 32 processed per loop.
RGBTOUV(q2, q1, q0) RGBTOUV(q2, q1, q0)
MEMACCESS(2)
"vst1.8 {d0}, [%2]! \n" // store 8 pixels U. "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
MEMACCESS(3)
"vst1.8 {d1}, [%3]! \n" // store 8 pixels V. "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
"bgt 1b \n" "bgt 1b \n"
: "+r"(src_raw), // %0 : "+r"(src_raw), // %0
...@@ -1851,22 +2056,26 @@ void RGB565ToUVRow_NEON(const uint8* src_rgb565, int src_stride_rgb565, ...@@ -1851,22 +2056,26 @@ void RGB565ToUVRow_NEON(const uint8* src_rgb565, int src_stride_rgb565,
"vmov.u16 q15, #0x8080 \n" // 128.5 "vmov.u16 q15, #0x8080 \n" // 128.5
".p2align 2 \n" ".p2align 2 \n"
"1: \n" "1: \n"
MEMACCESS(0)
"vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels. "vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels.
RGB565TOARGB RGB565TOARGB
"vpaddl.u8 d8, d0 \n" // B 8 bytes -> 4 shorts. "vpaddl.u8 d8, d0 \n" // B 8 bytes -> 4 shorts.
"vpaddl.u8 d10, d1 \n" // G 8 bytes -> 4 shorts. "vpaddl.u8 d10, d1 \n" // G 8 bytes -> 4 shorts.
"vpaddl.u8 d12, d2 \n" // R 8 bytes -> 4 shorts. "vpaddl.u8 d12, d2 \n" // R 8 bytes -> 4 shorts.
MEMACCESS(0)
"vld1.8 {q0}, [%0]! \n" // next 8 RGB565 pixels. "vld1.8 {q0}, [%0]! \n" // next 8 RGB565 pixels.
RGB565TOARGB RGB565TOARGB
"vpaddl.u8 d9, d0 \n" // B 8 bytes -> 4 shorts. "vpaddl.u8 d9, d0 \n" // B 8 bytes -> 4 shorts.
"vpaddl.u8 d11, d1 \n" // G 8 bytes -> 4 shorts. "vpaddl.u8 d11, d1 \n" // G 8 bytes -> 4 shorts.
"vpaddl.u8 d13, d2 \n" // R 8 bytes -> 4 shorts. "vpaddl.u8 d13, d2 \n" // R 8 bytes -> 4 shorts.
MEMACCESS(1)
"vld1.8 {q0}, [%1]! \n" // load 8 RGB565 pixels. "vld1.8 {q0}, [%1]! \n" // load 8 RGB565 pixels.
RGB565TOARGB RGB565TOARGB
"vpadal.u8 d8, d0 \n" // B 8 bytes -> 4 shorts. "vpadal.u8 d8, d0 \n" // B 8 bytes -> 4 shorts.
"vpadal.u8 d10, d1 \n" // G 8 bytes -> 4 shorts. "vpadal.u8 d10, d1 \n" // G 8 bytes -> 4 shorts.
"vpadal.u8 d12, d2 \n" // R 8 bytes -> 4 shorts. "vpadal.u8 d12, d2 \n" // R 8 bytes -> 4 shorts.
MEMACCESS(1)
"vld1.8 {q0}, [%1]! \n" // next 8 RGB565 pixels. "vld1.8 {q0}, [%1]! \n" // next 8 RGB565 pixels.
RGB565TOARGB RGB565TOARGB
"vpadal.u8 d9, d0 \n" // B 8 bytes -> 4 shorts. "vpadal.u8 d9, d0 \n" // B 8 bytes -> 4 shorts.
...@@ -1888,7 +2097,9 @@ void RGB565ToUVRow_NEON(const uint8* src_rgb565, int src_stride_rgb565, ...@@ -1888,7 +2097,9 @@ void RGB565ToUVRow_NEON(const uint8* src_rgb565, int src_stride_rgb565,
"vadd.u16 q9, q9, q15 \n" // +128 -> unsigned "vadd.u16 q9, q9, q15 \n" // +128 -> unsigned
"vqshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit U "vqshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit U
"vqshrn.u16 d1, q9, #8 \n" // 16 bit to 8 bit V "vqshrn.u16 d1, q9, #8 \n" // 16 bit to 8 bit V
MEMACCESS(2)
"vst1.8 {d0}, [%2]! \n" // store 8 pixels U. "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
MEMACCESS(3)
"vst1.8 {d1}, [%3]! \n" // store 8 pixels V. "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
"bgt 1b \n" "bgt 1b \n"
: "+r"(src_rgb565), // %0 : "+r"(src_rgb565), // %0
...@@ -1915,22 +2126,26 @@ void ARGB1555ToUVRow_NEON(const uint8* src_argb1555, int src_stride_argb1555, ...@@ -1915,22 +2126,26 @@ void ARGB1555ToUVRow_NEON(const uint8* src_argb1555, int src_stride_argb1555,
"vmov.u16 q15, #0x8080 \n" // 128.5 "vmov.u16 q15, #0x8080 \n" // 128.5
".p2align 2 \n" ".p2align 2 \n"
"1: \n" "1: \n"
MEMACCESS(0)
"vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels. "vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels.
RGB555TOARGB RGB555TOARGB
"vpaddl.u8 d8, d0 \n" // B 8 bytes -> 4 shorts. "vpaddl.u8 d8, d0 \n" // B 8 bytes -> 4 shorts.
"vpaddl.u8 d10, d1 \n" // G 8 bytes -> 4 shorts. "vpaddl.u8 d10, d1 \n" // G 8 bytes -> 4 shorts.
"vpaddl.u8 d12, d2 \n" // R 8 bytes -> 4 shorts. "vpaddl.u8 d12, d2 \n" // R 8 bytes -> 4 shorts.
MEMACCESS(0)
"vld1.8 {q0}, [%0]! \n" // next 8 ARGB1555 pixels. "vld1.8 {q0}, [%0]! \n" // next 8 ARGB1555 pixels.
RGB555TOARGB RGB555TOARGB
"vpaddl.u8 d9, d0 \n" // B 8 bytes -> 4 shorts. "vpaddl.u8 d9, d0 \n" // B 8 bytes -> 4 shorts.
"vpaddl.u8 d11, d1 \n" // G 8 bytes -> 4 shorts. "vpaddl.u8 d11, d1 \n" // G 8 bytes -> 4 shorts.
"vpaddl.u8 d13, d2 \n" // R 8 bytes -> 4 shorts. "vpaddl.u8 d13, d2 \n" // R 8 bytes -> 4 shorts.
MEMACCESS(1)
"vld1.8 {q0}, [%1]! \n" // load 8 ARGB1555 pixels. "vld1.8 {q0}, [%1]! \n" // load 8 ARGB1555 pixels.
RGB555TOARGB RGB555TOARGB
"vpadal.u8 d8, d0 \n" // B 8 bytes -> 4 shorts. "vpadal.u8 d8, d0 \n" // B 8 bytes -> 4 shorts.
"vpadal.u8 d10, d1 \n" // G 8 bytes -> 4 shorts. "vpadal.u8 d10, d1 \n" // G 8 bytes -> 4 shorts.
"vpadal.u8 d12, d2 \n" // R 8 bytes -> 4 shorts. "vpadal.u8 d12, d2 \n" // R 8 bytes -> 4 shorts.
MEMACCESS(1)
"vld1.8 {q0}, [%1]! \n" // next 8 ARGB1555 pixels. "vld1.8 {q0}, [%1]! \n" // next 8 ARGB1555 pixels.
RGB555TOARGB RGB555TOARGB
"vpadal.u8 d9, d0 \n" // B 8 bytes -> 4 shorts. "vpadal.u8 d9, d0 \n" // B 8 bytes -> 4 shorts.
...@@ -1952,7 +2167,9 @@ void ARGB1555ToUVRow_NEON(const uint8* src_argb1555, int src_stride_argb1555, ...@@ -1952,7 +2167,9 @@ void ARGB1555ToUVRow_NEON(const uint8* src_argb1555, int src_stride_argb1555,
"vadd.u16 q9, q9, q15 \n" // +128 -> unsigned "vadd.u16 q9, q9, q15 \n" // +128 -> unsigned
"vqshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit U "vqshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit U
"vqshrn.u16 d1, q9, #8 \n" // 16 bit to 8 bit V "vqshrn.u16 d1, q9, #8 \n" // 16 bit to 8 bit V
MEMACCESS(2)
"vst1.8 {d0}, [%2]! \n" // store 8 pixels U. "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
MEMACCESS(3)
"vst1.8 {d1}, [%3]! \n" // store 8 pixels V. "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
"bgt 1b \n" "bgt 1b \n"
: "+r"(src_argb1555), // %0 : "+r"(src_argb1555), // %0
...@@ -1979,22 +2196,26 @@ void ARGB4444ToUVRow_NEON(const uint8* src_argb4444, int src_stride_argb4444, ...@@ -1979,22 +2196,26 @@ void ARGB4444ToUVRow_NEON(const uint8* src_argb4444, int src_stride_argb4444,
"vmov.u16 q15, #0x8080 \n" // 128.5 "vmov.u16 q15, #0x8080 \n" // 128.5
".p2align 2 \n" ".p2align 2 \n"
"1: \n" "1: \n"
MEMACCESS(0)
"vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels. "vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels.
ARGB4444TOARGB ARGB4444TOARGB
"vpaddl.u8 d8, d0 \n" // B 8 bytes -> 4 shorts. "vpaddl.u8 d8, d0 \n" // B 8 bytes -> 4 shorts.
"vpaddl.u8 d10, d1 \n" // G 8 bytes -> 4 shorts. "vpaddl.u8 d10, d1 \n" // G 8 bytes -> 4 shorts.
"vpaddl.u8 d12, d2 \n" // R 8 bytes -> 4 shorts. "vpaddl.u8 d12, d2 \n" // R 8 bytes -> 4 shorts.
MEMACCESS(0)
"vld1.8 {q0}, [%0]! \n" // next 8 ARGB4444 pixels. "vld1.8 {q0}, [%0]! \n" // next 8 ARGB4444 pixels.
ARGB4444TOARGB ARGB4444TOARGB
"vpaddl.u8 d9, d0 \n" // B 8 bytes -> 4 shorts. "vpaddl.u8 d9, d0 \n" // B 8 bytes -> 4 shorts.
"vpaddl.u8 d11, d1 \n" // G 8 bytes -> 4 shorts. "vpaddl.u8 d11, d1 \n" // G 8 bytes -> 4 shorts.
"vpaddl.u8 d13, d2 \n" // R 8 bytes -> 4 shorts. "vpaddl.u8 d13, d2 \n" // R 8 bytes -> 4 shorts.
MEMACCESS(1)
"vld1.8 {q0}, [%1]! \n" // load 8 ARGB4444 pixels. "vld1.8 {q0}, [%1]! \n" // load 8 ARGB4444 pixels.
ARGB4444TOARGB ARGB4444TOARGB
"vpadal.u8 d8, d0 \n" // B 8 bytes -> 4 shorts. "vpadal.u8 d8, d0 \n" // B 8 bytes -> 4 shorts.
"vpadal.u8 d10, d1 \n" // G 8 bytes -> 4 shorts. "vpadal.u8 d10, d1 \n" // G 8 bytes -> 4 shorts.
"vpadal.u8 d12, d2 \n" // R 8 bytes -> 4 shorts. "vpadal.u8 d12, d2 \n" // R 8 bytes -> 4 shorts.
MEMACCESS(1)
"vld1.8 {q0}, [%1]! \n" // next 8 ARGB4444 pixels. "vld1.8 {q0}, [%1]! \n" // next 8 ARGB4444 pixels.
ARGB4444TOARGB ARGB4444TOARGB
"vpadal.u8 d9, d0 \n" // B 8 bytes -> 4 shorts. "vpadal.u8 d9, d0 \n" // B 8 bytes -> 4 shorts.
...@@ -2016,7 +2237,9 @@ void ARGB4444ToUVRow_NEON(const uint8* src_argb4444, int src_stride_argb4444, ...@@ -2016,7 +2237,9 @@ void ARGB4444ToUVRow_NEON(const uint8* src_argb4444, int src_stride_argb4444,
"vadd.u16 q9, q9, q15 \n" // +128 -> unsigned "vadd.u16 q9, q9, q15 \n" // +128 -> unsigned
"vqshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit U "vqshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit U
"vqshrn.u16 d1, q9, #8 \n" // 16 bit to 8 bit V "vqshrn.u16 d1, q9, #8 \n" // 16 bit to 8 bit V
MEMACCESS(2)
"vst1.8 {d0}, [%2]! \n" // store 8 pixels U. "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
MEMACCESS(3)
"vst1.8 {d1}, [%3]! \n" // store 8 pixels V. "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
"bgt 1b \n" "bgt 1b \n"
: "+r"(src_argb4444), // %0 : "+r"(src_argb4444), // %0
...@@ -2038,6 +2261,7 @@ void RGB565ToYRow_NEON(const uint8* src_rgb565, uint8* dst_y, int pix) { ...@@ -2038,6 +2261,7 @@ void RGB565ToYRow_NEON(const uint8* src_rgb565, uint8* dst_y, int pix) {
"vmov.u8 d27, #16 \n" // Add 16 constant "vmov.u8 d27, #16 \n" // Add 16 constant
".p2align 2 \n" ".p2align 2 \n"
"1: \n" "1: \n"
MEMACCESS(0)
"vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels. "vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels.
"subs %2, %2, #8 \n" // 8 processed per loop. "subs %2, %2, #8 \n" // 8 processed per loop.
RGB565TOARGB RGB565TOARGB
...@@ -2046,6 +2270,7 @@ void RGB565ToYRow_NEON(const uint8* src_rgb565, uint8* dst_y, int pix) { ...@@ -2046,6 +2270,7 @@ void RGB565ToYRow_NEON(const uint8* src_rgb565, uint8* dst_y, int pix) {
"vmlal.u8 q2, d2, d26 \n" // R "vmlal.u8 q2, d2, d26 \n" // R
"vqrshrun.s16 d0, q2, #7 \n" // 16 bit to 8 bit Y "vqrshrun.s16 d0, q2, #7 \n" // 16 bit to 8 bit Y
"vqadd.u8 d0, d27 \n" "vqadd.u8 d0, d27 \n"
MEMACCESS(1)
"vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
"bgt 1b \n" "bgt 1b \n"
: "+r"(src_rgb565), // %0 : "+r"(src_rgb565), // %0
...@@ -2064,6 +2289,7 @@ void ARGB1555ToYRow_NEON(const uint8* src_argb1555, uint8* dst_y, int pix) { ...@@ -2064,6 +2289,7 @@ void ARGB1555ToYRow_NEON(const uint8* src_argb1555, uint8* dst_y, int pix) {
"vmov.u8 d27, #16 \n" // Add 16 constant "vmov.u8 d27, #16 \n" // Add 16 constant
".p2align 2 \n" ".p2align 2 \n"
"1: \n" "1: \n"
MEMACCESS(0)
"vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels. "vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels.
"subs %2, %2, #8 \n" // 8 processed per loop. "subs %2, %2, #8 \n" // 8 processed per loop.
ARGB1555TOARGB ARGB1555TOARGB
...@@ -2072,6 +2298,7 @@ void ARGB1555ToYRow_NEON(const uint8* src_argb1555, uint8* dst_y, int pix) { ...@@ -2072,6 +2298,7 @@ void ARGB1555ToYRow_NEON(const uint8* src_argb1555, uint8* dst_y, int pix) {
"vmlal.u8 q2, d2, d26 \n" // R "vmlal.u8 q2, d2, d26 \n" // R
"vqrshrun.s16 d0, q2, #7 \n" // 16 bit to 8 bit Y "vqrshrun.s16 d0, q2, #7 \n" // 16 bit to 8 bit Y
"vqadd.u8 d0, d27 \n" "vqadd.u8 d0, d27 \n"
MEMACCESS(1)
"vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
"bgt 1b \n" "bgt 1b \n"
: "+r"(src_argb1555), // %0 : "+r"(src_argb1555), // %0
...@@ -2090,6 +2317,7 @@ void ARGB4444ToYRow_NEON(const uint8* src_argb4444, uint8* dst_y, int pix) { ...@@ -2090,6 +2317,7 @@ void ARGB4444ToYRow_NEON(const uint8* src_argb4444, uint8* dst_y, int pix) {
"vmov.u8 d27, #16 \n" // Add 16 constant "vmov.u8 d27, #16 \n" // Add 16 constant
".p2align 2 \n" ".p2align 2 \n"
"1: \n" "1: \n"
MEMACCESS(0)
"vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels. "vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels.
"subs %2, %2, #8 \n" // 8 processed per loop. "subs %2, %2, #8 \n" // 8 processed per loop.
ARGB4444TOARGB ARGB4444TOARGB
...@@ -2098,6 +2326,7 @@ void ARGB4444ToYRow_NEON(const uint8* src_argb4444, uint8* dst_y, int pix) { ...@@ -2098,6 +2326,7 @@ void ARGB4444ToYRow_NEON(const uint8* src_argb4444, uint8* dst_y, int pix) {
"vmlal.u8 q2, d2, d26 \n" // R "vmlal.u8 q2, d2, d26 \n" // R
"vqrshrun.s16 d0, q2, #7 \n" // 16 bit to 8 bit Y "vqrshrun.s16 d0, q2, #7 \n" // 16 bit to 8 bit Y
"vqadd.u8 d0, d27 \n" "vqadd.u8 d0, d27 \n"
MEMACCESS(1)
"vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
"bgt 1b \n" "bgt 1b \n"
: "+r"(src_argb4444), // %0 : "+r"(src_argb4444), // %0
...@@ -2116,6 +2345,7 @@ void BGRAToYRow_NEON(const uint8* src_bgra, uint8* dst_y, int pix) { ...@@ -2116,6 +2345,7 @@ void BGRAToYRow_NEON(const uint8* src_bgra, uint8* dst_y, int pix) {
"vmov.u8 d7, #16 \n" // Add 16 constant "vmov.u8 d7, #16 \n" // Add 16 constant
".p2align 2 \n" ".p2align 2 \n"
"1: \n" "1: \n"
MEMACCESS(0)
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of BGRA. "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of BGRA.
"subs %2, %2, #8 \n" // 8 processed per loop. "subs %2, %2, #8 \n" // 8 processed per loop.
"vmull.u8 q8, d1, d4 \n" // R "vmull.u8 q8, d1, d4 \n" // R
...@@ -2123,6 +2353,7 @@ void BGRAToYRow_NEON(const uint8* src_bgra, uint8* dst_y, int pix) { ...@@ -2123,6 +2353,7 @@ void BGRAToYRow_NEON(const uint8* src_bgra, uint8* dst_y, int pix) {
"vmlal.u8 q8, d3, d6 \n" // B "vmlal.u8 q8, d3, d6 \n" // B
"vqrshrun.s16 d0, q8, #7 \n" // 16 bit to 8 bit Y "vqrshrun.s16 d0, q8, #7 \n" // 16 bit to 8 bit Y
"vqadd.u8 d0, d7 \n" "vqadd.u8 d0, d7 \n"
MEMACCESS(1)
"vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
"bgt 1b \n" "bgt 1b \n"
: "+r"(src_bgra), // %0 : "+r"(src_bgra), // %0
...@@ -2141,6 +2372,7 @@ void ABGRToYRow_NEON(const uint8* src_abgr, uint8* dst_y, int pix) { ...@@ -2141,6 +2372,7 @@ void ABGRToYRow_NEON(const uint8* src_abgr, uint8* dst_y, int pix) {
"vmov.u8 d7, #16 \n" // Add 16 constant "vmov.u8 d7, #16 \n" // Add 16 constant
".p2align 2 \n" ".p2align 2 \n"
"1: \n" "1: \n"
MEMACCESS(0)
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of ABGR. "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of ABGR.
"subs %2, %2, #8 \n" // 8 processed per loop. "subs %2, %2, #8 \n" // 8 processed per loop.
"vmull.u8 q8, d0, d4 \n" // R "vmull.u8 q8, d0, d4 \n" // R
...@@ -2148,6 +2380,7 @@ void ABGRToYRow_NEON(const uint8* src_abgr, uint8* dst_y, int pix) { ...@@ -2148,6 +2380,7 @@ void ABGRToYRow_NEON(const uint8* src_abgr, uint8* dst_y, int pix) {
"vmlal.u8 q8, d2, d6 \n" // B "vmlal.u8 q8, d2, d6 \n" // B
"vqrshrun.s16 d0, q8, #7 \n" // 16 bit to 8 bit Y "vqrshrun.s16 d0, q8, #7 \n" // 16 bit to 8 bit Y
"vqadd.u8 d0, d7 \n" "vqadd.u8 d0, d7 \n"
MEMACCESS(1)
"vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
"bgt 1b \n" "bgt 1b \n"
: "+r"(src_abgr), // %0 : "+r"(src_abgr), // %0
...@@ -2166,6 +2399,7 @@ void RGBAToYRow_NEON(const uint8* src_rgba, uint8* dst_y, int pix) { ...@@ -2166,6 +2399,7 @@ void RGBAToYRow_NEON(const uint8* src_rgba, uint8* dst_y, int pix) {
"vmov.u8 d7, #16 \n" // Add 16 constant "vmov.u8 d7, #16 \n" // Add 16 constant
".p2align 2 \n" ".p2align 2 \n"
"1: \n" "1: \n"
MEMACCESS(0)
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of RGBA. "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of RGBA.
"subs %2, %2, #8 \n" // 8 processed per loop. "subs %2, %2, #8 \n" // 8 processed per loop.
"vmull.u8 q8, d1, d4 \n" // B "vmull.u8 q8, d1, d4 \n" // B
...@@ -2173,6 +2407,7 @@ void RGBAToYRow_NEON(const uint8* src_rgba, uint8* dst_y, int pix) { ...@@ -2173,6 +2407,7 @@ void RGBAToYRow_NEON(const uint8* src_rgba, uint8* dst_y, int pix) {
"vmlal.u8 q8, d3, d6 \n" // R "vmlal.u8 q8, d3, d6 \n" // R
"vqrshrun.s16 d0, q8, #7 \n" // 16 bit to 8 bit Y "vqrshrun.s16 d0, q8, #7 \n" // 16 bit to 8 bit Y
"vqadd.u8 d0, d7 \n" "vqadd.u8 d0, d7 \n"
MEMACCESS(1)
"vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
"bgt 1b \n" "bgt 1b \n"
: "+r"(src_rgba), // %0 : "+r"(src_rgba), // %0
...@@ -2191,6 +2426,7 @@ void RGB24ToYRow_NEON(const uint8* src_rgb24, uint8* dst_y, int pix) { ...@@ -2191,6 +2426,7 @@ void RGB24ToYRow_NEON(const uint8* src_rgb24, uint8* dst_y, int pix) {
"vmov.u8 d7, #16 \n" // Add 16 constant "vmov.u8 d7, #16 \n" // Add 16 constant
".p2align 2 \n" ".p2align 2 \n"
"1: \n" "1: \n"
MEMACCESS(0)
"vld3.8 {d0, d1, d2}, [%0]! \n" // load 8 pixels of RGB24. "vld3.8 {d0, d1, d2}, [%0]! \n" // load 8 pixels of RGB24.
"subs %2, %2, #8 \n" // 8 processed per loop. "subs %2, %2, #8 \n" // 8 processed per loop.
"vmull.u8 q8, d0, d4 \n" // B "vmull.u8 q8, d0, d4 \n" // B
...@@ -2198,6 +2434,7 @@ void RGB24ToYRow_NEON(const uint8* src_rgb24, uint8* dst_y, int pix) { ...@@ -2198,6 +2434,7 @@ void RGB24ToYRow_NEON(const uint8* src_rgb24, uint8* dst_y, int pix) {
"vmlal.u8 q8, d2, d6 \n" // R "vmlal.u8 q8, d2, d6 \n" // R
"vqrshrun.s16 d0, q8, #7 \n" // 16 bit to 8 bit Y "vqrshrun.s16 d0, q8, #7 \n" // 16 bit to 8 bit Y
"vqadd.u8 d0, d7 \n" "vqadd.u8 d0, d7 \n"
MEMACCESS(1)
"vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
"bgt 1b \n" "bgt 1b \n"
: "+r"(src_rgb24), // %0 : "+r"(src_rgb24), // %0
...@@ -2216,6 +2453,7 @@ void RAWToYRow_NEON(const uint8* src_raw, uint8* dst_y, int pix) { ...@@ -2216,6 +2453,7 @@ void RAWToYRow_NEON(const uint8* src_raw, uint8* dst_y, int pix) {
"vmov.u8 d7, #16 \n" // Add 16 constant "vmov.u8 d7, #16 \n" // Add 16 constant
".p2align 2 \n" ".p2align 2 \n"
"1: \n" "1: \n"
MEMACCESS(0)
"vld3.8 {d0, d1, d2}, [%0]! \n" // load 8 pixels of RAW. "vld3.8 {d0, d1, d2}, [%0]! \n" // load 8 pixels of RAW.
"subs %2, %2, #8 \n" // 8 processed per loop. "subs %2, %2, #8 \n" // 8 processed per loop.
"vmull.u8 q8, d0, d4 \n" // B "vmull.u8 q8, d0, d4 \n" // B
...@@ -2223,6 +2461,7 @@ void RAWToYRow_NEON(const uint8* src_raw, uint8* dst_y, int pix) { ...@@ -2223,6 +2461,7 @@ void RAWToYRow_NEON(const uint8* src_raw, uint8* dst_y, int pix) {
"vmlal.u8 q8, d2, d6 \n" // R "vmlal.u8 q8, d2, d6 \n" // R
"vqrshrun.s16 d0, q8, #7 \n" // 16 bit to 8 bit Y "vqrshrun.s16 d0, q8, #7 \n" // 16 bit to 8 bit Y
"vqadd.u8 d0, d7 \n" "vqadd.u8 d0, d7 \n"
MEMACCESS(1)
"vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
"bgt 1b \n" "bgt 1b \n"
: "+r"(src_raw), // %0 : "+r"(src_raw), // %0
...@@ -2253,7 +2492,9 @@ void InterpolateRow_NEON(uint8* dst_ptr, ...@@ -2253,7 +2492,9 @@ void InterpolateRow_NEON(uint8* dst_ptr,
"vdup.8 d4, %4 \n" "vdup.8 d4, %4 \n"
// General purpose row blend. // General purpose row blend.
"1: \n" "1: \n"
MEMACCESS(1)
"vld1.8 {q0}, [%1]! \n" "vld1.8 {q0}, [%1]! \n"
MEMACCESS(2)
"vld1.8 {q1}, [%2]! \n" "vld1.8 {q1}, [%2]! \n"
"subs %3, %3, #16 \n" "subs %3, %3, #16 \n"
"vmull.u8 q13, d0, d4 \n" "vmull.u8 q13, d0, d4 \n"
...@@ -2262,46 +2503,58 @@ void InterpolateRow_NEON(uint8* dst_ptr, ...@@ -2262,46 +2503,58 @@ void InterpolateRow_NEON(uint8* dst_ptr,
"vmlal.u8 q14, d3, d5 \n" "vmlal.u8 q14, d3, d5 \n"
"vrshrn.u16 d0, q13, #8 \n" "vrshrn.u16 d0, q13, #8 \n"
"vrshrn.u16 d1, q14, #8 \n" "vrshrn.u16 d1, q14, #8 \n"
MEMACCESS(0)
"vst1.8 {q0}, [%0]! \n" "vst1.8 {q0}, [%0]! \n"
"bgt 1b \n" "bgt 1b \n"
"b 99f \n" "b 99f \n"
// Blend 25 / 75. // Blend 25 / 75.
"25: \n" "25: \n"
MEMACCESS(1)
"vld1.8 {q0}, [%1]! \n" "vld1.8 {q0}, [%1]! \n"
MEMACCESS(2)
"vld1.8 {q1}, [%2]! \n" "vld1.8 {q1}, [%2]! \n"
"subs %3, %3, #16 \n" "subs %3, %3, #16 \n"
"vrhadd.u8 q0, q1 \n" "vrhadd.u8 q0, q1 \n"
"vrhadd.u8 q0, q1 \n" "vrhadd.u8 q0, q1 \n"
MEMACCESS(0)
"vst1.8 {q0}, [%0]! \n" "vst1.8 {q0}, [%0]! \n"
"bgt 25b \n" "bgt 25b \n"
"b 99f \n" "b 99f \n"
// Blend 50 / 50. // Blend 50 / 50.
"50: \n" "50: \n"
MEMACCESS(1)
"vld1.8 {q0}, [%1]! \n" "vld1.8 {q0}, [%1]! \n"
MEMACCESS(2)
"vld1.8 {q1}, [%2]! \n" "vld1.8 {q1}, [%2]! \n"
"subs %3, %3, #16 \n" "subs %3, %3, #16 \n"
"vrhadd.u8 q0, q1 \n" "vrhadd.u8 q0, q1 \n"
MEMACCESS(0)
"vst1.8 {q0}, [%0]! \n" "vst1.8 {q0}, [%0]! \n"
"bgt 50b \n" "bgt 50b \n"
"b 99f \n" "b 99f \n"
// Blend 75 / 25. // Blend 75 / 25.
"75: \n" "75: \n"
MEMACCESS(1)
"vld1.8 {q1}, [%1]! \n" "vld1.8 {q1}, [%1]! \n"
MEMACCESS(2)
"vld1.8 {q0}, [%2]! \n" "vld1.8 {q0}, [%2]! \n"
"subs %3, %3, #16 \n" "subs %3, %3, #16 \n"
"vrhadd.u8 q0, q1 \n" "vrhadd.u8 q0, q1 \n"
"vrhadd.u8 q0, q1 \n" "vrhadd.u8 q0, q1 \n"
MEMACCESS(0)
"vst1.8 {q0}, [%0]! \n" "vst1.8 {q0}, [%0]! \n"
"bgt 75b \n" "bgt 75b \n"
"b 99f \n" "b 99f \n"
// Blend 100 / 0 - Copy row unchanged. // Blend 100 / 0 - Copy row unchanged.
"100: \n" "100: \n"
MEMACCESS(1)
"vld1.8 {q0}, [%1]! \n" "vld1.8 {q0}, [%1]! \n"
"subs %3, %3, #16 \n" "subs %3, %3, #16 \n"
MEMACCESS(0)
"vst1.8 {q0}, [%0]! \n" "vst1.8 {q0}, [%0]! \n"
"bgt 100b \n" "bgt 100b \n"
...@@ -2324,7 +2577,9 @@ void ARGBBlendRow_NEON(const uint8* src_argb0, const uint8* src_argb1, ...@@ -2324,7 +2577,9 @@ void ARGBBlendRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
"blt 89f \n" "blt 89f \n"
// Blend 8 pixels. // Blend 8 pixels.
"8: \n" "8: \n"
MEMACCESS(0)
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of ARGB0. "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of ARGB0.
MEMACCESS(1)
"vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load 8 pixels of ARGB1. "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load 8 pixels of ARGB1.
"subs %3, %3, #8 \n" // 8 processed per loop. "subs %3, %3, #8 \n" // 8 processed per loop.
"vmull.u8 q10, d4, d3 \n" // db * a "vmull.u8 q10, d4, d3 \n" // db * a
...@@ -2338,6 +2593,7 @@ void ARGBBlendRow_NEON(const uint8* src_argb0, const uint8* src_argb1, ...@@ -2338,6 +2593,7 @@ void ARGBBlendRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
"vqadd.u8 q0, q0, q2 \n" // + sbg "vqadd.u8 q0, q0, q2 \n" // + sbg
"vqadd.u8 d2, d2, d6 \n" // + sr "vqadd.u8 d2, d2, d6 \n" // + sr
"vmov.u8 d3, #255 \n" // a = 255 "vmov.u8 d3, #255 \n" // a = 255
MEMACCESS(2)
"vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 pixels of ARGB. "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 pixels of ARGB.
"bge 8b \n" "bge 8b \n"
...@@ -2347,7 +2603,9 @@ void ARGBBlendRow_NEON(const uint8* src_argb0, const uint8* src_argb1, ...@@ -2347,7 +2603,9 @@ void ARGBBlendRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
// Blend 1 pixels. // Blend 1 pixels.
"1: \n" "1: \n"
MEMACCESS(0)
"vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [%0]! \n" // load 1 pixel ARGB0. "vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [%0]! \n" // load 1 pixel ARGB0.
MEMACCESS(1)
"vld4.8 {d4[0],d5[0],d6[0],d7[0]}, [%1]! \n" // load 1 pixel ARGB1. "vld4.8 {d4[0],d5[0],d6[0],d7[0]}, [%1]! \n" // load 1 pixel ARGB1.
"subs %3, %3, #1 \n" // 1 processed per loop. "subs %3, %3, #1 \n" // 1 processed per loop.
"vmull.u8 q10, d4, d3 \n" // db * a "vmull.u8 q10, d4, d3 \n" // db * a
...@@ -2361,6 +2619,7 @@ void ARGBBlendRow_NEON(const uint8* src_argb0, const uint8* src_argb1, ...@@ -2361,6 +2619,7 @@ void ARGBBlendRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
"vqadd.u8 q0, q0, q2 \n" // + sbg "vqadd.u8 q0, q0, q2 \n" // + sbg
"vqadd.u8 d2, d2, d6 \n" // + sr "vqadd.u8 d2, d2, d6 \n" // + sr
"vmov.u8 d3, #255 \n" // a = 255 "vmov.u8 d3, #255 \n" // a = 255
MEMACCESS(2)
"vst4.8 {d0[0],d1[0],d2[0],d3[0]}, [%2]! \n" // store 1 pixel. "vst4.8 {d0[0],d1[0],d2[0],d3[0]}, [%2]! \n" // store 1 pixel.
"bge 1b \n" "bge 1b \n"
...@@ -2380,6 +2639,7 @@ void ARGBAttenuateRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) { ...@@ -2380,6 +2639,7 @@ void ARGBAttenuateRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) {
asm volatile ( asm volatile (
// Attenuate 8 pixels. // Attenuate 8 pixels.
"1: \n" "1: \n"
MEMACCESS(0)
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of ARGB. "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of ARGB.
"subs %2, %2, #8 \n" // 8 processed per loop. "subs %2, %2, #8 \n" // 8 processed per loop.
"vmull.u8 q10, d0, d3 \n" // b * a "vmull.u8 q10, d0, d3 \n" // b * a
...@@ -2388,6 +2648,7 @@ void ARGBAttenuateRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) { ...@@ -2388,6 +2648,7 @@ void ARGBAttenuateRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) {
"vqrshrn.u16 d0, q10, #8 \n" // b >>= 8 "vqrshrn.u16 d0, q10, #8 \n" // b >>= 8
"vqrshrn.u16 d1, q11, #8 \n" // g >>= 8 "vqrshrn.u16 d1, q11, #8 \n" // g >>= 8
"vqrshrn.u16 d2, q12, #8 \n" // r >>= 8 "vqrshrn.u16 d2, q12, #8 \n" // r >>= 8
MEMACCESS(1)
"vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB. "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB.
"bgt 1b \n" "bgt 1b \n"
: "+r"(src_argb), // %0 : "+r"(src_argb), // %0
...@@ -2411,6 +2672,7 @@ void ARGBQuantizeRow_NEON(uint8* dst_argb, int scale, int interval_size, ...@@ -2411,6 +2672,7 @@ void ARGBQuantizeRow_NEON(uint8* dst_argb, int scale, int interval_size,
// 8 pixel loop. // 8 pixel loop.
".p2align 2 \n" ".p2align 2 \n"
"1: \n" "1: \n"
MEMACCESS(0)
"vld4.8 {d0, d2, d4, d6}, [%0] \n" // load 8 pixels of ARGB. "vld4.8 {d0, d2, d4, d6}, [%0] \n" // load 8 pixels of ARGB.
"subs %1, %1, #8 \n" // 8 processed per loop. "subs %1, %1, #8 \n" // 8 processed per loop.
"vmovl.u8 q0, d0 \n" // b (0 .. 255) "vmovl.u8 q0, d0 \n" // b (0 .. 255)
...@@ -2428,6 +2690,7 @@ void ARGBQuantizeRow_NEON(uint8* dst_argb, int scale, int interval_size, ...@@ -2428,6 +2690,7 @@ void ARGBQuantizeRow_NEON(uint8* dst_argb, int scale, int interval_size,
"vqmovn.u16 d0, q0 \n" "vqmovn.u16 d0, q0 \n"
"vqmovn.u16 d2, q1 \n" "vqmovn.u16 d2, q1 \n"
"vqmovn.u16 d4, q2 \n" "vqmovn.u16 d4, q2 \n"
MEMACCESS(0)
"vst4.8 {d0, d2, d4, d6}, [%0]! \n" // store 8 pixels of ARGB. "vst4.8 {d0, d2, d4, d6}, [%0]! \n" // store 8 pixels of ARGB.
"bgt 1b \n" "bgt 1b \n"
: "+r"(dst_argb), // %0 : "+r"(dst_argb), // %0
...@@ -2452,6 +2715,7 @@ void ARGBShadeRow_NEON(const uint8* src_argb, uint8* dst_argb, int width, ...@@ -2452,6 +2715,7 @@ void ARGBShadeRow_NEON(const uint8* src_argb, uint8* dst_argb, int width,
// 8 pixel loop. // 8 pixel loop.
".p2align 2 \n" ".p2align 2 \n"
"1: \n" "1: \n"
MEMACCESS(0)
"vld4.8 {d20, d22, d24, d26}, [%0]! \n" // load 8 pixels of ARGB. "vld4.8 {d20, d22, d24, d26}, [%0]! \n" // load 8 pixels of ARGB.
"subs %2, %2, #8 \n" // 8 processed per loop. "subs %2, %2, #8 \n" // 8 processed per loop.
"vmovl.u8 q10, d20 \n" // b (0 .. 255) "vmovl.u8 q10, d20 \n" // b (0 .. 255)
...@@ -2466,6 +2730,7 @@ void ARGBShadeRow_NEON(const uint8* src_argb, uint8* dst_argb, int width, ...@@ -2466,6 +2730,7 @@ void ARGBShadeRow_NEON(const uint8* src_argb, uint8* dst_argb, int width,
"vqmovn.u16 d22, q11 \n" "vqmovn.u16 d22, q11 \n"
"vqmovn.u16 d24, q12 \n" "vqmovn.u16 d24, q12 \n"
"vqmovn.u16 d26, q13 \n" "vqmovn.u16 d26, q13 \n"
MEMACCESS(1)
"vst4.8 {d20, d22, d24, d26}, [%1]! \n" // store 8 pixels of ARGB. "vst4.8 {d20, d22, d24, d26}, [%1]! \n" // store 8 pixels of ARGB.
"bgt 1b \n" "bgt 1b \n"
: "+r"(src_argb), // %0 : "+r"(src_argb), // %0
...@@ -2486,6 +2751,7 @@ void ARGBGrayRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) { ...@@ -2486,6 +2751,7 @@ void ARGBGrayRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) {
"vmov.u8 d26, #38 \n" // R * 0.29900 coefficient "vmov.u8 d26, #38 \n" // R * 0.29900 coefficient
".p2align 2 \n" ".p2align 2 \n"
"1: \n" "1: \n"
MEMACCESS(0)
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
"subs %2, %2, #8 \n" // 8 processed per loop. "subs %2, %2, #8 \n" // 8 processed per loop.
"vmull.u8 q2, d0, d24 \n" // B "vmull.u8 q2, d0, d24 \n" // B
...@@ -2494,6 +2760,7 @@ void ARGBGrayRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) { ...@@ -2494,6 +2760,7 @@ void ARGBGrayRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) {
"vqrshrun.s16 d0, q2, #7 \n" // 15 bit to 8 bit B "vqrshrun.s16 d0, q2, #7 \n" // 15 bit to 8 bit B
"vmov d1, d0 \n" // G "vmov d1, d0 \n" // G
"vmov d2, d0 \n" // R "vmov d2, d0 \n" // R
MEMACCESS(1)
"vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 ARGB pixels. "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 ARGB pixels.
"bgt 1b \n" "bgt 1b \n"
: "+r"(src_argb), // %0 : "+r"(src_argb), // %0
...@@ -2521,6 +2788,7 @@ void ARGBSepiaRow_NEON(uint8* dst_argb, int width) { ...@@ -2521,6 +2788,7 @@ void ARGBSepiaRow_NEON(uint8* dst_argb, int width) {
"vmov.u8 d30, #50 \n" // BR coefficient "vmov.u8 d30, #50 \n" // BR coefficient
".p2align 2 \n" ".p2align 2 \n"
"1: \n" "1: \n"
MEMACCESS(0)
"vld4.8 {d0, d1, d2, d3}, [%0] \n" // load 8 ARGB pixels. "vld4.8 {d0, d1, d2, d3}, [%0] \n" // load 8 ARGB pixels.
"subs %1, %1, #8 \n" // 8 processed per loop. "subs %1, %1, #8 \n" // 8 processed per loop.
"vmull.u8 q2, d0, d20 \n" // B to Sepia B "vmull.u8 q2, d0, d20 \n" // B to Sepia B
...@@ -2535,6 +2803,7 @@ void ARGBSepiaRow_NEON(uint8* dst_argb, int width) { ...@@ -2535,6 +2803,7 @@ void ARGBSepiaRow_NEON(uint8* dst_argb, int width) {
"vqshrn.u16 d0, q2, #7 \n" // 16 bit to 8 bit B "vqshrn.u16 d0, q2, #7 \n" // 16 bit to 8 bit B
"vqshrn.u16 d1, q3, #7 \n" // 16 bit to 8 bit G "vqshrn.u16 d1, q3, #7 \n" // 16 bit to 8 bit G
"vqshrn.u16 d2, q8, #7 \n" // 16 bit to 8 bit R "vqshrn.u16 d2, q8, #7 \n" // 16 bit to 8 bit R
MEMACCESS(0)
"vst4.8 {d0, d1, d2, d3}, [%0]! \n" // store 8 ARGB pixels. "vst4.8 {d0, d1, d2, d3}, [%0]! \n" // store 8 ARGB pixels.
"bgt 1b \n" "bgt 1b \n"
: "+r"(dst_argb), // %0 : "+r"(dst_argb), // %0
...@@ -2551,12 +2820,14 @@ void ARGBSepiaRow_NEON(uint8* dst_argb, int width) { ...@@ -2551,12 +2820,14 @@ void ARGBSepiaRow_NEON(uint8* dst_argb, int width) {
void ARGBColorMatrixRow_NEON(const uint8* src_argb, uint8* dst_argb, void ARGBColorMatrixRow_NEON(const uint8* src_argb, uint8* dst_argb,
const int8* matrix_argb, int width) { const int8* matrix_argb, int width) {
asm volatile ( asm volatile (
MEMACCESS(3)
"vld1.8 {q2}, [%3] \n" // load 3 ARGB vectors. "vld1.8 {q2}, [%3] \n" // load 3 ARGB vectors.
"vmovl.s8 q0, d4 \n" // B,G coefficients s16. "vmovl.s8 q0, d4 \n" // B,G coefficients s16.
"vmovl.s8 q1, d5 \n" // R,A coefficients s16. "vmovl.s8 q1, d5 \n" // R,A coefficients s16.
".p2align 2 \n" ".p2align 2 \n"
"1: \n" "1: \n"
MEMACCESS(0)
"vld4.8 {d16, d18, d20, d22}, [%0]! \n" // load 8 ARGB pixels. "vld4.8 {d16, d18, d20, d22}, [%0]! \n" // load 8 ARGB pixels.
"subs %2, %2, #8 \n" // 8 processed per loop. "subs %2, %2, #8 \n" // 8 processed per loop.
"vmovl.u8 q8, d16 \n" // b (0 .. 255) 16 bit "vmovl.u8 q8, d16 \n" // b (0 .. 255) 16 bit
...@@ -2595,6 +2866,7 @@ void ARGBColorMatrixRow_NEON(const uint8* src_argb, uint8* dst_argb, ...@@ -2595,6 +2866,7 @@ void ARGBColorMatrixRow_NEON(const uint8* src_argb, uint8* dst_argb,
"vqshrun.s16 d18, q13, #6 \n" // 16 bit to 8 bit G "vqshrun.s16 d18, q13, #6 \n" // 16 bit to 8 bit G
"vqshrun.s16 d20, q14, #6 \n" // 16 bit to 8 bit R "vqshrun.s16 d20, q14, #6 \n" // 16 bit to 8 bit R
"vqshrun.s16 d22, q15, #6 \n" // 16 bit to 8 bit A "vqshrun.s16 d22, q15, #6 \n" // 16 bit to 8 bit A
MEMACCESS(1)
"vst4.8 {d16, d18, d20, d22}, [%1]! \n" // store 8 ARGB pixels. "vst4.8 {d16, d18, d20, d22}, [%1]! \n" // store 8 ARGB pixels.
"bgt 1b \n" "bgt 1b \n"
: "+r"(src_argb), // %0 : "+r"(src_argb), // %0
...@@ -2615,7 +2887,9 @@ void ARGBMultiplyRow_NEON(const uint8* src_argb0, const uint8* src_argb1, ...@@ -2615,7 +2887,9 @@ void ARGBMultiplyRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
// 8 pixel loop. // 8 pixel loop.
".p2align 2 \n" ".p2align 2 \n"
"1: \n" "1: \n"
MEMACCESS(0)
"vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
MEMACCESS(1)
"vld4.8 {d1, d3, d5, d7}, [%1]! \n" // load 8 more ARGB pixels. "vld4.8 {d1, d3, d5, d7}, [%1]! \n" // load 8 more ARGB pixels.
"subs %3, %3, #8 \n" // 8 processed per loop. "subs %3, %3, #8 \n" // 8 processed per loop.
"vmull.u8 q0, d0, d1 \n" // multiply B "vmull.u8 q0, d0, d1 \n" // multiply B
...@@ -2626,6 +2900,7 @@ void ARGBMultiplyRow_NEON(const uint8* src_argb0, const uint8* src_argb1, ...@@ -2626,6 +2900,7 @@ void ARGBMultiplyRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
"vrshrn.u16 d1, q1, #8 \n" // 16 bit to 8 bit G "vrshrn.u16 d1, q1, #8 \n" // 16 bit to 8 bit G
"vrshrn.u16 d2, q2, #8 \n" // 16 bit to 8 bit R "vrshrn.u16 d2, q2, #8 \n" // 16 bit to 8 bit R
"vrshrn.u16 d3, q3, #8 \n" // 16 bit to 8 bit A "vrshrn.u16 d3, q3, #8 \n" // 16 bit to 8 bit A
MEMACCESS(2)
"vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels. "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels.
"bgt 1b \n" "bgt 1b \n"
...@@ -2646,11 +2921,14 @@ void ARGBAddRow_NEON(const uint8* src_argb0, const uint8* src_argb1, ...@@ -2646,11 +2921,14 @@ void ARGBAddRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
// 8 pixel loop. // 8 pixel loop.
".p2align 2 \n" ".p2align 2 \n"
"1: \n" "1: \n"
MEMACCESS(0)
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
MEMACCESS(1)
"vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load 8 more ARGB pixels. "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load 8 more ARGB pixels.
"subs %3, %3, #8 \n" // 8 processed per loop. "subs %3, %3, #8 \n" // 8 processed per loop.
"vqadd.u8 q0, q0, q2 \n" // add B, G "vqadd.u8 q0, q0, q2 \n" // add B, G
"vqadd.u8 q1, q1, q3 \n" // add R, A "vqadd.u8 q1, q1, q3 \n" // add R, A
MEMACCESS(2)
"vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels. "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels.
"bgt 1b \n" "bgt 1b \n"
...@@ -2670,11 +2948,14 @@ void ARGBSubtractRow_NEON(const uint8* src_argb0, const uint8* src_argb1, ...@@ -2670,11 +2948,14 @@ void ARGBSubtractRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
// 8 pixel loop. // 8 pixel loop.
".p2align 2 \n" ".p2align 2 \n"
"1: \n" "1: \n"
MEMACCESS(0)
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
MEMACCESS(1)
"vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load 8 more ARGB pixels. "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load 8 more ARGB pixels.
"subs %3, %3, #8 \n" // 8 processed per loop. "subs %3, %3, #8 \n" // 8 processed per loop.
"vqsub.u8 q0, q0, q2 \n" // subtract B, G "vqsub.u8 q0, q0, q2 \n" // subtract B, G
"vqsub.u8 q1, q1, q3 \n" // subtract R, A "vqsub.u8 q1, q1, q3 \n" // subtract R, A
MEMACCESS(2)
"vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels. "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels.
"bgt 1b \n" "bgt 1b \n"
...@@ -2699,12 +2980,15 @@ void SobelRow_NEON(const uint8* src_sobelx, const uint8* src_sobely, ...@@ -2699,12 +2980,15 @@ void SobelRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
// 8 pixel loop. // 8 pixel loop.
".p2align 2 \n" ".p2align 2 \n"
"1: \n" "1: \n"
MEMACCESS(0)
"vld1.8 {d0}, [%0]! \n" // load 8 sobelx. "vld1.8 {d0}, [%0]! \n" // load 8 sobelx.
MEMACCESS(1)
"vld1.8 {d1}, [%1]! \n" // load 8 sobely. "vld1.8 {d1}, [%1]! \n" // load 8 sobely.
"subs %3, %3, #8 \n" // 8 processed per loop. "subs %3, %3, #8 \n" // 8 processed per loop.
"vqadd.u8 d0, d0, d1 \n" // add "vqadd.u8 d0, d0, d1 \n" // add
"vmov.u8 d1, d0 \n" "vmov.u8 d1, d0 \n"
"vmov.u8 d2, d0 \n" "vmov.u8 d2, d0 \n"
MEMACCESS(2)
"vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels. "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels.
"bgt 1b \n" "bgt 1b \n"
: "+r"(src_sobelx), // %0 : "+r"(src_sobelx), // %0
...@@ -2723,10 +3007,13 @@ void SobelToPlaneRow_NEON(const uint8* src_sobelx, const uint8* src_sobely, ...@@ -2723,10 +3007,13 @@ void SobelToPlaneRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
// 16 pixel loop. // 16 pixel loop.
".p2align 2 \n" ".p2align 2 \n"
"1: \n" "1: \n"
MEMACCESS(0)
"vld1.8 {q0}, [%0]! \n" // load 16 sobelx. "vld1.8 {q0}, [%0]! \n" // load 16 sobelx.
MEMACCESS(1)
"vld1.8 {q1}, [%1]! \n" // load 16 sobely. "vld1.8 {q1}, [%1]! \n" // load 16 sobely.
"subs %3, %3, #16 \n" // 16 processed per loop. "subs %3, %3, #16 \n" // 16 processed per loop.
"vqadd.u8 q0, q0, q1 \n" // add "vqadd.u8 q0, q0, q1 \n" // add
MEMACCESS(2)
"vst1.8 {q0}, [%2]! \n" // store 16 pixels. "vst1.8 {q0}, [%2]! \n" // store 16 pixels.
"bgt 1b \n" "bgt 1b \n"
: "+r"(src_sobelx), // %0 : "+r"(src_sobelx), // %0
...@@ -2750,10 +3037,13 @@ void SobelXYRow_NEON(const uint8* src_sobelx, const uint8* src_sobely, ...@@ -2750,10 +3037,13 @@ void SobelXYRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
// 8 pixel loop. // 8 pixel loop.
".p2align 2 \n" ".p2align 2 \n"
"1: \n" "1: \n"
MEMACCESS(0)
"vld1.8 {d2}, [%0]! \n" // load 8 sobelx. "vld1.8 {d2}, [%0]! \n" // load 8 sobelx.
MEMACCESS(1)
"vld1.8 {d0}, [%1]! \n" // load 8 sobely. "vld1.8 {d0}, [%1]! \n" // load 8 sobely.
"subs %3, %3, #8 \n" // 8 processed per loop. "subs %3, %3, #8 \n" // 8 processed per loop.
"vqadd.u8 d1, d0, d2 \n" // add "vqadd.u8 d1, d0, d2 \n" // add
MEMACCESS(2)
"vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels. "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels.
"bgt 1b \n" "bgt 1b \n"
: "+r"(src_sobelx), // %0 : "+r"(src_sobelx), // %0
...@@ -2774,21 +3064,28 @@ void SobelXRow_NEON(const uint8* src_y0, const uint8* src_y1, ...@@ -2774,21 +3064,28 @@ void SobelXRow_NEON(const uint8* src_y0, const uint8* src_y1,
asm volatile ( asm volatile (
".p2align 2 \n" ".p2align 2 \n"
"1: \n" "1: \n"
MEMACCESS(0)
"vld1.8 {d0}, [%0],%5 \n" // top "vld1.8 {d0}, [%0],%5 \n" // top
MEMACCESS(0)
"vld1.8 {d1}, [%0],%6 \n" "vld1.8 {d1}, [%0],%6 \n"
"vsubl.u8 q0, d0, d1 \n" "vsubl.u8 q0, d0, d1 \n"
MEMACCESS(1)
"vld1.8 {d2}, [%1],%5 \n" // center * 2 "vld1.8 {d2}, [%1],%5 \n" // center * 2
MEMACCESS(1)
"vld1.8 {d3}, [%1],%6 \n" "vld1.8 {d3}, [%1],%6 \n"
"vsubl.u8 q1, d2, d3 \n" "vsubl.u8 q1, d2, d3 \n"
"vadd.s16 q0, q0, q1 \n" "vadd.s16 q0, q0, q1 \n"
"vadd.s16 q0, q0, q1 \n" "vadd.s16 q0, q0, q1 \n"
MEMACCESS(2)
"vld1.8 {d2}, [%2],%5 \n" // bottom "vld1.8 {d2}, [%2],%5 \n" // bottom
MEMACCESS(2)
"vld1.8 {d3}, [%2],%6 \n" "vld1.8 {d3}, [%2],%6 \n"
"subs %4, %4, #8 \n" // 8 pixels "subs %4, %4, #8 \n" // 8 pixels
"vsubl.u8 q1, d2, d3 \n" "vsubl.u8 q1, d2, d3 \n"
"vadd.s16 q0, q0, q1 \n" "vadd.s16 q0, q0, q1 \n"
"vabs.s16 q0, q0 \n" "vabs.s16 q0, q0 \n"
"vqmovn.u16 d0, q0 \n" "vqmovn.u16 d0, q0 \n"
MEMACCESS(3)
"vst1.8 {d0}, [%3]! \n" // store 8 sobelx "vst1.8 {d0}, [%3]! \n" // store 8 sobelx
"bgt 1b \n" "bgt 1b \n"
: "+r"(src_y0), // %0 : "+r"(src_y0), // %0
...@@ -2811,21 +3108,28 @@ void SobelYRow_NEON(const uint8* src_y0, const uint8* src_y1, ...@@ -2811,21 +3108,28 @@ void SobelYRow_NEON(const uint8* src_y0, const uint8* src_y1,
asm volatile ( asm volatile (
".p2align 2 \n" ".p2align 2 \n"
"1: \n" "1: \n"
MEMACCESS(0)
"vld1.8 {d0}, [%0],%4 \n" // left "vld1.8 {d0}, [%0],%4 \n" // left
MEMACCESS(1)
"vld1.8 {d1}, [%1],%4 \n" "vld1.8 {d1}, [%1],%4 \n"
"vsubl.u8 q0, d0, d1 \n" "vsubl.u8 q0, d0, d1 \n"
MEMACCESS(0)
"vld1.8 {d2}, [%0],%4 \n" // center * 2 "vld1.8 {d2}, [%0],%4 \n" // center * 2
MEMACCESS(1)
"vld1.8 {d3}, [%1],%4 \n" "vld1.8 {d3}, [%1],%4 \n"
"vsubl.u8 q1, d2, d3 \n" "vsubl.u8 q1, d2, d3 \n"
"vadd.s16 q0, q0, q1 \n" "vadd.s16 q0, q0, q1 \n"
"vadd.s16 q0, q0, q1 \n" "vadd.s16 q0, q0, q1 \n"
MEMACCESS(0)
"vld1.8 {d2}, [%0],%5 \n" // right "vld1.8 {d2}, [%0],%5 \n" // right
MEMACCESS(1)
"vld1.8 {d3}, [%1],%5 \n" "vld1.8 {d3}, [%1],%5 \n"
"subs %3, %3, #8 \n" // 8 pixels "subs %3, %3, #8 \n" // 8 pixels
"vsubl.u8 q1, d2, d3 \n" "vsubl.u8 q1, d2, d3 \n"
"vadd.s16 q0, q0, q1 \n" "vadd.s16 q0, q0, q1 \n"
"vabs.s16 q0, q0 \n" "vabs.s16 q0, q0 \n"
"vqmovn.u16 d0, q0 \n" "vqmovn.u16 d0, q0 \n"
MEMACCESS(2)
"vst1.8 {d0}, [%2]! \n" // store 8 sobely "vst1.8 {d0}, [%2]! \n" // store 8 sobely
"bgt 1b \n" "bgt 1b \n"
: "+r"(src_y0), // %0 : "+r"(src_y0), // %0
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment