Commit 147bbede authored by ashok.bhat@gmail.com

Row AArch64 Neon implementation - Part 8

BUG=319
TESTED=libyuv_unittest
R=fbarchard@google.com

Change-Id: If30eb2d255a09dece9d216a9d29317dd748ef496
Signed-off-by: Ashok Bhat <ashok.bhat@arm.com>

Review URL: https://webrtc-codereview.appspot.com/22769004

git-svn-id: http://libyuv.googlecode.com/svn/trunk@1109 16f28f9a-4ce2-e073-06de-1de4eb20be90
parent d1a0e7e7
 Name: libyuv
 URL: http://code.google.com/p/libyuv/
-Version: 1107
+Version: 1109
 License: BSD
 License File: LICENSE
...
@@ -266,15 +266,15 @@ extern "C" {
 #define HAS_I422TORGBAROW_NEON
 #define HAS_I422TORGB24ROW_NEON
 #define HAS_I422TORAWROW_NEON
-// #define HAS_I422TORGB565ROW_NEON
-// #define HAS_I422TOARGB1555ROW_NEON
-// #define HAS_I422TOARGB4444ROW_NEON
+#define HAS_I422TORGB565ROW_NEON
+#define HAS_I422TOARGB1555ROW_NEON
+#define HAS_I422TOARGB4444ROW_NEON
 #define HAS_YTOARGBROW_NEON
 #define HAS_I400TOARGBROW_NEON
 #define HAS_NV12TOARGBROW_NEON
 #define HAS_NV21TOARGBROW_NEON
-// #define HAS_NV12TORGB565ROW_NEON
-// #define HAS_NV21TORGB565ROW_NEON
+#define HAS_NV12TORGB565ROW_NEON
+#define HAS_NV21TORGB565ROW_NEON
 #define HAS_YUY2TOARGBROW_NEON
 #define HAS_UYVYTOARGBROW_NEON
 #define HAS_SPLITUVROW_NEON
@@ -311,13 +311,13 @@ extern "C" {
 #define HAS_ARGBTOUV444ROW_NEON
 #define HAS_ARGBTOUV422ROW_NEON
 #define HAS_ARGBTOUV411ROW_NEON
-// #define HAS_ARGBTOUVROW_NEON
-// #define HAS_ARGBTOUVJROW_NEON
-// #define HAS_BGRATOUVROW_NEON
-// #define HAS_ABGRTOUVROW_NEON
-// #define HAS_RGBATOUVROW_NEON
-// #define HAS_RGB24TOUVROW_NEON
-// #define HAS_RAWTOUVROW_NEON
+#define HAS_ARGBTOUVROW_NEON
+#define HAS_ARGBTOUVJROW_NEON
+#define HAS_BGRATOUVROW_NEON
+#define HAS_ABGRTOUVROW_NEON
+#define HAS_RGBATOUVROW_NEON
+#define HAS_RGB24TOUVROW_NEON
+#define HAS_RAWTOUVROW_NEON
 #define HAS_RGB565TOUVROW_NEON
 #define HAS_ARGB1555TOUVROW_NEON
 #define HAS_ARGB4444TOUVROW_NEON
...
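Enabling these HAS_ macros is what actually switches callers over to the new AArch64 rows: the portable conversion code only compiles a NEON branch when the corresponding HAS_ define is present. Roughly, the usual libyuv dispatch pattern looks like the following sketch (illustrative only, not the caller code touched by this change):

#if defined(HAS_ARGBTOUVROW_NEON)
  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 16)) {
    ARGBToUVRow = ARGBToUVRow_NEON;  // pick the NEON row function at runtime
  }
#endif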
@@ -11,6 +11,6 @@
 #ifndef INCLUDE_LIBYUV_VERSION_H_  // NOLINT
 #define INCLUDE_LIBYUV_VERSION_H_
-#define LIBYUV_VERSION 1107
+#define LIBYUV_VERSION 1109
 #endif  // INCLUDE_LIBYUV_VERSION_H_  NOLINT
@@ -86,19 +86,19 @@ extern "C" {
 #define READUYVY \
     MEMACCESS(0) \
     "ld2        {v2.8b, v3.8b}, [%0], #16      \n" \
-    "mov        v0.8b, v3.8b                   \n" \
+    "orr        v0.8b, v3.8b, v3.8b            \n" \
     "uzp1       v1.8b, v2.8b, v2.8b            \n" \
     "uzp2       v3.8b, v2.8b, v2.8b            \n" \
     "ins        v1.s[1], v3.s[0]               \n"
 #define YUV422TORGB_SETUP_REG \
     "movi       v24.8b, #128                   \n" \
-    "movi       v25.8h, #74                    \n" /* YG */ \
-    "movi       v26.8h, #16                    \n" \
-    "movi       v27.8h, #127                   \n" /* UB */ \
-    "movi       v28.8h, #102                   \n" /* VR */ \
-    "mvni       v29.8h, #0x18                  \n" /* UG -25 */ \
-    "mvni       v30.8h, #0x33                  \n" /* VG -52 */
+    "movi       v25.8h, #74, lsl #0            \n" /* YG */ \
+    "movi       v26.8h, #16, lsl #0            \n" \
+    "movi       v27.8h, #127, lsl #0           \n" /* UB */ \
+    "movi       v28.8h, #102, lsl #0           \n" /* VR */ \
+    "mvni       v29.8h, #0x18, lsl #0          \n" /* UG -25 */ \
+    "mvni       v30.8h, #0x33, lsl #0          \n" /* VG -52 */
 #define YUV422TORGB(vR, vG, vB) \
     "eor        v1.8b, v1.8b, v24.8b           \n" /* Subtract 128 from U&V */ \
@@ -122,6 +122,15 @@ extern "C" {
     "sqshrun    " #vB ".8b, " #vB ".8h, #6     \n" /* B */ \
     "sqshrun    " #vG ".8b, " #vG ".8h, #6     \n" /* G */
+
+#define RGBTOUV_SETUP_REG \
+    "movi       v20.8h, #56, lsl #0            \n" /* UB/VR coefficient (0.875) / 2 */ \
+    "movi       v21.8h, #37, lsl #0            \n" /* UG coefficient (-0.5781) / 2 */ \
+    "movi       v22.8h, #19, lsl #0            \n" /* UR coefficient (-0.2969) / 2 */ \
+    "movi       v23.8h, #9, lsl #0             \n" /* VB coefficient (-0.1406) / 2 */ \
+    "movi       v24.8h, #47, lsl #0            \n" /* VG coefficient (-0.7344) / 2 */ \
+    "movi       v25.16b, #0x80                 \n" /* 128.5 (0x8080 in 16-bit) */
#ifdef HAS_I444TOARGBROW_NEON #ifdef HAS_I444TOARGBROW_NEON
void I444ToARGBRow_NEON(const uint8* src_y, void I444ToARGBRow_NEON(const uint8* src_y,
const uint8* src_u, const uint8* src_u,
...@@ -130,14 +139,13 @@ void I444ToARGBRow_NEON(const uint8* src_y, ...@@ -130,14 +139,13 @@ void I444ToARGBRow_NEON(const uint8* src_y,
int width) { int width) {
asm volatile ( asm volatile (
YUV422TORGB_SETUP_REG YUV422TORGB_SETUP_REG
".p2align 2 \n"
"1: \n" "1: \n"
READYUV444 READYUV444
YUV422TORGB(v22, v21, v20) YUV422TORGB(v22, v21, v20)
"subs %4, %4, #8 \n" "subs %4, %4, #8 \n"
"movi v23.8b, #255 \n" /* A */ "movi v23.8b, #255 \n" /* A */
MEMACCESS(3) MEMACCESS(3)
"st4 {v20.8b-v23.8b}, [%3], #32 \n" "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n"
"b.gt 1b \n" "b.gt 1b \n"
: "+r"(src_y), // %0 : "+r"(src_y), // %0
"+r"(src_u), // %1 "+r"(src_u), // %1
...@@ -159,14 +167,13 @@ void I422ToARGBRow_NEON(const uint8* src_y, ...@@ -159,14 +167,13 @@ void I422ToARGBRow_NEON(const uint8* src_y,
int width) { int width) {
asm volatile ( asm volatile (
YUV422TORGB_SETUP_REG YUV422TORGB_SETUP_REG
".p2align 2 \n"
"1: \n" "1: \n"
READYUV422 READYUV422
YUV422TORGB(v22, v21, v20) YUV422TORGB(v22, v21, v20)
"subs %4, %4, #8 \n" "subs %4, %4, #8 \n"
"movi v23.8b, #255 \n" /* A */ "movi v23.8b, #255 \n" /* A */
MEMACCESS(3) MEMACCESS(3)
"st4 {v20.8b-v23.8b}, [%3], #32 \n" "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n"
"b.gt 1b \n" "b.gt 1b \n"
: "+r"(src_y), // %0 : "+r"(src_y), // %0
"+r"(src_u), // %1 "+r"(src_u), // %1
...@@ -188,14 +195,13 @@ void I411ToARGBRow_NEON(const uint8* src_y, ...@@ -188,14 +195,13 @@ void I411ToARGBRow_NEON(const uint8* src_y,
int width) { int width) {
asm volatile ( asm volatile (
YUV422TORGB_SETUP_REG YUV422TORGB_SETUP_REG
".p2align 2 \n"
"1: \n" "1: \n"
READYUV411 READYUV411
YUV422TORGB(v22, v21, v20) YUV422TORGB(v22, v21, v20)
"subs %4, %4, #8 \n" "subs %4, %4, #8 \n"
"movi v23.8b, #255 \n" /* A */ "movi v23.8b, #255 \n" /* A */
MEMACCESS(3) MEMACCESS(3)
"st4 {v20.8b-v23.8b}, [%3], #32 \n" "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n"
"b.gt 1b \n" "b.gt 1b \n"
: "+r"(src_y), // %0 : "+r"(src_y), // %0
"+r"(src_u), // %1 "+r"(src_u), // %1
...@@ -217,14 +223,13 @@ void I422ToBGRARow_NEON(const uint8* src_y, ...@@ -217,14 +223,13 @@ void I422ToBGRARow_NEON(const uint8* src_y,
int width) { int width) {
asm volatile ( asm volatile (
YUV422TORGB_SETUP_REG YUV422TORGB_SETUP_REG
".p2align 2 \n"
"1: \n" "1: \n"
READYUV422 READYUV422
YUV422TORGB(v21, v22, v23) YUV422TORGB(v21, v22, v23)
"subs %4, %4, #8 \n" "subs %4, %4, #8 \n"
"movi v20.8b, #255 \n" /* A */ "movi v20.8b, #255 \n" /* A */
MEMACCESS(3) MEMACCESS(3)
"st4 {v20.8b-v23.8b}, [%3], #32 \n" "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n"
"b.gt 1b \n" "b.gt 1b \n"
: "+r"(src_y), // %0 : "+r"(src_y), // %0
"+r"(src_u), // %1 "+r"(src_u), // %1
...@@ -246,14 +251,13 @@ void I422ToABGRRow_NEON(const uint8* src_y, ...@@ -246,14 +251,13 @@ void I422ToABGRRow_NEON(const uint8* src_y,
int width) { int width) {
asm volatile ( asm volatile (
YUV422TORGB_SETUP_REG YUV422TORGB_SETUP_REG
".p2align 2 \n"
"1: \n" "1: \n"
READYUV422 READYUV422
YUV422TORGB(v20, v21, v22) YUV422TORGB(v20, v21, v22)
"subs %4, %4, #8 \n" "subs %4, %4, #8 \n"
"movi v23.8b, #255 \n" /* A */ "movi v23.8b, #255 \n" /* A */
MEMACCESS(3) MEMACCESS(3)
"st4 {v20.8b-v23.8b}, [%3], #32 \n" "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n"
"b.gt 1b \n" "b.gt 1b \n"
: "+r"(src_y), // %0 : "+r"(src_y), // %0
"+r"(src_u), // %1 "+r"(src_u), // %1
...@@ -275,14 +279,13 @@ void I422ToRGBARow_NEON(const uint8* src_y, ...@@ -275,14 +279,13 @@ void I422ToRGBARow_NEON(const uint8* src_y,
int width) { int width) {
asm volatile ( asm volatile (
YUV422TORGB_SETUP_REG YUV422TORGB_SETUP_REG
".p2align 2 \n"
"1: \n" "1: \n"
READYUV422 READYUV422
YUV422TORGB(v23, v22, v21) YUV422TORGB(v23, v22, v21)
"subs %4, %4, #8 \n" "subs %4, %4, #8 \n"
"movi v20.8b, #255 \n" /* A */ "movi v20.8b, #255 \n" /* A */
MEMACCESS(3) MEMACCESS(3)
"st4 {v20.8b-v23.8b}, [%3], #32 \n" "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n"
"b.gt 1b \n" "b.gt 1b \n"
: "+r"(src_y), // %0 : "+r"(src_y), // %0
"+r"(src_u), // %1 "+r"(src_u), // %1
...@@ -304,13 +307,12 @@ void I422ToRGB24Row_NEON(const uint8* src_y, ...@@ -304,13 +307,12 @@ void I422ToRGB24Row_NEON(const uint8* src_y,
int width) { int width) {
asm volatile ( asm volatile (
YUV422TORGB_SETUP_REG YUV422TORGB_SETUP_REG
".p2align 2 \n"
"1: \n" "1: \n"
READYUV422 READYUV422
YUV422TORGB(v22, v21, v20) YUV422TORGB(v22, v21, v20)
"subs %4, %4, #8 \n" "subs %4, %4, #8 \n"
MEMACCESS(3) MEMACCESS(3)
"st3 {v20.8b-v22.8b}, [%3], #24 \n" "st3 {v20.8b,v21.8b,v22.8b}, [%3], #24 \n"
"b.gt 1b \n" "b.gt 1b \n"
: "+r"(src_y), // %0 : "+r"(src_y), // %0
"+r"(src_u), // %1 "+r"(src_u), // %1
...@@ -332,13 +334,12 @@ void I422ToRAWRow_NEON(const uint8* src_y, ...@@ -332,13 +334,12 @@ void I422ToRAWRow_NEON(const uint8* src_y,
int width) { int width) {
asm volatile ( asm volatile (
YUV422TORGB_SETUP_REG YUV422TORGB_SETUP_REG
".p2align 2 \n"
"1: \n" "1: \n"
READYUV422 READYUV422
YUV422TORGB(v20, v21, v22) YUV422TORGB(v20, v21, v22)
"subs %4, %4, #8 \n" "subs %4, %4, #8 \n"
MEMACCESS(3) MEMACCESS(3)
"st3 {v20.8b-v22.8b}, [%3], #24 \n" "st3 {v20.8b,v21.8b,v22.8b}, [%3], #24 \n"
"b.gt 1b \n" "b.gt 1b \n"
: "+r"(src_y), // %0 : "+r"(src_y), // %0
"+r"(src_u), // %1 "+r"(src_u), // %1
@@ -366,31 +367,23 @@ void I422ToRGB565Row_NEON(const uint8* src_y,
                           uint8* dst_rgb565,
                           int width) {
   asm volatile (
-    MEMACCESS(5)
-    "vld1.8     {d24}, [%5]                    \n"
-    MEMACCESS(6)
-    "vld1.8     {d25}, [%6]                    \n"
-    "vmov.u8    d26, #128                      \n"
-    "vmov.u16   q14, #74                       \n"
-    "vmov.u16   q15, #16                       \n"
-    ".p2align   2                              \n"
+    YUV422TORGB_SETUP_REG
     "1:                                        \n"
     READYUV422
-    YUV422TORGB
+    YUV422TORGB(v22, v21, v20)
     "subs       %4, %4, #8                     \n"
     ARGBTORGB565
     MEMACCESS(3)
-    "vst1.8     {q0}, [%3]!                    \n"  // store 8 pixels RGB565.
+    "st1        {v0.8h}, [%3], #16             \n"  // store 8 pixels RGB565.
     "b.gt       1b                             \n"
   : "+r"(src_y),       // %0
     "+r"(src_u),       // %1
    "+r"(src_v),       // %2
    "+r"(dst_rgb565),  // %3
    "+r"(width)        // %4
-  : "r"(&kUVToRB),  // %5
-    "r"(&kUVToG)    // %6
-  : "cc", "memory", "q0", "q1", "q2", "q3",
-    "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+  :
+  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
+    "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
  );
 }
 #endif  // HAS_I422TORGB565ROW_NEON
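I422ToRGB565Row_NEON now shares YUV422TORGB_SETUP_REG and YUV422TORGB(v22, v21, v20) with the ARGB converters; only the packing step differs. ARGBTORGB565 (defined elsewhere in this file) squeezes each 8-bit B/G/R triple into a 16-bit 5:6:5 value, so the store writes v0.8h, eight halfwords (16 bytes), per loop iteration. A scalar sketch of that packing (helper name is illustrative, not from libyuv):

static inline uint16 PackRGB565(uint8 r, uint8 g, uint8 b) {
  return (uint16)(((r >> 3) << 11) | ((g >> 2) << 5) | (b >> 3));
}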
...@@ -411,32 +404,24 @@ void I422ToARGB1555Row_NEON(const uint8* src_y, ...@@ -411,32 +404,24 @@ void I422ToARGB1555Row_NEON(const uint8* src_y,
uint8* dst_argb1555, uint8* dst_argb1555,
int width) { int width) {
asm volatile ( asm volatile (
MEMACCESS(5) YUV422TORGB_SETUP_REG
"vld1.8 {d24}, [%5] \n"
MEMACCESS(6)
"vld1.8 {d25}, [%6] \n"
"vmov.u8 d26, #128 \n"
"vmov.u16 q14, #74 \n"
"vmov.u16 q15, #16 \n"
".p2align 2 \n"
"1: \n" "1: \n"
READYUV422 READYUV422
YUV422TORGB YUV422TORGB(v22, v21, v20)
"subs %4, %4, #8 \n" "subs %4, %4, #8 \n"
"vmov.u8 d23, #255 \n" "movi v23.8b, #255 \n"
ARGBTOARGB1555 ARGBTOARGB1555
MEMACCESS(3) MEMACCESS(3)
"vst1.8 {q0}, [%3]! \n" // store 8 pixels ARGB1555. "st1 {v0.8h}, [%3], #16 \n" // store 8 pixels RGB565.
"b.gt 1b \n" "b.gt 1b \n"
: "+r"(src_y), // %0 : "+r"(src_y), // %0
"+r"(src_u), // %1 "+r"(src_u), // %1
"+r"(src_v), // %2 "+r"(src_v), // %2
"+r"(dst_argb1555), // %3 "+r"(dst_argb1555), // %3
"+r"(width) // %4 "+r"(width) // %4
: "r"(&kUVToRB), // %5 :
"r"(&kUVToG) // %6 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
: "cc", "memory", "q0", "q1", "q2", "q3", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
"q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
); );
} }
#endif // HAS_I422TOARGB1555ROW_NEON #endif // HAS_I422TOARGB1555ROW_NEON
...@@ -458,33 +443,25 @@ void I422ToARGB4444Row_NEON(const uint8* src_y, ...@@ -458,33 +443,25 @@ void I422ToARGB4444Row_NEON(const uint8* src_y,
uint8* dst_argb4444, uint8* dst_argb4444,
int width) { int width) {
asm volatile ( asm volatile (
MEMACCESS(5) YUV422TORGB_SETUP_REG
"vld1.8 {d24}, [%5] \n" "movi v4.16b, #0x0f \n" // bits to clear with vbic.
MEMACCESS(6)
"vld1.8 {d25}, [%6] \n"
"vmov.u8 d26, #128 \n"
"vmov.u16 q14, #74 \n"
"vmov.u16 q15, #16 \n"
"vmov.u8 d4, #0x0f \n" // bits to clear with vbic.
".p2align 2 \n"
"1: \n" "1: \n"
READYUV422 READYUV422
YUV422TORGB YUV422TORGB(v22, v21, v20)
"subs %4, %4, #8 \n" "subs %4, %4, #8 \n"
"vmov.u8 d23, #255 \n" "movi v23.8b, #255 \n"
ARGBTOARGB4444 ARGBTOARGB4444
MEMACCESS(3) MEMACCESS(3)
"vst1.8 {q0}, [%3]! \n" // store 8 pixels ARGB4444. "st1 {v0.8h}, [%3], #16 \n" // store 8 pixels ARGB4444.
"b.gt 1b \n" "b.gt 1b \n"
: "+r"(src_y), // %0 : "+r"(src_y), // %0
"+r"(src_u), // %1 "+r"(src_u), // %1
"+r"(src_v), // %2 "+r"(src_v), // %2
"+r"(dst_argb4444), // %3 "+r"(dst_argb4444), // %3
"+r"(width) // %4 "+r"(width) // %4
: "r"(&kUVToRB), // %5 :
"r"(&kUVToG) // %6 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
: "cc", "memory", "q0", "q1", "q2", "q3", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
"q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
); );
} }
#endif // HAS_I422TOARGB4444ROW_NEON #endif // HAS_I422TOARGB4444ROW_NEON
...@@ -495,14 +472,13 @@ void YToARGBRow_NEON(const uint8* src_y, ...@@ -495,14 +472,13 @@ void YToARGBRow_NEON(const uint8* src_y,
int width) { int width) {
asm volatile ( asm volatile (
YUV422TORGB_SETUP_REG YUV422TORGB_SETUP_REG
".p2align 2 \n"
"1: \n" "1: \n"
READYUV400 READYUV400
YUV422TORGB(v22, v21, v20) YUV422TORGB(v22, v21, v20)
"subs %2, %2, #8 \n" "subs %2, %2, #8 \n"
"movi v23.8b, #255 \n" "movi v23.8b, #255 \n"
MEMACCESS(1) MEMACCESS(1)
"st4 {v20.8b-v23.8b}, [%1], #32 \n" "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n"
"b.gt 1b \n" "b.gt 1b \n"
: "+r"(src_y), // %0 : "+r"(src_y), // %0
"+r"(dst_argb), // %1 "+r"(dst_argb), // %1
...@@ -519,16 +495,15 @@ void I400ToARGBRow_NEON(const uint8* src_y, ...@@ -519,16 +495,15 @@ void I400ToARGBRow_NEON(const uint8* src_y,
uint8* dst_argb, uint8* dst_argb,
int width) { int width) {
asm volatile ( asm volatile (
".p2align 2 \n"
"movi v23.8b, #255 \n" "movi v23.8b, #255 \n"
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"ld1 {v20.8b}, [%0], #8 \n" "ld1 {v20.8b}, [%0], #8 \n"
"mov v21.8b, v20.8b \n" "orr v21.8b, v20.8b, v20.8b \n"
"mov v22.8b, v20.8b \n" "orr v22.8b, v20.8b, v20.8b \n"
"subs %2, %2, #8 \n" "subs %2, %2, #8 \n"
MEMACCESS(1) MEMACCESS(1)
"st4 {v20.8b-v23.8b}, [%1], #32 \n" "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n"
"b.gt 1b \n" "b.gt 1b \n"
: "+r"(src_y), // %0 : "+r"(src_y), // %0
"+r"(dst_argb), // %1 "+r"(dst_argb), // %1
...@@ -546,14 +521,13 @@ void NV12ToARGBRow_NEON(const uint8* src_y, ...@@ -546,14 +521,13 @@ void NV12ToARGBRow_NEON(const uint8* src_y,
int width) { int width) {
asm volatile ( asm volatile (
YUV422TORGB_SETUP_REG YUV422TORGB_SETUP_REG
".p2align 2 \n"
"1: \n" "1: \n"
READNV12 READNV12
YUV422TORGB(v22, v21, v20) YUV422TORGB(v22, v21, v20)
"subs %3, %3, #8 \n" "subs %3, %3, #8 \n"
"movi v23.8b, #255 \n" "movi v23.8b, #255 \n"
MEMACCESS(2) MEMACCESS(2)
"st4 {v20.8b-v23.8b}, [%2], #32 \n" "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%2], #32 \n"
"b.gt 1b \n" "b.gt 1b \n"
: "+r"(src_y), // %0 : "+r"(src_y), // %0
"+r"(src_uv), // %1 "+r"(src_uv), // %1
...@@ -573,22 +547,21 @@ void NV21ToARGBRow_NEON(const uint8* src_y, ...@@ -573,22 +547,21 @@ void NV21ToARGBRow_NEON(const uint8* src_y,
int width) { int width) {
asm volatile ( asm volatile (
YUV422TORGB_SETUP_REG YUV422TORGB_SETUP_REG
".p2align 2 \n"
"1: \n" "1: \n"
READNV21 READNV21
YUV422TORGB(v22, v21, v20) YUV422TORGB(v22, v21, v20)
"subs %3, %3, #8 \n" "subs %3, %3, #8 \n"
"movi v23.8b, #255 \n" "movi v23.8b, #255 \n"
MEMACCESS(2) MEMACCESS(2)
"st4 {v20.8b-v23.8b}, [%2], #32 \n" "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%2], #32 \n"
"b.gt 1b \n" "b.gt 1b \n"
: "+r"(src_y), // %0 : "+r"(src_y), // %0
"+r"(src_uv), // %1 "+r"(src_uv), // %1
"+r"(dst_argb), // %2 "+r"(dst_argb), // %2
"+r"(width) // %3 "+r"(width) // %3
: :
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
"v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
); );
} }
#endif // HAS_NV21TOARGBROW_NEON #endif // HAS_NV21TOARGBROW_NEON
...@@ -599,30 +572,22 @@ void NV12ToRGB565Row_NEON(const uint8* src_y, ...@@ -599,30 +572,22 @@ void NV12ToRGB565Row_NEON(const uint8* src_y,
uint8* dst_rgb565, uint8* dst_rgb565,
int width) { int width) {
asm volatile ( asm volatile (
MEMACCESS(4) YUV422TORGB_SETUP_REG
"vld1.8 {d24}, [%4] \n"
MEMACCESS(5)
"vld1.8 {d25}, [%5] \n"
"vmov.u8 d26, #128 \n"
"vmov.u16 q14, #74 \n"
"vmov.u16 q15, #16 \n"
".p2align 2 \n"
"1: \n" "1: \n"
READNV12 READNV12
YUV422TORGB YUV422TORGB(v22, v21, v20)
"subs %3, %3, #8 \n" "subs %3, %3, #8 \n"
ARGBTORGB565 ARGBTORGB565
MEMACCESS(2) MEMACCESS(2)
"vst1.8 {q0}, [%2]! \n" // store 8 pixels RGB565. "st1 {v0.8h}, [%2], 16 \n" // store 8 pixels RGB565.
"b.gt 1b \n" "b.gt 1b \n"
: "+r"(src_y), // %0 : "+r"(src_y), // %0
"+r"(src_uv), // %1 "+r"(src_uv), // %1
"+r"(dst_rgb565), // %2 "+r"(dst_rgb565), // %2
"+r"(width) // %3 "+r"(width) // %3
: "r"(&kUVToRB), // %4 :
"r"(&kUVToG) // %5 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
: "cc", "memory", "q0", "q1", "q2", "q3", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
"q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
); );
} }
#endif // HAS_NV12TORGB565ROW_NEON #endif // HAS_NV12TORGB565ROW_NEON
...@@ -633,30 +598,22 @@ void NV21ToRGB565Row_NEON(const uint8* src_y, ...@@ -633,30 +598,22 @@ void NV21ToRGB565Row_NEON(const uint8* src_y,
uint8* dst_rgb565, uint8* dst_rgb565,
int width) { int width) {
asm volatile ( asm volatile (
MEMACCESS(4) YUV422TORGB_SETUP_REG
"vld1.8 {d24}, [%4] \n"
MEMACCESS(5)
"vld1.8 {d25}, [%5] \n"
"vmov.u8 d26, #128 \n"
"vmov.u16 q14, #74 \n"
"vmov.u16 q15, #16 \n"
".p2align 2 \n"
"1: \n" "1: \n"
READNV21 READNV21
YUV422TORGB YUV422TORGB(v22, v21, v20)
"subs %3, %3, #8 \n" "subs %3, %3, #8 \n"
ARGBTORGB565 ARGBTORGB565
MEMACCESS(2) MEMACCESS(2)
"vst1.8 {q0}, [%2]! \n" // store 8 pixels RGB565. "st1 {v0.8h}, [%2], 16 \n" // store 8 pixels RGB565.
"b.gt 1b \n" "b.gt 1b \n"
: "+r"(src_y), // %0 : "+r"(src_y), // %0
"+r"(src_uv), // %1 "+r"(src_uv), // %1
"+r"(dst_rgb565), // %2 "+r"(dst_rgb565), // %2
"+r"(width) // %3 "+r"(width) // %3
: "r"(&kUVToRB), // %4 :
"r"(&kUVToG) // %5 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
: "cc", "memory", "q0", "q1", "q2", "q3", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
"q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
); );
} }
#endif // HAS_NV21TORGB565ROW_NEON #endif // HAS_NV21TORGB565ROW_NEON
...@@ -667,14 +624,13 @@ void YUY2ToARGBRow_NEON(const uint8* src_yuy2, ...@@ -667,14 +624,13 @@ void YUY2ToARGBRow_NEON(const uint8* src_yuy2,
int width) { int width) {
asm volatile ( asm volatile (
YUV422TORGB_SETUP_REG YUV422TORGB_SETUP_REG
".p2align 2 \n"
"1: \n" "1: \n"
READYUY2 READYUY2
YUV422TORGB(v22, v21, v20) YUV422TORGB(v22, v21, v20)
"subs %2, %2, #8 \n" "subs %2, %2, #8 \n"
"movi v23.8b, #255 \n" "movi v23.8b, #255 \n"
MEMACCESS(1) MEMACCESS(1)
"st4 {v20.8b-v23.8b}, [%1], #32 \n" "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n"
"b.gt 1b \n" "b.gt 1b \n"
: "+r"(src_yuy2), // %0 : "+r"(src_yuy2), // %0
"+r"(dst_argb), // %1 "+r"(dst_argb), // %1
...@@ -692,14 +648,13 @@ void UYVYToARGBRow_NEON(const uint8* src_uyvy, ...@@ -692,14 +648,13 @@ void UYVYToARGBRow_NEON(const uint8* src_uyvy,
int width) { int width) {
asm volatile ( asm volatile (
YUV422TORGB_SETUP_REG YUV422TORGB_SETUP_REG
".p2align 2 \n"
"1: \n" "1: \n"
READUYVY READUYVY
YUV422TORGB(v22, v21, v20) YUV422TORGB(v22, v21, v20)
"subs %2, %2, #8 \n" "subs %2, %2, #8 \n"
"movi v23.8b, #255 \n" "movi v23.8b, #255 \n"
MEMACCESS(1) MEMACCESS(1)
"st4 {v20.8b-v23.8b}, [%1], 32 \n" "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], 32 \n"
"b.gt 1b \n" "b.gt 1b \n"
: "+r"(src_uyvy), // %0 : "+r"(src_uyvy), // %0
"+r"(dst_argb), // %1 "+r"(dst_argb), // %1
...@@ -716,7 +671,6 @@ void UYVYToARGBRow_NEON(const uint8* src_uyvy, ...@@ -716,7 +671,6 @@ void UYVYToARGBRow_NEON(const uint8* src_uyvy,
void SplitUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v, void SplitUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
int width) { int width) {
asm volatile ( asm volatile (
".p2align 2 \n"
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pairs of UV "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pairs of UV
...@@ -741,7 +695,6 @@ void SplitUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v, ...@@ -741,7 +695,6 @@ void SplitUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
void MergeUVRow_NEON(const uint8* src_u, const uint8* src_v, uint8* dst_uv, void MergeUVRow_NEON(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
int width) { int width) {
asm volatile ( asm volatile (
".p2align 2 \n"
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"ld1 {v0.16b}, [%0], #16 \n" // load U "ld1 {v0.16b}, [%0], #16 \n" // load U
...@@ -766,7 +719,6 @@ void MergeUVRow_NEON(const uint8* src_u, const uint8* src_v, uint8* dst_uv, ...@@ -766,7 +719,6 @@ void MergeUVRow_NEON(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
#ifdef HAS_COPYROW_NEON #ifdef HAS_COPYROW_NEON
void CopyRow_NEON(const uint8* src, uint8* dst, int count) { void CopyRow_NEON(const uint8* src, uint8* dst, int count) {
asm volatile ( asm volatile (
".p2align 2 \n"
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"ld1 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 32 "ld1 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 32
...@@ -820,7 +772,6 @@ void MirrorRow_NEON(const uint8* src, uint8* dst, int width) { ...@@ -820,7 +772,6 @@ void MirrorRow_NEON(const uint8* src, uint8* dst, int width) {
"add %0, %0, %2 \n" "add %0, %0, %2 \n"
"sub %0, %0, #16 \n" "sub %0, %0, #16 \n"
".p2align 2 \n"
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"ld1 {v0.16b}, [%0], %3 \n" // src -= 16 "ld1 {v0.16b}, [%0], %3 \n" // src -= 16
...@@ -848,7 +799,6 @@ void MirrorUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v, ...@@ -848,7 +799,6 @@ void MirrorUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
"add %0, %0, %3, lsl #1 \n" "add %0, %0, %3, lsl #1 \n"
"sub %0, %0, #16 \n" "sub %0, %0, #16 \n"
".p2align 2 \n"
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"ld2 {v0.8b, v1.8b}, [%0], %4 \n" // src -= 16 "ld2 {v0.8b, v1.8b}, [%0], %4 \n" // src -= 16
...@@ -877,7 +827,6 @@ void ARGBMirrorRow_NEON(const uint8* src, uint8* dst, int width) { ...@@ -877,7 +827,6 @@ void ARGBMirrorRow_NEON(const uint8* src, uint8* dst, int width) {
"add %0, %0, %2, lsl #2 \n" "add %0, %0, %2, lsl #2 \n"
"sub %0, %0, #16 \n" "sub %0, %0, #16 \n"
".p2align 2 \n"
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"ld1 {v0.16b}, [%0], %3 \n" // src -= 16 "ld1 {v0.16b}, [%0], %3 \n" // src -= 16
...@@ -901,13 +850,12 @@ void ARGBMirrorRow_NEON(const uint8* src, uint8* dst, int width) { ...@@ -901,13 +850,12 @@ void ARGBMirrorRow_NEON(const uint8* src, uint8* dst, int width) {
void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int pix) { void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int pix) {
asm volatile ( asm volatile (
"movi v4.8b, #255 \n" // Alpha "movi v4.8b, #255 \n" // Alpha
".p2align 2 \n"
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"ld3 {v1.8b,v2.8b,v3.8b}, [%0], #24 \n" // load 8 pixels of RGB24. "ld3 {v1.8b,v2.8b,v3.8b}, [%0], #24 \n" // load 8 pixels of RGB24.
"subs %2, %2, #8 \n" // 8 processed per loop. "subs %2, %2, #8 \n" // 8 processed per loop.
MEMACCESS(1) MEMACCESS(1)
"st4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%1], #32 \n" // store 8 pixels of ARGB. "st4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%1], #32 \n" // store 8 ARGB pixels
"b.gt 1b \n" "b.gt 1b \n"
: "+r"(src_rgb24), // %0 : "+r"(src_rgb24), // %0
"+r"(dst_argb), // %1 "+r"(dst_argb), // %1
...@@ -922,13 +870,12 @@ void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int pix) { ...@@ -922,13 +870,12 @@ void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int pix) {
void RAWToARGBRow_NEON(const uint8* src_raw, uint8* dst_argb, int pix) { void RAWToARGBRow_NEON(const uint8* src_raw, uint8* dst_argb, int pix) {
asm volatile ( asm volatile (
"movi v5.8b, #255 \n" // Alpha "movi v5.8b, #255 \n" // Alpha
".p2align 2 \n"
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // read r g b "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // read r g b
"subs %2, %2, #8 \n" // 8 processed per loop. "subs %2, %2, #8 \n" // 8 processed per loop.
"mov v3.8b, v1.8b \n" // move g "orr v3.8b, v1.8b, v1.8b \n" // move g
"mov v4.8b, v0.8b \n" // move r "orr v4.8b, v0.8b, v0.8b \n" // move r
MEMACCESS(1) MEMACCESS(1)
"st4 {v2.8b,v3.8b,v4.8b,v5.8b}, [%1], #32 \n" // store b g r a "st4 {v2.8b,v3.8b,v4.8b,v5.8b}, [%1], #32 \n" // store b g r a
"b.gt 1b \n" "b.gt 1b \n"
...@@ -958,14 +905,13 @@ void RAWToARGBRow_NEON(const uint8* src_raw, uint8* dst_argb, int pix) { ...@@ -958,14 +905,13 @@ void RAWToARGBRow_NEON(const uint8* src_raw, uint8* dst_argb, int pix) {
void RGB565ToARGBRow_NEON(const uint8* src_rgb565, uint8* dst_argb, int pix) { void RGB565ToARGBRow_NEON(const uint8* src_rgb565, uint8* dst_argb, int pix) {
asm volatile ( asm volatile (
"movi v3.8b, #255 \n" // Alpha "movi v3.8b, #255 \n" // Alpha
".p2align 2 \n"
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels. "ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels.
"subs %2, %2, #8 \n" // 8 processed per loop. "subs %2, %2, #8 \n" // 8 processed per loop.
RGB565TOARGB RGB565TOARGB
MEMACCESS(1) MEMACCESS(1)
"st4 {v0.8b-v3.8b}, [%1], #32 \n" // store 8 pixels of ARGB. "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB pixels
"b.gt 1b \n" "b.gt 1b \n"
: "+r"(src_rgb565), // %0 : "+r"(src_rgb565), // %0
"+r"(dst_argb), // %1 "+r"(dst_argb), // %1
...@@ -1018,14 +964,13 @@ void ARGB1555ToARGBRow_NEON(const uint8* src_argb1555, uint8* dst_argb, ...@@ -1018,14 +964,13 @@ void ARGB1555ToARGBRow_NEON(const uint8* src_argb1555, uint8* dst_argb,
int pix) { int pix) {
asm volatile ( asm volatile (
"movi v3.8b, #255 \n" // Alpha "movi v3.8b, #255 \n" // Alpha
".p2align 2 \n"
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels. "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels.
"subs %2, %2, #8 \n" // 8 processed per loop. "subs %2, %2, #8 \n" // 8 processed per loop.
ARGB1555TOARGB ARGB1555TOARGB
MEMACCESS(1) MEMACCESS(1)
"st4 {v0.8b-v3.8b}, [%1], #32 \n" // store 8 pixels of ARGB. "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB pixels
"b.gt 1b \n" "b.gt 1b \n"
: "+r"(src_argb1555), // %0 : "+r"(src_argb1555), // %0
"+r"(dst_argb), // %1 "+r"(dst_argb), // %1
...@@ -1052,14 +997,13 @@ void ARGB1555ToARGBRow_NEON(const uint8* src_argb1555, uint8* dst_argb, ...@@ -1052,14 +997,13 @@ void ARGB1555ToARGBRow_NEON(const uint8* src_argb1555, uint8* dst_argb,
void ARGB4444ToARGBRow_NEON(const uint8* src_argb4444, uint8* dst_argb, void ARGB4444ToARGBRow_NEON(const uint8* src_argb4444, uint8* dst_argb,
int pix) { int pix) {
asm volatile ( asm volatile (
".p2align 2 \n"
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels. "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels.
"subs %2, %2, #8 \n" // 8 processed per loop. "subs %2, %2, #8 \n" // 8 processed per loop.
ARGB4444TOARGB ARGB4444TOARGB
MEMACCESS(1) MEMACCESS(1)
"st4 {v0.8b-v3.8b}, [%1], #32 \n" // store 8 pixels of ARGB. "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB pixels
"b.gt 1b \n" "b.gt 1b \n"
: "+r"(src_argb4444), // %0 : "+r"(src_argb4444), // %0
"+r"(dst_argb), // %1 "+r"(dst_argb), // %1
...@@ -1073,10 +1017,9 @@ void ARGB4444ToARGBRow_NEON(const uint8* src_argb4444, uint8* dst_argb, ...@@ -1073,10 +1017,9 @@ void ARGB4444ToARGBRow_NEON(const uint8* src_argb4444, uint8* dst_argb,
#ifdef HAS_ARGBTORGB24ROW_NEON #ifdef HAS_ARGBTORGB24ROW_NEON
void ARGBToRGB24Row_NEON(const uint8* src_argb, uint8* dst_rgb24, int pix) { void ARGBToRGB24Row_NEON(const uint8* src_argb, uint8* dst_rgb24, int pix) {
asm volatile ( asm volatile (
".p2align 2 \n"
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"ld4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n" // load 8 pixels of ARGB. "ld4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n" // load 8 ARGB pixels
"subs %2, %2, #8 \n" // 8 processed per loop. "subs %2, %2, #8 \n" // 8 processed per loop.
MEMACCESS(1) MEMACCESS(1)
"st3 {v1.8b,v2.8b,v3.8b}, [%1], #24 \n" // store 8 pixels of RGB24. "st3 {v1.8b,v2.8b,v3.8b}, [%1], #24 \n" // store 8 pixels of RGB24.
...@@ -1093,13 +1036,12 @@ void ARGBToRGB24Row_NEON(const uint8* src_argb, uint8* dst_rgb24, int pix) { ...@@ -1093,13 +1036,12 @@ void ARGBToRGB24Row_NEON(const uint8* src_argb, uint8* dst_rgb24, int pix) {
#ifdef HAS_ARGBTORAWROW_NEON #ifdef HAS_ARGBTORAWROW_NEON
void ARGBToRAWRow_NEON(const uint8* src_argb, uint8* dst_raw, int pix) { void ARGBToRAWRow_NEON(const uint8* src_argb, uint8* dst_raw, int pix) {
asm volatile ( asm volatile (
".p2align 2 \n"
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"ld4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n" // load b g r a "ld4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n" // load b g r a
"subs %2, %2, #8 \n" // 8 processed per loop. "subs %2, %2, #8 \n" // 8 processed per loop.
"mov v4.8b, v2.8b \n" // mov g "orr v4.8b, v2.8b, v2.8b \n" // mov g
"mov v5.8b, v1.8b \n" // mov b "orr v5.8b, v1.8b, v1.8b \n" // mov b
MEMACCESS(1) MEMACCESS(1)
"st3 {v3.8b,v4.8b,v5.8b}, [%1], #24 \n" // store r g b "st3 {v3.8b,v4.8b,v5.8b}, [%1], #24 \n" // store r g b
"b.gt 1b \n" "b.gt 1b \n"
...@@ -1115,7 +1057,6 @@ void ARGBToRAWRow_NEON(const uint8* src_argb, uint8* dst_raw, int pix) { ...@@ -1115,7 +1057,6 @@ void ARGBToRAWRow_NEON(const uint8* src_argb, uint8* dst_raw, int pix) {
#ifdef HAS_YUY2TOYROW_NEON #ifdef HAS_YUY2TOYROW_NEON
void YUY2ToYRow_NEON(const uint8* src_yuy2, uint8* dst_y, int pix) { void YUY2ToYRow_NEON(const uint8* src_yuy2, uint8* dst_y, int pix) {
asm volatile ( asm volatile (
".p2align 2 \n"
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pixels of YUY2. "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pixels of YUY2.
...@@ -1135,7 +1076,6 @@ void YUY2ToYRow_NEON(const uint8* src_yuy2, uint8* dst_y, int pix) { ...@@ -1135,7 +1076,6 @@ void YUY2ToYRow_NEON(const uint8* src_yuy2, uint8* dst_y, int pix) {
#ifdef HAS_UYVYTOYROW_NEON #ifdef HAS_UYVYTOYROW_NEON
void UYVYToYRow_NEON(const uint8* src_uyvy, uint8* dst_y, int pix) { void UYVYToYRow_NEON(const uint8* src_uyvy, uint8* dst_y, int pix) {
asm volatile ( asm volatile (
".p2align 2 \n"
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pixels of UYVY. "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pixels of UYVY.
...@@ -1156,10 +1096,9 @@ void UYVYToYRow_NEON(const uint8* src_uyvy, uint8* dst_y, int pix) { ...@@ -1156,10 +1096,9 @@ void UYVYToYRow_NEON(const uint8* src_uyvy, uint8* dst_y, int pix) {
void YUY2ToUV422Row_NEON(const uint8* src_yuy2, uint8* dst_u, uint8* dst_v, void YUY2ToUV422Row_NEON(const uint8* src_yuy2, uint8* dst_u, uint8* dst_v,
int pix) { int pix) {
asm volatile ( asm volatile (
".p2align 2 \n"
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 pixels of YUY2. "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 YUY2 pixels
"subs %3, %3, #16 \n" // 16 pixels = 8 UVs. "subs %3, %3, #16 \n" // 16 pixels = 8 UVs.
MEMACCESS(1) MEMACCESS(1)
"st1 {v1.8b}, [%1], #8 \n" // store 8 U. "st1 {v1.8b}, [%1], #8 \n" // store 8 U.
...@@ -1180,10 +1119,9 @@ void YUY2ToUV422Row_NEON(const uint8* src_yuy2, uint8* dst_u, uint8* dst_v, ...@@ -1180,10 +1119,9 @@ void YUY2ToUV422Row_NEON(const uint8* src_yuy2, uint8* dst_u, uint8* dst_v,
void UYVYToUV422Row_NEON(const uint8* src_uyvy, uint8* dst_u, uint8* dst_v, void UYVYToUV422Row_NEON(const uint8* src_uyvy, uint8* dst_u, uint8* dst_v,
int pix) { int pix) {
asm volatile ( asm volatile (
".p2align 2 \n"
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 pixels of UYVY. "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 UYVY pixels
"subs %3, %3, #16 \n" // 16 pixels = 8 UVs. "subs %3, %3, #16 \n" // 16 pixels = 8 UVs.
MEMACCESS(1) MEMACCESS(1)
"st1 {v0.8b}, [%1], #8 \n" // store 8 U. "st1 {v0.8b}, [%1], #8 \n" // store 8 U.
...@@ -1205,13 +1143,12 @@ void YUY2ToUVRow_NEON(const uint8* src_yuy2, int stride_yuy2, ...@@ -1205,13 +1143,12 @@ void YUY2ToUVRow_NEON(const uint8* src_yuy2, int stride_yuy2,
uint8* dst_u, uint8* dst_v, int pix) { uint8* dst_u, uint8* dst_v, int pix) {
const uint8* src_yuy2b = src_yuy2 + stride_yuy2; const uint8* src_yuy2b = src_yuy2 + stride_yuy2;
asm volatile ( asm volatile (
".p2align 2 \n"
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 pixels of YUY2. "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 pixels
"subs %4, %4, #16 \n" // 16 pixels = 8 UVs. "subs %4, %4, #16 \n" // 16 pixels = 8 UVs.
MEMACCESS(1) MEMACCESS(1)
"ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load next row YUY2. "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load next row
"urhadd v1.8b, v1.8b, v5.8b \n" // average rows of U "urhadd v1.8b, v1.8b, v5.8b \n" // average rows of U
"urhadd v3.8b, v3.8b, v7.8b \n" // average rows of V "urhadd v3.8b, v3.8b, v7.8b \n" // average rows of V
MEMACCESS(2) MEMACCESS(2)
...@@ -1225,7 +1162,8 @@ void YUY2ToUVRow_NEON(const uint8* src_yuy2, int stride_yuy2, ...@@ -1225,7 +1162,8 @@ void YUY2ToUVRow_NEON(const uint8* src_yuy2, int stride_yuy2,
"+r"(dst_v), // %3 "+r"(dst_v), // %3
"+r"(pix) // %4 "+r"(pix) // %4
: :
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7" // Clobber List : "cc", "memory", "v0", "v1", "v2", "v3", "v4",
"v5", "v6", "v7" // Clobber List
); );
} }
#endif // HAS_YUY2TOUVROW_NEON #endif // HAS_YUY2TOUVROW_NEON
...@@ -1235,13 +1173,12 @@ void UYVYToUVRow_NEON(const uint8* src_uyvy, int stride_uyvy, ...@@ -1235,13 +1173,12 @@ void UYVYToUVRow_NEON(const uint8* src_uyvy, int stride_uyvy,
uint8* dst_u, uint8* dst_v, int pix) { uint8* dst_u, uint8* dst_v, int pix) {
const uint8* src_uyvyb = src_uyvy + stride_uyvy; const uint8* src_uyvyb = src_uyvy + stride_uyvy;
asm volatile ( asm volatile (
".p2align 2 \n"
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 pixels of UYVY. "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 pixels
"subs %4, %4, #16 \n" // 16 pixels = 8 UVs. "subs %4, %4, #16 \n" // 16 pixels = 8 UVs.
MEMACCESS(1) MEMACCESS(1)
"ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load next row UYVY. "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load next row
"urhadd v0.8b, v0.8b, v4.8b \n" // average rows of U "urhadd v0.8b, v0.8b, v4.8b \n" // average rows of U
"urhadd v2.8b, v2.8b, v6.8b \n" // average rows of V "urhadd v2.8b, v2.8b, v6.8b \n" // average rows of V
MEMACCESS(2) MEMACCESS(2)
...@@ -1255,7 +1192,8 @@ void UYVYToUVRow_NEON(const uint8* src_uyvy, int stride_uyvy, ...@@ -1255,7 +1192,8 @@ void UYVYToUVRow_NEON(const uint8* src_uyvy, int stride_uyvy,
"+r"(dst_v), // %3 "+r"(dst_v), // %3
"+r"(pix) // %4 "+r"(pix) // %4
: :
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7" // Clobber List : "cc", "memory", "v0", "v1", "v2", "v3", "v4",
"v5", "v6", "v7" // Clobber List
); );
} }
#endif // HAS_UYVYTOUVROW_NEON #endif // HAS_UYVYTOUVROW_NEON
...@@ -1292,7 +1230,7 @@ void ARGBToBayerGGRow_NEON(const uint8* src_argb, uint8* dst_bayer, ...@@ -1292,7 +1230,7 @@ void ARGBToBayerGGRow_NEON(const uint8* src_argb, uint8* dst_bayer,
asm volatile ( asm volatile (
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load row 8 pixels. "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load row 8 pixels
"subs %2, %2, #8 \n" // 8 processed per loop "subs %2, %2, #8 \n" // 8 processed per loop
MEMACCESS(1) MEMACCESS(1)
"st1 {v1.8b}, [%1], #8 \n" // store 8 G's. "st1 {v1.8b}, [%1], #8 \n" // store 8 G's.
...@@ -1336,18 +1274,17 @@ void I422ToYUY2Row_NEON(const uint8* src_y, ...@@ -1336,18 +1274,17 @@ void I422ToYUY2Row_NEON(const uint8* src_y,
const uint8* src_v, const uint8* src_v,
uint8* dst_yuy2, int width) { uint8* dst_yuy2, int width) {
asm volatile ( asm volatile (
".p2align 2 \n"
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"ld2 {v0.8b, v1.8b}, [%0], #16 \n" // load 16 Ys "ld2 {v0.8b, v1.8b}, [%0], #16 \n" // load 16 Ys
"mov v2.8b, v1.8b \n" "orr v2.8b, v1.8b, v1.8b \n"
MEMACCESS(1) MEMACCESS(1)
"ld1 {v1.8b}, [%1], #8 \n" // load 8 Us "ld1 {v1.8b}, [%1], #8 \n" // load 8 Us
MEMACCESS(2) MEMACCESS(2)
"ld1 {v3.8b}, [%2], #8 \n" // load 8 Vs "ld1 {v3.8b}, [%2], #8 \n" // load 8 Vs
"subs %4, %4, #16 \n" // 16 pixels "subs %4, %4, #16 \n" // 16 pixels
MEMACCESS(3) MEMACCESS(3)
"st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%3], #32 \n" // Store 8 YUY2/16 pixels. "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%3], #32 \n" // Store 16 pixels.
"b.gt 1b \n" "b.gt 1b \n"
: "+r"(src_y), // %0 : "+r"(src_y), // %0
"+r"(src_u), // %1 "+r"(src_u), // %1
...@@ -1366,18 +1303,17 @@ void I422ToUYVYRow_NEON(const uint8* src_y, ...@@ -1366,18 +1303,17 @@ void I422ToUYVYRow_NEON(const uint8* src_y,
const uint8* src_v, const uint8* src_v,
uint8* dst_uyvy, int width) { uint8* dst_uyvy, int width) {
asm volatile ( asm volatile (
".p2align 2 \n"
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"ld2 {v1.8b,v2.8b}, [%0], #16 \n" // load 16 Ys "ld2 {v1.8b,v2.8b}, [%0], #16 \n" // load 16 Ys
"mov v3.8b, v2.8b \n" "orr v3.8b, v2.8b, v2.8b \n"
MEMACCESS(1) MEMACCESS(1)
"ld1 {v0.8b}, [%1], #8 \n" // load 8 Us "ld1 {v0.8b}, [%1], #8 \n" // load 8 Us
MEMACCESS(2) MEMACCESS(2)
"ld1 {v2.8b}, [%2], #8 \n" // load 8 Vs "ld1 {v2.8b}, [%2], #8 \n" // load 8 Vs
"subs %4, %4, #16 \n" // 16 pixels "subs %4, %4, #16 \n" // 16 pixels
MEMACCESS(3) MEMACCESS(3)
"st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%3], #32 \n" // Store 8 UYVY/16 pixels. "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%3], #32 \n" // Store 16 pixels.
"b.gt 1b \n" "b.gt 1b \n"
: "+r"(src_y), // %0 : "+r"(src_y), // %0
"+r"(src_u), // %1 "+r"(src_u), // %1
...@@ -1393,10 +1329,9 @@ void I422ToUYVYRow_NEON(const uint8* src_y, ...@@ -1393,10 +1329,9 @@ void I422ToUYVYRow_NEON(const uint8* src_y,
#ifdef HAS_ARGBTORGB565ROW_NEON #ifdef HAS_ARGBTORGB565ROW_NEON
void ARGBToRGB565Row_NEON(const uint8* src_argb, uint8* dst_rgb565, int pix) { void ARGBToRGB565Row_NEON(const uint8* src_argb, uint8* dst_rgb565, int pix) {
asm volatile ( asm volatile (
".p2align 2 \n"
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"ld4 {v20.8b-v23.8b}, [%0], #32 \n" // load 8 pixels of ARGB. "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8 pixels
"subs %2, %2, #8 \n" // 8 processed per loop. "subs %2, %2, #8 \n" // 8 processed per loop.
ARGBTORGB565 ARGBTORGB565
MEMACCESS(1) MEMACCESS(1)
...@@ -1415,10 +1350,9 @@ void ARGBToRGB565Row_NEON(const uint8* src_argb, uint8* dst_rgb565, int pix) { ...@@ -1415,10 +1350,9 @@ void ARGBToRGB565Row_NEON(const uint8* src_argb, uint8* dst_rgb565, int pix) {
void ARGBToARGB1555Row_NEON(const uint8* src_argb, uint8* dst_argb1555, void ARGBToARGB1555Row_NEON(const uint8* src_argb, uint8* dst_argb1555,
int pix) { int pix) {
asm volatile ( asm volatile (
".p2align 2 \n"
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"ld4 {v20.8b-v23.8b}, [%0], #32 \n" // load 8 pixels of ARGB. "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8 pixels
"subs %2, %2, #8 \n" // 8 processed per loop. "subs %2, %2, #8 \n" // 8 processed per loop.
ARGBTOARGB1555 ARGBTOARGB1555
MEMACCESS(1) MEMACCESS(1)
...@@ -1438,10 +1372,9 @@ void ARGBToARGB4444Row_NEON(const uint8* src_argb, uint8* dst_argb4444, ...@@ -1438,10 +1372,9 @@ void ARGBToARGB4444Row_NEON(const uint8* src_argb, uint8* dst_argb4444,
int pix) { int pix) {
asm volatile ( asm volatile (
"movi v4.16b, #0x0f \n" // bits to clear with vbic. "movi v4.16b, #0x0f \n" // bits to clear with vbic.
".p2align 2 \n"
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"ld4 {v20.8b-v23.8b}, [%0], #32 \n" // load 8 pixels of ARGB. "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8 pixels
"subs %2, %2, #8 \n" // 8 processed per loop. "subs %2, %2, #8 \n" // 8 processed per loop.
ARGBTOARGB4444 ARGBTOARGB4444
MEMACCESS(1) MEMACCESS(1)
...@@ -1463,7 +1396,6 @@ void ARGBToYRow_NEON(const uint8* src_argb, uint8* dst_y, int pix) { ...@@ -1463,7 +1396,6 @@ void ARGBToYRow_NEON(const uint8* src_argb, uint8* dst_y, int pix) {
"movi v5.8b, #65 \n" // G * 0.5078 coefficient "movi v5.8b, #65 \n" // G * 0.5078 coefficient
"movi v6.8b, #33 \n" // R * 0.2578 coefficient "movi v6.8b, #33 \n" // R * 0.2578 coefficient
"movi v7.8b, #16 \n" // Add 16 constant "movi v7.8b, #16 \n" // Add 16 constant
".p2align 2 \n"
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels. "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels.
...@@ -1491,7 +1423,6 @@ void ARGBToYJRow_NEON(const uint8* src_argb, uint8* dst_y, int pix) { ...@@ -1491,7 +1423,6 @@ void ARGBToYJRow_NEON(const uint8* src_argb, uint8* dst_y, int pix) {
"movi v4.8b, #15 \n" // B * 0.11400 coefficient "movi v4.8b, #15 \n" // B * 0.11400 coefficient
"movi v5.8b, #75 \n" // G * 0.58700 coefficient "movi v5.8b, #75 \n" // G * 0.58700 coefficient
"movi v6.8b, #38 \n" // R * 0.29900 coefficient "movi v6.8b, #38 \n" // R * 0.29900 coefficient
".p2align 2 \n"
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels. "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels.
...@@ -1523,7 +1454,6 @@ void ARGBToUV444Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v, ...@@ -1523,7 +1454,6 @@ void ARGBToUV444Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
"movi v27.8b, #18 \n" // VB -0.1406 coefficient "movi v27.8b, #18 \n" // VB -0.1406 coefficient
"movi v28.8b, #94 \n" // VG -0.7344 coefficient "movi v28.8b, #94 \n" // VG -0.7344 coefficient
"movi v29.16b,#0x80 \n" // 128.5 "movi v29.16b,#0x80 \n" // 128.5
".p2align 2 \n"
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels. "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels.
...@@ -1562,16 +1492,10 @@ void ARGBToUV444Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v, ...@@ -1562,16 +1492,10 @@ void ARGBToUV444Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
void ARGBToUV422Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v, void ARGBToUV422Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
int pix) { int pix) {
asm volatile ( asm volatile (
"movi v20.8h, #56 \n" // UB / VR 0.875 / 2 coefficient RGBTOUV_SETUP_REG
"movi v21.8h, #37 \n" // UG -0.5781 / 2 coefficient
"movi v22.8h, #19 \n" // UR -0.2969 / 2 coefficient
"movi v23.8h, #9 \n" // VB -0.1406 / 2 coefficient
"movi v24.8h, #47 \n" // VG -0.7344 / 2 coefficient
"movi v25.16b, #0x80 \n" // 128.5
".p2align 2 \n"
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 ARGB pixels. "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels.
"uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts. "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts.
"uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts. "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
...@@ -1612,21 +1536,15 @@ void ARGBToUV422Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v, ...@@ -1612,21 +1536,15 @@ void ARGBToUV422Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
void ARGBToUV411Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v, void ARGBToUV411Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
int pix) { int pix) {
asm volatile ( asm volatile (
"movi v20.8h, #56 \n" // UB / VR 0.875 / 2 coefficient RGBTOUV_SETUP_REG
"movi v21.8h, #37 \n" // UG -0.5781 / 2 coefficient
"movi v22.8h, #19 \n" // UR -0.2969 / 2 coefficient
"movi v23.8h, #9 \n" // VB -0.1406 / 2 coefficient
"movi v24.8h, #47 \n" // VG -0.7344 / 2 coefficient
"movi v25.16b, #0x80 \n" // 128.5
".p2align 2 \n"
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 ARGB pixels. "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels.
"uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts. "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts.
"uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts. "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
"uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts. "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts.
MEMACCESS(0) MEMACCESS(0)
"ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%0], #64 \n" // load next 16 ARGB pixels. "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%0], #64 \n" // load next 16.
"uaddlp v4.8h, v4.16b \n" // B 16 bytes -> 8 shorts. "uaddlp v4.8h, v4.16b \n" // B 16 bytes -> 8 shorts.
"uaddlp v5.8h, v5.16b \n" // G 16 bytes -> 8 shorts. "uaddlp v5.8h, v5.16b \n" // G 16 bytes -> 8 shorts.
"uaddlp v6.8h, v6.16b \n" // R 16 bytes -> 8 shorts. "uaddlp v6.8h, v6.16b \n" // R 16 bytes -> 8 shorts.
@@ -1668,16 +1586,16 @@ void ARGBToUV411Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
 // 16x2 pixels -> 8x1.  pix is number of argb pixels. e.g. 16.
 #define RGBTOUV(QB, QG, QR) \
-    "vmul.s16   q8, " #QB ", q10               \n"  /* B */ \
-    "vmls.s16   q8, " #QG ", q11               \n"  /* G */ \
-    "vmls.s16   q8, " #QR ", q12               \n"  /* R */ \
-    "vadd.u16   q8, q8, q15                    \n"  /* +128 -> unsigned */ \
-    "vmul.s16   q9, " #QR ", q10               \n"  /* R */ \
-    "vmls.s16   q9, " #QG ", q14               \n"  /* G */ \
-    "vmls.s16   q9, " #QB ", q13               \n"  /* B */ \
-    "vadd.u16   q9, q9, q15                    \n"  /* +128 -> unsigned */ \
-    "vqshrn.u16 d0, q8, #8                     \n"  /* 16 bit to 8 bit U */ \
-    "vqshrn.u16 d1, q9, #8                     \n"  /* 16 bit to 8 bit V */
+    "mul        v3.8h, " #QB ", v20.8h         \n"  /* B */ \
+    "mul        v4.8h, " #QR ", v20.8h         \n"  /* R */ \
+    "mls        v3.8h, " #QG ", v21.8h         \n"  /* G */ \
+    "mls        v4.8h, " #QG ", v24.8h         \n"  /* G */ \
+    "mls        v3.8h, " #QR ", v22.8h         \n"  /* R */ \
+    "mls        v4.8h, " #QB ", v23.8h         \n"  /* B */ \
+    "add        v3.8h, v3.8h, v25.8h           \n"  /* +128 -> unsigned */ \
+    "add        v4.8h, v4.8h, v25.8h           \n"  /* +128 -> unsigned */ \
+    "uqshrn     v0.8b, v3.8h, #8               \n"  /* 16 bit to 8 bit U */ \
+    "uqshrn     v1.8b, v4.8h, #8               \n"  /* 16 bit to 8 bit V */
 // TODO(fbarchard): Consider vhadd vertical, then vpaddl horizontal, avoid shr.
 // TODO(fbarchard): consider ptrdiff_t for all strides.
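With the RGBTOUV_SETUP_REG constants, the rewritten macro computes U = (56*B - 37*G - 19*R + 0x8080) >> 8 and V = (56*R - 47*G - 9*B + 0x8080) >> 8, where B/G/R arrive as twice the average of a 2x2 pixel block (uaddlp + uadalp + urshr #1 in the callers); the halved coefficients compensate for that factor of two. A scalar sketch of the same arithmetic (illustrative helper, not part of libyuv):

static void RgbToUvRef(int b2, int g2, int r2, uint8* u, uint8* v) {
  // b2/g2/r2 are 2x the average B/G/R over a 2x2 block.
  int uu = (56 * b2 - 37 * g2 - 19 * r2 + 0x8080) >> 8;
  int vv = (56 * r2 - 47 * g2 - 9 * b2 + 0x8080) >> 8;
  *u = (uint8)(uu < 0 ? 0 : (uu > 255 ? 255 : uu));  // uqshrn #8 saturates the same way
  *v = (uint8)(vv < 0 ? 0 : (vv > 255 ? 255 : vv));
}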
@@ -1685,50 +1603,41 @@ void ARGBToUV411Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
 #ifdef HAS_ARGBTOUVROW_NEON
 void ARGBToUVRow_NEON(const uint8* src_argb, int src_stride_argb,
                       uint8* dst_u, uint8* dst_v, int pix) {
+  const uint8* src_argb_1 = src_argb + src_stride_argb;
   asm volatile (
-    "add        %1, %0, %1                     \n"  // src_stride + src_argb
-    "vmov.s16   q10, #56                       \n"  // UB / VR 0.875 coefficient
-    "vmov.s16   q11, #37                       \n"  // UG -0.5781 coefficient
-    "vmov.s16   q12, #19                       \n"  // UR -0.2969 coefficient
-    "vmov.s16   q13, #9                        \n"  // VB -0.1406 coefficient
-    "vmov.s16   q14, #47                       \n"  // VG -0.7344 coefficient
-    "vmov.u16   q15, #0x8080                   \n"  // 128.5
-    ".p2align   2                              \n"
+    RGBTOUV_SETUP_REG
     "1:                                        \n"
     MEMACCESS(0)
-    "vld4.8     {d0, d2, d4, d6}, [%0]!        \n"  // load 8 ARGB pixels.
-    MEMACCESS(0)
-    "vld4.8     {d1, d3, d5, d7}, [%0]!        \n"  // load next 8 ARGB pixels.
-    "vpaddl.u8  q0, q0                         \n"  // B 16 bytes -> 8 shorts.
-    "vpaddl.u8  q1, q1                         \n"  // G 16 bytes -> 8 shorts.
-    "vpaddl.u8  q2, q2                         \n"  // R 16 bytes -> 8 shorts.
-    MEMACCESS(1)
-    "vld4.8     {d8, d10, d12, d14}, [%1]!     \n"  // load 8 more ARGB pixels.
+    "ld4        {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n"  // load 16 pixels.
+    "uaddlp     v0.8h, v0.16b                  \n"  // B 16 bytes -> 8 shorts.
+    "uaddlp     v1.8h, v1.16b                  \n"  // G 16 bytes -> 8 shorts.
+    "uaddlp     v2.8h, v2.16b                  \n"  // R 16 bytes -> 8 shorts.
     MEMACCESS(1)
-    "vld4.8     {d9, d11, d13, d15}, [%1]!     \n"  // load last 8 ARGB pixels.
-    "vpadal.u8  q0, q4                         \n"  // B 16 bytes -> 8 shorts.
-    "vpadal.u8  q1, q5                         \n"  // G 16 bytes -> 8 shorts.
-    "vpadal.u8  q2, q6                         \n"  // R 16 bytes -> 8 shorts.
-    "vrshr.u16  q0, q0, #1                     \n"  // 2x average
-    "vrshr.u16  q1, q1, #1                     \n"
-    "vrshr.u16  q2, q2, #1                     \n"
+    "ld4        {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n"  // load next 16
+    "uadalp     v0.8h, v4.16b                  \n"  // B 16 bytes -> 8 shorts.
+    "uadalp     v1.8h, v5.16b                  \n"  // G 16 bytes -> 8 shorts.
+    "uadalp     v2.8h, v6.16b                  \n"  // R 16 bytes -> 8 shorts.
+    "urshr      v0.8h, v0.8h, #1               \n"  // 2x average
+    "urshr      v1.8h, v1.8h, #1               \n"
+    "urshr      v2.8h, v2.8h, #1               \n"
     "subs       %4, %4, #16                    \n"  // 32 processed per loop.
-    RGBTOUV(q0, q1, q2)
+    RGBTOUV(v0.8h, v1.8h, v2.8h)
     MEMACCESS(2)
-    "vst1.8     {d0}, [%2]!                    \n"  // store 8 pixels U.
+    "st1        {v0.8b}, [%2], #8              \n"  // store 8 pixels U.
     MEMACCESS(3)
-    "vst1.8     {d1}, [%3]!                    \n"  // store 8 pixels V.
+    "st1        {v1.8b}, [%3], #8              \n"  // store 8 pixels V.
     "b.gt       1b                             \n"
   : "+r"(src_argb),  // %0
-    "+r"(src_stride_argb),  // %1
+    "+r"(src_argb_1),  // %1
    "+r"(dst_u),  // %2
    "+r"(dst_v),  // %3
    "+r"(pix)  // %4
  :
-  : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
-    "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
+    "v20", "v21", "v22", "v23", "v24", "v25"
  );
 }
 #endif  // HAS_ARGBTOUVROW_NEON
...@@ -1737,50 +1646,45 @@ void ARGBToUVRow_NEON(const uint8* src_argb, int src_stride_argb, ...@@ -1737,50 +1646,45 @@ void ARGBToUVRow_NEON(const uint8* src_argb, int src_stride_argb,
#ifdef HAS_ARGBTOUVJROW_NEON #ifdef HAS_ARGBTOUVJROW_NEON
void ARGBToUVJRow_NEON(const uint8* src_argb, int src_stride_argb, void ARGBToUVJRow_NEON(const uint8* src_argb, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int pix) { uint8* dst_u, uint8* dst_v, int pix) {
const uint8* src_argb_1 = src_argb + src_stride_argb;
asm volatile ( asm volatile (
"add %1, %0, %1 \n" // src_stride + src_argb "movi v20.8h, #63, lsl #0 \n" // UB/VR coeff (0.500) / 2
"vmov.s16 q10, #127 / 2 \n" // UB / VR 0.500 coefficient "movi v21.8h, #42, lsl #0 \n" // UG coeff (-0.33126) / 2
"vmov.s16 q11, #84 / 2 \n" // UG -0.33126 coefficient "movi v22.8h, #21, lsl #0 \n" // UR coeff (-0.16874) / 2
"vmov.s16 q12, #43 / 2 \n" // UR -0.16874 coefficient "movi v23.8h, #10, lsl #0 \n" // VB coeff (-0.08131) / 2
"vmov.s16 q13, #20 / 2 \n" // VB -0.08131 coefficient "movi v24.8h, #53, lsl #0 \n" // VG coeff (-0.41869) / 2
"vmov.s16 q14, #107 / 2 \n" // VG -0.41869 coefficient "movi v25.16b, #0x80 \n" // 128.5 (0x8080 in 16-bit)
"vmov.u16 q15, #0x8080 \n" // 128.5
".p2align 2 \n"
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels.
MEMACCESS(0) "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts.
"vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels. "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
"vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts. "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts.
"vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
"vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts.
MEMACCESS(1)
"vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more ARGB pixels.
MEMACCESS(1) MEMACCESS(1)
"vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 ARGB pixels. "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load next 16
"vpadal.u8 q0, q4 \n" // B 16 bytes -> 8 shorts. "uadalp v0.8h, v4.16b \n" // B 16 bytes -> 8 shorts.
"vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts. "uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts.
"vpadal.u8 q2, q6 \n" // R 16 bytes -> 8 shorts. "uadalp v2.8h, v6.16b \n" // R 16 bytes -> 8 shorts.
"vrshr.u16 q0, q0, #1 \n" // 2x average "urshr v0.8h, v0.8h, #1 \n" // 2x average
"vrshr.u16 q1, q1, #1 \n" "urshr v1.8h, v1.8h, #1 \n"
"vrshr.u16 q2, q2, #1 \n" "urshr v2.8h, v2.8h, #1 \n"
"subs %4, %4, #16 \n" // 32 processed per loop. "subs %4, %4, #16 \n" // 32 processed per loop.
RGBTOUV(q0, q1, q2) RGBTOUV(v0.8h, v1.8h, v2.8h)
MEMACCESS(2) MEMACCESS(2)
"vst1.8 {d0}, [%2]! \n" // store 8 pixels U. "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
MEMACCESS(3) MEMACCESS(3)
"vst1.8 {d1}, [%3]! \n" // store 8 pixels V. "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
"b.gt 1b \n" "b.gt 1b \n"
: "+r"(src_argb), // %0 : "+r"(src_argb), // %0
"+r"(src_stride_argb), // %1 "+r"(src_argb_1), // %1
"+r"(dst_u), // %2 "+r"(dst_u), // %2
"+r"(dst_v), // %3 "+r"(dst_v), // %3
"+r"(pix) // %4 "+r"(pix) // %4
: :
: "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
"q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" "v20", "v21", "v22", "v23", "v24", "v25"
); );
} }
#endif // HAS_ARGBTOUVJROW_NEON #endif // HAS_ARGBTOUVJROW_NEON
...@@ -1788,50 +1692,40 @@ void ARGBToUVJRow_NEON(const uint8* src_argb, int src_stride_argb, ...@@ -1788,50 +1692,40 @@ void ARGBToUVJRow_NEON(const uint8* src_argb, int src_stride_argb,
#ifdef HAS_BGRATOUVROW_NEON #ifdef HAS_BGRATOUVROW_NEON
void BGRAToUVRow_NEON(const uint8* src_bgra, int src_stride_bgra, void BGRAToUVRow_NEON(const uint8* src_bgra, int src_stride_bgra,
uint8* dst_u, uint8* dst_v, int pix) { uint8* dst_u, uint8* dst_v, int pix) {
const uint8* src_bgra_1 = src_bgra + src_stride_bgra;
asm volatile ( asm volatile (
"add %1, %0, %1 \n" // src_stride + src_bgra RGBTOUV_SETUP_REG
"vmov.s16 q10, #56 \n" // UB / VR 0.875 coefficient
"vmov.s16 q11, #37 \n" // UG -0.5781 coefficient
"vmov.s16 q12, #19 \n" // UR -0.2969 coefficient
"vmov.s16 q13, #9 \n" // VB -0.1406 coefficient
"vmov.s16 q14, #47 \n" // VG -0.7344 coefficient
"vmov.u16 q15, #0x8080 \n" // 128.5
".p2align 2 \n"
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 BGRA pixels. "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels.
MEMACCESS(0) "uaddlp v0.8h, v3.16b \n" // B 16 bytes -> 8 shorts.
"vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 BGRA pixels. "uaddlp v3.8h, v2.16b \n" // G 16 bytes -> 8 shorts.
"vpaddl.u8 q3, q3 \n" // B 16 bytes -> 8 shorts. "uaddlp v2.8h, v1.16b \n" // R 16 bytes -> 8 shorts.
"vpaddl.u8 q2, q2 \n" // G 16 bytes -> 8 shorts.
"vpaddl.u8 q1, q1 \n" // R 16 bytes -> 8 shorts.
MEMACCESS(1)
"vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more BGRA pixels.
MEMACCESS(1) MEMACCESS(1)
"vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 BGRA pixels. "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load 16 more
"vpadal.u8 q3, q7 \n" // B 16 bytes -> 8 shorts. "uadalp v0.8h, v7.16b \n" // B 16 bytes -> 8 shorts.
"vpadal.u8 q2, q6 \n" // G 16 bytes -> 8 shorts. "uadalp v3.8h, v6.16b \n" // G 16 bytes -> 8 shorts.
"vpadal.u8 q1, q5 \n" // R 16 bytes -> 8 shorts. "uadalp v2.8h, v5.16b \n" // R 16 bytes -> 8 shorts.
"vrshr.u16 q1, q1, #1 \n" // 2x average "urshr v0.8h, v0.8h, #1 \n" // 2x average
"vrshr.u16 q2, q2, #1 \n" "urshr v1.8h, v3.8h, #1 \n"
"vrshr.u16 q3, q3, #1 \n" "urshr v2.8h, v2.8h, #1 \n"
"subs %4, %4, #16 \n" // 32 processed per loop. "subs %4, %4, #16 \n" // 32 processed per loop.
RGBTOUV(q3, q2, q1) RGBTOUV(v0.8h, v1.8h, v2.8h)
MEMACCESS(2) MEMACCESS(2)
"vst1.8 {d0}, [%2]! \n" // store 8 pixels U. "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
MEMACCESS(3) MEMACCESS(3)
"vst1.8 {d1}, [%3]! \n" // store 8 pixels V. "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
"b.gt 1b \n" "b.gt 1b \n"
: "+r"(src_bgra), // %0 : "+r"(src_bgra), // %0
"+r"(src_stride_bgra), // %1 "+r"(src_bgra_1), // %1
"+r"(dst_u), // %2 "+r"(dst_u), // %2
"+r"(dst_v), // %3 "+r"(dst_v), // %3
"+r"(pix) // %4 "+r"(pix) // %4
: :
: "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
"q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" "v20", "v21", "v22", "v23", "v24", "v25"
); );
} }
#endif // HAS_BGRATOUVROW_NEON #endif // HAS_BGRATOUVROW_NEON
...@@ -1839,50 +1733,40 @@ void BGRAToUVRow_NEON(const uint8* src_bgra, int src_stride_bgra, ...@@ -1839,50 +1733,40 @@ void BGRAToUVRow_NEON(const uint8* src_bgra, int src_stride_bgra,
#ifdef HAS_ABGRTOUVROW_NEON #ifdef HAS_ABGRTOUVROW_NEON
void ABGRToUVRow_NEON(const uint8* src_abgr, int src_stride_abgr, void ABGRToUVRow_NEON(const uint8* src_abgr, int src_stride_abgr,
uint8* dst_u, uint8* dst_v, int pix) { uint8* dst_u, uint8* dst_v, int pix) {
const uint8* src_abgr_1 = src_abgr + src_stride_abgr;
asm volatile ( asm volatile (
"add %1, %0, %1 \n" // src_stride + src_abgr RGBTOUV_SETUP_REG
"vmov.s16 q10, #56 \n" // UB / VR 0.875 coefficient
"vmov.s16 q11, #37 \n" // UG -0.5781 coefficient
"vmov.s16 q12, #19 \n" // UR -0.2969 coefficient
"vmov.s16 q13, #9 \n" // VB -0.1406 coefficient
"vmov.s16 q14, #47 \n" // VG -0.7344 coefficient
"vmov.u16 q15, #0x8080 \n" // 128.5
".p2align 2 \n"
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ABGR pixels. "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels.
MEMACCESS(0) "uaddlp v3.8h, v2.16b \n" // B 16 bytes -> 8 shorts.
"vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ABGR pixels. "uaddlp v2.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
"vpaddl.u8 q2, q2 \n" // B 16 bytes -> 8 shorts. "uaddlp v1.8h, v0.16b \n" // R 16 bytes -> 8 shorts.
"vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
"vpaddl.u8 q0, q0 \n" // R 16 bytes -> 8 shorts.
MEMACCESS(1)
"vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more ABGR pixels.
MEMACCESS(1) MEMACCESS(1)
"vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 ABGR pixels. "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load 16 more.
"vpadal.u8 q2, q6 \n" // B 16 bytes -> 8 shorts. "uadalp v3.8h, v6.16b \n" // B 16 bytes -> 8 shorts.
"vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts. "uadalp v2.8h, v5.16b \n" // G 16 bytes -> 8 shorts.
"vpadal.u8 q0, q4 \n" // R 16 bytes -> 8 shorts. "uadalp v1.8h, v4.16b \n" // R 16 bytes -> 8 shorts.
"vrshr.u16 q0, q0, #1 \n" // 2x average "urshr v0.8h, v3.8h, #1 \n" // 2x average
"vrshr.u16 q1, q1, #1 \n" "urshr v2.8h, v2.8h, #1 \n"
"vrshr.u16 q2, q2, #1 \n" "urshr v1.8h, v1.8h, #1 \n"
"subs %4, %4, #16 \n" // 32 processed per loop. "subs %4, %4, #16 \n" // 32 processed per loop.
RGBTOUV(q2, q1, q0) RGBTOUV(v0.8h, v2.8h, v1.8h)
MEMACCESS(2) MEMACCESS(2)
"vst1.8 {d0}, [%2]! \n" // store 8 pixels U. "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
MEMACCESS(3) MEMACCESS(3)
"vst1.8 {d1}, [%3]! \n" // store 8 pixels V. "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
"b.gt 1b \n" "b.gt 1b \n"
: "+r"(src_abgr), // %0 : "+r"(src_abgr), // %0
"+r"(src_stride_abgr), // %1 "+r"(src_abgr_1), // %1
"+r"(dst_u), // %2 "+r"(dst_u), // %2
"+r"(dst_v), // %3 "+r"(dst_v), // %3
"+r"(pix) // %4 "+r"(pix) // %4
: :
: "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
"q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" "v20", "v21", "v22", "v23", "v24", "v25"
); );
} }
#endif // HAS_ABGRTOUVROW_NEON #endif // HAS_ABGRTOUVROW_NEON
...@@ -1890,50 +1774,40 @@ void ABGRToUVRow_NEON(const uint8* src_abgr, int src_stride_abgr, ...@@ -1890,50 +1774,40 @@ void ABGRToUVRow_NEON(const uint8* src_abgr, int src_stride_abgr,
#ifdef HAS_RGBATOUVROW_NEON #ifdef HAS_RGBATOUVROW_NEON
void RGBAToUVRow_NEON(const uint8* src_rgba, int src_stride_rgba, void RGBAToUVRow_NEON(const uint8* src_rgba, int src_stride_rgba,
uint8* dst_u, uint8* dst_v, int pix) { uint8* dst_u, uint8* dst_v, int pix) {
const uint8* src_rgba_1 = src_rgba + src_stride_rgba;
asm volatile ( asm volatile (
"add %1, %0, %1 \n" // src_stride + src_rgba RGBTOUV_SETUP_REG
"vmov.s16 q10, #56 \n" // UB / VR 0.875 coefficient
"vmov.s16 q11, #37 \n" // UG -0.5781 coefficient
"vmov.s16 q12, #19 \n" // UR -0.2969 coefficient
"vmov.s16 q13, #9 \n" // VB -0.1406 coefficient
"vmov.s16 q14, #47 \n" // VG -0.7344 coefficient
"vmov.u16 q15, #0x8080 \n" // 128.5
".p2align 2 \n"
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 RGBA pixels. "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels.
MEMACCESS(0) "uaddlp v0.8h, v1.16b \n" // B 16 bytes -> 8 shorts.
"vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 RGBA pixels. "uaddlp v1.8h, v2.16b \n" // G 16 bytes -> 8 shorts.
"vpaddl.u8 q0, q1 \n" // B 16 bytes -> 8 shorts. "uaddlp v2.8h, v3.16b \n" // R 16 bytes -> 8 shorts.
"vpaddl.u8 q1, q2 \n" // G 16 bytes -> 8 shorts.
"vpaddl.u8 q2, q3 \n" // R 16 bytes -> 8 shorts.
MEMACCESS(1)
"vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more RGBA pixels.
MEMACCESS(1) MEMACCESS(1)
"vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 RGBA pixels. "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load 16 more.
"vpadal.u8 q0, q5 \n" // B 16 bytes -> 8 shorts. "uadalp v0.8h, v5.16b \n" // B 16 bytes -> 8 shorts.
"vpadal.u8 q1, q6 \n" // G 16 bytes -> 8 shorts. "uadalp v1.8h, v6.16b \n" // G 16 bytes -> 8 shorts.
"vpadal.u8 q2, q7 \n" // R 16 bytes -> 8 shorts. "uadalp v2.8h, v7.16b \n" // R 16 bytes -> 8 shorts.
"vrshr.u16 q0, q0, #1 \n" // 2x average "urshr v0.8h, v0.8h, #1 \n" // 2x average
"vrshr.u16 q1, q1, #1 \n" "urshr v1.8h, v1.8h, #1 \n"
"vrshr.u16 q2, q2, #1 \n" "urshr v2.8h, v2.8h, #1 \n"
"subs %4, %4, #16 \n" // 32 processed per loop. "subs %4, %4, #16 \n" // 32 processed per loop.
RGBTOUV(q0, q1, q2) RGBTOUV(v0.8h, v1.8h, v2.8h)
MEMACCESS(2) MEMACCESS(2)
"vst1.8 {d0}, [%2]! \n" // store 8 pixels U. "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
MEMACCESS(3) MEMACCESS(3)
"vst1.8 {d1}, [%3]! \n" // store 8 pixels V. "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
"b.gt 1b \n" "b.gt 1b \n"
: "+r"(src_rgba), // %0 : "+r"(src_rgba), // %0
"+r"(src_stride_rgba), // %1 "+r"(src_rgba_1), // %1
"+r"(dst_u), // %2 "+r"(dst_u), // %2
"+r"(dst_v), // %3 "+r"(dst_v), // %3
"+r"(pix) // %4 "+r"(pix) // %4
: :
: "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
"q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" "v20", "v21", "v22", "v23", "v24", "v25"
); );
} }
#endif // HAS_RGBATOUVROW_NEON #endif // HAS_RGBATOUVROW_NEON
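The ARGB, BGRA, ABGR and RGBA variants above differ only in which de-interleaved plane from ld4 is treated as B, G and R; the arithmetic is the same. The register comments imply byte offsets (B, G, R) of (0, 1, 2) for ARGB, (3, 2, 1) for BGRA, (2, 1, 0) for ABGR and (1, 2, 3) for RGBA. A scalar sketch of one whole call, parameterised on those offsets and reusing the rgb_to_uv_2x2 helper sketched earlier, might look as follows; the function and parameter names are illustrative assumptions, not libyuv API.

/* One row-pair call: pix source pixels from two adjacent rows are reduced
 * to pix/2 U and pix/2 V bytes (pix assumed even). */
static void ToUVRow_Ref(const uint8_t* row0, const uint8_t* row1,
                        int bo, int go, int ro,
                        uint8_t* dst_u, uint8_t* dst_v, int pix) {
  for (int x = 0; x < pix; x += 2) {
    const uint8_t* p0 = row0 + 4 * x;              /* pixel x, row 0 */
    const uint8_t* p1 = row1 + 4 * x;              /* pixel x, row 1 */
    int b2 = (p0[bo] + p0[4 + bo] + p1[bo] + p1[4 + bo] + 1) >> 1;
    int g2 = (p0[go] + p0[4 + go] + p1[go] + p1[4 + go] + 1) >> 1;
    int r2 = (p0[ro] + p0[4 + ro] + p1[ro] + p1[4 + ro] + 1) >> 1;
    rgb_to_uv_2x2(b2, g2, r2, &dst_u[x / 2], &dst_v[x / 2]);
  }
}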
...@@ -1941,50 +1815,40 @@ void RGBAToUVRow_NEON(const uint8* src_rgba, int src_stride_rgba, ...@@ -1941,50 +1815,40 @@ void RGBAToUVRow_NEON(const uint8* src_rgba, int src_stride_rgba,
#ifdef HAS_RGB24TOUVROW_NEON #ifdef HAS_RGB24TOUVROW_NEON
void RGB24ToUVRow_NEON(const uint8* src_rgb24, int src_stride_rgb24, void RGB24ToUVRow_NEON(const uint8* src_rgb24, int src_stride_rgb24,
uint8* dst_u, uint8* dst_v, int pix) { uint8* dst_u, uint8* dst_v, int pix) {
const uint8* src_rgb24_1 = src_rgb24 + src_stride_rgb24;
asm volatile ( asm volatile (
"add %1, %0, %1 \n" // src_stride + src_rgb24 RGBTOUV_SETUP_REG
"vmov.s16 q10, #56 \n" // UB / VR 0.875 coefficient
"vmov.s16 q11, #37 \n" // UG -0.5781 coefficient
"vmov.s16 q12, #19 \n" // UR -0.2969 coefficient
"vmov.s16 q13, #9 \n" // VB -0.1406 coefficient
"vmov.s16 q14, #47 \n" // VG -0.7344 coefficient
"vmov.u16 q15, #0x8080 \n" // 128.5
".p2align 2 \n"
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"vld3.8 {d0, d2, d4}, [%0]! \n" // load 8 RGB24 pixels. "ld3 {v0.16b,v1.16b,v2.16b}, [%0], #48 \n" // load 16 pixels.
MEMACCESS(0) "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts.
"vld3.8 {d1, d3, d5}, [%0]! \n" // load next 8 RGB24 pixels. "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
"vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts. "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts.
"vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
"vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts.
MEMACCESS(1)
"vld3.8 {d8, d10, d12}, [%1]! \n" // load 8 more RGB24 pixels.
MEMACCESS(1) MEMACCESS(1)
"vld3.8 {d9, d11, d13}, [%1]! \n" // load last 8 RGB24 pixels. "ld3 {v4.16b,v5.16b,v6.16b}, [%1], #48 \n" // load 16 more.
"vpadal.u8 q0, q4 \n" // B 16 bytes -> 8 shorts. "uadalp v0.8h, v4.16b \n" // B 16 bytes -> 8 shorts.
"vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts. "uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts.
"vpadal.u8 q2, q6 \n" // R 16 bytes -> 8 shorts. "uadalp v2.8h, v6.16b \n" // R 16 bytes -> 8 shorts.
"vrshr.u16 q0, q0, #1 \n" // 2x average "urshr v0.8h, v0.8h, #1 \n" // 2x average
"vrshr.u16 q1, q1, #1 \n" "urshr v1.8h, v1.8h, #1 \n"
"vrshr.u16 q2, q2, #1 \n" "urshr v2.8h, v2.8h, #1 \n"
"subs %4, %4, #16 \n" // 32 processed per loop. "subs %4, %4, #16 \n" // 32 processed per loop.
RGBTOUV(q0, q1, q2) RGBTOUV(v0.8h, v1.8h, v2.8h)
MEMACCESS(2) MEMACCESS(2)
"vst1.8 {d0}, [%2]! \n" // store 8 pixels U. "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
MEMACCESS(3) MEMACCESS(3)
"vst1.8 {d1}, [%3]! \n" // store 8 pixels V. "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
"b.gt 1b \n" "b.gt 1b \n"
: "+r"(src_rgb24), // %0 : "+r"(src_rgb24), // %0
"+r"(src_stride_rgb24), // %1 "+r"(src_rgb24_1), // %1
"+r"(dst_u), // %2 "+r"(dst_u), // %2
"+r"(dst_v), // %3 "+r"(dst_v), // %3
"+r"(pix) // %4 "+r"(pix) // %4
: :
: "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
"q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" "v20", "v21", "v22", "v23", "v24", "v25"
); );
} }
#endif // HAS_RGB24TOUVROW_NEON #endif // HAS_RGB24TOUVROW_NEON
...@@ -1992,50 +1856,40 @@ void RGB24ToUVRow_NEON(const uint8* src_rgb24, int src_stride_rgb24, ...@@ -1992,50 +1856,40 @@ void RGB24ToUVRow_NEON(const uint8* src_rgb24, int src_stride_rgb24,
#ifdef HAS_RAWTOUVROW_NEON #ifdef HAS_RAWTOUVROW_NEON
void RAWToUVRow_NEON(const uint8* src_raw, int src_stride_raw, void RAWToUVRow_NEON(const uint8* src_raw, int src_stride_raw,
uint8* dst_u, uint8* dst_v, int pix) { uint8* dst_u, uint8* dst_v, int pix) {
const uint8* src_raw_1 = src_raw + src_stride_raw;
asm volatile ( asm volatile (
"add %1, %0, %1 \n" // src_stride + src_raw RGBTOUV_SETUP_REG
"vmov.s16 q10, #56 \n" // UB / VR 0.875 coefficient
"vmov.s16 q11, #37 \n" // UG -0.5781 coefficient
"vmov.s16 q12, #19 \n" // UR -0.2969 coefficient
"vmov.s16 q13, #9 \n" // VB -0.1406 coefficient
"vmov.s16 q14, #47 \n" // VG -0.7344 coefficient
"vmov.u16 q15, #0x8080 \n" // 128.5
".p2align 2 \n"
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"vld3.8 {d0, d2, d4}, [%0]! \n" // load 8 RAW pixels. "ld3 {v0.16b,v1.16b,v2.16b}, [%0], #48 \n" // load 8 RAW pixels.
MEMACCESS(0) "uaddlp v2.8h, v2.16b \n" // B 16 bytes -> 8 shorts.
"vld3.8 {d1, d3, d5}, [%0]! \n" // load next 8 RAW pixels. "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
"vpaddl.u8 q2, q2 \n" // B 16 bytes -> 8 shorts. "uaddlp v0.8h, v0.16b \n" // R 16 bytes -> 8 shorts.
"vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
"vpaddl.u8 q0, q0 \n" // R 16 bytes -> 8 shorts.
MEMACCESS(1)
"vld3.8 {d8, d10, d12}, [%1]! \n" // load 8 more RAW pixels.
MEMACCESS(1) MEMACCESS(1)
"vld3.8 {d9, d11, d13}, [%1]! \n" // load last 8 RAW pixels. "ld3 {v4.16b,v5.16b,v6.16b}, [%1], #48 \n" // load 8 more RAW pixels
"vpadal.u8 q2, q6 \n" // B 16 bytes -> 8 shorts. "uadalp v2.8h, v6.16b \n" // B 16 bytes -> 8 shorts.
"vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts. "uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts.
"vpadal.u8 q0, q4 \n" // R 16 bytes -> 8 shorts. "uadalp v0.8h, v4.16b \n" // R 16 bytes -> 8 shorts.
"vrshr.u16 q0, q0, #1 \n" // 2x average "urshr v2.8h, v2.8h, #1 \n" // 2x average
"vrshr.u16 q1, q1, #1 \n" "urshr v1.8h, v1.8h, #1 \n"
"vrshr.u16 q2, q2, #1 \n" "urshr v0.8h, v0.8h, #1 \n"
"subs %4, %4, #16 \n" // 32 processed per loop. "subs %4, %4, #16 \n" // 32 processed per loop.
RGBTOUV(q2, q1, q0) RGBTOUV(v2.8h, v1.8h, v0.8h)
MEMACCESS(2) MEMACCESS(2)
"vst1.8 {d0}, [%2]! \n" // store 8 pixels U. "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
MEMACCESS(3) MEMACCESS(3)
"vst1.8 {d1}, [%3]! \n" // store 8 pixels V. "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
"b.gt 1b \n" "b.gt 1b \n"
: "+r"(src_raw), // %0 : "+r"(src_raw), // %0
"+r"(src_stride_raw), // %1 "+r"(src_raw_1), // %1
"+r"(dst_u), // %2 "+r"(dst_u), // %2
"+r"(dst_v), // %3 "+r"(dst_v), // %3
"+r"(pix) // %4 "+r"(pix) // %4
: :
: "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
"q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" "v20", "v21", "v22", "v23", "v24", "v25"
); );
} }
#endif // HAS_RAWTOUVROW_NEON #endif // HAS_RAWTOUVROW_NEON
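Each of these UV kernels consumes two source rows per call, so a caller steps the source by two rows and the chroma planes by one row at a time. The loop below is a hypothetical driver, not this commit's API: the function name and strides are assumptions, it relies on the ARGBToUVRow_NEON / ARGBToYRow_NEON prototypes from libyuv's row.h, height is assumed even, and width is assumed a multiple of 16 so the NEON kernels cover the whole row (libyuv normally dispatches _Any variants for the remainder).

void ArgbRowsToI420(const uint8* src_argb, int src_stride,
                    uint8* dst_y, int y_stride,
                    uint8* dst_u, uint8* dst_v, int uv_stride,
                    int width, int height) {
  for (int y = 0; y < height; y += 2) {
    /* One UV row from two ARGB rows, then a Y row for each ARGB row. */
    ARGBToUVRow_NEON(src_argb, src_stride, dst_u, dst_v, width);
    ARGBToYRow_NEON(src_argb, dst_y, width);
    ARGBToYRow_NEON(src_argb + src_stride, dst_y + y_stride, width);
    src_argb += 2 * src_stride;
    dst_y += 2 * y_stride;
    dst_u += uv_stride;
    dst_v += uv_stride;
  }
}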
...@@ -2044,15 +1898,14 @@ void RAWToUVRow_NEON(const uint8* src_raw, int src_stride_raw, ...@@ -2044,15 +1898,14 @@ void RAWToUVRow_NEON(const uint8* src_raw, int src_stride_raw,
#ifdef HAS_RGB565TOUVROW_NEON #ifdef HAS_RGB565TOUVROW_NEON
void RGB565ToUVRow_NEON(const uint8* src_rgb565, int src_stride_rgb565, void RGB565ToUVRow_NEON(const uint8* src_rgb565, int src_stride_rgb565,
uint8* dst_u, uint8* dst_v, int pix) { uint8* dst_u, uint8* dst_v, int pix) {
const uint8* src_rgb565_1 = src_rgb565 + src_stride_rgb565;
asm volatile ( asm volatile (
"add %x1, %x0, %w1, sxtw \n" // src_stride + src_argb "movi v22.8h, #56, lsl #0 \n" // UB / VR coeff (0.875) / 2
"movi v22.8h, #112 / 2 \n" // UB / VR 0.875 coefficient "movi v23.8h, #37, lsl #0 \n" // UG coeff (-0.5781) / 2
"movi v23.8h, #74 / 2 \n" // UG -0.5781 coefficient "movi v24.8h, #19, lsl #0 \n" // UR coeff (-0.2969) / 2
"movi v24.8h, #38 / 2 \n" // UR -0.2969 coefficient "movi v25.8h, #9 , lsl #0 \n" // VB coeff (-0.1406) / 2
"movi v25.8h, #18 / 2 \n" // VB -0.1406 coefficient "movi v26.8h, #47, lsl #0 \n" // VG coeff (-0.7344) / 2
"movi v26.8h, #94 / 2 \n" // VG -0.7344 coefficient
"movi v27.16b, #0x80 \n" // 128.5 (0x8080 in 16-bit) "movi v27.16b, #0x80 \n" // 128.5 (0x8080 in 16-bit)
".p2align 2 \n"
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels. "ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels.
...@@ -2105,7 +1958,7 @@ void RGB565ToUVRow_NEON(const uint8* src_rgb565, int src_stride_rgb565, ...@@ -2105,7 +1958,7 @@ void RGB565ToUVRow_NEON(const uint8* src_rgb565, int src_stride_rgb565,
"st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
"b.gt 1b \n" "b.gt 1b \n"
: "+r"(src_rgb565), // %0 : "+r"(src_rgb565), // %0
"+r"(src_stride_rgb565), // %1 "+r"(src_rgb565_1), // %1
"+r"(dst_u), // %2 "+r"(dst_u), // %2
"+r"(dst_v), // %3 "+r"(dst_v), // %3
"+r"(pix) // %4 "+r"(pix) // %4
...@@ -2121,15 +1974,9 @@ void RGB565ToUVRow_NEON(const uint8* src_rgb565, int src_stride_rgb565, ...@@ -2121,15 +1974,9 @@ void RGB565ToUVRow_NEON(const uint8* src_rgb565, int src_stride_rgb565,
#ifdef HAS_ARGB1555TOUVROW_NEON #ifdef HAS_ARGB1555TOUVROW_NEON
void ARGB1555ToUVRow_NEON(const uint8* src_argb1555, int src_stride_argb1555, void ARGB1555ToUVRow_NEON(const uint8* src_argb1555, int src_stride_argb1555,
uint8* dst_u, uint8* dst_v, int pix) { uint8* dst_u, uint8* dst_v, int pix) {
const uint8* src_argb1555_1 = src_argb1555 + src_stride_argb1555;
asm volatile ( asm volatile (
"add %x1, %x0, %w1, sxtw \n" // src_stride + src_argb RGBTOUV_SETUP_REG
"movi v20.8h, #112 / 2 \n" // UB / VR 0.875 coefficient
"movi v21.8h, #74 / 2 \n" // UG -0.5781 coefficient
"movi v22.8h, #38 / 2 \n" // UR -0.2969 coefficient
"movi v23.8h, #18 / 2 \n" // VB -0.1406 coefficient
"movi v24.8h, #94 / 2 \n" // VG -0.7344 coefficient
"movi v25.16b, #0x80 \n" // 128.5
".p2align 2 \n"
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels. "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels.
...@@ -2182,7 +2029,7 @@ void ARGB1555ToUVRow_NEON(const uint8* src_argb1555, int src_stride_argb1555, ...@@ -2182,7 +2029,7 @@ void ARGB1555ToUVRow_NEON(const uint8* src_argb1555, int src_stride_argb1555,
"st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
"b.gt 1b \n" "b.gt 1b \n"
: "+r"(src_argb1555), // %0 : "+r"(src_argb1555), // %0
"+r"(src_stride_argb1555), // %1 "+r"(src_argb1555_1), // %1
"+r"(dst_u), // %2 "+r"(dst_u), // %2
"+r"(dst_v), // %3 "+r"(dst_v), // %3
"+r"(pix) // %4 "+r"(pix) // %4
...@@ -2198,15 +2045,9 @@ void ARGB1555ToUVRow_NEON(const uint8* src_argb1555, int src_stride_argb1555, ...@@ -2198,15 +2045,9 @@ void ARGB1555ToUVRow_NEON(const uint8* src_argb1555, int src_stride_argb1555,
#ifdef HAS_ARGB4444TOUVROW_NEON #ifdef HAS_ARGB4444TOUVROW_NEON
void ARGB4444ToUVRow_NEON(const uint8* src_argb4444, int src_stride_argb4444, void ARGB4444ToUVRow_NEON(const uint8* src_argb4444, int src_stride_argb4444,
uint8* dst_u, uint8* dst_v, int pix) { uint8* dst_u, uint8* dst_v, int pix) {
const uint8* src_argb4444_1 = src_argb4444 + src_stride_argb4444;
asm volatile ( asm volatile (
"add %x1, %x0, %w1, sxtw \n" // src_stride + src_argb RGBTOUV_SETUP_REG
"movi v20.8h, #112 / 2 \n" // UB / VR 0.875 coefficient
"movi v21.8h, #74 / 2 \n" // UG -0.5781 coefficient
"movi v22.8h, #38 / 2 \n" // UR -0.2969 coefficient
"movi v23.8h, #18 / 2 \n" // VB -0.1406 coefficient
"movi v24.8h, #94 / 2 \n" // VG -0.7344 coefficient
"movi v25.16b, #0x80 \n" // 128.5
".p2align 2 \n"
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels. "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels.
...@@ -2259,7 +2100,7 @@ void ARGB4444ToUVRow_NEON(const uint8* src_argb4444, int src_stride_argb4444, ...@@ -2259,7 +2100,7 @@ void ARGB4444ToUVRow_NEON(const uint8* src_argb4444, int src_stride_argb4444,
"st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
"b.gt 1b \n" "b.gt 1b \n"
: "+r"(src_argb4444), // %0 : "+r"(src_argb4444), // %0
"+r"(src_stride_argb4444), // %1 "+r"(src_argb4444_1), // %1
"+r"(dst_u), // %2 "+r"(dst_u), // %2
"+r"(dst_v), // %3 "+r"(dst_v), // %3
"+r"(pix) // %4 "+r"(pix) // %4
...@@ -2279,7 +2120,6 @@ void RGB565ToYRow_NEON(const uint8* src_rgb565, uint8* dst_y, int pix) { ...@@ -2279,7 +2120,6 @@ void RGB565ToYRow_NEON(const uint8* src_rgb565, uint8* dst_y, int pix) {
"movi v25.8b, #65 \n" // G * 0.5078 coefficient "movi v25.8b, #65 \n" // G * 0.5078 coefficient
"movi v26.8b, #33 \n" // R * 0.2578 coefficient "movi v26.8b, #33 \n" // R * 0.2578 coefficient
"movi v27.8b, #16 \n" // Add 16 constant "movi v27.8b, #16 \n" // Add 16 constant
".p2align 2 \n"
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels. "ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels.
...@@ -2310,7 +2150,6 @@ void ARGB1555ToYRow_NEON(const uint8* src_argb1555, uint8* dst_y, int pix) { ...@@ -2310,7 +2150,6 @@ void ARGB1555ToYRow_NEON(const uint8* src_argb1555, uint8* dst_y, int pix) {
"movi v5.8b, #65 \n" // G * 0.5078 coefficient "movi v5.8b, #65 \n" // G * 0.5078 coefficient
"movi v6.8b, #33 \n" // R * 0.2578 coefficient "movi v6.8b, #33 \n" // R * 0.2578 coefficient
"movi v7.8b, #16 \n" // Add 16 constant "movi v7.8b, #16 \n" // Add 16 constant
".p2align 2 \n"
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels. "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels.
...@@ -2340,7 +2179,6 @@ void ARGB4444ToYRow_NEON(const uint8* src_argb4444, uint8* dst_y, int pix) { ...@@ -2340,7 +2179,6 @@ void ARGB4444ToYRow_NEON(const uint8* src_argb4444, uint8* dst_y, int pix) {
"movi v25.8b, #65 \n" // G * 0.5078 coefficient "movi v25.8b, #65 \n" // G * 0.5078 coefficient
"movi v26.8b, #33 \n" // R * 0.2578 coefficient "movi v26.8b, #33 \n" // R * 0.2578 coefficient
"movi v27.8b, #16 \n" // Add 16 constant "movi v27.8b, #16 \n" // Add 16 constant
".p2align 2 \n"
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels. "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels.
...@@ -2370,10 +2208,9 @@ void BGRAToYRow_NEON(const uint8* src_bgra, uint8* dst_y, int pix) { ...@@ -2370,10 +2208,9 @@ void BGRAToYRow_NEON(const uint8* src_bgra, uint8* dst_y, int pix) {
"movi v5.8b, #65 \n" // G * 0.5078 coefficient "movi v5.8b, #65 \n" // G * 0.5078 coefficient
"movi v6.8b, #13 \n" // B * 0.1016 coefficient "movi v6.8b, #13 \n" // B * 0.1016 coefficient
"movi v7.8b, #16 \n" // Add 16 constant "movi v7.8b, #16 \n" // Add 16 constant
".p2align 2 \n"
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels of BGRA. "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels.
"subs %2, %2, #8 \n" // 8 processed per loop. "subs %2, %2, #8 \n" // 8 processed per loop.
"umull v16.8h, v1.8b, v4.8b \n" // R "umull v16.8h, v1.8b, v4.8b \n" // R
"umlal v16.8h, v2.8b, v5.8b \n" // G "umlal v16.8h, v2.8b, v5.8b \n" // G
...@@ -2399,10 +2236,9 @@ void ABGRToYRow_NEON(const uint8* src_abgr, uint8* dst_y, int pix) { ...@@ -2399,10 +2236,9 @@ void ABGRToYRow_NEON(const uint8* src_abgr, uint8* dst_y, int pix) {
"movi v5.8b, #65 \n" // G * 0.5078 coefficient "movi v5.8b, #65 \n" // G * 0.5078 coefficient
"movi v6.8b, #13 \n" // B * 0.1016 coefficient "movi v6.8b, #13 \n" // B * 0.1016 coefficient
"movi v7.8b, #16 \n" // Add 16 constant "movi v7.8b, #16 \n" // Add 16 constant
".p2align 2 \n"
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels of ABGR. "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels.
"subs %2, %2, #8 \n" // 8 processed per loop. "subs %2, %2, #8 \n" // 8 processed per loop.
"umull v16.8h, v0.8b, v4.8b \n" // R "umull v16.8h, v0.8b, v4.8b \n" // R
"umlal v16.8h, v1.8b, v5.8b \n" // G "umlal v16.8h, v1.8b, v5.8b \n" // G
...@@ -2428,10 +2264,9 @@ void RGBAToYRow_NEON(const uint8* src_rgba, uint8* dst_y, int pix) { ...@@ -2428,10 +2264,9 @@ void RGBAToYRow_NEON(const uint8* src_rgba, uint8* dst_y, int pix) {
"movi v5.8b, #65 \n" // G * 0.5078 coefficient "movi v5.8b, #65 \n" // G * 0.5078 coefficient
"movi v6.8b, #33 \n" // R * 0.2578 coefficient "movi v6.8b, #33 \n" // R * 0.2578 coefficient
"movi v7.8b, #16 \n" // Add 16 constant "movi v7.8b, #16 \n" // Add 16 constant
".p2align 2 \n"
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels of RGBA. "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels.
"subs %2, %2, #8 \n" // 8 processed per loop. "subs %2, %2, #8 \n" // 8 processed per loop.
"umull v16.8h, v1.8b, v4.8b \n" // B "umull v16.8h, v1.8b, v4.8b \n" // B
"umlal v16.8h, v2.8b, v5.8b \n" // G "umlal v16.8h, v2.8b, v5.8b \n" // G
...@@ -2457,10 +2292,9 @@ void RGB24ToYRow_NEON(const uint8* src_rgb24, uint8* dst_y, int pix) { ...@@ -2457,10 +2292,9 @@ void RGB24ToYRow_NEON(const uint8* src_rgb24, uint8* dst_y, int pix) {
"movi v5.8b, #65 \n" // G * 0.5078 coefficient "movi v5.8b, #65 \n" // G * 0.5078 coefficient
"movi v6.8b, #33 \n" // R * 0.2578 coefficient "movi v6.8b, #33 \n" // R * 0.2578 coefficient
"movi v7.8b, #16 \n" // Add 16 constant "movi v7.8b, #16 \n" // Add 16 constant
".p2align 2 \n"
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // load 8 pixels of RGB24. "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // load 8 pixels.
"subs %2, %2, #8 \n" // 8 processed per loop. "subs %2, %2, #8 \n" // 8 processed per loop.
"umull v16.8h, v0.8b, v4.8b \n" // B "umull v16.8h, v0.8b, v4.8b \n" // B
"umlal v16.8h, v1.8b, v5.8b \n" // G "umlal v16.8h, v1.8b, v5.8b \n" // G
...@@ -2486,10 +2320,9 @@ void RAWToYRow_NEON(const uint8* src_raw, uint8* dst_y, int pix) { ...@@ -2486,10 +2320,9 @@ void RAWToYRow_NEON(const uint8* src_raw, uint8* dst_y, int pix) {
"movi v5.8b, #65 \n" // G * 0.5078 coefficient "movi v5.8b, #65 \n" // G * 0.5078 coefficient
"movi v6.8b, #13 \n" // B * 0.1016 coefficient "movi v6.8b, #13 \n" // B * 0.1016 coefficient
"movi v7.8b, #16 \n" // Add 16 constant "movi v7.8b, #16 \n" // Add 16 constant
".p2align 2 \n"
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // load 8 pixels of RAW. "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // load 8 pixels.
"subs %2, %2, #8 \n" // 8 processed per loop. "subs %2, %2, #8 \n" // 8 processed per loop.
"umull v16.8h, v0.8b, v4.8b \n" // B "umull v16.8h, v0.8b, v4.8b \n" // B
"umlal v16.8h, v1.8b, v5.8b \n" // G "umlal v16.8h, v1.8b, v5.8b \n" // G
...@@ -2619,9 +2452,9 @@ void ARGBBlendRow_NEON(const uint8* src_argb0, const uint8* src_argb1, ...@@ -2619,9 +2452,9 @@ void ARGBBlendRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
// Blend 8 pixels. // Blend 8 pixels.
"8: \n" "8: \n"
MEMACCESS(0) MEMACCESS(0)
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels of ARGB0. "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB0 pixels
MEMACCESS(1) MEMACCESS(1)
"ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 pixels of ARGB1. "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 ARGB1 pixels
"subs %3, %3, #8 \n" // 8 processed per loop. "subs %3, %3, #8 \n" // 8 processed per loop.
"umull v16.8h, v4.8b, v3.8b \n" // db * a "umull v16.8h, v4.8b, v3.8b \n" // db * a
"umull v17.8h, v5.8b, v3.8b \n" // dg * a "umull v17.8h, v5.8b, v3.8b \n" // dg * a
...@@ -2637,7 +2470,7 @@ void ARGBBlendRow_NEON(const uint8* src_argb0, const uint8* src_argb1, ...@@ -2637,7 +2470,7 @@ void ARGBBlendRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
"uqadd v2.8b, v2.8b, v6.8b \n" // + sr "uqadd v2.8b, v2.8b, v6.8b \n" // + sr
"movi v3.8b, #255 \n" // a = 255 "movi v3.8b, #255 \n" // a = 255
MEMACCESS(2) MEMACCESS(2)
"st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 pixels of ARGB. "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB pixels
"b.ge 8b \n" "b.ge 8b \n"
"89: \n" "89: \n"
...@@ -2688,7 +2521,7 @@ void ARGBAttenuateRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) { ...@@ -2688,7 +2521,7 @@ void ARGBAttenuateRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) {
// Attenuate 8 pixels. // Attenuate 8 pixels.
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels of ARGB. "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels
"subs %2, %2, #8 \n" // 8 processed per loop. "subs %2, %2, #8 \n" // 8 processed per loop.
"umull v4.8h, v0.8b, v3.8b \n" // b * a "umull v4.8h, v0.8b, v3.8b \n" // b * a
"umull v5.8h, v1.8b, v3.8b \n" // g * a "umull v5.8h, v1.8b, v3.8b \n" // g * a
...@@ -2697,7 +2530,7 @@ void ARGBAttenuateRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) { ...@@ -2697,7 +2530,7 @@ void ARGBAttenuateRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) {
"uqrshrn v1.8b, v5.8h, #8 \n" // g >>= 8 "uqrshrn v1.8b, v5.8h, #8 \n" // g >>= 8
"uqrshrn v2.8b, v6.8h, #8 \n" // r >>= 8 "uqrshrn v2.8b, v6.8h, #8 \n" // r >>= 8
MEMACCESS(1) MEMACCESS(1)
"st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 pixels of ARGB. "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB pixels
"b.gt 1b \n" "b.gt 1b \n"
: "+r"(src_argb), // %0 : "+r"(src_argb), // %0
"+r"(dst_argb), // %1 "+r"(dst_argb), // %1
...@@ -2720,7 +2553,6 @@ void ARGBQuantizeRow_NEON(uint8* dst_argb, int scale, int interval_size, ...@@ -2720,7 +2553,6 @@ void ARGBQuantizeRow_NEON(uint8* dst_argb, int scale, int interval_size,
"dup v6.8h, %w4 \n" // interval add "dup v6.8h, %w4 \n" // interval add
// 8 pixel loop. // 8 pixel loop.
".p2align 2 \n"
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0] \n" // load 8 pixels of ARGB. "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0] \n" // load 8 pixels of ARGB.
...@@ -2741,7 +2573,7 @@ void ARGBQuantizeRow_NEON(uint8* dst_argb, int scale, int interval_size, ...@@ -2741,7 +2573,7 @@ void ARGBQuantizeRow_NEON(uint8* dst_argb, int scale, int interval_size,
"uqxtn v1.8b, v1.8h \n" "uqxtn v1.8b, v1.8h \n"
"uqxtn v2.8b, v2.8h \n" "uqxtn v2.8b, v2.8h \n"
MEMACCESS(0) MEMACCESS(0)
"st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // store 8 pixels of ARGB. "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // store 8 ARGB pixels
"b.gt 1b \n" "b.gt 1b \n"
: "+r"(dst_argb), // %0 : "+r"(dst_argb), // %0
"+r"(width) // %1 "+r"(width) // %1
...@@ -2765,10 +2597,9 @@ void ARGBShadeRow_NEON(const uint8* src_argb, uint8* dst_argb, int width, ...@@ -2765,10 +2597,9 @@ void ARGBShadeRow_NEON(const uint8* src_argb, uint8* dst_argb, int width,
"ushr v0.8h, v0.8h, #1 \n" // scale / 2. "ushr v0.8h, v0.8h, #1 \n" // scale / 2.
// 8 pixel loop. // 8 pixel loop.
".p2align 2 \n"
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%0], #32 \n" // load 8 pixels of ARGB. "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%0], #32 \n" // load 8 ARGB pixels.
"subs %2, %2, #8 \n" // 8 processed per loop. "subs %2, %2, #8 \n" // 8 processed per loop.
"uxtl v4.8h, v4.8b \n" // b (0 .. 255) "uxtl v4.8h, v4.8b \n" // b (0 .. 255)
"uxtl v5.8h, v5.8b \n" "uxtl v5.8h, v5.8b \n"
...@@ -2783,7 +2614,7 @@ void ARGBShadeRow_NEON(const uint8* src_argb, uint8* dst_argb, int width, ...@@ -2783,7 +2614,7 @@ void ARGBShadeRow_NEON(const uint8* src_argb, uint8* dst_argb, int width,
"uqxtn v6.8b, v6.8h \n" "uqxtn v6.8b, v6.8h \n"
"uqxtn v7.8b, v7.8h \n" "uqxtn v7.8b, v7.8h \n"
MEMACCESS(1) MEMACCESS(1)
"st4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // store 8 pixels of ARGB. "st4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // store 8 ARGB pixels
"b.gt 1b \n" "b.gt 1b \n"
: "+r"(src_argb), // %0 : "+r"(src_argb), // %0
"+r"(dst_argb), // %1 "+r"(dst_argb), // %1
...@@ -2803,7 +2634,6 @@ void ARGBGrayRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) { ...@@ -2803,7 +2634,6 @@ void ARGBGrayRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) {
"movi v24.8b, #15 \n" // B * 0.11400 coefficient "movi v24.8b, #15 \n" // B * 0.11400 coefficient
"movi v25.8b, #75 \n" // G * 0.58700 coefficient "movi v25.8b, #75 \n" // G * 0.58700 coefficient
"movi v26.8b, #38 \n" // R * 0.29900 coefficient "movi v26.8b, #38 \n" // R * 0.29900 coefficient
".p2align 2 \n"
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels. "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels.
...@@ -2812,10 +2642,10 @@ void ARGBGrayRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) { ...@@ -2812,10 +2642,10 @@ void ARGBGrayRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) {
"umlal v4.8h, v1.8b, v25.8b \n" // G "umlal v4.8h, v1.8b, v25.8b \n" // G
"umlal v4.8h, v2.8b, v26.8b \n" // R "umlal v4.8h, v2.8b, v26.8b \n" // R
"sqrshrun v0.8b, v4.8h, #7 \n" // 15 bit to 8 bit B "sqrshrun v0.8b, v4.8h, #7 \n" // 15 bit to 8 bit B
"mov v1.8b, v0.8b \n" // G "orr v1.8b, v0.8b, v0.8b \n" // G
"mov v2.8b, v0.8b \n" // R "orr v2.8b, v0.8b, v0.8b \n" // R
MEMACCESS(1) MEMACCESS(1)
"st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB pixels. "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 pixels.
"b.gt 1b \n" "b.gt 1b \n"
: "+r"(src_argb), // %0 : "+r"(src_argb), // %0
"+r"(dst_argb), // %1 "+r"(dst_argb), // %1
...@@ -2843,7 +2673,6 @@ void ARGBSepiaRow_NEON(uint8* dst_argb, int width) { ...@@ -2843,7 +2673,6 @@ void ARGBSepiaRow_NEON(uint8* dst_argb, int width) {
"movi v28.8b, #24 \n" // BB coefficient "movi v28.8b, #24 \n" // BB coefficient
"movi v29.8b, #98 \n" // BG coefficient "movi v29.8b, #98 \n" // BG coefficient
"movi v30.8b, #50 \n" // BR coefficient "movi v30.8b, #50 \n" // BR coefficient
".p2align 2 \n"
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0] \n" // load 8 ARGB pixels. "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0] \n" // load 8 ARGB pixels.
...@@ -2861,7 +2690,7 @@ void ARGBSepiaRow_NEON(uint8* dst_argb, int width) { ...@@ -2861,7 +2690,7 @@ void ARGBSepiaRow_NEON(uint8* dst_argb, int width) {
"uqshrn v1.8b, v5.8h, #7 \n" // 16 bit to 8 bit G "uqshrn v1.8b, v5.8h, #7 \n" // 16 bit to 8 bit G
"uqshrn v2.8b, v6.8h, #7 \n" // 16 bit to 8 bit R "uqshrn v2.8b, v6.8h, #7 \n" // 16 bit to 8 bit R
MEMACCESS(0) MEMACCESS(0)
"st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // store 8 ARGB pixels. "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // store 8 pixels.
"b.gt 1b \n" "b.gt 1b \n"
: "+r"(dst_argb), // %0 : "+r"(dst_argb), // %0
"+r"(width) // %1 "+r"(width) // %1
...@@ -2884,10 +2713,9 @@ void ARGBColorMatrixRow_NEON(const uint8* src_argb, uint8* dst_argb, ...@@ -2884,10 +2713,9 @@ void ARGBColorMatrixRow_NEON(const uint8* src_argb, uint8* dst_argb,
"sxtl v0.8h, v2.8b \n" // B,G coefficients s16. "sxtl v0.8h, v2.8b \n" // B,G coefficients s16.
"sxtl2 v1.8h, v2.16b \n" // R,A coefficients s16. "sxtl2 v1.8h, v2.16b \n" // R,A coefficients s16.
".p2align 2 \n"
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%0], #32 \n" // load 8 ARGB pixels. "ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%0], #32 \n" // load 8 pixels.
"subs %2, %2, #8 \n" // 8 processed per loop. "subs %2, %2, #8 \n" // 8 processed per loop.
"uxtl v16.8h, v16.8b \n" // b (0 .. 255) 16 bit "uxtl v16.8h, v16.8b \n" // b (0 .. 255) 16 bit
"uxtl v17.8h, v17.8b \n" // g "uxtl v17.8h, v17.8b \n" // g
...@@ -2926,7 +2754,7 @@ void ARGBColorMatrixRow_NEON(const uint8* src_argb, uint8* dst_argb, ...@@ -2926,7 +2754,7 @@ void ARGBColorMatrixRow_NEON(const uint8* src_argb, uint8* dst_argb,
"sqshrun v18.8b, v24.8h, #6 \n" // 16 bit to 8 bit R "sqshrun v18.8b, v24.8h, #6 \n" // 16 bit to 8 bit R
"sqshrun v19.8b, v25.8h, #6 \n" // 16 bit to 8 bit A "sqshrun v19.8b, v25.8h, #6 \n" // 16 bit to 8 bit A
MEMACCESS(1) MEMACCESS(1)
"st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%1], #32 \n" // store 8 ARGB pixels. "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%1], #32 \n" // store 8 pixels.
"b.gt 1b \n" "b.gt 1b \n"
: "+r"(src_argb), // %0 : "+r"(src_argb), // %0
"+r"(dst_argb), // %1 "+r"(dst_argb), // %1
...@@ -2945,12 +2773,11 @@ void ARGBMultiplyRow_NEON(const uint8* src_argb0, const uint8* src_argb1, ...@@ -2945,12 +2773,11 @@ void ARGBMultiplyRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
uint8* dst_argb, int width) { uint8* dst_argb, int width) {
asm volatile ( asm volatile (
// 8 pixel loop. // 8 pixel loop.
".p2align 2 \n"
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels. "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels.
MEMACCESS(1) MEMACCESS(1)
"ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more ARGB pixels. "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more pixels.
"subs %3, %3, #8 \n" // 8 processed per loop. "subs %3, %3, #8 \n" // 8 processed per loop.
"umull v0.8h, v0.8b, v4.8b \n" // multiply B "umull v0.8h, v0.8b, v4.8b \n" // multiply B
"umull v1.8h, v1.8b, v5.8b \n" // multiply G "umull v1.8h, v1.8b, v5.8b \n" // multiply G
...@@ -2961,7 +2788,7 @@ void ARGBMultiplyRow_NEON(const uint8* src_argb0, const uint8* src_argb1, ...@@ -2961,7 +2788,7 @@ void ARGBMultiplyRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
"rshrn v2.8b, v2.8h, #8 \n" // 16 bit to 8 bit R "rshrn v2.8b, v2.8h, #8 \n" // 16 bit to 8 bit R
"rshrn v3.8b, v3.8h, #8 \n" // 16 bit to 8 bit A "rshrn v3.8b, v3.8h, #8 \n" // 16 bit to 8 bit A
MEMACCESS(2) MEMACCESS(2)
"st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB pixels. "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB pixels
"b.gt 1b \n" "b.gt 1b \n"
: "+r"(src_argb0), // %0 : "+r"(src_argb0), // %0
...@@ -2980,19 +2807,18 @@ void ARGBAddRow_NEON(const uint8* src_argb0, const uint8* src_argb1, ...@@ -2980,19 +2807,18 @@ void ARGBAddRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
uint8* dst_argb, int width) { uint8* dst_argb, int width) {
asm volatile ( asm volatile (
// 8 pixel loop. // 8 pixel loop.
".p2align 2 \n"
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels. "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels.
MEMACCESS(1) MEMACCESS(1)
"ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more ARGB pixels. "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more pixels.
"subs %3, %3, #8 \n" // 8 processed per loop. "subs %3, %3, #8 \n" // 8 processed per loop.
"uqadd v0.8b, v0.8b, v4.8b \n" "uqadd v0.8b, v0.8b, v4.8b \n"
"uqadd v1.8b, v1.8b, v5.8b \n" "uqadd v1.8b, v1.8b, v5.8b \n"
"uqadd v2.8b, v2.8b, v6.8b \n" "uqadd v2.8b, v2.8b, v6.8b \n"
"uqadd v3.8b, v3.8b, v7.8b \n" "uqadd v3.8b, v3.8b, v7.8b \n"
MEMACCESS(2) MEMACCESS(2)
"st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB pixels. "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB pixels
"b.gt 1b \n" "b.gt 1b \n"
: "+r"(src_argb0), // %0 : "+r"(src_argb0), // %0
...@@ -3011,19 +2837,18 @@ void ARGBSubtractRow_NEON(const uint8* src_argb0, const uint8* src_argb1, ...@@ -3011,19 +2837,18 @@ void ARGBSubtractRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
uint8* dst_argb, int width) { uint8* dst_argb, int width) {
asm volatile ( asm volatile (
// 8 pixel loop. // 8 pixel loop.
".p2align 2 \n"
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels. "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels.
MEMACCESS(1) MEMACCESS(1)
"ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more ARGB pixels. "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more pixels.
"subs %3, %3, #8 \n" // 8 processed per loop. "subs %3, %3, #8 \n" // 8 processed per loop.
"uqsub v0.8b, v0.8b, v4.8b \n" "uqsub v0.8b, v0.8b, v4.8b \n"
"uqsub v1.8b, v1.8b, v5.8b \n" "uqsub v1.8b, v1.8b, v5.8b \n"
"uqsub v2.8b, v2.8b, v6.8b \n" "uqsub v2.8b, v2.8b, v6.8b \n"
"uqsub v3.8b, v3.8b, v7.8b \n" "uqsub v3.8b, v3.8b, v7.8b \n"
MEMACCESS(2) MEMACCESS(2)
"st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB pixels. "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB pixels
"b.gt 1b \n" "b.gt 1b \n"
: "+r"(src_argb0), // %0 : "+r"(src_argb0), // %0
...@@ -3047,7 +2872,6 @@ void SobelRow_NEON(const uint8* src_sobelx, const uint8* src_sobely, ...@@ -3047,7 +2872,6 @@ void SobelRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
asm volatile ( asm volatile (
"movi v3.8b, #255 \n" // alpha "movi v3.8b, #255 \n" // alpha
// 8 pixel loop. // 8 pixel loop.
".p2align 2 \n"
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"ld1 {v0.8b}, [%0], #8 \n" // load 8 sobelx. "ld1 {v0.8b}, [%0], #8 \n" // load 8 sobelx.
...@@ -3055,10 +2879,10 @@ void SobelRow_NEON(const uint8* src_sobelx, const uint8* src_sobely, ...@@ -3055,10 +2879,10 @@ void SobelRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
"ld1 {v1.8b}, [%1], #8 \n" // load 8 sobely. "ld1 {v1.8b}, [%1], #8 \n" // load 8 sobely.
"subs %3, %3, #8 \n" // 8 processed per loop. "subs %3, %3, #8 \n" // 8 processed per loop.
"uqadd v0.8b, v0.8b, v1.8b \n" // add "uqadd v0.8b, v0.8b, v1.8b \n" // add
"mov v1.8b, v0.8b \n" "orr v1.8b, v0.8b, v0.8b \n"
"mov v2.8b, v0.8b \n" "orr v2.8b, v0.8b, v0.8b \n"
MEMACCESS(2) MEMACCESS(2)
"st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB pixels. "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB pixels
"b.gt 1b \n" "b.gt 1b \n"
: "+r"(src_sobelx), // %0 : "+r"(src_sobelx), // %0
"+r"(src_sobely), // %1 "+r"(src_sobely), // %1
...@@ -3076,7 +2900,6 @@ void SobelToPlaneRow_NEON(const uint8* src_sobelx, const uint8* src_sobely, ...@@ -3076,7 +2900,6 @@ void SobelToPlaneRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
uint8* dst_y, int width) { uint8* dst_y, int width) {
asm volatile ( asm volatile (
// 16 pixel loop. // 16 pixel loop.
".p2align 2 \n"
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"ld1 {v0.16b}, [%0], #16 \n" // load 16 sobelx. "ld1 {v0.16b}, [%0], #16 \n" // load 16 sobelx.
...@@ -3108,7 +2931,6 @@ void SobelXYRow_NEON(const uint8* src_sobelx, const uint8* src_sobely, ...@@ -3108,7 +2931,6 @@ void SobelXYRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
asm volatile ( asm volatile (
"movi v3.8b, #255 \n" // alpha "movi v3.8b, #255 \n" // alpha
// 8 pixel loop. // 8 pixel loop.
".p2align 2 \n"
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"ld1 {v2.8b}, [%0], #8 \n" // load 8 sobelx. "ld1 {v2.8b}, [%0], #8 \n" // load 8 sobelx.
...@@ -3117,7 +2939,7 @@ void SobelXYRow_NEON(const uint8* src_sobelx, const uint8* src_sobely, ...@@ -3117,7 +2939,7 @@ void SobelXYRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
"subs %3, %3, #8 \n" // 8 processed per loop. "subs %3, %3, #8 \n" // 8 processed per loop.
"uqadd v1.8b, v0.8b, v2.8b \n" // add "uqadd v1.8b, v0.8b, v2.8b \n" // add
MEMACCESS(2) MEMACCESS(2)
"st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB pixels. "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB pixels
"b.gt 1b \n" "b.gt 1b \n"
: "+r"(src_sobelx), // %0 : "+r"(src_sobelx), // %0
"+r"(src_sobely), // %1 "+r"(src_sobely), // %1
...@@ -3137,7 +2959,6 @@ void SobelXYRow_NEON(const uint8* src_sobelx, const uint8* src_sobely, ...@@ -3137,7 +2959,6 @@ void SobelXYRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
void SobelXRow_NEON(const uint8* src_y0, const uint8* src_y1, void SobelXRow_NEON(const uint8* src_y0, const uint8* src_y1,
const uint8* src_y2, uint8* dst_sobelx, int width) { const uint8* src_y2, uint8* dst_sobelx, int width) {
asm volatile ( asm volatile (
".p2align 2 \n"
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"ld1 {v0.8b}, [%0],%5 \n" // top "ld1 {v0.8b}, [%0],%5 \n" // top
...@@ -3183,7 +3004,6 @@ void SobelXRow_NEON(const uint8* src_y0, const uint8* src_y1, ...@@ -3183,7 +3004,6 @@ void SobelXRow_NEON(const uint8* src_y0, const uint8* src_y1,
void SobelYRow_NEON(const uint8* src_y0, const uint8* src_y1, void SobelYRow_NEON(const uint8* src_y0, const uint8* src_y1,
uint8* dst_sobely, int width) { uint8* dst_sobely, int width) {
asm volatile ( asm volatile (
".p2align 2 \n"
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"ld1 {v0.8b}, [%0],%4 \n" // left "ld1 {v0.8b}, [%0],%4 \n" // left
......
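For completeness, the *ToYRow_NEON kernels in the file above apply the matching fixed-point luma step. The scalar model below is inferred from their coefficient comments (13 for B, 65 for G, 33 for R, plus the add-16 constant) and the rounding narrow by 7; it is a sketch for reading the assembly, not code from this commit (uint8_t comes from <stdint.h>).

static uint8_t rgb_to_y(uint8_t r, uint8_t g, uint8_t b) {
  int y = (13 * b + 65 * g + 33 * r + 64) >> 7;  /* ~0.1016 B + 0.5078 G + 0.2578 R */
  return (uint8_t)(y + 16);                      /* studio-range offset; max is 237 */
}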
...@@ -714,7 +714,7 @@ TEST_F(libyuvTest, FMT_A##To##FMT_PLANAR##N) { \ ...@@ -714,7 +714,7 @@ TEST_F(libyuvTest, FMT_A##To##FMT_PLANAR##N) { \
benchmark_width_, DIFF, _Opt, +, 0) benchmark_width_, DIFF, _Opt, +, 0)
TESTATOPLANAR(ARGB, 4, 1, I420, 2, 2, 4) TESTATOPLANAR(ARGB, 4, 1, I420, 2, 2, 4)
#ifdef __arm__ #if defined(__arm__) || defined (__aarch64__)
TESTATOPLANAR(ARGB, 4, 1, J420, 2, 2, 4) TESTATOPLANAR(ARGB, 4, 1, J420, 2, 2, 4)
#else #else
TESTATOPLANAR(ARGB, 4, 1, J420, 2, 2, 0) TESTATOPLANAR(ARGB, 4, 1, J420, 2, 2, 0)
......