Commit cb8be2fb authored by ashok.bhat@gmail.com

Row AArch64 Neon implementation - Part 4

BUG=319
TESTED=libyuv_unittest
R=fbarchard@chromium.org, fbarchard@google.com

Change-Id: If145660d999e95246efeedb64a45ba70bf0fe23e
Signed-off-by: Ashok Bhat <ashok.bhat@arm.com>

Review URL: https://webrtc-codereview.appspot.com/13199004

git-svn-id: http://libyuv.googlecode.com/svn/trunk@1054 16f28f9a-4ce2-e073-06de-1de4eb20be90
parent 720e3a24
Name: libyuv Name: libyuv
URL: http://code.google.com/p/libyuv/ URL: http://code.google.com/p/libyuv/
Version: 1053 Version: 1054
License: BSD License: BSD
License File: LICENSE License File: LICENSE
......
...@@ -279,25 +279,25 @@ extern "C" { ...@@ -279,25 +279,25 @@ extern "C" {
// #define HAS_MIRRORROW_NEON // #define HAS_MIRRORROW_NEON
// #define HAS_MIRRORUVROW_NEON // #define HAS_MIRRORUVROW_NEON
// #define HAS_ARGBMIRRORROW_NEON // #define HAS_ARGBMIRRORROW_NEON
// #define HAS_RGB24TOARGBROW_NEON #define HAS_RGB24TOARGBROW_NEON
// #define HAS_RAWTOARGBROW_NEON #define HAS_RAWTOARGBROW_NEON
// #define HAS_RGB565TOARGBROW_NEON // #define HAS_RGB565TOARGBROW_NEON
// #define HAS_ARGB1555TOARGBROW_NEON // #define HAS_ARGB1555TOARGBROW_NEON
// #define HAS_ARGB4444TOARGBROW_NEON // #define HAS_ARGB4444TOARGBROW_NEON
// #define HAS_ARGBTORGB24ROW_NEON #define HAS_ARGBTORGB24ROW_NEON
// #define HAS_ARGBTORAWROW_NEON #define HAS_ARGBTORAWROW_NEON
// #define HAS_YUY2TOYROW_NEON #define HAS_YUY2TOYROW_NEON
// #define HAS_UYVYTOYROW_NEON #define HAS_UYVYTOYROW_NEON
// #define HAS_YUY2TOUV422ROW_NEON #define HAS_YUY2TOUV422ROW_NEON
// #define HAS_UYVYTOUV422ROW_NEON #define HAS_UYVYTOUV422ROW_NEON
// #define HAS_YUY2TOUVROW_NEON #define HAS_YUY2TOUVROW_NEON
// #define HAS_UYVYTOUVROW_NEON #define HAS_UYVYTOUVROW_NEON
// #define HAS_HALFROW_NEON #define HAS_HALFROW_NEON
// #define HAS_ARGBTOBAYERROW_NEON #define HAS_ARGBTOBAYERROW_NEON
// #define HAS_ARGBTOBAYERGGROW_NEON #define HAS_ARGBTOBAYERGGROW_NEON
// #define HAS_ARGBSHUFFLEROW_NEON #define HAS_ARGBSHUFFLEROW_NEON
// #define HAS_I422TOYUY2ROW_NEON #define HAS_I422TOYUY2ROW_NEON
// #define HAS_I422TOUYVYROW_NEON #define HAS_I422TOUYVYROW_NEON
// #define HAS_ARGBTORGB565ROW_NEON // #define HAS_ARGBTORGB565ROW_NEON
// #define HAS_ARGBTOARGB1555ROW_NEON // #define HAS_ARGBTOARGB1555ROW_NEON
// #define HAS_ARGBTOARGB4444ROW_NEON // #define HAS_ARGBTOARGB4444ROW_NEON
......
...@@ -11,6 +11,6 @@ ...@@ -11,6 +11,6 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT #ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT
#define INCLUDE_LIBYUV_VERSION_H_ #define INCLUDE_LIBYUV_VERSION_H_
#define LIBYUV_VERSION 1053 #define LIBYUV_VERSION 1054
#endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT #endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT
...@@ -79,9 +79,13 @@ YANY(I422ToARGB4444Row_Any_NEON, I422ToARGB4444Row_NEON, I422ToARGB4444Row_C, ...@@ -79,9 +79,13 @@ YANY(I422ToARGB4444Row_Any_NEON, I422ToARGB4444Row_NEON, I422ToARGB4444Row_C,
YANY(I422ToARGB1555Row_Any_NEON, I422ToARGB1555Row_NEON, I422ToARGB1555Row_C, YANY(I422ToARGB1555Row_Any_NEON, I422ToARGB1555Row_NEON, I422ToARGB1555Row_C,
1, 2, 7) 1, 2, 7)
YANY(I422ToRGB565Row_Any_NEON, I422ToRGB565Row_NEON, I422ToRGB565Row_C, 1, 2, 7) YANY(I422ToRGB565Row_Any_NEON, I422ToRGB565Row_NEON, I422ToRGB565Row_C, 1, 2, 7)
#endif // HAS_I422TOARGBROW_NEON
#ifdef HAS_I422TOYUY2ROW_NEON
YANY(I422ToYUY2Row_Any_NEON, I422ToYUY2Row_NEON, I422ToYUY2Row_C, 1, 2, 15) YANY(I422ToYUY2Row_Any_NEON, I422ToYUY2Row_NEON, I422ToYUY2Row_C, 1, 2, 15)
#endif // HAS_I422TOYUY2ROW_NEON
#ifdef HAS_I422TOUYVYROW_NEON
YANY(I422ToUYVYRow_Any_NEON, I422ToUYVYRow_NEON, I422ToUYVYRow_C, 1, 2, 15) YANY(I422ToUYVYRow_Any_NEON, I422ToUYVYRow_NEON, I422ToUYVYRow_C, 1, 2, 15)
#endif // HAS_I422TOARGBROW_NEON #endif // HAS_I422TOUYVYROW_NEON
#undef YANY #undef YANY
// Wrappers to handle odd width // Wrappers to handle odd width
...@@ -250,12 +254,26 @@ YANY(RAWToYRow_Any_NEON, RAWToYRow_NEON, 3, 1, 8) ...@@ -250,12 +254,26 @@ YANY(RAWToYRow_Any_NEON, RAWToYRow_NEON, 3, 1, 8)
YANY(RGB565ToYRow_Any_NEON, RGB565ToYRow_NEON, 2, 1, 8) YANY(RGB565ToYRow_Any_NEON, RGB565ToYRow_NEON, 2, 1, 8)
YANY(ARGB1555ToYRow_Any_NEON, ARGB1555ToYRow_NEON, 2, 1, 8) YANY(ARGB1555ToYRow_Any_NEON, ARGB1555ToYRow_NEON, 2, 1, 8)
YANY(ARGB4444ToYRow_Any_NEON, ARGB4444ToYRow_NEON, 2, 1, 8) YANY(ARGB4444ToYRow_Any_NEON, ARGB4444ToYRow_NEON, 2, 1, 8)
#endif
#ifdef HAS_YUY2TOYROW_NEON
YANY(YUY2ToYRow_Any_NEON, YUY2ToYRow_NEON, 2, 1, 16) YANY(YUY2ToYRow_Any_NEON, YUY2ToYRow_NEON, 2, 1, 16)
#endif
#ifdef HAS_UYVYTOYROW_NEON
YANY(UYVYToYRow_Any_NEON, UYVYToYRow_NEON, 2, 1, 16) YANY(UYVYToYRow_Any_NEON, UYVYToYRow_NEON, 2, 1, 16)
#endif
#ifdef HAS_RGB24TOARGBROW_NEON
YANY(RGB24ToARGBRow_Any_NEON, RGB24ToARGBRow_NEON, 3, 4, 8) YANY(RGB24ToARGBRow_Any_NEON, RGB24ToARGBRow_NEON, 3, 4, 8)
#endif
#ifdef HAS_RAWTOARGBROW_NEON
YANY(RAWToARGBRow_Any_NEON, RAWToARGBRow_NEON, 3, 4, 8) YANY(RAWToARGBRow_Any_NEON, RAWToARGBRow_NEON, 3, 4, 8)
#endif
#ifdef HAS_RGB565TOARGBROW_NEON
YANY(RGB565ToARGBRow_Any_NEON, RGB565ToARGBRow_NEON, 2, 4, 8) YANY(RGB565ToARGBRow_Any_NEON, RGB565ToARGBRow_NEON, 2, 4, 8)
#endif
#ifdef HAS_ARGB1555TOARGBROW_NEON
YANY(ARGB1555ToARGBRow_Any_NEON, ARGB1555ToARGBRow_NEON, 2, 4, 8) YANY(ARGB1555ToARGBRow_Any_NEON, ARGB1555ToARGBRow_NEON, 2, 4, 8)
#endif
#ifdef HAS_ARGB4444TOARGBROW_NEON
YANY(ARGB4444ToARGBRow_Any_NEON, ARGB4444ToARGBRow_NEON, 2, 4, 8) YANY(ARGB4444ToARGBRow_Any_NEON, ARGB4444ToARGBRow_NEON, 2, 4, 8)
#endif #endif
#undef YANY #undef YANY
...@@ -333,7 +351,11 @@ UVANY(RAWToUVRow_Any_NEON, RAWToUVRow_NEON, RAWToUVRow_C, 3, 15) ...@@ -333,7 +351,11 @@ UVANY(RAWToUVRow_Any_NEON, RAWToUVRow_NEON, RAWToUVRow_C, 3, 15)
UVANY(RGB565ToUVRow_Any_NEON, RGB565ToUVRow_NEON, RGB565ToUVRow_C, 2, 15) UVANY(RGB565ToUVRow_Any_NEON, RGB565ToUVRow_NEON, RGB565ToUVRow_C, 2, 15)
UVANY(ARGB1555ToUVRow_Any_NEON, ARGB1555ToUVRow_NEON, ARGB1555ToUVRow_C, 2, 15) UVANY(ARGB1555ToUVRow_Any_NEON, ARGB1555ToUVRow_NEON, ARGB1555ToUVRow_C, 2, 15)
UVANY(ARGB4444ToUVRow_Any_NEON, ARGB4444ToUVRow_NEON, ARGB4444ToUVRow_C, 2, 15) UVANY(ARGB4444ToUVRow_Any_NEON, ARGB4444ToUVRow_NEON, ARGB4444ToUVRow_C, 2, 15)
#endif
#ifdef HAS_YUY2TOUVROW_NEON
UVANY(YUY2ToUVRow_Any_NEON, YUY2ToUVRow_NEON, YUY2ToUVRow_C, 2, 15) UVANY(YUY2ToUVRow_Any_NEON, YUY2ToUVRow_NEON, YUY2ToUVRow_C, 2, 15)
#endif
#ifdef HAS_UYVYTOUVROW_NEON
UVANY(UYVYToUVRow_Any_NEON, UYVYToUVRow_NEON, UYVYToUVRow_C, 2, 15) UVANY(UYVYToUVRow_Any_NEON, UYVYToUVRow_NEON, UYVYToUVRow_C, 2, 15)
#endif #endif
#undef UVANY #undef UVANY
......
...@@ -1007,20 +1007,20 @@ void ARGBMirrorRow_NEON(const uint8* src, uint8* dst, int width) { ...@@ -1007,20 +1007,20 @@ void ARGBMirrorRow_NEON(const uint8* src, uint8* dst, int width) {
#ifdef HAS_RGB24TOARGBROW_NEON #ifdef HAS_RGB24TOARGBROW_NEON
void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int pix) { void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int pix) {
asm volatile ( asm volatile (
"vmov.u8 d4, #255 \n" // Alpha "movi v4.8b, #255 \n" // Alpha
".p2align 2 \n" ".p2align 2 \n"
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RGB24. "ld3 {v1.8b-v3.8b}, [%0], #24 \n" // load 8 pixels of RGB24.
"subs %2, %2, #8 \n" // 8 processed per loop. "subs %2, %2, #8 \n" // 8 processed per loop.
MEMACCESS(1) MEMACCESS(1)
"vst4.8 {d1, d2, d3, d4}, [%1]! \n" // store 8 pixels of ARGB. "st4 {v1.8b-v4.8b}, [%1], #32 \n" // store 8 pixels of ARGB.
"bgt 1b \n" "bgt 1b \n"
: "+r"(src_rgb24), // %0 : "+r"(src_rgb24), // %0
"+r"(dst_argb), // %1 "+r"(dst_argb), // %1
"+r"(pix) // %2 "+r"(pix) // %2
: :
: "cc", "memory", "d1", "d2", "d3", "d4" // Clobber List : "cc", "memory", "v1", "v2", "v3", "v4" // Clobber List
); );
} }
#endif // HAS_RGB24TOARGBROW_NEON #endif // HAS_RGB24TOARGBROW_NEON
...@@ -1028,21 +1028,22 @@ void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int pix) { ...@@ -1028,21 +1028,22 @@ void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int pix) {
#ifdef HAS_RAWTOARGBROW_NEON #ifdef HAS_RAWTOARGBROW_NEON
void RAWToARGBRow_NEON(const uint8* src_raw, uint8* dst_argb, int pix) { void RAWToARGBRow_NEON(const uint8* src_raw, uint8* dst_argb, int pix) {
asm volatile ( asm volatile (
"vmov.u8 d4, #255 \n" // Alpha "movi v5.8b, #255 \n" // Alpha
".p2align 2 \n" ".p2align 2 \n"
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RAW. "ld3 {v0.8b-v2.8b}, [%0], #24 \n" // read r g b
"subs %2, %2, #8 \n" // 8 processed per loop. "subs %2, %2, #8 \n" // 8 processed per loop.
"vswp.u8 d1, d3 \n" // swap R, B "mov v3.8b, v1.8b \n" // move g
"mov v4.8b, v0.8b \n" // move r
MEMACCESS(1) MEMACCESS(1)
"vst4.8 {d1, d2, d3, d4}, [%1]! \n" // store 8 pixels of ARGB. "st4 {v2.8b-v5.8b}, [%1], #32 \n" // store b g r a
"bgt 1b \n" "bgt 1b \n"
: "+r"(src_raw), // %0 : "+r"(src_raw), // %0
"+r"(dst_argb), // %1 "+r"(dst_argb), // %1
"+r"(pix) // %2 "+r"(pix) // %2
: :
: "cc", "memory", "d1", "d2", "d3", "d4" // Clobber List : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5" // Clobber List
); );
} }
#endif // HAS_RAWTOARGBROW_NEON #endif // HAS_RAWTOARGBROW_NEON
...@@ -1170,16 +1171,16 @@ void ARGBToRGB24Row_NEON(const uint8* src_argb, uint8* dst_rgb24, int pix) { ...@@ -1170,16 +1171,16 @@ void ARGBToRGB24Row_NEON(const uint8* src_argb, uint8* dst_rgb24, int pix) {
".p2align 2 \n" ".p2align 2 \n"
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"vld4.8 {d1, d2, d3, d4}, [%0]! \n" // load 8 pixels of ARGB. "ld4 {v1.8b-v4.8b}, [%0], #32 \n" // load 8 pixels of ARGB.
"subs %2, %2, #8 \n" // 8 processed per loop. "subs %2, %2, #8 \n" // 8 processed per loop.
MEMACCESS(1) MEMACCESS(1)
"vst3.8 {d1, d2, d3}, [%1]! \n" // store 8 pixels of RGB24. "st3 {v1.8b-v3.8b}, [%1], #24 \n" // store 8 pixels of RGB24.
"bgt 1b \n" "bgt 1b \n"
: "+r"(src_argb), // %0 : "+r"(src_argb), // %0
"+r"(dst_rgb24), // %1 "+r"(dst_rgb24), // %1
"+r"(pix) // %2 "+r"(pix) // %2
: :
: "cc", "memory", "d1", "d2", "d3", "d4" // Clobber List : "cc", "memory", "v1", "v2", "v3", "v4" // Clobber List
); );
} }
#endif // HAS_ARGBTORGB24ROW_NEON #endif // HAS_ARGBTORGB24ROW_NEON
...@@ -1190,17 +1191,18 @@ void ARGBToRAWRow_NEON(const uint8* src_argb, uint8* dst_raw, int pix) { ...@@ -1190,17 +1191,18 @@ void ARGBToRAWRow_NEON(const uint8* src_argb, uint8* dst_raw, int pix) {
".p2align 2 \n" ".p2align 2 \n"
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"vld4.8 {d1, d2, d3, d4}, [%0]! \n" // load 8 pixels of ARGB. "ld4 {v1.8b-v4.8b}, [%0], #32 \n" // load b g r a
"subs %2, %2, #8 \n" // 8 processed per loop. "subs %2, %2, #8 \n" // 8 processed per loop.
"vswp.u8 d1, d3 \n" // swap R, B "mov v4.8b, v2.8b \n" // mov g
"mov v5.8b, v1.8b \n" // mov b
MEMACCESS(1) MEMACCESS(1)
"vst3.8 {d1, d2, d3}, [%1]! \n" // store 8 pixels of RAW. "st3 {v3.8b-v5.8b}, [%1], #24 \n" // store r g b
"bgt 1b \n" "bgt 1b \n"
: "+r"(src_argb), // %0 : "+r"(src_argb), // %0
"+r"(dst_raw), // %1 "+r"(dst_raw), // %1
"+r"(pix) // %2 "+r"(pix) // %2
: :
: "cc", "memory", "d1", "d2", "d3", "d4" // Clobber List : "cc", "memory", "v1", "v2", "v3", "v4", "v5" // Clobber List
); );
} }
#endif // HAS_ARGBTORAWROW_NEON #endif // HAS_ARGBTORAWROW_NEON
...@@ -1211,16 +1213,16 @@ void YUY2ToYRow_NEON(const uint8* src_yuy2, uint8* dst_y, int pix) { ...@@ -1211,16 +1213,16 @@ void YUY2ToYRow_NEON(const uint8* src_yuy2, uint8* dst_y, int pix) {
".p2align 2 \n" ".p2align 2 \n"
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"vld2.8 {q0, q1}, [%0]! \n" // load 16 pixels of YUY2. "ld2 {v0.16b, v1.16b}, [%0], #32 \n" // load 16 pixels of YUY2.
"subs %2, %2, #16 \n" // 16 processed per loop. "subs %2, %2, #16 \n" // 16 processed per loop.
MEMACCESS(1) MEMACCESS(1)
"vst1.8 {q0}, [%1]! \n" // store 16 pixels of Y. "st1 {v0.16b}, [%1], #16 \n" // store 16 pixels of Y.
"bgt 1b \n" "bgt 1b \n"
: "+r"(src_yuy2), // %0 : "+r"(src_yuy2), // %0
"+r"(dst_y), // %1 "+r"(dst_y), // %1
"+r"(pix) // %2 "+r"(pix) // %2
: :
: "cc", "memory", "q0", "q1" // Clobber List : "cc", "memory", "v0", "v1" // Clobber List
); );
} }
#endif // HAS_YUY2TOYROW_NEON #endif // HAS_YUY2TOYROW_NEON
...@@ -1231,16 +1233,16 @@ void UYVYToYRow_NEON(const uint8* src_uyvy, uint8* dst_y, int pix) { ...@@ -1231,16 +1233,16 @@ void UYVYToYRow_NEON(const uint8* src_uyvy, uint8* dst_y, int pix) {
".p2align 2 \n" ".p2align 2 \n"
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"vld2.8 {q0, q1}, [%0]! \n" // load 16 pixels of UYVY. "ld2 {v0.16b, v1.16b}, [%0], #32 \n" // load 16 pixels of UYVY.
"subs %2, %2, #16 \n" // 16 processed per loop. "subs %2, %2, #16 \n" // 16 processed per loop.
MEMACCESS(1) MEMACCESS(1)
"vst1.8 {q1}, [%1]! \n" // store 16 pixels of Y. "st1 {v1.16b}, [%1], #16 \n" // store 16 pixels of Y.
"bgt 1b \n" "bgt 1b \n"
: "+r"(src_uyvy), // %0 : "+r"(src_uyvy), // %0
"+r"(dst_y), // %1 "+r"(dst_y), // %1
"+r"(pix) // %2 "+r"(pix) // %2
: :
: "cc", "memory", "q0", "q1" // Clobber List : "cc", "memory", "v0", "v1" // Clobber List
); );
} }
#endif // HAS_UYVYTOYROW_NEON #endif // HAS_UYVYTOYROW_NEON
...@@ -1252,19 +1254,19 @@ void YUY2ToUV422Row_NEON(const uint8* src_yuy2, uint8* dst_u, uint8* dst_v, ...@@ -1252,19 +1254,19 @@ void YUY2ToUV422Row_NEON(const uint8* src_yuy2, uint8* dst_u, uint8* dst_v,
".p2align 2 \n" ".p2align 2 \n"
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of YUY2. "ld4 {v0.8b-v3.8b}, [%0], #32 \n" // load 16 pixels of YUY2.
"subs %3, %3, #16 \n" // 16 pixels = 8 UVs. "subs %3, %3, #16 \n" // 16 pixels = 8 UVs.
MEMACCESS(1) MEMACCESS(1)
"vst1.8 {d1}, [%1]! \n" // store 8 U. "st1 {v1.8b}, [%1], #8 \n" // store 8 U.
MEMACCESS(2) MEMACCESS(2)
"vst1.8 {d3}, [%2]! \n" // store 8 V. "st1 {v3.8b}, [%2], #8 \n" // store 8 V.
"bgt 1b \n" "bgt 1b \n"
: "+r"(src_yuy2), // %0 : "+r"(src_yuy2), // %0
"+r"(dst_u), // %1 "+r"(dst_u), // %1
"+r"(dst_v), // %2 "+r"(dst_v), // %2
"+r"(pix) // %3 "+r"(pix) // %3
: :
: "cc", "memory", "d0", "d1", "d2", "d3" // Clobber List : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
); );
} }
#endif // HAS_YUY2TOUV422ROW_NEON #endif // HAS_YUY2TOUV422ROW_NEON
...@@ -1276,19 +1278,19 @@ void UYVYToUV422Row_NEON(const uint8* src_uyvy, uint8* dst_u, uint8* dst_v, ...@@ -1276,19 +1278,19 @@ void UYVYToUV422Row_NEON(const uint8* src_uyvy, uint8* dst_u, uint8* dst_v,
".p2align 2 \n" ".p2align 2 \n"
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of UYVY. "ld4 {v0.8b-v3.8b}, [%0], #32 \n" // load 16 pixels of UYVY.
"subs %3, %3, #16 \n" // 16 pixels = 8 UVs. "subs %3, %3, #16 \n" // 16 pixels = 8 UVs.
MEMACCESS(1) MEMACCESS(1)
"vst1.8 {d0}, [%1]! \n" // store 8 U. "st1 {v0.8b}, [%1], #8 \n" // store 8 U.
MEMACCESS(2) MEMACCESS(2)
"vst1.8 {d2}, [%2]! \n" // store 8 V. "st1 {v2.8b}, [%2], #8 \n" // store 8 V.
"bgt 1b \n" "bgt 1b \n"
: "+r"(src_uyvy), // %0 : "+r"(src_uyvy), // %0
"+r"(dst_u), // %1 "+r"(dst_u), // %1
"+r"(dst_v), // %2 "+r"(dst_v), // %2
"+r"(pix) // %3 "+r"(pix) // %3
: :
: "cc", "memory", "d0", "d1", "d2", "d3" // Clobber List : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
); );
} }
#endif // HAS_UYVYTOUV422ROW_NEON #endif // HAS_UYVYTOUV422ROW_NEON
...@@ -1297,20 +1299,20 @@ void UYVYToUV422Row_NEON(const uint8* src_uyvy, uint8* dst_u, uint8* dst_v, ...@@ -1297,20 +1299,20 @@ void UYVYToUV422Row_NEON(const uint8* src_uyvy, uint8* dst_u, uint8* dst_v,
void YUY2ToUVRow_NEON(const uint8* src_yuy2, int stride_yuy2, void YUY2ToUVRow_NEON(const uint8* src_yuy2, int stride_yuy2,
uint8* dst_u, uint8* dst_v, int pix) { uint8* dst_u, uint8* dst_v, int pix) {
asm volatile ( asm volatile (
"add %1, %0, %1 \n" // stride + src_yuy2 "add %x1, %x0, %w1, sxtw \n" // stride + src_yuy2
".p2align 2 \n" ".p2align 2 \n"
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of YUY2. "ld4 {v0.8b-v3.8b}, [%0], #32 \n" // load 16 pixels of YUY2.
"subs %4, %4, #16 \n" // 16 pixels = 8 UVs. "subs %4, %4, #16 \n" // 16 pixels = 8 UVs.
MEMACCESS(1) MEMACCESS(1)
"vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load next row YUY2. "ld4 {v4.8b-v7.8b}, [%1], #32 \n" // load next row YUY2.
"vrhadd.u8 d1, d1, d5 \n" // average rows of U "urhadd v1.8b, v1.8b, v5.8b \n" // average rows of U
"vrhadd.u8 d3, d3, d7 \n" // average rows of V "urhadd v3.8b, v3.8b, v7.8b \n" // average rows of V
MEMACCESS(2) MEMACCESS(2)
"vst1.8 {d1}, [%2]! \n" // store 8 U. "st1 {v1.8b}, [%2], #8 \n" // store 8 U.
MEMACCESS(3) MEMACCESS(3)
"vst1.8 {d3}, [%3]! \n" // store 8 V. "st1 {v3.8b}, [%3], #8 \n" // store 8 V.
"bgt 1b \n" "bgt 1b \n"
: "+r"(src_yuy2), // %0 : "+r"(src_yuy2), // %0
"+r"(stride_yuy2), // %1 "+r"(stride_yuy2), // %1
...@@ -1318,7 +1320,7 @@ void YUY2ToUVRow_NEON(const uint8* src_yuy2, int stride_yuy2, ...@@ -1318,7 +1320,7 @@ void YUY2ToUVRow_NEON(const uint8* src_yuy2, int stride_yuy2,
"+r"(dst_v), // %3 "+r"(dst_v), // %3
"+r"(pix) // %4 "+r"(pix) // %4
: :
: "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7" // Clobber List : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7" // Clobber List
); );
} }
#endif // HAS_YUY2TOUVROW_NEON #endif // HAS_YUY2TOUVROW_NEON
...@@ -1327,20 +1329,20 @@ void YUY2ToUVRow_NEON(const uint8* src_yuy2, int stride_yuy2, ...@@ -1327,20 +1329,20 @@ void YUY2ToUVRow_NEON(const uint8* src_yuy2, int stride_yuy2,
void UYVYToUVRow_NEON(const uint8* src_uyvy, int stride_uyvy, void UYVYToUVRow_NEON(const uint8* src_uyvy, int stride_uyvy,
uint8* dst_u, uint8* dst_v, int pix) { uint8* dst_u, uint8* dst_v, int pix) {
asm volatile ( asm volatile (
"add %1, %0, %1 \n" // stride + src_uyvy "add %x1, %x0, %w1, sxtw \n" // stride + src_uyvy
".p2align 2 \n" ".p2align 2 \n"
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of UYVY. "ld4 {v0.8b-v3.8b}, [%0], #32 \n" // load 16 pixels of UYVY.
"subs %4, %4, #16 \n" // 16 pixels = 8 UVs. "subs %4, %4, #16 \n" // 16 pixels = 8 UVs.
MEMACCESS(1) MEMACCESS(1)
"vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load next row UYVY. "ld4 {v4.8b-v7.8b}, [%1], #32 \n" // load next row UYVY.
"vrhadd.u8 d0, d0, d4 \n" // average rows of U "urhadd v0.8b, v0.8b, v4.8b \n" // average rows of U
"vrhadd.u8 d2, d2, d6 \n" // average rows of V "urhadd v2.8b, v2.8b, v6.8b \n" // average rows of V
MEMACCESS(2) MEMACCESS(2)
"vst1.8 {d0}, [%2]! \n" // store 8 U. "st1 {v0.8b}, [%2], #8 \n" // store 8 U.
MEMACCESS(3) MEMACCESS(3)
"vst1.8 {d2}, [%3]! \n" // store 8 V. "st1 {v2.8b}, [%3], #8 \n" // store 8 V.
"bgt 1b \n" "bgt 1b \n"
: "+r"(src_uyvy), // %0 : "+r"(src_uyvy), // %0
"+r"(stride_uyvy), // %1 "+r"(stride_uyvy), // %1
...@@ -1348,7 +1350,7 @@ void UYVYToUVRow_NEON(const uint8* src_uyvy, int stride_uyvy, ...@@ -1348,7 +1350,7 @@ void UYVYToUVRow_NEON(const uint8* src_uyvy, int stride_uyvy,
"+r"(dst_v), // %3 "+r"(dst_v), // %3
"+r"(pix) // %4 "+r"(pix) // %4
: :
: "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7" // Clobber List : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7" // Clobber List
); );
} }
#endif // HAS_UYVYTOUVROW_NEON #endif // HAS_UYVYTOUVROW_NEON
...@@ -1358,23 +1360,23 @@ void HalfRow_NEON(const uint8* src_uv, int src_uv_stride, ...@@ -1358,23 +1360,23 @@ void HalfRow_NEON(const uint8* src_uv, int src_uv_stride,
uint8* dst_uv, int pix) { uint8* dst_uv, int pix) {
asm volatile ( asm volatile (
// change the stride to row 2 pointer // change the stride to row 2 pointer
"add %1, %0 \n" "add %x1, %x0, %w1, sxtw \n"
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"vld1.8 {q0}, [%0]! \n" // load row 1 16 pixels. "ld1 {v0.16b}, [%0], #16 \n" // load row 1 16 pixels.
"subs %3, %3, #16 \n" // 16 processed per loop "subs %3, %3, #16 \n" // 16 processed per loop
MEMACCESS(1) MEMACCESS(1)
"vld1.8 {q1}, [%1]! \n" // load row 2 16 pixels. "ld1 {v1.16b}, [%1], #16 \n" // load row 2 16 pixels.
"vrhadd.u8 q0, q1 \n" // average row 1 and 2 "urhadd v0.16b, v0.16b, v1.16b \n" // average row 1 and 2
MEMACCESS(2) MEMACCESS(2)
"vst1.8 {q0}, [%2]! \n" "st1 {v0.16b}, [%2], #16 \n"
"bgt 1b \n" "bgt 1b \n"
: "+r"(src_uv), // %0 : "+r"(src_uv), // %0
"+r"(src_uv_stride), // %1 "+r"(src_uv_stride), // %1
"+r"(dst_uv), // %2 "+r"(dst_uv), // %2
"+r"(pix) // %3 "+r"(pix) // %3
: :
: "cc", "memory", "q0", "q1" // Clobber List : "cc", "memory", "v0", "v1" // Clobber List
); );
} }
#endif // HAS_HALFROW_NEON #endif // HAS_HALFROW_NEON
...@@ -1384,22 +1386,22 @@ void HalfRow_NEON(const uint8* src_uv, int src_uv_stride, ...@@ -1384,22 +1386,22 @@ void HalfRow_NEON(const uint8* src_uv, int src_uv_stride,
void ARGBToBayerRow_NEON(const uint8* src_argb, uint8* dst_bayer, void ARGBToBayerRow_NEON(const uint8* src_argb, uint8* dst_bayer,
uint32 selector, int pix) { uint32 selector, int pix) {
asm volatile ( asm volatile (
"vmov.u32 d6[0], %3 \n" // selector "mov v2.s[0], %w3 \n" // selector
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"vld1.8 {q0, q1}, [%0]! \n" // load row 8 pixels. "ld1 {v0.16b, v1.16b}, [%0], 32 \n" // load row 8 pixels.
"subs %2, %2, #8 \n" // 8 processed per loop "subs %2, %2, #8 \n" // 8 processed per loop
"vtbl.8 d4, {d0, d1}, d6 \n" // look up 4 pixels "tbl v4.8b, {v0.16b}, v2.8b \n" // look up 4 pixels
"vtbl.8 d5, {d2, d3}, d6 \n" // look up 4 pixels "tbl v5.8b, {v1.16b}, v2.8b \n" // look up 4 pixels
"vtrn.u32 d4, d5 \n" // combine 8 pixels "trn1 v4.4s, v4.4s, v5.4s \n" // combine 8 pixels
MEMACCESS(1) MEMACCESS(1)
"vst1.8 {d4}, [%1]! \n" // store 8. "st1 {v4.8b}, [%1], #8 \n" // store 8.
"bgt 1b \n" "bgt 1b \n"
: "+r"(src_argb), // %0 : "+r"(src_argb), // %0
"+r"(dst_bayer), // %1 "+r"(dst_bayer), // %1
"+r"(pix) // %2 "+r"(pix) // %2
: "r"(selector) // %3 : "r"(selector) // %3
: "cc", "memory", "q0", "q1", "q2", "q3" // Clobber List : "cc", "memory", "v0", "v1", "v2", "v4", "v5" // Clobber List
); );
} }
#endif // HAS_ARGBTOBAYERROW_NEON #endif // HAS_ARGBTOBAYERROW_NEON
...@@ -1411,16 +1413,16 @@ void ARGBToBayerGGRow_NEON(const uint8* src_argb, uint8* dst_bayer, ...@@ -1411,16 +1413,16 @@ void ARGBToBayerGGRow_NEON(const uint8* src_argb, uint8* dst_bayer,
asm volatile ( asm volatile (
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load row 8 pixels. "ld4 {v0.8b-v3.8b}, [%0], #32 \n" // load row 8 pixels.
"subs %2, %2, #8 \n" // 8 processed per loop "subs %2, %2, #8 \n" // 8 processed per loop
MEMACCESS(1) MEMACCESS(1)
"vst1.8 {d1}, [%1]! \n" // store 8 G's. "st1 {v1.8b}, [%1], #8 \n" // store 8 G's.
"bgt 1b \n" "bgt 1b \n"
: "+r"(src_argb), // %0 : "+r"(src_argb), // %0
"+r"(dst_bayer), // %1 "+r"(dst_bayer), // %1
"+r"(pix) // %2 "+r"(pix) // %2
: :
: "cc", "memory", "q0", "q1" // Clobber List : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
); );
} }
#endif // HAS_ARGBTOBAYERGGROW_NEON #endif // HAS_ARGBTOBAYERGGROW_NEON
...@@ -1431,21 +1433,20 @@ void ARGBShuffleRow_NEON(const uint8* src_argb, uint8* dst_argb, ...@@ -1431,21 +1433,20 @@ void ARGBShuffleRow_NEON(const uint8* src_argb, uint8* dst_argb,
const uint8* shuffler, int pix) { const uint8* shuffler, int pix) {
asm volatile ( asm volatile (
MEMACCESS(3) MEMACCESS(3)
"vld1.8 {q2}, [%3] \n" // shuffler "ld1 {v2.16b}, [%3] \n" // shuffler
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"vld1.8 {q0}, [%0]! \n" // load 4 pixels. "ld1 {v0.16b}, [%0], #16 \n" // load 4 pixels.
"subs %2, %2, #4 \n" // 4 processed per loop "subs %2, %2, #4 \n" // 4 processed per loop
"vtbl.8 d2, {d0, d1}, d4 \n" // look up 2 first pixels "tbl v1.16b, {v0.16b}, v2.16b \n" // look up 4 pixels
"vtbl.8 d3, {d0, d1}, d5 \n" // look up 2 next pixels
MEMACCESS(1) MEMACCESS(1)
"vst1.8 {q1}, [%1]! \n" // store 4. "st1 {v1.16b}, [%1], #16 \n" // store 4.
"bgt 1b \n" "bgt 1b \n"
: "+r"(src_argb), // %0 : "+r"(src_argb), // %0
"+r"(dst_argb), // %1 "+r"(dst_argb), // %1
"+r"(pix) // %2 "+r"(pix) // %2
: "r"(shuffler) // %3 : "r"(shuffler) // %3
: "cc", "memory", "q0", "q1", "q2" // Clobber List : "cc", "memory", "v0", "v1", "v2" // Clobber List
); );
} }
#endif // HAS_ARGBSHUFFLEROW_NEON #endif // HAS_ARGBSHUFFLEROW_NEON
...@@ -1459,14 +1460,15 @@ void I422ToYUY2Row_NEON(const uint8* src_y, ...@@ -1459,14 +1460,15 @@ void I422ToYUY2Row_NEON(const uint8* src_y,
".p2align 2 \n" ".p2align 2 \n"
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"vld2.8 {d0, d2}, [%0]! \n" // load 16 Ys "ld2 {v0.8b, v1.8b}, [%0], #16 \n" // load 16 Ys
"mov v2.8b, v1.8b \n"
MEMACCESS(1) MEMACCESS(1)
"vld1.8 {d1}, [%1]! \n" // load 8 Us "ld1 {v1.8b}, [%1], #8 \n" // load 8 Us
MEMACCESS(2) MEMACCESS(2)
"vld1.8 {d3}, [%2]! \n" // load 8 Vs "ld1 {v3.8b}, [%2], #8 \n" // load 8 Vs
"subs %4, %4, #16 \n" // 16 pixels "subs %4, %4, #16 \n" // 16 pixels
MEMACCESS(3) MEMACCESS(3)
"vst4.8 {d0, d1, d2, d3}, [%3]! \n" // Store 8 YUY2/16 pixels. "st4 {v0.8b-v3.8b}, [%3], #32 \n" // Store 8 YUY2/16 pixels.
"bgt 1b \n" "bgt 1b \n"
: "+r"(src_y), // %0 : "+r"(src_y), // %0
"+r"(src_u), // %1 "+r"(src_u), // %1
...@@ -1474,7 +1476,7 @@ void I422ToYUY2Row_NEON(const uint8* src_y, ...@@ -1474,7 +1476,7 @@ void I422ToYUY2Row_NEON(const uint8* src_y,
"+r"(dst_yuy2), // %3 "+r"(dst_yuy2), // %3
"+r"(width) // %4 "+r"(width) // %4
: :
: "cc", "memory", "d0", "d1", "d2", "d3" : "cc", "memory", "v0", "v1", "v2", "v3"
); );
} }
#endif // HAS_I422TOYUY2ROW_NEON #endif // HAS_I422TOYUY2ROW_NEON
...@@ -1488,14 +1490,15 @@ void I422ToUYVYRow_NEON(const uint8* src_y, ...@@ -1488,14 +1490,15 @@ void I422ToUYVYRow_NEON(const uint8* src_y,
".p2align 2 \n" ".p2align 2 \n"
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"vld2.8 {d1, d3}, [%0]! \n" // load 16 Ys "ld2 {v1.8b, v2.8b}, [%0], #16 \n" // load 16 Ys
"mov v3.8b, v2.8b \n"
MEMACCESS(1) MEMACCESS(1)
"vld1.8 {d0}, [%1]! \n" // load 8 Us "ld1 {v0.8b}, [%1], #8 \n" // load 8 Us
MEMACCESS(2) MEMACCESS(2)
"vld1.8 {d2}, [%2]! \n" // load 8 Vs "ld1 {v2.8b}, [%2], #8 \n" // load 8 Vs
"subs %4, %4, #16 \n" // 16 pixels "subs %4, %4, #16 \n" // 16 pixels
MEMACCESS(3) MEMACCESS(3)
"vst4.8 {d0, d1, d2, d3}, [%3]! \n" // Store 8 UYVY/16 pixels. "st4 {v0.8b-v3.8b}, [%3], #32 \n" // Store 8 UYVY/16 pixels.
"bgt 1b \n" "bgt 1b \n"
: "+r"(src_y), // %0 : "+r"(src_y), // %0
"+r"(src_u), // %1 "+r"(src_u), // %1
...@@ -1503,7 +1506,7 @@ void I422ToUYVYRow_NEON(const uint8* src_y, ...@@ -1503,7 +1506,7 @@ void I422ToUYVYRow_NEON(const uint8* src_y,
"+r"(dst_uyvy), // %3 "+r"(dst_uyvy), // %3
"+r"(width) // %4 "+r"(width) // %4
: :
: "cc", "memory", "d0", "d1", "d2", "d3" : "cc", "memory", "v0", "v1", "v2", "v3"
); );
} }
#endif // HAS_I422TOUYVYROW_NEON #endif // HAS_I422TOUYVYROW_NEON
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment