Commit de9fa43c authored by ashok.bhat@gmail.com

Row AArch64 Neon implementation - Part 1

BUG=319
TEST=libyuv_unittest
R=fbarchard@google.com

Change-Id: I367ffa7bb0fd0337ab8486d3eb4fb94afea7400c
Signed-off-by: Ashok Bhat <ashok.bhat@arm.com>

Review URL: https://webrtc-codereview.appspot.com/21149004

git-svn-id: http://libyuv.googlecode.com/svn/trunk@1044 16f28f9a-4ce2-e073-06de-1de4eb20be90
parent 26f43db1
Name: libyuv Name: libyuv
URL: http://code.google.com/p/libyuv/ URL: http://code.google.com/p/libyuv/
Version: 1041 Version: 1044
License: BSD License: BSD
License File: LICENSE License File: LICENSE
......
...@@ -252,6 +252,94 @@ extern "C" { ...@@ -252,6 +252,94 @@ extern "C" {
// The following are available on arm64 platforms: // The following are available on arm64 platforms:
#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) #if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
// #define HAS_I444TOARGBROW_NEON
// #define HAS_I422TOARGBROW_NEON
// #define HAS_I411TOARGBROW_NEON
// #define HAS_I422TOBGRAROW_NEON
// #define HAS_I422TOABGRROW_NEON
// #define HAS_I422TORGBAROW_NEON
// #define HAS_I422TORGB24ROW_NEON
// #define HAS_I422TORAWROW_NEON
// #define HAS_I422TORGB565ROW_NEON
// #define HAS_I422TOARGB1555ROW_NEON
// #define HAS_I422TOARGB4444ROW_NEON
// #define HAS_YTOARGBROW_NEON
// #define HAS_I400TOARGBROW_NEON
// #define HAS_NV12TOARGBROW_NEON
// #define HAS_NV21TOARGBROW_NEON
// #define HAS_NV12TORGB565ROW_NEON
// #define HAS_NV21TORGB565ROW_NEON
// #define HAS_YUY2TOARGBROW_NEON
// #define HAS_UYVYTOARGBROW_NEON
// #define HAS_SPLITUVROW_NEON
// #define HAS_MERGEUVROW_NEON
// #define HAS_COPYROW_NEON
// #define HAS_SETROW_NEON
// #define HAS_ARGBSETROWS_NEON
// #define HAS_MIRRORROW_NEON
// #define HAS_MIRRORUVROW_NEON
// #define HAS_ARGBMIRRORROW_NEON
// #define HAS_RGB24TOARGBROW_NEON
// #define HAS_RAWTOARGBROW_NEON
// #define HAS_RGB565TOARGBROW_NEON
// #define HAS_ARGB1555TOARGBROW_NEON
// #define HAS_ARGB4444TOARGBROW_NEON
// #define HAS_ARGBTORGB24ROW_NEON
// #define HAS_ARGBTORAWROW_NEON
// #define HAS_YUY2TOYROW_NEON
// #define HAS_UYVYTOYROW_NEON
// #define HAS_YUY2TOUV422ROW_NEON
// #define HAS_UYVYTOUV422ROW_NEON
// #define HAS_YUY2TOUVROW_NEON
// #define HAS_UYVYTOUVROW_NEON
// #define HAS_HALFROW_NEON
// #define HAS_ARGBTOBAYERROW_NEON
// #define HAS_ARGBTOBAYERGGROW_NEON
// #define HAS_ARGBSHUFFLEROW_NEON
// #define HAS_I422TOYUY2ROW_NEON
// #define HAS_I422TOUYVYROW_NEON
// #define HAS_ARGBTORGB565ROW_NEON
// #define HAS_ARGBTOARGB1555ROW_NEON
// #define HAS_ARGBTOARGB4444ROW_NEON
// #define HAS_ARGBTOYROW_NEON
// #define HAS_ARGBTOYJROW_NEON
// #define HAS_ARGBTOUV444ROW_NEON
// #define HAS_ARGBTOUV422ROW_NEON
// #define HAS_ARGBTOUV411ROW_NEON
// #define HAS_ARGBTOUVROW_NEON
// #define HAS_ARGBTOUVJROW_NEON
// #define HAS_BGRATOUVROW_NEON
// #define HAS_ABGRTOUVROW_NEON
// #define HAS_RGBATOUVROW_NEON
// #define HAS_RGB24TOUVROW_NEON
// #define HAS_RAWTOUVROW_NEON
// #define HAS_RGB565TOUVROW_NEON
// #define HAS_ARGB1555TOUVROW_NEON
// #define HAS_ARGB4444TOUVROW_NEON
// #define HAS_RGB565TOYROW_NEON
// #define HAS_ARGB1555TOYROW_NEON
// #define HAS_ARGB4444TOYROW_NEON
// #define HAS_BGRATOYROW_NEON
// #define HAS_ABGRTOYROW_NEON
// #define HAS_RGBATOYROW_NEON
// #define HAS_RGB24TOYROW_NEON
// #define HAS_RAWTOYROW_NEON
// #define HAS_INTERPOLATEROW_NEON
// #define HAS_ARGBBLENDROW_NEON
// #define HAS_ARGBATTENUATEROW_NEON
// #define HAS_ARGBQUANTIZEROW_NEON
// #define HAS_ARGBSHADEROW_NEON
// #define HAS_ARGBGRAYROW_NEON
// #define HAS_ARGBSEPIAROW_NEON
// #define HAS_ARGBCOLORMATRIXROW_NEON
// #define HAS_ARGBMULTIPLYROW_NEON
// #define HAS_ARGBADDROW_NEON
// #define HAS_ARGBSUBTRACTROW_NEON
#define HAS_SOBELROW_NEON
#define HAS_SOBELTOPLANEROW_NEON
#define HAS_SOBELXYROW_NEON
#define HAS_SOBELXROW_NEON
#define HAS_SOBELYROW_NEON
#endif #endif
// The following are available on Neon platforms: // The following are available on Neon platforms:
......
...@@ -11,6 +11,6 @@ ...@@ -11,6 +11,6 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT #ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT
#define INCLUDE_LIBYUV_VERSION_H_ #define INCLUDE_LIBYUV_VERSION_H_
#define LIBYUV_VERSION 1041 #define LIBYUV_VERSION 1044
#endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT #endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT
...@@ -3141,27 +3141,27 @@ void ARGBSubtractRow_NEON(const uint8* src_argb0, const uint8* src_argb1, ...@@ -3141,27 +3141,27 @@ void ARGBSubtractRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
void SobelRow_NEON(const uint8* src_sobelx, const uint8* src_sobely, void SobelRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
uint8* dst_argb, int width) { uint8* dst_argb, int width) {
asm volatile ( asm volatile (
"vmov.u8 d3, #255 \n" // alpha "movi v3.8b, #255 \n" // alpha
// 8 pixel loop. // 8 pixel loop.
".p2align 2 \n" ".p2align 2 \n"
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"vld1.8 {d0}, [%0]! \n" // load 8 sobelx. "ld1 {v0.8b}, [%0], #8 \n" // load 8 sobelx.
MEMACCESS(1) MEMACCESS(1)
"vld1.8 {d1}, [%1]! \n" // load 8 sobely. "ld1 {v1.8b}, [%1], #8 \n" // load 8 sobely.
"subs %3, %3, #8 \n" // 8 processed per loop. "subs %3, %3, #8 \n" // 8 processed per loop.
"vqadd.u8 d0, d0, d1 \n" // add "uqadd v0.8b, v0.8b, v1.8b \n" // add
"vmov.u8 d1, d0 \n" "mov v1.8b, v0.8b \n"
"vmov.u8 d2, d0 \n" "mov v2.8b, v0.8b \n"
MEMACCESS(2) MEMACCESS(2)
"vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels. "st4 {v0.8b-v3.8b}, [%2], #32 \n" // store 8 ARGB pixels.
"bgt 1b \n" "bgt 1b \n"
: "+r"(src_sobelx), // %0 : "+r"(src_sobelx), // %0
"+r"(src_sobely), // %1 "+r"(src_sobely), // %1
"+r"(dst_argb), // %2 "+r"(dst_argb), // %2
"+r"(width) // %3 "+r"(width) // %3
: :
: "cc", "memory", "q0", "q1" : "cc", "memory", "v0", "v1", "v2", "v3"
); );
} }
#endif // HAS_SOBELROW_NEON #endif // HAS_SOBELROW_NEON
...@@ -3175,20 +3175,20 @@ void SobelToPlaneRow_NEON(const uint8* src_sobelx, const uint8* src_sobely, ...@@ -3175,20 +3175,20 @@ void SobelToPlaneRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
".p2align 2 \n" ".p2align 2 \n"
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"vld1.8 {q0}, [%0]! \n" // load 16 sobelx. "ld1 {v0.16b}, [%0], #16 \n" // load 16 sobelx.
MEMACCESS(1) MEMACCESS(1)
"vld1.8 {q1}, [%1]! \n" // load 16 sobely. "ld1 {v1.16b}, [%1], #16 \n" // load 16 sobely.
"subs %3, %3, #16 \n" // 16 processed per loop. "subs %3, %3, #16 \n" // 16 processed per loop.
"vqadd.u8 q0, q0, q1 \n" // add "uqadd v0.16b, v0.16b, v1.16b \n" // add
MEMACCESS(2) MEMACCESS(2)
"vst1.8 {q0}, [%2]! \n" // store 16 pixels. "st1 {v0.16b}, [%2], #16 \n" // store 16 pixels.
"bgt 1b \n" "bgt 1b \n"
: "+r"(src_sobelx), // %0 : "+r"(src_sobelx), // %0
"+r"(src_sobely), // %1 "+r"(src_sobely), // %1
"+r"(dst_y), // %2 "+r"(dst_y), // %2
"+r"(width) // %3 "+r"(width) // %3
: :
: "cc", "memory", "q0", "q1" : "cc", "memory", "v0", "v1"
); );
} }
#endif // HAS_SOBELTOPLANEROW_NEON #endif // HAS_SOBELTOPLANEROW_NEON
...@@ -3202,25 +3202,25 @@ void SobelToPlaneRow_NEON(const uint8* src_sobelx, const uint8* src_sobely, ...@@ -3202,25 +3202,25 @@ void SobelToPlaneRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
void SobelXYRow_NEON(const uint8* src_sobelx, const uint8* src_sobely, void SobelXYRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
uint8* dst_argb, int width) { uint8* dst_argb, int width) {
asm volatile ( asm volatile (
"vmov.u8 d3, #255 \n" // alpha "movi v3.8b, #255 \n" // alpha
// 8 pixel loop. // 8 pixel loop.
".p2align 2 \n" ".p2align 2 \n"
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"vld1.8 {d2}, [%0]! \n" // load 8 sobelx. "ld1 {v2.8b}, [%0], #8 \n" // load 8 sobelx.
MEMACCESS(1) MEMACCESS(1)
"vld1.8 {d0}, [%1]! \n" // load 8 sobely. "ld1 {v0.8b}, [%1], #8 \n" // load 8 sobely.
"subs %3, %3, #8 \n" // 8 processed per loop. "subs %3, %3, #8 \n" // 8 processed per loop.
"vqadd.u8 d1, d0, d2 \n" // add "uqadd v1.8b, v0.8b, v2.8b \n" // add
MEMACCESS(2) MEMACCESS(2)
"vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels. "st4 {v0.8b-v3.8b}, [%2], #32 \n" // store 8 ARGB pixels.
"bgt 1b \n" "bgt 1b \n"
: "+r"(src_sobelx), // %0 : "+r"(src_sobelx), // %0
"+r"(src_sobely), // %1 "+r"(src_sobely), // %1
"+r"(dst_argb), // %2 "+r"(dst_argb), // %2
"+r"(width) // %3 "+r"(width) // %3
: :
: "cc", "memory", "q0", "q1" : "cc", "memory", "v0", "v1", "v2", "v3"
); );
} }
#endif // HAS_SOBELXYROW_NEON #endif // HAS_SOBELXYROW_NEON
...@@ -3236,28 +3236,28 @@ void SobelXRow_NEON(const uint8* src_y0, const uint8* src_y1, ...@@ -3236,28 +3236,28 @@ void SobelXRow_NEON(const uint8* src_y0, const uint8* src_y1,
".p2align 2 \n" ".p2align 2 \n"
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"vld1.8 {d0}, [%0],%5 \n" // top "ld1 {v0.8b}, [%0],%5 \n" // top
MEMACCESS(0) MEMACCESS(0)
"vld1.8 {d1}, [%0],%6 \n" "ld1 {v1.8b}, [%0],%6 \n"
"vsubl.u8 q0, d0, d1 \n" "usubl v0.8h, v0.8b, v1.8b \n"
MEMACCESS(1) MEMACCESS(1)
"vld1.8 {d2}, [%1],%5 \n" // center * 2 "ld1 {v2.8b}, [%1],%5 \n" // center * 2
MEMACCESS(1) MEMACCESS(1)
"vld1.8 {d3}, [%1],%6 \n" "ld1 {v3.8b}, [%1],%6 \n"
"vsubl.u8 q1, d2, d3 \n" "usubl v1.8h, v2.8b, v3.8b \n"
"vadd.s16 q0, q0, q1 \n" "add v0.8h, v0.8h, v1.8h \n"
"vadd.s16 q0, q0, q1 \n" "add v0.8h, v0.8h, v1.8h \n"
MEMACCESS(2) MEMACCESS(2)
"vld1.8 {d2}, [%2],%5 \n" // bottom "ld1 {v2.8b}, [%2],%5 \n" // bottom
MEMACCESS(2) MEMACCESS(2)
"vld1.8 {d3}, [%2],%6 \n" "ld1 {v3.8b}, [%2],%6 \n"
"subs %4, %4, #8 \n" // 8 pixels "subs %4, %4, #8 \n" // 8 pixels
"vsubl.u8 q1, d2, d3 \n" "usubl v1.8h, v2.8b, v3.8b \n"
"vadd.s16 q0, q0, q1 \n" "add v0.8h, v0.8h, v1.8h \n"
"vabs.s16 q0, q0 \n" "abs v0.8h, v0.8h \n"
"vqmovn.u16 d0, q0 \n" "uqxtn v0.8b, v0.8h \n"
MEMACCESS(3) MEMACCESS(3)
"vst1.8 {d0}, [%3]! \n" // store 8 sobelx "st1 {v0.8b}, [%3], #8 \n" // store 8 sobelx
"bgt 1b \n" "bgt 1b \n"
: "+r"(src_y0), // %0 : "+r"(src_y0), // %0
"+r"(src_y1), // %1 "+r"(src_y1), // %1
...@@ -3266,7 +3266,7 @@ void SobelXRow_NEON(const uint8* src_y0, const uint8* src_y1, ...@@ -3266,7 +3266,7 @@ void SobelXRow_NEON(const uint8* src_y0, const uint8* src_y1,
"+r"(width) // %4 "+r"(width) // %4
: "r"(2), // %5 : "r"(2), // %5
"r"(6) // %6 "r"(6) // %6
: "cc", "memory", "q0", "q1" // Clobber List : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
); );
} }
#endif // HAS_SOBELXROW_NEON #endif // HAS_SOBELXROW_NEON
...@@ -3282,28 +3282,28 @@ void SobelYRow_NEON(const uint8* src_y0, const uint8* src_y1, ...@@ -3282,28 +3282,28 @@ void SobelYRow_NEON(const uint8* src_y0, const uint8* src_y1,
".p2align 2 \n" ".p2align 2 \n"
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"vld1.8 {d0}, [%0],%4 \n" // left "ld1 {v0.8b}, [%0],%4 \n" // left
MEMACCESS(1) MEMACCESS(1)
"vld1.8 {d1}, [%1],%4 \n" "ld1 {v1.8b}, [%1],%4 \n"
"vsubl.u8 q0, d0, d1 \n" "usubl v0.8h, v0.8b, v1.8b \n"
MEMACCESS(0) MEMACCESS(0)
"vld1.8 {d2}, [%0],%4 \n" // center * 2 "ld1 {v2.8b}, [%0],%4 \n" // center * 2
MEMACCESS(1) MEMACCESS(1)
"vld1.8 {d3}, [%1],%4 \n" "ld1 {v3.8b}, [%1],%4 \n"
"vsubl.u8 q1, d2, d3 \n" "usubl v1.8h, v2.8b, v3.8b \n"
"vadd.s16 q0, q0, q1 \n" "add v0.8h, v0.8h, v1.8h \n"
"vadd.s16 q0, q0, q1 \n" "add v0.8h, v0.8h, v1.8h \n"
MEMACCESS(0) MEMACCESS(0)
"vld1.8 {d2}, [%0],%5 \n" // right "ld1 {v2.8b}, [%0],%5 \n" // right
MEMACCESS(1) MEMACCESS(1)
"vld1.8 {d3}, [%1],%5 \n" "ld1 {v3.8b}, [%1],%5 \n"
"subs %3, %3, #8 \n" // 8 pixels "subs %3, %3, #8 \n" // 8 pixels
"vsubl.u8 q1, d2, d3 \n" "usubl v1.8h, v2.8b, v3.8b \n"
"vadd.s16 q0, q0, q1 \n" "add v0.8h, v0.8h, v1.8h \n"
"vabs.s16 q0, q0 \n" "abs v0.8h, v0.8h \n"
"vqmovn.u16 d0, q0 \n" "uqxtn v0.8b, v0.8h \n"
MEMACCESS(2) MEMACCESS(2)
"vst1.8 {d0}, [%2]! \n" // store 8 sobely "st1 {v0.8b}, [%2], #8 \n" // store 8 sobely
"bgt 1b \n" "bgt 1b \n"
: "+r"(src_y0), // %0 : "+r"(src_y0), // %0
"+r"(src_y1), // %1 "+r"(src_y1), // %1
...@@ -3311,7 +3311,7 @@ void SobelYRow_NEON(const uint8* src_y0, const uint8* src_y1, ...@@ -3311,7 +3311,7 @@ void SobelYRow_NEON(const uint8* src_y0, const uint8* src_y1,
"+r"(width) // %3 "+r"(width) // %3
: "r"(1), // %4 : "r"(1), // %4
"r"(6) // %5 "r"(6) // %5
: "cc", "memory", "q0", "q1" // Clobber List : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
); );
} }
#endif // HAS_SOBELYROW_NEON #endif // HAS_SOBELYROW_NEON
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment