Commit c1155cb5 authored by ashok.bhat@gmail.com

Row AArch64 Neon implementation - Part 3

BUG=319
TESTED=libyuv_unittest
R=fbarchard@google.com

Change-Id: Ia818ca62d4a84d76b0144f904983d82d41cab651
Signed-off-by: Ashok Bhat <ashok.bhat@arm.com>

Review URL: https://webrtc-codereview.appspot.com/15149004

git-svn-id: http://libyuv.googlecode.com/svn/trunk@1056 16f28f9a-4ce2-e073-06de-1de4eb20be90
parent 8f04ca5b
Name: libyuv Name: libyuv
URL: http://code.google.com/p/libyuv/ URL: http://code.google.com/p/libyuv/
Version: 1055 Version: 1056
License: BSD License: BSD
License File: LICENSE License File: LICENSE
......
...@@ -271,14 +271,14 @@ extern "C" { ...@@ -271,14 +271,14 @@ extern "C" {
// #define HAS_NV21TORGB565ROW_NEON // #define HAS_NV21TORGB565ROW_NEON
// #define HAS_YUY2TOARGBROW_NEON // #define HAS_YUY2TOARGBROW_NEON
// #define HAS_UYVYTOARGBROW_NEON // #define HAS_UYVYTOARGBROW_NEON
// #define HAS_SPLITUVROW_NEON #define HAS_SPLITUVROW_NEON
// #define HAS_MERGEUVROW_NEON #define HAS_MERGEUVROW_NEON
// #define HAS_COPYROW_NEON #define HAS_COPYROW_NEON
// #define HAS_SETROW_NEON #define HAS_SETROW_NEON
// #define HAS_ARGBSETROWS_NEON #define HAS_ARGBSETROWS_NEON
// #define HAS_MIRRORROW_NEON #define HAS_MIRRORROW_NEON
// #define HAS_MIRRORUVROW_NEON #define HAS_MIRRORUVROW_NEON
// #define HAS_ARGBMIRRORROW_NEON #define HAS_ARGBMIRRORROW_NEON
#define HAS_RGB24TOARGBROW_NEON #define HAS_RGB24TOARGBROW_NEON
#define HAS_RAWTOARGBROW_NEON #define HAS_RAWTOARGBROW_NEON
// #define HAS_RGB565TOARGBROW_NEON // #define HAS_RGB565TOARGBROW_NEON
......
...@@ -11,6 +11,6 @@ ...@@ -11,6 +11,6 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT #ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT
#define INCLUDE_LIBYUV_VERSION_H_ #define INCLUDE_LIBYUV_VERSION_H_
#define LIBYUV_VERSION 1055 #define LIBYUV_VERSION 1056
#endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT #endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT
...@@ -824,19 +824,19 @@ void SplitUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v, ...@@ -824,19 +824,19 @@ void SplitUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
".p2align 2 \n" ".p2align 2 \n"
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"vld2.8 {q0, q1}, [%0]! \n" // load 16 pairs of UV "ld2 {v0.16b, v1.16b}, [%0], #32 \n" // load 16 pairs of UV
"subs %3, %3, #16 \n" // 16 processed per loop "subs %3, %3, #16 \n" // 16 processed per loop
MEMACCESS(1) MEMACCESS(1)
"vst1.8 {q0}, [%1]! \n" // store U "st1 {v0.16b}, [%1], #16 \n" // store U
MEMACCESS(2) MEMACCESS(2)
"vst1.8 {q1}, [%2]! \n" // store V "st1 {v1.16b}, [%2], #16 \n" // store V
"bgt 1b \n" "bgt 1b \n"
: "+r"(src_uv), // %0 : "+r"(src_uv), // %0
"+r"(dst_u), // %1 "+r"(dst_u), // %1
"+r"(dst_v), // %2 "+r"(dst_v), // %2
"+r"(width) // %3 // Output registers "+r"(width) // %3 // Output registers
: // Input registers : // Input registers
: "cc", "memory", "q0", "q1" // Clobber List : "cc", "memory", "v0", "v1" // Clobber List
); );
} }
#endif // HAS_SPLITUVROW_NEON #endif // HAS_SPLITUVROW_NEON
...@@ -849,12 +849,12 @@ void MergeUVRow_NEON(const uint8* src_u, const uint8* src_v, uint8* dst_uv, ...@@ -849,12 +849,12 @@ void MergeUVRow_NEON(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
".p2align 2 \n" ".p2align 2 \n"
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"vld1.8 {q0}, [%0]! \n" // load U "ld1 {v0.16b}, [%0], #16 \n" // load U
MEMACCESS(1) MEMACCESS(1)
"vld1.8 {q1}, [%1]! \n" // load V "ld1 {v1.16b}, [%1], #16 \n" // load V
"subs %3, %3, #16 \n" // 16 processed per loop "subs %3, %3, #16 \n" // 16 processed per loop
MEMACCESS(2) MEMACCESS(2)
"vst2.u8 {q0, q1}, [%2]! \n" // store 16 pairs of UV "st2 {v0.16b, v1.16b}, [%2], #32 \n" // store 16 pairs of UV
"bgt 1b \n" "bgt 1b \n"
: :
"+r"(src_u), // %0 "+r"(src_u), // %0
...@@ -862,7 +862,7 @@ void MergeUVRow_NEON(const uint8* src_u, const uint8* src_v, uint8* dst_uv, ...@@ -862,7 +862,7 @@ void MergeUVRow_NEON(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
"+r"(dst_uv), // %2 "+r"(dst_uv), // %2
"+r"(width) // %3 // Output registers "+r"(width) // %3 // Output registers
: // Input registers : // Input registers
: "cc", "memory", "q0", "q1" // Clobber List : "cc", "memory", "v0", "v1" // Clobber List
); );
} }
#endif // HAS_MERGEUVROW_NEON #endif // HAS_MERGEUVROW_NEON
...@@ -874,16 +874,16 @@ void CopyRow_NEON(const uint8* src, uint8* dst, int count) { ...@@ -874,16 +874,16 @@ void CopyRow_NEON(const uint8* src, uint8* dst, int count) {
".p2align 2 \n" ".p2align 2 \n"
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"vld1.8 {d0, d1, d2, d3}, [%0]! \n" // load 32 "ld1 {v0.8b-v3.8b}, [%0], #32 \n" // load 32
"subs %2, %2, #32 \n" // 32 processed per loop "subs %2, %2, #32 \n" // 32 processed per loop
MEMACCESS(1) MEMACCESS(1)
"vst1.8 {d0, d1, d2, d3}, [%1]! \n" // store 32 "st1 {v0.8b-v3.8b}, [%1], #32 \n" // store 32
"bgt 1b \n" "bgt 1b \n"
: "+r"(src), // %0 : "+r"(src), // %0
"+r"(dst), // %1 "+r"(dst), // %1
"+r"(count) // %2 // Output registers "+r"(count) // %2 // Output registers
: // Input registers : // Input registers
: "cc", "memory", "q0", "q1" // Clobber List : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
); );
} }
#endif // HAS_COPYROW_NEON #endif // HAS_COPYROW_NEON
...@@ -892,16 +892,16 @@ void CopyRow_NEON(const uint8* src, uint8* dst, int count) { ...@@ -892,16 +892,16 @@ void CopyRow_NEON(const uint8* src, uint8* dst, int count) {
#ifdef HAS_SETROW_NEON #ifdef HAS_SETROW_NEON
void SetRow_NEON(uint8* dst, uint32 v32, int count) { void SetRow_NEON(uint8* dst, uint32 v32, int count) {
asm volatile ( asm volatile (
"vdup.u32 q0, %2 \n" // duplicate 4 ints "dup v0.4s, %w2 \n" // duplicate 4 ints
"1: \n" "1: \n"
"subs %1, %1, #16 \n" // 16 bytes per loop "subs %1, %1, #16 \n" // 16 bytes per loop
MEMACCESS(0) MEMACCESS(0)
"vst1.8 {q0}, [%0]! \n" // store "st1 {v0.16b}, [%0], #16 \n" // store
"bgt 1b \n" "bgt 1b \n"
: "+r"(dst), // %0 : "+r"(dst), // %0
"+r"(count) // %1 "+r"(count) // %1
: "r"(v32) // %2 : "r"(v32) // %2
: "cc", "memory", "q0" : "cc", "memory", "v0"
); );
} }
#endif // HAS_SETROW_NEON #endif // HAS_SETROW_NEON
...@@ -922,26 +922,25 @@ void ARGBSetRows_NEON(uint8* dst, uint32 v32, int width, ...@@ -922,26 +922,25 @@ void ARGBSetRows_NEON(uint8* dst, uint32 v32, int width,
void MirrorRow_NEON(const uint8* src, uint8* dst, int width) { void MirrorRow_NEON(const uint8* src, uint8* dst, int width) {
asm volatile ( asm volatile (
// Start at end of source row. // Start at end of source row.
"mov r3, #-16 \n"
"add %0, %0, %2 \n" "add %0, %0, %2 \n"
"sub %0, #16 \n" "sub %0, %0, #16 \n"
".p2align 2 \n" ".p2align 2 \n"
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"vld1.8 {q0}, [%0], r3 \n" // src -= 16 "ld1 {v0.16b}, [%0], %3 \n" // src -= 16
"subs %2, #16 \n" // 16 pixels per loop. "subs %2, %2, #16 \n" // 16 pixels per loop.
"vrev64.8 q0, q0 \n" "rev64 v0.16b, v0.16b \n"
MEMACCESS(1) MEMACCESS(1)
"vst1.8 {d1}, [%1]! \n" // dst += 16 "st1 {v0.D}[1], [%1], #8 \n" // dst += 16
MEMACCESS(1) MEMACCESS(1)
"vst1.8 {d0}, [%1]! \n" "st1 {v0.D}[0], [%1], #8 \n"
"bgt 1b \n" "bgt 1b \n"
: "+r"(src), // %0 : "+r"(src), // %0
"+r"(dst), // %1 "+r"(dst), // %1
"+r"(width) // %2 "+r"(width) // %2
: : "r"((ptrdiff_t)-16) // %3
: "cc", "memory", "r3", "q0" : "cc", "memory", "v0"
); );
} }
#endif // HAS_MIRRORROW_NEON #endif // HAS_MIRRORROW_NEON
...@@ -951,27 +950,27 @@ void MirrorUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v, ...@@ -951,27 +950,27 @@ void MirrorUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
int width) { int width) {
asm volatile ( asm volatile (
// Start at end of source row. // Start at end of source row.
"mov r12, #-16 \n"
"add %0, %0, %3, lsl #1 \n" "add %0, %0, %3, lsl #1 \n"
"sub %0, #16 \n" "sub %0, %0, #16 \n"
".p2align 2 \n" ".p2align 2 \n"
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"vld2.8 {d0, d1}, [%0], r12 \n" // src -= 16 "ld2 {v0.8b, v1.8b}, [%0], %4 \n" // src -= 16
"subs %3, #8 \n" // 8 pixels per loop. "subs %3, %3, #8 \n" // 8 pixels per loop.
"vrev64.8 q0, q0 \n" "rev64 v0.8b, v0.8b \n"
"rev64 v1.8b, v1.8b \n"
MEMACCESS(1) MEMACCESS(1)
"vst1.8 {d0}, [%1]! \n" // dst += 8 "st1 {v0.8b}, [%1], #8 \n" // dst += 8
MEMACCESS(2) MEMACCESS(2)
"vst1.8 {d1}, [%2]! \n" "st1 {v1.8b}, [%2], #8 \n"
"bgt 1b \n" "bgt 1b \n"
: "+r"(src_uv), // %0 : "+r"(src_uv), // %0
"+r"(dst_u), // %1 "+r"(dst_u), // %1
"+r"(dst_v), // %2 "+r"(dst_v), // %2
"+r"(width) // %3 "+r"(width) // %3
: : "r"((ptrdiff_t)-16) // %4
: "cc", "memory", "r12", "q0" : "cc", "memory", "v0", "v1"
); );
} }
#endif // HAS_MIRRORUVROW_NEON #endif // HAS_MIRRORUVROW_NEON
...@@ -980,26 +979,25 @@ void MirrorUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v, ...@@ -980,26 +979,25 @@ void MirrorUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
void ARGBMirrorRow_NEON(const uint8* src, uint8* dst, int width) { void ARGBMirrorRow_NEON(const uint8* src, uint8* dst, int width) {
asm volatile ( asm volatile (
// Start at end of source row. // Start at end of source row.
"mov r3, #-16 \n"
"add %0, %0, %2, lsl #2 \n" "add %0, %0, %2, lsl #2 \n"
"sub %0, #16 \n" "sub %0, %0, #16 \n"
".p2align 2 \n" ".p2align 2 \n"
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"vld1.8 {q0}, [%0], r3 \n" // src -= 16 "ld1 {v0.16b}, [%0], %3 \n" // src -= 16
"subs %2, #4 \n" // 4 pixels per loop. "subs %2, %2, #4 \n" // 4 pixels per loop.
"vrev64.32 q0, q0 \n" "rev64 v0.4s, v0.4s \n"
MEMACCESS(1) MEMACCESS(1)
"vst1.8 {d1}, [%1]! \n" // dst += 16 "st1 {v0.D}[1], [%1], #8 \n" // dst += 16
MEMACCESS(1) MEMACCESS(1)
"vst1.8 {d0}, [%1]! \n" "st1 {v0.D}[0], [%1], #8 \n"
"bgt 1b \n" "bgt 1b \n"
: "+r"(src), // %0 : "+r"(src), // %0
"+r"(dst), // %1 "+r"(dst), // %1
"+r"(width) // %2 "+r"(width) // %2
: : "r"((ptrdiff_t)-16) // %3
: "cc", "memory", "r3", "q0" : "cc", "memory", "v0"
); );
} }
#endif // HAS_ARGBMIRRORROW_NEON #endif // HAS_ARGBMIRRORROW_NEON
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment