Commit 26f43db1 authored by yang.zhang@arm.com's avatar yang.zhang@arm.com

AArch64:add SumSquareError_NEON armv8 assembly version

BUG=none
TESTED=libyuv_unittest
R=fbarchard@google.com

Review URL: https://webrtc-codereview.appspot.com/16259004/

the benckmarking result is as follows:
toolchain: gcc 4.9
hardware: A53

| count | C Times/NEON times |
| 16    | 3.35               |
| 128   | 6.63               |
| 512   | 7.47               |
| 1024  | 7.72               |

Change-Id: Ic10bf22d77d069a1a2074b68bd5a310c579ec490

Review URL: https://webrtc-codereview.appspot.com/21159004

git-svn-id: http://libyuv.googlecode.com/svn/trunk@1043 16f28f9a-4ce2-e073-06de-1de4eb20be90
parent 1afdfb3d
......@@ -80,7 +80,7 @@ uint32 HashDjb2(const uint8* src, uint64 count, uint32 seed) {
uint32 SumSquareError_C(const uint8* src_a, const uint8* src_b, int count);
#if !defined(LIBYUV_DISABLE_NEON) && \
(defined(__ARM_NEON__) || defined(LIBYUV_NEON))
(defined(__ARM_NEON__) || defined(LIBYUV_NEON) || defined(__aarch64__))
#define HAS_SUMSQUAREERROR_NEON
uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count);
#endif
......
......@@ -56,6 +56,45 @@ uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count) {
return sse;
}
#elif !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count) {
volatile uint32 sse;
asm volatile (
"eor v16.16b, v16.16b, v16.16b \n"
"eor v18.16b, v18.16b, v18.16b \n"
"eor v17.16b, v17.16b, v17.16b \n"
"eor v19.16b, v19.16b, v19.16b \n"
".p2align 2 \n"
"1: \n"
MEMACCESS(0)
"ld1 {v0.16b}, [%0], #16 \n"
MEMACCESS(1)
"ld1 {v1.16b}, [%1], #16 \n"
"subs %2, %2, #16 \n"
"usubl v2.8h, v0.8b, v1.8b \n"
"usubl2 v3.8h, v0.16b, v1.16b \n"
"smlal v16.4s, v2.4h, v2.4h \n"
"smlal v17.4s, v3.4h, v3.4h \n"
"smlal2 v18.4s, v2.8h, v2.8h \n"
"smlal2 v19.4s, v3.8h, v3.8h \n"
"bgt 1b \n"
"add v16.4s, v16.4s, v17.4s \n"
"add v18.4s, v18.4s, v19.4s \n"
"add v19.4s, v16.4s, v18.4s \n"
"addv s0, v19.4s \n"
"fmov %w3, s0 \n"
: "+r"(src_a),
"+r"(src_b),
"+r"(count),
"=r"(sse)
:
: "cc", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19");
return sse;
}
#endif // __ARM_NEON__
#ifdef __cplusplus
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment