Commit 4d5c3f34 authored by zhongwei.yao@arm.com

implement ARM64 ScaleRowDown4 and ScaleRowDown4Box

TESTED=libyuv_unittest
BUG=319
R=fbarchard@chromium.org, fbarchard@google.com

Review URL: https://webrtc-codereview.appspot.com/21279004

git-svn-id: http://libyuv.googlecode.com/svn/trunk@1068 16f28f9a-4ce2-e073-06de-1de4eb20be90
parent 3389f8ef
...@@ -54,7 +54,7 @@ extern "C" { ...@@ -54,7 +54,7 @@ extern "C" {
#elif !defined(LIBYUV_DISABLE_NEON) && !defined(__native_client__) && \ #elif !defined(LIBYUV_DISABLE_NEON) && !defined(__native_client__) && \
(defined(__aarch64__) || defined(LIBYUV_NEON)) (defined(__aarch64__) || defined(LIBYUV_NEON))
#define HAS_SCALEROWDOWN2_NEON #define HAS_SCALEROWDOWN2_NEON
/* #define HAS_SCALEROWDOWN4_NEON */ #define HAS_SCALEROWDOWN4_NEON
/* #define HAS_SCALEROWDOWN34_NEON */ /* #define HAS_SCALEROWDOWN34_NEON */
/* #define HAS_SCALEROWDOWN38_NEON */ /* #define HAS_SCALEROWDOWN38_NEON */
/* #define HAS_SCALEARGBROWDOWNEVEN_NEON */ /* #define HAS_SCALEARGBROWDOWNEVEN_NEON */
......
...@@ -75,19 +75,18 @@ void ScaleRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride, ...@@ -75,19 +75,18 @@ void ScaleRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
void ScaleRowDown4_NEON(const uint8* src_ptr, ptrdiff_t src_stride, void ScaleRowDown4_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) { uint8* dst_ptr, int dst_width) {
asm volatile ( asm volatile (
".p2align 2 \n"
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0 "ld4 {v0.8b-3.8b}, [%0], #32 \n" // src line 0
"subs %2, %2, #8 \n" // 8 processed per loop "subs %2, %2, #8 \n" // 8 processed per loop
MEMACCESS(1) MEMACCESS(1)
"vst1.8 {d2}, [%1]! \n" "st1 {v2.8b}, [%1], #8 \n"
"bgt 1b \n" "bgt 1b \n"
: "+r"(src_ptr), // %0 : "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1 "+r"(dst_ptr), // %1
"+r"(dst_width) // %2 "+r"(dst_width) // %2
: :
: "q0", "q1", "memory", "cc" : "v0", "v1", "v2", "v3", "memory", "cc"
); );
} }
#endif //HAS_SCALEROWDOWN4_NEON #endif //HAS_SCALEROWDOWN4_NEON
...@@ -99,26 +98,24 @@ void ScaleRowDown4Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride, ...@@ -99,26 +98,24 @@ void ScaleRowDown4Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
const uint8* src_ptr2 = src_ptr + src_stride * 2; const uint8* src_ptr2 = src_ptr + src_stride * 2;
const uint8* src_ptr3 = src_ptr + src_stride * 3; const uint8* src_ptr3 = src_ptr + src_stride * 3;
asm volatile ( asm volatile (
".p2align 2 \n"
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"vld1.8 {q0}, [%0]! \n" // load up 16x4 "ld1 {v0.16b}, [%0], #16 \n" // load up 16x4
MEMACCESS(3) MEMACCESS(3)
"vld1.8 {q1}, [%3]! \n" "ld1 {v1.16b}, [%3], #16 \n"
MEMACCESS(4) MEMACCESS(4)
"vld1.8 {q2}, [%4]! \n" "ld1 {v2.16b}, [%4], #16 \n"
MEMACCESS(5) MEMACCESS(5)
"vld1.8 {q3}, [%5]! \n" "ld1 {v3.16b}, [%5], #16 \n"
"subs %2, %2, #4 \n" "subs %2, %2, #4 \n"
"vpaddl.u8 q0, q0 \n" "uaddlp v0.8h, v0.16b \n"
"vpadal.u8 q0, q1 \n" "uadalp v0.8h, v1.16b \n"
"vpadal.u8 q0, q2 \n" "uadalp v0.8h, v2.16b \n"
"vpadal.u8 q0, q3 \n" "uadalp v0.8h, v3.16b \n"
"vpaddl.u16 q0, q0 \n" "addp v0.8h, v0.8h, v0.8h \n"
"vrshrn.u32 d0, q0, #4 \n" // divide by 16 w/rounding "rshrn v0.8b, v0.8h, #4 \n" // divide by 16 w/rounding
"vmovn.u16 d0, q0 \n"
MEMACCESS(1) MEMACCESS(1)
"vst1.32 {d0[0]}, [%1]! \n" "st1 {v0.s}[0], [%1], #4 \n"
"bgt 1b \n" "bgt 1b \n"
: "+r"(src_ptr), // %0 : "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1 "+r"(dst_ptr), // %1
...@@ -127,7 +124,7 @@ asm volatile ( ...@@ -127,7 +124,7 @@ asm volatile (
"+r"(src_ptr2), // %4 "+r"(src_ptr2), // %4
"+r"(src_ptr3) // %5 "+r"(src_ptr3) // %5
: :
: "q0", "q1", "q2", "q3", "memory", "cc" : "v0", "v1", "v2", "v3", "memory", "cc"
); );
} }
#endif //HAS_SCALEROWDOWN4_NEON #endif //HAS_SCALEROWDOWN4_NEON
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment