Commit 4e439631 authored by zhongwei.yao@arm.com

Add function ScaleFilterRows_NEON for ARM64 Scale Neon implementation

TESTED=libyuv_unittest
BUG=319
R=fbarchard@chromium.org, fbarchard@google.com

Review URL: https://webrtc-codereview.appspot.com/22439004

git-svn-id: http://libyuv.googlecode.com/svn/trunk@1066 16f28f9a-4ce2-e073-06de-1de4eb20be90
parent 218ebde8
...@@ -531,15 +531,15 @@ void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr, ...@@ -531,15 +531,15 @@ void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr,
} }
#endif //HAS_SCALEROWDOWN38_NEON #endif //HAS_SCALEROWDOWN38_NEON
#if 0
// 16x2 -> 16x1 // 16x2 -> 16x1
void ScaleFilterRows_NEON(uint8* dst_ptr, void ScaleFilterRows_NEON(uint8* dst_ptr,
const uint8* src_ptr, ptrdiff_t src_stride, const uint8* src_ptr, ptrdiff_t src_stride,
int dst_width, int source_y_fraction) { int dst_width, int source_y_fraction) {
int y_fraction = 256 - source_y_fraction;
asm volatile ( asm volatile (
"cmp %4, #0 \n" "cmp %4, #0 \n"
"beq 100f \n" "beq 100f \n"
"add %2, %1 \n" "add %2, %2, %1 \n"
"cmp %4, #64 \n" "cmp %4, #64 \n"
"beq 75f \n" "beq 75f \n"
"cmp %4, #128 \n" "cmp %4, #128 \n"
...@@ -547,90 +547,89 @@ void ScaleFilterRows_NEON(uint8* dst_ptr, ...@@ -547,90 +547,89 @@ void ScaleFilterRows_NEON(uint8* dst_ptr,
"cmp %4, #192 \n" "cmp %4, #192 \n"
"beq 25f \n" "beq 25f \n"
"vdup.8 d5, %4 \n" "dup v5.8b, %w4 \n"
"rsb %4, #256 \n" "dup v4.8b, %w5 \n"
"vdup.8 d4, %4 \n"
// General purpose row blend. // General purpose row blend.
"1: \n" "1: \n"
MEMACCESS(1) MEMACCESS(1)
"vld1.8 {q0}, [%1]! \n" "ld1 {v0.16b}, [%1], #16 \n"
MEMACCESS(2) MEMACCESS(2)
"vld1.8 {q1}, [%2]! \n" "ld1 {v1.16b}, [%2], #16 \n"
"subs %3, %3, #16 \n" "subs %3, %3, #16 \n"
"vmull.u8 q13, d0, d4 \n" "umull v6.8h, v0.8b, v4.8b \n"
"vmull.u8 q14, d1, d4 \n" "umull2 v7.8h, v0.16b, v4.16b \n"
"vmlal.u8 q13, d2, d5 \n" "umlal v6.8h, v1.8b, v5.8b \n"
"vmlal.u8 q14, d3, d5 \n" "umlal2 v7.8h, v1.16b, v5.16b \n"
"vrshrn.u16 d0, q13, #8 \n" "rshrn v0.8b, v6.8h, #8 \n"
"vrshrn.u16 d1, q14, #8 \n" "rshrn2 v0.16b, v7.8h, #8 \n"
MEMACCESS(0) MEMACCESS(0)
"vst1.8 {q0}, [%0]! \n" "st1 {v0.16b}, [%0], #16 \n"
"bgt 1b \n" "bgt 1b \n"
"b 99f \n" "b 99f \n"
// Blend 25 / 75. // Blend 25 / 75.
"25: \n" "25: \n"
MEMACCESS(1) MEMACCESS(1)
"vld1.8 {q0}, [%1]! \n" "ld1 {v0.16b}, [%1], #16 \n"
MEMACCESS(2) MEMACCESS(2)
"vld1.8 {q1}, [%2]! \n" "ld1 {v1.16b}, [%2], #16 \n"
"subs %3, %3, #16 \n" "subs %3, %3, #16 \n"
"vrhadd.u8 q0, q1 \n" "urhadd v0.16b, v0.16b, v1.16b \n"
"vrhadd.u8 q0, q1 \n" "urhadd v0.16b, v0.16b, v1.16b \n"
MEMACCESS(0) MEMACCESS(0)
"vst1.8 {q0}, [%0]! \n" "st1 {v0.16b}, [%0], #16 \n"
"bgt 25b \n" "bgt 25b \n"
"b 99f \n" "b 99f \n"
// Blend 50 / 50. // Blend 50 / 50.
"50: \n" "50: \n"
MEMACCESS(1) MEMACCESS(1)
"vld1.8 {q0}, [%1]! \n" "ld1 {v0.16b}, [%1], #16 \n"
MEMACCESS(2) MEMACCESS(2)
"vld1.8 {q1}, [%2]! \n" "ld1 {v1.16b}, [%2], #16 \n"
"subs %3, %3, #16 \n" "subs %3, %3, #16 \n"
"vrhadd.u8 q0, q1 \n" "urhadd v0.16b, v0.16b, v1.16b \n"
MEMACCESS(0) MEMACCESS(0)
"vst1.8 {q0}, [%0]! \n" "st1 {v0.16b}, [%0], #16 \n"
"bgt 50b \n" "bgt 50b \n"
"b 99f \n" "b 99f \n"
// Blend 75 / 25. // Blend 75 / 25.
"75: \n" "75: \n"
MEMACCESS(1) MEMACCESS(1)
"vld1.8 {q1}, [%1]! \n" "ld1 {v1.16b}, [%1], #16 \n"
MEMACCESS(2) MEMACCESS(2)
"vld1.8 {q0}, [%2]! \n" "ld1 {v0.16b}, [%2], #16 \n"
"subs %3, %3, #16 \n" "subs %3, %3, #16 \n"
"vrhadd.u8 q0, q1 \n" "urhadd v0.16b, v0.16b, v1.16b \n"
"vrhadd.u8 q0, q1 \n" "urhadd v0.16b, v0.16b, v1.16b \n"
MEMACCESS(0) MEMACCESS(0)
"vst1.8 {q0}, [%0]! \n" "st1 {v0.16b}, [%0], #16 \n"
"bgt 75b \n" "bgt 75b \n"
"b 99f \n" "b 99f \n"
// Blend 100 / 0 - Copy row unchanged. // Blend 100 / 0 - Copy row unchanged.
"100: \n" "100: \n"
MEMACCESS(1) MEMACCESS(1)
"vld1.8 {q0}, [%1]! \n" "ld1 {v0.16b}, [%1], #16 \n"
"subs %3, %3, #16 \n" "subs %3, %3, #16 \n"
MEMACCESS(0) MEMACCESS(0)
"vst1.8 {q0}, [%0]! \n" "st1 {v0.16b}, [%0], #16 \n"
"bgt 100b \n" "bgt 100b \n"
"99: \n" "99: \n"
MEMACCESS(0) MEMACCESS(0)
"vst1.8 {d1[7]}, [%0] \n" "st1 {v0.b}[15], [%0] \n"
: "+r"(dst_ptr), // %0 : "+r"(dst_ptr), // %0
"+r"(src_ptr), // %1 "+r"(src_ptr), // %1
"+r"(src_stride), // %2 "+r"(src_stride), // %2
"+r"(dst_width), // %3 "+r"(dst_width), // %3
"+r"(source_y_fraction) // %4 "+r"(source_y_fraction),// %4
"+r"(y_fraction) // %5
: :
: "q0", "q1", "d4", "d5", "q13", "q14", "memory", "cc" : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "memory", "cc"
); );
} }
#endif //0
#ifdef HAS_SCALEARGBROWDOWN2_NEON #ifdef HAS_SCALEARGBROWDOWN2_NEON
void ScaleARGBRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride, void ScaleARGBRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment