Commit 311add63, authored by Frank Barchard, committed by Commit Bot

CopyRow_NEON: use ldp instead of ld1 for better performance.

Under cache-thrashing conditions, ldp/stp perform better than
ld1/st1 on QC820/QC821 CPUs. Performance is the same when hitting the cache.

Bug: libyuv:738
Test: LibYUVPlanarTest.TestCopySamples_Opt (445 ms)
Change-Id: Ib6a0a5d5e6a1b7ef667b9bb2edb39d681cf3614c
Reviewed-on: https://chromium-review.googlesource.com/691281
Commit-Queue: Frank Barchard <fbarchard@google.com>
Reviewed-by: Cheng Wang <wangcheng@google.com>
parent ccd6d6fc
...@@ -628,19 +628,19 @@ void MergeRGBRow_NEON(const uint8* src_r, ...@@ -628,19 +628,19 @@ void MergeRGBRow_NEON(const uint8* src_r,
); );
} }
// Copy multiple of 32. vld4.8 allow unaligned and is fastest on a15. // Copy multiple of 32.
void CopyRow_NEON(const uint8* src, uint8* dst, int count) { void CopyRow_NEON(const uint8* src, uint8* dst, int count) {
asm volatile( asm volatile(
"1: \n" "1: \n"
"ld1 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 32 "ldp q0, q1, [%0], #32 \n"
"subs %w2, %w2, #32 \n" // 32 processed per loop "subs %w2, %w2, #32 \n" // 32 processed per loop
"st1 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 32 "stp q0, q1, [%1], #32 \n"
"b.gt 1b \n" "b.gt 1b \n"
: "+r"(src), // %0 : "+r"(src), // %0
"+r"(dst), // %1 "+r"(dst), // %1
"+r"(count) // %2 // Output registers "+r"(count) // %2 // Output registers
: // Input registers : // Input registers
: "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List : "cc", "memory", "v0", "v1" // Clobber List
); );
} }
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment