Commit 790e0634 authored by Frank Barchard

Port HammingDistance_NEON 32 bit code to 64 bit

The 32 bit version of HammingDistance_NEON accumulates
using vertical add and paired adds, which takes 3 instructions
instead of 4.
The instructions are also portable between 32 and 64 bit.

Was BenchmarkHammingDistance_Opt (105 ms)
Now BenchmarkHammingDistance_Opt (90 ms)

TBR=kjellander@chromium.org
BUG=libyuv:701
TEST=BenchmarkHammingDistance

BenchmarkHammingDistance_Opt (90 ms)

Change-Id: If9e621e0bd2fe2492a1532056f8a1b451ba53d7e
Reviewed-on: https://chromium-review.googlesource.com/526365
Reviewed-by: Frank Barchard <fbarchard@google.com>
Commit-Queue: Frank Barchard <fbarchard@google.com>
parent 47d6eaa3
...@@ -24,24 +24,22 @@ extern "C" { ...@@ -24,24 +24,22 @@ extern "C" {
uint32 HammingDistance_NEON(const uint8* src_a, const uint8* src_b, int count) { uint32 HammingDistance_NEON(const uint8* src_a, const uint8* src_b, int count) {
uint32 diff; uint32 diff;
asm volatile ( asm volatile (
"movi d4, #0 \n" "movi v4.4s, #0 \n"
"1: \n" "1: \n"
MEMACCESS(0)
"ld1 {v0.16b, v1.16b}, [%0], #32 \n" "ld1 {v0.16b, v1.16b}, [%0], #32 \n"
MEMACCESS(1)
"ld1 {v2.16b, v3.16b}, [%1], #32 \n" "ld1 {v2.16b, v3.16b}, [%1], #32 \n"
"subs %w2, %w2, #32 \n"
"eor v0.16b, v0.16b, v2.16b \n" "eor v0.16b, v0.16b, v2.16b \n"
"eor v1.16b, v1.16b, v3.16b \n" "eor v1.16b, v1.16b, v3.16b \n"
"cnt v0.16b, v0.16b \n" "cnt v0.16b, v0.16b \n"
"cnt v1.16b, v1.16b \n" "cnt v1.16b, v1.16b \n"
"uaddlv h0, v0.16b \n" "subs %w2, %w2, #32 \n"
"uaddlv h1, v1.16b \n" "add v0.16b, v0.16b, v1.16b \n"
"add d4, d4, d0 \n" "uaddlp v0.8h, v0.16b \n"
"add d4, d4, d1 \n" "uadalp v4.4s, v0.8h \n"
"b.gt 1b \n" "b.gt 1b \n"
"addv s4, v4.4s \n"
"fmov %w3, s4 \n" "fmov %w3, s4 \n"
: "+r"(src_a), : "+r"(src_a),
"+r"(src_b), "+r"(src_b),
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment