Commit de8e9ae1 authored by Frank Barchard, committed by Frank Barchard

HammingDistance_SSE42 register optimized to avoid push

Bug: libyuv:701
Test: objdump to confirm code gen
Change-Id: Ibdcb2cc6bc9bf14b4ccb874c49fc9ff664650e1a
Reviewed-on: https://chromium-review.googlesource.com/745390
Reviewed-by: Frank Barchard <fbarchard@google.com>
Reviewed-by: richard winterton <rrwinterton@gmail.com>
parent ffc48118
...@@ -29,44 +29,44 @@ uint32 HammingDistance_SSE42(const uint8* src_a, ...@@ -29,44 +29,44 @@ uint32 HammingDistance_SSE42(const uint8* src_a,
uint64 diff = 0u; uint64 diff = 0u;
asm volatile( asm volatile(
"xor %%r15,%%r15 \n" "xor %3,%3 \n"
"xor %%r14,%%r14 \n" "xor %%r8,%%r8 \n"
"xor %%r13,%%r13 \n" "xor %%r9,%%r9 \n"
"xor %%r12,%%r12 \n" "xor %%r10,%%r10 \n"
// Process 32 bytes per loop. // Process 32 bytes per loop.
LABELALIGN LABELALIGN
"1: \n" "1: \n"
"mov (%0),%%rax \n" "mov (%0),%%rcx \n"
"mov 0x8(%0),%%rdx \n" "mov 0x8(%0),%%rdx \n"
"xor (%1),%%rax \n" "xor (%1),%%rcx \n"
"xor 0x8(%1),%%rdx \n" "xor 0x8(%1),%%rdx \n"
"popcnt %%rax,%%rax \n"
"popcnt %%rdx,%%rdx \n"
"mov 0x10(%0),%%rcx \n"
"mov 0x18(%0),%%rsi \n"
"xor 0x10(%1),%%rcx \n"
"xor 0x18(%1),%%rsi \n"
"popcnt %%rcx,%%rcx \n" "popcnt %%rcx,%%rcx \n"
"popcnt %%rdx,%%rdx \n"
"mov 0x10(%0),%%rsi \n"
"mov 0x18(%0),%%rdi \n"
"xor 0x10(%1),%%rsi \n"
"xor 0x18(%1),%%rdi \n"
"popcnt %%rsi,%%rsi \n" "popcnt %%rsi,%%rsi \n"
"popcnt %%rdi,%%rdi \n"
"add $0x20,%0 \n" "add $0x20,%0 \n"
"add $0x20,%1 \n" "add $0x20,%1 \n"
"add %%rax,%%r15 \n" "add %%rcx,%3 \n"
"add %%rdx,%%r14 \n" "add %%rdx,%%r8 \n"
"add %%rcx,%%r13 \n" "add %%rsi,%%r9 \n"
"add %%rsi,%%r12 \n" "add %%rdi,%%r10 \n"
"sub $0x20,%2 \n" "sub $0x20,%2 \n"
"jg 1b \n" "jg 1b \n"
"add %%r15, %%r14 \n"
"add %%r13, %%r12 \n" "add %%r8, %3 \n"
"add %%r14, %%r12 \n" "add %%r9, %3 \n"
"mov %%r12, %3 \n" "add %%r10, %3 \n"
: "+r"(src_a), // %0 : "+r"(src_a), // %0
"+r"(src_b), // %1 "+r"(src_b), // %1
"+r"(count), // %2 "+r"(count), // %2
"=r"(diff) // %3 "=r"(diff) // %3
: :
: "memory", "cc", "rax", "rdx", "rcx", "rsi", "r12", "r13", "r14", "r15"); : "memory", "cc", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10");
return static_cast<uint32>(diff); return static_cast<uint32>(diff);
} }
...@@ -80,20 +80,20 @@ uint32 HammingDistance_SSE42(const uint8* src_a, ...@@ -80,20 +80,20 @@ uint32 HammingDistance_SSE42(const uint8* src_a,
// Process 16 bytes per loop. // Process 16 bytes per loop.
LABELALIGN LABELALIGN
"1: \n" "1: \n"
"mov (%0),%%eax \n" "mov (%0),%%ecx \n"
"mov 0x4(%0),%%edx \n" "mov 0x4(%0),%%edx \n"
"xor (%1),%%eax \n" "xor (%1),%%ecx \n"
"xor 0x4(%1),%%edx \n" "xor 0x4(%1),%%edx \n"
"popcnt %%eax,%%eax \n" "popcnt %%ecx,%%ecx \n"
"add %%eax,%3 \n" "add %%ecx,%3 \n"
"popcnt %%edx,%%edx \n" "popcnt %%edx,%%edx \n"
"add %%edx,%3 \n" "add %%edx,%3 \n"
"mov 0x8(%0),%%eax \n" "mov 0x8(%0),%%ecx \n"
"mov 0xc(%0),%%edx \n" "mov 0xc(%0),%%edx \n"
"xor 0x8(%1),%%eax \n" "xor 0x8(%1),%%ecx \n"
"xor 0xc(%1),%%edx \n" "xor 0xc(%1),%%edx \n"
"popcnt %%eax,%%eax \n" "popcnt %%ecx,%%ecx \n"
"add %%eax,%3 \n" "add %%ecx,%3 \n"
"popcnt %%edx,%%edx \n" "popcnt %%edx,%%edx \n"
"add %%edx,%3 \n" "add %%edx,%3 \n"
"add $0x10,%0 \n" "add $0x10,%0 \n"
...@@ -105,7 +105,7 @@ uint32 HammingDistance_SSE42(const uint8* src_a, ...@@ -105,7 +105,7 @@ uint32 HammingDistance_SSE42(const uint8* src_a,
"+r"(count), // %2 "+r"(count), // %2
"+r"(diff) // %3 "+r"(diff) // %3
: :
: "memory", "cc", "eax", "edx"); : "memory", "cc", "ecx", "edx");
return diff; return diff;
} }
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment