Commit 80077a80 authored by Frank Barchard, committed by Commit Bot

HammingDistance_X86 using popcnt assembly

popcnt has a fake dependency on the destination.
This assembly avoids the dependency by using a different
register for each popcnt.

Bug: libyuv:701
Test: LIBYUV_DISABLE_SSSE3=1 out/Release/libyuv_unittest --gtest_filter=*Ham*Opt --libyuv_width=1280 --libyuv_height=720 --libyuv_repeat=9999 --libyuv_flags=-1 --libyuv_cpu_info=-1
Change-Id: Ie1d202e2613b7fa8a3c02acd433940e92c80eafa
Reviewed-on: https://chromium-review.googlesource.com/731826
Reviewed-by: Cheng Wang <wangcheng@google.com>
Reviewed-by: Frank Barchard <fbarchard@google.com>
Commit-Queue: Frank Barchard <fbarchard@google.com>
parent 3e5bbea5
...@@ -60,7 +60,7 @@ extern "C" { ...@@ -60,7 +60,7 @@ extern "C" {
(defined(__x86_64__) || defined(__i386__) || defined(_M_IX86)) (defined(__x86_64__) || defined(__i386__) || defined(_M_IX86))
#define HAS_HASHDJB2_SSE41 #define HAS_HASHDJB2_SSE41
#define HAS_SUMSQUAREERROR_SSE2 #define HAS_SUMSQUAREERROR_SSE2
#define HAS_HAMMINGDISTANCE_X86 #define HAS_HAMMINGDISTANCE_SSE42
#endif #endif
// The following are available for Visual C and clangcl 32 bit: // The following are available for Visual C and clangcl 32 bit:
...@@ -98,7 +98,7 @@ extern "C" { ...@@ -98,7 +98,7 @@ extern "C" {
#endif #endif
uint32 HammingDistance_C(const uint8* src_a, const uint8* src_b, int count); uint32 HammingDistance_C(const uint8* src_a, const uint8* src_b, int count);
uint32 HammingDistance_X86(const uint8* src_a, const uint8* src_b, int count); uint32 HammingDistance_SSE42(const uint8* src_a, const uint8* src_b, int count);
uint32 HammingDistance_SSSE3(const uint8* src_a, const uint8* src_b, int count); uint32 HammingDistance_SSSE3(const uint8* src_a, const uint8* src_b, int count);
uint32 HammingDistance_AVX2(const uint8* src_a, const uint8* src_b, int count); uint32 HammingDistance_AVX2(const uint8* src_a, const uint8* src_b, int count);
uint32 HammingDistance_NEON(const uint8* src_a, const uint8* src_b, int count); uint32 HammingDistance_NEON(const uint8* src_a, const uint8* src_b, int count);
......
...@@ -130,16 +130,16 @@ uint64 ComputeHammingDistance(const uint8* src_a, ...@@ -130,16 +130,16 @@ uint64 ComputeHammingDistance(const uint8* src_a,
HammingDistance = HammingDistance_NEON; HammingDistance = HammingDistance_NEON;
} }
#endif #endif
#if defined(HAS_HAMMINGDISTANCE_X86)
if (TestCpuFlag(kCpuHasX86)) {
HammingDistance = HammingDistance_X86;
}
#endif
#if defined(HAS_HAMMINGDISTANCE_SSSE3) #if defined(HAS_HAMMINGDISTANCE_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) { if (TestCpuFlag(kCpuHasSSSE3)) {
HammingDistance = HammingDistance_SSSE3; HammingDistance = HammingDistance_SSSE3;
} }
#endif #endif
#if defined(HAS_HAMMINGDISTANCE_SSE42)
if (TestCpuFlag(kCpuHasSSE42)) {
HammingDistance = HammingDistance_SSE42;
}
#endif
#if defined(HAS_HAMMINGDISTANCE_AVX2) #if defined(HAS_HAMMINGDISTANCE_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) { if (TestCpuFlag(kCpuHasAVX2)) {
HammingDistance = HammingDistance_AVX2; HammingDistance = HammingDistance_AVX2;
......
...@@ -22,18 +22,92 @@ extern "C" { ...@@ -22,18 +22,92 @@ extern "C" {
#if !defined(LIBYUV_DISABLE_X86) && \ #if !defined(LIBYUV_DISABLE_X86) && \
(defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER))) (defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER)))
uint32 HammingDistance_X86(const uint8* src_a, const uint8* src_b, int count) { #if defined(__x86_64__)
// x86_64 variant: returns the number of bit positions in which the buffers
// src_a and src_b differ, computed as popcnt(a XOR b) over 8-byte words.
//
// src_a, src_b: buffers to compare; both pointers are advanced by the asm.
// count:        number of bytes to compare.
//
// The loop is unrolled to consume 32 bytes (four 8-byte words) per iteration
// and tests count only after the body has run. Consequently it always reads
// at least 32 bytes from each buffer, and a count that is not a multiple of
// 32 is effectively rounded up to the next multiple.
// NOTE(review): callers presumably guarantee count > 0 and a multiple of 32,
// with readable memory for the full span - confirm against the dispatcher.
//
// The total is accumulated in 64-bit registers and narrowed to uint32 on
// return, so sufficiently large inputs can wrap the returned value.
uint32 HammingDistance_SSE42(const uint8* src_a,
const uint8* src_b,
int count) {
uint64 diff = 0u;
asm volatile(
// Zero the four independent running totals (one per 8-byte lane).
"xor %%r15,%%r15 \n"
"xor %%r14,%%r14 \n"
"xor %%r13,%%r13 \n"
"xor %%r12,%%r12 \n"
// LABELALIGN is a project macro defined elsewhere; presumably it emits an
// alignment directive for the loop label below - TODO confirm.
LABELALIGN
"1: \n"
// Lanes 0 and 1: load bytes 0..15 of src_a and XOR with the same bytes of
// src_b, leaving a 1 bit wherever the inputs differ.
"mov (%0),%%rax \n"
"mov 0x8(%0),%%rdx \n"
"xor (%1),%%rax \n"
"xor 0x8(%1),%%rdx \n"
// Each popcnt writes its own register (rax, rdx, rcx, rsi). Per the commit
// description, popcnt has a false dependency on its destination, so using
// a distinct destination per popcnt keeps the four counts independent.
"popcnt %%rax,%%rax \n"
"popcnt %%rdx,%%rdx \n"
// Lanes 2 and 3: same as above for bytes 16..31.
"mov 0x10(%0),%%rcx \n"
"mov 0x18(%0),%%rsi \n"
"xor 0x10(%1),%%rcx \n"
"xor 0x18(%1),%%rsi \n"
"popcnt %%rcx,%%rcx \n"
"popcnt %%rsi,%%rsi \n"
// Advance both source pointers past the 32 bytes just consumed.
"add $0x20,%0 \n"
"add $0x20,%1 \n"
// Fold each lane's bit count into its own accumulator.
"add %%rax,%%r15 \n"
"add %%rdx,%%r14 \n"
"add %%rcx,%%r13 \n"
"add %%rsi,%%r12 \n"
// count -= 32; keep looping while the remaining count is still positive.
"sub $0x20,%2 \n"
"jg 1b \n"
// Reduce the four accumulators into r12 and store the total in diff.
"add %%r15, %%r14 \n"
"add %%r13, %%r12 \n"
"add %%r14, %%r12 \n"
"mov %%r12, %3 \n"
: "+r"(src_a), // %0
"+r"(src_b), // %1
"+r"(count), // %2
"=r"(diff) // %3
:
// Scratch registers written by the asm, plus condition codes and memory.
: "memory", "cc", "rax", "rdx", "rcx", "rsi", "r12", "r13", "r14", "r15");
// Narrow the 64-bit total to the 32-bit return type.
return static_cast<uint32>(diff);
}
#else
uint32 HammingDistance_SSE42(const uint8* src_a,
const uint8* src_b,
int count) {
uint32 diff = 0u; uint32 diff = 0u;
int i; asm volatile(LABELALIGN
for (i = 0; i < count - 7; i += 8) { "1: \n"
uint64 x = *((uint64*)src_a) ^ *((uint64*)src_b); "mov (%0),%%eax \n"
src_a += 8; "mov 0x4(%0),%%edx \n"
src_b += 8; "xor (%1),%%eax \n"
diff += __builtin_popcountll(x); "xor 0x4(%1),%%edx \n"
} "popcnt %%eax,%%eax \n"
"add %%eax,%3 \n"
"popcnt %%edx,%%edx \n"
"add %%edx,%3 \n"
"mov 0x8(%0),%%eax \n"
"mov 0xc(%0),%%edx \n"
"xor 0x8(%1),%%eax \n"
"xor 0xc(%1),%%edx \n"
"popcnt %%eax,%%eax \n"
"add %%eax,%3 \n"
"popcnt %%edx,%%edx \n"
"add %%edx,%3 \n"
"add $0x10,%0 \n"
"add $0x10,%1 \n"
"sub $0x10,%2 \n"
"jg 1b \n"
: "+r"(src_a), // %0
"+r"(src_b), // %1
"+r"(count), // %2
"+r"(diff) // %3
:
: "memory", "cc", "eax", "edx");
return diff; return diff;
} }
#endif
static vec8 kNibbleMask = {15, 15, 15, 15, 15, 15, 15, 15, static vec8 kNibbleMask = {15, 15, 15, 15, 15, 15, 15, 15,
15, 15, 15, 15, 15, 15, 15, 15}; 15, 15, 15, 15, 15, 15, 15, 15};
......
...@@ -25,7 +25,9 @@ extern "C" { ...@@ -25,7 +25,9 @@ extern "C" {
// This module is for 32 bit Visual C x86 and clangcl // This module is for 32 bit Visual C x86 and clangcl
#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER) #if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)
uint32 HammingDistance_X86(const uint8* src_a, const uint8* src_b, int count) { uint32 HammingDistance_SSE42(const uint8* src_a,
const uint8* src_b,
int count) {
uint32 diff = 0u; uint32 diff = 0u;
int i; int i;
......
...@@ -234,18 +234,29 @@ TEST_F(LibYUVCompareTest, BenchmarkHammingDistance_Opt) { ...@@ -234,18 +234,29 @@ TEST_F(LibYUVCompareTest, BenchmarkHammingDistance_Opt) {
if (has_avx2) { if (has_avx2) {
h1 = HammingDistance_AVX2(src_a, src_b, kMaxWidth); h1 = HammingDistance_AVX2(src_a, src_b, kMaxWidth);
} else { } else {
int has_ssse3 = TestCpuFlag(kCpuHasSSSE3); int has_sse42 = TestCpuFlag(kCpuHasSSE42);
if (has_ssse3) { if (has_sse42) {
h1 = HammingDistance_SSSE3(src_a, src_b, kMaxWidth); h1 = HammingDistance_SSE42(src_a, src_b, kMaxWidth);
} else { } else {
h1 = HammingDistance_X86(src_a, src_b, kMaxWidth); int has_ssse3 = TestCpuFlag(kCpuHasSSSE3);
if (has_ssse3) {
h1 = HammingDistance_SSSE3(src_a, src_b, kMaxWidth);
} else {
h1 = HammingDistance_C(src_a, src_b, kMaxWidth);
}
} }
} }
#elif defined(HAS_HAMMINGDISTANCE_X86) #elif defined(HAS_HAMMINGDISTANCE_SSE42)
h1 = HammingDistance_X86(src_a, src_b, kMaxWidth); int has_sse42 = TestCpuFlag(kCpuHasSSE42);
#else if (has_sse42) {
h1 = HammingDistance_SSE42(src_a, src_b, kMaxWidth);
} else {
h1 = HammingDistance_C(src_a, src_b, kMaxWidth);
}
#else
h1 = HammingDistance_C(src_a, src_b, kMaxWidth); h1 = HammingDistance_C(src_a, src_b, kMaxWidth);
#endif #endif
} }
EXPECT_EQ(h0, h1); EXPECT_EQ(h0, h1);
...@@ -328,59 +339,63 @@ static const int kMaxOptCount = (1 << (32 - 3)) - 64; // 536870848 ...@@ -328,59 +339,63 @@ static const int kMaxOptCount = (1 << (32 - 3)) - 64; // 536870848
TEST_F(LibYUVCompareTest, TestHammingDistance_Opt) { TEST_F(LibYUVCompareTest, TestHammingDistance_Opt) {
uint32 h1 = 0; uint32 h1 = 0;
align_buffer_page_end(src_a, benchmark_width_ * benchmark_height_); const int kMaxWidth =benchmark_width_ * benchmark_height_;
align_buffer_page_end(src_b, benchmark_width_ * benchmark_height_); align_buffer_page_end(src_a, kMaxWidth);
memset(src_a, 255u, benchmark_width_ * benchmark_height_); align_buffer_page_end(src_b, kMaxWidth);
memset(src_b, 0, benchmark_width_ * benchmark_height_); memset(src_a, 255u, kMaxWidth);
memset(src_b, 0u, kMaxWidth);
uint64 h0 = ComputeHammingDistance(src_a, src_b, uint64 h0 = ComputeHammingDistance(src_a, src_b, kMaxWidth);
benchmark_width_ * benchmark_height_); EXPECT_EQ(kMaxWidth * 8ULL, h0);
EXPECT_EQ(benchmark_width_ * benchmark_height_ * 8ULL, h0);
for (int i = 0; i < benchmark_iterations_; ++i) { for (int i = 0; i < benchmark_iterations_; ++i) {
#if defined(HAS_HAMMINGDISTANCE_NEON) #if defined(HAS_HAMMINGDISTANCE_NEON)
h1 = HammingDistance_NEON(src_a, src_b, h1 = HammingDistance_NEON(src_a, src_b, kMaxWidth);
benchmark_width_ * benchmark_height_);
#elif defined(HAS_HAMMINGDISTANCE_AVX2) #elif defined(HAS_HAMMINGDISTANCE_AVX2)
int has_avx2 = TestCpuFlag(kCpuHasAVX2); int has_avx2 = TestCpuFlag(kCpuHasAVX2);
if (has_avx2) { if (has_avx2) {
h1 = HammingDistance_AVX2(src_a, src_b, h1 = HammingDistance_AVX2(src_a, src_b, kMaxWidth);
benchmark_width_ * benchmark_height_);
} else { } else {
int has_ssse3 = TestCpuFlag(kCpuHasSSSE3); int has_sse42 = TestCpuFlag(kCpuHasSSE42);
if (has_ssse3) { if (has_sse42) {
h1 = HammingDistance_SSSE3(src_a, src_b, h1 = HammingDistance_SSE42(src_a, src_b, kMaxWidth);
benchmark_width_ * benchmark_height_);
} else { } else {
h1 = HammingDistance_X86(src_a, src_b, int has_ssse3 = TestCpuFlag(kCpuHasSSSE3);
benchmark_width_ * benchmark_height_); if (has_ssse3) {
h1 = HammingDistance_SSSE3(src_a, src_b, kMaxWidth);
} else {
h1 = HammingDistance_C(src_a, src_b, kMaxWidth);
}
} }
} }
#elif defined(HAS_HAMMINGDISTANCE_X86) #elif defined(HAS_HAMMINGDISTANCE_SSE42)
h1 = int has_sse42 = TestCpuFlag(kCpuHasSSE42);
HammingDistance_X86(src_a, src_b, benchmark_width_ * benchmark_height_); if (has_sse42) {
#else h1 = HammingDistance_SSE42(src_a, src_b, kMaxWidth);
h1 = HammingDistance_C(src_a, src_b, benchmark_width_ * benchmark_height_); } else {
h1 = HammingDistance_C(src_a, src_b, kMaxWidth);
}
#else
h1 = HammingDistance_C(src_a, src_b, kMaxWidth);
#endif #endif
} }
// A large count will cause the low level to potentially overflow so the // A large count will cause the low level to potentially overflow so the
// result can not be expected to be correct. // result can not be expected to be correct.
// TODO(fbarchard): Consider expecting the low 16 bits to match. // TODO(fbarchard): Consider expecting the low 16 bits to match.
if ((benchmark_width_ * benchmark_height_) <= kMaxOptCount) { if (kMaxWidth<= kMaxOptCount) {
EXPECT_EQ(benchmark_width_ * benchmark_height_ * 8U, h1); EXPECT_EQ(kMaxWidth * 8U, h1);
} else { } else {
if (benchmark_width_ * benchmark_height_ * 8ULL != if (kMaxWidth * 8ULL != static_cast<uint64>(h1)) {
static_cast<uint64>(h1)) {
printf( printf(
"warning - HammingDistance_Opt %u does not match %llu " "warning - HammingDistance_Opt %u does not match %llu "
"but length of %u is longer than guaranteed.\n", "but length of %u is longer than guaranteed.\n",
h1, benchmark_width_ * benchmark_height_ * 8ULL, h1, kMaxWidth * 8ULL, kMaxWidth);
benchmark_width_ * benchmark_height_);
} else { } else {
printf( printf(
"warning - HammingDistance_Opt %u matches but length of %u " "warning - HammingDistance_Opt %u matches but length of %u "
"is longer than guaranteed.\n", "is longer than guaranteed.\n",
h1, benchmark_width_ * benchmark_height_); h1, kMaxWidth);
} }
} }
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment