Commit 80077a80, authored by Frank Barchard, committed by Commit Bot

HammingDistance_X86 using popcnt assembly

popcnt has a false (fake) dependency on its destination register.
This assembly avoids that dependency by using a different
register for each popcnt.

Bug: libyuv:701
Test: LIBYUV_DISABLE_SSSE3=1 out/Release/libyuv_unittest --gtest_filter=*Ham*Opt --libyuv_width=1280 --libyuv_height=720 --libyuv_repeat=9999 --libyuv_flags=-1 --libyuv_cpu_info=-1
Change-Id: Ie1d202e2613b7fa8a3c02acd433940e92c80eafa
Reviewed-on: https://chromium-review.googlesource.com/731826
Reviewed-by: Cheng Wang <wangcheng@google.com>
Reviewed-by: Frank Barchard <fbarchard@google.com>
Commit-Queue: Frank Barchard <fbarchard@google.com>
parent 3e5bbea5
......@@ -60,7 +60,7 @@ extern "C" {
(defined(__x86_64__) || defined(__i386__) || defined(_M_IX86))
#define HAS_HASHDJB2_SSE41
#define HAS_SUMSQUAREERROR_SSE2
#define HAS_HAMMINGDISTANCE_X86
#define HAS_HAMMINGDISTANCE_SSE42
#endif
// The following are available for Visual C and clangcl 32 bit:
......@@ -98,7 +98,7 @@ extern "C" {
#endif
uint32 HammingDistance_C(const uint8* src_a, const uint8* src_b, int count);
uint32 HammingDistance_X86(const uint8* src_a, const uint8* src_b, int count);
uint32 HammingDistance_SSE42(const uint8* src_a, const uint8* src_b, int count);
uint32 HammingDistance_SSSE3(const uint8* src_a, const uint8* src_b, int count);
uint32 HammingDistance_AVX2(const uint8* src_a, const uint8* src_b, int count);
uint32 HammingDistance_NEON(const uint8* src_a, const uint8* src_b, int count);
......
......@@ -130,16 +130,16 @@ uint64 ComputeHammingDistance(const uint8* src_a,
HammingDistance = HammingDistance_NEON;
}
#endif
#if defined(HAS_HAMMINGDISTANCE_X86)
if (TestCpuFlag(kCpuHasX86)) {
HammingDistance = HammingDistance_X86;
}
#endif
#if defined(HAS_HAMMINGDISTANCE_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
HammingDistance = HammingDistance_SSSE3;
}
#endif
#if defined(HAS_HAMMINGDISTANCE_SSE42)
if (TestCpuFlag(kCpuHasSSE42)) {
HammingDistance = HammingDistance_SSE42;
}
#endif
#if defined(HAS_HAMMINGDISTANCE_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
HammingDistance = HammingDistance_AVX2;
......
......@@ -22,18 +22,92 @@ extern "C" {
#if !defined(LIBYUV_DISABLE_X86) && \
(defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER)))
uint32 HammingDistance_X86(const uint8* src_a, const uint8* src_b, int count) {
#if defined(__x86_64__)
// x86_64 Hamming distance using the popcnt instruction.
//
// Counts the bits that differ between the byte buffers src_a and src_b.
// Each loop iteration consumes 32 bytes from each buffer as four 64-bit
// words, and the loop body always executes at least once before count is
// tested.
// NOTE(review): count is presumably required to be a positive multiple of 32
// by the caller - confirm against the call sites.
//
// Returns the low 32 bits of the 64-bit running total.
uint32 HammingDistance_SSE42(const uint8* src_a,
const uint8* src_b,
int count) {
uint64 diff = 0u;
asm volatile(
// Zero the four independent accumulators r15, r14, r13 and r12.
"xor %%r15,%%r15 \n"
"xor %%r14,%%r14 \n"
"xor %%r13,%%r13 \n"
"xor %%r12,%%r12 \n"
LABELALIGN
"1: \n"
// Words 0 and 1: load from src_a, XOR with src_b so set bits mark
// differing bit positions.
"mov (%0),%%rax \n"
"mov 0x8(%0),%%rdx \n"
"xor (%1),%%rax \n"
"xor 0x8(%1),%%rdx \n"
// Each popcnt writes back into its own source register, so no popcnt has
// to wait on a destination written by another one.
"popcnt %%rax,%%rax \n"
"popcnt %%rdx,%%rdx \n"
// Words 2 and 3, handled the same way in rcx and rsi.
"mov 0x10(%0),%%rcx \n"
"mov 0x18(%0),%%rsi \n"
"xor 0x10(%1),%%rcx \n"
"xor 0x18(%1),%%rsi \n"
"popcnt %%rcx,%%rcx \n"
"popcnt %%rsi,%%rsi \n"
// Advance both source pointers by 32 bytes.
"add $0x20,%0 \n"
"add $0x20,%1 \n"
// Fold the four partial counts into their separate accumulators.
"add %%rax,%%r15 \n"
"add %%rdx,%%r14 \n"
"add %%rcx,%%r13 \n"
"add %%rsi,%%r12 \n"
// 32 bytes consumed; keep looping while count is still greater than zero.
"sub $0x20,%2 \n"
"jg 1b \n"
// Reduce the four accumulators into r12 and store the total in diff.
"add %%r15, %%r14 \n"
"add %%r13, %%r12 \n"
"add %%r14, %%r12 \n"
"mov %%r12, %3 \n"
: "+r"(src_a), // %0
"+r"(src_b), // %1
"+r"(count), // %2
"=r"(diff) // %3
:
// Every scratch and accumulator register named in the template is listed
// as clobbered, together with memory and the condition codes.
: "memory", "cc", "rax", "rdx", "rcx", "rsi", "r12", "r13", "r14", "r15");
// Only the low 32 bits of the 64-bit total are returned.
return static_cast<uint32>(diff);
}
#else
// 32-bit x86 Hamming distance using the popcnt instruction.
//
// Counts the bits that differ between the byte buffers src_a and src_b.
// Each loop iteration consumes 16 bytes from each buffer as four 32-bit
// words, and the loop body always executes at least once before count is
// tested.
// NOTE(review): count is presumably required to be a positive multiple of 16
// by the caller - confirm against the call sites.
//
// The assembly loop is the only consumer of the buffers. A scalar pre-loop
// that advanced src_a/src_b without reducing count would make this loop
// re-count from the advanced pointers and read past the end of the buffers.
//
// Returns the accumulated bit count as a uint32.
uint32 HammingDistance_SSE42(const uint8* src_a,
                             const uint8* src_b,
                             int count) {
  uint32 diff = 0u;
  asm volatile(LABELALIGN
      "1: \n"
      // Words 0 and 1: load from src_a, XOR with src_b, count the set bits
      // and add them to the running total.
      "mov (%0),%%eax \n"
      "mov 0x4(%0),%%edx \n"
      "xor (%1),%%eax \n"
      "xor 0x4(%1),%%edx \n"
      "popcnt %%eax,%%eax \n"
      "add %%eax,%3 \n"
      "popcnt %%edx,%%edx \n"
      "add %%edx,%3 \n"
      // Words 2 and 3, handled the same way.
      "mov 0x8(%0),%%eax \n"
      "mov 0xc(%0),%%edx \n"
      "xor 0x8(%1),%%eax \n"
      "xor 0xc(%1),%%edx \n"
      "popcnt %%eax,%%eax \n"
      "add %%eax,%3 \n"
      "popcnt %%edx,%%edx \n"
      "add %%edx,%3 \n"
      // Advance both pointers by 16 bytes and loop while count stays > 0.
      "add $0x10,%0 \n"
      "add $0x10,%1 \n"
      "sub $0x10,%2 \n"
      "jg 1b \n"
      : "+r"(src_a),  // %0
        "+r"(src_b),  // %1
        "+r"(count),  // %2
        "+r"(diff)    // %3
      :
      : "memory", "cc", "eax", "edx");
  return diff;
}
#endif
// Sixteen lanes, each holding 15 (0x0F).
// NOTE(review): presumably used to isolate the low nibble of each byte in a
// SIMD popcount path; its use is not visible in this excerpt - confirm.
static vec8 kNibbleMask = {15, 15, 15, 15, 15, 15, 15, 15,
15, 15, 15, 15, 15, 15, 15, 15};
......
......@@ -25,7 +25,9 @@ extern "C" {
// This module is for 32 bit Visual C x86 and clangcl
#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)
uint32 HammingDistance_X86(const uint8* src_a, const uint8* src_b, int count) {
uint32 HammingDistance_SSE42(const uint8* src_a,
const uint8* src_b,
int count) {
uint32 diff = 0u;
int i;
......
......@@ -234,18 +234,29 @@ TEST_F(LibYUVCompareTest, BenchmarkHammingDistance_Opt) {
if (has_avx2) {
h1 = HammingDistance_AVX2(src_a, src_b, kMaxWidth);
} else {
int has_ssse3 = TestCpuFlag(kCpuHasSSSE3);
if (has_ssse3) {
h1 = HammingDistance_SSSE3(src_a, src_b, kMaxWidth);
int has_sse42 = TestCpuFlag(kCpuHasSSE42);
if (has_sse42) {
h1 = HammingDistance_SSE42(src_a, src_b, kMaxWidth);
} else {
h1 = HammingDistance_X86(src_a, src_b, kMaxWidth);
int has_ssse3 = TestCpuFlag(kCpuHasSSSE3);
if (has_ssse3) {
h1 = HammingDistance_SSSE3(src_a, src_b, kMaxWidth);
} else {
h1 = HammingDistance_C(src_a, src_b, kMaxWidth);
}
}
}
#elif defined(HAS_HAMMINGDISTANCE_X86)
h1 = HammingDistance_X86(src_a, src_b, kMaxWidth);
#else
#elif defined(HAS_HAMMINGDISTANCE_SSE42)
int has_sse42 = TestCpuFlag(kCpuHasSSE42);
if (has_sse42) {
h1 = HammingDistance_SSE42(src_a, src_b, kMaxWidth);
} else {
h1 = HammingDistance_C(src_a, src_b, kMaxWidth);
}
#else
h1 = HammingDistance_C(src_a, src_b, kMaxWidth);
#endif
}
EXPECT_EQ(h0, h1);
......@@ -328,59 +339,63 @@ static const int kMaxOptCount = (1 << (32 - 3)) - 64; // 536870848
TEST_F(LibYUVCompareTest, TestHammingDistance_Opt) {
uint32 h1 = 0;
align_buffer_page_end(src_a, benchmark_width_ * benchmark_height_);
align_buffer_page_end(src_b, benchmark_width_ * benchmark_height_);
memset(src_a, 255u, benchmark_width_ * benchmark_height_);
memset(src_b, 0, benchmark_width_ * benchmark_height_);
const int kMaxWidth =benchmark_width_ * benchmark_height_;
align_buffer_page_end(src_a, kMaxWidth);
align_buffer_page_end(src_b, kMaxWidth);
memset(src_a, 255u, kMaxWidth);
memset(src_b, 0u, kMaxWidth);
uint64 h0 = ComputeHammingDistance(src_a, src_b,
benchmark_width_ * benchmark_height_);
EXPECT_EQ(benchmark_width_ * benchmark_height_ * 8ULL, h0);
uint64 h0 = ComputeHammingDistance(src_a, src_b, kMaxWidth);
EXPECT_EQ(kMaxWidth * 8ULL, h0);
for (int i = 0; i < benchmark_iterations_; ++i) {
#if defined(HAS_HAMMINGDISTANCE_NEON)
h1 = HammingDistance_NEON(src_a, src_b,
benchmark_width_ * benchmark_height_);
h1 = HammingDistance_NEON(src_a, src_b, kMaxWidth);
#elif defined(HAS_HAMMINGDISTANCE_AVX2)
int has_avx2 = TestCpuFlag(kCpuHasAVX2);
if (has_avx2) {
h1 = HammingDistance_AVX2(src_a, src_b,
benchmark_width_ * benchmark_height_);
h1 = HammingDistance_AVX2(src_a, src_b, kMaxWidth);
} else {
int has_ssse3 = TestCpuFlag(kCpuHasSSSE3);
if (has_ssse3) {
h1 = HammingDistance_SSSE3(src_a, src_b,
benchmark_width_ * benchmark_height_);
int has_sse42 = TestCpuFlag(kCpuHasSSE42);
if (has_sse42) {
h1 = HammingDistance_SSE42(src_a, src_b, kMaxWidth);
} else {
h1 = HammingDistance_X86(src_a, src_b,
benchmark_width_ * benchmark_height_);
int has_ssse3 = TestCpuFlag(kCpuHasSSSE3);
if (has_ssse3) {
h1 = HammingDistance_SSSE3(src_a, src_b, kMaxWidth);
} else {
h1 = HammingDistance_C(src_a, src_b, kMaxWidth);
}
}
}
#elif defined(HAS_HAMMINGDISTANCE_X86)
h1 =
HammingDistance_X86(src_a, src_b, benchmark_width_ * benchmark_height_);
#else
h1 = HammingDistance_C(src_a, src_b, benchmark_width_ * benchmark_height_);
#elif defined(HAS_HAMMINGDISTANCE_SSE42)
int has_sse42 = TestCpuFlag(kCpuHasSSE42);
if (has_sse42) {
h1 = HammingDistance_SSE42(src_a, src_b, kMaxWidth);
} else {
h1 = HammingDistance_C(src_a, src_b, kMaxWidth);
}
#else
h1 = HammingDistance_C(src_a, src_b, kMaxWidth);
#endif
}
// A large count will cause the low level to potentially overflow so the
// result can not be expected to be correct.
// TODO(fbarchard): Consider expecting the low 16 bits to match.
if ((benchmark_width_ * benchmark_height_) <= kMaxOptCount) {
EXPECT_EQ(benchmark_width_ * benchmark_height_ * 8U, h1);
if (kMaxWidth<= kMaxOptCount) {
EXPECT_EQ(kMaxWidth * 8U, h1);
} else {
if (benchmark_width_ * benchmark_height_ * 8ULL !=
static_cast<uint64>(h1)) {
if (kMaxWidth * 8ULL != static_cast<uint64>(h1)) {
printf(
"warning - HammingDistance_Opt %u does not match %llu "
"but length of %u is longer than guaranteed.\n",
h1, benchmark_width_ * benchmark_height_ * 8ULL,
benchmark_width_ * benchmark_height_);
h1, kMaxWidth * 8ULL, kMaxWidth);
} else {
printf(
"warning - HammingDistance_Opt %u matches but length of %u "
"is longer than guaranteed.\n",
h1, benchmark_width_ * benchmark_height_);
h1, kMaxWidth);
}
}
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment