Commit ec75df58 authored by Frank Barchard

ComputeHammingDistance reduce SIMD loop to 1 call when possible.

32-bit x86 has high per-call overhead due to -fpic, so this reduces the
number of SIMD calls by 1.

TBR=kjellander@chromium.org
Bug: libyuv:701
Test: BenchmarkHammingDistance
Change-Id: I7f557ef047920db65eab362a5f93abbd274ca051
Reviewed-on: https://chromium-review.googlesource.com/701755
Reviewed-by: Frank Barchard <fbarchard@google.com>
Reviewed-by: Cheng Wang <wangcheng@google.com>
parent b7b53742
Name: libyuv Name: libyuv
URL: http://code.google.com/p/libyuv/ URL: http://code.google.com/p/libyuv/
Version: 1673 Version: 1674
License: BSD License: BSD
License File: LICENSE License File: LICENSE
......
...@@ -11,6 +11,6 @@ ...@@ -11,6 +11,6 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_ #ifndef INCLUDE_LIBYUV_VERSION_H_
#define INCLUDE_LIBYUV_VERSION_H_ #define INCLUDE_LIBYUV_VERSION_H_
#define LIBYUV_VERSION 1673 #define LIBYUV_VERSION 1674
#endif // INCLUDE_LIBYUV_VERSION_H_ #endif // INCLUDE_LIBYUV_VERSION_H_
...@@ -114,10 +114,9 @@ LIBYUV_API ...@@ -114,10 +114,9 @@ LIBYUV_API
uint64 ComputeHammingDistance(const uint8* src_a, uint64 ComputeHammingDistance(const uint8* src_a,
const uint8* src_b, const uint8* src_b,
int count) { int count) {
const int kBlockSize = 65536; const int kBlockSize = (65536 - 64); // Max count that SIMD wont overflow
const int kSimdSize = 64; const int kSimdSize = 64;
// SIMD for multiple of 64, and C for remainder int remainder;
int remainder = count & (kBlockSize - 1) & ~(kSimdSize - 1);
uint64 diff = 0; uint64 diff = 0;
int i; int i;
uint32 (*HammingDistance)(const uint8* src_a, const uint8* src_b, int count) = uint32 (*HammingDistance)(const uint8* src_a, const uint8* src_b, int count) =
...@@ -153,16 +152,14 @@ uint64 ComputeHammingDistance(const uint8* src_a, ...@@ -153,16 +152,14 @@ uint64 ComputeHammingDistance(const uint8* src_a,
for (i = 0; i < (count - (kBlockSize - 1)); i += kBlockSize) { for (i = 0; i < (count - (kBlockSize - 1)); i += kBlockSize) {
diff += HammingDistance(src_a + i, src_b + i, kBlockSize); diff += HammingDistance(src_a + i, src_b + i, kBlockSize);
} }
src_a += count & ~(kBlockSize - 1); remainder = (count - i) & ~(kSimdSize - 1);
src_b += count & ~(kBlockSize - 1);
if (remainder) { if (remainder) {
diff += HammingDistance(src_a, src_b, remainder); diff += HammingDistance(src_a + i, src_b + i, remainder);
src_a += remainder; i += remainder;
src_b += remainder;
} }
remainder = count & (kSimdSize - 1); remainder = (count - i);
if (remainder) { if (remainder) {
diff += HammingDistance_C(src_a, src_b, remainder); diff += HammingDistance_C(src_a + i, src_b + i, remainder);
} }
return diff; return diff;
} }
......
...@@ -333,6 +333,67 @@ TEST_F(LibYUVCompareTest, TestHammingDistance) { ...@@ -333,6 +333,67 @@ TEST_F(LibYUVCompareTest, TestHammingDistance) {
free_aligned_buffer_page_end(src_b); free_aligned_buffer_page_end(src_b);
} }
// Tests low levels match reference C for specified size.
// The opt implementations have size limitations
static const int kMaxOptCount = 65536 - 64;
TEST_F(LibYUVCompareTest, TestHammingDistance_Opt) {
  // Result of the optimized low-level function selected below.
  uint32 h1 = 0;
  align_buffer_page_end(src_a, benchmark_width_ * benchmark_height_);
  align_buffer_page_end(src_b, benchmark_width_ * benchmark_height_);
  // src_a is all ones and src_b is all zeros, so every byte differs in all
  // 8 bits and the expected distance is 8 * byte count.
  memset(src_a, 255u, benchmark_width_ * benchmark_height_);
  memset(src_b, 0, benchmark_width_ * benchmark_height_);

  // Reference result from the C implementation.
  uint32 h0 =
      HammingDistance_C(src_a, src_b, benchmark_width_ * benchmark_height_);
  EXPECT_EQ(benchmark_width_ * benchmark_height_ * 8ULL, h0);

  // ComputeHammingDistance returns a 64-bit total; keep the full width so the
  // value is not narrowed before it is compared with the 64-bit expectation.
  uint64 h2 = ComputeHammingDistance(src_a, src_b,
                                     benchmark_width_ * benchmark_height_);
  EXPECT_EQ(benchmark_width_ * benchmark_height_ * 8ULL, h2);

  // Call one optimized low-level function directly, chosen at compile time
  // and, on the AVX2 build path, by runtime CPU flags.
  for (int i = 0; i < benchmark_iterations_; ++i) {
#if defined(HAS_HAMMINGDISTANCE_NEON)
    h1 = HammingDistance_NEON(src_a, src_b,
                              benchmark_width_ * benchmark_height_);
#elif defined(HAS_HAMMINGDISTANCE_AVX2)
    int has_avx2 = TestCpuFlag(kCpuHasAVX2);
    if (has_avx2) {
      h1 = HammingDistance_AVX2(src_a, src_b,
                                benchmark_width_ * benchmark_height_);
    } else {
      int has_ssse3 = TestCpuFlag(kCpuHasSSSE3);
      if (has_ssse3) {
        h1 = HammingDistance_SSSE3(src_a, src_b,
                                   benchmark_width_ * benchmark_height_);
      } else {
        h1 = HammingDistance_X86(src_a, src_b,
                                 benchmark_width_ * benchmark_height_);
      }
    }
#elif defined(HAS_HAMMINGDISTANCE_X86)
    h1 =
        HammingDistance_X86(src_a, src_b, benchmark_width_ * benchmark_height_);
#else
    h1 = HammingDistance_C(src_a, src_b, benchmark_width_ * benchmark_height_);
#endif
  }

  // A large count may cause the low-level function to overflow, so the result
  // cannot be expected to be correct. Only counts up to kMaxOptCount are
  // required to match; a mismatch above that is reported as a warning.
  // TODO(fbarchard): Consider expecting the low 16 bits to match.
  if ((benchmark_width_ * benchmark_height_) <= kMaxOptCount) {
    EXPECT_EQ(h0, h1);
  } else if (h0 != h1) {
    printf(
        "warning - HammingDistance_Opt does not match HammingDistance_C: "
        "HammingDistance_Opt %u vs HammingDistance_C %u\n",
        h1, h0);
  }

  free_aligned_buffer_page_end(src_a);
  free_aligned_buffer_page_end(src_b);
}
TEST_F(LibYUVCompareTest, BenchmarkSumSquareError_Opt) { TEST_F(LibYUVCompareTest, BenchmarkSumSquareError_Opt) {
const int kMaxWidth = 4096 * 3; const int kMaxWidth = 4096 * 3;
align_buffer_page_end(src_a, kMaxWidth); align_buffer_page_end(src_a, kMaxWidth);
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment