Commit e0615c0e authored by Frank Barchard

Optimize Hamming Distance C code to do 64 bits at a time.

BUG=libyuv:701
TEST=LibYUVBaseTest.BenchmarkHammingDistance_C
R=wangcheng@google.com

Change-Id: I243003b098bea8ef3809298bbec349ed52a43d8c
Reviewed-on: https://chromium-review.googlesource.com/499487
Reviewed-by: Cheng Wang <wangcheng@google.com>
parent bbbf30ee
Name: libyuv
URL: http://code.google.com/p/libyuv/
Version: 1655
Version: 1656
License: BSD
License File: LICENSE
......
......@@ -22,6 +22,12 @@ extern "C" {
LIBYUV_API
uint32 HashDjb2(const uint8* src, uint64 count, uint32 seed);
// Hamming Distance
// Returns the number of bit positions that differ between the first `count`
// bytes of src_a and the first `count` bytes of src_b, as a 64-bit total.
LIBYUV_API
uint64 ComputeHammingDistance(const uint8* src_a,
const uint8* src_b,
int count);
// Scan an opaque argb image and return fourcc based on alpha offset.
// Returns FOURCC_ARGB, FOURCC_BGRA, or 0 if unknown.
LIBYUV_API
......
......@@ -49,11 +49,16 @@ extern "C" {
// The following are available for Visual C and GCC:
#if !defined(LIBYUV_DISABLE_X86) && \
(defined(__x86_64__) || (defined(__i386__) || defined(_M_IX86)))
(defined(__x86_64__) || defined(__i386__) || defined(_M_IX86))
#define HAS_HASHDJB2_SSE41
#define HAS_SUMSQUAREERROR_SSE2
#endif
// The following are available for GCC:
#if !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__))
#define HAS_HAMMINGDISTANCE_X86
#endif
// The following are available for Visual C and clangcl 32 bit:
#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && \
(defined(VISUALC_HAS_AVX2) || defined(CLANG_HAS_AVX2))
......@@ -67,6 +72,11 @@ extern "C" {
#define HAS_SUMSQUAREERROR_NEON
#endif
// The following are available for Neon 64 bit:
#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
#define HAS_HAMMINGDISTANCE_NEON
#endif
// Hamming distance kernels. All three share one signature (two byte buffers
// and a byte count, returning a uint32 bit count) so that a caller can pick
// one at run time through a single function pointer.
uint32 HammingDistance_C(const uint8* src_a, const uint8* src_b, int count);
uint32 HammingDistance_X86(const uint8* src_a, const uint8* src_b, int count);
uint32 HammingDistance_NEON(const uint8* src_a, const uint8* src_b, int count);
......
......@@ -11,6 +11,6 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_
#define INCLUDE_LIBYUV_VERSION_H_
#define LIBYUV_VERSION 1655
#define LIBYUV_VERSION 1656
#endif // INCLUDE_LIBYUV_VERSION_H_
......@@ -110,6 +110,51 @@ uint32 ARGBDetect(const uint8* argb, int stride_argb, int width, int height) {
return fourcc;
}
// Returns the total number of bits that differ between the `count` bytes at
// src_a and the `count` bytes at src_b.
//
// The buffers are consumed in three phases:
//   1. every whole 65536-byte block, through the selected kernel;
//   2. the part of the leftover that is a multiple of 32 bytes, through the
//      same kernel;
//   3. the final `count & 31` bytes, always through HammingDistance_C.
//
// The kernel defaults to HammingDistance_C. Each enabled HAS_HAMMINGDISTANCE_*
// section may replace it when the matching CPU flag tests true; sections that
// come later override earlier ones.
LIBYUV_API
uint64 ComputeHammingDistance(const uint8* src_a,
                              const uint8* src_b,
                              int count) {
  const int kBlockSize = 65536;
  const int kBlockMask = kBlockSize - 1;
  // Bytes covered by whole blocks.
  const int block_bytes = count & ~kBlockMask;
  // Leftover after the whole blocks, rounded down to a multiple of 32.
  const int wide_tail = count & kBlockMask & ~31;
  // Whatever is left after the 32-byte multiples.
  const int narrow_tail = count & 31;
  uint64 total = 0;
  int offset;
  uint32 (*kernel)(const uint8*, const uint8*, int) = HammingDistance_C;

#if defined(HAS_HAMMINGDISTANCE_NEON)
  if (TestCpuFlag(kCpuHasNEON)) {
    kernel = HammingDistance_NEON;
  }
#endif
#if defined(HAS_HAMMINGDISTANCE_X86)
  if (TestCpuFlag(kCpuHasX86)) {
    kernel = HammingDistance_X86;
  }
#endif
#if defined(HAS_HAMMINGDISTANCE_AVX2)
  if (TestCpuFlag(kCpuHasAVX2)) {
    kernel = HammingDistance_AVX2;
  }
#endif

  // Phase 1: whole blocks. With OpenMP the per-block results are summed via a
  // reduction on the accumulator.
#ifdef _OPENMP
#pragma omp parallel for reduction(+ : total)
#endif
  for (offset = 0; offset < (count - kBlockMask); offset += kBlockSize) {
    total += kernel(src_a + offset, src_b + offset, kBlockSize);
  }
  src_a += block_bytes;
  src_b += block_bytes;

  // Phase 2: 32-byte-multiple portion of the last partial block.
  if (wide_tail != 0) {
    total += kernel(src_a, src_b, wide_tail);
    src_a += wide_tail;
    src_b += wide_tail;
  }

  // Phase 3: trailing bytes go through the C kernel regardless of CPU flags.
  if (narrow_tail != 0) {
    total += HammingDistance_C(src_a, src_b, narrow_tail);
  }
  return total;
}
// TODO(fbarchard): Refactor into row function.
LIBYUV_API
uint64 ComputeSumSquareError(const uint8* src_a,
......
......@@ -17,21 +17,29 @@ namespace libyuv {
extern "C" {
#endif
#if ORIGINAL_C
#if ORIGINAL_OPT
// Bit-at-a-time Hamming distance: XORs each pair of corresponding bytes and
// increments diff once for every set bit of the result. Returns the count.
// NOTE(review): the single-line `if (x & N) ++diff;` tests and the two-line
// tests that follow them look like the before/after sides of a
// formatting-only diff shown together; presumably only one set exists in the
// real file (both together would count every bit twice) - confirm.
uint32 HammingDistance_C(const uint8* src_a, const uint8* src_b, int count) {
uint32 diff = 0u;
int i;
for (i = 0; i < count; ++i) {
// Set bits of x mark the positions where the two bytes disagree.
int x = src_a[i] ^ src_b[i];
if (x & 1) ++diff;
if (x & 2) ++diff;
if (x & 4) ++diff;
if (x & 8) ++diff;
if (x & 16) ++diff;
if (x & 32) ++diff;
if (x & 64) ++diff;
if (x & 128) ++diff;
if (x & 1)
++diff;
if (x & 2)
++diff;
if (x & 4)
++diff;
if (x & 8)
++diff;
if (x & 16)
++diff;
if (x & 32)
++diff;
if (x & 64)
++diff;
if (x & 128)
++diff;
}
return diff;
}
......@@ -44,10 +52,11 @@ uint32 HammingDistance_C(const uint8* src_a, const uint8* src_b, int count) {
int i;
for (i = 0; i < count - 3; i += 4) {
uint32 x = *((uint32*)src_a) ^ *((uint32*)src_b);
uint32 u = x - ((x >> 1) & 0x55555555);
u = ((u >> 2) & 0x33333333) + (u & 0x33333333);
diff += ((((u + (u >> 4)) & 0x0f0f0f0f) * 0x01010101) >> 24);
src_a += 4;
src_b += 4;
uint32 u = x - ((x >> 1) & 033333333333) - ((x >> 2) & 011111111111);
diff += ((u + (u >> 3)) & 030707070707) % 63;
}
return diff;
}
......
......@@ -20,40 +20,11 @@ extern "C" {
#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
#if 0
// 256 bits at a time
// NOTE(review): each iteration below loads one 16-byte vector per source and
// subtracts 16 from count, i.e. 128 bits per pass; the "256 bits" heading does
// not match this body - confirm which is intended.
//
// AArch64 NEON Hamming distance. Per iteration: load 16 bytes from each
// buffer, XOR them, take the per-byte population count, sum the 16 byte
// counts into one value and add it to a running total. The low 32 bits of the
// total are returned.
// The loop body runs before count is tested, so it executes at least once and
// always consumes 16 bytes per pass; presumably callers pass a positive
// multiple of 16 - confirm against the caller.
uint32 HammingDistance_NEON(const uint8* src_a, const uint8* src_b, int count) {
uint32 diff;
asm volatile (
// v4 = per-iteration sum, v5 = running total; both start at zero.
"eor v4.16b, v4.16b, v4.16b \n"
"eor v5.16b, v5.16b, v5.16b \n"
"1: \n"
MEMACCESS(0)
"ld1 {v0.16b}, [%0], #16 \n"
MEMACCESS(1)
"ld1 {v1.16b}, [%1], #16 \n"
// Sets the flags consumed by b.gt at the bottom of the loop.
"subs %w2, %w2, #16 \n"
"eor v2.16b, v0.16b, v1.16b \n"
"cnt v3.16b, v2.16b \n"
// Horizontal add of 16 byte counts (each at most 8, so at most 128 in total).
"addv b4, v3.16b \n"
"add d5, d5, d4 \n"
"b.gt 1b \n"
"fmov %w3, s5 \n"
: "+r"(src_a),
"+r"(src_b),
"+r"(count),
"=r"(diff)
:
: "cc", "v0", "v1", "v2", "v3", "v4", "v5");
return diff;
}
#endif
uint32 HammingDistance_NEON(const uint8* src_a, const uint8* src_b, int count) {
uint32 diff;
asm volatile (
"movi d6, #0 \n"
"movi d4, #0 \n"
"1: \n"
MEMACCESS(0)
......@@ -65,19 +36,19 @@ uint32 HammingDistance_NEON(const uint8* src_a, const uint8* src_b, int count) {
"eor v1.16b, v1.16b, v3.16b \n"
"cnt v0.16b, v0.16b \n"
"cnt v1.16b, v1.16b \n"
"addv b4, v0.16b \n"
"addv b5, v1.16b \n"
"add d6, d6, d4 \n"
"add d6, d6, d5 \n"
"uaddlv h0, v0.16b \n"
"uaddlv h1, v1.16b \n"
"add d4, d4, d0 \n"
"add d4, d4, d1 \n"
"b.gt 1b \n"
"fmov %w3, s6 \n"
"fmov %w3, s4 \n"
: "+r"(src_a),
"+r"(src_b),
"+r"(count),
"=r"(diff)
:
: "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6");
: "cc", "v0", "v1", "v2", "v3", "v4");
return diff;
}
......
......@@ -275,6 +275,37 @@ TEST_F(LibYUVBaseTest, BenchmarkHammingDistance_C) {
free_aligned_buffer_page_end(src_b);
}
// Exercises ComputeHammingDistance twice: first on a fixed 16-byte pair with a
// known answer, then on randomized buffers where its result must match
// HammingDistance_C. The second call is repeated in a loop sized from the
// benchmark_* settings so the test also serves as a benchmark.
TEST_F(LibYUVBaseTest, BenchmarkHammingDistance) {
  const int kMaxWidth = 4096 * 3;
  align_buffer_page_end(src_a, kMaxWidth);
  align_buffer_page_end(src_b, kMaxWidth);
  memset(src_a, 0, kMaxWidth);
  memset(src_b, 0, kMaxWidth);

  // Known-answer case: these two 16-byte strings differ in 16 bits.
  memcpy(src_a, "test0123test4567", 16);
  memcpy(src_b, "tick0123tock4567", 16);
  uint64 opt_distance = ComputeHammingDistance(src_a, src_b, 16);
  EXPECT_EQ(16u, opt_distance);

  // Test C vs OPT on random buffer
  MemRandomize(src_a, kMaxWidth);
  MemRandomize(src_b, kMaxWidth);
  uint32 c_distance = HammingDistance_C(src_a, src_b, kMaxWidth);

  // One call per kMaxWidth-sized slice of the benchmark frame (rounded up),
  // times the configured iteration count.
  int total_calls =
      benchmark_iterations_ *
      ((benchmark_width_ * benchmark_height_ + kMaxWidth - 1) / kMaxWidth);
  for (int call = 0; call < total_calls; ++call) {
    opt_distance = ComputeHammingDistance(src_a, src_b, kMaxWidth);
  }
  EXPECT_EQ(c_distance, opt_distance);

  free_aligned_buffer_page_end(src_a);
  free_aligned_buffer_page_end(src_b);
}
TEST_F(LibYUVBaseTest, BenchmarkSumSquareError_Opt) {
const int kMaxWidth = 4096 * 3;
align_buffer_page_end(src_a, kMaxWidth);
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment