Commit bde789b1 authored by Frank Barchard, committed by Commit Bot

Hamming Distance SSE2 and AVX2 optimized

Bug: None
Test: None
Change-Id: Id52663f9c957aac3172fba92d888ad1b041d5cf0
Reviewed-on: https://chromium-review.googlesource.com/692981
Reviewed-by: Cheng Wang <wangcheng@google.com>
Commit-Queue: Frank Barchard <fbarchard@google.com>
parent 311add63
......@@ -158,9 +158,13 @@ static_library("libyuv_internal") {
}
# To enable AVX2 or other cpu optimization, pass flag here
# cflags = [ "-mavx2", "-mpopcnt", "-mavx2", "-mfma" ]
if (!is_win) {
cflags = [ "-ffp-contract=fast" ] # Enable fma vectorization for NEON.
cflags = [
# "-mpopcnt",
# "-mavx2",
# "-mfma",
"-ffp-contract=fast", # Enable fma vectorization for NEON.
]
}
}
if (libyuv_use_neon) {
......
......@@ -49,6 +49,7 @@ extern "C" {
// #define DISABLE_CLANG_MSA 1
#endif
// The following are available for Visual C:
#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && \
(defined(VISUALC_HAS_AVX2) || defined(CLANG_HAS_AVX2))
#define HAS_HASHDJB2_AVX2
......@@ -69,6 +70,12 @@ extern "C" {
#define HAS_SUMSQUAREERROR_AVX2
#endif
// The following are available for GCC and clangcl 64 bit:
#if !defined(LIBYUV_DISABLE_X86) && defined(CLANG_HAS_AVX2) && \
(defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER)))
#define HAS_HAMMINGDISTANCE_AVX2
#endif
// The following are available for Neon:
#if !defined(LIBYUV_DISABLE_NEON) && \
(defined(__ARM_NEON__) || defined(LIBYUV_NEON) || defined(__aarch64__))
......@@ -86,6 +93,8 @@ extern "C" {
// Hamming distance kernels, one per implementation variant (C, X86, SSE2,
// AVX2, NEON, MSA). All share the same signature: two byte buffers and a
// byte count in, a 32-bit result out.
uint32 HammingDistance_C(const uint8* src_a, const uint8* src_b, int count);
uint32 HammingDistance_X86(const uint8* src_a, const uint8* src_b, int count);
uint32 HammingDistance_SSE2(const uint8* src_a, const uint8* src_b, int count);
uint32 HammingDistance_AVX2(const uint8* src_a, const uint8* src_b, int count);
uint32 HammingDistance_NEON(const uint8* src_a, const uint8* src_b, int count);
uint32 HammingDistance_MSA(const uint8* src_a, const uint8* src_b, int count);
......
......@@ -115,7 +115,8 @@ uint64 ComputeHammingDistance(const uint8* src_a,
const uint8* src_b,
int count) {
const int kBlockSize = 65536;
int remainder = count & (kBlockSize - 1) & ~31;
// SIMD for multiple of 64, and C for remainder
int remainder = count & (kBlockSize - 1) & ~63;
uint64 diff = 0;
int i;
uint32 (*HammingDistance)(const uint8* src_a, const uint8* src_b, int count) =
......
......@@ -35,6 +35,63 @@ uint32 HammingDistance_X86(const uint8* src_a, const uint8* src_b, int count) {
return diff;
}
#ifdef HAS_HAMMINGDISTANCE_AVX2
// Mask that keeps the low nibble of every byte (broadcast to a full register).
static const uint32 kNibbleMask = 0x0f0f0f0fu;
// vpshufb lookup table: entry n is the number of set bits in the nibble n.
static const vec8 kBitCount = {0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4};

// Counts the bits that differ between src_a and src_b using AVX2.
//
// src_a, src_b: input buffers; no alignment is required.
// count: number of bytes to compare. The loop consumes 64 bytes per iteration
//        and tests count only after the first pass, so count must be a
//        positive multiple of 64; otherwise bytes beyond count are read.
// Returns the number of differing bits, accumulated in 32-bit lanes.
uint32 HammingDistance_AVX2(const uint8* src_a, const uint8* src_b, int count) {
  uint32 diff = 0u;

  asm volatile(
      "vbroadcastss %4,%%ymm2                    \n"  // ymm2 = nibble mask
      "vbroadcastf128 %5,%%ymm3                  \n"  // ymm3 = bit count table
      "vpxor      %%ymm0,%%ymm0,%%ymm0           \n"  // ymm0 = running total
      "vpxor      %%ymm1,%%ymm1,%%ymm1           \n"  // ymm1 = zero for vpsadbw
      // Turn src_b into an offset from src_a so one pointer advances both.
      "sub        %0,%1                          \n"

      LABELALIGN
      "1:                                        \n"
      // Unaligned loads: callers pass arbitrary byte pointers, and vmovdqa
      // would fault on any address that is not 32-byte aligned.
      "vmovdqu    (%0),%%ymm4                    \n"
      "vmovdqu    0x20(%0), %%ymm5               \n"
      // First 32 bytes: xor, then look up the bit count of each nibble.
      "vpxor      (%0,%1), %%ymm4, %%ymm4        \n"
      "vpand      %%ymm2,%%ymm4,%%ymm6           \n"
      "vpsrlw     $0x4,%%ymm4,%%ymm4             \n"
      "vpshufb    %%ymm6,%%ymm3,%%ymm6           \n"
      "vpand      %%ymm2,%%ymm4,%%ymm4           \n"
      "vpshufb    %%ymm4,%%ymm3,%%ymm4           \n"
      "vpaddb     %%ymm4,%%ymm6,%%ymm6           \n"
      // Second 32 bytes, same steps.
      "vpxor      0x20(%0,%1),%%ymm5,%%ymm4      \n"
      "add        $0x40,%0                       \n"
      "vpand      %%ymm2,%%ymm4,%%ymm5           \n"
      "vpsrlw     $0x4,%%ymm4,%%ymm4             \n"
      "vpshufb    %%ymm5,%%ymm3,%%ymm5           \n"
      "vpand      %%ymm2,%%ymm4,%%ymm4           \n"
      "vpshufb    %%ymm4,%%ymm3,%%ymm4           \n"
      "vpaddb     %%ymm5,%%ymm4,%%ymm4           \n"
      // Per-byte counts are at most 16 here, so the byte adds cannot overflow.
      "vpaddb     %%ymm6,%%ymm4,%%ymm4           \n"
      // Sum each group of 8 bytes into a 64-bit lane, then accumulate.
      "vpsadbw    %%ymm1,%%ymm4,%%ymm4           \n"
      "vpaddd     %%ymm0,%%ymm4,%%ymm0           \n"
      "sub        $0x40,%2                       \n"
      "jg         1b                             \n"

      // Horizontal add of the four 64-bit lanes into lane 0.
      "vpermq     $0xb1,%%ymm0,%%ymm1            \n"
      "vpaddd     %%ymm1,%%ymm0,%%ymm0           \n"
      "vpermq     $0xaa,%%ymm0,%%ymm1            \n"
      "vpaddd     %%ymm1,%%ymm0,%%ymm0           \n"
      "vmovd      %%xmm0, %3                     \n"
      "vzeroupper                                \n"
      : "+r"(src_a),       // %0
        "+r"(src_b),       // %1
        "+r"(count),       // %2
        "=g"(diff)         // %3
      : "m"(kNibbleMask),  // %4
        "m"(kBitCount)     // %5
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");

  return diff;
}
#endif // HAS_HAMMINGDISTANCE_AVX2
uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count) {
uint32 sse;
asm volatile (
......
......@@ -229,13 +229,19 @@ TEST_F(LibYUVCompareTest, BenchmarkHammingDistance_Opt) {
for (int i = 0; i < count; ++i) {
#if defined(HAS_HAMMINGDISTANCE_NEON)
h1 = HammingDistance_NEON(src_a, src_b, kMaxWidth);
#elif defined(HAS_HAMMINGDISTANCE_AVX2)
int has_avx2 = TestCpuFlag(kCpuHasAVX2);
if (has_avx2) {
h1 = HammingDistance_AVX2(src_a, src_b, kMaxWidth);
} else {
h1 = HammingDistance_X86(src_a, src_b, kMaxWidth);
}
#elif defined(HAS_HAMMINGDISTANCE_X86)
h1 = HammingDistance_X86(src_a, src_b, kMaxWidth);
#else
h1 = HammingDistance_C(src_a, src_b, kMaxWidth);
#endif
}
EXPECT_EQ(h0, h1);
free_aligned_buffer_page_end(src_a);
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment