Commit bde789b1 authored by Frank Barchard, committed by Commit Bot

Hamming Distance SSE2 and AVX2 optimized

Bug: None
Test: None
Change-Id: Id52663f9c957aac3172fba92d888ad1b041d5cf0
Reviewed-on: https://chromium-review.googlesource.com/692981
Reviewed-by: Cheng Wang <wangcheng@google.com>
Commit-Queue: Frank Barchard <fbarchard@google.com>
parent 311add63
...@@ -158,9 +158,13 @@ static_library("libyuv_internal") { ...@@ -158,9 +158,13 @@ static_library("libyuv_internal") {
} }
# To enable AVX2 or other cpu optimization, pass flag here # To enable AVX2 or other cpu optimization, pass flag here
# cflags = [ "-mavx2", "-mpopcnt", "-mavx2", "-mfma" ]
if (!is_win) { if (!is_win) {
cflags = [ "-ffp-contract=fast" ] # Enable fma vectorization for NEON. cflags = [
# "-mpopcnt",
# "-mavx2",
# "-mfma",
"-ffp-contract=fast", # Enable fma vectorization for NEON.
]
} }
} }
if (libyuv_use_neon) { if (libyuv_use_neon) {
......
...@@ -49,6 +49,7 @@ extern "C" { ...@@ -49,6 +49,7 @@ extern "C" {
// #define DISABLE_CLANG_MSA 1 // #define DISABLE_CLANG_MSA 1
#endif #endif
// The following are available for Visual C:
#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && \ #if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && \
(defined(VISUALC_HAS_AVX2) || defined(CLANG_HAS_AVX2)) (defined(VISUALC_HAS_AVX2) || defined(CLANG_HAS_AVX2))
#define HAS_HASHDJB2_AVX2 #define HAS_HASHDJB2_AVX2
...@@ -69,6 +70,12 @@ extern "C" { ...@@ -69,6 +70,12 @@ extern "C" {
#define HAS_SUMSQUAREERROR_AVX2 #define HAS_SUMSQUAREERROR_AVX2
#endif #endif
// The following are available for GCC and clangcl 64 bit:
#if !defined(LIBYUV_DISABLE_X86) && defined(CLANG_HAS_AVX2) && \
(defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER)))
#define HAS_HAMMINGDISTANCE_AVX2
#endif
// The following are available for Neon: // The following are available for Neon:
#if !defined(LIBYUV_DISABLE_NEON) && \ #if !defined(LIBYUV_DISABLE_NEON) && \
(defined(__ARM_NEON__) || defined(LIBYUV_NEON) || defined(__aarch64__)) (defined(__ARM_NEON__) || defined(LIBYUV_NEON) || defined(__aarch64__))
...@@ -86,6 +93,8 @@ extern "C" { ...@@ -86,6 +93,8 @@ extern "C" {
uint32 HammingDistance_C(const uint8* src_a, const uint8* src_b, int count); uint32 HammingDistance_C(const uint8* src_a, const uint8* src_b, int count);
uint32 HammingDistance_X86(const uint8* src_a, const uint8* src_b, int count); uint32 HammingDistance_X86(const uint8* src_a, const uint8* src_b, int count);
uint32 HammingDistance_SSE2(const uint8* src_a, const uint8* src_b, int count);
uint32 HammingDistance_AVX2(const uint8* src_a, const uint8* src_b, int count);
uint32 HammingDistance_NEON(const uint8* src_a, const uint8* src_b, int count); uint32 HammingDistance_NEON(const uint8* src_a, const uint8* src_b, int count);
uint32 HammingDistance_MSA(const uint8* src_a, const uint8* src_b, int count); uint32 HammingDistance_MSA(const uint8* src_a, const uint8* src_b, int count);
......
...@@ -115,7 +115,8 @@ uint64 ComputeHammingDistance(const uint8* src_a, ...@@ -115,7 +115,8 @@ uint64 ComputeHammingDistance(const uint8* src_a,
const uint8* src_b, const uint8* src_b,
int count) { int count) {
const int kBlockSize = 65536; const int kBlockSize = 65536;
int remainder = count & (kBlockSize - 1) & ~31; // SIMD for multiple of 64, and C for remainder
int remainder = count & (kBlockSize - 1) & ~63;
uint64 diff = 0; uint64 diff = 0;
int i; int i;
uint32 (*HammingDistance)(const uint8* src_a, const uint8* src_b, int count) = uint32 (*HammingDistance)(const uint8* src_a, const uint8* src_b, int count) =
......
...@@ -35,6 +35,63 @@ uint32 HammingDistance_X86(const uint8* src_a, const uint8* src_b, int count) { ...@@ -35,6 +35,63 @@ uint32 HammingDistance_X86(const uint8* src_a, const uint8* src_b, int count) {
return diff; return diff;
} }
#ifdef HAS_HAMMINGDISTANCE_AVX2
// 0x0f in every byte; broadcast across ymm2 it isolates one nibble per byte.
static const uint32 kNibbleMask = 0x0f0f0f0fu;
// 16-entry lookup table: entry i holds the number of set bits in the 4-bit
// value i. Broadcast to both 128-bit lanes of ymm3 and indexed with vpshufb.
static const vec8 kBitCount = {0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4};

// Returns the number of bit positions in which src_a and src_b differ.
//
// Each loop iteration consumes 64 bytes from both buffers: the inputs are
// XORed, every byte is split into its low and high nibble, each nibble is
// mapped to its bit count through kBitCount, and the per-byte counts are
// summed.
//
// The loop body runs before count is tested and steps by 64, so it always
// reads at least 64 bytes and rounds count up to the next multiple of 64.
// Callers should therefore pass a positive multiple of 64.
//
// Neither buffer needs any particular alignment: src_a is read with unaligned
// loads and src_b is read through VEX-encoded vpxor memory operands.
uint32 HammingDistance_AVX2(const uint8* src_a, const uint8* src_b, int count) {
  uint32 diff = 0u;
  asm volatile(
      "vbroadcastss %4,%%ymm2                    \n"  // ymm2 = nibble mask
      "vbroadcastf128 %5,%%ymm3                  \n"  // ymm3 = bit count table
      "vpxor      %%ymm0,%%ymm0,%%ymm0           \n"  // ymm0 = running total
      "vpxor      %%ymm1,%%ymm1,%%ymm1           \n"  // ymm1 = zero for vpsadbw
      // Turn src_b into an offset from src_a so that advancing %0 alone
      // walks both buffers.
      "sub        %0,%1                          \n"

      LABELALIGN
      "1:                                        \n"
      // vmovdqu instead of vmovdqa: the aligned form faults when src_a is
      // not 32-byte aligned.
      "vmovdqu    (%0),%%ymm4                    \n"
      "vmovdqu    0x20(%0), %%ymm5               \n"
      // First 32 bytes: a ^ b, then bit count of the low and high nibbles.
      "vpxor      (%0,%1), %%ymm4, %%ymm4        \n"
      "vpand      %%ymm2,%%ymm4,%%ymm6           \n"
      "vpsrlw     $0x4,%%ymm4,%%ymm4             \n"
      "vpshufb    %%ymm6,%%ymm3,%%ymm6           \n"
      "vpand      %%ymm2,%%ymm4,%%ymm4           \n"
      "vpshufb    %%ymm4,%%ymm3,%%ymm4           \n"
      "vpaddb     %%ymm4,%%ymm6,%%ymm6           \n"
      // Second 32 bytes, same sequence.
      "vpxor      0x20(%0,%1),%%ymm5,%%ymm4      \n"
      "add        $0x40,%0                       \n"
      "vpand      %%ymm2,%%ymm4,%%ymm5           \n"
      "vpsrlw     $0x4,%%ymm4,%%ymm4             \n"
      "vpshufb    %%ymm5,%%ymm3,%%ymm5           \n"
      "vpand      %%ymm2,%%ymm4,%%ymm4           \n"
      "vpshufb    %%ymm4,%%ymm3,%%ymm4           \n"
      "vpaddb     %%ymm5,%%ymm4,%%ymm4           \n"
      // Combine both halves (at most 16 per byte), then sum the bytes of
      // each 64-bit lane against zero and accumulate.
      "vpaddb     %%ymm6,%%ymm4,%%ymm4           \n"
      "vpsadbw    %%ymm1,%%ymm4,%%ymm4           \n"
      "vpaddd     %%ymm0,%%ymm4,%%ymm0           \n"
      "sub        $0x40,%2                       \n"
      "jg         1b                             \n"

      // Horizontal reduction of the four 64-bit lane totals into lane 0:
      // 0xb1 swaps adjacent lanes, 0xaa replicates lane 2 into every lane.
      "vpermq     $0xb1,%%ymm0,%%ymm1            \n"
      "vpaddd     %%ymm1,%%ymm0,%%ymm0           \n"
      "vpermq     $0xaa,%%ymm0,%%ymm1            \n"
      "vpaddd     %%ymm1,%%ymm0,%%ymm0           \n"
      "vmovd      %%xmm0, %3                     \n"
      "vzeroupper                                \n"
      : "+r"(src_a),       // %0
        "+r"(src_b),       // %1
        "+r"(count),       // %2
        "=g"(diff)         // %3
      : "m"(kNibbleMask),  // %4
        "m"(kBitCount)     // %5
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
  return diff;
}
#endif  // HAS_HAMMINGDISTANCE_AVX2
uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count) { uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count) {
uint32 sse; uint32 sse;
asm volatile ( asm volatile (
......
...@@ -229,13 +229,19 @@ TEST_F(LibYUVCompareTest, BenchmarkHammingDistance_Opt) { ...@@ -229,13 +229,19 @@ TEST_F(LibYUVCompareTest, BenchmarkHammingDistance_Opt) {
for (int i = 0; i < count; ++i) { for (int i = 0; i < count; ++i) {
#if defined(HAS_HAMMINGDISTANCE_NEON) #if defined(HAS_HAMMINGDISTANCE_NEON)
h1 = HammingDistance_NEON(src_a, src_b, kMaxWidth); h1 = HammingDistance_NEON(src_a, src_b, kMaxWidth);
#elif defined(HAS_HAMMINGDISTANCE_AVX2)
int has_avx2 = TestCpuFlag(kCpuHasAVX2);
if (has_avx2) {
h1 = HammingDistance_AVX2(src_a, src_b, kMaxWidth);
} else {
h1 = HammingDistance_X86(src_a, src_b, kMaxWidth);
}
#elif defined(HAS_HAMMINGDISTANCE_X86) #elif defined(HAS_HAMMINGDISTANCE_X86)
h1 = HammingDistance_X86(src_a, src_b, kMaxWidth); h1 = HammingDistance_X86(src_a, src_b, kMaxWidth);
#else #else
h1 = HammingDistance_C(src_a, src_b, kMaxWidth); h1 = HammingDistance_C(src_a, src_b, kMaxWidth);
#endif #endif
} }
EXPECT_EQ(h0, h1); EXPECT_EQ(h0, h1);
free_aligned_buffer_page_end(src_a); free_aligned_buffer_page_end(src_a);
......
...@@ -2878,13 +2878,13 @@ float TestCopySamples(int benchmark_width, ...@@ -2878,13 +2878,13 @@ float TestCopySamples(int benchmark_width,
TEST_F(LibYUVPlanarTest, TestCopySamples_C) { TEST_F(LibYUVPlanarTest, TestCopySamples_C) {
float diff = TestCopySamples(benchmark_width_, benchmark_height_, float diff = TestCopySamples(benchmark_width_, benchmark_height_,
benchmark_iterations_, false); benchmark_iterations_, false);
EXPECT_EQ(0, diff); EXPECT_EQ(0, diff);
} }
TEST_F(LibYUVPlanarTest, TestCopySamples_Opt) { TEST_F(LibYUVPlanarTest, TestCopySamples_Opt) {
float diff = TestCopySamples(benchmark_width_, benchmark_height_, float diff = TestCopySamples(benchmark_width_, benchmark_height_,
benchmark_iterations_, true); benchmark_iterations_, true);
EXPECT_EQ(0, diff); EXPECT_EQ(0, diff);
} }
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment