Commit fecd7417 authored by Frank Barchard, committed by Commit Bot

Port HammingDistance to SSSE3

Bug: libyuv:701
Test: BenchmarkHammingDistance_Opt
Change-Id: Ibdd5d382677ebef4f82a62e0d5c3b88614a3b6e4
Reviewed-on: https://chromium-review.googlesource.com/696290
Commit-Queue: Frank Barchard <fbarchard@google.com>
Reviewed-by: Cheng Wang <wangcheng@google.com>
parent bde789b1
......@@ -160,9 +160,9 @@ static_library("libyuv_internal") {
# To enable AVX2 or other cpu optimization, pass flag here
if (!is_win) {
cflags = [
# "-mpopcnt",
# "-mavx2",
# "-mfma",
# "-mpopcnt",
# "-mavx2",
# "-mfma",
"-ffp-contract=fast", # Enable fma vectorization for NEON.
]
}
......
Name: libyuv
URL: http://code.google.com/p/libyuv/
Version: 1671
Version: 1672
License: BSD
License File: LICENSE
......
......@@ -70,7 +70,13 @@ extern "C" {
#define HAS_SUMSQUAREERROR_AVX2
#endif
// The following are available for VGCC and clangcl 64 bit:
// The following are available for GCC and clangcl 64 bit:
#if !defined(LIBYUV_DISABLE_X86) && \
(defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER)))
#define HAS_HAMMINGDISTANCE_SSSE3
#endif
// The following are available for GCC and clangcl 64 bit:
#if !defined(LIBYUV_DISABLE_X86) && defined(CLANG_HAS_AVX2) && \
(defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER)))
#define HAS_HAMMINGDISTANCE_AVX2
......@@ -93,7 +99,7 @@ extern "C" {
uint32 HammingDistance_C(const uint8* src_a, const uint8* src_b, int count);
uint32 HammingDistance_X86(const uint8* src_a, const uint8* src_b, int count);
uint32 HammingDistance_SSE2(const uint8* src_a, const uint8* src_b, int count);
uint32 HammingDistance_SSSE3(const uint8* src_a, const uint8* src_b, int count);
uint32 HammingDistance_AVX2(const uint8* src_a, const uint8* src_b, int count);
uint32 HammingDistance_NEON(const uint8* src_a, const uint8* src_b, int count);
uint32 HammingDistance_MSA(const uint8* src_a, const uint8* src_b, int count);
......
......@@ -11,6 +11,6 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_
#define INCLUDE_LIBYUV_VERSION_H_
#define LIBYUV_VERSION 1671
#define LIBYUV_VERSION 1672
#endif // INCLUDE_LIBYUV_VERSION_H_
......@@ -131,6 +131,11 @@ uint64 ComputeHammingDistance(const uint8* src_a,
HammingDistance = HammingDistance_X86;
}
#endif
#if defined(HAS_HAMMINGDISTANCE_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
HammingDistance = HammingDistance_SSSE3;
}
#endif
#if defined(HAS_HAMMINGDISTANCE_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
HammingDistance = HammingDistance_AVX2;
......
......@@ -35,15 +35,74 @@ uint32 HammingDistance_X86(const uint8* src_a, const uint8* src_b, int count) {
return diff;
}
#ifdef HAS_HAMMINGDISTANCE_AVX2
static uint32 kNibbleMask = 0x0f0f0f0fu;
// Byte mask with 0x0f in each of its 16 lanes. The SIMD Hamming distance code
// ANDs it against data to isolate a 4-bit nibble per byte, so the result can
// be used as an index for pshufb.
// NOTE(review): vec8 is presumably a 16 x 8-bit vector type declared elsewhere
// in the project - confirm.
static vec8 kNibbleMask = {15, 15, 15, 15, 15, 15, 15, 15,
15, 15, 15, 15, 15, 15, 15, 15};
// 16-entry lookup table: entry i holds the number of set bits in the 4-bit
// value i. Used as the pshufb table to count bits one nibble at a time.
static vec8 kBitCount = {0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4};
// Returns the Hamming distance between src_a and src_b: the number of bit
// positions that differ across `count` bytes.
//
// Method: XOR the two inputs, split every byte of the result into its low and
// high nibble, look up each nibble's bit count in kBitCount with pshufb, and
// accumulate the per-byte counts with psadbw/paddd.
//
// Preconditions implied by the assembly below:
//  - 32 bytes are consumed per iteration and the loop body runs before count
//    is tested, so count is expected to be a positive multiple of 32.
//  - movdqa and pxor with a memory operand require 16-byte aligned addresses,
//    so both src_a and src_b must be 16-byte aligned.
//
// Only SSE2/SSSE3 (legacy-encoded) instructions are used. The result is read
// back with movd rather than the VEX-encoded vmovd, which is an AVX
// instruction and would fault on CPUs that have SSSE3 but not AVX.
uint32 HammingDistance_SSSE3(const uint8* src_a,
                             const uint8* src_b,
                             int count) {
  uint32 diff = 0u;

  asm volatile(
      "movdqa     %4,%%xmm2                      \n"  // xmm2 = nibble mask
      "movdqa     %5,%%xmm3                      \n"  // xmm3 = bit count table
      "pxor       %%xmm0,%%xmm0                  \n"  // xmm0 = running total
      "pxor       %%xmm1,%%xmm1                  \n"  // xmm1 = zero for psadbw
      "sub        %0,%1                          \n"  // %1 = src_b - src_a

      LABELALIGN
      "1:                                        \n"
      // First 16 bytes: xmm6 = per-byte bit counts of a ^ b.
      "movdqa     (%0),%%xmm4                    \n"
      "movdqa     0x10(%0), %%xmm5               \n"
      "pxor       (%0,%1), %%xmm4                \n"
      "movdqa     %%xmm4,%%xmm6                  \n"
      "pand       %%xmm2,%%xmm6                  \n"  // low nibbles
      "psrlw      $0x4,%%xmm4                    \n"
      "movdqa     %%xmm3,%%xmm7                  \n"
      "pshufb     %%xmm6,%%xmm7                  \n"  // counts of low nibbles
      "pand       %%xmm2,%%xmm4                  \n"  // high nibbles
      "movdqa     %%xmm3,%%xmm6                  \n"
      "pshufb     %%xmm4,%%xmm6                  \n"  // counts of high nibbles
      "paddb      %%xmm7,%%xmm6                  \n"
      // Second 16 bytes: xmm5 = per-byte bit counts of a ^ b.
      "pxor       0x10(%0,%1),%%xmm5             \n"
      "add        $0x20,%0                       \n"
      "movdqa     %%xmm5,%%xmm4                  \n"
      "pand       %%xmm2,%%xmm5                  \n"  // low nibbles
      "psrlw      $0x4,%%xmm4                    \n"
      "movdqa     %%xmm3,%%xmm7                  \n"
      "pshufb     %%xmm5,%%xmm7                  \n"  // counts of low nibbles
      "pand       %%xmm2,%%xmm4                  \n"  // high nibbles
      "movdqa     %%xmm3,%%xmm5                  \n"
      "pshufb     %%xmm4,%%xmm5                  \n"  // counts of high nibbles
      "paddb      %%xmm7,%%xmm5                  \n"
      // Combine both halves (at most 16 per byte, so no 8-bit overflow) and
      // sum the bytes horizontally into the two 64-bit lanes.
      "paddb      %%xmm5,%%xmm6                  \n"
      "psadbw     %%xmm1,%%xmm6                  \n"
      "paddd      %%xmm6,%%xmm0                  \n"
      "sub        $0x20,%2                       \n"
      "jg         1b                             \n"

      // Fold the upper 64-bit lane's sum (dword 2) into dword 0.
      "pshufd     $0xaa,%%xmm0,%%xmm1            \n"
      "paddd      %%xmm1,%%xmm0                  \n"
      "movd       %%xmm0, %3                     \n"
      : "+r"(src_a),       // %0
        "+r"(src_b),       // %1
        "+r"(count),       // %2
        "=r"(diff)         // %3
      : "m"(kNibbleMask),  // %4
        "m"(kBitCount)     // %5
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");

  return diff;
}
#ifdef HAS_HAMMINGDISTANCE_AVX2
uint32 HammingDistance_AVX2(const uint8* src_a, const uint8* src_b, int count) {
uint32 diff = 0u;
asm volatile(
"vbroadcastss %4,%%ymm2 \n"
"vbroadcastf128 %4,%%ymm2 \n"
"vbroadcastf128 %5,%%ymm3 \n"
"vpxor %%ymm0,%%ymm0,%%ymm0 \n"
"vpxor %%ymm1,%%ymm1,%%ymm1 \n"
......@@ -83,7 +142,7 @@ uint32 HammingDistance_AVX2(const uint8* src_a, const uint8* src_b, int count) {
: "+r"(src_a), // %0
"+r"(src_b), // %1
"+r"(count), // %2
"=g"(diff) // %3
"=r"(diff) // %3
: "m"(kNibbleMask), // %4
"m"(kBitCount) // %5
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
......
......@@ -636,10 +636,10 @@ void CopyRow_NEON(const uint8* src, uint8* dst, int count) {
"subs %w2, %w2, #32 \n" // 32 processed per loop
"stp q0, q1, [%1], #32 \n"
"b.gt 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(count) // %2 // Output registers
: // Input registers
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(count) // %2 // Output registers
: // Input registers
: "cc", "memory", "v0", "v1" // Clobber List
);
}
......
......@@ -234,7 +234,12 @@ TEST_F(LibYUVCompareTest, BenchmarkHammingDistance_Opt) {
if (has_avx2) {
h1 = HammingDistance_AVX2(src_a, src_b, kMaxWidth);
} else {
h1 = HammingDistance_X86(src_a, src_b, kMaxWidth);
int has_ssse3 = TestCpuFlag(kCpuHasSSSE3);
if (has_ssse3) {
h1 = HammingDistance_SSSE3(src_a, src_b, kMaxWidth);
} else {
h1 = HammingDistance_X86(src_a, src_b, kMaxWidth);
}
}
#elif defined(HAS_HAMMINGDISTANCE_X86)
h1 = HammingDistance_X86(src_a, src_b, kMaxWidth);
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment