Commit ddf9051b authored by fbarchard@google.com's avatar fbarchard@google.com

DJB2 hash with SSE4 pmulld

BUG=none
TEST=none
Review URL: https://webrtc-codereview.appspot.com/484002

git-svn-id: http://libyuv.googlecode.com/svn/trunk@237 16f28f9a-4ce2-e073-06de-1de4eb20be90
parent c5d44a0c
Name: libyuv
URL: http://code.google.com/p/libyuv/
Version: 236
Version: 237
License: BSD
License File: LICENSE
......
......@@ -11,7 +11,7 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_
#define INCLUDE_LIBYUV_VERSION_H_
#define LIBYUV_VERSION 236
#define LIBYUV_VERSION 237
#endif // INCLUDE_LIBYUV_VERSION_H_
......@@ -25,6 +25,7 @@ namespace libyuv {
extern "C" {
#endif
// hash seed of 5381 recommended.
// Internal C version of HashDjb2 with int sized count for efficiency.
static uint32 HashDjb2_C(const uint8* src, int count, uint32 seed) {
uint32 hash = seed;
......@@ -34,17 +35,250 @@ static uint32 HashDjb2_C(const uint8* src, int count, uint32 seed) {
return hash;
}
// This module is for Visual C x86
#if !defined(YUV_DISABLE_ASM) && defined(_M_IX86)
#define HAS_HASHDJB2_SSE41
// Multiplier tables for the 8-bytes-per-iteration hash (HashDjb2_SSE41).
// Element 0 of each table lands in the lowest 32-bit lane, i.e. it pairs
// with the lowest-addressed source byte of the group.
//
// Powers of 33 applied to src[0..3] of each 8-byte group: 33^7 .. 33^4.
// 33^7 does not fit in 32 bits, so it is written as its value modulo 2^32.
static const vec32 kMulL33 = {
0xEC41D4E1, // 33 * 33 * 33 * 33 * 33 * 33 * 33
33 * 33 * 33 * 33 * 33 * 33,
33 * 33 * 33 * 33 * 33,
33 * 33 * 33 * 33 * 1 };
// Powers of 33 applied to src[4..7] of each 8-byte group: 33^3 .. 33^0.
static const vec32 kMulH33 = {
33 * 33 * 33,
33 * 33,
33,
1 };
// 33^4 in lane 0, zero elsewhere.
// NOTE(review): not referenced by any code visible in this chunk.
static const vec32 kHash4x33 = { 33 * 33 * 33 * 33, 0, 0, 0 };
// 33^8 modulo 2^32 in lane 0, zero elsewhere. Multiplies the running hash
// once per 8 input bytes; the zero lanes clear the unused upper lanes.
static const vec32 kHash8x33 = {
0x747C7101, // 33 * 33 * 33 * 33 * 33 * 33 * 33 * 33,
0, 0, 0 };
// hash0 = initial state
// hash1 = hash0 * 33 + src[0]
// hash2 = hash1 * 33 + src[1] = (hash0 * 33 + src[0]) * 33 + src[1]
// hash3 = hash2 * 33 + src[2] = (hash1 * 33 + src[1]) * 33 + src[2] =
// ((hash0 * 33 + src[0]) * 33 + src[1]) * 33 + src[2]
// hash4 = hash3 * 33 + src[3] = (hash2 * 33 + src[2]) * 33 + src[3] =
// ((hash1 * 33 + src[1]) * 33 + src[2]) * 33 + src[3] =
// (((hash0 * 33 + src[0]) * 33 + src[1]) * 33 + src[2]) * 33 + src[3]
// pmovzxbd xmm1, [eax] // SSE4.1 requires VS2010
// pmulld requires Visual Studio 2008
// does 8 at a time, unaligned
// Hashes |count| bytes at |src| with DJB2 starting from |seed|, consuming
// 8 bytes per loop iteration. Each iteration computes, in 32-bit lanes,
//   hash = hash * 33^8 + src[0] * 33^7 + src[1] * 33^6 + ... + src[7] * 33^0
// which equals 8 steps of hash = hash * 33 + src[i].
//
// The loop condition is evaluated after the body, so 8 bytes are read even
// when |count| is 0 or negative, and a |count| that is not a multiple of 8
// reads past the requested range. Callers are expected to pass a positive
// multiple of 8. Source bytes are loaded with movq, so |src| itself needs no
// particular alignment.
//
// Declared naked: there is no prologue, arguments are read straight from the
// stack ([esp + 4], [esp + 8], [esp + 12]) and the result is returned in eax.
// pmulld is emitted as raw bytes (66 0F 38 40 /r) rather than by mnemonic.
__declspec(naked) __declspec(align(16))
static uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed) {
__asm {
mov eax, [esp + 4] // src
mov ecx, [esp + 8] // count
movd xmm0, [esp + 12] // seed
pxor xmm7, xmm7 // constant 0 for unpck
// movdqa needs 16-byte aligned operands.
// NOTE(review): assumes vec32 is declared 16-byte aligned - confirm.
movdqa xmm4, kHash8x33 // 33^8 multiplier for the running hash
movdqa xmm5, kMulL33 // multipliers for src[0-3]
movdqa xmm6, kMulH33 // multipliers for src[4-7]
align 16
wloop:
movq xmm1, qword ptr [eax] // src[0-7]
lea eax, [eax + 8]
punpcklbw xmm1, xmm7 // bytes -> 8 zero-extended words
movdqa xmm3, xmm1
punpcklwd xmm1, xmm7 // src[0-3] as dwords
// pmulld xmm1, xmm5
_asm _emit 0x66 _asm _emit 0x0F _asm _emit 0x38 _asm _emit 0x40 _asm _emit 0xCD
punpckhwd xmm3, xmm7 // src[4-7] as dwords
// pmulld xmm3, xmm6
_asm _emit 0x66 _asm _emit 0x0F _asm _emit 0x38 _asm _emit 0x40 _asm _emit 0xDE
sub ecx, 8 // sets the flags consumed by jg below
// pmulld xmm0, xmm4 // hash *= 33 ^ 8
_asm _emit 0x66 _asm _emit 0x0F _asm _emit 0x38 _asm _emit 0x40 _asm _emit 0xC4
paddd xmm1, xmm3 // add 2nd 4 to first 4
pshufd xmm2, xmm1, 14 // upper 2 dwords
paddd xmm1, xmm2
pshufd xmm2, xmm1, 1 // dword 1 into lane 0
paddd xmm1, xmm2 // lane 0 now holds the sum of all 8 products
paddd xmm0, xmm1 // hash += weighted bytes (only lane 0 is meaningful)
jg wloop // flags are still those of sub ecx, 8
movd eax, xmm0 // return hash
ret
}
}
#define HAS_HASHDJB2_ALIGNED_SSE41
// Multiplier tables for the 16-bytes-per-iteration hash. Every value is a
// power of 33 reduced modulo 2^32 and written as a signed 32-bit integer,
// which is why some entries are negative. Element 0 of each table pairs with
// the lowest-addressed byte of its 4-byte group.
//
// 33^16 in lane 0, zero elsewhere: scales the running hash once per 16 bytes.
static const vec32 kHash16x33 = { -1831214591, 0, 0, 0 }; // 33 ^ 16
// Multipliers for src[0-3].
static const vec32 kHashMul0 = {
204809697, // 33 ^ 15
-1555599935, // 33 ^ 14
994064801, // 33 ^ 13
1331628417, // 33 ^ 12
};
// Multipliers for src[4-7].
static const vec32 kHashMul1 = {
821255521, // 33 ^ 11
-2057521855, // 33 ^ 10
67801377, // 33 ^ 9
1954312449, // 33 ^ 8
};
// Multipliers for src[8-11].
static const vec32 kHashMul2 = {
-331229983, // 33 ^ 7
1291467969, // 33 ^ 6
39135393, // 33 ^ 5
1185921, // 33 ^ 4
};
// Multipliers for src[12-15].
static const vec32 kHashMul3 = {
35937, // 33 ^ 3
1089, // 33 ^ 2
33, // 33 ^ 1
1, // 33 ^ 0
};
// pmovzxbd xmm1, [eax] // SSE4.1 requires VS2010
// pmulld requires Visual Studio 2008
// does 16 at a time, aligned
// TODO(fbarchard): For SSE2 version use pmuludq
// pmulld xmm1, xmm5
// becomes
// movdqa xmm2, xmm1
// pmuludq xmm1, [33*33*33, 0, 33, 0]
// psrldq xmm2, 8
// pmuludq xmm2, [33*33, 0, 1, 0]
// paddd xmm1, xmm2
// pshufd xmm2, xmm1, 2
// paddd xmm1, xmm2
//27: 66 0F 38 40 C6 pmulld xmm0,xmm6
//44: 66 0F 38 40 DD pmulld xmm3,xmm5
//59: 66 0F 38 40 E5 pmulld xmm4,xmm5
//72: 66 0F 38 40 D5 pmulld xmm2,xmm5
//83: 66 0F 38 40 CD pmulld xmm1,xmm5
// Emits the raw encoding of a register-to-register SSE4.1 pmulld:
// the fixed bytes 66 0F 38 40 followed by |reg| as the ModRM byte, which
// selects both operands (e.g. 0xC6 encodes "pmulld xmm0, xmm6").
// Intended for use inside an __asm block.
#define pmulld(reg) _asm _emit 0x66 _asm _emit 0x0F _asm _emit 0x38 \
_asm _emit 0x40 _asm _emit reg
// Hashes |count| bytes at |src| with DJB2 starting from |seed|, consuming
// 16 bytes per loop iteration. Each iteration computes, in 32-bit lanes,
//   hash = hash * 33^16 + src[0] * 33^15 + src[1] * 33^14 + ... + src[15]
// which equals 16 steps of hash = hash * 33 + src[i].
//
// Source bytes are loaded with movdqa, so |src| must be 16-byte aligned;
// a misaligned pointer faults. The loop condition is evaluated after the
// body, so 16 bytes are read even when |count| is 0 or negative, and a
// |count| that is not a multiple of 16 reads past the requested range.
//
// Declared naked: arguments are read straight from the stack and the result
// is returned in eax. xmm5 is reloaded with a different multiplier table
// before each of the four per-group multiplies.
__declspec(naked) __declspec(align(16))
static uint32 HashDjb2_Aligned_SSE41(const uint8* src, int count, uint32 seed) {
__asm {
mov eax, [esp + 4] // src
mov ecx, [esp + 8] // count
movd xmm0, [esp + 12] // seed
pxor xmm7, xmm7 // constant 0 for unpck
movdqa xmm6, kHash16x33 // 33^16 multiplier for the running hash
align 16
wloop:
movdqa xmm1, [eax] // src[0-15]
lea eax, [eax + 16]
pmulld(0xc6) // pmulld xmm0,xmm6 hash *= 33 ^ 16
movdqa xmm5, kHashMul0
movdqa xmm2, xmm1
punpcklbw xmm2, xmm7 // src[0-7]
movdqa xmm3, xmm2
punpcklwd xmm3, xmm7 // src[0-3]
pmulld(0xdd) // pmulld xmm3, xmm5
movdqa xmm5, kHashMul1
movdqa xmm4, xmm2
punpckhwd xmm4, xmm7 // src[4-7]
pmulld(0xe5) // pmulld xmm4, xmm5
movdqa xmm5, kHashMul2
punpckhbw xmm1, xmm7 // src[8-15]
movdqa xmm2, xmm1
punpcklwd xmm2, xmm7 // src[8-11]
pmulld(0xd5) // pmulld xmm2, xmm5
movdqa xmm5, kHashMul3
punpckhwd xmm1, xmm7 // src[12-15]
pmulld(0xcd) // pmulld xmm1, xmm5
paddd xmm3, xmm4 // add 16 results
paddd xmm1, xmm2
sub ecx, 16 // sets the flags consumed by jg below
paddd xmm1, xmm3
pshufd xmm2, xmm1, 14 // upper 2 dwords
paddd xmm1, xmm2
pshufd xmm2, xmm1, 1 // dword 1 into lane 0
paddd xmm1, xmm2 // lane 0 now holds the sum of all 16 products
paddd xmm0, xmm1 // hash += weighted bytes (only lane 0 is meaningful)
jg wloop // flags are still those of sub ecx, 16
movd eax, xmm0 // return hash
ret
}
}
#if 0
// The following works but is slower than the movdqa version
// 66 0f 38 31 08 pmovzxbd xmm1, [eax]
// 66 0f 38 31 50 04 pmovzxbd xmm2, [eax + 4]
// 66 0f 38 31 58 08 pmovzxbd xmm3, [eax + 8]
// 66 0f 38 31 60 0c pmovzxbd xmm4, [eax + 12]
// Emit the raw encoding of SSE4.1 pmovzxbd (66 0F 38 31 /r) with a memory
// source operand, for use inside an __asm block.
// pmovzxbd0: |rmem| is the ModRM byte for a form with no displacement,
// e.g. 0x08 encodes "pmovzxbd xmm1, [eax]".
#define pmovzxbd0(rmem) _asm _emit 0x66 _asm _emit 0x0f _asm _emit 0x38 \
_asm _emit 0x31 _asm _emit rmem
// pmovzxbd: |rmem0| is the ModRM byte and |rmem1| the 8-bit displacement,
// e.g. (0x50, 0x04) encodes "pmovzxbd xmm2, [eax + 4]".
#define pmovzxbd(rmem0, rmem1) _asm _emit 0x66 _asm _emit 0x0f _asm _emit 0x38 \
_asm _emit 0x31 _asm _emit rmem0 _asm _emit rmem1
// Disabled (#if 0) variant of the 16-bytes-per-iteration DJB2 hash that
// loads each 4-byte group with pmovzxbd instead of one movdqa plus unpacks,
// so |src| needs no particular alignment. Each iteration computes
//   hash = hash * 33^16 + src[0] * 33^15 + ... + src[15] * 33^0.
// The loop condition is evaluated after the body, so 16 bytes are read even
// when |count| is 0 or negative.
// Unlike the enabled versions, pmulld is written by mnemonic here, with the
// multiplier tables used directly as memory operands.
__declspec(naked) __declspec(align(16))
static uint32 HashDjb2_Unaligned_SSE41(const uint8* src, int count,
uint32 seed) {
__asm {
mov eax, [esp + 4] // src
mov ecx, [esp + 8] // count
movd xmm0, [esp + 12] // seed
movdqa xmm5, kHash16x33 // 33^16 multiplier for the running hash
align 16
wloop:
pmovzxbd0(0x08) // src[0-3] pmovzxbd xmm1, [eax]
pmulld xmm1, kHashMul0
pmovzxbd(0x50, 0x04) // src[4-7] pmovzxbd xmm2, [eax + 4]
pmulld xmm2, kHashMul1
pmovzxbd(0x58, 0x08) // src[8-11] pmovzxbd xmm3, [eax + 8]
pmulld xmm3, kHashMul2
pmovzxbd(0x60, 0x0c) // src[12-15] pmovzxbd xmm4, [eax + 12]
pmulld xmm4, kHashMul3
lea eax, [eax + 16]
pmulld xmm0, xmm5 // hash *= 33 ^ 16
paddd xmm1, xmm2 // add 16 results
paddd xmm3, xmm4
sub ecx, 16 // sets the flags consumed by jg below
paddd xmm1, xmm3
pshufd xmm2, xmm1, 14 // upper 2 dwords
paddd xmm1, xmm2
pshufd xmm2, xmm1, 1 // dword 1 into lane 0
paddd xmm1, xmm2 // lane 0 now holds the sum of all 16 products
paddd xmm0, xmm1 // hash += weighted bytes (only lane 0 is meaningful)
jg wloop // flags are still those of sub ecx, 16
movd eax, xmm0 // return hash
ret
}
}
#endif
#endif
// hash seed of 5381 recommended.
uint32 HashDjb2(const uint8* src, uint64 count, uint32 seed) {
uint32 (*Hash)(const uint8* src, int count, uint32 seed) = HashDjb2_C;
#if defined(HAS_HASHDJB2_SSE41)
if (TestCpuFlag(kCpuHasSSE41) && IS_ALIGNED(count, 8)) {
Hash = HashDjb2_SSE41;
if (IS_ALIGNED(count, 16)) {
Hash = HashDjb2_Aligned_SSE41;
}
}
#endif
const int kBlockSize = 1 << 15; // 32768;
while (count >= static_cast<uint64>(kBlockSize)) {
seed = HashDjb2_C(src, kBlockSize, seed);
seed = Hash(src, kBlockSize, seed);
src += kBlockSize;
count -= kBlockSize;
}
int remainder = static_cast<int>(count) & ~15;
if (remainder) {
seed = HashDjb2_C(src, remainder, seed);
seed = Hash(src, remainder, seed);
src += remainder;
count -= remainder;
}
......
......@@ -42,10 +42,16 @@ TEST_F(libyuvTest, TestDjb2) {
uint32 h2 = ReferenceHashDjb2(src_a, kMaxTest, 5381);
EXPECT_EQ(h1, h2);
}
int h = 1;
for (int i = 0; i <= 16 ; ++i) {
printf("%d ", h);
h *= 33;
}
free_aligned_buffer_16(src_a)
}
TEST_F(libyuvTest, BenchmakDjb2) {
TEST_F(libyuvTest, BenchmakDjb2_C) {
const int kMaxTest = 1280 * 720;
align_buffer_16(src_a, kMaxTest)
......@@ -53,10 +59,29 @@ TEST_F(libyuvTest, BenchmakDjb2) {
src_a[i] = i;
}
uint32 h2 = ReferenceHashDjb2(src_a, kMaxTest, 5381);
uint32 h1;
MaskCpuFlags(kCpuInitialized);
for (int i = 0; i < _benchmark_iterations; ++i) {
uint32 h1 = HashDjb2(src_a, kMaxTest, 5381);
EXPECT_EQ(h1, h2);
h1 = HashDjb2(src_a, kMaxTest, 5381);
}
MaskCpuFlags(-1);
EXPECT_EQ(h1, h2);
free_aligned_buffer_16(src_a)
}
// Benchmarks HashDjb2 over a 1280x720-byte buffer without masking any CPU
// flags, then checks the last result against ReferenceHashDjb2.
TEST_F(libyuvTest, BenchmakDjb2_OPT) {
  const int kMaxTest = 1280 * 720;
  align_buffer_16(src_a, kMaxTest)
  for (int i = 0; i < kMaxTest; ++i) {
    src_a[i] = i;
  }
  uint32 h2 = ReferenceHashDjb2(src_a, kMaxTest, 5381);
  // Initialized so the comparison below never reads an indeterminate value
  // when _benchmark_iterations is 0; in that case the check fails
  // deterministically instead of invoking undefined behavior.
  uint32 h1 = 0;
  for (int i = 0; i < _benchmark_iterations; ++i) {
    h1 = HashDjb2(src_a, kMaxTest, 5381);
  }
  EXPECT_EQ(h1, h2);
  free_aligned_buffer_16(src_a)
}
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment