Commit 823548cb authored by fbarchard@google.com's avatar fbarchard@google.com

AVX2 hash using vex128 as first step.

BUG=none
TEST=BenchmarkDjb2_Opt
R=ryanpetrie@google.com

Review URL: https://webrtc-codereview.appspot.com/2219004

git-svn-id: http://libyuv.googlecode.com/svn/trunk@792 16f28f9a-4ce2-e073-06de-1de4eb20be90
parent a1ab1945
Name: libyuv
URL: http://code.google.com/p/libyuv/
Version: 791
Version: 792
License: BSD
License File: LICENSE
......
......@@ -11,6 +11,6 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT
#define INCLUDE_LIBYUV_VERSION_H_
#define LIBYUV_VERSION 791
#define LIBYUV_VERSION 792
#endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT
......@@ -10,6 +10,8 @@
#include "libyuv/compare.h"
#include <stdio.h> // printf
#include <float.h>
#include <math.h>
#ifdef _OPENMP
......@@ -34,9 +36,13 @@ uint32 HashDjb2_C(const uint8* src, int count, uint32 seed);
(defined(_M_IX86) || \
(defined(__x86_64__) || (defined(__i386__) && !defined(__pic__))))
#define HAS_HASHDJB2_SSE41
uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed);
#if _MSC_VER >= 1700
#define HAS_HASHDJB2_AVX2
uint32 HashDjb2_AVX2(const uint8* src, int count, uint32 seed);
#endif
#endif // HAS_HASHDJB2_SSE41
// hash seed of 5381 recommended.
......@@ -48,6 +54,11 @@ uint32 HashDjb2(const uint8* src, uint64 count, uint32 seed) {
HashDjb2_SSE = HashDjb2_SSE41;
}
#endif
#if defined(HAS_HASHDJB2_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
HashDjb2_SSE = HashDjb2_AVX2;
}
#endif
const int kBlockSize = 1 << 15; // 32768;
while (count >= static_cast<uint64>(kBlockSize)) {
......
......@@ -184,6 +184,46 @@ uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed) {
ret
}
}
// Visual C 2012 required for AVX2.
#if _MSC_VER >= 1700
__declspec(naked) __declspec(align(16))
uint32 HashDjb2_AVX2(const uint8* src, int count, uint32 seed) {
__asm {
mov eax, [esp + 4] // src
mov ecx, [esp + 8] // count
movd xmm0, [esp + 12] // seed
movdqa xmm6, kHash16x33
align 16
wloop:
vpmovzxbd xmm3, dword ptr [eax] // src[0-3]
pmulld xmm0, xmm6 // hash *= 33 ^ 16
vpmovzxbd xmm4, dword ptr [eax + 4] // src[4-7]
pmulld xmm3, kHashMul0
vpmovzxbd xmm2, dword ptr [eax + 8] // src[8-11]
pmulld xmm4, kHashMul1
vpmovzxbd xmm1, dword ptr [eax + 12] // src[12-15]
pmulld xmm2, kHashMul2
lea eax, [eax + 16]
pmulld xmm1, kHashMul3
paddd xmm3, xmm4 // add 16 results
paddd xmm1, xmm2
sub ecx, 16
paddd xmm1, xmm3
pshufd xmm2, xmm1, 0x0e // upper 2 dwords
paddd xmm1, xmm2
pshufd xmm2, xmm1, 0x01
paddd xmm1, xmm2
paddd xmm0, xmm1
jg wloop
movd eax, xmm0 // return hash
ret
}
}
#endif // _MSC_VER >= 1700
#endif // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)
#ifdef __cplusplus
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment