Commit 4d3bd834 authored by fbarchard@google.com

unaligned sse4 djb2 cleanup

BUG=none
TEST=none
Review URL: https://webrtc-codereview.appspot.com/486001

git-svn-id: http://libyuv.googlecode.com/svn/trunk@238 16f28f9a-4ce2-e073-06de-1de4eb20be90
parent ddf9051b
Name: libyuv Name: libyuv
URL: http://code.google.com/p/libyuv/ URL: http://code.google.com/p/libyuv/
Version: 237 Version: 238
License: BSD License: BSD
License File: LICENSE License File: LICENSE
......
...@@ -23,10 +23,11 @@ static const int kCpuHasSSSE3 = 4; ...@@ -23,10 +23,11 @@ static const int kCpuHasSSSE3 = 4;
static const int kCpuHasSSE41 = 8; static const int kCpuHasSSE41 = 8;
// These flags are only valid on ARM processors // These flags are only valid on ARM processors
static const int kCpuHasNEON = 16; static const int kCpuHasARM = 16;
static const int kCpuHasNEON = 32;
// Internal flag to indicate cpuid is initialized. // Internal flag to indicate cpuid is initialized.
static const int kCpuInitialized = 32; static const int kCpuInitialized = 64;
// Detect CPU has SSE2 etc. // Detect CPU has SSE2 etc.
// test_flag parameter should be one of kCpuHas constants above // test_flag parameter should be one of kCpuHas constants above
......
...@@ -11,7 +11,7 @@ ...@@ -11,7 +11,7 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_ #ifndef INCLUDE_LIBYUV_VERSION_H_
#define INCLUDE_LIBYUV_VERSION_H_ #define INCLUDE_LIBYUV_VERSION_H_
#define LIBYUV_VERSION 237 #define LIBYUV_VERSION 238
#endif // INCLUDE_LIBYUV_VERSION_H_ #endif // INCLUDE_LIBYUV_VERSION_H_
...@@ -37,117 +37,33 @@ static uint32 HashDjb2_C(const uint8* src, int count, uint32 seed) { ...@@ -37,117 +37,33 @@ static uint32 HashDjb2_C(const uint8* src, int count, uint32 seed) {
// This module is for Visual C x86 // This module is for Visual C x86
#if !defined(YUV_DISABLE_ASM) && defined(_M_IX86) #if !defined(YUV_DISABLE_ASM) && defined(_M_IX86)
#define HAS_HASHDJB2_SSE41 #define HAS_HASHDJB2_SSE41
static const vec32 kMulL33 = { static const uvec32 kHash16x33 = { 0x92d9e201, 0, 0, 0 }; // 33 ^ 16
0xEC41D4E1, // 33 * 33 * 33 * 33 * 33 * 33 * 33 static const uvec32 kHashMul0 = {
33 * 33 * 33 * 33 * 33 * 33, 0x0c3525e1, // 33 ^ 15
33 * 33 * 33 * 33 * 33, 0xa3476dc1, // 33 ^ 14
33 * 33 * 33 * 33 * 1 }; 0x3b4039a1, // 33 ^ 13
static const vec32 kMulH33 = { 0x4f5f0981, // 33 ^ 12
33 * 33 * 33,
33 * 33,
33,
1 };
static const vec32 kHash4x33 = { 33 * 33 * 33 * 33, 0, 0, 0 };
static const vec32 kHash8x33 = {
0x747C7101, // 33 * 33 * 33 * 33 * 33 * 33 * 33 * 33,
0, 0, 0 };
// hash0 = initial state
// hash1 = hash0 * 33 + src[0]
// hash2 = hash1 * 33 + src[1] = (hash0 * 33 + src[0]) * 33 + src[1]
// hash3 = hash2 * 33 + src[2] = (hash1 * 33 + src[1]) * 33 + src[2] =
// ((hash0 * 33 + src[0]) * 33 + src[1]) * 33 + src[2]
// hash4 = hash3 * 33 + src[3] = (hash2 * 33 + src[2]) * 33 + src[3] =
// ((hash1 * 33 + src[1]) * 33 + src[2]) * 33 + src[3] =
// (((hash0 * 33 + src[0]) * 33 + src[1]) * 33 + src[2]) * 33 + src[3]
// movzxbd xmm1, [eax] // SSE4.1 requires VS2010
// pmulld requires Studio2008
// does 8 at a time, unaligned
// djb2 hash (hash = hash * 33 + byte) over `count` bytes starting at `src`,
// consuming 8 bytes per loop iteration. Returns the updated hash in eax.
//
// The 8 bytes are widened to dwords and weighted by 33^7 .. 33^0
// (kMulL33 / kMulH33), the running hash is multiplied by 33^8 (kHash8x33),
// and the eight products are summed horizontally into the hash.
//
// The function is naked: there is no prologue, so the arguments are read
// directly off the stack relative to esp.
// The loop body runs before count is tested, so it always executes at least
// once. NOTE(review): count is presumably a positive multiple of 8 - the
// dispatcher checks IS_ALIGNED(count, 8) before selecting this routine.
// pmulld is emitted as raw opcode bytes; the commented mnemonic above each
// _emit line shows the instruction being encoded.
__declspec(naked) __declspec(align(16))
static uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed) {
__asm {
mov eax, [esp + 4] // src
mov ecx, [esp + 8] // count
movd xmm0, [esp + 12] // seed: running hash lives in the low dword of xmm0
pxor xmm7, xmm7 // constant 0 for unpck
movdqa xmm4, kHash8x33 // { 33^8, 0, 0, 0 }
movdqa xmm5, kMulL33 // weights for src[0-3]
movdqa xmm6, kMulH33 // weights for src[4-7]
align 16
wloop:
movq xmm1, qword ptr [eax] // src[0-7]
lea eax, [eax + 8]
punpcklbw xmm1, xmm7 // 8 bytes -> 8 words (zero extended)
movdqa xmm3, xmm1
punpcklwd xmm1, xmm7 // low 4 words -> dwords: src[0-3]
// pmulld xmm1, xmm5
_asm _emit 0x66 _asm _emit 0x0F _asm _emit 0x38 _asm _emit 0x40 _asm _emit 0xCD
punpckhwd xmm3, xmm7 // high 4 words -> dwords: src[4-7]
// pmulld xmm3, xmm6
_asm _emit 0x66 _asm _emit 0x0F _asm _emit 0x38 _asm _emit 0x40 _asm _emit 0xDE
sub ecx, 8 // sets the flags consumed by jg at the bottom of the loop
// pmulld xmm0, xmm4 // hash *= 33 ^ 8
// The zero upper lanes of kHash8x33 also clear lanes 1-3 of xmm0, which
// pick up partial sums from the paddd below.
_asm _emit 0x66 _asm _emit 0x0F _asm _emit 0x38 _asm _emit 0x40 _asm _emit 0xC4
paddd xmm1, xmm3 // add 2nd 4 to first 4
pshufd xmm2, xmm1, 14 // upper 2 dwords
paddd xmm1, xmm2
pshufd xmm2, xmm1, 1 // dword 1 -> dword 0
paddd xmm1, xmm2 // lane 0 of xmm1 now holds the sum of all 8 products
paddd xmm0, xmm1
jg wloop // loop while count is still > 0 after the sub above
movd eax, xmm0 // return hash
ret
}
}
#define HAS_HASHDJB2_ALIGNED_SSE41
static const vec32 kHash16x33 = { -1831214591, 0, 0, 0 }; // 33 ^ 16
static const vec32 kHashMul0 = {
204809697, // 33 ^ 15
-1555599935, // 33 ^ 14
994064801, // 33 ^ 13
1331628417, // 33 ^ 12
}; };
static const vec32 kHashMul1 = { static const uvec32 kHashMul1 = {
821255521, // 33 ^ 11 0x30f35d61, // 33 ^ 11
-2057521855, // 33 ^ 10 0x855cb541, // 33 ^ 10
67801377, // 33 ^ 9 0x040a9121, // 33 ^ 9
1954312449, // 33 ^ 8 0x747c7101, // 33 ^ 8
}; };
static const vec32 kHashMul2 = { static const uvec32 kHashMul2 = {
-331229983, // 33 ^ 7 0xec41d4e1, // 33 ^ 7
1291467969, // 33 ^ 6 0x4cfa3cc1, // 33 ^ 6
39135393, // 33 ^ 5 0x025528a1, // 33 ^ 5
1185921, // 33 ^ 4 0x00121881, // 33 ^ 4
}; };
static const vec32 kHashMul3 = { static const uvec32 kHashMul3 = {
35937, // 33 ^ 3 0x00008c61, // 33 ^ 3
1089, // 33 ^ 2 0x00000441, // 33 ^ 2
33, // 33 ^ 1 0x00000021, // 33 ^ 1
1, // 33 ^ 0 0x00000001, // 33 ^ 0
}; };
// movzxbd xmm1, [eax] // SSE4.1 requires VS2010
// pmulld requires Studio2008
// does 16 at a time, aligned
// TODO(fbarchard): For SSE2 version use pmuludq
// pmulld xmm1, xmm5
// becomes
// movdqa xmm2, xmm1
// pmuludq xmm1, [33*33*33, 0, 33, 0]
// psrldq xmm2, 8
// pmuludq xmm2, [33*33, 0, 1, 0]
// paddd xmm1, xmm2
// pshufd xmm2, xmm1, 2
// paddd xmm1, xmm2
//27: 66 0F 38 40 C6 pmulld xmm0,xmm6 //27: 66 0F 38 40 C6 pmulld xmm0,xmm6
//44: 66 0F 38 40 DD pmulld xmm3,xmm5 //44: 66 0F 38 40 DD pmulld xmm3,xmm5
//59: 66 0F 38 40 E5 pmulld xmm4,xmm5 //59: 66 0F 38 40 E5 pmulld xmm4,xmm5
...@@ -157,7 +73,7 @@ static const vec32 kHashMul3 = { ...@@ -157,7 +73,7 @@ static const vec32 kHashMul3 = {
_asm _emit 0x40 _asm _emit reg _asm _emit 0x40 _asm _emit reg
__declspec(naked) __declspec(align(16)) __declspec(naked) __declspec(align(16))
static uint32 HashDjb2_Aligned_SSE41(const uint8* src, int count, uint32 seed) { static uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed) {
__asm { __asm {
mov eax, [esp + 4] // src mov eax, [esp + 4] // src
mov ecx, [esp + 8] // count mov ecx, [esp + 8] // count
...@@ -168,7 +84,7 @@ static uint32 HashDjb2_Aligned_SSE41(const uint8* src, int count, uint32 seed) { ...@@ -168,7 +84,7 @@ static uint32 HashDjb2_Aligned_SSE41(const uint8* src, int count, uint32 seed) {
align 16 align 16
wloop: wloop:
movdqa xmm1, [eax] // src[0-15] movdqu xmm1, [eax] // src[0-15]
lea eax, [eax + 16] lea eax, [eax + 16]
pmulld(0xc6) // pmulld xmm0,xmm6 hash *= 33 ^ 8 pmulld(0xc6) // pmulld xmm0,xmm6 hash *= 33 ^ 8
movdqa xmm5, kHashMul0 movdqa xmm5, kHashMul0
...@@ -205,80 +121,26 @@ static uint32 HashDjb2_Aligned_SSE41(const uint8* src, int count, uint32 seed) { ...@@ -205,80 +121,26 @@ static uint32 HashDjb2_Aligned_SSE41(const uint8* src, int count, uint32 seed) {
ret ret
} }
} }
#if 0
// The following works but is slower than the movdqa version
// 66 0f 38 31 08 pmovzxbd xmm1, [eax]
// 66 0f 38 31 50 04 pmovzxbd xmm2, [eax + 4]
// 66 0f 38 31 58 08 pmovzxbd xmm3, [eax + 8]
// 66 0f 38 31 60 0c pmovzxbd xmm4, [eax + 12]
#define pmovzxbd0(rmem) _asm _emit 0x66 _asm _emit 0x0f _asm _emit 0x38 \
_asm _emit 0x31 _asm _emit rmem
#define pmovzxbd(rmem0, rmem1) _asm _emit 0x66 _asm _emit 0x0f _asm _emit 0x38 \
_asm _emit 0x31 _asm _emit rmem0 _asm _emit rmem1
// djb2 hash (hash = hash * 33 + byte) over `count` bytes starting at `src`,
// consuming 16 bytes per loop iteration. Returns the updated hash in eax.
//
// Each group of 4 source bytes is zero extended straight from memory into
// 4 dwords with pmovzxbd (emitted through the pmovzxbd0 / pmovzxbd macros)
// and multiplied by the matching kHashMul0..kHashMul3 weight table; the
// running hash is multiplied by kHash16x33 and the 16 products are summed
// horizontally into it.
//
// The function is naked: there is no prologue, so the arguments are read
// directly off the stack relative to esp.
// The loop body runs before count is tested, so it always executes at least
// once. NOTE(review): count is presumably a positive multiple of 16 - confirm
// against the caller.
__declspec(naked) __declspec(align(16))
static uint32 HashDjb2_Unaligned_SSE41(const uint8* src, int count,
uint32 seed) {
__asm {
mov eax, [esp + 4] // src
mov ecx, [esp + 8] // count
movd xmm0, [esp + 12] // seed: running hash lives in the low dword of xmm0
movdqa xmm5, kHash16x33 // { 33^16, 0, 0, 0 }
align 16
wloop:
pmovzxbd0(0x08) // src[0-3] pmovzxbd xmm1, [eax]
pmulld xmm1, kHashMul0
pmovzxbd(0x50, 0x04) // src[4-7] pmovzxbd xmm2, [eax + 4]
pmulld xmm2, kHashMul1
pmovzxbd(0x58, 0x08) // src[8-11] pmovzxbd xmm3, [eax + 8]
pmulld xmm3, kHashMul2
pmovzxbd(0x60, 0x0c) // src[12-15] pmovzxbd xmm4, [eax + 12]
pmulld xmm4, kHashMul3
lea eax, [eax + 16]
pmulld xmm0, xmm5 // hash *= 33 ^ 16 (xmm5 holds kHash16x33)
paddd xmm1, xmm2 // add 16 results
paddd xmm3, xmm4
sub ecx, 16 // sets the flags consumed by jg at the bottom of the loop
paddd xmm1, xmm3
pshufd xmm2, xmm1, 14 // upper 2 dwords
paddd xmm1, xmm2
pshufd xmm2, xmm1, 1 // dword 1 -> dword 0
paddd xmm1, xmm2 // lane 0 of xmm1 now holds the sum of all 16 products
paddd xmm0, xmm1
jg wloop // loop while count is still > 0 after the sub above
movd eax, xmm0 // return hash
ret
}
}
#endif
#endif #endif
// hash seed of 5381 recommended. // hash seed of 5381 recommended.
uint32 HashDjb2(const uint8* src, uint64 count, uint32 seed) { uint32 HashDjb2(const uint8* src, uint64 count, uint32 seed) {
uint32 (*Hash)(const uint8* src, int count, uint32 seed) = HashDjb2_C; uint32 (*HashDjb2_SSE)(const uint8* src, int count, uint32 seed) = HashDjb2_C;
#if defined(HAS_HASHDJB2_SSE41) #if defined(HAS_HASHDJB2_SSE41)
if (TestCpuFlag(kCpuHasSSE41) && IS_ALIGNED(count, 8)) { if (TestCpuFlag(kCpuHasSSE41)) {
Hash = HashDjb2_SSE41; HashDjb2_SSE = HashDjb2_SSE41;
if (IS_ALIGNED(count, 16)) {
Hash = HashDjb2_Aligned_SSE41;
}
} }
#endif #endif
const int kBlockSize = 1 << 15; // 32768; const int kBlockSize = 1 << 15; // 32768;
while (count >= static_cast<uint64>(kBlockSize)) { while (count >= static_cast<uint64>(kBlockSize)) {
seed = Hash(src, kBlockSize, seed); seed = HashDjb2_SSE(src, kBlockSize, seed);
src += kBlockSize; src += kBlockSize;
count -= kBlockSize; count -= kBlockSize;
} }
int remainder = static_cast<int>(count) & ~15; int remainder = static_cast<int>(count) & ~15;
if (remainder) { if (remainder) {
seed = Hash(src, remainder, seed); seed = HashDjb2_SSE(src, remainder, seed);
src += remainder; src += remainder;
count -= remainder; count -= remainder;
} }
......
...@@ -91,15 +91,18 @@ int InitCpuFlags() { ...@@ -91,15 +91,18 @@ int InitCpuFlags() {
if (getenv("LIBYUV_DISABLE_SSE41")) { if (getenv("LIBYUV_DISABLE_SSE41")) {
cpu_info_ &= ~kCpuHasSSE41; cpu_info_ &= ~kCpuHasSSE41;
} }
if (getenv("LIBYUV_DISABLE_ASM")) {
cpu_info_ = kCpuInitialized;
}
#elif defined(__linux__) && defined(__ARM_NEON__) #elif defined(__linux__) && defined(__ARM_NEON__)
cpu_info_ = ArmCpuCaps("/proc/cpuinfo") | kCpuInitialized; cpu_info_ = ArmCpuCaps("/proc/cpuinfo") | kCpuInitialized;
#elif defined(__ARM_NEON__) #elif defined(__ARM_NEON__)
// gcc -mfpu=neon defines __ARM_NEON__ // gcc -mfpu=neon defines __ARM_NEON__
// Enable Neon if you want support for Neon and Arm, and use MaskCpuFlags // Enable Neon if you want support for Neon and Arm, and use MaskCpuFlags
// to disable Neon on devices that do not have it. // to disable Neon on devices that do not have it.
cpu_info_ = kCpuHasNEON | kCpuInitialized; cpu_info_ = kCpuHasNEON | kCpuInitialized | kCpuHasARM;
#else #else
cpu_info_ = kCpuInitialized; cpu_info_ = kCpuInitialized | kCpuHasARM;
#endif #endif
return cpu_info_; return cpu_info_;
} }
......
...@@ -89,13 +89,13 @@ extern "C" { ...@@ -89,13 +89,13 @@ extern "C" {
typedef __declspec(align(16)) int8 vec8[16]; typedef __declspec(align(16)) int8 vec8[16];
typedef __declspec(align(16)) uint8 uvec8[16]; typedef __declspec(align(16)) uint8 uvec8[16];
typedef __declspec(align(16)) int16 vec16[8]; typedef __declspec(align(16)) int16 vec16[8];
typedef __declspec(align(16)) int32 vec32[4]; typedef __declspec(align(16)) uint32 uvec32[4];
#else // __GNUC__ #else // __GNUC__
#define SIMD_ALIGNED(var) var __attribute__((aligned(16))) #define SIMD_ALIGNED(var) var __attribute__((aligned(16)))
typedef int8 __attribute__((vector_size(16))) vec8; typedef int8 __attribute__((vector_size(16))) vec8;
typedef uint8 __attribute__((vector_size(16))) uvec8; typedef uint8 __attribute__((vector_size(16))) uvec8;
typedef int16 __attribute__((vector_size(16))) vec16; typedef int16 __attribute__((vector_size(16))) vec16;
typedef int32 __attribute__((vector_size(16))) vec32; typedef uint32 __attribute__((vector_size(16))) uvec32;
#endif #endif
void I420ToARGBRow_NEON(const uint8* y_buf, void I420ToARGBRow_NEON(const uint8* y_buf,
......
...@@ -42,11 +42,13 @@ TEST_F(libyuvTest, TestDjb2) { ...@@ -42,11 +42,13 @@ TEST_F(libyuvTest, TestDjb2) {
uint32 h2 = ReferenceHashDjb2(src_a, kMaxTest, 5381); uint32 h2 = ReferenceHashDjb2(src_a, kMaxTest, 5381);
EXPECT_EQ(h1, h2); EXPECT_EQ(h1, h2);
} }
// Hash constant generator used for the tables in compare
int h = 1; int h = 1;
for (int i = 0; i <= 16 ; ++i) { for (int i = 0; i <= 16 ; ++i) {
printf("%d ", h); printf("%08x ", h);
h *= 33; h *= 33;
} }
printf("\n");
free_aligned_buffer_16(src_a) free_aligned_buffer_16(src_a)
} }
...@@ -85,6 +87,22 @@ TEST_F(libyuvTest, BenchmakDjb2_OPT) { ...@@ -85,6 +87,22 @@ TEST_F(libyuvTest, BenchmakDjb2_OPT) {
free_aligned_buffer_16(src_a) free_aligned_buffer_16(src_a)
} }
// Benchmarks HashDjb2 on a source pointer offset by one byte from the start
// of the allocated buffer, and checks the result against ReferenceHashDjb2.
// NOTE(review): align_buffer_16 presumably returns a 16-byte aligned buffer,
// which would make src_a + 1 misaligned - confirm against the macro.
TEST_F(libyuvTest, BenchmakDjb2_Unaligned_OPT) {
  const int kMaxTest = 1280 * 720;
  // One extra byte so that src_a + 1 still spans kMaxTest bytes.
  align_buffer_16(src_a, kMaxTest + 1)
  for (int i = 0; i < kMaxTest; ++i) {
    src_a[i + 1] = i;
  }
  uint32 h2 = ReferenceHashDjb2(src_a + 1, kMaxTest, 5381);
  // Initialized so the comparison below never reads an indeterminate value
  // when _benchmark_iterations is zero and the loop body does not run.
  uint32 h1 = 0;
  for (int i = 0; i < _benchmark_iterations; ++i) {
    h1 = HashDjb2(src_a + 1, kMaxTest, 5381);
  }
  EXPECT_EQ(h1, h2);
  free_aligned_buffer_16(src_a)
}
TEST_F(libyuvTest, BenchmarkSumSquareError_C) { TEST_F(libyuvTest, BenchmarkSumSquareError_C) {
const int max_width = 4096*3; const int max_width = 4096*3;
......
...@@ -35,6 +35,10 @@ TEST_F(libyuvTest, TestCpuHas) { ...@@ -35,6 +35,10 @@ TEST_F(libyuvTest, TestCpuHas) {
#if LIBYUV_VERSION >= 236 #if LIBYUV_VERSION >= 236
int has_sse41 = TestCpuFlag(kCpuHasSSE41); int has_sse41 = TestCpuFlag(kCpuHasSSE41);
printf("Has SSE4.1 %d\n", has_sse41); printf("Has SSE4.1 %d\n", has_sse41);
#endif
#if LIBYUV_VERSION >= 238
int has_arm = TestCpuFlag(kCpuHasARM);
printf("Has ARM %d\n", has_arm);
#endif #endif
int has_neon = TestCpuFlag(kCpuHasNEON); int has_neon = TestCpuFlag(kCpuHasNEON);
printf("Has NEON %d\n", has_neon); printf("Has NEON %d\n", has_neon);
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment