unaligned sse4 djb2 cleanup

BUG=none TEST=none Review URL: https://webrtc-codereview.appspot.com/486001 git-svn-id: http://libyuv.googlecode.com/svn/trunk@238 16f28f9a-4ce2-e073-06de-1de4eb20be90

unaligned sse4 djb2 cleanup
BUG=none TEST=none Review URL: https://webrtc-codereview.appspot.com/486001 git-svn-id: http://libyuv.googlecode.com/svn/trunk@238 16f28f9a-4ce2-e073-06de-1de4eb20be90
4d3bd834 · fbarchard@google.com · ddf9051b · 4d3bd834 · 4d3bd834 · 4d3bd834
Commit 4d3bd834 authored Apr 06, 2012 by fbarchard@google.com
8 changed files
--- a/README.chromium
+++ b/README.chromium
 Name: libyuv
 URL: http://code.google.com/p/libyuv/
-Version: 237
+Version: 238
 License: BSD
 License File: LICENSE


--- a/include/libyuv/cpu_id.h
+++ b/include/libyuv/cpu_id.h
@@ -23,10 +23,11 @@ static const int kCpuHasSSSE3 = 4;
 static const int kCpuHasSSE41 = 8;

 // These flags are only valid on ARM processors
-static const int kCpuHasNEON = 16;
+static const int kCpuHasARM = 16;
+static const int kCpuHasNEON = 32;

 // Internal flag to indicate cpuid is initialized.
-static const int kCpuInitialized = 32;
+static const int kCpuInitialized = 64;

 // Detect CPU has SSE2 etc.
 // test_flag parameter should be one of kCpuHas constants above

--- a/include/libyuv/version.h
+++ b/include/libyuv/version.h
@@ -11,7 +11,7 @@
 #ifndef INCLUDE_LIBYUV_VERSION_H_
 #define INCLUDE_LIBYUV_VERSION_H_

-#define LIBYUV_VERSION 237
+#define LIBYUV_VERSION 238

 #endif  // INCLUDE_LIBYUV_VERSION_H_

--- a/source/compare.cc
+++ b/source/compare.cc
@@ -37,117 +37,33 @@ static uint32 HashDjb2_C(const uint8* src, int count, uint32 seed) {

 // This module is for Visual C x86
 #if !defined(YUV_DISABLE_ASM) && defined(_M_IX86)
-
 #define HAS_HASHDJB2_SSE41
-static const vec32 kMulL33 = {
-  0xEC41D4E1, // 33 * 33 * 33 * 33 * 33 * 33 * 33
-  33 * 33 * 33 * 33 * 33 * 33,
-  33 * 33 * 33 * 33 * 33,
-  33 * 33 * 33 * 33 * 1 };
-static const vec32 kMulH33 = {
-  33 * 33 * 33,
-  33 * 33,
-  33,
-  1 };
-static const vec32 kHash4x33 = { 33 * 33 * 33 * 33, 0, 0, 0 };
-static const vec32 kHash8x33 = {
-  0x747C7101, // 33 * 33 * 33 * 33 * 33 * 33 * 33 * 33,
-  0, 0, 0 };
-
-
-// hash0 = initial state
-// hash1 = hash0 * 33 + src[0]
-// hash2 = hash1 * 33 + src[1] = (hash0 * 33 + src[0]) * 33 + src[1]
-// hash3 = hash2 * 33 + src[2] = (hash1 * 33 + src[1]) * 33 + src[2] =
-//   ((hash0 * 33 + src[0]) * 33 + src[1]) * 33 + src[2]
-// hash4 = hash3 * 33 + src[3] = (hash2 * 33 + src[2]) * 33 + src[3] =
-//   ((hash1 * 33 + src[1]) * 33 + src[2]) * 33 + src[3] =
-//   (((hash0 * 33 + src[0]) * 33 + src[1]) * 33 + src[2]) * 33 + src[3]
-
-// movzxbd    xmm1, [eax]     // SSE4.1 requires VS2010
-// pmulld requires Studio2008
-// does 8 at a time, unaligned
-__declspec(naked) __declspec(align(16))
-static uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed) {
-  __asm {
-    mov        eax, [esp + 4]    // src
-    mov        ecx, [esp + 8]    // count
-    movd       xmm0, [esp + 12]  // seed
-    pxor       xmm7, xmm7        // constant 0 for unpck
-    movdqa     xmm4, kHash8x33
-    movdqa     xmm5, kMulL33
-    movdqa     xmm6, kMulH33
-
-    align      16
-  wloop:
-    movq       xmm1, qword ptr [eax] // src[0-7]
-    lea        eax, [eax + 8]
-    punpcklbw  xmm1, xmm7
-    movdqa     xmm3, xmm1
-    punpcklwd  xmm1, xmm7
- // pmulld     xmm1, xmm5
- _asm _emit 0x66 _asm _emit 0x0F _asm _emit 0x38 _asm _emit 0x40 _asm _emit 0xCD
-    punpckhwd  xmm3, xmm7
- // pmulld     xmm3, xmm6
- _asm _emit 0x66 _asm _emit 0x0F _asm _emit 0x38 _asm _emit 0x40 _asm _emit 0xDE
-    sub        ecx, 8
- // pmulld     xmm0, xmm4        // hash *= 33 ^ 8
- _asm _emit 0x66 _asm _emit 0x0F _asm _emit 0x38 _asm _emit 0x40 _asm _emit 0xC4
-    paddd      xmm1, xmm3        // add 2nd 4 to first 4
-    pshufd     xmm2, xmm1, 14    // upper 2 dwords
-    paddd      xmm1, xmm2
-    pshufd     xmm2, xmm1, 1
-    paddd      xmm1, xmm2
-    paddd      xmm0, xmm1
-    jg         wloop
-
-    movd       eax, xmm0        // return hash
-    ret
-  }
-}
-
-#define HAS_HASHDJB2_ALIGNED_SSE41
-static const vec32 kHash16x33 = { -1831214591, 0, 0, 0 };  // 33 ^ 16
-static const vec32 kHashMul0 = {
-  204809697,  // 33 ^ 15
-  -1555599935,  // 33 ^ 14
-  994064801,  // 33 ^ 13
-  1331628417,  // 33 ^ 12
+static const uvec32 kHash16x33 = { 0x92d9e201, 0, 0, 0 };  // 33 ^ 16
+static const uvec32 kHashMul0 = {
+  0x0c3525e1,  // 33 ^ 15
+  0xa3476dc1,  // 33 ^ 14
+  0x3b4039a1,  // 33 ^ 13
+  0x4f5f0981,  // 33 ^ 12
 };
-static const vec32 kHashMul1 = {
-  821255521,  // 33 ^ 11
-  -2057521855,  // 33 ^ 10
-  67801377,  // 33 ^ 9
-  1954312449,  // 33 ^ 8
+static const uvec32 kHashMul1 = {
+  0x30f35d61,  // 33 ^ 11
+  0x855cb541,  // 33 ^ 10
+  0x040a9121,  // 33 ^ 9
+  0x747c7101,  // 33 ^ 8
 };
-static const vec32 kHashMul2 = {
-  -331229983,  // 33 ^ 7
-  1291467969,  // 33 ^ 6
-  39135393,  // 33 ^ 5
-  1185921,  // 33 ^ 4
+static const uvec32 kHashMul2 = {
+  0xec41d4e1,  // 33 ^ 7
+  0x4cfa3cc1,  // 33 ^ 6
+  0x025528a1,  // 33 ^ 5
+  0x00121881,  // 33 ^ 4
 };
-static const vec32 kHashMul3 = {
-  35937,  // 33 ^ 3
-  1089,  // 33 ^ 2
-  33,  // 33 ^ 1
-  1,  // 33 ^ 0
+static const uvec32 kHashMul3 = {
+  0x00008c61,  // 33 ^ 3
+  0x00000441,  // 33 ^ 2
+  0x00000021,  // 33 ^ 1
+  0x00000001,  // 33 ^ 0
 };

-// movzxbd    xmm1, [eax]     // SSE4.1 requires VS2010
-// pmulld requires Studio2008
-// does 16 at a time, aligned
-// TODO(fbarchard): For SSE2 version use pmuludq
-// pmulld     xmm1, xmm5
-// becomes
-// movdqa xmm2, xmm1
-// pmuludq xmm1, [33*33*33, 0, 33, 0]
-// psrldq xmm2, 8
-// pmuludq xmm2, [33*33, 0, 1, 0]
-// paddd xmm1, xmm2
-// pshufd xmm2, xmm1, 2
-// paddd xmm1, xmm2
-
-
 //27: 66 0F 38 40 C6     pmulld      xmm0,xmm6
 //44: 66 0F 38 40 DD     pmulld      xmm3,xmm5
 //59: 66 0F 38 40 E5     pmulld      xmm4,xmm5
@@ -157,7 +73,7 @@ static const vec32 kHashMul3 = {
  _asm _emit 0x40 _asm _emit reg

 __declspec(naked) __declspec(align(16))
-static uint32 HashDjb2_Aligned_SSE41(const uint8* src, int count, uint32 seed) {
+static uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed) {
  __asm {
    mov        eax, [esp + 4]    // src
    mov        ecx, [esp + 8]    // count
@@ -168,7 +84,7 @@ static uint32 HashDjb2_Aligned_SSE41(const uint8* src, int count, uint32 seed) {

    align      16
  wloop:
-    movdqa     xmm1, [eax]       // src[0-15]
+    movdqu     xmm1, [eax]       // src[0-15]
    lea        eax, [eax + 16]
    pmulld(0xc6)                 // pmulld      xmm0,xmm6  hash *= 33 ^ 8
    movdqa     xmm5, kHashMul0
@@ -205,80 +121,26 @@ static uint32 HashDjb2_Aligned_SSE41(const uint8* src, int count, uint32 seed) {
    ret
  }
 }
-
-#if 0
-// This following works but is slower than movdqa version
-// 66 0f 38 31 08       pmovzxbd xmm1, [eax]
-// 66 0f 38 31 50 04    pmovzxbd xmm2, [eax + 4]
-// 66 0f 38 31 58 08    pmovzxbd xmm3, [eax + 8]
-// 66 0f 38 31 60 0c    pmovzxbd xmm4, [eax + 12]
-
-#define pmovzxbd0(rmem) _asm _emit 0x66 _asm _emit 0x0f _asm _emit 0x38 \
-    _asm _emit 0x31 _asm _emit rmem
-#define pmovzxbd(rmem0, rmem1) _asm _emit 0x66 _asm _emit 0x0f _asm _emit 0x38 \
-    _asm _emit 0x31 _asm _emit rmem0 _asm _emit rmem1
-
-__declspec(naked) __declspec(align(16))
-static uint32 HashDjb2_Unaligned_SSE41(const uint8* src, int count,
-                                       uint32 seed) {
-  __asm {
-    mov        eax, [esp + 4]    // src
-    mov        ecx, [esp + 8]    // count
-    movd       xmm0, [esp + 12]  // seed
-
-    movdqa     xmm5, kHash16x33
-
-    align      16
-  wloop:
-    pmovzxbd0(0x08)              // src[0-3] pmovzxbd xmm1, [eax]
-    pmulld     xmm1, kHashMul0
-    pmovzxbd(0x50, 0x04)         // src[4-7] pmovzxbd xmm2, [eax + 4]
-    pmulld     xmm2, kHashMul1
-    pmovzxbd(0x58, 0x08)         // src[8-11] pmovzxbd xmm3, [eax + 8]
-    pmulld     xmm3, kHashMul2
-    pmovzxbd(0x60, 0x0c)         // src[12-15] pmovzxbd xmm4, [eax + 12]
-    pmulld     xmm4, kHashMul3
-    lea        eax, [eax + 16]
-    pmulld     xmm0, xmm5        // hash *= 33 ^ 8
-    paddd      xmm1, xmm2        // add 16 results
-    paddd      xmm3, xmm4
-    sub        ecx, 16
-    paddd      xmm1, xmm3
-    pshufd     xmm2, xmm1, 14    // upper 2 dwords
-    paddd      xmm1, xmm2
-    pshufd     xmm2, xmm1, 1
-    paddd      xmm1, xmm2
-    paddd      xmm0, xmm1
-    jg         wloop
-
-    movd       eax, xmm0        // return hash
-    ret
-  }
-}
-#endif
-
 #endif

 // hash seed of 5381 recommended.
 uint32 HashDjb2(const uint8* src, uint64 count, uint32 seed) {
-  uint32 (*Hash)(const uint8* src, int count, uint32 seed) = HashDjb2_C;
+  uint32 (*HashDjb2_SSE)(const uint8* src, int count, uint32 seed) = HashDjb2_C;
 #if defined(HAS_HASHDJB2_SSE41)
-  if (TestCpuFlag(kCpuHasSSE41) && IS_ALIGNED(count, 8)) {
-    Hash = HashDjb2_SSE41;
-    if (IS_ALIGNED(count, 16)) {
-      Hash = HashDjb2_Aligned_SSE41;
-    }
+  if (TestCpuFlag(kCpuHasSSE41)) {
+    HashDjb2_SSE = HashDjb2_SSE41;
  }
 #endif
+
  const int kBlockSize = 1 << 15;  // 32768;
  while (count >= static_cast<uint64>(kBlockSize)) {
-    seed = Hash(src, kBlockSize, seed);
+    seed = HashDjb2_SSE(src, kBlockSize, seed);
    src += kBlockSize;
    count -= kBlockSize;
  }
  int remainder = static_cast<int>(count) & ~15;
  if (remainder) {
-    seed = Hash(src, remainder, seed);
+    seed = HashDjb2_SSE(src, remainder, seed);
    src += remainder;
    count -= remainder;
  }

--- a/source/cpu_id.cc
+++ b/source/cpu_id.cc
@@ -91,15 +91,18 @@ int InitCpuFlags() {
  if (getenv("LIBYUV_DISABLE_SSE41")) {
    cpu_info_ &= ~kCpuHasSSE41;
  }
+  if (getenv("LIBYUV_DISABLE_ASM")) {
+    cpu_info_ = kCpuInitialized;
+  }
 #elif defined(__linux__) && defined(__ARM_NEON__)
  cpu_info_ = ArmCpuCaps("/proc/cpuinfo") | kCpuInitialized;
 #elif defined(__ARM_NEON__)
  // gcc -mfpu=neon defines __ARM_NEON__
  // Enable Neon if you want support for Neon and Arm, and use MaskCpuFlags
  // to disable Neon on devices that do not have it.
-  cpu_info_ = kCpuHasNEON | kCpuInitialized;
+  cpu_info_ = kCpuHasNEON | kCpuInitialized | kCpuHasARM;
 #else
-  cpu_info_ = kCpuInitialized;
+  cpu_info_ = kCpuInitialized | kCpuHasARM;
 #endif
  return cpu_info_;
 }

--- a/source/row.h
+++ b/source/row.h
@@ -89,13 +89,13 @@ extern "C" {
 typedef __declspec(align(16)) int8 vec8[16];
 typedef __declspec(align(16)) uint8 uvec8[16];
 typedef __declspec(align(16)) int16 vec16[8];
-typedef __declspec(align(16)) int32 vec32[4];
+typedef __declspec(align(16)) uint32 uvec32[4];
 #else  // __GNUC__
 #define SIMD_ALIGNED(var) var __attribute__((aligned(16)))
 typedef int8 __attribute__((vector_size(16))) vec8;
 typedef uint8 __attribute__((vector_size(16))) uvec8;
 typedef int16 __attribute__((vector_size(16))) vec16;
-typedef int32 __attribute__((vector_size(16))) vec32;
+typedef uint32 __attribute__((vector_size(16))) uvec32;
 #endif

 void I420ToARGBRow_NEON(const uint8* y_buf,

--- a/unit_test/compare_test.cc
+++ b/unit_test/compare_test.cc
@@ -42,11 +42,13 @@ TEST_F(libyuvTest, TestDjb2) {
    uint32 h2 = ReferenceHashDjb2(src_a, kMaxTest, 5381);
    EXPECT_EQ(h1, h2);
  }
+  // Hash constant generator used for the tables in compare.cc
  int h = 1;
  for (int i = 0; i <= 16 ; ++i) {
-    printf("%d ", h);
+    printf("%08x ", h);
    h *= 33;
  }
+  printf("\n");

  free_aligned_buffer_16(src_a)
 }
@@ -85,6 +87,22 @@ TEST_F(libyuvTest, BenchmakDjb2_OPT) {
  free_aligned_buffer_16(src_a)
 }

+TEST_F(libyuvTest, BenchmakDjb2_Unaligned_OPT) {
+  const int kMaxTest = 1280 * 720;
+  // Hash from src_a + 1 (presumably off 16-byte alignment) vs the reference.
+  align_buffer_16(src_a, kMaxTest + 1)
+  for (int i = 0; i < kMaxTest; ++i) {
+    src_a[i + 1] = i;
+  }
+  uint32 h2 = ReferenceHashDjb2(src_a + 1, kMaxTest, 5381);
+  uint32 h1 = 0;  // defined value even if the loop below runs zero times
+  for (int i = 0; i < _benchmark_iterations; ++i) {
+    h1 = HashDjb2(src_a + 1, kMaxTest, 5381);
+  }
+  EXPECT_EQ(h1, h2);
+  free_aligned_buffer_16(src_a)
+}
+
 TEST_F(libyuvTest, BenchmarkSumSquareError_C) {
  const int max_width = 4096*3;


--- a/unit_test/cpu_test.cc
+++ b/unit_test/cpu_test.cc
@@ -35,6 +35,10 @@ TEST_F(libyuvTest, TestCpuHas) {
 #if LIBYUV_VERSION >= 236
  int has_sse41 = TestCpuFlag(kCpuHasSSE41);
  printf("Has SSE4.1 %d\n", has_sse41);
+#endif
+#if LIBYUV_VERSION >= 238
+  int has_arm = TestCpuFlag(kCpuHasARM);
+  printf("Has ARM %d\n", has_arm);
 #endif
  int has_neon = TestCpuFlag(kCpuHasNEON);
  printf("Has NEON %d\n", has_neon);