Remove alignment constraint for SSE2. Allows the optimized function to be used…

Remove the alignment constraint for SSE2. This allows the optimized function to be used with unaligned memory, improving performance in that use case. It hurts performance on Core 2 and earlier CPUs, where aligned loads with the movdqa instruction were faster than movdqu. BUG=365 TESTED=psnr, ssim and djb2 unittests pass. R=tpsiaki@google.com Review URL: https://webrtc-codereview.appspot.com/22859004 git-svn-id: http://libyuv.googlecode.com/svn/trunk@1100 16f28f9a-4ce2-e073-06de-1de4eb20be90

Remove alignment constraint for SSE2. Allows the optimized function to be used…
Remove the alignment constraint for SSE2. This allows the optimized function to be used with unaligned memory, improving performance in that use case. It hurts performance on Core 2 and earlier CPUs, where aligned loads with the movdqa instruction were faster than movdqu. BUG=365 TESTED=psnr, ssim and djb2 unittests pass. R=tpsiaki@google.com Review URL: https://webrtc-codereview.appspot.com/22859004 git-svn-id: http://libyuv.googlecode.com/svn/trunk@1100 16f28f9a-4ce2-e073-06de-1de4eb20be90
9c4c8218 · fbarchard@google.com · bb5cc129 · 9c4c8218 · 9c4c8218 · 9c4c8218
Commit 9c4c8218 authored Sep 30, 2014 by fbarchard@google.com
Showing with 31 additions and 6 deletions

compare.cc source/compare.cc +1 -2

compare_posix.cc source/compare_posix.cc +2 -2

compare_win.cc source/compare_win.cc +2 -2

compare_test.cc unit_test/compare_test.cc +26 -0

No files found.
--- a/source/compare.cc
+++ b/source/compare.cc
@@ -114,8 +114,7 @@ uint64 ComputeSumSquareError(const uint8* src_a, const uint8* src_b,
  }
 #endif
 #if defined(HAS_SUMSQUAREERROR_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2) &&
-      IS_ALIGNED(src_a, 16) && IS_ALIGNED(src_b, 16)) {
+  if (TestCpuFlag(kCpuHasSSE2)) {
    // Note only used for multiples of 16 so count is not checked.
    SumSquareError = SumSquareError_SSE2;
  }

--- a/source/compare_posix.cc
+++ b/source/compare_posix.cc
@@ -25,9 +25,9 @@ uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count) {
    "pxor      %%xmm5,%%xmm5                   \n"
    LABELALIGN
  "1:                                          \n"
-    "movdqa    " MEMACCESS(0) ",%%xmm1         \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm1         \n"
    "lea       " MEMLEA(0x10, 0) ",%0          \n"
-    "movdqa    " MEMACCESS(1) ",%%xmm2         \n"
+    "movdqu    " MEMACCESS(1) ",%%xmm2         \n"
    "lea       " MEMLEA(0x10, 1) ",%1          \n"
    "sub       $0x10,%2                        \n"
    "movdqa    %%xmm1,%%xmm3                   \n"

--- a/source/compare_win.cc
+++ b/source/compare_win.cc
@@ -29,9 +29,9 @@ uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count) {

    align      4
  wloop:
-    movdqa     xmm1, [eax]
+    movdqu     xmm1, [eax]
    lea        eax,  [eax + 16]
-    movdqa     xmm2, [edx]
+    movdqu     xmm2, [edx]
    lea        edx,  [edx + 16]
    sub        ecx, 16
    movdqa     xmm3, xmm1  // abs trick

--- a/unit_test/compare_test.cc
+++ b/unit_test/compare_test.cc
@@ -244,6 +244,32 @@ TEST_F(libyuvTest, BenchmarkPsnr_Opt) {
  free_aligned_buffer_64(src_b);
 }

+
+TEST_F(libyuvTest, BenchmarkPsnr_Unaligned) {
+  // src_a has one extra byte so the src_a + 1 plane used below stays in bounds.
+  align_buffer_64(src_a, benchmark_width_ * benchmark_height_ + 1);
+  align_buffer_64(src_b, benchmark_width_ * benchmark_height_);
+  for (int i = 0; i < benchmark_width_ * benchmark_height_; ++i) {
+    src_a[i + 1] = i;
+    src_b[i] = i;
+  }
+
+  MaskCpuFlags(-1);
+  // Time CalcFramePsnr with the first plane offset by one byte from src_a.
+  double opt_time = get_time();
+  for (int i = 0; i < benchmark_iterations_; ++i)
+    CalcFramePsnr(src_a + 1, benchmark_width_,
+                  src_b, benchmark_width_,
+                  benchmark_width_, benchmark_height_);
+
+  opt_time = (get_time() - opt_time) / benchmark_iterations_;
+  printf("BenchmarkPsnr_Unaligned - %8.2f us opt\n", opt_time * 1e6);
+
+  EXPECT_EQ(0, 0);
+  free_aligned_buffer_64(src_a);
+  free_aligned_buffer_64(src_b);
+}
+
 TEST_F(libyuvTest, Psnr) {
  const int kSrcWidth = benchmark_width_;
  const int kSrcHeight = benchmark_height_;