Test for unaligned vs. aligned src/dst pointers in CopyRow_SSE2

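Improves performance on older CPUs, where movdqa is faster than movdqu: CopyRow_SSE2 now checks whether both src and dst are 16-byte aligned and, if so, uses a movdqa loop; otherwise it falls back to the existing movdqu loop. TBR=harryjin@google.com BUG=libyuv:492 Review URL: https://codereview.chromium.org/1455463002 .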

Test for unaligned vs. aligned src/dst pointers in CopyRow_SSE2
Improves performance on older CPUs, where movdqa is faster than movdqu. TBR=harryjin@google.com BUG=libyuv:492 Review URL: https://codereview.chromium.org/1455463002 .
0815568a · Frank Barchard · 1019e453 · 0815568a · 0815568a · 0815568a
Commit 0815568a authored Nov 17, 2015 by Frank Barchard
Hide whitespace changes
Inline Side-by-side

Showing with 36 additions and 5 deletions

README.chromium README.chromium +1 -1

version.h include/libyuv/version.h +1 -1

row_gcc.cc source/row_gcc.cc +17 -1

row_win.cc source/row_win.cc +17 -2

No files found.
--- a/README.chromium
+++ b/README.chromium
 Name: libyuv
 URL: http://code.google.com/p/libyuv/
-Version: 1537
+Version: 1538
 License: BSD
 License File: LICENSE

--- a/include/libyuv/version.h
+++ b/include/libyuv/version.h
@@ -11,6 +11,6 @@
 #ifndef INCLUDE_LIBYUV_VERSION_H_  // NOLINT
 #define INCLUDE_LIBYUV_VERSION_H_
-#define LIBYUV_VERSION 1537
+#define LIBYUV_VERSION 1538
 #endif  // INCLUDE_LIBYUV_VERSION_H_  NOLINT
--- a/source/row_gcc.cc
+++ b/source/row_gcc.cc
@@ -2726,8 +2726,23 @@ void MergeUVRow_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
 #ifdef HAS_COPYROW_SSE2
 void CopyRow_SSE2(const uint8* src, uint8* dst, int count) {
  asm volatile (
+    "test       $0xf,%0                        \n"
+    "jne        2f                             \n"
+    "test       $0xf,%1                        \n"
+    "jne        2f                             \n"
    LABELALIGN
  "1:                                          \n"
+    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
+    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
+    "lea       " MEMLEA(0x20,0) ",%0           \n"
+    "movdqa    %%xmm0," MEMACCESS(1) "         \n"
+    "movdqa    %%xmm1," MEMACCESS2(0x10,1) "   \n"
+    "lea       " MEMLEA(0x20,1) ",%1           \n"
+    "sub       $0x20,%2                        \n"
+    "jg        1b                              \n"
+    "jmp       9f                              \n"
+    LABELALIGN
+  "2:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "lea       " MEMLEA(0x20,0) ",%0           \n"
@@ -2735,7 +2750,8 @@ void CopyRow_SSE2(const uint8* src, uint8* dst, int count) {
    "movdqu    %%xmm1," MEMACCESS2(0x10,1) "   \n"
    "lea       " MEMLEA(0x20,1) ",%1           \n"
    "sub       $0x20,%2                        \n"
-    "jg        1b                              \n"
+    "jg        2b                              \n"
+  "9:                                          \n"
  : "+r"(src),   // %0
    "+r"(dst),   // %1
    "+r"(count)  // %2

--- a/source/row_win.cc
+++ b/source/row_win.cc
@@ -3376,8 +3376,23 @@ void CopyRow_SSE2(const uint8* src, uint8* dst, int count) {
    mov        eax, [esp + 4]   // src
    mov        edx, [esp + 8]   // dst
    mov        ecx, [esp + 12]  // count
+    test       eax, 15
+    jne        convertloopu
+    test       edx, 15
+    jne        convertloopu
-  convertloop:
+  convertloopa:
+    movdqa     xmm0, [eax]
+    movdqa     xmm1, [eax + 16]
+    lea        eax, [eax + 32]
+    movdqa     [edx], xmm0
+    movdqa     [edx + 16], xmm1
+    lea        edx, [edx + 32]
+    sub        ecx, 32
+    jg         convertloopa
+    ret
+  convertloopu:
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    lea        eax, [eax + 32]
@@ -3385,7 +3400,7 @@ void CopyRow_SSE2(const uint8* src, uint8* dst, int count) {
    movdqu     [edx + 16], xmm1
    lea        edx, [edx + 32]
    sub        ecx, 32
-    jg         convertloop
+    jg         convertloopu
    ret
  }
 }