Commit 0815568a authored by Frank Barchard

Test for unaligned vs. aligned pointers in CopyRow_SSE2.

Improves performance on older CPUs, where movdqa is faster than movdqu.
TBR=harryjin@google.com
BUG=libyuv:492

Review URL: https://codereview.chromium.org/1455463002 .
parent 1019e453
Name: libyuv Name: libyuv
URL: http://code.google.com/p/libyuv/ URL: http://code.google.com/p/libyuv/
Version: 1537 Version: 1538
License: BSD License: BSD
License File: LICENSE License File: LICENSE
......
...@@ -11,6 +11,6 @@ ...@@ -11,6 +11,6 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT #ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT
#define INCLUDE_LIBYUV_VERSION_H_ #define INCLUDE_LIBYUV_VERSION_H_
#define LIBYUV_VERSION 1537 #define LIBYUV_VERSION 1538
#endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT #endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT
...@@ -2726,8 +2726,23 @@ void MergeUVRow_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv, ...@@ -2726,8 +2726,23 @@ void MergeUVRow_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
#ifdef HAS_COPYROW_SSE2 #ifdef HAS_COPYROW_SSE2
void CopyRow_SSE2(const uint8* src, uint8* dst, int count) { void CopyRow_SSE2(const uint8* src, uint8* dst, int count) {
asm volatile ( asm volatile (
"test $0xf,%0 \n"
"jne 2f \n"
"test $0xf,%1 \n"
"jne 2f \n"
LABELALIGN LABELALIGN
"1: \n" "1: \n"
"movdqa " MEMACCESS(0) ",%%xmm0 \n"
"movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
"lea " MEMLEA(0x20,0) ",%0 \n"
"movdqa %%xmm0," MEMACCESS(1) " \n"
"movdqa %%xmm1," MEMACCESS2(0x10,1) " \n"
"lea " MEMLEA(0x20,1) ",%1 \n"
"sub $0x20,%2 \n"
"jg 1b \n"
"jmp 9f \n"
LABELALIGN
"2: \n"
"movdqu " MEMACCESS(0) ",%%xmm0 \n" "movdqu " MEMACCESS(0) ",%%xmm0 \n"
"movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
"lea " MEMLEA(0x20,0) ",%0 \n" "lea " MEMLEA(0x20,0) ",%0 \n"
...@@ -2735,7 +2750,8 @@ void CopyRow_SSE2(const uint8* src, uint8* dst, int count) { ...@@ -2735,7 +2750,8 @@ void CopyRow_SSE2(const uint8* src, uint8* dst, int count) {
"movdqu %%xmm1," MEMACCESS2(0x10,1) " \n" "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n"
"lea " MEMLEA(0x20,1) ",%1 \n" "lea " MEMLEA(0x20,1) ",%1 \n"
"sub $0x20,%2 \n" "sub $0x20,%2 \n"
"jg 1b \n" "jg 2b \n"
"9: \n"
: "+r"(src), // %0 : "+r"(src), // %0
"+r"(dst), // %1 "+r"(dst), // %1
"+r"(count) // %2 "+r"(count) // %2
......
...@@ -3376,8 +3376,23 @@ void CopyRow_SSE2(const uint8* src, uint8* dst, int count) { ...@@ -3376,8 +3376,23 @@ void CopyRow_SSE2(const uint8* src, uint8* dst, int count) {
mov eax, [esp + 4] // src mov eax, [esp + 4] // src
mov edx, [esp + 8] // dst mov edx, [esp + 8] // dst
mov ecx, [esp + 12] // count mov ecx, [esp + 12] // count
test eax, 15
jne convertloopu
test edx, 15
jne convertloopu
convertloop: convertloopa:
movdqa xmm0, [eax]
movdqa xmm1, [eax + 16]
lea eax, [eax + 32]
movdqa [edx], xmm0
movdqa [edx + 16], xmm1
lea edx, [edx + 32]
sub ecx, 32
jg convertloopa
ret
convertloopu:
movdqu xmm0, [eax] movdqu xmm0, [eax]
movdqu xmm1, [eax + 16] movdqu xmm1, [eax + 16]
lea eax, [eax + 32] lea eax, [eax + 32]
...@@ -3385,7 +3400,7 @@ void CopyRow_SSE2(const uint8* src, uint8* dst, int count) { ...@@ -3385,7 +3400,7 @@ void CopyRow_SSE2(const uint8* src, uint8* dst, int count) {
movdqu [edx + 16], xmm1 movdqu [edx + 16], xmm1
lea edx, [edx + 32] lea edx, [edx + 32]
sub ecx, 32 sub ecx, 32
jg convertloop jg convertloopu
ret ret
} }
} }
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment