Commit 5822505e authored by fbarchard@google.com

Remove extra unaligned loop from alphablender.  Both aligned and unaligned loops were the same, so remove the extra.
BUG=none
TESTED=try bots.
R=brucedawson@google.com, harryjin@google.com

Review URL: https://webrtc-codereview.appspot.com/29059004

git-svn-id: http://libyuv.googlecode.com/svn/trunk@1166 16f28f9a-4ce2-e073-06de-1de4eb20be90
parent 1eb636d2
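
Why the removal is safe: the deleted path tested the low four bits of each source pointer (test $0xf / test eax, 15) and branched to a second "unaligned" loop, yet both loop bodies used unaligned movdqu loads and stores throughout, so the aligned and unaligned paths did byte-for-byte the same work. A minimal C sketch of the idea, using SSE2 intrinsics (hypothetical helper, not libyuv API; assumes width is a multiple of 16):

#include <stdint.h>
#include <emmintrin.h>  // SSE2: _mm_loadu_si128 / _mm_storeu_si128 (movdqu)

// With unaligned loads/stores, one loop serves aligned and unaligned
// pointers alike; no "test eax, 15 / jne unaligned_loop" dispatch needed.
void CopyRow_Any(const uint8_t* src, uint8_t* dst, int width) {
  for (int i = 0; i < width; i += 16) {
    __m128i v = _mm_loadu_si128((const __m128i*)(src + i));  // movdqu load
    _mm_storeu_si128((__m128i*)(dst + i), v);                // movdqu store
  }
}

On recent x86 cores, movdqu on a 16-byte-aligned address performs like movdqa, so keeping only the unaligned loop loses nothing.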
 Name: libyuv
 URL: http://code.google.com/p/libyuv/
-Version: 1164
+Version: 1165
 License: BSD
 License File: LICENSE
@@ -207,12 +207,12 @@ extern "C" {
 #define HAS_ARGBMULTIPLYROW_AVX2
 #define HAS_ARGBATTENUATEROW_AVX2
 #define HAS_ARGBUNATTENUATEROW_AVX2
+#define HAS_ARGBMIRRORROW_AVX2
 #endif
 // The following are require VS2012.
 // TODO(fbarchard): Port to gcc.
 #if !defined(LIBYUV_DISABLE_X86) && defined(VISUALC_HAS_AVX2)
-#define HAS_ARGBMIRRORROW_AVX2
 #define HAS_ARGBTOUVROW_AVX2
 #define HAS_ARGBTOYJROW_AVX2
 #define HAS_ARGBTOYROW_AVX2
@@ -11,6 +11,6 @@
 #ifndef INCLUDE_LIBYUV_VERSION_H_  // NOLINT
 #define INCLUDE_LIBYUV_VERSION_H_
-#define LIBYUV_VERSION 1164
+#define LIBYUV_VERSION 1165
 #endif  // INCLUDE_LIBYUV_VERSION_H_  NOLINT
@@ -2183,10 +2183,9 @@ void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
   intptr_t temp_width = (intptr_t)(width);
   asm volatile (
     "movdqa    %3,%%xmm5                       \n"
-    "lea       " MEMLEA(-0x10,0) ",%0          \n"
     LABELALIGN
   "1:                                          \n"
-    MEMOPREG(movdqu,0x00,0,2,1,xmm0)           //  movdqu  (%0,%2),%%xmm0
+    MEMOPREG(movdqu,-0x10,0,2,1,xmm0)          //  movdqu  -0x10(%0,%2),%%xmm0
     "pshufb    %%xmm5,%%xmm0                   \n"
     "sub       $0x10,%2                        \n"
     "movdqu    %%xmm0," MEMACCESS(1) "         \n"
@@ -3378,10 +3377,6 @@ void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
   "19:                                         \n"
     "add       $1-4,%3                         \n"
     "jl        49f                             \n"
-    "test      $0xf,%0                         \n"
-    "jne       41f                             \n"
-    "test      $0xf,%1                         \n"
-    "jne       41f                             \n"
 
     // 4 pixel loop.
     LABELALIGN
@@ -3408,33 +3403,6 @@ void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
     "movdqu    %%xmm0," MEMACCESS(2) "         \n"
     "lea       " MEMLEA(0x10,2) ",%2           \n"
     "jge       40b                             \n"
-    "jmp       49f                             \n"
-
-    // 4 pixel loop.
-    LABELALIGN
-  "41:                                         \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm3         \n"
-    "lea       " MEMLEA(0x10,0) ",%0           \n"
-    "movdqa    %%xmm3,%%xmm0                   \n"
-    "pxor      %%xmm4,%%xmm3                   \n"
-    "movdqu    " MEMACCESS(1) ",%%xmm2         \n"
-    "pshufb    %4,%%xmm3                       \n"
-    "pand      %%xmm6,%%xmm2                   \n"
-    "paddw     %%xmm7,%%xmm3                   \n"
-    "pmullw    %%xmm3,%%xmm2                   \n"
-    "movdqu    " MEMACCESS(1) ",%%xmm1         \n"
-    "lea       " MEMLEA(0x10,1) ",%1           \n"
-    "psrlw     $0x8,%%xmm1                     \n"
-    "por       %%xmm4,%%xmm0                   \n"
-    "pmullw    %%xmm3,%%xmm1                   \n"
-    "psrlw     $0x8,%%xmm2                     \n"
-    "paddusb   %%xmm2,%%xmm0                   \n"
-    "pand      %%xmm5,%%xmm1                   \n"
-    "paddusb   %%xmm1,%%xmm0                   \n"
-    "sub       $0x4,%3                         \n"
-    "movdqu    %%xmm0," MEMACCESS(2) "         \n"
-    "lea       " MEMLEA(0x10,2) ",%2           \n"
-    "jge       41b                             \n"
 
   "49:                                         \n"
     "add       $0x3,%3                         \n"
@@ -2392,7 +2392,6 @@ void YToARGBRow_SSE2(const uint8* y_buf,
 }
 #endif  // HAS_YTOARGBROW_SSE2
 
 #ifdef HAS_MIRRORROW_SSSE3
-
 // Shuffle table for reversing the bytes.
 static const uvec8 kShuffleMirror = {
@@ -2432,7 +2431,7 @@ void MirrorRow_AVX2(const uint8* src, uint8* dst, int width) {
     align      4
   convertloop:
-    vmovdqu    ymm0, [eax - 32 + ecx]
+    vmovdqu    ymm0, -32[eax + ecx]
     vpshufb    ymm0, ymm0, ymm5
     vpermq     ymm0, ymm0, 0x4e  // swap high and low halfs
     sub        ecx, 32
@@ -2455,7 +2454,7 @@ void MirrorRow_SSE2(const uint8* src, uint8* dst, int width) {
     align      4
   convertloop:
-    movdqu     xmm0, [eax - 16 + ecx]
+    movdqu     xmm0, -16[eax + ecx]
     movdqa     xmm1, xmm0  // swap bytes
     psllw      xmm0, 8
     psrlw      xmm1, 8
@@ -2553,7 +2552,7 @@ void ARGBMirrorRow_AVX2(const uint8* src, uint8* dst, int width) {
     align      4
   convertloop:
-    vpermd     ymm0, ymm5, [eax - 32 + ecx * 4]  // permute dword order
+    vpermd     ymm0, ymm5, -32[eax + ecx * 4]  // permute dword order
     sub        ecx, 8
     vmovdqu    [edx], ymm0
     lea        edx, [edx + 32]
@@ -3608,11 +3607,6 @@ void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
     add        ecx, 1 - 4
     jl         convertloop4b
-    test       eax, 15  // unaligned?
-    jne        convertuloop4
-    test       esi, 15  // unaligned?
-    jne        convertuloop4
 
     // 4 pixel loop.
   convertloop4:
     movdqu     xmm3, [eax]  // src argb
@@ -3637,32 +3631,6 @@ void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
     movdqu     [edx], xmm0
     lea        edx, [edx + 16]
     jge        convertloop4
-    jmp        convertloop4b
-
-    // 4 pixel unaligned loop.
-  convertuloop4:
-    movdqu     xmm3, [eax]  // src argb
-    lea        eax, [eax + 16]
-    movdqa     xmm0, xmm3  // src argb
-    pxor       xmm3, xmm4  // ~alpha
-    movdqu     xmm2, [esi]  // _r_b
-    pshufb     xmm3, kShuffleAlpha  // alpha
-    pand       xmm2, xmm6  // _r_b
-    paddw      xmm3, xmm7  // 256 - alpha
-    pmullw     xmm2, xmm3  // _r_b * alpha
-    movdqu     xmm1, [esi]  // _a_g
-    lea        esi, [esi + 16]
-    psrlw      xmm1, 8  // _a_g
-    por        xmm0, xmm4  // set alpha to 255
-    pmullw     xmm1, xmm3  // _a_g * alpha
-    psrlw      xmm2, 8  // _r_b convert to 8 bits again
-    paddusb    xmm0, xmm2  // + src argb
-    pand       xmm1, xmm5  // a_g_ convert to 8 bits again
-    paddusb    xmm0, xmm1  // + src argb
-    sub        ecx, 4
-    movdqu     [edx], xmm0
-    lea        edx, [edx + 16]
-    jge        convertuloop4
 
   convertloop4b:
     add        ecx, 4 - 1