Commit 794fe123 authored by fbarchard@google.com

alpha blend 4 pixel loop bug fix and blender C code match SSE for better…

Fix a bug in the alpha blend 4 pixel loop, and make the blender C code match the SSE code for better testability and to serve as reference code for future optimized code.
BUG=none
TEST=none
Review URL: https://webrtc-codereview.appspot.com/645008

git-svn-id: http://libyuv.googlecode.com/svn/trunk@287 16f28f9a-4ce2-e073-06de-1de4eb20be90
parent ee220888
Name: libyuv Name: libyuv
URL: http://code.google.com/p/libyuv/ URL: http://code.google.com/p/libyuv/
Version: 286 Version: 287
License: BSD License: BSD
License File: LICENSE License File: LICENSE
......
...@@ -11,7 +11,7 @@ ...@@ -11,7 +11,7 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_ #ifndef INCLUDE_LIBYUV_VERSION_H_
#define INCLUDE_LIBYUV_VERSION_H_ #define INCLUDE_LIBYUV_VERSION_H_
#define LIBYUV_VERSION 286 #define LIBYUV_VERSION 287
#endif // INCLUDE_LIBYUV_VERSION_H_ #endif // INCLUDE_LIBYUV_VERSION_H_
This diff is collapsed.
...@@ -2375,60 +2375,39 @@ void ARGBBlendRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, ...@@ -2375,60 +2375,39 @@ void ARGBBlendRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
"add $1-4,%3 \n" "add $1-4,%3 \n"
"jl 49f \n" "jl 49f \n"
// 8 pixel loop. // 4 pixel loop.
".p2align 2 \n" ".p2align 2 \n"
"41: \n" "41: \n"
"movdqu (%0),%%xmm3 \n" "movdqu (%0),%%xmm3 \n"
"lea 0x10(%0),%0 \n"
"movdqa %%xmm3,%%xmm0 \n" "movdqa %%xmm3,%%xmm0 \n"
"pxor %%xmm4,%%xmm3 \n" "pxor %%xmm4,%%xmm3 \n"
"movdqu (%1),%%xmm2 \n"
"psrlw $0x8,%%xmm3 \n" "psrlw $0x8,%%xmm3 \n"
"pshufhw $0xf5,%%xmm3,%%xmm3 \n" "pshufhw $0xf5,%%xmm3,%%xmm3 \n"
"pshuflw $0xf5,%%xmm3,%%xmm3 \n" "pshuflw $0xf5,%%xmm3,%%xmm3 \n"
"movdqu (%1),%%xmm2 \n"
"pand %%xmm6,%%xmm2 \n" "pand %%xmm6,%%xmm2 \n"
"paddw %%xmm7,%%xmm3 \n" "paddw %%xmm7,%%xmm3 \n"
"pmullw %%xmm3,%%xmm2 \n" "pmullw %%xmm3,%%xmm2 \n"
"movdqu (%1),%%xmm1 \n" "movdqu (%1),%%xmm1 \n"
"lea 0x10(%1),%1 \n"
"psrlw $0x8,%%xmm1 \n" "psrlw $0x8,%%xmm1 \n"
"por %%xmm4,%%xmm0 \n" "por %%xmm4,%%xmm0 \n"
"pmullw %%xmm3,%%xmm1 \n" "pmullw %%xmm3,%%xmm1 \n"
"movdqu 0x10(%0),%%xmm3 \n"
"lea 0x20(%0),%0 \n"
"psrlw $0x8,%%xmm2 \n" "psrlw $0x8,%%xmm2 \n"
"paddusb %%xmm2,%%xmm0 \n" "paddusb %%xmm2,%%xmm0 \n"
"pand %%xmm5,%%xmm1 \n" "pand %%xmm5,%%xmm1 \n"
"paddusb %%xmm1,%%xmm0 \n" "paddusb %%xmm1,%%xmm0 \n"
"sub $0x4,%3 \n" "sub $0x4,%3 \n"
"movdqa %%xmm0,(%2) \n" "movdqa %%xmm0,(%2) \n"
"jl 49f \n" "lea 0x10(%2),%2 \n"
"movdqa %%xmm3,%%xmm0 \n"
"pxor %%xmm4,%%xmm3 \n"
"movdqu 0x10(%1),%%xmm2 \n"
"psrlw $0x8,%%xmm3 \n"
"pshufhw $0xf5,%%xmm3,%%xmm3 \n"
"pshuflw $0xf5,%%xmm3,%%xmm3 \n"
"pand %%xmm6,%%xmm2 \n"
"paddw %%xmm7,%%xmm3 \n"
"pmullw %%xmm3,%%xmm2 \n"
"movdqu 0x10(%1),%%xmm1 \n"
"lea 0x20(%1),%1 \n"
"psrlw $0x8,%%xmm1 \n"
"por %%xmm4,%%xmm0 \n"
"pmullw %%xmm3,%%xmm1 \n"
"psrlw $0x8,%%xmm2 \n"
"paddusb %%xmm2,%%xmm0 \n"
"pand %%xmm5,%%xmm1 \n"
"paddusb %%xmm1,%%xmm0 \n"
"sub $0x4,%3 \n"
"movdqa %%xmm0,0x10(%2) \n"
"lea 0x20(%2),%2 \n"
"jge 41b \n" "jge 41b \n"
"49: \n" "49: \n"
"add $0x3,%3 \n" "add $0x3,%3 \n"
"jl 99f \n" "jl 99f \n"
// 1 pixel loop. // 1 pixel loop.
"91: \n" "91: \n"
"movd (%0),%%xmm3 \n" "movd (%0),%%xmm3 \n"
"lea 0x4(%0),%0 \n" "lea 0x4(%0),%0 \n"
...@@ -2531,56 +2510,37 @@ void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1, ...@@ -2531,56 +2510,37 @@ void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
"add $1-4,%3 \n" "add $1-4,%3 \n"
"jl 49f \n" "jl 49f \n"
// 8 pixel loop. // 4 pixel loop.
".p2align 2 \n" ".p2align 2 \n"
"41: \n" "41: \n"
"movdqu (%0),%%xmm3 \n" "movdqu (%0),%%xmm3 \n"
"lea 0x10(%0),%0 \n"
"movdqa %%xmm3,%%xmm0 \n" "movdqa %%xmm3,%%xmm0 \n"
"pxor %%xmm4,%%xmm3 \n" "pxor %%xmm4,%%xmm3 \n"
"pshufb %4,%%xmm3 \n"
"movdqu (%1),%%xmm2 \n" "movdqu (%1),%%xmm2 \n"
"pshufb %4,%%xmm3 \n"
"pand %%xmm6,%%xmm2 \n" "pand %%xmm6,%%xmm2 \n"
"paddw %%xmm7,%%xmm3 \n" "paddw %%xmm7,%%xmm3 \n"
"pmullw %%xmm3,%%xmm2 \n" "pmullw %%xmm3,%%xmm2 \n"
"movdqu (%1),%%xmm1 \n" "movdqu (%1),%%xmm1 \n"
"lea 0x10(%1),%1 \n"
"psrlw $0x8,%%xmm1 \n" "psrlw $0x8,%%xmm1 \n"
"por %%xmm4,%%xmm0 \n" "por %%xmm4,%%xmm0 \n"
"pmullw %%xmm3,%%xmm1 \n" "pmullw %%xmm3,%%xmm1 \n"
"movdqu 0x10(%0),%%xmm3 \n"
"lea 0x20(%0),%0 \n"
"psrlw $0x8,%%xmm2 \n" "psrlw $0x8,%%xmm2 \n"
"paddusb %%xmm2,%%xmm0 \n" "paddusb %%xmm2,%%xmm0 \n"
"pand %%xmm5,%%xmm1 \n" "pand %%xmm5,%%xmm1 \n"
"paddusb %%xmm1,%%xmm0 \n" "paddusb %%xmm1,%%xmm0 \n"
"sub $0x4,%3 \n" "sub $0x4,%3 \n"
"movdqa %%xmm0,(%2) \n" "movdqa %%xmm0,(%2) \n"
"jl 49f \n" "lea 0x10(%2),%2 \n"
"movdqa %%xmm3,%%xmm0 \n"
"pxor %%xmm4,%%xmm3 \n"
"movdqu 0x10(%1),%%xmm2 \n"
"pshufb %4,%%xmm3 \n"
"pand %%xmm6,%%xmm2 \n"
"paddw %%xmm7,%%xmm3 \n"
"pmullw %%xmm3,%%xmm2 \n"
"movdqu 0x10(%1),%%xmm1 \n"
"lea 0x20(%1),%1 \n"
"psrlw $0x8,%%xmm1 \n"
"por %%xmm4,%%xmm0 \n"
"pmullw %%xmm3,%%xmm1 \n"
"psrlw $0x8,%%xmm2 \n"
"paddusb %%xmm2,%%xmm0 \n"
"pand %%xmm5,%%xmm1 \n"
"paddusb %%xmm1,%%xmm0 \n"
"sub $0x4,%3 \n"
"movdqa %%xmm0,0x10(%2) \n"
"lea 0x20(%2),%2 \n"
"jge 41b \n" "jge 41b \n"
"49: \n" "49: \n"
"add $0x3,%3 \n" "add $0x3,%3 \n"
"jl 99f \n" "jl 99f \n"
// 1 pixel loop. // 1 pixel loop.
"91: \n" "91: \n"
"movd (%0),%%xmm3 \n" "movd (%0),%%xmm3 \n"
"lea 0x4(%0),%0 \n" "lea 0x4(%0),%0 \n"
...@@ -2629,7 +2589,7 @@ void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) { ...@@ -2629,7 +2589,7 @@ void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) {
"pcmpeqb %%xmm5,%%xmm5 \n" "pcmpeqb %%xmm5,%%xmm5 \n"
"psrld $0x8,%%xmm5 \n" "psrld $0x8,%%xmm5 \n"
// 4 pixel loop // 4 pixel loop
".p2align 4 \n" ".p2align 4 \n"
"1: \n" "1: \n"
"movdqa (%0),%%xmm0 \n" "movdqa (%0),%%xmm0 \n"
......
...@@ -2474,54 +2474,31 @@ void ARGBBlendRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, ...@@ -2474,54 +2474,31 @@ void ARGBBlendRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
add ecx, 1 - 4 add ecx, 1 - 4
jl convertloop4b jl convertloop4b
// 8 pixel loop. // 4 pixel loop.
align 4
convertloop4: convertloop4:
movdqu xmm3, [eax] movdqu xmm3, [eax] // src argb
lea eax, [eax + 16]
movdqa xmm0, xmm3 // src argb movdqa xmm0, xmm3 // src argb
pxor xmm3, xmm4 // ~alpha pxor xmm3, xmm4 // ~alpha
movdqu xmm2, [esi] // _r_b
psrlw xmm3, 8 // alpha psrlw xmm3, 8 // alpha
pshufhw xmm3, xmm3,0F5h // 8 alpha words pshufhw xmm3, xmm3,0F5h // 8 alpha words
pshuflw xmm3, xmm3,0F5h pshuflw xmm3, xmm3,0F5h
movdqu xmm2, [esi] // _r_b
pand xmm2, xmm6 // _r_b pand xmm2, xmm6 // _r_b
paddw xmm3, xmm7 // 256 - alpha paddw xmm3, xmm7 // 256 - alpha
pmullw xmm2, xmm3 // _r_b * alpha pmullw xmm2, xmm3 // _r_b * alpha
movdqu xmm1, [esi] // _a_g movdqu xmm1, [esi] // _a_g
lea esi, [esi + 16]
psrlw xmm1, 8 // _a_g psrlw xmm1, 8 // _a_g
por xmm0, xmm4 // set alpha to 255 por xmm0, xmm4 // set alpha to 255
pmullw xmm1, xmm3 // _a_g * alpha pmullw xmm1, xmm3 // _a_g * alpha
movdqu xmm3, [eax + 16]
lea eax, [eax + 32]
psrlw xmm2, 8 // _r_b convert to 8 bits again psrlw xmm2, 8 // _r_b convert to 8 bits again
paddusb xmm0, xmm2 // + src argb paddusb xmm0, xmm2 // + src argb
pand xmm1, xmm5 // a_g_ convert to 8 bits again pand xmm1, xmm5 // a_g_ convert to 8 bits again
paddusb xmm0, xmm1 // + src argb paddusb xmm0, xmm1 // + src argb
sub ecx, 4 sub ecx, 4
movdqa [edx], xmm0 movdqa [edx], xmm0
jl convertloop4b lea edx, [edx + 16]
movdqa xmm0, xmm3 // src argb
pxor xmm3, xmm4 // ~alpha
movdqu xmm2, [esi + 16] // _r_b
psrlw xmm3, 8 // alpha
pshufhw xmm3, xmm3,0F5h // 8 alpha words
pshuflw xmm3, xmm3,0F5h
pand xmm2, xmm6 // _r_b
paddw xmm3, xmm7 // 256 - alpha
pmullw xmm2, xmm3 // _r_b * alpha
movdqu xmm1, [esi + 16] // _a_g
lea esi, [esi + 32]
psrlw xmm1, 8 // _a_g
por xmm0, xmm4 // set alpha to 255
pmullw xmm1, xmm3 // _a_g * alpha
psrlw xmm2, 8 // _r_b convert to 8 bits again
paddusb xmm0, xmm2 // + src argb
pand xmm1, xmm5 // a_g_ convert to 8 bits again
paddusb xmm0, xmm1 // + src argb
sub ecx, 4
movdqa [edx + 16], xmm0
lea edx, [edx + 32]
jge convertloop4 jge convertloop4
convertloop4b: convertloop4b:
...@@ -2530,7 +2507,7 @@ void ARGBBlendRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, ...@@ -2530,7 +2507,7 @@ void ARGBBlendRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
// 1 pixel loop. // 1 pixel loop.
convertloop1: convertloop1:
movd xmm3, [eax] movd xmm3, [eax] // src argb
lea eax, [eax + 4] lea eax, [eax + 4]
movdqa xmm0, xmm3 // src argb movdqa xmm0, xmm3 // src argb
pxor xmm3, xmm4 // ~alpha pxor xmm3, xmm4 // ~alpha
...@@ -2629,50 +2606,29 @@ void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1, ...@@ -2629,50 +2606,29 @@ void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
add ecx, 1 - 4 add ecx, 1 - 4
jl convertloop4b jl convertloop4b
// 8 pixel loop. // 4 pixel loop.
align 4
convertloop4: convertloop4:
movdqu xmm3, [eax] movdqu xmm3, [eax] // src argb
lea eax, [eax + 16]
movdqa xmm0, xmm3 // src argb movdqa xmm0, xmm3 // src argb
pxor xmm3, xmm4 // ~alpha pxor xmm3, xmm4 // ~alpha
pshufb xmm3, kShuffleAlpha // alpha
movdqu xmm2, [esi] // _r_b movdqu xmm2, [esi] // _r_b
pshufb xmm3, kShuffleAlpha // alpha
pand xmm2, xmm6 // _r_b pand xmm2, xmm6 // _r_b
paddw xmm3, xmm7 // 256 - alpha paddw xmm3, xmm7 // 256 - alpha
pmullw xmm2, xmm3 // _r_b * alpha pmullw xmm2, xmm3 // _r_b * alpha
movdqu xmm1, [esi] // _a_g movdqu xmm1, [esi] // _a_g
lea esi, [esi + 16]
psrlw xmm1, 8 // _a_g psrlw xmm1, 8 // _a_g
por xmm0, xmm4 // set alpha to 255 por xmm0, xmm4 // set alpha to 255
pmullw xmm1, xmm3 // _a_g * alpha pmullw xmm1, xmm3 // _a_g * alpha
movdqu xmm3, [eax + 16]
lea eax, [eax + 32]
psrlw xmm2, 8 // _r_b convert to 8 bits again psrlw xmm2, 8 // _r_b convert to 8 bits again
paddusb xmm0, xmm2 // + src argb paddusb xmm0, xmm2 // + src argb
pand xmm1, xmm5 // a_g_ convert to 8 bits again pand xmm1, xmm5 // a_g_ convert to 8 bits again
paddusb xmm0, xmm1 // + src argb paddusb xmm0, xmm1 // + src argb
sub ecx, 4 sub ecx, 4
movdqa [edx], xmm0 movdqa [edx], xmm0
jl convertloop4b lea edx, [edx + 16]
movdqa xmm0, xmm3 // src argb
pxor xmm3, xmm4 // ~alpha
movdqu xmm2, [esi + 16] // _r_b
pshufb xmm3, kShuffleAlpha // alpha
pand xmm2, xmm6 // _r_b
paddw xmm3, xmm7 // 256 - alpha
pmullw xmm2, xmm3 // _r_b * alpha
movdqu xmm1, [esi + 16] // _a_g
lea esi, [esi + 32]
psrlw xmm1, 8 // _a_g
por xmm0, xmm4 // set alpha to 255
pmullw xmm1, xmm3 // _a_g * alpha
psrlw xmm2, 8 // _r_b convert to 8 bits again
paddusb xmm0, xmm2 // + src argb
pand xmm1, xmm5 // a_g_ convert to 8 bits again
paddusb xmm0, xmm1 // + src argb
sub ecx, 4
movdqa [edx + 16], xmm0
lea edx, [edx + 32]
jge convertloop4 jge convertloop4
convertloop4b: convertloop4b:
...@@ -2681,7 +2637,7 @@ void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1, ...@@ -2681,7 +2637,7 @@ void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
// 1 pixel loop. // 1 pixel loop.
convertloop1: convertloop1:
movd xmm3, [eax] movd xmm3, [eax] // src argb
lea eax, [eax + 4] lea eax, [eax + 4]
movdqa xmm0, xmm3 // src argb movdqa xmm0, xmm3 // src argb
pxor xmm3, xmm4 // ~alpha pxor xmm3, xmm4 // ~alpha
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment