Commit 976423fe authored by fbarchard@google.com

alpha blend last pixel fix

BUG=none
TEST=none
Review URL: https://webrtc-codereview.appspot.com/439008

git-svn-id: http://libyuv.googlecode.com/svn/trunk@210 16f28f9a-4ce2-e073-06de-1de4eb20be90
parent 90310ddb
Name: libyuv Name: libyuv
URL: http://code.google.com/p/libyuv/ URL: http://code.google.com/p/libyuv/
Version: 209 Version: 210
License: BSD License: BSD
License File: LICENSE License File: LICENSE
......
...@@ -11,7 +11,7 @@ ...@@ -11,7 +11,7 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_ #ifndef INCLUDE_LIBYUV_VERSION_H_
#define INCLUDE_LIBYUV_VERSION_H_ #define INCLUDE_LIBYUV_VERSION_H_
#define LIBYUV_VERSION 209 #define LIBYUV_VERSION 210
#endif // INCLUDE_LIBYUV_VERSION_H_ #endif // INCLUDE_LIBYUV_VERSION_H_
...@@ -218,10 +218,10 @@ void MirrorRow_NEON(const uint8* src, uint8* dst, int width) { ...@@ -218,10 +218,10 @@ void MirrorRow_NEON(const uint8* src, uint8* dst, int width) {
// loop will run one extra time. // loop will run one extra time.
"sub %2, #16 \n" "sub %2, #16 \n"
// mirror the bytes in the 64 bit segments. unable to mirror // mirror the bytes in the 64 bit segments. unable to mirror
// the bytes in the entire 128 bits in one go. // the bytes in the entire 128 bits in one go.
// because of the inability to mirror the entire 128 bits // because of the inability to mirror the entire 128 bits
// mirror the writing out of the two 64 bit segments. // mirror the writing out of the two 64 bit segments.
"1: \n" "1: \n"
"vld1.8 {q0}, [%0]! \n" // src += 16 "vld1.8 {q0}, [%0]! \n" // src += 16
"vrev64.8 q0, q0 \n" "vrev64.8 q0, q0 \n"
......
...@@ -1931,29 +1931,29 @@ void ARGBBlendRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) { ...@@ -1931,29 +1931,29 @@ void ARGBBlendRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) {
"sub %0,%1 \n" "sub %0,%1 \n"
"mov (%0),%3 \n" "mov (%0),%3 \n"
"sub $0x1,%2 \n" "sub $0x1,%2 \n"
"je 8f \n" // last1 "jle 8f \n" // last1
"cmp $0xff000000,%3 \n" "cmp $0xff000000,%3 \n"
"jae 2f \n" // opaqueloop "jae 2f \n" // opaqueloop
"cmp $0xffffff,%3 \n" "cmp $0xffffff,%3 \n"
"ja 3f \n" // translucientloop "ja 3f \n" // translucentloop
// transparentloop // transparentloop
"1: \n" "1: \n"
"sub $0x1,%2 \n" "sub $0x1,%2 \n"
"lea 0x4(%0),%0 \n" "lea 0x4(%0),%0 \n"
"je 8f \n" // last1 "jle 8f \n" // last1
"mov (%0),%3 \n" "mov (%0),%3 \n"
"cmp $0xffffff,%3 \n" "cmp $0xffffff,%3 \n"
"jbe 1b \n" // transparentloop "jbe 1b \n" // transparentloop
"cmp $0xff000000,%3 \n" "cmp $0xff000000,%3 \n"
"jb 3f \n" // translucientloop "jb 3f \n" // translucentloop
// opaqueloop // opaqueloop
"2: \n" "2: \n"
"mov %3,(%0,%1,1) \n" "mov %3,(%0,%1,1) \n"
"lea 0x4(%0),%0 \n" "lea 0x4(%0),%0 \n"
"sub $0x1,%2 \n" "sub $0x1,%2 \n"
"je 8f \n" // last1 "jle 8f \n" // last1
"mov (%0),%3 \n" "mov (%0),%3 \n"
"cmp $0xff000000,%3 \n" "cmp $0xff000000,%3 \n"
"jae 2b \n" // opaqueloop "jae 2b \n" // opaqueloop
...@@ -1961,48 +1961,50 @@ void ARGBBlendRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) { ...@@ -1961,48 +1961,50 @@ void ARGBBlendRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) {
"jbe 1b \n" // transparentloop "jbe 1b \n" // transparentloop
"nop \n" "nop \n"
// translucientloop // translucentloop
"3: \n" "3: \n"
"movq (%0),%%xmm0 \n" "movd %3,%%xmm0 \n"
"movq (%0,%1,1),%%xmm1 \n" "mov (%0,%1,1),%3 \n"
"movd %3,%%xmm1 \n"
"punpcklbw %%xmm0,%%xmm0 \n" "punpcklbw %%xmm0,%%xmm0 \n"
"punpcklbw %%xmm1,%%xmm1 \n" "punpcklbw %%xmm1,%%xmm1 \n"
"pshuflw $0xff,%%xmm0,%%xmm2 \n" "pshuflw $0xff,%%xmm0,%%xmm2 \n"
"pshufhw $0xff,%%xmm2,%%xmm2 \n"
"movdqa %%xmm2,%%xmm3 \n" "movdqa %%xmm2,%%xmm3 \n"
"pxor %%xmm4,%%xmm3 \n" "pxor %%xmm4,%%xmm3 \n"
"pmulhuw %%xmm2,%%xmm0 \n" "pmulhuw %%xmm2,%%xmm0 \n"
"pmulhuw %%xmm3,%%xmm1 \n" "pmulhuw %%xmm3,%%xmm1 \n"
"paddw %%xmm1,%%xmm0 \n" "paddusw %%xmm1,%%xmm0 \n"
"psrlw $0x8,%%xmm0 \n" "psrlw $0x8,%%xmm0 \n"
"packuswb %%xmm0,%%xmm0 \n" "packuswb %%xmm0,%%xmm0 \n"
"movq %%xmm0,(%0,%1,1) \n" "movd %%xmm0,%3 \n"
"mov %3,(%0,%1,1) \n"
"lea 0x8(%0),%0 \n" "lea 0x8(%0),%0 \n"
"sub $0x2,%2 \n" "sub $0x2,%2 \n"
"jbe 8f \n" // last1 "jle 8f \n" // last1
"mov (%0),%3 \n" "mov (%0),%3 \n"
"cmp $0xffffff,%3 \n" "cmp $0xffffff,%3 \n"
"jbe 1b \n" // transparentloop "jbe 1b \n" // transparentloop
"cmp $0xff000000,%3 \n" "cmp $0xff000000,%3 \n"
"jb 3b \n" // translucientloop "jb 3b \n" // translucentloop
"jmp 2b \n" // opaqueloop "jmp 2b \n" // opaqueloop
// last1 // last1
"8: \n" "8: \n"
"add $0x1,%2 \n" "add $0x1,%2 \n" // 1 pixel left?
"je 9f \n" // done "cmp $0x1,%2 \n"
"jl 9f \n" // done
"mov (%0),%3 \n"
"movd %3,%%xmm0 \n" "movd %3,%%xmm0 \n"
"mov (%0,%1,1),%3 \n" "mov (%0,%1,1),%3 \n"
"movd %3,%%xmm1 \n" "movd %3,%%xmm1 \n"
"punpcklbw %%xmm0,%%xmm0 \n" "punpcklbw %%xmm0,%%xmm0 \n"
"punpcklbw %%xmm1,%%xmm1 \n" "punpcklbw %%xmm1,%%xmm1 \n"
"pshuflw $0xff,%%xmm0,%%xmm2 \n" "pshuflw $0xff,%%xmm0,%%xmm2 \n"
"pshufhw $0xff,%%xmm2,%%xmm2 \n"
"movdqa %%xmm2,%%xmm3 \n" "movdqa %%xmm2,%%xmm3 \n"
"pxor %%xmm4,%%xmm3 \n" "pxor %%xmm4,%%xmm3 \n"
"pmulhuw %%xmm2,%%xmm0 \n" "pmulhuw %%xmm2,%%xmm0 \n"
"pmulhuw %%xmm3,%%xmm1 \n" "pmulhuw %%xmm3,%%xmm1 \n"
"paddw %%xmm1,%%xmm0 \n" "paddusw %%xmm1,%%xmm0 \n"
"psrlw $0x8,%%xmm0 \n" "psrlw $0x8,%%xmm0 \n"
"packuswb %%xmm0,%%xmm0 \n" "packuswb %%xmm0,%%xmm0 \n"
"movd %%xmm0,%3 \n" "movd %%xmm0,%3 \n"
......
...@@ -477,7 +477,6 @@ __asm { ...@@ -477,7 +477,6 @@ __asm {
} }
} }
// TODO(fbarchard): Port to gcc
__declspec(naked) __declspec(naked)
void ARGBToRGB565Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) { void ARGBToRGB565Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) {
__asm { __asm {
...@@ -1965,40 +1964,42 @@ void ARGBBlendRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) { ...@@ -1965,40 +1964,42 @@ void ARGBBlendRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) {
mov edx, [esp + 4 + 8] // dst_argb mov edx, [esp + 4 + 8] // dst_argb
mov ecx, [esp + 4 + 12] // width mov ecx, [esp + 4 + 12] // width
pcmpeqb xmm4, xmm4 // generate 0xffffffff do negative alpha pcmpeqb xmm4, xmm4 // generate 0xffffffff do negative alpha
pcmpeqb xmm5, xmm5 // generate 0xff000000 for alpha
pslld xmm5, 24
sub edx, esi sub edx, esi
mov eax, [esi] // get first pixel mov eax, [esi] // get first pixel
sub ecx, 1 // ensure there are at least 2 pixels sub ecx, 1 // ensure there are at least 2 pixels
je last1 // last pixel? jle last1 // last pixel?
cmp eax, 0xFF000000 // opaque? cmp eax, 0xFF000000 // opaque?
jae opaqueloop jae opaqueloop
cmp eax, 0x00FFFFFF // translucient? cmp eax, 0x00FFFFFF // translucent?
ja translucientloop ja translucentloop
align 16 align 16
transparentloop: transparentloop:
sub ecx, 1 sub ecx, 1
lea esi, [esi + 4] lea esi, [esi + 4]
je last1 jle last1
mov eax, [esi] // handle remaining pixel mov eax, [esi] // get next pixel
cmp eax, 0x00FFFFFF // transparent? cmp eax, 0x00FFFFFF // transparent?
jbe transparentloop jbe transparentloop
cmp eax, 0xFF000000 // translucient? cmp eax, 0xFF000000 // translucent?
jb translucientloop jb translucentloop
align 16 align 16
opaqueloop: opaqueloop:
mov dword ptr [esi + edx], eax mov dword ptr [esi + edx], eax
lea esi, [esi + 4] lea esi, [esi + 4]
sub ecx, 1 sub ecx, 1
je last1 jle last1
mov eax, [esi] // handle remaining pixel mov eax, [esi] // get next pixel
cmp eax, 0xFF000000 // opaque? cmp eax, 0xFF000000 // opaque?
jae opaqueloop jae opaqueloop
cmp eax, 0x00FFFFFF // transparent? cmp eax, 0x00FFFFFF // transparent?
jbe transparentloop jbe transparentloop
align 16 align 16
translucientloop: translucentloop:
movq xmm0, qword ptr [esi] // fetch 2 pixels movq xmm0, qword ptr [esi] // fetch 2 pixels
movq xmm1, qword ptr [esi + edx] movq xmm1, qword ptr [esi + edx]
punpcklbw xmm0, xmm0 // src 16 bits punpcklbw xmm0, xmm0 // src 16 bits
...@@ -2009,39 +2010,42 @@ void ARGBBlendRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) { ...@@ -2009,39 +2010,42 @@ void ARGBBlendRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) {
pxor xmm3, xmm4 pxor xmm3, xmm4
pmulhuw xmm0, xmm2 // src * a pmulhuw xmm0, xmm2 // src * a
pmulhuw xmm1, xmm3 // dst * (a ^ 0xffff) pmulhuw xmm1, xmm3 // dst * (a ^ 0xffff)
paddw xmm0, xmm1 paddusw xmm0, xmm1
psrlw xmm0, 8 psrlw xmm0, 8
packuswb xmm0, xmm0 // pack 2 pixels packuswb xmm0, xmm0 // pack 2 pixels
por xmm0, xmm5 // set alpha
movq qword ptr [esi + edx], xmm0 movq qword ptr [esi + edx], xmm0
lea esi, [esi + 8] lea esi, [esi + 8]
sub ecx, 2 sub ecx, 2
jbe last1 jle last1
mov eax, [esi] // handle remaining pixel mov eax, [esi]
cmp eax, 0x00FFFFFF // transparent? cmp eax, 0x00FFFFFF // transparent?
jbe transparentloop jbe transparentloop
cmp eax, 0xFF000000 // translucient? cmp eax, 0xFF000000 // translucent?
jb translucientloop jb translucentloop
jmp opaqueloop jmp opaqueloop
align 16 align 16
last1: last1:
add ecx, 1 add ecx, 1
je done cmp ecx, 1 // 1 left?
jl done
mov eax, [esi] // get next pixel
movd xmm0, eax movd xmm0, eax
mov eax, [esi + edx] mov eax, [esi + edx]
movd xmm1, eax movd xmm1, eax
punpcklbw xmm0, xmm0 // src 16 bits punpcklbw xmm0, xmm0 // src 16 bits
punpcklbw xmm1, xmm1 // dst 16 bits punpcklbw xmm1, xmm1 // dst 16 bits
pshuflw xmm2, xmm0, 0xff // src alpha pshuflw xmm2, xmm0, 0xff // src alpha
pshufhw xmm2, xmm2, 0xff
movdqa xmm3, xmm2 // dst alpha movdqa xmm3, xmm2 // dst alpha
pxor xmm3, xmm4 pxor xmm3, xmm4
pmulhuw xmm0, xmm2 // src * a pmulhuw xmm0, xmm2 // src * a
pmulhuw xmm1, xmm3 // dst * (a ^ 0xffff) pmulhuw xmm1, xmm3 // dst * (a ^ 0xffff)
paddw xmm0, xmm1 paddusw xmm0, xmm1
psrlw xmm0, 8 psrlw xmm0, 8
packuswb xmm0, xmm0 // pack to bytes packuswb xmm0, xmm0 // pack to bytes
por xmm0, xmm5 // set alpha
movd eax, xmm0 movd eax, xmm0
mov dword ptr [esi + edx], eax mov dword ptr [esi + edx], eax
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment