Commit 794fe123 authored by fbarchard@google.com

alpha blend 4 pixel loop bug fix and blender C code match SSE for better…

Fix a bug in the alpha blend 4-pixel loop, and make the blender C code match the SSE code for better testability and to serve as reference code for future optimized code.
BUG=none
TEST=none
Review URL: https://webrtc-codereview.appspot.com/645008

git-svn-id: http://libyuv.googlecode.com/svn/trunk@287 16f28f9a-4ce2-e073-06de-1de4eb20be90
parent ee220888
Name: libyuv
URL: http://code.google.com/p/libyuv/
Version: 286
Version: 287
License: BSD
License File: LICENSE
......
......@@ -11,7 +11,7 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_
#define INCLUDE_LIBYUV_VERSION_H_
#define LIBYUV_VERSION 286
#define LIBYUV_VERSION 287
#endif // INCLUDE_LIBYUV_VERSION_H_
This diff is collapsed.
......@@ -2375,53 +2375,32 @@ void ARGBBlendRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
"add $1-4,%3 \n"
"jl 49f \n"
// 8 pixel loop.
// 4 pixel loop.
".p2align 2 \n"
"41: \n"
"movdqu (%0),%%xmm3 \n"
"lea 0x10(%0),%0 \n"
"movdqa %%xmm3,%%xmm0 \n"
"pxor %%xmm4,%%xmm3 \n"
"movdqu (%1),%%xmm2 \n"
"psrlw $0x8,%%xmm3 \n"
"pshufhw $0xf5,%%xmm3,%%xmm3 \n"
"pshuflw $0xf5,%%xmm3,%%xmm3 \n"
"movdqu (%1),%%xmm2 \n"
"pand %%xmm6,%%xmm2 \n"
"paddw %%xmm7,%%xmm3 \n"
"pmullw %%xmm3,%%xmm2 \n"
"movdqu (%1),%%xmm1 \n"
"lea 0x10(%1),%1 \n"
"psrlw $0x8,%%xmm1 \n"
"por %%xmm4,%%xmm0 \n"
"pmullw %%xmm3,%%xmm1 \n"
"movdqu 0x10(%0),%%xmm3 \n"
"lea 0x20(%0),%0 \n"
"psrlw $0x8,%%xmm2 \n"
"paddusb %%xmm2,%%xmm0 \n"
"pand %%xmm5,%%xmm1 \n"
"paddusb %%xmm1,%%xmm0 \n"
"sub $0x4,%3 \n"
"movdqa %%xmm0,(%2) \n"
"jl 49f \n"
"movdqa %%xmm3,%%xmm0 \n"
"pxor %%xmm4,%%xmm3 \n"
"movdqu 0x10(%1),%%xmm2 \n"
"psrlw $0x8,%%xmm3 \n"
"pshufhw $0xf5,%%xmm3,%%xmm3 \n"
"pshuflw $0xf5,%%xmm3,%%xmm3 \n"
"pand %%xmm6,%%xmm2 \n"
"paddw %%xmm7,%%xmm3 \n"
"pmullw %%xmm3,%%xmm2 \n"
"movdqu 0x10(%1),%%xmm1 \n"
"lea 0x20(%1),%1 \n"
"psrlw $0x8,%%xmm1 \n"
"por %%xmm4,%%xmm0 \n"
"pmullw %%xmm3,%%xmm1 \n"
"psrlw $0x8,%%xmm2 \n"
"paddusb %%xmm2,%%xmm0 \n"
"pand %%xmm5,%%xmm1 \n"
"paddusb %%xmm1,%%xmm0 \n"
"sub $0x4,%3 \n"
"movdqa %%xmm0,0x10(%2) \n"
"lea 0x20(%2),%2 \n"
"lea 0x10(%2),%2 \n"
"jge 41b \n"
"49: \n"
......@@ -2531,49 +2510,30 @@ void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
"add $1-4,%3 \n"
"jl 49f \n"
// 8 pixel loop.
// 4 pixel loop.
".p2align 2 \n"
"41: \n"
"movdqu (%0),%%xmm3 \n"
"lea 0x10(%0),%0 \n"
"movdqa %%xmm3,%%xmm0 \n"
"pxor %%xmm4,%%xmm3 \n"
"pshufb %4,%%xmm3 \n"
"movdqu (%1),%%xmm2 \n"
"pshufb %4,%%xmm3 \n"
"pand %%xmm6,%%xmm2 \n"
"paddw %%xmm7,%%xmm3 \n"
"pmullw %%xmm3,%%xmm2 \n"
"movdqu (%1),%%xmm1 \n"
"lea 0x10(%1),%1 \n"
"psrlw $0x8,%%xmm1 \n"
"por %%xmm4,%%xmm0 \n"
"pmullw %%xmm3,%%xmm1 \n"
"movdqu 0x10(%0),%%xmm3 \n"
"lea 0x20(%0),%0 \n"
"psrlw $0x8,%%xmm2 \n"
"paddusb %%xmm2,%%xmm0 \n"
"pand %%xmm5,%%xmm1 \n"
"paddusb %%xmm1,%%xmm0 \n"
"sub $0x4,%3 \n"
"movdqa %%xmm0,(%2) \n"
"jl 49f \n"
"movdqa %%xmm3,%%xmm0 \n"
"pxor %%xmm4,%%xmm3 \n"
"movdqu 0x10(%1),%%xmm2 \n"
"pshufb %4,%%xmm3 \n"
"pand %%xmm6,%%xmm2 \n"
"paddw %%xmm7,%%xmm3 \n"
"pmullw %%xmm3,%%xmm2 \n"
"movdqu 0x10(%1),%%xmm1 \n"
"lea 0x20(%1),%1 \n"
"psrlw $0x8,%%xmm1 \n"
"por %%xmm4,%%xmm0 \n"
"pmullw %%xmm3,%%xmm1 \n"
"psrlw $0x8,%%xmm2 \n"
"paddusb %%xmm2,%%xmm0 \n"
"pand %%xmm5,%%xmm1 \n"
"paddusb %%xmm1,%%xmm0 \n"
"sub $0x4,%3 \n"
"movdqa %%xmm0,0x10(%2) \n"
"lea 0x20(%2),%2 \n"
"lea 0x10(%2),%2 \n"
"jge 41b \n"
"49: \n"
......
......@@ -2474,54 +2474,31 @@ void ARGBBlendRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
add ecx, 1 - 4
jl convertloop4b
// 8 pixel loop.
align 4
// 4 pixel loop.
convertloop4:
movdqu xmm3, [eax]
movdqu xmm3, [eax] // src argb
lea eax, [eax + 16]
movdqa xmm0, xmm3 // src argb
pxor xmm3, xmm4 // ~alpha
movdqu xmm2, [esi] // _r_b
psrlw xmm3, 8 // alpha
pshufhw xmm3, xmm3,0F5h // 8 alpha words
pshuflw xmm3, xmm3,0F5h
movdqu xmm2, [esi] // _r_b
pand xmm2, xmm6 // _r_b
paddw xmm3, xmm7 // 256 - alpha
pmullw xmm2, xmm3 // _r_b * alpha
movdqu xmm1, [esi] // _a_g
lea esi, [esi + 16]
psrlw xmm1, 8 // _a_g
por xmm0, xmm4 // set alpha to 255
pmullw xmm1, xmm3 // _a_g * alpha
movdqu xmm3, [eax + 16]
lea eax, [eax + 32]
psrlw xmm2, 8 // _r_b convert to 8 bits again
paddusb xmm0, xmm2 // + src argb
pand xmm1, xmm5 // a_g_ convert to 8 bits again
paddusb xmm0, xmm1 // + src argb
sub ecx, 4
movdqa [edx], xmm0
jl convertloop4b
movdqa xmm0, xmm3 // src argb
pxor xmm3, xmm4 // ~alpha
movdqu xmm2, [esi + 16] // _r_b
psrlw xmm3, 8 // alpha
pshufhw xmm3, xmm3,0F5h // 8 alpha words
pshuflw xmm3, xmm3,0F5h
pand xmm2, xmm6 // _r_b
paddw xmm3, xmm7 // 256 - alpha
pmullw xmm2, xmm3 // _r_b * alpha
movdqu xmm1, [esi + 16] // _a_g
lea esi, [esi + 32]
psrlw xmm1, 8 // _a_g
por xmm0, xmm4 // set alpha to 255
pmullw xmm1, xmm3 // _a_g * alpha
psrlw xmm2, 8 // _r_b convert to 8 bits again
paddusb xmm0, xmm2 // + src argb
pand xmm1, xmm5 // a_g_ convert to 8 bits again
paddusb xmm0, xmm1 // + src argb
sub ecx, 4
movdqa [edx + 16], xmm0
lea edx, [edx + 32]
lea edx, [edx + 16]
jge convertloop4
convertloop4b:
......@@ -2530,7 +2507,7 @@ void ARGBBlendRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
// 1 pixel loop.
convertloop1:
movd xmm3, [eax]
movd xmm3, [eax] // src argb
lea eax, [eax + 4]
movdqa xmm0, xmm3 // src argb
pxor xmm3, xmm4 // ~alpha
......@@ -2629,50 +2606,29 @@ void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
add ecx, 1 - 4
jl convertloop4b
// 8 pixel loop.
align 4
// 4 pixel loop.
convertloop4:
movdqu xmm3, [eax]
movdqu xmm3, [eax] // src argb
lea eax, [eax + 16]
movdqa xmm0, xmm3 // src argb
pxor xmm3, xmm4 // ~alpha
pshufb xmm3, kShuffleAlpha // alpha
movdqu xmm2, [esi] // _r_b
pshufb xmm3, kShuffleAlpha // alpha
pand xmm2, xmm6 // _r_b
paddw xmm3, xmm7 // 256 - alpha
pmullw xmm2, xmm3 // _r_b * alpha
movdqu xmm1, [esi] // _a_g
lea esi, [esi + 16]
psrlw xmm1, 8 // _a_g
por xmm0, xmm4 // set alpha to 255
pmullw xmm1, xmm3 // _a_g * alpha
movdqu xmm3, [eax + 16]
lea eax, [eax + 32]
psrlw xmm2, 8 // _r_b convert to 8 bits again
paddusb xmm0, xmm2 // + src argb
pand xmm1, xmm5 // a_g_ convert to 8 bits again
paddusb xmm0, xmm1 // + src argb
sub ecx, 4
movdqa [edx], xmm0
jl convertloop4b
movdqa xmm0, xmm3 // src argb
pxor xmm3, xmm4 // ~alpha
movdqu xmm2, [esi + 16] // _r_b
pshufb xmm3, kShuffleAlpha // alpha
pand xmm2, xmm6 // _r_b
paddw xmm3, xmm7 // 256 - alpha
pmullw xmm2, xmm3 // _r_b * alpha
movdqu xmm1, [esi + 16] // _a_g
lea esi, [esi + 32]
psrlw xmm1, 8 // _a_g
por xmm0, xmm4 // set alpha to 255
pmullw xmm1, xmm3 // _a_g * alpha
psrlw xmm2, 8 // _r_b convert to 8 bits again
paddusb xmm0, xmm2 // + src argb
pand xmm1, xmm5 // a_g_ convert to 8 bits again
paddusb xmm0, xmm1 // + src argb
sub ecx, 4
movdqa [edx + 16], xmm0
lea edx, [edx + 32]
lea edx, [edx + 16]
jge convertloop4
convertloop4b:
......@@ -2681,7 +2637,7 @@ void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
// 1 pixel loop.
convertloop1:
movd xmm3, [eax]
movd xmm3, [eax] // src argb
lea eax, [eax + 4]
movdqa xmm0, xmm3 // src argb
pxor xmm3, xmm4 // ~alpha
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment