Commit 8b9759c4 authored by fbarchard@google.com

I400ToARGB: use 8.8 fixed point to avoid a shift. The gcc version generates its constants in registers to avoid an fPIC performance stall.
BUG=none
TEST=none
Review URL: http://webrtc-codereview.appspot.com/322013

git-svn-id: http://libyuv.googlecode.com/svn/trunk@106 16f28f9a-4ce2-e073-06de-1de4eb20be90
parent 9cece4b1
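The change can be read as follows (a scalar sketch for illustration, not libyuv code; the helper names below are hypothetical). Each Y byte is duplicated into both halves of a 16-bit word, which is approximately Y in 8.8 fixed point; 16.0 (0x1000) is subtracted with unsigned saturation; and the multiply by 1.164 (0x012a in 8.8) keeps only the high half of the product, so the old psraw shift is no longer needed. The dword constants 0x10001000 and 0x012a012a in the assembly simply hold two copies of these 16-bit values per lane.

#include <stdint.h>

// Scalar sketch of the 8.8 fixed-point path: G = (y - 16) * 1.164.
static uint8_t YToGray88(uint8_t y) {
  uint16_t y88 = (uint16_t)(y * 257);                         // punpcklbw x,x: y in 8.8
  uint16_t d = y88 > 0x1000 ? (uint16_t)(y88 - 0x1000) : 0;   // psubusw: subtract 16.0
  uint32_t g = ((uint32_t)d * 0x012a) >> 16;                  // pmulhuw: *1.164, no extra shift
  return (uint8_t)(g > 255 ? 255 : g);                        // packuswb: clamp to a byte
}

// Illustrative row loop: replicate gray into B, G, R and set A = 0xff.
static void YToARGBRow_Sketch(const uint8_t* y_buf, uint8_t* argb_buf, int width) {
  for (int i = 0; i < width; ++i) {
    uint8_t g = YToGray88(y_buf[i]);
    argb_buf[4 * i + 0] = g;     // B
    argb_buf[4 * i + 1] = g;     // G
    argb_buf[4 * i + 2] = g;     // R
    argb_buf[4 * i + 3] = 0xff;  // A
  }
}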
 Name: libyuv
 URL: http://code.google.com/p/libyuv/
-Version: 104
+Version: 106
 License: BSD
 License File: LICENSE
...
@@ -558,47 +558,49 @@ void OMITFP FastConvertYUV444ToARGBRow_SSSE3(const uint8* y_buf,  // rdi
 #endif
 #ifdef HAS_FASTCONVERTYTOARGBROW_SSE2
 void FastConvertYToARGBRow_SSE2(const uint8* y_buf,  // rdi
                                 uint8* rgb_buf,      // rcx
                                 int width) {         // r8
   asm volatile (
-  "pcmpeqb   %%xmm5,%%xmm5                    \n"
-  "pslld     $0x18,%%xmm5                     \n"
-  "pxor      %%xmm4,%%xmm4                    \n"
-  "movdqa    %3,%%xmm3                        \n"
-  "movdqa    %4,%%xmm2                        \n"
+  "pcmpeqb   %%xmm4,%%xmm4                    \n"
+  "pslld     $0x18,%%xmm4                     \n"
+  "mov       $0x10001000,%%eax                \n"
+  "movd      %%eax,%%xmm3                     \n"
+  "pshufd    $0x0,%%xmm3,%%xmm3               \n"
+  "mov       $0x012a012a,%%eax                \n"
+  "movd      %%eax,%%xmm2                     \n"
+  "pshufd    $0x0,%%xmm2,%%xmm2               \n"
 "1:                                           \n"
   // Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164
   "movq      (%0),%%xmm0                      \n"
   "lea       0x8(%0),%0                       \n"
-  "punpcklbw %%xmm4,%%xmm0                    \n"
-  "psubsw    %%xmm3,%%xmm0                    \n"
-  "pmullw    %%xmm2,%%xmm0                    \n"
-  "psraw     $0x6,%%xmm0                      \n"
-  "packuswb  %%xmm0,%%xmm0                    \n"
+  "punpcklbw %%xmm0,%%xmm0                    \n"
+  "psubusw   %%xmm3,%%xmm0                    \n"
+  "pmulhuw   %%xmm2,%%xmm0                    \n"
+  "packuswb  %%xmm0,%%xmm0                    \n"

   // Step 2: Weave into ARGB
   "punpcklbw %%xmm0,%%xmm0                    \n"
   "movdqa    %%xmm0,%%xmm1                    \n"
   "punpcklwd %%xmm0,%%xmm0                    \n"
-  "por       %%xmm5,%%xmm0                    \n"
-  "movdqa    %%xmm0,(%1)                      \n"
-  "punpckhwd %%xmm1,%%xmm1                    \n"
-  "por       %%xmm5,%%xmm1                    \n"
+  "punpckhwd %%xmm1,%%xmm1                    \n"
+  "por       %%xmm4,%%xmm0                    \n"
+  "por       %%xmm4,%%xmm1                    \n"
+  "movdqa    %%xmm0,(%1)                      \n"
   "movdqa    %%xmm1,16(%1)                    \n"
   "lea       32(%1),%1                        \n"

   "sub       $0x8,%2                          \n"
   "ja        1b                               \n"
   : "+r"(y_buf),    // %0
     "+r"(rgb_buf),  // %1
     "+rm"(width)    // %2
-  : "m"(kYuvConstants.kYSub16),  // %3
-    "m"(kYuvConstants.kYToRgb)   // %4
-  : "memory", "cc"
+  :
+  : "memory", "cc", "eax"
 #if defined(__SSE2__)
-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
 #endif
   );
 }
...
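For reference, a rough SSE2-intrinsics equivalent of the new GCC loop above (a sketch only: it assumes width is a multiple of 8, uses unaligned stores where the assembly uses movdqa, and the function name is not part of libyuv):

#include <emmintrin.h>  // SSE2
#include <stdint.h>

static void YToARGBRow_SSE2_Sketch(const uint8_t* y_buf, uint8_t* argb_buf, int width) {
  const __m128i kAlpha  = _mm_set1_epi32((int)0xff000000);  // pcmpeqb + pslld 24
  const __m128i kSub16  = _mm_set1_epi16(0x1000);           // 16.0 in 8.8
  const __m128i kYToRgb = _mm_set1_epi16(0x012a);           // 1.164 in 8.8
  for (int i = 0; i < width; i += 8) {
    // Step 1: scale 8 Y values: G = (y - 16) * 1.164.
    __m128i y   = _mm_loadl_epi64((const __m128i*)(y_buf + i));
    __m128i y88 = _mm_unpacklo_epi8(y, y);        // punpcklbw: Y in 8.8
    __m128i d   = _mm_subs_epu16(y88, kSub16);    // psubusw
    __m128i g16 = _mm_mulhi_epu16(d, kYToRgb);    // pmulhuw: no psraw needed
    __m128i g8  = _mm_packus_epi16(g16, g16);     // packuswb
    // Step 2: weave gray into ARGB and set alpha to 0xff.
    __m128i gg = _mm_unpacklo_epi8(g8, g8);
    __m128i p0 = _mm_or_si128(_mm_unpacklo_epi16(gg, gg), kAlpha);  // first 4 pixels
    __m128i p1 = _mm_or_si128(_mm_unpackhi_epi16(gg, gg), kAlpha);  // next 4 pixels
    _mm_storeu_si128((__m128i*)(argb_buf + 4 * i), p0);
    _mm_storeu_si128((__m128i*)(argb_buf + 4 * i + 16), p1);
  }
}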
@@ -780,23 +780,25 @@ void FastConvertYToARGBRow_SSE2(const uint8* y_buf,
                                 uint8* rgb_buf,
                                 int width) {
   __asm {
+    pcmpeqb    xmm4, xmm4           // generate mask 0xff000000
+    pslld      xmm4, 24
+    mov        eax, 0x10001000
+    movd       xmm3, eax
+    pshufd     xmm3, xmm3, 0
+    mov        eax, 0x012a012a
+    movd       xmm2, eax
+    pshufd     xmm2, xmm2, 0
     mov        eax, [esp + 4]       // Y
     mov        edx, [esp + 8]       // rgb
     mov        ecx, [esp + 12]      // width
-    pcmpeqb    xmm5, xmm5           // generate mask 0xff000000
-    pslld      xmm5, 24
-    pxor       xmm4, xmm4
-    movdqa     xmm3, kYSub16
-    movdqa     xmm2, kYToRgb

  convertloop:
     // Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164
-    movq       xmm0, qword ptr [eax]
+    movq       xmm0, [eax]
     lea        eax, [eax + 8]
-    punpcklbw  xmm0, xmm4
-    psubsw     xmm0, xmm3
-    pmullw     xmm0, xmm2
-    psraw      xmm0, 6
+    punpcklbw  xmm0, xmm0           // Y.Y
+    psubusw    xmm0, xmm3
+    pmulhuw    xmm0, xmm2
     packuswb   xmm0, xmm0           // G

     // Step 2: Weave into ARGB
@@ -804,8 +806,8 @@ void FastConvertYToARGBRow_SSE2(const uint8* y_buf,
     movdqa     xmm1, xmm0
     punpcklwd  xmm0, xmm0           // BGRA first 4 pixels
     punpckhwd  xmm1, xmm1           // BGRA next 4 pixels
-    por        xmm0, xmm5
-    por        xmm1, xmm5
+    por        xmm0, xmm4
+    por        xmm1, xmm4
     movdqa     [edx], xmm0
     movdqa     [edx + 16], xmm1
     lea        edx, [edx + 32]
...