Commit 8b9759c4 authored by fbarchard@google.com

I400ToARGB use 8.8 fixed point to avoid a shift. gcc generate constants to avoid…

I400ToARGB: use 8.8 fixed point to avoid a shift. For gcc, generate the constants in registers to avoid an fpic performance stall.
BUG=none
TEST=none
Review URL: http://webrtc-codereview.appspot.com/322013

git-svn-id: http://libyuv.googlecode.com/svn/trunk@106 16f28f9a-4ce2-e073-06de-1de4eb20be90
parent 9cece4b1
Name: libyuv
URL: http://code.google.com/p/libyuv/
Version: 104
Version: 106
License: BSD
License File: LICENSE
......
......@@ -558,47 +558,49 @@ void OMITFP FastConvertYUV444ToARGBRow_SSSE3(const uint8* y_buf, // rdi
#endif
#ifdef HAS_FASTCONVERTYTOARGBROW_SSE2
// Converts a row of Y (luma) bytes to ARGB using SSE2 inline assembly.
// Each loop iteration reads 8 bytes from y_buf and stores 32 bytes to rgb_buf:
// the scaled Y value is replicated into the low three bytes of every pixel and
// the top byte is forced to 0xff.
//
// The stores use movdqa, so rgb_buf must be 16-byte aligned. The loop body
// always runs at least once and handles 8 pixels per pass.
//
// NOTE(review): this listing is a rendered diff whose +/- markers were lost,
// so the pre-change and post-change bodies are interleaved below. The
// grouping marked in the comments is inferred from the clobber lists and the
// operand lists; confirm against the repository before relying on it.
// NOTE(review): the register names in the parameter comments are not enforced
// by the "+r"/"+rm" constraints; the compiler chooses the actual registers.
void FastConvertYToARGBRow_SSE2(const uint8* y_buf, // rdi
uint8* rgb_buf, // rcx
int width) { // r8
asm volatile (
// ---- presumably pre-change setup (uses xmm5 and memory operands %3/%4) ----
// xmm5 = 0xff000000 in every dword (mask for the top byte of each pixel).
"pcmpeqb %%xmm5,%%xmm5 \n"
"pslld $0x18,%%xmm5 \n"
// xmm4 = 0, used to zero-extend bytes to words.
"pxor %%xmm4,%%xmm4 \n"
// Load the subtract and multiply constants from memory operands.
"movdqa %3,%%xmm3 \n"
"movdqa %4,%%xmm2 \n"
// ---- presumably post-change setup (constants built in registers via eax) ----
// xmm4 = 0xff000000 in every dword (mask for the top byte of each pixel).
"pcmpeqb %%xmm4,%%xmm4 \n"
"pslld $0x18,%%xmm4 \n"
// xmm3 = 0x1000 in all eight words, i.e. 16 in 8.8 fixed point.
"mov $0x10001000,%%eax \n"
"movd %%eax,%%xmm3 \n"
"pshufd $0x0,%%xmm3,%%xmm3 \n"
// xmm2 = 0x012a (298) in all eight words; 298 / 256 is about 1.164.
"mov $0x012a012a,%%eax \n"
"movd %%eax,%%xmm2 \n"
"pshufd $0x0,%%xmm2,%%xmm2 \n"
"1: \n"
// ---- presumably pre-change loop body (multiply low, then shift right by 6) ----
// Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164
"movq (%0),%%xmm0 \n"
"lea 0x8(%0),%0 \n"
// Zero-extend the 8 Y bytes to words.
"punpcklbw %%xmm4,%%xmm0 \n"
// Signed saturating subtract, keep the low 16 bits of the product, then
// arithmetic shift right by 6.
"psubsw %%xmm3,%%xmm0 \n"
"pmullw %%xmm2,%%xmm0 \n"
"psraw $0x6,%%xmm0 \n"
// Pack words to bytes with unsigned saturation.
"packuswb %%xmm0,%%xmm0 \n"
// Step 2: Weave into ARGB
"punpcklbw %%xmm0,%%xmm0 \n"
"movdqa %%xmm0,%%xmm1 \n"
"punpcklwd %%xmm0,%%xmm0 \n"
"por %%xmm5,%%xmm0 \n"
"movdqa %%xmm0,(%1) \n"
"punpckhwd %%xmm1,%%xmm1 \n"
"por %%xmm5,%%xmm1 \n"
"movdqa %%xmm1,16(%1) \n"
"lea 32(%1),%1 \n"
// Consume 8 pixels; loop again only if more than 8 remained (unsigned).
"sub $0x8,%2 \n"
"ja 1b \n"
// ---- presumably post-change loop body (8.8 fixed point, no shift) ----
// Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164
"movq (%0),%%xmm0 \n"
"lea 0x8(%0),%0 \n"
// Interleave Y with itself: each byte y becomes the word y * 257.
"punpcklbw %%xmm0,%%xmm0 \n"
// Unsigned saturating subtract of 0x1000, clamping at zero.
"psubusw %%xmm3,%%xmm0 \n"
// Keep the high 16 bits of the unsigned product, so no separate shift.
"pmulhuw %%xmm2,%%xmm0 \n"
// Pack words to bytes with unsigned saturation.
"packuswb %%xmm0,%%xmm0 \n"
// Step 2: Weave into ARGB
// Replicate each result byte to a word, then to a dword (4 equal bytes).
"punpcklbw %%xmm0,%%xmm0 \n"
"movdqa %%xmm0,%%xmm1 \n"
"punpcklwd %%xmm0,%%xmm0 \n"
"punpckhwd %%xmm1,%%xmm1 \n"
// Force the top byte of every pixel to 0xff.
"por %%xmm4,%%xmm0 \n"
"por %%xmm4,%%xmm1 \n"
// Aligned stores of the first and next 4 pixels.
"movdqa %%xmm0,(%1) \n"
"movdqa %%xmm1,16(%1) \n"
"lea 32(%1),%1 \n"
// Consume 8 pixels; loop again only if more than 8 remained (unsigned).
"sub $0x8,%2 \n"
"ja 1b \n"
// Read-write operands: both pointers and the counter are modified by the loop.
: "+r"(y_buf), // %0
"+r"(rgb_buf), // %1
"+rm"(width) // %2
// ---- presumably pre-change input operands and clobbers ----
: "m"(kYuvConstants.kYSub16), // %3
"m"(kYuvConstants.kYToRgb) // %4
: "memory", "cc"
// ---- presumably post-change: no inputs; eax is clobbered by the constant setup ----
:
: "memory", "cc", "eax"
#if defined(__SSE2__)
// First list (through xmm5) matches the pre-change body, second the post-change body.
, "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
, "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
#endif
);
}
......
......@@ -780,23 +780,25 @@ void FastConvertYToARGBRow_SSE2(const uint8* y_buf,
uint8* rgb_buf,
int width) {
__asm {
pcmpeqb xmm4, xmm4 // generate mask 0xff000000
pslld xmm4, 24
mov eax,0x10001000
movd xmm3,eax
pshufd xmm3,xmm3,0
mov eax,0x012a012a
movd xmm2,eax
pshufd xmm2,xmm2,0
mov eax, [esp + 4] // Y
mov edx, [esp + 8] // rgb
mov ecx, [esp + 12] // width
pcmpeqb xmm5, xmm5 // generate mask 0xff000000
pslld xmm5, 24
pxor xmm4, xmm4
movdqa xmm3, kYSub16
movdqa xmm2, kYToRgb
convertloop:
// Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164
movq xmm0, qword ptr [eax]
movq xmm0, [eax]
lea eax, [eax + 8]
punpcklbw xmm0, xmm4
psubsw xmm0, xmm3
pmullw xmm0, xmm2
psraw xmm0, 6
punpcklbw xmm0, xmm0 // Y.Y
psubusw xmm0, xmm3
pmulhuw xmm0, xmm2
packuswb xmm0, xmm0 // G
// Step 2: Weave into ARGB
......@@ -804,8 +806,8 @@ void FastConvertYToARGBRow_SSE2(const uint8* y_buf,
movdqa xmm1, xmm0
punpcklwd xmm0, xmm0 // BGRA first 4 pixels
punpckhwd xmm1, xmm1 // BGRA next 4 pixels
por xmm0, xmm5
por xmm1, xmm5
por xmm0, xmm4
por xmm1, xmm4
movdqa [edx], xmm0
movdqa [edx + 16], xmm1
lea edx, [edx + 32]
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment