Commit 8b9759c4 authored by fbarchard@google.com

I400ToARGB: use 8.8 fixed point to avoid a shift. The gcc version generates its constants in registers to avoid an fPIC performance stall.
BUG=none
TEST=none
Review URL: http://webrtc-codereview.appspot.com/322013

git-svn-id: http://libyuv.googlecode.com/svn/trunk@106 16f28f9a-4ce2-e073-06de-1de4eb20be90
parent 9cece4b1
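The change can be read as follows (a scalar sketch for illustration, not libyuv code; the helper names below are hypothetical). Each Y byte is duplicated into both halves of a 16-bit word, which is approximately Y in 8.8 fixed point; 16.0 (0x1000) is subtracted with unsigned saturation; and the multiply by 1.164 (0x012a in 8.8) keeps only the high half of the product, so the old psraw shift is no longer needed. The dword constants 0x10001000 and 0x012a012a in the assembly simply hold two copies of these 16-bit values per lane.

#include <stdint.h>

// Scalar sketch of the 8.8 fixed-point path: G = (y - 16) * 1.164.
static uint8_t YToGray88(uint8_t y) {
  uint16_t y88 = (uint16_t)(y * 257);                         // punpcklbw x,x: y in 8.8
  uint16_t d = y88 > 0x1000 ? (uint16_t)(y88 - 0x1000) : 0;   // psubusw: subtract 16.0
  uint32_t g = ((uint32_t)d * 0x012a) >> 16;                  // pmulhuw: *1.164, no extra shift
  return (uint8_t)(g > 255 ? 255 : g);                        // packuswb: clamp to a byte
}

// Illustrative row loop: replicate gray into B, G, R and set A = 0xff.
static void YToARGBRow_Sketch(const uint8_t* y_buf, uint8_t* argb_buf, int width) {
  for (int i = 0; i < width; ++i) {
    uint8_t g = YToGray88(y_buf[i]);
    argb_buf[4 * i + 0] = g;     // B
    argb_buf[4 * i + 1] = g;     // G
    argb_buf[4 * i + 2] = g;     // R
    argb_buf[4 * i + 3] = 0xff;  // A
  }
}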
 Name: libyuv
 URL: http://code.google.com/p/libyuv/
-Version: 104
+Version: 106
 License: BSD
 License File: LICENSE
...
@@ -558,47 +558,49 @@ void OMITFP FastConvertYUV444ToARGBRow_SSSE3(const uint8* y_buf,  // rdi
 #endif
 #ifdef HAS_FASTCONVERTYTOARGBROW_SSE2
 void FastConvertYToARGBRow_SSE2(const uint8* y_buf,  // rdi
                                 uint8* rgb_buf,      // rcx
                                 int width) {         // r8
   asm volatile (
-  "pcmpeqb   %%xmm5,%%xmm5                    \n"
-  "pslld     $0x18,%%xmm5                     \n"
-  "pxor      %%xmm4,%%xmm4                    \n"
-  "movdqa    %3,%%xmm3                        \n"
-  "movdqa    %4,%%xmm2                        \n"
+  "pcmpeqb   %%xmm4,%%xmm4                    \n"
+  "pslld     $0x18,%%xmm4                     \n"
+  "mov       $0x10001000,%%eax                \n"
+  "movd      %%eax,%%xmm3                     \n"
+  "pshufd    $0x0,%%xmm3,%%xmm3               \n"
+  "mov       $0x012a012a,%%eax                \n"
+  "movd      %%eax,%%xmm2                     \n"
+  "pshufd    $0x0,%%xmm2,%%xmm2               \n"
 "1:                                           \n"
   // Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164
   "movq      (%0),%%xmm0                      \n"
   "lea       0x8(%0),%0                       \n"
-  "punpcklbw %%xmm4,%%xmm0                    \n"
-  "psubsw    %%xmm3,%%xmm0                    \n"
-  "pmullw    %%xmm2,%%xmm0                    \n"
-  "psraw     $0x6,%%xmm0                      \n"
-  "packuswb  %%xmm0,%%xmm0                    \n"
+  "punpcklbw %%xmm0,%%xmm0                    \n"
+  "psubusw   %%xmm3,%%xmm0                    \n"
+  "pmulhuw   %%xmm2,%%xmm0                    \n"
+  "packuswb  %%xmm0,%%xmm0                    \n"

   // Step 2: Weave into ARGB
   "punpcklbw %%xmm0,%%xmm0                    \n"
   "movdqa    %%xmm0,%%xmm1                    \n"
   "punpcklwd %%xmm0,%%xmm0                    \n"
-  "por       %%xmm5,%%xmm0                    \n"
-  "movdqa    %%xmm0,(%1)                      \n"
-  "punpckhwd %%xmm1,%%xmm1                    \n"
-  "por       %%xmm5,%%xmm1                    \n"
+  "punpckhwd %%xmm1,%%xmm1                    \n"
+  "por       %%xmm4,%%xmm0                    \n"
+  "por       %%xmm4,%%xmm1                    \n"
+  "movdqa    %%xmm0,(%1)                      \n"
   "movdqa    %%xmm1,16(%1)                    \n"
   "lea       32(%1),%1                        \n"

   "sub       $0x8,%2                          \n"
   "ja        1b                               \n"
   : "+r"(y_buf),    // %0
     "+r"(rgb_buf),  // %1
     "+rm"(width)    // %2
-  : "m"(kYuvConstants.kYSub16),  // %3
-    "m"(kYuvConstants.kYToRgb)   // %4
-  : "memory", "cc"
+  :
+  : "memory", "cc", "eax"
 #if defined(__SSE2__)
-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
 #endif
   );
 }
...
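For reference, a rough SSE2-intrinsics equivalent of the new GCC loop above (a sketch only: it assumes width is a multiple of 8, uses unaligned stores where the assembly uses movdqa, and the function name is not part of libyuv):

#include <emmintrin.h>  // SSE2
#include <stdint.h>

static void YToARGBRow_SSE2_Sketch(const uint8_t* y_buf, uint8_t* argb_buf, int width) {
  const __m128i kAlpha  = _mm_set1_epi32((int)0xff000000);  // pcmpeqb + pslld 24
  const __m128i kSub16  = _mm_set1_epi16(0x1000);           // 16.0 in 8.8
  const __m128i kYToRgb = _mm_set1_epi16(0x012a);           // 1.164 in 8.8
  for (int i = 0; i < width; i += 8) {
    // Step 1: scale 8 Y values: G = (y - 16) * 1.164.
    __m128i y   = _mm_loadl_epi64((const __m128i*)(y_buf + i));
    __m128i y88 = _mm_unpacklo_epi8(y, y);        // punpcklbw: Y in 8.8
    __m128i d   = _mm_subs_epu16(y88, kSub16);    // psubusw
    __m128i g16 = _mm_mulhi_epu16(d, kYToRgb);    // pmulhuw: no psraw needed
    __m128i g8  = _mm_packus_epi16(g16, g16);     // packuswb
    // Step 2: weave gray into ARGB and set alpha to 0xff.
    __m128i gg = _mm_unpacklo_epi8(g8, g8);
    __m128i p0 = _mm_or_si128(_mm_unpacklo_epi16(gg, gg), kAlpha);  // first 4 pixels
    __m128i p1 = _mm_or_si128(_mm_unpackhi_epi16(gg, gg), kAlpha);  // next 4 pixels
    _mm_storeu_si128((__m128i*)(argb_buf + 4 * i), p0);
    _mm_storeu_si128((__m128i*)(argb_buf + 4 * i + 16), p1);
  }
}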
@@ -780,23 +780,25 @@ void FastConvertYToARGBRow_SSE2(const uint8* y_buf,
                                 uint8* rgb_buf,
                                 int width) {
   __asm {
+    pcmpeqb    xmm4, xmm4           // generate mask 0xff000000
+    pslld      xmm4, 24
+    mov        eax, 0x10001000
+    movd       xmm3, eax
+    pshufd     xmm3, xmm3, 0
+    mov        eax, 0x012a012a
+    movd       xmm2, eax
+    pshufd     xmm2, xmm2, 0
     mov        eax, [esp + 4]       // Y
     mov        edx, [esp + 8]       // rgb
     mov        ecx, [esp + 12]      // width
-    pcmpeqb    xmm5, xmm5           // generate mask 0xff000000
-    pslld      xmm5, 24
-    pxor       xmm4, xmm4
-    movdqa     xmm3, kYSub16
-    movdqa     xmm2, kYToRgb

  convertloop:
     // Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164
-    movq       xmm0, qword ptr [eax]
+    movq       xmm0, [eax]
     lea        eax, [eax + 8]
-    punpcklbw  xmm0, xmm4
-    psubsw     xmm0, xmm3
-    pmullw     xmm0, xmm2
-    psraw      xmm0, 6
+    punpcklbw  xmm0, xmm0           // Y.Y
+    psubusw    xmm0, xmm3
+    pmulhuw    xmm0, xmm2
     packuswb   xmm0, xmm0           // G

     // Step 2: Weave into ARGB
@@ -804,8 +806,8 @@ void FastConvertYToARGBRow_SSE2(const uint8* y_buf,
     movdqa     xmm1, xmm0
     punpcklwd  xmm0, xmm0           // BGRA first 4 pixels
     punpckhwd  xmm1, xmm1           // BGRA next 4 pixels
-    por        xmm0, xmm5
-    por        xmm1, xmm5
+    por        xmm0, xmm4
+    por        xmm1, xmm4
     movdqa     [edx], xmm0
     movdqa     [edx + 16], xmm1
     lea        edx, [edx + 32]
...