alpha blend last pixel fix

BUG=none TEST=none Review URL: https://webrtc-codereview.appspot.com/439008 git-svn-id: http://libyuv.googlecode.com/svn/trunk@210 16f28f9a-4ce2-e073-06de-1de4eb20be90

alpha blend last pixel fix
BUG=none TEST=none Review URL: https://webrtc-codereview.appspot.com/439008 git-svn-id: http://libyuv.googlecode.com/svn/trunk@210 16f28f9a-4ce2-e073-06de-1de4eb20be90
976423fe · fbarchard@google.com · 90310ddb · 976423fe · 976423fe · 976423fe
Commit 976423fe authored Mar 08, 2012 by fbarchard@google.com
Showing with 46 additions and 40 deletions

README.chromium README.chromium +1 -1

version.h include/libyuv/version.h +1 -1

row_neon.cc source/row_neon.cc +2 -2

row_posix.cc source/row_posix.cc +19 -17

row_win.cc source/row_win.cc +23 -19

No files found.
--- a/README.chromium
+++ b/README.chromium
 Name: libyuv
 URL: http://code.google.com/p/libyuv/
-Version: 209
+Version: 210
 License: BSD
 License File: LICENSE


--- a/include/libyuv/version.h
+++ b/include/libyuv/version.h
@@ -11,7 +11,7 @@
 #ifndef INCLUDE_LIBYUV_VERSION_H_
 #define INCLUDE_LIBYUV_VERSION_H_

-#define LIBYUV_VERSION 209
+#define LIBYUV_VERSION 210

 #endif  // INCLUDE_LIBYUV_VERSION_H_

--- a/source/row_neon.cc
+++ b/source/row_neon.cc
@@ -218,10 +218,10 @@ void MirrorRow_NEON(const uint8* src, uint8* dst, int width) {
    // loop will run one extra time.
    "sub         %2, #16                       \n"

-    // mirror the bytes in the 64 bit segments.  unable to mirror
+    // mirror the bytes in the 64 bit segments. unable to mirror
    // the bytes in the entire 128 bits in one go.
    // because of the inability to mirror the entire 128 bits
-     // mirror the writing out of the two 64 bit segments.
+    // mirror the writing out of the two 64 bit segments.
    "1:                                        \n"
      "vld1.8      {q0}, [%0]!                 \n"  // src += 16
      "vrev64.8    q0, q0                      \n"

--- a/source/row_posix.cc
+++ b/source/row_posix.cc
@@ -1931,29 +1931,29 @@ void ARGBBlendRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) {
    "sub       %0,%1                           \n"
    "mov       (%0),%3                         \n"
    "sub       $0x1,%2                         \n"
-    "je        8f                              \n"  // last1
+    "jle       8f                              \n"  // last1
    "cmp       $0xff000000,%3                  \n"
    "jae       2f                              \n"  // opaqueloop
    "cmp       $0xffffff,%3                    \n"
-    "ja        3f                              \n"  // translucientloop
+    "ja        3f                              \n"  // translucentloop

  // transparentloop
  "1:                                          \n"
    "sub       $0x1,%2                         \n"
    "lea       0x4(%0),%0                      \n"
-    "je        8f                              \n"  // last1
+    "jle       8f                              \n"  // last1
    "mov       (%0),%3                         \n"
    "cmp       $0xffffff,%3                    \n"
    "jbe       1b                              \n"  // transparentloop
    "cmp       $0xff000000,%3                  \n"
-    "jb        3f                              \n"  // translucientloop
+    "jb        3f                              \n"  // translucentloop

  // opaqueloop
  "2:                                          \n"
    "mov       %3,(%0,%1,1)                    \n"
    "lea       0x4(%0),%0                      \n"
    "sub       $0x1,%2                         \n"
-    "je        8f                              \n"  // last1
+    "jle       8f                              \n"  // last1
    "mov       (%0),%3                         \n"
    "cmp       $0xff000000,%3                  \n"
    "jae       2b                              \n"  // opaqueloop
@@ -1961,48 +1961,50 @@ void ARGBBlendRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) {
    "jbe       1b                              \n"  // transparentloop
    "nop                                       \n"

-  // translucientloop
+  // translucentloop
  "3:                                          \n"
-    "movq      (%0),%%xmm0                     \n"
-    "movq      (%0,%1,1),%%xmm1                \n"
+    "movd      %3,%%xmm0                       \n"
+    "mov       (%0,%1,1),%3                    \n"
+    "movd      %3,%%xmm1                       \n"
    "punpcklbw %%xmm0,%%xmm0                   \n"
    "punpcklbw %%xmm1,%%xmm1                   \n"
    "pshuflw   $0xff,%%xmm0,%%xmm2             \n"
-    "pshufhw   $0xff,%%xmm2,%%xmm2             \n"
    "movdqa    %%xmm2,%%xmm3                   \n"
    "pxor      %%xmm4,%%xmm3                   \n"
    "pmulhuw   %%xmm2,%%xmm0                   \n"
    "pmulhuw   %%xmm3,%%xmm1                   \n"
-    "paddw     %%xmm1,%%xmm0                   \n"
+    "paddusw   %%xmm1,%%xmm0                   \n"
    "psrlw     $0x8,%%xmm0                     \n"
    "packuswb  %%xmm0,%%xmm0                   \n"
-    "movq      %%xmm0,(%0,%1,1)                \n"
+    "movd      %%xmm0,%3                       \n"
+    "mov       %3,(%0,%1,1)                    \n"
    "lea       0x8(%0),%0                      \n"
    "sub       $0x2,%2                         \n"
-    "jbe       8f                              \n"  // last1
+    "jle       8f                              \n"  // last1
    "mov       (%0),%3                         \n"
    "cmp       $0xffffff,%3                    \n"
    "jbe       1b                              \n"  // transparentloop
    "cmp       $0xff000000,%3                  \n"
-    "jb        3b                              \n"  // translucientloop
+    "jb        3b                              \n"  // translucentloop
    "jmp       2b                              \n"  // opaqueloop

  // last1
  "8:                                          \n"
-    "add       $0x1,%2                         \n"
-    "je        9f                              \n"  // done
+    "add       $0x1,%2                         \n"  // 1 pixel left?
+    "cmp       $0x1,%2                         \n"
+    "jl        9f                              \n"  // done
+    "mov       (%0),%3                         \n"
    "movd      %3,%%xmm0                       \n"
    "mov       (%0,%1,1),%3                    \n"
    "movd      %3,%%xmm1                       \n"
    "punpcklbw %%xmm0,%%xmm0                   \n"
    "punpcklbw %%xmm1,%%xmm1                   \n"
    "pshuflw   $0xff,%%xmm0,%%xmm2             \n"
-    "pshufhw   $0xff,%%xmm2,%%xmm2             \n"
    "movdqa    %%xmm2,%%xmm3                   \n"
    "pxor      %%xmm4,%%xmm3                   \n"
    "pmulhuw   %%xmm2,%%xmm0                   \n"
    "pmulhuw   %%xmm3,%%xmm1                   \n"
-    "paddw     %%xmm1,%%xmm0                   \n"
+    "paddusw   %%xmm1,%%xmm0                   \n"
    "psrlw     $0x8,%%xmm0                     \n"
    "packuswb  %%xmm0,%%xmm0                   \n"
    "movd      %%xmm0,%3                       \n"

--- a/source/row_win.cc
+++ b/source/row_win.cc
@@ -477,7 +477,6 @@ __asm {
  }
 }

-// TODO(fbarchard): Port to gcc
 __declspec(naked)
 void ARGBToRGB565Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) {
 __asm {
@@ -1965,40 +1964,42 @@ void ARGBBlendRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) {
    mov        edx, [esp + 4 + 8]   // dst_argb
    mov        ecx, [esp + 4 + 12]  // width
    pcmpeqb    xmm4, xmm4       // generate 0xffffffff to negate alpha
+    pcmpeqb    xmm5, xmm5       // generate 0xff000000 for alpha
+    pslld      xmm5, 24
    sub        edx, esi
    mov        eax, [esi]       // get first pixel
    sub        ecx, 1           // ensure there are at least 2 pixels
-    je         last1            // last pixel?
+    jle        last1            // last pixel?
    cmp        eax, 0xFF000000  // opaque?
    jae        opaqueloop
-    cmp        eax, 0x00FFFFFF  // translucient?
-    ja         translucientloop
+    cmp        eax, 0x00FFFFFF  // translucent?
+    ja         translucentloop

    align      16
 transparentloop:
    sub        ecx, 1
    lea        esi, [esi + 4]
-    je         last1
-    mov        eax, [esi]       // handle remaining pixel
+    jle        last1
+    mov        eax, [esi]       // get next pixel
    cmp        eax, 0x00FFFFFF  // transparent?
    jbe        transparentloop
-    cmp        eax, 0xFF000000  // translucient?
-    jb         translucientloop
+    cmp        eax, 0xFF000000  // translucent?
+    jb         translucentloop

    align      16
 opaqueloop:
    mov        dword ptr [esi + edx], eax
    lea        esi, [esi + 4]
    sub        ecx, 1
-    je         last1
-    mov        eax, [esi]       // handle remaining pixel
+    jle        last1
+    mov        eax, [esi]       // get next pixel
    cmp        eax, 0xFF000000  // opaque?
    jae        opaqueloop
    cmp        eax, 0x00FFFFFF  // transparent?
    jbe        transparentloop

    align      16
- translucientloop:
+ translucentloop:
    movq       xmm0, qword ptr [esi]      // fetch 2 pixels
    movq       xmm1, qword ptr [esi + edx]
    punpcklbw  xmm0, xmm0       // src 16 bits
@@ -2009,39 +2010,42 @@ void ARGBBlendRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) {
    pxor       xmm3, xmm4
    pmulhuw    xmm0, xmm2       // src * a
    pmulhuw    xmm1, xmm3       // dst * (a ^ 0xffff)
-    paddw      xmm0, xmm1
+    paddusw    xmm0, xmm1
    psrlw      xmm0, 8
    packuswb   xmm0, xmm0       // pack 2 pixels
+    por        xmm0, xmm5       // set alpha
    movq       qword ptr [esi + edx], xmm0
    lea        esi, [esi + 8]
    sub        ecx, 2
-    jbe        last1
-    mov        eax, [esi]       // handle remaining pixel
+    jle        last1
+    mov        eax, [esi]
    cmp        eax, 0x00FFFFFF  // transparent?
    jbe        transparentloop
-    cmp        eax, 0xFF000000  // translucient?
-    jb         translucientloop
+    cmp        eax, 0xFF000000  // translucent?
+    jb         translucentloop
    jmp        opaqueloop

    align      16
 last1:
    add        ecx, 1
-    je         done
+    cmp        ecx, 1           // 1 left?
+    jl         done

+    mov        eax, [esi]       // get next pixel
    movd       xmm0, eax
    mov        eax,  [esi + edx]
    movd       xmm1, eax
    punpcklbw  xmm0, xmm0       // src 16 bits
    punpcklbw  xmm1, xmm1       // dst 16 bits
    pshuflw    xmm2, xmm0, 0xff // src alpha
-    pshufhw    xmm2, xmm2, 0xff
    movdqa     xmm3, xmm2       // dst alpha
    pxor       xmm3, xmm4
    pmulhuw    xmm0, xmm2       // src * a
    pmulhuw    xmm1, xmm3       // dst * (a ^ 0xffff)
-    paddw      xmm0, xmm1
+    paddusw    xmm0, xmm1
    psrlw      xmm0, 8
    packuswb   xmm0, xmm0       // pack to bytes
+    por        xmm0, xmm5       // set alpha
    movd       eax, xmm0
    mov        dword ptr [esi + edx], eax