address munge for rowreverse. And compute green mask based on red mask to save one shift.

BUG=none TEST=none Review URL: https://webrtc-codereview.appspot.com/363001 git-svn-id: http://libyuv.googlecode.com/svn/trunk@144 16f28f9a-4ce2-e073-06de-1de4eb20be90

address munge for rowreverse. And compute green mask based on red mask to save one shift.
BUG=none TEST=none Review URL: https://webrtc-codereview.appspot.com/363001 git-svn-id: http://libyuv.googlecode.com/svn/trunk@144 16f28f9a-4ce2-e073-06de-1de4eb20be90
0e6ce93c · fbarchard@google.com · d1943b39 · 0e6ce93c · 0e6ce93c · 0e6ce93c
Commit 0e6ce93c authored Jan 20, 2012 by fbarchard@google.com
Hide whitespace changes
Inline Side-by-side

Showing with 14 additions and 133 deletions

README.chromium README.chromium +1 -1

row_posix.cc source/row_posix.cc +6 -8

row_win.cc source/row_win.cc +7 -124

No files found.
--- a/README.chromium
+++ b/README.chromium
 Name: libyuv
 URL: http://code.google.com/p/libyuv/
-Version: 143
+Version: 144
 License: BSD
 License File: LICENSE


--- a/source/row_posix.cc
+++ b/source/row_posix.cc
@@ -655,14 +655,13 @@ void ReverseRow_SSSE3(const uint8* src, uint8* dst, int width) {
  intptr_t temp_width = static_cast<intptr_t>(width);
  asm volatile (
  "movdqa     %3,%%xmm5                        \n"
-  "lea        -0x10(%0,%2,1),%0                \n"
+  "lea        -0x10(%0),%0                     \n"
  "1:                                          \n"
-    "movdqa     (%0),%%xmm0                    \n"
-    "lea        -0x10(%0),%0                   \n"
+    "movdqa     (%0,%2),%%xmm0                 \n"
    "pshufb     %%xmm5,%%xmm0                  \n"
+    "sub        $0x10,%2                       \n"
    "movdqa     %%xmm0,(%1)                    \n"
    "lea        0x10(%1),%1                    \n"
-    "sub        $0x10,%2                       \n"
    "ja         1b                             \n"
  : "+r"(src),  // %0
    "+r"(dst),  // %1
@@ -681,10 +680,9 @@ void ReverseRow_SSSE3(const uint8* src, uint8* dst, int width) {
 void ReverseRow_SSE2(const uint8* src, uint8* dst, int width) {
  intptr_t temp_width = static_cast<intptr_t>(width);
  asm volatile (
-  "lea        -0x10(%0,%2,1),%0                \n"
+  "lea        -0x10(%0),%0                     \n"
  "1:                                          \n"
-    "movdqa     (%0),%%xmm0                    \n"
-    "lea        -0x10(%0),%0                   \n"
+    "movdqa     (%0,%2),%%xmm0                 \n"
    "movdqa     %%xmm0,%%xmm1                  \n"
    "psllw      $0x8,%%xmm0                    \n"
    "psrlw      $0x8,%%xmm1                    \n"
@@ -692,9 +690,9 @@ void ReverseRow_SSE2(const uint8* src, uint8* dst, int width) {
    "pshuflw    $0x1b,%%xmm0,%%xmm0            \n"
    "pshufhw    $0x1b,%%xmm0,%%xmm0            \n"
    "pshufd     $0x4e,%%xmm0,%%xmm0            \n"
+    "sub        $0x10,%2                       \n"
    "movdqa     %%xmm0,(%1)                    \n"
    "lea        0x10(%1),%1                    \n"
-    "sub        $0x10,%2                       \n"
    "ja         1b                             \n"
  : "+r"(src),  // %0
    "+r"(dst),  // %1

--- a/source/row_win.cc
+++ b/source/row_win.cc
@@ -238,120 +238,6 @@ __asm {
  }
 }

-#ifdef SHIFT565
-// Below shift/mask code is efficient and works, but more instructions than
-// pmul method
-// TODO(fbarchard): Port RGB565ToARGBRow_SSE2 to gcc
-// 29 instructions
-__declspec(naked)
-void OldRGB565ToARGBRow_SSE2(const uint8* src_rgb565, uint8* dst_argb,
-                             int pix) {
-__asm {
-    mov       eax, [esp + 4]   // src_rgb565
-    mov       edx, [esp + 8]   // dst_argb
-    mov       ecx, [esp + 12]  // pix
-    pcmpeqb   xmm5, xmm5       // generate mask 0xff000000 for Alpha
-    pslld     xmm5, 24
-    pcmpeqb   xmm4, xmm4       // generate mask 0xf800f800 for Red
-    psllw     xmm4, 11
-    pcmpeqb   xmm6, xmm6       // generate mask 0x001f001f for Blue
-    psrlw     xmm6, 11
-    pcmpeqb   xmm7, xmm7       // generate mask 0x00fc00fc for Green
-    psrlw     xmm7, 10
-    psllw     xmm7, 2
-
- convertloop:
-    movdqa    xmm0, [eax] // fetch 8 pixels of bgr565
-    lea       eax, [eax + 16]
-    movdqa    xmm1, xmm0
-    movdqa    xmm2, xmm0
-    pand      xmm1, xmm4    // R in upper 5 bits
-    psrlw     xmm2, 13      // R 3 bits
-    psllw     xmm2, 8
-    por       xmm1, xmm2
-    movdqa    xmm2, xmm0
-    pand      xmm2, xmm6    // mask B 5 bits
-    movdqa    xmm3, xmm2
-    psllw     xmm2, 3
-    psrlw     xmm3, 2
-    por       xmm2, xmm3
-    por       xmm1, xmm2    // RB
-    psrlw     xmm0, 3       // G in top 6 bits of lower byte
-    pand      xmm0, xmm7    // mask G 6 bits
-    movdqa    xmm2, xmm0
-    psrlw     xmm2, 6
-    por       xmm0, xmm2
-    por       xmm0, xmm5   // AG
-    movdqa    xmm2, xmm1
-    punpcklbw xmm1, xmm0
-    punpckhbw xmm2, xmm0
-    movdqa    [edx], xmm1  // store 4 pixels of ARGB
-    movdqa    [edx + 16], xmm2  // store next 4 pixels of ARGB
-    lea       edx, [edx + 32]
-    sub       ecx, 8
-    ja        convertloop
-    ret
-  }
-}
-
-// TODO(fbarchard): Port ARGB1555ToARGBRow_SSE2 to gcc
-// 33 instructions
-__declspec(naked)
-void OldARGB1555ToARGBRow_SSE2(const uint8* src_argb1555, uint8* dst_argb,
-                               int pix) {
-__asm {
-    mov       eax, [esp + 4]   // src_argb1555
-    mov       edx, [esp + 8]   // dst_argb
-    mov       ecx, [esp + 12]  // pix
-    pcmpeqb   xmm5, xmm5       // generate mask 0xff00ff00 for Alpha
-    psllw     xmm5, 8
-    pcmpeqb   xmm4, xmm4       // generate mask 0xf800f800 for Red
-    psllw     xmm4, 11
-    pcmpeqb   xmm6, xmm6       // generate mask 0x001f001f for Blue
-    psrlw     xmm6, 11
-    pcmpeqb   xmm7, xmm7       // generate mask 0x00f800f8 for Green
-    psrlw     xmm7, 11
-    psllw     xmm7, 3
-
- convertloop:
-    movdqa    xmm0, [eax] // fetch 8 pixels of bgr565
-    lea       eax, [eax + 16]
-    movdqa    xmm1, xmm0
-    psllw     xmm1, 1
-    movdqa    xmm2, xmm0
-    pand      xmm1, xmm4    // R in upper 5 bits
-    psrlw     xmm2, 13      // R 3 bits
-    psllw     xmm2, 8
-    por       xmm1, xmm2
-    movdqa    xmm2, xmm0
-    pand      xmm2, xmm6    // mask B 5 bits
-    movdqa    xmm3, xmm2
-    psllw     xmm2, 3
-    psrlw     xmm3, 2
-    por       xmm2, xmm3
-    por       xmm1, xmm2    // RB
-    movdqa    xmm2, xmm0
-    psrlw     xmm2, 2       // G in top 5 bits of lower byte
-    pand      xmm2, xmm7    // mask G 5 bits
-    movdqa    xmm3, xmm2
-    psrlw     xmm3, 5
-    por       xmm2, xmm3
-    psraw     xmm0, 8       // A
-    pand      xmm0, xmm5
-    por       xmm0, xmm2    // AG
-    movdqa    xmm2, xmm1
-    punpcklbw xmm1, xmm0
-    punpckhbw xmm2, xmm0
-    movdqa    [edx], xmm1  // store 4 pixels of ARGB
-    movdqa    [edx + 16], xmm2  // store next 4 pixels of ARGB
-    lea       edx, [edx + 32]
-    sub       ecx, 8
-    ja        convertloop
-    ret
-  }
-}
-#endif
-
 // pmul method to replicate bits
 // Math to replicate bits
 // (v << 8) | (v << 3)
@@ -422,8 +308,7 @@ __asm {
    pshufd    xmm6, xmm6, 0
    pcmpeqb   xmm3, xmm3       // generate mask 0xf800f800 for Red
    psllw     xmm3, 11
-    pcmpeqb   xmm4, xmm4       // generate mask 0x03e003e0 for Green
-    psllw     xmm4, 11
+    movdqa    xmm4, xmm3       // generate mask 0x03e003e0 for Green
    psrlw     xmm4, 6
    pcmpeqb   xmm7, xmm7       // generate mask 0xff00ff00 for Alpha
    psllw     xmm7, 8
@@ -1305,14 +1190,13 @@ __asm {
    mov       edx, [esp + 8]   // dst
    mov       ecx, [esp + 12]  // width
    movdqa    xmm5, kShuffleReverse
-    lea       eax, [eax + ecx - 16]
- convertloop:
-    movdqa    xmm0, [eax]
    lea       eax, [eax - 16]
+ convertloop:
+    movdqa    xmm0, [eax + ecx]
    pshufb    xmm0, xmm5
+    sub       ecx, 16
    movdqa    [edx], xmm0
    lea       edx, [edx + 16]
-    sub       ecx, 16
    ja        convertloop
    ret
  }
@@ -1327,10 +1211,9 @@ __asm {
    mov       eax, [esp + 4]   // src
    mov       edx, [esp + 8]   // dst
    mov       ecx, [esp + 12]  // width
-    lea       eax, [eax + ecx - 16]
- convertloop:
-    movdqa    xmm0, [eax]
    lea       eax, [eax - 16]
+ convertloop:
+    movdqa    xmm0, [eax + ecx]
    movdqa    xmm1, xmm0        // swap bytes
    psllw     xmm0, 8
    psrlw     xmm1, 8
@@ -1338,9 +1221,9 @@ __asm {
    pshuflw   xmm0, xmm0, 0x1b  // swap words
    pshufhw   xmm0, xmm0, 0x1b
    pshufd    xmm0, xmm0, 0x4e  // swap qwords
+    sub       ecx, 16
    movdqa    [edx], xmm0
    lea       edx, [edx + 16]
-    sub       ecx, 16
    ja        convertloop
    ret
  }