Remove extra unaligned loop from alphablender. Both aligned and unaligned loops…

Remove extra unaligned loop from alphablender. Both aligned and unaligned loops were the same, so remove the extra. BUG=none TESTED=try bots. R=brucedawson@google.com, harryjin@google.com Review URL: https://webrtc-codereview.appspot.com/29059004 git-svn-id: http://libyuv.googlecode.com/svn/trunk@1166 16f28f9a-4ce2-e073-06de-1de4eb20be90

Remove extra unaligned loop from alphablender. Both aligned and unaligned loops…
Remove extra unaligned loop from alphablender. Both aligned and unaligned loops were the same, so remove the extra. BUG=none TESTED=try bots. R=brucedawson@google.com, harryjin@google.com Review URL: https://webrtc-codereview.appspot.com/29059004 git-svn-id: http://libyuv.googlecode.com/svn/trunk@1166 16f28f9a-4ce2-e073-06de-1de4eb20be90
5822505e · fbarchard@google.com · 1eb636d2 · 5822505e · 5822505e · 5822505e
Commit 5822505e authored Nov 17, 2014 by fbarchard@google.com
Showing with 7 additions and 71 deletions

README.chromium README.chromium +1 -1

row.h include/libyuv/row.h +1 -1

version.h include/libyuv/version.h +1 -1

row_posix.cc source/row_posix.cc +1 -33

row_win.cc source/row_win.cc +3 -35

No files found.
--- a/README.chromium
+++ b/README.chromium
 Name: libyuv
 URL: http://code.google.com/p/libyuv/
-Version: 1164
+Version: 1165
 License: BSD
 License File: LICENSE


--- a/include/libyuv/row.h
+++ b/include/libyuv/row.h
@@ -207,12 +207,12 @@ extern "C" {
 #define HAS_ARGBMULTIPLYROW_AVX2
 #define HAS_ARGBATTENUATEROW_AVX2
 #define HAS_ARGBUNATTENUATEROW_AVX2
-#define HAS_ARGBMIRRORROW_AVX2
 #endif

 // The following are require VS2012.
 // TODO(fbarchard): Port to gcc.
 #if !defined(LIBYUV_DISABLE_X86) && defined(VISUALC_HAS_AVX2)
+#define HAS_ARGBMIRRORROW_AVX2
 #define HAS_ARGBTOUVROW_AVX2
 #define HAS_ARGBTOYJROW_AVX2
 #define HAS_ARGBTOYROW_AVX2

--- a/include/libyuv/version.h
+++ b/include/libyuv/version.h
@@ -11,6 +11,6 @@
 #ifndef INCLUDE_LIBYUV_VERSION_H_  // NOLINT
 #define INCLUDE_LIBYUV_VERSION_H_

-#define LIBYUV_VERSION 1164
+#define LIBYUV_VERSION 1165

 #endif  // INCLUDE_LIBYUV_VERSION_H_  NOLINT
--- a/source/row_posix.cc
+++ b/source/row_posix.cc
@@ -2183,10 +2183,9 @@ void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
  intptr_t temp_width = (intptr_t)(width);
  asm volatile (
    "movdqa    %3,%%xmm5                       \n"
-    "lea       " MEMLEA(-0x10,0) ",%0          \n"
    LABELALIGN
  "1:                                          \n"
-    MEMOPREG(movdqu,0x00,0,2,1,xmm0)           //  movdqu  (%0,%2),%%xmm0
+    MEMOPREG(movdqu,-0x10,0,2,1,xmm0)          //  movdqu -0x10(%0,%2),%%xmm0
    "pshufb    %%xmm5,%%xmm0                   \n"
    "sub       $0x10,%2                        \n"
    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
@@ -3378,10 +3377,6 @@ void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
  "19:                                         \n"
    "add       $1-4,%3                         \n"
    "jl        49f                             \n"
-    "test      $0xf,%0                         \n"
-    "jne       41f                             \n"
-    "test      $0xf,%1                         \n"
-    "jne       41f                             \n"

    // 4 pixel loop.
    LABELALIGN
@@ -3408,33 +3403,6 @@ void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
    "movdqu    %%xmm0," MEMACCESS(2) "         \n"
    "lea       " MEMLEA(0x10,2) ",%2           \n"
    "jge       40b                             \n"
-    "jmp       49f                             \n"
-
-    // 4 pixel loop.
-    LABELALIGN
-  "41:                                         \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm3         \n"
-    "lea       " MEMLEA(0x10,0) ",%0           \n"
-    "movdqa    %%xmm3,%%xmm0                   \n"
-    "pxor      %%xmm4,%%xmm3                   \n"
-    "movdqu    " MEMACCESS(1) ",%%xmm2         \n"
-    "pshufb    %4,%%xmm3                       \n"
-    "pand      %%xmm6,%%xmm2                   \n"
-    "paddw     %%xmm7,%%xmm3                   \n"
-    "pmullw    %%xmm3,%%xmm2                   \n"
-    "movdqu    " MEMACCESS(1) ",%%xmm1         \n"
-    "lea       " MEMLEA(0x10,1) ",%1           \n"
-    "psrlw     $0x8,%%xmm1                     \n"
-    "por       %%xmm4,%%xmm0                   \n"
-    "pmullw    %%xmm3,%%xmm1                   \n"
-    "psrlw     $0x8,%%xmm2                     \n"
-    "paddusb   %%xmm2,%%xmm0                   \n"
-    "pand      %%xmm5,%%xmm1                   \n"
-    "paddusb   %%xmm1,%%xmm0                   \n"
-    "sub       $0x4,%3                         \n"
-    "movdqu    %%xmm0," MEMACCESS(2) "         \n"
-    "lea       " MEMLEA(0x10,2) ",%2           \n"
-    "jge       41b                             \n"

  "49:                                         \n"
    "add       $0x3,%3                         \n"

--- a/source/row_win.cc
+++ b/source/row_win.cc
@@ -2392,7 +2392,6 @@ void YToARGBRow_SSE2(const uint8* y_buf,
 }
 #endif  // HAS_YTOARGBROW_SSE2

-
 #ifdef HAS_MIRRORROW_SSSE3
 // Shuffle table for reversing the bytes.
 static const uvec8 kShuffleMirror = {
@@ -2432,7 +2431,7 @@ void MirrorRow_AVX2(const uint8* src, uint8* dst, int width) {

    align      4
 convertloop:
-    vmovdqu   ymm0, [eax - 32 + ecx]
+    vmovdqu   ymm0, -32[eax + ecx]
    vpshufb   ymm0, ymm0, ymm5
    vpermq    ymm0, ymm0, 0x4e  // swap high and low halfs
    sub       ecx, 32
@@ -2455,7 +2454,7 @@ void MirrorRow_SSE2(const uint8* src, uint8* dst, int width) {

    align      4
 convertloop:
-    movdqu    xmm0, [eax - 16 + ecx]
+    movdqu    xmm0, -16[eax + ecx]
    movdqa    xmm1, xmm0        // swap bytes
    psllw     xmm0, 8
    psrlw     xmm1, 8
@@ -2553,7 +2552,7 @@ void ARGBMirrorRow_AVX2(const uint8* src, uint8* dst, int width) {

    align      4
 convertloop:
-    vpermd    ymm0, ymm5, [eax - 32 + ecx * 4]  // permute dword order
+    vpermd    ymm0, ymm5, -32[eax + ecx * 4]  // permute dword order
    sub       ecx, 8
    vmovdqu   [edx], ymm0
    lea       edx, [edx + 32]
@@ -3608,11 +3607,6 @@ void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
    add        ecx, 1 - 4
    jl         convertloop4b

-    test       eax, 15          // unaligned?
-    jne        convertuloop4
-    test       esi, 15          // unaligned?
-    jne        convertuloop4
-
    // 4 pixel loop.
  convertloop4:
    movdqu     xmm3, [eax]      // src argb
@@ -3637,32 +3631,6 @@ void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
    movdqu     [edx], xmm0
    lea        edx, [edx + 16]
    jge        convertloop4
-    jmp        convertloop4b
-
-    // 4 pixel unaligned loop.
-  convertuloop4:
-    movdqu     xmm3, [eax]      // src argb
-    lea        eax, [eax + 16]
-    movdqa     xmm0, xmm3       // src argb
-    pxor       xmm3, xmm4       // ~alpha
-    movdqu     xmm2, [esi]      // _r_b
-    pshufb     xmm3, kShuffleAlpha // alpha
-    pand       xmm2, xmm6       // _r_b
-    paddw      xmm3, xmm7       // 256 - alpha
-    pmullw     xmm2, xmm3       // _r_b * alpha
-    movdqu     xmm1, [esi]      // _a_g
-    lea        esi, [esi + 16]
-    psrlw      xmm1, 8          // _a_g
-    por        xmm0, xmm4       // set alpha to 255
-    pmullw     xmm1, xmm3       // _a_g * alpha
-    psrlw      xmm2, 8          // _r_b convert to 8 bits again
-    paddusb    xmm0, xmm2       // + src argb
-    pand       xmm1, xmm5       // a_g_ convert to 8 bits again
-    paddusb    xmm0, xmm1       // + src argb
-    sub        ecx, 4
-    movdqu     [edx], xmm0
-    lea        edx, [edx + 16]
-    jge        convertuloop4

  convertloop4b:
    add        ecx, 4 - 1