Commit c2295807 authored by fbarchard@google.com

Reduce alignment for loops from 16 bytes to 4 bytes.

Reduces outer loop overhead without hurting inner loop time.
BUG=none
TESTED=try bots
R=fbarchard@chromium.org, mflodman@webrtc.org

Review URL: https://webrtc-codereview.appspot.com/4659004

git-svn-id: http://libyuv.googlecode.com/svn/trunk@880 16f28f9a-4ce2-e073-06de-1de4eb20be90
parent dbe48143
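
Background on the directives changed below: GNU as's ".p2align N" pads to a 2^N-byte boundary, so ".p2align 4" means 16-byte alignment and ".p2align 2" means 4-byte alignment, while the MASM and yasm blocks use "align N", which takes the byte count directly (hence "align 16" becomes "align 4"). The padding bytes are emitted once, before the loop label, so shrinking them trims per-call entry overhead without touching the loop body. A minimal sketch of the pattern, assuming GCC-style inline asm on x86-64; SumBytes_SSE2 is a hypothetical helper for illustration, not part of this commit:

#include <stdint.h>

// Hypothetical helper, not from libyuv: sums all bytes of a buffer,
// 16 bytes per iteration, using the 4-byte loop alignment adopted here.
// Assumes x86-64 with SSE2 and that count is a positive multiple of 16.
static uint32_t SumBytes_SSE2(const uint8_t* src, int count) {
  uint32_t sum;
  asm volatile (
    "pxor      %%xmm0,%%xmm0          \n"  // running sums = 0
    "pxor      %%xmm5,%%xmm5          \n"  // zero operand for psadbw
    ".p2align  2                      \n"  // 2^2 = 4-byte boundary (was 2^4 = 16)
    "1:                               \n"
    "movdqu    (%0),%%xmm1            \n"  // load 16 source bytes
    "lea       0x10(%0),%0            \n"
    "psadbw    %%xmm5,%%xmm1          \n"  // two 64-bit partial byte sums
    "paddd     %%xmm1,%%xmm0          \n"
    "sub       $0x10,%1               \n"
    "jg        1b                     \n"
    "pshufd    $0xee,%%xmm0,%%xmm1    \n"  // fold high 64 bits into low
    "paddd     %%xmm1,%%xmm0          \n"
    "movd      %%xmm0,%2              \n"
    : "+r"(src), "+r"(count), "=r"(sum)
    :
    : "memory", "cc", "xmm0", "xmm1", "xmm5"
  );
  return sum;
}

With ".p2align 2" the assembler emits at most 3 padding bytes ahead of label 1 instead of up to 15, so less straight-line code runs before the loop is entered; the loop body itself is unchanged, which matches the commit message's claim of lower outer-loop overhead with unchanged inner-loop time.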
@@ -31,7 +31,7 @@ uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count) {
asm volatile ( // NOLINT
"pxor %%xmm0,%%xmm0 \n"
"pxor %%xmm5,%%xmm5 \n"
".p2align 4 \n"
".p2align 2 \n"
"1: \n"
"movdqa " MEMACCESS(0) ",%%xmm1 \n"
"lea " MEMLEA(0x10, 0) ",%0 \n"
@@ -107,7 +107,7 @@ uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed) {
"movd %2,%%xmm0 \n"
"pxor %%xmm7,%%xmm7 \n"
"movdqa %4,%%xmm6 \n"
".p2align 4 \n"
".p2align 2 \n"
"1: \n"
"movdqu " MEMACCESS(0) ",%%xmm1 \n"
"lea " MEMLEA(0x10, 0) ",%0 \n"
......
@@ -27,7 +27,7 @@ uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count) {
pxor xmm0, xmm0
pxor xmm5, xmm5
align 16
align 4
wloop:
movdqa xmm1, [eax]
lea eax, [eax + 16]
@@ -70,7 +70,7 @@ uint32 SumSquareError_AVX2(const uint8* src_a, const uint8* src_b, int count) {
vpxor ymm5, ymm5, ymm5 // constant 0 for unpck
sub edx, eax
align 16
align 4
wloop:
vmovdqu ymm1, [eax]
vmovdqu ymm2, [eax + edx]
@@ -145,7 +145,7 @@ uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed) {
pxor xmm7, xmm7 // constant 0 for unpck
movdqa xmm6, kHash16x33
align 16
align 4
wloop:
movdqu xmm1, [eax] // src[0-15]
lea eax, [eax + 16]
@@ -195,7 +195,7 @@ uint32 HashDjb2_AVX2(const uint8* src, int count, uint32 seed) {
movd xmm0, [esp + 12] // seed
movdqa xmm6, kHash16x33
align 16
align 4
wloop:
vpmovzxbd xmm3, dword ptr [eax] // src[0-3]
pmulld xmm0, xmm6 // hash *= 33 ^ 16
......
@@ -91,7 +91,7 @@ static void TransposeWx8_SSSE3(const uint8* src, int src_stride,
// Read in the data from the source pointer.
// First round of bit swap.
align 16
align 4
convertloop:
movq xmm0, qword ptr [eax]
lea ebp, [eax + 8]
@@ -190,7 +190,7 @@ static void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
mov [esp + 16], ecx
mov ecx, [ecx + 16 + 28] // w
align 16
align 4
convertloop:
// Read in the data from the source pointer.
// First round of bit swap.
@@ -304,7 +304,7 @@ static void TransposeWx8_SSSE3(const uint8* src, int src_stride,
asm volatile (
// Read in the data from the source pointer.
// First round of bit swap.
".p2align 4 \n"
".p2align 2 \n"
"1: \n"
"movq (%0),%%xmm0 \n"
"movq (%0,%3),%%xmm1 \n"
@@ -523,7 +523,7 @@ static void TransposeWx8_FAST_SSSE3(const uint8* src, int src_stride,
asm volatile (
// Read in the data from the source pointer.
// First round of bit swap.
".p2align 4 \n"
".p2align 2 \n"
"1: \n"
"movdqa (%0),%%xmm0 \n"
"movdqa (%0,%3),%%xmm1 \n"
@@ -664,7 +664,7 @@ static void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
asm volatile (
// Read in the data from the source pointer.
// First round of bit swap.
".p2align 4 \n"
".p2align 2 \n"
"1: \n"
"movdqa (%0),%%xmm0 \n"
"movdqa (%0,%4),%%xmm1 \n"
......
@@ -31,7 +31,7 @@ void TransposeWx8_NEON(const uint8* src, int src_stride,
"sub %4, #8 \n"
// handle 8x8 blocks. this should be the majority of the plane
".p2align 4 \n"
".p2align 2 \n"
"1: \n"
"mov r9, %0 \n"
@@ -198,7 +198,7 @@ void TransposeUVWx8_NEON(const uint8* src, int src_stride,
"sub %6, #8 \n"
// handle 8x8 blocks. this should be the majority of the plane
".p2align 4 \n"
".p2align 2 \n"
"1: \n"
"mov r9, %0 \n"
......
@@ -28,7 +28,7 @@ cglobal %1ToYRow%3, 3, 3, 3, src_yuy2, dst_y, pix
psrlw m2, m2, 8
%endif
ALIGN 16
align 4
.convertloop:
mov%2 m0, [src_yuy2q]
mov%2 m1, [src_yuy2q + mmsize]
@@ -74,7 +74,7 @@ cglobal SplitUVRow%2, 4, 4, 5, src_uv, dst_u, dst_v, pix
psrlw m4, m4, 8
sub dst_vq, dst_uq
ALIGN 16
align 4
.convertloop:
mov%1 m0, [src_uvq]
mov%1 m1, [src_uvq + mmsize]
@@ -113,7 +113,7 @@ SplitUVRow a,
cglobal MergeUVRow_%2, 4, 4, 3, src_u, src_v, dst_uv, pix
sub src_vq, src_uq
ALIGN 16
align 4
.convertloop:
mov%1 m0, [src_uq]
mov%1 m1, [src_vq]
......
@@ -30,6 +30,7 @@ void ScaleRowDown2_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t /* src_stride */,
"beqz $t9, 2f \n"
" nop \n"
".p2align 2 \n"
"1: \n"
"lw $t0, 0(%[src_ptr]) \n" // |3|2|1|0|
"lw $t1, 4(%[src_ptr]) \n" // |7|6|5|4|
@@ -88,6 +89,7 @@ void ScaleRowDown2Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
"bltz $t9, 2f \n"
" nop \n"
".p2align 2 \n"
"1: \n"
"lw $t0, 0(%[src_ptr]) \n" // |3|2|1|0|
"lw $t1, 4(%[src_ptr]) \n" // |7|6|5|4|
@@ -185,6 +187,7 @@ void ScaleRowDown4_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t /* src_stride */,
"beqz $t9, 2f \n"
" nop \n"
".p2align 2 \n"
"1: \n"
"lw $t1, 0(%[src_ptr]) \n" // |3|2|1|0|
"lw $t2, 4(%[src_ptr]) \n" // |7|6|5|4|
@@ -244,6 +247,7 @@ void ScaleRowDown4Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
"srl $t9, %[dst_width], 1 \n"
"andi $t8, %[dst_width], 1 \n"
".p2align 2 \n"
"1: \n"
"lw $t0, 0(%[src_ptr]) \n" // |3|2|1|0|
"lw $t1, 0(%[s1]) \n" // |7|6|5|4|
@@ -314,6 +318,7 @@ void ScaleRowDown34_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t /* src_stride */,
__asm__ __volatile__ (
".set push \n"
".set noreorder \n"
".p2align 2 \n"
"1: \n"
"lw $t1, 0(%[src_ptr]) \n" // |3|2|1|0|
"lw $t2, 4(%[src_ptr]) \n" // |7|6|5|4|
@@ -360,7 +365,9 @@ void ScaleRowDown34_0_Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
__asm__ __volatile__ (
".set push \n"
".set noreorder \n"
"repl.ph $t3, 3 \n" // 0x00030003
"repl.ph $t3, 3 \n" // 0x00030003
".p2align 2 \n"
"1: \n"
"lw $t0, 0(%[src_ptr]) \n" // |S3|S2|S1|S0|
"lwx $t1, %[src_stride](%[src_ptr]) \n" // |T3|T2|T1|T0|
@@ -416,6 +423,8 @@ void ScaleRowDown34_1_Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
".set push \n"
".set noreorder \n"
"repl.ph $t2, 3 \n" // 0x00030003
".p2align 2 \n"
"1: \n"
"lw $t0, 0(%[src_ptr]) \n" // |S3|S2|S1|S0|
"lwx $t1, %[src_stride](%[src_ptr]) \n" // |T3|T2|T1|T0|
@@ -466,6 +475,8 @@ void ScaleRowDown38_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t /* src_stride */,
__asm__ __volatile__ (
".set push \n"
".set noreorder \n"
".p2align 2 \n"
"1: \n"
"lw $t0, 0(%[src_ptr]) \n" // |3|2|1|0|
"lw $t1, 4(%[src_ptr]) \n" // |7|6|5|4|
@@ -515,6 +526,8 @@ void ScaleRowDown38_2_Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
__asm__ __volatile__ (
".set push \n"
".set noreorder \n"
".p2align 2 \n"
"1: \n"
"lw $t0, 0(%[src_ptr]) \n" // |S3|S2|S1|S0|
"lw $t1, 4(%[src_ptr]) \n" // |S7|S6|S5|S4|
@@ -571,6 +584,8 @@ void ScaleRowDown38_3_Box_MIPS_DSPR2(const uint8* src_ptr,
__asm__ __volatile__ (
".set push \n"
".set noreorder \n"
".p2align 2 \n"
"1: \n"
"lw $t0, 0(%[src_ptr]) \n" // |S3|S2|S1|S0|
"lw $t1, 4(%[src_ptr]) \n" // |S7|S6|S5|S4|
......
@@ -103,7 +103,7 @@ void ScaleRowDown2_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
mov edx, [esp + 12] // dst_ptr
mov ecx, [esp + 16] // dst_width
align 16
align 4
wloop:
movdqa xmm0, [eax]
movdqa xmm1, [eax + 16]
@@ -133,7 +133,7 @@ void ScaleRowDown2Linear_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
psrlw xmm5, 8
align 16
align 4
wloop:
movdqa xmm0, [eax]
movdqa xmm1, [eax + 16]
@@ -172,7 +172,7 @@ void ScaleRowDown2Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
psrlw xmm5, 8
align 16
align 4
wloop:
movdqa xmm0, [eax]
movdqa xmm1, [eax + 16]
@@ -214,7 +214,7 @@ void ScaleRowDown2_Unaligned_SSE2(const uint8* src_ptr,
mov edx, [esp + 12] // dst_ptr
mov ecx, [esp + 16] // dst_width
align 16
align 4
wloop:
movdqu xmm0, [eax]
movdqu xmm1, [eax + 16]
@@ -244,7 +244,7 @@ void ScaleRowDown2Linear_Unaligned_SSE2(const uint8* src_ptr, ptrdiff_t,
pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
psrlw xmm5, 8
align 16
align 4
wloop:
movdqu xmm0, [eax]
movdqu xmm1, [eax + 16]
@@ -284,7 +284,7 @@ void ScaleRowDown2Box_Unaligned_SSE2(const uint8* src_ptr,
pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
psrlw xmm5, 8
align 16
align 4
wloop:
movdqu xmm0, [eax]
movdqu xmm1, [eax + 16]
@@ -328,7 +328,7 @@ void ScaleRowDown4_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
psrld xmm5, 24
pslld xmm5, 16
align 16
align 4
wloop:
movdqa xmm0, [eax]
movdqa xmm1, [eax + 16]
@@ -363,7 +363,7 @@ void ScaleRowDown4Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
pcmpeqb xmm7, xmm7 // generate mask 0x00ff00ff
psrlw xmm7, 8
align 16
align 4
wloop:
movdqa xmm0, [eax]
movdqa xmm1, [eax + 16]
@@ -426,7 +426,7 @@ void ScaleRowDown34_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
movdqa xmm4, kShuf1
movdqa xmm5, kShuf2
align 16
align 4
wloop:
movdqa xmm0, [eax]
movdqa xmm1, [eax + 16]
@@ -480,7 +480,7 @@ void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr,
movdqa xmm6, kMadd11
movdqa xmm7, kRound34
align 16
align 4
wloop:
movdqa xmm0, [eax] // pixels 0..7
movdqa xmm1, [eax + esi]
@@ -539,7 +539,7 @@ void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr,
movdqa xmm6, kMadd11
movdqa xmm7, kRound34
align 16
align 4
wloop:
movdqa xmm0, [eax] // pixels 0..7
movdqa xmm1, [eax + esi]
@@ -596,7 +596,7 @@ void ScaleRowDown38_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
movdqa xmm4, kShuf38a
movdqa xmm5, kShuf38b
align 16
align 4
xloop:
movdqa xmm0, [eax] // 16 pixels -> 0,1,2,3,4,5
movdqa xmm1, [eax + 16] // 16 pixels -> 6,7,8,9,10,11
@@ -632,7 +632,7 @@ void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr,
movdqa xmm4, kScaleAc33
pxor xmm5, xmm5
align 16
align 4
xloop:
movdqa xmm0, [eax] // sum up 3 rows into xmm0/1
movdqa xmm6, [eax + esi]
@@ -698,7 +698,7 @@ void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr,
movdqa xmm4, kShufAb2
movdqa xmm5, kScaleAb2
align 16
align 4
xloop:
movdqa xmm0, [eax] // average 2 rows into xmm0
pavgb xmm0, [eax + esi]
@@ -746,7 +746,7 @@ void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
pxor xmm4, xmm4
dec ebx
align 16
align 4
xloop:
// first row
movdqa xmm0, [esi]
@@ -760,7 +760,7 @@ void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
je ydone
// sum remaining rows
align 16
align 4
yloop:
movdqa xmm2, [eax] // read 16 pixels
lea eax, [eax + edx] // advance to next row
@@ -772,7 +772,7 @@ void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
sub ebp, 1
jg yloop
align 16
align 4
ydone:
movdqa [edi], xmm0
movdqa [edi + 16], xmm1
@@ -861,7 +861,7 @@ void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
movd ebx, xmm0
mov [edi], bl
align 16
align 4
xloop99:
pop edi
@@ -881,7 +881,7 @@ void ScaleColsUp2_SSE2(uint8* dst_ptr, const uint8* src_ptr,
mov eax, [esp + 8] // src_ptr
mov ecx, [esp + 12] // dst_width
align 16
align 4
wloop:
movdqa xmm0, [eax]
lea eax, [eax + 16]
@@ -910,7 +910,7 @@ void ScaleARGBRowDown2_SSE2(const uint8* src_argb,
mov edx, [esp + 12] // dst_argb
mov ecx, [esp + 16] // dst_width
align 16
align 4
wloop:
movdqa xmm0, [eax]
movdqa xmm1, [eax + 16]
@@ -937,7 +937,7 @@ void ScaleARGBRowDown2Linear_SSE2(const uint8* src_argb,
mov edx, [esp + 12] // dst_argb
mov ecx, [esp + 16] // dst_width
align 16
align 4
wloop:
movdqa xmm0, [eax]
movdqa xmm1, [eax + 16]
@@ -968,7 +968,7 @@ void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb,
mov edx, [esp + 4 + 12] // dst_argb
mov ecx, [esp + 4 + 16] // dst_width
align 16
align 4
wloop:
movdqa xmm0, [eax]
movdqa xmm1, [eax + 16]
@@ -1008,7 +1008,7 @@ void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
lea ebx, [ebx * 4]
lea edi, [ebx + ebx * 2]
align 16
align 4
wloop:
movd xmm0, [eax]
movd xmm1, [eax + ebx]
@@ -1049,7 +1049,7 @@ void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb,
lea ebx, [ebx * 4]
lea edi, [ebx + ebx * 2]
align 16
align 4
wloop:
movq xmm0, qword ptr [eax] // row0 4 pairs
movhps xmm0, qword ptr [eax + ebx]
@@ -1238,7 +1238,7 @@ void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb,
packuswb xmm0, xmm0 // argb 8 bits, 1 pixel.
movd [edi], xmm0
align 16
align 4
xloop99:
pop edi
@@ -1257,7 +1257,7 @@ void ScaleARGBColsUp2_SSE2(uint8* dst_argb, const uint8* src_argb,
mov eax, [esp + 8] // src_argb
mov ecx, [esp + 12] // dst_width
align 16
align 4
wloop:
movdqa xmm0, [eax]
lea eax, [eax + 16]
......