Remove .p2align directives from inline assembly (x86 rotate loops use LABELALIGN instead)

R=harryjin@google.com
BUG=none

Review URL: https://webrtc-codereview.appspot.com/54809004.

Remove .p2align directives from inline assembly (x86 rotate loops use LABELALIGN instead)

R=harryjin@google.com
BUG=none

Review URL: https://webrtc-codereview.appspot.com/54809004.
1f461f73 · Frank Barchard · 6e7ef3fd · 1f461f73 · 1f461f73 · 1f461f73
Commit 1f461f73 authored Aug 05, 2015 by Frank Barchard
9 changed files
--- a/source/compare_neon.cc
+++ b/source/compare_neon.cc
@@ -27,7 +27,6 @@ uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count) {
    "vmov.u8    q9, #0                         \n"
    "vmov.u8    q11, #0                        \n"

-    ".p2align  2                               \n"
  "1:                                          \n"
    MEMACCESS(0)
    "vld1.8     {q0}, [%0]!                    \n"

--- a/source/compare_neon64.cc
+++ b/source/compare_neon64.cc
@@ -26,7 +26,6 @@ uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count) {
    "eor        v17.16b, v17.16b, v17.16b      \n"
    "eor        v19.16b, v19.16b, v19.16b      \n"

-    ".p2align  2                               \n"
  "1:                                          \n"
    MEMACCESS(0)
    "ld1        {v0.16b}, [%0], #16            \n"

--- a/source/rotate_gcc.cc
+++ b/source/rotate_gcc.cc
@@ -26,7 +26,7 @@ void TransposeWx8_SSSE3(const uint8* src, int src_stride,
  asm volatile (
    // Read in the data from the source pointer.
    // First round of bit swap.
-    ".p2align  2                                 \n"
+    LABELALIGN
  "1:                                            \n"
    "movq       (%0),%%xmm0                      \n"
    "movq       (%0,%3),%%xmm1                   \n"
@@ -114,7 +114,7 @@ void TransposeWx8_Fast_SSSE3(const uint8* src, int src_stride,
  asm volatile (
    // Read in the data from the source pointer.
    // First round of bit swap.
-    ".p2align  2                                 \n"
+    LABELALIGN
  "1:                                            \n"
    "movdqu     (%0),%%xmm0                      \n"
    "movdqu     (%0,%3),%%xmm1                   \n"
@@ -256,7 +256,7 @@ void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
  asm volatile (
    // Read in the data from the source pointer.
    // First round of bit swap.
-    ".p2align  2                                 \n"
+    LABELALIGN
  "1:                                            \n"
    "movdqu     (%0),%%xmm0                      \n"
    "movdqu     (%0,%4),%%xmm1                   \n"

--- a/source/rotate_neon.cc
+++ b/source/rotate_neon.cc
@@ -35,7 +35,6 @@ void TransposeWx8_NEON(const uint8* src, int src_stride,
    "sub         %5, #8                        \n"

    // handle 8x8 blocks. this should be the majority of the plane
-    ".p2align  2                               \n"
    "1:                                        \n"
      "mov         %0, %1                      \n"

@@ -256,7 +255,6 @@ void TransposeUVWx8_NEON(const uint8* src, int src_stride,
    "sub         %7, #8                        \n"

    // handle 8x8 blocks. this should be the majority of the plane
-    ".p2align  2                               \n"
    "1:                                        \n"
      "mov         %0, %1                      \n"


--- a/source/row_gcc.cc
+++ b/source/row_gcc.cc
@@ -141,101 +141,6 @@ static uvec8 kShuffleMaskARGBToRAW_0 = {
 };
 #endif  // HAS_RGB24TOARGBROW_SSSE3

-#if defined(TESTING) && defined(__x86_64__)
-void TestRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
-  asm volatile (
-    ".p2align  5                               \n"
-    "mov       %%eax,%%eax                     \n"
-    "mov       %%ebx,%%ebx                     \n"
-    "mov       %%ecx,%%ecx                     \n"
-    "mov       %%edx,%%edx                     \n"
-    "mov       %%esi,%%esi                     \n"
-    "mov       %%edi,%%edi                     \n"
-    "mov       %%ebp,%%ebp                     \n"
-    "mov       %%esp,%%esp                     \n"
-    ".p2align  5                               \n"
-    "mov       %%r8d,%%r8d                     \n"
-    "mov       %%r9d,%%r9d                     \n"
-    "mov       %%r10d,%%r10d                   \n"
-    "mov       %%r11d,%%r11d                   \n"
-    "mov       %%r12d,%%r12d                   \n"
-    "mov       %%r13d,%%r13d                   \n"
-    "mov       %%r14d,%%r14d                   \n"
-    "mov       %%r15d,%%r15d                   \n"
-    ".p2align  5                               \n"
-    "lea       (%%rax),%%eax                   \n"
-    "lea       (%%rbx),%%ebx                   \n"
-    "lea       (%%rcx),%%ecx                   \n"
-    "lea       (%%rdx),%%edx                   \n"
-    "lea       (%%rsi),%%esi                   \n"
-    "lea       (%%rdi),%%edi                   \n"
-    "lea       (%%rbp),%%ebp                   \n"
-    "lea       (%%rsp),%%esp                   \n"
-    ".p2align  5                               \n"
-    "lea       (%%r8),%%r8d                    \n"
-    "lea       (%%r9),%%r9d                    \n"
-    "lea       (%%r10),%%r10d                  \n"
-    "lea       (%%r11),%%r11d                  \n"
-    "lea       (%%r12),%%r12d                  \n"
-    "lea       (%%r13),%%r13d                  \n"
-    "lea       (%%r14),%%r14d                  \n"
-    "lea       (%%r15),%%r15d                  \n"
-
-    ".p2align  5                               \n"
-    "lea       0x10(%%rax),%%eax               \n"
-    "lea       0x10(%%rbx),%%ebx               \n"
-    "lea       0x10(%%rcx),%%ecx               \n"
-    "lea       0x10(%%rdx),%%edx               \n"
-    "lea       0x10(%%rsi),%%esi               \n"
-    "lea       0x10(%%rdi),%%edi               \n"
-    "lea       0x10(%%rbp),%%ebp               \n"
-    "lea       0x10(%%rsp),%%esp               \n"
-    ".p2align  5                               \n"
-    "lea       0x10(%%r8),%%r8d                \n"
-    "lea       0x10(%%r9),%%r9d                \n"
-    "lea       0x10(%%r10),%%r10d              \n"
-    "lea       0x10(%%r11),%%r11d              \n"
-    "lea       0x10(%%r12),%%r12d              \n"
-    "lea       0x10(%%r13),%%r13d              \n"
-    "lea       0x10(%%r14),%%r14d              \n"
-    "lea       0x10(%%r15),%%r15d              \n"
-
-    ".p2align  5                               \n"
-    "add       0x10,%%eax                      \n"
-    "add       0x10,%%ebx                      \n"
-    "add       0x10,%%ecx                      \n"
-    "add       0x10,%%edx                      \n"
-    "add       0x10,%%esi                      \n"
-    "add       0x10,%%edi                      \n"
-    "add       0x10,%%ebp                      \n"
-    "add       0x10,%%esp                      \n"
-    ".p2align  5                               \n"
-    "add       0x10,%%r8d                      \n"
-    "add       0x10,%%r9d                      \n"
-    "add       0x10,%%r10d                     \n"
-    "add       0x10,%%r11d                     \n"
-    "add       0x10,%%r12d                     \n"
-    "add       0x10,%%r13d                     \n"
-    "add       0x10,%%r14d                     \n"
-    "add       0x10,%%r15d                     \n"
-
-    ".p2align  2                               \n"
-  "1:                                          \n"
-    "movq      " MEMACCESS(0) ",%%xmm0         \n"
-    "lea       " MEMLEA(0x8,0) ",%0            \n"
-    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
-    "lea       " MEMLEA(0x20,1) ",%1           \n"
-    "sub       $0x8,%2                         \n"
-    "jg        1b                              \n"
-  : "+r"(src_y),     // %0
-    "+r"(dst_argb),  // %1
-    "+r"(pix)        // %2
-  :
-  : "memory", "cc", "xmm0", "xmm1", "xmm5"
-  );
-}
-#endif  // TESTING
-
 #ifdef HAS_J400TOARGBROW_SSE2
 void J400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
  asm volatile (

--- a/source/row_mips.cc
+++ b/source/row_mips.cc
@@ -389,7 +389,6 @@ void SplitUVRow_MIPS_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
    "blez            $t4, 2f                       \n"
    " andi           %[width], %[width], 0xf       \n"  // residual

-    ".p2align        2                             \n"
  "1:                                              \n"
    "addiu           $t4, $t4, -1                  \n"
    "lw              $t0, 0(%[src_uv])             \n"  // V1 | U1 | V0 | U0
@@ -457,7 +456,6 @@ void MirrorRow_MIPS_DSPR2(const uint8* src, uint8* dst, int width) {
    "blez      $t4, 2f                     \n"
    " addu     %[src], %[src], %[width]    \n"  // src += width

-    ".p2align  2                           \n"
   "1:                                     \n"
    "lw        $t0, -16(%[src])            \n"  // |3|2|1|0|
    "lw        $t1, -12(%[src])            \n"  // |7|6|5|4|
@@ -512,7 +510,6 @@ void MirrorUVRow_MIPS_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
    "blez            %[x], 2f                     \n"
    " addu           %[src_uv], %[src_uv], $t4    \n"

-    ".p2align        2                            \n"
   "1:                                            \n"
    "lw              $t0, -32(%[src_uv])          \n"  // |3|2|1|0|
    "lw              $t1, -28(%[src_uv])          \n"  // |7|6|5|4|
@@ -673,7 +670,6 @@ void I422ToARGBRow_MIPS_DSPR2(const uint8* y_buf,
    "lui               $s6, 0xff00            \n"
    "ori               $s6, 0xff00            \n"  // |ff|00|ff|00|ff|

-    ".p2align          2                      \n"
   "1:                                        \n"
      I422ToTransientMipsRGB
 // Arranging into argb format
@@ -735,7 +731,6 @@ void I422ToABGRRow_MIPS_DSPR2(const uint8* y_buf,
    "lui               $s6, 0xff00            \n"
    "ori               $s6, 0xff00            \n"  // |ff|00|ff|00|

-    ".p2align          2                       \n"
   "1:                                         \n"
      I422ToTransientMipsRGB
 // Arranging into abgr format
@@ -797,7 +792,6 @@ void I422ToBGRARow_MIPS_DSPR2(const uint8* y_buf,
    "lui               $s6, 0xff              \n"
    "ori               $s6, 0xff              \n"  // |00|ff|00|ff|

-    ".p2align          2                      \n"
   "1:                                        \n"
      I422ToTransientMipsRGB
      // Arranging into bgra format
@@ -857,7 +851,6 @@ void InterpolateRow_MIPS_DSPR2(uint8* dst_ptr, const uint8* src_ptr,
     "replv.ph          $t0, %[y0_fraction]               \n"
     "replv.ph          $t1, %[source_y_fraction]         \n"

-    ".p2align           2                                 \n"
   "1:                                                    \n"
     "lw                $t2, 0(%[src_ptr])                \n"
     "lw                $t3, 0(%[src_ptr1])               \n"

--- a/source/row_neon.cc
+++ b/source/row_neon.cc
--- a/source/scale_mips.cc
+++ b/source/scale_mips.cc
@@ -31,7 +31,6 @@ void ScaleRowDown2_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
    "beqz           $t9, 2f                        \n"
    " nop                                          \n"

-    ".p2align       2                              \n"
  "1:                                              \n"
    "lw             $t0, 0(%[src_ptr])             \n"  // |3|2|1|0|
    "lw             $t1, 4(%[src_ptr])             \n"  // |7|6|5|4|
@@ -90,7 +89,6 @@ void ScaleRowDown2Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
    "bltz           $t9, 2f                       \n"
    " nop                                         \n"

-    ".p2align       2                             \n"
  "1:                                             \n"
    "lw             $t0, 0(%[src_ptr])            \n"  // |3|2|1|0|
    "lw             $t1, 4(%[src_ptr])            \n"  // |7|6|5|4|
@@ -188,7 +186,6 @@ void ScaleRowDown4_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
      "beqz           $t9, 2f                       \n"
      " nop                                         \n"

-      ".p2align       2                             \n"
     "1:                                            \n"
      "lw             $t1, 0(%[src_ptr])            \n"  // |3|2|1|0|
      "lw             $t2, 4(%[src_ptr])            \n"  // |7|6|5|4|
@@ -248,7 +245,6 @@ void ScaleRowDown4Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
      "srl           $t9, %[dst_width], 1         \n"
      "andi          $t8, %[dst_width], 1         \n"

-      ".p2align      2                            \n"
     "1:                                          \n"
      "lw            $t0, 0(%[src_ptr])           \n"  // |3|2|1|0|
      "lw            $t1, 0(%[s1])                \n"  // |7|6|5|4|
@@ -319,7 +315,6 @@ void ScaleRowDown34_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
  __asm__ __volatile__ (
      ".set push                                          \n"
      ".set noreorder                                     \n"
-      ".p2align        2                                  \n"
    "1:                                                   \n"
      "lw              $t1, 0(%[src_ptr])                 \n"  // |3|2|1|0|
      "lw              $t2, 4(%[src_ptr])                 \n"  // |7|6|5|4|
@@ -368,7 +363,6 @@ void ScaleRowDown34_0_Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
      ".set noreorder                                    \n"
      "repl.ph           $t3, 3                          \n"  // 0x00030003

-     ".p2align           2                               \n"
    "1:                                                  \n"
      "lw                $t0, 0(%[src_ptr])              \n"  // |S3|S2|S1|S0|
      "lwx               $t1, %[src_stride](%[src_ptr])  \n"  // |T3|T2|T1|T0|
@@ -425,7 +419,6 @@ void ScaleRowDown34_1_Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
      ".set noreorder                                      \n"
      "repl.ph           $t2, 3                            \n"  // 0x00030003

-      ".p2align          2                                 \n"
    "1:                                                    \n"
      "lw                $t0, 0(%[src_ptr])                \n"  // |S3|S2|S1|S0|
      "lwx               $t1, %[src_stride](%[src_ptr])    \n"  // |T3|T2|T1|T0|
@@ -477,7 +470,6 @@ void ScaleRowDown38_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
      ".set push                                     \n"
      ".set noreorder                                \n"

-      ".p2align   2                                  \n"
    "1:                                              \n"
      "lw         $t0, 0(%[src_ptr])                 \n"  // |3|2|1|0|
      "lw         $t1, 4(%[src_ptr])                 \n"  // |7|6|5|4|
@@ -528,7 +520,6 @@ void ScaleRowDown38_2_Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
      ".set push                                         \n"
      ".set noreorder                                    \n"

-      ".p2align        2                                 \n"
    "1:                                                  \n"
      "lw              $t0, 0(%[src_ptr])                \n"  // |S3|S2|S1|S0|
      "lw              $t1, 4(%[src_ptr])                \n"  // |S7|S6|S5|S4|
@@ -586,7 +577,6 @@ void ScaleRowDown38_3_Box_MIPS_DSPR2(const uint8* src_ptr,
      ".set push                                         \n"
      ".set noreorder                                    \n"

-      ".p2align        2                                 \n"
    "1:                                                  \n"
      "lw              $t0, 0(%[src_ptr])                \n"  // |S3|S2|S1|S0|
      "lw              $t1, 4(%[src_ptr])                \n"  // |S7|S6|S5|S4|

--- a/source/scale_neon.cc
+++ b/source/scale_neon.cc
@@ -26,7 +26,6 @@ extern "C" {
 void ScaleRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
                        uint8* dst, int dst_width) {
  asm volatile (
-    ".p2align   2                              \n"
  "1:                                          \n"
    // load even pixels into q0, odd into q1
    MEMACCESS(0)
@@ -47,7 +46,6 @@ void ScaleRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
 void ScaleRowDown2Linear_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
                           uint8* dst, int dst_width) {
  asm volatile (
-    ".p2align   2                              \n"
  "1:                                          \n"
    MEMACCESS(0)
    "vld1.8     {q0, q1}, [%0]!                \n"  // load pixels and post inc
@@ -73,7 +71,6 @@ void ScaleRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
  asm volatile (
    // change the stride to row 2 pointer
    "add        %1, %0                         \n"
-    ".p2align   2                              \n"
  "1:                                          \n"
    MEMACCESS(0)
    "vld1.8     {q0, q1}, [%0]!                \n"  // load row 1 and post inc
@@ -101,7 +98,6 @@ void ScaleRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
 void ScaleRowDown4_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
                        uint8* dst_ptr, int dst_width) {
  asm volatile (
-    ".p2align   2                              \n"
  "1:                                          \n"
    MEMACCESS(0)
    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n" // src line 0
@@ -123,7 +119,6 @@ void ScaleRowDown4Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
  const uint8* src_ptr2 = src_ptr + src_stride * 2;
  const uint8* src_ptr3 = src_ptr + src_stride * 3;
 asm volatile (
-    ".p2align   2                              \n"
  "1:                                          \n"
    MEMACCESS(0)
    "vld1.8     {q0}, [%0]!                    \n"   // load up 16x4
@@ -162,7 +157,6 @@ void ScaleRowDown34_NEON(const uint8* src_ptr,
                         ptrdiff_t src_stride,
                         uint8* dst_ptr, int dst_width) {
  asm volatile (
-    ".p2align   2                              \n"
  "1:                                          \n"
    MEMACCESS(0)
    "vld4.8     {d0, d1, d2, d3}, [%0]!      \n" // src line 0
@@ -185,7 +179,6 @@ void ScaleRowDown34_0_Box_NEON(const uint8* src_ptr,
  asm volatile (
    "vmov.u8    d24, #3                        \n"
    "add        %3, %0                         \n"
-    ".p2align   2                              \n"
  "1:                                          \n"
    MEMACCESS(0)
    "vld4.8       {d0, d1, d2, d3}, [%0]!      \n" // src line 0
@@ -245,7 +238,6 @@ void ScaleRowDown34_1_Box_NEON(const uint8* src_ptr,
  asm volatile (
    "vmov.u8    d24, #3                        \n"
    "add        %3, %0                         \n"
-    ".p2align   2                              \n"
  "1:                                          \n"
    MEMACCESS(0)
    "vld4.8       {d0, d1, d2, d3}, [%0]!      \n" // src line 0
@@ -300,7 +292,6 @@ void ScaleRowDown38_NEON(const uint8* src_ptr,
  asm volatile (
    MEMACCESS(3)
    "vld1.8     {q3}, [%3]                     \n"
-    ".p2align   2                              \n"
  "1:                                          \n"
    MEMACCESS(0)
    "vld1.8     {d0, d1, d2, d3}, [%0]!        \n"
@@ -334,7 +325,6 @@ void OMITFP ScaleRowDown38_3_Box_NEON(const uint8* src_ptr,
    MEMACCESS(7)
    "vld1.8     {q15}, [%7]                    \n"
    "add        %3, %0                         \n"
-    ".p2align   2                              \n"
  "1:                                          \n"

    // d0 = 00 40 01 41 02 42 03 43
@@ -450,7 +440,6 @@ void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr,
    MEMACCESS(5)
    "vld1.8     {q14}, [%5]                    \n"
    "add        %3, %0                         \n"
-    ".p2align   2                              \n"
  "1:                                          \n"

    // d0 = 00 40 01 41 02 42 03 43
@@ -545,7 +534,6 @@ void ScaleAddRows_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
                    uint16* dst_ptr, int src_width, int src_height) {
  const uint8* src_tmp = NULL;
  asm volatile (
-    ".p2align   2                              \n"
  "1:                                          \n"
    "mov       %0, %1                          \n"
    "mov       r12, %5                         \n"
@@ -590,7 +578,6 @@ void ScaleFilterCols_NEON(uint8* dst_ptr, const uint8* src_ptr,
  int* tmp = dx_offset;
  const uint8* src_tmp = src_ptr;
  asm volatile (
-    ".p2align   2                              \n"
    "vdup.32    q0, %3                         \n"  // x
    "vdup.32    q1, %4                         \n"  // dx
    "vld1.32    {q2}, [%5]                     \n"  // 0 1 2 3
@@ -749,7 +736,6 @@ void ScaleFilterRows_NEON(uint8* dst_ptr,
 void ScaleARGBRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
                            uint8* dst, int dst_width) {
  asm volatile (
-    ".p2align   2                              \n"
  "1:                                          \n"
    // load even pixels into q0, odd into q1
    MEMACCESS(0)
@@ -773,7 +759,6 @@ void ScaleARGBRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
 void ScaleARGBRowDown2Linear_NEON(const uint8* src_argb, ptrdiff_t src_stride,
                                  uint8* dst_argb, int dst_width) {
  asm volatile (
-    ".p2align   2                              \n"
  "1:                                          \n"
    MEMACCESS(0)
    "vld4.8     {d0, d2, d4, d6}, [%0]!        \n"  // load 8 ARGB pixels.
@@ -804,7 +789,6 @@ void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
  asm volatile (
    // change the stride to row 2 pointer
    "add        %1, %1, %0                     \n"
-    ".p2align   2                              \n"
  "1:                                          \n"
    MEMACCESS(0)
    "vld4.8     {d0, d2, d4, d6}, [%0]!        \n"  // load 8 ARGB pixels.
@@ -845,7 +829,6 @@ void ScaleARGBRowDownEven_NEON(const uint8* src_argb,  ptrdiff_t src_stride,
                               int src_stepx, uint8* dst_argb, int dst_width) {
  asm volatile (
    "mov        r12, %3, lsl #2                \n"
-    ".p2align   2                              \n"
  "1:                                          \n"
    MEMACCESS(0)
    "vld1.32    {d0[0]}, [%0], r12             \n"
@@ -875,7 +858,6 @@ void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb, ptrdiff_t src_stride,
  asm volatile (
    "mov        r12, %4, lsl #2                \n"
    "add        %1, %1, %0                     \n"
-    ".p2align   2                              \n"
  "1:                                          \n"
    MEMACCESS(0)
    "vld1.8     {d0}, [%0], r12                \n"  // Read 4 2x2 blocks -> 2x1
@@ -930,7 +912,6 @@ void ScaleARGBCols_NEON(uint8* dst_argb, const uint8* src_argb,
  int tmp = 0;
  const uint8* src_tmp = src_argb;
  asm volatile (
-    ".p2align   2                              \n"
  "1:                                          \n"
    LOAD1_DATA32_LANE(d0, 0)
    LOAD1_DATA32_LANE(d0, 1)
@@ -974,7 +955,6 @@ void ScaleARGBFilterCols_NEON(uint8* dst_argb, const uint8* src_argb,
  int* tmp = dx_offset;
  const uint8* src_tmp = src_argb;
  asm volatile (
-    ".p2align   2                              \n"
    "vdup.32    q0, %3                         \n"  // x
    "vdup.32    q1, %4                         \n"  // dx
    "vld1.32    {q2}, [%5]                     \n"  // 0 1 2 3