Commit f6e11399 authored by nfullagar@google.com

Port scale_argb functions to Native Client.

BUG=253
TEST=libyuv_unittest,ncval,trybots
R=fbarchard@google.com

Review URL: https://webrtc-codereview.appspot.com/2054006

git-svn-id: http://libyuv.googlecode.com/svn/trunk@766 16f28f9a-4ce2-e073-06de-1de4eb20be90
parent c140b9d1
 Name: libyuv
 URL: http://code.google.com/p/libyuv/
-Version: 765
+Version: 766
 License: BSD
 License File: LICENSE
@@ -11,6 +11,6 @@
 #ifndef INCLUDE_LIBYUV_VERSION_H_  // NOLINT
 #define INCLUDE_LIBYUV_VERSION_H_
-#define LIBYUV_VERSION 765
+#define LIBYUV_VERSION 766
 #endif  // INCLUDE_LIBYUV_VERSION_H_  NOLINT
@@ -346,24 +346,63 @@ static void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb,
 }
 #elif !defined(LIBYUV_DISABLE_X86) && \
-    ((defined(__x86_64__) && !defined(__native_client__)) || defined(__i386__))
-// GCC versions of row functions are verbatim conversions from Visual C.
+    (defined(__x86_64__) || defined(__i386__))
+// TODO(nfullagar): For Native Client: When new toolchain becomes available,
+// take advantage of bundle lock / unlock feature. This will reduce the amount
+// of manual bundle alignment done below, and bundle alignment could even be
+// moved into each macro that doesn't use %%nacl: such as MEMOPREG.
+#if defined(__native_client__) && defined(__x86_64__)
+#define MEMACCESS(base) "%%nacl:(%%r15,%q" #base ")"
+#define MEMACCESS2(offset, base) "%%nacl:" #offset "(%%r15,%q" #base ")"
+#define MEMLEA(offset, base) #offset "(%q" #base ")"
+#define MEMLEA3(offset, index, scale) \
+    #offset "(,%q" #index "," #scale ")"
+#define MEMLEA4(offset, base, index, scale) \
+    #offset "(%q" #base ",%q" #index "," #scale ")"
+#define MEMOPREG(opcode, offset, base, index, scale, reg) \
+    "lea " #offset "(%q" #base ",%q" #index "," #scale "),%%r14d\n" \
+    #opcode " (%%r15,%%r14),%%" #reg "\n"
+#define MEMOPMEM(opcode, reg, offset, base, index, scale) \
+    "lea " #offset "(%q" #base ",%q" #index "," #scale "),%%r14d\n" \
+    #opcode " %%" #reg ",(%%r15,%%r14)\n"
+#define BUNDLEALIGN ".p2align 5 \n"
+#else
+#define MEMACCESS(base) "(%" #base ")"
+#define MEMACCESS2(offset, base) #offset "(%" #base ")"
+#define MEMLEA(offset, base) #offset "(%" #base ")"
+#define MEMLEA3(offset, index, scale) \
+    #offset "(,%" #index "," #scale ")"
+#define MEMLEA4(offset, base, index, scale) \
+    #offset "(%" #base ",%" #index "," #scale ")"
+#define MEMOPREG(opcode, offset, base, index, scale, reg) \
+    #opcode " " #offset "(%" #base ",%" #index "," #scale "),%%" #reg "\n"
+#define MEMOPMEM(opcode, reg, offset, base, index, scale) \
+    #opcode " %%" #reg ","#offset "(%" #base ",%" #index "," #scale ")\n"
+#define BUNDLEALIGN
+#endif
+// GCC versions of row functions are verbatim conversions from Visual C,
+// with some additional macro injection for Native Client (see row_posix.cc
+// for more details.)
 // Generated using gcc disassembly on Visual C object file:
 // objdump -D yuvscaler.obj >yuvscaler.txt
 #define HAS_SCALEARGBROWDOWN2_SSE2
 static void ScaleARGBRowDown2_SSE2(const uint8* src_argb,
                                    ptrdiff_t /* src_stride */,
                                    uint8* dst_argb, int dst_width) {
   asm volatile (
     ".p2align 4 \n"
+    BUNDLEALIGN
     "1: \n"
-    "movdqa (%0),%%xmm0 \n"
-    "movdqa 0x10(%0),%%xmm1 \n"
-    "lea 0x20(%0),%0 \n"
+    "movdqa "MEMACCESS(0)",%%xmm0 \n"
+    "movdqa "MEMACCESS2(0x10,0)",%%xmm1 \n"
+    "lea "MEMLEA(0x20,0)",%0 \n"
     "shufps $0xdd,%%xmm1,%%xmm0 \n"
     "sub $0x4,%2 \n"
-    "movdqa %%xmm0,(%1) \n"
-    "lea 0x10(%1),%1 \n"
+    "movdqa %%xmm0,"MEMACCESS(1)" \n"
+    "lea "MEMLEA(0x10,1)",%1 \n"
     "jg 1b \n"
   : "+r"(src_argb),   // %0
     "+r"(dst_argb),   // %1
@@ -381,12 +420,14 @@ static void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb,
                                       uint8* dst_argb, int dst_width) {
   asm volatile (
     ".p2align 4 \n"
+    BUNDLEALIGN
     "1: \n"
-    "movdqa (%0),%%xmm0 \n"
-    "movdqa 0x10(%0),%%xmm1 \n"
-    "movdqa (%0,%3,1),%%xmm2 \n"
-    "movdqa 0x10(%0,%3,1),%%xmm3 \n"
-    "lea 0x20(%0),%0 \n"
+    "movdqa "MEMACCESS(0)",%%xmm0 \n"
+    "movdqa "MEMACCESS2(0x10,0)",%%xmm1 \n"
+    BUNDLEALIGN
+    MEMOPREG(movdqa,0x00,0,3,1,xmm2)  // movdqa (%0,%3,1),%%xmm2
+    MEMOPREG(movdqa,0x10,0,3,1,xmm3)  // movdqa 0x10(%0,%3,1),%%xmm3
+    "lea "MEMLEA(0x20,0)",%0 \n"
     "pavgb %%xmm2,%%xmm0 \n"
     "pavgb %%xmm3,%%xmm1 \n"
     "movdqa %%xmm0,%%xmm2 \n"
@@ -394,14 +435,17 @@ static void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb,
     "shufps $0xdd,%%xmm1,%%xmm2 \n"
     "pavgb %%xmm2,%%xmm0 \n"
     "sub $0x4,%2 \n"
-    "movdqa %%xmm0,(%1) \n"
-    "lea 0x10(%1),%1 \n"
+    "movdqa %%xmm0,"MEMACCESS(1)" \n"
+    "lea "MEMLEA(0x10,1)",%1 \n"
     "jg 1b \n"
   : "+r"(src_argb),   // %0
     "+r"(dst_argb),   // %1
     "+r"(dst_width)   // %2
   : "r"(static_cast<intptr_t>(src_stride))  // %3
   : "memory", "cc"
+#if defined(__native_client__) && defined(__x86_64__)
+    , "r14"
+#endif
 #if defined(__SSE2__)
     , "xmm0", "xmm1", "xmm2", "xmm3"
 #endif
@@ -417,21 +461,23 @@ void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
   intptr_t src_stepx_x4 = static_cast<intptr_t>(src_stepx);
   intptr_t src_stepx_x12 = 0;
   asm volatile (
-    "lea 0x0(,%1,4),%1 \n"
-    "lea (%1,%1,2),%4 \n"
+    "lea "MEMLEA3(0x00,1,4)",%1 \n"
+    "lea "MEMLEA4(0x00,1,1,2)",%4 \n"
     ".p2align 4 \n"
+    BUNDLEALIGN
     "1: \n"
-    "movd (%0),%%xmm0 \n"
-    "movd (%0,%1,1),%%xmm1 \n"
+    "movd "MEMACCESS(0)",%%xmm0 \n"
+    MEMOPREG(movd,0x00,0,1,1,xmm1)  // movd (%0,%1,1),%%xmm1
     "punpckldq %%xmm1,%%xmm0 \n"
-    "movd (%0,%1,2),%%xmm2 \n"
-    "movd (%0,%4,1),%%xmm3 \n"
-    "lea (%0,%1,4),%0 \n"
+    BUNDLEALIGN
+    MEMOPREG(movd,0x00,0,1,2,xmm2)  // movd (%0,%1,2),%%xmm2
+    MEMOPREG(movd,0x00,0,4,1,xmm3)  // movd (%0,%4,1),%%xmm3
+    "lea "MEMLEA4(0x00,0,1,4)",%0 \n"
     "punpckldq %%xmm3,%%xmm2 \n"
     "punpcklqdq %%xmm2,%%xmm0 \n"
     "sub $0x4,%3 \n"
-    "movdqa %%xmm0,(%2) \n"
-    "lea 0x10(%2),%2 \n"
+    "movdqa %%xmm0,"MEMACCESS(2)" \n"
+    "lea "MEMLEA(0x10,2)",%2 \n"
     "jg 1b \n"
   : "+r"(src_argb),       // %0
     "+r"(src_stepx_x4),   // %1
@@ -440,6 +486,9 @@ void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
     "+r"(src_stepx_x12)   // %4
   :
   : "memory", "cc"
+#if defined(__native_client__) && defined(__x86_64__)
+    , "r14"
+#endif
 #if defined(__SSE2__)
     , "xmm0", "xmm1", "xmm2", "xmm3"
 #endif
@@ -455,21 +504,23 @@ static void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb,
   intptr_t src_stepx_x12 = 0;
   intptr_t row1 = static_cast<intptr_t>(src_stride);
   asm volatile (
-    "lea 0x0(,%1,4),%1 \n"
-    "lea (%1,%1,2),%4 \n"
-    "lea (%0,%5,1),%5 \n"
+    "lea "MEMLEA3(0x00,1,4)",%1 \n"
+    "lea "MEMLEA4(0x00,1,1,2)",%4 \n"
+    "lea "MEMLEA4(0x00,0,5,1)",%5 \n"
     ".p2align 4 \n"
+    BUNDLEALIGN
     "1: \n"
-    "movq (%0),%%xmm0 \n"
-    "movhps (%0,%1,1),%%xmm0 \n"
-    "movq (%0,%1,2),%%xmm1 \n"
-    "movhps (%0,%4,1),%%xmm1 \n"
-    "lea (%0,%1,4),%0 \n"
-    "movq (%5),%%xmm2 \n"
-    "movhps (%5,%1,1),%%xmm2 \n"
-    "movq (%5,%1,2),%%xmm3 \n"
-    "movhps (%5,%4,1),%%xmm3 \n"
-    "lea (%5,%1,4),%5 \n"
+    "movq "MEMACCESS(0)",%%xmm0 \n"
+    MEMOPREG(movhps,0x00,0,1,1,xmm0)  // movhps (%0,%1,1),%%xmm0
+    MEMOPREG(movq,0x00,0,1,2,xmm1)    // movq (%0,%1,2),%%xmm1
+    MEMOPREG(movhps,0x00,0,4,1,xmm1)  // movhps (%0,%4,1),%%xmm1
+    "lea "MEMLEA4(0x00,0,1,4)",%0 \n"
+    "movq "MEMACCESS(5)",%%xmm2 \n"
+    BUNDLEALIGN
+    MEMOPREG(movhps,0x00,5,1,1,xmm2)  // movhps (%5,%1,1),%%xmm2
+    MEMOPREG(movq,0x00,5,1,2,xmm3)    // movq (%5,%1,2),%%xmm3
+    MEMOPREG(movhps,0x00,5,4,1,xmm3)  // movhps (%5,%4,1),%%xmm3
+    "lea "MEMLEA4(0x00,5,1,4)",%5 \n"
     "pavgb %%xmm2,%%xmm0 \n"
     "pavgb %%xmm3,%%xmm1 \n"
     "movdqa %%xmm0,%%xmm2 \n"
@@ -477,8 +528,8 @@ static void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb,
     "shufps $0xdd,%%xmm1,%%xmm2 \n"
     "pavgb %%xmm2,%%xmm0 \n"
     "sub $0x4,%3 \n"
-    "movdqa %%xmm0,(%2) \n"
-    "lea 0x10(%2),%2 \n"
+    "movdqa %%xmm0,"MEMACCESS(2)" \n"
+    "lea "MEMLEA(0x10,2)",%2 \n"
     "jg 1b \n"
   : "+r"(src_argb),       // %0
     "+r"(src_stepx_x4),   // %1
@@ -488,6 +539,9 @@ static void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb,
     "+r"(row1)            // %5
   :
   : "memory", "cc"
+#if defined(__native_client__) && defined(__x86_64__)
+    , "r14"
+#endif
 #if defined(__SSE2__)
     , "xmm0", "xmm1", "xmm2", "xmm3"
 #endif
@@ -514,15 +568,16 @@ static void ScaleARGBCols_SSE2(uint8* dst_argb, const uint8* src_argb,
     "pextrw $0x3,%%xmm2,%k4 \n"
     ".p2align 5 \n"
+    BUNDLEALIGN
     "2: \n"
     "paddd %%xmm3,%%xmm2 \n"
-    "movd (%1,%3,4),%%xmm0 \n"
-    "movd (%1,%4,4),%%xmm1 \n"
+    MEMOPREG(movd,0x00,1,3,4,xmm0)  // movd (%1,%3,4),%%xmm0
+    MEMOPREG(movd,0x00,1,4,4,xmm1)  // movd (%1,%4,4),%%xmm1
     "punpckldq %%xmm1,%%xmm0 \n"
     "pextrw $0x1,%%xmm2,%k3 \n"
     "pextrw $0x3,%%xmm2,%k4 \n"
-    "movq %%xmm0,(%0) \n"
-    "lea 0x8(%0),%0 \n"
+    "movq %%xmm0,"MEMACCESS(0)" \n"
+    "lea "MEMLEA(0x8,0)",%0 \n"
     "sub $0x2,%2 \n"
     "jge 2b \n"
@@ -530,8 +585,9 @@ static void ScaleARGBCols_SSE2(uint8* dst_argb, const uint8* src_argb,
     "29: \n"
     "add $0x1,%2 \n"
     "jl 99f \n"
-    "movd (%1,%3,4),%%xmm0 \n"
-    "movd %%xmm0,(%0) \n"
+    BUNDLEALIGN
+    MEMOPREG(movd,0x00,1,3,4,xmm0)  // movd (%1,%3,4),%%xmm0
+    "movd %%xmm0,"MEMACCESS(0)" \n"
     ".p2align 5 \n"
     "99: \n"
@@ -543,6 +599,9 @@ static void ScaleARGBCols_SSE2(uint8* dst_argb, const uint8* src_argb,
   : "rm"(x),   // %5
     "rm"(dx)   // %6
   : "memory", "cc"
+#if defined(__native_client__) && defined(__x86_64__)
+    , "r14"
+#endif
 #if defined(__SSE2__)
     , "xmm0", "xmm1", "xmm2", "xmm3"
 #endif
@@ -589,12 +648,14 @@ static void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb,
     "pextrw $0x3,%%xmm2,%k4 \n"
     ".p2align 4 \n"
+    BUNDLEALIGN
     "2: \n"
     "movdqa %%xmm2,%%xmm1 \n"
     "paddd %%xmm3,%%xmm2 \n"
-    "movq (%1,%3,4),%%xmm0 \n"
+    MEMOPREG(movq,0x00,1,3,4,xmm0)  // movq (%1,%3,4),%%xmm0
     "psrlw $0x9,%%xmm1 \n"
-    "movhps (%1,%4,4),%%xmm0 \n"
+    BUNDLEALIGN
+    MEMOPREG(movhps,0x00,1,4,4,xmm0)  // movhps (%1,%4,4),%%xmm0
     "pshufb %%xmm5,%%xmm1 \n"
     "pshufb %%xmm4,%%xmm0 \n"
     "pxor %%xmm6,%%xmm1 \n"
@@ -603,24 +664,26 @@ static void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb,
     "pextrw $0x1,%%xmm2,%k3 \n"
     "pextrw $0x3,%%xmm2,%k4 \n"
     "packuswb %%xmm0,%%xmm0 \n"
-    "movq %%xmm0,(%0) \n"
-    "lea 0x8(%0),%0 \n"
+    "movq %%xmm0,"MEMACCESS(0)" \n"
+    "lea "MEMLEA(0x8,0)",%0 \n"
     "sub $0x2,%2 \n"
     "jge 2b \n"
     ".p2align 4 \n"
+    BUNDLEALIGN
     "29: \n"
     "add $0x1,%2 \n"
     "jl 99f \n"
     "psrlw $0x9,%%xmm2 \n"
-    "movq (%1,%3,4),%%xmm0 \n"
+    BUNDLEALIGN
+    MEMOPREG(movq,0x00,1,3,4,xmm0)  // movq (%1,%3,4),%%xmm0
     "pshufb %%xmm5,%%xmm2 \n"
     "pshufb %%xmm4,%%xmm0 \n"
     "pxor %%xmm6,%%xmm2 \n"
     "pmaddubsw %%xmm2,%%xmm0 \n"
     "psrlw $0x7,%%xmm0 \n"
     "packuswb %%xmm0,%%xmm0 \n"
-    "movd %%xmm0,(%0) \n"
+    "movd %%xmm0,"MEMACCESS(0)" \n"
     ".p2align 4 \n"
     "99: \n"
@@ -632,6 +695,9 @@ static void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb,
   : "rm"(x),   // %5
     "rm"(dx)   // %6
   : "memory", "cc"
+#if defined(__native_client__) && defined(__x86_64__)
+    , "r14"
+#endif
 #if defined(__SSE2__)
     , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
 #endif