Unattenuate port to NaCL

BUG=253 TESTED=validator R=nfullagar@google.com Review URL: https://webrtc-codereview.appspot.com/2038004 git-svn-id: http://libyuv.googlecode.com/svn/trunk@763 16f28f9a-4ce2-e073-06de-1de4eb20be90

Unattenuate port to NaCL
BUG=253 TESTED=validator R=nfullagar@google.com Review URL: https://webrtc-codereview.appspot.com/2038004 git-svn-id: http://libyuv.googlecode.com/svn/trunk@763 16f28f9a-4ce2-e073-06de-1de4eb20be90
bb5ea8e4 · fbarchard@google.com · f78509b3 · bb5ea8e4 · bb5ea8e4 · bb5ea8e4
Commit bb5ea8e4 authored Aug 15, 2013 by fbarchard@google.com
6 changed files
--- a/README.chromium
+++ b/README.chromium
 Name: libyuv
 URL: http://code.google.com/p/libyuv/
-Version: 762
+Version: 763
 License: BSD
 License File: LICENSE

--- a/include/libyuv/row.h
+++ b/include/libyuv/row.h
@@ -51,6 +51,7 @@ extern "C" {
 #define HAS_ARGBSEPIAROW_SSSE3
 #define HAS_ARGBSHADEROW_SSE2
 #define HAS_ARGBSUBTRACTROW_SSE2
+#define HAS_ARGBUNATTENUATEROW_SSE2
 #define HAS_COMPUTECUMULATIVESUMROW_SSE2
 #define HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
 #define HAS_INTERPOLATEROW_SSE2
@@ -130,7 +131,6 @@ extern "C" {
 #define HAS_YUY2TOYROW_SSE2
 // Effects:
-#define HAS_ARGBUNATTENUATEROW_SSE2
 #define HAS_SOBELROW_SSE2
 #define HAS_SOBELXROW_SSSE3
 #define HAS_SOBELXYROW_SSE2
@@ -1413,7 +1413,7 @@ void ARGBAttenuateRow_Any_NEON(const uint8* src_argb, uint8* dst_argb,
                               int width);
 // Inverse table for unattenuate, shared by C and SSE2.
-extern uint32 fixed_invtbl8[256];
+extern const uint32 fixed_invtbl8[256];
 void ARGBUnattenuateRow_C(const uint8* src_argb, uint8* dst_argb, int width);
 void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width);
 void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width);

--- a/include/libyuv/version.h
+++ b/include/libyuv/version.h
@@ -11,6 +11,6 @@
 #ifndef INCLUDE_LIBYUV_VERSION_H_  // NOLINT
 #define INCLUDE_LIBYUV_VERSION_H_
-#define LIBYUV_VERSION 762
+#define LIBYUV_VERSION 763
 #endif  // INCLUDE_LIBYUV_VERSION_H_  NOLINT
--- a/source/row_common.cc
+++ b/source/row_common.cc
@@ -1688,7 +1688,7 @@ void ARGBAttenuateRow_C(const uint8* src_argb, uint8* dst_argb, int width) {
 // Reciprocal method is off by 1 on some values. ie 125
 // 8.8 fixed point inverse table with 1.0 in upper short and 1 / a in lower.
 #define T(a) 0x01000000 + (0x10000 / a)
-uint32 fixed_invtbl8[256] = {
+const uint32 fixed_invtbl8[256] = {
  0x01000000, 0x0100ffff, T(0x02), T(0x03), T(0x04), T(0x05), T(0x06), T(0x07),
  T(0x08), T(0x09), T(0x0a), T(0x0b), T(0x0c), T(0x0d), T(0x0e), T(0x0f),
  T(0x10), T(0x11), T(0x12), T(0x13), T(0x14), T(0x15), T(0x16), T(0x17),

--- a/source/row_posix.cc
+++ b/source/row_posix.cc
@@ -3920,35 +3920,34 @@ void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb,
                             int width) {
  uintptr_t alpha = 0;
  asm volatile (
-    "sub       %0,%1                           \n"
    // 4 pixel loop.
    ".p2align  4                               \n"
  "1:                                          \n"
-    "movdqa    (%0),%%xmm0                     \n"
+    "movdqa    "MEMACCESS(0)",%%xmm0           \n"
-    "movzb     0x3(%0),%3                      \n"
+    "movzb     "MEMACCESS2(0x03,0)",%3         \n"
    "punpcklbw %%xmm0,%%xmm0                   \n"
-    "movd      0x0(%4,%3,4),%%xmm2             \n"
+    MEMOPREG(movd,0x00,4,3,4,xmm2)             //  movd      0x0(%4,%3,4),%%xmm2
-    "movzb     0x7(%0),%3                      \n"
+    "movzb     "MEMACCESS2(0x07,0)",%3         \n"
-    "movd      0x0(%4,%3,4),%%xmm3             \n"
+    MEMOPREG(movd,0x00,4,3,4,xmm3)             //  movd      0x0(%4,%3,4),%%xmm3
    "pshuflw   $0x40,%%xmm2,%%xmm2             \n"
    "pshuflw   $0x40,%%xmm3,%%xmm3             \n"
    "movlhps   %%xmm3,%%xmm2                   \n"
    "pmulhuw   %%xmm2,%%xmm0                   \n"
-    "movdqa    (%0),%%xmm1                     \n"
+    "movdqa    "MEMACCESS(0)",%%xmm1           \n"
-    "movzb     0xb(%0),%3                      \n"
+    "movzb     "MEMACCESS2(0x0b,0)",%3         \n"
    "punpckhbw %%xmm1,%%xmm1                   \n"
-    "movd      0x0(%4,%3,4),%%xmm2             \n"
+    MEMOPREG(movd,0x00,4,3,4,xmm2)             //  movd      0x0(%4,%3,4),%%xmm2
-    "movzb     0xf(%0),%3                      \n"
+    "movzb     "MEMACCESS2(0x0f,0)",%3         \n"
-    "movd      0x0(%4,%3,4),%%xmm3             \n"
+    MEMOPREG(movd,0x00,4,3,4,xmm3)             //  movd      0x0(%4,%3,4),%%xmm3
    "pshuflw   $0x40,%%xmm2,%%xmm2             \n"
    "pshuflw   $0x40,%%xmm3,%%xmm3             \n"
    "movlhps   %%xmm3,%%xmm2                   \n"
    "pmulhuw   %%xmm2,%%xmm1                   \n"
+    "lea       "MEMLEA(0x10,0)",%0             \n"
    "packuswb  %%xmm1,%%xmm0                   \n"
    "sub       $0x4,%2                         \n"
-    "movdqa    %%xmm0,(%0,%1,1)                \n"
+    "movdqa    %%xmm0,"MEMACCESS(1)"           \n"
-    "lea       0x10(%0),%0                     \n"
+    "lea       "MEMLEA(0x10,1)",%1             \n"
    "jg        1b                              \n"
  : "+r"(src_argb),    // %0
    "+r"(dst_argb),    // %1
@@ -3956,6 +3955,9 @@ void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb,
    "+r"(alpha)        // %3
  : "r"(fixed_invtbl8)  // %4
  : "memory", "cc"
+#if defined(__native_client__) && defined(__x86_64__)
+    , "r14"
+#endif
 #if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
 #endif

--- a/source/row_win.cc
+++ b/source/row_win.cc
@@ -4765,7 +4765,6 @@ void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb,
    mov        eax, [esp + 8 + 4]   // src_argb0
    mov        edx, [esp + 8 + 8]   // dst_argb
    mov        ecx, [esp + 8 + 12]  // width
-    sub        edx, eax
    align      16
 convertloop:
@@ -4790,11 +4789,12 @@ void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb,
    pshuflw    xmm3, xmm3, 040h // next 4 inv_alpha words
    movlhps    xmm2, xmm3
    pmulhuw    xmm1, xmm2       // rgb * a
+    lea        eax, [eax + 16]
    packuswb   xmm0, xmm1
    sub        ecx, 4
-    movdqa     [eax + edx], xmm0
+    movdqa     [edx], xmm0
-    lea        eax, [eax + 16]
+    lea        edx, [edx + 16]
    jg         convertloop
    pop        edi
    pop        esi