Commit e3cc7694 authored by fbarchard@google.com

4 pixel version of affine for gcc and aligned version of win.

BUG=none
TEST=none
Review URL: https://webrtc-codereview.appspot.com/714007

git-svn-id: http://libyuv.googlecode.com/svn/trunk@320 16f28f9a-4ce2-e073-06de-1de4eb20be90
parent 845e94d1
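For context, ARGBAffineRow_SSE2 steps a source (u, v) coordinate by (du, dv) for each destination pixel and copies one 4-byte ARGB pixel per step, with the source byte offset formed as x * 4 + y * stride (see the pmaddwd comment in the win diff below). A minimal scalar sketch of that behaviour, assuming the uv_dudv layout {u, v, du, dv} implied by the diff; the function name and the use of memcpy are illustrative, not libyuv's actual C reference:

#include <stdint.h>
#include <string.h>

// Illustrative scalar sketch of the work ARGBAffineRow_SSE2 vectorizes.
// uv_dudv = {u, v, du, dv}; each destination pixel reads the source pixel
// at byte offset x * 4 + y * stride, then steps (u, v) by (du, dv).
static void AffineRowSketch(const uint8_t* src_argb, int src_argb_stride,
                            uint8_t* dst_argb, const float* uv_dudv,
                            int width) {
  float u = uv_dudv[0];
  float v = uv_dudv[1];
  for (int i = 0; i < width; ++i) {
    int x = (int)u;  // truncate toward zero, as cvttps2dq does
    int y = (int)v;
    memcpy(dst_argb + i * 4, src_argb + y * src_argb_stride + x * 4, 4);
    u += uv_dudv[2];
    v += uv_dudv[3];
  }
}

The change below moves the gcc path from two to four of these pixels per loop iteration and keeps the win path at four, while switching its stores to aligned-size 8-byte writes.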
 Name: libyuv
 URL: http://code.google.com/p/libyuv/
-Version: 319
+Version: 320
 License: BSD
 License File: LICENSE
...
@@ -11,6 +11,6 @@
 #ifndef INCLUDE_LIBYUV_VERSION_H_  // NOLINT
 #define INCLUDE_LIBYUV_VERSION_H_
-#define LIBYUV_VERSION 319
+#define LIBYUV_VERSION 320
 #endif  // INCLUDE_LIBYUV_VERSION_H_  NOLINT
@@ -3220,61 +3220,91 @@ void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width,
 #endif  // HAS_ARGBSHADE_SSE2
 
 #ifdef HAS_ARGBAFFINEROW_SSE2
+// TODO(fbarchard): Find 64 bit way to avoid masking.
+// TODO(fbarchard): Investigate why 4 pixels is slower than 2 on Core2.
 // Copy ARGB pixels from source image with slope to a row of destination.
 void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
                         uint8* dst_argb, const float* uv_dudv, int width) {
   intptr_t src_argb_stride_temp = src_argb_stride;
+  intptr_t temp = 0;
   asm volatile (
     "movq (%3),%%xmm2 \n"
-    "movq 0x8(%3),%%xmm3 \n"
+    "movq 0x8(%3),%%xmm7 \n"
     "shl $0x10,%1 \n"
     "add $0x4,%1 \n"
-    "movd %1,%%xmm4 \n"
-    "xor %1,%1 \n"  // cleanse upper bits.
-    "sub $0x2,%4 \n"
-    "jl 29f \n"
+    "movd %1,%%xmm5 \n"
+    "sub $0x4,%4 \n"
+    "jl 49f \n"
+    "pshufd $0x44,%%xmm7,%%xmm7 \n"
+    "pshufd $0x0,%%xmm5,%%xmm5 \n"
     "movdqa %%xmm2,%%xmm0 \n"
-    "addps %%xmm3,%%xmm0 \n"
+    "addps %%xmm7,%%xmm0 \n"
     "movlhps %%xmm0,%%xmm2 \n"
-    "pshufd $0x0,%%xmm4,%%xmm4 \n"
-    "movlhps %%xmm3,%%xmm3 \n"
-    "addps %%xmm3,%%xmm3 \n"
-    "pshufd $0x0,%%xmm4,%%xmm4 \n"
+    "movdqa %%xmm7,%%xmm4 \n"
+    "addps %%xmm4,%%xmm4 \n"
+    "movdqa %%xmm2,%%xmm3 \n"
+    "addps %%xmm4,%%xmm3 \n"
+    "addps %%xmm4,%%xmm4 \n"
-  // 2 pixel loop \n"
-    ".p2align 2 \n"
-  "20: \n"
-    "cvttps2dq %%xmm2,%%xmm1 \n"
-    "packssdw %%xmm1,%%xmm1 \n"
-    "pmaddwd %%xmm4,%%xmm1 \n"
-    "addps %%xmm3,%%xmm2 \n"
-    "movd %%xmm1,%1 \n"
+  // 4 pixel loop \n"
+    ".p2align 4 \n"
+  "40: \n"
+    "cvttps2dq %%xmm2,%%xmm0 \n"
+    "cvttps2dq %%xmm3,%%xmm1 \n"
+    "packssdw %%xmm1,%%xmm0 \n"
+    "pmaddwd %%xmm5,%%xmm0 \n"
+#if defined(__x86_64__)
+    "movq %%xmm0,%1 \n"
+    "mov %1,%5 \n"
     "and $0x0fffffff,%1 \n"
-    "movdqa %%xmm1,%%xmm5 \n"
-    "pshufd $0x55,%%xmm5,%%xmm5 \n"
-    "movd (%0,%1,1),%%xmm0 \n"
-    "movd %%xmm5,%1 \n"
+    "shr $32,%5 \n"
+    "pshufd $0xEE,%%xmm0,%%xmm0 \n"
+#else
+    "movd %%xmm0,%1 \n"
+    "pshufd $0x39,%%xmm0,%%xmm0 \n"
+    "movd %%xmm0,%5 \n"
+    "pshufd $0x39,%%xmm0,%%xmm0 \n"
+#endif
+    "movd (%0,%1,1),%%xmm1 \n"
+    "movd (%0,%5,1),%%xmm6 \n"
+    "punpckldq %%xmm6,%%xmm1 \n"
+    "addps %%xmm4,%%xmm2 \n"
+    "movq %%xmm1,(%2) \n"
+#if defined(__x86_64__)
+    "movq %%xmm0,%1 \n"
+    "mov %1,%5 \n"
     "and $0x0fffffff,%1 \n"
-    "movd (%0,%1,1),%%xmm5 \n"
-    "punpckldq %%xmm5,%%xmm0 \n"
-    "sub $0x2,%4 \n"
-    "movq %%xmm0,(%2) \n"
-    "lea 0x8(%2),%2 \n"
-    "jge 20b \n"
-  "29: \n"
-    "add $0x1,%4 \n"
+    "shr $32,%5 \n"
+#else
+    "movd %%xmm0,%1 \n"
+    "pshufd $0x39,%%xmm0,%%xmm0 \n"
+    "movd %%xmm0,%5 \n"
+#endif
+    "movd (%0,%1,1),%%xmm0 \n"
+    "movd (%0,%5,1),%%xmm6 \n"
+    "punpckldq %%xmm6,%%xmm0 \n"
+    "addps %%xmm4,%%xmm3 \n"
+    "sub $0x4,%4 \n"
+    "movq %%xmm0,0x08(%2) \n"
+    "lea 0x10(%2),%2 \n"
+    "jge 40b \n"
+  "49: \n"
+    "add $0x3,%4 \n"
     "jl 19f \n"
   // 1 pixel loop \n"
-    ".p2align 2 \n"
+    ".p2align 4 \n"
   "10: \n"
-    "cvttps2dq %%xmm2,%%xmm1 \n"
-    "packssdw %%xmm1,%%xmm1 \n"
-    "pmaddwd %%xmm4,%%xmm1 \n"
-    "addps %%xmm3,%%xmm2 \n"
-    "movd %%xmm1,%1 \n"
+    "cvttps2dq %%xmm2,%%xmm0 \n"
+    "packssdw %%xmm0,%%xmm0 \n"
+    "pmaddwd %%xmm5,%%xmm0 \n"
+    "addps %%xmm7,%%xmm2 \n"
+    "movd %%xmm0,%1 \n"
+#if defined(__x86_64__)
     "and $0x0fffffff,%1 \n"
+#endif
     "movd (%0,%1,1),%%xmm0 \n"
     "sub $0x1,%4 \n"
     "movd %%xmm0,(%2) \n"
@@ -3285,11 +3315,12 @@ void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
     "+r"(src_argb_stride_temp),  // %1
     "+r"(dst_argb),  // %2
     "+r"(uv_dudv),  // %3
-    "+rm"(width)  // %4
+    "+rm"(width),  // %4
+    "+r"(temp)  // %5
   :
   : "memory", "cc"
 #if defined(__SSE2__)
-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
 #endif
   );
 }
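Both the gcc loops above and the win loop below compute the source offsets the same way: the truncated x and y coordinates are packed into 16-bit lanes (packssdw), and pmaddwd multiplies them against the constant built at the top of the function, (stride << 16) | 4, broadcast to every lane, yielding x * 4 + y * stride per pixel in a single instruction. A scalar sketch of that arithmetic (the function name is illustrative):

#include <stdint.h>

// What one pmaddwd lane pair computes: the packed constant holds the 16-bit
// pair {4, stride} (built as (stride << 16) | 4), the coordinate register
// holds {x, y}, and pmaddwd multiplies lane-wise and sums each pair into a
// 32-bit result.
static int32_t AffineOffsetSketch(int16_t x, int16_t y, int16_t stride) {
  return (int32_t)x * 4 + (int32_t)y * stride;  // byte offset into src_argb
}

This is also why the stride is shifted into the high 16 bits (shl $0x10 / shl esi, 16) before being moved into the xmm constant: the two halves of each 32-bit lane then line up with the packed {x, y} shorts.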
...
@@ -3354,13 +3354,14 @@ void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
                         uint8* dst_argb, const float* uv_dudv, int width) {
   __asm {
     push esi
-    mov eax, [esp + 8]  // src_argb
-    mov esi, [esp + 12]  // stride
-    mov edx, [esp + 16]  // dst_argb
-    mov ecx, [esp + 20]  // pointer to uv_dudv
+    push edi
+    mov eax, [esp + 12]  // src_argb
+    mov esi, [esp + 16]  // stride
+    mov edx, [esp + 20]  // dst_argb
+    mov ecx, [esp + 24]  // pointer to uv_dudv
     movq xmm2, qword ptr [ecx]  // uv
     movq xmm7, qword ptr [ecx + 8]  // dudv
-    mov ecx, [esp + 24]  // width
+    mov ecx, [esp + 28]  // width
     shl esi, 16  // 4, stride
     add esi, 4
     movd xmm5, esi
@@ -3386,24 +3387,24 @@ void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
     cvttps2dq xmm1, xmm3  // x, y float to int next 2
     packssdw xmm0, xmm1  // x, y as 8 shorts
     pmaddwd xmm0, xmm5  // offsets = x * 4 + y * stride.
-    addps xmm2, xmm4  // x, y += dx, dy first 2
-    addps xmm3, xmm4  // x, y += dx, dy next 2
     movd esi, xmm0
     pshufd xmm0, xmm0, 0x39  // shift right
-    movd xmm1, [eax + esi]  // read pixel 0
-    movd esi, xmm0
+    movd edi, xmm0
     pshufd xmm0, xmm0, 0x39  // shift right
-    movd xmm6, [eax + esi]  // read pixel 1
+    movd xmm1, [eax + esi]  // read pixel 0
+    movd xmm6, [eax + edi]  // read pixel 1
     punpckldq xmm1, xmm6  // combine pixel 0 and 1
+    addps xmm2, xmm4  // x, y += dx, dy first 2
+    movq qword ptr [edx], xmm1
     movd esi, xmm0
     pshufd xmm0, xmm0, 0x39  // shift right
+    movd edi, xmm0
     movd xmm6, [eax + esi]  // read pixel 2
-    movd esi, xmm0
-    movd xmm0, [eax + esi]  // read pixel 3
+    movd xmm0, [eax + edi]  // read pixel 3
     punpckldq xmm6, xmm0  // combine pixel 2 and 3
-    punpcklqdq xmm1, xmm6  // combine pixel 0, 1, 2 and 3
+    addps xmm3, xmm4  // x, y += dx, dy next 2
     sub ecx, 4
-    movdqu [edx], xmm1
+    movq qword ptr 8[edx], xmm6
     lea edx, [edx + 16]
     jge l4
@@ -3425,6 +3426,7 @@ void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
     lea edx, [edx + 4]
     jge l1
   l1b:
+    pop edi
     pop esi
     ret
   }
...