Affine: process 4 pixels at a time.

BUG=none TEST=affine unittest Review URL: https://webrtc-codereview.appspot.com/729005 git-svn-id: http://libyuv.googlecode.com/svn/trunk@319 16f28f9a-4ce2-e073-06de-1de4eb20be90

Affine: process 4 pixels at a time.
BUG=none TEST=affine unittest Review URL: https://webrtc-codereview.appspot.com/729005 git-svn-id: http://libyuv.googlecode.com/svn/trunk@319 16f28f9a-4ce2-e073-06de-1de4eb20be90
845e94d1 · fbarchard@google.com · 749950d7 · 845e94d1 · 845e94d1 · 845e94d1
Commit 845e94d1 authored Aug 10, 2012 by fbarchard@google.com
Hide whitespace changes
Inline Side-by-side

Showing with 48 additions and 35 deletions

README.chromium README.chromium +1 -1

version.h include/libyuv/version.h +1 -1

row_win.cc source/row_win.cc +46 -33

No files found.
--- a/README.chromium
+++ b/README.chromium
 Name: libyuv
 URL: http://code.google.com/p/libyuv/
-Version: 318
+Version: 319
 License: BSD
 License File: LICENSE


--- a/include/libyuv/version.h
+++ b/include/libyuv/version.h
@@ -11,6 +11,6 @@
 #ifndef INCLUDE_LIBYUV_VERSION_H_  // NOLINT
 #define INCLUDE_LIBYUV_VERSION_H_

-#define LIBYUV_VERSION 318
+#define LIBYUV_VERSION 319

 #endif  // INCLUDE_LIBYUV_VERSION_H_  NOLINT
--- a/source/row_win.cc
+++ b/source/row_win.cc
@@ -3359,53 +3359,66 @@ void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
    mov        edx, [esp + 16]  // dst_argb
    mov        ecx, [esp + 20]  // pointer to uv_dudv
    movq       xmm2, qword ptr [ecx]  // uv
-    movq       xmm3, qword ptr [ecx + 8]  // dudv
+    movq       xmm7, qword ptr [ecx + 8]  // dudv
    mov        ecx, [esp + 24]  // width
    shl        esi, 16          // 4, stride
    add        esi, 4
-    movd       xmm4, esi
-    sub        ecx, 2
-    jl         l2b
+    movd       xmm5, esi
+    sub        ecx, 4
+    jl         l4b

+    // setup for 4 pixel loop
+    pshufd     xmm7, xmm7, 0x44  // dup dudv
+    pshufd     xmm5, xmm5, 0  // dup 4, stride
    movdqa     xmm0, xmm2    // x0, y0, x1, y1
-    addps      xmm0, xmm3
+    addps      xmm0, xmm7
    movlhps    xmm2, xmm0
-    pshufd     xmm4, xmm4, 0  // dup 4, stride
-    movlhps    xmm3, xmm3    // dudv
-    addps      xmm3, xmm3    // dudv *= 2
-    pshufd     xmm4, xmm4, 0
+    movdqa     xmm4, xmm7
+    addps      xmm4, xmm4    // dudv *= 2
+    movdqa     xmm3, xmm2    // x2, y2, x3, y3
+    addps      xmm3, xmm4
+    addps      xmm4, xmm4    // dudv *= 4

-     // 2 pixel loop
+    // 4 pixel loop
    align      4
-  l2:
-    cvttps2dq  xmm1, xmm2    // x, y float to int
-    packssdw   xmm1, xmm1    // x, y as shorts
-    pmaddwd    xmm1, xmm4    // offset = x * 4 + y * stride
-    addps      xmm2, xmm3    // x, y += dx, dy
-    movd       esi, xmm1
-    movdqa     xmm5, xmm1
-    pshufd     xmm5, xmm5, 0x55
-    movd       xmm0, [eax + esi]  // read pixel 0
-    movd       esi, xmm5
-    movd       xmm5, [eax + esi]  // read pixel 1
-    punpckldq  xmm0, xmm5
-    sub        ecx, 2
-    movq       qword ptr [edx], xmm0
-    lea        edx, [edx + 8]
-    jge        l2
+  l4:
+    cvttps2dq  xmm0, xmm2    // x, y float to int first 2
+    cvttps2dq  xmm1, xmm3    // x, y float to int next 2
+    packssdw   xmm0, xmm1    // x, y as 8 shorts
+    pmaddwd    xmm0, xmm5    // offsets = x * 4 + y * stride.
+    addps      xmm2, xmm4    // x, y += dx, dy first 2
+    addps      xmm3, xmm4    // x, y += dx, dy next 2
+    movd       esi, xmm0
+    pshufd     xmm0, xmm0, 0x39  // shift right
+    movd       xmm1, [eax + esi]  // read pixel 0
+    movd       esi, xmm0
+    pshufd     xmm0, xmm0, 0x39  // shift right
+    movd       xmm6, [eax + esi]  // read pixel 1
+    punpckldq  xmm1, xmm6     // combine pixel 0 and 1
+    movd       esi, xmm0
+    pshufd     xmm0, xmm0, 0x39  // shift right
+    movd       xmm6, [eax + esi]  // read pixel 2
+    movd       esi, xmm0
+    movd       xmm0, [eax + esi]  // read pixel 3
+    punpckldq  xmm6, xmm0     // combine pixel 2 and 3
+    punpcklqdq xmm1, xmm6     // combine pixel 0, 1, 2 and 3
+    sub        ecx, 4
+    movdqu     [edx], xmm1
+    lea        edx, [edx + 16]
+    jge        l4

-  l2b:
-    add        ecx, 2 - 1
+  l4b:
+    add        ecx, 4 - 1
    jl         l1b

    // 1 pixel loop
    align      4
  l1:
-    cvttps2dq  xmm1, xmm2    // x, y float to int
-    packssdw   xmm1, xmm1    // x, y as shorts
-    pmaddwd    xmm1, xmm4    // offset = x * 4 + y * stride
-    addps      xmm2, xmm3    // x, y += dx, dy
-    movd       esi, xmm1
+    cvttps2dq  xmm0, xmm2    // x, y float to int
+    packssdw   xmm0, xmm0    // x, y as shorts
+    pmaddwd    xmm0, xmm5    // offset = x * 4 + y * stride
+    addps      xmm2, xmm7    // x, y += dx, dy
+    movd       esi, xmm0
    movd       xmm0, [eax + esi]  // copy a pixel
    sub        ecx, 1
    movd       [edx], xmm0