Add ARGBAffineRow_SSE2, a function that copies pixels from a source image along a sloped line into a row of the destination.

BUG=60 TEST=none Review URL: https://webrtc-codereview.appspot.com/727004 git-svn-id: http://libyuv.googlecode.com/svn/trunk@313 16f28f9a-4ce2-e073-06de-1de4eb20be90

Add ARGBAffineRow_SSE2, a function that copies pixels from a source image along a sloped line into a row of the destination.
BUG=60 TEST=none Review URL: https://webrtc-codereview.appspot.com/727004 git-svn-id: http://libyuv.googlecode.com/svn/trunk@313 16f28f9a-4ce2-e073-06de-1de4eb20be90
864f828a · fbarchard@google.com · 4f10e97f · 864f828a · 864f828a · 864f828a
Commit 864f828a authored Aug 08, 2012 by fbarchard@google.com
Hide whitespace changes
Inline Side-by-side

Showing with 76 additions and 3 deletions

README.chromium README.chromium +1 -1

version.h include/libyuv/version.h +1 -1

row.h source/row.h +3 -0

row_win.cc source/row_win.cc +71 -1

No files found.
--- a/README.chromium
+++ b/README.chromium
 Name: libyuv
 URL: http://code.google.com/p/libyuv/
-Version: 312
+Version: 313
 License: BSD
 License File: LICENSE


--- a/include/libyuv/version.h
+++ b/include/libyuv/version.h
@@ -11,6 +11,6 @@
 #ifndef INCLUDE_LIBYUV_VERSION_H_  // NOLINT
 #define INCLUDE_LIBYUV_VERSION_H_

-#define LIBYUV_VERSION 312
+#define LIBYUV_VERSION 313

 #endif  // INCLUDE_LIBYUV_VERSION_H_  NOLINT
--- a/source/row.h
+++ b/source/row.h
@@ -87,6 +87,7 @@ extern "C" {
 #define HAS_ARGBCOLORTABLEROW_X86
 #define HAS_NV12TOARGBROW_SSSE3
 #define HAS_NV21TOARGBROW_SSSE3
+#define HAS_ARGBAFFINEROW_SSE2
 #endif

 // The following are disabled when SSSE3 is available:
@@ -522,6 +523,8 @@ void ARGBShadeRow_C(const uint8* src_argb, uint8* dst_argb, int width,
                    uint32 value);
 void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width,
                       uint32 value);
+void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
+                        uint8* dst_argb, const float* uv_dudv, int width);

 #ifdef __cplusplus
 }  // extern "C"

--- a/source/row_win.cc
+++ b/source/row_win.cc
@@ -3347,8 +3347,78 @@ void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width,
 }
 #endif  // HAS_ARGBSHADE_SSE2

-#endif  // _M_IX86
+#ifdef HAS_ARGBAFFINEROW_SSE2
+// Copy width ARGB pixels to dst_argb from src_argb along a sloped line; uv_dudv = {x, y, dx, dy} as floats, and x, y, stride go through signed 16-bit math.
+__declspec(naked) __declspec(align(16))  // naked: arguments are read directly from [esp + n] below
+void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
+                        uint8* dst_argb, const float* uv_dudv, int width) {
+  __asm {
+    push       esi              // preserve esi (restored before ret); first argument is now at [esp + 8]
+    mov        eax, [esp + 8]   // src_argb
+    mov        esi, [esp + 12]  // stride
+    mov        edx, [esp + 16]  // dst_argb
+    mov        ecx, [esp + 20]  // pointer to uv_dudv
+    movq       xmm2, qword ptr [ecx]  // xmm2 = uv: starting x, y (2 floats)
+    movq       xmm3, qword ptr [ecx + 8]  // xmm3 = dudv: per-pixel step dx, dy (2 floats)
+    mov        ecx, [esp + 24]  // width
+    shl        esi, 16          // stride moves to the high word; its upper 16 bits are shifted out
+    add        esi, 4           // low word = 4, so esi = word pair (4, stride)
+    movd       xmm4, esi        // xmm4 low dword = pmaddwd multiplier (4, stride)
+    sub        ecx, 2           // bias the count by -2 for the 2 pixel loop
+    jl         l2b              // fewer than 2 pixels: skip the 2 pixel setup and loop
+
+    movdqa     xmm0, xmm2    // xmm0 = x0, y0
+    addps      xmm0, xmm3    // xmm0 = x1, y1 = x0 + dx, y0 + dy
+    movlhps    xmm2, xmm0    // xmm2 = x0, y0, x1, y1
+    pshufd     xmm4, xmm4, 0  // broadcast (4, stride) to all four dwords
+    movlhps    xmm3, xmm3    // xmm3 = dx, dy, dx, dy
+    addps      xmm3, xmm3    // dudv *= 2: each iteration advances both pixels by two steps
+    pshufd     xmm4, xmm4, 0  // NOTE(review): repeats the broadcast above; xmm4 is unchanged in between, so this looks redundant

+    // 2 pixel loop: on entry ecx = pixels remaining - 2 and is >= 0
+    align      4
+  l2:
+    cvttps2dq  xmm1, xmm2    // x, y float to int (truncating)
+    packssdw   xmm1, xmm1    // x, y as signed shorts (saturating)
+    pmaddwd    xmm1, xmm4    // offset = x * 4 + y * stride, one dword per pixel
+    addps      xmm2, xmm3    // x, y += dx, dy (step already doubled)
+    movd       esi, xmm1     // esi = byte offset of pixel 0 (stride in esi is no longer needed)
+    movdqa     xmm5, xmm1    // copy offsets so pixel 1's can be extracted
+    pshufd     xmm5, xmm5, 0x55  // dword 1 (pixel 1 offset) into every dword, including the low one
+    movd       xmm0, [eax + esi]  // read pixel 0
+    movd       esi, xmm5     // esi = byte offset of pixel 1
+    movd       xmm5, [eax + esi]  // read pixel 1
+    punpckldq  xmm0, xmm5    // low qword of xmm0 = pixel 0, pixel 1
+    sub        ecx, 2        // sets the flags consumed by jge below
+    movq       qword ptr [edx], xmm0  // write 2 pixels
+    lea        edx, [edx + 8]  // dst_argb += 8; lea leaves the flags from sub intact
+    jge        l2            // loop while at least 2 pixels remain
+
+  l2b:
+    add        ecx, 2 - 1    // undo the -2 bias and apply -1 for the 1 pixel loop
+    jl         l1b           // no pixel left
+
+    // 1 pixel loop: reached with at most 1 pixel left, so the body executes at most once
+    align      4
+  l1:
+    cvttps2dq  xmm1, xmm2    // x, y float to int (truncating)
+    packssdw   xmm1, xmm1    // x, y as signed shorts (saturating)
+    pmaddwd    xmm1, xmm4    // offset = x * 4 + y * stride (only the low dword is used)
+    addps      xmm2, xmm3    // x, y += dx, dy
+    movd       esi, xmm1     // esi = byte offset of the pixel
+    movd       xmm0, [eax + esi]  // copy a pixel
+    sub        ecx, 1        // sets the flags consumed by jge below
+    movd       [edx], xmm0   // write 1 pixel
+    lea        edx, [edx + 4]  // dst_argb += 4; lea leaves the flags from sub intact
+    jge        l1
+  l1b:
+    pop        esi           // restore the register saved at entry
+    ret
+  }
+}
+#endif  // HAS_ARGBAFFINEROW_SSE2
+
+#endif  // _M_IX86

 #ifdef __cplusplus
 }  // extern "C"