Interpolate ported to SSE2

BUG=177
TEST=out\release\libyuv_unittest --gtest_filter=*
Review URL: https://webrtc-codereview.appspot.com/1060006

git-svn-id: http://libyuv.googlecode.com/svn/trunk@549 16f28f9a-4ce2-e073-06de-1de4eb20be90

Interpolate ported to SSE2
BUG=177
TEST=out\release\libyuv_unittest --gtest_filter=*
Review URL: https://webrtc-codereview.appspot.com/1060006

git-svn-id: http://libyuv.googlecode.com/svn/trunk@549 16f28f9a-4ce2-e073-06de-1de4eb20be90
8811289b · fbarchard@google.com · 70b49281 · 8811289b · 8811289b · 8811289b
Commit 8811289b authored Jan 22, 2013 by fbarchard@google.com
6 changed files
--- a/README.chromium
+++ b/README.chromium
 Name: libyuv
 URL: http://code.google.com/p/libyuv/
-Version: 548
+Version: 549
 License: BSD
 License File: LICENSE


--- a/include/libyuv/row.h
+++ b/include/libyuv/row.h
@@ -121,6 +121,7 @@ extern "C" {
 #if !defined(YUV_DISABLE_ASM) && defined(_M_IX86)
 #define HAS_ARGBCOLORTABLEROW_X86
 #define HAS_ARGBTOUV444ROW_SSSE3
+#define HAS_ARGBINTERPOLATEROW_SSE2
 #endif

 // The following are Yasm x86 only.
@@ -1306,6 +1307,9 @@ void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
 void ARGBInterpolateRow_C(uint8* dst_argb, const uint8* src_argb,
                          ptrdiff_t src_stride_argb,
                          int dst_width, int source_y_fraction);
+void ARGBInterpolateRow_SSE2(uint8* dst_argb, const uint8* src_argb,
+                             ptrdiff_t src_stride_argb, int dst_width,
+                             int source_y_fraction);
 void ARGBInterpolateRow_SSSE3(uint8* dst_argb, const uint8* src_argb,
                              ptrdiff_t src_stride_argb, int dst_width,
                              int source_y_fraction);

--- a/include/libyuv/version.h
+++ b/include/libyuv/version.h
@@ -11,6 +11,6 @@
 #ifndef INCLUDE_LIBYUV_VERSION_H_  // NOLINT
 #define INCLUDE_LIBYUV_VERSION_H_

-#define LIBYUV_VERSION 548
+#define LIBYUV_VERSION 549

 #endif  // INCLUDE_LIBYUV_VERSION_H_  NOLINT
--- a/source/planar_functions.cc
+++ b/source/planar_functions.cc
@@ -1284,6 +1284,14 @@ int ARGBInterpolate(const uint8* src_argb0, int src_stride_argb0,
  void (*ARGBInterpolateRow)(uint8* dst_ptr, const uint8* src_ptr,
                             ptrdiff_t src_stride, int dst_width,
                             int source_y_fraction) = ARGBInterpolateRow_C;
+#if defined(HAS_ARGBINTERPOLATEROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 4) &&
+      IS_ALIGNED(src_argb0, 16) && IS_ALIGNED(src_stride_argb0, 16) &&
+      IS_ALIGNED(src_argb1, 16) && IS_ALIGNED(src_stride_argb1, 16) &&
+      IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+    ARGBInterpolateRow = ARGBInterpolateRow_SSE2;
+  }
+#endif
 #if defined(HAS_ARGBINTERPOLATEROW_SSSE3)
  if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 4) &&
      IS_ALIGNED(src_argb0, 16) && IS_ALIGNED(src_stride_argb0, 16) &&

--- a/source/row_win.cc
+++ b/source/row_win.cc
@@ -4724,7 +4724,117 @@ void ARGBInterpolateRow_SSSE3(uint8* dst_argb, const uint8* src_argb,
    lea        esi, [esi + 16]
    jg         xloop100

-    // Extrude last pixel.
+  xloop99:
+    pop        edi
+    pop        esi
+    ret
+  }
+}
+
+// Bilinear image filtering: blends two rows of ARGB by source_y_fraction.
+// Same as ScaleARGBFilterRows_SSE2, but the last pixel is not duplicated.
+__declspec(naked) __declspec(align(16))
+void ARGBInterpolateRow_SSE2(uint8* dst_argb, const uint8* src_argb,
+                             ptrdiff_t src_stride, int dst_width,
+                             int source_y_fraction) {
+  __asm {
+    push       esi
+    push       edi
+    mov        edi, [esp + 8 + 4]   // dst_argb
+    mov        esi, [esp + 8 + 8]   // src_argb
+    mov        edx, [esp + 8 + 12]  // src_stride
+    mov        ecx, [esp + 8 + 16]  // dst_width
+    mov        eax, [esp + 8 + 20]  // source_y_fraction (0..255)
+    sub        edi, esi
+    cmp        eax, 0  // dispatch to specialized filters if applicable.
+    je         xloop100
+    cmp        eax, 64
+    je         xloop75
+    cmp        eax, 128
+    je         xloop50
+    cmp        eax, 192
+    je         xloop25
+
+    movd       xmm5, eax            // xmm5 = y fraction
+    punpcklbw  xmm5, xmm5
+    psrlw      xmm5, 1
+    punpcklwd  xmm5, xmm5
+    punpckldq  xmm5, xmm5
+    punpcklqdq xmm5, xmm5
+    pxor       xmm4, xmm4
+
+    align      16
+  xloop:
+    movdqa     xmm0, [esi]  // row0
+    movdqa     xmm2, [esi + edx]  // row1
+    movdqa     xmm1, xmm0
+    movdqa     xmm3, xmm2
+    punpcklbw  xmm2, xmm4
+    punpckhbw  xmm3, xmm4
+    punpcklbw  xmm0, xmm4
+    punpckhbw  xmm1, xmm4
+    psubw      xmm2, xmm0  // row1 - row0
+    psubw      xmm3, xmm1
+    paddw      xmm2, xmm2  // 9 bits * 15 bits = 8.16
+    paddw      xmm3, xmm3
+    pmulhw     xmm2, xmm5  // scale diff
+    pmulhw     xmm3, xmm5
+    paddw      xmm0, xmm2  // sum rows
+    paddw      xmm1, xmm3
+    packuswb   xmm0, xmm1
+    sub        ecx, 4
+    movdqa     [esi + edi], xmm0
+    lea        esi, [esi + 16]
+    jg         xloop
+    jmp        xloop99
+
+    // Blend 25 / 75.
+    align      16
+  xloop25:
+    movdqa     xmm0, [esi]
+    movdqa     xmm1, [esi + edx]
+    pavgb      xmm0, xmm1
+    pavgb      xmm0, xmm1
+    sub        ecx, 4
+    movdqa     [esi + edi], xmm0
+    lea        esi, [esi + 16]
+    jg         xloop25
+    jmp        xloop99
+
+    // Blend 50 / 50.
+    align      16
+  xloop50:
+    movdqa     xmm0, [esi]
+    movdqa     xmm1, [esi + edx]
+    pavgb      xmm0, xmm1
+    sub        ecx, 4
+    movdqa     [esi + edi], xmm0
+    lea        esi, [esi + 16]
+    jg         xloop50
+    jmp        xloop99
+
+    // Blend 75 / 25.
+    align      16
+  xloop75:
+    movdqa     xmm1, [esi]
+    movdqa     xmm0, [esi + edx]
+    pavgb      xmm0, xmm1
+    pavgb      xmm0, xmm1
+    sub        ecx, 4
+    movdqa     [esi + edi], xmm0
+    lea        esi, [esi + 16]
+    jg         xloop75
+    jmp        xloop99
+
+    // Blend 100 / 0 - Copy row unchanged.
+    align      16
+  xloop100:
+    movdqa     xmm0, [esi]
+    sub        ecx, 4
+    movdqa     [esi + edi], xmm0
+    lea        esi, [esi + 16]
+    jg         xloop100
+
  xloop99:
    pop        edi
    pop        esi

--- a/source/scale.cc
+++ b/source/scale.cc
@@ -2483,7 +2483,7 @@ static void ScaleFilterCols34_C(uint8* dst_ptr, const uint8* src_ptr,
  } while (dst_ptr < dend);
 }

-#define HAS_SCALEROWDOWN34_SSE2_DISABLED
+#define HAS_SCALEROWDOWN34_SSE2
 // Filter rows 0 and 1 together, 3 : 1
 static void ScaleRowDown34_0_Int_SSE2(const uint8* src_ptr,
                                      ptrdiff_t src_stride,