SSE2 scale for ARGB and posix with specializations

BUG=177
TESTED=try bots
Review URL: https://webrtc-codereview.appspot.com/1115008
git-svn-id: http://libyuv.googlecode.com/svn/trunk@585 16f28f9a-4ce2-e073-06de-1de4eb20be90

SSE2 scale for ARGB and posix with specializations
BUG=177
TESTED=try bots
Review URL: https://webrtc-codereview.appspot.com/1115008
git-svn-id: http://libyuv.googlecode.com/svn/trunk@585 16f28f9a-4ce2-e073-06de-1de4eb20be90
14f657b4 · fbarchard@google.com · b3446fc5 · 14f657b4 · 14f657b4 · 14f657b4
Commit 14f657b4 authored Feb 27, 2013 by fbarchard@google.com
Showing with 166 additions and 62 deletions

README.chromium README.chromium +1 -1

row.h include/libyuv/row.h +2 -1

version.h include/libyuv/version.h +1 -1

scale.cc source/scale.cc +55 -18

scale_argb.cc source/scale_argb.cc +107 -41

No files found.
--- a/README.chromium
+++ b/README.chromium
 Name: libyuv
 URL: http://code.google.com/p/libyuv/
-Version: 584
+Version: 585
 License: BSD
 License File: LICENSE

--- a/include/libyuv/row.h
+++ b/include/libyuv/row.h
@@ -169,7 +169,8 @@ extern "C" {
 #endif
 // The following are available on Neon platforms
-#if !defined(LIBYUV_DISABLE_NEON) && (defined(__ARM_NEON__) || defined(LIBYUV_NEON))
+#if !defined(LIBYUV_DISABLE_NEON) && \
+    (defined(__ARM_NEON__) || defined(LIBYUV_NEON))
 #define HAS_ABGRTOARGBROW_NEON
 #define HAS_ABGRTOUVROW_NEON
 #define HAS_ABGRTOYROW_NEON

--- a/include/libyuv/version.h
+++ b/include/libyuv/version.h
@@ -11,6 +11,6 @@
 #ifndef INCLUDE_LIBYUV_VERSION_H_  // NOLINT
 #define INCLUDE_LIBYUV_VERSION_H_
-#define LIBYUV_VERSION 584
+#define LIBYUV_VERSION 585
 #endif  // INCLUDE_LIBYUV_VERSION_H_  NOLINT
--- a/source/scale.cc
+++ b/source/scale.cc
@@ -23,9 +23,6 @@ namespace libyuv {
 extern "C" {
 #endif
-// Bilinear SSE2 is disabled.
-#define SSE2_DISABLED 1
 // Note: Some SSE2 reference manuals
 // cpuvol1.pdf agner_instruction_tables.pdf 253666.pdf 253667.pdf
@@ -1954,20 +1951,29 @@ static void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
 static void ScaleFilterRows_SSE2(uint8* dst_ptr,
                                 const uint8* src_ptr, ptrdiff_t src_stride,
                                 int dst_width, int source_y_fraction) {
+  asm volatile (
  asm volatile (
    "sub       %1,%0                           \n"
+    "shr       %3                              \n"
    "cmp       $0x0,%3                         \n"
-    "je        2f                              \n"
+    "je        100f                            \n"
-    "cmp       $0x80,%3                        \n"
+    "cmp       $0x20,%3                        \n"
-    "je        3f                              \n"
+    "je        75f                             \n"
+    "cmp       $0x40,%3                        \n"
+    "je        50f                             \n"
+    "cmp       $0x60,%3                        \n"
+    "je        25f                             \n"
+    "movd      %3,%%xmm0                       \n"
+    "neg       %3                              \n"
+    "add       $0x80,%3                        \n"
    "movd      %3,%%xmm5                       \n"
-    "punpcklbw %%xmm5,%%xmm5                   \n"
+    "punpcklbw %%xmm0,%%xmm5                   \n"
-    "psrlw     $0x1,%%xmm5                     \n"
    "punpcklwd %%xmm5,%%xmm5                   \n"
-    "punpckldq %%xmm5,%%xmm5                   \n"
+    "pshufd    $0x0,%%xmm5,%%xmm5              \n"
-    "punpcklqdq %%xmm5,%%xmm5                  \n"
    "pxor      %%xmm4,%%xmm4                   \n"
+    // General purpose row blend.
    ".p2align  4                               \n"
  "1:                                          \n"
    "movdqa    (%1),%%xmm0                     \n"
@@ -1991,25 +1997,56 @@ static void ScaleFilterRows_SSE2(uint8* dst_ptr,
    "movdqa    %%xmm0,(%1,%0,1)                \n"
    "lea       0x10(%1),%1                     \n"
    "jg        1b                              \n"
-    "jmp       4f                              \n"
+    "jmp       99f                             \n"
+    // Blend 25 / 75.
    ".p2align  4                               \n"
-  "2:                                          \n"
+  "25:                                         \n"
    "movdqa    (%1),%%xmm0                     \n"
+    "movdqa    (%1,%4,1),%%xmm1                \n"
+    "pavgb     %%xmm1,%%xmm0                   \n"
+    "pavgb     %%xmm1,%%xmm0                   \n"
    "sub       $0x10,%2                        \n"
    "movdqa    %%xmm0,(%1,%0,1)                \n"
    "lea       0x10(%1),%1                     \n"
-    "jg        2b                              \n"
+    "jg        25b                             \n"
-    "jmp       4f                              \n"
+    "jmp       99f                             \n"
+    // Blend 50 / 50.
    ".p2align  4                               \n"
-  "3:                                          \n"
+  "50:                                         \n"
    "movdqa    (%1),%%xmm0                     \n"
-    "pavgb     (%1,%4,1),%%xmm0                \n"
+    "movdqa    (%1,%4,1),%%xmm1                \n"
+    "pavgb     %%xmm1,%%xmm0                   \n"
+    "sub       $0x10,%2                        \n"
+    "movdqa    %%xmm0,(%1,%0,1)                \n"
+    "lea       0x10(%1),%1                     \n"
+    "jg        50b                             \n"
+    "jmp       99f                             \n"
+    // Blend 75 / 25.
+    ".p2align  4                               \n"
+  "75:                                         \n"
+    "movdqa    (%1),%%xmm1                     \n"
+    "movdqa    (%1,%4,1),%%xmm0                \n"
+    "pavgb     %%xmm1,%%xmm0                   \n"
+    "pavgb     %%xmm1,%%xmm0                   \n"
    "sub       $0x10,%2                        \n"
    "movdqa    %%xmm0,(%1,%0,1)                \n"
    "lea       0x10(%1),%1                     \n"
-    "jg        3b                              \n"
+    "jg        75b                             \n"
+    "jmp       99f                             \n"
+    // Blend 100 / 0 - Copy row unchanged.
    ".p2align  4                               \n"
-  "4:                                          \n"
+  "100:                                        \n"
+    "movdqa    (%1),%%xmm0                     \n"
+    "sub       $0x10,%2                        \n"
+    "movdqa    %%xmm0,(%1,%0,1)                \n"
+    "lea       0x10(%1),%1                     \n"
+    "jg        100b                            \n"
+  "99:                                         \n"
    "punpckhbw %%xmm0,%%xmm0                   \n"
    "pshufhw   $0xff,%%xmm0,%%xmm0             \n"
    "punpckhqdq %%xmm0,%%xmm0                  \n"

--- a/source/scale_argb.cc
+++ b/source/scale_argb.cc
@@ -200,8 +200,7 @@ static void ScaleARGBRowDownEvenInt_SSE2(const uint8* src_argb,
 }
 // Bilinear row filtering combines 4x2 -> 4x1. SSE2 version.
-#ifndef SSE2_DISABLED
+#define HAS_SCALEARGBFILTERROWS_SSE2
-#define HAS_SCALEARGBFILTERROWS_SSE2_DISABLED
 __declspec(naked) __declspec(align(16))
 void ScaleARGBFilterRows_SSE2(uint8* dst_argb, const uint8* src_argb,
                              ptrdiff_t src_stride, int dst_width,
@@ -215,19 +214,24 @@ void ScaleARGBFilterRows_SSE2(uint8* dst_argb, const uint8* src_argb,
    mov        ecx, [esp + 8 + 16]  // dst_width
    mov        eax, [esp + 8 + 20]  // source_y_fraction (0..255)
    sub        edi, esi
+    // Dispatch to specialized filters if applicable.
    cmp        eax, 0
-    je         xloop1
+    je         xloop100  // 0 / 256.  Blend 100 / 0.
+    cmp        eax, 64
+    je         xloop75   // 64 / 256 is 0.25.  Blend 75 / 25.
    cmp        eax, 128
-    je         xloop2
+    je         xloop50   // 128 / 256 is 0.50.  Blend 50 / 50.
+    cmp        eax, 192
+    je         xloop25   // 192 / 256 is 0.75.  Blend 25 / 75.
    movd       xmm5, eax            // xmm5 = y fraction
    punpcklbw  xmm5, xmm5
+    psrlw      xmm5, 1
    punpcklwd  xmm5, xmm5
-    pshufd     xmm5, xmm5, 0
+    punpckldq  xmm5, xmm5
+    punpcklqdq xmm5, xmm5
    pxor       xmm4, xmm4
-    // f * row1 + (1 - frac) row0
-    // frac * (row1 - row0) + row0
    align      16
  xloop:
    movdqa     xmm0, [esi]  // row0
@@ -240,6 +244,8 @@ void ScaleARGBFilterRows_SSE2(uint8* dst_argb, const uint8* src_argb,
    punpckhbw  xmm1, xmm4
    psubw      xmm2, xmm0  // row1 - row0
    psubw      xmm3, xmm1
+    paddw      xmm2, xmm2  // 9 bits * 15 bits = 8.16
+    paddw      xmm3, xmm3
    pmulhw     xmm2, xmm5  // scale diff
    pmulhw     xmm3, xmm5
    paddw      xmm0, xmm2  // sum rows
@@ -249,44 +255,63 @@ void ScaleARGBFilterRows_SSE2(uint8* dst_argb, const uint8* src_argb,
    movdqa     [esi + edi], xmm0
    lea        esi, [esi + 16]
    jg         xloop
+    jmp        xloop99
-    shufps     xmm0, xmm0, 0xff
+    // Blend 25 / 75.
-    movdqa     [esi + edi], xmm0    // duplicate last pixel for filtering
+    align      16
-    pop        edi
+  xloop25:
-    pop        esi
+    movdqa     xmm0, [esi]
-    ret
+    movdqa     xmm1, [esi + edx]
+    pavgb      xmm0, xmm1
+    pavgb      xmm0, xmm1
+    sub        ecx, 4
+    movdqa     [esi + edi], xmm0
+    lea        esi, [esi + 16]
+    jg         xloop25
+    jmp        xloop99
+    // Blend 50 / 50.
    align      16
-  xloop1:
+  xloop50:
    movdqa     xmm0, [esi]
+    movdqa     xmm1, [esi + edx]
+    pavgb      xmm0, xmm1
    sub        ecx, 4
    movdqa     [esi + edi], xmm0
    lea        esi, [esi + 16]
-    jg         xloop1
+    jg         xloop50
+    jmp        xloop99
-    shufps     xmm0, xmm0, 0xff
+    // Blend 75 / 25.
+    align      16
+  xloop75:
+    movdqa     xmm1, [esi]
+    movdqa     xmm0, [esi + edx]
+    pavgb      xmm0, xmm1
+    pavgb      xmm0, xmm1
+    sub        ecx, 4
    movdqa     [esi + edi], xmm0
-    pop        edi
+    lea        esi, [esi + 16]
-    pop        esi
+    jg         xloop75
-    ret
+    jmp        xloop99
+    // Blend 100 / 0 - Copy row unchanged.
    align      16
-  xloop2:
+  xloop100:
    movdqa     xmm0, [esi]
-    pavgb      xmm0, [esi + edx]
    sub        ecx, 4
    movdqa     [esi + edi], xmm0
    lea        esi, [esi + 16]
-    jg         xloop2
+    jg         xloop100
+  xloop99:
    shufps     xmm0, xmm0, 0xff
-    movdqa     [esi + edi], xmm0
+    movdqa     [esi + edi], xmm0    // duplicate last pixel for filtering
    pop        edi
    pop        esi
    ret
  }
 }
-#endif  // SSE2_DISABLED
 // Bilinear row filtering combines 4x2 -> 4x1. SSSE3 version.
 #define HAS_SCALEARGBFILTERROWS_SSSE3
@@ -544,23 +569,33 @@ static void ScaleARGBRowDownEvenInt_SSE2(const uint8* src_argb,
  );
 }
-#ifndef SSE2_DISABLED
 // Bilinear row filtering combines 4x2 -> 4x1. SSE2 version
-#define HAS_SCALEARGBFILTERROWS_SSE2_DISABLED
+#define HAS_SCALEARGBFILTERROWS_SSE2
 void ScaleARGBFilterRows_SSE2(uint8* dst_argb, const uint8* src_argb,
                              ptrdiff_t src_stride, int dst_width,
                              int source_y_fraction) {
  asm volatile (
    "sub       %1,%0                           \n"
+    "shr       %3                              \n"
    "cmp       $0x0,%3                         \n"
-    "je        2f                              \n"
+    "je        100f                            \n"
-    "cmp       $0x80,%3                        \n"
+    "cmp       $0x20,%3                        \n"
-    "je        3f                              \n"
+    "je        75f                             \n"
+    "cmp       $0x40,%3                        \n"
+    "je        50f                             \n"
+    "cmp       $0x60,%3                        \n"
+    "je        25f                             \n"
+    "movd      %3,%%xmm0                       \n"
+    "neg       %3                              \n"
+    "add       $0x80,%3                        \n"
    "movd      %3,%%xmm5                       \n"
-    "punpcklbw %%xmm5,%%xmm5                   \n"
+    "punpcklbw %%xmm0,%%xmm5                   \n"
    "punpcklwd %%xmm5,%%xmm5                   \n"
    "pshufd    $0x0,%%xmm5,%%xmm5              \n"
    "pxor      %%xmm4,%%xmm4                   \n"
+    // General purpose row blend.
    ".p2align  4                               \n"
  "1:                                          \n"
    "movdqa    (%1),%%xmm0                     \n"
@@ -573,6 +608,8 @@ void ScaleARGBFilterRows_SSE2(uint8* dst_argb, const uint8* src_argb,
    "punpckhbw %%xmm4,%%xmm1                   \n"
    "psubw     %%xmm0,%%xmm2                   \n"
    "psubw     %%xmm1,%%xmm3                   \n"
+    "paddw     %%xmm2,%%xmm2                   \n"
+    "paddw     %%xmm3,%%xmm3                   \n"
    "pmulhw    %%xmm5,%%xmm2                   \n"
    "pmulhw    %%xmm5,%%xmm3                   \n"
    "paddw     %%xmm2,%%xmm0                   \n"
@@ -582,31 +619,61 @@ void ScaleARGBFilterRows_SSE2(uint8* dst_argb, const uint8* src_argb,
    "movdqa    %%xmm0,(%1,%0,1)                \n"
    "lea       0x10(%1),%1                     \n"
    "jg        1b                              \n"
-    "jmp       4f                              \n"
+    "jmp       99f                             \n"
+    // Blend 25 / 75.
    ".p2align  4                               \n"
-  "2:                                          \n"
+  "25:                                         \n"
    "movdqa    (%1),%%xmm0                     \n"
+    "movdqa    (%1,%4,1),%%xmm1                \n"
+    "pavgb     %%xmm1,%%xmm0                   \n"
+    "pavgb     %%xmm1,%%xmm0                   \n"
    "sub       $0x4,%2                         \n"
    "movdqa    %%xmm0,(%1,%0,1)                \n"
    "lea       0x10(%1),%1                     \n"
-    "jg        2b                              \n"
+    "jg        25b                             \n"
-    "jmp       4f                              \n"
+    "jmp       99f                             \n"
+    // Blend 50 / 50.
    ".p2align  4                               \n"
-  "3:                                          \n"
+  "50:                                         \n"
    "movdqa    (%1),%%xmm0                     \n"
-    "pavgb     (%1,%4,1),%%xmm0                \n"
+    "movdqa    (%1,%4,1),%%xmm1                \n"
+    "pavgb     %%xmm1,%%xmm0                   \n"
    "sub       $0x4,%2                         \n"
    "movdqa    %%xmm0,(%1,%0,1)                \n"
    "lea       0x10(%1),%1                     \n"
+    "jg        50b                             \n"
+    "jmp       99f                             \n"
+    // Blend 75 / 25.
+    ".p2align  4                               \n"
+  "75:                                         \n"
+    "movdqa    (%1),%%xmm1                     \n"
+    "movdqa    (%1,%4,1),%%xmm0                \n"
+    "pavgb     %%xmm1,%%xmm0                   \n"
+    "pavgb     %%xmm1,%%xmm0                   \n"
+    "sub       $0x4,%2                         \n"
+    "movdqa    %%xmm0,(%1,%0,1)                \n"
    "lea       0x10(%1),%1                     \n"
-    "jg        3b                              \n"
+    "jg        75b                             \n"
+    "jmp       99f                             \n"
+    // Blend 100 / 0 - Copy row unchanged.
    ".p2align  4                               \n"
-  "4:                                          \n"
+  "100:                                        \n"
+    "movdqa    (%1),%%xmm0                     \n"
+    "sub       $0x4,%2                         \n"
+    "movdqa    %%xmm0,(%1,%0,1)                \n"
+    "lea       0x10(%1),%1                     \n"
+    "jg        100b                            \n"
+  "99:                                         \n"
    "shufps    $0xff,%%xmm0,%%xmm0             \n"
    "movdqa    %%xmm0,(%1,%0,1)                \n"
-  : "+r"(dst_argb),     // %0
+  : "+r"(dst_argb),   // %0
-    "+r"(src_argb),     // %1
+    "+r"(src_argb),   // %1
-    "+r"(dst_width),   // %2
+    "+r"(dst_width),  // %2
    "+r"(source_y_fraction)  // %3
  : "r"(static_cast<intptr_t>(src_stride))  // %4
  : "memory", "cc"
@@ -615,7 +682,6 @@ void ScaleARGBFilterRows_SSE2(uint8* dst_argb, const uint8* src_argb,
 #endif
  );
 }
-#endif  // SSE2_DISABLED
 // Bilinear row filtering combines 4x2 -> 4x1. SSSE3 version
 #define HAS_SCALEARGBFILTERROWS_SSSE3