Port box filter to AVX2.

BUG=libyuv:425
TESTED=c:\intelsde\sde -ast -hsw -- out\release\libyuv_unittest.exe --gtest_catch_exceptions=0 --gtest_filter=*libyuvTest.ScaleTo640x360_Box
R=tpsiaki@google.com
Review URL: https://webrtc-codereview.appspot.com/43149004
git-svn-id: http://libyuv.googlecode.com/svn/trunk@1367 16f28f9a-4ce2-e073-06de-1de4eb20be90

Port box filter to AVX2.
BUG=libyuv:425
TESTED=c:\intelsde\sde -ast -hsw -- out\release\libyuv_unittest.exe --gtest_catch_exceptions=0 --gtest_filter=*libyuvTest.ScaleTo640x360_Box
R=tpsiaki@google.com
Review URL: https://webrtc-codereview.appspot.com/43149004
git-svn-id: http://libyuv.googlecode.com/svn/trunk@1367 16f28f9a-4ce2-e073-06de-1de4eb20be90
013e8122 · fbarchard@google.com · b5ea79d8 · 013e8122 · 013e8122 · 013e8122
Commit 013e8122 authored Apr 14, 2015 by fbarchard@google.com
Showing with 86 additions and 37 deletions

scale_row.h include/libyuv/scale_row.h +9 -12

scale.cc source/scale.cc +21 -7

scale_common.cc source/scale_common.cc +0 -4

scale_posix.cc source/scale_posix.cc +1 -1

scale_win.cc source/scale_win.cc +55 -13

No files found.
--- a/include/libyuv/scale_row.h
+++ b/include/libyuv/scale_row.h
@@ -52,6 +52,7 @@ extern "C" {
 // The following are available on VS2012.
 #if !defined(LIBYUV_DISABLE_X86) && defined(VISUALC_HAS_AVX2)
+#define HAS_SCALEADDROWS_AVX2
 #define HAS_SCALEROWDOWN2_AVX2
 #endif
@@ -262,26 +263,22 @@ void ScaleRowDown4Box_Any_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
                               uint8* dst_ptr, int dst_width);
 void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
-                       uint16* dst_ptr, int src_width,
+                       uint16* dst_ptr, int src_width, int src_height);
-                       int src_height);
+void ScaleAddRows_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
+                       uint16* dst_ptr, int src_width, int src_height);
 void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
                           int dst_width, int x, int dx);
 void ScaleColsUp2_SSE2(uint8* dst_ptr, const uint8* src_ptr,
                       int dst_width, int x, int dx);
-void ScaleARGBRowDown2_SSE2(const uint8* src_argb,
+void ScaleARGBRowDown2_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
-                            ptrdiff_t src_stride,
                            uint8* dst_argb, int dst_width);
-void ScaleARGBRowDown2Linear_SSE2(const uint8* src_argb,
+void ScaleARGBRowDown2Linear_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
-                                  ptrdiff_t src_stride,
                                  uint8* dst_argb, int dst_width);
-void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb,
+void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
-                               ptrdiff_t src_stride,
                               uint8* dst_argb, int dst_width);
 void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
-                               int src_stepx,
+                               int src_stepx, uint8* dst_argb, int dst_width);
-                               uint8* dst_argb, int dst_width);
+void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
-void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb,
-                                  ptrdiff_t src_stride,
                                  int src_stepx,
                                  uint8* dst_argb, int dst_width);
 void ScaleARGBCols_SSE2(uint8* dst_argb, const uint8* src_argb,

--- a/source/scale.cc
+++ b/source/scale.cc
@@ -702,11 +702,22 @@ static void ScaleAddCols2_16_C(int dst_width, int boxheight, int x, int dx,
  }
 }
+static void ScaleAddCols0_C(int dst_width, int boxheight, int x, int,
+                            const uint16* src_ptr, uint8* dst_ptr) {
+  int scaleval = 65536 / boxheight;
+  int i;
+  src_ptr += (x >> 16);
+  for (i = 0; i < dst_width; ++i) {
+    *dst_ptr++ = src_ptr[i] * scaleval >> 16;
+  }
+}
 static void ScaleAddCols1_C(int dst_width, int boxheight, int x, int dx,
                            const uint16* src_ptr, uint8* dst_ptr) {
  int boxwidth = (dx >> 16);
  int scaleval = 65536 / (boxwidth * boxheight);
  int i;
+  x >>= 16;
  for (i = 0; i < dst_width; ++i) {
    *dst_ptr++ = SumPixels(boxwidth, src_ptr + x) * scaleval >> 16;
    x += boxwidth;
@@ -768,15 +779,20 @@ static void ScalePlaneBox(int src_width, int src_height,
    align_buffer_64(row16, src_width * 2);
    void (*ScaleAddCols)(int dst_width, int boxheight, int x, int dx,
        const uint16* src_ptr, uint8* dst_ptr) =
-        (dx & 0xffff) ? ScaleAddCols2_C: ScaleAddCols1_C;
+        (dx & 0xffff) ? ScaleAddCols2_C:
+        ((dx != 0x10000) ? ScaleAddCols1_C : ScaleAddCols0_C);
    void (*ScaleAddRows)(const uint8* src_ptr, ptrdiff_t src_stride,
        uint16* dst_ptr, int src_width, int src_height) = ScaleAddRows_C;
 #if defined(HAS_SCALEADDROWS_SSE2)
    if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(src_width, 16)) {
      ScaleAddRows = ScaleAddRows_SSE2;
    }
 #endif
+#if defined(HAS_SCALEADDROWS_AVX2)
+    if (TestCpuFlag(kCpuHasAVX2) && IS_ALIGNED(src_width, 32)) {
+      ScaleAddRows = ScaleAddRows_AVX2;
+    }
+#endif
 #if defined(HAS_SCALEADDROWS_NEON)
    if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(src_width, 16)) {
      ScaleAddRows = ScaleAddRows_NEON;
@@ -1419,8 +1435,7 @@ void ScalePlane(const uint8* src, int src_stride,
                enum FilterMode filtering) {
  // Simplify filtering when possible.
  filtering = ScaleFilterReduce(src_width, src_height,
-                                dst_width, dst_height,
+                                dst_width, dst_height, filtering);
-                                filtering);
  // Negative height means invert the image.
  if (src_height < 0) {
@@ -1436,7 +1451,7 @@ void ScalePlane(const uint8* src, int src_stride,
    CopyPlane(src, src_stride, dst, dst_stride, dst_width, dst_height);
    return;
  }
-  if (dst_width == src_width) {
+  if (dst_width == src_width && filtering != kFilterBox) {
    int dy = FixedDiv(src_height, dst_height);
    // Arbitrary scale vertically, but unscaled horizontally.
    ScalePlaneVertical(src_height,
@@ -1503,8 +1518,7 @@ void ScalePlane_16(const uint16* src, int src_stride,
                  enum FilterMode filtering) {
  // Simplify filtering when possible.
  filtering = ScaleFilterReduce(src_width, src_height,
-                                dst_width, dst_height,
+                                dst_width, dst_height, filtering);
-                                filtering);
  // Negative height means invert the image.
  if (src_height < 0) {

--- a/source/scale_common.cc
+++ b/source/scale_common.cc
@@ -1030,10 +1030,6 @@ enum FilterMode ScaleFilterReduce(int src_width, int src_height,
    if (dst_width * 2 >= src_width && dst_height * 2 >= src_height) {
      filtering = kFilterBilinear;
    }
-    // If scaling to larger, switch from Box to Bilinear.
-    if (dst_width >= src_width || dst_height >= src_height) {
-      filtering = kFilterBilinear;
-    }
  }
  if (filtering == kFilterBilinear) {
    if (src_height == 1) {

--- a/source/scale_posix.cc
+++ b/source/scale_posix.cc
@@ -579,11 +579,11 @@ void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
  int tmp_height = 0;
  intptr_t tmp_src = 0;
  asm volatile (
-    "pxor      %%xmm4,%%xmm4                   \n"
    "mov       %0,%3                           \n"  // row pointer
    "mov       %5,%2                           \n"  // height
    "pxor      %%xmm0,%%xmm0                   \n"  // clear accumulators
    "pxor      %%xmm1,%%xmm1                   \n"
+    "pxor      %%xmm4,%%xmm4                   \n"
    LABELALIGN
  "1:                                          \n"

--- a/source/scale_win.cc
+++ b/source/scale_win.cc
@@ -721,16 +721,14 @@ void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
    mov        edi, [esp + 16 + 12]  // dst_ptr
    mov        ecx, [esp + 16 + 16]  // src_width
    mov        ebx, [esp + 16 + 20]  // height
-    pxor       xmm4, xmm4
    mov        eax, esi          // row pointer
    mov        ebp, ebx          // height
    pxor       xmm0, xmm0        // clear accumulators
    pxor       xmm1, xmm1
+    pxor       xmm4, xmm4
+  // sum rows
  xloop:
-    // sum rows
-  yloop:
    movdqu     xmm2, [eax]       // read 16 pixels
    lea        eax, [eax + edx]  // advance to next row
    movdqa     xmm3, xmm2
@@ -739,7 +737,7 @@ void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
    paddusw    xmm0, xmm2        // sum 16 words
    paddusw    xmm1, xmm3
    sub        ebp, 1
-    jg         yloop
+    jg         xloop
    movdqu     [edi], xmm0
    movdqu     [edi + 16], xmm1
@@ -760,15 +758,59 @@ void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
  }
 }
-// Bilinear column filtering. SSSE3 version.
+// Reads 32xN bytes and produces 32 shorts at a time.
-// TODO(fbarchard): Switch the following:
+__declspec(naked)
-//    xor        ebx, ebx
+void ScaleAddRows_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
-//    mov        bx, word ptr [esi + eax]  // 2 source x0 pixels
+                       uint16* dst_ptr, int src_width, int src_height) {
-// To
+  __asm {
-//    movzx      ebx, word ptr [esi + eax]  // 2 source x0 pixels
+    push        esi
-// when drmemory bug fixed.
+    push        edi
-// https://code.google.com/p/drmemory/issues/detail?id=1396
+    push        ebx
+    push        ebp
+    mov         esi, [esp + 16 + 4]   // src_ptr
+    mov         edx, [esp + 16 + 8]   // src_stride
+    mov         edi, [esp + 16 + 12]  // dst_ptr
+    mov         ecx, [esp + 16 + 16]  // src_width
+    mov         ebx, [esp + 16 + 20]  // height
+    mov         eax, esi              // row pointer
+    mov         ebp, ebx              // height
+    vpxor       ymm0, ymm0, ymm0      // clear accumulators
+    vpxor       ymm1, ymm1, ymm1
+    vpxor       ymm4, ymm4, ymm4
+  // sum rows
+  xloop:
+    vmovdqu     ymm2, [eax]       // read 32 pixels
+    vpermq      ymm2, ymm2, 0xd8  // unmutate for vpunpck
+    lea         eax, [eax + edx]  // advance to next row
+    vpunpckhbw  ymm3, ymm2, ymm4
+    vpunpcklbw  ymm2, ymm2, ymm4
+    vpaddusw    ymm0, ymm0, ymm2  // sum 16 words
+    vpaddusw    ymm1, ymm1, ymm3
+    sub         ebp, 1
+    jg          xloop
+    vmovdqu     [edi], ymm0
+    vmovdqu     [edi + 32], ymm1
+    lea         edi, [edi + 64]   // dst_ptr
+    lea         esi, [esi + 32]   // src_ptr
+    mov         eax, esi          // row pointer
+    mov         ebp, ebx          // height
+    vpxor       ymm0, ymm0, ymm0  // clear accumulators
+    vpxor       ymm1, ymm1, ymm1
+    sub         ecx, 32
+    jg          xloop
+    pop         ebp
+    pop         ebx
+    pop         edi
+    pop         esi
+    vzeroupper
+    ret
+  }
+}
+// Bilinear column filtering. SSSE3 version.
 __declspec(naked)
 void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
                           int dst_width, int x, int dx) {