Commit 05416e2d authored by fbarchard@google.com

Box filter for YUV: use rows with an accumulation buffer for better memory behavior.

Box filter for YUV: use rows with an accumulation buffer for better memory behavior.  The old code walked columns, accumulating into registers and storing each result once; reading down columns was slow from a memory point of view.  The new code processes one row of source at a time, adding it into an accumulation buffer after every row.  The accumulation buffer is small and should fit in cache.  Before each accumulation of N rows, the buffer needs to be reset to zero.  If the memset is a bottleneck, it would be faster to do the first row without an add, storing directly to the accumulation buffer, and then add for the remaining rows.
BUG=425
TESTED=out\release\libyuv_unittest --gtest_filter=*ScaleTo1x1*
R=harryjin@google.com

Review URL: https://webrtc-codereview.appspot.com/52659004

git-svn-id: http://libyuv.googlecode.com/svn/trunk@1428 16f28f9a-4ce2-e073-06de-1de4eb20be90
parent b07de879
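
The heart of the change, before reading the diff: each destination row is produced by clearing a small uint16 accumulator row, adding boxheight source rows into it, then averaging columns. A minimal standalone sketch of that flow (illustrative only; BoxFilterBandSketch is a hypothetical name, and the real code uses the per-platform ScaleAddRow kernels shown below):

    #include <stdint.h>
    #include <string.h>

    // Accumulate 'boxheight' 8-bit source rows into a uint16 row buffer.
    // Each pass reads one row sequentially, which is the cache-friendly
    // access pattern the commit message describes.
    static void BoxFilterBandSketch(const uint8_t* src, int src_stride,
                                    int src_width, int boxheight,
                                    uint16_t* row_accum) {
      memset(row_accum, 0, src_width * sizeof(uint16_t));  // reset per box
      for (int k = 0; k < boxheight; ++k) {
        for (int x = 0; x < src_width; ++x) {
          row_accum[x] += src[x];  // add this row's samples
        }
        src += src_stride;  // advance to the next source row
      }
      // A column pass (ScaleAddCols in libyuv) then averages row_accum
      // down to dst_width output pixels.
    }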
@@ -30,13 +30,11 @@ extern "C" {
 #define VISUALC_HAS_AVX2 1
 #endif  // VisualStudio >= 2012
 // The following are available on all x86 platforms:
 #if !defined(LIBYUV_DISABLE_X86) && \
     (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__))
 #define HAS_FIXEDDIV1_X86
 #define HAS_FIXEDDIV_X86
-#define HAS_SCALEADDROWS_SSE2
 #define HAS_SCALEARGBCOLS_SSE2
 #define HAS_SCALEARGBCOLSUP2_SSE2
 #define HAS_SCALEARGBFILTERCOLS_SSSE3
@@ -50,17 +48,21 @@ extern "C" {
 #define HAS_SCALEROWDOWN4_SSE2
 #endif
-// The following are available on VS2012.
+// The following are available on VS2012:
 #if !defined(LIBYUV_DISABLE_X86) && defined(VISUALC_HAS_AVX2)
-#define HAS_SCALEADDROWS_AVX2
+#define HAS_SCALEADDROW_AVX2
 #define HAS_SCALEROWDOWN2_AVX2
 #define HAS_SCALEROWDOWN4_AVX2
 #endif
+// The following are available on Visual C:
+#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && !defined(__clang__)
+#define HAS_SCALEADDROW_SSE2
+#endif
 // The following are available on Neon platforms:
 #if !defined(LIBYUV_DISABLE_NEON) && !defined(__native_client__) && \
     (defined(__ARM_NEON__) || defined(LIBYUV_NEON) || defined(__aarch64__))
-#define HAS_SCALEADDROWS_NEON
+#define HAS_SCALEADDROW_NEON
 #define HAS_SCALEARGBCOLS_NEON
 #define HAS_SCALEARGBROWDOWN2_NEON
 #define HAS_SCALEARGBROWDOWNEVEN_NEON
@@ -183,10 +185,8 @@ void ScaleRowDown38_2_Box_C(const uint8* src_ptr, ptrdiff_t src_stride,
                             uint8* dst_ptr, int dst_width);
 void ScaleRowDown38_2_Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
                                uint16* dst_ptr, int dst_width);
-void ScaleAddRows_C(const uint8* src_ptr, ptrdiff_t src_stride,
-                    uint16* dst_ptr, int src_width, int src_height);
-void ScaleAddRows_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
-                       uint32* dst_ptr, int src_width, int src_height);
+void ScaleAddRow_C(const uint8* src_ptr, uint16* dst_ptr, int src_width);
+void ScaleAddRow_16_C(const uint16* src_ptr, uint32* dst_ptr, int src_width);
 void ScaleARGBRowDown2_C(const uint8* src_argb,
                          ptrdiff_t src_stride,
                          uint8* dst_argb, int dst_width);
@@ -289,14 +289,10 @@ void ScaleRowDown38_2_Box_Any_SSSE3(const uint8* src_ptr,
                                     ptrdiff_t src_stride,
                                     uint8* dst_ptr, int dst_width);
-void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
-                       uint16* dst_ptr, int src_width, int src_height);
-void ScaleAddRows_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
-                       uint16* dst_ptr, int src_width, int src_height);
-void ScaleAddRows_Any_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
-                           uint16* dst_ptr, int src_width, int src_height);
-void ScaleAddRows_Any_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
-                           uint16* dst_ptr, int src_width, int src_height);
+void ScaleAddRow_SSE2(const uint8* src_ptr, uint16* dst_ptr, int src_width);
+void ScaleAddRow_AVX2(const uint8* src_ptr, uint16* dst_ptr, int src_width);
+void ScaleAddRow_Any_SSE2(const uint8* src_ptr, uint16* dst_ptr, int src_width);
+void ScaleAddRow_Any_AVX2(const uint8* src_ptr, uint16* dst_ptr, int src_width);
 void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
                            int dst_width, int x, int dx);
@@ -442,10 +438,8 @@ void ScaleRowDown38_3_Box_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
 void ScaleRowDown38_2_Box_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
                                    uint8* dst_ptr, int dst_width);
-void ScaleAddRows_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
-                       uint16* dst_ptr, int src_width, int src_height);
-void ScaleAddRows_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
-                           uint16* dst_ptr, int src_width, int src_height);
+void ScaleAddRow_NEON(const uint8* src_ptr, uint16* dst_ptr, int src_width);
+void ScaleAddRow_Any_NEON(const uint8* src_ptr, uint16* dst_ptr, int src_width);
 void ScaleFilterCols_NEON(uint8* dst_ptr, const uint8* src_ptr,
                           int dst_width, int x, int dx);
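
Note the shape of this API change: the old ScaleAddRows kernels took src_stride and src_height and owned the entire 2-D accumulation, while the new ScaleAddRow kernels touch exactly one source row, so the caller (ScalePlaneBox below) owns the row loop and the accumulator reset.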
...
@@ -733,7 +733,7 @@ static void ScalePlaneBox(int src_width, int src_height,
                           int dst_width, int dst_height,
                           int src_stride, int dst_stride,
                           const uint8* src_ptr, uint8* dst_ptr) {
-  int j;
+  int j, k;
   // Initial source x/y coordinate and step values as 16.16 fixed point.
   int x = 0;
   int y = 0;
@@ -750,29 +750,29 @@ static void ScalePlaneBox(int src_width, int src_height,
                        const uint16* src_ptr, uint8* dst_ptr) =
       (dx & 0xffff) ? ScaleAddCols2_C:
       ((dx != 0x10000) ? ScaleAddCols1_C : ScaleAddCols0_C);
-  void (*ScaleAddRows)(const uint8* src_ptr, ptrdiff_t src_stride,
-      uint16* dst_ptr, int src_width, int src_height) = ScaleAddRows_C;
-#if defined(HAS_SCALEADDROWS_SSE2)
+  void (*ScaleAddRow)(const uint8* src_ptr, uint16* dst_ptr, int src_width) =
+      ScaleAddRow_C;
+#if defined(HAS_SCALEADDROW_SSE2)
   if (TestCpuFlag(kCpuHasSSE2)) {
-    ScaleAddRows = ScaleAddRows_Any_SSE2;
+    ScaleAddRow = ScaleAddRow_Any_SSE2;
     if (IS_ALIGNED(src_width, 16)) {
-      ScaleAddRows = ScaleAddRows_SSE2;
+      ScaleAddRow = ScaleAddRow_SSE2;
     }
   }
 #endif
-#if defined(HAS_SCALEADDROWS_AVX2)
+#if defined(HAS_SCALEADDROW_AVX2)
   if (TestCpuFlag(kCpuHasAVX2)) {
-    ScaleAddRows = ScaleAddRows_Any_AVX2;
+    ScaleAddRow = ScaleAddRow_Any_AVX2;
     if (IS_ALIGNED(src_width, 32)) {
-      ScaleAddRows = ScaleAddRows_AVX2;
+      ScaleAddRow = ScaleAddRow_AVX2;
     }
   }
 #endif
-#if defined(HAS_SCALEADDROWS_NEON)
+#if defined(HAS_SCALEADDROW_NEON)
   if (TestCpuFlag(kCpuHasNEON)) {
-    ScaleAddRows = ScaleAddRows_Any_NEON;
+    ScaleAddRow = ScaleAddRow_Any_NEON;
     if (IS_ALIGNED(src_width, 16)) {
-      ScaleAddRows = ScaleAddRows_NEON;
+      ScaleAddRow = ScaleAddRow_NEON;
     }
   }
 #endif
@@ -786,7 +786,11 @@ static void ScalePlaneBox(int src_width, int src_height,
       y = max_y;
     }
     boxheight = MIN1((y >> 16) - iy);
-    ScaleAddRows(src, src_stride, (uint16*)(row16), src_width, boxheight);
+    memset(row16, 0, src_width * 2);
+    for (k = 0; k < boxheight; ++k) {
+      ScaleAddRow(src, (uint16 *)(row16), src_width);
+      src += src_stride;
+    }
     ScaleAddCols(dst_width, boxheight, x, dx, (uint16*)(row16), dst_ptr);
     dst_ptr += dst_stride;
   }
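
If the per-box memset ever shows up in profiles, the commit message suggests storing the first row instead of clearing. A sketch of that variant against the loop above (ScaleStoreRow is a hypothetical helper, not part of libyuv; MIN1 guarantees boxheight >= 1, so the first row always exists):

    // Hypothetical: widen one 8-bit row into the uint16 accumulator,
    // overwriting old contents, which makes the memset unnecessary.
    static void ScaleStoreRow(const uint8* src, uint16* dst, int width) {
      int x;
      for (x = 0; x < width; ++x) {
        dst[x] = src[x];
      }
    }

    // Per-box loop without the memset:
    ScaleStoreRow(src, (uint16*)(row16), src_width);
    src += src_stride;
    for (k = 1; k < boxheight; ++k) {
      ScaleAddRow(src, (uint16*)(row16), src_width);
      src += src_stride;
    }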
@@ -798,7 +802,7 @@ static void ScalePlaneBox_16(int src_width, int src_height,
                              int dst_width, int dst_height,
                              int src_stride, int dst_stride,
                              const uint16* src_ptr, uint16* dst_ptr) {
-  int j;
+  int j, k;
   // Initial source x/y coordinate and step values as 16.16 fixed point.
   int x = 0;
   int y = 0;
@@ -814,12 +818,12 @@ static void ScalePlaneBox_16(int src_width, int src_height,
   void (*ScaleAddCols)(int dst_width, int boxheight, int x, int dx,
                        const uint32* src_ptr, uint16* dst_ptr) =
       (dx & 0xffff) ? ScaleAddCols2_16_C: ScaleAddCols1_16_C;
-  void (*ScaleAddRows)(const uint16* src_ptr, ptrdiff_t src_stride,
-      uint32* dst_ptr, int src_width, int src_height) = ScaleAddRows_16_C;
-#if defined(HAS_SCALEADDROWS_16_SSE2)
+  void (*ScaleAddRow)(const uint16* src_ptr, uint32* dst_ptr, int src_width) =
+      ScaleAddRow_16_C;
+#if defined(HAS_SCALEADDROW_16_SSE2)
   if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(src_width, 16)) {
-    ScaleAddRows = ScaleAddRows_16_SSE2;
+    ScaleAddRow = ScaleAddRow_16_SSE2;
   }
 #endif
@@ -832,7 +836,11 @@ static void ScalePlaneBox_16(int src_width, int src_height,
       y = max_y;
     }
     boxheight = MIN1((y >> 16) - iy);
-    ScaleAddRows(src, src_stride, (uint32*)(row32), src_width, boxheight);
+    memset(row32, 0, src_width * 4);
+    for (k = 0; k < boxheight; ++k) {
+      ScaleAddRow(src, (uint32 *)(row32), src_width);
+      src += src_stride;
+    }
     ScaleAddCols(dst_width, boxheight, x, dx, (uint32*)(row32), dst_ptr);
     dst_ptr += dst_stride;
   }
...
@@ -169,25 +169,23 @@ SDAANY(ScaleARGBRowDownEvenBox_Any_NEON, ScaleARGBRowDownEvenBox_NEON,
 #endif
 // Add rows box filter scale down.
-#define SAANY(NAMEANY, SCALEADDROWS_SIMD, SCALEADDROWS_C, MASK) \
-    void NAMEANY(const uint8* src_ptr, ptrdiff_t src_stride, \
-                 uint16* dst_ptr, int src_width, int src_height) { \
+#define SAANY(NAMEANY, SCALEADDROW_SIMD, SCALEADDROW_C, MASK) \
+    void NAMEANY(const uint8* src_ptr, uint16* dst_ptr, int src_width) { \
       int n = src_width & ~MASK; \
       if (n > 0) { \
-        SCALEADDROWS_SIMD(src_ptr, src_stride, dst_ptr, n, src_height); \
+        SCALEADDROW_SIMD(src_ptr, dst_ptr, n); \
      } \
-      SCALEADDROWS_C(src_ptr + n, src_stride, \
-                     dst_ptr + n, src_width & MASK, src_height); \
+      SCALEADDROW_C(src_ptr + n, dst_ptr + n, src_width & MASK); \
     }
-#ifdef HAS_SCALEADDROWS_SSE2
-SAANY(ScaleAddRows_Any_SSE2, ScaleAddRows_SSE2, ScaleAddRows_C, 15)
+#ifdef HAS_SCALEADDROW_SSE2
+SAANY(ScaleAddRow_Any_SSE2, ScaleAddRow_SSE2, ScaleAddRow_C, 15)
 #endif
-#ifdef HAS_SCALEADDROWS_AVX2
-SAANY(ScaleAddRows_Any_AVX2, ScaleAddRows_AVX2, ScaleAddRows_C, 31)
+#ifdef HAS_SCALEADDROW_AVX2
+SAANY(ScaleAddRow_Any_AVX2, ScaleAddRow_AVX2, ScaleAddRow_C, 31)
 #endif
-#ifdef HAS_SCALEADDROWS_NEON
-SAANY(ScaleAddRows_Any_NEON, ScaleAddRows_NEON, ScaleAddRows_C, 15)
+#ifdef HAS_SCALEADDROW_NEON
+SAANY(ScaleAddRow_Any_NEON, ScaleAddRow_NEON, ScaleAddRow_C, 15)
 #endif
 #undef SAANY
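
For reference, expanding SAANY(ScaleAddRow_Any_SSE2, ScaleAddRow_SSE2, ScaleAddRow_C, 15) yields, modulo whitespace:

    void ScaleAddRow_Any_SSE2(const uint8* src_ptr, uint16* dst_ptr,
                              int src_width) {
      int n = src_width & ~15;  // round width down to a multiple of 16
      if (n > 0) {
        ScaleAddRow_SSE2(src_ptr, dst_ptr, n);  // SIMD on the bulk
      }
      ScaleAddRow_C(src_ptr + n, dst_ptr + n, src_width & 15);  // C tail
    }

So a src_width of 100, for example, runs the SSE2 kernel on 96 pixels and the C kernel on the remaining 4, which is why ScalePlaneBox can pick the Any variant for widths that are not multiples of 16.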
...
@@ -621,39 +621,31 @@ void ScaleRowDown38_2_Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
   }
 }
 
-void ScaleAddRows_C(const uint8* src_ptr, ptrdiff_t src_stride,
-                    uint16* dst_ptr, int src_width, int src_height) {
+void ScaleAddRow_C(const uint8* src_ptr, uint16* dst_ptr, int src_width) {
   int x;
   assert(src_width > 0);
-  assert(src_height > 0);
-  for (x = 0; x < src_width; ++x) {
-    const uint8* s = src_ptr + x;
-    unsigned int sum = 0u;
-    int y;
-    for (y = 0; y < src_height; ++y) {
-      sum += s[0];
-      s += src_stride;
-    }
-    // TODO(fbarchard): Consider limiting height to 256 to avoid overflow.
-    dst_ptr[x] = sum < 65535u ? sum : 65535u;
+  for (x = 0; x < src_width - 1; x += 2) {
+    dst_ptr[0] += src_ptr[0];
+    dst_ptr[1] += src_ptr[1];
+    src_ptr += 2;
+    dst_ptr += 2;
+  }
+  if (src_width & 1) {
+    dst_ptr[0] += src_ptr[0];
   }
 }
 
-void ScaleAddRows_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
-                       uint32* dst_ptr, int src_width, int src_height) {
+void ScaleAddRow_16_C(const uint16* src_ptr, uint32* dst_ptr, int src_width) {
   int x;
   assert(src_width > 0);
-  assert(src_height > 0);
-  for (x = 0; x < src_width; ++x) {
-    const uint16* s = src_ptr + x;
-    unsigned int sum = 0u;
-    int y;
-    for (y = 0; y < src_height; ++y) {
-      sum += s[0];
-      s += src_stride;
-    }
-    // No risk of overflow here now
-    dst_ptr[x] = sum;
+  for (x = 0; x < src_width - 1; x += 2) {
+    dst_ptr[0] += src_ptr[0];
+    dst_ptr[1] += src_ptr[1];
+    src_ptr += 2;
+    dst_ptr += 2;
+  }
+  if (src_width & 1) {
+    dst_ptr[0] += src_ptr[0];
   }
 }
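
One property of the new C kernels worth noting: dst_ptr[0] += src_ptr[0] on a uint16 accumulator does not saturate, whereas the old code clamped to 65535 and the SSE2 kernel uses paddusw. With 8-bit sources the accumulator can absorb at most 65535 / 255 = 257 rows before it could wrap, which is the same concern the deleted TODO raised. A defensive usage sketch (hypothetical wrapper, assuming libyuv's uint8/uint16 typedefs and <assert.h>):

    // Accumulate 'height' rows into 'accum', asserting the uint16 headroom
    // bound: 255 * 257 == 65535 is the largest sum that still fits.
    static void ScaleAddRowsChecked(const uint8* src, ptrdiff_t stride,
                                    uint16* accum, int width, int height) {
      int k;
      assert(height <= 257);
      for (k = 0; k < height; ++k) {
        ScaleAddRow_C(src, accum, width);
        src += stride;
      }
    }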
...
@@ -800,104 +800,61 @@ void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr,
   }
 }
 
-// Reads 16xN bytes and produces 16 shorts at a time.
+// Reads 16 bytes and accumulates to 16 shorts at a time.
 __declspec(naked)
-void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
-                       uint16* dst_ptr, int src_width, int src_height) {
+void ScaleAddRow_SSE2(const uint8* src_ptr, uint16* dst_ptr, int src_width) {
   __asm {
-    push       esi
-    push       edi
-    push       ebx
-    push       ebp
-    mov        esi, [esp + 16 + 4]   // src_ptr
-    mov        edx, [esp + 16 + 8]   // src_stride
-    mov        edi, [esp + 16 + 12]  // dst_ptr
-    mov        ecx, [esp + 16 + 16]  // dst_width
-    mov        ebx, [esp + 16 + 20]  // height
-    mov        eax, esi              // row pointer
-    mov        ebp, ebx              // height
-    pxor       xmm0, xmm0            // clear accumulators
-    pxor       xmm1, xmm1
-    pxor       xmm4, xmm4
+    mov        eax, [esp + 4]   // src_ptr
+    mov        edx, [esp + 8]   // dst_ptr
+    mov        ecx, [esp + 12]  // src_width
+    pxor       xmm5, xmm5
 
   // sum rows
   xloop:
-    movdqu     xmm2, [eax]       // read 16 pixels
-    lea        eax, [eax + edx]  // advance to next row
-    movdqa     xmm3, xmm2
-    punpcklbw  xmm2, xmm4
-    punpckhbw  xmm3, xmm4
+    movdqu     xmm3, [eax]  // read 16 bytes
+    lea        eax, [eax + 16]
+    movdqu     xmm0, [edx]  // read 16 words from destination
+    movdqu     xmm1, [edx + 16]
+    movdqa     xmm2, xmm3
+    punpcklbw  xmm2, xmm5
+    punpckhbw  xmm3, xmm5
     paddusw    xmm0, xmm2  // sum 16 words
     paddusw    xmm1, xmm3
-    sub        ebp, 1
-    jg         xloop
-    movdqu     [edi], xmm0
-    movdqu     [edi + 16], xmm1
-    lea        edi, [edi + 32]  // dst_ptr += 16
-    lea        esi, [esi + 16]  // src_ptr += 16
-    mov        eax, esi         // row pointer
-    mov        ebp, ebx         // height
-    pxor       xmm0, xmm0       // clear accumulators
-    pxor       xmm1, xmm1
+    movdqu     [edx], xmm0  // write 16 words to destination
+    movdqu     [edx + 16], xmm1
+    lea        edx, [edx + 32]
    sub        ecx, 16
    jg         xloop
-    pop        ebp
-    pop        ebx
-    pop        edi
-    pop        esi
     ret
   }
 }
 
-// Reads 32xN bytes and produces 32 shorts at a time.
+// Reads 32 bytes and accumulates to 32 shorts at a time.
 __declspec(naked)
-void ScaleAddRows_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
-                       uint16* dst_ptr, int src_width, int src_height) {
+void ScaleAddRow_AVX2(const uint8* src_ptr, uint16* dst_ptr, int src_width) {
   __asm {
-    push       esi
-    push       edi
-    push       ebx
-    push       ebp
-    mov        esi, [esp + 16 + 4]   // src_ptr
-    mov        edx, [esp + 16 + 8]   // src_stride
-    mov        edi, [esp + 16 + 12]  // dst_ptr
-    mov        ecx, [esp + 16 + 16]  // dst_width
-    mov        ebx, [esp + 16 + 20]  // height
-    mov        eax, esi              // row pointer
-    mov        ebp, ebx              // height
-    vpxor      ymm0, ymm0, ymm0      // clear accumulators
-    vpxor      ymm1, ymm1, ymm1
-    vpxor      ymm4, ymm4, ymm4
+    mov        eax, [esp + 4]   // src_ptr
+    mov        edx, [esp + 8]   // dst_ptr
+    mov        ecx, [esp + 12]  // src_width
+    vpxor      ymm5, ymm5, ymm5
 
   // sum rows
   xloop:
-    vmovdqu    ymm2, [eax]       // read 16 pixels
-    vpermq     ymm2, ymm2, 0xd8  // unmutate for vpunpck
-    lea        eax, [eax + edx]  // advance to next row
-    vpunpckhbw ymm3, ymm2, ymm4
-    vpunpcklbw ymm2, ymm2, ymm4
+    vmovdqu    ymm3, [eax]  // read 32 bytes
+    vpermq     ymm3, ymm3, 0xd8  // unmutate for vpunpck
+    lea        eax, [eax + 32]
+    vmovdqu    ymm0, [edx]  // read 32 words from destination
+    vmovdqu    ymm1, [edx + 32]
+    vpunpcklbw ymm2, ymm3, ymm5
+    vpunpckhbw ymm3, ymm3, ymm5
     vpaddusw   ymm0, ymm0, ymm2  // sum 16 words
     vpaddusw   ymm1, ymm1, ymm3
-    sub        ebp, 1
-    jg         xloop
-    vmovdqu    [edi], ymm0
-    vmovdqu    [edi + 32], ymm1
-    lea        edi, [edi + 64]  // dst_ptr
-    lea        esi, [esi + 32]  // src_ptr
-    mov        eax, esi         // row pointer
-    mov        ebp, ebx         // height
-    vpxor      ymm0, ymm0, ymm0  // clear accumulators
-    vpxor      ymm1, ymm1, ymm1
+    vmovdqu    [edx], ymm0  // write 32 words to destination
+    vmovdqu    [edx + 32], ymm1
+    lea        edx, [edx + 64]
     sub        ecx, 32
     jg         xloop
-    pop        ebp
-    pop        ebx
-    pop        edi
-    pop        esi
     vzeroupper
     ret
   }
 }
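
For readers who prefer intrinsics to MASM, the new ScaleAddRow_SSE2 is equivalent to the following sketch (for clarity only; libyuv ships the __asm version above, and this assumes src_width is a positive multiple of 16):

    #include <emmintrin.h>  // SSE2
    #include <stdint.h>

    void ScaleAddRow_SSE2_Sketch(const uint8_t* src_ptr, uint16_t* dst_ptr,
                                 int src_width) {
      const __m128i zero = _mm_setzero_si128();
      for (int x = 0; x < src_width; x += 16) {
        __m128i s = _mm_loadu_si128((const __m128i*)(src_ptr + x));
        __m128i lo = _mm_unpacklo_epi8(s, zero);  // bytes 0..7 -> words
        __m128i hi = _mm_unpackhi_epi8(s, zero);  // bytes 8..15 -> words
        __m128i d0 = _mm_loadu_si128((const __m128i*)(dst_ptr + x));
        __m128i d1 = _mm_loadu_si128((const __m128i*)(dst_ptr + x + 8));
        d0 = _mm_adds_epu16(d0, lo);  // saturating add, like paddusw
        d1 = _mm_adds_epu16(d1, hi);
        _mm_storeu_si128((__m128i*)(dst_ptr + x), d0);
        _mm_storeu_si128((__m128i*)(dst_ptr + x + 8), d1);
      }
    }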
...
@@ -690,6 +690,8 @@ TEST_F(libyuvTest, FMT_A##To##FMT_PLANAR##N) { \
 TESTATOPLANAR(ARGB, 4, 1, I420, 2, 2, 4)
 #if defined(__arm__) || defined (__aarch64__)
+// arm version subsamples by summing 4 pixels then multiplying by matrix with
+// 4x smaller coefficients which are rounded to nearest integer.
 TESTATOPLANAR(ARGB, 4, 1, J420, 2, 2, 4)
 #else
 TESTATOPLANAR(ARGB, 4, 1, J420, 2, 2, 0)
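
As a rough illustration of why the looser tolerance is plausible (coefficient value hypothetical): if a Y coefficient is 29/256, the arm path applies round(29/4) = 7 to the 4-pixel sum, so the effective coefficient is 28/256 rather than 29/256; on a saturated channel that is an error of about 255/256, roughly 1 LSB per coefficient, and with three coefficients plus final rounding a per-pixel difference of up to 4 is credible.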