Commit 36ab38a3 authored by frkoenig@google.com

ARM Neon optimized 4x4 box filter.

Fix for 2x2 box filter assembly code.
Review URL: http://webrtc-codereview.appspot.com/240007

git-svn-id: http://libyuv.googlecode.com/svn/trunk@37 16f28f9a-4ce2-e073-06de-1de4eb20be90
parent 3dcaf734
@@ -58,9 +58,11 @@ void ScaleRowDown2_NEON(const uint8* src_ptr, int /* src_stride */,
"vst1.u8 {q0}, [%1]! \n" // store even pixels
"subs %2, %2, #16 \n" // 16 processed per loop
"bhi 1b \n"
: "+r"(src_ptr), "+r"(dst), "+r"(dst_width) // Output registers
: // Input registers
: "q0", "q1" // Clobber List
: "+r"(src_ptr), // %0
"+r"(dst), // %1
"+r"(dst_width) // %2
:
: "q0", "q1" // Clobber List
);
}
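Annotation: the hunk above only shows the operand-list cleanup, not the body of the routine. For reference, a minimal scalar sketch of what a 2x point-sampling row scaler computes, assuming the "store even pixels" comment means source indices 0, 2, 4, ... are kept (the name and shape here are hypothetical, not libyuv's actual C fallback):

```c
#include <stdint.h>

// Minimal scalar sketch of a 2x point-sampling row scaler: keep every
// second source pixel. Assumes "even pixels" means indices 0, 2, 4, ...
// Hypothetical reference, not libyuv's C fallback.
static void ScaleRowDown2_sketch(const uint8_t* src_ptr,
                                 uint8_t* dst, int dst_width) {
  for (int x = 0; x < dst_width; ++x) {
    dst[x] = src_ptr[x * 2];
  }
}
```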
@@ -85,13 +87,87 @@ void ScaleRowDown2Int_NEON(const uint8* src_ptr, int src_stride,
"vst1.u8 {q0}, [%2]! \n"
"subs %3, %3, #16 \n" // 16 processed per loop
"bhi 1b \n"
: "+r"(src_ptr), "+r"(src_stride),
"+r"(dst), "+r"(dst_width) // Output registers
: // Input registers
: "+r"(src_ptr), // %0
"+r"(src_stride), // %1
"+r"(dst), // %2
"+r"(dst_width) // %3
:
: "r4", "q0", "q1", "q2", "q3", "q4" // Clobber List
);
}
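Annotation: the commit message says this hunk fixes the 2x2 box filter assembly. A minimal scalar sketch of that filter, assuming the usual rounded average of each 2x2 source block (hypothetical name, not libyuv's C fallback):

```c
#include <stdint.h>

// Minimal scalar sketch of a 2x2 box filter: each output pixel is the
// rounded average of a 2x2 source block (add 2, shift right by 2).
// Hypothetical reference, not libyuv's C fallback.
static void ScaleRowDown2Int_sketch(const uint8_t* src_ptr, int src_stride,
                                    uint8_t* dst, int dst_width) {
  const uint8_t* row1 = src_ptr + src_stride;  // second source row
  for (int x = 0; x < dst_width; ++x) {
    uint16_t sum = (uint16_t)(src_ptr[0] + src_ptr[1] + row1[0] + row1[1]);
    dst[x] = (uint8_t)((sum + 2) >> 2);  // round, then divide by 4
    src_ptr += 2;
    row1 += 2;
  }
}
```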
+#define HAS_SCALEROWDOWN4_NEON
+// Widths on ARM devices are expected to be smaller, so 8x4 blocks were
+// chosen for the most coverage. Revisit later to evaluate 16x4 blocks
+// with handling of leftovers.
+static void ScaleRowDown4_NEON(const uint8* src_ptr, int /* src_stride */,
+                               uint8* dst_ptr, int dst_width) {
+  __asm__ volatile
+  (
+    "mov r4, #4 \n"
+    "1: \n"
+    "vld1.u8 {d0[0]}, [%0],r4 \n"  // load up only 2 pixels of data to
+    "vld1.u8 {d0[1]}, [%0],r4 \n"  // represent the entire 8x4 block
+    "vst1.u16 {d0[0]}, [%1]! \n"
+    "subs %2, #2 \n"  // dst_width -= 2
+    "bhi 1b \n"
+    : "+r"(src_ptr),   // %0
+      "+r"(dst_ptr),   // %1
+      "+r"(dst_width)  // %2
+    :
+    : "r4", "q0", "q1", "memory", "cc"
+  );
+}
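Annotation: a minimal scalar sketch of the routine above. The loop post-increments the source pointer by r4 == 4 per byte loaded, so it keeps every fourth pixel (hypothetical name, not libyuv's C fallback):

```c
#include <stdint.h>

// Minimal scalar sketch of the 4x point-sampling scaler: keep every
// fourth source pixel. Hypothetical reference, not libyuv's C fallback.
static void ScaleRowDown4_sketch(const uint8_t* src_ptr,
                                 uint8_t* dst_ptr, int dst_width) {
  for (int x = 0; x < dst_width; ++x) {
    dst_ptr[x] = src_ptr[x * 4];
  }
}
```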
+static void ScaleRowDown4Int_NEON(const uint8* src_ptr, int src_stride,
+                                  uint8* dst_ptr, int dst_width) {
+  __asm__ volatile
+  (
+    "1: \n"
+    "mov r4, %0 \n"
+    "vld1.u8 {d0}, [r4],%3 \n"  // load up 8x4 block of input data
+    "vld1.u8 {d1}, [r4],%3 \n"
+    "vld1.u8 {d2}, [r4],%3 \n"
+    "vld1.u8 {d3}, [r4] \n"
+    // data is loaded up into q0 and q1
+    // q0 = a00 a01 a02 a03 b00 b01 b02 b03 a10 a11 a12 a13 b10 b11 b12 b13
+    // q1 = a20 a21 a22 a23 b20 b21 b22 b23 a30 a31 a32 a33 b30 b31 b32 b33
+    // q0 = a00+a01 a02+a03 b00+b01 b02+b03 a10+a11 a12+a13 b10+b11 b12+b13
+    "vpaddl.u8 q0, q0 \n"
+    // d0 = a00+a01+a20+a21 a02+a03+a22+a23 b00+b01+b20+b21 b02+b03+b22+b23
+    // d1 = a10+a11+a30+a31 a12+a13+a32+a33 b10+b11+b30+b31 b12+b13+b32+b33
+    "vpadal.u8 q0, q1 \n"
+    // d0 = a00+a01+a20+a21+a02+a03+a22+a23 b00+b01+b20+b21+b02+b03+b22+b23
+    // d1 = a10+a11+a30+a31+a12+a13+a32+a33 b10+b11+b30+b31+b12+b13+b32+b33
+    "vpaddl.u16 q0, q0 \n"
+    // d0 = a00+a01+a20+a21+a02+a03+a22+a23+a10+a11+a30+a31+a12+a13+a32+a33
+    //      b00+b01+b20+b21+b02+b03+b22+b23+b10+b11+b30+b31+b12+b13+b32+b33
+    "vadd.u32 d0, d1 \n"
+    "vrshr.u32 d0, d0, #4 \n"  // divide by 16 w/rounding
+    "vst1.u8 {d0[0]}, [%1]! \n"
+    "vst1.u8 {d0[4]}, [%1]! \n"
+    "add %0, #8 \n"  // move src pointer to next 8 pixels
+    "subs %2, #2 \n"  // dst_width -= 2
+    "bhi 1b \n"
+    : "+r"(src_ptr),   // %0
+      "+r"(dst_ptr),   // %1
+      "+r"(dst_width)  // %2
+    : "r"(src_stride)  // %3
+    : "r4", "q0", "q1", "memory", "cc"
+  );
+}
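Annotation: a minimal scalar sketch of the 4x4 box filter above. Each output pixel is the rounded average of a 4x4 source block, matching the vrshr.u32 #4 divide-by-16 with rounding, i.e. (sum + 8) >> 4 (hypothetical name, not libyuv's C fallback):

```c
#include <stdint.h>

// Minimal scalar sketch of the 4x4 box filter: each output pixel is the
// rounded average of a 4x4 source block. Hypothetical reference, not
// libyuv's C fallback.
static void ScaleRowDown4Int_sketch(const uint8_t* src_ptr, int src_stride,
                                    uint8_t* dst_ptr, int dst_width) {
  for (int x = 0; x < dst_width; ++x) {
    uint16_t sum = 0;  // max 16 * 255 = 4080, fits in 16 bits
    for (int y = 0; y < 4; ++y) {
      for (int i = 0; i < 4; ++i) {
        sum += src_ptr[y * src_stride + i];
      }
    }
    dst_ptr[x] = (uint8_t)((sum + 8) >> 4);  // round, then divide by 16
    src_ptr += 4;  // next 4x4 block
  }
}
```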
/**
* SSE2 downscalers with interpolation.
*
@@ -2704,6 +2780,13 @@ static void ScalePlaneDown4(int src_width, int src_height,
void (*ScaleRowDown4)(const uint8* src_ptr, int src_stride,
uint8* dst_ptr, int dst_width);
+#if defined(HAS_SCALEROWDOWN4_NEON)
+  if (libyuv::TestCpuFlag(libyuv::kCpuHasNEON) &&
+      (dst_width % 2 == 0) && (src_stride % 8 == 0) &&
+      IS_ALIGNED(src_ptr, 8)) {
+    ScaleRowDown4 = filtering ? ScaleRowDown4Int_NEON : ScaleRowDown4_NEON;
+  } else
+#endif
#if defined(HAS_SCALEROWDOWN4_SSE2)
if (libyuv::TestCpuFlag(libyuv::kCpuHasSSE2) &&
(dst_width % 8 == 0) && (src_stride % 16 == 0) &&
......
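Annotation: the NEON path above is only selected when the output width is even (the inner loops write two pixels per iteration), the stride is a multiple of 8, and the source pointer is 8-byte aligned (each vld1.u8 {dN} reads 8 bytes). The IS_ALIGNED check is presumably the usual power-of-two mask test; a sketch of that shape, assumed rather than copied from libyuv:

```c
#include <stdint.h>

// Common shape of a power-of-two pointer-alignment test. Assumed to
// match libyuv's IS_ALIGNED macro; shown here for reference only.
#define IS_ALIGNED(p, a) ((((uintptr_t)(p)) & ((a) - 1)) == 0)
```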