ScaleFilterRows optimized for NEON.

Includes unit test that scales the image up by 2. Currently this is done using the generic bilinear scale. Review URL: http://webrtc-codereview.appspot.com/330032 git-svn-id: http://libyuv.googlecode.com/svn/trunk@126 16f28f9a-4ce2-e073-06de-1de4eb20be90

ScaleFilterRows optimized for NEON.
Includes unit test that scales the image up by 2. Currently this is done using the generic bilinear scale. Review URL: http://webrtc-codereview.appspot.com/330032 git-svn-id: http://libyuv.googlecode.com/svn/trunk@126 16f28f9a-4ce2-e073-06de-1de4eb20be90
ea8d0eb0 · frkoenig@google.com · 17f198cd · ea8d0eb0 · ea8d0eb0 · ea8d0eb0
Commit ea8d0eb0 authored Jan 04, 2012 by frkoenig@google.com
Hide whitespace changes
Inline Side-by-side

Showing with 86 additions and 9 deletions

README.chromium README.chromium +1 -1

scale.cc source/scale.cc +61 -0

scale_test.cc unit_test/scale_test.cc +24 -8

No files found.
--- a/README.chromium
+++ b/README.chromium
 Name: libyuv
 URL: http://code.google.com/p/libyuv/
-Version: 125
+Version: 126
 License: BSD
 License File: LICENSE


--- a/source/scale.cc
+++ b/source/scale.cc
@@ -502,6 +502,61 @@ static void ScaleRowDown38_2_Int_NEON(const uint8* src_ptr, int src_stride,
  );
 }

+// 16x2 -> 16x1: blends each source row byte with the byte src_stride below it.
+#define HAS_SCALEFILTERROWS_NEON
+static void ScaleFilterRows_NEON(uint8* dst_ptr,
+                              const uint8* src_ptr, int src_stride,
+                              int dst_width, int source_y_fraction) {
+  asm volatile (
+    "cmp          %4, #0                       \n"  // fraction == 0:
+    "beq          2f                           \n"  //   copy the first row unchanged.
+    "add          %2, %1                       \n"  // %2 = src_ptr + src_stride (second row).
+    "cmp          %4, #128                     \n"  // fraction == 128:
+    "beq          3f                           \n"  //   rounded average of the two rows.
+
+    "vdup.8       d5, %4                       \n"  // d5 = fraction (second-row weight).
+    "rsb          %4, #256                     \n"  // %4 = 256 - fraction.
+    "vdup.8       d4, %4                       \n"  // d4 = first-row weight; assumes fraction < 256 - TODO confirm.
+    "1:                                        \n"  // General blend: 16 bytes per pass.
+    "vld1.u8      {q0}, [%1]!                  \n"  // q0 = 16 bytes of the first row.
+    "vld1.u8      {q1}, [%2]!                  \n"  // q1 = 16 bytes of the second row.
+    "subs         %3, #16                      \n"  // Count down; sets the flags tested by bhi.
+    "vmull.u8     q13, d0, d4                  \n"  // q13 = row0[0..7]  * (256 - fraction).
+    "vmull.u8     q14, d1, d4                  \n"  // q14 = row0[8..15] * (256 - fraction).
+    "vmlal.u8     q13, d2, d5                  \n"  // q13 += row1[0..7]  * fraction.
+    "vmlal.u8     q14, d3, d5                  \n"  // q14 += row1[8..15] * fraction.
+    "vrshrn.u16   d0, q13, #8                  \n"  // Round, shift right by 8, narrow to bytes.
+    "vrshrn.u16   d1, q14, #8                  \n"
+    "vst1.u8      {q0}, [%0]!                  \n"  // Store 16 blended bytes.
+    "bhi          1b                           \n"  // Repeat while the count was above 16 before subs.
+    "b            4f                           \n"
+
+    "2:                                        \n"  // Copy path: the second row is never read.
+    "vld1.u8      {q0}, [%1]!                  \n"
+    "subs         %3, #16                      \n"
+    "vst1.u8      {q0}, [%0]!                  \n"
+    "bhi          2b                           \n"
+    "b            4f                           \n"
+
+    "3:                                        \n"  // Half-way path: no multiplies needed.
+    "vld1.u8      {q0}, [%1]!                  \n"
+    "vld1.u8      {q1}, [%2]!                  \n"
+    "subs         %3, #16                      \n"
+    "vrhadd.u8    q0, q1                       \n"  // q0 = (q0 + q1 + 1) >> 1 per byte.
+    "vst1.u8      {q0}, [%0]!                  \n"
+    "bhi          3b                           \n"
+    "4:                                        \n"  // All paths end here; every loop runs at least once.
+    "vst1.u8      {d1[7]}, [%0]                \n"  // Repeat the last output byte one past the row end; presumably for a later column filter - confirm.
+    : "+r"(dst_ptr),          // %0: output pointer, advanced by each store
+      "+r"(src_ptr),          // %1: first-row pointer, advanced by each load
+      "+r"(src_stride),       // %2: becomes the second-row pointer after the add
+      "+r"(dst_width),        // %3: bytes left; consumed 16 at a time
+      "+r"(source_y_fraction) // %4: blend fraction; overwritten with 256 - fraction
+    :
+    : "q0", "q1", "d4", "d5", "q13", "q14", "memory", "cc"
+  );
+}
+
 /**
 * SSE2 downscalers with interpolation.
 *
@@ -3471,6 +3526,12 @@ void ScalePlaneBilinear(int src_width, int src_height,
                            int dst_width, int source_y_fraction);
    void (*ScaleFilterCols)(uint8* dst_ptr, const uint8* src_ptr,
                            int dst_width, int dx);
+#if defined(HAS_SCALEFILTERROWS_NEON)
+    if (TestCpuFlag(kCpuHasNEON) &&
+        IS_ALIGNED(src_width, 16)) {
+      ScaleFilterRows = ScaleFilterRows_NEON;
+    } else
+#endif
 #if defined(HAS_SCALEFILTERROWS_SSSE3)
    if (TestCpuFlag(kCpuHasSSSE3) &&
        IS_ALIGNED(src_stride, 16) && IS_ALIGNED(src_ptr, 16) &&

--- a/unit_test/scale_test.cc
+++ b/unit_test/scale_test.cc
@@ -153,8 +153,8 @@ TEST_F(libyuvTest, ScaleDownBy2) {

  const int src_width = 1280;
  const int src_height = 720;
-  const int dst_width = src_width >> 1;
-  const int dst_height = src_height >> 1;
+  const int dst_width = src_width / 2;
+  const int dst_height = src_height / 2;
  int err = 0;

  for (int f = 0; f < 3; ++f)
@@ -169,8 +169,8 @@ TEST_F(libyuvTest, ScaleDownBy4) {

  const int src_width = 1280;
  const int src_height = 720;
-  const int dst_width = src_width >> 2;
-  const int dst_height = src_height >> 2;
+  const int dst_width = src_width / 4;
+  const int dst_height = src_height / 4;
  int err = 0;

  for (int f = 0; f < 3; ++f)
@@ -185,8 +185,8 @@ TEST_F(libyuvTest, ScaleDownBy34) {

  const int src_width = 1280;
  const int src_height = 720;
-  const int dst_width = (src_width*3) >> 2;
-  const int dst_height = (src_height*3) >> 2;
+  const int dst_width = src_width * 3 / 4;
+  const int dst_height = src_height * 3 / 4;
  int err = 0;

  for (int f = 0; f < 3; ++f)
@@ -200,8 +200,24 @@ TEST_F(libyuvTest, ScaleDownBy34) {
 TEST_F(libyuvTest, ScaleDownBy38) {
  int src_width = 1280;
  int src_height = 720;
-  int dst_width = (src_width*3) >> 3;
-  int dst_height = (src_height*3) >> 3;
+  int dst_width = src_width * 3 / 8;
+  int dst_height = src_height * 3 / 8;
+
+  int err = 0;
+
+  for (int f = 0; f < 3; ++f)
+    err += TestFilter (src_width, src_height,
+                       dst_width, dst_height,
+                       static_cast<FilterMode>(f));
+
+  EXPECT_EQ(0, err);
+}
+
+TEST_F(libyuvTest, ScalePlaneBilinear) {
+  int src_width = 1280;
+  int src_height = 720;
+  int dst_width = 1366;
+  int dst_height = 768;

  int err = 0;