scale by even amounts using Neon

BUG=156 TEST=./libyuv_unittest --gtest_filter=*ARGBScale* Review URL: https://webrtc-codereview.appspot.com/930019 git-svn-id: http://libyuv.googlecode.com/svn/trunk@492 16f28f9a-4ce2-e073-06de-1de4eb20be90

scale by even amounts using Neon
BUG=156 TEST=./libyuv_unittest --gtest_filter=*ARGBScale* Review URL: https://webrtc-codereview.appspot.com/930019 git-svn-id: http://libyuv.googlecode.com/svn/trunk@492 16f28f9a-4ce2-e073-06de-1de4eb20be90
cb5262db · fbarchard@google.com · f08ac6bb · cb5262db · cb5262db · cb5262db
Commit cb5262db authored Nov 16, 2012 by fbarchard@google.com
11 changed files
--- a/Android.mk
+++ b/Android.mk
@@ -36,7 +36,8 @@ ifeq ($(TARGET_ARCH_ABI),armeabi-v7a)
        source/compare_neon.cc.neon \
        source/rotate_neon.cc.neon  \
        source/row_neon.cc.neon     \
-        source/scale_neon.cc.neon
+        source/scale_neon.cc.neon   \
+        source/scale_argb_neon.cc.neon
 endif
 LOCAL_C_INCLUDES += $(LOCAL_PATH)/include

--- a/README.chromium
+++ b/README.chromium
 Name: libyuv
 URL: http://code.google.com/p/libyuv/
-Version: 491
+Version: 492
 License: BSD
 License File: LICENSE

--- a/include/libyuv/version.h
+++ b/include/libyuv/version.h
@@ -11,6 +11,6 @@
 #ifndef INCLUDE_LIBYUV_VERSION_H_  // NOLINT
 #define INCLUDE_LIBYUV_VERSION_H_
-#define LIBYUV_VERSION 491
+#define LIBYUV_VERSION 492
 #endif  // INCLUDE_LIBYUV_VERSION_H_  NOLINT
--- a/libyuv.gyp
+++ b/libyuv.gyp
@@ -90,6 +90,7 @@
        'source/row_win.cc',
        'source/scale.cc',
        'source/scale_argb.cc',
+        'source/scale_argb_neon.cc',
        'source/scale_mips.cc',
        'source/scale_neon.cc',
        'source/video_common.cc',

--- a/source/rotate.cc
+++ b/source/rotate.cc
@@ -41,7 +41,7 @@ extern "C" {
 #endif
 #endif
-#if !defined(YUV_DISABLE_ASM) && defined(__ARM_NEON__)
+#if !defined(YUV_DISABLE_ASM) && (defined(__ARM_NEON__) || defined(LIBYUV_NEON))
 #define HAS_MIRRORROW_NEON
 void MirrorRow_NEON(const uint8* src, uint8* dst, int width);
 #define HAS_MIRRORROW_UV_NEON

--- a/source/rotate_argb.cc
+++ b/source/rotate_argb.cc
@@ -29,6 +29,13 @@ void ScaleARGBRowDownEven_SSE2(const uint8* src_ptr, int src_stride,
                               int src_stepx,
                               uint8* dst_ptr, int dst_width);
 #endif
+#if !defined(YUV_DISABLE_ASM) && (defined(__ARM_NEON__) || defined(LIBYUV_NEON))
+#define HAS_SCALEARGBROWDOWNEVEN_NEON
+void ScaleARGBRowDownEven_NEON(const uint8* src_ptr, int src_stride,
+                               int src_stepx,
+                               uint8* dst_ptr, int dst_width);
+#endif
 void ScaleARGBRowDownEven_C(const uint8* src_ptr, int,
                            int src_stepx,
                            uint8* dst_ptr, int dst_width);
@@ -36,6 +43,7 @@ void ScaleARGBRowDownEven_C(const uint8* src_ptr, int,
 static void ARGBTranspose(const uint8* src, int src_stride,
                          uint8* dst, int dst_stride,
                          int width, int height) {
+  int src_pixel_step = src_stride >> 2;
  void (*ScaleARGBRowDownEven)(const uint8* src_ptr, int src_stride,
      int src_step, uint8* dst_ptr, int dst_width) = ScaleARGBRowDownEven_C;
 #if defined(HAS_SCALEARGBROWDOWNEVEN_SSE2)
@@ -43,9 +51,13 @@ static void ARGBTranspose(const uint8* src, int src_stride,
      IS_ALIGNED(dst, 16) && IS_ALIGNED(dst_stride, 16)) {
    ScaleARGBRowDownEven = ScaleARGBRowDownEven_SSE2;
  }
+#elif defined(HAS_SCALEARGBROWDOWNEVEN_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(height, 4) &&  // Width of dest.
+      IS_ALIGNED(src, 4)) {
+    ScaleARGBRowDownEven = ScaleARGBRowDownEven_NEON;
+  }
 #endif
-  int src_pixel_step = src_stride >> 2;
  for (int i = 0; i < width; ++i) {  // column of source to row of dest.
    ScaleARGBRowDownEven(src, 0, src_pixel_step, dst, height);
    dst += dst_stride;

--- a/source/row_any.cc
+++ b/source/row_any.cc
@@ -315,7 +315,8 @@ SplitUVRowANY(SplitUVRow_Any_AVX2, SplitUVRow_Unaligned_AVX2, SplitUVRow_C, 31)
 SplitUVRowANY(SplitUVRow_Any_NEON, SplitUVRow_Unaligned_NEON, SplitUVRow_C, 15)
 #endif
 #ifdef HAS_SPLITUVROW_MIPS_DSPR2
-SplitUVRowANY(SplitUVRow_Any_MIPS_DSPR2, SplitUVRow_Unaligned_MIPS_DSPR2, SplitUVRow_C, 15)
+SplitUVRowANY(SplitUVRow_Any_MIPS_DSPR2, SplitUVRow_Unaligned_MIPS_DSPR2,
+              SplitUVRow_C, 15)
 #endif
 #undef SplitUVRowANY

--- a/source/row_common.cc
+++ b/source/row_common.cc
@@ -722,7 +722,7 @@ static __inline void YuvPixel2(uint8 y, uint8 u, uint8 v,
  *r = Clip(static_cast<int32>((u * UR + v * VR) - (BR) + y1) >> 6);
 }
-#if defined(__ARM_NEON__)
+#if !defined(YUV_DISABLE_ASM) && (defined(__ARM_NEON__) || defined(LIBYUV_NEON))
 // C mimic assembly.
 // TODO(fbarchard): Remove subsampling from Neon.
 void I444ToARGBRow_C(const uint8* src_y,

--- a/source/scale.cc
+++ b/source/scale.cc
@@ -50,7 +50,7 @@ void SetUseReferenceImpl(bool use) {
 *
 */
-#if !defined(YUV_DISABLE_ASM) && defined(__ARM_NEON__)
+#if !defined(YUV_DISABLE_ASM) && (defined(__ARM_NEON__) || defined(LIBYUV_NEON))
 #define HAS_SCALEROWDOWN2_NEON
 // Note - not static due to reuse in convert for 444 to 420.
 void ScaleRowDown2_NEON(const uint8* src_ptr, ptrdiff_t /* src_stride */,

--- a/source/scale_argb.cc
+++ b/source/scale_argb.cc
--- a/source/scale_argb_neon.cc
+++ b/source/scale_argb_neon.cc
+/*
+ *  Copyright 2012 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+#include "libyuv/basic_types.h"
+#include "libyuv/row.h"
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+// This module is for GCC Neon
+#if !defined(YUV_DISABLE_ASM) && defined(__ARM_NEON__)
+/* Point-samples a row of ARGB: copies one 4-byte pixel from the source,
+ * advances the source pointer by src_stepx pixels, and repeats, packing the
+ * sampled pixels contiguously at dst_argb.  The unnamed ptrdiff_t parameter
+ * is ignored.  Each loop iteration gathers 4 pixels and stores 16 bytes.
+ * The counter is tested after the store, so the loop always runs at least
+ * once and a dst_width that is not a multiple of 4 is effectively rounded
+ * up.  NOTE(review): presumably callers only pass a positive dst_width that
+ * is a multiple of 4 -- confirm at the call sites.
+ * Reads 4 pixels at a time. */
+// Alignment requirement: src_argb 4 byte aligned.
+void ScaleARGBRowDownEven_NEON(const uint8* src_argb, ptrdiff_t,
+                               int src_stepx,
+                               uint8* dst_argb, int dst_width) {
+  asm volatile (
+    "mov        r12, %3, lsl #2                \n"  // r12 = src_stepx * 4: source step in bytes.
+    ".p2align  2                               \n"
+  "1:                                          \n"
+    "vld1.32    {d0[0]}, [%0], r12             \n"  // One 32 bit pixel per lane, then step source.
+    "vld1.32    {d0[1]}, [%0], r12             \n"
+    "vld1.32    {d1[0]}, [%0], r12             \n"
+    "vld1.32    {d1[1]}, [%0], r12             \n"
+    "subs       %2, #4                         \n"  // 4 pixels per loop.
+    "vst1.8     {q0}, [%1]!                    \n"  // Store 4 pixels (16 bytes), advance dest.
+    "bgt        1b                             \n"  // Loop while remaining width > 0.
+  : "+r"(src_argb),    // %0
+    "+r"(dst_argb),    // %1
+    "+r"(dst_width)    // %2
+  : "r"(src_stepx)     // %3
+  : "memory", "cc", "r12", "q0"
+  );
+}
+/* Box-filters a row of ARGB: every destination pixel is the rounded average
+ * ((sum + 2) >> 2 per channel) of a 2x2 block of source pixels, taken from
+ * the row at src_argb and the row src_stride bytes after it.  Successive
+ * blocks start src_stepx pixels apart.  Each loop iteration reads 4 blocks
+ * (16 source pixels, 8 bytes per load) and writes 4 destination pixels.
+ * The counter is tested after the store, so the loop always runs at least
+ * once.  NOTE(review): presumably callers only pass a positive dst_width
+ * that is a multiple of 4 -- confirm at the call sites. */
+// Alignment requirement: src_argb 4 byte aligned.
+void ScaleARGBRowDownEvenInt_NEON(const uint8* src_argb, ptrdiff_t src_stride,
+                                  int src_stepx,
+                                  uint8* dst_argb, int dst_width) {
+  asm volatile (
+    "mov       r12, %4, lsl #2                 \n"  // r12 = src_stepx * 4: block step in bytes.
+    "add       %1, %0                          \n"  // %1 = src_argb + src_stride: second row pointer.
+    ".p2align  2                               \n"
+  "1:                                          \n"
+    "vld1.8    {d0}, [%0], r12                 \n"  // Read 4 2x2 blocks -> 2x1
+    "vld1.8    {d1}, [%1], r12                 \n"
+    "vld1.8    {d2}, [%0], r12                 \n"
+    "vld1.8    {d3}, [%1], r12                 \n"
+    "vld1.8    {d4}, [%0], r12                 \n"
+    "vld1.8    {d5}, [%1], r12                 \n"
+    "vld1.8    {d6}, [%0], r12                 \n"
+    "vld1.8    {d7}, [%1], r12                 \n"
+    "vaddl.u8  q0, d0, d1                      \n"  // Add the two rows, widening to 16 bit.
+    "vaddl.u8  q1, d2, d3                      \n"
+    "vaddl.u8  q2, d4, d5                      \n"
+    "vaddl.u8  q3, d6, d7                      \n"
+    "vswp.8    d1, d2                          \n"  // ab_cd -> ac_bd
+    "vswp.8    d5, d6                          \n"  // ef_gh -> eg_fh
+    "vadd.u16  q0, q0, q1                      \n"  // (a+b)_(c+d)
+    "vadd.u16  q2, q2, q3                      \n"  // (e+f)_(g+h)
+    "vrshrn.u16 d0, q0, #2                     \n"  // first 2 pixels.
+    "vrshrn.u16 d1, q2, #2                     \n"  // next 2 pixels.
+    "subs       %3, #4                         \n"  // 4 pixels per loop.
+    "vst1.8     {q0}, [%2]!                    \n"  // Store 4 pixels (16 bytes), advance dest.
+    "bgt        1b                             \n"  // Loop while remaining width > 0.
+  : "+r"(src_argb),    // %0
+    "+r"(src_stride),  // %1
+    "+r"(dst_argb),    // %2
+    "+r"(dst_width)    // %3
+  : "r"(src_stepx)     // %4
+  : "memory", "cc", "r12", "q0", "q1", "q2", "q3"
+  );
+}
+#endif  // __ARM_NEON__
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif