On Neon remove aligned SplitUVRow

BUG=none
TEST=none
Review URL: https://webrtc-codereview.appspot.com/930020

git-svn-id: http://libyuv.googlecode.com/svn/trunk@493 16f28f9a-4ce2-e073-06de-1de4eb20be90

On Neon remove aligned SplitUVRow
BUG=none
TEST=none
Review URL: https://webrtc-codereview.appspot.com/930020

git-svn-id: http://libyuv.googlecode.com/svn/trunk@493 16f28f9a-4ce2-e073-06de-1de4eb20be90
4a86a836 · fbarchard@google.com · cb5262db · 4a86a836 · 4a86a836 · 4a86a836
Commit 4a86a836 authored Nov 16, 2012 by fbarchard@google.com
9 changed files
--- a/README.chromium
+++ b/README.chromium
 Name: libyuv
 URL: http://code.google.com/p/libyuv/
-Version: 492
+Version: 493
 License: BSD
 License File: LICENSE

--- a/include/libyuv/row.h
+++ b/include/libyuv/row.h
@@ -524,8 +524,6 @@ void SplitUVRow_Unaligned_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
                               int pix);
 void SplitUVRow_Unaligned_AVX2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
                               int pix);
-void SplitUVRow_Unaligned_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
-                               int pix);
 void SplitUVRow_Unaligned_MIPS_DSPR2(const uint8* src_uv, uint8* dst_u,
                                     uint8* dst_v, int pix);
 void SplitUVRow_Any_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
@@ -549,8 +547,6 @@ void MergeUVRow_Unaligned_SSE2(const uint8* src_u, const uint8* src_v,
                               uint8* dst_uv, int width);
 void MergeUVRow_Unaligned_AVX2(const uint8* src_u, const uint8* src_v,
                               uint8* dst_uv, int width);
-void MergeUVRow_Unaligned_NEON(const uint8* src_u, const uint8* src_v,
-                               uint8* dst_uv, int width);
 void MergeUVRow_Any_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
                         int width);
 void MergeUVRow_Any_AVX2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,

--- a/include/libyuv/version.h
+++ b/include/libyuv/version.h
@@ -11,6 +11,6 @@
 #ifndef INCLUDE_LIBYUV_VERSION_H_  // NOLINT
 #define INCLUDE_LIBYUV_VERSION_H_
-#define LIBYUV_VERSION 492
+#define LIBYUV_VERSION 493
 #endif  // INCLUDE_LIBYUV_VERSION_H_  NOLINT
--- a/source/convert.cc
+++ b/source/convert.cc
@@ -397,12 +397,7 @@ static int X420ToI420(const uint8* src_y,
  if (TestCpuFlag(kCpuHasNEON) && halfwidth >= 16) {
    SplitUVRow = SplitUVRow_Any_NEON;
    if (IS_ALIGNED(halfwidth, 16)) {
-      SplitUVRow = SplitUVRow_Unaligned_NEON;
+      SplitUVRow = SplitUVRow_NEON;
-      if (IS_ALIGNED(src_uv, 16) && IS_ALIGNED(src_stride_uv, 16) &&
-          IS_ALIGNED(dst_u, 16) && IS_ALIGNED(dst_stride_u, 16) &&
-          IS_ALIGNED(dst_v, 16) && IS_ALIGNED(dst_stride_v, 16)) {
-        SplitUVRow = SplitUVRow_NEON;
-      }
    }
  }
 #endif

--- a/source/convert_from.cc
+++ b/source/convert_from.cc
@@ -520,7 +520,7 @@ int I420ToNV12(const uint8* src_y, int src_stride_y,
  int halfwidth = (width + 1) >> 1;
  void (*MergeUVRow_)(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
-                  int width) = MergeUVRow_C;
+                      int width) = MergeUVRow_C;
 #if defined(HAS_MERGEUVROW_SSE2)
  if (TestCpuFlag(kCpuHasSSE2) && halfwidth >= 16) {
    MergeUVRow_ = MergeUVRow_Any_SSE2;
@@ -551,12 +551,7 @@ int I420ToNV12(const uint8* src_y, int src_stride_y,
  if (TestCpuFlag(kCpuHasNEON) && halfwidth >= 16) {
    MergeUVRow_ = MergeUVRow_Any_NEON;
    if (IS_ALIGNED(halfwidth, 16)) {
-      MergeUVRow_ = MergeUVRow_Unaligned_NEON;
+      MergeUVRow_ = MergeUVRow_NEON;
-      if (IS_ALIGNED(src_u, 16) && IS_ALIGNED(src_stride_u, 16) &&
-          IS_ALIGNED(src_v, 16) && IS_ALIGNED(src_stride_v, 16) &&
-          IS_ALIGNED(dst_uv, 16) && IS_ALIGNED(dst_stride_uv, 16)) {
-        MergeUVRow_ = MergeUVRow_NEON;
-      }
    }
  }
 #endif

--- a/source/convert_from_argb.cc
+++ b/source/convert_from_argb.cc
@@ -249,7 +249,7 @@ int ARGBToNV12(const uint8* src_argb, int src_stride_argb,
 #endif
  int halfwidth = (width + 1) >> 1;
  void (*MergeUVRow_)(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
-                  int width) = MergeUVRow_C;
+                      int width) = MergeUVRow_C;
 #if defined(HAS_MERGEUVROW_SSE2)
  if (TestCpuFlag(kCpuHasSSE2) && halfwidth >= 16) {
    MergeUVRow_ = MergeUVRow_Any_SSE2;
@@ -276,10 +276,7 @@ int ARGBToNV12(const uint8* src_argb, int src_stride_argb,
  if (TestCpuFlag(kCpuHasNEON) && halfwidth >= 16) {
    MergeUVRow_ = MergeUVRow_Any_NEON;
    if (IS_ALIGNED(halfwidth, 16)) {
-      MergeUVRow_ = MergeUVRow_Unaligned_NEON;
+      MergeUVRow_ = MergeUVRow_NEON;
-      if (IS_ALIGNED(dst_uv, 16) && IS_ALIGNED(dst_stride_uv, 16)) {
-        MergeUVRow_ = MergeUVRow_NEON;
-      }
    }
  }
 #endif
@@ -358,7 +355,7 @@ int ARGBToNV21(const uint8* src_argb, int src_stride_argb,
 #endif
  int halfwidth = (width + 1) >> 1;
  void (*MergeUVRow_)(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
-                  int width) = MergeUVRow_C;
+                      int width) = MergeUVRow_C;
 #if defined(HAS_MERGEUVROW_SSE2)
  if (TestCpuFlag(kCpuHasSSE2) && halfwidth >= 16) {
    MergeUVRow_ = MergeUVRow_Any_SSE2;
@@ -385,10 +382,7 @@ int ARGBToNV21(const uint8* src_argb, int src_stride_argb,
  if (TestCpuFlag(kCpuHasNEON) && halfwidth >= 16) {
    MergeUVRow_ = MergeUVRow_Any_NEON;
    if (IS_ALIGNED(halfwidth, 16)) {
-      MergeUVRow_ = MergeUVRow_Unaligned_NEON;
+      MergeUVRow_ = MergeUVRow_NEON;
-      if (IS_ALIGNED(dst_uv, 16) && IS_ALIGNED(dst_stride_uv, 16)) {
-        MergeUVRow_ = MergeUVRow_NEON;
-      }
    }
  }
 #endif

--- a/source/row_any.cc
+++ b/source/row_any.cc
@@ -312,7 +312,7 @@ SplitUVRowANY(SplitUVRow_Any_SSE2, SplitUVRow_Unaligned_SSE2, SplitUVRow_C, 15)
 SplitUVRowANY(SplitUVRow_Any_AVX2, SplitUVRow_Unaligned_AVX2, SplitUVRow_C, 31)
 #endif
 #ifdef HAS_SPLITUVROW_NEON
-SplitUVRowANY(SplitUVRow_Any_NEON, SplitUVRow_Unaligned_NEON, SplitUVRow_C, 15)
+SplitUVRowANY(SplitUVRow_Any_NEON, SplitUVRow_NEON, SplitUVRow_C, 15)
 #endif
 #ifdef HAS_SPLITUVROW_MIPS_DSPR2
 SplitUVRowANY(SplitUVRow_Any_MIPS_DSPR2, SplitUVRow_Unaligned_MIPS_DSPR2,
@@ -338,7 +338,7 @@ MergeUVRow_ANY(MergeUVRow_Any_SSE2, MergeUVRow_Unaligned_SSE2, MergeUVRow_C, 15)
 MergeUVRow_ANY(MergeUVRow_Any_AVX2, MergeUVRow_Unaligned_AVX2, MergeUVRow_C, 31)
 #endif
 #ifdef HAS_MERGEUVROW_NEON
-MergeUVRow_ANY(MergeUVRow_Any_NEON, MergeUVRow_Unaligned_NEON, MergeUVRow_C, 15)
+MergeUVRow_ANY(MergeUVRow_Any_NEON, MergeUVRow_NEON, MergeUVRow_C, 15)
 #endif
 #undef MergeUVRow_ANY

--- a/source/row_neon.cc
+++ b/source/row_neon.cc
@@ -748,33 +748,11 @@ void UYVYToARGBRow_NEON(const uint8* src_uyvy,
 #endif  // HAS_UYVYTOARGBROW_NEON
 #ifdef HAS_SPLITUVROW_NEON
-// Reads 16 pairs of UV and write even values to dst_u and odd to dst_v
+// Reads 16 pairs of UV and write even values to dst_u and odd to dst_v.
-// Alignment requirement: 16 bytes for pointers, and multiple of 16 pixels.
 void SplitUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
                     int width) {
  asm volatile (
    ".p2align  2                               \n"
-  "1:                                          \n"
-    "vld2.u8    {q0, q1}, [%0:128]!            \n"  // load 16 pairs of UV
-    "subs       %3, %3, #16                    \n"  // 16 processed per loop
-    "vst1.u8    {q0}, [%1:128]!                \n"  // store U
-    "vst1.u8    {q1}, [%2:128]!                \n"  // store V
-    "bgt        1b                             \n"
-    : "+r"(src_uv),  // %0
-      "+r"(dst_u),   // %1
-      "+r"(dst_v),   // %2
-      "+r"(width)    // %3  // Output registers
-    :                       // Input registers
-    : "memory", "cc", "q0", "q1"  // Clobber List
-  );
-}
-// Reads 16 pairs of UV and write even values to dst_u and odd to dst_v
-// Alignment requirement: Multiple of 16 pixels, pointers unaligned.
-void SplitUVRow_Unaligned_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
-                               int width) {
-  asm volatile (
-    ".p2align  2                               \n"
  "1:                                          \n"
    "vld2.u8    {q0, q1}, [%0]!                \n"  // load 16 pairs of UV
    "subs       %3, %3, #16                    \n"  // 16 processed per loop
@@ -793,32 +771,10 @@ void SplitUVRow_Unaligned_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
 #ifdef HAS_MERGEUVROW_NEON
 // Reads 16 U's and V's and writes out 16 pairs of UV.
-// Alignment requirement: 16 bytes for pointers, and multiple of 16 pixels.
 void MergeUVRow_NEON(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
                     int width) {
  asm volatile (
    ".p2align  2                               \n"
-  "1:                                          \n"
-    "vld1.u8    {q0}, [%0:128]!                \n"  // load U
-    "vld1.u8    {q1}, [%1:128]!                \n"  // load V
-    "subs       %3, %3, #16                    \n"  // 16 processed per loop
-    "vst2.u8    {q0, q1}, [%2:128]!            \n"  // store 16 pairs of UV
-    "bgt        1b                             \n"
-    :
-      "+r"(src_u),   // %0
-      "+r"(src_v),   // %1
-      "+r"(dst_uv),  // %2
-      "+r"(width)    // %3  // Output registers
-    :                       // Input registers
-    : "memory", "cc", "q0", "q1"  // Clobber List
-  );
-}
-// Reads 16 U's and V's and writes out 16 pairs of UV.
-void MergeUVRow_Unaligned_NEON(const uint8* src_u, const uint8* src_v,
-                            uint8* dst_uv, int width) {
-  asm volatile (
-    ".p2align  2                               \n"
  "1:                                          \n"
    "vld1.u8    {q0}, [%0]!                    \n"  // load U
    "vld1.u8    {q1}, [%1]!                    \n"  // load V

--- a/source/scale_argb.cc
+++ b/source/scale_argb.cc
@@ -1017,7 +1017,8 @@ static void ScaleARGBSimple(int src_width, int src_height,
  int x = (dx >= 65536) ? ((dx >> 1) - 32768) : (dx >> 1);
  int y = (dy >= 65536) ? ((dy >> 1) - 32768) : (dy >> 1);
  for (int i = 0; i < dst_height; ++i) {
-    ScaleARGBCols(dst_argb, src_argb + (y >> 16) * src_stride, dst_width, x, dx);
+    ScaleARGBCols(dst_argb, src_argb + (y >> 16) * src_stride, dst_width, x,
+                  dx);
    dst_argb += dst_stride;
    y += dy;
  }