Commit 0acc6771 authored by Frank Barchard

clang format / lint cleanup for arm scale functions

TBR=kjellander@chromium.org
BUG=libyuv:725
TEST=lint

Change-Id: I76f777427f9b1458faba12796fb0011d8e3228d5
Reviewed-on: https://chromium-review.googlesource.com/646586Reviewed-by: 's avatarCheng Wang <wangcheng@google.com>
parent a826dd71
Name: libyuv Name: libyuv
URL: http://code.google.com/p/libyuv/ URL: http://code.google.com/p/libyuv/
Version: 1666 Version: 1667
License: BSD License: BSD
License File: LICENSE License File: LICENSE
......
...@@ -11,6 +11,6 @@ ...@@ -11,6 +11,6 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_ #ifndef INCLUDE_LIBYUV_VERSION_H_
#define INCLUDE_LIBYUV_VERSION_H_ #define INCLUDE_LIBYUV_VERSION_H_
#define LIBYUV_VERSION 1666 #define LIBYUV_VERSION 1667
#endif // INCLUDE_LIBYUV_VERSION_H_ #endif // INCLUDE_LIBYUV_VERSION_H_
...@@ -535,15 +535,13 @@ void ScaleAddRows_NEON(const uint8* src_ptr, ...@@ -535,15 +535,13 @@ void ScaleAddRows_NEON(const uint8* src_ptr,
); );
} }
// clang-format off
// TODO(Yang Zhang): Investigate less load instructions for // TODO(Yang Zhang): Investigate less load instructions for
// the x/dx stepping // the x/dx stepping
#define LOAD2_DATA8_LANE(n) \ #define LOAD2_DATA8_LANE(n) \
"lsr %5, %3, #16 \n" \ "lsr %5, %3, #16 \n" \
"add %6, %1, %5 \n" \ "add %6, %1, %5 \n" \
"add %3, %3, %4 \n" \ "add %3, %3, %4 \n" \
"vld2.8 {d6["#n"], d7["#n"]}, [%6] \n" "vld2.8 {d6[" #n "], d7[" #n "]}, [%6] \n"
// clang-format on
// The NEON version mimics this formula (from row_common.cc): // The NEON version mimics this formula (from row_common.cc):
// #define BLENDER(a, b, f) (uint8)((int)(a) + // #define BLENDER(a, b, f) (uint8)((int)(a) +
...@@ -719,7 +717,6 @@ void ScaleARGBRowDown2_NEON(const uint8* src_ptr, ...@@ -719,7 +717,6 @@ void ScaleARGBRowDown2_NEON(const uint8* src_ptr,
); );
} }
// 46: f964 018d vld4.32 {d16,d18,d20,d22}, [r4]! // 46: f964 018d vld4.32 {d16,d18,d20,d22}, [r4]!
// 4a: 3e04 subs r6, #4 // 4a: 3e04 subs r6, #4
// 4c: f964 118d vld4.32 {d17,d19,d21,d23}, [r4]! // 4c: f964 118d vld4.32 {d17,d19,d21,d23}, [r4]!
...@@ -727,7 +724,6 @@ void ScaleARGBRowDown2_NEON(const uint8* src_ptr, ...@@ -727,7 +724,6 @@ void ScaleARGBRowDown2_NEON(const uint8* src_ptr,
// 54: f942 038d vst2.32 {d16-d19}, [r2]! // 54: f942 038d vst2.32 {d16-d19}, [r2]!
// 58: d1f5 bne.n 46 <ScaleARGBRowDown2_C+0x46> // 58: d1f5 bne.n 46 <ScaleARGBRowDown2_C+0x46>
void ScaleARGBRowDown2Linear_NEON(const uint8* src_argb, void ScaleARGBRowDown2Linear_NEON(const uint8* src_argb,
ptrdiff_t src_stride, ptrdiff_t src_stride,
uint8* dst_argb, uint8* dst_argb,
...@@ -760,22 +756,18 @@ void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr, ...@@ -760,22 +756,18 @@ void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr,
"1: \n" "1: \n"
"vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
"vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB
// pixels.
"subs %3, %3, #8 \n" // 8 processed per loop. "subs %3, %3, #8 \n" // 8 processed per loop.
"vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts. "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts.
"vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts. "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
"vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts. "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts.
"vpaddl.u8 q3, q3 \n" // A 16 bytes -> 8 shorts. "vpaddl.u8 q3, q3 \n" // A 16 bytes -> 8 shorts.
"vld4.8 {d16, d18, d20, d22}, [%1]! \n" // load 8 more ARGB "vld4.8 {d16, d18, d20, d22}, [%1]! \n" // load 8 more ARGB
// pixels.
"vld4.8 {d17, d19, d21, d23}, [%1]! \n" // load last 8 ARGB "vld4.8 {d17, d19, d21, d23}, [%1]! \n" // load last 8 ARGB
// pixels.
"vpadal.u8 q0, q8 \n" // B 16 bytes -> 8 shorts. "vpadal.u8 q0, q8 \n" // B 16 bytes -> 8 shorts.
"vpadal.u8 q1, q9 \n" // G 16 bytes -> 8 shorts. "vpadal.u8 q1, q9 \n" // G 16 bytes -> 8 shorts.
"vpadal.u8 q2, q10 \n" // R 16 bytes -> 8 shorts. "vpadal.u8 q2, q10 \n" // R 16 bytes -> 8 shorts.
"vpadal.u8 q3, q11 \n" // A 16 bytes -> 8 shorts. "vpadal.u8 q3, q11 \n" // A 16 bytes -> 8 shorts.
"vrshrn.u16 d0, q0, #2 \n" // downshift, round and "vrshrn.u16 d0, q0, #2 \n" // round and pack to bytes
// pack
"vrshrn.u16 d1, q1, #2 \n" "vrshrn.u16 d1, q1, #2 \n"
"vrshrn.u16 d2, q2, #2 \n" "vrshrn.u16 d2, q2, #2 \n"
"vrshrn.u16 d3, q3, #2 \n" "vrshrn.u16 d3, q3, #2 \n"
...@@ -825,8 +817,7 @@ void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb, ...@@ -825,8 +817,7 @@ void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb,
"mov r12, %4, lsl #2 \n" "mov r12, %4, lsl #2 \n"
"add %1, %1, %0 \n" "add %1, %1, %0 \n"
"1: \n" "1: \n"
"vld1.8 {d0}, [%0], r12 \n" // Read 4 2x2 blocks -> "vld1.8 {d0}, [%0], r12 \n" // 4 2x2 blocks -> 2x1
// 2x1
"vld1.8 {d1}, [%1], r12 \n" "vld1.8 {d1}, [%1], r12 \n"
"vld1.8 {d2}, [%0], r12 \n" "vld1.8 {d2}, [%0], r12 \n"
"vld1.8 {d3}, [%1], r12 \n" "vld1.8 {d3}, [%1], r12 \n"
...@@ -855,7 +846,6 @@ void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb, ...@@ -855,7 +846,6 @@ void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb,
: "memory", "cc", "r12", "q0", "q1", "q2", "q3"); : "memory", "cc", "r12", "q0", "q1", "q2", "q3");
} }
// clang-format off
// TODO(Yang Zhang): Investigate less load instructions for // TODO(Yang Zhang): Investigate less load instructions for
// the x/dx stepping // the x/dx stepping
#define LOAD1_DATA32_LANE(dn, n) \ #define LOAD1_DATA32_LANE(dn, n) \
...@@ -863,7 +853,6 @@ void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb, ...@@ -863,7 +853,6 @@ void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb,
"add %6, %1, %5, lsl #2 \n" \ "add %6, %1, %5, lsl #2 \n" \
"add %3, %3, %4 \n" \ "add %3, %3, %4 \n" \
"vld1.32 {" #dn "[" #n "]}, [%6] \n" "vld1.32 {" #dn "[" #n "]}, [%6] \n"
// clang-format on
void ScaleARGBCols_NEON(uint8* dst_argb, void ScaleARGBCols_NEON(uint8* dst_argb,
const uint8* src_argb, const uint8* src_argb,
...@@ -873,14 +862,19 @@ void ScaleARGBCols_NEON(uint8* dst_argb, ...@@ -873,14 +862,19 @@ void ScaleARGBCols_NEON(uint8* dst_argb,
int tmp; int tmp;
const uint8* src_tmp = src_argb; const uint8* src_tmp = src_argb;
asm volatile( asm volatile(
"1: \n" LOAD1_DATA32_LANE( "1: \n"
d0, 0) LOAD1_DATA32_LANE(d0, 1) LOAD1_DATA32_LANE(d1, 0) // clang-format off
LOAD1_DATA32_LANE(d1, 1) LOAD1_DATA32_LANE(d2, 0) LOAD1_DATA32_LANE( LOAD1_DATA32_LANE(d0, 0)
d2, 1) LOAD1_DATA32_LANE(d3, 0) LOAD1_DATA32_LANE(d3, 1) LOAD1_DATA32_LANE(d0, 1)
LOAD1_DATA32_LANE(d1, 0)
LOAD1_DATA32_LANE(d1, 1)
LOAD1_DATA32_LANE(d2, 0)
LOAD1_DATA32_LANE(d2, 1)
LOAD1_DATA32_LANE(d3, 0)
LOAD1_DATA32_LANE(d3, 1)
// clang-format on
"vst1.32 {q0, q1}, [%0]! \n" // store pixels "vst1.32 {q0, q1}, [%0]! \n" // store pixels
"subs %2, %2, #8 \n" // 8 processed per "subs %2, %2, #8 \n" // 8 processed per loop
// loop
"bgt 1b \n" "bgt 1b \n"
: "+r"(dst_argb), // %0 : "+r"(dst_argb), // %0
"+r"(src_argb), // %1 "+r"(src_argb), // %1
...@@ -895,7 +889,6 @@ void ScaleARGBCols_NEON(uint8* dst_argb, ...@@ -895,7 +889,6 @@ void ScaleARGBCols_NEON(uint8* dst_argb,
#undef LOAD1_DATA32_LANE #undef LOAD1_DATA32_LANE
// clang-format off
// TODO(Yang Zhang): Investigate less load instructions for // TODO(Yang Zhang): Investigate less load instructions for
// the x/dx stepping // the x/dx stepping
#define LOAD2_DATA32_LANE(dn1, dn2, n) \ #define LOAD2_DATA32_LANE(dn1, dn2, n) \
...@@ -903,7 +896,6 @@ void ScaleARGBCols_NEON(uint8* dst_argb, ...@@ -903,7 +896,6 @@ void ScaleARGBCols_NEON(uint8* dst_argb,
"add %6, %1, %5, lsl #2 \n" \ "add %6, %1, %5, lsl #2 \n" \
"add %3, %3, %4 \n" \ "add %3, %3, %4 \n" \
"vld2.32 {" #dn1 "[" #n "], " #dn2 "[" #n "]}, [%6] \n" "vld2.32 {" #dn1 "[" #n "], " #dn2 "[" #n "]}, [%6] \n"
// clang-format on
void ScaleARGBFilterCols_NEON(uint8* dst_argb, void ScaleARGBFilterCols_NEON(uint8* dst_argb,
const uint8* src_argb, const uint8* src_argb,
......
...@@ -51,8 +51,8 @@ void ScaleRowDown2Linear_NEON(const uint8* src_ptr, ...@@ -51,8 +51,8 @@ void ScaleRowDown2Linear_NEON(const uint8* src_ptr,
"1: \n" "1: \n"
// load even pixels into v0, odd into v1 // load even pixels into v0, odd into v1
"ld2 {v0.16b,v1.16b}, [%0], #32 \n" "ld2 {v0.16b,v1.16b}, [%0], #32 \n"
"urhadd v0.16b, v0.16b, v1.16b \n" // rounding half add
"subs %w2, %w2, #16 \n" // 16 processed per loop "subs %w2, %w2, #16 \n" // 16 processed per loop
"urhadd v0.16b, v0.16b, v1.16b \n" // rounding half add
"st1 {v0.16b}, [%1], #16 \n" "st1 {v0.16b}, [%1], #16 \n"
"b.gt 1b \n" "b.gt 1b \n"
: "+r"(src_ptr), // %0 : "+r"(src_ptr), // %0
...@@ -156,8 +156,7 @@ void ScaleRowDown34_NEON(const uint8* src_ptr, ...@@ -156,8 +156,7 @@ void ScaleRowDown34_NEON(const uint8* src_ptr,
"1: \n" "1: \n"
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0
"subs %w2, %w2, #24 \n" "subs %w2, %w2, #24 \n"
"orr v2.16b, v3.16b, v3.16b \n" // order v0, v1, "orr v2.16b, v3.16b, v3.16b \n" // order v0,v1,v2
// v2
"st3 {v0.8b,v1.8b,v2.8b}, [%1], #24 \n" "st3 {v0.8b,v1.8b,v2.8b}, [%1], #24 \n"
"b.gt 1b \n" "b.gt 1b \n"
: "+r"(src_ptr), // %0 : "+r"(src_ptr), // %0
...@@ -550,7 +549,6 @@ void ScaleAddRows_NEON(const uint8* src_ptr, ...@@ -550,7 +549,6 @@ void ScaleAddRows_NEON(const uint8* src_ptr,
); );
} }
// clang-format off
// TODO(Yang Zhang): Investigate less load instructions for // TODO(Yang Zhang): Investigate less load instructions for
// the x/dx stepping // the x/dx stepping
#define LOAD2_DATA8_LANE(n) \ #define LOAD2_DATA8_LANE(n) \
...@@ -558,7 +556,6 @@ void ScaleAddRows_NEON(const uint8* src_ptr, ...@@ -558,7 +556,6 @@ void ScaleAddRows_NEON(const uint8* src_ptr,
"add %6, %1, %5 \n" \ "add %6, %1, %5 \n" \
"add %3, %3, %4 \n" \ "add %3, %3, %4 \n" \
"ld2 {v4.b, v5.b}[" #n "], [%6] \n" "ld2 {v4.b, v5.b}[" #n "], [%6] \n"
// clang-format on
// The NEON version mimics this formula (from row_common.cc): // The NEON version mimics this formula (from row_common.cc):
// #define BLENDER(a, b, f) (uint8)((int)(a) + // #define BLENDER(a, b, f) (uint8)((int)(a) +
...@@ -572,8 +569,8 @@ void ScaleFilterCols_NEON(uint8* dst_ptr, ...@@ -572,8 +569,8 @@ void ScaleFilterCols_NEON(uint8* dst_ptr,
int dx_offset[4] = {0, 1, 2, 3}; int dx_offset[4] = {0, 1, 2, 3};
int* tmp = dx_offset; int* tmp = dx_offset;
const uint8* src_tmp = src_ptr; const uint8* src_tmp = src_ptr;
int64 x64 = (int64)x; int64 x64 = (int64)x; // NOLINT
int64 dx64 = (int64)dx; int64 dx64 = (int64)dx; // NOLINT
asm volatile ( asm volatile (
"dup v0.4s, %w3 \n" // x "dup v0.4s, %w3 \n" // x
"dup v1.4s, %w4 \n" // dx "dup v1.4s, %w4 \n" // dx
...@@ -769,15 +766,12 @@ void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr, ...@@ -769,15 +766,12 @@ void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr,
"add %1, %1, %0 \n" "add %1, %1, %0 \n"
"1: \n" "1: \n"
"ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 8 ARGB "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 8 ARGB
// pixels.
"subs %w3, %w3, #8 \n" // 8 processed per loop. "subs %w3, %w3, #8 \n" // 8 processed per loop.
"uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts. "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts.
"uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts. "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
"uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts. "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts.
"uaddlp v3.8h, v3.16b \n" // A 16 bytes -> 8 shorts. "uaddlp v3.8h, v3.16b \n" // A 16 bytes -> 8 shorts.
"ld4 {v16.16b,v17.16b,v18.16b,v19.16b}, [%1], #64 \n" // load 8 "ld4 {v16.16b,v17.16b,v18.16b,v19.16b}, [%1], #64 \n" // load 8
// more ARGB
// pixels.
"uadalp v0.8h, v16.16b \n" // B 16 bytes -> 8 shorts. "uadalp v0.8h, v16.16b \n" // B 16 bytes -> 8 shorts.
"uadalp v1.8h, v17.16b \n" // G 16 bytes -> 8 shorts. "uadalp v1.8h, v17.16b \n" // G 16 bytes -> 8 shorts.
"uadalp v2.8h, v18.16b \n" // R 16 bytes -> 8 shorts. "uadalp v2.8h, v18.16b \n" // R 16 bytes -> 8 shorts.
...@@ -867,7 +861,6 @@ void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb, ...@@ -867,7 +861,6 @@ void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb,
: "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"); : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16");
} }
// clang-format off
// TODO(Yang Zhang): Investigate less load instructions for // TODO(Yang Zhang): Investigate less load instructions for
// the x/dx stepping // the x/dx stepping
#define LOAD1_DATA32_LANE(vn, n) \ #define LOAD1_DATA32_LANE(vn, n) \
...@@ -875,7 +868,6 @@ void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb, ...@@ -875,7 +868,6 @@ void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb,
"add %6, %1, %5, lsl #2 \n" \ "add %6, %1, %5, lsl #2 \n" \
"add %3, %3, %4 \n" \ "add %3, %3, %4 \n" \
"ld1 {" #vn ".s}[" #n "], [%6] \n" "ld1 {" #vn ".s}[" #n "], [%6] \n"
// clang-format on
void ScaleARGBCols_NEON(uint8* dst_argb, void ScaleARGBCols_NEON(uint8* dst_argb,
const uint8* src_argb, const uint8* src_argb,
...@@ -883,15 +875,21 @@ void ScaleARGBCols_NEON(uint8* dst_argb, ...@@ -883,15 +875,21 @@ void ScaleARGBCols_NEON(uint8* dst_argb,
int x, int x,
int dx) { int dx) {
const uint8* src_tmp = src_argb; const uint8* src_tmp = src_argb;
int64 x64 = (int64)x; int64 x64 = (int64)x; // NOLINT
int64 dx64 = (int64)dx; int64 dx64 = (int64)dx; // NOLINT
int64 tmp64; int64 tmp64;
asm volatile( asm volatile(
"1: \n" LOAD1_DATA32_LANE( "1: \n"
v0, 0) LOAD1_DATA32_LANE(v0, 1) LOAD1_DATA32_LANE(v0, 2) // clang-format off
LOAD1_DATA32_LANE(v0, 3) LOAD1_DATA32_LANE(v1, 0) LOAD1_DATA32_LANE( LOAD1_DATA32_LANE(v0, 0)
v1, 1) LOAD1_DATA32_LANE(v1, 2) LOAD1_DATA32_LANE(v1, 3) LOAD1_DATA32_LANE(v0, 1)
LOAD1_DATA32_LANE(v0, 2)
LOAD1_DATA32_LANE(v0, 3)
LOAD1_DATA32_LANE(v1, 0)
LOAD1_DATA32_LANE(v1, 1)
LOAD1_DATA32_LANE(v1, 2)
LOAD1_DATA32_LANE(v1, 3)
// clang-format on
"st1 {v0.4s, v1.4s}, [%0], #32 \n" // store pixels "st1 {v0.4s, v1.4s}, [%0], #32 \n" // store pixels
"subs %w2, %w2, #8 \n" // 8 processed per "subs %w2, %w2, #8 \n" // 8 processed per
// loop // loop
...@@ -909,7 +907,6 @@ void ScaleARGBCols_NEON(uint8* dst_argb, ...@@ -909,7 +907,6 @@ void ScaleARGBCols_NEON(uint8* dst_argb,
#undef LOAD1_DATA32_LANE #undef LOAD1_DATA32_LANE
// clang-format off
// TODO(Yang Zhang): Investigate less load instructions for // TODO(Yang Zhang): Investigate less load instructions for
// the x/dx stepping // the x/dx stepping
#define LOAD2_DATA32_LANE(vn1, vn2, n) \ #define LOAD2_DATA32_LANE(vn1, vn2, n) \
...@@ -917,7 +914,6 @@ void ScaleARGBCols_NEON(uint8* dst_argb, ...@@ -917,7 +914,6 @@ void ScaleARGBCols_NEON(uint8* dst_argb,
"add %6, %1, %5, lsl #2 \n" \ "add %6, %1, %5, lsl #2 \n" \
"add %3, %3, %4 \n" \ "add %3, %3, %4 \n" \
"ld2 {" #vn1 ".s, " #vn2 ".s}[" #n "], [%6] \n" "ld2 {" #vn1 ".s, " #vn2 ".s}[" #n "], [%6] \n"
// clang-format on
void ScaleARGBFilterCols_NEON(uint8* dst_argb, void ScaleARGBFilterCols_NEON(uint8* dst_argb,
const uint8* src_argb, const uint8* src_argb,
...@@ -927,8 +923,8 @@ void ScaleARGBFilterCols_NEON(uint8* dst_argb, ...@@ -927,8 +923,8 @@ void ScaleARGBFilterCols_NEON(uint8* dst_argb,
int dx_offset[4] = {0, 1, 2, 3}; int dx_offset[4] = {0, 1, 2, 3};
int* tmp = dx_offset; int* tmp = dx_offset;
const uint8* src_tmp = src_argb; const uint8* src_tmp = src_argb;
int64 x64 = (int64)x; int64 x64 = (int64)x; // NOLINT
int64 dx64 = (int64)dx; int64 dx64 = (int64)dx; // NOLINT
asm volatile ( asm volatile (
"dup v0.4s, %w3 \n" // x "dup v0.4s, %w3 \n" // x
"dup v1.4s, %w4 \n" // dx "dup v1.4s, %w4 \n" // dx
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment