Commit 0acc6771 authored by Frank Barchard

clang format / lint cleanup for arm scale functions

TBR=kjellander@chromium.org
BUG=libyuv:725
TEST=lint

Change-Id: I76f777427f9b1458faba12796fb0011d8e3228d5
Reviewed-on: https://chromium-review.googlesource.com/646586
Reviewed-by: Cheng Wang <wangcheng@google.com>
parent a826dd71
Name: libyuv
URL: http://code.google.com/p/libyuv/
Version: 1666
Version: 1667
License: BSD
License File: LICENSE
......
......@@ -11,6 +11,6 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_
#define INCLUDE_LIBYUV_VERSION_H_
#define LIBYUV_VERSION 1666
#define LIBYUV_VERSION 1667
#endif // INCLUDE_LIBYUV_VERSION_H_
......@@ -535,15 +535,13 @@ void ScaleAddRows_NEON(const uint8* src_ptr,
);
}
// clang-format off
// TODO(Yang Zhang): Investigate less load instructions for
// the x/dx stepping
#define LOAD2_DATA8_LANE(n) \
"lsr %5, %3, #16 \n" \
"add %6, %1, %5 \n" \
"add %3, %3, %4 \n" \
"vld2.8 {d6["#n"], d7["#n"]}, [%6] \n"
// clang-format on
"vld2.8 {d6[" #n "], d7[" #n "]}, [%6] \n"
// The NEON version mimics this formula (from row_common.cc):
// #define BLENDER(a, b, f) (uint8)((int)(a) +
......@@ -719,7 +717,6 @@ void ScaleARGBRowDown2_NEON(const uint8* src_ptr,
);
}
// 46: f964 018d vld4.32 {d16,d18,d20,d22}, [r4]!
// 4a: 3e04 subs r6, #4
// 4c: f964 118d vld4.32 {d17,d19,d21,d23}, [r4]!
......@@ -727,7 +724,6 @@ void ScaleARGBRowDown2_NEON(const uint8* src_ptr,
// 54: f942 038d vst2.32 {d16-d19}, [r2]!
// 58: d1f5 bne.n 46 <ScaleARGBRowDown2_C+0x46>
void ScaleARGBRowDown2Linear_NEON(const uint8* src_argb,
ptrdiff_t src_stride,
uint8* dst_argb,
......@@ -760,22 +756,18 @@ void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr,
"1: \n"
"vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
"vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB
// pixels.
"subs %3, %3, #8 \n" // 8 processed per loop.
"vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts.
"vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
"vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts.
"vpaddl.u8 q3, q3 \n" // A 16 bytes -> 8 shorts.
"vld4.8 {d16, d18, d20, d22}, [%1]! \n" // load 8 more ARGB
// pixels.
"vld4.8 {d17, d19, d21, d23}, [%1]! \n" // load last 8 ARGB
// pixels.
"vpadal.u8 q0, q8 \n" // B 16 bytes -> 8 shorts.
"vpadal.u8 q1, q9 \n" // G 16 bytes -> 8 shorts.
"vpadal.u8 q2, q10 \n" // R 16 bytes -> 8 shorts.
"vpadal.u8 q3, q11 \n" // A 16 bytes -> 8 shorts.
"vrshrn.u16 d0, q0, #2 \n" // downshift, round and
// pack
"vrshrn.u16 d0, q0, #2 \n" // round and pack to bytes
"vrshrn.u16 d1, q1, #2 \n"
"vrshrn.u16 d2, q2, #2 \n"
"vrshrn.u16 d3, q3, #2 \n"
......@@ -825,8 +817,7 @@ void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb,
"mov r12, %4, lsl #2 \n"
"add %1, %1, %0 \n"
"1: \n"
"vld1.8 {d0}, [%0], r12 \n" // Read 4 2x2 blocks ->
// 2x1
"vld1.8 {d0}, [%0], r12 \n" // 4 2x2 blocks -> 2x1
"vld1.8 {d1}, [%1], r12 \n"
"vld1.8 {d2}, [%0], r12 \n"
"vld1.8 {d3}, [%1], r12 \n"
......@@ -855,7 +846,6 @@ void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb,
: "memory", "cc", "r12", "q0", "q1", "q2", "q3");
}
// clang-format off
// TODO(Yang Zhang): Investigate less load instructions for
// the x/dx stepping
#define LOAD1_DATA32_LANE(dn, n) \
......@@ -863,7 +853,6 @@ void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb,
"add %6, %1, %5, lsl #2 \n" \
"add %3, %3, %4 \n" \
"vld1.32 {" #dn "[" #n "]}, [%6] \n"
// clang-format on
void ScaleARGBCols_NEON(uint8* dst_argb,
const uint8* src_argb,
......@@ -873,14 +862,19 @@ void ScaleARGBCols_NEON(uint8* dst_argb,
int tmp;
const uint8* src_tmp = src_argb;
asm volatile(
"1: \n" LOAD1_DATA32_LANE(
d0, 0) LOAD1_DATA32_LANE(d0, 1) LOAD1_DATA32_LANE(d1, 0)
LOAD1_DATA32_LANE(d1, 1) LOAD1_DATA32_LANE(d2, 0) LOAD1_DATA32_LANE(
d2, 1) LOAD1_DATA32_LANE(d3, 0) LOAD1_DATA32_LANE(d3, 1)
"1: \n"
// clang-format off
LOAD1_DATA32_LANE(d0, 0)
LOAD1_DATA32_LANE(d0, 1)
LOAD1_DATA32_LANE(d1, 0)
LOAD1_DATA32_LANE(d1, 1)
LOAD1_DATA32_LANE(d2, 0)
LOAD1_DATA32_LANE(d2, 1)
LOAD1_DATA32_LANE(d3, 0)
LOAD1_DATA32_LANE(d3, 1)
// clang-format on
"vst1.32 {q0, q1}, [%0]! \n" // store pixels
"subs %2, %2, #8 \n" // 8 processed per
// loop
"subs %2, %2, #8 \n" // 8 processed per loop
"bgt 1b \n"
: "+r"(dst_argb), // %0
"+r"(src_argb), // %1
......@@ -895,7 +889,6 @@ void ScaleARGBCols_NEON(uint8* dst_argb,
#undef LOAD1_DATA32_LANE
// clang-format off
// TODO(Yang Zhang): Investigate less load instructions for
// the x/dx stepping
#define LOAD2_DATA32_LANE(dn1, dn2, n) \
......@@ -903,7 +896,6 @@ void ScaleARGBCols_NEON(uint8* dst_argb,
"add %6, %1, %5, lsl #2 \n" \
"add %3, %3, %4 \n" \
"vld2.32 {" #dn1 "[" #n "], " #dn2 "[" #n "]}, [%6] \n"
// clang-format on
void ScaleARGBFilterCols_NEON(uint8* dst_argb,
const uint8* src_argb,
......
......@@ -51,8 +51,8 @@ void ScaleRowDown2Linear_NEON(const uint8* src_ptr,
"1: \n"
// load even pixels into v0, odd into v1
"ld2 {v0.16b,v1.16b}, [%0], #32 \n"
"urhadd v0.16b, v0.16b, v1.16b \n" // rounding half add
"subs %w2, %w2, #16 \n" // 16 processed per loop
"urhadd v0.16b, v0.16b, v1.16b \n" // rounding half add
"st1 {v0.16b}, [%1], #16 \n"
"b.gt 1b \n"
: "+r"(src_ptr), // %0
......@@ -156,8 +156,7 @@ void ScaleRowDown34_NEON(const uint8* src_ptr,
"1: \n"
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0
"subs %w2, %w2, #24 \n"
"orr v2.16b, v3.16b, v3.16b \n" // order v0, v1,
// v2
"orr v2.16b, v3.16b, v3.16b \n" // order v0,v1,v2
"st3 {v0.8b,v1.8b,v2.8b}, [%1], #24 \n"
"b.gt 1b \n"
: "+r"(src_ptr), // %0
......@@ -550,7 +549,6 @@ void ScaleAddRows_NEON(const uint8* src_ptr,
);
}
// clang-format off
// TODO(Yang Zhang): Investigate less load instructions for
// the x/dx stepping
#define LOAD2_DATA8_LANE(n) \
......@@ -558,7 +556,6 @@ void ScaleAddRows_NEON(const uint8* src_ptr,
"add %6, %1, %5 \n" \
"add %3, %3, %4 \n" \
"ld2 {v4.b, v5.b}[" #n "], [%6] \n"
// clang-format on
// The NEON version mimics this formula (from row_common.cc):
// #define BLENDER(a, b, f) (uint8)((int)(a) +
......@@ -572,8 +569,8 @@ void ScaleFilterCols_NEON(uint8* dst_ptr,
int dx_offset[4] = {0, 1, 2, 3};
int* tmp = dx_offset;
const uint8* src_tmp = src_ptr;
int64 x64 = (int64)x;
int64 dx64 = (int64)dx;
int64 x64 = (int64)x; // NOLINT
int64 dx64 = (int64)dx; // NOLINT
asm volatile (
"dup v0.4s, %w3 \n" // x
"dup v1.4s, %w4 \n" // dx
......@@ -769,15 +766,12 @@ void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr,
"add %1, %1, %0 \n"
"1: \n"
"ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 8 ARGB
// pixels.
"subs %w3, %w3, #8 \n" // 8 processed per loop.
"uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts.
"uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
"uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts.
"uaddlp v3.8h, v3.16b \n" // A 16 bytes -> 8 shorts.
"ld4 {v16.16b,v17.16b,v18.16b,v19.16b}, [%1], #64 \n" // load 8
// more ARGB
// pixels.
"uadalp v0.8h, v16.16b \n" // B 16 bytes -> 8 shorts.
"uadalp v1.8h, v17.16b \n" // G 16 bytes -> 8 shorts.
"uadalp v2.8h, v18.16b \n" // R 16 bytes -> 8 shorts.
......@@ -867,7 +861,6 @@ void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb,
: "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16");
}
// clang-format off
// TODO(Yang Zhang): Investigate less load instructions for
// the x/dx stepping
#define LOAD1_DATA32_LANE(vn, n) \
......@@ -875,7 +868,6 @@ void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb,
"add %6, %1, %5, lsl #2 \n" \
"add %3, %3, %4 \n" \
"ld1 {" #vn ".s}[" #n "], [%6] \n"
// clang-format on
void ScaleARGBCols_NEON(uint8* dst_argb,
const uint8* src_argb,
......@@ -883,15 +875,21 @@ void ScaleARGBCols_NEON(uint8* dst_argb,
int x,
int dx) {
const uint8* src_tmp = src_argb;
int64 x64 = (int64)x;
int64 dx64 = (int64)dx;
int64 x64 = (int64)x; // NOLINT
int64 dx64 = (int64)dx; // NOLINT
int64 tmp64;
asm volatile(
"1: \n" LOAD1_DATA32_LANE(
v0, 0) LOAD1_DATA32_LANE(v0, 1) LOAD1_DATA32_LANE(v0, 2)
LOAD1_DATA32_LANE(v0, 3) LOAD1_DATA32_LANE(v1, 0) LOAD1_DATA32_LANE(
v1, 1) LOAD1_DATA32_LANE(v1, 2) LOAD1_DATA32_LANE(v1, 3)
"1: \n"
// clang-format off
LOAD1_DATA32_LANE(v0, 0)
LOAD1_DATA32_LANE(v0, 1)
LOAD1_DATA32_LANE(v0, 2)
LOAD1_DATA32_LANE(v0, 3)
LOAD1_DATA32_LANE(v1, 0)
LOAD1_DATA32_LANE(v1, 1)
LOAD1_DATA32_LANE(v1, 2)
LOAD1_DATA32_LANE(v1, 3)
// clang-format on
"st1 {v0.4s, v1.4s}, [%0], #32 \n" // store pixels
"subs %w2, %w2, #8 \n" // 8 processed per
// loop
......@@ -909,7 +907,6 @@ void ScaleARGBCols_NEON(uint8* dst_argb,
#undef LOAD1_DATA32_LANE
// clang-format off
// TODO(Yang Zhang): Investigate less load instructions for
// the x/dx stepping
#define LOAD2_DATA32_LANE(vn1, vn2, n) \
......@@ -917,7 +914,6 @@ void ScaleARGBCols_NEON(uint8* dst_argb,
"add %6, %1, %5, lsl #2 \n" \
"add %3, %3, %4 \n" \
"ld2 {" #vn1 ".s, " #vn2 ".s}[" #n "], [%6] \n"
// clang-format on
void ScaleARGBFilterCols_NEON(uint8* dst_argb,
const uint8* src_argb,
......@@ -927,8 +923,8 @@ void ScaleARGBFilterCols_NEON(uint8* dst_argb,
int dx_offset[4] = {0, 1, 2, 3};
int* tmp = dx_offset;
const uint8* src_tmp = src_argb;
int64 x64 = (int64)x;
int64 dx64 = (int64)dx;
int64 x64 = (int64)x; // NOLINT
int64 dx64 = (int64)dx; // NOLINT
asm volatile (
"dup v0.4s, %w3 \n" // x
"dup v1.4s, %w4 \n" // dx
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment