Commit 8f5e9cd9 authored by Frank Barchard

ScaleRowUp2_16_C port of NEON to C

Single-pass upsample with a bilinear filter.
NEON version also optimized; timings measured on a Pixel (Sailfish, QC821).

Was TestScaleRowUp2_16 (5741 ms)   [previous NEON]
Now TestScaleRowUp2_16 (4484 ms)   [optimized NEON]
C   TestScaleRowUp2_16 (6555 ms)   [new C version]
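
For reference, the 9:3:3:1 weights are ordinary bilinear taps at the
quarter-pixel phase: each output pixel sits 1/4 or 3/4 of the way between
two source columns and 1/4 of the way toward the second source row, so the
even output pixel weights its 2x2 neighborhood as

  TL: 3/4 * 3/4 = 9/16    TR: 1/4 * 3/4 = 3/16
  BL: 3/4 * 1/4 = 3/16    BR: 1/4 * 1/4 = 1/16

which in integer form is (9*p0 + 3*p1 + 3*p2 + p3 + 8) >> 4, the +8
rounding before the shift.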

TBR=kjellander@chromium.org
BUG=libyuv:718
TEST=LibYUVScaleTest.TestScaleRowUp2_16 (709 ms)

Change-Id: Ib04ceb53e0ab644a392c39c3396e313530161d92
Reviewed-on: https://chromium-review.googlesource.com/646701
Reviewed-by: Frank Barchard <fbarchard@google.com>
Reviewed-by: Cheng Wang <wangcheng@google.com>
parent 2621c91b
source/scale_common.cc
@@ -1306,6 +1306,35 @@ void ScaleSlope(int src_width,
 }
 #undef CENTERSTART
 
+// Read 8x2 upsample with filtering and write 16x1.
+// Actually reads an extra pixel, so 9x2.
+void ScaleRowUp2_16_C(const uint16* src_ptr,
+                      ptrdiff_t src_stride,
+                      uint16* dst,
+                      int dst_width) {
+  const uint16* src2 = src_ptr + src_stride;
+
+  int x;
+  for (x = 0; x < dst_width - 1; x += 2) {
+    uint16 p0 = src_ptr[0];
+    uint16 p1 = src_ptr[1];
+    uint16 p2 = src2[0];
+    uint16 p3 = src2[1];
+    dst[0] = (p0 * 9 + p1 * 3 + p2 * 3 + p3 + 8) >> 4;
+    dst[1] = (p0 * 3 + p1 * 9 + p2 + p3 * 3 + 8) >> 4;
+    ++src_ptr;
+    ++src2;
+    dst += 2;
+  }
+  if (dst_width & 1) {
+    uint16 p0 = src_ptr[0];
+    uint16 p1 = src_ptr[1];
+    uint16 p2 = src2[0];
+    uint16 p3 = src2[1];
+    dst[0] = (p0 * 9 + p1 * 3 + p2 * 3 + p3 + 8) >> 4;
+  }
+}
+
 #ifdef __cplusplus
 }  // extern "C"
 }  // namespace libyuv
......
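
As a quick sanity check of the new C path, here is a minimal standalone
sketch (illustrative only, not part of the commit). It reuses the unit
test's index-valued pixels, so row 0 holds 0, 1, ... and row 1 starts at
640:

  #include <stdio.h>

  int main(void) {
    // Top-left 2x2 block of the test image: src_ptr[0..1] = {0, 1},
    // src2[0..1] = {640, 641} with a row stride of 640 pixels.
    unsigned p0 = 0, p1 = 1, p2 = 640, p3 = 641;
    // The two outputs of ScaleRowUp2_16_C's inner loop; +8 rounds the >>4.
    unsigned even = (p0 * 9 + p1 * 3 + p2 * 3 + p3 + 8) >> 4;
    unsigned odd = (p0 * 3 + p1 * 9 + p2 + p3 * 3 + 8) >> 4;
    printf("dst[0]=%u dst[1]=%u\n", even, odd);  // dst[0]=160 dst[1]=161
    return 0;
  }

The printed dst[0] = 160 is exactly the value the unit test's first
EXPECT_EQ below checks against.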
source/scale_neon64.cc
@@ -1010,66 +1010,43 @@ void ScaleRowDown2Box_16_NEON(const uint16* src_ptr,
 }
 
 // Read 8x2 upsample with filtering and write 16x1.
-// actually reads an extra pixel, so 9x2.
+// Actually reads an extra pixel, so 9x2.
 void ScaleRowUp2_16_NEON(const uint16* src_ptr,
                          ptrdiff_t src_stride,
                          uint16* dst,
                          int dst_width) {
   asm volatile(
      "add        %1, %0, %1, lsl #1              \n"  // ptr + stride * 2
-     "movi       v20.4h, #1                      \n"
-     "movi       v21.4h, #3                      \n"  // constants
-     "movi       v22.4h, #9                      \n"
+     "movi       v0.8h, #9                       \n"  // constants
+     "movi       v1.4s, #3                       \n"
     "1:                                          \n"
-     "ld2        {v0.4h, v1.4h}, [%0], %4        \n"  // load row 1 even pixels
-     "ld2        {v2.4h, v3.4h}, [%1], %4        \n"  // load row 2
-     // consider a variation of this for last 8x2 that replicates the last
-     // pixel.
-     "ld2        {v4.4h, v5.4h}, [%0], %5        \n"  // load row 1 odd pixels
-     "ld2        {v6.4h, v7.4h}, [%1], %5        \n"  // load row 2
+     "ld1        {v3.8h}, [%0], %4               \n"  // TL read first 8
+     "ld1        {v4.8h}, [%0], %5               \n"  // TR read 8 offset by 1
+     "ld1        {v5.8h}, [%1], %4               \n"  // BL read 8 from next row
+     "ld1        {v6.8h}, [%1], %5               \n"  // BR offset by 1
      "subs       %w3, %w3, #16                   \n"  // 16 dst pixels per loop
-     // filter first 2x2 group to produce 1st and 4th dest pixels
-     // 9 3
-     // 3 1
-     "umull      v8.4s, v0.4h, v22.4h            \n"
-     "umlal      v8.4s, v1.4h, v21.4h            \n"
-     "umlal      v8.4s, v2.4h, v21.4h            \n"
-     "umlal      v8.4s, v3.4h, v20.4h            \n"
-     // filter first 2x2 group to produce 2nd and 5th dest pixel
-     // 3 9
-     // 1 3
-     "umull      v9.4s, v0.4h, v21.4h            \n"
-     "umlal      v9.4s, v1.4h, v22.4h            \n"
-     "umlal      v9.4s, v2.4h, v20.4h            \n"
-     "umlal      v9.4s, v3.4h, v21.4h            \n"
-     // filter second 2x2 group to produce 3rd and 6th dest pixels
-     // 9 3
-     // 3 1
-     "umull      v10.4s, v4.4h, v22.4h           \n"
-     "umlal      v10.4s, v5.4h, v21.4h           \n"
-     "umlal      v10.4s, v6.4h, v21.4h           \n"
-     "umlal      v10.4s, v7.4h, v20.4h           \n"
-     // filter second 2x2 group to produce 4th and 7th dest pixel
-     // 3 9
-     // 1 3
-     "umull      v11.4s, v4.4h, v21.4h           \n"
-     "umlal      v11.4s, v5.4h, v22.4h           \n"
-     "umlal      v11.4s, v6.4h, v20.4h           \n"
-     "umlal      v11.4s, v7.4h, v21.4h           \n"
-     "uqrshrn    v8.4h, v8.4s, #4                \n"  // downshift, round
-     "uqrshrn    v9.4h, v9.4s, #4                \n"
-     "uqrshrn    v10.4h, v10.4s, #4              \n"
-     "uqrshrn    v11.4h, v11.4s, #4              \n"
-     "st4        {v8.4h, v9.4h, v10.4h, v11.4h}, [%2], #32 \n"
+     "umull      v16.4s, v3.4h, v0.4h            \n"  // 9 * TL (even, low)
+     "umull2     v7.4s, v3.8h, v0.8h             \n"  // 9 * TL (even, high)
+     "umull      v18.4s, v4.4h, v0.4h            \n"  // 9 * TR (odd, low)
+     "umull2     v17.4s, v4.8h, v0.8h            \n"  // 9 * TR (odd, high)
+     "uaddw      v16.4s, v16.4s, v6.4h           \n"  // + BR, weight 1
+     "uaddl2     v19.4s, v6.8h, v3.8h            \n"  // BR + TL, high
+     "uaddl      v3.4s, v6.4h, v3.4h             \n"  // BR + TL, low
+     "uaddw2     v6.4s, v7.4s, v6.8h             \n"  // + BR, weight 1, high
+     "uaddl2     v7.4s, v5.8h, v4.8h             \n"  // BL + TR, high
+     "uaddl      v4.4s, v5.4h, v4.4h             \n"  // BL + TR, low
+     "uaddw      v18.4s, v18.4s, v5.4h           \n"  // + BL, weight 1
+     "mla        v16.4s, v4.4s, v1.4s            \n"  // + 3 * (BL + TR)
+     "mla        v18.4s, v3.4s, v1.4s            \n"  // + 3 * (BR + TL)
+     "mla        v6.4s, v7.4s, v1.4s             \n"  // + 3 * (BL + TR), high
+     "uaddw2     v4.4s, v17.4s, v5.8h            \n"  // + BL, weight 1, high
+     "uqrshrn    v16.4h, v16.4s, #4              \n"  // downshift and round
+     "mla        v4.4s, v19.4s, v1.4s            \n"  // + 3 * (BR + TL), high
+     "uqrshrn2   v16.8h, v6.4s, #4               \n"  // v16 = 8 even pixels
+     "uqrshrn    v17.4h, v18.4s, #4              \n"
+     "uqrshrn2   v17.8h, v4.4s, #4               \n"  // v17 = 8 odd pixels
+     "st2        {v16.8h-v17.8h}, [%2], #32      \n"  // interleave even/odd
      "b.gt       1b                              \n"
      : "+r"(src_ptr),    // %0
        "+r"(src_stride), // %1
@@ -1077,9 +1054,8 @@ void ScaleRowUp2_16_NEON(const uint16* src_ptr,
        "+r"(dst_width)   // %3
      : "r"(2LL),         // %4
        "r"(14LL)         // %5
-     : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
-       "v11", "v20", "v21", "v22"  // Clobber List
+     : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18",
+       "v19"  // Clobber List
      );
 }
......
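
The rewritten loop regroups the taps so that, per output, one widening
multiply by 9 (umull/umull2 by v0) plus one multiply-accumulate by 3 of a
shared sum (mla by v1) replace the four umull/umlal per result in the old
code, and plain ld1 loads replace the interleaving ld2; st2 then restores
the even/odd interleave on the store. In scalar terms (my reading of the
new schedule, not code from the commit):

  even = 9*TL + 3*(TR + BL) + BR
  odd  = 9*TR + 3*(TL + BR) + BL

A standalone sketch verifying that this regrouping matches the direct
9:3:3:1 form used by ScaleRowUp2_16_C:

  #include <assert.h>
  #include <stdio.h>

  int main(void) {
    // Sweep a small range of corner values with arbitrary neighbors.
    for (unsigned p0 = 0; p0 < 256; ++p0) {
      for (unsigned p3 = 0; p3 < 256; ++p3) {
        unsigned p1 = (p0 * 7 + 13) & 1023;
        unsigned p2 = (p3 * 5 + 11) & 1023;
        unsigned direct = (p0 * 9 + p1 * 3 + p2 * 3 + p3 + 8) >> 4;
        unsigned grouped = (9 * p0 + 3 * (p1 + p2) + p3 + 8) >> 4;
        assert(direct == grouped);  // identical by distributivity
      }
    }
    printf("regrouped filter matches the direct form\n");
    return 0;
  }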
unit_test/scale_test.cc
@@ -450,12 +450,14 @@ TEST_F(LibYUVScaleTest, TestScaleOdd) {
 }
 #endif  // HAS_SCALEROWDOWN2_SSSE3
 
-#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
 extern "C" void ScaleRowUp2_16_NEON(const uint16* src_ptr,
                                     ptrdiff_t src_stride,
                                     uint16* dst,
                                     int dst_width);
+extern "C" void ScaleRowUp2_16_C(const uint16* src_ptr,
+                                 ptrdiff_t src_stride,
+                                 uint16* dst,
+                                 int dst_width);
 
 TEST_F(LibYUVScaleTest, TestScaleRowUp2_16) {
   SIMD_ALIGNED(uint16 orig_pixels[640 * 2 + 1]);  // 2 rows + 1 pixel overrun
@@ -469,10 +471,19 @@ TEST_F(LibYUVScaleTest, TestScaleRowUp2_16) {
   for (int i = 0; i < 640 * 2 + 1; ++i) {
     orig_pixels[i] = i;
   }
-  ScaleRowUp2_16_NEON(&orig_pixels[0], 640, &dst_pixels_c[0], 1280);
+  ScaleRowUp2_16_C(&orig_pixels[0], 640, &dst_pixels_c[0], 1280);
+  MaskCpuFlags(benchmark_cpu_info_);
   for (int i = 0; i < benchmark_pixels_div1280_; ++i) {
-    ScaleRowUp2_16_NEON(&orig_pixels[0], 640, &dst_pixels_opt[0], 1280);
+#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
+    int has_neon = TestCpuFlag(kCpuHasNEON);
+    if (has_neon) {
+      ScaleRowUp2_16_NEON(&orig_pixels[0], 640, &dst_pixels_opt[0], 1280);
+    } else {
+      ScaleRowUp2_16_C(&orig_pixels[0], 640, &dst_pixels_opt[0], 1280);
+    }
+#else
+    ScaleRowUp2_16_C(&orig_pixels[0], 640, &dst_pixels_opt[0], 1280);
+#endif
   }
 
   for (int i = 0; i < 1280; ++i) {
@@ -481,7 +492,6 @@ TEST_F(LibYUVScaleTest, TestScaleRowUp2_16) {
   EXPECT_EQ(dst_pixels_c[0], (0 * 9 + 1 * 3 + 640 * 3 + 641 * 1 + 8) / 16);
   EXPECT_EQ(dst_pixels_c[1279], 800);
 }
-#endif
 
 extern "C" void ScaleRowDown2Box_16_NEON(const uint16* src_ptr,
                                          ptrdiff_t src_stride,
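
Worked from the C formulas, both expected values above check out. The last
output pair reads the final source column plus the one-pixel overrun, i.e.
p0 = 639, p1 = 640, p2 = 1279, p3 = 1280:

  dst_pixels_c[0]    = (0*9 + 1*3 + 640*3 + 641*1 + 8) >> 4
                     = 2572 >> 4  = 160
  dst_pixels_c[1279] = (639*3 + 640*9 + 1279*1 + 1280*3 + 8) >> 4
                     = 12804 >> 4 = 800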
@@ -501,6 +511,7 @@ TEST_F(LibYUVScaleTest, TestScaleRowDown2Box_16) {
     orig_pixels[i] = i;
   }
   ScaleRowDown2Box_16_C(&orig_pixels[0], 2560, &dst_pixels_c[0], 1280);
+  MaskCpuFlags(benchmark_cpu_info_);
   for (int i = 0; i < benchmark_pixels_div1280_; ++i) {
 #if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
     int has_neon = TestCpuFlag(kCpuHasNEON);
......