Commit 8f5e9cd9 authored by Frank Barchard

ScaleRowUp2_16_C port of NEON to C

Single-pass upsample with a bilinear filter.
NEON version optimized; measured on Pixel Sailfish (QC821).

Was TestScaleRowUp2_16 (5741 ms)
Now TestScaleRowUp2_16 (4484 ms)
C   TestScaleRowUp2_16 (6555 ms)

TBR=kjellander@chromium.org
BUG=libyuv:718
TEST=LibYUVScaleTest.TestScaleRowUp2_16 (709 ms)

Change-Id: Ib04ceb53e0ab644a392c39c3396e313530161d92
Reviewed-on: https://chromium-review.googlesource.com/646701
Reviewed-by: Frank Barchard <fbarchard@google.com>
Reviewed-by: Cheng Wang <wangcheng@google.com>
parent 2621c91b
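For context on the filter: the 9/3/3/1 weights used throughout this CL are the standard 2x bilinear kernel. Each output pixel sits 1/4 (or 3/4) of the way between the two source columns and the two source rows, so the corner weights are 3/4*3/4 = 9/16, 3/4*1/4 = 3/16, 1/4*3/4 = 3/16 and 1/4*1/4 = 1/16, with the +8 giving round-to-nearest before the >>4. A minimal scalar sketch of one output pixel (illustrative only; plain uint16_t stands in for libyuv's uint16 typedef, and the helper name is made up):

#include <stdint.h>

// Output pixel nearest the top-left source pixel:
// weights 9/16, 3/16, 3/16, 1/16 with round-to-nearest (+8, >>4).
static uint16_t BilinearUp2Pixel(uint16_t tl, uint16_t tr,
                                 uint16_t bl, uint16_t br) {
  return (uint16_t)((tl * 9 + tr * 3 + bl * 3 + br * 1 + 8) >> 4);
}

The neighbouring output pixel one column to the right simply swaps the horizontal weights (3/9 and 1/3), which is what the C function below computes for dst[1].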
@@ -1306,6 +1306,35 @@ void ScaleSlope(int src_width,
 }
 #undef CENTERSTART
 
+// Read 8x2 upsample with filtering and write 16x1.
+// actually reads an extra pixel, so 9x2.
+void ScaleRowUp2_16_C(const uint16* src_ptr,
+                      ptrdiff_t src_stride,
+                      uint16* dst,
+                      int dst_width) {
+  const uint16* src2 = src_ptr + src_stride;
+  int x;
+  for (x = 0; x < dst_width - 1; x += 2) {
+    uint16 p0 = src_ptr[0];
+    uint16 p1 = src_ptr[1];
+    uint16 p2 = src2[0];
+    uint16 p3 = src2[1];
+    dst[0] = (p0 * 9 + p1 * 3 + p2 * 3 + p3 + 8) >> 4;
+    dst[1] = (p0 * 3 + p1 * 9 + p2 + p3 * 3 + 8) >> 4;
+    ++src_ptr;
+    ++src2;
+    dst += 2;
+  }
+  if (dst_width & 1) {
+    uint16 p0 = src_ptr[0];
+    uint16 p1 = src_ptr[1];
+    uint16 p2 = src2[0];
+    uint16 p3 = src2[1];
+    dst[0] = (p0 * 9 + p1 * 3 + p2 * 3 + p3 + 8) >> 4;
+  }
+}
+
 #ifdef __cplusplus
 }  // extern "C"
 }  // namespace libyuv
......
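A throwaway harness like the following can exercise the new C path on a tiny 8x2 strip. This is an assumption on my part rather than part of the CL; it presumes the file is linked against libyuv's scale_common.cc and that uint16 is a 16-bit unsigned typedef:

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

typedef uint16_t uint16;

// Provided by scale_common.cc (declared inside its extern "C" block).
extern void ScaleRowUp2_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
                             uint16* dst, int dst_width);

int main(void) {
  // 8 source pixels per row plus the documented 1-pixel overread.
  uint16 src[2][9];
  for (int i = 0; i < 9; ++i) {
    src[0][i] = (uint16)i;        // top row:    0..8
    src[1][i] = (uint16)(i + 9);  // bottom row: 9..17
  }
  uint16 dst[16];
  ScaleRowUp2_16_C(&src[0][0], 9, dst, 16);  // stride of 9 elements
  for (int i = 0; i < 16; ++i) {
    // First output: (0*9 + 1*3 + 9*3 + 10 + 8) >> 4 = 3.
    printf("%d ", (int)dst[i]);
  }
  printf("\n");
  return 0;
}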
@@ -1010,66 +1010,43 @@ void ScaleRowDown2Box_16_NEON(const uint16* src_ptr,
 }
 
 // Read 8x2 upsample with filtering and write 16x1.
-// actually reads an extra pixel, so 9x2.
+// Actually reads an extra pixel, so 9x2.
 void ScaleRowUp2_16_NEON(const uint16* src_ptr,
                          ptrdiff_t src_stride,
                          uint16* dst,
                          int dst_width) {
   asm volatile(
      "add        %1, %0, %1, lsl #1     \n"  // ptr + stride * 2
-     "movi       v20.4h, #1             \n"
-     "movi       v21.4h, #3             \n"  // constants
-     "movi       v22.4h, #9             \n"
+     "movi       v0.8h, #9              \n"  // constants
+     "movi       v1.4s, #3              \n"
 
      "1:                                \n"
-     "ld2        {v0.4h, v1.4h}, [%0], %4 \n"  // load row 1 even pixels
-     "ld2        {v2.4h, v3.4h}, [%1], %4 \n"  // load row 2
-
-     // consider a variation of this for last 8x2 that replicates the last
-     // pixel.
-     "ld2        {v4.4h, v5.4h}, [%0], %5 \n"  // load row 1 odd pixels
-     "ld2        {v6.4h, v7.4h}, [%1], %5 \n"  // load row 2
+     "ld1        {v3.8h}, [%0], %4      \n"  // TL read first 8
+     "ld1        {v4.8h}, [%0], %5      \n"  // TR read 8 offset by 1
+     "ld1        {v5.8h}, [%1], %4      \n"  // BL read 8 from next row
+     "ld1        {v6.8h}, [%1], %5      \n"  // BR offset by 1
      "subs       %w3, %w3, #16          \n"  // 16 dst pixels per loop
-
-     // filter first 2x2 group to produce 1st and 4th dest pixels
-     // 9 3
-     // 3 1
-     "umull      v8.4s, v0.4h, v22.4h   \n"
-     "umlal      v8.4s, v1.4h, v21.4h   \n"
-     "umlal      v8.4s, v2.4h, v21.4h   \n"
-     "umlal      v8.4s, v3.4h, v20.4h   \n"
-
-     // filter first 2x2 group to produce 2nd and 5th dest pixel
-     // 3 9
-     // 1 3
-     "umull      v9.4s, v0.4h, v21.4h   \n"
-     "umlal      v9.4s, v1.4h, v22.4h   \n"
-     "umlal      v9.4s, v2.4h, v20.4h   \n"
-     "umlal      v9.4s, v3.4h, v21.4h   \n"
-
-     // filter second 2x2 group to produce 3rd and 6th dest pixels
-     // 9 3
-     // 3 1
-     "umull      v10.4s, v4.4h, v22.4h  \n"
-     "umlal      v10.4s, v5.4h, v21.4h  \n"
-     "umlal      v10.4s, v6.4h, v21.4h  \n"
-     "umlal      v10.4s, v7.4h, v20.4h  \n"
-
-     // filter second 2x2 group to produce 4th and 7th dest pixel
-     // 3 9
-     // 1 3
-     "umull      v11.4s, v4.4h, v21.4h  \n"
-     "umlal      v11.4s, v5.4h, v22.4h  \n"
-     "umlal      v11.4s, v6.4h, v20.4h  \n"
-     "umlal      v11.4s, v7.4h, v21.4h  \n"
-
-     "uqrshrn    v8.4h, v8.4s, #4       \n"  // downshift, round
-     "uqrshrn    v9.4h, v9.4s, #4       \n"
-     "uqrshrn    v10.4h, v10.4s, #4     \n"
-     "uqrshrn    v11.4h, v11.4s, #4     \n"
-
-     "st4        {v8.4h, v9.4h, v10.4h, v11.4h}, [%2], #32 \n"
+     "umull      v16.4s, v3.4h, v0.4h   \n"
+     "umull2     v7.4s, v3.8h, v0.8h    \n"
+     "umull      v18.4s, v4.4h, v0.4h   \n"
+     "umull2     v17.4s, v4.8h, v0.8h   \n"
+     "uaddw      v16.4s, v16.4s, v6.4h  \n"
+     "uaddl2     v19.4s, v6.8h, v3.8h   \n"
+     "uaddl      v3.4s, v6.4h, v3.4h    \n"
+     "uaddw2     v6.4s, v7.4s, v6.8h    \n"
+     "uaddl2     v7.4s, v5.8h, v4.8h    \n"
+     "uaddl      v4.4s, v5.4h, v4.4h    \n"
+     "uaddw      v18.4s, v18.4s, v5.4h  \n"
+     "mla        v16.4s, v4.4s, v1.4s   \n"
+     "mla        v18.4s, v3.4s, v1.4s   \n"
+     "mla        v6.4s, v7.4s, v1.4s    \n"
+     "uaddw2     v4.4s, v17.4s, v5.8h   \n"
+     "uqrshrn    v16.4h, v16.4s, #4     \n"
+     "mla        v4.4s, v19.4s, v1.4s   \n"
+     "uqrshrn2   v16.8h, v6.4s, #4      \n"
+     "uqrshrn    v17.4h, v18.4s, #4     \n"
+     "uqrshrn2   v17.8h, v4.4s, #4      \n"
+     "st2        {v16.8h-v17.8h}, [%2], #32 \n"
      "b.gt       1b                     \n"
      : "+r"(src_ptr),     // %0
        "+r"(src_stride),  // %1
@@ -1077,9 +1054,8 @@ void ScaleRowUp2_16_NEON(const uint16* src_ptr,
       "+r"(dst_width)    // %3
      : "r"(2LL),          // %4
        "r"(14LL)          // %5
-     : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
-       "v11", "v20", "v21", "v22"  // Clobber List
+     : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18",
+       "v19"  // Clobber List
      );
 }
......
@@ -450,12 +450,14 @@ TEST_F(LibYUVScaleTest, TestScaleOdd) {
 }
 #endif  // HAS_SCALEROWDOWN2_SSSE3
 
-#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
 extern "C" void ScaleRowUp2_16_NEON(const uint16* src_ptr,
                                     ptrdiff_t src_stride,
                                     uint16* dst,
                                     int dst_width);
+extern "C" void ScaleRowUp2_16_C(const uint16* src_ptr,
+                                 ptrdiff_t src_stride,
+                                 uint16* dst,
+                                 int dst_width);
 
 TEST_F(LibYUVScaleTest, TestScaleRowUp2_16) {
   SIMD_ALIGNED(uint16 orig_pixels[640 * 2 + 1]);  // 2 rows + 1 pixel overrun
@@ -469,10 +471,19 @@ TEST_F(LibYUVScaleTest, TestScaleRowUp2_16) {
   for (int i = 0; i < 640 * 2 + 1; ++i) {
     orig_pixels[i] = i;
   }
-  ScaleRowUp2_16_NEON(&orig_pixels[0], 640, &dst_pixels_c[0], 1280);
-
+  ScaleRowUp2_16_C(&orig_pixels[0], 640, &dst_pixels_c[0], 1280);
+  MaskCpuFlags(benchmark_cpu_info_);
   for (int i = 0; i < benchmark_pixels_div1280_; ++i) {
-    ScaleRowUp2_16_NEON(&orig_pixels[0], 640, &dst_pixels_opt[0], 1280);
+#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
+    int has_neon = TestCpuFlag(kCpuHasNEON);
+    if (has_neon) {
+      ScaleRowUp2_16_NEON(&orig_pixels[0], 640, &dst_pixels_opt[0], 1280);
+    } else {
+      ScaleRowUp2_16_C(&orig_pixels[0], 640, &dst_pixels_opt[0], 1280);
+    }
+#else
+    ScaleRowUp2_16_C(&orig_pixels[0], 640, &dst_pixels_opt[0], 1280);
+#endif
   }
 
   for (int i = 0; i < 1280; ++i) {
@@ -481,7 +492,6 @@ TEST_F(LibYUVScaleTest, TestScaleRowUp2_16) {
   EXPECT_EQ(dst_pixels_c[0], (0 * 9 + 1 * 3 + 640 * 3 + 641 * 1 + 8) / 16);
   EXPECT_EQ(dst_pixels_c[1279], 800);
 }
-#endif
 
 extern "C" void ScaleRowDown2Box_16_NEON(const uint16* src_ptr,
                                          ptrdiff_t src_stride,
@@ -501,6 +511,7 @@ TEST_F(LibYUVScaleTest, TestScaleRowDown2Box_16) {
     orig_pixels[i] = i;
   }
   ScaleRowDown2Box_16_C(&orig_pixels[0], 2560, &dst_pixels_c[0], 1280);
+  MaskCpuFlags(benchmark_cpu_info_);
   for (int i = 0; i < benchmark_pixels_div1280_; ++i) {
 #if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
     int has_neon = TestCpuFlag(kCpuHasNEON);
......
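As a hand check on the two expectations in TestScaleRowUp2_16 (my arithmetic, not part of the CL): with orig_pixels[i] = i and a row stride of 640, the first output is (0*9 + 1*3 + 640*3 + 641*1 + 8) >> 4 = 2572 >> 4 = 160, and the last output pair is built from p0 = 639, p1 = 640, p2 = 1279, p3 = 1280, so dst[1279] = (639*3 + 640*9 + 1279 + 1280*3 + 8) >> 4 = 12804 >> 4 = 800, matching EXPECT_EQ(dst_pixels_c[1279], 800).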