Commit 7e59ee4c authored by Frank Barchard

Upsample 8x2 pixels to 16x1 with bilinear filtering

Downsample 16x2 to 8x1 with box filtering
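
For reference, the per-pixel arithmetic of the two kernels can be sketched in scalar C (a minimal sketch with hypothetical helper names, not the libyuv C fallback; uint16_t stands in for libyuv's uint16 typedef):

#include <stdint.h>

// Bilinear 2x upsample: each destination pixel is a 9/3/3/1 weighted blend of
// the nearest 2x2 source pixels, divided by 16 with rounding (matches the
// uqrshrn #4 in the NEON code).
static uint16_t Blend9331(uint16_t a, uint16_t b, uint16_t c, uint16_t d) {
  return (uint16_t)((9 * a + 3 * b + 3 * c + 1 * d + 8) >> 4);
}

// 2x2 box downsample: rounded average of a 2x2 block (matches rshrn #2).
static uint16_t Box2x2(uint16_t a, uint16_t b, uint16_t c, uint16_t d) {
  return (uint16_t)((a + b + c + d + 2) >> 2);
}

The 9/3/3/1 weights are the bilinear weights for a destination sample placed a quarter pixel from the nearest source pixel in each dimension: (3/4)(3/4), (3/4)(1/4), (1/4)(3/4) and (1/4)(1/4), i.e. 9/16, 3/16, 3/16 and 1/16.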

[ RUN      ] LibYUVScaleTest.TestScaleRowUp2_16
[       OK ] LibYUVScaleTest.TestScaleRowUp2_16 (579 ms)
[ RUN      ] LibYUVScaleTest.TestScaleRowDown2Box_16
[       OK ] LibYUVScaleTest.TestScaleRowDown2Box_16 (329 ms)
[----------] 2 tests from LibYUVScaleTest (909 ms total)

TBR=kjellander@chromium.org
BUG=libyuv:718
TEST=LibYUVScaleTest.TestScaleRowUp2_16 and LibYUVScaleTest.TestScaleRowDown2Box_16

Change-Id: I457d44123f2751e5f71bf3935401fff74b8e9db2
Reviewed-on: https://chromium-review.googlesource.com/608876
Reviewed-by: Cheng Wang <wangcheng@google.com>
parent 9508c3e7
@@ -995,6 +995,112 @@ void ScaleARGBFilterCols_NEON(uint8* dst_argb,
#undef LOAD2_DATA32_LANE

// Read 16x2 average down and write 8x1.
void ScaleRowDown2Box_16_NEON(const uint16* src_ptr,
                              ptrdiff_t src_stride,
                              uint16* dst,
                              int dst_width) {
  asm volatile(
      // change the stride to row 2 pointer
      "add        %1, %1, %0                 \n"
      "1:                                    \n"
      "ld1        {v0.8h, v1.8h}, [%0], #32  \n"  // load row 1 and post inc
      "ld1        {v2.8h, v3.8h}, [%1], #32  \n"  // load row 2 and post inc
      "subs       %w3, %w3, #8               \n"  // 8 processed per loop
      "uaddlp     v0.4s, v0.8h               \n"  // row 1 add adjacent
      "uaddlp     v1.4s, v1.8h               \n"
      "uadalp     v0.4s, v2.8h               \n"  // row 2 add adjacent + row 1
      "uadalp     v1.4s, v3.8h               \n"
      "rshrn      v0.4h, v0.4s, #2           \n"  // downshift, round and pack
      "rshrn2     v0.8h, v1.4s, #2           \n"
"st1 {v0.8h}, [%2], #16 \n"
"b.gt 1b \n"
: "+r"(src_ptr), // %0
"+r"(src_stride), // %1
"+r"(dst), // %2
"+r"(dst_width) // %3
:
: "v0", "v1", "v2", "v3" // Clobber List
);
}

// Read 8x2 upsample with filtering and write 16x1.
// Actually reads an extra pixel, so 9x2.
void ScaleRowUp2_16_NEON(const uint16* src_ptr,
                         ptrdiff_t src_stride,
                         uint16* dst,
                         int dst_width) {
  asm volatile(
      // change the stride to row 2 pointer
      "add        %1, %1, %0                 \n"
      "movi       v20.4h, #1                 \n"
      "movi       v21.4h, #3                 \n"  // constants
      "movi       v22.4h, #9                 \n"
      "1:                                    \n"
      "ld2        {v0.4h, v1.4h}, [%0], %4   \n"  // load row 1 even pixels
      "ld2        {v2.4h, v3.4h}, [%1], %4   \n"  // load row 2
      // consider a variation of this for last 8x2 that replicates the last pixel.
      "ld2        {v4.4h, v5.4h}, [%0], %5   \n"  // load row 1 odd pixels
      "ld2        {v6.4h, v7.4h}, [%1], %5   \n"  // load row 2
      "subs       %w3, %w3, #16              \n"  // 16 dst pixels per loop
      // filter first 2x2 group to produce 1st and 4th dest pixels
      //   9 3
      //   3 1
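      // i.e. v8 accumulates 9*row1_even + 3*row1_odd + 3*row2_even + 1*row2_odd;
      // the uqrshrn #4 below divides by 16 with rounding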
"umull v8.4s, v0.4h, v22.4h \n"
"umlal v8.4s, v1.4h, v21.4h \n"
"umlal v8.4s, v2.4h, v21.4h \n"
"umlal v8.4s, v3.4h, v20.4h \n"
// filter first 2x2 group to produce 2nd and 5th dest pixel
// 3 9
// 1 3
"umull v9.4s, v0.4h, v21.4h \n"
"umlal v9.4s, v1.4h, v22.4h \n"
"umlal v9.4s, v2.4h, v20.4h \n"
"umlal v9.4s, v3.4h, v21.4h \n"
// filter second 2x2 group to produce 3rd and 6th dest pixels
// 9 3
// 3 1
"umull v10.4s, v4.4h, v22.4h \n"
"umlal v10.4s, v5.4h, v21.4h \n"
"umlal v10.4s, v6.4h, v21.4h \n"
"umlal v10.4s, v7.4h, v20.4h \n"
// filter second 2x2 group to produce 4th and 7th dest pixel
// 3 9
// 1 3
"umull v11.4s, v4.4h, v21.4h \n"
"umlal v11.4s, v5.4h, v22.4h \n"
"umlal v11.4s, v6.4h, v20.4h \n"
"umlal v11.4s, v7.4h, v21.4h \n"
"uqrshrn v8.4h, v8.4s, #4 \n" // downshift, round
"uqrshrn v9.4h, v9.4s, #4 \n"
"uqrshrn v10.4h, v10.4s, #4 \n"
"uqrshrn v11.4h, v11.4s, #4 \n"
"st4 {v8.4h, v9.4h, v10.4h, v11.4h}, [%2], #32 \n"
"b.gt 1b \n"
: "+r"(src_ptr), // %0
"+r"(src_stride), // %1
"+r"(dst), // %2
"+r"(dst_width) // %3
: "r"(2LL), // %4
"r"(14LL) // %5
: "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
"v8", "v9", "v10", "v11", "v20", "v21", "v22" // Clobber List
);
}
#endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
#ifdef __cplusplus
@@ -450,4 +450,80 @@ TEST_F(LibYUVScaleTest, TestScaleOdd) {
}
#endif // HAS_SCALEROWDOWN2_SSSE3
#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
extern "C" void ScaleRowUp2_16_NEON(const uint16* src_ptr,
ptrdiff_t src_stride,
uint16* dst,
int dst_width);
TEST_F(LibYUVScaleTest, TestScaleRowUp2_16) {
SIMD_ALIGNED(uint16 orig_pixels[640 * 2 + 1]); // 2 rows + 1 pixel overrun
SIMD_ALIGNED(uint16 dst_pixels_opt[1280]);
SIMD_ALIGNED(uint16 dst_pixels_c[1280]);
memset(orig_pixels, 0, sizeof(orig_pixels));
memset(dst_pixels_opt, 1, sizeof(dst_pixels_opt));
memset(dst_pixels_c, 2, sizeof(dst_pixels_c));
for (int i = 0; i < 640 * 2 + 1; ++i) {
orig_pixels[i] = i;
}
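  // The reference output is computed with the same NEON kernel; the loop below
  // benchmarks it and checks that repeated runs produce identical results.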
  ScaleRowUp2_16_NEON(&orig_pixels[0], 640 * 2, &dst_pixels_c[0], 1280);
  for (int i = 0; i < benchmark_pixels_div1280_; ++i) {
    ScaleRowUp2_16_NEON(&orig_pixels[0], 640 * 2, &dst_pixels_opt[0], 1280);
  }
  for (int i = 0; i < 1280; ++i) {
    EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]);
  }
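  // First output pixel: 9/3/3/1 blend of src[0], src[1] and the row-2 pixels
  // src[640], src[641], rounded (add 8, divide by 16).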
  EXPECT_EQ(dst_pixels_c[0], (0 * 9 + 1 * 3 + 640 * 3 + 641 * 1 + 8) / 16);
  EXPECT_EQ(dst_pixels_c[1279], 800);
}
extern "C" void ScaleRowDown2Box_16_NEON(const uint16* src_ptr,
ptrdiff_t src_stride,
uint16* dst,
int dst_width);
TEST_F(LibYUVScaleTest, TestScaleRowDown2Box_16) {
SIMD_ALIGNED(uint16 orig_pixels[2560 * 2]);
SIMD_ALIGNED(uint16 dst_pixels_opt[1280]);
SIMD_ALIGNED(uint16 dst_pixels_c[1280]);
memset(orig_pixels, 0, sizeof(orig_pixels));
memset(dst_pixels_opt, 1, sizeof(dst_pixels_opt));
memset(dst_pixels_c, 2, sizeof(dst_pixels_c));
for (int i = 0; i < 2560 * 2; ++i) {
orig_pixels[i] = i;
}
ScaleRowDown2Box_16_NEON(&orig_pixels[0],
2560 * 2,
&dst_pixels_c[0],
1280);
for (int i = 0; i < benchmark_pixels_div1280_; ++i) {
ScaleRowDown2Box_16_NEON(&orig_pixels[0],
2560 * 2,
&dst_pixels_opt[0],
1280);
}
for (int i = 0; i < 1280; ++i) {
EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]);
}
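  // First output: (0 + 1 + 2560 + 2561 + 2) >> 2 = 1281.
  // Last output: (2558 + 2559 + 5118 + 5119 + 2) >> 2 = 3839.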
  EXPECT_EQ(dst_pixels_c[0], 1281);
  EXPECT_EQ(dst_pixels_c[1279], 3839);
}
#endif // __aarch64__
} // namespace libyuv