Commit f0a9d6d2 authored by Frank Barchard

Gaussian reorder for benefit of A73

Roughly: instead of 4 loads and 8 multiplies, use 1 load and 2 multiplies,
4 times over.  The original code, like the code clang and gcc generate
from C, did all the loads, then all the math, then the store.  The new
code does a load, then the math that consumes it, then the next load, etc.
This schedules better on current ARM 64-bit CPUs.
The number of registers is also reduced, by reusing the same registers.
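
For reference, a plain-C model of what the two kernels compute (a sketch
reconstructed from the * 1 / * 4 / * 6 comments in the assembly and the C
loop in the deleted GaussRow_NEON2 below; uint16/uint32 are libyuv's
typedefs; this is not the shipped fallback):

// 5-tap Gaussian, coefficients 1 4 6 4 1 (gain 16 per pass).
// Column pass: weighted sum of 5 rows of 16-bit samples into 32-bit sums.
void GaussCol_C(const uint16* src0, const uint16* src1, const uint16* src2,
                const uint16* src3, const uint16* src4, uint32* dst,
                int width) {
  int i;
  for (i = 0; i < width; ++i) {
    dst[i] = src0[i] * 1 + src1[i] * 4 + src2[i] * 6 + src3[i] * 4 +
             src4[i] * 1;
  }
}

// Row pass: filter the 32-bit column sums horizontally, then round and
// shift out the accumulated 16 * 16 = 256 gain.
void GaussRow_C(const uint32* src, uint16* dst, int width) {
  int i;
  for (i = 0; i < width; ++i) {
    dst[i] = (src[i] + src[i + 1] * 4 + src[i + 2] * 6 + src[i + 3] * 4 +
              src[i + 4] + 128) >> 8;
  }
}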

HiSilicon ARM A73:

Now
TestGaussRow_Opt (890 ms)
TestGaussCol_Opt (571 ms)

Was
TestGaussRow_Opt (1061 ms)
TestGaussCol_Opt (595 ms)

Qualcomm 821 (Pixel):

Now
TestGaussRow_Opt (571 ms)
TestGaussCol_Opt (474 ms)

Was
TestGaussRow_Opt (751 ms)
TestGaussCol_Opt (520 ms)

TBR=kjellander@chromium.org
BUG=libyuv:719
TEST=LibYUVPlanarTest.TestGaussRow_Opt

Reviewed-on: https://chromium-review.googlesource.com/627478
Reviewed-by: Cheng Wang <wangcheng@google.com>
Reviewed-by: Frank Barchard <fbarchard@google.com>
Change-Id: I5ec81191d460801f0d4a89f0384f89925ff036de
Reviewed-on: https://chromium-review.googlesource.com/634448
Commit-Queue: Frank Barchard <fbarchard@google.com>
parent ad240944
@@ -2706,22 +2706,19 @@ void GaussCol_NEON(const uint16* src0,
"1: \n"
"ld1 {v1.8h}, [%0], #16 \n" // load 8 samples, 5 rows
"ld1 {v2.8h}, [%4], #16 \n"
"uaddl v0.4s, v1.4h, v2.4h \n" // * 1
"uaddl2 v1.4s, v1.8h, v2.8h \n" // * 1
"ld1 {v2.8h}, [%1], #16 \n"
"ld1 {v3.8h}, [%2], #16 \n"
"ld1 {v4.8h}, [%3], #16 \n"
"ld1 {v5.8h}, [%4], #16 \n"
"umlal v0.4s, v2.4h, v6.4h \n" // * 4
"umlal2 v1.4s, v2.8h, v6.8h \n" // * 4
"ld1 {v2.8h}, [%2], #16 \n"
"umlal v0.4s, v2.4h, v7.4h \n" // * 6
"umlal2 v1.4s, v2.8h, v7.8h \n" // * 6
"ld1 {v2.8h}, [%3], #16 \n"
"umlal v0.4s, v2.4h, v6.4h \n" // * 4
"umlal2 v1.4s, v2.8h, v6.8h \n" // * 4
"subs %w6, %w6, #8 \n" // 8 processed per loop
"uaddl v0.4s, v1.4h, v5.4h \n" // * 1
"uaddl2 v1.4s, v1.8h, v5.8h \n" // * 1
"umlal v0.4s, v2.4h, v6.4h \n" // * 4
"umlal2 v1.4s, v2.8h, v6.8h \n" // * 4
"umlal v0.4s, v3.4h, v7.4h \n" // * 6
"umlal2 v1.4s, v3.8h, v7.8h \n" // * 6
"umlal v0.4s, v4.4h, v6.4h \n" // * 4
"umlal2 v1.4s, v4.8h, v6.8h \n" // * 4
"st1 {v0.4s,v1.4s}, [%5], #32 \n" // store 8 samples
"b.gt 1b \n"
@@ -2733,93 +2730,7 @@ void GaussCol_NEON(const uint16* src0,
"+r"(dst), // %5
"+r"(width) // %6
:
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
}
- #if 0
-   a8: ad7f8d82 ldp q2, q3, [x12,#-16]
-   ac: 3cdf8186 ldur q6, [x12,#-8]
-   b0: 3cdf4184 ldur q4, [x12,#-12]
-   b4: 3cc04185 ldur q5, [x12,#4]
-   b8: 3cc08187 ldur q7, [x12,#8]
-   bc: 3cdfc190 ldur q16, [x12,#-4]
-   c0: 3cc0c191 ldur q17, [x12,#12]
-   c4: 3dc00592 ldr q18, [x12,#16]
-   c8: 4ea094c2 mla v2.4s, v6.4s, v0.4s #6
-   cc: 4ea48604 add v4.4s, v16.4s, v4.4s
-   d0: 4ea58625 add v5.4s, v17.4s, v5.4s
-   d4: 4ea38442 add v2.4s, v2.4s, v3.4s
-   d8: 4ea094e3 mla v3.4s, v7.4s, v0.4s #6
-   dc: 4f225484 shl v4.4s, v4.4s, #2
-   e0: 4f2254a5 shl v5.4s, v5.4s, #2
-   e4: 4eb28463 add v3.4s, v3.4s, v18.4s
-   e8: 4ea48442 add v2.4s, v2.4s, v4.4s
-   ec: 4ea58463 add v3.4s, v3.4s, v5.4s
-   f0: 4ea18442 add v2.4s, v2.4s, v1.4s #128
-   f4: 4ea18463 add v3.4s, v3.4s, v1.4s #128
-   f8: 0f188442 shrn v2.4h, v2.4s, #8
-   fc: 0f188463 shrn v3.4h, v3.4s, #8
-  100: f10021ad subs x13, x13, #0x8
-  104: 6d3f8d62 stp d2, d3, [x11,#-8]
-  108: 9100416b add x11, x11, #0x10
-  10c: 9100818c add x12, x12, #0x20
-  110: 54fffcc1 b.ne a8 <GaussRow_C+0xa8>
- #endif
- // filter 5 rows with 1, 4, 6, 4, 1 coefficients to produce 1 row.
- void GaussRow_NEON3(const uint32* src, uint16* dst, int width) {
-   asm volatile(
-     "movi v0.4s, #6 \n" // constant 6
-     "add %0, %0, #0x10 \n"
-     "add %1, %1, #0x8 \n"
-     "1: \n"
-     "ldp q2, q3, [%0,#-16] \n"
-     "ldur q6, [%0,#-8] \n"
-     "ldur q4, [%0,#-12] \n"
-     "ldur q5, [%0,#4] \n"
-     "ldur q7, [%0,#8] \n"
-     "ldur q16, [%0,#-4] \n"
-     "ldur q17, [%0,#12] \n"
-     "ldr q18, [%0,#16] \n"
-     "mla v2.4s, v6.4s, v0.4s \n"
-     "add v4.4s, v16.4s, v4.4s \n"
-     "add v5.4s, v17.4s, v5.4s \n"
-     "add v2.4s, v2.4s, v3.4s \n"
-     "mla v3.4s, v7.4s, v0.4s \n"
-     "shl v4.4s, v4.4s, #2 \n"
-     "shl v5.4s, v5.4s, #2 \n"
-     "add v3.4s, v3.4s, v18.4s \n"
-     "add v2.4s, v2.4s, v4.4s \n"
-     "add v3.4s, v3.4s, v5.4s \n"
-     "add v2.4s, v2.4s, v1.4s \n"
-     "add v3.4s, v3.4s, v1.4s \n"
-     "shrn v2.4h, v2.4s, #8 \n"
-     "shrn v3.4h, v3.4s, #8 \n"
-     "subs %w2, %w2, #0x8 \n"
-     "stp d2, d3, [%1,#-8] \n"
-     "add %1, %1, #0x10 \n"
-     "add %0, %0, #0x20 \n"
-     "b.gt 1b \n"
-     : "+r"(src), // %0
-       "+r"(dst), // %1
-       "+r"(width) // %2
-     :
-     : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
-       "v16", "v17", "v18" );
- }
- void GaussRow_NEON2(const uint32* src, uint16* dst, int width) {
-   int i;
-   for (i = 0; i < width; ++i) {
-     *dst++ =
-         (src[0] + src[1] * 4 + src[2] * 6 + src[3] * 4 + src[4] + 128) >> 8;
-     ++src;
-   }
: "cc", "memory", "v0", "v1", "v2", "v6", "v7");
- }
  // filter 5 rows with 1, 4, 6, 4, 1 coefficients to produce 1 row.
@@ -2832,19 +2743,19 @@ void GaussRow_NEON(const uint32* src, uint16* dst, int width) {
"movi v7.4s, #6 \n" // constant 6
"1: \n"
"ld1 {v1.4s,v2.4s,v3.4s}, [%0], %6 \n" // load 12 source samples
"ld1 {v4.4s,v5.4s}, [%1], #32 \n"
"ld1 {v16.4s,v17.4s}, [%2], #32 \n"
"ld1 {v18.4s,v19.4s}, [%3], #32 \n"
"subs %w5, %w5, #8 \n" // 8 processed per loop
"add v0.4s, v1.4s, v2.4s \n" // * 1
"add v1.4s, v2.4s, v3.4s \n" // * 1
"add v2.4s, v4.4s, v18.4s \n" // add rows for * 4
"add v3.4s, v5.4s, v19.4s \n"
"ld1 {v0.4s,v1.4s,v2.4s}, [%0], %6 \n" // load 12 source samples
"add v0.4s, v0.4s, v1.4s \n" // * 1
"add v1.4s, v1.4s, v2.4s \n" // * 1
"ld1 {v2.4s,v3.4s}, [%2], #32 \n"
"mla v0.4s, v2.4s, v7.4s \n" // * 6
"mla v1.4s, v3.4s, v7.4s \n" // * 6
"ld1 {v2.4s,v3.4s}, [%1], #32 \n"
"ld1 {v4.4s,v5.4s}, [%3], #32 \n"
"add v2.4s, v2.4s, v4.4s \n" // add rows for * 4
"add v3.4s, v3.4s, v5.4s \n"
"mla v0.4s, v2.4s, v6.4s \n" // * 4
"mla v1.4s, v3.4s, v6.4s \n" // * 4
"mla v0.4s, v16.4s, v7.4s \n" // * 6
"mla v1.4s, v17.4s, v7.4s \n" // * 6
"subs %w5, %w5, #8 \n" // 8 processed per loop
"uqrshrn v0.4h, v0.4s, #8 \n" // round and pack
"uqrshrn2 v0.8h, v1.4s, #8 \n"
"st1 {v0.8h}, [%4], #16 \n" // store 8 samples
@@ -2856,44 +2767,8 @@ void GaussRow_NEON(const uint32* src, uint16* dst, int width) {
"+r"(src3), // %3
"+r"(dst), // %4
"+r"(width) // %5
: "r"(32LL) // %6
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
"v16", "v17", "v18", "v19" );
}
// filter 5 rows with 1, 4, 6, 4, 1 coefficients to produce 1 row.
void GaussRow_NEON4(const uint32* src, uint16* dst, int width) {
const uint32* src1 = src + 1;
const uint32* src2 = src + 2;
const uint32* src3 = src + 3;
asm volatile(
"movi v6.4s, #4 \n" // constant 4
"movi v7.4s, #6 \n" // constant 6
"1: \n"
"ld1 {v0.4s,v1.4s}, [%0], %6 \n" // load 8 source samples
"ld1 {v2.4s}, [%1], #16 \n"
"ld1 {v3.4s}, [%2], #16 \n"
"ld1 {v4.4s}, [%3], #16 \n"
"subs %w5, %w5, #4 \n" // 4 processed per loop
"mla v0.4s, v2.4s, v6.4s \n" // * 4
"mla v0.4s, v3.4s, v7.4s \n" // * 6
"mla v0.4s, v4.4s, v6.4s \n" // * 4
"add v0.4s, v0.4s, v1.4s \n" // * 1
"uqrshrn v0.4h, v0.4s, #8 \n" // round and pack
"st1 {v0.4h}, [%4], #8 \n" // store 8 samples
"b.gt 1b \n"
: "+r"(src), // %0
"+r"(src1), // %1
"+r"(src2), // %2
"+r"(src3), // %3
"+r"(dst), // %4
"+r"(width) // %5
: "r"(16LL) // %6
: "cc", "memory", "v0", "v1", "v0", "v1", "v2", "v3", "v4", "v6", "v7" );
: "r"(32LL) // %6
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
}
#endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
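
As an aside on how the two passes fit together (an illustrative sketch, not
code from this CL; the helper name is hypothetical and edge handling is
omitted): the column and row passes compose into a separable 5x5 Gaussian
whose 25 weights k[y] * k[x], with k = {1, 4, 6, 4, 1}, sum to 16 * 16 = 256,
which is why the row pass rounds with +128 and shifts right by 8.

#include <stdint.h>
#include <stdio.h>

// One output pixel of the separable 5x5 blur, written directly.
static uint16_t Gauss5x5Pixel(const uint16_t* src, int stride, int x, int y) {
  static const int k[5] = {1, 4, 6, 4, 1};
  uint32_t sum = 0;
  int i, j;
  for (j = 0; j < 5; ++j) {    // column direction (GaussCol)
    for (i = 0; i < 5; ++i) {  // row direction (GaussRow)
      sum += (uint32_t)src[(y + j) * stride + (x + i)] * k[j] * k[i];
    }
  }
  return (uint16_t)((sum + 128) >> 8);
}

int main(void) {
  // A constant image stays constant: the 25 weights sum to exactly 256.
  uint16_t img[5 * 5];
  int i;
  for (i = 0; i < 25; ++i) img[i] = 100;
  printf("%u\n", (unsigned)Gauss5x5Pixel(img, 5, 0, 0));  // prints 100
  return 0;
}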
@@ -2759,8 +2759,9 @@ TEST_F(LibYUVPlanarTest, TestGaussRow_Opt) {
    EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]);
  }
- EXPECT_EQ(dst_pixels_c[0], 0 * 1 + 1 * 4 + 2 * 6 + 3 * 4 + 4 * 1);
- EXPECT_EQ(dst_pixels_c[1279], 20496);
+ EXPECT_EQ(dst_pixels_c[0],
+           static_cast<uint16>(0 * 1 + 1 * 4 + 2 * 6 + 3 * 4 + 4 * 1));
+ EXPECT_EQ(dst_pixels_c[1279], static_cast<uint16>(20496));
  }
extern "C" void GaussCol_NEON(const uint16* src0,
@@ -2819,8 +2820,9 @@ TEST_F(LibYUVPlanarTest, TestGaussCol_Opt) {
  }
  EXPECT_EQ(dst_pixels_c[0],
-           0 * 1 + 1280 * 4 + 1280 * 2 * 6 + 1280 * 3 * 4 + 1280 * 4 * 1);
- EXPECT_EQ(dst_pixels_c[1279], 61424);
+           static_cast<uint32>(0 * 1 + 1280 * 4 + 1280 * 2 * 6 +
+                               1280 * 3 * 4 + 1280 * 4 * 1));
+ EXPECT_EQ(dst_pixels_c[1279], static_cast<uint32>(61424));
  }
} // namespace libyuv