Commit ad240944 authored by Frank Barchard

GaussRow_NEON from int to short

 [ RUN      ] LibYUVPlanarTest.TestGaussRow_Opt
 [       OK ] LibYUVPlanarTest.TestGaussRow_Opt (601 ms)
 [ RUN      ] LibYUVPlanarTest.TestGaussCol_Opt
 [       OK ] LibYUVPlanarTest.TestGaussCol_Opt (522 ms)

TBR=kjellander@chromium.org
BUG=libyuv:719
TEST=LibYUVPlanarTest.TestGaussRow_Opt

Change-Id: I1242b98672538e889f3ab48f215d6dabc7144ea7
Reviewed-on: https://chromium-review.googlesource.com/627478
Reviewed-by: Cheng Wang <wangcheng@google.com>
Reviewed-by: Frank Barchard <fbarchard@google.com>
parent 1cc539f7
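For orientation, the kernel in question is the row pass of a separable Gaussian: it applies 1, 4, 6, 4, 1 to five adjacent 32-bit column sums and packs the result into 16 bits. A minimal C sketch of that contract, mirroring the GaussRow_C reference added below (the helper name gauss_row_ref is illustrative, not part of the commit):

#include <stdint.h>

// 1,4,6,4,1 over five 32-bit column sums; the coefficients sum to 16 and the
// column pass already scaled by 16, so ">> 8" (divide by 256) returns the
// result to the source range, where it fits in uint16.
static void gauss_row_ref(const uint32_t* src, uint16_t* dst, int width) {
  int i;
  for (i = 0; i < width; ++i) {
    *dst++ = (uint16_t)(
        (src[0] + src[1] * 4 + src[2] * 6 + src[3] * 4 + src[4] + 128) >> 8);
    ++src;
  }
}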
@@ -2672,6 +2672,15 @@ void ScaleSamples_C(const float* src, float* dst, float scale, int width) {
}
}
void GaussRow_C(const uint32* src, uint16* dst, int width) {
int i;
for (i = 0; i < width; ++i) {
*dst++ =
(src[0] + src[1] * 4 + src[2] * 6 + src[3] * 4 + src[4] + 128) >> 8;
++src;
}
}
// filter 5 rows with 1, 4, 6, 4, 1 coefficients to produce 1 row.
void GaussCol_C(const uint16* src0,
const uint16* src1,
@@ -2692,13 +2692,6 @@ void ScaleSamples_NEON(const float* src, float* dst, float scale, int width) {
: "cc", "memory", "v1", "v2");
}
static vec16 kGauseCoefficients[4] = {
{1, 4, 6, 4, 1, 0, 0, 0},
{0, 1, 4, 6, 4, 1, 0, 0},
{0, 0, 1, 4, 6, 4, 1, 0},
{0, 0, 0, 1, 4, 6, 4, 1},
};
// filter 5 rows with 1, 4, 6, 4, 1 coefficients to produce 1 row.
void GaussCol_NEON(const uint16* src0,
const uint16* src1,
@@ -2719,15 +2712,15 @@ void GaussCol_NEON(const uint16* src0,
"ld1 {v5.8h}, [%4], #16 \n"
"subs %w6, %w6, #8 \n" // 8 processed per loop
"uaddl v0.4s, v1.4h, v5.4h \n" // * 1
"uaddl2 v1.4s, v1.8h, v5.8h \n" // * 1
"uaddl v0.4s, v1.4h, v5.4h \n" // * 1
"uaddl2 v1.4s, v1.8h, v5.8h \n" // * 1
"umlal v0.4s, v2.4h, v6.4h \n" // * 4
"umlal2 v1.4s, v2.8h, v6.8h \n" // * 4
"umlal v0.4s, v3.4h, v7.4h \n" // * 6
"umlal2 v1.4s, v3.8h, v7.8h \n" // * 6
"umlal v0.4s, v4.4h, v6.4h \n" // * 4
"umlal2 v1.4s, v4.8h, v6.8h \n" // * 4
"umlal v0.4s, v2.4h, v6.4h \n" // * 4
"umlal2 v1.4s, v2.8h, v6.8h \n" // * 4
"umlal v0.4s, v3.4h, v7.4h \n" // * 6
"umlal2 v1.4s, v3.8h, v7.8h \n" // * 6
"umlal v0.4s, v4.4h, v6.4h \n" // * 4
"umlal2 v1.4s, v4.8h, v6.8h \n" // * 4
"st1 {v0.4s,v1.4s}, [%5], #32 \n" // store 8 samples
"b.gt 1b \n"
@@ -2743,41 +2736,164 @@ void GaussCol_NEON(const uint16* src0,
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
}
#if 0
a8: ad7f8d82 ldp q2, q3, [x12,#-16]
ac: 3cdf8186 ldur q6, [x12,#-8]
b0: 3cdf4184 ldur q4, [x12,#-12]
b4: 3cc04185 ldur q5, [x12,#4]
b8: 3cc08187 ldur q7, [x12,#8]
bc: 3cdfc190 ldur q16, [x12,#-4]
c0: 3cc0c191 ldur q17, [x12,#12]
c4: 3dc00592 ldr q18, [x12,#16]
c8: 4ea094c2 mla v2.4s, v6.4s, v0.4s #6
cc: 4ea48604 add v4.4s, v16.4s, v4.4s
d0: 4ea58625 add v5.4s, v17.4s, v5.4s
d4: 4ea38442 add v2.4s, v2.4s, v3.4s
d8: 4ea094e3 mla v3.4s, v7.4s, v0.4s #6
dc: 4f225484 shl v4.4s, v4.4s, #2
e0: 4f2254a5 shl v5.4s, v5.4s, #2
e4: 4eb28463 add v3.4s, v3.4s, v18.4s
e8: 4ea48442 add v2.4s, v2.4s, v4.4s
ec: 4ea58463 add v3.4s, v3.4s, v5.4s
f0: 4ea18442 add v2.4s, v2.4s, v1.4s #128
f4: 4ea18463 add v3.4s, v3.4s, v1.4s #128
f8: 0f188442 shrn v2.4h, v2.4s, #8
fc: 0f188463 shrn v3.4h, v3.4s, #8
100: f10021ad subs x13, x13, #0x8
104: 6d3f8d62 stp d2, d3, [x11,#-8]
108: 9100416b add x11, x11, #0x10
10c: 9100818c add x12, x12, #0x20
110: 54fffcc1 b.ne a8 <GaussRow_C+0xa8>
#endif
// filter 5 rows with 1, 4, 6, 4, 1 coefficients to produce 1 row.
void GaussRow_NEON(const uint16* src0, uint16* dst, int width) {
asm volatile(
"ld1 {v20.8h,v21.8h,v22.8h,v23.8h}, [%3] \n"
"1: \n"
"ld1 {v0.8h}, [%0], %4 \n" // load 8 source samples
"subs %w2, %w2, #4 \n" // 4 processed per loop
"umull v1.4s, v0.4h, v20.4h \n" // first pixel
"umlal2 v1.4s, v0.8h, v20.8h \n"
"addv s1, v1.4s \n"
"umull v2.4s, v0.4h, v21.4h \n" // second pixel
"umlal2 v2.4s, v0.8h, v21.8h \n"
"addv s2, v2.4s \n"
"umull v3.4s, v0.4h, v22.4h \n" // third pixel
"umlal2 v3.4s, v0.8h, v22.8h \n"
"addv s3, v3.4s \n"
"umull v4.4s, v0.4h, v23.4h \n" // fourth pixel
"umlal2 v4.4s, v0.8h, v23.8h \n"
"addv s4, v4.4s \n"
"st4 {v1.s,v2.s,v3.s,v4.s}[0], [%1], #16 \n" // store 4 samples
"b.gt 1b \n"
: "+r"(src0), // %0
"+r"(dst), // %1
"+r"(width) // %2
: "r"(&kGauseCoefficients[0]), // %3
"r"(8LL) // %4
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v20", "v21", "v22",
"v23");
}
void GaussRow_NEON3(const uint32* src, uint16* dst, int width) {
asm volatile(
"movi v0.4s, #6 \n" // constant 6
"add %0, %0, #0x10 \n"
"add %1, %1, #0x8 \n"
"1: \n"
"ldp q2, q3, [%0,#-16] \n"
"ldur q6, [%0,#-8] \n"
"ldur q4, [%0,#-12] \n"
"ldur q5, [%0,#4] \n"
"ldur q7, [%0,#8] \n"
"ldur q16, [%0,#-4] \n"
"ldur q17, [%0,#12] \n"
"ldr q18, [%0,#16] \n"
"mla v2.4s, v6.4s, v0.4s \n"
"add v4.4s, v16.4s, v4.4s \n"
"add v5.4s, v17.4s, v5.4s \n"
"add v2.4s, v2.4s, v3.4s \n"
"mla v3.4s, v7.4s, v0.4s \n"
"shl v4.4s, v4.4s, #2 \n"
"shl v5.4s, v5.4s, #2 \n"
"add v3.4s, v3.4s, v18.4s \n"
"add v2.4s, v2.4s, v4.4s \n"
"add v3.4s, v3.4s, v5.4s \n"
"add v2.4s, v2.4s, v1.4s \n"
"add v3.4s, v3.4s, v1.4s \n"
"shrn v2.4h, v2.4s, #8 \n"
"shrn v3.4h, v3.4s, #8 \n"
"subs %w2, %w2, #0x8 \n"
"stp d2, d3, [%1,#-8] \n"
"add %1, %1, #0x10 \n"
"add %0, %0, #0x20 \n"
"b.gt 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2
:
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
"v16", "v17", "v18");
}
void GaussRow_NEON2(const uint32* src, uint16* dst, int width) {
int i;
for (i = 0; i < width; ++i) {
*dst++ =
(src[0] + src[1] * 4 + src[2] * 6 + src[3] * 4 + src[4] + 128) >> 8;
++src;
}
}
// filter 5 rows with 1, 4, 6, 4, 1 coefficients to produce 1 row.
void GaussRow_NEON(const uint32* src, uint16* dst, int width) {
const uint32* src1 = src + 1;
const uint32* src2 = src + 2;
const uint32* src3 = src + 3;
asm volatile(
"movi v6.4s, #4 \n" // constant 4
"movi v7.4s, #6 \n" // constant 6
"1: \n"
"ld1 {v1.4s,v2.4s,v3.4s}, [%0], %6 \n" // load 12 source samples
"ld1 {v4.4s,v5.4s}, [%1], #32 \n"
"ld1 {v16.4s,v17.4s}, [%2], #32 \n"
"ld1 {v18.4s,v19.4s}, [%3], #32 \n"
"subs %w5, %w5, #8 \n" // 8 processed per loop
"add v0.4s, v1.4s, v2.4s \n" // * 1
"add v1.4s, v2.4s, v3.4s \n" // * 1
"add v2.4s, v4.4s, v18.4s \n" // add rows for * 4
"add v3.4s, v5.4s, v19.4s \n"
"mla v0.4s, v2.4s, v6.4s \n" // * 4
"mla v1.4s, v3.4s, v6.4s \n" // * 4
"mla v0.4s, v16.4s, v7.4s \n" // * 6
"mla v1.4s, v17.4s, v7.4s \n" // * 6
"uqrshrn v0.4h, v0.4s, #8 \n" // round and pack
"uqrshrn2 v0.8h, v1.4s, #8 \n"
"st1 {v0.8h}, [%4], #16 \n" // store 8 samples
"b.gt 1b \n"
: "+r"(src), // %0
"+r"(src1), // %1
"+r"(src2), // %2
"+r"(src3), // %3
"+r"(dst), // %4
"+r"(width) // %5
: "r"(32LL) // %6
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
"v16", "v17", "v18", "v19");
}
// filter 5 rows with 1, 4, 6, 4, 1 coefficients to produce 1 row.
void GaussRow_NEON4(const uint32* src, uint16* dst, int width) {
const uint32* src1 = src + 1;
const uint32* src2 = src + 2;
const uint32* src3 = src + 3;
asm volatile(
"movi v6.4s, #4 \n" // constant 4
"movi v7.4s, #6 \n" // constant 6
"1: \n"
"ld1 {v0.4s,v1.4s}, [%0], %6 \n" // load 8 source samples
"ld1 {v2.4s}, [%1], #16 \n"
"ld1 {v3.4s}, [%2], #16 \n"
"ld1 {v4.4s}, [%3], #16 \n"
"subs %w5, %w5, #4 \n" // 4 processed per loop
"mla v0.4s, v2.4s, v6.4s \n" // * 4
"mla v0.4s, v3.4s, v7.4s \n" // * 6
"mla v0.4s, v4.4s, v6.4s \n" // * 4
"add v0.4s, v0.4s, v1.4s \n" // * 1
"uqrshrn v0.4h, v0.4s, #8 \n" // round and pack
"st1 {v0.4h}, [%4], #8 \n" // store 4 samples
"b.gt 1b \n"
: "+r"(src), // %0
"+r"(src1), // %1
"+r"(src2), // %2
"+r"(src3), // %3
"+r"(dst), // %4
"+r"(width) // %5
: "r"(16LL) // %6
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v6", "v7");
}
#endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
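A quick width check behind the int-to-short change (a standalone sketch, not code from the commit; the main() harness is illustrative only):

#include <assert.h>
#include <stdint.h>

int main(void) {
  // Column pass: five uint16 rows weighted 1+4+6+4+1 = 16 can reach
  // 65535 * 16 = 1,048,560, so the intermediate needs 32 bits.
  uint32_t col_max = 65535u * 16u;
  // Row pass: another factor of 16, then the rounding add and ">> 8"
  // (divide by 256) bring the worst case back to exactly 65535,
  // so a uint16 destination is enough.
  uint32_t row_max = (col_max * 16u + 128u) >> 8;
  assert(col_max > UINT16_MAX);
  assert(row_max == UINT16_MAX);
  return 0;
}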
@@ -2725,23 +2725,22 @@ TEST_F(LibYUVPlanarTest, TestScaleSamples_Opt) {
EXPECT_EQ(0, diff);
}
#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
extern "C" void GaussRow_NEON(const uint16* src0, uint32* dst, int width);
extern "C" void GaussRow_NEON(const uint32* src, uint16* dst, int width);
extern "C" void GaussRow_C(const uint32* src, uint16* dst, int width);
TEST_F(LibYUVPlanarTest, TestGaussRow_Opt) {
SIMD_ALIGNED(uint16 orig_pixels[1280 + 4]);
SIMD_ALIGNED(uint32 dst_pixels_c[1280]);
SIMD_ALIGNED(uint32 dst_pixels_opt[1280]);
SIMD_ALIGNED(uint32 orig_pixels[1280 + 4]);
SIMD_ALIGNED(uint16 dst_pixels_c[1280]);
SIMD_ALIGNED(uint16 dst_pixels_opt[1280]);
memset(orig_pixels, 0, sizeof(orig_pixels));
memset(dst_pixels_c, 1, sizeof(dst_pixels_c));
memset(dst_pixels_opt, 2, sizeof(dst_pixels_opt));
for (int i = 0; i < 1280 + 4; ++i) {
orig_pixels[i] = i;
orig_pixels[i] = i * 256;
}
GaussRow_NEON(&orig_pixels[0], &dst_pixels_c[0], 1280);
GaussRow_C(&orig_pixels[0], &dst_pixels_c[0], 1280);
MaskCpuFlags(benchmark_cpu_info_);
for (int i = 0; i < benchmark_pixels_div1280_; ++i) {
#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
@@ -2749,10 +2748,10 @@ TEST_F(LibYUVPlanarTest, TestGaussRow_Opt) {
if (has_neon) {
GaussRow_NEON(&orig_pixels[0], &dst_pixels_opt[0], 1280);
} else {
GaussRow_NEON(&orig_pixels[0], &dst_pixels_opt[0], 1280);
GaussRow_C(&orig_pixels[0], &dst_pixels_opt[0], 1280);
}
#else
GaussRow_NEON(&orig_pixels[0], &dst_pixels_opt[0], 1280);
GaussRow_C(&orig_pixels[0], &dst_pixels_opt[0], 1280);
#endif
}
@@ -2824,6 +2823,4 @@ TEST_F(LibYUVPlanarTest, TestGaussCol_Opt) {
EXPECT_EQ(dst_pixels_c[1279], 61424);
}
#endif // aarch64
} // namespace libyuv
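A hypothetical spot check of the C reference that the test compares against: with the ramp orig_pixels[i] = i * 256, the first output is (0 + 4*256 + 6*512 + 4*768 + 1024 + 128) >> 8 = 8320 >> 8 = 32. The value 32 is computed here, not an expectation taken from the test:

#include <assert.h>
#include <stdint.h>

int main(void) {
  const uint32_t src[5] = {0, 256, 512, 768, 1024};  // the i * 256 ramp
  uint16_t d = (uint16_t)(
      (src[0] + src[1] * 4 + src[2] * 6 + src[3] * 4 + src[4] + 128) >> 8);
  assert(d == 32);
  return 0;
}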