Commit eca08525 authored by Frank Barchard

HalfFloat Neon for ARMv7.

The 64-bit version is made similar to the 32-bit version: register 1 holds the loaded input and the stored result, and registers 2 and 3 hold the expanded float temporary values.

TEST=out/Release/libyuv_unittest --gtest_filter=*Half*

BUG=libyuv:560
R=wangcheng@google.com

Review URL: https://codereview.chromium.org/2467723002 .
parent 10ce829b
Name: libyuv
URL: http://code.google.com/p/libyuv/
Version: 1632
Version: 1633
License: BSD
License File: LICENSE
......
......@@ -288,6 +288,7 @@ extern "C" {
#define HAS_BGRATOUVROW_NEON
#define HAS_BGRATOYROW_NEON
#define HAS_COPYROW_NEON
#define HAS_HALFFLOATROW_NEON
#define HAS_I400TOARGBROW_NEON
#define HAS_I422ALPHATOARGBROW_NEON
#define HAS_I422TOARGB1555ROW_NEON
......@@ -329,11 +330,6 @@ extern "C" {
#define HAS_YUY2TOUVROW_NEON
#define HAS_YUY2TOYROW_NEON
// TODO(fbarchard): Port to 32 bit.
#if defined(__aarch64__)
#define HAS_HALFFLOATROW_NEON
#endif
// Effects:
#define HAS_ARGBADDROW_NEON
#define HAS_ARGBATTENUATEROW_NEON
......
......@@ -11,6 +11,6 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_
#define INCLUDE_LIBYUV_VERSION_H_
#define LIBYUV_VERSION 1632
#define LIBYUV_VERSION 1633
#endif // INCLUDE_LIBYUV_VERSION_H_
......@@ -2733,7 +2733,63 @@ void SobelYRow_NEON(const uint8* src_y0, const uint8* src_y1,
: "cc", "memory", "q0", "q1" // Clobber List
);
}
#endif // defined(__ARM_NEON__) && !defined(__aarch64__)
// Converts a row of unsigned 16-bit integers to half-precision float bit
// patterns using ARMv7 NEON, 8 values per loop iteration.
//
//   src    input row of uint16 values (read 16 bytes at a time).
//   dst    output row; receives one 16-bit half-float pattern per input.
//   float  unnamed and unused by this variant.
//   width  number of values. It is decremented by 8 per iteration and the
//          loop repeats while the result is > 0, so whole groups of 8 are
//          always processed (presumably callers pass a multiple of 8 --
//          TODO confirm against the dispatch code).
//
// Method: each value is converted to a 32-bit float and multiplied by
// 1.9259299444e-34f (2^-112), which moves the float exponent down by the
// difference between the float bias (127) and the half bias (15).  Shifting
// the resulting bit pattern right by 13 drops the extra mantissa bits
// (23 - 10) and leaves the half-float pattern in the low 16 bits.
void HalfFloat1Row_NEON(const uint16* src, uint16* dst, float, int width) {
  asm volatile (
    "vdup.32    q0, %3                         \n"  // multiplier in all lanes

  "1:                                          \n"
    MEMACCESS(0)
    "vld1.8     {q1}, [%0]!                    \n"  // load 8 shorts
    "subs       %2, %2, #8                     \n"  // 8 pixels per loop
    // Widen as 16-bit lanes: the input elements are shorts, and the
    // following conversion expects 32-bit unsigned integers.
    "vmovl.u16  q2, d2                         \n"  // low 4 shorts -> 4 ints
    "vmovl.u16  q3, d3                         \n"  // high 4 shorts -> 4 ints
    "vcvt.f32.u32  q2, q2                      \n"  // 8 floats
    "vcvt.f32.u32  q3, q3                      \n"
    "vmul.f32   q2, q2, q0                     \n"  // adjust exponent
    "vmul.f32   q3, q3, q0                     \n"
    "vqshrn.u32 d2, q2, #13                    \n"  // isolate halffloat
    "vqshrn.u32 d3, q3, #13                    \n"
    MEMACCESS(1)
    // Store through the destination pointer (%1), not the source (%0).
    "vst1.8     {q1}, [%1]!                    \n"  // store 8 half floats
    "bgt        1b                             \n"
  : "+r"(src),    // %0
    "+r"(dst),    // %1
    "+r"(width)   // %2
  : "r"(1.9259299444e-34f)    // %3
  : "cc", "memory", "q0", "q1", "q2", "q3"
  );
}
// TODO(fbarchard): multiply by element.
// Converts a row of unsigned 16-bit integers to half-precision float bit
// patterns, applying a caller-supplied scale, using ARMv7 NEON.  Processes
// 8 values per loop iteration.
//
//   src    input row of uint16 values (read 16 bytes at a time).
//   dst    output row; receives one 16-bit half-float pattern per input.
//   scale  factor applied to each value before conversion to half float.
//   width  number of values. It is decremented by 8 per iteration and the
//          loop repeats while the result is > 0, so whole groups of 8 are
//          always processed (presumably callers pass a multiple of 8 --
//          TODO confirm against the dispatch code).
//
// Method: each value is converted to a 32-bit float and multiplied by
// scale * 1.9259299444e-34f.  The constant is 2^-112, which moves the float
// exponent down by the difference between the float bias (127) and the half
// bias (15).  Shifting the resulting bit pattern right by 13 drops the extra
// mantissa bits (23 - 10) and leaves the half-float pattern in the low
// 16 bits.
void HalfFloatRow_NEON(const uint16* src, uint16* dst, float scale, int width) {
  asm volatile (
    "vdup.32    q0, %3                         \n"  // multiplier in all lanes

  "1:                                          \n"
    MEMACCESS(0)
    "vld1.8     {q1}, [%0]!                    \n"  // load 8 shorts
    "subs       %2, %2, #8                     \n"  // 8 pixels per loop
    // Widen as 16-bit lanes: the input elements are shorts, and the
    // following conversion expects 32-bit unsigned integers.
    "vmovl.u16  q2, d2                         \n"  // low 4 shorts -> 4 ints
    "vmovl.u16  q3, d3                         \n"  // high 4 shorts -> 4 ints
    "vcvt.f32.u32  q2, q2                      \n"  // 8 floats
    "vcvt.f32.u32  q3, q3                      \n"
    "vmul.f32   q2, q2, q0                     \n"  // scale and adjust exponent
    "vmul.f32   q3, q3, q0                     \n"
    "vqshrn.u32 d2, q2, #13                    \n"  // isolate halffloat
    "vqshrn.u32 d3, q3, #13                    \n"
    MEMACCESS(1)
    // Store through the destination pointer (%1), not the source (%0).
    "vst1.8     {q1}, [%1]!                    \n"  // store 8 half floats
    "bgt        1b                             \n"
  : "+r"(src),    // %0
    "+r"(dst),    // %1
    "+r"(width)   // %2
  : "r"(scale * 1.9259299444e-34f)    // %3
  : "cc", "memory", "q0", "q1", "q2", "q3"
  );
}
#endif // !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__)..
#ifdef __cplusplus
} // extern "C"
......
......@@ -2718,19 +2718,19 @@ void HalfFloat1Row_NEON(const uint16* src, uint16* dst, float, int width) {
"ld1 {v1.16b}, [%0], #16 \n" // load 8 shorts
"subs %w2, %w2, #8 \n" // 8 pixels per loop
"uxtl v2.4s, v1.4h \n" // 8 int's
"uxtl2 v1.4s, v1.8h \n"
"uxtl2 v3.4s, v1.8h \n"
"scvtf v2.4s, v2.4s \n" // 8 floats
"scvtf v1.4s, v1.4s \n"
"fcvtn v4.4h, v2.4s \n" // 8 floatsgit
"fcvtn2 v4.8h, v1.4s \n"
"scvtf v3.4s, v3.4s \n"
"fcvtn v1.4h, v2.4s \n" // 8 floatsgit
"fcvtn2 v1.8h, v3.4s \n"
MEMACCESS(1)
"st1 {v4.16b}, [%1], #16 \n" // store 8 shorts
"st1 {v1.16b}, [%1], #16 \n" // store 8 shorts
"b.gt 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2
:
: "cc", "memory", "v1", "v2", "v4"
: "cc", "memory", "v1", "v2", "v3"
);
}
......@@ -2741,21 +2741,21 @@ void HalfFloatRow_NEON(const uint16* src, uint16* dst, float scale, int width) {
"ld1 {v1.16b}, [%0], #16 \n" // load 8 shorts
"subs %w2, %w2, #8 \n" // 8 pixels per loop
"uxtl v2.4s, v1.4h \n" // 8 int's
"uxtl2 v1.4s, v1.8h \n"
"uxtl2 v3.4s, v1.8h \n"
"scvtf v2.4s, v2.4s \n" // 8 floats
"scvtf v1.4s, v1.4s \n"
"scvtf v3.4s, v3.4s \n"
"fmul v2.4s, v2.4s, %3.s[0] \n" // adjust exponent
"fmul v1.4s, v1.4s, %3.s[0] \n"
"uqshrn v4.4h, v2.4s, #13 \n" // isolate halffloat
"uqshrn2 v4.8h, v1.4s, #13 \n"
"fmul v3.4s, v3.4s, %3.s[0] \n"
"uqshrn v1.4h, v2.4s, #13 \n" // isolate halffloat
"uqshrn2 v1.8h, v3.4s, #13 \n"
MEMACCESS(1)
"st1 {v4.16b}, [%1], #16 \n" // store 8 shorts
"st1 {v1.16b}, [%1], #16 \n" // store 8 shorts
"b.gt 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2
: "w"(scale * 1.9259299444e-34f) // %3
: "cc", "memory", "v1", "v2", "v4"
: "cc", "memory", "v1", "v2", "v3"
);
}
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment