Commit 4ad33344 authored by Frank Barchard, committed by Commit Bot

Pass float parameters via a vector of 2 floats and "w" for scalar multiply.

Scalar multiply expects a 'd' register.  With "w", a plain float uses an 's'
register and won't work with the multiply in 32 bit (it does in 64 bit).
A vector of 2 floats passes as a 'd' register.
A vector of 4 floats passes as a 'q' register.
This change copies the float into the first entry of a vector of 2
and passes that.  The optimizer removes the extra copy, allowing
the single float to be referenced as a scalar lane of the 'd' register.

Test: LibYUVPlanarTest.TestByteToFloat
Bug: libyuv:786
Change-Id: I8773c5bae043c7b84e1d1db7fdea6731aa0b1323
Reviewed-on: https://chromium-review.googlesource.com/973984
Commit-Queue: Frank Barchard <fbarchard@chromium.org>
Reviewed-by: Miguel Casas <mcasas@chromium.org>
parent d8680893
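To make the register mechanics concrete, here is a minimal standalone sketch of the pattern this change adopts (illustration only, not part of the commit; the function name ScaleRow_NEON_Sketch is made up). With the "w" constraint a float lands in an 's' register, and the "%y" operand modifier prints that register as its enclosing 'd' register with a lane index (e.g. s0 prints as d0[0]), which is the operand form vmul.f32-by-scalar accepts on 32-bit ARM. A float32x2_t passed with "w" would bind to a 'd' register directly, which is the "vector 2 float" alternative the commit title refers to.

```c
// Sketch only: scale 4 floats by a scalar on 32-bit ARM NEON.
// "w"(scale) places the float in an s-register; "%y2" prints that
// register as d<n>[lane] (e.g. d0[0]) so vmul.f32 can use it as a
// scalar operand, with no explicit vdup needed.
#if !defined(__aarch64__) && defined(__ARM_NEON__)
void ScaleRow_NEON_Sketch(const float* src, float* dst, float scale) {
  asm volatile(
      "vld1.32    {q1}, [%0]                     \n"  // load 4 floats
      "vmul.f32   q1, q1, %y2                    \n"  // multiply by scalar lane
      "vst1.32    {q1}, [%1]                     \n"  // store 4 floats
      :
      : "r"(src),   // %0
        "r"(dst),   // %1
        "w"(scale)  // %2
      : "memory", "q1");
}
#endif
```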
Name: libyuv
URL: http://code.google.com/p/libyuv/
-Version: 1706
+Version: 1707
License: BSD
License File: LICENSE
...
@@ -103,6 +103,10 @@ ios simulator
ninja -v -C out/Debug libyuv_unittest
ninja -v -C out/Release libyuv_unittest
+ios disassembly
+otool -tV ./out/Release/obj/libyuv_neon/row_neon64.o >row_neon64.txt
### Android
https://code.google.com/p/chromium/wiki/AndroidBuildInstructions
...
@@ -144,6 +148,8 @@ arm disassembly:
third_party/android_ndk/toolchains/aarch64-linux-android-4.9/prebuilt/linux-x86_64/bin/aarch64-linux-android-objdump -d ./out/Release/obj/libyuv_neon/row_neon64.o >row_neon64.txt
+Caveat: Disassembly may require optimize_max be disabled in BUILD.gn
Running tests:
build/android/test_runner.py gtest -s libyuv_unittest -t 7200 --verbose --release --gtest_filter=*
...
@@ -55,9 +55,9 @@ extern "C" {
#endif // clang >= 3.4
#endif // __clang__
-// clang >= 6.0.0 required for AVX512.
+// clang >= 7.0.0 required for AVX512.
#if defined(__clang__) && (defined(__x86_64__) || defined(__i386__))
-#if (__clang_major__ >= 6)
+#if (__clang_major__ >= 7)
#define CLANG_HAS_AVX512 1
#endif // clang >= 6
#endif // __clang__
...
@@ -11,6 +11,6 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_
#define INCLUDE_LIBYUV_VERSION_H_
-#define LIBYUV_VERSION 1706
+#define LIBYUV_VERSION 1707
#endif // INCLUDE_LIBYUV_VERSION_H_
...
@@ -2604,12 +2604,15 @@ void SobelYRow_NEON(const uint8_t* src_y0,
);
}
+// %y passes a float as a scalar vector for vector * scalar multiply.
+// the register must be d0 to d15 and indexed with [0] or [1] to access
+// the first or second float of the d-reg.
void HalfFloat1Row_NEON(const uint16_t* src,
uint16_t* dst,
float /*unused*/,
int width) {
asm volatile(
-"vdup.32 q0, %3 \n"
"1: \n"
"vld1.8 {q1}, [%0]! \n" // load 8 shorts
...
@@ -2618,8 +2621,8 @@ void HalfFloat1Row_NEON(const uint16_t* src,
"vmovl.u16 q3, d3 \n"
"vcvt.f32.u32 q2, q2 \n" // 8 floats
"vcvt.f32.u32 q3, q3 \n"
-"vmul.f32 q2, q2, q0 \n" // adjust exponent
-"vmul.f32 q3, q3, q0 \n"
+"vmul.f32 q2, q2, %y3 \n" // adjust exponent
+"vmul.f32 q3, q3, %y3 \n"
"vqshrn.u32 d2, q2, #13 \n" // isolate halffloat
"vqshrn.u32 d3, q3, #13 \n"
"vst1.8 {q1}, [%1]! \n"
...
@@ -2627,17 +2630,15 @@ void HalfFloat1Row_NEON(const uint16_t* src,
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2
-: "r"(1.9259299444e-34f) // %3
-: "cc", "memory", "q0", "q1", "q2", "q3");
+: "w"(1.9259299444e-34f) // %3
+: "cc", "memory", "q1", "q2", "q3");
}
-// TODO(fbarchard): multiply by element.
void HalfFloatRow_NEON(const uint16_t* src,
uint16_t* dst,
float scale,
int width) {
asm volatile(
-"vdup.32 q0, %3 \n"
"1: \n"
"vld1.8 {q1}, [%0]! \n" // load 8 shorts
...
@@ -2646,8 +2647,8 @@ void HalfFloatRow_NEON(const uint16_t* src,
"vmovl.u16 q3, d3 \n"
"vcvt.f32.u32 q2, q2 \n" // 8 floats
"vcvt.f32.u32 q3, q3 \n"
-"vmul.f32 q2, q2, q0 \n" // adjust exponent
-"vmul.f32 q3, q3, q0 \n"
+"vmul.f32 q2, q2, %y3 \n" // adjust exponent
+"vmul.f32 q3, q3, %y3 \n"
"vqshrn.u32 d2, q2, #13 \n" // isolate halffloat
"vqshrn.u32 d3, q3, #13 \n"
"vst1.8 {q1}, [%1]! \n"
...
@@ -2655,8 +2656,8 @@ void HalfFloatRow_NEON(const uint16_t* src,
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2
-: "r"(scale * 1.9259299444e-34f) // %3
-: "cc", "memory", "q0", "q1", "q2", "q3");
+: "w"(scale * 1.9259299444e-34f) // %3
+: "cc", "memory", "q1", "q2", "q3");
}
void ByteToFloatRow_NEON(const uint8_t* src,
...
@@ -2664,7 +2665,6 @@ void ByteToFloatRow_NEON(const uint8_t* src,
float scale,
int width) {
asm volatile(
-"vdup.32 q0, %3 \n"
"1: \n"
"vld1.8 {d2}, [%0]! \n" // load 8 bytes
...
@@ -2674,15 +2674,15 @@ void ByteToFloatRow_NEON(const uint8_t* src,
"vmovl.u16 q3, d3 \n"
"vcvt.f32.u32 q2, q2 \n" // 8 floats
"vcvt.f32.u32 q3, q3 \n"
-"vmul.f32 q2, q2, d0[0] \n" // scale
-"vmul.f32 q3, q3, d0[0] \n"
+"vmul.f32 q2, q2, %y3 \n" // scale
+"vmul.f32 q3, q3, %y3 \n"
"vst1.8 {q2, q3}, [%1]! \n" // store 8 floats
"bgt 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2
-: "r"(scale) // %3
-: "cc", "memory", "q0", "q1", "q2", "q3");
+: "w"(scale) // %3
+: "cc", "memory", "q1", "q2", "q3");
}
#endif // !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__)..
...
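As a side note on the constant in the rows above: 1.9259299444e-34f is 2^-112, so the multiply rebiases a single-precision exponent (bias 127) down to the half-precision bias (15), and the saturating right shift by 13 (vqshrn.u32 #13) then leaves the IEEE half-float bit pattern in the low 16 bits. Below is an illustrative scalar model of what HalfFloatRow_NEON computes per element; the helper name is made up and is not libyuv API.

```c
#include <stdint.h>
#include <string.h>

// Sketch only: per-element model of the NEON half-float rows above.
// value * scale * 2^-112 shifts the float exponent from bias 127 to
// bias 15; bits >> 13 then matches vqshrn.u32 #13, which truncates
// and saturates to 16 bits.
static uint16_t HalfFloatFromUint16_Sketch(uint16_t v, float scale) {
  float f = (float)v * (scale * 1.9259299444e-34f);  // 1.9259299444e-34f == 2^-112
  uint32_t bits;
  memcpy(&bits, &f, sizeof(bits));  // reinterpret the float's bit pattern
  uint32_t shifted = bits >> 13;    // drop low mantissa bits, keep rebiased exponent
  return (uint16_t)(shifted > 0xffff ? 0xffff : shifted);  // saturate like vqshrn
}
```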