Commit 4ad33344 authored by Frank Barchard, committed by Commit Bot

Pass float parameters via vector 2 float and "w" for scalar multiply.

Scalar multiply expects a 'd' register.  The "w" (float) constraint uses an
's' register for a float and won't work with the multiply in 32 bit (it does
in 64 bit).
A vector 2 of float passes as 'd' register.
A vector 4 of float passes as 'q' register.
This change copies the float into the first entry of a vector 2
and passes that.  The optimizer removes the extra copy, allowing
the single float to be referenced as an indexed element of a 'd' register.

Test: LibYUVPlanarTest.TestByteToFloat
Bug: libyuv:786
Change-Id: I8773c5bae043c7b84e1d1db7fdea6731aa0b1323
Reviewed-on: https://chromium-review.googlesource.com/973984
Commit-Queue: Frank Barchard <fbarchard@chromium.org>
Reviewed-by: Miguel Casas <mcasas@chromium.org>
parent d8680893
Name: libyuv
URL: http://code.google.com/p/libyuv/
Version: 1706
Version: 1707
License: BSD
License File: LICENSE
......
......@@ -103,6 +103,10 @@ ios simulator
ninja -v -C out/Debug libyuv_unittest
ninja -v -C out/Release libyuv_unittest
ios disassembly
otool -tV ./out/Release/obj/libyuv_neon/row_neon64.o >row_neon64.txt
### Android
https://code.google.com/p/chromium/wiki/AndroidBuildInstructions
......@@ -144,6 +148,8 @@ arm disassembly:
third_party/android_ndk/toolchains/aarch64-linux-android-4.9/prebuilt/linux-x86_64/bin/aarch64-linux-android-objdump -d ./out/Release/obj/libyuv_neon/row_neon64.o >row_neon64.txt
Caveat: Disassembly may require optimize_max be disabled in BUILD.gn
Running tests:
build/android/test_runner.py gtest -s libyuv_unittest -t 7200 --verbose --release --gtest_filter=*
......
......@@ -55,9 +55,9 @@ extern "C" {
#endif // clang >= 3.4
#endif // __clang__
// clang >= 6.0.0 required for AVX512.
// clang >= 7.0.0 required for AVX512.
#if defined(__clang__) && (defined(__x86_64__) || defined(__i386__))
#if (__clang_major__ >= 6)
#if (__clang_major__ >= 7)
#define CLANG_HAS_AVX512 1
#endif // clang >= 7
#endif // __clang__
......
......@@ -11,6 +11,6 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_
#define INCLUDE_LIBYUV_VERSION_H_
#define LIBYUV_VERSION 1706
#define LIBYUV_VERSION 1707
#endif // INCLUDE_LIBYUV_VERSION_H_
......@@ -2604,12 +2604,15 @@ void SobelYRow_NEON(const uint8_t* src_y0,
);
}
// %y passes a float as a scalar vector for vector * scalar multiply.
// The register must be d0 to d15 and indexed with [0] or [1] to access
// the first or second float of the d-reg.
void HalfFloat1Row_NEON(const uint16_t* src,
uint16_t* dst,
float /*unused*/,
int width) {
asm volatile(
"vdup.32 q0, %3 \n"
"1: \n"
"vld1.8 {q1}, [%0]! \n" // load 8 shorts
......@@ -2618,8 +2621,8 @@ void HalfFloat1Row_NEON(const uint16_t* src,
"vmovl.u16 q3, d3 \n"
"vcvt.f32.u32 q2, q2 \n" // 8 floats
"vcvt.f32.u32 q3, q3 \n"
"vmul.f32 q2, q2, q0 \n" // adjust exponent
"vmul.f32 q3, q3, q0 \n"
"vmul.f32 q2, q2, %y3 \n" // adjust exponent
"vmul.f32 q3, q3, %y3 \n"
"vqshrn.u32 d2, q2, #13 \n" // isolate halffloat
"vqshrn.u32 d3, q3, #13 \n"
"vst1.8 {q1}, [%1]! \n"
......@@ -2627,17 +2630,15 @@ void HalfFloat1Row_NEON(const uint16_t* src,
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2
: "r"(1.9259299444e-34f) // %3
: "cc", "memory", "q0", "q1", "q2", "q3");
: "w"(1.9259299444e-34f) // %3
: "cc", "memory", "q1", "q2", "q3");
}
// TODO(fbarchard): multiply by element.
void HalfFloatRow_NEON(const uint16_t* src,
uint16_t* dst,
float scale,
int width) {
asm volatile(
"vdup.32 q0, %3 \n"
"1: \n"
"vld1.8 {q1}, [%0]! \n" // load 8 shorts
......@@ -2646,8 +2647,8 @@ void HalfFloatRow_NEON(const uint16_t* src,
"vmovl.u16 q3, d3 \n"
"vcvt.f32.u32 q2, q2 \n" // 8 floats
"vcvt.f32.u32 q3, q3 \n"
"vmul.f32 q2, q2, q0 \n" // adjust exponent
"vmul.f32 q3, q3, q0 \n"
"vmul.f32 q2, q2, %y3 \n" // adjust exponent
"vmul.f32 q3, q3, %y3 \n"
"vqshrn.u32 d2, q2, #13 \n" // isolate halffloat
"vqshrn.u32 d3, q3, #13 \n"
"vst1.8 {q1}, [%1]! \n"
......@@ -2655,8 +2656,8 @@ void HalfFloatRow_NEON(const uint16_t* src,
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2
: "r"(scale * 1.9259299444e-34f) // %3
: "cc", "memory", "q0", "q1", "q2", "q3");
: "w"(scale * 1.9259299444e-34f) // %3
: "cc", "memory", "q1", "q2", "q3");
}
void ByteToFloatRow_NEON(const uint8_t* src,
......@@ -2664,7 +2665,6 @@ void ByteToFloatRow_NEON(const uint8_t* src,
float scale,
int width) {
asm volatile(
"vdup.32 q0, %3 \n"
"1: \n"
"vld1.8 {d2}, [%0]! \n" // load 8 bytes
......@@ -2674,15 +2674,15 @@ void ByteToFloatRow_NEON(const uint8_t* src,
"vmovl.u16 q3, d3 \n"
"vcvt.f32.u32 q2, q2 \n" // 8 floats
"vcvt.f32.u32 q3, q3 \n"
"vmul.f32 q2, q2, d0[0] \n" // scale
"vmul.f32 q3, q3, d0[0] \n"
"vmul.f32 q2, q2, %y3 \n" // scale
"vmul.f32 q3, q3, %y3 \n"
"vst1.8 {q2, q3}, [%1]! \n" // store 8 floats
"bgt 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2
: "r"(scale) // %3
: "cc", "memory", "q0", "q1", "q2", "q3");
: "w"(scale) // %3
: "cc", "memory", "q1", "q2", "q3");
}
#endif // !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__)..
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment