Pass float parameters via vector 2 float and "w" for scalar multiply.

Scalar multiply expects a 'd' register. The "w" (float) constraint uses an 's' register for a float, which won't work with the scalar multiply in 32 bit (it does in 64 bit). A vector 2 of float passes as a 'd' register. A vector 4 of float passes as a 'q' register. This change copies the float into the first entry of a vector 2 and passes that. The optimizer removes the extra copy, allowing the single float to be referenced as a 'd' register. Test: LibYUVPlanarTest.TestByteToFloat Bug: libyuv:786 Change-Id: I8773c5bae043c7b84e1d1db7fdea6731aa0b1323 Reviewed-on: https://chromium-review.googlesource.com/973984 Commit-Queue: Frank Barchard <fbarchard@chromium.org> Reviewed-by: Miguel Casas <mcasas@chromium.org>

Pass float parameters via vector 2 float and "w" for scalar multiply.
Scalar multiply expects a 'd' register. The "w" (float) constraint uses an 's' register for a float, which won't work with the scalar multiply in 32 bit (it does in 64 bit). A vector 2 of float passes as a 'd' register. A vector 4 of float passes as a 'q' register. This change copies the float into the first entry of a vector 2 and passes that. The optimizer removes the extra copy, allowing the single float to be referenced as a 'd' register. Test: LibYUVPlanarTest.TestByteToFloat Bug: libyuv:786 Change-Id: I8773c5bae043c7b84e1d1db7fdea6731aa0b1323 Reviewed-on: https://chromium-review.googlesource.com/973984 Commit-Queue: Frank Barchard <fbarchard@chromium.org> Reviewed-by: Miguel Casas <mcasas@chromium.org>
4ad33344 · Frank Barchard · Commit Bot · d8680893 · 4ad33344 · 4ad33344
Commit 4ad33344 authored Mar 28, 2018 by Frank Barchard Committed by Commit Bot Mar 28, 2018
Showing with 26 additions and 20 deletions

README.chromium README.chromium +1 -1

getting_started.md docs/getting_started.md +6 -0

row.h include/libyuv/row.h +2 -2

version.h include/libyuv/version.h +1 -1

row_neon.cc source/row_neon.cc +16 -16

No files found.
--- a/README.chromium
+++ b/README.chromium
 Name: libyuv
 URL: http://code.google.com/p/libyuv/
-Version: 1706
+Version: 1707
 License: BSD
 License File: LICENSE


--- a/docs/getting_started.md
+++ b/docs/getting_started.md
@@ -103,6 +103,10 @@ ios simulator
    ninja -v -C out/Debug libyuv_unittest
    ninja -v -C out/Release libyuv_unittest

+ios disassembly
+
+    otool -tV ./out/Release/obj/libyuv_neon/row_neon64.o >row_neon64.txt
+
 ### Android
 https://code.google.com/p/chromium/wiki/AndroidBuildInstructions

@@ -144,6 +148,8 @@ arm disassembly:

    third_party/android_ndk/toolchains/aarch64-linux-android-4.9/prebuilt/linux-x86_64/bin/aarch64-linux-android-objdump -d ./out/Release/obj/libyuv_neon/row_neon64.o >row_neon64.txt

+    Caveat: Disassembly may require optimize_max be disabled in BUILD.gn
+
 Running tests:

    build/android/test_runner.py gtest -s libyuv_unittest -t 7200 --verbose --release --gtest_filter=*

--- a/include/libyuv/row.h
+++ b/include/libyuv/row.h
@@ -55,9 +55,9 @@ extern "C" {
 #endif  // clang >= 3.4
 #endif  // __clang__

-// clang >= 6.0.0 required for AVX512.
+// clang >= 7.0.0 required for AVX512.
 #if defined(__clang__) && (defined(__x86_64__) || defined(__i386__))
-#if (__clang_major__ >= 6)
+#if (__clang_major__ >= 7)
 #define CLANG_HAS_AVX512 1
 #endif  // clang >= 6
 #endif  // __clang__

--- a/include/libyuv/version.h
+++ b/include/libyuv/version.h
@@ -11,6 +11,6 @@
 #ifndef INCLUDE_LIBYUV_VERSION_H_
 #define INCLUDE_LIBYUV_VERSION_H_

-#define LIBYUV_VERSION 1706
+#define LIBYUV_VERSION 1707

 #endif  // INCLUDE_LIBYUV_VERSION_H_
--- a/source/row_neon.cc
+++ b/source/row_neon.cc
@@ -2604,12 +2604,15 @@ void SobelYRow_NEON(const uint8_t* src_y0,
      );
 }

+// %y passes a float as a scalar vector for vector * scalar multiply.
+// The register must be d0 to d15 and indexed with [0] or [1] to access
+// the first or second float of the d-reg.
+
 void HalfFloat1Row_NEON(const uint16_t* src,
                        uint16_t* dst,
                        float /*unused*/,
                        int width) {
  asm volatile(
-      "vdup.32    q0, %3                         \n"

      "1:                                        \n"
      "vld1.8     {q1}, [%0]!                    \n"  // load 8 shorts
@@ -2618,8 +2621,8 @@ void HalfFloat1Row_NEON(const uint16_t* src,
      "vmovl.u16  q3, d3                         \n"
      "vcvt.f32.u32  q2, q2                      \n"  // 8 floats
      "vcvt.f32.u32  q3, q3                      \n"
-      "vmul.f32   q2, q2, q0                     \n"  // adjust exponent
-      "vmul.f32   q3, q3, q0                     \n"
+      "vmul.f32   q2, q2, %y3                    \n"  // adjust exponent
+      "vmul.f32   q3, q3, %y3                    \n"
      "vqshrn.u32 d2, q2, #13                    \n"  // isolate halffloat
      "vqshrn.u32 d3, q3, #13                    \n"
      "vst1.8     {q1}, [%1]!                    \n"
@@ -2627,17 +2630,15 @@ void HalfFloat1Row_NEON(const uint16_t* src,
      : "+r"(src),              // %0
        "+r"(dst),              // %1
        "+r"(width)             // %2
-      : "r"(1.9259299444e-34f)  // %3
-      : "cc", "memory", "q0", "q1", "q2", "q3");
+      : "w"(1.9259299444e-34f)  // %3
+      : "cc", "memory", "q1", "q2", "q3");
 }

-// TODO(fbarchard): multiply by element.
 void HalfFloatRow_NEON(const uint16_t* src,
                       uint16_t* dst,
                       float scale,
                       int width) {
  asm volatile(
-      "vdup.32    q0, %3                         \n"

      "1:                                        \n"
      "vld1.8     {q1}, [%0]!                    \n"  // load 8 shorts
@@ -2646,8 +2647,8 @@ void HalfFloatRow_NEON(const uint16_t* src,
      "vmovl.u16  q3, d3                         \n"
      "vcvt.f32.u32  q2, q2                      \n"  // 8 floats
      "vcvt.f32.u32  q3, q3                      \n"
-      "vmul.f32   q2, q2, q0                     \n"  // adjust exponent
-      "vmul.f32   q3, q3, q0                     \n"
+      "vmul.f32   q2, q2, %y3                    \n"  // adjust exponent
+      "vmul.f32   q3, q3, %y3                    \n"
      "vqshrn.u32 d2, q2, #13                    \n"  // isolate halffloat
      "vqshrn.u32 d3, q3, #13                    \n"
      "vst1.8     {q1}, [%1]!                    \n"
@@ -2655,8 +2656,8 @@ void HalfFloatRow_NEON(const uint16_t* src,
      : "+r"(src),                      // %0
        "+r"(dst),                      // %1
        "+r"(width)                     // %2
-      : "r"(scale * 1.9259299444e-34f)  // %3
-      : "cc", "memory", "q0", "q1", "q2", "q3");
+      : "w"(scale * 1.9259299444e-34f)  // %3
+      : "cc", "memory", "q1", "q2", "q3");
 }

 void ByteToFloatRow_NEON(const uint8_t* src,
@@ -2664,7 +2665,6 @@ void ByteToFloatRow_NEON(const uint8_t* src,
                         float scale,
                         int width) {
  asm volatile(
-      "vdup.32    q0, %3                         \n"

      "1:                                        \n"
      "vld1.8     {d2}, [%0]!                    \n"  // load 8 bytes
@@ -2674,15 +2674,15 @@ void ByteToFloatRow_NEON(const uint8_t* src,
      "vmovl.u16  q3, d3                         \n"
      "vcvt.f32.u32  q2, q2                      \n"  // 8 floats
      "vcvt.f32.u32  q3, q3                      \n"
-      "vmul.f32   q2, q2, d0[0]                  \n"  // scale
-      "vmul.f32   q3, q3, d0[0]                  \n"
+      "vmul.f32   q2, q2, %y3                    \n"  // scale
+      "vmul.f32   q3, q3, %y3                    \n"
      "vst1.8     {q2, q3}, [%1]!                \n"  // store 8 floats
      "bgt        1b                             \n"
      : "+r"(src),   // %0
        "+r"(dst),   // %1
        "+r"(width)  // %2
-      : "r"(scale)   // %3
-      : "cc", "memory", "q0", "q1", "q2", "q3");
+      : "w"(scale)   // %3
+      : "cc", "memory", "q1", "q2", "q3");
 }

 #endif  // !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__)..