Commit 8676ad70 authored by Frank Barchard

scale float samples and return max value

BUG=libyuv:717
TEST=ScaleSum unittest to compare C vs Arm implementation
TBR=kjellander@chromium.org

Change-Id: Iaa7af5547d979aad4722f868d31b405340115748
Reviewed-on: https://chromium-review.googlesource.com/600534
Reviewed-by: Cheng Wang <wangcheng@google.com>
parent 27036e33
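
The change adds float-plane helpers to row.h: ScaleSamples multiplies each float sample by a scale factor, and ScaleSumSamples does the same while also returning the largest scaled value seen. A minimal usage sketch of the C reference path (the wrapper function, buffer names, and scale factor below are illustrative, not part of libyuv):

#include "libyuv/row.h"  // declares ScaleSumSamples_C / ScaleSumSamples_NEON

// Scale `width` float samples from src into dst and return the peak value.
// The NEON variant shares this signature and processes 8 samples per loop,
// so it is presumably intended for widths that are a multiple of 8.
static float ScaleAndFindPeak(const float* src, float* dst, int width) {
  const float kScale = 1.0f / 32768.0f;  // illustrative scale factor
  return libyuv::ScaleSumSamples_C(src, dst, kScale, width);
}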
Name: libyuv
URL: http://code.google.com/p/libyuv/
Version: 1663
Version: 1664
License: BSD
License File: LICENSE
@@ -359,6 +359,11 @@ extern "C" {
#define HAS_SOBELYROW_NEON
#endif
// The following are available on AArch64 platforms:
#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
#define HAS_SCALESUMSAMPLES_NEON
#endif
// The following are available on Mips platforms:
#if !defined(LIBYUV_DISABLE_DSPR2) && defined(__mips__) && \
(_MIPS_SIM == _MIPS_SIM_ABI32) && (__mips_isa_rev < 6)
@@ -3152,6 +3157,14 @@ void ARGBLumaColorTableRow_SSSE3(const uint8* src_argb,
const uint8* luma,
uint32 lumacoeff);
float ScaleSumSamples_C(const float* src, float* dst, float scale, int width);
float ScaleSumSamples_NEON(const float* src,
float* dst,
float scale,
int width);
void ScaleSamples_C(const float* src, float* dst, float scale, int width);
void ScaleSamples_NEON(const float* src, float* dst, float scale, int width);
#ifdef __cplusplus
} // extern "C"
} // namespace libyuv
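
The NEON declarations above are only implemented when HAS_SCALESUMSAMPLES_NEON is defined (AArch64 with NEON enabled); the planar unit test further below selects between the two paths with an #ifdef. A hypothetical dispatch helper following the same pattern, not part of this change, might look like:

// Prefer the AArch64 NEON path when it was compiled in, otherwise fall back
// to the portable C implementation (mirrors the #ifdef in planar_test.cc).
static float ScaleSumSamplesDispatch(const float* src, float* dst, float scale,
                                     int width) {
#ifdef HAS_SCALESUMSAMPLES_NEON
  return ScaleSumSamples_NEON(src, dst, scale, width);
#else
  return ScaleSumSamples_C(src, dst, scale, width);
#endif
}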
@@ -11,6 +11,6 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_
#define INCLUDE_LIBYUV_VERSION_H_
#define LIBYUV_VERSION 1663
#define LIBYUV_VERSION 1664
#endif // INCLUDE_LIBYUV_VERSION_H_
@@ -26,67 +26,61 @@ extern "C" {
uint32 HammingDistance_NEON(const uint8* src_a, const uint8* src_b, int count) {
uint32 diff;
asm volatile (
"vmov.u16 q4, #0 \n" // accumulator
asm volatile(
"vmov.u16 q4, #0 \n" // accumulator
"1: \n"
"vld1.8 {q0, q1}, [%0]! \n"
"vld1.8 {q2, q3}, [%1]! \n"
"veor.32 q0, q0, q2 \n"
"veor.32 q1, q1, q3 \n"
"vcnt.i8 q0, q0 \n"
"vcnt.i8 q1, q1 \n"
"subs %2, %2, #32 \n"
"vadd.u8 q0, q0, q1 \n" // 16 byte counts
"vpadal.u8 q4, q0 \n" // 8 shorts
"bgt 1b \n"
"1: \n"
"vld1.8 {q0, q1}, [%0]! \n"
"vld1.8 {q2, q3}, [%1]! \n"
"veor.32 q0, q0, q2 \n"
"veor.32 q1, q1, q3 \n"
"vcnt.i8 q0, q0 \n"
"vcnt.i8 q1, q1 \n"
"subs %2, %2, #32 \n"
"vadd.u8 q0, q0, q1 \n" // 16 byte counts
"vpadal.u8 q4, q0 \n" // 8 shorts
"bgt 1b \n"
"vpaddl.u16 q0, q4 \n" // 4 ints
"vpadd.u32 d0, d0, d1 \n"
"vpadd.u32 d0, d0, d0 \n"
"vmov.32 %3, d0[0] \n"
: "+r"(src_a),
"+r"(src_b),
"+r"(count),
"=r"(diff)
:
: "cc", "q0", "q1", "q2", "q3", "q4");
"vpaddl.u16 q0, q4 \n" // 4 ints
"vpadd.u32 d0, d0, d1 \n"
"vpadd.u32 d0, d0, d0 \n"
"vmov.32 %3, d0[0] \n"
: "+r"(src_a), "+r"(src_b), "+r"(count), "=r"(diff)
:
: "cc", "q0", "q1", "q2", "q3", "q4");
return diff;
}
uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count) {
uint32 sse;
asm volatile (
"vmov.u8 q8, #0 \n"
"vmov.u8 q10, #0 \n"
"vmov.u8 q9, #0 \n"
"vmov.u8 q11, #0 \n"
asm volatile(
"vmov.u8 q8, #0 \n"
"vmov.u8 q10, #0 \n"
"vmov.u8 q9, #0 \n"
"vmov.u8 q11, #0 \n"
"1: \n"
"vld1.8 {q0}, [%0]! \n"
"vld1.8 {q1}, [%1]! \n"
"subs %2, %2, #16 \n"
"vsubl.u8 q2, d0, d2 \n"
"vsubl.u8 q3, d1, d3 \n"
"vmlal.s16 q8, d4, d4 \n"
"vmlal.s16 q9, d6, d6 \n"
"vmlal.s16 q10, d5, d5 \n"
"vmlal.s16 q11, d7, d7 \n"
"bgt 1b \n"
"1: \n"
"vld1.8 {q0}, [%0]! \n"
"vld1.8 {q1}, [%1]! \n"
"subs %2, %2, #16 \n"
"vsubl.u8 q2, d0, d2 \n"
"vsubl.u8 q3, d1, d3 \n"
"vmlal.s16 q8, d4, d4 \n"
"vmlal.s16 q9, d6, d6 \n"
"vmlal.s16 q10, d5, d5 \n"
"vmlal.s16 q11, d7, d7 \n"
"bgt 1b \n"
"vadd.u32 q8, q8, q9 \n"
"vadd.u32 q10, q10, q11 \n"
"vadd.u32 q11, q8, q10 \n"
"vpaddl.u32 q1, q11 \n"
"vadd.u64 d0, d2, d3 \n"
"vmov.32 %3, d0[0] \n"
: "+r"(src_a),
"+r"(src_b),
"+r"(count),
"=r"(sse)
:
: "memory", "cc", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11");
"vadd.u32 q8, q8, q9 \n"
"vadd.u32 q10, q10, q11 \n"
"vadd.u32 q11, q8, q10 \n"
"vpaddl.u32 q1, q11 \n"
"vadd.u64 d0, d2, d3 \n"
"vmov.32 %3, d0[0] \n"
: "+r"(src_a), "+r"(src_b), "+r"(count), "=r"(sse)
:
: "memory", "cc", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11");
return sse;
}
@@ -24,63 +24,57 @@ extern "C" {
// uses short accumulator which restricts count to 131 KB
uint32 HammingDistance_NEON(const uint8* src_a, const uint8* src_b, int count) {
uint32 diff;
asm volatile (
"movi v4.8h, #0 \n"
asm volatile(
"movi v4.8h, #0 \n"
"1: \n"
"ld1 {v0.16b, v1.16b}, [%0], #32 \n"
"ld1 {v2.16b, v3.16b}, [%1], #32 \n"
"eor v0.16b, v0.16b, v2.16b \n"
"eor v1.16b, v1.16b, v3.16b \n"
"cnt v0.16b, v0.16b \n"
"cnt v1.16b, v1.16b \n"
"subs %w2, %w2, #32 \n"
"add v0.16b, v0.16b, v1.16b \n"
"uadalp v4.8h, v0.16b \n"
"b.gt 1b \n"
"1: \n"
"ld1 {v0.16b, v1.16b}, [%0], #32 \n"
"ld1 {v2.16b, v3.16b}, [%1], #32 \n"
"eor v0.16b, v0.16b, v2.16b \n"
"eor v1.16b, v1.16b, v3.16b \n"
"cnt v0.16b, v0.16b \n"
"cnt v1.16b, v1.16b \n"
"subs %w2, %w2, #32 \n"
"add v0.16b, v0.16b, v1.16b \n"
"uadalp v4.8h, v0.16b \n"
"b.gt 1b \n"
"uaddlv s4, v4.8h \n"
"fmov %w3, s4 \n"
: "+r"(src_a),
"+r"(src_b),
"+r"(count),
"=r"(diff)
:
: "cc", "v0", "v1", "v2", "v3", "v4");
"uaddlv s4, v4.8h \n"
"fmov %w3, s4 \n"
: "+r"(src_a), "+r"(src_b), "+r"(count), "=r"(diff)
:
: "cc", "v0", "v1", "v2", "v3", "v4");
return diff;
}
uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count) {
uint32 sse;
asm volatile (
"eor v16.16b, v16.16b, v16.16b \n"
"eor v18.16b, v18.16b, v18.16b \n"
"eor v17.16b, v17.16b, v17.16b \n"
"eor v19.16b, v19.16b, v19.16b \n"
asm volatile(
"eor v16.16b, v16.16b, v16.16b \n"
"eor v18.16b, v18.16b, v18.16b \n"
"eor v17.16b, v17.16b, v17.16b \n"
"eor v19.16b, v19.16b, v19.16b \n"
"1: \n"
"ld1 {v0.16b}, [%0], #16 \n"
"ld1 {v1.16b}, [%1], #16 \n"
"subs %w2, %w2, #16 \n"
"usubl v2.8h, v0.8b, v1.8b \n"
"usubl2 v3.8h, v0.16b, v1.16b \n"
"smlal v16.4s, v2.4h, v2.4h \n"
"smlal v17.4s, v3.4h, v3.4h \n"
"smlal2 v18.4s, v2.8h, v2.8h \n"
"smlal2 v19.4s, v3.8h, v3.8h \n"
"b.gt 1b \n"
"1: \n"
"ld1 {v0.16b}, [%0], #16 \n"
"ld1 {v1.16b}, [%1], #16 \n"
"subs %w2, %w2, #16 \n"
"usubl v2.8h, v0.8b, v1.8b \n"
"usubl2 v3.8h, v0.16b, v1.16b \n"
"smlal v16.4s, v2.4h, v2.4h \n"
"smlal v17.4s, v3.4h, v3.4h \n"
"smlal2 v18.4s, v2.8h, v2.8h \n"
"smlal2 v19.4s, v3.8h, v3.8h \n"
"b.gt 1b \n"
"add v16.4s, v16.4s, v17.4s \n"
"add v18.4s, v18.4s, v19.4s \n"
"add v19.4s, v16.4s, v18.4s \n"
"addv s0, v19.4s \n"
"fmov %w3, s0 \n"
: "+r"(src_a),
"+r"(src_b),
"+r"(count),
"=r"(sse)
:
: "cc", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19");
"add v16.4s, v16.4s, v17.4s \n"
"add v18.4s, v18.4s, v19.4s \n"
"add v19.4s, v16.4s, v18.4s \n"
"addv s0, v19.4s \n"
"fmov %w3, s0 \n"
: "+r"(src_a), "+r"(src_b), "+r"(count), "=r"(sse)
:
: "cc", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19");
return sse;
}
@@ -172,7 +172,7 @@ __declspec(naked) void TransposeUVWx8_SSE2(const uint8* src,
movdqa xmm7, xmm5
lea eax, [eax + 8 * edi + 16]
neg edi
// Second round of bit swap.
// Second round of bit swap.
movdqa xmm5, xmm0
punpcklwd xmm0, xmm2
punpckhwd xmm5, xmm2
@@ -192,8 +192,8 @@ __declspec(naked) void TransposeUVWx8_SSE2(const uint8* src,
punpckhwd xmm6, xmm7
movdqa xmm7, xmm6
// Third round of bit swap.
// Write to the destination pointer.
// Third round of bit swap.
// Write to the destination pointer.
movdqa xmm6, xmm0
punpckldq xmm0, xmm4
punpckhdq xmm6, xmm4
@@ -2639,6 +2639,25 @@ void NV12ToRGB565Row_AVX2(const uint8* src_y,
}
#endif
float ScaleSumSamples_C(const float* src, float* dst, float scale, int width) {
float fmax = 0.f;
int i;
for (i = 0; i < width; ++i) {
float v = *src++ * scale;
*dst++ = v;
fmax = (v > fmax) ? v : fmax;
}
return fmax;
}
void ScaleSamples_C(const float* src, float* dst, float scale, int width) {
int i;
for (i = 0; i < width; ++i) {
float v = *src++ * scale;
*dst++ = v;
}
}
#ifdef __cplusplus
} // extern "C"
} // namespace libyuv
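
A small worked example of the C reference behavior (input values chosen purely for illustration): scaling {1.0f, -3.0f, 2.0f} by 2.0f writes {2.0f, -6.0f, 4.0f} and returns 4.0f. Note that fmax starts at 0.f, so an input whose scaled samples are all negative reports a maximum of 0.f rather than its least-negative sample.

float src[3] = {1.0f, -3.0f, 2.0f};
float dst[3] = {0.f, 0.f, 0.f};
float peak = ScaleSumSamples_C(src, dst, 2.0f, 3);  // dst = {2, -6, 4}, peak == 4.0f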
@@ -2612,6 +2612,53 @@ void HalfFloatRow_NEON(const uint16* src, uint16* dst, float scale, int width) {
: "cc", "memory", "v1", "v2", "v3");
}
float ScaleSumSamples_NEON(const float* src,
float* dst,
float scale,
int width) {
float fmax;
asm volatile(
"movi v3.4s, #0 \n" // max
"movi v4.4s, #0 \n" // max
"1: \n"
"ld1 {v1.4s, v2.4s}, [%0], #32 \n" // load 8 samples
"subs %w2, %w2, #8 \n" // 8 processed per loop
"fmul v1.4s, v1.4s, %4.s[0] \n" // scale
"fmul v2.4s, v2.4s, %4.s[0] \n" // scale
"st1 {v1.4s, v2.4s}, [%1], #32 \n" // store 8 samples
"fmax v3.4s, v3.4s, v1.4s \n" // max
"fmax v4.4s, v4.4s, v2.4s \n"
"b.gt 1b \n"
"fmax v3.4s, v3.4s, v4.4s \n" // max
"fmaxv %s3, v3.4s \n" // signed max acculator
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width), // %2
"=w"(fmax) // %3
: "w"(scale) // %4
: "cc", "memory", "v1", "v2", "v3", "v4");
return fmax;
}
void ScaleSamples_NEON(const float* src, float* dst, float scale, int width) {
asm volatile(
"1: \n"
"ld1 {v1.4s, v2.4s}, [%0], #32 \n" // load 8 samples
"subs %w2, %w2, #8 \n" // 8 processed per loop
"fmul v1.4s, v1.4s, %3.s[0] \n" // scale
"fmul v2.4s, v2.4s, %3.s[0] \n" // scale
"st1 {v1.4s, v2.4s}, [%1], #32 \n" // store 8 samples
"b.gt 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2
: "w"(scale) // %3
: "cc", "memory", "v1", "v2");
}
#endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
#ifdef __cplusplus
@@ -816,7 +816,7 @@ __declspec(naked) void ScaleAddRow_SSE2(const uint8* src_ptr,
mov ecx, [esp + 12] // src_width
pxor xmm5, xmm5
// sum rows
// sum rows
xloop:
movdqu xmm3, [eax] // read 16 bytes
lea eax, [eax + 16]
@@ -847,7 +847,7 @@ __declspec(naked) void ScaleAddRow_AVX2(const uint8* src_ptr,
mov ecx, [esp + 12] // src_width
vpxor ymm5, ymm5, ymm5
// sum rows
// sum rows
xloop:
vmovdqu ymm3, [eax] // read 32 bytes
lea eax, [eax + 32]
@@ -939,7 +939,7 @@ __declspec(naked) void ScaleFilterCols_SSSE3(uint8* dst_ptr,
add ecx, 2 - 1
jl xloop99
// 1 pixel remainder
// 1 pixel remainder
movzx ebx, word ptr [esi + eax] // 2 source x0 pixels
movd xmm0, ebx
psrlw xmm2, 9 // 7 bit fractions.
@@ -1194,7 +1194,7 @@ __declspec(naked) void ScaleARGBCols_SSE2(uint8* dst_argb,
sub ecx, 4
jl xloop49
// 4 Pixel loop.
// 4 Pixel loop.
xloop4:
movd xmm0, [esi + eax * 4] // 1 source x0 pixels
movd xmm1, [esi + edx * 4] // 1 source x1 pixels
@@ -1218,7 +1218,7 @@ __declspec(naked) void ScaleARGBCols_SSE2(uint8* dst_argb,
test ecx, 2
je xloop29
// 2 Pixels.
// 2 Pixels.
movd xmm0, [esi + eax * 4] // 1 source x0 pixels
movd xmm1, [esi + edx * 4] // 1 source x1 pixels
pextrw eax, xmm2, 5 // get x2 integer.
@@ -1231,7 +1231,7 @@ __declspec(naked) void ScaleARGBCols_SSE2(uint8* dst_argb,
test ecx, 1
je xloop99
// 1 Pixels.
// 1 Pixels.
movd xmm0, [esi + eax * 4] // 1 source x2 pixels
movd dword ptr [edi], xmm0
xloop99:
@@ -1309,7 +1309,7 @@ __declspec(naked) void ScaleARGBFilterCols_SSSE3(uint8* dst_argb,
add ecx, 2 - 1
jl xloop99
// 1 pixel remainder
// 1 pixel remainder
psrlw xmm2, 9 // 7 bit fractions.
movq xmm0, qword ptr [esi + eax * 4] // 2 source x0 pixels
pshufb xmm2, xmm5 // 00000000
@@ -11,6 +11,9 @@
#include <stdlib.h>
#include <time.h>
// row.h defines SIMD_ALIGNED, overriding unit_test.h
#include "libyuv/row.h" /* For ScaleSumSamples_Neon */
#include "../unit_test/unit_test.h"
#include "libyuv/compare.h"
#include "libyuv/convert.h"
@@ -2518,4 +2521,146 @@ TEST_F(LibYUVPlanarTest, SplitUVPlane_Opt) {
free_aligned_buffer_page_end(dst_pixels_c);
}
float TestScaleSumSamples(int benchmark_width,
int benchmark_height,
int benchmark_iterations,
float scale,
bool opt) {
int i, j;
float max_c, max_opt;
const int y_plane_size = benchmark_width * benchmark_height * 4;
align_buffer_page_end(orig_y, y_plane_size * 3);
uint8* dst_opt = orig_y + y_plane_size;
uint8* dst_c = orig_y + y_plane_size * 2;
// Randomize works but may contain some denormals affecting performance.
// MemRandomize(orig_y, y_plane_size);
for (i = 0; i < y_plane_size / 4; ++i) {
(reinterpret_cast<float*>(orig_y))[i] = (i - y_plane_size / 8) * 3.1415f;
}
memset(dst_c, 0, y_plane_size);
memset(dst_opt, 1, y_plane_size);
// Disable all optimizations.
max_c = ScaleSumSamples_C(reinterpret_cast<float*>(orig_y),
reinterpret_cast<float*>(dst_c), scale,
benchmark_width * benchmark_height);
// Enable optimizations.
for (j = 0; j < benchmark_iterations; j++) {
#ifdef HAS_SCALESUMSAMPLES_NEON
if (opt) {
max_opt = ScaleSumSamples_NEON(reinterpret_cast<float*>(orig_y),
reinterpret_cast<float*>(dst_opt), scale,
benchmark_width * benchmark_height);
} else {
max_opt = ScaleSumSamples_C(reinterpret_cast<float*>(orig_y),
reinterpret_cast<float*>(dst_opt), scale,
benchmark_width * benchmark_height);
}
#else
max_opt = ScaleSumSamples_C(reinterpret_cast<float*>(orig_y),
reinterpret_cast<float*>(dst_opt), scale,
benchmark_width * benchmark_height);
#endif
}
float max_diff = 0;
for (i = 0; i < y_plane_size / 4; ++i) {
float abs_diff = FAbs((reinterpret_cast<float*>(dst_c)[i]) -
(reinterpret_cast<float*>(dst_opt)[i]));
if (abs_diff > max_diff) {
max_diff = abs_diff;
}
}
free_aligned_buffer_page_end(orig_y);
return max_diff;
}
TEST_F(LibYUVPlanarTest, TestScaleSumSamples_C) {
float diff = TestScaleSumSamples(benchmark_width_, benchmark_height_,
benchmark_iterations_, 1.2f, false);
EXPECT_EQ(0, diff);
}
TEST_F(LibYUVPlanarTest, TestScaleSumSamples_Opt) {
float diff = TestScaleSumSamples(benchmark_width_, benchmark_height_,
benchmark_iterations_, 1.2f, true);
EXPECT_EQ(0, diff);
}
float TestScaleSamples(int benchmark_width,
int benchmark_height,
int benchmark_iterations,
float scale,
bool opt) {
int i, j;
const int y_plane_size = benchmark_width * benchmark_height * 4;
align_buffer_page_end(orig_y, y_plane_size * 3);
uint8* dst_opt = orig_y + y_plane_size;
uint8* dst_c = orig_y + y_plane_size * 2;
// Randomize works but may contain some denormals affecting performance.
// MemRandomize(orig_y, y_plane_size);
for (i = 0; i < y_plane_size / 4; ++i) {
(reinterpret_cast<float*>(orig_y))[i] = (i - y_plane_size / 8) * 3.1415f;
}
memset(dst_c, 0, y_plane_size);
memset(dst_opt, 1, y_plane_size);
// Disable all optimizations.
ScaleSamples_C(reinterpret_cast<float*>(orig_y),
reinterpret_cast<float*>(dst_c), scale,
benchmark_width * benchmark_height);
// Enable optimizations.
for (j = 0; j < benchmark_iterations; j++) {
#ifdef HAS_SCALESAMPLES_NEON
if (opt) {
ScaleSamples_NEON(reinterpret_cast<float*>(orig_y),
reinterpret_cast<float*>(dst_opt), scale,
benchmark_width * benchmark_height);
} else {
ScaleSamples_C(reinterpret_cast<float*>(orig_y),
reinterpret_cast<float*>(dst_opt), scale,
benchmark_width * benchmark_height);
}
#else
ScaleSamples_C(reinterpret_cast<float*>(orig_y),
reinterpret_cast<float*>(dst_opt), scale,
benchmark_width * benchmark_height);
#endif
}
float max_diff = 0;
for (i = 0; i < y_plane_size / 4; ++i) {
float abs_diff = FAbs((reinterpret_cast<float*>(dst_c)[i]) -
(reinterpret_cast<float*>(dst_opt)[i]));
if (abs_diff > max_diff) {
max_diff = abs_diff;
}
}
free_aligned_buffer_page_end(orig_y);
return max_diff;
}
TEST_F(LibYUVPlanarTest, TestScaleSamples_C) {
float diff = TestScaleSamples(benchmark_width_, benchmark_height_,
benchmark_iterations_, 1.2f, false);
EXPECT_EQ(0, diff);
}
TEST_F(LibYUVPlanarTest, TestScaleSamples_Opt) {
float diff = TestScaleSamples(benchmark_width_, benchmark_height_,
benchmark_iterations_, 1.2f, true);
EXPECT_EQ(0, diff);
}
} // namespace libyuv
@@ -36,6 +36,9 @@ static __inline int Abs(int v) {
return v >= 0 ? v : -v;
}
static __inline float FAbs(float v) {
return v >= 0 ? v : -v;
}
#define OFFBY 0
// Scaling uses 16.16 fixed point to step thru the source image, so a
@@ -70,8 +73,11 @@ static inline bool SizeValid(int src_width,
uint8* var; \
uint8* var##_mem; \
var##_mem = reinterpret_cast<uint8*>(malloc(((size) + 4095 + 63) & ~4095)); \
var = (uint8*)((intptr_t)(var##_mem + (((size) + 4095 + 63) & /* NOLINT */ \
~4095) - (size)) & ~63);
var = (uint8*)((intptr_t)(var##_mem + \
(((size) + 4095 + 63) & /* NOLINT */ \
~4095) - \
(size)) & \
~63);
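
For reference on the reflowed arithmetic: ((size) + 4095 + 63) & ~4095 rounds size, plus up to 63 bytes of alignment slack, up to a 4 KiB multiple, and var is var_mem plus that rounded amount minus size, aligned down to a 64-byte boundary. The buffer therefore ends at most 63 bytes before the end of the malloc'd block, so overruns run almost immediately off the allocation where heap-checking tools can flag them. A hypothetical walk-through for size = 100 (the value is illustrative):

// (100 + 4095 + 63) & ~4095  -> 4096 bytes are malloc'd into var_mem.
// var_mem + 4096 - 100       -> candidate pointer 3996 bytes into the block.
// ... & ~63                  -> rounded down to a 64-byte boundary, so
//                               var + 100 ends at most 63 bytes before
//                               var_mem + 4096.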
#define free_aligned_buffer_page_end(var) \
free(var##_mem); \