Commit 6e6f81b8 authored by Frank Barchard, committed by Commit Bot

Floating point Gaussian kernels

On SkylakeX for 720p
TestGaussPlane_F32 (657 ms)

On Pixel3
TestGaussPlane_F32 (1787 ms)

Bug: libyuv:852, b/145611468
Change-Id: I9859af1b9381621067992305727da285f82bdded
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/1949667
Commit-Queue: Frank Barchard <fbarchard@chromium.org>
Reviewed-by: Marat Dukhan <maratek@google.com>
parent d82f4baf
Name: libyuv
URL: http://code.google.com/p/libyuv/
Version: 1742
License: BSD
License File: LICENSE
...
@@ -743,6 +743,19 @@ int ARGBBlur(const uint8_t* src_argb,
int height,
int radius);
// Gaussian 5x5 blur of a float plane.
// Coefficients of 1, 4, 6, 4, 1.
// Each destination pixel is a blur of the 5x5 pixels from the source.
// Source edges are clamped.
LIBYUV_API
int GaussPlane_F32(const float* src,
int src_stride,
float* dst,
int dst_stride,
int width,
int height);
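For orientation, a minimal usage sketch of the new entry point (illustrative only; the helper name and buffers are hypothetical, and the strides are passed in float elements rather than bytes, matching the pointer arithmetic in the implementation below):

// Hypothetical usage sketch, not part of the commit.
#include "libyuv/planar_functions.h"  // assumed header for GaussPlane_F32

void BlurDepth(const float* depth, float* blurred, int width, int height) {
  // Strides are given in float elements here, not bytes.
  GaussPlane_F32(depth, width, blurred, width, width, height);
}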
// Multiply ARGB image by ARGB value.
LIBYUV_API
int ARGBShade(const uint8_t* src_argb,
...
@@ -419,6 +419,9 @@ extern "C" {
// The following are available on AArch64 platforms:
#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
#define HAS_SCALESUMSAMPLES_NEON
#define HAS_GAUSSROW_F32_NEON
#define HAS_GAUSSCOL_F32_NEON
#endif
#if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa)
#define HAS_ABGRTOUVROW_MSA
@@ -601,6 +604,7 @@ extern "C" {
#endif
typedef __declspec(align(16)) int16_t vec16[8];
typedef __declspec(align(16)) int32_t vec32[4];
typedef __declspec(align(16)) float vecf32[4];
typedef __declspec(align(16)) int8_t vec8[16];
typedef __declspec(align(16)) uint16_t uvec16[8];
typedef __declspec(align(16)) uint32_t uvec32[4];
@@ -620,6 +624,7 @@ typedef __declspec(align(32)) uint8_t ulvec8[32];
#endif
typedef int16_t __attribute__((vector_size(16))) vec16;
typedef int32_t __attribute__((vector_size(16))) vec32;
typedef float __attribute__((vector_size(16))) vecf32;
typedef int8_t __attribute__((vector_size(16))) vec8;
typedef uint16_t __attribute__((vector_size(16))) uvec16;
typedef uint32_t __attribute__((vector_size(16))) uvec32;
@@ -634,6 +639,7 @@ typedef uint8_t __attribute__((vector_size(32))) ulvec8;
#define SIMD_ALIGNED(var) var
typedef int16_t vec16[8];
typedef int32_t vec32[4];
typedef float vecf32[4];
typedef int8_t vec8[16];
typedef uint16_t uvec16[8];
typedef uint32_t uvec32[4];
@@ -4256,6 +4262,25 @@ void UYVYToARGBRow_Any_MMI(const uint8_t* src_ptr,
const struct YuvConstants* yuvconstants,
int width);
void GaussRow_F32_NEON(const float* src, float* dst, int width);
void GaussRow_F32_C(const float* src, float* dst, int width);
void GaussCol_F32_NEON(const float* src0,
const float* src1,
const float* src2,
const float* src3,
const float* src4,
float* dst,
int width);
void GaussCol_F32_C(const float* src0,
const float* src1,
const float* src2,
const float* src3,
const float* src4,
float* dst,
int width);
#ifdef __cplusplus
} // extern "C"
} // namespace libyuv
...
@@ -11,6 +11,6 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_
#define INCLUDE_LIBYUV_VERSION_H_
#define LIBYUV_VERSION 1742
#endif // INCLUDE_LIBYUV_VERSION_H_
@@ -3043,6 +3043,84 @@ int ARGBShuffle(const uint8_t* src_bgra,
return 0;
}
// Gaussian blur a float plane using a 5x5 filter with
// coefficients of 1, 4, 6, 4, 1.
// Each destination pixel is a blur of the 5x5 pixels from the source.
// Source edges are clamped.
// The edge is 2 pixels on each side, and the interior is a multiple of 4.
LIBYUV_API
int GaussPlane_F32(const float* src,
int src_stride,
float* dst,
int dst_stride,
int width,
int height) {
int y;
void (*GaussCol_F32)(const float* src0,
const float* src1,
const float* src2,
const float* src3,
const float* src4,
float* dst,
int width) = GaussCol_F32_C;
void (*GaussRow_F32)(const float* src, float* dst, int width) = GaussRow_F32_C;
if (!src || !dst || width <= 0 || height == 0) {
return -1;
}
// Negative height means invert the image.
if (height < 0) {
height = -height;
src = src + (height - 1) * src_stride;
src_stride = -src_stride;
}
#if defined(HAS_GAUSSCOL_F32_NEON)
if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) {
GaussCol_F32 = GaussCol_F32_NEON;
}
#endif
#if defined(HAS_GAUSSROW_F32_NEON)
if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) {
GaussRow_F32 = GaussRow_F32_NEON;
}
#endif
{
// 2 pixels on each side, but aligned out to 16 bytes.
align_buffer_64(rowbuf, (4 + width + 4) * 4);
memset(rowbuf, 0, 16);
memset(rowbuf + (4 + width) * 4, 0, 16);
float* row = (float*)(rowbuf + 16);
const float* src0 = src;
const float* src1 = src;
const float* src2 = src;
const float* src3 = src2 + ((height > 1) ? src_stride : 0);
const float* src4 = src3 + ((height > 2) ? src_stride : 0);
for (y = 0; y < height; ++y) {
GaussCol_F32(src0, src1, src2, src3, src4, row, width);
// Extrude edge by 2 floats
row[-2] = row[-1] = row[0];
row[width + 1] = row[width] = row[width - 1];
GaussRow_F32(row - 2, dst, width);
src0 = src1;
src1 = src2;
src2 = src3;
src3 = src4;
if ((y + 2) < (height - 1)) {
src4 += src_stride;
}
dst += dst_stride;
}
free_aligned_buffer_64(rowbuf);
}
return 0;
}
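For readers checking the edge handling, a naive non-separable reference of the same blur can be written directly from the comment above. This is only a sketch (not part of the commit); it should agree with the separable path up to floating point rounding:

// Hypothetical reference: 5x5 kernel k[j][i] = c[j] * c[i] / 256 with
// c = {1, 4, 6, 4, 1}; source reads are clamped to the plane edges.
static void GaussPlaneReference_F32(const float* src, int src_stride,
                                    float* dst, int dst_stride,
                                    int width, int height) {
  static const float c[5] = {1.f, 4.f, 6.f, 4.f, 1.f};
  for (int y = 0; y < height; ++y) {
    for (int x = 0; x < width; ++x) {
      float sum = 0.f;
      for (int j = -2; j <= 2; ++j) {
        int sy = y + j;
        sy = sy < 0 ? 0 : (sy > height - 1 ? height - 1 : sy);
        for (int i = -2; i <= 2; ++i) {
          int sx = x + i;
          sx = sx < 0 ? 0 : (sx > width - 1 ? width - 1 : sx);
          sum += src[sy * src_stride + sx] * c[j + 2] * c[i + 2];
        }
      }
      dst[y * dst_stride + x] = sum * (1.0f / 256.0f);
    }
  }
}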
// Sobel ARGB effect.
static int ARGBSobelize(const uint8_t* src_argb,
int src_stride_argb,
...
@@ -3358,6 +3358,29 @@ void GaussCol_C(const uint16_t* src0,
}
}
void GaussRow_F32_C(const float* src, float* dst, int width) {
int i;
for (i = 0; i < width; ++i) {
*dst++ =
(src[0] + src[1] * 4 + src[2] * 6 + src[3] * 4 + src[4]) * (1.0f / 256.0f);
++src;
}
}
// filter 5 rows with 1, 4, 6, 4, 1 coefficients to produce 1 row.
void GaussCol_F32_C(const float* src0,
const float* src1,
const float* src2,
const float* src3,
const float* src4,
float* dst,
int width) {
int i;
for (i = 0; i < width; ++i) {
*dst++ = *src0++ + *src1++ * 4 + *src2++ * 6 + *src3++ * 4 + *src4++;
}
}
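The 1.0f / 256.0f factor in GaussRow_F32_C carries the normalization for both passes: each 1-4-6-4-1 tap set sums to 16, so the separable 5x5 kernel sums to 16 * 16 = 256. A small sketch (illustrative only, not part of the commit) of the 2D kernel the two passes imply:

// Builds the 5x5 kernel implied by the column pass followed by the row pass;
// the 25 weights sum to 1.
static void BuildGauss5x5(float kernel[5][5]) {
  static const float c[5] = {1.f, 4.f, 6.f, 4.f, 1.f};  // taps sum to 16
  for (int j = 0; j < 5; ++j) {
    for (int i = 0; i < 5; ++i) {
      kernel[j][i] = c[j] * c[i] * (1.0f / 256.0f);  // 256 = 16 * 16
    }
  }
}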
// Convert biplanar NV21 to packed YUV24
void NV21ToYUV24Row_C(const uint8_t* src_y,
const uint8_t* src_vu,
...
@@ -2921,6 +2921,82 @@ void GaussRow_NEON(const uint32_t* src, uint16_t* dst, int width) {
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
}
static const vecf32 kGaussCoefficients = {4.0f, 6.0f, 1.0f / 256.0f, 0.0f};
// filter 5 rows with 1, 4, 6, 4, 1 coefficients to produce 1 row.
void GaussCol_F32_NEON(const float* src0,
const float* src1,
const float* src2,
const float* src3,
const float* src4,
float* dst,
int width) {
asm volatile(
"ld2r {v6.4s, v7.4s}, [%7] \n" // constants 4 and 6
"1: \n"
"ld1 {v0.4s, v1.4s}, [%0], #32 \n" // load 8 samples, 5 rows
"ld1 {v2.4s, v3.4s}, [%1], #32 \n"
"fmla v0.4s, v2.4s, v6.4s \n" // * 4
"ld1 {v4.4s, v5.4s}, [%2], #32 \n"
"fmla v1.4s, v3.4s, v6.4s \n"
"fmla v0.4s, v4.4s, v7.4s \n" // * 6
"ld1 {v2.4s, v3.4s}, [%3], #32 \n"
"fmla v1.4s, v5.4s, v7.4s \n"
"fmla v0.4s, v2.4s, v6.4s \n" // * 4
"ld1 {v4.4s, v5.4s}, [%4], #32 \n"
"fmla v1.4s, v3.4s, v6.4s \n"
"fadd v0.4s, v0.4s, v4.4s \n" // * 1
"fadd v1.4s, v1.4s, v5.4s \n"
"subs %w6, %w6, #8 \n" // 8 processed per loop
"st1 {v0.4s, v1.4s}, [%5], #32 \n" // store 8 samples
"b.gt 1b \n"
: "+r"(src0), // %0
"+r"(src1), // %1
"+r"(src2), // %2
"+r"(src3), // %3
"+r"(src4), // %4
"+r"(dst), // %5
"+r"(width) // %6
: "r"(&kGaussCoefficients) // %7
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
}
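For readers less familiar with AArch64 assembly, an intrinsics rendering of the same column filter might look like the sketch below (illustrative only; the shipped kernel is the inline assembly above, which also interleaves the loads with the multiply-accumulates):

#include <arm_neon.h>

// Hypothetical intrinsics equivalent of GaussCol_F32_NEON. Assumes width is a
// multiple of 4; the library only dispatches the NEON path when width % 8 == 0.
static void GaussCol_F32_Intrinsics(const float* src0, const float* src1,
                                    const float* src2, const float* src3,
                                    const float* src4, float* dst, int width) {
  for (int i = 0; i < width; i += 4) {
    float32x4_t sum = vld1q_f32(src0 + i);              // * 1
    sum = vmlaq_n_f32(sum, vld1q_f32(src1 + i), 4.0f);  // + 4 * src1
    sum = vmlaq_n_f32(sum, vld1q_f32(src2 + i), 6.0f);  // + 6 * src2
    sum = vmlaq_n_f32(sum, vld1q_f32(src3 + i), 4.0f);  // + 4 * src3
    sum = vaddq_f32(sum, vld1q_f32(src4 + i));          // + 1 * src4
    vst1q_f32(dst + i, sum);
  }
}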
// filter a row of floats with 1, 4, 6, 4, 1 coefficients to produce 1 output row.
void GaussRow_F32_NEON(const float* src,
float* dst,
int width) {
asm volatile(
"ld3r {v6.4s, v7.4s, v8.4s}, [%3] \n" // constants 4, 6, 1/256
"1: \n"
"ld1 {v0.4s, v1.4s, v2.4s}, [%0], %4\n" // load 12 samples, 5 rows
"fadd v0.4s, v0.4s, v1.4s \n" // * 1
"ld1 {v4.4s, v5.4s}, [%0], %5 \n"
"fadd v1.4s, v1.4s, v2.4s \n"
"fmla v0.4s, v4.4s, v7.4s \n" // * 6
"ld1 {v2.4s, v3.4s}, [%0], %4 \n"
"fmla v1.4s, v5.4s, v7.4s \n"
"ld1 {v4.4s, v5.4s}, [%0], %6 \n"
"fadd v2.4s, v2.4s, v4.4s \n"
"fadd v3.4s, v3.4s, v5.4s \n"
"fmla v0.4s, v2.4s, v6.4s \n" // * 4
"fmla v1.4s, v3.4s, v6.4s \n"
"fmul v0.4s, v0.4s, v8.4s \n" // / 256
"fmul v1.4s, v1.4s, v8.4s \n"
"subs %w2, %w2, #8 \n" // 8 processed per loop
"st1 {v0.4s, v1.4s}, [%1], #32 \n" // store 8 samples
"b.gt 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2
: "r"(&kGaussCoefficients), // %3
"r"(8LL), // %4
"r"(-4LL), // %5
"r"(20LL) // %6
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8");
}
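The %4, %5 and %6 operands (8, -4 and 20) are byte post-increments that walk the source pointer through overlapping windows of the row so that s[i] through s[i+4] are all in registers; the net advance per iteration is 8 - 4 + 8 + 20 = 32 bytes, i.e. 8 floats. A hypothetical intrinsics rendering of the same row filter (illustrative only, assuming width is a multiple of 4):

#include <arm_neon.h>

// Hypothetical intrinsics equivalent of GaussRow_F32_NEON; the assembly above
// reaches the same s[i]..s[i+4] windows via the 8, -4 and 20 byte post-increments.
static void GaussRow_F32_Intrinsics(const float* src, float* dst, int width) {
  for (int i = 0; i < width; i += 4) {
    float32x4_t sum =
        vaddq_f32(vld1q_f32(src + i), vld1q_f32(src + i + 4));  // s[0] + s[4]
    sum = vmlaq_n_f32(sum, vld1q_f32(src + i + 1), 4.0f);       // + 4 * s[1]
    sum = vmlaq_n_f32(sum, vld1q_f32(src + i + 2), 6.0f);       // + 6 * s[2]
    sum = vmlaq_n_f32(sum, vld1q_f32(src + i + 3), 4.0f);       // + 4 * s[3]
    vst1q_f32(dst + i, vmulq_n_f32(sum, 1.0f / 256.0f));        // / 256
  }
}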
// Convert biplanar NV21 to packed YUV24
void NV21ToYUV24Row_NEON(const uint8_t* src_y,
const uint8_t* src_vu,
...
@@ -3234,33 +3234,33 @@ extern "C" void GaussRow_NEON(const uint32_t* src, uint16_t* dst, int width);
extern "C" void GaussRow_C(const uint32_t* src, uint16_t* dst, int width);
TEST_F(LibYUVPlanarTest, TestGaussRow_Opt) {
SIMD_ALIGNED(uint32_t orig_pixels[1280 + 8]);
SIMD_ALIGNED(uint16_t dst_pixels_c[1280]);
SIMD_ALIGNED(uint16_t dst_pixels_opt[1280]);
memset(orig_pixels, 0, sizeof(orig_pixels));
memset(dst_pixels_c, 1, sizeof(dst_pixels_c));
memset(dst_pixels_opt, 2, sizeof(dst_pixels_opt));
for (int i = 0; i < 1280 + 8; ++i) {
orig_pixels[i] = i * 256;
}
GaussRow_C(&orig_pixels[0], &dst_pixels_c[0], 1280);
for (int i = 0; i < benchmark_pixels_div1280_; ++i) {
#if !defined(LIBYUV_DISABLE_NEON) && \
(defined(__aarch64__) || defined(__ARM_NEON__) || defined(LIBYUV_NEON))
int has_neon = TestCpuFlag(kCpuHasNEON);
if (has_neon) {
GaussRow_NEON(&orig_pixels[0], &dst_pixels_opt[0], 1280);
} else {
GaussRow_C(&orig_pixels[0], &dst_pixels_opt[0], 1280);
}
#else
GaussRow_C(&orig_pixels[0], &dst_pixels_opt[0], 1280);
#endif
}
for (int i = 0; i < 1280; ++i) {
EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]);
}
@@ -3286,48 +3286,127 @@ extern "C" void GaussCol_C(const uint16_t* src0,
int width);
TEST_F(LibYUVPlanarTest, TestGaussCol_Opt) {
SIMD_ALIGNED(uint16_t orig_pixels[1280 * 5]);
SIMD_ALIGNED(uint32_t dst_pixels_c[1280]);
SIMD_ALIGNED(uint32_t dst_pixels_opt[1280]);
memset(orig_pixels, 0, sizeof(orig_pixels));
memset(dst_pixels_c, 1, sizeof(dst_pixels_c));
memset(dst_pixels_opt, 2, sizeof(dst_pixels_opt));
for (int i = 0; i < 1280 * 5; ++i) {
orig_pixels[i] = static_cast<float>(i);
}
GaussCol_C(&orig_pixels[0], &orig_pixels[1280], &orig_pixels[1280 * 2],
&orig_pixels[1280 * 3], &orig_pixels[1280 * 4], &dst_pixels_c[0],
1280);
for (int i = 0; i < benchmark_pixels_div1280_; ++i) {
#if !defined(LIBYUV_DISABLE_NEON) && \
(defined(__aarch64__) || defined(__ARM_NEON__) || defined(LIBYUV_NEON))
int has_neon = TestCpuFlag(kCpuHasNEON);
if (has_neon) {
GaussCol_NEON(&orig_pixels[0], &orig_pixels[1280], &orig_pixels[1280 * 2],
&orig_pixels[1280 * 3], &orig_pixels[1280 * 4],
&dst_pixels_opt[0], 1280);
} else {
GaussCol_C(&orig_pixels[0], &orig_pixels[1280], &orig_pixels[1280 * 2],
&orig_pixels[1280 * 3], &orig_pixels[1280 * 4],
&dst_pixels_opt[0], 1280);
}
#else
GaussCol_C(&orig_pixels[0], &orig_pixels[1280], &orig_pixels[1280 * 2],
&orig_pixels[1280 * 3], &orig_pixels[1280 * 4], &dst_pixels_opt[0],
1280);
#endif
}
for (int i = 0; i < 1280; ++i) {
EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]);
}
}

TEST_F(LibYUVPlanarTest, TestGaussRow_F32_Opt) {
SIMD_ALIGNED(float orig_pixels[1280 + 4]);
SIMD_ALIGNED(float dst_pixels_c[1280]);
SIMD_ALIGNED(float dst_pixels_opt[1280]);
memset(orig_pixels, 0, sizeof(orig_pixels));
memset(dst_pixels_c, 1, sizeof(dst_pixels_c));
memset(dst_pixels_opt, 2, sizeof(dst_pixels_opt));
for (int i = 0; i < 1280 + 4; ++i) {
orig_pixels[i] = static_cast<float>(i);
}
GaussRow_F32_C(&orig_pixels[0], &dst_pixels_c[0], 1280);
for (int i = 0; i < benchmark_pixels_div1280_; ++i) {
#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
int has_neon = TestCpuFlag(kCpuHasNEON);
if (has_neon) {
GaussRow_F32_NEON(&orig_pixels[0], &dst_pixels_opt[0], 1280);
} else {
GaussRow_F32_C(&orig_pixels[0], &dst_pixels_opt[0], 1280);
}
#else
GaussRow_F32_C(&orig_pixels[0], &dst_pixels_opt[0], 1280);
#endif
}
for (int i = 0; i < 1280; ++i) {
EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]);
}
}
TEST_F(LibYUVPlanarTest, TestGaussCol_F32_Opt) {
SIMD_ALIGNED(float dst_pixels_c[1280]);
SIMD_ALIGNED(float dst_pixels_opt[1280]);
align_buffer_page_end(orig_pixels_buf, 1280 * 5 * 4); // 5 rows
float* orig_pixels = reinterpret_cast<float*>(orig_pixels_buf);
memset(orig_pixels, 0, 1280 * 5 * 4);
memset(dst_pixels_c, 1, sizeof(dst_pixels_c));
memset(dst_pixels_opt, 2, sizeof(dst_pixels_opt));
for (int i = 0; i < 1280 * 5; ++i) {
orig_pixels[i] = static_cast<float>(i);
}
GaussCol_F32_C(&orig_pixels[0],
&orig_pixels[1280],
&orig_pixels[1280 * 2],
&orig_pixels[1280 * 3],
&orig_pixels[1280 * 4],
&dst_pixels_c[0], 1280);
for (int i = 0; i < benchmark_pixels_div1280_; ++i) {
#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
int has_neon = TestCpuFlag(kCpuHasNEON);
if (has_neon) {
GaussCol_F32_NEON(&orig_pixels[0],
&orig_pixels[1280],
&orig_pixels[1280 * 2],
&orig_pixels[1280 * 3],
&orig_pixels[1280 * 4],
&dst_pixels_opt[0], 1280);
} else {
GaussCol_F32_C(&orig_pixels[0],
&orig_pixels[1280],
&orig_pixels[1280 * 2],
&orig_pixels[1280 * 3],
&orig_pixels[1280 * 4],
&dst_pixels_opt[0], 1280);
}
#else
GaussCol_F32_C(&orig_pixels[0],
&orig_pixels[1280],
&orig_pixels[1280 * 2],
&orig_pixels[1280 * 3],
&orig_pixels[1280 * 4],
&dst_pixels_opt[0], 1280);
#endif
}
for (int i = 0; i < 1280; ++i) {
EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]);
}
free_aligned_buffer_page_end(orig_pixels_buf);
} }
TEST_F(LibYUVPlanarTest, SwapUVRow) {
@@ -3360,6 +3439,39 @@ TEST_F(LibYUVPlanarTest, SwapUVRow) {
free_aligned_buffer_page_end(src_pixels_vu);
free_aligned_buffer_page_end(dst_pixels_uv);
}
#endif // ENABLE_ROW_TESTS
TEST_F(LibYUVPlanarTest, TestGaussPlane_F32) {
const int kSize = benchmark_width_ * benchmark_height_ * 4;
align_buffer_page_end(orig_pixels, kSize);
align_buffer_page_end(dst_pixels_opt, kSize);
align_buffer_page_end(dst_pixels_c, kSize);
for (int i = 0; i < benchmark_width_ * benchmark_height_; ++i) {
((float*)(orig_pixels))[i] = (i & 1023) * 3.14f;
}
memset(dst_pixels_opt, 1, kSize);
memset(dst_pixels_c, 2, kSize);
MaskCpuFlags(disable_cpu_flags_);
GaussPlane_F32((const float*)(orig_pixels), benchmark_width_,
(float*)(dst_pixels_c), benchmark_width_,
benchmark_width_, benchmark_height_);
MaskCpuFlags(benchmark_cpu_info_);
for (int i = 0; i < benchmark_iterations_; ++i) {
GaussPlane_F32((const float*)(orig_pixels), benchmark_width_,
(float*)(dst_pixels_opt), benchmark_width_,
benchmark_width_, benchmark_height_);
}
for (int i = 0; i < benchmark_width_ * benchmark_height_; ++i) {
EXPECT_NEAR(((float*)(dst_pixels_c))[i], ((float*)(dst_pixels_opt))[i], 1.f)
<< i;
}
free_aligned_buffer_page_end(dst_pixels_c);
free_aligned_buffer_page_end(dst_pixels_opt);
free_aligned_buffer_page_end(orig_pixels);
}
} // namespace libyuv