Commit 6e6f81b8 authored by Frank Barchard, committed by Commit Bot

Floating point Gaussian kernels

On SkylakeX for 720p
TestGaussPlane_F32 (657 ms)

On Pixel3
TestGaussPlane_F32 (1787 ms)

Bug: libyuv:852, b/145611468
Change-Id: I9859af1b9381621067992305727da285f82bdded
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/1949667
Commit-Queue: Frank Barchard <fbarchard@chromium.org>
Reviewed-by: Marat Dukhan <maratek@google.com>
parent d82f4baf
Name: libyuv
URL: http://code.google.com/p/libyuv/
Version: 1742
License: BSD
License File: LICENSE
...
@@ -743,6 +743,19 @@ int ARGBBlur(const uint8_t* src_argb,
int height,
int radius);
// Gaussian 5x5 blur of a float plane.
// Coefficients of 1, 4, 6, 4, 1.
// Each destination pixel is a blur of the 5x5 pixels from the source.
// Source edges are clamped.
LIBYUV_API
int GaussPlane_F32(const float* src,
int src_stride,
float* dst,
int dst_stride,
int width,
int height);
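For orientation, a minimal usage sketch of the new entry point (illustrative only; the helper name and buffers are hypothetical, and the strides are passed in float elements rather than bytes, matching the pointer arithmetic in the implementation below):

// Hypothetical usage sketch, not part of the commit.
#include "libyuv/planar_functions.h"  // assumed header for GaussPlane_F32

void BlurDepth(const float* depth, float* blurred, int width, int height) {
  // Strides are given in float elements here, not bytes.
  GaussPlane_F32(depth, width, blurred, width, width, height);
}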
// Multiply ARGB image by ARGB value.
LIBYUV_API
int ARGBShade(const uint8_t* src_argb,
...
@@ -419,6 +419,9 @@ extern "C" {
// The following are available on AArch64 platforms:
#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
#define HAS_SCALESUMSAMPLES_NEON
#define HAS_GAUSSROW_F32_NEON
#define HAS_GAUSSCOL_F32_NEON
#endif
#if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa)
#define HAS_ABGRTOUVROW_MSA
@@ -601,6 +604,7 @@ extern "C" {
#endif
typedef __declspec(align(16)) int16_t vec16[8];
typedef __declspec(align(16)) int32_t vec32[4];
typedef __declspec(align(16)) float vecf32[4];
typedef __declspec(align(16)) int8_t vec8[16];
typedef __declspec(align(16)) uint16_t uvec16[8];
typedef __declspec(align(16)) uint32_t uvec32[4];
@@ -620,6 +624,7 @@ typedef __declspec(align(32)) uint8_t ulvec8[32];
#endif
typedef int16_t __attribute__((vector_size(16))) vec16;
typedef int32_t __attribute__((vector_size(16))) vec32;
typedef float __attribute__((vector_size(16))) vecf32;
typedef int8_t __attribute__((vector_size(16))) vec8;
typedef uint16_t __attribute__((vector_size(16))) uvec16;
typedef uint32_t __attribute__((vector_size(16))) uvec32;
@@ -634,6 +639,7 @@ typedef uint8_t __attribute__((vector_size(32))) ulvec8;
#define SIMD_ALIGNED(var) var
typedef int16_t vec16[8];
typedef int32_t vec32[4];
typedef float vecf32[4];
typedef int8_t vec8[16];
typedef uint16_t uvec16[8];
typedef uint32_t uvec32[4];
@@ -4256,6 +4262,25 @@ void UYVYToARGBRow_Any_MMI(const uint8_t* src_ptr,
const struct YuvConstants* yuvconstants,
int width);
void GaussRow_F32_NEON(const float* src, float* dst, int width);
void GaussRow_F32_C(const float* src, float* dst, int width);
void GaussCol_F32_NEON(const float* src0,
const float* src1,
const float* src2,
const float* src3,
const float* src4,
float* dst,
int width);
void GaussCol_F32_C(const float* src0,
const float* src1,
const float* src2,
const float* src3,
const float* src4,
float* dst,
int width);
#ifdef __cplusplus
} // extern "C"
} // namespace libyuv
...
@@ -11,6 +11,6 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_
#define INCLUDE_LIBYUV_VERSION_H_
#define LIBYUV_VERSION 1742
#endif // INCLUDE_LIBYUV_VERSION_H_
@@ -3043,6 +3043,84 @@ int ARGBShuffle(const uint8_t* src_bgra,
return 0;
}
// Gaussian blur a float plane using a 5x5 filter with
// coefficients of 1, 4, 6, 4, 1.
// Each destination pixel is a blur of the 5x5 pixels from the source.
// Source edges are clamped.
// The edge is 2 pixels on each side, and the interior is a multiple of 4.
LIBYUV_API
int GaussPlane_F32(const float* src,
int src_stride,
float* dst,
int dst_stride,
int width,
int height) {
int y;
void (*GaussCol_F32)(const float* src0,
const float* src1,
const float* src2,
const float* src3,
const float* src4,
float* dst,
int width) = GaussCol_F32_C;
void (*GaussRow_F32)(const float* src, float* dst, int width) = GaussRow_F32_C;
if (!src || !dst || width <= 0 || height == 0) {
return -1;
}
// Negative height means invert the image.
if (height < 0) {
height = -height;
src = src + (height - 1) * src_stride;
src_stride = -src_stride;
}
#if defined(HAS_GAUSSCOL_F32_NEON)
if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) {
GaussCol_F32 = GaussCol_F32_NEON;
}
#endif
#if defined(HAS_GAUSSROW_F32_NEON)
if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) {
GaussRow_F32 = GaussRow_F32_NEON;
}
#endif
{
// 2 pixels on each side, but aligned out to 16 bytes.
align_buffer_64(rowbuf, (4 + width + 4) * 4);
memset(rowbuf, 0, 16);
memset(rowbuf + (4 + width) * 4, 0, 16);
float* row = (float*)(rowbuf + 16);
const float* src0 = src;
const float* src1 = src;
const float* src2 = src;
const float* src3 = src2 + ((height > 1) ? src_stride : 0);
const float* src4 = src3 + ((height > 2) ? src_stride : 0);
for (y = 0; y < height; ++y) {
GaussCol_F32(src0, src1, src2, src3, src4, row, width);
// Extrude edge by 2 floats
row[-2] = row[-1] = row[0];
row[width + 1] = row[width] = row[width - 1];
GaussRow_F32(row - 2, dst, width);
src0 = src1;
src1 = src2;
src2 = src3;
src3 = src4;
if ((y + 2) < (height - 1)) {
src4 += src_stride;
}
dst += dst_stride;
}
free_aligned_buffer_64(rowbuf);
}
return 0;
}
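For readers checking the edge handling, a naive non-separable reference of the same blur can be written directly from the comment above. This is only a sketch (not part of the commit); it should agree with the separable path up to floating point rounding:

// Hypothetical reference: 5x5 kernel k[j][i] = c[j] * c[i] / 256 with
// c = {1, 4, 6, 4, 1}; source reads are clamped to the plane edges.
static void GaussPlaneReference_F32(const float* src, int src_stride,
                                    float* dst, int dst_stride,
                                    int width, int height) {
  static const float c[5] = {1.f, 4.f, 6.f, 4.f, 1.f};
  for (int y = 0; y < height; ++y) {
    for (int x = 0; x < width; ++x) {
      float sum = 0.f;
      for (int j = -2; j <= 2; ++j) {
        int sy = y + j;
        sy = sy < 0 ? 0 : (sy > height - 1 ? height - 1 : sy);
        for (int i = -2; i <= 2; ++i) {
          int sx = x + i;
          sx = sx < 0 ? 0 : (sx > width - 1 ? width - 1 : sx);
          sum += src[sy * src_stride + sx] * c[j + 2] * c[i + 2];
        }
      }
      dst[y * dst_stride + x] = sum * (1.0f / 256.0f);
    }
  }
}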
// Sobel ARGB effect.
static int ARGBSobelize(const uint8_t* src_argb,
int src_stride_argb,
...
@@ -3358,6 +3358,29 @@ void GaussCol_C(const uint16_t* src0,
}
}
void GaussRow_F32_C(const float* src, float* dst, int width) {
int i;
for (i = 0; i < width; ++i) {
*dst++ =
(src[0] + src[1] * 4 + src[2] * 6 + src[3] * 4 + src[4]) * (1.0f / 256.0f);
++src;
}
}
// filter 5 rows with 1, 4, 6, 4, 1 coefficients to produce 1 row.
void GaussCol_F32_C(const float* src0,
const float* src1,
const float* src2,
const float* src3,
const float* src4,
float* dst,
int width) {
int i;
for (i = 0; i < width; ++i) {
*dst++ = *src0++ + *src1++ * 4 + *src2++ * 6 + *src3++ * 4 + *src4++;
}
}
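The 1.0f / 256.0f factor in GaussRow_F32_C carries the normalization for both passes: each 1-4-6-4-1 tap set sums to 16, so the separable 5x5 kernel sums to 16 * 16 = 256. A small sketch (illustrative only, not part of the commit) of the 2D kernel the two passes imply:

// Builds the 5x5 kernel implied by the column pass followed by the row pass;
// the 25 weights sum to 1.
static void BuildGauss5x5(float kernel[5][5]) {
  static const float c[5] = {1.f, 4.f, 6.f, 4.f, 1.f};  // taps sum to 16
  for (int j = 0; j < 5; ++j) {
    for (int i = 0; i < 5; ++i) {
      kernel[j][i] = c[j] * c[i] * (1.0f / 256.0f);  // 256 = 16 * 16
    }
  }
}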
// Convert biplanar NV21 to packed YUV24
void NV21ToYUV24Row_C(const uint8_t* src_y,
const uint8_t* src_vu,
...
@@ -2921,6 +2921,82 @@ void GaussRow_NEON(const uint32_t* src, uint16_t* dst, int width) {
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
}
static const vecf32 kGaussCoefficients = {4.0f, 6.0f, 1.0f / 256.0f, 0.0f};
// filter 5 rows with 1, 4, 6, 4, 1 coefficients to produce 1 row.
void GaussCol_F32_NEON(const float* src0,
const float* src1,
const float* src2,
const float* src3,
const float* src4,
float* dst,
int width) {
asm volatile(
"ld2r {v6.4s, v7.4s}, [%7] \n" // constants 4 and 6
"1: \n"
"ld1 {v0.4s, v1.4s}, [%0], #32 \n" // load 8 samples, 5 rows
"ld1 {v2.4s, v3.4s}, [%1], #32 \n"
"fmla v0.4s, v2.4s, v6.4s \n" // * 4
"ld1 {v4.4s, v5.4s}, [%2], #32 \n"
"fmla v1.4s, v3.4s, v6.4s \n"
"fmla v0.4s, v4.4s, v7.4s \n" // * 6
"ld1 {v2.4s, v3.4s}, [%3], #32 \n"
"fmla v1.4s, v5.4s, v7.4s \n"
"fmla v0.4s, v2.4s, v6.4s \n" // * 4
"ld1 {v4.4s, v5.4s}, [%4], #32 \n"
"fmla v1.4s, v3.4s, v6.4s \n"
"fadd v0.4s, v0.4s, v4.4s \n" // * 1
"fadd v1.4s, v1.4s, v5.4s \n"
"subs %w6, %w6, #8 \n" // 8 processed per loop
"st1 {v0.4s, v1.4s}, [%5], #32 \n" // store 8 samples
"b.gt 1b \n"
: "+r"(src0), // %0
"+r"(src1), // %1
"+r"(src2), // %2
"+r"(src3), // %3
"+r"(src4), // %4
"+r"(dst), // %5
"+r"(width) // %6
: "r"(&kGaussCoefficients) // %7
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
}
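For readers less familiar with AArch64 assembly, an intrinsics rendering of the same column filter might look like the sketch below (illustrative only; the shipped kernel is the inline assembly above, which also interleaves the loads with the multiply-accumulates):

#include <arm_neon.h>

// Hypothetical intrinsics equivalent of GaussCol_F32_NEON. Assumes width is a
// multiple of 4; the library only dispatches the NEON path when width % 8 == 0.
static void GaussCol_F32_Intrinsics(const float* src0, const float* src1,
                                    const float* src2, const float* src3,
                                    const float* src4, float* dst, int width) {
  for (int i = 0; i < width; i += 4) {
    float32x4_t sum = vld1q_f32(src0 + i);              // * 1
    sum = vmlaq_n_f32(sum, vld1q_f32(src1 + i), 4.0f);  // + 4 * src1
    sum = vmlaq_n_f32(sum, vld1q_f32(src2 + i), 6.0f);  // + 6 * src2
    sum = vmlaq_n_f32(sum, vld1q_f32(src3 + i), 4.0f);  // + 4 * src3
    sum = vaddq_f32(sum, vld1q_f32(src4 + i));          // + 1 * src4
    vst1q_f32(dst + i, sum);
  }
}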
// filter a row of floats with 1, 4, 6, 4, 1 coefficients to produce 1 output row.
void GaussRow_F32_NEON(const float* src,
float* dst,
int width) {
asm volatile(
"ld3r {v6.4s, v7.4s, v8.4s}, [%3] \n" // constants 4, 6, 1/256
"1: \n"
"ld1 {v0.4s, v1.4s, v2.4s}, [%0], %4\n" // load 12 samples, 5 rows
"fadd v0.4s, v0.4s, v1.4s \n" // * 1
"ld1 {v4.4s, v5.4s}, [%0], %5 \n"
"fadd v1.4s, v1.4s, v2.4s \n"
"fmla v0.4s, v4.4s, v7.4s \n" // * 6
"ld1 {v2.4s, v3.4s}, [%0], %4 \n"
"fmla v1.4s, v5.4s, v7.4s \n"
"ld1 {v4.4s, v5.4s}, [%0], %6 \n"
"fadd v2.4s, v2.4s, v4.4s \n"
"fadd v3.4s, v3.4s, v5.4s \n"
"fmla v0.4s, v2.4s, v6.4s \n" // * 4
"fmla v1.4s, v3.4s, v6.4s \n"
"fmul v0.4s, v0.4s, v8.4s \n" // / 256
"fmul v1.4s, v1.4s, v8.4s \n"
"subs %w2, %w2, #8 \n" // 8 processed per loop
"st1 {v0.4s, v1.4s}, [%1], #32 \n" // store 8 samples
"b.gt 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2
: "r"(&kGaussCoefficients), // %3
"r"(8LL), // %4
"r"(-4LL), // %5
"r"(20LL) // %6
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8");
}
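The %4, %5 and %6 operands (8, -4 and 20) are byte post-increments that walk the source pointer through overlapping windows of the row so that s[i] through s[i+4] are all in registers; the net advance per iteration is 8 - 4 + 8 + 20 = 32 bytes, i.e. 8 floats. A hypothetical intrinsics rendering of the same row filter (illustrative only, assuming width is a multiple of 4):

#include <arm_neon.h>

// Hypothetical intrinsics equivalent of GaussRow_F32_NEON; the assembly above
// reaches the same s[i]..s[i+4] windows via the 8, -4 and 20 byte post-increments.
static void GaussRow_F32_Intrinsics(const float* src, float* dst, int width) {
  for (int i = 0; i < width; i += 4) {
    float32x4_t sum =
        vaddq_f32(vld1q_f32(src + i), vld1q_f32(src + i + 4));  // s[0] + s[4]
    sum = vmlaq_n_f32(sum, vld1q_f32(src + i + 1), 4.0f);       // + 4 * s[1]
    sum = vmlaq_n_f32(sum, vld1q_f32(src + i + 2), 6.0f);       // + 6 * s[2]
    sum = vmlaq_n_f32(sum, vld1q_f32(src + i + 3), 4.0f);       // + 4 * s[3]
    vst1q_f32(dst + i, vmulq_n_f32(sum, 1.0f / 256.0f));        // / 256
  }
}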
// Convert biplanar NV21 to packed YUV24
void NV21ToYUV24Row_NEON(const uint8_t* src_y,
const uint8_t* src_vu,
...
@@ -3234,33 +3234,33 @@ extern "C" void GaussRow_NEON(const uint32_t* src, uint16_t* dst, int width);
extern "C" void GaussRow_C(const uint32_t* src, uint16_t* dst, int width);
TEST_F(LibYUVPlanarTest, TestGaussRow_Opt) {
SIMD_ALIGNED(uint32_t orig_pixels[1280 + 8]);
SIMD_ALIGNED(uint16_t dst_pixels_c[1280]);
SIMD_ALIGNED(uint16_t dst_pixels_opt[1280]);
memset(orig_pixels, 0, sizeof(orig_pixels));
memset(dst_pixels_c, 1, sizeof(dst_pixels_c));
memset(dst_pixels_opt, 2, sizeof(dst_pixels_opt));
for (int i = 0; i < 1280 + 8; ++i) {
orig_pixels[i] = i * 256;
}
GaussRow_C(&orig_pixels[0], &dst_pixels_c[0], 1280);
for (int i = 0; i < benchmark_pixels_div1280_; ++i) {
#if !defined(LIBYUV_DISABLE_NEON) && \
(defined(__aarch64__) || defined(__ARM_NEON__) || defined(LIBYUV_NEON))
int has_neon = TestCpuFlag(kCpuHasNEON);
if (has_neon) {
GaussRow_NEON(&orig_pixels[0], &dst_pixels_opt[0], 1280);
} else {
GaussRow_C(&orig_pixels[0], &dst_pixels_opt[0], 1280);
}
#else
GaussRow_C(&orig_pixels[0], &dst_pixels_opt[0], 1280);
#endif
}
for (int i = 0; i < 1280; ++i) {
EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]);
}
@@ -3286,48 +3286,127 @@ extern "C" void GaussCol_C(const uint16_t* src0,
int width);
TEST_F(LibYUVPlanarTest, TestGaussCol_Opt) {
SIMD_ALIGNED(uint16_t orig_pixels[1280 * 5]);
SIMD_ALIGNED(uint32_t dst_pixels_c[1280]);
SIMD_ALIGNED(uint32_t dst_pixels_opt[1280]);
memset(orig_pixels, 0, sizeof(orig_pixels));
memset(dst_pixels_c, 1, sizeof(dst_pixels_c));
memset(dst_pixels_opt, 2, sizeof(dst_pixels_opt));
for (int i = 0; i < 1280 * 5; ++i) {
orig_pixels[i] = static_cast<float>(i);
}
GaussCol_C(&orig_pixels[0], &orig_pixels[1280], &orig_pixels[1280 * 2],
&orig_pixels[1280 * 3], &orig_pixels[1280 * 4], &dst_pixels_c[0],
1280);
for (int i = 0; i < benchmark_pixels_div1280_; ++i) {
#if !defined(LIBYUV_DISABLE_NEON) && \
(defined(__aarch64__) || defined(__ARM_NEON__) || defined(LIBYUV_NEON))
int has_neon = TestCpuFlag(kCpuHasNEON);
if (has_neon) {
GaussCol_NEON(&orig_pixels[0], &orig_pixels[1280], &orig_pixels[1280 * 2],
&orig_pixels[1280 * 3], &orig_pixels[1280 * 4],
&dst_pixels_opt[0], 1280);
} else {
GaussCol_C(&orig_pixels[0], &orig_pixels[1280], &orig_pixels[1280 * 2],
&orig_pixels[1280 * 3], &orig_pixels[1280 * 4],
&dst_pixels_opt[0], 1280);
}
#else
GaussCol_C(&orig_pixels[0], &orig_pixels[1280], &orig_pixels[1280 * 2],
&orig_pixels[1280 * 3], &orig_pixels[1280 * 4], &dst_pixels_opt[0],
1280);
#endif
}
for (int i = 0; i < 1280; ++i) {
EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]);
}
}

TEST_F(LibYUVPlanarTest, TestGaussRow_F32_Opt) {
SIMD_ALIGNED(float orig_pixels[1280 + 4]);
SIMD_ALIGNED(float dst_pixels_c[1280]);
SIMD_ALIGNED(float dst_pixels_opt[1280]);
memset(orig_pixels, 0, sizeof(orig_pixels));
memset(dst_pixels_c, 1, sizeof(dst_pixels_c));
memset(dst_pixels_opt, 2, sizeof(dst_pixels_opt));
for (int i = 0; i < 1280 + 4; ++i) {
orig_pixels[i] = static_cast<float>(i);
}
GaussRow_F32_C(&orig_pixels[0], &dst_pixels_c[0], 1280);
for (int i = 0; i < benchmark_pixels_div1280_; ++i) {
#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
int has_neon = TestCpuFlag(kCpuHasNEON);
if (has_neon) {
GaussRow_F32_NEON(&orig_pixels[0], &dst_pixels_opt[0], 1280);
} else {
GaussRow_F32_C(&orig_pixels[0], &dst_pixels_opt[0], 1280);
}
#else
GaussRow_F32_C(&orig_pixels[0], &dst_pixels_opt[0], 1280);
#endif
}
for (int i = 0; i < 1280; ++i) {
EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]);
}
}
TEST_F(LibYUVPlanarTest, TestGaussCol_F32_Opt) {
SIMD_ALIGNED(float dst_pixels_c[1280]);
SIMD_ALIGNED(float dst_pixels_opt[1280]);
align_buffer_page_end(orig_pixels_buf, 1280 * 5 * 4); // 5 rows
float* orig_pixels = reinterpret_cast<float*>(orig_pixels_buf);
memset(orig_pixels, 0, 1280 * 5 * 4);
memset(dst_pixels_c, 1, sizeof(dst_pixels_c));
memset(dst_pixels_opt, 2, sizeof(dst_pixels_opt));
for (int i = 0; i < 1280 * 5; ++i) {
orig_pixels[i] = static_cast<float>(i);
}
GaussCol_F32_C(&orig_pixels[0],
&orig_pixels[1280],
&orig_pixels[1280 * 2],
&orig_pixels[1280 * 3],
&orig_pixels[1280 * 4],
&dst_pixels_c[0], 1280);
for (int i = 0; i < benchmark_pixels_div1280_; ++i) {
#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
int has_neon = TestCpuFlag(kCpuHasNEON);
if (has_neon) {
GaussCol_F32_NEON(&orig_pixels[0],
&orig_pixels[1280],
&orig_pixels[1280 * 2],
&orig_pixels[1280 * 3],
&orig_pixels[1280 * 4],
&dst_pixels_opt[0], 1280);
} else {
GaussCol_F32_C(&orig_pixels[0],
&orig_pixels[1280],
&orig_pixels[1280 * 2],
&orig_pixels[1280 * 3],
&orig_pixels[1280 * 4],
&dst_pixels_opt[0], 1280);
}
#else
GaussCol_F32_C(&orig_pixels[0],
&orig_pixels[1280],
&orig_pixels[1280 * 2],
&orig_pixels[1280 * 3],
&orig_pixels[1280 * 4],
&dst_pixels_opt[0], 1280);
#endif
}
for (int i = 0; i < 1280; ++i) {
EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]);
}
free_aligned_buffer_page_end(orig_pixels_buf);
} }
TEST_F(LibYUVPlanarTest, SwapUVRow) {
@@ -3360,6 +3439,39 @@ TEST_F(LibYUVPlanarTest, SwapUVRow) {
free_aligned_buffer_page_end(src_pixels_vu);
free_aligned_buffer_page_end(dst_pixels_uv);
}
#endif // ENABLE_ROW_TESTS
TEST_F(LibYUVPlanarTest, TestGaussPlane_F32) {
const int kSize = benchmark_width_ * benchmark_height_ * 4;
align_buffer_page_end(orig_pixels, kSize);
align_buffer_page_end(dst_pixels_opt, kSize);
align_buffer_page_end(dst_pixels_c, kSize);
for (int i = 0; i < benchmark_width_ * benchmark_height_; ++i) {
((float*)(orig_pixels))[i] = (i & 1023) * 3.14f;
}
memset(dst_pixels_opt, 1, kSize);
memset(dst_pixels_c, 2, kSize);
MaskCpuFlags(disable_cpu_flags_);
GaussPlane_F32((const float*)(orig_pixels), benchmark_width_,
(float*)(dst_pixels_c), benchmark_width_,
benchmark_width_, benchmark_height_);
MaskCpuFlags(benchmark_cpu_info_);
for (int i = 0; i < benchmark_iterations_; ++i) {
GaussPlane_F32((const float*)(orig_pixels), benchmark_width_,
(float*)(dst_pixels_opt), benchmark_width_,
benchmark_width_, benchmark_height_);
}
for (int i = 0; i < benchmark_width_ * benchmark_height_; ++i) {
EXPECT_NEAR(((float*)(dst_pixels_c))[i], ((float*)(dst_pixels_opt))[i], 1.f)
<< i;
}
free_aligned_buffer_page_end(dst_pixels_c);
free_aligned_buffer_page_end(dst_pixels_opt);
free_aligned_buffer_page_end(orig_pixels);
}
} // namespace libyuv