Commit 681c6c67 authored by Frank Barchard, committed by Commit Bot

Add LIBYUV_API to NV12ToABGR and I444Rotate, I444Scale

Gaussian blur low levels ported to 32 bit neon.
But they are not hooked up to anything but a unittest.

Bug:b/248041731, b/132108021, b/129908793
Change-Id: Iccebb8ffd6b719810aa11dd770a525227da4c357
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/1611206
Commit-Queue: Frank Barchard <fbarchard@chromium.org>
Reviewed-by: Chong Zhang <chz@google.com>
parent 05f72b86
Name: libyuv
URL: http://code.google.com/p/libyuv/
Version: 1727
Version: 1730
License: BSD
License File: LICENSE
......
......@@ -256,6 +256,7 @@ int NV21ToARGB(const uint8_t* src_y,
int height);
// Convert NV12 to ABGR.
LIBYUV_API
int NV12ToABGR(const uint8_t* src_y,
int src_stride_y,
const uint8_t* src_uv,
......
......@@ -126,6 +126,25 @@ int I444Scale(const uint8_t* src_y,
int dst_height,
enum FilterMode filtering);
LIBYUV_API
int I444Scale_16(const uint16_t* src_y,
int src_stride_y,
const uint16_t* src_u,
int src_stride_u,
const uint16_t* src_v,
int src_stride_v,
int src_width,
int src_height,
uint16_t* dst_y,
int dst_stride_y,
uint16_t* dst_u,
int dst_stride_u,
uint16_t* dst_v,
int dst_stride_v,
int dst_width,
int dst_height,
enum FilterMode filtering);
#ifdef __cplusplus
// Legacy API. Deprecated.
LIBYUV_API
......
......@@ -11,6 +11,6 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_
#define INCLUDE_LIBYUV_VERSION_H_
#define LIBYUV_VERSION 1727
#define LIBYUV_VERSION 1730
#endif // INCLUDE_LIBYUV_VERSION_H_
......@@ -1793,8 +1793,9 @@ int NV21ToARGB(const uint8_t* src_y,
}
// Convert NV12 to ABGR.
// To output ABGR instead of ARGB swap the UV and use a mirrrored yuc matrix.
// To output ABGR instead of ARGB swap the UV and use a mirrored yuv matrix.
// To swap the UV use NV12 instead of NV21.
LIBYUV_API
int NV12ToABGR(const uint8_t* src_y,
int src_stride_y,
const uint8_t* src_uv,
......
......@@ -521,28 +521,19 @@ int I444Rotate(const uint8_t* src_y,
CopyPlane(src_v, src_stride_v, dst_v, dst_stride_v, width, height);
return 0;
case libyuv::kRotate90:
RotatePlane90(src_y, src_stride_y, dst_y, dst_stride_y,
width, height);
RotatePlane90(src_u, src_stride_u, dst_u, dst_stride_u,
width, height);
RotatePlane90(src_v, src_stride_v, dst_v, dst_stride_v,
width, height);
RotatePlane90(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
RotatePlane90(src_u, src_stride_u, dst_u, dst_stride_u, width, height);
RotatePlane90(src_v, src_stride_v, dst_v, dst_stride_v, width, height);
return 0;
case libyuv::kRotate270:
RotatePlane270(src_y, src_stride_y, dst_y, dst_stride_y,
width, height);
RotatePlane270(src_u, src_stride_u, dst_u, dst_stride_u,
width, height);
RotatePlane270(src_v, src_stride_v, dst_v, dst_stride_v,
width, height);
RotatePlane270(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
RotatePlane270(src_u, src_stride_u, dst_u, dst_stride_u, width, height);
RotatePlane270(src_v, src_stride_v, dst_v, dst_stride_v, width, height);
return 0;
case libyuv::kRotate180:
RotatePlane180(src_y, src_stride_y, dst_y, dst_stride_y,
width, height);
RotatePlane180(src_u, src_stride_u, dst_u, dst_stride_u,
width, height);
RotatePlane180(src_v, src_stride_v, dst_v, dst_stride_v,
width, height);
RotatePlane180(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
RotatePlane180(src_u, src_stride_u, dst_u, dst_stride_u, width, height);
RotatePlane180(src_v, src_stride_v, dst_v, dst_stride_v, width, height);
return 0;
default:
break;
......
......@@ -2685,6 +2685,84 @@ void ByteToFloatRow_NEON(const uint8_t* src,
: "cc", "memory", "q1", "q2", "q3");
}
// filter 5 rows with 1, 4, 6, 4, 1 coefficients to produce 1 row.
void GaussCol_NEON(const uint16_t* src0,
const uint16_t* src1,
const uint16_t* src2,
const uint16_t* src3,
const uint16_t* src4,
uint32_t* dst,
int width) {
asm volatile(
"vmov.u16 d6, #4 \n" // constant 4
"vmov.u16 d7, #6 \n" // constant 6
"1: \n"
"vld1.16 {q1}, [%0]! \n" // load 8 samples, 5 rows
"vld1.16 {q2}, [%4]! \n"
"vaddl.u16 q0, d2, d4 \n" // * 1
"vaddl.u16 q1, d3, d5 \n" // * 1
"vld1.16 {q2}, [%1]! \n"
"vmlal.u16 q0, d4, d6 \n" // * 4
"vmlal.u16 q1, d5, d6 \n" // * 4
"vld1.16 {q2}, [%2]! \n"
"vmlal.u16 q0, d4, d7 \n" // * 6
"vmlal.u16 q1, d5, d7 \n" // * 6
"vld1.16 {q2}, [%3]! \n"
"vmlal.u16 q0, d4, d6 \n" // * 4
"vmlal.u16 q1, d5, d6 \n" // * 4
"subs %6, %6, #8 \n" // 8 processed per loop
"vst1.32 {q0, q1}, [%5]! \n" // store 8 samples
"bgt 1b \n"
: "+r"(src0), // %0
"+r"(src1), // %1
"+r"(src2), // %2
"+r"(src3), // %3
"+r"(src4), // %4
"+r"(dst), // %5
"+r"(width) // %6
:
: "cc", "memory", "q0", "q1", "q2", "q3");
}
// filter 5 rows with 1, 4, 6, 4, 1 coefficients to produce 1 row.
void GaussRow_NEON(const uint32_t* src, uint16_t* dst, int width) {
const uint32_t* src1 = src + 1;
const uint32_t* src2 = src + 2;
const uint32_t* src3 = src + 3;
asm volatile(
"vmov.u32 q10, #4 \n" // constant 4
"vmov.u32 q11, #6 \n" // constant 6
"1: \n"
"vld1.32 {q0, q1}, [%0]! \n" // load 12 source samples
"vld1.32 {q2}, [%0] \n"
"vadd.u32 q0, q0, q1 \n" // * 1
"vadd.u32 q1, q1, q2 \n" // * 1
"vld1.32 {q2, q3}, [%2]! \n"
"vmla.u32 q0, q2, q11 \n" // * 6
"vmla.u32 q1, q3, q11 \n" // * 6
"vld1.32 {q2, q3}, [%1]! \n"
"vld1.32 {q8, q9}, [%3]! \n"
"vadd.u32 q2, q2, q8 \n" // add rows for * 4
"vadd.u32 q3, q3, q9 \n"
"vmla.u32 q0, q2, q10 \n" // * 4
"vmla.u32 q1, q3, q10 \n" // * 4
"subs %5, %5, #8 \n" // 8 processed per loop
"vqshrn.u32 d0, q0, #8 \n" // round and pack
"vqshrn.u32 d1, q1, #8 \n"
"vst1.u16 {q0}, [%4]! \n" // store 8 samples
"bgt 1b \n"
: "+r"(src), // %0
"+r"(src1), // %1
"+r"(src2), // %2
"+r"(src3), // %3
"+r"(dst), // %4
"+r"(width) // %5
:
: "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11");
}
// Convert biplanar NV21 to packed YUV24
void NV21ToYUV24Row_NEON(const uint8_t* src_y,
const uint8_t* src_vu,
......
......@@ -1824,6 +1824,39 @@ int I444Scale(const uint8_t* src_y,
return 0;
}
LIBYUV_API
int I444Scale_16(const uint16_t* src_y,
int src_stride_y,
const uint16_t* src_u,
int src_stride_u,
const uint16_t* src_v,
int src_stride_v,
int src_width,
int src_height,
uint16_t* dst_y,
int dst_stride_y,
uint16_t* dst_u,
int dst_stride_u,
uint16_t* dst_v,
int dst_stride_v,
int dst_width,
int dst_height,
enum FilterMode filtering) {
if (!src_y || !src_u || !src_v || src_width == 0 || src_height == 0 ||
src_width > 32768 || src_height > 32768 || !dst_y || !dst_u || !dst_v ||
dst_width <= 0 || dst_height <= 0) {
return -1;
}
ScalePlane_16(src_y, src_stride_y, src_width, src_height, dst_y, dst_stride_y,
dst_width, dst_height, filtering);
ScalePlane_16(src_u, src_stride_u, src_width, src_height, dst_u, dst_stride_u,
dst_width, dst_height, filtering);
ScalePlane_16(src_v, src_stride_v, src_width, src_height, dst_v, dst_stride_v,
dst_width, dst_height, filtering);
return 0;
}
// Deprecated api
LIBYUV_API
int Scale(const uint8_t* src_y,
......
......@@ -3186,7 +3186,8 @@ TEST_F(LibYUVPlanarTest, TestGaussRow_Opt) {
}
GaussRow_C(&orig_pixels[0], &dst_pixels_c[0], 640);
for (int i = 0; i < benchmark_pixels_div1280_ * 2; ++i) {
#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
#if !defined(LIBYUV_DISABLE_NEON) && \
(defined(__aarch64__) || defined(__ARM_NEON__) || defined(LIBYUV_NEON))
int has_neon = TestCpuFlag(kCpuHasNEON);
if (has_neon) {
GaussRow_NEON(&orig_pixels[0], &dst_pixels_opt[0], 640);
......@@ -3239,7 +3240,8 @@ TEST_F(LibYUVPlanarTest, TestGaussCol_Opt) {
&orig_pixels[640 * 3], &orig_pixels[640 * 4], &dst_pixels_c[0],
640);
for (int i = 0; i < benchmark_pixels_div1280_ * 2; ++i) {
#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
#if !defined(LIBYUV_DISABLE_NEON) && \
(defined(__aarch64__) || defined(__ARM_NEON__) || defined(LIBYUV_NEON))
int has_neon = TestCpuFlag(kCpuHasNEON);
if (has_neon) {
GaussCol_NEON(&orig_pixels[0], &orig_pixels[640], &orig_pixels[640 * 2],
......
......@@ -135,6 +135,123 @@ TEST_F(LibYUVRotateTest, DISABLED_I420Rotate270_Odd) {
benchmark_cpu_info_);
}
static void I444TestRotate(int src_width,
int src_height,
int dst_width,
int dst_height,
libyuv::RotationMode mode,
int benchmark_iterations,
int disable_cpu_flags,
int benchmark_cpu_info) {
if (src_width < 1) {
src_width = 1;
}
if (src_height == 0) {
src_height = 1;
}
if (dst_width < 1) {
dst_width = 1;
}
if (dst_height < 1) {
dst_height = 1;
}
int src_i444_y_size = src_width * Abs(src_height);
int src_i444_uv_size = src_width * Abs(src_height);
int src_i444_size = src_i444_y_size + src_i444_uv_size * 2;
align_buffer_page_end(src_i444, src_i444_size);
for (int i = 0; i < src_i444_size; ++i) {
src_i444[i] = fastrand() & 0xff;
}
int dst_i444_y_size = dst_width * dst_height;
int dst_i444_uv_size = dst_width * dst_height;
int dst_i444_size = dst_i444_y_size + dst_i444_uv_size * 2;
align_buffer_page_end(dst_i444_c, dst_i444_size);
align_buffer_page_end(dst_i444_opt, dst_i444_size);
memset(dst_i444_c, 2, dst_i444_size);
memset(dst_i444_opt, 3, dst_i444_size);
MaskCpuFlags(disable_cpu_flags); // Disable all CPU optimization.
I444Rotate(src_i444, src_width, src_i444 + src_i444_y_size, src_width,
src_i444 + src_i444_y_size + src_i444_uv_size, src_width,
dst_i444_c, dst_width, dst_i444_c + dst_i444_y_size, dst_width,
dst_i444_c + dst_i444_y_size + dst_i444_uv_size, dst_width,
src_width, src_height, mode);
MaskCpuFlags(benchmark_cpu_info); // Enable all CPU optimization.
for (int i = 0; i < benchmark_iterations; ++i) {
I444Rotate(src_i444, src_width, src_i444 + src_i444_y_size, src_width,
src_i444 + src_i444_y_size + src_i444_uv_size, src_width,
dst_i444_opt, dst_width, dst_i444_opt + dst_i444_y_size,
dst_width, dst_i444_opt + dst_i444_y_size + dst_i444_uv_size,
dst_width, src_width, src_height, mode);
}
// Rotation should be exact.
for (int i = 0; i < dst_i444_size; ++i) {
EXPECT_EQ(dst_i444_c[i], dst_i444_opt[i]);
}
free_aligned_buffer_page_end(dst_i444_c);
free_aligned_buffer_page_end(dst_i444_opt);
free_aligned_buffer_page_end(src_i444);
}
TEST_F(LibYUVRotateTest, I444Rotate0_Opt) {
I444TestRotate(benchmark_width_, benchmark_height_, benchmark_width_,
benchmark_height_, kRotate0, benchmark_iterations_,
disable_cpu_flags_, benchmark_cpu_info_);
}
TEST_F(LibYUVRotateTest, I444Rotate90_Opt) {
I444TestRotate(benchmark_width_, benchmark_height_, benchmark_height_,
benchmark_width_, kRotate90, benchmark_iterations_,
disable_cpu_flags_, benchmark_cpu_info_);
}
TEST_F(LibYUVRotateTest, I444Rotate180_Opt) {
I444TestRotate(benchmark_width_, benchmark_height_, benchmark_width_,
benchmark_height_, kRotate180, benchmark_iterations_,
disable_cpu_flags_, benchmark_cpu_info_);
}
TEST_F(LibYUVRotateTest, I444Rotate270_Opt) {
I444TestRotate(benchmark_width_, benchmark_height_, benchmark_height_,
benchmark_width_, kRotate270, benchmark_iterations_,
disable_cpu_flags_, benchmark_cpu_info_);
}
// TODO(fbarchard): Remove odd width tests.
// Odd width tests work but disabled because they use C code and can be
// tested by passing an odd width command line or environment variable.
TEST_F(LibYUVRotateTest, DISABLED_I444Rotate0_Odd) {
I444TestRotate(benchmark_width_ - 3, benchmark_height_ - 1,
benchmark_width_ - 3, benchmark_height_ - 1, kRotate0,
benchmark_iterations_, disable_cpu_flags_,
benchmark_cpu_info_);
}
TEST_F(LibYUVRotateTest, DISABLED_I444Rotate90_Odd) {
I444TestRotate(benchmark_width_ - 3, benchmark_height_ - 1,
benchmark_height_ - 1, benchmark_width_ - 3, kRotate90,
benchmark_iterations_, disable_cpu_flags_,
benchmark_cpu_info_);
}
TEST_F(LibYUVRotateTest, DISABLED_I444Rotate180_Odd) {
I444TestRotate(benchmark_width_ - 3, benchmark_height_ - 1,
benchmark_width_ - 3, benchmark_height_ - 1, kRotate180,
benchmark_iterations_, disable_cpu_flags_,
benchmark_cpu_info_);
}
TEST_F(LibYUVRotateTest, DISABLED_I444Rotate270_Odd) {
I444TestRotate(benchmark_width_ - 3, benchmark_height_ - 1,
benchmark_height_ - 1, benchmark_width_ - 3, kRotate270,
benchmark_iterations_, disable_cpu_flags_,
benchmark_cpu_info_);
}
static void NV12TestRotate(int src_width,
int src_height,
int dst_width,
......
This diff is collapsed.
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment