Commit 62a961be authored by fbarchard@google.com's avatar fbarchard@google.com

Neon version of I420ToNV12 and I420ToNV21. NV21ToI420 added as function. …

Neon version of I420ToNV12 and I420ToNV21.  NV21ToI420 added as function.  CopyRow changed to vld4.8 to allow unaligned copy.
BUG=none
TEST=none
Review URL: https://webrtc-codereview.appspot.com/922005

git-svn-id: http://libyuv.googlecode.com/svn/trunk@435 16f28f9a-4ce2-e073-06de-1de4eb20be90
parent 66fe097a
Name: libyuv
URL: http://code.google.com/p/libyuv/
Version: 433
Version: 435
License: BSD
License File: LICENSE
......
......@@ -73,7 +73,7 @@ int I400ToI420(const uint8* src_y, int src_stride_y,
uint8* dst_v, int dst_stride_v,
int width, int height);
// Convert NV12 to I420. Also used for NV21.
// Convert NV12 to I420.
LIBYUV_API
int NV12ToI420(const uint8* src_y, int src_stride_y,
const uint8* src_uv, int src_stride_uv,
......@@ -82,6 +82,15 @@ int NV12ToI420(const uint8* src_y, int src_stride_y,
uint8* dst_v, int dst_stride_v,
int width, int height);
// Convert NV21 to I420.
LIBYUV_API
int NV21ToI420(const uint8* src_y, int src_stride_y,
const uint8* src_vu, int src_stride_vu,
uint8* dst_y, int dst_stride_y,
uint8* dst_u, int dst_stride_u,
uint8* dst_v, int dst_stride_v,
int width, int height);
// Convert M420 to I420.
LIBYUV_API
int M420ToI420(const uint8* src_m420, int src_stride_m420,
......
......@@ -56,10 +56,25 @@ int I400Copy(const uint8* src_y, int src_stride_y,
uint8* dst_y, int dst_stride_y,
int width, int height);
// TODO(fbarchard): I420ToNV12
// TODO(fbarchard): I420ToM420
// TODO(fbarchard): I420ToQ420
LIBYUV_API
int I420ToNV12(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
uint8* dst_y, int dst_stride_y,
uint8* dst_uv, int dst_stride_uv,
int width, int height);
LIBYUV_API
int I420ToNV21(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
uint8* dst_y, int dst_stride_y,
uint8* dst_vu, int dst_stride_vu,
int width, int height);
LIBYUV_API
int I420ToYUY2(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
......
......@@ -168,6 +168,7 @@ extern "C" {
#define HAS_ARGBTOARGB1555ROW_NEON
#define HAS_ARGBTOARGB4444ROW_NEON
#define HAS_ARGBTOYROW_NEON
#define HAS_MERGEUV_NEON
#endif
// The following are available on Mips platforms
......@@ -308,6 +309,11 @@ void SplitUV_Any_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix);
void SplitUV_Any_MIPS_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
int pix);
void MergeUV_C(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
int width);
void MergeUV_NEON(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
int width);
void CopyRow_SSE2(const uint8* src, uint8* dst, int count);
void CopyRow_X86(const uint8* src, uint8* dst, int count);
void CopyRow_NEON(const uint8* src, uint8* dst, int count);
......
......@@ -11,6 +11,6 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT
#define INCLUDE_LIBYUV_VERSION_H_
#define LIBYUV_VERSION 433
#define LIBYUV_VERSION 435
#endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT
......@@ -302,7 +302,7 @@ static void CopyPlane2(const uint8* src, int src_stride_0, int src_stride_1,
int width, int height) {
void (*CopyRow)(const uint8* src, uint8* dst, int width) = CopyRow_C;
#if defined(HAS_COPYROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 64)) {
if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 32)) {
CopyRow = CopyRow_NEON;
}
#elif defined(HAS_COPYROW_X86)
......@@ -460,6 +460,22 @@ int NV12ToI420(const uint8* src_y, int src_stride_y,
width, height);
}
// Convert NV21 to I420. Same as NV12 but u and v pointers swapped.
LIBYUV_API
int NV21ToI420(const uint8* src_y, int src_stride_y,
const uint8* src_vu, int src_stride_vu,
uint8* dst_y, int dst_stride_y,
uint8* dst_u, int dst_stride_u,
uint8* dst_v, int dst_stride_v,
int width, int height) {
return X420ToI420(src_y, src_stride_y, src_stride_y,
src_vu, src_stride_vu,
dst_y, dst_stride_y,
dst_v, dst_stride_v,
dst_u, dst_stride_u,
width, height);
}
// Convert M420 to I420.
LIBYUV_API
int M420ToI420(const uint8* src_m420, int src_stride_m420,
......@@ -503,7 +519,7 @@ int Q420ToI420(const uint8* src_y, int src_stride_y,
// CopyRow for rows of just Y in Q420 copied to Y plane of I420.
void (*CopyRow)(const uint8* src, uint8* dst, int width) = CopyRow_C;
#if defined(HAS_COPYROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 64)) {
if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 32)) {
CopyRow = CopyRow_NEON;
}
#endif
......
......@@ -50,7 +50,7 @@ int I420ToI422(const uint8* src_y, int src_stride_y,
int halfwidth = (width + 1) >> 1;
void (*CopyRow)(const uint8* src, uint8* dst, int width) = CopyRow_C;
#if defined(HAS_COPYROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(halfwidth, 64)) {
if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(halfwidth, 32)) {
CopyRow = CopyRow_NEON;
}
#elif defined(HAS_COPYROW_X86)
......@@ -477,6 +477,62 @@ int I420ToV210(const uint8* src_y, int src_stride_y,
return 0;
}
LIBYUV_API
int I420ToNV12(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
uint8* dst_y, int dst_stride_y,
uint8* dst_uv, int dst_stride_uv,
int width, int height) {
if (!src_y || !src_u || !src_v || !dst_y || !dst_uv ||
width <= 0 || height == 0) {
return -1;
}
// Negative height means invert the image.
if (height < 0) {
height = -height;
int halfheight = (height + 1) >> 1;
dst_y = dst_y + (height - 1) * dst_stride_y;
dst_uv = dst_uv + (halfheight - 1) * dst_stride_uv;
dst_stride_y = -dst_stride_y;
dst_stride_uv = -dst_stride_uv;
}
int halfwidth = (width + 1) >> 1;
void (*MergeUV)(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
int width) = MergeUV_C;
#if defined(HAS_SPLITUV_NEON)
if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(halfwidth, 16)) {
MergeUV = MergeUV_NEON;
}
#endif
CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
int halfheight = (height + 1) >> 1;
for (int y = 0; y < halfheight; ++y) {
// Copy a row of UV.
MergeUV_C(src_u, src_v, dst_uv, halfwidth);
src_u += src_stride_u;
src_v += src_stride_v;
dst_uv += dst_stride_uv;
}
return 0;
}
LIBYUV_API
int I420ToNV21(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
uint8* dst_y, int dst_stride_y,
uint8* dst_vu, int dst_stride_vu,
int width, int height) {
return I420ToNV12(src_y, src_stride_y,
src_v, src_stride_v,
src_u, src_stride_u,
dst_y, src_stride_y,
dst_vu, dst_stride_vu,
width, height);
}
// Convert I420 to ARGB.
LIBYUV_API
int I420ToARGB(const uint8* src_y, int src_stride_y,
......
......@@ -30,7 +30,7 @@ void CopyPlane(const uint8* src_y, int src_stride_y,
int width, int height) {
void (*CopyRow)(const uint8* src, uint8* dst, int width) = CopyRow_C;
#if defined(HAS_COPYROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 64)) {
if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 32)) {
CopyRow = CopyRow_NEON;
}
#endif
......
......@@ -859,7 +859,7 @@ void RotatePlane180(const uint8* src, int src_stride,
#endif
void (*CopyRow)(const uint8* src, uint8* dst, int width) = CopyRow_C;
#if defined(HAS_COPYROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 64)) {
if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 32)) {
CopyRow = CopyRow_NEON;
}
#endif
......
......@@ -90,7 +90,7 @@ void ARGBRotate180(const uint8* src, int src_stride,
#endif
void (*CopyRow)(const uint8* src, uint8* dst, int width) = CopyRow_C;
#if defined(HAS_COPYROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width * 4, 64)) {
if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width * 4, 32)) {
CopyRow = CopyRow_NEON;
}
#endif
......
......@@ -717,6 +717,21 @@ void SplitUV_C(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int width) {
}
}
void MergeUV_C(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
int width) {
for (int x = 0; x < width - 1; x += 2) {
dst_uv[0] = src_u[x];
dst_uv[1] = src_v[x];
dst_uv[2] = src_u[x + 1];
dst_uv[3] = src_v[x + 1];
dst_uv += 4;
}
if (width & 1) {
dst_uv[0] = src_u[width - 1];
dst_uv[1] = src_v[width - 1];
}
}
void CopyRow_C(const uint8* src, uint8* dst, int count) {
memcpy(dst, src, count);
}
......
......@@ -345,7 +345,7 @@ void SplitUV_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int width) {
"vld2.u8 {q0, q1}, [%0:128]! \n" // load 16 pairs of UV
"subs %3, %3, #16 \n" // 16 processed per loop
"vst1.u8 {q0}, [%1:128]! \n" // store U
"vst1.u8 {q1}, [%2:128]! \n" // Store V
"vst1.u8 {q1}, [%2:128]! \n" // store V
"bgt 1b \n"
: "+r"(src_uv), // %0
"+r"(dst_u), // %1
......@@ -355,6 +355,7 @@ void SplitUV_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int width) {
: "memory", "cc", "q0", "q1" // Clobber List
);
}
// Reads 16 pairs of UV and write even values to dst_u and odd to dst_v
// Alignment requirement: Multiple of 16 pixels, pointers unaligned.
void SplitUV_Unaligned_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
......@@ -365,7 +366,7 @@ void SplitUV_Unaligned_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
"vld2.u8 {q0, q1}, [%0]! \n" // load 16 pairs of UV
"subs %3, %3, #16 \n" // 16 processed per loop
"vst1.u8 {q0}, [%1]! \n" // store U
"vst1.u8 {q1}, [%2]! \n" // Store V
"vst1.u8 {q1}, [%2]! \n" // store V
"bgt 1b \n"
: "+r"(src_uv), // %0
"+r"(dst_u), // %1
......@@ -377,21 +378,43 @@ void SplitUV_Unaligned_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
}
#endif // HAS_SPLITUV_NEON
#ifdef HAS_MERGEUV_NEON
// Reads 16 U's and V's and writes out 16 pairs of UV.
void MergeUV_NEON(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
int width) {
asm volatile (
".p2align 2 \n"
"1: \n"
"vld1.u8 {q0}, [%1]! \n" // load U
"vld1.u8 {q1}, [%2]! \n" // load V
"subs %3, %3, #16 \n" // 16 processed per loop
"vst2.u8 {q0, q1}, [%0]! \n" // store 16 pairs of UV
"bgt 1b \n"
:
"+r"(src_u), // %0
"+r"(src_v), // %1
"+r"(dst_uv), // %2
"+r"(width) // %3 // Output registers
: // Input registers
: "memory", "cc", "q0", "q1" // Clobber List
);
}
#endif // HAS_MERGEUV_NEON
#ifdef HAS_COPYROW_NEON
// Copy multiple of 64
// Copy multiple of 32. vld4.u8 allow unaligned and is fastest on a15.
void CopyRow_NEON(const uint8* src, uint8* dst, int count) {
asm volatile (
".p2align 2 \n"
"1: \n"
"vldm %0!, {q0, q1, q2, q3} \n" // load 64
"subs %2, %2, #64 \n" // 64 processed per loop
"vstm %1!, {q0, q1, q2, q3} \n" // store 64
"vld4.u8 {d0, d1, d2, d3}, [%0]! \n" // load 32
"subs %2, %2, #32 \n" // 32 processed per loop
"vst4.u8 {d0, d1, d2, d3}, [%1]! \n" // store 32
"bgt 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(count) // %2 // Output registers
: // Input registers
: "memory", "cc", "q0", "q1", "q2", "q3" // Clobber List
: "memory", "cc", "q0", "q1" // Clobber List
);
}
#endif // HAS_COPYROW_NEON
......@@ -403,7 +426,7 @@ void SetRow8_NEON(uint8* dst, uint32 v32, int count) {
"vdup.u32 q0, %2 \n" // duplicate 4 ints
"1: \n"
"subs %1, %1, #16 \n" // 16 bytes per loop
"vst1.u32 {q0}, [%0]! \n" // store
"vst1.u8 {q0}, [%0]! \n" // store
"bgt 1b \n"
: "+r"(dst), // %0
"+r"(count) // %1
......
This diff is collapsed.
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment