Commit 62a961be authored by fbarchard@google.com's avatar fbarchard@google.com

Neon version of I420ToNV12 and I420ToNV21. NV21ToI420 added as function. …

Neon version of I420ToNV12 and I420ToNV21.  NV21ToI420 added as function.  CopyRow changed to vld4.8 to allow unaligned copy.
BUG=none
TEST=none
Review URL: https://webrtc-codereview.appspot.com/922005

git-svn-id: http://libyuv.googlecode.com/svn/trunk@435 16f28f9a-4ce2-e073-06de-1de4eb20be90
parent 66fe097a
Name: libyuv Name: libyuv
URL: http://code.google.com/p/libyuv/ URL: http://code.google.com/p/libyuv/
Version: 433 Version: 435
License: BSD License: BSD
License File: LICENSE License File: LICENSE
......
...@@ -73,7 +73,7 @@ int I400ToI420(const uint8* src_y, int src_stride_y, ...@@ -73,7 +73,7 @@ int I400ToI420(const uint8* src_y, int src_stride_y,
uint8* dst_v, int dst_stride_v, uint8* dst_v, int dst_stride_v,
int width, int height); int width, int height);
// Convert NV12 to I420. Also used for NV21. // Convert NV12 to I420.
LIBYUV_API LIBYUV_API
int NV12ToI420(const uint8* src_y, int src_stride_y, int NV12ToI420(const uint8* src_y, int src_stride_y,
const uint8* src_uv, int src_stride_uv, const uint8* src_uv, int src_stride_uv,
...@@ -82,6 +82,15 @@ int NV12ToI420(const uint8* src_y, int src_stride_y, ...@@ -82,6 +82,15 @@ int NV12ToI420(const uint8* src_y, int src_stride_y,
uint8* dst_v, int dst_stride_v, uint8* dst_v, int dst_stride_v,
int width, int height); int width, int height);
// Convert NV21 to I420.
// Reads a Y plane (src_y) and a second plane (src_vu) and writes separate
// Y, U and V planes.
// NOTE(review): src_vu is presumably the interleaved V,U chroma plane of
// NV21 - confirm against the implementation.
// Strides are passed per plane; width and height describe the image.
LIBYUV_API
int NV21ToI420(const uint8* src_y, int src_stride_y,
const uint8* src_vu, int src_stride_vu,
uint8* dst_y, int dst_stride_y,
uint8* dst_u, int dst_stride_u,
uint8* dst_v, int dst_stride_v,
int width, int height);
// Convert M420 to I420. // Convert M420 to I420.
LIBYUV_API LIBYUV_API
int M420ToI420(const uint8* src_m420, int src_stride_m420, int M420ToI420(const uint8* src_m420, int src_stride_m420,
......
...@@ -56,10 +56,25 @@ int I400Copy(const uint8* src_y, int src_stride_y, ...@@ -56,10 +56,25 @@ int I400Copy(const uint8* src_y, int src_stride_y,
uint8* dst_y, int dst_stride_y, uint8* dst_y, int dst_stride_y,
int width, int height); int width, int height);
// TODO(fbarchard): I420ToNV12
// TODO(fbarchard): I420ToM420 // TODO(fbarchard): I420ToM420
// TODO(fbarchard): I420ToQ420 // TODO(fbarchard): I420ToQ420
// Convert I420 to NV12.
// Copies the Y plane and merges each row of the separate U and V planes into
// the single dst_uv plane as interleaved U,V bytes.
// A negative height inverts the image.
// Returns 0 on success, or -1 if any pointer is NULL, width <= 0 or
// height == 0.
LIBYUV_API
int I420ToNV12(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
uint8* dst_y, int dst_stride_y,
uint8* dst_uv, int dst_stride_uv,
int width, int height);
// Convert I420 to NV21.
// Implemented by calling I420ToNV12 with the U and V source planes swapped,
// so the merged chroma plane (dst_vu) is written in V,U byte order.
// Return value is whatever I420ToNV12 returns.
LIBYUV_API
int I420ToNV21(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
uint8* dst_y, int dst_stride_y,
uint8* dst_vu, int dst_stride_vu,
int width, int height);
LIBYUV_API LIBYUV_API
int I420ToYUY2(const uint8* src_y, int src_stride_y, int I420ToYUY2(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u, const uint8* src_u, int src_stride_u,
......
...@@ -168,6 +168,7 @@ extern "C" { ...@@ -168,6 +168,7 @@ extern "C" {
#define HAS_ARGBTOARGB1555ROW_NEON #define HAS_ARGBTOARGB1555ROW_NEON
#define HAS_ARGBTOARGB4444ROW_NEON #define HAS_ARGBTOARGB4444ROW_NEON
#define HAS_ARGBTOYROW_NEON #define HAS_ARGBTOYROW_NEON
#define HAS_MERGEUV_NEON
#endif #endif
// The following are available on Mips platforms // The following are available on Mips platforms
...@@ -308,6 +309,11 @@ void SplitUV_Any_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix); ...@@ -308,6 +309,11 @@ void SplitUV_Any_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix);
void SplitUV_Any_MIPS_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, void SplitUV_Any_MIPS_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
int pix); int pix);
// Interleave one row of U and V: reads width bytes from src_u and src_v and
// writes 2 * width bytes to dst_uv in U,V,U,V... order.
void MergeUV_C(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
int width);
// NEON version of MergeUV; processes 16 U and 16 V bytes per loop iteration.
void MergeUV_NEON(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
int width);
void CopyRow_SSE2(const uint8* src, uint8* dst, int count); void CopyRow_SSE2(const uint8* src, uint8* dst, int count);
void CopyRow_X86(const uint8* src, uint8* dst, int count); void CopyRow_X86(const uint8* src, uint8* dst, int count);
void CopyRow_NEON(const uint8* src, uint8* dst, int count); void CopyRow_NEON(const uint8* src, uint8* dst, int count);
......
...@@ -11,6 +11,6 @@ ...@@ -11,6 +11,6 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT #ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT
#define INCLUDE_LIBYUV_VERSION_H_ #define INCLUDE_LIBYUV_VERSION_H_
#define LIBYUV_VERSION 433 #define LIBYUV_VERSION 435
#endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT #endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT
...@@ -302,7 +302,7 @@ static void CopyPlane2(const uint8* src, int src_stride_0, int src_stride_1, ...@@ -302,7 +302,7 @@ static void CopyPlane2(const uint8* src, int src_stride_0, int src_stride_1,
int width, int height) { int width, int height) {
void (*CopyRow)(const uint8* src, uint8* dst, int width) = CopyRow_C; void (*CopyRow)(const uint8* src, uint8* dst, int width) = CopyRow_C;
#if defined(HAS_COPYROW_NEON) #if defined(HAS_COPYROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 64)) { if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 32)) {
CopyRow = CopyRow_NEON; CopyRow = CopyRow_NEON;
} }
#elif defined(HAS_COPYROW_X86) #elif defined(HAS_COPYROW_X86)
...@@ -460,6 +460,22 @@ int NV12ToI420(const uint8* src_y, int src_stride_y, ...@@ -460,6 +460,22 @@ int NV12ToI420(const uint8* src_y, int src_stride_y,
width, height); width, height);
} }
// Convert NV21 to I420. Same as NV12 but u and v pointers swapped.
// Delegates all work to X420ToI420 and returns its result unchanged.
// NOTE(review): X420ToI420 is not visible here; src_stride_y is passed twice,
// presumably as the strides for alternating Y rows - confirm against its
// signature.
LIBYUV_API
int NV21ToI420(const uint8* src_y, int src_stride_y,
const uint8* src_vu, int src_stride_vu,
uint8* dst_y, int dst_stride_y,
uint8* dst_u, int dst_stride_u,
uint8* dst_v, int dst_stride_v,
int width, int height) {
return X420ToI420(src_y, src_stride_y, src_stride_y,
src_vu, src_stride_vu,
dst_y, dst_stride_y,
// Destination chroma planes are handed over in v, u order (the swap).
dst_v, dst_stride_v,
dst_u, dst_stride_u,
width, height);
}
// Convert M420 to I420. // Convert M420 to I420.
LIBYUV_API LIBYUV_API
int M420ToI420(const uint8* src_m420, int src_stride_m420, int M420ToI420(const uint8* src_m420, int src_stride_m420,
...@@ -503,7 +519,7 @@ int Q420ToI420(const uint8* src_y, int src_stride_y, ...@@ -503,7 +519,7 @@ int Q420ToI420(const uint8* src_y, int src_stride_y,
// CopyRow for rows of just Y in Q420 copied to Y plane of I420. // CopyRow for rows of just Y in Q420 copied to Y plane of I420.
void (*CopyRow)(const uint8* src, uint8* dst, int width) = CopyRow_C; void (*CopyRow)(const uint8* src, uint8* dst, int width) = CopyRow_C;
#if defined(HAS_COPYROW_NEON) #if defined(HAS_COPYROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 64)) { if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 32)) {
CopyRow = CopyRow_NEON; CopyRow = CopyRow_NEON;
} }
#endif #endif
......
...@@ -50,7 +50,7 @@ int I420ToI422(const uint8* src_y, int src_stride_y, ...@@ -50,7 +50,7 @@ int I420ToI422(const uint8* src_y, int src_stride_y,
int halfwidth = (width + 1) >> 1; int halfwidth = (width + 1) >> 1;
void (*CopyRow)(const uint8* src, uint8* dst, int width) = CopyRow_C; void (*CopyRow)(const uint8* src, uint8* dst, int width) = CopyRow_C;
#if defined(HAS_COPYROW_NEON) #if defined(HAS_COPYROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(halfwidth, 64)) { if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(halfwidth, 32)) {
CopyRow = CopyRow_NEON; CopyRow = CopyRow_NEON;
} }
#elif defined(HAS_COPYROW_X86) #elif defined(HAS_COPYROW_X86)
...@@ -477,6 +477,62 @@ int I420ToV210(const uint8* src_y, int src_stride_y, ...@@ -477,6 +477,62 @@ int I420ToV210(const uint8* src_y, int src_stride_y,
return 0; return 0;
} }
// Convert I420 to NV12.
// Copies the Y plane unchanged and interleaves every row of the U and V
// planes into dst_uv (U byte first, then V). The chroma planes are
// (width + 1) / 2 samples wide and (height + 1) / 2 rows tall.
// A negative height means invert the image: the destination is written
// bottom-up.
// Returns 0 on success, or -1 if any pointer is NULL, width <= 0 or
// height == 0.
LIBYUV_API
int I420ToNV12(const uint8* src_y, int src_stride_y,
               const uint8* src_u, int src_stride_u,
               const uint8* src_v, int src_stride_v,
               uint8* dst_y, int dst_stride_y,
               uint8* dst_uv, int dst_stride_uv,
               int width, int height) {
  if (!src_y || !src_u || !src_v || !dst_y || !dst_uv ||
      width <= 0 || height == 0) {
    return -1;
  }
  // Negative height means invert the image.
  const int invert = height < 0;
  if (invert) {
    height = -height;
  }
  const int halfwidth = (width + 1) >> 1;
  const int halfheight = (height + 1) >> 1;
  if (invert) {
    // Start at the last destination row of each plane and walk upwards.
    dst_y = dst_y + (height - 1) * dst_stride_y;
    dst_uv = dst_uv + (halfheight - 1) * dst_stride_uv;
    dst_stride_y = -dst_stride_y;
    dst_stride_uv = -dst_stride_uv;
  }

  // Pick the row function. The guard must be the MergeUV feature macro (it
  // was HAS_SPLITUV_NEON), because MergeUV_NEON is only compiled when
  // HAS_MERGEUV_NEON is defined.
  // NOTE(review): this makes MergeUV_NEON live on NEON builds; confirm its
  // asm operand numbering matches (src_u, src_v, dst_uv) before shipping.
  void (*MergeUV)(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
                  int width) = MergeUV_C;
#if defined(HAS_MERGEUV_NEON)
  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(halfwidth, 16)) {
    MergeUV = MergeUV_NEON;
  }
#endif

  CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);

  for (int y = 0; y < halfheight; ++y) {
    // Merge a row of U and V into UV through the selected row function
    // (previously MergeUV_C was called directly, ignoring the selection).
    MergeUV(src_u, src_v, dst_uv, halfwidth);
    src_u += src_stride_u;
    src_v += src_stride_v;
    dst_uv += dst_stride_uv;
  }
  return 0;
}
// Convert I420 to NV21.
// NV21 is NV12 with the chroma bytes in V,U order, so this forwards to
// I420ToNV12 with the U and V source planes (and their strides) swapped.
// Returns whatever I420ToNV12 returns.
LIBYUV_API
int I420ToNV21(const uint8* src_y, int src_stride_y,
               const uint8* src_u, int src_stride_u,
               const uint8* src_v, int src_stride_v,
               uint8* dst_y, int dst_stride_y,
               uint8* dst_vu, int dst_stride_vu,
               int width, int height) {
  return I420ToNV12(src_y, src_stride_y,
                    src_v, src_stride_v,
                    src_u, src_stride_u,
                    // Destination Y rows advance by the destination stride
                    // (src_stride_y was passed here by mistake).
                    dst_y, dst_stride_y,
                    dst_vu, dst_stride_vu,
                    width, height);
}
// Convert I420 to ARGB. // Convert I420 to ARGB.
LIBYUV_API LIBYUV_API
int I420ToARGB(const uint8* src_y, int src_stride_y, int I420ToARGB(const uint8* src_y, int src_stride_y,
......
...@@ -30,7 +30,7 @@ void CopyPlane(const uint8* src_y, int src_stride_y, ...@@ -30,7 +30,7 @@ void CopyPlane(const uint8* src_y, int src_stride_y,
int width, int height) { int width, int height) {
void (*CopyRow)(const uint8* src, uint8* dst, int width) = CopyRow_C; void (*CopyRow)(const uint8* src, uint8* dst, int width) = CopyRow_C;
#if defined(HAS_COPYROW_NEON) #if defined(HAS_COPYROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 64)) { if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 32)) {
CopyRow = CopyRow_NEON; CopyRow = CopyRow_NEON;
} }
#endif #endif
......
...@@ -859,7 +859,7 @@ void RotatePlane180(const uint8* src, int src_stride, ...@@ -859,7 +859,7 @@ void RotatePlane180(const uint8* src, int src_stride,
#endif #endif
void (*CopyRow)(const uint8* src, uint8* dst, int width) = CopyRow_C; void (*CopyRow)(const uint8* src, uint8* dst, int width) = CopyRow_C;
#if defined(HAS_COPYROW_NEON) #if defined(HAS_COPYROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 64)) { if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 32)) {
CopyRow = CopyRow_NEON; CopyRow = CopyRow_NEON;
} }
#endif #endif
......
...@@ -90,7 +90,7 @@ void ARGBRotate180(const uint8* src, int src_stride, ...@@ -90,7 +90,7 @@ void ARGBRotate180(const uint8* src, int src_stride,
#endif #endif
void (*CopyRow)(const uint8* src, uint8* dst, int width) = CopyRow_C; void (*CopyRow)(const uint8* src, uint8* dst, int width) = CopyRow_C;
#if defined(HAS_COPYROW_NEON) #if defined(HAS_COPYROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width * 4, 64)) { if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width * 4, 32)) {
CopyRow = CopyRow_NEON; CopyRow = CopyRow_NEON;
} }
#endif #endif
......
...@@ -717,6 +717,21 @@ void SplitUV_C(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int width) { ...@@ -717,6 +717,21 @@ void SplitUV_C(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int width) {
} }
} }
void MergeUV_C(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
int width) {
for (int x = 0; x < width - 1; x += 2) {
dst_uv[0] = src_u[x];
dst_uv[1] = src_v[x];
dst_uv[2] = src_u[x + 1];
dst_uv[3] = src_v[x + 1];
dst_uv += 4;
}
if (width & 1) {
dst_uv[0] = src_u[width - 1];
dst_uv[1] = src_v[width - 1];
}
}
void CopyRow_C(const uint8* src, uint8* dst, int count) { void CopyRow_C(const uint8* src, uint8* dst, int count) {
memcpy(dst, src, count); memcpy(dst, src, count);
} }
......
...@@ -345,7 +345,7 @@ void SplitUV_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int width) { ...@@ -345,7 +345,7 @@ void SplitUV_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int width) {
"vld2.u8 {q0, q1}, [%0:128]! \n" // load 16 pairs of UV "vld2.u8 {q0, q1}, [%0:128]! \n" // load 16 pairs of UV
"subs %3, %3, #16 \n" // 16 processed per loop "subs %3, %3, #16 \n" // 16 processed per loop
"vst1.u8 {q0}, [%1:128]! \n" // store U "vst1.u8 {q0}, [%1:128]! \n" // store U
"vst1.u8 {q1}, [%2:128]! \n" // Store V "vst1.u8 {q1}, [%2:128]! \n" // store V
"bgt 1b \n" "bgt 1b \n"
: "+r"(src_uv), // %0 : "+r"(src_uv), // %0
"+r"(dst_u), // %1 "+r"(dst_u), // %1
...@@ -355,6 +355,7 @@ void SplitUV_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int width) { ...@@ -355,6 +355,7 @@ void SplitUV_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int width) {
: "memory", "cc", "q0", "q1" // Clobber List : "memory", "cc", "q0", "q1" // Clobber List
); );
} }
// Reads 16 pairs of UV and write even values to dst_u and odd to dst_v // Reads 16 pairs of UV and write even values to dst_u and odd to dst_v
// Alignment requirement: Multiple of 16 pixels, pointers unaligned. // Alignment requirement: Multiple of 16 pixels, pointers unaligned.
void SplitUV_Unaligned_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v, void SplitUV_Unaligned_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
...@@ -365,7 +366,7 @@ void SplitUV_Unaligned_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v, ...@@ -365,7 +366,7 @@ void SplitUV_Unaligned_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
"vld2.u8 {q0, q1}, [%0]! \n" // load 16 pairs of UV "vld2.u8 {q0, q1}, [%0]! \n" // load 16 pairs of UV
"subs %3, %3, #16 \n" // 16 processed per loop "subs %3, %3, #16 \n" // 16 processed per loop
"vst1.u8 {q0}, [%1]! \n" // store U "vst1.u8 {q0}, [%1]! \n" // store U
"vst1.u8 {q1}, [%2]! \n" // Store V "vst1.u8 {q1}, [%2]! \n" // store V
"bgt 1b \n" "bgt 1b \n"
: "+r"(src_uv), // %0 : "+r"(src_uv), // %0
"+r"(dst_u), // %1 "+r"(dst_u), // %1
...@@ -377,21 +378,43 @@ void SplitUV_Unaligned_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v, ...@@ -377,21 +378,43 @@ void SplitUV_Unaligned_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
} }
#endif // HAS_SPLITUV_NEON #endif // HAS_SPLITUV_NEON
#ifdef HAS_MERGEUV_NEON
// Reads 16 U's and V's and writes out 16 pairs of UV.
// Each loop iteration consumes 16 bytes from src_u and src_v and the counter
// is only tested after the store, so width is expected to be a positive
// multiple of 16.
// The asm operand numbers now follow the operand list below (%0 = src_u,
// %1 = src_v, %2 = dst_uv). Previously the loads read from src_v and dst_uv
// and the store wrote through src_u.
void MergeUV_NEON(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
                  int width) {
  asm volatile (
    ".p2align  2                               \n"
  "1:                                          \n"
    "vld1.u8    {q0}, [%0]!                    \n"  // load U
    "vld1.u8    {q1}, [%1]!                    \n"  // load V
    "subs       %3, %3, #16                    \n"  // 16 processed per loop
    "vst2.u8    {q0, q1}, [%2]!                \n"  // store 16 pairs of UV
    "bgt        1b                             \n"
    :
      "+r"(src_u),                // %0
      "+r"(src_v),                // %1
      "+r"(dst_uv),               // %2
      "+r"(width)                 // %3  // Output registers
    :                             // Input registers
    : "memory", "cc", "q0", "q1"  // Clobber List
  );
}
#endif  // HAS_MERGEUV_NEON
#ifdef HAS_COPYROW_NEON #ifdef HAS_COPYROW_NEON
// Copy multiple of 64 // Copy multiple of 32. vld4.u8 allow unaligned and is fastest on a15.
void CopyRow_NEON(const uint8* src, uint8* dst, int count) { void CopyRow_NEON(const uint8* src, uint8* dst, int count) {
asm volatile ( asm volatile (
".p2align 2 \n" ".p2align 2 \n"
"1: \n" "1: \n"
"vldm %0!, {q0, q1, q2, q3} \n" // load 64 "vld4.u8 {d0, d1, d2, d3}, [%0]! \n" // load 32
"subs %2, %2, #64 \n" // 64 processed per loop "subs %2, %2, #32 \n" // 32 processed per loop
"vstm %1!, {q0, q1, q2, q3} \n" // store 64 "vst4.u8 {d0, d1, d2, d3}, [%1]! \n" // store 32
"bgt 1b \n" "bgt 1b \n"
: "+r"(src), // %0 : "+r"(src), // %0
"+r"(dst), // %1 "+r"(dst), // %1
"+r"(count) // %2 // Output registers "+r"(count) // %2 // Output registers
: // Input registers : // Input registers
: "memory", "cc", "q0", "q1", "q2", "q3" // Clobber List : "memory", "cc", "q0", "q1" // Clobber List
); );
} }
#endif // HAS_COPYROW_NEON #endif // HAS_COPYROW_NEON
...@@ -403,7 +426,7 @@ void SetRow8_NEON(uint8* dst, uint32 v32, int count) { ...@@ -403,7 +426,7 @@ void SetRow8_NEON(uint8* dst, uint32 v32, int count) {
"vdup.u32 q0, %2 \n" // duplicate 4 ints "vdup.u32 q0, %2 \n" // duplicate 4 ints
"1: \n" "1: \n"
"subs %1, %1, #16 \n" // 16 bytes per loop "subs %1, %1, #16 \n" // 16 bytes per loop
"vst1.u32 {q0}, [%0]! \n" // store "vst1.u8 {q0}, [%0]! \n" // store
"bgt 1b \n" "bgt 1b \n"
: "+r"(dst), // %0 : "+r"(dst), // %0
"+r"(count) // %1 "+r"(count) // %1
......
This diff is collapsed.
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment