Commit e0d8648b authored by fbarchard@google.com

MergeUV aligned and SplitUV cpu detect combined with width check.

BUG=none
TEST=libyuvTest.I420ToNV12_Any
Review URL: https://webrtc-codereview.appspot.com/937005

git-svn-id: http://libyuv.googlecode.com/svn/trunk@451 16f28f9a-4ce2-e073-06de-1de4eb20be90
parent 818b7102
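In outline, the change gives each SIMD path a three-tier selection — an "Any" wrapper for arbitrary widths, an "Unaligned" kernel for exact multiples of the vector width, and the fully aligned kernel — and folds the CPU-feature test and the minimum-width test into a single condition. A minimal C sketch of that shape, lifted from the I420ToNV12 MergeUV selection in the diff below (SSE2 case only; the preprocessor guard and the AVX2/NEON branches are omitted here):

  // Sketch of the dispatch pattern this commit applies; names are the
  // libyuv ones, the surrounding #if guards are left out for brevity.
  void (*MergeUV)(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
                  int width) = MergeUV_C;        // scalar fallback
  // CPU detect combined with the minimum-width check in one condition:
  if (TestCpuFlag(kCpuHasSSE2) && halfwidth >= 16) {
    MergeUV = MergeUV_Any_SSE2;                  // any width >= 16
    if (IS_ALIGNED(halfwidth, 16)) {
      MergeUV = MergeUV_Unaligned_SSE2;          // multiple of 16 pixels
      if (IS_ALIGNED(src_u, 16) && IS_ALIGNED(src_stride_u, 16) &&
          IS_ALIGNED(src_v, 16) && IS_ALIGNED(src_stride_v, 16) &&
          IS_ALIGNED(dst_uv, 16) && IS_ALIGNED(dst_stride_uv, 16)) {
        MergeUV = MergeUV_SSE2;                  // 16-byte aligned fast path
      }
    }
  }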
 Name: libyuv
 URL: http://code.google.com/p/libyuv/
-Version: 449
+Version: 451
 License: BSD
 License File: LICENSE
...
@@ -121,7 +121,8 @@ extern "C" {
 #define HAS_UYVYTOYROW_AVX2
 #define HAS_YUY2TOYROW_MMX
 #define HAS_UYVYTOYROW_MMX
-#define HAS_MERGEUV_SSE2
+#define HAS_MERGEUV_AVX2
+#define HAS_MERGEUV_MMX
 #endif
 // The following are disabled when SSSE3 is available:
@@ -324,8 +325,22 @@ void MergeUV_C(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
                int width);
 void MergeUV_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
                   int width);
+void MergeUV_AVX2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
+                  int width);
 void MergeUV_NEON(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
                   int width);
+void MergeUV_Unaligned_SSE2(const uint8* src_u, const uint8* src_v,
+                            uint8* dst_uv, int width);
+void MergeUV_Unaligned_AVX2(const uint8* src_u, const uint8* src_v,
+                            uint8* dst_uv, int width);
+void MergeUV_Unaligned_NEON(const uint8* src_u, const uint8* src_v,
+                            uint8* dst_uv, int width);
+void MergeUV_Any_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
+                      int width);
+void MergeUV_Any_AVX2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
+                      int width);
+void MergeUV_Any_NEON(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
+                      int width);
 void CopyRow_SSE2(const uint8* src, uint8* dst, int count);
 void CopyRow_X86(const uint8* src, uint8* dst, int count);
@@ -720,6 +735,24 @@ void I422ToABGRRow_MIPS_DSPR2(const uint8* y_buf,
                               uint8* rgb_buf,
                               int width);
+void I422ToARGBRow_MIPS_DSPR2(const uint8* y_buf,
+                              const uint8* u_buf,
+                              const uint8* v_buf,
+                              uint8* rgb_buf,
+                              int width);
+void I422ToBGRARow_MIPS_DSPR2(const uint8* y_buf,
+                              const uint8* u_buf,
+                              const uint8* v_buf,
+                              uint8* rgb_buf,
+                              int width);
+void I422ToABGRRow_MIPS_DSPR2(const uint8* y_buf,
+                              const uint8* u_buf,
+                              const uint8* v_buf,
+                              uint8* rgb_buf,
+                              int width);
 void YUY2ToYRow_SSE2(const uint8* src_yuy2, uint8* dst_y, int pix);
 void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2,
                       uint8* dst_u, uint8* dst_v, int pix);
...
@@ -11,6 +11,6 @@
 #ifndef INCLUDE_LIBYUV_VERSION_H_  // NOLINT
 #define INCLUDE_LIBYUV_VERSION_H_
 
-#define LIBYUV_VERSION 449
+#define LIBYUV_VERSION 451
 
 #endif  // INCLUDE_LIBYUV_VERSION_H_  NOLINT
@@ -368,8 +368,7 @@ static int X420ToI420(const uint8* src_y,
   void (*SplitUV)(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) =
       SplitUV_C;
 #if defined(HAS_SPLITUV_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2)) {
-    if (halfwidth >= 16) {
+  if (TestCpuFlag(kCpuHasSSE2) && halfwidth >= 16) {
     SplitUV = SplitUV_Any_SSE2;
     if (IS_ALIGNED(halfwidth, 16)) {
       SplitUV = SplitUV_Unaligned_SSE2;
@@ -380,11 +379,9 @@ static int X420ToI420(const uint8* src_y,
       }
     }
   }
-  }
 #endif
 #if defined(HAS_SPLITUV_AVX2)
-  if (TestCpuFlag(kCpuHasAVX2)) {
-    if (halfwidth >= 32) {
+  if (TestCpuFlag(kCpuHasAVX2) && halfwidth >= 32) {
     SplitUV = SplitUV_Any_AVX2;
     if (IS_ALIGNED(halfwidth, 32)) {
       SplitUV = SplitUV_Unaligned_AVX2;
@@ -395,11 +392,9 @@ static int X420ToI420(const uint8* src_y,
       }
     }
   }
-  }
 #endif
 #if defined(HAS_SPLITUV_NEON)
-  if (TestCpuFlag(kCpuHasNEON)) {
-    if (halfwidth >= 16) {
+  if (TestCpuFlag(kCpuHasNEON) && halfwidth >= 16) {
     SplitUV = SplitUV_Any_NEON;
     if (IS_ALIGNED(halfwidth, 16)) {
       SplitUV = SplitUV_Unaligned_NEON;
@@ -410,11 +405,9 @@ static int X420ToI420(const uint8* src_y,
       }
     }
   }
-  }
 #endif
 #if defined(HAS_SPLITUV_MIPS_DSPR2)
-  if (TestCpuFlag(kCpuHasMIPS_DSPR2)) {
-    if (halfwidth >= 16) {
+  if (TestCpuFlag(kCpuHasMIPS_DSPR2) && halfwidth >= 16) {
     SplitUV = SplitUV_Any_MIPS_DSPR2;
     if (IS_ALIGNED(halfwidth, 16)) {
       SplitUV = SplitUV_Unaligned_MIPS_DSPR2;
@@ -425,7 +418,6 @@ static int X420ToI420(const uint8* src_y,
       }
     }
   }
-  }
 #endif
   if (dst_y) {
...
@@ -521,18 +521,46 @@ int I420ToNV12(const uint8* src_y, int src_stride_y,
   int halfwidth = (width + 1) >> 1;
   void (*MergeUV)(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
                   int width) = MergeUV_C;
-#if defined(HAS_MERGEUV_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(halfwidth, 16) &&
-      IS_ALIGNED(src_u, 16) && IS_ALIGNED(src_stride_u, 16) &&
-      IS_ALIGNED(src_v, 16) && IS_ALIGNED(src_stride_v, 16) &&
-      IS_ALIGNED(dst_uv, 16) && IS_ALIGNED(dst_stride_uv, 16)) {
-    MergeUV = MergeUV_SSE2;
-  }
-#elif defined(HAS_MERGEUV_NEON)
-  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(halfwidth, 16)) {
-    MergeUV = MergeUV_NEON;
-  }
-#endif
+#if defined(HAS_SPLITUV_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2) && halfwidth >= 16) {
+    MergeUV = MergeUV_Any_SSE2;
+    if (IS_ALIGNED(halfwidth, 16)) {
+      MergeUV = MergeUV_Unaligned_SSE2;
+      if (IS_ALIGNED(src_u, 16) && IS_ALIGNED(src_stride_u, 16) &&
+          IS_ALIGNED(src_v, 16) && IS_ALIGNED(src_stride_v, 16) &&
+          IS_ALIGNED(dst_uv, 16) && IS_ALIGNED(dst_stride_uv, 16)) {
+        MergeUV = MergeUV_SSE2;
+      }
+    }
+  }
+#endif
+#if defined(HAS_SPLITUV_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2) && halfwidth >= 32) {
+    MergeUV = MergeUV_Any_AVX2;
+    if (IS_ALIGNED(halfwidth, 32)) {
+      MergeUV = MergeUV_Unaligned_AVX2;
+      if (IS_ALIGNED(src_u, 32) && IS_ALIGNED(src_stride_u, 32) &&
+          IS_ALIGNED(src_v, 32) && IS_ALIGNED(src_stride_v, 32) &&
+          IS_ALIGNED(dst_uv, 32) && IS_ALIGNED(dst_stride_uv, 32)) {
+        MergeUV = MergeUV_AVX2;
+      }
+    }
+  }
+#endif
+#if defined(HAS_SPLITUV_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && halfwidth >= 16) {
+    MergeUV = MergeUV_Any_NEON;
+    if (IS_ALIGNED(halfwidth, 16)) {
+      MergeUV = MergeUV_Unaligned_NEON;
+      if (IS_ALIGNED(src_u, 16) && IS_ALIGNED(src_stride_u, 16) &&
+          IS_ALIGNED(src_v, 16) && IS_ALIGNED(src_stride_v, 16) &&
+          IS_ALIGNED(dst_uv, 16) && IS_ALIGNED(dst_stride_uv, 16)) {
+        MergeUV = MergeUV_NEON;
+      }
+    }
+  }
+#endif
   CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
   int halfheight = (height + 1) >> 1;
   for (int y = 0; y < halfheight; ++y) {
...
@@ -1190,32 +1190,53 @@ UV422ANY(UYVYToUV422Row_Any_NEON, UYVYToUV422Row_NEON,
 #endif
 #undef UV422ANY
 
-#define SPLITUVANY(NAMEANY, ANYTOUV_SIMD, ANYTOUV_C, BPP, MASK) \
+#define SPLITUVANY(NAMEANY, ANYTOUV_SIMD, ANYTOUV_C, MASK) \
     void NAMEANY(const uint8* src_uv, \
                  uint8* dst_u, uint8* dst_v, int width) { \
       int n = width & ~MASK; \
       ANYTOUV_SIMD(src_uv, dst_u, dst_v, n); \
-      ANYTOUV_C(src_uv + n * BPP, \
+      ANYTOUV_C(src_uv + n * 2, \
                 dst_u + n, \
                 dst_v + n, \
                 width & MASK); \
     }
 
 #ifdef HAS_SPLITUV_SSE2
-SPLITUVANY(SplitUV_Any_SSE2, SplitUV_Unaligned_SSE2, SplitUV_C, 2, 15)
+SPLITUVANY(SplitUV_Any_SSE2, SplitUV_Unaligned_SSE2, SplitUV_C, 15)
 #endif
 #ifdef HAS_SPLITUV_AVX2
-SPLITUVANY(SplitUV_Any_AVX2, SplitUV_Unaligned_AVX2, SplitUV_C, 2, 31)
+SPLITUVANY(SplitUV_Any_AVX2, SplitUV_Unaligned_AVX2, SplitUV_C, 31)
 #endif
 #ifdef HAS_SPLITUV_NEON
-SPLITUVANY(SplitUV_Any_NEON, SplitUV_Unaligned_NEON, SplitUV_C, 2, 15)
+SPLITUVANY(SplitUV_Any_NEON, SplitUV_Unaligned_NEON, SplitUV_C, 15)
 #endif
 #ifdef HAS_SPLITUV_MIPS_DSPR2
-SPLITUVANY(SplitUV_Any_MIPS_DSPR2, SplitUV_Unaligned_MIPS_DSPR2, SplitUV_C,
-           2, 15)
+SPLITUVANY(SplitUV_Any_MIPS_DSPR2, SplitUV_Unaligned_MIPS_DSPR2, SplitUV_C, 15)
 #endif
 #undef SPLITUVANY
 
+#define MERGEUVANY(NAMEANY, ANYTOUV_SIMD, ANYTOUV_C, MASK) \
+    void NAMEANY(const uint8* src_u, const uint8* src_v, \
+                 uint8* dst_uv, int width) { \
+      int n = width & ~MASK; \
+      ANYTOUV_SIMD(src_u, src_v, dst_uv, n); \
+      ANYTOUV_C(src_u + n, \
+                src_v + n, \
+                dst_uv + n * 2, \
+                width & MASK); \
+    }
+
+#ifdef HAS_MERGEUV_SSE2
+MERGEUVANY(MergeUV_Any_SSE2, MergeUV_Unaligned_SSE2, MergeUV_C, 15)
+#endif
+#ifdef HAS_MERGEUV_AVX2
+MERGEUVANY(MergeUV_Any_AVX2, MergeUV_Unaligned_AVX2, MergeUV_C, 31)
+#endif
+#ifdef HAS_MERGEUV_NEON
+MERGEUVANY(MergeUV_Any_NEON, MergeUV_Unaligned_NEON, MergeUV_C, 15)
+#endif
+#undef MERGEUVANY
+
 void ComputeCumulativeSumRow_C(const uint8* row, int32* cumsum,
                                const int32* previous_cumsum, int width) {
   int32 row_sum[4] = {0, 0, 0, 0};
...
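For orientation, the SSE2 instantiation above expands (apart from whitespace) to a small wrapper: the SIMD kernel handles the largest multiple of 16 pixels and MergeUV_C finishes the remainder. A hand expansion of MERGEUVANY(MergeUV_Any_SSE2, MergeUV_Unaligned_SSE2, MergeUV_C, 15):

// Illustrative expansion of the MERGEUVANY macro for the SSE2 case.
void MergeUV_Any_SSE2(const uint8* src_u, const uint8* src_v,
                      uint8* dst_uv, int width) {
  int n = width & ~15;                        // largest multiple of 16
  MergeUV_Unaligned_SSE2(src_u, src_v, dst_uv, n);
  MergeUV_C(src_u + n,                        // finish the 0..15 leftover pixels
            src_v + n,
            dst_uv + n * 2,                   // 2 bytes of UV per output pixel
            width & 15);
}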
@@ -380,10 +380,32 @@ void SplitUV_Unaligned_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
 #ifdef HAS_MERGEUV_NEON
 // Reads 16 U's and V's and writes out 16 pairs of UV.
+// Alignment requirement: 16 bytes for pointers, and multiple of 16 pixels.
 void MergeUV_NEON(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
                   int width) {
   asm volatile (
     ".p2align  2                               \n"
+  "1:                                          \n"
+    "vld1.u8    {q0}, [%0:128]!                \n"  // load U
+    "vld1.u8    {q1}, [%1:128]!                \n"  // load V
+    "subs       %3, %3, #16                    \n"  // 16 processed per loop
+    "vst2.u8    {q0, q1}, [%2:128]!            \n"  // store 16 pairs of UV
+    "bgt        1b                             \n"
+    :
+      "+r"(src_u),   // %0
+      "+r"(src_v),   // %1
+      "+r"(dst_uv),  // %2
+      "+r"(width)    // %3  // Output registers
+    :                       // Input registers
+    : "memory", "cc", "q0", "q1"  // Clobber List
+  );
+}
+
+// Reads 16 U's and V's and writes out 16 pairs of UV.
+void MergeUV_Unaligned_NEON(const uint8* src_u, const uint8* src_v,
+                            uint8* dst_uv, int width) {
+  asm volatile (
+    ".p2align  2                               \n"
   "1:                                          \n"
     "vld1.u8    {q0}, [%0]!                    \n"  // load U
     "vld1.u8    {q1}, [%1]!                    \n"  // load V
...
@@ -2576,6 +2576,35 @@ void MergeUV_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
 #endif
   );
 }
+
+void MergeUV_Unaligned_SSE2(const uint8* src_u, const uint8* src_v,
+                            uint8* dst_uv, int width) {
+  asm volatile (
+    "sub        %0,%1                          \n"
+    ".p2align  4                               \n"
+  "1:                                          \n"
+    "movdqu     (%0),%%xmm0                    \n"
+    "movdqu     (%0,%1,1),%%xmm1               \n"
+    "lea        0x10(%0),%0                    \n"
+    "movdqa     %%xmm0,%%xmm2                  \n"
+    "punpcklbw  %%xmm1,%%xmm0                  \n"
+    "punpckhbw  %%xmm1,%%xmm2                  \n"
+    "movdqu     %%xmm0,(%2)                    \n"
+    "movdqu     %%xmm2,0x10(%2)                \n"
+    "lea        0x20(%2),%2                    \n"
+    "sub        $0x10,%3                       \n"
+    "jg         1b                             \n"
+  : "+r"(src_u),   // %0
+    "+r"(src_v),   // %1
+    "+r"(dst_uv),  // %2
+    "+r"(width)    // %3
+  :
+  : "memory", "cc"
+#if defined(__SSE2__)
+    , "xmm0", "xmm1", "xmm2"
+#endif
+  );
+}
 #endif  // HAS_MERGEUV_SSE2
 #ifdef HAS_COPYROW_SSE2
...
@@ -2650,6 +2650,36 @@ void MergeUV_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
     ret
   }
 }
+
+__declspec(naked) __declspec(align(16))
+void MergeUV_Unaligned_SSE2(const uint8* src_u, const uint8* src_v,
+                            uint8* dst_uv, int width) {
+  __asm {
+    push       edi
+    mov        eax, [esp + 4 + 4]    // src_u
+    mov        edx, [esp + 4 + 8]    // src_v
+    mov        edi, [esp + 4 + 12]   // dst_uv
+    mov        ecx, [esp + 4 + 16]   // width
+    sub        edx, eax
+
+    align      16
+  convertloop:
+    movdqu     xmm0, [eax]           // read 16 U's
+    movdqu     xmm1, [eax + edx]     // and 16 V's
+    lea        eax,  [eax + 16]
+    movdqa     xmm2, xmm0
+    punpcklbw  xmm0, xmm1            // first 8 UV pairs
+    punpckhbw  xmm2, xmm1            // next 8 UV pairs
+    movdqu     [edi], xmm0
+    movdqu     [edi + 16], xmm2
+    lea        edi, [edi + 32]
+    sub        ecx, 16
+    jg         convertloop
+
+    pop        edi
+    ret
+  }
+}
 #endif  // HAS_MERGEUV_SSE2
 #ifdef HAS_COPYROW_SSE2
...
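Both new MergeUV_Unaligned_SSE2 bodies (the GCC inline-asm version and the Visual C++ version above) compute the same thing: punpcklbw/punpckhbw interleave 16 U bytes with 16 V bytes into 16 UV pairs per iteration, using unaligned loads and stores. A plain-C sketch of one iteration, for reference only (the helper name is made up for illustration; MergeUV_C declared in row.h is the real scalar path):

typedef unsigned char uint8;  // stand-in for libyuv's uint8

// One 16-pixel iteration of the unaligned SSE2 kernel, in scalar form.
static void MergeUV16_C(const uint8* src_u, const uint8* src_v,
                        uint8* dst_uv) {
  for (int i = 0; i < 16; ++i) {
    dst_uv[i * 2 + 0] = src_u[i];  // low byte of each pair: U
    dst_uv[i * 2 + 1] = src_v[i];  // high byte of each pair: V
  }
}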