Commit e5f3fd4c authored by fbarchard@google.com

Add Unaligned and Any (odd-width) versions of the YUY2 and UYVY row functions, and rename the FastConvert* row functions to the I420To*Row / I444ToARGBRow / YToARGBRow naming.

TEST=none
BUG=none
Review URL: https://webrtc-codereview.appspot.com/379009

git-svn-id: http://libyuv.googlecode.com/svn/trunk@168 16f28f9a-4ce2-e073-06de-1de4eb20be90
parent 79a06ac5
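
Context for the changes below: the row functions now follow a three-tier dispatch — an aligned SSE2/SSSE3 kernel (movdqa) when pointers and strides are 16-byte aligned, an _Unaligned_ kernel (movdqu) when only the width is a multiple of 16, and an _Any_ wrapper that handles remaining widths up to kMaxStride via an aligned scratch row. A minimal sketch of that selection, condensed from the YUY2ToI420 hunk in this commit; the PickYUY2ToYRow helper is purely illustrative and not part of libyuv, which does this selection inline:

// Illustrative sketch only (not libyuv API); uses the kernels and macros
// declared in row.h (TestCpuFlag, IS_ALIGNED, kMaxStride, uint8).
typedef void (*YUY2ToYRowFn)(const uint8* src_yuy2, uint8* dst_y, int pix);

static YUY2ToYRowFn PickYUY2ToYRow(const uint8* src_yuy2, int src_stride_yuy2,
                                   const uint8* dst_y, int dst_stride_y,
                                   int width) {
  YUY2ToYRowFn YUY2ToYRow = YUY2ToYRow_C;        // portable fallback
#if defined(HAS_YUY2TOI420ROW_SSE2)
  if (TestCpuFlag(kCpuHasSSE2)) {
    if (width <= kMaxStride) {
      YUY2ToYRow = YUY2ToYRow_Any_SSE2;          // any width: scratch row + memcpy
    }
    if (IS_ALIGNED(width, 16)) {
      YUY2ToYRow = YUY2ToYRow_Unaligned_SSE2;    // movdqu loads/stores
      if (IS_ALIGNED(src_yuy2, 16) && IS_ALIGNED(src_stride_yuy2, 16) &&
          IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
        YUY2ToYRow = YUY2ToYRow_SSE2;            // fully aligned: movdqa
      }
    }
  }
#endif
  return YUY2ToYRow;
}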
Name: libyuv
URL: http://code.google.com/p/libyuv/
Version: 167
Version: 168
License: BSD
License File: LICENSE
......
......@@ -11,16 +11,7 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_
#define INCLUDE_LIBYUV_VERSION_H_
#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif
#define LIBYUV_VERSION 167
#ifdef __cplusplus
} // extern "C"
} // namespace libyuv
#endif
#define LIBYUV_VERSION 168
#endif // INCLUDE_LIBYUV_VERSION_H_
......@@ -366,7 +366,7 @@ int ARGBToI420(const uint8* src_frame, int src_stride_frame,
IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
ARGBToYRow = ARGBToYRow_SSSE3;
} else if (TestCpuFlag(kCpuHasSSSE3) && width <= kMaxStride) {
ARGBToYRow = ARGBToYAnyRow_SSSE3;
ARGBToYRow = ARGBToYRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
ARGBToYRow = ARGBToYRow_Unaligned_SSSE3;
}
......@@ -382,7 +382,7 @@ int ARGBToI420(const uint8* src_frame, int src_stride_frame,
ARGBToUVRow = ARGBToUVRow_SSSE3;
} else if (TestCpuFlag(kCpuHasSSSE3) &&
IS_ALIGNED(width, 2) && width <= kMaxStride) {
ARGBToUVRow = ARGBToUVAnyRow_SSSE3;
ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
ARGBToUVRow = ARGBToUVRow_Unaligned_SSSE3;
}
......@@ -428,7 +428,7 @@ int BGRAToI420(const uint8* src_frame, int src_stride_frame,
IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
ARGBToYRow = BGRAToYRow_SSSE3;
} else if (TestCpuFlag(kCpuHasSSSE3) && width <= kMaxStride) {
ARGBToYRow = BGRAToYAnyRow_SSSE3;
ARGBToYRow = BGRAToYRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
ARGBToYRow = BGRAToYRow_Unaligned_SSSE3;
}
......@@ -444,7 +444,7 @@ int BGRAToI420(const uint8* src_frame, int src_stride_frame,
ARGBToUVRow = BGRAToUVRow_SSSE3;
} else if (TestCpuFlag(kCpuHasSSSE3) &&
IS_ALIGNED(width, 2) && width <= kMaxStride) {
ARGBToUVRow = BGRAToUVAnyRow_SSSE3;
ARGBToUVRow = BGRAToUVRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
ARGBToUVRow = BGRAToUVRow_Unaligned_SSSE3;
}
......@@ -490,7 +490,7 @@ int ABGRToI420(const uint8* src_frame, int src_stride_frame,
IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
ARGBToYRow = ABGRToYRow_SSSE3;
} else if (TestCpuFlag(kCpuHasSSSE3) && width <= kMaxStride) {
ARGBToYRow = ABGRToYAnyRow_SSSE3;
ARGBToYRow = ABGRToYRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
ARGBToYRow = ABGRToYRow_Unaligned_SSSE3;
}
......@@ -506,7 +506,7 @@ int ABGRToI420(const uint8* src_frame, int src_stride_frame,
ARGBToUVRow = ABGRToUVRow_SSSE3;
} else if (TestCpuFlag(kCpuHasSSSE3) &&
IS_ALIGNED(width, 2) && width <= kMaxStride) {
ARGBToUVRow = ABGRToUVAnyRow_SSSE3;
ARGBToUVRow = ABGRToUVRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
ARGBToUVRow = ABGRToUVRow_Unaligned_SSSE3;
}
......@@ -561,7 +561,7 @@ int RGB24ToI420(const uint8* src_frame, int src_stride_frame,
IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
ARGBToYRow = ARGBToYRow_SSSE3;
} else if (TestCpuFlag(kCpuHasSSSE3) && width <= kMaxStride) {
ARGBToYRow = ARGBToYAnyRow_SSSE3;
ARGBToYRow = ARGBToYRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
ARGBToYRow = ARGBToYRow_Unaligned_SSSE3;
}
......@@ -575,7 +575,7 @@ int RGB24ToI420(const uint8* src_frame, int src_stride_frame,
ARGBToUVRow = ARGBToUVRow_SSSE3;
} else if (TestCpuFlag(kCpuHasSSSE3) &&
IS_ALIGNED(width, 2) && width <= kMaxStride) {
ARGBToUVRow = ARGBToUVAnyRow_SSSE3;
ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
} else
#endif
{
......@@ -630,7 +630,7 @@ int RAWToI420(const uint8* src_frame, int src_stride_frame,
IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
ARGBToYRow = ARGBToYRow_SSSE3;
} else if (TestCpuFlag(kCpuHasSSSE3) && width <= kMaxStride) {
ARGBToYRow = ARGBToYAnyRow_SSSE3;
ARGBToYRow = ARGBToYRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
ARGBToYRow = ARGBToYRow_Unaligned_SSSE3;
}
......@@ -644,7 +644,7 @@ int RAWToI420(const uint8* src_frame, int src_stride_frame,
ARGBToUVRow = ARGBToUVRow_SSSE3;
} else if (TestCpuFlag(kCpuHasSSSE3) &&
IS_ALIGNED(width, 2) && width <= kMaxStride) {
ARGBToUVRow = ARGBToUVAnyRow_SSSE3;
ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
} else
#endif
{
......@@ -699,7 +699,7 @@ int RGB565ToI420(const uint8* src_frame, int src_stride_frame,
IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
ARGBToYRow = ARGBToYRow_SSSE3;
} else if (TestCpuFlag(kCpuHasSSSE3) && width <= kMaxStride) {
ARGBToYRow = ARGBToYAnyRow_SSSE3;
ARGBToYRow = ARGBToYRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
ARGBToYRow = ARGBToYRow_Unaligned_SSSE3;
}
......@@ -713,7 +713,7 @@ int RGB565ToI420(const uint8* src_frame, int src_stride_frame,
ARGBToUVRow = ARGBToUVRow_SSSE3;
} else if (TestCpuFlag(kCpuHasSSSE3) &&
IS_ALIGNED(width, 2) && width <= kMaxStride) {
ARGBToUVRow = ARGBToUVAnyRow_SSSE3;
ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
} else
#endif
{
......@@ -768,7 +768,7 @@ int ARGB1555ToI420(const uint8* src_frame, int src_stride_frame,
IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
ARGBToYRow = ARGBToYRow_SSSE3;
} else if (TestCpuFlag(kCpuHasSSSE3) && width <= kMaxStride) {
ARGBToYRow = ARGBToYAnyRow_SSSE3;
ARGBToYRow = ARGBToYRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
ARGBToYRow = ARGBToYRow_Unaligned_SSSE3;
}
......@@ -782,7 +782,7 @@ int ARGB1555ToI420(const uint8* src_frame, int src_stride_frame,
ARGBToUVRow = ARGBToUVRow_SSSE3;
} else if (TestCpuFlag(kCpuHasSSSE3) &&
IS_ALIGNED(width, 2) && width <= kMaxStride) {
ARGBToUVRow = ARGBToUVAnyRow_SSSE3;
ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
} else
#endif
{
......@@ -837,7 +837,7 @@ int ARGB4444ToI420(const uint8* src_frame, int src_stride_frame,
IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
ARGBToYRow = ARGBToYRow_SSSE3;
} else if (TestCpuFlag(kCpuHasSSSE3) && width <= kMaxStride) {
ARGBToYRow = ARGBToYAnyRow_SSSE3;
ARGBToYRow = ARGBToYRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
ARGBToYRow = ARGBToYRow_Unaligned_SSSE3;
}
......@@ -851,7 +851,7 @@ int ARGB4444ToI420(const uint8* src_frame, int src_stride_frame,
ARGBToUVRow = ARGBToUVRow_SSSE3;
} else if (TestCpuFlag(kCpuHasSSSE3) &&
IS_ALIGNED(width, 2) && width <= kMaxStride) {
ARGBToUVRow = ARGBToUVAnyRow_SSSE3;
ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
} else
#endif
{
......
......@@ -452,22 +452,22 @@ int I420ToBayer(const uint8* src_y, int src_stride_y,
src_stride_u = -src_stride_u;
src_stride_v = -src_stride_v;
}
void (*FastConvertYUVToARGBRow)(const uint8* y_buf,
void (*I420ToARGBRow)(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width);
#if defined(HAS_FASTCONVERTYUVTOARGBROW_NEON)
#if defined(HAS_I420TOARGBROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
FastConvertYUVToARGBRow = FastConvertYUVToARGBRow_NEON;
I420ToARGBRow = I420ToARGBRow_NEON;
} else
#elif defined(HAS_FASTCONVERTYUVTOARGBROW_SSSE3)
#elif defined(HAS_I420TOARGBROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
FastConvertYUVToARGBRow = FastConvertYUVToARGBRow_SSSE3;
I420ToARGBRow = I420ToARGBRow_SSSE3;
} else
#endif
{
FastConvertYUVToARGBRow = FastConvertYUVToARGBRow_C;
I420ToARGBRow = I420ToARGBRow_C;
}
SIMD_ALIGNED(uint8 row[kMaxStride]);
void (*ARGBToBayerRow)(const uint8* src_argb,
......@@ -490,7 +490,7 @@ int I420ToBayer(const uint8* src_y, int src_stride_y,
}
for (int y = 0; y < height; ++y) {
FastConvertYUVToARGBRow(src_y, src_u, src_v, row, width);
I420ToARGBRow(src_y, src_u, src_v, row, width);
ARGBToBayerRow(row, dst_bayer, index_map[y & 1], width);
dst_bayer += dst_stride_bayer;
src_y += src_stride_y;
......
......@@ -928,457 +928,6 @@ int Q420ToI420(const uint8* src_y, int src_stride_y,
return 0;
}
#if defined(_M_IX86) && !defined(YUV_DISABLE_ASM)
#define HAS_YUY2TOI420ROW_SSE2
__declspec(naked)
void YUY2ToI420RowY_SSE2(const uint8* src_yuy2,
uint8* dst_y, int pix) {
__asm {
mov eax, [esp + 4] // src_yuy2
mov edx, [esp + 8] // dst_y
mov ecx, [esp + 12] // pix
pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
psrlw xmm5, 8
convertloop:
movdqa xmm0, [eax]
movdqa xmm1, [eax + 16]
lea eax, [eax + 32]
pand xmm0, xmm5 // even bytes are Y
pand xmm1, xmm5
packuswb xmm0, xmm1
movdqa [edx], xmm0
lea edx, [edx + 16]
sub ecx, 16
ja convertloop
ret
}
}
__declspec(naked)
void YUY2ToI420RowUV_SSE2(const uint8* src_yuy2, int stride_yuy2,
uint8* dst_u, uint8* dst_y, int pix) {
__asm {
push esi
push edi
mov eax, [esp + 8 + 4] // src_yuy2
mov esi, [esp + 8 + 8] // stride_yuy2
mov edx, [esp + 8 + 12] // dst_u
mov edi, [esp + 8 + 16] // dst_v
mov ecx, [esp + 8 + 20] // pix
pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
psrlw xmm5, 8
sub edi, edx
convertloop:
movdqa xmm0, [eax]
movdqa xmm1, [eax + 16]
movdqa xmm2, [eax + esi]
movdqa xmm3, [eax + esi + 16]
lea eax, [eax + 32]
pavgb xmm0, xmm2
pavgb xmm1, xmm3
psrlw xmm0, 8 // YUYV -> UVUV
psrlw xmm1, 8
packuswb xmm0, xmm1
movdqa xmm1, xmm0
pand xmm0, xmm5 // U
packuswb xmm0, xmm0
psrlw xmm1, 8 // V
packuswb xmm1, xmm1
movq qword ptr [edx], xmm0
movq qword ptr [edx + edi], xmm1
lea edx, [edx + 8]
sub ecx, 16
ja convertloop
pop edi
pop esi
ret
}
}
__declspec(naked)
void YUY2ToI420RowY_Unaligned_SSE2(const uint8* src_yuy2,
uint8* dst_y, int pix) {
__asm {
mov eax, [esp + 4] // src_yuy2
mov edx, [esp + 8] // dst_y
mov ecx, [esp + 12] // pix
pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
psrlw xmm5, 8
convertloop:
movdqu xmm0, [eax]
movdqu xmm1, [eax + 16]
lea eax, [eax + 32]
pand xmm0, xmm5 // even bytes are Y
pand xmm1, xmm5
packuswb xmm0, xmm1
movdqu [edx], xmm0
lea edx, [edx + 16]
sub ecx, 16
ja convertloop
ret
}
}
__declspec(naked)
void YUY2ToI420RowUV_Unaligned_SSE2(const uint8* src_yuy2, int stride_yuy2,
uint8* dst_u, uint8* dst_y, int pix) {
__asm {
push esi
push edi
mov eax, [esp + 8 + 4] // src_yuy2
mov esi, [esp + 8 + 8] // stride_yuy2
mov edx, [esp + 8 + 12] // dst_u
mov edi, [esp + 8 + 16] // dst_v
mov ecx, [esp + 8 + 20] // pix
pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
psrlw xmm5, 8
sub edi, edx
convertloop:
movdqu xmm0, [eax]
movdqu xmm1, [eax + 16]
movdqu xmm2, [eax + esi]
movdqu xmm3, [eax + esi + 16]
lea eax, [eax + 32]
pavgb xmm0, xmm2
pavgb xmm1, xmm3
psrlw xmm0, 8 // YUYV -> UVUV
psrlw xmm1, 8
packuswb xmm0, xmm1
movdqa xmm1, xmm0
pand xmm0, xmm5 // U
packuswb xmm0, xmm0
psrlw xmm1, 8 // V
packuswb xmm1, xmm1
movq qword ptr [edx], xmm0
movq qword ptr [edx + edi], xmm1
lea edx, [edx + 8]
sub ecx, 16
ja convertloop
pop edi
pop esi
ret
}
}
#define HAS_UYVYTOI420ROW_SSE2
__declspec(naked)
void UYVYToI420RowY_SSE2(const uint8* src_uyvy,
uint8* dst_y, int pix) {
__asm {
mov eax, [esp + 4] // src_uyvy
mov edx, [esp + 8] // dst_y
mov ecx, [esp + 12] // pix
convertloop:
movdqa xmm0, [eax]
movdqa xmm1, [eax + 16]
lea eax, [eax + 32]
psrlw xmm0, 8 // odd bytes are Y
psrlw xmm1, 8
packuswb xmm0, xmm1
movdqa [edx], xmm0
lea edx, [edx + 16]
sub ecx, 16
ja convertloop
ret
}
}
__declspec(naked)
void UYVYToI420RowUV_SSE2(const uint8* src_uyvy, int stride_uyvy,
uint8* dst_u, uint8* dst_y, int pix) {
__asm {
push esi
push edi
mov eax, [esp + 8 + 4] // src_yuy2
mov esi, [esp + 8 + 8] // stride_yuy2
mov edx, [esp + 8 + 12] // dst_u
mov edi, [esp + 8 + 16] // dst_v
mov ecx, [esp + 8 + 20] // pix
pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
psrlw xmm5, 8
sub edi, edx
convertloop:
movdqa xmm0, [eax]
movdqa xmm1, [eax + 16]
movdqa xmm2, [eax + esi]
movdqa xmm3, [eax + esi + 16]
lea eax, [eax + 32]
pavgb xmm0, xmm2
pavgb xmm1, xmm3
pand xmm0, xmm5 // UYVY -> UVUV
pand xmm1, xmm5
packuswb xmm0, xmm1
movdqa xmm1, xmm0
pand xmm0, xmm5 // U
packuswb xmm0, xmm0
psrlw xmm1, 8 // V
packuswb xmm1, xmm1
movq qword ptr [edx], xmm0
movq qword ptr [edx + edi], xmm1
lea edx, [edx + 8]
sub ecx, 16
ja convertloop
pop edi
pop esi
ret
}
}
#elif (defined(__x86_64__) || defined(__i386__)) && !defined(YUV_DISABLE_ASM)
#define HAS_YUY2TOI420ROW_SSE2
static void YUY2ToI420RowY_SSE2(const uint8* src_yuy2,
uint8* dst_y, int pix) {
asm volatile (
"pcmpeqb %%xmm5,%%xmm5 \n"
"psrlw $0x8,%%xmm5 \n"
"1: \n"
"movdqa (%0),%%xmm0 \n"
"movdqa 0x10(%0),%%xmm1 \n"
"lea 0x20(%0),%0 \n"
"pand %%xmm5,%%xmm0 \n"
"pand %%xmm5,%%xmm1 \n"
"packuswb %%xmm1,%%xmm0 \n"
"movdqa %%xmm0,(%1) \n"
"lea 0x10(%1),%1 \n"
"sub $0x10,%2 \n"
"ja 1b \n"
: "+r"(src_yuy2), // %0
"+r"(dst_y), // %1
"+r"(pix) // %2
:
: "memory", "cc"
#if defined(__SSE2__)
, "xmm0", "xmm1", "xmm5"
#endif
);
}
static void YUY2ToI420RowUV_SSE2(const uint8* src_yuy2, int stride_yuy2,
uint8* dst_u, uint8* dst_y, int pix) {
asm volatile (
"pcmpeqb %%xmm5,%%xmm5 \n"
"psrlw $0x8,%%xmm5 \n"
"sub %1,%2 \n"
"1: \n"
"movdqa (%0),%%xmm0 \n"
"movdqa 0x10(%0),%%xmm1 \n"
"movdqa (%0,%4,1),%%xmm2 \n"
"movdqa 0x10(%0,%4,1),%%xmm3 \n"
"lea 0x20(%0),%0 \n"
"pavgb %%xmm2,%%xmm0 \n"
"pavgb %%xmm3,%%xmm1 \n"
"psrlw $0x8,%%xmm0 \n"
"psrlw $0x8,%%xmm1 \n"
"packuswb %%xmm1,%%xmm0 \n"
"movdqa %%xmm0,%%xmm1 \n"
"pand %%xmm5,%%xmm0 \n"
"packuswb %%xmm0,%%xmm0 \n"
"psrlw $0x8,%%xmm1 \n"
"packuswb %%xmm1,%%xmm1 \n"
"movq %%xmm0,(%1) \n"
"movq %%xmm1,(%1,%2) \n"
"lea 0x8(%1),%1 \n"
"sub $0x10,%3 \n"
"ja 1b \n"
: "+r"(src_yuy2), // %0
"+r"(dst_u), // %1
"+r"(dst_y), // %2
"+r"(pix) // %3
: "r"(static_cast<intptr_t>(stride_yuy2)) // %4
: "memory", "cc"
#if defined(__SSE2__)
, "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
#endif
);
}
static void YUY2ToI420RowY_Unaligned_SSE2(const uint8* src_yuy2,
uint8* dst_y, int pix) {
asm volatile (
"pcmpeqb %%xmm5,%%xmm5 \n"
"psrlw $0x8,%%xmm5 \n"
"1: \n"
"movdqu (%0),%%xmm0 \n"
"movdqu 0x10(%0),%%xmm1 \n"
"lea 0x20(%0),%0 \n"
"pand %%xmm5,%%xmm0 \n"
"pand %%xmm5,%%xmm1 \n"
"packuswb %%xmm1,%%xmm0 \n"
"movdqu %%xmm0,(%1) \n"
"lea 0x10(%1),%1 \n"
"sub $0x10,%2 \n"
"ja 1b \n"
: "+r"(src_yuy2), // %0
"+r"(dst_y), // %1
"+r"(pix) // %2
:
: "memory", "cc"
#if defined(__SSE2__)
, "xmm0", "xmm1", "xmm5"
#endif
);
}
static void YUY2ToI420RowUV_Unaligned_SSE2(const uint8* src_yuy2,
int stride_yuy2,
uint8* dst_u, uint8* dst_y,
int pix) {
asm volatile (
"pcmpeqb %%xmm5,%%xmm5 \n"
"psrlw $0x8,%%xmm5 \n"
"sub %1,%2 \n"
"1: \n"
"movdqu (%0),%%xmm0 \n"
"movdqu 0x10(%0),%%xmm1 \n"
"movdqu (%0,%4,1),%%xmm2 \n"
"movdqu 0x10(%0,%4,1),%%xmm3 \n"
"lea 0x20(%0),%0 \n"
"pavgb %%xmm2,%%xmm0 \n"
"pavgb %%xmm3,%%xmm1 \n"
"psrlw $0x8,%%xmm0 \n"
"psrlw $0x8,%%xmm1 \n"
"packuswb %%xmm1,%%xmm0 \n"
"movdqa %%xmm0,%%xmm1 \n"
"pand %%xmm5,%%xmm0 \n"
"packuswb %%xmm0,%%xmm0 \n"
"psrlw $0x8,%%xmm1 \n"
"packuswb %%xmm1,%%xmm1 \n"
"movq %%xmm0,(%1) \n"
"movq %%xmm1,(%1,%2) \n"
"lea 0x8(%1),%1 \n"
"sub $0x10,%3 \n"
"ja 1b \n"
: "+r"(src_yuy2), // %0
"+r"(dst_u), // %1
"+r"(dst_y), // %2
"+r"(pix) // %3
: "r"(static_cast<intptr_t>(stride_yuy2)) // %4
: "memory", "cc"
#if defined(__SSE2__)
, "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
#endif
);
}
#define HAS_UYVYTOI420ROW_SSE2
static void UYVYToI420RowY_SSE2(const uint8* src_uyvy,
uint8* dst_y, int pix) {
asm volatile (
"1: \n"
"movdqa (%0),%%xmm0 \n"
"movdqa 0x10(%0),%%xmm1 \n"
"lea 0x20(%0),%0 \n"
"psrlw $0x8,%%xmm0 \n"
"psrlw $0x8,%%xmm1 \n"
"packuswb %%xmm1,%%xmm0 \n"
"movdqa %%xmm0,(%1) \n"
"lea 0x10(%1),%1 \n"
"sub $0x10,%2 \n"
"ja 1b \n"
: "+r"(src_uyvy), // %0
"+r"(dst_y), // %1
"+r"(pix) // %2
:
: "memory", "cc"
#if defined(__SSE2__)
, "xmm0", "xmm1"
#endif
);
}
static void UYVYToI420RowUV_SSE2(const uint8* src_uyvy, int stride_uyvy,
uint8* dst_u, uint8* dst_y, int pix) {
asm volatile (
"pcmpeqb %%xmm5,%%xmm5 \n"
"psrlw $0x8,%%xmm5 \n"
"sub %1,%2 \n"
"1: \n"
"movdqa (%0),%%xmm0 \n"
"movdqa 0x10(%0),%%xmm1 \n"
"movdqa (%0,%4,1),%%xmm2 \n"
"movdqa 0x10(%0,%4,1),%%xmm3 \n"
"lea 0x20(%0),%0 \n"
"pavgb %%xmm2,%%xmm0 \n"
"pavgb %%xmm3,%%xmm1 \n"
"pand %%xmm5,%%xmm0 \n"
"pand %%xmm5,%%xmm1 \n"
"packuswb %%xmm1,%%xmm0 \n"
"movdqa %%xmm0,%%xmm1 \n"
"pand %%xmm5,%%xmm0 \n"
"packuswb %%xmm0,%%xmm0 \n"
"psrlw $0x8,%%xmm1 \n"
"packuswb %%xmm1,%%xmm1 \n"
"movq %%xmm0,(%1) \n"
"movq %%xmm1,(%1,%2) \n"
"lea 0x8(%1),%1 \n"
"sub $0x10,%3 \n"
"ja 1b \n"
: "+r"(src_uyvy), // %0
"+r"(dst_u), // %1
"+r"(dst_y), // %2
"+r"(pix) // %3
: "r"(static_cast<intptr_t>(stride_uyvy)) // %4
: "memory", "cc"
#if defined(__SSE2__)
, "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
#endif
);
}
#endif
// Filter 2 rows of YUY2 UV's (422) into U and V (420)
void YUY2ToI420RowUV_C(const uint8* src_yuy2, int src_stride_yuy2,
uint8* dst_u, uint8* dst_v, int pix) {
// Output a row of UV values, filtering 2 rows of YUY2
for (int x = 0; x < pix; x += 2) {
dst_u[0] = (src_yuy2[1] + src_yuy2[src_stride_yuy2 + 1] + 1) >> 1;
dst_v[0] = (src_yuy2[3] + src_yuy2[src_stride_yuy2 + 3] + 1) >> 1;
src_yuy2 += 4;
dst_u += 1;
dst_v += 1;
}
}
void YUY2ToI420RowY_C(const uint8* src_yuy2,
uint8* dst_y, int pix) {
// Copy a row of yuy2 Y values
for (int x = 0; x < pix; ++x) {
dst_y[0] = src_yuy2[0];
src_yuy2 += 2;
dst_y += 1;
}
}
void UYVYToI420RowUV_C(const uint8* src_uyvy, int src_stride_uyvy,
uint8* dst_u, uint8* dst_v, int pix) {
// Copy a row of uyvy UV values
for (int x = 0; x < pix; x += 2) {
dst_u[0] = (src_uyvy[0] + src_uyvy[src_stride_uyvy + 0] + 1) >> 1;
dst_v[0] = (src_uyvy[2] + src_uyvy[src_stride_uyvy + 2] + 1) >> 1;
src_uyvy += 4;
dst_u += 1;
dst_v += 1;
}
}
void UYVYToI420RowY_C(const uint8* src_uyvy,
uint8* dst_y, int pix) {
// Copy a row of uyvy Y values
for (int x = 0; x < pix; ++x) {
dst_y[0] = src_uyvy[1];
src_uyvy += 2;
dst_y += 1;
}
}
// Convert YUY2 to I420.
int YUY2ToI420(const uint8* src_yuy2, int src_stride_yuy2,
uint8* dst_y, int dst_stride_y,
......@@ -1391,36 +940,42 @@ int YUY2ToI420(const uint8* src_yuy2, int src_stride_yuy2,
src_yuy2 = src_yuy2 + (height - 1) * src_stride_yuy2;
src_stride_yuy2 = -src_stride_yuy2;
}
void (*YUY2ToI420RowUV)(const uint8* src_yuy2, int src_stride_yuy2,
void (*YUY2ToUVRow)(const uint8* src_yuy2, int src_stride_yuy2,
uint8* dst_u, uint8* dst_v, int pix);
void (*YUY2ToI420RowY)(const uint8* src_yuy2,
void (*YUY2ToYRow)(const uint8* src_yuy2,
uint8* dst_y, int pix);
YUY2ToI420RowY = YUY2ToI420RowY_C;
YUY2ToI420RowUV = YUY2ToI420RowUV_C;
YUY2ToYRow = YUY2ToYRow_C;
YUY2ToUVRow = YUY2ToUVRow_C;
#if defined(HAS_YUY2TOI420ROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 16)) {
YUY2ToI420RowUV = YUY2ToI420RowUV_Unaligned_SSE2;
if (IS_ALIGNED(src_yuy2, 16) && IS_ALIGNED(src_stride_yuy2, 16)) {
YUY2ToI420RowUV = YUY2ToI420RowUV_SSE2;
YUY2ToI420RowY = YUY2ToI420RowY_Unaligned_SSE2;
if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
YUY2ToI420RowY = YUY2ToI420RowY_SSE2;
if (TestCpuFlag(kCpuHasSSE2)) {
if (width <= kMaxStride) {
YUY2ToUVRow = YUY2ToUVRow_Any_SSE2;
YUY2ToYRow = YUY2ToYRow_Any_SSE2;
}
if (IS_ALIGNED(width, 16)) {
YUY2ToUVRow = YUY2ToUVRow_Unaligned_SSE2;
YUY2ToYRow = YUY2ToYRow_Unaligned_SSE2;
if (IS_ALIGNED(src_yuy2, 16) && IS_ALIGNED(src_stride_yuy2, 16)) {
YUY2ToUVRow = YUY2ToUVRow_SSE2;
if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
YUY2ToYRow = YUY2ToYRow_SSE2;
}
}
}
}
#endif
for (int y = 0; y < height - 1; y += 2) {
YUY2ToI420RowUV(src_yuy2, src_stride_yuy2, dst_u, dst_v, width);
YUY2ToUVRow(src_yuy2, src_stride_yuy2, dst_u, dst_v, width);
dst_u += dst_stride_u;
dst_v += dst_stride_v;
YUY2ToI420RowY(src_yuy2, dst_y, width);
YUY2ToI420RowY(src_yuy2 + src_stride_yuy2, dst_y + dst_stride_y, width);
YUY2ToYRow(src_yuy2, dst_y, width);
YUY2ToYRow(src_yuy2 + src_stride_yuy2, dst_y + dst_stride_y, width);
dst_y += dst_stride_y * 2;
src_yuy2 += src_stride_yuy2 * 2;
}
if (height & 1) {
YUY2ToI420RowUV(src_yuy2, 0, dst_u, dst_v, width);
YUY2ToI420RowY(src_yuy2, dst_y, width);
YUY2ToUVRow(src_yuy2, 0, dst_u, dst_v, width);
YUY2ToYRow(src_yuy2, dst_y, width);
}
return 0;
}
......@@ -1437,34 +992,42 @@ int UYVYToI420(const uint8* src_uyvy, int src_stride_uyvy,
src_uyvy = src_uyvy + (height - 1) * src_stride_uyvy;
src_stride_uyvy = -src_stride_uyvy;
}
void (*UYVYToI420RowUV)(const uint8* src_uyvy, int src_stride_uyvy,
void (*UYVYToUVRow)(const uint8* src_uyvy, int src_stride_uyvy,
uint8* dst_u, uint8* dst_v, int pix);
void (*UYVYToI420RowY)(const uint8* src_uyvy,
void (*UYVYToYRow)(const uint8* src_uyvy,
uint8* dst_y, int pix);
UYVYToI420RowY = UYVYToI420RowY_C;
UYVYToI420RowUV = UYVYToI420RowUV_C;
UYVYToYRow = UYVYToYRow_C;
UYVYToUVRow = UYVYToUVRow_C;
#if defined(HAS_UYVYTOI420ROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 16)) {
if (IS_ALIGNED(src_uyvy, 16) && IS_ALIGNED(src_stride_uyvy, 16)) {
UYVYToI420RowUV = UYVYToI420RowUV_SSE2;
if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
UYVYToI420RowY = UYVYToI420RowY_SSE2;
if (TestCpuFlag(kCpuHasSSE2)) {
if (width <= kMaxStride) {
UYVYToUVRow = UYVYToUVRow_Any_SSE2;
UYVYToYRow = UYVYToYRow_Any_SSE2;
}
if (IS_ALIGNED(width, 16)) {
UYVYToUVRow = UYVYToUVRow_Unaligned_SSE2;
UYVYToYRow = UYVYToYRow_Unaligned_SSE2;
if (IS_ALIGNED(src_uyvy, 16) && IS_ALIGNED(src_stride_uyvy, 16)) {
UYVYToUVRow = UYVYToUVRow_SSE2;
if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
UYVYToYRow = UYVYToYRow_SSE2;
}
}
}
}
#endif
for (int y = 0; y < height - 1; y += 2) {
UYVYToI420RowUV(src_uyvy, src_stride_uyvy, dst_u, dst_v, width);
UYVYToUVRow(src_uyvy, src_stride_uyvy, dst_u, dst_v, width);
dst_u += dst_stride_u;
dst_v += dst_stride_v;
UYVYToI420RowY(src_uyvy, dst_y, width);
UYVYToI420RowY(src_uyvy + src_stride_uyvy, dst_y + dst_stride_y, width);
UYVYToYRow(src_uyvy, dst_y, width);
UYVYToYRow(src_uyvy + src_stride_uyvy, dst_y + dst_stride_y, width);
dst_y += dst_stride_y * 2;
src_uyvy += src_stride_uyvy * 2;
}
if (height & 1) {
UYVYToI420RowUV(src_uyvy, 0, dst_u, dst_v, width);
UYVYToI420RowY(src_uyvy, dst_y, width);
UYVYToUVRow(src_uyvy, 0, dst_u, dst_v, width);
UYVYToYRow(src_uyvy, dst_y, width);
}
return 0;
}
......@@ -1481,32 +1044,32 @@ int I420ToARGB(const uint8* src_y, int src_stride_y,
dst_argb = dst_argb + (height - 1) * dst_stride_argb;
dst_stride_argb = -dst_stride_argb;
}
void (*FastConvertYUVToARGBRow)(const uint8* y_buf,
void (*I420ToARGBRow)(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width);
#if defined(HAS_FASTCONVERTYUVTOARGBROW_NEON)
#if defined(HAS_I420TOARGBROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
FastConvertYUVToARGBRow = FastConvertYUVToARGBAnyRow_NEON;
I420ToARGBRow = I420ToARGBRow_Any_NEON;
if (IS_ALIGNED(width, 16)) {
FastConvertYUVToARGBRow = FastConvertYUVToARGBRow_NEON;
I420ToARGBRow = I420ToARGBRow_NEON;
}
} else
#elif defined(HAS_FASTCONVERTYUVTOARGBROW_SSSE3)
#elif defined(HAS_I420TOARGBROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
FastConvertYUVToARGBRow = FastConvertYUVToARGBAnyRow_SSSE3;
I420ToARGBRow = I420ToARGBRow_Any_SSSE3;
if (IS_ALIGNED(width, 8) &&
IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
FastConvertYUVToARGBRow = FastConvertYUVToARGBRow_SSSE3;
I420ToARGBRow = I420ToARGBRow_SSSE3;
}
} else
#endif
{
FastConvertYUVToARGBRow = FastConvertYUVToARGBRow_C;
I420ToARGBRow = I420ToARGBRow_C;
}
for (int y = 0; y < height; ++y) {
FastConvertYUVToARGBRow(src_y, src_u, src_v, dst_argb, width);
I420ToARGBRow(src_y, src_u, src_v, dst_argb, width);
dst_argb += dst_stride_argb;
src_y += src_stride_y;
if (y & 1) {
......@@ -1529,32 +1092,32 @@ int I420ToBGRA(const uint8* src_y, int src_stride_y,
dst_bgra = dst_bgra + (height - 1) * dst_stride_bgra;
dst_stride_bgra = -dst_stride_bgra;
}
void (*FastConvertYUVToBGRARow)(const uint8* y_buf,
void (*I420ToBGRARow)(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width);
#if defined(HAS_FASTCONVERTYUVTOBGRAROW_NEON)
#if defined(HAS_I420TOBGRAROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
FastConvertYUVToBGRARow = FastConvertYUVToBGRAAnyRow_NEON;
I420ToBGRARow = I420ToBGRARow_Any_NEON;
if (IS_ALIGNED(width, 16)) {
FastConvertYUVToBGRARow = FastConvertYUVToBGRARow_NEON;
I420ToBGRARow = I420ToBGRARow_NEON;
}
} else
#elif defined(HAS_FASTCONVERTYUVTOBGRAROW_SSSE3)
#elif defined(HAS_I420TOBGRAROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
FastConvertYUVToBGRARow = FastConvertYUVToBGRAAnyRow_SSSE3;
I420ToBGRARow = I420ToBGRARow_Any_SSSE3;
if (IS_ALIGNED(width, 8) &&
IS_ALIGNED(dst_bgra, 16) && IS_ALIGNED(dst_stride_bgra, 16)) {
FastConvertYUVToBGRARow = FastConvertYUVToBGRARow_SSSE3;
I420ToBGRARow = I420ToBGRARow_SSSE3;
}
} else
#endif
{
FastConvertYUVToBGRARow = FastConvertYUVToBGRARow_C;
I420ToBGRARow = I420ToBGRARow_C;
}
for (int y = 0; y < height; ++y) {
FastConvertYUVToBGRARow(src_y, src_u, src_v, dst_bgra, width);
I420ToBGRARow(src_y, src_u, src_v, dst_bgra, width);
dst_bgra += dst_stride_bgra;
src_y += src_stride_y;
if (y & 1) {
......@@ -1577,32 +1140,32 @@ int I420ToABGR(const uint8* src_y, int src_stride_y,
dst_abgr = dst_abgr + (height - 1) * dst_stride_abgr;
dst_stride_abgr = -dst_stride_abgr;
}
void (*FastConvertYUVToABGRRow)(const uint8* y_buf,
void (*I420ToABGRRow)(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width);
#if defined(HAS_FASTCONVERTYUVTOABGRROW_NEON)
#if defined(HAS_I420TOABGRROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
FastConvertYUVToABGRRow = FastConvertYUVToABGRAnyRow_NEON;
I420ToABGRRow = I420ToABGRRow_Any_NEON;
if (IS_ALIGNED(width, 16)) {
FastConvertYUVToABGRRow = FastConvertYUVToABGRRow_NEON;
I420ToABGRRow = I420ToABGRRow_NEON;
}
} else
#elif defined(HAS_FASTCONVERTYUVTOABGRROW_SSSE3)
#elif defined(HAS_I420TOABGRROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
FastConvertYUVToABGRRow = FastConvertYUVToABGRAnyRow_SSSE3;
I420ToABGRRow = I420ToABGRRow_Any_SSSE3;
if (IS_ALIGNED(width, 8) &&
IS_ALIGNED(dst_abgr, 16) && IS_ALIGNED(dst_stride_abgr, 16)) {
FastConvertYUVToABGRRow = FastConvertYUVToABGRRow_SSSE3;
I420ToABGRRow = I420ToABGRRow_SSSE3;
}
} else
#endif
{
FastConvertYUVToABGRRow = FastConvertYUVToABGRRow_C;
I420ToABGRRow = I420ToABGRRow_C;
}
for (int y = 0; y < height; ++y) {
FastConvertYUVToABGRRow(src_y, src_u, src_v, dst_abgr, width);
I420ToABGRRow(src_y, src_u, src_v, dst_abgr, width);
dst_abgr += dst_stride_abgr;
src_y += src_stride_y;
if (y & 1) {
......@@ -1625,29 +1188,29 @@ int I420ToRGB24(const uint8* src_y, int src_stride_y,
dst_argb = dst_argb + (height - 1) * dst_stride_argb;
dst_stride_argb = -dst_stride_argb;
}
void (*FastConvertYUVToARGBRow)(const uint8* y_buf,
void (*I420ToARGBRow)(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width);
#if defined(HAS_FASTCONVERTYUVTOARGBROW_NEON)
#if defined(HAS_I420TOARGBROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
FastConvertYUVToARGBRow = FastConvertYUVToARGBRow_NEON;
I420ToARGBRow = I420ToARGBRow_NEON;
} else
#elif defined(HAS_FASTCONVERTYUVTOARGBROW_SSSE3)
#elif defined(HAS_I420TOARGBROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
FastConvertYUVToARGBRow = FastConvertYUVToARGBRow_SSSE3;
I420ToARGBRow = I420ToARGBRow_SSSE3;
} else
#endif
{
FastConvertYUVToARGBRow = FastConvertYUVToARGBRow_C;
I420ToARGBRow = I420ToARGBRow_C;
}
SIMD_ALIGNED(uint8 row[kMaxStride]);
void (*ARGBToRGB24Row)(const uint8* src_argb, uint8* dst_rgb, int pix);
#if defined(HAS_ARGBTORGB24ROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
ARGBToRGB24Row = ARGBToRGB24AnyRow_SSSE3;
ARGBToRGB24Row = ARGBToRGB24Row_Any_SSSE3;
if (IS_ALIGNED(width, 16) &&
IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
ARGBToRGB24Row = ARGBToRGB24Row_SSSE3;
......@@ -1659,7 +1222,7 @@ int I420ToRGB24(const uint8* src_y, int src_stride_y,
}
for (int y = 0; y < height; ++y) {
FastConvertYUVToARGBRow(src_y, src_u, src_v, row, width);
I420ToARGBRow(src_y, src_u, src_v, row, width);
ARGBToRGB24Row(row, dst_argb, width);
dst_argb += dst_stride_argb;
src_y += src_stride_y;
......@@ -1683,29 +1246,29 @@ int I420ToRAW(const uint8* src_y, int src_stride_y,
dst_argb = dst_argb + (height - 1) * dst_stride_argb;
dst_stride_argb = -dst_stride_argb;
}
void (*FastConvertYUVToARGBRow)(const uint8* y_buf,
void (*I420ToARGBRow)(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width);
#if defined(HAS_FASTCONVERTYUVTOARGBROW_NEON)
#if defined(HAS_I420TOARGBROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
FastConvertYUVToARGBRow = FastConvertYUVToARGBRow_NEON;
I420ToARGBRow = I420ToARGBRow_NEON;
} else
#elif defined(HAS_FASTCONVERTYUVTOARGBROW_SSSE3)
#elif defined(HAS_I420TOARGBROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
FastConvertYUVToARGBRow = FastConvertYUVToARGBRow_SSSE3;
I420ToARGBRow = I420ToARGBRow_SSSE3;
} else
#endif
{
FastConvertYUVToARGBRow = FastConvertYUVToARGBRow_C;
I420ToARGBRow = I420ToARGBRow_C;
}
SIMD_ALIGNED(uint8 row[kMaxStride]);
void (*ARGBToRAWRow)(const uint8* src_argb, uint8* dst_rgb, int pix);
#if defined(HAS_ARGBTORAWROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
ARGBToRAWRow = ARGBToRAWAnyRow_SSSE3;
ARGBToRAWRow = ARGBToRAWRow_Any_SSSE3;
if (IS_ALIGNED(width, 16) &&
IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
ARGBToRAWRow = ARGBToRAWRow_SSSE3;
......@@ -1717,7 +1280,7 @@ int I420ToRAW(const uint8* src_y, int src_stride_y,
}
for (int y = 0; y < height; ++y) {
FastConvertYUVToARGBRow(src_y, src_u, src_v, row, width);
I420ToARGBRow(src_y, src_u, src_v, row, width);
ARGBToRAWRow(row, dst_argb, width);
dst_argb += dst_stride_argb;
src_y += src_stride_y;
......@@ -1741,29 +1304,29 @@ int I420ToRGB565(const uint8* src_y, int src_stride_y,
dst_rgb = dst_rgb + (height - 1) * dst_stride_rgb;
dst_stride_rgb = -dst_stride_rgb;
}
void (*FastConvertYUVToARGBRow)(const uint8* y_buf,
void (*I420ToARGBRow)(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width);
#if defined(HAS_FASTCONVERTYUVTOARGBROW_NEON)
#if defined(HAS_I420TOARGBROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
FastConvertYUVToARGBRow = FastConvertYUVToARGBRow_NEON;
I420ToARGBRow = I420ToARGBRow_NEON;
} else
#elif defined(HAS_FASTCONVERTYUVTOARGBROW_SSSE3)
#elif defined(HAS_I420TOARGBROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
FastConvertYUVToARGBRow = FastConvertYUVToARGBRow_SSSE3;
I420ToARGBRow = I420ToARGBRow_SSSE3;
} else
#endif
{
FastConvertYUVToARGBRow = FastConvertYUVToARGBRow_C;
I420ToARGBRow = I420ToARGBRow_C;
}
SIMD_ALIGNED(uint8 row[kMaxStride]);
void (*ARGBToRGB565Row)(const uint8* src_rgb, uint8* dst_rgb, int pix);
#if defined(HAS_ARGBTORGB565ROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) {
ARGBToRGB565Row = ARGBToRGB565AnyRow_SSE2;
ARGBToRGB565Row = ARGBToRGB565Row_Any_SSE2;
if (IS_ALIGNED(width, 4)) {
ARGBToRGB565Row = ARGBToRGB565Row_SSE2;
}
......@@ -1774,7 +1337,7 @@ int I420ToRGB565(const uint8* src_y, int src_stride_y,
}
for (int y = 0; y < height; ++y) {
FastConvertYUVToARGBRow(src_y, src_u, src_v, row, width);
I420ToARGBRow(src_y, src_u, src_v, row, width);
ARGBToRGB565Row(row, dst_rgb, width);
dst_rgb += dst_stride_rgb;
src_y += src_stride_y;
......@@ -1798,29 +1361,29 @@ int I420ToARGB1555(const uint8* src_y, int src_stride_y,
dst_argb = dst_argb + (height - 1) * dst_stride_argb;
dst_stride_argb = -dst_stride_argb;
}
void (*FastConvertYUVToARGBRow)(const uint8* y_buf,
void (*I420ToARGBRow)(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width);
#if defined(HAS_FASTCONVERTYUVTOARGBROW_NEON)
#if defined(HAS_I420TOARGBROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
FastConvertYUVToARGBRow = FastConvertYUVToARGBRow_NEON;
I420ToARGBRow = I420ToARGBRow_NEON;
} else
#elif defined(HAS_FASTCONVERTYUVTOARGBROW_SSSE3)
#elif defined(HAS_I420TOARGBROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
FastConvertYUVToARGBRow = FastConvertYUVToARGBRow_SSSE3;
I420ToARGBRow = I420ToARGBRow_SSSE3;
} else
#endif
{
FastConvertYUVToARGBRow = FastConvertYUVToARGBRow_C;
I420ToARGBRow = I420ToARGBRow_C;
}
SIMD_ALIGNED(uint8 row[kMaxStride]);
void (*ARGBToARGB1555Row)(const uint8* src_argb, uint8* dst_rgb, int pix);
#if defined(HAS_ARGBTOARGB1555ROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) {
ARGBToARGB1555Row = ARGBToARGB1555AnyRow_SSE2;
ARGBToARGB1555Row = ARGBToARGB1555Row_Any_SSE2;
if (IS_ALIGNED(width, 4)) {
ARGBToARGB1555Row = ARGBToARGB1555Row_SSE2;
}
......@@ -1831,7 +1394,7 @@ int I420ToARGB1555(const uint8* src_y, int src_stride_y,
}
for (int y = 0; y < height; ++y) {
FastConvertYUVToARGBRow(src_y, src_u, src_v, row, width);
I420ToARGBRow(src_y, src_u, src_v, row, width);
ARGBToARGB1555Row(row, dst_argb, width);
dst_argb += dst_stride_argb;
src_y += src_stride_y;
......@@ -1855,29 +1418,29 @@ int I420ToARGB4444(const uint8* src_y, int src_stride_y,
dst_argb = dst_argb + (height - 1) * dst_stride_argb;
dst_stride_argb = -dst_stride_argb;
}
void (*FastConvertYUVToARGBRow)(const uint8* y_buf,
void (*I420ToARGBRow)(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width);
#if defined(HAS_FASTCONVERTYUVTOARGBROW_NEON)
#if defined(HAS_I420TOARGBROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
FastConvertYUVToARGBRow = FastConvertYUVToARGBRow_NEON;
I420ToARGBRow = I420ToARGBRow_NEON;
} else
#elif defined(HAS_FASTCONVERTYUVTOARGBROW_SSSE3)
#elif defined(HAS_I420TOARGBROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
FastConvertYUVToARGBRow = FastConvertYUVToARGBRow_SSSE3;
I420ToARGBRow = I420ToARGBRow_SSSE3;
} else
#endif
{
FastConvertYUVToARGBRow = FastConvertYUVToARGBRow_C;
I420ToARGBRow = I420ToARGBRow_C;
}
SIMD_ALIGNED(uint8 row[kMaxStride]);
void (*ARGBToARGB4444Row)(const uint8* src_argb, uint8* dst_rgb, int pix);
#if defined(HAS_ARGBTOARGB4444ROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) {
ARGBToARGB4444Row = ARGBToARGB4444AnyRow_SSE2;
ARGBToARGB4444Row = ARGBToARGB4444Row_Any_SSE2;
if (IS_ALIGNED(width, 4)) {
ARGBToARGB4444Row = ARGBToARGB4444Row_SSE2;
}
......@@ -1888,7 +1451,7 @@ int I420ToARGB4444(const uint8* src_y, int src_stride_y,
}
for (int y = 0; y < height; ++y) {
FastConvertYUVToARGBRow(src_y, src_u, src_v, row, width);
I420ToARGBRow(src_y, src_u, src_v, row, width);
ARGBToARGB4444Row(row, dst_argb, width);
dst_argb += dst_stride_argb;
src_y += src_stride_y;
......@@ -1912,33 +1475,33 @@ int I422ToARGB(const uint8* src_y, int src_stride_y,
dst_argb = dst_argb + (height - 1) * dst_stride_argb;
dst_stride_argb = -dst_stride_argb;
}
void (*FastConvertYUVToARGBRow)(const uint8* y_buf,
void (*I420ToARGBRow)(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width);
#if defined(HAS_FASTCONVERTYUVTOARGBROW_NEON)
#if defined(HAS_I420TOARGBROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
FastConvertYUVToARGBRow = FastConvertYUVToARGBAnyRow_NEON;
I420ToARGBRow = I420ToARGBRow_Any_NEON;
if (IS_ALIGNED(width, 16)) {
FastConvertYUVToARGBRow = FastConvertYUVToARGBRow_NEON;
I420ToARGBRow = I420ToARGBRow_NEON;
}
} else
#elif defined(HAS_FASTCONVERTYUVTOARGBROW_SSSE3)
#elif defined(HAS_I420TOARGBROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
FastConvertYUVToARGBRow = FastConvertYUVToARGBAnyRow_SSSE3;
I420ToARGBRow = I420ToARGBRow_Any_SSSE3;
if (IS_ALIGNED(width, 8) &&
IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
FastConvertYUVToARGBRow = FastConvertYUVToARGBRow_SSSE3;
I420ToARGBRow = I420ToARGBRow_SSSE3;
}
} else
#endif
{
FastConvertYUVToARGBRow = FastConvertYUVToARGBRow_C;
I420ToARGBRow = I420ToARGBRow_C;
}
for (int y = 0; y < height; ++y) {
FastConvertYUVToARGBRow(src_y, src_u, src_v, dst_argb, width);
I420ToARGBRow(src_y, src_u, src_v, dst_argb, width);
dst_argb += dst_stride_argb;
src_y += src_stride_y;
src_u += src_stride_u;
......@@ -1959,23 +1522,23 @@ int I444ToARGB(const uint8* src_y, int src_stride_y,
dst_argb = dst_argb + (height - 1) * dst_stride_argb;
dst_stride_argb = -dst_stride_argb;
}
void (*FastConvertYUV444ToARGBRow)(const uint8* y_buf,
void (*I444ToARGBRow)(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width);
#if defined(HAS_FASTCONVERTYUV444TOARGBROW_SSSE3)
#if defined(HAS_I444TOARGBROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3) &&
IS_ALIGNED(width, 8) &&
IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
FastConvertYUV444ToARGBRow = FastConvertYUV444ToARGBRow_SSSE3;
I444ToARGBRow = I444ToARGBRow_SSSE3;
} else
#endif
{
FastConvertYUV444ToARGBRow = FastConvertYUV444ToARGBRow_C;
I444ToARGBRow = I444ToARGBRow_C;
}
for (int y = 0; y < height; ++y) {
FastConvertYUV444ToARGBRow(src_y, src_u, src_v, dst_argb, width);
I444ToARGBRow(src_y, src_u, src_v, dst_argb, width);
dst_argb += dst_stride_argb;
src_y += src_stride_y;
src_u += src_stride_u;
......@@ -1994,21 +1557,21 @@ int I400ToARGB_Reference(const uint8* src_y, int src_stride_y,
dst_argb = dst_argb + (height - 1) * dst_stride_argb;
dst_stride_argb = -dst_stride_argb;
}
void (*FastConvertYToARGBRow)(const uint8* y_buf,
void (*YToARGBRow)(const uint8* y_buf,
uint8* rgb_buf,
int width);
#if defined(HAS_FASTCONVERTYTOARGBROW_SSE2)
#if defined(HAS_YTOARGBROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2) &&
IS_ALIGNED(width, 8) &&
IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
FastConvertYToARGBRow = FastConvertYToARGBRow_SSE2;
YToARGBRow = YToARGBRow_SSE2;
} else
#endif
{
FastConvertYToARGBRow = FastConvertYToARGBRow_C;
YToARGBRow = YToARGBRow_C;
}
for (int y = 0; y < height; ++y) {
FastConvertYToARGBRow(src_y, dst_argb, width);
YToARGBRow(src_y, dst_argb, width);
dst_argb += dst_stride_argb;
src_y += src_stride_y;
}
......@@ -2205,7 +1768,7 @@ int ARGBToRGB24(const uint8* src_argb, int src_stride_argb,
#if defined(HAS_ARGBTORGB24ROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3) &&
IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) {
ARGBToRGB24Row = ARGBToRGB24AnyRow_SSSE3;
ARGBToRGB24Row = ARGBToRGB24Row_Any_SSSE3;
if (IS_ALIGNED(width, 16) &&
IS_ALIGNED(dst_rgb24, 16) && IS_ALIGNED(dst_stride_rgb24, 16)) {
ARGBToRGB24Row = ARGBToRGB24Row_SSSE3;
......@@ -2237,7 +1800,7 @@ int ARGBToRAW(const uint8* src_argb, int src_stride_argb,
#if defined(HAS_ARGBTORAWROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3) &&
IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) {
ARGBToRAWRow = ARGBToRAWAnyRow_SSSE3;
ARGBToRAWRow = ARGBToRAWRow_Any_SSSE3;
if (IS_ALIGNED(width, 16) &&
IS_ALIGNED(dst_raw, 16) && IS_ALIGNED(dst_stride_raw, 16)) {
ARGBToRAWRow = ARGBToRAWRow_SSSE3;
......@@ -2267,29 +1830,29 @@ int NV12ToARGB(const uint8* src_y, int src_stride_y,
dst_argb = dst_argb + (height - 1) * dst_stride_argb;
dst_stride_argb = -dst_stride_argb;
}
void (*FastConvertYUVToARGBRow)(const uint8* y_buf,
void (*I420ToARGBRow)(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* argb_buf,
int width);
#if defined(HAS_FASTCONVERTYUVTOARGBROW_NEON)
#if defined(HAS_I420TOARGBROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
FastConvertYUVToARGBRow = FastConvertYUVToARGBAnyRow_NEON;
I420ToARGBRow = I420ToARGBRow_Any_NEON;
if (IS_ALIGNED(width, 16)) {
FastConvertYUVToARGBRow = FastConvertYUVToARGBRow_NEON;
I420ToARGBRow = I420ToARGBRow_NEON;
}
} else
#elif defined(HAS_FASTCONVERTYUVTOARGBROW_SSSE3)
#elif defined(HAS_I420TOARGBROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
FastConvertYUVToARGBRow = FastConvertYUVToARGBAnyRow_SSSE3;
I420ToARGBRow = I420ToARGBRow_Any_SSSE3;
if (IS_ALIGNED(width, 8) &&
IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
FastConvertYUVToARGBRow = FastConvertYUVToARGBRow_SSSE3;
I420ToARGBRow = I420ToARGBRow_SSSE3;
}
} else
#endif
{
FastConvertYUVToARGBRow = FastConvertYUVToARGBRow_C;
I420ToARGBRow = I420ToARGBRow_C;
}
int halfwidth = (width + 1) >> 1;
......@@ -2315,7 +1878,7 @@ int NV12ToARGB(const uint8* src_y, int src_stride_y,
SplitUV(src_uv, rowuv, rowuv + kMaxStride, halfwidth);
src_uv += src_stride_uv;
}
FastConvertYUVToARGBRow(src_y, rowuv, rowuv + kMaxStride, dst_argb, width);
I420ToARGBRow(src_y, rowuv, rowuv + kMaxStride, dst_argb, width);
dst_argb += dst_stride_argb;
src_y += src_stride_y;
}
......@@ -2333,22 +1896,22 @@ int NV12ToRGB565(const uint8* src_y, int src_stride_y,
dst_rgb = dst_rgb + (height - 1) * dst_stride_rgb;
dst_stride_rgb = -dst_stride_rgb;
}
void (*FastConvertYUVToARGBRow)(const uint8* y_buf,
void (*I420ToARGBRow)(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width);
#if defined(HAS_FASTCONVERTYUVTOARGBROW_NEON)
#if defined(HAS_I420TOARGBROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
FastConvertYUVToARGBRow = FastConvertYUVToARGBRow_NEON;
I420ToARGBRow = I420ToARGBRow_NEON;
} else
#elif defined(HAS_FASTCONVERTYUVTOARGBROW_SSSE3)
#elif defined(HAS_I420TOARGBROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
FastConvertYUVToARGBRow = FastConvertYUVToARGBRow_SSSE3;
I420ToARGBRow = I420ToARGBRow_SSSE3;
} else
#endif
{
FastConvertYUVToARGBRow = FastConvertYUVToARGBRow_C;
I420ToARGBRow = I420ToARGBRow_C;
}
SIMD_ALIGNED(uint8 row[kMaxStride]);
......@@ -2385,7 +1948,7 @@ int NV12ToRGB565(const uint8* src_y, int src_stride_y,
SplitUV(src_uv, rowuv, rowuv + kMaxStride, halfwidth);
src_uv += src_stride_uv;
}
FastConvertYUVToARGBRow(src_y, rowuv, rowuv + kMaxStride, row, width);
I420ToARGBRow(src_y, rowuv, rowuv + kMaxStride, row, width);
ARGBToRGB565Row(row, dst_rgb, width);
dst_rgb += dst_stride_rgb;
src_y += src_stride_y;
......
......@@ -13,6 +13,11 @@
#include "libyuv/basic_types.h"
#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif
#define kMaxStride (2048 * 4)
#define IS_ALIGNED(p, a) (!((uintptr_t)(p) & ((a) - 1)))
......@@ -34,13 +39,15 @@
#define HAS_BGRATOUVROW_SSSE3
#define HAS_ABGRTOUVROW_SSSE3
#define HAS_I400TOARGBROW_SSE2
#define HAS_FASTCONVERTYTOARGBROW_SSE2
#define HAS_FASTCONVERTYUVTOARGBROW_SSSE3
#define HAS_FASTCONVERTYUVTOBGRAROW_SSSE3
#define HAS_FASTCONVERTYUVTOABGRROW_SSSE3
#define HAS_FASTCONVERTYUV444TOARGBROW_SSSE3
#define HAS_YTOARGBROW_SSE2
#define HAS_I420TOARGBROW_SSSE3
#define HAS_I420TOBGRAROW_SSSE3
#define HAS_I420TOABGRROW_SSSE3
#define HAS_I444TOARGBROW_SSSE3
#define HAS_MIRRORROW_SSSE3
#define HAS_MIRRORROW_SSE2
#define HAS_YUY2TOI420ROW_SSE2
#define HAS_UYVYTOI420ROW_SSE2
#endif
// The following are available on Windows platforms
......@@ -48,7 +55,6 @@
#define HAS_RGB565TOARGBROW_SSE2
#define HAS_ARGB1555TOARGBROW_SSE2
#define HAS_ARGB4444TOARGBROW_SSE2
#define HAS_ARGBTORGB24ROW_SSSE3
#define HAS_ARGBTORAWROW_SSSE3
#define HAS_ARGBTORGB565ROW_SSE2
......@@ -59,14 +65,9 @@
// The following are available on Neon platforms
#if defined(__ARM_NEON__) && !defined(YUV_DISABLE_ASM)
#define HAS_MIRRORROW_NEON
#define HAS_FASTCONVERTYUVTOARGBROW_NEON
#define HAS_FASTCONVERTYUVTOBGRAROW_NEON
#define HAS_FASTCONVERTYUVTOABGRROW_NEON
#endif
#ifdef __cplusplus
namespace libyuv {
extern "C" {
#define HAS_I420TOARGBROW_NEON
#define HAS_I420TOBGRAROW_NEON
#define HAS_I420TOABGRROW_NEON
#endif
#if defined(_MSC_VER)
......@@ -81,21 +82,21 @@ typedef unsigned char __attribute__((vector_size(16))) uvec8;
typedef signed short __attribute__((vector_size(16))) vec16;
#endif
void FastConvertYUVToARGBRow_NEON(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width);
void FastConvertYUVToBGRARow_NEON(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width);
void FastConvertYUVToABGRRow_NEON(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width);
void I420ToARGBRow_NEON(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width);
void I420ToBGRARow_NEON(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width);
void I420ToABGRRow_NEON(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width);
void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix);
void BGRAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix);
......@@ -164,114 +165,144 @@ void ARGBToARGB4444Row_C(const uint8* src_argb, uint8* dst_rgb, int pix);
void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix);
void I400ToARGBRow_C(const uint8* src_y, uint8* dst_argb, int pix);
void FastConvertYUVToARGBRow_C(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width);
void FastConvertYUVToBGRARow_C(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width);
void FastConvertYUVToABGRRow_C(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width);
void FastConvertYUV444ToARGBRow_C(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width);
void FastConvertYToARGBRow_C(const uint8* y_buf,
void I420ToARGBRow_C(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width);
void I420ToBGRARow_C(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width);
void I420ToABGRRow_C(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width);
void I444ToARGBRow_C(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width);
void YToARGBRow_C(const uint8* y_buf,
uint8* rgb_buf,
int width);
void FastConvertYUVToARGBRow_SSSE3(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width);
void FastConvertYUVToBGRARow_SSSE3(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width);
void FastConvertYUVToABGRRow_SSSE3(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width);
void FastConvertYUV444ToARGBRow_SSSE3(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width);
void FastConvertYToARGBRow_SSE2(const uint8* y_buf,
uint8* rgb_buf,
int width);
void I420ToARGBRow_SSSE3(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width);
void I420ToBGRARow_SSSE3(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width);
void I420ToABGRRow_SSSE3(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width);
void I444ToARGBRow_SSSE3(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width);
void YToARGBRow_SSE2(const uint8* y_buf,
uint8* rgb_buf,
int width);
// 'Any' wrappers use memcpy()
void FastConvertYUVToARGBAnyRow_SSSE3(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width);
void FastConvertYUVToBGRAAnyRow_SSSE3(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width);
void FastConvertYUVToABGRAnyRow_SSSE3(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width);
void ARGBToRGB24AnyRow_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix);
void ARGBToRAWAnyRow_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix);
void ARGBToRGB565AnyRow_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix);
void ARGBToARGB1555AnyRow_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix);
void ARGBToARGB4444AnyRow_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix);
void ARGBToYAnyRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix);
void BGRAToYAnyRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix);
void ABGRToYAnyRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix);
void ARGBToUVAnyRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
void I420ToARGBRow_Any_SSSE3(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width);
void I420ToBGRARow_Any_SSSE3(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width);
void I420ToABGRRow_Any_SSSE3(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width);
void ARGBToRGB24Row_Any_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix);
void ARGBToRAWRow_Any_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix);
void ARGBToRGB565Row_Any_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix);
void ARGBToARGB1555Row_Any_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix);
void ARGBToARGB4444Row_Any_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix);
void ARGBToYRow_Any_SSSE3(const uint8* src_argb, uint8* dst_y, int pix);
void BGRAToYRow_Any_SSSE3(const uint8* src_argb, uint8* dst_y, int pix);
void ABGRToYRow_Any_SSSE3(const uint8* src_argb, uint8* dst_y, int pix);
void ARGBToUVRow_Any_SSSE3(const uint8* src_argb0, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int width);
void BGRAToUVAnyRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
void BGRAToUVRow_Any_SSSE3(const uint8* src_argb0, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int width);
void ABGRToUVAnyRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
void ABGRToUVRow_Any_SSSE3(const uint8* src_argb0, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int width);
void FastConvertYUVToARGBAnyRow_NEON(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width);
void FastConvertYUVToBGRAAnyRow_NEON(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width);
void FastConvertYUVToABGRAnyRow_NEON(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width);
void I420ToARGBRow_Any_NEON(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width);
void I420ToBGRARow_Any_NEON(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width);
void I420ToABGRRow_Any_NEON(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width);
void YUY2ToYRow_SSE2(const uint8* src_yuy2, uint8* dst_y, int pix);
void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2,
uint8* dst_u, uint8* dst_y, int pix);
void YUY2ToYRow_Unaligned_SSE2(const uint8* src_yuy2,
uint8* dst_y, int pix);
void YUY2ToUVRow_Unaligned_SSE2(const uint8* src_yuy2, int stride_yuy2,
uint8* dst_u, uint8* dst_y, int pix);
void UYVYToYRow_SSE2(const uint8* src_uyvy, uint8* dst_y, int pix);
void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy,
uint8* dst_u, uint8* dst_y, int pix);
void UYVYToYRow_Unaligned_SSE2(const uint8* src_uyvy,
uint8* dst_y, int pix);
void UYVYToUVRow_Unaligned_SSE2(const uint8* src_uyvy, int stride_uyvy,
uint8* dst_u, uint8* dst_y, int pix);
void YUY2ToUVRow_C(const uint8* src_yuy2, int src_stride_yuy2,
uint8* dst_u, uint8* dst_v, int pix);
void YUY2ToYRow_C(const uint8* src_yuy2, uint8* dst_y, int pix);
void UYVYToUVRow_C(const uint8* src_uyvy, int src_stride_uyvy,
uint8* dst_u, uint8* dst_v, int pix);
void UYVYToYRow_C(const uint8* src_uyvy, uint8* dst_y, int pix);
void YUY2ToUVRow_Any_SSE2(const uint8* src_yuy2, int src_stride_yuy2,
uint8* dst_u, uint8* dst_v, int pix);
void YUY2ToYRow_Any_SSE2(const uint8* src_yuy2, uint8* dst_y, int pix);
void UYVYToUVRow_Any_SSE2(const uint8* src_uyvy, int src_stride_uyvy,
uint8* dst_u, uint8* dst_v, int pix);
void UYVYToYRow_Any_SSE2(const uint8* src_uyvy, uint8* dst_y, int pix);
#ifdef __cplusplus
} // extern "C"
......
......@@ -271,7 +271,7 @@ static __inline uint32 Clip(int32 val) {
}
static __inline void YuvPixel(uint8 y, uint8 u, uint8 v, uint8* rgb_buf,
int ashift, int rshift, int gshift, int bshift) {
int ashift, int rshift, int gshift, int bshift) {
int32 y1 = (static_cast<int32>(y) - 16) * YG;
uint32 b = Clip(static_cast<int32>((u * UB + v * VB) - (BB) + y1) >> 6);
uint32 g = Clip(static_cast<int32>((u * UG + v * VG) - (BG) + y1) >> 6);
......@@ -282,11 +282,11 @@ static __inline void YuvPixel(uint8 y, uint8 u, uint8 v, uint8* rgb_buf,
(255u << ashift);
}
void FastConvertYUVToARGBRow_C(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width) {
void I420ToARGBRow_C(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width) {
for (int x = 0; x < width - 1; x += 2) {
YuvPixel(y_buf[0], u_buf[0], v_buf[0], rgb_buf + 0, 24, 16, 8, 0);
YuvPixel(y_buf[1], u_buf[0], v_buf[0], rgb_buf + 4, 24, 16, 8, 0);
......@@ -300,11 +300,11 @@ void FastConvertYUVToARGBRow_C(const uint8* y_buf,
}
}
void FastConvertYUVToBGRARow_C(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width) {
void I420ToBGRARow_C(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width) {
for (int x = 0; x < width - 1; x += 2) {
YuvPixel(y_buf[0], u_buf[0], v_buf[0], rgb_buf + 0, 0, 8, 16, 24);
YuvPixel(y_buf[1], u_buf[0], v_buf[0], rgb_buf + 4, 0, 8, 16, 24);
......@@ -318,11 +318,11 @@ void FastConvertYUVToBGRARow_C(const uint8* y_buf,
}
}
void FastConvertYUVToABGRRow_C(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width) {
void I420ToABGRRow_C(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width) {
for (int x = 0; x < width - 1; x += 2) {
YuvPixel(y_buf[0], u_buf[0], v_buf[0], rgb_buf + 0, 24, 0, 8, 16);
YuvPixel(y_buf[1], u_buf[0], v_buf[0], rgb_buf + 4, 24, 0, 8, 16);
......@@ -336,11 +336,11 @@ void FastConvertYUVToABGRRow_C(const uint8* y_buf,
}
}
void FastConvertYUV444ToARGBRow_C(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width) {
void I444ToARGBRow_C(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width) {
for (int x = 0; x < width; ++x) {
YuvPixel(y_buf[0], u_buf[0], v_buf[0], rgb_buf, 24, 16, 8, 0);
y_buf += 1;
......@@ -350,9 +350,9 @@ void FastConvertYUV444ToARGBRow_C(const uint8* y_buf,
}
}
void FastConvertYToARGBRow_C(const uint8* y_buf,
uint8* rgb_buf,
int width) {
void YToARGBRow_C(const uint8* y_buf,
uint8* rgb_buf,
int width) {
for (int x = 0; x < width; ++x) {
YuvPixel(y_buf[0], 128, 128, rgb_buf, 24, 16, 8, 0);
y_buf += 1;
......@@ -368,6 +368,51 @@ void MirrorRow_C(const uint8* src, uint8* dst, int width) {
}
}
// Filter 2 rows of YUY2 UV's (422) into U and V (420)
void YUY2ToUVRow_C(const uint8* src_yuy2, int src_stride_yuy2,
uint8* dst_u, uint8* dst_v, int pix) {
// Output a row of UV values, filtering 2 rows of YUY2
for (int x = 0; x < pix; x += 2) {
dst_u[0] = (src_yuy2[1] + src_yuy2[src_stride_yuy2 + 1] + 1) >> 1;
dst_v[0] = (src_yuy2[3] + src_yuy2[src_stride_yuy2 + 3] + 1) >> 1;
src_yuy2 += 4;
dst_u += 1;
dst_v += 1;
}
}
void YUY2ToYRow_C(const uint8* src_yuy2,
uint8* dst_y, int pix) {
// Copy a row of yuy2 Y values
for (int x = 0; x < pix; ++x) {
dst_y[0] = src_yuy2[0];
src_yuy2 += 2;
dst_y += 1;
}
}
void UYVYToUVRow_C(const uint8* src_uyvy, int src_stride_uyvy,
uint8* dst_u, uint8* dst_v, int pix) {
// Filter 2 rows of UYVY UV values (422) into U and V (420)
for (int x = 0; x < pix; x += 2) {
dst_u[0] = (src_uyvy[0] + src_uyvy[src_stride_uyvy + 0] + 1) >> 1;
dst_v[0] = (src_uyvy[2] + src_uyvy[src_stride_uyvy + 2] + 1) >> 1;
src_uyvy += 4;
dst_u += 1;
dst_v += 1;
}
}
void UYVYToYRow_C(const uint8* src_uyvy,
uint8* dst_y, int pix) {
// Copy a row of uyvy Y values
for (int x = 0; x < pix; ++x) {
dst_y[0] = src_uyvy[1];
src_uyvy += 2;
dst_y += 1;
}
}
// Wrappers to handle odd sizes/alignments
#define MAKEYUVANY(NAMEANY, NAME) \
void NAMEANY(const uint8* y_buf, \
......@@ -380,15 +425,15 @@ void NAMEANY(const uint8* y_buf, \
memcpy(rgb_buf, row, width << 2); \
}
#if defined(HAS_FASTCONVERTYUVTOARGBROW_SSSE3)
MAKEYUVANY(FastConvertYUVToARGBAnyRow_SSSE3, FastConvertYUVToARGBRow_SSSE3)
MAKEYUVANY(FastConvertYUVToBGRAAnyRow_SSSE3, FastConvertYUVToBGRARow_SSSE3)
MAKEYUVANY(FastConvertYUVToABGRAnyRow_SSSE3, FastConvertYUVToABGRRow_SSSE3)
#if defined(HAS_I420TOARGBROW_SSSE3)
MAKEYUVANY(I420ToARGBRow_Any_SSSE3, I420ToARGBRow_SSSE3)
MAKEYUVANY(I420ToBGRARow_Any_SSSE3, I420ToBGRARow_SSSE3)
MAKEYUVANY(I420ToABGRRow_Any_SSSE3, I420ToABGRRow_SSSE3)
#endif
#if defined(HAS_FASTCONVERTYUVTOARGBROW_NEON)
MAKEYUVANY(FastConvertYUVToARGBAnyRow_NEON, FastConvertYUVToARGBRow_NEON)
MAKEYUVANY(FastConvertYUVToBGRAAnyRow_NEON, FastConvertYUVToBGRARow_NEON)
MAKEYUVANY(FastConvertYUVToABGRAnyRow_NEON, FastConvertYUVToABGRRow_NEON)
#if defined(HAS_I420TOARGBROW_NEON)
MAKEYUVANY(I420ToARGBRow_Any_NEON, I420ToARGBRow_NEON)
MAKEYUVANY(I420ToBGRARow_Any_NEON, I420ToBGRARow_NEON)
MAKEYUVANY(I420ToABGRRow_Any_NEON, I420ToABGRRow_NEON)
#endif
#define MAKEYUVANYRGB(NAMEANY, ARGBTORGB, BPP) \
......@@ -401,27 +446,29 @@ void NAMEANY(const uint8* argb_buf, \
}
#if defined(HAS_ARGBTORGB24ROW_SSSE3)
MAKEYUVANYRGB(ARGBToRGB24AnyRow_SSSE3, ARGBToRGB24Row_SSSE3, 3)
MAKEYUVANYRGB(ARGBToRAWAnyRow_SSSE3, ARGBToRAWRow_SSSE3, 3)
MAKEYUVANYRGB(ARGBToRGB565AnyRow_SSE2, ARGBToRGB565Row_SSE2, 2)
MAKEYUVANYRGB(ARGBToARGB1555AnyRow_SSE2, ARGBToARGB1555Row_SSE2, 2)
MAKEYUVANYRGB(ARGBToARGB4444AnyRow_SSE2, ARGBToARGB4444Row_SSE2, 2)
MAKEYUVANYRGB(ARGBToRGB24Row_Any_SSSE3, ARGBToRGB24Row_SSSE3, 3)
MAKEYUVANYRGB(ARGBToRAWRow_Any_SSSE3, ARGBToRAWRow_SSSE3, 3)
MAKEYUVANYRGB(ARGBToRGB565Row_Any_SSE2, ARGBToRGB565Row_SSE2, 2)
MAKEYUVANYRGB(ARGBToARGB1555Row_Any_SSE2, ARGBToARGB1555Row_SSE2, 2)
MAKEYUVANYRGB(ARGBToARGB4444Row_Any_SSE2, ARGBToARGB4444Row_SSE2, 2)
#endif
#ifdef HAS_ARGBTOYROW_SSSE3
#define MAKEARGBTOYANY(NAMEANY, ARGBTOY) \
#define MAKEANYTOYANY(NAMEANY, ARGBTOY) \
void NAMEANY(const uint8* src_argb, uint8* dst_y, int width) { \
SIMD_ALIGNED(uint8 row[kMaxStride]); \
ARGBTOY(src_argb, row, width); \
memcpy(dst_y, row, width); \
}
MAKEARGBTOYANY(ARGBToYAnyRow_SSSE3, ARGBToYRow_Unaligned_SSSE3)
MAKEARGBTOYANY(BGRAToYAnyRow_SSSE3, BGRAToYRow_Unaligned_SSSE3)
MAKEARGBTOYANY(ABGRToYAnyRow_SSSE3, ABGRToYRow_Unaligned_SSSE3)
MAKEANYTOYANY(ARGBToYRow_Any_SSSE3, ARGBToYRow_Unaligned_SSSE3)
MAKEANYTOYANY(BGRAToYRow_Any_SSSE3, BGRAToYRow_Unaligned_SSSE3)
MAKEANYTOYANY(ABGRToYRow_Any_SSSE3, ABGRToYRow_Unaligned_SSSE3)
MAKEANYTOYANY(YUY2ToYRow_Any_SSE2, YUY2ToYRow_Unaligned_SSE2)
MAKEANYTOYANY(UYVYToYRow_Any_SSE2, UYVYToYRow_Unaligned_SSE2)
#define MAKEARGBTOUVANY(NAMEANY, ARGBTOUV) \
#define MAKEANYTOUVANY(NAMEANY, ARGBTOUV) \
void NAMEANY(const uint8* src_argb0, int src_stride_argb, \
uint8* dst_u, uint8* dst_v, int width) { \
SIMD_ALIGNED(uint8 row[kMaxStride * 2]); \
......@@ -431,9 +478,11 @@ MAKEARGBTOYANY(ABGRToYAnyRow_SSSE3, ABGRToYRow_Unaligned_SSSE3)
memcpy(dst_v, row + kMaxStride, halfwidth); \
}
MAKEARGBTOUVANY(ARGBToUVAnyRow_SSSE3, ARGBToUVRow_Unaligned_SSSE3)
MAKEARGBTOUVANY(BGRAToUVAnyRow_SSSE3, BGRAToUVRow_Unaligned_SSSE3)
MAKEARGBTOUVANY(ABGRToUVAnyRow_SSSE3, ABGRToUVRow_Unaligned_SSSE3)
MAKEANYTOUVANY(ARGBToUVRow_Any_SSSE3, ARGBToUVRow_Unaligned_SSSE3)
MAKEANYTOUVANY(BGRAToUVRow_Any_SSSE3, BGRAToUVRow_Unaligned_SSSE3)
MAKEANYTOUVANY(ABGRToUVRow_Any_SSSE3, ABGRToUVRow_Unaligned_SSSE3)
MAKEANYTOUVANY(YUY2ToUVRow_Any_SSE2, YUY2ToUVRow_Unaligned_SSE2)
MAKEANYTOUVANY(UYVYToUVRow_Any_SSE2, UYVYToUVRow_Unaligned_SSE2)
#endif
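// Illustrative expansion (not part of this change): with MAKEANYTOYANY above,
// YUY2ToYRow_Any_SSE2 effectively becomes the function below (parameter names
// come from the macro). The Unaligned kernel may round the width up to a
// multiple of 16, but the overrun lands in the kMaxStride scratch row, which
// is presumably sized for that, and only 'width' bytes are copied out to the
// caller's odd-width destination.
//
//   void YUY2ToYRow_Any_SSE2(const uint8* src_argb, uint8* dst_y, int width) {
//     SIMD_ALIGNED(uint8 row[kMaxStride]);
//     YUY2ToYRow_Unaligned_SSE2(src_argb, row, width);
//     memcpy(dst_y, row, width);
//   }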
#ifdef __cplusplus
......
......@@ -55,19 +55,19 @@ extern "C" {
"vtrn.u8 d22, d23 \n" \
"vtrn.u8 d16, d17 \n" \
#if defined(HAS_FASTCONVERTYUVTOARGBROW_NEON) || \
defined(HAS_FASTCONVERTYUVTOBGRAROW_NEON) || \
defined(HAS_FASTCONVERTYUVTOABGRROW_NEON)
#if defined(HAS_I420TOARGBROW_NEON) || \
defined(HAS_I420TOBGRAROW_NEON) || \
defined(HAS_I420TOABGRROW_NEON)
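// Fixed-point (6-bit) UV coefficients: 127 is 2.018 * 64 saturated to int8
// (U contribution to B), 102 is 1.596 * 64 (V contribution to R), -25 and -52
// are -0.391 * 64 and -0.813 * 64 (U and V contributions to G). The
// four-and-four packing presumably lets one vector multiply cover both halves.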
static const vec8 kUVToRB[8] = { 127, 127, 127, 127, 102, 102, 102, 102 };
static const vec8 kUVToG[8] = { -25, -25, -25, -25, -52, -52, -52, -52 };
#endif
#if defined(HAS_FASTCONVERTYUVTOARGBROW_NEON)
void FastConvertYUVToARGBRow_NEON(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width) {
#if defined(HAS_I420TOARGBROW_NEON)
void I420ToARGBRow_NEON(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width) {
asm volatile (
"vld1.u8 {d24}, [%5] \n"
"vld1.u8 {d25}, [%6] \n"
......@@ -94,12 +94,12 @@ YUVTORGB
}
#endif
#if defined(HAS_FASTCONVERTYUVTOBGRAROW_NEON)
void FastConvertYUVToBGRARow_NEON(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width) {
#if defined(HAS_I420TOBGRAROW_NEON)
void I420ToBGRARow_NEON(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width) {
asm volatile (
"vld1.u8 {d24}, [%5] \n"
"vld1.u8 {d25}, [%6] \n"
......@@ -127,12 +127,12 @@ YUVTORGB
}
#endif
#if defined(HAS_FASTCONVERTYUVTOABGRROW_NEON)
void FastConvertYUVToABGRRow_NEON(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width) {
#if defined(HAS_I420TOABGRROW_NEON)
void I420ToABGRRow_NEON(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width) {
asm volatile (
"vld1.u8 {d24}, [%5] \n"
"vld1.u8 {d25}, [%6] \n"
......
......@@ -71,22 +71,22 @@ CONST uvec8 kShuffleMaskBGRAToARGB = {
void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
asm volatile (
"pcmpeqb %%xmm5,%%xmm5 \n"
"pslld $0x18,%%xmm5 \n"
"1: \n"
"movq (%0),%%xmm0 \n"
"lea 0x8(%0),%0 \n"
"punpcklbw %%xmm0,%%xmm0 \n"
"movdqa %%xmm0,%%xmm1 \n"
"punpcklwd %%xmm0,%%xmm0 \n"
"punpckhwd %%xmm1,%%xmm1 \n"
"por %%xmm5,%%xmm0 \n"
"por %%xmm5,%%xmm1 \n"
"movdqa %%xmm0,(%1) \n"
"movdqa %%xmm1,0x10(%1) \n"
"lea 0x20(%1),%1 \n"
"sub $0x8,%2 \n"
"ja 1b \n"
"pcmpeqb %%xmm5,%%xmm5 \n"
"pslld $0x18,%%xmm5 \n"
"1: \n"
"movq (%0),%%xmm0 \n"
"lea 0x8(%0),%0 \n"
"punpcklbw %%xmm0,%%xmm0 \n"
"movdqa %%xmm0,%%xmm1 \n"
"punpcklwd %%xmm0,%%xmm0 \n"
"punpckhwd %%xmm1,%%xmm1 \n"
"por %%xmm5,%%xmm0 \n"
"por %%xmm5,%%xmm1 \n"
"movdqa %%xmm0,(%1) \n"
"movdqa %%xmm1,0x10(%1) \n"
"lea 0x20(%1),%1 \n"
"sub $0x8,%2 \n"
"ja 1b \n"
: "+r"(src_y), // %0
"+r"(dst_argb), // %1
"+r"(pix) // %2
......@@ -100,15 +100,15 @@ void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
void ABGRToARGBRow_SSSE3(const uint8* src_abgr, uint8* dst_argb, int pix) {
asm volatile (
"movdqa %3,%%xmm5 \n"
"1: \n"
"movdqa (%0),%%xmm0 \n"
"lea 0x10(%0),%0 \n"
"pshufb %%xmm5,%%xmm0 \n"
"movdqa %%xmm0,(%1) \n"
"lea 0x10(%1),%1 \n"
"sub $0x4,%2 \n"
"ja 1b \n"
"movdqa %3,%%xmm5 \n"
"1: \n"
"movdqa (%0),%%xmm0 \n"
"lea 0x10(%0),%0 \n"
"pshufb %%xmm5,%%xmm0 \n"
"movdqa %%xmm0,(%1) \n"
"lea 0x10(%1),%1 \n"
"sub $0x4,%2 \n"
"ja 1b \n"
: "+r"(src_abgr), // %0
"+r"(dst_argb), // %1
"+r"(pix) // %2
......@@ -123,15 +123,15 @@ void ABGRToARGBRow_SSSE3(const uint8* src_abgr, uint8* dst_argb, int pix) {
void BGRAToARGBRow_SSSE3(const uint8* src_bgra, uint8* dst_argb, int pix) {
asm volatile (
"movdqa %3,%%xmm5 \n"
"1: \n"
"movdqa (%0),%%xmm0 \n"
"lea 0x10(%0),%0 \n"
"pshufb %%xmm5,%%xmm0 \n"
"movdqa %%xmm0,(%1) \n"
"lea 0x10(%1),%1 \n"
"sub $0x4,%2 \n"
"ja 1b \n"
"movdqa %3,%%xmm5 \n"
"1: \n"
"movdqa (%0),%%xmm0 \n"
"lea 0x10(%0),%0 \n"
"pshufb %%xmm5,%%xmm0 \n"
"movdqa %%xmm0,(%1) \n"
"lea 0x10(%1),%1 \n"
"sub $0x4,%2 \n"
"ja 1b \n"
: "+r"(src_bgra), // %0
"+r"(dst_argb), // %1
"+r"(pix) // %2
......@@ -145,33 +145,33 @@ void BGRAToARGBRow_SSSE3(const uint8* src_bgra, uint8* dst_argb, int pix) {
void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix) {
asm volatile (
"pcmpeqb %%xmm5,%%xmm5 \n" // generate mask 0xff000000
"pslld $0x18,%%xmm5 \n"
"movdqa %3,%%xmm4 \n"
"1: \n"
"movdqu (%0),%%xmm0 \n"
"movdqu 0x10(%0),%%xmm1 \n"
"movdqu 0x20(%0),%%xmm3 \n"
"lea 0x30(%0),%0 \n"
"movdqa %%xmm3,%%xmm2 \n"
"palignr $0x8,%%xmm1,%%xmm2 \n"
"pshufb %%xmm4,%%xmm2 \n"
"por %%xmm5,%%xmm2 \n"
"palignr $0xc,%%xmm0,%%xmm1 \n"
"pshufb %%xmm4,%%xmm0 \n"
"movdqa %%xmm2,0x20(%1) \n"
"por %%xmm5,%%xmm0 \n"
"pshufb %%xmm4,%%xmm1 \n"
"movdqa %%xmm0,(%1) \n"
"por %%xmm5,%%xmm1 \n"
"palignr $0x4,%%xmm3,%%xmm3 \n"
"pshufb %%xmm4,%%xmm3 \n"
"movdqa %%xmm1,0x10(%1) \n"
"por %%xmm5,%%xmm3 \n"
"movdqa %%xmm3,0x30(%1) \n"
"lea 0x40(%1),%1 \n"
"sub $0x10,%2 \n"
"ja 1b \n"
"pcmpeqb %%xmm5,%%xmm5 \n" // generate mask 0xff000000
"pslld $0x18,%%xmm5 \n"
"movdqa %3,%%xmm4 \n"
"1: \n"
"movdqu (%0),%%xmm0 \n"
"movdqu 0x10(%0),%%xmm1 \n"
"movdqu 0x20(%0),%%xmm3 \n"
"lea 0x30(%0),%0 \n"
"movdqa %%xmm3,%%xmm2 \n"
"palignr $0x8,%%xmm1,%%xmm2 \n"
"pshufb %%xmm4,%%xmm2 \n"
"por %%xmm5,%%xmm2 \n"
"palignr $0xc,%%xmm0,%%xmm1 \n"
"pshufb %%xmm4,%%xmm0 \n"
"movdqa %%xmm2,0x20(%1) \n"
"por %%xmm5,%%xmm0 \n"
"pshufb %%xmm4,%%xmm1 \n"
"movdqa %%xmm0,(%1) \n"
"por %%xmm5,%%xmm1 \n"
"palignr $0x4,%%xmm3,%%xmm3 \n"
"pshufb %%xmm4,%%xmm3 \n"
"movdqa %%xmm1,0x10(%1) \n"
"por %%xmm5,%%xmm3 \n"
"movdqa %%xmm3,0x30(%1) \n"
"lea 0x40(%1),%1 \n"
"sub $0x10,%2 \n"
"ja 1b \n"
: "+r"(src_rgb24), // %0
"+r"(dst_argb), // %1
"+r"(pix) // %2
......@@ -185,33 +185,33 @@ void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix) {
void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, int pix) {
asm volatile (
"pcmpeqb %%xmm5,%%xmm5 \n" // generate mask 0xff000000
"pslld $0x18,%%xmm5 \n"
"movdqa %3,%%xmm4 \n"
"1: \n"
"movdqu (%0),%%xmm0 \n"
"movdqu 0x10(%0),%%xmm1 \n"
"movdqu 0x20(%0),%%xmm3 \n"
"lea 0x30(%0),%0 \n"
"movdqa %%xmm3,%%xmm2 \n"
"palignr $0x8,%%xmm1,%%xmm2 \n"
"pshufb %%xmm4,%%xmm2 \n"
"por %%xmm5,%%xmm2 \n"
"palignr $0xc,%%xmm0,%%xmm1 \n"
"pshufb %%xmm4,%%xmm0 \n"
"movdqa %%xmm2,0x20(%1) \n"
"por %%xmm5,%%xmm0 \n"
"pshufb %%xmm4,%%xmm1 \n"
"movdqa %%xmm0,(%1) \n"
"por %%xmm5,%%xmm1 \n"
"palignr $0x4,%%xmm3,%%xmm3 \n"
"pshufb %%xmm4,%%xmm3 \n"
"movdqa %%xmm1,0x10(%1) \n"
"por %%xmm5,%%xmm3 \n"
"movdqa %%xmm3,0x30(%1) \n"
"lea 0x40(%1),%1 \n"
"sub $0x10,%2 \n"
"ja 1b \n"
"pcmpeqb %%xmm5,%%xmm5 \n" // generate mask 0xff000000
"pslld $0x18,%%xmm5 \n"
"movdqa %3,%%xmm4 \n"
"1: \n"
"movdqu (%0),%%xmm0 \n"
"movdqu 0x10(%0),%%xmm1 \n"
"movdqu 0x20(%0),%%xmm3 \n"
"lea 0x30(%0),%0 \n"
"movdqa %%xmm3,%%xmm2 \n"
"palignr $0x8,%%xmm1,%%xmm2 \n"
"pshufb %%xmm4,%%xmm2 \n"
"por %%xmm5,%%xmm2 \n"
"palignr $0xc,%%xmm0,%%xmm1 \n"
"pshufb %%xmm4,%%xmm0 \n"
"movdqa %%xmm2,0x20(%1) \n"
"por %%xmm5,%%xmm0 \n"
"pshufb %%xmm4,%%xmm1 \n"
"movdqa %%xmm0,(%1) \n"
"por %%xmm5,%%xmm1 \n"
"palignr $0x4,%%xmm3,%%xmm3 \n"
"pshufb %%xmm4,%%xmm3 \n"
"movdqa %%xmm1,0x10(%1) \n"
"por %%xmm5,%%xmm3 \n"
"movdqa %%xmm3,0x30(%1) \n"
"lea 0x40(%1),%1 \n"
"sub $0x10,%2 \n"
"ja 1b \n"
: "+r"(src_raw), // %0
"+r"(dst_argb), // %1
"+r"(pix) // %2
......@@ -225,28 +225,28 @@ void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, int pix) {
void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
asm volatile (
"movdqa %4,%%xmm5 \n"
"movdqa %3,%%xmm4 \n"
"1: \n"
"movdqa (%0),%%xmm0 \n"
"movdqa 0x10(%0),%%xmm1 \n"
"movdqa 0x20(%0),%%xmm2 \n"
"movdqa 0x30(%0),%%xmm3 \n"
"pmaddubsw %%xmm4,%%xmm0 \n"
"pmaddubsw %%xmm4,%%xmm1 \n"
"pmaddubsw %%xmm4,%%xmm2 \n"
"pmaddubsw %%xmm4,%%xmm3 \n"
"lea 0x40(%0),%0 \n"
"phaddw %%xmm1,%%xmm0 \n"
"phaddw %%xmm3,%%xmm2 \n"
"psrlw $0x7,%%xmm0 \n"
"psrlw $0x7,%%xmm2 \n"
"packuswb %%xmm2,%%xmm0 \n"
"paddb %%xmm5,%%xmm0 \n"
"movdqa %%xmm0,(%1) \n"
"lea 0x10(%1),%1 \n"
"sub $0x10,%2 \n"
"ja 1b \n"
"movdqa %4,%%xmm5 \n"
"movdqa %3,%%xmm4 \n"
"1: \n"
"movdqa (%0),%%xmm0 \n"
"movdqa 0x10(%0),%%xmm1 \n"
"movdqa 0x20(%0),%%xmm2 \n"
"movdqa 0x30(%0),%%xmm3 \n"
"pmaddubsw %%xmm4,%%xmm0 \n"
"pmaddubsw %%xmm4,%%xmm1 \n"
"pmaddubsw %%xmm4,%%xmm2 \n"
"pmaddubsw %%xmm4,%%xmm3 \n"
"lea 0x40(%0),%0 \n"
"phaddw %%xmm1,%%xmm0 \n"
"phaddw %%xmm3,%%xmm2 \n"
"psrlw $0x7,%%xmm0 \n"
"psrlw $0x7,%%xmm2 \n"
"packuswb %%xmm2,%%xmm0 \n"
"paddb %%xmm5,%%xmm0 \n"
"movdqa %%xmm0,(%1) \n"
"lea 0x10(%1),%1 \n"
"sub $0x10,%2 \n"
"ja 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_y), // %1
"+r"(pix) // %2
......@@ -262,28 +262,28 @@ void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
void ARGBToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
asm volatile (
"movdqa %4,%%xmm5 \n"
"movdqa %3,%%xmm4 \n"
"1: \n"
"movdqu (%0),%%xmm0 \n"
"movdqu 0x10(%0),%%xmm1 \n"
"movdqu 0x20(%0),%%xmm2 \n"
"movdqu 0x30(%0),%%xmm3 \n"
"pmaddubsw %%xmm4,%%xmm0 \n"
"pmaddubsw %%xmm4,%%xmm1 \n"
"pmaddubsw %%xmm4,%%xmm2 \n"
"pmaddubsw %%xmm4,%%xmm3 \n"
"lea 0x40(%0),%0 \n"
"phaddw %%xmm1,%%xmm0 \n"
"phaddw %%xmm3,%%xmm2 \n"
"psrlw $0x7,%%xmm0 \n"
"psrlw $0x7,%%xmm2 \n"
"packuswb %%xmm2,%%xmm0 \n"
"paddb %%xmm5,%%xmm0 \n"
"movdqu %%xmm0,(%1) \n"
"lea 0x10(%1),%1 \n"
"sub $0x10,%2 \n"
"ja 1b \n"
"movdqa %4,%%xmm5 \n"
"movdqa %3,%%xmm4 \n"
"1: \n"
"movdqu (%0),%%xmm0 \n"
"movdqu 0x10(%0),%%xmm1 \n"
"movdqu 0x20(%0),%%xmm2 \n"
"movdqu 0x30(%0),%%xmm3 \n"
"pmaddubsw %%xmm4,%%xmm0 \n"
"pmaddubsw %%xmm4,%%xmm1 \n"
"pmaddubsw %%xmm4,%%xmm2 \n"
"pmaddubsw %%xmm4,%%xmm3 \n"
"lea 0x40(%0),%0 \n"
"phaddw %%xmm1,%%xmm0 \n"
"phaddw %%xmm3,%%xmm2 \n"
"psrlw $0x7,%%xmm0 \n"
"psrlw $0x7,%%xmm2 \n"
"packuswb %%xmm2,%%xmm0 \n"
"paddb %%xmm5,%%xmm0 \n"
"movdqu %%xmm0,(%1) \n"
"lea 0x10(%1),%1 \n"
"sub $0x10,%2 \n"
"ja 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_y), // %1
"+r"(pix) // %2
......@@ -302,9 +302,9 @@ void ARGBToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int width) {
asm volatile (
"movdqa %0,%%xmm4 \n"
"movdqa %1,%%xmm3 \n"
"movdqa %2,%%xmm5 \n"
"movdqa %0,%%xmm4 \n"
"movdqa %1,%%xmm3 \n"
"movdqa %2,%%xmm5 \n"
:
: "m"(kARGBToU), // %0
"m"(kARGBToV), // %1
......@@ -315,42 +315,42 @@ void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
#endif
);
asm volatile (
"sub %1,%2 \n"
"1: \n"
"movdqa (%0),%%xmm0 \n"
"movdqa 0x10(%0),%%xmm1 \n"
"movdqa 0x20(%0),%%xmm2 \n"
"movdqa 0x30(%0),%%xmm6 \n"
"pavgb (%0,%4,1),%%xmm0 \n"
"pavgb 0x10(%0,%4,1),%%xmm1 \n"
"pavgb 0x20(%0,%4,1),%%xmm2 \n"
"pavgb 0x30(%0,%4,1),%%xmm6 \n"
"lea 0x40(%0),%0 \n"
"movdqa %%xmm0,%%xmm7 \n"
"shufps $0x88,%%xmm1,%%xmm0 \n"
"shufps $0xdd,%%xmm1,%%xmm7 \n"
"pavgb %%xmm7,%%xmm0 \n"
"movdqa %%xmm2,%%xmm7 \n"
"shufps $0x88,%%xmm6,%%xmm2 \n"
"shufps $0xdd,%%xmm6,%%xmm7 \n"
"pavgb %%xmm7,%%xmm2 \n"
"movdqa %%xmm0,%%xmm1 \n"
"movdqa %%xmm2,%%xmm6 \n"
"pmaddubsw %%xmm4,%%xmm0 \n"
"pmaddubsw %%xmm4,%%xmm2 \n"
"pmaddubsw %%xmm3,%%xmm1 \n"
"pmaddubsw %%xmm3,%%xmm6 \n"
"phaddw %%xmm2,%%xmm0 \n"
"phaddw %%xmm6,%%xmm1 \n"
"psraw $0x8,%%xmm0 \n"
"psraw $0x8,%%xmm1 \n"
"packsswb %%xmm1,%%xmm0 \n"
"paddb %%xmm5,%%xmm0 \n"
"movlps %%xmm0,(%1) \n"
"movhps %%xmm0,(%1,%2,1) \n"
"lea 0x8(%1),%1 \n"
"sub $0x10,%3 \n"
"ja 1b \n"
"sub %1,%2 \n"
"1: \n"
"movdqa (%0),%%xmm0 \n"
"movdqa 0x10(%0),%%xmm1 \n"
"movdqa 0x20(%0),%%xmm2 \n"
"movdqa 0x30(%0),%%xmm6 \n"
"pavgb (%0,%4,1),%%xmm0 \n"
"pavgb 0x10(%0,%4,1),%%xmm1 \n"
"pavgb 0x20(%0,%4,1),%%xmm2 \n"
"pavgb 0x30(%0,%4,1),%%xmm6 \n"
"lea 0x40(%0),%0 \n"
"movdqa %%xmm0,%%xmm7 \n"
"shufps $0x88,%%xmm1,%%xmm0 \n"
"shufps $0xdd,%%xmm1,%%xmm7 \n"
"pavgb %%xmm7,%%xmm0 \n"
"movdqa %%xmm2,%%xmm7 \n"
"shufps $0x88,%%xmm6,%%xmm2 \n"
"shufps $0xdd,%%xmm6,%%xmm7 \n"
"pavgb %%xmm7,%%xmm2 \n"
"movdqa %%xmm0,%%xmm1 \n"
"movdqa %%xmm2,%%xmm6 \n"
"pmaddubsw %%xmm4,%%xmm0 \n"
"pmaddubsw %%xmm4,%%xmm2 \n"
"pmaddubsw %%xmm3,%%xmm1 \n"
"pmaddubsw %%xmm3,%%xmm6 \n"
"phaddw %%xmm2,%%xmm0 \n"
"phaddw %%xmm6,%%xmm1 \n"
"psraw $0x8,%%xmm0 \n"
"psraw $0x8,%%xmm1 \n"
"packsswb %%xmm1,%%xmm0 \n"
"paddb %%xmm5,%%xmm0 \n"
"movlps %%xmm0,(%1) \n"
"movhps %%xmm0,(%1,%2,1) \n"
"lea 0x8(%1),%1 \n"
"sub $0x10,%3 \n"
"ja 1b \n"
: "+r"(src_argb0), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
......@@ -366,9 +366,9 @@ void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
void ARGBToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int width) {
asm volatile (
"movdqa %0,%%xmm4 \n"
"movdqa %1,%%xmm3 \n"
"movdqa %2,%%xmm5 \n"
"movdqa %0,%%xmm4 \n"
"movdqa %1,%%xmm3 \n"
"movdqa %2,%%xmm5 \n"
:
: "m"(kARGBToU), // %0
"m"(kARGBToV), // %1
......@@ -379,46 +379,46 @@ void ARGBToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
#endif
);
asm volatile (
"sub %1,%2 \n"
"1: \n"
"movdqu (%0),%%xmm0 \n"
"movdqu 0x10(%0),%%xmm1 \n"
"movdqu 0x20(%0),%%xmm2 \n"
"movdqu 0x30(%0),%%xmm6 \n"
"movdqu (%0,%4,1),%%xmm7 \n"
"pavgb %%xmm7,%%xmm0 \n"
"movdqu 0x10(%0,%4,1),%%xmm7 \n"
"pavgb %%xmm7,%%xmm1 \n"
"movdqu 0x20(%0,%4,1),%%xmm7 \n"
"pavgb %%xmm7,%%xmm2 \n"
"movdqu 0x30(%0,%4,1),%%xmm7 \n"
"pavgb %%xmm7,%%xmm6 \n"
"lea 0x40(%0),%0 \n"
"movdqa %%xmm0,%%xmm7 \n"
"shufps $0x88,%%xmm1,%%xmm0 \n"
"shufps $0xdd,%%xmm1,%%xmm7 \n"
"pavgb %%xmm7,%%xmm0 \n"
"movdqa %%xmm2,%%xmm7 \n"
"shufps $0x88,%%xmm6,%%xmm2 \n"
"shufps $0xdd,%%xmm6,%%xmm7 \n"
"pavgb %%xmm7,%%xmm2 \n"
"movdqa %%xmm0,%%xmm1 \n"
"movdqa %%xmm2,%%xmm6 \n"
"pmaddubsw %%xmm4,%%xmm0 \n"
"pmaddubsw %%xmm4,%%xmm2 \n"
"pmaddubsw %%xmm3,%%xmm1 \n"
"pmaddubsw %%xmm3,%%xmm6 \n"
"phaddw %%xmm2,%%xmm0 \n"
"phaddw %%xmm6,%%xmm1 \n"
"psraw $0x8,%%xmm0 \n"
"psraw $0x8,%%xmm1 \n"
"packsswb %%xmm1,%%xmm0 \n"
"paddb %%xmm5,%%xmm0 \n"
"movlps %%xmm0,(%1) \n"
"movhps %%xmm0,(%1,%2,1) \n"
"lea 0x8(%1),%1 \n"
"sub $0x10,%3 \n"
"ja 1b \n"
"sub %1,%2 \n"
"1: \n"
"movdqu (%0),%%xmm0 \n"
"movdqu 0x10(%0),%%xmm1 \n"
"movdqu 0x20(%0),%%xmm2 \n"
"movdqu 0x30(%0),%%xmm6 \n"
"movdqu (%0,%4,1),%%xmm7 \n"
"pavgb %%xmm7,%%xmm0 \n"
"movdqu 0x10(%0,%4,1),%%xmm7 \n"
"pavgb %%xmm7,%%xmm1 \n"
"movdqu 0x20(%0,%4,1),%%xmm7 \n"
"pavgb %%xmm7,%%xmm2 \n"
"movdqu 0x30(%0,%4,1),%%xmm7 \n"
"pavgb %%xmm7,%%xmm6 \n"
"lea 0x40(%0),%0 \n"
"movdqa %%xmm0,%%xmm7 \n"
"shufps $0x88,%%xmm1,%%xmm0 \n"
"shufps $0xdd,%%xmm1,%%xmm7 \n"
"pavgb %%xmm7,%%xmm0 \n"
"movdqa %%xmm2,%%xmm7 \n"
"shufps $0x88,%%xmm6,%%xmm2 \n"
"shufps $0xdd,%%xmm6,%%xmm7 \n"
"pavgb %%xmm7,%%xmm2 \n"
"movdqa %%xmm0,%%xmm1 \n"
"movdqa %%xmm2,%%xmm6 \n"
"pmaddubsw %%xmm4,%%xmm0 \n"
"pmaddubsw %%xmm4,%%xmm2 \n"
"pmaddubsw %%xmm3,%%xmm1 \n"
"pmaddubsw %%xmm3,%%xmm6 \n"
"phaddw %%xmm2,%%xmm0 \n"
"phaddw %%xmm6,%%xmm1 \n"
"psraw $0x8,%%xmm0 \n"
"psraw $0x8,%%xmm1 \n"
"packsswb %%xmm1,%%xmm0 \n"
"paddb %%xmm5,%%xmm0 \n"
"movlps %%xmm0,(%1) \n"
"movhps %%xmm0,(%1,%2,1) \n"
"lea 0x8(%1),%1 \n"
"sub $0x10,%3 \n"
"ja 1b \n"
: "+r"(src_argb0), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
......@@ -432,7 +432,7 @@ void ARGBToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
}
#endif
#ifdef HAS_FASTCONVERTYUVTOARGBROW_SSSE3
#ifdef HAS_I420TOARGBROW_SSSE3
#define UB 127 /* 2.018 * 64 = 129, saturated to int8 max 127 */
#define UG -25 /* static_cast<int8>(-0.391 * 64 - 0.5) */
#define UR 0
......@@ -476,56 +476,56 @@ struct {
// Convert 8 pixels
#define YUVTORGB \
"movd (%1),%%xmm0 \n" \
"movd (%1,%2,1),%%xmm1 \n" \
"lea 0x4(%1),%1 \n" \
"punpcklbw %%xmm1,%%xmm0 \n" \
"punpcklwd %%xmm0,%%xmm0 \n" \
"movdqa %%xmm0,%%xmm1 \n" \
"movdqa %%xmm0,%%xmm2 \n" \
"pmaddubsw (%5),%%xmm0 \n" \
"pmaddubsw 16(%5),%%xmm1 \n" \
"pmaddubsw 32(%5),%%xmm2 \n" \
"psubw 48(%5),%%xmm0 \n" \
"psubw 64(%5),%%xmm1 \n" \
"psubw 80(%5),%%xmm2 \n" \
"movq (%0),%%xmm3 \n" \
"lea 0x8(%0),%0 \n" \
"punpcklbw %%xmm4,%%xmm3 \n" \
"psubsw 96(%5),%%xmm3 \n" \
"pmullw 112(%5),%%xmm3 \n" \
"paddsw %%xmm3,%%xmm0 \n" \
"paddsw %%xmm3,%%xmm1 \n" \
"paddsw %%xmm3,%%xmm2 \n" \
"psraw $0x6,%%xmm0 \n" \
"psraw $0x6,%%xmm1 \n" \
"psraw $0x6,%%xmm2 \n" \
"packuswb %%xmm0,%%xmm0 \n" \
"packuswb %%xmm1,%%xmm1 \n" \
"packuswb %%xmm2,%%xmm2 \n"
void OMITFP FastConvertYUVToARGBRow_SSSE3(const uint8* y_buf, // rdi
const uint8* u_buf, // rsi
const uint8* v_buf, // rdx
uint8* rgb_buf, // rcx
int width) { // r8
"movd (%1),%%xmm0 \n" \
"movd (%1,%2,1),%%xmm1 \n" \
"lea 0x4(%1),%1 \n" \
"punpcklbw %%xmm1,%%xmm0 \n" \
"punpcklwd %%xmm0,%%xmm0 \n" \
"movdqa %%xmm0,%%xmm1 \n" \
"movdqa %%xmm0,%%xmm2 \n" \
"pmaddubsw (%5),%%xmm0 \n" \
"pmaddubsw 16(%5),%%xmm1 \n" \
"pmaddubsw 32(%5),%%xmm2 \n" \
"psubw 48(%5),%%xmm0 \n" \
"psubw 64(%5),%%xmm1 \n" \
"psubw 80(%5),%%xmm2 \n" \
"movq (%0),%%xmm3 \n" \
"lea 0x8(%0),%0 \n" \
"punpcklbw %%xmm4,%%xmm3 \n" \
"psubsw 96(%5),%%xmm3 \n" \
"pmullw 112(%5),%%xmm3 \n" \
"paddsw %%xmm3,%%xmm0 \n" \
"paddsw %%xmm3,%%xmm1 \n" \
"paddsw %%xmm3,%%xmm2 \n" \
"psraw $0x6,%%xmm0 \n" \
"psraw $0x6,%%xmm1 \n" \
"psraw $0x6,%%xmm2 \n" \
"packuswb %%xmm0,%%xmm0 \n" \
"packuswb %%xmm1,%%xmm1 \n" \
"packuswb %%xmm2,%%xmm2 \n"
void OMITFP I420ToARGBRow_SSSE3(const uint8* y_buf, // rdi
const uint8* u_buf, // rsi
const uint8* v_buf, // rdx
uint8* rgb_buf, // rcx
int width) { // r8
asm volatile (
"sub %1,%2 \n"
"pcmpeqb %%xmm5,%%xmm5 \n"
"pxor %%xmm4,%%xmm4 \n"
"sub %1,%2 \n"
"pcmpeqb %%xmm5,%%xmm5 \n"
"pxor %%xmm4,%%xmm4 \n"
"1: \n"
YUVTORGB
"punpcklbw %%xmm1,%%xmm0 \n"
"punpcklbw %%xmm5,%%xmm2 \n"
"movdqa %%xmm0,%%xmm1 \n"
"punpcklwd %%xmm2,%%xmm0 \n"
"punpckhwd %%xmm2,%%xmm1 \n"
"movdqa %%xmm0,(%3) \n"
"movdqa %%xmm1,0x10(%3) \n"
"lea 0x20(%3),%3 \n"
"sub $0x8,%4 \n"
"ja 1b \n"
"punpcklbw %%xmm1,%%xmm0 \n"
"punpcklbw %%xmm5,%%xmm2 \n"
"movdqa %%xmm0,%%xmm1 \n"
"punpcklwd %%xmm2,%%xmm0 \n"
"punpckhwd %%xmm2,%%xmm1 \n"
"movdqa %%xmm0,(%3) \n"
"movdqa %%xmm1,0x10(%3) \n"
"lea 0x20(%3),%3 \n"
"sub $0x8,%4 \n"
"ja 1b \n"
: "+r"(y_buf), // %0
"+r"(u_buf), // %1
"+r"(v_buf), // %2
......@@ -539,29 +539,29 @@ void OMITFP FastConvertYUVToARGBRow_SSSE3(const uint8* y_buf, // rdi
);
}
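// An illustrative scalar sketch of the fixed-point math in YUVTORGB (not part
// of this change; the helper names below are made up). The coefficients are
// the 6-bit constants visible in this file and in the NEON tables (YG = 74,
// UB = 127, UG = -25, VG = -52, VR = 102); the arithmetic shift by 6 and the
// unsigned saturation mirror psraw/packuswb.
static inline uint8 Clamp255(int v) {
  return static_cast<uint8>(v < 0 ? 0 : (v > 255 ? 255 : v));
}

static inline void YuvPixelReference(uint8 y, uint8 u, uint8 v,
                                     uint8* b, uint8* g, uint8* r) {
  const int y1 = (static_cast<int>(y) - 16) * 74;                // YG
  *b = Clamp255((y1 + (u - 128) * 127) >> 6);                    // UB
  *g = Clamp255((y1 + (u - 128) * -25 + (v - 128) * -52) >> 6);  // UG, VG
  *r = Clamp255((y1 + (v - 128) * 102) >> 6);                    // VR
}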
void OMITFP FastConvertYUVToBGRARow_SSSE3(const uint8* y_buf, // rdi
const uint8* u_buf, // rsi
const uint8* v_buf, // rdx
uint8* rgb_buf, // rcx
int width) { // r8
void OMITFP I420ToBGRARow_SSSE3(const uint8* y_buf, // rdi
const uint8* u_buf, // rsi
const uint8* v_buf, // rdx
uint8* rgb_buf, // rcx
int width) { // r8
asm volatile (
"sub %1,%2 \n"
"pcmpeqb %%xmm5,%%xmm5 \n"
"pxor %%xmm4,%%xmm4 \n"
"sub %1,%2 \n"
"pcmpeqb %%xmm5,%%xmm5 \n"
"pxor %%xmm4,%%xmm4 \n"
"1: \n"
YUVTORGB
"pcmpeqb %%xmm5,%%xmm5 \n"
"punpcklbw %%xmm0,%%xmm1 \n"
"punpcklbw %%xmm2,%%xmm5 \n"
"movdqa %%xmm5,%%xmm0 \n"
"punpcklwd %%xmm1,%%xmm5 \n"
"punpckhwd %%xmm1,%%xmm0 \n"
"movdqa %%xmm5,(%3) \n"
"movdqa %%xmm0,0x10(%3) \n"
"lea 0x20(%3),%3 \n"
"sub $0x8,%4 \n"
"ja 1b \n"
"pcmpeqb %%xmm5,%%xmm5 \n"
"punpcklbw %%xmm0,%%xmm1 \n"
"punpcklbw %%xmm2,%%xmm5 \n"
"movdqa %%xmm5,%%xmm0 \n"
"punpcklwd %%xmm1,%%xmm5 \n"
"punpckhwd %%xmm1,%%xmm0 \n"
"movdqa %%xmm5,(%3) \n"
"movdqa %%xmm0,0x10(%3) \n"
"lea 0x20(%3),%3 \n"
"sub $0x8,%4 \n"
"ja 1b \n"
: "+r"(y_buf), // %0
"+r"(u_buf), // %1
"+r"(v_buf), // %2
......@@ -575,28 +575,28 @@ void OMITFP FastConvertYUVToBGRARow_SSSE3(const uint8* y_buf, // rdi
);
}
void OMITFP FastConvertYUVToABGRRow_SSSE3(const uint8* y_buf, // rdi
const uint8* u_buf, // rsi
const uint8* v_buf, // rdx
uint8* rgb_buf, // rcx
int width) { // r8
void OMITFP I420ToABGRRow_SSSE3(const uint8* y_buf, // rdi
const uint8* u_buf, // rsi
const uint8* v_buf, // rdx
uint8* rgb_buf, // rcx
int width) { // r8
asm volatile (
"sub %1,%2 \n"
"pcmpeqb %%xmm5,%%xmm5 \n"
"pxor %%xmm4,%%xmm4 \n"
"sub %1,%2 \n"
"pcmpeqb %%xmm5,%%xmm5 \n"
"pxor %%xmm4,%%xmm4 \n"
"1: \n"
YUVTORGB
"punpcklbw %%xmm1,%%xmm2 \n"
"punpcklbw %%xmm5,%%xmm0 \n"
"movdqa %%xmm2,%%xmm1 \n"
"punpcklwd %%xmm0,%%xmm2 \n"
"punpckhwd %%xmm0,%%xmm1 \n"
"movdqa %%xmm2,(%3) \n"
"movdqa %%xmm1,0x10(%3) \n"
"lea 0x20(%3),%3 \n"
"sub $0x8,%4 \n"
"ja 1b \n"
"punpcklbw %%xmm1,%%xmm2 \n"
"punpcklbw %%xmm5,%%xmm0 \n"
"movdqa %%xmm2,%%xmm1 \n"
"punpcklwd %%xmm0,%%xmm2 \n"
"punpckhwd %%xmm0,%%xmm1 \n"
"movdqa %%xmm2,(%3) \n"
"movdqa %%xmm1,0x10(%3) \n"
"lea 0x20(%3),%3 \n"
"sub $0x8,%4 \n"
"ja 1b \n"
: "+r"(y_buf), // %0
"+r"(u_buf), // %1
"+r"(v_buf), // %2
......@@ -610,50 +610,50 @@ void OMITFP FastConvertYUVToABGRRow_SSSE3(const uint8* y_buf, // rdi
);
}
void OMITFP FastConvertYUV444ToARGBRow_SSSE3(const uint8* y_buf, // rdi
const uint8* u_buf, // rsi
const uint8* v_buf, // rdx
uint8* rgb_buf, // rcx
int width) { // r8
void OMITFP I444ToARGBRow_SSSE3(const uint8* y_buf, // rdi
const uint8* u_buf, // rsi
const uint8* v_buf, // rdx
uint8* rgb_buf, // rcx
int width) { // r8
asm volatile (
"sub %1,%2 \n"
"pcmpeqb %%xmm5,%%xmm5 \n"
"pxor %%xmm4,%%xmm4 \n"
"sub %1,%2 \n"
"pcmpeqb %%xmm5,%%xmm5 \n"
"pxor %%xmm4,%%xmm4 \n"
"1: \n"
"movd (%1),%%xmm0 \n"
"movd (%1,%2,1),%%xmm1 \n"
"lea 0x4(%1),%1 \n"
"punpcklbw %%xmm1,%%xmm0 \n"
"movdqa %%xmm0,%%xmm1 \n"
"movdqa %%xmm0,%%xmm2 \n"
"pmaddubsw (%5),%%xmm0 \n"
"pmaddubsw 16(%5),%%xmm1 \n"
"pmaddubsw 32(%5),%%xmm2 \n"
"psubw 48(%5),%%xmm0 \n"
"psubw 64(%5),%%xmm1 \n"
"psubw 80(%5),%%xmm2 \n"
"movd (%0),%%xmm3 \n"
"lea 0x4(%0),%0 \n"
"punpcklbw %%xmm4,%%xmm3 \n"
"psubsw 96(%5),%%xmm3 \n"
"pmullw 112(%5),%%xmm3 \n"
"paddsw %%xmm3,%%xmm0 \n"
"paddsw %%xmm3,%%xmm1 \n"
"paddsw %%xmm3,%%xmm2 \n"
"psraw $0x6,%%xmm0 \n"
"psraw $0x6,%%xmm1 \n"
"psraw $0x6,%%xmm2 \n"
"packuswb %%xmm0,%%xmm0 \n"
"packuswb %%xmm1,%%xmm1 \n"
"packuswb %%xmm2,%%xmm2 \n"
"punpcklbw %%xmm1,%%xmm0 \n"
"punpcklbw %%xmm5,%%xmm2 \n"
"punpcklwd %%xmm2,%%xmm0 \n"
"movdqa %%xmm0,(%3) \n"
"lea 0x10(%3),%3 \n"
"sub $0x4,%4 \n"
"ja 1b \n"
"movd (%1),%%xmm0 \n"
"movd (%1,%2,1),%%xmm1 \n"
"lea 0x4(%1),%1 \n"
"punpcklbw %%xmm1,%%xmm0 \n"
"movdqa %%xmm0,%%xmm1 \n"
"movdqa %%xmm0,%%xmm2 \n"
"pmaddubsw (%5),%%xmm0 \n"
"pmaddubsw 16(%5),%%xmm1 \n"
"pmaddubsw 32(%5),%%xmm2 \n"
"psubw 48(%5),%%xmm0 \n"
"psubw 64(%5),%%xmm1 \n"
"psubw 80(%5),%%xmm2 \n"
"movd (%0),%%xmm3 \n"
"lea 0x4(%0),%0 \n"
"punpcklbw %%xmm4,%%xmm3 \n"
"psubsw 96(%5),%%xmm3 \n"
"pmullw 112(%5),%%xmm3 \n"
"paddsw %%xmm3,%%xmm0 \n"
"paddsw %%xmm3,%%xmm1 \n"
"paddsw %%xmm3,%%xmm2 \n"
"psraw $0x6,%%xmm0 \n"
"psraw $0x6,%%xmm1 \n"
"psraw $0x6,%%xmm2 \n"
"packuswb %%xmm0,%%xmm0 \n"
"packuswb %%xmm1,%%xmm1 \n"
"packuswb %%xmm2,%%xmm2 \n"
"punpcklbw %%xmm1,%%xmm0 \n"
"punpcklbw %%xmm5,%%xmm2 \n"
"punpcklwd %%xmm2,%%xmm0 \n"
"movdqa %%xmm0,(%3) \n"
"lea 0x10(%3),%3 \n"
"sub $0x4,%4 \n"
"ja 1b \n"
: "+r"(y_buf), // %0
"+r"(u_buf), // %1
"+r"(v_buf), // %2
......@@ -668,43 +668,43 @@ void OMITFP FastConvertYUV444ToARGBRow_SSSE3(const uint8* y_buf, // rdi
}
#endif
#ifdef HAS_FASTCONVERTYTOARGBROW_SSE2
#ifdef HAS_YTOARGBROW_SSE2
void FastConvertYToARGBRow_SSE2(const uint8* y_buf, // rdi
uint8* rgb_buf, // rcx
int width) { // r8
void YToARGBRow_SSE2(const uint8* y_buf, // rdi
uint8* rgb_buf, // rcx
int width) { // r8
asm volatile (
"pcmpeqb %%xmm4,%%xmm4 \n"
"pslld $0x18,%%xmm4 \n"
"mov $0x10001000,%%eax \n"
"movd %%eax,%%xmm3 \n"
"pshufd $0x0,%%xmm3,%%xmm3 \n"
"mov $0x012a012a,%%eax \n"
"movd %%eax,%%xmm2 \n"
"pshufd $0x0,%%xmm2,%%xmm2 \n"
"pcmpeqb %%xmm4,%%xmm4 \n"
"pslld $0x18,%%xmm4 \n"
"mov $0x10001000,%%eax \n"
"movd %%eax,%%xmm3 \n"
"pshufd $0x0,%%xmm3,%%xmm3 \n"
"mov $0x012a012a,%%eax \n"
"movd %%eax,%%xmm2 \n"
"pshufd $0x0,%%xmm2,%%xmm2 \n"
"1: \n"
// Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164
"movq (%0),%%xmm0 \n"
"lea 0x8(%0),%0 \n"
"punpcklbw %%xmm0,%%xmm0 \n"
"psubusw %%xmm3,%%xmm0 \n"
"pmulhuw %%xmm2,%%xmm0 \n"
"packuswb %%xmm0,%%xmm0 \n"
"movq (%0),%%xmm0 \n"
"lea 0x8(%0),%0 \n"
"punpcklbw %%xmm0,%%xmm0 \n"
"psubusw %%xmm3,%%xmm0 \n"
"pmulhuw %%xmm2,%%xmm0 \n"
"packuswb %%xmm0,%%xmm0 \n"
// Step 2: Weave into ARGB
"punpcklbw %%xmm0,%%xmm0 \n"
"movdqa %%xmm0,%%xmm1 \n"
"punpcklwd %%xmm0,%%xmm0 \n"
"punpckhwd %%xmm1,%%xmm1 \n"
"por %%xmm4,%%xmm0 \n"
"por %%xmm4,%%xmm1 \n"
"movdqa %%xmm0,(%1) \n"
"movdqa %%xmm1,16(%1) \n"
"lea 32(%1),%1 \n"
"sub $0x8,%2 \n"
"ja 1b \n"
"punpcklbw %%xmm0,%%xmm0 \n"
"movdqa %%xmm0,%%xmm1 \n"
"punpcklwd %%xmm0,%%xmm0 \n"
"punpckhwd %%xmm1,%%xmm1 \n"
"por %%xmm4,%%xmm0 \n"
"por %%xmm4,%%xmm1 \n"
"movdqa %%xmm0,(%1) \n"
"movdqa %%xmm1,16(%1) \n"
"lea 32(%1),%1 \n"
"sub $0x8,%2 \n"
"ja 1b \n"
: "+r"(y_buf), // %0
"+r"(rgb_buf), // %1
"+rm"(width) // %2
......@@ -787,15 +787,15 @@ CONST uvec8 kShuffleMirror = {
void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
intptr_t temp_width = static_cast<intptr_t>(width);
asm volatile (
"movdqa %3,%%xmm5 \n"
"lea -0x10(%0),%0 \n"
"movdqa %3,%%xmm5 \n"
"lea -0x10(%0),%0 \n"
"1: \n"
"movdqa (%0,%2),%%xmm0 \n"
"pshufb %%xmm5,%%xmm0 \n"
"sub $0x10,%2 \n"
"movdqa %%xmm0,(%1) \n"
"lea 0x10(%1),%1 \n"
"ja 1b \n"
"movdqa (%0,%2),%%xmm0 \n"
"pshufb %%xmm5,%%xmm0 \n"
"sub $0x10,%2 \n"
"movdqa %%xmm0,(%1) \n"
"lea 0x10(%1),%1 \n"
"ja 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(temp_width) // %2
......@@ -813,20 +813,20 @@ void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
void MirrorRow_SSE2(const uint8* src, uint8* dst, int width) {
intptr_t temp_width = static_cast<intptr_t>(width);
asm volatile (
"lea -0x10(%0),%0 \n"
"lea -0x10(%0),%0 \n"
"1: \n"
"movdqu (%0,%2),%%xmm0 \n"
"movdqu %%xmm0,%%xmm1 \n"
"psllw $0x8,%%xmm0 \n"
"psrlw $0x8,%%xmm1 \n"
"por %%xmm1,%%xmm0 \n"
"pshuflw $0x1b,%%xmm0,%%xmm0 \n"
"pshufhw $0x1b,%%xmm0,%%xmm0 \n"
"pshufd $0x4e,%%xmm0,%%xmm0 \n"
"sub $0x10,%2 \n"
"movdqu %%xmm0,(%1) \n"
"lea 0x10(%1),%1 \n"
"ja 1b \n"
"movdqu (%0,%2),%%xmm0 \n"
"movdqu %%xmm0,%%xmm1 \n"
"psllw $0x8,%%xmm0 \n"
"psrlw $0x8,%%xmm1 \n"
"por %%xmm1,%%xmm0 \n"
"pshuflw $0x1b,%%xmm0,%%xmm0 \n"
"pshufhw $0x1b,%%xmm0,%%xmm0 \n"
"pshufd $0x4e,%%xmm0,%%xmm0 \n"
"sub $0x10,%2 \n"
"movdqu %%xmm0,(%1) \n"
"lea 0x10(%1),%1 \n"
"ja 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(temp_width) // %2
......@@ -839,6 +839,269 @@ void MirrorRow_SSE2(const uint8* src, uint8* dst, int width) {
}
#endif
#ifdef HAS_YUY2TOI420ROW_SSE2
void YUY2ToYRow_SSE2(const uint8* src_yuy2, uint8* dst_y, int pix) {
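// Keep the even bytes (Y) of 32 YUY2 bytes with a 0x00ff word mask, then
// pack the two 8-word halves down to 16 Y bytes per iteration.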
asm volatile (
"pcmpeqb %%xmm5,%%xmm5 \n"
"psrlw $0x8,%%xmm5 \n"
"1: \n"
"movdqa (%0),%%xmm0 \n"
"movdqa 0x10(%0),%%xmm1 \n"
"lea 0x20(%0),%0 \n"
"pand %%xmm5,%%xmm0 \n"
"pand %%xmm5,%%xmm1 \n"
"packuswb %%xmm1,%%xmm0 \n"
"movdqa %%xmm0,(%1) \n"
"lea 0x10(%1),%1 \n"
"sub $0x10,%2 \n"
"ja 1b \n"
: "+r"(src_yuy2), // %0
"+r"(dst_y), // %1
"+r"(pix) // %2
:
: "memory", "cc"
#if defined(__SSE2__)
, "xmm0", "xmm1", "xmm5"
#endif
);
}
void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2,
uint8* dst_u, uint8* dst_v, int pix) {
asm volatile (
"pcmpeqb %%xmm5,%%xmm5 \n"
"psrlw $0x8,%%xmm5 \n"
"sub %1,%2 \n"
"1: \n"
"movdqa (%0),%%xmm0 \n"
"movdqa 0x10(%0),%%xmm1 \n"
"movdqa (%0,%4,1),%%xmm2 \n"
"movdqa 0x10(%0,%4,1),%%xmm3 \n"
"lea 0x20(%0),%0 \n"
"pavgb %%xmm2,%%xmm0 \n"
"pavgb %%xmm3,%%xmm1 \n"
"psrlw $0x8,%%xmm0 \n"
"psrlw $0x8,%%xmm1 \n"
"packuswb %%xmm1,%%xmm0 \n"
"movdqa %%xmm0,%%xmm1 \n"
"pand %%xmm5,%%xmm0 \n"
"packuswb %%xmm0,%%xmm0 \n"
"psrlw $0x8,%%xmm1 \n"
"packuswb %%xmm1,%%xmm1 \n"
"movq %%xmm0,(%1) \n"
"movq %%xmm1,(%1,%2) \n"
"lea 0x8(%1),%1 \n"
"sub $0x10,%3 \n"
"ja 1b \n"
: "+r"(src_yuy2), // %0
"+r"(dst_u), // %1
"+r"(dst_y), // %2
"+r"(pix) // %3
: "r"(static_cast<intptr_t>(stride_yuy2)) // %4
: "memory", "cc"
#if defined(__SSE2__)
, "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
#endif
);
}
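// For comparison, a scalar sketch of the UV extraction above (hypothetical
// helper; it mirrors UYVYToUVRow_C but with YUY2's odd-byte U/V offsets).
// Averaging the row at src_yuy2 with the row one stride below matches the
// pavgb against (%0,%4,1) in the SSE2 code.
static void YUY2ToUVRowReference_C(const uint8* src_yuy2, int stride_yuy2,
                                   uint8* dst_u, uint8* dst_v, int pix) {
  for (int x = 0; x < pix; x += 2) {
    dst_u[0] = (src_yuy2[1] + src_yuy2[stride_yuy2 + 1] + 1) >> 1;
    dst_v[0] = (src_yuy2[3] + src_yuy2[stride_yuy2 + 3] + 1) >> 1;
    src_yuy2 += 4;
    dst_u += 1;
    dst_v += 1;
  }
}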
void YUY2ToYRow_Unaligned_SSE2(const uint8* src_yuy2,
uint8* dst_y, int pix) {
asm volatile (
"pcmpeqb %%xmm5,%%xmm5 \n"
"psrlw $0x8,%%xmm5 \n"
"1: \n"
"movdqu (%0),%%xmm0 \n"
"movdqu 0x10(%0),%%xmm1 \n"
"lea 0x20(%0),%0 \n"
"pand %%xmm5,%%xmm0 \n"
"pand %%xmm5,%%xmm1 \n"
"packuswb %%xmm1,%%xmm0 \n"
"movdqu %%xmm0,(%1) \n"
"lea 0x10(%1),%1 \n"
"sub $0x10,%2 \n"
"ja 1b \n"
: "+r"(src_yuy2), // %0
"+r"(dst_y), // %1
"+r"(pix) // %2
:
: "memory", "cc"
#if defined(__SSE2__)
, "xmm0", "xmm1", "xmm5"
#endif
);
}
void YUY2ToUVRow_Unaligned_SSE2(const uint8* src_yuy2,
int stride_yuy2,
uint8* dst_u, uint8* dst_v,
int pix) {
asm volatile (
"pcmpeqb %%xmm5,%%xmm5 \n"
"psrlw $0x8,%%xmm5 \n"
"sub %1,%2 \n"
"1: \n"
"movdqu (%0),%%xmm0 \n"
"movdqu 0x10(%0),%%xmm1 \n"
"movdqu (%0,%4,1),%%xmm2 \n"
"movdqu 0x10(%0,%4,1),%%xmm3 \n"
"lea 0x20(%0),%0 \n"
"pavgb %%xmm2,%%xmm0 \n"
"pavgb %%xmm3,%%xmm1 \n"
"psrlw $0x8,%%xmm0 \n"
"psrlw $0x8,%%xmm1 \n"
"packuswb %%xmm1,%%xmm0 \n"
"movdqa %%xmm0,%%xmm1 \n"
"pand %%xmm5,%%xmm0 \n"
"packuswb %%xmm0,%%xmm0 \n"
"psrlw $0x8,%%xmm1 \n"
"packuswb %%xmm1,%%xmm1 \n"
"movq %%xmm0,(%1) \n"
"movq %%xmm1,(%1,%2) \n"
"lea 0x8(%1),%1 \n"
"sub $0x10,%3 \n"
"ja 1b \n"
: "+r"(src_yuy2), // %0
"+r"(dst_u), // %1
"+r"(dst_y), // %2
"+r"(pix) // %3
: "r"(static_cast<intptr_t>(stride_yuy2)) // %4
: "memory", "cc"
#if defined(__SSE2__)
, "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
#endif
);
}
void UYVYToYRow_SSE2(const uint8* src_uyvy, uint8* dst_y, int pix) {
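// Odd bytes are Y in UYVY, so shifting each 16-bit lane right by 8 and
// packing yields 16 Y bytes per 32 input bytes.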
asm volatile (
"1: \n"
"movdqa (%0),%%xmm0 \n"
"movdqa 0x10(%0),%%xmm1 \n"
"lea 0x20(%0),%0 \n"
"psrlw $0x8,%%xmm0 \n"
"psrlw $0x8,%%xmm1 \n"
"packuswb %%xmm1,%%xmm0 \n"
"movdqa %%xmm0,(%1) \n"
"lea 0x10(%1),%1 \n"
"sub $0x10,%2 \n"
"ja 1b \n"
: "+r"(src_uyvy), // %0
"+r"(dst_y), // %1
"+r"(pix) // %2
:
: "memory", "cc"
#if defined(__SSE2__)
, "xmm0", "xmm1"
#endif
);
}
void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy,
uint8* dst_u, uint8* dst_v, int pix) {
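// UYVY carries U/V in the even bytes, so after averaging two rows the UV
// pairs are isolated with the 0x00ff mask (pand) rather than the psrlw
// used for YUY2, then split into separate U and V planes.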
asm volatile (
"pcmpeqb %%xmm5,%%xmm5 \n"
"psrlw $0x8,%%xmm5 \n"
"sub %1,%2 \n"
"1: \n"
"movdqa (%0),%%xmm0 \n"
"movdqa 0x10(%0),%%xmm1 \n"
"movdqa (%0,%4,1),%%xmm2 \n"
"movdqa 0x10(%0,%4,1),%%xmm3 \n"
"lea 0x20(%0),%0 \n"
"pavgb %%xmm2,%%xmm0 \n"
"pavgb %%xmm3,%%xmm1 \n"
"pand %%xmm5,%%xmm0 \n"
"pand %%xmm5,%%xmm1 \n"
"packuswb %%xmm1,%%xmm0 \n"
"movdqa %%xmm0,%%xmm1 \n"
"pand %%xmm5,%%xmm0 \n"
"packuswb %%xmm0,%%xmm0 \n"
"psrlw $0x8,%%xmm1 \n"
"packuswb %%xmm1,%%xmm1 \n"
"movq %%xmm0,(%1) \n"
"movq %%xmm1,(%1,%2) \n"
"lea 0x8(%1),%1 \n"
"sub $0x10,%3 \n"
"ja 1b \n"
: "+r"(src_uyvy), // %0
"+r"(dst_u), // %1
"+r"(dst_y), // %2
"+r"(pix) // %3
: "r"(static_cast<intptr_t>(stride_uyvy)) // %4
: "memory", "cc"
#if defined(__SSE2__)
, "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
#endif
);
}
void UYVYToYRow_Unaligned_SSE2(const uint8* src_uyvy,
uint8* dst_y, int pix) {
asm volatile (
"1: \n"
"movdqu (%0),%%xmm0 \n"
"movdqu 0x10(%0),%%xmm1 \n"
"lea 0x20(%0),%0 \n"
"psrlw $0x8,%%xmm0 \n"
"psrlw $0x8,%%xmm1 \n"
"packuswb %%xmm1,%%xmm0 \n"
"movdqu %%xmm0,(%1) \n"
"lea 0x10(%1),%1 \n"
"sub $0x10,%2 \n"
"ja 1b \n"
: "+r"(src_uyvy), // %0
"+r"(dst_y), // %1
"+r"(pix) // %2
:
: "memory", "cc"
#if defined(__SSE2__)
, "xmm0", "xmm1"
#endif
);
}
void UYVYToUVRow_Unaligned_SSE2(const uint8* src_uyvy, int stride_uyvy,
uint8* dst_u, uint8* dst_v, int pix) {
asm volatile (
"pcmpeqb %%xmm5,%%xmm5 \n"
"psrlw $0x8,%%xmm5 \n"
"sub %1,%2 \n"
"1: \n"
"movdqu (%0),%%xmm0 \n"
"movdqu 0x10(%0),%%xmm1 \n"
"movdqu (%0,%4,1),%%xmm2 \n"
"movdqu 0x10(%0,%4,1),%%xmm3 \n"
"lea 0x20(%0),%0 \n"
"pavgb %%xmm2,%%xmm0 \n"
"pavgb %%xmm3,%%xmm1 \n"
"pand %%xmm5,%%xmm0 \n"
"pand %%xmm5,%%xmm1 \n"
"packuswb %%xmm1,%%xmm0 \n"
"movdqa %%xmm0,%%xmm1 \n"
"pand %%xmm5,%%xmm0 \n"
"packuswb %%xmm0,%%xmm0 \n"
"psrlw $0x8,%%xmm1 \n"
"packuswb %%xmm1,%%xmm1 \n"
"movq %%xmm0,(%1) \n"
"movq %%xmm1,(%1,%2) \n"
"lea 0x8(%1),%1 \n"
"sub $0x10,%3 \n"
"ja 1b \n"
: "+r"(src_uyvy), // %0
"+r"(dst_u), // %1
"+r"(dst_y), // %2
"+r"(pix) // %3
: "r"(static_cast<intptr_t>(stride_uyvy)) // %4
: "memory", "cc"
#if defined(__SSE2__)
, "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
#endif
);
}
#endif // HAS_YUY2TOI420ROW_SSE2
#ifdef __cplusplus
} // extern "C"
} // namespace libyuv
......
......@@ -1174,7 +1174,7 @@ __asm {
}
}
#ifdef HAS_FASTCONVERTYUVTOARGBROW_SSSE3
#ifdef HAS_I420TOARGBROW_SSSE3
#define YG 74 /* static_cast<int8>(1.164 * 64 + 0.5) */
......@@ -1242,11 +1242,11 @@ static const vec16 kUVBiasR = { BR, BR, BR, BR, BR, BR, BR, BR };
}
__declspec(naked)
void FastConvertYUVToARGBRow_SSSE3(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width) {
void I420ToARGBRow_SSSE3(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width) {
__asm {
push esi
push edi
......@@ -1282,11 +1282,11 @@ void FastConvertYUVToARGBRow_SSSE3(const uint8* y_buf,
}
__declspec(naked)
void FastConvertYUVToBGRARow_SSSE3(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width) {
void I420ToBGRARow_SSSE3(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width) {
__asm {
push esi
push edi
......@@ -1322,11 +1322,11 @@ void FastConvertYUVToBGRARow_SSSE3(const uint8* y_buf,
}
__declspec(naked)
void FastConvertYUVToABGRRow_SSSE3(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width) {
void I420ToABGRRow_SSSE3(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width) {
__asm {
push esi
push edi
......@@ -1362,11 +1362,11 @@ void FastConvertYUVToABGRRow_SSSE3(const uint8* y_buf,
}
__declspec(naked)
void FastConvertYUV444ToARGBRow_SSSE3(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width) {
void I444ToARGBRow_SSSE3(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width) {
__asm {
push esi
push edi
......@@ -1427,11 +1427,11 @@ void FastConvertYUV444ToARGBRow_SSSE3(const uint8* y_buf,
}
#endif
#ifdef HAS_FASTCONVERTYTOARGBROW_SSE2
#ifdef HAS_YTOARGBROW_SSE2
__declspec(naked)
void FastConvertYToARGBRow_SSE2(const uint8* y_buf,
uint8* rgb_buf,
int width) {
void YToARGBRow_SSE2(const uint8* y_buf,
uint8* rgb_buf,
int width) {
__asm {
pcmpeqb xmm4, xmm4 // generate mask 0xff000000
pslld xmm4, 24
......@@ -1529,6 +1529,277 @@ __asm {
}
}
#endif
#ifdef HAS_YUY2TOI420ROW_SSE2
__declspec(naked)
void YUY2ToYRow_SSE2(const uint8* src_yuy2,
uint8* dst_y, int pix) {
__asm {
mov eax, [esp + 4] // src_yuy2
mov edx, [esp + 8] // dst_y
mov ecx, [esp + 12] // pix
pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
psrlw xmm5, 8
convertloop:
movdqa xmm0, [eax]
movdqa xmm1, [eax + 16]
lea eax, [eax + 32]
pand xmm0, xmm5 // even bytes are Y
pand xmm1, xmm5
packuswb xmm0, xmm1
movdqa [edx], xmm0
lea edx, [edx + 16]
sub ecx, 16
ja convertloop
ret
}
}
__declspec(naked)
void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2,
uint8* dst_u, uint8* dst_v, int pix) {
__asm {
push esi
push edi
mov eax, [esp + 8 + 4] // src_yuy2
mov esi, [esp + 8 + 8] // stride_yuy2
mov edx, [esp + 8 + 12] // dst_u
mov edi, [esp + 8 + 16] // dst_v
mov ecx, [esp + 8 + 20] // pix
pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
psrlw xmm5, 8
sub edi, edx
convertloop:
movdqa xmm0, [eax]
movdqa xmm1, [eax + 16]
movdqa xmm2, [eax + esi]
movdqa xmm3, [eax + esi + 16]
lea eax, [eax + 32]
pavgb xmm0, xmm2
pavgb xmm1, xmm3
psrlw xmm0, 8 // YUYV -> UVUV
psrlw xmm1, 8
packuswb xmm0, xmm1
movdqa xmm1, xmm0
pand xmm0, xmm5 // U
packuswb xmm0, xmm0
psrlw xmm1, 8 // V
packuswb xmm1, xmm1
movq qword ptr [edx], xmm0
movq qword ptr [edx + edi], xmm1
lea edx, [edx + 8]
sub ecx, 16
ja convertloop
pop edi
pop esi
ret
}
}
__declspec(naked)
void YUY2ToYRow_Unaligned_SSE2(const uint8* src_yuy2,
uint8* dst_y, int pix) {
__asm {
mov eax, [esp + 4] // src_yuy2
mov edx, [esp + 8] // dst_y
mov ecx, [esp + 12] // pix
pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
psrlw xmm5, 8
convertloop:
movdqu xmm0, [eax]
movdqu xmm1, [eax + 16]
lea eax, [eax + 32]
pand xmm0, xmm5 // even bytes are Y
pand xmm1, xmm5
packuswb xmm0, xmm1
movdqu [edx], xmm0
lea edx, [edx + 16]
sub ecx, 16
ja convertloop
ret
}
}
__declspec(naked)
void YUY2ToUVRow_Unaligned_SSE2(const uint8* src_yuy2, int stride_yuy2,
uint8* dst_u, uint8* dst_v, int pix) {
__asm {
push esi
push edi
mov eax, [esp + 8 + 4] // src_yuy2
mov esi, [esp + 8 + 8] // stride_yuy2
mov edx, [esp + 8 + 12] // dst_u
mov edi, [esp + 8 + 16] // dst_v
mov ecx, [esp + 8 + 20] // pix
pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
psrlw xmm5, 8
sub edi, edx
convertloop:
movdqu xmm0, [eax]
movdqu xmm1, [eax + 16]
movdqu xmm2, [eax + esi]
movdqu xmm3, [eax + esi + 16]
lea eax, [eax + 32]
pavgb xmm0, xmm2
pavgb xmm1, xmm3
psrlw xmm0, 8 // YUYV -> UVUV
psrlw xmm1, 8
packuswb xmm0, xmm1
movdqa xmm1, xmm0
pand xmm0, xmm5 // U
packuswb xmm0, xmm0
psrlw xmm1, 8 // V
packuswb xmm1, xmm1
movq qword ptr [edx], xmm0
movq qword ptr [edx + edi], xmm1
lea edx, [edx + 8]
sub ecx, 16
ja convertloop
pop edi
pop esi
ret
}
}
__declspec(naked)
void UYVYToYRow_SSE2(const uint8* src_uyvy,
uint8* dst_y, int pix) {
__asm {
mov eax, [esp + 4] // src_uyvy
mov edx, [esp + 8] // dst_y
mov ecx, [esp + 12] // pix
convertloop:
movdqa xmm0, [eax]
movdqa xmm1, [eax + 16]
lea eax, [eax + 32]
psrlw xmm0, 8 // odd bytes are Y
psrlw xmm1, 8
packuswb xmm0, xmm1
movdqa [edx], xmm0
lea edx, [edx + 16]
sub ecx, 16
ja convertloop
ret
}
}
__declspec(naked)
void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy,
uint8* dst_u, uint8* dst_v, int pix) {
__asm {
push esi
push edi
mov eax, [esp + 8 + 4] // src_uyvy
mov esi, [esp + 8 + 8] // stride_uyvy
mov edx, [esp + 8 + 12] // dst_u
mov edi, [esp + 8 + 16] // dst_v
mov ecx, [esp + 8 + 20] // pix
pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
psrlw xmm5, 8
sub edi, edx
convertloop:
movdqa xmm0, [eax]
movdqa xmm1, [eax + 16]
movdqa xmm2, [eax + esi]
movdqa xmm3, [eax + esi + 16]
lea eax, [eax + 32]
pavgb xmm0, xmm2
pavgb xmm1, xmm3
pand xmm0, xmm5 // UYVY -> UVUV
pand xmm1, xmm5
packuswb xmm0, xmm1
movdqa xmm1, xmm0
pand xmm0, xmm5 // U
packuswb xmm0, xmm0
psrlw xmm1, 8 // V
packuswb xmm1, xmm1
movq qword ptr [edx], xmm0
movq qword ptr [edx + edi], xmm1
lea edx, [edx + 8]
sub ecx, 16
ja convertloop
pop edi
pop esi
ret
}
}
__declspec(naked)
void UYVYToYRow_Unaligned_SSE2(const uint8* src_uyvy,
uint8* dst_y, int pix) {
__asm {
mov eax, [esp + 4] // src_uyvy
mov edx, [esp + 8] // dst_y
mov ecx, [esp + 12] // pix
convertloop:
movdqu xmm0, [eax]
movdqu xmm1, [eax + 16]
lea eax, [eax + 32]
psrlw xmm0, 8 // odd bytes are Y
psrlw xmm1, 8
packuswb xmm0, xmm1
movdqu [edx], xmm0
lea edx, [edx + 16]
sub ecx, 16
ja convertloop
ret
}
}
__declspec(naked)
void UYVYToUVRow_Unaligned_SSE2(const uint8* src_uyvy, int stride_uyvy,
uint8* dst_u, uint8* dst_v, int pix) {
__asm {
push esi
push edi
mov eax, [esp + 8 + 4] // src_uyvy
mov esi, [esp + 8 + 8] // stride_uyvy
mov edx, [esp + 8 + 12] // dst_u
mov edi, [esp + 8 + 16] // dst_v
mov ecx, [esp + 8 + 20] // pix
pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
psrlw xmm5, 8
sub edi, edx
convertloop:
movdqu xmm0, [eax]
movdqu xmm1, [eax + 16]
movdqu xmm2, [eax + esi]
movdqu xmm3, [eax + esi + 16]
lea eax, [eax + 32]
pavgb xmm0, xmm2
pavgb xmm1, xmm3
pand xmm0, xmm5 // UYVY -> UVUV
pand xmm1, xmm5
packuswb xmm0, xmm1
movdqa xmm1, xmm0
pand xmm0, xmm5 // U
packuswb xmm0, xmm0
psrlw xmm1, 8 // V
packuswb xmm1, xmm1
movq qword ptr [edx], xmm0
movq qword ptr [edx + edi], xmm1
lea edx, [edx + 8]
sub ecx, 16
ja convertloop
pop edi
pop esi
ret
}
}
#endif // HAS_YUY2TOI420ROW_SSE2
#ifdef __cplusplus
} // extern "C"
} // namespace libyuv
......