Commit b5b27d13 authored by fbarchard@google.com

ARGBToYUV with SSSE3 on any size/alignment

BUG=none
TEST=none
Review URL: https://webrtc-codereview.appspot.com/366011

git-svn-id: http://libyuv.googlecode.com/svn/trunk@161 16f28f9a-4ce2-e073-06de-1de4eb20be90
parent caf39525
Name: libyuv
URL: http://code.google.com/p/libyuv/
Version: 160
Version: 161
License: BSD
License File: LICENSE
......
......@@ -16,7 +16,7 @@ namespace libyuv {
extern "C" {
#endif
#define LIBYUV_VERSION 160
#define LIBYUV_VERSION 161
#ifdef __cplusplus
} // extern "C"
......
......@@ -365,6 +365,11 @@ int ARGBToI420(const uint8* src_frame, int src_stride_frame,
IS_ALIGNED(src_frame, 16) && IS_ALIGNED(src_stride_frame, 16) &&
IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
ARGBToYRow = ARGBToYRow_SSSE3;
} else if (TestCpuFlag(kCpuHasSSSE3) && width <= kMaxStride) {
ARGBToYRow = ARGBToYAnyRow_SSSE3;
if (IS_ALIGNED(width, 16)) {
ARGBToYRow = ARGBToYRow_Unaligned_SSSE3;
}
} else
#endif
{
......@@ -375,6 +380,12 @@ int ARGBToI420(const uint8* src_frame, int src_stride_frame,
IS_ALIGNED(width, 16) &&
IS_ALIGNED(src_frame, 16) && IS_ALIGNED(src_stride_frame, 16)) {
ARGBToUVRow = ARGBToUVRow_SSSE3;
} else if (TestCpuFlag(kCpuHasSSSE3) &&
IS_ALIGNED(width, 2) && width <= kMaxStride) {
ARGBToUVRow = ARGBToUVAnyRow_SSSE3;
if (IS_ALIGNED(width, 16)) {
ARGBToUVRow = ARGBToUVRow_Unaligned_SSSE3;
}
} else
#endif
{
......@@ -416,6 +427,11 @@ int BGRAToI420(const uint8* src_frame, int src_stride_frame,
IS_ALIGNED(src_frame, 16) && IS_ALIGNED(src_stride_frame, 16) &&
IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
ARGBToYRow = BGRAToYRow_SSSE3;
} else if (TestCpuFlag(kCpuHasSSSE3) && width <= kMaxStride) {
ARGBToYRow = BGRAToYAnyRow_SSSE3;
if (IS_ALIGNED(width, 16)) {
ARGBToYRow = BGRAToYRow_Unaligned_SSSE3;
}
} else
#endif
{
......@@ -426,6 +442,12 @@ int BGRAToI420(const uint8* src_frame, int src_stride_frame,
IS_ALIGNED(width, 16) &&
IS_ALIGNED(src_frame, 16) && IS_ALIGNED(src_stride_frame, 16)) {
ARGBToUVRow = BGRAToUVRow_SSSE3;
} else if (TestCpuFlag(kCpuHasSSSE3) &&
IS_ALIGNED(width, 2) && width <= kMaxStride) {
ARGBToUVRow = BGRAToUVAnyRow_SSSE3;
if (IS_ALIGNED(width, 16)) {
ARGBToUVRow = BGRAToUVRow_Unaligned_SSSE3;
}
} else
#endif
{
......@@ -467,6 +489,11 @@ int ABGRToI420(const uint8* src_frame, int src_stride_frame,
IS_ALIGNED(src_frame, 16) && IS_ALIGNED(src_stride_frame, 16) &&
IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
ARGBToYRow = ABGRToYRow_SSSE3;
} else if (TestCpuFlag(kCpuHasSSSE3) && width <= kMaxStride) {
ARGBToYRow = ABGRToYAnyRow_SSSE3;
if (IS_ALIGNED(width, 16)) {
ARGBToYRow = ABGRToYRow_Unaligned_SSSE3;
}
} else
#endif
{
......@@ -477,6 +504,12 @@ int ABGRToI420(const uint8* src_frame, int src_stride_frame,
IS_ALIGNED(width, 16) &&
IS_ALIGNED(src_frame, 16) && IS_ALIGNED(src_stride_frame, 16)) {
ARGBToUVRow = ABGRToUVRow_SSSE3;
} else if (TestCpuFlag(kCpuHasSSSE3) &&
IS_ALIGNED(width, 2) && width <= kMaxStride) {
ARGBToUVRow = ABGRToUVAnyRow_SSSE3;
if (IS_ALIGNED(width, 16)) {
ARGBToUVRow = ABGRToUVRow_Unaligned_SSSE3;
}
} else
#endif
{
......
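Taken together, the convert.cc hunks above all follow the same three-way dispatch: prefer the fully aligned SSSE3 kernel, otherwise use the new unaligned or "Any" SSSE3 paths when the width fits in a row buffer, and fall back to the C row function last. A condensed sketch of that selection logic for the Y plane, with ARGBToYRow_C as the assumed scalar fallback name (not shown in this diff) and the helper itself purely illustrative:

```cpp
// Illustrative only: condenses the selection logic from the hunks above.
// ARGBToYRow_C is the assumed scalar fallback; the other names are declared
// in row.h below.
typedef void (*ARGBToYRowFn)(const uint8* src_argb, uint8* dst_y, int pix);

static ARGBToYRowFn SelectARGBToYRow(const uint8* src_frame, int src_stride_frame,
                                     uint8* dst_y, int dst_stride_y, int width) {
#if defined(HAS_ARGBTOYROW_SSSE3)
  if (TestCpuFlag(kCpuHasSSSE3)) {
    if (IS_ALIGNED(width, 16) &&
        IS_ALIGNED(src_frame, 16) && IS_ALIGNED(src_stride_frame, 16) &&
        IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
      return ARGBToYRow_SSSE3;              // everything aligned: movdqa path
    }
    if (width <= kMaxStride) {
      if (IS_ALIGNED(width, 16)) {
        return ARGBToYRow_Unaligned_SSSE3;  // good width, unaligned pointers: movdqu path
      }
      return ARGBToYAnyRow_SSSE3;           // arbitrary width: scratch-row wrapper
    }
  }
#endif
  return ARGBToYRow_C;                      // portable fallback
}
```

The UV dispatch is the same pattern with the extra IS_ALIGNED(width, 2) requirement, since each row kernel emits one U and one V sample per pair of pixels.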
......@@ -100,12 +100,22 @@ void FastConvertYUVToABGRRow_NEON(const uint8* y_buf,
void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix);
void BGRAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix);
void ABGRToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix);
void ARGBToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix);
void BGRAToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix);
void ABGRToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix);
void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int width);
void BGRAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int width);
void ABGRToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int width);
void ARGBToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int width);
void BGRAToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int width);
void ABGRToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int width);
void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width);
void MirrorRow_SSE2(const uint8* src, uint8* dst, int width);
......@@ -235,6 +245,16 @@ void ARGBToRGB565AnyRow_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix);
void ARGBToARGB1555AnyRow_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix);
void ARGBToARGB4444AnyRow_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix);
void ARGBToYAnyRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix);
void BGRAToYAnyRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix);
void ABGRToYAnyRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix);
void ARGBToUVAnyRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int width);
void BGRAToUVAnyRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int width);
void ABGRToUVAnyRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int width);
void FastConvertYUVToARGBAnyRow_NEON(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
......
......@@ -380,8 +380,17 @@ void NAMEANY(const uint8* y_buf, \
memcpy(rgb_buf, row, width << 2); \
}
#if defined(HAS_FASTCONVERTYUVTOARGBROW_SSSE3)
MAKEYUVANY(FastConvertYUVToARGBAnyRow_SSSE3, FastConvertYUVToARGBRow_SSSE3)
MAKEYUVANY(FastConvertYUVToBGRAAnyRow_SSSE3, FastConvertYUVToBGRARow_SSSE3)
MAKEYUVANY(FastConvertYUVToABGRAnyRow_SSSE3, FastConvertYUVToABGRRow_SSSE3)
#endif
#if defined(HAS_FASTCONVERTYUVTOARGBROW_NEON)
MAKEYUVANY(FastConvertYUVToARGBAnyRow_NEON, FastConvertYUVToARGBRow_NEON)
MAKEYUVANY(FastConvertYUVToBGRAAnyRow_NEON, FastConvertYUVToBGRARow_NEON)
MAKEYUVANY(FastConvertYUVToABGRAnyRow_NEON, FastConvertYUVToABGRRow_NEON)
#endif
// Wrappers to handle odd sizes/alignments
#define MAKEYUVANYRGB(NAMEANY, ARGBTORGB, BPP) \
void NAMEANY(const uint8* argb_buf, \
uint8* rgb_buf, \
......@@ -391,20 +400,40 @@ void NAMEANY(const uint8* argb_buf, \
memcpy(rgb_buf, row, width * BPP); \
}
#if defined(HAS_FASTCONVERTYUVTOARGBROW_SSSE3)
MAKEYUVANY(FastConvertYUVToARGBAnyRow_SSSE3, FastConvertYUVToARGBRow_SSSE3)
MAKEYUVANY(FastConvertYUVToBGRAAnyRow_SSSE3, FastConvertYUVToBGRARow_SSSE3)
MAKEYUVANY(FastConvertYUVToABGRAnyRow_SSSE3, FastConvertYUVToABGRRow_SSSE3)
#if defined(HAS_ARGBTORGB24ROW_SSSE3)
MAKEYUVANYRGB(ARGBToRGB24AnyRow_SSSE3, ARGBToRGB24Row_SSSE3, 3)
MAKEYUVANYRGB(ARGBToRAWAnyRow_SSSE3, ARGBToRAWRow_SSSE3, 3)
MAKEYUVANYRGB(ARGBToRGB565AnyRow_SSE2, ARGBToRGB565Row_SSE2, 2)
MAKEYUVANYRGB(ARGBToARGB1555AnyRow_SSE2, ARGBToARGB1555Row_SSE2, 2)
MAKEYUVANYRGB(ARGBToARGB4444AnyRow_SSE2, ARGBToARGB4444Row_SSE2, 2)
#endif
#if defined(HAS_FASTCONVERTYUVTOARGBROW_NEON)
MAKEYUVANY(FastConvertYUVToARGBAnyRow_NEON, FastConvertYUVToARGBRow_NEON)
MAKEYUVANY(FastConvertYUVToBGRAAnyRow_NEON, FastConvertYUVToBGRARow_NEON)
MAKEYUVANY(FastConvertYUVToABGRAnyRow_NEON, FastConvertYUVToABGRRow_NEON)
#ifdef HAS_ARGBTOYROW_SSSE3
#define MAKEARGBTOYANY(NAMEANY, ARGBTOY) \
void NAMEANY(const uint8* src_argb, uint8* dst_y, int width) { \
SIMD_ALIGNED(uint8 row[kMaxStride]); \
ARGBTOY(src_argb, row, width); \
memcpy(dst_y, row, width); \
}
MAKEARGBTOYANY(ARGBToYAnyRow_SSSE3, ARGBToYRow_Unaligned_SSSE3)
MAKEARGBTOYANY(BGRAToYAnyRow_SSSE3, BGRAToYRow_Unaligned_SSSE3)
MAKEARGBTOYANY(ABGRToYAnyRow_SSSE3, ABGRToYRow_Unaligned_SSSE3)
#define MAKEARGBTOUVANY(NAMEANY, ARGBTOUV) \
void NAMEANY(const uint8* src_argb0, int src_stride_argb, \
uint8* dst_u, uint8* dst_v, int width) { \
SIMD_ALIGNED(uint8 row[kMaxStride * 2]); \
ARGBTOUV(src_argb0, src_stride_argb, row, row + kMaxStride, width); \
int halfwidth = (width + 1) >> 1; \
memcpy(dst_u, row, halfwidth); \
memcpy(dst_v, row + kMaxStride, halfwidth); \
}
MAKEARGBTOUVANY(ARGBToUVAnyRow_SSSE3, ARGBToUVRow_Unaligned_SSSE3)
MAKEARGBTOUVANY(BGRAToUVAnyRow_SSSE3, BGRAToUVRow_Unaligned_SSSE3)
MAKEARGBTOUVANY(ABGRToUVAnyRow_SSSE3, ABGRToUVRow_Unaligned_SSSE3)
#endif
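Expanded, MAKEARGBTOYANY generates a wrapper along the lines of the sketch below: the SSSE3 kernel rounds the width up to a whole number of 16-pixel groups and writes into an aligned scratch row (which is why the callers above require width <= kMaxStride), and only the valid width bytes are copied to the real, possibly unaligned destination.

```cpp
// Roughly what MAKEARGBTOYANY(ARGBToYAnyRow_SSSE3, ARGBToYRow_Unaligned_SSSE3)
// expands to. SIMD_ALIGNED and kMaxStride come from libyuv's row headers.
void ARGBToYAnyRow_SSSE3(const uint8* src_argb, uint8* dst_y, int width) {
  SIMD_ALIGNED(uint8 row[kMaxStride]);              // 16-byte aligned scratch row
  // The kernel may write a partial tail group past 'width', but only into
  // the scratch row, never into the caller's destination.
  ARGBToYRow_Unaligned_SSSE3(src_argb, row, width);
  memcpy(dst_y, row, width);                        // copy only the valid bytes
}
```

MAKEARGBTOUVANY is the analogous wrapper with a double-height scratch buffer: U is produced in the first kMaxStride bytes, V in the second, and (width + 1) >> 1 bytes of each are copied out.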
#ifdef __cplusplus
......
......@@ -257,6 +257,43 @@ void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
, "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
);
}
void ARGBToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
asm volatile (
"movdqa %4,%%xmm5 \n"
"movdqa %3,%%xmm4 \n"
"1: \n"
"movdqu (%0),%%xmm0 \n"
"movdqu 0x10(%0),%%xmm1 \n"
"movdqu 0x20(%0),%%xmm2 \n"
"movdqu 0x30(%0),%%xmm3 \n"
"pmaddubsw %%xmm4,%%xmm0 \n"
"pmaddubsw %%xmm4,%%xmm1 \n"
"pmaddubsw %%xmm4,%%xmm2 \n"
"pmaddubsw %%xmm4,%%xmm3 \n"
"lea 0x40(%0),%0 \n"
"phaddw %%xmm1,%%xmm0 \n"
"phaddw %%xmm3,%%xmm2 \n"
"psrlw $0x7,%%xmm0 \n"
"psrlw $0x7,%%xmm2 \n"
"packuswb %%xmm2,%%xmm0 \n"
"paddb %%xmm5,%%xmm0 \n"
"movdqu %%xmm0,(%1) \n"
"lea 0x10(%1),%1 \n"
"sub $0x10,%2 \n"
"ja 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_y), // %1
"+r"(pix) // %2
: "m"(kARGBToY), // %3
"m"(kAddY16) // %4
: "memory", "cc"
#if defined(__SSE2__)
, "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
);
}
#endif
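The only difference from ARGBToYRow_SSSE3 is that loads and stores use movdqu instead of movdqa; the arithmetic is identical. As a reference for that arithmetic, a scalar sketch of one pixel, with placeholder coefficient names since the kARGBToY byte constants themselves are not part of this diff:

```cpp
// Scalar model of the per-pixel math in ARGBToYRow_*_SSSE3.
// The pixel's bytes are b, g, r, a in memory (alpha has a zero coefficient);
// YB/YG/YR stand in for the values packed into kARGBToY.
static inline uint8 ARGBPixelToY(uint8 b, uint8 g, uint8 r,
                                 int YB, int YG, int YR) {
  int y = (b * YB + g * YG + r * YR) >> 7;  // pmaddubsw + phaddw, psrlw $0x7
  return static_cast<uint8>(y + 16);        // paddb kAddY16 (packuswb saturation omitted)
}
```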
......@@ -325,6 +362,74 @@ void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
#endif
);
}
void ARGBToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int width) {
asm volatile (
"movdqa %0,%%xmm4 \n"
"movdqa %1,%%xmm3 \n"
"movdqa %2,%%xmm5 \n"
:
: "m"(kARGBToU), // %0
"m"(kARGBToV), // %1
"m"(kAddUV128) // %2
:
#if defined(__SSE2__)
"xmm3", "xmm4", "xmm5"
#endif
);
asm volatile (
"sub %1,%2 \n"
"1: \n"
"movdqu (%0),%%xmm0 \n"
"movdqu 0x10(%0),%%xmm1 \n"
"movdqu 0x20(%0),%%xmm2 \n"
"movdqu 0x30(%0),%%xmm6 \n"
"movdqu (%0,%4,1),%%xmm7 \n"
"pavgb %%xmm7,%%xmm0 \n"
"movdqu 0x10(%0,%4,1),%%xmm7 \n"
"pavgb %%xmm7,%%xmm1 \n"
"movdqu 0x20(%0,%4,1),%%xmm7 \n"
"pavgb %%xmm7,%%xmm2 \n"
"movdqu 0x30(%0,%4,1),%%xmm7 \n"
"pavgb %%xmm7,%%xmm6 \n"
"lea 0x40(%0),%0 \n"
"movdqa %%xmm0,%%xmm7 \n"
"shufps $0x88,%%xmm1,%%xmm0 \n"
"shufps $0xdd,%%xmm1,%%xmm7 \n"
"pavgb %%xmm7,%%xmm0 \n"
"movdqa %%xmm2,%%xmm7 \n"
"shufps $0x88,%%xmm6,%%xmm2 \n"
"shufps $0xdd,%%xmm6,%%xmm7 \n"
"pavgb %%xmm7,%%xmm2 \n"
"movdqa %%xmm0,%%xmm1 \n"
"movdqa %%xmm2,%%xmm6 \n"
"pmaddubsw %%xmm4,%%xmm0 \n"
"pmaddubsw %%xmm4,%%xmm2 \n"
"pmaddubsw %%xmm3,%%xmm1 \n"
"pmaddubsw %%xmm3,%%xmm6 \n"
"phaddw %%xmm2,%%xmm0 \n"
"phaddw %%xmm6,%%xmm1 \n"
"psraw $0x8,%%xmm0 \n"
"psraw $0x8,%%xmm1 \n"
"packsswb %%xmm1,%%xmm0 \n"
"paddb %%xmm5,%%xmm0 \n"
"movlps %%xmm0,(%1) \n"
"movhps %%xmm0,(%1,%2,1) \n"
"lea 0x8(%1),%1 \n"
"sub $0x10,%3 \n"
"ja 1b \n"
: "+r"(src_argb0), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
"+rm"(width) // %3
: "r"(static_cast<intptr_t>(src_stride_argb))
: "memory", "cc"
#if defined(__SSE2__)
, "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
#endif
);
}
#endif
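Likewise, the unaligned UV kernel performs the aligned kernel's math on movdqu loads: the two input rows are averaged with pavgb, horizontally paired with shufps/pavgb so each 2x2 block collapses to one pixel, then multiplied against the kARGBToU and kARGBToV vectors, shifted right by 8, and biased by kAddUV128. A scalar sketch of one 2x2 block, again with placeholder coefficients:

```cpp
// Scalar model of the per-block math in ARGBToUVRow_*_SSSE3.
// UB/UG/UR and VB/VG/VR stand in for kARGBToU / kARGBToV (not in this diff).
struct Pixel { uint8 b, g, r; };  // alpha ignored (zero coefficient)

static inline void ARGBBlockToUV(Pixel p00, Pixel p01, Pixel p10, Pixel p11,
                                 int UB, int UG, int UR,
                                 int VB, int VG, int VR,
                                 uint8* dst_u, uint8* dst_v) {
  // Two pavgb passes reduce the 2x2 block to one pixel (rounding simplified).
  int b = (p00.b + p01.b + p10.b + p11.b + 2) >> 2;
  int g = (p00.g + p01.g + p10.g + p11.g + 2) >> 2;
  int r = (p00.r + p01.r + p10.r + p11.r + 2) >> 2;
  // pmaddubsw/phaddw dot products, psraw $0x8, then +128 from kAddUV128.
  *dst_u = static_cast<uint8>(((b * UB + g * UG + r * UR) >> 8) + 128);
  *dst_v = static_cast<uint8>(((b * VB + g * VG + r * VR) >> 8) + 128);
}
```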
#ifdef HAS_FASTCONVERTYUVTOARGBROW_SSSE3
......@@ -624,6 +729,18 @@ void BGRAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
BGRAToARGBRow_SSSE3(src_argb, row, pix);
ARGBToYRow_SSSE3(row, dst_y, pix);
}
void ABGRToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
SIMD_ALIGNED(uint8 row[kMaxStride]);
ABGRToARGBRow_C(src_argb, row, pix);
ARGBToYRow_SSSE3(row, dst_y, pix);
}
void BGRAToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
SIMD_ALIGNED(uint8 row[kMaxStride]);
BGRAToARGBRow_C(src_argb, row, pix);
ARGBToYRow_SSSE3(row, dst_y, pix);
}
#endif
#ifdef HAS_ARGBTOUVROW_SSSE3
......@@ -642,6 +759,22 @@ void BGRAToUVRow_SSSE3(const uint8* src_argb, int src_stride_argb,
BGRAToARGBRow_SSSE3(src_argb + src_stride_argb, row + kMaxStride, pix);
ARGBToUVRow_SSSE3(row, kMaxStride, dst_u, dst_v, pix);
}
void ABGRToUVRow_Unaligned_SSSE3(const uint8* src_argb, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int pix) {
SIMD_ALIGNED(uint8 row[kMaxStride * 2]);
ABGRToARGBRow_C(src_argb, row, pix);
ABGRToARGBRow_C(src_argb + src_stride_argb, row + kMaxStride, pix);
ARGBToUVRow_SSSE3(row, kMaxStride, dst_u, dst_v, pix);
}
void BGRAToUVRow_Unaligned_SSSE3(const uint8* src_argb, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int pix) {
SIMD_ALIGNED(uint8 row[kMaxStride * 2]);
BGRAToARGBRow_C(src_argb, row, pix);
BGRAToARGBRow_C(src_argb + src_stride_argb, row + kMaxStride, pix);
ARGBToUVRow_SSSE3(row, kMaxStride, dst_u, dst_v, pix);
}
#endif
#ifdef HAS_MIRRORROW_SSSE3
......
......@@ -611,6 +611,39 @@ __asm {
}
}
__declspec(naked)
void ARGBToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
__asm {
mov eax, [esp + 4] /* src_argb */
mov edx, [esp + 8] /* dst_y */
mov ecx, [esp + 12] /* pix */
movdqa xmm5, kAddY16
movdqa xmm4, kARGBToY
convertloop:
movdqu xmm0, [eax]
movdqu xmm1, [eax + 16]
movdqu xmm2, [eax + 32]
movdqu xmm3, [eax + 48]
pmaddubsw xmm0, xmm4
pmaddubsw xmm1, xmm4
pmaddubsw xmm2, xmm4
pmaddubsw xmm3, xmm4
lea eax, [eax + 64]
phaddw xmm0, xmm1
phaddw xmm2, xmm3
psrlw xmm0, 7
psrlw xmm2, 7
packuswb xmm0, xmm2
paddb xmm0, xmm5
movdqu [edx], xmm0
lea edx, [edx + 16]
sub ecx, 16
ja convertloop
ret
}
}
__declspec(naked)
void BGRAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
__asm {
......@@ -644,6 +677,39 @@ __asm {
}
}
__declspec(naked)
void BGRAToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
__asm {
mov eax, [esp + 4] /* src_argb */
mov edx, [esp + 8] /* dst_y */
mov ecx, [esp + 12] /* pix */
movdqa xmm5, kAddY16
movdqa xmm4, kBGRAToY
convertloop:
movdqu xmm0, [eax]
movdqu xmm1, [eax + 16]
movdqu xmm2, [eax + 32]
movdqu xmm3, [eax + 48]
pmaddubsw xmm0, xmm4
pmaddubsw xmm1, xmm4
pmaddubsw xmm2, xmm4
pmaddubsw xmm3, xmm4
lea eax, [eax + 64]
phaddw xmm0, xmm1
phaddw xmm2, xmm3
psrlw xmm0, 7
psrlw xmm2, 7
packuswb xmm0, xmm2
paddb xmm0, xmm5
movdqu [edx], xmm0
lea edx, [edx + 16]
sub ecx, 16
ja convertloop
ret
}
}
__declspec(naked)
void ABGRToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
__asm {
......@@ -677,6 +743,39 @@ __asm {
}
}
__declspec(naked)
void ABGRToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
__asm {
mov eax, [esp + 4] /* src_argb */
mov edx, [esp + 8] /* dst_y */
mov ecx, [esp + 12] /* pix */
movdqa xmm5, kAddY16
movdqa xmm4, kABGRToY
convertloop:
movdqu xmm0, [eax]
movdqu xmm1, [eax + 16]
movdqu xmm2, [eax + 32]
movdqu xmm3, [eax + 48]
pmaddubsw xmm0, xmm4
pmaddubsw xmm1, xmm4
pmaddubsw xmm2, xmm4
pmaddubsw xmm3, xmm4
lea eax, [eax + 64]
phaddw xmm0, xmm1
phaddw xmm2, xmm3
psrlw xmm0, 7
psrlw xmm2, 7
packuswb xmm0, xmm2
paddb xmm0, xmm5
movdqu [edx], xmm0
lea edx, [edx + 16]
sub ecx, 16
ja convertloop
ret
}
}
__declspec(naked)
void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int width) {
......@@ -741,6 +840,75 @@ __asm {
}
}
__declspec(naked)
void ARGBToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int width) {
__asm {
push esi
push edi
mov eax, [esp + 8 + 4] // src_argb
mov esi, [esp + 8 + 8] // src_stride_argb
mov edx, [esp + 8 + 12] // dst_u
mov edi, [esp + 8 + 16] // dst_v
mov ecx, [esp + 8 + 20] // pix
movdqa xmm7, kARGBToU
movdqa xmm6, kARGBToV
movdqa xmm5, kAddUV128
sub edi, edx // stride from u to v
convertloop:
/* step 1 - subsample 16x2 argb pixels to 8x1 */
movdqu xmm0, [eax]
movdqu xmm1, [eax + 16]
movdqu xmm2, [eax + 32]
movdqu xmm3, [eax + 48]
movdqu xmm4, [eax + esi]
pavgb xmm0, xmm4
movdqu xmm4, [eax + esi + 16]
pavgb xmm1, xmm4
movdqu xmm4, [eax + esi + 32]
pavgb xmm2, xmm4
movdqu xmm4, [eax + esi + 48]
pavgb xmm3, xmm4
lea eax, [eax + 64]
movdqa xmm4, xmm0
shufps xmm0, xmm1, 0x88
shufps xmm4, xmm1, 0xdd
pavgb xmm0, xmm4
movdqa xmm4, xmm2
shufps xmm2, xmm3, 0x88
shufps xmm4, xmm3, 0xdd
pavgb xmm2, xmm4
// step 2 - convert to U and V
// from here down is very similar to Y code except
// instead of 16 different pixels, it's 8 pixels of U and 8 of V
movdqa xmm1, xmm0
movdqa xmm3, xmm2
pmaddubsw xmm0, xmm7 // U
pmaddubsw xmm2, xmm7
pmaddubsw xmm1, xmm6 // V
pmaddubsw xmm3, xmm6
phaddw xmm0, xmm2
phaddw xmm1, xmm3
psraw xmm0, 8
psraw xmm1, 8
packsswb xmm0, xmm1
paddb xmm0, xmm5 // -> unsigned
// step 3 - store 8 U and 8 V values
movlps qword ptr [edx], xmm0 // U
movhps qword ptr [edx + edi], xmm0 // V
lea edx, [edx + 8]
sub ecx, 16
ja convertloop
pop edi
pop esi
ret
}
}
__declspec(naked)
void BGRAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int width) {
......@@ -805,6 +973,74 @@ __asm {
}
}
__declspec(naked)
void BGRAToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int width) {
__asm {
push esi
push edi
mov eax, [esp + 8 + 4] // src_argb
mov esi, [esp + 8 + 8] // src_stride_argb
mov edx, [esp + 8 + 12] // dst_u
mov edi, [esp + 8 + 16] // dst_v
mov ecx, [esp + 8 + 20] // pix
movdqa xmm7, kBGRAToU
movdqa xmm6, kBGRAToV
movdqa xmm5, kAddUV128
sub edi, edx // stride from u to v
convertloop:
/* step 1 - subsample 16x2 argb pixels to 8x1 */
movdqu xmm0, [eax]
movdqu xmm1, [eax + 16]
movdqu xmm2, [eax + 32]
movdqu xmm3, [eax + 48]
movdqu xmm4, [eax + esi]
pavgb xmm0, xmm4
movdqu xmm4, [eax + esi + 16]
pavgb xmm1, xmm4
movdqu xmm4, [eax + esi + 32]
pavgb xmm2, xmm4
movdqu xmm4, [eax + esi + 48]
pavgb xmm3, xmm4
lea eax, [eax + 64]
movdqa xmm4, xmm0
shufps xmm0, xmm1, 0x88
shufps xmm4, xmm1, 0xdd
pavgb xmm0, xmm4
movdqa xmm4, xmm2
shufps xmm2, xmm3, 0x88
shufps xmm4, xmm3, 0xdd
pavgb xmm2, xmm4
// step 2 - convert to U and V
// from here down is very similar to Y code except
// instead of 16 different pixels, it's 8 pixels of U and 8 of V
movdqa xmm1, xmm0
movdqa xmm3, xmm2
pmaddubsw xmm0, xmm7 // U
pmaddubsw xmm2, xmm7
pmaddubsw xmm1, xmm6 // V
pmaddubsw xmm3, xmm6
phaddw xmm0, xmm2
phaddw xmm1, xmm3
psraw xmm0, 8
psraw xmm1, 8
packsswb xmm0, xmm1
paddb xmm0, xmm5 // -> unsigned
// step 3 - store 8 U and 8 V values
movlps qword ptr [edx], xmm0 // U
movhps qword ptr [edx + edi], xmm0 // V
lea edx, [edx + 8]
sub ecx, 16
ja convertloop
pop edi
pop esi
ret
}
}
__declspec(naked)
void ABGRToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int width) {
......@@ -869,6 +1105,75 @@ __asm {
}
}
__declspec(naked)
void ABGRToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int width) {
__asm {
push esi
push edi
mov eax, [esp + 8 + 4] // src_argb
mov esi, [esp + 8 + 8] // src_stride_argb
mov edx, [esp + 8 + 12] // dst_u
mov edi, [esp + 8 + 16] // dst_v
mov ecx, [esp + 8 + 20] // pix
movdqa xmm7, kABGRToU
movdqa xmm6, kABGRToV
movdqa xmm5, kAddUV128
sub edi, edx // stride from u to v
convertloop:
/* step 1 - subsample 16x2 argb pixels to 8x1 */
movdqu xmm0, [eax]
movdqu xmm1, [eax + 16]
movdqu xmm2, [eax + 32]
movdqu xmm3, [eax + 48]
movdqu xmm4, [eax + esi]
pavgb xmm0, xmm4
movdqu xmm4, [eax + esi + 16]
pavgb xmm1, xmm4
movdqu xmm4, [eax + esi + 32]
pavgb xmm2, xmm4
movdqu xmm4, [eax + esi + 48]
pavgb xmm3, xmm4
lea eax, [eax + 64]
movdqa xmm4, xmm0
shufps xmm0, xmm1, 0x88
shufps xmm4, xmm1, 0xdd
pavgb xmm0, xmm4
movdqa xmm4, xmm2
shufps xmm2, xmm3, 0x88
shufps xmm4, xmm3, 0xdd
pavgb xmm2, xmm4
// step 2 - convert to U and V
// from here down is very similar to Y code except
// instead of 16 different pixels, it's 8 pixels of U and 8 of V
movdqa xmm1, xmm0
movdqa xmm3, xmm2
pmaddubsw xmm0, xmm7 // U
pmaddubsw xmm2, xmm7
pmaddubsw xmm1, xmm6 // V
pmaddubsw xmm3, xmm6
phaddw xmm0, xmm2
phaddw xmm1, xmm3
psraw xmm0, 8
psraw xmm1, 8
packsswb xmm0, xmm1
paddb xmm0, xmm5 // -> unsigned
// step 3 - store 8 U and 8 V values
movlps qword ptr [edx], xmm0 // U
movhps qword ptr [edx + edi], xmm0 // V
lea edx, [edx + 8]
sub ecx, 16
ja convertloop
pop edi
pop esi
ret
}
}
#ifdef HAS_FASTCONVERTYUVTOARGBROW_SSSE3
#define YG 74 /* static_cast<int8>(1.164 * 64 + 0.5) */
......
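With these paths wired up, the planar converters no longer need aligned, multiple-of-16 inputs to use SSSE3. A usage sketch: the full ARGBToI420 parameter list below follows libyuv's per-plane stride convention and is assumed rather than shown in this diff, so treat the header name and argument order as illustrative.

```cpp
#include "libyuv.h"  // umbrella header name assumed

// Convert an odd-width, possibly unaligned ARGB frame to I420.
// Odd widths now hit the *AnyRow_SSSE3 wrappers; multiple-of-16 widths with
// unaligned pointers hit the *_Unaligned_SSSE3 kernels.
bool ArgbFrameToI420(const uint8* argb, int argb_stride,
                     uint8* y, uint8* u, uint8* v,
                     int width, int height) {
  int half_width = (width + 1) / 2;
  return libyuv::ARGBToI420(argb, argb_stride,
                            y, width,        // dst_y, dst_stride_y
                            u, half_width,   // dst_u, dst_stride_u
                            v, half_width,   // dst_v, dst_stride_v
                            width, height) == 0;
}
```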