Commit 952a507c authored by fbarchard@google.com's avatar fbarchard@google.com

I420ToARGB without row buffer faster and unlimited in size. Uses SSSE3…

I420ToARGB without row buffer faster and unlimited in size.  Uses SSSE3 unaligned for multiples of 8 and C for remainder
BUG=none
TEST=none
Review URL: https://webrtc-codereview.appspot.com/470001

git-svn-id: http://libyuv.googlecode.com/svn/trunk@232 16f28f9a-4ce2-e073-06de-1de4eb20be90
parent f3fb7b69
Name: libyuv
URL: http://code.google.com/p/libyuv/
Version: 231
Version: 232
License: BSD
License File: LICENSE
......
......@@ -11,7 +11,7 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_
#define INCLUDE_LIBYUV_VERSION_H_
#define INCLUDE_LIBYUV_VERSION 231
#define INCLUDE_LIBYUV_VERSION 232
#endif // INCLUDE_LIBYUV_VERSION_H_
......@@ -671,13 +671,15 @@ int I420ToARGB(const uint8* src_y, int src_stride_y,
}
}
#elif defined(HAS_I420TOARGBROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
I420ToARGBRow = I420ToARGBRow_Any_SSSE3;
if (IS_ALIGNED(width, 8) &&
IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
if (IS_ALIGNED(width, 8)) {
I420ToARGBRow = I420ToARGBRow_Unaligned_SSSE3;
if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
I420ToARGBRow = I420ToARGBRow_SSSE3;
}
}
}
#endif
for (int y = 0; y < height; ++y) {
......@@ -717,13 +719,15 @@ int I420ToBGRA(const uint8* src_y, int src_stride_y,
}
}
#elif defined(HAS_I420TOBGRAROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
I420ToBGRARow = I420ToBGRARow_Any_SSSE3;
if (IS_ALIGNED(width, 8) &&
IS_ALIGNED(dst_bgra, 16) && IS_ALIGNED(dst_stride_bgra, 16)) {
if (IS_ALIGNED(width, 8)) {
I420ToBGRARow = I420ToBGRARow_Unaligned_SSSE3;
if (IS_ALIGNED(dst_bgra, 16) && IS_ALIGNED(dst_stride_bgra, 16)) {
I420ToBGRARow = I420ToBGRARow_SSSE3;
}
}
}
#endif
for (int y = 0; y < height; ++y) {
......@@ -763,13 +767,15 @@ int I420ToABGR(const uint8* src_y, int src_stride_y,
}
}
#elif defined(HAS_I420TOABGRROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
I420ToABGRRow = I420ToABGRRow_Any_SSSE3;
if (IS_ALIGNED(width, 8) &&
IS_ALIGNED(dst_abgr, 16) && IS_ALIGNED(dst_stride_abgr, 16)) {
if (IS_ALIGNED(width, 8)) {
I420ToABGRRow = I420ToABGRRow_Unaligned_SSSE3;
if (IS_ALIGNED(dst_abgr, 16) && IS_ALIGNED(dst_stride_abgr, 16)) {
I420ToABGRRow = I420ToABGRRow_SSSE3;
}
}
}
#endif
for (int y = 0; y < height; ++y) {
......@@ -816,7 +822,9 @@ int I420ToRGB24(const uint8* src_y, int src_stride_y,
ARGBToRGB24Row_C;
#if defined(HAS_ARGBTORGB24ROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
if (width * 3 <= kMaxStride) {
ARGBToRGB24Row = ARGBToRGB24Row_Any_SSSE3;
}
if (IS_ALIGNED(width, 16) &&
IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
ARGBToRGB24Row = ARGBToRGB24Row_SSSE3;
......@@ -869,7 +877,9 @@ int I420ToRAW(const uint8* src_y, int src_stride_y,
ARGBToRAWRow_C;
#if defined(HAS_ARGBTORAWROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
if (width * 3 <= kMaxStride) {
ARGBToRAWRow = ARGBToRAWRow_Any_SSSE3;
}
if (IS_ALIGNED(width, 16) &&
IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
ARGBToRAWRow = ARGBToRAWRow_SSSE3;
......@@ -922,7 +932,9 @@ int I420ToRGB565(const uint8* src_y, int src_stride_y,
ARGBToRGB565Row_C;
#if defined(HAS_ARGBTORGB565ROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) {
if (width * 2 <= kMaxStride) {
ARGBToRGB565Row = ARGBToRGB565Row_Any_SSE2;
}
if (IS_ALIGNED(width, 4)) {
ARGBToRGB565Row = ARGBToRGB565Row_SSE2;
}
......@@ -974,7 +986,9 @@ int I420ToARGB1555(const uint8* src_y, int src_stride_y,
ARGBToARGB1555Row_C;
#if defined(HAS_ARGBTOARGB1555ROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) {
if (width * 2 <= kMaxStride) {
ARGBToARGB1555Row = ARGBToARGB1555Row_Any_SSE2;
}
if (IS_ALIGNED(width, 4)) {
ARGBToARGB1555Row = ARGBToARGB1555Row_SSE2;
}
......@@ -1026,7 +1040,9 @@ int I420ToARGB4444(const uint8* src_y, int src_stride_y,
ARGBToARGB4444Row_C;
#if defined(HAS_ARGBTOARGB4444ROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) {
if (width * 2 <= kMaxStride) {
ARGBToARGB4444Row = ARGBToARGB4444Row_Any_SSE2;
}
if (IS_ALIGNED(width, 4)) {
ARGBToARGB4444Row = ARGBToARGB4444Row_SSE2;
}
......
......@@ -216,7 +216,7 @@ int I422ToARGB(const uint8* src_y, int src_stride_y,
}
}
#elif defined(HAS_I420TOARGBROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
I420ToARGBRow = I420ToARGBRow_Any_SSSE3;
if (IS_ALIGNED(width, 8) &&
IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
......@@ -478,7 +478,9 @@ int ARGBToRGB24(const uint8* src_argb, int src_stride_argb,
#if defined(HAS_ARGBTORGB24ROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3) &&
IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) {
if (width * 3 <= kMaxStride) {
ARGBToRGB24Row = ARGBToRGB24Row_Any_SSSE3;
}
if (IS_ALIGNED(width, 16) &&
IS_ALIGNED(dst_rgb24, 16) && IS_ALIGNED(dst_stride_rgb24, 16)) {
ARGBToRGB24Row = ARGBToRGB24Row_SSSE3;
......@@ -508,7 +510,9 @@ int ARGBToRAW(const uint8* src_argb, int src_stride_argb,
#if defined(HAS_ARGBTORAWROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3) &&
IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) {
if (width * 3 <= kMaxStride) {
ARGBToRAWRow = ARGBToRAWRow_Any_SSSE3;
}
if (IS_ALIGNED(width, 16) &&
IS_ALIGNED(dst_raw, 16) && IS_ALIGNED(dst_stride_raw, 16)) {
ARGBToRAWRow = ARGBToRAWRow_SSSE3;
......@@ -548,7 +552,7 @@ int NV12ToARGB(const uint8* src_y, int src_stride_y,
}
}
#elif defined(HAS_I420TOARGBROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
I420ToARGBRow = I420ToARGBRow_Any_SSSE3;
if (IS_ALIGNED(width, 8) &&
IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
......
......@@ -18,7 +18,7 @@ namespace libyuv {
extern "C" {
#endif
#define kMaxStride (2048 * 4)
#define kMaxStride (2560 * 4)
#define IS_ALIGNED(p, a) (!((uintptr_t)(p) & ((a) - 1)))
#if defined(COVERAGE_ENABLED) || defined(TARGET_IPHONE_SIMULATOR)
......@@ -236,6 +236,24 @@ void I420ToABGRRow_SSSE3(const uint8* y_buf,
uint8* rgb_buf,
int width);
void I420ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width);
void I420ToBGRARow_Unaligned_SSSE3(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width);
void I420ToABGRRow_Unaligned_SSSE3(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width);
void I444ToARGBRow_SSSE3(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
......
......@@ -519,29 +519,33 @@ void ARGBBlendRow_C(const uint8* src_argb, uint8* dst_argb, int width) {
}
// Wrappers to handle odd sizes/alignments
#define MAKEYUVANY(NAMEANY, NAME, COPYROW) \
#define YUVANY(NAMEANY, I420TORGB_SSE, I420TORGB_C) \
void NAMEANY(const uint8* y_buf, \
const uint8* u_buf, \
const uint8* v_buf, \
uint8* rgb_buf, \
int width) { \
SIMD_ALIGNED(uint8 row[kMaxStride]); \
NAME(y_buf, u_buf, v_buf, row, width); \
COPYROW(row, rgb_buf, width << 2); \
int n = width & ~7; \
I420TORGB_SSE(y_buf, u_buf, v_buf, rgb_buf, n); \
I420TORGB_C(y_buf + n, \
u_buf + (n >> 1), \
v_buf + (n >> 1), \
rgb_buf + n * 4, width & 7); \
}
#if defined(HAS_I420TOARGBROW_SSSE3)
MAKEYUVANY(I420ToARGBRow_Any_SSSE3, I420ToARGBRow_SSSE3, CopyRow_X86)
MAKEYUVANY(I420ToBGRARow_Any_SSSE3, I420ToBGRARow_SSSE3, CopyRow_X86)
MAKEYUVANY(I420ToABGRRow_Any_SSSE3, I420ToABGRRow_SSSE3, CopyRow_X86)
YUVANY(I420ToARGBRow_Any_SSSE3, I420ToARGBRow_Unaligned_SSSE3, I420ToARGBRow_C)
YUVANY(I420ToBGRARow_Any_SSSE3, I420ToBGRARow_Unaligned_SSSE3, I420ToBGRARow_C)
YUVANY(I420ToABGRRow_Any_SSSE3, I420ToABGRRow_Unaligned_SSSE3, I420ToABGRRow_C)
#endif
#if defined(HAS_I420TOARGBROW_NEON)
MAKEYUVANY(I420ToARGBRow_Any_NEON, I420ToARGBRow_NEON, CopyRow_C)
MAKEYUVANY(I420ToBGRARow_Any_NEON, I420ToBGRARow_NEON, CopyRow_C)
MAKEYUVANY(I420ToABGRRow_Any_NEON, I420ToABGRRow_NEON, CopyRow_C)
YUVANY(I420ToARGBRow_Any_NEON, I420ToARGBRow_NEON, I420ToARGBRow_C)
YUVANY(I420ToBGRARow_Any_NEON, I420ToBGRARow_NEON, I420ToBGRARow_C)
YUVANY(I420ToABGRRow_Any_NEON, I420ToABGRRow_NEON, I420ToABGRRow_C)
#endif
#undef YUVANY
#define MAKEYUVANYRGB(NAMEANY, ARGBTORGB, BPP) \
#define RGBANY(NAMEANY, ARGBTORGB, BPP) \
void NAMEANY(const uint8* argb_buf, \
uint8* rgb_buf, \
int width) { \
......@@ -551,41 +555,45 @@ MAKEYUVANY(I420ToABGRRow_Any_NEON, I420ToABGRRow_NEON, CopyRow_C)
}
#if defined(HAS_ARGBTORGB24ROW_SSSE3)
MAKEYUVANYRGB(ARGBToRGB24Row_Any_SSSE3, ARGBToRGB24Row_SSSE3, 3)
MAKEYUVANYRGB(ARGBToRAWRow_Any_SSSE3, ARGBToRAWRow_SSSE3, 3)
MAKEYUVANYRGB(ARGBToRGB565Row_Any_SSE2, ARGBToRGB565Row_SSE2, 2)
MAKEYUVANYRGB(ARGBToARGB1555Row_Any_SSE2, ARGBToARGB1555Row_SSE2, 2)
MAKEYUVANYRGB(ARGBToARGB4444Row_Any_SSE2, ARGBToARGB4444Row_SSE2, 2)
RGBANY(ARGBToRGB24Row_Any_SSSE3, ARGBToRGB24Row_SSSE3, 3)
RGBANY(ARGBToRAWRow_Any_SSSE3, ARGBToRAWRow_SSSE3, 3)
RGBANY(ARGBToRGB565Row_Any_SSE2, ARGBToRGB565Row_SSE2, 2)
RGBANY(ARGBToARGB1555Row_Any_SSE2, ARGBToARGB1555Row_SSE2, 2)
RGBANY(ARGBToARGB4444Row_Any_SSE2, ARGBToARGB4444Row_SSE2, 2)
#endif
#undef RGBANY
#ifdef HAS_ARGBTOYROW_SSSE3
#define MAKEYANY(NAMEANY, ARGBTOY_SSE, BPP) \
#define YANY(NAMEANY, ARGBTOY_SSE, BPP) \
void NAMEANY(const uint8* src_argb, uint8* dst_y, int width) { \
ARGBTOY_SSE(src_argb, dst_y, width - 16); \
ARGBTOY_SSE(src_argb + (width - 16) * BPP, dst_y + (width - 16), 16); \
}
MAKEYANY(ARGBToYRow_Any_SSSE3, ARGBToYRow_Unaligned_SSSE3, 4)
MAKEYANY(BGRAToYRow_Any_SSSE3, BGRAToYRow_Unaligned_SSSE3, 4)
MAKEYANY(ABGRToYRow_Any_SSSE3, ABGRToYRow_Unaligned_SSSE3, 4)
MAKEYANY(YUY2ToYRow_Any_SSE2, YUY2ToYRow_Unaligned_SSE2, 2)
MAKEYANY(UYVYToYRow_Any_SSE2, UYVYToYRow_Unaligned_SSE2, 2)
YANY(ARGBToYRow_Any_SSSE3, ARGBToYRow_Unaligned_SSSE3, 4)
YANY(BGRAToYRow_Any_SSSE3, BGRAToYRow_Unaligned_SSSE3, 4)
YANY(ABGRToYRow_Any_SSSE3, ABGRToYRow_Unaligned_SSSE3, 4)
YANY(YUY2ToYRow_Any_SSE2, YUY2ToYRow_Unaligned_SSE2, 2)
YANY(UYVYToYRow_Any_SSE2, UYVYToYRow_Unaligned_SSE2, 2)
#undef YANY
#define MAKEUVANY(NAMEANY, ARGBTOUV_SSE, ARGBTOUV_C, BPP) \
void NAMEANY(const uint8* src_argb0, int src_stride_argb, \
#define UVANY(NAMEANY, ARGBTOUV_SSE, ARGBTOUV_C, BPP) \
void NAMEANY(const uint8* src_argb, int src_stride_argb, \
uint8* dst_u, uint8* dst_v, int width) { \
ARGBTOUV_SSE(src_argb0, src_stride_argb, dst_u, dst_v, width & ~15); \
ARGBTOUV_C(src_argb0 + (width & ~15) * BPP, src_stride_argb, \
dst_u + (width & ~15) / 2, dst_v + (width & ~15) / 2, \
int n = width & ~15; \
ARGBTOUV_SSE(src_argb, src_stride_argb, dst_u, dst_v, n); \
ARGBTOUV_C(src_argb + n * BPP, src_stride_argb, \
dst_u + (n >> 1), \
dst_v + (n >> 1), \
width & 15); \
}
MAKEUVANY(ARGBToUVRow_Any_SSSE3, ARGBToUVRow_Unaligned_SSSE3, ARGBToUVRow_C, 4)
MAKEUVANY(BGRAToUVRow_Any_SSSE3, BGRAToUVRow_Unaligned_SSSE3, BGRAToUVRow_C, 4)
MAKEUVANY(ABGRToUVRow_Any_SSSE3, ABGRToUVRow_Unaligned_SSSE3, ABGRToUVRow_C, 4)
MAKEUVANY(YUY2ToUVRow_Any_SSE2, YUY2ToUVRow_Unaligned_SSE2, YUY2ToUVRow_C, 2)
MAKEUVANY(UYVYToUVRow_Any_SSE2, UYVYToUVRow_Unaligned_SSE2, UYVYToUVRow_C, 2)
UVANY(ARGBToUVRow_Any_SSSE3, ARGBToUVRow_Unaligned_SSSE3, ARGBToUVRow_C, 4)
UVANY(BGRAToUVRow_Any_SSSE3, BGRAToUVRow_Unaligned_SSSE3, BGRAToUVRow_C, 4)
UVANY(ABGRToUVRow_Any_SSSE3, ABGRToUVRow_Unaligned_SSSE3, ABGRToUVRow_C, 4)
UVANY(YUY2ToUVRow_Any_SSE2, YUY2ToUVRow_Unaligned_SSE2, YUY2ToUVRow_C, 2)
UVANY(UYVYToUVRow_Any_SSE2, UYVYToUVRow_Unaligned_SSE2, UYVYToUVRow_C, 2)
#undef UVANY
#endif
#ifdef __cplusplus
......
......@@ -1389,6 +1389,109 @@ void OMITFP I420ToABGRRow_SSSE3(const uint8* y_buf,
);
}
void OMITFP I420ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width) {
asm volatile (
"sub %1,%2 \n"
"pcmpeqb %%xmm5,%%xmm5 \n"
"pxor %%xmm4,%%xmm4 \n"
"1: \n"
YUVTORGB
"punpcklbw %%xmm1,%%xmm0 \n"
"punpcklbw %%xmm5,%%xmm2 \n"
"movdqa %%xmm0,%%xmm1 \n"
"punpcklwd %%xmm2,%%xmm0 \n"
"punpckhwd %%xmm2,%%xmm1 \n"
"movdqu %%xmm0,(%3) \n"
"movdqu %%xmm1,0x10(%3) \n"
"lea 0x20(%3),%3 \n"
"sub $0x8,%4 \n"
"jg 1b \n"
: "+r"(y_buf), // %0
"+r"(u_buf), // %1
"+r"(v_buf), // %2
"+r"(rgb_buf), // %3
"+rm"(width) // %4
: "r"(&kYuvConstants.kUVToB) // %5
: "memory", "cc"
#if defined(__SSE2__)
, "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
);
}
void OMITFP I420ToBGRARow_Unaligned_SSSE3(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width) {
asm volatile (
"sub %1,%2 \n"
"pcmpeqb %%xmm5,%%xmm5 \n"
"pxor %%xmm4,%%xmm4 \n"
"1: \n"
YUVTORGB
"pcmpeqb %%xmm5,%%xmm5 \n"
"punpcklbw %%xmm0,%%xmm1 \n"
"punpcklbw %%xmm2,%%xmm5 \n"
"movdqa %%xmm5,%%xmm0 \n"
"punpcklwd %%xmm1,%%xmm5 \n"
"punpckhwd %%xmm1,%%xmm0 \n"
"movdqu %%xmm5,(%3) \n"
"movdqu %%xmm0,0x10(%3) \n"
"lea 0x20(%3),%3 \n"
"sub $0x8,%4 \n"
"jg 1b \n"
: "+r"(y_buf), // %0
"+r"(u_buf), // %1
"+r"(v_buf), // %2
"+r"(rgb_buf), // %3
"+rm"(width) // %4
: "r"(&kYuvConstants.kUVToB) // %5
: "memory", "cc"
#if defined(__SSE2__)
, "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
);
}
void OMITFP I420ToABGRRow_Unaligned_SSSE3(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width) {
asm volatile (
"sub %1,%2 \n"
"pcmpeqb %%xmm5,%%xmm5 \n"
"pxor %%xmm4,%%xmm4 \n"
"1: \n"
YUVTORGB
"punpcklbw %%xmm1,%%xmm2 \n"
"punpcklbw %%xmm5,%%xmm0 \n"
"movdqa %%xmm2,%%xmm1 \n"
"punpcklwd %%xmm0,%%xmm2 \n"
"punpckhwd %%xmm0,%%xmm1 \n"
"movdqu %%xmm2,(%3) \n"
"movdqu %%xmm1,0x10(%3) \n"
"lea 0x20(%3),%3 \n"
"sub $0x8,%4 \n"
"jg 1b \n"
: "+r"(y_buf), // %0
"+r"(u_buf), // %1
"+r"(v_buf), // %2
"+r"(rgb_buf), // %3
"+rm"(width) // %4
: "r"(&kYuvConstants.kUVToB) // %5
: "memory", "cc"
#if defined(__SSE2__)
, "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
);
}
void OMITFP I444ToARGBRow_SSSE3(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
......
......@@ -1388,6 +1388,126 @@ void I420ToABGRRow_SSSE3(const uint8* y_buf,
}
}
__declspec(naked)
void I420ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width) {
__asm {
push esi
push edi
mov eax, [esp + 8 + 4] // Y
mov esi, [esp + 8 + 8] // U
mov edi, [esp + 8 + 12] // V
mov edx, [esp + 8 + 16] // rgb
mov ecx, [esp + 8 + 20] // width
sub edi, esi
pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
pxor xmm4, xmm4
align 16
convertloop:
YUVTORGB
// Step 3: Weave into ARGB
punpcklbw xmm0, xmm1 // BG
punpcklbw xmm2, xmm5 // RA
movdqa xmm1, xmm0
punpcklwd xmm0, xmm2 // BGRA first 4 pixels
punpckhwd xmm1, xmm2 // BGRA next 4 pixels
movdqu [edx], xmm0
movdqu [edx + 16], xmm1
lea edx, [edx + 32]
sub ecx, 8
jg convertloop
pop edi
pop esi
ret
}
}
__declspec(naked)
void I420ToBGRARow_Unaligned_SSSE3(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width) {
__asm {
push esi
push edi
mov eax, [esp + 8 + 4] // Y
mov esi, [esp + 8 + 8] // U
mov edi, [esp + 8 + 12] // V
mov edx, [esp + 8 + 16] // rgb
mov ecx, [esp + 8 + 20] // width
sub edi, esi
pxor xmm4, xmm4
align 16
convertloop:
YUVTORGB
// Step 3: Weave into BGRA
pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
punpcklbw xmm1, xmm0 // GB
punpcklbw xmm5, xmm2 // AR
movdqa xmm0, xmm5
punpcklwd xmm5, xmm1 // BGRA first 4 pixels
punpckhwd xmm0, xmm1 // BGRA next 4 pixels
movdqu [edx], xmm5
movdqu [edx + 16], xmm0
lea edx, [edx + 32]
sub ecx, 8
jg convertloop
pop edi
pop esi
ret
}
}
__declspec(naked)
void I420ToABGRRow_Unaligned_SSSE3(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width) {
__asm {
push esi
push edi
mov eax, [esp + 8 + 4] // Y
mov esi, [esp + 8 + 8] // U
mov edi, [esp + 8 + 12] // V
mov edx, [esp + 8 + 16] // rgb
mov ecx, [esp + 8 + 20] // width
sub edi, esi
pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
pxor xmm4, xmm4
align 16
convertloop:
YUVTORGB
// Step 3: Weave into ARGB
punpcklbw xmm2, xmm1 // RG
punpcklbw xmm0, xmm5 // BA
movdqa xmm1, xmm2
punpcklwd xmm2, xmm0 // RGBA first 4 pixels
punpckhwd xmm1, xmm0 // RGBA next 4 pixels
movdqu [edx], xmm2
movdqu [edx + 16], xmm1
lea edx, [edx + 32]
sub ecx, 8
jg convertloop
pop edi
pop esi
ret
}
}
__declspec(naked)
void I444ToARGBRow_SSSE3(const uint8* y_buf,
const uint8* u_buf,
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment