Commit 25dc0585 authored by fbarchard@google.com's avatar fbarchard@google.com

RGBAToI420_SSSE3 and I420ToRGBA_SSSE3 implemented.

BUG=78
TESTED=gcl lint
Review URL: https://webrtc-codereview.appspot.com/796009

git-svn-id: http://libyuv.googlecode.com/svn/trunk@359 16f28f9a-4ce2-e073-06de-1de4eb20be90
parent 13f38940
Name: libyuv
URL: http://code.google.com/p/libyuv/
Version: 358
Version: 359
License: BSD
License File: LICENSE
......
......@@ -11,6 +11,6 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT
#define INCLUDE_LIBYUV_VERSION_H_
#define LIBYUV_VERSION 358
#define LIBYUV_VERSION 359
#endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT
......@@ -37,9 +37,6 @@ extern "C" {
#define HAS_ABGRTOARGBROW_SSSE3
#define HAS_ABGRTOUVROW_SSSE3
#define HAS_ABGRTOYROW_SSSE3
// TODO(FBARCHARD): #define HAS_RGBATOARGBROW_SSSE3
// TODO(FBARCHARD): #define HAS_RGBATOUVROW_SSSE3
// TODO(FBARCHARD): #define HAS_RGBATOYROW_SSSE3
#define HAS_ARGBTORGBAROW_SSSE3
#define HAS_ARGB1555TOARGBROW_SSE2
#define HAS_ARGB4444TOARGBROW_SSE2
......@@ -61,7 +58,6 @@ extern "C" {
#define HAS_I422TOARGBROW_SSSE3
#define HAS_I422TOBGRAROW_SSSE3
#define HAS_I422TOABGRROW_SSSE3
// TODO(FBARCHARD): #define HAS_I422TORGBAROW_SSSE3
#define HAS_I444TOARGBROW_SSSE3
#define HAS_I411TOARGBROW_SSSE3
#define HAS_I400TOARGBROW_SSE2
......@@ -91,10 +87,13 @@ extern "C" {
// The following are Windows only:
#if !defined(YUV_DISABLE_ASM) && defined(_M_IX86)
// TODO(fbarchard): Investigate possible issue in this function and reenable.
#define HAS_ARGBCOLORTABLEROW_X86
#define HAS_NV12TOARGBROW_SSSE3
#define HAS_NV21TOARGBROW_SSSE3
#define HAS_I422TORGBAROW_SSSE3
#define HAS_RGBATOARGBROW_SSSE3
#define HAS_RGBATOUVROW_SSSE3
#define HAS_RGBATOYROW_SSSE3
#endif
// The following are disabled when SSSE3 is available:
......
......@@ -941,7 +941,7 @@ void ARGBUnattenuateRow_C(const uint8* src_argb, uint8* dst_argb, int width) {
}
#if defined(HAS_I422TOARGBROW_SSSE3)
#ifdef HAS_I422TOARGBROW_SSSE3
YANY(I444ToARGBRow_Any_SSSE3, I444ToARGBRow_Unaligned_SSSE3, I444ToARGBRow_C, 0)
YANY(I422ToARGBRow_Any_SSSE3, I422ToARGBRow_Unaligned_SSSE3, I422ToARGBRow_C, 1)
YANY(I411ToARGBRow_Any_SSSE3, I411ToARGBRow_Unaligned_SSSE3, I411ToARGBRow_C, 2)
......@@ -949,13 +949,17 @@ Y2NY(NV12ToARGBRow_Any_SSSE3, NV12ToARGBRow_Unaligned_SSSE3, NV12ToARGBRow_C, 0)
Y2NY(NV21ToARGBRow_Any_SSSE3, NV21ToARGBRow_Unaligned_SSSE3, NV21ToARGBRow_C, 0)
YANY(I422ToBGRARow_Any_SSSE3, I422ToBGRARow_Unaligned_SSSE3, I422ToBGRARow_C, 1)
YANY(I422ToABGRRow_Any_SSSE3, I422ToABGRRow_Unaligned_SSSE3, I422ToABGRRow_C, 1)
// TODO(fbarchard): NOLINT YANY(I422ToRGBARow_Any_SSSE3, I422ToRGBARow_Unaligned_SSSE3, I422ToRGBARow_C, 1)
#endif
#if defined(HAS_I422TOARGBROW_NEON)
#ifdef HAS_I422TORGBAROW_SSSE3
YANY(I422ToRGBARow_Any_SSSE3, I422ToRGBARow_Unaligned_SSSE3, I422ToRGBARow_C, 1)
#endif
#ifdef HAS_I422TOARGBROW_NEON
YANY(I422ToARGBRow_Any_NEON, I422ToARGBRow_NEON, I422ToARGBRow_C, 1)
YANY(I422ToBGRARow_Any_NEON, I422ToBGRARow_NEON, I422ToBGRARow_C, 1)
YANY(I422ToABGRRow_Any_NEON, I422ToABGRRow_NEON, I422ToABGRRow_C, 1)
// TODO(fbarchard): NOLINT YANY(I422ToRGBARow_Any_NEON, I422ToRGBARow_NEON, I422ToRGBARow_C, 1)
#endif
#ifdef HAS_I422TORGBAROW_NEON
YANY(I422ToRGBARow_Any_NEON, I422ToRGBARow_NEON, I422ToRGBARow_C, 1)
#endif
#undef YANY
......@@ -987,7 +991,9 @@ RGBANY(ARGBToARGB4444Row_Any_SSE2, ARGBToARGB4444Row_SSE2, 2)
YANY(ARGBToYRow_Any_SSSE3, ARGBToYRow_Unaligned_SSSE3, 4)
YANY(BGRAToYRow_Any_SSSE3, BGRAToYRow_Unaligned_SSSE3, 4)
YANY(ABGRToYRow_Any_SSSE3, ABGRToYRow_Unaligned_SSSE3, 4)
// TODO(fbarchard): YANY(RGBAToYRow_Any_SSSE3, RGBAToYRow_Unaligned_SSSE3, 4)
#ifdef HAS_RGBATOYROW_SSSE3
YANY(RGBAToYRow_Any_SSSE3, RGBAToYRow_Unaligned_SSSE3, 4)
#endif
YANY(YUY2ToYRow_Any_SSE2, YUY2ToYRow_Unaligned_SSE2, 2)
YANY(UYVYToYRow_Any_SSE2, UYVYToYRow_Unaligned_SSE2, 2)
#undef YANY
......@@ -1006,7 +1012,9 @@ YANY(UYVYToYRow_Any_SSE2, UYVYToYRow_Unaligned_SSE2, 2)
UVANY(ARGBToUVRow_Any_SSSE3, ARGBToUVRow_Unaligned_SSSE3, ARGBToUVRow_C, 4)
UVANY(BGRAToUVRow_Any_SSSE3, BGRAToUVRow_Unaligned_SSSE3, BGRAToUVRow_C, 4)
UVANY(ABGRToUVRow_Any_SSSE3, ABGRToUVRow_Unaligned_SSSE3, ABGRToUVRow_C, 4)
// TODO(fbarchard): NOLINT UVANY(RGBAToUVRow_Any_SSSE3, RGBAToUVRow_Unaligned_SSSE3, RGBAToUVRow_C, 4)
#ifdef HAS_RGBATOYROW_SSSE3
UVANY(RGBAToUVRow_Any_SSSE3, RGBAToUVRow_Unaligned_SSSE3, RGBAToUVRow_C, 4)
#endif
UVANY(YUY2ToUVRow_Any_SSE2, YUY2ToUVRow_Unaligned_SSE2, YUY2ToUVRow_C, 2)
UVANY(UYVYToUVRow_Any_SSE2, UYVYToUVRow_Unaligned_SSE2, UYVYToUVRow_C, 2)
#undef UVANY
......
......@@ -59,6 +59,19 @@ static const vec8 kABGRToV = {
112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0
};
// Constants for RGBA.
static const vec8 kRGBAToY = {
0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33
};
static const vec8 kRGBAToU = {
0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38
};
static const vec8 kRGBAToV = {
0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112
};
static const uvec8 kAddY16 = {
16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u
};
......@@ -78,16 +91,16 @@ static const uvec8 kShuffleMaskRAWToARGB = {
2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u
};
// Shuffle table for converting ABGR to ARGB.
static const uvec8 kShuffleMaskABGRToARGB = {
2u, 1u, 0u, 3u, 6u, 5u, 4u, 7u, 10u, 9u, 8u, 11u, 14u, 13u, 12u, 15u
};
// Shuffle table for converting BGRA to ARGB.
static const uvec8 kShuffleMaskBGRAToARGB = {
3u, 2u, 1u, 0u, 7u, 6u, 5u, 4u, 11u, 10u, 9u, 8u, 15u, 14u, 13u, 12u
};
// Shuffle table for converting ABGR to ARGB.
static const uvec8 kShuffleMaskABGRToARGB = {
2u, 1u, 0u, 3u, 6u, 5u, 4u, 7u, 10u, 9u, 8u, 11u, 14u, 13u, 12u, 15u
};
// Shuffle table for converting RGBA to ARGB.
static const uvec8 kShuffleMaskRGBAToARGB = {
1u, 2u, 3u, 0u, 5u, 6u, 7u, 4u, 9u, 10u, 11u, 8u, 13u, 14u, 15u, 12u
......@@ -137,12 +150,12 @@ void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
}
__declspec(naked) __declspec(align(16))
void ABGRToARGBRow_SSSE3(const uint8* src_abgr, uint8* dst_argb, int pix) {
void BGRAToARGBRow_SSSE3(const uint8* src_bgra, uint8* dst_argb, int pix) {
__asm {
mov eax, [esp + 4] // src_abgr
mov eax, [esp + 4] // src_bgra
mov edx, [esp + 8] // dst_argb
mov ecx, [esp + 12] // pix
movdqa xmm5, kShuffleMaskABGRToARGB
movdqa xmm5, kShuffleMaskBGRAToARGB
sub edx, eax
align 16
......@@ -158,12 +171,12 @@ __asm {
}
__declspec(naked) __declspec(align(16))
void BGRAToARGBRow_SSSE3(const uint8* src_bgra, uint8* dst_argb, int pix) {
void ABGRToARGBRow_SSSE3(const uint8* src_abgr, uint8* dst_argb, int pix) {
__asm {
mov eax, [esp + 4] // src_bgra
mov eax, [esp + 4] // src_abgr
mov edx, [esp + 8] // dst_argb
mov ecx, [esp + 12] // pix
movdqa xmm5, kShuffleMaskBGRAToARGB
movdqa xmm5, kShuffleMaskABGRToARGB
sub edx, eax
align 16
......@@ -844,6 +857,74 @@ __asm {
}
}
__declspec(naked) __declspec(align(16))
void RGBAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
__asm {
mov eax, [esp + 4] /* src_argb */
mov edx, [esp + 8] /* dst_y */
mov ecx, [esp + 12] /* pix */
movdqa xmm5, kAddY16
movdqa xmm4, kRGBAToY
align 16
convertloop:
movdqa xmm0, [eax]
movdqa xmm1, [eax + 16]
movdqa xmm2, [eax + 32]
movdqa xmm3, [eax + 48]
pmaddubsw xmm0, xmm4
pmaddubsw xmm1, xmm4
pmaddubsw xmm2, xmm4
pmaddubsw xmm3, xmm4
lea eax, [eax + 64]
phaddw xmm0, xmm1
phaddw xmm2, xmm3
psrlw xmm0, 7
psrlw xmm2, 7
packuswb xmm0, xmm2
paddb xmm0, xmm5
movdqa [edx], xmm0
lea edx, [edx + 16]
sub ecx, 16
jg convertloop
ret
}
}
__declspec(naked) __declspec(align(16))
void RGBAToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
__asm {
mov eax, [esp + 4] /* src_argb */
mov edx, [esp + 8] /* dst_y */
mov ecx, [esp + 12] /* pix */
movdqa xmm5, kAddY16
movdqa xmm4, kRGBAToY
align 16
convertloop:
movdqu xmm0, [eax]
movdqu xmm1, [eax + 16]
movdqu xmm2, [eax + 32]
movdqu xmm3, [eax + 48]
pmaddubsw xmm0, xmm4
pmaddubsw xmm1, xmm4
pmaddubsw xmm2, xmm4
pmaddubsw xmm3, xmm4
lea eax, [eax + 64]
phaddw xmm0, xmm1
phaddw xmm2, xmm3
psrlw xmm0, 7
psrlw xmm2, 7
packuswb xmm0, xmm2
paddb xmm0, xmm5
sub ecx, 16
movdqu [edx], xmm0
lea edx, [edx + 16]
jg convertloop
ret
}
}
__declspec(naked) __declspec(align(16))
void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int width) {
......@@ -1251,6 +1332,142 @@ __asm {
ret
}
}
__declspec(naked) __declspec(align(16))
void RGBAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int width) {
__asm {
push esi
push edi
mov eax, [esp + 8 + 4] // src_argb
mov esi, [esp + 8 + 8] // src_stride_argb
mov edx, [esp + 8 + 12] // dst_u
mov edi, [esp + 8 + 16] // dst_v
mov ecx, [esp + 8 + 20] // pix
movdqa xmm7, kRGBAToU
movdqa xmm6, kRGBAToV
movdqa xmm5, kAddUV128
sub edi, edx // stride from u to v
align 16
convertloop:
/* step 1 - subsample 16x2 argb pixels to 8x1 */
movdqa xmm0, [eax]
movdqa xmm1, [eax + 16]
movdqa xmm2, [eax + 32]
movdqa xmm3, [eax + 48]
pavgb xmm0, [eax + esi]
pavgb xmm1, [eax + esi + 16]
pavgb xmm2, [eax + esi + 32]
pavgb xmm3, [eax + esi + 48]
lea eax, [eax + 64]
movdqa xmm4, xmm0
shufps xmm0, xmm1, 0x88
shufps xmm4, xmm1, 0xdd
pavgb xmm0, xmm4
movdqa xmm4, xmm2
shufps xmm2, xmm3, 0x88
shufps xmm4, xmm3, 0xdd
pavgb xmm2, xmm4
// step 2 - convert to U and V
// from here down is very similar to Y code except
// instead of 16 different pixels, its 8 pixels of U and 8 of V
movdqa xmm1, xmm0
movdqa xmm3, xmm2
pmaddubsw xmm0, xmm7 // U
pmaddubsw xmm2, xmm7
pmaddubsw xmm1, xmm6 // V
pmaddubsw xmm3, xmm6
phaddw xmm0, xmm2
phaddw xmm1, xmm3
psraw xmm0, 8
psraw xmm1, 8
packsswb xmm0, xmm1
paddb xmm0, xmm5 // -> unsigned
// step 3 - store 8 U and 8 V values
sub ecx, 16
movlps qword ptr [edx], xmm0 // U
movhps qword ptr [edx + edi], xmm0 // V
lea edx, [edx + 8]
jg convertloop
pop edi
pop esi
ret
}
}
__declspec(naked) __declspec(align(16))
void RGBAToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int width) {
__asm {
push esi
push edi
mov eax, [esp + 8 + 4] // src_argb
mov esi, [esp + 8 + 8] // src_stride_argb
mov edx, [esp + 8 + 12] // dst_u
mov edi, [esp + 8 + 16] // dst_v
mov ecx, [esp + 8 + 20] // pix
movdqa xmm7, kRGBAToU
movdqa xmm6, kRGBAToV
movdqa xmm5, kAddUV128
sub edi, edx // stride from u to v
align 16
convertloop:
/* step 1 - subsample 16x2 argb pixels to 8x1 */
movdqu xmm0, [eax]
movdqu xmm1, [eax + 16]
movdqu xmm2, [eax + 32]
movdqu xmm3, [eax + 48]
movdqu xmm4, [eax + esi]
pavgb xmm0, xmm4
movdqu xmm4, [eax + esi + 16]
pavgb xmm1, xmm4
movdqu xmm4, [eax + esi + 32]
pavgb xmm2, xmm4
movdqu xmm4, [eax + esi + 48]
pavgb xmm3, xmm4
lea eax, [eax + 64]
movdqa xmm4, xmm0
shufps xmm0, xmm1, 0x88
shufps xmm4, xmm1, 0xdd
pavgb xmm0, xmm4
movdqa xmm4, xmm2
shufps xmm2, xmm3, 0x88
shufps xmm4, xmm3, 0xdd
pavgb xmm2, xmm4
// step 2 - convert to U and V
// from here down is very similar to Y code except
// instead of 16 different pixels, its 8 pixels of U and 8 of V
movdqa xmm1, xmm0
movdqa xmm3, xmm2
pmaddubsw xmm0, xmm7 // U
pmaddubsw xmm2, xmm7
pmaddubsw xmm1, xmm6 // V
pmaddubsw xmm3, xmm6
phaddw xmm0, xmm2
phaddw xmm1, xmm3
psraw xmm0, 8
psraw xmm1, 8
packsswb xmm0, xmm1
paddb xmm0, xmm5 // -> unsigned
// step 3 - store 8 U and 8 V values
sub ecx, 16
movlps qword ptr [edx], xmm0 // U
movhps qword ptr [edx + edi], xmm0 // V
lea edx, [edx + 8]
jg convertloop
pop edi
pop esi
ret
}
}
#endif // HAS_ARGBTOYROW_SSSE3
#ifdef HAS_I422TOARGBROW_SSSE3
......@@ -1847,6 +2064,47 @@ void I422ToBGRARow_SSSE3(const uint8* y_buf,
}
}
__declspec(naked) __declspec(align(16))
void I422ToBGRARow_Unaligned_SSSE3(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* bgra_buf,
int width) {
__asm {
push esi
push edi
mov eax, [esp + 8 + 4] // Y
mov esi, [esp + 8 + 8] // U
mov edi, [esp + 8 + 12] // V
mov edx, [esp + 8 + 16] // bgra
mov ecx, [esp + 8 + 20] // width
sub edi, esi
pxor xmm4, xmm4
align 16
convertloop:
READYUV422
YUVTORGB
// Step 3: Weave into BGRA
pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
punpcklbw xmm1, xmm0 // GB
punpcklbw xmm5, xmm2 // AR
movdqa xmm0, xmm5
punpcklwd xmm5, xmm1 // BGRA first 4 pixels
punpckhwd xmm0, xmm1 // BGRA next 4 pixels
movdqu [edx], xmm5
movdqu [edx + 16], xmm0
lea edx, [edx + 32]
sub ecx, 8
jg convertloop
pop edi
pop esi
ret
}
}
__declspec(naked) __declspec(align(16))
void I422ToABGRRow_SSSE3(const uint8* y_buf,
const uint8* u_buf,
......@@ -1889,10 +2147,10 @@ void I422ToABGRRow_SSSE3(const uint8* y_buf,
}
__declspec(naked) __declspec(align(16))
void I422ToBGRARow_Unaligned_SSSE3(const uint8* y_buf,
void I422ToABGRRow_Unaligned_SSSE3(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* bgra_buf,
uint8* abgr_buf,
int width) {
__asm {
push esi
......@@ -1900,9 +2158,10 @@ void I422ToBGRARow_Unaligned_SSSE3(const uint8* y_buf,
mov eax, [esp + 8 + 4] // Y
mov esi, [esp + 8 + 8] // U
mov edi, [esp + 8 + 12] // V
mov edx, [esp + 8 + 16] // bgra
mov edx, [esp + 8 + 16] // abgr
mov ecx, [esp + 8 + 20] // width
sub edi, esi
pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
pxor xmm4, xmm4
align 16
......@@ -1910,15 +2169,55 @@ void I422ToBGRARow_Unaligned_SSSE3(const uint8* y_buf,
READYUV422
YUVTORGB
// Step 3: Weave into BGRA
// Step 3: Weave into ARGB
punpcklbw xmm2, xmm1 // RG
punpcklbw xmm0, xmm5 // BA
movdqa xmm1, xmm2
punpcklwd xmm2, xmm0 // RGBA first 4 pixels
punpckhwd xmm1, xmm0 // RGBA next 4 pixels
movdqu [edx], xmm2
movdqu [edx + 16], xmm1
lea edx, [edx + 32]
sub ecx, 8
jg convertloop
pop edi
pop esi
ret
}
}
__declspec(naked) __declspec(align(16))
void I422ToRGBARow_SSSE3(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgba_buf,
int width) {
__asm {
push esi
push edi
mov eax, [esp + 8 + 4] // Y
mov esi, [esp + 8 + 8] // U
mov edi, [esp + 8 + 12] // V
mov edx, [esp + 8 + 16] // rgba
mov ecx, [esp + 8 + 20] // width
sub edi, esi
pxor xmm4, xmm4
align 16
convertloop:
READYUV422
YUVTORGB
// Step 3: Weave into RGBA
pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
punpcklbw xmm1, xmm0 // GB
punpcklbw xmm5, xmm2 // AR
punpcklbw xmm1, xmm2 // GR
punpcklbw xmm5, xmm0 // AB
movdqa xmm0, xmm5
punpcklwd xmm5, xmm1 // BGRA first 4 pixels
punpckhwd xmm0, xmm1 // BGRA next 4 pixels
movdqu [edx], xmm5
movdqu [edx + 16], xmm0
punpcklwd xmm5, xmm1 // RGBA first 4 pixels
punpckhwd xmm0, xmm1 // RGBA next 4 pixels
movdqa [edx], xmm5
movdqa [edx + 16], xmm0
lea edx, [edx + 32]
sub ecx, 8
jg convertloop
......@@ -1930,10 +2229,10 @@ void I422ToBGRARow_Unaligned_SSSE3(const uint8* y_buf,
}
__declspec(naked) __declspec(align(16))
void I422ToABGRRow_Unaligned_SSSE3(const uint8* y_buf,
void I422ToRGBARow_Unaligned_SSSE3(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* abgr_buf,
uint8* rgba_buf,
int width) {
__asm {
push esi
......@@ -1941,10 +2240,9 @@ void I422ToABGRRow_Unaligned_SSSE3(const uint8* y_buf,
mov eax, [esp + 8 + 4] // Y
mov esi, [esp + 8 + 8] // U
mov edi, [esp + 8 + 12] // V
mov edx, [esp + 8 + 16] // abgr
mov edx, [esp + 8 + 16] // rgba
mov ecx, [esp + 8 + 20] // width
sub edi, esi
pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
pxor xmm4, xmm4
align 16
......@@ -1952,14 +2250,15 @@ void I422ToABGRRow_Unaligned_SSSE3(const uint8* y_buf,
READYUV422
YUVTORGB
// Step 3: Weave into ARGB
punpcklbw xmm2, xmm1 // RG
punpcklbw xmm0, xmm5 // BA
movdqa xmm1, xmm2
punpcklwd xmm2, xmm0 // RGBA first 4 pixels
punpckhwd xmm1, xmm0 // RGBA next 4 pixels
movdqu [edx], xmm2
movdqu [edx + 16], xmm1
// Step 3: Weave into RGBA
pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
punpcklbw xmm1, xmm2 // GR
punpcklbw xmm5, xmm0 // AB
movdqa xmm0, xmm5
punpcklwd xmm5, xmm1 // RGBA first 4 pixels
punpckhwd xmm0, xmm1 // RGBA next 4 pixels
movdqu [edx], xmm5
movdqu [edx + 16], xmm0
lea edx, [edx + 32]
sub ecx, 8
jg convertloop
......@@ -1969,6 +2268,7 @@ void I422ToABGRRow_Unaligned_SSSE3(const uint8* y_buf,
ret
}
}
#endif // HAS_I422TOARGBROW_SSSE3
#ifdef HAS_YTOARGBROW_SSE2
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment