Commit ba3aeed3 authored by fbarchard@google.com

gcc port of alpha blend and add align to row_win loops

BUG=none
TEST=none
Review URL: https://webrtc-codereview.appspot.com/439006

git-svn-id: http://libyuv.googlecode.com/svn/trunk@207 16f28f9a-4ce2-e073-06de-1de4eb20be90
parent c6e7e2a8
Name: libyuv
URL: http://code.google.com/p/libyuv/
-Version: 206
+Version: 207
License: BSD
License File: LICENSE
......
@@ -11,7 +11,7 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_
#define INCLUDE_LIBYUV_VERSION_H_
-#define LIBYUV_VERSION 206
+#define LIBYUV_VERSION 207
#endif // INCLUDE_LIBYUV_VERSION_H_
@@ -36,6 +36,7 @@ extern "C" {
// http://www.fourcc.org/yuv.php
// http://v4l2spec.bytesex.org/spec/book1.htm
// http://developer.apple.com/quicktime/icefloe/dispatch020.html
+// http://msdn.microsoft.com/en-us/library/windows/desktop/dd206750(v=vs.85).aspx#nv12
enum FourCC {
// Canonical fourcc codes used in our code.
......
@@ -62,9 +62,6 @@ extern "C" {
#define HAS_UYVYTOYROW_SSE2
#define HAS_YUY2TOUVROW_SSE2
#define HAS_UYVYTOUVROW_SSE2
-#endif
-#if defined(_MSC_VER)
#define HAS_ARGBBLENDROW_SSE2
#endif
......
@@ -1923,6 +1923,106 @@ void UYVYToUVRow_Unaligned_SSE2(const uint8* src_uyvy, int stride_uyvy,
}
#endif // HAS_YUY2TOYROW_SSE2
#ifdef HAS_ARGBBLENDROW_SSE2
void ARGBBlendRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) {
uint32 pixel = 0;
asm volatile (
"pcmpeqb %%xmm4,%%xmm4 \n"
"sub %0,%1 \n"
"mov (%0),%3 \n"
"sub $0x1,%2 \n"
"je 8f \n" // last1
"cmp $0xff000000,%3 \n"
"jae 2f \n" // opaqueloop
"cmp $0xffffff,%3 \n"
"ja 3f \n" // translucientloop
// transparentloop
"1: \n"
"sub $0x1,%2 \n"
"lea 0x4(%0),%0 \n"
"je 8f \n" // last1
"mov (%0),%3 \n"
"cmp $0xffffff,%3 \n"
"jbe 1b \n" // transparentloop
"cmp $0xff000000,%3 \n"
"jb 3f \n" // translucientloop
// opaqueloop
"2: \n"
"mov %3,(%0,%1,1) \n"
"lea 0x4(%0),%0 \n"
"sub $0x1,%2 \n"
"je 8f \n" // last1
"mov (%0),%3 \n"
"cmp $0xff000000,%3 \n"
"jae 2b \n" // opaqueloop
"cmp $0xffffff,%3 \n"
"jbe 1b \n" // transparentloop
"nop \n"
// translucientloop
"3: \n"
"movq (%0),%%xmm0 \n"
"movq (%0,%1,1),%%xmm1 \n"
"punpcklbw %%xmm0,%%xmm0 \n"
"punpcklbw %%xmm1,%%xmm1 \n"
"pshuflw $0xff,%%xmm0,%%xmm2 \n"
"pshufhw $0xff,%%xmm2,%%xmm2 \n"
"movdqa %%xmm2,%%xmm3 \n"
"pxor %%xmm4,%%xmm3 \n"
"pmulhuw %%xmm2,%%xmm0 \n"
"pmulhuw %%xmm3,%%xmm1 \n"
"paddw %%xmm1,%%xmm0 \n"
"psrlw $0x8,%%xmm0 \n"
"packuswb %%xmm0,%%xmm0 \n"
"movq %%xmm0,(%0,%1,1) \n"
"lea 0x8(%0),%0 \n"
"sub $0x2,%2 \n"
"jbe 8f \n" // last1
"mov (%0),%3 \n"
"cmp $0xffffff,%3 \n"
"jbe 1b \n" // transparentloop
"cmp $0xff000000,%3 \n"
"jb 3b \n" // translucientloop
"jmp 2b \n" // opaqueloop
// last1
"8: \n"
"add $0x1,%2 \n"
"je 9f \n" // done
"movd %3,%%xmm0 \n"
"mov (%0,%1,1),%3 \n"
"movd %3,%%xmm1 \n"
"punpcklbw %%xmm0,%%xmm0 \n"
"punpcklbw %%xmm1,%%xmm1 \n"
"pshuflw $0xff,%%xmm0,%%xmm2 \n"
"pshufhw $0xff,%%xmm2,%%xmm2 \n"
"movdqa %%xmm2,%%xmm3 \n"
"pxor %%xmm4,%%xmm3 \n"
"pmulhuw %%xmm2,%%xmm0 \n"
"pmulhuw %%xmm3,%%xmm1 \n"
"paddw %%xmm1,%%xmm0 \n"
"psrlw $0x8,%%xmm0 \n"
"packuswb %%xmm0,%%xmm0 \n"
"movd %%xmm0,%3 \n"
"mov %3,(%0,%1,1) \n"
// done
"9: \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
"+r"(width), // %2
"+r"(pixel) // %3
:
: "memory", "cc"
#if defined(__SSE2__)
, "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
#endif
);
}
#endif // HAS_ARGBBLENDROW_SSE2
#endif // defined(__x86_64__) || defined(__i386__)
#ifdef __cplusplus
......
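Reviewer note: the translucent path in the new GCC ARGBBlendRow_SSE2 above duplicates each byte into a word (punpcklbw x,x), broadcasts the per-pixel alpha with pshuflw/pshufhw, and uses pmulhuw so each channel comes out as roughly dst = (src * a + dst * (255 - a)) / 255, with fast paths that skip fully transparent pixels and copy fully opaque ones. A minimal scalar sketch of that blend for reference only; the helper name and uint8_t types are illustrative and not part of this patch:

#include <stdint.h>

// Scalar reference for the SSE2 row above (illustrative only, not in libyuv).
// Blends one row of ARGB source over an ARGB destination using source alpha:
// per channel, dst = (src * a + dst * (255 - a)) / 255.
static void ARGBBlendRow_C_Sketch(const uint8_t* src_argb, uint8_t* dst_argb,
                                  int width) {
  for (int i = 0; i < width; ++i) {
    uint32_t a = src_argb[3];           // source alpha for this pixel
    for (int c = 0; c < 4; ++c) {       // B, G, R, A channels
      uint32_t f = src_argb[c];         // foreground (source)
      uint32_t b = dst_argb[c];         // background (destination)
      dst_argb[c] = static_cast<uint8_t>((f * a + b * (255 - a)) / 255);
    }
    src_argb += 4;
    dst_argb += 4;
  }
}

The SSE2 version approximates the divide by 255 with a 16-bit high multiply, so results can differ from this sketch by a small rounding amount.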
@@ -108,6 +108,7 @@ void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
pcmpeqb xmm5, xmm5 // generate mask 0xff000000
pslld xmm5, 24
+align 16
convertloop:
movq xmm0, qword ptr [eax]
lea eax, [eax + 8]
@@ -134,6 +135,7 @@ __asm {
mov ecx, [esp + 12] // pix
movdqa xmm5, kShuffleMaskABGRToARGB
+align 16
convertloop:
movdqa xmm0, [eax]
lea eax, [eax + 16]
@@ -154,6 +156,7 @@ __asm {
mov ecx, [esp + 12] // pix
movdqa xmm5, kShuffleMaskBGRAToARGB
+align 16
convertloop:
movdqa xmm0, [eax]
lea eax, [eax + 16]
@@ -176,6 +179,7 @@ __asm {
pslld xmm5, 24
movdqa xmm4, kShuffleMaskRGB24ToARGB
+align 16
convertloop:
movdqu xmm0, [eax]
movdqu xmm1, [eax + 16]
@@ -215,6 +219,7 @@ __asm {
pslld xmm5, 24
movdqa xmm4, kShuffleMaskRAWToARGB
+align 16
convertloop:
movdqu xmm0, [eax]
movdqu xmm1, [eax + 16]
@@ -274,6 +279,7 @@ __asm {
sub edx, eax
sub edx, eax
+align 16
convertloop:
movdqu xmm0, [eax] // fetch 8 pixels of bgr565
movdqa xmm1, xmm0
@@ -323,6 +329,7 @@ __asm {
sub edx, eax
sub edx, eax
+align 16
convertloop:
movdqu xmm0, [eax] // fetch 8 pixels of 1555
movdqa xmm1, xmm0
@@ -368,6 +375,7 @@ __asm {
sub edx, eax
sub edx, eax
+align 16
convertloop:
movdqu xmm0, [eax] // fetch 8 pixels of bgra4444
movdqa xmm2, xmm0
@@ -399,6 +407,7 @@ __asm {
mov ecx, [esp + 12] // pix
movdqa xmm6, kShuffleMaskARGBToRGB24
+align 16
convertloop:
movdqa xmm0, [eax] // fetch 16 pixels of argb
movdqa xmm1, [eax + 16]
@@ -437,6 +446,7 @@ __asm {
mov ecx, [esp + 12] // pix
movdqa xmm6, kShuffleMaskARGBToRAW
+align 16
convertloop:
movdqa xmm0, [eax] // fetch 16 pixels of argb
movdqa xmm1, [eax + 16]
@@ -482,6 +492,7 @@ __asm {
pcmpeqb xmm5, xmm5 // generate mask 0xfffff800
pslld xmm5, 11
+align 16
convertloop:
movdqa xmm0, [eax] // fetch 4 pixels of argb
movdqa xmm1, xmm0 // B
@@ -521,6 +532,7 @@ __asm {
pcmpeqb xmm7, xmm7 // generate mask 0xffff8000
pslld xmm7, 15
+align 16
convertloop:
movdqa xmm0, [eax] // fetch 4 pixels of argb
movdqa xmm1, xmm0 // B
@@ -558,6 +570,7 @@ __asm {
movdqa xmm3, xmm4 // generate mask 0x00f000f0
psrlw xmm3, 8
+align 16
convertloop:
movdqa xmm0, [eax] // fetch 4 pixels of argb
movdqa xmm1, xmm0
@@ -586,6 +599,7 @@ __asm {
movdqa xmm5, kAddY16
movdqa xmm4, kARGBToY
+align 16
convertloop:
movdqa xmm0, [eax]
movdqa xmm1, [eax + 16]
@@ -619,6 +633,7 @@ __asm {
movdqa xmm5, kAddY16
movdqa xmm4, kARGBToY
+align 16
convertloop:
movdqu xmm0, [eax]
movdqu xmm1, [eax + 16]
@@ -652,6 +667,7 @@ __asm {
movdqa xmm5, kAddY16
movdqa xmm4, kBGRAToY
+align 16
convertloop:
movdqa xmm0, [eax]
movdqa xmm1, [eax + 16]
@@ -685,6 +701,7 @@ __asm {
movdqa xmm5, kAddY16
movdqa xmm4, kBGRAToY
+align 16
convertloop:
movdqu xmm0, [eax]
movdqu xmm1, [eax + 16]
@@ -718,6 +735,7 @@ __asm {
movdqa xmm5, kAddY16
movdqa xmm4, kABGRToY
+align 16
convertloop:
movdqa xmm0, [eax]
movdqa xmm1, [eax + 16]
@@ -751,6 +769,7 @@ __asm {
movdqa xmm5, kAddY16
movdqa xmm4, kABGRToY
+align 16
convertloop:
movdqu xmm0, [eax]
movdqu xmm1, [eax + 16]
@@ -791,6 +810,7 @@ __asm {
movdqa xmm5, kAddUV128
sub edi, edx // stride from u to v
+align 16
convertloop:
/* step 1 - subsample 16x2 argb pixels to 8x1 */
movdqa xmm0, [eax]
@@ -839,7 +859,6 @@ __asm {
}
}
__declspec(naked)
void ARGBToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int width) {
@@ -856,6 +875,7 @@ __asm {
movdqa xmm5, kAddUV128
sub edi, edx // stride from u to v
+align 16
convertloop:
/* step 1 - subsample 16x2 argb pixels to 8x1 */
movdqu xmm0, [eax]
@@ -924,6 +944,7 @@ __asm {
movdqa xmm5, kAddUV128
sub edi, edx // stride from u to v
+align 16
convertloop:
/* step 1 - subsample 16x2 argb pixels to 8x1 */
movdqa xmm0, [eax]
@@ -988,6 +1009,7 @@ __asm {
movdqa xmm5, kAddUV128
sub edi, edx // stride from u to v
+align 16
convertloop:
/* step 1 - subsample 16x2 argb pixels to 8x1 */
movdqu xmm0, [eax]
@@ -1056,6 +1078,7 @@ __asm {
movdqa xmm5, kAddUV128
sub edi, edx // stride from u to v
+align 16
convertloop:
/* step 1 - subsample 16x2 argb pixels to 8x1 */
movdqa xmm0, [eax]
@@ -1104,7 +1127,6 @@ __asm {
}
}
__declspec(naked)
void ABGRToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int width) {
@@ -1121,6 +1143,7 @@ __asm {
movdqa xmm5, kAddUV128
sub edi, edx // stride from u to v
+align 16
convertloop:
/* step 1 - subsample 16x2 argb pixels to 8x1 */
movdqu xmm0, [eax]
@@ -1258,6 +1281,7 @@ void I420ToARGBRow_SSSE3(const uint8* y_buf,
pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
pxor xmm4, xmm4
+align 16
convertloop:
YUVTORGB
@@ -1297,6 +1321,7 @@ void I420ToBGRARow_SSSE3(const uint8* y_buf,
sub edi, esi
pxor xmm4, xmm4
+align 16
convertloop:
YUVTORGB
@@ -1338,6 +1363,7 @@ void I420ToABGRRow_SSSE3(const uint8* y_buf,
pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
pxor xmm4, xmm4
+align 16
convertloop:
YUVTORGB
@@ -1378,6 +1404,7 @@ void I444ToARGBRow_SSSE3(const uint8* y_buf,
pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
pxor xmm4, xmm4
+align 16
convertloop:
// Step 1: Find 4 UV contributions to 4 R,G,B values
movd xmm0, [esi] // U
@@ -1444,6 +1471,7 @@ void YToARGBRow_SSE2(const uint8* y_buf,
mov edx, [esp + 8] // rgb
mov ecx, [esp + 12] // width
+align 16
convertloop:
// Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164
movq xmm0, qword ptr [eax]
@@ -1488,6 +1516,8 @@ __asm {
mov ecx, [esp + 12] // width
movdqa xmm5, kShuffleMirror
lea eax, [eax - 16]
+align 16
convertloop:
movdqa xmm0, [eax + ecx]
pshufb xmm0, xmm5
@@ -1510,6 +1540,8 @@ __asm {
mov edx, [esp + 8] // dst
mov ecx, [esp + 12] // width
lea eax, [eax - 16]
+align 16
convertloop:
movdqu xmm0, [eax + ecx]
movdqa xmm1, xmm0 // swap bytes
@@ -1547,6 +1579,7 @@ void MirrorRowUV_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v,
lea eax, [eax + ecx * 2 - 16]
sub edi, edx
+align 16
convertloop:
movdqa xmm0, [eax]
lea eax, [eax - 16]
@@ -1576,6 +1609,7 @@ void SplitUV_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {
psrlw xmm5, 8
sub edi, edx
+align 16
convertloop:
movdqa xmm0, [eax]
movdqa xmm1, [eax + 16]
@@ -1608,6 +1642,8 @@ void CopyRow_SSE2(const uint8* src, uint8* dst, int count) {
mov edx, [esp + 8] // dst
mov ecx, [esp + 12] // count
sub edx, eax
+align 16
convertloop:
movdqa xmm0, [eax]
movdqa xmm1, [eax + 16]
@@ -1650,6 +1686,7 @@ void YUY2ToYRow_SSE2(const uint8* src_yuy2,
pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
psrlw xmm5, 8
+align 16
convertloop:
movdqa xmm0, [eax]
movdqa xmm1, [eax + 16]
@@ -1680,6 +1717,7 @@ void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2,
psrlw xmm5, 8
sub edi, edx
+align 16
convertloop:
movdqa xmm0, [eax]
movdqa xmm1, [eax + 16]
@@ -1718,6 +1756,7 @@ void YUY2ToYRow_Unaligned_SSE2(const uint8* src_yuy2,
pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
psrlw xmm5, 8
+align 16
convertloop:
movdqu xmm0, [eax]
movdqu xmm1, [eax + 16]
@@ -1748,6 +1787,7 @@ void YUY2ToUVRow_Unaligned_SSE2(const uint8* src_yuy2, int stride_yuy2,
psrlw xmm5, 8
sub edi, edx
+align 16
convertloop:
movdqu xmm0, [eax]
movdqu xmm1, [eax + 16]
@@ -1784,6 +1824,7 @@ void UYVYToYRow_SSE2(const uint8* src_uyvy,
mov edx, [esp + 8] // dst_y
mov ecx, [esp + 12] // pix
+align 16
convertloop:
movdqa xmm0, [eax]
movdqa xmm1, [eax + 16]
@@ -1814,6 +1855,7 @@ void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy,
psrlw xmm5, 8
sub edi, edx
+align 16
convertloop:
movdqa xmm0, [eax]
movdqa xmm1, [eax + 16]
@@ -1850,6 +1892,7 @@ void UYVYToYRow_Unaligned_SSE2(const uint8* src_uyvy,
mov edx, [esp + 8] // dst_y
mov ecx, [esp + 12] // pix
+align 16
convertloop:
movdqu xmm0, [eax]
movdqu xmm1, [eax + 16]
@@ -1880,6 +1923,7 @@ void UYVYToUVRow_Unaligned_SSE2(const uint8* src_uyvy, int stride_uyvy,
psrlw xmm5, 8
sub edi, edx
+align 16
convertloop:
movdqu xmm0, [eax]
movdqu xmm1, [eax + 16]
@@ -1912,7 +1956,7 @@ void UYVYToUVRow_Unaligned_SSE2(const uint8* src_uyvy, int stride_uyvy,
#ifdef HAS_ARGBBLENDROW_SSE2
// TODO(fbarchard): Single multiply method b+a(f-b)
// TODO(fbarchard): Unroll and pair
-// TODO(fbarchard): Port to gcc
+// TODO(fbarchard): branch hints __emit 0x3E taken, 0x2E not taken
__declspec(naked)
void ARGBBlendRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) {
__asm {
@@ -1922,7 +1966,6 @@ void ARGBBlendRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) {
mov ecx, [esp + 4 + 12] // width
pcmpeqb xmm4, xmm4 // generate 0xffffffff do negative alpha
sub edx, esi
mov eax, [esi] // get first pixel
sub ecx, 1 // ensure there are at least 2 pixels
je last1 // last pixel?
@@ -1954,7 +1997,7 @@ void ARGBBlendRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) {
cmp eax, 0x00FFFFFF // transparent?
jbe transparentloop
-align 4
+align 16
translucientloop:
movq xmm0, qword ptr [esi] // fetch 2 pixels
movq xmm1, qword ptr [esi + edx]
@@ -1980,7 +2023,7 @@ void ARGBBlendRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) {
jb translucientloop
jmp opaqueloop
-align 4
+align 16
last1:
add ecx, 1
je done
......
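Reviewer note: the TODO kept above, "Single multiply method b+a(f-b)", refers to the algebraic rewrite f*a + b*(255-a) = 255*b + a*(f-b), which needs only one multiply per channel instead of two. A hedged scalar sketch of that variant, with illustrative names that are not part of this change:

#include <stdint.h>

// Single-multiply blend per channel: result = b + a * (f - b) / 255, where
// f is the foreground (source) channel, b the background (destination)
// channel, and a the source alpha. Sketch only; the committed SSE2 code
// still uses the two-multiply form f * a + b * (255 - a).
static inline uint8_t BlendChannelSingleMultiply(uint8_t f, uint8_t b,
                                                 uint8_t a) {
  int diff = static_cast<int>(f) - static_cast<int>(b);  // may be negative
  return static_cast<uint8_t>(b + (a * diff) / 255);
}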
@@ -1256,6 +1256,8 @@ static void ScaleRowDown38_2_Int_SSSE3(const uint8* src_ptr, int src_stride,
#define HAS_SCALEADDROWS_SSE2
// Reads 16xN bytes and produces 16 shorts at a time.
+// TODO(fbarchard): support 1 rows
+// TODO(fbarchard): align loops
__declspec(naked)
static void ScaleAddRows_SSE2(const uint8* src_ptr, int src_stride,
uint16* dst_ptr, int src_width,
@@ -1699,7 +1701,6 @@ static void ScaleAddRows_SSE2(const uint8* src_ptr, int src_stride,
uint16* dst_ptr, int src_width, int src_height) {
int tmp_height = 0;
intptr_t tmp_src = 0;
-intptr_t tmp_src_stride = static_cast<intptr_t>(src_stride);
asm volatile (
"pxor %%xmm4,%%xmm4 \n"
"sub $0x1,%5 \n"
@@ -1731,9 +1732,9 @@ static void ScaleAddRows_SSE2(const uint8* src_ptr, int src_stride,
"+r"(dst_ptr), // %1
"+r"(tmp_height), // %2
"+r"(tmp_src), // %3
-"+rm"(src_width), // %4
+"+r"(src_width), // %4
"+rm"(src_height) // %5
-: "rm"(tmp_src_stride) // %6
+: "rm"(static_cast<intptr_t>(src_stride)) // %6
: "memory", "cc"
#if defined(__SSE2__)
, "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
......
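Reviewer note: the ScaleAddRows_SSE2 hunk above tightens the extended-asm operand constraints: src_width moves from "+rm" to "+r" because the assembly uses it directly as a register, and the separate tmp_src_stride local is replaced by casting src_stride inline in the input list. A stripped-down sketch of the same constraint pattern (GCC/Clang on x86 or x86-64 only; hypothetical helper, not the libyuv routine):

#include <stdint.h>

// Minimal illustration of the constraint style used above. "+r" keeps each
// operand in a register, which the asm body needs for address arithmetic and
// for the sub/jg loop counter; "+rm" would allow a memory operand, which
// those uses cannot accept. Assumes width > 0.
static void AddRowSketch(const uint8_t* src, uint16_t* dst, int width) {
  asm volatile (
    "1:                        \n"
    "movzbl (%0),%%eax         \n"  // load one source byte, zero-extended
    "addw   %%ax,(%1)          \n"  // accumulate into the 16-bit destination
    "lea    0x1(%0),%0         \n"  // advance source pointer
    "lea    0x2(%1),%1         \n"  // advance destination pointer
    "sub    $0x1,%2            \n"  // decrement width
    "jg     1b                 \n"
  : "+r"(src),   // %0: pointer used in addressing, must be a register
    "+r"(dst),   // %1
    "+r"(width)  // %2: loop counter, register-only like the patched src_width
  :
  : "memory", "cc", "eax"
  );
}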
@@ -16,7 +16,7 @@ namespace libyuv {
extern "C" {
#endif
-#define ARRAY_SIZE(x) (static_cast<int>((sizeof(x)/sizeof(x[0]))))
+#define ARRAY_SIZE(x) (static_cast<int>((sizeof(x) / sizeof(x[0]))))
struct FourCCAliasEntry {
uint32 alias;
......