Commit 4c416e88 authored by fbarchard@google.com

Fix for I444ToARGBRow_Unaligned_SSSE3, I422ToARGBRow_Unaligned_SSSE3,…

Fix for I444ToARGBRow_Unaligned_SSSE3, I422ToARGBRow_Unaligned_SSSE3, I411ToARGBRow_Unaligned_SSSE3 on Windows using movdqu instead of movdqa.  break YUVTORGB into 2 macros - one to fetch pixels, another to do YUV conversion.  Less duplicated source and lends itself to future YUV formats.
BUG=none
TEST=WebRtcVideoFrameTest.ConvertToARGBBufferStride
Review URL: https://webrtc-codereview.appspot.com/644004

git-svn-id: http://libyuv.googlecode.com/svn/trunk@279 16f28f9a-4ce2-e073-06de-1de4eb20be90
parent 43279ffd
Name: libyuv
URL: http://code.google.com/p/libyuv/
Version: 277
Version: 279
License: BSD
License File: LICENSE
......
......@@ -11,7 +11,7 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_
#define INCLUDE_LIBYUV_VERSION_H_
#define LIBYUV_VERSION 277
#define LIBYUV_VERSION 279
#endif // INCLUDE_LIBYUV_VERSION_H_
......@@ -1212,7 +1212,6 @@ void ABGRToUVRow_Unaligned_SSSE3(const uint8* src_abgr0, int src_stride_abgr,
#endif
);
}
#endif // HAS_ARGBTOYROW_SSSE3
#ifdef HAS_I422TOARGBROW_SSSE3
......@@ -1251,73 +1250,32 @@ struct {
{ YG, YG, YG, YG, YG, YG, YG, YG }
};
// Convert 8 pixels: 8 UV and 8 Y
#define YUV444TORGB \
// Read 8 UV from 444
#define READYUV444 \
"movq (%1),%%xmm0 \n" \
"movq (%1,%2,1),%%xmm1 \n" \
"lea 0x8(%1),%1 \n" \
"punpcklbw %%xmm1,%%xmm0 \n" \
"movdqa %%xmm0,%%xmm1 \n" \
"movdqa %%xmm0,%%xmm2 \n" \
"pmaddubsw (%5),%%xmm0 \n" \
"pmaddubsw 16(%5),%%xmm1 \n" \
"pmaddubsw 32(%5),%%xmm2 \n" \
"psubw 48(%5),%%xmm0 \n" \
"psubw 64(%5),%%xmm1 \n" \
"psubw 80(%5),%%xmm2 \n" \
"movq (%0),%%xmm3 \n" \
"lea 0x8(%0),%0 \n" \
"punpcklbw %%xmm4,%%xmm3 \n" \
"psubsw 96(%5),%%xmm3 \n" \
"pmullw 112(%5),%%xmm3 \n" \
"paddsw %%xmm3,%%xmm0 \n" \
"paddsw %%xmm3,%%xmm1 \n" \
"paddsw %%xmm3,%%xmm2 \n" \
"psraw $0x6,%%xmm0 \n" \
"psraw $0x6,%%xmm1 \n" \
"psraw $0x6,%%xmm2 \n" \
"packuswb %%xmm0,%%xmm0 \n" \
"packuswb %%xmm1,%%xmm1 \n" \
"packuswb %%xmm2,%%xmm2 \n"
// Convert 8 pixels: 4 UV and 8 Y
#define YUV422TORGB \
// Read 4 UV from 422, upsample to 8 UV
#define READYUV422 \
"movd (%1),%%xmm0 \n" \
"movd (%1,%2,1),%%xmm1 \n" \
"lea 0x4(%1),%1 \n" \
"punpcklbw %%xmm1,%%xmm0 \n" \
"punpcklwd %%xmm0,%%xmm0 \n" \
"movdqa %%xmm0,%%xmm1 \n" \
"movdqa %%xmm0,%%xmm2 \n" \
"pmaddubsw (%5),%%xmm0 \n" \
"pmaddubsw 16(%5),%%xmm1 \n" \
"pmaddubsw 32(%5),%%xmm2 \n" \
"psubw 48(%5),%%xmm0 \n" \
"psubw 64(%5),%%xmm1 \n" \
"psubw 80(%5),%%xmm2 \n" \
"movq (%0),%%xmm3 \n" \
"lea 0x8(%0),%0 \n" \
"punpcklbw %%xmm4,%%xmm3 \n" \
"psubsw 96(%5),%%xmm3 \n" \
"pmullw 112(%5),%%xmm3 \n" \
"paddsw %%xmm3,%%xmm0 \n" \
"paddsw %%xmm3,%%xmm1 \n" \
"paddsw %%xmm3,%%xmm2 \n" \
"psraw $0x6,%%xmm0 \n" \
"psraw $0x6,%%xmm1 \n" \
"psraw $0x6,%%xmm2 \n" \
"packuswb %%xmm0,%%xmm0 \n" \
"packuswb %%xmm1,%%xmm1 \n" \
"packuswb %%xmm2,%%xmm2 \n"
// Convert 8 pixels: 2 UV and 8 Y
#define YUV411TORGB \
// Read 2 UV from 411, upsample to 8 UV
#define READYUV411 \
"movd (%1),%%xmm0 \n" \
"movd (%1,%2,1),%%xmm1 \n" \
"lea 0x2(%1),%1 \n" \
"punpcklbw %%xmm1,%%xmm0 \n" \
"punpcklwd %%xmm0,%%xmm0 \n" \
"punpckldq %%xmm0,%%xmm0 \n" \
// Convert 8 pixels: 8 UV and 8 Y
#define YUVTORGB \
"movdqa %%xmm0,%%xmm1 \n" \
"movdqa %%xmm0,%%xmm2 \n" \
"pmaddubsw (%5),%%xmm0 \n" \
......@@ -1352,7 +1310,8 @@ void OMITFP I444ToARGBRow_SSSE3(const uint8* y_buf,
"pxor %%xmm4,%%xmm4 \n"
".p2align 4 \n"
"1: \n"
YUV444TORGB
READYUV444
YUVTORGB
"punpcklbw %%xmm1,%%xmm0 \n"
"punpcklbw %%xmm5,%%xmm2 \n"
"movdqa %%xmm0,%%xmm1 \n"
......@@ -1387,7 +1346,8 @@ void OMITFP I422ToARGBRow_SSSE3(const uint8* y_buf,
"pxor %%xmm4,%%xmm4 \n"
".p2align 4 \n"
"1: \n"
YUV422TORGB
READYUV422
YUVTORGB
"punpcklbw %%xmm1,%%xmm0 \n"
"punpcklbw %%xmm5,%%xmm2 \n"
"movdqa %%xmm0,%%xmm1 \n"
......@@ -1422,7 +1382,8 @@ void OMITFP I411ToARGBRow_SSSE3(const uint8* y_buf,
"pxor %%xmm4,%%xmm4 \n"
".p2align 4 \n"
"1: \n"
YUV411TORGB
READYUV411
YUVTORGB
"punpcklbw %%xmm1,%%xmm0 \n"
"punpcklbw %%xmm5,%%xmm2 \n"
"movdqa %%xmm0,%%xmm1 \n"
......@@ -1457,7 +1418,8 @@ void OMITFP I444ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
"pxor %%xmm4,%%xmm4 \n"
".p2align 4 \n"
"1: \n"
YUV444TORGB
READYUV444
YUVTORGB
"punpcklbw %%xmm1,%%xmm0 \n"
"punpcklbw %%xmm5,%%xmm2 \n"
"movdqa %%xmm0,%%xmm1 \n"
......@@ -1492,7 +1454,8 @@ void OMITFP I422ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
"pxor %%xmm4,%%xmm4 \n"
".p2align 4 \n"
"1: \n"
YUV422TORGB
READYUV422
YUVTORGB
"punpcklbw %%xmm1,%%xmm0 \n"
"punpcklbw %%xmm5,%%xmm2 \n"
"movdqa %%xmm0,%%xmm1 \n"
......@@ -1527,7 +1490,8 @@ void OMITFP I411ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
"pxor %%xmm4,%%xmm4 \n"
".p2align 4 \n"
"1: \n"
YUV411TORGB
READYUV411
YUVTORGB
"punpcklbw %%xmm1,%%xmm0 \n"
"punpcklbw %%xmm5,%%xmm2 \n"
"movdqa %%xmm0,%%xmm1 \n"
......@@ -1562,7 +1526,8 @@ void OMITFP I422ToBGRARow_SSSE3(const uint8* y_buf,
"pxor %%xmm4,%%xmm4 \n"
".p2align 4 \n"
"1: \n"
YUV422TORGB
READYUV422
YUVTORGB
"pcmpeqb %%xmm5,%%xmm5 \n"
"punpcklbw %%xmm0,%%xmm1 \n"
"punpcklbw %%xmm2,%%xmm5 \n"
......@@ -1598,7 +1563,8 @@ void OMITFP I422ToABGRRow_SSSE3(const uint8* y_buf,
"pxor %%xmm4,%%xmm4 \n"
".p2align 4 \n"
"1: \n"
YUV422TORGB
READYUV422
YUVTORGB
"punpcklbw %%xmm1,%%xmm2 \n"
"punpcklbw %%xmm5,%%xmm0 \n"
"movdqa %%xmm2,%%xmm1 \n"
......@@ -1633,7 +1599,8 @@ void OMITFP I422ToBGRARow_Unaligned_SSSE3(const uint8* y_buf,
"pxor %%xmm4,%%xmm4 \n"
".p2align 4 \n"
"1: \n"
YUV422TORGB
READYUV422
YUVTORGB
"pcmpeqb %%xmm5,%%xmm5 \n"
"punpcklbw %%xmm0,%%xmm1 \n"
"punpcklbw %%xmm2,%%xmm5 \n"
......@@ -1669,7 +1636,8 @@ void OMITFP I422ToABGRRow_Unaligned_SSSE3(const uint8* y_buf,
"pxor %%xmm4,%%xmm4 \n"
".p2align 4 \n"
"1: \n"
YUV422TORGB
READYUV422
YUVTORGB
"punpcklbw %%xmm1,%%xmm2 \n"
"punpcklbw %%xmm5,%%xmm0 \n"
"movdqa %%xmm2,%%xmm1 \n"
......@@ -1741,7 +1709,7 @@ void YToARGBRow_SSE2(const uint8* y_buf,
#endif
);
}
#endif
#endif // HAS_YTOARGBROW_SSE2
#ifdef HAS_MIRRORROW_SSSE3
// Shuffle table for reversing the bytes.
......@@ -1772,7 +1740,7 @@ void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
#endif
);
}
#endif
#endif // HAS_MIRRORROW_SSSE3
#ifdef HAS_MIRRORROW_SSE2
void MirrorRow_SSE2(const uint8* src, uint8* dst, int width) {
......@@ -1803,7 +1771,7 @@ void MirrorRow_SSE2(const uint8* src, uint8* dst, int width) {
#endif
);
}
#endif
#endif // HAS_MIRRORROW_SSE2
#ifdef HAS_MIRRORROW_UV_SSSE3
// Shuffle table for reversing the bytes of UV channels.
......@@ -1838,7 +1806,7 @@ void MirrorRowUV_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v,
#endif
);
}
#endif
#endif // HAS_MIRRORROW_UV_SSSE3
#ifdef HAS_ADDROW_SSE2
// dst and width aligned to 16
......@@ -1939,7 +1907,7 @@ void SplitUV_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {
#endif
);
}
#endif
#endif // HAS_SPLITUV_SSE2
#ifdef HAS_COPYROW_SSE2
void CopyRow_SSE2(const uint8* src, uint8* dst, int count) {
......@@ -1979,7 +1947,7 @@ void CopyRow_X86(const uint8* src, uint8* dst, int width) {
: "memory", "cc"
);
}
#endif
#endif // HAS_COPYROW_X86
#ifdef HAS_YUY2TOYROW_SSE2
void YUY2ToYRow_SSE2(const uint8* src_yuy2, uint8* dst_y, int pix) {
......
......@@ -1199,6 +1199,7 @@ __asm {
ret
}
}
#endif // HAS_ARGBTOYROW_SSSE3
#ifdef HAS_I422TOARGBROW_SSSE3
......@@ -1237,80 +1238,36 @@ static const vec16 kUVBiasR = { BR, BR, BR, BR, BR, BR, BR, BR };
// TODO(fbarchard): NV12/NV21 fetch UV and use directly.
// Convert 8 pixels: 8 UV and 8 Y
#define YUV444TORGB __asm { \
/* Step 1: Find 4 UV contributions to 8 R,G,B values */ \
// Read 8 UV from 444
#define READYUV444 __asm { \
__asm movq xmm0, qword ptr [esi] /* U */ \
__asm movq xmm1, qword ptr [esi + edi] /* V */ \
__asm lea esi, [esi + 8] \
__asm punpcklbw xmm0, xmm1 /* UV */ \
__asm movdqa xmm1, xmm0 \
__asm movdqa xmm2, xmm0 \
__asm pmaddubsw xmm0, kUVToB /* scale B UV */ \
__asm pmaddubsw xmm1, kUVToG /* scale G UV */ \
__asm pmaddubsw xmm2, kUVToR /* scale R UV */ \
__asm psubw xmm0, kUVBiasB /* unbias back to signed */ \
__asm psubw xmm1, kUVBiasG \
__asm psubw xmm2, kUVBiasR \
/* Step 2: Find Y contribution to 8 R,G,B values */ \
__asm movq xmm3, qword ptr [eax] /* NOLINT */ \
__asm lea eax, [eax + 8] \
__asm punpcklbw xmm3, xmm4 \
__asm psubsw xmm3, kYSub16 \
__asm pmullw xmm3, kYToRgb \
__asm paddsw xmm0, xmm3 /* B += Y */ \
__asm paddsw xmm1, xmm3 /* G += Y */ \
__asm paddsw xmm2, xmm3 /* R += Y */ \
__asm psraw xmm0, 6 \
__asm psraw xmm1, 6 \
__asm psraw xmm2, 6 \
__asm packuswb xmm0, xmm0 /* B */ \
__asm packuswb xmm1, xmm1 /* G */ \
__asm packuswb xmm2, xmm2 /* R */ \
}
// Convert 8 pixels: 4 UV and 8 Y
#define YUV422TORGB __asm { \
/* Step 1: Find 4 UV contributions to 8 R,G,B values */ \
// Read 4 UV from 422, upsample to 8 UV
#define READYUV422 __asm { \
__asm movd xmm0, [esi] /* U */ \
__asm movd xmm1, [esi + edi] /* V */ \
__asm lea esi, [esi + 4] \
__asm punpcklbw xmm0, xmm1 /* UV */ \
__asm punpcklwd xmm0, xmm0 /* UVUV (upsample) */ \
__asm movdqa xmm1, xmm0 \
__asm movdqa xmm2, xmm0 \
__asm pmaddubsw xmm0, kUVToB /* scale B UV */ \
__asm pmaddubsw xmm1, kUVToG /* scale G UV */ \
__asm pmaddubsw xmm2, kUVToR /* scale R UV */ \
__asm psubw xmm0, kUVBiasB /* unbias back to signed */ \
__asm psubw xmm1, kUVBiasG \
__asm psubw xmm2, kUVBiasR \
/* Step 2: Find Y contribution to 8 R,G,B values */ \
__asm movq xmm3, qword ptr [eax] /* NOLINT */ \
__asm lea eax, [eax + 8] \
__asm punpcklbw xmm3, xmm4 \
__asm psubsw xmm3, kYSub16 \
__asm pmullw xmm3, kYToRgb \
__asm paddsw xmm0, xmm3 /* B += Y */ \
__asm paddsw xmm1, xmm3 /* G += Y */ \
__asm paddsw xmm2, xmm3 /* R += Y */ \
__asm psraw xmm0, 6 \
__asm psraw xmm1, 6 \
__asm psraw xmm2, 6 \
__asm packuswb xmm0, xmm0 /* B */ \
__asm packuswb xmm1, xmm1 /* G */ \
__asm packuswb xmm2, xmm2 /* R */ \
}
// Convert 8 pixels: 2 UV and 8 Y
#define YUV411TORGB __asm { \
/* Step 1: Find 4 UV contributions to 8 R,G,B values */ \
// Read 2 UV from 411, upsample to 8 UV
#define READYUV411 __asm { \
__asm movd xmm0, [esi] /* U */ \
__asm movd xmm1, [esi + edi] /* V */ \
__asm lea esi, [esi + 2] \
__asm punpcklbw xmm0, xmm1 /* UV */ \
__asm punpcklwd xmm0, xmm0 /* UVUV (upsample) */ \
__asm punpckldq xmm0, xmm0 /* UVUV (upsample) */ \
}
// Convert 8 pixels: 8 UV and 8 Y
#define YUVTORGB __asm { \
/* Step 1: Find 4 UV contributions to 8 R,G,B values */ \
__asm movdqa xmm1, xmm0 \
__asm movdqa xmm2, xmm0 \
__asm pmaddubsw xmm0, kUVToB /* scale B UV */ \
......@@ -1358,7 +1315,8 @@ void I444ToARGBRow_SSSE3(const uint8* y_buf,
align 16
convertloop:
YUV444TORGB
READYUV444
YUVTORGB
// Step 3: Weave into ARGB
punpcklbw xmm0, xmm1 // BG
......@@ -1400,7 +1358,8 @@ void I422ToARGBRow_SSSE3(const uint8* y_buf,
align 16
convertloop:
YUV422TORGB
READYUV422
YUVTORGB
// Step 3: Weave into ARGB
punpcklbw xmm0, xmm1 // BG
......@@ -1443,7 +1402,8 @@ void I411ToARGBRow_SSSE3(const uint8* y_buf,
align 16
convertloop:
YUV411TORGB
READYUV411
YUVTORGB
// Step 3: Weave into ARGB
punpcklbw xmm0, xmm1 // BG
......@@ -1485,7 +1445,8 @@ void I444ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
align 16
convertloop:
YUV444TORGB
READYUV444
YUVTORGB
// Step 3: Weave into ARGB
punpcklbw xmm0, xmm1 // BG
......@@ -1493,8 +1454,8 @@ void I444ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
movdqa xmm1, xmm0
punpcklwd xmm0, xmm2 // BGRA first 4 pixels
punpckhwd xmm1, xmm2 // BGRA next 4 pixels
movdqa [edx], xmm0
movdqa [edx + 16], xmm1
movdqu [edx], xmm0
movdqu [edx + 16], xmm1
lea edx, [edx + 32]
sub ecx, 8
jg convertloop
......@@ -1527,7 +1488,8 @@ void I422ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
align 16
convertloop:
YUV422TORGB
READYUV422
YUVTORGB
// Step 3: Weave into ARGB
punpcklbw xmm0, xmm1 // BG
......@@ -1535,8 +1497,8 @@ void I422ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
movdqa xmm1, xmm0
punpcklwd xmm0, xmm2 // BGRA first 4 pixels
punpckhwd xmm1, xmm2 // BGRA next 4 pixels
movdqa [edx], xmm0
movdqa [edx + 16], xmm1
movdqu [edx], xmm0
movdqu [edx + 16], xmm1
lea edx, [edx + 32]
sub ecx, 8
jg convertloop
......@@ -1570,7 +1532,8 @@ void I411ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
align 16
convertloop:
YUV411TORGB
READYUV411
YUVTORGB
// Step 3: Weave into ARGB
punpcklbw xmm0, xmm1 // BG
......@@ -1578,8 +1541,8 @@ void I411ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
movdqa xmm1, xmm0
punpcklwd xmm0, xmm2 // BGRA first 4 pixels
punpckhwd xmm1, xmm2 // BGRA next 4 pixels
movdqa [edx], xmm0
movdqa [edx + 16], xmm1
movdqu [edx], xmm0
movdqu [edx + 16], xmm1
lea edx, [edx + 32]
sub ecx, 8
jg convertloop
......@@ -1609,7 +1572,8 @@ void I422ToBGRARow_SSSE3(const uint8* y_buf,
align 16
convertloop:
YUV422TORGB
READYUV422
YUVTORGB
// Step 3: Weave into BGRA
pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
......@@ -1650,7 +1614,8 @@ void I422ToABGRRow_SSSE3(const uint8* y_buf,
align 16
convertloop:
YUV422TORGB
READYUV422
YUVTORGB
// Step 3: Weave into ARGB
punpcklbw xmm2, xmm1 // RG
......@@ -1689,7 +1654,8 @@ void I422ToBGRARow_Unaligned_SSSE3(const uint8* y_buf,
align 16
convertloop:
YUV422TORGB
READYUV422
YUVTORGB
// Step 3: Weave into BGRA
pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
......@@ -1730,7 +1696,8 @@ void I422ToABGRRow_Unaligned_SSSE3(const uint8* y_buf,
align 16
convertloop:
YUV422TORGB
READYUV422
YUVTORGB
// Step 3: Weave into ARGB
punpcklbw xmm2, xmm1 // RG
......@@ -1796,7 +1763,6 @@ void YToARGBRow_SSE2(const uint8* y_buf,
}
}
#endif // HAS_YTOARGBROW_SSE2
#endif
#ifdef HAS_MIRRORROW_SSSE3
......@@ -1825,7 +1791,7 @@ __asm {
ret
}
}
#endif
#endif // HAS_MIRRORROW_SSSE3
#ifdef HAS_MIRRORROW_SSE2
// SSE2 version has movdqu so it can be used on unaligned buffers when SSSE3
......@@ -1855,7 +1821,7 @@ __asm {
ret
}
}
#endif
#endif // HAS_MIRRORROW_SSE2
#ifdef HAS_MIRRORROW_UV_SSSE3
// Shuffle table for reversing the bytes of UV channels.
......@@ -1891,7 +1857,7 @@ void MirrorRowUV_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v,
ret
}
}
#endif
#endif // HAS_MIRRORROW_UV_SSSE3
#ifdef HAS_ADDROW_SSE2
// dst and width aligned to 16
......@@ -1988,7 +1954,7 @@ void SplitUV_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {
ret
}
}
#endif
#endif // HAS_SPLITUV_SSE2
#ifdef HAS_COPYROW_SSE2
// CopyRow copys 'count' bytes using a 16 byte load/store, 32 bytes at time
......@@ -2030,7 +1996,7 @@ void CopyRow_X86(const uint8* src, uint8* dst, int count) {
ret
}
}
#endif
#endif // HAS_COPYROW_X86
#ifdef HAS_YUY2TOYROW_SSE2
__declspec(naked) __declspec(align(16))
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment