Commit 2f56d285 authored by fbarchard@google.com

Macro to store ARGB value

BUG=396
TESTED=local windows build
R=harryjin@google.com

Review URL: https://webrtc-codereview.appspot.com/38109004

git-svn-id: http://libyuv.googlecode.com/svn/trunk@1279 16f28f9a-4ce2-e073-06de-1de4eb20be90
parent 5ab38f92
......@@ -1526,6 +1526,17 @@ static YuvConstants SIMD_ALIGNED(kYvuConstants) = {
"packuswb %%xmm1,%%xmm1 \n" \
"packuswb %%xmm2,%%xmm2 \n"
// Store 8 ARGB values (32 bytes) at [dst_argb], then advance dst_argb by 32.
// Inputs: the low 8 bytes of xmm0, xmm1 and xmm2 each hold one packed channel
// for 8 pixels (presumably B, G and R, going by the BG / RA notes on the
// Visual C version of this macro - confirm). xmm5 supplies the byte that is
// interleaved with xmm2, i.e. the fourth (alpha) byte of every stored pixel.
// NOTE(review): this comment used to read "Assumes XMM5 is zero", which would
// store an alpha of 0 in every pixel; callers presumably preload xmm5 with
// the desired alpha bytes instead - confirm against the row functions.
// Overwrites xmm0, xmm1 and xmm2. Must be expanded inside an asm statement
// that declares a [dst_argb] operand.
#define STOREARGB \
"punpcklbw %%xmm1,%%xmm0 \n" /* xmm0 = xmm0/xmm1 bytes interleaved */ \
"punpcklbw %%xmm5,%%xmm2 \n" /* xmm2 = xmm2/xmm5 bytes interleaved */ \
"movdqa %%xmm0,%%xmm1 \n" /* keep a copy for the high-half weave */ \
"punpcklwd %%xmm2,%%xmm0 \n" /* xmm0 = first 4 pixels */ \
"punpckhwd %%xmm2,%%xmm1 \n" /* xmm1 = next 4 pixels */ \
"movdqu %%xmm0," MEMACCESS([dst_argb]) " \n" /* unaligned 16-byte store */ \
"movdqu %%xmm1," MEMACCESS2(0x10,[dst_argb]) " \n" /* second 16 bytes */ \
"lea " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n" /* dst_argb += 32 */
void OMITFP I444ToARGBRow_SSSE3(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
......@@ -1538,14 +1549,7 @@ void OMITFP I444ToARGBRow_SSSE3(const uint8* y_buf,
"1: \n"
READYUV444
YUVTORGB(kYuvConstants)
"punpcklbw %%xmm1,%%xmm0 \n"
"punpcklbw %%xmm5,%%xmm2 \n"
"movdqa %%xmm0,%%xmm1 \n"
"punpcklwd %%xmm2,%%xmm0 \n"
"punpckhwd %%xmm2,%%xmm1 \n"
"movdqu %%xmm0," MEMACCESS([dst_argb]) " \n"
"movdqu %%xmm1," MEMACCESS2(0x10,[dst_argb]) " \n"
"lea " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n"
STOREARGB
"sub $0x8,%[width] \n"
"jg 1b \n"
: [y_buf]"+r"(y_buf), // %[y_buf]
......@@ -1660,14 +1664,7 @@ void OMITFP I422ToARGBRow_SSSE3(const uint8* y_buf,
"1: \n"
READYUV422
YUVTORGB(kYuvConstants)
"punpcklbw %%xmm1,%%xmm0 \n"
"punpcklbw %%xmm5,%%xmm2 \n"
"movdqa %%xmm0,%%xmm1 \n"
"punpcklwd %%xmm2,%%xmm0 \n"
"punpckhwd %%xmm2,%%xmm1 \n"
"movdqu %%xmm0," MEMACCESS([dst_argb]) "\n"
"movdqu %%xmm1," MEMACCESS2(0x10,[dst_argb]) "\n"
"lea " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n"
STOREARGB
"sub $0x8,%[width] \n"
"jg 1b \n"
: [y_buf]"+r"(y_buf), // %[y_buf]
......@@ -1693,14 +1690,7 @@ void OMITFP I411ToARGBRow_SSSE3(const uint8* y_buf,
"1: \n"
READYUV411
YUVTORGB(kYuvConstants)
"punpcklbw %%xmm1,%%xmm0 \n"
"punpcklbw %%xmm5,%%xmm2 \n"
"movdqa %%xmm0,%%xmm1 \n"
"punpcklwd %%xmm2,%%xmm0 \n"
"punpckhwd %%xmm2,%%xmm1 \n"
"movdqu %%xmm0," MEMACCESS([dst_argb]) "\n"
"movdqu %%xmm1," MEMACCESS2(0x10,[dst_argb]) "\n"
"lea " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n"
STOREARGB
"sub $0x8,%[width] \n"
"jg 1b \n"
: [y_buf]"+r"(y_buf), // %[y_buf]
......@@ -1724,14 +1714,7 @@ void OMITFP NV12ToARGBRow_SSSE3(const uint8* y_buf,
"1: \n"
READNV12
YUVTORGB(kYuvConstants)
"punpcklbw %%xmm1,%%xmm0 \n"
"punpcklbw %%xmm5,%%xmm2 \n"
"movdqa %%xmm0,%%xmm1 \n"
"punpcklwd %%xmm2,%%xmm0 \n"
"punpckhwd %%xmm2,%%xmm1 \n"
"movdqu %%xmm0," MEMACCESS([dst_argb]) "\n"
"movdqu %%xmm1," MEMACCESS2(0x10,[dst_argb]) "\n"
"lea " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n"
STOREARGB
"sub $0x8,%[width] \n"
"jg 1b \n"
: [y_buf]"+r"(y_buf), // %[y_buf]
......@@ -1754,14 +1737,7 @@ void OMITFP NV21ToARGBRow_SSSE3(const uint8* y_buf,
"1: \n"
READNV12
YUVTORGB(kYuvConstants)
"punpcklbw %%xmm1,%%xmm0 \n"
"punpcklbw %%xmm5,%%xmm2 \n"
"movdqa %%xmm0,%%xmm1 \n"
"punpcklwd %%xmm2,%%xmm0 \n"
"punpckhwd %%xmm2,%%xmm1 \n"
"movdqu %%xmm0," MEMACCESS([dst_argb]) "\n"
"movdqu %%xmm1," MEMACCESS2(0x10,[dst_argb]) "\n"
"lea " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n"
STOREARGB
"sub $0x8,%[width] \n"
"jg 1b \n"
: [y_buf]"+r"(y_buf), // %[y_buf]
......
......@@ -316,7 +316,7 @@ void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix) {
por xmm3, xmm5
movdqu [edx + 48], xmm3
lea edx, [edx + 64]
sub ecx, 16
sub ecx, 16
jg convertloop
ret
}
......@@ -1772,6 +1772,19 @@ void I422ToABGRRow_AVX2(const uint8* y_buf,
__asm packuswb xmm2, xmm2 /* R */ \
}
// Store 8 ARGB values (32 bytes) at [edx], then advance edx by 32.
// Inputs: the low 8 bytes of xmm0, xmm1 and xmm2 hold the packed B, G and R
// channels for 8 pixels; xmm5 supplies the alpha bytes that are woven in
// next to R (callers must have loaded it beforehand); edx is the destination
// pointer.
// Overwrites xmm0, xmm1, xmm2 and edx. The stores are unaligned (movdqu), so
// the destination needs no particular alignment.
#define STOREARGB __asm { \
/* Step 3: Weave into ARGB */ \
__asm punpcklbw xmm0, xmm1 /* BG */ \
__asm punpcklbw xmm2, xmm5 /* RA */ \
__asm movdqa xmm1, xmm0 /* copy BG for the high-half weave */ \
__asm punpcklwd xmm0, xmm2 /* BGRA first 4 pixels */ \
__asm punpckhwd xmm1, xmm2 /* BGRA next 4 pixels */ \
__asm movdqu [edx], xmm0 /* store pixels 0-3 */ \
__asm movdqu [edx + 16], xmm1 /* store pixels 4-7 */ \
__asm lea edx, [edx + 32] /* advance destination by 8 pixels */ \
}
// 8 pixels.
// 8 UV values, mixed with 8 Y producing 8 ARGB (32 bytes).
__declspec(naked) __declspec(align(16))
......@@ -1794,16 +1807,7 @@ void I444ToARGBRow_SSSE3(const uint8* y_buf,
convertloop:
READYUV444
YUVTORGB(kYuvConstants)
// Step 3: Weave into ARGB
punpcklbw xmm0, xmm1 // BG
punpcklbw xmm2, xmm5 // RA
movdqa xmm1, xmm0
punpcklwd xmm0, xmm2 // ABGR first 4 pixels
punpckhwd xmm1, xmm2 // ABGR next 4 pixels
movdqu [edx], xmm0
movdqu [edx + 16], xmm1
lea edx, [edx + 32]
STOREARGB
sub ecx, 8
jg convertloop
......@@ -1996,16 +2000,7 @@ void I422ToARGBRow_SSSE3(const uint8* y_buf,
convertloop:
READYUV422
YUVTORGB(kYuvConstants)
// Step 3: Weave into ARGB
punpcklbw xmm0, xmm1 // BG
punpcklbw xmm2, xmm5 // RA
movdqa xmm1, xmm0
punpcklwd xmm0, xmm2 // BGRA first 4 pixels
punpckhwd xmm1, xmm2 // BGRA next 4 pixels
movdqu [edx], xmm0
movdqu [edx + 16], xmm1
lea edx, [edx + 32]
STOREARGB
sub ecx, 8
jg convertloop
......@@ -2039,16 +2034,7 @@ void I411ToARGBRow_SSSE3(const uint8* y_buf,
convertloop:
READYUV411 // modifies EBX
YUVTORGB(kYuvConstants)
// Step 3: Weave into ARGB
punpcklbw xmm0, xmm1 // BG
punpcklbw xmm2, xmm5 // RA
movdqa xmm1, xmm0
punpcklwd xmm0, xmm2 // BGRA first 4 pixels
punpckhwd xmm1, xmm2 // BGRA next 4 pixels
movdqu [edx], xmm0
movdqu [edx + 16], xmm1
lea edx, [edx + 32]
STOREARGB
sub ecx, 8
jg convertloop
......@@ -2077,16 +2063,7 @@ void NV12ToARGBRow_SSSE3(const uint8* y_buf,
convertloop:
READNV12
YUVTORGB(kYuvConstants)
// Step 3: Weave into ARGB
punpcklbw xmm0, xmm1 // BG
punpcklbw xmm2, xmm5 // RA
movdqa xmm1, xmm0
punpcklwd xmm0, xmm2 // BGRA first 4 pixels
punpckhwd xmm1, xmm2 // BGRA next 4 pixels
movdqu [edx], xmm0
movdqu [edx + 16], xmm1
lea edx, [edx + 32]
STOREARGB
sub ecx, 8
jg convertloop
......@@ -2113,16 +2090,7 @@ void NV21ToARGBRow_SSSE3(const uint8* y_buf,
convertloop:
READNV12
YUVTORGB(kYvuConstants)
// Step 3: Weave into ARGB
punpcklbw xmm0, xmm1 // BG
punpcklbw xmm2, xmm5 // RA
movdqa xmm1, xmm0
punpcklwd xmm0, xmm2 // BGRA first 4 pixels
punpckhwd xmm1, xmm2 // BGRA next 4 pixels
movdqu [edx], xmm0
movdqu [edx + 16], xmm1
lea edx, [edx + 32]
STOREARGB
sub ecx, 8
jg convertloop
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment