Commit 0e6ce93c authored by fbarchard@google.com's avatar fbarchard@google.com

Address munge for row reverse, and compute the green mask from the red mask to save one shift.

BUG=none
TEST=none
Review URL: https://webrtc-codereview.appspot.com/363001

git-svn-id: http://libyuv.googlecode.com/svn/trunk@144 16f28f9a-4ce2-e073-06de-1de4eb20be90
parent d1943b39
Name: libyuv
URL: http://code.google.com/p/libyuv/
Version: 143
Version: 144
License: BSD
License File: LICENSE
......
......@@ -655,14 +655,13 @@ void ReverseRow_SSSE3(const uint8* src, uint8* dst, int width) {
intptr_t temp_width = static_cast<intptr_t>(width);
asm volatile (
"movdqa %3,%%xmm5 \n"
"lea -0x10(%0,%2,1),%0 \n"
"lea -0x10(%0),%0 \n"
"1: \n"
"movdqa (%0),%%xmm0 \n"
"lea -0x10(%0),%0 \n"
"movdqa (%0,%2),%%xmm0 \n"
"pshufb %%xmm5,%%xmm0 \n"
"sub $0x10,%2 \n"
"movdqa %%xmm0,(%1) \n"
"lea 0x10(%1),%1 \n"
"sub $0x10,%2 \n"
"ja 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
......@@ -681,10 +680,9 @@ void ReverseRow_SSSE3(const uint8* src, uint8* dst, int width) {
void ReverseRow_SSE2(const uint8* src, uint8* dst, int width) {
intptr_t temp_width = static_cast<intptr_t>(width);
asm volatile (
"lea -0x10(%0,%2,1),%0 \n"
"lea -0x10(%0),%0 \n"
"1: \n"
"movdqa (%0),%%xmm0 \n"
"lea -0x10(%0),%0 \n"
"movdqa (%0,%2),%%xmm0 \n"
"movdqa %%xmm0,%%xmm1 \n"
"psllw $0x8,%%xmm0 \n"
"psrlw $0x8,%%xmm1 \n"
......@@ -692,9 +690,9 @@ void ReverseRow_SSE2(const uint8* src, uint8* dst, int width) {
"pshuflw $0x1b,%%xmm0,%%xmm0 \n"
"pshufhw $0x1b,%%xmm0,%%xmm0 \n"
"pshufd $0x4e,%%xmm0,%%xmm0 \n"
"sub $0x10,%2 \n"
"movdqa %%xmm0,(%1) \n"
"lea 0x10(%1),%1 \n"
"sub $0x10,%2 \n"
"ja 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
......
......@@ -238,120 +238,6 @@ __asm {
}
}
#ifdef SHIFT565
// Below shift/mask code is efficient and works, but more instructions than
// pmul method
// TODO(fbarchard): Port RGB565ToARGBRow_SSE2 to gcc
// 29 instructions
// Shift/mask conversion of RGB565 pixels to 32-bit ARGB, 8 pixels per pass.
// Each pass loads 16 bytes from src_rgb565 and stores 32 bytes to dst_argb
// with movdqa, so both pointers must be 16-byte aligned.
// The loop body runs before the count is tested and the count drops by 8 per
// pass, so at least 8 pixels are always converted and pix is effectively
// rounded up to a multiple of 8.
// Each output pixel is written as the bytes B, G, R, A with A = 0xff.
__declspec(naked)
void OldRGB565ToARGBRow_SSE2(const uint8* src_rgb565, uint8* dst_argb,
                             int pix) {
__asm {
    mov        eax, [esp + 4]   // src_rgb565
    mov        edx, [esp + 8]   // dst_argb
    mov        ecx, [esp + 12]  // pix
    // The alpha mask must hold 0xff in the high byte of EVERY 16-bit lane,
    // because each lane of the AG register below belongs to one pixel.
    // (A per-dword 0xff000000 mask would only set alpha on odd pixels.)
    pcmpeqb    xmm5, xmm5       // generate mask 0xff00ff00 for Alpha
    psllw      xmm5, 8
    pcmpeqb    xmm4, xmm4       // generate mask 0xf800f800 for Red
    psllw      xmm4, 11
    pcmpeqb    xmm6, xmm6       // generate mask 0x001f001f for Blue
    psrlw      xmm6, 11
    pcmpeqb    xmm7, xmm7       // generate mask 0x00fc00fc for Green
    psrlw      xmm7, 10
    psllw      xmm7, 2

 convertloop:
    movdqa     xmm0, [eax]      // fetch 8 pixels of bgr565
    lea        eax, [eax + 16]
    // Red: keep the 5 bits in place, then replicate the top 3 bits below
    // them so the 5-bit value fills the whole high byte.
    movdqa     xmm1, xmm0
    movdqa     xmm2, xmm0
    pand       xmm1, xmm4       // R in upper 5 bits
    psrlw      xmm2, 13         // R 3 bits
    psllw      xmm2, 8
    por        xmm1, xmm2
    // Blue: expand the 5 bits to 8 in the low byte the same way.
    movdqa     xmm2, xmm0
    pand       xmm2, xmm6       // mask B 5 bits
    movdqa     xmm3, xmm2
    psllw      xmm2, 3
    psrlw      xmm3, 2
    por        xmm2, xmm3
    por        xmm1, xmm2       // RB
    // Green: move the 6 bits to the top of the low byte, replicate the top
    // 2 bits below them, then add alpha in the high byte.
    psrlw      xmm0, 3          // G in top 6 bits of lower byte
    pand       xmm0, xmm7       // mask G 6 bits
    movdqa     xmm2, xmm0
    psrlw      xmm2, 6
    por        xmm0, xmm2
    por        xmm0, xmm5       // AG
    // Interleave the RB and AG bytes to produce B, G, R, A per pixel.
    movdqa     xmm2, xmm1
    punpcklbw  xmm1, xmm0
    punpckhbw  xmm2, xmm0
    movdqa     [edx], xmm1      // store 4 pixels of ARGB
    movdqa     [edx + 16], xmm2 // store next 4 pixels of ARGB
    lea        edx, [edx + 32]
    sub        ecx, 8
    ja         convertloop      // continue while the count stays above zero
    ret
  }
}
// TODO(fbarchard): Port ARGB1555ToARGBRow_SSE2 to gcc
// 33 instructions
// Shift/mask conversion of ARGB1555 pixels to 32-bit ARGB, 8 pixels per pass.
// Each pass loads 16 bytes from src_argb1555 and stores 32 bytes to dst_argb
// with movdqa, so both pointers must be 16-byte aligned.
// The loop body runs before the count is tested and the count drops by 8 per
// pass, so at least 8 pixels are always converted and pix is effectively
// rounded up to a multiple of 8.
// Each output pixel is written as the bytes B, G, R, A; A is 0xff when the
// source alpha bit (bit 15) is set and 0x00 otherwise.
__declspec(naked)
void OldARGB1555ToARGBRow_SSE2(const uint8* src_argb1555, uint8* dst_argb,
                               int pix) {
__asm {
    mov        eax, [esp + 4]   // src_argb1555
    mov        edx, [esp + 8]   // dst_argb
    mov        ecx, [esp + 12]  // pix
    pcmpeqb    xmm5, xmm5       // generate mask 0xff00ff00 for Alpha
    psllw      xmm5, 8
    pcmpeqb    xmm4, xmm4       // generate mask 0xf800f800 for Red
    psllw      xmm4, 11
    pcmpeqb    xmm6, xmm6       // generate mask 0x001f001f for Blue
    psrlw      xmm6, 11
    pcmpeqb    xmm7, xmm7       // generate mask 0x00f800f8 for Green
    psrlw      xmm7, 11
    psllw      xmm7, 3

 convertloop:
    movdqa     xmm0, [eax]      // fetch 8 pixels of argb1555
    lea        eax, [eax + 16]
    // Red: shift left by 1 to push the alpha bit out and place the 5 red
    // bits at the top of the word. The replicated low 3 bits must come from
    // this shifted copy; taking them from the unshifted pixel would pull in
    // the alpha bit as the top replicated bit.
    movdqa     xmm1, xmm0
    psllw      xmm1, 1
    movdqa     xmm2, xmm1
    pand       xmm1, xmm4       // R in upper 5 bits
    psrlw      xmm2, 13         // R 3 bits
    psllw      xmm2, 8
    por        xmm1, xmm2
    // Blue: expand the 5 bits to 8 in the low byte.
    movdqa     xmm2, xmm0
    pand       xmm2, xmm6       // mask B 5 bits
    movdqa     xmm3, xmm2
    psllw      xmm2, 3
    psrlw      xmm3, 2
    por        xmm2, xmm3
    por        xmm1, xmm2       // RB
    // Green: move the 5 bits to the top of the low byte and replicate the
    // top 3 bits below them.
    movdqa     xmm2, xmm0
    psrlw      xmm2, 2          // G in top 5 bits of lower byte
    pand       xmm2, xmm7       // mask G 5 bits
    movdqa     xmm3, xmm2
    psrlw      xmm3, 5
    por        xmm2, xmm3
    // Alpha: the arithmetic shift smears bit 15 across the high byte, which
    // the mask then isolates as 0xff or 0x00.
    psraw      xmm0, 8          // A
    pand       xmm0, xmm5
    por        xmm0, xmm2       // AG
    // Interleave the RB and AG bytes to produce B, G, R, A per pixel.
    movdqa     xmm2, xmm1
    punpcklbw  xmm1, xmm0
    punpckhbw  xmm2, xmm0
    movdqa     [edx], xmm1      // store 4 pixels of ARGB
    movdqa     [edx + 16], xmm2 // store next 4 pixels of ARGB
    lea        edx, [edx + 32]
    sub        ecx, 8
    ja         convertloop      // continue while the count stays above zero
    ret
  }
}
#endif
// pmul method to replicate bits
// Math to replicate bits
// (v << 8) | (v << 3)
......@@ -422,8 +308,7 @@ __asm {
pshufd xmm6, xmm6, 0
pcmpeqb xmm3, xmm3 // generate mask 0xf800f800 for Red
psllw xmm3, 11
pcmpeqb xmm4, xmm4 // generate mask 0x03e003e0 for Green
psllw xmm4, 11
movdqa xmm4, xmm3 // generate mask 0x03e003e0 for Green
psrlw xmm4, 6
pcmpeqb xmm7, xmm7 // generate mask 0xff00ff00 for Alpha
psllw xmm7, 8
......@@ -1305,14 +1190,13 @@ __asm {
mov edx, [esp + 8] // dst
mov ecx, [esp + 12] // width
movdqa xmm5, kShuffleReverse
lea eax, [eax + ecx - 16]
convertloop:
movdqa xmm0, [eax]
lea eax, [eax - 16]
convertloop:
movdqa xmm0, [eax + ecx]
pshufb xmm0, xmm5
sub ecx, 16
movdqa [edx], xmm0
lea edx, [edx + 16]
sub ecx, 16
ja convertloop
ret
}
......@@ -1327,10 +1211,9 @@ __asm {
mov eax, [esp + 4] // src
mov edx, [esp + 8] // dst
mov ecx, [esp + 12] // width
lea eax, [eax + ecx - 16]
convertloop:
movdqa xmm0, [eax]
lea eax, [eax - 16]
convertloop:
movdqa xmm0, [eax + ecx]
movdqa xmm1, xmm0 // swap bytes
psllw xmm0, 8
psrlw xmm1, 8
......@@ -1338,9 +1221,9 @@ __asm {
pshuflw xmm0, xmm0, 0x1b // swap words
pshufhw xmm0, xmm0, 0x1b
pshufd xmm0, xmm0, 0x4e // swap qwords
sub ecx, 16
movdqa [edx], xmm0
lea edx, [edx + 16]
sub ecx, 16
ja convertloop
ret
}
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment