Commit 510fe70c authored by fbarchard@google.com

Assembly tuned for the RGB565 and related conversion functions. Instructions reordered to pipeline better on Atom/Core 2.

BUG=none
TEST=none
Review URL: https://webrtc-codereview.appspot.com/368002

git-svn-id: http://libyuv.googlecode.com/svn/trunk@146 16f28f9a-4ce2-e073-06de-1de4eb20be90
parent 882ddbd9
Name: libyuv Name: libyuv
URL: http://code.google.com/p/libyuv/ URL: http://code.google.com/p/libyuv/
Version: 145 Version: 146
License: BSD License: BSD
License File: LICENSE License File: LICENSE
......
...@@ -395,7 +395,7 @@ __asm { ...@@ -395,7 +395,7 @@ __asm {
mov eax, [esp + 4] // src_argb mov eax, [esp + 4] // src_argb
mov edx, [esp + 8] // dst_rgb mov edx, [esp + 8] // dst_rgb
mov ecx, [esp + 12] // pix mov ecx, [esp + 12] // pix
movdqa xmm5, kShuffleMaskARGBToRGB24 movdqa xmm6, kShuffleMaskARGBToRGB24
convertloop: convertloop:
movdqa xmm0, [eax] // fetch 16 pixels of argb movdqa xmm0, [eax] // fetch 16 pixels of argb
...@@ -403,23 +403,23 @@ __asm { ...@@ -403,23 +403,23 @@ __asm {
movdqa xmm2, [eax + 32] movdqa xmm2, [eax + 32]
movdqa xmm3, [eax + 48] movdqa xmm3, [eax + 48]
lea eax, [eax + 64] lea eax, [eax + 64]
pshufb xmm0, xmm5 // pack 16 bytes of ARGB to 12 bytes of RGB pshufb xmm0, xmm6 // pack 16 bytes of ARGB to 12 bytes of RGB
pshufb xmm1, xmm5 pshufb xmm1, xmm6
pshufb xmm2, xmm5 pshufb xmm2, xmm6
pshufb xmm3, xmm5 pshufb xmm3, xmm6
movdqa xmm4, xmm1 movdqa xmm4, xmm1 // 4 bytes from 1 for 0
pslldq xmm4, 12 psrldq xmm1, 4 // 8 bytes from 1
por xmm4, xmm0 pslldq xmm4, 12 // 4 bytes from 1 for 0
movdqa [edx], xmm4 // first 16 bytes movdqa xmm5, xmm2 // 8 bytes from 2 for 1
movdqa xmm4, xmm2 por xmm0, xmm4 // 4 bytes from 1 for 0
psrldq xmm1, 4 pslldq xmm5, 8 // 8 bytes from 2 for 1
pslldq xmm4, 8 movdqa [edx], xmm0 // store 0
por xmm1, xmm4 por xmm1, xmm5 // 8 bytes from 2 for 1
movdqa [edx + 16], xmm1 // middle 16 bytes psrldq xmm2, 8 // 4 bytes from 2
psrldq xmm2, 8 pslldq xmm3, 4 // 12 bytes from 3 for 2
pslldq xmm3, 4 por xmm2, xmm3 // 12 bytes from 3 for 2
por xmm2, xmm3 movdqa [edx + 16], xmm1 // store 1
movdqa [edx + 32], xmm2 // last 16 bytes movdqa [edx + 32], xmm2 // store 2
lea edx, [edx + 48] lea edx, [edx + 48]
sub ecx, 16 sub ecx, 16
ja convertloop ja convertloop
...@@ -434,7 +434,7 @@ __asm { ...@@ -434,7 +434,7 @@ __asm {
mov eax, [esp + 4] // src_argb mov eax, [esp + 4] // src_argb
mov edx, [esp + 8] // dst_rgb mov edx, [esp + 8] // dst_rgb
mov ecx, [esp + 12] // pix mov ecx, [esp + 12] // pix
movdqa xmm5, kShuffleMaskARGBToRAW movdqa xmm6, kShuffleMaskARGBToRAW
convertloop: convertloop:
movdqa xmm0, [eax] // fetch 16 pixels of argb movdqa xmm0, [eax] // fetch 16 pixels of argb
...@@ -442,23 +442,23 @@ __asm { ...@@ -442,23 +442,23 @@ __asm {
movdqa xmm2, [eax + 32] movdqa xmm2, [eax + 32]
movdqa xmm3, [eax + 48] movdqa xmm3, [eax + 48]
lea eax, [eax + 64] lea eax, [eax + 64]
pshufb xmm0, xmm5 // pack 16 bytes of ARGB to 12 bytes of RGB pshufb xmm0, xmm6 // pack 16 bytes of ARGB to 12 bytes of RGB
pshufb xmm1, xmm5 pshufb xmm1, xmm6
pshufb xmm2, xmm5 pshufb xmm2, xmm6
pshufb xmm3, xmm5 pshufb xmm3, xmm6
movdqa xmm4, xmm1 movdqa xmm4, xmm1 // 4 bytes from 1 for 0
pslldq xmm4, 12 psrldq xmm1, 4 // 8 bytes from 1
por xmm4, xmm0 pslldq xmm4, 12 // 4 bytes from 1 for 0
movdqa [edx], xmm4 // first 16 bytes movdqa xmm5, xmm2 // 8 bytes from 2 for 1
movdqa xmm4, xmm2 por xmm0, xmm4 // 4 bytes from 1 for 0
psrldq xmm1, 4 pslldq xmm5, 8 // 8 bytes from 2 for 1
pslldq xmm4, 8 movdqa [edx], xmm0 // store 0
por xmm1, xmm4 por xmm1, xmm5 // 8 bytes from 2 for 1
movdqa [edx + 16], xmm1 // middle 16 bytes psrldq xmm2, 8 // 4 bytes from 2
psrldq xmm2, 8 pslldq xmm3, 4 // 12 bytes from 3 for 2
pslldq xmm3, 4 por xmm2, xmm3 // 12 bytes from 3 for 2
por xmm2, xmm3 movdqa [edx + 16], xmm1 // store 1
movdqa [edx + 32], xmm2 // last 16 bytes movdqa [edx + 32], xmm2 // store 2
lea edx, [edx + 48] lea edx, [edx + 48]
sub ecx, 16 sub ecx, 16
ja convertloop ja convertloop
...@@ -467,36 +467,33 @@ __asm { ...@@ -467,36 +467,33 @@ __asm {
} }
// TODO(fbarchard): Port to gcc // TODO(fbarchard): Port to gcc
// TODO(fbarchard): Improve sign extension/packing
__declspec(naked) __declspec(naked)
void ARGBToRGB565Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) { void ARGBToRGB565Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) {
__asm { __asm {
pcmpeqb xmm3, xmm3 // generate mask 0x001f001f
psrlw xmm3, 11
pcmpeqb xmm4, xmm4 // generate mask 0x07e007e0
psrlw xmm4, 10
psllw xmm4, 5
pcmpeqb xmm5, xmm5 // generate mask 0xf800f800
psllw xmm5, 11
mov eax, [esp + 4] // src_argb mov eax, [esp + 4] // src_argb
mov edx, [esp + 8] // dst_rgb mov edx, [esp + 8] // dst_rgb
mov ecx, [esp + 12] // pix mov ecx, [esp + 12] // pix
pcmpeqb xmm3, xmm3 // generate mask 0x0000001f
psrld xmm3, 27
pcmpeqb xmm4, xmm4 // generate mask 0x000007e0
psrld xmm4, 26
pslld xmm4, 5
pcmpeqb xmm5, xmm5 // generate mask 0xfffff800
pslld xmm5, 11
convertloop: convertloop:
movdqa xmm0, [eax] // fetch 4 pixels of argb movdqa xmm0, [eax] // fetch 4 pixels of argb
movdqa xmm1, xmm0 // B movdqa xmm1, xmm0 // B
psrld xmm1, 3
pand xmm1, xmm3
movdqa xmm2, xmm0 // G movdqa xmm2, xmm0 // G
psrld xmm2, 5 pslld xmm0, 8 // R
pand xmm2, xmm4 psrld xmm1, 3 // B
por xmm1, xmm2 psrld xmm2, 5 // G
psrld xmm0, 8 // R psrad xmm0, 16 // R
pand xmm0, xmm5 pand xmm1, xmm3 // B
por xmm0, xmm1 pand xmm2, xmm4 // G
pslld xmm0, 16 pand xmm0, xmm5 // R
psrad xmm0, 16 por xmm1, xmm2 // BG
por xmm0, xmm1 // BGR
packssdw xmm0, xmm0 packssdw xmm0, xmm0
lea eax, [eax + 16] lea eax, [eax + 16]
movq qword ptr [edx], xmm0 // store 4 pixels of ARGB1555 movq qword ptr [edx], xmm0 // store 4 pixels of ARGB1555
...@@ -512,37 +509,34 @@ __asm { ...@@ -512,37 +509,34 @@ __asm {
__declspec(naked) __declspec(naked)
void ARGBToARGB1555Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) { void ARGBToARGB1555Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) {
__asm { __asm {
pcmpeqb xmm3, xmm3 // generate mask 0x001f001f
psrlw xmm3, 11
movdqa xmm4, xmm3 // generate mask 0x03e003e0
psllw xmm4, 5
movdqa xmm5, xmm4 // generate mask 0x7c007c00
psllw xmm5, 5
pcmpeqb xmm6, xmm6 // generate mask 0x80008000
psllw xmm6, 15
mov eax, [esp + 4] // src_argb mov eax, [esp + 4] // src_argb
mov edx, [esp + 8] // dst_rgb mov edx, [esp + 8] // dst_rgb
mov ecx, [esp + 12] // pix mov ecx, [esp + 12] // pix
pcmpeqb xmm4, xmm4 // generate mask 0x0000001f
psrld xmm4, 27
movdqa xmm5, xmm4 // generate mask 0x000003e0
pslld xmm5, 5
movdqa xmm6, xmm4 // generate mask 0x00007c00
pslld xmm6, 10
pcmpeqb xmm7, xmm7 // generate mask 0xffff8000
pslld xmm7, 15
convertloop: convertloop:
movdqa xmm0, [eax] // fetch 4 pixels of argb movdqa xmm0, [eax] // fetch 4 pixels of argb
movdqa xmm1, xmm0 // B movdqa xmm1, xmm0 // B
psrld xmm1, 3
pand xmm1, xmm3
movdqa xmm2, xmm0 // G movdqa xmm2, xmm0 // G
psrld xmm2, 6 movdqa xmm3, xmm0 // R
pand xmm2, xmm4 psrad xmm0, 16 // A
por xmm1, xmm2 psrld xmm1, 3 // B
movdqa xmm2, xmm0 // R psrld xmm2, 6 // G
psrld xmm2, 9 psrld xmm3, 9 // R
pand xmm2, xmm5 pand xmm0, xmm7 // A
por xmm1, xmm2 pand xmm1, xmm4 // B
psrld xmm0, 16 // A pand xmm2, xmm5 // G
pand xmm0, xmm6 pand xmm3, xmm6 // R
por xmm0, xmm1 por xmm0, xmm1 // BA
pslld xmm0, 16 por xmm2, xmm3 // GR
psrad xmm0, 16 por xmm0, xmm2 // BGRA
packssdw xmm0, xmm0 packssdw xmm0, xmm0
lea eax, [eax + 16] lea eax, [eax + 16]
movq qword ptr [edx], xmm0 // store 4 pixels of ARGB1555 movq qword ptr [edx], xmm0 // store 4 pixels of ARGB1555
...@@ -557,15 +551,14 @@ __asm { ...@@ -557,15 +551,14 @@ __asm {
__declspec(naked) __declspec(naked)
void ARGBToARGB4444Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) { void ARGBToARGB4444Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) {
__asm { __asm {
mov eax, [esp + 4] // src_argb
mov edx, [esp + 8] // dst_rgb
mov ecx, [esp + 12] // pix
pcmpeqb xmm4, xmm4 // generate mask 0xf000f000 pcmpeqb xmm4, xmm4 // generate mask 0xf000f000
psllw xmm4, 12 psllw xmm4, 12
movdqa xmm3, xmm4 // generate mask 0x00f000f0 movdqa xmm3, xmm4 // generate mask 0x00f000f0
psrlw xmm3, 8 psrlw xmm3, 8
mov eax, [esp + 4] // src_argb
mov edx, [esp + 8] // dst_rgb
mov ecx, [esp + 12] // pix
convertloop: convertloop:
movdqa xmm0, [eax] // fetch 4 pixels of argb movdqa xmm0, [eax] // fetch 4 pixels of argb
movdqa xmm1, xmm0 movdqa xmm1, xmm0
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment