Commit 24d2656b authored by fbarchard@google.com's avatar fbarchard@google.com

ARGBToRGB24 and ARGBToRAW optimized

BUG=none
TEST=media_unittest
Review URL: https://webrtc-codereview.appspot.com/348013

git-svn-id: http://libyuv.googlecode.com/svn/trunk@140 16f28f9a-4ce2-e073-06de-1de4eb20be90
parent 8af21a57
Name: libyuv Name: libyuv
URL: http://code.google.com/p/libyuv/ URL: http://code.google.com/p/libyuv/
Version: 137 Version: 140
License: BSD License: BSD
License File: LICENSE License File: LICENSE
......
...@@ -1653,7 +1653,7 @@ int I420ToRGB24(const uint8* src_y, int src_stride_y, ...@@ -1653,7 +1653,7 @@ int I420ToRGB24(const uint8* src_y, int src_stride_y,
SIMD_ALIGNED(uint8 row[kMaxStride]); SIMD_ALIGNED(uint8 row[kMaxStride]);
void (*ARGBToRGB24Row)(const uint8* src_argb, uint8* dst_rgb, int pix); void (*ARGBToRGB24Row)(const uint8* src_argb, uint8* dst_rgb, int pix);
#if defined(HAS_ARGBTORGB24ROW_SSSE3_DISABLED) #if defined(HAS_ARGBTORGB24ROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3) && if (TestCpuFlag(kCpuHasSSSE3) &&
IS_ALIGNED(width, 16) && IS_ALIGNED(width, 16) &&
IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
...@@ -1709,7 +1709,7 @@ int I420ToRAW(const uint8* src_y, int src_stride_y, ...@@ -1709,7 +1709,7 @@ int I420ToRAW(const uint8* src_y, int src_stride_y,
SIMD_ALIGNED(uint8 row[kMaxStride]); SIMD_ALIGNED(uint8 row[kMaxStride]);
void (*ARGBToRAWRow)(const uint8* src_argb, uint8* dst_rgb, int pix); void (*ARGBToRAWRow)(const uint8* src_argb, uint8* dst_rgb, int pix);
#if defined(HAS_ARGBTORAWROW_SSSE3_DISABLED) #if defined(HAS_ARGBTORAWROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3) && if (TestCpuFlag(kCpuHasSSSE3) &&
IS_ALIGNED(width, 16) && IS_ALIGNED(width, 16) &&
IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
...@@ -1765,7 +1765,7 @@ int I420ToRGB565(const uint8* src_y, int src_stride_y, ...@@ -1765,7 +1765,7 @@ int I420ToRGB565(const uint8* src_y, int src_stride_y,
SIMD_ALIGNED(uint8 row[kMaxStride]); SIMD_ALIGNED(uint8 row[kMaxStride]);
void (*ARGBToRGB565Row)(const uint8* src_argb, uint8* dst_rgb, int pix); void (*ARGBToRGB565Row)(const uint8* src_argb, uint8* dst_rgb, int pix);
#if defined(HAS_ARGBTORGB565ROW_SSE2_DISABLED) #if defined(HAS_ARGBTORGB565ROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2) && if (TestCpuFlag(kCpuHasSSE2) &&
IS_ALIGNED(width, 16) && IS_ALIGNED(width, 16) &&
IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
...@@ -1821,7 +1821,7 @@ int I420ToARGB1555(const uint8* src_y, int src_stride_y, ...@@ -1821,7 +1821,7 @@ int I420ToARGB1555(const uint8* src_y, int src_stride_y,
SIMD_ALIGNED(uint8 row[kMaxStride]); SIMD_ALIGNED(uint8 row[kMaxStride]);
void (*ARGBToARGB1555Row)(const uint8* src_argb, uint8* dst_rgb, int pix); void (*ARGBToARGB1555Row)(const uint8* src_argb, uint8* dst_rgb, int pix);
#if defined(HAS_ARGBTOARGB1555ROW_SSE2_DISABLED) #if defined(HAS_ARGBTOARGB1555ROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2) && if (TestCpuFlag(kCpuHasSSE2) &&
IS_ALIGNED(width, 16) && IS_ALIGNED(width, 16) &&
IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
...@@ -2195,7 +2195,7 @@ int ARGBToRGB24(const uint8* src_argb, int src_stride_argb, ...@@ -2195,7 +2195,7 @@ int ARGBToRGB24(const uint8* src_argb, int src_stride_argb,
src_stride_argb = -src_stride_argb; src_stride_argb = -src_stride_argb;
} }
void (*ARGBToRGB24Row)(const uint8* src_argb, uint8* dst_rgb, int pix); void (*ARGBToRGB24Row)(const uint8* src_argb, uint8* dst_rgb, int pix);
#if defined(HAS_ARGBTORGB24ROW_SSSE3_DISABLED) #if defined(HAS_ARGBTORGB24ROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3) && if (TestCpuFlag(kCpuHasSSSE3) &&
IS_ALIGNED(width, 16) && IS_ALIGNED(width, 16) &&
IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) && IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) &&
...@@ -2225,7 +2225,7 @@ int ARGBToRAW(const uint8* src_argb, int src_stride_argb, ...@@ -2225,7 +2225,7 @@ int ARGBToRAW(const uint8* src_argb, int src_stride_argb,
src_stride_argb = -src_stride_argb; src_stride_argb = -src_stride_argb;
} }
void (*ARGBToRAWRow)(const uint8* src_argb, uint8* dst_rgb, int pix); void (*ARGBToRAWRow)(const uint8* src_argb, uint8* dst_rgb, int pix);
#if defined(HAS_ARGBTORAWROW_SSSE3_DISABLED) #if defined(HAS_ARGBTORAWROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3) && if (TestCpuFlag(kCpuHasSSSE3) &&
IS_ALIGNED(width, 16) && IS_ALIGNED(width, 16) &&
IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) && IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) &&
...@@ -2276,10 +2276,10 @@ int NV12ToRGB565(const uint8* src_y, int src_stride_y, ...@@ -2276,10 +2276,10 @@ int NV12ToRGB565(const uint8* src_y, int src_stride_y,
SIMD_ALIGNED(uint8 row[kMaxStride]); SIMD_ALIGNED(uint8 row[kMaxStride]);
void (*ARGBToRGB565Row)(const uint8* src_argb, uint8* dst_rgb, int pix); void (*ARGBToRGB565Row)(const uint8* src_argb, uint8* dst_rgb, int pix);
#if defined(HAS_ARGBTORGB565ROW_SSE2_DISABLED) #if defined(HAS_ARGBTORGB565ROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2) && if (TestCpuFlag(kCpuHasSSE2) &&
IS_ALIGNED(width, 16) && IS_ALIGNED(width, 16) &&
IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { IS_ALIGNED(dst_rgb, 16) && IS_ALIGNED(dst_stride_rgb, 16)) {
ARGBToRGB565Row = ARGBToRGB565Row_SSE2; ARGBToRGB565Row = ARGBToRGB565Row_SSE2;
} else } else
#endif #endif
......
...@@ -523,16 +523,16 @@ __asm { ...@@ -523,16 +523,16 @@ __asm {
pshufb xmm2, xmm5 pshufb xmm2, xmm5
pshufb xmm3, xmm5 pshufb xmm3, xmm5
movdqa xmm4, xmm1 movdqa xmm4, xmm1
psllq xmm4, 12 pslldq xmm4, 12
por xmm4, xmm0 por xmm4, xmm0
movdqa [edx], xmm4 // first 16 bytes movdqa [edx], xmm4 // first 16 bytes
movdqa xmm4, xmm2 movdqa xmm4, xmm2
psrlq xmm1, 4 psrldq xmm1, 4
psllq xmm4, 8 pslldq xmm4, 8
por xmm1, xmm4 por xmm1, xmm4
movdqa [edx + 16], xmm1 // middle 16 bytes movdqa [edx + 16], xmm1 // middle 16 bytes
psrlq xmm2, 8 psrldq xmm2, 8
psllq xmm3, 4 pslldq xmm3, 4
por xmm2, xmm3 por xmm2, xmm3
movdqa [edx + 32], xmm2 // last 16 bytes movdqa [edx + 32], xmm2 // last 16 bytes
lea edx, [edx + 48] lea edx, [edx + 48]
...@@ -562,16 +562,16 @@ __asm { ...@@ -562,16 +562,16 @@ __asm {
pshufb xmm2, xmm5 pshufb xmm2, xmm5
pshufb xmm3, xmm5 pshufb xmm3, xmm5
movdqa xmm4, xmm1 movdqa xmm4, xmm1
psllq xmm4, 12 pslldq xmm4, 12
por xmm4, xmm0 por xmm4, xmm0
movdqa [edx], xmm4 // first 16 bytes movdqa [edx], xmm4 // first 16 bytes
movdqa xmm4, xmm2 movdqa xmm4, xmm2
psrlq xmm1, 4 psrldq xmm1, 4
psllq xmm4, 8 pslldq xmm4, 8
por xmm1, xmm4 por xmm1, xmm4
movdqa [edx + 16], xmm1 // middle 16 bytes movdqa [edx + 16], xmm1 // middle 16 bytes
psrlq xmm2, 8 psrldq xmm2, 8
psllq xmm3, 4 pslldq xmm3, 4
por xmm2, xmm3 por xmm2, xmm3
movdqa [edx + 32], xmm2 // last 16 bytes movdqa [edx + 32], xmm2 // last 16 bytes
lea edx, [edx + 48] lea edx, [edx + 48]
...@@ -582,6 +582,7 @@ __asm { ...@@ -582,6 +582,7 @@ __asm {
} }
// TODO(fbarchard): Port to gcc // TODO(fbarchard): Port to gcc
// TODO(fbarchard): Improve sign extension/packing
__declspec(naked) __declspec(naked)
void ARGBToRGB565Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) { void ARGBToRGB565Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) {
__asm { __asm {
...@@ -591,7 +592,7 @@ __asm { ...@@ -591,7 +592,7 @@ __asm {
psrlw xmm4, 10 psrlw xmm4, 10
psllw xmm4, 5 psllw xmm4, 5
pcmpeqb xmm5, xmm5 // generate mask 0xf800f800 pcmpeqb xmm5, xmm5 // generate mask 0xf800f800
psrlw xmm5, 11 psllw xmm5, 11
mov eax, [esp + 4] // src_argb mov eax, [esp + 4] // src_argb
mov edx, [esp + 8] // dst_rgb mov edx, [esp + 8] // dst_rgb
...@@ -599,20 +600,20 @@ __asm { ...@@ -599,20 +600,20 @@ __asm {
convertloop: convertloop:
movdqa xmm0, [eax] // fetch 4 pixels of argb movdqa xmm0, [eax] // fetch 4 pixels of argb
lea eax, [eax + 16]
movdqa xmm1, xmm0 // B movdqa xmm1, xmm0 // B
psrlw xmm1, 3 psrld xmm1, 3
pand xmm1, xmm3 pand xmm1, xmm3
movdqa xmm2, xmm0 // G movdqa xmm2, xmm0 // G
psrlw xmm2, 5 psrld xmm2, 5
pand xmm2, xmm4 pand xmm2, xmm4
por xmm1, xmm2 por xmm1, xmm2
psrlw xmm0, 8 // R psrld xmm0, 8 // R
pand xmm0, xmm5 pand xmm0, xmm5
por xmm0, xmm1 por xmm0, xmm1
pslld xmm0, 16 pslld xmm0, 16
psrad xmm0, 16 psrad xmm0, 16
packssdw xmm0, xmm0 packssdw xmm0, xmm0
lea eax, [eax + 16]
movq qword ptr [edx], xmm0 // store 4 pixels of ARGB1555 movq qword ptr [edx], xmm0 // store 4 pixels of ARGB1555
lea edx, [edx + 8] lea edx, [edx + 8]
sub ecx, 4 sub ecx, 4
...@@ -622,6 +623,7 @@ __asm { ...@@ -622,6 +623,7 @@ __asm {
} }
// TODO(fbarchard): Port to gcc // TODO(fbarchard): Port to gcc
// TODO(fbarchard): Improve sign extension/packing
__declspec(naked) __declspec(naked)
void ARGBToARGB1555Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) { void ARGBToARGB1555Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) {
__asm { __asm {
...@@ -629,10 +631,10 @@ __asm { ...@@ -629,10 +631,10 @@ __asm {
psrlw xmm3, 11 psrlw xmm3, 11
movdqa xmm4, xmm3 // generate mask 0x03e003e0 movdqa xmm4, xmm3 // generate mask 0x03e003e0
psllw xmm4, 5 psllw xmm4, 5
movdqa xmm5, xmm3 // generate mask 0x7c007c00 movdqa xmm5, xmm4 // generate mask 0x7c007c00
psllw xmm5, 10 psllw xmm5, 5
pcmpeqb xmm6, xmm6 // generate mask 0x80008000 pcmpeqb xmm6, xmm6 // generate mask 0x80008000
psrlw xmm6, 15 psllw xmm6, 15
mov eax, [esp + 4] // src_argb mov eax, [esp + 4] // src_argb
mov edx, [esp + 8] // dst_rgb mov edx, [esp + 8] // dst_rgb
...@@ -640,26 +642,25 @@ __asm { ...@@ -640,26 +642,25 @@ __asm {
convertloop: convertloop:
movdqa xmm0, [eax] // fetch 4 pixels of argb movdqa xmm0, [eax] // fetch 4 pixels of argb
lea eax, [eax + 16]
movdqa xmm1, xmm0 // B movdqa xmm1, xmm0 // B
psrlw xmm1, 3 psrld xmm1, 3
pand xmm1, xmm3 pand xmm1, xmm3
movdqa xmm2, xmm0 // G movdqa xmm2, xmm0 // G
psrlw xmm2, 6 psrld xmm2, 6
pand xmm2, xmm4 pand xmm2, xmm4
por xmm1, xmm2 por xmm1, xmm2
movdqa xmm2, xmm0 // R movdqa xmm2, xmm0 // R
psrlw xmm2, 9 psrld xmm2, 9
pand xmm2, xmm5 pand xmm2, xmm5
por xmm1, xmm2 por xmm1, xmm2
movdqa xmm2, xmm0 // A psrld xmm0, 16 // A
psrlw xmm2, 16 pand xmm0, xmm6
pand xmm2, xmm6 por xmm0, xmm1
por xmm1, xmm2
pslld xmm0, 16 pslld xmm0, 16
psrad xmm0, 16 psrad xmm0, 16
packssdw xmm1, xmm1 packssdw xmm0, xmm0
movq qword ptr [edx], xmm1 // store 4 pixels of ARGB1555 lea eax, [eax + 16]
movq qword ptr [edx], xmm0 // store 4 pixels of ARGB1555
lea edx, [edx + 8] lea edx, [edx + 8]
sub ecx, 4 sub ecx, 4
ja convertloop ja convertloop
...@@ -682,7 +683,6 @@ __asm { ...@@ -682,7 +683,6 @@ __asm {
convertloop: convertloop:
movdqa xmm0, [eax] // fetch 4 pixels of argb movdqa xmm0, [eax] // fetch 4 pixels of argb
lea eax, [eax + 16]
movdqa xmm1, xmm0 movdqa xmm1, xmm0
pand xmm0, xmm3 // low nibble pand xmm0, xmm3 // low nibble
pand xmm1, xmm4 // high nibble pand xmm1, xmm4 // high nibble
...@@ -690,6 +690,7 @@ __asm { ...@@ -690,6 +690,7 @@ __asm {
psrl xmm1, 8 psrl xmm1, 8
por xmm0, xmm1 por xmm0, xmm1
packuswb xmm0, xmm0 packuswb xmm0, xmm0
lea eax, [eax + 16]
movq qword ptr [edx], xmm0 // store 4 pixels of ARGB4444 movq qword ptr [edx], xmm0 // store 4 pixels of ARGB4444
lea edx, [edx + 8] lea edx, [edx + 8]
sub ecx, 4 sub ecx, 4
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment