Commit 8670b1ae authored by fbarchard@google.com

SSSE3 version of the alpha blender uses a single pshufb instead of a shift plus pshufhw/pshuflw.

BUG=none
TEST=none
Review URL: https://webrtc-codereview.appspot.com/446008

git-svn-id: http://libyuv.googlecode.com/svn/trunk@219 16f28f9a-4ce2-e073-06de-1de4eb20be90
parent 67be98bd
Name: libyuv Name: libyuv
URL: http://code.google.com/p/libyuv/ URL: http://code.google.com/p/libyuv/
Version: 218 Version: 219
License: BSD License: BSD
License File: LICENSE License File: LICENSE
......
...@@ -11,7 +11,7 @@ ...@@ -11,7 +11,7 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_ #ifndef INCLUDE_LIBYUV_VERSION_H_
#define INCLUDE_LIBYUV_VERSION_H_ #define INCLUDE_LIBYUV_VERSION_H_
#define LIBYUV_VERSION 218 #define LIBYUV_VERSION 219
#endif // INCLUDE_LIBYUV_VERSION_H_ #endif // INCLUDE_LIBYUV_VERSION_H_
...@@ -176,6 +176,15 @@ int ARGBBlend(const uint8* src_argb, int src_stride_argb, ...@@ -176,6 +176,15 @@ int ARGBBlend(const uint8* src_argb, int src_stride_argb,
} }
} }
#endif #endif
#if defined(HAS_ARGBBLENDROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
ARGBBlendRow = ARGBBlendRow_SSSE3;
if (IS_ALIGNED(width, 4) &&
IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
ARGBBlendRow = ARGBBlendRow_Aligned_SSSE3;
}
}
#endif
for (int y = 0; y < height; ++y) { for (int y = 0; y < height; ++y) {
ARGBBlendRow(src_argb, dst_argb, width); ARGBBlendRow(src_argb, dst_argb, width);
......
...@@ -76,6 +76,10 @@ extern "C" { ...@@ -76,6 +76,10 @@ extern "C" {
#define HAS_I420TOABGRROW_NEON #define HAS_I420TOABGRROW_NEON
#endif #endif
#if defined(_MSC_VER) && !defined(YUV_DISABLE_ASM)
#define HAS_ARGBBLENDROW_SSSE3
#endif
#if defined(_MSC_VER) #if defined(_MSC_VER)
#define SIMD_ALIGNED(var) __declspec(align(16)) var #define SIMD_ALIGNED(var) __declspec(align(16)) var
typedef __declspec(align(16)) signed char vec8[16]; typedef __declspec(align(16)) signed char vec8[16];
...@@ -241,8 +245,11 @@ void YToARGBRow_SSE2(const uint8* y_buf, ...@@ -241,8 +245,11 @@ void YToARGBRow_SSE2(const uint8* y_buf,
int width); int width);
// ARGB preattenuated alpha blend. // ARGB preattenuated alpha blend.
void ARGBBlendRow_Aligned_SSSE3(const uint8* src_argb, uint8* dst_argb,
int width);
void ARGBBlendRow_Aligned_SSE2(const uint8* src_argb, uint8* dst_argb, void ARGBBlendRow_Aligned_SSE2(const uint8* src_argb, uint8* dst_argb,
int width); int width);
void ARGBBlendRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width);
void ARGBBlendRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width); void ARGBBlendRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width);
void ARGBBlendRow_C(const uint8* src_argb, uint8* dst_argb, int width); void ARGBBlendRow_C(const uint8* src_argb, uint8* dst_argb, int width);
......
...@@ -1961,7 +1961,6 @@ void UYVYToUVRow_Unaligned_SSE2(const uint8* src_uyvy, int stride_uyvy, ...@@ -1961,7 +1961,6 @@ void UYVYToUVRow_Unaligned_SSE2(const uint8* src_uyvy, int stride_uyvy,
#ifdef HAS_ARGBBLENDROW_SSE2 #ifdef HAS_ARGBBLENDROW_SSE2
// Blend 8 pixels at a time // Blend 8 pixels at a time
// Destination aligned to 16 bytes, multiple of 4 pixels // Destination aligned to 16 bytes, multiple of 4 pixels
// TODO(fbarchard): SSSE3 version with pshufb for alpha and maybe pmaddubsw
__declspec(naked) __declspec(align(16)) __declspec(naked) __declspec(align(16))
void ARGBBlendRow_Aligned_SSE2(const uint8* src_argb, uint8* dst_argb, void ARGBBlendRow_Aligned_SSE2(const uint8* src_argb, uint8* dst_argb,
int width) { int width) {
...@@ -1988,7 +1987,7 @@ void ARGBBlendRow_Aligned_SSE2(const uint8* src_argb, uint8* dst_argb, ...@@ -1988,7 +1987,7 @@ void ARGBBlendRow_Aligned_SSE2(const uint8* src_argb, uint8* dst_argb,
pshufhw xmm3, xmm3,0F5h // 8 alpha words pshufhw xmm3, xmm3,0F5h // 8 alpha words
pshuflw xmm3, xmm3,0F5h pshuflw xmm3, xmm3,0F5h
pand xmm2, xmm6 // _r_b pand xmm2, xmm6 // _r_b
paddw xmm3, xmm7 // 256-alpha paddw xmm3, xmm7 // 256 - alpha
pmullw xmm2, xmm3 // _r_b * alpha pmullw xmm2, xmm3 // _r_b * alpha
movdqa xmm1, [edx] // _a_g movdqa xmm1, [edx] // _a_g
psrlw xmm1, 8 // _a_g psrlw xmm1, 8 // _a_g
...@@ -2006,12 +2005,12 @@ void ARGBBlendRow_Aligned_SSE2(const uint8* src_argb, uint8* dst_argb, ...@@ -2006,12 +2005,12 @@ void ARGBBlendRow_Aligned_SSE2(const uint8* src_argb, uint8* dst_argb,
movdqa xmm0, xmm3 // src argb movdqa xmm0, xmm3 // src argb
pxor xmm3, xmm4 // ~alpha pxor xmm3, xmm4 // ~alpha
movdqa xmm2, [edx + 16] // _r_b movdqa xmm2, [edx + 16] // _r_b
psrlw xmm3, 8 // alpha psrlw xmm3, 8 // alpha
pshufhw xmm3, xmm3,0F5h // 8 alpha words pshufhw xmm3, xmm3,0F5h // 8 alpha words
pshuflw xmm3, xmm3,0F5h pshuflw xmm3, xmm3,0F5h
pand xmm2, xmm6 // _r_b pand xmm2, xmm6 // _r_b
paddw xmm3, xmm7 // 256-alpha paddw xmm3, xmm7 // 256 - alpha
pmullw xmm2, xmm3 // _r_b * alpha pmullw xmm2, xmm3 // _r_b * alpha
movdqa xmm1, [edx + 16] // _a_g movdqa xmm1, [edx + 16] // _a_g
psrlw xmm1, 8 // _a_g psrlw xmm1, 8 // _a_g
...@@ -2058,7 +2057,7 @@ void ARGBBlendRow1_SSE2(const uint8* src_argb, uint8* dst_argb, int width) { ...@@ -2058,7 +2057,7 @@ void ARGBBlendRow1_SSE2(const uint8* src_argb, uint8* dst_argb, int width) {
pshufhw xmm3, xmm3,0F5h // 8 alpha words pshufhw xmm3, xmm3,0F5h // 8 alpha words
pshuflw xmm3, xmm3,0F5h pshuflw xmm3, xmm3,0F5h
pand xmm2, xmm6 // _r_b pand xmm2, xmm6 // _r_b
paddw xmm3, xmm7 // 256-alpha paddw xmm3, xmm7 // 256 - alpha
pmullw xmm2, xmm3 // _r_b * alpha pmullw xmm2, xmm3 // _r_b * alpha
movd xmm1, [edx] // _a_g movd xmm1, [edx] // _a_g
psrlw xmm1, 8 // _a_g psrlw xmm1, 8 // _a_g
...@@ -2100,9 +2099,115 @@ void ARGBBlendRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) { ...@@ -2100,9 +2099,115 @@ void ARGBBlendRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) {
ARGBBlendRow1_SSE2(src_argb, dst_argb, width); ARGBBlendRow1_SSE2(src_argb, dst_argb, width);
} }
} }
#endif // HAS_ARGBBLENDROW_SSE2 #endif // HAS_ARGBBLENDROW_SSE2
#ifdef HAS_ARGBBLENDROW_SSSE3
// Blend 8 pixels at a time
// pshufb control mask used by ARGBBlendRow_Aligned_SSSE3 to expand alpha.
// For each of the 4 pixels in a 16 byte register, byte 3 of the pixel
// (indices 3, 7, 11, 15) is copied into the low byte of two consecutive
// 16 bit words; the 0x80 entries make pshufb write zero to the high byte of
// each word.  The result is 8 zero-extended words: a0 a0 a1 a1 a2 a2 a3 a3.
static const uvec8 kShuffleAlpha = {
3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80,
11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80
};
// Same as SSE2, but replaces
//    psrlw      xmm3, 8          // alpha
//    pshufhw    xmm3, xmm3,0F5h  // 8 alpha words
//    pshuflw    xmm3, xmm3,0F5h
// with..
//    pshufb     xmm3, kShuffleAlpha // alpha
//
// Blends src_argb over dst_argb in place.  Per colour byte the stored value
// is   src + ((dst * (256 - src_alpha)) >> 8)   with unsigned saturation, and
// the alpha byte of the result is forced to 255.
//
// Requirements:
//   dst_argb - 16 byte aligned (accessed with movdqa).
//   src_argb - no alignment requirement (accessed with movdqu).
//   width    - pixel count, a positive multiple of 4.  The first group of
//              4 pixels is processed before width is tested.
//
// The loop is unrolled to two groups of 4 pixels.  Each source load is issued
// only once the remaining count shows those 4 pixels exist, so the routine
// never reads past the end of the source row.
// Uses eax, ecx, edx and xmm0-xmm7.
__declspec(naked) __declspec(align(16))
void ARGBBlendRow_Aligned_SSSE3(const uint8* src_argb, uint8* dst_argb,
                                int width) {
  __asm {
    mov        eax, [esp + 4]   // src_argb
    mov        edx, [esp + 8]   // dst_argb
    mov        ecx, [esp + 12]  // width
    pcmpeqb    xmm7, xmm7       // generate constant 1
    psrlw      xmm7, 15
    pcmpeqb    xmm6, xmm6       // generate mask 0x00ff00ff
    psrlw      xmm6, 8
    pcmpeqb    xmm5, xmm5       // generate mask 0xff00ff00
    psllw      xmm5, 8
    pcmpeqb    xmm4, xmm4       // generate mask 0xff000000
    pslld      xmm4, 24

    align      16
  convertloop:
    // First group of 4 pixels.
    movdqu     xmm3, [eax]
    movdqa     xmm0, xmm3       // src argb
    pxor       xmm3, xmm4       // ~alpha
    pshufb     xmm3, kShuffleAlpha  // 8 words of 255 - alpha
    movdqa     xmm2, [edx]      // _r_b
    pand       xmm2, xmm6       // _r_b
    paddw      xmm3, xmm7       // 256 - alpha
    pmullw     xmm2, xmm3       // _r_b * (256 - alpha)
    movdqa     xmm1, [edx]      // _a_g
    psrlw      xmm1, 8          // _a_g
    por        xmm0, xmm4       // set alpha to 255
    pmullw     xmm1, xmm3       // _a_g * (256 - alpha)
    psrlw      xmm2, 8          // _r_b convert to 8 bits again
    paddusb    xmm0, xmm2       // + src argb
    pand       xmm1, xmm5       // a_g_ convert to 8 bits again
    paddusb    xmm0, xmm1       // + src argb
    sub        ecx, 4
    movdqa     [edx], xmm0      // movdqa does not modify flags
    jle        done

    // Second group of 4 pixels.  The load is deliberately placed after the
    // count check above so a row ending on the first group is not over-read.
    movdqu     xmm3, [eax + 16]
    lea        eax, [eax + 32]  // lea does not modify flags
    movdqa     xmm0, xmm3       // src argb
    pxor       xmm3, xmm4       // ~alpha
    movdqa     xmm2, [edx + 16] // _r_b
    pshufb     xmm3, kShuffleAlpha  // 8 words of 255 - alpha
    pand       xmm2, xmm6       // _r_b
    paddw      xmm3, xmm7       // 256 - alpha
    pmullw     xmm2, xmm3       // _r_b * (256 - alpha)
    movdqa     xmm1, [edx + 16] // _a_g
    psrlw      xmm1, 8          // _a_g
    por        xmm0, xmm4       // set alpha to 255
    pmullw     xmm1, xmm3       // _a_g * (256 - alpha)
    psrlw      xmm2, 8          // _r_b convert to 8 bits again
    paddusb    xmm0, xmm2       // + src argb
    pand       xmm1, xmm5       // a_g_ convert to 8 bits again
    paddusb    xmm0, xmm1       // + src argb
    sub        ecx, 4
    movdqa     [edx + 16], xmm0
    lea        edx, [edx + 32]
    jg         convertloop

  done:
    ret
  }
}
// Blends width pixels of src_argb over dst_argb for any destination
// alignment.  The row is split into three parts:
//   1. leading pixels handled by ARGBBlendRow1_SSE2 until dst_argb reaches a
//      16 byte boundary (or the whole row, if dst_argb is not even 4 byte
//      aligned and can therefore never reach one in whole pixels),
//   2. a multiple of 4 pixels handled by ARGBBlendRow_Aligned_SSSE3,
//   3. the remaining 1 to 3 pixels handled by ARGBBlendRow1_SSE2.
void ARGBBlendRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
  // Nothing to blend; also keeps the helpers from seeing a non-positive count.
  if (width <= 0) {
    return;
  }
  // Do 1 to 3 pixels to get destination aligned.
  if ((uintptr_t)(dst_argb) & 15) {
    int count = width;
    if (((intptr_t)(dst_argb) & 3) == 0) {
      // Pixels (4 bytes each) needed to reach the next 16 byte boundary.
      count = (-(intptr_t)(dst_argb) >> 2) & 3;
      // Never blend more pixels than the row holds.  Without this clamp a
      // short row writes past its end and width goes negative below.
      if (count > width) {
        count = width;
      }
    }
    ARGBBlendRow1_SSE2(src_argb, dst_argb, count);
    src_argb += count * 4;
    dst_argb += count * 4;
    width -= count;
  }
  // Do multiple of 4 pixels
  if (width & ~3) {
    ARGBBlendRow_Aligned_SSSE3(src_argb, dst_argb, width & ~3);
  }
  // Do remaining 1 to 3 pixels
  if (width & 3) {
    src_argb += (width & ~3) * 4;
    dst_argb += (width & ~3) * 4;
    width &= 3;
    ARGBBlendRow1_SSE2(src_argb, dst_argb, width);
  }
}
#endif // HAS_ARGBBLENDROW_SSSE3
#endif // _M_IX86 #endif // _M_IX86
#ifdef __cplusplus #ifdef __cplusplus
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment