Commit fe9f4ad8 authored by fbarchard@google.com

alphablend test for opaque and transparent

BUG=none
TEST=none
Review URL: https://webrtc-codereview.appspot.com/436005

git-svn-id: http://libyuv.googlecode.com/svn/trunk@205 16f28f9a-4ce2-e073-06de-1de4eb20be90
parent 9198f375
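
This commit teaches the SSE2 blend row to classify each pixel by its alpha byte before doing any math: fully transparent pixels (alpha 0) are skipped, fully opaque pixels (alpha 255) are copied, and only the translucent middle takes the multiply path. A minimal scalar sketch of that idea, not the shipped code; the +128, >>8 rounding matches the C path further down, everything else is assumed:

#include <stdint.h>

// Sketch: blend one little-endian ARGB pixel over dst, with the
// transparent/opaque fast paths this commit adds to the SSE2 row.
static inline void BlendPixelSketch(uint32_t src, uint32_t* dst) {
  uint32_t a = src >> 24;
  if (a == 0) {                    // fully transparent: skip the store
    return;
  }
  if (a == 255) {                  // fully opaque: plain copy
    *dst = src;
    return;
  }
  uint32_t d = *dst;
  uint32_t out = 0xff000000u;      // result alpha forced opaque
  for (int shift = 0; shift < 24; shift += 8) {
    uint32_t s = (src >> shift) & 0xff;
    uint32_t b = (d >> shift) & 0xff;
    out |= (((s * a + b * (255 - a) + 128) >> 8) & 0xff) << shift;
  }
  *dst = out;
}
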
@@ -162,12 +162,6 @@ int ARGBBlend(const uint8* src_argb, int src_stride_argb,
ARGBBlendRow = ARGBBlendRow_SSE2;
}
#endif
#if defined(HAS_ARGBBLENDROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3) &&
IS_ALIGNED(width, 2)) {
ARGBBlendRow = ARGBBlendRow_SSSE3;
}
#endif
for (int y = 0; y < height; ++y) {
ARGBBlendRow(src_argb, dst_argb, width);
......
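
With the SSSE3 branch removed, the wrapper's dispatch reduces to the pattern below. This is a sketch assembled from the hunk; the C-fallback default and the per-row stride advance are assumptions based on libyuv's usual wrapper shape:

void (*ARGBBlendRow)(const uint8* src_argb, uint8* dst_argb, int width) =
    ARGBBlendRow_C;  // assumed default
#if defined(HAS_ARGBBLENDROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) {
  ARGBBlendRow = ARGBBlendRow_SSE2;
}
#endif
for (int y = 0; y < height; ++y) {
  ARGBBlendRow(src_argb, dst_argb, width);
  src_argb += src_stride_argb;  // assumed stride advance
  dst_argb += dst_stride_argb;
}
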
@@ -996,6 +996,7 @@ void RotateUV270(const uint8* src, int src_stride,
width, height);
}
// Rotate 180 is a horizontal and vertical flip.
void RotateUV180(const uint8* src, int src_stride,
uint8* dst_a, int dst_stride_a,
uint8* dst_b, int dst_stride_b,
......
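
The comment above is the whole algorithm: walking the source rows bottom-up gives the vertical flip, and reading each row right-to-left gives the horizontal one. A plain-C sketch with the semantics assumed from the signature (interleaved UV in, split U and V planes out):

#include <stdint.h>

// Assumed semantics: rotate an interleaved UV plane 180 degrees while
// splitting it into separate U and V planes.
void RotateUV180Sketch(const uint8_t* src, int src_stride,
                       uint8_t* dst_a, int dst_stride_a,
                       uint8_t* dst_b, int dst_stride_b,
                       int width, int height) {
  for (int y = 0; y < height; ++y) {
    const uint8_t* s = src + (height - 1 - y) * src_stride;  // vertical flip
    for (int x = 0; x < width; ++x) {
      dst_a[x] = s[(width - 1 - x) * 2];      // horizontal mirror, U byte
      dst_b[x] = s[(width - 1 - x) * 2 + 1];  // horizontal mirror, V byte
    }
    dst_a += dst_stride_a;
    dst_b += dst_stride_b;
  }
}
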
@@ -65,7 +65,6 @@ extern "C" {
#endif
#if defined(_MSC_VER)
#define HAS_ARGBBLENDROW_SSSE3
#define HAS_ARGBBLENDROW_SSE2
#endif
@@ -244,7 +243,6 @@ void YToARGBRow_SSE2(const uint8* y_buf,
uint8* rgb_buf,
int width);
void ARGBBlendRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width);
void ARGBBlendRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width);
void ARGBBlendRow_C(const uint8* src_argb, uint8* dst_argb, int width);
......
@@ -514,76 +514,6 @@ void ARGBBlendRow_C(const uint8* src_argb, uint8* dst_argb, int width) {
}
}
#if 0
void ARGBBlendRow_C(const uint8* src_argb, uint8* dst_argb, int width) {
for (int x = 0; x < width - 1; x += 2) {
uint32 f = *(uint32*)src_argb;
uint32 a = f >> 24;
if (a) {
const uint32 b = *(uint32*)dst_argb;
if (a < 255) {
const uint32 src_rb = f & 0x00ff00ff;
const uint32 dst_rb = b & 0x00ff00ff;
const uint32 out_rb = (src_rb * a + dst_rb * (a ^ 0xff) + 0x00800080) &
0xff00ff00;
const uint32 src_g = f & 0x0000ff00;
const uint32 dst_g = b & 0x0000ff00;
const uint32 out_g = ((src_g * a + dst_g * (a ^ 0xff) + 0x00008000) &
0x00ff0000);
f = ((out_rb | out_g) >> 8) | 0xff000000;
}
*(uint32*)dst_argb = f;
}
f = *(uint32*)(src_argb + 4);
a = f >> 24;
if (a) {
const uint32 b = *(uint32*)(dst_argb + 4);
if (a < 255) {
const uint32 src_rb = f & 0x00ff00ff;
const uint32 dst_rb = b & 0x00ff00ff;
const uint32 out_rb = (src_rb * a + dst_rb * (a ^ 0xff) + 0x00800080) &
0xff00ff00;
const uint32 src_g = f & 0x0000ff00;
const uint32 dst_g = b & 0x0000ff00;
const uint32 out_g = ((src_g * a + dst_g * (a ^ 0xff) + 0x00008000) &
0x00ff0000);
f = ((out_rb | out_g) >> 8) | 0xff000000;
}
*(uint32*)(dst_argb + 4) = f;
}
src_argb += 8;
dst_argb += 8;
}
if (width & 1) {
uint32 f = *(uint32*)src_argb;
uint32 a = f >> 24;
if (a) {
const uint32 b = *(uint32*)dst_argb;
if (a < 255) {
const uint32 src_rb = f & 0x00ff00ff;
const uint32 dst_rb = b & 0x00ff00ff;
const uint32 out_rb = (src_rb * a + dst_rb * (a ^ 0xff) + 0x00800080) &
0xff00ff00;
const uint32 src_g = f & 0x0000ff00;
const uint32 dst_g = b & 0x0000ff00;
const uint32 out_g = ((src_g * a + dst_g * (a ^ 0xff) + 0x00008000) &
0x00ff0000);
f = ((out_rb | out_g) >> 8) | 0xff000000;
}
*(uint32*)dst_argb = f;
}
}
}
#endif
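// Note on the disabled rows above: red and blue are blended with a single
// 32-bit multiply each. The 0x00ff00ff mask leaves eight bits of headroom
// over both channels, 0x00800080 injects per-channel rounding, and the
// final shift by 8 approximates division by 255. Worked example
// (assumed inputs):
//   src = 0x80ff0000 (a = 0x80, R = 0xff), dst = 0xff0000ff (B = 0xff)
//   src_rb = 0x00ff0000, dst_rb = 0x000000ff, a ^ 0xff = 0x7f
//   src_rb * a            = 0x7f800000
//   dst_rb * (a ^ 0xff)   = 0x00007e81
//   + rounding 0x00800080 = 0x80007f01
//   & 0xff00ff00          = 0x80007f00  ->  after >> 8: R = 0x80, B = 0x7f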
// Wrappers to handle odd sizes/alignments
#define MAKEYUVANY(NAMEANY, NAME, COPYROW) \
void NAMEANY(const uint8* y_buf, \
......
@@ -1909,75 +1909,55 @@ void UYVYToUVRow_Unaligned_SSE2(const uint8* src_uyvy, int stride_uyvy,
}
#endif // HAS_YUY2TOYROW_SSE2
#ifdef HAS_ARGBBLENDROW_SSSE3
// Shuffle table for copying alpha
static const uvec8 kShuffleAlpha = {
7u, 7u, 7u, 7u, 7u, 7u, 0x80, 0x80, 15u, 15u, 15u, 15u, 15u, 15u, 0x80, 0x80
};
__declspec(naked)
void ARGBBlendRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
__asm {
mov eax, 0x00200020 // rounding constant for 8.6 fixed point
movd xmm3, eax
pshufd xmm3, xmm3, 0
mov eax, 0x3f3f3f3f // mask for alpha
movd xmm7, eax
pshufd xmm7, xmm7, 0
movdqa xmm4, kShuffleAlpha
pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
psrlw xmm5, 8
pcmpeqb xmm6, xmm6 // generate 0x00010001 for negating
psrlw xmm6, 15
mov eax, [esp + 4] // src_argb
mov edx, [esp + 8] // dst_argb
mov ecx, [esp + 12] // width
sub edx, eax
convertloop:
movq xmm0, qword ptr [eax] // fetch 2 pixels
movq xmm1, qword ptr [eax + edx]
punpcklbw xmm1, xmm0 // mix 2 pixels aArRgGbB_aArRgGbB
movdqa xmm2, xmm1 // alpha from byte 7 and 15
pshufb xmm2, xmm4
pxor xmm2, xmm5
psrlw xmm2, 2
pand xmm2, xmm7
paddw xmm2, xmm6 // -a = (a^255)+1
pmaddubsw xmm1, xmm2
paddw xmm1, xmm3 // round
psrlw xmm1, 6
packuswb xmm1, xmm1 // pack 2 pixels
sub ecx, 2
movq qword ptr [eax + edx], xmm1
lea eax, [eax + 8]
ja convertloop
ret
}
}
#endif // HAS_ARGBBLENDROW_SSSE3
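// Note: the SSSE3 row above works in 6-bit weights that sum to 64: alpha is
// shifted right two bits, its complement gets the +1 from xmm6, and
// pmaddubsw folds the interleaved dst/src byte pairs with those weights
// before the 0x0020 round and the shift by 6. A rough scalar model of one
// channel (assumed; based on reading the constants, not code from the tree):
static inline uint8 Blend6BitModel(uint8 src, uint8 dst, uint8 a) {
  int sa = a >> 2;                // src weight, 0..63
  int da = ((255 - a) >> 2) + 1;  // dst weight; sa + da == 64 for any a
  return (uint8)((src * sa + dst * da + 32) >> 6);
}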
#ifdef HAS_ARGBBLENDROW_SSE2
// TODO(fbarchard): Single multiply method b+a(f-b)
// TODO(fbarchard): Unroll and pair
// TODO(fbarchard): Test for transparent and opaque common cases
// TODO(fbarchard): Port to gcc
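// The "single multiply" TODO relies on the identity
//   b + a * (f - b) / 255 == (f * a + b * (255 - a)) / 255
// (f = src, b = dst), so each channel needs one multiply instead of two.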
__declspec(naked)
void ARGBBlendRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) {
__asm {
push esi
mov esi, [esp + 4 + 4] // src_argb
mov edx, [esp + 4 + 8] // dst_argb
mov ecx, [esp + 4 + 12] // width
pcmpeqb xmm4, xmm4 // generate 0xffffffff for negating alpha
mov eax, [esp + 4] // src_argb
mov edx, [esp + 8] // dst_argb
mov ecx, [esp + 12] // width
sub edx, eax
sub edx, esi
mov eax, [esi] // get first pixel
sub ecx, 1 // ensure there are at least 2 pixels
je last1 // last pixel?
cmp eax, 0xFF000000 // opaque?
jae opaqueloop
cmp eax, 0x00FFFFFF // translucent?
ja translucentloop
align 16
transparentloop:
sub ecx, 1
lea esi, [esi + 4]
je last1
convertloop:
movq xmm0, qword ptr [eax] // fetch 2 pixels
movq xmm1, qword ptr [eax + edx]
mov eax, [esi] // handle remaining pixel
cmp eax, 0x00FFFFFF // transparent?
jbe transparentloop
cmp eax, 0xFF000000 // translucent?
jb translucentloop
align 16
opaqueloop:
mov dword ptr [esi + edx], eax
lea esi, [esi + 4]
sub ecx, 1
je last1
mov eax, [esi] // handle remaining pixel
cmp eax, 0xFF000000 // opaque?
jae opaqueloop
cmp eax, 0x00FFFFFF // transparent?
jbe transparentloop
align 4
translucentloop:
movq xmm0, qword ptr [esi] // fetch 2 pixels
movq xmm1, qword ptr [esi + edx]
punpcklbw xmm0, xmm0 // src 16 bits
punpcklbw xmm1, xmm1 // dst 16 bits
pshuflw xmm2, xmm0, 0xff // src alpha
@@ -1989,19 +1969,25 @@ void ARGBBlendRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) {
paddw xmm0, xmm1
psrlw xmm0, 8
packuswb xmm0, xmm0 // pack 2 pixels
movq qword ptr [esi + edx], xmm0
lea esi, [esi + 8]
sub ecx, 2
movq qword ptr [eax + edx], xmm0
lea eax, [eax + 8]
ja convertloop
jbe last1
mov eax, [esi] // handle remaining pixel
cmp eax, 0x00FFFFFF // transparent?
jbe transparentloop
cmp eax, 0xFF000000 // translucent?
jb translucentloop
jmp opaqueloop
align 4
last1:
add ecx, 1
je done
mov ecx, [eax] // handle remaining pixel
movd xmm0, ecx
mov ecx, [eax + edx]
movd xmm1, ecx
movd xmm0, eax
mov eax, [esi + edx]
movd xmm1, eax
punpcklbw xmm0, xmm0 // src 16 bits
punpcklbw xmm1, xmm1 // dst 16 bits
pshuflw xmm2, xmm0, 0xff // src alpha
@@ -2012,17 +1998,16 @@ void ARGBBlendRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) {
pmulhuw xmm1, xmm3 // dst * (a ^ 0xffff)
paddw xmm0, xmm1
psrlw xmm0, 8
packuswb xmm0, xmm0 // pack 2 pixels
movd ecx, xmm0
mov dword ptr [eax + edx], ecx
packuswb xmm0, xmm0 // pack to bytes
movd eax, xmm0
mov dword ptr [esi + edx], eax
done:
pop esi
ret
}
}
#endif // HAS_ARGBBLENDROW_SSSE3
#endif // HAS_ARGBBLENDROW_SSE2
#endif // _M_IX86
......
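
The test named in the subject line is not part of this excerpt (TEST=none). A hypothetical check of the two fast paths, assuming, as the disabled C path above does, that fully transparent pixels leave the destination untouched:

#include <assert.h>
#include <stdint.h>
#include <string.h>

// Hypothetical, not from this commit: exercise the alpha extremes.
static void TestBlendExtremes(void (*blend)(const uint8_t*, uint8_t*, int)) {
  uint8_t src[8] = {9, 9, 9, 0,      // pixel 0: alpha 0, transparent
                    1, 2, 3, 255};   // pixel 1: alpha 255, opaque
  uint8_t dst[8] = {10, 20, 30, 40, 50, 60, 70, 80};
  const uint8_t expect[8] = {10, 20, 30, 40,  // left untouched
                             1, 2, 3, 255};   // copied from src
  blend(src, dst, 2);  // 2 ARGB pixels
  assert(memcmp(dst, expect, sizeof(dst)) == 0);
}
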
@@ -1702,18 +1702,18 @@ static void ScaleAddRows_SSE2(const uint8* src_ptr, int src_stride,
intptr_t tmp_src_stride = static_cast<intptr_t>(src_stride);
asm volatile (
"pxor %%xmm4,%%xmm4 \n"
"sub $0x1,%6 \n"
"sub $0x1,%5 \n"
"1: \n"
"movdqa (%0),%%xmm0 \n"
"mov %0,%3 \n"
"add %4,%0 \n"
"add %6,%0 \n"
"movdqa %%xmm0,%%xmm1 \n"
"punpcklbw %%xmm4,%%xmm0 \n"
"punpckhbw %%xmm4,%%xmm1 \n"
"mov %6,%2 \n"
"mov %5,%2 \n"
"2: \n"
"movdqa (%0),%%xmm2 \n"
"add %4,%0 \n"
"add %6,%0 \n"
"movdqa %%xmm2,%%xmm3 \n"
"punpcklbw %%xmm4,%%xmm2 \n"
"punpckhbw %%xmm4,%%xmm3 \n"
@@ -1725,16 +1725,15 @@ static void ScaleAddRows_SSE2(const uint8* src_ptr, int src_stride,
"movdqa %%xmm1,0x10(%1) \n"
"lea 0x10(%3),%0 \n"
"lea 0x20(%1),%1 \n"
"sub $0x10,%5 \n"
"sub $0x10,%4 \n"
"ja 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(tmp_height), // %2
"+r"(tmp_src), // %3
"+rm"(tmp_src_stride), // %4
"+rm"(src_width), // %5
"+rm"(src_height) // %6
:
"+rm"(src_width), // %4
"+rm"(src_height) // %5
: "+rm"(tmp_src_stride), // %6
: "memory", "cc"
#if defined(__SSE2__)
, "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
......
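
For reference, the loop above widens 16 source bytes at a time with punpcklbw/punpckhbw and accumulates src_height rows into 16-bit sums. A scalar equivalent with the semantics assumed from the signature in the hunk header:

#include <stdint.h>

// Assumed scalar equivalent: sum src_height rows into one uint16 per column.
static void ScaleAddRowsSketch(const uint8_t* src_ptr, int src_stride,
                               uint16_t* dst_ptr, int src_width,
                               int src_height) {
  for (int x = 0; x < src_width; ++x) {
    uint16_t sum = 0;
    const uint8_t* s = src_ptr + x;
    for (int y = 0; y < src_height; ++y) {
      sum += *s;
      s += src_stride;
    }
    dst_ptr[x] = sum;
  }
}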