Commit 965fb914 authored by fbarchard@google.com

alpha blend argb into argb

BUG=none
TEST=none
Review URL: https://webrtc-codereview.appspot.com/428009

git-svn-id: http://libyuv.googlecode.com/svn/trunk@203 16f28f9a-4ce2-e073-06de-1de4eb20be90
parent 26becab4
Name: libyuv
URL: http://code.google.com/p/libyuv/
-Version: 201
+Version: 203
License: BSD
License File: LICENSE
......
@@ -133,6 +133,11 @@ int ARGBCopy(const uint8* src_argb, int src_stride_argb,
             uint8* dst_argb, int dst_stride_argb,
             int width, int height);

// Alpha Blend ARGB
int ARGBBlend(const uint8* src_argb, int src_stride_argb,
              uint8* dst_argb, int dst_stride_argb,
              int width, int height);

int I422ToYUY2(const uint8* src_y, int src_stride_y,
               const uint8* src_u, int src_stride_u,
               const uint8* src_v, int src_stride_v,
......
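For orientation, here is a minimal sketch of how the new entry point might be called; it is not part of the commit, and the header path, the libyuv namespace, and the buffer setup are assumptions on my part. Per the wrapper added further down, the source ARGB image is blended onto the destination in place using the source alpha channel, a negative height blends bottom-up, and the call returns 0 on success or -1 on bad arguments.

#include "libyuv/planar_functions.h"  // assumed location of the ARGBBlend declaration

// Blend a 64x32 ARGB overlay onto a same-sized ARGB frame in place.
void BlendOverlay(const uint8* overlay_argb, uint8* frame_argb) {
  const int kWidth = 64;
  const int kHeight = 32;
  const int kStride = kWidth * 4;  // 4 bytes per ARGB pixel, no row padding
  int ret = libyuv::ARGBBlend(overlay_argb, kStride,
                              frame_argb, kStride,
                              kWidth, kHeight);
  (void)ret;  // 0 on success, -1 if a pointer is null or width <= 0
}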
@@ -11,7 +11,7 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_
#define INCLUDE_LIBYUV_VERSION_H_
-#define LIBYUV_VERSION 201
+#define LIBYUV_VERSION 203
#endif  // INCLUDE_LIBYUV_VERSION_H_
@@ -140,6 +140,43 @@ int ARGBCopy(const uint8* src_argb, int src_stride_argb,
  return 0;
}

// Alpha Blend ARGB
int ARGBBlend(const uint8* src_argb, int src_stride_argb,
              uint8* dst_argb, int dst_stride_argb,
              int width, int height) {
  if (!src_argb || !dst_argb || width <= 0 || height == 0) {
    return -1;
  }
  // Negative height means invert the image.
  if (height < 0) {
    height = -height;
    src_argb = src_argb + (height - 1) * src_stride_argb;
    src_stride_argb = -src_stride_argb;
  }
  void (*ARGBBlendRow)(const uint8* src_argb, uint8* dst_argb, int width) =
      ARGBBlendRow_C;
#if defined(HAS_ARGBBLENDROW_SSE2)
  if (TestCpuFlag(kCpuHasSSE2)) {
    ARGBBlendRow = ARGBBlendRow_SSE2;
  }
#endif
#if defined(HAS_ARGBBLENDROW_SSSE3)
  if (TestCpuFlag(kCpuHasSSSE3) &&
      IS_ALIGNED(width, 2)) {
    ARGBBlendRow = ARGBBlendRow_SSSE3;
  }
#endif
  for (int y = 0; y < height; ++y) {
    ARGBBlendRow(src_argb, dst_argb, width);
    src_argb += src_stride_argb;
    dst_argb += dst_stride_argb;
  }
  return 0;
}

// Convert I422 to ARGB.
int I422ToARGB(const uint8* src_y, int src_stride_y,
               const uint8* src_u, int src_stride_u,
......
@@ -64,6 +64,11 @@ extern "C" {
#define HAS_UYVYTOUVROW_SSE2
#endif

#if defined(_MSC_VER)
#define HAS_ARGBBLENDROW_SSSE3
#define HAS_ARGBBLENDROW_SSE2
#endif

// The following are available on Neon platforms
#if defined(__ARM_NEON__) && !defined(YUV_DISABLE_ASM)
#define HAS_MIRRORROW_NEON

@@ -239,6 +244,10 @@ void YToARGBRow_SSE2(const uint8* y_buf,
                     uint8* rgb_buf,
                     int width);

void ARGBBlendRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width);
void ARGBBlendRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width);
void ARGBBlendRow_C(const uint8* src_argb, uint8* dst_argb, int width);

// 'Any' wrappers use memcpy()
void I420ToARGBRow_Any_SSSE3(const uint8* y_buf,
                             const uint8* u_buf,
......
@@ -452,6 +452,138 @@ void UYVYToYRow_C(const uint8* src_yuy2, uint8* dst_y, int width) {
  }
}
#define BLENDER(f, b, a) (f * a + b * (a ^ 0xff) + 0x80) >> 8

void ARGBBlendRow_C(const uint8* src_argb, uint8* dst_argb, int width) {
  for (int x = 0; x < width - 1; x += 2) {
    uint32 a = src_argb[3];
    if (a) {
      if (a < 255) {
        const uint32 fb = src_argb[0];
        const uint32 fg = src_argb[1];
        const uint32 fr = src_argb[2];
        const uint32 bb = dst_argb[0];
        const uint32 bg = dst_argb[1];
        const uint32 br = dst_argb[2];
        dst_argb[0] = BLENDER(fb, bb, a);
        dst_argb[1] = BLENDER(fg, bg, a);
        dst_argb[2] = BLENDER(fr, br, a);
        dst_argb[3] = 255u;
      } else {
        *(uint32*)dst_argb = *(uint32*)src_argb;
      }
    }
    a = src_argb[4 + 3];
    if (a) {
      if (a < 255) {
        const uint32 fb = src_argb[4 + 0];
        const uint32 fg = src_argb[4 + 1];
        const uint32 fr = src_argb[4 + 2];
        const uint32 bb = dst_argb[4 + 0];
        const uint32 bg = dst_argb[4 + 1];
        const uint32 br = dst_argb[4 + 2];
        dst_argb[4 + 0] = BLENDER(fb, bb, a);
        dst_argb[4 + 1] = BLENDER(fg, bg, a);
        dst_argb[4 + 2] = BLENDER(fr, br, a);
        dst_argb[4 + 3] = 255u;
      } else {
        *(uint32*)(dst_argb + 4) = *(uint32*)(src_argb + 4);
      }
    }
    src_argb += 8;
    dst_argb += 8;
  }
  if (width & 1) {
    const uint32 a = src_argb[3];
    if (a) {
      if (a < 255) {
        const uint32 fb = src_argb[0];
        const uint32 fg = src_argb[1];
        const uint32 fr = src_argb[2];
        const uint32 bb = dst_argb[0];
        const uint32 bg = dst_argb[1];
        const uint32 br = dst_argb[2];
        dst_argb[0] = BLENDER(fb, bb, a);
        dst_argb[1] = BLENDER(fg, bg, a);
        dst_argb[2] = BLENDER(fr, br, a);
        dst_argb[3] = 255u;
      } else {
        *(uint32*)dst_argb = *(uint32*)src_argb;
      }
    }
  }
}
#if 0
void ARGBBlendRow_C(const uint8* src_argb, uint8* dst_argb, int width) {
  for (int x = 0; x < width - 1; x += 2) {
    uint32 f = *(uint32*)src_argb;
    uint32 a = f >> 24;
    if (a) {
      const uint32 b = *(uint32*)dst_argb;
      if (a < 255) {
        const uint32 src_rb = f & 0x00ff00ff;
        const uint32 dst_rb = b & 0x00ff00ff;
        const uint32 out_rb = (src_rb * a + dst_rb * (a ^ 0xff) + 0x00800080) &
                              0xff00ff00;
        const uint32 src_g = f & 0x0000ff00;
        const uint32 dst_g = b & 0x0000ff00;
        const uint32 out_g = ((src_g * a + dst_g * (a ^ 0xff) + 0x00008000) &
                              0x00ff0000);
        f = ((out_rb | out_g) >> 8) | 0xff000000;
      }
      *(uint32*)dst_argb = f;
    }
    f = *(uint32*)(src_argb + 4);
    a = f >> 24;
    if (a) {
      const uint32 b = *(uint32*)(dst_argb + 4);
      if (a < 255) {
        const uint32 src_rb = f & 0x00ff00ff;
        const uint32 dst_rb = b & 0x00ff00ff;
        const uint32 out_rb = (src_rb * a + dst_rb * (a ^ 0xff) + 0x00800080) &
                              0xff00ff00;
        const uint32 src_g = f & 0x0000ff00;
        const uint32 dst_g = b & 0x0000ff00;
        const uint32 out_g = ((src_g * a + dst_g * (a ^ 0xff) + 0x00008000) &
                              0x00ff0000);
        f = ((out_rb | out_g) >> 8) | 0xff000000;
      }
      *(uint32*)(dst_argb + 4) = f;
    }
    src_argb += 8;
    dst_argb += 8;
  }
  if (width & 1) {
    uint32 f = *(uint32*)src_argb;
    uint32 a = f >> 24;
    if (a) {
      const uint32 b = *(uint32*)dst_argb;
      if (a < 255) {
        const uint32 src_rb = f & 0x00ff00ff;
        const uint32 dst_rb = b & 0x00ff00ff;
        const uint32 out_rb = (src_rb * a + dst_rb * (a ^ 0xff) + 0x00800080) &
                              0xff00ff00;
        const uint32 src_g = f & 0x0000ff00;
        const uint32 dst_g = b & 0x0000ff00;
        const uint32 out_g = ((src_g * a + dst_g * (a ^ 0xff) + 0x00008000) &
                              0x00ff0000);
        f = ((out_rb | out_g) >> 8) | 0xff000000;
      }
      *(uint32*)dst_argb = f;
    }
  }
}
#endif
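The BLENDER macro above is the usual integer shortcut for alpha blending: each destination colour channel becomes (f * a + b * (255 - a) + 128) >> 8, with a ^ 0xff standing in for 255 - a and a divide by 256 (plus rounding) standing in for the exact divide by 255; fully transparent pixels are skipped and fully opaque pixels are copied before the macro is ever reached. A small self-contained illustration of the same per-channel arithmetic, written for this note rather than taken from the change:

#include <cstdint>
#include <cstdio>

// Same arithmetic as the BLENDER macro: blend foreground f over background b
// at alpha a, dividing by 256 with +128 rounding instead of dividing by 255.
static uint8_t BlendChannel(uint8_t f, uint8_t b, uint8_t a) {
  return static_cast<uint8_t>((f * a + b * (a ^ 0xff) + 0x80) >> 8);
}

int main() {
  // (200 * 128 + 100 * 127 + 128) >> 8 == 150, which matches the exact
  // (200 * 128 + 100 * 127) / 255 blend rounded to the nearest integer.
  std::printf("%d\n", BlendChannel(200, 100, 128));
  return 0;
}

Because a == 255 is special-cased as a straight copy, the slight downward bias of dividing by 256 instead of 255 never affects fully opaque pixels.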
// Wrappers to handle odd sizes/alignments
#define MAKEYUVANY(NAMEANY, NAME, COPYROW) \
void NAMEANY(const uint8* y_buf, \
......
@@ -1909,6 +1909,121 @@ void UYVYToUVRow_Unaligned_SSE2(const uint8* src_uyvy, int stride_uyvy,
}
#endif  // HAS_YUY2TOYROW_SSE2
#ifdef HAS_ARGBBLENDROW_SSSE3
// Shuffle table for copying alpha
static const uvec8 kShuffleAlpha = {
  7u, 7u, 7u, 7u, 7u, 7u, 0x80, 0x80, 15u, 15u, 15u, 15u, 15u, 15u, 0x80, 0x80
};

__declspec(naked)
void ARGBBlendRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
  __asm {
    mov eax, 0x00200020      // rounding constant for 8.6 fixed point
    movd xmm3, eax
    pshufd xmm3, xmm3, 0
    mov eax, 0x3f3f3f3f      // mask for alpha
    movd xmm7, eax
    pshufd xmm7, xmm7, 0
    movdqa xmm4, kShuffleAlpha
    pcmpeqb xmm5, xmm5       // generate mask 0x00ff00ff
    psrlw xmm5, 8
    pcmpeqb xmm6, xmm6       // generate 0x00010001 for negating
    psrlw xmm6, 15
    mov eax, [esp + 4]       // src_argb
    mov edx, [esp + 8]       // dst_argb
    mov ecx, [esp + 12]      // width
    sub edx, eax

  convertloop:
    movq xmm0, qword ptr [eax]        // fetch 2 pixels
    movq xmm1, qword ptr [eax + edx]
    punpcklbw xmm1, xmm0     // mix 2 pixels aArRgGbB_aArRgGbB
    movdqa xmm2, xmm1        // alpha from byte 7 and 15
    pshufb xmm2, xmm4
    pxor xmm2, xmm5
    psrlw xmm2, 2
    pand xmm2, xmm7
    paddw xmm2, xmm6         // -a = (a^255)+1
    pmaddubsw xmm1, xmm2
    paddw xmm1, xmm3         // round
    psrlw xmm1, 6
    packuswb xmm1, xmm1      // pack 2 pixels
    sub ecx, 2
    movq qword ptr [eax + edx], xmm1
    lea eax, [eax + 8]
    ja convertloop
    ret
  }
}
#endif  // HAS_ARGBBLENDROW_SSSE3
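As I read the SSSE3 path above, it folds the two multiplies of the blend into a single pmaddubsw: source and destination bytes are interleaved into one register, the per-pixel alpha is cut down to 6 bits so that the sum of the two products fits a signed 16-bit lane, and the result is rounded with 0x20 and shifted right by 6, the "8.6 fixed point" named in the comments. A rough scalar model of that per-channel math follows; it is my interpretation of the assembly, not code from the commit:

#include <cstdint>

// Per-channel model of the SSSE3 blend as read from the assembly above:
// source weight a >> 2, destination weight ((a ^ 0xff) >> 2) + 1 (the
// "-a = (a^255)+1" step), then add 0x20 for rounding and shift right by 6.
static uint8_t BlendChannelSsse3Model(uint8_t f, uint8_t b, uint8_t a) {
  const int src_weight = a >> 2;                 // 0..63
  const int dst_weight = ((a ^ 0xff) >> 2) + 1;  // 1..64
  // Worst case: 255 * 63 + 255 * 64 = 32385, inside the signed 16-bit
  // range that pmaddubsw requires.
  return static_cast<uint8_t>((f * src_weight + b * dst_weight + 0x20) >> 6);
}

The IS_ALIGNED(width, 2) guard in ARGBBlend matches this routine handling exactly two pixels per loop iteration.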
#ifdef HAS_ARGBBLENDROW_SSE2
// TODO(fbarchard): Single multiply method b+a(f-b)
// TODO(fbarchard): Unroll and pair
// TODO(fbarchard): Test for transparent and opaque common cases
__declspec(naked)
void ARGBBlendRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) {
  __asm {
    pcmpeqb xmm4, xmm4       // generate 0xffffffff for negating alpha
    mov eax, [esp + 4]       // src_argb
    mov edx, [esp + 8]       // dst_argb
    mov ecx, [esp + 12]      // width
    sub edx, eax
    sub ecx, 1
    je last1

  convertloop:
    movq xmm0, qword ptr [eax]        // fetch 2 pixels
    movq xmm1, qword ptr [eax + edx]
    punpcklbw xmm0, xmm0     // src 16 bits
    punpcklbw xmm1, xmm1     // dst 16 bits
    pshuflw xmm2, xmm0, 0xff // src alpha
    pshufhw xmm2, xmm2, 0xff
    movdqa xmm3, xmm2        // dst alpha
    pxor xmm3, xmm4
    pmulhuw xmm0, xmm2       // src * a
    pmulhuw xmm1, xmm3       // dst * (a ^ 0xffff)
    paddw xmm0, xmm1
    psrlw xmm0, 8
    packuswb xmm0, xmm0      // pack 2 pixels
    sub ecx, 2
    movq qword ptr [eax + edx], xmm0
    lea eax, [eax + 8]
    ja convertloop

  last1:
    add ecx, 1
    je done
    mov ecx, [eax]           // handle remaining pixel
    movd xmm0, ecx
    mov ecx, [eax + edx]
    movd xmm1, ecx
    punpcklbw xmm0, xmm0     // src 16 bits
    punpcklbw xmm1, xmm1     // dst 16 bits
    pshuflw xmm2, xmm0, 0xff // src alpha
    pshufhw xmm2, xmm2, 0xff
    movdqa xmm3, xmm2        // dst alpha
    pxor xmm3, xmm4
    pmulhuw xmm0, xmm2       // src * a
    pmulhuw xmm1, xmm3       // dst * (a ^ 0xffff)
    paddw xmm0, xmm1
    psrlw xmm0, 8
    packuswb xmm0, xmm0      // pack 2 pixels
    movd ecx, xmm0
    mov dword ptr [eax + edx], ecx

  done:
    ret
  }
}
#endif  // HAS_ARGBBLENDROW_SSE2
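The SSE2 fallback approximates the divide differently, as far as I can tell: punpcklbw of a register with itself widens each byte v to the 16-bit value v * 257 (so 255 becomes 0xffff), pmulhuw keeps only the high 16 bits of each 16x16 product, and the summed terms are shifted down by 8. A rough scalar model, again my reading of the assembly rather than anything shipped in the commit:

#include <cstdint>

// Per-channel model of the SSE2 blend: widen bytes to v * 257, take the high
// 16 bits of each product (pmulhuw), add, and shift right by 8.
static uint8_t BlendChannelSse2Model(uint8_t f, uint8_t b, uint8_t a) {
  const uint32_t f16 = f * 257u;
  const uint32_t b16 = b * 257u;
  const uint32_t a16 = a * 257u;
  const uint32_t na16 = a16 ^ 0xffffu;  // the "dst * (a ^ 0xffff)" weight
  const uint32_t src_term = (f16 * a16) >> 16;
  const uint32_t dst_term = (b16 * na16) >> 16;
  return static_cast<uint8_t>((src_term + dst_term) >> 8);
}

Unlike the C path, neither vector routine special-cases fully transparent or opaque pixels, which is what the TODO above about common cases appears to refer to.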
#endif  // _M_IX86

#ifdef __cplusplus
......
@@ -1699,20 +1699,21 @@ static void ScaleAddRows_SSE2(const uint8* src_ptr, int src_stride,
                              uint16* dst_ptr, int src_width, int src_height) {
  int tmp_height = 0;
  intptr_t tmp_src = 0;
+ intptr_t tmp_src_stride = static_cast<intptr_t>(src_stride);
  asm volatile (
    "pxor %%xmm4,%%xmm4 \n"
-   "sub $0x1,%5 \n"
+   "sub $0x1,%6 \n"
  "1: \n"
    "movdqa (%0),%%xmm0 \n"
    "mov %0,%3 \n"
-   "add %6,%0 \n"
+   "add %4,%0 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "punpcklbw %%xmm4,%%xmm0 \n"
    "punpckhbw %%xmm4,%%xmm1 \n"
-   "mov %5,%2 \n"
+   "mov %6,%2 \n"
  "2: \n"
    "movdqa (%0),%%xmm2 \n"
-   "add %6,%0 \n"
+   "add %4,%0 \n"
    "movdqa %%xmm2,%%xmm3 \n"
    "punpcklbw %%xmm4,%%xmm2 \n"
    "punpckhbw %%xmm4,%%xmm3 \n"

@@ -1724,15 +1725,16 @@ static void ScaleAddRows_SSE2(const uint8* src_ptr, int src_stride,
    "movdqa %%xmm1,0x10(%1) \n"
    "lea 0x10(%3),%0 \n"
    "lea 0x20(%1),%1 \n"
-   "sub $0x10,%4 \n"
+   "sub $0x10,%5 \n"
    "ja 1b \n"
  : "+r"(src_ptr),     // %0
    "+r"(dst_ptr),     // %1
    "+r"(tmp_height),  // %2
    "+r"(tmp_src),     // %3
-   "+r"(src_width),   // %4
-   "+rm"(src_height)  // %5
-  : "rm"(static_cast<intptr_t>(src_stride))  // %6
+   "+r"(tmp_src_stride),  // %4
+   "+rm"(src_width),      // %5
+   "+rm"(src_height)      // %6
+ :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"

@@ -1740,7 +1742,6 @@ static void ScaleAddRows_SSE2(const uint8* src_ptr, int src_stride,
  );
}

#if defined(__i386__)
extern "C" void ScaleRowDown8Int_SSE2(const uint8* src_ptr, int src_stride,
                                      uint8* dst_ptr, int dst_width);
......