Commit 787f8276 authored by fbarchard@google.com

Unattenuate multiply alpha by 1.

BUG=190
TESTED=planar_test
Review URL: https://webrtc-codereview.appspot.com/1114005

git-svn-id: http://libyuv.googlecode.com/svn/trunk@578 16f28f9a-4ce2-e073-06de-1de4eb20be90
parent 3c7bb050
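
What the change does: each 32-bit entry of fixed_invtbl8 now packs 1.0 in 8.8 fixed point (0x0100) into its upper 16 bits next to the 1/a reciprocal in the lower 16 bits. The SIMD unattenuate paths can then feed the alpha lane through the same pmulhuw that scales R, G and B (alpha is simply multiplied by 1), which lets them drop the pand/por merge of the original alpha. A minimal scalar sketch of the entry layout and per-pixel math follows; the helper names and the clamp are mine, not libyuv code.

#include <stdint.h>

// 1.0 (8.8 fixed point) in the upper short, ~1/a in the lower short.
#define T(a) (0x01000000u + (0x10000u / (a)))

static uint32_t inv_entry(uint32_t a) {  // hypothetical helper
  if (a == 0) return 0x01000000u;  // no reciprocal; alpha lane still sees 1.0
  if (a == 1) return 0x0100ffffu;  // 0x10000 / 1 would overflow the low short
  return T(a);
}

// Scalar equivalent for one BGRA pixel.
static void unattenuate_pixel(const uint8_t src[4], uint8_t dst[4]) {
  const uint32_t a = src[3];
  if (a) {
    const uint32_t ia = inv_entry(a) & 0xffffu;  // low short: ~65536 / a
    for (int c = 0; c < 3; ++c) {
      const uint32_t v = (src[c] * ia) >> 8;     // ~ src * 256 / a
      dst[c] = v > 255 ? 255 : (uint8_t)v;
    }
  } else {
    dst[0] = src[0]; dst[1] = src[1]; dst[2] = src[2];
  }
  dst[3] = (uint8_t)a;  // alpha * 1.0
}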
Name: libyuv
URL: http://code.google.com/p/libyuv/
-Version: 577
+Version: 578
License: BSD
License File: LICENSE
......
@@ -11,6 +11,6 @@
 #ifndef INCLUDE_LIBYUV_VERSION_H_  // NOLINT
 #define INCLUDE_LIBYUV_VERSION_H_
-#define LIBYUV_VERSION 576
+#define LIBYUV_VERSION 578
 #endif  // INCLUDE_LIBYUV_VERSION_H_  NOLINT
@@ -1525,10 +1525,10 @@ void ARGBAttenuateRow_C(const uint8* src_argb, uint8* dst_argb, int width) {
 // g = (g * 255 + (a / 2)) / a;
 // r = (r * 255 + (a / 2)) / a;
 // Reciprocal method is off by 1 on some values. ie 125
-// 8.16 fixed point inverse table
-#define T(a) 0x10000 / a
+// 8.8 fixed point inverse table with 1.0 in upper short and 1 / a in lower.
+#define T(a) 0x01000000 + (0x10000 / a)
 uint32 fixed_invtbl8[256] = {
-  0xffff, 0xffff, T(0x02), T(0x03), T(0x04), T(0x05), T(0x06), T(0x07),
+  0x01000000, 0x0100ffff, T(0x02), T(0x03), T(0x04), T(0x05), T(0x06), T(0x07),
   T(0x08), T(0x09), T(0x0a), T(0x0b), T(0x0c), T(0x0d), T(0x0e), T(0x0f),
   T(0x10), T(0x11), T(0x12), T(0x13), T(0x14), T(0x15), T(0x16), T(0x17),
   T(0x18), T(0x19), T(0x1a), T(0x1b), T(0x1c), T(0x1d), T(0x1e), T(0x1f),
@@ -1559,7 +1559,7 @@ uint32 fixed_invtbl8[256] = {
   T(0xe0), T(0xe1), T(0xe2), T(0xe3), T(0xe4), T(0xe5), T(0xe6), T(0xe7),
   T(0xe8), T(0xe9), T(0xea), T(0xeb), T(0xec), T(0xed), T(0xee), T(0xef),
   T(0xf0), T(0xf1), T(0xf2), T(0xf3), T(0xf4), T(0xf5), T(0xf6), T(0xf7),
-  T(0xf8), T(0xf9), T(0xfa), T(0xfb), T(0xfc), T(0xfd), T(0xfe), 0x0100 };
+  T(0xf8), T(0xf9), T(0xfa), T(0xfb), T(0xfc), T(0xfd), T(0xfe), 0x01000100 };
 #undef T
 void ARGBUnattenuateRow_C(const uint8* src_argb, uint8* dst_argb, int width) {
@@ -1569,7 +1569,7 @@ void ARGBUnattenuateRow_C(const uint8* src_argb, uint8* dst_argb, int width) {
   uint32 r = src_argb[2];
   const uint32 a = src_argb[3];
   if (a) {
-    const uint32 ia = fixed_invtbl8[a];  // 8.16 fixed point
+    const uint32 ia = fixed_invtbl8[a] & 0xffff;  // 8.16 fixed point
     b = (b * ia) >> 8;
     g = (g * ia) >> 8;
     r = (r * ia) >> 8;
......
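
Why multiplying by the packed 1.0 leaves alpha bit-exact in the SIMD paths below: punpcklbw x, x duplicates each byte, so an alpha byte a becomes the 16-bit word a * 0x0101, and pmulhuw keeps the high 16 bits of the unsigned product. Since a * 0x0101 * 0x0100 = (a << 16) + (a << 8), and a << 8 never carries past bit 15, the high word is exactly a. A quick self-check of that identity (my sketch of the reasoning, not code from the commit):

#include <assert.h>
#include <stdint.h>

int main(void) {
  for (uint32_t a = 0; a < 256; ++a) {
    const uint16_t word = (uint16_t)(a * 0x0101u);  // punpcklbw x, x on byte a
    const uint16_t one = 0x0100;                    // 1.0 in 8.8 fixed point
    const uint16_t out = (uint16_t)(((uint32_t)word * one) >> 16);  // pmulhuw
    assert(out == a);  // alpha passes through the multiply unchanged
  }
  return 0;
}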
@@ -3763,8 +3763,8 @@ void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb,
   "movd 0x0(%4,%3,4),%%xmm2 \n"
   "movzb 0x7(%0),%3 \n"
   "movd 0x0(%4,%3,4),%%xmm3 \n"
-  "pshuflw $0xc0,%%xmm2,%%xmm2 \n"
-  "pshuflw $0xc0,%%xmm3,%%xmm3 \n"
+  "pshuflw $0x40,%%xmm2,%%xmm2 \n"
+  "pshuflw $0x40,%%xmm3,%%xmm3 \n"
   "movlhps %%xmm3,%%xmm2 \n"
   "pmulhuw %%xmm2,%%xmm0 \n"
   "movdqa (%0),%%xmm1 \n"
@@ -3773,14 +3773,11 @@ void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb,
   "movd 0x0(%4,%3,4),%%xmm2 \n"
   "movzb 0xf(%0),%3 \n"
   "movd 0x0(%4,%3,4),%%xmm3 \n"
-  "pshuflw $0xc0,%%xmm2,%%xmm2 \n"
-  "pshuflw $0xc0,%%xmm3,%%xmm3 \n"
+  "pshuflw $0x40,%%xmm2,%%xmm2 \n"
+  "pshuflw $0x40,%%xmm3,%%xmm3 \n"
   "movlhps %%xmm3,%%xmm2 \n"
   "pmulhuw %%xmm2,%%xmm1 \n"
-  "movdqa (%0),%%xmm2 \n"
-  "pand %%xmm4,%%xmm2 \n"
   "packuswb %%xmm1,%%xmm0 \n"
-  "por %%xmm2,%%xmm0 \n"
   "sub $0x4,%2 \n"
   "movdqa %%xmm0,(%0,%1,1) \n"
   "lea 0x10(%0),%0 \n"
......
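
The pshuflw immediate holds two selector bits per destination word. The old 0xc0 (binary 11 00 00 00) picked source words {0, 0, 0, 3}; word 3 of a movd-loaded table entry is zero, so the alpha lane was multiplied by 0 and the now-deleted movdqa/pand/por sequence had to patch the original alpha back in. The new 0x40 (binary 01 00 00 00) picks {0, 0, 0, 1}: three copies of the reciprocal plus the packed 1.0 for the alpha lane. A small emulation of that selection (hypothetical helper, not libyuv code):

#include <stdint.h>
#include <stdio.h>

// Emulates pshuflw's word selection: dst[i] = src[(imm >> 2 * i) & 3].
static void pshuflw4(uint16_t dst[4], const uint16_t src[4], uint8_t imm) {
  for (int i = 0; i < 4; ++i)
    dst[i] = src[(imm >> (2 * i)) & 3];
}

int main(void) {
  // movd zero-extends the table entry, e.g. T(0x80) = 0x01000200 for a = 128.
  const uint16_t entry[4] = {0x0200, 0x0100, 0x0000, 0x0000};
  uint16_t v[4];
  pshuflw4(v, entry, 0x40);  // new: {1/a, 1/a, 1/a, 1.0}
  printf("%04x %04x %04x %04x\n", v[0], v[1], v[2], v[3]);  // 0200 0200 0200 0100
  pshuflw4(v, entry, 0xc0);  // old: {1/a, 1/a, 1/a, 0}, alpha lane zeroed
  printf("%04x %04x %04x %04x\n", v[0], v[1], v[2], v[3]);  // 0200 0200 0200 0000
  return 0;
}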
@@ -4431,8 +4431,8 @@ void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb,
     punpcklbw xmm0, xmm0 // first 2
     movd xmm2, dword ptr fixed_invtbl8[esi * 4]
     movd xmm3, dword ptr fixed_invtbl8[edi * 4]
-    pshuflw xmm2, xmm2, 0C0h // first 4 inv_alpha words
-    pshuflw xmm3, xmm3, 0C0h // next 4 inv_alpha words
+    pshuflw xmm2, xmm2, 040h // first 4 inv_alpha words. 1, a, a, a
+    pshuflw xmm3, xmm3, 040h // next 4 inv_alpha words
     movlhps xmm2, xmm3
     pmulhuw xmm0, xmm2 // rgb * a
@@ -4442,15 +4442,12 @@ void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb,
     punpckhbw xmm1, xmm1 // next 2
     movd xmm2, dword ptr fixed_invtbl8[esi * 4]
     movd xmm3, dword ptr fixed_invtbl8[edi * 4]
-    pshuflw xmm2, xmm2, 0C0h // first 4 inv_alpha words
-    pshuflw xmm3, xmm3, 0C0h // next 4 inv_alpha words
+    pshuflw xmm2, xmm2, 040h // first 4 inv_alpha words
+    pshuflw xmm3, xmm3, 040h // next 4 inv_alpha words
     movlhps xmm2, xmm3
     pmulhuw xmm1, xmm2 // rgb * a
-    movdqa xmm2, [eax] // alphas
-    pand xmm2, xmm4
     packuswb xmm0, xmm1
-    por xmm0, xmm2
     sub ecx, 4
     movdqa [eax + edx], xmm0
     lea eax, [eax + 16]
@@ -4465,10 +4462,8 @@ void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb,
 #ifdef HAS_ARGBUNATTENUATEROW_AVX2
 // Shuffle table duplicating alpha.
 static const ulvec8 kUnattenShuffleAlpha_AVX2 = {
-  0u, 1u, 0u, 1u, 0u, 1u, 128u, 128u,
-  8u, 9u, 8u, 9u, 8u, 9u, 128u, 128u,
-  0u, 1u, 0u, 1u, 0u, 1u, 128u, 128u,
-  8u, 9u, 8u, 9u, 8u, 9u, 128u, 128u,
+  0u, 1u, 0u, 1u, 0u, 1u, 6u, 7u, 8u, 9u, 8u, 9u, 8u, 9u, 14u, 15u,
+  0u, 1u, 0u, 1u, 0u, 1u, 6u, 7u, 8u, 9u, 8u, 9u, 8u, 9u, 14u, 15u,
 };
 __declspec(naked) __declspec(align(16))
 void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb,
@@ -4479,26 +4474,22 @@ void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb,
     mov ecx, [esp + 12] // width
     sub edx, eax
     vmovdqa ymm4, kUnattenShuffleAlpha_AVX2
-    vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0xff000000
-    vpslld ymm5, ymm5, 24
     align 16
 convertloop:
     vmovdqu ymm6, [eax] // read 8 pixels.
-    vpcmpeqb ymm7, ymm7, ymm7 // generate mask 0xffffffff for gather.
+    vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0xffffffff for gather.
     vpsrld ymm2, ymm6, 24 // alpha in low 8 bits.
     vpunpcklbw ymm0, ymm6, ymm6 // low 4 pixels. mutated.
     vpunpckhbw ymm1, ymm6, ymm6 // high 4 pixels. mutated.
-    vpgatherdd ymm3, [ymm2 * 4 + fixed_invtbl8], ymm7 // ymm7 cleared.
-    vpunpcklwd ymm2, ymm3, ymm7 // low 4 inverted alphas. mutated.
-    vpunpckhwd ymm3, ymm3, ymm7 // high 4 inverted alphas. mutated.
-    vpshufb ymm2, ymm2, ymm4 // replicate low 4 alphas
+    vpgatherdd ymm3, [ymm2 * 4 + fixed_invtbl8], ymm5 // ymm5 cleared. 1, a
+    vpunpcklwd ymm2, ymm3, ymm3 // low 4 inverted alphas. mutated. 1, 1, a, a
+    vpunpckhwd ymm3, ymm3, ymm3 // high 4 inverted alphas. mutated.
+    vpshufb ymm2, ymm2, ymm4 // replicate low 4 alphas. 1, a, a, a
     vpshufb ymm3, ymm3, ymm4 // replicate high 4 alphas
     vpmulhuw ymm0, ymm0, ymm2 // rgb * ia
    vpmulhuw ymm1, ymm1, ymm3 // rgb * ia
-    vpand ymm6, ymm6, ymm5 // isolate alpha
     vpackuswb ymm0, ymm0, ymm1 // unmutated.
-    vpor ymm0, ymm0, ymm6 // copy original alpha
     sub ecx, 8
     vmovdqu [eax + edx], ymm0
     lea eax, [eax + 32]
......
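
The AVX2 path arrives at the same {1/a, 1/a, 1/a, 1.0} word layout in two steps: vpgatherdd fetches one {1/a, 1.0} dword per pixel, vpunpcklwd/vpunpckhwd of the gather result with itself doubles that to {1/a, 1/a, 1.0, 1.0}, and the rewritten kUnattenShuffleAlpha_AVX2 then picks bytes {0, 1} three times and {6, 7} once per pixel, so the 0xff000000 mask and the vpand/vpor merge disappear. A byte-level emulation of that final shuffle on one 16-byte half-lane (my sketch of pshufb semantics, not libyuv code):

#include <stdint.h>
#include <stdio.h>

// Emulates a 16-byte pshufb: mask high bit zeroes the byte, else selects.
static void pshufb16(uint8_t d[16], const uint8_t s[16], const uint8_t m[16]) {
  for (int i = 0; i < 16; ++i)
    d[i] = (m[i] & 0x80) ? 0 : s[m[i] & 15];
}

int main(void) {
  // After vpunpcklwd: words {1/a0, 1/a0, 1.0, 1.0, 1/a1, 1/a1, 1.0, 1.0}
  // for a0 = 0x80 (entry 0x01000200) and a1 = 0x40 (entry 0x01000400).
  const uint8_t src[16] = {0x00, 0x02, 0x00, 0x02, 0x00, 0x01, 0x00, 0x01,
                           0x00, 0x04, 0x00, 0x04, 0x00, 0x01, 0x00, 0x01};
  const uint8_t mask[16] = {0u, 1u, 0u, 1u, 0u, 1u, 6u, 7u,
                            8u, 9u, 8u, 9u, 8u, 9u, 14u, 15u};
  uint8_t dst[16];
  pshufb16(dst, src, mask);
  for (int i = 0; i < 16; i += 2)
    printf("%02x%02x ", dst[i + 1], dst[i]);  // little-endian 16-bit words
  printf("\n");  // prints: 0200 0200 0200 0100 0400 0400 0400 0100
  return 0;
}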