Commit 787f8276 authored by fbarchard@google.com

Unattenuate: multiply alpha by 1.

BUG=190
TESTED=planar_test
Review URL: https://webrtc-codereview.appspot.com/1114005

git-svn-id: http://libyuv.googlecode.com/svn/trunk@578 16f28f9a-4ce2-e073-06de-1de4eb20be90
parent 3c7bb050
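
The point of the change: each entry of the reciprocal table now carries 1.0 in 8.8 fixed point (0x0100) in its upper short, so the SIMD multiply (pmulhuw) scales R, G and B by 256/a while multiplying the byte-duplicated alpha word by exactly 1, which lets the rows drop the mask-and-OR that previously restored the original alpha. A standalone check of that alpha identity (my sketch, not libyuv code):

#include <assert.h>
#include <stdint.h>

int main(void) {
  // punpcklbw x,x duplicates each byte into a word: a -> (a << 8) | a.
  // pmulhuw keeps the high 16 bits of word * 0x0100 (1.0 in 8.8 fixed point):
  // ((a * 257) * 256) >> 16 == a for every byte value, so alpha copies through.
  for (uint32_t a = 0; a < 256; ++a) {
    uint32_t word = (a << 8) | a;
    assert(((word * 0x0100u) >> 16) == a);
  }
  return 0;
}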
Name: libyuv
URL: http://code.google.com/p/libyuv/
-Version: 577
+Version: 578
License: BSD
License File: LICENSE
@@ -11,6 +11,6 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT
#define INCLUDE_LIBYUV_VERSION_H_
-#define LIBYUV_VERSION 576
+#define LIBYUV_VERSION 578
#endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT
@@ -1525,10 +1525,10 @@ void ARGBAttenuateRow_C(const uint8* src_argb, uint8* dst_argb, int width) {
// g = (g * 255 + (a / 2)) / a;
// r = (r * 255 + (a / 2)) / a;
// Reciprocal method is off by 1 on some values, e.g. 125.
-// 8.16 fixed point inverse table
-#define T(a) 0x10000 / a
+// 8.8 fixed point inverse table with 1.0 in upper short and 1 / a in lower.
+#define T(a) 0x01000000 + (0x10000 / a)
uint32 fixed_invtbl8[256] = {
-0xffff, 0xffff, T(0x02), T(0x03), T(0x04), T(0x05), T(0x06), T(0x07),
+0x01000000, 0x0100ffff, T(0x02), T(0x03), T(0x04), T(0x05), T(0x06), T(0x07),
T(0x08), T(0x09), T(0x0a), T(0x0b), T(0x0c), T(0x0d), T(0x0e), T(0x0f),
T(0x10), T(0x11), T(0x12), T(0x13), T(0x14), T(0x15), T(0x16), T(0x17),
T(0x18), T(0x19), T(0x1a), T(0x1b), T(0x1c), T(0x1d), T(0x1e), T(0x1f),
@@ -1559,7 +1559,7 @@ uint32 fixed_invtbl8[256] = {
T(0xe0), T(0xe1), T(0xe2), T(0xe3), T(0xe4), T(0xe5), T(0xe6), T(0xe7),
T(0xe8), T(0xe9), T(0xea), T(0xeb), T(0xec), T(0xed), T(0xee), T(0xef),
T(0xf0), T(0xf1), T(0xf2), T(0xf3), T(0xf4), T(0xf5), T(0xf6), T(0xf7),
-T(0xf8), T(0xf9), T(0xfa), T(0xfb), T(0xfc), T(0xfd), T(0xfe), 0x0100 };
+T(0xf8), T(0xf9), T(0xfa), T(0xfb), T(0xfc), T(0xfd), T(0xfe), 0x01000100 };
#undef T
void ARGBUnattenuateRow_C(const uint8* src_argb, uint8* dst_argb, int width) {
@@ -1569,7 +1569,7 @@ void ARGBUnattenuateRow_C(const uint8* src_argb, uint8* dst_argb, int width) {
uint32 r = src_argb[2];
const uint32 a = src_argb[3];
if (a) {
-const uint32 ia = fixed_invtbl8[a]; // 8.16 fixed point
+const uint32 ia = fixed_invtbl8[a] & 0xffff; // 8.8 fixed point
b = (b * ia) >> 8;
g = (g * ia) >> 8;
r = (r * ia) >> 8;
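
For reference, a scalar sketch of the new per-pixel flow, assuming the table layout above (helper names are mine, not libyuv's). It models the SIMD rows, which always go through the table; the C row instead skips the multiply entirely when a is 0.

#include <stdint.h>

// Table entry: 1.0 (0x0100) in the upper short, 65536 / a in the lower,
// mirroring fixed_invtbl8. Entries 0 and 1 are special-cased as in the diff.
static uint32_t inv_entry(uint8_t a) {
  if (a == 0) return 0x01000000u;
  if (a == 1) return 0x0100ffffu;  // 0x10000 / 1 saturated to a short
  return 0x01000000u + (0x10000u / a);
}

// One pmulhuw lane: the byte is duplicated into a word, and the high 16 bits
// of the product are kept; the clamp models the later packuswb saturation.
static uint8_t mulhuw_lane(uint8_t c, uint16_t f) {
  uint32_t v = ((((uint32_t)c << 8) | c) * f) >> 16;
  return (uint8_t)(v > 255 ? 255 : v);
}

static void unattenuate_pixel(const uint8_t bgra[4], uint8_t out[4]) {
  uint32_t e = inv_entry(bgra[3]);
  uint16_t inv = (uint16_t)(e & 0xffff);  // 256 / a in 8.8 fixed point
  uint16_t one = (uint16_t)(e >> 16);     // exactly 1.0
  out[0] = mulhuw_lane(bgra[0], inv);
  out[1] = mulhuw_lane(bgra[1], inv);
  out[2] = mulhuw_lane(bgra[2], inv);
  out[3] = mulhuw_lane(bgra[3], one);     // alpha copies through unchanged
}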
@@ -3763,8 +3763,8 @@ void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb,
"movd 0x0(%4,%3,4),%%xmm2 \n"
"movzb 0x7(%0),%3 \n"
"movd 0x0(%4,%3,4),%%xmm3 \n"
"pshuflw $0xc0,%%xmm2,%%xmm2 \n"
"pshuflw $0xc0,%%xmm3,%%xmm3 \n"
"pshuflw $0x40,%%xmm2,%%xmm2 \n"
"pshuflw $0x40,%%xmm3,%%xmm3 \n"
"movlhps %%xmm3,%%xmm2 \n"
"pmulhuw %%xmm2,%%xmm0 \n"
"movdqa (%0),%%xmm1 \n"
@@ -3773,14 +3773,11 @@ void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb,
"movd 0x0(%4,%3,4),%%xmm2 \n"
"movzb 0xf(%0),%3 \n"
"movd 0x0(%4,%3,4),%%xmm3 \n"
"pshuflw $0xc0,%%xmm2,%%xmm2 \n"
"pshuflw $0xc0,%%xmm3,%%xmm3 \n"
"pshuflw $0x40,%%xmm2,%%xmm2 \n"
"pshuflw $0x40,%%xmm3,%%xmm3 \n"
"movlhps %%xmm3,%%xmm2 \n"
"pmulhuw %%xmm2,%%xmm1 \n"
"movdqa (%0),%%xmm2 \n"
"pand %%xmm4,%%xmm2 \n"
"packuswb %%xmm1,%%xmm0 \n"
"por %%xmm2,%%xmm0 \n"
"sub $0x4,%2 \n"
"movdqa %%xmm0,(%0,%1,1) \n"
"lea 0x10(%0),%0 \n"
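
The only functional change in the inline asm is the pshuflw immediate. Its byte encodes four 2-bit source-word indices, lowest field first: the old 0xC0 (11 00 00 00) built {inv, inv, inv, w3} with w3 zero, forcing the extra movdqa/pand/por to restore alpha; the new 0x40 (01 00 00 00) builds {inv, inv, inv, 1.0} from the two words of the table entry. A small model of the selection, assuming movd loaded the entry as words {inv, 0x0100, 0, 0} (my sketch, not libyuv code):

#include <stdint.h>

// Model pshuflw: destination word i takes the source word selected by the
// i-th 2-bit field of imm (low 4 words only).
static void pshuflw4(uint16_t dst[4], const uint16_t src[4], uint8_t imm) {
  for (int i = 0; i < 4; ++i) {
    dst[i] = src[(imm >> (2 * i)) & 3];
  }
}

// With src = {inv, 0x0100, 0, 0} from movd of a fixed_invtbl8 entry:
//   imm 0xC0 -> {inv, inv, inv, 0}      (old: alpha lane multiplied by 0)
//   imm 0x40 -> {inv, inv, inv, 0x0100} (new: alpha lane multiplied by 1.0)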
@@ -4431,8 +4431,8 @@ void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb,
punpcklbw xmm0, xmm0 // first 2
movd xmm2, dword ptr fixed_invtbl8[esi * 4]
movd xmm3, dword ptr fixed_invtbl8[edi * 4]
-pshuflw xmm2, xmm2,0C0h // first 4 inv_alpha words
-pshuflw xmm3, xmm3,0C0h // next 4 inv_alpha words
+pshuflw xmm2, xmm2,040h // first 4 inv_alpha words. 1, a, a, a
+pshuflw xmm3, xmm3,040h // next 4 inv_alpha words
movlhps xmm2, xmm3
pmulhuw xmm0, xmm2 // rgb * a
@@ -4442,15 +4442,12 @@ void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb,
punpckhbw xmm1, xmm1 // next 2
movd xmm2, dword ptr fixed_invtbl8[esi * 4]
movd xmm3, dword ptr fixed_invtbl8[edi * 4]
-pshuflw xmm2, xmm2,0C0h // first 4 inv_alpha words
-pshuflw xmm3, xmm3,0C0h // next 4 inv_alpha words
+pshuflw xmm2, xmm2,040h // first 4 inv_alpha words
+pshuflw xmm3, xmm3,040h // next 4 inv_alpha words
movlhps xmm2, xmm3
pmulhuw xmm1, xmm2 // rgb * a
-movdqa xmm2, [eax] // alphas
-pand xmm2, xmm4
packuswb xmm0, xmm1
-por xmm0, xmm2
sub ecx, 4
movdqa [eax + edx], xmm0
lea eax, [eax + 16]
@@ -4465,10 +4462,8 @@ void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb,
#ifdef HAS_ARGBUNATTENUATEROW_AVX2
// Shuffle table duplicating alpha.
static const ulvec8 kUnattenShuffleAlpha_AVX2 = {
-0u, 1u, 0u, 1u, 0u, 1u, 128u, 128u,
-8u, 9u, 8u, 9u, 8u, 9u, 128u, 128u,
-0u, 1u, 0u, 1u, 0u, 1u, 128u, 128u,
-8u, 9u, 8u, 9u, 8u, 9u, 128u, 128u,
+0u, 1u, 0u, 1u, 0u, 1u, 6u, 7u, 8u, 9u, 8u, 9u, 8u, 9u, 14u, 15u,
+0u, 1u, 0u, 1u, 0u, 1u, 6u, 7u, 8u, 9u, 8u, 9u, 8u, 9u, 14u, 15u,
};
__declspec(naked) __declspec(align(16))
void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb,
@@ -4479,26 +4474,22 @@ void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb,
mov ecx, [esp + 12] // width
sub edx, eax
vmovdqa ymm4, kUnattenShuffleAlpha_AVX2
-vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0xff000000
-vpslld ymm5, ymm5, 24
align 16
convertloop:
vmovdqu ymm6, [eax] // read 8 pixels.
-vpcmpeqb ymm7, ymm7, ymm7 // generate mask 0xffffffff for gather.
+vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0xffffffff for gather.
vpsrld ymm2, ymm6, 24 // alpha in low 8 bits.
vpunpcklbw ymm0, ymm6, ymm6 // low 4 pixels. mutated.
vpunpckhbw ymm1, ymm6, ymm6 // high 4 pixels. mutated.
-vpgatherdd ymm3, [ymm2 * 4 + fixed_invtbl8], ymm7 // ymm7 cleared.
-vpunpcklwd ymm2, ymm3, ymm7 // low 4 inverted alphas. mutated.
-vpunpckhwd ymm3, ymm3, ymm7 // high 4 inverted alphas. mutated.
-vpshufb ymm2, ymm2, ymm4 // replicate low 4 alphas
+vpgatherdd ymm3, [ymm2 * 4 + fixed_invtbl8], ymm5 // ymm5 cleared. 1, a
+vpunpcklwd ymm2, ymm3, ymm3 // low 4 inverted alphas. mutated. 1, 1, a, a
+vpunpckhwd ymm3, ymm3, ymm3 // high 4 inverted alphas. mutated.
+vpshufb ymm2, ymm2, ymm4 // replicate low 4 alphas. 1, a, a, a
vpshufb ymm3, ymm3, ymm4 // replicate high 4 alphas
vpmulhuw ymm0, ymm0, ymm2 // rgb * ia
vpmulhuw ymm1, ymm1, ymm3 // rgb * ia
-vpand ymm6, ymm6, ymm5 // isolate alpha
vpackuswb ymm0, ymm0, ymm1 // unmutated.
-vpor ymm0, ymm0, ymm6 // copy original alpha
sub ecx, 8
vmovdqu [eax + edx], ymm0
lea eax, [eax + 32]
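
The AVX2 path gets the same effect through its vpshufb table. After vpunpcklwd ymm2, ymm3, ymm3, each gathered entry spans four words per pixel as {inv, inv, 1.0, 1.0}; byte indices 6,7 and 14,15 now pick the duplicated 1.0 word for the alpha lane, where the old table wrote 128 (which vpshufb turns into zero). That is what allows deleting the 0xff000000 mask setup and the vpand/vpor merge, and it frees ymm7 so ymm5 can serve as the gather mask. A per-lane model of the byte select (my sketch, not libyuv code):

#include <stdint.h>

// Model one 128-bit lane of vpshufb: each table byte selects a source byte;
// a table byte with bit 7 set (e.g. the old 128u entries) writes zero instead.
static void vpshufb_lane(uint8_t dst[16], const uint8_t src[16],
                         const uint8_t tbl[16]) {
  for (int i = 0; i < 16; ++i) {
    dst[i] = (tbl[i] & 0x80) ? 0 : src[tbl[i] & 15];
  }
}

// With src holding words {inv0, inv0, 1.0, 1.0, inv1, inv1, 1.0, 1.0}, the
// new table yields {inv0, inv0, inv0, 1.0, inv1, inv1, inv1, 1.0}: three
// reciprocal words for B, G, R and a 1.0 multiplier for A, per pixel.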