Commit 38157bdc authored by fbarchard@google.com

Change Attenuate and Unattenuate to unaligned memory ops.

BUG=279
TEST=ARGBAttenuate_Unaligned
R=nfullagar@google.com, ryanpetrie@google.com

Review URL: https://webrtc-codereview.appspot.com/2709004

git-svn-id: http://libyuv.googlecode.com/svn/trunk@821 16f28f9a-4ce2-e073-06de-1de4eb20be90
parent d2371686
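
The change itself is mechanical: every movdqa (aligned 16-byte load/store, which faults on an unaligned address) in these two row functions becomes movdqu (unaligned), which in turn lets the dispatch code drop its IS_ALIGNED pointer and stride checks. A minimal intrinsics sketch of the same substitution (the function name is illustrative, not from libyuv):

    #include <emmintrin.h>  // SSE2
    #include <stdint.h>

    // movdqa (_mm_load_si128) requires a 16-byte-aligned address and faults
    // otherwise; movdqu (_mm_loadu_si128) accepts any address. On older CPUs
    // movdqu was markedly slower, which is why libyuv kept separate aligned
    // paths; on newer cores the penalty on already-aligned data is small
    // enough that one unaligned path can serve both cases.
    static __m128i load4_pixels(const uint8_t* argb /* any alignment */) {
      return _mm_loadu_si128((const __m128i*)argb);  // was _mm_load_si128
    }
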
README.chromium
 Name: libyuv
 URL: http://code.google.com/p/libyuv/
-Version: 820
+Version: 821
 License: BSD
 License File: LICENSE
include/libyuv/version.h
@@ -11,6 +11,6 @@
 #ifndef INCLUDE_LIBYUV_VERSION_H_  // NOLINT
 #define INCLUDE_LIBYUV_VERSION_H_
-#define LIBYUV_VERSION 820
+#define LIBYUV_VERSION 821
 #endif  // INCLUDE_LIBYUV_VERSION_H_  NOLINT
source/planar_functions.cc
@@ -1134,9 +1134,7 @@ int ARGBAttenuate(const uint8* src_argb, int src_stride_argb,
   }
 #endif
 #if defined(HAS_ARGBATTENUATEROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3) && width >= 4 &&
-      IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) &&
-      IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+  if (TestCpuFlag(kCpuHasSSSE3) && width >= 4) {
     ARGBAttenuateRow = ARGBAttenuateRow_Any_SSSE3;
     if (IS_ALIGNED(width, 4)) {
       ARGBAttenuateRow = ARGBAttenuateRow_SSSE3;
@@ -1191,9 +1189,7 @@ int ARGBUnattenuate(const uint8* src_argb, int src_stride_argb,
   void (*ARGBUnattenuateRow)(const uint8* src_argb, uint8* dst_argb,
                              int width) = ARGBUnattenuateRow_C;
 #if defined(HAS_ARGBUNATTENUATEROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2) && width >= 4 &&
-      IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) &&
-      IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+  if (TestCpuFlag(kCpuHasSSE2) && width >= 4) {
     ARGBUnattenuateRow = ARGBUnattenuateRow_Any_SSE2;
     if (IS_ALIGNED(width, 4)) {
       ARGBUnattenuateRow = ARGBUnattenuateRow_SSE2;
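
Both hunks keep libyuv's usual three-tier row dispatch: the C fallback, an _Any_ variant for widths that are not a multiple of 4, and the full SIMD row when the width is. With the alignment predicates gone, only the CPU flag and the width matter. A standalone sketch of the pattern with simplified stand-in rows (the real kernels live in the row_*.cc files; names and bodies here are illustrative):

    #include <stdint.h>

    #define IS_ALIGNED(p, a) ((((uintptr_t)(p)) & ((a)-1)) == 0)

    // Simplified stand-in for the C reference row (the real one also uses
    // a >> 8 approximation of rgb * a / 255).
    static void AttenuateRow_C(const uint8_t* src, uint8_t* dst, int width) {
      for (int i = 0; i < width; ++i) {
        uint32_t a = src[i * 4 + 3];
        for (int c = 0; c < 3; ++c)
          dst[i * 4 + c] = (uint8_t)((src[i * 4 + c] * a) >> 8);
        dst[i * 4 + 3] = (uint8_t)a;
      }
    }
    static void AttenuateRow_SIMD(const uint8_t* s, uint8_t* d, int w) {
      AttenuateRow_C(s, d, w);  // pretend this is the 4-pixel SSSE3 loop
    }
    // "_Any_" wrapper: SIMD on the largest multiple of 4 pixels, then the
    // C row mops up the 1-3 leftover pixels.
    static void AttenuateRow_Any(const uint8_t* s, uint8_t* d, int w) {
      int n = w & ~3;
      if (n > 0) AttenuateRow_SIMD(s, d, n);
      if (w & 3) AttenuateRow_C(s + n * 4, d + n * 4, w & 3);
    }

    // Dispatch mirrors the hunks above: pick _Any_ whenever SIMD is usable,
    // then upgrade to the full row if width is already a multiple of 4.
    typedef void (*RowFn)(const uint8_t*, uint8_t*, int);
    static RowFn PickRow(int has_simd, int width) {
      RowFn row = AttenuateRow_C;
      if (has_simd && width >= 4) {
        row = AttenuateRow_Any;
        if (IS_ALIGNED(width, 4)) row = AttenuateRow_SIMD;
      }
      return row;
    }
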
source/row_posix.cc
@@ -4117,17 +4117,17 @@ void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
     // 4 pixel loop.
     ".p2align  4                               \n"
   "1:                                          \n"
-    "movdqa    "MEMACCESS(0)",%%xmm0           \n"
+    "movdqu    "MEMACCESS(0)",%%xmm0           \n"
     "pshufb    %%xmm4,%%xmm0                   \n"
-    "movdqa    "MEMACCESS(0)",%%xmm1           \n"
+    "movdqu    "MEMACCESS(0)",%%xmm1           \n"
     "punpcklbw %%xmm1,%%xmm1                   \n"
     "pmulhuw   %%xmm1,%%xmm0                   \n"
-    "movdqa    "MEMACCESS(0)",%%xmm1           \n"
+    "movdqu    "MEMACCESS(0)",%%xmm1           \n"
     "pshufb    %%xmm5,%%xmm1                   \n"
-    "movdqa    "MEMACCESS(0)",%%xmm2           \n"
+    "movdqu    "MEMACCESS(0)",%%xmm2           \n"
     "punpckhbw %%xmm2,%%xmm2                   \n"
     "pmulhuw   %%xmm2,%%xmm1                   \n"
-    "movdqa    "MEMACCESS(0)",%%xmm2           \n"
+    "movdqu    "MEMACCESS(0)",%%xmm2           \n"
     "lea       "MEMLEA(0x10,0)",%0             \n"
     "pand      %%xmm3,%%xmm2                   \n"
     "psrlw     $0x8,%%xmm0                     \n"
@@ -4135,7 +4135,7 @@ void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
     "packuswb  %%xmm1,%%xmm0                   \n"
     "por       %%xmm2,%%xmm0                   \n"
     "sub       $0x4,%2                         \n"
-    "movdqa    %%xmm0,"MEMACCESS(1)"           \n"
+    "movdqu    %%xmm0,"MEMACCESS(1)"           \n"
     "lea       "MEMLEA(0x10,1)",%1             \n"
     "jg        1b                              \n"
   : "+r"(src_argb),    // %0
@@ -4161,7 +4161,7 @@ void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb,
     // 4 pixel loop.
     ".p2align  4                               \n"
   "1:                                          \n"
-    "movdqa    "MEMACCESS(0)",%%xmm0           \n"
+    "movdqu    "MEMACCESS(0)",%%xmm0           \n"
     "movzb     "MEMACCESS2(0x03,0)",%3         \n"
     "punpcklbw %%xmm0,%%xmm0                   \n"
     MEMOPREG(movd,0x00,4,3,4,xmm2)             //  movd      0x0(%4,%3,4),%%xmm2
@@ -4171,7 +4171,7 @@ void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb,
     "pshuflw   $0x40,%%xmm3,%%xmm3             \n"
     "movlhps   %%xmm3,%%xmm2                   \n"
     "pmulhuw   %%xmm2,%%xmm0                   \n"
-    "movdqa    "MEMACCESS(0)",%%xmm1           \n"
+    "movdqu    "MEMACCESS(0)",%%xmm1           \n"
     "movzb     "MEMACCESS2(0x0b,0)",%3         \n"
     "punpckhbw %%xmm1,%%xmm1                   \n"
     MEMOPREG(movd,0x00,4,3,4,xmm2)             //  movd      0x0(%4,%3,4),%%xmm2
@@ -4184,7 +4184,7 @@ void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb,
     "lea       "MEMLEA(0x10,0)",%0             \n"
     "packuswb  %%xmm1,%%xmm0                   \n"
     "sub       $0x4,%2                         \n"
-    "movdqa    %%xmm0,"MEMACCESS(1)"           \n"
+    "movdqu    %%xmm0,"MEMACCESS(1)"           \n"
     "lea       "MEMLEA(0x10,1)",%1             \n"
     "jg        1b                              \n"
   : "+r"(src_argb),    // %0
source/row_win.cc
@@ -4586,7 +4586,7 @@ void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
     mov        esi, [esp + 4 + 8]   // src_argb1
     mov        edx, [esp + 4 + 12]  // dst_argb
     mov        ecx, [esp + 4 + 16]  // width
-    pcmpeqb    xmm7, xmm7       // generate constant 1
+    pcmpeqb    xmm7, xmm7       // generate constant 0x0001
     psrlw      xmm7, 15
     pcmpeqb    xmm6, xmm6       // generate mask 0x00ff00ff
     psrlw      xmm6, 8
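
This comment-only fix in ARGBBlendRow_SSSE3 documents a common SSE idiom: pcmpeqb reg,reg sets all 128 bits without touching memory, and a word shift then carves out the constant; shifting each 16-bit lane right by 15 leaves 0x0001 per word (so "constant 1" undersold it), and by 8 leaves the 0x00ff byte mask used just below. The same idiom in intrinsics (a sketch, not libyuv code):

    #include <emmintrin.h>

    // pcmpeqb x,x -> all ones; no constant-pool load needed.
    static __m128i AllOnes(void) {
      __m128i z = _mm_setzero_si128();
      return _mm_cmpeq_epi8(z, z);
    }
    static __m128i ConstWord0001(void) {     // psrlw reg, 15
      return _mm_srli_epi16(AllOnes(), 15);  // { 1 x 8 words }
    }
    static __m128i Mask00ff(void) {          // psrlw reg, 8
      return _mm_srli_epi16(AllOnes(), 8);   // { 0x00ff x 8 words }
    }
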
@@ -4788,17 +4788,17 @@ void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
     align      16
  convertloop:
-    movdqa     xmm0, [eax]      // read 4 pixels
+    movdqu     xmm0, [eax]      // read 4 pixels
     pshufb     xmm0, xmm4       // isolate first 2 alphas
-    movdqa     xmm1, [eax]      // read 4 pixels
+    movdqu     xmm1, [eax]      // read 4 pixels
     punpcklbw  xmm1, xmm1       // first 2 pixel rgbs
     pmulhuw    xmm0, xmm1       // rgb * a
-    movdqa     xmm1, [eax]      // read 4 pixels
+    movdqu     xmm1, [eax]      // read 4 pixels
     pshufb     xmm1, xmm5       // isolate next 2 alphas
-    movdqa     xmm2, [eax]      // read 4 pixels
+    movdqu     xmm2, [eax]      // read 4 pixels
     punpckhbw  xmm2, xmm2       // next 2 pixel rgbs
     pmulhuw    xmm1, xmm2       // rgb * a
-    movdqa     xmm2, [eax]      // mask original alpha
+    movdqu     xmm2, [eax]      // mask original alpha
     lea        eax, [eax + 16]
     pand       xmm2, xmm3
     psrlw      xmm0, 8
@@ -4806,7 +4806,7 @@ void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
     packuswb   xmm0, xmm1
     por        xmm0, xmm2       // copy original alpha
     sub        ecx, 4
-    movdqa     [edx], xmm0
+    movdqu     [edx], xmm0
     lea        edx, [edx + 16]
     jg         convertloop
@@ -4874,7 +4874,7 @@ void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb,
     align      16
  convertloop:
-    movdqa     xmm0, [eax]      // read 4 pixels
+    movdqu     xmm0, [eax]      // read 4 pixels
     movzx      esi, byte ptr [eax + 3]  // first alpha
     movzx      edi, byte ptr [eax + 7]  // second alpha
     punpcklbw  xmm0, xmm0       // first 2
@@ -4885,7 +4885,7 @@ void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb,
     movlhps    xmm2, xmm3
     pmulhuw    xmm0, xmm2       // rgb * a
-    movdqa     xmm1, [eax]      // read 4 pixels
+    movdqu     xmm1, [eax]      // read 4 pixels
     movzx      esi, byte ptr [eax + 11]  // third alpha
     movzx      edi, byte ptr [eax + 15]  // fourth alpha
     punpckhbw  xmm1, xmm1       // next 2
@@ -4899,7 +4899,7 @@ void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb,
     packuswb   xmm0, xmm1
     sub        ecx, 4
-    movdqa     [edx], xmm0
+    movdqu     [edx], xmm0
     lea        edx, [edx + 16]
     jg         convertloop
     pop        edi