Commit 008ecea4 authored by fbarchard@google.com

NaCL port of Attenuate

BUG=253
TEST=out\release\libyuv_unittest --gtest_filter=*Attenuate*
R=nfullagar@chromium.org

Review URL: https://webrtc-codereview.appspot.com/1970004

git-svn-id: http://libyuv.googlecode.com/svn/trunk@745 16f28f9a-4ce2-e073-06de-1de4eb20be90
parent f8a86cb0
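
Background for the diff below: under the NaCL x86-64 sandbox, every memory
operand in inline asm must be encoded as a 32-bit offset from the reserved
base register r15. This port therefore replaces literal "(%0)" operands with
the MEMACCESS()/MEMLEA() wrapper macros from row.h, and drops the
"sub dst,src" trick, since the resulting two-register store "(%0,%1,1)" has
no sandboxed encoding. A rough sketch of how the wrappers expand, paraphrased
from the row.h of this era (exact spellings in the tree may differ):

#if defined(__native_client__) && defined(__x86_64__)
// NaCL x86-64: loads/stores go through the "%%nacl:" pseudo-prefix as a
// 32-bit offset from the reserved sandbox base register r15.
#define MEMACCESS(base) "%%nacl:(%%r15,%q" #base ")"
#define MEMLEA(offset, base) #offset "(%q" #base ")"
#else
// Other targets: the macros expand to ordinary addressing, so the same
// asm text builds unchanged off NaCL.
#define MEMACCESS(base) "(%" #base ")"
#define MEMLEA(offset, base) #offset "(%" #base ")"
#endif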
--- a/README.chromium
+++ b/README.chromium
 Name: libyuv
 URL: http://code.google.com/p/libyuv/
-Version: 744
+Version: 745
 License: BSD
 License File: LICENSE
--- a/include/libyuv/row.h
+++ b/include/libyuv/row.h
@@ -39,6 +39,7 @@ extern "C" {
 #if !defined(LIBYUV_DISABLE_X86) && \
     (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__))
 #define HAS_ARGBBLENDROW_SSSE3
+#define HAS_ARGBATTENUATEROW_SSSE3
 #endif
 // The following are available on all x86 platforms except NaCL x64:
@@ -114,7 +115,6 @@ extern "C" {
 // Effects
 #define HAS_ARGBADDROW_SSE2
 #define HAS_ARGBAFFINEROW_SSE2
-#define HAS_ARGBATTENUATEROW_SSSE3
 #define HAS_ARGBCOLORMATRIXROW_SSSE3
 #define HAS_ARGBGRAYROW_SSSE3
 #define HAS_ARGBMIRRORROW_SSSE3
@@ -188,8 +188,8 @@ extern "C" {
     !defined(LIBYUV_SSSE3_ONLY)
 // Available with NaCL:
 #define HAS_ARGBBLENDROW_SSE2
-#if !(defined(__native_client__) && defined(__x86_64__))
 #define HAS_ARGBATTENUATEROW_SSE2
+#if !(defined(__native_client__) && defined(__x86_64__))
 #define HAS_MIRRORROW_SSE2
 #endif
 #endif
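
With the SSE2/SSSE3 rows now sandbox-safe, their HAS_ defines move out of the
NaCL-x64 exclusion block above, so the runtime dispatcher can select them on
NaCL as well. A minimal sketch of the selection pattern these defines gate
(PickAttenuateRow is a hypothetical name for illustration; the real logic
lives in ARGBAttenuate() in source/planar_functions.cc):

typedef unsigned char uint8;  // libyuv's basic type, abbreviated here

void ARGBAttenuateRow_C(const uint8* src_argb, uint8* dst_argb, int width);
void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width);

typedef void (*AttenuateRowFn)(const uint8* src, uint8* dst, int width);

// Start from the portable C row; upgrade only when the build defined the
// HAS_ macro for this target and the CPU reports the feature at runtime.
static AttenuateRowFn PickAttenuateRow(int cpu_has_ssse3, int width) {
  AttenuateRowFn row = ARGBAttenuateRow_C;
#if defined(HAS_ARGBATTENUATEROW_SSSE3)
  if (cpu_has_ssse3 && width >= 4) {  // SSSE3 loop does 4 pixels per pass
    row = ARGBAttenuateRow_SSSE3;
  }
#endif
  return row;
}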
--- a/include/libyuv/version.h
+++ b/include/libyuv/version.h
@@ -11,6 +11,6 @@
 #ifndef INCLUDE_LIBYUV_VERSION_H_  // NOLINT
 #define INCLUDE_LIBYUV_VERSION_H_
-#define LIBYUV_VERSION 744
+#define LIBYUV_VERSION 745
 #endif  // INCLUDE_LIBYUV_VERSION_H_ NOLINT
--- a/source/row_posix.cc
+++ b/source/row_posix.cc
@@ -3770,7 +3770,6 @@ void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
 // aligned to 16 bytes
 void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) {
   asm volatile (
-    "sub        %0,%1                          \n"
     "pcmpeqb    %%xmm4,%%xmm4                  \n"
     "pslld      $0x18,%%xmm4                   \n"
     "pcmpeqb    %%xmm5,%%xmm5                  \n"
@@ -3779,17 +3778,18 @@ void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) {
     // 4 pixel loop.
     ".p2align   4                              \n"
   "1:                                          \n"
-    "movdqa     (%0),%%xmm0                    \n"
+    "movdqa     "MEMACCESS(0)",%%xmm0          \n"
     "punpcklbw  %%xmm0,%%xmm0                  \n"
     "pshufhw    $0xff,%%xmm0,%%xmm2            \n"
     "pshuflw    $0xff,%%xmm2,%%xmm2            \n"
     "pmulhuw    %%xmm2,%%xmm0                  \n"
-    "movdqa     (%0),%%xmm1                    \n"
+    "movdqa     "MEMACCESS(0)",%%xmm1          \n"
     "punpckhbw  %%xmm1,%%xmm1                  \n"
     "pshufhw    $0xff,%%xmm1,%%xmm2            \n"
     "pshuflw    $0xff,%%xmm2,%%xmm2            \n"
     "pmulhuw    %%xmm2,%%xmm1                  \n"
-    "movdqa     (%0),%%xmm2                    \n"
+    "movdqa     "MEMACCESS(0)",%%xmm2          \n"
+    "lea        "MEMLEA(0x10,0)",%0            \n"
     "psrlw      $0x8,%%xmm0                    \n"
     "pand       %%xmm4,%%xmm2                  \n"
     "psrlw      $0x8,%%xmm1                    \n"
@@ -3797,8 +3797,8 @@ void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) {
     "pand       %%xmm5,%%xmm0                  \n"
     "por        %%xmm2,%%xmm0                  \n"
     "sub        $0x4,%2                        \n"
-    "movdqa     %%xmm0,(%0,%1,1)               \n"
-    "lea        0x10(%0),%0                    \n"
+    "movdqa     %%xmm0,"MEMACCESS(1)"          \n"
+    "lea        "MEMLEA(0x10,1)",%1            \n"
     "jg         1b                             \n"
   : "+r"(src_argb),   // %0
     "+r"(dst_argb),   // %1
@@ -3825,7 +3825,6 @@ static uvec8 kShuffleAlpha1 = {
 // aligned to 16 bytes
 void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
   asm volatile (
-    "sub        %0,%1                          \n"
     "pcmpeqb    %%xmm3,%%xmm3                  \n"
     "pslld      $0x18,%%xmm3                   \n"
     "movdqa     %3,%%xmm4                      \n"
@@ -3834,25 +3833,26 @@ void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
     // 4 pixel loop.
     ".p2align   4                              \n"
   "1:                                          \n"
-    "movdqa     (%0),%%xmm0                    \n"
+    "movdqa     "MEMACCESS(0)",%%xmm0          \n"
     "pshufb     %%xmm4,%%xmm0                  \n"
-    "movdqa     (%0),%%xmm1                    \n"
+    "movdqa     "MEMACCESS(0)",%%xmm1          \n"
     "punpcklbw  %%xmm1,%%xmm1                  \n"
     "pmulhuw    %%xmm1,%%xmm0                  \n"
-    "movdqa     (%0),%%xmm1                    \n"
+    "movdqa     "MEMACCESS(0)",%%xmm1          \n"
     "pshufb     %%xmm5,%%xmm1                  \n"
-    "movdqa     (%0),%%xmm2                    \n"
+    "movdqa     "MEMACCESS(0)",%%xmm2          \n"
     "punpckhbw  %%xmm2,%%xmm2                  \n"
     "pmulhuw    %%xmm2,%%xmm1                  \n"
-    "movdqa     (%0),%%xmm2                    \n"
+    "movdqa     "MEMACCESS(0)",%%xmm2          \n"
+    "lea        "MEMLEA(0x10,0)",%0            \n"
     "pand       %%xmm3,%%xmm2                  \n"
     "psrlw      $0x8,%%xmm0                    \n"
     "psrlw      $0x8,%%xmm1                    \n"
     "packuswb   %%xmm1,%%xmm0                  \n"
     "por        %%xmm2,%%xmm0                  \n"
     "sub        $0x4,%2                        \n"
-    "movdqa     %%xmm0,(%0,%1,1)               \n"
-    "lea        0x10(%0),%0                    \n"
+    "movdqa     %%xmm0,"MEMACCESS(1)"          \n"
+    "lea        "MEMLEA(0x10,1)",%1            \n"
     "jg         1b                             \n"
   : "+r"(src_argb),   // %0
     "+r"(dst_argb),   // %1
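
For reference, the computation both loops above implement: each color channel
is scaled by the pixel's own alpha while the alpha byte passes through
unchanged. A minimal C equivalent of the 4-pixel SIMD loops (AttenuateRow_Ref
is a hypothetical name for illustration; the portable version in the tree is
ARGBAttenuateRow_C, and this sketch uses the same >>8 scaling the asm applies
after pmulhuw):

typedef unsigned char uint8;
typedef unsigned int uint32;

void AttenuateRow_Ref(const uint8* src_argb, uint8* dst_argb, int width) {
  int i;
  for (i = 0; i < width; ++i) {
    uint32 b = src_argb[0];
    uint32 g = src_argb[1];
    uint32 r = src_argb[2];
    uint32 a = src_argb[3];
    dst_argb[0] = (uint8)((b * a) >> 8);  // premultiply blue
    dst_argb[1] = (uint8)((g * a) >> 8);  // premultiply green
    dst_argb[2] = (uint8)((r * a) >> 8);  // premultiply red
    dst_argb[3] = (uint8)a;               // original alpha kept
    src_argb += 4;
    dst_argb += 4;
  }
}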
--- a/source/row_win.cc
+++ b/source/row_win.cc
@@ -4624,7 +4624,6 @@ void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) {
     mov        eax, [esp + 4]   // src_argb0
     mov        edx, [esp + 8]   // dst_argb
     mov        ecx, [esp + 12]  // width
-    sub        edx, eax
     pcmpeqb    xmm4, xmm4       // generate mask 0xff000000
     pslld      xmm4, 24
     pcmpeqb    xmm5, xmm5       // generate mask 0x00ffffff
@@ -4643,6 +4642,7 @@ void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) {
     pshuflw    xmm2, xmm2, 0FFh
     pmulhuw    xmm1, xmm2       // rgb * a
     movdqa     xmm2, [eax]      // alphas
+    lea        eax, [eax + 16]
     psrlw      xmm0, 8
     pand       xmm2, xmm4
     psrlw      xmm1, 8
@@ -4650,8 +4650,8 @@ void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) {
     pand       xmm0, xmm5       // keep original alphas
     por        xmm0, xmm2
     sub        ecx, 4
-    movdqa     [eax + edx], xmm0
-    lea        eax, [eax + 16]
+    movdqa     [edx], xmm0
+    lea        edx, [edx + 16]
     jg         convertloop
     ret
@@ -4674,7 +4674,6 @@ void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
     mov        eax, [esp + 4]   // src_argb0
     mov        edx, [esp + 8]   // dst_argb
     mov        ecx, [esp + 12]  // width
-    sub        edx, eax
     pcmpeqb    xmm3, xmm3       // generate mask 0xff000000
     pslld      xmm3, 24
     movdqa     xmm4, kShuffleAlpha0
@@ -4693,14 +4692,15 @@ void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
     punpckhbw  xmm2, xmm2       // next 2 pixel rgbs
     pmulhuw    xmm1, xmm2       // rgb * a
     movdqa     xmm2, [eax]      // mask original alpha
+    lea        eax, [eax + 16]
     pand       xmm2, xmm3
     psrlw      xmm0, 8
     psrlw      xmm1, 8
     packuswb   xmm0, xmm1
     por        xmm0, xmm2       // copy original alpha
     sub        ecx, 4
-    movdqa     [eax + edx], xmm0
-    lea        eax, [eax + 16]
+    movdqa     [edx], xmm0
+    lea        edx, [edx + 16]
     jg         convertloop
     ret
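
Why the "sub edx, eax" / "sub %0,%1" lines disappear: the old loops folded the
destination into a src-relative offset so a single lea advanced both streams,
storing through [eax + edx] (or "(%0,%1,1)"). The NaCL validator cannot
express that two-register store in its r15-based form, so the port keeps src
and dst as independent pointers, each advanced by its own lea. The same idea
in C, as a sketch with hypothetical names:

#include <stddef.h>

typedef unsigned char uint8;

// Old scheme: diff = dst - src is computed once ("sub edx, eax"); a single
// pointer increment then serves both streams.
void StoreFused(const uint8* src, uint8* dst, int n) {
  ptrdiff_t diff = dst - src;
  int i;
  for (i = 0; i < n; ++i) {
    *(uint8*)(src + diff) = src[0];  // store through [src + diff]
    ++src;                           // single advance, like one lea
  }
}

// New scheme: advance src and dst separately; this is what the ported asm
// does with one MEMLEA-based lea per pointer.
void StoreTwoPointers(const uint8* src, uint8* dst, int n) {
  int i;
  for (i = 0; i < n; ++i) {
    *dst++ = *src++;
  }
}

These helpers copy bytes rather than attenuate; the point is only the
addressing pattern.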