Commit f2c86d01 authored by fbarchard@google.com

SSSE3 version using pshufb for ARGBAttenuateRow_SSSE3

BUG=none
TEST=none
Review URL: https://webrtc-codereview.appspot.com/490011

git-svn-id: http://libyuv.googlecode.com/svn/trunk@243 16f28f9a-4ce2-e073-06de-1de4eb20be90
parent 8ed54222
Name: libyuv Name: libyuv
URL: http://code.google.com/p/libyuv/ URL: http://code.google.com/p/libyuv/
Version: 242 Version: 243
License: BSD License: BSD
License File: LICENSE License File: LICENSE
......
...@@ -11,7 +11,7 @@ ...@@ -11,7 +11,7 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_ #ifndef INCLUDE_LIBYUV_VERSION_H_
#define INCLUDE_LIBYUV_VERSION_H_ #define INCLUDE_LIBYUV_VERSION_H_
#define LIBYUV_VERSION 242 #define LIBYUV_VERSION 243
#endif // INCLUDE_LIBYUV_VERSION_H_ #endif // INCLUDE_LIBYUV_VERSION_H_
...@@ -893,6 +893,13 @@ int ARGBAttenuate(const uint8* src_argb, int src_stride_argb, ...@@ -893,6 +893,13 @@ int ARGBAttenuate(const uint8* src_argb, int src_stride_argb,
ARGBAttenuateRow = ARGBAttenuateRow_SSE2; ARGBAttenuateRow = ARGBAttenuateRow_SSE2;
} }
#endif #endif
#if defined(HAS_ARGBATTENUATE_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 4) &&
IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) &&
IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
ARGBAttenuateRow = ARGBAttenuateRow_SSSE3;
}
#endif
for (int y = 0; y < height; ++y) { for (int y = 0; y < height; ++y) {
ARGBAttenuateRow(src_argb, dst_argb, width); ARGBAttenuateRow(src_argb, dst_argb, width);
......
...@@ -69,6 +69,11 @@ extern "C" { ...@@ -69,6 +69,11 @@ extern "C" {
#define HAS_ARGBATTENUATE_SSE2 #define HAS_ARGBATTENUATE_SSE2
#endif #endif
// The following are available on Windows 32 bit.
// _M_IX86 limits this to 32-bit x86 builds; YUV_DISABLE_ASM opts out of all
// assembly row functions. When defined, HAS_ARGBATTENUATE_SSSE3 enables both
// the ARGBAttenuateRow_SSSE3 implementation and its runtime dispatch.
#if !defined(YUV_DISABLE_ASM) && defined(_M_IX86)
#define HAS_ARGBATTENUATE_SSSE3
#endif
// The following are available on Neon platforms // The following are available on Neon platforms
#if !defined(YUV_DISABLE_ASM) && defined(__ARM_NEON__) #if !defined(YUV_DISABLE_ASM) && defined(__ARM_NEON__)
#define HAS_MIRRORROW_NEON #define HAS_MIRRORROW_NEON
...@@ -363,6 +368,7 @@ void UYVYToYRow_Any_SSE2(const uint8* src_uyvy, uint8* dst_y, int pix); ...@@ -363,6 +368,7 @@ void UYVYToYRow_Any_SSE2(const uint8* src_uyvy, uint8* dst_y, int pix);
void ARGBAttenuateRow_C(const uint8* src_argb, uint8* dst_argb, int width); void ARGBAttenuateRow_C(const uint8* src_argb, uint8* dst_argb, int width);
void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width); void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width);
// SSSE3 row function: multiplies the color bytes of each ARGB pixel in
// src_argb by that pixel's alpha and writes the result to dst_argb.
// Only defined when HAS_ARGBATTENUATE_SSSE3 is set.
void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width);
#ifdef __cplusplus #ifdef __cplusplus
} // extern "C" } // extern "C"
......
...@@ -2334,8 +2334,58 @@ void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) { ...@@ -2334,8 +2334,58 @@ void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) {
ret ret
} }
} }
#endif // HAS_ARGBATTENUATE_SSE2 #endif // HAS_ARGBATTENUATE_SSE2
#ifdef HAS_ARGBATTENUATE_SSSE3
// Shuffle tables (pshufb control masks) duplicating alpha.
// A 16-byte load holds 4 pixels of 4 bytes each, with alpha in byte 3 of every
// pixel (source bytes 3, 7, 11, 15). For each pixel the mask copies the alpha
// byte into six consecutive bytes, i.e. three 16-bit lanes that each hold the
// alpha in both the low and high byte. The last two bytes of each group use
// index 128: pshufb writes zero when the index's high bit is set, so the lane
// that lines up with the alpha channel multiplies to zero.
// kShuffleAlpha0 expands the alphas of pixels 0 and 1.
static const uvec8 kShuffleAlpha0 = {
3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u, 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u,
};
// kShuffleAlpha1 expands the alphas of pixels 2 and 3 (source bytes 11, 15).
static const uvec8 kShuffleAlpha1 = {
11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u,
15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u,
};
// Attenuates a row of ARGB pixels: every color byte c of a pixel is replaced
// by (c * 257 * a * 257) >> 24, an approximation of c * a / 255, where a is
// the pixel's own alpha byte. The alpha byte itself is copied through
// unchanged.
//
// src_argb: source pixels, 4 bytes per pixel; read with movdqa, so it must be
//           16-byte aligned.
// dst_argb: destination pixels; written with movdqa, so it must be 16-byte
//           aligned.
// width:    pixel count. Four pixels are handled per iteration and the loop
//           body runs before the count is tested, so at least 16 bytes are
//           always read and written.
//
// The function is naked: there is no compiler-generated prologue or epilogue,
// so the arguments are fetched directly from the stack relative to esp and the
// routine returns with a bare ret.
__declspec(naked) __declspec(align(16))
void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
__asm {
mov eax, [esp + 4] // src_argb
mov edx, [esp + 8] // dst_argb
mov ecx, [esp + 12] // width
// edx becomes (dst - src) so [eax + edx] addresses the destination while only
// eax has to be advanced inside the loop.
sub edx, eax
pcmpeqb xmm3, xmm3 // generate mask 0xff000000
pslld xmm3, 24
movdqa xmm4, kShuffleAlpha0 // pshufb mask: alphas of pixels 0 and 1
movdqa xmm5, kShuffleAlpha1 // pshufb mask: alphas of pixels 2 and 3
align 16
convertloop:
movdqa xmm0, [eax] // read 4 pixels
pshufb xmm0, xmm4 // isolate first 2 alphas
movdqa xmm1, [eax] // read 4 pixels
// Unpacking a register with itself duplicates every byte into a 16-bit lane,
// giving c * 257 per channel to match the a * 257 lanes produced by pshufb.
punpcklbw xmm1, xmm1 // first 2 pixel rgbs
pmulhuw xmm0, xmm1 // rgb * a
movdqa xmm1, [eax] // read 4 pixels
pshufb xmm1, xmm5 // isolate next 2 alphas
movdqa xmm2, [eax] // read 4 pixels
punpckhbw xmm2, xmm2 // next 2 pixel rgbs
pmulhuw xmm1, xmm2 // rgb * a
movdqa xmm2, [eax] // mask original alpha
pand xmm2, xmm3
// pmulhuw kept the high 16 bits of each product; shifting right by 8 leaves
// the top 8 bits, so every lane fits in a byte before packing.
psrlw xmm0, 8
psrlw xmm1, 8
packuswb xmm0, xmm1
// The alpha lanes were multiplied by zero, so OR-ing restores the source alpha.
por xmm0, xmm2 // copy original alpha
// The flags set by this sub are consumed by jg below; the movdqa store and lea
// in between do not modify flags.
sub ecx, 4
movdqa [eax + edx], xmm0
lea eax, [eax + 16]
jg convertloop
ret
}
}
#endif // HAS_ARGBATTENUATE_SSSE3
#endif // _M_IX86 #endif // _M_IX86
#ifdef __cplusplus #ifdef __cplusplus
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment