Commit 2cc1a2b2 authored by Frank Barchard

Remove sse2 functions that also have ssse3

ARGBBlendRow_SSE2, ARGBAttenuateRow_SSE2, and MirrorRow_SSE2 are removed.
Since the vast majority of CPUs now support SSSE3, removing the SSE2
variants speeds up CPU dispatching.

R=harryjin@google.com
BUG=none

Review URL: https://codereview.chromium.org/1377053003 .
parent d039ad6e
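
For context: libyuv binds each row function at runtime by probing CPU flags, so every retired SIMD tier is one less TestCpuFlag() test and branch ahead of the row loop. A minimal compilable sketch of the dispatch shape after this change (stub flag probe and stub kernels; MirrorPlaneRowDispatch is a hypothetical name — the real logic sits inline in MirrorPlane, RotatePlane180, and friends in the diff below):

#include <stdint.h>

typedef uint8_t uint8;

// Portable fallback: reverse one row of bytes.
static void MirrorRow_C(const uint8* src, uint8* dst, int width) {
  int i;
  for (i = 0; i < width; ++i) {
    dst[i] = src[width - 1 - i];
  }
}

// Stand-in for the SSSE3 kernel; the real one uses pshufb.
static void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
  MirrorRow_C(src, dst, width);
}

static int TestCpuFlag(int flag) { (void)flag; return 1; }  // stub probe
enum { kCpuHasSSSE3 = 0x40 };                               // placeholder value
#define IS_ALIGNED(n, a) (((n) & ((a) - 1)) == 0)

// One tier fewer after this commit: the kCpuHasSSE2 check that used to
// sit between MirrorRow_C and the SSSE3 branch is gone.
static void MirrorPlaneRowDispatch(const uint8* src, uint8* dst, int width) {
  void (*MirrorRow)(const uint8*, uint8*, int) = MirrorRow_C;
  if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 16)) {
    MirrorRow = MirrorRow_SSSE3;
  }
  MirrorRow(src, dst, width);
}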
 Name: libyuv
 URL: http://code.google.com/p/libyuv/
-Version: 1495
+Version: 1496
 License: BSD
 License File: LICENSE
...
@@ -121,7 +121,6 @@ extern "C" {
 #define HAS_H422TOARGBROW_SSSE3
 #define HAS_H422TOABGRROW_SSSE3
 #define HAS_MERGEUVROW_SSE2
-#define HAS_MIRRORROW_SSE2
 #define HAS_MIRRORROW_SSSE3
 #define HAS_MIRRORROW_UV_SSSE3
 #define HAS_MIRRORUVROW_SSSE3
@@ -181,8 +180,7 @@ extern "C" {
 #define HAS_SOBELYROW_SSE2
 #endif

-// The following are available on x64 Visual C and clangcl.
-// TODO(fbarchard): Port to gcc.
+// The following are also available on x64 Visual C.
 #if !defined(LIBYUV_DISABLE_X86) && defined (_M_X64) && \
     (!defined(__clang__) || defined(__SSSE3__))
 #define HAS_I422TOARGBROW_SSSE3
@@ -262,16 +260,6 @@ extern "C" {
 #define HAS_ARGBUNATTENUATEROW_AVX2
 #endif

-// The following are disabled when SSSE3 is available:
-// TODO(fbarchard): remove sse2. ssse3 is faster and well supported.
-#if !defined(LIBYUV_DISABLE_X86) && \
-    (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__)) && \
-    !defined(LIBYUV_SSSE3_ONLY)
-#define HAS_ARGBATTENUATEROW_SSE2
-#define HAS_ARGBBLENDROW_SSE2
-#define HAS_MIRRORROW_SSE2
-#endif
-
 // The following are available on Neon platforms:
 #if !defined(LIBYUV_DISABLE_NEON) && \
     (defined(__aarch64__) || defined(__ARM_NEON__) || defined(LIBYUV_NEON))
@@ -822,7 +810,6 @@ void ARGBToUVJ422Row_C(const uint8* src_argb,
 void MirrorRow_AVX2(const uint8* src, uint8* dst, int width);
 void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width);
-void MirrorRow_SSE2(const uint8* src, uint8* dst, int width);
 void MirrorRow_NEON(const uint8* src, uint8* dst, int width);
 void MirrorRow_MIPS_DSPR2(const uint8* src, uint8* dst, int width);
 void MirrorRow_C(const uint8* src, uint8* dst, int width);
@@ -1620,8 +1607,6 @@ void I400ToARGBRow_Any_NEON(const uint8* src_y, uint8* dst_argb, int width);
 // ARGB preattenuated alpha blend.
 void ARGBBlendRow_SSSE3(const uint8* src_argb, const uint8* src_argb1,
                         uint8* dst_argb, int width);
-void ARGBBlendRow_SSE2(const uint8* src_argb, const uint8* src_argb1,
-                       uint8* dst_argb, int width);
 void ARGBBlendRow_NEON(const uint8* src_argb, const uint8* src_argb1,
                        uint8* dst_argb, int width);
 void ARGBBlendRow_C(const uint8* src_argb, const uint8* src_argb1,
@@ -1941,7 +1926,6 @@ void I422ToUYVYRow_Any_NEON(const uint8* src_y,
 // Effects related row functions.
 void ARGBAttenuateRow_C(const uint8* src_argb, uint8* dst_argb, int width);
-void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width);
 void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width);
 void ARGBAttenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width);
 void ARGBAttenuateRow_NEON(const uint8* src_argb, uint8* dst_argb, int width);
...
@@ -11,6 +11,6 @@
 #ifndef INCLUDE_LIBYUV_VERSION_H_  // NOLINT
 #define INCLUDE_LIBYUV_VERSION_H_

-#define LIBYUV_VERSION 1495
+#define LIBYUV_VERSION 1496

 #endif  // INCLUDE_LIBYUV_VERSION_H_  NOLINT
@@ -390,14 +390,6 @@ int I420AlphaToARGB(const uint8* src_y, int src_stride_y,
       I422AlphaToARGBRow = I422AlphaToARGBRow_MIPS_DSPR2;
     }
 #endif
-#if defined(HAS_ARGBATTENUATEROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2)) {
-    ARGBAttenuateRow = ARGBAttenuateRow_Any_SSE2;
-    if (IS_ALIGNED(width, 4)) {
-      ARGBAttenuateRow = ARGBAttenuateRow_SSE2;
-    }
-  }
-#endif
 #if defined(HAS_ARGBATTENUATEROW_SSSE3)
   if (TestCpuFlag(kCpuHasSSSE3)) {
     ARGBAttenuateRow = ARGBAttenuateRow_Any_SSSE3;
@@ -424,7 +416,8 @@ int I420AlphaToARGB(const uint8* src_y, int src_stride_y,
 #endif

   for (y = 0; y < height; ++y) {
-    I422AlphaToARGBRow(src_y, src_u, src_v, src_a, dst_argb, &kYuvConstants, width);
+    I422AlphaToARGBRow(src_y, src_u, src_v, src_a, dst_argb, &kYuvConstants,
+                       width);
     if (attenuate) {
       ARGBAttenuateRow(dst_argb, dst_argb, width);
     }
@@ -500,14 +493,6 @@ int I420AlphaToABGR(const uint8* src_y, int src_stride_y,
       I422AlphaToABGRRow = I422AlphaToABGRRow_MIPS_DSPR2;
     }
 #endif
-#if defined(HAS_ARGBATTENUATEROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2)) {
-    ARGBAttenuateRow = ARGBAttenuateRow_Any_SSE2;
-    if (IS_ALIGNED(width, 4)) {
-      ARGBAttenuateRow = ARGBAttenuateRow_SSE2;
-    }
-  }
-#endif
 #if defined(HAS_ARGBATTENUATEROW_SSSE3)
   if (TestCpuFlag(kCpuHasSSSE3)) {
     ARGBAttenuateRow = ARGBAttenuateRow_Any_SSSE3;
@@ -534,7 +519,8 @@ int I420AlphaToABGR(const uint8* src_y, int src_stride_y,
 #endif

   for (y = 0; y < height; ++y) {
-    I422AlphaToABGRRow(src_y, src_u, src_v, src_a, dst_abgr, &kYuvConstants, width);
+    I422AlphaToABGRRow(src_y, src_u, src_v, src_a, dst_abgr, &kYuvConstants,
+                       width);
     if (attenuate) {
       ARGBAttenuateRow(dst_abgr, dst_abgr, width);
     }
...
@@ -237,14 +237,6 @@ void MirrorPlane(const uint8* src_y, int src_stride_y,
     }
   }
 #endif
-#if defined(HAS_MIRRORROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2)) {
-    MirrorRow = MirrorRow_Any_SSE2;
-    if (IS_ALIGNED(width, 16)) {
-      MirrorRow = MirrorRow_SSE2;
-    }
-  }
-#endif
 #if defined(HAS_MIRRORROW_SSSE3)
   if (TestCpuFlag(kCpuHasSSSE3)) {
     MirrorRow = MirrorRow_Any_SSSE3;
@@ -541,11 +533,6 @@ ARGBBlendRow GetARGBBlend() {
     return ARGBBlendRow;
   }
 #endif
-#if defined(HAS_ARGBBLENDROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2)) {
-    ARGBBlendRow = ARGBBlendRow_SSE2;
-  }
-#endif
 #if defined(HAS_ARGBBLENDROW_NEON)
   if (TestCpuFlag(kCpuHasNEON)) {
     ARGBBlendRow = ARGBBlendRow_NEON;
@@ -1267,14 +1254,6 @@ int ARGBAttenuate(const uint8* src_argb, int src_stride_argb,
     height = 1;
     src_stride_argb = dst_stride_argb = 0;
   }
-#if defined(HAS_ARGBATTENUATEROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2)) {
-    ARGBAttenuateRow = ARGBAttenuateRow_Any_SSE2;
-    if (IS_ALIGNED(width, 4)) {
-      ARGBAttenuateRow = ARGBAttenuateRow_SSE2;
-    }
-  }
-#endif
 #if defined(HAS_ARGBATTENUATEROW_SSSE3)
   if (TestCpuFlag(kCpuHasSSSE3)) {
     ARGBAttenuateRow = ARGBAttenuateRow_Any_SSSE3;
...
@@ -117,14 +117,6 @@ void RotatePlane180(const uint8* src, int src_stride,
     }
   }
 #endif
-#if defined(HAS_MIRRORROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2)) {
-    MirrorRow = MirrorRow_Any_SSE2;
-    if (IS_ALIGNED(width, 16)) {
-      MirrorRow = MirrorRow_SSE2;
-    }
-  }
-#endif
 #if defined(HAS_MIRRORROW_SSSE3)
   if (TestCpuFlag(kCpuHasSSSE3)) {
     MirrorRow = MirrorRow_Any_SSSE3;
...
@@ -443,9 +443,6 @@ ANY11(ARGB4444ToARGBRow_Any_NEON, ARGB4444ToARGBRow_NEON, 0, 2, 4, 7)
 #ifdef HAS_ARGBATTENUATEROW_SSSE3
 ANY11(ARGBAttenuateRow_Any_SSSE3, ARGBAttenuateRow_SSSE3, 0, 4, 4, 3)
 #endif
-#ifdef HAS_ARGBATTENUATEROW_SSE2
-ANY11(ARGBAttenuateRow_Any_SSE2, ARGBAttenuateRow_SSE2, 0, 4, 4, 3)
-#endif
 #ifdef HAS_ARGBUNATTENUATEROW_SSE2
 ANY11(ARGBUnattenuateRow_Any_SSE2, ARGBUnattenuateRow_SSE2, 0, 4, 4, 3)
 #endif
@@ -617,9 +614,6 @@ ANY11M(MirrorRow_Any_AVX2, MirrorRow_AVX2, 1, 31)
 #ifdef HAS_MIRRORROW_SSSE3
 ANY11M(MirrorRow_Any_SSSE3, MirrorRow_SSSE3, 1, 15)
 #endif
-#ifdef HAS_MIRRORROW_SSE2
-ANY11M(MirrorRow_Any_SSE2, MirrorRow_SSE2, 1, 15)
-#endif
 #ifdef HAS_MIRRORROW_NEON
 ANY11M(MirrorRow_Any_NEON, MirrorRow_NEON, 1, 15)
 #endif
...
This diff is collapsed.
@@ -3460,32 +3460,6 @@ void MirrorRow_AVX2(const uint8* src, uint8* dst, int width) {
 }
 #endif  // HAS_MIRRORROW_AVX2

-#ifdef HAS_MIRRORROW_SSE2
-__declspec(naked)
-void MirrorRow_SSE2(const uint8* src, uint8* dst, int width) {
-  __asm {
-    mov       eax, [esp + 4]   // src
-    mov       edx, [esp + 8]   // dst
-    mov       ecx, [esp + 12]  // width
-
- convertloop:
-    movdqu    xmm0, [eax - 16 + ecx]
-    movdqa    xmm1, xmm0        // swap bytes
-    psllw     xmm0, 8
-    psrlw     xmm1, 8
-    por       xmm0, xmm1
-    pshuflw   xmm0, xmm0, 0x1b  // swap words
-    pshufhw   xmm0, xmm0, 0x1b
-    pshufd    xmm0, xmm0, 0x4e  // swap qwords
-    movdqu    [edx], xmm0
-    lea       edx, [edx + 16]
-    sub       ecx, 16
-    jg        convertloop
-    ret
-  }
-}
-#endif  // HAS_MIRRORROW_SSE2
-
 #ifdef HAS_MIRRORROW_UV_SSSE3
 // Shuffle table for reversing the bytes of UV channels.
 static const uvec8 kShuffleMirrorUV = {
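
For reference, the deleted kernel reversed 16 bytes without SSSE3's pshufb by swapping at three granularities: bytes within words, words within qwords, then the two qwords. The same sequence rendered with SSE2 intrinsics (a sketch for illustration; src_end16 points at the last 16 source bytes, matching the asm's [eax - 16 + ecx] addressing):

#include <emmintrin.h>  // SSE2
#include <stdint.h>

typedef uint8_t uint8;

// Intrinsics rendering of the deleted MirrorRow_SSE2 inner loop:
// reverse 16 bytes using three progressively coarser swaps.
static void MirrorRow16_SSE2(const uint8* src_end16, uint8* dst) {
  __m128i x = _mm_loadu_si128((const __m128i*)src_end16);
  // 1. Swap bytes within each 16-bit word.
  __m128i hi = _mm_slli_epi16(x, 8);
  __m128i lo = _mm_srli_epi16(x, 8);
  x = _mm_or_si128(hi, lo);
  // 2. Reverse the four words in each 64-bit half (0x1b = 3,2,1,0).
  x = _mm_shufflelo_epi16(x, 0x1b);
  x = _mm_shufflehi_epi16(x, 0x1b);
  // 3. Swap the two 64-bit halves (0x4e = qword swap).
  x = _mm_shuffle_epi32(x, 0x4e);
  _mm_storeu_si128((__m128i*)dst, x);
}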
@@ -4382,107 +4356,14 @@ void UYVYToUV422Row_SSE2(const uint8* src_uyvy,
 }
 #endif  // HAS_YUY2TOYROW_SSE2

-#ifdef HAS_ARGBBLENDROW_SSE2
-// Blend 8 pixels at a time.
-__declspec(naked)
-void ARGBBlendRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
-                       uint8* dst_argb, int width) {
-  __asm {
-    push      esi
-    mov       eax, [esp + 4 + 4]   // src_argb0
-    mov       esi, [esp + 4 + 8]   // src_argb1
-    mov       edx, [esp + 4 + 12]  // dst_argb
-    mov       ecx, [esp + 4 + 16]  // width
-    pcmpeqb   xmm7, xmm7       // generate constant 1
-    psrlw     xmm7, 15
-    pcmpeqb   xmm6, xmm6       // generate mask 0x00ff00ff
-    psrlw     xmm6, 8
-    pcmpeqb   xmm5, xmm5       // generate mask 0xff00ff00
-    psllw     xmm5, 8
-    pcmpeqb   xmm4, xmm4       // generate mask 0xff000000
-    pslld     xmm4, 24
-    sub       ecx, 4
-    jl        convertloop4b    // less than 4 pixels?
-
-    // 4 pixel loop.
- convertloop4:
-    movdqu    xmm3, [eax]      // src argb
-    lea       eax, [eax + 16]
-    movdqa    xmm0, xmm3       // src argb
-    pxor      xmm3, xmm4       // ~alpha
-    movdqu    xmm2, [esi]      // _r_b
-    psrlw     xmm3, 8          // alpha
-    pshufhw   xmm3, xmm3, 0F5h // 8 alpha words
-    pshuflw   xmm3, xmm3, 0F5h
-    pand      xmm2, xmm6       // _r_b
-    paddw     xmm3, xmm7       // 256 - alpha
-    pmullw    xmm2, xmm3       // _r_b * alpha
-    movdqu    xmm1, [esi]      // _a_g
-    lea       esi, [esi + 16]
-    psrlw     xmm1, 8          // _a_g
-    por       xmm0, xmm4       // set alpha to 255
-    pmullw    xmm1, xmm3       // _a_g * alpha
-    psrlw     xmm2, 8          // _r_b convert to 8 bits again
-    paddusb   xmm0, xmm2       // + src argb
-    pand      xmm1, xmm5       // a_g_ convert to 8 bits again
-    paddusb   xmm0, xmm1       // + src argb
-    movdqu    [edx], xmm0
-    lea       edx, [edx + 16]
-    sub       ecx, 4
-    jge       convertloop4
-
- convertloop4b:
-    add       ecx, 4 - 1
-    jl        convertloop1b
-
-    // 1 pixel loop.
- convertloop1:
-    movd      xmm3, [eax]      // src argb
-    lea       eax, [eax + 4]
-    movdqa    xmm0, xmm3       // src argb
-    pxor      xmm3, xmm4       // ~alpha
-    movd      xmm2, [esi]      // _r_b
-    psrlw     xmm3, 8          // alpha
-    pshufhw   xmm3, xmm3, 0F5h // 8 alpha words
-    pshuflw   xmm3, xmm3, 0F5h
-    pand      xmm2, xmm6       // _r_b
-    paddw     xmm3, xmm7       // 256 - alpha
-    pmullw    xmm2, xmm3       // _r_b * alpha
-    movd      xmm1, [esi]      // _a_g
-    lea       esi, [esi + 4]
-    psrlw     xmm1, 8          // _a_g
-    por       xmm0, xmm4       // set alpha to 255
-    pmullw    xmm1, xmm3       // _a_g * alpha
-    psrlw     xmm2, 8          // _r_b convert to 8 bits again
-    paddusb   xmm0, xmm2       // + src argb
-    pand      xmm1, xmm5       // a_g_ convert to 8 bits again
-    paddusb   xmm0, xmm1       // + src argb
-    movd      [edx], xmm0
-    lea       edx, [edx + 4]
-    sub       ecx, 1
-    jge       convertloop1
-
- convertloop1b:
-    pop       esi
-    ret
-  }
-}
-#endif  // HAS_ARGBBLENDROW_SSE2
-
 #ifdef HAS_ARGBBLENDROW_SSSE3
 // Shuffle table for isolating alpha.
 static const uvec8 kShuffleAlpha = {
   3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80,
   11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80
 };

-// Same as SSE2, but replaces:
-//    psrlw       xmm3, 8          // alpha
-//    pshufhw     xmm3, xmm3, 0F5h // 8 alpha words
-//    pshuflw     xmm3, xmm3, 0F5h
-// with..
-//    pshufb      xmm3, kShuffleAlpha // alpha
-
 // Blend 8 pixels at a time.
 __declspec(naked)
 void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
                         uint8* dst_argb, int width) {
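
The arithmetic both blend kernels share (and the SSSE3 one keeps) is the premultiplied "over" operator: result = src + ((dst * (256 - src_alpha)) >> 8), with the result alpha forced opaque. A scalar one-pixel sketch of that math, matching the comments in the asm above (ARGBBlendPixel is a hypothetical helper, not libyuv API):

#include <stdint.h>

typedef uint8_t uint8;

// Premultiplied-alpha "over" blend for one ARGB pixel: fg is already
// premultiplied, so only bg is scaled by (256 - alpha).
static void ARGBBlendPixel(const uint8 fg[4], const uint8 bg[4],
                           uint8 out[4]) {
  int a = fg[3];
  int na = 256 - a;  // the asm builds this as (~alpha) + 1
  int i;
  for (i = 0; i < 3; ++i) {  // B, G, R channels
    int v = fg[i] + ((bg[i] * na) >> 8);
    out[i] = (uint8)(v > 255 ? 255 : v);  // paddusb saturates the same way
  }
  out[3] = 255;  // por with 0xff000000: result alpha forced opaque
}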
@@ -4564,48 +4445,6 @@ void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
 }
 #endif  // HAS_ARGBBLENDROW_SSSE3

-#ifdef HAS_ARGBATTENUATEROW_SSE2
-// Attenuate 4 pixels at a time.
-__declspec(naked)
-void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) {
-  __asm {
-    mov       eax, [esp + 4]   // src_argb0
-    mov       edx, [esp + 8]   // dst_argb
-    mov       ecx, [esp + 12]  // width
-    pcmpeqb   xmm4, xmm4       // generate mask 0xff000000
-    pslld     xmm4, 24
-    pcmpeqb   xmm5, xmm5       // generate mask 0x00ffffff
-    psrld     xmm5, 8
-
- convertloop:
-    movdqu    xmm0, [eax]      // read 4 pixels
-    punpcklbw xmm0, xmm0       // first 2
-    pshufhw   xmm2, xmm0, 0FFh // 8 alpha words
-    pshuflw   xmm2, xmm2, 0FFh
-    pmulhuw   xmm0, xmm2       // rgb * a
-    movdqu    xmm1, [eax]      // read 4 pixels
-    punpckhbw xmm1, xmm1       // next 2 pixels
-    pshufhw   xmm2, xmm1, 0FFh // 8 alpha words
-    pshuflw   xmm2, xmm2, 0FFh
-    pmulhuw   xmm1, xmm2       // rgb * a
-    movdqu    xmm2, [eax]      // alphas
-    lea       eax, [eax + 16]
-    psrlw     xmm0, 8
-    pand      xmm2, xmm4
-    psrlw     xmm1, 8
-    packuswb  xmm0, xmm1
-    pand      xmm0, xmm5       // keep original alphas
-    por       xmm0, xmm2
-    movdqu    [edx], xmm0
-    lea       edx, [edx + 16]
-    sub       ecx, 4
-    jg        convertloop
-
-    ret
-  }
-}
-#endif  // HAS_ARGBATTENUATEROW_SSE2
-
 #ifdef HAS_ARGBATTENUATEROW_SSSE3
 // Shuffle table duplicating alpha.
 static const uvec8 kShuffleAlpha0 = {
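
For reference, attenuation premultiplies each color channel by its pixel's alpha. The deleted loop approximates c * a / 255 without a division: punpcklbw x,x turns byte c into the word c*257, pmulhuw keeps the top 16 bits of the product with the broadcast alpha word a*257, and the final psrlw/packuswb drops 8 more bits. A scalar sketch of the same math (hypothetical helper):

#include <stdint.h>

typedef uint8_t uint8;

// Scalar equivalent of the deleted ARGBAttenuateRow_SSE2 math: each
// channel becomes (c*257 * a*257) >> 24, which is very close to c*a/255
// (exact at the endpoints: a = 0 gives 0, c = a = 255 gives 255).
static void ARGBAttenuatePixel(const uint8 argb[4], uint8 out[4]) {
  uint32_t a = argb[3];
  int i;
  for (i = 0; i < 3; ++i) {  // B, G, R channels
    out[i] = (uint8)((argb[i] * 257u * (a * 257u)) >> 24);
  }
  out[3] = argb[3];  // alpha preserved (pand/por with 0xff000000)
}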
...