Commit f6631bb8 authored by fbarchard@google.com

CopyAlpha AVX2

BUG=none
TEST=Alpha*
R=ryanpetrie@google.com

Review URL: https://webrtc-codereview.appspot.com/2392004

git-svn-id: http://libyuv.googlecode.com/svn/trunk@812 16f28f9a-4ce2-e073-06de-1de4eb20be90
parent 88ce3c0c
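For context: ARGBCopyAlpha copies only the alpha channel of a source ARGB image into a destination ARGB image, leaving the destination's color bytes untouched. The SIMD row functions added below are drop-in replacements for the scalar fallback ARGBCopyAlphaRow_C. A minimal scalar sketch of the per-row operation (illustrative only; the function name is hypothetical, it assumes byte 3 of each 4-byte pixel holds alpha, and it is not the library's exact _C code):

#include <stdint.h>

// Hypothetical scalar sketch: copy the alpha byte of each ARGB pixel
// from src to dst, preserving dst's three color bytes.
static void CopyAlphaRow_Sketch(const uint8_t* src, uint8_t* dst, int width) {
  for (int x = 0; x < width; ++x) {
    dst[x * 4 + 3] = src[x * 4 + 3];  // byte 3 is alpha in this layout
  }
}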
 Name: libyuv
 URL: http://code.google.com/p/libyuv/
-Version: 811
+Version: 812
 License: BSD
 License File: LICENSE
...
@@ -167,8 +167,8 @@ extern "C" {
 // Effects:
 // TODO(fbarchard): Optimize and enable
 // #define HAS_ARGBLUMACOLORTABLEROW_SSSE3
-// TODO(fbarchard): Optimize and enable
-// #define HAS_ARGBCOPYALPHAROW_SSE2
+#define HAS_ARGBCOPYALPHAROW_SSE2
+#define HAS_ARGBCOPYALPHAROW_SSE41
 // Caveat: Visual C 2012 required for AVX2.
 #if _MSC_VER >= 1700
@@ -187,6 +187,7 @@ extern "C" {
 #define HAS_YUY2TOUV422ROW_AVX2
 #define HAS_YUY2TOUVROW_AVX2
 #define HAS_YUY2TOYROW_AVX2
+#define HAS_ARGBCOPYALPHAROW_AVX2
 // Effects:
 #define HAS_ARGBADDROW_AVX2
@@ -701,6 +702,8 @@ void CopyRow_C(const uint8* src, uint8* dst, int count);
 void ARGBCopyAlphaRow_C(const uint8* src, uint8* dst, int width);
 void ARGBCopyAlphaRow_SSE2(const uint8* src, uint8* dst, int width);
+void ARGBCopyAlphaRow_SSE41(const uint8* src, uint8* dst, int width);
+void ARGBCopyAlphaRow_AVX2(const uint8* src, uint8* dst, int width);
 void SetRow_X86(uint8* dst, uint32 v32, int count);
 void ARGBSetRows_X86(uint8* dst, uint32 v32, int width,
...
@@ -11,6 +11,6 @@
 #ifndef INCLUDE_LIBYUV_VERSION_H_  // NOLINT
 #define INCLUDE_LIBYUV_VERSION_H_
-#define LIBYUV_VERSION 811
+#define LIBYUV_VERSION 812
 #endif  // INCLUDE_LIBYUV_VERSION_H_  NOLINT
@@ -2188,9 +2188,22 @@ int ARGBCopyAlpha(const uint8* src_argb, int src_stride_argb,
   void (*ARGBCopyAlphaRow)(const uint8* src_argb, uint8* dst_argb, int width) =
       ARGBCopyAlphaRow_C;
 #if defined(HAS_ARGBCOPYALPHAROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 8)) {
+  if (TestCpuFlag(kCpuHasSSE2) &&
+      IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) &&
+      IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16) &&
+      IS_ALIGNED(width, 8)) {
     ARGBCopyAlphaRow = ARGBCopyAlphaRow_SSE2;
   }
 #endif
+#if defined(HAS_ARGBCOPYALPHAROW_SSE41)
+  if (TestCpuFlag(kCpuHasSSE41) && IS_ALIGNED(width, 8)) {
+    ARGBCopyAlphaRow = ARGBCopyAlphaRow_SSE41;
+  }
+#endif
+#if defined(HAS_ARGBCOPYALPHAROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2) && IS_ALIGNED(width, 16)) {
+    ARGBCopyAlphaRow = ARGBCopyAlphaRow_AVX2;
+  }
+#endif
   for (int y = 0; y < height; ++y) {
     ARGBCopyAlphaRow(src_argb, dst_argb, width);
...
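The hunk above only changes how ARGBCopyAlpha picks its row function at run time (scalar, SSE2, SSE4.1, or AVX2, depending on TestCpuFlag and alignment); the public entry point is unchanged. A hedged usage sketch, assuming the signature shown in the hunk header, that the declaration lives in libyuv/planar_functions.h, and tightly packed rows (stride = width * 4 bytes); the caller function name is hypothetical:

#include "libyuv/planar_functions.h"

// Hypothetical caller: copy the alpha plane of one width x height ARGB
// image into another, both with tightly packed rows.
// Passes ARGBCopyAlpha's return value through (0 on success by libyuv convention).
int CopyAlphaExample(const uint8* src_argb, uint8* dst_argb,
                     int width, int height) {
  return ARGBCopyAlpha(src_argb, width * 4,
                       dst_argb, width * 4,
                       width, height);
}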
@@ -3603,37 +3603,102 @@ void CopyRow_X86(const uint8* src, uint8* dst, int count) {
 }
 #endif  // HAS_COPYROW_X86
 #ifdef HAS_ARGBCOPYALPHAROW_SSE2
 // width in pixels
 __declspec(naked) __declspec(align(16))
 void ARGBCopyAlphaRow_SSE2(const uint8* src, uint8* dst, int width) {
   __asm {
-    mov        edx, edi
     mov        eax, [esp + 4]   // src
-    mov        edi, [esp + 8]   // dst
+    mov        edx, [esp + 8]   // dst
     mov        ecx, [esp + 12]  // count
-    pcmpeqb    xmm5, xmm5       // generate mask 0xff000000
-    pslld      xmm5, 24
-    align      16
+    pcmpeqb    xmm0, xmm0       // generate mask 0xff000000
+    pslld      xmm0, 24
+    pcmpeqb    xmm1, xmm1       // generate mask 0x00ffffff
+    psrld      xmm1, 8
+    align      4
  convertloop:
-    movdqa     xmm0, [eax]
-    movdqa     xmm1, [eax + 16]
+    movdqa     xmm2, [eax]
+    movdqa     xmm3, [eax + 16]
     lea        eax, [eax + 32]
-    maskmovdqu xmm0, xmm5
-    lea        edi, [edi + 16]
-    maskmovdqu xmm1, xmm5
-    lea        edi, [edi + 16]
+    movdqa     xmm4, [edx]
+    movdqa     xmm5, [edx + 16]
+    pand       xmm2, xmm0
+    pand       xmm3, xmm0
+    pand       xmm4, xmm1
+    pand       xmm5, xmm1
+    por        xmm2, xmm4
+    por        xmm3, xmm5
+    movdqa     [edx], xmm2
+    movdqa     [edx + 16], xmm3
+    lea        edx, [edx + 32]
     sub        ecx, 8
     jg         convertloop
-    mov        edi, edx
     ret
   }
 }
 #endif  // HAS_ARGBCOPYALPHAROW_SSE2
+#ifdef HAS_ARGBCOPYALPHAROW_SSE41
+// width in pixels
+__declspec(naked) __declspec(align(16))
+void ARGBCopyAlphaRow_SSE41(const uint8* src, uint8* dst, int width) {
+  __asm {
+    mov        eax, [esp + 4]   // src
+    mov        edx, [esp + 8]   // dst
+    mov        ecx, [esp + 12]  // count
+    pcmpeqb    xmm0, xmm0       // generate mask 0x00ffffff
+    psrld      xmm0, 8
+    align      4
+ convertloop:
+    movdqu     xmm1, [eax]
+    movdqu     xmm2, [eax + 16]
+    lea        eax, [eax + 32]
+    pblendvb   xmm1, [edx], xmm0
+    pblendvb   xmm2, [edx + 16], xmm0
+    movdqu     [edx], xmm1
+    movdqu     [edx + 16], xmm2
+    lea        edx, [edx + 32]
+    sub        ecx, 8
+    jg         convertloop
+    ret
+  }
+}
+#endif  // HAS_ARGBCOPYALPHAROW_SSE41
+#ifdef HAS_ARGBCOPYALPHAROW_AVX2
+// width in pixels
+__declspec(naked) __declspec(align(16))
+void ARGBCopyAlphaRow_AVX2(const uint8* src, uint8* dst, int width) {
+  __asm {
+    mov        eax, [esp + 4]   // src
+    mov        edx, [esp + 8]   // dst
+    mov        ecx, [esp + 12]  // count
+    vpcmpeqb   ymm0, ymm0, ymm0 // generate mask 0x00ffffff
+    vpsrld     ymm0, ymm0, 8
+    align      4
+ convertloop:
+    vmovdqu    ymm1, [eax]
+    vmovdqu    ymm2, [eax + 32]
+    lea        eax, [eax + 64]
+    vpblendvb  ymm1, ymm1, [edx], ymm0
+    vpblendvb  ymm2, ymm2, [edx + 32], ymm0
+    vmovdqu    [edx], ymm1
+    vmovdqu    [edx + 32], ymm2
+    lea        edx, [edx + 64]
+    sub        ecx, 16
+    jg         convertloop
+    vzeroupper
+    ret
+  }
+}
+#endif  // HAS_ARGBCOPYALPHAROW_AVX2
 #ifdef HAS_SETROW_X86
 // SetRow8 writes 'count' bytes using a 32 bit value repeated.
 __declspec(naked) __declspec(align(16))
...
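A note on the technique in the last hunk: the rewritten SSE2 path drops maskmovdqu, whose byte-masked non-temporal store tends to be slow, in favor of a read-modify-write merge: AND the source pixels with 0xff000000 to keep only alpha, AND the destination pixels with 0x00ffffff to keep only color, OR the two, and store. The SSE4.1 and AVX2 paths express the same merge with a single pblendvb/vpblendvb. A rough SSE2 intrinsics sketch of the idea (illustrative only, hypothetical function name, four pixels per iteration instead of the eight the asm handles, assuming 16-byte aligned pointers and width divisible by 4):

#include <emmintrin.h>  // SSE2 intrinsics
#include <stdint.h>

// Hypothetical sketch of the mask-and-merge used by the new ARGBCopyAlphaRow_SSE2.
static void CopyAlphaRow_SSE2_Sketch(const uint8_t* src, uint8_t* dst, int width) {
  const __m128i kAlphaMask = _mm_set1_epi32((int)0xff000000);  // keep alpha
  const __m128i kColorMask = _mm_set1_epi32(0x00ffffff);       // keep B, G, R
  for (int x = 0; x < width; x += 4) {
    __m128i s = _mm_load_si128((const __m128i*)(src + x * 4));  // 4 src pixels
    __m128i d = _mm_load_si128((const __m128i*)(dst + x * 4));  // 4 dst pixels
    s = _mm_and_si128(s, kAlphaMask);                           // src alpha only
    d = _mm_and_si128(d, kColorMask);                           // dst color only
    _mm_store_si128((__m128i*)(dst + x * 4), _mm_or_si128(s, d));
  }
}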