Commit d5ee3dc9 authored by fbarchard@google.com

AVX2 Attenuate

BUG=none
TEST=none
Review URL: https://webrtc-codereview.appspot.com/1101014

git-svn-id: http://libyuv.googlecode.com/svn/trunk@576 16f28f9a-4ce2-e073-06de-1de4eb20be90
parent caf6e247
Name: libyuv Name: libyuv
URL: http://code.google.com/p/libyuv/ URL: http://code.google.com/p/libyuv/
Version: 575 Version: 576
License: BSD License: BSD
License File: LICENSE License File: LICENSE
......
...@@ -137,6 +137,9 @@ extern "C" { ...@@ -137,6 +137,9 @@ extern "C" {
#define HAS_UYVYTOUV422ROW_AVX2 #define HAS_UYVYTOUV422ROW_AVX2
#define HAS_UYVYTOUVROW_AVX2 #define HAS_UYVYTOUVROW_AVX2
#define HAS_UYVYTOYROW_AVX2 #define HAS_UYVYTOYROW_AVX2
// Effects
#define HAS_ARGBATTENUATEROW_AVX2
#endif #endif
#endif #endif
...@@ -1308,6 +1311,7 @@ void I422ToUYVYRow_Any_NEON(const uint8* src_y, ...@@ -1308,6 +1311,7 @@ void I422ToUYVYRow_Any_NEON(const uint8* src_y,
void ARGBAttenuateRow_C(const uint8* src_argb, uint8* dst_argb, int width); void ARGBAttenuateRow_C(const uint8* src_argb, uint8* dst_argb, int width);
void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width); void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width);
void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width); void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width);
void ARGBAttenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width);
void ARGBAttenuateRow_NEON(const uint8* src_argb, uint8* dst_argb, int width); void ARGBAttenuateRow_NEON(const uint8* src_argb, uint8* dst_argb, int width);
void ARGBAttenuateRow_Any_SSE2(const uint8* src_argb, uint8* dst_argb, void ARGBAttenuateRow_Any_SSE2(const uint8* src_argb, uint8* dst_argb,
int width); int width);
......
...@@ -11,6 +11,6 @@ ...@@ -11,6 +11,6 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT #ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT
#define INCLUDE_LIBYUV_VERSION_H_ #define INCLUDE_LIBYUV_VERSION_H_
#define LIBYUV_VERSION 575 #define LIBYUV_VERSION 576
#endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT #endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT
...@@ -1032,6 +1032,13 @@ int ARGBAttenuate(const uint8* src_argb, int src_stride_argb, ...@@ -1032,6 +1032,13 @@ int ARGBAttenuate(const uint8* src_argb, int src_stride_argb,
} }
} }
#endif #endif
#if defined(HAS_ARGBATTENUATEROW_AVX2)
  // Records whether the AVX2 row function was selected, so that this function
  // can execute vzeroupper once the row loop has finished.
  bool clear = false;
  if (TestCpuFlag(kCpuHasAVX2) && IS_ALIGNED(width, 8)) {
    // Assign to the flag declared above. Redeclaring it here (`bool clear`)
    // would create a new variable scoped to this block and leave the outer
    // flag false, so the vzeroupper after the row loop would never run.
    clear = true;
    ARGBAttenuateRow = ARGBAttenuateRow_AVX2;
  }
#endif
#if defined(HAS_ARGBATTENUATEROW_NEON) #if defined(HAS_ARGBATTENUATEROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && width >= 8) { if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
ARGBAttenuateRow = ARGBAttenuateRow_Any_NEON; ARGBAttenuateRow = ARGBAttenuateRow_Any_NEON;
...@@ -1046,6 +1053,13 @@ int ARGBAttenuate(const uint8* src_argb, int src_stride_argb, ...@@ -1046,6 +1053,13 @@ int ARGBAttenuate(const uint8* src_argb, int src_stride_argb,
src_argb += src_stride_argb; src_argb += src_stride_argb;
dst_argb += dst_stride_argb; dst_argb += dst_stride_argb;
} }
#if defined(HAS_ARGBATTENUATEROW_AVX2)
// `clear` is the flag declared in the AVX2 dispatch block earlier in this
// function. When it is true, execute vzeroupper (zeroes the upper 128 bits of
// every YMM register) before returning to the caller.
if (clear) {
__asm vzeroupper;
}
#endif
return 0; return 0;
} }
......
...@@ -4364,6 +4364,49 @@ void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) { ...@@ -4364,6 +4364,49 @@ void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
} }
#endif // HAS_ARGBATTENUATEROW_SSSE3 #endif // HAS_ARGBATTENUATEROW_SSSE3
#ifdef HAS_ARGBATTENUATEROW_AVX2
// Shuffle table duplicating alpha.
// Used as the vpshufb control in ARGBAttenuateRow_AVX2 after each pixel has
// been widened to four 16-bit words (8 bytes per pixel, 2 pixels per 128-bit
// lane). For each pixel it copies the alpha word (bytes 6-7, or 14-15 for the
// second pixel in the lane) into the first three word slots; the 128u entries
// have bit 7 set, which makes vpshufb write zero into the fourth (alpha) slot.
// The same 16-byte pattern is repeated for the upper 128-bit lane.
static const ulvec8 kShuffleAlpha_AVX2 = {
6u, 7u, 6u, 7u, 6u, 7u, 128u, 128u,
14u, 15u, 14u, 15u, 14u, 15u, 128u, 128u,
6u, 7u, 6u, 7u, 6u, 7u, 128u, 128u,
14u, 15u, 14u, 15u, 14u, 15u, 128u, 128u,
};
// Attenuate a row of ARGB pixels: scale the three color bytes of every pixel
// by that pixel's alpha and keep the original alpha byte (the top byte of each
// 32-bit pixel, per the 0xff000000 mask below).
//
// src_argb: source row, read with unaligned 32-byte loads.
// dst_argb: destination row, written with unaligned 32-byte stores.
// width:    pixel count. Each pass handles 8 pixels and the count is tested
//           only after the pass, so at least 8 pixels are always processed
//           and the loop runs until the remaining count is <= 0.
//
// Naked function: there is no compiler-generated prologue/epilogue, so the
// arguments are read directly from the stack relative to esp. The routine
// returns without executing vzeroupper.
__declspec(naked) __declspec(align(16))
void ARGBAttenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width) {
__asm {
mov eax, [esp + 4] // src_argb
mov edx, [esp + 8] // dst_argb
mov ecx, [esp + 12] // width
// edx becomes (dst - src) so that [eax + edx] addresses the destination
// while only eax is advanced in the loop.
sub edx, eax
vmovdqa ymm4, kShuffleAlpha_AVX2
vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0xff000000
vpslld ymm5, ymm5, 24
align 16
convertloop:
vmovdqu ymm6, [eax] // read 8 pixels.
// Interleaving the register with itself turns every byte b into the 16-bit
// word (b << 8) | b. The unpack works per 128-bit lane, so the pixel order
// across ymm0/ymm1 is not linear ("mutated") until the pack below.
vpunpcklbw ymm0, ymm6, ymm6 // low 4 pixels. mutated.
vpunpckhbw ymm1, ymm6, ymm6 // high 4 pixels. mutated.
vpshufb ymm2, ymm0, ymm4 // low 4 alphas
vpshufb ymm3, ymm1, ymm4 // high 4 alphas
// High 16 bits of each 16x16 unsigned product; the alpha slot is multiplied
// by the zero written by the shuffle, so it becomes 0 here.
vpmulhuw ymm0, ymm0, ymm2 // rgb * a
vpmulhuw ymm1, ymm1, ymm3 // rgb * a
vpand ymm6, ymm6, ymm5 // isolate alpha
// Keep the top 8 bits of each 16-bit result so it fits in a byte.
vpsrlw ymm0, ymm0, 8
vpsrlw ymm1, ymm1, 8
// Per-lane pack back to bytes; this restores the original pixel order.
vpackuswb ymm0, ymm0, ymm1 // unmutated.
vpor ymm0, ymm0, ymm6 // copy original alpha
// The flags set by this sub are consumed by the jg below; the intervening
// vmovdqu and lea do not modify flags.
sub ecx, 8
vmovdqu [eax + edx], ymm0
lea eax, [eax + 32]
jg convertloop
ret
}
}
#endif // HAS_ARGBATTENUATEROW_AVX2
#ifdef HAS_ARGBUNATTENUATEROW_SSE2 #ifdef HAS_ARGBUNATTENUATEROW_SSE2
// Unattenuate 4 pixels at a time. // Unattenuate 4 pixels at a time.
// Aligned to 16 bytes. // Aligned to 16 bytes.
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment