Commit c9562334 authored by fbarchard@google.com's avatar fbarchard@google.com

SplitUV 3 operand AVX2

BUG=none
TEST=none
Review URL: https://webrtc-codereview.appspot.com/1105005

git-svn-id: http://libyuv.googlecode.com/svn/trunk@568 16f28f9a-4ce2-e073-06de-1de4eb20be90
parent 20828059
Name: libyuv Name: libyuv
URL: http://code.google.com/p/libyuv/ URL: http://code.google.com/p/libyuv/
Version: 567 Version: 568
License: BSD License: BSD
License File: LICENSE License File: LICENSE
......
...@@ -128,6 +128,7 @@ extern "C" { ...@@ -128,6 +128,7 @@ extern "C" {
// TODO(fbarchard): Hook these up to all functions. e.g. format conversion. // TODO(fbarchard): Hook these up to all functions. e.g. format conversion.
#define HAS_ARGBTOYROW_AVX2 #define HAS_ARGBTOYROW_AVX2
#define HAS_ARGBTOUVROW_AVX2 #define HAS_ARGBTOUVROW_AVX2
#define HAS_SPLITUVROW_AVX2
#endif #endif
#endif #endif
......
...@@ -11,6 +11,6 @@ ...@@ -11,6 +11,6 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT #ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT
#define INCLUDE_LIBYUV_VERSION_H_ #define INCLUDE_LIBYUV_VERSION_H_
#define LIBYUV_VERSION 567 #define LIBYUV_VERSION 568
#endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT #endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT
...@@ -3238,6 +3238,81 @@ void MergeUVRow_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv, ...@@ -3238,6 +3238,81 @@ void MergeUVRow_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
} }
} }
#ifdef HAS_SPLITUVROW_AVX2
__declspec(naked) __declspec(align(16))
// De-interleaves a UV plane into separate U and V planes, 32 UV pairs per
// loop iteration, using 256-bit AVX2 registers.
// Aligned variant: src_uv, dst_u and dst_v must be 32-byte aligned
// (vmovdqa faults on unaligned addresses).
// pix is assumed to be a multiple of 32 — standard libyuv row contract;
// TODO(review): confirm against callers.
// Register roles: eax = src_uv, edx = dst_u, edi = dst_v - dst_u (offset),
// ecx = pixels remaining, ymm5 = 0x00ff00ff byte mask.
void SplitUVRow_AVX2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {
  __asm {
    push       edi
    mov        eax, [esp + 4 + 4]    // src_uv
    mov        edx, [esp + 4 + 8]    // dst_u
    mov        edi, [esp + 4 + 12]   // dst_v
    mov        ecx, [esp + 4 + 16]   // pix
    vpcmpeqb   ymm5, ymm5, ymm5      // generate mask 0x00ff00ff
    vpsrlw     ymm5, ymm5, 8
    sub        edi, edx              // edi = dst_v - dst_u; single pointer bump
    align      16
  convertloop:
    vmovdqa    ymm0, [eax]
    vmovdqa    ymm1, [eax + 32]
    lea        eax,  [eax + 64]
    vpsrlw     ymm2, ymm0, 8         // odd bytes (V)
    vpsrlw     ymm3, ymm1, 8
    vpand      ymm0, ymm0, ymm5      // even bytes (U)
    vpand      ymm1, ymm1, ymm5
    vpackuswb  ymm0, ymm0, ymm1      // pack within 128-bit lanes...
    vpackuswb  ymm2, ymm2, ymm3
    vpermq     ymm0, ymm0, 0xd8      // ...then fix cross-lane ordering
    vpermq     ymm2, ymm2, 0xd8
    vmovdqa    [edx], ymm0
    vmovdqa    [edx + edi], ymm2
    lea        edx, [edx + 32]
    sub        ecx, 32
    jg         convertloop

    vzeroupper                       // avoid AVX-SSE transition penalty in caller
    pop        edi
    ret
  }
}
__declspec(naked) __declspec(align(16))
// De-interleaves a UV plane into separate U and V planes, 32 UV pairs per
// loop iteration, using 256-bit AVX2 registers.
// Unaligned variant: identical to SplitUVRow_AVX2 but uses vmovdqu, so no
// alignment requirement on src_uv / dst_u / dst_v.
// pix is assumed to be a multiple of 32 — standard libyuv row contract;
// TODO(review): confirm against callers.
// Register roles: eax = src_uv, edx = dst_u, edi = dst_v - dst_u (offset),
// ecx = pixels remaining, ymm5 = 0x00ff00ff byte mask.
void SplitUVRow_Unaligned_AVX2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
                               int pix) {
  __asm {
    push       edi
    mov        eax, [esp + 4 + 4]    // src_uv
    mov        edx, [esp + 4 + 8]    // dst_u
    mov        edi, [esp + 4 + 12]   // dst_v
    mov        ecx, [esp + 4 + 16]   // pix
    vpcmpeqb   ymm5, ymm5, ymm5      // generate mask 0x00ff00ff
    vpsrlw     ymm5, ymm5, 8
    sub        edi, edx              // edi = dst_v - dst_u; single pointer bump
    align      16
  convertloop:
    vmovdqu    ymm0, [eax]
    vmovdqu    ymm1, [eax + 32]
    lea        eax,  [eax + 64]
    vpsrlw     ymm2, ymm0, 8         // odd bytes (V)
    vpsrlw     ymm3, ymm1, 8
    vpand      ymm0, ymm0, ymm5      // even bytes (U)
    vpand      ymm1, ymm1, ymm5
    vpackuswb  ymm0, ymm0, ymm1      // pack within 128-bit lanes...
    vpackuswb  ymm2, ymm2, ymm3
    vpermq     ymm0, ymm0, 0xd8      // ...then fix cross-lane ordering
    vpermq     ymm2, ymm2, 0xd8
    vmovdqu    [edx], ymm0
    vmovdqu    [edx + edi], ymm2
    lea        edx, [edx + 32]
    sub        ecx, 32
    jg         convertloop

    vzeroupper                       // avoid AVX-SSE transition penalty in caller
    pop        edi
    ret
  }
}
#endif // HAS_SPLITUVROW_AVX2
__declspec(naked) __declspec(align(16)) __declspec(naked) __declspec(align(16))
void MergeUVRow_Unaligned_SSE2(const uint8* src_u, const uint8* src_v, void MergeUVRow_Unaligned_SSE2(const uint8* src_u, const uint8* src_v,
uint8* dst_uv, int width) { uint8* dst_uv, int width) {
......
...@@ -81,18 +81,14 @@ cglobal SplitUVRow%2, 4, 4, 5, src_uv, dst_u, dst_v, pix ...@@ -81,18 +81,14 @@ cglobal SplitUVRow%2, 4, 4, 5, src_uv, dst_u, dst_v, pix
mov%1 m0, [src_uvq] mov%1 m0, [src_uvq]
mov%1 m1, [src_uvq + mmsize] mov%1 m1, [src_uvq + mmsize]
lea src_uvq, [src_uvq + mmsize * 2] lea src_uvq, [src_uvq + mmsize * 2]
mova m2, m0 psrlw m2, m0, 8 ; odd bytes
mova m3, m1 psrlw m3, m1, 8
pand m0, m0, m4 ; even bytes pand m0, m0, m4 ; even bytes
pand m1, m1, m4 pand m1, m1, m4
packuswb m0, m0, m1 packuswb m0, m0, m1
%if cpuflag(AVX2)
vpermq m0, m0, 0xd8
%endif
psrlw m2, m2, 8 ; odd bytes
psrlw m3, m3, 8
packuswb m2, m2, m3 packuswb m2, m2, m3
%if cpuflag(AVX2) %if cpuflag(AVX2)
vpermq m0, m0, 0xd8
vpermq m2, m2, 0xd8 vpermq m2, m2, 0xd8
%endif %endif
mov%1 [dst_uq], m0 mov%1 [dst_uq], m0
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment