Commit c9562334 authored by fbarchard@google.com

SplitUV 3 operand AVX2

BUG=none
TEST=none
Review URL: https://webrtc-codereview.appspot.com/1105005

git-svn-id: http://libyuv.googlecode.com/svn/trunk@568 16f28f9a-4ce2-e073-06de-1de4eb20be90
parent 20828059
 Name: libyuv
 URL: http://code.google.com/p/libyuv/
-Version: 567
+Version: 568
 License: BSD
 License File: LICENSE
......
@@ -128,6 +128,7 @@ extern "C" {
 // TODO(fbarchard): Hook these up to all functions. e.g. format conversion.
 #define HAS_ARGBTOYROW_AVX2
 #define HAS_ARGBTOUVROW_AVX2
+#define HAS_SPLITUVROW_AVX2
 #endif
 #endif

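The new HAS_SPLITUVROW_AVX2 define is only a compile-time gate; callers still select a row function at runtime. Below is a hedged sketch of the usual libyuv dispatch pattern, not part of this diff: ChooseSplitUVRow is a hypothetical helper, while TestCpuFlag, kCpuHasAVX2, IS_ALIGNED, SplitUVRow_C and SplitUVRow_AVX2 are existing libyuv names.

#include "libyuv/cpu_id.h"
#include "libyuv/row.h"

// Hypothetical helper showing how a HAS_ define is typically consumed:
// compile-time gate plus a runtime CPU check, falling back to the C version.
typedef void (*SplitUVRowFn)(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
                             int pix);

static SplitUVRowFn ChooseSplitUVRow(int width) {
  SplitUVRowFn fn = SplitUVRow_C;  // portable fallback, always available
#if defined(HAS_SPLITUVROW_AVX2)
  if (TestCpuFlag(kCpuHasAVX2) && IS_ALIGNED(width, 32)) {
    fn = SplitUVRow_AVX2;  // aligned kernel added below; real callers also
                           // check pointer and stride alignment first
  }
#endif
  return fn;
}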
......
@@ -11,6 +11,6 @@
 #ifndef INCLUDE_LIBYUV_VERSION_H_  // NOLINT
 #define INCLUDE_LIBYUV_VERSION_H_

-#define LIBYUV_VERSION 567
+#define LIBYUV_VERSION 568

 #endif  // INCLUDE_LIBYUV_VERSION_H_  NOLINT
@@ -3238,6 +3238,81 @@ void MergeUVRow_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
   }
 }

+#ifdef HAS_SPLITUVROW_AVX2
+__declspec(naked) __declspec(align(16))
+void SplitUVRow_AVX2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {
+  __asm {
+    push       edi
+    mov        eax, [esp + 4 + 4]    // src_uv
+    mov        edx, [esp + 4 + 8]    // dst_u
+    mov        edi, [esp + 4 + 12]   // dst_v
+    mov        ecx, [esp + 4 + 16]   // pix
+    vpcmpeqb   ymm5, ymm5, ymm5      // generate mask 0x00ff00ff
+    vpsrlw     ymm5, ymm5, 8
+    sub        edi, edx
+
+    align      16
+  convertloop:
+    vmovdqa    ymm0, [eax]
+    vmovdqa    ymm1, [eax + 32]
+    lea        eax,  [eax + 64]
+    vpsrlw     ymm2, ymm0, 8         // odd bytes
+    vpsrlw     ymm3, ymm1, 8
+    vpand      ymm0, ymm0, ymm5      // even bytes
+    vpand      ymm1, ymm1, ymm5
+    vpackuswb  ymm0, ymm0, ymm1
+    vpackuswb  ymm2, ymm2, ymm3
+    vpermq     ymm0, ymm0, 0xd8
+    vpermq     ymm2, ymm2, 0xd8
+    vmovdqa    [edx], ymm0
+    vmovdqa    [edx + edi], ymm2
+    lea        edx, [edx + 32]
+    sub        ecx, 32
+    jg         convertloop
+
+    pop        edi
+    ret
+  }
+}
+
+__declspec(naked) __declspec(align(16))
+void SplitUVRow_Unaligned_AVX2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
+                               int pix) {
+  __asm {
+    push       edi
+    mov        eax, [esp + 4 + 4]    // src_uv
+    mov        edx, [esp + 4 + 8]    // dst_u
+    mov        edi, [esp + 4 + 12]   // dst_v
+    mov        ecx, [esp + 4 + 16]   // pix
+    vpcmpeqb   ymm5, ymm5, ymm5      // generate mask 0x00ff00ff
+    vpsrlw     ymm5, ymm5, 8
+    sub        edi, edx
+
+    align      16
+  convertloop:
+    vmovdqu    ymm0, [eax]
+    vmovdqu    ymm1, [eax + 32]
+    lea        eax,  [eax + 64]
+    vpsrlw     ymm2, ymm0, 8         // odd bytes
+    vpsrlw     ymm3, ymm1, 8
+    vpand      ymm0, ymm0, ymm5      // even bytes
+    vpand      ymm1, ymm1, ymm5
+    vpackuswb  ymm0, ymm0, ymm1
+    vpackuswb  ymm2, ymm2, ymm3
+    vpermq     ymm0, ymm0, 0xd8
+    vpermq     ymm2, ymm2, 0xd8
+    vmovdqu    [edx], ymm0
+    vmovdqu    [edx + edi], ymm2
+    lea        edx, [edx + 32]
+    sub        ecx, 32
+    jg         convertloop
+
+    pop        edi
+    ret
+  }
+}
+#endif  // HAS_SPLITUVROW_AVX2
+
 __declspec(naked) __declspec(align(16))
 void MergeUVRow_Unaligned_SSE2(const uint8* src_u, const uint8* src_v,
                                uint8* dst_uv, int width) {
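Both Win32 kernels above compute the same deinterleave. As a reading aid, here is a scalar sketch of the row semantics (my code, mirroring what libyuv's SplitUVRow_C does; the AVX2 kernels additionally assume pix is a multiple of 32):

#include <stdint.h>

// Scalar sketch: split an interleaved UVUVUV... row into a U row (the even
// bytes) and a V row (the odd bytes).
static void SplitUVRow_Sketch(const uint8_t* src_uv, uint8_t* dst_u,
                              uint8_t* dst_v, int pix) {
  for (int x = 0; x < pix; ++x) {
    dst_u[x] = src_uv[2 * x];      // even byte of each pair -> U plane
    dst_v[x] = src_uv[2 * x + 1];  // odd byte of each pair  -> V plane
  }
}

The vector versions obtain the even bytes with the 0x00ff00ff vpand mask and the odd bytes with a 16-bit right shift, then narrow with vpackuswb; the trailing vpermq 0xd8 undoes the per-128-bit-lane interleave that the 256-bit pack leaves behind.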
......
@@ -81,18 +81,14 @@ cglobal SplitUVRow%2, 4, 4, 5, src_uv, dst_u, dst_v, pix
     mov%1      m0, [src_uvq]
     mov%1      m1, [src_uvq + mmsize]
     lea        src_uvq, [src_uvq + mmsize * 2]
-    mova       m2, m0
-    mova       m3, m1
+    psrlw      m2, m0, 8             ; odd bytes
+    psrlw      m3, m1, 8
     pand       m0, m0, m4            ; even bytes
     pand       m1, m1, m4
     packuswb   m0, m0, m1
-%if cpuflag(AVX2)
-    vpermq     m0, m0, 0xd8
-%endif
-    psrlw      m2, m2, 8             ; odd bytes
-    psrlw      m3, m3, 8
     packuswb   m2, m2, m3
 %if cpuflag(AVX2)
+    vpermq     m0, m0, 0xd8
     vpermq     m2, m2, 0xd8
 %endif
     mov%1      [dst_uq], m0
......
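The row_x86.asm hunk above rewrites the shared SplitUVRow macro in 3-operand form: the shifts now read m0/m1 and write m2/m3 directly, so the two mova register copies per iteration disappear, and the two vpermq lane fix-ups collapse into a single %if cpuflag(AVX2) block. A hedged intrinsics sketch of the same AVX2 data flow (my naming, not libyuv API; assumes pix is a multiple of 32):

#include <immintrin.h>
#include <stdint.h>

// 3-operand AVX2 flow: no explicit copies of uv0/uv1 are needed because each
// intrinsic, like each VEX-encoded instruction, names a separate destination.
static void SplitUVRow_AVX2_Sketch(const uint8_t* src_uv, uint8_t* dst_u,
                                   uint8_t* dst_v, int pix) {
  // 0x00ff00ff mask: all-ones bytes shifted right by 8 within each 16 bits.
  const __m256i mask = _mm256_srli_epi16(_mm256_set1_epi8(-1), 8);
  for (int x = 0; x < pix; x += 32) {
    __m256i uv0 = _mm256_loadu_si256((const __m256i*)(src_uv + 2 * x));
    __m256i uv1 = _mm256_loadu_si256((const __m256i*)(src_uv + 2 * x + 32));
    __m256i v0 = _mm256_srli_epi16(uv0, 8);    // odd bytes (V)
    __m256i v1 = _mm256_srli_epi16(uv1, 8);
    __m256i u0 = _mm256_and_si256(uv0, mask);  // even bytes (U)
    __m256i u1 = _mm256_and_si256(uv1, mask);
    // Pack 16-bit lanes back to bytes, then fix the 128-bit lane interleave.
    __m256i u = _mm256_permute4x64_epi64(_mm256_packus_epi16(u0, u1), 0xd8);
    __m256i v = _mm256_permute4x64_epi64(_mm256_packus_epi16(v0, v1), 0xd8);
    _mm256_storeu_si256((__m256i*)(dst_u + x), u);
    _mm256_storeu_si256((__m256i*)(dst_v + x), v);
  }
}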