Commit ef14972d authored by fbarchard@google.com

MergeUV AVX2 use vextractf128 to store results to avoid shuffling.

BUG=none
TESTED=intel sde on unittests
R=brucedawson@google.com

Review URL: https://webrtc-codereview.appspot.com/33369004

git-svn-id: http://libyuv.googlecode.com/svn/trunk@1178 16f28f9a-4ce2-e073-06de-1de4eb20be90
parent 147f7b70
Name: libyuv
URL: http://code.google.com/p/libyuv/
Version: 1177
Version: 1178
License: BSD
License File: LICENSE
......
......@@ -11,6 +11,6 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT
#define INCLUDE_LIBYUV_VERSION_H_
#define LIBYUV_VERSION 1177
#define LIBYUV_VERSION 1178
#endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT
......@@ -2446,8 +2446,6 @@ void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {
}
#endif // HAS_SPLITUVROW_SSE2
// TODO(fbarchard): Consider vpunpcklbw, vpunpckhbw, store-low1, store-low2,
// extract-high1, extract-high2.
#ifdef HAS_MERGEUVROW_AVX2
void MergeUVRow_AVX2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
int width) {
......@@ -2458,13 +2456,12 @@ void MergeUVRow_AVX2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
"vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
MEMOPREG(vmovdqu,0x00,0,1,1,ymm1) // vmovdqu (%0,%1,1),%%ymm1
"lea " MEMLEA(0x20,0) ",%0 \n"
"vpunpcklbw %%ymm1,%%ymm0,%%ymm2 \n"
"vpunpckhbw %%ymm1,%%ymm0,%%ymm0 \n"
"vperm2i128 $0x20,%%ymm0,%%ymm2,%%ymm1 \n"
"vperm2i128 $0x31,%%ymm0,%%ymm2,%%ymm2 \n"
"vmovdqu %%ymm1," MEMACCESS(2) " \n"
"vmovdqu %%ymm2," MEMACCESS2(0x20,2) " \n"
"vextractf128 $0x0,%%ymm2," MEMACCESS(2) " \n"
"vextractf128 $0x0,%%ymm0," MEMACCESS2(0x10,2) "\n"
"vextractf128 $0x1,%%ymm2," MEMACCESS2(0x20,2) "\n"
"vextractf128 $0x1,%%ymm0," MEMACCESS2(0x30,2) "\n"
"lea " MEMLEA(0x40,2) ",%2 \n"
"sub $0x20,%3 \n"
"jg 1b \n"
......
......@@ -2686,10 +2686,10 @@ void MergeUVRow_AVX2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
lea eax, [eax + 32]
vpunpcklbw ymm2, ymm0, ymm1 // low 16 UV pairs. mutated qqword 0,2
vpunpckhbw ymm0, ymm0, ymm1 // high 16 UV pairs. mutated qqword 1,3
vperm2i128 ymm1, ymm2, ymm0, 0x20 // low 128 of ymm2 and low 128 of ymm0
vperm2i128 ymm2, ymm2, ymm0, 0x31 // high 128 of ymm2 and high 128 of ymm0
vmovdqu [edi], ymm1
vmovdqu [edi + 32], ymm2
vextractf128 [edi], ymm2, 0 // bytes 0..15
vextractf128 [edi + 16], ymm0, 0 // bytes 16..31
vextractf128 [edi + 32], ymm2, 1 // bytes 32..47
vextractf128 [edi + 48], ymm0, 1 // bytes 48..63
lea edi, [edi + 64]
sub ecx, 32
jg convertloop
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment