Commit 7ff53f32 authored by Frank Barchard, committed by Commit Bot

I422ToYUY2Row_AVX2: use vpmovzxbw instead of vpermq

I422ToYUY2Row_AVX2 optimized from 7 cycles per 32 pixels to 6 cycles.
Instead of 2 vpermq and vpunpcklbw:
vmovdqu    (%1),%%xmm2
vmovdqu    0x00(%1,%2,1),%%xmm3
lea        0x10(%1),%1
vpermq     $0xd8,%%ymm2,%%ymm2
vpermq     $0xd8,%%ymm3,%%ymm3
vpunpcklbw %%ymm3,%%ymm2,%%ymm2

...use vpmovzxbw to expand the bytes to shorts, then vpsllw and vpor:
vpmovzxbw  (%1),%%ymm2
vpmovzxbw  0x00(%1,%2,1),%%ymm3
vpsllw     $0x8,%%ymm3,%%ymm3
vpor       %%ymm3,%%ymm2,%%ymm2
which reduces the port 5 bottleneck by 1 cycle.

Bug: libyuv:556
Test: out/Release/libyuv_unittest --gtest_filter=*I42?To*UY*Opt

I422ToYUY2Row_AVX2 optimization

Improve performance of AVX2 code by avoiding vpermq

Bug: libyuv:556
Test: /usr/local/google/home/fbarchard/iaca-lin64/bin/iaca.sh -reduceout -arch BDW out/Release/obj/libyuv_internal/row_gcc.o
Change-Id: Ie36732da23ecea1ffcc6b297bacc962780b59ef1
Reviewed-on: https://chromium-review.googlesource.com/898067
Commit-Queue: Frank Barchard <fbarchard@chromium.org>
Reviewed-by: richard winterton <rrwinterton@gmail.com>
parent 664c7356
...@@ -5978,7 +5978,6 @@ void I422ToYUY2Row_SSE2(const uint8_t* src_y, ...@@ -5978,7 +5978,6 @@ void I422ToYUY2Row_SSE2(const uint8_t* src_y,
asm volatile( asm volatile(
"sub %1,%2 \n" "sub %1,%2 \n"
LABELALIGN LABELALIGN
"1: \n" "1: \n"
"movq (%1),%%xmm2 \n" "movq (%1),%%xmm2 \n"
...@@ -6055,20 +6054,19 @@ void I422ToYUY2Row_AVX2(const uint8_t* src_y, ...@@ -6055,20 +6054,19 @@ void I422ToYUY2Row_AVX2(const uint8_t* src_y,
LABELALIGN LABELALIGN
"1: \n" "1: \n"
"vmovdqu (%1),%%xmm2 \n" "vpmovzxbw (%1),%%ymm2 \n"
"vmovdqu 0x00(%1,%2,1),%%xmm3 \n" "vpmovzxbw 0x00(%1,%2,1),%%ymm3 \n"
"lea 0x10(%1),%1 \n" "add $0x10,%1 \n"
"vpermq $0xd8,%%ymm2,%%ymm2 \n" "vpsllw $0x8,%%ymm3,%%ymm3 \n"
"vpermq $0xd8,%%ymm3,%%ymm3 \n" "vpor %%ymm3,%%ymm2,%%ymm2 \n"
"vpunpcklbw %%ymm3,%%ymm2,%%ymm2 \n"
"vmovdqu (%0),%%ymm0 \n" "vmovdqu (%0),%%ymm0 \n"
"lea 0x20(%0),%0 \n" "add $0x20,%0 \n"
"vpermq $0xd8,%%ymm2,%%ymm2 \n"
"vpermq $0xd8,%%ymm0,%%ymm0 \n"
"vpunpckhbw %%ymm2,%%ymm0,%%ymm1 \n" "vpunpckhbw %%ymm2,%%ymm0,%%ymm1 \n"
"vpunpcklbw %%ymm2,%%ymm0,%%ymm0 \n" "vpunpcklbw %%ymm2,%%ymm0,%%ymm0 \n"
"vmovdqu %%ymm0,(%3) \n" "vextractf128 $0x0,%%ymm0,(%3) \n"
"vmovdqu %%ymm1,0x20(%3) \n" "vextractf128 $0x0,%%ymm1,0x10(%3) \n"
"vextractf128 $0x1,%%ymm0,0x20(%3) \n"
"vextractf128 $0x1,%%ymm1,0x30(%3) \n"
"lea 0x40(%3),%3 \n" "lea 0x40(%3),%3 \n"
"sub $0x20,%4 \n" "sub $0x20,%4 \n"
"jg 1b \n" "jg 1b \n"
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment