Commit 5790a765 authored by Frank Barchard

I422ToUYVYRow_AVX2 use vpmovzxbw instead of vpermq

I422ToUYVYRow_AVX2 optimized from 7 cycles per 32 pixels to 4.6 cycles.
Instead of 2 vpermq and vpunpcklbw:
vmovdqu    (%1),%%xmm2
vmovdqu    0x00(%1,%2,1),%%xmm3
vpermq     $0xd8,%%ymm2,%%ymm2
vpermq     $0xd8,%%ymm3,%%ymm3
vpunpcklbw %%ymm3,%%ymm2,%%ymm2

...use vpmovzxbw to zero-extend the bytes to 16-bit words, then vpsllw and vpor:
vpmovzxbw  (%1),%%ymm1
vpmovzxbw  0x00(%1,%2,1),%%ymm2
vpsllw     $0x8,%%ymm2,%%ymm2
vpor       %%ymm1,%%ymm2,%%ymm2
which reduces the port 5 bottleneck by 1 cycle.

Bug: libyuv:556
Test: out/Release/libyuv_unittest --gtest_filter=*I42?To*UY*Opt

Change-Id: I53799e53cc6b090a1a695c839094c193be3eecaf
Reviewed-on: https://chromium-review.googlesource.com/899873
Commit-Queue: Frank Barchard <fbarchard@chromium.org>
Reviewed-by: richard winterton <rrwinterton@gmail.com>
Reviewed-by: Cheng Wang <wangcheng@google.com>
parent 7ff53f32
Name: libyuv Name: libyuv
URL: http://code.google.com/p/libyuv/ URL: http://code.google.com/p/libyuv/
Version: 1695 Version: 1696
License: BSD License: BSD
License File: LICENSE License File: LICENSE
......
...@@ -11,6 +11,6 @@ ...@@ -11,6 +11,6 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_ #ifndef INCLUDE_LIBYUV_VERSION_H_
#define INCLUDE_LIBYUV_VERSION_H_ #define INCLUDE_LIBYUV_VERSION_H_
#define LIBYUV_VERSION 1695 #define LIBYUV_VERSION 1696
#endif // INCLUDE_LIBYUV_VERSION_H_ #endif // INCLUDE_LIBYUV_VERSION_H_
...@@ -5973,19 +5973,20 @@ void ARGBShuffleRow_AVX2(const uint8_t* src_argb, ...@@ -5973,19 +5973,20 @@ void ARGBShuffleRow_AVX2(const uint8_t* src_argb,
void I422ToYUY2Row_SSE2(const uint8_t* src_y, void I422ToYUY2Row_SSE2(const uint8_t* src_y,
const uint8_t* src_u, const uint8_t* src_u,
const uint8_t* src_v, const uint8_t* src_v,
uint8_t* dst_frame, uint8_t* dst_yuy2,
int width) { int width) {
asm volatile( asm volatile(
"sub %1,%2 \n" "sub %1,%2 \n"
LABELALIGN LABELALIGN
"1: \n" "1: \n"
"movq (%1),%%xmm2 \n" "movq (%1),%%xmm2 \n"
"movq 0x00(%1,%2,1),%%xmm3 \n" "movq 0x00(%1,%2,1),%%xmm1 \n"
"lea 0x8(%1),%1 \n" "add $0x8,%1 \n"
"punpcklbw %%xmm3,%%xmm2 \n" "punpcklbw %%xmm1,%%xmm2 \n"
"movdqu (%0),%%xmm0 \n" "movdqu (%0),%%xmm0 \n"
"lea 0x10(%0),%0 \n" "add $0x10,%0 \n"
"movdqa %%xmm0,%%xmm1 \n" "movdqa %%xmm0,%%xmm1 \n"
"punpcklbw %%xmm2,%%xmm0 \n" "punpcklbw %%xmm2,%%xmm0 \n"
"punpckhbw %%xmm2,%%xmm1 \n" "punpckhbw %%xmm2,%%xmm1 \n"
...@@ -5997,10 +5998,10 @@ void I422ToYUY2Row_SSE2(const uint8_t* src_y, ...@@ -5997,10 +5998,10 @@ void I422ToYUY2Row_SSE2(const uint8_t* src_y,
: "+r"(src_y), // %0 : "+r"(src_y), // %0
"+r"(src_u), // %1 "+r"(src_u), // %1
"+r"(src_v), // %2 "+r"(src_v), // %2
"+r"(dst_frame), // %3 "+r"(dst_yuy2), // %3
"+rm"(width) // %4 "+rm"(width) // %4
: :
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3"); : "memory", "cc", "xmm0", "xmm1", "xmm2");
} }
#endif // HAS_I422TOYUY2ROW_SSE2 #endif // HAS_I422TOYUY2ROW_SSE2
...@@ -6008,7 +6009,7 @@ void I422ToYUY2Row_SSE2(const uint8_t* src_y, ...@@ -6008,7 +6009,7 @@ void I422ToYUY2Row_SSE2(const uint8_t* src_y,
void I422ToUYVYRow_SSE2(const uint8_t* src_y, void I422ToUYVYRow_SSE2(const uint8_t* src_y,
const uint8_t* src_u, const uint8_t* src_u,
const uint8_t* src_v, const uint8_t* src_v,
uint8_t* dst_frame, uint8_t* dst_uyvy,
int width) { int width) {
asm volatile( asm volatile(
...@@ -6017,12 +6018,12 @@ void I422ToUYVYRow_SSE2(const uint8_t* src_y, ...@@ -6017,12 +6018,12 @@ void I422ToUYVYRow_SSE2(const uint8_t* src_y,
LABELALIGN LABELALIGN
"1: \n" "1: \n"
"movq (%1),%%xmm2 \n" "movq (%1),%%xmm2 \n"
"movq 0x00(%1,%2,1),%%xmm3 \n" "movq 0x00(%1,%2,1),%%xmm1 \n"
"lea 0x8(%1),%1 \n" "add $0x8,%1 \n"
"punpcklbw %%xmm3,%%xmm2 \n" "punpcklbw %%xmm1,%%xmm2 \n"
"movdqu (%0),%%xmm0 \n" "movdqu (%0),%%xmm0 \n"
"movdqa %%xmm2,%%xmm1 \n" "movdqa %%xmm2,%%xmm1 \n"
"lea 0x10(%0),%0 \n" "add $0x10,%0 \n"
"punpcklbw %%xmm0,%%xmm1 \n" "punpcklbw %%xmm0,%%xmm1 \n"
"punpckhbw %%xmm0,%%xmm2 \n" "punpckhbw %%xmm0,%%xmm2 \n"
"movdqu %%xmm1,(%3) \n" "movdqu %%xmm1,(%3) \n"
...@@ -6033,20 +6034,18 @@ void I422ToUYVYRow_SSE2(const uint8_t* src_y, ...@@ -6033,20 +6034,18 @@ void I422ToUYVYRow_SSE2(const uint8_t* src_y,
: "+r"(src_y), // %0 : "+r"(src_y), // %0
"+r"(src_u), // %1 "+r"(src_u), // %1
"+r"(src_v), // %2 "+r"(src_v), // %2
"+r"(dst_frame), // %3 "+r"(dst_uyvy), // %3
"+rm"(width) // %4 "+rm"(width) // %4
: :
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3"); : "memory", "cc", "xmm0", "xmm1", "xmm2");
} }
#endif // HAS_I422TOUYVYROW_SSE2 #endif // HAS_I422TOUYVYROW_SSE2
#ifdef HAS_I422TOYUY2ROW_AVX2 #ifdef HAS_I422TOYUY2ROW_AVX2
// TODO(fbarchard): Consider vmovhps to avoid vpermq
void I422ToYUY2Row_AVX2(const uint8_t* src_y, void I422ToYUY2Row_AVX2(const uint8_t* src_y,
const uint8_t* src_u, const uint8_t* src_u,
const uint8_t* src_v, const uint8_t* src_v,
uint8_t* dst_frame, uint8_t* dst_yuy2,
int width) { int width) {
asm volatile( asm volatile(
...@@ -6054,19 +6053,19 @@ void I422ToYUY2Row_AVX2(const uint8_t* src_y, ...@@ -6054,19 +6053,19 @@ void I422ToYUY2Row_AVX2(const uint8_t* src_y,
LABELALIGN LABELALIGN
"1: \n" "1: \n"
"vpmovzxbw (%1),%%ymm2 \n" "vpmovzxbw (%1),%%ymm1 \n"
"vpmovzxbw 0x00(%1,%2,1),%%ymm3 \n" "vpmovzxbw 0x00(%1,%2,1),%%ymm2 \n"
"add $0x10,%1 \n" "add $0x10,%1 \n"
"vpsllw $0x8,%%ymm3,%%ymm3 \n" "vpsllw $0x8,%%ymm2,%%ymm2 \n"
"vpor %%ymm3,%%ymm2,%%ymm2 \n" "vpor %%ymm1,%%ymm2,%%ymm2 \n"
"vmovdqu (%0),%%ymm0 \n" "vmovdqu (%0),%%ymm0 \n"
"add $0x20,%0 \n" "add $0x20,%0 \n"
"vpunpckhbw %%ymm2,%%ymm0,%%ymm1 \n" "vpunpcklbw %%ymm2,%%ymm0,%%ymm1 \n"
"vpunpcklbw %%ymm2,%%ymm0,%%ymm0 \n" "vpunpckhbw %%ymm2,%%ymm0,%%ymm2 \n"
"vextractf128 $0x0,%%ymm0,(%3) \n" "vextractf128 $0x0,%%ymm1,(%3) \n"
"vextractf128 $0x0,%%ymm1,0x10(%3) \n" "vextractf128 $0x0,%%ymm2,0x10(%3) \n"
"vextractf128 $0x1,%%ymm0,0x20(%3) \n" "vextractf128 $0x1,%%ymm1,0x20(%3) \n"
"vextractf128 $0x1,%%ymm1,0x30(%3) \n" "vextractf128 $0x1,%%ymm2,0x30(%3) \n"
"lea 0x40(%3),%3 \n" "lea 0x40(%3),%3 \n"
"sub $0x20,%4 \n" "sub $0x20,%4 \n"
"jg 1b \n" "jg 1b \n"
...@@ -6074,10 +6073,10 @@ void I422ToYUY2Row_AVX2(const uint8_t* src_y, ...@@ -6074,10 +6073,10 @@ void I422ToYUY2Row_AVX2(const uint8_t* src_y,
: "+r"(src_y), // %0 : "+r"(src_y), // %0
"+r"(src_u), // %1 "+r"(src_u), // %1
"+r"(src_v), // %2 "+r"(src_v), // %2
"+r"(dst_frame), // %3 "+r"(dst_yuy2), // %3
"+rm"(width) // %4 "+rm"(width) // %4
: :
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3"); : "memory", "cc", "xmm0", "xmm1", "xmm2");
} }
#endif // HAS_I422TOYUY2ROW_AVX2 #endif // HAS_I422TOYUY2ROW_AVX2
...@@ -6085,7 +6084,7 @@ void I422ToYUY2Row_AVX2(const uint8_t* src_y, ...@@ -6085,7 +6084,7 @@ void I422ToYUY2Row_AVX2(const uint8_t* src_y,
void I422ToUYVYRow_AVX2(const uint8_t* src_y, void I422ToUYVYRow_AVX2(const uint8_t* src_y,
const uint8_t* src_u, const uint8_t* src_u,
const uint8_t* src_v, const uint8_t* src_v,
uint8_t* dst_frame, uint8_t* dst_uyvy,
int width) { int width) {
asm volatile( asm volatile(
...@@ -6093,20 +6092,19 @@ void I422ToUYVYRow_AVX2(const uint8_t* src_y, ...@@ -6093,20 +6092,19 @@ void I422ToUYVYRow_AVX2(const uint8_t* src_y,
LABELALIGN LABELALIGN
"1: \n" "1: \n"
"vmovdqu (%1),%%xmm2 \n" "vpmovzxbw (%1),%%ymm1 \n"
"vmovdqu 0x00(%1,%2,1),%%xmm3 \n" "vpmovzxbw 0x00(%1,%2,1),%%ymm2 \n"
"lea 0x10(%1),%1 \n" "add $0x10,%1 \n"
"vpermq $0xd8,%%ymm2,%%ymm2 \n" "vpsllw $0x8,%%ymm2,%%ymm2 \n"
"vpermq $0xd8,%%ymm3,%%ymm3 \n" "vpor %%ymm1,%%ymm2,%%ymm2 \n"
"vpunpcklbw %%ymm3,%%ymm2,%%ymm2 \n"
"vmovdqu (%0),%%ymm0 \n" "vmovdqu (%0),%%ymm0 \n"
"lea 0x20(%0),%0 \n" "add $0x20,%0 \n"
"vpermq $0xd8,%%ymm2,%%ymm2 \n"
"vpermq $0xd8,%%ymm0,%%ymm0 \n"
"vpunpcklbw %%ymm0,%%ymm2,%%ymm1 \n" "vpunpcklbw %%ymm0,%%ymm2,%%ymm1 \n"
"vpunpckhbw %%ymm0,%%ymm2,%%ymm2 \n" "vpunpckhbw %%ymm0,%%ymm2,%%ymm2 \n"
"vmovdqu %%ymm1,(%3) \n" "vextractf128 $0x0,%%ymm1,(%3) \n"
"vmovdqu %%ymm2,0x20(%3) \n" "vextractf128 $0x0,%%ymm2,0x10(%3) \n"
"vextractf128 $0x1,%%ymm1,0x20(%3) \n"
"vextractf128 $0x1,%%ymm2,0x30(%3) \n"
"lea 0x40(%3),%3 \n" "lea 0x40(%3),%3 \n"
"sub $0x20,%4 \n" "sub $0x20,%4 \n"
"jg 1b \n" "jg 1b \n"
...@@ -6114,10 +6112,10 @@ void I422ToUYVYRow_AVX2(const uint8_t* src_y, ...@@ -6114,10 +6112,10 @@ void I422ToUYVYRow_AVX2(const uint8_t* src_y,
: "+r"(src_y), // %0 : "+r"(src_y), // %0
"+r"(src_u), // %1 "+r"(src_u), // %1
"+r"(src_v), // %2 "+r"(src_v), // %2
"+r"(dst_frame), // %3 "+r"(dst_uyvy), // %3
"+rm"(width) // %4 "+rm"(width) // %4
: :
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3"); : "memory", "cc", "xmm0", "xmm1", "xmm2");
} }
#endif // HAS_I422TOUYVYROW_AVX2 #endif // HAS_I422TOUYVYROW_AVX2
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment