Commit 664c7356 authored by Frank Barchard, committed by Frank Barchard

I420ToYUY2_AVX2 port

I420 and I422 To YUY2 and UYVY ported from SSE2 to AVX2.

Was SSE2
I420ToYUY2_Opt (135 ms)
I420ToUYVY_Opt (148 ms)
I422ToYUY2_Opt (145 ms)
I422ToUYVY_Opt (142 ms)

Now AVX2
I420ToYUY2_Opt (133 ms)
I420ToUYVY_Opt (130 ms)
I422ToYUY2_Opt (127 ms)
I422ToUYVY_Opt (137 ms)

Bug: libyuv:556
Test: out/Release/libyuv_unittest --sandbox_unittests --gtest_filter=*I42?To*UY*Opt
Change-Id: Ic35f97cee02dc009fd98785589ba17c7cf50bb35
Reviewed-on: https://chromium-review.googlesource.com/892493
Commit-Queue: Frank Barchard <fbarchard@chromium.org>
Reviewed-by: richard winterton <rrwinterton@gmail.com>
parent ffec313d
Name: libyuv
URL: http://code.google.com/p/libyuv/
Version: 1694
Version: 1695
License: BSD
License File: LICENSE
......
......@@ -276,6 +276,8 @@ extern "C" {
#define HAS_I210TOARGBROW_AVX2
#define HAS_I210TOAR30ROW_AVX2
#define HAS_I422TOAR30ROW_AVX2
#define HAS_I422TOUYVYROW_AVX2
#define HAS_I422TOYUY2ROW_AVX2
#define HAS_MERGEUVROW_16_AVX2
#define HAS_MULTIPLYROW_16_AVX2
#endif
......@@ -2412,8 +2414,12 @@ void ARGBToARGB1555Row_Any_SSE2(const uint8_t* src_argb,
void ARGBToARGB4444Row_Any_SSE2(const uint8_t* src_argb,
uint8_t* dst_rgb,
int width);
void ABGRToAR30Row_Any_SSSE3(const uint8_t* src_abgr, uint8_t* dst_ar30, int width);
void ARGBToAR30Row_Any_SSSE3(const uint8_t* src_argb, uint8_t* dst_ar30, int width);
void ABGRToAR30Row_Any_SSSE3(const uint8_t* src_abgr,
uint8_t* dst_ar30,
int width);
void ARGBToAR30Row_Any_SSSE3(const uint8_t* src_argb,
uint8_t* dst_ar30,
int width);
void ARGBToRGB565DitherRow_Any_SSE2(const uint8_t* src_argb,
uint8_t* dst_rgb,
......@@ -2433,8 +2439,12 @@ void ARGBToARGB1555Row_Any_AVX2(const uint8_t* src_argb,
void ARGBToARGB4444Row_Any_AVX2(const uint8_t* src_argb,
uint8_t* dst_rgb,
int width);
void ABGRToAR30Row_Any_AVX2(const uint8_t* src_abgr, uint8_t* dst_ar30, int width);
void ARGBToAR30Row_Any_AVX2(const uint8_t* src_argb, uint8_t* dst_ar30, int width);
void ABGRToAR30Row_Any_AVX2(const uint8_t* src_abgr,
uint8_t* dst_ar30,
int width);
void ARGBToAR30Row_Any_AVX2(const uint8_t* src_argb,
uint8_t* dst_ar30,
int width);
void ARGBToRGB24Row_Any_NEON(const uint8_t* src_argb,
uint8_t* dst_rgb,
......@@ -2840,6 +2850,26 @@ void I422ToUYVYRow_Any_SSE2(const uint8_t* src_y,
const uint8_t* src_v,
uint8_t* dst_uyvy,
int width);
// AVX2 row kernels that pack one row of planar Y, U and V into a single
// interleaved output row (YUY2 or UYVY). The kernels step 32 pixels per loop
// iteration; the conversion entry points select them only when
// IS_ALIGNED(width, 32) holds.
void I422ToYUY2Row_AVX2(const uint8_t* src_y,
const uint8_t* src_u,
const uint8_t* src_v,
uint8_t* dst_yuy2,
int width);
void I422ToUYVYRow_AVX2(const uint8_t* src_y,
const uint8_t* src_u,
const uint8_t* src_v,
uint8_t* dst_uyvy,
int width);
// "Any" variants of the kernels above. The conversion entry points pick these
// whenever AVX2 is available and switch to the plain kernels only for widths
// that are a multiple of 32. They are instantiated through the ANY31 macro.
void I422ToYUY2Row_Any_AVX2(const uint8_t* src_y,
const uint8_t* src_u,
const uint8_t* src_v,
uint8_t* dst_yuy2,
int width);
void I422ToUYVYRow_Any_AVX2(const uint8_t* src_y,
const uint8_t* src_u,
const uint8_t* src_v,
uint8_t* dst_uyvy,
int width);
void I422ToYUY2Row_NEON(const uint8_t* src_y,
const uint8_t* src_u,
const uint8_t* src_v,
......
......@@ -11,6 +11,6 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_
#define INCLUDE_LIBYUV_VERSION_H_
#define LIBYUV_VERSION 1694
#define LIBYUV_VERSION 1695
#endif // INCLUDE_LIBYUV_VERSION_H_
......@@ -219,6 +219,14 @@ int I422ToYUY2(const uint8_t* src_y,
}
}
#endif
#if defined(HAS_I422TOYUY2ROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
I422ToYUY2Row = I422ToYUY2Row_Any_AVX2;
if (IS_ALIGNED(width, 32)) {
I422ToYUY2Row = I422ToYUY2Row_AVX2;
}
}
#endif
#if defined(HAS_I422TOYUY2ROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
I422ToYUY2Row = I422ToYUY2Row_Any_NEON;
......@@ -270,6 +278,14 @@ int I420ToYUY2(const uint8_t* src_y,
}
}
#endif
#if defined(HAS_I422TOYUY2ROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
I422ToYUY2Row = I422ToYUY2Row_Any_AVX2;
if (IS_ALIGNED(width, 32)) {
I422ToYUY2Row = I422ToYUY2Row_AVX2;
}
}
#endif
#if defined(HAS_I422TOYUY2ROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
I422ToYUY2Row = I422ToYUY2Row_Any_NEON;
......@@ -341,6 +357,14 @@ int I422ToUYVY(const uint8_t* src_y,
}
}
#endif
#if defined(HAS_I422TOUYVYROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
I422ToUYVYRow = I422ToUYVYRow_Any_AVX2;
if (IS_ALIGNED(width, 32)) {
I422ToUYVYRow = I422ToUYVYRow_AVX2;
}
}
#endif
#if defined(HAS_I422TOUYVYROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
I422ToUYVYRow = I422ToUYVYRow_Any_NEON;
......@@ -400,6 +424,14 @@ int I420ToUYVY(const uint8_t* src_y,
}
}
#endif
#if defined(HAS_I422TOUYVYROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
I422ToUYVYRow = I422ToUYVYRow_Any_AVX2;
if (IS_ALIGNED(width, 32)) {
I422ToUYVYRow = I422ToUYVYRow_AVX2;
}
}
#endif
#if defined(HAS_I422TOUYVYROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
I422ToUYVYRow = I422ToUYVYRow_Any_NEON;
......
......@@ -583,6 +583,14 @@ int ARGBToYUY2(const uint8_t* src_argb,
}
}
#endif
#if defined(HAS_I422TOYUY2ROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
I422ToYUY2Row = I422ToYUY2Row_Any_AVX2;
if (IS_ALIGNED(width, 32)) {
I422ToYUY2Row = I422ToYUY2Row_AVX2;
}
}
#endif
#if defined(HAS_I422TOYUY2ROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
I422ToYUY2Row = I422ToYUY2Row_Any_NEON;
......@@ -712,6 +720,14 @@ int ARGBToUYVY(const uint8_t* src_argb,
}
}
#endif
#if defined(HAS_I422TOUYVYROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
I422ToUYVYRow = I422ToUYVYRow_Any_AVX2;
if (IS_ALIGNED(width, 32)) {
I422ToUYVYRow = I422ToUYVYRow_AVX2;
}
}
#endif
#if defined(HAS_I422TOUYVYROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
I422ToUYVYRow = I422ToUYVYRow_Any_NEON;
......
......@@ -96,6 +96,10 @@ ANY31(MergeRGBRow_Any_NEON, MergeRGBRow_NEON, 0, 0, 3, 15)
ANY31(I422ToYUY2Row_Any_SSE2, I422ToYUY2Row_SSE2, 1, 1, 4, 15)
ANY31(I422ToUYVYRow_Any_SSE2, I422ToUYVYRow_SSE2, 1, 1, 4, 15)
#endif
// Any-width wrappers around the AVX2 I422 -> YUY2 / UYVY packers.
// The trailing 31 is presumably the width mask matching the 32-pixel step of
// the AVX2 kernels (the SSE2 versions use 15) - confirm against the ANY31
// macro definition.
// NOTE(review): the UYVY wrapper is guarded by HAS_I422TOYUY2ROW_AVX2 rather
// than HAS_I422TOUYVYROW_AVX2, while its dispatch sites test the UYVY macro.
// This is harmless while both macros are defined together, but it would break
// the build if only one of them were enabled.
#ifdef HAS_I422TOYUY2ROW_AVX2
ANY31(I422ToYUY2Row_Any_AVX2, I422ToYUY2Row_AVX2, 1, 1, 4, 31)
ANY31(I422ToUYVYRow_Any_AVX2, I422ToUYVYRow_AVX2, 1, 1, 4, 31)
#endif
#ifdef HAS_I422TOYUY2ROW_NEON
ANY31(I422ToYUY2Row_Any_NEON, I422ToYUY2Row_NEON, 1, 1, 4, 15)
#endif
......
......@@ -6041,6 +6041,88 @@ void I422ToUYVYRow_SSE2(const uint8_t* src_y,
}
#endif // HAS_I422TOUYVYROW_SSE2
#ifdef HAS_I422TOYUY2ROW_AVX2
// TODO(fbarchard): Consider vmovhps to avoid vpermq
// Packs one row of planar Y, U and V bytes into interleaved YUY2 order
// (Y0 U0 Y1 V0 Y2 U1 Y3 V1 ...), 32 pixels per loop iteration.
//
// Each iteration reads 32 bytes from src_y and 16 bytes each from src_u and
// src_v, and writes 64 bytes to dst_frame.
//
// The loop body always runs at least once and repeats while the remaining
// width is > 0 after subtracting 32, so a width that is not a multiple of 32
// is rounded up to the next 32 pixels. Callers in this change select this
// kernel directly only when IS_ALIGNED(width, 32); other widths go through
// the _Any_ wrapper.
void I422ToYUY2Row_AVX2(const uint8_t* src_y,
const uint8_t* src_u,
const uint8_t* src_v,
uint8_t* dst_frame,
int width) {
asm volatile(
// Turn src_v into an offset from src_u so that a single pointer increment
// advances both chroma planes; V is then addressed as (%1,%2,1).
"sub %1,%2 \n"
LABELALIGN
"1: \n"
// Load 16 U and 16 V bytes into the low halves of ymm2 / ymm3. The permutes
// below rely on the VEX-encoded 128-bit loads zeroing the upper halves.
"vmovdqu (%1),%%xmm2 \n"
"vmovdqu 0x00(%1,%2,1),%%xmm3 \n"
"lea 0x10(%1),%1 \n"
// vpunpck* interleaves within each 128-bit lane only, so 0xd8 (qword order
// 0,2,1,3) first moves bytes 8..15 into the low half of the upper lane.
"vpermq $0xd8,%%ymm2,%%ymm2 \n"
"vpermq $0xd8,%%ymm3,%%ymm3 \n"
// ymm2 = U0 V0 U1 V1 ... U15 V15 (16 UV pairs, in order).
"vpunpcklbw %%ymm3,%%ymm2,%%ymm2 \n"
// Load 32 Y bytes.
"vmovdqu (%0),%%ymm0 \n"
"lea 0x20(%0),%0 \n"
// Pre-shuffle both UV and Y the same way so the per-lane unpacks below
// produce pixels 0..15 in ymm0 and pixels 16..31 in ymm1, already in order.
"vpermq $0xd8,%%ymm2,%%ymm2 \n"
"vpermq $0xd8,%%ymm0,%%ymm0 \n"
// Interleave Y (first) with UV (second): Y0 U0 Y1 V0 ...
"vpunpckhbw %%ymm2,%%ymm0,%%ymm1 \n"
"vpunpcklbw %%ymm2,%%ymm0,%%ymm0 \n"
// Store 64 output bytes (32 pixels at 2 bytes per pixel).
"vmovdqu %%ymm0,(%3) \n"
"vmovdqu %%ymm1,0x20(%3) \n"
"lea 0x40(%3),%3 \n"
// lea does not modify flags, so jg tests the result of this sub.
"sub $0x20,%4 \n"
"jg 1b \n"
// Clear the upper ymm halves before returning to non-AVX code.
"vzeroupper \n"
: "+r"(src_y), // %0
"+r"(src_u), // %1
"+r"(src_v), // %2
"+r"(dst_frame), // %3
"+rm"(width) // %4
:
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3");
}
#endif // HAS_I422TOYUY2ROW_AVX2
#ifdef HAS_I422TOUYVYROW_AVX2
// Packs one row of planar Y, U and V bytes into interleaved UYVY order
// (U0 Y0 V0 Y1 U1 Y2 V1 Y3 ...), 32 pixels per loop iteration.
//
// Each iteration reads 32 bytes from src_y and 16 bytes each from src_u and
// src_v, and writes 64 bytes to dst_frame.
//
// The loop body always runs at least once and repeats while the remaining
// width is > 0 after subtracting 32, so a width that is not a multiple of 32
// is rounded up to the next 32 pixels. Callers in this change select this
// kernel directly only when IS_ALIGNED(width, 32); other widths go through
// the _Any_ wrapper.
void I422ToUYVYRow_AVX2(const uint8_t* src_y,
const uint8_t* src_u,
const uint8_t* src_v,
uint8_t* dst_frame,
int width) {
asm volatile(
// Turn src_v into an offset from src_u so that a single pointer increment
// advances both chroma planes; V is then addressed as (%1,%2,1).
"sub %1,%2 \n"
LABELALIGN
"1: \n"
// Load 16 U and 16 V bytes into the low halves of ymm2 / ymm3. The permutes
// below rely on the VEX-encoded 128-bit loads zeroing the upper halves.
"vmovdqu (%1),%%xmm2 \n"
"vmovdqu 0x00(%1,%2,1),%%xmm3 \n"
"lea 0x10(%1),%1 \n"
// vpunpck* interleaves within each 128-bit lane only, so 0xd8 (qword order
// 0,2,1,3) first moves bytes 8..15 into the low half of the upper lane.
"vpermq $0xd8,%%ymm2,%%ymm2 \n"
"vpermq $0xd8,%%ymm3,%%ymm3 \n"
// ymm2 = U0 V0 U1 V1 ... U15 V15 (16 UV pairs, in order).
"vpunpcklbw %%ymm3,%%ymm2,%%ymm2 \n"
// Load 32 Y bytes.
"vmovdqu (%0),%%ymm0 \n"
"lea 0x20(%0),%0 \n"
// Pre-shuffle both UV and Y the same way so the per-lane unpacks below
// produce pixels 0..15 in ymm1 and pixels 16..31 in ymm2, already in order.
"vpermq $0xd8,%%ymm2,%%ymm2 \n"
"vpermq $0xd8,%%ymm0,%%ymm0 \n"
// Interleave UV (first) with Y (second): U0 Y0 V0 Y1 ... This is the only
// difference from the YUY2 kernel. The low unpack must come first because
// the high unpack overwrites ymm2.
"vpunpcklbw %%ymm0,%%ymm2,%%ymm1 \n"
"vpunpckhbw %%ymm0,%%ymm2,%%ymm2 \n"
// Store 64 output bytes (32 pixels at 2 bytes per pixel).
"vmovdqu %%ymm1,(%3) \n"
"vmovdqu %%ymm2,0x20(%3) \n"
"lea 0x40(%3),%3 \n"
// lea does not modify flags, so jg tests the result of this sub.
"sub $0x20,%4 \n"
"jg 1b \n"
// Clear the upper ymm halves before returning to non-AVX code.
"vzeroupper \n"
: "+r"(src_y), // %0
"+r"(src_u), // %1
"+r"(src_v), // %2
"+r"(dst_frame), // %3
"+rm"(width) // %4
:
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3");
}
#endif // HAS_I422TOUYVYROW_AVX2
#ifdef HAS_ARGBPOLYNOMIALROW_SSE2
void ARGBPolynomialRow_SSE2(const uint8_t* src_argb,
uint8_t* dst_argb,
......
......@@ -1324,6 +1324,7 @@ void ScaleARGBFilterCols_SSSE3(uint8_t* dst_argb,
"movd %%xmm0,(%0) \n"
LABELALIGN "99: \n" // clang-format error.
: "+r"(dst_argb), // %0
"+r"(src_argb), // %1
"+rm"(dst_width), // %2
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment