Commit c95fd579 authored by Frank Barchard, committed by Commit Bot

AVX2 port of I010ToAR30_AVX2

Was SSSE3 I420ToAR30_Opt (635 ms)
Now AVX2  I420ToAR30_Opt (446 ms)

Bug: libyuv:751
Test:  LibYUVConvertTest.H010ToAR30_Opt
Change-Id: I261be19ec981136a8f453ae0d3211532a790e5c5
Reviewed-on: https://chromium-review.googlesource.com/887750
Reviewed-by: Frank Barchard <fbarchard@chromium.org>
Reviewed-by: Cheng Wang <wangcheng@google.com>
Commit-Queue: Frank Barchard <fbarchard@chromium.org>
parent 3f43ecc0
Name: libyuv Name: libyuv
URL: http://code.google.com/p/libyuv/ URL: http://code.google.com/p/libyuv/
Version: 1692 Version: 1693
License: BSD License: BSD
License File: LICENSE License File: LICENSE
......
...@@ -272,6 +272,7 @@ extern "C" { ...@@ -272,6 +272,7 @@ extern "C" {
#define HAS_CONVERT16TO8ROW_AVX2 #define HAS_CONVERT16TO8ROW_AVX2
#define HAS_CONVERT8TO16ROW_AVX2 #define HAS_CONVERT8TO16ROW_AVX2
#define HAS_I210TOARGBROW_AVX2 #define HAS_I210TOARGBROW_AVX2
#define HAS_I422TOAR30ROW_AVX2
#define HAS_MERGEUVROW_16_AVX2 #define HAS_MERGEUVROW_16_AVX2
#define HAS_MULTIPLYROW_16_AVX2 #define HAS_MULTIPLYROW_16_AVX2
#endif #endif
...@@ -1905,6 +1906,12 @@ void I210ToARGBRow_SSSE3(const uint16_t* src_y, ...@@ -1905,6 +1906,12 @@ void I210ToARGBRow_SSSE3(const uint16_t* src_y,
uint8_t* dst_argb, uint8_t* dst_argb,
const struct YuvConstants* yuvconstants, const struct YuvConstants* yuvconstants,
int width); int width);
// AVX2 row function: reads the 8-bit planes src_y, src_u and src_v and writes
// AR30 pixels to dst_ar30, using the coefficients in yuvconstants.
// The inline-asm implementation handles 16 pixels per loop iteration.
// NOTE(review): callers presumably need width to be a multiple of 16 or must
// use the _Any_ variant - confirm against the call sites.
void I422ToAR30Row_AVX2(const uint8_t* src_y,
const uint8_t* src_u,
const uint8_t* src_v,
uint8_t* dst_ar30,
const struct YuvConstants* yuvconstants,
int width);
void I210ToARGBRow_AVX2(const uint16_t* src_y, void I210ToARGBRow_AVX2(const uint16_t* src_y,
const uint16_t* src_u, const uint16_t* src_u,
const uint16_t* src_v, const uint16_t* src_v,
...@@ -2073,6 +2080,12 @@ void I210ToARGBRow_Any_SSSE3(const uint16_t* src_y, ...@@ -2073,6 +2080,12 @@ void I210ToARGBRow_Any_SSSE3(const uint16_t* src_y,
uint8_t* dst_argb, uint8_t* dst_argb,
const struct YuvConstants* yuvconstants, const struct YuvConstants* yuvconstants,
int width); int width);
// Wrapper around I422ToAR30Row_AVX2 that is instantiated with the ANY31C
// macro; it takes the same parameters as the row function it wraps.
// NOTE(review): presumably accepts widths that are not a multiple of 16 -
// confirm against the ANY31C definition.
void I422ToAR30Row_Any_AVX2(const uint8_t* src_y,
const uint8_t* src_u,
const uint8_t* src_v,
uint8_t* dst_ar30,
const struct YuvConstants* yuvconstants,
int width);
void I210ToARGBRow_Any_AVX2(const uint16_t* src_y, void I210ToARGBRow_Any_AVX2(const uint16_t* src_y,
const uint16_t* src_u, const uint16_t* src_u,
const uint16_t* src_v, const uint16_t* src_v,
......
...@@ -11,6 +11,6 @@ ...@@ -11,6 +11,6 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_ #ifndef INCLUDE_LIBYUV_VERSION_H_
#define INCLUDE_LIBYUV_VERSION_H_ #define INCLUDE_LIBYUV_VERSION_H_
#define LIBYUV_VERSION 1692 #define LIBYUV_VERSION 1693
#endif // INCLUDE_LIBYUV_VERSION_H_ #endif // INCLUDE_LIBYUV_VERSION_H_
...@@ -148,6 +148,9 @@ ANY31C(I422ToARGBRow_Any_SSSE3, I422ToARGBRow_SSSE3, 1, 0, 4, 7) ...@@ -148,6 +148,9 @@ ANY31C(I422ToARGBRow_Any_SSSE3, I422ToARGBRow_SSSE3, 1, 0, 4, 7)
#ifdef HAS_I422TOAR30ROW_SSSE3 #ifdef HAS_I422TOAR30ROW_SSSE3
ANY31C(I422ToAR30Row_Any_SSSE3, I422ToAR30Row_SSSE3, 1, 0, 4, 7) ANY31C(I422ToAR30Row_Any_SSSE3, I422ToAR30Row_SSSE3, 1, 0, 4, 7)
#endif #endif
#ifdef HAS_I422TOAR30ROW_AVX2
// Instantiates I422ToAR30Row_Any_AVX2 on top of I422ToAR30Row_AVX2.
// NOTE(review): the trailing 15 is presumably the width mask matching the
// 16-pixel step of the AVX2 loop (the SSSE3 entry above passes 7) - confirm
// against the ANY31C definition.
ANY31C(I422ToAR30Row_Any_AVX2, I422ToAR30Row_AVX2, 1, 0, 4, 15)
#endif
#ifdef HAS_I444TOARGBROW_SSSE3 #ifdef HAS_I444TOARGBROW_SSSE3
ANY31C(I444ToARGBRow_Any_SSSE3, I444ToARGBRow_SSSE3, 0, 0, 4, 7) ANY31C(I444ToARGBRow_Any_SSSE3, I444ToARGBRow_SSSE3, 0, 0, 4, 7)
ANY31C(I422ToRGBARow_Any_SSSE3, I422ToRGBARow_SSSE3, 1, 0, 4, 7) ANY31C(I422ToRGBARow_Any_SSSE3, I422ToRGBARow_SSSE3, 1, 0, 4, 7)
......
...@@ -2274,7 +2274,7 @@ void OMITFP I422ToRGBARow_SSSE3(const uint8_t* y_buf, ...@@ -2274,7 +2274,7 @@ void OMITFP I422ToRGBARow_SSSE3(const uint8_t* y_buf,
"vmovdqa 160(%[yuvconstants]),%%ymm13 \n" \ "vmovdqa 160(%[yuvconstants]),%%ymm13 \n" \
"vmovdqa 192(%[yuvconstants]),%%ymm14 \n" "vmovdqa 192(%[yuvconstants]),%%ymm14 \n"
#define YUVTORGB_AVX2(yuvconstants) \ #define YUVTORGB16_AVX2(yuvconstants) \
"vpmaddubsw %%ymm10,%%ymm0,%%ymm2 \n" \ "vpmaddubsw %%ymm10,%%ymm0,%%ymm2 \n" \
"vpmaddubsw %%ymm9,%%ymm0,%%ymm1 \n" \ "vpmaddubsw %%ymm9,%%ymm0,%%ymm1 \n" \
"vpmaddubsw %%ymm8,%%ymm0,%%ymm0 \n" \ "vpmaddubsw %%ymm8,%%ymm0,%%ymm0 \n" \
...@@ -2284,13 +2284,7 @@ void OMITFP I422ToRGBARow_SSSE3(const uint8_t* y_buf, ...@@ -2284,13 +2284,7 @@ void OMITFP I422ToRGBARow_SSSE3(const uint8_t* y_buf,
"vpmulhuw %%ymm14,%%ymm4,%%ymm4 \n" \ "vpmulhuw %%ymm14,%%ymm4,%%ymm4 \n" \
"vpaddsw %%ymm4,%%ymm0,%%ymm0 \n" \ "vpaddsw %%ymm4,%%ymm0,%%ymm0 \n" \
"vpaddsw %%ymm4,%%ymm1,%%ymm1 \n" \ "vpaddsw %%ymm4,%%ymm1,%%ymm1 \n" \
"vpaddsw %%ymm4,%%ymm2,%%ymm2 \n" \ "vpaddsw %%ymm4,%%ymm2,%%ymm2 \n"
"vpsraw $0x6,%%ymm0,%%ymm0 \n" \
"vpsraw $0x6,%%ymm1,%%ymm1 \n" \
"vpsraw $0x6,%%ymm2,%%ymm2 \n" \
"vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" \
"vpackuswb %%ymm1,%%ymm1,%%ymm1 \n" \
"vpackuswb %%ymm2,%%ymm2,%%ymm2 \n"
#define YUVTORGB_REGS_AVX2 \ #define YUVTORGB_REGS_AVX2 \
"xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14",
...@@ -2298,7 +2292,7 @@ void OMITFP I422ToRGBARow_SSSE3(const uint8_t* y_buf, ...@@ -2298,7 +2292,7 @@ void OMITFP I422ToRGBARow_SSSE3(const uint8_t* y_buf,
#else // Convert 16 pixels: 16 UV and 16 Y. #else // Convert 16 pixels: 16 UV and 16 Y.
#define YUVTORGB_SETUP_AVX2(yuvconstants) #define YUVTORGB_SETUP_AVX2(yuvconstants)
#define YUVTORGB_AVX2(yuvconstants) \ #define YUVTORGB16_AVX2(yuvconstants) \
"vpmaddubsw 64(%[yuvconstants]),%%ymm0,%%ymm2 \n" \ "vpmaddubsw 64(%[yuvconstants]),%%ymm0,%%ymm2 \n" \
"vpmaddubsw 32(%[yuvconstants]),%%ymm0,%%ymm1 \n" \ "vpmaddubsw 32(%[yuvconstants]),%%ymm0,%%ymm1 \n" \
"vpmaddubsw (%[yuvconstants]),%%ymm0,%%ymm0 \n" \ "vpmaddubsw (%[yuvconstants]),%%ymm0,%%ymm0 \n" \
...@@ -2311,15 +2305,18 @@ void OMITFP I422ToRGBARow_SSSE3(const uint8_t* y_buf, ...@@ -2311,15 +2305,18 @@ void OMITFP I422ToRGBARow_SSSE3(const uint8_t* y_buf,
"vpmulhuw 192(%[yuvconstants]),%%ymm4,%%ymm4 \n" \ "vpmulhuw 192(%[yuvconstants]),%%ymm4,%%ymm4 \n" \
"vpaddsw %%ymm4,%%ymm0,%%ymm0 \n" \ "vpaddsw %%ymm4,%%ymm0,%%ymm0 \n" \
"vpaddsw %%ymm4,%%ymm1,%%ymm1 \n" \ "vpaddsw %%ymm4,%%ymm1,%%ymm1 \n" \
"vpaddsw %%ymm4,%%ymm2,%%ymm2 \n" \ "vpaddsw %%ymm4,%%ymm2,%%ymm2 \n"
#define YUVTORGB_REGS_AVX2
#endif
#define YUVTORGB_AVX2(yuvconstants) \
YUVTORGB16_AVX2(yuvconstants) \
"vpsraw $0x6,%%ymm0,%%ymm0 \n" \ "vpsraw $0x6,%%ymm0,%%ymm0 \n" \
"vpsraw $0x6,%%ymm1,%%ymm1 \n" \ "vpsraw $0x6,%%ymm1,%%ymm1 \n" \
"vpsraw $0x6,%%ymm2,%%ymm2 \n" \ "vpsraw $0x6,%%ymm2,%%ymm2 \n" \
"vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" \ "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" \
"vpackuswb %%ymm1,%%ymm1,%%ymm1 \n" \ "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n" \
"vpackuswb %%ymm2,%%ymm2,%%ymm2 \n" "vpackuswb %%ymm2,%%ymm2,%%ymm2 \n"
#define YUVTORGB_REGS_AVX2
#endif
// Store 16 ARGB values. // Store 16 ARGB values.
#define STOREARGB_AVX2 \ #define STOREARGB_AVX2 \
...@@ -2333,6 +2330,33 @@ void OMITFP I422ToRGBARow_SSSE3(const uint8_t* y_buf, ...@@ -2333,6 +2330,33 @@ void OMITFP I422ToRGBARow_SSSE3(const uint8_t* y_buf,
"vmovdqu %%ymm0,0x20(%[dst_argb]) \n" \ "vmovdqu %%ymm0,0x20(%[dst_argb]) \n" \
"lea 0x40(%[dst_argb]), %[dst_argb] \n" "lea 0x40(%[dst_argb]), %[dst_argb] \n"
// Store 16 AR30 values.
// Inputs: ymm0, ymm1 and ymm2 each hold 16 signed 16-bit channel values, as
// left by YUVTORGB16_AVX2 (NOTE(review): presumably B, G, R in that order -
// confirm against the YUV-to-RGB macro).
// Expects the caller to have preloaded, per 16-bit word:
//   ymm5 = alpha bits (0x0030 in I422ToAR30Row_AVX2),
//   ymm6 = lower clamp bound, ymm7 = upper clamp bound.
// Steps:
//   1. Arithmetic shift right by 4, then clamp every word to [ymm6, ymm7].
//   2. Shift ymm2 left by 4 so that, once it occupies the upper half of a
//      dword, the value starts at bit 20.
//   3. vpermq $0xd8 swaps the two middle quadwords so the lane-local unpacks
//      below produce pixels 0-7 and 8-15 in sequential order.
//   4. Interleave ymm0 (low word) with ymm2 (high word); interleave ymm1 (low
//      word) with ymm5 (high word) and shift those dwords left by 10.
//   5. OR both halves, giving per dword:
//      ymm0 | ymm1 << 10 | ymm2 << 20 | ymm5 << 26.
//   6. Store 64 bytes at dst_ar30 and advance dst_ar30 by 64.
// Uses ymm3 as scratch for the upper 8 pixels.
#define STOREAR30_AVX2 \
"vpsraw $0x4,%%ymm0,%%ymm0 \n" \
"vpsraw $0x4,%%ymm1,%%ymm1 \n" \
"vpsraw $0x4,%%ymm2,%%ymm2 \n" \
"vpminsw %%ymm7,%%ymm0,%%ymm0 \n" \
"vpminsw %%ymm7,%%ymm1,%%ymm1 \n" \
"vpminsw %%ymm7,%%ymm2,%%ymm2 \n" \
"vpmaxsw %%ymm6,%%ymm0,%%ymm0 \n" \
"vpmaxsw %%ymm6,%%ymm1,%%ymm1 \n" \
"vpmaxsw %%ymm6,%%ymm2,%%ymm2 \n" \
"vpsllw $0x4,%%ymm2,%%ymm2 \n" \
"vpermq $0xd8,%%ymm0,%%ymm0 \n" \
"vpermq $0xd8,%%ymm1,%%ymm1 \n" \
"vpermq $0xd8,%%ymm2,%%ymm2 \n" \
"vpunpckhwd %%ymm2,%%ymm0,%%ymm3 \n" \
"vpunpcklwd %%ymm2,%%ymm0,%%ymm0 \n" \
"vpunpckhwd %%ymm5,%%ymm1,%%ymm2 \n" \
"vpunpcklwd %%ymm5,%%ymm1,%%ymm1 \n" \
"vpslld $0xa,%%ymm1,%%ymm1 \n" \
"vpslld $0xa,%%ymm2,%%ymm2 \n" \
"vpor %%ymm1,%%ymm0,%%ymm0 \n" \
"vpor %%ymm2,%%ymm3,%%ymm3 \n" \
"vmovdqu %%ymm0,(%[dst_ar30]) \n" \
"vmovdqu %%ymm3,0x20(%[dst_ar30]) \n" \
"lea 0x40(%[dst_ar30]), %[dst_ar30] \n"
#ifdef HAS_I444TOARGBROW_AVX2 #ifdef HAS_I444TOARGBROW_AVX2
// 16 pixels // 16 pixels
// 16 UV values with 16 Y producing 16 ARGB (64 bytes). // 16 UV values with 16 Y producing 16 ARGB (64 bytes).
...@@ -2402,6 +2426,46 @@ void OMITFP I422ToARGBRow_AVX2(const uint8_t* y_buf, ...@@ -2402,6 +2426,46 @@ void OMITFP I422ToARGBRow_AVX2(const uint8_t* y_buf,
} }
#endif // HAS_I422TOARGBROW_AVX2 #endif // HAS_I422TOARGBROW_AVX2
#if defined(HAS_I422TOAR30ROW_AVX2)
// Converts one row of 8-bit planar YUV (y_buf, u_buf, v_buf) to AR30 with
// AVX2, using the coefficients in yuvconstants.
// 16 pixels
// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 AR30 (64 bytes).
//
// The loop always processes a full group of 16 pixels and repeats while the
// remaining width is still positive, so width is expected to be a positive
// multiple of 16.
// NOTE(review): other widths presumably go through I422ToAR30Row_Any_AVX2 -
// confirm against the callers.
//
// Constants prepared before the loop and consumed by STOREAR30_AVX2:
//   ymm5 = 0x0030 in every 16-bit word (ends up as the two alpha bits)
//   ymm6 = 0     (lower clamp bound)
//   ymm7 = 1023  (upper clamp bound for the 10-bit channels)
void OMITFP I422ToAR30Row_AVX2(const uint8_t* y_buf,
const uint8_t* u_buf,
const uint8_t* v_buf,
uint8_t* dst_ar30,
const struct YuvConstants* yuvconstants,
int width) {
asm volatile (
YUVTORGB_SETUP_AVX2(yuvconstants)
// v_buf becomes the offset of the V plane relative to u_buf.
// NOTE(review): presumably so READYUV422_AVX2 can address V through u_buf -
// that macro is not shown here.
"sub %[u_buf],%[v_buf] \n"
"vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" // AR30 constants: all bits set
"vpsrlw $14,%%ymm5,%%ymm5 \n" // 3 in every 16-bit word
"vpsllw $4,%%ymm5,%%ymm5 \n" // 2 alpha bits, pre-shifted to 0x0030
"vpxor %%ymm6,%%ymm6,%%ymm6 \n" // 0: lower clamp bound (used by vpmaxsw)
"vpcmpeqb %%ymm7,%%ymm7,%%ymm7 \n" // 1023: upper clamp bound (used by vpminsw)
"vpsrlw $6,%%ymm7,%%ymm7 \n" // 0xffff >> 6 = 0x03ff
LABELALIGN
"1: \n"
READYUV422_AVX2
// 16-bit variant: leaves word results in ymm0/ymm1/ymm2 without the >>6 and
// byte packing that YUVTORGB_AVX2 appends.
YUVTORGB16_AVX2(yuvconstants)
STOREAR30_AVX2
"sub $0x10,%[width] \n" // 16 pixels consumed per iteration
"jg 1b \n" // loop while width is still positive
"vzeroupper \n" // clear the upper ymm halves before returning
: [y_buf]"+r"(y_buf), // %[y_buf]
[u_buf]"+r"(u_buf), // %[u_buf]
[v_buf]"+r"(v_buf), // %[v_buf]
[dst_ar30]"+r"(dst_ar30), // %[dst_ar30]
[width]"+rm"(width) // %[width]
: [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
// YUVTORGB_REGS_AVX2 adds xmm8-xmm14 to the clobbers in the variant that
// caches the constants in registers, and is empty otherwise.
: "memory", "cc", YUVTORGB_REGS_AVX2
"xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
);
}
#endif // HAS_I422TOAR30ROW_AVX2
#if defined(HAS_I210TOARGBROW_AVX2) #if defined(HAS_I210TOARGBROW_AVX2)
// 16 pixels // 16 pixels
// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes). // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
......
...@@ -594,6 +594,7 @@ TESTPLANARTOB(I422, 2, 1, UYVY, 2, 4, 1, 0, ARGB, 4) ...@@ -594,6 +594,7 @@ TESTPLANARTOB(I422, 2, 1, UYVY, 2, 4, 1, 0, ARGB, 4)
TESTPLANARTOB(I420, 2, 2, I400, 1, 1, 1, 0, ARGB, 4) TESTPLANARTOB(I420, 2, 2, I400, 1, 1, 1, 0, ARGB, 4)
TESTPLANARTOB(J420, 2, 2, J400, 1, 1, 1, 0, ARGB, 4) TESTPLANARTOB(J420, 2, 2, J400, 1, 1, 1, 0, ARGB, 4)
TESTPLANARTOB(I420, 2, 2, AR30, 4, 4, 1, 0, ARGB, 4) TESTPLANARTOB(I420, 2, 2, AR30, 4, 4, 1, 0, ARGB, 4)
TESTPLANARTOB(H420, 2, 2, AR30, 4, 4, 1, 0, ARGB, 4)
// TESTPLANARTOB(I420, 2, 2, AR30, 4, 4, 1, 0, ABGR, 4) // TESTPLANARTOB(I420, 2, 2, AR30, 4, 4, 1, 0, ABGR, 4)
#define TESTQPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \ #define TESTQPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment