Commit 00d526d4 authored by Frank Barchard, committed by Commit Bot

H010ToARGB_AVX2 optimized conversion

AVX2 optimized 10 bit YUV to ARGB.

Bug: libyuv:751
Test: H010ToARGB unittest
Change-Id: I705630beb62714b52042c2a5dcdb8b7859e734ae
Reviewed-on: https://chromium-review.googlesource.com/852563
Commit-Queue: Frank Barchard <fbarchard@chromium.org>
Reviewed-by: Miguel Casas <mcasas@chromium.org>
Reviewed-by: richard winterton <rrwinterton@gmail.com>
parent 55310f92
...@@ -279,6 +279,7 @@ extern "C" { ...@@ -279,6 +279,7 @@ extern "C" {
#define HAS_ARGBTOAR30ROW_AVX2 #define HAS_ARGBTOAR30ROW_AVX2
#define HAS_CONVERT16TO8ROW_AVX2 #define HAS_CONVERT16TO8ROW_AVX2
#define HAS_CONVERT8TO16ROW_AVX2 #define HAS_CONVERT8TO16ROW_AVX2
#define HAS_I210TOARGBROW_AVX2
#define HAS_MERGEUVROW_16_AVX2 #define HAS_MERGEUVROW_16_AVX2
#define HAS_MULTIPLYROW_16_AVX2 #define HAS_MULTIPLYROW_16_AVX2
#endif #endif
...@@ -1850,6 +1851,12 @@ void I210ToARGBRow_SSSE3(const uint16* src_y, ...@@ -1850,6 +1851,12 @@ void I210ToARGBRow_SSSE3(const uint16* src_y,
uint8* dst_argb, uint8* dst_argb,
const struct YuvConstants* yuvconstants, const struct YuvConstants* yuvconstants,
int width); int width);
// AVX2 row function: converts `width` pixels from three 16 bit planes
// (src_y, src_u, src_v) to 8 bit ARGB in dst_argb using `yuvconstants`.
// The dispatch code selects this variant only when width is a multiple of 16;
// other widths go through I210ToARGBRow_Any_AVX2.
void I210ToARGBRow_AVX2(const uint16* src_y,
const uint16* src_u,
const uint16* src_v,
uint8* dst_argb,
const struct YuvConstants* yuvconstants,
int width);
void I422AlphaToARGBRow_SSSE3(const uint8* y_buf, void I422AlphaToARGBRow_SSSE3(const uint8* y_buf,
const uint8* u_buf, const uint8* u_buf,
const uint8* v_buf, const uint8* v_buf,
...@@ -2000,6 +2007,12 @@ void I210ToARGBRow_Any_SSSE3(const uint16* src_y, ...@@ -2000,6 +2007,12 @@ void I210ToARGBRow_Any_SSSE3(const uint16* src_y,
uint8* dst_argb, uint8* dst_argb,
const struct YuvConstants* yuvconstants, const struct YuvConstants* yuvconstants,
int width); int width);
// "Any width" companion of I210ToARGBRow_AVX2 with the same parameters.
// The dispatch code installs it whenever AVX2 is available and replaces it
// with the plain AVX2 row function only when width is a multiple of 16.
void I210ToARGBRow_Any_AVX2(const uint16* src_y,
const uint16* src_u,
const uint16* src_v,
uint8* dst_argb,
const struct YuvConstants* yuvconstants,
int width);
void I422AlphaToARGBRow_Any_SSSE3(const uint8* y_buf, void I422AlphaToARGBRow_Any_SSSE3(const uint8* y_buf,
const uint8* u_buf, const uint8* u_buf,
const uint8* v_buf, const uint8* v_buf,
......
...@@ -448,6 +448,14 @@ static int H010ToAR30Matrix(const uint16* src_y, ...@@ -448,6 +448,14 @@ static int H010ToAR30Matrix(const uint16* src_y,
} }
} }
#endif #endif
#if defined(HAS_I210TOARGBROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
I210ToARGBRow = I210ToARGBRow_Any_AVX2;
if (IS_ALIGNED(width, 16)) {
I210ToARGBRow = I210ToARGBRow_AVX2;
}
}
#endif
#if defined(HAS_ARGBTOAR30ROW_SSSE3) #if defined(HAS_ARGBTOAR30ROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) { if (TestCpuFlag(kCpuHasSSSE3)) {
ARGBToAR30Row = ARGBToAR30Row_Any_SSSE3; ARGBToAR30Row = ARGBToAR30Row_Any_SSSE3;
...@@ -537,7 +545,14 @@ static int I010ToARGBMatrix(const uint16* src_y, ...@@ -537,7 +545,14 @@ static int I010ToARGBMatrix(const uint16* src_y,
} }
} }
#endif #endif
#if defined(HAS_I210TOARGBROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
I210ToARGBRow = I210ToARGBRow_Any_AVX2;
if (IS_ALIGNED(width, 16)) {
I210ToARGBRow = I210ToARGBRow_AVX2;
}
}
#endif
for (y = 0; y < height; ++y) { for (y = 0; y < height; ++y) {
I210ToARGBRow(src_y, src_u, src_v, dst_argb, yuvconstants, width); I210ToARGBRow(src_y, src_u, src_v, dst_argb, yuvconstants, width);
dst_argb += dst_stride_argb; dst_argb += dst_stride_argb;
......
...@@ -194,9 +194,8 @@ ANY31C(I422ToRGB565Row_Any_MSA, I422ToRGB565Row_MSA, 1, 0, 2, 7) ...@@ -194,9 +194,8 @@ ANY31C(I422ToRGB565Row_Any_MSA, I422ToRGB565Row_MSA, 1, 0, 2, 7)
#endif #endif
#undef ANY31C #undef ANY31C
// 64 byte per row for future AVX2
// Any 3 planes of 16 bit to 1 with yuvconstants // Any 3 planes of 16 bit to 1 with yuvconstants
// TODO(fbarchard): consider // TODO(fbarchard): consider sharing this code with ANY31C
#define ANY31CT(NAMEANY, ANY_SIMD, UVSHIFT, DUVSHIFT, T, SBPP, BPP, MASK) \ #define ANY31CT(NAMEANY, ANY_SIMD, UVSHIFT, DUVSHIFT, T, SBPP, BPP, MASK) \
void NAMEANY(const T* y_buf, const T* u_buf, const T* v_buf, uint8* dst_ptr, \ void NAMEANY(const T* y_buf, const T* u_buf, const T* v_buf, uint8* dst_ptr, \
const struct YuvConstants* yuvconstants, int width) { \ const struct YuvConstants* yuvconstants, int width) { \
...@@ -218,6 +217,9 @@ ANY31C(I422ToRGB565Row_Any_MSA, I422ToRGB565Row_MSA, 1, 0, 2, 7) ...@@ -218,6 +217,9 @@ ANY31C(I422ToRGB565Row_Any_MSA, I422ToRGB565Row_MSA, 1, 0, 2, 7)
#ifdef HAS_I210TOARGBROW_SSSE3 #ifdef HAS_I210TOARGBROW_SSSE3
ANY31CT(I210ToARGBRow_Any_SSSE3, I210ToARGBRow_SSSE3, 1, 0, uint16, 2, 4, 7) ANY31CT(I210ToARGBRow_Any_SSSE3, I210ToARGBRow_SSSE3, 1, 0, uint16, 2, 4, 7)
#endif #endif
#ifdef HAS_I210TOARGBROW_AVX2
// Generates I210ToARGBRow_Any_AVX2 around I210ToARGBRow_AVX2.
// Arguments follow ANY31CT's parameter order: UVSHIFT = 1, DUVSHIFT = 0,
// T = uint16 (source sample type), SBPP = 2, BPP = 4, MASK = 15.
// MASK is 15 (vs 7 for the SSSE3 variant) because the AVX2 kernel steps
// 16 pixels per loop iteration.
ANY31CT(I210ToARGBRow_Any_AVX2, I210ToARGBRow_AVX2, 1, 0, uint16, 2, 4, 15)
#endif
#undef ANY31CT #undef ANY31CT
// Any 2 planes to 1. // Any 2 planes to 1.
......
...@@ -1627,7 +1627,7 @@ void RGBAToUVRow_SSSE3(const uint8* src_rgba0, ...@@ -1627,7 +1627,7 @@ void RGBAToUVRow_SSSE3(const uint8* src_rgba0,
// TODO(fbarchard): Consider shufb to replace pack/unpack // TODO(fbarchard): Consider shufb to replace pack/unpack
// TODO(fbarchard): Consider pmulhuw to replace psraw // TODO(fbarchard): Consider pmulhuw to replace psraw
// TODO(fbarchard): Consider pmullw to replace psllw and allow different bits. // TODO(fbarchard): Consider pmullw to replace psllw and allow different bits.
#define READYUV422_10 \ #define READYUV210 \
"movq " MEMACCESS([u_buf]) ",%%xmm0 \n" \ "movq " MEMACCESS([u_buf]) ",%%xmm0 \n" \
MEMOPREG(movq, 0x00, [u_buf], [v_buf], 1, xmm1) \ MEMOPREG(movq, 0x00, [u_buf], [v_buf], 1, xmm1) \
"lea " MEMLEA(0x8, [u_buf]) ",%[u_buf] \n" \ "lea " MEMLEA(0x8, [u_buf]) ",%[u_buf] \n" \
...@@ -1637,7 +1637,7 @@ void RGBAToUVRow_SSSE3(const uint8* src_rgba0, ...@@ -1637,7 +1637,7 @@ void RGBAToUVRow_SSSE3(const uint8* src_rgba0,
"punpcklwd %%xmm0,%%xmm0 \n" \ "punpcklwd %%xmm0,%%xmm0 \n" \
"movdqu " MEMACCESS([y_buf]) ",%%xmm4 \n" \ "movdqu " MEMACCESS([y_buf]) ",%%xmm4 \n" \
"psllw $0x6,%%xmm4 \n" \ "psllw $0x6,%%xmm4 \n" \
"lea " MEMLEA(0x10, [y_buf]) ",%[y_buf] \n" "lea " MEMLEA(0x10, [y_buf]) ",%[y_buf] \n"
// Read 4 UV from 422, upsample to 8 UV. With 8 Alpha. // Read 4 UV from 422, upsample to 8 UV. With 8 Alpha.
#define READYUVA422 \ #define READYUVA422 \
...@@ -1892,7 +1892,7 @@ void OMITFP I210ToARGBRow_SSSE3(const uint16* y_buf, ...@@ -1892,7 +1892,7 @@ void OMITFP I210ToARGBRow_SSSE3(const uint16* y_buf,
LABELALIGN LABELALIGN
"1: \n" "1: \n"
READYUV422_10 READYUV210
YUVTORGB(yuvconstants) YUVTORGB(yuvconstants)
STOREARGB STOREARGB
"sub $0x8,%[width] \n" "sub $0x8,%[width] \n"
...@@ -1968,7 +1968,7 @@ void OMITFP NV12ToARGBRow_SSSE3(const uint8* y_buf, ...@@ -1968,7 +1968,7 @@ void OMITFP NV12ToARGBRow_SSSE3(const uint8* y_buf,
[dst_argb]"+r"(dst_argb), // %[dst_argb] [dst_argb]"+r"(dst_argb), // %[dst_argb]
[width]"+rm"(width) // %[width] [width]"+rm"(width) // %[width]
: [yuvconstants]"r"(yuvconstants) // %[yuvconstants] : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
: "memory", "cc", YUVTORGB_REGS // Does not use r14. : "memory", "cc", YUVTORGB_REGS
"xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
); );
// clang-format on // clang-format on
...@@ -2116,6 +2116,23 @@ void OMITFP I422ToRGBARow_SSSE3(const uint8* y_buf, ...@@ -2116,6 +2116,23 @@ void OMITFP I422ToRGBARow_SSSE3(const uint8* y_buf,
"vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \ "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \
"lea " MEMLEA(0x10, [y_buf]) ",%[y_buf] \n" "lea " MEMLEA(0x10, [y_buf]) ",%[y_buf] \n"
// Read 8 UV from 210 10 bit, upsample to 16 UV
// Also reads 16 Y (32 bytes) into ymm4 and shifts each word left by 6.
// Steps, in order:
//   - load 16 bytes (8 words) of U into xmm0 and 16 bytes into xmm1 through
//     MEMOPREG with base u_buf and index v_buf; the caller subtracts u_buf
//     from v_buf first, so v_buf holds an offset here (presumably addressing
//     the V plane; MEMOPREG itself is not shown in this hunk).
//   - advance u_buf by 0x10 bytes.
//   - vpermq $0xd8 reorders the 64 bit quarters to 0,2,1,3 so each 128 bit
//     lane has 4 source words in its low half for the per-lane unpack.
//   - vpunpcklwd interleaves the U and V words into UV pairs.
//   - vpsraw $2 drops the 2 low bits (10 bit -> 8 bit range).
//   - vpackuswb narrows the words to bytes with unsigned saturation.
//   - vpunpcklwd of ymm0 with itself repeats every UV byte pair, giving
//     16 UV pairs for 16 Y.
//   - advance y_buf by 0x20 bytes.
// Results are left in ymm0 (UV) and ymm4 (Y); ymm1 is used as scratch.
// TODO(fbarchard): Consider vshufb to replace pack/unpack
// TODO(fbarchard): Consider vunpcklpd to combine the 2 registers into 1.
#define READYUV210_AVX2 \
"vmovdqu " MEMACCESS([u_buf]) ",%%xmm0 \n" \
MEMOPREG(vmovdqu, 0x00, [u_buf], [v_buf], 1, xmm1) \
"lea " MEMLEA(0x10, [u_buf]) ",%[u_buf] \n" \
"vpermq $0xd8,%%ymm0,%%ymm0 \n" \
"vpermq $0xd8,%%ymm1,%%ymm1 \n" \
"vpunpcklwd %%ymm1,%%ymm0,%%ymm0 \n" \
"vpsraw $0x2,%%ymm0,%%ymm0 \n" \
"vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" \
"vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n" \
"vmovdqu " MEMACCESS([y_buf]) ",%%ymm4 \n" \
"vpsllw $0x6,%%ymm4,%%ymm4 \n" \
"lea " MEMLEA(0x20, [y_buf]) ",%[y_buf] \n"
// Read 8 UV from 422, upsample to 16 UV. With 16 Alpha. // Read 8 UV from 422, upsample to 16 UV. With 16 Alpha.
#define READYUVA422_AVX2 \ #define READYUVA422_AVX2 \
"vmovq " MEMACCESS([u_buf]) ",%%xmm0 \n" \ "vmovq " MEMACCESS([u_buf]) ",%%xmm0 \n" \
...@@ -2308,6 +2325,41 @@ void OMITFP I422ToARGBRow_AVX2(const uint8* y_buf, ...@@ -2308,6 +2325,41 @@ void OMITFP I422ToARGBRow_AVX2(const uint8* y_buf,
} }
#endif // HAS_I422TOARGBROW_AVX2 #endif // HAS_I422TOARGBROW_AVX2
#if defined(HAS_I210TOARGBROW_AVX2)
// 16 pixels
// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
//
// Converts one row of 16 bit Y, U and V samples to ARGB with AVX2.
//   y_buf, u_buf, v_buf: source rows of 16 bit samples.
//   dst_argb:            destination row.
//   yuvconstants:        conversion constants handed to the YUVTORGB_* macros.
//   width:               pixel count. The loop subtracts 16 per pass and
//                        repeats while the remainder is positive, so a width
//                        that is not a multiple of 16 still runs a full
//                        16 pixel final pass. The dispatch code only selects
//                        this function directly when width is 16 aligned.
void OMITFP I210ToARGBRow_AVX2(const uint16* y_buf,
const uint16* u_buf,
const uint16* v_buf,
uint8* dst_argb,
const struct YuvConstants* yuvconstants,
int width) {
asm volatile (
YUVTORGB_SETUP_AVX2(yuvconstants)
// Turn v_buf into an offset from u_buf so READYUV210_AVX2 can address it
// relative to u_buf while advancing only u_buf.
"sub %[u_buf],%[v_buf] \n"
// ymm5 = all bits set; presumably the opaque alpha consumed by
// STOREARGB_AVX2 (macro not shown in this hunk).
"vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
LABELALIGN
"1: \n"
READYUV210_AVX2
YUVTORGB_AVX2(yuvconstants)
STOREARGB_AVX2
// 16 pixels handled per pass; loop while pixels remain.
"sub $0x10,%[width] \n"
"jg 1b \n"
// Zero the upper halves of the ymm registers before leaving AVX code.
"vzeroupper \n"
: [y_buf]"+r"(y_buf), // %[y_buf]
[u_buf]"+r"(u_buf), // %[u_buf]
[v_buf]"+r"(v_buf), // %[v_buf]
[dst_argb]"+r"(dst_argb), // %[dst_argb]
[width]"+rm"(width) // %[width]
: [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
: "memory", "cc", YUVTORGB_REGS_AVX2
"xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
);
}
#endif // HAS_I210TOARGBROW_AVX2
#if defined(HAS_I422ALPHATOARGBROW_AVX2) #if defined(HAS_I422ALPHATOARGBROW_AVX2)
// 16 pixels // 16 pixels
// 8 UV values upsampled to 16 UV, mixed with 16 Y and 16 A producing 16 ARGB. // 8 UV values upsampled to 16 UV, mixed with 16 Y and 16 A producing 16 ARGB.
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment