Commit 0bb2773a authored by Frank Barchard's avatar Frank Barchard Committed by Commit Bot

AVX2 versions of ABGRToNV12 and ABGRToNV21

BUG=libyuv:833

Change-Id: I9b6653e9c304b4e0805b7d3c8408ce57009c8559
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/1740682Reviewed-by: 's avatarHirokazu Honda <hiroh@chromium.org>
Commit-Queue: Frank Barchard <fbarchard@chromium.org>
parent 9b63884a
......@@ -284,6 +284,8 @@ extern "C" {
(defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER))) && \
(defined(CLANG_HAS_AVX2) || defined(GCC_HAS_AVX2))
#define HAS_ABGRTOAR30ROW_AVX2
#define HAS_ABGRTOUVROW_AVX2
#define HAS_ABGRTOYROW_AVX2
#define HAS_ARGBTOAR30ROW_AVX2
#define HAS_ARGBTORAWROW_AVX2
#define HAS_ARGBTORGB24ROW_AVX2
......@@ -912,6 +914,8 @@ void UYVYToARGBRow_MSA(const uint8_t* src_uyvy,
void ARGBToYRow_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width);
void ARGBToYRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void ABGRToYRow_AVX2(const uint8_t* src_abgr, uint8_t* dst_y, int width);
void ABGRToYRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void ARGBToYRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_y, int width);
void ARGBToYJRow_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width);
void ARGBToYJRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
......@@ -940,7 +944,7 @@ void ARGBToUV444Row_MSA(const uint8_t* src_argb,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
void ARGBToUVRow_MSA(const uint8_t* src_argb0,
void ARGBToUVRow_MSA(const uint8_t* src_argb,
int src_stride_argb,
uint8_t* dst_u,
uint8_t* dst_v,
......@@ -949,7 +953,7 @@ void ARGBToUV444Row_MMI(const uint8_t* src_argb,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
void ARGBToUVRow_MMI(const uint8_t* src_argb0,
void ARGBToUVRow_MMI(const uint8_t* src_argb,
int src_stride_argb,
uint8_t* dst_u,
uint8_t* dst_v,
......@@ -999,32 +1003,32 @@ void ARGB4444ToUVRow_NEON(const uint8_t* src_argb4444,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
void ARGBToUVJRow_MSA(const uint8_t* src_rgb0,
void ARGBToUVJRow_MSA(const uint8_t* src_rgb,
int src_stride_rgb,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
void BGRAToUVRow_MSA(const uint8_t* src_rgb0,
void BGRAToUVRow_MSA(const uint8_t* src_rgb,
int src_stride_rgb,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
void ABGRToUVRow_MSA(const uint8_t* src_rgb0,
void ABGRToUVRow_MSA(const uint8_t* src_rgb,
int src_stride_rgb,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
void RGBAToUVRow_MSA(const uint8_t* src_rgb0,
void RGBAToUVRow_MSA(const uint8_t* src_rgb,
int src_stride_rgb,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
void RGB24ToUVRow_MSA(const uint8_t* src_rgb0,
void RGB24ToUVRow_MSA(const uint8_t* src_rgb,
int src_stride_rgb,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
void RAWToUVRow_MSA(const uint8_t* src_rgb0,
void RAWToUVRow_MSA(const uint8_t* src_rgb,
int src_stride_rgb,
uint8_t* dst_u,
uint8_t* dst_v,
......@@ -1039,32 +1043,32 @@ void ARGB1555ToUVRow_MSA(const uint8_t* src_argb1555,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
void ARGBToUVJRow_MMI(const uint8_t* src_rgb0,
void ARGBToUVJRow_MMI(const uint8_t* src_rgb,
int src_stride_rgb,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
void BGRAToUVRow_MMI(const uint8_t* src_rgb0,
void BGRAToUVRow_MMI(const uint8_t* src_rgb,
int src_stride_rgb,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
void ABGRToUVRow_MMI(const uint8_t* src_rgb0,
void ABGRToUVRow_MMI(const uint8_t* src_rgb,
int src_stride_rgb,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
void RGBAToUVRow_MMI(const uint8_t* src_rgb0,
void RGBAToUVRow_MMI(const uint8_t* src_rgb,
int src_stride_rgb,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
void RGB24ToUVRow_MMI(const uint8_t* src_rgb0,
void RGB24ToUVRow_MMI(const uint8_t* src_rgb,
int src_stride_rgb,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
void RAWToUVRow_MMI(const uint8_t* src_rgb0,
void RAWToUVRow_MMI(const uint8_t* src_rgb,
int src_stride_rgb,
uint8_t* dst_u,
uint8_t* dst_v,
......@@ -1096,29 +1100,29 @@ void ARGB1555ToYRow_NEON(const uint8_t* src_argb1555,
void ARGB4444ToYRow_NEON(const uint8_t* src_argb4444,
uint8_t* dst_y,
int width);
void BGRAToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width);
void ABGRToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width);
void RGBAToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width);
void RGB24ToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width);
void RAWToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width);
void BGRAToYRow_MSA(const uint8_t* src_argb, uint8_t* dst_y, int width);
void ABGRToYRow_MSA(const uint8_t* src_argb, uint8_t* dst_y, int width);
void RGBAToYRow_MSA(const uint8_t* src_argb, uint8_t* dst_y, int width);
void RGB24ToYRow_MSA(const uint8_t* src_argb, uint8_t* dst_y, int width);
void RAWToYRow_MSA(const uint8_t* src_argb, uint8_t* dst_y, int width);
void RGB565ToYRow_MSA(const uint8_t* src_rgb565, uint8_t* dst_y, int width);
void ARGB1555ToYRow_MSA(const uint8_t* src_argb1555, uint8_t* dst_y, int width);
void BGRAToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width);
void ABGRToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width);
void RGBAToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width);
void RGB24ToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width);
void RAWToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width);
void BGRAToYRow_MMI(const uint8_t* src_argb, uint8_t* dst_y, int width);
void ABGRToYRow_MMI(const uint8_t* src_argb, uint8_t* dst_y, int width);
void RGBAToYRow_MMI(const uint8_t* src_argb, uint8_t* dst_y, int width);
void RGB24ToYRow_MMI(const uint8_t* src_argb, uint8_t* dst_y, int width);
void RAWToYRow_MMI(const uint8_t* src_argb, uint8_t* dst_y, int width);
void RGB565ToYRow_MMI(const uint8_t* src_rgb565, uint8_t* dst_y, int width);
void ARGB1555ToYRow_MMI(const uint8_t* src_argb1555, uint8_t* dst_y, int width);
void ARGB4444ToYRow_MMI(const uint8_t* src_argb4444, uint8_t* dst_y, int width);
void ARGBToYRow_C(const uint8_t* src_argb0, uint8_t* dst_y, int width);
void ARGBToYJRow_C(const uint8_t* src_argb0, uint8_t* dst_y, int width);
void BGRAToYRow_C(const uint8_t* src_argb0, uint8_t* dst_y, int width);
void ABGRToYRow_C(const uint8_t* src_argb0, uint8_t* dst_y, int width);
void RGBAToYRow_C(const uint8_t* src_argb0, uint8_t* dst_y, int width);
void RGB24ToYRow_C(const uint8_t* src_argb0, uint8_t* dst_y, int width);
void RAWToYRow_C(const uint8_t* src_argb0, uint8_t* dst_y, int width);
void ARGBToYRow_C(const uint8_t* src_argb, uint8_t* dst_y, int width);
void ARGBToYJRow_C(const uint8_t* src_argb, uint8_t* dst_y, int width);
void BGRAToYRow_C(const uint8_t* src_argb, uint8_t* dst_y, int width);
void ABGRToYRow_C(const uint8_t* src_argb, uint8_t* dst_y, int width);
void RGBAToYRow_C(const uint8_t* src_argb, uint8_t* dst_y, int width);
void RGB24ToYRow_C(const uint8_t* src_argb, uint8_t* dst_y, int width);
void RAWToYRow_C(const uint8_t* src_argb, uint8_t* dst_y, int width);
void RGB565ToYRow_C(const uint8_t* src_rgb565, uint8_t* dst_y, int width);
void ARGB1555ToYRow_C(const uint8_t* src_argb1555, uint8_t* dst_y, int width);
void ARGB4444ToYRow_C(const uint8_t* src_argb4444, uint8_t* dst_y, int width);
......@@ -1169,37 +1173,42 @@ void ARGB4444ToYRow_Any_MMI(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int width);
void ARGBToUVRow_AVX2(const uint8_t* src_argb0,
void ARGBToUVRow_AVX2(const uint8_t* src_argb,
int src_stride_argb,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
void ARGBToUVJRow_AVX2(const uint8_t* src_argb0,
void ABGRToUVRow_AVX2(const uint8_t* src_abgr,
int src_stride_abgr,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
void ARGBToUVJRow_AVX2(const uint8_t* src_argb,
int src_stride_argb,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
void ARGBToUVRow_SSSE3(const uint8_t* src_argb0,
void ARGBToUVRow_SSSE3(const uint8_t* src_argb,
int src_stride_argb,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
void ARGBToUVJRow_SSSE3(const uint8_t* src_argb0,
void ARGBToUVJRow_SSSE3(const uint8_t* src_argb,
int src_stride_argb,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
void BGRAToUVRow_SSSE3(const uint8_t* src_bgra0,
void BGRAToUVRow_SSSE3(const uint8_t* src_bgra,
int src_stride_bgra,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
void ABGRToUVRow_SSSE3(const uint8_t* src_abgr0,
void ABGRToUVRow_SSSE3(const uint8_t* src_abgr,
int src_stride_abgr,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
void RGBAToUVRow_SSSE3(const uint8_t* src_rgba0,
void RGBAToUVRow_SSSE3(const uint8_t* src_rgba,
int src_stride_rgba,
uint8_t* dst_u,
uint8_t* dst_v,
......@@ -1209,6 +1218,11 @@ void ARGBToUVRow_Any_AVX2(const uint8_t* src_ptr,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
void ABGRToUVRow_Any_AVX2(const uint8_t* src_ptr,
int src_stride_ptr,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
void ARGBToUVJRow_Any_AVX2(const uint8_t* src_ptr,
int src_stride_ptr,
uint8_t* dst_u,
......@@ -1396,47 +1410,47 @@ void ARGB4444ToUVRow_Any_MMI(const uint8_t* src_ptr,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
void ARGBToUVRow_C(const uint8_t* src_rgb0,
void ARGBToUVRow_C(const uint8_t* src_rgb,
int src_stride_rgb,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
void ARGBToUVJRow_C(const uint8_t* src_rgb0,
void ARGBToUVJRow_C(const uint8_t* src_rgb,
int src_stride_rgb,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
void ARGBToUVRow_C(const uint8_t* src_rgb0,
void ARGBToUVRow_C(const uint8_t* src_rgb,
int src_stride_rgb,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
void ARGBToUVJRow_C(const uint8_t* src_rgb0,
void ARGBToUVJRow_C(const uint8_t* src_rgb,
int src_stride_rgb,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
void BGRAToUVRow_C(const uint8_t* src_rgb0,
void BGRAToUVRow_C(const uint8_t* src_rgb,
int src_stride_rgb,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
void ABGRToUVRow_C(const uint8_t* src_rgb0,
void ABGRToUVRow_C(const uint8_t* src_rgb,
int src_stride_rgb,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
void RGBAToUVRow_C(const uint8_t* src_rgb0,
void RGBAToUVRow_C(const uint8_t* src_rgb,
int src_stride_rgb,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
void RGB24ToUVRow_C(const uint8_t* src_rgb0,
void RGB24ToUVRow_C(const uint8_t* src_rgb,
int src_stride_rgb,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
void RAWToUVRow_C(const uint8_t* src_rgb0,
void RAWToUVRow_C(const uint8_t* src_rgb,
int src_stride_rgb,
uint8_t* dst_u,
uint8_t* dst_v,
......
......@@ -580,6 +580,9 @@ ANY11(RAWToRGB24Row_Any_MMI, RAWToRGB24Row_MMI, 0, 3, 3, 3)
#ifdef HAS_ARGBTOYROW_AVX2
ANY11(ARGBToYRow_Any_AVX2, ARGBToYRow_AVX2, 0, 4, 1, 31)
#endif
#ifdef HAS_ABGRTOYROW_AVX2
ANY11(ABGRToYRow_Any_AVX2, ABGRToYRow_AVX2, 0, 4, 1, 31)
#endif
#ifdef HAS_ARGBTOYJROW_AVX2
ANY11(ARGBToYJRow_Any_AVX2, ARGBToYJRow_AVX2, 0, 4, 1, 31)
#endif
......@@ -1273,6 +1276,9 @@ ANY13(SplitRGBRow_Any_MMI, SplitRGBRow_MMI, 3, 3)
#ifdef HAS_ARGBTOUVROW_AVX2
ANY12S(ARGBToUVRow_Any_AVX2, ARGBToUVRow_AVX2, 0, 4, 31)
#endif
#ifdef HAS_ABGRTOUVROW_AVX2
ANY12S(ABGRToUVRow_Any_AVX2, ABGRToUVRow_AVX2, 0, 4, 31)
#endif
#ifdef HAS_ARGBTOUVJROW_AVX2
ANY12S(ARGBToUVJRow_Any_AVX2, ARGBToUVJRow_AVX2, 0, 4, 31)
#endif
......
......@@ -1154,6 +1154,48 @@ void ARGBToYRow_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width) {
}
#endif // HAS_ARGBTOYROW_AVX2
#ifdef HAS_ABGRTOYROW_AVX2
// Convert 32 ABGR pixels (128 bytes) to 32 Y values.
void ABGRToYRow_AVX2(const uint8_t* src_abgr, uint8_t* dst_y, int width) {
asm volatile(
"vbroadcastf128 %3,%%ymm4 \n"
"vbroadcastf128 %4,%%ymm5 \n"
"vmovdqu %5,%%ymm6 \n"
LABELALIGN
"1: \n"
"vmovdqu (%0),%%ymm0 \n"
"vmovdqu 0x20(%0),%%ymm1 \n"
"vmovdqu 0x40(%0),%%ymm2 \n"
"vmovdqu 0x60(%0),%%ymm3 \n"
"vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n"
"vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n"
"vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n"
"vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n"
"lea 0x80(%0),%0 \n"
"vphaddw %%ymm1,%%ymm0,%%ymm0 \n" // mutates.
"vphaddw %%ymm3,%%ymm2,%%ymm2 \n"
"vpsrlw $0x7,%%ymm0,%%ymm0 \n"
"vpsrlw $0x7,%%ymm2,%%ymm2 \n"
"vpackuswb %%ymm2,%%ymm0,%%ymm0 \n" // mutates.
"vpermd %%ymm0,%%ymm6,%%ymm0 \n" // unmutate.
"vpaddb %%ymm5,%%ymm0,%%ymm0 \n" // add 16 for Y
"vmovdqu %%ymm0,(%1) \n"
"lea 0x20(%1),%1 \n"
"sub $0x20,%2 \n"
"jg 1b \n"
"vzeroupper \n"
: "+r"(src_abgr), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
: "m"(kABGRToY), // %3
"m"(kAddY16), // %4
"m"(kPermdARGBToY_AVX) // %5
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
}
#endif // HAS_ABGRTOYROW_AVX2
#ifdef HAS_ARGBTOYJROW_AVX2
// Convert 32 ARGB pixels (128 bytes) to 32 Y values.
void ARGBToYJRow_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width) {
......@@ -1328,6 +1370,69 @@ void ARGBToUVRow_AVX2(const uint8_t* src_argb0,
}
#endif // HAS_ARGBTOUVROW_AVX2
#ifdef HAS_ABGRTOUVROW_AVX2
void ABGRToUVRow_AVX2(const uint8_t* src_abgr0,
int src_stride_abgr,
uint8_t* dst_u,
uint8_t* dst_v,
int width) {
asm volatile(
"vbroadcastf128 %5,%%ymm5 \n"
"vbroadcastf128 %6,%%ymm6 \n"
"vbroadcastf128 %7,%%ymm7 \n"
"sub %1,%2 \n"
LABELALIGN
"1: \n"
"vmovdqu (%0),%%ymm0 \n"
"vmovdqu 0x20(%0),%%ymm1 \n"
"vmovdqu 0x40(%0),%%ymm2 \n"
"vmovdqu 0x60(%0),%%ymm3 \n"
"vpavgb 0x00(%0,%4,1),%%ymm0,%%ymm0 \n"
"vpavgb 0x20(%0,%4,1),%%ymm1,%%ymm1 \n"
"vpavgb 0x40(%0,%4,1),%%ymm2,%%ymm2 \n"
"vpavgb 0x60(%0,%4,1),%%ymm3,%%ymm3 \n"
"lea 0x80(%0),%0 \n"
"vshufps $0x88,%%ymm1,%%ymm0,%%ymm4 \n"
"vshufps $0xdd,%%ymm1,%%ymm0,%%ymm0 \n"
"vpavgb %%ymm4,%%ymm0,%%ymm0 \n"
"vshufps $0x88,%%ymm3,%%ymm2,%%ymm4 \n"
"vshufps $0xdd,%%ymm3,%%ymm2,%%ymm2 \n"
"vpavgb %%ymm4,%%ymm2,%%ymm2 \n"
"vpmaddubsw %%ymm7,%%ymm0,%%ymm1 \n"
"vpmaddubsw %%ymm7,%%ymm2,%%ymm3 \n"
"vpmaddubsw %%ymm6,%%ymm0,%%ymm0 \n"
"vpmaddubsw %%ymm6,%%ymm2,%%ymm2 \n"
"vphaddw %%ymm3,%%ymm1,%%ymm1 \n"
"vphaddw %%ymm2,%%ymm0,%%ymm0 \n"
"vpsraw $0x8,%%ymm1,%%ymm1 \n"
"vpsraw $0x8,%%ymm0,%%ymm0 \n"
"vpacksswb %%ymm0,%%ymm1,%%ymm0 \n"
"vpermq $0xd8,%%ymm0,%%ymm0 \n"
"vpshufb %8,%%ymm0,%%ymm0 \n"
"vpaddb %%ymm5,%%ymm0,%%ymm0 \n"
"vextractf128 $0x0,%%ymm0,(%1) \n"
"vextractf128 $0x1,%%ymm0,0x0(%1,%2,1) \n"
"lea 0x10(%1),%1 \n"
"sub $0x20,%3 \n"
"jg 1b \n"
"vzeroupper \n"
: "+r"(src_abgr0), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
"+rm"(width) // %3
: "r"((intptr_t)(src_stride_abgr)), // %4
"m"(kAddUV128), // %5
"m"(kABGRToV), // %6
"m"(kABGRToU), // %7
"m"(kShufARGBToUV_AVX) // %8
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
"xmm7");
}
#endif // HAS_ABGRTOUVROW_AVX2
#ifdef HAS_ARGBTOUVJROW_AVX2
void ARGBToUVJRow_AVX2(const uint8_t* src_argb0,
int src_stride_argb,
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment