Commit 0bb2773a authored by Frank Barchard's avatar Frank Barchard Committed by Commit Bot

AVX2 versions of ABGRToNV12 and ABGRToNV21

BUG=libyuv:833

Change-Id: I9b6653e9c304b4e0805b7d3c8408ce57009c8559
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/1740682Reviewed-by: 's avatarHirokazu Honda <hiroh@chromium.org>
Commit-Queue: Frank Barchard <fbarchard@chromium.org>
parent 9b63884a
This diff is collapsed.
...@@ -580,6 +580,9 @@ ANY11(RAWToRGB24Row_Any_MMI, RAWToRGB24Row_MMI, 0, 3, 3, 3) ...@@ -580,6 +580,9 @@ ANY11(RAWToRGB24Row_Any_MMI, RAWToRGB24Row_MMI, 0, 3, 3, 3)
#ifdef HAS_ARGBTOYROW_AVX2 #ifdef HAS_ARGBTOYROW_AVX2
ANY11(ARGBToYRow_Any_AVX2, ARGBToYRow_AVX2, 0, 4, 1, 31) ANY11(ARGBToYRow_Any_AVX2, ARGBToYRow_AVX2, 0, 4, 1, 31)
#endif #endif
#ifdef HAS_ABGRTOYROW_AVX2
ANY11(ABGRToYRow_Any_AVX2, ABGRToYRow_AVX2, 0, 4, 1, 31)
#endif
#ifdef HAS_ARGBTOYJROW_AVX2 #ifdef HAS_ARGBTOYJROW_AVX2
ANY11(ARGBToYJRow_Any_AVX2, ARGBToYJRow_AVX2, 0, 4, 1, 31) ANY11(ARGBToYJRow_Any_AVX2, ARGBToYJRow_AVX2, 0, 4, 1, 31)
#endif #endif
...@@ -1273,6 +1276,9 @@ ANY13(SplitRGBRow_Any_MMI, SplitRGBRow_MMI, 3, 3) ...@@ -1273,6 +1276,9 @@ ANY13(SplitRGBRow_Any_MMI, SplitRGBRow_MMI, 3, 3)
#ifdef HAS_ARGBTOUVROW_AVX2 #ifdef HAS_ARGBTOUVROW_AVX2
ANY12S(ARGBToUVRow_Any_AVX2, ARGBToUVRow_AVX2, 0, 4, 31) ANY12S(ARGBToUVRow_Any_AVX2, ARGBToUVRow_AVX2, 0, 4, 31)
#endif #endif
#ifdef HAS_ABGRTOUVROW_AVX2
ANY12S(ABGRToUVRow_Any_AVX2, ABGRToUVRow_AVX2, 0, 4, 31)
#endif
#ifdef HAS_ARGBTOUVJROW_AVX2 #ifdef HAS_ARGBTOUVJROW_AVX2
ANY12S(ARGBToUVJRow_Any_AVX2, ARGBToUVJRow_AVX2, 0, 4, 31) ANY12S(ARGBToUVJRow_Any_AVX2, ARGBToUVJRow_AVX2, 0, 4, 31)
#endif #endif
......
...@@ -1154,6 +1154,48 @@ void ARGBToYRow_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width) { ...@@ -1154,6 +1154,48 @@ void ARGBToYRow_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width) {
} }
#endif // HAS_ARGBTOYROW_AVX2 #endif // HAS_ARGBTOYROW_AVX2
#ifdef HAS_ABGRTOYROW_AVX2
// Convert 32 ABGR pixels (128 bytes) to 32 Y values.
void ABGRToYRow_AVX2(const uint8_t* src_abgr, uint8_t* dst_y, int width) {
asm volatile(
"vbroadcastf128 %3,%%ymm4 \n"
"vbroadcastf128 %4,%%ymm5 \n"
"vmovdqu %5,%%ymm6 \n"
LABELALIGN
"1: \n"
"vmovdqu (%0),%%ymm0 \n"
"vmovdqu 0x20(%0),%%ymm1 \n"
"vmovdqu 0x40(%0),%%ymm2 \n"
"vmovdqu 0x60(%0),%%ymm3 \n"
"vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n"
"vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n"
"vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n"
"vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n"
"lea 0x80(%0),%0 \n"
"vphaddw %%ymm1,%%ymm0,%%ymm0 \n" // mutates.
"vphaddw %%ymm3,%%ymm2,%%ymm2 \n"
"vpsrlw $0x7,%%ymm0,%%ymm0 \n"
"vpsrlw $0x7,%%ymm2,%%ymm2 \n"
"vpackuswb %%ymm2,%%ymm0,%%ymm0 \n" // mutates.
"vpermd %%ymm0,%%ymm6,%%ymm0 \n" // unmutate.
"vpaddb %%ymm5,%%ymm0,%%ymm0 \n" // add 16 for Y
"vmovdqu %%ymm0,(%1) \n"
"lea 0x20(%1),%1 \n"
"sub $0x20,%2 \n"
"jg 1b \n"
"vzeroupper \n"
: "+r"(src_abgr), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
: "m"(kABGRToY), // %3
"m"(kAddY16), // %4
"m"(kPermdARGBToY_AVX) // %5
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
}
#endif // HAS_ABGRTOYROW_AVX2
#ifdef HAS_ARGBTOYJROW_AVX2 #ifdef HAS_ARGBTOYJROW_AVX2
// Convert 32 ARGB pixels (128 bytes) to 32 Y values. // Convert 32 ARGB pixels (128 bytes) to 32 Y values.
void ARGBToYJRow_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width) { void ARGBToYJRow_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width) {
...@@ -1328,6 +1370,69 @@ void ARGBToUVRow_AVX2(const uint8_t* src_argb0, ...@@ -1328,6 +1370,69 @@ void ARGBToUVRow_AVX2(const uint8_t* src_argb0,
} }
#endif // HAS_ARGBTOUVROW_AVX2 #endif // HAS_ARGBTOUVROW_AVX2
#ifdef HAS_ABGRTOUVROW_AVX2
void ABGRToUVRow_AVX2(const uint8_t* src_abgr0,
int src_stride_abgr,
uint8_t* dst_u,
uint8_t* dst_v,
int width) {
asm volatile(
"vbroadcastf128 %5,%%ymm5 \n"
"vbroadcastf128 %6,%%ymm6 \n"
"vbroadcastf128 %7,%%ymm7 \n"
"sub %1,%2 \n"
LABELALIGN
"1: \n"
"vmovdqu (%0),%%ymm0 \n"
"vmovdqu 0x20(%0),%%ymm1 \n"
"vmovdqu 0x40(%0),%%ymm2 \n"
"vmovdqu 0x60(%0),%%ymm3 \n"
"vpavgb 0x00(%0,%4,1),%%ymm0,%%ymm0 \n"
"vpavgb 0x20(%0,%4,1),%%ymm1,%%ymm1 \n"
"vpavgb 0x40(%0,%4,1),%%ymm2,%%ymm2 \n"
"vpavgb 0x60(%0,%4,1),%%ymm3,%%ymm3 \n"
"lea 0x80(%0),%0 \n"
"vshufps $0x88,%%ymm1,%%ymm0,%%ymm4 \n"
"vshufps $0xdd,%%ymm1,%%ymm0,%%ymm0 \n"
"vpavgb %%ymm4,%%ymm0,%%ymm0 \n"
"vshufps $0x88,%%ymm3,%%ymm2,%%ymm4 \n"
"vshufps $0xdd,%%ymm3,%%ymm2,%%ymm2 \n"
"vpavgb %%ymm4,%%ymm2,%%ymm2 \n"
"vpmaddubsw %%ymm7,%%ymm0,%%ymm1 \n"
"vpmaddubsw %%ymm7,%%ymm2,%%ymm3 \n"
"vpmaddubsw %%ymm6,%%ymm0,%%ymm0 \n"
"vpmaddubsw %%ymm6,%%ymm2,%%ymm2 \n"
"vphaddw %%ymm3,%%ymm1,%%ymm1 \n"
"vphaddw %%ymm2,%%ymm0,%%ymm0 \n"
"vpsraw $0x8,%%ymm1,%%ymm1 \n"
"vpsraw $0x8,%%ymm0,%%ymm0 \n"
"vpacksswb %%ymm0,%%ymm1,%%ymm0 \n"
"vpermq $0xd8,%%ymm0,%%ymm0 \n"
"vpshufb %8,%%ymm0,%%ymm0 \n"
"vpaddb %%ymm5,%%ymm0,%%ymm0 \n"
"vextractf128 $0x0,%%ymm0,(%1) \n"
"vextractf128 $0x1,%%ymm0,0x0(%1,%2,1) \n"
"lea 0x10(%1),%1 \n"
"sub $0x20,%3 \n"
"jg 1b \n"
"vzeroupper \n"
: "+r"(src_abgr0), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
"+rm"(width) // %3
: "r"((intptr_t)(src_stride_abgr)), // %4
"m"(kAddUV128), // %5
"m"(kABGRToV), // %6
"m"(kABGRToU), // %7
"m"(kShufARGBToUV_AVX) // %8
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
"xmm7");
}
#endif // HAS_ABGRTOUVROW_AVX2
#ifdef HAS_ARGBTOUVJROW_AVX2 #ifdef HAS_ARGBTOUVJROW_AVX2
void ARGBToUVJRow_AVX2(const uint8_t* src_argb0, void ARGBToUVJRow_AVX2(const uint8_t* src_argb0,
int src_stride_argb, int src_stride_argb,
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment