Commit 0bb2773a authored by Frank Barchard's avatar Frank Barchard Committed by Commit Bot

AVX2 versions of ABGRToNV12 and ABGRToNV21

BUG=libyuv:833

Change-Id: I9b6653e9c304b4e0805b7d3c8408ce57009c8559
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/1740682Reviewed-by: 's avatarHirokazu Honda <hiroh@chromium.org>
Commit-Queue: Frank Barchard <fbarchard@chromium.org>
parent 9b63884a
...@@ -284,6 +284,8 @@ extern "C" { ...@@ -284,6 +284,8 @@ extern "C" {
(defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER))) && \ (defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER))) && \
(defined(CLANG_HAS_AVX2) || defined(GCC_HAS_AVX2)) (defined(CLANG_HAS_AVX2) || defined(GCC_HAS_AVX2))
#define HAS_ABGRTOAR30ROW_AVX2 #define HAS_ABGRTOAR30ROW_AVX2
#define HAS_ABGRTOUVROW_AVX2
#define HAS_ABGRTOYROW_AVX2
#define HAS_ARGBTOAR30ROW_AVX2 #define HAS_ARGBTOAR30ROW_AVX2
#define HAS_ARGBTORAWROW_AVX2 #define HAS_ARGBTORAWROW_AVX2
#define HAS_ARGBTORGB24ROW_AVX2 #define HAS_ARGBTORGB24ROW_AVX2
...@@ -912,6 +914,8 @@ void UYVYToARGBRow_MSA(const uint8_t* src_uyvy, ...@@ -912,6 +914,8 @@ void UYVYToARGBRow_MSA(const uint8_t* src_uyvy,
void ARGBToYRow_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width); void ARGBToYRow_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width);
void ARGBToYRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void ARGBToYRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void ABGRToYRow_AVX2(const uint8_t* src_abgr, uint8_t* dst_y, int width);
void ABGRToYRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void ARGBToYRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_y, int width); void ARGBToYRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_y, int width);
void ARGBToYJRow_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width); void ARGBToYJRow_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width);
void ARGBToYJRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void ARGBToYJRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
...@@ -940,7 +944,7 @@ void ARGBToUV444Row_MSA(const uint8_t* src_argb, ...@@ -940,7 +944,7 @@ void ARGBToUV444Row_MSA(const uint8_t* src_argb,
uint8_t* dst_u, uint8_t* dst_u,
uint8_t* dst_v, uint8_t* dst_v,
int width); int width);
void ARGBToUVRow_MSA(const uint8_t* src_argb0, void ARGBToUVRow_MSA(const uint8_t* src_argb,
int src_stride_argb, int src_stride_argb,
uint8_t* dst_u, uint8_t* dst_u,
uint8_t* dst_v, uint8_t* dst_v,
...@@ -949,7 +953,7 @@ void ARGBToUV444Row_MMI(const uint8_t* src_argb, ...@@ -949,7 +953,7 @@ void ARGBToUV444Row_MMI(const uint8_t* src_argb,
uint8_t* dst_u, uint8_t* dst_u,
uint8_t* dst_v, uint8_t* dst_v,
int width); int width);
void ARGBToUVRow_MMI(const uint8_t* src_argb0, void ARGBToUVRow_MMI(const uint8_t* src_argb,
int src_stride_argb, int src_stride_argb,
uint8_t* dst_u, uint8_t* dst_u,
uint8_t* dst_v, uint8_t* dst_v,
...@@ -999,32 +1003,32 @@ void ARGB4444ToUVRow_NEON(const uint8_t* src_argb4444, ...@@ -999,32 +1003,32 @@ void ARGB4444ToUVRow_NEON(const uint8_t* src_argb4444,
uint8_t* dst_u, uint8_t* dst_u,
uint8_t* dst_v, uint8_t* dst_v,
int width); int width);
void ARGBToUVJRow_MSA(const uint8_t* src_rgb0, void ARGBToUVJRow_MSA(const uint8_t* src_rgb,
int src_stride_rgb, int src_stride_rgb,
uint8_t* dst_u, uint8_t* dst_u,
uint8_t* dst_v, uint8_t* dst_v,
int width); int width);
void BGRAToUVRow_MSA(const uint8_t* src_rgb0, void BGRAToUVRow_MSA(const uint8_t* src_rgb,
int src_stride_rgb, int src_stride_rgb,
uint8_t* dst_u, uint8_t* dst_u,
uint8_t* dst_v, uint8_t* dst_v,
int width); int width);
void ABGRToUVRow_MSA(const uint8_t* src_rgb0, void ABGRToUVRow_MSA(const uint8_t* src_rgb,
int src_stride_rgb, int src_stride_rgb,
uint8_t* dst_u, uint8_t* dst_u,
uint8_t* dst_v, uint8_t* dst_v,
int width); int width);
void RGBAToUVRow_MSA(const uint8_t* src_rgb0, void RGBAToUVRow_MSA(const uint8_t* src_rgb,
int src_stride_rgb, int src_stride_rgb,
uint8_t* dst_u, uint8_t* dst_u,
uint8_t* dst_v, uint8_t* dst_v,
int width); int width);
void RGB24ToUVRow_MSA(const uint8_t* src_rgb0, void RGB24ToUVRow_MSA(const uint8_t* src_rgb,
int src_stride_rgb, int src_stride_rgb,
uint8_t* dst_u, uint8_t* dst_u,
uint8_t* dst_v, uint8_t* dst_v,
int width); int width);
void RAWToUVRow_MSA(const uint8_t* src_rgb0, void RAWToUVRow_MSA(const uint8_t* src_rgb,
int src_stride_rgb, int src_stride_rgb,
uint8_t* dst_u, uint8_t* dst_u,
uint8_t* dst_v, uint8_t* dst_v,
...@@ -1039,32 +1043,32 @@ void ARGB1555ToUVRow_MSA(const uint8_t* src_argb1555, ...@@ -1039,32 +1043,32 @@ void ARGB1555ToUVRow_MSA(const uint8_t* src_argb1555,
uint8_t* dst_u, uint8_t* dst_u,
uint8_t* dst_v, uint8_t* dst_v,
int width); int width);
void ARGBToUVJRow_MMI(const uint8_t* src_rgb0, void ARGBToUVJRow_MMI(const uint8_t* src_rgb,
int src_stride_rgb, int src_stride_rgb,
uint8_t* dst_u, uint8_t* dst_u,
uint8_t* dst_v, uint8_t* dst_v,
int width); int width);
void BGRAToUVRow_MMI(const uint8_t* src_rgb0, void BGRAToUVRow_MMI(const uint8_t* src_rgb,
int src_stride_rgb, int src_stride_rgb,
uint8_t* dst_u, uint8_t* dst_u,
uint8_t* dst_v, uint8_t* dst_v,
int width); int width);
void ABGRToUVRow_MMI(const uint8_t* src_rgb0, void ABGRToUVRow_MMI(const uint8_t* src_rgb,
int src_stride_rgb, int src_stride_rgb,
uint8_t* dst_u, uint8_t* dst_u,
uint8_t* dst_v, uint8_t* dst_v,
int width); int width);
void RGBAToUVRow_MMI(const uint8_t* src_rgb0, void RGBAToUVRow_MMI(const uint8_t* src_rgb,
int src_stride_rgb, int src_stride_rgb,
uint8_t* dst_u, uint8_t* dst_u,
uint8_t* dst_v, uint8_t* dst_v,
int width); int width);
void RGB24ToUVRow_MMI(const uint8_t* src_rgb0, void RGB24ToUVRow_MMI(const uint8_t* src_rgb,
int src_stride_rgb, int src_stride_rgb,
uint8_t* dst_u, uint8_t* dst_u,
uint8_t* dst_v, uint8_t* dst_v,
int width); int width);
void RAWToUVRow_MMI(const uint8_t* src_rgb0, void RAWToUVRow_MMI(const uint8_t* src_rgb,
int src_stride_rgb, int src_stride_rgb,
uint8_t* dst_u, uint8_t* dst_u,
uint8_t* dst_v, uint8_t* dst_v,
...@@ -1096,29 +1100,29 @@ void ARGB1555ToYRow_NEON(const uint8_t* src_argb1555, ...@@ -1096,29 +1100,29 @@ void ARGB1555ToYRow_NEON(const uint8_t* src_argb1555,
void ARGB4444ToYRow_NEON(const uint8_t* src_argb4444, void ARGB4444ToYRow_NEON(const uint8_t* src_argb4444,
uint8_t* dst_y, uint8_t* dst_y,
int width); int width);
void BGRAToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width); void BGRAToYRow_MSA(const uint8_t* src_argb, uint8_t* dst_y, int width);
void ABGRToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width); void ABGRToYRow_MSA(const uint8_t* src_argb, uint8_t* dst_y, int width);
void RGBAToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width); void RGBAToYRow_MSA(const uint8_t* src_argb, uint8_t* dst_y, int width);
void RGB24ToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width); void RGB24ToYRow_MSA(const uint8_t* src_argb, uint8_t* dst_y, int width);
void RAWToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width); void RAWToYRow_MSA(const uint8_t* src_argb, uint8_t* dst_y, int width);
void RGB565ToYRow_MSA(const uint8_t* src_rgb565, uint8_t* dst_y, int width); void RGB565ToYRow_MSA(const uint8_t* src_rgb565, uint8_t* dst_y, int width);
void ARGB1555ToYRow_MSA(const uint8_t* src_argb1555, uint8_t* dst_y, int width); void ARGB1555ToYRow_MSA(const uint8_t* src_argb1555, uint8_t* dst_y, int width);
void BGRAToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width); void BGRAToYRow_MMI(const uint8_t* src_argb, uint8_t* dst_y, int width);
void ABGRToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width); void ABGRToYRow_MMI(const uint8_t* src_argb, uint8_t* dst_y, int width);
void RGBAToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width); void RGBAToYRow_MMI(const uint8_t* src_argb, uint8_t* dst_y, int width);
void RGB24ToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width); void RGB24ToYRow_MMI(const uint8_t* src_argb, uint8_t* dst_y, int width);
void RAWToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width); void RAWToYRow_MMI(const uint8_t* src_argb, uint8_t* dst_y, int width);
void RGB565ToYRow_MMI(const uint8_t* src_rgb565, uint8_t* dst_y, int width); void RGB565ToYRow_MMI(const uint8_t* src_rgb565, uint8_t* dst_y, int width);
void ARGB1555ToYRow_MMI(const uint8_t* src_argb1555, uint8_t* dst_y, int width); void ARGB1555ToYRow_MMI(const uint8_t* src_argb1555, uint8_t* dst_y, int width);
void ARGB4444ToYRow_MMI(const uint8_t* src_argb4444, uint8_t* dst_y, int width); void ARGB4444ToYRow_MMI(const uint8_t* src_argb4444, uint8_t* dst_y, int width);
void ARGBToYRow_C(const uint8_t* src_argb0, uint8_t* dst_y, int width); void ARGBToYRow_C(const uint8_t* src_argb, uint8_t* dst_y, int width);
void ARGBToYJRow_C(const uint8_t* src_argb0, uint8_t* dst_y, int width); void ARGBToYJRow_C(const uint8_t* src_argb, uint8_t* dst_y, int width);
void BGRAToYRow_C(const uint8_t* src_argb0, uint8_t* dst_y, int width); void BGRAToYRow_C(const uint8_t* src_argb, uint8_t* dst_y, int width);
void ABGRToYRow_C(const uint8_t* src_argb0, uint8_t* dst_y, int width); void ABGRToYRow_C(const uint8_t* src_argb, uint8_t* dst_y, int width);
void RGBAToYRow_C(const uint8_t* src_argb0, uint8_t* dst_y, int width); void RGBAToYRow_C(const uint8_t* src_argb, uint8_t* dst_y, int width);
void RGB24ToYRow_C(const uint8_t* src_argb0, uint8_t* dst_y, int width); void RGB24ToYRow_C(const uint8_t* src_argb, uint8_t* dst_y, int width);
void RAWToYRow_C(const uint8_t* src_argb0, uint8_t* dst_y, int width); void RAWToYRow_C(const uint8_t* src_argb, uint8_t* dst_y, int width);
void RGB565ToYRow_C(const uint8_t* src_rgb565, uint8_t* dst_y, int width); void RGB565ToYRow_C(const uint8_t* src_rgb565, uint8_t* dst_y, int width);
void ARGB1555ToYRow_C(const uint8_t* src_argb1555, uint8_t* dst_y, int width); void ARGB1555ToYRow_C(const uint8_t* src_argb1555, uint8_t* dst_y, int width);
void ARGB4444ToYRow_C(const uint8_t* src_argb4444, uint8_t* dst_y, int width); void ARGB4444ToYRow_C(const uint8_t* src_argb4444, uint8_t* dst_y, int width);
...@@ -1169,37 +1173,42 @@ void ARGB4444ToYRow_Any_MMI(const uint8_t* src_ptr, ...@@ -1169,37 +1173,42 @@ void ARGB4444ToYRow_Any_MMI(const uint8_t* src_ptr,
uint8_t* dst_ptr, uint8_t* dst_ptr,
int width); int width);
void ARGBToUVRow_AVX2(const uint8_t* src_argb0, void ARGBToUVRow_AVX2(const uint8_t* src_argb,
int src_stride_argb, int src_stride_argb,
uint8_t* dst_u, uint8_t* dst_u,
uint8_t* dst_v, uint8_t* dst_v,
int width); int width);
void ARGBToUVJRow_AVX2(const uint8_t* src_argb0, void ABGRToUVRow_AVX2(const uint8_t* src_abgr,
int src_stride_abgr,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
void ARGBToUVJRow_AVX2(const uint8_t* src_argb,
int src_stride_argb, int src_stride_argb,
uint8_t* dst_u, uint8_t* dst_u,
uint8_t* dst_v, uint8_t* dst_v,
int width); int width);
void ARGBToUVRow_SSSE3(const uint8_t* src_argb0, void ARGBToUVRow_SSSE3(const uint8_t* src_argb,
int src_stride_argb, int src_stride_argb,
uint8_t* dst_u, uint8_t* dst_u,
uint8_t* dst_v, uint8_t* dst_v,
int width); int width);
void ARGBToUVJRow_SSSE3(const uint8_t* src_argb0, void ARGBToUVJRow_SSSE3(const uint8_t* src_argb,
int src_stride_argb, int src_stride_argb,
uint8_t* dst_u, uint8_t* dst_u,
uint8_t* dst_v, uint8_t* dst_v,
int width); int width);
void BGRAToUVRow_SSSE3(const uint8_t* src_bgra0, void BGRAToUVRow_SSSE3(const uint8_t* src_bgra,
int src_stride_bgra, int src_stride_bgra,
uint8_t* dst_u, uint8_t* dst_u,
uint8_t* dst_v, uint8_t* dst_v,
int width); int width);
void ABGRToUVRow_SSSE3(const uint8_t* src_abgr0, void ABGRToUVRow_SSSE3(const uint8_t* src_abgr,
int src_stride_abgr, int src_stride_abgr,
uint8_t* dst_u, uint8_t* dst_u,
uint8_t* dst_v, uint8_t* dst_v,
int width); int width);
void RGBAToUVRow_SSSE3(const uint8_t* src_rgba0, void RGBAToUVRow_SSSE3(const uint8_t* src_rgba,
int src_stride_rgba, int src_stride_rgba,
uint8_t* dst_u, uint8_t* dst_u,
uint8_t* dst_v, uint8_t* dst_v,
...@@ -1209,6 +1218,11 @@ void ARGBToUVRow_Any_AVX2(const uint8_t* src_ptr, ...@@ -1209,6 +1218,11 @@ void ARGBToUVRow_Any_AVX2(const uint8_t* src_ptr,
uint8_t* dst_u, uint8_t* dst_u,
uint8_t* dst_v, uint8_t* dst_v,
int width); int width);
void ABGRToUVRow_Any_AVX2(const uint8_t* src_ptr,
int src_stride_ptr,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
void ARGBToUVJRow_Any_AVX2(const uint8_t* src_ptr, void ARGBToUVJRow_Any_AVX2(const uint8_t* src_ptr,
int src_stride_ptr, int src_stride_ptr,
uint8_t* dst_u, uint8_t* dst_u,
...@@ -1396,47 +1410,47 @@ void ARGB4444ToUVRow_Any_MMI(const uint8_t* src_ptr, ...@@ -1396,47 +1410,47 @@ void ARGB4444ToUVRow_Any_MMI(const uint8_t* src_ptr,
uint8_t* dst_u, uint8_t* dst_u,
uint8_t* dst_v, uint8_t* dst_v,
int width); int width);
void ARGBToUVRow_C(const uint8_t* src_rgb0, void ARGBToUVRow_C(const uint8_t* src_rgb,
int src_stride_rgb, int src_stride_rgb,
uint8_t* dst_u, uint8_t* dst_u,
uint8_t* dst_v, uint8_t* dst_v,
int width); int width);
void ARGBToUVJRow_C(const uint8_t* src_rgb0, void ARGBToUVJRow_C(const uint8_t* src_rgb,
int src_stride_rgb, int src_stride_rgb,
uint8_t* dst_u, uint8_t* dst_u,
uint8_t* dst_v, uint8_t* dst_v,
int width); int width);
void ARGBToUVRow_C(const uint8_t* src_rgb0, void ARGBToUVRow_C(const uint8_t* src_rgb,
int src_stride_rgb, int src_stride_rgb,
uint8_t* dst_u, uint8_t* dst_u,
uint8_t* dst_v, uint8_t* dst_v,
int width); int width);
void ARGBToUVJRow_C(const uint8_t* src_rgb0, void ARGBToUVJRow_C(const uint8_t* src_rgb,
int src_stride_rgb, int src_stride_rgb,
uint8_t* dst_u, uint8_t* dst_u,
uint8_t* dst_v, uint8_t* dst_v,
int width); int width);
void BGRAToUVRow_C(const uint8_t* src_rgb0, void BGRAToUVRow_C(const uint8_t* src_rgb,
int src_stride_rgb, int src_stride_rgb,
uint8_t* dst_u, uint8_t* dst_u,
uint8_t* dst_v, uint8_t* dst_v,
int width); int width);
void ABGRToUVRow_C(const uint8_t* src_rgb0, void ABGRToUVRow_C(const uint8_t* src_rgb,
int src_stride_rgb, int src_stride_rgb,
uint8_t* dst_u, uint8_t* dst_u,
uint8_t* dst_v, uint8_t* dst_v,
int width); int width);
void RGBAToUVRow_C(const uint8_t* src_rgb0, void RGBAToUVRow_C(const uint8_t* src_rgb,
int src_stride_rgb, int src_stride_rgb,
uint8_t* dst_u, uint8_t* dst_u,
uint8_t* dst_v, uint8_t* dst_v,
int width); int width);
void RGB24ToUVRow_C(const uint8_t* src_rgb0, void RGB24ToUVRow_C(const uint8_t* src_rgb,
int src_stride_rgb, int src_stride_rgb,
uint8_t* dst_u, uint8_t* dst_u,
uint8_t* dst_v, uint8_t* dst_v,
int width); int width);
void RAWToUVRow_C(const uint8_t* src_rgb0, void RAWToUVRow_C(const uint8_t* src_rgb,
int src_stride_rgb, int src_stride_rgb,
uint8_t* dst_u, uint8_t* dst_u,
uint8_t* dst_v, uint8_t* dst_v,
......
...@@ -580,6 +580,9 @@ ANY11(RAWToRGB24Row_Any_MMI, RAWToRGB24Row_MMI, 0, 3, 3, 3) ...@@ -580,6 +580,9 @@ ANY11(RAWToRGB24Row_Any_MMI, RAWToRGB24Row_MMI, 0, 3, 3, 3)
#ifdef HAS_ARGBTOYROW_AVX2 #ifdef HAS_ARGBTOYROW_AVX2
ANY11(ARGBToYRow_Any_AVX2, ARGBToYRow_AVX2, 0, 4, 1, 31) ANY11(ARGBToYRow_Any_AVX2, ARGBToYRow_AVX2, 0, 4, 1, 31)
#endif #endif
#ifdef HAS_ABGRTOYROW_AVX2
ANY11(ABGRToYRow_Any_AVX2, ABGRToYRow_AVX2, 0, 4, 1, 31)
#endif
#ifdef HAS_ARGBTOYJROW_AVX2 #ifdef HAS_ARGBTOYJROW_AVX2
ANY11(ARGBToYJRow_Any_AVX2, ARGBToYJRow_AVX2, 0, 4, 1, 31) ANY11(ARGBToYJRow_Any_AVX2, ARGBToYJRow_AVX2, 0, 4, 1, 31)
#endif #endif
...@@ -1273,6 +1276,9 @@ ANY13(SplitRGBRow_Any_MMI, SplitRGBRow_MMI, 3, 3) ...@@ -1273,6 +1276,9 @@ ANY13(SplitRGBRow_Any_MMI, SplitRGBRow_MMI, 3, 3)
#ifdef HAS_ARGBTOUVROW_AVX2 #ifdef HAS_ARGBTOUVROW_AVX2
ANY12S(ARGBToUVRow_Any_AVX2, ARGBToUVRow_AVX2, 0, 4, 31) ANY12S(ARGBToUVRow_Any_AVX2, ARGBToUVRow_AVX2, 0, 4, 31)
#endif #endif
#ifdef HAS_ABGRTOUVROW_AVX2
ANY12S(ABGRToUVRow_Any_AVX2, ABGRToUVRow_AVX2, 0, 4, 31)
#endif
#ifdef HAS_ARGBTOUVJROW_AVX2 #ifdef HAS_ARGBTOUVJROW_AVX2
ANY12S(ARGBToUVJRow_Any_AVX2, ARGBToUVJRow_AVX2, 0, 4, 31) ANY12S(ARGBToUVJRow_Any_AVX2, ARGBToUVJRow_AVX2, 0, 4, 31)
#endif #endif
......
...@@ -1154,6 +1154,48 @@ void ARGBToYRow_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width) { ...@@ -1154,6 +1154,48 @@ void ARGBToYRow_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width) {
} }
#endif // HAS_ARGBTOYROW_AVX2 #endif // HAS_ARGBTOYROW_AVX2
#ifdef HAS_ABGRTOYROW_AVX2
// Convert 32 ABGR pixels (128 bytes) to 32 Y values.
void ABGRToYRow_AVX2(const uint8_t* src_abgr, uint8_t* dst_y, int width) {
asm volatile(
"vbroadcastf128 %3,%%ymm4 \n"
"vbroadcastf128 %4,%%ymm5 \n"
"vmovdqu %5,%%ymm6 \n"
LABELALIGN
"1: \n"
"vmovdqu (%0),%%ymm0 \n"
"vmovdqu 0x20(%0),%%ymm1 \n"
"vmovdqu 0x40(%0),%%ymm2 \n"
"vmovdqu 0x60(%0),%%ymm3 \n"
"vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n"
"vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n"
"vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n"
"vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n"
"lea 0x80(%0),%0 \n"
"vphaddw %%ymm1,%%ymm0,%%ymm0 \n" // mutates.
"vphaddw %%ymm3,%%ymm2,%%ymm2 \n"
"vpsrlw $0x7,%%ymm0,%%ymm0 \n"
"vpsrlw $0x7,%%ymm2,%%ymm2 \n"
"vpackuswb %%ymm2,%%ymm0,%%ymm0 \n" // mutates.
"vpermd %%ymm0,%%ymm6,%%ymm0 \n" // unmutate.
"vpaddb %%ymm5,%%ymm0,%%ymm0 \n" // add 16 for Y
"vmovdqu %%ymm0,(%1) \n"
"lea 0x20(%1),%1 \n"
"sub $0x20,%2 \n"
"jg 1b \n"
"vzeroupper \n"
: "+r"(src_abgr), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
: "m"(kABGRToY), // %3
"m"(kAddY16), // %4
"m"(kPermdARGBToY_AVX) // %5
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
}
#endif // HAS_ABGRTOYROW_AVX2
#ifdef HAS_ARGBTOYJROW_AVX2 #ifdef HAS_ARGBTOYJROW_AVX2
// Convert 32 ARGB pixels (128 bytes) to 32 Y values. // Convert 32 ARGB pixels (128 bytes) to 32 Y values.
void ARGBToYJRow_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width) { void ARGBToYJRow_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width) {
...@@ -1328,6 +1370,69 @@ void ARGBToUVRow_AVX2(const uint8_t* src_argb0, ...@@ -1328,6 +1370,69 @@ void ARGBToUVRow_AVX2(const uint8_t* src_argb0,
} }
#endif // HAS_ARGBTOUVROW_AVX2 #endif // HAS_ARGBTOUVROW_AVX2
#ifdef HAS_ABGRTOUVROW_AVX2
void ABGRToUVRow_AVX2(const uint8_t* src_abgr0,
int src_stride_abgr,
uint8_t* dst_u,
uint8_t* dst_v,
int width) {
asm volatile(
"vbroadcastf128 %5,%%ymm5 \n"
"vbroadcastf128 %6,%%ymm6 \n"
"vbroadcastf128 %7,%%ymm7 \n"
"sub %1,%2 \n"
LABELALIGN
"1: \n"
"vmovdqu (%0),%%ymm0 \n"
"vmovdqu 0x20(%0),%%ymm1 \n"
"vmovdqu 0x40(%0),%%ymm2 \n"
"vmovdqu 0x60(%0),%%ymm3 \n"
"vpavgb 0x00(%0,%4,1),%%ymm0,%%ymm0 \n"
"vpavgb 0x20(%0,%4,1),%%ymm1,%%ymm1 \n"
"vpavgb 0x40(%0,%4,1),%%ymm2,%%ymm2 \n"
"vpavgb 0x60(%0,%4,1),%%ymm3,%%ymm3 \n"
"lea 0x80(%0),%0 \n"
"vshufps $0x88,%%ymm1,%%ymm0,%%ymm4 \n"
"vshufps $0xdd,%%ymm1,%%ymm0,%%ymm0 \n"
"vpavgb %%ymm4,%%ymm0,%%ymm0 \n"
"vshufps $0x88,%%ymm3,%%ymm2,%%ymm4 \n"
"vshufps $0xdd,%%ymm3,%%ymm2,%%ymm2 \n"
"vpavgb %%ymm4,%%ymm2,%%ymm2 \n"
"vpmaddubsw %%ymm7,%%ymm0,%%ymm1 \n"
"vpmaddubsw %%ymm7,%%ymm2,%%ymm3 \n"
"vpmaddubsw %%ymm6,%%ymm0,%%ymm0 \n"
"vpmaddubsw %%ymm6,%%ymm2,%%ymm2 \n"
"vphaddw %%ymm3,%%ymm1,%%ymm1 \n"
"vphaddw %%ymm2,%%ymm0,%%ymm0 \n"
"vpsraw $0x8,%%ymm1,%%ymm1 \n"
"vpsraw $0x8,%%ymm0,%%ymm0 \n"
"vpacksswb %%ymm0,%%ymm1,%%ymm0 \n"
"vpermq $0xd8,%%ymm0,%%ymm0 \n"
"vpshufb %8,%%ymm0,%%ymm0 \n"
"vpaddb %%ymm5,%%ymm0,%%ymm0 \n"
"vextractf128 $0x0,%%ymm0,(%1) \n"
"vextractf128 $0x1,%%ymm0,0x0(%1,%2,1) \n"
"lea 0x10(%1),%1 \n"
"sub $0x20,%3 \n"
"jg 1b \n"
"vzeroupper \n"
: "+r"(src_abgr0), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
"+rm"(width) // %3
: "r"((intptr_t)(src_stride_abgr)), // %4
"m"(kAddUV128), // %5
"m"(kABGRToV), // %6
"m"(kABGRToU), // %7
"m"(kShufARGBToUV_AVX) // %8
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
"xmm7");
}
#endif // HAS_ABGRTOUVROW_AVX2
#ifdef HAS_ARGBTOUVJROW_AVX2 #ifdef HAS_ARGBTOUVJROW_AVX2
void ARGBToUVJRow_AVX2(const uint8_t* src_argb0, void ARGBToUVJRow_AVX2(const uint8_t* src_argb0,
int src_stride_argb, int src_stride_argb,
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment