Commit fec9121b authored by Frank Barchard's avatar Frank Barchard Committed by Commit Bot

SwapUV AVX2 and SSSE3

Based on ARGBShuffle but with count adjusted and new shuffle mask

BUG=libyuv:809

Change-Id: Idd936ee6bedcf285607a68c2fc54d876b4becc01
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/1711882Reviewed-by: 's avatarrichard winterton <rrwinterton@gmail.com>
Commit-Queue: Frank Barchard <fbarchard@chromium.org>
parent 22ae4bfa
Name: libyuv Name: libyuv
URL: http://code.google.com/p/libyuv/ URL: http://code.google.com/p/libyuv/
Version: 1733 Version: 1734
License: BSD License: BSD
License File: LICENSE License File: LICENSE
......
...@@ -275,6 +275,7 @@ extern "C" { ...@@ -275,6 +275,7 @@ extern "C" {
#define HAS_I422TOAR30ROW_SSSE3 #define HAS_I422TOAR30ROW_SSSE3
#define HAS_MERGERGBROW_SSSE3 #define HAS_MERGERGBROW_SSSE3
#define HAS_SPLITRGBROW_SSSE3 #define HAS_SPLITRGBROW_SSSE3
#define HAS_SWAPUVROW_SSSE3
#endif #endif
// The following are available for AVX2 gcc/clang x86 platforms: // The following are available for AVX2 gcc/clang x86 platforms:
...@@ -295,6 +296,7 @@ extern "C" { ...@@ -295,6 +296,7 @@ extern "C" {
#define HAS_I422TOYUY2ROW_AVX2 #define HAS_I422TOYUY2ROW_AVX2
#define HAS_MERGEUVROW_16_AVX2 #define HAS_MERGEUVROW_16_AVX2
#define HAS_MULTIPLYROW_16_AVX2 #define HAS_MULTIPLYROW_16_AVX2
#define HAS_SWAPUVROW_AVX2
// TODO(fbarchard): Fix AVX2 version of YUV24 // TODO(fbarchard): Fix AVX2 version of YUV24
// #define HAS_NV21TOYUV24ROW_AVX2 // #define HAS_NV21TOYUV24ROW_AVX2
#endif #endif
...@@ -3374,6 +3376,10 @@ void UYVYToUV422Row_Any_MMI(const uint8_t* src_ptr, ...@@ -3374,6 +3376,10 @@ void UYVYToUV422Row_Any_MMI(const uint8_t* src_ptr,
void SwapUVRow_C(const uint8_t* src_uv, uint8_t* dst_vu, int width); void SwapUVRow_C(const uint8_t* src_uv, uint8_t* dst_vu, int width);
void SwapUVRow_NEON(const uint8_t* src_uv, uint8_t* dst_vu, int width); void SwapUVRow_NEON(const uint8_t* src_uv, uint8_t* dst_vu, int width);
void SwapUVRow_Any_NEON(const uint8_t* src_uv, uint8_t* dst_vu, int width); void SwapUVRow_Any_NEON(const uint8_t* src_uv, uint8_t* dst_vu, int width);
void SwapUVRow_SSSE3(const uint8_t* src_uv, uint8_t* dst_vu, int width);
void SwapUVRow_Any_SSSE3(const uint8_t* src_uv, uint8_t* dst_vu, int width);
void SwapUVRow_AVX2(const uint8_t* src_uv, uint8_t* dst_vu, int width);
void SwapUVRow_Any_AVX2(const uint8_t* src_uv, uint8_t* dst_vu, int width);
void AYUVToYRow_C(const uint8_t* src_ayuv, uint8_t* dst_y, int width); void AYUVToYRow_C(const uint8_t* src_ayuv, uint8_t* dst_y, int width);
void AYUVToUVRow_C(const uint8_t* src_ayuv, void AYUVToUVRow_C(const uint8_t* src_ayuv,
int stride_ayuv, int stride_ayuv,
......
...@@ -11,6 +11,6 @@ ...@@ -11,6 +11,6 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_ #ifndef INCLUDE_LIBYUV_VERSION_H_
#define INCLUDE_LIBYUV_VERSION_H_ #define INCLUDE_LIBYUV_VERSION_H_
#define LIBYUV_VERSION 1733 #define LIBYUV_VERSION 1734
#endif // INCLUDE_LIBYUV_VERSION_H_ #endif // INCLUDE_LIBYUV_VERSION_H_
...@@ -527,6 +527,22 @@ void SwapUVPlane(const uint8_t* src_uv, ...@@ -527,6 +527,22 @@ void SwapUVPlane(const uint8_t* src_uv,
src_stride_uv = dst_stride_vu = 0; src_stride_uv = dst_stride_vu = 0;
} }
#if defined(HAS_SWAPUVROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
SwapUVRow = SwapUVRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
SwapUVRow = SwapUVRow_SSSE3;
}
}
#endif
#if defined(HAS_SWAPUVROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
SwapUVRow = SwapUVRow_Any_AVX2;
if (IS_ALIGNED(width, 32)) {
SwapUVRow = SwapUVRow_AVX2;
}
}
#endif
#if defined(HAS_SWAPUVROW_NEON) #if defined(HAS_SWAPUVROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) { if (TestCpuFlag(kCpuHasNEON)) {
SwapUVRow = SwapUVRow_Any_NEON; SwapUVRow = SwapUVRow_Any_NEON;
......
...@@ -710,6 +710,12 @@ ANY11(UYVYToYRow_Any_MMI, UYVYToYRow_MMI, 1, 4, 1, 15) ...@@ -710,6 +710,12 @@ ANY11(UYVYToYRow_Any_MMI, UYVYToYRow_MMI, 1, 4, 1, 15)
#ifdef HAS_AYUVTOYROW_NEON #ifdef HAS_AYUVTOYROW_NEON
ANY11(AYUVToYRow_Any_NEON, AYUVToYRow_NEON, 0, 4, 1, 15) ANY11(AYUVToYRow_Any_NEON, AYUVToYRow_NEON, 0, 4, 1, 15)
#endif #endif
#ifdef HAS_SWAPUVROW_SSSE3
ANY11(SwapUVRow_Any_SSSE3, SwapUVRow_SSSE3, 0, 2, 2, 15)
#endif
#ifdef HAS_SWAPUVROW_AVX2
ANY11(SwapUVRow_Any_AVX2, SwapUVRow_AVX2, 0, 2, 2, 31)
#endif
#ifdef HAS_SWAPUVROW_NEON #ifdef HAS_SWAPUVROW_NEON
ANY11(SwapUVRow_Any_NEON, SwapUVRow_NEON, 0, 2, 2, 15) ANY11(SwapUVRow_Any_NEON, SwapUVRow_NEON, 0, 2, 2, 15)
#endif #endif
......
...@@ -6790,6 +6790,68 @@ void NV21ToYUV24Row_AVX2(const uint8_t* src_y, ...@@ -6790,6 +6790,68 @@ void NV21ToYUV24Row_AVX2(const uint8_t* src_y,
} }
#endif // HAS_NV21TOYUV24ROW_AVX2 #endif // HAS_NV21TOYUV24ROW_AVX2
#ifdef HAS_SWAPUVROW_SSSE3
// Shuffle table for reversing the bytes.
static const uvec8 kShuffleUVToVU = {1u, 0u, 3u, 2u, 5u, 4u, 7u, 6u,
9u, 8u, 11u, 10u, 13u, 12u, 15u, 14u};
void SwapUVRow_SSSE3(const uint8_t* src_uv,
uint8_t* dst_vu,
int width) {
asm volatile(
"movdqu %3,%%xmm5 \n"
LABELALIGN
"1: \n"
"movdqu (%0),%%xmm0 \n"
"movdqu 0x10(%0),%%xmm1 \n"
"lea 0x20(%0),%0 \n"
"pshufb %%xmm5,%%xmm0 \n"
"pshufb %%xmm5,%%xmm1 \n"
"movdqu %%xmm0,(%1) \n"
"movdqu %%xmm1,0x10(%1) \n"
"lea 0x20(%1),%1 \n"
"sub $0x10,%2 \n"
"jg 1b \n"
: "+r"(src_uv), // %0
"+r"(dst_vu), // %1
"+r"(width) // %2
: "m"(kShuffleUVToVU) // %3
: "memory", "cc", "xmm0", "xmm1", "xmm5");
}
#endif // HAS_SWAPUVROW_SSSE3
#ifdef HAS_SWAPUVROW_AVX2
void SwapUVRow_AVX2(const uint8_t* src_uv,
uint8_t* dst_vu,
int width) {
asm volatile(
"vbroadcastf128 %3,%%ymm5 \n"
LABELALIGN
"1: \n"
"vmovdqu (%0),%%ymm0 \n"
"vmovdqu 0x20(%0),%%ymm1 \n"
"lea 0x40(%0),%0 \n"
"vpshufb %%ymm5,%%ymm0,%%ymm0 \n"
"vpshufb %%ymm5,%%ymm1,%%ymm1 \n"
"vmovdqu %%ymm0,(%1) \n"
"vmovdqu %%ymm1,0x20(%1) \n"
"lea 0x40(%1),%1 \n"
"sub $0x20,%2 \n"
"jg 1b \n"
"vzeroupper \n"
: "+r"(src_uv), // %0
"+r"(dst_vu), // %1
"+r"(width) // %2
: "m"(kShuffleUVToVU) // %3
: "memory", "cc", "xmm0", "xmm1", "xmm5");
}
#endif // HAS_SWAPUVROW_AVX2
#endif // defined(__x86_64__) || defined(__i386__) #endif // defined(__x86_64__) || defined(__i386__)
#ifdef __cplusplus #ifdef __cplusplus
......
...@@ -994,6 +994,7 @@ TESTATOPLANAR(I400, 1, 1, I420, 2, 2, 2) ...@@ -994,6 +994,7 @@ TESTATOPLANAR(I400, 1, 1, I420, 2, 2, 2)
TESTATOPLANAR(J400, 1, 1, J420, 2, 2, 2) TESTATOPLANAR(J400, 1, 1, J420, 2, 2, 2)
TESTATOPLANAR(RAW, 3, 1, I420, 2, 2, 4) TESTATOPLANAR(RAW, 3, 1, I420, 2, 2, 4)
TESTATOPLANAR(RGB24, 3, 1, I420, 2, 2, 4) TESTATOPLANAR(RGB24, 3, 1, I420, 2, 2, 4)
// TODO(fbarchard): Investigate high error on Win32.
TESTATOPLANAR(RGB24, 3, 1, J420, 2, 2, 10) TESTATOPLANAR(RGB24, 3, 1, J420, 2, 2, 10)
TESTATOPLANAR(RGB565, 2, 1, I420, 2, 2, 5) TESTATOPLANAR(RGB565, 2, 1, I420, 2, 2, 5)
TESTATOPLANAR(RGBA, 4, 1, I420, 2, 2, 4) TESTATOPLANAR(RGBA, 4, 1, I420, 2, 2, 4)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment