Commit a2891ec7 authored by Frank Barchard

Add MSA optimized YUY2ToI422, YUY2ToI420, UYVYToI422, UYVYToI420 functions

R=fbarchard@google.com
BUG=libyuv:634

Performance gains as below,

YUY2ToI422, YUY2ToI420 :-

YUY2ToYRow_MSA          : ~10x
YUY2ToUVRow_MSA         : ~11x
YUY2ToUV422Row_MSA      : ~9x
YUY2ToYRow_Any_MSA      : ~6x
YUY2ToUVRow_Any_MSA     : ~5x
YUY2ToUV422Row_Any_MSA  : ~4x

UYVYToI422, UYVYToI420 :-

UYVYToYRow_MSA          : ~10x
UYVYToUVRow_MSA         : ~11x
UYVYToUV422Row_MSA      : ~9x
UYVYToYRow_Any_MSA      : ~6x
UYVYToUVRow_Any_MSA     : ~5x
UYVYToUV422Row_Any_MSA  : ~4x

Review URL: https://codereview.chromium.org/2397693002 .
parent 3b88a19a
...@@ -166,12 +166,12 @@ ia32 ...@@ -166,12 +166,12 @@ ia32
mipsel mipsel
gn gen out/Release "--args=is_debug=false target_os=\"android\" target_cpu=\"mipsel\" mips_arch_variant=\"r6\" mips_use_msa=true is_component_build=true is_clang=false" gn gen out/Release "--args=is_debug=false target_os=\"android\" target_cpu=\"mipsel\" mips_arch_variant=\"r6\" mips_use_msa=true is_component_build=true is_clang=false"
gn gen out/Debug "--args=is_debug=true target_os=\"android\" target_cpu=\"mipsel\"" mips_arch_variant=\"r6\" mips_use_msa=true is_component_build=true is_clang=false" gn gen out/Debug "--args=is_debug=true target_os=\"android\" target_cpu=\"mipsel\" mips_arch_variant=\"r6\" mips_use_msa=true is_component_build=true is_clang=false"
ninja -j7 -v -C out/Debug libyuv_unittest ninja -j7 -v -C out/Debug libyuv_unittest
ninja -j7 -v -C out/Release libyuv_unittest ninja -j7 -v -C out/Release libyuv_unittest
gn gen out/Release "--args=is_debug=false target_os=\"android\" target_cpu=\"mips64el\"" mips_arch_variant=\"r6\" mips_use_msa=true is_component_build=true is_clang=false" gn gen out/Release "--args=is_debug=false target_os=\"android\" target_cpu=\"mips64el\" mips_arch_variant=\"r6\" mips_use_msa=true is_component_build=true is_clang=false"
gn gen out/Debug "--args=is_debug=true target_os=\"android\" target_cpu=\"mips64el\"" mips_arch_variant=\"r6\" mips_use_msa=true is_component_build=true is_clang=false" gn gen out/Debug "--args=is_debug=true target_os=\"android\" target_cpu=\"mips64el\" mips_arch_variant=\"r6\" mips_use_msa=true is_component_build=true is_clang=false"
ninja -j7 -v -C out/Debug libyuv_unittest ninja -j7 -v -C out/Debug libyuv_unittest
ninja -j7 -v -C out/Release libyuv_unittest ninja -j7 -v -C out/Release libyuv_unittest
......
...@@ -372,6 +372,12 @@ extern "C" { ...@@ -372,6 +372,12 @@ extern "C" {
#define HAS_ARGBMIRRORROW_MSA #define HAS_ARGBMIRRORROW_MSA
#define HAS_I422TOYUY2ROW_MSA #define HAS_I422TOYUY2ROW_MSA
#define HAS_I422TOUYVYROW_MSA #define HAS_I422TOUYVYROW_MSA
#define HAS_YUY2TOYROW_MSA
#define HAS_YUY2TOUVROW_MSA
#define HAS_YUY2TOUV422ROW_MSA
#define HAS_UYVYTOYROW_MSA
#define HAS_UYVYTOUVROW_MSA
#endif #endif
#if defined(_MSC_VER) && !defined(__CLR_VER) && !defined(__clang__) #if defined(_MSC_VER) && !defined(__CLR_VER) && !defined(__clang__)
...@@ -1669,6 +1675,11 @@ void YUY2ToUVRow_NEON(const uint8* src_yuy2, int stride_yuy2, ...@@ -1669,6 +1675,11 @@ void YUY2ToUVRow_NEON(const uint8* src_yuy2, int stride_yuy2,
uint8* dst_u, uint8* dst_v, int width); uint8* dst_u, uint8* dst_v, int width);
void YUY2ToUV422Row_NEON(const uint8* src_yuy2, void YUY2ToUV422Row_NEON(const uint8* src_yuy2,
uint8* dst_u, uint8* dst_v, int width); uint8* dst_u, uint8* dst_v, int width);
void YUY2ToYRow_MSA(const uint8* src_yuy2, uint8* dst_y, int width);
void YUY2ToUVRow_MSA(const uint8* src_yuy2, int stride_yuy2,
uint8* dst_u, uint8* dst_v, int width);
void YUY2ToUV422Row_MSA(const uint8* src_yuy2,
uint8* dst_u, uint8* dst_v, int width);
void YUY2ToYRow_C(const uint8* src_yuy2, uint8* dst_y, int width); void YUY2ToYRow_C(const uint8* src_yuy2, uint8* dst_y, int width);
void YUY2ToUVRow_C(const uint8* src_yuy2, int stride_yuy2, void YUY2ToUVRow_C(const uint8* src_yuy2, int stride_yuy2,
uint8* dst_u, uint8* dst_v, int width); uint8* dst_u, uint8* dst_v, int width);
...@@ -1689,6 +1700,11 @@ void YUY2ToUVRow_Any_NEON(const uint8* src_yuy2, int stride_yuy2, ...@@ -1689,6 +1700,11 @@ void YUY2ToUVRow_Any_NEON(const uint8* src_yuy2, int stride_yuy2,
uint8* dst_u, uint8* dst_v, int width); uint8* dst_u, uint8* dst_v, int width);
void YUY2ToUV422Row_Any_NEON(const uint8* src_yuy2, void YUY2ToUV422Row_Any_NEON(const uint8* src_yuy2,
uint8* dst_u, uint8* dst_v, int width); uint8* dst_u, uint8* dst_v, int width);
void YUY2ToYRow_Any_MSA(const uint8* src_yuy2, uint8* dst_y, int width);
void YUY2ToUVRow_Any_MSA(const uint8* src_yuy2, int stride_yuy2,
uint8* dst_u, uint8* dst_v, int width);
void YUY2ToUV422Row_Any_MSA(const uint8* src_yuy2,
uint8* dst_u, uint8* dst_v, int width);
void UYVYToYRow_AVX2(const uint8* src_uyvy, uint8* dst_y, int width); void UYVYToYRow_AVX2(const uint8* src_uyvy, uint8* dst_y, int width);
void UYVYToUVRow_AVX2(const uint8* src_uyvy, int stride_uyvy, void UYVYToUVRow_AVX2(const uint8* src_uyvy, int stride_uyvy,
uint8* dst_u, uint8* dst_v, int width); uint8* dst_u, uint8* dst_v, int width);
...@@ -1709,6 +1725,11 @@ void UYVYToUVRow_NEON(const uint8* src_uyvy, int stride_uyvy, ...@@ -1709,6 +1725,11 @@ void UYVYToUVRow_NEON(const uint8* src_uyvy, int stride_uyvy,
uint8* dst_u, uint8* dst_v, int width); uint8* dst_u, uint8* dst_v, int width);
void UYVYToUV422Row_NEON(const uint8* src_uyvy, void UYVYToUV422Row_NEON(const uint8* src_uyvy,
uint8* dst_u, uint8* dst_v, int width); uint8* dst_u, uint8* dst_v, int width);
void UYVYToYRow_MSA(const uint8* src_uyvy, uint8* dst_y, int width);
void UYVYToUVRow_MSA(const uint8* src_uyvy, int stride_uyvy,
uint8* dst_u, uint8* dst_v, int width);
void UYVYToUV422Row_MSA(const uint8* src_uyvy,
uint8* dst_u, uint8* dst_v, int width);
void UYVYToYRow_C(const uint8* src_uyvy, uint8* dst_y, int width); void UYVYToYRow_C(const uint8* src_uyvy, uint8* dst_y, int width);
void UYVYToUVRow_C(const uint8* src_uyvy, int stride_uyvy, void UYVYToUVRow_C(const uint8* src_uyvy, int stride_uyvy,
...@@ -1730,6 +1751,11 @@ void UYVYToUVRow_Any_NEON(const uint8* src_uyvy, int stride_uyvy, ...@@ -1730,6 +1751,11 @@ void UYVYToUVRow_Any_NEON(const uint8* src_uyvy, int stride_uyvy,
uint8* dst_u, uint8* dst_v, int width); uint8* dst_u, uint8* dst_v, int width);
void UYVYToUV422Row_Any_NEON(const uint8* src_uyvy, void UYVYToUV422Row_Any_NEON(const uint8* src_uyvy,
uint8* dst_u, uint8* dst_v, int width); uint8* dst_u, uint8* dst_v, int width);
void UYVYToYRow_Any_MSA(const uint8* src_uyvy, uint8* dst_y, int width);
void UYVYToUVRow_Any_MSA(const uint8* src_uyvy, int stride_uyvy,
uint8* dst_u, uint8* dst_v, int width);
void UYVYToUV422Row_Any_MSA(const uint8* src_uyvy,
uint8* dst_u, uint8* dst_v, int width);
void I422ToYUY2Row_C(const uint8* src_y, void I422ToYUY2Row_C(const uint8* src_y,
const uint8* src_u, const uint8* src_u,
......
...@@ -392,6 +392,16 @@ int YUY2ToI420(const uint8* src_yuy2, int src_stride_yuy2, ...@@ -392,6 +392,16 @@ int YUY2ToI420(const uint8* src_yuy2, int src_stride_yuy2,
} }
} }
#endif #endif
#if defined(HAS_YUY2TOYROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
YUY2ToYRow = YUY2ToYRow_Any_MSA;
YUY2ToUVRow = YUY2ToUVRow_Any_MSA;
if (IS_ALIGNED(width, 32)) {
YUY2ToYRow = YUY2ToYRow_MSA;
YUY2ToUVRow = YUY2ToUVRow_MSA;
}
}
#endif
for (y = 0; y < height - 1; y += 2) { for (y = 0; y < height - 1; y += 2) {
YUY2ToUVRow(src_yuy2, src_stride_yuy2, dst_u, dst_v, width); YUY2ToUVRow(src_yuy2, src_stride_yuy2, dst_u, dst_v, width);
...@@ -457,6 +467,16 @@ int UYVYToI420(const uint8* src_uyvy, int src_stride_uyvy, ...@@ -457,6 +467,16 @@ int UYVYToI420(const uint8* src_uyvy, int src_stride_uyvy,
} }
} }
#endif #endif
#if defined(HAS_UYVYTOYROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
UYVYToYRow = UYVYToYRow_Any_MSA;
UYVYToUVRow = UYVYToUVRow_Any_MSA;
if (IS_ALIGNED(width, 32)) {
UYVYToYRow = UYVYToYRow_MSA;
UYVYToUVRow = UYVYToUVRow_MSA;
}
}
#endif
for (y = 0; y < height - 1; y += 2) { for (y = 0; y < height - 1; y += 2) {
UYVYToUVRow(src_uyvy, src_stride_uyvy, dst_u, dst_v, width); UYVYToUVRow(src_uyvy, src_stride_uyvy, dst_u, dst_v, width);
......
...@@ -482,6 +482,16 @@ int YUY2ToI422(const uint8* src_yuy2, int src_stride_yuy2, ...@@ -482,6 +482,16 @@ int YUY2ToI422(const uint8* src_yuy2, int src_stride_yuy2,
} }
} }
#endif #endif
#if defined(HAS_YUY2TOYROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
YUY2ToYRow = YUY2ToYRow_Any_MSA;
YUY2ToUV422Row = YUY2ToUV422Row_Any_MSA;
if (IS_ALIGNED(width, 32)) {
YUY2ToYRow = YUY2ToYRow_MSA;
YUY2ToUV422Row = YUY2ToUV422Row_MSA;
}
}
#endif
for (y = 0; y < height; ++y) { for (y = 0; y < height; ++y) {
YUY2ToUV422Row(src_yuy2, dst_u, dst_v, width); YUY2ToUV422Row(src_yuy2, dst_u, dst_v, width);
...@@ -556,6 +566,16 @@ int UYVYToI422(const uint8* src_uyvy, int src_stride_uyvy, ...@@ -556,6 +566,16 @@ int UYVYToI422(const uint8* src_uyvy, int src_stride_uyvy,
} }
} }
#endif #endif
#if defined(HAS_UYVYTOYROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
UYVYToYRow = UYVYToYRow_Any_MSA;
UYVYToUV422Row = UYVYToUV422Row_Any_MSA;
if (IS_ALIGNED(width, 32)) {
UYVYToYRow = UYVYToYRow_MSA;
UYVYToUV422Row = UYVYToUV422Row_MSA;
}
}
#endif
for (y = 0; y < height; ++y) { for (y = 0; y < height; ++y) {
UYVYToUV422Row(src_uyvy, dst_u, dst_v, width); UYVYToUV422Row(src_uyvy, dst_u, dst_v, width);
......
...@@ -442,6 +442,12 @@ ANY11(YUY2ToYRow_Any_NEON, YUY2ToYRow_NEON, 1, 4, 1, 15) ...@@ -442,6 +442,12 @@ ANY11(YUY2ToYRow_Any_NEON, YUY2ToYRow_NEON, 1, 4, 1, 15)
#ifdef HAS_UYVYTOYROW_NEON #ifdef HAS_UYVYTOYROW_NEON
ANY11(UYVYToYRow_Any_NEON, UYVYToYRow_NEON, 0, 2, 1, 15) ANY11(UYVYToYRow_Any_NEON, UYVYToYRow_NEON, 0, 2, 1, 15)
#endif #endif
#ifdef HAS_YUY2TOYROW_MSA
ANY11(YUY2ToYRow_Any_MSA, YUY2ToYRow_MSA, 1, 4, 1, 31)
#endif
#ifdef HAS_UYVYTOYROW_MSA
ANY11(UYVYToYRow_Any_MSA, UYVYToYRow_MSA, 0, 2, 1, 31)
#endif
#ifdef HAS_RGB24TOARGBROW_NEON #ifdef HAS_RGB24TOARGBROW_NEON
ANY11(RGB24ToARGBRow_Any_NEON, RGB24ToARGBRow_NEON, 0, 3, 4, 7) ANY11(RGB24ToARGBRow_Any_NEON, RGB24ToARGBRow_NEON, 0, 3, 4, 7)
#endif #endif
...@@ -763,6 +769,10 @@ ANY12(ARGBToUV411Row_Any_NEON, ARGBToUV411Row_NEON, 0, 4, 2, 31) ...@@ -763,6 +769,10 @@ ANY12(ARGBToUV411Row_Any_NEON, ARGBToUV411Row_NEON, 0, 4, 2, 31)
ANY12(YUY2ToUV422Row_Any_NEON, YUY2ToUV422Row_NEON, 1, 4, 1, 15) ANY12(YUY2ToUV422Row_Any_NEON, YUY2ToUV422Row_NEON, 1, 4, 1, 15)
ANY12(UYVYToUV422Row_Any_NEON, UYVYToUV422Row_NEON, 1, 4, 1, 15) ANY12(UYVYToUV422Row_Any_NEON, UYVYToUV422Row_NEON, 1, 4, 1, 15)
#endif #endif
#ifdef HAS_YUY2TOUV422ROW_MSA
ANY12(YUY2ToUV422Row_Any_MSA, YUY2ToUV422Row_MSA, 1, 4, 1, 31)
ANY12(UYVYToUV422Row_Any_MSA, UYVYToUV422Row_MSA, 1, 4, 1, 31)
#endif
#undef ANY12 #undef ANY12
// Any 1 to 2 with source stride (2 rows of source). Outputs UV planes. // Any 1 to 2 with source stride (2 rows of source). Outputs UV planes.
...@@ -848,6 +858,12 @@ ANY12S(YUY2ToUVRow_Any_NEON, YUY2ToUVRow_NEON, 1, 4, 15) ...@@ -848,6 +858,12 @@ ANY12S(YUY2ToUVRow_Any_NEON, YUY2ToUVRow_NEON, 1, 4, 15)
#ifdef HAS_UYVYTOUVROW_NEON #ifdef HAS_UYVYTOUVROW_NEON
ANY12S(UYVYToUVRow_Any_NEON, UYVYToUVRow_NEON, 1, 4, 15) ANY12S(UYVYToUVRow_Any_NEON, UYVYToUVRow_NEON, 1, 4, 15)
#endif #endif
#ifdef HAS_YUY2TOUVROW_MSA
ANY12S(YUY2ToUVRow_Any_MSA, YUY2ToUVRow_MSA, 1, 4, 31)
#endif
#ifdef HAS_UYVYTOUVROW_MSA
ANY12S(UYVYToUVRow_Any_MSA, UYVYToUVRow_MSA, 1, 4, 31)
#endif
#undef ANY12S #undef ANY12S
#ifdef __cplusplus #ifdef __cplusplus
......
...@@ -101,6 +101,126 @@ void I422ToUYVYRow_MSA(const uint8* src_y, ...@@ -101,6 +101,126 @@ void I422ToUYVYRow_MSA(const uint8* src_y,
} }
} }
// Extracts the Y bytes from a packed YUY2 row using MSA vectors.
// Each pass reads 64 bytes from src_yuy2 and writes 32 bytes to dst_y.
// The loop advances 32 pixels at a time, so a width that is not a multiple
// of 32 still executes a complete final pass.
void YUY2ToYRow_MSA(const uint8* src_yuy2, uint8* dst_y, int width) {
  v16u8 in0, in1, in2, in3;
  v16u8 luma_lo, luma_hi;
  int x = 0;
  while (x < width) {
    LD_UB4(src_yuy2, 16, in0, in1, in2, in3);
    // Keep the even-indexed bytes of each vector pair; those are the bytes
    // this routine stores as Y.
    luma_lo = (v16u8) __msa_pckev_b((v16i8) in1, (v16i8) in0);
    luma_hi = (v16u8) __msa_pckev_b((v16i8) in3, (v16i8) in2);
    ST_UB2(luma_lo, luma_hi, dst_y, 16);
    src_yuy2 += 64;
    dst_y += 32;
    x += 32;
  }
}
// Builds one row of U and one row of V from two YUY2 rows using MSA.
// The first row starts at src_yuy2 and the second at
// src_yuy2 + src_stride_yuy2. The odd-indexed bytes of both rows are
// averaged with __msa_aver_u_b; the even bytes of that result are stored to
// dst_u and the odd bytes to dst_v.
// Each pass covers 32 pixels: 64 bytes are read from each row and 16 bytes
// are written to each destination.
void YUY2ToUVRow_MSA(const uint8* src_yuy2, int src_stride_yuy2,
                     uint8* dst_u, uint8* dst_v, int width) {
  const uint8* row1 = src_yuy2 + src_stride_yuy2;
  v16u8 top0, top1, top2, top3;
  v16u8 bot0, bot1, bot2, bot3;
  v16u8 uv_top_lo, uv_top_hi, uv_bot_lo, uv_bot_hi;
  v16u8 avg_lo, avg_hi;
  v16u8 u_out, v_out;
  int x = 0;
  while (x < width) {
    LD_UB4(src_yuy2, 16, top0, top1, top2, top3);
    LD_UB4(row1, 16, bot0, bot1, bot2, bot3);
    // Odd-indexed bytes hold the interleaved chroma of each row.
    uv_top_lo = (v16u8) __msa_pckod_b((v16i8) top1, (v16i8) top0);
    uv_top_hi = (v16u8) __msa_pckod_b((v16i8) top3, (v16i8) top2);
    uv_bot_lo = (v16u8) __msa_pckod_b((v16i8) bot1, (v16i8) bot0);
    uv_bot_hi = (v16u8) __msa_pckod_b((v16i8) bot3, (v16i8) bot2);
    // Combine the two rows vertically.
    avg_lo = __msa_aver_u_b(uv_top_lo, uv_bot_lo);
    avg_hi = __msa_aver_u_b(uv_top_hi, uv_bot_hi);
    // Split the averaged chroma: even bytes -> U, odd bytes -> V.
    u_out = (v16u8) __msa_pckev_b((v16i8) avg_hi, (v16i8) avg_lo);
    v_out = (v16u8) __msa_pckod_b((v16i8) avg_hi, (v16i8) avg_lo);
    ST_UB(u_out, dst_u);
    ST_UB(v_out, dst_v);
    src_yuy2 += 64;
    row1 += 64;
    dst_u += 16;
    dst_v += 16;
    x += 32;
  }
}
// Splits the chroma of a single packed YUY2 row into separate U and V rows
// using MSA. No vertical averaging is done; only one source row is read.
// Each pass covers 32 pixels: 64 source bytes in, 16 bytes out to each of
// dst_u and dst_v.
void YUY2ToUV422Row_MSA(const uint8* src_yuy2, uint8* dst_u, uint8* dst_v,
                        int width) {
  v16u8 in0, in1, in2, in3;
  v16u8 uv_lo, uv_hi;
  v16u8 u_out, v_out;
  int x = 0;
  while (x < width) {
    LD_UB4(src_yuy2, 16, in0, in1, in2, in3);
    // Odd-indexed bytes hold the interleaved chroma.
    uv_lo = (v16u8) __msa_pckod_b((v16i8) in1, (v16i8) in0);
    uv_hi = (v16u8) __msa_pckod_b((v16i8) in3, (v16i8) in2);
    // Split the chroma: even bytes -> U, odd bytes -> V.
    u_out = (v16u8) __msa_pckev_b((v16i8) uv_hi, (v16i8) uv_lo);
    v_out = (v16u8) __msa_pckod_b((v16i8) uv_hi, (v16i8) uv_lo);
    ST_UB(u_out, dst_u);
    ST_UB(v_out, dst_v);
    src_yuy2 += 64;
    dst_u += 16;
    dst_v += 16;
    x += 32;
  }
}
// Extracts the Y bytes from a packed UYVY row using MSA vectors.
// Each pass reads 64 bytes from src_uyvy and writes 32 bytes to dst_y.
// The loop advances 32 pixels at a time, so a width that is not a multiple
// of 32 still executes a complete final pass.
void UYVYToYRow_MSA(const uint8* src_uyvy, uint8* dst_y, int width) {
  v16u8 in0, in1, in2, in3;
  v16u8 luma_lo, luma_hi;
  int x = 0;
  while (x < width) {
    LD_UB4(src_uyvy, 16, in0, in1, in2, in3);
    // Keep the odd-indexed bytes of each vector pair; those are the bytes
    // this routine stores as Y.
    luma_lo = (v16u8) __msa_pckod_b((v16i8) in1, (v16i8) in0);
    luma_hi = (v16u8) __msa_pckod_b((v16i8) in3, (v16i8) in2);
    ST_UB2(luma_lo, luma_hi, dst_y, 16);
    src_uyvy += 64;
    dst_y += 32;
    x += 32;
  }
}
// Builds one row of U and one row of V from two UYVY rows using MSA.
// The first row starts at src_uyvy and the second at
// src_uyvy + src_stride_uyvy. The even-indexed bytes of both rows are
// averaged with __msa_aver_u_b; the even bytes of that result are stored to
// dst_u and the odd bytes to dst_v.
// Each pass covers 32 pixels: 64 bytes are read from each row and 16 bytes
// are written to each destination.
void UYVYToUVRow_MSA(const uint8* src_uyvy, int src_stride_uyvy,
                     uint8* dst_u, uint8* dst_v, int width) {
  const uint8* row1 = src_uyvy + src_stride_uyvy;
  v16u8 top0, top1, top2, top3;
  v16u8 bot0, bot1, bot2, bot3;
  v16u8 uv_top_lo, uv_top_hi, uv_bot_lo, uv_bot_hi;
  v16u8 avg_lo, avg_hi;
  v16u8 u_out, v_out;
  int x = 0;
  while (x < width) {
    LD_UB4(src_uyvy, 16, top0, top1, top2, top3);
    LD_UB4(row1, 16, bot0, bot1, bot2, bot3);
    // Even-indexed bytes hold the interleaved chroma of each row.
    uv_top_lo = (v16u8) __msa_pckev_b((v16i8) top1, (v16i8) top0);
    uv_top_hi = (v16u8) __msa_pckev_b((v16i8) top3, (v16i8) top2);
    uv_bot_lo = (v16u8) __msa_pckev_b((v16i8) bot1, (v16i8) bot0);
    uv_bot_hi = (v16u8) __msa_pckev_b((v16i8) bot3, (v16i8) bot2);
    // Combine the two rows vertically.
    avg_lo = __msa_aver_u_b(uv_top_lo, uv_bot_lo);
    avg_hi = __msa_aver_u_b(uv_top_hi, uv_bot_hi);
    // Split the averaged chroma: even bytes -> U, odd bytes -> V.
    u_out = (v16u8) __msa_pckev_b((v16i8) avg_hi, (v16i8) avg_lo);
    v_out = (v16u8) __msa_pckod_b((v16i8) avg_hi, (v16i8) avg_lo);
    ST_UB(u_out, dst_u);
    ST_UB(v_out, dst_v);
    src_uyvy += 64;
    row1 += 64;
    dst_u += 16;
    dst_v += 16;
    x += 32;
  }
}
// Splits the chroma of a single packed UYVY row into separate U and V rows
// using MSA. No vertical averaging is done; only one source row is read.
// Each pass covers 32 pixels: 64 source bytes in, 16 bytes out to each of
// dst_u and dst_v.
void UYVYToUV422Row_MSA(const uint8* src_uyvy, uint8* dst_u, uint8* dst_v,
                        int width) {
  v16u8 in0, in1, in2, in3;
  v16u8 uv_lo, uv_hi;
  v16u8 u_out, v_out;
  int x = 0;
  while (x < width) {
    LD_UB4(src_uyvy, 16, in0, in1, in2, in3);
    // Even-indexed bytes hold the interleaved chroma.
    uv_lo = (v16u8) __msa_pckev_b((v16i8) in1, (v16i8) in0);
    uv_hi = (v16u8) __msa_pckev_b((v16i8) in3, (v16i8) in2);
    // Split the chroma: even bytes -> U, odd bytes -> V.
    u_out = (v16u8) __msa_pckev_b((v16i8) uv_hi, (v16i8) uv_lo);
    v_out = (v16u8) __msa_pckod_b((v16i8) uv_hi, (v16i8) uv_lo);
    ST_UB(u_out, dst_u);
    ST_UB(v_out, dst_v);
    src_uyvy += 64;
    dst_u += 16;
    dst_v += 16;
    x += 32;
  }
}
#ifdef __cplusplus #ifdef __cplusplus
} // extern "C" } // extern "C"
} // namespace libyuv } // namespace libyuv
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment