Commit eed66b20 authored by Manojkumar Bhosale, committed by Commit Bot

Add MSA optimized I444/I400/J400/YUY2/UYVY to ARGB row functions

BUG=libyuv:634

Change-Id: Ida80027c36a938a3bcf6f4480626f8eb9495e1be

Performance Gain (vs C auto-vectorized)
I444ToARGBRow_MSA       - ~1.6x
I444ToARGBRow_Any_MSA   - ~1.6x
I400ToARGBRow_MSA       - ~5.5x
I400ToARGBRow_Any_MSA   - ~5.3x
J400ToARGBRow_MSA       - ~1.0x
J400ToARGBRow_Any_MSA   - ~1.0x
YUY2ToARGBRow_MSA       - ~1.6x
YUY2ToARGBRow_Any_MSA   - ~1.6x
UYVYToARGBRow_MSA       - ~1.6x
UYVYToARGBRow_Any_MSA   - ~1.6x

Performance Gain (vs C non-vectorized)
I444ToARGBRow_MSA       - ~7.3x
I444ToARGBRow_Any_MSA   - ~7.1x
I400ToARGBRow_MSA       - ~5.5x
I400ToARGBRow_Any_MSA   - ~5.2x
J400ToARGBRow_MSA       - ~6.8x
J400ToARGBRow_Any_MSA   - ~5.7x
YUY2ToARGBRow_MSA       - ~7.2x
YUY2ToARGBRow_Any_MSA   - ~7.0x
UYVYToARGBRow_MSA       - ~7.1x
UYVYToARGBRow_Any_MSA   - ~6.9x

Change-Id: Ida80027c36a938a3bcf6f4480626f8eb9495e1be
Reviewed-on: https://chromium-review.googlesource.com/439246
Reviewed-by: Frank Barchard <fbarchard@google.com>
Commit-Queue: Frank Barchard <fbarchard@google.com>
parent bbe8c233
...@@ -442,6 +442,11 @@ extern "C" { ...@@ -442,6 +442,11 @@ extern "C" {
#define HAS_BGRATOUVROW_MSA #define HAS_BGRATOUVROW_MSA
#define HAS_ABGRTOUVROW_MSA #define HAS_ABGRTOUVROW_MSA
#define HAS_RGBATOUVROW_MSA #define HAS_RGBATOUVROW_MSA
#define HAS_I444TOARGBROW_MSA
#define HAS_I400TOARGBROW_MSA
#define HAS_J400TOARGBROW_MSA
#define HAS_YUY2TOARGBROW_MSA
#define HAS_UYVYTOARGBROW_MSA
#endif #endif
#if defined(_MSC_VER) && !defined(__CLR_VER) && !defined(__clang__) #if defined(_MSC_VER) && !defined(__CLR_VER) && !defined(__clang__)
...@@ -754,6 +759,12 @@ void UYVYToARGBRow_NEON(const uint8* src_uyvy, ...@@ -754,6 +759,12 @@ void UYVYToARGBRow_NEON(const uint8* src_uyvy,
uint8* dst_argb, uint8* dst_argb,
const struct YuvConstants* yuvconstants, const struct YuvConstants* yuvconstants,
int width); int width);
void I444ToARGBRow_MSA(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
uint8* dst_argb,
const struct YuvConstants* yuvconstants,
int width);
void I444ToARGBRow_DSPR2(const uint8* src_y, void I444ToARGBRow_DSPR2(const uint8* src_y,
const uint8* src_u, const uint8* src_u,
const uint8* src_v, const uint8* src_v,
...@@ -836,6 +847,14 @@ void NV21ToARGBRow_MSA(const uint8* src_y, ...@@ -836,6 +847,14 @@ void NV21ToARGBRow_MSA(const uint8* src_y,
uint8* dst_argb, uint8* dst_argb,
const struct YuvConstants* yuvconstants, const struct YuvConstants* yuvconstants,
int width); int width);
void YUY2ToARGBRow_MSA(const uint8* src_yuy2,
uint8* dst_argb,
const struct YuvConstants* yuvconstants,
int width);
void UYVYToARGBRow_MSA(const uint8* src_uyvy,
uint8* dst_argb,
const struct YuvConstants* yuvconstants,
int width);
void ARGBToYRow_AVX2(const uint8* src_argb, uint8* dst_y, int width); void ARGBToYRow_AVX2(const uint8* src_argb, uint8* dst_y, int width);
void ARGBToYRow_Any_AVX2(const uint8* src_argb, uint8* dst_y, int width); void ARGBToYRow_Any_AVX2(const uint8* src_argb, uint8* dst_y, int width);
...@@ -1679,10 +1698,12 @@ void ARGBToARGB4444Row_C(const uint8* src_argb, uint8* dst_rgb, int width); ...@@ -1679,10 +1698,12 @@ void ARGBToARGB4444Row_C(const uint8* src_argb, uint8* dst_rgb, int width);
void J400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int width); void J400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int width);
void J400ToARGBRow_AVX2(const uint8* src_y, uint8* dst_argb, int width); void J400ToARGBRow_AVX2(const uint8* src_y, uint8* dst_argb, int width);
void J400ToARGBRow_NEON(const uint8* src_y, uint8* dst_argb, int width); void J400ToARGBRow_NEON(const uint8* src_y, uint8* dst_argb, int width);
void J400ToARGBRow_MSA(const uint8* src_y, uint8* dst_argb, int width);
void J400ToARGBRow_C(const uint8* src_y, uint8* dst_argb, int width); void J400ToARGBRow_C(const uint8* src_y, uint8* dst_argb, int width);
void J400ToARGBRow_Any_SSE2(const uint8* src_y, uint8* dst_argb, int width); void J400ToARGBRow_Any_SSE2(const uint8* src_y, uint8* dst_argb, int width);
void J400ToARGBRow_Any_AVX2(const uint8* src_y, uint8* dst_argb, int width); void J400ToARGBRow_Any_AVX2(const uint8* src_y, uint8* dst_argb, int width);
void J400ToARGBRow_Any_NEON(const uint8* src_y, uint8* dst_argb, int width); void J400ToARGBRow_Any_NEON(const uint8* src_y, uint8* dst_argb, int width);
void J400ToARGBRow_Any_MSA(const uint8* src_y, uint8* dst_argb, int width);
void I444ToARGBRow_C(const uint8* src_y, void I444ToARGBRow_C(const uint8* src_y,
const uint8* src_u, const uint8* src_u,
...@@ -2079,9 +2100,11 @@ void I400ToARGBRow_C(const uint8* src_y, uint8* dst_argb, int width); ...@@ -2079,9 +2100,11 @@ void I400ToARGBRow_C(const uint8* src_y, uint8* dst_argb, int width);
void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int width); void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int width);
void I400ToARGBRow_AVX2(const uint8* src_y, uint8* dst_argb, int width); void I400ToARGBRow_AVX2(const uint8* src_y, uint8* dst_argb, int width);
void I400ToARGBRow_NEON(const uint8* src_y, uint8* dst_argb, int width); void I400ToARGBRow_NEON(const uint8* src_y, uint8* dst_argb, int width);
void I400ToARGBRow_MSA(const uint8* src_y, uint8* dst_argb, int width);
void I400ToARGBRow_Any_SSE2(const uint8* src_y, uint8* dst_argb, int width); void I400ToARGBRow_Any_SSE2(const uint8* src_y, uint8* dst_argb, int width);
void I400ToARGBRow_Any_AVX2(const uint8* src_y, uint8* dst_argb, int width); void I400ToARGBRow_Any_AVX2(const uint8* src_y, uint8* dst_argb, int width);
void I400ToARGBRow_Any_NEON(const uint8* src_y, uint8* dst_argb, int width); void I400ToARGBRow_Any_NEON(const uint8* src_y, uint8* dst_argb, int width);
void I400ToARGBRow_Any_MSA(const uint8* src_y, uint8* dst_argb, int width);
// ARGB preattenuated alpha blend. // ARGB preattenuated alpha blend.
void ARGBBlendRow_SSSE3(const uint8* src_argb, void ARGBBlendRow_SSSE3(const uint8* src_argb,
...@@ -2413,6 +2436,12 @@ void I422ToARGBRow_DSPR2(const uint8* src_y, ...@@ -2413,6 +2436,12 @@ void I422ToARGBRow_DSPR2(const uint8* src_y,
uint8* dst_argb, uint8* dst_argb,
const struct YuvConstants* yuvconstants, const struct YuvConstants* yuvconstants,
int width); int width);
void I444ToARGBRow_Any_MSA(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
uint8* dst_argb,
const struct YuvConstants* yuvconstants,
int width);
void I422ToARGBRow_Any_MSA(const uint8* src_y, void I422ToARGBRow_Any_MSA(const uint8* src_y,
const uint8* src_u, const uint8* src_u,
const uint8* src_v, const uint8* src_v,
...@@ -2471,6 +2500,14 @@ void NV21ToARGBRow_Any_MSA(const uint8* src_y, ...@@ -2471,6 +2500,14 @@ void NV21ToARGBRow_Any_MSA(const uint8* src_y,
uint8* dst_argb, uint8* dst_argb,
const struct YuvConstants* yuvconstants, const struct YuvConstants* yuvconstants,
int width); int width);
void YUY2ToARGBRow_Any_MSA(const uint8* src_yuy2,
uint8* dst_argb,
const struct YuvConstants* yuvconstants,
int width);
void UYVYToARGBRow_Any_MSA(const uint8* src_uyvy,
uint8* dst_argb,
const struct YuvConstants* yuvconstants,
int width);
void YUY2ToYRow_AVX2(const uint8* src_yuy2, uint8* dst_y, int width); void YUY2ToYRow_AVX2(const uint8* src_yuy2, uint8* dst_y, int width);
void YUY2ToUVRow_AVX2(const uint8* src_yuy2, void YUY2ToUVRow_AVX2(const uint8* src_yuy2,
......
...@@ -493,6 +493,14 @@ static int I444ToARGBMatrix(const uint8* src_y, ...@@ -493,6 +493,14 @@ static int I444ToARGBMatrix(const uint8* src_y,
} }
} }
#endif #endif
#if defined(HAS_I444TOARGBROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
I444ToARGBRow = I444ToARGBRow_Any_MSA;
if (IS_ALIGNED(width, 8)) {
I444ToARGBRow = I444ToARGBRow_MSA;
}
}
#endif
for (y = 0; y < height; ++y) { for (y = 0; y < height; ++y) {
I444ToARGBRow(src_y, src_u, src_v, dst_argb, yuvconstants, width); I444ToARGBRow(src_y, src_u, src_v, dst_argb, yuvconstants, width);
...@@ -773,6 +781,14 @@ int I400ToARGB(const uint8* src_y, ...@@ -773,6 +781,14 @@ int I400ToARGB(const uint8* src_y,
} }
} }
#endif #endif
#if defined(HAS_I400TOARGBROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
I400ToARGBRow = I400ToARGBRow_Any_MSA;
if (IS_ALIGNED(width, 16)) {
I400ToARGBRow = I400ToARGBRow_MSA;
}
}
#endif
for (y = 0; y < height; ++y) { for (y = 0; y < height; ++y) {
I400ToARGBRow(src_y, dst_argb, width); I400ToARGBRow(src_y, dst_argb, width);
...@@ -831,6 +847,14 @@ int J400ToARGB(const uint8* src_y, ...@@ -831,6 +847,14 @@ int J400ToARGB(const uint8* src_y,
J400ToARGBRow = J400ToARGBRow_NEON; J400ToARGBRow = J400ToARGBRow_NEON;
} }
} }
#endif
#if defined(HAS_J400TOARGBROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
J400ToARGBRow = J400ToARGBRow_Any_MSA;
if (IS_ALIGNED(width, 16)) {
J400ToARGBRow = J400ToARGBRow_MSA;
}
}
#endif #endif
for (y = 0; y < height; ++y) { for (y = 0; y < height; ++y) {
J400ToARGBRow(src_y, dst_argb, width); J400ToARGBRow(src_y, dst_argb, width);
...@@ -1540,6 +1564,14 @@ int YUY2ToARGB(const uint8* src_yuy2, ...@@ -1540,6 +1564,14 @@ int YUY2ToARGB(const uint8* src_yuy2,
YUY2ToARGBRow = YUY2ToARGBRow_NEON; YUY2ToARGBRow = YUY2ToARGBRow_NEON;
} }
} }
#endif
#if defined(HAS_YUY2TOARGBROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
YUY2ToARGBRow = YUY2ToARGBRow_Any_MSA;
if (IS_ALIGNED(width, 8)) {
YUY2ToARGBRow = YUY2ToARGBRow_MSA;
}
}
#endif #endif
for (y = 0; y < height; ++y) { for (y = 0; y < height; ++y) {
YUY2ToARGBRow(src_yuy2, dst_argb, &kYuvI601Constants, width); YUY2ToARGBRow(src_yuy2, dst_argb, &kYuvI601Constants, width);
...@@ -1599,6 +1631,14 @@ int UYVYToARGB(const uint8* src_uyvy, ...@@ -1599,6 +1631,14 @@ int UYVYToARGB(const uint8* src_uyvy,
UYVYToARGBRow = UYVYToARGBRow_NEON; UYVYToARGBRow = UYVYToARGBRow_NEON;
} }
} }
#endif
#if defined(HAS_UYVYTOARGBROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
UYVYToARGBRow = UYVYToARGBRow_Any_MSA;
if (IS_ALIGNED(width, 8)) {
UYVYToARGBRow = UYVYToARGBRow_MSA;
}
}
#endif #endif
for (y = 0; y < height; ++y) { for (y = 0; y < height; ++y) {
UYVYToARGBRow(src_uyvy, dst_argb, &kYuvI601Constants, width); UYVYToARGBRow(src_uyvy, dst_argb, &kYuvI601Constants, width);
......
...@@ -174,6 +174,7 @@ ANY31C(I422ToARGB4444Row_Any_DSPR2, I422ToARGB4444Row_DSPR2, 1, 0, 2, 7) ...@@ -174,6 +174,7 @@ ANY31C(I422ToARGB4444Row_Any_DSPR2, I422ToARGB4444Row_DSPR2, 1, 0, 2, 7)
ANY31C(I422ToARGB1555Row_Any_DSPR2, I422ToARGB1555Row_DSPR2, 1, 0, 2, 7) ANY31C(I422ToARGB1555Row_Any_DSPR2, I422ToARGB1555Row_DSPR2, 1, 0, 2, 7)
#endif #endif
#ifdef HAS_I422TOARGBROW_MSA #ifdef HAS_I422TOARGBROW_MSA
ANY31C(I444ToARGBRow_Any_MSA, I444ToARGBRow_MSA, 0, 0, 4, 7)
ANY31C(I422ToARGBRow_Any_MSA, I422ToARGBRow_MSA, 1, 0, 4, 7) ANY31C(I422ToARGBRow_Any_MSA, I422ToARGBRow_MSA, 1, 0, 4, 7)
ANY31C(I422ToRGBARow_Any_MSA, I422ToRGBARow_MSA, 1, 0, 4, 7) ANY31C(I422ToRGBARow_Any_MSA, I422ToRGBARow_MSA, 1, 0, 4, 7)
ANY31C(I422ToRGB24Row_Any_MSA, I422ToRGB24Row_MSA, 1, 0, 3, 15) ANY31C(I422ToRGB24Row_Any_MSA, I422ToRGB24Row_MSA, 1, 0, 3, 15)
...@@ -422,6 +423,8 @@ ANY11(ARGBToRAWRow_Any_MSA, ARGBToRAWRow_MSA, 0, 4, 3, 15) ...@@ -422,6 +423,8 @@ ANY11(ARGBToRAWRow_Any_MSA, ARGBToRAWRow_MSA, 0, 4, 3, 15)
ANY11(ARGBToRGB565Row_Any_MSA, ARGBToRGB565Row_MSA, 0, 4, 2, 7) ANY11(ARGBToRGB565Row_Any_MSA, ARGBToRGB565Row_MSA, 0, 4, 2, 7)
ANY11(ARGBToARGB1555Row_Any_MSA, ARGBToARGB1555Row_MSA, 0, 4, 2, 7) ANY11(ARGBToARGB1555Row_Any_MSA, ARGBToARGB1555Row_MSA, 0, 4, 2, 7)
ANY11(ARGBToARGB4444Row_Any_MSA, ARGBToARGB4444Row_MSA, 0, 4, 2, 7) ANY11(ARGBToARGB4444Row_Any_MSA, ARGBToARGB4444Row_MSA, 0, 4, 2, 7)
ANY11(J400ToARGBRow_Any_MSA, J400ToARGBRow_MSA, 0, 1, 4, 15)
ANY11(I400ToARGBRow_Any_MSA, I400ToARGBRow_MSA, 0, 1, 4, 15)
#endif #endif
#if defined(HAS_RAWTORGB24ROW_NEON) #if defined(HAS_RAWTORGB24ROW_NEON)
ANY11(RAWToRGB24Row_Any_NEON, RAWToRGB24Row_NEON, 0, 3, 3, 7) ANY11(RAWToRGB24Row_Any_NEON, RAWToRGB24Row_NEON, 0, 3, 3, 7)
...@@ -759,6 +762,10 @@ ANY11C(UYVYToARGBRow_Any_AVX2, UYVYToARGBRow_AVX2, 1, 4, 4, 31) ...@@ -759,6 +762,10 @@ ANY11C(UYVYToARGBRow_Any_AVX2, UYVYToARGBRow_AVX2, 1, 4, 4, 31)
ANY11C(YUY2ToARGBRow_Any_NEON, YUY2ToARGBRow_NEON, 1, 4, 4, 7) ANY11C(YUY2ToARGBRow_Any_NEON, YUY2ToARGBRow_NEON, 1, 4, 4, 7)
ANY11C(UYVYToARGBRow_Any_NEON, UYVYToARGBRow_NEON, 1, 4, 4, 7) ANY11C(UYVYToARGBRow_Any_NEON, UYVYToARGBRow_NEON, 1, 4, 4, 7)
#endif #endif
#if defined(HAS_YUY2TOARGBROW_MSA)
ANY11C(YUY2ToARGBRow_Any_MSA, YUY2ToARGBRow_MSA, 1, 4, 4, 7)
ANY11C(UYVYToARGBRow_Any_MSA, UYVYToARGBRow_MSA, 1, 4, 4, 7)
#endif
#undef ANY11C #undef ANY11C
// Any 1 to 1 interpolate. Takes 2 rows of source via stride. // Any 1 to 1 interpolate. Takes 2 rows of source via stride.
......
This diff is collapsed.
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment