Commit eed66b20 authored by Manojkumar Bhosale's avatar Manojkumar Bhosale Committed by Commit Bot

Add MSA optimized I444/I400/J400/YUY2/UYVY to ARGB row functions

BUG=libyuv:634

Change-Id: Ida80027c36a938a3bcf6f4480626f8eb9495e1be

Performance Gain (vs C auto-vectorized)
I444ToARGBRow_MSA       - ~1.6x
I444ToARGBRow_Any_MSA   - ~1.6x
I400ToARGBRow_MSA       - ~5.5x
I400ToARGBRow_Any_MSA   - ~5.3x
J400ToARGBRow_MSA       - ~1.0x
J400ToARGBRow_Any_MSA   - ~1.0x
YUY2ToARGBRow_MSA       - ~1.6x
YUY2ToARGBRow_Any_MSA   - ~1.6x
UYVYToARGBRow_MSA       - ~1.6x
UYVYToARGBRow_Any_MSA   - ~1.6x

Performance Gain (vs C non-vectorized)
I444ToARGBRow_MSA       - ~7.3x
I444ToARGBRow_Any_MSA   - ~7.1x
I400ToARGBRow_MSA       - ~5.5x
I400ToARGBRow_Any_MSA   - ~5.2x
J400ToARGBRow_MSA       - ~6.8x
J400ToARGBRow_Any_MSA   - ~5.7x
YUY2ToARGBRow_MSA       - ~7.2x
YUY2ToARGBRow_Any_MSA   - ~7.0x
UYVYToARGBRow_MSA       - ~7.1x
UYVYToARGBRow_Any_MSA   - ~6.9x

Change-Id: Ida80027c36a938a3bcf6f4480626f8eb9495e1be
Reviewed-on: https://chromium-review.googlesource.com/439246Reviewed-by: 's avatarFrank Barchard <fbarchard@google.com>
Commit-Queue: Frank Barchard <fbarchard@google.com>
parent bbe8c233
......@@ -442,6 +442,11 @@ extern "C" {
#define HAS_BGRATOUVROW_MSA
#define HAS_ABGRTOUVROW_MSA
#define HAS_RGBATOUVROW_MSA
#define HAS_I444TOARGBROW_MSA
#define HAS_I400TOARGBROW_MSA
#define HAS_J400TOARGBROW_MSA
#define HAS_YUY2TOARGBROW_MSA
#define HAS_UYVYTOARGBROW_MSA
#endif
#if defined(_MSC_VER) && !defined(__CLR_VER) && !defined(__clang__)
......@@ -754,6 +759,12 @@ void UYVYToARGBRow_NEON(const uint8* src_uyvy,
uint8* dst_argb,
const struct YuvConstants* yuvconstants,
int width);
void I444ToARGBRow_MSA(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
uint8* dst_argb,
const struct YuvConstants* yuvconstants,
int width);
void I444ToARGBRow_DSPR2(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
......@@ -836,6 +847,14 @@ void NV21ToARGBRow_MSA(const uint8* src_y,
uint8* dst_argb,
const struct YuvConstants* yuvconstants,
int width);
void YUY2ToARGBRow_MSA(const uint8* src_yuy2,
uint8* dst_argb,
const struct YuvConstants* yuvconstants,
int width);
void UYVYToARGBRow_MSA(const uint8* src_uyvy,
uint8* dst_argb,
const struct YuvConstants* yuvconstants,
int width);
void ARGBToYRow_AVX2(const uint8* src_argb, uint8* dst_y, int width);
void ARGBToYRow_Any_AVX2(const uint8* src_argb, uint8* dst_y, int width);
......@@ -1679,10 +1698,12 @@ void ARGBToARGB4444Row_C(const uint8* src_argb, uint8* dst_rgb, int width);
void J400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int width);
void J400ToARGBRow_AVX2(const uint8* src_y, uint8* dst_argb, int width);
void J400ToARGBRow_NEON(const uint8* src_y, uint8* dst_argb, int width);
void J400ToARGBRow_MSA(const uint8* src_y, uint8* dst_argb, int width);
void J400ToARGBRow_C(const uint8* src_y, uint8* dst_argb, int width);
void J400ToARGBRow_Any_SSE2(const uint8* src_y, uint8* dst_argb, int width);
void J400ToARGBRow_Any_AVX2(const uint8* src_y, uint8* dst_argb, int width);
void J400ToARGBRow_Any_NEON(const uint8* src_y, uint8* dst_argb, int width);
void J400ToARGBRow_Any_MSA(const uint8* src_y, uint8* dst_argb, int width);
void I444ToARGBRow_C(const uint8* src_y,
const uint8* src_u,
......@@ -2079,9 +2100,11 @@ void I400ToARGBRow_C(const uint8* src_y, uint8* dst_argb, int width);
void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int width);
void I400ToARGBRow_AVX2(const uint8* src_y, uint8* dst_argb, int width);
void I400ToARGBRow_NEON(const uint8* src_y, uint8* dst_argb, int width);
void I400ToARGBRow_MSA(const uint8* src_y, uint8* dst_argb, int width);
void I400ToARGBRow_Any_SSE2(const uint8* src_y, uint8* dst_argb, int width);
void I400ToARGBRow_Any_AVX2(const uint8* src_y, uint8* dst_argb, int width);
void I400ToARGBRow_Any_NEON(const uint8* src_y, uint8* dst_argb, int width);
void I400ToARGBRow_Any_MSA(const uint8* src_y, uint8* dst_argb, int width);
// ARGB preattenuated alpha blend.
void ARGBBlendRow_SSSE3(const uint8* src_argb,
......@@ -2413,6 +2436,12 @@ void I422ToARGBRow_DSPR2(const uint8* src_y,
uint8* dst_argb,
const struct YuvConstants* yuvconstants,
int width);
void I444ToARGBRow_Any_MSA(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
uint8* dst_argb,
const struct YuvConstants* yuvconstants,
int width);
void I422ToARGBRow_Any_MSA(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
......@@ -2471,6 +2500,14 @@ void NV21ToARGBRow_Any_MSA(const uint8* src_y,
uint8* dst_argb,
const struct YuvConstants* yuvconstants,
int width);
void YUY2ToARGBRow_Any_MSA(const uint8* src_yuy2,
uint8* dst_argb,
const struct YuvConstants* yuvconstants,
int width);
void UYVYToARGBRow_Any_MSA(const uint8* src_uyvy,
uint8* dst_argb,
const struct YuvConstants* yuvconstants,
int width);
void YUY2ToYRow_AVX2(const uint8* src_yuy2, uint8* dst_y, int width);
void YUY2ToUVRow_AVX2(const uint8* src_yuy2,
......
......@@ -493,6 +493,14 @@ static int I444ToARGBMatrix(const uint8* src_y,
}
}
#endif
#if defined(HAS_I444TOARGBROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
I444ToARGBRow = I444ToARGBRow_Any_MSA;
if (IS_ALIGNED(width, 8)) {
I444ToARGBRow = I444ToARGBRow_MSA;
}
}
#endif
for (y = 0; y < height; ++y) {
I444ToARGBRow(src_y, src_u, src_v, dst_argb, yuvconstants, width);
......@@ -773,6 +781,14 @@ int I400ToARGB(const uint8* src_y,
}
}
#endif
#if defined(HAS_I400TOARGBROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
I400ToARGBRow = I400ToARGBRow_Any_MSA;
if (IS_ALIGNED(width, 16)) {
I400ToARGBRow = I400ToARGBRow_MSA;
}
}
#endif
for (y = 0; y < height; ++y) {
I400ToARGBRow(src_y, dst_argb, width);
......@@ -831,6 +847,14 @@ int J400ToARGB(const uint8* src_y,
J400ToARGBRow = J400ToARGBRow_NEON;
}
}
#endif
#if defined(HAS_J400TOARGBROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
J400ToARGBRow = J400ToARGBRow_Any_MSA;
if (IS_ALIGNED(width, 16)) {
J400ToARGBRow = J400ToARGBRow_MSA;
}
}
#endif
for (y = 0; y < height; ++y) {
J400ToARGBRow(src_y, dst_argb, width);
......@@ -1540,6 +1564,14 @@ int YUY2ToARGB(const uint8* src_yuy2,
YUY2ToARGBRow = YUY2ToARGBRow_NEON;
}
}
#endif
#if defined(HAS_YUY2TOARGBROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
YUY2ToARGBRow = YUY2ToARGBRow_Any_MSA;
if (IS_ALIGNED(width, 8)) {
YUY2ToARGBRow = YUY2ToARGBRow_MSA;
}
}
#endif
for (y = 0; y < height; ++y) {
YUY2ToARGBRow(src_yuy2, dst_argb, &kYuvI601Constants, width);
......@@ -1599,6 +1631,14 @@ int UYVYToARGB(const uint8* src_uyvy,
UYVYToARGBRow = UYVYToARGBRow_NEON;
}
}
#endif
#if defined(HAS_UYVYTOARGBROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
UYVYToARGBRow = UYVYToARGBRow_Any_MSA;
if (IS_ALIGNED(width, 8)) {
UYVYToARGBRow = UYVYToARGBRow_MSA;
}
}
#endif
for (y = 0; y < height; ++y) {
UYVYToARGBRow(src_uyvy, dst_argb, &kYuvI601Constants, width);
......
......@@ -174,6 +174,7 @@ ANY31C(I422ToARGB4444Row_Any_DSPR2, I422ToARGB4444Row_DSPR2, 1, 0, 2, 7)
ANY31C(I422ToARGB1555Row_Any_DSPR2, I422ToARGB1555Row_DSPR2, 1, 0, 2, 7)
#endif
#ifdef HAS_I422TOARGBROW_MSA
ANY31C(I444ToARGBRow_Any_MSA, I444ToARGBRow_MSA, 0, 0, 4, 7)
ANY31C(I422ToARGBRow_Any_MSA, I422ToARGBRow_MSA, 1, 0, 4, 7)
ANY31C(I422ToRGBARow_Any_MSA, I422ToRGBARow_MSA, 1, 0, 4, 7)
ANY31C(I422ToRGB24Row_Any_MSA, I422ToRGB24Row_MSA, 1, 0, 3, 15)
......@@ -422,6 +423,8 @@ ANY11(ARGBToRAWRow_Any_MSA, ARGBToRAWRow_MSA, 0, 4, 3, 15)
ANY11(ARGBToRGB565Row_Any_MSA, ARGBToRGB565Row_MSA, 0, 4, 2, 7)
ANY11(ARGBToARGB1555Row_Any_MSA, ARGBToARGB1555Row_MSA, 0, 4, 2, 7)
ANY11(ARGBToARGB4444Row_Any_MSA, ARGBToARGB4444Row_MSA, 0, 4, 2, 7)
ANY11(J400ToARGBRow_Any_MSA, J400ToARGBRow_MSA, 0, 1, 4, 15)
ANY11(I400ToARGBRow_Any_MSA, I400ToARGBRow_MSA, 0, 1, 4, 15)
#endif
#if defined(HAS_RAWTORGB24ROW_NEON)
ANY11(RAWToRGB24Row_Any_NEON, RAWToRGB24Row_NEON, 0, 3, 3, 7)
......@@ -759,6 +762,10 @@ ANY11C(UYVYToARGBRow_Any_AVX2, UYVYToARGBRow_AVX2, 1, 4, 4, 31)
ANY11C(YUY2ToARGBRow_Any_NEON, YUY2ToARGBRow_NEON, 1, 4, 4, 7)
ANY11C(UYVYToARGBRow_Any_NEON, UYVYToARGBRow_NEON, 1, 4, 4, 7)
#endif
#if defined(HAS_YUY2TOARGBROW_MSA)
ANY11C(YUY2ToARGBRow_Any_MSA, YUY2ToARGBRow_MSA, 1, 4, 4, 7)
ANY11C(UYVYToARGBRow_Any_MSA, UYVYToARGBRow_MSA, 1, 4, 4, 7)
#endif
#undef ANY11C
// Any 1 to 1 interpolate. Takes 2 rows of source via stride.
......
This diff is collapsed.
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment