Commit 54ce8f23 authored by Manojkumar Bhosale

Add MSA optimized ARGB/ABGR/BGRA/RGBA To Y/UV row functions

R=fbarchard@google.com
BUG=libyuv:634

Performance Gain (vs C auto-vectorized)
ARGBToYJRow_MSA       - ~3.2x
ARGBToYJRow_Any_MSA   - ~2.7x
BGRAToYRow_MSA        - ~3.2x
BGRAToYRow_Any_MSA    - ~2.7x
ABGRToYRow_MSA        - ~3.2x
ABGRToYRow_Any_MSA    - ~2.6x
RGBAToYRow_MSA        - ~3.1x
RGBAToYRow_Any_MSA    - ~2.7x
ARGBToUVJRow_MSA      - ~5.5x
ARGBToUVJRow_Any_MSA  - ~4.5x
BGRAToUVRow_MSA       - ~2.1x
BGRAToUVRow_Any_MSA   - ~2.0x
ABGRToUVRow_MSA       - ~2.1x
ABGRToUVRow_Any_MSA   - ~1.9x
RGBAToUVRow_MSA       - ~2.2x
RGBAToUVRow_Any_MSA   - ~1.9x

Performance Gain (vs C non-vectorized)
ARGBToYJRow_MSA       - ~10.9x
ARGBToYJRow_Any_MSA   -  ~9.2x
BGRAToYRow_MSA        - ~10.9x
BGRAToYRow_Any_MSA    -  ~9.3x
ABGRToYRow_MSA        - ~11.0x
ABGRToYRow_Any_MSA    -  ~9.3x
RGBAToYRow_MSA        - ~10.9x
RGBAToYRow_Any_MSA    -  ~9.1x
ARGBToUVJRow_MSA      - ~12.4x
ARGBToUVJRow_Any_MSA  - ~10.5x
BGRAToUVRow_MSA       -  ~4.7x
BGRAToUVRow_Any_MSA   -  ~4.4x
ABGRToUVRow_MSA       -  ~4.7x
ABGRToUVRow_Any_MSA   -  ~4.5x
RGBAToUVRow_MSA       -  ~4.8x
RGBAToUVRow_Any_MSA   -  ~4.4x

Review-Url: https://codereview.chromium.org/2641153003 .
parent 03510421
......@@ -434,6 +434,14 @@ extern "C" {
#define HAS_SOBELROW_MSA
#define HAS_SOBELTOPLANEROW_MSA
#define HAS_SOBELXYROW_MSA
// Feature macros for the MSA row functions added by this change. Callers and
// the Any-width wrappers test these with #if defined(...) / #ifdef before
// referencing the corresponding *_MSA and *_Any_MSA symbols.
#define HAS_ARGBTOYJROW_MSA
#define HAS_BGRATOYROW_MSA
#define HAS_ABGRTOYROW_MSA
#define HAS_RGBATOYROW_MSA
#define HAS_ARGBTOUVJROW_MSA
#define HAS_BGRATOUVROW_MSA
#define HAS_ABGRTOUVROW_MSA
#define HAS_RGBATOUVROW_MSA
#endif
#if defined(_MSC_VER) && !defined(__CLR_VER) && !defined(__clang__)
......@@ -843,6 +851,7 @@ void RAWToYRow_SSSE3(const uint8* src_raw, uint8* dst_y, int width);
void ARGBToYRow_NEON(const uint8* src_argb, uint8* dst_y, int width);
void ARGBToYJRow_NEON(const uint8* src_argb, uint8* dst_y, int width);
void ARGBToYRow_MSA(const uint8* src_argb, uint8* dst_y, int width);
void ARGBToYJRow_MSA(const uint8* src_argb, uint8* dst_y, int width);
void ARGBToUV444Row_NEON(const uint8* src_argb,
uint8* dst_u,
uint8* dst_v,
......@@ -906,6 +915,26 @@ void ARGB4444ToUVRow_NEON(const uint8* src_argb4444,
uint8* dst_u,
uint8* dst_v,
int width);
// MSA row functions producing separate U and V outputs. Each takes a source
// row pointer, a source stride, the U and V destination pointers and a width.
// NOTE(review): the stride is presumably used to reach a second source row
// for chroma subsampling -- confirm against the row_msa.cc implementation.
void ARGBToUVJRow_MSA(const uint8* src_argb,
int src_stride_argb,
uint8* dst_u,
uint8* dst_v,
int width);
void BGRAToUVRow_MSA(const uint8* src_bgra,
int src_stride_bgra,
uint8* dst_u,
uint8* dst_v,
int width);
void ABGRToUVRow_MSA(const uint8* src_abgr,
int src_stride_abgr,
uint8* dst_u,
uint8* dst_v,
int width);
void RGBAToUVRow_MSA(const uint8* src_rgba,
int src_stride_rgba,
uint8* dst_u,
uint8* dst_v,
int width);
void RGB24ToUVRow_MSA(const uint8* src_rgb24,
int src_stride_rgb24,
uint8* dst_u,
......@@ -934,6 +963,9 @@ void RAWToYRow_NEON(const uint8* src_raw, uint8* dst_y, int width);
void RGB565ToYRow_NEON(const uint8* src_rgb565, uint8* dst_y, int width);
void ARGB1555ToYRow_NEON(const uint8* src_argb1555, uint8* dst_y, int width);
void ARGB4444ToYRow_NEON(const uint8* src_argb4444, uint8* dst_y, int width);
// MSA row functions producing a single Y output row from a BGRA, ABGR or RGBA
// source row. Callers in this change select these only when width is a
// multiple of 16; other widths go through the matching *_Any_MSA wrapper.
void BGRAToYRow_MSA(const uint8* src_bgra, uint8* dst_y, int width);
void ABGRToYRow_MSA(const uint8* src_abgr, uint8* dst_y, int width);
void RGBAToYRow_MSA(const uint8* src_rgba, uint8* dst_y, int width);
void RGB24ToYRow_MSA(const uint8* src_rgb24, uint8* dst_y, int width);
void RAWToYRow_MSA(const uint8* src_raw, uint8* dst_y, int width);
void RGB565ToYRow_MSA(const uint8* src_rgb565, uint8* dst_y, int width);
......@@ -997,6 +1029,10 @@ void RGBAToYRow_Any_DSPR2(const uint8* src_rgba, uint8* dst_y, int width);
void ARGB4444ToYRow_Any_NEON(const uint8* src_argb4444,
uint8* dst_y,
int width);
// Any-width wrappers around the MSA Y row functions; these are instantiated
// with the ANY11 macro and are what callers install before checking whether
// width is aligned enough to call the raw *_MSA function directly.
void BGRAToYRow_Any_MSA(const uint8* src_bgra, uint8* dst_y, int width);
void ABGRToYRow_Any_MSA(const uint8* src_abgr, uint8* dst_y, int width);
void RGBAToYRow_Any_MSA(const uint8* src_rgba, uint8* dst_y, int width);
void ARGBToYJRow_Any_MSA(const uint8* src_argb, uint8* dst_y, int width);
void ARGBToYRow_Any_MSA(const uint8* src_argb, uint8* dst_y, int width);
void RGB24ToYRow_Any_MSA(const uint8* src_rgb24, uint8* dst_y, int width);
void RAWToYRow_Any_MSA(const uint8* src_raw, uint8* dst_y, int width);
......@@ -1136,6 +1172,26 @@ void ARGB4444ToUVRow_Any_NEON(const uint8* src_argb4444,
uint8* dst_u,
uint8* dst_v,
int width);
// Any-width wrappers around the MSA U/V row functions; these are instantiated
// with the ANY12S macro and share the parameter list of the raw *_MSA
// functions (source row, source stride, U and V destinations, width).
void ARGBToUVJRow_Any_MSA(const uint8* src_argb,
int src_stride_argb,
uint8* dst_u,
uint8* dst_v,
int width);
void BGRAToUVRow_Any_MSA(const uint8* src_bgra,
int src_stride_bgra,
uint8* dst_u,
uint8* dst_v,
int width);
void ABGRToUVRow_Any_MSA(const uint8* src_abgr,
int src_stride_abgr,
uint8* dst_u,
uint8* dst_v,
int width);
void RGBAToUVRow_Any_MSA(const uint8* src_rgba,
int src_stride_rgba,
uint8* dst_u,
uint8* dst_v,
int width);
void RGB24ToUVRow_Any_MSA(const uint8* src_rgb24,
int src_stride_rgb24,
uint8* dst_u,
......
......@@ -696,6 +696,22 @@ int BGRAToI420(const uint8* src_bgra,
}
}
#endif
#if defined(HAS_BGRATOYROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
BGRAToYRow = BGRAToYRow_Any_MSA;
if (IS_ALIGNED(width, 16)) {
BGRAToYRow = BGRAToYRow_MSA;
}
}
#endif
#if defined(HAS_BGRATOUVROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
BGRAToUVRow = BGRAToUVRow_Any_MSA;
if (IS_ALIGNED(width, 16)) {
BGRAToUVRow = BGRAToUVRow_MSA;
}
}
#endif
for (y = 0; y < height - 1; y += 2) {
BGRAToUVRow(src_bgra, src_stride_bgra, dst_u, dst_v, width);
......@@ -781,6 +797,22 @@ int ABGRToI420(const uint8* src_abgr,
}
}
#endif
#if defined(HAS_ABGRTOYROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
ABGRToYRow = ABGRToYRow_Any_MSA;
if (IS_ALIGNED(width, 16)) {
ABGRToYRow = ABGRToYRow_MSA;
}
}
#endif
#if defined(HAS_ABGRTOUVROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
ABGRToUVRow = ABGRToUVRow_Any_MSA;
if (IS_ALIGNED(width, 16)) {
ABGRToUVRow = ABGRToUVRow_MSA;
}
}
#endif
for (y = 0; y < height - 1; y += 2) {
ABGRToUVRow(src_abgr, src_stride_abgr, dst_u, dst_v, width);
......@@ -866,6 +898,22 @@ int RGBAToI420(const uint8* src_rgba,
}
}
#endif
#if defined(HAS_RGBATOYROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
RGBAToYRow = RGBAToYRow_Any_MSA;
if (IS_ALIGNED(width, 16)) {
RGBAToYRow = RGBAToYRow_MSA;
}
}
#endif
#if defined(HAS_RGBATOUVROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
RGBAToUVRow = RGBAToUVRow_Any_MSA;
if (IS_ALIGNED(width, 16)) {
RGBAToUVRow = RGBAToUVRow_MSA;
}
}
#endif
for (y = 0; y < height - 1; y += 2) {
RGBAToUVRow(src_rgba, src_stride_rgba, dst_u, dst_v, width);
......
......@@ -1352,6 +1352,22 @@ int ARGBToJ420(const uint8* src_argb,
}
}
#endif
#if defined(HAS_ARGBTOYJROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
ARGBToYJRow = ARGBToYJRow_Any_MSA;
if (IS_ALIGNED(width, 16)) {
ARGBToYJRow = ARGBToYJRow_MSA;
}
}
#endif
#if defined(HAS_ARGBTOUVJROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
ARGBToUVJRow = ARGBToUVJRow_Any_MSA;
if (IS_ALIGNED(width, 32)) {
ARGBToUVJRow = ARGBToUVJRow_MSA;
}
}
#endif
for (y = 0; y < height - 1; y += 2) {
ARGBToUVJRow(src_argb, src_stride_argb, dst_u, dst_v, width);
......@@ -1436,6 +1452,22 @@ int ARGBToJ422(const uint8* src_argb,
}
}
#endif
#if defined(HAS_ARGBTOYJROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
ARGBToYJRow = ARGBToYJRow_Any_MSA;
if (IS_ALIGNED(width, 16)) {
ARGBToYJRow = ARGBToYJRow_MSA;
}
}
#endif
#if defined(HAS_ARGBTOUVJROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
ARGBToUVJRow = ARGBToUVJRow_Any_MSA;
if (IS_ALIGNED(width, 32)) {
ARGBToUVJRow = ARGBToUVJRow_MSA;
}
}
#endif
for (y = 0; y < height; ++y) {
ARGBToUVJRow(src_argb, 0, dst_u, dst_v, width);
......@@ -1497,6 +1529,14 @@ int ARGBToJ400(const uint8* src_argb,
}
}
#endif
#if defined(HAS_ARGBTOYJROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
ARGBToYJRow = ARGBToYJRow_Any_MSA;
if (IS_ALIGNED(width, 16)) {
ARGBToYJRow = ARGBToYJRow_MSA;
}
}
#endif
for (y = 0; y < height; ++y) {
ARGBToYJRow(src_argb, dst_yj, width);
......
......@@ -2564,6 +2564,14 @@ static int ARGBSobelize(const uint8* src_argb,
}
}
#endif
#if defined(HAS_ARGBTOYJROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
ARGBToYJRow = ARGBToYJRow_Any_MSA;
if (IS_ALIGNED(width, 16)) {
ARGBToYJRow = ARGBToYJRow_MSA;
}
}
#endif
#if defined(HAS_SOBELYROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) {
......
......@@ -460,15 +460,27 @@ ANY11(ARGBToYRow_Any_MSA, ARGBToYRow_MSA, 0, 4, 1, 15)
#ifdef HAS_ARGBTOYJROW_NEON
ANY11(ARGBToYJRow_Any_NEON, ARGBToYJRow_NEON, 0, 4, 1, 7)
#endif
#ifdef HAS_ARGBTOYJROW_MSA
ANY11(ARGBToYJRow_Any_MSA, ARGBToYJRow_MSA, 0, 4, 1, 15)
#endif
#ifdef HAS_BGRATOYROW_NEON
ANY11(BGRAToYRow_Any_NEON, BGRAToYRow_NEON, 0, 4, 1, 7)
#endif
#ifdef HAS_BGRATOYROW_MSA
ANY11(BGRAToYRow_Any_MSA, BGRAToYRow_MSA, 0, 4, 1, 15)
#endif
#ifdef HAS_ABGRTOYROW_NEON
ANY11(ABGRToYRow_Any_NEON, ABGRToYRow_NEON, 0, 4, 1, 7)
#endif
#ifdef HAS_ABGRTOYROW_MSA
// The remainder mask must be 15, not 7: callers only hand ABGRToYRow_MSA a
// width directly when IS_ALIGNED(width, 16), and the sibling ARGBToYJ, BGRA
// and RGBA MSA wrappers all use 15. A mask of 7 would pass multiples of 8
// that are not multiples of 16 straight to the kernel.
ANY11(ABGRToYRow_Any_MSA, ABGRToYRow_MSA, 0, 4, 1, 15)
#endif
#ifdef HAS_RGBATOYROW_NEON
ANY11(RGBAToYRow_Any_NEON, RGBAToYRow_NEON, 0, 4, 1, 7)
#endif
#ifdef HAS_RGBATOYROW_MSA
ANY11(RGBAToYRow_Any_MSA, RGBAToYRow_MSA, 0, 4, 1, 15)
#endif
#ifdef HAS_RGB24TOYROW_NEON
ANY11(RGB24ToYRow_Any_NEON, RGB24ToYRow_NEON, 0, 3, 1, 7)
#endif
......@@ -952,15 +964,27 @@ ANY12S(ARGBToUVRow_Any_MSA, ARGBToUVRow_MSA, 0, 4, 31)
#ifdef HAS_ARGBTOUVJROW_NEON
ANY12S(ARGBToUVJRow_Any_NEON, ARGBToUVJRow_NEON, 0, 4, 15)
#endif
#ifdef HAS_ARGBTOUVJROW_MSA
ANY12S(ARGBToUVJRow_Any_MSA, ARGBToUVJRow_MSA, 0, 4, 31)
#endif
#ifdef HAS_BGRATOUVROW_NEON
ANY12S(BGRAToUVRow_Any_NEON, BGRAToUVRow_NEON, 0, 4, 15)
#endif
#ifdef HAS_BGRATOUVROW_MSA
ANY12S(BGRAToUVRow_Any_MSA, BGRAToUVRow_MSA, 0, 4, 31)
#endif
#ifdef HAS_ABGRTOUVROW_NEON
ANY12S(ABGRToUVRow_Any_NEON, ABGRToUVRow_NEON, 0, 4, 15)
#endif
#ifdef HAS_ABGRTOUVROW_MSA
ANY12S(ABGRToUVRow_Any_MSA, ABGRToUVRow_MSA, 0, 4, 31)
#endif
#ifdef HAS_RGBATOUVROW_NEON
ANY12S(RGBAToUVRow_Any_NEON, RGBAToUVRow_NEON, 0, 4, 15)
#endif
#ifdef HAS_RGBATOUVROW_MSA
ANY12S(RGBAToUVRow_Any_MSA, RGBAToUVRow_MSA, 0, 4, 31)
#endif
#ifdef HAS_RGB24TOUVROW_NEON
ANY12S(RGB24ToUVRow_Any_NEON, RGB24ToUVRow_NEON, 0, 3, 15)
#endif
......
This diff is collapsed.
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment