Commit 7018f5be authored by Frank Barchard

Add MSA optimized I422ToYUY2Row, I422ToUYVYRow functions

R=fbarchard@google.com
BUG=libyuv:634

Performance gains:

I422ToYUY2Row_MSA     - ~12x
I422ToYUY2Row_Any_MSA - ~7x

I422ToUYVYRow_MSA     - ~12x
I422ToUYVYRow_Any_MSA - ~7x

Review URL: https://codereview.chromium.org/2378753004 .
parent aa197ee1
...@@ -71,6 +71,19 @@ ...@@ -71,6 +71,19 @@
} }
#define VSHF_B2_UB(...) VSHF_B2(v16u8, __VA_ARGS__) #define VSHF_B2_UB(...) VSHF_B2(v16u8, __VA_ARGS__)
/* Description : Interleave both left and right half of input vectors
Arguments : Inputs - in0, in1
Outputs - out0, out1
Return Type - as per RTYPE
Details : Right half of byte elements from 'in0' and 'in1' are
interleaved and written to 'out0'.
Left half of byte elements from 'in0' and 'in1' are
interleaved and written to 'out1'.
Notes : 'in0' and 'in1' are each expanded twice (once per output), so
pass expressions without side effects.
ILVRL_B2_UB is the same operation with RTYPE fixed to v16u8.
*/
#define ILVRL_B2(RTYPE, in0, in1, out0, out1) { \
out0 = (RTYPE) __msa_ilvr_b((v16i8) in0, (v16i8) in1); \
out1 = (RTYPE) __msa_ilvl_b((v16i8) in0, (v16i8) in1); \
}
#define ILVRL_B2_UB(...) ILVRL_B2(v16u8, __VA_ARGS__)
#endif /* !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa) */ #endif /* !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa) */
#endif // INCLUDE_LIBYUV_MACROS_MSA_H_ #endif // INCLUDE_LIBYUV_MACROS_MSA_H_
...@@ -370,6 +370,8 @@ extern "C" { ...@@ -370,6 +370,8 @@ extern "C" {
#if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa) #if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa)
#define HAS_MIRRORROW_MSA #define HAS_MIRRORROW_MSA
#define HAS_ARGBMIRRORROW_MSA #define HAS_ARGBMIRRORROW_MSA
#define HAS_I422TOYUY2ROW_MSA
#define HAS_I422TOUYVYROW_MSA
#endif #endif
#if defined(_MSC_VER) && !defined(__CLR_VER) && !defined(__clang__) #if defined(_MSC_VER) && !defined(__CLR_VER) && !defined(__clang__)
...@@ -1769,6 +1771,22 @@ void I422ToUYVYRow_Any_NEON(const uint8* src_y, ...@@ -1769,6 +1771,22 @@ void I422ToUYVYRow_Any_NEON(const uint8* src_y,
const uint8* src_u, const uint8* src_u,
const uint8* src_v, const uint8* src_v,
uint8* dst_uyvy, int width); uint8* dst_uyvy, int width);
// MSA row functions that combine separate Y, U and V source rows into a
// single packed YUY2 or UYVY destination row. |width| is the pixel count.
// The plain _MSA versions step 32 pixels at a time; callers select them
// only when width is a multiple of 32 and use the _Any_MSA versions
// otherwise.
void I422ToYUY2Row_MSA(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
uint8* dst_yuy2, int width);
void I422ToUYVYRow_MSA(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
uint8* dst_uyvy, int width);
// Wrappers used when width is not a multiple of 32.
void I422ToYUY2Row_Any_MSA(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
uint8* dst_yuy2, int width);
void I422ToUYVYRow_Any_MSA(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
uint8* dst_uyvy, int width);
// Effects related row functions. // Effects related row functions.
void ARGBAttenuateRow_C(const uint8* src_argb, uint8* dst_argb, int width); void ARGBAttenuateRow_C(const uint8* src_argb, uint8* dst_argb, int width);
......
...@@ -237,6 +237,14 @@ int I420ToYUY2(const uint8* src_y, int src_stride_y, ...@@ -237,6 +237,14 @@ int I420ToYUY2(const uint8* src_y, int src_stride_y,
} }
} }
#endif #endif
#if defined(HAS_I422TOYUY2ROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
I422ToYUY2Row = I422ToYUY2Row_Any_MSA;
if (IS_ALIGNED(width, 32)) {
I422ToYUY2Row = I422ToYUY2Row_MSA;
}
}
#endif
for (y = 0; y < height - 1; y += 2) { for (y = 0; y < height - 1; y += 2) {
I422ToYUY2Row(src_y, src_u, src_v, dst_yuy2, width); I422ToYUY2Row(src_y, src_u, src_v, dst_yuy2, width);
...@@ -298,6 +306,14 @@ int I422ToUYVY(const uint8* src_y, int src_stride_y, ...@@ -298,6 +306,14 @@ int I422ToUYVY(const uint8* src_y, int src_stride_y,
} }
} }
#endif #endif
#if defined(HAS_I422TOUYVYROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
I422ToUYVYRow = I422ToUYVYRow_Any_MSA;
if (IS_ALIGNED(width, 32)) {
I422ToUYVYRow = I422ToUYVYRow_MSA;
}
}
#endif
for (y = 0; y < height; ++y) { for (y = 0; y < height; ++y) {
I422ToUYVYRow(src_y, src_u, src_v, dst_uyvy, width); I422ToUYVYRow(src_y, src_u, src_v, dst_uyvy, width);
...@@ -345,6 +361,14 @@ int I420ToUYVY(const uint8* src_y, int src_stride_y, ...@@ -345,6 +361,14 @@ int I420ToUYVY(const uint8* src_y, int src_stride_y,
} }
} }
#endif #endif
#if defined(HAS_I422TOUYVYROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
I422ToUYVYRow = I422ToUYVYRow_Any_MSA;
if (IS_ALIGNED(width, 32)) {
I422ToUYVYRow = I422ToUYVYRow_MSA;
}
}
#endif
for (y = 0; y < height - 1; y += 2) { for (y = 0; y < height - 1; y += 2) {
I422ToUYVYRow(src_y, src_u, src_v, dst_uyvy, width); I422ToUYVYRow(src_y, src_u, src_v, dst_uyvy, width);
......
...@@ -553,6 +553,14 @@ int ARGBToYUY2(const uint8* src_argb, int src_stride_argb, ...@@ -553,6 +553,14 @@ int ARGBToYUY2(const uint8* src_argb, int src_stride_argb,
} }
} }
#endif #endif
#if defined(HAS_I422TOYUY2ROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
I422ToYUY2Row = I422ToYUY2Row_Any_MSA;
if (IS_ALIGNED(width, 32)) {
I422ToYUY2Row = I422ToYUY2Row_MSA;
}
}
#endif
{ {
// Allocate a rows of yuv. // Allocate a rows of yuv.
...@@ -655,6 +663,14 @@ int ARGBToUYVY(const uint8* src_argb, int src_stride_argb, ...@@ -655,6 +663,14 @@ int ARGBToUYVY(const uint8* src_argb, int src_stride_argb,
} }
} }
#endif #endif
#if defined(HAS_I422TOUYVYROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
I422ToUYVYRow = I422ToUYVYRow_Any_MSA;
if (IS_ALIGNED(width, 32)) {
I422ToUYVYRow = I422ToUYVYRow_MSA;
}
}
#endif
{ {
// Allocate a rows of yuv. // Allocate a rows of yuv.
......
...@@ -80,9 +80,15 @@ ANY31(I422ToUYVYRow_Any_SSE2, I422ToUYVYRow_SSE2, 1, 1, 4, 15) ...@@ -80,9 +80,15 @@ ANY31(I422ToUYVYRow_Any_SSE2, I422ToUYVYRow_SSE2, 1, 1, 4, 15)
#ifdef HAS_I422TOYUY2ROW_NEON #ifdef HAS_I422TOYUY2ROW_NEON
ANY31(I422ToYUY2Row_Any_NEON, I422ToYUY2Row_NEON, 1, 1, 4, 15) ANY31(I422ToYUY2Row_Any_NEON, I422ToYUY2Row_NEON, 1, 1, 4, 15)
#endif #endif
#ifdef HAS_I422TOYUY2ROW_MSA
// Instantiates I422ToYUY2Row_Any_MSA on top of I422ToYUY2Row_MSA.
// NOTE(review): the trailing 31 is presumably the width mask matching the
// kernel's 32-pixel step (the NEON/SSE2 entries use 15) - confirm against
// the ANY31 definition.
ANY31(I422ToYUY2Row_Any_MSA, I422ToYUY2Row_MSA, 1, 1, 4, 31)
#endif
#ifdef HAS_I422TOUYVYROW_NEON #ifdef HAS_I422TOUYVYROW_NEON
ANY31(I422ToUYVYRow_Any_NEON, I422ToUYVYRow_NEON, 1, 1, 4, 15) ANY31(I422ToUYVYRow_Any_NEON, I422ToUYVYRow_NEON, 1, 1, 4, 15)
#endif #endif
#ifdef HAS_I422TOUYVYROW_MSA
// Instantiates I422ToUYVYRow_Any_MSA on top of I422ToUYVYRow_MSA.
// NOTE(review): the trailing 31 is presumably the width mask matching the
// kernel's 32-pixel step (the NEON entry uses 15) - confirm against the
// ANY31 definition.
ANY31(I422ToUYVYRow_Any_MSA, I422ToUYVYRow_MSA, 1, 1, 4, 31)
#endif
#ifdef HAS_BLENDPLANEROW_AVX2 #ifdef HAS_BLENDPLANEROW_AVX2
ANY31(BlendPlaneRow_Any_AVX2, BlendPlaneRow_AVX2, 0, 0, 1, 31) ANY31(BlendPlaneRow_Any_AVX2, BlendPlaneRow_AVX2, 0, 0, 1, 31)
#endif #endif
......
...@@ -53,6 +53,54 @@ void ARGBMirrorRow_MSA(const uint8* src, uint8* dst, int width) { ...@@ -53,6 +53,54 @@ void ARGBMirrorRow_MSA(const uint8* src, uint8* dst, int width) {
} }
} }
// Packs one row of separate Y, U and V bytes into a YUY2 row using MSA.
//
// Each pass consumes 32 bytes of Y and 16 bytes each of U and V, and emits
// 64 bytes of output. A pass always handles a full 32 pixels, so a width
// that is not a multiple of 32 still runs a complete final pass; the callers
// in this change only pick this function when IS_ALIGNED(width, 32) holds.
//
//   src_y    - luma row
//   src_u    - U row (one byte per two pixels)
//   src_v    - V row (one byte per two pixels)
//   dst_yuy2 - packed output row
//   width    - number of pixels to convert
void I422ToYUY2Row_MSA(const uint8* src_y,
                       const uint8* src_u,
                       const uint8* src_v,
                       uint8* dst_yuy2,
                       int width) {
  int remaining;
  v16u8 y_lo, y_hi, u_vec, v_vec, uv_lo, uv_hi;
  v16u8 out0, out1, out2, out3;

  for (remaining = width; remaining > 0; remaining -= 32) {
    // Fetch 32 luma bytes and 16 bytes of each chroma plane.
    LD_UB2(src_y, 16, y_lo, y_hi);
    u_vec = LD_UB(src_u);
    v_vec = LD_UB(src_v);

    // Merge U and V into two vectors of chroma pairs.
    ILVRL_B2_UB(v_vec, u_vec, uv_lo, uv_hi);

    // Merge the chroma pairs with luma to form the packed output.
    ILVRL_B2_UB(uv_lo, y_lo, out0, out1);
    ILVRL_B2_UB(uv_hi, y_hi, out2, out3);
    ST_UB4(out0, out1, out2, out3, dst_yuy2, 16);

    src_y += 32;
    src_u += 16;
    src_v += 16;
    dst_yuy2 += 64;
  }
}
// Packs one row of separate Y, U and V bytes into a UYVY row using MSA.
//
// Each pass consumes 32 bytes of Y and 16 bytes each of U and V, and emits
// 64 bytes of output. A pass always handles a full 32 pixels, so a width
// that is not a multiple of 32 still runs a complete final pass; the callers
// in this change only pick this function when IS_ALIGNED(width, 32) holds.
//
//   src_y    - luma row
//   src_u    - U row (one byte per two pixels)
//   src_v    - V row (one byte per two pixels)
//   dst_uyvy - packed output row
//   width    - number of pixels to convert
void I422ToUYVYRow_MSA(const uint8* src_y,
                       const uint8* src_u,
                       const uint8* src_v,
                       uint8* dst_uyvy,
                       int width) {
  int remaining;
  v16u8 y_lo, y_hi, u_vec, v_vec, uv_lo, uv_hi;
  v16u8 out0, out1, out2, out3;

  for (remaining = width; remaining > 0; remaining -= 32) {
    // Fetch 32 luma bytes and 16 bytes of each chroma plane.
    LD_UB2(src_y, 16, y_lo, y_hi);
    u_vec = LD_UB(src_u);
    v_vec = LD_UB(src_v);

    // Merge U and V into two vectors of chroma pairs.
    ILVRL_B2_UB(v_vec, u_vec, uv_lo, uv_hi);

    // Same merge as the YUY2 kernel but with the operands swapped, which
    // exchanges the positions of luma and chroma in the output.
    ILVRL_B2_UB(y_lo, uv_lo, out0, out1);
    ILVRL_B2_UB(y_hi, uv_hi, out2, out3);
    ST_UB4(out0, out1, out2, out3, dst_uyvy, 16);

    src_y += 32;
    src_u += 16;
    src_v += 16;
    dst_uyvy += 64;
  }
}
#ifdef __cplusplus #ifdef __cplusplus
} // extern "C" } // extern "C"
} // namespace libyuv } // namespace libyuv
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment