Commit f5d5bd88 authored by Frank Barchard

Add MSA optimized I422ToARGBRow_MSA and I422ToRGBARow_MSA functions

R=fbarchard@google.com
BUG=libyuv:634

Performance Gains :- (vs C vectorized)

I422ToARGBRow_MSA     : ~1.6x
I422ToRGBARow_MSA     : ~1.6x

I422ToARGBRow_Any_MSA : ~1.58x
I422ToRGBARow_Any_MSA : ~1.6x

Performance Gains :- (vs C non-vectorized)

I422ToARGBRow_MSA     : ~7x
I422ToRGBARow_MSA     : ~7x

I422ToARGBRow_Any_MSA : ~6.9x
I422ToRGBARow_Any_MSA : ~6.8x

Performance was measured with standalone tests that pass row data from a filled 1920x1080 buffer to both the C and MSA functions. Each test runs N iterations to obtain more accurate C-versus-MSA timings.

Review URL: https://codereview.chromium.org/2430313005 .
parent 451af5e9
......@@ -15,6 +15,98 @@
#include <stdint.h>
#include <msa.h>
// LW(psrc) / LD(psrc): GNU statement-expression macros that read a 32-bit
// (LW) or 64-bit (LD) value from the address psrc and evaluate to that value.
// Four variants are selected at compile time:
//   - ISA revision >= 6 uses the plain lw / ld instructions,
//   - earlier revisions use the ulw / uld (unaligned load) mnemonics,
//   - 64-bit targets load a doubleword with one instruction,
//   - 32-bit targets build the doubleword from two LW() reads.
// NOTE(review): the "m" input operand is *psrc typed as uint8, i.e. the
// constraint names a single byte although 4 or 8 bytes are read - confirm
// this is sufficient for the compilers in use.
// NOTE(review): presumably revision 6 can use lw / ld here because misaligned
// accesses are handled without the ulw / uld forms - confirm.
#if (__mips_isa_rev >= 6)
// Revision 6+: 32-bit load via lw.
#define LW(psrc) ( { \
uint8 *psrc_lw_m = (uint8 *) (psrc); \
uint32 val_m; \
\
asm volatile ( \
"lw %[val_m], %[psrc_lw_m] \n\t" \
\
: [val_m] "=r" (val_m) \
: [psrc_lw_m] "m" (*psrc_lw_m) \
); \
\
val_m; \
} )
#if (__mips == 64)
// Revision 6+, 64-bit target: 64-bit load via a single ld.
#define LD(psrc) ( { \
uint8 *psrc_ld_m = (uint8 *) (psrc); \
uint64 val_m = 0; \
\
asm volatile ( \
"ld %[val_m], %[psrc_ld_m] \n\t" \
\
: [val_m] "=r" (val_m) \
: [psrc_ld_m] "m" (*psrc_ld_m) \
); \
\
val_m; \
} )
#else // !(__mips == 64)
// Revision 6+, 32-bit target: 64-bit load assembled from two LW() reads.
// The word at psrc becomes the low 32 bits and the word at psrc + 4 becomes
// the high 32 bits of the result.
#define LD(psrc) ( { \
uint8 *psrc_ld_m = (uint8 *) (psrc); \
uint32 val0_m, val1_m; \
uint64 val_m = 0; \
\
val0_m = LW(psrc_ld_m); \
val1_m = LW(psrc_ld_m + 4); \
\
val_m = (uint64) (val1_m); \
val_m = (uint64) ((val_m << 32) & 0xFFFFFFFF00000000); \
val_m = (uint64) (val_m | (uint64) val0_m); \
\
val_m; \
} )
#endif // (__mips == 64)
#else // !(__mips_isa_rev >= 6)
// Pre-revision 6: 32-bit load via ulw.
#define LW(psrc) ( { \
uint8 *psrc_lw_m = (uint8 *) (psrc); \
uint32 val_m; \
\
asm volatile ( \
"ulw %[val_m], %[psrc_lw_m] \n\t" \
\
: [val_m] "=r" (val_m) \
: [psrc_lw_m] "m" (*psrc_lw_m) \
); \
\
val_m; \
} )
#if (__mips == 64)
// Pre-revision 6, 64-bit target: 64-bit load via a single uld.
#define LD(psrc) ( { \
uint8 *psrc_ld_m = (uint8 *) (psrc); \
uint64 val_m = 0; \
\
asm volatile ( \
"uld %[val_m], %[psrc_ld_m] \n\t" \
\
: [val_m] "=r" (val_m) \
: [psrc_ld_m] "m" (*psrc_ld_m) \
); \
\
val_m; \
} )
#else // !(__mips == 64)
// Pre-revision 6, 32-bit target: 64-bit load assembled from two LW() reads.
// The word at psrc becomes the low 32 bits and the word at psrc + 4 becomes
// the high 32 bits of the result.
#define LD(psrc) ( { \
uint8 *psrc_ld_m = (uint8 *) (psrc); \
uint32 val0_m, val1_m; \
uint64 val_m = 0; \
\
val0_m = LW(psrc_ld_m); \
val1_m = LW(psrc_ld_m + 4); \
\
val_m = (uint64) (val1_m); \
val_m = (uint64) ((val_m << 32) & 0xFFFFFFFF00000000); \
val_m = (uint64) (val_m | (uint64) val0_m); \
\
val_m; \
} )
#endif // (__mips == 64)
#endif // (__mips_isa_rev >= 6)
// LD_B(RTYPE, psrc): evaluates to the RTYPE object stored at address psrc by
// dereferencing psrc cast to RTYPE*. The whole expansion is parenthesized so
// that postfix operators applied to the result (subscript, member access)
// bind to the loaded value rather than to the pointer cast.
#define LD_B(RTYPE, psrc) (*((RTYPE*)(psrc))) /* NOLINT */
// LD_UB(psrc): LD_B specialised for the v16u8 vector type.
#define LD_UB(...) LD_B(v16u8, __VA_ARGS__)
......
......@@ -382,7 +382,8 @@ extern "C" {
#define HAS_ARGB4444TOARGBROW_MSA
#define HAS_ARGBTOYROW_MSA
#define HAS_ARGBTOUVROW_MSA
#define HAS_I422TOARGBROW_MSA
#define HAS_I422TORGBAROW_MSA
#endif
#if defined(_MSC_VER) && !defined(__CLR_VER) && !defined(__clang__)
......@@ -651,6 +652,18 @@ void UYVYToARGBRow_NEON(const uint8* src_uyvy,
uint8* dst_argb,
const struct YuvConstants* yuvconstants,
int width);
// MSA row converters: turn one row of I422 input (src_y, src_u, src_v) into
// 4-byte-per-pixel output using the coefficients in yuvconstants.
// The implementations process 8 pixels per loop iteration, and callers select
// these only when width is a multiple of 8; the _Any_MSA variants are used
// for other widths.
void I422ToARGBRow_MSA(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
uint8* dst_argb,
const struct YuvConstants* yuvconstants,
int width);
void I422ToRGBARow_MSA(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
uint8* dst_rgba,
const struct YuvConstants* yuvconstants,
int width);
void ARGBToYRow_AVX2(const uint8* src_argb, uint8* dst_y, int width);
void ARGBToYRow_Any_AVX2(const uint8* src_argb, uint8* dst_y, int width);
......@@ -1629,6 +1642,18 @@ void I422ToARGBRow_DSPR2(const uint8* src_y,
uint8* dst_argb,
const struct YuvConstants* yuvconstants,
int width);
// Any-width counterparts of I422ToARGBRow_MSA / I422ToRGBARow_MSA. Callers
// install these first and switch to the plain _MSA function only when width
// is a multiple of 8. The definitions are produced by ANY31C instantiations.
void I422ToARGBRow_Any_MSA(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
uint8* dst_argb,
const struct YuvConstants* yuvconstants,
int width);
// NOTE(review): the output parameter is named dst_argb here although the
// matching I422ToRGBARow_MSA prototype calls it dst_rgba - presumably a
// copy/paste leftover; the name has no effect on callers.
void I422ToRGBARow_Any_MSA(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
uint8* dst_argb,
const struct YuvConstants* yuvconstants,
int width);
void YUY2ToYRow_AVX2(const uint8* src_yuy2, uint8* dst_y, int width);
void YUY2ToUVRow_AVX2(const uint8* src_yuy2, int stride_yuy2,
......
......@@ -102,6 +102,14 @@ static int I420ToARGBMatrix(const uint8* src_y, int src_stride_y,
I422ToARGBRow = I422ToARGBRow_DSPR2;
}
#endif
#if defined(HAS_I422TOARGBROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
I422ToARGBRow = I422ToARGBRow_Any_MSA;
if (IS_ALIGNED(width, 8)) {
I422ToARGBRow = I422ToARGBRow_MSA;
}
}
#endif
for (y = 0; y < height; ++y) {
I422ToARGBRow(src_y, src_u, src_v, dst_argb, yuvconstants, width);
......@@ -272,6 +280,14 @@ static int I422ToARGBMatrix(const uint8* src_y, int src_stride_y,
I422ToARGBRow = I422ToARGBRow_DSPR2;
}
#endif
#if defined(HAS_I422TOARGBROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
I422ToARGBRow = I422ToARGBRow_Any_MSA;
if (IS_ALIGNED(width, 8)) {
I422ToARGBRow = I422ToARGBRow_MSA;
}
}
#endif
for (y = 0; y < height; ++y) {
I422ToARGBRow(src_y, src_u, src_v, dst_argb, yuvconstants, width);
......
......@@ -459,6 +459,14 @@ static int I420ToRGBAMatrix(const uint8* src_y, int src_stride_y,
I422ToRGBARow = I422ToRGBARow_DSPR2;
}
#endif
#if defined(HAS_I422TORGBAROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
I422ToRGBARow = I422ToRGBARow_Any_MSA;
if (IS_ALIGNED(width, 8)) {
I422ToRGBARow = I422ToRGBARow_MSA;
}
}
#endif
for (y = 0; y < height; ++y) {
I422ToRGBARow(src_y, src_u, src_v, dst_rgba, yuvconstants, width);
......@@ -848,6 +856,14 @@ int I420ToRGB565Dither(const uint8* src_y, int src_stride_y,
I422ToARGBRow = I422ToARGBRow_DSPR2;
}
#endif
#if defined(HAS_I422TOARGBROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
I422ToARGBRow = I422ToARGBRow_Any_MSA;
if (IS_ALIGNED(width, 8)) {
I422ToARGBRow = I422ToARGBRow_MSA;
}
}
#endif
#if defined(HAS_ARGBTORGB565DITHERROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) {
ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_SSE2;
......
......@@ -1243,6 +1243,14 @@ static int I422ToRGBAMatrix(const uint8* src_y, int src_stride_y,
I422ToRGBARow = I422ToRGBARow_DSPR2;
}
#endif
#if defined(HAS_I422TORGBAROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
I422ToRGBARow = I422ToRGBARow_Any_MSA;
if (IS_ALIGNED(width, 8)) {
I422ToRGBARow = I422ToRGBARow_MSA;
}
}
#endif
for (y = 0; y < height; ++y) {
I422ToRGBARow(src_y, src_u, src_v, dst_rgba, yuvconstants, width);
......
......@@ -165,6 +165,10 @@ ANY31C(I422ToARGB4444Row_Any_NEON, I422ToARGB4444Row_NEON, 1, 0, 2, 7)
ANY31C(I422ToARGB1555Row_Any_NEON, I422ToARGB1555Row_NEON, 1, 0, 2, 7)
ANY31C(I422ToRGB565Row_Any_NEON, I422ToRGB565Row_NEON, 1, 0, 2, 7)
#endif
// Any-width wrappers around the MSA I422 row converters. Each wrapper is
// guarded by the same HAS_* macro that its callers test, so enabling or
// disabling one conversion cannot leave the other without a definition.
// NOTE(review): the trailing arguments (1, 0, 4, 7) are presumably the chroma
// shift values, 4 bytes per output pixel and a width mask of 7 (multiples of
// 8 pixels) - confirm against the ANY31C definition.
#ifdef HAS_I422TOARGBROW_MSA
ANY31C(I422ToARGBRow_Any_MSA, I422ToARGBRow_MSA, 1, 0, 4, 7)
#endif
#ifdef HAS_I422TORGBAROW_MSA
ANY31C(I422ToRGBARow_Any_MSA, I422ToRGBARow_MSA, 1, 0, 4, 7)
#endif
#undef ANY31C
// Any 2 planes to 1.
......
......@@ -19,6 +19,66 @@ namespace libyuv {
extern "C" {
#endif
// I422TORGB: converts 8 pixels of I422 to separate B, G and R channels.
//   in0           - v16u8 whose low 8 bytes are the Y samples (read only).
//   in1, in2      - v16u8 lvalues whose low 4 bytes are the U / V samples;
//                   both are overwritten with their byte-doubled form.
//   ub, vr, ug, vg - v4i32 chroma coefficients (U->B, V->R, U->G, V->G).
//   bb, bg, br    - v4i32 biases added to the B, G and R accumulators.
//   yg            - v4i32 luma coefficient.
//   out0/1/2      - v8i16 lvalues receiving B, G and R, one value per 16-bit
//                   lane, each clamped to [0, 255].
// Per pixel the macro computes, in 32-bit lanes:
//   y1 = ((y * 0x0101) * yg) >> 16
//   B  = clamp((y1 + bb - u * ub) >> 6)
//   G  = clamp((y1 + bg - u * ug - v * vg) >> 6)
//   R  = clamp((y1 + br - v * vr) >> 6)
// The luma coefficient is taken from the yg parameter; the macro does not
// depend on any variable name in the caller's scope.
#define I422TORGB(in0, in1, in2, ub, vr, ug, vg,                          \
                  bb, bg, br, yg, out0, out1, out2) {                     \
  v8i16 vec0_m;                                                           \
  v4i32 reg0_m, reg1_m, reg2_m, reg3_m, reg4_m;                           \
  v4i32 reg5_m, reg6_m, reg7_m, reg8_m, reg9_m;                           \
  v4i32 max_val_m = __msa_ldi_w(255);                                     \
  v8i16 zero_m = { 0 };                                                   \
                                                                          \
  /* Double every U and V byte so 4 chroma samples cover 8 pixels. */     \
  in1 = (v16u8) __msa_ilvr_b((v16i8) in1, (v16i8) in1);                   \
  in2 = (v16u8) __msa_ilvr_b((v16i8) in2, (v16i8) in2);                   \
  /* Pair each Y byte with itself (y * 0x0101), then widen to 32 bits. */ \
  vec0_m = (v8i16) __msa_ilvr_b((v16i8) (in0), (v16i8) (in0));            \
  reg0_m = (v4i32) __msa_ilvr_h(zero_m, vec0_m);                          \
  reg1_m = (v4i32) __msa_ilvl_h(zero_m, vec0_m);                          \
  reg0_m *= (yg);                                                         \
  reg1_m *= (yg);                                                         \
  reg0_m = __msa_srai_w(reg0_m, 16);                                      \
  reg1_m = __msa_srai_w(reg1_m, 16);                                      \
  /* Seed the R, G and B accumulators with scaled luma plus bias. */      \
  reg4_m = reg0_m + (br);                                                 \
  reg5_m = reg1_m + (br);                                                 \
  reg2_m = reg0_m + (bg);                                                 \
  reg3_m = reg1_m + (bg);                                                 \
  reg0_m += (bb);                                                         \
  reg1_m += (bb);                                                         \
  /* Zero-extend the doubled U (reg6/7) and V (reg8/9) to 32 bits. */     \
  vec0_m = (v8i16) __msa_ilvr_b((v16i8) zero_m, (v16i8) in1);             \
  reg6_m = (v4i32) __msa_ilvr_h(zero_m, (v8i16) vec0_m);                  \
  reg7_m = (v4i32) __msa_ilvl_h(zero_m, (v8i16) vec0_m);                  \
  vec0_m = (v8i16) __msa_ilvr_b((v16i8) zero_m, (v16i8) in2);             \
  reg8_m = (v4i32) __msa_ilvr_h(zero_m, (v8i16) vec0_m);                  \
  reg9_m = (v4i32) __msa_ilvl_h(zero_m, (v8i16) vec0_m);                  \
  /* Subtract the chroma contributions. */                                \
  reg0_m -= reg6_m * (ub);                                                \
  reg1_m -= reg7_m * (ub);                                                \
  reg2_m -= reg6_m * (ug);                                                \
  reg3_m -= reg7_m * (ug);                                                \
  reg4_m -= reg8_m * (vr);                                                \
  reg5_m -= reg9_m * (vr);                                                \
  reg2_m -= reg8_m * (vg);                                                \
  reg3_m -= reg9_m * (vg);                                                \
  /* Drop the 6 fractional bits. */                                       \
  reg0_m = __msa_srai_w(reg0_m, 6);                                       \
  reg1_m = __msa_srai_w(reg1_m, 6);                                       \
  reg2_m = __msa_srai_w(reg2_m, 6);                                       \
  reg3_m = __msa_srai_w(reg3_m, 6);                                       \
  reg4_m = __msa_srai_w(reg4_m, 6);                                       \
  reg5_m = __msa_srai_w(reg5_m, 6);                                       \
  /* Clamp to [0, 255]. */                                                \
  reg0_m = __msa_maxi_s_w(reg0_m, 0);                                     \
  reg1_m = __msa_maxi_s_w(reg1_m, 0);                                     \
  reg2_m = __msa_maxi_s_w(reg2_m, 0);                                     \
  reg3_m = __msa_maxi_s_w(reg3_m, 0);                                     \
  reg4_m = __msa_maxi_s_w(reg4_m, 0);                                     \
  reg5_m = __msa_maxi_s_w(reg5_m, 0);                                     \
  reg0_m = __msa_min_s_w(reg0_m, max_val_m);                              \
  reg1_m = __msa_min_s_w(reg1_m, max_val_m);                              \
  reg2_m = __msa_min_s_w(reg2_m, max_val_m);                              \
  reg3_m = __msa_min_s_w(reg3_m, max_val_m);                              \
  reg4_m = __msa_min_s_w(reg4_m, max_val_m);                              \
  reg5_m = __msa_min_s_w(reg5_m, max_val_m);                              \
  /* Narrow each channel back to eight 16-bit lanes. */                   \
  out0 = __msa_pckev_h((v8i16) reg1_m, (v8i16) reg0_m);                   \
  out1 = __msa_pckev_h((v8i16) reg3_m, (v8i16) reg2_m);                   \
  out2 = __msa_pckev_h((v8i16) reg5_m, (v8i16) reg4_m);                   \
}
void MirrorRow_MSA(const uint8* src, uint8* dst, int width) {
int x;
v16u8 src0, src1, src2, src3;
......@@ -101,6 +161,90 @@ void I422ToUYVYRow_MSA(const uint8* src_y,
}
}
void I422ToARGBRow_MSA(const uint8* src_y, const uint8* src_u,
const uint8* src_v, uint8* rgb_buf,
const struct YuvConstants* yuvconstants, int width) {
int x;
int32 data_u, data_v;
int64 data_y;
v16u8 src0, src1, src2, dst0, dst1;
v8i16 vec0, vec1, vec2;
v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg;
v16u8 const_255 = (v16u8) __msa_ldi_b(255);
v4i32 zero = { 0 };
vec_ub = __msa_fill_w(yuvconstants->kUVToB[0]);
vec_vr = __msa_fill_w(yuvconstants->kUVToR[1]);
vec_ug = __msa_fill_w(yuvconstants->kUVToG[0]);
vec_vg = __msa_fill_w(yuvconstants->kUVToG[1]);
vec_bb = __msa_fill_w(yuvconstants->kUVBiasB[0]);
vec_bg = __msa_fill_w(yuvconstants->kUVBiasG[0]);
vec_br = __msa_fill_w(yuvconstants->kUVBiasR[0]);
vec_yg = __msa_fill_w(yuvconstants->kYToRgb[0]);
for (x = 0; x < width; x += 8) {
data_y = LD(src_y);
data_u = LW(src_u);
data_v = LW(src_v);
src0 = (v16u8) __msa_insert_d((v2i64) zero, 0, data_y);
src1 = (v16u8) __msa_insert_w(zero, 0, data_u);
src2 = (v16u8) __msa_insert_w(zero, 0, data_v);
I422TORGB(src0, src1, src2, vec_ub, vec_vr, vec_ug, vec_vg,
vec_bb, vec_bg, vec_br, vec_yg, vec0, vec1, vec2);
vec0 = (v8i16) __msa_ilvev_b((v16i8) vec1, (v16i8) vec0);
vec1 = (v8i16) __msa_ilvev_b((v16i8) const_255, (v16i8) vec2);
dst0 = (v16u8) __msa_ilvr_h((v8i16) vec1, (v8i16) vec0);
dst1 = (v16u8) __msa_ilvl_h((v8i16) vec1, (v8i16) vec0);
ST_UB2(dst0, dst1, rgb_buf, 16);
src_y += 8;
src_u += 4;
src_v += 4;
rgb_buf += 32;
}
}
void I422ToRGBARow_MSA(const uint8* src_y, const uint8* src_u,
const uint8* src_v, uint8* rgb_buf,
const struct YuvConstants* yuvconstants, int width) {
int x;
int64 data_y;
int32 data_u, data_v;
v16u8 src0, src1, src2, dst0, dst1;
v8i16 vec0, vec1, vec2;
v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg;
v16u8 const_255 = (v16u8) __msa_ldi_b(255);
v4i32 zero = { 0 };
vec_ub = __msa_fill_w(yuvconstants->kUVToB[0]);
vec_vr = __msa_fill_w(yuvconstants->kUVToR[1]);
vec_ug = __msa_fill_w(yuvconstants->kUVToG[0]);
vec_vg = __msa_fill_w(yuvconstants->kUVToG[1]);
vec_bb = __msa_fill_w(yuvconstants->kUVBiasB[0]);
vec_bg = __msa_fill_w(yuvconstants->kUVBiasG[0]);
vec_br = __msa_fill_w(yuvconstants->kUVBiasR[0]);
vec_yg = __msa_fill_w(yuvconstants->kYToRgb[0]);
for (x = 0; x < width; x += 8) {
data_y = LD(src_y);
data_u = LW(src_u);
data_v = LW(src_v);
src0 = (v16u8) __msa_insert_d((v2i64) zero, 0, data_y);
src1 = (v16u8) __msa_insert_w(zero, 0, data_u);
src2 = (v16u8) __msa_insert_w(zero, 0, data_v);
I422TORGB(src0, src1, src2, vec_ub, vec_vr, vec_ug, vec_vg,
vec_bb, vec_bg, vec_br, vec_yg, vec0, vec1, vec2);
vec0 = (v8i16) __msa_ilvev_b((v16i8) vec0, (v16i8) const_255);
vec1 = (v8i16) __msa_ilvev_b((v16i8) vec2, (v16i8) vec1);
dst0 = (v16u8) __msa_ilvr_h(vec1, vec0);
dst1 = (v16u8) __msa_ilvl_h(vec1, vec0);
ST_UB2(dst0, dst1, rgb_buf, 16);
src_y += 8;
src_u += 4;
src_v += 4;
rgb_buf += 32;
}
}
void YUY2ToYRow_MSA(const uint8* src_yuy2, uint8* dst_y, int width) {
int x;
v16u8 src0, src1, src2, src3, dst0, dst1;
......
......@@ -474,6 +474,14 @@ static void ScaleYUVToARGBBilinearUp(int src_width, int src_height,
I422ToARGBRow = I422ToARGBRow_DSPR2;
}
#endif
#if defined(HAS_I422TOARGBROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
I422ToARGBRow = I422ToARGBRow_Any_MSA;
if (IS_ALIGNED(src_width, 8)) {
I422ToARGBRow = I422ToARGBRow_MSA;
}
}
#endif
void (*InterpolateRow)(uint8* dst_argb, const uint8* src_argb,
ptrdiff_t src_stride, int dst_width, int source_y_fraction) =
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment