Commit eed66b20 authored by Manojkumar Bhosale, committed by Commit Bot

Add MSA optimized I444/I400/J400/YUY2/UYVY to ARGB row functions

BUG=libyuv:634

Change-Id: Ida80027c36a938a3bcf6f4480626f8eb9495e1be

Performance Gain (vs C auto-vectorized)
I444ToARGBRow_MSA       - ~1.6x
I444ToARGBRow_Any_MSA   - ~1.6x
I400ToARGBRow_MSA       - ~5.5x
I400ToARGBRow_Any_MSA   - ~5.3x
J400ToARGBRow_MSA       - ~1.0x
J400ToARGBRow_Any_MSA   - ~1.0x
YUY2ToARGBRow_MSA       - ~1.6x
YUY2ToARGBRow_Any_MSA   - ~1.6x
UYVYToARGBRow_MSA       - ~1.6x
UYVYToARGBRow_Any_MSA   - ~1.6x

Performance Gain (vs C non-vectorized)
I444ToARGBRow_MSA       - ~7.3x
I444ToARGBRow_Any_MSA   - ~7.1x
I400ToARGBRow_MSA       - ~5.5x
I400ToARGBRow_Any_MSA   - ~5.2x
J400ToARGBRow_MSA       - ~6.8x
J400ToARGBRow_Any_MSA   - ~5.7x
YUY2ToARGBRow_MSA       - ~7.2x
YUY2ToARGBRow_Any_MSA   - ~7.0x
UYVYToARGBRow_MSA       - ~7.1x
UYVYToARGBRow_Any_MSA   - ~6.9x

Change-Id: Ida80027c36a938a3bcf6f4480626f8eb9495e1be
Reviewed-on: https://chromium-review.googlesource.com/439246
Reviewed-by: Frank Barchard <fbarchard@google.com>
Commit-Queue: Frank Barchard <fbarchard@google.com>
parent bbe8c233
...@@ -442,6 +442,11 @@ extern "C" { ...@@ -442,6 +442,11 @@ extern "C" {
#define HAS_BGRATOUVROW_MSA #define HAS_BGRATOUVROW_MSA
#define HAS_ABGRTOUVROW_MSA #define HAS_ABGRTOUVROW_MSA
#define HAS_RGBATOUVROW_MSA #define HAS_RGBATOUVROW_MSA
// MSA row functions converting I444, I400, J400, YUY2 and UYVY to ARGB.
// Each macro gates the matching dispatch code in the conversion functions.
#define HAS_I444TOARGBROW_MSA
#define HAS_I400TOARGBROW_MSA
#define HAS_J400TOARGBROW_MSA
#define HAS_YUY2TOARGBROW_MSA
#define HAS_UYVYTOARGBROW_MSA
#endif #endif
#if defined(_MSC_VER) && !defined(__CLR_VER) && !defined(__clang__) #if defined(_MSC_VER) && !defined(__CLR_VER) && !defined(__clang__)
...@@ -754,6 +759,12 @@ void UYVYToARGBRow_NEON(const uint8* src_uyvy, ...@@ -754,6 +759,12 @@ void UYVYToARGBRow_NEON(const uint8* src_uyvy,
uint8* dst_argb, uint8* dst_argb,
const struct YuvConstants* yuvconstants, const struct YuvConstants* yuvconstants,
int width); int width);
// MSA version: converts one row of I444 (separate src_y, src_u and src_v
// inputs, each advanced 8 bytes per 8 pixels) to ARGB. The implementation
// steps 8 pixels at a time; the dispatcher selects it only when width is a
// multiple of 8.
void I444ToARGBRow_MSA(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
uint8* dst_argb,
const struct YuvConstants* yuvconstants,
int width);
void I444ToARGBRow_DSPR2(const uint8* src_y, void I444ToARGBRow_DSPR2(const uint8* src_y,
const uint8* src_u, const uint8* src_u,
const uint8* src_v, const uint8* src_v,
...@@ -836,6 +847,14 @@ void NV21ToARGBRow_MSA(const uint8* src_y, ...@@ -836,6 +847,14 @@ void NV21ToARGBRow_MSA(const uint8* src_y,
uint8* dst_argb, uint8* dst_argb,
const struct YuvConstants* yuvconstants, const struct YuvConstants* yuvconstants,
int width); int width);
// MSA version: converts one row of packed YUY2 to ARGB, 8 pixels (16 source
// bytes) per iteration.
void YUY2ToARGBRow_MSA(const uint8* src_yuy2,
uint8* dst_argb,
const struct YuvConstants* yuvconstants,
int width);
// MSA version: converts one row of packed UYVY to ARGB, 8 pixels (16 source
// bytes) per iteration.
void UYVYToARGBRow_MSA(const uint8* src_uyvy,
uint8* dst_argb,
const struct YuvConstants* yuvconstants,
int width);
void ARGBToYRow_AVX2(const uint8* src_argb, uint8* dst_y, int width); void ARGBToYRow_AVX2(const uint8* src_argb, uint8* dst_y, int width);
void ARGBToYRow_Any_AVX2(const uint8* src_argb, uint8* dst_y, int width); void ARGBToYRow_Any_AVX2(const uint8* src_argb, uint8* dst_y, int width);
...@@ -1679,10 +1698,12 @@ void ARGBToARGB4444Row_C(const uint8* src_argb, uint8* dst_rgb, int width); ...@@ -1679,10 +1698,12 @@ void ARGBToARGB4444Row_C(const uint8* src_argb, uint8* dst_rgb, int width);
void J400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int width); void J400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int width);
void J400ToARGBRow_AVX2(const uint8* src_y, uint8* dst_argb, int width); void J400ToARGBRow_AVX2(const uint8* src_y, uint8* dst_argb, int width);
void J400ToARGBRow_NEON(const uint8* src_y, uint8* dst_argb, int width); void J400ToARGBRow_NEON(const uint8* src_y, uint8* dst_argb, int width);
// MSA version: expands J400 (Y only) to ARGB, 16 pixels per iteration.
void J400ToARGBRow_MSA(const uint8* src_y, uint8* dst_argb, int width);
void J400ToARGBRow_C(const uint8* src_y, uint8* dst_argb, int width); void J400ToARGBRow_C(const uint8* src_y, uint8* dst_argb, int width);
void J400ToARGBRow_Any_SSE2(const uint8* src_y, uint8* dst_argb, int width); void J400ToARGBRow_Any_SSE2(const uint8* src_y, uint8* dst_argb, int width);
void J400ToARGBRow_Any_AVX2(const uint8* src_y, uint8* dst_argb, int width); void J400ToARGBRow_Any_AVX2(const uint8* src_y, uint8* dst_argb, int width);
void J400ToARGBRow_Any_NEON(const uint8* src_y, uint8* dst_argb, int width); void J400ToARGBRow_Any_NEON(const uint8* src_y, uint8* dst_argb, int width);
// Wrapper around J400ToARGBRow_MSA generated by ANY11; the dispatcher uses it
// when width is not a multiple of 16.
void J400ToARGBRow_Any_MSA(const uint8* src_y, uint8* dst_argb, int width);
void I444ToARGBRow_C(const uint8* src_y, void I444ToARGBRow_C(const uint8* src_y,
const uint8* src_u, const uint8* src_u,
...@@ -2079,9 +2100,11 @@ void I400ToARGBRow_C(const uint8* src_y, uint8* dst_argb, int width); ...@@ -2079,9 +2100,11 @@ void I400ToARGBRow_C(const uint8* src_y, uint8* dst_argb, int width);
void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int width); void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int width);
void I400ToARGBRow_AVX2(const uint8* src_y, uint8* dst_argb, int width); void I400ToARGBRow_AVX2(const uint8* src_y, uint8* dst_argb, int width);
void I400ToARGBRow_NEON(const uint8* src_y, uint8* dst_argb, int width); void I400ToARGBRow_NEON(const uint8* src_y, uint8* dst_argb, int width);
// MSA version: converts I400 (Y only) to ARGB, 16 pixels per iteration.
void I400ToARGBRow_MSA(const uint8* src_y, uint8* dst_argb, int width);
void I400ToARGBRow_Any_SSE2(const uint8* src_y, uint8* dst_argb, int width); void I400ToARGBRow_Any_SSE2(const uint8* src_y, uint8* dst_argb, int width);
void I400ToARGBRow_Any_AVX2(const uint8* src_y, uint8* dst_argb, int width); void I400ToARGBRow_Any_AVX2(const uint8* src_y, uint8* dst_argb, int width);
void I400ToARGBRow_Any_NEON(const uint8* src_y, uint8* dst_argb, int width); void I400ToARGBRow_Any_NEON(const uint8* src_y, uint8* dst_argb, int width);
// Wrapper around I400ToARGBRow_MSA generated by ANY11; the dispatcher uses it
// when width is not a multiple of 16.
void I400ToARGBRow_Any_MSA(const uint8* src_y, uint8* dst_argb, int width);
// ARGB preattenuated alpha blend. // ARGB preattenuated alpha blend.
void ARGBBlendRow_SSSE3(const uint8* src_argb, void ARGBBlendRow_SSSE3(const uint8* src_argb,
...@@ -2413,6 +2436,12 @@ void I422ToARGBRow_DSPR2(const uint8* src_y, ...@@ -2413,6 +2436,12 @@ void I422ToARGBRow_DSPR2(const uint8* src_y,
uint8* dst_argb, uint8* dst_argb,
const struct YuvConstants* yuvconstants, const struct YuvConstants* yuvconstants,
int width); int width);
// Wrapper around I444ToARGBRow_MSA generated by ANY31C; the dispatcher uses it
// when width is not a multiple of 8.
void I444ToARGBRow_Any_MSA(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
uint8* dst_argb,
const struct YuvConstants* yuvconstants,
int width);
void I422ToARGBRow_Any_MSA(const uint8* src_y, void I422ToARGBRow_Any_MSA(const uint8* src_y,
const uint8* src_u, const uint8* src_u,
const uint8* src_v, const uint8* src_v,
...@@ -2471,6 +2500,14 @@ void NV21ToARGBRow_Any_MSA(const uint8* src_y, ...@@ -2471,6 +2500,14 @@ void NV21ToARGBRow_Any_MSA(const uint8* src_y,
uint8* dst_argb, uint8* dst_argb,
const struct YuvConstants* yuvconstants, const struct YuvConstants* yuvconstants,
int width); int width);
// Wrapper around YUY2ToARGBRow_MSA generated by ANY11C; the dispatcher uses it
// when width is not a multiple of 8.
void YUY2ToARGBRow_Any_MSA(const uint8* src_yuy2,
uint8* dst_argb,
const struct YuvConstants* yuvconstants,
int width);
// Wrapper around UYVYToARGBRow_MSA generated by ANY11C; the dispatcher uses it
// when width is not a multiple of 8.
void UYVYToARGBRow_Any_MSA(const uint8* src_uyvy,
uint8* dst_argb,
const struct YuvConstants* yuvconstants,
int width);
void YUY2ToYRow_AVX2(const uint8* src_yuy2, uint8* dst_y, int width); void YUY2ToYRow_AVX2(const uint8* src_yuy2, uint8* dst_y, int width);
void YUY2ToUVRow_AVX2(const uint8* src_yuy2, void YUY2ToUVRow_AVX2(const uint8* src_yuy2,
......
...@@ -493,6 +493,14 @@ static int I444ToARGBMatrix(const uint8* src_y, ...@@ -493,6 +493,14 @@ static int I444ToARGBMatrix(const uint8* src_y,
} }
} }
#endif #endif
#if defined(HAS_I444TOARGBROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
I444ToARGBRow = I444ToARGBRow_Any_MSA;
if (IS_ALIGNED(width, 8)) {
I444ToARGBRow = I444ToARGBRow_MSA;
}
}
#endif
for (y = 0; y < height; ++y) { for (y = 0; y < height; ++y) {
I444ToARGBRow(src_y, src_u, src_v, dst_argb, yuvconstants, width); I444ToARGBRow(src_y, src_u, src_v, dst_argb, yuvconstants, width);
...@@ -773,6 +781,14 @@ int I400ToARGB(const uint8* src_y, ...@@ -773,6 +781,14 @@ int I400ToARGB(const uint8* src_y,
} }
} }
#endif #endif
#if defined(HAS_I400TOARGBROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
I400ToARGBRow = I400ToARGBRow_Any_MSA;
if (IS_ALIGNED(width, 16)) {
I400ToARGBRow = I400ToARGBRow_MSA;
}
}
#endif
for (y = 0; y < height; ++y) { for (y = 0; y < height; ++y) {
I400ToARGBRow(src_y, dst_argb, width); I400ToARGBRow(src_y, dst_argb, width);
...@@ -831,6 +847,14 @@ int J400ToARGB(const uint8* src_y, ...@@ -831,6 +847,14 @@ int J400ToARGB(const uint8* src_y,
J400ToARGBRow = J400ToARGBRow_NEON; J400ToARGBRow = J400ToARGBRow_NEON;
} }
} }
#endif
#if defined(HAS_J400TOARGBROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
J400ToARGBRow = J400ToARGBRow_Any_MSA;
if (IS_ALIGNED(width, 16)) {
J400ToARGBRow = J400ToARGBRow_MSA;
}
}
#endif #endif
for (y = 0; y < height; ++y) { for (y = 0; y < height; ++y) {
J400ToARGBRow(src_y, dst_argb, width); J400ToARGBRow(src_y, dst_argb, width);
...@@ -1540,6 +1564,14 @@ int YUY2ToARGB(const uint8* src_yuy2, ...@@ -1540,6 +1564,14 @@ int YUY2ToARGB(const uint8* src_yuy2,
YUY2ToARGBRow = YUY2ToARGBRow_NEON; YUY2ToARGBRow = YUY2ToARGBRow_NEON;
} }
} }
#endif
#if defined(HAS_YUY2TOARGBROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
YUY2ToARGBRow = YUY2ToARGBRow_Any_MSA;
if (IS_ALIGNED(width, 8)) {
YUY2ToARGBRow = YUY2ToARGBRow_MSA;
}
}
#endif #endif
for (y = 0; y < height; ++y) { for (y = 0; y < height; ++y) {
YUY2ToARGBRow(src_yuy2, dst_argb, &kYuvI601Constants, width); YUY2ToARGBRow(src_yuy2, dst_argb, &kYuvI601Constants, width);
...@@ -1599,6 +1631,14 @@ int UYVYToARGB(const uint8* src_uyvy, ...@@ -1599,6 +1631,14 @@ int UYVYToARGB(const uint8* src_uyvy,
UYVYToARGBRow = UYVYToARGBRow_NEON; UYVYToARGBRow = UYVYToARGBRow_NEON;
} }
} }
#endif
#if defined(HAS_UYVYTOARGBROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
UYVYToARGBRow = UYVYToARGBRow_Any_MSA;
if (IS_ALIGNED(width, 8)) {
UYVYToARGBRow = UYVYToARGBRow_MSA;
}
}
#endif #endif
for (y = 0; y < height; ++y) { for (y = 0; y < height; ++y) {
UYVYToARGBRow(src_uyvy, dst_argb, &kYuvI601Constants, width); UYVYToARGBRow(src_uyvy, dst_argb, &kYuvI601Constants, width);
......
...@@ -174,6 +174,7 @@ ANY31C(I422ToARGB4444Row_Any_DSPR2, I422ToARGB4444Row_DSPR2, 1, 0, 2, 7) ...@@ -174,6 +174,7 @@ ANY31C(I422ToARGB4444Row_Any_DSPR2, I422ToARGB4444Row_DSPR2, 1, 0, 2, 7)
ANY31C(I422ToARGB1555Row_Any_DSPR2, I422ToARGB1555Row_DSPR2, 1, 0, 2, 7) ANY31C(I422ToARGB1555Row_Any_DSPR2, I422ToARGB1555Row_DSPR2, 1, 0, 2, 7)
#endif #endif
#ifdef HAS_I422TOARGBROW_MSA #ifdef HAS_I422TOARGBROW_MSA
// NOTE(review): this wrapper is emitted under the HAS_I422TOARGBROW_MSA guard,
// while the dispatcher tests HAS_I444TOARGBROW_MSA before using it. Confirm
// both macros are always defined together, or give it its own guard.
// NOTE(review): the final argument (7) presumably is the width mask matching
// the 8-pixel step of I444ToARGBRow_MSA - confirm against the ANY31C macro.
ANY31C(I444ToARGBRow_Any_MSA, I444ToARGBRow_MSA, 0, 0, 4, 7)
ANY31C(I422ToARGBRow_Any_MSA, I422ToARGBRow_MSA, 1, 0, 4, 7) ANY31C(I422ToARGBRow_Any_MSA, I422ToARGBRow_MSA, 1, 0, 4, 7)
ANY31C(I422ToRGBARow_Any_MSA, I422ToRGBARow_MSA, 1, 0, 4, 7) ANY31C(I422ToRGBARow_Any_MSA, I422ToRGBARow_MSA, 1, 0, 4, 7)
ANY31C(I422ToRGB24Row_Any_MSA, I422ToRGB24Row_MSA, 1, 0, 3, 15) ANY31C(I422ToRGB24Row_Any_MSA, I422ToRGB24Row_MSA, 1, 0, 3, 15)
...@@ -422,6 +423,8 @@ ANY11(ARGBToRAWRow_Any_MSA, ARGBToRAWRow_MSA, 0, 4, 3, 15) ...@@ -422,6 +423,8 @@ ANY11(ARGBToRAWRow_Any_MSA, ARGBToRAWRow_MSA, 0, 4, 3, 15)
ANY11(ARGBToRGB565Row_Any_MSA, ARGBToRGB565Row_MSA, 0, 4, 2, 7) ANY11(ARGBToRGB565Row_Any_MSA, ARGBToRGB565Row_MSA, 0, 4, 2, 7)
ANY11(ARGBToARGB1555Row_Any_MSA, ARGBToARGB1555Row_MSA, 0, 4, 2, 7) ANY11(ARGBToARGB1555Row_Any_MSA, ARGBToARGB1555Row_MSA, 0, 4, 2, 7)
ANY11(ARGBToARGB4444Row_Any_MSA, ARGBToARGB4444Row_MSA, 0, 4, 2, 7) ANY11(ARGBToARGB4444Row_Any_MSA, ARGBToARGB4444Row_MSA, 0, 4, 2, 7)
// Wrappers for the Y-only to ARGB MSA rows, which step 16 pixels at a time.
// NOTE(review): the final argument (15) presumably is the width mask matching
// that 16-pixel step - confirm against the ANY11 macro definition.
ANY11(J400ToARGBRow_Any_MSA, J400ToARGBRow_MSA, 0, 1, 4, 15)
ANY11(I400ToARGBRow_Any_MSA, I400ToARGBRow_MSA, 0, 1, 4, 15)
#endif #endif
#if defined(HAS_RAWTORGB24ROW_NEON) #if defined(HAS_RAWTORGB24ROW_NEON)
ANY11(RAWToRGB24Row_Any_NEON, RAWToRGB24Row_NEON, 0, 3, 3, 7) ANY11(RAWToRGB24Row_Any_NEON, RAWToRGB24Row_NEON, 0, 3, 3, 7)
...@@ -759,6 +762,10 @@ ANY11C(UYVYToARGBRow_Any_AVX2, UYVYToARGBRow_AVX2, 1, 4, 4, 31) ...@@ -759,6 +762,10 @@ ANY11C(UYVYToARGBRow_Any_AVX2, UYVYToARGBRow_AVX2, 1, 4, 4, 31)
ANY11C(YUY2ToARGBRow_Any_NEON, YUY2ToARGBRow_NEON, 1, 4, 4, 7) ANY11C(YUY2ToARGBRow_Any_NEON, YUY2ToARGBRow_NEON, 1, 4, 4, 7)
ANY11C(UYVYToARGBRow_Any_NEON, UYVYToARGBRow_NEON, 1, 4, 4, 7) ANY11C(UYVYToARGBRow_Any_NEON, UYVYToARGBRow_NEON, 1, 4, 4, 7)
#endif #endif
// Wrappers for the packed YUY2/UYVY to ARGB MSA rows, which step 8 pixels at a
// time. Both are emitted under the YUY2 guard.
// NOTE(review): the final argument (7) presumably is the width mask matching
// that 8-pixel step - confirm against the ANY11C macro definition.
#if defined(HAS_YUY2TOARGBROW_MSA)
ANY11C(YUY2ToARGBRow_Any_MSA, YUY2ToARGBRow_MSA, 1, 4, 4, 7)
ANY11C(UYVYToARGBRow_Any_MSA, UYVYToARGBRow_MSA, 1, 4, 4, 7)
#endif
#undef ANY11C #undef ANY11C
// Any 1 to 1 interpolate. Takes 2 rows of source via stride. // Any 1 to 1 interpolate. Takes 2 rows of source via stride.
......
...@@ -48,13 +48,31 @@ extern "C" { ...@@ -48,13 +48,31 @@ extern "C" {
out_v = (v16u8)__msa_insert_w(zero_m, 0, (int32)v_m); \ out_v = (v16u8)__msa_insert_w(zero_m, 0, (int32)v_m); \
} }
// Clamp every signed 32-bit lane of the six vectors to the range 0..255.
// Each argument is read and overwritten in place, so it must be an lvalue.
#define CLIP_0TO255(in0, in1, in2, in3, in4, in5)             \
  {                                                           \
    v4i32 upper_m = __msa_ldi_w(255);                         \
                                                              \
    in0 = __msa_min_s_w(upper_m, __msa_maxi_s_w(in0, 0));     \
    in1 = __msa_min_s_w(upper_m, __msa_maxi_s_w(in1, 0));     \
    in2 = __msa_min_s_w(upper_m, __msa_maxi_s_w(in2, 0));     \
    in3 = __msa_min_s_w(upper_m, __msa_maxi_s_w(in3, 0));     \
    in4 = __msa_min_s_w(upper_m, __msa_maxi_s_w(in4, 0));     \
    in5 = __msa_min_s_w(upper_m, __msa_maxi_s_w(in5, 0));     \
  }
// Convert 8 pixels of YUV 420 to RGB. // Convert 8 pixels of YUV 420 to RGB.
#define YUVTORGB(in_y, in_uv, ubvr, ugvg, bb, bg, br, yg, out_b, out_g, out_r) \ #define YUVTORGB(in_y, in_uv, ubvr, ugvg, bb, bg, br, yg, out_b, out_g, out_r) \
{ \ { \
v8i16 vec0_m, vec1_m; \ v8i16 vec0_m, vec1_m; \
v4i32 reg0_m, reg1_m, reg2_m, reg3_m, reg4_m; \ v4i32 reg0_m, reg1_m, reg2_m, reg3_m, reg4_m; \
v4i32 reg5_m, reg6_m, reg7_m; \ v4i32 reg5_m, reg6_m, reg7_m; \
v4i32 max_m = __msa_ldi_w(255); \
v16i8 zero_m = {0}; \ v16i8 zero_m = {0}; \
\ \
vec0_m = (v8i16)__msa_ilvr_b((v16i8)in_y, (v16i8)in_y); \ vec0_m = (v8i16)__msa_ilvr_b((v16i8)in_y, (v16i8)in_y); \
...@@ -94,18 +112,7 @@ extern "C" { ...@@ -94,18 +112,7 @@ extern "C" {
reg4_m = __msa_srai_w(reg4_m, 6); \ reg4_m = __msa_srai_w(reg4_m, 6); \
reg2_m = __msa_srai_w(reg2_m, 6); \ reg2_m = __msa_srai_w(reg2_m, 6); \
reg3_m = __msa_srai_w(reg3_m, 6); \ reg3_m = __msa_srai_w(reg3_m, 6); \
reg5_m = __msa_maxi_s_w(reg5_m, 0); \ CLIP_0TO255(reg5_m, reg6_m, reg7_m, reg4_m, reg2_m, reg3_m); \
reg6_m = __msa_maxi_s_w(reg6_m, 0); \
reg7_m = __msa_maxi_s_w(reg7_m, 0); \
reg4_m = __msa_maxi_s_w(reg4_m, 0); \
reg2_m = __msa_maxi_s_w(reg2_m, 0); \
reg3_m = __msa_maxi_s_w(reg3_m, 0); \
reg5_m = __msa_min_s_w(max_m, reg5_m); \
reg6_m = __msa_min_s_w(max_m, reg6_m); \
reg7_m = __msa_min_s_w(max_m, reg7_m); \
reg4_m = __msa_min_s_w(max_m, reg4_m); \
reg2_m = __msa_min_s_w(max_m, reg2_m); \
reg3_m = __msa_min_s_w(max_m, reg3_m); \
out_b = __msa_pckev_h((v8i16)reg6_m, (v8i16)reg5_m); \ out_b = __msa_pckev_h((v8i16)reg6_m, (v8i16)reg5_m); \
out_g = __msa_pckev_h((v8i16)reg4_m, (v8i16)reg7_m); \ out_g = __msa_pckev_h((v8i16)reg4_m, (v8i16)reg7_m); \
out_r = __msa_pckev_h((v8i16)reg3_m, (v8i16)reg2_m); \ out_r = __msa_pckev_h((v8i16)reg3_m, (v8i16)reg2_m); \
...@@ -263,6 +270,19 @@ extern "C" { ...@@ -263,6 +270,19 @@ extern "C" {
u_out = (v16u8)__msa_pckod_b((v16i8)reg3_m, (v16i8)reg2_m); \ u_out = (v16u8)__msa_pckod_b((v16i8)reg3_m, (v16i8)reg2_m); \
} }
// Load I444 pixel data: fetch one 64-bit value from each of the Y, U and V
// pointers and place it in doubleword 0 of the matching output vector. The
// other doubleword comes from a zero vector.
#define READI444(psrc_y, psrc_u, psrc_v, out_y, out_u, out_v)   \
  {                                                             \
    v2i64 zero_m = {0};                                         \
    uint64 y_m = LD(psrc_y);                                    \
    uint64 u_m = LD(psrc_u);                                    \
    uint64 v_m = LD(psrc_v);                                    \
                                                                \
    out_y = (v16u8)__msa_insert_d(zero_m, 0, (int64)y_m);       \
    out_u = (v16u8)__msa_insert_d(zero_m, 0, (int64)u_m);       \
    out_v = (v16u8)__msa_insert_d(zero_m, 0, (int64)v_m);       \
  }
void MirrorRow_MSA(const uint8* src, uint8* dst, int width) { void MirrorRow_MSA(const uint8* src, uint8* dst, int width) {
int x; int x;
v16u8 src0, src1, src2, src3; v16u8 src0, src1, src2, src3;
...@@ -2632,6 +2652,203 @@ void RGBAToUVRow_MSA(const uint8* src_rgb0, ...@@ -2632,6 +2652,203 @@ void RGBAToUVRow_MSA(const uint8* src_rgb0,
} }
} }
// Converts one row of I444 to ARGB, 8 pixels per loop iteration.
// Per pixel: Y is scaled by the yg constant, a per-channel bias is added, the
// weighted U/V terms are subtracted, and the result is shifted right by 6 and
// clamped to 0..255 before being interleaved with ALPHA_VAL.
// The loop always handles whole groups of 8 pixels, so callers are expected to
// pass a width that is a multiple of 8.
void I444ToARGBRow_MSA(const uint8* src_y,
                       const uint8* src_u,
                       const uint8* src_v,
                       uint8* rgb_buf,
                       const struct YuvConstants* yuvconstants,
                       int width) {
  int x;
  v16u8 y_in, u_in, v_in, argb_lo, argb_hi;
  v8u16 y_dup, u_wide, v_wide;
  v8u16 b_pack, g_pack, r_pack, bg_pairs, ra_pairs;
  v4i32 y_lo, y_hi, u_lo, u_hi, v_lo, v_hi;
  v4i32 b_lo, b_hi, g_lo, g_hi, r_lo, r_hi;
  v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg;
  v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL);
  v8i16 zero = {0};

  YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg,
                 vec_br, vec_yg);

  for (x = 0; x < width; x += 8) {
    READI444(src_y, src_u, src_v, y_in, u_in, v_in);
    src_y += 8;
    src_u += 8;
    src_v += 8;

    // Duplicate each Y byte into a 16-bit lane, widen to 32 bits and scale.
    y_dup = (v8u16)__msa_ilvr_b((v16i8)y_in, (v16i8)y_in);
    y_lo = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)y_dup);
    y_hi = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)y_dup);
    y_lo = __msa_srai_w(y_lo * vec_yg, 16);
    y_hi = __msa_srai_w(y_hi * vec_yg, 16);

    // Widen the U and V bytes to 32-bit lanes.
    u_wide = (v8u16)__msa_ilvr_b((v16i8)zero, (v16i8)u_in);
    v_wide = (v8u16)__msa_ilvr_b((v16i8)zero, (v16i8)v_in);
    u_lo = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)u_wide);
    u_hi = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)u_wide);
    v_lo = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)v_wide);
    v_hi = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)v_wide);

    // Bias plus luma, minus the chroma contribution of each channel.
    b_lo = y_lo + vec_bb - u_lo * vec_ub;
    b_hi = y_hi + vec_bb - u_hi * vec_ub;
    g_lo = y_lo + vec_bg - u_lo * vec_ug - v_lo * vec_vg;
    g_hi = y_hi + vec_bg - u_hi * vec_ug - v_hi * vec_vg;
    r_lo = y_lo + vec_br - v_lo * vec_vr;
    r_hi = y_hi + vec_br - v_hi * vec_vr;

    // Drop the 6 fractional bits and clamp to 8-bit range.
    b_lo = __msa_srai_w(b_lo, 6);
    b_hi = __msa_srai_w(b_hi, 6);
    g_lo = __msa_srai_w(g_lo, 6);
    g_hi = __msa_srai_w(g_hi, 6);
    r_lo = __msa_srai_w(r_lo, 6);
    r_hi = __msa_srai_w(r_hi, 6);
    CLIP_0TO255(b_lo, b_hi, g_lo, g_hi, r_lo, r_hi);

    // Narrow to 16-bit lanes, pair up B/G and R/alpha bytes, then merge the
    // pairs into 32-bit pixels.
    b_pack = (v8u16)__msa_pckev_h((v8i16)b_hi, (v8i16)b_lo);
    g_pack = (v8u16)__msa_pckev_h((v8i16)g_hi, (v8i16)g_lo);
    r_pack = (v8u16)__msa_pckev_h((v8i16)r_hi, (v8i16)r_lo);
    bg_pairs = (v8u16)__msa_ilvev_b((v16i8)g_pack, (v16i8)b_pack);
    ra_pairs = (v8u16)__msa_ilvev_b((v16i8)alpha, (v16i8)r_pack);
    argb_lo = (v16u8)__msa_ilvr_h((v8i16)ra_pairs, (v8i16)bg_pairs);
    argb_hi = (v16u8)__msa_ilvl_h((v8i16)ra_pairs, (v8i16)bg_pairs);

    ST_UB2(argb_lo, argb_hi, rgb_buf, 16);
    rgb_buf += 32;
  }
}
// Converts one row of I400 (Y only) to ARGB, 16 pixels per loop iteration.
// Each Y value is scaled by 0x4A35, biased by 0xFB78 (as a signed 16-bit
// value), shifted right by 6 and clamped to 0..255. The result is written to
// three colour bytes, followed by ALPHA_VAL.
// The loop always handles whole groups of 16 pixels, so callers are expected
// to pass a width that is a multiple of 16.
void I400ToARGBRow_MSA(const uint8* src_y, uint8* rgb_buf, int width) {
  int x;
  v16u8 y_in, gray, gray_pair_lo, gray_pair_hi, gray_alpha_lo, gray_alpha_hi;
  v16u8 argb0, argb1, argb2, argb3;
  v8i16 y16_lo, y16_hi;
  v4i32 y32_0, y32_1, y32_2, y32_3;
  v4i32 vec_yg = __msa_fill_w(0x4A35);
  v8i16 vec_ygb = __msa_fill_h(0xFB78);
  v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL);
  v8i16 max_val = __msa_ldi_h(0xFF);
  v8i16 zero = {0};

  for (x = 0; x < width; x += 16) {
    y_in = (v16u8)__msa_ld_b((v16i8*)src_y, 0);
    src_y += 16;

    // Duplicate each Y byte into a 16-bit lane, then widen to 32 bits.
    y16_lo = (v8i16)__msa_ilvr_b((v16i8)y_in, (v16i8)y_in);
    y16_hi = (v8i16)__msa_ilvl_b((v16i8)y_in, (v16i8)y_in);
    y32_0 = (v4i32)__msa_ilvr_h(zero, y16_lo);
    y32_1 = (v4i32)__msa_ilvl_h(zero, y16_lo);
    y32_2 = (v4i32)__msa_ilvr_h(zero, y16_hi);
    y32_3 = (v4i32)__msa_ilvl_h(zero, y16_hi);

    // Scale and keep the upper 16 bits of each product.
    y32_0 = __msa_srai_w(y32_0 * vec_yg, 16);
    y32_1 = __msa_srai_w(y32_1 * vec_yg, 16);
    y32_2 = __msa_srai_w(y32_2 * vec_yg, 16);
    y32_3 = __msa_srai_w(y32_3 * vec_yg, 16);

    // Back to 16-bit lanes: add the bias, drop 6 fractional bits, clamp.
    y16_lo = (v8i16)__msa_pckev_h((v8i16)y32_1, (v8i16)y32_0);
    y16_hi = (v8i16)__msa_pckev_h((v8i16)y32_3, (v8i16)y32_2);
    y16_lo = __msa_srai_h(y16_lo + vec_ygb, 6);
    y16_hi = __msa_srai_h(y16_hi + vec_ygb, 6);
    y16_lo = __msa_min_s_h(max_val, __msa_maxi_s_h(y16_lo, 0));
    y16_hi = __msa_min_s_h(max_val, __msa_maxi_s_h(y16_hi, 0));

    // Narrow to bytes and replicate each gray value into three colour bytes
    // followed by alpha.
    gray = (v16u8)__msa_pckev_b((v16i8)y16_hi, (v16i8)y16_lo);
    gray_pair_lo = (v16u8)__msa_ilvr_b((v16i8)gray, (v16i8)gray);
    gray_pair_hi = (v16u8)__msa_ilvl_b((v16i8)gray, (v16i8)gray);
    gray_alpha_lo = (v16u8)__msa_ilvr_b((v16i8)alpha, (v16i8)gray);
    gray_alpha_hi = (v16u8)__msa_ilvl_b((v16i8)alpha, (v16i8)gray);
    argb0 = (v16u8)__msa_ilvr_b((v16i8)gray_alpha_lo, (v16i8)gray_pair_lo);
    argb1 = (v16u8)__msa_ilvl_b((v16i8)gray_alpha_lo, (v16i8)gray_pair_lo);
    argb2 = (v16u8)__msa_ilvr_b((v16i8)gray_alpha_hi, (v16i8)gray_pair_hi);
    argb3 = (v16u8)__msa_ilvl_b((v16i8)gray_alpha_hi, (v16i8)gray_pair_hi);

    ST_UB4(argb0, argb1, argb2, argb3, rgb_buf, 16);
    rgb_buf += 64;
  }
}
// Expands one row of J400 (Y only) to ARGB, 16 pixels per loop iteration.
// Y is copied unchanged into three colour bytes, followed by ALPHA_VAL.
// The loop always handles whole groups of 16 pixels, so callers are expected
// to pass a width that is a multiple of 16.
void J400ToARGBRow_MSA(const uint8* src_y, uint8* dst_argb, int width) {
  int x;
  v16u8 y_in, gray_pair_lo, gray_pair_hi, gray_alpha_lo, gray_alpha_hi;
  v16u8 argb0, argb1, argb2, argb3;
  v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL);

  for (x = 0; x < width; x += 16) {
    y_in = (v16u8)__msa_ld_b((v16i8*)src_y, 0);
    src_y += 16;

    // Build Y,Y and Y,alpha byte pairs for the low and high 8 pixels.
    gray_pair_lo = (v16u8)__msa_ilvr_b((v16i8)y_in, (v16i8)y_in);
    gray_pair_hi = (v16u8)__msa_ilvl_b((v16i8)y_in, (v16i8)y_in);
    gray_alpha_lo = (v16u8)__msa_ilvr_b((v16i8)alpha, (v16i8)y_in);
    gray_alpha_hi = (v16u8)__msa_ilvl_b((v16i8)alpha, (v16i8)y_in);

    // Interleave the pairs into 4-byte pixels.
    argb0 = (v16u8)__msa_ilvr_b((v16i8)gray_alpha_lo, (v16i8)gray_pair_lo);
    argb1 = (v16u8)__msa_ilvl_b((v16i8)gray_alpha_lo, (v16i8)gray_pair_lo);
    argb2 = (v16u8)__msa_ilvr_b((v16i8)gray_alpha_hi, (v16i8)gray_pair_hi);
    argb3 = (v16u8)__msa_ilvl_b((v16i8)gray_alpha_hi, (v16i8)gray_pair_hi);

    ST_UB4(argb0, argb1, argb2, argb3, dst_argb, 16);
    dst_argb += 64;
  }
}
// Converts one row of packed YUY2 to ARGB, 8 pixels (16 source bytes) per
// loop iteration. The even source bytes are passed to YUVTORGB as luma and the
// odd bytes as chroma.
// The loop always handles whole groups of 8 pixels, so callers are expected to
// pass a width that is a multiple of 8.
void YUY2ToARGBRow_MSA(const uint8* src_yuy2,
                       uint8* rgb_buf,
                       const struct YuvConstants* yuvconstants,
                       int width) {
  int x;
  v16u8 packed, luma, chroma;
  v8i16 blue, green, red;
  v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg;
  v4i32 vec_ubvr, vec_ugvg;
  v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL);

  YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg,
                 vec_br, vec_yg);
  // Combine the U and V coefficients into the paired form YUVTORGB takes.
  vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub);
  vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug);

  for (x = 0; x < width; x += 8) {
    packed = (v16u8)__msa_ld_b((v16i8*)src_yuy2, 0);
    src_yuy2 += 16;

    luma = (v16u8)__msa_pckev_b((v16i8)packed, (v16i8)packed);
    chroma = (v16u8)__msa_pckod_b((v16i8)packed, (v16i8)packed);
    YUVTORGB(luma, chroma, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg,
             blue, green, red);

    STOREARGB(blue, green, red, alpha, rgb_buf);
    rgb_buf += 32;
  }
}
// Converts one row of packed UYVY to ARGB, 8 pixels (16 source bytes) per
// loop iteration. The odd source bytes are passed to YUVTORGB as luma and the
// even bytes as chroma.
// The loop always handles whole groups of 8 pixels, so callers are expected to
// pass a width that is a multiple of 8.
void UYVYToARGBRow_MSA(const uint8* src_uyvy,
                       uint8* rgb_buf,
                       const struct YuvConstants* yuvconstants,
                       int width) {
  int x;
  v16u8 packed, luma, chroma;
  v8i16 blue, green, red;
  v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg;
  v4i32 vec_ubvr, vec_ugvg;
  v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL);

  YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg,
                 vec_br, vec_yg);
  // Combine the U and V coefficients into the paired form YUVTORGB takes.
  vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub);
  vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug);

  for (x = 0; x < width; x += 8) {
    packed = (v16u8)__msa_ld_b((v16i8*)src_uyvy, 0);
    src_uyvy += 16;

    luma = (v16u8)__msa_pckod_b((v16i8)packed, (v16i8)packed);
    chroma = (v16u8)__msa_pckev_b((v16i8)packed, (v16i8)packed);
    YUVTORGB(luma, chroma, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg,
             blue, green, red);

    STOREARGB(blue, green, red, alpha, rgb_buf);
    rgb_buf += 32;
  }
}
#ifdef __cplusplus #ifdef __cplusplus
} // extern "C" } // extern "C"
} // namespace libyuv } // namespace libyuv
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment