Commit a899dea2 authored by Manojkumar Bhosale

Add MSA optimized ARGB Attenuate/RGB565/Shuffle/Shade/Gray/Sepia row functions

R=fbarchard@google.com
BUG=libyuv:634

Performance Gain (vs C vectorized)
ARGBAttenuateRow_MSA          - ~1.1x
ARGBAttenuateRow_Any_MSA      - ~1.1x
ARGBToRGB565DitherRow_MSA     - ~6.4x
ARGBToRGB565DitherRow_Any_MSA - ~6.2x
ARGBShuffleRow_MSA            - ~5.1x
ARGBShuffleRow_Any_MSA        - ~1.9x
ARGBShadeRow_MSA              - ~1.1x
ARGBGrayRow_MSA               - ~2.6x
ARGBSepiaRow_MSA              - ~11.6x

Performance Gain (vs C non-vectorized)
ARGBAttenuateRow_MSA          - ~2.46x
ARGBAttenuateRow_Any_MSA      - ~2.45x
ARGBToRGB565DitherRow_MSA     - ~9.4x
ARGBToRGB565DitherRow_Any_MSA - ~12.5x
ARGBShuffleRow_MSA            - ~5.2x
ARGBShuffleRow_Any_MSA        - ~1.9x
ARGBShadeRow_MSA              - ~4.3x
ARGBGrayRow_MSA               - ~10.5x
ARGBSepiaRow_MSA              - ~12.2x

Review-Url: https://codereview.chromium.org/2559693002 .
parent 6fa5e4eb
......@@ -393,6 +393,12 @@ extern "C" {
#define HAS_ARGBMULTIPLYROW_MSA
#define HAS_ARGBADDROW_MSA
#define HAS_ARGBSUBTRACTROW_MSA
#define HAS_ARGBATTENUATEROW_MSA
#define HAS_ARGBTORGB565DITHERROW_MSA
#define HAS_ARGBSHUFFLEROW_MSA
#define HAS_ARGBSHADEROW_MSA
#define HAS_ARGBGRAYROW_MSA
#define HAS_ARGBSEPIAROW_MSA
#endif
#if defined(_MSC_VER) && !defined(__CLR_VER) && !defined(__clang__)
......@@ -1192,6 +1198,10 @@ void ARGBShuffleRow_NEON(const uint8* src_argb,
uint8* dst_argb,
const uint8* shuffler,
int width);
void ARGBShuffleRow_MSA(const uint8* src_argb,
uint8* dst_argb,
const uint8* shuffler,
int width);
void ARGBShuffleRow_Any_SSE2(const uint8* src_argb,
uint8* dst_argb,
const uint8* shuffler,
......@@ -1208,6 +1218,10 @@ void ARGBShuffleRow_Any_NEON(const uint8* src_argb,
uint8* dst_argb,
const uint8* shuffler,
int width);
void ARGBShuffleRow_Any_MSA(const uint8* src_argb,
uint8* dst_argb,
const uint8* shuffler,
int width);
void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int width);
void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, int width);
......@@ -1326,6 +1340,10 @@ void ARGBToRAWRow_MSA(const uint8* src_argb, uint8* dst_rgb, int width);
void ARGBToRGB565Row_MSA(const uint8* src_argb, uint8* dst_rgb, int width);
void ARGBToARGB1555Row_MSA(const uint8* src_argb, uint8* dst_rgb, int width);
void ARGBToARGB4444Row_MSA(const uint8* src_argb, uint8* dst_rgb, int width);
void ARGBToRGB565DitherRow_MSA(const uint8* src_argb,
uint8* dst_rgb,
const uint32 dither4,
int width);
void ARGBToRGBARow_C(const uint8* src_argb, uint8* dst_rgb, int width);
void ARGBToRGB24Row_C(const uint8* src_argb, uint8* dst_rgb, int width);
......@@ -1947,6 +1965,10 @@ void ARGBToARGB1555Row_Any_MSA(const uint8* src_argb,
void ARGBToARGB4444Row_Any_MSA(const uint8* src_argb,
uint8* dst_rgb,
int width);
void ARGBToRGB565DitherRow_Any_MSA(const uint8* src_argb,
uint8* dst_rgb,
const uint32 dither4,
int width);
void I444ToARGBRow_Any_NEON(const uint8* src_y,
const uint8* src_u,
......@@ -2344,6 +2366,7 @@ void ARGBAttenuateRow_C(const uint8* src_argb, uint8* dst_argb, int width);
void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width);
void ARGBAttenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width);
void ARGBAttenuateRow_NEON(const uint8* src_argb, uint8* dst_argb, int width);
void ARGBAttenuateRow_MSA(const uint8* src_argb, uint8* dst_argb, int width);
void ARGBAttenuateRow_Any_SSE2(const uint8* src_argb,
uint8* dst_argb,
int width);
......@@ -2356,6 +2379,9 @@ void ARGBAttenuateRow_Any_AVX2(const uint8* src_argb,
void ARGBAttenuateRow_Any_NEON(const uint8* src_argb,
uint8* dst_argb,
int width);
void ARGBAttenuateRow_Any_MSA(const uint8* src_argb,
uint8* dst_argb,
int width);
// Inverse table for unattenuate, shared by C and SSE2.
extern const uint32 fixed_invtbl8[256];
......@@ -2372,10 +2398,12 @@ void ARGBUnattenuateRow_Any_AVX2(const uint8* src_argb,
void ARGBGrayRow_C(const uint8* src_argb, uint8* dst_argb, int width);
void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width);
void ARGBGrayRow_NEON(const uint8* src_argb, uint8* dst_argb, int width);
void ARGBGrayRow_MSA(const uint8* src_argb, uint8* dst_argb, int width);
void ARGBSepiaRow_C(uint8* dst_argb, int width);
void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width);
void ARGBSepiaRow_NEON(uint8* dst_argb, int width);
void ARGBSepiaRow_MSA(uint8* dst_argb, int width);
void ARGBColorMatrixRow_C(const uint8* src_argb,
uint8* dst_argb,
......@@ -2424,6 +2452,10 @@ void ARGBShadeRow_NEON(const uint8* src_argb,
uint8* dst_argb,
int width,
uint32 value);
void ARGBShadeRow_MSA(const uint8* src_argb,
uint8* dst_argb,
int width,
uint32 value);
// Used for blur.
void CumulativeSumToAverageRow_SSE2(const int32* topleft,
......
......@@ -646,6 +646,14 @@ static int I420AlphaToARGBMatrix(const uint8* src_y,
}
}
#endif
#if defined(HAS_ARGBATTENUATEROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
ARGBAttenuateRow = ARGBAttenuateRow_Any_MSA;
if (IS_ALIGNED(width, 8)) {
ARGBAttenuateRow = ARGBAttenuateRow_MSA;
}
}
#endif
for (y = 0; y < height; ++y) {
I422AlphaToARGBRow(src_y, src_u, src_v, src_a, dst_argb, yuvconstants,
......
......@@ -975,6 +975,14 @@ int I420ToRGB565Dither(const uint8* src_y,
ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_NEON;
}
}
#endif
#if defined(HAS_ARGBTORGB565DITHERROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_MSA;
if (IS_ALIGNED(width, 8)) {
ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_MSA;
}
}
#endif
{
// Allocate a row of argb.
......
......@@ -977,6 +977,15 @@ int ARGBToRGB565Dither(const uint8* src_argb,
}
}
#endif
#if defined(HAS_ARGBTORGB565DITHERROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_MSA;
if (IS_ALIGNED(width, 8)) {
ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_MSA;
}
}
#endif
for (y = 0; y < height; ++y) {
ARGBToRGB565DitherRow(src_argb, dst_rgb565,
*(uint32*)(dither4x4 + ((y & 3) << 2)),
......
......@@ -1696,6 +1696,14 @@ int ARGBAttenuate(const uint8* src_argb,
}
}
#endif
#if defined(HAS_ARGBATTENUATEROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
ARGBAttenuateRow = ARGBAttenuateRow_Any_MSA;
if (IS_ALIGNED(width, 8)) {
ARGBAttenuateRow = ARGBAttenuateRow_MSA;
}
}
#endif
for (y = 0; y < height; ++y) {
ARGBAttenuateRow(src_argb, dst_argb, width);
......@@ -1791,6 +1799,11 @@ int ARGBGrayTo(const uint8* src_argb,
ARGBGrayRow = ARGBGrayRow_NEON;
}
#endif
#if defined(HAS_ARGBGRAYROW_MSA)
if (TestCpuFlag(kCpuHasMSA) && IS_ALIGNED(width, 8)) {
ARGBGrayRow = ARGBGrayRow_MSA;
}
#endif
for (y = 0; y < height; ++y) {
ARGBGrayRow(src_argb, dst_argb, width);
......@@ -1831,6 +1844,12 @@ int ARGBGray(uint8* dst_argb,
ARGBGrayRow = ARGBGrayRow_NEON;
}
#endif
#if defined(HAS_ARGBGRAYROW_MSA)
if (TestCpuFlag(kCpuHasMSA) && IS_ALIGNED(width, 8)) {
ARGBGrayRow = ARGBGrayRow_MSA;
}
#endif
for (y = 0; y < height; ++y) {
ARGBGrayRow(dst, dst, width);
dst += dst_stride_argb;
......@@ -1868,6 +1887,12 @@ int ARGBSepia(uint8* dst_argb,
ARGBSepiaRow = ARGBSepiaRow_NEON;
}
#endif
#if defined(HAS_ARGBSEPIAROW_MSA)
if (TestCpuFlag(kCpuHasMSA) && IS_ALIGNED(width, 8)) {
ARGBSepiaRow = ARGBSepiaRow_MSA;
}
#endif
for (y = 0; y < height; ++y) {
ARGBSepiaRow(dst, width);
dst += dst_stride_argb;
......@@ -2261,6 +2286,11 @@ int ARGBShade(const uint8* src_argb,
ARGBShadeRow = ARGBShadeRow_NEON;
}
#endif
#if defined(HAS_ARGBSHADEROW_MSA)
if (TestCpuFlag(kCpuHasMSA) && IS_ALIGNED(width, 4)) {
ARGBShadeRow = ARGBShadeRow_MSA;
}
#endif
for (y = 0; y < height; ++y) {
ARGBShadeRow(src_argb, dst_argb, width, value);
......@@ -2455,6 +2485,14 @@ int ARGBShuffle(const uint8* src_bgra,
}
}
#endif
#if defined(HAS_ARGBSHUFFLEROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
ARGBShuffleRow = ARGBShuffleRow_Any_MSA;
if (IS_ALIGNED(width, 8)) {
ARGBShuffleRow = ARGBShuffleRow_MSA;
}
}
#endif
for (y = 0; y < height; ++y) {
ARGBShuffleRow(src_bgra, dst_argb, shuffler, width);
......
......@@ -502,6 +502,9 @@ ANY11(ARGBUnattenuateRow_Any_AVX2, ARGBUnattenuateRow_AVX2, 0, 4, 4, 7)
#ifdef HAS_ARGBATTENUATEROW_NEON
ANY11(ARGBAttenuateRow_Any_NEON, ARGBAttenuateRow_NEON, 0, 4, 4, 7)
#endif
#ifdef HAS_ARGBATTENUATEROW_MSA
ANY11(ARGBAttenuateRow_Any_MSA, ARGBAttenuateRow_MSA, 0, 4, 4, 7)
#endif
#ifdef HAS_ARGBEXTRACTALPHAROW_SSE2
ANY11(ARGBExtractAlphaRow_Any_SSE2, ARGBExtractAlphaRow_SSE2, 0, 4, 1, 7)
#endif
......@@ -582,6 +585,14 @@ ANY11P(ARGBToRGB565DitherRow_Any_NEON,
2,
7)
#endif
#if defined(HAS_ARGBTORGB565DITHERROW_MSA)
ANY11P(ARGBToRGB565DitherRow_Any_MSA,
ARGBToRGB565DitherRow_MSA,
const uint32,
4,
2,
7)
#endif
#ifdef HAS_ARGBSHUFFLEROW_SSE2
ANY11P(ARGBShuffleRow_Any_SSE2, ARGBShuffleRow_SSE2, const uint8*, 4, 4, 3)
#endif
......@@ -594,6 +605,9 @@ ANY11P(ARGBShuffleRow_Any_AVX2, ARGBShuffleRow_AVX2, const uint8*, 4, 4, 15)
#ifdef HAS_ARGBSHUFFLEROW_NEON
ANY11P(ARGBShuffleRow_Any_NEON, ARGBShuffleRow_NEON, const uint8*, 4, 4, 3)
#endif
#ifdef HAS_ARGBSHUFFLEROW_MSA
ANY11P(ARGBShuffleRow_Any_MSA, ARGBShuffleRow_MSA, const uint8*, 4, 4, 7)
#endif
#undef ANY11P
// Any 1 to 1 with parameter and shorts. BPP measures in shorts.
......
......@@ -1038,6 +1038,244 @@ void ARGBSubtractRow_MSA(const uint8* src_argb0,
}
}
// Multiplies the first three bytes of every 4-byte pixel by that pixel's
// fourth byte (the alpha channel) and writes the result to dst_argb; the
// fourth byte itself is copied through unchanged.
//
// src_argb: source pixels, 4 bytes each.
// dst_argb: destination pixels, 4 bytes each.
// width:    number of pixels. Each iteration consumes and produces 8 pixels
//           (32 bytes) and there is no tail handling, so width is expected
//           to be a multiple of 8.
void ARGBAttenuateRow_MSA(const uint8* src_argb, uint8* dst_argb, int width) {
int x;
v16u8 src0, src1, dst0, dst1;
v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9;
v4u32 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7;
v8i16 zero = {0};
// Selects byte 3 of every pixel; used at the end to restore the source alpha.
v16u8 mask = {0, 0, 0, 255, 0, 0, 0, 255, 0, 0, 0, 255, 0, 0, 0, 255};
for (x = 0; x < width; x += 8) {
src0 = (v16u8)__msa_ld_b((v16i8*)src_argb, 0);
src1 = (v16u8)__msa_ld_b((v16i8*)src_argb, 16);
// Interleaving a vector with itself duplicates each byte into both halves of
// a 16-bit lane (v -> v * 0x101). vec0..vec3 each hold two pixels.
vec0 = (v8u16)__msa_ilvr_b((v16i8)src0, (v16i8)src0);
vec1 = (v8u16)__msa_ilvl_b((v16i8)src0, (v16i8)src0);
vec2 = (v8u16)__msa_ilvr_b((v16i8)src1, (v16i8)src1);
vec3 = (v8u16)__msa_ilvl_b((v16i8)src1, (v16i8)src1);
// Halfword 3 and halfword 7 are the widened alpha of the two pixels in each
// vector. Broadcast each one, then pack the low 64 bits of both broadcasts
// so every channel lane sits opposite its own pixel's alpha.
vec4 = (v8u16)__msa_fill_h(vec0[3]);
vec5 = (v8u16)__msa_fill_h(vec0[7]);
vec6 = (v8u16)__msa_fill_h(vec1[3]);
vec7 = (v8u16)__msa_fill_h(vec1[7]);
vec4 = (v8u16)__msa_pckev_d((v2i64)vec5, (v2i64)vec4);
vec5 = (v8u16)__msa_pckev_d((v2i64)vec7, (v2i64)vec6);
vec6 = (v8u16)__msa_fill_h(vec2[3]);
vec7 = (v8u16)__msa_fill_h(vec2[7]);
vec8 = (v8u16)__msa_fill_h(vec3[3]);
vec9 = (v8u16)__msa_fill_h(vec3[7]);
vec6 = (v8u16)__msa_pckev_d((v2i64)vec7, (v2i64)vec6);
vec7 = (v8u16)__msa_pckev_d((v2i64)vec9, (v2i64)vec8);
// Zero-extend the alpha halfwords to 32-bit lanes (one pixel per register).
reg0 = (v4u32)__msa_ilvr_h(zero, (v8i16)vec4);
reg1 = (v4u32)__msa_ilvl_h(zero, (v8i16)vec4);
reg2 = (v4u32)__msa_ilvr_h(zero, (v8i16)vec5);
reg3 = (v4u32)__msa_ilvl_h(zero, (v8i16)vec5);
reg4 = (v4u32)__msa_ilvr_h(zero, (v8i16)vec6);
reg5 = (v4u32)__msa_ilvl_h(zero, (v8i16)vec6);
reg6 = (v4u32)__msa_ilvr_h(zero, (v8i16)vec7);
reg7 = (v4u32)__msa_ilvl_h(zero, (v8i16)vec7);
// Multiply by the zero-extended channel values: (a * 0x101) * (c * 0x101).
reg0 *= (v4u32)__msa_ilvr_h(zero, (v8i16)vec0);
reg1 *= (v4u32)__msa_ilvl_h(zero, (v8i16)vec0);
reg2 *= (v4u32)__msa_ilvr_h(zero, (v8i16)vec1);
reg3 *= (v4u32)__msa_ilvl_h(zero, (v8i16)vec1);
reg4 *= (v4u32)__msa_ilvr_h(zero, (v8i16)vec2);
reg5 *= (v4u32)__msa_ilvl_h(zero, (v8i16)vec2);
reg6 *= (v4u32)__msa_ilvr_h(zero, (v8i16)vec3);
reg7 *= (v4u32)__msa_ilvl_h(zero, (v8i16)vec3);
// Keep the top byte of each 32-bit product. The shift is arithmetic, but only
// the low 8 bits of each lane survive the packs below, and those are exactly
// bits 24..31 of the product, so sign extension does not affect the output.
reg0 = (v4u32)__msa_srai_w((v4i32)reg0, 24);
reg1 = (v4u32)__msa_srai_w((v4i32)reg1, 24);
reg2 = (v4u32)__msa_srai_w((v4i32)reg2, 24);
reg3 = (v4u32)__msa_srai_w((v4i32)reg3, 24);
reg4 = (v4u32)__msa_srai_w((v4i32)reg4, 24);
reg5 = (v4u32)__msa_srai_w((v4i32)reg5, 24);
reg6 = (v4u32)__msa_srai_w((v4i32)reg6, 24);
reg7 = (v4u32)__msa_srai_w((v4i32)reg7, 24);
// Narrow 32 -> 16 -> 8 bits, restoring the original pixel order.
vec0 = (v8u16)__msa_pckev_h((v8i16)reg1, (v8i16)reg0);
vec1 = (v8u16)__msa_pckev_h((v8i16)reg3, (v8i16)reg2);
vec2 = (v8u16)__msa_pckev_h((v8i16)reg5, (v8i16)reg4);
vec3 = (v8u16)__msa_pckev_h((v8i16)reg7, (v8i16)reg6);
dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
dst1 = (v16u8)__msa_pckev_b((v16i8)vec3, (v16i8)vec2);
// Where mask bits are set (byte 3 of each pixel) take the source byte, so
// alpha is passed through rather than multiplied by itself.
dst0 = __msa_bmnz_v(dst0, src0, mask);
dst1 = __msa_bmnz_v(dst1, src1, mask);
ST_UB2(dst0, dst1, dst_argb, 16);
src_argb += 32;
dst_argb += 32;
}
}
// Converts 4-byte pixels to 16-bit 5:6:5 pixels after adding a dither value
// to bytes 0, 1 and 2 of each pixel and clamping the sums to [0, 255].
//
// src_argb: source pixels, 4 bytes each.
// dst_rgb:  destination, 2 bytes per pixel.
// dither4:  four packed dither bytes; byte (i & 3) is applied to pixel i of
//           each group of 8.
// width:    number of pixels. 8 pixels are handled per iteration with no
//           tail handling, so width is expected to be a multiple of 8.
void ARGBToRGB565DitherRow_MSA(const uint8* src_argb,
                               uint8* dst_rgb,
                               uint32 dither4,
                               int width) {
  int x;
  v16u8 px_lo, px_hi, even_bytes, odd_bytes, packed;
  v8i16 dither;
  v8i16 ch0, ch1, ch2;
  v16i8 zeros = {0};
  v8i16 limit = __msa_ldi_h(0xFF);

  // Replicate the four dither bytes, then zero-extend the low eight of them
  // to halfwords: d0 d1 d2 d3 d0 d1 d2 d3.
  dither = (v8i16)__msa_fill_w(dither4);
  dither = (v8i16)__msa_ilvr_b(zeros, (v16i8)dither);

  for (x = 0; x < width; x += 8) {
    px_lo = (v16u8)__msa_ld_b((v16i8*)src_argb, 0);
    px_hi = (v16u8)__msa_ld_b((v16i8*)src_argb, 16);

    // Split the 8 pixels into their even bytes (0 and 2) and odd bytes
    // (1 and 3), then zero-extend bytes 0, 1 and 2 to 16-bit lanes.
    even_bytes = (v16u8)__msa_pckev_b((v16i8)px_hi, (v16i8)px_lo);
    odd_bytes = (v16u8)__msa_pckod_b((v16i8)px_hi, (v16i8)px_lo);
    ch0 = (v8i16)__msa_ilvev_b(zeros, (v16i8)even_bytes);
    ch1 = (v8i16)__msa_ilvev_b(zeros, (v16i8)odd_bytes);
    ch2 = (v8i16)__msa_ilvod_b(zeros, (v16i8)even_bytes);

    // Add dither and clamp each lane to [0, 255].
    ch0 = ch0 + dither;
    ch1 = ch1 + dither;
    ch2 = ch2 + dither;
    ch0 = __msa_min_s_h(limit, __msa_maxi_s_h(ch0, 0));
    ch1 = __msa_min_s_h(limit, __msa_maxi_s_h(ch1, 0));
    ch2 = __msa_min_s_h(limit, __msa_maxi_s_h(ch2, 0));

    // Reduce to 5/6/5 bits and move each field to its position in the
    // 16-bit output: byte 0 -> bits 0..4, byte 1 -> 5..10, byte 2 -> 11..15.
    ch0 = __msa_srai_h(ch0, 3);
    ch1 = __msa_slli_h(__msa_srai_h(ch1, 2), 5);
    ch2 = __msa_slli_h(__msa_srai_h(ch2, 3), 11);

    packed = (v16u8)(ch0 | ch1 | ch2);
    ST_UB(packed, dst_rgb);
    src_argb += 32;
    dst_rgb += 16;
  }
}
// Reorders the four bytes of every pixel according to shuffler.
//
// src_argb: source pixels, 4 bytes each.
// dst_argb: destination pixels, 4 bytes each.
// shuffler: byte-index table; only its first 4 bytes are read here (via LW)
//           and the same pattern is applied to every pixel.
// width:    number of pixels. 8 pixels are handled per iteration with no
//           tail handling, so width is expected to be a multiple of 8.
void ARGBShuffleRow_MSA(const uint8* src_argb,
                        uint8* dst_argb,
                        const uint8* shuffler,
                        int width) {
  int x;
  v16u8 in_lo, in_hi, out_lo, out_hi;
  // Base byte offset of each of the four pixels held in one 16-byte vector.
  v16i8 control = {0, 0, 0, 0, 4, 4, 4, 4, 8, 8, 8, 8, 12, 12, 12, 12};
  int32 pattern = LW((int32*)shuffler);

  // Per-pixel control = pixel base offset + the 4-byte shuffle pattern.
  control += (v16i8)__msa_fill_w(pattern);

  for (x = 0; x < width; x += 8) {
    in_lo = (v16u8)__msa_ld_b((v16u8*)src_argb, 0);
    in_hi = (v16u8)__msa_ld_b((v16u8*)src_argb, 16);
    // Both shuffle sources are the same vector, so each control byte simply
    // picks a byte out of that vector.
    out_lo = (v16u8)__msa_vshf_b(control, (v16i8)in_lo, (v16i8)in_lo);
    out_hi = (v16u8)__msa_vshf_b(control, (v16i8)in_hi, (v16i8)in_hi);
    ST_UB2(out_lo, out_hi, dst_argb, 16);
    src_argb += 32;
    dst_argb += 32;
  }
}
// Scales every byte of every pixel by the matching byte of value:
// byte k of each output pixel is the top 8 bits of
// (src_byte_k * 0x101) * (value_byte_k * 0x101).
//
// src_argb: source pixels, 4 bytes each.
// dst_argb: destination pixels, 4 bytes each.
// width:    number of pixels. 4 pixels are handled per iteration with no
//           tail handling, so width is expected to be a multiple of 4.
// value:    four packed per-channel scale bytes.
void ARGBShadeRow_MSA(const uint8* src_argb,
                      uint8* dst_argb,
                      int width,
                      uint32 value) {
  int x;
  v16u8 src0, dst0;
  v8u16 vec0, vec1;
  v4u32 reg0, reg1, reg2, reg3, rgba_scale;
  v8i16 zero = {0};

  // Broadcast value to all lanes so the whole vector is defined. Writing a
  // single lane of an otherwise uninitialized vector would make the
  // interleave below read indeterminate lanes. Only lane 0 reaches the
  // final scale factors, so the result is unchanged.
  rgba_scale = (v4u32)__msa_fill_w((int32)value);
  // Widen each scale byte to s * 0x101 in its own 32-bit lane.
  rgba_scale = (v4u32)__msa_ilvr_b((v16i8)rgba_scale, (v16i8)rgba_scale);
  rgba_scale = (v4u32)__msa_ilvr_h(zero, (v8i16)rgba_scale);

  for (x = 0; x < width; x += 4) {
    src0 = (v16u8)__msa_ld_b((v16u8*)src_argb, 0);
    // Duplicate each byte into a halfword (v * 0x101), then zero-extend to
    // 32-bit lanes: one pixel per register.
    vec0 = (v8u16)__msa_ilvr_b((v16i8)src0, (v16i8)src0);
    vec1 = (v8u16)__msa_ilvl_b((v16i8)src0, (v16i8)src0);
    reg0 = (v4u32)__msa_ilvr_h(zero, (v8i16)vec0);
    reg1 = (v4u32)__msa_ilvl_h(zero, (v8i16)vec0);
    reg2 = (v4u32)__msa_ilvr_h(zero, (v8i16)vec1);
    reg3 = (v4u32)__msa_ilvl_h(zero, (v8i16)vec1);
    reg0 *= rgba_scale;
    reg1 *= rgba_scale;
    reg2 *= rgba_scale;
    reg3 *= rgba_scale;
    // Keep the top byte of each product. The shift is arithmetic, but only
    // the low 8 bits of each lane survive the packs below, and those are
    // bits 24..31 of the product regardless of sign extension.
    reg0 = (v4u32)__msa_srai_w((v4i32)reg0, 24);
    reg1 = (v4u32)__msa_srai_w((v4i32)reg1, 24);
    reg2 = (v4u32)__msa_srai_w((v4i32)reg2, 24);
    reg3 = (v4u32)__msa_srai_w((v4i32)reg3, 24);
    // Narrow 32 -> 16 -> 8 bits back into pixel order.
    vec0 = (v8u16)__msa_pckev_h((v8i16)reg1, (v8i16)reg0);
    vec1 = (v8u16)__msa_pckev_h((v8i16)reg3, (v8i16)reg2);
    dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
    ST_UB(dst0, dst_argb);
    src_argb += 16;
    dst_argb += 16;
  }
}
// Replaces bytes 0..2 of every pixel with a single weighted-average value
// y = (15 * byte0 + 75 * byte1 + 38 * byte2 + 64) >> 7 and keeps byte 3.
//
// src_argb: source pixels, 4 bytes each.
// dst_argb: destination pixels, 4 bytes each.
// width:    number of pixels. 8 pixels are handled per iteration with no
//           tail handling, so width is expected to be a multiple of 8.
void ARGBGrayRow_MSA(const uint8* src_argb, uint8* dst_argb, int width) {
  int x;
  v16u8 px_lo, px_hi, pair01, pair23, gray_pair, gray_keep, out_lo, out_hi;
  v8u16 luma;
  // Byte-pair weights: 0x26 applies to byte 2 (byte 3 gets weight 0);
  // 0x4B0F applies 0x0F to byte 0 and 0x4B to byte 1.
  v16u8 weight_2 = (v16u8)__msa_ldi_h(0x26);
  v16u8 weight_01 = (v16u8)__msa_fill_h(0x4B0F);

  for (x = 0; x < width; x += 8) {
    px_lo = (v16u8)__msa_ld_b((v16u8*)src_argb, 0);
    px_hi = (v16u8)__msa_ld_b((v16u8*)src_argb, 16);

    // Gather bytes (0,1) and bytes (2,3) of the 8 pixels into two vectors.
    pair01 = (v16u8)__msa_pckev_h((v8i16)px_hi, (v8i16)px_lo);
    pair23 = (v16u8)__msa_pckod_h((v8i16)px_hi, (v8i16)px_lo);

    // Weighted sum per pixel, then rounding shift by 7. The sum is at most
    // 255 * 128, so it stays non-negative as a signed halfword.
    luma = __msa_dpadd_u_h(__msa_dotp_u_h(pair01, weight_01), pair23,
                           weight_2);
    luma = (v8u16)__msa_srari_h((v8i16)luma, 7);

    // gray_pair holds (y, y) per pixel; gray_keep takes the odd bytes of
    // gray_pair (y) and of pair23 (the original byte 3) per pixel.
    gray_pair = (v16u8)__msa_ilvev_b((v16i8)luma, (v16i8)luma);
    gray_keep = (v16u8)__msa_ilvod_b((v16i8)pair23, (v16i8)gray_pair);

    out_lo = (v16u8)__msa_ilvr_b((v16i8)gray_keep, (v16i8)gray_pair);
    out_hi = (v16u8)__msa_ilvl_b((v16i8)gray_keep, (v16i8)gray_pair);
    ST_UB2(out_lo, out_hi, dst_argb, 16);
    src_argb += 32;
    dst_argb += 32;
  }
}
// Applies a sepia tone in place. For each 4-byte pixel (b0, b1, b2, b3):
//   out0 = (17 * b0 + 68 * b1 + 35 * b2) >> 7
//   out1 = min(255, (22 * b0 + 88 * b1 + 45 * b2) >> 7)
//   out2 = min(255, (24 * b0 + 98 * b1 + 50 * b2) >> 7)
//   out3 = b3 (unchanged)
//
// dst_argb: pixels to transform; read and overwritten in place.
// width:    number of pixels. 8 pixels are handled per iteration with no
//           tail handling, so width is expected to be a multiple of 8.
void ARGBSepiaRow_MSA(uint8* dst_argb, int width) {
  int x;
  v16u8 src0, src1, dst0, dst1, vec0, vec1, vec2, vec3, vec4, vec5;
  v8u16 reg0, reg1, reg2;
  // Each 16-bit constant holds two byte weights: the low byte applies to the
  // first byte of a pair, the high byte to the second.
  v16u8 const_0x4411 = (v16u8)__msa_fill_h(0x4411);
  v16u8 const_0x23 = (v16u8)__msa_ldi_h(0x23);
  v16u8 const_0x5816 = (v16u8)__msa_fill_h(0x5816);
  v16u8 const_0x2D = (v16u8)__msa_ldi_h(0x2D);
  v16u8 const_0x6218 = (v16u8)__msa_fill_h(0x6218);
  v16u8 const_0x32 = (v16u8)__msa_ldi_h(0x32);
  v8u16 const_0xFF = (v8u16)__msa_ldi_h(0xFF);

  for (x = 0; x < width; x += 8) {
    src0 = (v16u8)__msa_ld_b((v16u8*)dst_argb, 0);
    src1 = (v16u8)__msa_ld_b((v16u8*)dst_argb, 16);
    // vec0: bytes (0,1) of the 8 pixels; vec1: bytes (2,3) of the 8 pixels.
    vec0 = (v16u8)__msa_pckev_h((v8i16)src1, (v8i16)src0);
    vec1 = (v16u8)__msa_pckod_h((v8i16)src1, (v8i16)src0);
    // vec3: byte 3 of each pixel, kept aside to be written back untouched.
    vec3 = (v16u8)__msa_pckod_b((v16i8)vec1, (v16i8)vec1);
    reg0 = (v8u16)__msa_dotp_u_h(vec0, const_0x4411);
    reg1 = (v8u16)__msa_dotp_u_h(vec0, const_0x5816);
    reg2 = (v8u16)__msa_dotp_u_h(vec0, const_0x6218);
    reg0 = (v8u16)__msa_dpadd_u_h(reg0, vec1, const_0x23);
    reg1 = (v8u16)__msa_dpadd_u_h(reg1, vec1, const_0x2D);
    reg2 = (v8u16)__msa_dpadd_u_h(reg2, vec1, const_0x32);
    // The sums are unsigned and reg1/reg2 can exceed 0x7FFF (up to 255 * 155
    // and 255 * 172), so use a logical shift. An arithmetic shift would
    // sign-extend those lanes and only yield the right answer because the
    // clamp below happens to map the result to 255.
    reg0 = (v8u16)__msa_srli_h((v8i16)reg0, 7);
    reg1 = (v8u16)__msa_srli_h((v8i16)reg1, 7);
    reg2 = (v8u16)__msa_srli_h((v8i16)reg2, 7);
    // reg0 is at most (255 * 120) >> 7 = 239, so it needs no clamp.
    reg1 = (v8u16)__msa_min_u_h((v8u16)reg1, const_0xFF);
    reg2 = (v8u16)__msa_min_u_h((v8u16)reg2, const_0xFF);
    // Narrow to bytes and re-interleave into (out0, out1, out2, b3) order.
    vec0 = (v16u8)__msa_pckev_b((v16i8)reg0, (v16i8)reg0);
    vec1 = (v16u8)__msa_pckev_b((v16i8)reg1, (v16i8)reg1);
    vec2 = (v16u8)__msa_pckev_b((v16i8)reg2, (v16i8)reg2);
    vec4 = (v16u8)__msa_ilvr_b((v16i8)vec2, (v16i8)vec0);
    vec5 = (v16u8)__msa_ilvr_b((v16i8)vec3, (v16i8)vec1);
    dst0 = (v16u8)__msa_ilvr_b((v16i8)vec5, (v16i8)vec4);
    dst1 = (v16u8)__msa_ilvl_b((v16i8)vec5, (v16i8)vec4);
    ST_UB2(dst0, dst1, dst_argb, 16);
    dst_argb += 32;
  }
}
void ARGB4444ToARGBRow_MSA(const uint8* src_argb4444,
uint8* dst_argb,
int width) {
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment