Commit 83f460be authored by Manojkumar Bhosale

Add MSA optimized ARGB Multiply/Add/Subtract row functions

R=fbarchard@google.com
BUG=libyuv:634

Performance Gain (vs C vectorized)
ARGBMultiplyRow_MSA       - 1.4x
ARGBAddRow_MSA            - 8.6x
ARGBSubtractRow_MSA       - 8.6x

ARGBMultiplyRow_Any_MSA   - 1.35x
ARGBAddRow_Any_MSA        - 7.3x
ARGBSubtractRow_Any_MSA   - 7.2x

Performance Gain (vs C non-vectorized)
ARGBMultiplyRow_MSA       - 4.4x
ARGBAddRow_MSA            - 27x
ARGBSubtractRow_MSA       - 22x

ARGBMultiplyRow_Any_MSA   - 3.5x
ARGBAddRow_Any_MSA        - 23x
ARGBSubtractRow_Any_MSA   - 18x

Review URL: https://codereview.chromium.org/2529983002 .
parent da0c29da
......@@ -390,6 +390,9 @@ extern "C" {
#define HAS_ARGBTOARGB1555ROW_MSA
#define HAS_ARGBTOARGB4444ROW_MSA
#define HAS_ARGBTOUV444ROW_MSA
#define HAS_ARGBMULTIPLYROW_MSA
#define HAS_ARGBADDROW_MSA
#define HAS_ARGBSUBTRACTROW_MSA
#endif
#if defined(_MSC_VER) && !defined(__CLR_VER) && !defined(__clang__)
......@@ -1809,6 +1812,14 @@ void ARGBMultiplyRow_Any_NEON(const uint8* src_argb,
const uint8* src_argb1,
uint8* dst_argb,
int width);
// MSA version of the ARGB multiply row function. Its loop advances 4 pixels
// per pass, so ARGBMultiply only selects it directly when width is a
// multiple of 4.
void ARGBMultiplyRow_MSA(const uint8* src_argb,
const uint8* src_argb1,
uint8* dst_argb,
int width);
// Wrapper instantiated through ANY21; ARGBMultiply falls back to it for
// widths that are not a multiple of 4.
void ARGBMultiplyRow_Any_MSA(const uint8* src_argb,
const uint8* src_argb1,
uint8* dst_argb,
int width);
// ARGB add images.
void ARGBAddRow_C(const uint8* src_argb,
......@@ -1839,6 +1850,14 @@ void ARGBAddRow_Any_NEON(const uint8* src_argb,
const uint8* src_argb1,
uint8* dst_argb,
int width);
// MSA version of the ARGB add row function. Its loop advances 8 pixels per
// pass, so ARGBAdd only selects it directly when width is a multiple of 8.
void ARGBAddRow_MSA(const uint8* src_argb,
const uint8* src_argb1,
uint8* dst_argb,
int width);
// Wrapper instantiated through ANY21; ARGBAdd falls back to it for widths
// that are not a multiple of 8.
void ARGBAddRow_Any_MSA(const uint8* src_argb,
const uint8* src_argb1,
uint8* dst_argb,
int width);
// ARGB subtract images. Same API as Blend, but these require
// pointer and width alignment for SSE2.
......@@ -1870,6 +1889,14 @@ void ARGBSubtractRow_Any_NEON(const uint8* src_argb,
const uint8* src_argb1,
uint8* dst_argb,
int width);
// MSA version of the ARGB subtract row function. Its loop advances 8 pixels
// per pass, so ARGBSubtract only selects it directly when width is a
// multiple of 8.
void ARGBSubtractRow_MSA(const uint8* src_argb,
const uint8* src_argb1,
uint8* dst_argb,
int width);
// Wrapper instantiated through ANY21; ARGBSubtract falls back to it for
// widths that are not a multiple of 8.
void ARGBSubtractRow_Any_MSA(const uint8* src_argb,
const uint8* src_argb1,
uint8* dst_argb,
int width);
void ARGBToRGB24Row_Any_SSSE3(const uint8* src_argb, uint8* dst_rgb, int width);
void ARGBToRAWRow_Any_SSSE3(const uint8* src_argb, uint8* dst_rgb, int width);
......
......@@ -1115,6 +1115,14 @@ int ARGBMultiply(const uint8* src_argb0,
}
}
#endif
#if defined(HAS_ARGBMULTIPLYROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
ARGBMultiplyRow = ARGBMultiplyRow_Any_MSA;
if (IS_ALIGNED(width, 4)) {
ARGBMultiplyRow = ARGBMultiplyRow_MSA;
}
}
#endif
// Multiply plane
for (y = 0; y < height; ++y) {
......@@ -1184,6 +1192,14 @@ int ARGBAdd(const uint8* src_argb0,
}
}
#endif
#if defined(HAS_ARGBADDROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
ARGBAddRow = ARGBAddRow_Any_MSA;
if (IS_ALIGNED(width, 8)) {
ARGBAddRow = ARGBAddRow_MSA;
}
}
#endif
// Add plane
for (y = 0; y < height; ++y) {
......@@ -1248,6 +1264,14 @@ int ARGBSubtract(const uint8* src_argb0,
}
}
#endif
#if defined(HAS_ARGBSUBTRACTROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
ARGBSubtractRow = ARGBSubtractRow_Any_MSA;
if (IS_ALIGNED(width, 8)) {
ARGBSubtractRow = ARGBSubtractRow_MSA;
}
}
#endif
// Subtract plane
for (y = 0; y < height; ++y) {
......
......@@ -234,6 +234,15 @@ ANY21(ARGBAddRow_Any_NEON, ARGBAddRow_NEON, 0, 4, 4, 4, 7)
#ifdef HAS_ARGBSUBTRACTROW_NEON
ANY21(ARGBSubtractRow_Any_NEON, ARGBSubtractRow_NEON, 0, 4, 4, 4, 7)
#endif
// Any-width wrappers for the MSA multiply/add/subtract kernels.
// NOTE(review): the last ANY21 argument (3, 7, 7) looks like the width mask
// and matches the 4- and 8-pixel loop steps of the kernels it wraps; confirm
// against the ANY21 definition.
#ifdef HAS_ARGBMULTIPLYROW_MSA
ANY21(ARGBMultiplyRow_Any_MSA, ARGBMultiplyRow_MSA, 0, 4, 4, 4, 3)
#endif
#ifdef HAS_ARGBADDROW_MSA
ANY21(ARGBAddRow_Any_MSA, ARGBAddRow_MSA, 0, 4, 4, 4, 7)
#endif
#ifdef HAS_ARGBSUBTRACTROW_MSA
ANY21(ARGBSubtractRow_Any_MSA, ARGBSubtractRow_MSA, 0, 4, 4, 4, 7)
#endif
#ifdef HAS_SOBELROW_SSE2
ANY21(SobelRow_Any_SSE2, SobelRow_SSE2, 0, 1, 1, 4, 15)
#endif
......
......@@ -957,6 +957,87 @@ void ARGBToUV444Row_MSA(const uint8* src_argb,
}
}
// Multiplies two ARGB rows byte by byte and writes the result to dst_argb.
// Every loop pass loads 16 bytes from each source, stores 16 bytes and
// advances x by 4, so the row is always processed in whole 16-byte steps;
// ARGBMultiply only dispatches here directly when width is a multiple of 4.
void ARGBMultiplyRow_MSA(const uint8* src_argb0,
const uint8* src_argb1,
uint8* dst_argb,
int width) {
int x;
v16u8 src0, src1, dst0;
v8u16 vec0, vec1, vec2, vec3;
v4u32 reg0, reg1, reg2, reg3;
// All-zero vector used as the padding operand of the interleaves below.
v8i16 zero = {0};
for (x = 0; x < width; x += 4) {
src0 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 0);
src1 = (v16u8)__msa_ld_b((v16i8*)src_argb1, 0);
// Interleave src0 with itself.
// NOTE(review): this presumably leaves each 16-bit lane holding the byte
// repeated in both halves (v * 0x101) - confirm against the MSA ilvr/ilvl
// definition.
vec0 = (v8u16)__msa_ilvr_b((v16i8)src0, (v16i8)src0);
vec1 = (v8u16)__msa_ilvl_b((v16i8)src0, (v16i8)src0);
// Interleave src1 with zero (presumably zero-extending each byte to 16 bits).
vec2 = (v8u16)__msa_ilvr_b((v16i8)zero, (v16i8)src1);
vec3 = (v8u16)__msa_ilvl_b((v16i8)zero, (v16i8)src1);
// Widen the src0-derived halfwords to 32-bit lanes by interleaving with zero.
reg0 = (v4u32)__msa_ilvr_h(zero, (v8i16)vec0);
reg1 = (v4u32)__msa_ilvl_h(zero, (v8i16)vec0);
reg2 = (v4u32)__msa_ilvr_h(zero, (v8i16)vec1);
reg3 = (v4u32)__msa_ilvl_h(zero, (v8i16)vec1);
// Widen the src1-derived halfwords the same way and multiply lane by lane.
reg0 *= (v4u32)__msa_ilvr_h(zero, (v8i16)vec2);
reg1 *= (v4u32)__msa_ilvl_h(zero, (v8i16)vec2);
reg2 *= (v4u32)__msa_ilvr_h(zero, (v8i16)vec3);
reg3 *= (v4u32)__msa_ilvl_h(zero, (v8i16)vec3);
// Shift every 32-bit product right by 16 bits.
reg0 = (v4u32)__msa_srai_w((v4i32)reg0, 16);
reg1 = (v4u32)__msa_srai_w((v4i32)reg1, 16);
reg2 = (v4u32)__msa_srai_w((v4i32)reg2, 16);
reg3 = (v4u32)__msa_srai_w((v4i32)reg3, 16);
// Narrow back down: pack even halfwords, then even bytes, into one
// 16-byte result vector.
vec0 = (v8u16)__msa_pckev_h((v8i16)reg1, (v8i16)reg0);
vec1 = (v8u16)__msa_pckev_h((v8i16)reg3, (v8i16)reg2);
dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
ST_UB(dst0, dst_argb);
// Step all three pointers past the 16 bytes handled in this pass.
src_argb0 += 16;
src_argb1 += 16;
dst_argb += 16;
}
}
// Adds two ARGB rows byte by byte with unsigned saturation (adds_u_b) and
// writes the sums to dst_argb. Each pass handles 8 pixels: two 16-byte
// vectors are loaded from each source and two are stored, so the row is
// always processed in whole 32-byte steps.
void ARGBAddRow_MSA(const uint8* src_argb0,
                    const uint8* src_argb1,
                    uint8* dst_argb,
                    int width) {
  int remaining;
  v16u8 a_lo, a_hi, b_lo, b_hi, sum_lo, sum_hi;

  for (remaining = width; remaining > 0; remaining -= 8) {
    // Low and high 16-byte halves of the current 32-byte chunk of each row.
    a_lo = (v16u8)__msa_ld_b((v16i8*)src_argb0, 0);
    b_lo = (v16u8)__msa_ld_b((v16i8*)src_argb1, 0);
    a_hi = (v16u8)__msa_ld_b((v16i8*)src_argb0, 16);
    b_hi = (v16u8)__msa_ld_b((v16i8*)src_argb1, 16);

    sum_lo = __msa_adds_u_b(a_lo, b_lo);
    sum_hi = __msa_adds_u_b(a_hi, b_hi);
    ST_UB2(sum_lo, sum_hi, dst_argb, 16);

    // Move on to the next 8 pixels.
    dst_argb += 32;
    src_argb0 += 32;
    src_argb1 += 32;
  }
}
// Subtracts src_argb1 from src_argb0 byte by byte with unsigned saturation
// (subs_u_b) and writes the differences to dst_argb. Each pass handles
// 8 pixels: two 16-byte vectors are loaded from each source and two are
// stored, so the row is always processed in whole 32-byte steps.
void ARGBSubtractRow_MSA(const uint8* src_argb0,
                         const uint8* src_argb1,
                         uint8* dst_argb,
                         int width) {
  int remaining;
  v16u8 a_lo, a_hi, b_lo, b_hi, diff_lo, diff_hi;

  for (remaining = width; remaining > 0; remaining -= 8) {
    // Low and high 16-byte halves of the current 32-byte chunk of each row.
    a_lo = (v16u8)__msa_ld_b((v16i8*)src_argb0, 0);
    b_lo = (v16u8)__msa_ld_b((v16i8*)src_argb1, 0);
    a_hi = (v16u8)__msa_ld_b((v16i8*)src_argb0, 16);
    b_hi = (v16u8)__msa_ld_b((v16i8*)src_argb1, 16);

    // Operand order matters: minuend comes from src_argb0.
    diff_lo = __msa_subs_u_b(a_lo, b_lo);
    diff_hi = __msa_subs_u_b(a_hi, b_hi);
    ST_UB2(diff_lo, diff_hi, dst_argb, 16);

    // Move on to the next 8 pixels.
    dst_argb += 32;
    src_argb0 += 32;
    src_argb1 += 32;
  }
}
void ARGB4444ToARGBRow_MSA(const uint8* src_argb4444,
uint8* dst_argb,
int width) {
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment