Commit 83f460be authored by Manojkumar Bhosale

Add MSA optimized ARGB Multiply/Add/Subtract row functions

R=fbarchard@google.com
BUG=libyuv:634

Performance Gain (vs C vectorized)
ARGBMultiplyRow_MSA       - 1.4x
ARGBAddRow_MSA            - 8.6x
ARGBSubtractRow_MSA       - 8.6x

ARGBMultiplyRow_Any_MSA   - 1.35x
ARGBAddRow_Any_MSA        - 7.3x
ARGBSubtractRow_Any_MSA   - 7.2x

Performance Gain (vs C non-vectorized)
ARGBMultiplyRow_MSA       - 4.4x
ARGBAddRow_MSA            - 27x
ARGBSubtractRow_MSA       - 22x

ARGBMultiplyRow_Any_MSA   - 3.5x
ARGBAddRow_Any_MSA        - 23x
ARGBSubtractRow_Any_MSA   - 18x

Review URL: https://codereview.chromium.org/2529983002 .
parent da0c29da
...@@ -390,6 +390,9 @@ extern "C" { ...@@ -390,6 +390,9 @@ extern "C" {
#define HAS_ARGBTOARGB1555ROW_MSA #define HAS_ARGBTOARGB1555ROW_MSA
#define HAS_ARGBTOARGB4444ROW_MSA #define HAS_ARGBTOARGB4444ROW_MSA
#define HAS_ARGBTOUV444ROW_MSA #define HAS_ARGBTOUV444ROW_MSA
#define HAS_ARGBMULTIPLYROW_MSA
#define HAS_ARGBADDROW_MSA
#define HAS_ARGBSUBTRACTROW_MSA
#endif #endif
#if defined(_MSC_VER) && !defined(__CLR_VER) && !defined(__clang__) #if defined(_MSC_VER) && !defined(__CLR_VER) && !defined(__clang__)
...@@ -1809,6 +1812,14 @@ void ARGBMultiplyRow_Any_NEON(const uint8* src_argb, ...@@ -1809,6 +1812,14 @@ void ARGBMultiplyRow_Any_NEON(const uint8* src_argb,
const uint8* src_argb1, const uint8* src_argb1,
uint8* dst_argb, uint8* dst_argb,
int width); int width);
// MSA versions of the ARGB multiply row function.
// ARGBMultiplyRow_MSA steps through the row 4 pixels (16 bytes) at a time;
// ARGBMultiplyRow_Any_MSA is the wrapper generated with ANY21 that
// ARGBMultiply() uses when width is not aligned to 4.
void ARGBMultiplyRow_MSA(const uint8* src_argb,
const uint8* src_argb1,
uint8* dst_argb,
int width);
void ARGBMultiplyRow_Any_MSA(const uint8* src_argb,
const uint8* src_argb1,
uint8* dst_argb,
int width);
// ARGB add images. // ARGB add images.
void ARGBAddRow_C(const uint8* src_argb, void ARGBAddRow_C(const uint8* src_argb,
...@@ -1839,6 +1850,14 @@ void ARGBAddRow_Any_NEON(const uint8* src_argb, ...@@ -1839,6 +1850,14 @@ void ARGBAddRow_Any_NEON(const uint8* src_argb,
const uint8* src_argb1, const uint8* src_argb1,
uint8* dst_argb, uint8* dst_argb,
int width); int width);
// MSA versions of the ARGB add row function.
// ARGBAddRow_MSA steps through the row 8 pixels (32 bytes) at a time;
// ARGBAddRow_Any_MSA is the wrapper generated with ANY21 that ARGBAdd() uses
// when width is not aligned to 8.
void ARGBAddRow_MSA(const uint8* src_argb,
const uint8* src_argb1,
uint8* dst_argb,
int width);
void ARGBAddRow_Any_MSA(const uint8* src_argb,
const uint8* src_argb1,
uint8* dst_argb,
int width);
// ARGB subtract images. Same API as Blend, but these require // ARGB subtract images. Same API as Blend, but these require
// pointer and width alignment for SSE2. // pointer and width alignment for SSE2.
...@@ -1870,6 +1889,14 @@ void ARGBSubtractRow_Any_NEON(const uint8* src_argb, ...@@ -1870,6 +1889,14 @@ void ARGBSubtractRow_Any_NEON(const uint8* src_argb,
const uint8* src_argb1, const uint8* src_argb1,
uint8* dst_argb, uint8* dst_argb,
int width); int width);
// MSA versions of the ARGB subtract row function.
// ARGBSubtractRow_MSA steps through the row 8 pixels (32 bytes) at a time;
// ARGBSubtractRow_Any_MSA is the wrapper generated with ANY21 that
// ARGBSubtract() uses when width is not aligned to 8.
void ARGBSubtractRow_MSA(const uint8* src_argb,
const uint8* src_argb1,
uint8* dst_argb,
int width);
void ARGBSubtractRow_Any_MSA(const uint8* src_argb,
const uint8* src_argb1,
uint8* dst_argb,
int width);
void ARGBToRGB24Row_Any_SSSE3(const uint8* src_argb, uint8* dst_rgb, int width); void ARGBToRGB24Row_Any_SSSE3(const uint8* src_argb, uint8* dst_rgb, int width);
void ARGBToRAWRow_Any_SSSE3(const uint8* src_argb, uint8* dst_rgb, int width); void ARGBToRAWRow_Any_SSSE3(const uint8* src_argb, uint8* dst_rgb, int width);
......
...@@ -1115,6 +1115,14 @@ int ARGBMultiply(const uint8* src_argb0, ...@@ -1115,6 +1115,14 @@ int ARGBMultiply(const uint8* src_argb0,
} }
} }
#endif #endif
#if defined(HAS_ARGBMULTIPLYROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
ARGBMultiplyRow = ARGBMultiplyRow_Any_MSA;
if (IS_ALIGNED(width, 4)) {
ARGBMultiplyRow = ARGBMultiplyRow_MSA;
}
}
#endif
// Multiply plane // Multiply plane
for (y = 0; y < height; ++y) { for (y = 0; y < height; ++y) {
...@@ -1184,6 +1192,14 @@ int ARGBAdd(const uint8* src_argb0, ...@@ -1184,6 +1192,14 @@ int ARGBAdd(const uint8* src_argb0,
} }
} }
#endif #endif
#if defined(HAS_ARGBADDROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
ARGBAddRow = ARGBAddRow_Any_MSA;
if (IS_ALIGNED(width, 8)) {
ARGBAddRow = ARGBAddRow_MSA;
}
}
#endif
// Add plane // Add plane
for (y = 0; y < height; ++y) { for (y = 0; y < height; ++y) {
...@@ -1248,6 +1264,14 @@ int ARGBSubtract(const uint8* src_argb0, ...@@ -1248,6 +1264,14 @@ int ARGBSubtract(const uint8* src_argb0,
} }
} }
#endif #endif
#if defined(HAS_ARGBSUBTRACTROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
ARGBSubtractRow = ARGBSubtractRow_Any_MSA;
if (IS_ALIGNED(width, 8)) {
ARGBSubtractRow = ARGBSubtractRow_MSA;
}
}
#endif
// Subtract plane // Subtract plane
for (y = 0; y < height; ++y) { for (y = 0; y < height; ++y) {
......
...@@ -234,6 +234,15 @@ ANY21(ARGBAddRow_Any_NEON, ARGBAddRow_NEON, 0, 4, 4, 4, 7) ...@@ -234,6 +234,15 @@ ANY21(ARGBAddRow_Any_NEON, ARGBAddRow_NEON, 0, 4, 4, 4, 7)
#ifdef HAS_ARGBSUBTRACTROW_NEON #ifdef HAS_ARGBSUBTRACTROW_NEON
ANY21(ARGBSubtractRow_Any_NEON, ARGBSubtractRow_NEON, 0, 4, 4, 4, 7) ANY21(ARGBSubtractRow_Any_NEON, ARGBSubtractRow_NEON, 0, 4, 4, 4, 7)
#endif #endif
// "Any width" wrappers around the two-source MSA ARGB row functions.
// NOTE(review): the final ANY21 argument (3 or 7) looks like a width mask
// matching the 4- and 8-pixel steps of the wrapped MSA loops - confirm against
// the ANY21 macro definition.
#ifdef HAS_ARGBMULTIPLYROW_MSA
ANY21(ARGBMultiplyRow_Any_MSA, ARGBMultiplyRow_MSA, 0, 4, 4, 4, 3)
#endif
#ifdef HAS_ARGBADDROW_MSA
ANY21(ARGBAddRow_Any_MSA, ARGBAddRow_MSA, 0, 4, 4, 4, 7)
#endif
#ifdef HAS_ARGBSUBTRACTROW_MSA
ANY21(ARGBSubtractRow_Any_MSA, ARGBSubtractRow_MSA, 0, 4, 4, 4, 7)
#endif
#ifdef HAS_SOBELROW_SSE2 #ifdef HAS_SOBELROW_SSE2
ANY21(SobelRow_Any_SSE2, SobelRow_SSE2, 0, 1, 1, 4, 15) ANY21(SobelRow_Any_SSE2, SobelRow_SSE2, 0, 1, 1, 4, 15)
#endif #endif
......
...@@ -957,6 +957,87 @@ void ARGBToUV444Row_MSA(const uint8* src_argb, ...@@ -957,6 +957,87 @@ void ARGBToUV444Row_MSA(const uint8* src_argb,
} }
} }
// Multiplies two rows of ARGB pixels channel by channel using MSA.
//
// For every byte pair (a from src_argb0, b from src_argb1) the result byte is
// the low 8 bits of ((a * 0x0101) * b) >> 16:
//   - a is widened by interleaving it with itself (a in both halves of a
//     16-bit lane),
//   - b is widened by interleaving it with zero,
//   - both are widened again to 32 bits, multiplied, shifted right by 16 and
//     packed back down to bytes.
//
// The loop consumes 16 bytes (4 pixels) from each source and writes 16 bytes
// per iteration, so a width that is not a multiple of 4 still touches a whole
// trailing group of 4 pixels.
void ARGBMultiplyRow_MSA(const uint8* src_argb0,
                         const uint8* src_argb1,
                         uint8* dst_argb,
                         int width) {
  int remaining;
  v16u8 pix_a, pix_b, result;
  v8u16 a_right, a_left, b_right, b_left;
  v8u16 packed_right, packed_left;
  v4u32 prod0, prod1, prod2, prod3;
  v8i16 zeros = {0};

  for (remaining = width; remaining > 0; remaining -= 4) {
    // Fetch 4 pixels from each source row and step the read pointers.
    pix_a = (v16u8)__msa_ld_b((v16i8*)src_argb0, 0);
    pix_b = (v16u8)__msa_ld_b((v16i8*)src_argb1, 0);
    src_argb0 += 16;
    src_argb1 += 16;

    // 8 -> 16 bits: duplicate each byte of the first source, zero-extend each
    // byte of the second source.
    a_right = (v8u16)__msa_ilvr_b((v16i8)pix_a, (v16i8)pix_a);
    a_left = (v8u16)__msa_ilvl_b((v16i8)pix_a, (v16i8)pix_a);
    b_right = (v8u16)__msa_ilvr_b((v16i8)zeros, (v16i8)pix_b);
    b_left = (v8u16)__msa_ilvl_b((v16i8)zeros, (v16i8)pix_b);

    // 16 -> 32 bits (zero-extended) and multiply lane by lane.
    prod0 = (v4u32)__msa_ilvr_h(zeros, (v8i16)a_right) *
            (v4u32)__msa_ilvr_h(zeros, (v8i16)b_right);
    prod1 = (v4u32)__msa_ilvl_h(zeros, (v8i16)a_right) *
            (v4u32)__msa_ilvl_h(zeros, (v8i16)b_right);
    prod2 = (v4u32)__msa_ilvr_h(zeros, (v8i16)a_left) *
            (v4u32)__msa_ilvr_h(zeros, (v8i16)b_left);
    prod3 = (v4u32)__msa_ilvl_h(zeros, (v8i16)a_left) *
            (v4u32)__msa_ilvl_h(zeros, (v8i16)b_left);

    // Keep the upper part of each product.
    prod0 = (v4u32)__msa_srai_w((v4i32)prod0, 16);
    prod1 = (v4u32)__msa_srai_w((v4i32)prod1, 16);
    prod2 = (v4u32)__msa_srai_w((v4i32)prod2, 16);
    prod3 = (v4u32)__msa_srai_w((v4i32)prod3, 16);

    // Narrow 32 -> 16 -> 8 bits by keeping the even (low) elements.
    packed_right = (v8u16)__msa_pckev_h((v8i16)prod1, (v8i16)prod0);
    packed_left = (v8u16)__msa_pckev_h((v8i16)prod3, (v8i16)prod2);
    result = (v16u8)__msa_pckev_b((v16i8)packed_left, (v16i8)packed_right);

    ST_UB(result, dst_argb);
    dst_argb += 16;
  }
}
// Adds two rows of ARGB pixels byte by byte using the MSA unsigned
// saturating add (__msa_adds_u_b).
//
// Each iteration loads two 16-byte vectors (8 pixels) from each source row
// and stores two 16-byte vectors to dst_argb, so a width that is not a
// multiple of 8 still touches a whole trailing group of 8 pixels.
void ARGBAddRow_MSA(const uint8* src_argb0,
                    const uint8* src_argb1,
                    uint8* dst_argb,
                    int width) {
  int remaining;
  v16u8 a_first, a_second, b_first, b_second, sum_first, sum_second;

  for (remaining = width; remaining > 0; remaining -= 8) {
    // First 4 pixels of the group.
    a_first = (v16u8)__msa_ld_b((v16i8*)src_argb0, 0);
    b_first = (v16u8)__msa_ld_b((v16i8*)src_argb1, 0);
    // Next 4 pixels of the group.
    a_second = (v16u8)__msa_ld_b((v16i8*)src_argb0, 16);
    b_second = (v16u8)__msa_ld_b((v16i8*)src_argb1, 16);
    src_argb0 += 32;
    src_argb1 += 32;

    sum_first = __msa_adds_u_b(a_first, b_first);
    sum_second = __msa_adds_u_b(a_second, b_second);

    ST_UB2(sum_first, sum_second, dst_argb, 16);
    dst_argb += 32;
  }
}
// Subtracts the src_argb1 row from the src_argb0 row byte by byte using the
// MSA unsigned saturating subtract (__msa_subs_u_b).
//
// Each iteration loads two 16-byte vectors (8 pixels) from each source row
// and stores two 16-byte vectors to dst_argb, so a width that is not a
// multiple of 8 still touches a whole trailing group of 8 pixels.
void ARGBSubtractRow_MSA(const uint8* src_argb0,
                         const uint8* src_argb1,
                         uint8* dst_argb,
                         int width) {
  int remaining;
  v16u8 a_first, a_second, b_first, b_second, diff_first, diff_second;

  for (remaining = width; remaining > 0; remaining -= 8) {
    // First 4 pixels of the group.
    a_first = (v16u8)__msa_ld_b((v16i8*)src_argb0, 0);
    b_first = (v16u8)__msa_ld_b((v16i8*)src_argb1, 0);
    // Next 4 pixels of the group.
    a_second = (v16u8)__msa_ld_b((v16i8*)src_argb0, 16);
    b_second = (v16u8)__msa_ld_b((v16i8*)src_argb1, 16);
    src_argb0 += 32;
    src_argb1 += 32;

    diff_first = __msa_subs_u_b(a_first, b_first);
    diff_second = __msa_subs_u_b(a_second, b_second);

    ST_UB2(diff_first, diff_second, dst_argb, 16);
    dst_argb += 32;
  }
}
void ARGB4444ToARGBRow_MSA(const uint8* src_argb4444, void ARGB4444ToARGBRow_MSA(const uint8* src_argb4444,
uint8* dst_argb, uint8* dst_argb,
int width) { int width) {
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment