Commit 288bfbef authored by Manojkumar Bhosale

Add remaining MSA-optimized scale row functions

R=fbarchard@google.com
BUG=libyuv:634

Performance Gain (vs C vectorized)
ScaleRowDown2_MSA            - ~22.3x
ScaleRowDown2_Any_MSA        - ~19.9x
ScaleRowDown2Linear_MSA      - ~31.2x
ScaleRowDown2Linear_Any_MSA  - ~29.4x
ScaleRowDown2Box_MSA         - ~20.1x
ScaleRowDown2Box_Any_MSA     - ~19.6x
ScaleRowDown4_MSA            - ~11.7x
ScaleRowDown4_Any_MSA        - ~11.2x
ScaleRowDown4Box_MSA         - ~15.1x
ScaleRowDown4Box_Any_MSA     - ~15.1x
ScaleRowDown38_MSA           - ~1x
ScaleRowDown38_Any_MSA       - ~1x
ScaleRowDown38_2_Box_MSA     - ~1.7x
ScaleRowDown38_2_Box_Any_MSA - ~1.7x
ScaleRowDown38_3_Box_MSA     - ~1.7x
ScaleRowDown38_3_Box_Any_MSA - ~1.7x
ScaleAddRow_MSA              - ~1.2x
ScaleAddRow_Any_MSA          - ~1.15x

Performance Gain (vs C non-vectorized)
ScaleRowDown2_MSA            - ~22.4x
ScaleRowDown2_Any_MSA        - ~19.8x
ScaleRowDown2Linear_MSA      - ~31.6x
ScaleRowDown2Linear_Any_MSA  - ~29.4x
ScaleRowDown2Box_MSA         - ~20.1x
ScaleRowDown2Box_Any_MSA     - ~19.6x
ScaleRowDown4_MSA            - ~11.7x
ScaleRowDown4_Any_MSA        - ~11.2x
ScaleRowDown4Box_MSA         - ~15.1x
ScaleRowDown4Box_Any_MSA     - ~15.1x
ScaleRowDown38_MSA           - ~3.2x
ScaleRowDown38_Any_MSA       - ~3.2x
ScaleRowDown38_2_Box_MSA     - ~2.4x
ScaleRowDown38_2_Box_Any_MSA - ~2.3x
ScaleRowDown38_3_Box_MSA     - ~2.9x
ScaleRowDown38_3_Box_Any_MSA - ~2.8x
ScaleAddRow_MSA              - ~8x
ScaleAddRow_Any_MSA          - ~7.46x

Review-Url: https://codereview.chromium.org/2559683002 .
parent bd108758
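
The new kernels are not called directly; they are selected at runtime inside
libyuv's planar scaler once TestCpuFlag(kCpuHasMSA) reports MSA support. A
minimal usage sketch (the buffer dimensions below are illustrative only):

  #include "libyuv/scale.h"

  // Halve a 640x360 grayscale plane to 320x180. With kFilterBox on a MIPS
  // CPU that reports MSA, ScalePlane() ends up in ScaleRowDown2Box_MSA
  // (or the _Any_ variant when dst_width is not a multiple of 32).
  void HalvePlane(const uint8* src, uint8* dst) {
    ScalePlane(src, 640, 640, 360,  // src plane, stride, width, height
               dst, 320, 320, 180,  // dst plane, stride, width, height
               kFilterBox);
  }
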
include/libyuv/macros_msa.h
@@ -51,30 +51,28 @@
})
#endif // (__mips == 64)
#define SW(val, pdst) \
({ \
uint8_t* pdst_sw_m = (uint8_t*)(pdst); \
uint32_t val_m = (val); \
asm volatile("sw %[val_m], %[pdst_sw_m] \n" \
\
: [pdst_sw_m] "=m"(*pdst_sw_m) \
: [val_m] "r"(val_m)); \
#define SW(val, pdst) \
({ \
uint8_t* pdst_sw_m = (uint8_t*)(pdst); /* NOLINT */ \
uint32_t val_m = (val); \
asm volatile("sw %[val_m], %[pdst_sw_m] \n" \
: [pdst_sw_m] "=m"(*pdst_sw_m) \
: [val_m] "r"(val_m)); \
})
#if (__mips == 64)
#define SD(val, pdst) \
({ \
uint8_t* pdst_sd_m = (uint8_t*)(pdst); \
uint64_t val_m = (val); \
asm volatile("sd %[val_m], %[pdst_sd_m] \n" \
\
: [pdst_sd_m] "=m"(*pdst_sd_m) \
: [val_m] "r"(val_m)); \
#define SD(val, pdst) \
({ \
uint8_t* pdst_sd_m = (uint8_t*)(pdst); /* NOLINT */ \
uint64_t val_m = (val); \
asm volatile("sd %[val_m], %[pdst_sd_m] \n" \
: [pdst_sd_m] "=m"(*pdst_sd_m) \
: [val_m] "r"(val_m)); \
})
#else // !(__mips == 64)
#define SD(val, pdst) \
({ \
uint8_t* pdst_sd_m = (uint8_t*)(pdst); \
uint8_t* pdst_sd_m = (uint8_t*)(pdst); /* NOLINT */ \
uint32_t val0_m, val1_m; \
val0_m = (uint32_t)((val)&0x00000000FFFFFFFF); \
val1_m = (uint32_t)(((val) >> 32) & 0x00000000FFFFFFFF); \
@@ -118,18 +116,18 @@
})
#endif // (__mips == 64)
#define SW(val, pdst) \
({ \
uint8_t* pdst_sw_m = (uint8_t*)(pdst); \
uint32_t val_m = (val); \
asm volatile("usw %[val_m], %[pdst_sw_m] \n" \
: [pdst_sw_m] "=m"(*pdst_sw_m) \
: [val_m] "r"(val_m)); \
#define SW(val, pdst) \
({ \
uint8_t* pdst_sw_m = (uint8_t*)(pdst); /* NOLINT */ \
uint32_t val_m = (val); \
asm volatile("usw %[val_m], %[pdst_sw_m] \n" \
: [pdst_sw_m] "=m"(*pdst_sw_m) \
: [val_m] "r"(val_m)); \
})
#define SD(val, pdst) \
({ \
uint8_t* pdst_sd_m = (uint8_t*)(pdst); \
uint8_t* pdst_sd_m = (uint8_t*)(pdst); /* NOLINT */ \
uint32_t val0_m, val1_m; \
val0_m = (uint32_t)((val)&0x00000000FFFFFFFF); \
val1_m = (uint32_t)(((val) >> 32) & 0x00000000FFFFFFFF); \
@@ -145,6 +143,9 @@
#define ST_B(RTYPE, in, pdst) *((RTYPE*)(pdst)) = (in) /* NOLINT */
#define ST_UB(...) ST_B(v16u8, __VA_ARGS__)
#define ST_H(RTYPE, in, pdst) *((RTYPE*)(pdst)) = (in) /* NOLINT */
#define ST_UH(...) ST_H(v8u16, __VA_ARGS__)
/* Description : Load two vectors with 16 'byte' sized elements
Arguments : Inputs - psrc, stride
Outputs - out0, out1
@@ -186,6 +187,18 @@
}
#define ST_UB4(...) ST_B4(v16u8, __VA_ARGS__)
/* Description : Store vectors of 8 halfword elements with stride
Arguments : Inputs - in0, in1, pdst, stride
Details : Store 8 halfword elements from 'in0' to (pdst)
Store 8 halfword elements from 'in1' to (pdst + stride)
*/
#define ST_H2(RTYPE, in0, in1, pdst, stride) \
{ \
ST_H(RTYPE, in0, (pdst)); \
ST_H(RTYPE, in1, (pdst) + stride); \
}
#define ST_UH2(...) ST_H2(v8u16, __VA_ARGS__)
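
ST_UH2 pairs with the 16-bit accumulators that ScaleAddRow_MSA produces. A
small illustrative use, for MIPS MSA builds only (the helper name below is
hypothetical, not part of the library):

  #include <stdint.h>
  #include "libyuv/macros_msa.h"  // ST_UH2 (and v8u16 via <msa.h>)

  // Hypothetical helper: store two vectors of eight 16-bit sums to dst and
  // dst + stride, where stride is counted in uint16_t elements, exactly as
  // ST_H2 applies it.
  static void StoreTwoSumRows(v8u16 sum0, v8u16 sum1, uint16_t* dst,
                              int stride) {
    ST_UH2(sum0, sum1, dst, stride);  // expands to two ST_UH stores
  }
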
// TODO(fbarchard): Consider using __msa_vshf_b and __msa_ilvr_b directly.
/* Description : Shuffle byte vector elements as per mask vector
Arguments : Inputs - in0, in1, in2, in3, mask0, mask1
include/libyuv/scale_row.h
@@ -106,6 +106,10 @@ extern "C" {
#if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa)
#define HAS_SCALEARGBROWDOWN2_MSA
#define HAS_SCALEARGBROWDOWNEVEN_MSA
#define HAS_SCALEROWDOWN2_MSA
#define HAS_SCALEROWDOWN4_MSA
#define HAS_SCALEROWDOWN38_MSA
#define HAS_SCALEADDROW_MSA
#endif
// Scale ARGB vertically with bilinear interpolation.
@@ -843,6 +847,75 @@ void ScaleRowDown38_3_Box_DSPR2(const uint8* src_ptr,
uint8* dst_ptr,
int dst_width);
void ScaleRowDown2_MSA(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst,
int dst_width);
void ScaleRowDown2Linear_MSA(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst,
int dst_width);
void ScaleRowDown2Box_MSA(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst,
int dst_width);
void ScaleRowDown4_MSA(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst,
int dst_width);
void ScaleRowDown4Box_MSA(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst,
int dst_width);
void ScaleRowDown38_MSA(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst,
int dst_width);
void ScaleRowDown38_2_Box_MSA(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst_ptr,
int dst_width);
void ScaleRowDown38_3_Box_MSA(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst_ptr,
int dst_width);
void ScaleAddRow_MSA(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width);
void ScaleRowDown2_Any_MSA(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst,
int dst_width);
void ScaleRowDown2Linear_Any_MSA(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst,
int dst_width);
void ScaleRowDown2Box_Any_MSA(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst,
int dst_width);
void ScaleRowDown4_Any_MSA(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst,
int dst_width);
void ScaleRowDown4Box_Any_MSA(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst,
int dst_width);
void ScaleRowDown38_Any_MSA(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst,
int dst_width);
void ScaleRowDown38_2_Box_Any_MSA(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst_ptr,
int dst_width);
void ScaleRowDown38_3_Box_Any_MSA(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst_ptr,
int dst_width);
void ScaleAddRow_Any_MSA(const uint8_t* src_ptr,
uint16_t* dst_ptr,
int src_width);
#ifdef __cplusplus
} // extern "C"
} // namespace libyuv
source/scale.cc
@@ -107,6 +107,21 @@ static void ScalePlaneDown2(int src_width,
ScaleRowDown2 = filtering ? ScaleRowDown2Box_DSPR2 : ScaleRowDown2_DSPR2;
}
#endif
#if defined(HAS_SCALEROWDOWN2_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
ScaleRowDown2 =
filtering == kFilterNone
? ScaleRowDown2_Any_MSA
: (filtering == kFilterLinear ? ScaleRowDown2Linear_Any_MSA
: ScaleRowDown2Box_Any_MSA);
if (IS_ALIGNED(dst_width, 32)) {
ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_MSA
: (filtering == kFilterLinear
? ScaleRowDown2Linear_MSA
: ScaleRowDown2Box_MSA);
}
}
#endif
if (filtering == kFilterLinear) {
src_stride = 0;
@@ -232,6 +247,15 @@ static void ScalePlaneDown4(int src_width,
ScaleRowDown4 = filtering ? ScaleRowDown4Box_DSPR2 : ScaleRowDown4_DSPR2;
}
#endif
#if defined(HAS_SCALEROWDOWN4_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
ScaleRowDown4 =
filtering ? ScaleRowDown4Box_Any_MSA : ScaleRowDown4_Any_MSA;
if (IS_ALIGNED(dst_width, 16)) {
ScaleRowDown4 = filtering ? ScaleRowDown4Box_MSA : ScaleRowDown4_MSA;
}
}
#endif
if (filtering == kFilterLinear) {
src_stride = 0;
@@ -567,6 +591,26 @@ static void ScalePlaneDown38(int src_width,
}
}
#endif
#if defined(HAS_SCALEROWDOWN38_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
if (!filtering) {
ScaleRowDown38_3 = ScaleRowDown38_Any_MSA;
ScaleRowDown38_2 = ScaleRowDown38_Any_MSA;
} else {
ScaleRowDown38_3 = ScaleRowDown38_3_Box_Any_MSA;
ScaleRowDown38_2 = ScaleRowDown38_2_Box_Any_MSA;
}
if (dst_width % 12 == 0) {
if (!filtering) {
ScaleRowDown38_3 = ScaleRowDown38_MSA;
ScaleRowDown38_2 = ScaleRowDown38_MSA;
} else {
ScaleRowDown38_3 = ScaleRowDown38_3_Box_MSA;
ScaleRowDown38_2 = ScaleRowDown38_2_Box_MSA;
}
}
}
#endif
for (y = 0; y < dst_height - 2; y += 3) {
ScaleRowDown38_3(src_ptr, filter_stride, dst_ptr, dst_width);
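
(The dst_width % 12 == 0 check mirrors the MSA kernel's granularity: the
down-3/8 path works in groups of 12 destination pixels, i.e. 12 * 8/3 = 32
source pixels per group, which is also why the _Any_ wrappers further below
use a mask of 11.)
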
@@ -842,6 +886,14 @@ static void ScalePlaneBox(int src_width,
}
}
#endif
#if defined(HAS_SCALEADDROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
ScaleAddRow = ScaleAddRow_Any_MSA;
if (IS_ALIGNED(src_width, 16)) {
ScaleAddRow = ScaleAddRow_MSA;
}
}
#endif
for (j = 0; j < dst_height; ++j) {
int boxheight;
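
ScaleAddRow, dispatched just above, accumulates one source row into 16-bit
column sums that ScalePlaneBox later divides by the box height. The row the
MSA kernel has to match is essentially the loop below (a paraphrase for
reference, not the verbatim libyuv C source):

  #include <stdint.h>

  // Add each source byte into the running 16-bit sum for its column. The
  // MSA kernel produces the same result 16 pixels at a time, hence the
  // IS_ALIGNED(src_width, 16) check above.
  void ScaleAddRow_Reference(const uint8_t* src_ptr, uint16_t* dst_ptr,
                             int src_width) {
    int x;
    for (x = 0; x < src_width; ++x) {
      dst_ptr[x] += src_ptr[x];
    }
  }
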
source/scale_any.cc
@@ -135,6 +135,21 @@ SDODD(ScaleRowDown2Box_Odd_NEON,
1,
15)
#endif
#ifdef HAS_SCALEROWDOWN2_MSA
SDANY(ScaleRowDown2_Any_MSA, ScaleRowDown2_MSA, ScaleRowDown2_C, 2, 1, 31)
SDANY(ScaleRowDown2Linear_Any_MSA,
ScaleRowDown2Linear_MSA,
ScaleRowDown2Linear_C,
2,
1,
31)
SDANY(ScaleRowDown2Box_Any_MSA,
ScaleRowDown2Box_MSA,
ScaleRowDown2Box_C,
2,
1,
31)
#endif
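
The _Any_ wrappers let the MSA kernels serve widths that are not a multiple
of the vector step: the trailing SDANY parameters are the source/dest width
factor (2), bytes per pixel (1) and the alignment mask (31, i.e. a 32-pixel
step). A simplified sketch of what SDANY(ScaleRowDown2_Any_MSA, ...) expands
to with those parameters (not the verbatim libyuv macro):

  #include "libyuv/scale_row.h"  // declares ScaleRowDown2_MSA, ScaleRowDown2_C

  // Run the MSA kernel on the largest multiple of 32 output pixels, then
  // let the C kernel finish the remaining 0..31 pixels.
  void ScaleRowDown2_Any_MSA_sketch(const uint8_t* src_ptr,
                                    ptrdiff_t src_stride,
                                    uint8_t* dst_ptr,
                                    int dst_width) {
    int r = dst_width & 31;  // remainder (MASK = 31)
    int n = dst_width - r;   // MSA-sized portion
    if (n > 0) {
      ScaleRowDown2_MSA(src_ptr, src_stride, dst_ptr, n);
    }
    // FACTOR = 2 source pixels per output pixel, BPP = 1.
    ScaleRowDown2_C(src_ptr + n * 2, src_stride, dst_ptr + n, r);
  }
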
#ifdef HAS_SCALEROWDOWN4_SSSE3
SDANY(ScaleRowDown4_Any_SSSE3, ScaleRowDown4_SSSE3, ScaleRowDown4_C, 4, 1, 7)
SDANY(ScaleRowDown4Box_Any_SSSE3,
@@ -162,6 +177,15 @@ SDANY(ScaleRowDown4Box_Any_NEON,
1,
7)
#endif
#ifdef HAS_SCALEROWDOWN4_MSA
SDANY(ScaleRowDown4_Any_MSA, ScaleRowDown4_MSA, ScaleRowDown4_C, 4, 1, 15)
SDANY(ScaleRowDown4Box_Any_MSA,
ScaleRowDown4Box_MSA,
ScaleRowDown4Box_C,
4,
1,
15)
#endif
#ifdef HAS_SCALEROWDOWN34_SSSE3
SDANY(ScaleRowDown34_Any_SSSE3,
ScaleRowDown34_SSSE3,
@@ -242,6 +266,26 @@ SDANY(ScaleRowDown38_2_Box_Any_NEON,
1,
11)
#endif
#ifdef HAS_SCALEROWDOWN38_MSA
SDANY(ScaleRowDown38_Any_MSA,
ScaleRowDown38_MSA,
ScaleRowDown38_C,
8 / 3,
1,
11)
SDANY(ScaleRowDown38_3_Box_Any_MSA,
ScaleRowDown38_3_Box_MSA,
ScaleRowDown38_3_Box_C,
8 / 3,
1,
11)
SDANY(ScaleRowDown38_2_Box_Any_MSA,
ScaleRowDown38_2_Box_MSA,
ScaleRowDown38_2_Box_C,
8 / 3,
1,
11)
#endif
#ifdef HAS_SCALEARGBROWDOWN2_SSE2
SDANY(ScaleARGBRowDown2_Any_SSE2,
@@ -374,6 +418,9 @@ SAANY(ScaleAddRow_Any_AVX2, ScaleAddRow_AVX2, ScaleAddRow_C, 31)
#ifdef HAS_SCALEADDROW_NEON
SAANY(ScaleAddRow_Any_NEON, ScaleAddRow_NEON, ScaleAddRow_C, 15)
#endif
#ifdef HAS_SCALEADDROW_MSA
SAANY(ScaleAddRow_Any_MSA, ScaleAddRow_MSA, ScaleAddRow_C, 15)
#endif
#undef SAANY
#ifdef __cplusplus
The remaining diff in this commit is collapsed.