Commit 288bfbef authored by Manojkumar Bhosale's avatar Manojkumar Bhosale

Add MSA optimized remaining scale row functions

R=fbarchard@google.com
BUG=libyuv:634

Performance Gain (vs C vectorized)
ScaleRowDown2_MSA            - ~22.3x
ScaleRowDown2_Any_MSA        - ~19.9x
ScaleRowDown2Linear_MSA      - ~31.2x
ScaleRowDown2Linear_Any_MSA  - ~29.4x
ScaleRowDown2Box_MSA         - ~20.1x
ScaleRowDown2Box_Any_MSA     - ~19.6x
ScaleRowDown4_MSA            - ~11.7x
ScaleRowDown4_Any_MSA        - ~11.2x
ScaleRowDown4Box_MSA         - ~15.1x
ScaleRowDown4Box_Any_MSA     - ~15.1x
ScaleRowDown38_MSA           - ~1x
ScaleRowDown38_Any_MSA       - ~1x
ScaleRowDown38_2_Box_MSA     - ~1.7x
ScaleRowDown38_2_Box_Any_MSA - ~1.7x
ScaleRowDown38_3_Box_MSA     - ~1.7x
ScaleRowDown38_3_Box_Any_MSA - ~1.7x
ScaleAddRow_MSA              - ~1.2x
ScaleAddRow_Any_MSA          - ~1.15x

Performance Gain (vs C non-vectorized)
ScaleRowDown2_MSA            - ~22.4x
ScaleRowDown2_Any_MSA        - ~19.8x
ScaleRowDown2Linear_MSA      - ~31.6x
ScaleRowDown2Linear_Any_MSA  - ~29.4x
ScaleRowDown2Box_MSA         - ~20.1x
ScaleRowDown2Box_Any_MSA     - ~19.6x
ScaleRowDown4_MSA            - ~11.7x
ScaleRowDown4_Any_MSA        - ~11.2x
ScaleRowDown4Box_MSA         - ~15.1x
ScaleRowDown4Box_Any_MSA     - ~15.1x
ScaleRowDown38_MSA           - ~3.2x
ScaleRowDown38_Any_MSA       - ~3.2x
ScaleRowDown38_2_Box_MSA     - ~2.4x
ScaleRowDown38_2_Box_Any_MSA - ~2.3x
ScaleRowDown38_3_Box_MSA     - ~2.9x
ScaleRowDown38_3_Box_Any_MSA - ~2.8x
ScaleAddRow_MSA              - ~8x
ScaleAddRow_Any_MSA          - ~7.46x

Review-Url: https://codereview.chromium.org/2559683002 .
parent bd108758
...@@ -51,30 +51,28 @@ ...@@ -51,30 +51,28 @@
}) })
#endif // (__mips == 64) #endif // (__mips == 64)
#define SW(val, pdst) \ #define SW(val, pdst) \
({ \ ({ \
uint8_t* pdst_sw_m = (uint8_t*)(pdst); \ uint8_t* pdst_sw_m = (uint8_t*)(pdst); /* NOLINT */ \
uint32_t val_m = (val); \ uint32_t val_m = (val); \
asm volatile("sw %[val_m], %[pdst_sw_m] \n" \ asm volatile("sw %[val_m], %[pdst_sw_m] \n" \
\ : [pdst_sw_m] "=m"(*pdst_sw_m) \
: [pdst_sw_m] "=m"(*pdst_sw_m) \ : [val_m] "r"(val_m)); \
: [val_m] "r"(val_m)); \
}) })
#if (__mips == 64) #if (__mips == 64)
#define SD(val, pdst) \ #define SD(val, pdst) \
({ \ ({ \
uint8_t* pdst_sd_m = (uint8_t*)(pdst); \ uint8_t* pdst_sd_m = (uint8_t*)(pdst); /* NOLINT */ \
uint64_t val_m = (val); \ uint64_t val_m = (val); \
asm volatile("sd %[val_m], %[pdst_sd_m] \n" \ asm volatile("sd %[val_m], %[pdst_sd_m] \n" \
\ : [pdst_sd_m] "=m"(*pdst_sd_m) \
: [pdst_sd_m] "=m"(*pdst_sd_m) \ : [val_m] "r"(val_m)); \
: [val_m] "r"(val_m)); \
}) })
#else // !(__mips == 64) #else // !(__mips == 64)
#define SD(val, pdst) \ #define SD(val, pdst) \
({ \ ({ \
uint8_t* pdst_sd_m = (uint8_t*)(pdst); \ uint8_t* pdst_sd_m = (uint8_t*)(pdst); /* NOLINT */ \
uint32_t val0_m, val1_m; \ uint32_t val0_m, val1_m; \
val0_m = (uint32_t)((val)&0x00000000FFFFFFFF); \ val0_m = (uint32_t)((val)&0x00000000FFFFFFFF); \
val1_m = (uint32_t)(((val) >> 32) & 0x00000000FFFFFFFF); \ val1_m = (uint32_t)(((val) >> 32) & 0x00000000FFFFFFFF); \
...@@ -118,18 +116,18 @@ ...@@ -118,18 +116,18 @@
}) })
#endif // (__mips == 64) #endif // (__mips == 64)
#define SW(val, pdst) \ #define SW(val, pdst) \
({ \ ({ \
uint8_t* pdst_sw_m = (uint8_t*)(pdst); \ uint8_t* pdst_sw_m = (uint8_t*)(pdst); /* NOLINT */ \
uint32_t val_m = (val); \ uint32_t val_m = (val); \
asm volatile("usw %[val_m], %[pdst_sw_m] \n" \ asm volatile("usw %[val_m], %[pdst_sw_m] \n" \
: [pdst_sw_m] "=m"(*pdst_sw_m) \ : [pdst_sw_m] "=m"(*pdst_sw_m) \
: [val_m] "r"(val_m)); \ : [val_m] "r"(val_m)); \
}) })
#define SD(val, pdst) \ #define SD(val, pdst) \
({ \ ({ \
uint8_t* pdst_sd_m = (uint8_t*)(pdst); \ uint8_t* pdst_sd_m = (uint8_t*)(pdst); /* NOLINT */ \
uint32_t val0_m, val1_m; \ uint32_t val0_m, val1_m; \
val0_m = (uint32_t)((val)&0x00000000FFFFFFFF); \ val0_m = (uint32_t)((val)&0x00000000FFFFFFFF); \
val1_m = (uint32_t)(((val) >> 32) & 0x00000000FFFFFFFF); \ val1_m = (uint32_t)(((val) >> 32) & 0x00000000FFFFFFFF); \
...@@ -145,6 +143,9 @@ ...@@ -145,6 +143,9 @@
#define ST_B(RTYPE, in, pdst) *((RTYPE*)(pdst)) = (in) /* NOLINT */ #define ST_B(RTYPE, in, pdst) *((RTYPE*)(pdst)) = (in) /* NOLINT */
#define ST_UB(...) ST_B(v16u8, __VA_ARGS__) #define ST_UB(...) ST_B(v16u8, __VA_ARGS__)
// Store one vector register to memory: writes 'in' through 'pdst' reinterpreted
// as RTYPE*. The cast intentionally ignores alignment/aliasing lint warnings.
#define ST_H(RTYPE, in, pdst) *((RTYPE*)(pdst)) = (in) /* NOLINT */
#define ST_UH(...) ST_H(v8u16, __VA_ARGS__)  // unsigned-halfword (v8u16) variant
/* Description : Load two vectors with 16 'byte' sized elements /* Description : Load two vectors with 16 'byte' sized elements
Arguments : Inputs - psrc, stride Arguments : Inputs - psrc, stride
Outputs - out0, out1 Outputs - out0, out1
...@@ -186,6 +187,18 @@ ...@@ -186,6 +187,18 @@
} }
#define ST_UB4(...) ST_B4(v16u8, __VA_ARGS__) #define ST_UB4(...) ST_B4(v16u8, __VA_ARGS__)
/* Description : Store vectors of 8 halfword elements with stride
   Arguments   : Inputs - in0, in1, pdst, stride
   Details     : Store 8 halfword elements from 'in0' to (pdst)
                 Store 8 halfword elements from 'in1' to (pdst + stride)
   Note        : 'stride' is parenthesized in the expansion so that argument
                 expressions such as 'a ? b : c' bind as a whole instead of
                 being split by the surrounding '+'.
*/
#define ST_H2(RTYPE, in0, in1, pdst, stride) \
  {                                          \
    ST_H(RTYPE, in0, (pdst));                \
    ST_H(RTYPE, in1, (pdst) + (stride));     \
  }
#define ST_UH2(...) ST_H2(v8u16, __VA_ARGS__)
// TODO(fbarchard): Consider using __msa_vshf_b and __msa_ilvr_b directly. // TODO(fbarchard): Consider using __msa_vshf_b and __msa_ilvr_b directly.
/* Description : Shuffle byte vector elements as per mask vector /* Description : Shuffle byte vector elements as per mask vector
Arguments : Inputs - in0, in1, in2, in3, mask0, mask1 Arguments : Inputs - in0, in1, in2, in3, mask0, mask1
......
...@@ -106,6 +106,10 @@ extern "C" { ...@@ -106,6 +106,10 @@ extern "C" {
#if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa) #if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa)
#define HAS_SCALEARGBROWDOWN2_MSA #define HAS_SCALEARGBROWDOWN2_MSA
#define HAS_SCALEARGBROWDOWNEVEN_MSA #define HAS_SCALEARGBROWDOWNEVEN_MSA
// Feature macros advertising the MSA (MIPS SIMD Architecture) row-scale
// kernels; checked via #if defined(...) at the dispatch sites below.
#define HAS_SCALEROWDOWN2_MSA
#define HAS_SCALEROWDOWN4_MSA
#define HAS_SCALEROWDOWN38_MSA
#define HAS_SCALEADDROW_MSA
#endif #endif
// Scale ARGB vertically with bilinear interpolation. // Scale ARGB vertically with bilinear interpolation.
...@@ -843,6 +847,75 @@ void ScaleRowDown38_3_Box_DSPR2(const uint8* src_ptr, ...@@ -843,6 +847,75 @@ void ScaleRowDown38_3_Box_DSPR2(const uint8* src_ptr,
uint8* dst_ptr, uint8* dst_ptr,
int dst_width); int dst_width);
// MSA fast-path row scalers. Each consumes one (or, for the Box/Linear
// variants, two via src_stride) source row and produces one destination row;
// callers select these only when dst_width meets the kernel's width multiple
// (e.g. IS_ALIGNED(dst_width, 32) for the /2 kernels — see the dispatch code).
void ScaleRowDown2_MSA(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst,
int dst_width);
void ScaleRowDown2Linear_MSA(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst,
int dst_width);
void ScaleRowDown2Box_MSA(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst,
int dst_width);
void ScaleRowDown4_MSA(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst,
int dst_width);
void ScaleRowDown4Box_MSA(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst,
int dst_width);
void ScaleRowDown38_MSA(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst,
int dst_width);
void ScaleRowDown38_2_Box_MSA(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst_ptr,
int dst_width);
void ScaleRowDown38_3_Box_MSA(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst_ptr,
int dst_width);
// Accumulates a source row into a uint16 row buffer (box-filter vertical sum).
void ScaleAddRow_MSA(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width);
// '_Any_' variants: handle arbitrary widths by running the MSA kernel on the
// aligned portion and a C fallback on the remainder (generated by the
// SDANY/SAANY wrapper macros in scale_any.cc).
void ScaleRowDown2_Any_MSA(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst,
int dst_width);
void ScaleRowDown2Linear_Any_MSA(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst,
int dst_width);
void ScaleRowDown2Box_Any_MSA(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst,
int dst_width);
void ScaleRowDown4_Any_MSA(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst,
int dst_width);
void ScaleRowDown4Box_Any_MSA(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst,
int dst_width);
void ScaleRowDown38_Any_MSA(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst,
int dst_width);
void ScaleRowDown38_2_Box_Any_MSA(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst_ptr,
int dst_width);
void ScaleRowDown38_3_Box_Any_MSA(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst_ptr,
int dst_width);
void ScaleAddRow_Any_MSA(const uint8_t* src_ptr,
uint16_t* dst_ptr,
int src_width);
#ifdef __cplusplus #ifdef __cplusplus
} // extern "C" } // extern "C"
} // namespace libyuv } // namespace libyuv
......
...@@ -107,6 +107,21 @@ static void ScalePlaneDown2(int src_width, ...@@ -107,6 +107,21 @@ static void ScalePlaneDown2(int src_width,
ScaleRowDown2 = filtering ? ScaleRowDown2Box_DSPR2 : ScaleRowDown2_DSPR2; ScaleRowDown2 = filtering ? ScaleRowDown2Box_DSPR2 : ScaleRowDown2_DSPR2;
} }
#endif #endif
#if defined(HAS_SCALEROWDOWN2_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
ScaleRowDown2 =
filtering == kFilterNone
? ScaleRowDown2_Any_MSA
: (filtering == kFilterLinear ? ScaleRowDown2Linear_Any_MSA
: ScaleRowDown2Box_Any_MSA);
if (IS_ALIGNED(dst_width, 32)) {
ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_MSA
: (filtering == kFilterLinear
? ScaleRowDown2Linear_MSA
: ScaleRowDown2Box_MSA);
}
}
#endif
if (filtering == kFilterLinear) { if (filtering == kFilterLinear) {
src_stride = 0; src_stride = 0;
...@@ -232,6 +247,15 @@ static void ScalePlaneDown4(int src_width, ...@@ -232,6 +247,15 @@ static void ScalePlaneDown4(int src_width,
ScaleRowDown4 = filtering ? ScaleRowDown4Box_DSPR2 : ScaleRowDown4_DSPR2; ScaleRowDown4 = filtering ? ScaleRowDown4Box_DSPR2 : ScaleRowDown4_DSPR2;
} }
#endif #endif
#if defined(HAS_SCALEROWDOWN4_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
ScaleRowDown4 =
filtering ? ScaleRowDown4Box_Any_MSA : ScaleRowDown4_Any_MSA;
if (IS_ALIGNED(dst_width, 16)) {
ScaleRowDown4 = filtering ? ScaleRowDown4Box_MSA : ScaleRowDown4_MSA;
}
}
#endif
if (filtering == kFilterLinear) { if (filtering == kFilterLinear) {
src_stride = 0; src_stride = 0;
...@@ -567,6 +591,26 @@ static void ScalePlaneDown38(int src_width, ...@@ -567,6 +591,26 @@ static void ScalePlaneDown38(int src_width,
} }
} }
#endif #endif
#if defined(HAS_SCALEROWDOWN38_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
if (!filtering) {
ScaleRowDown38_3 = ScaleRowDown38_Any_MSA;
ScaleRowDown38_2 = ScaleRowDown38_Any_MSA;
} else {
ScaleRowDown38_3 = ScaleRowDown38_3_Box_Any_MSA;
ScaleRowDown38_2 = ScaleRowDown38_2_Box_Any_MSA;
}
if (dst_width % 12 == 0) {
if (!filtering) {
ScaleRowDown38_3 = ScaleRowDown38_MSA;
ScaleRowDown38_2 = ScaleRowDown38_MSA;
} else {
ScaleRowDown38_3 = ScaleRowDown38_3_Box_MSA;
ScaleRowDown38_2 = ScaleRowDown38_2_Box_MSA;
}
}
}
#endif
for (y = 0; y < dst_height - 2; y += 3) { for (y = 0; y < dst_height - 2; y += 3) {
ScaleRowDown38_3(src_ptr, filter_stride, dst_ptr, dst_width); ScaleRowDown38_3(src_ptr, filter_stride, dst_ptr, dst_width);
...@@ -842,6 +886,14 @@ static void ScalePlaneBox(int src_width, ...@@ -842,6 +886,14 @@ static void ScalePlaneBox(int src_width,
} }
} }
#endif #endif
#if defined(HAS_SCALEADDROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
ScaleAddRow = ScaleAddRow_Any_MSA;
if (IS_ALIGNED(src_width, 16)) {
ScaleAddRow = ScaleAddRow_MSA;
}
}
#endif
for (j = 0; j < dst_height; ++j) { for (j = 0; j < dst_height; ++j) {
int boxheight; int boxheight;
......
...@@ -135,6 +135,21 @@ SDODD(ScaleRowDown2Box_Odd_NEON, ...@@ -135,6 +135,21 @@ SDODD(ScaleRowDown2Box_Odd_NEON,
1, 1,
15) 15)
#endif #endif
#ifdef HAS_SCALEROWDOWN2_MSA
// SDANY(any_fn, simd_fn, c_fn, factor, bpp, mask) wraps the MSA kernel with a
// C remainder path so any dst_width works. mask 31 pairs with the
// IS_ALIGNED(dst_width, 32) fast-path check at the dispatch site —
// NOTE(review): exact SDANY argument semantics defined earlier in this file;
// confirm there.
SDANY(ScaleRowDown2_Any_MSA, ScaleRowDown2_MSA, ScaleRowDown2_C, 2, 1, 31)
SDANY(ScaleRowDown2Linear_Any_MSA,
ScaleRowDown2Linear_MSA,
ScaleRowDown2Linear_C,
2,
1,
31)
SDANY(ScaleRowDown2Box_Any_MSA,
ScaleRowDown2Box_MSA,
ScaleRowDown2Box_C,
2,
1,
31)
#endif
#ifdef HAS_SCALEROWDOWN4_SSSE3 #ifdef HAS_SCALEROWDOWN4_SSSE3
SDANY(ScaleRowDown4_Any_SSSE3, ScaleRowDown4_SSSE3, ScaleRowDown4_C, 4, 1, 7) SDANY(ScaleRowDown4_Any_SSSE3, ScaleRowDown4_SSSE3, ScaleRowDown4_C, 4, 1, 7)
SDANY(ScaleRowDown4Box_Any_SSSE3, SDANY(ScaleRowDown4Box_Any_SSSE3,
...@@ -162,6 +177,15 @@ SDANY(ScaleRowDown4Box_Any_NEON, ...@@ -162,6 +177,15 @@ SDANY(ScaleRowDown4Box_Any_NEON,
1, 1,
7) 7)
#endif #endif
#ifdef HAS_SCALEROWDOWN4_MSA
// /4 scalers: mask 15 pairs with the IS_ALIGNED(dst_width, 16) fast-path
// check at the dispatch site; C fallback covers the remainder.
SDANY(ScaleRowDown4_Any_MSA, ScaleRowDown4_MSA, ScaleRowDown4_C, 4, 1, 15)
SDANY(ScaleRowDown4Box_Any_MSA,
ScaleRowDown4Box_MSA,
ScaleRowDown4Box_C,
4,
1,
15)
#endif
#ifdef HAS_SCALEROWDOWN34_SSSE3 #ifdef HAS_SCALEROWDOWN34_SSSE3
SDANY(ScaleRowDown34_Any_SSSE3, SDANY(ScaleRowDown34_Any_SSSE3,
ScaleRowDown34_SSSE3, ScaleRowDown34_SSSE3,
...@@ -242,6 +266,26 @@ SDANY(ScaleRowDown38_2_Box_Any_NEON, ...@@ -242,6 +266,26 @@ SDANY(ScaleRowDown38_2_Box_Any_NEON,
1, 1,
11) 11)
#endif #endif
#ifdef HAS_SCALEROWDOWN38_MSA
// 3/8 downscalers: scale factor expressed as 8/3 source pixels per dest
// pixel; mask 11 matches the dst_width % 12 == 0 fast-path check at the
// dispatch site (12-pixel output groups).
SDANY(ScaleRowDown38_Any_MSA,
ScaleRowDown38_MSA,
ScaleRowDown38_C,
8 / 3,
1,
11)
SDANY(ScaleRowDown38_3_Box_Any_MSA,
ScaleRowDown38_3_Box_MSA,
ScaleRowDown38_3_Box_C,
8 / 3,
1,
11)
SDANY(ScaleRowDown38_2_Box_Any_MSA,
ScaleRowDown38_2_Box_MSA,
ScaleRowDown38_2_Box_C,
8 / 3,
1,
11)
#endif
#ifdef HAS_SCALEARGBROWDOWN2_SSE2 #ifdef HAS_SCALEARGBROWDOWN2_SSE2
SDANY(ScaleARGBRowDown2_Any_SSE2, SDANY(ScaleARGBRowDown2_Any_SSE2,
...@@ -374,6 +418,9 @@ SAANY(ScaleAddRow_Any_AVX2, ScaleAddRow_AVX2, ScaleAddRow_C, 31) ...@@ -374,6 +418,9 @@ SAANY(ScaleAddRow_Any_AVX2, ScaleAddRow_AVX2, ScaleAddRow_C, 31)
#ifdef HAS_SCALEADDROW_NEON #ifdef HAS_SCALEADDROW_NEON
SAANY(ScaleAddRow_Any_NEON, ScaleAddRow_NEON, ScaleAddRow_C, 15) SAANY(ScaleAddRow_Any_NEON, ScaleAddRow_NEON, ScaleAddRow_C, 15)
#endif #endif
#ifdef HAS_SCALEADDROW_MSA
// Row accumulator: mask 15 pairs with the IS_ALIGNED(src_width, 16)
// fast-path check at the dispatch site.
SAANY(ScaleAddRow_Any_MSA, ScaleAddRow_MSA, ScaleAddRow_C, 15)
#endif
#undef SAANY #undef SAANY
#ifdef __cplusplus #ifdef __cplusplus
......
This diff is collapsed.
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment