Commit 45b176d1 authored by Manojkumar Bhosale, committed by Frank Barchard

Add MSA optimized Interpolate/MergeUV/Misc functions

BUG=libyuv:634

Change-Id: If8d60bd57f01fe95bc2fd26196466574195cc126

Performance Gain (vs C auto-vectorized)
InterpolateRow_MSA      - ~3.3x
InterpolateRow_Any_MSA  - ~2.5x
ARGBSetRow_MSA          - ~1.0x
ARGBSetRow_Any_MSA      - ~1.0x
RAWToRGB24Row_MSA       - ~1.9x
RAWToRGB24Row_Any_MSA   - ~1.6x
MergeUVRow_MSA          - ~1.6x
MergeUVRow_Any_MSA      - ~1.2x

Performance Gain (vs C non-vectorized)
InterpolateRow_MSA      - ~11.3x
InterpolateRow_Any_MSA  - ~ 7.9x
ARGBSetRow_MSA          - ~ 6.2x
ARGBSetRow_Any_MSA      - ~ 4.0x
RAWToRGB24Row_MSA       - ~ 9.9x
RAWToRGB24Row_Any_MSA   - ~ 8.4x
MergeUVRow_MSA          - ~12.7x
MergeUVRow_Any_MSA      - ~ 8.0x

Change-Id: If8d60bd57f01fe95bc2fd26196466574195cc126
Reviewed-on: https://chromium-review.googlesource.com/445817
Reviewed-by: Frank Barchard <fbarchard@google.com>
Commit-Queue: Frank Barchard <fbarchard@google.com>
parent a041b0ae
......@@ -447,6 +447,10 @@ extern "C" {
#define HAS_J400TOARGBROW_MSA
#define HAS_YUY2TOARGBROW_MSA
#define HAS_UYVYTOARGBROW_MSA
#define HAS_INTERPOLATEROW_MSA
#define HAS_ARGBSETROW_MSA
#define HAS_RAWTORGB24ROW_MSA
#define HAS_MERGEUVROW_MSA
#endif
#if defined(_MSC_VER) && !defined(__CLR_VER) && !defined(__clang__)
......@@ -1412,6 +1416,10 @@ void MergeUVRow_NEON(const uint8* src_u,
const uint8* src_v,
uint8* dst_uv,
int width);
void MergeUVRow_MSA(const uint8* src_u,
const uint8* src_v,
uint8* dst_uv,
int width);
void MergeUVRow_Any_SSE2(const uint8* src_u,
const uint8* src_v,
uint8* dst_uv,
......@@ -1424,6 +1432,10 @@ void MergeUVRow_Any_NEON(const uint8* src_u,
const uint8* src_v,
uint8* dst_uv,
int width);
void MergeUVRow_Any_MSA(const uint8* src_u,
const uint8* src_v,
uint8* dst_uv,
int width);
void CopyRow_SSE2(const uint8* src, uint8* dst, int count);
void CopyRow_AVX(const uint8* src, uint8* dst, int count);
......@@ -1482,6 +1494,8 @@ void ARGBSetRow_C(uint8* dst_argb, uint32 v32, int count);
void ARGBSetRow_X86(uint8* dst_argb, uint32 v32, int count);
void ARGBSetRow_NEON(uint8* dst_argb, uint32 v32, int count);
void ARGBSetRow_Any_NEON(uint8* dst_argb, uint32 v32, int count);
void ARGBSetRow_MSA(uint8* dst_argb, uint32 v32, int count);
void ARGBSetRow_Any_MSA(uint8* dst_argb, uint32 v32, int count);
// ARGBShufflers for BGRAToARGB etc.
void ARGBShuffleRow_C(const uint8* src_argb,
......@@ -1552,6 +1566,7 @@ void RGB24ToARGBRow_MSA(const uint8* src_rgb24, uint8* dst_argb, int width);
void RAWToARGBRow_NEON(const uint8* src_raw, uint8* dst_argb, int width);
void RAWToARGBRow_MSA(const uint8* src_raw, uint8* dst_argb, int width);
void RAWToRGB24Row_NEON(const uint8* src_raw, uint8* dst_rgb24, int width);
void RAWToRGB24Row_MSA(const uint8* src_raw, uint8* dst_rgb24, int width);
void RGB565ToARGBRow_NEON(const uint8* src_rgb565, uint8* dst_argb, int width);
void RGB565ToARGBRow_MSA(const uint8* src_rgb565, uint8* dst_argb, int width);
void ARGB1555ToARGBRow_NEON(const uint8* src_argb1555,
......@@ -1613,6 +1628,7 @@ void RGB24ToARGBRow_Any_MSA(const uint8* src_rgb24, uint8* dst_argb, int width);
void RAWToARGBRow_Any_NEON(const uint8* src_raw, uint8* dst_argb, int width);
void RAWToARGBRow_Any_MSA(const uint8* src_raw, uint8* dst_argb, int width);
void RAWToRGB24Row_Any_NEON(const uint8* src_raw, uint8* dst_rgb24, int width);
void RAWToRGB24Row_Any_MSA(const uint8* src_raw, uint8* dst_rgb24, int width);
void RGB565ToARGBRow_Any_NEON(const uint8* src_rgb565,
uint8* dst_argb,
int width);
......@@ -2930,6 +2946,11 @@ void InterpolateRow_DSPR2(uint8* dst_ptr,
ptrdiff_t src_stride_ptr,
int width,
int source_y_fraction);
void InterpolateRow_MSA(uint8* dst_ptr,
const uint8* src_ptr,
ptrdiff_t src_stride_ptr,
int width,
int source_y_fraction);
void InterpolateRow_Any_NEON(uint8* dst_ptr,
const uint8* src_ptr,
ptrdiff_t src_stride_ptr,
......@@ -2950,6 +2971,11 @@ void InterpolateRow_Any_DSPR2(uint8* dst_ptr,
ptrdiff_t src_stride_ptr,
int width,
int source_y_fraction);
void InterpolateRow_Any_MSA(uint8* dst_ptr,
const uint8* src_ptr,
ptrdiff_t src_stride_ptr,
int width,
int source_y_fraction);
void InterpolateRow_16_C(uint16* dst_ptr,
const uint16* src_ptr,
......
......@@ -359,6 +359,14 @@ int ARGBToNV12(const uint8* src_argb,
ARGBToUVRow = ARGBToUVRow_DSPR2;
}
}
#endif
#if defined(HAS_MERGEUVROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
MergeUVRow_ = MergeUVRow_Any_MSA;
if (IS_ALIGNED(halfwidth, 16)) {
MergeUVRow_ = MergeUVRow_MSA;
}
}
#endif
{
// Allocate a rows of uv.
......@@ -502,6 +510,14 @@ int ARGBToNV21(const uint8* src_argb,
ARGBToUVRow = ARGBToUVRow_DSPR2;
}
}
#endif
#if defined(HAS_MERGEUVROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
MergeUVRow_ = MergeUVRow_Any_MSA;
if (IS_ALIGNED(halfwidth, 16)) {
MergeUVRow_ = MergeUVRow_MSA;
}
}
#endif
{
// Allocate a rows of uv.
......
......@@ -381,6 +381,14 @@ void MergeUVPlane(const uint8* src_u,
}
}
#endif
#if defined(HAS_MERGEUVROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
MergeUVRow = MergeUVRow_Any_MSA;
if (IS_ALIGNED(width, 16)) {
MergeUVRow = MergeUVRow_MSA;
}
}
#endif
for (y = 0; y < height; ++y) {
// Merge a row of U and V into a row of UV.
......@@ -1509,6 +1517,14 @@ int RAWToRGB24(const uint8* src_raw,
}
}
#endif
#if defined(HAS_RAWTORGB24ROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
RAWToRGB24Row = RAWToRGB24Row_Any_MSA;
if (IS_ALIGNED(width, 16)) {
RAWToRGB24Row = RAWToRGB24Row_MSA;
}
}
#endif
for (y = 0; y < height; ++y) {
RAWToRGB24Row(src_raw, dst_rgb24, width);
......@@ -1638,6 +1654,14 @@ int ARGBRect(uint8* dst_argb,
ARGBSetRow = ARGBSetRow_X86;
}
#endif
#if defined(HAS_ARGBSETROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
ARGBSetRow = ARGBSetRow_Any_MSA;
if (IS_ALIGNED(width, 4)) {
ARGBSetRow = ARGBSetRow_MSA;
}
}
#endif
// Set plane
for (y = 0; y < height; ++y) {
......@@ -2374,6 +2398,14 @@ int InterpolatePlane(const uint8* src0,
InterpolateRow = InterpolateRow_DSPR2;
}
#endif
#if defined(HAS_INTERPOLATEROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
InterpolateRow = InterpolateRow_Any_MSA;
if (IS_ALIGNED(width, 32)) {
InterpolateRow = InterpolateRow_MSA;
}
}
#endif
for (y = 0; y < height; ++y) {
InterpolateRow(dst, src0, src1 - src0, width, interpolation);
......@@ -3152,6 +3184,14 @@ int YUY2ToNV12(const uint8* src_yuy2,
}
}
#endif
#if defined(HAS_INTERPOLATEROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
InterpolateRow = InterpolateRow_Any_MSA;
if (IS_ALIGNED(width, 32)) {
InterpolateRow = InterpolateRow_MSA;
}
}
#endif
{
int awidth = halfwidth * 2;
......@@ -3252,6 +3292,14 @@ int UYVYToNV12(const uint8* src_uyvy,
}
}
#endif
#if defined(HAS_INTERPOLATEROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
InterpolateRow = InterpolateRow_Any_MSA;
if (IS_ALIGNED(width, 32)) {
InterpolateRow = InterpolateRow_MSA;
}
}
#endif
{
int awidth = halfwidth * 2;
......
......@@ -212,6 +212,9 @@ ANY21(MergeUVRow_Any_AVX2, MergeUVRow_AVX2, 0, 1, 1, 2, 31)
#ifdef HAS_MERGEUVROW_NEON
ANY21(MergeUVRow_Any_NEON, MergeUVRow_NEON, 0, 1, 1, 2, 15)
#endif
#ifdef HAS_MERGEUVROW_MSA
ANY21(MergeUVRow_Any_MSA, MergeUVRow_MSA, 0, 1, 1, 2, 15)
#endif
// Math functions.
#ifdef HAS_ARGBMULTIPLYROW_SSE2
......@@ -429,6 +432,9 @@ ANY11(I400ToARGBRow_Any_MSA, I400ToARGBRow_MSA, 0, 1, 4, 15)
#if defined(HAS_RAWTORGB24ROW_NEON)
ANY11(RAWToRGB24Row_Any_NEON, RAWToRGB24Row_NEON, 0, 3, 3, 7)
#endif
#if defined(HAS_RAWTORGB24ROW_MSA)
ANY11(RAWToRGB24Row_Any_MSA, RAWToRGB24Row_MSA, 0, 3, 3, 15)
#endif
#ifdef HAS_ARGBTOYROW_AVX2
ANY11(ARGBToYRow_Any_AVX2, ARGBToYRow_AVX2, 0, 4, 1, 31)
#endif
......@@ -797,6 +803,9 @@ ANY11T(InterpolateRow_Any_NEON, InterpolateRow_NEON, 1, 1, 15)
#ifdef HAS_INTERPOLATEROW_DSPR2
ANY11T(InterpolateRow_Any_DSPR2, InterpolateRow_DSPR2, 1, 1, 3)
#endif
#ifdef HAS_INTERPOLATEROW_MSA
ANY11T(InterpolateRow_Any_MSA, InterpolateRow_MSA, 1, 1, 31)
#endif
#undef ANY11T
// Any 1 to 1 mirror.
......@@ -862,6 +871,9 @@ ANY1(SetRow_Any_NEON, SetRow_NEON, uint8, 1, 15)
#ifdef HAS_ARGBSETROW_NEON
ANY1(ARGBSetRow_Any_NEON, ARGBSetRow_NEON, uint32, 4, 3)
#endif
#ifdef HAS_ARGBSETROW_MSA
ANY1(ARGBSetRow_Any_MSA, ARGBSetRow_MSA, uint32, 4, 3)
#endif
#undef ANY1
// Any 1 to 2. Outputs UV planes.
......
......@@ -8,6 +8,8 @@
* be found in the AUTHORS file in the root of the source tree.
*/
#include <string.h>
#include "libyuv/row.h"
// This module is for GCC MSA
......@@ -2849,6 +2851,124 @@ void UYVYToARGBRow_MSA(const uint8* src_uyvy,
}
}
// Vertically blends two rows of bytes into dst_ptr using MSA vectors.
// The first row starts at src_ptr and the second at src_ptr + src_stride.
// source_y_fraction is the weight given to the second row out of 256; the
// first row is weighted by 256 - source_y_fraction.
// The vector loops consume and produce 32 bytes per iteration, so they
// always write a multiple of 32 bytes.
void InterpolateRow_MSA(uint8* dst_ptr,
                        const uint8* src_ptr,
                        ptrdiff_t src_stride,
                        int width,
                        int32 source_y_fraction) {
  const int32 weight_row1 = source_y_fraction;
  const int32 weight_row0 = 256 - weight_row1;
  const uint8* row0 = src_ptr;
  const uint8* row1 = src_ptr + src_stride;
  uint16 packed_weights;
  int done;
  v16u8 top_lo, top_hi, bot_lo, bot_hi, out_lo, out_hi;
  v8u16 mix0, mix1, mix2, mix3, weights;

  // A zero fraction selects the first row unchanged.
  if (weight_row1 == 0) {
    memcpy(dst_ptr, src_ptr, width);
    return;
  }

  // An even split only needs the unsigned byte-average intrinsic.
  if (weight_row1 == 128) {
    done = 0;
    while (done < width) {
      top_lo = (v16u8)__msa_ld_b((v16i8*)row0, 0);
      top_hi = (v16u8)__msa_ld_b((v16i8*)row0, 16);
      bot_lo = (v16u8)__msa_ld_b((v16i8*)row1, 0);
      bot_hi = (v16u8)__msa_ld_b((v16i8*)row1, 16);
      out_lo = __msa_aver_u_b(top_lo, bot_lo);
      out_hi = __msa_aver_u_b(top_hi, bot_hi);
      ST_UB2(out_lo, out_hi, dst_ptr, 16);
      row0 += 32;
      row1 += 32;
      dst_ptr += 32;
      done += 32;
    }
    return;
  }

  // General case: both weights are packed into one 16-bit lane (first-row
  // weight in the low byte, second-row weight in the high byte) so that a
  // byte dot product against the interleaved rows yields the weighted sum.
  packed_weights = (uint16)(weight_row0 + (weight_row1 << 8));
  weights = (v8u16)__msa_fill_h(packed_weights);
  done = 0;
  while (done < width) {
    top_lo = (v16u8)__msa_ld_b((v16i8*)row0, 0);
    top_hi = (v16u8)__msa_ld_b((v16i8*)row0, 16);
    bot_lo = (v16u8)__msa_ld_b((v16i8*)row1, 0);
    bot_hi = (v16u8)__msa_ld_b((v16i8*)row1, 16);

    // Pair up corresponding bytes of the two rows.
    mix0 = (v8u16)__msa_ilvr_b((v16i8)bot_lo, (v16i8)top_lo);
    mix1 = (v8u16)__msa_ilvl_b((v16i8)bot_lo, (v16i8)top_lo);
    mix2 = (v8u16)__msa_ilvr_b((v16i8)bot_hi, (v16i8)top_hi);
    mix3 = (v8u16)__msa_ilvl_b((v16i8)bot_hi, (v16i8)top_hi);

    // Weighted sum of each byte pair.
    mix0 = (v8u16)__msa_dotp_u_h((v16u8)mix0, (v16u8)weights);
    mix1 = (v8u16)__msa_dotp_u_h((v16u8)mix1, (v16u8)weights);
    mix2 = (v8u16)__msa_dotp_u_h((v16u8)mix2, (v16u8)weights);
    mix3 = (v8u16)__msa_dotp_u_h((v16u8)mix3, (v16u8)weights);

    // Rounding shift right by 8 to scale the sums back down.
    mix0 = (v8u16)__msa_srari_h((v8i16)mix0, 8);
    mix1 = (v8u16)__msa_srari_h((v8i16)mix1, 8);
    mix2 = (v8u16)__msa_srari_h((v8i16)mix2, 8);
    mix3 = (v8u16)__msa_srari_h((v8i16)mix3, 8);

    // Keep the even (low) byte of every 16-bit lane.
    out_lo = (v16u8)__msa_pckev_b((v16i8)mix1, (v16i8)mix0);
    out_hi = (v16u8)__msa_pckev_b((v16i8)mix3, (v16i8)mix2);
    ST_UB2(out_lo, out_hi, dst_ptr, 16);

    row0 += 32;
    row1 += 32;
    dst_ptr += 32;
    done += 32;
  }
}
// Fills a row with the 32-bit value v32 using MSA.
// v32 is replicated across one vector, which is stored once per group of
// 4 values (16 bytes) until `width` values have been covered.
void ARGBSetRow_MSA(uint8* dst_argb, uint32 v32, int width) {
  v16u8 fill = (v16u8)__msa_fill_w(v32);
  uint8* out = dst_argb;
  int px = 0;
  while (px < width) {
    ST_UB(fill, out);
    out += 16;
    px += 4;
  }
}
// Converts a row of RAW pixels to RGB24 using MSA by reversing the byte
// order inside every 3-byte pixel.
// Each iteration reads 48 source bytes (16 pixels) and writes 48 bytes.
void RAWToRGB24Row_MSA(const uint8* src_raw, uint8* dst_rgb24, int width) {
  // Byte-select masks for the three output vectors. Judging by their use
  // below, indices 0..15 address the last operand of __msa_vshf_b and
  // 16..31 the middle one.
  v16i8 mask_first = {2, 1, 0, 5, 4, 3, 8, 7, 6, 11, 10, 9, 14, 13, 12, 17};
  v16i8 mask_middle = {8,  7,  12, 11, 10, 15, 14, 13,
                       18, 17, 16, 21, 20, 19, 24, 23};
  v16i8 mask_last = {14, 19, 18, 17, 22, 21, 20, 25,
                     24, 23, 28, 27, 26, 31, 30, 29};
  v16u8 in0, in1, in2, span01, span12, out0, out1, out2;
  int px = 0;

  while (px < width) {
    in0 = (v16u8)__msa_ld_b((v16i8*)src_raw, 0);
    in1 = (v16u8)__msa_ld_b((v16i8*)src_raw, 16);
    in2 = (v16u8)__msa_ld_b((v16i8*)src_raw, 32);

    // 8-byte sliding windows across neighbouring input vectors; they feed
    // the middle output vector.
    span01 = (v16u8)__msa_sldi_b((v16i8)in1, (v16i8)in0, 8);
    span12 = (v16u8)__msa_sldi_b((v16i8)in2, (v16i8)in1, 8);

    out0 = (v16u8)__msa_vshf_b(mask_first, (v16i8)in1, (v16i8)in0);
    out1 = (v16u8)__msa_vshf_b(mask_middle, (v16i8)span12, (v16i8)span01);
    out2 = (v16u8)__msa_vshf_b(mask_last, (v16i8)in2, (v16i8)in1);

    ST_UB2(out0, out1, dst_rgb24, 16);
    ST_UB(out2, (dst_rgb24 + 32));

    src_raw += 48;
    dst_rgb24 += 48;
    px += 16;
  }
}
// Interleaves a row of U bytes and a row of V bytes into dst_uv using MSA.
// Each iteration reads 16 bytes from src_u and 16 from src_v and writes
// 32 interleaved bytes.
void MergeUVRow_MSA(const uint8* src_u,
                    const uint8* src_v,
                    uint8* dst_uv,
                    int width) {
  v16u8 u_vec, v_vec, uv_lo, uv_hi;
  int i = 0;

  while (i < width) {
    u_vec = (v16u8)__msa_ld_b((v16i8*)src_u, 0);
    v_vec = (v16u8)__msa_ld_b((v16i8*)src_v, 0);

    // Interleave the right and left halves of the two inputs separately.
    uv_lo = (v16u8)__msa_ilvr_b((v16i8)v_vec, (v16i8)u_vec);
    uv_hi = (v16u8)__msa_ilvl_b((v16i8)v_vec, (v16i8)u_vec);
    ST_UB2(uv_lo, uv_hi, dst_uv, 16);

    src_u += 16;
    src_v += 16;
    dst_uv += 32;
    i += 16;
  }
}
#ifdef __cplusplus
} // extern "C"
} // namespace libyuv
......
......@@ -45,9 +45,8 @@ static void ScalePlaneDown2(int src_width,
int y;
void (*ScaleRowDown2)(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) =
filtering == kFilterNone
? ScaleRowDown2_C
: (filtering == kFilterLinear ? ScaleRowDown2Linear_C
filtering == kFilterNone ? ScaleRowDown2_C : (filtering == kFilterLinear
? ScaleRowDown2Linear_C
: ScaleRowDown2Box_C);
int row_stride = src_stride << 1;
(void)src_width;
......@@ -1057,6 +1056,14 @@ void ScalePlaneBilinearDown(int src_width,
}
}
#endif
#if defined(HAS_INTERPOLATEROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
InterpolateRow = InterpolateRow_Any_MSA;
if (IS_ALIGNED(src_width, 32)) {
InterpolateRow = InterpolateRow_MSA;
}
}
#endif
#if defined(HAS_SCALEFILTERCOLS_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) {
......
......@@ -315,6 +315,14 @@ static void ScaleARGBBilinearDown(int src_width,
}
}
#endif
#if defined(HAS_INTERPOLATEROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
InterpolateRow = InterpolateRow_Any_MSA;
if (IS_ALIGNED(clip_src_width, 32)) {
InterpolateRow = InterpolateRow_MSA;
}
}
#endif
#if defined(HAS_SCALEARGBFILTERCOLS_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) {
ScaleARGBFilterCols = ScaleARGBFilterCols_SSSE3;
......@@ -408,6 +416,14 @@ static void ScaleARGBBilinearUp(int src_width,
IS_ALIGNED(dst_stride, 4)) {
InterpolateRow = InterpolateRow_DSPR2;
}
#endif
#if defined(HAS_INTERPOLATEROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
InterpolateRow = InterpolateRow_Any_MSA;
if (IS_ALIGNED(dst_width, 8)) {
InterpolateRow = InterpolateRow_MSA;
}
}
#endif
if (src_width >= 32768) {
ScaleARGBFilterCols =
......@@ -598,6 +614,14 @@ static void ScaleYUVToARGBBilinearUp(int src_width,
InterpolateRow = InterpolateRow_DSPR2;
}
#endif
#if defined(HAS_INTERPOLATEROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
InterpolateRow = InterpolateRow_Any_MSA;
if (IS_ALIGNED(dst_width, 8)) {
InterpolateRow = InterpolateRow_MSA;
}
}
#endif
void (*ScaleARGBFilterCols)(uint8 * dst_argb, const uint8* src_argb,
int dst_width, int x, int dx) =
......
......@@ -1072,6 +1072,14 @@ void ScalePlaneVertical(int src_height,
InterpolateRow = InterpolateRow_DSPR2;
}
}
#endif
#if defined(HAS_INTERPOLATEROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
InterpolateRow = InterpolateRow_Any_MSA;
if (IS_ALIGNED(dst_width_bytes, 32)) {
InterpolateRow = InterpolateRow_MSA;
}
}
#endif
for (j = 0; j < dst_height; ++j) {
int yi;
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment