Commit 78e44628 authored by Frank Barchard

Add MSA optimized SplitUV, Set, MirrorUV, SobelX and SobelY row functions.

TBR=kjellander@chromium.org
R=fbarchard@google.com

Bug: libyuv:634
Change-Id: Ie2342f841f1bb8469fc4631b784eddd804f5d53e
Reviewed-on: https://chromium-review.googlesource.com/616765
Reviewed-by: Frank Barchard <fbarchard@google.com>
parent bb17da97
@@ -422,6 +422,8 @@ extern "C" {
#define HAS_YUY2TOUVROW_MSA
#define HAS_YUY2TOYROW_MSA
#define HAS_ARGBEXTRACTALPHAROW_MSA
#define HAS_SPLITUVROW_MSA
#define HAS_MIRRORUVROW_MSA
#ifndef DISABLE_CLANG_MSA
#define HAS_ABGRTOUVROW_MSA
@@ -467,6 +469,9 @@ extern "C" {
#define HAS_ARGBBLENDROW_MSA
#define HAS_ARGBQUANTIZEROW_MSA
#define HAS_ARGBCOLORMATRIXROW_MSA
#define HAS_SETROW_MSA
#define HAS_SOBELXROW_MSA
#define HAS_SOBELYROW_MSA
#endif
#endif
@@ -1362,6 +1367,10 @@ void MirrorUVRow_DSPR2(const uint8* src_uv,
uint8* dst_u,
uint8* dst_v,
int width);
void MirrorUVRow_MSA(const uint8* src_uv,
uint8* dst_u,
uint8* dst_v,
int width);
void MirrorUVRow_C(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int width);
void ARGBMirrorRow_AVX2(const uint8* src, uint8* dst, int width);
@@ -1391,6 +1400,7 @@ void SplitUVRow_DSPR2(const uint8* src_uv,
uint8* dst_u,
uint8* dst_v,
int width);
void SplitUVRow_MSA(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int width);
void SplitUVRow_Any_SSE2(const uint8* src_uv,
uint8* dst_u,
uint8* dst_v,
@@ -1407,6 +1417,10 @@ void SplitUVRow_Any_DSPR2(const uint8* src_uv,
uint8* dst_u,
uint8* dst_v,
int width);
void SplitUVRow_Any_MSA(const uint8* src_uv,
uint8* dst_u,
uint8* dst_v,
int width);
void MergeUVRow_C(const uint8* src_u,
const uint8* src_v,
@@ -1496,6 +1510,7 @@ void ARGBCopyYToAlphaRow_Any_AVX2(const uint8* src_y,
int width);
void SetRow_C(uint8* dst, uint8 v8, int count);
void SetRow_MSA(uint8* dst, uint8 v8, int count);
void SetRow_X86(uint8* dst, uint8 v8, int count);
void SetRow_ERMS(uint8* dst, uint8 v8, int count);
void SetRow_NEON(uint8* dst, uint8 v8, int count);
@@ -3024,6 +3039,11 @@ void SobelXRow_NEON(const uint8* src_y0,
const uint8* src_y2,
uint8* dst_sobelx,
int width);
void SobelXRow_MSA(const uint8* src_y0,
const uint8* src_y1,
const uint8* src_y2,
uint8* dst_sobelx,
int width);
void SobelYRow_C(const uint8* src_y0,
const uint8* src_y1,
uint8* dst_sobely,
@@ -3036,6 +3056,10 @@ void SobelYRow_NEON(const uint8* src_y0,
const uint8* src_y1,
uint8* dst_sobely,
int width);
void SobelYRow_MSA(const uint8* src_y0,
const uint8* src_y1,
uint8* dst_sobely,
int width);
void SobelRow_C(const uint8* src_sobelx,
const uint8* src_sobely,
uint8* dst_argb,
@@ -321,6 +321,14 @@ void SplitUVPlane(const uint8* src_uv,
}
}
#endif
#if defined(HAS_SPLITUVROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
SplitUVRow = SplitUVRow_Any_MSA;
if (IS_ALIGNED(width, 32)) {
SplitUVRow = SplitUVRow_MSA;
}
}
#endif
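// The _Any_ wrapper copes with arbitrary widths; the exact MSA kernel is
// substituted only when width is a multiple of 32 pixels (64 bytes of
// interleaved UV). The same pattern recurs in YUY2ToNV12 and UYVYToNV12 below.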
for (y = 0; y < height; ++y) {
// Copy a row of UV.
@@ -1579,6 +1587,11 @@ void SetPlane(uint8* dst_y,
SetRow = SetRow_ERMS;
}
#endif
#if defined(HAS_SETROW_MSA)
if (TestCpuFlag(kCpuHasMSA) && IS_ALIGNED(width, 16)) {
SetRow = SetRow_MSA;
}
#endif
// Set plane
for (y = 0; y < height; ++y) {
@@ -2634,6 +2647,11 @@ static int ARGBSobelize(const uint8* src_argb,
SobelYRow = SobelYRow_NEON;
}
#endif
#if defined(HAS_SOBELYROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
SobelYRow = SobelYRow_MSA;
}
#endif
#if defined(HAS_SOBELXROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) {
SobelXRow = SobelXRow_SSE2;
@@ -2643,6 +2661,11 @@ static int ARGBSobelize(const uint8* src_argb,
if (TestCpuFlag(kCpuHasNEON)) {
SobelXRow = SobelXRow_NEON;
}
#endif
#if defined(HAS_SOBELXROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
SobelXRow = SobelXRow_MSA;
}
#endif
{
// 3 rows with edges before/after.
@@ -3181,6 +3204,14 @@ int YUY2ToNV12(const uint8* src_yuy2,
}
}
#endif
#if defined(HAS_SPLITUVROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
SplitUVRow = SplitUVRow_Any_MSA;
if (IS_ALIGNED(width, 32)) {
SplitUVRow = SplitUVRow_MSA;
}
}
#endif
#if defined(HAS_INTERPOLATEROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
InterpolateRow = InterpolateRow_Any_SSSE3;
@@ -3289,6 +3320,14 @@ int UYVYToNV12(const uint8* src_uyvy,
}
}
#endif
#if defined(HAS_SPLITUVROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
SplitUVRow = SplitUVRow_Any_MSA;
if (IS_ALIGNED(width, 32)) {
SplitUVRow = SplitUVRow_MSA;
}
}
#endif
#if defined(HAS_INTERPOLATEROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
InterpolateRow = InterpolateRow_Any_SSSE3;
@@ -361,6 +361,11 @@ void RotateUV180(const uint8* src,
MirrorUVRow = MirrorUVRow_DSPR2;
}
#endif
#if defined(HAS_MIRRORUVROW_MSA)
if (TestCpuFlag(kCpuHasMSA) && IS_ALIGNED(width, 32)) {
MirrorUVRow = MirrorUVRow_MSA;
}
#endif
dst_a += dst_stride_a * (height - 1);
dst_b += dst_stride_b * (height - 1);
@@ -914,6 +914,9 @@ ANY12(SplitUVRow_Any_NEON, SplitUVRow_NEON, 0, 2, 0, 15)
#ifdef HAS_SPLITUVROW_DSPR2
ANY12(SplitUVRow_Any_DSPR2, SplitUVRow_DSPR2, 0, 2, 0, 15)
#endif
#ifdef HAS_SPLITUVROW_MSA
ANY12(SplitUVRow_Any_MSA, SplitUVRow_MSA, 0, 2, 0, 31)
#endif
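// ANY12 wraps the 32-pixel MSA kernel so any width works: the aligned body is
// handed to SplitUVRow_MSA directly, and the ragged tail is staged through a
// small aligned scratch buffer. A simplified sketch of the generated wrapper
// (the real macro also zeroes temp for msan and parameterizes the shifts):
//   void SplitUVRow_Any_MSA(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
//                           int width) {
//     SIMD_ALIGNED(uint8 temp[128 * 3]);
//     int r = width & 31;   // leftover pixels in the tail
//     int n = width & ~31;  // multiple-of-32 body
//     if (n > 0) {
//       SplitUVRow_MSA(src_uv, dst_u, dst_v, n);
//     }
//     memcpy(temp, src_uv + n * 2, r * 2);
//     SplitUVRow_MSA(temp, temp + 128, temp + 256, 32);
//     memcpy(dst_u + n, temp + 128, r);
//     memcpy(dst_v + n, temp + 256, r);
//   }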
#ifdef HAS_ARGBTOUV444ROW_SSSE3
ANY12(ARGBToUV444Row_Any_SSSE3, ARGBToUV444Row_SSSE3, 0, 4, 0, 15)
#endif
@@ -2643,7 +2643,7 @@ float ScaleSumSamples_C(const float* src, float* dst, float scale, int width) {
float fsum = 0.f;
int i;
#if defined(__clang__)
#pragma clang loop vectorize_width(4)
#endif
for (i = 0; i < width; ++i) {
float v = *src++;
@@ -3291,6 +3291,173 @@ void ARGBColorMatrixRow_MSA(const uint8* src_argb,
}
}
#endif
void SplitUVRow_MSA(const uint8* src_uv,
uint8* dst_u,
uint8* dst_v,
int width) {
int x;
v16u8 src0, src1, src2, src3, dst0, dst1, dst2, dst3;
for (x = 0; x < width; x += 32) {
src0 = (v16u8)__msa_ld_b((v16i8*)src_uv, 0);
src1 = (v16u8)__msa_ld_b((v16i8*)src_uv, 16);
src2 = (v16u8)__msa_ld_b((v16i8*)src_uv, 32);
src3 = (v16u8)__msa_ld_b((v16i8*)src_uv, 48);
dst0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0);
dst1 = (v16u8)__msa_pckev_b((v16i8)src3, (v16i8)src2);
dst2 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0);
dst3 = (v16u8)__msa_pckod_b((v16i8)src3, (v16i8)src2);
ST_UB2(dst0, dst1, dst_u, 16);
ST_UB2(dst2, dst3, dst_v, 16);
src_uv += 64;
dst_u += 32;
dst_v += 32;
}
}
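// For reference, pckev_b gathers the even bytes (U) and pckod_b the odd bytes
// (V), so the kernel matches this scalar sketch of SplitUVRow_C:
//   for (x = 0; x < width; ++x) {
//     dst_u[x] = src_uv[2 * x + 0];
//     dst_v[x] = src_uv[2 * x + 1];
//   }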
#ifndef DISABLE_CLANG_MSA
void SetRow_MSA(uint8* dst, uint8 v8, int width) {
int x;
v16u8 dst0 = (v16u8)__msa_fill_b(v8);
for (x = 0; x < width; x += 16) {
ST_UB(dst0, dst);
dst += 16;
}
}
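// __msa_fill_b splats v8 across all 16 byte lanes, making the loop above a
// 16-bytes-per-iteration store; the scalar equivalent is simply:
//   memset(dst, v8, width);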
#endif
void MirrorUVRow_MSA(const uint8* src_uv,
uint8* dst_u,
uint8* dst_v,
int width) {
int x;
v16u8 src0, src1, src2, src3;
v16u8 dst0, dst1, dst2, dst3;
v16i8 mask0 = {30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0};
v16i8 mask1 = {31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1};
src_uv += (2 * width);
for (x = 0; x < width; x += 32) {
src_uv -= 64;
src2 = (v16u8)__msa_ld_b((v16i8*)src_uv, 0);
src3 = (v16u8)__msa_ld_b((v16i8*)src_uv, 16);
src0 = (v16u8)__msa_ld_b((v16i8*)src_uv, 32);
src1 = (v16u8)__msa_ld_b((v16i8*)src_uv, 48);
dst0 = (v16u8)__msa_vshf_b(mask1, (v16i8)src1, (v16i8)src0);
dst1 = (v16u8)__msa_vshf_b(mask1, (v16i8)src3, (v16i8)src2);
dst2 = (v16u8)__msa_vshf_b(mask0, (v16i8)src1, (v16i8)src0);
dst3 = (v16u8)__msa_vshf_b(mask0, (v16i8)src3, (v16i8)src2);
ST_UB2(dst0, dst1, dst_v, 16);
ST_UB2(dst2, dst3, dst_u, 16);
dst_u += 32;
dst_v += 32;
}
}
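// The shuffle masks read each 64-byte block back to front while splitting the
// planes; a scalar sketch of the intended result (width counts UV pairs,
// matching MirrorUVRow_C):
//   for (x = 0; x < width; ++x) {
//     dst_u[x] = src_uv[(width - 1 - x) * 2 + 0];
//     dst_v[x] = src_uv[(width - 1 - x) * 2 + 1];
//   }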
#ifndef DISABLE_CLANG_MSA
void SobelXRow_MSA(const uint8* src_y0,
const uint8* src_y1,
const uint8* src_y2,
uint8* dst_sobelx,
int32 width) {
int x;
v16u8 src0, src1, src2, src3, src4, src5, dst0;
v8i16 vec0, vec1, vec2, vec3, vec4, vec5;
v16i8 mask0 = {0, 2, 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7, 9};
v16i8 tmp = __msa_ldi_b(8);
v16i8 mask1 = mask0 + tmp;
v8i16 zero = {0};
v8i16 max = __msa_ldi_h(255);
for (x = 0; x < width; x += 16) {
src0 = (v16u8)__msa_ld_b((v16i8*)src_y0, 0);
src1 = (v16u8)__msa_ld_b((v16i8*)src_y0, 16);
src2 = (v16u8)__msa_ld_b((v16i8*)src_y1, 0);
src3 = (v16u8)__msa_ld_b((v16i8*)src_y1, 16);
src4 = (v16u8)__msa_ld_b((v16i8*)src_y2, 0);
src5 = (v16u8)__msa_ld_b((v16i8*)src_y2, 16);
vec0 = (v8i16)__msa_vshf_b(mask0, (v16i8)src1, (v16i8)src0);
vec1 = (v8i16)__msa_vshf_b(mask1, (v16i8)src1, (v16i8)src0);
vec2 = (v8i16)__msa_vshf_b(mask0, (v16i8)src3, (v16i8)src2);
vec3 = (v8i16)__msa_vshf_b(mask1, (v16i8)src3, (v16i8)src2);
vec4 = (v8i16)__msa_vshf_b(mask0, (v16i8)src5, (v16i8)src4);
vec5 = (v8i16)__msa_vshf_b(mask1, (v16i8)src5, (v16i8)src4);
vec0 = (v8i16)__msa_hsub_u_h((v16u8)vec0, (v16u8)vec0);
vec1 = (v8i16)__msa_hsub_u_h((v16u8)vec1, (v16u8)vec1);
vec2 = (v8i16)__msa_hsub_u_h((v16u8)vec2, (v16u8)vec2);
vec3 = (v8i16)__msa_hsub_u_h((v16u8)vec3, (v16u8)vec3);
vec4 = (v8i16)__msa_hsub_u_h((v16u8)vec4, (v16u8)vec4);
vec5 = (v8i16)__msa_hsub_u_h((v16u8)vec5, (v16u8)vec5);
vec0 += vec2;
vec1 += vec3;
vec4 += vec2;
vec5 += vec3;
vec0 += vec4;
vec1 += vec5;
vec0 = __msa_add_a_h(zero, vec0);
vec1 = __msa_add_a_h(zero, vec1);
vec0 = __msa_maxi_s_h(vec0, 0);
vec1 = __msa_maxi_s_h(vec1, 0);
vec0 = __msa_min_s_h(max, vec0);
vec1 = __msa_min_s_h(max, vec1);
dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
ST_UB(dst0, dst_sobelx);
src_y0 += 16;
src_y1 += 16;
src_y2 += 16;
dst_sobelx += 16;
}
}
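// Per pixel, the shuffles pair each byte with the one two columns over, the
// hsubs form the differences, and the adds weight the middle row by 2; this
// is a sketch of the scalar formula (as in SobelXRow_C):
//   a = src_y0[i] - src_y0[i + 2];
//   b = src_y1[i] - src_y1[i + 2];
//   c = src_y2[i] - src_y2[i + 2];
//   dst_sobelx[i] = clamp255(abs(a + 2 * b + c));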
void SobelYRow_MSA(const uint8* src_y0,
const uint8* src_y1,
uint8* dst_sobely,
int32 width) {
int x;
v16u8 src0, src1, dst0;
v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6;
v8i16 zero = {0};
v8i16 max = __msa_ldi_h(255);
for (x = 0; x < width; x += 16) {
src0 = (v16u8)__msa_ld_b((v16i8*)src_y0, 0);
src1 = (v16u8)__msa_ld_b((v16i8*)src_y1, 0);
vec0 = (v8i16)__msa_ilvr_b((v16i8)zero, (v16i8)src0);
vec1 = (v8i16)__msa_ilvl_b((v16i8)zero, (v16i8)src0);
vec2 = (v8i16)__msa_ilvr_b((v16i8)zero, (v16i8)src1);
vec3 = (v8i16)__msa_ilvl_b((v16i8)zero, (v16i8)src1);
vec0 -= vec2;
vec1 -= vec3;
vec6[0] = src_y0[16] - src_y1[16];
vec6[1] = src_y0[17] - src_y1[17];
vec2 = (v8i16)__msa_sldi_b((v16i8)vec1, (v16i8)vec0, 2);
vec3 = (v8i16)__msa_sldi_b((v16i8)vec6, (v16i8)vec1, 2);
vec4 = (v8i16)__msa_sldi_b((v16i8)vec1, (v16i8)vec0, 4);
vec5 = (v8i16)__msa_sldi_b((v16i8)vec6, (v16i8)vec1, 4);
vec0 += vec2;
vec1 += vec3;
vec4 += vec2;
vec5 += vec3;
vec0 += vec4;
vec1 += vec5;
vec0 = __msa_add_a_h(zero, vec0);
vec1 = __msa_add_a_h(zero, vec1);
vec0 = __msa_maxi_s_h(vec0, 0);
vec1 = __msa_maxi_s_h(vec1, 0);
vec0 = __msa_min_s_h(max, vec0);
vec1 = __msa_min_s_h(max, vec1);
dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
ST_UB(dst0, dst_sobely);
src_y0 += 16;
src_y1 += 16;
dst_sobely += 16;
}
}
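// The interleaves take the row difference and the sldi slides shift it by one
// and two columns, weighting the middle column by 2; a sketch of the scalar
// formula (as in SobelYRow_C):
//   a = src_y0[i + 0] - src_y1[i + 0];
//   b = src_y0[i + 1] - src_y1[i + 1];
//   c = src_y0[i + 2] - src_y1[i + 2];
//   dst_sobely[i] = clamp255(abs(a + 2 * b + c));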
#endif
#ifdef __cplusplus
} // extern "C"
} // namespace libyuv
@@ -1042,39 +1042,40 @@ void ScaleRowUp2_16_NEON(const uint16* src_ptr,
"ld2 {v0.4h, v1.4h}, [%0], %4 \n" // load row 1 even pixels
"ld2 {v2.4h, v3.4h}, [%1], %4 \n" // load row 2
// consider a variation of this for last 8x2 that replicates the last
// pixel.
"ld2 {v4.4h, v5.4h}, [%0], %5 \n" // load row 1 odd pixels
"ld2 {v6.4h, v7.4h}, [%1], %5 \n" // load row 2
"subs %w3, %w3, #16 \n" // 16 dst pixels per loop
// filter first 2x2 group to produce 1st and 4th dest pixels
// 9 3
// 3 1
"umull v8.4s, v0.4h, v22.4h \n"
"umlal v8.4s, v1.4h, v21.4h \n"
"umlal v8.4s, v2.4h, v21.4h \n"
"umlal v8.4s, v3.4h, v20.4h \n"
// filter first 2x2 group to produce 2nd and 5th dest pixel
// 3 9
// 1 3
"umull v9.4s, v0.4h, v21.4h \n"
"umlal v9.4s, v1.4h, v22.4h \n"
"umlal v9.4s, v2.4h, v20.4h \n"
"umlal v9.4s, v3.4h, v21.4h \n"
// filter second 2x2 group to produce 3rd and 6th dest pixels
// 9 3
// 3 1
"umull v10.4s, v4.4h, v22.4h \n"
"umlal v10.4s, v5.4h, v21.4h \n"
"umlal v10.4s, v6.4h, v21.4h \n"
"umlal v10.4s, v7.4h, v20.4h \n"
// filter second 2x2 group to produce 4th and 7th dest pixel
// 3 9
// 1 3
"umull v11.4s, v4.4h, v21.4h \n"
"umlal v11.4s, v5.4h, v22.4h \n"
"umlal v11.4s, v6.4h, v20.4h \n"
@@ -1094,12 +1095,11 @@ void ScaleRowUp2_16_NEON(const uint16* src_ptr,
: "r"(2LL), // %4
"r"(14LL) // %5
: "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
"v8", "v9", "v10", "v11", "v20", "v21", "v22" // Clobber List
: "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
"v11", "v20", "v21", "v22" // Clobber List
);
}
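// The 9:3:3:1 weights (apparently v22 = 9, v21 = 3, v20 = 1, per the
// multiplies above) implement 2x bilinear upsampling at quarter-pixel
// centers. Per output pixel the arithmetic is approximately (a sketch; the
// exact even/odd lane pairing lives in the asm above):
//   dst[2*i + 0] = (9 * r0[i] + 3 * r0[i+1] + 3 * r1[i] + r1[i+1] + 8) >> 4
//   dst[2*i + 1] = (3 * r0[i] + 9 * r0[i+1] + r1[i] + 3 * r1[i+1] + 8) >> 4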
#endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
#ifdef __cplusplus
@@ -450,7 +450,6 @@ TEST_F(LibYUVScaleTest, TestScaleOdd) {
}
#endif // HAS_SCALEROWDOWN2_SSSE3
#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
extern "C" void ScaleRowUp2_16_NEON(const uint16* src_ptr,
@@ -470,16 +469,10 @@ TEST_F(LibYUVScaleTest, TestScaleRowUp2_16) {
for (int i = 0; i < 640 * 2 + 1; ++i) {
orig_pixels[i] = i;
}
ScaleRowUp2_16_NEON(&orig_pixels[0], 640, &dst_pixels_c[0], 1280);
for (int i = 0; i < benchmark_pixels_div1280_; ++i) {
ScaleRowUp2_16_NEON(&orig_pixels[0], 640, &dst_pixels_opt[0], 1280);
}
for (int i = 0; i < 1280; ++i) {
@@ -507,29 +500,17 @@ TEST_F(LibYUVScaleTest, TestScaleRowDown2Box_16) {
for (int i = 0; i < 2560 * 2; ++i) {
orig_pixels[i] = i;
}
ScaleRowDown2Box_16_C(&orig_pixels[0], 2560, &dst_pixels_c[0], 1280);
for (int i = 0; i < benchmark_pixels_div1280_; ++i) {
#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
int has_neon = TestCpuFlag(kCpuHasNEON);
if (has_neon) {
ScaleRowDown2Box_16_NEON(&orig_pixels[0], 2560, &dst_pixels_opt[0], 1280);
} else {
ScaleRowDown2Box_16_C(&orig_pixels[0], 2560, &dst_pixels_opt[0], 1280);
}
#else
ScaleRowDown2Box_16_C(&orig_pixels[0], 2560, &dst_pixels_opt[0], 1280);
#endif
}