Commit 78e44628 authored by Frank Barchard

Add MSA optimized SplitUV, Set, MirrorUV, SobelX and SobelY row functions.
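
For context, each MSA kernel here vectorizes an existing scalar row
function. A rough sketch of the SplitUV de-interleave being accelerated
(mirroring SplitUVRow_C in row_common.cc; the scalar reference is not part
of this diff):

  void SplitUVRow_C(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
                    int width) {
    int i;
    for (i = 0; i < width; ++i) {
      dst_u[i] = src_uv[0];  // even byte of each UV pair is U
      dst_v[i] = src_uv[1];  // odd byte is V
      src_uv += 2;
    }
  }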

TBR=kjellander@chromium.org
R=fbarchard@google.com

Bug: libyuv:634
Change-Id: Ie2342f841f1bb8469fc4631b784eddd804f5d53e
Reviewed-on: https://chromium-review.googlesource.com/616765
Reviewed-by: Frank Barchard <fbarchard@google.com>
parent bb17da97
@@ -422,6 +422,8 @@ extern "C" {
#define HAS_YUY2TOUVROW_MSA
#define HAS_YUY2TOYROW_MSA
#define HAS_ARGBEXTRACTALPHAROW_MSA
#define HAS_SPLITUVROW_MSA
#define HAS_MIRRORUVROW_MSA
#ifndef DISABLE_CLANG_MSA
#define HAS_ABGRTOUVROW_MSA
@@ -467,6 +469,9 @@ extern "C" {
#define HAS_ARGBBLENDROW_MSA
#define HAS_ARGBQUANTIZEROW_MSA
#define HAS_ARGBCOLORMATRIXROW_MSA
#define HAS_SETROW_MSA
#define HAS_SOBELXROW_MSA
#define HAS_SOBELYROW_MSA
#endif
#endif
@@ -1362,6 +1367,10 @@ void MirrorUVRow_DSPR2(const uint8* src_uv,
                       uint8* dst_u,
                       uint8* dst_v,
                       int width);
void MirrorUVRow_MSA(const uint8* src_uv,
uint8* dst_u,
uint8* dst_v,
int width);
void MirrorUVRow_C(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int width);
void ARGBMirrorRow_AVX2(const uint8* src, uint8* dst, int width);
@@ -1391,6 +1400,7 @@ void SplitUVRow_DSPR2(const uint8* src_uv,
                      uint8* dst_u,
                      uint8* dst_v,
                      int width);
void SplitUVRow_MSA(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int width);
void SplitUVRow_Any_SSE2(const uint8* src_uv,
                         uint8* dst_u,
                         uint8* dst_v,
@@ -1407,6 +1417,10 @@ void SplitUVRow_Any_DSPR2(const uint8* src_uv,
                          uint8* dst_u,
                          uint8* dst_v,
                          int width);
void SplitUVRow_Any_MSA(const uint8* src_uv,
uint8* dst_u,
uint8* dst_v,
int width);
void MergeUVRow_C(const uint8* src_u,
                  const uint8* src_v,
@@ -1496,6 +1510,7 @@ void ARGBCopyYToAlphaRow_Any_AVX2(const uint8* src_y,
                                  int width);
void SetRow_C(uint8* dst, uint8 v8, int count);
void SetRow_MSA(uint8* dst, uint8 v8, int count);
void SetRow_X86(uint8* dst, uint8 v8, int count);
void SetRow_ERMS(uint8* dst, uint8 v8, int count);
void SetRow_NEON(uint8* dst, uint8 v8, int count);
@@ -3024,6 +3039,11 @@ void SobelXRow_NEON(const uint8* src_y0,
                    const uint8* src_y2,
                    uint8* dst_sobelx,
                    int width);
void SobelXRow_MSA(const uint8* src_y0,
const uint8* src_y1,
const uint8* src_y2,
uint8* dst_sobelx,
int width);
void SobelYRow_C(const uint8* src_y0,
                 const uint8* src_y1,
                 uint8* dst_sobely,
@@ -3036,6 +3056,10 @@ void SobelYRow_NEON(const uint8* src_y0,
                    const uint8* src_y1,
                    uint8* dst_sobely,
                    int width);
void SobelYRow_MSA(const uint8* src_y0,
const uint8* src_y1,
uint8* dst_sobely,
int width);
void SobelRow_C(const uint8* src_sobelx,
                const uint8* src_sobely,
                uint8* dst_argb,
...
@@ -321,6 +321,14 @@ void SplitUVPlane(const uint8* src_uv,
    }
  }
#endif
#if defined(HAS_SPLITUVROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
SplitUVRow = SplitUVRow_Any_MSA;
if (IS_ALIGNED(width, 32)) {
SplitUVRow = SplitUVRow_MSA;
}
}
#endif
  for (y = 0; y < height; ++y) {
    // Copy a row of UV.
@@ -1579,6 +1587,11 @@ void SetPlane(uint8* dst_y,
    SetRow = SetRow_ERMS;
  }
#endif
#if defined(HAS_SETROW_MSA)
if (TestCpuFlag(kCpuHasMSA) && IS_ALIGNED(width, 16)) {
SetRow = SetRow_MSA;
}
#endif
  // Set plane
  for (y = 0; y < height; ++y) {
@@ -2634,6 +2647,11 @@ static int ARGBSobelize(const uint8* src_argb,
    SobelYRow = SobelYRow_NEON;
  }
#endif
#if defined(HAS_SOBELYROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
SobelYRow = SobelYRow_MSA;
}
#endif
#if defined(HAS_SOBELXROW_SSE2)
  if (TestCpuFlag(kCpuHasSSE2)) {
    SobelXRow = SobelXRow_SSE2;
@@ -2643,6 +2661,11 @@ static int ARGBSobelize(const uint8* src_argb,
  if (TestCpuFlag(kCpuHasNEON)) {
    SobelXRow = SobelXRow_NEON;
  }
#endif
#if defined(HAS_SOBELXROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
SobelXRow = SobelXRow_MSA;
}
#endif
  {
    // 3 rows with edges before/after.
@@ -3181,6 +3204,14 @@ int YUY2ToNV12(const uint8* src_yuy2,
    }
  }
#endif
#if defined(HAS_SPLITUVROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
SplitUVRow = SplitUVRow_Any_MSA;
if (IS_ALIGNED(width, 32)) {
SplitUVRow = SplitUVRow_MSA;
}
}
#endif
#if defined(HAS_INTERPOLATEROW_SSSE3)
  if (TestCpuFlag(kCpuHasSSSE3)) {
    InterpolateRow = InterpolateRow_Any_SSSE3;
@@ -3289,6 +3320,14 @@ int UYVYToNV12(const uint8* src_uyvy,
    }
  }
#endif
#if defined(HAS_SPLITUVROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
SplitUVRow = SplitUVRow_Any_MSA;
if (IS_ALIGNED(width, 32)) {
SplitUVRow = SplitUVRow_MSA;
}
}
#endif
#if defined(HAS_INTERPOLATEROW_SSSE3)
  if (TestCpuFlag(kCpuHasSSSE3)) {
    InterpolateRow = InterpolateRow_Any_SSSE3;
...
@@ -361,6 +361,11 @@ void RotateUV180(const uint8* src,
    MirrorUVRow = MirrorUVRow_DSPR2;
  }
#endif
#if defined(HAS_MIRRORUVROW_MSA)
if (TestCpuFlag(kCpuHasMSA) && IS_ALIGNED(width, 32)) {
MirrorUVRow = MirrorUVRow_MSA;
}
#endif
  dst_a += dst_stride_a * (height - 1);
  dst_b += dst_stride_b * (height - 1);
...
@@ -914,6 +914,9 @@ ANY12(SplitUVRow_Any_NEON, SplitUVRow_NEON, 0, 2, 0, 15)
#ifdef HAS_SPLITUVROW_DSPR2
ANY12(SplitUVRow_Any_DSPR2, SplitUVRow_DSPR2, 0, 2, 0, 15)
#endif
#ifdef HAS_SPLITUVROW_MSA
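// MASK = 31: widths that are multiples of 32 go straight to the MSA kernel;
// the ragged tail is handled through a small aligned temp buffer (see ANY12).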
ANY12(SplitUVRow_Any_MSA, SplitUVRow_MSA, 0, 2, 0, 31)
#endif
#ifdef HAS_ARGBTOUV444ROW_SSSE3
ANY12(ARGBToUV444Row_Any_SSSE3, ARGBToUV444Row_SSSE3, 0, 4, 0, 15)
#endif
...
@@ -2643,7 +2643,7 @@ float ScaleSumSamples_C(const float* src, float* dst, float scale, int width) {
  float fsum = 0.f;
  int i;
#if defined(__clang__)
#pragma clang loop vectorize_width(4)
#endif
  for (i = 0; i < width; ++i) {
    float v = *src++;
...
@@ -3291,6 +3291,173 @@ void ARGBColorMatrixRow_MSA(const uint8* src_argb,
  }
}
#endif
void SplitUVRow_MSA(const uint8* src_uv,
uint8* dst_u,
uint8* dst_v,
int width) {
int x;
v16u8 src0, src1, src2, src3, dst0, dst1, dst2, dst3;
for (x = 0; x < width; x += 32) {
src0 = (v16u8)__msa_ld_b((v16i8*)src_uv, 0);
src1 = (v16u8)__msa_ld_b((v16i8*)src_uv, 16);
src2 = (v16u8)__msa_ld_b((v16i8*)src_uv, 32);
src3 = (v16u8)__msa_ld_b((v16i8*)src_uv, 48);
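    // pckev packs even-indexed bytes (U); pckod packs odd-indexed bytes (V).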
dst0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0);
dst1 = (v16u8)__msa_pckev_b((v16i8)src3, (v16i8)src2);
dst2 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0);
dst3 = (v16u8)__msa_pckod_b((v16i8)src3, (v16i8)src2);
ST_UB2(dst0, dst1, dst_u, 16);
ST_UB2(dst2, dst3, dst_v, 16);
src_uv += 64;
dst_u += 32;
dst_v += 32;
}
}
#ifndef DISABLE_CLANG_MSA
void SetRow_MSA(uint8* dst, uint8 v8, int width) {
int x;
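  // Broadcast the fill byte across all 16 lanes once, outside the loop.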
v16u8 dst0 = (v16u8)__msa_fill_b(v8);
for (x = 0; x < width; x += 16) {
ST_UB(dst0, dst);
dst += 16;
}
}
#endif
void MirrorUVRow_MSA(const uint8* src_uv,
uint8* dst_u,
uint8* dst_v,
int width) {
int x;
v16u8 src0, src1, src2, src3;
v16u8 dst0, dst1, dst2, dst3;
v16i8 mask0 = {30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0};
v16i8 mask1 = {31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1};
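  // The masks index the concatenated 32-byte source in reverse: even indices
  // select U bytes, odd select V, so one shuffle mirrors and de-interleaves.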
src_uv += (2 * width);
for (x = 0; x < width; x += 32) {
src_uv -= 64;
src2 = (v16u8)__msa_ld_b((v16i8*)src_uv, 0);
src3 = (v16u8)__msa_ld_b((v16i8*)src_uv, 16);
src0 = (v16u8)__msa_ld_b((v16i8*)src_uv, 32);
src1 = (v16u8)__msa_ld_b((v16i8*)src_uv, 48);
dst0 = (v16u8)__msa_vshf_b(mask1, (v16i8)src1, (v16i8)src0);
dst1 = (v16u8)__msa_vshf_b(mask1, (v16i8)src3, (v16i8)src2);
dst2 = (v16u8)__msa_vshf_b(mask0, (v16i8)src1, (v16i8)src0);
dst3 = (v16u8)__msa_vshf_b(mask0, (v16i8)src3, (v16i8)src2);
ST_UB2(dst0, dst1, dst_v, 16);
ST_UB2(dst2, dst3, dst_u, 16);
dst_u += 32;
dst_v += 32;
}
}
#ifndef DISABLE_CLANG_MSA
void SobelXRow_MSA(const uint8* src_y0,
const uint8* src_y1,
const uint8* src_y2,
uint8* dst_sobelx,
                   int width) {
int x;
v16u8 src0, src1, src2, src3, src4, src5, dst0;
v8i16 vec0, vec1, vec2, vec3, vec4, vec5;
v16i8 mask0 = {0, 2, 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7, 9};
v16i8 tmp = __msa_ldi_b(8);
v16i8 mask1 = mask0 + tmp;
v8i16 zero = {0};
v8i16 max = __msa_ldi_h(255);
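  // Per pixel: |(y0[i]-y0[i+2]) + 2*(y1[i]-y1[i+2]) + (y2[i]-y2[i+2])|,
  // clamped to [0, 255], matching the scalar SobelXRow_C.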
for (x = 0; x < width; x += 16) {
src0 = (v16u8)__msa_ld_b((v16i8*)src_y0, 0);
src1 = (v16u8)__msa_ld_b((v16i8*)src_y0, 16);
src2 = (v16u8)__msa_ld_b((v16i8*)src_y1, 0);
src3 = (v16u8)__msa_ld_b((v16i8*)src_y1, 16);
src4 = (v16u8)__msa_ld_b((v16i8*)src_y2, 0);
src5 = (v16u8)__msa_ld_b((v16i8*)src_y2, 16);
vec0 = (v8i16)__msa_vshf_b(mask0, (v16i8)src1, (v16i8)src0);
vec1 = (v8i16)__msa_vshf_b(mask1, (v16i8)src1, (v16i8)src0);
vec2 = (v8i16)__msa_vshf_b(mask0, (v16i8)src3, (v16i8)src2);
vec3 = (v8i16)__msa_vshf_b(mask1, (v16i8)src3, (v16i8)src2);
vec4 = (v8i16)__msa_vshf_b(mask0, (v16i8)src5, (v16i8)src4);
vec5 = (v8i16)__msa_vshf_b(mask1, (v16i8)src5, (v16i8)src4);
vec0 = (v8i16)__msa_hsub_u_h((v16u8)vec0, (v16u8)vec0);
vec1 = (v8i16)__msa_hsub_u_h((v16u8)vec1, (v16u8)vec1);
vec2 = (v8i16)__msa_hsub_u_h((v16u8)vec2, (v16u8)vec2);
vec3 = (v8i16)__msa_hsub_u_h((v16u8)vec3, (v16u8)vec3);
vec4 = (v8i16)__msa_hsub_u_h((v16u8)vec4, (v16u8)vec4);
vec5 = (v8i16)__msa_hsub_u_h((v16u8)vec5, (v16u8)vec5);
vec0 += vec2;
vec1 += vec3;
vec4 += vec2;
vec5 += vec3;
vec0 += vec4;
vec1 += vec5;
vec0 = __msa_add_a_h(zero, vec0);
vec1 = __msa_add_a_h(zero, vec1);
vec0 = __msa_maxi_s_h(vec0, 0);
vec1 = __msa_maxi_s_h(vec1, 0);
vec0 = __msa_min_s_h(max, vec0);
vec1 = __msa_min_s_h(max, vec1);
dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
ST_UB(dst0, dst_sobelx);
src_y0 += 16;
src_y1 += 16;
src_y2 += 16;
dst_sobelx += 16;
}
}
void SobelYRow_MSA(const uint8* src_y0,
const uint8* src_y1,
uint8* dst_sobely,
                   int width) {
int x;
v16u8 src0, src1, dst0;
v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6;
v8i16 zero = {0};
v8i16 max = __msa_ldi_h(255);
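  // Per pixel: |(y0[i]-y1[i]) + 2*(y0[i+1]-y1[i+1]) + (y0[i+2]-y1[i+2])|,
  // clamped to [0, 255]; the sldi shifts supply the i+1 and i+2 differences.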
for (x = 0; x < width; x += 16) {
src0 = (v16u8)__msa_ld_b((v16i8*)src_y0, 0);
src1 = (v16u8)__msa_ld_b((v16i8*)src_y1, 0);
vec0 = (v8i16)__msa_ilvr_b((v16i8)zero, (v16i8)src0);
vec1 = (v8i16)__msa_ilvl_b((v16i8)zero, (v16i8)src0);
vec2 = (v8i16)__msa_ilvr_b((v16i8)zero, (v16i8)src1);
vec3 = (v8i16)__msa_ilvl_b((v16i8)zero, (v16i8)src1);
vec0 -= vec2;
vec1 -= vec3;
vec6[0] = src_y0[16] - src_y1[16];
vec6[1] = src_y0[17] - src_y1[17];
vec2 = (v8i16)__msa_sldi_b((v16i8)vec1, (v16i8)vec0, 2);
vec3 = (v8i16)__msa_sldi_b((v16i8)vec6, (v16i8)vec1, 2);
vec4 = (v8i16)__msa_sldi_b((v16i8)vec1, (v16i8)vec0, 4);
vec5 = (v8i16)__msa_sldi_b((v16i8)vec6, (v16i8)vec1, 4);
vec0 += vec2;
vec1 += vec3;
vec4 += vec2;
vec5 += vec3;
vec0 += vec4;
vec1 += vec5;
vec0 = __msa_add_a_h(zero, vec0);
vec1 = __msa_add_a_h(zero, vec1);
vec0 = __msa_maxi_s_h(vec0, 0);
vec1 = __msa_maxi_s_h(vec1, 0);
vec0 = __msa_min_s_h(max, vec0);
vec1 = __msa_min_s_h(max, vec1);
dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
ST_UB(dst0, dst_sobely);
src_y0 += 16;
src_y1 += 16;
dst_sobely += 16;
}
}
#endif
#ifdef __cplusplus
}  // extern "C"
}  // namespace libyuv
...
@@ -1042,39 +1042,40 @@ void ScaleRowUp2_16_NEON(const uint16* src_ptr,
      "ld2 {v0.4h, v1.4h}, [%0], %4 \n"  // load row 1 even pixels
      "ld2 {v2.4h, v3.4h}, [%1], %4 \n"  // load row 2
      // consider a variation of this for last 8x2 that replicates the last
      // pixel.
      "ld2 {v4.4h, v5.4h}, [%0], %5 \n"  // load row 1 odd pixels
      "ld2 {v6.4h, v7.4h}, [%1], %5 \n"  // load row 2
      "subs %w3, %w3, #16 \n"  // 16 dst pixels per loop
      // filter first 2x2 group to produce 1st and 4th dest pixels
      // 9 3
      // 3 1
      "umull v8.4s, v0.4h, v22.4h \n"
      "umlal v8.4s, v1.4h, v21.4h \n"
      "umlal v8.4s, v2.4h, v21.4h \n"
      "umlal v8.4s, v3.4h, v20.4h \n"
      // filter first 2x2 group to produce 2nd and 5th dest pixel
      // 3 9
      // 1 3
      "umull v9.4s, v0.4h, v21.4h \n"
      "umlal v9.4s, v1.4h, v22.4h \n"
      "umlal v9.4s, v2.4h, v20.4h \n"
      "umlal v9.4s, v3.4h, v21.4h \n"
      // filter second 2x2 group to produce 3rd and 6th dest pixels
      // 9 3
      // 3 1
      "umull v10.4s, v4.4h, v22.4h \n"
      "umlal v10.4s, v5.4h, v21.4h \n"
      "umlal v10.4s, v6.4h, v21.4h \n"
      "umlal v10.4s, v7.4h, v20.4h \n"
      // filter second 2x2 group to produce 4th and 7th dest pixel
      // 3 9
      // 1 3
      "umull v11.4s, v4.4h, v21.4h \n"
      "umlal v11.4s, v5.4h, v22.4h \n"
      "umlal v11.4s, v6.4h, v20.4h \n"
@@ -1094,12 +1095,11 @@ void ScaleRowUp2_16_NEON(const uint16* src_ptr,
      : "r"(2LL),  // %4
        "r"(14LL)  // %5
      : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
        "v11", "v20", "v21", "v22"  // Clobber List
  );
}
#endif  // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
#ifdef __cplusplus
...
@@ -450,7 +450,6 @@ TEST_F(LibYUVScaleTest, TestScaleOdd) {
}
#endif  // HAS_SCALEROWDOWN2_SSSE3
#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
extern "C" void ScaleRowUp2_16_NEON(const uint16* src_ptr,
@@ -470,16 +469,10 @@ TEST_F(LibYUVScaleTest, TestScaleRowUp2_16) {
  for (int i = 0; i < 640 * 2 + 1; ++i) {
    orig_pixels[i] = i;
  }
  ScaleRowUp2_16_NEON(&orig_pixels[0], 640, &dst_pixels_c[0], 1280);
  for (int i = 0; i < benchmark_pixels_div1280_; ++i) {
    ScaleRowUp2_16_NEON(&orig_pixels[0], 640, &dst_pixels_opt[0], 1280);
  }
  for (int i = 0; i < 1280; ++i) {
@@ -507,29 +500,17 @@ TEST_F(LibYUVScaleTest, TestScaleRowDown2Box_16) {
  for (int i = 0; i < 2560 * 2; ++i) {
    orig_pixels[i] = i;
  }
  ScaleRowDown2Box_16_C(&orig_pixels[0], 2560, &dst_pixels_c[0], 1280);
  for (int i = 0; i < benchmark_pixels_div1280_; ++i) {
#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
    int has_neon = TestCpuFlag(kCpuHasNEON);
    if (has_neon) {
      ScaleRowDown2Box_16_NEON(&orig_pixels[0], 2560, &dst_pixels_opt[0], 1280);
    } else {
      ScaleRowDown2Box_16_C(&orig_pixels[0], 2560, &dst_pixels_opt[0], 1280);
    }
#else
    ScaleRowDown2Box_16_C(&orig_pixels[0], 2560, &dst_pixels_opt[0], 1280);
#endif
  }
...