Commit b6e8e9aa authored by Manojkumar Bhosale, committed by Frank Barchard

Add MSA optimized HalfFloatRow function

TBR=kjellander@chromium.org
R=fbarchard@google.com

Bug:libyuv:634
Change-Id: I54a2c57d66093b887c8ba31fd7a21a102165393a
Reviewed-on: https://chromium-review.googlesource.com/628557
Reviewed-by: Frank Barchard <fbarchard@google.com>
parent f0a9d6d2
......@@ -424,6 +424,7 @@ extern "C" {
#define HAS_ARGBEXTRACTALPHAROW_MSA
#define HAS_SPLITUVROW_MSA
#define HAS_MIRRORUVROW_MSA
#define HAS_HALFFLOATROW_MSA
#ifndef DISABLE_CLANG_MSA
#define HAS_ABGRTOUVROW_MSA
......@@ -3190,6 +3191,11 @@ void HalfFloat1Row_Any_NEON(const uint16* src,
uint16* dst,
float scale,
int width);
// MSA variants of the half-float row conversion: scale `width` 16-bit samples
// from `src` and write them as 16-bit half floats to `dst`.
// HalfFloatRow_MSA consumes the row in groups of 32 samples; the _Any_
// variant is the wrapper used when width is not a multiple of 32.
void HalfFloatRow_MSA(const uint16* src, uint16* dst, float scale, int width);
void HalfFloatRow_Any_MSA(const uint16* src,
uint16* dst,
float scale,
int width);
void ARGBLumaColorTableRow_C(const uint8* src_argb,
uint8* dst_argb,
......
......@@ -2941,6 +2941,14 @@ int HalfFloatPlane(const uint16* src_y,
}
}
#endif
#if defined(HAS_HALFFLOATROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
HalfFloatRow = HalfFloatRow_Any_MSA;
if (IS_ALIGNED(width, 32)) {
HalfFloatRow = HalfFloatRow_MSA;
}
}
#endif
for (y = 0; y < height; ++y) {
HalfFloatRow(src_y, dst_y, scale, width);
......
......@@ -749,6 +749,9 @@ ANY11P16(HalfFloat1Row_Any_F16C, HalfFloat1Row_F16C, float, 2, 2, 15)
ANY11P16(HalfFloatRow_Any_NEON, HalfFloatRow_NEON, float, 2, 2, 7)
ANY11P16(HalfFloat1Row_Any_NEON, HalfFloat1Row_NEON, float, 2, 2, 7)
#endif
#ifdef HAS_HALFFLOATROW_MSA
// Generates HalfFloatRow_Any_MSA around the HalfFloatRow_MSA kernel.
// NOTE(review): the trailing 31 is presumably the width mask matching the
// kernel's 32-sample step - confirm against the ANY11P16 definition.
ANY11P16(HalfFloatRow_Any_MSA, HalfFloatRow_MSA, float, 2, 2, 31)
#endif
#undef ANY11P16
// Any 1 to 1 with yuvconstants
......
......@@ -3458,6 +3458,64 @@ void SobelYRow_MSA(const uint8* src_y0,
}
#endif
// Converts a row of unsigned 16-bit samples to 16-bit half floats using MSA.
//
// Each sample is converted to a 32-bit float, multiplied by `scale` (with an
// extra exponent-rebias factor folded in), and the resulting float bit pattern
// is shifted and narrowed to 16 bits.
//
//   src   - input samples (uint16).
//   dst   - output buffer, one 16-bit value per input sample.
//   scale - multiplier applied to every sample before conversion.
//   width - number of samples. The loop always handles whole groups of 32, so
//           a width that is not a multiple of 32 reads and writes past
//           `width` elements; the caller is expected to pass a multiple of 32.
void HalfFloatRow_MSA(const uint16* src, uint16* dst, float scale, int width) {
int i;
v8u16 src0, src1, src2, src3, dst0, dst1, dst2, dst3;
v4u32 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
v4f32 fvec0, fvec1, fvec2, fvec3, fvec4, fvec5, fvec6, fvec7;
v4f32 mult_vec;
v8i16 zero = {0};
// 1.9259299444e-34f is approximately 2^-112. NOTE(review): 112 is the
// difference between the float (127) and half (15) exponent biases, so this
// factor appears to rebias the exponent for the bit shift below - confirm.
mult_vec[0] = 1.9259299444e-34f * scale;
// Broadcast lane 0 to all four lanes (the other lanes were never set).
mult_vec = (v4f32)__msa_splati_w((v4i32)mult_vec, 0);
for (i = 0; i < width; i += 32) {
// Load 32 samples as four vectors of eight halfwords (byte offsets 0..48).
src0 = (v8u16)__msa_ld_h((v8i16*)src, 0);
src1 = (v8u16)__msa_ld_h((v8i16*)src, 16);
src2 = (v8u16)__msa_ld_h((v8i16*)src, 32);
src3 = (v8u16)__msa_ld_h((v8i16*)src, 48);
// Interleave with zero halfwords to widen each sample to an unsigned 32-bit
// lane: ilvr takes the right (low) half of the vector, ilvl the left half.
vec0 = (v4u32)__msa_ilvr_h(zero, (v8i16)src0);
vec1 = (v4u32)__msa_ilvl_h(zero, (v8i16)src0);
vec2 = (v4u32)__msa_ilvr_h(zero, (v8i16)src1);
vec3 = (v4u32)__msa_ilvl_h(zero, (v8i16)src1);
vec4 = (v4u32)__msa_ilvr_h(zero, (v8i16)src2);
vec5 = (v4u32)__msa_ilvl_h(zero, (v8i16)src2);
vec6 = (v4u32)__msa_ilvr_h(zero, (v8i16)src3);
vec7 = (v4u32)__msa_ilvl_h(zero, (v8i16)src3);
// Unsigned 32-bit integer -> 32-bit float.
fvec0 = __msa_ffint_u_w(vec0);
fvec1 = __msa_ffint_u_w(vec1);
fvec2 = __msa_ffint_u_w(vec2);
fvec3 = __msa_ffint_u_w(vec3);
fvec4 = __msa_ffint_u_w(vec4);
fvec5 = __msa_ffint_u_w(vec5);
fvec6 = __msa_ffint_u_w(vec6);
fvec7 = __msa_ffint_u_w(vec7);
// Apply scale * 2^-112 to every lane.
fvec0 *= mult_vec;
fvec1 *= mult_vec;
fvec2 *= mult_vec;
fvec3 *= mult_vec;
fvec4 *= mult_vec;
fvec5 *= mult_vec;
fvec6 *= mult_vec;
fvec7 *= mult_vec;
// Reinterpret the float bits as integers and drop the low 13 bits
// (23 float mantissa bits - 10 half mantissa bits). This truncates; it does
// not round.
vec0 = ((v4u32)fvec0) >> 13;
vec1 = ((v4u32)fvec1) >> 13;
vec2 = ((v4u32)fvec2) >> 13;
vec3 = ((v4u32)fvec3) >> 13;
vec4 = ((v4u32)fvec4) >> 13;
vec5 = ((v4u32)fvec5) >> 13;
vec6 = ((v4u32)fvec6) >> 13;
vec7 = ((v4u32)fvec7) >> 13;
// Keep the even-indexed halfword of every 32-bit lane (the low 16 bits,
// assuming little-endian lane layout), packing two vectors into one.
dst0 = (v8u16)__msa_pckev_h((v8i16)vec1, (v8i16)vec0);
dst1 = (v8u16)__msa_pckev_h((v8i16)vec3, (v8i16)vec2);
dst2 = (v8u16)__msa_pckev_h((v8i16)vec5, (v8i16)vec4);
dst3 = (v8u16)__msa_pckev_h((v8i16)vec7, (v8i16)vec6);
// ST_UH2 is a project macro defined elsewhere; presumably it stores the two
// vectors at dst and dst + 8 (stride given in elements) - confirm.
ST_UH2(dst0, dst1, dst, 8);
ST_UH2(dst2, dst3, dst + 16, 8);
src += 32;
dst += 32;
}
}
#ifdef __cplusplus
} // extern "C"
} // namespace libyuv
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment