Commit 532f5708 authored by Frank Barchard

Add MSA optimized I422AlphaToARGBRow_MSA and I422ToRGB24Row_MSA functions

R=fbarchard@google.com
BUG=libyuv:634

Performance Gain (vs vectorized C)
I422AlphaToARGBRow_MSA      : ~1.4x
I422AlphaToARGBRow_Any_MSA  : ~1.4x
I422ToRGB24Row_MSA          : ~4.8x
I422ToRGB24Row_Any_MSA      : ~4.8x

Performance Gain (vs non-vectorized C)
I422AlphaToARGBRow_MSA      : ~7.0x
I422AlphaToARGBRow_Any_MSA  : ~7.0x
I422ToRGB24Row_MSA          : ~7.9x
I422ToRGB24Row_Any_MSA      : ~7.7x

Review URL: https://codereview.chromium.org/2454433003 .
parent 02ae8b60
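For context, a minimal usage sketch (not part of this commit): callers do not change, because the new rows are reached through the existing one-shot converters. The snippet below assumes the public I420AlphaToARGB() and I420ToRGB24() entry points that wrap I420AlphaToARGBMatrix()/I420ToRGB24Matrix(), an even width, and tightly packed planes; on MIPS builds, TestCpuFlag(kCpuHasMSA) inside those converters selects the MSA rows added here.

#include "libyuv.h"

// Convert one I420 frame (with an optional alpha plane) via the high-level
// API; the row-level kernels, including the new MSA ones, are chosen
// internally at runtime.
void ConvertFrame(const uint8* y, const uint8* u, const uint8* v,
                  const uint8* a, uint8* argb, uint8* rgb24,
                  int width, int height) {
  // I420 + alpha plane -> ARGB; attenuate = 0 leaves alpha unpremultiplied.
  libyuv::I420AlphaToARGB(y, width, u, width / 2, v, width / 2, a, width,
                          argb, width * 4, width, height, 0);
  // I420 -> packed 24-bit RGB.
  libyuv::I420ToRGB24(y, width, u, width / 2, v, width / 2,
                      rgb24, width * 3, width, height);
}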
Name: libyuv
URL: http://code.google.com/p/libyuv/
-Version: 1631
+Version: 1632
License: BSD
License File: LICENSE
......
@@ -383,6 +383,8 @@ extern "C" {
#define HAS_ARGBTOUVROW_MSA
#define HAS_I422TOARGBROW_MSA
#define HAS_I422TORGBAROW_MSA
#define HAS_I422ALPHATOARGBROW_MSA
#define HAS_I422TORGB24ROW_MSA
#endif
#if defined(_MSC_VER) && !defined(__CLR_VER) && !defined(__clang__)
@@ -663,6 +665,19 @@ void I422ToRGBARow_MSA(const uint8* src_y,
                       uint8* dst_rgba,
                       const struct YuvConstants* yuvconstants,
                       int width);
void I422AlphaToARGBRow_MSA(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
const uint8* a_buf,
uint8* dst_argb,
const struct YuvConstants* yuvconstants,
int width);
void I422ToRGB24Row_MSA(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
uint8* dst_rgb24,
const struct YuvConstants* yuvconstants,
int width);
void ARGBToYRow_AVX2(const uint8* src_argb, uint8* dst_y, int width);
void ARGBToYRow_Any_AVX2(const uint8* src_argb, uint8* dst_y, int width);
@@ -1653,6 +1668,19 @@ void I422ToRGBARow_Any_MSA(const uint8* src_y,
                           uint8* dst_argb,
                           const struct YuvConstants* yuvconstants,
                           int width);
void I422AlphaToARGBRow_Any_MSA(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
const uint8* src_a,
uint8* dst_argb,
const struct YuvConstants* yuvconstants,
int width);
void I422ToRGB24Row_Any_MSA(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
uint8* dst_rgb24,
const struct YuvConstants* yuvconstants,
int width);
void YUY2ToYRow_AVX2(const uint8* src_yuy2, uint8* dst_y, int width);
void YUY2ToUVRow_AVX2(const uint8* src_yuy2, int stride_yuy2,
......
@@ -11,6 +11,6 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_
#define INCLUDE_LIBYUV_VERSION_H_
-#define LIBYUV_VERSION 1631
+#define LIBYUV_VERSION 1632
#endif // INCLUDE_LIBYUV_VERSION_H_
@@ -564,6 +564,14 @@ static int I420AlphaToARGBMatrix(const uint8* src_y, int src_stride_y,
    I422AlphaToARGBRow = I422AlphaToARGBRow_DSPR2;
  }
#endif
#if defined(HAS_I422ALPHATOARGBROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
I422AlphaToARGBRow = I422AlphaToARGBRow_Any_MSA;
if (IS_ALIGNED(width, 8)) {
I422AlphaToARGBRow = I422AlphaToARGBRow_MSA;
}
}
#endif
#if defined(HAS_ARGBATTENUATEROW_SSSE3)
  if (TestCpuFlag(kCpuHasSSSE3)) {
    ARGBAttenuateRow = ARGBAttenuateRow_Any_SSSE3;
......
@@ -558,6 +558,14 @@ static int I420ToRGB24Matrix(const uint8* src_y, int src_stride_y,
    }
  }
#endif
#if defined(HAS_I422TORGB24ROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
I422ToRGB24Row = I422ToRGB24Row_Any_MSA;
if (IS_ALIGNED(width, 16)) {
I422ToRGB24Row = I422ToRGB24Row_MSA;
}
}
#endif
  for (y = 0; y < height; ++y) {
    I422ToRGB24Row(src_y, src_u, src_v, dst_rgb24, yuvconstants, width);
......
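Both dispatch hunks above use libyuv's usual two-step selection; the annotated restatement of the RGB24 case below is illustrative only, not new code. The alignment constants match the per-call pixel counts implied by the ANY41C/ANY31C masks added further down: 8 pixels for I422AlphaToARGBRow_MSA, 16 for I422ToRGB24Row_MSA; IS_ALIGNED(width, 16) is simply a power-of-two remainder test, i.e. (width % 16) == 0.

#if defined(HAS_I422TORGB24ROW_MSA)
  if (TestCpuFlag(kCpuHasMSA)) {
    // Works for any width: the _Any_ wrapper handles the leftover pixels.
    I422ToRGB24Row = I422ToRGB24Row_Any_MSA;
    if (IS_ALIGNED(width, 16)) {
      // Width is a whole number of 16-pixel vector steps, so the raw MSA
      // row can be called directly and the remainder handling is skipped.
      I422ToRGB24Row = I422ToRGB24Row_MSA;
    }
  }
#endif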
@@ -53,6 +53,9 @@ ANY41C(I422AlphaToARGBRow_Any_AVX2, I422AlphaToARGBRow_AVX2, 1, 0, 4, 15)
#ifdef HAS_I422ALPHATOARGBROW_NEON
ANY41C(I422AlphaToARGBRow_Any_NEON, I422AlphaToARGBRow_NEON, 1, 0, 4, 7)
#endif
#ifdef HAS_I422ALPHATOARGBROW_MSA
ANY41C(I422AlphaToARGBRow_Any_MSA, I422AlphaToARGBRow_MSA, 1, 0, 4, 7)
#endif
#undef ANY41C
// Any 3 planes to 1.
@@ -168,6 +171,7 @@ ANY31C(I422ToRGB565Row_Any_NEON, I422ToRGB565Row_NEON, 1, 0, 2, 7)
#ifdef HAS_I422TOARGBROW_MSA
ANY31C(I422ToARGBRow_Any_MSA, I422ToARGBRow_MSA, 1, 0, 4, 7)
ANY31C(I422ToRGBARow_Any_MSA, I422ToRGBARow_MSA, 1, 0, 4, 7)
ANY31C(I422ToRGB24Row_Any_MSA, I422ToRGB24Row_MSA, 1, 0, 3, 15)
#endif
#undef ANY31C
......
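The ANY41C/ANY31C entries above generate the _Any_MSA wrappers that the dispatch code installs for unaligned widths; the last macro argument is the width mask, so 7 means the MSA row consumes 8 pixels per call and 15 means 16. Below is a simplified, hedged sketch of what such a wrapper does for the RGB24 row; the real macro expansion in row_any.cc differs in detail (buffer layout, shift parameters), so treat this as an outline of the pattern rather than the actual code.

#include <string.h>
#include "libyuv/row.h"  // uint8, struct YuvConstants, I422ToRGB24Row_MSA

// Hedged sketch of the Any-wrapper pattern (not the real ANY31C expansion):
// run the MSA row over the largest multiple of its 16-pixel step, then push
// the remaining pixels through a small aligned scratch buffer so the vector
// code never reads or writes outside the row.
void I422ToRGB24Row_Any_MSA_Sketch(const uint8* src_y, const uint8* src_u,
                                   const uint8* src_v, uint8* dst_rgb24,
                                   const struct YuvConstants* yuvconstants,
                                   int width) {
  SIMD_ALIGNED(uint8 temp[64 * 4]);
  int n = width & ~15;  // bulk: multiple of 16 pixels (mask 15)
  int r = width & 15;   // leftover pixels
  if (n > 0) {
    I422ToRGB24Row_MSA(src_y, src_u, src_v, dst_rgb24, yuvconstants, n);
  }
  if (r > 0) {
    memset(temp, 0, sizeof(temp));
    memcpy(temp, src_y + n, r);                      // tail Y
    memcpy(temp + 64, src_u + n / 2, (r + 1) / 2);   // tail U (2x subsampled)
    memcpy(temp + 128, src_v + n / 2, (r + 1) / 2);  // tail V (2x subsampled)
    I422ToRGB24Row_MSA(temp, temp + 64, temp + 128, temp + 192, yuvconstants,
                       16);                          // one full vector pass
    memcpy(dst_rgb24 + n * 3, temp + 192, r * 3);    // copy only valid bytes
  }
}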
This diff is collapsed.
@@ -2711,7 +2711,6 @@ void SobelYRow_NEON(const uint8* src_y0, const uint8* src_y1,
  );
}
void HalfFloat1Row_NEON(const uint16* src, uint16* dst, float, int width) {
  asm volatile (
    "1: \n"
@@ -2735,31 +2734,6 @@ void HalfFloat1Row_NEON(const uint16* src, uint16* dst, float, int width) {
  );
}
void HalfFloatRow_NEON2(const uint16* src, uint16* dst, float scale, int width) {
asm volatile (
"1: \n"
MEMACCESS(0)
"ld1 {v1.16b}, [%0], #16 \n" // load 8 shorts
"subs %w2, %w2, #8 \n" // 8 pixels per loop
"uxtl v2.4s, v1.4h \n" // 8 int's
"uxtl2 v1.4s, v1.8h \n"
"scvtf v2.4s, v2.4s \n" // 8 floats
"scvtf v1.4s, v1.4s \n"
"fmul v2.4s, v2.4s, %3.s[0] \n" // adjust exponent
"fmul v1.4s, v1.4s, %3.s[0] \n"
"uqshrn v4.4h, v2.4s, #13 \n" // isolate halffloat
"uqshrn2 v4.8h, v1.4s, #13 \n"
MEMACCESS(1)
"st1 {v4.16b}, [%1], #16 \n" // store 8 shorts
"b.gt 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2
: "w"(scale * 1.9259299444e-34f) // %3
: "cc", "memory", "v1", "v2", "v4"
);
}
void HalfFloatRow_NEON(const uint16* src, uint16* dst, float scale, int width) {
  asm volatile (
    "1: \n"
......
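Unrelated cleanup in the same diff: the deleted HalfFloatRow_NEON2 above was a duplicate of HalfFloatRow_NEON, which remains. For readers of that code, the multiplier scale * 1.9259299444e-34f is scale * 2^-112; pre-scaling by 2^-112 cancels the float32/float16 exponent-bias difference (127 - 15 = 112), so shifting the float32 bit pattern right by 13 bits (what uqshrn #13 does, with saturation) yields the binary16 encoding directly. A hedged scalar equivalent, ignoring the saturation the NEON instruction provides:

#include <stdint.h>
#include <string.h>

// Scalar sketch of the half-float trick used by HalfFloatRow_NEON (for
// illustration only; overflow saturation is omitted).
static uint16_t ScaleToHalf(uint16_t v, float scale) {
  float f = (float)v * (scale * 1.9259299444e-34f);  // scale * 2^-112
  uint32_t bits;
  memcpy(&bits, &f, sizeof(bits));  // reinterpret the float32 bit pattern
  // The exponent is already re-biased by the 2^-112 multiply; dropping the
  // low 13 bits narrows the 23-bit mantissa to the 10-bit half mantissa.
  return (uint16_t)(bits >> 13);
}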