Commit 3009890c authored by Frank Barchard, committed by Commit Bot

NV21ToRGB24_AVX2 and SSSE3

Use a two-step conversion for NV21ToRGB24 to leverage the AVX2
low-level row functions instead of C.

Was C
NV21ToRGB24_Opt (882 ms)

Now SSSE3
NV21ToRGB24_Opt (218 ms)

Bug: libyuv:778
Test: LibYUVConvertTest.NV21ToRGB24_Opt
Change-Id: I58faf766bbec4cc595aab2e217f6c874dd4b4363
Reviewed-on: https://chromium-review.googlesource.com/951629
Commit-Queue: Frank Barchard <fbarchard@chromium.org>
Reviewed-by: Richard Winterton <rrwinterton@gmail.com>
parent 98a0a157
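The benchmark above is roughly a 4x speedup (882 ms to 218 ms). For reference, the new kernels are reached through the public NV21ToRGB24() entry point; a minimal calling sketch (plane allocation and error handling elided; the umbrella libyuv.h include is used for brevity):

#include <stdint.h>
#include "libyuv.h"  // umbrella header; declares NV21ToRGB24()

// Convert one NV21 frame (Y plane plus interleaved VU plane) to packed
// RGB24. Returns 0 on success. libyuv selects the SSSE3/AVX2 row kernels
// at runtime via CPU-flag detection, so callers need no SIMD-specific code.
int ConvertNV21Frame(const uint8_t* src_y, int src_stride_y,
                     const uint8_t* src_vu, int src_stride_vu,
                     uint8_t* dst_rgb24, int dst_stride_rgb24,
                     int width, int height) {
  return NV21ToRGB24(src_y, src_stride_y, src_vu, src_stride_vu,
                     dst_rgb24, dst_stride_rgb24, width, height);
}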
@@ -105,8 +105,10 @@ extern "C" {
#define HAS_MIRRORROW_SSSE3
#define HAS_MIRRORUVROW_SSSE3
#define HAS_NV12TOARGBROW_SSSE3
#define HAS_NV12TORGB24ROW_SSSE3
#define HAS_NV12TORGB565ROW_SSSE3
#define HAS_NV21TOARGBROW_SSSE3
#define HAS_NV21TORGB24ROW_SSSE3
#define HAS_RAWTOARGBROW_SSSE3
#define HAS_RAWTORGB24ROW_SSSE3
#define HAS_RAWTOYROW_SSSE3
@@ -200,8 +202,10 @@ extern "C" {
#define HAS_MERGEUVROW_AVX2
#define HAS_MIRRORROW_AVX2
#define HAS_NV12TOARGBROW_AVX2
#define HAS_NV12TORGB24ROW_AVX2
#define HAS_NV12TORGB565ROW_AVX2
#define HAS_NV21TOARGBROW_AVX2
#define HAS_NV21TORGB24ROW_AVX2
#define HAS_SPLITUVROW_AVX2
#define HAS_UYVYTOARGBROW_AVX2
#define HAS_UYVYTOUV422ROW_AVX2
@@ -1979,11 +1983,31 @@ void NV12ToARGBRow_AVX2(const uint8_t* y_buf,
uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width);
void NV12ToRGB24Row_SSSE3(const uint8_t* src_y,
const uint8_t* src_uv,
uint8_t* dst_rgb24,
const struct YuvConstants* yuvconstants,
int width);
void NV21ToRGB24Row_SSSE3(const uint8_t* src_y,
const uint8_t* src_vu,
uint8_t* dst_rgb24,
const struct YuvConstants* yuvconstants,
int width);
void NV12ToRGB565Row_SSSE3(const uint8_t* src_y,
const uint8_t* src_uv,
uint8_t* dst_rgb565,
const struct YuvConstants* yuvconstants,
int width);
void NV12ToRGB24Row_AVX2(const uint8_t* src_y,
const uint8_t* src_uv,
uint8_t* dst_rgb24,
const struct YuvConstants* yuvconstants,
int width);
void NV21ToRGB24Row_AVX2(const uint8_t* src_y,
const uint8_t* src_vu,
uint8_t* dst_rgb24,
const struct YuvConstants* yuvconstants,
int width);
void NV12ToRGB565Row_AVX2(const uint8_t* src_y,
const uint8_t* src_uv,
uint8_t* dst_rgb565,
@@ -2169,6 +2193,26 @@ void NV21ToARGBRow_Any_AVX2(const uint8_t* y_buf,
uint8_t* dst_ptr,
const struct YuvConstants* yuvconstants,
int width);
void NV12ToRGB24Row_Any_SSSE3(const uint8_t* y_buf,
const uint8_t* uv_buf,
uint8_t* dst_ptr,
const struct YuvConstants* yuvconstants,
int width);
void NV21ToRGB24Row_Any_SSSE3(const uint8_t* y_buf,
const uint8_t* uv_buf,
uint8_t* dst_ptr,
const struct YuvConstants* yuvconstants,
int width);
void NV12ToRGB24Row_Any_AVX2(const uint8_t* y_buf,
const uint8_t* uv_buf,
uint8_t* dst_ptr,
const struct YuvConstants* yuvconstants,
int width);
void NV21ToRGB24Row_Any_AVX2(const uint8_t* y_buf,
const uint8_t* uv_buf,
uint8_t* dst_ptr,
const struct YuvConstants* yuvconstants,
int width);
void NV12ToRGB565Row_Any_SSSE3(const uint8_t* y_buf,
const uint8_t* uv_buf,
uint8_t* dst_ptr,
@@ -3358,9 +3402,9 @@ void ByteToFloatRow_NEON(const uint8_t* src,
float* dst,
float scale,
int width);
void ByteToFloatRow_Any_NEON(const uint8_t* src,
float* dst,
float scale,
void ByteToFloatRow_Any_NEON(const uint8_t* src_ptr,
float* dst_ptr,
float param,
int width);
void ARGBLumaColorTableRow_C(const uint8_t* src_argb,
@@ -1789,6 +1789,22 @@ static int NV12ToRGB24Matrix(const uint8_t* src_y,
}
}
#endif
#if defined(HAS_NV12TORGB24ROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
NV12ToRGB24Row = NV12ToRGB24Row_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
NV12ToRGB24Row = NV12ToRGB24Row_SSSE3;
}
}
#endif
#if defined(HAS_NV12TORGB24ROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
NV12ToRGB24Row = NV12ToRGB24Row_Any_AVX2;
if (IS_ALIGNED(width, 32)) {
NV12ToRGB24Row = NV12ToRGB24Row_AVX2;
}
}
#endif
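  // NV12ToRGB24Row now points at the best kernel available: the exact SIMD
  // version when width is a multiple of the vector step (16 pixels for
  // SSSE3, 32 for AVX2), otherwise the _Any_ wrapper, which runs the SIMD
  // body on the aligned portion and finishes the ragged tail through a
  // zero-padded temp buffer (see ANY21C in row_any.cc). NV21ToRGB24Matrix
  // below uses the same dispatch pattern.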
for (y = 0; y < height; ++y) {
NV12ToRGB24Row(src_y, src_uv, dst_rgb24, yuvconstants, width);
@@ -1832,6 +1848,22 @@ static int NV21ToRGB24Matrix(const uint8_t* src_y,
}
}
#endif
#if defined(HAS_NV21TORGB24ROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
NV21ToRGB24Row = NV21ToRGB24Row_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
NV21ToRGB24Row = NV21ToRGB24Row_SSSE3;
}
}
#endif
#if defined(HAS_NV21TORGB24ROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
NV21ToRGB24Row = NV21ToRGB24Row_Any_AVX2;
if (IS_ALIGNED(width, 32)) {
NV21ToRGB24Row = NV21ToRGB24Row_AVX2;
}
}
#endif
for (y = 0; y < height; ++y) {
NV21ToRGB24Row(src_y, src_vu, dst_rgb24, yuvconstants, width);
@@ -1844,8 +1876,8 @@ static int NV21ToRGB24Matrix(const uint8_t* src_y,
return 0;
}
// TODO(fbarchard): NV12ToRAW can be implemented by mirrored matrix.
// Convert NV12 to RGB24.
LIBYUV_API
int NV12ToRGB24(const uint8_t* src_y,
int src_stride_y,
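The TODO above refers to libyuv's mirrored-matrix trick: RAW is RGB24 with R and B swapped, and routing NV12's interleaved UV plane through the NV21 (VU) path with the YVU variant of the conversion constants swaps R and B in the output. A hedged sketch of what that could look like (kYvuI601Constants is a real libyuv constant; this wiring is illustrative and not part of this commit):

// Hypothetical: NV12ToRAW via the NV21/RGB24 path with mirrored (YVU)
// constants. Treating the UV plane as VU while using the YVU matrix
// flips R and B, turning the RGB24 output into RAW.
LIBYUV_API
int NV12ToRAW(const uint8_t* src_y, int src_stride_y,
              const uint8_t* src_uv, int src_stride_uv,
              uint8_t* dst_raw, int dst_stride_raw,
              int width, int height) {
  return NV21ToRGB24Matrix(src_y, src_stride_y, src_uv, src_stride_uv,
                           dst_raw, dst_stride_raw, &kYvuI601Constants,
                           width, height);
}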
@@ -341,18 +341,18 @@ ANY21(SobelXYRow_Any_MSA, SobelXYRow_MSA, 0, 1, 1, 4, 15)
#define ANY21C(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, SBPP2, BPP, MASK) \
void NAMEANY(const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* dst_ptr, \
const struct YuvConstants* yuvconstants, int width) { \
SIMD_ALIGNED(uint8_t temp[64 * 3]); \
memset(temp, 0, 64 * 2); /* for msan */ \
SIMD_ALIGNED(uint8_t temp[128 * 3]); \
memset(temp, 0, 128 * 2); /* for msan */ \
int r = width & MASK; \
int n = width & ~MASK; \
if (n > 0) { \
ANY_SIMD(y_buf, uv_buf, dst_ptr, yuvconstants, n); \
} \
memcpy(temp, y_buf + n * SBPP, r * SBPP); \
memcpy(temp + 64, uv_buf + (n >> UVSHIFT) * SBPP2, \
memcpy(temp + 128, uv_buf + (n >> UVSHIFT) * SBPP2, \
SS(r, UVSHIFT) * SBPP2); \
ANY_SIMD(temp, temp + 64, temp + 128, yuvconstants, MASK + 1); \
memcpy(dst_ptr + n * BPP, temp + 128, r * BPP); \
ANY_SIMD(temp, temp + 128, temp + 256, yuvconstants, MASK + 1); \
memcpy(dst_ptr + n * BPP, temp + 256, r * BPP); \
}
// Biplanar to RGB.
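The temp slots in ANY21C grow from 64 to 128 bytes because the new AVX2 variants step 32 pixels at a time (MASK = 31): a 32-pixel RGB24 tail writes 96 output bytes, which no longer fits a 64-byte slot. A hand-expanded sketch of one instantiation, for illustration only (the real function is generated by the macro above; SS(r, 1) is libyuv's round-up shift, (r + 1) / 2):

// Expansion of ANY21C(NV21ToRGB24Row_Any_AVX2, NV21ToRGB24Row_AVX2,
//                     1 /*UVSHIFT*/, 1 /*SBPP*/, 2 /*SBPP2*/,
//                     3 /*BPP*/, 31 /*MASK*/).
void NV21ToRGB24Row_Any_AVX2(const uint8_t* y_buf,
                             const uint8_t* uv_buf,
                             uint8_t* dst_ptr,
                             const struct YuvConstants* yuvconstants,
                             int width) {
  SIMD_ALIGNED(uint8_t temp[128 * 3]);  // Y slot, VU slot, RGB24 slot
  memset(temp, 0, 128 * 2);             /* zero the input slots for msan */
  int r = width & 31;                   // leftover pixels (0..31)
  int n = width & ~31;                  // largest multiple of 32
  if (n > 0) {
    NV21ToRGB24Row_AVX2(y_buf, uv_buf, dst_ptr, yuvconstants, n);
  }
  // Copy the ragged tail into the padded slots, run one full 32-pixel
  // step, then copy out only the r pixels that are actually valid.
  memcpy(temp, y_buf + n, r);                               // 1 byte/pixel Y
  memcpy(temp + 128, uv_buf + (n >> 1) * 2, SS(r, 1) * 2);  // VU byte pairs
  NV21ToRGB24Row_AVX2(temp, temp + 128, temp + 256, yuvconstants, 32);
  memcpy(dst_ptr + n * 3, temp + 256, r * 3);               // 3 bytes/pixel
}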
@@ -386,6 +386,18 @@ ANY21C(NV12ToRGB24Row_Any_NEON, NV12ToRGB24Row_NEON, 1, 1, 2, 3, 7)
#ifdef HAS_NV21TORGB24ROW_NEON
ANY21C(NV21ToRGB24Row_Any_NEON, NV21ToRGB24Row_NEON, 1, 1, 2, 3, 7)
#endif
#ifdef HAS_NV12TORGB24ROW_SSSE3
ANY21C(NV12ToRGB24Row_Any_SSSE3, NV12ToRGB24Row_SSSE3, 1, 1, 2, 3, 15)
#endif
#ifdef HAS_NV21TORGB24ROW_SSSE3
ANY21C(NV21ToRGB24Row_Any_SSSE3, NV21ToRGB24Row_SSSE3, 1, 1, 2, 3, 15)
#endif
#ifdef HAS_NV12TORGB24ROW_AVX2
ANY21C(NV12ToRGB24Row_Any_AVX2, NV12ToRGB24Row_AVX2, 1, 1, 2, 3, 31)
#endif
#ifdef HAS_NV21TORGB24ROW_AVX2
ANY21C(NV21ToRGB24Row_Any_AVX2, NV21ToRGB24Row_AVX2, 1, 1, 2, 3, 31)
#endif
#ifdef HAS_NV12TORGB565ROW_SSSE3
ANY21C(NV12ToRGB565Row_Any_SSSE3, NV12ToRGB565Row_SSSE3, 1, 1, 2, 2, 7)
#endif
@@ -2953,6 +2953,88 @@ void NV12ToRGB565Row_SSSE3(const uint8_t* src_y,
}
#endif
#if defined(HAS_NV12TORGB24ROW_SSSE3)
void NV12ToRGB24Row_SSSE3(const uint8_t* src_y,
const uint8_t* src_uv,
uint8_t* dst_rgb24,
const struct YuvConstants* yuvconstants,
int width) {
// Row buffer for intermediate ARGB pixels.
SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]);
while (width > 0) {
int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
NV12ToARGBRow_SSSE3(src_y, src_uv, row, yuvconstants, twidth);
ARGBToRGB24Row_SSSE3(row, dst_rgb24, twidth);
src_y += twidth;
src_uv += twidth;
dst_rgb24 += twidth * 3;
width -= twidth;
}
}
#endif
#if defined(HAS_NV21TORGB24ROW_SSSE3)
void NV21ToRGB24Row_SSSE3(const uint8_t* src_y,
const uint8_t* src_vu,
uint8_t* dst_rgb24,
const struct YuvConstants* yuvconstants,
int width) {
// Row buffer for intermediate ARGB pixels.
SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]);
while (width > 0) {
int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
NV21ToARGBRow_SSSE3(src_y, src_vu, row, yuvconstants, twidth);
ARGBToRGB24Row_SSSE3(row, dst_rgb24, twidth);
src_y += twidth;
src_vu += twidth;
dst_rgb24 += twidth * 3;
width -= twidth;
}
}
#endif
#if defined(HAS_NV12TORGB24ROW_AVX2)
void NV12ToRGB24Row_AVX2(const uint8_t* src_y,
const uint8_t* src_uv,
uint8_t* dst_rgb24,
const struct YuvConstants* yuvconstants,
int width) {
// Row buffer for intermediate ARGB pixels.
SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]);
while (width > 0) {
int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
NV12ToARGBRow_AVX2(src_y, src_uv, row, yuvconstants, twidth);
// TODO(fbarchard): ARGBToRGB24Row_AVX2
ARGBToRGB24Row_SSSE3(row, dst_rgb24, twidth);
src_y += twidth;
src_uv += twidth;
dst_rgb24 += twidth * 3;
width -= twidth;
}
}
#endif
#if defined(HAS_NV21TORGB24ROW_AVX2)
void NV21ToRGB24Row_AVX2(const uint8_t* src_y,
const uint8_t* src_vu,
uint8_t* dst_rgb24,
const struct YuvConstants* yuvconstants,
int width) {
// Row buffer for intermediate ARGB pixels.
SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]);
while (width > 0) {
int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
NV21ToARGBRow_AVX2(src_y, src_vu, row, yuvconstants, twidth);
// TODO(fbarchard): ARGBToRGB24Row_AVX2
ARGBToRGB24Row_SSSE3(row, dst_rgb24, twidth);
src_y += twidth;
src_vu += twidth;
dst_rgb24 += twidth * 3;
width -= twidth;
}
}
#endif
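A note on the four wrappers above: each pass converts at most MAXTWIDTH pixels through the stack-resident ARGB row buffer, so arbitrarily wide rows run in fixed stack space (MAXTWIDTH is 2048 pixels in libyuv, making the buffer 8 KiB), and per the TODO comments the AVX2 wrappers still repack with ARGBToRGB24Row_SSSE3 because no AVX2 repack row existed at the time of this change. A standalone sketch of the chunk loop:

#include <stdio.h>

#define MAXTWIDTH 2048 /* libyuv's per-pass pixel limit */

int main(void) {
  // Demonstrates how the two-step wrappers split one 5000-pixel row.
  int width = 5000;
  while (width > 0) {
    int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
    printf("convert %d pixels, %d remaining\n", twidth, width - twidth);
    width -= twidth;
  }
  return 0;  // prints 2048, then 2048, then 904
}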
#if defined(HAS_I422TORGB565ROW_AVX2)
void I422ToRGB565Row_AVX2(const uint8_t* src_y,
const uint8_t* src_u,