Commit 914a9856 authored by Frank Barchard

Reimplement NV21ToARGB to allow different color matrix.

The low-level row function for NV21ToARGB has been rewritten to accept the
same YUV matrix used by the other YUV-to-ARGB functions.
Previously NV21 was implemented for Windows by calling the NV12 row function
with a different matrix that swapped U and V.  But the Arm version of the
low-level function does not allow the U and V contributions of the matrix to
be swapped.
A new low-level function that reads NV21 directly and uses the same
yuvconstants as the other YUV conversion functions allows an Arm port of
this function.

TBR=harryjin@google.com
BUG=libyuv:500

Review URL: https://codereview.chromium.org/1388273002 .
parent 68fa59c8
......@@ -145,13 +145,6 @@ int NV12ToRGB565(const uint8* src_y, int src_stride_y,
uint8* dst_rgb565, int dst_stride_rgb565,
int width, int height);
// Convert NV21 to RGB565.
// src_y is the luma plane and src_uv the second (chroma) plane, each with its
// own stride; the result is written to dst_rgb565.
// NOTE(review): the definition names the chroma parameter src_vu, i.e. the
// plane is expected in V,U byte order despite the name used here.
LIBYUV_API
int NV21ToRGB565(const uint8* src_y, int src_stride_y,
const uint8* src_uv, int src_stride_uv,
uint8* dst_rgb565, int dst_stride_rgb565,
int width, int height);
// I422ToARGB is in convert_argb.h
// Convert I422 to BGRA.
LIBYUV_API
......
......@@ -126,6 +126,7 @@ extern "C" {
#define HAS_MIRRORUVROW_SSSE3
#define HAS_NV12TOARGBROW_SSSE3
#define HAS_NV12TORGB565ROW_SSSE3
#define HAS_NV21TOARGBROW_SSSE3
#define HAS_RAWTOARGBROW_SSSE3
#define HAS_RAWTOYROW_SSSE3
#define HAS_RGB24TOARGBROW_SSSE3
......@@ -249,6 +250,7 @@ extern "C" {
#define HAS_YUY2TOUVROW_AVX2
#define HAS_YUY2TOYROW_AVX2
#define HAS_NV12TOARGBROW_AVX2
#define HAS_NV21TOARGBROW_AVX2
#define HAS_I422ALPHATOARGBROW_AVX2
#define HAS_I422ALPHATOABGRROW_AVX2
......@@ -312,6 +314,7 @@ extern "C" {
#define HAS_MIRRORUVROW_NEON
#define HAS_NV12TOARGBROW_NEON
#define HAS_NV12TORGB565ROW_NEON
#define HAS_NV21TOARGBROW_NEON
#define HAS_RAWTOARGBROW_NEON
#define HAS_RAWTOUVROW_NEON
#define HAS_RAWTOYROW_NEON
......@@ -632,6 +635,11 @@ void NV12ToRGB565Row_NEON(const uint8* src_y,
uint8* dst_rgb565,
struct YuvConstants* yuvconstants,
int width);
// NEON row function: converts one row of NV21 (a Y row plus an interleaved
// VU row) to ARGB.  The implementations process 8 pixels per loop iteration.
void NV21ToARGBRow_NEON(const uint8* src_y,
const uint8* src_vu,
uint8* dst_argb,
struct YuvConstants* yuvconstants,
int width);
void YUY2ToARGBRow_NEON(const uint8* src_yuy2,
uint8* dst_argb,
struct YuvConstants* yuvconstants,
......@@ -1075,6 +1083,11 @@ void NV12ToRGB565Row_C(const uint8* src_y,
uint8* dst_argb,
struct YuvConstants* yuvconstants,
int width);
// Portable C row function: converts one row of NV21 to ARGB.
// src_vu is the interleaved chroma row in V,U byte order (the definition
// reads V from byte 0 and U from byte 1 of each pair).
void NV21ToARGBRow_C(const uint8* src_y,
                     const uint8* src_vu,
                     uint8* dst_argb,
                     struct YuvConstants* yuvconstants,
                     int width);
void YUY2ToARGBRow_C(const uint8* src_yuy2,
uint8* dst_argb,
struct YuvConstants* yuvconstants,
......@@ -1293,6 +1306,16 @@ void NV12ToRGB565Row_AVX2(const uint8* src_y,
uint8* dst_argb,
struct YuvConstants* yuvconstants,
int width);
// SSSE3 row function: converts one row of NV21 to ARGB, 8 pixels per loop
// iteration.  src_vu is the interleaved chroma row in V,U byte order.
void NV21ToARGBRow_SSSE3(const uint8* src_y,
                         const uint8* src_vu,
                         uint8* dst_argb,
                         struct YuvConstants* yuvconstants,
                         int width);
// AVX2 row function: converts one row of NV21 to ARGB, 16 pixels per loop
// iteration.  src_vu is the interleaved chroma row in V,U byte order.
void NV21ToARGBRow_AVX2(const uint8* src_y,
                        const uint8* src_vu,
                        uint8* dst_argb,
                        struct YuvConstants* yuvconstants,
                        int width);
void YUY2ToARGBRow_SSSE3(const uint8* src_yuy2,
uint8* dst_argb,
struct YuvConstants* yuvconstants,
......@@ -1491,6 +1514,16 @@ void NV12ToARGBRow_Any_AVX2(const uint8* src_y,
uint8* dst_argb,
struct YuvConstants* yuvconstants,
int width);
// "Any width" SSSE3 variant of the NV21 to ARGB row function.  NV21ToARGB
// selects it when SSSE3 is available and switches to NV21ToARGBRow_SSSE3
// only when width is a multiple of 8.
void NV21ToARGBRow_Any_SSSE3(const uint8* src_y,
const uint8* src_vu,
uint8* dst_argb,
struct YuvConstants* yuvconstants,
int width);
// "Any width" AVX2 variant of the NV21 to ARGB row function.  NV21ToARGB
// selects it when AVX2 is available and switches to NV21ToARGBRow_AVX2
// only when width is a multiple of 16.
void NV21ToARGBRow_Any_AVX2(const uint8* src_y,
const uint8* src_vu,
uint8* dst_argb,
struct YuvConstants* yuvconstants,
int width);
void NV12ToRGB565Row_Any_SSSE3(const uint8* src_y,
const uint8* src_uv,
uint8* dst_argb,
......@@ -1756,6 +1789,11 @@ void NV12ToARGBRow_Any_NEON(const uint8* src_y,
uint8* dst_argb,
struct YuvConstants* yuvconstants,
int width);
// "Any width" NEON variant of the NV21 to ARGB row function.  NV21ToARGB
// selects it when NEON is available and switches to NV21ToARGBRow_NEON
// only when width is a multiple of 8.
void NV21ToARGBRow_Any_NEON(const uint8* src_y,
const uint8* src_vu,
uint8* dst_argb,
struct YuvConstants* yuvconstants,
int width);
void NV12ToRGB565Row_Any_NEON(const uint8* src_y,
const uint8* src_uv,
uint8* dst_argb,
......
......@@ -24,11 +24,12 @@ extern "C" {
#define LIBYUV_DISABLE_X86
#endif
// Visual C 2012 required for AVX2.
#if defined(_M_IX86) && !defined(__clang__) && \
defined(_MSC_VER) && _MSC_VER >= 1700
#define VISUALC_HAS_AVX2 1
#endif // VisualStudio >= 2012
// GCC >= 4.7.0 required for AVX2.
#if defined(__GNUC__) && (defined(__x86_64__) || defined(__i386__))
#if (__GNUC__ > 4) || (__GNUC__ == 4 && (__GNUC_MINOR__ >= 7))
#define GCC_HAS_AVX2 1
#endif // GNUC >= 4.7
#endif // __GNUC__
// clang >= 3.4.0 required for AVX2.
#if defined(__clang__) && (defined(__x86_64__) || defined(__i386__))
......@@ -37,6 +38,12 @@ extern "C" {
#endif // clang >= 3.4
#endif // __clang__
// Visual C 2012 required for AVX2.
#if defined(_M_IX86) && !defined(__clang__) && \
defined(_MSC_VER) && _MSC_VER >= 1700
#define VISUALC_HAS_AVX2 1
#endif // VisualStudio >= 2012
// The following are available on all x86 platforms:
#if !defined(LIBYUV_DISABLE_X86) && \
(defined(_M_IX86) || defined(__x86_64__) || defined(__i386__))
......@@ -56,10 +63,17 @@ extern "C" {
#define HAS_SCALEADDROW_SSE2
#endif
// The following are available on all x86 platforms, but
// require VS2012, clang 3.4 or gcc 4.7.
// The code supports NaCL but requires a new compiler and validator.
#if !defined(LIBYUV_DISABLE_X86) && (defined(VISUALC_HAS_AVX2) || \
defined(CLANG_HAS_AVX2) || defined(GCC_HAS_AVX2))
#define HAS_SCALEADDROW_AVX2
#endif
// The following are available for Visual C and clangcl 32 bit:
#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && \
(defined(VISUALC_HAS_AVX2) || defined(CLANG_HAS_AVX2))
#define HAS_SCALEADDROW_AVX2
#define HAS_SCALEROWDOWN2_AVX2
#define HAS_SCALEROWDOWN4_AVX2
#endif
......
......@@ -1093,11 +1093,11 @@ int NV21ToARGB(const uint8* src_y, int src_stride_y,
uint8* dst_argb, int dst_stride_argb,
int width, int height) {
int y;
void (*NV12ToARGBRow)(const uint8* y_buf,
void (*NV21ToARGBRow)(const uint8* y_buf,
const uint8* uv_buf,
uint8* rgb_buf,
struct YuvConstants* yuvconstants,
int width) = NV12ToARGBRow_C;
int width) = NV21ToARGBRow_C;
if (!src_y || !src_uv || !dst_argb ||
width <= 0 || height == 0) {
return -1;
......@@ -1108,33 +1108,33 @@ int NV21ToARGB(const uint8* src_y, int src_stride_y,
dst_argb = dst_argb + (height - 1) * dst_stride_argb;
dst_stride_argb = -dst_stride_argb;
}
#if defined(HAS_NV12TOARGBROW_SSSE3)
#if defined(HAS_NV21TOARGBROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
NV12ToARGBRow = NV12ToARGBRow_Any_SSSE3;
NV21ToARGBRow = NV21ToARGBRow_Any_SSSE3;
if (IS_ALIGNED(width, 8)) {
NV12ToARGBRow = NV12ToARGBRow_SSSE3;
NV21ToARGBRow = NV21ToARGBRow_SSSE3;
}
}
#endif
#if defined(HAS_NV12TOARGBROW_AVX2)
#if defined(HAS_NV21TOARGBROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
NV12ToARGBRow = NV12ToARGBRow_Any_AVX2;
NV21ToARGBRow = NV21ToARGBRow_Any_AVX2;
if (IS_ALIGNED(width, 16)) {
NV12ToARGBRow = NV12ToARGBRow_AVX2;
NV21ToARGBRow = NV21ToARGBRow_AVX2;
}
}
#endif
#if defined(HAS_NV12TOARGBROW_NEON)
#if defined(HAS_NV21TOARGBROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
NV12ToARGBRow = NV12ToARGBRow_Any_NEON;
NV21ToARGBRow = NV21ToARGBRow_Any_NEON;
if (IS_ALIGNED(width, 8)) {
NV12ToARGBRow = NV12ToARGBRow_NEON;
NV21ToARGBRow = NV21ToARGBRow_NEON;
}
}
#endif
for (y = 0; y < height; ++y) {
NV12ToARGBRow(src_y, src_uv, dst_argb, &kYvuConstants, width);
NV21ToARGBRow(src_y, src_uv, dst_argb, &kYuvConstants, width);
dst_argb += dst_stride_argb;
src_y += src_stride_y;
if (y & 1) {
......
......@@ -1039,64 +1039,6 @@ int NV12ToRGB565(const uint8* src_y, int src_stride_y,
return 0;
}
// Convert NV21 to RGB565.
// src_y is the luma plane and src_vu the interleaved chroma plane; the chroma
// row pointer advances once for every two luma rows.  A negative height
// writes the output bottom-up.  Returns 0 on success, or -1 when a pointer is
// null, width <= 0 or height == 0.
LIBYUV_API
int NV21ToRGB565(const uint8* src_y, int src_stride_y,
                 const uint8* src_vu, int src_stride_vu,
                 uint8* dst_rgb565, int dst_stride_rgb565,
                 int width, int height) {
  int row = 0;
  // Row converter: starts as the portable C version and is replaced below
  // when a SIMD variant is compiled in and the CPU supports it.
  void (*ConvertRow)(const uint8* y_buf,
                     const uint8* vu_buf,
                     uint8* rgb_buf,
                     struct YuvConstants* yuvconstants,
                     int width) = NV12ToRGB565Row_C;
  if (!(src_y && src_vu && dst_rgb565) || width <= 0 || height == 0) {
    return -1;
  }
  if (height < 0) {
    // Negative height means invert the image: begin at the last output row
    // and step backwards.
    height = -height;
    dst_rgb565 += (height - 1) * dst_stride_rgb565;
    dst_stride_rgb565 = -dst_stride_rgb565;
  }
#if defined(HAS_NV12TORGB565ROW_SSSE3)
  if (TestCpuFlag(kCpuHasSSSE3)) {
    ConvertRow = IS_ALIGNED(width, 8) ? NV12ToRGB565Row_SSSE3
                                      : NV12ToRGB565Row_Any_SSSE3;
  }
#endif
#if defined(HAS_NV12TORGB565ROW_AVX2)
  if (TestCpuFlag(kCpuHasAVX2)) {
    ConvertRow = IS_ALIGNED(width, 16) ? NV12ToRGB565Row_AVX2
                                       : NV12ToRGB565Row_Any_AVX2;
  }
#endif
#if defined(HAS_NV12TORGB565ROW_NEON)
  if (TestCpuFlag(kCpuHasNEON)) {
    ConvertRow = IS_ALIGNED(width, 8) ? NV12ToRGB565Row_NEON
                                      : NV12ToRGB565Row_Any_NEON;
  }
#endif
  while (row < height) {
    // The NV12 row function is reused with the YVU constants so that the
    // swapped chroma order of NV21 is accounted for by the matrix.
    ConvertRow(src_y, src_vu, dst_rgb565, &kYvuConstants, width);
    dst_rgb565 += dst_stride_rgb565;
    src_y += src_stride_y;
    if (row & 1) {
      // One chroma row serves two luma rows.
      src_vu += src_stride_vu;
    }
    ++row;
  }
  return 0;
}
LIBYUV_API
void SetPlane(uint8* dst_y, int dst_stride_y,
int width, int height,
......
......@@ -280,6 +280,15 @@ ANY21C(NV12ToARGBRow_Any_AVX2, NV12ToARGBRow_AVX2, 1, 1, 2, 4, 15)
#ifdef HAS_NV12TOARGBROW_NEON
ANY21C(NV12ToARGBRow_Any_NEON, NV12ToARGBRow_NEON, 1, 1, 2, 4, 7)
#endif
// "Any width" wrappers for the NV21 to ARGB SIMD row functions.
// NOTE(review): the last ANY21C argument is presumably the width mask of the
// wrapped kernel (7 for 8-pixel kernels, 15 for 16-pixel kernels) — confirm
// against the ANY21C definition.
#ifdef HAS_NV21TOARGBROW_SSSE3
ANY21C(NV21ToARGBRow_Any_SSSE3, NV21ToARGBRow_SSSE3, 1, 1, 2, 4, 7)
#endif
#ifdef HAS_NV21TOARGBROW_AVX2
ANY21C(NV21ToARGBRow_Any_AVX2, NV21ToARGBRow_AVX2, 1, 1, 2, 4, 15)
#endif
#ifdef HAS_NV21TOARGBROW_NEON
// Must wrap the NV21 kernel.  NV21ToARGB now passes the regular constants, so
// wrapping the NV12 kernel would read the chroma bytes in UV order and swap
// U and V for every width that is not a multiple of 8.
ANY21C(NV21ToARGBRow_Any_NEON, NV21ToARGBRow_NEON, 1, 1, 2, 4, 7)
#endif
#ifdef HAS_NV12TORGB565ROW_SSSE3
ANY21C(NV12ToRGB565Row_Any_SSSE3, NV12ToRGB565Row_SSSE3, 1, 1, 2, 2, 7)
#endif
......
......@@ -1663,6 +1663,30 @@ void NV12ToARGBRow_C(const uint8* src_y,
}
}
// Convert one row of NV21 to ARGB (portable C version).
// src_vu holds interleaved chroma bytes, V first and U second; each VU pair
// is shared by two horizontally adjacent pixels.  Every output pixel takes
// 4 bytes: three colour bytes written by YuvPixel followed by alpha 255.
void NV21ToARGBRow_C(const uint8* src_y,
                     const uint8* src_vu,
                     uint8* rgb_buf,
                     struct YuvConstants* yuvconstants,
                     int width) {
  int remaining = width;
  // Convert two pixels at a time while a full pair is left.
  while (remaining > 1) {
    YuvPixel(src_y[0], src_vu[1], src_vu[0],
             &rgb_buf[0], &rgb_buf[1], &rgb_buf[2], yuvconstants);
    rgb_buf[3] = 255;
    YuvPixel(src_y[1], src_vu[1], src_vu[0],
             &rgb_buf[4], &rgb_buf[5], &rgb_buf[6], yuvconstants);
    rgb_buf[7] = 255;
    rgb_buf += 8;  // 2 pixels * 4 bytes.
    src_vu += 2;   // 1 VU pair.
    src_y += 2;    // 2 luma samples.
    remaining -= 2;
  }
  // Odd width: one trailing pixel uses the next VU pair on its own.
  if (width & 1) {
    YuvPixel(src_y[0], src_vu[1], src_vu[0],
             &rgb_buf[0], &rgb_buf[1], &rgb_buf[2], yuvconstants);
    rgb_buf[3] = 255;
  }
}
void NV12ToRGB565Row_C(const uint8* src_y,
const uint8* src_uv,
uint8* dst_rgb565,
......
......@@ -164,6 +164,12 @@ static const lvec8 kShuffleUYVYUV = {
0, 2, 0, 2, 4, 6, 4, 6, 8, 10, 8, 10, 12, 14, 12, 14,
0, 2, 0, 2, 4, 6, 4, 6, 8, 10, 8, 10, 12, 14, 12, 14
};
// NV21 shuffle: 8 VU pairs to 16 UV pairs.
// Byte-shuffle indices used by READNV21 / READNV21_AVX2.  Each source pair
// (bytes 2k, 2k+1) is emitted twice with its two bytes swapped, so V,U input
// becomes U,V,U,V output.  The same 16 indices are listed twice.
static const lvec8 kShuffleNV21 = {
1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6,
1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6,
};
#endif // HAS_RGB24TOARGBROW_SSSE3
#ifdef HAS_J400TOARGBROW_SSE2
......@@ -1398,6 +1404,15 @@ void RGBAToUVRow_SSSE3(const uint8* src_rgba0, int src_stride_rgba,
"punpcklbw %%xmm4,%%xmm4 \n" \
"lea " MEMLEA(0x8, [y_buf]) ",%[y_buf] \n"
// Read 4 VU pairs from NV21 and upsample to 8 UV pairs.
// Loads 8 bytes from vu_buf into xmm0 and shuffles them with kShuffleNV21
// (swap V/U, duplicate each pair), then loads 8 Y bytes from y_buf into xmm4
// and duplicates each byte with punpcklbw.  Both pointers advance by 8.
#define READNV21 \
"movq " MEMACCESS([vu_buf]) ",%%xmm0 \n" \
"lea " MEMLEA(0x8, [vu_buf]) ",%[vu_buf] \n" \
"pshufb %[kShuffleNV21], %%xmm0 \n" \
"movq " MEMACCESS([y_buf]) ",%%xmm4 \n" \
"punpcklbw %%xmm4,%%xmm4 \n" \
"lea " MEMLEA(0x8, [y_buf]) ",%[y_buf] \n"
// Read 4 YUY2 with 8 Y and update 4 UV to 8 UV.
#define READYUY2 \
"movdqu " MEMACCESS([yuy2_buf]) ",%%xmm4 \n" \
......@@ -1769,6 +1784,31 @@ void OMITFP NV12ToARGBRow_SSSE3(const uint8* y_buf,
);
}
// Convert one row of NV21 to ARGB, 8 pixels per loop iteration (SSSE3).
// Each iteration runs READNV21 (load Y and VU, reorder VU to UV), then the
// YUVTORGB and STOREARGB macros defined elsewhere in this file.  The loop
// subtracts 8 from width and repeats while the result is positive, so any
// positive width is rounded up to a whole number of 8-pixel steps.
void OMITFP NV21ToARGBRow_SSSE3(const uint8* y_buf,
const uint8* vu_buf,
uint8* dst_argb,
struct YuvConstants* yuvconstants,
int width) {
asm volatile (
"pcmpeqb %%xmm5,%%xmm5 \n"  // xmm5 = all ones.
LABELALIGN
"1: \n"
READNV21
YUVTORGB(yuvconstants)
STOREARGB
"sub $0x8,%[width] \n"
"jg 1b \n"
: [y_buf]"+r"(y_buf), // %[y_buf]
[vu_buf]"+r"(vu_buf), // %[vu_buf]
[dst_argb]"+r"(dst_argb), // %[dst_argb]
[width]"+rm"(width) // %[width]
: [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
[kShuffleNV21]"m"(kShuffleNV21)
// Does not use r14.
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
);
}
void OMITFP YUY2ToARGBRow_SSSE3(const uint8* yuy2_buf,
uint8* dst_argb,
struct YuvConstants* yuvconstants,
......@@ -1940,6 +1980,17 @@ void OMITFP I422ToRGBARow_SSSE3(const uint8* y_buf,
"vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \
"lea " MEMLEA(0x10, [y_buf]) ",%[y_buf] \n"
// Read 8 VU pairs from NV21 and upsample to 16 UV pairs.
// Loads 16 bytes from vu_buf into xmm0, uses vpermq $0xd8 to place the second
// 8 bytes in the upper 128-bit lane, then shuffles each lane with
// kShuffleNV21 (swap V/U, duplicate each pair).  16 Y bytes are loaded into
// xmm4, split across lanes the same way and duplicated with vpunpcklbw.
// Both pointers advance by 16.
#define READNV21_AVX2 \
"vmovdqu " MEMACCESS([vu_buf]) ",%%xmm0 \n" \
"lea " MEMLEA(0x10, [vu_buf]) ",%[vu_buf] \n" \
"vpermq $0xd8,%%ymm0,%%ymm0 \n" \
"vpshufb %[kShuffleNV21], %%ymm0, %%ymm0 \n" \
"vmovdqu " MEMACCESS([y_buf]) ",%%xmm4 \n" \
"vpermq $0xd8,%%ymm4,%%ymm4 \n" \
"vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \
"lea " MEMLEA(0x10, [y_buf]) ",%[y_buf] \n"
// Read 8 YUY2 with 16 Y and upsample 8 UV to 16 UV.
#define READYUY2_AVX2 \
"vmovdqu " MEMACCESS([yuy2_buf]) ",%%ymm4 \n" \
......@@ -2251,8 +2302,37 @@ void OMITFP NV12ToARGBRow_AVX2(const uint8* y_buf,
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
);
}
#endif // HAS_YUY2TOARGBROW_AVX2
#endif // HAS_NV12TOARGBROW_AVX2
#if defined(HAS_NV21TOARGBROW_AVX2)
// Convert one row of NV21 to ARGB (AVX2).
// 16 pixels per loop iteration.
// 8 VU values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
// The loop subtracts 16 from width and repeats while the result is positive;
// vzeroupper is issued once after the loop.
void OMITFP NV21ToARGBRow_AVX2(const uint8* y_buf,
const uint8* vu_buf,
uint8* dst_argb,
struct YuvConstants* yuvconstants,
int width) {
asm volatile (
"vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"  // ymm5 = all ones.
LABELALIGN
"1: \n"
READNV21_AVX2
YUVTORGB_AVX2(yuvconstants)
STOREARGB_AVX2
"sub $0x10,%[width] \n"
"jg 1b \n"
"vzeroupper \n"
: [y_buf]"+r"(y_buf), // %[y_buf]
[vu_buf]"+r"(vu_buf), // %[vu_buf]
[dst_argb]"+r"(dst_argb), // %[dst_argb]
[width]"+rm"(width) // %[width]
: [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
[kShuffleNV21]"m"(kShuffleNV21)
// Does not use r14.
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
);
}
#endif // HAS_NV21TOARGBROW_AVX2
#if defined(HAS_YUY2TOARGBROW_AVX2)
// 16 pixels.
......
......@@ -579,6 +579,34 @@ void NV12ToARGBRow_NEON(const uint8* src_y,
);
}
// Convert one row of NV21 to ARGB, 8 pixels per loop iteration (32-bit NEON).
// READNV21 and YUVTORGB are macros defined elsewhere in this file; the
// conversion constants are passed in from the yuvconstants argument.
// The loop subtracts 8 from width and repeats while the result is positive.
void NV21ToARGBRow_NEON(const uint8* src_y,
const uint8* src_vu,
uint8* dst_argb,
struct YuvConstants* yuvconstants,
int width) {
asm volatile (
YUVTORGB_SETUP
"1: \n"
READNV21
YUVTORGB
"subs %3, %3, #8 \n"
"vmov.u8 d23, #255 \n"  // Fourth stored byte of every pixel is 255.
MEMACCESS(2)
"vst4.8 {d20, d21, d22, d23}, [%2]! \n"  // Store 8 interleaved 4-byte pixels.
"bgt 1b \n"
: "+r"(src_y), // %0
"+r"(src_vu), // %1
"+r"(dst_argb), // %2
"+r"(width) // %3
: [kUVToRB]"r"(&yuvconstants->kUVToRB),
[kUVToG]"r"(&yuvconstants->kUVToG),
[kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
[kYToRgb]"r"(&yuvconstants->kYToRgb)
: "cc", "memory", "q0", "q1", "q2", "q3", "q4",
"q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
);
}
void NV12ToRGB565Row_NEON(const uint8* src_y,
const uint8* src_uv,
uint8* dst_rgb565,
......
......@@ -576,6 +576,34 @@ void NV12ToARGBRow_NEON(const uint8* src_y,
}
#endif // HAS_NV12TOARGBROW_NEON
#ifdef HAS_NV21TOARGBROW_NEON
// Convert one row of NV21 to ARGB, 8 pixels per loop iteration (AArch64 NEON).
// READNV21 and YUVTORGB are macros defined elsewhere in this file.  The loop
// subtracts 8 from width and repeats while the result is positive.
// Guarded by HAS_NV21TOARGBROW_NEON, the same macro the NV21ToARGB dispatcher
// and the Any wrapper test before referencing this function.
// NOTE(review): the bias and Y constants are taken from the global
// kYuvConstants rather than from the yuvconstants argument, which is
// currently unused here — confirm this is intended.
void NV21ToARGBRow_NEON(const uint8* src_y,
                        const uint8* src_vu,
                        uint8* dst_argb,
                        struct YuvConstants* yuvconstants,
                        int width) {
  asm volatile (
    YUVTORGB_SETUP
    "1: \n"
    READNV21
    YUVTORGB(v22, v21, v20)
    "subs %w3, %w3, #8 \n"
    "movi v23.8b, #255 \n"  // Fourth stored byte of every pixel is 255.
    MEMACCESS(2)
    "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%2], #32 \n"
    "b.gt 1b \n"
    : "+r"(src_y),     // %0
      "+r"(src_vu),    // %1
      "+r"(dst_argb),  // %2
      "+r"(width)      // %3
    : [kUVBiasBGR]"r"(&kYuvConstants.kUVBiasBGR),
      [kYToRgb]"r"(&kYuvConstants.kYToRgb)
    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
      "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
  );
}
#endif  // HAS_NV21TOARGBROW_NEON
#ifdef HAS_NV12TORGB565ROW_NEON
void NV12ToRGB565Row_NEON(const uint8* src_y,
const uint8* src_uv,
......
......@@ -319,6 +319,12 @@ static const lvec8 kShuffleUYVYUV = {
0, 2, 0, 2, 4, 6, 4, 6, 8, 10, 8, 10, 12, 14, 12, 14
};
// NV21 shuffle: 8 VU pairs to 16 UV pairs.
// Byte-shuffle indices used by READNV21 / READNV21_AVX2.  Each source pair
// (bytes 2k, 2k+1) is emitted twice with its two bytes swapped, so V,U input
// becomes U,V,U,V output.  The same 16 indices are listed twice.
static const lvec8 kShuffleNV21 = {
1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6,
1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6,
};
// Duplicates gray value 3 times and fills in alpha opaque.
__declspec(naked)
void J400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
......@@ -1992,6 +1998,18 @@ void RGBAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
__asm lea eax, [eax + 16] \
}
// Read 8 VU pairs from NV21 (esi), upsample to 16 UV pairs in ymm0, and load
// 16 Y bytes (eax) duplicated into ymm4.  Both pointers advance by 16.
#define READNV21_AVX2 __asm { \
__asm vmovdqu xmm0, [esi] /* VU */ \
__asm lea esi, [esi + 16] \
__asm vpermq ymm0, ymm0, 0xd8 \
__asm vpshufb ymm0, ymm0, ymmword ptr kShuffleNV21 \
__asm vmovdqu xmm4, [eax] /* Y */ \
__asm vpermq ymm4, ymm4, 0xd8 \
__asm vpunpcklbw ymm4, ymm4, ymm4 \
__asm lea eax, [eax + 16] \
}
// Read 8 YUY2 with 16 Y and upsample 8 UV to 16 UV.
#define READYUY2_AVX2 __asm { \
__asm vmovdqu ymm4, [eax] /* YUY2 */ \
......@@ -2365,6 +2383,41 @@ void NV12ToARGBRow_AVX2(const uint8* y_buf,
}
#endif // HAS_NV12TOARGBROW_AVX2
#ifdef HAS_NV21TOARGBROW_AVX2
// Convert one row of NV21 to ARGB (AVX2, Visual C inline assembly).
// 16 pixels per loop iteration.
// 8 VU values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
// Naked function: arguments are read from the stack after esi and ebx have
// been pushed (hence the extra 8 in every offset), and both are restored
// before returning.
__declspec(naked)
void NV21ToARGBRow_AVX2(const uint8* y_buf,
const uint8* vu_buf,
uint8* dst_argb,
struct YuvConstants* yuvconstants,
int width) {
__asm {
push esi
push ebx
mov eax, [esp + 8 + 4] // Y
mov esi, [esp + 8 + 8] // VU
mov edx, [esp + 8 + 12] // argb
mov ebx, [esp + 8 + 16] // yuvconstants
mov ecx, [esp + 8 + 20] // width
vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha
convertloop:
READNV21_AVX2
YUVTORGB_AVX2(ebx)
STOREARGB_AVX2
sub ecx, 16 // 16 pixels consumed; loop while the remaining count is positive
jg convertloop
pop ebx
pop esi
vzeroupper
ret
}
}
#endif // HAS_NV21TOARGBROW_AVX2
// 16 pixels.
// 8 YUY2 values with 16 Y and 8 UV producing 16 ARGB (64 bytes).
__declspec(naked)
......@@ -2608,6 +2661,16 @@ void I422ToABGRRow_AVX2(const uint8* y_buf,
__asm lea eax, [eax + 8] \
}
// Read 4 VU pairs from NV21 (esi), upsample to 8 UV pairs in xmm0, and load
// 8 Y bytes (eax) duplicated into xmm4.  Both pointers advance by 8.
#define READNV21 __asm { \
__asm movq xmm0, qword ptr [esi] /* VU */ \
__asm lea esi, [esi + 8] \
__asm pshufb xmm0, xmmword ptr kShuffleNV21 \
__asm movq xmm4, qword ptr [eax] \
__asm punpcklbw xmm4, xmm4 \
__asm lea eax, [eax + 8] \
}
// Read 4 YUY2 with 8 Y and upsample 4 UV to 8 UV.
#define READYUY2 __asm { \
__asm movdqu xmm4, [eax] /* YUY2 */ \
......@@ -3152,6 +3215,38 @@ void NV12ToARGBRow_SSSE3(const uint8* y_buf,
}
}
// Convert one row of NV21 to ARGB (SSSE3, Visual C inline assembly).
// 8 pixels per loop iteration.
// 4 VU values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
// Naked function: arguments are read from the stack after esi and ebx have
// been pushed (hence the extra 8 in every offset), and both are restored
// before returning.
__declspec(naked)
void NV21ToARGBRow_SSSE3(const uint8* y_buf,
const uint8* vu_buf,
uint8* dst_argb,
struct YuvConstants* yuvconstants,
int width) {
__asm {
push esi
push ebx
mov eax, [esp + 8 + 4] // Y
mov esi, [esp + 8 + 8] // VU
mov edx, [esp + 8 + 12] // argb
mov ebx, [esp + 8 + 16] // yuvconstants
mov ecx, [esp + 8 + 20] // width
pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
convertloop:
READNV21
YUVTORGB(ebx)
STOREARGB
sub ecx, 8 // 8 pixels consumed; loop while the remaining count is positive
jg convertloop
pop ebx
pop esi
ret
}
}
// 8 pixels.
// 4 YUY2 values with 8 Y and 4 UV producing 8 ARGB (32 bytes).
__declspec(naked)
......
......@@ -9,6 +9,7 @@
*/
#include "libyuv/row.h"
#include "libyuv/scale_row.h"
#ifdef __cplusplus
namespace libyuv {
......@@ -608,12 +609,12 @@ void ScaleAddRow_SSE2(const uint8* src_ptr, uint16* dst_ptr, int src_width) {
// Reads 32 bytes and accumulates to 32 shorts at a time.
void ScaleAddRow_AVX2(const uint8* src_ptr, uint16* dst_ptr, int src_width) {
asm volatile (
"vpxor %%xmm5,%%xmm5 \n"
"vpxor %%ymm5,%%ymm5,%%ymm5 \n"
LABELALIGN
"1: \n"
"vmovdqu " MEMACCESS(0) ",%%ymm3 \n"
"lea " MEMLEA(0x20,0) ",%0 \n" // src_ptr += 16
"lea " MEMLEA(0x20,0) ",%0 \n" // src_ptr += 32
"vpermq $0xd8,%%ymm3,%%ymm3 \n"
"vpunpcklbw %%ymm5,%%ymm3,%%ymm2 \n"
"vpunpckhbw %%ymm5,%%ymm3,%%ymm3 \n"
......
......@@ -671,7 +671,6 @@ TEST_F(libyuvTest, FMT_PLANAR##To##FMT_B##N) { \
TESTBIPLANARTOB(NV12, 2, 2, ARGB, 4, 2)
TESTBIPLANARTOB(NV21, 2, 2, ARGB, 4, 2)
TESTBIPLANARTOB(NV12, 2, 2, RGB565, 2, 9)
TESTBIPLANARTOB(NV21, 2, 2, RGB565, 2, 9)
#define TESTATOPLANARI(FMT_A, BPP_A, YALIGN, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \
W1280, DIFF, N, NEG, OFF) \
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment