Reimplement NV21ToARGB to allow different color matrix.

Low level for NV21ToARGB written to accept yuv matrix used by other YUV to ARGB functions. Previously NV21 was implemented for Windows using NV12 with a different matrix that swapped U and V. But the Arm version of the low level does not allow the matrix U and V contributions to be swapped. Using a new low level function that reads NV21 and uses the same yuvconstants as other YUV conversion functions allows an Arm port of this function. TBR=harryjin@google.com BUG=libyuv:500 Review URL: https://codereview.chromium.org/1388273002 .

Reimplement NV21ToARGB to allow different color matrix.
Low level for NV21ToARGB written to accept yuv matrix used by other YUV to ARGB functions. Previously NV21 was implemented for Windows using NV12 with a different matrix that swapped U and V. But the Arm version of the low level does not allow the matrix U and V contributions to be swapped. Using a new low level function that reads NV21 and uses the same yuvconstants as other YUV conversion functions allows an Arm port of this function. TBR=harryjin@google.com BUG=libyuv:500 Review URL: https://codereview.chromium.org/1388273002 .
914a9856 · Frank Barchard · 68fa59c8 · 914a9856 · 914a9856 · 914a9856
Commit 914a9856 authored Oct 07, 2015 by Frank Barchard
13 changed files
--- a/include/libyuv/planar_functions.h
+++ b/include/libyuv/planar_functions.h
@@ -145,13 +145,6 @@ int NV12ToRGB565(const uint8* src_y, int src_stride_y,
                 uint8* dst_rgb565, int dst_stride_rgb565,
                 int width, int height);
-// Convert NV21 to RGB565.
-LIBYUV_API
-int NV21ToRGB565(const uint8* src_y, int src_stride_y,
-                 const uint8* src_uv, int src_stride_uv,
-                 uint8* dst_rgb565, int dst_stride_rgb565,
-                 int width, int height);
 // I422ToARGB is in convert_argb.h
 // Convert I422 to BGRA.
 LIBYUV_API

--- a/include/libyuv/row.h
+++ b/include/libyuv/row.h
@@ -126,6 +126,7 @@ extern "C" {
 #define HAS_MIRRORUVROW_SSSE3
 #define HAS_NV12TOARGBROW_SSSE3
 #define HAS_NV12TORGB565ROW_SSSE3
+#define HAS_NV21TOARGBROW_SSSE3
 #define HAS_RAWTOARGBROW_SSSE3
 #define HAS_RAWTOYROW_SSSE3
 #define HAS_RGB24TOARGBROW_SSSE3
@@ -249,6 +250,7 @@ extern "C" {
 #define HAS_YUY2TOUVROW_AVX2
 #define HAS_YUY2TOYROW_AVX2
 #define HAS_NV12TOARGBROW_AVX2
+#define HAS_NV21TOARGBROW_AVX2
 #define HAS_I422ALPHATOARGBROW_AVX2
 #define HAS_I422ALPHATOABGRROW_AVX2
@@ -312,6 +314,7 @@ extern "C" {
 #define HAS_MIRRORUVROW_NEON
 #define HAS_NV12TOARGBROW_NEON
 #define HAS_NV12TORGB565ROW_NEON
+#define HAS_NV21TOARGBROW_NEON
 #define HAS_RAWTOARGBROW_NEON
 #define HAS_RAWTOUVROW_NEON
 #define HAS_RAWTOYROW_NEON
@@ -632,6 +635,11 @@ void NV12ToRGB565Row_NEON(const uint8* src_y,
                          uint8* dst_rgb565,
                          struct YuvConstants* yuvconstants,
                          int width);
+void NV21ToARGBRow_NEON(const uint8* src_y,
+                        const uint8* src_vu,
+                        uint8* dst_argb,
+                        struct YuvConstants* yuvconstants,
+                        int width);
 void YUY2ToARGBRow_NEON(const uint8* src_yuy2,
                        uint8* dst_argb,
                        struct YuvConstants* yuvconstants,
@@ -1075,6 +1083,11 @@ void NV12ToRGB565Row_C(const uint8* src_y,
                       uint8* dst_argb,
                       struct YuvConstants* yuvconstants,
                       int width);
+void NV21ToARGBRow_C(const uint8* src_y,
+                     const uint8* src_uv,
+                     uint8* dst_argb,
+                     struct YuvConstants* yuvconstants,
+                     int width);
 void YUY2ToARGBRow_C(const uint8* src_yuy2,
                     uint8* dst_argb,
                     struct YuvConstants* yuvconstants,
@@ -1293,6 +1306,16 @@ void NV12ToRGB565Row_AVX2(const uint8* src_y,
                          uint8* dst_argb,
                          struct YuvConstants* yuvconstants,
                          int width);
+void NV21ToARGBRow_SSSE3(const uint8* src_y,
+                         const uint8* src_uv,
+                         uint8* dst_argb,
+                         struct YuvConstants* yuvconstants,
+                         int width);
+void NV21ToARGBRow_AVX2(const uint8* src_y,
+                        const uint8* src_uv,
+                        uint8* dst_argb,
+                        struct YuvConstants* yuvconstants,
+                        int width);
 void YUY2ToARGBRow_SSSE3(const uint8* src_yuy2,
                         uint8* dst_argb,
                         struct YuvConstants* yuvconstants,
@@ -1491,6 +1514,16 @@ void NV12ToARGBRow_Any_AVX2(const uint8* src_y,
                            uint8* dst_argb,
                            struct YuvConstants* yuvconstants,
                            int width);
+void NV21ToARGBRow_Any_SSSE3(const uint8* src_y,
+                             const uint8* src_vu,
+                             uint8* dst_argb,
+                             struct YuvConstants* yuvconstants,
+                             int width);
+void NV21ToARGBRow_Any_AVX2(const uint8* src_y,
+                            const uint8* src_vu,
+                            uint8* dst_argb,
+                            struct YuvConstants* yuvconstants,
+                            int width);
 void NV12ToRGB565Row_Any_SSSE3(const uint8* src_y,
                               const uint8* src_uv,
                               uint8* dst_argb,
@@ -1756,6 +1789,11 @@ void NV12ToARGBRow_Any_NEON(const uint8* src_y,
                            uint8* dst_argb,
                            struct YuvConstants* yuvconstants,
                            int width);
+void NV21ToARGBRow_Any_NEON(const uint8* src_y,
+                            const uint8* src_vu,
+                            uint8* dst_argb,
+                            struct YuvConstants* yuvconstants,
+                            int width);
 void NV12ToRGB565Row_Any_NEON(const uint8* src_y,
                              const uint8* src_uv,
                              uint8* dst_argb,

--- a/include/libyuv/scale_row.h
+++ b/include/libyuv/scale_row.h
@@ -24,11 +24,12 @@ extern "C" {
 #define LIBYUV_DISABLE_X86
 #endif
-// Visual C 2012 required for AVX2.
+// GCC >= 4.7.0 required for AVX2.
-#if defined(_M_IX86) && !defined(__clang__) && \
+#if defined(__GNUC__) && (defined(__x86_64__) || defined(__i386__))
-    defined(_MSC_VER) && _MSC_VER >= 1700
+#if (__GNUC__ > 4) || (__GNUC__ == 4 && (__GNUC_MINOR__ >= 7))
-#define VISUALC_HAS_AVX2 1
+#define GCC_HAS_AVX2 1
-#endif  // VisualStudio >= 2012
+#endif  // GNUC >= 4.7
+#endif  // __GNUC__
 // clang >= 3.4.0 required for AVX2.
 #if defined(__clang__) && (defined(__x86_64__) || defined(__i386__))
@@ -37,6 +38,12 @@ extern "C" {
 #endif  // clang >= 3.4
 #endif  // __clang__
+// Visual C 2012 required for AVX2.
+#if defined(_M_IX86) && !defined(__clang__) && \
+    defined(_MSC_VER) && _MSC_VER >= 1700
+#define VISUALC_HAS_AVX2 1
+#endif  // VisualStudio >= 2012
 // The following are available on all x86 platforms:
 #if !defined(LIBYUV_DISABLE_X86) && \
    (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__))
@@ -56,10 +63,17 @@ extern "C" {
 #define HAS_SCALEADDROW_SSE2
 #endif
+// The following are available on all x86 platforms, but
+// require VS2012, clang 3.4 or gcc 4.7.
+// The code supports NaCL but requires a new compiler and validator.
+#if !defined(LIBYUV_DISABLE_X86) && (defined(VISUALC_HAS_AVX2) || \
+    defined(CLANG_HAS_AVX2) || defined(GCC_HAS_AVX2))
+#define HAS_SCALEADDROW_AVX2
+#endif
 // The following are available for Visual C and clangcl 32 bit:
 #if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && \
    (defined(VISUALC_HAS_AVX2) || defined(CLANG_HAS_AVX2))
-#define HAS_SCALEADDROW_AVX2
 #define HAS_SCALEROWDOWN2_AVX2
 #define HAS_SCALEROWDOWN4_AVX2
 #endif

--- a/source/convert_argb.cc
+++ b/source/convert_argb.cc
@@ -1093,11 +1093,11 @@ int NV21ToARGB(const uint8* src_y, int src_stride_y,
               uint8* dst_argb, int dst_stride_argb,
               int width, int height) {
  int y;
-  void (*NV12ToARGBRow)(const uint8* y_buf,
+  void (*NV21ToARGBRow)(const uint8* y_buf,
                        const uint8* uv_buf,
                        uint8* rgb_buf,
                        struct YuvConstants* yuvconstants,
-                        int width) = NV12ToARGBRow_C;
+                        int width) = NV21ToARGBRow_C;
  if (!src_y || !src_uv || !dst_argb ||
      width <= 0 || height == 0) {
    return -1;
@@ -1108,33 +1108,33 @@ int NV21ToARGB(const uint8* src_y, int src_stride_y,
    dst_argb = dst_argb + (height - 1) * dst_stride_argb;
    dst_stride_argb = -dst_stride_argb;
  }
-#if defined(HAS_NV12TOARGBROW_SSSE3)
+#if defined(HAS_NV21TOARGBROW_SSSE3)
  if (TestCpuFlag(kCpuHasSSSE3)) {
-    NV12ToARGBRow = NV12ToARGBRow_Any_SSSE3;
+    NV21ToARGBRow = NV21ToARGBRow_Any_SSSE3;
    if (IS_ALIGNED(width, 8)) {
-      NV12ToARGBRow = NV12ToARGBRow_SSSE3;
+      NV21ToARGBRow = NV21ToARGBRow_SSSE3;
    }
  }
 #endif
-#if defined(HAS_NV12TOARGBROW_AVX2)
+#if defined(HAS_NV21TOARGBROW_AVX2)
  if (TestCpuFlag(kCpuHasAVX2)) {
-    NV12ToARGBRow = NV12ToARGBRow_Any_AVX2;
+    NV21ToARGBRow = NV21ToARGBRow_Any_AVX2;
    if (IS_ALIGNED(width, 16)) {
-      NV12ToARGBRow = NV12ToARGBRow_AVX2;
+      NV21ToARGBRow = NV21ToARGBRow_AVX2;
    }
  }
 #endif
-#if defined(HAS_NV12TOARGBROW_NEON)
+#if defined(HAS_NV21TOARGBROW_NEON)
  if (TestCpuFlag(kCpuHasNEON)) {
-    NV12ToARGBRow = NV12ToARGBRow_Any_NEON;
+    NV21ToARGBRow = NV21ToARGBRow_Any_NEON;
    if (IS_ALIGNED(width, 8)) {
-      NV12ToARGBRow = NV12ToARGBRow_NEON;
+      NV21ToARGBRow = NV21ToARGBRow_NEON;
    }
  }
 #endif
  for (y = 0; y < height; ++y) {
-    NV12ToARGBRow(src_y, src_uv, dst_argb, &kYvuConstants, width);
+    NV21ToARGBRow(src_y, src_uv, dst_argb, &kYuvConstants, width);
    dst_argb += dst_stride_argb;
    src_y += src_stride_y;
    if (y & 1) {

--- a/source/planar_functions.cc
+++ b/source/planar_functions.cc
@@ -1039,64 +1039,6 @@ int NV12ToRGB565(const uint8* src_y, int src_stride_y,
  return 0;
 }
-// Convert NV21 to RGB565.
-LIBYUV_API
-int NV21ToRGB565(const uint8* src_y, int src_stride_y,
-                 const uint8* src_vu, int src_stride_vu,
-                 uint8* dst_rgb565, int dst_stride_rgb565,
-                 int width, int height) {
-  int y;
-  void (*NV12ToRGB565Row)(const uint8* y_buf,
-                          const uint8* src_vu,
-                          uint8* rgb_buf,
-                          struct YuvConstants* yuvconstants,
-                          int width) = NV12ToRGB565Row_C;
-  if (!src_y || !src_vu || !dst_rgb565 ||
-      width <= 0 || height == 0) {
-    return -1;
-  }
-  // Negative height means invert the image.
-  if (height < 0) {
-    height = -height;
-    dst_rgb565 = dst_rgb565 + (height - 1) * dst_stride_rgb565;
-    dst_stride_rgb565 = -dst_stride_rgb565;
-  }
-#if defined(HAS_NV12TORGB565ROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3)) {
-    NV12ToRGB565Row = NV12ToRGB565Row_Any_SSSE3;
-    if (IS_ALIGNED(width, 8)) {
-      NV12ToRGB565Row = NV12ToRGB565Row_SSSE3;
-    }
-  }
-#endif
-#if defined(HAS_NV12TORGB565ROW_AVX2)
-  if (TestCpuFlag(kCpuHasAVX2)) {
-    NV12ToRGB565Row = NV12ToRGB565Row_Any_AVX2;
-    if (IS_ALIGNED(width, 16)) {
-      NV12ToRGB565Row = NV12ToRGB565Row_AVX2;
-    }
-  }
-#endif
-#if defined(HAS_NV12TORGB565ROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON)) {
-    NV12ToRGB565Row = NV12ToRGB565Row_Any_NEON;
-    if (IS_ALIGNED(width, 8)) {
-      NV12ToRGB565Row = NV12ToRGB565Row_NEON;
-    }
-  }
-#endif
-  for (y = 0; y < height; ++y) {
-    NV12ToRGB565Row(src_y, src_vu, dst_rgb565, &kYvuConstants, width);
-    dst_rgb565 += dst_stride_rgb565;
-    src_y += src_stride_y;
-    if (y & 1) {
-      src_vu += src_stride_vu;
-    }
-  }
-  return 0;
-}
 LIBYUV_API
 void SetPlane(uint8* dst_y, int dst_stride_y,
              int width, int height,

--- a/source/row_any.cc
+++ b/source/row_any.cc
@@ -280,6 +280,15 @@ ANY21C(NV12ToARGBRow_Any_AVX2, NV12ToARGBRow_AVX2, 1, 1, 2, 4, 15)
 #ifdef HAS_NV12TOARGBROW_NEON
 ANY21C(NV12ToARGBRow_Any_NEON, NV12ToARGBRow_NEON, 1, 1, 2, 4, 7)
 #endif
+#ifdef HAS_NV21TOARGBROW_SSSE3
+ANY21C(NV21ToARGBRow_Any_SSSE3, NV21ToARGBRow_SSSE3, 1, 1, 2, 4, 7)
+#endif
+#ifdef HAS_NV21TOARGBROW_AVX2
+ANY21C(NV21ToARGBRow_Any_AVX2, NV21ToARGBRow_AVX2, 1, 1, 2, 4, 15)
+#endif
+#ifdef HAS_NV21TOARGBROW_NEON
+ANY21C(NV21ToARGBRow_Any_NEON, NV12ToARGBRow_NEON, 1, 1, 2, 4, 7)
+#endif
 #ifdef HAS_NV12TORGB565ROW_SSSE3
 ANY21C(NV12ToRGB565Row_Any_SSSE3, NV12ToRGB565Row_SSSE3, 1, 1, 2, 2, 7)
 #endif

--- a/source/row_common.cc
+++ b/source/row_common.cc
@@ -1663,6 +1663,30 @@ void NV12ToARGBRow_C(const uint8* src_y,
  }
 }
+void NV21ToARGBRow_C(const uint8* src_y,
+                     const uint8* src_vu,
+                     uint8* rgb_buf,
+                     struct YuvConstants* yuvconstants,
+                     int width) {
+  int x;
+  for (x = 0; x < width - 1; x += 2) {
+    YuvPixel(src_y[0], src_vu[1], src_vu[0],
+             rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants);
+    rgb_buf[3] = 255;
+    YuvPixel(src_y[1], src_vu[1], src_vu[0],
+             rgb_buf + 4, rgb_buf + 5, rgb_buf + 6, yuvconstants);
+    rgb_buf[7] = 255;
+    src_y += 2;
+    src_vu += 2;
+    rgb_buf += 8;  // Advance 2 pixels.
+  }
+  if (width & 1) {
+    YuvPixel(src_y[0], src_vu[1], src_vu[0],
+             rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants);
+    rgb_buf[3] = 255;
+  }
+}
 void NV12ToRGB565Row_C(const uint8* src_y,
                       const uint8* src_uv,
                       uint8* dst_rgb565,

--- a/source/row_gcc.cc
+++ b/source/row_gcc.cc
@@ -164,6 +164,12 @@ static const lvec8 kShuffleUYVYUV = {
  0, 2, 0, 2, 4, 6, 4, 6, 8, 10, 8, 10, 12, 14, 12, 14,
  0, 2, 0, 2, 4, 6, 4, 6, 8, 10, 8, 10, 12, 14, 12, 14
 };
+// NV21 shuf 8 VU to 16 UV.
+static const lvec8 kShuffleNV21 = {
+  1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6,
+  1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6,
+};
 #endif  // HAS_RGB24TOARGBROW_SSSE3
 #ifdef HAS_J400TOARGBROW_SSE2
@@ -1398,6 +1404,15 @@ void RGBAToUVRow_SSSE3(const uint8* src_rgba0, int src_stride_rgba,
    "punpcklbw  %%xmm4,%%xmm4                                   \n"            \
    "lea        " MEMLEA(0x8, [y_buf]) ",%[y_buf]               \n"
+// Read 4 VU from NV21, upsample to 8 UV
+#define READNV21                                                               \
+    "movq       " MEMACCESS([vu_buf]) ",%%xmm0                  \n"            \
+    "lea        " MEMLEA(0x8, [vu_buf]) ",%[vu_buf]             \n"            \
+    "pshufb     %[kShuffleNV21], %%xmm0                         \n"            \
+    "movq       " MEMACCESS([y_buf]) ",%%xmm4                   \n"            \
+    "punpcklbw  %%xmm4,%%xmm4                                   \n"            \
+    "lea        " MEMLEA(0x8, [y_buf]) ",%[y_buf]               \n"
 // Read 4 YUY2 with 8 Y and update 4 UV to 8 UV.
 #define READYUY2                                                               \
    "movdqu     " MEMACCESS([yuy2_buf]) ",%%xmm4                \n"            \
@@ -1769,6 +1784,31 @@ void OMITFP NV12ToARGBRow_SSSE3(const uint8* y_buf,
  );
 }
+void OMITFP NV21ToARGBRow_SSSE3(const uint8* y_buf,
+                                const uint8* vu_buf,
+                                uint8* dst_argb,
+                                struct YuvConstants* yuvconstants,
+                                int width) {
+  asm volatile (
+    "pcmpeqb   %%xmm5,%%xmm5                   \n"
+    LABELALIGN
+  "1:                                          \n"
+    READNV21
+    YUVTORGB(yuvconstants)
+    STOREARGB
+    "sub       $0x8,%[width]                   \n"
+    "jg        1b                              \n"
+  : [y_buf]"+r"(y_buf),    // %[y_buf]
+    [vu_buf]"+r"(vu_buf),    // %[vu_buf]
+    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
+    [width]"+rm"(width)    // %[width]
+  : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
+    [kShuffleNV21]"m"(kShuffleNV21)
+  // Does not use r14.
+  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+  );
+}
 void OMITFP YUY2ToARGBRow_SSSE3(const uint8* yuy2_buf,
                                uint8* dst_argb,
                                struct YuvConstants* yuvconstants,
@@ -1940,6 +1980,17 @@ void OMITFP I422ToRGBARow_SSSE3(const uint8* y_buf,
    "vpunpcklbw %%ymm4,%%ymm4,%%ymm4                                \n"        \
    "lea        " MEMLEA(0x10, [y_buf]) ",%[y_buf]                  \n"
+// Read 8 VU from NV21, upsample to 16 UV.
+#define READNV21_AVX2                                                          \
+    "vmovdqu    " MEMACCESS([vu_buf]) ",%%xmm0                      \n"        \
+    "lea        " MEMLEA(0x10, [vu_buf]) ",%[vu_buf]                \n"        \
+    "vpermq     $0xd8,%%ymm0,%%ymm0                                 \n"        \
+    "vpshufb     %[kShuffleNV21], %%ymm0, %%ymm0                    \n"        \
+    "vmovdqu    " MEMACCESS([y_buf]) ",%%xmm4                       \n"        \
+    "vpermq     $0xd8,%%ymm4,%%ymm4                                 \n"        \
+    "vpunpcklbw %%ymm4,%%ymm4,%%ymm4                                \n"        \
+    "lea        " MEMLEA(0x10, [y_buf]) ",%[y_buf]                  \n"
 // Read 8 YUY2 with 16 Y and upsample 8 UV to 16 UV.
 #define READYUY2_AVX2                                                          \
    "vmovdqu    " MEMACCESS([yuy2_buf]) ",%%ymm4                    \n"        \
@@ -2251,8 +2302,37 @@ void OMITFP NV12ToARGBRow_AVX2(const uint8* y_buf,
  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
 }
-#endif  // HAS_YUY2TOARGBROW_AVX2
+#endif  // HAS_NV12TOARGBROW_AVX2
+#if defined(HAS_NV21TOARGBROW_AVX2)
+// 16 pixels.
+// 8 VU values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
+void OMITFP NV21ToARGBRow_AVX2(const uint8* y_buf,
+                               const uint8* vu_buf,
+                               uint8* dst_argb,
+                               struct YuvConstants* yuvconstants,
+                               int width) {
+  asm volatile (
+    "vpcmpeqb   %%ymm5,%%ymm5,%%ymm5           \n"
+    LABELALIGN
+  "1:                                          \n"
+    READNV21_AVX2
+    YUVTORGB_AVX2(yuvconstants)
+    STOREARGB_AVX2
+    "sub       $0x10,%[width]                  \n"
+    "jg        1b                              \n"
+    "vzeroupper                                \n"
+  : [y_buf]"+r"(y_buf),    // %[y_buf]
+    [vu_buf]"+r"(vu_buf),    // %[vu_buf]
+    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
+    [width]"+rm"(width)    // %[width]
+  : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
+    [kShuffleNV21]"m"(kShuffleNV21)
+  // Does not use r14.
+  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+  );
+}
+#endif  // HAS_NV21TOARGBROW_AVX2
 #if defined(HAS_YUY2TOARGBROW_AVX2)
 // 16 pixels.

--- a/source/row_neon.cc
+++ b/source/row_neon.cc
@@ -579,6 +579,34 @@ void NV12ToARGBRow_NEON(const uint8* src_y,
  );
 }
+void NV21ToARGBRow_NEON(const uint8* src_y,
+                        const uint8* src_vu,
+                        uint8* dst_argb,
+                        struct YuvConstants* yuvconstants,
+                        int width) {
+  asm volatile (
+    YUVTORGB_SETUP
+  "1:                                          \n"
+    READNV21
+    YUVTORGB
+    "subs       %3, %3, #8                     \n"
+    "vmov.u8    d23, #255                      \n"
+    MEMACCESS(2)
+    "vst4.8     {d20, d21, d22, d23}, [%2]!    \n"
+    "bgt        1b                             \n"
+    : "+r"(src_y),     // %0
+      "+r"(src_vu),    // %1
+      "+r"(dst_argb),  // %2
+      "+r"(width)      // %3
+    : [kUVToRB]"r"(&yuvconstants->kUVToRB),
+      [kUVToG]"r"(&yuvconstants->kUVToG),
+      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
+      [kYToRgb]"r"(&yuvconstants->kYToRgb)
+    : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
+      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+  );
+}
 void NV12ToRGB565Row_NEON(const uint8* src_y,
                          const uint8* src_uv,
                          uint8* dst_rgb565,

--- a/source/row_neon64.cc
+++ b/source/row_neon64.cc
@@ -576,6 +576,34 @@ void NV12ToARGBRow_NEON(const uint8* src_y,
 }
 #endif  // HAS_NV12TOARGBROW_NEON
+#ifdef HAS_NV12TOARGBROW_NEON
+void NV21ToARGBRow_NEON(const uint8* src_y,
+                        const uint8* src_vu,
+                        uint8* dst_argb,
+                        struct YuvConstants* yuvconstants,
+                        int width) {
+  asm volatile (
+    YUVTORGB_SETUP
+  "1:                                          \n"
+    READNV21
+    YUVTORGB(v22, v21, v20)
+    "subs       %w3, %w3, #8                   \n"
+    "movi       v23.8b, #255                   \n"
+    MEMACCESS(2)
+    "st4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%2], #32     \n"
+    "b.gt       1b                             \n"
+    : "+r"(src_y),     // %0
+      "+r"(src_vu),    // %1
+      "+r"(dst_argb),  // %2
+      "+r"(width)      // %3
+    : [kUVBiasBGR]"r"(&kYuvConstants.kUVBiasBGR),
+      [kYToRgb]"r"(&kYuvConstants.kYToRgb)
+    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
+      "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
+  );
+}
+#endif  // HAS_NV12TOARGBROW_NEON
 #ifdef HAS_NV12TORGB565ROW_NEON
 void NV12ToRGB565Row_NEON(const uint8* src_y,
                          const uint8* src_uv,

--- a/source/row_win.cc
+++ b/source/row_win.cc
@@ -319,6 +319,12 @@ static const lvec8 kShuffleUYVYUV = {
  0, 2, 0, 2, 4, 6, 4, 6, 8, 10, 8, 10, 12, 14, 12, 14
 };
+// NV21 shuf 8 VU to 16 UV.
+static const lvec8 kShuffleNV21 = {
+  1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6,
+  1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6,
+};
 // Duplicates gray value 3 times and fills in alpha opaque.
 __declspec(naked)
 void J400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
@@ -1992,6 +1998,18 @@ void RGBAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
    __asm lea        eax, [eax + 16]                                           \
  }
+// Read 8 UV from NV21, upsample to 16 UV.
+#define READNV21_AVX2 __asm {                                                  \
+    __asm vmovdqu    xmm0, [esi]                  /* UV */                     \
+    __asm lea        esi,  [esi + 16]                                          \
+    __asm vpermq     ymm0, ymm0, 0xd8                                          \
+    __asm vpshufb    ymm0, ymm0, ymmword ptr kShuffleNV21                      \
+    __asm vmovdqu    xmm4, [eax]                  /* Y */                      \
+    __asm vpermq     ymm4, ymm4, 0xd8                                          \
+    __asm vpunpcklbw ymm4, ymm4, ymm4                                          \
+    __asm lea        eax, [eax + 16]                                           \
+  }
 // Read 8 YUY2 with 16 Y and upsample 8 UV to 16 UV.
 #define READYUY2_AVX2 __asm {                                                  \
    __asm vmovdqu    ymm4, [eax]          /* YUY2 */                           \
@@ -2365,6 +2383,41 @@ void NV12ToARGBRow_AVX2(const uint8* y_buf,
 }
 #endif  // HAS_NV12TOARGBROW_AVX2
+#ifdef HAS_NV21TOARGBROW_AVX2
+// 16 pixels.
+// 8 VU values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
+__declspec(naked)
+void NV21ToARGBRow_AVX2(const uint8* y_buf,
+                        const uint8* vu_buf,
+                        uint8* dst_argb,
+                        struct YuvConstants* yuvconstants,
+                        int width) {
+  __asm {
+    push       esi
+    push       ebx
+    mov        eax, [esp + 8 + 4]   // Y
+    mov        esi, [esp + 8 + 8]   // VU
+    mov        edx, [esp + 8 + 12]  // argb
+    mov        ebx, [esp + 8 + 16]  // yuvconstants
+    mov        ecx, [esp + 8 + 20]  // width
+    vpcmpeqb   ymm5, ymm5, ymm5     // generate 0xffffffffffffffff for alpha
+ convertloop:
+    READNV21_AVX2
+    YUVTORGB_AVX2(ebx)
+    STOREARGB_AVX2
+    sub        ecx, 16
+    jg         convertloop
+    pop        ebx
+    pop        esi
+    vzeroupper
+    ret
+  }
+}
+#endif  // HAS_NV21TOARGBROW_AVX2
 // 16 pixels.
 // 8 YUY2 values with 16 Y and 8 UV producing 16 ARGB (64 bytes).
 __declspec(naked)
@@ -2608,6 +2661,16 @@ void I422ToABGRRow_AVX2(const uint8* y_buf,
    __asm lea        eax, [eax + 8]                                            \
  }
+// Read 4 VU from NV21, upsample to 8 UV.
+#define READNV21 __asm {                                                       \
+    __asm movq       xmm0, qword ptr [esi] /* UV */                            \
+    __asm lea        esi,  [esi + 8]                                           \
+    __asm pshufb     xmm0, xmmword ptr kShuffleNV21                            \
+    __asm movq       xmm4, qword ptr [eax]                                     \
+    __asm punpcklbw  xmm4, xmm4                                                \
+    __asm lea        eax, [eax + 8]                                            \
+  }
 // Read 4 YUY2 with 8 Y and upsample 4 UV to 8 UV.
 #define READYUY2 __asm {                                                       \
    __asm movdqu     xmm4, [eax]          /* YUY2 */                           \
@@ -3152,6 +3215,38 @@ void NV12ToARGBRow_SSSE3(const uint8* y_buf,
  }
 }
+// 8 pixels.
+// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
+__declspec(naked)
+void NV21ToARGBRow_SSSE3(const uint8* y_buf,
+                         const uint8* vu_buf,
+                         uint8* dst_argb,
+                         struct YuvConstants* yuvconstants,
+                         int width) {
+  __asm {
+    push       esi
+    push       ebx
+    mov        eax, [esp + 8 + 4]   // Y
+    mov        esi, [esp + 8 + 8]   // VU
+    mov        edx, [esp + 8 + 12]  // argb
+    mov        ebx, [esp + 8 + 16]  // yuvconstants
+    mov        ecx, [esp + 8 + 20]  // width
+    pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
+ convertloop:
+    READNV21
+    YUVTORGB(ebx)
+    STOREARGB
+    sub        ecx, 8
+    jg         convertloop
+    pop        ebx
+    pop        esi
+    ret
+  }
+}
 // 8 pixels.
 // 4 YUY2 values with 8 Y and 4 UV producing 8 ARGB (32 bytes).
 __declspec(naked)

--- a/source/scale_gcc.cc
+++ b/source/scale_gcc.cc
@@ -9,6 +9,7 @@
 */
 #include "libyuv/row.h"
+#include "libyuv/scale_row.h"
 #ifdef __cplusplus
 namespace libyuv {
@@ -608,12 +609,12 @@ void ScaleAddRow_SSE2(const uint8* src_ptr, uint16* dst_ptr, int src_width) {
 // Reads 32 bytes and accumulates to 32 shorts at a time.
 void ScaleAddRow_AVX2(const uint8* src_ptr, uint16* dst_ptr, int src_width) {
  asm volatile (
-    "vpxor      %%xmm5,%%xmm5                  \n"
+    "vpxor      %%ymm5,%%ymm5,%%ymm5           \n"
    LABELALIGN
  "1:                                          \n"
    "vmovdqu    " MEMACCESS(0) ",%%ymm3        \n"
-    "lea        " MEMLEA(0x20,0) ",%0          \n"  // src_ptr += 16
+    "lea        " MEMLEA(0x20,0) ",%0          \n"  // src_ptr += 32
    "vpermq     $0xd8,%%ymm3,%%ymm3            \n"
    "vpunpcklbw %%ymm5,%%ymm3,%%ymm2           \n"
    "vpunpckhbw %%ymm5,%%ymm3,%%ymm3           \n"

--- a/unit_test/convert_test.cc
+++ b/unit_test/convert_test.cc
@@ -671,7 +671,6 @@ TEST_F(libyuvTest, FMT_PLANAR##To##FMT_B##N) {                                 \
 TESTBIPLANARTOB(NV12, 2, 2, ARGB, 4, 2)
 TESTBIPLANARTOB(NV21, 2, 2, ARGB, 4, 2)
 TESTBIPLANARTOB(NV12, 2, 2, RGB565, 2, 9)
-TESTBIPLANARTOB(NV21, 2, 2, RGB565, 2, 9)
 #define TESTATOPLANARI(FMT_A, BPP_A, YALIGN, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \
                       W1280, DIFF, N, NEG, OFF)                               \