add YUV24 and AYUV formats

Alternatives to RGB24 and AYUV for working with GPU. BUG=libyuv:832 TESTED=out/Release/libyuv_unittest --gtest_filter=*NV21To???24* --libyuv_width=1280 --libyuv_height=720 --libyuv_repeat=1000 --libyuv_flags=-1 --libyuv_cpu_info=-1 R=rrwinterton@gmail.com Change-Id: I5559c63f4bd4c847492fcb1571f7b03c58146689 Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/1501735Reviewed-by: richard winterton <rrwinterton@gmail.com> Reviewed-by: Frank Barchard <fbarchard@chromium.org> Commit-Queue: Frank Barchard <fbarchard@chromium.org>

add YUV24 and AYUV formats
Alternatives to RGB24 and AYUV for working with GPU. BUG=libyuv:832 TESTED=out/Release/libyuv_unittest --gtest_filter=*NV21To???24* --libyuv_width=1280 --libyuv_height=720 --libyuv_repeat=1000 --libyuv_flags=-1 --libyuv_cpu_info=-1 R=rrwinterton@gmail.com Change-Id: I5559c63f4bd4c847492fcb1571f7b03c58146689 Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/1501735Reviewed-by: richard winterton <rrwinterton@gmail.com> Reviewed-by: Frank Barchard <fbarchard@chromium.org> Commit-Queue: Frank Barchard <fbarchard@chromium.org>
5b6042fa · Frank Barchard · Commit Bot · 7ce50764 · 5b6042fa · 5b6042fa
Commit 5b6042fa authored Mar 05, 2019 by Frank Barchard Committed by Commit Bot Mar 05, 2019
15 changed files
--- a/README.chromium
+++ b/README.chromium
 Name: libyuv
 URL: http://code.google.com/p/libyuv/
-Version: 1724
+Version: 1725
 License: BSD
 License File: LICENSE

--- a/include/libyuv/convert.h
+++ b/include/libyuv/convert.h
@@ -226,6 +226,17 @@ int UYVYToI420(const uint8_t* src_uyvy,
               int width,
               int height);
+// Convert AYUV to NV21.
+LIBYUV_API
+int AYUVToNV21(const uint8_t* src_ayuv,
+               int src_stride_ayuv,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_vu,
+               int dst_stride_vu,
+               int width,
+               int height);
 // Convert M420 to I420.
 LIBYUV_API
 int M420ToI420(const uint8_t* src_m420,
@@ -375,13 +386,11 @@ int ARGB4444ToI420(const uint8_t* src_argb4444,
                   int height);
 #ifdef HAVE_JPEG
-// src_mjpg is pointer to raw jpeg bytes in memory
-// src_size_mjpg is size of jpeg in bytes
 // src_width/height provided by capture.
 // dst_width/height for clipping determine final size.
 LIBYUV_API
-int MJPGToI420(const uint8_t* src_mjpg,
+int MJPGToI420(const uint8_t* sample,
-               size_t src_size_mjpg,
+               size_t sample_size,
               uint8_t* dst_y,
               int dst_stride_y,
               uint8_t* dst_u,
@@ -395,8 +404,8 @@ int MJPGToI420(const uint8_t* src_mjpg,
 // JPEG to NV21
 LIBYUV_API
-int MJPGToNV21(const uint8_t* src_mjpg,
+int MJPGToNV21(const uint8_t* sample,
-               size_t src_size_mjpg,
+               size_t sample_size,
               uint8_t* dst_y,
               int dst_stride_y,
               uint8_t* dst_vu,
@@ -408,8 +417,8 @@ int MJPGToNV21(const uint8_t* src_mjpg,
 // Query size of MJPG in pixels.
 LIBYUV_API
-int MJPGSize(const uint8_t* src_mjpg,
+int MJPGSize(const uint8_t* sample,
-             size_t src_size_mjpg,
+             size_t sample_size,
             int* width,
             int* height);
 #endif

--- a/include/libyuv/convert_argb.h
+++ b/include/libyuv/convert_argb.h
@@ -298,6 +298,17 @@ int NV21ToRGB24(const uint8_t* src_y,
                int width,
                int height);
+// Convert NV21 to YUV24.
+LIBYUV_API
+int NV21ToYUV24(const uint8_t* src_y,
+                int src_stride_y,
+                const uint8_t* src_vu,
+                int src_stride_vu,
+                uint8_t* dst_yuv24,
+                int dst_stride_yuv24,
+                int width,
+                int height);
 // Convert NV12 to RAW.
 LIBYUV_API
 int NV12ToRAW(const uint8_t* src_y,
@@ -627,8 +638,8 @@ int AR30ToAB30(const uint8_t* src_ar30,
 // src_width/height provided by capture
 // dst_width/height for clipping determine final size.
 LIBYUV_API
-int MJPGToARGB(const uint8_t* src_mjpg,
+int MJPGToARGB(const uint8_t* sample,
-               size_t src_size_mjpg,
+               size_t sample_size,
               uint8_t* dst_argb,
               int dst_stride_argb,
               int src_width,

--- a/include/libyuv/mjpeg_decoder.h
+++ b/include/libyuv/mjpeg_decoder.h
@@ -26,7 +26,7 @@ namespace libyuv {
 extern "C" {
 #endif
-LIBYUV_BOOL ValidateJpeg(const uint8_t* src_mjpg, size_t src_size_mjpg_size);
+LIBYUV_BOOL ValidateJpeg(const uint8_t* sample, size_t sample_size);
 #ifdef __cplusplus
 }  // extern "C"

--- a/include/libyuv/row.h
+++ b/include/libyuv/row.h
@@ -295,6 +295,8 @@ extern "C" {
 #define HAS_I422TOYUY2ROW_AVX2
 #define HAS_MERGEUVROW_16_AVX2
 #define HAS_MULTIPLYROW_16_AVX2
+// TODO(fbarchard): Fix AVX2 version of YUV24
+// #define HAS_NV21TOYUV24ROW_AVX2
 #endif
 // The following are available for AVX512 clang x86 platforms:
@@ -330,6 +332,8 @@ extern "C" {
 #define HAS_ARGBTOUVROW_NEON
 #define HAS_ARGBTOYJROW_NEON
 #define HAS_ARGBTOYROW_NEON
+#define HAS_AYUVTOVUROW_NEON
+#define HAS_AYUVTOYROW_NEON
 #define HAS_BGRATOUVROW_NEON
 #define HAS_BGRATOYROW_NEON
 #define HAS_BYTETOFLOATROW_NEON
@@ -355,6 +359,7 @@ extern "C" {
 #define HAS_NV12TORGB565ROW_NEON
 #define HAS_NV21TOARGBROW_NEON
 #define HAS_NV21TORGB24ROW_NEON
+#define HAS_NV21TOYUV24ROW_NEON
 #define HAS_RAWTOARGBROW_NEON
 #define HAS_RAWTORGB24ROW_NEON
 #define HAS_RAWTOUVROW_NEON
@@ -402,6 +407,7 @@ extern "C" {
 // The following are available on AArch64 platforms:
 #if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
+#define HAS_FLOATDIVTOBYTEROW_NEON
 #define HAS_SCALESUMSAMPLES_NEON
 #endif
 #if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa)
@@ -815,6 +821,10 @@ void NV21ToRGB24Row_NEON(const uint8_t* src_y,
                         uint8_t* dst_rgb24,
                         const struct YuvConstants* yuvconstants,
                         int width);
+void NV21ToYUV24Row_NEON(const uint8_t* src_y,
+                         const uint8_t* src_vu,
+                         uint8_t* dst_yuv24,
+                         int width);
 void YUY2ToARGBRow_NEON(const uint8_t* src_yuy2,
                        uint8_t* dst_argb,
                        const struct YuvConstants* yuvconstants,
@@ -2183,6 +2193,10 @@ void NV21ToRGB24Row_C(const uint8_t* src_y,
                      uint8_t* rgb_buf,
                      const struct YuvConstants* yuvconstants,
                      int width);
+void NV21ToYUV24Row_C(const uint8_t* src_y,
+                      const uint8_t* src_vu,
+                      uint8_t* dst_yuv24,
+                      int width);
 void YUY2ToARGBRow_C(const uint8_t* src_yuy2,
                     uint8_t* rgb_buf,
                     const struct YuvConstants* yuvconstants,
@@ -2349,6 +2363,10 @@ void NV21ToRGB24Row_AVX2(const uint8_t* src_y,
                         uint8_t* dst_rgb24,
                         const struct YuvConstants* yuvconstants,
                         int width);
+void NV21ToYUV24Row_AVX2(const uint8_t* src_y,
+                         const uint8_t* src_vu,
+                         uint8_t* dst_yuv24,
+                         int width);
 void NV12ToRGB565Row_AVX2(const uint8_t* src_y,
                          const uint8_t* src_uv,
                          uint8_t* dst_rgb565,
@@ -2554,6 +2572,10 @@ void NV21ToRGB24Row_Any_AVX2(const uint8_t* y_buf,
                             uint8_t* dst_ptr,
                             const struct YuvConstants* yuvconstants,
                             int width);
+void NV21ToYUV24Row_Any_AVX2(const uint8_t* src_y,
+                             const uint8_t* src_vu,
+                             uint8_t* dst_yuv24,
+                             int width);
 void NV12ToRGB565Row_Any_SSSE3(const uint8_t* y_buf,
                               const uint8_t* uv_buf,
                               uint8_t* dst_ptr,
@@ -3027,6 +3049,10 @@ void NV21ToRGB24Row_Any_NEON(const uint8_t* y_buf,
                             uint8_t* dst_ptr,
                             const struct YuvConstants* yuvconstants,
                             int width);
+void NV21ToYUV24Row_Any_NEON(const uint8_t* src_y,
+                             const uint8_t* src_vu,
+                             uint8_t* dst_yuv24,
+                             int width);
 void NV12ToRGB565Row_Any_NEON(const uint8_t* y_buf,
                              const uint8_t* uv_buf,
                              uint8_t* dst_ptr,
@@ -3345,6 +3371,19 @@ void UYVYToUV422Row_Any_MMI(const uint8_t* src_ptr,
                            uint8_t* dst_v,
                            int width);
+void AYUVToYRow_C(const uint8_t* src_ayuv, uint8_t* dst_y, int width);
+void AYUVToVURow_C(const uint8_t* src_ayuv, int stride_ayuv,
+                   uint8_t* dst_vu,
+                   int width);
+void AYUVToYRow_NEON(const uint8_t* src_ayuv, uint8_t* dst_y, int width);
+void AYUVToVURow_NEON(const uint8_t* src_ayuv, int stride_ayuv,
+                      uint8_t* dst_vu,
+                      int width);
+void AYUVToYRow_Any_NEON(const uint8_t* src_ayuv, uint8_t* dst_y, int width);
+void AYUVToVURow_Any_NEON(const uint8_t* src_ayuv, int stride_ayuv,
+                          uint8_t* dst_vu,
+                          int width);
 void I422ToYUY2Row_C(const uint8_t* src_y,
                     const uint8_t* src_u,
                     const uint8_t* src_v,
@@ -3960,6 +3999,18 @@ float ScaleSumSamples_NEON(const float* src,
 void ScaleSamples_C(const float* src, float* dst, float scale, int width);
 void ScaleSamples_NEON(const float* src, float* dst, float scale, int width);
+void FloatDivToByteRow_C(const float* src_weights,
+                         const float* src_values,
+                         uint8_t* dst_out,
+                         uint8_t* dst_mask,
+                         int width);
+void FloatDivToByteRow_NEON(const float* src_weights,
+                            const float* src_values,
+                            uint8_t* dst_out,
+                            uint8_t* dst_mask,
+                            int width);
 #ifdef __cplusplus
 }  // extern "C"
 }  // namespace libyuv

--- a/include/libyuv/version.h
+++ b/include/libyuv/version.h
@@ -11,6 +11,6 @@
 #ifndef INCLUDE_LIBYUV_VERSION_H_
 #define INCLUDE_LIBYUV_VERSION_H_
-#define LIBYUV_VERSION 1724
+#define LIBYUV_VERSION 1725
 #endif  // INCLUDE_LIBYUV_VERSION_H_
--- a/source/convert.cc
+++ b/source/convert.cc
@@ -880,6 +880,76 @@ int UYVYToI420(const uint8_t* src_uyvy,
  return 0;
 }
+// Convert AYUV to NV21.
+LIBYUV_API
+int AYUVToNV21(const uint8_t* src_ayuv,
+               int src_stride_ayuv,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_vu,
+               int dst_stride_vu,
+               int width,
+               int height) {
+  int y;
+  void (*AYUVToVURow)(const uint8_t* src_ayuv, int src_stride_ayuv,
+                      uint8_t* dst_vu, int width) =
+      AYUVToVURow_C;
+  void (*AYUVToYRow)(const uint8_t* src_ayuv, uint8_t* dst_y, int width) =
+      AYUVToYRow_C;
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    src_ayuv = src_ayuv + (height - 1) * src_stride_ayuv;
+    src_stride_ayuv = -src_stride_ayuv;
+  }
+// place holders for future intel code
+#if defined(HAS_AYUVTOYROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    AYUVToVURow = AYUVToVURow_Any_SSE2;
+    AYUVToYRow = AYUVToYRow_Any_SSE2;
+    if (IS_ALIGNED(width, 16)) {
+      AYUVToVURow = AYUVToVURow_SSE2;
+      AYUVToYRow = AYUVToYRow_SSE2;
+    }
+  }
+#endif
+#if defined(HAS_AYUVTOYROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    AYUVToVURow = AYUVToVURow_Any_AVX2;
+    AYUVToYRow = AYUVToYRow_Any_AVX2;
+    if (IS_ALIGNED(width, 32)) {
+      AYUVToVURow = AYUVToVURow_AVX2;
+      AYUVToYRow = AYUVToYRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_AYUVTOYROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    AYUVToYRow = AYUVToYRow_Any_NEON;
+    AYUVToVURow = AYUVToVURow_Any_NEON;
+    if (IS_ALIGNED(width, 16)) {
+      AYUVToYRow = AYUVToYRow_NEON;
+      AYUVToVURow = AYUVToVURow_NEON;
+    }
+  }
+#endif
+  for (y = 0; y < height - 1; y += 2) {
+    AYUVToVURow(src_ayuv, src_stride_ayuv, dst_vu, width);
+    AYUVToYRow(src_ayuv, dst_y, width);
+    AYUVToYRow(src_ayuv + src_stride_ayuv, dst_y + dst_stride_y, width);
+    src_ayuv += src_stride_ayuv * 2;
+    dst_y += dst_stride_y * 2;
+    dst_vu += dst_stride_vu;
+  }
+  if (height & 1) {
+    AYUVToVURow(src_ayuv, 0, dst_vu, width);
+    AYUVToYRow(src_ayuv, dst_y, width);
+  }
+  return 0;
+}
 // Convert ARGB to I420.
 LIBYUV_API
 int ARGBToI420(const uint8_t* src_argb,
@@ -2165,6 +2235,7 @@ int Android420ToI420(const uint8_t* src_y,
  return 0;
 }
 #ifdef __cplusplus
 }  // extern "C"
 }  // namespace libyuv

--- a/source/convert_argb.cc
+++ b/source/convert_argb.cc
@@ -1998,6 +1998,56 @@ int NV21ToRAW(const uint8_t* src_y,
                           dst_stride_raw, &kYvuI601Constants, width, height);
 }
+// Convert NV21 to YUV24
+int NV21ToYUV24(const uint8_t* src_y,
+                int src_stride_y,
+                const uint8_t* src_vu,
+                int src_stride_vu,
+                uint8_t* dst_yuv24,
+                int dst_stride_yuv24,
+                int width,
+                int height) {
+  int y;
+  void (*NV21ToYUV24Row)(const uint8_t* src_y,
+                         const uint8_t* src_vu,
+                          uint8_t* dst_yuv24,
+                          int width) = NV21ToYUV24Row_C;
+  if (!src_y || !src_vu || !dst_yuv24 || width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    dst_yuv24 = dst_yuv24 + (height - 1) * dst_stride_yuv24;
+    dst_stride_yuv24 = -dst_stride_yuv24;
+  }
+#if defined(HAS_NV21TOYUV24ROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    NV21ToYUV24Row = NV21ToYUV24Row_Any_NEON;
+    if (IS_ALIGNED(width, 16)) {
+      NV21ToYUV24Row = NV21ToYUV24Row_NEON;
+    }
+  }
+#endif
+#if defined(HAS_NV21TOYUV24ROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    NV21ToYUV24Row = NV21ToYUV24Row_Any_AVX2;
+    if (IS_ALIGNED(width, 32)) {
+      NV21ToYUV24Row = NV21ToYUV24Row_AVX2;
+    }
+  }
+#endif
+  for (y = 0; y < height; ++y) {
+    NV21ToYUV24Row(src_y, src_vu, dst_yuv24, width);
+    dst_yuv24 += dst_stride_yuv24;
+    src_y += src_stride_y;
+    if (y & 1) {
+      src_vu += src_stride_vu;
+    }
+  }
+  return 0;
+}
 // Convert M420 to ARGB.
 LIBYUV_API
 int M420ToARGB(const uint8_t* src_m420,

--- a/source/row_any.cc
+++ b/source/row_any.cc
@@ -286,7 +286,12 @@ ANY21(MergeUVRow_Any_MSA, MergeUVRow_MSA, 0, 1, 1, 2, 15)
 #ifdef HAS_MERGEUVROW_MMI
 ANY21(MergeUVRow_Any_MMI, MergeUVRow_MMI, 0, 1, 1, 2, 7)
 #endif
+#ifdef HAS_NV21TOYUV24ROW_NEON
+ANY21(NV21ToYUV24Row_Any_NEON, NV21ToYUV24Row_NEON, 1, 1, 2, 3, 15)
+#endif
+#ifdef HAS_NV21TOYUV24ROW_AVX2
+ANY21(NV21ToYUV24Row_Any_AVX2, NV21ToYUV24Row_AVX2, 1, 1, 2, 3, 31)
+#endif
 // Math functions.
 #ifdef HAS_ARGBMULTIPLYROW_SSE2
 ANY21(ARGBMultiplyRow_Any_SSE2, ARGBMultiplyRow_SSE2, 0, 4, 4, 4, 3)
@@ -702,6 +707,10 @@ ANY11(UYVYToYRow_Any_MSA, UYVYToYRow_MSA, 1, 4, 1, 31)
 #ifdef HAS_UYVYTOYROW_MMI
 ANY11(UYVYToYRow_Any_MMI, UYVYToYRow_MMI, 1, 4, 1, 15)
 #endif
+#ifdef HAS_AYUVTOYROW_NEON
+ANY11(AYUVToYRow_Any_NEON, AYUVToYRow_NEON, 0, 4, 1, 15)
+#endif
 #ifdef HAS_RGB24TOARGBROW_NEON
 ANY11(RGB24ToARGBRow_Any_NEON, RGB24ToARGBRow_NEON, 0, 3, 4, 7)
 #endif
@@ -1381,6 +1390,36 @@ ANY12S(UYVYToUVRow_Any_MMI, UYVYToUVRow_MMI, 1, 4, 15)
 #endif
 #undef ANY12S
+// Any 1 to 1 with source stride (2 rows of source).  Outputs UV plane.
+// 128 byte row allows for 32 avx ARGB pixels.
+#define ANY11S(NAMEANY, ANY_SIMD, UVSHIFT, BPP, MASK)                        \
+  void NAMEANY(const uint8_t* src_ptr, int src_stride_ptr, uint8_t* dst_vu,  \
+               int width) {                                                  \
+    SIMD_ALIGNED(uint8_t temp[128 * 3]);                                     \
+    memset(temp, 0, 128 * 2); /* for msan */                                 \
+    int r = width & MASK;                                                    \
+    int n = width & ~MASK;                                                   \
+    if (n > 0) {                                                             \
+      ANY_SIMD(src_ptr, src_stride_ptr, dst_vu, n);                          \
+    }                                                                        \
+    memcpy(temp, src_ptr + (n >> UVSHIFT) * BPP, SS(r, UVSHIFT) * BPP);      \
+    memcpy(temp + 128, src_ptr + src_stride_ptr + (n >> UVSHIFT) * BPP,      \
+           SS(r, UVSHIFT) * BPP);                                            \
+    if ((width & 1) && UVSHIFT == 0) { /* repeat last pixel for subsample */ \
+      memcpy(temp + SS(r, UVSHIFT) * BPP, temp + SS(r, UVSHIFT) * BPP - BPP, \
+             BPP);                                                           \
+      memcpy(temp + 128 + SS(r, UVSHIFT) * BPP,                              \
+             temp + 128 + SS(r, UVSHIFT) * BPP - BPP, BPP);                  \
+    }                                                                        \
+    ANY_SIMD(temp, 128, temp + 256, MASK + 1);                               \
+    memcpy(dst_vu + (n >> 1) * 2, temp + 256, SS(r, 1) * 2);                 \
+  }
+#ifdef HAS_AYUVTOVUROW_NEON
+ANY11S(AYUVToVURow_Any_NEON, AYUVToVURow_NEON, 0, 4, 15)
+#endif
+#undef ANY11S
 #ifdef __cplusplus
 }  // extern "C"
 }  // namespace libyuv

--- a/source/row_common.cc
+++ b/source/row_common.cc
@@ -3231,6 +3231,73 @@ void GaussCol_C(const uint16_t* src0,
  }
 }
+// Convert biplanar NV21 to packed YUV24
+void NV21ToYUV24Row_C(const uint8_t* src_y,
+                      const uint8_t* src_vu,
+                      uint8_t* dst_yuv24,
+                      int width) {
+  int x;
+  for (x = 0; x < width - 1; x += 2) {
+    dst_yuv24[0] = src_vu[0];  // V
+  	dst_yuv24[1] = src_vu[1];  // U
+    dst_yuv24[2] = src_y[0];   // Y0
+    dst_yuv24[3] = src_vu[0];  // V
+  	dst_yuv24[4] = src_vu[1];  // U
+    dst_yuv24[5] = src_y[1];   // Y1
+    src_y += 2;
+    src_vu += 2;
+    dst_yuv24 += 6;  // Advance 2 pixels.
+  }
+  if (width & 1) {
+    dst_yuv24[0] = src_vu[0];  // V
+    dst_yuv24[1] = src_vu[1];  // U
+    dst_yuv24[2] = src_y[0];   // Y0
+  }
+}
+// Filter 2 rows of AYUV UV's (444) into VU (420).
+void AYUVToVURow_C(const uint8_t* src_ayuv,
+                   int src_stride_ayuv,
+                   uint8_t* dst_vu,
+                   int width) {
+  // Output a row of VU values, filtering 2x2 rows of AYUV.
+  int x;
+  for (x = 0; x < width; x += 2) {
+    dst_vu[0] = (src_ayuv[0] + src_ayuv[4] + src_ayuv[src_stride_ayuv + 0] + src_ayuv[src_stride_ayuv + 4] + 2) >> 2;
+    dst_vu[1] = (src_ayuv[1] + src_ayuv[5] + src_ayuv[src_stride_ayuv + 1] + src_ayuv[src_stride_ayuv + 5] + 2) >> 2;
+    src_ayuv += 8;
+    dst_vu += 2;
+  }
+  if (width & 1) {
+    dst_vu[0] = (src_ayuv[0] + src_ayuv[0] + src_ayuv[src_stride_ayuv + 0] + src_ayuv[src_stride_ayuv + 0] + 2) >> 2;
+    dst_vu[1] = (src_ayuv[1] + src_ayuv[1] + src_ayuv[src_stride_ayuv + 1] + src_ayuv[src_stride_ayuv + 1] + 2) >> 2;
+ }
+}
+// Copy row of AYUV Y's into Y
+void AYUVToYRow_C(const uint8_t* src_ayuv, uint8_t* dst_y, int width) {
+  // Output a row of Y values.
+  int x;
+  for (x = 0; x < width; ++x) {
+    dst_y[x] = src_ayuv[2];   // v,u,y,a
+    src_ayuv += 4;
+  }
+}
+// divide values by weights and provide mask to indicate weight of 0.
+void FloatDivToByteRow_C(const float* src_weights,
+                         const float* src_values,
+                         uint8_t* dst_out,
+                         uint8_t* dst_mask,
+                         int width) {
+  int x;
+  for (x = 0; x < width; ++x) {
+    dst_out[x] = Clamp(src_values[x] / src_weights[x]);
+    dst_mask[x] = src_weights[x] > 0 ? 0 : 0xff;
+  }
+}
 #ifdef __cplusplus
 }  // extern "C"
 }  // namespace libyuv

--- a/source/row_gcc.cc
+++ b/source/row_gcc.cc
@@ -6669,6 +6669,121 @@ void ARGBLumaColorTableRow_SSSE3(const uint8_t* src_argb,
 }
 #endif  // HAS_ARGBLUMACOLORTABLEROW_SSSE3
+#ifdef HAS_NV21TOYUV24ROW_AVX2
+// begin NV21ToYUV24Row_C avx2 constants
+static const ulvec8 kBLEND0 = {0x80, 0x00, 0x80, 0x80, 0x00, 0x80, 0x80, 0x00,
+                               0x80, 0x80, 0x00, 0x80, 0x80, 0x00, 0x80, 0x80,
+                               0x00, 0x80, 0x00, 0x00, 0x80, 0x00, 0x00, 0x80,
+                               0x00, 0x00, 0x80, 0x00, 0x00, 0x80, 0x00, 0x00};
+static const ulvec8 kBLEND1 = {0x00, 0x00, 0x80, 0x00, 0x00, 0x80, 0x00, 0x00,
+                               0x80, 0x00, 0x00, 0x80, 0x00, 0x00, 0x80, 0x00,
+                               0x80, 0x00, 0x00, 0x80, 0x00, 0x00, 0x80, 0x00,
+                               0x00, 0x80, 0x00, 0x00, 0x80, 0x00, 0x00, 0x80};
+static const ulvec8 kBLEND2 = {0x80, 0x00, 0x00, 0x80, 0x00, 0x00, 0x80, 0x00,
+                               0x00, 0x80, 0x00, 0x00, 0x80, 0x00, 0x00, 0x80,
+                               0x00, 0x00, 0x80, 0x00, 0x00, 0x80, 0x00, 0x00,
+                               0x80, 0x00, 0x00, 0x80, 0x00, 0x00, 0x80, 0x00};
+static const ulvec8 kSHUF0 = {0x00, 0x0b, 0x80, 0x01, 0x0c, 0x80, 0x02, 0x0d,
+                              0x80, 0x03, 0x0e, 0x80, 0x04, 0x0f, 0x80, 0x05,
+                              0x00, 0x0b, 0x80, 0x01, 0x0c, 0x80, 0x02, 0x0d,
+                              0x80, 0x03, 0x0e, 0x80, 0x04, 0x0f, 0x80, 0x05};
+static const ulvec8 kSHUF1 = {0x80, 0x00, 0x0b, 0x80, 0x01, 0x0c, 0x80, 0x02,
+                              0x0d, 0x80, 0x03, 0x0e, 0x80, 0x04, 0x0f, 0x80,
+                              0x80, 0x00, 0x0b, 0x80, 0x01, 0x0c, 0x80, 0x02,
+                              0x0d, 0x80, 0x03, 0x0e, 0x80, 0x04, 0x0f, 0x80};
+static const ulvec8 kSHUF2 = {0x0a, 0x80, 0x00, 0x0b, 0x80, 0x01, 0x0c, 0x80,
+                              0x02, 0x0d, 0x80, 0x03, 0x0e, 0x80, 0x04, 0x0f,
+                              0x0a, 0x80, 0x00, 0x0b, 0x80, 0x01, 0x0c, 0x80,
+                              0x02, 0x0d, 0x80, 0x03, 0x0e, 0x80, 0x04, 0x0f};
+static const ulvec8 kSHUF3 = {0x80, 0x80, 0x06, 0x80, 0x80, 0x07, 0x80, 0x80,
+                              0x08, 0x80, 0x80, 0x09, 0x80, 0x80, 0x0a, 0x80,
+                              0x80, 0x80, 0x06, 0x80, 0x80, 0x07, 0x80, 0x80,
+                              0x08, 0x80, 0x80, 0x09, 0x80, 0x80, 0x0a, 0x80};
+static const ulvec8 kSHUF4 = {0x05, 0x80, 0x80, 0x06, 0x80, 0x80, 0x07, 0x80,
+                              0x80, 0x08, 0x80, 0x80, 0x09, 0x80, 0x80, 0x0a,
+                              0x05, 0x80, 0x80, 0x06, 0x80, 0x80, 0x07, 0x80,
+                              0x80, 0x08, 0x80, 0x80, 0x09, 0x80, 0x80, 0x0a};
+static const ulvec8 kSHUF5 = {0x80, 0x05, 0x80, 0x80, 0x06, 0x80, 0x80, 0x07,
+                              0x80, 0x80, 0x08, 0x80, 0x80, 0x09, 0x80, 0x80,
+                              0x80, 0x05, 0x80, 0x80, 0x06, 0x80, 0x80, 0x07,
+                              0x80, 0x80, 0x08, 0x80, 0x80, 0x09, 0x80, 0x80};
+// NV21ToYUV24Row_AVX2
+void NV21ToYUV24Row_AVX2(const uint8_t* src_y,
+                         const uint8_t* src_vu,
+                         uint8_t* dst_yuv24,
+                         int width) {
+  uint8_t* src_y_ptr;
+  uint64_t src_offset = 0;
+  uint64_t width64;
+  width64 = width;
+  src_y_ptr = (uint8_t *) src_y;
+  asm volatile(
+      "vmovdqu     %5, %%ymm0 \n"  //init blend value
+      "vmovdqu     %6, %%ymm1 \n"  //init blend value
+      "vmovdqu     %7, %%ymm2 \n"  //init blend value
+//      "sub         $0x20, %3  \n"  //sub 32 from width for final loop
+      LABELALIGN
+      "1:                                             \n" //label 1
+      "vmovdqu     (%0,%4), %%ymm3                    \n" //src_y
+      "vmovdqu     1(%1,%4), %%ymm4                   \n" //src_uv+1
+      "vmovdqu     (%1), %%ymm5                       \n" //src_uv
+      "vpshufb     %8, %%ymm3, %%ymm13                \n" //y, kSHUF0 for shuf
+      "vpshufb     %9, %%ymm4, %%ymm14                \n" //uv+1, kSHUF1 for shuf
+      "vpshufb     %10, %%ymm5, %%ymm15               \n" //uv, kSHUF2 for shuf
+      "vpshufb     %11, %%ymm3, %%ymm3                \n" //y kSHUF3 for shuf
+      "vpshufb     %12, %%ymm4, %%ymm4                \n" //uv+1 kSHUF4 for shuf
+      "vpblendvb   %%ymm0, %%ymm14, %%ymm13, %%ymm12  \n" //blend 0
+      "vpblendvb   %%ymm0, %%ymm13, %%ymm14, %%ymm14  \n" //blend 0 
+      "vpblendvb   %%ymm2, %%ymm15, %%ymm12, %%ymm12  \n" //blend 2
+      "vpblendvb   %%ymm1, %%ymm15, %%ymm14, %%ymm13  \n" //blend 1
+      "vpshufb     %13, %%ymm5, %%ymm15               \n" //shuffle const
+      "vpor        %%ymm4, %%ymm3, %%ymm5             \n" //get results
+      "vmovdqu     %%ymm12, 0x20(%2)                  \n" //store dst_yuv+20h
+      "vpor        %%ymm15, %%ymm5, %%ymm3            \n" //get results
+      "add         $0x20, %4                          \n" //add to src buffer ptr
+      "vinserti128 $0x1, %%xmm3, %%ymm13, %%ymm4      \n" //insert
+      "vperm2i128  $0x31, %%ymm13, %%ymm3, %%ymm5     \n" //insert
+      "vmovdqu     %%ymm4, (%2)                       \n" //store dst_yuv
+      "vmovdqu     %%ymm5, 0x40(%2)                   \n" //store dst_yuv+40h
+      "add         $0x60,%2                           \n" //add to dst buffer ptr
+//      "cmp         %3, %4                             \n" //(width64 - 32 bytes) and src_offset
+      "sub         $0x20,%3                           \n" // 32 pixels per loop
+      "jg          1b                                 \n"
+      "vzeroupper                                     \n" //sse-avx2 transistions
+      : "+r"(src_y),      //%0
+        "+r"(src_vu),     //%1
+        "+r"(dst_yuv24),  //%2
+        "+r"(width64),    //%3
+        "+r"(src_offset)  //%4
+      : "m"(kBLEND0),     //%5
+        "m"(kBLEND1),     //%6
+        "m"(kBLEND2),     //%7
+        "m"(kSHUF0),      //%8
+        "m"(kSHUF1),      //%9
+        "m"(kSHUF2),      //%10
+        "m"(kSHUF3),      //%11
+        "m"(kSHUF4),      //%12
+        "m"(kSHUF5)       //%13
+      : "memory", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm12", "xmm13", "xmm14", "xmm15");
+}
+#endif  // HAS_NV21TOYUV24ROW_AVX2
 #endif  // defined(__x86_64__) || defined(__i386__)
 #ifdef __cplusplus

--- a/source/row_neon.cc
+++ b/source/row_neon.cc
@@ -2685,6 +2685,77 @@ void ByteToFloatRow_NEON(const uint8_t* src,
      : "cc", "memory", "q1", "q2", "q3");
 }
+// Convert biplanar NV21 to packed YUV24
+void NV21ToYUV24Row_NEON(const uint8_t* src_y,
+                         const uint8_t* src_vu,
+                         uint8_t* dst_yuv24,
+                         int width) {
+  asm volatile (
+  "1:                                          \n"
+    "vld1.8    {q2}, [%0]!                     \n"  // load 16 Y values
+    "vld2.8    {d0, d2}, [%1]!                 \n"  // load 8 VU values
+    "vmov      d1, d0                          \n"
+    "vzip.u8   d0, d1                          \n"  // VV
+    "vmov      d3, d2                          \n"
+    "vzip.u8   d2, d3                          \n"  // UU
+    "subs      %3, %3, #16                     \n"  // 16 pixels per loop
+    "vst3.8    {d0, d2, d4}, [%2]!             \n"  // store 16 YUV pixels
+    "vst3.8    {d1, d3, d5}, [%2]!             \n"
+    "bgt        1b                             \n"
+    : "+r"(src_y),      // %0
+      "+r"(src_vu),     // %1
+      "+r"(dst_yuv24),  // %2
+      "+r"(width)       // %3
+    :
+    : "cc", "memory", "q0", "q1", "q2");
+}
+void AYUVToVURow_NEON(const uint8_t* src_ayuv,
+                      int src_stride_ayuv,
+                      uint8_t* dst_vu,
+                      int width) {
+  asm volatile (
+    "add        %1, %0, %1                     \n"  // src_stride + src_AYUV
+    "1:                                        \n"
+    "vld4.8     {d0, d2, d4, d6}, [%0]!        \n"  // load 8 AYUV pixels.
+    "vld4.8     {d1, d3, d5, d7}, [%0]!        \n"  // load next 8 AYUV pixels.
+    "vpaddl.u8  q0, q0                         \n"  // V 16 bytes -> 8 shorts.
+    "vpaddl.u8  q1, q1                         \n"  // U 16 bytes -> 8 shorts.
+    "vld4.8     {d8, d10, d12, d14}, [%1]!     \n"  // load 8 more AYUV pixels.
+    "vld4.8     {d9, d11, d13, d15}, [%1]!     \n"  // load last 8 AYUV pixels.
+    "vpadal.u8  q0, q4                         \n"  // B 16 bytes -> 8 shorts.
+    "vpadal.u8  q1, q5                         \n"  // G 16 bytes -> 8 shorts.
+    "vqrshrun.s16 d0, q0, #2                   \n"  // 2x2 average
+    "vqrshrun.s16 d1, q1, #2                   \n"
+    "subs       %3, %3, #16                    \n"  // 16 processed per loop.
+    "vst2.8     {d0, d1}, [%2]!                \n"  // store 8 pixels VU.
+    "bgt        1b                             \n"
+  : "+r"(src_ayuv),        // %0
+    "+r"(src_stride_ayuv), // %1
+    "+r"(dst_vu),          // %2
+    "+r"(width)            // %3
+  :
+  : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7"
+  );
+}
+// Copy row of AYUV Y's into Y.
+// Similar to ARGBExtractAlphaRow_NEON
+void AYUVToYRow_NEON(const uint8_t* src_ayuv, uint8_t* dst_y, int width) {
+  asm volatile (
+  "1:                                          \n"
+    "vld4.8     {d0, d2, d4, d6}, [%0]!        \n"  // load 8 AYUV pixels
+    "vld4.8     {d1, d3, d5, d7}, [%0]!        \n"  // load next 8 AYUV pixels
+    "subs       %2, %2, #16                    \n"  // 16 processed per loop
+    "vst1.8     {q2}, [%1]!                    \n"  // store 16 Y's.
+    "bgt       1b                              \n"
+   : "+r"(src_ayuv),   // %0
+      "+r"(dst_y),     // %1
+      "+r"(width)      // %2
+    :
+    : "cc", "memory", "q0", "q1", "q2", "q3");
+}
 #endif  // !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__)..
 #ifdef __cplusplus

--- a/source/row_neon64.cc
+++ b/source/row_neon64.cc
@@ -2876,6 +2876,113 @@ void GaussRow_NEON(const uint32_t* src, uint16_t* dst, int width) {
      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
 }
+// Convert biplanar NV21 to packed YUV24
+void NV21ToYUV24Row_NEON(const uint8_t* src_y,
+                         const uint8_t* src_vu,
+                          uint8_t* dst_yuv24,
+                          int width) {
+  asm volatile (
+  "1:                                          \n"
+    "ld1        {v2.16b}, [%0], #16            \n"  // load 16 Y values
+    "ld2        {v0.8b, v1.8b}, [%1], #16      \n"  // load 8 VU values
+    "zip1       v0.16b, v0.16b, v0.16b         \n"  // replicate V values
+    "zip1       v1.16b, v1.16b, v1.16b         \n"  // replicate U values
+    "subs       %w3, %w3, #16                  \n"  // 16 pixels per loop
+    "st3        {v0.16b,v1.16b,v2.16b}, [%2], #48 \n"  // store 16 YUV pixels
+    "b.gt       1b                             \n"
+    : "+r"(src_y),      // %0
+      "+r"(src_vu),     // %1
+      "+r"(dst_yuv24),  // %2
+      "+r"(width)       // %3
+    :
+    : "cc", "memory", "v0", "v1", "v2");
+}
+void AYUVToVURow_NEON(const uint8_t* src_ayuv,
+                      int src_stride_ayuv,
+                      uint8_t* dst_vu,
+                      int width) {
+  const uint8_t* src_ayuv_1 = src_ayuv + src_stride_ayuv;
+  asm volatile(
+  "1:                                          \n"
+    "ld4        {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n"  // load 16 pixels.
+    "uaddlp     v0.8h, v0.16b                  \n"  // V 16 bytes -> 8 shorts.
+    "uaddlp     v1.8h, v1.16b                  \n"  // U 16 bytes -> 8 shorts.
+    "ld4        {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n"  // load next 16
+    "uadalp     v0.8h, v4.16b                  \n"  // V 16 bytes -> 8 shorts.
+    "uadalp     v1.8h, v5.16b                  \n"  // U 16 bytes -> 8 shorts.
+    "uqrshrn    v0.8b, v0.8h, #2               \n"  // 2x2 average
+    "uqrshrn    v1.8b, v1.8h, #2               \n"
+    "subs       %w3, %w3, #16                  \n"  // 16 processed per loop.
+    "st2        {v0.8b,v1.8b}, [%2], #16       \n"  // store 8 pixels VU.
+    "b.gt       1b                             \n"
+  : "+r"(src_ayuv),  // %0
+    "+r"(src_ayuv_1),  // %1
+    "+r"(dst_vu),     // %2
+    "+r"(width)        // %3
+  :
+  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
+  );
+}
+// Copy row of AYUV Y's into Y
+void AYUVToYRow_NEON(const uint8_t* src_ayuv, uint8_t* dst_y, int width) {
+  asm volatile (
+  "1:                                          \n"
+    "ld4        {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n"  // load 16 pixels
+    "subs       %w2, %w2, #16                  \n"  // 16 pixels per loop
+    "st1        {v2.16b}, [%1], #16            \n"  // store 16 Y pixels
+    "b.gt       1b                             \n"
+    : "+r"(src_ayuv),   // %0
+      "+r"(dst_y),      // %1
+      "+r"(width)       // %2
+    :
+    : "cc", "memory", "v0", "v1", "v2", "v3");
+}
+void FloatDivToByteRow_NEON(const float* src_weights,
+                            const float* src_values,
+                            uint8_t* dst_out,
+                            uint8_t* dst_mask,
+                            int width) {
+  asm volatile(
+      "movi       v0.4s, #0                      \n"
+      "1:                                        \n"
+      "ld1        {v1.4s,v2.4s}, [%0], #32       \n"  // load 8 float weights
+      "ld1        {v3.4s,v4.4s}, [%1], #32       \n"  // load 8 float values
+      "subs       %w4, %w4, #8                   \n"  // 8 pixels per loop
+      "fdiv       v1.4s, v3.4s, v1.4s            \n"  // values / weights
+      "fdiv       v2.4s, v4.4s, v2.4s            \n"
+      "fcvtzu     v1.4s, v1.4s                   \n"  // float to int
+      "fcvtzu     v2.4s, v2.4s                   \n"  // float to int
+      "uqxtn      v1.4h, v1.4s                   \n"  // 8 shorts
+      "uqxtn2     v1.8h, v2.4s                   \n"
+      "uqxtn      v1.8b, v1.8h                   \n"  // 8 bytes
+      "st1        {v1.8b}, [%2], #8              \n"  // store 8 byte out
+      "fcmgt      v5.4s, v1.4s, v0.4s            \n"  // cmp weight to zero
+      "fcmgt      v6.4s, v2.4s, v0.4s            \n"
+      "uqxtn      v5.4h, v5.4s                   \n"  // 8 shorts
+      "uqxtn2     v5.8h, v6.4s                   \n"
+      "uqxtn      v5.8b, v1.8h                   \n"  // 8 bytes
+      "st1        {v5.8b}, [%3], #8              \n"  // store 8 byte mask
+      "b.gt       1b                             \n"
+      : "+r"(src_weights), // %0
+        "+r"(src_values),  // %1
+        "+r"(dst_out),     // %2
+        "+r"(dst_mask),    // %3
+        "+r"(width)        // %4
+      :
+      : "cc", "memory", "v1", "v2", "v3", "v4", "v5", "v6");
+}
 #endif  // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
 #ifdef __cplusplus

--- a/unit_test/convert_test.cc
+++ b/unit_test/convert_test.cc
@@ -680,7 +680,7 @@ TESTPLANARTOB(H420, 2, 2, AR30, 4, 4, 1)
 TESTQPLANARTOB(I420Alpha, 2, 2, ARGB, 4, 4, 1, 2)
 TESTQPLANARTOB(I420Alpha, 2, 2, ABGR, 4, 4, 1, 2)
-#define TESTBIPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B,       \
+#define TESTBIPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, FMT_C, BPP_B,\
                         W1280, DIFF, N, NEG, OFF)                             \
  TEST_F(LibYUVConvertTest, FMT_PLANAR##To##FMT_B##N) {                        \
    const int kWidth = ((W1280) > 0) ? (W1280) : 1;                            \
@@ -716,9 +716,9 @@ TESTQPLANARTOB(I420Alpha, 2, 2, ABGR, 4, 4, 1, 2)
    align_buffer_page_end(dst_argb32_opt, kWidth * 4 * kHeight);               \
    memset(dst_argb32_c, 2, kWidth * 4 * kHeight);                             \
    memset(dst_argb32_opt, 102, kWidth * 4 * kHeight);                         \
-    FMT_B##ToARGB(dst_argb_c, kStrideB, dst_argb32_c, kWidth * 4, kWidth,      \
+    FMT_C##ToARGB(dst_argb_c, kStrideB, dst_argb32_c, kWidth * 4, kWidth,      \
                  kHeight);                                                    \
-    FMT_B##ToARGB(dst_argb_opt, kStrideB, dst_argb32_opt, kWidth * 4, kWidth,  \
+    FMT_C##ToARGB(dst_argb_opt, kStrideB, dst_argb32_opt, kWidth * 4, kWidth,  \
                  kHeight);                                                    \
    int max_diff = 0;                                                          \
    for (int i = 0; i < kHeight; ++i) {                                        \
@@ -740,25 +740,27 @@ TESTQPLANARTOB(I420Alpha, 2, 2, ABGR, 4, 4, 1, 2)
    free_aligned_buffer_page_end(dst_argb32_opt);                              \
  }
-#define TESTBIPLANARTOB(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, DIFF) \
+#define TESTBIPLANARTOB(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, FMT_C,       \
-  TESTBIPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B,            \
+                        BPP_B, DIFF)                                          \
+  TESTBIPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, FMT_C, BPP_B,     \
                   benchmark_width_ - 4, DIFF, _Any, +, 0)                    \
-  TESTBIPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B,            \
+  TESTBIPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, FMT_C, BPP_B,     \
                   benchmark_width_, DIFF, _Unaligned, +, 1)                  \
-  TESTBIPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B,            \
+  TESTBIPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, FMT_C, BPP_B,     \
                   benchmark_width_, DIFF, _Invert, -, 0)                     \
-  TESTBIPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B,            \
+  TESTBIPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, FMT_C, BPP_B,     \
                   benchmark_width_, DIFF, _Opt, +, 0)
-TESTBIPLANARTOB(NV12, 2, 2, ARGB, 4, 2)
+TESTBIPLANARTOB(NV12, 2, 2, ARGB, ARGB, 4, 2)
-TESTBIPLANARTOB(NV21, 2, 2, ARGB, 4, 2)
+TESTBIPLANARTOB(NV21, 2, 2, ARGB, ARGB, 4, 2)
-TESTBIPLANARTOB(NV12, 2, 2, ABGR, 4, 2)
+TESTBIPLANARTOB(NV12, 2, 2, ABGR, ABGR, 4, 2)
-TESTBIPLANARTOB(NV21, 2, 2, ABGR, 4, 2)
+TESTBIPLANARTOB(NV21, 2, 2, ABGR, ABGR, 4, 2)
-TESTBIPLANARTOB(NV12, 2, 2, RGB24, 3, 2)
+TESTBIPLANARTOB(NV12, 2, 2, RGB24, RGB24, 3, 2)
-TESTBIPLANARTOB(NV21, 2, 2, RGB24, 3, 2)
+TESTBIPLANARTOB(NV21, 2, 2, RGB24, RGB24, 3, 2)
-TESTBIPLANARTOB(NV12, 2, 2, RAW, 3, 2)
+TESTBIPLANARTOB(NV12, 2, 2, RAW, RAW, 3, 2)
-TESTBIPLANARTOB(NV21, 2, 2, RAW, 3, 2)
+TESTBIPLANARTOB(NV21, 2, 2, RAW, RAW, 3, 2)
-TESTBIPLANARTOB(NV12, 2, 2, RGB565, 2, 9)
+TESTBIPLANARTOB(NV12, 2, 2, RGB565, RGB565, 2, 9)
+TESTBIPLANARTOB(NV21, 2, 2, YUV24, RAW, 3, 2)
 #ifdef DO_THREE_PLANES
 // Do 3 allocations for yuv.  conventional but slower.
@@ -978,6 +980,7 @@ TESTATOBIPLANAR(ARGB, 1, 4, NV12, 2, 2)
 TESTATOBIPLANAR(ARGB, 1, 4, NV21, 2, 2)
 TESTATOBIPLANAR(YUY2, 2, 4, NV12, 2, 2)
 TESTATOBIPLANAR(UYVY, 2, 4, NV12, 2, 2)
+TESTATOBIPLANAR(AYUV, 1, 4, NV21, 2, 2)
 #define TESTATOBI(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, FMT_B, BPP_B, STRIDE_B,  \
                  HEIGHT_B, W1280, DIFF, N, NEG, OFF)                        \

--- a/unit_test/planar_test.cc
+++ b/unit_test/planar_test.cc
@@ -3267,4 +3267,85 @@ TEST_F(LibYUVPlanarTest, TestGaussCol_Opt) {
  EXPECT_EQ(dst_pixels_c[639], static_cast<uint32_t>(30704));
 }
+float TestFloatDivToByte(int benchmark_width,
+                       int benchmark_height,
+                       int benchmark_iterations,
+                       float scale,
+                       bool opt) {
+  int i, j;
+  // NEON does multiple of 8, so round count up
+  const int kPixels = (benchmark_width * benchmark_height + 7) & ~7;
+  align_buffer_page_end(src_weights, kPixels * 4);
+  align_buffer_page_end(src_values, kPixels * 4);
+  align_buffer_page_end(dst_out_c, kPixels);
+  align_buffer_page_end(dst_out_opt, kPixels);
+  align_buffer_page_end(dst_mask_c, kPixels);
+  align_buffer_page_end(dst_mask_opt, kPixels);
+  // Randomize works but may contain some denormals affecting performance.
+  // MemRandomize(orig_y, kPixels * 4);
+  // large values are problematic.  audio is really -1 to 1.
+  for (i = 0; i < kPixels; ++i) {
+    (reinterpret_cast<float*>(src_weights))[i] = scale;
+    (reinterpret_cast<float*>(src_values))[i] = sinf(static_cast<float>(i) * 0.1f);
+  }
+  memset(dst_out_c, 0, kPixels);
+  memset(dst_out_opt, 1, kPixels);
+  memset(dst_mask_c, 2, kPixels);
+  memset(dst_mask_opt, 3, kPixels);
+  FloatDivToByteRow_C(reinterpret_cast<float*>(src_weights),
+                      reinterpret_cast<float*>(src_values),
+                      dst_out_c, dst_mask_c, kPixels);
+  for (j = 0; j < benchmark_iterations; j++) {
+    if (opt) {
+#ifdef HAS_FLOATDIVTOBYTEROW_NEON
+      FloatDivToByteRow_NEON(reinterpret_cast<float*>(src_weights),
+                             reinterpret_cast<float*>(src_values),
+                              dst_out_opt, dst_mask_opt, kPixels);
+#else
+      FloatDivToByteRow_C(reinterpret_cast<float*>(src_weights),
+                          reinterpret_cast<float*>(src_values),
+                           dst_out_opt, dst_mask_opt, kPixels);
+#endif
+    } else {
+      FloatDivToByteRow_C(reinterpret_cast<float*>(src_weights),
+                          reinterpret_cast<float*>(src_values),
+                           dst_out_opt, dst_mask_opt, kPixels);
+    }
+  }
+  uint8_t max_diff = 0;
+  for (i = 0; i < kPixels; ++i) {
+    uint8_t abs_diff = abs(dst_out_c[i] - dst_out_opt[i]) +
+                       abs(dst_mask_c[i] - dst_mask_opt[i]);
+    if (abs_diff > max_diff) {
+      max_diff = abs_diff;
+    }
+  }
+  free_aligned_buffer_page_end(src_weights);
+  free_aligned_buffer_page_end(src_values);
+  free_aligned_buffer_page_end(dst_out_c);
+  free_aligned_buffer_page_end(dst_out_opt);
+  free_aligned_buffer_page_end(dst_mask_c);
+  free_aligned_buffer_page_end(dst_mask_opt);
+  return max_diff;
+}
+TEST_F(LibYUVPlanarTest, TestFloatDivToByte_C) {
+  float diff = TestFloatDivToByte(benchmark_width_, benchmark_height_,
+                                  benchmark_iterations_, 1.2f, false);
+  EXPECT_EQ(0, diff);
+}
+TEST_F(LibYUVPlanarTest, TestFloatDivToByte_Opt) {
+  float diff = TestFloatDivToByte(benchmark_width_, benchmark_height_,
+                                  benchmark_iterations_, 1.2f, true);
+  EXPECT_EQ(0, diff);
+}
 }  // namespace libyuv