Commit dddf94c3 authored by fbarchard@google.com

YUY2 and UYVY conversions to I420 ported to NEON

BUG=64
TEST=untested
Review URL: https://webrtc-codereview.appspot.com/823007

git-svn-id: http://libyuv.googlecode.com/svn/trunk@371 16f28f9a-4ce2-e073-06de-1de4eb20be90
parent f0ada0e9
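
YUY2 and UYVY are packed 4:2:2 formats: YUY2 stores two pixels as the four bytes Y0 U Y1 V, and UYVY as U Y0 V Y1. I420 is planar, with a full-resolution Y plane and U/V planes subsampled 2x2, so the conversion copies luma and averages chroma vertically across each row pair. A minimal scalar sketch of the per-row-pair work the new NEON kernels perform (names are illustrative, not libyuv API):

```cpp
#include <stdint.h>

// Scalar sketch (not the libyuv implementation) of converting one pair of
// YUY2 rows to I420. YUY2 packs two pixels into four bytes as Y0 U Y1 V;
// UYVY differs only in byte order (U Y0 V Y1). Chroma is averaged across
// the two rows to produce the 2x2-subsampled U and V planes.
void YUY2RowPairToI420(const uint8_t* row0, const uint8_t* row1,
                       uint8_t* dst_y0, uint8_t* dst_y1,
                       uint8_t* dst_u, uint8_t* dst_v, int width) {
  for (int x = 0; x < width; x += 2) {
    const uint8_t* p0 = row0 + x * 2;  // 2 bytes per pixel
    const uint8_t* p1 = row1 + x * 2;
    dst_y0[x] = p0[0];
    dst_y0[x + 1] = p0[2];
    dst_y1[x] = p1[0];
    dst_y1[x + 1] = p1[2];
    // Rounded average of the two rows' chroma, matching vrhadd.u8 below.
    dst_u[x / 2] = (uint8_t)((p0[1] + p1[1] + 1) >> 1);
    dst_v[x / 2] = (uint8_t)((p0[3] + p1[3] + 1) >> 1);
  }
}
```
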
......@@ -16,12 +16,11 @@ common_SRC_FILES := \
source/row_posix.cc \
source/scale.cc \
source/scale_argb.cc \
source/video_common.cc
# For Neon support, add .neon to all filenames and the following
# source/rotate_neon.cc
# source/row_neon.cc
source/video_common.cc \
source/rotate_neon.cc \
source/row_neon.cc
common_CFLAGS := -Wall -fexceptions
common_CFLAGS := -Wall -fexceptions -DHAVE_ARMEABI_V7A=1 -mfloat-abi=softfp -mfpu=neon
common_C_INCLUDES = $(LOCAL_PATH)/include
......
Name: libyuv
URL: http://code.google.com/p/libyuv/
Version: 370
Version: 371
License: BSD
License File: LICENSE
......
......@@ -67,6 +67,7 @@ extern "C" {
#define HAS_RGB24TOARGBROW_SSSE3
#define HAS_RGB565TOARGBROW_SSE2
#define HAS_SPLITUV_SSE2
#define HAS_UYVYTOUV422ROW_SSE2
#define HAS_UYVYTOUVROW_SSE2
#define HAS_UYVYTOYROW_SSE2
#define HAS_YTOARGBROW_SSE2
......@@ -119,11 +120,22 @@ extern "C" {
#define HAS_I422TOBGRAROW_NEON
#define HAS_I422TOABGRROW_NEON
#define HAS_I422TORGBAROW_NEON
// TODO(fbarchard): Hook these up to calling functions.
#define HAS_ARGBTORGBAROW_NEON
#define HAS_ARGBTORGB24ROW_NEON
#define HAS_ARGBTORAWROW_NEON
#define HAS_ABGRTOARGBROW_NEON
#define HAS_BGRATOARGBROW_NEON
#define HAS_RGBATOARGBROW_NEON
#define HAS_RAWTOARGBROW_NEON
#define HAS_RGB24TOARGBROW_NEON
#define HAS_YUY2TOUV422ROW_NEON
#define HAS_YUY2TOUVROW_NEON
#define HAS_YUY2TOYROW_NEON
#define HAS_UYVYTOUV422ROW_NEON
#define HAS_UYVYTOUVROW_NEON
#define HAS_UYVYTOYROW_NEON
#endif
#if defined(_MSC_VER) && !defined(__CLR_VER)
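
The NEON HAS_ defines above only take effect when the build targets NEON; row_neon.cc closes its implementations with `#endif // __ARM_NEON__` further down, and GCC defines `__ARM_NEON__` when `-mfpu=neon` (added to Android.mk above) is in effect. A sketch of that style of gate (the exact row.h condition is not shown in this hunk):

```cpp
// Sketch of the compile-time gate; the precise row.h condition may differ.
// GCC sets __ARM_NEON__ when compiling with -mfpu=neon.
#if defined(__ARM_NEON__)
#define HAS_YUY2TOYROW_NEON
#define HAS_UYVYTOYROW_NEON
#endif
```
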
......@@ -542,6 +554,11 @@ void YUY2ToUVRow_Unaligned_SSE2(const uint8* src_yuy2, int stride_yuy2,
uint8* dst_u, uint8* dst_v, int pix);
void YUY2ToUV422Row_Unaligned_SSE2(const uint8* src_yuy2,
uint8* dst_u, uint8* dst_v, int pix);
void YUY2ToYRow_NEON(const uint8* src_yuy2, uint8* dst_y, int pix);
void YUY2ToUVRow_NEON(const uint8* src_yuy2, int stride_yuy2,
uint8* dst_u, uint8* dst_v, int pix);
void YUY2ToUV422Row_NEON(const uint8* src_yuy2,
uint8* dst_u, uint8* dst_v, int pix);
void YUY2ToYRow_C(const uint8* src_yuy2, uint8* dst_y, int pix);
void YUY2ToUVRow_C(const uint8* src_yuy2, int stride_yuy2,
uint8* dst_u, uint8* dst_v, int pix);
......@@ -552,6 +569,11 @@ void YUY2ToUVRow_Any_SSE2(const uint8* src_yuy2, int stride_yuy2,
uint8* dst_u, uint8* dst_v, int pix);
void YUY2ToUV422Row_Any_SSE2(const uint8* src_yuy2,
uint8* dst_u, uint8* dst_v, int pix);
void YUY2ToYRow_Any_NEON(const uint8* src_yuy2, uint8* dst_y, int pix);
void YUY2ToUVRow_Any_NEON(const uint8* src_yuy2, int stride_yuy2,
uint8* dst_u, uint8* dst_v, int pix);
void YUY2ToUV422Row_Any_NEON(const uint8* src_yuy2,
uint8* dst_u, uint8* dst_v, int pix);
void UYVYToYRow_SSE2(const uint8* src_uyvy, uint8* dst_y, int pix);
void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy,
......@@ -564,6 +586,12 @@ void UYVYToUVRow_Unaligned_SSE2(const uint8* src_uyvy, int stride_uyvy,
uint8* dst_u, uint8* dst_v, int pix);
void UYVYToUV422Row_Unaligned_SSE2(const uint8* src_uyvy,
uint8* dst_u, uint8* dst_v, int pix);
void UYVYToYRow_NEON(const uint8* src_uyvy, uint8* dst_y, int pix);
void UYVYToUVRow_NEON(const uint8* src_uyvy, int stride_uyvy,
uint8* dst_u, uint8* dst_v, int pix);
void UYVYToUV422Row_NEON(const uint8* src_uyvy,
uint8* dst_u, uint8* dst_v, int pix);
void UYVYToYRow_C(const uint8* src_uyvy, uint8* dst_y, int pix);
void UYVYToUVRow_C(const uint8* src_uyvy, int stride_uyvy,
uint8* dst_u, uint8* dst_v, int pix);
......@@ -574,7 +602,11 @@ void UYVYToUVRow_Any_SSE2(const uint8* src_uyvy, int stride_uyvy,
uint8* dst_u, uint8* dst_v, int pix);
void UYVYToUV422Row_Any_SSE2(const uint8* src_uyvy,
uint8* dst_u, uint8* dst_v, int pix);
void UYVYToYRow_Any_NEON(const uint8* src_uyvy, uint8* dst_y, int pix);
void UYVYToUVRow_Any_NEON(const uint8* src_uyvy, int stride_uyvy,
uint8* dst_u, uint8* dst_v, int pix);
void UYVYToUV422Row_Any_NEON(const uint8* src_uyvy,
uint8* dst_u, uint8* dst_v, int pix);
void ARGBAttenuateRow_C(const uint8* src_argb, uint8* dst_argb, int width);
void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width);
......
......@@ -11,6 +11,6 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT
#define INCLUDE_LIBYUV_VERSION_H_
#define LIBYUV_VERSION 370
#define LIBYUV_VERSION 371
#endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT
......@@ -744,6 +744,21 @@ int YUY2ToI420(const uint8* src_yuy2, int src_stride_yuy2,
}
}
}
#elif defined(HAS_YUY2TOYROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
if (width > 8) {
YUY2ToYRow = YUY2ToYRow_Any_NEON;
if (width > 16) {
YUY2ToUVRow = YUY2ToUVRow_Any_NEON;
}
}
if (IS_ALIGNED(width, 8)) {
YUY2ToYRow = YUY2ToYRow_NEON;
if (IS_ALIGNED(width, 16)) {
YUY2ToUVRow = YUY2ToUVRow_NEON;
}
}
}
#endif
for (int y = 0; y < height - 1; y += 2) {
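
This hunk follows libyuv's usual row-dispatch pattern, repeated below for UYVYToI420, V210ToI420 and YUY2ToARGB: start from the C row functions, switch to the `_Any_` NEON variants once the width exceeds one SIMD group (8 pixels for Y, 16 for UV), and switch to the exact NEON kernels when the width is a whole number of groups. A standalone sketch of that selection logic (function names are placeholders, not libyuv API):

```cpp
#include <stdint.h>

typedef void (*YRowFn)(const uint8_t* src, uint8_t* dst, int pix);

// Placeholder declarations standing in for the real row functions.
void YRow_C(const uint8_t* src, uint8_t* dst, int pix);
void YRow_NEON(const uint8_t* src, uint8_t* dst, int pix);      // needs pix % 8 == 0
void YRow_Any_NEON(const uint8_t* src, uint8_t* dst, int pix);  // NEON bulk + C tail

YRowFn PickYRow(bool has_neon, int width) {
  YRowFn fn = YRow_C;         // safe default for any width
  if (has_neon && width > 8) {
    fn = YRow_Any_NEON;       // SIMD bulk, scalar remainder
    if (width % 8 == 0) {
      fn = YRow_NEON;         // every group is a full 8 pixels
    }
  }
  return fn;
}
```
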
......@@ -768,11 +783,6 @@ int UYVYToI420(const uint8* src_uyvy, int src_stride_uyvy,
uint8* dst_u, int dst_stride_u,
uint8* dst_v, int dst_stride_v,
int width, int height) {
if (!src_uyvy ||
!dst_y || !dst_u || !dst_v ||
width <= 0 || height == 0) {
return -1;
}
// Negative height means invert the image.
if (height < 0) {
height = -height;
......@@ -802,7 +812,23 @@ int UYVYToI420(const uint8* src_uyvy, int src_stride_uyvy,
}
}
}
#elif defined(HAS_UYVYTOYROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
if (width > 8) {
UYVYToYRow = UYVYToYRow_Any_NEON;
if (width > 16) {
UYVYToUVRow = UYVYToUVRow_Any_NEON;
}
}
if (IS_ALIGNED(width, 8)) {
UYVYToYRow = UYVYToYRow_NEON;
if (IS_ALIGNED(width, 16)) {
UYVYToUVRow = UYVYToUVRow_NEON;
}
}
}
#endif
for (int y = 0; y < height - 1; y += 2) {
UYVYToUVRow(src_uyvy, src_stride_uyvy, dst_u, dst_v, width);
UYVYToYRow(src_uyvy, dst_y, width);
......@@ -905,7 +931,55 @@ int V210ToI420(const uint8* src_v210, int src_stride_v210,
UYVYToYRow = UYVYToYRow_SSE2;
}
}
#elif defined(HAS_UYVYTOYROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
if (width > 8) {
UYVYToYRow = UYVYToYRow_Any_NEON;
if (width > 16) {
UYVYToUVRow = UYVYToUVRow_Any_NEON;
}
}
if (IS_ALIGNED(width, 8)) {
UYVYToYRow = UYVYToYRow_NEON;
if (IS_ALIGNED(width, 16)) {
UYVYToUVRow = UYVYToUVRow_NEON;
}
}
}
#endif
#if defined(HAS_UYVYTOYROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) {
if (width > 16) {
UYVYToUVRow = UYVYToUVRow_Any_SSE2;
UYVYToYRow = UYVYToYRow_Any_SSE2;
}
if (IS_ALIGNED(width, 16)) {
UYVYToYRow = UYVYToYRow_Unaligned_SSE2;
UYVYToUVRow = UYVYToUVRow_SSE2;
if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
UYVYToYRow = UYVYToYRow_SSE2;
}
}
}
#endif
#if defined(HAS_UYVYTOYROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
if (width > 8) {
UYVYToYRow = UYVYToYRow_Any_NEON;
if (width > 16) {
UYVYToUVRow = UYVYToUVRow_Any_NEON;
}
}
if (IS_ALIGNED(width, 8)) {
UYVYToYRow = UYVYToYRow_NEON;
if (IS_ALIGNED(width, 16)) {
UYVYToUVRow = UYVYToUVRow_NEON;
}
}
}
#endif
for (int y = 0; y < height - 1; y += 2) {
V210ToUYVYRow(src_v210, row, width);
V210ToUYVYRow(src_v210 + src_stride_v210, row + kMaxStride, width);
......
......@@ -671,7 +671,23 @@ int YUY2ToARGB(const uint8* src_yuy2, int src_stride_yuy2,
}
}
}
#elif defined(HAS_YUY2TOYROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
if (width > 8) {
YUY2ToYRow = YUY2ToYRow_Any_NEON;
if (width > 16) {
YUY2ToUV422Row = YUY2ToUV422Row_Any_NEON;
}
}
if (IS_ALIGNED(width, 8)) {
YUY2ToYRow = YUY2ToYRow_NEON;
if (IS_ALIGNED(width, 16)) {
YUY2ToUV422Row = YUY2ToUV422Row_NEON;
}
}
}
#endif
void (*I422ToARGBRow)(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
......
......@@ -933,7 +933,7 @@ int I420ToRGB24(const uint8* src_y, int src_stride_y,
if (width * 3 <= kMaxStride) {
ARGBToRGB24Row = ARGBToRGB24Row_Any_NEON;
}
if (IS_ALIGNED(width, 16)) {
if (IS_ALIGNED(width, 8)) {
ARGBToRGB24Row = ARGBToRGB24Row_NEON;
}
}
......@@ -1004,7 +1004,7 @@ int I420ToRAW(const uint8* src_y, int src_stride_y,
if (width * 3 <= kMaxStride) {
ARGBToRAWRow = ARGBToRAWRow_Any_NEON;
}
if (IS_ALIGNED(width, 16)) {
if (IS_ALIGNED(width, 8)) {
ARGBToRAWRow = ARGBToRAWRow_NEON;
}
}
......
......@@ -321,8 +321,7 @@ int ARGBToRGBA(const uint8* src_argb, int src_stride_argb,
}
#endif
#if defined(HAS_ARGBTORGBAROW_NEON)
if (TestCpuFlag(kCpuHasNEON) &&
IS_ALIGNED(width, 16)) {
if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) {
ARGBToRGBARow = ARGBToRGBARow_NEON;
}
#endif
......@@ -366,7 +365,7 @@ int ARGBToRGB24(const uint8* src_argb, int src_stride_argb,
if (width * 3 <= kMaxStride) {
ARGBToRGB24Row = ARGBToRGB24Row_Any_NEON;
}
if (IS_ALIGNED(width, 16)) {
if (IS_ALIGNED(width, 8)) {
ARGBToRGB24Row = ARGBToRGB24Row_NEON;
}
}
......@@ -411,7 +410,7 @@ int ARGBToRAW(const uint8* src_argb, int src_stride_argb,
if (width * 3 <= kMaxStride) {
ARGBToRAWRow = ARGBToRAWRow_Any_NEON;
}
if (IS_ALIGNED(width, 16)) {
if (IS_ALIGNED(width, 8)) {
ARGBToRAWRow = ARGBToRAWRow_NEON;
}
}
......
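
The IS_ALIGNED(width, 16) checks above drop to IS_ALIGNED(width, 8) because the rewritten NEON kernels below now process 8 pixels per iteration (`subs ..., #8`) rather than 16. IS_ALIGNED is a power-of-two divisibility test; in the spirit of libyuv's macro (spelling here is illustrative):

```cpp
#include <stdint.h>

// Illustrative power-of-two alignment test in the spirit of IS_ALIGNED:
// true only when v is a multiple of a (a must be a power of two).
#define IS_ALIGNED_SKETCH(v, a) ((((uintptr_t)(v)) & ((a) - 1)) == 0)
// IS_ALIGNED_SKETCH(width, 8) holds only when width % 8 == 0, so the exact
// NEON kernel never sees a partial final group of pixels.
```
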
......@@ -957,8 +957,6 @@ YANY(I422ToRGBARow_Any_SSSE3, I422ToRGBARow_Unaligned_SSSE3, I422ToRGBARow_C, 1)
YANY(I422ToARGBRow_Any_NEON, I422ToARGBRow_NEON, I422ToARGBRow_C, 1)
YANY(I422ToBGRARow_Any_NEON, I422ToBGRARow_NEON, I422ToBGRARow_C, 1)
YANY(I422ToABGRRow_Any_NEON, I422ToABGRRow_NEON, I422ToABGRRow_C, 1)
#endif
#ifdef HAS_I422TORGBAROW_NEON
YANY(I422ToRGBARow_Any_NEON, I422ToRGBARow_NEON, I422ToRGBARow_C, 1)
#endif
#undef YANY
......@@ -1000,6 +998,10 @@ YANY(RGBAToYRow_Any_SSSE3, RGBAToYRow_Unaligned_SSSE3, 4)
#endif
YANY(YUY2ToYRow_Any_SSE2, YUY2ToYRow_Unaligned_SSE2, 2)
YANY(UYVYToYRow_Any_SSE2, UYVYToYRow_Unaligned_SSE2, 2)
#ifdef HAS_YUY2TOYROW_NEON
YANY(YUY2ToYRow_Any_NEON, YUY2ToYRow_NEON, 2)
YANY(UYVYToYRow_Any_NEON, UYVYToYRow_NEON, 2)
#endif
#undef YANY
#define UVANY(NAMEANY, ANYTOUV_SSE, ANYTOUV_C, BPP) \
......@@ -1021,6 +1023,10 @@ UVANY(RGBAToUVRow_Any_SSSE3, RGBAToUVRow_Unaligned_SSSE3, RGBAToUVRow_C, 4)
#endif
UVANY(YUY2ToUVRow_Any_SSE2, YUY2ToUVRow_Unaligned_SSE2, YUY2ToUVRow_C, 2)
UVANY(UYVYToUVRow_Any_SSE2, UYVYToUVRow_Unaligned_SSE2, UYVYToUVRow_C, 2)
#ifdef HAS_YUY2TOUVROW_NEON
UVANY(YUY2ToUVRow_Any_NEON, YUY2ToUVRow_NEON, YUY2ToUVRow_C, 2)
UVANY(UYVYToUVRow_Any_NEON, UYVYToUVRow_NEON, UYVYToUVRow_C, 2)
#endif
#undef UVANY
#define UV422ANY(NAMEANY, ANYTOUV_SSE, ANYTOUV_C, BPP) \
......@@ -1038,6 +1044,12 @@ UV422ANY(YUY2ToUV422Row_Any_SSE2, YUY2ToUV422Row_Unaligned_SSE2, \
YUY2ToUV422Row_C, 2)
UV422ANY(UYVYToUV422Row_Any_SSE2, UYVYToUV422Row_Unaligned_SSE2, \
UYVYToUV422Row_C, 2)
#ifdef HAS_YUY2TOUV422ROW_NEON
UV422ANY(YUY2ToUV422Row_Any_NEON, YUY2ToUV422Row_NEON, \
YUY2ToUV422Row_C, 2)
UV422ANY(UYVYToUV422Row_Any_NEON, UYVYToUV422Row_NEON, \
UYVYToUV422Row_C, 2)
#endif
#undef UV422ANY
#endif
......
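
The YANY/UVANY/UV422ANY macros above stamp out the `_Any_` wrappers used by the dispatch code: run the SIMD kernel over as much of the row as it can handle, then finish the ragged tail with the C version. Hand-expanded, such a wrapper behaves roughly like this (the real macro bodies differ in detail):

```cpp
#include <stdint.h>

void YUY2ToYRow_NEON(const uint8_t* src_yuy2, uint8_t* dst_y, int pix);
void YUY2ToYRow_C(const uint8_t* src_yuy2, uint8_t* dst_y, int pix);

// Rough hand-expansion of YANY(YUY2ToYRow_Any_NEON, YUY2ToYRow_NEON, 2):
// NEON on the multiple-of-8 bulk, C on the remainder. BPP is 2 for YUY2.
void YUY2ToYRow_Any_NEON_sketch(const uint8_t* src_yuy2, uint8_t* dst_y,
                                int pix) {
  int bulk = pix & ~7;  // largest multiple of the NEON group size (8)
  YUY2ToYRow_NEON(src_yuy2, dst_y, bulk);
  YUY2ToYRow_C(src_yuy2 + bulk * 2, dst_y + bulk, pix - bulk);
}
```
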
......@@ -72,7 +72,7 @@ void I422ToARGBRow_NEON(const uint8* y_buf,
YUV422TORGB
"vmov.u8 d21, d16 \n"
"vmov.u8 d23, #255 \n"
"vst4.u8 {d20, d21, d22, d23}, [%3]! \n"
"vst4.8 {d20, d21, d22, d23}, [%3]! \n"
"subs %4, %4, #8 \n"
"bgt 1b \n"
: "+r"(y_buf), // %0
......@@ -105,7 +105,7 @@ void I422ToBGRARow_NEON(const uint8* y_buf,
"vswp.u8 d20, d22 \n"
"vmov.u8 d21, d16 \n"
"vmov.u8 d19, #255 \n"
"vst4.u8 {d19, d20, d21, d22}, [%3]! \n"
"vst4.8 {d19, d20, d21, d22}, [%3]! \n"
"subs %4, %4, #8 \n"
"bgt 1b \n"
: "+r"(y_buf), // %0
......@@ -138,7 +138,7 @@ void I422ToABGRRow_NEON(const uint8* y_buf,
"vswp.u8 d20, d22 \n"
"vmov.u8 d21, d16 \n"
"vmov.u8 d23, #255 \n"
"vst4.u8 {d20, d21, d22, d23}, [%3]! \n"
"vst4.8 {d20, d21, d22, d23}, [%3]! \n"
"subs %4, %4, #8 \n"
"bgt 1b \n"
: "+r"(y_buf), // %0
......@@ -170,7 +170,7 @@ void I422ToRGBARow_NEON(const uint8* y_buf,
YUV422TORGB
"vmov.u8 d21, d16 \n"
"vmov.u8 d19, #255 \n"
"vst4.u8 {d19, d20, d21, d22}, [%3]! \n"
"vst4.8 {d19, d20, d21, d22}, [%3]! \n"
"subs %4, %4, #8 \n"
"bgt 1b \n"
: "+r"(y_buf), // %0
......@@ -192,7 +192,7 @@ void I422ToRGBARow_NEON(const uint8* y_buf,
void SplitUV_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int width) {
asm volatile (
"1: \n"
"vld2.u8 {q0,q1}, [%0]! \n" // load 16 pairs of UV
"vld2.u8 {q0, q1}, [%0]! \n" // load 16 pairs of UV
"subs %3, %3, #16 \n" // 16 processed per loop
"vst1.u8 {q0}, [%1]! \n" // store U
"vst1.u8 {q1}, [%2]! \n" // Store V
......@@ -213,9 +213,9 @@ void CopyRow_NEON(const uint8* src, uint8* dst, int count) {
asm volatile (
"1: \n"
"pld [%0, #0xC0] \n" // preload
"vldm %0!,{q0,q1,q2,q3} \n" // load 64
"vldm %0!,{q0, q1, q2, q3} \n" // load 64
"subs %2, %2, #64 \n" // 64 processed per loop
"vstm %1!,{q0,q1,q2,q3} \n" // store 64
"vstm %1!,{q0, q1, q2, q3} \n" // store 64
"bgt 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
......@@ -360,21 +360,22 @@ void MirrorRowUV_NEON(const uint8* src, uint8* dst_a, uint8* dst_b, int width) {
}
#endif // HAS_MIRRORROWUV_NEON
// TODO(fbarchard): Avoid d4-d7.
#ifdef HAS_BGRATOARGBROW_NEON
void BGRAToARGBRow_NEON(const uint8* src_bgra, uint8* dst_argb, int pix) {
asm volatile (
"1: \n"
"vld4.u8 {q1,q2,q3,q4}, [%0]! \n" // load 16 pixels of BGRA.
"subs %2, %2, #16 \n" // 16 processed per loop.
"vswp.u8 q2, q3 \n" // swap G, R
"vswp.u8 q1, q4 \n" // swap B, A
"vst4.u8 {q1,q2,q3,q4}, [%1]! \n" // store 16 pixels of ARGB.
"vld4.8 {d5, d6, d7, d8}, [%0]! \n" // load 8 pixels of BGRA.
"subs %2, %2, #8 \n" // 8 processed per loop.
"vswp.u8 d6, d7 \n" // swap G, R
"vswp.u8 d5, d8 \n" // swap B, A
"vst4.8 {d5, d6, d7, d8}, [%1]! \n" // store 8 pixels of ARGB.
"bgt 1b \n"
: "+r"(src_bgra), // %0
"+r"(dst_argb), // %1
"+r"(pix) // %2
:
: "memory", "cc", "q1", "q2", "q3", "q4" // Clobber List
: "memory", "cc", "d5", "d6", "d7", "d8" // Clobber List
);
}
#endif // HAS_BGRATOARGBROW_NEON
......@@ -383,16 +384,16 @@ void BGRAToARGBRow_NEON(const uint8* src_bgra, uint8* dst_argb, int pix) {
void ABGRToARGBRow_NEON(const uint8* src_abgr, uint8* dst_argb, int pix) {
asm volatile (
"1: \n"
"vld4.u8 {q1,q2,q3,q4}, [%0]! \n" // load 16 pixels of ABGR.
"subs %2, %2, #16 \n" // 16 processed per loop.
"vswp.u8 q1, q3 \n" // swap R, B
"vst4.u8 {q1,q2,q3,q4}, [%1]! \n" // store 16 pixels of ARGB.
"vld4.8 {d5, d6, d7, d8}, [%0]! \n" // load 8 pixels of ABGR.
"subs %2, %2, #8 \n" // 8 processed per loop.
"vswp.u8 d5, d7 \n" // swap R, B
"vst4.8 {d5, d6, d7, d8}, [%1]! \n" // store 8 pixels of ARGB.
"bgt 1b \n"
: "+r"(src_abgr), // %0
"+r"(dst_argb), // %1
"+r"(pix) // %2
:
: "memory", "cc", "q1", "q2", "q3", "q4" // Clobber List
: "memory", "cc", "d5", "d6", "d7", "d8" // Clobber List
);
}
#endif // HAS_ABGRTOARGBROW_NEON
......@@ -400,17 +401,17 @@ void ABGRToARGBRow_NEON(const uint8* src_abgr, uint8* dst_argb, int pix) {
#ifdef HAS_RGBATOARGBROW_NEON
void RGBAToARGBRow_NEON(const uint8* src_rgba, uint8* dst_argb, int pix) {
asm volatile (
"1: \n"
"vld4.u8 {q1,q2,q3,q4}, [%0]! \n" // load 16 pixels of RGBA.
"subs %2, %2, #16 \n" // 16 processed per loop.
"vmov.u8 q5, q1 \n" // move A after RGB
"vst4.u8 {q2,q3,q4,q5}, [%1]! \n" // store 16 pixels of ARGB.
"bgt 1b \n"
"1: \n"
"vld4.8 {d5, d6, d7, d8}, [%0]! \n" // load 8 pixels of RGBA.
"subs %2, %2, #8 \n" // 8 processed per loop.
"vmov.u8 d9, d5 \n" // move A after RGB
"vst4.8 {d6, d7, d8, d9}, [%1]! \n" // store 8 pixels of ARGB.
"bgt 1b \n"
: "+r"(src_rgba), // %0
"+r"(dst_argb), // %1
"+r"(pix) // %2
:
: "memory", "cc", "q1", "q2", "q3", "q4", "q5" // Clobber List
: "memory", "cc", "d5", "d6", "d7", "d8", "d9" // Clobber List
);
}
#endif // HAS_RGBATOARGBROW_NEON
......@@ -418,17 +419,17 @@ void RGBAToARGBRow_NEON(const uint8* src_rgba, uint8* dst_argb, int pix) {
#ifdef HAS_RGB24TOARGBROW_NEON
void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int pix) {
asm volatile (
"vmov.u8 q4, #255 \n" // Alpha
"vmov.u8 d8, #255 \n" // Alpha
"1: \n"
"vld3.u8 {q1,q2,q3}, [%0]! \n" // load 16 pixels of RGB24.
"subs %2, %2, #16 \n" // 16 processed per loop.
"vst4.u8 {q1,q2,q3,q4}, [%1]! \n" // store 16 pixels of ARGB.
"vld3.8 {d5, d6, d7}, [%0]! \n" // load 8 pixels of RGB24.
"subs %2, %2, #8 \n" // 8 processed per loop.
"vst4.8 {d5, d6, d7, d8}, [%1]! \n" // store 8 pixels of ARGB.
"bgt 1b \n"
: "+r"(src_rgb24), // %0
"+r"(dst_argb), // %1
"+r"(pix) // %2
"+r"(dst_argb), // %1
"+r"(pix) // %2
:
: "memory", "cc", "q1", "q2", "q3", "q4" // Clobber List
: "memory", "cc", "d5", "d6", "d7", "d8" // Clobber List
);
}
#endif // HAS_RGB24TOARGBROW_NEON
......@@ -436,18 +437,18 @@ void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int pix) {
#ifdef HAS_RAWTOARGBROW_NEON
void RAWToARGBRow_NEON(const uint8* src_raw, uint8* dst_argb, int pix) {
asm volatile (
"vmov.u8 q4, #255 \n" // Alpha
"vmov.u8 d8, #255 \n" // Alpha
"1: \n"
"vld3.u8 {q1,q2,q3}, [%0]! \n" // load 16 pixels of RAW.
"subs %2, %2, #16 \n" // 16 processed per loop.
"vswp.u8 q1, q3 \n" // swap R, B
"vst4.u8 {q1,q2,q3,q4}, [%1]! \n" // store 16 pixels of ARGB.
"vld3.8 {d5, d6, d7}, [%0]! \n" // load 8 pixels of RAW.
"subs %2, %2, #8 \n" // 8 processed per loop.
"vswp.u8 d5, d7 \n" // swap R, B
"vst4.8 {d5, d6, d7, d8}, [%1]! \n" // store 8 pixels of ARGB.
"bgt 1b \n"
: "+r"(src_raw), // %0
: "+r"(src_raw), // %0
"+r"(dst_argb), // %1
"+r"(pix) // %2
:
: "memory", "cc", "q1", "q2", "q3", "q4" // Clobber List
: "memory", "cc", "d5", "d6", "d7", "d8" // Clobber List
);
}
#endif // HAS_RAWTOARGBROW_NEON
......@@ -456,16 +457,16 @@ void RAWToARGBRow_NEON(const uint8* src_raw, uint8* dst_argb, int pix) {
void ARGBToRGBARow_NEON(const uint8* src_argb, uint8* dst_rgba, int pix) {
asm volatile (
"1: \n"
"vld4.u8 {q1,q2,q3,q4}, [%0]! \n" // load 16 pixels of ARGB.
"subs %2, %2, #16 \n" // 16 processed per loop.
"vmov.u8 q0, q4 \n"
"vst4.u8 {q0,q1,q2,q3}, [%1]! \n" // store 16 pixels of RGBA.
"vld4.8 {d5, d6, d7, d8}, [%0]! \n" // load 8 pixels of ARGB.
"subs %2, %2, #8 \n" // 8 processed per loop.
"vmov.u8 d4, d8 \n"
"vst4.8 {d4, d5, d6, d7}, [%1]! \n" // store 8 pixels of RGBA.
"bgt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_rgba), // %1
"+r"(pix) // %2
:
: "memory", "cc", "q0", "q1", "q2", "q3", "q4" // Clobber List
: "memory", "cc", "d4", "d5", "d6", "d7", "d8" // Clobber List
);
}
#endif // HAS_ARGBTORGBAROW_NEON
......@@ -474,15 +475,15 @@ void ARGBToRGBARow_NEON(const uint8* src_argb, uint8* dst_rgba, int pix) {
void ARGBToRGB24Row_NEON(const uint8* src_argb, uint8* dst_rgb24, int pix) {
asm volatile (
"1: \n"
"vld4.u8 {q1,q2,q3,q4}, [%0]! \n" // load 16 pixels of ARGB.
"subs %2, %2, #16 \n" // 16 processed per loop.
"vst3.u8 {q1,q2,q3}, [%1]! \n" // store 16 pixels of RGB24.
"vld4.8 {d5, d6, d7, d8}, [%0]! \n" // load 8 pixels of ARGB.
"subs %2, %2, #8 \n" // 8 processed per loop.
"vst3.8 {d5, d6, d7}, [%1]! \n" // store 8 pixels of RGB24.
"bgt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_rgb24), // %1
"+r"(pix) // %2
:
: "memory", "cc", "q1", "q2", "q3", "q4" // Clobber List
: "memory", "cc", "d5", "d6", "d7", "d8" // Clobber List
);
}
#endif // HAS_ARGBTORGB24ROW_NEON
......@@ -491,20 +492,144 @@ void ARGBToRGB24Row_NEON(const uint8* src_argb, uint8* dst_rgb24, int pix) {
void ARGBToRAWRow_NEON(const uint8* src_argb, uint8* dst_raw, int pix) {
asm volatile (
"1: \n"
"vld4.u8 {q1,q2,q3,q4}, [%0]! \n" // load 16 pixels of ARGB.
"vswp.u8 q1, q3 \n" // swap R, B
"subs %2, %2, #16 \n" // 16 processed per loop.
"vst3.u8 {q1,q2,q3}, [%1]! \n" // store 16 pixels of RAW.
"vld4.8 {d5, d6, d7, d8}, [%0]! \n" // load 8 pixels of ARGB.
"vswp.u8 d5, d7 \n" // swap R, B
"subs %2, %2, #8 \n" // 8 processed per loop.
"vst3.8 {d5, d6, d7}, [%1]! \n" // store 8 pixels of RAW.
"bgt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_raw), // %1
"+r"(pix) // %2
:
: "memory", "cc", "q1", "q2", "q3", "q4" // Clobber List
: "memory", "cc", "d5", "d6", "d7", "d8" // Clobber List
);
}
#endif // HAS_ARGBTORAWROW_NEON
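
In libyuv's naming, ARGB is the byte sequence B,G,R,A in memory and RAW is R,G,B, so ARGBToRAWRow is a byte swap plus an alpha drop; the kernel above does it with vswp on the de-interleaved planes and a three-register vst3. The scalar equivalent, for reference (illustrative name):

```cpp
#include <stdint.h>

// Scalar reference for ARGBToRAWRow: libyuv ARGB is b,g,r,a per pixel in
// memory, RAW is r,g,b; swap R and B, drop A.
void ARGBToRAWRow_C_sketch(const uint8_t* src_argb, uint8_t* dst_raw,
                           int pix) {
  for (int x = 0; x < pix; ++x) {
    dst_raw[0] = src_argb[2];  // R
    dst_raw[1] = src_argb[1];  // G
    dst_raw[2] = src_argb[0];  // B
    src_argb += 4;             // skip A
    dst_raw += 3;
  }
}
```
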
#ifdef HAS_YUY2TOYROW_NEON
void YUY2ToYRow_NEON(const uint8* src_yuy2, uint8* dst_y, int pix) {
asm volatile (
"1: \n"
"vld2.u8 {d0, d1}, [%0]! \n" // load 8 pixels of YUY2.
"subs %2, %2, #8 \n" // 8 processed per loop.
"vst1.u8 {d0}, [%1]! \n" // store 8 pixels of Y.
"bgt 1b \n"
: "+r"(src_yuy2), // %0
"+r"(dst_y), // %1
"+r"(pix) // %2
:
: "memory", "cc", "d0", "d1" // Clobber List
);
}
#endif // HAS_YUY2TOYROW_NEON
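
Here vld2.u8 de-interleaves each 16-byte YUY2 group into 8 Y bytes (d0) and 8 interleaved U/V bytes (d1); storing d0 alone yields the luma row. The scalar equivalent (illustrative name):

```cpp
#include <stdint.h>

// Scalar reference for YUY2ToYRow: in YUY2 (Y0 U Y1 V ...) the Y samples
// sit at even byte offsets, which is exactly what vld2.u8 splits out above.
void YUY2ToYRow_C_sketch(const uint8_t* src_yuy2, uint8_t* dst_y, int pix) {
  for (int x = 0; x < pix; ++x) {
    dst_y[x] = src_yuy2[x * 2];
  }
}
```
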
#ifdef HAS_UYVYTOYROW_NEON
void UYVYToYRow_NEON(const uint8* src_uyvy, uint8* dst_y, int pix) {
asm volatile (
"1: \n"
"vld2.u8 {d0, d1}, [%0]! \n" // load 8 pixels of UYVY.
"subs %2, %2, #8 \n" // 8 processed per loop.
"vst1.u8 {d1}, [%1]! \n" // store 8 pixels of Y.
"bgt 1b \n"
: "+r"(src_uyvy), // %0
"+r"(dst_y), // %1
"+r"(pix) // %2
:
: "memory", "cc", "d0", "d1" // Clobber List
);
}
#endif // HAS_UYVYTOYROW_NEON
#ifdef HAS_YUY2TOYROW_NEON
void YUY2ToUV422Row_NEON(const uint8* src_yuy2, uint8* dst_u, uint8* dst_v,
int pix) {
asm volatile (
"1: \n"
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of YUY2.
"subs %3, %3, #16 \n" // 16 pixels = 8 UVs.
"vst1.u8 {d1}, [%1]! \n" // store 8 U.
"vst1.u8 {d3}, [%2]! \n" // store 8 V.
"bgt 1b \n"
: "+r"(src_yuy2), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
"+r"(pix) // %3
:
: "memory", "cc", "d0", "d1", "d2", "d3" // Clobber List
);
}
#endif // HAS_YUY2TOYROW_NEON
#ifdef HAS_UYVYTOYROW_NEON
void UYVYToUV422Row_NEON(const uint8* src_uyvy, uint8* dst_u, uint8* dst_v,
int pix) {
asm volatile (
"1: \n"
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of UYVY.
"subs %3, %3, #16 \n" // 16 pixels = 8 UVs.
"vst1.u8 {d0}, [%1]! \n" // store 8 U.
"vst1.u8 {d2}, [%2]! \n" // store 8 V.
"bgt 1b \n"
: "+r"(src_uyvy), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
"+r"(pix) // %3
:
: "memory", "cc", "d0", "d1", "d2", "d3" // Clobber List
);
}
#endif // HAS_UYVYTOYROW_NEON
#ifdef HAS_YUY2TOYROW_NEON
void YUY2ToUVRow_NEON(const uint8* src_yuy2, int stride_yuy2,
uint8* dst_u, uint8* dst_v, int pix) {
asm volatile (
"adds %1, %0, %1 \n" // stride + src_yuy2
"1: \n"
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of YUY2.
"vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load next row YUY2.
"subs %3, %3, #16 \n" // 16 pixels = 8 UVs.
"vrhadd.u8 d1, d1, d5 \n" // average rows of U
"vrhadd.u8 d3, d3, d7 \n" // average rows of V
"vst1.u8 {d1}, [%2]! \n" // store 8 U.
"vst1.u8 {d3}, [%3]! \n" // store 8 V.
"bgt 1b \n"
: "+r"(src_yuy2), // %0
"+r"(stride_yuy2), // %1
"+r"(dst_u), // %2
"+r"(dst_v), // %3
"+r"(pix) // %4
:
: "memory", "cc", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7" // Clobber List
);
}
#endif // HAS_YUY2TOYROW_NEON
#ifdef HAS_UYVYTOYROW_NEON
void UYVYToUVRow_NEON(const uint8* src_uyvy, int stride_uyvy,
uint8* dst_u, uint8* dst_v, int pix) {
asm volatile (
"adds %1, %0, %1 \n" // stride + src_uyvy
"1: \n"
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of UYVY.
"vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load next row UYVY.
"subs %3, %3, #16 \n" // 16 pixels = 8 UVs.
"vrhadd.u8 d0, d0, d4 \n" // average rows of U
"vrhadd.u8 d2, d2, d6 \n" // average rows of V
"vst1.u8 {d0}, [%2]! \n" // store 8 U.
"vst1.u8 {d2}, [%3]! \n" // store 8 V.
"bgt 1b \n"
: "+r"(src_uyvy), // %0
"+r"(stride_uyvy), // %1
"+r"(dst_u), // %2
"+r"(dst_v), // %3
"+r"(pix) // %4
:
: "memory", "cc", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7" // Clobber List
);
}
#endif // HAS_UYVYTOYROW_NEON
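
Both UVRow kernels above combine chroma from two source rows with vrhadd.u8, NEON's rounding halving add. Per byte, that operation is:

```cpp
#include <stdint.h>

// Scalar meaning of vrhadd.u8 as used in the UVRow kernels: a rounded
// average of the corresponding U (or V) bytes from two adjacent rows.
static inline uint8_t RoundedHalvingAdd(uint8_t a, uint8_t b) {
  return (uint8_t)((a + b + 1) >> 1);
}
```
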
#endif // __ARM_NEON__
#ifdef __cplusplus
......
......@@ -2755,13 +2755,11 @@ void ScaleAddRows_C(const uint8* src_ptr, ptrdiff_t src_stride,
* its original size.
*
*/
static void ScalePlaneDown2(int src_width, int src_height,
static void ScalePlaneDown2(int /* src_width */, int /* src_height */,
int dst_width, int dst_height,
int src_stride, int dst_stride,
const uint8* src_ptr, uint8* dst_ptr,
FilterMode filtering) {
assert(IS_ALIGNED(src_width, 2));
assert(IS_ALIGNED(src_height, 2));
void (*ScaleRowDown2)(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) =
filtering ? ScaleRowDown2Int_C : ScaleRowDown2_C;
......@@ -2795,13 +2793,11 @@ static void ScalePlaneDown2(int src_width, int src_height,
* This is an optimized version for scaling down a plane to 1/4 of
* its original size.
*/
static void ScalePlaneDown4(int src_width, int src_height,
static void ScalePlaneDown4(int /* src_width */, int /* src_height */,
int dst_width, int dst_height,
int src_stride, int dst_stride,
const uint8* src_ptr, uint8* dst_ptr,
FilterMode filtering) {
assert(IS_ALIGNED(src_width, 4));
assert(IS_ALIGNED(src_height, 4));
void (*ScaleRowDown4)(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) =
filtering ? ScaleRowDown4Int_C : ScaleRowDown4_C;
......@@ -2832,13 +2828,11 @@ static void ScalePlaneDown4(int src_width, int src_height,
* of its original size.
*
*/
static void ScalePlaneDown8(int src_width, int src_height,
static void ScalePlaneDown8(int /* src_width */, int /* src_height */,
int dst_width, int dst_height,
int src_stride, int dst_stride,
const uint8* src_ptr, uint8* dst_ptr,
FilterMode filtering) {
assert(IS_ALIGNED(src_width, 8));
assert(IS_ALIGNED(src_height, 8));
void (*ScaleRowDown8)(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) =
filtering && (dst_width <= kMaxOutputWidth) ?
......@@ -2864,7 +2858,7 @@ static void ScalePlaneDown8(int src_width, int src_height,
* Provided by Frank Barchard (fbarchard@google.com)
*
*/
static void ScalePlaneDown34(int src_width, int src_height,
static void ScalePlaneDown34(int /* src_width */, int /* src_height */,
int dst_width, int dst_height,
int src_stride, int dst_stride,
const uint8* src_ptr, uint8* dst_ptr,
......@@ -2953,7 +2947,7 @@ static void ScalePlaneDown34(int src_width, int src_height,
* ggghhhii
* Boxes are 3x3, 2x3, 3x2 and 2x2
*/
static void ScalePlaneDown38(int src_width, int src_height,
static void ScalePlaneDown38(int /* src_width */, int /* src_height */,
int dst_width, int dst_height,
int src_stride, int dst_stride,
const uint8* src_ptr, uint8* dst_ptr,
......
......@@ -791,13 +791,11 @@ void ScaleARGBFilterRows_C(uint8* dst_ptr, const uint8* src_ptr,
* its original size.
*
*/
static void ScaleARGBDown2(int src_width, int src_height,
static void ScaleARGBDown2(int /* src_width */, int /* src_height */,
int dst_width, int dst_height,
int src_stride, int dst_stride,
const uint8* src_ptr, uint8* dst_ptr,
FilterMode filtering) {
assert(IS_ALIGNED(src_width, 2));
assert(IS_ALIGNED(src_height, 2));
void (*ScaleARGBRowDown2)(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) =
filtering ? ScaleARGBRowDown2Int_C : ScaleARGBRowDown2_C;
......