Commit 8d37dd5c authored by fbarchard@google.com's avatar fbarchard@google.com

Any conversions fix for RGB 3 bytes. Fix for overread valgrind. Avoid memcpy. …

Any conversions fix for RGB 3 bytes.  Fix for overread valgrind.  Avoid memcpy.  Add _Any unittests for all conversions.
BUG=121
TEST=./libyuv_unittest --gtest_filter=*Any
Review URL: https://webrtc-codereview.appspot.com/873010

git-svn-id: http://libyuv.googlecode.com/svn/trunk@401 16f28f9a-4ce2-e073-06de-1de4eb20be90
parent e91bdaca
Name: libyuv Name: libyuv
URL: http://code.google.com/p/libyuv/ URL: http://code.google.com/p/libyuv/
Version: 399 Version: 401
License: BSD License: BSD
License File: LICENSE License File: LICENSE
......
...@@ -42,6 +42,7 @@ extern "C" { ...@@ -42,6 +42,7 @@ extern "C" {
#define HAS_ARGB4444TOARGBROW_SSE2 #define HAS_ARGB4444TOARGBROW_SSE2
#define HAS_ARGBTOARGB1555ROW_SSE2 #define HAS_ARGBTOARGB1555ROW_SSE2
#define HAS_ARGBTOARGB4444ROW_SSE2 #define HAS_ARGBTOARGB4444ROW_SSE2
#define HAS_ARGBTOBAYERROW_SSSE3
#define HAS_ARGBTORAWROW_SSSE3 #define HAS_ARGBTORAWROW_SSSE3
#define HAS_ARGBTORGB24ROW_SSSE3 #define HAS_ARGBTORGB24ROW_SSSE3
#define HAS_ARGBTORGB565ROW_SSE2 #define HAS_ARGBTORGB565ROW_SSE2
...@@ -94,7 +95,7 @@ extern "C" { ...@@ -94,7 +95,7 @@ extern "C" {
#define HAS_CUMULATIVESUMTOAVERAGE_SSE2 #define HAS_CUMULATIVESUMTOAVERAGE_SSE2
#endif #endif
// The following are Windows only: // The following are Windows only. TODO(fbarchard): Port to gcc.
#if !defined(YUV_DISABLE_ASM) && defined(_M_IX86) #if !defined(YUV_DISABLE_ASM) && defined(_M_IX86)
#define HAS_ABGRTOARGBROW_SSSE3 #define HAS_ABGRTOARGBROW_SSSE3
#define HAS_ARGBCOLORTABLEROW_X86 #define HAS_ARGBCOLORTABLEROW_X86
...@@ -116,6 +117,7 @@ extern "C" { ...@@ -116,6 +117,7 @@ extern "C" {
// The following are available on Neon platforms // The following are available on Neon platforms
#if !defined(YUV_DISABLE_ASM) && (defined(__ARM_NEON__) || defined(LIBYUV_NEON)) #if !defined(YUV_DISABLE_ASM) && (defined(__ARM_NEON__) || defined(LIBYUV_NEON))
#define HAS_ARGBTOBAYERROW_NEON
#define HAS_COPYROW_NEON #define HAS_COPYROW_NEON
#define HAS_HALFROW_NEON #define HAS_HALFROW_NEON
#define HAS_I422TOABGRROW_NEON #define HAS_I422TOABGRROW_NEON
...@@ -759,6 +761,12 @@ void HalfRow_SSE2(const uint8* src_uv, int src_uv_stride, ...@@ -759,6 +761,12 @@ void HalfRow_SSE2(const uint8* src_uv, int src_uv_stride,
void HalfRow_NEON(const uint8* src_uv, int src_uv_stride, void HalfRow_NEON(const uint8* src_uv, int src_uv_stride,
uint8* dst_uv, int pix); uint8* dst_uv, int pix);
void ARGBToBayerRow_C(const uint8* src_argb,
uint8* dst_bayer, uint32 selector, int pix);
void ARGBToBayerRow_SSSE3(const uint8* src_argb,
uint8* dst_bayer, uint32 selector, int pix);
void ARGBToBayerRow_NEON(const uint8* src_argb,
uint8* dst_bayer, uint32 selector, int pix);
#ifdef __cplusplus #ifdef __cplusplus
} // extern "C" } // extern "C"
......
...@@ -11,6 +11,6 @@ ...@@ -11,6 +11,6 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT #ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT
#define INCLUDE_LIBYUV_VERSION_H_ #define INCLUDE_LIBYUV_VERSION_H_
#define LIBYUV_VERSION 399 #define LIBYUV_VERSION 401
#endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT #endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT
...@@ -115,14 +115,7 @@ int I422ToARGB(const uint8* src_y, int src_stride_y, ...@@ -115,14 +115,7 @@ int I422ToARGB(const uint8* src_y, int src_stride_y,
const uint8* v_buf, const uint8* v_buf,
uint8* rgb_buf, uint8* rgb_buf,
int width) = I422ToARGBRow_C; int width) = I422ToARGBRow_C;
#if defined(HAS_I422TOARGBROW_NEON) #if defined(HAS_I422TOARGBROW_SSSE3)
if (TestCpuFlag(kCpuHasNEON)) {
I422ToARGBRow = I422ToARGBRow_Any_NEON;
if (IS_ALIGNED(width, 16)) {
I422ToARGBRow = I422ToARGBRow_NEON;
}
}
#elif defined(HAS_I422TOARGBROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) { if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
I422ToARGBRow = I422ToARGBRow_Any_SSSE3; I422ToARGBRow = I422ToARGBRow_Any_SSSE3;
if (IS_ALIGNED(width, 8)) { if (IS_ALIGNED(width, 8)) {
...@@ -132,6 +125,13 @@ int I422ToARGB(const uint8* src_y, int src_stride_y, ...@@ -132,6 +125,13 @@ int I422ToARGB(const uint8* src_y, int src_stride_y,
} }
} }
} }
#elif defined(HAS_I422TOARGBROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
I422ToARGBRow = I422ToARGBRow_Any_NEON;
if (IS_ALIGNED(width, 8)) {
I422ToARGBRow = I422ToARGBRow_NEON;
}
}
#endif #endif
for (int y = 0; y < height; ++y) { for (int y = 0; y < height; ++y) {
...@@ -189,7 +189,6 @@ int I411ToARGB(const uint8* src_y, int src_stride_y, ...@@ -189,7 +189,6 @@ int I411ToARGB(const uint8* src_y, int src_stride_y,
return 0; return 0;
} }
// Convert I400 to ARGB. // Convert I400 to ARGB.
LIBYUV_API LIBYUV_API
int I400ToARGB_Reference(const uint8* src_y, int src_stride_y, int I400ToARGB_Reference(const uint8* src_y, int src_stride_y,
...@@ -725,21 +724,23 @@ int YUY2ToARGB(const uint8* src_yuy2, int src_stride_yuy2, ...@@ -725,21 +724,23 @@ int YUY2ToARGB(const uint8* src_yuy2, int src_stride_yuy2,
void (*I422ToARGBRow)(const uint8* y_buf, void (*I422ToARGBRow)(const uint8* y_buf,
const uint8* u_buf, const uint8* u_buf,
const uint8* v_buf, const uint8* v_buf,
uint8* argb_buf, uint8* rgb_buf,
int width) = I422ToARGBRow_C; int width) = I422ToARGBRow_C;
#if defined(HAS_I422TOARGBROW_NEON) #if defined(HAS_I422TOARGBROW_SSSE3)
if (TestCpuFlag(kCpuHasNEON)) {
I422ToARGBRow = I422ToARGBRow_Any_NEON;
if (IS_ALIGNED(width, 16)) {
I422ToARGBRow = I422ToARGBRow_NEON;
}
}
#elif defined(HAS_I422TOARGBROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) { if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
I422ToARGBRow = I422ToARGBRow_Any_SSSE3; I422ToARGBRow = I422ToARGBRow_Any_SSSE3;
if (IS_ALIGNED(width, 8) && if (IS_ALIGNED(width, 8)) {
IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { I422ToARGBRow = I422ToARGBRow_Unaligned_SSSE3;
I422ToARGBRow = I422ToARGBRow_SSSE3; if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
I422ToARGBRow = I422ToARGBRow_SSSE3;
}
}
}
#elif defined(HAS_I422TOARGBROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
I422ToARGBRow = I422ToARGBRow_Any_NEON;
if (IS_ALIGNED(width, 8)) {
I422ToARGBRow = I422ToARGBRow_NEON;
} }
} }
#endif #endif
...@@ -796,21 +797,23 @@ int UYVYToARGB(const uint8* src_uyvy, int src_stride_uyvy, ...@@ -796,21 +797,23 @@ int UYVYToARGB(const uint8* src_uyvy, int src_stride_uyvy,
void (*I422ToARGBRow)(const uint8* y_buf, void (*I422ToARGBRow)(const uint8* y_buf,
const uint8* u_buf, const uint8* u_buf,
const uint8* v_buf, const uint8* v_buf,
uint8* argb_buf, uint8* rgb_buf,
int width) = I422ToARGBRow_C; int width) = I422ToARGBRow_C;
#if defined(HAS_I422TOARGBROW_NEON) #if defined(HAS_I422TOARGBROW_SSSE3)
if (TestCpuFlag(kCpuHasNEON)) {
I422ToARGBRow = I422ToARGBRow_Any_NEON;
if (IS_ALIGNED(width, 16)) {
I422ToARGBRow = I422ToARGBRow_NEON;
}
}
#elif defined(HAS_I422TOARGBROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) { if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
I422ToARGBRow = I422ToARGBRow_Any_SSSE3; I422ToARGBRow = I422ToARGBRow_Any_SSSE3;
if (IS_ALIGNED(width, 8) && if (IS_ALIGNED(width, 8)) {
IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { I422ToARGBRow = I422ToARGBRow_Unaligned_SSSE3;
I422ToARGBRow = I422ToARGBRow_SSSE3; if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
I422ToARGBRow = I422ToARGBRow_SSSE3;
}
}
}
#elif defined(HAS_I422TOARGBROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
I422ToARGBRow = I422ToARGBRow_Any_NEON;
if (IS_ALIGNED(width, 8)) {
I422ToARGBRow = I422ToARGBRow_NEON;
} }
} }
#endif #endif
......
This diff is collapsed.
...@@ -20,81 +20,6 @@ namespace libyuv { ...@@ -20,81 +20,6 @@ namespace libyuv {
extern "C" { extern "C" {
#endif #endif
// Note: to do this with Neon vld4.8 would load ARGB values into 4 registers
// and vst would select which 2 components to write. The low level would need
// to be ARGBToBG, ARGBToGB, ARGBToRG, ARGBToGR
#if !defined(YUV_DISABLE_ASM) && defined(_M_IX86)
#define HAS_ARGBTOBAYERROW_SSSE3
// Select one byte from each ARGB pixel using byte offsets packed in
// 'selector' (broadcast into xmm5 for pshufb). Processes 4 pixels
// (16 source bytes, movdqa) per iteration, so callers must guarantee
// pix is a multiple of 4 and src_argb is 16-byte aligned.
__declspec(naked) __declspec(align(16))
static void ARGBToBayerRow_SSSE3(const uint8* src_argb,
                                 uint8* dst_bayer, uint32 selector, int pix) {
  __asm {
    // Naked function: no prologue, arguments read directly off the stack.
    mov eax, [esp + 4] // src_argb
    mov edx, [esp + 8] // dst_bayer
    movd xmm5, [esp + 12] // selector
    mov ecx, [esp + 16] // pix
    pshufd xmm5, xmm5, 0 // broadcast selector to all 4 dwords
    align 16
  wloop:
    movdqa xmm0, [eax] // load 4 ARGB pixels (aligned)
    lea eax, [eax + 16]
    pshufb xmm0, xmm5 // pick the selected byte of each pixel
    sub ecx, 4
    movd [edx], xmm0 // store 4 Bayer bytes
    lea edx, [edx + 4]
    jg wloop
    ret
  }
}
#elif !defined(YUV_DISABLE_ASM) && (defined(__x86_64__) || defined(__i386__))
#define HAS_ARGBTOBAYERROW_SSSE3
// GCC inline-asm equivalent of the routine above; same 4-pixel-per-pass
// structure and the same alignment/multiple-of-4 requirements.
static void ARGBToBayerRow_SSSE3(const uint8* src_argb, uint8* dst_bayer,
                                 uint32 selector, int pix) {
  asm volatile (
    "movd %3,%%xmm5 \n"            // selector -> xmm5
    "pshufd $0x0,%%xmm5,%%xmm5 \n" // broadcast to all 4 dwords
    ".p2align 4 \n"
    "1: \n"
    "movdqa (%0),%%xmm0 \n"        // load 4 ARGB pixels (aligned)
    "lea 0x10(%0),%0 \n"
    "pshufb %%xmm5,%%xmm0 \n"      // pick the selected byte of each pixel
    "sub $0x4,%2 \n"
    "movd %%xmm0,(%1) \n"          // store 4 Bayer bytes
    "lea 0x4(%1),%1 \n"
    "jg 1b \n"
  : "+r"(src_argb), // %0
    "+r"(dst_bayer), // %1
    "+r"(pix) // %2
  : "g"(selector) // %3
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm5"
#endif
  );
}
#endif
// Extract one Bayer byte per ARGB pixel. The low byte of 'selector' is
// the byte offset used for even pixels, the next byte the offset for odd
// pixels (offsets are relative to the start of each 2-pixel group).
static void ARGBToBayerRow_C(const uint8* src_argb,
                             uint8* dst_bayer, uint32 selector, int pix) {
  const int even_offset = selector & 0xff;
  const int odd_offset = (selector >> 8) & 0xff;
  int pairs = pix >> 1;  // number of complete 2-pixel groups
  while (pairs > 0) {
    *dst_bayer++ = src_argb[even_offset];
    *dst_bayer++ = src_argb[odd_offset];
    src_argb += 8;  // skip 2 ARGB pixels (4 bytes each)
    --pairs;
  }
  if (pix & 1) {  // odd trailing pixel uses the even-pixel offset
    *dst_bayer = src_argb[even_offset];
  }
}
// generate a selector mask useful for pshufb // generate a selector mask useful for pshufb
static uint32 GenerateSelector(int select0, int select1) { static uint32 GenerateSelector(int select0, int select1) {
return static_cast<uint32>(select0) | return static_cast<uint32>(select0) |
...@@ -147,11 +72,14 @@ int ARGBToBayer(const uint8* src_argb, int src_stride_argb, ...@@ -147,11 +72,14 @@ int ARGBToBayer(const uint8* src_argb, int src_stride_argb,
void (*ARGBToBayerRow)(const uint8* src_argb, uint8* dst_bayer, void (*ARGBToBayerRow)(const uint8* src_argb, uint8* dst_bayer,
uint32 selector, int pix) = ARGBToBayerRow_C; uint32 selector, int pix) = ARGBToBayerRow_C;
#if defined(HAS_ARGBTOBAYERROW_SSSE3) #if defined(HAS_ARGBTOBAYERROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3) && if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 4) &&
IS_ALIGNED(width, 4) &&
IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) { IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) {
ARGBToBayerRow = ARGBToBayerRow_SSSE3; ARGBToBayerRow = ARGBToBayerRow_SSSE3;
} }
#elif defined(HAS_ARGBTOBAYERROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 4)) {
ARGBToBayerRow = ARGBToBayerRow_NEON;
}
#endif #endif
const int blue_index = 0; // Offsets for ARGB format const int blue_index = 0; // Offsets for ARGB format
const int green_index = 1; const int green_index = 1;
...@@ -455,15 +383,22 @@ int I420ToBayer(const uint8* src_y, int src_stride_y, ...@@ -455,15 +383,22 @@ int I420ToBayer(const uint8* src_y, int src_stride_y,
const uint8* v_buf, const uint8* v_buf,
uint8* rgb_buf, uint8* rgb_buf,
int width) = I422ToARGBRow_C; int width) = I422ToARGBRow_C;
#if defined(HAS_I422TOARGBROW_NEON) #if defined(HAS_I422TOARGBROW_SSSE3)
if (TestCpuFlag(kCpuHasNEON)) { if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
I422ToARGBRow = I422ToARGBRow_NEON; I422ToARGBRow = I422ToARGBRow_Any_SSSE3;
if (IS_ALIGNED(width, 8)) {
I422ToARGBRow = I422ToARGBRow_Unaligned_SSSE3;
}
} }
#elif defined(HAS_I422TOARGBROW_SSSE3) #elif defined(HAS_I422TOARGBROW_NEON)
if (TestCpuFlag(kCpuHasSSSE3)) { if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
I422ToARGBRow = I422ToARGBRow_SSSE3; I422ToARGBRow = I422ToARGBRow_Any_NEON;
if (IS_ALIGNED(width, 8)) {
I422ToARGBRow = I422ToARGBRow_NEON;
}
} }
#endif #endif
SIMD_ALIGNED(uint8 row[kMaxStride]); SIMD_ALIGNED(uint8 row[kMaxStride]);
void (*ARGBToBayerRow)(const uint8* src_argb, uint8* dst_bayer, void (*ARGBToBayerRow)(const uint8* src_argb, uint8* dst_bayer,
uint32 selector, int pix) = ARGBToBayerRow_C; uint32 selector, int pix) = ARGBToBayerRow_C;
...@@ -471,6 +406,10 @@ int I420ToBayer(const uint8* src_y, int src_stride_y, ...@@ -471,6 +406,10 @@ int I420ToBayer(const uint8* src_y, int src_stride_y,
if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 4)) { if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 4)) {
ARGBToBayerRow = ARGBToBayerRow_SSSE3; ARGBToBayerRow = ARGBToBayerRow_SSSE3;
} }
#elif defined(HAS_ARGBTOBAYERROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 4)) {
ARGBToBayerRow = ARGBToBayerRow_NEON;
}
#endif #endif
const int blue_index = 0; // Offsets for ARGB format const int blue_index = 0; // Offsets for ARGB format
const int green_index = 1; const int green_index = 1;
......
...@@ -626,8 +626,7 @@ int ARGBToRGBA(const uint8* src_argb, int src_stride_argb, ...@@ -626,8 +626,7 @@ int ARGBToRGBA(const uint8* src_argb, int src_stride_argb,
IS_ALIGNED(dst_rgba, 16) && IS_ALIGNED(dst_stride_rgba, 16)) { IS_ALIGNED(dst_rgba, 16) && IS_ALIGNED(dst_stride_rgba, 16)) {
ARGBToRGBARow = ARGBToRGBARow_SSSE3; ARGBToRGBARow = ARGBToRGBARow_SSSE3;
} }
#endif #elif defined(HAS_ARGBTORGBAROW_NEON)
#if defined(HAS_ARGBTORGBAROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) { if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) {
ARGBToRGBARow = ARGBToRGBARow_NEON; ARGBToRGBARow = ARGBToRGBARow_NEON;
} }
...@@ -657,22 +656,17 @@ int ARGBToRGB24(const uint8* src_argb, int src_stride_argb, ...@@ -657,22 +656,17 @@ int ARGBToRGB24(const uint8* src_argb, int src_stride_argb,
void (*ARGBToRGB24Row)(const uint8* src_argb, uint8* dst_rgb, int pix) = void (*ARGBToRGB24Row)(const uint8* src_argb, uint8* dst_rgb, int pix) =
ARGBToRGB24Row_C; ARGBToRGB24Row_C;
#if defined(HAS_ARGBTORGB24ROW_SSSE3) #if defined(HAS_ARGBTORGB24ROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3) && if (TestCpuFlag(kCpuHasSSSE3) && width >= 16 &&
IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) { IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) &&
if (width * 3 <= kMaxStride) { IS_ALIGNED(dst_rgb24, 16) && IS_ALIGNED(dst_stride_rgb24, 16)) {
ARGBToRGB24Row = ARGBToRGB24Row_Any_SSSE3; ARGBToRGB24Row = ARGBToRGB24Row_Any_SSSE3;
} if (IS_ALIGNED(width, 16)) {
if (IS_ALIGNED(width, 16) &&
IS_ALIGNED(dst_rgb24, 16) && IS_ALIGNED(dst_stride_rgb24, 16)) {
ARGBToRGB24Row = ARGBToRGB24Row_SSSE3; ARGBToRGB24Row = ARGBToRGB24Row_SSSE3;
} }
} }
#endif #elif defined(HAS_ARGBTORGB24ROW_NEON)
#if defined(HAS_ARGBTORGB24ROW_NEON) if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
if (TestCpuFlag(kCpuHasNEON)) { ARGBToRGB24Row = ARGBToRGB24Row_Any_NEON;
if (width * 3 <= kMaxStride) {
ARGBToRGB24Row = ARGBToRGB24Row_Any_NEON;
}
if (IS_ALIGNED(width, 8)) { if (IS_ALIGNED(width, 8)) {
ARGBToRGB24Row = ARGBToRGB24Row_NEON; ARGBToRGB24Row = ARGBToRGB24Row_NEON;
} }
...@@ -703,22 +697,17 @@ int ARGBToRAW(const uint8* src_argb, int src_stride_argb, ...@@ -703,22 +697,17 @@ int ARGBToRAW(const uint8* src_argb, int src_stride_argb,
void (*ARGBToRAWRow)(const uint8* src_argb, uint8* dst_rgb, int pix) = void (*ARGBToRAWRow)(const uint8* src_argb, uint8* dst_rgb, int pix) =
ARGBToRAWRow_C; ARGBToRAWRow_C;
#if defined(HAS_ARGBTORAWROW_SSSE3) #if defined(HAS_ARGBTORAWROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3) && if (TestCpuFlag(kCpuHasSSSE3) && width >= 16 &&
IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) { IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) &&
if (width * 3 <= kMaxStride) { IS_ALIGNED(dst_raw, 16) && IS_ALIGNED(dst_stride_raw, 16)) {
ARGBToRAWRow = ARGBToRAWRow_Any_SSSE3; ARGBToRAWRow = ARGBToRAWRow_Any_SSSE3;
} if (IS_ALIGNED(width, 16)) {
if (IS_ALIGNED(width, 16) &&
IS_ALIGNED(dst_raw, 16) && IS_ALIGNED(dst_stride_raw, 16)) {
ARGBToRAWRow = ARGBToRAWRow_SSSE3; ARGBToRAWRow = ARGBToRAWRow_SSSE3;
} }
} }
#endif #elif defined(HAS_ARGBTORAWROW_NEON)
#if defined(HAS_ARGBTORAWROW_NEON) if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
if (TestCpuFlag(kCpuHasNEON)) { ARGBToRAWRow = ARGBToRAWRow_Any_NEON;
if (width * 3 <= kMaxStride) {
ARGBToRAWRow = ARGBToRAWRow_Any_NEON;
}
if (IS_ALIGNED(width, 8)) { if (IS_ALIGNED(width, 8)) {
ARGBToRAWRow = ARGBToRAWRow_NEON; ARGBToRAWRow = ARGBToRAWRow_NEON;
} }
...@@ -749,11 +738,9 @@ int ARGBToRGB565(const uint8* src_argb, int src_stride_argb, ...@@ -749,11 +738,9 @@ int ARGBToRGB565(const uint8* src_argb, int src_stride_argb,
void (*ARGBToRGB565Row)(const uint8* src_argb, uint8* dst_rgb, int pix) = void (*ARGBToRGB565Row)(const uint8* src_argb, uint8* dst_rgb, int pix) =
ARGBToRGB565Row_C; ARGBToRGB565Row_C;
#if defined(HAS_ARGBTORGB565ROW_SSE2) #if defined(HAS_ARGBTORGB565ROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2) && if (TestCpuFlag(kCpuHasSSE2) && width >= 4 &&
IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) { IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) {
if (width * 2 <= kMaxStride) { ARGBToRGB565Row = ARGBToRGB565Row_Any_SSE2;
ARGBToRGB565Row = ARGBToRGB565Row_Any_SSE2;
}
if (IS_ALIGNED(width, 4)) { if (IS_ALIGNED(width, 4)) {
ARGBToRGB565Row = ARGBToRGB565Row_SSE2; ARGBToRGB565Row = ARGBToRGB565Row_SSE2;
} }
...@@ -784,11 +771,9 @@ int ARGBToARGB1555(const uint8* src_argb, int src_stride_argb, ...@@ -784,11 +771,9 @@ int ARGBToARGB1555(const uint8* src_argb, int src_stride_argb,
void (*ARGBToARGB1555Row)(const uint8* src_argb, uint8* dst_rgb, int pix) = void (*ARGBToARGB1555Row)(const uint8* src_argb, uint8* dst_rgb, int pix) =
ARGBToARGB1555Row_C; ARGBToARGB1555Row_C;
#if defined(HAS_ARGBTOARGB1555ROW_SSE2) #if defined(HAS_ARGBTOARGB1555ROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2) && if (TestCpuFlag(kCpuHasSSE2) && width >= 4 &&
IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) { IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) {
if (width * 2 <= kMaxStride) { ARGBToARGB1555Row = ARGBToARGB1555Row_Any_SSE2;
ARGBToARGB1555Row = ARGBToARGB1555Row_Any_SSE2;
}
if (IS_ALIGNED(width, 4)) { if (IS_ALIGNED(width, 4)) {
ARGBToARGB1555Row = ARGBToARGB1555Row_SSE2; ARGBToARGB1555Row = ARGBToARGB1555Row_SSE2;
} }
...@@ -819,11 +804,9 @@ int ARGBToARGB4444(const uint8* src_argb, int src_stride_argb, ...@@ -819,11 +804,9 @@ int ARGBToARGB4444(const uint8* src_argb, int src_stride_argb,
void (*ARGBToARGB4444Row)(const uint8* src_argb, uint8* dst_rgb, int pix) = void (*ARGBToARGB4444Row)(const uint8* src_argb, uint8* dst_rgb, int pix) =
ARGBToARGB4444Row_C; ARGBToARGB4444Row_C;
#if defined(HAS_ARGBTOARGB4444ROW_SSE2) #if defined(HAS_ARGBTOARGB4444ROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2) && if (TestCpuFlag(kCpuHasSSE2) && width >= 4 &&
IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) { IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) {
if (width * 2 <= kMaxStride) { ARGBToARGB4444Row = ARGBToARGB4444Row_Any_SSE2;
ARGBToARGB4444Row = ARGBToARGB4444Row_Any_SSE2;
}
if (IS_ALIGNED(width, 4)) { if (IS_ALIGNED(width, 4)) {
ARGBToARGB4444Row = ARGBToARGB4444Row_SSE2; ARGBToARGB4444Row = ARGBToARGB4444Row_SSE2;
} }
...@@ -839,7 +822,7 @@ int ARGBToARGB4444(const uint8* src_argb, int src_stride_argb, ...@@ -839,7 +822,7 @@ int ARGBToARGB4444(const uint8* src_argb, int src_stride_argb,
} }
// Convert NV12 to RGB565. // Convert NV12 to RGB565.
// TODO(fbarchard): (Re) Optimize for Neon. // TODO(fbarchard): One pass conversion.
LIBYUV_API LIBYUV_API
int NV12ToRGB565(const uint8* src_y, int src_stride_y, int NV12ToRGB565(const uint8* src_y, int src_stride_y,
const uint8* src_uv, int src_stride_uv, const uint8* src_uv, int src_stride_uv,
...@@ -859,22 +842,26 @@ int NV12ToRGB565(const uint8* src_y, int src_stride_y, ...@@ -859,22 +842,26 @@ int NV12ToRGB565(const uint8* src_y, int src_stride_y,
uint8* rgb_buf, uint8* rgb_buf,
int width) = NV12ToARGBRow_C; int width) = NV12ToARGBRow_C;
#if defined(HAS_NV12TOARGBROW_SSSE3) #if defined(HAS_NV12TOARGBROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3) && width * 4 <= kMaxStride) { if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 8)) {
NV12ToARGBRow = NV12ToARGBRow_SSSE3; NV12ToARGBRow = NV12ToARGBRow_SSSE3;
} }
#endif #elif defined(HAS_NV12TOARGBROW_NEON)
#if defined(HAS_NV12TOARGBROW_NEON) if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) {
if (TestCpuFlag(kCpuHasNEON) && width * 4 <= kMaxStride) {
NV12ToARGBRow = NV12ToARGBRow_NEON; NV12ToARGBRow = NV12ToARGBRow_NEON;
} }
#endif #endif
if (width * 4 > kMaxStride) {
return -1;
}
SIMD_ALIGNED(uint8 row[kMaxStride]); SIMD_ALIGNED(uint8 row[kMaxStride]);
void (*ARGBToRGB565Row)(const uint8* src_argb, uint8* dst_rgb, int pix) = void (*ARGBToRGB565Row)(const uint8* src_argb, uint8* dst_rgb, int pix) =
ARGBToRGB565Row_C; ARGBToRGB565Row_C;
#if defined(HAS_ARGBTORGB565ROW_SSE2) #if defined(HAS_ARGBTORGB565ROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 4)) { if (TestCpuFlag(kCpuHasSSE2) && width >= 4) {
ARGBToRGB565Row = ARGBToRGB565Row_SSE2; ARGBToRGB565Row = ARGBToRGB565Row_Any_SSE2;
if (IS_ALIGNED(width, 4)) {
ARGBToRGB565Row = ARGBToRGB565Row_SSE2;
}
} }
#endif #endif
...@@ -893,10 +880,10 @@ int NV12ToRGB565(const uint8* src_y, int src_stride_y, ...@@ -893,10 +880,10 @@ int NV12ToRGB565(const uint8* src_y, int src_stride_y,
// Convert NV21 to RGB565. // Convert NV21 to RGB565.
LIBYUV_API LIBYUV_API
int NV21ToRGB565(const uint8* src_y, int src_stride_y, int NV21ToRGB565(const uint8* src_y, int src_stride_y,
const uint8* src_vu, int src_stride_vu, const uint8* src_uv, int src_stride_uv,
uint8* dst_rgb565, int dst_stride_rgb565, uint8* dst_rgb565, int dst_stride_rgb565,
int width, int height) { int width, int height) {
if (!src_y || !src_vu || !dst_rgb565 || width <= 0 || height == 0) { if (!src_y || !src_uv || !dst_rgb565 || width <= 0 || height == 0) {
return -1; return -1;
} }
// Negative height means invert the image. // Negative height means invert the image.
...@@ -910,27 +897,36 @@ int NV21ToRGB565(const uint8* src_y, int src_stride_y, ...@@ -910,27 +897,36 @@ int NV21ToRGB565(const uint8* src_y, int src_stride_y,
uint8* rgb_buf, uint8* rgb_buf,
int width) = NV21ToARGBRow_C; int width) = NV21ToARGBRow_C;
#if defined(HAS_NV21TOARGBROW_SSSE3) #if defined(HAS_NV21TOARGBROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3) && width * 4 <= kMaxStride) { if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 8)) {
NV21ToARGBRow = NV21ToARGBRow_SSSE3; NV21ToARGBRow = NV21ToARGBRow_SSSE3;
} }
#elif defined(HAS_NV21TOARGBROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) {
NV21ToARGBRow = NV21ToARGBRow_NEON;
}
#endif #endif
if (width * 4 > kMaxStride) {
return -1;
}
SIMD_ALIGNED(uint8 row[kMaxStride]); SIMD_ALIGNED(uint8 row[kMaxStride]);
void (*ARGBToRGB565Row)(const uint8* src_argb, uint8* dst_rgb, int pix) = void (*ARGBToRGB565Row)(const uint8* src_argb, uint8* dst_rgb, int pix) =
ARGBToRGB565Row_C; ARGBToRGB565Row_C;
#if defined(HAS_ARGBTORGB565ROW_SSE2) #if defined(HAS_ARGBTORGB565ROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 4)) { if (TestCpuFlag(kCpuHasSSE2) && width >= 4) {
ARGBToRGB565Row = ARGBToRGB565Row_SSE2; ARGBToRGB565Row = ARGBToRGB565Row_Any_SSE2;
if (IS_ALIGNED(width, 4)) {
ARGBToRGB565Row = ARGBToRGB565Row_SSE2;
}
} }
#endif #endif
for (int y = 0; y < height; ++y) { for (int y = 0; y < height; ++y) {
NV21ToARGBRow(src_y, src_vu, row, width); NV21ToARGBRow(src_y, src_uv, row, width);
ARGBToRGB565Row(row, dst_rgb565, width); ARGBToRGB565Row(row, dst_rgb565, width);
dst_rgb565 += dst_stride_rgb565; dst_rgb565 += dst_stride_rgb565;
src_y += src_stride_y; src_y += src_stride_y;
if (y & 1) { if (y & 1) {
src_vu += src_stride_vu; src_uv += src_stride_uv;
} }
} }
return 0; return 0;
......
This diff is collapsed.
...@@ -842,6 +842,24 @@ void HalfRow_NEON(const uint8* src_uv, int src_uv_stride, ...@@ -842,6 +842,24 @@ void HalfRow_NEON(const uint8* src_uv, int src_uv_stride,
); );
} }
// Select 2 channels from ARGB on alternating pixels, e.g. BGBGBGBG.
// Plain-C fallback exported under the NEON name.
// TODO(fbarchard): Neon port (vld4.8 loads the 4 channels; vst would then
// select which 2 components to write).
void ARGBToBayerRow_NEON(const uint8* src_argb,
                         uint8* dst_bayer, uint32 selector, int pix) {
  const int sel_even = (int)(selector & 0xff);
  const int sel_odd = (int)((selector >> 8) & 0xff);
  int x;
  // Emit one Bayer byte per pixel, two pixels (8 ARGB bytes) at a time.
  for (x = 0; x + 2 <= pix; x += 2) {
    dst_bayer[x] = src_argb[x * 4 + sel_even];
    dst_bayer[x + 1] = src_argb[x * 4 + sel_odd];
  }
  if (pix & 1) {  // handle an odd trailing pixel
    dst_bayer[x] = src_argb[x * 4 + sel_even];
  }
}
#endif // __ARM_NEON__ #endif // __ARM_NEON__
#ifdef __cplusplus #ifdef __cplusplus
......
...@@ -118,6 +118,16 @@ CONST uvec8 kShuffleMaskARGBToRAW = { ...@@ -118,6 +118,16 @@ CONST uvec8 kShuffleMaskARGBToRAW = {
2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 8u, 14u, 13u, 12u, 128u, 128u, 128u, 128u 2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 8u, 14u, 13u, 12u, 128u, 128u, 128u, 128u
}; };
// Shuffle table for converting ARGBToRGB24 for I420ToRGB24. First 8 + next 4
CONST uvec8 kShuffleMaskARGBToRGB24_0 = {
0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 128u, 128u, 128u, 128u, 10u, 12u, 13u, 14u
};
// Shuffle table for converting ARGBToRAW for I422ToRAW. First 8 + next 4
CONST uvec8 kShuffleMaskARGBToRAW_0 = {
2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 128u, 128u, 128u, 128u, 8u, 14u, 13u, 12u
};
void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) { void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
asm volatile ( asm volatile (
"pcmpeqb %%xmm5,%%xmm5 \n" "pcmpeqb %%xmm5,%%xmm5 \n"
...@@ -1431,6 +1441,115 @@ void OMITFP I444ToARGBRow_SSSE3(const uint8* y_buf, ...@@ -1431,6 +1441,115 @@ void OMITFP I444ToARGBRow_SSSE3(const uint8* y_buf,
); );
} }
// Convert a row of I422 YUV to packed RGB24 (3 bytes per pixel).
// Produces 8 pixels per iteration and stores 24 bytes via one movq (8B)
// plus one unaligned movdqu (16B); 'width' is expected to be a multiple
// of 8 (the _Any wrapper handles remainders).
// READYUV422 / YUVTORGB are asm macros defined earlier in this file.
void OMITFP I422ToRGB24Row_SSSE3(const uint8* y_buf,
                                 const uint8* u_buf,
                                 const uint8* v_buf,
                                 uint8* rgb24_buf,
                                 int width) {
// fpic 32 bit gcc 4.2 on OSX runs out of GPR regs.
#ifdef __APPLE__
  // Preload the two pshufb masks in a separate asm block so the main
  // loop needs two fewer memory operands (PIC consumes a GPR here).
  asm volatile (
    "movdqa %[kShuffleMaskARGBToRGB24],%%xmm5 \n"
    "movdqa %[kShuffleMaskARGBToRGB24_0],%%xmm6 \n"
  :: [kShuffleMaskARGBToRGB24]"m"(kShuffleMaskARGBToRGB24),
     [kShuffleMaskARGBToRGB24_0]"m"(kShuffleMaskARGBToRGB24_0));
#endif
  asm volatile (
#ifndef __APPLE__
    "movdqa %[kShuffleMaskARGBToRGB24],%%xmm5 \n"
    "movdqa %[kShuffleMaskARGBToRGB24_0],%%xmm6 \n"
#endif
    "sub %[u_buf],%[v_buf] \n"  // address v as u + (v - u)
    "pxor %%xmm4,%%xmm4 \n"
    ".p2align 4 \n"
    "1: \n"
    READYUV422
    YUVTORGB
    // Interleave the three channel registers produced by YUVTORGB into
    // 4-byte pixels...
    "punpcklbw %%xmm1,%%xmm0 \n"
    "punpcklbw %%xmm2,%%xmm2 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "punpcklwd %%xmm2,%%xmm0 \n"
    "punpckhwd %%xmm2,%%xmm1 \n"
    // ...then shuffle each half down to 12 RGB bytes and splice the two
    // halves into a contiguous 24-byte run.
    "pshufb %%xmm5,%%xmm0 \n"
    "pshufb %%xmm6,%%xmm1 \n"
    "palignr $0xc,%%xmm0,%%xmm1 \n"
    "movq %%xmm0,(%[rgb24_buf]) \n"
    "movdqu %%xmm1,0x8(%[rgb24_buf]) \n"
    "lea 0x18(%[rgb24_buf]),%[rgb24_buf] \n"  // advance 24 bytes
    "sub $0x8,%[width] \n"
    "jg 1b \n"
  : [y_buf]"+r"(y_buf), // %[y_buf]
    [u_buf]"+r"(u_buf), // %[u_buf]
    [v_buf]"+r"(v_buf), // %[v_buf]
    [rgb24_buf]"+r"(rgb24_buf), // %[rgb24_buf]
    [width]"+rm"(width) // %[width]
  : [kYuvConstants]"r"(&kYuvConstants.kUVToB)
#ifndef __APPLE__
  , [kShuffleMaskARGBToRGB24]"m"(kShuffleMaskARGBToRGB24),
    [kShuffleMaskARGBToRGB24_0]"m"(kShuffleMaskARGBToRGB24_0)
#endif
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
#endif
  );
}
// Convert a row of I422 YUV to packed RAW (BGR byte order, 3 bytes per
// pixel). Same structure as I422ToRGB24Row_SSSE3 above, but using the
// byte-swapping RAW shuffle masks. Stores 24 bytes per 8 pixels (movq +
// unaligned movdqu); 'width' is expected to be a multiple of 8 (the _Any
// wrapper handles remainders).
// READYUV422 / YUVTORGB are asm macros defined earlier in this file.
void OMITFP I422ToRAWRow_SSSE3(const uint8* y_buf,
                               const uint8* u_buf,
                               const uint8* v_buf,
                               uint8* raw_buf,
                               int width) {
// fpic 32 bit gcc 4.2 on OSX runs out of GPR regs.
#ifdef __APPLE__
  // Preload the two pshufb masks separately to relieve PIC register
  // pressure in the main loop.
  asm volatile (
    "movdqa %[kShuffleMaskARGBToRAW],%%xmm5 \n"
    "movdqa %[kShuffleMaskARGBToRAW_0],%%xmm6 \n"
  :: [kShuffleMaskARGBToRAW]"m"(kShuffleMaskARGBToRAW),
     [kShuffleMaskARGBToRAW_0]"m"(kShuffleMaskARGBToRAW_0));
#endif
  asm volatile (
#ifndef __APPLE__
    "movdqa %[kShuffleMaskARGBToRAW],%%xmm5 \n"
    "movdqa %[kShuffleMaskARGBToRAW_0],%%xmm6 \n"
#endif
    "sub %[u_buf],%[v_buf] \n"  // address v as u + (v - u)
    "pxor %%xmm4,%%xmm4 \n"
    ".p2align 4 \n"
    "1: \n"
    READYUV422
    YUVTORGB
    // Interleave the channel registers into 4-byte pixels, shuffle each
    // half down to 12 RAW bytes, then splice into a 24-byte run.
    "punpcklbw %%xmm1,%%xmm0 \n"
    "punpcklbw %%xmm2,%%xmm2 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "punpcklwd %%xmm2,%%xmm0 \n"
    "punpckhwd %%xmm2,%%xmm1 \n"
    "pshufb %%xmm5,%%xmm0 \n"
    "pshufb %%xmm6,%%xmm1 \n"
    "palignr $0xc,%%xmm0,%%xmm1 \n"
    "movq %%xmm0,(%[raw_buf]) \n"
    "movdqu %%xmm1,0x8(%[raw_buf]) \n"
    "lea 0x18(%[raw_buf]),%[raw_buf] \n"  // advance 24 bytes
    "sub $0x8,%[width] \n"
    "jg 1b \n"
  : [y_buf]"+r"(y_buf), // %[y_buf]
    [u_buf]"+r"(u_buf), // %[u_buf]
    [v_buf]"+r"(v_buf), // %[v_buf]
    [raw_buf]"+r"(raw_buf), // %[raw_buf]
    [width]"+rm"(width) // %[width]
  : [kYuvConstants]"r"(&kYuvConstants.kUVToB)
#ifndef __APPLE__
  , [kShuffleMaskARGBToRAW]"m"(kShuffleMaskARGBToRAW),
    [kShuffleMaskARGBToRAW_0]"m"(kShuffleMaskARGBToRAW_0)
#endif
  : "memory", "cc"
#if defined(__SSE2__)
    // BUGFIX: xmm6 holds kShuffleMaskARGBToRAW_0 and is written by the
    // movdqa above, but was missing from the clobber list (the RGB24
    // sibling routine correctly lists it). Without it the compiler may
    // assume xmm6 is preserved across this asm block.
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
#endif
  );
}
void OMITFP I422ToARGBRow_SSSE3(const uint8* y_buf, void OMITFP I422ToARGBRow_SSSE3(const uint8* y_buf,
const uint8* u_buf, const uint8* u_buf,
const uint8* v_buf, const uint8* v_buf,
...@@ -3751,6 +3870,31 @@ void HalfRow_SSE2(const uint8* src_uv, int src_uv_stride, ...@@ -3751,6 +3870,31 @@ void HalfRow_SSE2(const uint8* src_uv, int src_uv_stride,
#endif #endif
); );
} }
// Select one byte from each ARGB pixel using the byte offsets packed in
// 'selector' (broadcast into xmm5 and applied with pshufb). Processes 4
// pixels (16 source bytes, movdqa) per iteration; callers guard with
// IS_ALIGNED(pix, 4) and 16-byte aligned src_argb/stride.
void ARGBToBayerRow_SSSE3(const uint8* src_argb, uint8* dst_bayer,
                          uint32 selector, int pix) {
  asm volatile (
    "movd %3,%%xmm5 \n"            // selector -> xmm5
    "pshufd $0x0,%%xmm5,%%xmm5 \n" // broadcast selector to all 4 dwords
    ".p2align 4 \n"
    "1: \n"
    "movdqa (%0),%%xmm0 \n"        // load 4 ARGB pixels (aligned)
    "lea 0x10(%0),%0 \n"
    "pshufb %%xmm5,%%xmm0 \n"      // pick the selected byte of each pixel
    "sub $0x4,%2 \n"
    "movd %%xmm0,(%1) \n"          // store 4 Bayer bytes
    "lea 0x4(%1),%1 \n"
    "jg 1b \n"
  : "+r"(src_argb), // %0
    "+r"(dst_bayer), // %1
    "+r"(pix) // %2
  : "g"(selector) // %3
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm5"
#endif
  );
}
#endif // defined(__x86_64__) || defined(__i386__) #endif // defined(__x86_64__) || defined(__i386__)
#ifdef __cplusplus #ifdef __cplusplus
......
...@@ -4217,6 +4217,29 @@ void HalfRow_SSE2(const uint8* src_uv, int src_uv_stride, ...@@ -4217,6 +4217,29 @@ void HalfRow_SSE2(const uint8* src_uv, int src_uv_stride,
} }
} }
// Select one byte from each ARGB pixel using the byte offsets packed in
// 'selector'. MSVC x86 version of ARGBToBayerRow_SSSE3: processes 4
// pixels (16 aligned source bytes) per iteration; callers guard with
// IS_ALIGNED(pix, 4) and 16-byte aligned src_argb/stride.
__declspec(naked) __declspec(align(16))
void ARGBToBayerRow_SSSE3(const uint8* src_argb,
                          uint8* dst_bayer, uint32 selector, int pix) {
  __asm {
    // Naked function: no prologue, arguments read directly off the stack.
    mov eax, [esp + 4] // src_argb
    mov edx, [esp + 8] // dst_bayer
    movd xmm5, [esp + 12] // selector
    mov ecx, [esp + 16] // pix
    pshufd xmm5, xmm5, 0 // broadcast selector to all 4 dwords
    align 16
  wloop:
    movdqa xmm0, [eax] // load 4 ARGB pixels (aligned)
    lea eax, [eax + 16]
    pshufb xmm0, xmm5 // pick the selected byte of each pixel
    sub ecx, 4
    movd [edx], xmm0 // store 4 Bayer bytes
    lea edx, [edx + 4]
    jg wloop
    ret
  }
}
#endif // _M_IX86 #endif // _M_IX86
#ifdef __cplusplus #ifdef __cplusplus
......
This diff is collapsed.
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment