rewrite ARGBToI420 with SSSE3

TEST=talk unittests
BUG=none
Review URL: http://webrtc-codereview.appspot.com/251003
git-svn-id: http://libyuv.googlecode.com/svn/trunk@46 16f28f9a-4ce2-e073-06de-1de4eb20be90

rewrite ARGBToI420 with SSSE3
TEST=talk unittests
BUG=none
Review URL: http://webrtc-codereview.appspot.com/251003
git-svn-id: http://libyuv.googlecode.com/svn/trunk@46 16f28f9a-4ce2-e073-06de-1de4eb20be90
585a1261 · fbarchard@google.com · 8cfa3073 · 585a1261 · 585a1261 · 585a1261
Commit 585a1261 authored Oct 28, 2011 by fbarchard@google.com
7 changed files
--- a/include/libyuv/convert.h
+++ b/include/libyuv/convert.h
@@ -16,93 +16,81 @@
 namespace libyuv {
-int
+int I420ToRGB24(const uint8* src_y, int src_stride_y,
-I420ToRGB24(const uint8* src_yplane, int src_ystride,
+                const uint8* src_u, int src_stride_u,
-            const uint8* src_uplane, int src_ustride,
+                const uint8* src_v, int src_stride_v,
-            const uint8* src_vplane, int src_vstride,
+                uint8* dst_frame, int dst_stride_frame,
-            uint8* dst_frame, int dst_stride,
+                int width, int height);
-            int src_width, int src_height);
+int I420ToARGB4444(const uint8* src_y, int src_stride_y,
-int
+                   const uint8* src_u, int src_stride_u,
-I420ToARGB4444(const uint8* src_yplane, int src_ystride,
+                   const uint8* src_v, int src_stride_v,
-               const uint8* src_uplane, int src_ustride,
+                   uint8* dst_frame, int dst_stride_frame,
-               const uint8* src_vplane, int src_vstride,
+                   int width, int height);
-               uint8* dst_frame, int dst_stride,
-               int src_width, int src_height);
+int I420ToRGB565(const uint8* src_y, int src_stride_y,
+                 const uint8* src_u, int src_stride_u,
-int
+                 const uint8* src_v, int src_stride_v,
-I420ToRGB565(const uint8* src_yplane, int src_ystride,
+                 uint8* dst_frame, int dst_stride_frame,
-             const uint8* src_uplane, int src_ustride,
+                 int width, int height);
-             const uint8* src_vplane, int src_vstride,
-             uint8* dst_frame, int dst_stride,
+int I420ToARGB1555(const uint8* src_y, int src_stride_y,
-             int src_width, int src_height);
+                   const uint8* src_u, int src_stride_u,
+                   const uint8* src_v, int src_stride_v,
-int
+                   uint8* dst_frame, int dst_stride_frame,
-I420ToARGB1555(const uint8* src_yplane, int src_ystride,
+                   int width, int height);
-               const uint8* src_uplane, int src_ustride,
-               const uint8* src_vplane, int src_vstride,
+int I420ToYUY2(const uint8* src_y, int src_stride_y,
-               uint8* dst_frame, int dst_stride,
+               const uint8* src_u, int src_stride_u,
-               int src_width, int src_height);
+               const uint8* src_v, int src_stride_v,
+               uint8* dst_frame, int dst_stride_frame,
-int
+               int width, int height);
-I420ToYUY2(const uint8* src_yplane, int src_ystride,
-           const uint8* src_uplane, int src_ustride,
+int I420ToUYVY(const uint8* src_y, int src_stride_y,
-           const uint8* src_vplane, int src_vstride,
+               const uint8* src_u, int src_stride_u,
-           uint8* dst_frame, int dst_stride,
+               const uint8* src_v, int src_stride_v,
-           int src_width, int src_height);
+               uint8* dst_frame, int dst_stride_frame,
+               int width, int height);
-int
-I420ToUYVY(const uint8* src_yplane, int src_ystride,
+// TODO(fbarchard): Deprecated - this is same as BG24ToARGB with -height
-           const uint8* src_uplane, int src_ustride,
+int RGB24ToARGB(const uint8* src_frame, int src_stride_frame,
-           const uint8* src_vplane, int src_vstride,
+                uint8* dst_frame, int dst_stride_frame,
-           uint8* dst_frame, int dst_stride,
+                int width, int height);
-           int src_width, int src_height);
+int RGB24ToI420(const uint8* src_frame, int src_stride_frame,
-int
+                uint8* dst_y, int dst_stride_y,
-RGB24ToARGB(const uint8* src_frame, int src_stride,
+                uint8* dst_u, int dst_stride_u,
-            uint8* dst_frame, int dst_stride,
+                uint8* dst_v, int dst_stride_v,
-            int src_width, int src_height);
+                int width, int height);
-int
+int RAWToI420(const uint8* src_frame, int src_stride_frame,
-RGB24ToI420(const uint8* src_frame, int src_stride,
+              uint8* dst_y, int dst_stride_y,
-            uint8* dst_yplane, int dst_ystride,
+              uint8* dst_u, int dst_stride_u,
-            uint8* dst_uplane, int dst_ustride,
+              uint8* dst_v, int dst_stride_v,
-            uint8* dst_vplane, int dst_vstride,
+              int width, int height);
-            int src_width, int src_height);
+int ABGRToI420(const uint8* src_frame, int src_stride_frame,
-int
+               uint8* dst_y, int dst_stride_y,
-RAWToI420(const uint8* src_frame, int src_stride,
+               uint8* dst_u, int dst_stride_u,
-          uint8* dst_yplane, int dst_ystride,
+               uint8* dst_v, int dst_stride_v,
-          uint8* dst_uplane, int dst_ustride,
+               int width, int height);
-          uint8* dst_vplane, int dst_vstride,
-          int src_width, int src_height);
+int BGRAToI420(const uint8* src_frame, int src_stride_frame,
+               uint8* dst_y, int dst_stride_y,
-int
+               uint8* dst_u, int dst_stride_u,
-ABGRToI420(const uint8* src_frame, int src_stride,
+               uint8* dst_v, int dst_stride_v,
-           uint8* dst_yplane, int dst_ystride,
+               int width, int height);
-           uint8* dst_uplane, int dst_ustride,
-           uint8* dst_vplane, int dst_vstride,
+int ARGBToI420(const uint8* src_frame, int src_stride_frame,
-           int src_width, int src_height);
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_u, int dst_stride_u,
-int
+               uint8* dst_v, int dst_stride_v,
-BGRAToI420(const uint8* src_frame, int src_stride,
+               int width, int height);
-           uint8* dst_yplane, int dst_ystride,
-           uint8* dst_uplane, int dst_ustride,
+int NV12ToRGB565(const uint8* src_y, int src_stride_y,
-           uint8* dst_vplane, int dst_vstride,
+                 const uint8* src_uv, int src_stride_uv,
-           int src_width, int src_height);
+                 uint8* dst_frame, int dst_stride_frame,
+                 int width, int height);
-int
-ARGBToI420(const uint8* src_frame, int src_stride,
-           uint8* dst_yplane, int dst_ystride,
-           uint8* dst_uplane, int dst_ustride,
-           uint8* dst_vplane, int dst_vstride,
-           int src_width, int src_height);
-int
-NV12ToRGB565(const uint8* src_yplane, int src_ystride,
-             const  uint8* src_uvplane, int src_uvstride,
-             uint8* dst_frame, int dst_stride,
-             int src_width, int src_height);
 } //  namespace libyuv

--- a/source/convert.cc
+++ b/source/convert.cc
@@ -10,8 +10,10 @@
 #include "libyuv/convert.h"
-#include "libyuv/basic_types.h"
 #include "conversion_tables.h"
+#include "libyuv/basic_types.h"
+#include "libyuv/cpu_id.h"
+#include "row.h"
 //#define SCALEOPT //Currently for windows only. June 2010
@@ -30,29 +32,29 @@ static inline uint8 Clip(int32 val) {
  return (uint8) val;
 }
-int I420ToRGB24(const uint8* src_yplane, int src_ystride,
+int I420ToRGB24(const uint8* src_y, int src_stride_y,
-                const uint8* src_uplane, int src_ustride,
+                const uint8* src_u, int src_stride_u,
-                const uint8* src_vplane, int src_vstride,
+                const uint8* src_v, int src_stride_v,
-                uint8* dst_frame, int dst_stride,
+                uint8* dst_frame, int dst_stride_frame,
-                int src_width, int src_height)
+                int width, int height) {
-{
+  if (src_y == NULL || src_u == NULL || src_v == NULL || dst_frame == NULL) {
-  if (src_yplane == NULL || src_uplane == NULL || src_vplane == NULL ||
-      dst_frame == NULL)
    return -1;
+  }
  // RGB orientation - bottom up
-  uint8* out = dst_frame + dst_stride * src_height - dst_stride;
+  // TODO(fbarchard): support inversion
-  uint8* out2 = out - dst_stride;
+  uint8* out = dst_frame + dst_stride_frame * height - dst_stride_frame;
+  uint8* out2 = out - dst_stride_frame;
  int h, w;
  int tmp_r, tmp_g, tmp_b;
  const uint8 *y1, *y2 ,*u, *v;
-  y1 = src_yplane;
+  y1 = src_y;
-  y2 = y1 + src_ystride;
+  y2 = y1 + src_stride_y;
-  u = src_uplane;
+  u = src_u;
-  v = src_vplane;
+  v = src_v;
-  for (h = ((src_height + 1) >> 1); h > 0; h--){
+  for (h = ((height + 1) >> 1); h > 0; h--){
    // 2 rows at a time, 2 y's at a time
-    for (w = 0; w < ((src_width + 1) >> 1); w++){
+    for (w = 0; w < ((width + 1) >> 1); w++){
      // Vertical and horizontal sub-sampling
      tmp_r = (int32)((mapYc[y1[0]] + mapVcr[v[0]] + 128) >> 8);
      tmp_g = (int32)((mapYc[y1[0]] + mapUcg[u[0]] + mapVcg[v[0]] + 128) >> 8);
@@ -89,41 +91,40 @@ int I420ToRGB24(const uint8* src_yplane, int src_ystride,
      u++;
      v++;
    }
-    y1 += src_ystride + src_ystride - src_width;
+    y1 += src_stride_y + src_stride_y - width;
-    y2 += src_ystride + src_ystride - src_width;
+    y2 += src_stride_y + src_stride_y - width;
-    u += src_ustride - ((src_width + 1) >> 1);
+    u += src_stride_u - ((width + 1) >> 1);
-    v += src_vstride - ((src_width + 1) >> 1);
+    v += src_stride_v - ((width + 1) >> 1);
-    out -= dst_stride * 3;
+    out -= dst_stride_frame * 3;
-    out2 -= dst_stride * 3;
+    out2 -= dst_stride_frame * 3;
  } // end height for
  return 0;
 }
 // Little Endian...
-int I420ToARGB4444(const uint8* src_yplane, int src_ystride,
+int I420ToARGB4444(const uint8* src_y, int src_stride_y,
-                   const uint8* src_uplane, int src_ustride,
+                   const uint8* src_u, int src_stride_u,
-                   const uint8* src_vplane, int src_vstride,
+                   const uint8* src_v, int src_stride_v,
-                   uint8* dst_frame, int dst_stride,
+                   uint8* dst_frame, int dst_stride_frame,
-                   int src_width, int src_height)
+                   int width, int height) {
-{
+  if (src_y == NULL || src_u == NULL || src_v == NULL || dst_frame == NULL) {
-  if (src_yplane == NULL || src_uplane == NULL || src_vplane == NULL ||
-      dst_frame == NULL)
    return -1;
+  }
  // RGB orientation - bottom up
-  uint8* out = dst_frame + dst_stride * (src_height - 1);
+  uint8* out = dst_frame + dst_stride_frame * (height - 1);
-  uint8* out2 = out - dst_stride;
+  uint8* out2 = out - dst_stride_frame;
  int tmp_r, tmp_g, tmp_b;
  const uint8 *y1,*y2, *u, *v;
-  y1 = src_yplane;
+  y1 = src_y;
-  y2 = y1 + src_ystride;
+  y2 = y1 + src_stride_y;
-  u = src_uplane;
+  u = src_u;
-  v = src_vplane;
+  v = src_v;
  int h, w;
-  for (h = ((src_height + 1) >> 1); h > 0; h--){
+  for (h = ((height + 1) >> 1); h > 0; h--) {
    // 2 rows at a time, 2 y's at a time
-    for (w = 0; w < ((src_width + 1) >> 1); w++){
+    for (w = 0; w < ((width + 1) >> 1); w++) {
        // Vertical and horizontal sub-sampling
        // Convert to RGB888 and re-scale to 4 bits
        tmp_r = (int32)((mapYc[y1[0]] + mapVcr[v[0]] + 128) >> 8);
@@ -157,51 +158,50 @@ int I420ToARGB4444(const uint8* src_yplane, int src_ystride,
        u++;
        v++;
    }
-    y1 += 2 * src_ystride - src_width;
+    y1 += 2 * src_stride_y - width;
-    y2 += 2 * src_ystride - src_width;
+    y2 += 2 * src_stride_y - width;
-    u += src_ustride - ((src_width + 1) >> 1);
+    u += src_stride_u - ((width + 1) >> 1);
-    v += src_vstride - ((src_width + 1) >> 1);
+    v += src_stride_v - ((width + 1) >> 1);
-    out -= (dst_stride + src_width) * 2;
+    out -= (dst_stride_frame + width) * 2;
-    out2 -= (dst_stride + src_width) * 2;
+    out2 -= (dst_stride_frame + width) * 2;
  } // end height for
  return 0;
 }
-int I420ToRGB565(const uint8* src_yplane, int src_ystride,
+int I420ToRGB565(const uint8* src_y, int src_stride_y,
-                 const uint8* src_uplane, int src_ustride,
+                 const uint8* src_u, int src_stride_u,
-                 const uint8* src_vplane, int src_vstride,
+                 const uint8* src_v, int src_stride_v,
-                 uint8* dst_frame, int dst_stride,
+                 uint8* dst_frame, int dst_stride_frame,
-                 int src_width, int src_height)
+                 int width, int height) {
-{
+  if (src_y == NULL || src_u == NULL || src_v == NULL || dst_frame == NULL) {
-  if (src_yplane == NULL || src_uplane == NULL || src_vplane == NULL ||
-      dst_frame == NULL)
    return -1;
+  }
  // Negative height means invert the image.
-  if (src_height < 0) {
+  if (height < 0) {
-    src_height = -src_height;
+    height = -height;
-    src_yplane = src_yplane + (src_height - 1) * src_ystride;
+    src_y = src_y + (height - 1) * src_stride_y;
-    src_uplane = src_uplane + (src_height - 1) * src_ustride;
+    src_u = src_u + (height - 1) * src_stride_u;
-    src_vplane = src_vplane + (src_height - 1) * src_vstride;
+    src_v = src_v + (height - 1) * src_stride_v;
-    src_ystride = -src_ystride;
+    src_stride_y = -src_stride_y;
-    src_ustride = -src_ustride;
+    src_stride_u = -src_stride_u;
-    src_vstride = -src_vstride;
+    src_stride_v = -src_stride_v;
  }
-  uint16* out = (uint16*)(dst_frame) + dst_stride * (src_height - 1);
+  uint16* out = (uint16*)(dst_frame) + dst_stride_frame * (height - 1);
-  uint16* out2 = out - dst_stride;
+  uint16* out2 = out - dst_stride_frame;
  int tmp_r, tmp_g, tmp_b;
-  const uint8 *y1,*y2, *u, *v;
+  const uint8* y1,* y2, * u, * v;
-  y1 = src_yplane;
+  y1 = src_y;
-  y2 = y1 + src_ystride;
+  y2 = y1 + src_stride_y;
-  u = src_uplane;
+  u = src_u;
-  v = src_vplane;
+  v = src_v;
  int h, w;
-  for (h = ((src_height + 1) >> 1); h > 0; h--){
+  for (h = ((height + 1) >> 1); h > 0; h--){
    // 2 rows at a time, 2 y's at a time
-    for (w = 0; w < ((src_width + 1) >> 1); w++){
+    for (w = 0; w < ((width + 1) >> 1); w++){
      // Vertical and horizontal sub-sampling
      // 1. Convert to RGB888
      // 2. Shift to adequate location (in the 16 bit word) - RGB 565
@@ -237,41 +237,39 @@ int I420ToRGB565(const uint8* src_yplane, int src_ystride,
      u++;
      v++;
    }
-    y1 += 2 * src_ystride - src_width;
+    y1 += 2 * src_stride_y - width;
-    y2 += 2 * src_ystride - src_width;
+    y2 += 2 * src_stride_y - width;
-    u += src_ustride - ((src_width + 1) >> 1);
+    u += src_stride_u - ((width + 1) >> 1);
-    v += src_vstride - ((src_width + 1) >> 1);
+    v += src_stride_v - ((width + 1) >> 1);
-    out -= 2 * dst_stride + src_width;
+    out -= 2 * dst_stride_frame + width;
-    out2 -=  2 * dst_stride + src_width;
+    out2 -=  2 * dst_stride_frame + width;
  }
  return 0;
 }
-int I420ToARGB1555(const uint8* src_yplane, int src_ystride,
+int I420ToARGB1555(const uint8* src_y, int src_stride_y,
-                   const uint8* src_uplane, int src_ustride,
+                   const uint8* src_u, int src_stride_u,
-                   const uint8* src_vplane, int src_vstride,
+                   const uint8* src_v, int src_stride_v,
-                   uint8* dst_frame, int dst_stride,
+                   uint8* dst_frame, int dst_stride_frame,
-                   int src_width, int src_height)
+                   int width, int height) {
-{
+  if (src_y == NULL || src_u == NULL || src_v == NULL || dst_frame == NULL) {
-  if (src_yplane == NULL || src_uplane == NULL || src_vplane == NULL ||
+    return -1;
-      dst_frame == NULL){
-     return -1;
  }
-  uint16* out = (uint16*)(dst_frame) + dst_stride * (src_height - 1);
+  uint16* out = (uint16*)(dst_frame) + dst_stride_frame * (height - 1);
-  uint16* out2 = out - dst_stride ;
+  uint16* out2 = out - dst_stride_frame ;
  int32 tmp_r, tmp_g, tmp_b;
  const uint8 *y1,*y2, *u, *v;
  int h, w;
-  y1 = src_yplane;
+  y1 = src_y;
-  y2 = y1 + src_ystride;
+  y2 = y1 + src_stride_y;
-  u = src_uplane;
+  u = src_u;
-  v = src_vplane;
+  v = src_v;
-  for (h = ((src_height + 1) >> 1); h > 0; h--){
+  for (h = ((height + 1) >> 1); h > 0; h--){
    // 2 rows at a time, 2 y's at a time
-    for (w = 0; w < ((src_width + 1) >> 1); w++){
+    for (w = 0; w < ((width + 1) >> 1); w++){
      // Vertical and horizontal sub-sampling
      // 1. Convert to RGB888
      // 2. Shift to adequate location (in the 16 bit word) - RGB 555
@@ -307,41 +305,37 @@ int I420ToARGB1555(const uint8* src_yplane, int src_ystride,
      u++;
      v++;
    }
-    y1 += 2 * src_ystride - src_width;
+    y1 += 2 * src_stride_y - width;
-    y2 += 2 * src_ystride - src_width;
+    y2 += 2 * src_stride_y - width;
-    u += src_ustride - ((src_width + 1) >> 1);
+    u += src_stride_u - ((width + 1) >> 1);
-    v += src_vstride - ((src_width + 1) >> 1);
+    v += src_stride_v - ((width + 1) >> 1);
-    out -= 2 * dst_stride + src_width;
+    out -= 2 * dst_stride_frame + width;
-    out2 -=  2 * dst_stride + src_width;
+    out2 -=  2 * dst_stride_frame + width;
  }
  return 0;
 }
-int I420ToYUY2(const uint8* src_yplane, int src_ystride,
+int I420ToYUY2(const uint8* src_y, int src_stride_y,
-               const uint8* src_uplane, int src_ustride,
+               const uint8* src_u, int src_stride_u,
-               const uint8* src_vplane, int src_vstride,
+               const uint8* src_v, int src_stride_v,
-               uint8* dst_frame, int dst_stride,
+               uint8* dst_frame, int dst_stride_frame,
-               int src_width, int src_height)
+               int width, int height) {
-{
+  if (src_y == NULL || src_u == NULL || src_v == NULL || dst_frame == NULL) {
-  if (src_yplane == NULL || src_uplane == NULL || src_vplane == NULL ||
-      dst_frame == NULL){
    return -1;
  }
-  const uint8* in1 = src_yplane;
+  const uint8* in1 = src_y;
-  const uint8* in2 = src_yplane + src_ystride ;
+  const uint8* in2 = src_y + src_stride_y;
-  const uint8* src_u = src_uplane;
-  const uint8* src_v = src_vplane;
  uint8* out1 = dst_frame;
-  uint8* out2 = dst_frame + dst_stride;
+  uint8* out2 = dst_frame + dst_stride_frame;
  // YUY2 - Macro-pixel = 2 image pixels
  // Y0U0Y1V0....Y2U2Y3V2...Y4U4Y5V4....
 #ifndef SCALEOPT
-  for (int i = 0; i < ((src_height + 1) >> 1); i++){
+  for (int i = 0; i < ((height + 1) >> 1); i++){
-    for (int j = 0; j < ((src_width + 1) >> 1); j++){
+    for (int j = 0; j < ((width + 1) >> 1); j++){
      out1[0] = in1[0];
      out1[1] = *src_u;
      out1[2] = in1[1];
@@ -358,16 +352,15 @@ int I420ToYUY2(const uint8* src_yplane, int src_ystride,
      in1 += 2;
      in2 += 2;
    }
-    in1 += 2 * src_ystride - src_width;
+    in1 += 2 * src_stride_y - width;
-    in2 += 2 * src_ystride - src_width;
+    in2 += 2 * src_stride_y - width;
-    src_u += src_ustride - ((src_width + 1) >> 1);
+    src_u += src_stride_u - ((width + 1) >> 1);
-    src_v += src_vstride - ((src_width + 1) >> 1);
+    src_v += src_stride_v - ((width + 1) >> 1);
-    out1 += dst_stride + dst_stride - 2 * src_width;
+    out1 += dst_stride_frame + dst_stride_frame - 2 * width;
-    out2 += dst_stride + dst_stride - 2 * src_width;
+    out2 += dst_stride_frame + dst_stride_frame - 2 * width;
  }
 #else
-  for (WebRtc_UWord32 i = 0; i < ((height + 1) >> 1);i++)
+  for (WebRtc_UWord32 i = 0; i < ((height + 1) >> 1);i++) {
-  {
    int32 width__ = (width >> 4);
    _asm
    {
@@ -424,40 +417,39 @@ int I420ToYUY2(const uint8* src_yplane, int src_ystride,
      ;popa
      emms
    }
-    in1 += 2 * src_ystride - src_width;
+    in1 += 2 * src_stride_y - width;
-    in2 += 2 * src_ystride - src_width;
+    in2 += 2 * src_stride_y - width;
-    out1 += dst_stride + dst_stride - 2 * width;
+    out1 += dst_stride_frame + dst_stride_frame - 2 * width;
-    out2 += dst_stride + dst_stride - 2 * width;
+    out2 += dst_stride_frame + dst_stride_frame - 2 * width;
  }
 #endif
  return 0;
 }
-int I420ToUYVY(const uint8* src_yplane, int src_ystride,
+int I420ToUYVY(const uint8* src_y, int src_stride_y,
-               const uint8* src_uplane, int src_ustride,
+               const uint8* src_u, int src_stride_u,
-               const uint8* src_vplane, int src_vstride,
+               const uint8* src_v, int src_stride_v,
-               uint8* dst_frame, int dst_stride,
+               uint8* dst_frame, int dst_stride_frame,
-               int src_width, int src_height)
+               int width, int height) {
-{
+  if (src_y == NULL || src_u == NULL || src_v == NULL || dst_frame == NULL) {
-  if (src_yplane == NULL || src_uplane == NULL || src_vplane == NULL ||
-      dst_frame == NULL)
    return -1;
+  }
  int i = 0;
-  const uint8* y1 = src_yplane;
+  const uint8* y1 = src_y;
-  const uint8* y2 = y1 + src_ystride;
+  const uint8* y2 = y1 + src_stride_y;
-  const uint8* u = src_uplane;
+  const uint8* u = src_u;
-  const uint8* v = src_vplane;
+  const uint8* v = src_v;
  uint8* out1 = dst_frame;
-  uint8* out2 = dst_frame + dst_stride;
+  uint8* out2 = dst_frame + dst_stride_frame;
  // Macro-pixel = 2 image pixels
  // U0Y0V0Y1....U2Y2V2Y3...U4Y4V4Y5.....
 #ifndef SCALEOPT
-  for (; i < ((src_height + 1) >> 1);i++){
+  for (; i < ((height + 1) >> 1); i++) {
-    for (int j = 0; j < ((src_width + 1) >> 1) ;j++){
+    for (int j = 0; j < ((width + 1) >> 1); j++) {
      out1[0] = *u;
      out1[1] = y1[0];
      out1[2] = *v;
@@ -474,16 +466,15 @@ int I420ToUYVY(const uint8* src_yplane, int src_ystride,
      y1 += 2;
      y2 += 2;
    }
-    y1 += 2 * src_ystride - src_width;
+    y1 += 2 * src_stride_y - width;
-    y2 += 2 * src_ystride - src_width;
+    y2 += 2 * src_stride_y - width;
-    u += src_ustride - ((src_width + 1) >> 1);
+    u += src_stride_u - ((width + 1) >> 1);
-    v += src_vstride - ((src_width + 1) >> 1);
+    v += src_stride_v - ((width + 1) >> 1);
-    out1 += 2 * (dst_stride - src_width);
+    out1 += 2 * (dst_stride_frame - width);
-    out2 += 2 * (dst_stride - src_width);
+    out2 += 2 * (dst_stride_frame - width);
  }
 #else
-  for (; i < (height >> 1);i++)
+  for (; i < (height >> 1);i++) {
-  {
    int32 width__ = (width >> 4);
    _asm
    {
@@ -540,35 +531,35 @@ loop0:
    }
    in1 += width;
    in2 += width;
-    out1 += 2 * (dst_stride - width);
+    out1 += 2 * (dst_stride_frame - width);
-    out2 += 2 * (dst_stride - width);
+    out2 += 2 * (dst_stride_frame - width);
  }
 #endif
  return 0;
 }
-int NV12ToRGB565(const uint8* src_yplane, int src_ystride,
+int NV12ToRGB565(const uint8* src_y, int src_stride_y,
-                 const  uint8* src_uvplane, int src_uvstride,
+                 const uint8* src_uv, int src_stride_uv,
-                 uint8* dst_frame, int dst_stride,
+                 uint8* dst_frame, int dst_stride_frame,
-                 int src_width, int src_height)
+                 int width, int height) {
-{
+  if (src_y == NULL || src_uv == NULL || dst_frame == NULL) {
-  if (src_yplane == NULL || src_uvplane == NULL || dst_frame == NULL)
+    return -1;
-     return -1;
+  }
  // Bi-Planar: Y plane followed by an interlaced U and V plane
-  const uint8* interlacedSrc = src_uvplane;
+  const uint8* interlacedSrc = src_uv;
-  uint16* out = (uint16*)(src_yplane) + dst_stride * (src_height - 1);
+  uint16* out = (uint16*)(src_y) + dst_stride_frame * (height - 1);
-  uint16* out2 = out - dst_stride;
+  uint16* out2 = out - dst_stride_frame;
  int32 tmp_r, tmp_g, tmp_b;
  const uint8 *y1,*y2;
-  y1 = src_yplane;
+  y1 = src_y;
-  y2 = y1 + src_ystride;
+  y2 = y1 + src_stride_y;
  int h, w;
-  for (h = ((src_height + 1) >> 1); h > 0; h--){
+  for (h = ((height + 1) >> 1); h > 0; h--) {
    // 2 rows at a time, 2 y's at a time
-    for (w = 0; w < ((src_width + 1) >> 1); w++){
+    for (w = 0; w < ((width + 1) >> 1); w++) {
      // Vertical and horizontal sub-sampling
      // 1. Convert to RGB888
      // 2. Shift to adequate location (in the 16 bit word) - RGB 565
@@ -608,29 +599,30 @@ int NV12ToRGB565(const uint8* src_yplane, int src_ystride,
      out2 += 2;
      interlacedSrc += 2;
    }
-    y1 += 2 * src_ystride - src_width;
+    y1 += 2 * src_stride_y - width;
-    y2 += 2 * src_ystride - src_width;
+    y2 += 2 * src_stride_y - width;
-    interlacedSrc += src_uvstride - ((src_width + 1) >> 1);
+    interlacedSrc += src_stride_uv - ((width + 1) >> 1);
-    out -= 3 * dst_stride + dst_stride - src_width;
+    out -= 3 * dst_stride_frame + dst_stride_frame - width;
-    out2 -= 3 * dst_stride + dst_stride - src_width;
+    out2 -= 3 * dst_stride_frame + dst_stride_frame - width;
  }
  return 0;
 }
-int RGB24ToARGB(const uint8* src_frame, int src_stride,
+// TODO(fbarchard): Deprecated - this is same as BG24ToARGB with -height
-                uint8* dst_frame, int dst_stride,
+int RGB24ToARGB(const uint8* src_frame, int src_stride_frame,
-                int src_width, int src_height)
+                uint8* dst_frame, int dst_stride_frame,
-{
+                int width, int height) {
-  if (src_frame == NULL || dst_frame == NULL)
+  if (src_frame == NULL || dst_frame == NULL) {
    return -1;
+  }
  int i, j, offset;
  uint8* outFrame = dst_frame;
  const uint8* inFrame = src_frame;
-  outFrame += dst_stride * (src_height - 1) * 4;
+  outFrame += dst_stride_frame * (height - 1) * 4;
-  for (i = 0; i < src_height; i++){
+  for (i = 0; i < height; i++) {
-    for (j = 0; j < src_width; j++){
+    for (j = 0; j < width; j++) {
      offset = j * 4;
      outFrame[0 + offset] = inFrame[0];
      outFrame[1 + offset] = inFrame[1];
@@ -638,8 +630,8 @@ int RGB24ToARGB(const uint8* src_frame, int src_stride,
      outFrame[3 + offset] = 0xff;
      inFrame += 3;
    }
-    outFrame -= 4 * (dst_stride - src_width);
+    outFrame -= 4 * (dst_stride_frame - width);
-    inFrame += src_stride - src_width;
+    inFrame += src_stride_frame - width;
  }
  return 0;
 }
@@ -654,10 +646,10 @@ int RGB24ToARGB(const uint8* src_frame, int src_stride,
 static void \
 NAME(const uint8* src_row0, const uint8* src_row1, \
         uint8* dst_yplane0, uint8* dst_yplane1, \
-         uint8* dst_uplane, \
+         uint8* dst_u, \
-         uint8* dst_vplane, \
+         uint8* dst_v, \
-         int src_width) { \
+         int width) { \
-  for (int x = 0; x < src_width - 1; x += 2) { \
+  for (int x = 0; x < width - 1; x += 2) { \
    dst_yplane0[0] = (uint8)((src_row0[R] * 66 + \
                              src_row0[G] * 129 + \
                              src_row0[B] * 25 + 128) >> 8) + 16; \
@@ -670,14 +662,14 @@ NAME(const uint8* src_row0, const uint8* src_row1, \
    dst_yplane1[1] = (uint8)((src_row1[R + BPP] * 66 + \
                              src_row1[G + BPP] * 129 + \
                              src_row1[B + BPP] * 25 + 128) >> 8) + 16; \
-    dst_uplane[0] = (uint8)(((src_row0[R] + src_row0[R + BPP] + \
+    dst_u[0] = (uint8)(((src_row0[R] + src_row0[R + BPP] + \
                              src_row1[R] + src_row1[R + BPP]) * -38 + \
                             (src_row0[G] + src_row0[G + BPP] + \
                              src_row1[G] + src_row1[G + BPP]) * -74 + \
                             (src_row0[B] + src_row0[B + BPP] + \
                              src_row1[B] + src_row1[B + BPP]) * 112 + \
                              + 512) >> 10) + 128; \
-    dst_vplane[0] = (uint8)(((src_row0[R] + src_row0[R + BPP] + \
+    dst_v[0] = (uint8)(((src_row0[R] + src_row0[R + BPP] + \
                              src_row1[R] + src_row1[R + BPP]) * 112 + \
                             (src_row0[G] + src_row0[G + BPP] + \
                              src_row1[G] + src_row1[G + BPP]) * -94 + \
@@ -686,26 +678,26 @@ NAME(const uint8* src_row0, const uint8* src_row1, \
                              + 512) >> 10) + 128; \
    dst_yplane0 += 2; \
    dst_yplane1 += 2; \
-    ++dst_uplane; \
+    ++dst_u; \
-    ++dst_vplane; \
+    ++dst_v; \
    src_row0 += BPP * 2; \
    src_row1 += BPP * 2; \
  } \
-  if (src_width & 1) { \
+  if (width & 1) { \
    dst_yplane0[0] = (uint8)((src_row0[R] * 66 + \
                              src_row0[G] * 129 + \
                              src_row0[B] * 25 + 128) >> 8) + 16; \
    dst_yplane1[0] = (uint8)((src_row1[R] * 66 + \
                              src_row1[G] * 129 + \
                              src_row1[B] * 25 + 128) >> 8) + 16; \
-    dst_uplane[0] = (uint8)(((src_row0[R] + \
+    dst_u[0] = (uint8)(((src_row0[R] + \
                              src_row1[R]) * -38 + \
                             (src_row0[G] + \
                              src_row1[G]) * -74 + \
                             (src_row0[B] + \
                              src_row1[B]) * 112 + \
                              + 256) >> 9) + 128; \
-    dst_vplane[0] = (uint8)(((src_row0[R] + \
+    dst_v[0] = (uint8)(((src_row0[R] + \
                              src_row1[R]) * 112 + \
                             (src_row0[G] + \
                              src_row1[G]) * -94 + \
@@ -723,104 +715,157 @@ MAKEROWRGBTOI420(ABGRToI420Row_C, 0, 1, 2, 4)
 // Instantiate the two-row RGB -> I420 row converters for RGB24 and RAW input.
 // NOTE(review): the macro's parameter list is outside this hunk; the arguments
 // appear to be (function name, R offset, G offset, B offset, bytes per pixel)
 // -- confirm against the MAKEROWRGBTOI420 definition.
 MAKEROWRGBTOI420(RGB24ToI420Row_C, 2, 1, 0, 3)
 MAKEROWRGBTOI420(RAWToI420Row_C, 0, 1, 2, 3)
-static int RGBToI420(const uint8* src_frame, int src_stride,
+static int RGBToI420(const uint8* src_frame, int src_stride_frame,
-                     uint8* dst_yplane, int dst_ystride,
+                     uint8* dst_y, int dst_stride_y,
-                     uint8* dst_uplane, int dst_ustride,
+                     uint8* dst_u, int dst_stride_u,
-                     uint8* dst_vplane, int dst_vstride,
+                     uint8* dst_v, int dst_stride_v,
-                     int src_width, int src_height,
+                     int width, int height,
                     void (*RGBToI420Row)(const uint8* src_row0,
                                          const uint8* src_row1,
                                          uint8* dst_yplane0,
                                          uint8* dst_yplane1,
-                                          uint8* dst_uplane,
+                                          uint8* dst_u,
-                                          uint8* dst_vplane,
+                                          uint8* dst_v,
-                                          int src_width)) {
+                                          int width)) {
-  if (src_frame == NULL || dst_yplane == NULL ||
+  if (src_frame == NULL || dst_y == NULL ||
-      dst_vplane == NULL || dst_vplane == NULL)
+      dst_v == NULL || dst_v == NULL)
    return -1;
-  if (src_height < 0) {
+  if (height < 0) {
-    src_height = -src_height;
+    height = -height;
-    src_frame = src_frame + src_stride * (src_height -1);
+    src_frame = src_frame + src_stride_frame * (height -1);
-    src_stride = -src_stride;
+    src_stride_frame = -src_stride_frame;
  }
-  for (int y = 0; y < src_height - 1; y += 2) {
+  for (int y = 0; y < height - 1; y += 2) {
-    RGBToI420Row(src_frame, src_frame + src_stride,
+    RGBToI420Row(src_frame, src_frame + src_stride_frame,
-                 dst_yplane, dst_yplane + dst_ystride,
+                 dst_y, dst_y + dst_stride_y,
-                 dst_uplane, dst_vplane,
+                 dst_u, dst_v,
-                 src_width);
+                 width);
-    src_frame += src_stride * 2;
+    src_frame += src_stride_frame * 2;
-    dst_yplane += dst_ystride * 2;
+    dst_y += dst_stride_y * 2;
-    dst_uplane += dst_ustride;
+    dst_u += dst_stride_u;
-    dst_vplane += dst_vstride;
+    dst_v += dst_stride_v;
  }
-  if (src_height & 1) {
+  if (height & 1) {
    RGBToI420Row(src_frame, src_frame,
-                 dst_yplane, dst_yplane,
+                 dst_y, dst_y,
-                 dst_uplane, dst_vplane,
+                 dst_u, dst_v,
-                 src_width);
+                 width);
  }
  return 0;
 }
+// Reference ARGB -> I420 conversion built on the C row function
+// ARGBToI420Row_C. This change renames it from ARGBToI420 so that name can be
+// taken by the new implementation. All arguments are forwarded unchanged to
+// RGBToI420, whose return value is returned.
-int ARGBToI420(const uint8* src_frame, int src_stride,
+int ARGBToI420_Reference(const uint8* src_frame, int src_stride_frame,
-               uint8* dst_yplane, int dst_ystride,
+               uint8* dst_y, int dst_stride_y,
-               uint8* dst_uplane, int dst_ustride,
+               uint8* dst_u, int dst_stride_u,
-               uint8* dst_vplane, int dst_vstride,
+               uint8* dst_v, int dst_stride_v,
-               int src_width, int src_height) {
+               int width, int height) {
-  return RGBToI420(src_frame, src_stride,
+  return RGBToI420(src_frame, src_stride_frame,
-                   dst_yplane, dst_ystride,
+                   dst_y, dst_stride_y,
-                   dst_uplane, dst_ustride,
+                   dst_u, dst_stride_u,
-                   dst_vplane, dst_vstride,
+                   dst_v, dst_stride_v,
-                   src_width, src_height, ARGBToI420Row_C);
+                   width, height, ARGBToI420Row_C);
 }
+// Converts a BGRA frame to planar I420 by forwarding every argument to
+// RGBToI420 with BGRAToI420Row_C as the row function. Returns whatever
+// RGBToI420 returns. Only the parameter names change in this revision.
-int BGRAToI420(const uint8* src_frame, int src_stride,
+int BGRAToI420(const uint8* src_frame, int src_stride_frame,
-               uint8* dst_yplane, int dst_ystride,
+               uint8* dst_y, int dst_stride_y,
-               uint8* dst_uplane, int dst_ustride,
+               uint8* dst_u, int dst_stride_u,
-               uint8* dst_vplane, int dst_vstride,
+               uint8* dst_v, int dst_stride_v,
-               int src_width, int src_height) {
+               int width, int height) {
-  return RGBToI420(src_frame, src_stride,
+  return RGBToI420(src_frame, src_stride_frame,
-                   dst_yplane, dst_ystride,
+                   dst_y, dst_stride_y,
-                   dst_uplane, dst_ustride,
+                   dst_u, dst_stride_u,
-                   dst_vplane, dst_vstride,
+                   dst_v, dst_stride_v,
-                   src_width, src_height, BGRAToI420Row_C);
+                   width, height, BGRAToI420Row_C);
 }
+// Converts an ABGR frame to planar I420 by forwarding every argument to
+// RGBToI420 with ABGRToI420Row_C as the row function. Returns whatever
+// RGBToI420 returns. Only the parameter names change in this revision.
-int ABGRToI420(const uint8* src_frame, int src_stride,
+int ABGRToI420(const uint8* src_frame, int src_stride_frame,
-               uint8* dst_yplane, int dst_ystride,
+               uint8* dst_y, int dst_stride_y,
-               uint8* dst_uplane, int dst_ustride,
+               uint8* dst_u, int dst_stride_u,
-               uint8* dst_vplane, int dst_vstride,
+               uint8* dst_v, int dst_stride_v,
-               int src_width, int src_height) {
+               int width, int height) {
-  return RGBToI420(src_frame, src_stride,
+  return RGBToI420(src_frame, src_stride_frame,
-                   dst_yplane, dst_ystride,
+                   dst_y, dst_stride_y,
-                   dst_uplane, dst_ustride,
+                   dst_u, dst_stride_u,
-                   dst_vplane, dst_vstride,
+                   dst_v, dst_stride_v,
-                   src_width, src_height, ABGRToI420Row_C);
+                   width, height, ABGRToI420Row_C);
 }
+// Converts an RGB24 frame to planar I420 by forwarding every argument to
+// RGBToI420 with RGB24ToI420Row_C as the row function. Returns whatever
+// RGBToI420 returns. Only the parameter names change in this revision.
-int RGB24ToI420(const uint8* src_frame, int src_stride,
+int RGB24ToI420(const uint8* src_frame, int src_stride_frame,
-                uint8* dst_yplane, int dst_ystride,
+                uint8* dst_y, int dst_stride_y,
-                uint8* dst_uplane, int dst_ustride,
+                uint8* dst_u, int dst_stride_u,
-                uint8* dst_vplane, int dst_vstride,
+                uint8* dst_v, int dst_stride_v,
-                int src_width, int src_height) {
+                int width, int height) {
-  return RGBToI420(src_frame, src_stride,
+  return RGBToI420(src_frame, src_stride_frame,
-                   dst_yplane, dst_ystride,
+                   dst_y, dst_stride_y,
-                   dst_uplane, dst_ustride,
+                   dst_u, dst_stride_u,
-                   dst_vplane, dst_vstride,
+                   dst_v, dst_stride_v,
-                   src_width, src_height, RGB24ToI420Row_C);
+                   width, height, RGB24ToI420Row_C);
 }
+// Converts a RAW frame to planar I420 by forwarding every argument to
+// RGBToI420 with RAWToI420Row_C as the row function. Returns whatever
+// RGBToI420 returns. Only the parameter names change in this revision.
-int RAWToI420(const uint8* src_frame, int src_stride,
+int RAWToI420(const uint8* src_frame, int src_stride_frame,
-              uint8* dst_yplane, int dst_ystride,
+              uint8* dst_y, int dst_stride_y,
-              uint8* dst_uplane, int dst_ustride,
+              uint8* dst_u, int dst_stride_u,
-              uint8* dst_vplane, int dst_vstride,
+              uint8* dst_v, int dst_stride_v,
-              int src_width, int src_height) {
+              int width, int height) {
-  return RGBToI420(src_frame, src_stride,
+  return RGBToI420(src_frame, src_stride_frame,
-                   dst_yplane, dst_ystride,
+                   dst_y, dst_stride_y,
-                   dst_uplane, dst_ustride,
+                   dst_u, dst_stride_u,
-                   dst_vplane, dst_vstride,
+                   dst_v, dst_stride_v,
-                   src_width, src_height, RAWToI420Row_C);
+                   width, height, RAWToI420Row_C);
+}
+// Converts an ARGB frame to planar I420 using separate Y and UV row
+// functions, choosing the SSSE3 versions when they are compiled in, the CPU
+// supports SSSE3, and the width/alignment requirements below are met.
+//   src_frame, src_stride_frame: ARGB source and its byte stride.
+//   dst_y, dst_u, dst_v (+ strides): destination planes. U and V advance one
+//     row for every two source rows.
+//   width, height: frame size in pixels. A negative height reads the source
+//     bottom-up (vertical flip).
+// Returns 0 on success, or -1 when any of the four plane pointers is NULL.
+int ARGBToI420(const uint8* src_frame, int src_stride_frame,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_u, int dst_stride_u,
+               uint8* dst_v, int dst_stride_v,
+               int width, int height) {
+  // Reject missing planes up front; the RGBToI420-based implementation this
+  // function replaces reported NULL arguments with -1 instead of crashing.
+  if (src_frame == NULL || dst_y == NULL ||
+      dst_u == NULL || dst_v == NULL) {
+    return -1;
+  }
+  // Negative height: start at the last source row and step backwards.
+  if (height < 0) {
+    height = -height;
+    src_frame = src_frame + (height - 1) * src_stride_frame;
+    src_stride_frame = -src_stride_frame;
+  }
+  void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix);
+  void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
+                      uint8* dst_u, uint8* dst_v, int width);
+  // The SSSE3 Y row is used only for widths that are a multiple of 8 with a
+  // 16-byte aligned source and an 8-byte aligned Y plane.
+#if defined(HAS_ARGBTOYROW_SSSE3)
+  if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) &&
+      (width % 8 == 0) &&
+      IS_ALIGNED(src_frame, 16) && (src_stride_frame % 16 == 0) &&
+      IS_ALIGNED(dst_y, 8) && (dst_stride_y % 8 == 0)) {
+    ARGBToYRow = ARGBToYRow_SSSE3;
+  } else
+#endif
+  {
+    ARGBToYRow = ARGBToYRow_C;
+  }
+  // Same source constraints for the UV row, plus 4-byte aligned U and V.
+#if defined(HAS_ARGBTOUVROW_SSSE3)
+  if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) &&
+      (width % 8 == 0) &&
+      IS_ALIGNED(src_frame, 16) && (src_stride_frame % 16 == 0) &&
+      IS_ALIGNED(dst_u, 4) && (dst_stride_u % 4 == 0) &&
+      IS_ALIGNED(dst_v, 4) && (dst_stride_v % 4 == 0)) {
+    ARGBToUVRow = ARGBToUVRow_SSSE3;
+  } else
+#endif
+  {
+    ARGBToUVRow = ARGBToUVRow_C;
+  }
+  // Two source rows yield two Y rows and one subsampled U/V row.
+  for (int y = 0; y < (height - 1); y += 2) {
+    ARGBToYRow(src_frame, dst_y, width);
+    ARGBToYRow(src_frame + src_stride_frame, dst_y + dst_stride_y, width);
+    ARGBToUVRow(src_frame, src_stride_frame, dst_u, dst_v, width);
+    src_frame += src_stride_frame * 2;
+    dst_y += dst_stride_y * 2;
+    dst_u += dst_stride_u;
+    dst_v += dst_stride_v;
+  }
+  // Odd height: a stride of 0 makes the UV row average the last row with
+  // itself.
+  if (height & 1) {
+    ARGBToYRow(src_frame, dst_y, width);
+    ARGBToUVRow(src_frame, 0, dst_u, dst_v, width);
+  }
+  return 0;
 }
 } // namespace libyuv
--- a/source/format_conversion.cc
+++ b/source/format_conversion.cc
@@ -12,21 +12,10 @@
 #include "libyuv/cpu_id.h"
 #include "video_common.h"
+#include "row.h"
 namespace libyuv {
-// Most code in here is inspired by the material at
-// http://www.siliconimaging.com/RGB%20Bayer.htm
-// Forces compiler to inline, even against its better judgement. Use wisely.
-#if defined(__GNUC__)
-#define FORCE_INLINE __attribute__((always_inline))
-#elif defined(WIN32)
-#define FORCE_INLINE __forceinline
-#else
-#define FORCE_INLINE
-#endif
 // Note: to do this with Neon vld4.8 would load ARGB values into 4 registers
 // and vst would select which 2 components to write.  The low level would need
 // to be ARGBToBG, ARGBToGB, ARGBToRG, ARGBToGR
@@ -333,46 +322,6 @@ int BayerRGBToARGB(const uint8* src_bayer, int src_stride_bayer,
  return 0;
 }
-// Taken from http://en.wikipedia.org/wiki/YUV
-static FORCE_INLINE int RGBToY(uint8 r, uint8 g, uint8 b) {
-  return (( 66 * r + 129 * g +  25 * b + 128) >> 8) + 16;
-}
-static FORCE_INLINE int RGBToU(uint8 r, uint8 g, uint8 b) {
-  return ((-38 * r -  74 * g + 112 * b + 128) >> 8) + 128;
-}
-static FORCE_INLINE int RGBToV(uint8 r, uint8 g, uint8 b) {
-  return ((112 * r -  94 * g -  18 * b + 128) >> 8) + 128;
-}
-static void ARGBtoYRow(const uint8* src_argb0,
-                       uint8* dst_y, int width) {
-  for (int x = 0; x < width; ++x) {
-    dst_y[0] = RGBToY(src_argb0[2], src_argb0[1], src_argb0[0]);
-    src_argb0 += 4;
-    dst_y += 1;
-  }
-}
-static void ARGBtoUVRow(const uint8* src_argb0, int src_stride_argb,
-                        uint8* dst_u,
-                        uint8* dst_v,
-                        int width) {
-  const uint8* src_argb1 = src_argb0 + src_stride_argb;
-  for (int x = 0; x < width - 1; x += 2) {
-    uint8 ab = (src_argb0[0] + src_argb0[4] + src_argb1[0] + src_argb1[4]) >> 2;
-    uint8 ag = (src_argb0[1] + src_argb0[5] + src_argb1[1] + src_argb1[5]) >> 2;
-    uint8 ar = (src_argb0[2] + src_argb0[6] + src_argb1[2] + src_argb1[6]) >> 2;
-    dst_u[0] = RGBToU(ar, ag, ab);
-    dst_v[0] = RGBToV(ar, ag, ab);
-    src_argb0 += 8;
-    src_argb1 += 8;
-    dst_u += 1;
-    dst_v += 1;
-  }
-}
 // Converts any Bayer RGB format to I420.
 int BayerRGBToI420(const uint8* src_bayer, int src_stride_bayer,
                   uint32 src_fourcc_bayer,
@@ -395,6 +344,28 @@ int BayerRGBToI420(const uint8* src_bayer, int src_stride_bayer,
                    uint8* dst_rgb, int pix);
  void (*BayerRow1)(const uint8* src_bayer, int src_stride_bayer,
                    uint8* dst_rgb, int pix);
+  void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix);
+  void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
+                      uint8* dst_u, uint8* dst_v, int width);
+#define kMaxStride (2048 * 4)
+  SIMD_ALIGNED(uint8 row[kMaxStride * 2]);
+#if defined(HAS_ARGBTOYROW_SSSE3)
+  if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) &&
+      (width % 8 == 0) &&
+      IS_ALIGNED(row, 16) && (kMaxStride % 16 == 0) &&
+      IS_ALIGNED(dst_y, 8) && (dst_stride_y % 8 == 0)) {
+    ARGBToYRow = ARGBToYRow_SSSE3;
+#if defined(HAS_ARGBTOUVROW_SSSE3)
+    ARGBToUVRow = ARGBToUVRow_SSSE3;
+#else
+    ARGBToUVRow = ARGBToUVRow_C;
+#endif
+  } else
+#endif
+  {
+    ARGBToYRow = ARGBToYRow_C;
+    ARGBToUVRow = ARGBToUVRow_C;
+  }
  switch (src_fourcc_bayer) {
    default:
@@ -417,24 +388,23 @@ int BayerRGBToI420(const uint8* src_bayer, int src_stride_bayer,
      break;
  }
-#define kMaxStride 2048 * 4
-  uint8 row[kMaxStride * 2];
  for (int y = 0; y < (height - 1); y += 2) {
    BayerRow0(src_bayer, src_stride_bayer, row, width);
    BayerRow1(src_bayer + src_stride_bayer, -src_stride_bayer,
              row + kMaxStride, width);
-    ARGBtoYRow(row, dst_y, width);
+    ARGBToYRow(row, dst_y, width);
-    ARGBtoYRow(row + kMaxStride, dst_y + dst_stride_y, width);
+    ARGBToYRow(row + kMaxStride, dst_y + dst_stride_y, width);
-    ARGBtoUVRow(row, kMaxStride, dst_u, dst_v, width);
+    ARGBToUVRow(row, kMaxStride, dst_u, dst_v, width);
    src_bayer += src_stride_bayer * 2;
    dst_y += dst_stride_y * 2;
    dst_u += dst_stride_u;
    dst_v += dst_stride_v;
  }
+  // TODO(fbarchard): Make sure this filters properly
  if (height & 1) {
    BayerRow0(src_bayer, src_stride_bayer, row, width);
-    ARGBtoYRow(row, dst_y, width);
+    ARGBToYRow(row, dst_y, width);
-    ARGBtoUVRow(row, 0, dst_u, dst_v, width);
+    ARGBToUVRow(row, 0, dst_u, dst_v, width);
  }
  return 0;
 }

--- a/source/planar_functions.cc
+++ b/source/planar_functions.cc
@@ -68,15 +68,6 @@ extern "C" TALIGN16(const uint8, kShuffleMaskRAWToARGB[16]) = {
  2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u
 };
-// Constant multiplication table for converting ARGB to I400.
-extern "C" TALIGN16(const uint8, kMultiplyMaskARGBToI400[16]) = {
-  13u, 64u, 33u, 0u, 13u, 64u, 33u, 0u, 13u, 64u, 33u, 0u, 13u, 64u, 33u, 0u
-};
-extern "C" TALIGN16(const uint8, kMultiplyMaskARGBToI400_2[16]) = {
-  1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u
-};
 #if defined(WIN32) && !defined(COVERAGE_ENABLED)
 #define HAS_SPLITUV_SSE2
 __declspec(naked)
@@ -215,7 +206,7 @@ int I420Copy(const uint8* src_y, int src_stride_y,
 static void SetRow32_NEON(uint8* dst, uint32 v32, int count) {
  __asm__ volatile
  (
-    "vdup.u32   q0, %2            \n"  // duplicate 4 ints
+    "vdup.u32   {q0}, %2          \n"  // duplicate 4 ints
    "1:\n"
    "vst1.u32   {q0}, [%0]!       \n"  // store
    "subs       %1, %1, #16       \n"  // 16 processed per loop
@@ -393,16 +384,16 @@ int I422ToI420(const uint8* src_y, int src_stride_y,
 }
 // Copies a plane two rows per iteration, `width` bytes per row. After the
 // first row of each pair the source advances by src_stride_0, after the second
 // by src_stride_1; the destination advances by the same stride after every
 // row.
 // NOTE(review): the loop steps y by 2 and always copies two rows, so an odd
 // height copies height + 1 rows -- confirm callers only pass even heights or
 // provide the extra row.
 static void I420CopyPlane2(const uint8* src, int src_stride_0, int src_stride_1,
-                           uint8* dst, int dst_stride,
+                           uint8* dst, int dst_stride_frame,
                           int width, int height) {
  // Copy plane
  for (int y = 0; y < height; y += 2) {
    memcpy(dst, src, width);
    src += src_stride_0;
-    dst += dst_stride;
+    dst += dst_stride_frame;
    memcpy(dst, src, width);
    src += src_stride_1;
-    dst += dst_stride;
+    dst += dst_stride_frame;
  }
 }
@@ -503,13 +494,13 @@ int NV12ToI420(const uint8* src_y, int src_stride_y,
 // Convert NV12 to I420.  Deprecated.
 int NV12ToI420(const uint8* src_y,
               const uint8* src_uv,
-               int src_stride,
+               int src_stride_frame,
               uint8* dst_y, int dst_stride_y,
               uint8* dst_u, int dst_stride_u,
               uint8* dst_v, int dst_stride_v,
               int width, int height) {
-  return X420ToI420(src_y, src_stride, src_stride,
+  return X420ToI420(src_y, src_stride_frame, src_stride_frame,
-                    src_uv, src_stride,
+                    src_uv, src_stride_frame,
                    dst_y, dst_stride_y,
                    dst_u, dst_stride_u,
                    dst_v, dst_stride_v,
@@ -1371,38 +1362,6 @@ __asm {
  }
 }
-#define HAS_ARGBTOI400ROW_SSSE3
-__declspec(naked)
-static void ARGBToI400Row_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
-__asm {
-    mov       eax, [esp + 4]   // src_argb
-    mov       edx, [esp + 8]   // dst_y
-    mov       ecx, [esp + 12]  // pix
-    movdqa    xmm7, _kMultiplyMaskARGBToI400
-    movdqa    xmm6, _kMultiplyMaskARGBToI400_2
-    movdqa    xmm5, xmm6
-    psllw     xmm5, 4         // Generate a mask of 0x10 on each byte.
- convertloop :
-    movdqa    xmm0, [eax]
-    pmaddubsw xmm0, xmm7
-    movdqa    xmm1, [eax + 16]
-    psrlw     xmm0, 7
-    pmaddubsw xmm1, xmm7
-    lea       eax, [eax + 32]
-    psrlw     xmm1, 7
-    packuswb  xmm0, xmm1
-    pmaddubsw xmm0, xmm6
-    packuswb  xmm0, xmm0
-    paddb     xmm0, xmm5
-    movq      qword ptr [edx], xmm0
-    lea       edx, [edx + 8]
-    sub       ecx, 8
-    ja        convertloop
-    ret
-  }
-}
 #elif (defined(__x86_64__) || defined(__i386__)) && \
    !defined(COVERAGE_ENABLED) && !defined(TARGET_IPHONE_SIMULATOR)
@@ -1554,39 +1513,6 @@ static void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb,
 );
 }
-#define HAS_ARGBTOI400ROW_SSSE3
-static void ARGBToI400Row_SSSE3(const uint8* src_argb, uint8* dst_y,
-                                int pix) {
-  asm volatile(
-  "movdqa     (%3),%%xmm7\n"
-  "movdqa     (%4),%%xmm6\n"
-  "movdqa     %%xmm6,%%xmm5\n"
-  "psllw      $0x4,%%xmm5\n"  // Generate a mask of 0x10 on each byte.
-"1:"
-  "movdqa     (%0),%%xmm0\n"
-  "pmaddubsw  %%xmm7,%%xmm0\n"
-  "movdqa     0x10(%0),%%xmm1\n"
-  "psrlw      $0x7,%%xmm0\n"
-  "pmaddubsw  %%xmm7,%%xmm1\n"
-  "lea        0x20(%0),%0\n"
-  "psrlw      $0x7,%%xmm1\n"
-  "packuswb   %%xmm1,%%xmm0\n"
-  "pmaddubsw  %%xmm6,%%xmm0\n"
-  "packuswb   %%xmm0,%%xmm0\n"
-  "paddb      %%xmm5,%%xmm0\n"
-  "movq       %%xmm0,(%1)\n"
-  "lea        0x8(%1),%1\n"
-  "sub        $0x8,%2\n"
-  "ja         1b\n"
-  : "+r"(src_argb),   // %0
-    "+r"(dst_y),      // %1
-    "+r"(pix)         // %2
-  : "r"(kMultiplyMaskARGBToI400),    // %3
-    "r"(kMultiplyMaskARGBToI400_2)   // %4
-  : "memory"
-);
-}
 #endif
 static void I400ToARGBRow_C(const uint8* src_y, uint8* dst_argb, int pix) {
@@ -1812,16 +1738,6 @@ int BGRAToARGB(const uint8* src_bgra, int src_stride_bgra,
  return 0;
 }
-static void ARGBToI400Row_C(const uint8* src_argb, uint8* dst_y, int pix) {
-  for (int x = 0; x < pix; ++x) {
-    uint32 b = static_cast<uint32>(src_argb[0] * 13u);
-    uint32 g = static_cast<uint32>(src_argb[1] * 64u);
-    uint32 r = static_cast<uint32>(src_argb[2] * 33u);
-    *(dst_y++) = static_cast<uint8>(((b + g + r) >> 7) + 16u);
-    src_argb += 4;
-  }
-}
 // Convert ARGB to I400.
 int ARGBToI400(const uint8* src_argb, int src_stride_argb,
               uint8* dst_y, int dst_stride_y,
@@ -1831,21 +1747,21 @@ int ARGBToI400(const uint8* src_argb, int src_stride_argb,
    src_argb = src_argb + (height - 1) * src_stride_argb;
    src_stride_argb = -src_stride_argb;
  }
-void (*ARGBToI400Row)(const uint8* src_argb, uint8* dst_y, int pix);
+void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix);
-#if defined(HAS_ARGBTOI400ROW_SSSE3)
+#if defined(HAS_ARGBTOYROW_SSSE3)
  if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) &&
      (width % 4 == 0) &&
      IS_ALIGNED(src_argb, 16) && (src_stride_argb % 16 == 0) &&
      IS_ALIGNED(dst_y, 16) && (dst_stride_y % 16 == 0)) {
-    ARGBToI400Row = ARGBToI400Row_SSSE3;
+    ARGBToYRow = ARGBToYRow_SSSE3;
  } else
 #endif
  {
-    ARGBToI400Row = ARGBToI400Row_C;
+    ARGBToYRow = ARGBToYRow_C;
  }
  for (int y = 0; y < height; ++y) {
-    ARGBToI400Row(src_argb, dst_y, width);
+    ARGBToYRow(src_argb, dst_y, width);
    src_argb += src_stride_argb;
    dst_y += dst_stride_y;
  }

--- a/source/row.h
+++ b/source/row.h
@@ -13,6 +13,16 @@
 #include "libyuv/basic_types.h"
+// Feature switches for the SSSE3 row functions. Both are disabled for
+// coverage builds and for the iPhone simulator.
+// The Y row is enabled for WIN32 and for x86 / x86-64 builds.
+#if (defined(WIN32) || defined(__x86_64__) || defined(__i386__)) \
+    && !defined(COVERAGE_ENABLED) && !defined(TARGET_IPHONE_SIMULATOR)
+#define HAS_ARGBTOYROW_SSSE3
+#endif
+// The UV row is enabled for WIN32 only.
+#if defined(WIN32) \
+    && !defined(COVERAGE_ENABLED) && !defined(TARGET_IPHONE_SIMULATOR)
+#define HAS_ARGBTOUVROW_SSSE3
+#endif
 extern "C" {
 void FastConvertYUVToRGB32Row(const uint8* y_buf,
                              const uint8* u_buf,
@@ -42,11 +52,24 @@ void FastConvertYToRGB32Row(const uint8* y_buf,
                            uint8* rgb_buf,
                            int width);
+// Row converters from ARGB to the Y plane and to the subsampled U/V planes.
+// NOTE(review): ARGBToUVRow_SSSE3 is declared under HAS_ARGBTOYROW_SSSE3
+// rather than HAS_ARGBTOUVROW_SSSE3, so the prototype is visible on builds
+// where only the Y row is enabled -- confirm this is intended.
+#ifdef HAS_ARGBTOYROW_SSSE3
+void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix);
+void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
+                       uint8* dst_u, uint8* dst_v, int width);
+#endif
+// Portable C versions, always declared.
+void ARGBToYRow_C(const uint8* src_argb, uint8* dst_y, int pix);
+void ARGBToUVRow_C(const uint8* src_argb0, int src_stride_argb,
+                   uint8* dst_u, uint8* dst_v, int width);
 // 16-byte alignment helpers.
 //   SIMD_ALIGNED(var): applies 16-byte alignment to the declaration `var`.
 //   TALIGN16(t, var): declares a 16-byte aligned `var` of type `t`. The MSVC
 //     form also makes it static and prefixes the name with an underscore.
 #if defined(_MSC_VER)
 #define SIMD_ALIGNED(var) __declspec(align(16)) var
+#define TALIGN16(t, var) static __declspec(align(16)) t _ ## var
 #else
 #define SIMD_ALIGNED(var) var __attribute__((aligned(16)))
+#define TALIGN16(t, var) t var __attribute__((aligned(16)))
 #endif
 #ifdef OSX
 extern SIMD_ALIGNED(const int16 kCoefficientsRgbY[768][4]);
 extern SIMD_ALIGNED(const int16 kCoefficientsBgraY[768][4]);

--- a/source/row_posix.cc
+++ b/source/row_posix.cc
@@ -12,6 +12,91 @@
 extern "C" {
+#ifdef HAS_ARGBTOYROW_SSSE3
+// Constant multiplication table for converting ARGB to I400.
+// One weight per source byte, repeated for four pixels: 13, 64, 33, 0.
+extern "C" TALIGN16(const uint8, kMultiplyMaskARGBToI400[16]) = {
+  13u, 64u, 33u, 0u, 13u, 64u, 33u, 0u, 13u, 64u, 33u, 0u, 13u, 64u, 33u, 0u
+};
+// Sixteen bytes of 1. Used as the pmaddubsw multiplier that adds adjacent
+// bytes and, after a 4-bit left shift, as the 0x10 offset added to each Y.
+extern "C" TALIGN16(const uint8, kAdd16[16]) = {
+  1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u
+};
+// Converts a row of ARGB pixels to Y, eight pixels per loop iteration.
+//   src_argb: source pixels, read with movdqa (needs 16-byte alignment).
+//   dst_y:    destination, written 8 bytes at a time with movq.
+//   pix:      pixel count; the loop subtracts 8 per pass, so it is expected
+//             to be a positive multiple of 8.
+// Per pixel: the two weighted byte-pair sums are each shifted right by 7,
+// added together, and offset by 16.
+void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
+  asm volatile(
+  "movdqa     (%3),%%xmm7\n"
+  "movdqa     (%4),%%xmm6\n"
+  "movdqa     %%xmm6,%%xmm5\n"
+  "psllw      $0x4,%%xmm5\n"  // Generate a mask of 0x10 on each byte.
+"1:"
+  "movdqa     (%0),%%xmm0\n"
+  "pmaddubsw  %%xmm7,%%xmm0\n"
+  "movdqa     0x10(%0),%%xmm1\n"
+  "psrlw      $0x7,%%xmm0\n"
+  "pmaddubsw  %%xmm7,%%xmm1\n"
+  "lea        0x20(%0),%0\n"
+  "psrlw      $0x7,%%xmm1\n"
+  "packuswb   %%xmm1,%%xmm0\n"
+  "pmaddubsw  %%xmm6,%%xmm0\n"
+  "packuswb   %%xmm0,%%xmm0\n"
+  "paddb      %%xmm5,%%xmm0\n"
+  "movq       %%xmm0,(%1)\n"
+  "lea        0x8(%1),%1\n"
+  "sub        $0x8,%2\n"
+  "ja         1b\n"
+  : "+r"(src_argb),   // %0
+    "+r"(dst_y),      // %1
+    "+r"(pix)         // %2
+  : "r"(kMultiplyMaskARGBToI400),    // %3
+    "r"(kAdd16)   // %4
+  // The loop changes the flags (sub) and five SSE registers; declare them so
+  // the compiler does not keep live values there across the asm statement.
+  : "memory", "cc"
+#if defined(__SSE2__)
+    , "xmm0", "xmm1", "xmm5", "xmm6", "xmm7"
+#endif
+);
+}
+#endif
+// Returns Y for one pixel: (66*r + 129*g + 25*b + 128) >> 8, plus 16.
+static inline int RGBToY(uint8 r, uint8 g, uint8 b) {
+  return (( 66 * r + 129 * g +  25 * b + 128) >> 8) + 16;
+}
+// Returns U for one pixel: (-38*r - 74*g + 112*b + 128) >> 8, plus 128.
+static inline int RGBToU(uint8 r, uint8 g, uint8 b) {
+  return ((-38 * r -  74 * g + 112 * b + 128) >> 8) + 128;
+}
+// Returns V for one pixel: (112*r - 94*g - 18*b + 128) >> 8, plus 128.
+static inline int RGBToV(uint8 r, uint8 g, uint8 b) {
+  return ((112 * r -  94 * g -  18 * b + 128) >> 8) + 128;
+}
+// Portable Y row: writes one Y byte per 4-byte source pixel for `width`
+// pixels. Source byte 2 is passed as r, byte 1 as g and byte 0 as b; byte 3
+// is not read.
+void ARGBToYRow_C(const uint8* src_argb0, uint8* dst_y, int width) {
+  for (int x = 0; x < width; ++x) {
+    dst_y[0] = RGBToY(src_argb0[2], src_argb0[1], src_argb0[0]);
+    src_argb0 += 4;
+    dst_y += 1;
+  }
+}
+// Portable U/V row: produces one U and one V byte for every two source
+// pixels, sampling this row and the row src_stride_argb bytes after it.
+// A stride of 0 makes both samples come from the same row.
+void ARGBToUVRow_C(const uint8* src_argb0, int src_stride_argb,
+                   uint8* dst_u, uint8* dst_v, int width) {
+  const uint8* src_argb1 = src_argb0 + src_stride_argb;
+  for (int x = 0; x < width - 1; x += 2) {
+    // Average each channel over the 2x2 block (truncating shift by 2).
+    uint8 ab = (src_argb0[0] + src_argb0[4] + src_argb1[0] + src_argb1[4]) >> 2;
+    uint8 ag = (src_argb0[1] + src_argb0[5] + src_argb1[1] + src_argb1[5]) >> 2;
+    uint8 ar = (src_argb0[2] + src_argb0[6] + src_argb1[2] + src_argb1[6]) >> 2;
+    dst_u[0] = RGBToU(ar, ag, ab);
+    dst_v[0] = RGBToV(ar, ag, ab);
+    src_argb0 += 8;
+    src_argb1 += 8;
+    dst_u += 1;
+    dst_v += 1;
+  }
+  // Odd width: the last column has no horizontal neighbour, so only the two
+  // vertically adjacent pixels are averaged.
+  if (width & 1) {
+    uint8 ab = (src_argb0[0] + src_argb1[0]) >> 1;
+    uint8 ag = (src_argb0[1] + src_argb1[1]) >> 1;
+    uint8 ar = (src_argb0[2] + src_argb1[2]) >> 1;
+    dst_u[0] = RGBToU(ar, ag, ab);
+    dst_v[0] = RGBToV(ar, ag, ab);
+  }
+}
 #if defined(__x86_64__)
 // 64 bit linux gcc version

--- a/source/row_win.cc
+++ b/source/row_win.cc
@@ -12,6 +12,176 @@
 extern "C" {
+// NOTE(review): the only #endif this change adds to the file is at its very
+// end, so the C fallbacks further down also appear to sit inside this #ifdef
+// -- confirm they are still built when HAS_ARGBTOYROW_SSSE3 is undefined.
+#ifdef HAS_ARGBTOYROW_SSSE3
+// Declares a static, 16-byte aligned variable whose name gets a leading
+// underscore; the inline assembly below refers to the tables by that name.
+#define TALIGN16(t, var) static __declspec(align(16)) t _ ## var
+// Constant multiplication table for converting ARGB to I400.
+// Signed per-byte weights for pmaddubsw, repeated for four pixels.
+extern "C" TALIGN16(const int8, kRGBToY[16]) = {
+  13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0
+};
+// Per-byte weights used to compute U.
+extern "C" TALIGN16(const int8, kRGBToU[16]) = {
+  112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0
+};
+// Per-byte weights used to compute V.
+extern "C" TALIGN16(const int8, kRGBToV[16]) = {
+  -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0,
+};
+// Offset of 16 added to every Y byte.
+extern "C" TALIGN16(const uint8, kAddY16[16]) = {
+  16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u,
+  16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u,
+};
+// Offset of 128 added to every even byte (odd bytes get 0) when biasing U/V.
+extern "C" TALIGN16(const uint8, kAddUV128[16]) = {
+  128u, 0u, 128u, 0u, 128u, 0u, 128u, 0u,
+  128u, 0u, 128u, 0u, 128u, 0u, 128u, 0u
+};
+// Converts a row of ARGB pixels to Y, eight pixels per loop iteration.
+// Arguments are read from the stack (naked function):
+//   src_argb: source pixels, loaded with movdqa (needs 16-byte alignment).
+//   dst_y:    destination, written 8 bytes at a time with movq.
+//   pix:      pixel count; 8 is subtracted per pass, so it is expected to be
+//             a positive multiple of 8.
+// Per pixel: the weighted sum of the four source bytes (kRGBToY) is shifted
+// right by 7 and then 16 (kAddY16) is added.
+__declspec(naked)
+void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
+__asm {
+    mov        eax, [esp + 4]   // src_argb
+    mov        edx, [esp + 8]   // dst_y
+    mov        ecx, [esp + 12]  // pix
+    movdqa     xmm7, _kRGBToY
+    movdqa     xmm6, _kAddY16
+    pcmpeqb    xmm5, xmm5      // Generate mask 0x0000ffff
+    psrld      xmm5, 16
+ convertloop :
+    movdqa    xmm0, [eax]
+    movdqa    xmm1, [eax + 16]
+    pmaddubsw xmm0, xmm7
+    lea       eax, [eax + 32]
+    pmaddubsw xmm1, xmm7            // BG ra BG ra BG ra BG ra
+    // palignr leaves stale data from xmm2/xmm3 in the top word only; that is
+    // an odd word, which the 0x0000ffff mask discards.
+    palignr   xmm2, xmm0, 2         // AR xx AR xx AR xx AR xx
+    paddw     xmm2, xmm0            // BGRA xx BGRA xx BGRA xx BGRA xx
+    pand      xmm2, xmm5            // BGRA 00 BGRA 00 BGRA 00 BGRA 00
+    palignr   xmm3, xmm1, 2
+    paddw     xmm3, xmm1
+    pand      xmm3, xmm5            // BGRA 00 BGRA 00 BGRA 00 BGRA 00
+    packssdw  xmm2, xmm3            // BGRA BGRA BGRA BGRA BGRA BGRA BGRA BGRA
+    psrlw     xmm2, 7               // 0B xx 0B xx 0B xx 0B xx
+    packuswb  xmm2, xmm2
+    paddb     xmm2, xmm6
+    movq      qword ptr [edx], xmm2
+    lea       edx, [edx + 8]
+    sub       ecx, 8
+    ja        convertloop
+    ret
+  }
+}
+// Converts two rows of ARGB pixels to one row of subsampled U and V. Each
+// loop iteration consumes 8 pixels from each row and stores 4 U and 4 V
+// bytes. Arguments are read from the stack (naked function):
+//   src_argb0:       first source row, loaded with movdqa (16-byte aligned).
+//   src_stride_argb: byte offset from the first row to the second row.
+//   dst_u, dst_v:    destinations, each written 4 bytes at a time with movd.
+//   width:           pixel count; 8 is subtracted per pass, so it is expected
+//                    to be a positive multiple of 8.
+// esi and edi are saved and restored around the loop.
+__declspec(naked)
+void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
+                       uint8* dst_u, uint8* dst_v, int width) {
+__asm {
+    push       esi
+    push       edi
+    mov        eax, [esp + 8 + 4]   // src_argb
+    mov        esi, [esp + 8 + 8]   // src_stride_argb
+    mov        edx, [esp + 8 + 12]  // dst_u
+    mov        edi, [esp + 8 + 16]  // dst_v
+    mov        ecx, [esp + 8 + 20]  // pix
+    movdqa     xmm7, _kRGBToU
+    movdqa     xmm6, _kRGBToV
+    movdqa     xmm5, _kAddUV128
+    pcmpeqb    xmm4, xmm4      // Generate mask 0x0000ffff
+    psrld      xmm4, 16
+ convertloop :
+    // step 1 - subsample 8x2 argb pixels to 4x1
+    movdqa     xmm0, [eax]           // 32x2 -> 32x1
+    movdqa     xmm1, [eax + 16]
+    movdqa     xmm2, [eax + esi]
+    movdqa     xmm3, [eax + esi + 16]
+    lea        eax,  [eax + 32]
+    pavgb      xmm0, xmm2
+    pavgb      xmm1, xmm3
+    movdqa     xmm2, xmm0            // 32x1 -> 16x1
+    // 0x88 gathers the even pixels and 0xdd the odd pixels, so the pavgb
+    // below averages horizontal neighbours.
+    shufps     xmm0, xmm1, 0x88
+    shufps     xmm2, xmm1, 0xdd
+    pavgb      xmm0, xmm2
+    // step 2 - convert to U and V
+    // from here down is very similar to Y code except
+    // instead of 8 different pixels, its 4 pixels of U and 4 of V
+    movdqa     xmm1, xmm0
+    pmaddubsw  xmm0, xmm7            // U
+    pmaddubsw  xmm1, xmm6            // V
+    palignr    xmm2, xmm0, 2         // AR xx AR xx AR xx AR xx
+    paddw      xmm2, xmm0            // BGRA xx BGRA xx BGRA xx BGRA xx
+    pand       xmm2, xmm4            // BGRA 00 BGRA 00 BGRA 00 BGRA 00
+    palignr    xmm3, xmm1, 2
+    paddw      xmm3, xmm1
+    pand       xmm3, xmm4            // BGRA 00 BGRA 00 BGRA 00 BGRA 00
+    // Arithmetic shift keeps the sign of the weighted sums.
+    psraw      xmm2, 8
+    psraw      xmm3, 8
+    packsswb   xmm2, xmm3            // BGRA BGRA BGRA BGRA BGRA BGRA BGRA BGRA
+    paddb      xmm2, xmm5            // -> unsigned
+    packuswb   xmm2, xmm2            // 8 bytes. 4 U, 4 V
+    // step 3 - store 4 U and 4 V values
+    movd       dword ptr [edx], xmm2 // U
+    lea        edx, [edx + 4]
+    pshufd     xmm0, xmm2, 0x55      // V
+    movd       dword ptr [edi], xmm0
+    lea        edi, [edi + 4]
+    sub        ecx, 8
+    ja         convertloop
+    pop        edi
+    pop        esi
+    ret
+  }
+}
+// Returns Y for one pixel: (66*r + 129*g + 25*b + 128) >> 8, plus 16.
+static inline int RGBToY(uint8 r, uint8 g, uint8 b) {
+  return (( 66 * r + 129 * g +  25 * b + 128) >> 8) + 16;
+}
+// Returns U for one pixel: (-38*r - 74*g + 112*b + 128) >> 8, plus 128.
+static inline int RGBToU(uint8 r, uint8 g, uint8 b) {
+  return ((-38 * r -  74 * g + 112 * b + 128) >> 8) + 128;
+}
+// Returns V for one pixel: (112*r - 94*g - 18*b + 128) >> 8, plus 128.
+static inline int RGBToV(uint8 r, uint8 g, uint8 b) {
+  return ((112 * r -  94 * g -  18 * b + 128) >> 8) + 128;
+}
+// Portable Y row: writes one Y byte per 4-byte source pixel for `width`
+// pixels. Source byte 2 is passed as r, byte 1 as g and byte 0 as b; byte 3
+// is not read.
+void ARGBToYRow_C(const uint8* src_argb0, uint8* dst_y, int width) {
+  for (int x = 0; x < width; ++x) {
+    dst_y[0] = RGBToY(src_argb0[2], src_argb0[1], src_argb0[0]);
+    src_argb0 += 4;
+    dst_y += 1;
+  }
+}
+// Portable U/V row: produces one U and one V byte for every two source
+// pixels, sampling this row and the row src_stride_argb bytes after it.
+// A stride of 0 makes both samples come from the same row.
+void ARGBToUVRow_C(const uint8* src_argb0, int src_stride_argb,
+                   uint8* dst_u, uint8* dst_v, int width) {
+  const uint8* src_argb1 = src_argb0 + src_stride_argb;
+  for (int x = 0; x < width - 1; x += 2) {
+    // Average each channel over the 2x2 block (truncating shift by 2).
+    uint8 ab = (src_argb0[0] + src_argb0[4] + src_argb1[0] + src_argb1[4]) >> 2;
+    uint8 ag = (src_argb0[1] + src_argb0[5] + src_argb1[1] + src_argb1[5]) >> 2;
+    uint8 ar = (src_argb0[2] + src_argb0[6] + src_argb1[2] + src_argb1[6]) >> 2;
+    dst_u[0] = RGBToU(ar, ag, ab);
+    dst_v[0] = RGBToV(ar, ag, ab);
+    src_argb0 += 8;
+    src_argb1 += 8;
+    dst_u += 1;
+    dst_v += 1;
+  }
+  // Odd width: the last column has no horizontal neighbour, so only the two
+  // vertically adjacent pixels are averaged.
+  if (width & 1) {
+    uint8 ab = (src_argb0[0] + src_argb1[0]) >> 1;
+    uint8 ag = (src_argb0[1] + src_argb1[1]) >> 1;
+    uint8 ar = (src_argb0[2] + src_argb1[2]) >> 1;
+    dst_u[0] = RGBToU(ar, ag, ab);
+    dst_v[0] = RGBToV(ar, ag, ab);
+  }
+}
 __declspec(naked)
 void FastConvertYUVToRGB32Row(const uint8* y_buf,
                              const uint8* u_buf,
@@ -200,4 +370,6 @@ void FastConvertYToRGB32Row(const uint8* y_buf,
  }
 }
+#endif
 }  // extern "C"