rotate for x86 and bayer refactored - 3x faster.

BUG=1 TEST=tested with talk unittests. Review URL: http://webrtc-codereview.appspot.com/250004 git-svn-id: http://libyuv.googlecode.com/svn/trunk@42 16f28f9a-4ce2-e073-06de-1de4eb20be90

rotate for x86 and bayer refactored - 3x faster.
BUG=1 TEST=tested with talk unittests. Review URL: http://webrtc-codereview.appspot.com/250004 git-svn-id: http://libyuv.googlecode.com/svn/trunk@42 16f28f9a-4ce2-e073-06de-1de4eb20be90
78020389 · fbarchard@google.com · 3f4c056b · 78020389 · 78020389 · 78020389
Commit 78020389 authored Oct 27, 2011 by fbarchard@google.com
5 changed files
--- a/include/libyuv/planar_functions.h
+++ b/include/libyuv/planar_functions.h
@@ -25,6 +25,14 @@ int I420Copy(const uint8* src_y, int src_stride_y,
             uint8* dst_v, int dst_stride_v,
             int width, int height);
+// Draw a rectangle into I420
+int I420Rect(uint8* dst_y, int dst_stride_y,
+             uint8* dst_u, int dst_stride_u,
+             uint8* dst_v, int dst_stride_v,
+             int x, int y,
+             int width, int height,
+             int value_y, int value_u, int value_v);
 // Convert I422 to I420.  Used by MJPG.
 int I422ToI420(const uint8* src_y, int src_stride_y,
               const uint8* src_u, int src_stride_u,
@@ -146,7 +154,7 @@ int BGRAToARGB(const uint8* src_bgra, int src_stride_bgra,
 // Convert ARGB to I400.
 int ARGBToI400(const uint8* src_argb, int src_stride_argb,
-               const uint8* dst_y, int dst_stride_y,
+               uint8* dst_y, int dst_stride_y,
               int width, int height);
 }  // namespace libyuv

--- a/include/libyuv/rotate.h
+++ b/include/libyuv/rotate.h
@@ -11,21 +11,25 @@
 #ifndef INCLUDE_LIBYUV_ROTATE_H_
 #define INCLUDE_LIBYUV_ROTATE_H_
-#include "basic_types.h"
+#include "libyuv/basic_types.h"
 namespace libyuv {
 // Supported rotation
 enum RotationMode {
+  kRotate0 = 0, // No rotation
+  kRotate90 = 90,  // Rotate 90 degrees clockwise
+  kRotate180 = 180,  // Rotate 180 degrees
+  kRotate270 = 270,  // Rotate 270 degrees clockwise
+  // Deprecated
  kRotateNone = 0,
  kRotateClockwise = 90,
  kRotateCounterClockwise = 270,
-  kRotate180 = 180,
 };
 // Rotate I420 frame
-int
+int I420Rotate(const uint8* src_y, int src_stride_y,
-I420Rotate(const uint8* src_y, int src_stride_y,
               const uint8* src_u, int src_stride_u,
               const uint8* src_v, int src_stride_v,
               uint8* dst_y, int dst_stride_y,
@@ -34,10 +38,8 @@ I420Rotate(const uint8* src_y, int src_stride_y,
               int width, int height,
               RotationMode mode);
-// Split a NV12 input buffer into Y, U, V buffers and
+// Rotate NV12 input and store in I420
-// then rotate the buffers.
+int NV12ToI420Rotate(const uint8* src_y, int src_stride_y,
-int
-NV12ToI420Rotate(const uint8* src_y, int src_stride_y,
                     const uint8* src_uv, int src_stride_uv,
                     uint8* dst_y, int dst_stride_y,
                     uint8* dst_u, int dst_stride_u,

--- a/source/format_conversion.cc
+++ b/source/format_conversion.cc
@@ -27,396 +27,6 @@ namespace libyuv {
 #define FORCE_INLINE
 #endif
-enum {
-  RED = 0,
-  BLUE = 1,
-  GREEN_BETWEEN_RED = 2,
-  GREEN_BETWEEN_BLUE = 3,
-};
-enum Position {
-  LEFT = 0,
-  RIGHT = 1,
-  TOP = 2,
-  BOTTOM = 4,
-  CENTER = 6,
-  // Due to the choice of the above values, these are all distinct and the
-  // corner values and edge values are each contiguous. This allows us to
-  // figure out the position type of a pixel with a single addition operation
-  // using the above values, rather than having to use a 3x3 nested switch
-  // statement.
-  TOP_LEFT = TOP + LEFT,          // 2
-  TOP_RIGHT = TOP + RIGHT,        // 3
-  BOTTOM_LEFT = BOTTOM + LEFT,    // 4
-  BOTTOM_RIGHT = BOTTOM + RIGHT,  // 5
-  LEFT_EDGE = CENTER + LEFT,      // 6
-  RIGHT_EDGE = CENTER + RIGHT,    // 7
-  TOP_EDGE = TOP + CENTER,        // 8
-  BOTTOM_EDGE = BOTTOM + CENTER,  // 10
-  MIDDLE = CENTER + CENTER,       // 12
-};
-static FORCE_INLINE Position GetPosition(int x, int y, int width, int height) {
-  Position xpos = CENTER;
-  Position ypos = CENTER;
-  if (x == 0) {
-    xpos = LEFT;
-  } else if (x == width - 1) {
-    xpos = RIGHT;
-  }
-  if (y == 0) {
-    ypos = TOP;
-  } else if (y == height - 1) {
-    ypos = BOTTOM;
-  }
-  return static_cast<Position>(xpos + ypos);
-}
-static FORCE_INLINE bool IsRedBlue(uint8 colour) {
-  return colour <= BLUE;
-}
-static FORCE_INLINE uint32 FourCcToBayerPixelColourMap(uint32 fourcc) {
-  // The colour map is a 4-byte array-as-uint32 containing the colours for the
-  // four pixels in each 2x2 grid, in left-to-right and top-to-bottom order.
-  switch (fourcc) {
-    default:
-      assert(false);
-    case FOURCC_RGGB:
-      return FOURCC(RED, GREEN_BETWEEN_RED, GREEN_BETWEEN_BLUE, BLUE);
-    case FOURCC_BGGR:
-      return FOURCC(BLUE, GREEN_BETWEEN_BLUE, GREEN_BETWEEN_RED, RED);
-    case FOURCC_GRBG:
-      return FOURCC(GREEN_BETWEEN_RED, RED, BLUE, GREEN_BETWEEN_BLUE);
-    case FOURCC_GBRG:
-      return FOURCC(GREEN_BETWEEN_BLUE, BLUE, RED, GREEN_BETWEEN_RED);
-  }
-}
-static FORCE_INLINE void RGBToYUV(uint8 r, uint8 g, uint8 b,
-                                  uint8* y, uint8* u, uint8* v) {
-  // Taken from http://en.wikipedia.org/wiki/YUV
-  *y = (( 66 * r + 129 * g +  25 * b + 128) >> 8) + 16;
-  *u = ((-38 * r -  74 * g + 112 * b + 128) >> 8) + 128;
-  *v = ((112 * r -  94 * g -  18 * b + 128) >> 8) + 128;
-}
-static FORCE_INLINE void InterpolateBayerRGBCorner(uint8* r,
-                                                   uint8* g,
-                                                   uint8* b,
-                                                   const uint8* src,
-                                                   int src_stride,
-                                                   Position pos,
-                                                   uint8 colour) {
-  // Compute the offsets to use for fetching the adjacent pixels.
-  int adjacent_row;
-  int adjacent_column;
-  switch (pos) {
-    case TOP_LEFT:
-      adjacent_row = src_stride;
-      adjacent_column = 1;
-      break;
-    case TOP_RIGHT:
-      adjacent_row = src_stride;
-      adjacent_column = -1;
-      break;
-    case BOTTOM_LEFT:
-      adjacent_row = -src_stride;
-      adjacent_column = 1;
-      break;
-    case BOTTOM_RIGHT:
-    default:
-      adjacent_row = -src_stride;
-      adjacent_column = -1;
-      break;
-  }
-  // Now interpolate.
-  if (IsRedBlue(colour)) {
-    uint8 current_pixel = src[0];
-    // Average of the adjacent green pixels (there's only two).
-    *g = (src[adjacent_column] + src[adjacent_row]) / 2;
-    // Average of the oppositely-coloured corner pixels (there's only one).
-    uint8 corner_average = src[adjacent_row + adjacent_column];
-    if (colour == RED) {
-      *r = current_pixel;
-      *b = corner_average;
-    } else {  // i.e., BLUE
-      *b = current_pixel;
-      *r = corner_average;
-    }
-  } else {  // i.e., GREEN_BETWEEN_*
-    *g = src[0];
-    // Average of the adjacent same-row pixels (there's only one).
-    uint8 row_average = src[adjacent_column];
-    // Average of the adjacent same-column pixels (there's only one).
-    uint8 column_average = src[adjacent_row];
-    if (colour == GREEN_BETWEEN_RED) {
-      *r = row_average;
-      *b = column_average;
-    } else {  // i.e., GREEN_BETWEEN_BLUE
-      *b = row_average;
-      *r = column_average;
-    }
-  }
-}
-static FORCE_INLINE void InterpolateBayerRGBEdge(uint8* r,
-                                                 uint8* g,
-                                                 uint8* b,
-                                                 const uint8* src,
-                                                 int src_stride,
-                                                 Position pos,
-                                                 uint8 colour) {
-  // Compute the offsets to use for fetching the adjacent pixels.
-  // Goes one pixel "in" to the image (i.e. towards the center)
-  int inner;
-  // Goes one pixel to the side (i.e. along the edge) in either the clockwise or
-  // counter-clockwise direction, and its negative value goes in the other
-  // direction.
-  int side;
-  switch (pos) {
-    case TOP_EDGE:
-      inner = src_stride;
-      side = 1;
-      break;
-    case RIGHT_EDGE:
-      inner = -1;
-      side = src_stride;
-      break;
-    case BOTTOM_EDGE:
-      inner = -src_stride;
-      side = 1;
-      break;
-    case LEFT_EDGE:
-    default:
-      inner = 1;
-      side = src_stride;
-      break;
-  }
-  // Now interpolate.
-  if (IsRedBlue(colour)) {
-    uint8 current_pixel = src[0];
-    // Average of the adjacent green pixels (there's only three).
-    *g = (src[inner] + src[side] + src[-side]) / 3;
-    // Average of the oppositely-coloured corner pixels (there's only two).
-    uint8 corner_average = (src[inner + side] + src[inner - side]) / 2;
-    if (colour == RED) {
-      *r = current_pixel;
-      *b = corner_average;
-    } else {  // i.e., BLUE
-      *b = current_pixel;
-      *r = corner_average;
-    }
-  } else {  // i.e., GREEN_BETWEEN_*
-    *g = src[0];
-    // Average of the adjacent side-ways pixels (there's only two).
-    uint8 side_average = (src[side] + src[-side]) / 2;
-    // Average of the adjacent inner-ways pixels (there's only one).
-    uint8 inner_pixel = src[inner];
-    // Including && side == 1 effectively transposes the colour logic for
-    // processing the left/right sides, which is needed since the "T" shape
-    // formed by the pixels is transposed.
-    if (colour == GREEN_BETWEEN_RED && side == 1) {
-      *r = side_average;
-      *b = inner_pixel;
-    } else {  // i.e., GREEN_BETWEEN_BLUE || side != 1
-      *b = side_average;
-      *r = inner_pixel;
-    }
-  }
-}
-// We inline this one because it runs 99% of the time, so inlining it is
-// probably beneficial.
-static FORCE_INLINE void InterpolateBayerRGBCenter(uint8* r,
-                                                   uint8* g,
-                                                   uint8* b,
-                                                   const uint8* src,
-                                                   int src_stride,
-                                                   uint8 colour) {
-  if (IsRedBlue(colour)) {
-    uint8 current_pixel = src[0];
-    // Average of the adjacent green pixels (there's four).
-    // NOTE(tschmelcher): The material at
-    // http://www.siliconimaging.com/RGB%20Bayer.htm discusses a way to improve
-    // quality here by using only two of the green pixels based on the
-    // correlation to the nearby red/blue pixels, but that is slower and would
-    // result in more edge cases.
-    *g = (src[1] + src[-1] + src[src_stride] + src[-src_stride]) / 4;
-    // Average of the oppositely-coloured corner pixels (there's four).
-    uint8 corner_average = (src[src_stride + 1] +
-                            src[src_stride - 1] +
-                            src[-src_stride + 1] +
-                            src[-src_stride - 1]) / 4;
-    if (colour == RED) {
-      *r = current_pixel;
-      *b = corner_average;
-    } else {  // i.e., BLUE
-      *b = current_pixel;
-      *r = corner_average;
-    }
-  } else {  // i.e., GREEN_BETWEEN_*
-    *g = src[0];
-    // Average of the adjacent same-row pixels (there's two).
-    uint8 row_adjacent = (src[1] + src[-1]) / 2;
-    // Average of the adjacent same-column pixels (there's two).
-    uint8 column_adjacent = (src[src_stride] + src[-src_stride]) / 2;
-    if (colour == GREEN_BETWEEN_RED) {
-      *r = row_adjacent;
-      *b = column_adjacent;
-    } else {  // i.e., GREEN_BETWEEN_BLUE
-      *b = row_adjacent;
-      *r = column_adjacent;
-    }
-  }
-}
-// Converts any Bayer RGB format to ARGB.
-int BayerRGBToARGB(const uint8* src, int src_stride, uint32 src_fourcc,
-                   uint8* dst, int dst_stride,
-                   int width, int height) {
-  assert(width % 2 == 0);
-  assert(height % 2 == 0);
-  uint32 colour_map = FourCcToBayerPixelColourMap(src_fourcc);
-  int src_row_inc = src_stride * 2 - width;
-  int dst_row_inc = dst_stride * 2 - width * 4;
-  // Iterate over the 2x2 grids.
-  for (int y1 = 0; y1 < height; y1 += 2) {
-    for (int x1 = 0; x1 < width; x1 += 2) {
-      uint32 colours = colour_map;
-      // Iterate over the four pixels within them.
-      for (int y2 = 0; y2 < 2; ++y2) {
-        for (int x2 = 0; x2 < 2; ++x2) {
-          uint8 r, g, b;
-          // The low-order byte of the colour map is the current colour.
-          uint8 current_colour = static_cast<uint8>(colours);
-          colours >>= 8;
-          Position pos = GetPosition(x1 + x2, y1 + y2, width, height);
-          const uint8* src_pixel = &src[y2 * src_stride + x2];
-          uint8* dst_pixel = &dst[y2 * dst_stride + x2 * 4];
-          // Convert from Bayer RGB to regular RGB.
-          if (pos == MIDDLE) {
-            // 99% of the image is the middle.
-            InterpolateBayerRGBCenter(&r, &g, &b,
-                                      src_pixel, src_stride,
-                                      current_colour);
-          } else if (pos >= LEFT_EDGE) {
-            // Next most frequent is edges.
-            InterpolateBayerRGBEdge(&r, &g, &b,
-                                    src_pixel, src_stride, pos,
-                                    current_colour);
-          } else {
-            // Last is the corners. There are only 4.
-            InterpolateBayerRGBCorner(&r, &g, &b,
-                                      src_pixel, src_stride, pos,
-                                      current_colour);
-          }
-          // Store ARGB
-          dst_pixel[0] = b;
-          dst_pixel[1] = g;
-          dst_pixel[2] = r;
-          dst_pixel[3] = 255u;
-        }
-      }
-      src += 2;
-      dst += 2 * 4;
-    }
-    src += src_row_inc;
-    dst += dst_row_inc;
-  }
-  return 0;
-}
-// Converts any Bayer RGB format to I420.
-int BayerRGBToI420(const uint8* src, int src_stride, uint32 src_fourcc,
-                   uint8* y, int y_stride,
-                   uint8* u, int u_stride,
-                   uint8* v, int v_stride,
-                   int width, int height) {
-  assert(width % 2 == 0);
-  assert(height % 2 == 0);
-  uint32 colour_map = FourCcToBayerPixelColourMap(src_fourcc);
-  int src_row_inc = src_stride * 2 - width;
-  int y_row_inc = y_stride * 2 - width;
-  int u_row_inc = u_stride - width / 2;
-  int v_row_inc = v_stride - width / 2;
-  // Iterate over the 2x2 grids.
-  for (int y1 = 0; y1 < height; y1 += 2) {
-    for (int x1 = 0; x1 < width; x1 += 2) {
-      uint32 colours = colour_map;
-      int total_u = 0;
-      int total_v = 0;
-      // Iterate over the four pixels within them.
-      for (int y2 = 0; y2 < 2; ++y2) {
-        for (int x2 = 0; x2 < 2; ++x2) {
-          uint8 r, g, b;
-          // The low-order byte of the colour map is the current colour.
-          uint8 current_colour = static_cast<uint8>(colours);
-          colours >>= 8;
-          Position pos = GetPosition(x1 + x2, y1 + y2, width, height);
-          const uint8* src_pixel = &src[y2 * src_stride + x2];
-          uint8* y_pixel = &y[y2 * y_stride + x2];
-          // Convert from Bayer RGB to regular RGB.
-          if (pos == MIDDLE) {
-            // 99% of the image is the middle.
-            InterpolateBayerRGBCenter(&r, &g, &b,
-                                      src_pixel, src_stride,
-                                      current_colour);
-          } else if (pos >= LEFT_EDGE) {
-            // Next most frequent is edges.
-            InterpolateBayerRGBEdge(&r, &g, &b,
-                                    src_pixel, src_stride, pos,
-                                    current_colour);
-          } else {
-            // Last is the corners. There are only 4.
-            InterpolateBayerRGBCorner(&r, &g, &b,
-                                      src_pixel, src_stride, pos,
-                                      current_colour);
-          }
-          // Convert from RGB to YUV.
-          uint8 tmp_u, tmp_v;
-          RGBToYUV(r, g, b, y_pixel, &tmp_u, &tmp_v);
-          total_u += tmp_u;
-          total_v += tmp_v;
-        }
-      }
-      src += 2;
-      y += 2;
-      *u = total_u / 4;
-      *v = total_v / 4;
-      ++u;
-      ++v;
-    }
-    src += src_row_inc;
-    y += y_row_inc;
-    u += u_row_inc;
-    v += v_row_inc;
-  }
-  return 0;
-}
 // Note: to do this with Neon vld4.8 would load ARGB values into 4 registers
 // and vst would select which 2 components to write.  The low level would need
 // to be ARGBToBG, ARGBToGB, ARGBToRG, ARGBToGR
@@ -429,15 +39,15 @@ static void ARGBToBayerRow_SSSE3(const uint8* src_argb,
  __asm {
    mov        eax, [esp + 4]    // src_argb
    mov        edx, [esp + 8]    // dst_bayer
-    movd       xmm0, [esp + 12]  // selector
+    movd       xmm7, [esp + 12]  // selector
    mov        ecx, [esp + 16]   // pix
-    pshufd     xmm0, xmm0, 0
+    pshufd     xmm7, xmm7, 0
  wloop:
-    movdqa     xmm1, [eax]
+    movdqa     xmm0, [eax]
    lea        eax, [eax + 16]
-    pshufb     xmm1, xmm0
+    pshufb     xmm0, xmm7
-    movd       [edx], xmm1
+    movd       [edx], xmm0
    lea        edx, [edx + 4]
    sub        ecx, 4
    ja         wloop
@@ -445,37 +55,30 @@ static void ARGBToBayerRow_SSSE3(const uint8* src_argb,
  }
 }
-#elif defined(__i386__) && !defined(COVERAGE_ENABLED) && \
+#elif (defined(__x86_64__) || defined(__i386__)) && \
-    !TARGET_IPHONE_SIMULATOR
+    !defined(COVERAGE_ENABLED) && !defined(TARGET_IPHONE_SIMULATOR)
 #define HAS_ARGBTOBAYERROW_SSSE3
-extern "C" void ARGBToBayerRow_SSSE3(const uint8* src_argb, uint8* dst_bayer,
+static void ARGBToBayerRow_SSSE3(const uint8* src_argb, uint8* dst_bayer,
-                                     uint32 selector, int pix);
+                                 uint32 selector, int pix) {
-  asm(
+  asm volatile(
-    ".text\n"
+    "movd   %3,%%xmm7\n"
-#if defined(OSX)
+    "pshufd $0x0,%%xmm7,%%xmm7\n"
-    ".globl _ARGBToBayerRow_SSSE3\n"
-"_ARGBToBayerRow_SSSE3:\n"
-#else
-    ".global ARGBToBayerRow_SSSE3\n"
-"ARGBToBayerRow_SSSE3:\n"
-#endif
-    "mov    0x4(%esp),%eax\n"
-    "mov    0x8(%esp),%edx\n"
-    "movd   0xc(%esp),%xmm0\n"
-    "mov    0x10(%esp),%ecx\n"
-    "pshufd $0x0,%xmm0,%xmm0\n"
 "1:"
-    "movdqa (%eax),%xmm1\n"
+    "movdqa (%0),%%xmm0\n"
-    "lea    0x10(%eax),%eax\n"
+    "lea    0x10(%0),%0\n"
-    "pshufb %xmm0,%xmm1\n"
+    "pshufb %%xmm7,%%xmm0\n"
-    "movd   %xmm1,(%edx)\n"
+    "movd   %%xmm0,(%1)\n"
-    "lea    0x4(%edx),%edx\n"
+    "lea    0x4(%1),%1\n"
-    "sub    $0x4,%ecx\n"
+    "sub    $0x4,%2\n"
    "ja     1b\n"
-    "ret\n"
+  : "+r"(src_argb),  // %0
+    "+r"(dst_bayer), // %1
+    "+r"(pix)        // %2
+  : "r"(selector)    // %3
+  : "memory"
 );
+}
 #endif
 static void ARGBToBayerRow_C(const uint8* src_argb,
@@ -483,12 +86,15 @@ static void ARGBToBayerRow_C(const uint8* src_argb,
  int index0 = selector & 0xff;
  int index1 = (selector >> 8) & 0xff;
  // Copy a row of Bayer.
-  for (int x = 0; x < pix; x += 2) {
+  for (int x = 0; x < (pix - 1); x += 2) {
    dst_bayer[0] = src_argb[index0];
    dst_bayer[1] = src_argb[index1];
    src_argb += 8;
    dst_bayer += 2;
  }
+  if (pix & 1) {
+    dst_bayer[0] = src_argb[index0];
+  }
 }
 // generate a selector mask useful for pshufb
@@ -504,7 +110,11 @@ int ARGBToBayerRGB(const uint8* src_rgb, int src_stride_rgb,
                   uint8* dst_bayer, int dst_stride_bayer,
                   uint32 dst_fourcc_bayer,
                   int width, int height) {
-  assert(width % 2 == 0);
+  if (height < 0) {
+    height = -height;
+    src_rgb = src_rgb + (height - 1) * src_stride_rgb;
+    src_stride_rgb = -src_stride_rgb;
+  }
  void (*ARGBToBayerRow)(const uint8* src_argb,
                         uint8* dst_bayer, uint32 selector, int pix);
 #if defined(HAS_ARGBTOBAYERROW_SSSE3)
@@ -556,4 +166,277 @@ int ARGBToBayerRGB(const uint8* src_rgb, int src_stride_rgb,
  return 0;
 }
+#define AVG(a,b) (((a) + (b)) >> 1)
+static void BayerRowBG(const uint8* src_bayer0, int src_stride_bayer,
+                       uint8* dst_rgb, int pix) {
+  const uint8* src_bayer1 = src_bayer0 + src_stride_bayer;
+  uint8 g = src_bayer0[1];
+  uint8 r = src_bayer1[1];
+  for (int x = 0; x < (pix - 2); x += 2) {
+    dst_rgb[0] = src_bayer0[0];
+    dst_rgb[1] = AVG(g, src_bayer0[1]);
+    dst_rgb[2] = AVG(r, src_bayer1[1]);
+    dst_rgb[3] = 255U;
+    dst_rgb[4] = AVG(src_bayer0[0], src_bayer0[2]);
+    dst_rgb[5] = src_bayer0[1];
+    dst_rgb[6] = src_bayer1[1];
+    dst_rgb[7] = 255U;
+    g = src_bayer0[1];
+    r = src_bayer1[1];
+    src_bayer0 += 2;
+    src_bayer1 += 2;
+    dst_rgb += 8;
+  }
+  dst_rgb[0] = src_bayer0[0];
+  dst_rgb[1] = AVG(g, src_bayer0[1]);
+  dst_rgb[2] = AVG(r, src_bayer1[1]);
+  dst_rgb[3] = 255U;
+  dst_rgb[4] = src_bayer0[0];
+  dst_rgb[5] = src_bayer0[1];
+  dst_rgb[6] = src_bayer1[1];
+  dst_rgb[7] = 255U;
+}
+static void BayerRowRG(const uint8* src_bayer0, int src_stride_bayer,
+                       uint8* dst_rgb, int pix) {
+  const uint8* src_bayer1 = src_bayer0 + src_stride_bayer;
+  uint8 g = src_bayer0[1];
+  uint8 b = src_bayer1[1];
+  for (int x = 0; x < (pix - 2); x += 2) {
+    dst_rgb[0] = AVG(b, src_bayer1[1]);
+    dst_rgb[1] = AVG(g, src_bayer0[1]);
+    dst_rgb[2] = src_bayer0[0];
+    dst_rgb[3] = 255U;
+    dst_rgb[4] = src_bayer1[1];
+    dst_rgb[5] = src_bayer0[1];
+    dst_rgb[6] = AVG(src_bayer0[0], src_bayer0[2]);
+    dst_rgb[7] = 255U;
+    g = src_bayer0[1];
+    b = src_bayer1[1];
+    src_bayer0 += 2;
+    src_bayer1 += 2;
+    dst_rgb += 8;
+  }
+  dst_rgb[0] = AVG(b, src_bayer1[1]);
+  dst_rgb[1] = AVG(g, src_bayer0[1]);
+  dst_rgb[2] = src_bayer0[0];
+  dst_rgb[3] = 255U;
+  dst_rgb[4] = src_bayer1[1];
+  dst_rgb[5] = src_bayer0[1];
+  dst_rgb[6] = src_bayer0[0];
+  dst_rgb[7] = 255U;
+}
+static void BayerRowGB(const uint8* src_bayer0, int src_stride_bayer,
+                       uint8* dst_rgb, int pix) {
+  const uint8* src_bayer1 = src_bayer0 + src_stride_bayer;
+  uint8 b = src_bayer0[1];
+  for (int x = 0; x < (pix - 2); x += 2) {
+    dst_rgb[0] = AVG(b, src_bayer0[1]);
+    dst_rgb[1] = src_bayer0[0];
+    dst_rgb[2] = src_bayer1[0];
+    dst_rgb[3] = 255U;
+    dst_rgb[4] = src_bayer0[1];
+    dst_rgb[5] = AVG(src_bayer0[0], src_bayer0[2]);
+    dst_rgb[6] = AVG(src_bayer1[0], src_bayer1[2]);
+    dst_rgb[7] = 255U;
+    b = src_bayer0[1];
+    src_bayer0 += 2;
+    src_bayer1 += 2;
+    dst_rgb += 8;
+  }
+  dst_rgb[0] = AVG(b, src_bayer0[1]);
+  dst_rgb[1] = src_bayer0[0];
+  dst_rgb[2] = src_bayer1[0];
+  dst_rgb[3] = 255U;
+  dst_rgb[4] = src_bayer0[1];
+  dst_rgb[5] = src_bayer0[0];
+  dst_rgb[6] = src_bayer1[0];
+  dst_rgb[7] = 255U;
+}
+static void BayerRowGR(const uint8* src_bayer0, int src_stride_bayer,
+                       uint8* dst_rgb, int pix) {
+  const uint8* src_bayer1 = src_bayer0 + src_stride_bayer;
+  uint8 r = src_bayer0[1];
+  for (int x = 0; x < (pix - 2); x += 2) {
+    dst_rgb[0] = src_bayer1[0];
+    dst_rgb[1] = src_bayer0[0];
+    dst_rgb[2] = AVG(r, src_bayer0[1]);
+    dst_rgb[3] = 255U;
+    dst_rgb[4] = AVG(src_bayer1[0], src_bayer1[2]);
+    dst_rgb[5] = AVG(src_bayer0[0], src_bayer0[2]);
+    dst_rgb[6] = src_bayer0[1];
+    dst_rgb[7] = 255U;
+    r = src_bayer0[1];
+    src_bayer0 += 2;
+    src_bayer1 += 2;
+    dst_rgb += 8;
+  }
+  dst_rgb[0] = src_bayer1[0];
+  dst_rgb[1] = src_bayer0[0];
+  dst_rgb[2] = AVG(r, src_bayer0[1]);
+  dst_rgb[3] = 255U;
+  dst_rgb[4] = src_bayer1[0];
+  dst_rgb[5] = src_bayer0[0];
+  dst_rgb[6] = src_bayer0[1];
+  dst_rgb[7] = 255U;
+}
+// Converts any Bayer RGB format to ARGB.
+int BayerRGBToARGB(const uint8* src_bayer, int src_stride_bayer,
+                   uint32 src_fourcc_bayer,
+                   uint8* dst_rgb, int dst_stride_rgb,
+                   int width, int height) {
+  if (height < 0) {
+    height = -height;
+    dst_rgb = dst_rgb + (height - 1) * dst_stride_rgb;
+    dst_stride_rgb = -dst_stride_rgb;
+  }
+  void (*BayerRow0)(const uint8* src_bayer, int src_stride_bayer,
+                    uint8* dst_rgb, int pix);
+  void (*BayerRow1)(const uint8* src_bayer, int src_stride_bayer,
+                    uint8* dst_rgb, int pix);
+  switch (src_fourcc_bayer) {
+    default:
+      assert(false);
+    case FOURCC_RGGB:
+      BayerRow0 = BayerRowRG;
+      BayerRow1 = BayerRowGB;
+      break;
+    case FOURCC_BGGR:
+      BayerRow0 = BayerRowBG;
+      BayerRow1 = BayerRowGR;
+      break;
+    case FOURCC_GRBG:
+      BayerRow0 = BayerRowGR;
+      BayerRow1 = BayerRowBG;
+      break;
+    case FOURCC_GBRG:
+      BayerRow0 = BayerRowGB;
+      BayerRow1 = BayerRowRG;
+      break;
+  }
+  for (int y = 0; y < (height - 1); y += 2) {
+    BayerRow0(src_bayer, src_stride_bayer, dst_rgb, width);
+    BayerRow1(src_bayer + src_stride_bayer, -src_stride_bayer,
+        dst_rgb + dst_stride_rgb, width);
+    src_bayer += src_stride_bayer * 2;
+    dst_rgb += dst_stride_rgb * 2;
+  }
+  if (height & 1) {
+    BayerRow0(src_bayer, -src_stride_bayer, dst_rgb, width);
+  }
+  return 0;
+}
+// Taken from http://en.wikipedia.org/wiki/YUV
+static FORCE_INLINE int RGBToY(uint8 r, uint8 g, uint8 b) {
+  return (( 66 * r + 129 * g +  25 * b + 128) >> 8) + 16;
+}
+static FORCE_INLINE int RGBToU(uint8 r, uint8 g, uint8 b) {
+  return ((-38 * r -  74 * g + 112 * b + 128) >> 8) + 128;
+}
+static FORCE_INLINE int RGBToV(uint8 r, uint8 g, uint8 b) {
+  return ((112 * r -  94 * g -  18 * b + 128) >> 8) + 128;
+}
+static void ARGBtoYRow(const uint8* src_argb0,
+                       uint8* dst_y, int width) {
+  for (int x = 0; x < width; ++x) {
+    dst_y[0] = RGBToY(src_argb0[2], src_argb0[1], src_argb0[0]);
+    src_argb0 += 4;
+    dst_y += 1;
+  }
+}
+static void ARGBtoUVRow(const uint8* src_argb0, int src_stride_argb,
+                        uint8* dst_u,
+                        uint8* dst_v,
+                        int width) {
+  const uint8* src_argb1 = src_argb0 + src_stride_argb;
+  for (int x = 0; x < width - 1; x += 2) {
+    uint8 ab = (src_argb0[0] + src_argb0[4] + src_argb1[0] + src_argb1[4]) >> 2;
+    uint8 ag = (src_argb0[1] + src_argb0[5] + src_argb1[1] + src_argb1[5]) >> 2;
+    uint8 ar = (src_argb0[2] + src_argb0[6] + src_argb1[2] + src_argb1[6]) >> 2;
+    dst_u[0] = RGBToU(ar, ag, ab);
+    dst_v[0] = RGBToV(ar, ag, ab);
+    src_argb0 += 8;
+    src_argb1 += 8;
+    dst_u += 1;
+    dst_v += 1;
+  }
+}
+// Converts any Bayer RGB format to I420.
+int BayerRGBToI420(const uint8* src_bayer, int src_stride_bayer,
+                   uint32 src_fourcc_bayer,
+                   uint8* dst_y, int dst_stride_y,
+                   uint8* dst_u, int dst_stride_u,
+                   uint8* dst_v, int dst_stride_v,
+                   int width, int height) {
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    int halfheight = (height + 1) >> 1;
+    dst_y = dst_y + (height - 1) * dst_stride_y;
+    dst_u = dst_u + (halfheight - 1) * dst_stride_u;
+    dst_v = dst_v + (halfheight - 1) * dst_stride_v;
+    dst_stride_y = -dst_stride_y;
+    dst_stride_u = -dst_stride_u;
+    dst_stride_v = -dst_stride_v;
+  }
+  void (*BayerRow0)(const uint8* src_bayer, int src_stride_bayer,
+                    uint8* dst_rgb, int pix);
+  void (*BayerRow1)(const uint8* src_bayer, int src_stride_bayer,
+                    uint8* dst_rgb, int pix);
+  switch (src_fourcc_bayer) {
+    default:
+      assert(false);
+    case FOURCC_RGGB:
+      BayerRow0 = BayerRowRG;
+      BayerRow1 = BayerRowGB;
+      break;
+    case FOURCC_BGGR:
+      BayerRow0 = BayerRowBG;
+      BayerRow1 = BayerRowGR;
+      break;
+    case FOURCC_GRBG:
+      BayerRow0 = BayerRowGR;
+      BayerRow1 = BayerRowBG;
+      break;
+    case FOURCC_GBRG:
+      BayerRow0 = BayerRowGB;
+      BayerRow1 = BayerRowRG;
+      break;
+  }
+#define kMaxStride 2048 * 4
+  uint8 row[kMaxStride * 2];
+  for (int y = 0; y < (height - 1); y += 2) {
+    BayerRow0(src_bayer, src_stride_bayer, row, width);
+    BayerRow1(src_bayer + src_stride_bayer, -src_stride_bayer,
+              row + kMaxStride, width);
+    ARGBtoYRow(row, dst_y, width);
+    ARGBtoYRow(row + kMaxStride, dst_y + dst_stride_y, width);
+    ARGBtoUVRow(row, kMaxStride, dst_u, dst_v, width);
+    src_bayer += src_stride_bayer * 2;
+    dst_y += dst_stride_y * 2;
+    dst_u += dst_stride_u;
+    dst_v += dst_stride_v;
+  }
+  if (height & 1) {
+    BayerRow0(src_bayer, src_stride_bayer, row, width);
+    ARGBtoYRow(row, dst_y, width);
+    ARGBtoUVRow(row, 0, dst_u, dst_v, width);
+  }
+  return 0;
+}
 }  // namespace libyuv
--- a/source/planar_functions.cc
+++ b/source/planar_functions.cc
@@ -49,28 +49,33 @@ static void SplitUV_NEON(const uint8* src_uv,
 #endif
 // Shuffle table for converting ABGR to ARGB.
-extern "C" TALIGN16(const uint8, kShuffleMaskABGRToARGB[16]) =
+extern "C" TALIGN16(const uint8, kShuffleMaskABGRToARGB[16]) = {
-  { 2u, 1u, 0u, 3u, 6u, 5u, 4u, 7u, 10u, 9u, 8u, 11u, 14u, 13u, 12u, 15u };
+  2u, 1u, 0u, 3u, 6u, 5u, 4u, 7u, 10u, 9u, 8u, 11u, 14u, 13u, 12u, 15u
+};
 // Shuffle table for converting BGRA to ARGB.
-extern "C" TALIGN16(const uint8, kShuffleMaskBGRAToARGB[16]) =
+extern "C" TALIGN16(const uint8, kShuffleMaskBGRAToARGB[16]) = {
-  { 3u, 2u, 1u, 0u, 7u, 6u, 5u, 4u, 11u, 10u, 9u, 8u, 15u, 14u, 13u, 12u };
+  3u, 2u, 1u, 0u, 7u, 6u, 5u, 4u, 11u, 10u, 9u, 8u, 15u, 14u, 13u, 12u
+};
 // Shuffle table for converting BG24 to ARGB.
-extern "C" TALIGN16(const uint8, kShuffleMaskBG24ToARGB[16]) =
+extern "C" TALIGN16(const uint8, kShuffleMaskBG24ToARGB[16]) = {
-  { 0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u };
+  0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u
+};
 // Shuffle table for converting RAW to ARGB.
-extern "C" TALIGN16(const uint8, kShuffleMaskRAWToARGB[16]) =
+extern "C" TALIGN16(const uint8, kShuffleMaskRAWToARGB[16]) = {
-  { 2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u };
+  2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u
+};
 // Constant multiplication table for converting ARGB to I400.
-extern "C" TALIGN16(const uint8, kMultiplyMaskARGBToI400[16]) =
+extern "C" TALIGN16(const uint8, kMultiplyMaskARGBToI400[16]) = {
-  { 13u, 64u, 33u, 0u, 13u, 64u, 33u, 0u,
+  13u, 64u, 33u, 0u, 13u, 64u, 33u, 0u, 13u, 64u, 33u, 0u, 13u, 64u, 33u, 0u
-    13u, 64u, 33u, 0u, 13u, 64u, 33u, 0u };
+};
-extern "C" TALIGN16(const uint8, kMultiplyMaskARGBToI400_2[16]) =
+extern "C" TALIGN16(const uint8, kMultiplyMaskARGBToI400_2[16]) = {
-  { 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u };
+  1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u
+};
 #if defined(WIN32) && !defined(COVERAGE_ENABLED)
 #define HAS_SPLITUV_SSE2
@@ -169,28 +174,7 @@ static void I420CopyPlane(const uint8* src_y, int src_stride_y,
  }
 }
-static void I420CopyPlane2(const uint8* src, int src_stride_0, int src_stride_1,
+// Copy I420 with optional flipping
-                           uint8* dst, int dst_stride,
-                           int width, int height) {
-  // Copy plane
-  for (int y = 0; y < height; y += 2) {
-    memcpy(dst, src, width);
-    src += src_stride_0;
-    dst += dst_stride;
-    memcpy(dst, src, width);
-    src += src_stride_1;
-    dst += dst_stride;
-  }
-}
-// TODO(fbarchard): For biplanar formats (ie NV21), the Y plane is the same
-// as I420, and only the chroma plane varies. Copy the Y plane by reference,
-// and just convert the UV.  This method can be used for NV21, NV12, I420,
-// I422, M422.  8 of the 12 bits is Y, so this would copy 3 times less data,
-// which is approximately how much faster it would be.
-// Helper function to copy yuv data without scaling.  Used
-// by our jpeg conversion callbacks to incrementally fill a yuv image.
 int I420Copy(const uint8* src_y, int src_stride_y,
             const uint8* src_u, int src_stride_u,
             const uint8* src_v, int src_stride_v,
@@ -198,6 +182,12 @@ int I420Copy(const uint8* src_y, int src_stride_y,
             uint8* dst_u, int dst_stride_u,
             uint8* dst_v, int dst_stride_v,
             int width, int height) {
+  if (!src_y || !src_u || !src_v ||
+      !dst_y || !dst_u || !dst_v ||
+      width <= 0 || height == 0) {
+    return -1;
+  }
  // Negative height means invert the image.
  if (height < 0) {
    height = -height;
@@ -218,6 +208,137 @@ int I420Copy(const uint8* src_y, int src_stride_y,
  return 0;
 }
+// SetRow32 writes 'count' bytes using a 32 bit value repeated.
+#if defined(__ARM_NEON__) && !defined(COVERAGE_ENABLED)
+#define HAS_SETROW_NEON
+// Fills 'count' bytes at 'dst' with the 32 bit pattern 'v32', storing one
+// 16 byte NEON register per iteration.
+//   dst   - first byte to write.
+//   v32   - 32 bit value that is replicated across the whole register.
+//   count - number of bytes to write.  The loop stores before it tests the
+//           counter, so at least 16 bytes are always written; I420SetPlane
+//           only selects this routine when the row width is a multiple of 16.
+static void SetRow32_NEON(uint8* dst, uint32 v32, int count) {
+  __asm__ volatile
+  (
+    // vdup takes a plain register operand; the braces of a register list
+    // are only valid on load/store instructions such as vst1.
+    "vdup.u32   q0, %2            \n"  // duplicate 4 ints
+    "1:\n"
+    "vst1.u32   {q0}, [%0]!       \n"  // store
+    "subs       %1, %1, #16       \n"  // 16 processed per loop
+    "bhi        1b                \n"
+  : "+r"(dst),  // %0
+    "+r"(count) // %1
+  : "r"(v32)    // %2
+  : "q0", "memory", "cc"  // subs updates the condition flags
+  );
+}
+#elif defined(WIN32) && !defined(COVERAGE_ENABLED)
+#define HAS_SETROW_SSE2
+// Fills 'count' bytes at 'dst' with the 32 bit pattern 'v32' using 16 byte
+// SSE2 stores (Visual C++ inline assembly, 32 bit x86).
+// The function is naked, so the arguments are read directly off the stack.
+// movdqa is an aligned store, so 'dst' must be 16 byte aligned.  The loop
+// stores before it tests the counter, so at least 16 bytes are always written.
+__declspec(naked)
+static void SetRow32_SSE2(uint8* dst, uint32 v32, int count) {
+  __asm {
+    mov        eax, [esp + 4]    // dst
+    movd       xmm7, [esp + 8]   // v32
+    mov        ecx, [esp + 12]   // count
+    // Broadcast the low 32 bits of xmm7 to all four lanes.
+    pshufd     xmm7, xmm7, 0
+  wloop:
+    // Store 16 bytes, then advance dst by 16.
+    movdqa     [eax], xmm7
+    lea        eax, [eax + 16]
+    // Loop again only while more than 16 bytes were still outstanding.
+    sub        ecx, 16
+    ja         wloop
+    ret
+  }
+}
+#elif (defined(__x86_64__) || defined(__i386__)) && \
+    !defined(COVERAGE_ENABLED) && !defined(TARGET_IPHONE_SIMULATOR)
+#define HAS_SETROW_SSE2
+// Fills 'count' bytes at 'dst' with the 32 bit pattern 'v32' using 16 byte
+// SSE2 stores (GCC inline assembly).
+//   dst   - first byte to write; movdqa is an aligned store, so this must be
+//           16 byte aligned.
+//   v32   - 32 bit value broadcast to all four lanes of xmm7.
+//   count - number of bytes to write.  The loop stores before it tests the
+//           counter, so at least 16 bytes are always written.
+static void SetRow32_SSE2(uint8* dst, uint32 v32, int count) {
+  asm volatile(
+  "movd       %2, %%xmm7\n"
+  "pshufd     $0x0,%%xmm7,%%xmm7\n"
+"1:"
+  "movdqa     %%xmm7,(%0)\n"
+  "lea        0x10(%0),%0\n"
+  "sub        $0x10,%1\n"
+  "ja         1b\n"
+  : "+r"(dst),  // %0
+    "+r"(count) // %1
+  : "r"(v32)    // %2
+  // sub changes the flags and xmm7 is overwritten; tell the compiler so it
+  // does not keep live values there across the asm statement.
+  : "memory", "cc"
+#if defined(__SSE2__)
+    , "xmm7"
+#endif
+);
+}
+#endif
+// Portable row fill: writes 'count' bytes at 'dst'.  memset converts the
+// fill value to unsigned char, so only the low 8 bits of 'v8' are used.
+static void SetRow8_C(uint8* dst, uint32 v8, int count) {
+  const int fill = static_cast<int>(v8);
+  const size_t bytes = static_cast<size_t>(count);
+  memset(dst, fill, bytes);
+}
+// Fills a width x height region of one plane with the byte 'value'.
+//   dst_y        - top-left byte of the region.
+//   dst_stride_y - distance in bytes between the starts of consecutive rows.
+//   width        - bytes to fill per row.
+//   height       - number of rows; non-positive heights fill nothing.
+//   value        - fill byte; the caller (I420Rect) validates it as 0..255.
+// A SIMD row function is chosen when the CPU supports it and the width,
+// pointer and stride are all multiples of 16; otherwise memset is used.
+static void I420SetPlane(uint8* dst_y, int dst_stride_y,
+                         int width, int height,
+                         int value) {
+  // The SIMD row functions store before testing their counter, so an empty
+  // row must never reach them.
+  if (width <= 0) {
+    return;
+  }
+  void (*SetRow)(uint8* dst, uint32 value, int pix);
+#if defined(HAS_SETROW_NEON)
+  if (libyuv::TestCpuFlag(libyuv::kCpuHasNEON) &&
+      (width % 16 == 0) &&
+      IS_ALIGNED(dst_y, 16) && (dst_stride_y % 16 == 0)) {
+    SetRow = SetRow32_NEON;
+  } else
+#elif defined(HAS_SETROW_SSE2)
+  if (libyuv::TestCpuFlag(libyuv::kCpuHasSSE2) &&
+      (width % 16 == 0) &&
+      IS_ALIGNED(dst_y, 16) && (dst_stride_y % 16 == 0)) {
+    SetRow = SetRow32_SSE2;
+  } else
+#endif
+  {
+    SetRow = SetRow8_C;
+  }
+
+  // Replicate the byte into all four bytes of a 32 bit word.  Widen to
+  // unsigned first: shifting a signed int holding 128..255 left by 24
+  // overflows, which is undefined behaviour.
+  const uint32 v8 = static_cast<uint32>(value);
+  uint32 v32 = v8 | (v8 << 8) | (v8 << 16) | (v8 << 24);
+  // Set plane
+  for (int y = 0; y < height; ++y) {
+    SetRow(dst_y, v32, width);
+    dst_y += dst_stride_y;
+  }
+}
+// Draw a rectangle into I420
+// Draws a filled rectangle into an I420 image.
+//   dst_y/u/v, dst_stride_y/u/v - destination planes and their row strides.
+//   x, y           - top-left corner of the rectangle in luma pixels (>= 0).
+//   width, height  - rectangle size in luma pixels.  width must be positive
+//                    and height non-zero; a negative height is accepted and
+//                    fills the same rows as its absolute value.
+//   value_y/u/v    - fill values, each in 0..255.
+// Returns 0 on success, -1 if any argument is invalid.
+// The chroma rectangle starts at (x / 2, y / 2) and is
+// ((width + 1) / 2) x ((height + 1) / 2) samples.
+int I420Rect(uint8* dst_y, int dst_stride_y,
+             uint8* dst_u, int dst_stride_u,
+             uint8* dst_v, int dst_stride_v,
+             int x, int y,
+             int width, int height,
+             int value_y, int value_u, int value_v) {
+  if (!dst_y || !dst_u || !dst_v ||
+      width <= 0 || height == 0 ||
+      x < 0 || y < 0 ||
+      value_y < 0 || value_y > 255 ||
+      value_u < 0 || value_u > 255 ||
+      value_v < 0 || value_v > 255) {
+    return -1;
+  }
+  // Negative height conventionally means an inverted image, but a constant
+  // fill is the same in either vertical direction, so only the magnitude
+  // matters.  Flipping the plane pointers by the rectangle height before
+  // applying the y offset would address rows before the start of the planes
+  // whenever y > 0.
+  if (height < 0) {
+    height = -height;
+  }
+
+  int halfwidth = (width + 1) >> 1;
+  int halfheight = (height + 1) >> 1;
+  uint8* start_y = dst_y + y * dst_stride_y + x;
+  uint8* start_u = dst_u + (y / 2) * dst_stride_u + (x / 2);
+  uint8* start_v = dst_v + (y / 2) * dst_stride_v + (x / 2);
+
+  I420SetPlane(start_y, dst_stride_y, width, height, value_y);
+  I420SetPlane(start_u, dst_stride_u, halfwidth, halfheight, value_u);
+  I420SetPlane(start_v, dst_stride_v, halfwidth, halfheight, value_v);
+  return 0;
+}
 // Helper function to copy yuv data without scaling.  Used
 // by our jpeg conversion callbacks to incrementally fill a yuv image.
 int I422ToI420(const uint8* src_y, int src_stride_y,
@@ -271,6 +392,20 @@ int I422ToI420(const uint8* src_y, int src_stride_y,
  return 0;
 }
+// Copies 'height' rows of 'width' bytes from 'src' to 'dst' when the source
+// uses two alternating row strides.
+//   src          - first source row.
+//   src_stride_0 - source advance after rows 0, 2, 4, ...
+//   src_stride_1 - source advance after rows 1, 3, 5, ...
+//   dst          - first destination row.
+//   dst_stride   - destination advance after every row.
+//   width        - bytes copied per row.
+//   height       - number of rows to copy; non-positive heights copy nothing.
+static void I420CopyPlane2(const uint8* src, int src_stride_0, int src_stride_1,
+                           uint8* dst, int dst_stride,
+                           int width, int height) {
+  // Copy plane two rows at a time.
+  for (int y = 0; y < height - 1; y += 2) {
+    memcpy(dst, src, width);
+    src += src_stride_0;
+    dst += dst_stride;
+    memcpy(dst, src, width);
+    src += src_stride_1;
+    dst += dst_stride;
+  }
+  // An odd height leaves one unpaired row; copy it on its own so the copy
+  // never touches a row beyond 'height'.
+  if (height > 0 && (height & 1)) {
+    memcpy(dst, src, width);
+  }
+}
 // Support converting from FOURCC_M420
 // Useful for bandwidth constrained transports like USB 1.0 and 2.0 and for
 // easy conversion to I420.
@@ -1238,8 +1373,7 @@ __asm {
 #define HAS_ARGBTOI400ROW_SSSE3
 __declspec(naked)
-static void ARGBToI400Row_SSSE3(const uint8* src_argb, uint8* dst_y,
+static void ARGBToI400Row_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
-                                int pix) {
 __asm {
    mov       eax, [esp + 4]   // src_argb
    mov       edx, [esp + 8]   // dst_y

--- a/source/rotate.cc
+++ b/source/rotate.cc
@@ -154,6 +154,133 @@ __asm {
  }
 }
+#define HAS_TRANSPOSE_UVWX8_SSE2
+// Transposes 8 source rows of byte-interleaved two-channel data into two
+// separate destination planes (Visual C++ inline assembly, 32 bit x86).
+// Each loop iteration loads 16 bytes from each of 8 source rows, interleaves
+// them in three rounds (bytes, words, dwords), and then writes 8 rows of
+// 8 bytes to dst_a (low half of each result register) and 8 rows of 8 bytes
+// to dst_b (high half).  The counter 'w' is decremented by 8 per iteration.
+// presumably dst_a/dst_b are the U and V planes - confirm against callers.
+// The source is read with movdqa, so each source row must be 16 byte aligned.
+// The function is naked: arguments are read from the stack, and a 16 byte
+// aligned scratch slot is carved out below the saved registers for spilling
+// one xmm register.
+__declspec(naked)
+static void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
+                                uint8* dst_a, int dst_stride_a,
+                                uint8* dst_b, int dst_stride_b,
+                                int w) {
+__asm {
+    push      ebx
+    push      esi
+    push      edi
+    push      ebp
+    mov       eax, [esp + 16 + 4]   // src
+    mov       edi, [esp + 16 + 8]   // src_stride
+    mov       edx, [esp + 16 + 12]  // dst_a
+    mov       esi, [esp + 16 + 16]  // dst_stride_a
+    mov       ebx, [esp + 16 + 20]  // dst_b
+    mov       ebp, [esp + 16 + 24]  // dst_stride_b
+    // Remember the current esp in ecx, reserve scratch space, align esp to
+    // 16 bytes for movdqa spills, and keep the old esp at [esp + 16] so it
+    // can be restored before returning.
+    mov       ecx, esp
+    sub       esp, 4 + 16
+    and       esp, ~15
+    mov       [esp + 16], ecx
+    mov       ecx, [ecx + 16 + 28]  // w
+ convertloop :
+    // Read in the data from the source pointer.
+    // First round: interleave the bytes of adjacent row pairs.
+    movdqa    xmm0, [eax]
+    movdqa    xmm1, [eax + edi]
+    lea       eax, [eax + 2 * edi]
+    movdqa    xmm7, xmm0  // use xmm7 as temp register.
+    punpcklbw xmm0, xmm1
+    punpckhbw xmm7, xmm1
+    movdqa    xmm1, xmm7
+    movdqa    xmm2, [eax]
+    movdqa    xmm3, [eax + edi]
+    lea       eax, [eax + 2 * edi]
+    movdqa    xmm7, xmm2
+    punpcklbw xmm2, xmm3
+    punpckhbw xmm7, xmm3
+    movdqa    xmm3, xmm7
+    movdqa    xmm4, [eax]
+    movdqa    xmm5, [eax + edi]
+    lea       eax, [eax + 2 * edi]
+    movdqa    xmm7, xmm4
+    punpcklbw xmm4, xmm5
+    punpckhbw xmm7, xmm5
+    movdqa    xmm5, xmm7
+    movdqa    xmm6, [eax]
+    movdqa    xmm7, [eax + edi]
+    lea       eax, [eax + 2 * edi]
+    movdqa    [esp], xmm5  // backup xmm5
+    neg       edi
+    movdqa    xmm5, xmm6   // use xmm5 as temp register.
+    punpcklbw xmm6, xmm7
+    punpckhbw xmm5, xmm7
+    movdqa    xmm7, xmm5
+    // edi is negated here, so this rewinds src by 8 rows and advances it by
+    // 16 bytes to the next group of columns.
+    lea       eax, [eax + 8 * edi + 16]
+    neg       edi
+    // Second round: interleave 16 bit words.
+    movdqa    xmm5, xmm0
+    punpcklwd xmm0, xmm2
+    punpckhwd xmm5, xmm2
+    movdqa    xmm2, xmm5
+    movdqa    xmm5, xmm1
+    punpcklwd xmm1, xmm3
+    punpckhwd xmm5, xmm3
+    movdqa    xmm3, xmm5
+    movdqa    xmm5, xmm4
+    punpcklwd xmm4, xmm6
+    punpckhwd xmm5, xmm6
+    movdqa    xmm6, xmm5
+    movdqa    xmm5, [esp]  // restore xmm5
+    movdqa    [esp], xmm6  // backup xmm6
+    movdqa    xmm6, xmm5    // use xmm6 as temp register.
+    punpcklwd xmm5, xmm7
+    punpckhwd xmm6, xmm7
+    movdqa    xmm7, xmm6
+    // Third round: interleave 32 bit dwords.
+    // Write to the destination pointers: the low 8 bytes of each result go
+    // to dst_a (edx) and the high 8 bytes to dst_b (ebx).
+    movdqa    xmm6, xmm0
+    punpckldq xmm0, xmm4
+    punpckhdq xmm6, xmm4
+    movdqa    xmm4, xmm6
+    movdqa    xmm6, [esp]  // restore xmm6
+    movlpd    qword ptr [edx], xmm0
+    movhpd    qword ptr [ebx], xmm0
+    movlpd    qword ptr [edx + esi], xmm4
+    lea       edx, [edx + 2 * esi]
+    movhpd    qword ptr [ebx + ebp], xmm4
+    lea       ebx, [ebx + 2 * ebp]
+    movdqa    xmm0, xmm2   // use xmm0 as the temp register.
+    punpckldq xmm2, xmm6
+    movlpd    qword ptr [edx], xmm2
+    movhpd    qword ptr [ebx], xmm2
+    punpckhdq xmm0, xmm6
+    movlpd    qword ptr [edx + esi], xmm0
+    lea       edx, [edx + 2 * esi]
+    movhpd    qword ptr [ebx + ebp], xmm0
+    lea       ebx, [ebx + 2 * ebp]
+    movdqa    xmm0, xmm1   // use xmm0 as the temp register.
+    punpckldq xmm1, xmm5
+    movlpd    qword ptr [edx], xmm1
+    movhpd    qword ptr [ebx], xmm1
+    punpckhdq xmm0, xmm5
+    movlpd    qword ptr [edx + esi], xmm0
+    lea       edx, [edx + 2 * esi]
+    movhpd    qword ptr [ebx + ebp], xmm0
+    lea       ebx, [ebx + 2 * ebp]
+    movdqa    xmm0, xmm3   // use xmm0 as the temp register.
+    punpckldq xmm3, xmm7
+    movlpd    qword ptr [edx], xmm3
+    movhpd    qword ptr [ebx], xmm3
+    punpckhdq xmm0, xmm7
+    movlpd    qword ptr [edx + esi], xmm0
+    lea       edx, [edx + 2 * esi]
+    movhpd    qword ptr [ebx + ebp], xmm0
+    lea       ebx, [ebx + 2 * ebp]
+    // 8 counts of w are consumed per iteration.
+    sub       ecx, 8
+    ja        convertloop
+    // Restore the esp saved before the scratch area was aligned.
+    mov       esp, [esp + 16]
+    pop       ebp
+    pop       edi
+    pop       esi
+    pop       ebx
+    ret
+  }
+}
 #elif (defined(__i386__) || defined(__x86_64__)) && \
    !defined(COVERAGE_ENABLED) && !defined(TARGET_IPHONE_SIMULATOR)
 #define HAS_TRANSPOSE_WX8_SSSE3
@@ -240,15 +367,134 @@ static void TransposeWx8_SSSE3(const uint8* src, int src_stride,
 );
 }
+// TODO(fbarchard): Port to 32 bit
+#if defined (__x86_64__)
+#define HAS_TRANSPOSE_UVWX8_SSE2
+// Transposes 8 source rows of byte-interleaved UV data into separate U and V
+// destination planes (GCC inline assembly, x86_64 only: uses xmm8 and xmm9).
+//   src, src_stride       - source rows; read with movdqa, so each row must
+//                           be 16 byte aligned.
+//   dst_a, dst_stride_a   - destination for the low 8 bytes of each result
+//                           register (the U channel).
+//   dst_b, dst_stride_b   - destination for the high 8 bytes (the V channel).
+//   w                     - loop counter, decremented by 8 per iteration.
+// Each iteration loads 16 bytes from each of 8 rows, interleaves them in
+// three rounds (bytes, words, dwords) and writes 8 rows of 8 bytes to each
+// destination.
+static void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
+                                uint8* dst_a, int dst_stride_a,
+                                uint8* dst_b, int dst_stride_b,
+                                int w) {
+  asm volatile(
+"1:"
+  // Read in the data from the source pointer.
+  // First round: interleave the bytes of adjacent row pairs.
+  "movdqa     (%0),%%xmm0\n"
+  "movdqa     (%0,%4),%%xmm1\n"
+  "lea        (%0,%4,2),%0\n"
+  "movdqa     %%xmm0,%%xmm8\n"
+  "punpcklbw  %%xmm1,%%xmm0\n"
+  "punpckhbw  %%xmm1,%%xmm8\n"
+  "movdqa     %%xmm8,%%xmm1\n"
+  "movdqa     (%0),%%xmm2\n"
+  "movdqa     (%0,%4),%%xmm3\n"
+  "lea        (%0,%4,2),%0\n"
+  "movdqa     %%xmm2,%%xmm8\n"
+  "punpcklbw  %%xmm3,%%xmm2\n"
+  "punpckhbw  %%xmm3,%%xmm8\n"
+  "movdqa     %%xmm8,%%xmm3\n"
+  "movdqa     (%0),%%xmm4\n"
+  "movdqa     (%0,%4),%%xmm5\n"
+  "lea        (%0,%4,2),%0\n"
+  "movdqa     %%xmm4,%%xmm8\n"
+  "punpcklbw  %%xmm5,%%xmm4\n"
+  "punpckhbw  %%xmm5,%%xmm8\n"
+  "movdqa     %%xmm8,%%xmm5\n"
+  "movdqa     (%0),%%xmm6\n"
+  "movdqa     (%0,%4),%%xmm7\n"
+  "lea        (%0,%4,2),%0\n"
+  "movdqa     %%xmm6,%%xmm8\n"
+  "punpcklbw  %%xmm7,%%xmm6\n"
+  // The stride is negated, used to rewind src by 8 rows while advancing it
+  // by 16 bytes, and then negated back to its original value.
+  "neg        %4\n"
+  "lea        0x10(%0,%4,8),%0\n"
+  "punpckhbw  %%xmm7,%%xmm8\n"
+  "movdqa     %%xmm8,%%xmm7\n"
+  "neg        %4\n"
+   // Second round: interleave 16 bit words.
+  "movdqa     %%xmm0,%%xmm8\n"
+  "movdqa     %%xmm1,%%xmm9\n"
+  "punpckhwd  %%xmm2,%%xmm8\n"
+  "punpckhwd  %%xmm3,%%xmm9\n"
+  "punpcklwd  %%xmm2,%%xmm0\n"
+  "punpcklwd  %%xmm3,%%xmm1\n"
+  "movdqa     %%xmm8,%%xmm2\n"
+  "movdqa     %%xmm9,%%xmm3\n"
+  "movdqa     %%xmm4,%%xmm8\n"
+  "movdqa     %%xmm5,%%xmm9\n"
+  "punpckhwd  %%xmm6,%%xmm8\n"
+  "punpckhwd  %%xmm7,%%xmm9\n"
+  "punpcklwd  %%xmm6,%%xmm4\n"
+  "punpcklwd  %%xmm7,%%xmm5\n"
+  "movdqa     %%xmm8,%%xmm6\n"
+  "movdqa     %%xmm9,%%xmm7\n"
+  // Third round: interleave 32 bit dwords.
+  // Write to the destination pointers.
+  "movdqa     %%xmm0,%%xmm8\n"
+  "punpckldq  %%xmm4,%%xmm0\n"
+  "movlpd     %%xmm0,(%1)\n"  // Write back U channel
+  "movhpd     %%xmm0,(%2)\n"  // Write back V channel
+  "punpckhdq  %%xmm4,%%xmm8\n"
+  "movlpd     %%xmm8,(%1,%5)\n"
+  "lea        (%1,%5,2),%1\n"
+  "movhpd     %%xmm8,(%2,%6)\n"
+  "lea        (%2,%6,2),%2\n"
+  "movdqa     %%xmm2,%%xmm8\n"
+  "punpckldq  %%xmm6,%%xmm2\n"
+  "movlpd     %%xmm2,(%1)\n"
+  "movhpd     %%xmm2,(%2)\n"
+  "punpckhdq  %%xmm6,%%xmm8\n"
+  "movlpd     %%xmm8,(%1,%5)\n"
+  "lea        (%1,%5,2),%1\n"
+  "movhpd     %%xmm8,(%2,%6)\n"
+  "lea        (%2,%6,2),%2\n"
+  "movdqa     %%xmm1,%%xmm8\n"
+  "punpckldq  %%xmm5,%%xmm1\n"
+  "movlpd     %%xmm1,(%1)\n"
+  "movhpd     %%xmm1,(%2)\n"
+  "punpckhdq  %%xmm5,%%xmm8\n"
+  "movlpd     %%xmm8,(%1,%5)\n"
+  "lea        (%1,%5,2),%1\n"
+  "movhpd     %%xmm8,(%2,%6)\n"
+  "lea        (%2,%6,2),%2\n"
+  "movdqa     %%xmm3,%%xmm8\n"
+  "punpckldq  %%xmm7,%%xmm3\n"
+  "movlpd     %%xmm3,(%1)\n"
+  "movhpd     %%xmm3,(%2)\n"
+  "punpckhdq  %%xmm7,%%xmm8\n"
+  "movlpd     %%xmm8,(%1,%5)\n"
+  "lea        (%1,%5,2),%1\n"
+  "movhpd     %%xmm8,(%2,%6)\n"
+  "lea        (%2,%6,2),%2\n"
+  "sub        $0x8,%3\n"
+  "ja         1b\n"
+  : "+r"(src),    // %0
+    "+r"(dst_a),  // %1
+    "+r"(dst_b),  // %2
+    "+r"(w)   // %3
+  : "r"(static_cast<intptr_t>(src_stride)),    // %4
+    "r"(static_cast<intptr_t>(dst_stride_a)),  // %5
+    "r"(static_cast<intptr_t>(dst_stride_b))   // %6
+  // Every xmm register written above and the flags (neg/sub) must be listed
+  // so the compiler does not keep live values in them across the asm.
+  : "memory", "cc",
+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4",
+    "xmm5", "xmm6", "xmm7", "xmm8", "xmm9"
+);
+}
+#endif
 #endif
 static void TransposeWx8_C(const uint8* src, int src_stride,
                           uint8* dst, int dst_stride,
                           int w) {
-  int i, j;
+  int i;
-  for (i = 0; i < w; ++i)
+  for (i = 0; i < w; ++i) {
-    for (j = 0; j < 8; ++j)
+    dst[0] = src[0 * src_stride];
-      dst[i * dst_stride + j] = src[j * src_stride + i];
+    dst[1] = src[1 * src_stride];
+    dst[2] = src[2 * src_stride];
+    dst[3] = src[3 * src_stride];
+    dst[4] = src[4 * src_stride];
+    dst[5] = src[5 * src_stride];
+    dst[6] = src[6 * src_stride];
+    dst[7] = src[7 * src_stride];
+    ++src;
+    dst += dst_stride;
+  }
 }
 static void TransposeWxH_C(const uint8* src, int src_stride,
@@ -328,10 +574,10 @@ void RotatePlane270(const uint8* src, int src_stride,
 static void ReverseLine_C(const uint8* src, uint8* dst, int width) {
  int i;
-  src += width;
+  src += width - 1;
  for (i = 0; i < width; ++i) {
-    --src;
    dst[i] = src[0];
+    --src;
  }
 }
@@ -407,15 +653,13 @@ void RotatePlane180(const uint8* src, int src_stride,
  {
    ReverseLine = ReverseLine_C;
  }
-  // Rotate by 180 is a mirror with the destination
+  // Rotate by 180 is a mirror and vertical flip
-  // written in reverse.
+  src += src_stride * (height - 1);
-  dst += dst_stride * (height - 1);
  for (i = 0; i < height; ++i) {
    ReverseLine(src, dst, width);
+    src -= src_stride;
-    src += src_stride;
+    dst += dst_stride;
-    dst -= dst_stride;
  }
 }
@@ -423,11 +667,27 @@ static void TransposeUVWx8_C(const uint8* src, int src_stride,
                             uint8* dst_a, int dst_stride_a,
                             uint8* dst_b, int dst_stride_b,
                             int w) {
-  int i, j;
+  int i;
-  for (i = 0; i < w * 2; i += 2)
+  for (i = 0; i < w; ++i) {
-    for (j = 0; j < 8; ++j) {
+    dst_a[0] = src[0 * src_stride + 0];
-      dst_a[j + ((i >> 1) * dst_stride_a)] = src[i + (j * src_stride)];
+    dst_b[0] = src[0 * src_stride + 1];
-      dst_b[j + ((i >> 1) * dst_stride_b)] = src[i + (j * src_stride) + 1];
+    dst_a[1] = src[1 * src_stride + 0];
+    dst_b[1] = src[1 * src_stride + 1];
+    dst_a[2] = src[2 * src_stride + 0];
+    dst_b[2] = src[2 * src_stride + 1];
+    dst_a[3] = src[3 * src_stride + 0];
+    dst_b[3] = src[3 * src_stride + 1];
+    dst_a[4] = src[4 * src_stride + 0];
+    dst_b[4] = src[4 * src_stride + 1];
+    dst_a[5] = src[5 * src_stride + 0];
+    dst_b[5] = src[5 * src_stride + 1];
+    dst_a[6] = src[6 * src_stride + 0];
+    dst_b[6] = src[6 * src_stride + 1];
+    dst_a[7] = src[7 * src_stride + 0];
+    dst_b[7] = src[7 * src_stride + 1];
+    src += 2;
+    dst_a += dst_stride_a;
+    dst_b += dst_stride_b;
  }
 }
@@ -436,7 +696,7 @@ static void TransposeUVWxH_C(const uint8* src, int src_stride,
                             uint8* dst_b, int dst_stride_b,
                             int w, int h) {
  int i, j;
-  for (i = 0; i < w*2; i += 2)
+  for (i = 0; i < w * 2; i += 2)
    for (j = 0; j < h; ++j) {
      dst_a[j + ((i >> 1) * dst_stride_a)] = src[i + (j * src_stride)];
      dst_b[j + ((i >> 1) * dst_stride_b)] = src[i + (j * src_stride) + 1];
@@ -452,12 +712,8 @@ void TransposeUV(const uint8* src, int src_stride,
  rotate_uv_wxh_func TransposeWxH;
 #if defined(HAS_TRANSPOSE_UVWX8_NEON)
-  if (libyuv::TestCpuFlag(libyuv::kCpuHasNEON) &&
-      (width % 8 == 0) &&
-      IS_ALIGNED(src, 16) && (src_stride % 8 == 0) &&
-      IS_ALIGNED(dst_a, 16) && (dst_stride_a % 8 == 0) &&
-      IS_ALIGNED(dst_b, 16) && (dst_stride_b % 8 == 0)) {
  unsigned long long store_reg[8];
+  if (libyuv::TestCpuFlag(libyuv::kCpuHasNEON)) {
    SaveRegisters_NEON(store_reg);
    TransposeWx8 = TransposeUVWx8_NEON;
    TransposeWxH = TransposeUVWxH_C;
@@ -466,9 +722,9 @@ void TransposeUV(const uint8* src, int src_stride,
 #if defined(HAS_TRANSPOSE_UVWX8_SSE2)
  if (libyuv::TestCpuFlag(libyuv::kCpuHasSSE2) &&
      (width % 8 == 0) &&
-      IS_ALIGNED(src, 16) && (src_stride % 8 == 0) &&
+      IS_ALIGNED(src, 16) && (src_stride % 16 == 0) &&
-      IS_ALIGNED(dst_a, 16) && (dst_stride_a % 8 == 0) &&
+      IS_ALIGNED(dst_a, 8) && (dst_stride_a % 8 == 0) &&
-      IS_ALIGNED(dst_b, 16) && (dst_stride_b % 8 == 0)) {
+      IS_ALIGNED(dst_b, 8) && (dst_stride_b % 8 == 0)) {
    TransposeWx8 = TransposeUVWx8_SSE2;
    TransposeWxH = TransposeUVWxH_C;
  } else
@@ -544,7 +800,7 @@ __asm {
    mov       edi, [esp + 4 + 12]  // dst_b
    mov       ecx, [esp + 4 + 16]  // width
    movdqa    xmm7, _kShuffleReverseUV
-    lea       eax, [eax + 2 * ecx - 16]
+    lea       eax, [eax + ecx * 2 - 16]
 convertloop :
    movdqa    xmm0, [eax]
@@ -610,13 +866,12 @@ void RotateUV180(const uint8* src, int src_stride,
  int i;
  reverse_uv_func ReverseLine;
-  // TODO(frkoenig) : do processor detection here.
 #if defined(HAS_REVERSE_LINE_UV_NEON)
  if (libyuv::TestCpuFlag(libyuv::kCpuHasNEON) &&
      (width % 16 == 0) &&
      IS_ALIGNED(src, 16) && (src_stride % 16 == 0) &&
-      IS_ALIGNED(dst_a, 16) && (dst_stride_a % 8 == 0) &&
+      IS_ALIGNED(dst_a, 8) && (dst_stride_a % 8 == 0) &&
-      IS_ALIGNED(dst_b, 16) && (dst_stride_b % 8 == 0) ) {
+      IS_ALIGNED(dst_b, 8) && (dst_stride_b % 8 == 0) ) {
    ReverseLine = ReverseLineUV_NEON;
  } else
 #endif
@@ -624,8 +879,8 @@ void RotateUV180(const uint8* src, int src_stride,
  if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) &&
      (width % 16 == 0) &&
      IS_ALIGNED(src, 16) && (src_stride % 16 == 0) &&
-      IS_ALIGNED(dst_a, 16) && (dst_stride_a % 8 == 0) &&
+      IS_ALIGNED(dst_a, 8) && (dst_stride_a % 8 == 0) &&
-      IS_ALIGNED(dst_b, 16) && (dst_stride_b % 8 == 0) ) {
+      IS_ALIGNED(dst_b, 8) && (dst_stride_b % 8 == 0) ) {
    ReverseLine = ReverseLineUV_SSSE3;
  } else
 #endif
@@ -669,7 +924,7 @@ int I420Rotate(const uint8* src_y, int src_stride_y,
  }
  switch (mode) {
-    case kRotateNone:
+    case kRotate0:
      // copy frame
      return I420Copy(src_y, src_stride_y,
                      src_u, src_stride_u,
@@ -678,7 +933,7 @@ int I420Rotate(const uint8* src_y, int src_stride_y,
                      dst_u, dst_stride_u,
                      dst_v, dst_stride_v,
                      width, height);
-    case kRotateClockwise:
+    case kRotate90:
      RotatePlane90(src_y, src_stride_y,
                    dst_y, dst_stride_y,
                    width, height);
@@ -689,7 +944,7 @@ int I420Rotate(const uint8* src_y, int src_stride_y,
                    dst_v, dst_stride_v,
                    halfwidth, halfheight);
      return 0;
-    case kRotateCounterClockwise:
+    case kRotate270:
      RotatePlane270(src_y, src_stride_y,
                     dst_y, dst_stride_y,
                     width, height);
@@ -738,14 +993,14 @@ int NV12ToI420Rotate(const uint8* src_y, int src_stride_y,
  }
  switch (mode) {
-    case kRotateNone:
+    case kRotate0:
      // copy frame
      return NV12ToI420(src_y, src_uv, src_stride_y,
                        dst_y, dst_stride_y,
                        dst_u, dst_stride_u,
                        dst_v, dst_stride_v,
                        width, height);
-    case kRotateClockwise:
+    case kRotate90:
      RotatePlane90(src_y, src_stride_y,
                    dst_y, dst_stride_y,
                    width, height);
@@ -754,7 +1009,7 @@ int NV12ToI420Rotate(const uint8* src_y, int src_stride_y,
                 dst_v, dst_stride_v,
                 halfwidth, halfheight);
      return 0;
-    case kRotateCounterClockwise:
+    case kRotate270:
      RotatePlane270(src_y, src_stride_y,
                     dst_y, dst_stride_y,
                     width, height);