Commit 032b5f99 authored by fbarchard@google.com

Port the I420ToYUY2 code to support I420ToYUY2, I422ToYUY2, and I420ToV210.

BUG=none
TEST=none
Review URL: https://webrtc-codereview.appspot.com/388011

git-svn-id: http://libyuv.googlecode.com/svn/trunk@173 16f28f9a-4ce2-e073-06de-1de4eb20be90
parent 798197fc
Name: libyuv
URL: http://code.google.com/p/libyuv/
-Version: 172
+Version: 173
License: BSD
License File: LICENSE
......
@@ -69,6 +69,19 @@ int I420ToUYVY(const uint8* src_y, int src_stride_y,
               uint8* dst_frame, int dst_stride_frame,
               int width, int height);
int I422ToUYVY(const uint8* src_y, int src_stride_y,
               const uint8* src_u, int src_stride_u,
               const uint8* src_v, int src_stride_v,
               uint8* dst_frame, int dst_stride_frame,
               int width, int height);

int I420ToV210(const uint8* src_y, int src_stride_y,
               const uint8* src_u, int src_stride_u,
               const uint8* src_v, int src_stride_v,
               uint8* dst_frame, int dst_stride_frame,
               int width, int height);
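// Usage sketch (illustration only, not part of this change; assumes the
// V210 stride rule (width + 47) / 48 * 128 described in convert_from.cc):
//
//   int dst_stride = (width + 47) / 48 * 128;  // 6 pixels per 16 bytes
//   std::vector<uint8> dst(dst_stride * height);
//   I420ToV210(y, y_stride, u, u_stride, v, v_stride,
//              &dst[0], dst_stride, width, height);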
int RGB24ToI420(const uint8* src_frame, int src_stride_frame,
                uint8* dst_y, int dst_stride_y,
                uint8* dst_u, int dst_stride_u,
......
@@ -11,7 +11,7 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_
#define INCLUDE_LIBYUV_VERSION_H_
-#define LIBYUV_VERSION 172
+#define LIBYUV_VERSION 173
#endif  // INCLUDE_LIBYUV_VERSION_H_
@@ -10,12 +10,6 @@
#include "libyuv/convert.h"
-//#define SCALEOPT //Currently for windows only. June 2010
-#ifdef SCALEOPT
-#include <emmintrin.h>  // Not currently used
-#endif
#include "libyuv/basic_types.h" #include "libyuv/basic_types.h"
#include "libyuv/cpu_id.h" #include "libyuv/cpu_id.h"
#include "libyuv/format_conversion.h" #include "libyuv/format_conversion.h"
...@@ -32,6 +26,9 @@ extern "C" { ...@@ -32,6 +26,9 @@ extern "C" {
// YUY2 - Macro-pixel = 2 image pixels // YUY2 - Macro-pixel = 2 image pixels
// Y0U0Y1V0....Y2U2Y3V2...Y4U4Y5V4.... // Y0U0Y1V0....Y2U2Y3V2...Y4U4Y5V4....
// UYVY - Macro-pixel = 2 image pixels
// U0Y0V0Y1
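// For reference, both packings carry two image pixels per 4-byte macro-pixel
// and differ only in byte order. Scalar illustration (these helpers are
// hypothetical, not part of this change):
static inline void PackYUY2(uint8 y0, uint8 y1, uint8 u, uint8 v, uint8* dst) {
  dst[0] = y0; dst[1] = u; dst[2] = y1; dst[3] = v;  // Y0 U0 Y1 V0
}
static inline void PackUYVY(uint8 y0, uint8 y1, uint8 u, uint8 v, uint8* dst) {
  dst[0] = u; dst[1] = y0; dst[2] = v; dst[3] = y1;  // U0 Y0 V0 Y1
}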
#if defined(_M_IX86) && !defined(YUV_DISABLE_ASM)
#define HAS_I42XTOYUY2ROW_SSE2
__declspec(naked)
@@ -50,12 +47,12 @@ static void I42xToYUY2Row_SSE2(const uint8* src_y,
    sub        edx, esi
  convertloop:
-    movdqa     xmm0, [eax]  // Y
-    lea        eax, [eax + 16]
    movq       xmm2, qword ptr [esi]  // U
    movq       xmm3, qword ptr [esi + edx]  // V
    lea        esi, [esi + 8]
    punpcklbw  xmm2, xmm3  // UV
+    movdqa     xmm0, [eax]  // Y
+    lea        eax, [eax + 16]
    movdqa     xmm1, xmm0
    punpcklbw  xmm0, xmm2  // YUYV
    punpckhbw  xmm1, xmm2
@@ -70,6 +67,44 @@ static void I42xToYUY2Row_SSE2(const uint8* src_y,
    ret
  }
}
#define HAS_I42XTOUYVYROW_SSE2
__declspec(naked)
static void I42xToUYVYRow_SSE2(const uint8* src_y,
                               const uint8* src_u,
                               const uint8* src_v,
                               uint8* dst_frame, int width) {
  __asm {
    push       esi
    push       edi
    mov        eax, [esp + 8 + 4]   // src_y
    mov        esi, [esp + 8 + 8]   // src_u
    mov        edx, [esp + 8 + 12]  // src_v
    mov        edi, [esp + 8 + 16]  // dst_frame
    mov        ecx, [esp + 8 + 20]  // width
    sub        edx, esi
  convertloop:
    movq       xmm2, qword ptr [esi]  // U
    movq       xmm3, qword ptr [esi + edx]  // V
    lea        esi, [esi + 8]
    punpcklbw  xmm2, xmm3  // UV
    movdqa     xmm0, [eax]  // Y
    movdqa     xmm1, xmm2
    lea        eax, [eax + 16]
    punpcklbw  xmm1, xmm0  // UYVY
    punpckhbw  xmm2, xmm0
    movdqa     [edi], xmm1
    movdqa     [edi + 16], xmm2
    lea        edi, [edi + 32]
    sub        ecx, 16
    ja         convertloop
    pop        edi
    pop        esi
    ret
  }
}
#elif (defined(__x86_64__) || defined(__i386__)) && !defined(YUV_DISABLE_ASM)
#define HAS_I42XTOYUY2ROW_SSE2
static void I42xToYUY2Row_SSE2(const uint8* src_y,
@@ -77,33 +112,68 @@ static void I42xToYUY2Row_SSE2(const uint8* src_y,
                               const uint8* src_v,
                               uint8* dst_frame, int width) {
  asm volatile (
    "sub        %1,%2                         \n"
  "1:                                         \n"
-    "movdqa     (%0),%%xmm0                   \n"
-    "lea        0x10(%0),%0                   \n"
    "movq       (%1),%%xmm2                   \n"
    "movq       (%1,%2,1),%%xmm3              \n"
    "lea        0x8(%1),%1                    \n"
    "punpcklbw  %%xmm3,%%xmm2                 \n"
+    "movdqa     (%0),%%xmm0                   \n"
+    "lea        0x10(%0),%0                   \n"
    "movdqa     %%xmm0,%%xmm1                 \n"
    "punpcklbw  %%xmm2,%%xmm0                 \n"
    "punpckhbw  %%xmm2,%%xmm1                 \n"
    "movdqa     %%xmm0,(%3)                   \n"
    "movdqa     %%xmm1,0x10(%3)               \n"
    "lea        0x20(%3),%3                   \n"
    "sub        $0x10,%4                      \n"
    "ja         1b                            \n"
  : "+r"(src_y),     // %0
    "+r"(src_u),     // %1
    "+r"(src_v),     // %2
    "+r"(dst_frame), // %3
    "+rm"(width)     // %4
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3"
#endif
  );
}

#define HAS_I42XTOUYVYROW_SSE2
static void I42xToUYVYRow_SSE2(const uint8* src_y,
                               const uint8* src_u,
                               const uint8* src_v,
                               uint8* dst_frame, int width) {
  asm volatile (
    "sub        %1,%2                         \n"
  "1:                                         \n"
    "movq       (%1),%%xmm2                   \n"
    "movq       (%1,%2,1),%%xmm3              \n"
    "lea        0x8(%1),%1                    \n"
    "punpcklbw  %%xmm3,%%xmm2                 \n"
    "movdqa     (%0),%%xmm0                   \n"
    "movdqa     %%xmm2,%%xmm1                 \n"
    "lea        0x10(%0),%0                   \n"
    "punpcklbw  %%xmm0,%%xmm1                 \n"
    "punpckhbw  %%xmm0,%%xmm2                 \n"
    "movdqa     %%xmm1,(%3)                   \n"
    "movdqa     %%xmm2,0x10(%3)               \n"
    "lea        0x20(%3),%3                   \n"
    "sub        $0x10,%4                      \n"
    "ja         1b                            \n"
  : "+r"(src_y),     // %0
    "+r"(src_u),     // %1
    "+r"(src_v),     // %2
    "+r"(dst_frame), // %3
    "+rm"(width)     // %4
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3"
#endif
  );
}
#endif
@@ -127,6 +197,104 @@ void I42xToYUY2Row_C(const uint8* src_y, const uint8* src_u, const uint8* src_v,
  }
}
void I42xToUYVYRow_C(const uint8* src_y, const uint8* src_u, const uint8* src_v,
                     uint8* dst_frame, int width) {
  for (int x = 0; x < width - 1; x += 2) {
    dst_frame[0] = src_u[0];
    dst_frame[1] = src_y[0];
    dst_frame[2] = src_v[0];
    dst_frame[3] = src_y[1];
    dst_frame += 4;
    src_y += 2;
    src_u += 1;
    src_v += 1;
  }
  if (width & 1) {
    dst_frame[0] = src_u[0];
    dst_frame[1] = src_y[0];
    dst_frame[2] = src_v[0];
    dst_frame[3] = src_y[0];  // duplicate last y
  }
}
// gcc provided macros
#if defined(__BYTE_ORDER) && defined(__LITTLE_ENDIAN) && defined(__BIG_ENDIAN)
#if __BYTE_ORDER == __LITTLE_ENDIAN
#define LIBYUV_LITTLE_ENDIAN
#endif
// Visual C for x86 defines these
#elif defined(_M_X64) || defined(_M_IX86)
#define LIBYUV_LITTLE_ENDIAN
#endif

#ifdef LIBYUV_LITTLE_ENDIAN
#define READWORD(p) (*((uint32*) (p)))
#define WRITEWORD(p, v) (*((uint32*) (p))) = v
#else
uint32 READWORD(const uint8* p) {
  return (uint32) p[0] |
         ((uint32) (p[1]) << 8) |
         ((uint32) (p[2]) << 16) |
         ((uint32) (p[3]) << 24);
}
void WRITEWORD(uint8* p, uint32 v) {
  p[0] = (uint8)(v & 255);
  p[1] = (uint8)((v >> 8) & 255);
  p[2] = (uint8)((v >> 16) & 255);
  p[3] = (uint8)((v >> 24) & 255);
}
#endif
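// Both paths store the 32-bit word least-significant byte first; the scalar
// fallback spells out what the little-endian direct store does implicitly.
// Illustration only (not part of this change):
//   WRITEWORD(p, 0x12345678u);  // leaves p[0..3] = 0x78 0x56 0x34 0x12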
// Must be multiple of 6 pixels. Will over convert to handle remainder.
// https://developer.apple.com/quicktime/icefloe/dispatch019.html#v210
void V210ToUYVYRow_C(const uint8* src_v210, uint8* dst_uyvy, int width) {
  for (int x = 0; x < width; x += 6) {
    uint32 w = READWORD(src_v210 + 0);
    dst_uyvy[0] = (w >> 2) & 0xff;
    dst_uyvy[1] = (w >> 12) & 0xff;
    dst_uyvy[2] = (w >> 22) & 0xff;
    w = READWORD(src_v210 + 4);
    dst_uyvy[3] = (w >> 2) & 0xff;
    dst_uyvy[4] = (w >> 12) & 0xff;
    dst_uyvy[5] = (w >> 22) & 0xff;
    w = READWORD(src_v210 + 8);
    dst_uyvy[6] = (w >> 2) & 0xff;
    dst_uyvy[7] = (w >> 12) & 0xff;
    dst_uyvy[8] = (w >> 22) & 0xff;
    w = READWORD(src_v210 + 12);
    dst_uyvy[9] = (w >> 2) & 0xff;
    dst_uyvy[10] = (w >> 12) & 0xff;
    dst_uyvy[11] = (w >> 22) & 0xff;
    src_v210 += 16;
    dst_uyvy += 12;
  }
}
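// Worked example (values chosen for illustration; this helper is not part of
// the change): each 32-bit word holds three 10-bit components at bit offsets
// 0, 10 and 20, with the top two bits unused; ">> 2" keeps the high 8 bits
// of each component.
static void V210WordIllustration() {
  uint32 w = (0x0ABu << 20) | (0x155u << 10) | 0x200u;  // V0, Y0, U0 packed
  uint8 u0 = (w >> 2) & 0xff;   // 0x80: high 8 bits of U0 = 0x200
  uint8 y0 = (w >> 12) & 0xff;  // 0x55: high 8 bits of Y0 = 0x155
  uint8 v0 = (w >> 22) & 0xff;  // 0x2A: high 8 bits of V0 = 0x0AB
  (void)u0; (void)y0; (void)v0;
}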
#define EIGHTTOTEN(x) (x << 2 | x >> 6)
void UYVYToV210Row_C(const uint8* src_uyvy, uint8* dst_v210, int width) {
  for (int x = 0; x < width; x += 6) {
    WRITEWORD(dst_v210 + 0, (EIGHTTOTEN(src_uyvy[0])) |
                            (EIGHTTOTEN(src_uyvy[1]) << 10) |
                            (EIGHTTOTEN(src_uyvy[2]) << 20));
    WRITEWORD(dst_v210 + 4, (EIGHTTOTEN(src_uyvy[3])) |
                            (EIGHTTOTEN(src_uyvy[4]) << 10) |
                            (EIGHTTOTEN(src_uyvy[5]) << 20));
    WRITEWORD(dst_v210 + 8, (EIGHTTOTEN(src_uyvy[6])) |
                            (EIGHTTOTEN(src_uyvy[7]) << 10) |
                            (EIGHTTOTEN(src_uyvy[8]) << 20));
    WRITEWORD(dst_v210 + 12, (EIGHTTOTEN(src_uyvy[9])) |
                             (EIGHTTOTEN(src_uyvy[10]) << 10) |
                             (EIGHTTOTEN(src_uyvy[11]) << 20));
    src_uyvy += 12;
    dst_v210 += 16;
  }
}
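// EIGHTTOTEN widens 8 bits to 10 by shifting left two and replicating the
// two high bits into the two new low bits, so the extremes are preserved
// exactly. For example:
//   EIGHTTOTEN(0x00) == 0x000
//   EIGHTTOTEN(0x80) == (0x200 | 0x02) == 0x202
//   EIGHTTOTEN(0xFF) == (0x3FC | 0x03) == 0x3FF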
int I422ToYUY2(const uint8* src_y, int src_stride_y,
               const uint8* src_u, int src_stride_u,
               const uint8* src_v, int src_stride_v,
@@ -143,17 +311,16 @@ int I422ToYUY2(const uint8* src_y, int src_stride_y,
  }
  void (*I42xToYUY2Row)(const uint8* src_y, const uint8* src_u,
                        const uint8* src_v, uint8* dst_frame, int width);
+  I42xToYUY2Row = I42xToYUY2Row_C;
#if defined(HAS_I42XTOYUY2ROW_SSE2)
  if (TestCpuFlag(kCpuHasSSE2) &&
      IS_ALIGNED(width, 16) &&
      IS_ALIGNED(src_y, 16) && IS_ALIGNED(src_stride_y, 16) &&
      IS_ALIGNED(dst_frame, 16) && IS_ALIGNED(dst_stride_frame, 16)) {
    I42xToYUY2Row = I42xToYUY2Row_SSE2;
-  } else
-#endif
-  {
-    I42xToYUY2Row = I42xToYUY2Row_C;
  }
+#endif
  for (int y = 0; y < height; ++y) {
    I42xToYUY2Row(src_y, src_u, src_v, dst_frame, width);
    src_y += src_stride_y;
@@ -180,17 +347,16 @@ int I420ToYUY2(const uint8* src_y, int src_stride_y,
  }
  void (*I42xToYUY2Row)(const uint8* src_y, const uint8* src_u,
                        const uint8* src_v, uint8* dst_frame, int width);
+  I42xToYUY2Row = I42xToYUY2Row_C;
#if defined(HAS_I42XTOYUY2ROW_SSE2)
  if (TestCpuFlag(kCpuHasSSE2) &&
      IS_ALIGNED(width, 16) &&
      IS_ALIGNED(src_y, 16) && IS_ALIGNED(src_stride_y, 16) &&
      IS_ALIGNED(dst_frame, 16) && IS_ALIGNED(dst_stride_frame, 16)) {
    I42xToYUY2Row = I42xToYUY2Row_SSE2;
-  } else
-#endif
-  {
-    I42xToYUY2Row = I42xToYUY2Row_C;
  }
+#endif
  for (int y = 0; y < height - 1; y += 2) {
    I42xToYUY2Row(src_y, src_u, src_v, dst_frame, width);
    I42xToYUY2Row(src_y + src_stride_y, src_u, src_v,
@@ -206,6 +372,42 @@ int I420ToYUY2(const uint8* src_y, int src_stride_y,
  return 0;
}
int I422ToUYVY(const uint8* src_y, int src_stride_y,
               const uint8* src_u, int src_stride_u,
               const uint8* src_v, int src_stride_v,
               uint8* dst_frame, int dst_stride_frame,
               int width, int height) {
  if (src_y == NULL || src_u == NULL || src_v == NULL || dst_frame == NULL) {
    return -1;
  }
  // Negative height means invert the image.
  if (height < 0) {
    height = -height;
    dst_frame = dst_frame + (height - 1) * dst_stride_frame;
    dst_stride_frame = -dst_stride_frame;
  }
  void (*I42xToUYVYRow)(const uint8* src_y, const uint8* src_u,
                        const uint8* src_v, uint8* dst_frame, int width);
  I42xToUYVYRow = I42xToUYVYRow_C;
#if defined(HAS_I42XTOUYVYROW_SSE2)
  if (TestCpuFlag(kCpuHasSSE2) &&
      IS_ALIGNED(width, 16) &&
      IS_ALIGNED(src_y, 16) && IS_ALIGNED(src_stride_y, 16) &&
      IS_ALIGNED(dst_frame, 16) && IS_ALIGNED(dst_stride_frame, 16)) {
    I42xToUYVYRow = I42xToUYVYRow_SSE2;
  }
#endif
  for (int y = 0; y < height; ++y) {
    I42xToUYVYRow(src_y, src_u, src_v, dst_frame, width);
    src_y += src_stride_y;
    src_u += src_stride_u;
    src_v += src_stride_v;
    dst_frame += dst_stride_frame;
  }
  return 0;
}
int I420ToUYVY(const uint8* src_y, int src_stride_y,
               const uint8* src_u, int src_stride_u,
               const uint8* src_v, int src_stride_v,
@@ -214,107 +416,85 @@ int I420ToUYVY(const uint8* src_y, int src_stride_y,
  if (src_y == NULL || src_u == NULL || src_v == NULL || dst_frame == NULL) {
    return -1;
  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    dst_frame = dst_frame + (height - 1) * dst_stride_frame;
+    dst_stride_frame = -dst_stride_frame;
+  }
+  void (*I42xToUYVYRow)(const uint8* src_y, const uint8* src_u,
+                        const uint8* src_v, uint8* dst_frame, int width);
+  I42xToUYVYRow = I42xToUYVYRow_C;
+#if defined(HAS_I42XTOUYVYROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2) &&
+      IS_ALIGNED(width, 16) &&
+      IS_ALIGNED(src_y, 16) && IS_ALIGNED(src_stride_y, 16) &&
+      IS_ALIGNED(dst_frame, 16) && IS_ALIGNED(dst_stride_frame, 16)) {
+    I42xToUYVYRow = I42xToUYVYRow_SSE2;
+  }
+#endif
-  int i = 0;
-  const uint8* y1 = src_y;
-  const uint8* y2 = y1 + src_stride_y;
-  const uint8* u = src_u;
-  const uint8* v = src_v;
-  uint8* out1 = dst_frame;
-  uint8* out2 = dst_frame + dst_stride_frame;
-  // Macro-pixel = 2 image pixels
-  // U0Y0V0Y1....U2Y2V2Y3...U4Y4V4Y5.....
-#ifndef SCALEOPT
-  for (; i < ((height + 1) >> 1); i++) {
-    for (int j = 0; j < ((width + 1) >> 1); j++) {
-      out1[0] = *u;
-      out1[1] = y1[0];
-      out1[2] = *v;
-      out1[3] = y1[1];
-      out2[0] = *u;
-      out2[1] = y2[0];
-      out2[2] = *v;
-      out2[3] = y2[1];
-      out1 += 4;
-      out2 += 4;
-      u++;
-      v++;
-      y1 += 2;
-      y2 += 2;
-    }
-    y1 += 2 * src_stride_y - width;
-    y2 += 2 * src_stride_y - width;
-    u += src_stride_u - ((width + 1) >> 1);
-    v += src_stride_v - ((width + 1) >> 1);
-    out1 += 2 * (dst_stride_frame - width);
-    out2 += 2 * (dst_stride_frame - width);
-  }
-#else
-  for (; i < (height >> 1); i++) {
-    int32 width__ = (width >> 4);
-    _asm {
-      ;pusha
-      mov eax, DWORD PTR [in1]            ;1939.33
-      mov ecx, DWORD PTR [in2]            ;1939.33
-      mov ebx, DWORD PTR [src_u]          ;1939.33
-      mov edx, DWORD PTR [src_v]          ;1939.33
-    loop0:
-      movq xmm6, QWORD PTR [ebx]          ;src_u
-      movq xmm0, QWORD PTR [edx]          ;src_v
-      punpcklbw xmm6, xmm0                ;src_u, src_v mix
-      movdqa xmm1, xmm6
-      movdqa xmm2, xmm6
-      movdqa xmm4, xmm6
-      movdqu xmm3, XMMWORD PTR [eax]      ;in1
-      punpcklbw xmm1, xmm3                ;src_u, in1, src_v
-      mov esi, DWORD PTR [out1]
-      movdqu XMMWORD PTR [esi], xmm1      ;write to out1
-      movdqu xmm5, XMMWORD PTR [ecx]      ;in2
-      punpcklbw xmm2, xmm5                ;src_u, in2, src_v
-      mov edi, DWORD PTR [out2]
-      movdqu XMMWORD PTR [edi], xmm2      ;write to out2
-      punpckhbw xmm4, xmm3                ;src_u, in1, src_v again
-      movdqu XMMWORD PTR [esi+16], xmm4   ;write to out1 again
-      add esi, 32
-      mov DWORD PTR [out1], esi
-      punpckhbw xmm6, xmm5                ;src_u, in2, src_v again
-      movdqu XMMWORD PTR [edi+16], xmm6   ;write to out2 again
-      add edi, 32
-      mov DWORD PTR [out2], edi
-      add ebx, 8
-      add edx, 8
-      add eax, 16
-      add ecx, 16
-      mov esi, DWORD PTR [width__]
-      sub esi, 1
-      mov DWORD PTR [width__], esi
-      jg loop0
-      mov DWORD PTR [in1], eax            ;1939.33
-      mov DWORD PTR [in2], ecx            ;1939.33
-      mov DWORD PTR [src_u], ebx          ;1939.33
-      mov DWORD PTR [src_v], edx          ;1939.33
-      ;popa
-      emms
-    }
-    in1 += width;
-    in2 += width;
-    out1 += 2 * (dst_stride_frame - width);
-    out2 += 2 * (dst_stride_frame - width);
-  }
-#endif
+  for (int y = 0; y < height - 1; y += 2) {
+    I42xToUYVYRow(src_y, src_u, src_v, dst_frame, width);
+    I42xToUYVYRow(src_y + src_stride_y, src_u, src_v,
+                  dst_frame + dst_stride_frame, width);
+    src_y += src_stride_y * 2;
+    src_u += src_stride_u;
+    src_v += src_stride_v;
+    dst_frame += dst_stride_frame * 2;
+  }
+  if (height & 1) {
+    I42xToUYVYRow(src_y, src_u, src_v, dst_frame, width);
+  }
+  return 0;
+}
+
+int I420ToV210(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
+               uint8* dst_frame, int dst_stride_frame,
+               int width, int height) {
+  if (width * 16 / 6 > kMaxStride ||  // row buffer of V210 is required
+      src_y == NULL || src_u == NULL || src_v == NULL || dst_frame == NULL) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    dst_frame = dst_frame + (height - 1) * dst_stride_frame;
+    dst_stride_frame = -dst_stride_frame;
+  }
+  SIMD_ALIGNED(uint8 row[kMaxStride]);
+
+  void (*UYVYToV210Row)(const uint8* src_uyvy, uint8* dst_v210, int pix);
+  UYVYToV210Row = UYVYToV210Row_C;
+
+  void (*I42xToUYVYRow)(const uint8* src_y, const uint8* src_u,
+                        const uint8* src_v, uint8* dst_frame, int width);
+  I42xToUYVYRow = I42xToUYVYRow_C;
+#if defined(HAS_I42XTOUYVYROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2) &&
+      IS_ALIGNED(width, 16) &&
+      IS_ALIGNED(src_y, 16) && IS_ALIGNED(src_stride_y, 16)) {
+    I42xToUYVYRow = I42xToUYVYRow_SSE2;
+  }
+#endif
+  for (int y = 0; y < height - 1; y += 2) {
+    I42xToUYVYRow(src_y, src_u, src_v, row, width);
+    UYVYToV210Row(row, dst_frame, width);
+    I42xToUYVYRow(src_y + src_stride_y, src_u, src_v, row, width);
+    UYVYToV210Row(row, dst_frame + dst_stride_frame, width);
+    src_y += src_stride_y * 2;
+    src_u += src_stride_u;
+    src_v += src_stride_v;
+    dst_frame += dst_stride_frame * 2;
+  }
+  if (height & 1) {
+    I42xToUYVYRow(src_y, src_u, src_v, row, width);
+    UYVYToV210Row(row, dst_frame, width);
+  }
  return 0;
}
@@ -467,56 +647,6 @@ int UYVYToI420(const uint8* src_uyvy, int src_stride_uyvy,
  return 0;
}
-// gcc provided macros
-#if defined(__BYTE_ORDER) && defined(__LITTLE_ENDIAN) && defined(__BIG_ENDIAN)
-#if __BYTE_ORDER == __LITTLE_ENDIAN
-#define LIBYUV_LITTLE_ENDIAN
-#endif
-// Visual C for x86 defines these
-#elif defined(_M_X64) || defined(_M_IX86)
-#define LIBYUV_LITTLE_ENDIAN
-#endif
-#ifdef LIBYUV_LITTLE_ENDIAN
-#define READWORD(p) (*((uint32*) (p)))
-#else
-uint32 READWORD(const uint8* p) {
-  return (uint32) p[0] |
-         ((uint32) (p[1]) << 8) |
-         ((uint32) (p[2]) << 16) |
-         ((uint32) (p[3]) << 24);
-}
-#endif
-// Must be multiple of 6 pixels. Will over convert to handle remainder.
-// https://developer.apple.com/quicktime/icefloe/dispatch019.html#v210
-void V210ToUYVYRow_C(const uint8* src_v210, uint8* dst_uyvy, int width) {
-  for (int x = 0; x < width; x += 6) {
-    uint32 w = READWORD(src_v210 + 0);
-    dst_uyvy[0] = (w >> 2) & 0xff;
-    dst_uyvy[1] = (w >> 12) & 0xff;
-    dst_uyvy[2] = (w >> 22) & 0xff;
-    w = READWORD(src_v210 + 4);
-    dst_uyvy[3] = (w >> 2) & 0xff;
-    dst_uyvy[4] = (w >> 12) & 0xff;
-    dst_uyvy[5] = (w >> 22) & 0xff;
-    w = READWORD(src_v210 + 8);
-    dst_uyvy[6] = (w >> 2) & 0xff;
-    dst_uyvy[7] = (w >> 12) & 0xff;
-    dst_uyvy[8] = (w >> 22) & 0xff;
-    w = READWORD(src_v210 + 12);
-    dst_uyvy[9] = (w >> 2) & 0xff;
-    dst_uyvy[10] = (w >> 12) & 0xff;
-    dst_uyvy[11] = (w >> 22) & 0xff;
-    dst_uyvy += 12;
-    src_v210 += 16;
-  }
-}
// Convert V210 to I420.
// V210 is a 10 bit version of UYVY. 16 bytes store 6 pixels.
// Width is a multiple of 48.
@@ -525,7 +655,7 @@ int V210ToI420(const uint8* src_v210, int src_stride_v210,
               uint8* dst_u, int dst_stride_u,
               uint8* dst_v, int dst_stride_v,
               int width, int height) {
-  if (width * 16 / 6 > kMaxStride) {  // row buffer is required
+  if (width * 2 * 2 > kMaxStride) {  // 2 rows of UYVY is required
    return -1;
  }
  // Negative height means invert the image.
@@ -1098,9 +1228,11 @@ int ConvertToI420(const uint8* sample, size_t sample_size,
                 dst_width, inv_dst_height);
      break;
    case FOURCC_V210:
-      // TODO(fbarchard): Confirm stride is 16 bytes per 6 pixels.
-      src = sample + (aligned_src_width * crop_y + crop_x) * 16 / 6;
-      V210ToI420(src, aligned_src_width * 16 / 6,
+      // stride is multiple of 48 pixels (128 bytes).
+      // pixels come in groups of 6 = 16 bytes
+      src = sample + (aligned_src_width + 47) / 48 * 128 * crop_y +
+            crop_x / 6 * 16;
+      V210ToI420(src, (aligned_src_width + 47) / 48 * 128,
                 y, y_stride,
                 u, u_stride,
                 v, v_stride,
......
@@ -49,6 +49,15 @@ int ConvertFromI420(const uint8* y, int y_stride,
                  dst_sample_stride ? dst_sample_stride : width * 2,
                  width, height);
      break;
    case FOURCC_V210:
      I420ToV210(y, y_stride,
                 u, u_stride,
                 v, v_stride,
                 dst_sample,
                 dst_sample_stride ? dst_sample_stride :
                     (width + 47) / 48 * 128,
                 width, height);
      break;
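      // The V210 stride rounds width up to whole 48-pixel blocks of 128
      // bytes each (8 groups of 6 pixels x 16 bytes per group). Worked
      // example with an assumed width of 1280:
      //   (1280 + 47) / 48 = 27 blocks; 27 * 128 = 3456 bytes per row,
      //   versus 1280 * 16 / 6 = 3413 bytes of unpadded payload.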
    case FOURCC_RGBP:
      I420ToRGB565(y, y_stride,
                   u, u_stride,
......