Commit 120d5e73 authored by mikhal@webrtc.org

libyuv: Updating planar functions

Review URL: http://webrtc-codereview.appspot.com/209002

git-svn-id: http://libyuv.googlecode.com/svn/trunk@12 16f28f9a-4ce2-e073-06de-1de4eb20be90
parent 3660f2e5
@@ -19,17 +19,8 @@ namespace libyuv {
class PlanarFunctions {
public:
// Copy I420 to I420.
static void I420Copy(const uint8* src_y, int src_pitch_y,
const uint8* src_u, int src_pitch_u,
const uint8* src_v, int src_pitch_v,
uint8* dst_y, int dst_pitch_y,
uint8* dst_u, int dst_pitch_u,
uint8* dst_v, int dst_pitch_v,
int width, int height);
// Convert I422 to I420. Used by MJPG.
static void I422ToI420(const uint8* src_y, int src_pitch_y,
const uint8* src_u, int src_pitch_u,
const uint8* src_v, int src_pitch_v,
uint8* dst_y, int dst_pitch_y,
@@ -37,24 +28,100 @@ class PlanarFunctions {
uint8* dst_v, int dst_pitch_v,
int width, int height);
// Convert I422 to I420. Used by MJPG.
static void I422ToI420(const uint8* src_y, int src_pitch_y,
const uint8* src_u, int src_pitch_u,
const uint8* src_v, int src_pitch_v,
uint8* dst_y, int dst_pitch_y,
uint8* dst_u, int dst_pitch_u,
uint8* dst_v, int dst_pitch_v,
int width, int height);
// Convert M420 to I420.
static void M420ToI420(const uint8* src_m420, int src_pitch_m420,
uint8* dst_y, int dst_pitch_y,
uint8* dst_u, int dst_pitch_u,
uint8* dst_v, int dst_pitch_v,
int width, int height);
// Convert Q420 to I420.
static void Q420ToI420(const uint8* src_y, int src_pitch_y,
const uint8* src_yuy2, int src_pitch_yuy2,
uint8* dst_y, int dst_pitch_y,
uint8* dst_u, int dst_pitch_u,
uint8* dst_v, int dst_pitch_v,
int width, int height);
// Convert NV12 to I420. Also used for NV21.
static void NV12ToI420(const uint8* src_y,
const uint8* src_uv, int src_pitch,
uint8* dst_y, int dst_pitch_y,
uint8* dst_u, int dst_pitch_u,
uint8* dst_v, int dst_pitch_v,
int width, int height);
// Convert YUY2 to I420.
static void YUY2ToI420(const uint8* src_yuy2, int src_pitch_yuy2,
uint8* dst_y, int dst_pitch_y,
uint8* dst_u, int dst_pitch_u,
uint8* dst_v, int dst_pitch_v,
int width, int height);
// Convert UYVY to I420.
static void UYVYToI420(const uint8* src_uyvy, int src_pitch_uyvy,
uint8* dst_y, int dst_pitch_y,
uint8* dst_u, int dst_pitch_u,
uint8* dst_v, int dst_pitch_v,
int width, int height);
// Convert I420 to ARGB.
static void I420ToARGB(const uint8* src_y, int src_pitch_y,
const uint8* src_u, int src_pitch_u,
const uint8* src_v, int src_pitch_v,
uint8* dst_argb, int dst_pitch_argb,
int width, int height);
// Convert I422 to ARGB.
static void I422ToARGB(const uint8* src_y, int src_pitch_y,
const uint8* src_u, int src_pitch_u,
const uint8* src_v, int src_pitch_v,
uint8* dst_argb, int dst_pitch_argb,
int width, int height);
// Convert I444 to ARGB.
static void I444ToARGB(const uint8* src_y, int src_pitch_y,
const uint8* src_u, int src_pitch_u,
const uint8* src_v, int src_pitch_v,
uint8* dst_argb, int dst_pitch_argb,
int width, int height);
// Convert I400 to ARGB.
static void I400ToARGB(const uint8* src_y, int src_pitch_y,
uint8* dst_argb, int dst_pitch_argb,
int width, int height);
// Convert I400 to ARGB.
static void I400ToARGB_Reference(const uint8* src_y, int src_pitch_y,
uint8* dst_argb, int dst_pitch_argb,
int width, int height);
// Convert RAW to ARGB.
static void RAWToARGB(const uint8* src_raw, int src_pitch_raw,
uint8* dst_argb, int dst_pitch_argb,
int width, int height);
// Convert BG24 to ARGB.
static void BG24ToARGB(const uint8* src_bg24, int src_pitch_bg24,
uint8* dst_argb, int dst_pitch_argb,
int width, int height);
// Convert ABGR to ARGB.
static void ABGRToARGB(const uint8* src_abgr, int src_pitch_abgr,
uint8* dst_argb, int dst_pitch_argb,
int width, int height);
DISALLOW_IMPLICIT_CONSTRUCTORS(PlanarFunctions);
};
} // namespace libyuv
...
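For orientation, here is a hypothetical caller-side sketch (not part of the commit) showing how the conversion entry points declared above are meant to be used; the buffer sizes, the helper name, and the planar_functions.h include path are assumptions for illustration only.

// Hypothetical usage sketch, not from this commit: convert one YUY2 frame to
// I420 with the class declared above.  The header name, the uint8 typedef it
// provides, and the tightly packed buffer layout are assumptions.
#include <vector>
#include "planar_functions.h"  // assumed header for libyuv::PlanarFunctions

void ConvertYuy2FrameToI420(const uint8* src_yuy2, int width, int height) {
  int halfwidth = (width + 1) / 2;
  int halfheight = (height + 1) / 2;
  std::vector<uint8> y(width * height);
  std::vector<uint8> u(halfwidth * halfheight);
  std::vector<uint8> v(halfwidth * halfheight);
  // Pitches are in bytes: YUY2 packs 2 bytes per pixel; the I420 planes here
  // have no padding, so each pitch equals the plane width.
  libyuv::PlanarFunctions::YUY2ToI420(src_yuy2, width * 2,
                                      &y[0], width,
                                      &u[0], halfwidth,
                                      &v[0], halfwidth,
                                      width, height);
}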
@@ -14,6 +14,7 @@
#include <string.h>
#include "cpu_id.h"
#include "row.h"
namespace libyuv {
@@ -37,49 +38,93 @@ static void SplitUV_NEON(const uint8* src_uv,
);
}
#elif (defined(WIN32) || defined(__i386__)) && !defined(COVERAGE_ENABLED) && \
!defined(__PIC__) && !TARGET_IPHONE_SIMULATOR
#if defined(_MSC_VER)
#define TALIGN16(t, var) static __declspec(align(16)) t _ ## var
#elif defined(OSX)
#define TALIGN16(t, var) t var __attribute__((aligned(16)))
#else
#define TALIGN16(t, var) t _ ## var __attribute__((aligned(16)))
#endif
// Shuffle constant that gathers the even bytes into the low 8 bytes and the odd bytes into the high 8 bytes.
extern "C" TALIGN16(const uint8, shufevenodd[16]) =
{ 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 };
#if defined(WIN32) && !defined(COVERAGE_ENABLED)
#define HAS_SPLITUV_SSE2
__declspec(naked)
static void SplitUV_SSE2(const uint8* src_uv,
uint8* dst_u, uint8* dst_v, int pix) {
__asm {
push edi
mov eax, [esp + 4 + 4] // src_uv
mov edx, [esp + 4 + 8] // dst_u
mov edi, [esp + 4 + 12] // dst_v
mov ecx, [esp + 4 + 16] // pix
pcmpeqb xmm7, xmm7 // generate mask 0x00ff00ff
psrlw xmm7, 8
wloop:
movdqa xmm0, [eax]
movdqa xmm1, [eax + 16]
lea eax, [eax + 32]
movdqa xmm2, xmm0
movdqa xmm3, xmm1
pand xmm0, xmm7 // even bytes
pand xmm1, xmm7
packuswb xmm0, xmm1
movdqa [edx], xmm0
lea edx, [edx + 16]
psrlw xmm2, 8 // odd bytes
psrlw xmm3, 8
packuswb xmm2, xmm3
movdqa [edi], xmm2
lea edi, [edi + 16]
sub ecx, 16
ja wloop
pop edi
ret
}
}
#define HAS_SPLITUV_SSSE3
__declspec(naked)
static void SplitUV_SSSE3(const uint8* src_uv,
uint8* dst_u, uint8* dst_v, int pix) {
__asm {
push edi
mov eax, [esp + 4 + 4] // src_uv
mov edx, [esp + 4 + 8] // dst_u
mov edi, [esp + 4 + 12] // dst_v
mov ecx, [esp + 4 + 16] // pix
movdqa xmm7, _shufevenodd
wloop:
movdqa xmm0, [eax]
movdqa xmm1, [eax + 16]
lea eax, [eax + 32]
pshufb xmm0, xmm7 // 8 u's and 8 v's
pshufb xmm1, xmm7 // 8 u's and 8 v's
movdqa xmm2, xmm0
punpcklqdq xmm0, xmm1 // 16 u's
punpckhqdq xmm2, xmm1 // 16 v's
movdqa [edx], xmm0
lea edx, [edx + 16]
movdqa [edi], xmm2
lea edi, [edi + 16]
sub ecx, 16
ja wloop
pop edi
ret
}
}
#elif defined(__i386__) && !defined(COVERAGE_ENABLED) && \
!TARGET_IPHONE_SIMULATOR
// GCC version is same as Visual C
#define HAS_SPLITUV_SSE2
extern "C" void SplitUV_SSE2(const uint8* src_uv,
uint8* dst_u, uint8* dst_v, int pix);
asm(
".text\n"
#if defined(OSX)
@@ -89,42 +134,75 @@ extern "C" void SplitUV_SSE2(const uint8* src_uv,
".global SplitUV_SSE2\n"
"SplitUV_SSE2:\n"
#endif
"push %ebp\n"
"mov %esp,%ebp\n"
"push %esi\n"
"push %edi\n" "push %edi\n"
"mov 0x8(%ebp),%esi\n" "mov 0x8(%esp),%eax\n"
"mov 0xc(%ebp),%edi\n" "mov 0xc(%esp),%edx\n"
"mov 0x10(%ebp),%edx\n" "mov 0x10(%esp),%edi\n"
"mov 0x14(%ebp),%ecx\n" "mov 0x14(%esp),%ecx\n"
"mov $0xff00ff,%eax\n" "pcmpeqb %xmm7,%xmm7\n"
"movd %eax,%xmm7\n" "psrlw $0x8,%xmm7\n"
"pshufd $0x0,%xmm7,%xmm7\n"
"1:" "1:"
"movdqa (%esi),%xmm0\n" "movdqa (%eax),%xmm0\n"
"movdqa 0x10(%esi),%xmm1\n" "movdqa 0x10(%eax),%xmm1\n"
"lea 0x20(%esi),%esi\n" "lea 0x20(%eax),%eax\n"
"movdqa %xmm0,%xmm2\n" "movdqa %xmm0,%xmm2\n"
"movdqa %xmm1,%xmm3\n" "movdqa %xmm1,%xmm3\n"
"pand %xmm7,%xmm0\n" "pand %xmm7,%xmm0\n"
"pand %xmm7,%xmm1\n" "pand %xmm7,%xmm1\n"
"packuswb %xmm1,%xmm0\n" "packuswb %xmm1,%xmm0\n"
"movdqa %xmm0,(%edi)\n" "movdqa %xmm0,(%edx)\n"
"lea 0x10(%edi),%edi\n" "lea 0x10(%edx),%edx\n"
"psrlw $0x8,%xmm2\n" "psrlw $0x8,%xmm2\n"
"psrlw $0x8,%xmm3\n" "psrlw $0x8,%xmm3\n"
"packuswb %xmm3,%xmm2\n" "packuswb %xmm3,%xmm2\n"
"movdqa %xmm2,(%edx)\n" "movdqa %xmm2,(%edi)\n"
"lea 0x10(%edi),%edi\n"
"sub $0x10,%ecx\n"
"ja 1b\n"
"pop %edi\n"
"ret\n"
);
#define HAS_SPLITUV_SSSE3
extern "C" void SplitUV_SSSE3(const uint8* src_uv,
uint8* dst_u, uint8* dst_v, int pix);
asm(
".text\n"
#if defined(OSX)
".globl _SplitUV_SSSE3\n"
"_SplitUV_SSSE3:\n"
#else
".global SplitUV_SSSE3\n"
"SplitUV_SSSE3:\n"
#endif
"push %edi\n"
"mov 0x8(%esp),%eax\n"
"mov 0xc(%esp),%edx\n"
"mov 0x10(%esp),%edi\n"
"mov 0x14(%esp),%ecx\n"
"movdqa _shufevenodd,%xmm7\n"
"1:"
"movdqa (%eax),%xmm0\n"
"movdqa 0x10(%eax),%xmm1\n"
"lea 0x20(%eax),%eax\n"
"pshufb %xmm7,%xmm0\n"
"pshufb %xmm7,%xmm1\n"
"movdqa %xmm0,%xmm2\n"
"punpcklqdq %xmm1,%xmm0\n"
"punpckhqdq %xmm1,%xmm2\n"
"movdqa %xmm0,(%edx)\n"
"lea 0x10(%edx),%edx\n" "lea 0x10(%edx),%edx\n"
"movdqa %xmm2,(%edi)\n"
"lea 0x10(%edi),%edi\n"
"sub $0x10,%ecx\n" "sub $0x10,%ecx\n"
"ja 1b\n" "ja 1b\n"
"pop %edi\n" "pop %edi\n"
"pop %esi\n"
"pop %ebp\n"
"ret\n" "ret\n"
);
#endif
#endif
static void SplitUV_C(const uint8* src_uv,
uint8* dst_u, uint8* dst_v, int pix) {
@@ -163,70 +241,6 @@ static void I420CopyPlane2(const uint8* src, int src_pitch_0, int src_pitch_1,
}
}
// Support converting from FOURCC_M420
// Useful for bandwidth constrained transports like USB 1.0 and 2.0 and for
// easy conversion to I420.
// M420 format description:
// M420 is row biplanar 420: 2 rows of Y and 1 row of VU.
// Chroma is half width / half height. (420)
// pitch_m420 is row planar. Normally this will be the width in pixels.
// The UV plane is half width, but 2 values, so pitch_m420 applies to this
// as well as the two Y planes.
// TODO(fbarchard): Do NV21/NV12 formats with this function
static void X420ToI420(uint8* dst_y, int dst_pitch_y,
uint8* dst_u, int dst_pitch_u,
uint8* dst_v, int dst_pitch_v,
const uint8* src_y,
int src_pitch_y0, int src_pitch_y1,
const uint8* src_uv, int src_pitch_uv,
int width, int height) {
// Negative height means invert the image.
if (height < 0) {
height = -height;
dst_y = dst_y + (height - 1) * dst_pitch_y;
dst_u = dst_u + (height - 1) * dst_pitch_u;
dst_v = dst_v + (height - 1) * dst_pitch_v;
dst_pitch_y = -dst_pitch_y;
dst_pitch_u = -dst_pitch_u;
dst_pitch_v = -dst_pitch_v;
}
int halfwidth = (width + 1) >> 1;
void (*SplitUV)(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix);
#if defined(HAS_SPLITUV_NEON)
if (libyuv::CpuInfo::TestCpuFlag(libyuv::CpuInfo::kCpuHasNEON) &&
(halfwidth % 16 == 0) &&
IS_ALIGNED(src_uv, 16) && (src_pitch_uv % 16 == 0) &&
IS_ALIGNED(dst_u, 16) && (dst_pitch_u % 16 == 0) &&
IS_ALIGNED(dst_v, 16) && (dst_pitch_v % 16 == 0)) {
SplitUV = SplitUV_NEON;
} else
#elif defined(HAS_SPLITUV_SSE2)
if (libyuv::CpuInfo::TestCpuFlag(libyuv::CpuInfo::kCpuHasSSE2) &&
(halfwidth % 16 == 0) &&
IS_ALIGNED(src_uv, 16) && (src_pitch_uv % 16 == 0) &&
IS_ALIGNED(dst_u, 16) && (dst_pitch_u % 16 == 0) &&
IS_ALIGNED(dst_v, 16) && (dst_pitch_v % 16 == 0)) {
SplitUV = SplitUV_SSE2;
} else
#endif
{
SplitUV = SplitUV_C;
}
I420CopyPlane2(src_y, src_pitch_y0, src_pitch_y1, dst_y, dst_pitch_y,
width, height);
int halfheight = (height + 1) >> 1;
for (int y = 0; y < halfheight; ++y) {
// Copy a row of UV.
SplitUV(src_uv, dst_u, dst_v, halfwidth);
dst_u += dst_pitch_u;
dst_v += dst_pitch_v;
src_uv += src_pitch_uv;
}
}
// TODO(fbarchard): For biplanar formats (ie NV21), the Y plane is the same
// as I420, and only the chroma plane varies. Copy the Y plane by reference,
// and just convert the UV. This method can be used for NV21, NV12, I420,
@@ -312,30 +326,914 @@ void PlanarFunctions::I422ToI420(const uint8* src_y, int src_pitch_y,
}
}
// Support converting from FOURCC_M420
// Useful for bandwidth constrained transports like USB 1.0 and 2.0 and for
// easy conversion to I420.
// M420 format description:
// M420 is row biplanar 420: 2 rows of Y and 1 row of VU.
// Chroma is half width / half height. (420)
// src_pitch_m420 is row planar. Normally this will be the width in pixels.
// The UV plane is half width, but 2 values, so src_pitch_m420 applies to this
// as well as the two Y planes.
// TODO(fbarchard): Do NV21/NV12 formats with this function
static void X420ToI420(const uint8* src_y,
int src_pitch_y0, int src_pitch_y1,
const uint8* src_uv, int src_pitch_uv,
uint8* dst_y, int dst_pitch_y,
uint8* dst_u, int dst_pitch_u,
uint8* dst_v, int dst_pitch_v,
int width, int height) {
// Negative height means invert the image.
if (height < 0) {
height = -height;
dst_y = dst_y + (height - 1) * dst_pitch_y;
dst_u = dst_u + (height - 1) * dst_pitch_u;
dst_v = dst_v + (height - 1) * dst_pitch_v;
dst_pitch_y = -dst_pitch_y;
dst_pitch_u = -dst_pitch_u;
dst_pitch_v = -dst_pitch_v;
}
int halfwidth = (width + 1) >> 1;
void (*SplitUV)(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix);
#if defined(HAS_SPLITUV_NEON)
if (talk_base::CpuInfo::TestCpuFlag(talk_base::CpuInfo::kCpuHasNEON) &&
(halfwidth % 16 == 0) &&
IS_ALIGNED(src_uv, 16) && (src_pitch_uv % 16 == 0) &&
IS_ALIGNED(dst_u, 16) && (dst_pitch_u % 16 == 0) &&
IS_ALIGNED(dst_v, 16) && (dst_pitch_v % 16 == 0)) {
SplitUV = SplitUV_NEON;
} else
#elif defined(HAS_SPLITUV_SSSE3)
if (talk_base::CpuInfo::TestCpuFlag(talk_base::CpuInfo::kCpuHasSSSE3) &&
(halfwidth % 16 == 0) &&
IS_ALIGNED(src_uv, 16) && (src_pitch_uv % 16 == 0) &&
IS_ALIGNED(dst_u, 16) && (dst_pitch_u % 16 == 0) &&
IS_ALIGNED(dst_v, 16) && (dst_pitch_v % 16 == 0)) {
SplitUV = SplitUV_SSSE3;
} else
#elif defined(HAS_SPLITUV_SSE2)
if (talk_base::CpuInfo::TestCpuFlag(talk_base::CpuInfo::kCpuHasSSE2) &&
(halfwidth % 16 == 0) &&
IS_ALIGNED(src_uv, 16) && (src_pitch_uv % 16 == 0) &&
IS_ALIGNED(dst_u, 16) && (dst_pitch_u % 16 == 0) &&
IS_ALIGNED(dst_v, 16) && (dst_pitch_v % 16 == 0)) {
SplitUV = SplitUV_SSE2;
} else
#endif
{
SplitUV = SplitUV_C;
}
I420CopyPlane2(src_y, src_pitch_y0, src_pitch_y1, dst_y, dst_pitch_y,
width, height);
int halfheight = (height + 1) >> 1;
for (int y = 0; y < halfheight; ++y) {
// Copy a row of UV.
SplitUV(src_uv, dst_u, dst_v, halfwidth);
dst_u += dst_pitch_u;
dst_v += dst_pitch_v;
src_uv += src_pitch_uv;
}
}
// Convert M420 to I420.
void PlanarFunctions::M420ToI420(const uint8* src_m420, int src_pitch_m420,
uint8* dst_y, int dst_pitch_y,
uint8* dst_u, int dst_pitch_u,
uint8* dst_v, int dst_pitch_v,
int width, int height) {
X420ToI420(src_m420, src_pitch_m420, src_pitch_m420 * 2,
src_m420 + src_pitch_m420 * 2, src_pitch_m420 * 3,
dst_y, dst_pitch_y, dst_u, dst_pitch_u, dst_v, dst_pitch_v,
width, height);
}
// Convert NV12 to I420.
void PlanarFunctions::NV12ToI420(const uint8* src_y,
const uint8* src_uv,
int src_pitch,
uint8* dst_y, int dst_pitch_y,
uint8* dst_u, int dst_pitch_u,
uint8* dst_v, int dst_pitch_v,
int width, int height) {
X420ToI420(src_y, src_pitch, src_pitch,
src_uv, src_pitch,
dst_y, dst_pitch_y, dst_u, dst_pitch_u, dst_v, dst_pitch_v,
width, height);
}
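The header notes that NV12ToI420 is also used for NV21. A hypothetical caller-side sketch (not part of the commit) of one way that reuse can work: NV21 interleaves the chroma bytes as VU instead of UV, so the destination chroma planes are simply swapped.

// Hypothetical sketch, not from this commit: converting NV21 by reusing
// NV12ToI420.  SplitUV writes the even interleaved bytes to the first chroma
// destination and the odd bytes to the second, so for VU-ordered input the
// first destination must be the V plane.
void NV21ToI420(const uint8* src_y, const uint8* src_vu, int src_pitch,
                uint8* dst_y, int dst_pitch_y,
                uint8* dst_u, int dst_pitch_u,
                uint8* dst_v, int dst_pitch_v,
                int width, int height) {
  libyuv::PlanarFunctions::NV12ToI420(src_y, src_vu, src_pitch,
                                      dst_y, dst_pitch_y,
                                      dst_v, dst_pitch_v,  // swapped
                                      dst_u, dst_pitch_u,  // swapped
                                      width, height);
}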
#if defined(WIN32) && !defined(COVERAGE_ENABLED)
#define HAS_SPLITYUY2_SSE2
__declspec(naked)
static void SplitYUY2_SSE2(const uint8* src_yuy2,
uint8* dst_y, uint8* dst_u, uint8* dst_v, int pix) {
__asm {
push esi
push edi
mov eax, [esp + 8 + 4] // src_yuy2
mov edx, [esp + 8 + 8] // dst_y
mov esi, [esp + 8 + 12] // dst_u
mov edi, [esp + 8 + 16] // dst_v
mov ecx, [esp + 8 + 20] // pix
pcmpeqb xmm7, xmm7 // generate mask 0x00ff00ff
psrlw xmm7, 8
wloop:
movdqa xmm0, [eax]
movdqa xmm1, [eax + 16]
lea eax, [eax + 32]
movdqa xmm2, xmm0
movdqa xmm3, xmm1
pand xmm2, xmm7 // even bytes are Y
pand xmm3, xmm7
packuswb xmm2, xmm3
movdqa [edx], xmm2
lea edx, [edx + 16]
psrlw xmm0, 8 // YUYV -> UVUV
psrlw xmm1, 8
packuswb xmm0, xmm1
movdqa xmm1, xmm0
pand xmm0, xmm7 // U
packuswb xmm0, xmm0
movq qword ptr [esi], xmm0
lea esi, [esi + 8]
psrlw xmm1, 8 // V
packuswb xmm1, xmm1
movq qword ptr [edi], xmm1
lea edi, [edi + 8]
sub ecx, 16
ja wloop
pop edi
pop esi
ret
}
}
#elif defined(__i386__) && !defined(COVERAGE_ENABLED) && \
!TARGET_IPHONE_SIMULATOR
#define HAS_SPLITYUY2_SSE2
extern "C" void SplitYUY2_SSE2(const uint8* src_yuy2, uint8* dst_y,
uint8* dst_u, uint8* dst_v, int pix);
asm(
".text\n"
#if defined(OSX)
".globl _SplitYUY2_SSE2\n"
"_SplitYUY2_SSE2:\n"
#else
".global SplitYUY2_SSE2\n"
"SplitYUY2_SSE2:\n"
#endif
"push %esi\n"
"push %edi\n"
"mov 0xc(%esp),%eax\n"
"mov 0x10(%esp),%edx\n"
"mov 0x14(%esp),%esi\n"
"mov 0x18(%esp),%edi\n"
"mov 0x1c(%esp),%ecx\n"
"pcmpeqb %xmm7,%xmm7\n"
"psrlw $0x8,%xmm7\n"
"1:"
"movdqa (%eax),%xmm0\n"
"movdqa 0x10(%eax),%xmm1\n"
"lea 0x20(%eax),%eax\n"
"movdqa %xmm0,%xmm2\n"
"movdqa %xmm1,%xmm3\n"
"pand %xmm7,%xmm2\n"
"pand %xmm7,%xmm3\n"
"packuswb %xmm3,%xmm2\n"
"movdqa %xmm2,(%edx)\n"
"lea 0x10(%edx),%edx\n"
"psrlw $0x8,%xmm0\n"
"psrlw $0x8,%xmm1\n"
"packuswb %xmm1,%xmm0\n"
"movdqa %xmm0,%xmm1\n"
"pand %xmm7,%xmm0\n"
"packuswb %xmm0,%xmm0\n"
"movq %xmm0,(%esi)\n"
"lea 0x8(%esi),%esi\n"
"psrlw $0x8,%xmm1\n"
"packuswb %xmm1,%xmm1\n"
"movq %xmm1,(%edi)\n"
"lea 0x8(%edi),%edi\n"
"sub $0x10,%ecx\n"
"ja 1b\n"
"pop %edi\n"
"pop %esi\n"
"ret\n"
);
#endif
static void SplitYUY2_C(const uint8* src_yuy2,
uint8* dst_y, uint8* dst_u, uint8* dst_v, int pix) {
// Copy a row of YUY2.
for (int x = 0; x < pix; x += 2) {
dst_y[0] = src_yuy2[0];
dst_y[1] = src_yuy2[2];
dst_u[0] = src_yuy2[1];
dst_v[0] = src_yuy2[3];
src_yuy2 += 4;
dst_y += 2;
dst_u += 1;
dst_v += 1;
}
}
// Convert Q420 to I420.
// Format is rows of YY/YUYV
void PlanarFunctions::Q420ToI420(const uint8* src_y, int src_pitch_y,
const uint8* src_yuy2, int src_pitch_yuy2,
uint8* dst_y, int dst_pitch_y,
uint8* dst_u, int dst_pitch_u,
uint8* dst_v, int dst_pitch_v,
int width, int height) {
void (*SplitYUY2)(const uint8* src_yuy2,
uint8* dst_y, uint8* dst_u, uint8* dst_v, int pix);
#if defined(HAS_SPLITYUY2_SSE2)
if (talk_base::CpuInfo::TestCpuFlag(talk_base::CpuInfo::kCpuHasSSE2) &&
(width % 16 == 0) &&
IS_ALIGNED(src_yuy2, 16) && (src_pitch_yuy2 % 16 == 0) &&
IS_ALIGNED(dst_y, 16) && (dst_pitch_y % 16 == 0) &&
IS_ALIGNED(dst_u, 8) && (dst_pitch_u % 8 == 0) &&
IS_ALIGNED(dst_v, 8) && (dst_pitch_v % 8 == 0)) {
SplitYUY2 = SplitYUY2_SSE2;
} else
#endif
{
SplitYUY2 = SplitYUY2_C;
}
for (int y = 0; y < height; y += 2) {
memcpy(dst_y, src_y, width);
dst_y += dst_pitch_y;
src_y += src_pitch_y;
// Copy a row of YUY2.
SplitYUY2(src_yuy2, dst_y, dst_u, dst_v, width);
dst_y += dst_pitch_y;
dst_u += dst_pitch_u;
dst_v += dst_pitch_v;
src_yuy2 += src_pitch_yuy2;
}
}
#if defined(WIN32) && !defined(COVERAGE_ENABLED)
#define HAS_YUY2TOI420ROW_SSE2
__declspec(naked)
void YUY2ToI420RowY_SSE2(const uint8* src_yuy2,
uint8* dst_y, int pix) {
__asm {
mov eax, [esp + 4] // src_yuy2
mov edx, [esp + 8] // dst_y
mov ecx, [esp + 12] // pix
pcmpeqb xmm7, xmm7 // generate mask 0x00ff00ff
psrlw xmm7, 8
wloop:
movdqa xmm0, [eax]
movdqa xmm1, [eax + 16]
lea eax, [eax + 32]
pand xmm0, xmm7 // even bytes are Y
pand xmm1, xmm7
packuswb xmm0, xmm1
movdqa [edx], xmm0
lea edx, [edx + 16]
sub ecx, 16
ja wloop
ret
}
}
__declspec(naked)
void YUY2ToI420RowUV_SSE2(const uint8* src_yuy2, int pitch_yuy2,
uint8* dst_u, uint8* dst_v, int pix) {
__asm {
push esi
push edi
mov eax, [esp + 8 + 4] // src_yuy2
mov esi, [esp + 8 + 8] // pitch_yuy2
mov edx, [esp + 8 + 12] // dst_u
mov edi, [esp + 8 + 16] // dst_v
mov ecx, [esp + 8 + 20] // pix
pcmpeqb xmm7, xmm7 // generate mask 0x00ff00ff
psrlw xmm7, 8
wloop:
movdqa xmm0, [eax]
movdqa xmm1, [eax + 16]
movdqa xmm2, [eax + esi]
movdqa xmm3, [eax + esi + 16]
lea eax, [eax + 32]
pavgb xmm0, xmm2
pavgb xmm1, xmm3
psrlw xmm0, 8 // YUYV -> UVUV
psrlw xmm1, 8
packuswb xmm0, xmm1
movdqa xmm1, xmm0
pand xmm0, xmm7 // U
packuswb xmm0, xmm0
movq qword ptr [edx], xmm0
lea edx, [edx + 8]
psrlw xmm1, 8 // V
packuswb xmm1, xmm1
movq qword ptr [edi], xmm1
lea edi, [edi + 8]
sub ecx, 16
ja wloop
pop edi
pop esi
ret
}
}
#define HAS_UYVYTOI420ROW_SSE2
__declspec(naked)
void UYVYToI420RowY_SSE2(const uint8* src_uyvy,
uint8* dst_y, int pix) {
__asm {
mov eax, [esp + 4] // src_uyvy
mov edx, [esp + 8] // dst_y
mov ecx, [esp + 12] // pix
wloop:
movdqa xmm0, [eax]
movdqa xmm1, [eax + 16]
lea eax, [eax + 32]
psrlw xmm0, 8 // odd bytes are Y
psrlw xmm1, 8
packuswb xmm0, xmm1
movdqa [edx], xmm0
lea edx, [edx + 16]
sub ecx, 16
ja wloop
ret
}
}
__declspec(naked)
void UYVYToI420RowUV_SSE2(const uint8* src_uyvy, int pitch_uyvy,
uint8* dst_u, uint8* dst_v, int pix) {
__asm {
push esi
push edi
mov eax, [esp + 8 + 4] // src_uyvy
mov esi, [esp + 8 + 8] // pitch_uyvy
mov edx, [esp + 8 + 12] // dst_u
mov edi, [esp + 8 + 16] // dst_v
mov ecx, [esp + 8 + 20] // pix
pcmpeqb xmm7, xmm7 // generate mask 0x00ff00ff
psrlw xmm7, 8
wloop:
movdqa xmm0, [eax]
movdqa xmm1, [eax + 16]
movdqa xmm2, [eax + esi]
movdqa xmm3, [eax + esi + 16]
lea eax, [eax + 32]
pavgb xmm0, xmm2
pavgb xmm1, xmm3
pand xmm0, xmm7 // UYVY -> UVUV
pand xmm1, xmm7
packuswb xmm0, xmm1
movdqa xmm1, xmm0
pand xmm0, xmm7 // U
packuswb xmm0, xmm0
movq qword ptr [edx], xmm0
lea edx, [edx + 8]
psrlw xmm1, 8 // V
packuswb xmm1, xmm1
movq qword ptr [edi], xmm1
lea edi, [edi + 8]
sub ecx, 16
ja wloop
pop edi
pop esi
ret
}
}
#elif defined(__i386__) && !defined(COVERAGE_ENABLED) && \
!TARGET_IPHONE_SIMULATOR
#define HAS_YUY2TOI420ROW_SSE2
extern "C" void YUY2ToI420RowY_SSE2(const uint8* src_yuy2,
uint8* dst_y, int pix);
asm(
".text\n"
#if defined(OSX)
".globl _YUY2ToI420RowY_SSE2\n"
"_YUY2ToI420RowY_SSE2:\n"
#else
".global YUY2ToI420RowY_SSE2\n"
"YUY2ToI420RowY_SSE2:\n"
#endif
"mov 0x4(%esp),%eax\n"
"mov 0x8(%esp),%edx\n"
"mov 0xc(%esp),%ecx\n"
"pcmpeqb %xmm7,%xmm7\n"
"psrlw $0x8,%xmm7\n"
"1:"
"movdqa (%eax),%xmm0\n"
"movdqa 0x10(%eax),%xmm1\n"
"lea 0x20(%eax),%eax\n"
"pand %xmm7,%xmm0\n"
"pand %xmm7,%xmm1\n"
"packuswb %xmm1,%xmm0\n"
"movdqa %xmm0,(%edx)\n"
"lea 0x10(%edx),%edx\n"
"sub $0x10,%ecx\n"
"ja 1b\n"
"ret\n"
);
extern "C" void YUY2ToI420RowUV_SSE2(const uint8* src_yuy2, int pitch_yuy2,
uint8* dst_u, uint8* dst_v, int pix);
asm(
".text\n"
#if defined(OSX)
".globl _YUY2ToI420RowUV_SSE2\n"
"_YUY2ToI420RowUV_SSE2:\n"
#else
".global YUY2ToI420RowUV_SSE2\n"
"YUY2ToI420RowUV_SSE2:\n"
#endif
"push %esi\n"
"push %edi\n"
"mov 0xc(%esp),%eax\n"
"mov 0x10(%esp),%esi\n"
"mov 0x14(%esp),%edx\n"
"mov 0x18(%esp),%edi\n"
"mov 0x1c(%esp),%ecx\n"
"pcmpeqb %xmm7,%xmm7\n"
"psrlw $0x8,%xmm7\n"
"1:"
"movdqa (%eax),%xmm0\n"
"movdqa 0x10(%eax),%xmm1\n"
"movdqa (%eax,%esi,1),%xmm2\n"
"movdqa 0x10(%eax,%esi,1),%xmm3\n"
"lea 0x20(%eax),%eax\n"
"pavgb %xmm2,%xmm0\n"
"pavgb %xmm3,%xmm1\n"
"psrlw $0x8,%xmm0\n"
"psrlw $0x8,%xmm1\n"
"packuswb %xmm1,%xmm0\n"
"movdqa %xmm0,%xmm1\n"
"pand %xmm7,%xmm0\n"
"packuswb %xmm0,%xmm0\n"
"movq %xmm0,(%edx)\n"
"lea 0x8(%edx),%edx\n"
"psrlw $0x8,%xmm1\n"
"packuswb %xmm1,%xmm1\n"
"movq %xmm1,(%edi)\n"
"lea 0x8(%edi),%edi\n"
"sub $0x10,%ecx\n"
"ja 1b\n"
"pop %edi\n"
"pop %esi\n"
"ret\n"
);
#define HAS_UYVYTOI420ROW_SSE2
extern "C" void UYVYToI420RowY_SSE2(const uint8* src_uyvy,
uint8* dst_y, int pix);
asm(
".text\n"
#if defined(OSX)
".globl _UYVYToI420RowY_SSE2\n"
"_UYVYToI420RowY_SSE2:\n"
#else
".global UYVYToI420RowY_SSE2\n"
"UYVYToI420RowY_SSE2:\n"
#endif
"mov 0x4(%esp),%eax\n"
"mov 0x8(%esp),%edx\n"
"mov 0xc(%esp),%ecx\n"
"1:"
"movdqa (%eax),%xmm0\n"
"movdqa 0x10(%eax),%xmm1\n"
"lea 0x20(%eax),%eax\n"
"psrlw $0x8,%xmm0\n"
"psrlw $0x8,%xmm1\n"
"packuswb %xmm1,%xmm0\n"
"movdqa %xmm0,(%edx)\n"
"lea 0x10(%edx),%edx\n"
"sub $0x10,%ecx\n"
"ja 1b\n"
"ret\n"
);
extern "C" void UYVYToI420RowUV_SSE2(const uint8* src_uyvy, int pitch_uyvy,
uint8* dst_u, uint8* dst_v, int pix);
asm(
".text\n"
#if defined(OSX)
".globl _UYVYToI420RowUV_SSE2\n"
"_UYVYToI420RowUV_SSE2:\n"
#else
".global UYVYToI420RowUV_SSE2\n"
"UYVYToI420RowUV_SSE2:\n"
#endif
"push %esi\n"
"push %edi\n"
"mov 0xc(%esp),%eax\n"
"mov 0x10(%esp),%esi\n"
"mov 0x14(%esp),%edx\n"
"mov 0x18(%esp),%edi\n"
"mov 0x1c(%esp),%ecx\n"
"pcmpeqb %xmm7,%xmm7\n"
"psrlw $0x8,%xmm7\n"
"1:"
"movdqa (%eax),%xmm0\n"
"movdqa 0x10(%eax),%xmm1\n"
"movdqa (%eax,%esi,1),%xmm2\n"
"movdqa 0x10(%eax,%esi,1),%xmm3\n"
"lea 0x20(%eax),%eax\n"
"pavgb %xmm2,%xmm0\n"
"pavgb %xmm3,%xmm1\n"
"pand %xmm7,%xmm0\n"
"pand %xmm7,%xmm1\n"
"packuswb %xmm1,%xmm0\n"
"movdqa %xmm0,%xmm1\n"
"pand %xmm7,%xmm0\n"
"packuswb %xmm0,%xmm0\n"
"movq %xmm0,(%edx)\n"
"lea 0x8(%edx),%edx\n"
"psrlw $0x8,%xmm1\n"
"packuswb %xmm1,%xmm1\n"
"movq %xmm1,(%edi)\n"
"lea 0x8(%edi),%edi\n"
"sub $0x10,%ecx\n"
"ja 1b\n"
"pop %edi\n"
"pop %esi\n"
"ret\n"
);
#endif
void YUY2ToI420RowUV_C(const uint8* src_yuy2, int src_pitch_yuy2,
uint8* dst_u, uint8* dst_v, int pix) {
// Copy a row of yuy2 UV values
for (int x = 0; x < pix; x += 2) {
dst_u[0] = (src_yuy2[1] + src_yuy2[src_pitch_yuy2 + 1] + 1) >> 1;
dst_v[0] = (src_yuy2[3] + src_yuy2[src_pitch_yuy2 + 3] + 1) >> 1;
src_yuy2 += 4;
dst_u += 1;
dst_v += 1;
}
}
void YUY2ToI420RowY_C(const uint8* src_yuy2,
uint8* dst_y, int pix) {
// Copy a row of yuy2 Y values
for (int x = 0; x < pix; ++x) {
dst_y[0] = src_yuy2[0];
src_yuy2 += 2;
dst_y += 1;
}
}
void UYVYToI420RowUV_C(const uint8* src_uyvy, int src_pitch_uyvy,
uint8* dst_u, uint8* dst_v, int pix) {
// Copy a row of uyvy UV values
for (int x = 0; x < pix; x += 2) {
dst_u[0] = (src_uyvy[0] + src_uyvy[src_pitch_uyvy + 0] + 1) >> 1;
dst_v[0] = (src_uyvy[2] + src_uyvy[src_pitch_uyvy + 2] + 1) >> 1;
src_uyvy += 4;
dst_u += 1;
dst_v += 1;
}
}
void UYVYToI420RowY_C(const uint8* src_uyvy,
uint8* dst_y, int pix) {
// Copy a row of uyvy Y values
for (int x = 0; x < pix; ++x) {
dst_y[0] = src_uyvy[1];
src_uyvy += 2;
dst_y += 1;
}
}
// Convert YUY2 to I420.
void PlanarFunctions::YUY2ToI420(const uint8* src_yuy2, int src_pitch_yuy2,
uint8* dst_y, int dst_pitch_y,
uint8* dst_u, int dst_pitch_u,
uint8* dst_v, int dst_pitch_v,
int width, int height) {
void (*YUY2ToI420RowUV)(const uint8* src_yuy2, int src_pitch_yuy2,
uint8* dst_u, uint8* dst_v, int pix);
void (*YUY2ToI420RowY)(const uint8* src_yuy2,
uint8* dst_y, int pix);
#if defined(HAS_YUY2TOI420ROW_SSE2)
if (talk_base::CpuInfo::TestCpuFlag(talk_base::CpuInfo::kCpuHasSSE2) &&
(width % 16 == 0) &&
IS_ALIGNED(src_yuy2, 16) && (src_pitch_yuy2 % 16 == 0) &&
IS_ALIGNED(dst_y, 16) && (dst_pitch_y % 16 == 0) &&
IS_ALIGNED(dst_u, 8) && (dst_pitch_u % 8 == 0) &&
IS_ALIGNED(dst_v, 8) && (dst_pitch_v % 8 == 0)) {
YUY2ToI420RowY = YUY2ToI420RowY_SSE2;
YUY2ToI420RowUV = YUY2ToI420RowUV_SSE2;
} else
#endif
{
YUY2ToI420RowY = YUY2ToI420RowY_C;
YUY2ToI420RowUV = YUY2ToI420RowUV_C;
}
for (int y = 0; y < height; ++y) {
if ((y & 1) == 0) {
if (y >= (height - 1)) { // Odd height: the last chroma row has no pair, so reuse the same row.
src_pitch_yuy2 = 0;
}
YUY2ToI420RowUV(src_yuy2, src_pitch_yuy2, dst_u, dst_v, width);
dst_u += dst_pitch_u;
dst_v += dst_pitch_v;
}
YUY2ToI420RowY(src_yuy2, dst_y, width);
dst_y += dst_pitch_y;
src_yuy2 += src_pitch_yuy2;
}
}
// Convert UYVY to I420.
void PlanarFunctions::UYVYToI420(const uint8* src_uyvy, int src_pitch_uyvy,
uint8* dst_y, int dst_pitch_y,
uint8* dst_u, int dst_pitch_u,
uint8* dst_v, int dst_pitch_v,
int width, int height) {
void (*UYVYToI420RowUV)(const uint8* src_uyvy, int src_pitch_uyvy,
uint8* dst_u, uint8* dst_v, int pix);
void (*UYVYToI420RowY)(const uint8* src_uyvy,
uint8* dst_y, int pix);
#if defined(HAS_UYVYTOI420ROW_SSE2)
if (talk_base::CpuInfo::TestCpuFlag(talk_base::CpuInfo::kCpuHasSSE2) &&
(width % 16 == 0) &&
IS_ALIGNED(src_uyvy, 16) && (src_pitch_uyvy % 16 == 0) &&
IS_ALIGNED(dst_y, 16) && (dst_pitch_y % 16 == 0) &&
IS_ALIGNED(dst_u, 8) && (dst_pitch_u % 8 == 0) &&
IS_ALIGNED(dst_v, 8) && (dst_pitch_v % 8 == 0)) {
UYVYToI420RowY = UYVYToI420RowY_SSE2;
UYVYToI420RowUV = UYVYToI420RowUV_SSE2;
} else
#endif
{
UYVYToI420RowY = UYVYToI420RowY_C;
UYVYToI420RowUV = UYVYToI420RowUV_C;
}
for (int y = 0; y < height; ++y) {
if ((y & 1) == 0) {
if (y >= (height - 1)) { // Odd height: the last chroma row has no pair, so reuse the same row.
src_pitch_uyvy = 0;
}
UYVYToI420RowUV(src_uyvy, src_pitch_uyvy, dst_u, dst_v, width);
dst_u += dst_pitch_u;
dst_v += dst_pitch_v;
}
UYVYToI420RowY(src_uyvy, dst_y, width);
dst_y += dst_pitch_y;
src_uyvy += src_pitch_uyvy;
}
}
// Convert I420 to ARGB.
// TODO(fbarchard): Add SSSE3 version and supply C version for fallback.
void PlanarFunctions::I420ToARGB(const uint8* src_y, int src_pitch_y,
const uint8* src_u, int src_pitch_u,
const uint8* src_v, int src_pitch_v,
uint8* dst_argb, int dst_pitch_argb,
int width, int height) {
for (int y = 0; y < height; ++y) {
FastConvertYUVToRGB32Row(src_y, src_u, src_v, dst_argb, width);
dst_argb += dst_pitch_argb;
src_y += src_pitch_y;
if (y & 1) {
src_u += src_pitch_u;
src_v += src_pitch_v;
}
}
// MMX used for FastConvertYUVToRGB32Row requires an emms instruction.
EMMS();
}
// Convert I422 to ARGB.
void PlanarFunctions::I422ToARGB(const uint8* src_y, int src_pitch_y,
const uint8* src_u, int src_pitch_u,
const uint8* src_v, int src_pitch_v,
uint8* dst_argb, int dst_pitch_argb,
int width, int height) {
for (int y = 0; y < height; ++y) {
FastConvertYUVToRGB32Row(src_y, src_u, src_v, dst_argb, width);
dst_argb += dst_pitch_argb;
src_y += src_pitch_y;
src_u += src_pitch_u;
src_v += src_pitch_v;
}
// MMX used for FastConvertYUVToRGB32Row requires an emms instruction.
EMMS();
}
// Convert I444 to ARGB.
void PlanarFunctions::I444ToARGB(const uint8* src_y, int src_pitch_y,
const uint8* src_u, int src_pitch_u,
const uint8* src_v, int src_pitch_v,
uint8* dst_argb, int dst_pitch_argb,
int width, int height) {
for (int y = 0; y < height; ++y) {
FastConvertYUV444ToRGB32Row(src_y, src_u, src_v, dst_argb, width);
dst_argb += dst_pitch_argb;
src_y += src_pitch_y;
src_u += src_pitch_u;
src_v += src_pitch_v;
}
// MMX used for FastConvertYUV444ToRGB32Row requires an emms instruction.
EMMS();
}
// Convert I400 to ARGB.
void PlanarFunctions::I400ToARGB_Reference(const uint8* src_y, int src_pitch_y,
uint8* dst_argb, int dst_pitch_argb,
int width, int height) {
for (int y = 0; y < height; ++y) {
FastConvertYToRGB32Row(src_y, dst_argb, width);
dst_argb += dst_pitch_argb;
src_y += src_pitch_y;
}
// MMX used for FastConvertYToRGB32Row requires an emms instruction.
EMMS();
}
// TODO(fbarchard): 64 bit version
#if defined(WIN32) && !defined(COVERAGE_ENABLED)
#define HAS_I400TOARGBROW_SSE2
__declspec(naked)
static void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
__asm {
mov eax, [esp + 4] // src_y
mov edx, [esp + 8] // dst_argb
mov ecx, [esp + 12] // pix
pcmpeqb xmm7, xmm7 // generate mask 0xff000000
pslld xmm7, 24
wloop:
movq xmm0, qword ptr [eax]
lea eax, [eax + 8]
punpcklbw xmm0, xmm0
movdqa xmm1, xmm0
punpcklwd xmm0, xmm0
punpckhwd xmm1, xmm1
por xmm0, xmm7
por xmm1, xmm7
movdqa [edx], xmm0
movdqa [edx + 16], xmm1
lea edx, [edx + 32]
sub ecx, 8
ja wloop
ret
}
}
#elif defined(__i386__) && !defined(COVERAGE_ENABLED) && \
!TARGET_IPHONE_SIMULATOR
#define HAS_I400TOARGBROW_SSE2
extern "C" void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb,
int pix);
asm(
".text\n"
#if defined(OSX)
".globl _I400ToARGBRow_SSE2\n"
"_I400ToARGBRow_SSE2:\n"
#else
".global I400ToARGBRow_SSE2\n"
"I400ToARGBRow_SSE2:\n"
#endif
"mov 0x4(%esp),%eax\n"
"mov 0x8(%esp),%edx\n"
"mov 0xc(%esp),%ecx\n"
"pcmpeqb %xmm7,%xmm7\n"
"pslld $0x18,%xmm7\n"
"1:"
"movq (%eax),%xmm0\n"
"lea 0x8(%eax),%eax\n"
"punpcklbw %xmm0,%xmm0\n"
"movdqa %xmm0,%xmm1\n"
"punpcklwd %xmm0,%xmm0\n"
"punpckhwd %xmm1,%xmm1\n"
"por %xmm7,%xmm0\n"
"por %xmm7,%xmm1\n"
"movdqa %xmm0,(%edx)\n"
"movdqa %xmm1,0x10(%edx)\n"
"lea 0x20(%edx),%edx\n"
"sub $0x8,%ecx\n"
"ja 1b\n"
"ret\n"
);
#endif
static void I400ToARGBRow_C(const uint8* src_y, uint8* dst_argb, int pix) {
// Expand a row of Y values into gray ARGB pixels with opaque alpha.
for (int x = 0; x < pix; ++x) {
dst_argb[2] = dst_argb[1] = dst_argb[0] = src_y[0];
dst_argb[3] = 255u;
dst_argb += 4;
src_y += 1;
}
}
// Convert I400 to ARGB.
void PlanarFunctions::I400ToARGB(const uint8* src_y, int src_pitch_y,
uint8* dst_argb, int dst_pitch_argb,
int width, int height) {
void (*I400ToARGBRow)(const uint8* src_y, uint8* dst_argb, int pix);
#if defined(HAS_I400TOARGBROW_SSE2)
if (talk_base::CpuInfo::TestCpuFlag(talk_base::CpuInfo::kCpuHasSSE2) &&
(width % 8 == 0) &&
IS_ALIGNED(src_y, 8) && (src_pitch_y % 8 == 0) &&
IS_ALIGNED(dst_argb, 16) && (dst_pitch_argb % 16 == 0)) {
I400ToARGBRow = I400ToARGBRow_SSE2;
} else
#endif
{
I400ToARGBRow = I400ToARGBRow_C;
}
for (int y = 0; y < height; ++y) {
I400ToARGBRow(src_y, dst_argb, width);
src_y += src_pitch_y;
dst_argb += dst_pitch_argb;
}
}
static void RAWToARGBRow_C(const uint8* src_raw, uint8* dst_argb, int pix) {
for (int x = 0; x < pix; ++x) {
dst_argb[0] = src_raw[2];
dst_argb[1] = src_raw[1];
dst_argb[2] = src_raw[0];
dst_argb[3] = 255u;
dst_argb += 4;
src_raw += 3;
}
}
// Convert RAW to ARGB.
void PlanarFunctions::RAWToARGB(const uint8* src_raw, int src_pitch_raw,
uint8* dst_argb, int dst_pitch_argb,
int width, int height) {
for (int y = 0; y < height; ++y) {
RAWToARGBRow_C(src_raw, dst_argb, width);
src_raw += src_pitch_raw;
dst_argb += dst_pitch_argb;
}
}
static void BG24ToARGBRow_C(const uint8* src_bg24, uint8* dst_argb, int pix) {
for (int x = 0; x < pix; ++x) {
dst_argb[0] = src_bg24[0];
dst_argb[1] = src_bg24[1];
dst_argb[2] = src_bg24[2];
dst_argb[3] = 255u;
dst_argb += 4;
src_bg24 += 3;
}
}
// Convert BG24 to ARGB.
void PlanarFunctions::BG24ToARGB(const uint8* src_bg24, int src_pitch_bg24,
uint8* dst_argb, int dst_pitch_argb,
int width, int height) {
for (int y = 0; y < height; ++y) {
BG24ToARGBRow_C(src_bg24, dst_argb, width);
src_bg24 += src_pitch_bg24;
dst_argb += dst_pitch_argb;
}
}
static void ABGRToARGBRow_C(const uint8* src_abgr, uint8* dst_argb, int pix) {
for (int x = 0; x < pix; ++x) {
dst_argb[0] = src_abgr[2];
dst_argb[1] = src_abgr[1];
dst_argb[2] = src_abgr[0];
dst_argb[3] = src_abgr[3];
dst_argb += 4;
src_abgr += 4;
}
}
// Convert ABGR to ARGB.
void PlanarFunctions::ABGRToARGB(const uint8* src_abgr, int src_pitch_abgr,
uint8* dst_argb, int dst_pitch_argb,
int width, int height) {
for (int y = 0; y < height; ++y) {
ABGRToARGBRow_C(src_abgr, dst_argb, width);
src_abgr += src_pitch_abgr;
dst_argb += dst_pitch_argb;
}
}
} // namespace libyuv
@@ -22,6 +22,16 @@ void FastConvertYUVToRGB32Row(const uint8* y_buf,
uint8* rgb_buf,
int width);
void FastConvertYUV444ToRGB32Row(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width);
void FastConvertYToRGB32Row(const uint8* y_buf,
uint8* rgb_buf,
int width);
#if defined(_MSC_VER)
#define SIMD_ALIGNED(var) __declspec(align(16)) var
#else
@@ -68,6 +78,7 @@ extern SIMD_ALIGNED(const int16 _kCoefficientsRgbY[768][4]);
#define EMMS()
#endif
} // extern "C"
#endif // LIBYUV_SOURCE_ROW_H_
@@ -55,6 +55,68 @@ void FastConvertYUVToRGB32Row(const uint8* y_buf, // rdi
);
}
void FastConvertYUV444ToRGB32Row(const uint8* y_buf, // rdi
const uint8* u_buf, // rsi
const uint8* v_buf, // rdx
uint8* rgb_buf, // rcx
int width) { // r8
asm(
"1:"
"movzb (%1),%%r10\n"
"lea 1(%1),%1\n"
"movzb (%2),%%r11\n"
"lea 1(%2),%2\n"
"movq 2048(%5,%%r10,8),%%xmm0\n"
"movzb (%0),%%r10\n"
"movq 4096(%5,%%r11,8),%%xmm1\n"
"paddsw %%xmm1,%%xmm0\n"
"movq (%5,%%r10,8),%%xmm2\n"
"lea 1(%0),%0\n"
"paddsw %%xmm0,%%xmm2\n"
"shufps $0x44,%%xmm2,%%xmm2\n"
"psraw $0x6,%%xmm2\n"
"packuswb %%xmm2,%%xmm2\n"
"movd %%xmm2,0x0(%3)\n"
"lea 4(%3),%3\n"
"sub $0x1,%4\n"
"ja 1b\n"
:
: "r"(y_buf), // %0
"r"(u_buf), // %1
"r"(v_buf), // %2
"r"(rgb_buf), // %3
"r"(width), // %4
"r" (_kCoefficientsRgbY) // %5
: "memory", "r10", "r11", "xmm0", "xmm1", "xmm2"
);
}
void FastConvertYToRGB32Row(const uint8* y_buf, // rdi
uint8* rgb_buf, // rcx
int width) { // r8
asm(
"1:"
"movzb (%0),%%r10\n"
"movzb 0x1(%0),%%r11\n"
"movq (%3,%%r10,8),%%xmm2\n"
"lea 2(%0),%0\n"
"movq (%3,%%r11,8),%%xmm3\n"
"shufps $0x44,%%xmm3,%%xmm2\n"
"psraw $0x6,%%xmm2\n"
"packuswb %%xmm2,%%xmm2\n"
"movq %%xmm2,0x0(%1)\n"
"lea 8(%1),%1\n"
"sub $0x2,%2\n"
"ja 1b\n"
:
: "r"(y_buf), // %0
"r"(rgb_buf), // %1
"r"(width), // %2
"r" (_kCoefficientsRgbY) // %3
: "memory", "r10", "r11", "xmm0", "xmm1", "xmm2", "xmm3"
);
}
#elif defined(__i386__)
// 32 bit gcc version
@@ -104,6 +166,81 @@ void FastConvertYUVToRGB32Row(const uint8* y_buf,
"ret\n"
);
void FastConvertYUV444ToRGB32Row(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width);
asm(
".text\n"
#if defined(OSX) || defined(IOS)
".globl _FastConvertYUV444ToRGB32Row\n"
"_FastConvertYUV444ToRGB32Row:\n"
#else
".global FastConvertYUV444ToRGB32Row\n"
"FastConvertYUV444ToRGB32Row:\n"
#endif
"pusha\n"
"mov 0x24(%esp),%edx\n"
"mov 0x28(%esp),%edi\n"
"mov 0x2c(%esp),%esi\n"
"mov 0x30(%esp),%ebp\n"
"mov 0x34(%esp),%ecx\n"
"1:"
"movzbl (%edi),%eax\n"
"lea 1(%edi),%edi\n"
"movzbl (%esi),%ebx\n"
"lea 1(%esi),%esi\n"
"movq _kCoefficientsRgbY+2048(,%eax,8),%mm0\n"
"movzbl (%edx),%eax\n"
"paddsw _kCoefficientsRgbY+4096(,%ebx,8),%mm0\n"
"lea 1(%edx),%edx\n"
"paddsw _kCoefficientsRgbY(,%eax,8),%mm0\n"
"psraw $0x6,%mm0\n"
"packuswb %mm0,%mm0\n"
"movd %mm0,0x0(%ebp)\n"
"lea 4(%ebp),%ebp\n"
"sub $0x1,%ecx\n"
"ja 1b\n"
"popa\n"
"ret\n"
);
void FastConvertYToRGB32Row(const uint8* y_buf,
uint8* rgb_buf,
int width);
asm(
".text\n"
#if defined(OSX) || defined(IOS)
".globl _FastConvertYToRGB32Row\n"
"_FastConvertYToRGB32Row:\n"
#else
".global FastConvertYToRGB32Row\n"
"FastConvertYToRGB32Row:\n"
#endif
"push %ebx\n"
"mov 0x8(%esp),%eax\n"
"mov 0xc(%esp),%edx\n"
"mov 0x10(%esp),%ecx\n"
"1:"
"movzbl (%eax),%ebx\n"
"movq _kCoefficientsRgbY(,%ebx,8),%mm0\n"
"psraw $0x6,%mm0\n"
"movzbl 0x1(%eax),%ebx\n"
"movq _kCoefficientsRgbY(,%ebx,8),%mm1\n"
"psraw $0x6,%mm1\n"
"packuswb %mm1,%mm0\n"
"lea 0x2(%eax),%eax\n"
"movq %mm0,(%edx)\n"
"lea 0x8(%edx),%edx\n"
"sub $0x2,%ecx\n"
"ja 1b\n"
"pop %ebx\n"
"ret\n"
);
#else
// C reference code that mimics the YUV assembly.
#define packuswb(x) ((x) < 0 ? 0 : ((x) > 255 ? 255 : (x)))
@@ -158,6 +295,30 @@ void FastConvertYUVToRGB32Row(const uint8* y_buf,
rgb_buf += 8; // Advance 2 pixels.
}
}
#endif
void FastConvertYUV444ToRGB32Row(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width) {
for (int x = 0; x < width; ++x) {
uint8 u = u_buf[x];
uint8 v = v_buf[x];
uint8 y = y_buf[x];
YuvPixel(y, u, v, rgb_buf);
rgb_buf += 4; // Advance 1 pixel.
}
}
void FastConvertYToRGB32Row(const uint8* y_buf,
uint8* rgb_buf,
int width) {
for (int x = 0; x < width; ++x) {
uint8 y = y_buf[x];
YuvPixel(y, 128, 128, rgb_buf);
rgb_buf += 4; // Advance 1 pixel.
}
}
#endif
} // extern "C" } // extern "C"
...@@ -16,14 +16,14 @@ extern "C" { ...@@ -16,14 +16,14 @@ extern "C" {
static_cast<int16>(1.164 * 64 * (i - 16) + 0.5), \ static_cast<int16>(1.164 * 64 * (i - 16) + 0.5), \
static_cast<int16>(1.164 * 64 * (i - 16) + 0.5), \ static_cast<int16>(1.164 * 64 * (i - 16) + 0.5), \
static_cast<int16>(1.164 * 64 * (i - 16) + 0.5), \ static_cast<int16>(1.164 * 64 * (i - 16) + 0.5), \
0 \ static_cast<int16>(256 * 64 - 1) \
} }
#define RGBU(i) { \ #define RGBU(i) { \
static_cast<int16>(2.018 * 64 * (i - 128) + 0.5), \ static_cast<int16>(2.018 * 64 * (i - 128) + 0.5), \
static_cast<int16>(-0.391 * 64 * (i - 128) + 0.5), \ static_cast<int16>(-0.391 * 64 * (i - 128) + 0.5), \
0, \ 0, \
static_cast<int16>(256 * 64 - 1) \ 0 \
} }
#define RGBV(i) { \ #define RGBV(i) { \
......
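A hypothetical sketch (mine, not part of the commit) of the fixed-point math behind these tables: each 4-lane int16 row is summed per lane, shifted right by 6, and saturated to a byte, exactly as the psraw/packuswb sequences above do. Moving the 256 * 64 - 1 bias from the RGBU rows into the RGBY rows keeps alpha opaque even when only the Y rows are read, as in the new FastConvertYToRGB32Row.

// Hypothetical illustration, not from this commit: how one output pixel is
// built from the 768x4 int16 table declared in row.h (_kCoefficientsRgbY).
// Layout inferred from the asm above: rows 0-255 hold Y terms, rows 256-511
// U terms, rows 512-767 V terms; lane 3 of the Y rows carries 256*64-1.
#include <stdint.h>

extern const int16_t _kCoefficientsRgbY[768][4];

static inline uint8_t ShiftAndSaturate(int v) {  // psraw $6 then packuswb
  v >>= 6;
  return v < 0 ? 0 : (v > 255 ? 255 : static_cast<uint8_t>(v));
}

static void TableYuvPixel(uint8_t y, uint8_t u, uint8_t v, uint8_t out[4]) {
  for (int lane = 0; lane < 4; ++lane) {
    int sum = _kCoefficientsRgbY[y][lane] +        // Y terms
              _kCoefficientsRgbY[256 + u][lane] +  // U terms
              _kCoefficientsRgbY[512 + v][lane];   // V terms
    // Assuming the U and V rows keep 0 in lane 3, the Y-row bias makes
    // lane 3 saturate to 255, i.e. opaque alpha.
    out[lane] = ShiftAndSaturate(sum);
  }
}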