ARGB to and from I420 ported to x64

BUG=none TEST=media_unittests Review URL: http://webrtc-codereview.appspot.com/266003 git-svn-id: http://libyuv.googlecode.com/svn/trunk@61 16f28f9a-4ce2-e073-06de-1de4eb20be90

ARGB to and from I420 ported to x64
BUG=none TEST=media_unittests Review URL: http://webrtc-codereview.appspot.com/266003 git-svn-id: http://libyuv.googlecode.com/svn/trunk@61 16f28f9a-4ce2-e073-06de-1de4eb20be90
b6149763 · fbarchard@google.com · 755de365 · b6149763 · b6149763 · b6149763
Commit b6149763 authored Nov 07, 2011 by fbarchard@google.com
12 changed files
--- a/include/libyuv/basic_types.h
+++ b/include/libyuv/basic_types.h
@@ -19,15 +19,6 @@

 #ifndef INT_TYPES_DEFINED
 #define INT_TYPES_DEFINED
-#ifdef COMPILER_MSVC
-typedef __int64 int64;
-#else
-typedef long long int64;
-#endif /* COMPILER_MSVC */
-typedef int int32;
-typedef short int16;
-typedef char int8;
-
 #ifdef COMPILER_MSVC
 typedef unsigned __int64 uint64;
 typedef __int64 int64;
@@ -38,7 +29,18 @@ typedef __int64 int64;
 #define UINT64_C(x) x ## UI64
 #endif
 #define INT64_F "I64"
-#else
+#else  // COMPILER_MSVC
+#ifdef __LP64__
+typedef unsigned long uint64;
+typedef long int64;
+#ifndef INT64_C
+#define INT64_C(x) x ## L
+#endif
+#ifndef UINT64_C
+#define UINT64_C(x) x ## UL
+#endif
+#define INT64_F "l"
+#else  // __LP64__
 typedef unsigned long long uint64;
 typedef long long int64;
 #ifndef INT64_C
@@ -48,10 +50,14 @@ typedef long long int64;
 #define UINT64_C(x) x ## ULL
 #endif
 #define INT64_F "ll"
-#endif /* COMPILER_MSVC */
+#endif  // __LP64__
+#endif  // COMPILER_MSVC
 typedef unsigned int uint32;
+typedef int int32;
 typedef unsigned short uint16;
+typedef short int16;
 typedef unsigned char uint8;
+typedef char int8;
 #endif  // INT_TYPES_DEFINED

 // Detect compiler is for x86 or x64.

--- a/include/libyuv/convert.h
+++ b/include/libyuv/convert.h
@@ -13,6 +13,7 @@
 #define INCLUDE_LIBYUV_CONVERT_H_

 #include "libyuv/basic_types.h"
+#include "libyuv/rotate.h"

 namespace libyuv {

@@ -92,6 +93,17 @@ int NV12ToRGB565(const uint8* src_y, int src_stride_y,
                 uint8* dst_frame, int dst_stride_frame,
                 int width, int height);

-} //  namespace libyuv
+// Convert camera sample to I420 with cropping, rotation and vertical flip.
+int ConvertToI420(const uint8* src_frame, size_t src_size,  // Source sample.
+                  uint8* dst_y, int dst_stride_y,  // Destination Y plane.
+                  uint8* dst_u, int dst_stride_u,  // Destination U plane.
+                  uint8* dst_v, int dst_stride_v,  // Destination V plane.
+                  int horiz_crop, int vert_crop,  // Crop origin in pixels.
+                  int w, int h,  // Source width and height.
+                  int dw, int idh,  // Width/height given to the converter.
+                  RotationMode rotation,  // Used by NV12/NV21/I420/YV12 only.
+                  uint32 format);  // FOURCC; unknown or MJPG returns -1.
+
+}  // namespace libyuv

 #endif // INCLUDE_LIBYUV_CONVERT_H_
--- a/include/libyuv/cpu_id.h
+++ b/include/libyuv/cpu_id.h
@@ -27,7 +27,9 @@ static const int kCpuInitialized = 8;
 bool TestCpuFlag(int flag);

 // For testing, allow CPU flags to be disabled.
-// ie MaskCpuFlags(~kCpuHasSSSE3) to disable SSSE3.  -1 to enable all.
+// e.g. MaskCpuFlags(~kCpuHasSSSE3) to disable SSSE3.
+// -1 to enable all cpu specific optimizations.
+// 0 to disable all cpu specific optimizations.
 void MaskCpuFlags(int enable_flags);

 }  // namespace libyuv

--- a/source/convert.cc
+++ b/source/convert.cc
@@ -13,7 +13,11 @@
 #include "conversion_tables.h"
 #include "libyuv/basic_types.h"
 #include "libyuv/cpu_id.h"
+#include "libyuv/format_conversion.h"
+#include "libyuv/planar_functions.h"
+#include "libyuv/rotate.h"
 #include "row.h"
+#include "video_common.h"

 //#define SCALEOPT //Currently for windows only. June 2010

@@ -650,7 +654,7 @@ int ARGBToI420(const uint8* src_frame, int src_stride_frame,
  void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
                      uint8* dst_u, uint8* dst_v, int width);
 #if defined(HAS_ARGBTOYROW_SSSE3)
-  if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) &&
+  if (TestCpuFlag(kCpuHasSSSE3) &&
      (width % 16 == 0) &&
      IS_ALIGNED(src_frame, 16) && (src_stride_frame % 16 == 0) &&
      IS_ALIGNED(dst_y, 16) && (dst_stride_y % 16 == 0)) {
@@ -661,7 +665,7 @@ int ARGBToI420(const uint8* src_frame, int src_stride_frame,
    ARGBToYRow = ARGBToYRow_C;
  }
 #if defined(HAS_ARGBTOUVROW_SSSE3)
-  if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) &&
+  if (TestCpuFlag(kCpuHasSSSE3) &&
      (width % 16 == 0) &&
      IS_ALIGNED(src_frame, 16) && (src_stride_frame % 16 == 0) &&
      IS_ALIGNED(dst_u, 8) && (dst_stride_u % 8 == 0) &&
@@ -703,7 +707,7 @@ int BGRAToI420(const uint8* src_frame, int src_stride_frame,
  void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
                      uint8* dst_u, uint8* dst_v, int width);
 #if defined(HAS_BGRATOYROW_SSSE3)
-  if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) &&
+  if (TestCpuFlag(kCpuHasSSSE3) &&
      (width % 16 == 0) &&
      IS_ALIGNED(src_frame, 16) && (src_stride_frame % 16 == 0) &&
      IS_ALIGNED(dst_y, 16) && (dst_stride_y % 16 == 0)) {
@@ -714,7 +718,7 @@ int BGRAToI420(const uint8* src_frame, int src_stride_frame,
    ARGBToYRow = BGRAToYRow_C;
  }
 #if defined(HAS_BGRATOUVROW_SSSE3)
-  if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) &&
+  if (TestCpuFlag(kCpuHasSSSE3) &&
      (width % 16 == 0) &&
      IS_ALIGNED(src_frame, 16) && (src_stride_frame % 16 == 0) &&
      IS_ALIGNED(dst_u, 8) && (dst_stride_u % 8 == 0) &&
@@ -756,7 +760,7 @@ int ABGRToI420(const uint8* src_frame, int src_stride_frame,
  void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
                      uint8* dst_u, uint8* dst_v, int width);
 #if defined(HAS_ABGRTOYROW_SSSE3)
-  if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) &&
+  if (TestCpuFlag(kCpuHasSSSE3) &&
      (width % 16 == 0) &&
      IS_ALIGNED(src_frame, 16) && (src_stride_frame % 16 == 0) &&
      IS_ALIGNED(dst_y, 16) && (dst_stride_y % 16 == 0)) {
@@ -767,7 +771,7 @@ int ABGRToI420(const uint8* src_frame, int src_stride_frame,
    ARGBToYRow = ABGRToYRow_C;
  }
 #if defined(HAS_ABGRTOUVROW_SSSE3)
-  if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) &&
+  if (TestCpuFlag(kCpuHasSSSE3) &&
      (width % 16 == 0) &&
      IS_ALIGNED(src_frame, 16) && (src_stride_frame % 16 == 0) &&
      IS_ALIGNED(dst_u, 8) && (dst_stride_u % 8 == 0) &&
@@ -809,7 +813,7 @@ int RGB24ToI420(const uint8* src_frame, int src_stride_frame,
  void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
                      uint8* dst_u, uint8* dst_v, int width);
 #if defined(HAS_RGB24TOYROW_SSSE3)
-  if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) &&
+  if (TestCpuFlag(kCpuHasSSSE3) &&
      (width % 16 == 0) &&
      IS_ALIGNED(src_frame, 16) && (src_stride_frame % 16 == 0) &&
      IS_ALIGNED(dst_y, 16) && (dst_stride_y % 16 == 0)) {
@@ -820,7 +824,7 @@ int RGB24ToI420(const uint8* src_frame, int src_stride_frame,
    ARGBToYRow = RGB24ToYRow_C;
  }
 #if defined(HAS_RGB24TOUVROW_SSSE3)
-  if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) &&
+  if (TestCpuFlag(kCpuHasSSSE3) &&
      (width % 16 == 0) &&
      IS_ALIGNED(src_frame, 16) && (src_stride_frame % 16 == 0) &&
      IS_ALIGNED(dst_u, 8) && (dst_stride_u % 8 == 0) &&
@@ -862,7 +866,7 @@ int RAWToI420(const uint8* src_frame, int src_stride_frame,
  void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
                      uint8* dst_u, uint8* dst_v, int width);
 #if defined(HAS_RAWTOYROW_SSSE3)
-  if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) &&
+  if (TestCpuFlag(kCpuHasSSSE3) &&
      (width % 16 == 0) &&
      IS_ALIGNED(src_frame, 16) && (src_stride_frame % 16 == 0) &&
      IS_ALIGNED(dst_y, 16) && (dst_stride_y % 16 == 0)) {
@@ -873,7 +877,7 @@ int RAWToI420(const uint8* src_frame, int src_stride_frame,
    ARGBToYRow = RAWToYRow_C;
  }
 #if defined(HAS_RAWTOUVROW_SSSE3)
-  if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) &&
+  if (TestCpuFlag(kCpuHasSSSE3) &&
      (width % 16 == 0) &&
      IS_ALIGNED(src_frame, 16) && (src_stride_frame % 16 == 0) &&
      IS_ALIGNED(dst_u, 8) && (dst_stride_u % 8 == 0) &&
@@ -901,4 +905,163 @@ int RAWToI420(const uint8* src_frame, int src_stride_frame,
  return 0;
 }

+// Convert a camera sample to I420 with cropping, rotation and vertical flip.
+//   sample, sample_size: source buffer; sample_size is not read here.
+//   y, u, v and their strides: destination I420 planes.
+//   horiz_crop, vert_crop: source pixel offset at which conversion starts.
+//   w, h: source dimensions.  Source strides are derived from w; aw is w
+//     rounded up to even (used for YUY2/UYVY rows and the NV12/NV21 UV plane).
+//   dw, idh: width and height handed to the per-format converter.
+//     NOTE(review): idh is presumably negative to request a vertical flip -
+//     confirm against the converters.
+//   rotation: forwarded only for NV12, NV21, I420 and YV12; all other formats
+//     ignore it.
+//   format: FOURCC of the sample (see video_common.h).
+// Returns 0 if the format is handled, -1 for MJPG or an unknown FOURCC.  The
+// return values of the per-format converters are not checked.
+int ConvertToI420(const uint8* sample, size_t sample_size,
+                  uint8* y, int y_stride,
+                  uint8* u, int u_stride,
+                  uint8* v, int v_stride,
+                  int horiz_crop, int vert_crop,
+                  int w, int h,
+                  int dw, int idh,
+                  RotationMode rotation,
+                  uint32 format) {
+  int aw = (w + 1) & ~1;
+  const uint8* src;
+  const uint8* src_uv;
+  int abs_h = (h < 0) ? -h : h;
+  switch (format) {
+    // Single plane formats
+    // src is moved to the crop origin: (pixels per row * vert_crop +
+    // horiz_crop) * bytes per pixel.
+    case FOURCC_YUY2:
+      src = sample + (aw * vert_crop + horiz_crop) * 2;
+      YUY2ToI420(src, aw * 2,
+                 y, y_stride, u, u_stride, v, v_stride,
+                 dw, idh);
+      break;
+    case FOURCC_UYVY:
+      src = sample + (aw * vert_crop + horiz_crop) * 2;
+      UYVYToI420(src, aw * 2,
+                 y, y_stride, u, u_stride, v, v_stride,
+                 dw, idh);
+      break;
+    case FOURCC_24BG:
+      src = sample + (w * vert_crop + horiz_crop) * 3;
+      RGB24ToI420(src, w * 3,
+                  y, y_stride, u, u_stride, v, v_stride,
+                  dw, idh);
+      break;
+    case FOURCC_RAW:
+      src = sample + (w * vert_crop + horiz_crop) * 3;
+      RAWToI420(src, w * 3,
+                y, y_stride, u, u_stride, v, v_stride,
+                dw, idh);
+      break;
+    case FOURCC_ARGB:
+      src = sample + (w * vert_crop + horiz_crop) * 4;
+      ARGBToI420(src, w * 4,
+                 y, y_stride, u, u_stride, v, v_stride,
+                 dw, idh);
+      break;
+    case FOURCC_BGRA:
+      src = sample + (w * vert_crop + horiz_crop) * 4;
+      BGRAToI420(src, w * 4,
+                 y, y_stride, u, u_stride, v, v_stride,
+                 dw, idh);
+      break;
+    case FOURCC_ABGR:
+      src = sample + (w * vert_crop + horiz_crop) * 4;
+      ABGRToI420(src, w * 4,
+                 y, y_stride, u, u_stride, v, v_stride,
+                 dw, idh);
+      break;
+    case FOURCC_BGGR:
+    case FOURCC_RGGB:
+    case FOURCC_GRBG:
+    case FOURCC_GBRG:
+      // TODO(fbarchard): We could support cropping by odd numbers by
+      // adjusting fourcc.
+      src = sample + (w * vert_crop + horiz_crop);
+      BayerRGBToI420(src, w, format,
+                     y, y_stride, u, u_stride, v, v_stride,
+                     dw, idh);
+      break;
+    // Biplanar formats
+    case FOURCC_M420:
+      // Rows are skipped at 12 bits (1.5 bytes) per pixel: w * vert_crop * 12 / 8.
+      src = sample + (w * vert_crop) * 12 / 8 + horiz_crop;
+      M420ToI420(src, w,
+                 y, y_stride, u, u_stride, v, v_stride,
+                 dw, idh);
+      break;
+    case FOURCC_NV12:
+      src = sample + (w * vert_crop + horiz_crop);
+      src_uv = sample + aw * (h + vert_crop / 2) + horiz_crop;
+      // NOTE(review): the UV plane offset uses h, not abs_h as the I420 case
+      // does; confirm this is intended when h is negative.
+      NV12ToI420Rotate(src, w,
+                       src_uv, aw,
+                       y, y_stride, u, u_stride, v, v_stride,
+                       dw, idh, rotation);
+      break;
+    case FOURCC_NV21:
+      src = sample + (w * vert_crop + horiz_crop);
+      src_uv = sample + aw * (h + vert_crop / 2) + horiz_crop;
+      // Call NV12 but with u and v parameters swapped: the destination V
+      // plane is passed where NV12 expects U and vice versa, so each chroma
+      // component lands in its own plane.
+      NV12ToI420Rotate(src, w,
+                       src_uv, aw,
+                       y, y_stride,
+                       v, v_stride,
+                       u, u_stride,
+                       dw, idh, rotation);
+      break;
+    case FOURCC_Q420:
+      src = sample + (w + aw * 2) * vert_crop + horiz_crop;
+      src_uv = sample + (w + aw * 2) * vert_crop + w + horiz_crop * 2;
+      Q420ToI420(src, w * 3,
+                 src_uv, w * 3,
+                 y, y_stride, u, u_stride, v, v_stride,
+                 dw, idh);
+      break;
+    // Triplanar formats
+    // I420 and YV12 differ only in which of the two chroma planes following
+    // the Y plane is read as U and which as V.
+    case FOURCC_I420:
+    case FOURCC_YV12: {
+      const uint8* src_y = sample + (w * vert_crop + horiz_crop);
+      const uint8* src_u;
+      const uint8* src_v;
+      int halfwidth = (w + 1) / 2;
+      int halfheight = (abs_h + 1) / 2;
+      if (format == FOURCC_I420) {
+        src_u = sample + w * abs_h +
+            (halfwidth * vert_crop + horiz_crop) / 2;
+        src_v = sample + w * abs_h +
+            halfwidth * (halfheight + vert_crop / 2) + horiz_crop / 2;
+      } else {
+        src_v = sample + w * abs_h +
+            (halfwidth * vert_crop + horiz_crop) / 2;
+        src_u = sample + w * abs_h +
+            halfwidth * (halfheight + vert_crop / 2) + horiz_crop / 2;
+      }
+      I420Rotate(src_y, w,
+                 src_u, halfwidth,
+                 src_v, halfwidth,
+                 y, y_stride, u, u_stride, v, v_stride,
+                 dw, idh, rotation);
+      break;
+    }
+    // Formats not supported
+    case FOURCC_MJPG:
+    default:
+      return -1;  // unknown fourcc - return failure code.
+  }
+  return 0;
+}
+
 } // namespace libyuv
--- a/source/cpu_id.cc
+++ b/source/cpu_id.cc
@@ -14,11 +14,14 @@
 #ifdef _MSC_VER
 #include <intrin.h>
 #endif
+#ifdef __ANDROID__
+#include <cpu-features.h>
+#endif

 // TODO(fbarchard): Use cpuid.h when gcc 4.4 is used on OSX and Linux.
 #if (defined(__pic__) || defined(__APPLE__)) && defined(__i386__)
 static inline void __cpuid(int cpu_info[4], int info_type) {
-  __asm__ volatile (
+  asm volatile (
    "mov %%ebx, %%edi\n"
    "cpuid\n"
    "xchg %%edi, %%ebx\n"
@@ -28,7 +31,7 @@ static inline void __cpuid(int cpu_info[4], int info_type) {
 }
 #elif defined(__i386__) || defined(__x86_64__)
 static inline void __cpuid(int cpu_info[4], int info_type) {
-  __asm__ volatile (
+  asm volatile (
    "cpuid\n"
    : "=a"(cpu_info[0]), "=b"(cpu_info[1]), "=c"(cpu_info[2]), "=d"(cpu_info[3])
    : "a"(info_type)
@@ -49,6 +52,10 @@ static void InitCpuFlags() {
  cpu_info_ = (cpu_info[3] & 0x04000000 ? kCpuHasSSE2 : 0) |
              (cpu_info[2] & 0x00000200 ? kCpuHasSSSE3 : 0) |
              kCpuInitialized;
+#elif defined(__ANDROID__) && defined(__ARM_NEON__)
+  uint64_t features = android_getCpuFeatures();  // Queried at runtime.
+  cpu_info_ = ((features & ANDROID_CPU_ARM_FEATURE_NEON) ? kCpuHasNEON : 0) |
+              kCpuInitialized;
 #elif defined(__ARM_NEON__)
  // gcc -mfpu=neon defines __ARM_NEON__
  // Enable Neon if you want support for Neon and Arm, and use MaskCpuFlags
@@ -61,14 +68,14 @@ static void InitCpuFlags() {

 void MaskCpuFlags(int enable_flags) {
  InitCpuFlags();
-  cpu_info_ &= enable_flags;
+  cpu_info_ = (cpu_info_ & enable_flags) | kCpuInitialized;
 }

 bool TestCpuFlag(int flag) {
  if (0 == cpu_info_) {
    InitCpuFlags();
  }
-  return cpu_info_ & flag ? true : false;
+  return (cpu_info_ & flag) ? true : false;
 }

 }  // namespace libyuv
--- a/source/format_conversion.cc
+++ b/source/format_conversion.cc
@@ -14,8 +14,6 @@
 #include "video_common.h"
 #include "row.h"

-#define kMaxStride (2048 * 4)
-
 namespace libyuv {

 // Note: to do this with Neon vld4.8 would load ARGB values into 4 registers
@@ -168,7 +166,7 @@ static void BayerRowBG(const uint8* src_bayer0, int src_stride_bayer,
  const uint8* src_bayer1 = src_bayer0 + src_stride_bayer;
  uint8 g = src_bayer0[1];
  uint8 r = src_bayer1[1];
-  for (int x = 0; x < (pix - 2); x += 2) {
+  for (int x = 0; x < pix - 3; x += 2) {
    dst_rgb[0] = src_bayer0[0];
    dst_rgb[1] = AVG(g, src_bayer0[1]);
    dst_rgb[2] = AVG(r, src_bayer1[1]);
@@ -187,10 +185,12 @@ static void BayerRowBG(const uint8* src_bayer0, int src_stride_bayer,
  dst_rgb[1] = AVG(g, src_bayer0[1]);
  dst_rgb[2] = AVG(r, src_bayer1[1]);
  dst_rgb[3] = 255U;
-  dst_rgb[4] = src_bayer0[0];
-  dst_rgb[5] = src_bayer0[1];
-  dst_rgb[6] = src_bayer1[1];
-  dst_rgb[7] = 255U;
+  if (pix & 1) {
+    dst_rgb[4] = src_bayer0[0];
+    dst_rgb[5] = src_bayer0[1];
+    dst_rgb[6] = src_bayer1[1];
+    dst_rgb[7] = 255U;
+  }
 }

 static void BayerRowRG(const uint8* src_bayer0, int src_stride_bayer,
@@ -198,7 +198,7 @@ static void BayerRowRG(const uint8* src_bayer0, int src_stride_bayer,
  const uint8* src_bayer1 = src_bayer0 + src_stride_bayer;
  uint8 g = src_bayer0[1];
  uint8 b = src_bayer1[1];
-  for (int x = 0; x < (pix - 2); x += 2) {
+  for (int x = 0; x < pix - 3; x += 2) {
    dst_rgb[0] = AVG(b, src_bayer1[1]);
    dst_rgb[1] = AVG(g, src_bayer0[1]);
    dst_rgb[2] = src_bayer0[0];
@@ -217,17 +217,19 @@ static void BayerRowRG(const uint8* src_bayer0, int src_stride_bayer,
  dst_rgb[1] = AVG(g, src_bayer0[1]);
  dst_rgb[2] = src_bayer0[0];
  dst_rgb[3] = 255U;
-  dst_rgb[4] = src_bayer1[1];
-  dst_rgb[5] = src_bayer0[1];
-  dst_rgb[6] = src_bayer0[0];
-  dst_rgb[7] = 255U;
+  if (pix & 1) {
+    dst_rgb[4] = src_bayer1[1];
+    dst_rgb[5] = src_bayer0[1];
+    dst_rgb[6] = src_bayer0[0];
+    dst_rgb[7] = 255U;
+  }
 }

 static void BayerRowGB(const uint8* src_bayer0, int src_stride_bayer,
                       uint8* dst_rgb, int pix) {
  const uint8* src_bayer1 = src_bayer0 + src_stride_bayer;
  uint8 b = src_bayer0[1];
-  for (int x = 0; x < (pix - 2); x += 2) {
+  for (int x = 0; x < pix - 3; x += 2) {
    dst_rgb[0] = AVG(b, src_bayer0[1]);
    dst_rgb[1] = src_bayer0[0];
    dst_rgb[2] = src_bayer1[0];
@@ -245,17 +247,19 @@ static void BayerRowGB(const uint8* src_bayer0, int src_stride_bayer,
  dst_rgb[1] = src_bayer0[0];
  dst_rgb[2] = src_bayer1[0];
  dst_rgb[3] = 255U;
-  dst_rgb[4] = src_bayer0[1];
-  dst_rgb[5] = src_bayer0[0];
-  dst_rgb[6] = src_bayer1[0];
-  dst_rgb[7] = 255U;
+  if (pix & 1) {
+    dst_rgb[4] = src_bayer0[1];
+    dst_rgb[5] = src_bayer0[0];
+    dst_rgb[6] = src_bayer1[0];
+    dst_rgb[7] = 255U;
+  }
 }

 static void BayerRowGR(const uint8* src_bayer0, int src_stride_bayer,
                       uint8* dst_rgb, int pix) {
  const uint8* src_bayer1 = src_bayer0 + src_stride_bayer;
  uint8 r = src_bayer0[1];
-  for (int x = 0; x < (pix - 2); x += 2) {
+  for (int x = 0; x < pix - 3; x += 2) {
    dst_rgb[0] = src_bayer1[0];
    dst_rgb[1] = src_bayer0[0];
    dst_rgb[2] = AVG(r, src_bayer0[1]);
@@ -273,10 +277,12 @@ static void BayerRowGR(const uint8* src_bayer0, int src_stride_bayer,
  dst_rgb[1] = src_bayer0[0];
  dst_rgb[2] = AVG(r, src_bayer0[1]);
  dst_rgb[3] = 255U;
-  dst_rgb[4] = src_bayer1[0];
-  dst_rgb[5] = src_bayer0[0];
-  dst_rgb[6] = src_bayer0[1];
-  dst_rgb[7] = 255U;
+  if (pix & 1) {
+    dst_rgb[4] = src_bayer1[0];
+    dst_rgb[5] = src_bayer0[0];
+    dst_rgb[6] = src_bayer0[1];
+    dst_rgb[7] = 255U;
+  }
 }

 // Converts any Bayer RGB format to ARGB.
@@ -315,7 +321,7 @@ int BayerRGBToARGB(const uint8* src_bayer, int src_stride_bayer,
      break;
  }

-  for (int y = 0; y < (height - 1); y += 2) {
+  for (int y = 0; y < height - 1; y += 2) {
    BayerRow0(src_bayer, src_stride_bayer, dst_rgb, width);
    BayerRow1(src_bayer + src_stride_bayer, -src_stride_bayer,
        dst_rgb + dst_stride_rgb, width);
@@ -403,7 +409,7 @@ int BayerRGBToI420(const uint8* src_bayer, int src_stride_bayer,
      break;
  }

-  for (int y = 0; y < (height - 1); y += 2) {
+  for (int y = 0; y < height - 1; y += 2) {
    BayerRow0(src_bayer, src_stride_bayer, row, width);
    BayerRow1(src_bayer + src_stride_bayer, -src_stride_bayer,
              row + kMaxStride, width);

--- a/source/planar_functions.cc
+++ b/source/planar_functions.cc
--- a/source/row.h
+++ b/source/row.h
@@ -13,9 +13,13 @@

 #include "libyuv/basic_types.h"

+#define kMaxStride (2048 * 4)
+
 // The following are available on all x86 platforms
-#if (defined(WIN32) || defined(__x86_64__) || defined(__i386__)) \
-    && !defined(COVERAGE_ENABLED) && !defined(TARGET_IPHONE_SIMULATOR)
+#if (defined(WIN32) || defined(__x86_64__) || defined(__i386__)) && \
+    !defined(COVERAGE_ENABLED) && !defined(TARGET_IPHONE_SIMULATOR)
+#define HAS_ABGRTOARGBROW_SSSE3
+#define HAS_BGRATOARGBROW_SSSE3
 #define HAS_ARGBTOYROW_SSSE3
 #define HAS_BG24TOARGBROW_SSSE3
 #define HAS_RAWTOARGBROW_SSSE3
@@ -23,19 +27,41 @@
 #define HAS_RAWTOYROW_SSSE3
 #define HAS_RGB24TOUVROW_SSSE3
 #define HAS_RAWTOUVROW_SSSE3
-#endif
-
-// The following are available only on Windows
-#if defined(WIN32) \
-    && !defined(COVERAGE_ENABLED) && !defined(TARGET_IPHONE_SIMULATOR)
 #define HAS_BGRATOYROW_SSSE3
 #define HAS_ABGRTOYROW_SSSE3
+#define HAS_I400TOARGBROW_SSE2
+#endif
+
+// The following are available on Windows and Linux
+#if (defined(WIN32) || defined(__x86_64__) || \
+    (defined(__i386__) && !defined(__pic__))) && \
+    !defined(COVERAGE_ENABLED) && !defined(TARGET_IPHONE_SIMULATOR)
 #define HAS_ARGBTOUVROW_SSSE3
 #define HAS_BGRATOUVROW_SSSE3
 #define HAS_ABGRTOUVROW_SSSE3
 #endif

+// The following are available on Linux (32/64 bit)
+// TODO(fbarchard): enable for fpic on linux
+#if (defined(__x86_64__) || \
+    (defined(__i386__) && !defined(__pic__))) && \
+    !defined(COVERAGE_ENABLED) && !defined(TARGET_IPHONE_SIMULATOR)
+#define HAS_FASTCONVERTYUVTOARGBROW_SSE2
+#define HAS_FASTCONVERTYUVTOBGRAROW_SSE2
+#define HAS_FASTCONVERTYUVTOABGRROW_SSE2
+#endif
+
+// The following are available on Windows and GCC 32 bit
+#if (defined(WIN32) || \
+    defined(__i386__)) && \
+    !defined(COVERAGE_ENABLED) && !defined(TARGET_IPHONE_SIMULATOR)
+#define HAS_FASTCONVERTYUVTOARGBROW_MMX
+#define HAS_FASTCONVERTYUVTOBGRAROW_MMX
+#define HAS_FASTCONVERTYUVTOABGRROW_MMX
+#endif
+
 extern "C" {
+
 #ifdef HAS_ARGBTOYROW_SSSE3
 void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix);
 void BGRAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix);
@@ -75,56 +101,128 @@ void RAWToUVRow_C(const uint8* src_argb0, int src_stride_argb,
                  uint8* dst_u, uint8* dst_v, int width);

 #ifdef HAS_BG24TOARGBROW_SSSE3
+void ABGRToARGBRow_SSSE3(const uint8* src_abgr, uint8* dst_argb, int pix);
+void BGRAToARGBRow_SSSE3(const uint8* src_bgra, uint8* dst_argb, int pix);
 void BG24ToARGBRow_SSSE3(const uint8* src_bg24, uint8* dst_argb, int pix);
 void RAWToARGBRow_SSSE3(const uint8* src_bg24, uint8* dst_argb, int pix);
 #endif
+void ABGRToARGBRow_C(const uint8* src_abgr, uint8* dst_argb, int pix);
+void BGRAToARGBRow_C(const uint8* src_bgra, uint8* dst_argb, int pix);
 void BG24ToARGBRow_C(const uint8* src_bg24, uint8* dst_argb, int pix);
 void RAWToARGBRow_C(const uint8* src_bg24, uint8* dst_argb, int pix);

+#ifdef HAS_I400TOARGBROW_SSE2
+void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix);
+#endif
+void I400ToARGBRow_C(const uint8* src_y, uint8* dst_argb, int pix);
+
 #if defined(_MSC_VER)
 #define SIMD_ALIGNED(var) __declspec(align(16)) var
 #define TALIGN16(t, var) static __declspec(align(16)) t _ ## var
-#else
+#else // __GNUC__
 #define SIMD_ALIGNED(var) var __attribute__((aligned(16)))
 #define TALIGN16(t, var) t var __attribute__((aligned(16)))
+typedef signed char __attribute__((vector_size(16))) vec8;
+typedef unsigned char __attribute__((vector_size(16))) uvec8;
 #endif

-#ifdef OSX
-extern SIMD_ALIGNED(const int16 kCoefficientsRgbY[768][4]);
-extern SIMD_ALIGNED(const int16 kCoefficientsBgraY[768][4]);
-extern SIMD_ALIGNED(const int16 kCoefficientsAbgrY[768][4]);
-#else
-extern SIMD_ALIGNED(const int16 _kCoefficientsRgbY[768][4]);
-extern SIMD_ALIGNED(const int16 _kCoefficientsBgraY[768][4]);
-extern SIMD_ALIGNED(const int16 _kCoefficientsAbgrY[768][4]);
-#endif
-void FastConvertYUVToRGB32Row(const uint8* y_buf,
-                              const uint8* u_buf,
-                              const uint8* v_buf,
-                              uint8* rgb_buf,
-                              int width);
-
-void FastConvertYUVToBGRARow(const uint8* y_buf,
-                             const uint8* u_buf,
-                             const uint8* v_buf,
-                             uint8* rgb_buf,
-                             int width);
+extern "C" SIMD_ALIGNED(const int16 kCoefficientsRgbY[768][4]);
+extern "C" SIMD_ALIGNED(const int16 kCoefficientsBgraY[768][4]);
+extern "C" SIMD_ALIGNED(const int16 kCoefficientsAbgrY[768][4]);
+
+void FastConvertYUVToARGBRow_C(const uint8* y_buf,
+                               const uint8* u_buf,
+                               const uint8* v_buf,
+                               uint8* rgb_buf,
+                               int width);
+
+void FastConvertYUVToBGRARow_C(const uint8* y_buf,
+                               const uint8* u_buf,
+                               const uint8* v_buf,
+                               uint8* rgb_buf,
+                               int width);
+
+void FastConvertYUVToABGRRow_C(const uint8* y_buf,
+                               const uint8* u_buf,
+                               const uint8* v_buf,
+                               uint8* rgb_buf,
+                               int width);
+
+void FastConvertYUV444ToARGBRow_C(const uint8* y_buf,
+                                  const uint8* u_buf,
+                                  const uint8* v_buf,
+                                  uint8* rgb_buf,
+                                  int width);

-void FastConvertYUVToABGRRow(const uint8* y_buf,
-                             const uint8* u_buf,
-                             const uint8* v_buf,
+void FastConvertYToARGBRow_C(const uint8* y_buf,
                             uint8* rgb_buf,
                             int width);

-void FastConvertYUV444ToRGB32Row(const uint8* y_buf,
+#ifdef HAS_FASTCONVERTYUVTOARGBROW_SSE2
+void FastConvertYUVToARGBRow_SSE2(const uint8* y_buf,
+                                  const uint8* u_buf,
+                                  const uint8* v_buf,
+                                  uint8* rgb_buf,
+                                  int width);
+
+void FastConvertYUVToARGBRow4_SSE2(const uint8* y_buf,
+                                   const uint8* u_buf,
+                                   const uint8* v_buf,
+                                   uint8* rgb_buf,
+                                   int width);
+
+void FastConvertYUVToBGRARow_SSE2(const uint8* y_buf,
+                                  const uint8* u_buf,
+                                  const uint8* v_buf,
+                                  uint8* rgb_buf,
+                                  int width);
+
+void FastConvertYUVToABGRRow_SSE2(const uint8* y_buf,
+                                  const uint8* u_buf,
+                                  const uint8* v_buf,
+                                  uint8* rgb_buf,
+                                  int width);
+
+void FastConvertYUV444ToARGBRow_SSE2(const uint8* y_buf,
+                                     const uint8* u_buf,
+                                     const uint8* v_buf,
+                                     uint8* rgb_buf,
+                                     int width);
+
+void FastConvertYToARGBRow_SSE2(const uint8* y_buf,
+                                uint8* rgb_buf,
+                                int width);
+#endif
+
+#ifdef HAS_FASTCONVERTYUVTOARGBROW_MMX
+void FastConvertYUVToARGBRow_MMX(const uint8* y_buf,
                                 const uint8* u_buf,
                                 const uint8* v_buf,
                                 uint8* rgb_buf,
                                 int width);

-void FastConvertYToRGB32Row(const uint8* y_buf,
-                            uint8* rgb_buf,
-                            int width);
+void FastConvertYUVToBGRARow_MMX(const uint8* y_buf,
+                                 const uint8* u_buf,
+                                 const uint8* v_buf,
+                                 uint8* rgb_buf,
+                                 int width);
+
+void FastConvertYUVToABGRRow_MMX(const uint8* y_buf,
+                                 const uint8* u_buf,
+                                 const uint8* v_buf,
+                                 uint8* rgb_buf,
+                                 int width);
+
+void FastConvertYUV444ToARGBRow_MMX(const uint8* y_buf,
+                                    const uint8* u_buf,
+                                    const uint8* v_buf,
+                                    uint8* rgb_buf,
+                                    int width);
+
+void FastConvertYToARGBRow_MMX(const uint8* y_buf,
+                               uint8* rgb_buf,
+                               int width);
+#endif

 // Method to force C version.
 //#define USE_MMX 0

--- a/source/row_posix.cc
+++ b/source/row_posix.cc
--- a/source/row_table.cc
+++ b/source/row_table.cc
@@ -10,8 +10,6 @@

 #include "row.h"

-#define kMaxStride (2048 * 4)
-
 extern "C" {

 #define MAKETABLE(NAME) \
@@ -232,11 +230,7 @@ SIMD_ALIGNED(const int16 NAME[256 * 3][4]) = {\
  0 \
 }

-#ifdef OSX
 MAKETABLE(kCoefficientsRgbY)
-#else
-MAKETABLE(_kCoefficientsRgbY)
-#endif

 #undef RGBY
 #undef RGBU
@@ -264,12 +258,7 @@ MAKETABLE(_kCoefficientsRgbY)
  0 \
 }

-#ifdef OSX
 MAKETABLE(kCoefficientsBgraY)
-#else
-MAKETABLE(_kCoefficientsBgraY)
-#endif
-

 #undef RGBY
 #undef RGBU
@@ -297,12 +286,39 @@ MAKETABLE(_kCoefficientsBgraY)
  0 \
 }

-#ifdef OSX
 MAKETABLE(kCoefficientsAbgrY)
-#else
-MAKETABLE(_kCoefficientsAbgrY)
-#endif

+// Converts pix ABGR pixels to ARGB by exchanging bytes 0 and 2 of each
+// 4-byte pixel.  A pixel is fully loaded before it is stored, so src_abgr
+// and dst_argb may be the same buffer (in-place conversion).
+void ABGRToARGBRow_C(const uint8* src_abgr, uint8* dst_argb, int pix) {
+  for (int i = 0; i < pix; ++i, src_abgr += 4, dst_argb += 4) {
+    const uint8 byte0 = src_abgr[0];
+    const uint8 byte1 = src_abgr[1];
+    const uint8 byte2 = src_abgr[2];
+    const uint8 byte3 = src_abgr[3];
+    dst_argb[0] = byte2;
+    dst_argb[1] = byte1;
+    dst_argb[2] = byte0;
+    dst_argb[3] = byte3;
+  }
+}
+
+// Converts pix BGRA pixels to ARGB by reversing the byte order of each
+// 4-byte pixel.  A pixel is fully loaded before it is stored, so src_bgra
+// and dst_argb may be the same buffer (in-place conversion).
+void BGRAToARGBRow_C(const uint8* src_bgra, uint8* dst_argb, int pix) {
+  for (int i = 0; i < pix; ++i, src_bgra += 4, dst_argb += 4) {
+    const uint8 byte0 = src_bgra[0];
+    const uint8 byte1 = src_bgra[1];
+    const uint8 byte2 = src_bgra[2];
+    const uint8 byte3 = src_bgra[3];
+    dst_argb[0] = byte3;
+    dst_argb[1] = byte2;
+    dst_argb[2] = byte1;
+    dst_argb[3] = byte0;
+  }
+}

 void RAWToARGBRow_C(const uint8* src_raw, uint8* dst_argb, int pix) {
  for (int x = 0; x < pix; ++x) {
@@ -466,4 +482,133 @@ void RAWToUVRow_SSSE3(const uint8* src_argb, int src_stride_argb,
 #endif
 #endif

+// Expands pix Y samples to ARGB: bytes 0-2 get the Y value, byte 3 gets 255.
+void I400ToARGBRow_C(const uint8* src_y, uint8* dst_argb, int pix) {
+  for (int i = 0; i < pix; ++i, ++src_y, dst_argb += 4) {
+    const uint8 luma = src_y[0];
+    dst_argb[0] = luma;
+    dst_argb[1] = luma;
+    dst_argb[2] = luma;
+    dst_argb[3] = 255u;
+  }
+}
+
+// C reference code that mimics the YUV assembly (saturating pack and add).
+#define packuswb(x) ((x) < 0 ? 0 : ((x) > 255 ? 255 : (x)))
+#define paddsw(x, y) (((x) + (y)) < -32768 ? -32768 : \
+    (((x) + (y)) > 32767 ? 32767 : ((x) + (y))))
+// Converts one Y, U, V sample to a 32-bit pixel; *shift = channel bit offset.
+static inline void YuvPixel(uint8 y,
+                            uint8 u,
+                            uint8 v,
+                            uint8* rgb_buf,
+                            int ashift,
+                            int rshift,
+                            int gshift,
+                            int bshift) {
+  // U term: rows 256..511 of the coefficient table are indexed by u.
+  int b = kCoefficientsRgbY[256+u][0];
+  int g = kCoefficientsRgbY[256+u][1];
+  int r = kCoefficientsRgbY[256+u][2];
+  int a = kCoefficientsRgbY[256+u][3];
+  // V term: rows 512..767, accumulated with 16-bit saturation.
+  b = paddsw(b, kCoefficientsRgbY[512+v][0]);
+  g = paddsw(g, kCoefficientsRgbY[512+v][1]);
+  r = paddsw(r, kCoefficientsRgbY[512+v][2]);
+  a = paddsw(a, kCoefficientsRgbY[512+v][3]);
+  // Y term: rows 0..255, accumulated with 16-bit saturation.
+  b = paddsw(b, kCoefficientsRgbY[y][0]);
+  g = paddsw(g, kCoefficientsRgbY[y][1]);
+  r = paddsw(r, kCoefficientsRgbY[y][2]);
+  a = paddsw(a, kCoefficientsRgbY[y][3]);
+  // Shift every sum right by 6 bits before clamping.
+  b >>= 6;
+  g >>= 6;
+  r >>= 6;
+  a >>= 6;
+  // Clamp each channel to 0..255 and store the packed pixel as one uint32.
+  *reinterpret_cast<uint32*>(rgb_buf) = (packuswb(b) << bshift) |
+                                        (packuswb(g) << gshift) |
+                                        (packuswb(r) << rshift) |
+                                        (packuswb(a) << ashift);
+}
+
+// Converts a row of width pixels to ARGB (shifts: A=24, R=16, G=8, B=0).
+// Each U/V sample is shared by two horizontally adjacent pixels.
+void FastConvertYUVToARGBRow_C(const uint8* y_buf,
+                               const uint8* u_buf,
+                               const uint8* v_buf,
+                               uint8* rgb_buf,
+                               int width) {
+  for (int left = width; left > 1; left -= 2, ++u_buf, ++v_buf) {
+    YuvPixel(y_buf[0], *u_buf, *v_buf, rgb_buf, 24, 16, 8, 0);
+    YuvPixel(y_buf[1], *u_buf, *v_buf, rgb_buf + 4, 24, 16, 8, 0);
+    y_buf += 2;
+    rgb_buf += 8;  // Two output pixels.
+  }
+  if (width & 1) {  // Odd width: one pixel left over.
+    YuvPixel(*y_buf, *u_buf, *v_buf, rgb_buf, 24, 16, 8, 0);
+  }
+}
+
+// Converts a row of width pixels to BGRA (shifts: A=0, R=8, G=16, B=24).
+// Each U/V sample is shared by two horizontally adjacent pixels.
+void FastConvertYUVToBGRARow_C(const uint8* y_buf,
+                               const uint8* u_buf,
+                               const uint8* v_buf,
+                               uint8* rgb_buf,
+                               int width) {
+  for (int left = width; left > 1; left -= 2, ++u_buf, ++v_buf) {
+    YuvPixel(y_buf[0], *u_buf, *v_buf, rgb_buf, 0, 8, 16, 24);
+    YuvPixel(y_buf[1], *u_buf, *v_buf, rgb_buf + 4, 0, 8, 16, 24);
+    y_buf += 2;
+    rgb_buf += 8;  // Two output pixels.
+  }
+  if (width & 1) {  // Odd width: one pixel left over.
+    YuvPixel(*y_buf, *u_buf, *v_buf, rgb_buf, 0, 8, 16, 24);
+  }
+}
+
+// Converts a row of width pixels to ABGR (shifts: A=24, R=0, G=8, B=16).
+// Each U/V sample is shared by two horizontally adjacent pixels.
+void FastConvertYUVToABGRRow_C(const uint8* y_buf,
+                               const uint8* u_buf,
+                               const uint8* v_buf,
+                               uint8* rgb_buf,
+                               int width) {
+  for (int left = width; left > 1; left -= 2, ++u_buf, ++v_buf) {
+    YuvPixel(y_buf[0], *u_buf, *v_buf, rgb_buf, 24, 0, 8, 16);
+    YuvPixel(y_buf[1], *u_buf, *v_buf, rgb_buf + 4, 24, 0, 8, 16);
+    y_buf += 2;
+    rgb_buf += 8;  // Two output pixels.
+  }
+  if (width & 1) {  // Odd width: one pixel left over.
+    YuvPixel(*y_buf, *u_buf, *v_buf, rgb_buf, 24, 0, 8, 16);
+  }
+}
+
+// Converts a row of width pixels to ARGB.  Unlike the FastConvertYUVTo*Row_C
+// variants, every pixel reads its own U and V sample instead of sharing one
+// between two neighbouring pixels.
+void FastConvertYUV444ToARGBRow_C(const uint8* y_buf,
+                                  const uint8* u_buf,
+                                  const uint8* v_buf,
+                                  uint8* rgb_buf,
+                                  int width) {
+  for (int i = 0; i < width; ++i) {
+    // Output pixel i occupies the four bytes starting at rgb_buf + 4 * i.
+    YuvPixel(y_buf[i], u_buf[i], v_buf[i], rgb_buf + 4 * i, 24, 16, 8, 0);
+  }
+}
+
+// Converts a row of width Y-only samples to ARGB.
+void FastConvertYToARGBRow_C(const uint8* y_buf,
+                             uint8* rgb_buf,
+                             int width) {
+  for (int i = 0; i < width; ++i) {
+    // U and V are fixed at 128 for every pixel.
+    YuvPixel(y_buf[i], 128, 128, rgb_buf + 4 * i, 24, 16, 8, 0);
+  }
+}
+
 }  // extern "C"
--- a/source/row_win.cc
+++ b/source/row_win.cc
--- a/source/video_common.h
+++ b/source/video_common.h
@@ -42,6 +42,7 @@ enum FourCC {
  FOURCC_YUY2 = FOURCC('Y', 'U', 'Y', '2'),
  FOURCC_UYVY = FOURCC('U', 'Y', 'V', 'Y'),
  FOURCC_M420 = FOURCC('M', '4', '2', '0'),
+  FOURCC_Q420 = FOURCC('Q', '4', '2', '0'),
  FOURCC_24BG = FOURCC('2', '4', 'B', 'G'),
  FOURCC_ABGR = FOURCC('A', 'B', 'G', 'R'),
  FOURCC_BGRA = FOURCC('B', 'G', 'R', 'A'),