Commit aed1cc94 authored by mikhal@webrtc.org's avatar mikhal@webrtc.org

first draft


git-svn-id: http://libyuv.googlecode.com/svn/trunk@2 16f28f9a-4ce2-e073-06de-1de4eb20be90
parent 81a7628e
/*
* Copyright (c) 2011 The LibYuv project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#ifndef LIBYUV_COMMON_BASIC_TYPES_H_
#define LIBYUV_COMMON_BASIC_TYPES_H_
#include <stddef.h> // for NULL, size_t
#ifndef WIN32
#include <stdint.h> // for uintptr_t
#endif
#ifdef HAVE_CONFIG_H
#include "config.h"
#endif
#include "constructor_magic.h"
// Fixed-width integer aliases. Guarded by INT_TYPES_DEFINED so that projects
// embedding libyuv next to other Google-style code do not redefine them.
#ifndef INT_TYPES_DEFINED
#define INT_TYPES_DEFINED
#ifdef COMPILER_MSVC
typedef __int64 int64;
typedef unsigned __int64 uint64;
#ifndef INT64_C
#define INT64_C(x) x ## I64
#endif
#ifndef UINT64_C
#define UINT64_C(x) x ## UI64
#endif
#define INT64_F "I64"
#else
typedef long long int64;
typedef unsigned long long uint64;
#ifndef INT64_C
#define INT64_C(x) x ## LL
#endif
#ifndef UINT64_C
#define UINT64_C(x) x ## ULL
#endif
#define INT64_F "ll"
#endif /* COMPILER_MSVC */
typedef int int32;
typedef short int16;
// Plain 'char' may be unsigned on some platforms (e.g. ARM ABIs); a type
// named int8 must be signed, so use 'signed char' explicitly.
typedef signed char int8;
typedef unsigned int uint32;
typedef unsigned short uint16;
typedef unsigned char uint8;
#endif // INT_TYPES_DEFINED
#ifdef WIN32
typedef int socklen_t;
#endif
namespace libyuv {
// Minimal min/max helpers — presumably named with a leading underscore to
// sidestep the <windows.h> min/max macros; confirm against Windows builds.
template<class T> inline T _min(T a, T b) {
  if (a > b) {
    return b;
  }
  return a;
}
template<class T> inline T _max(T a, T b) {
  if (a < b) {
    return b;
  }
  return a;
}
// Wait functions that take a number of milliseconds treat kForever as
// "block with no time limit".
const int kForever = -1;
}
// Detect compiler is for x86 or x64.
#if defined(__x86_64__) || defined(_M_X64) || \
defined(__i386__) || defined(_M_IX86)
#define CPU_X86 1
#endif
// Map alignof(t) onto the compiler-specific alignment query.
// NOTE(review): C++11 makes alignof a keyword; this macro will need a
// version guard if the code is ever built as C++11 or later.
#ifdef WIN32
#define alignof(t) __alignof(t)
#else // !WIN32
#define alignof(t) __alignof__(t)
#endif // !WIN32
// True when pointer p is aligned to a bytes; a must be a power of two.
#define IS_ALIGNED(p, a) (0==(reinterpret_cast<uintptr_t>(p) & ((a)-1)))
// Rounds pointer p up to the next multiple of t (t must be a power of two);
// result is returned as a uint8*.
#define ALIGNP(p, t) \
(reinterpret_cast<uint8*>(((reinterpret_cast<uintptr_t>(p) + \
((t)-1)) & ~((t)-1))))
// Silence unused-variable warnings by passing the address to a no-op sink.
#ifndef UNUSED
#define UNUSED(x) Unused(static_cast<const void *>(&x))
#define UNUSED2(x,y) Unused(static_cast<const void *>(&x)); Unused(static_cast<const void *>(&y))
#define UNUSED3(x,y,z) Unused(static_cast<const void *>(&x)); Unused(static_cast<const void *>(&y)); Unused(static_cast<const void *>(&z))
#define UNUSED4(x,y,z,a) Unused(static_cast<const void *>(&x)); Unused(static_cast<const void *>(&y)); Unused(static_cast<const void *>(&z)); Unused(static_cast<const void *>(&a))
#define UNUSED5(x,y,z,a,b) Unused(static_cast<const void *>(&x)); Unused(static_cast<const void *>(&y)); Unused(static_cast<const void *>(&z)); Unused(static_cast<const void *>(&a)); Unused(static_cast<const void *>(&b))
// No-op sink used by the UNUSED* macros above.
inline void Unused(const void *) { }
#endif // UNUSED
// Expands to a GCC __attribute__ on GCC-compatible compilers, nothing elsewhere.
#if defined(__GNUC__)
#define GCC_ATTR(x) __attribute__ ((x))
#else // !__GNUC__
#define GCC_ATTR(x)
#endif // !__GNUC__
#endif // LIBYUV_COMMON_BASIC_TYPES_H_
/*
* Copyright (c) 2011 The LibYuv project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#ifndef LIBYUV_SOURCE_COMMON_H_
#define LIBYUV_SOURCE_COMMON_H_
#include "constructor_magic.h"
#if defined(_MSC_VER)
// warning C4355: 'this' : used in base member initializer list
#pragma warning(disable:4355)
#endif
//////////////////////////////////////////////////////////////////////
// General Utilities
//////////////////////////////////////////////////////////////////////
// Unused-variable silencers. NOTE(review): this is a duplicate of the block
// in basic_types.h; the #ifndef guard keeps the two copies from colliding.
#ifndef UNUSED
#define UNUSED(x) Unused(static_cast<const void *>(&x))
#define UNUSED2(x,y) Unused(static_cast<const void *>(&x)); Unused(static_cast<const void *>(&y))
#define UNUSED3(x,y,z) Unused(static_cast<const void *>(&x)); Unused(static_cast<const void *>(&y)); Unused(static_cast<const void *>(&z))
#define UNUSED4(x,y,z,a) Unused(static_cast<const void *>(&x)); Unused(static_cast<const void *>(&y)); Unused(static_cast<const void *>(&z)); Unused(static_cast<const void *>(&a))
#define UNUSED5(x,y,z,a,b) Unused(static_cast<const void *>(&x)); Unused(static_cast<const void *>(&y)); Unused(static_cast<const void *>(&z)); Unused(static_cast<const void *>(&a)); Unused(static_cast<const void *>(&b))
// No-op sink used by the UNUSED* macros above.
inline void Unused(const void *) { }
#endif // UNUSED
// Portability shims: map the MSVC-style case-insensitive compares onto their
// POSIX equivalents, and route stdmax to std::max off Windows.
// NOTE(review): <algorithm> is not included here for std::max — presumably
// pulled in transitively; confirm.
#ifndef WIN32
#define strnicmp(x,y,n) strncasecmp(x,y,n)
#define stricmp(x,y) strcasecmp(x,y)
// TODO(sergeyu): Remove this. std::max should be used everywhere in the code.
// NOMINMAX must be defined where we include <windows.h>.
#define stdmax(x,y) std::max(x,y)
#else
#define stdmax(x,y) libyuv::_max(x,y)
#endif
// Number of elements in a statically-sized array, as an int.
#define ARRAY_SIZE(x) (static_cast<int>((sizeof(x)/sizeof(x[0]))))
/////////////////////////////////////////////////////////////////////////////
// Assertions
/////////////////////////////////////////////////////////////////////////////
// ENABLE_DEBUG defaults to the compiler's _DEBUG setting unless overridden.
#ifndef ENABLE_DEBUG
#define ENABLE_DEBUG _DEBUG
#endif // !defined(ENABLE_DEBUG)
#if ENABLE_DEBUG
namespace libyuv {
// Break causes the debugger to stop executing, or the program to abort.
void Break();
// LogAssert writes information about a failed assertion to the log.
void LogAssert(const char * function, const char * file, int line,
               const char * expression);
// Returns |result|; on failure, also logs the assertion and traps into the
// debugger. Shared implementation behind ASSERT and VERIFY below.
inline bool Assert(bool result, const char * function, const char * file,
                   int line, const char * expression) {
  if (!result) {
    LogAssert(function, file, line, expression);
    Break();
    return false;
  }
  return true;
}
} // namespace libyuv
// VC++ 6.0 and earlier lack __FUNCTION__; substitute an empty string.
#if defined(_MSC_VER) && _MSC_VER < 1300
#define __FUNCTION__ ""
#endif
#ifndef ASSERT
#define ASSERT(x) (void)libyuv::Assert((x),__FUNCTION__,__FILE__,__LINE__,#x)
#endif
#ifndef VERIFY
#define VERIFY(x) libyuv::Assert((x),__FUNCTION__,__FILE__,__LINE__,#x)
#endif
#else // !ENABLE_DEBUG
namespace libyuv {
// In release builds VERIFY(x) still evaluates x; this helper just passes the
// value through. (It was mistakenly named 'libyuv', which left the
// libyuv::ImplicitCastToBool referenced by VERIFY below undefined.)
inline bool ImplicitCastToBool(bool result) { return result; }
} // namespace libyuv
#ifndef ASSERT
#define ASSERT(x) (void)0
#endif
#ifndef VERIFY
#define VERIFY(x) libyuv::ImplicitCastToBool(x)
#endif
#endif // !ENABLE_DEBUG
// Compile-time assertion: declares a char array whose size is -1 (a hard
// compile error) when |expr| is false. The expression is parenthesized and
// mapped to 1/-1 so that a false expression cannot silently become a
// zero-length array (which GCC accepts as an extension).
#define COMPILE_TIME_ASSERT(expr) char CTA_UNIQUE_NAME[(expr) ? 1 : -1]
// Builds a per-line unique identifier (constraint_<line>) for the array name.
#define CTA_UNIQUE_NAME CTA_MAKE_NAME(__LINE__)
#define CTA_MAKE_NAME(line) CTA_MAKE_NAME2(line)
#define CTA_MAKE_NAME2(line) constraint_ ## line
#ifdef __GNUC__
// Forces compiler to inline, even against its better judgement. Use wisely.
#define FORCE_INLINE __attribute__((always_inline))
#else
#define FORCE_INLINE
#endif
#endif // LIBYUV_SOURCE_COMMON_H_
/*
* Copyright (c) 2011 The LibYuv project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#ifndef LIBYUV_COMMON_CONSTRUCTOR_MAGIC_H_
#define LIBYUV_COMMON_CONSTRUCTOR_MAGIC_H_
// A macro to disallow the assignment operator. Declares (without defining)
// operator=; place it in the private: section of a class.
#define DISALLOW_ASSIGN(TypeName) \
void operator=(const TypeName&)
// A macro to disallow the evil copy constructor and operator= functions
// This should be used in the private: declarations for a class
#define DISALLOW_COPY_AND_ASSIGN(TypeName) \
TypeName(const TypeName&); \
DISALLOW_ASSIGN(TypeName)
// Alternative, less-accurate legacy name.
#define DISALLOW_EVIL_CONSTRUCTORS(TypeName) \
DISALLOW_COPY_AND_ASSIGN(TypeName)
// A macro to disallow all the implicit constructors, namely the
// default constructor, copy constructor and operator= functions.
//
// This should be used in the private: declarations for a class
// that wants to prevent anyone from instantiating it. This is
// especially useful for classes containing only static methods.
#define DISALLOW_IMPLICIT_CONSTRUCTORS(TypeName) \
TypeName(); \
DISALLOW_EVIL_CONSTRUCTORS(TypeName)
#endif // LIBYUV_COMMON_CONSTRUCTOR_MAGIC_H_
/*
* Copyright (c) 2011 The LibYuv project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#ifndef LIBYUV_INCLUDE_CONVERT_H_
#define LIBYUV_INCLUDE_CONVERT_H_

#include "basic_types.h"

namespace libyuv
{

// Color-format conversion routines.
//
// Planar images are passed as one pointer + stride (in bytes) per plane;
// packed frames as a single pointer + stride. The visible implementations
// return 0 if OK and < 0 on invalid arguments — confirm for each function.

int ConvertI420ToRGB24(const uint8* src_yplane, int src_ystride,
                       const uint8* src_uplane, int src_ustride,
                       const uint8* src_vplane, int src_vstride,
                       uint8* dst_frame, int dst_stride,
                       int src_width,
                       int src_height);

int ConvertI420ToARGB(const uint8* src_yplane, int src_ystride,
                      const uint8* src_uplane, int src_ustride,
                      const uint8* src_vplane, int src_vstride,
                      uint8* dst_frame, int dst_stride,
                      int src_width,
                      int src_height);

int ConvertI420ToARGB4444(const uint8* src_yplane, int src_ystride,
                          const uint8* src_uplane, int src_ustride,
                          const uint8* src_vplane, int src_vstride,
                          uint8* dst_frame, int dst_stride,
                          int src_width,
                          int src_height);

int ConvertI420ToRGB565(const uint8* src_yplane, int src_ystride,
                        const uint8* src_uplane, int src_ustride,
                        const uint8* src_vplane, int src_vstride,
                        uint8* dst_frame, int dst_stride,
                        int src_width,
                        int src_height);

int ConvertI420ToRGB565Android(const uint8* src_yplane, int src_ystride,
                               const uint8* src_uplane, int src_ustride,
                               const uint8* src_vplane, int src_vstride,
                               uint8* dst_frame, int dst_stride,
                               int src_width,
                               int src_height);

int ConvertI420ToARGB1555(const uint8* src_yplane, int src_ystride,
                          const uint8* src_uplane, int src_ustride,
                          const uint8* src_vplane, int src_vstride,
                          uint8* dst_frame, int dst_stride,
                          int src_width,
                          int src_height);

int ConvertYV12ToARGB(const uint8* src_yplane, int src_ystride,
                      const uint8* src_uplane, int src_ustride,
                      const uint8* src_vplane, int src_vstride,
                      uint8* dst_frame, int dst_stride,
                      int src_width,
                      int src_height);

int ConvertYV12ToRGBA(const uint8* src_yplane, int src_ystride,
                      const uint8* src_uplane, int src_ustride,
                      const uint8* src_vplane, int src_vstride,
                      uint8* dst_frame, int dst_stride,
                      int src_width,
                      int src_height);

// NOTE(review): this declaration appeared twice in the original header; the
// redundant second copy has been removed.
int ConvertI420ToYUY2(const uint8* src_yplane, int src_ystride,
                      const uint8* src_uplane, int src_ustride,
                      const uint8* src_vplane, int src_vstride,
                      uint8* dst_frame, int dst_stride,
                      int src_width,
                      int src_height);

int ConvertUYVYToI420(const uint8* src_frame, int src_stride,
                      uint8* dst_yplane, int dst_ystride,
                      uint8* dst_uplane, int dst_ustride,
                      uint8* dst_vplane, int dst_vstride,
                      int src_width,
                      int src_height);

int ConvertI420ToYV12(const uint8* src_yplane, int src_ystride,
                      const uint8* src_uplane, int src_ustride,
                      const uint8* src_vplane, int src_vstride,
                      uint8* dst_frame, int dst_stride,
                      int src_width,
                      int src_height);

int ConvertRGB24ToARGB(const uint8* src_frame, int src_stride,
                       uint8* dst_frame, int dst_stride,
                       int src_width, int src_height);

int ConvertRGB24ToI420(const uint8* src_frame, int src_stride,
                       uint8* dst_yplane, int dst_ystride,
                       uint8* dst_uplane, int dst_ustride,
                       uint8* dst_vplane, int dst_vstride,
                       int src_width,
                       int src_height);

int ConvertABGRToI420(const uint8* src_frame, int src_stride,
                      uint8* dst_yplane, int dst_ystride,
                      uint8* dst_uplane, int dst_ustride,
                      uint8* dst_vplane, int dst_vstride,
                      int src_width,
                      int src_height);

int ConvertNv12ToRGB565(const uint8* src_yplane, int src_ystride,
                        const uint8* src_uvplane, int src_uvstride,
                        uint8* dst_frame, int dst_stride,
                        int src_width,
                        int src_height);

int ConvertI420ToABGR(const uint8* src_yplane, int src_ystride,
                      const uint8* src_uplane, int src_ustride,
                      const uint8* src_vplane, int src_vstride,
                      uint8* dst_frame, int dst_stride,
                      int src_width,
                      int src_height);

} // namespace libyuv

#endif // LIBYUV_INCLUDE_CONVERT_H_
/*
* Copyright (c) 2011 The LibYuv project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#ifndef LIBYUV_INCLUDE_FORMAT_CONVERSION_H_
#define LIBYUV_INCLUDE_FORMAT_CONVERSION_H_

#include "basic_types.h"

namespace libyuv {

// Converts an image in any Bayer RGB layout (selected by src_fourcc_bayer)
// into planar I420.
void BayerRGBToI420(const uint8* src_bayer, int src_pitch_bayer,
                    uint32 src_fourcc_bayer,
                    uint8* dst_y, int dst_pitch_y,
                    uint8* dst_u, int dst_pitch_u,
                    uint8* dst_v, int dst_pitch_v,
                    int width, int height);

// Converts a 32-bit ARGB image (selected by src_fourcc_rgb) into any Bayer
// RGB layout (selected by dst_fourcc_bayer).
void RGB32ToBayerRGB(const uint8* src_rgb, int src_pitch_rgb,
                     uint32 src_fourcc_rgb,
                     uint8* dst_bayer, int dst_pitch_bayer,
                     uint32 dst_fourcc_bayer,
                     int width, int height);

} // namespace libyuv

#endif // LIBYUV_INCLUDE_FORMAT_CONVERSION_H_
/*
* Copyright (c) 2011 The LibYuv project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
/*
* General operations on YUV images.
*/
#ifndef LIBYUV_INCLUDE_GENERAL_H_
#define LIBYUV_INCLUDE_GENERAL_H_

#include "basic_types.h"

namespace libyuv {

// Supported rotations, expressed in degrees; positive values are clockwise.
// No comma after the final enumerator: a trailing comma in an
// enumerator-list is ill-formed in C++03 (only C++11 allows it).
enum VideoRotationMode
{
  kRotateNone = 0,
  kRotateClockwise = 90,
  kRotateAntiClockwise = -90,
  kRotate180 = 180
};

// Mirrors an I420 image left/right.
// Input:
//    - src_width  : Image width in pixels.
//    - src_height : Image height in pixels.
//    - src_frame  : Pointer to the input image.
//    - dst_frame  : Pointer to the mirrored image.
// Return value: 0 if OK, < 0 otherwise.
int MirrorI420LeftRight(const uint8* src_frame, int src_stride,
                        uint8* dst_frame, int dst_stride,
                        int src_width, int src_height);

// Cut/Pad I420 frame to match desired dimensions.
int CutPadI420Frame(const uint8* inFrame, int inWidth,
                    int inHeight, uint8* outFrame,
                    int outWidth, int outHeight);

// I420 Cut - make a center cut (in place, in the given buffer).
int CutI420Frame(uint8* frame, int fromWidth,
                 int fromHeight, int toWidth,
                 int toHeight);

} // namespace libyuv

#endif // LIBYUV_INCLUDE_GENERAL_H_
/*
* Copyright (c) 2011 The LibYuv project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#ifndef LIBYUV_INCLUDE_PLANAR_FUNCTIONS_H_
#define LIBYUV_INCLUDE_PLANAR_FUNCTIONS_H_

#include "basic_types.h"

namespace libyuv {

// Collection of static planar-image helpers; the class is never instantiated.
class PlanarFunctions {
 public:
  // Copy I420 to I420.
  static void I420Copy(const uint8* src_y, int src_pitch_y,
                       const uint8* src_u, int src_pitch_u,
                       const uint8* src_v, int src_pitch_v,
                       uint8* dst_y, int dst_pitch_y,
                       uint8* dst_u, int dst_pitch_u,
                       uint8* dst_v, int dst_pitch_v,
                       int width, int height);

  // Convert I422 to I420. Used by MJPG.
  static void I422ToI420(const uint8* src_y, int src_pitch_y,
                         const uint8* src_u, int src_pitch_u,
                         const uint8* src_v, int src_pitch_v,
                         uint8* dst_y, int dst_pitch_y,
                         uint8* dst_u, int dst_pitch_u,
                         uint8* dst_v, int dst_pitch_v,
                         int width, int height);

  // Convert M420 to I420.
  static void M420ToI420(uint8* dst_y, int dst_pitch_y,
                         uint8* dst_u, int dst_pitch_u,
                         uint8* dst_v, int dst_pitch_v,
                         const uint8* m420, int pitch_m420,
                         int width, int height);

  // Convert NV12 to I420. Also used for NV21.
  static void NV12ToI420(uint8* dst_y, int dst_pitch_y,
                         uint8* dst_u, int dst_pitch_u,
                         uint8* dst_v, int dst_pitch_v,
                         const uint8* src_y,
                         const uint8* src_uv,
                         int src_pitch,
                         int width, int height);

 private:
  // Static-only class: disallow construction and copying. The macro must be
  // in a private section to have any effect; it previously sat under
  // public:, which left the declared constructors publicly accessible.
  DISALLOW_IMPLICIT_CONSTRUCTORS(PlanarFunctions);
};

} // namespace libyuv

#endif // LIBYUV_INCLUDE_PLANAR_FUNCTIONS_H_
#ifndef LIBYUV_INCLUDE_SCALE_H_
#define LIBYUV_INCLUDE_SCALE_H_
// NOTE(review): unlike the other headers in this tree, this file is missing
// the standard license banner.
#include "basic_types.h"
// ALIGN16 marks a variable as 16-byte aligned, using the compiler-specific
// spelling (MSVC prefix form vs. GCC suffix attribute).
#if defined(_MSC_VER)
#define ALIGN16(var) __declspec(align(16)) var
#else
#define ALIGN16(var) var __attribute__((aligned(16)))
#endif
namespace libyuv
{
// Static-only collection of YUV 4:2:0 scaling routines.
class YuvScaler {
public:
  // Scales a YUV 4:2:0 image from the input width and height to the
  // output width and height. If outh_offset is nonzero, the image is
  // offset by that many pixels and stretched to (outh - outh_offset * 2)
  // pixels high, instead of outh.
  // If interpolate is not set, a simple nearest-neighbor algorithm is
  // used. This produces basic (blocky) quality at the fastest speed.
  // If interpolate is set, interpolation is used to produce a better
  // quality image, at the expense of speed.
  // Returns true if successful.
  static bool Scale(const uint8 *in, int32 inw, int32 inh,
uint8 *out, int32 outw, int32 outh, int32 outh_offset,
bool interpolate);
  // Same, but specified in terms of each plane location and stride.
  static bool Scale(const uint8 *inY, const uint8 *inU, const uint8 *inV,
int32 istrideY, int32 istrideU, int32 istrideV,
int32 iwidth, int32 iheight,
uint8 *outY, uint8 *outU, uint8 *outV,
int32 ostrideY, int32 ostrideU, int32 ostrideV,
int32 owidth, int32 oheight,
bool interpolate);
  // For testing, allow disabling of optimizations.
  static void SetUseReferenceImpl(bool use) { use_reference_impl_ = use; }
private:
  // When true, both Scale overloads use the unoptimized reference path.
  static bool use_reference_impl_;
  DISALLOW_IMPLICIT_CONSTRUCTORS(YuvScaler);
};
} // namespace libyuv
#endif // LIBYUV_INCLUDE_SCALE_H_
# Copyright (c) 2011 The LibYuv project authors. All Rights Reserved.
#
# Use of this source code is governed by a BSD-style license
# that can be found in the LICENSE file in the root of the source
# tree. An additional intellectual property rights grant can be found
# in the file PATENTS. All contributing project authors may
# be found in the AUTHORS file in the root of the source tree.
{
  # Single static-library target containing all libyuv code.
'targets': [
{
'target_name': 'libyuv',
'type': 'static_library',
'dependencies': [
],
'include_dirs': [
'include',
'common',
],
'sources': [
# includes
'include/convert.h',
'include/general.h',
'include/scale.h',
'include/planar_functions.h',
# NOTE(review): include/format_conversion.h exists in this tree but is not
# listed here - confirm whether it should be added.
# headers
'common/basic_types.h',
'common/common.h',
'common/constructor_magic.h',
# NOTE(review): conversion_tables.h (included by source/convert.cc) is not
# listed - confirm its path and whether it should appear here.
'source/cpu_id.h',
'source/row.h',
'source/video_common.h',
# sources
'source/convert.cc',
'source/general.cc',
'source/scale.cc',
'source/cpu_id.cc',
'source/format_conversion.cc',
'source/planar_functions.cc',
'source/row_posix.cc',
'source/row_table.cc',
'source/video_common.cc',
],
},
], # targets
}
# Local Variables:
# tab-width:2
# indent-tabs-mode:nil
# End:
# vim: set expandtab tabstop=2 shiftwidth=2:
/*
* Copyright (c) 2011 The LibYuv project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
/**************************************************************
* conversion_tables.h
*
* Pre-compiled definitions of the conversion equations: YUV -> RGB.
*
***************************************************************/
#ifndef LIBYUV_COMMON_CONVERSION_TABLES_H_
#define LIBYUV_COMMON_CONVERSION_TABLES_H_
// (Guard renamed from the WEBRTC_COMMON_VIDEO_VPLIB_* leftover to match the
// LIBYUV_* convention used by every other header in this tree.)
namespace libyuv
{
/******************************************************************************
* YUV TO RGB approximation
*
* R = clip( (298 * (Y - 16) + 409 * (V - 128) + 128 ) >> 8 )
* G = clip( (298 * (Y - 16) - 100 * (U - 128) - 208 * (V - 128) + 128 ) >> 8 )
* B = clip( (298 * (Y - 16) + 516 * (U - 128) + 128 ) >> 8 )
*******************************************************************************/
// Per-component fixed-point contributions. Arguments are fully parenthesized
// so the generator macros below can pass compound expressions like (i) + 1.
#define Yc(i) static_cast<int>( 298 * ((i) - 16))   // Y contribution
#define Ucg(i) static_cast<int>(-100 * ((i) - 128)) // U contribution to G
#define Ucb(i) static_cast<int>( 516 * ((i) - 128)) // U contribution to B
#define Vcr(i) static_cast<int>( 409 * ((i) - 128)) // V contribution to R
#define Vcg(i) static_cast<int>(-208 * ((i) - 128)) // V contribution to G
// Helpers expanding F(i) for 4 / 16 / 64 / 256 consecutive indices; used to
// build the 256-entry lookup tables without spelling out every entry.
#define LIBYUV_TBL4(F, i)  F(i), F((i) + 1), F((i) + 2), F((i) + 3)
#define LIBYUV_TBL16(F, i) LIBYUV_TBL4(F, i), LIBYUV_TBL4(F, (i) + 4), \
                           LIBYUV_TBL4(F, (i) + 8), LIBYUV_TBL4(F, (i) + 12)
#define LIBYUV_TBL64(F, i) LIBYUV_TBL16(F, i), LIBYUV_TBL16(F, (i) + 16), \
                           LIBYUV_TBL16(F, (i) + 32), LIBYUV_TBL16(F, (i) + 48)
#define LIBYUV_TBL256(F)   LIBYUV_TBL64(F, 0), LIBYUV_TBL64(F, 64), \
                           LIBYUV_TBL64(F, 128), LIBYUV_TBL64(F, 192)
// Pre-computed contribution tables, indexed by the raw 8-bit Y/U/V sample.
static const int mapYc[256]  = { LIBYUV_TBL256(Yc) };
static const int mapUcg[256] = { LIBYUV_TBL256(Ucg) };
static const int mapUcb[256] = { LIBYUV_TBL256(Ucb) };
static const int mapVcr[256] = { LIBYUV_TBL256(Vcr) };
static const int mapVcg[256] = { LIBYUV_TBL256(Vcg) };
// The generator helpers are internal to this header; Yc/Ucg/Ucb/Vcr/Vcg stay
// defined, as they did originally.
#undef LIBYUV_TBL256
#undef LIBYUV_TBL64
#undef LIBYUV_TBL16
#undef LIBYUV_TBL4
} // namespace libyuv
#endif // LIBYUV_COMMON_CONVERSION_TABLES_H_
/*
* Copyright (c) 2011 The LibYuv project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "convert.h"
#include <string.h> // memcpy(), memset()
#include <assert.h>
#include <stdlib.h> // abs
//#define SCALEOPT //Currently for windows only. June 2010
#ifdef SCALEOPT
#include <emmintrin.h>
#endif
#include "conversion_tables.h"
namespace libyuv
{
// Clip value to [0,255]
inline uint8 Clip(int32 val);
#ifdef SCALEOPT
void *memcpy_16(void * dest, const void * src, size_t n);
void *memcpy_8(void * dest, const void * src, size_t n);
#endif
// Converts an I420 frame (planar YUV, chroma subsampled 2x2) to 24-bit
// RGB (memory order B,G,R), written bottom-up (vertically flipped).
// Two source rows are processed per pass so each chroma sample serves
// its whole 2x2 pixel group.  Strides are in pixels.
// Returns 0 on success, -1 when any dimension or stride is zero.
// Fixed vs. the first draft: the end-of-row-pair adjustment was
// "out -= dst_stride * 9", which is only correct when
// dst_stride == src_width; the general form backs up this pair's
// output (3 * src_width bytes) plus two full rows (6 * dst_stride).
int
ConvertI420ToRGB24(const uint8* src_yplane, int src_ystride,
                   const uint8* src_uplane, int src_ustride,
                   const uint8* src_vplane, int src_vstride,
                   uint8* dst_frame, int dst_stride,
                   int src_width,
                   int src_height
                   )
{
  if (src_width == 0 || src_height == 0 || src_ystride == 0 ||
      src_ustride == 0 || src_vstride == 0 || dst_stride == 0)
  {
    return -1;
  }
  // RGB orientation - bottom up: start at the first byte of the last
  // output row (3 bytes per pixel).
  uint8* out = dst_frame + dst_stride * src_height * 3 - dst_stride * 3;
  uint8* out2 = out - dst_stride * 3;
  int h, w;
  int tmpR, tmpG, tmpB;
  const uint8 *y1, *y2, *u, *v;
  y1 = src_yplane;
  y2 = y1 + src_ystride;
  u = src_uplane;
  v = src_vplane;
  for (h = (src_height >> 1); h > 0; h--)
  {
    // 2 rows at a time, 2 y's at a time
    for (w = 0; w < (src_width >> 1); w++)
    {
      // Vertical and horizontal sub-sampling.  The map* tables are 8.8
      // fixed point; +128 rounds before the >>8, Clip saturates to
      // [0,255].
      tmpR = (int32)((mapYc[y1[0]] + mapVcr[v[0]] + 128) >> 8);
      tmpG = (int32)((mapYc[y1[0]] + mapUcg[u[0]] + mapVcg[v[0]]
             + 128) >> 8);
      tmpB = (int32)((mapYc[y1[0]] + mapUcb[u[0]] + 128) >> 8);
      out[2] = Clip(tmpR);
      out[1] = Clip(tmpG);
      out[0] = Clip(tmpB);
      tmpR = (int32)((mapYc[y2[0]] + mapVcr[v[0]] + 128) >> 8);
      tmpG = (int32)((mapYc[y2[0]] + mapUcg[u[0]] + mapVcg[v[0]]
             + 128) >> 8);
      tmpB = (int32)((mapYc[y2[0]] + mapUcb[u[0]] + 128) >> 8);
      out2[2] = Clip(tmpR);
      out2[1] = Clip(tmpG);
      out2[0] = Clip(tmpB);
      tmpR = (int32)((mapYc[y1[1]] + mapVcr[v[0]] + 128) >> 8);
      tmpG = (int32)((mapYc[y1[1]] + mapUcg[u[0]] + mapVcg[v[0]]
             + 128) >> 8);
      tmpB = (int32)((mapYc[y1[1]] + mapUcb[u[0]] + 128) >> 8);
      out[5] = Clip(tmpR);
      out[4] = Clip(tmpG);
      out[3] = Clip(tmpB);
      tmpR = (int32)((mapYc[y2[1]] + mapVcr[v[0]] + 128) >> 8);
      tmpG = (int32)((mapYc[y2[1]] + mapUcg[u[0]] + mapVcg[v[0]]
             + 128) >> 8);
      tmpB = (int32)((mapYc[y2[1]] + mapUcb[u[0]] + 128) >> 8);
      out2[5] = Clip(tmpR);
      out2[4] = Clip(tmpG);
      out2[3] = Clip(tmpB);
      out += 6;
      out2 += 6;
      y1 += 2;
      y2 += 2;
      u++;
      v++;
    }
    // Skip the row handled by the other luma pointer plus any padding.
    y1 += 2 * src_ystride - src_width;
    y2 += 2 * src_ystride - src_width;
    u += src_ustride - (src_width >> 1);
    v += src_vstride - (src_width >> 1);
    // Step the output pointers up two rows: undo this pair's advance
    // (3 * src_width bytes) and move past two destination rows.
    out -= 6 * dst_stride + 3 * src_width;
    out2 -= 6 * dst_stride + 3 * src_width;
  } // end height for
  return 0;
}
int
ConvertI420ToARGB(const uint8* src_yplane, int src_ystride,
const uint8* src_uplane, int src_ustride,
const uint8* src_vplane, int src_vstride,
uint8* dst_frame, int dst_stride,
int src_width,
int src_height
)
{
if (src_width == 0 || src_height == 0 || src_ystride == 0 ||
src_ustride == 0 || src_vstride == 0 || dst_stride == 0)
{
return -1;
}
uint8* out1 = dst_frame;
uint8* out2 = out1 + dst_stride * 4;
const uint8 *y1,*y2, *u, *v;
y1 = src_yplane;
y2 = src_yplane + src_ystride;
u = src_uplane;
v = src_vplane;
int h, w;
int tmpR, tmpG, tmpB;
for (h = (src_height >> 1); h > 0; h--)
{
// Do 2 rows at the time
for (w = 0; w < (src_width >> 1); w++)
{ // Vertical and horizontal sub-sampling
tmpR = (int32)((mapYc[y1[0]] + mapVcr[v[0]] + 128) >> 8);
tmpG = (int32)((mapYc[y1[0]] + mapUcg[u[0]] + mapVcg[v[0]]
+ 128) >> 8);
tmpB = (int32)((mapYc[y1[0]] + mapUcb[u[0]] + 128) >> 8);
out1[3] = 0xff;
out1[2] = Clip(tmpR);
out1[1] = Clip(tmpG);
out1[0] = Clip(tmpB);
tmpR = (int32)((mapYc[y2[0]] + mapVcr[v[0]] + 128) >> 8);
tmpG = (int32)((mapYc[y2[0]] + mapUcg[u[0]] + mapVcg[v[0]]
+ 128) >> 8);
tmpB = (int32)((mapYc[y2[0]] + mapUcb[u[0]] + 128) >> 8);
out2[3] = 0xff;
out2[2] = Clip(tmpR);
out2[1] = Clip(tmpG);
out2[0] = Clip(tmpB);
tmpR = (int32)((mapYc[y1[1]] + mapVcr[v[0]] + 128) >> 8);
tmpG = (int32)((mapYc[y1[1]] + mapUcg[u[0]] + mapVcg[v[0]]
+ 128) >> 8);
tmpB = (int32)((mapYc[y1[1]] + mapUcb[u[0]] + 128) >> 8);
out1[7] = 0xff;
out1[6] = Clip(tmpR);
out1[5] = Clip(tmpG);
out1[4] = Clip(tmpB);
tmpR = (int32)((mapYc[y2[1]] + mapVcr[v[0]] + 128) >> 8);
tmpG = (int32)((mapYc[y2[1]] + mapUcg[u[0]] + mapVcg[v[0]]
+ 128) >> 8);
tmpB = (int32)((mapYc[y2[1]] + mapUcb[u[0]] + 128) >> 8);
out2[7] = 0xff;
out2[6] = Clip(tmpR);
out2[5] = Clip(tmpG);
out2[4] = Clip(tmpB);
out1 += 8;
out2 += 8;
y1 += 2;
y2 += 2;
u++;
v++;
}
y1 += 2 * src_ystride - src_width;
y2 += 2 * src_ystride - src_width;
u += src_ustride - (src_width >> 1);
v += src_vstride - (src_width >> 1);
out1 += (2 * dst_stride - src_width) * 4;
out2 += (2 * dst_stride - src_width) * 4;
} // end height for
return 0;
}
int
ConvertYV12ToRGBA(const uint8* src_yplane, int src_ystride,
const uint8* src_uplane, int src_ustride,
const uint8* src_vplane, int src_vstride,
uint8* dst_frame, int dst_stride,
int src_width,
int src_height
)
{
if (src_width == 0 || src_height == 0 || src_ystride == 0 ||
src_ustride == 0 || src_vstride == 0 || dst_stride == 0)
{
return -1;
}
int32 diff = dst_stride - src_width;
uint8 * out = dst_frame;
uint8 * out2 = out + dst_stride * 4;
const uint8 *y1,*y2, *u, *v;
int tmpG, tmpB, tmpR;
int h, w;
y1 = src_yplane;
y2 = y1 + src_ystride;
v = src_vplane;
u = src_uplane;
for (h = (src_height >> 1); h > 0; h--)
{
// Do 2 rows at the time
for (w = 0; w < (src_width >> 1); w++)
{
tmpR = (int32)((mapYc[y1[0]] + mapVcr[v[0]] + 128 ) >> 8);
tmpG = (int32)((mapYc[y1[0]] + mapUcg[u[0]] + mapVcg[v[0]]
+ 128) >> 8);
tmpB = (int32)((mapYc[y1[0]] + mapUcb[u[0]] + 128 ) >> 8);
out[1] = Clip(tmpR);
out[2] = Clip(tmpG);
out[3] = Clip(tmpB);
tmpR = (int32)((mapYc[y2[0]] + mapVcr[v[0]] + 128) >> 8);
tmpG = (int32)((mapYc[y2[0]] + mapUcg[u[0]] + mapVcg[v[0]]
+ 128) >> 8);
tmpB = (int32)((mapYc[y2[0]] + mapUcb[u[0]] + 128) >> 8);
out2[1] = Clip(tmpR);
out2[2] = Clip(tmpG);
out2[3] = Clip(tmpB);
tmpR = (int32)((mapYc[y1[1]] + mapVcr[v[0]] + 128) >> 8);
tmpG = (int32)((mapYc[y1[1]] + mapUcg[u[0]] + mapVcg[v[0]]
+ 128) >> 8);
tmpB = (int32)((mapYc[y1[1]] + mapUcb[u[0]] + 128) >> 8);
out[5] = Clip(tmpR);
out[6] = Clip(tmpG);
out[7] = Clip(tmpB);
tmpR = (int32)((mapYc[y2[1]] + mapVcr[v[0]] + 128) >> 8);
tmpG = (int32)((mapYc[y2[1]] + mapUcg[u[0]] + mapVcg[v[0]]
+ 128) >> 8);
tmpB = (int32)((mapYc[y2[1]] + mapUcb[u[0]] + 128) >> 8);
out2[5] = Clip(tmpR);
out2[6] = Clip(tmpG);
out2[7] = Clip(tmpB);
out[0] = 0xff;
out[4] = 0xff;
out += 8;
out2[0] = 0xff;
out2[4] = 0xff;
out2 += 8;
y1 += 2;
y2 += 2;
u++;
v++;
}
y1 += 2 * src_ystride - src_width;
y2 += 2 * src_ystride - src_width;
u += src_ustride - (src_width >> 1);
v += src_vstride - (src_width >> 1);
out += (src_width + diff * 2) * 4;
out2 += (src_width + diff * 2) * 4;
}
return 0;
}
// Little Endian...
int
ConvertI420ToARGB4444(const uint8* src_yplane, int src_ystride,
const uint8* src_uplane, int src_ustride,
const uint8* src_vplane, int src_vstride,
uint8* dst_frame, int dst_stride,
int src_width,
int src_height
)
{
if (src_width == 0 || src_height == 0 || src_ystride == 0 ||
src_ustride == 0 || src_vstride == 0 || dst_stride == 0)
{
return -1;
}
// RGB orientation - bottom up
uint8* out = dst_frame + dst_stride * (src_height - 1) * 2;
uint8* out2 = out - (2 * dst_stride);
int tmpR, tmpG, tmpB;
const uint8 *y1,*y2, *u, *v;
y1 = src_yplane;
y2 = y1 + src_ystride;
u = src_uplane;
v = src_vplane;
int h, w;
for (h = (src_height >> 1); h > 0; h--)
{
// 2 rows at a time, 2 y's at a time
for (w = 0; w < (src_width >> 1); w++)
{
// Vertical and horizontal sub-sampling
// Convert to RGB888 and re-scale to 4 bits
tmpR = (int32)((mapYc[y1[0]] + mapVcr[v[0]] + 128) >> 8);
tmpG = (int32)((mapYc[y1[0]] + mapUcg[u[0]] + mapVcg[v[0]]
+ 128) >> 8);
tmpB = (int32)((mapYc[y1[0]] + mapUcb[u[0]] + 128) >> 8);
out[0] =(uint8)((Clip(tmpG) & 0xf0) + (Clip(tmpB) >> 4));
out[1] = (uint8)(0xf0 + (Clip(tmpR) >> 4));
tmpR = (int32)((mapYc[y2[0]] + mapVcr[v[0]] + 128) >> 8);
tmpG = (int32)((mapYc[y2[0]] + mapUcg[u[0]] + mapVcg[v[0]]
+ 128) >> 8);
tmpB = (int32)((mapYc[y2[0]] + mapUcb[u[0]] + 128) >> 8);
out2[0] = (uint8)((Clip(tmpG) & 0xf0 ) + (Clip(tmpB) >> 4));
out2[1] = (uint8) (0xf0 + (Clip(tmpR) >> 4));
tmpR = (int32)((mapYc[y1[1]] + mapVcr[v[0]] + 128) >> 8);
tmpG = (int32)((mapYc[y1[1]] + mapUcg[u[0]] + mapVcg[v[0]]
+ 128) >> 8);
tmpB = (int32)((mapYc[y1[1]] + mapUcb[u[0]] + 128) >> 8);
out[2] = (uint8)((Clip(tmpG) & 0xf0 ) + (Clip(tmpB) >> 4));
out[3] = (uint8)(0xf0 + (Clip(tmpR) >> 4));
tmpR = (int32)((mapYc[y2[1]] + mapVcr[v[0]] + 128) >> 8);
tmpG = (int32)((mapYc[y2[1]] + mapUcg[u[0]] + mapVcg[v[0]]
+ 128) >> 8);
tmpB = (int32)((mapYc[y2[1]] + mapUcb[u[0]] + 128) >> 8);
out2[2] = (uint8)((Clip(tmpG) & 0xf0 ) + (Clip(tmpB) >> 4));
out2[3] = (uint8)(0xf0 + (Clip(tmpR) >> 4));
out += 4;
out2 += 4;
y1 += 2;
y2 += 2;
u++;
v++;
}
y1 += 2 * src_ystride - src_width;
y2 += 2 * src_ystride - src_width;
u += src_ustride - (src_width >> 1);
v += src_vstride - (src_width >> 1);
out -= (2 * dst_stride + src_width) * 2;
out2 -= (2 * dst_stride + src_width) * 2;
} // end height for
return 0;
}
// Converts an I420 frame to 16-bit RGB565 (5-6-5 packing), writing the
// destination bottom-up (vertically flipped).  Two source rows share
// each chroma sample.  Strides appear to be measured in pixels
// (dst_stride counts uint16 units here) - TODO confirm with callers.
// Returns 0 on success, -1 when any dimension or stride is zero.
int
ConvertI420ToRGB565(const uint8* src_yplane, int src_ystride,
const uint8* src_uplane, int src_ustride,
const uint8* src_vplane, int src_vstride,
uint8* dst_frame, int dst_stride,
int src_width,
int src_height
)
{
if (src_width == 0 || src_height == 0 || src_ystride == 0 ||
src_ustride == 0 || src_vstride == 0 || dst_stride == 0)
{
return -1;
}
// Bottom-up output: out is the start of the last row, out2 the row above.
uint16* out = (uint16*)(dst_frame) + dst_stride * (src_height - 1);
uint16* out2 = out - dst_stride;
int tmpR, tmpG, tmpB;
const uint8 *y1,*y2, *u, *v;
y1 = src_yplane;
y2 = y1 + src_ystride;
u = src_uplane;
v = src_vplane;
int h, w;
for (h = (src_height >> 1); h > 0; h--)
{
// 2 rows at a time, 2 y's at a time
for (w = 0; w < (src_width >> 1); w++)
{
// Vertical and horizontal sub-sampling
// 1. Convert to RGB888
// 2. Shift to adequate location (in the 16 bit word) - RGB 565
// The map* tables are 8.8 fixed point; +128 rounds before the >>8 and
// Clip saturates to [0,255].
// NOTE(review): the (uint16) cast below binds only to the red term;
// the full sum is evaluated as int and truncated on assignment, so
// the stored value is the same - but keep the expression as-is.
tmpR = (int32)((mapYc[y1[0]] + mapVcr[v[0]] + 128) >> 8);
tmpG = (int32)((mapYc[y1[0]] + mapUcg[u[0]] + mapVcg[v[0]]
+ 128) >> 8);
tmpB = (int32)((mapYc[y1[0]] + mapUcb[u[0]] + 128) >> 8);
out[0] = (uint16)((Clip(tmpR) & 0xf8) << 8) + ((Clip(tmpG)
& 0xfc) << 3) + (Clip(tmpB) >> 3);
tmpR = (int32)((mapYc[y2[0]] + mapVcr[v[0]] + 128) >> 8);
tmpG = (int32)((mapYc[y2[0]] + mapUcg[u[0]] + mapVcg[v[0]]
+ 128) >> 8);
tmpB = (int32)((mapYc[y2[0]] + mapUcb[u[0]] + 128) >> 8);
out2[0] = (uint16)((Clip(tmpR) & 0xf8) << 8) + ((Clip(tmpG)
& 0xfc) << 3) + (Clip(tmpB) >> 3);
tmpR = (int32)((mapYc[y1[1]] + mapVcr[v[0]] + 128) >> 8);
tmpG = (int32)((mapYc[y1[1]] + mapUcg[u[0]] + mapVcg[v[0]]
+ 128) >> 8);
tmpB = (int32)((mapYc[y1[1]] + mapUcb[u[0]] + 128) >> 8);
out[1] = (uint16)((Clip(tmpR) & 0xf8) << 8) + ((Clip(tmpG)
& 0xfc) << 3) + (Clip(tmpB ) >> 3);
tmpR = (int32)((mapYc[y2[1]] + mapVcr[v[0]] + 128) >> 8);
tmpG = (int32)((mapYc[y2[1]] + mapUcg[u[0]] + mapVcg[v[0]]
+ 128) >> 8);
tmpB = (int32)((mapYc[y2[1]] + mapUcb[u[0]] + 128) >> 8);
out2[1] = (uint16)((Clip(tmpR) & 0xf8) << 8) + ((Clip(tmpG)
& 0xfc) << 3) + (Clip(tmpB) >> 3);
y1 += 2;
y2 += 2;
out += 2;
out2 += 2;
u++;
v++;
}
// Hop over the row handled by the other pointer plus any padding.
y1 += 2 * src_ystride - src_width;
y2 += 2 * src_ystride - src_width;
u += src_ustride - (src_width >> 1);
v += src_vstride - (src_width >> 1);
// Back up over this pair's output (src_width pixels) plus two rows.
out -= 2 * dst_stride + src_width;
out2 -= 2 * dst_stride + src_width;
} // end height for
return 0;
}
//Same as ConvertI420ToRGB565 but doesn't flip vertically.
// Same pixel math as ConvertI420ToRGB565 but written top-down (no
// vertical flip), as expected by Android surfaces.  Strides appear to
// be in pixels (dst_stride counts uint16 units) - TODO confirm.
// Returns 0 on success, -1 when any dimension or stride is zero.
int
ConvertI420ToRGB565Android(const uint8* src_yplane, int src_ystride,
const uint8* src_uplane, int src_ustride,
const uint8* src_vplane, int src_vstride,
uint8* dst_frame, int dst_stride,
int src_width,
int src_height
)
{
if (src_width == 0 || src_height == 0 || src_ystride == 0 ||
src_ustride == 0 || src_vstride == 0 || dst_stride == 0)
{
return -1;
}
// Top-down: out starts at the first row, out2 at the second.
uint16* out = (uint16*)(dst_frame);
uint16* out2 = out + dst_stride;
int tmpR, tmpG, tmpB;
const uint8 *y1,*y2, *u, *v;
int h, w;
y1 = src_yplane;
y2 = y1 + src_ystride;
u = src_uplane;
v = src_vplane;
for (h = (src_height >> 1); h > 0; h--)
{
// 2 rows at a time, 2 y's at a time
for (w = 0; w < (src_width >> 1); w++)
{
// Vertical and horizontal sub-sampling
// 1. Convert to RGB888
// 2. Shift to adequate location (in the 16 bit word) - RGB 565
// 8.8 fixed-point table lookups; +128 rounds, Clip saturates.
// NOTE(review): the (uint16) cast binds only to the red term; the
// value stored is unchanged because assignment truncates anyway.
tmpR = (int32)((mapYc[y1[0]] + mapVcr[v[0]] + 128) >> 8);
tmpG = (int32)((mapYc[y1[0]] + mapUcg[u[0]] + mapVcg[v[0]]
+ 128) >> 8);
tmpB = (int32)((mapYc[y1[0]] + mapUcb[u[0]] + 128) >> 8);
out[0] = (uint16)((Clip(tmpR) & 0xf8) << 8) + ((Clip(tmpG)
& 0xfc) << 3) + (Clip(tmpB) >> 3);
tmpR = (int32)((mapYc[y2[0]] + mapVcr[v[0]] + 128) >> 8);
tmpG = (int32)((mapYc[y2[0]] + mapUcg[u[0]] + mapVcg[v[0]]
+ 128) >> 8);
tmpB = (int32)((mapYc[y2[0]] + mapUcb[u[0]] + 128) >> 8);
out2[0] = (uint16)((Clip(tmpR) & 0xf8) << 8) + ((Clip(tmpG)
& 0xfc) << 3) + (Clip(tmpB) >> 3);
tmpR = (int32)((mapYc[y1[1]] + mapVcr[v[0]] + 128) >> 8);
tmpG = (int32)((mapYc[y1[1]] + mapUcg[u[0]] + mapVcg[v[0]]
+ 128) >> 8);
tmpB = (int32)((mapYc[y1[1]] + mapUcb[u[0]] + 128) >> 8);
out[1] = (uint16)((Clip(tmpR) & 0xf8) << 8) + ((Clip(tmpG)
& 0xfc) << 3) + (Clip(tmpB ) >> 3);
tmpR = (int32)((mapYc[y2[1]] + mapVcr[v[0]] + 128) >> 8);
tmpG = (int32)((mapYc[y2[1]] + mapUcg[u[0]] + mapVcg[v[0]]
+ 128) >> 8);
tmpB = (int32)((mapYc[y2[1]] + mapUcb[u[0]] + 128) >> 8);
out2[1] = (uint16)((Clip(tmpR) & 0xf8) << 8) + ((Clip(tmpG)
& 0xfc) << 3) + (Clip(tmpB) >> 3);
y1 += 2;
y2 += 2;
out += 2;
out2 += 2;
u++;
v++;
}
// Advance past the partner row plus padding; output moves down
// two rows (top-down, hence += rather than the flipped -=).
y1 += 2 * src_ystride - src_width;
y2 += 2 * src_ystride - src_width;
u += src_ustride - (src_width >> 1);
v += src_vstride - (src_width >> 1);
out += 2 * dst_stride - src_width;
out2 += 2 * dst_stride - src_width;
} // end height for
return 0;
}
// Converts an I420 frame to 16-bit ARGB1555 (alpha bit forced to 1,
// 5 bits per color channel), writing the destination bottom-up.
// Strides appear to be in pixels (dst_stride counts uint16 units) -
// TODO confirm with callers.
// Returns 0 on success, -1 when any dimension or stride is zero.
int
ConvertI420ToARGB1555(const uint8* src_yplane, int src_ystride,
const uint8* src_uplane, int src_ustride,
const uint8* src_vplane, int src_vstride,
uint8* dst_frame, int dst_stride,
int src_width,
int src_height
)
{
if (src_width == 0 || src_height == 0 || src_ystride == 0 ||
src_ustride == 0 || src_vstride == 0 || dst_stride == 0)
{
return -1;
}
// Bottom-up output: out is the last row, out2 the row above it.
uint16* out = (uint16*)(dst_frame) + dst_stride * (src_height - 1);
uint16* out2 = out - dst_stride ;
int32 tmpR, tmpG, tmpB;
const uint8 *y1,*y2, *u, *v;
int h, w;
y1 = src_yplane;
y2 = y1 + src_ystride;
u = src_uplane;
v = src_vplane;
for (h = (src_height >> 1); h > 0; h--)
{ // 2 rows at a time, 2 y's at a time
for (w = 0; w < (src_width >> 1); w++)
{
// Vertical and horizontal sub-sampling
// 1. Convert to RGB888
// 2. Shift to adequate location (in the 16 bit word) - RGB 555
// 3. Add 1 for alpha value
// 8.8 fixed-point table lookups; +128 rounds, Clip saturates.
// 0x8000 sets the single alpha bit of the 1-5-5-5 word.
tmpR = (int32)((mapYc[y1[0]] + mapVcr[v[0]] + 128) >> 8);
tmpG = (int32)((mapYc[y1[0]] + mapUcg[u[0]] + mapVcg[v[0]]
+ 128) >> 8);
tmpB = (int32)((mapYc[y1[0]] + mapUcb[u[0]] + 128) >> 8);
out[0] = (uint16)(0x8000 + ((Clip(tmpR) & 0xf8) << 10) +
((Clip(tmpG) & 0xf8) << 3) + (Clip(tmpB) >> 3));
tmpR = (int32)((mapYc[y2[0]] + mapVcr[v[0]] + 128) >> 8);
tmpG = (int32)((mapYc[y2[0]] + mapUcg[u[0]] + mapVcg[v[0]]
+ 128) >> 8);
tmpB = (int32)((mapYc[y2[0]] + mapUcb[u[0]] + 128) >> 8);
out2[0] = (uint16)(0x8000 + ((Clip(tmpR) & 0xf8) << 10) +
((Clip(tmpG) & 0xf8) << 3) + (Clip(tmpB) >> 3));
tmpR = (int32)((mapYc[y1[1]] + mapVcr[v[0]] + 128) >> 8);
tmpG = (int32)((mapYc[y1[1]] + mapUcg[u[0]] + mapVcg[v[0]]
+ 128) >> 8);
tmpB = (int32)((mapYc[y1[1]] + mapUcb[u[0]] + 128) >> 8);
out[1] = (uint16)(0x8000 + ((Clip(tmpR) & 0xf8) << 10) +
((Clip(tmpG) & 0xf8) << 3) + (Clip(tmpB) >> 3));
tmpR = (int32)((mapYc[y2[1]] + mapVcr[v[0]] + 128) >> 8);
tmpG = (int32)((mapYc[y2[1]] + mapUcg[u[0]] + mapVcg[v[0]]
+ 128) >> 8);
tmpB = (int32)((mapYc[y2[1]] + mapUcb[u[0]] + 128) >> 8);
out2[1] = (uint16)(0x8000 + ((Clip(tmpR) & 0xf8) << 10) +
((Clip(tmpG) & 0xf8) << 3) + (Clip(tmpB) >> 3));
y1 += 2;
y2 += 2;
out += 2;
out2 += 2;
u++;
v++;
}
// Hop over the partner row plus padding; back the output pointers
// up over this pair's pixels plus two full rows.
y1 += 2 * src_ystride - src_width;
y2 += 2 * src_ystride - src_width;
u += src_ustride - (src_width >> 1);
v += src_vstride - (src_width >> 1);
out -= 2 * dst_stride + src_width;
out2 -= 2 * dst_stride + src_width;
} // end height for
return 0;
}
// Converts planar I420 to packed YUY2 (macro-pixel Y0 U0 Y1 V0, two
// image pixels per 4 bytes), duplicating each chroma sample into both
// rows of its 2x2 group.  Strides are in pixels.
// Returns 0 on success, -1 when any dimension or stride is zero.
// NOTE(review): the SCALEOPT branch below is Windows-only inline SSE2
// assembly and references identifiers (WebRtc_UWord32, height, width,
// strideOut) that do not exist in this function's scope, so it will
// not compile if SCALEOPT is defined - needs porting to the renamed
// parameters before enabling.
int
ConvertI420ToYUY2(const uint8* src_yplane, int src_ystride,
const uint8* src_uplane, int src_ustride,
const uint8* src_vplane, int src_vstride,
uint8* dst_frame, int dst_stride,
int src_width,
int src_height
)
{
if (src_width == 0 || src_height == 0 || src_ystride == 0 ||
src_ustride == 0 || src_vstride == 0 || dst_stride == 0)
{
return -1;
}
const uint8* in1 = src_yplane;
const uint8* in2 = src_yplane + src_ystride ;
const uint8* inU = src_uplane;
const uint8* inV = src_vplane;
uint8* out1 = dst_frame;
// Each output row is 2 * dst_stride bytes (2 bytes per pixel).
uint8* out2 = dst_frame + 2 * dst_stride;
// YUY2 - Macro-pixel = 2 image pixels
// Y0U0Y1V0....Y2U2Y3V2...Y4U4Y5V4....
#ifndef SCALEOPT
for (int i = 0; i < (src_height >> 1);i++)
{
// Two rows per pass; both reuse the same chroma row.
for (int j = 0; j < (src_width >> 1);j++)
{
out1[0] = in1[0];
out1[1] = *inU;
out1[2] = in1[1];
out1[3] = *inV;
out2[0] = in2[0];
out2[1] = *inU;
out2[2] = in2[1];
out2[3] = *inV;
out1 += 4;
out2 += 4;
inU++;
inV++;
in1 += 2;
in2 += 2;
}
// Skip the partner row plus padding; output hops two packed rows.
in1 += 2 * src_ystride - src_width;
in2 += 2 * src_ystride - src_width;
inU += src_ustride - (src_width >> 1);
inV += src_vstride - (src_width >> 1);
out1 += 2 * dst_stride + 2 * (dst_stride - src_width);
out2 += 2 * dst_stride + 2 * (dst_stride - src_width);
}
#else
// SSE2 path: interleaves 16 luma bytes with 8 U and 8 V bytes per
// iteration using punpcklbw/punpckhbw.  See NOTE(review) above - this
// branch does not currently compile.
for (WebRtc_UWord32 i = 0; i < (height >> 1);i++)
{
int32 width__ = (width >> 4);
_asm
{
;pusha
mov eax, DWORD PTR [in1] ;1939.33
mov ecx, DWORD PTR [in2] ;1939.33
mov ebx, DWORD PTR [inU] ;1939.33
mov edx, DWORD PTR [inV] ;1939.33
loop0:
movq xmm6, QWORD PTR [ebx] ;inU
movq xmm0, QWORD PTR [edx] ;inV
punpcklbw xmm6, xmm0 ;inU, inV mix
;movdqa xmm1, xmm6
;movdqa xmm2, xmm6
;movdqa xmm4, xmm6
movdqu xmm3, XMMWORD PTR [eax] ;in1
movdqa xmm1, xmm3
punpcklbw xmm1, xmm6 ;in1, inU, in1, inV
mov esi, DWORD PTR [out1]
movdqu XMMWORD PTR [esi], xmm1 ;write to out1
movdqu xmm5, XMMWORD PTR [ecx] ;in2
movdqa xmm2, xmm5
punpcklbw xmm2, xmm6 ;in2, inU, in2, inV
mov edi, DWORD PTR [out2]
movdqu XMMWORD PTR [edi], xmm2 ;write to out2
punpckhbw xmm3, xmm6 ;in1, inU, in1, inV again
movdqu XMMWORD PTR [esi+16], xmm3 ;write to out1 again
add esi, 32
mov DWORD PTR [out1], esi
punpckhbw xmm5, xmm6 ;inU, in2, inV again
movdqu XMMWORD PTR [edi+16], xmm5 ;write to out2 again
add edi, 32
mov DWORD PTR [out2], edi
add ebx, 8
add edx, 8
add eax, 16
add ecx, 16
mov esi, DWORD PTR [width__]
sub esi, 1
mov DWORD PTR [width__], esi
jg loop0
mov DWORD PTR [in1], eax ;1939.33
mov DWORD PTR [in2], ecx ;1939.33
mov DWORD PTR [inU], ebx ;1939.33
mov DWORD PTR [inV], edx ;1939.33
;popa
emms
}
in1 += 2 * src_ystride - src_width;
in2 += 2 * src_ystride - src_width;
out1 += 2 * strideOut + 2 * (strideOut - width);
out2 += 2 * strideOut + 2 * (strideOut - width);
}
#endif
return 0;
}
// Converts planar I420 to packed UYVY (macro-pixel U0 Y0 V0 Y1, two
// image pixels per 4 bytes), duplicating each chroma sample into both
// rows of its 2x2 group.  Strides are in pixels.
// Returns 0 on success, -1 when any dimension or stride is zero.
// NOTE(review): the SCALEOPT branch below is Windows-only inline SSE2
// assembly and references identifiers (height, width, in1, in2, inU,
// inV, strideOut) that do not exist in this function's scope (the
// locals are y1, y2, u, v, out1, out2), so it will not compile if
// SCALEOPT is defined.
int
ConvertI420ToUYVY(const uint8* src_yplane, int src_ystride,
const uint8* src_uplane, int src_ustride,
const uint8* src_vplane, int src_vstride,
uint8* dst_frame, int dst_stride,
int src_width,
int src_height
)
{
if (src_width == 0 || src_height == 0 || src_ystride == 0 ||
src_ustride == 0 || src_vstride == 0 || dst_stride == 0)
{
return -1;
}
int i = 0;
const uint8* y1 = src_yplane;
const uint8* y2 = y1 + src_ystride;
const uint8* u = src_uplane;
const uint8* v = src_vplane;
uint8* out1 = dst_frame;
// Each output row is 2 * dst_stride bytes (2 bytes per pixel).
uint8* out2 = dst_frame + 2 * dst_stride;
// Macro-pixel = 2 image pixels
// U0Y0V0Y1....U2Y2V2Y3...U4Y4V4Y5.....
#ifndef SCALEOPT
for (; i < (src_height >> 1);i++)
{
// Two rows per pass; both reuse the same chroma row.
for (uint32 j = 0; j < (src_width >> 1) ;j++)
{
out1[0] = *u;
out1[1] = y1[0];
out1[2] = *v;
out1[3] = y1[1];
out2[0] = *u;
out2[1] = y2[0];
out2[2] = *v;
out2[3] = y2[1];
out1 += 4;
out2 += 4;
u++;
v++;
y1 += 2;
y2 += 2;
}
// Skip the partner row plus padding; output hops two packed rows.
y1 += 2 * src_ystride - src_width;
y2 += 2 * src_ystride - src_width;
u += src_ustride - (src_width >> 1);
v += src_vstride - (src_width >> 1);
out1 += 2 * (dst_stride + (dst_stride - src_width));
out2 += 2 * (dst_stride + (dst_stride - src_width));
}
#else
// SSE2 path: interleaves chroma-first with 16 luma bytes per
// iteration.  See NOTE(review) above - does not currently compile.
for (; i < (height >> 1);i++)
{
int32 width__ = (width >> 4);
_asm
{
;pusha
mov eax, DWORD PTR [in1] ;1939.33
mov ecx, DWORD PTR [in2] ;1939.33
mov ebx, DWORD PTR [inU] ;1939.33
mov edx, DWORD PTR [inV] ;1939.33
loop0:
movq xmm6, QWORD PTR [ebx] ;inU
movq xmm0, QWORD PTR [edx] ;inV
punpcklbw xmm6, xmm0 ;inU, inV mix
movdqa xmm1, xmm6
movdqa xmm2, xmm6
movdqa xmm4, xmm6
movdqu xmm3, XMMWORD PTR [eax] ;in1
punpcklbw xmm1, xmm3 ;inU, in1, inV
mov esi, DWORD PTR [out1]
movdqu XMMWORD PTR [esi], xmm1 ;write to out1
movdqu xmm5, XMMWORD PTR [ecx] ;in2
punpcklbw xmm2, xmm5 ;inU, in2, inV
mov edi, DWORD PTR [out2]
movdqu XMMWORD PTR [edi], xmm2 ;write to out2
punpckhbw xmm4, xmm3 ;inU, in1, inV again
movdqu XMMWORD PTR [esi+16], xmm4 ;write to out1 again
add esi, 32
mov DWORD PTR [out1], esi
punpckhbw xmm6, xmm5 ;inU, in2, inV again
movdqu XMMWORD PTR [edi+16], xmm6 ;write to out2 again
add edi, 32
mov DWORD PTR [out2], edi
add ebx, 8
add edx, 8
add eax, 16
add ecx, 16
mov esi, DWORD PTR [width__]
sub esi, 1
mov DWORD PTR [width__], esi
jg loop0
mov DWORD PTR [in1], eax ;1939.33
mov DWORD PTR [in2], ecx ;1939.33
mov DWORD PTR [inU], ebx ;1939.33
mov DWORD PTR [inV], edx ;1939.33
;popa
emms
}
in1 += width;
in2 += width;
out1 += 2 * (strideOut + (strideOut - width));
out2 += 2 * (strideOut + (strideOut - width));
}
#endif
return 0;
}
// Copies an I420 frame into a YV12 destination.  Both are planar
// 4:2:0; they differ only in chroma-plane order (I420 stores U then V,
// YV12 stores V then U), so this is three plane copies with the chroma
// destinations swapped.  dst_stride is the destination luma stride;
// the destination chroma planes are assumed to use dst_stride / 2.
// Returns 0 on success, -1 when any dimension or stride is zero.
// Fixed vs. the first draft: the V copy continued reading from wherever
// the U copy left off instead of from src_vplane (so src_vplane was
// ignored whenever the source planes are not contiguous), the SCALEOPT
// branch referenced an undefined "width", and the loop counters mixed
// uint32 with int.
int
ConvertI420ToYV12(const uint8* src_yplane, int src_ystride,
                  const uint8* src_uplane, int src_ustride,
                  const uint8* src_vplane, int src_vstride,
                  uint8* dst_frame, int dst_stride,
                  int src_width,
                  int src_height
                  )
{
  if (src_width == 0 || src_height == 0 || src_ystride == 0 ||
      src_ustride == 0 || src_vstride == 0 || dst_stride == 0)
  {
    return -1;
  }
  const uint8* inFrame = src_yplane;
  uint8* outFrame = dst_frame;
  // Copy Y
  for (int i = 0; i < src_height; i++)
  {
#ifndef SCALEOPT
    memcpy(outFrame, inFrame, src_width);
#else
    memcpy_16(outFrame, inFrame, src_width);
#endif
    inFrame += src_ystride;
    outFrame += dst_stride;
  }
  // Copy U into the *second* chroma plane (YV12 stores V first):
  // skip over the V plane, (dst_stride / 2) * (src_height / 2) bytes.
  inFrame = src_uplane;
  outFrame += ((dst_stride >> 1) * src_height) >> 1;
  for (int i = 0; i < (src_height >> 1); i++)
  {
#ifndef SCALEOPT
    memcpy(outFrame, inFrame, src_width >> 1);
#else
    memcpy_8(outFrame, inFrame, src_width >> 1);
#endif
    inFrame += src_ustride;
    outFrame += dst_stride >> 1;
  }
  // Rewind to the start of the first chroma plane and copy V there.
  outFrame -= (dst_stride * src_height) >> 1;
  inFrame = src_vplane;
  for (int i = 0; i < (src_height >> 1); i++)
  {
#ifndef SCALEOPT
    memcpy(outFrame, inFrame, src_width >> 1);
#else
    memcpy_8(outFrame, inFrame, src_width >> 1);
#endif
    inFrame += src_vstride;
    outFrame += dst_stride >> 1;
  }
  return 0;
}
// Converts NV12 (bi-planar: full-resolution Y plane followed by an
// interleaved U/V plane at quarter resolution) to 16-bit RGB565,
// writing the destination bottom-up.  Strides are in pixels
// (dst_stride counts uint16 units).
// Returns 0 on success, -1 on invalid arguments.
// Fixed vs. the first draft: src_height was validated twice while
// src_width and dst_stride were never checked; the output pointer was
// derived from src_yplane (clobbering the *source*) instead of
// dst_frame; the UV-plane row advance used src_width / 2 although a
// row pair consumes src_width bytes of the interleaved plane; and the
// row step-up (4 * dst_stride - src_width) was only correct when
// dst_stride == src_width.
int
ConvertNv12ToRGB565(const uint8* src_yplane, int src_ystride,
                    const uint8* src_uvplane, int src_uvstride,
                    uint8* dst_frame, int dst_stride,
                    int src_width,
                    int src_height
                    )
{
  if (src_width < 1 || src_height < 1 || src_ystride < 1 ||
      src_uvstride < 1 || dst_stride < 1)
  {
    return -1;
  }
  // Bi-Planar: Y plane followed by an interleaved U and V plane.
  const uint8* interlacedSrc = src_uvplane;
  // Bottom-up output: out is the last row, out2 the row above it.
  uint16* out = (uint16*)(dst_frame) + dst_stride * (src_height - 1);
  uint16* out2 = out - dst_stride;
  int32 tmpR, tmpG, tmpB;
  const uint8 *y1, *y2;
  y1 = src_yplane;
  y2 = y1 + src_ystride;
  int h, w;
  for (h = (src_height >> 1); h > 0; h--)
  {
    // 2 rows at a time, 2 y's at a time
    for (w = 0; w < (src_width >> 1); w++)
    {
      // Vertical and horizontal sub-sampling:
      // 1. Convert to RGB888 via the 8.8 fixed-point tables
      //    (interlacedSrc[0] = U, interlacedSrc[1] = V).
      // 2. Pack into a 16-bit 5-6-5 word.
      tmpR = (int32)((mapYc[y1[0]] + mapVcr[interlacedSrc[1]]
             + 128) >> 8);
      tmpG = (int32)((mapYc[y1[0]] + mapUcg[interlacedSrc[0]]
             + mapVcg[interlacedSrc[1]] + 128) >> 8);
      tmpB = (int32)((mapYc[y1[0]] + mapUcb[interlacedSrc[0]]
             + 128) >> 8);
      out[0] = (uint16)(((Clip(tmpR) & 0xf8) << 8) + ((Clip(tmpG)
               & 0xfc) << 3) + (Clip(tmpB) >> 3));
      tmpR = (int32)((mapYc[y2[0]] + mapVcr[interlacedSrc[1]]
             + 128) >> 8);
      tmpG = (int32)((mapYc[y2[0]] + mapUcg[interlacedSrc[0]]
             + mapVcg[interlacedSrc[1]] + 128) >> 8);
      tmpB = (int32)((mapYc[y2[0]] + mapUcb[interlacedSrc[0]]
             + 128) >> 8);
      out2[0] = (uint16)(((Clip(tmpR) & 0xf8) << 8) + ((Clip(tmpG)
                & 0xfc) << 3) + (Clip(tmpB) >> 3));
      tmpR = (int32)((mapYc[y1[1]] + mapVcr[interlacedSrc[1]]
             + 128) >> 8);
      tmpG = (int32)((mapYc[y1[1]] + mapUcg[interlacedSrc[0]]
             + mapVcg[interlacedSrc[1]] + 128) >> 8);
      tmpB = (int32)((mapYc[y1[1]] + mapUcb[interlacedSrc[0]]
             + 128) >> 8);
      out[1] = (uint16)(((Clip(tmpR) & 0xf8) << 8) + ((Clip(tmpG)
               & 0xfc) << 3) + (Clip(tmpB) >> 3));
      tmpR = (int32)((mapYc[y2[1]] + mapVcr[interlacedSrc[1]]
             + 128) >> 8);
      tmpG = (int32)((mapYc[y2[1]] + mapUcg[interlacedSrc[0]]
             + mapVcg[interlacedSrc[1]] + 128) >> 8);
      tmpB = (int32)((mapYc[y2[1]] + mapUcb[interlacedSrc[0]]
             + 128) >> 8);
      out2[1] = (uint16)(((Clip(tmpR) & 0xf8) << 8) + ((Clip(tmpG)
                & 0xfc) << 3) + (Clip(tmpB) >> 3));
      y1 += 2;
      y2 += 2;
      out += 2;
      out2 += 2;
      interlacedSrc += 2;
    }
    // Skip the partner luma row plus padding.
    y1 += 2 * src_ystride - src_width;
    y2 += 2 * src_ystride - src_width;
    // The interleaved UV plane yields 2 bytes per 2x2 block, so a row
    // pair consumes src_width bytes of it.
    interlacedSrc += src_uvstride - src_width;
    // Step up two rows: undo this pair's advance plus two row strides.
    out -= 2 * dst_stride + src_width;
    out2 -= 2 * dst_stride + src_width;
  } // end height for
  return 0;
}
int
ConvertI420ToABGR(const uint8* src_yplane, int src_ystride,
const uint8* src_uplane, int src_ustride,
const uint8* src_vplane, int src_vstride,
uint8* dst_frame, int dst_stride,
int src_width,
int src_height
)
{
if (src_width == 0 || src_height == 0 || src_ystride == 0 ||
src_ustride == 0 || src_vstride == 0 || dst_stride == 0)
{
return -1;
}
// RGB orientation - bottom up
uint8* out = dst_frame + 4 * dst_stride * (src_height - 1);
uint8* out2 = out - dst_stride * 4;
int32 tmpR, tmpG, tmpB;
const uint8 *y1,*y2, *u, *v;
int h, w;
y1 = src_yplane;
y2 = y1 + src_ystride;
u = src_uplane;
v = src_vplane;
for (h = (src_height >> 1); h > 0; h--)
{
// 2 rows at a time, 2 y's at a time
for (w = 0; w < (src_width >> 1); w++)
{
// Vertical and horizontal sub-sampling
tmpR = (int32)((298 * (y1[0] - 16) + 409 * (v[0] - 128)
+ 128) >> 8);
tmpG = (int32)((298 * (y1[0] - 16) - 100 * (u[0] - 128)
- 208 * (v[0] - 128) + 128 ) >> 8);
tmpB = (int32)((298 * (y1[0] - 16) + 516 * (u[0] - 128)
+ 128 ) >> 8);
out[3] = 0xff;
out[0] = Clip(tmpR);
out[1] = Clip(tmpG);
out[2] = Clip(tmpB);
tmpR = (int32)((298 * (y2[0] - 16) + 409 * (v[0] - 128)
+ 128) >> 8);
tmpG = (int32)((298 * (y2[0] - 16) - 100 * (u[0] - 128)
- 208 * (v[0] - 128) + 128) >> 8);
tmpB = (int32)((298 * (y2[0] - 16) + 516 * (u[0] - 128)
+ 128) >> 8);
out2[3] = 0xff;
out2[0] = Clip(tmpR);
out2[1] = Clip(tmpG);
out2[2] = Clip(tmpB);
tmpR = (int32)((298 * (y1[1] - 16) + 409 * (v[0] - 128)
+ 128 ) >> 8);
tmpG = (int32)((298 * (y1[1] - 16) - 100 * (u[0] - 128)
- 208 * (v[0] - 128) + 128 ) >> 8);
tmpB = (int32)((298 * (y1[1] - 16) + 516 * (u[0] - 128)
+ 128) >> 8);
out[7] = 0xff;
out[4] = Clip(tmpR);
out[5] = Clip(tmpG);
out[6] = Clip(tmpB);
tmpR = (int32)((298 * (y2[1] - 16) + 409 * (v[0] - 128)
+ 128) >> 8);
tmpG = (int32)((298 * (y2[1] - 16) - 100 * (u[0] - 128)
- 208 * (v[0] - 128) + 128) >> 8);
tmpB = (int32)((298 * (y2[1] - 16) + 516 * (u[0] - 128)
+ 128 ) >> 8);
out2[7] = 0xff;
out2[4] = Clip(tmpR);
out2[5] = Clip(tmpG);
out2[6] = Clip(tmpB);
out += 8;
out2 += 8;
y1 += 2;
y2 += 2;
u++;
v++;
}
y1 += src_ystride + src_ystride - src_width;
y2 += src_ystride + src_ystride - src_width;
u += src_ustride - (src_width >> 1);
v += src_vstride - (src_width >> 1);
out -= (2 * dst_stride + src_width) * 4;
out2 -= (2 * dst_stride + src_width) * 4;
} // end height for
return 0;
}
// Converts packed UYVY (macro-pixel U0 Y0 V0 Y1) to planar I420.
// Chroma is taken from the even rows only (vertical subsampling by
// dropping, not averaging).  Strides are in pixels.
// Returns 0 on success, -1 when any dimension or stride is zero.
// Fixed vs. the first draft: the chroma row adjustments read
// "dst_ustride - src_width << 1", which parses as
// (dst_ustride - src_width) << 1 and is wrong under either reading -
// the chroma planes advance src_width / 2 per row.  The luma and
// source strides are now also honoured between the two rows of each
// pair instead of once per pair.
int
ConvertUYVYToI420(const uint8* src_frame, int src_stride,
                  uint8* dst_yplane, int dst_ystride,
                  uint8* dst_uplane, int dst_ustride,
                  uint8* dst_vplane, int dst_vstride,
                  int src_width,
                  int src_height
                  )
{
  if (src_width == 0 || src_height == 0 || dst_ystride == 0 ||
      dst_ustride == 0 || dst_vstride == 0 || src_stride == 0)
  {
    return -1;
  }
  int i, j;
  uint8* outI = dst_yplane;
  uint8* outU = dst_uplane;
  uint8* outV = dst_vplane;
  // U0Y0V0Y1..U2Y2V2Y3.....
  for (i = 0; i < (src_height >> 1); i++)
  {
    // Even row: keep luma and chroma.
    for (j = 0; j < (src_width >> 1); j++)
    {
      outI[0] = src_frame[1];
      *outU = src_frame[0];
      outI[1] = src_frame[3];
      *outV = src_frame[2];
      src_frame += 4;
      outI += 2;
      outU++;
      outV++;
    }
    // Skip any packed-source row padding (2 bytes per pixel) and any
    // luma destination padding before the next row.
    src_frame += (src_stride - src_width) * 2;
    outI += dst_ystride - src_width;
    // Odd row: keep luma only (chroma is dropped).
    for (j = 0; j < (src_width >> 1); j++)
    {
      outI[0] = src_frame[1];
      outI[1] = src_frame[3];
      src_frame += 4;
      outI += 2;
    }
    src_frame += (src_stride - src_width) * 2;
    outI += dst_ystride - src_width;
    // Chroma planes advance src_width / 2 samples per output row.
    outU += dst_ustride - (src_width >> 1);
    outV += dst_vstride - (src_width >> 1);
  }
  return 0;
}
// Expands 24-bit RGB to 32-bit ARGB (alpha forced to 0xff) while
// flipping vertically: the first source row lands on the last
// destination row.  Strides are in pixels.
// Returns 0 on success, -1 on invalid arguments.
// Fixed vs. the first draft: the destination row step was
// "-4 * (dst_stride - src_width)", which is zero when stride == width
// (so every source row overwrote the bottom destination row) - the
// pointer must step up one full row; the source padding skip also
// ignored the 3 bytes per pixel, and src_stride was never validated.
int
ConvertRGB24ToARGB(const uint8* src_frame, int src_stride,
                   uint8* dst_frame, int dst_stride,
                   int src_width, int src_height
                   )
{
  if (src_width < 1 || src_height < 1 || src_stride < 1 || dst_stride < 1)
  {
    return -1;
  }
  int i, j, offset;
  uint8* outFrame = dst_frame;
  const uint8* inFrame = src_frame;
  outFrame += dst_stride * (src_height - 1) * 4;   // last output row
  for (i = 0; i < src_height; i++)
  {
    for (j = 0; j < src_width; j++)
    {
      offset = j * 4;
      outFrame[0 + offset] = inFrame[0];
      outFrame[1 + offset] = inFrame[1];
      outFrame[2 + offset] = inFrame[2];
      outFrame[3 + offset] = 0xff;                 // opaque alpha
      inFrame += 3;
    }
    outFrame -= dst_stride * 4;              // step up one full row
    inFrame += (src_stride - src_width) * 3; // skip source row padding
  }
  return 0;
}
// Converts bottom-up 24-bit RGB (memory order B,G,R) to planar I420
// using integer BT.601 coefficients scaled by 256.  Chroma is sampled
// from one pixel of each 2x2 block (no averaging).  Strides are in
// pixels.  Returns 0 on success, -1 when any dimension or stride is 0.
// Fixed vs. the first draft: the initial source offset subtracted
// 3 * src_height instead of one full row (3 * src_stride), and the
// chroma pointers advanced by two chroma strides per luma row pair
// even though chroma has only one row per two luma rows.
int
ConvertRGB24ToI420(const uint8* src_frame, int src_stride,
                   uint8* dst_yplane, int dst_ystride,
                   uint8* dst_uplane, int dst_ustride,
                   uint8* dst_vplane, int dst_vstride,
                   int src_width,
                   int src_height
                   )
{
  if (src_width == 0 || src_height == 0 || dst_ystride == 0 ||
      dst_ustride == 0 || dst_vstride == 0 || src_stride == 0)
  {
    return -1;
  }
  uint8* yStartPtr;
  uint8* yStartPtr2;
  uint8* uStartPtr;
  uint8* vStartPtr;
  const uint8* inpPtr;
  const uint8* inpPtr2;
  int h, w;
  // Assuming RGB in a bottom up orientation.
  yStartPtr = dst_yplane;
  yStartPtr2 = yStartPtr + dst_ystride;
  uStartPtr = dst_uplane;
  vStartPtr = dst_vplane;
  // Start at the beginning of the last source row; inpPtr2 is the
  // row above it.
  inpPtr = src_frame + src_stride * src_height * 3 - 3 * src_stride;
  inpPtr2 = inpPtr - 3 * src_stride;
  for (h = 0; h < (src_height >> 1); h++)
  {
    for (w = 0; w < (src_width >> 1); w++)
    {
      // Y = ((66 R + 129 G + 25 B + 128) >> 8) + 16
      yStartPtr[0] = (uint8)((66 * inpPtr[2] + 129 * inpPtr[1]
                     + 25 * inpPtr[0] + 128) >> 8) + 16;
      yStartPtr2[0] = (uint8)((66 * inpPtr2[2] + 129 * inpPtr2[1]
                      + 25 * inpPtr2[0] + 128) >> 8) + 16;
      // Moving to next column
      yStartPtr[1] = (uint8)((66 * inpPtr[5] + 129 * inpPtr[4]
                     + 25 * inpPtr[3] + 128) >> 8) + 16;
      yStartPtr2[1] = (uint8)((66 * inpPtr2[5] + 129 * inpPtr2[4]
                      + 25 * inpPtr2[3] + 128) >> 8) + 16;
      // U = ((-38 R - 74 G + 112 B + 128) >> 8) + 128
      uStartPtr[0] = (uint8)((-38 * inpPtr[2] - 74 * inpPtr[1] +
                     112 * inpPtr[0] + 128) >> 8) + 128;
      // V = ((112 R - 94 G - 18 B + 128) >> 8) + 128
      vStartPtr[0] = (uint8)((112 * inpPtr[2] - 94 * inpPtr[1] -
                     18 * inpPtr[0] + 128) >> 8) + 128;
      yStartPtr += 2;
      yStartPtr2 += 2;
      uStartPtr++;
      vStartPtr++;
      inpPtr += 6;
      inpPtr2 += 6;
    } // end for w
    // Each luma pointer covers every other row: advance two strides.
    yStartPtr += 2 * dst_ystride - src_width;
    yStartPtr2 += 2 * dst_ystride - src_width;
    // Chroma has one row per luma row pair: advance one stride.
    uStartPtr += dst_ustride - (src_width >> 1);
    vStartPtr += dst_vstride - (src_width >> 1);
    // Move two source rows up (bottom-up orientation).
    inpPtr -= 3 * (2 * src_stride + src_width);
    inpPtr2 -= 3 * (2 * src_stride + src_width);
  } // end for h
  return 0;
}
int
ConvertYV12ToARGB(const uint8* src_yplane, int src_ystride,
                  const uint8* src_uplane, int src_ustride,
                  const uint8* src_vplane, int src_vstride,
                  uint8* dst_frame, int dst_stride,
                  int src_width,
                  int src_height
                  )
{
  // Converts planar YV12 (Y plane + V plane + U plane) to packed 32-bit
  // BGRA-in-memory ARGB with opaque alpha. dst_stride is in pixels (each
  // output pixel is 4 bytes). Two rows are processed per iteration so each
  // chroma sample covers a 2x2 block of luma, using the precomputed
  // fixed-point tables mapYc/mapUcg/mapUcb/mapVcr/mapVcg.
  // Returns 0 on success, -1 on invalid arguments.
  if (src_width == 0 || src_height == 0 || src_ystride == 0 ||
      src_ustride == 0 || src_vstride == 0 || dst_stride == 0)
  {
    return -1;
  }
  // NOTE: removed the unused local 'diff' (dst_stride - src_width); it was
  // computed but never read.
  uint8* out = dst_frame;
  uint8* out2 = out + dst_stride * 4;
  const uint8 *y1,*y2, *u, *v;
  int h, w;
  y1 = src_yplane;
  y2 = y1 + src_ystride;
  v = src_vplane;
  u = src_uplane;
  int32 tmpG, tmpB, tmpR;
  for (h = (src_height >> 1); h > 0; h--)
  {
    // 2 rows at a time
    for (w = 0; w < (src_width >> 1); w++)
    {
      // Top-left pixel of the 2x2 block.
      tmpR = (int32)((mapYc[y1[0]] + mapVcr[v[0]] + 128 )>> 8);
      tmpG = (int32)((mapYc[y1[0]] + mapUcg[u[0]] + mapVcg[v[0]]
                      + 128) >> 8);
      tmpB = (int32)((mapYc[y1[0]] + mapUcb[u[0]] + 128 )>> 8);
      out[2] = Clip(tmpR);
      out[1] = Clip(tmpG);
      out[0] = Clip(tmpB);
      // Bottom-left pixel (second row, same chroma sample).
      tmpR = (int32)((mapYc[y2[0]] + mapVcr[v[0]] + 128) >> 8);
      tmpG = (int32)((mapYc[y2[0]] + mapUcg[u[0]] + mapVcg[v[0]]
                      + 128) >> 8);
      tmpB = (int32)((mapYc[y2[0]] + mapUcb[u[0]] + 128) >> 8);
      out2[2] = Clip(tmpR);
      out2[1] = Clip(tmpG);
      out2[0] = Clip(tmpB);
      // Top-right pixel.
      tmpR = (int32)((mapYc[y1[1]] + mapVcr[v[0]] + 128)>> 8);
      tmpG = (int32)((mapYc[y1[1]] + mapUcg[u[0]] + mapVcg[v[0]]
                      + 128) >> 8);
      tmpB = (int32)((mapYc[y1[1]] + mapUcb[u[0]] + 128) >> 8);
      out[6] = Clip(tmpR);
      out[5] = Clip(tmpG);
      out[4] = Clip(tmpB);
      // Bottom-right pixel.
      tmpR = (int32)((mapYc[y2[1]] + mapVcr[v[0]] + 128) >> 8);
      tmpG = (int32)((mapYc[y2[1]] + mapUcg[u[0]] + mapVcg[v[0]]
                      + 128) >> 8);
      tmpB = (int32)((mapYc[y2[1]] + mapUcb[u[0]] + 128) >> 8);
      out2[6] = Clip(tmpR);
      out2[5] = Clip(tmpG);
      out2[4] = Clip(tmpB);
      // Opaque alpha for all four pixels.
      out[3] = 0xff;
      out[7] = 0xff;
      out += 8;
      out2[3] = 0xff;
      out2[7] = 0xff;
      out2 += 8;
      y1 += 2;
      y2 += 2;
      u++;
      v++;
    }
    // Advance every pointer to the start of the next pair of rows.
    y1 += 2 * src_ystride - src_width;
    y2 += 2 * src_ystride - src_width;
    u += src_ustride - (src_width >> 1);
    v += src_vstride - (src_width >> 1);
    out += 4 * (2 * dst_stride - src_width);
    out2 += 4 * (2 * dst_stride - src_width);
  }
  return 0;
}
int
ConvertABGRToI420(const uint8* src_frame, int src_stride,
                  uint8* dst_yplane, int dst_ystride,
                  uint8* dst_uplane, int dst_ustride,
                  uint8* dst_vplane, int dst_vstride,
                  int src_width,
                  int src_height
                  )
{
  // Converts a packed 4-byte-per-pixel frame to planar I420, sampling
  // chroma once per 2x2 block. src_stride is in pixels (4 bytes each).
  // NOTE(review): the channel offsets used below treat byte 1 as R,
  // byte 2 as G and byte 3 as B (Y = 66R + 129G + 25B) — confirm this
  // matches the intended ABGR memory layout.
  // Returns 0 on success, -1 on invalid arguments.
  if (src_width == 0 || src_height == 0 || dst_ystride == 0 ||
      dst_ustride == 0 || dst_vstride == 0 || src_stride == 0)
  {
    return -1;
  }
  uint8* yStartPtr;
  uint8* yStartPtr2;
  uint8* uStartPtr;
  uint8* vStartPtr;
  const uint8* inpPtr;
  const uint8* inpPtr2;
  yStartPtr = dst_yplane;
  yStartPtr2 = yStartPtr + dst_ystride;
  uStartPtr = dst_uplane;
  vStartPtr = dst_vplane;
  inpPtr = src_frame;
  inpPtr2 = inpPtr + 4 * src_stride;
  int h, w;
  for (h = 0; h < (src_height >> 1); h++)
  {
    // BUG FIX: the inner loop previously iterated (src_height >> 1) times;
    // it must walk (src_width >> 1) column pairs per row.
    for (w = 0; w < (src_width >> 1); w++)
    {
      // Y
      yStartPtr[0] = (uint8)((66 * inpPtr[1] + 129 * inpPtr[2]
                              + 25 * inpPtr[3] + 128) >> 8) + 16;
      yStartPtr2[0] = (uint8)((66 * inpPtr2[1] + 129 * inpPtr2[2]
                               + 25 * inpPtr2[3] + 128) >> 8) + 16;
      // Moving to next column
      yStartPtr[1] = (uint8)((66 * inpPtr[5] + 129 * inpPtr[6]
                              + 25 * inpPtr[7] + 128) >> 8) + 16;
      yStartPtr2[1] = (uint8)((66 * inpPtr2[5] + 129 * inpPtr2[6]
                               + 25 * inpPtr2[7] + 128) >> 8) + 16;
      // U (from the top-left pixel of the 2x2 block only)
      uStartPtr[0] = (uint8)((-38 * inpPtr[1] - 74 * inpPtr[2]
                              + 112 * inpPtr[3] + 128) >> 8) + 128;
      // V
      vStartPtr[0] = (uint8)((112 * inpPtr[1] - 94 * inpPtr[2]
                              - 18 * inpPtr[3] + 128) >> 8) + 128;
      yStartPtr += 2;
      yStartPtr2 += 2;
      uStartPtr++;
      vStartPtr++;
      inpPtr += 8;
      inpPtr2 += 8;
    }
    // Step to the next pair of rows on every plane.
    yStartPtr += 2 * dst_ystride - src_width;
    yStartPtr2 += 2 * dst_ystride - src_width;
    uStartPtr += dst_ustride + dst_ustride - (src_width >> 1);
    vStartPtr += dst_vstride + dst_vstride - (src_width >> 1);
    inpPtr += 4 * (2 * src_stride - src_width);
    inpPtr2 += 4 * (2 * src_stride - src_width);
  }
  return 0;
}
inline
uint8 Clip(int32 val)
{
  // Saturates val to the representable range of a uint8, i.e. [0, 255].
  if (val > 255)
  {
    return (uint8)255;
  }
  if (val < 0)
  {
    return (uint8)0;
  }
  return (uint8)val;
}
#ifdef SCALEOPT
// memcpy_16 assumes that width is an integer multiple of 16!
// Copies n bytes (n % 16 == 0, n > 0) using unaligned SSE2 16-byte moves.
// MSVC x86 inline assembly only.
void
*memcpy_16(void * dest, const void * src, size_t n)
{
  _asm
  {
    mov eax, dword ptr [src]
    mov ebx, dword ptr [dest]
    mov ecx, dword ptr [n]
  loop0:
    movdqu xmm0, XMMWORD PTR [eax]
    movdqu XMMWORD PTR [ebx], xmm0
    add eax, 16
    add ebx, 16
    sub ecx, 16
    jg loop0
  }
  // BUG FIX: the function is declared to return void* (like memcpy) but
  // previously fell off the end without returning a value.
  return dest;
}
// memcpy_8 assumes that width is an integer multiple of 8!
// Copies n bytes (n % 8 == 0, n > 0) using MMX 8-byte moves; emms restores
// the FPU state afterwards. MSVC x86 inline assembly only.
void
*memcpy_8(void * dest, const void * src, size_t n)
{
  _asm
  {
    mov eax, dword ptr [src]
    mov ebx, dword ptr [dest]
    mov ecx, dword ptr [n]
  loop0:
    movq mm0, QWORD PTR [eax]
    movq QWORD PTR [ebx], mm0
    add eax, 8
    add ebx, 8
    sub ecx, 8
    jg loop0
    emms
  }
  // BUG FIX: the function is declared to return void* (like memcpy) but
  // previously fell off the end without returning a value.
  return dest;
}
#endif
} // namespace libyuv
/*
* Copyright (c) 2011 The LibYuv project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "cpu_id.h"
#ifdef _MSC_VER
#include <intrin.h>
#elif LINUX
#include "linux.h"
#endif
// TODO(fbarchard): Use cpuid.h when gcc 4.4 is used on OSX and Linux.
#if (defined(__pic__) || defined(__APPLE__)) && defined(__i386__)
// PIC builds on 32-bit x86 reserve EBX for the GOT pointer, so EBX must be
// preserved around CPUID: save it in EDI, run CPUID, then swap back so the
// "=D" output carries CPUID's EBX result.
static inline void __cpuid(int cpu_info[4], int info_type) {
  __asm__ volatile (
    "mov %%ebx, %%edi\n"
    "cpuid\n"
    "xchg %%edi, %%ebx\n"
    : "=a"(cpu_info[0]), "=D"(cpu_info[1]), "=c"(cpu_info[2]), "=d"(cpu_info[3])
    : "a"(info_type)
  );
}
#elif defined(__i386__) || defined(__x86_64__)
// Non-PIC x86 / x86-64: CPUID may clobber EBX directly, so no save/restore
// dance is needed.
static inline void __cpuid(int cpu_info[4], int info_type) {
  __asm__ volatile (
    "cpuid\n"
    : "=a"(cpu_info[0]), "=b"(cpu_info[1]), "=c"(cpu_info[2]), "=d"(cpu_info[3])
    : "a"(info_type)
  );
}
#endif
namespace libyuv {
// CPU detect function for SIMD instruction sets.
// Cached detection state; filled in lazily by InitCpuFlags().
bool CpuInfo::cpu_info_initialized_ = false;
int CpuInfo::cpu_info_ = 0;
// NOTE(review): despite the comment below, there is no actual lock here;
// initialization is not synchronized.
// Global lock for cpu initialization.
#ifdef CPU_X86
// Thin public wrapper (declared in cpu_id.h) around the platform-specific
// __cpuid helper above.
void cpuid(int cpu_info[4], int info_type) {
  __cpuid(cpu_info, info_type);
}
#endif
// Detects the available SIMD instruction sets once and caches the result
// in cpu_info_.
void CpuInfo::InitCpuFlags() {
#ifdef CPU_X86
  int cpu_info[4];
  // CPUID leaf 1: SSSE3 is ECX bit 9, SSE2 is EDX bit 26.
  __cpuid(cpu_info, 1);
  cpu_info_ = (cpu_info[2] & 0x00000200 ? kCpuHasSSSE3 : 0) |
              (cpu_info[3] & 0x04000000 ? kCpuHasSSE2 : 0);
#elif defined(__ARM_NEON__)
  // gcc -mfpu=neon defines __ARM_NEON__
  // if code is specifically built for Neon-only, enable the flag.
  cpu_info_ |= kCpuHasNEON;
#elif LINUX && defined(__arm__)
  cpu_info_ = 0;
  // Look for NEON support in /proc/cpuinfo
  ProcCpuInfo proc_info;
  size_t section_count;
  if (proc_info.LoadFromSystem() &&
      proc_info.GetSectionCount(&section_count)) {
    for (size_t i = 0; i < section_count; ++i) {
      std::string out_features;
      if (proc_info.GetSectionStringValue(i, "Features", &out_features)) {
        if (out_features.find("neon") != std::string::npos) {
          cpu_info_ |= kCpuHasNEON;
        }
        // Only the first section containing a "Features" line is examined.
        break;
      }
    }
  }
#else
  // Unknown platform: assume no SIMD support.
  cpu_info_ = 0;
#endif
  cpu_info_initialized_ = true;
}
// For testing: recompute the real CPU flags, then clear every flag not set
// in enable_flags. Order matters — InitCpuFlags() overwrites cpu_info_, so
// the mask must be applied after it.
void CpuInfo::MaskCpuFlagsForTest(int enable_flags) {
  InitCpuFlags();
  cpu_info_ &= enable_flags;
}
// Returns whether the given kCpuHas* flag is set, lazily running CPU
// detection on the first query.
bool CpuInfo::TestCpuFlag(int flag) {
  if (!cpu_info_initialized_) {
    InitCpuFlags();
  }
  return (cpu_info_ & flag) != 0;
}
// Returns the vendor string from the cpu, e.g. "GenuineIntel", "AuthenticAMD".
// See "Intel Processor Identification and the CPUID Instruction"
// (Intel document number: 241618)
std::string CpuInfo::GetCpuVendor() {
#ifdef CPU_X86
  int cpu_info[4];
  // CPUID leaf 0 places the 12-byte vendor id in EBX, EDX, ECX order;
  // rearrange the registers in the array so they read as a C string.
  cpuid(cpu_info, 0);
  cpu_info[0] = cpu_info[1];  // Reorder output
  cpu_info[1] = cpu_info[3];
  cpu_info[2] = cpu_info[2];  // ECX is already in place; kept for symmetry.
  cpu_info[3] = 0;  // NUL terminator for the string constructor below.
  return std::string(reinterpret_cast<char *>(&cpu_info[0]));
#else
  return std::string("Undefined");
#endif
}
} // namespace libyuv
/*
* Copyright (c) 2011 The LibYuv project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#ifndef LIBYUV_SOURCE_CPU_ID_H_
#define LIBYUV_SOURCE_CPU_ID_H_
#include <string>
#include "basic_types.h"
namespace libyuv {
#ifdef CPU_X86
void cpuid(int cpu_info[4], int info_type);
#endif
// Static-only holder for cached CPU capability flags.
class CpuInfo {
 public:
  // These flags are only valid on x86 processors
  static const int kCpuHasSSE2 = 1;
  static const int kCpuHasSSSE3 = 2;
  // SIMD support on ARM processors
  static const int kCpuHasNEON = 4;
  // Detect CPU has SSE2 etc. Lazily initializes the cached flags on the
  // first call.
  static bool TestCpuFlag(int flag);
  // Detect CPU vendor: "GenuineIntel" or "AuthenticAMD"
  static std::string GetCpuVendor();
  // For testing, allow CPU flags to be disabled.
  static void MaskCpuFlagsForTest(int enable_flags);
 private:
  // Global lock for the cpu initialization
  // NOTE(review): no actual lock exists; initialization is unsynchronized.
  static bool cpu_info_initialized_;
  static int cpu_info_;
  static void InitCpuFlags();
  DISALLOW_IMPLICIT_CONSTRUCTORS(CpuInfo);
};
} // namespace libyuv
#endif // LIBYUV_SOURCE_CPU_ID_H_
/*
* Copyright (c) 2011 The LibYuv project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "format_conversion.h"
#include "common.h"
#include "video_common.h"
namespace libyuv {
// Colour of a pixel within a 2x2 Bayer grid. RED and BLUE must remain the
// two smallest values: IsRedBlue() below tests colour <= BLUE.
enum {
  RED = 0,
  BLUE = 1,
  GREEN_BETWEEN_RED = 2,   // Green pixel flanked by red in its row.
  GREEN_BETWEEN_BLUE = 3,  // Green pixel flanked by blue in its row.
};
// Location of a pixel within the image. LEFT/RIGHT/TOP/BOTTOM/CENTER are
// intermediate classification values; GetPosition() always returns the sum
// of one x-class and one y-class (the named *_EDGE/corner/MIDDLE values).
enum Position {
  LEFT = 0,
  RIGHT = 1,
  TOP = 2,
  BOTTOM = 4,
  CENTER = 6,
  // Due to the choice of the above values, these are all distinct and the
  // corner values and edge values are each contiguous. This allows us to
  // figure out the position type of a pixel with a single addition operation
  // using the above values, rather than having to use a 3x3 nested switch
  // statement.
  TOP_LEFT = TOP + LEFT,  // 2
  TOP_RIGHT = TOP + RIGHT,  // 3
  BOTTOM_LEFT = BOTTOM + LEFT,  // 4
  BOTTOM_RIGHT = BOTTOM + RIGHT,  // 5
  LEFT_EDGE = CENTER + LEFT,  // 6
  RIGHT_EDGE = CENTER + RIGHT,  // 7
  TOP_EDGE = TOP + CENTER,  // 8
  BOTTOM_EDGE = BOTTOM + CENTER,  // 10
  MIDDLE = CENTER + CENTER,  // 12
};
// Classifies pixel (x, y) as a corner, edge, or middle pixel by summing its
// horizontal and vertical position classes (see the Position enum).
static FORCE_INLINE Position GetPosition(int x, int y, int width, int height) {
  const Position xpos = (x == 0) ? LEFT
                                 : (x == width - 1) ? RIGHT : CENTER;
  const Position ypos = (y == 0) ? TOP
                                 : (y == height - 1) ? BOTTOM : CENTER;
  return static_cast<Position>(xpos + ypos);
}
// True when the pixel carries a red or blue sample (greens are the two
// GREEN_BETWEEN_* values, which are larger than BLUE).
static FORCE_INLINE bool IsRedBlue(uint8 colour) {
  return colour == RED || colour == BLUE;
}
static FORCE_INLINE uint32 FourCcToBayerPixelColourMap(uint32 fourcc) {
  // The colour map is a 4-byte array-as-uint32 containing the colours for the
  // four pixels in each 2x2 grid, in left-to-right and top-to-bottom order.
  switch (fourcc) {
    default:
      // Unknown FOURCC: assert in debug builds, then deliberately fall
      // through and treat it as RGGB in release builds.
      ASSERT(false);
    case FOURCC_RGGB:
      return FOURCC(RED, GREEN_BETWEEN_RED, GREEN_BETWEEN_BLUE, BLUE);
    case FOURCC_BGGR:
      return FOURCC(BLUE, GREEN_BETWEEN_BLUE, GREEN_BETWEEN_RED, RED);
    case FOURCC_GRBG:
      return FOURCC(GREEN_BETWEEN_RED, RED, BLUE, GREEN_BETWEEN_BLUE);
    case FOURCC_GBRG:
      return FOURCC(GREEN_BETWEEN_BLUE, BLUE, RED, GREEN_BETWEEN_RED);
  }
}
// Converts one RGB pixel to YUV using 8-bit fixed-point coefficients
// (the same 66/129/25 etc. integer approximation used by the planar
// converters earlier in this file).
static FORCE_INLINE void RGBToYUV(uint8 r, uint8 g, uint8 b,
                                  uint8* y, uint8* u, uint8* v) {
  // Taken from http://en.wikipedia.org/wiki/YUV
  *y = (( 66 * r + 129 * g +  25 * b + 128) >> 8) +  16;
  *u = ((-38 * r -  74 * g + 112 * b + 128) >> 8) + 128;
  *v = ((112 * r -  94 * g -  18 * b + 128) >> 8) + 128;
}
// Demosaics one corner pixel of a Bayer image: the two missing colour
// channels are reconstructed from the only available neighbours (one
// adjacent pixel along each axis plus one diagonal pixel).
static FORCE_INLINE void InterpolateBayerRGBCorner(uint8* r,
                                                   uint8* g,
                                                   uint8* b,
                                                   const uint8* src,
                                                   int src_pitch,
                                                   Position pos,
                                                   uint8 colour) {
  // Compute the offsets to use for fetching the adjacent pixels.
  // Offsets always point into the image (down/right for top-left, etc).
  int adjacent_row;
  int adjacent_column;
  switch (pos) {
    case TOP_LEFT:
      adjacent_row = src_pitch;
      adjacent_column = 1;
      break;
    case TOP_RIGHT:
      adjacent_row = src_pitch;
      adjacent_column = -1;
      break;
    case BOTTOM_LEFT:
      adjacent_row = -src_pitch;
      adjacent_column = 1;
      break;
    case BOTTOM_RIGHT:
    default:
      adjacent_row = -src_pitch;
      adjacent_column = -1;
      break;
  }
  // Now interpolate.
  if (IsRedBlue(colour)) {
    uint8 current_pixel = src[0];
    // Average of the adjacent green pixels (there's only two).
    *g = (src[adjacent_column] + src[adjacent_row]) / 2;
    // Average of the oppositely-coloured corner pixels (there's only one).
    uint8 corner_average = src[adjacent_row + adjacent_column];
    if (colour == RED) {
      *r = current_pixel;
      *b = corner_average;
    } else {  // i.e., BLUE
      *b = current_pixel;
      *r = corner_average;
    }
  } else {  // i.e., GREEN_BETWEEN_*
    *g = src[0];
    // Average of the adjacent same-row pixels (there's only one).
    uint8 row_average = src[adjacent_column];
    // Average of the adjacent same-column pixels (there's only one).
    uint8 column_average = src[adjacent_row];
    if (colour == GREEN_BETWEEN_RED) {
      *r = row_average;
      *b = column_average;
    } else {  // i.e., GREEN_BETWEEN_BLUE
      *b = row_average;
      *r = column_average;
    }
  }
}
// Demosaics one non-corner edge pixel of a Bayer image: the missing
// channels are reconstructed from the five neighbours that exist on an
// edge (two along the edge, one inward, two inward diagonals).
static FORCE_INLINE void InterpolateBayerRGBEdge(uint8* r,
                                                 uint8* g,
                                                 uint8* b,
                                                 const uint8* src,
                                                 int src_pitch,
                                                 Position pos,
                                                 uint8 colour) {
  // Compute the offsets to use for fetching the adjacent pixels.
  // Goes one pixel "in" to the image (i.e. towards the center)
  int inner;
  // Goes one pixel to the side (i.e. along the edge) in either the clockwise or
  // counter-clockwise direction, and its negative value goes in the other
  // direction.
  int side;
  switch (pos) {
    case TOP_EDGE:
      inner = src_pitch;
      side = 1;
      break;
    case RIGHT_EDGE:
      inner = -1;
      side = src_pitch;
      break;
    case BOTTOM_EDGE:
      inner = -src_pitch;
      side = 1;
      break;
    case LEFT_EDGE:
    default:
      inner = 1;
      side = src_pitch;
      break;
  }
  // Now interpolate.
  if (IsRedBlue(colour)) {
    uint8 current_pixel = src[0];
    // Average of the adjacent green pixels (there's only three).
    *g = (src[inner] + src[side] + src[-side]) / 3;
    // Average of the oppositely-coloured corner pixels (there's only two).
    uint8 corner_average = (src[inner + side] + src[inner - side]) / 2;
    if (colour == RED) {
      *r = current_pixel;
      *b = corner_average;
    } else {  // i.e., BLUE
      *b = current_pixel;
      *r = corner_average;
    }
  } else {  // i.e., GREEN_BETWEEN_*
    *g = src[0];
    // Average of the adjacent side-ways pixels (there's only two).
    uint8 side_average = (src[side] + src[-side]) / 2;
    // Average of the adjacent inner-ways pixels (there's only one).
    uint8 inner_pixel = src[inner];
    // Including && side == 1 effectively transposes the colour logic for
    // processing the left/right sides, which is needed since the "T" shape
    // formed by the pixels is transposed.
    if (colour == GREEN_BETWEEN_RED && side == 1) {
      *r = side_average;
      *b = inner_pixel;
    } else {  // i.e., GREEN_BETWEEN_BLUE || side != 1
      *b = side_average;
      *r = inner_pixel;
    }
  }
}
// We inline this one because it runs 99% of the time, so inlining it is
// probably beneficial.
// Demosaics one interior pixel, where all eight neighbours exist.
static FORCE_INLINE void InterpolateBayerRGBCenter(uint8* r,
                                                   uint8* g,
                                                   uint8* b,
                                                   const uint8* src,
                                                   int src_pitch,
                                                   uint8 colour) {
  if (IsRedBlue(colour)) {
    uint8 current_pixel = src[0];
    // Average of the adjacent green pixels (there's four).
    // NOTE(tschmelcher): The material at
    // http://www.siliconimaging.com/RGB%20Bayer.htm discusses a way to improve
    // quality here by using only two of the green pixels based on the
    // correlation to the nearby red/blue pixels, but that is slower and would
    // result in more edge cases.
    *g = (src[1] + src[-1] + src[src_pitch] + src[-src_pitch]) / 4;
    // Average of the oppositely-coloured corner pixels (there's four).
    uint8 corner_average = (src[src_pitch + 1] +
                            src[src_pitch - 1] +
                            src[-src_pitch + 1] +
                            src[-src_pitch - 1]) / 4;
    if (colour == RED) {
      *r = current_pixel;
      *b = corner_average;
    } else {  // i.e., BLUE
      *b = current_pixel;
      *r = corner_average;
    }
  } else {  // i.e., GREEN_BETWEEN_*
    *g = src[0];
    // Average of the adjacent same-row pixels (there's two).
    uint8 row_adjacent = (src[1] + src[-1]) / 2;
    // Average of the adjacent same-column pixels (there's two).
    uint8 column_adjacent = (src[src_pitch] + src[-src_pitch]) / 2;
    if (colour == GREEN_BETWEEN_RED) {
      *r = row_adjacent;
      *b = column_adjacent;
    } else {  // i.e., GREEN_BETWEEN_BLUE
      *b = row_adjacent;
      *r = column_adjacent;
    }
  }
}
// Converts any Bayer RGB format to I420.
// Walks the image in 2x2 grids (matching the Bayer pattern period): each of
// the four pixels is demosaiced to full RGB and converted to YUV; the four
// Y values are stored per pixel while U and V are averaged over the grid.
// Width and height must be even.
void BayerRGBToI420(const uint8* src, int src_pitch, uint32 src_fourcc,
                    uint8* y, int y_pitch,
                    uint8* u, int u_pitch,
                    uint8* v, int v_pitch,
                    int width, int height) {
  ASSERT(width % 2 == 0);
  ASSERT(height % 2 == 0);
  uint32 colour_map = FourCcToBayerPixelColourMap(src_fourcc);
  // Per-2-row pointer advances (the inner loops advance by 2 columns).
  int src_row_inc = src_pitch * 2 - width;
  int y_row_inc = y_pitch * 2 - width;
  int u_row_inc = u_pitch - width / 2;
  int v_row_inc = v_pitch - width / 2;
  // Iterate over the 2x2 grids.
  for (int y1 = 0; y1 < height; y1 += 2) {
    for (int x1 = 0; x1 < width; x1 += 2) {
      uint32 colours = colour_map;
      int total_u = 0;
      int total_v = 0;
      // Iterate over the four pixels within them.
      for (int y2 = 0; y2 < 2; ++y2) {
        for (int x2 = 0; x2 < 2; ++x2) {
          uint8 r, g, b;
          // The low-order byte of the colour map is the current colour.
          uint8 current_colour = static_cast<uint8>(colours);
          colours >>= 8;
          Position pos = GetPosition(x1 + x2, y1 + y2, width, height);
          const uint8* src_pixel = &src[y2 * src_pitch + x2];
          uint8* y_pixel = &y[y2 * y_pitch + x2];
          // Convert from Bayer RGB to regular RGB.
          if (pos == MIDDLE) {
            // 99% of the image is the middle.
            InterpolateBayerRGBCenter(&r, &g, &b,
                                      src_pixel, src_pitch,
                                      current_colour);
          } else if (pos >= LEFT_EDGE) {
            // Next most frequent is edges.
            InterpolateBayerRGBEdge(&r, &g, &b,
                                    src_pixel, src_pitch, pos,
                                    current_colour);
          } else {
            // Last is the corners. There are only 4.
            InterpolateBayerRGBCorner(&r, &g, &b,
                                      src_pixel, src_pitch, pos,
                                      current_colour);
          }
          // Convert from RGB to YUV.
          uint8 tmp_u, tmp_v;
          RGBToYUV(r, g, b, y_pixel, &tmp_u, &tmp_v);
          total_u += tmp_u;
          total_v += tmp_v;
        }
      }
      src += 2;
      y += 2;
      // One chroma sample per 2x2 grid: the average of the four pixels.
      *u = total_u / 4;
      *v = total_v / 4;
      ++u;
      ++v;
    }
    src += src_row_inc;
    y += y_row_inc;
    u += u_row_inc;
    v += v_row_inc;
  }
}
// Note: to do this with Neon vld4.8 would load ARGB values into 4 registers
// and vst would select which 2 components to write. The low level would need
// to be ARGBToBG, ARGBToGB, ARGBToRG, ARGBToGR
#if defined(WIN32) && !defined(COVERAGE_ENABLED)
#define HAS_ARGBTOBAYERROW_SSSE3
// SSSE3 row converter: pshufb selects one byte from each of 4 ARGB pixels
// per iteration (selector broadcast into xmm0). Requires pix % 4 == 0 and
// 16-byte-aligned src_argb (movdqa); callers check this before selecting it.
__declspec(naked)
static void ARGBToBayerRow_SSSE3(const uint8* src_argb,
                                 uint8* dst_bayer, uint32 selector, int pix) {
  __asm {
    mov        eax, [esp + 4]    // src_argb
    mov        edx, [esp + 8]    // dst_bayer
    movd       xmm0, [esp + 12]  // selector
    mov        ecx, [esp + 16]   // pix
    pshufd     xmm0, xmm0, 0
  wloop:
    movdqa     xmm1, [eax]
    lea        eax, [eax + 16]
    pshufb     xmm1, xmm0
    movd       [edx], xmm1
    lea        edx, [edx + 4]
    sub        ecx, 4
    ja         wloop
    ret
  }
}
#elif defined(__i386__) && !defined(COVERAGE_ENABLED) && \
    !TARGET_IPHONE_SIMULATOR
#define HAS_ARGBTOBAYERROW_SSSE3
extern "C" void ARGBToBayerRow_SSSE3(const uint8* src_argb, uint8* dst_bayer,
                                     uint32 selector, int pix);
// GCC/AT&T translation of the MSVC routine above: broadcast the selector
// into xmm0, then pshufb 4 ARGB pixels down to 4 Bayer bytes per iteration.
// OSX prefixes C symbols with an underscore, hence the two labels.
asm(
    ".text\n"
#if defined(OSX)
    ".globl _ARGBToBayerRow_SSSE3\n"
"_ARGBToBayerRow_SSSE3:\n"
#else
    ".global ARGBToBayerRow_SSSE3\n"
"ARGBToBayerRow_SSSE3:\n"
#endif
    "mov    0x4(%esp),%eax\n"
    "mov    0x8(%esp),%edx\n"
    "movd   0xc(%esp),%xmm0\n"
    "mov    0x10(%esp),%ecx\n"
    "pshufd $0x0,%xmm0,%xmm0\n"
"1:"
    "movdqa (%eax),%xmm1\n"
    "lea    0x10(%eax),%eax\n"
    "pshufb %xmm0,%xmm1\n"
    "movd   %xmm1,(%edx)\n"
    "lea    0x4(%edx),%edx\n"
    "sub    $0x4,%ecx\n"
    "ja     1b\n"
    "ret\n"
);
#endif
// Portable fallback: extracts one Bayer byte from each ARGB pixel. The two
// low-order bytes of the selector are the byte offsets of the channels to
// pick from the even and odd pixel of each pair.
static void ARGBToBayerRow_C(const uint8* src_argb,
                             uint8* dst_bayer, uint32 selector, int pix) {
  const int even_index = selector & 0xff;
  const int odd_index = (selector >> 8) & 0xff;
  // Copy a row of Bayer, two pixels (8 ARGB bytes) per step.
  for (int x = 0; x < pix; x += 2, src_argb += 8, dst_bayer += 2) {
    dst_bayer[0] = src_argb[even_index];
    dst_bayer[1] = src_argb[odd_index];
  }
}
// generate a selector mask useful for pshufb
// Packs four byte offsets into a little-endian word: output byte k selects
// channel select0/select1 (alternating) from input pixel k (pixels are 4
// bytes apart, hence the +4/+8/+12).
static uint32 GenerateSelector(int select0, int select1) {
  const int byte0 = select0;
  const int byte1 = select1 + 4;
  const int byte2 = select0 + 8;
  const int byte3 = select1 + 12;
  return static_cast<uint32>(byte0) |
         static_cast<uint32>(byte1 << 8) |
         static_cast<uint32>(byte2 << 16) |
         static_cast<uint32>(byte3 << 24);
}
// Converts any 32 bit ARGB to any Bayer RGB format.
// Rows alternate between the two selector patterns of the 2x2 Bayer grid
// (index_map[y & 1]). Picks the SSSE3 row routine when the CPU supports it
// and the buffers/strides meet its alignment requirements.
void RGB32ToBayerRGB(const uint8* src_rgb, int src_pitch_rgb,
                     uint32 src_fourcc_rgb,
                     uint8* dst_bayer, int dst_pitch_bayer,
                     uint32 dst_fourcc_bayer,
                     int width, int height) {
  ASSERT(width % 2 == 0);
  void (*ARGBToBayerRow)(const uint8* src_argb,
                         uint8* dst_bayer, uint32 selector, int pix);
#if defined(HAS_ARGBTOBAYERROW_SSSE3)
  if (CpuInfo::TestCpuFlag(CpuInfo::kCpuHasSSSE3) &&
      (width % 4 == 0) &&
      IS_ALIGNED(src_rgb, 16) && (src_pitch_rgb % 16 == 0) &&
      IS_ALIGNED(dst_bayer, 4) && (dst_pitch_bayer % 4 == 0)) {
    ARGBToBayerRow = ARGBToBayerRow_SSSE3;
  } else
#endif
  {
    ARGBToBayerRow = ARGBToBayerRow_C;
  }
  // Only ARGB input is implemented.
  ASSERT(src_fourcc_rgb == FOURCC_ARGB);
  int blue_index = 0;
  int green_index = 1;
  int red_index = 2;
  // Now build a lookup table containing the indices for the four pixels in each
  // 2x2 Bayer grid.
  uint32 index_map[2];
  switch (dst_fourcc_bayer) {
    default:
      ASSERT(false);
    case FOURCC_RGGB:
      index_map[0] = GenerateSelector(red_index, green_index);
      index_map[1] = GenerateSelector(green_index, blue_index);
      break;
    case FOURCC_BGGR:
      index_map[0] = GenerateSelector(blue_index, green_index);
      index_map[1] = GenerateSelector(green_index, red_index);
      break;
    case FOURCC_GRBG:
      index_map[0] = GenerateSelector(green_index, red_index);
      index_map[1] = GenerateSelector(blue_index, green_index);
      break;
    case FOURCC_GBRG:
      index_map[0] = GenerateSelector(green_index, blue_index);
      index_map[1] = GenerateSelector(red_index, green_index);
      break;
  }
  // Now convert.
  for (int y = 0; y < height; ++y) {
    ARGBToBayerRow(src_rgb, dst_bayer, index_map[y & 1], width);
    src_rgb += src_pitch_rgb;
    dst_bayer += dst_pitch_bayer;
  }
}
} // namespace libyuv
/*
* Copyright (c) 2011 The LibYuv project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "general.h"
#include <assert.h>
#include <string.h> // memcpy(), memset()
#include "video_common.h"
namespace libyuv {
int
MirrorI420LeftRight( const uint8* src_frame,uint8* dst_frame,
                     int src_width, int src_height)
{
  // Mirrors an I420 frame horizontally (left <-> right) into dst_frame.
  // Each iteration swaps a symmetric pair of samples, so the routine also
  // works in place when src_frame == dst_frame. Width and height must be
  // even. Returns 0 on success, -1 on invalid dimensions.
  if (src_width < 1 || src_height < 1)
  {
    return -1;
  }
  assert(src_width % 2 == 0 && src_height % 2 == 0);
  int indO = 0;
  int indS = 0;
  int wind, hind;
  uint8 tmpVal;
  // Will swap two values per iteration
  const int halfW = src_width >> 1;
  // NOTE: removed the unused local 'halfStride'; it duplicated halfW and
  // was never referenced.
  // Y
  for (wind = 0; wind < halfW; wind++ )
  {
    for (hind = 0; hind < src_height; hind++ )
    {
      indO = hind * src_width + wind;
      indS = hind * src_width + (src_width - wind - 1); // swapping index
      tmpVal = src_frame[indO];
      dst_frame[indO] = src_frame[indS];
      dst_frame[indS] = tmpVal;
    } // end for (height)
  } // end for(width)
  // Chroma planes are half width/height; swap lengthW pairs per row.
  const int lengthW = src_width >> 2;
  const int lengthH = src_height >> 1;
  // V
  int zeroInd = src_width * src_height;
  for (wind = 0; wind < lengthW; wind++ )
  {
    for (hind = 0; hind < lengthH; hind++ )
    {
      indO = zeroInd + hind * halfW + wind;
      indS = zeroInd + hind * halfW + (halfW - wind - 1); // swapping index
      tmpVal = src_frame[indO];
      dst_frame[indO] = src_frame[indS];
      dst_frame[indS] = tmpVal;
    } // end for (height)
  } // end for(width)
  // U
  zeroInd += src_width * src_height >> 2;
  for (wind = 0; wind < lengthW; wind++ )
  {
    for (hind = 0; hind < lengthH; hind++ )
    {
      indO = zeroInd + hind * halfW + wind;
      indS = zeroInd + hind * halfW + (halfW - wind - 1); // swapping index
      tmpVal = src_frame[indO];
      dst_frame[indO] = src_frame[indS];
      dst_frame[indS] = tmpVal;
    } // end for (height)
  } // end for(width)
  return 0;
}
// Make a center cut
int
CutI420Frame(uint8* frame,
             int fromWidth, int fromHeight,
             int toWidth, int toHeight)
{
  // Crops an I420 frame in place to a centered toWidth x toHeight window,
  // compacting the cut planes towards the front of the buffer.
  // Returns the byte size of the cut frame, or -1 on bad arguments.
  if (toWidth < 1 || fromWidth < 1 || toHeight < 1 || fromHeight < 1 )
  {
    return -1;
  }
  if (toWidth == fromWidth && toHeight == fromHeight)
  {
    // Nothing to do
    return 3 * toHeight * toWidth / 2;
  }
  if (toWidth > fromWidth || toHeight > fromHeight)
  {
    // error
    return -1;
  }
  int i = 0;
  int m = 0;
  int loop = 0;
  int halfToWidth = toWidth / 2;
  int halfToHeight = toHeight / 2;
  int halfFromWidth = fromWidth / 2;
  int halfFromHeight= fromHeight / 2;
  int cutHeight = ( fromHeight - toHeight ) / 2;
  int cutWidth = ( fromWidth - toWidth ) / 2;
  // Y: copy each cropped luma row towards the front of the buffer.
  // BUG FIX: use memmove instead of memcpy — source and destination are in
  // the same buffer and can overlap when the crop margins are small.
  for (i = fromWidth * cutHeight + cutWidth; loop < toHeight ;
      loop++, i += fromWidth)
  {
    memmove(&frame[m],&frame[i],toWidth);
    m += toWidth;
  }
  i = fromWidth * fromHeight; // ilum
  loop = 0;
  // First chroma plane: skip its top/left crop margins, then copy rows.
  for ( i += (halfFromWidth * cutHeight / 2 + cutWidth / 2);
       loop < halfToHeight; loop++,i += halfFromWidth)
  {
    memmove(&frame[m],&frame[i],halfToWidth);
    m += halfToWidth;
  }
  loop = 0;
  i = fromWidth * fromHeight + halfFromHeight * halfFromWidth; // ilum + Cr
  // Second chroma plane.
  for ( i += (halfFromWidth * cutHeight / 2 + cutWidth / 2);
       loop < halfToHeight; loop++, i += halfFromWidth)
  {
    memmove(&frame[m],&frame[i],halfToWidth);
    m += halfToWidth;
  }
  return halfToWidth * toHeight * 3;
}
int
CutPadI420Frame(const uint8* inFrame, int inWidth,
                int inHeight, uint8* outFrame,
                int outWidth, int outHeight)
{
  // Converts an I420 frame of one size into another by symmetrically
  // padding (luma pad value 0, chroma pad value 127) or cutting each
  // dimension. Width is padded or cut in both branches; the two branches
  // differ in whether the height is padded or cut.
  // NOTE(review): the case inHeight > outHeight combined with
  // inWidth < outWidth is handled by the second branch (cut height, pad
  // width); a pad-height/cut-width mix goes through the first branch.
  // Returns the byte size of the output frame (3 * outWidth * outHeight/2).
  if (inWidth < 1 || outWidth < 1 || inHeight < 1 || outHeight < 1 )
  {
    return -1;
  }
  if (inWidth == outWidth && inHeight == outHeight)
  {
    // Same geometry: straight copy of all three planes.
    memcpy(outFrame, inFrame, 3 * outWidth * (outHeight >> 1));
  }
  else
  {
    if ( inHeight < outHeight)
    {
      // pad height
      int padH = outHeight - inHeight;
      int i = 0;
      int padW = 0;
      int cutW = 0;
      int width = inWidth;
      if (inWidth < outWidth)
      {
        // pad width
        padW = outWidth - inWidth;
      }
      else
      {
        // cut width
        cutW = inWidth - outWidth;
        width = outWidth;
      }
      // Luma: top pad rows (value 0).
      if (padH)
      {
        memset(outFrame, 0, outWidth * (padH >> 1));
        outFrame += outWidth * (padH >> 1);
      }
      // Luma: copy each input row, with left/right pad or cut.
      for (i = 0; i < inHeight;i++)
      {
        if (padW)
        {
          memset(outFrame, 0, padW / 2);
          outFrame += padW / 2;
        }
        inFrame += cutW >> 1; // in case we have a cut
        memcpy(outFrame,inFrame ,width);
        inFrame += cutW >> 1;
        outFrame += width;
        inFrame += width;
        if (padW)
        {
          memset(outFrame, 0, padW / 2);
          outFrame += padW / 2;
        }
      }
      // Luma: bottom pad rows (value 0).
      if (padH)
      {
        memset(outFrame, 0, outWidth * (padH >> 1));
        outFrame += outWidth * (padH >> 1);
      }
      // First chroma plane: top pad (value 127; chroma neutral is assumed
      // to be ~128 — 127 is what this code uses throughout).
      if (padH)
      {
        memset(outFrame, 127, (outWidth >> 2) * (padH >> 1));
        outFrame += (outWidth >> 2) * (padH >> 1);
      }
      for (i = 0; i < (inHeight >> 1); i++)
      {
        if (padW)
        {
          memset(outFrame, 127, padW >> 2);
          outFrame += padW >> 2;
        }
        inFrame += cutW >> 2; // in case we have a cut
        memcpy(outFrame, inFrame,width >> 1);
        inFrame += cutW >> 2;
        outFrame += width >> 1;
        inFrame += width >> 1;
        if (padW)
        {
          memset(outFrame, 127, padW >> 2);
          outFrame += padW >> 2;
        }
      }
      // Pad between the chroma planes: bottom of plane 1 plus top of
      // plane 2, written as one block of full chroma rows.
      if (padH)
      {
        memset(outFrame, 127, (outWidth >> 1) * (padH >> 1));
        outFrame += (outWidth >> 1) * (padH >> 1);
      }
      // Second chroma plane rows.
      for (i = 0; i < (inHeight >> 1); i++)
      {
        if (padW)
        {
          memset(outFrame, 127, padW >> 2);
          outFrame += padW >> 2;
        }
        inFrame += cutW >> 2; // in case we have a cut
        memcpy(outFrame, inFrame,width >> 1);
        inFrame += cutW >> 2;
        outFrame += width >> 1;
        inFrame += width >> 1;
        if (padW)
        {
          memset(outFrame, 127, padW >> 2);
          outFrame += padW >> 2;
        }
      }
      // Second chroma plane: bottom pad.
      if (padH)
      {
        memset(outFrame, 127, (outWidth >> 2) * (padH >> 1));
        outFrame += (outWidth >> 2) * (padH >> 1);
      }
    }
    else
    {
      // cut height
      int i = 0;
      int padW = 0;
      int cutW = 0;
      int width = inWidth;
      if (inWidth < outWidth)
      {
        // pad width
        padW = outWidth - inWidth;
      } else
      {
        // cut width
        cutW = inWidth - outWidth;
        width = outWidth;
      }
      int diffH = inHeight - outHeight;
      inFrame += inWidth * (diffH >> 1); // skip top I
      // Luma rows, with left/right pad or cut.
      for (i = 0; i < outHeight; i++)
      {
        if (padW)
        {
          memset(outFrame, 0, padW / 2);
          outFrame += padW / 2;
        }
        inFrame += cutW >> 1; // in case we have a cut
        memcpy(outFrame,inFrame ,width);
        inFrame += cutW >> 1;
        outFrame += width;
        inFrame += width;
        if (padW)
        {
          memset(outFrame, 0, padW / 2);
          outFrame += padW / 2;
        }
      }
      inFrame += inWidth * (diffH >> 1); // skip end I
      inFrame += (inWidth >> 2) * (diffH >> 1); // skip top of Cr
      // First chroma plane rows.
      for (i = 0; i < (outHeight >> 1); i++)
      {
        if (padW)
        {
          memset(outFrame, 127, padW >> 2);
          outFrame += padW >> 2;
        }
        inFrame += cutW >> 2; // in case we have a cut
        memcpy(outFrame, inFrame,width >> 1);
        inFrame += cutW >> 2;
        outFrame += width >> 1;
        inFrame += width >> 1;
        if (padW)
        {
          memset(outFrame, 127, padW >> 2);
          outFrame += padW >> 2;
        }
      }
      inFrame += (inWidth >> 2) * (diffH >> 1); // skip end of Cr
      inFrame += (inWidth >> 2) * (diffH >> 1); // skip top of Cb
      // Second chroma plane rows.
      for (i = 0; i < (outHeight >> 1); i++)
      {
        if (padW)
        {
          memset(outFrame, 127, padW >> 2);
          outFrame += padW >> 2;
        }
        inFrame += cutW >> 2; // in case we have a cut
        memcpy(outFrame, inFrame, width >> 1);
        inFrame += cutW >> 2;
        outFrame += width >> 1;
        inFrame += width >> 1;
        if (padW)
        {
          memset(outFrame, 127, padW >> 2);
          outFrame += padW >> 2;
        }
      }
    }
  }
  return 3 * outWidth * (outHeight >> 1);
}
} // nmaespace libyuv
/*
* Copyright (c) 2011 The LibYuv project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#if defined(LINUX) || defined(ANDROID)
#include "linux.h"
#include <ctype.h>
#include <errno.h>
#include <sys/utsname.h>
#include <sys/wait.h>
#include <cstdio>
#include <set>
namespace libyuv {
// Paths read by ProcCpuInfo; kCpuMaxFreqFile is declared for callers that
// query the max CPU frequency.
static const char kCpuInfoFile[] = "/proc/cpuinfo";
static const char kCpuMaxFreqFile[] =
    "/sys/devices/system/cpu/cpu0/cpufreq/cpuinfo_max_freq";
// No state to set up or tear down beyond the members' own defaults.
ProcCpuInfo::ProcCpuInfo() {
}
ProcCpuInfo::~ProcCpuInfo() {
}
// Reads and parses /proc/cpuinfo into sections_.
// Returns false if the file cannot be opened or parsing yields no sections.
bool ProcCpuInfo::LoadFromSystem() {
  ConfigParser procfs;
  if (!procfs.Open(kCpuInfoFile)) {
    return false;
  }
  return procfs.Parse(&sections_);
}
// NOTE: removed a stray ';' that followed the closing brace (an extraneous
// empty declaration at namespace scope).
// Reports how many cpuinfo sections were parsed. Fails if nothing has been
// loaded yet. |count| may be NULL to merely query success.
bool ProcCpuInfo::GetSectionCount(size_t* count) {
  if (sections_.empty()) {
    return false;
  }
  if (count != NULL) {
    *count = sections_.size();
  }
  return true;
}
// Counts logical CPUs from the parsed cpuinfo sections. |num| may be NULL.
// Fails only if nothing has been loaded yet.
bool ProcCpuInfo::GetNumCpus(int* num) {
  if (sections_.empty()) {
    return false;
  }
  int total_cpus = 0;
#if defined(__arm__)
  // Count the number of blocks that have a "processor" key defined. On ARM,
  // there may be extra blocks of information that aren't per-processor.
  size_t section_count = sections_.size();
  for (size_t i = 0; i < section_count; ++i) {
    int processor_id;
    if (GetSectionIntValue(i, "processor", &processor_id)) {
      ++total_cpus;
    }
  }
  // Single core ARM systems don't include "processor" keys at all, so return
  // that we have a single core if we didn't find any explicitly above.
  if (total_cpus == 0) {
    total_cpus = 1;
  }
#else
  // On X86, there is exactly one info section per processor.
  total_cpus = static_cast<int>(sections_.size());
#endif
  if (num) {
    *num = total_cpus;
  }
  return true;
}
// Counts physical cores (as opposed to hyperthreads). |num| may be NULL.
// Fails only if nothing has been loaded yet.
bool ProcCpuInfo::GetNumPhysicalCpus(int* num) {
  if (sections_.empty()) {
    return false;
  }
  // TODO(noahric): /proc/cpuinfo only reports cores that are currently
  // _online_, so this may underreport the number of physical cores.
#if defined(__arm__)
  // ARM (currently) has no hyperthreading, so just return the same value
  // as GetNumCpus.
  return GetNumCpus(num);
#else
  int total_cores = 0;
  std::set<int> physical_ids;
  size_t section_count = sections_.size();
  for (size_t i = 0; i < section_count; ++i) {
    int physical_id;
    int cores;
    // Count the cores for the physical id only if we have not counted the id.
    if (GetSectionIntValue(i, "physical id", &physical_id) &&
        GetSectionIntValue(i, "cpu cores", &cores) &&
        physical_ids.find(physical_id) == physical_ids.end()) {
      physical_ids.insert(physical_id);
      total_cores += cores;
    }
  }
  if (num) {
    *num = total_cores;
  }
  return true;
#endif
}
// Retrieves the CPU family id. Defaults to 0 when the key is absent; |id|
// may be NULL. Always returns true.
bool ProcCpuInfo::GetCpuFamily(int* id) {
  int cpu_family = 0;
  // shh {
#if defined(__arm__)
  // On ChromeOS seaboard, there is no 'cpu family' in '/proc/cpuinfo'. But
  // there is 'CPU Architecture' which can be used as 'cpu family'.
  // See http://en.wikipedia.org/wiki/ARM_architecture for a good list of
  // ARM cpu families, architectures, and their mappings.
  // There may be multiple sessions that aren't per-processor. We need to scan
  // through each session until we find the first 'CPU architecture'.
  size_t section_count = sections_.size();
  for (size_t i = 0; i < section_count; ++i) {
    if (GetSectionIntValue(i, "CPU architecture", &cpu_family)) {
      // We return the first one (if there are multiple entries).
      break;
    };
  }
#else
  // shh }
  GetSectionIntValue(0, "cpu family", &cpu_family);
  // shh {
#endif
  // shh }
  if (id) {
    *id = cpu_family;
  }
  return true;
}
bool ProcCpuInfo::GetSectionStringValue(size_t section_num,
const std::string& key,
std::string* result) {
if (section_num >= sections_.size()) {
return false;
}
ConfigParser::SimpleMap::iterator iter = sections_[section_num].find(key);
if (iter == sections_[section_num].end()) {
return false;
}
*result = iter->second;
return true;
}
bool ProcCpuInfo::GetSectionIntValue(size_t section_num,
const std::string& key,
int* result) {
if (section_num >= sections_.size()) {
return false;
}
ConfigParser::SimpleMap::iterator iter = sections_[section_num].find(key);
if (iter == sections_[section_num].end()) {
return false;
}
return FromString(iter->second, result);
}
// A stream is attached later via Open() or Attach(); nothing to do here.
ConfigParser::ConfigParser() {}
ConfigParser::~ConfigParser() {}
bool ConfigParser::Open(const std::string& filename) {
FileStream* fs = new FileStream();
if (!fs->Open(filename, "r", NULL)) {
return false;
}
instream_.reset(fs);
return true;
}
// Attaches an already-open stream as the parser's input. Takes ownership of
// |stream| (it is stored in the scoped_ptr and freed with the parser).
void ConfigParser::Attach(StreamInterface* stream) {
  instream_.reset(stream);
}
// Parses the attached stream and appends one key/value map per section to
// |key_val_pairs|. Returns true if the vector ends up non-empty.
bool ConfigParser::Parse(MapVector* key_val_pairs) {
  SimpleMap current_section;
  for (;;) {
    if (!ParseSection(&current_section)) {
      break;
    }
    key_val_pairs->push_back(current_section);
    current_section.clear();
  }
  return !key_val_pairs->empty();
}
// Reads key/value lines into |key_val_pair| until ParseLine() fails (section
// separator or end of stream). Returns true if any pair was collected.
bool ConfigParser::ParseSection(SimpleMap* key_val_pair) {
  std::string key;
  std::string value;
  while (ParseLine(&key, &value)) {
    (*key_val_pair)[key] = value;
  }
  return !key_val_pair->empty();
}
bool ConfigParser::ParseLine(std::string* key, std::string* value) {
// Parses the next line in the filestream and places the found key-value
// pair into key and val.
std::string line;
if ((instream_->ReadLine(&line)) == EOF) {
return false;
}
std::vector<std::string> tokens;
if (2 != split(line, ':', &tokens)) {
return false;
}
// Removes whitespace at the end of Key name
size_t pos = tokens[0].length() - 1;
while ((pos > 0) && isspace(tokens[0][pos])) {
pos--;
}
tokens[0].erase(pos + 1);
// Removes whitespace at the start of value
pos = 0;
while (pos < tokens[1].length() && isspace(tokens[1][pos])) {
pos++;
}
tokens[1].erase(0, pos);
*key = tokens[0];
*value = tokens[1];
return true;
}
// Reads exactly one line from |stream| into |out|. Logs and returns false if
// the stream ended early or reported an error.
static bool ExpectLineFromStream(FileStream* stream,
                                 std::string* out) {
  StreamResult result = stream->ReadLine(out);
  if (result == SR_SUCCESS) {
    return true;
  }
  if (result == SR_EOS) {
    LOG(LS_ERROR) << "Incorrect number of lines in stream";
  } else {
    LOG(LS_ERROR) << "Error when reading from stream";
  }
  return false;
}
// Verifies that |stream| is exhausted; logs a warning (but otherwise ignores
// it) when extra data or a read error is encountered.
static void ExpectEofFromStream(FileStream* stream) {
  std::string dummy;
  switch (stream->ReadLine(&dummy)) {
    case SR_SUCCESS:
      LOG(LS_WARNING) << "Ignoring unexpected extra lines from stream";
      break;
    case SR_EOS:
      break;  // Expected: nothing left to read.
    default:
      LOG(LS_WARNING) << "Error when checking for extra lines from stream";
      break;
  }
}
// For caching the lsb_release output (reading it invokes a sub-process and
// hence is somewhat expensive).
// Both the cache and reads/writes of it are guarded by the critical section
// below; see ReadLinuxLsbRelease().
static std::string lsb_release_string;
static CriticalSection lsb_release_string_critsec;
// Runs "lsb_release -idrcs" and folds its four output lines into a single
// "DISTRIB_ID=... DISTRIB_DESCRIPTION=... DISTRIB_RELEASE=...
// DISTRIB_CODENAME=..." string. The result is cached in
// |lsb_release_string|; on any failure an empty string is returned (and
// nothing is cached, so a later call retries).
std::string ReadLinuxLsbRelease() {
  CritScope cs(&lsb_release_string_critsec);
  if (!lsb_release_string.empty()) {
    // Have cached result from previous call.
    return lsb_release_string;
  }
  // No cached result. Run lsb_release and parse output.
  POpenStream lsb_release_output;
  if (!lsb_release_output.Open("lsb_release -idrcs", "r", NULL)) {
    LOG_ERR(LS_ERROR) << "Can't run lsb_release";
    return lsb_release_string;  // empty
  }
  // Read in the command's output and build the string.
  // The -idrcs flags emit exactly four lines: id, description, release,
  // codename, in that order.
  std::ostringstream sstr;
  std::string line;
  int wait_status;
  if (!ExpectLineFromStream(&lsb_release_output, &line)) {
    return lsb_release_string;  // empty
  }
  sstr << "DISTRIB_ID=" << line;
  if (!ExpectLineFromStream(&lsb_release_output, &line)) {
    return lsb_release_string;  // empty
  }
  sstr << " DISTRIB_DESCRIPTION=\"" << line << '"';
  if (!ExpectLineFromStream(&lsb_release_output, &line)) {
    return lsb_release_string;  // empty
  }
  sstr << " DISTRIB_RELEASE=" << line;
  if (!ExpectLineFromStream(&lsb_release_output, &line)) {
    return lsb_release_string;  // empty
  }
  sstr << " DISTRIB_CODENAME=" << line;
  // Should not be anything left.
  ExpectEofFromStream(&lsb_release_output);
  lsb_release_output.Close();
  // A non-zero exit only produces a warning; the collected output is still
  // cached and returned.
  wait_status = lsb_release_output.GetWaitStatus();
  if (wait_status == -1 ||
      !WIFEXITED(wait_status) ||
      WEXITSTATUS(wait_status) != 0) {
    LOG(LS_WARNING) << "Unexpected exit status from lsb_release";
  }
  lsb_release_string = sstr.str();
  return lsb_release_string;
}
// Returns "sysname release version machine" from uname(2), or an empty
// string if the call fails.
std::string ReadLinuxUname() {
  struct utsname info;
  if (uname(&info) < 0) {
    LOG_ERR(LS_ERROR) << "Can't call uname()";
    return std::string();
  }
  std::ostringstream result;
  result << info.sysname << " "
         << info.release << " "
         << info.version << " "
         << info.machine;
  return result.str();
}
// Reads the maximum CPU frequency from the sysfs file named by
// kCpuMaxFreqFile. Returns the parsed integer, or -1 on open/read/parse
// failure.
int ReadCpuMaxFreq() {
  FileStream fs;
  if (!fs.Open(kCpuMaxFreqFile, "r", NULL)) {
    return -1;
  }
  std::string line;
  if (fs.ReadLine(&line) != SR_SUCCESS) {
    return -1;
  }
  int freq = -1;
  if (!FromString(line, &freq)) {
    return -1;
  }
  return freq;
}
} // namespace libyuv
#endif // defined(LINUX) || defined(ANDROID)
/*
* Copyright (c) 2011 The LibYuv project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#ifndef LIBYUV_SOURCE_LINUX_H_
#define LIBYUV_SOURCE_LINUX_H_
#if defined(LINUX) || defined(ANDROID)
#include <string>
#include <map>
#include <vector>
namespace libyuv {
//////////////////////////////////////////////////////////////////////////////
// ConfigParser parses a FileStream of an ".ini"-type format into a map.
//////////////////////////////////////////////////////////////////////////////
// Sample Usage:
// ConfigParser parser;
// ConfigParser::MapVector key_val_pairs;
// if (parser.Open(inifile) && parser.Parse(&key_val_pairs)) {
//   for (size_t section_num = 0; section_num < key_val_pairs.size(); ++section_num) {
// std::string val1 = key_val_pairs[section_num][key1];
// std::string val2 = key_val_pairs[section_num][key2];
// // Do something with valn;
// }
// }
class ConfigParser {
 public:
  // One section's key/value pairs.
  typedef std::map<std::string, std::string> SimpleMap;
  // All sections of a parsed file, in file order.
  typedef std::vector<SimpleMap> MapVector;
  ConfigParser();
  virtual ~ConfigParser();
  // Opens |filename| as the input stream; returns false if it can't be read.
  virtual bool Open(const std::string& filename);
  // Attaches an already-open stream; takes ownership of |stream|.
  virtual void Attach(StreamInterface* stream);
  // Parses the whole stream into one map per section.
  virtual bool Parse(MapVector* key_val_pairs);
  // Parses a single section into |key_val_pair|; false when empty.
  virtual bool ParseSection(SimpleMap* key_val_pair);
  // Parses one "key : value" line; false at end of section/stream.
  virtual bool ParseLine(std::string* key, std::string* value);
 private:
  scoped_ptr<StreamInterface> instream_;  // Owned input stream.
};
//////////////////////////////////////////////////////////////////////////////
// ProcCpuInfo reads CPU info from the /proc subsystem on any *NIX platform.
//////////////////////////////////////////////////////////////////////////////
// Sample Usage:
// ProcCpuInfo proc_info;
// int no_of_cpu;
// if (proc_info.LoadFromSystem()) {
// std::string out_str;
// proc_info.GetNumCpus(&no_of_cpu);
//    proc_info.GetSectionStringValue(0, "vendor_id", &out_str);
//  }
class ProcCpuInfo {
 public:
  ProcCpuInfo();
  virtual ~ProcCpuInfo();
  // Reads the proc subsystem's cpu info into memory. If this fails, this
  // returns false; if it succeeds, it returns true.
  virtual bool LoadFromSystem();
  // Obtains the number of logical CPU threads and places the value num.
  virtual bool GetNumCpus(int* num);
  // Obtains the number of physical CPU cores and places the value num.
  virtual bool GetNumPhysicalCpus(int* num);
  // Obtains the CPU family id.
  virtual bool GetCpuFamily(int* id);
  // Obtains the number of sections in /proc/cpuinfo, which may be greater
  // than the number of CPUs (e.g. on ARM)
  virtual bool GetSectionCount(size_t* count);
  // Looks for the CPU proc item with the given name for the given section
  // number and places the string value in result.
  virtual bool GetSectionStringValue(size_t section_num, const std::string& key,
                                     std::string* result);
  // Looks for the CPU proc item with the given name for the given section
  // number and places the int value in result.
  virtual bool GetSectionIntValue(size_t section_num, const std::string& key,
                                  int* result);
 private:
  // One key/value map per section ("processor" block) of /proc/cpuinfo,
  // filled in by LoadFromSystem().
  ConfigParser::MapVector sections_;
};
// Builds a string containing the info from lsb_release on a single line.
std::string ReadLinuxLsbRelease();
// Returns the output of "uname".
std::string ReadLinuxUname();
// Returns the content (int) of
// /sys/devices/system/cpu/cpu0/cpufreq/cpuinfo_max_freq
// Returns -1 on error.
int ReadCpuMaxFreq();
} // namespace libyuv
#endif // defined(LINUX) || defined(ANDROID)
#endif // LIBYUV_SOURCE_LINUX_H_
/*
* Copyright (c) 2011 The LibYuv project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "planar_functions.h"
#include <string.h>
#include "cpu_id.h"
namespace libyuv {
#if defined(__ARM_NEON__) && !defined(COVERAGE_ENABLED)
#define HAS_SPLITUV_NEON
// Reads 16 pairs of UV and write even values to dst_u and odd to dst_v
// Alignment requirement: 16 bytes for pointers, and multiple of 16 pixels.
// De-interleaves |pix| UV pairs: U bytes go to dst_u, V bytes to dst_v.
// NOTE(review): the 'bhi' loop exits only when the subtraction reaches
// exactly 0, so pix must be a positive multiple of 16 — confirm callers.
static void SplitUV_NEON(const uint8* src_uv,
                         uint8* dst_u, uint8* dst_v, int pix) {
  __asm__ volatile
  (
    "1:\n"
    "vld2.u8    {q0,q1}, [%0]!    \n"  // load 16 pairs of UV
    "vst1.u8    {q0}, [%1]!       \n"  // store U
    "vst1.u8    {q1}, [%2]!       \n"  // Store V
    "subs       %3, %3, #16       \n"  // 16 processed per loop
    "bhi        1b                \n"
    :                                  // Output registers
    : "r"(src_uv), "r"(dst_u), "r"(dst_v), "r"(pix)  // Input registers
    : "q0", "q1"                       // Clobber List
  );
}
#elif defined(WIN32) && !defined(COVERAGE_ENABLED)
#define HAS_SPLITUV_SSE2
// De-interleaves |pix| UV pairs with SSE2: masks out the even (U) bytes and
// shifts down the odd (V) bytes, 32 source bytes per iteration.
// Requires 16-byte aligned pointers and pix to be a positive multiple of 16
// (the 'ja' loop only stops on an exact zero count).
static void SplitUV_SSE2(const uint8* src_uv,
                         uint8* dst_u, uint8* dst_v, int pix) {
  __asm {
    mov        esi, src_uv
    mov        edi, dst_u
    mov        edx, dst_v
    mov        ecx, pix
    mov        eax, 0x00ff00ff  // mask for isolating low bytes
    movd       xmm7, eax
    pshufd     xmm7, xmm7, 0   // broadcast mask to all 4 dwords

  wloop:
    movdqa     xmm0, [esi]
    movdqa     xmm1, [esi + 16]
    lea        esi, [esi + 32]
    movdqa     xmm2, xmm0
    movdqa     xmm3, xmm1
    pand       xmm0, xmm7      // even bytes
    pand       xmm1, xmm7
    packuswb   xmm0, xmm1
    movdqa     [edi], xmm0
    lea        edi, [edi + 16]
    psrlw      xmm2, 8         // odd bytes
    psrlw      xmm3, 8
    packuswb   xmm2, xmm3
    movdqa     [edx], xmm2
    lea        edx, [edx + 16]
    sub        ecx, 16
    ja         wloop
  }
}
#elif defined(__i386__) && !defined(COVERAGE_ENABLED) && \
!TARGET_IPHONE_SIMULATOR
// GCC version is same as Visual C
#define HAS_SPLITUV_SSE2
// GCC build of the same SSE2 UV splitter, written as a file-scope asm block
// with cdecl stack-argument access (0x8/0xc/0x10/0x14(%ebp)). Same
// requirements as the MSVC version: aligned pointers, pix a positive
// multiple of 16. OSX Mach-O symbols carry a leading underscore, hence the
// two label spellings.
extern "C" void SplitUV_SSE2(const uint8* src_uv,
                             uint8* dst_u, uint8* dst_v, int pix);
asm(
  ".text\n"
#if defined(OSX)
  ".globl _SplitUV_SSE2\n"
  "_SplitUV_SSE2:\n"
#else
  ".global SplitUV_SSE2\n"
  "SplitUV_SSE2:\n"
#endif
  // Prologue: save callee-saved esi/edi and set up the frame.
  "push   %ebp\n"
  "mov    %esp,%ebp\n"
  "push   %esi\n"
  "push   %edi\n"
  "mov    0x8(%ebp),%esi\n"
  "mov    0xc(%ebp),%edi\n"
  "mov    0x10(%ebp),%edx\n"
  "mov    0x14(%ebp),%ecx\n"
  "mov    $0xff00ff,%eax\n"
  "movd   %eax,%xmm7\n"
  "pshufd $0x0,%xmm7,%xmm7\n"
"1:"
  "movdqa (%esi),%xmm0\n"
  "movdqa 0x10(%esi),%xmm1\n"
  "lea    0x20(%esi),%esi\n"
  "movdqa %xmm0,%xmm2\n"
  "movdqa %xmm1,%xmm3\n"
  "pand   %xmm7,%xmm0\n"
  "pand   %xmm7,%xmm1\n"
  "packuswb %xmm1,%xmm0\n"
  "movdqa %xmm0,(%edi)\n"
  "lea    0x10(%edi),%edi\n"
  "psrlw  $0x8,%xmm2\n"
  "psrlw  $0x8,%xmm3\n"
  "packuswb %xmm3,%xmm2\n"
  "movdqa %xmm2,(%edx)\n"
  "lea    0x10(%edx),%edx\n"
  "sub    $0x10,%ecx\n"
  "ja     1b\n"
  "pop    %edi\n"
  "pop    %esi\n"
  "pop    %ebp\n"
  "ret\n"
);
#endif
// Portable UV splitter: copies the even bytes of |src_uv| to |dst_u| and the
// odd bytes to |dst_v|, one pair per pixel.
static void SplitUV_C(const uint8* src_uv,
                      uint8* dst_u, uint8* dst_v, int pix) {
  for (int i = 0; i < pix; ++i) {
    dst_u[i] = src_uv[i * 2];
    dst_v[i] = src_uv[i * 2 + 1];
  }
}
// Copies a plane of |height| rows, |width| bytes each, honoring the source
// and destination strides independently.
static void I420CopyPlane(const uint8* src_y, int src_pitch_y,
                          uint8* dst_y, int dst_pitch_y,
                          int width, int height) {
  for (int row = 0; row < height; ++row) {
    memcpy(dst_y + row * dst_pitch_y, src_y + row * src_pitch_y, width);
  }
}
// Copies a plane whose source stride alternates row-by-row between
// |src_pitch_0| and |src_pitch_1| (used for M420, where Y rows are
// interleaved with chroma rows). The destination uses a single stride.
static void I420CopyPlane2(const uint8* src, int src_pitch_0, int src_pitch_1,
                           uint8* dst, int dst_pitch,
                           int width, int height) {
  for (int y = 0; y < height; y += 2) {
    memcpy(dst, src, width);
    src += src_pitch_0;
    dst += dst_pitch;
    // Guard the second row so an odd |height| doesn't read or write one row
    // past the plane.
    if (y + 1 < height) {
      memcpy(dst, src, width);
      src += src_pitch_1;
      dst += dst_pitch;
    }
  }
}
// Support converting from FOURCC_M420
// Useful for bandwidth constrained transports like USB 1.0 and 2.0 and for
// easy conversion to I420.
// M420 format description:
// M420 is row biplanar 420: 2 rows of Y and 1 row of VU.
// Chroma is half width / half height. (420)
// pitch_m420 is row planar. Normally this will be the width in pixels.
//   The UV plane is half width, but 2 values, so pitch_m420 applies to this
//   as well as the two Y planes.
// TODO(fbarchard): Do NV21/NV12 formats with this function
static void X420ToI420(uint8* dst_y, int dst_pitch_y,
                       uint8* dst_u, int dst_pitch_u,
                       uint8* dst_v, int dst_pitch_v,
                       const uint8* src_y,
                       int src_pitch_y0, int src_pitch_y1,
                       const uint8* src_uv, int src_pitch_uv,
                       int width, int height) {
  // Negative height means invert the image.
  if (height < 0) {
    height = -height;
    int halfheight = (height + 1) >> 1;
    dst_y = dst_y + (height - 1) * dst_pitch_y;
    // The chroma planes are only halfheight rows tall; offsetting them by
    // (height - 1) rows would point past the end of the plane.
    dst_u = dst_u + (halfheight - 1) * dst_pitch_u;
    dst_v = dst_v + (halfheight - 1) * dst_pitch_v;
    dst_pitch_y = -dst_pitch_y;
    dst_pitch_u = -dst_pitch_u;
    dst_pitch_v = -dst_pitch_v;
  }
  int halfwidth = (width + 1) >> 1;
  // Pick the fastest available UV splitter; SIMD versions require 16-byte
  // alignment and a multiple-of-16 half width.
  void (*SplitUV)(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix);
#if defined(HAS_SPLITUV_NEON)
  if (libyuv::CpuInfo::TestCpuFlag(libyuv::CpuInfo::kCpuHasNEON) &&
      (halfwidth % 16 == 0) &&
      IS_ALIGNED(src_uv, 16) && (src_pitch_uv % 16 == 0) &&
      IS_ALIGNED(dst_u, 16) && (dst_pitch_u % 16 == 0) &&
      IS_ALIGNED(dst_v, 16) && (dst_pitch_v % 16 == 0)) {
    SplitUV = SplitUV_NEON;
  } else
#elif defined(HAS_SPLITUV_SSE2)
  if (libyuv::CpuInfo::TestCpuFlag(libyuv::CpuInfo::kCpuHasSSE2) &&
      (halfwidth % 16 == 0) &&
      IS_ALIGNED(src_uv, 16) && (src_pitch_uv % 16 == 0) &&
      IS_ALIGNED(dst_u, 16) && (dst_pitch_u % 16 == 0) &&
      IS_ALIGNED(dst_v, 16) && (dst_pitch_v % 16 == 0)) {
    SplitUV = SplitUV_SSE2;
  } else
#endif
  {
    SplitUV = SplitUV_C;
  }
  // Copy the Y plane (its stride may alternate, e.g. for M420).
  I420CopyPlane2(src_y, src_pitch_y0, src_pitch_y1, dst_y, dst_pitch_y,
                 width, height);
  // De-interleave the UV plane one half-height row at a time.
  int halfheight = (height + 1) >> 1;
  for (int y = 0; y < halfheight; ++y) {
    SplitUV(src_uv, dst_u, dst_v, halfwidth);
    dst_u += dst_pitch_u;
    dst_v += dst_pitch_v;
    src_uv += src_pitch_uv;
  }
}
// TODO(fbarchard): For biplanar formats (ie NV21), the Y plane is the same
// as I420, and only the chroma plane varies. Copy the Y plane by reference,
// and just convert the UV. This method can be used for NV21, NV12, I420,
// I422, M422. 8 of the 12 bits is Y, so this would copy 3 times less data,
// which is approximately how much faster it would be.
// Helper function to copy yuv data without scaling. Used
// by our jpeg conversion callbacks to incrementally fill a yuv image.
// A negative |height| inverts the image during the copy.
void PlanarFunctions::I420Copy(const uint8* src_y, int src_pitch_y,
                               const uint8* src_u, int src_pitch_u,
                               const uint8* src_v, int src_pitch_v,
                               uint8* dst_y, int dst_pitch_y,
                               uint8* dst_u, int dst_pitch_u,
                               uint8* dst_v, int dst_pitch_v,
                               int width, int height) {
  // Negative height means invert the image.
  if (height < 0) {
    height = -height;
    int halfheight = (height + 1) >> 1;
    src_y = src_y + (height - 1) * src_pitch_y;
    // I420 chroma planes are half height; offsetting them by (height - 1)
    // rows would read past the end of the planes.
    src_u = src_u + (halfheight - 1) * src_pitch_u;
    src_v = src_v + (halfheight - 1) * src_pitch_v;
    src_pitch_y = -src_pitch_y;
    src_pitch_u = -src_pitch_u;
    src_pitch_v = -src_pitch_v;
  }
  int halfwidth = (width + 1) >> 1;
  int halfheight = (height + 1) >> 1;
  I420CopyPlane(src_y, src_pitch_y, dst_y, dst_pitch_y, width, height);
  I420CopyPlane(src_u, src_pitch_u, dst_u, dst_pitch_u, halfwidth, halfheight);
  I420CopyPlane(src_v, src_pitch_v, dst_v, dst_pitch_v, halfwidth, halfheight);
}
// Helper function to copy yuv data without scaling. Used
// by our jpeg conversion callbacks to incrementally fill a yuv image.
// Converts I422 (full-height, half-width chroma) to I420 by copying Y and
// vertically averaging each pair of chroma rows. A negative |height|
// inverts the image. Note: I422 chroma is full height, so the (height - 1)
// offsets below are correct for all three source planes.
void PlanarFunctions::I422ToI420(const uint8* src_y, int src_pitch_y,
                                 const uint8* src_u, int src_pitch_u,
                                 const uint8* src_v, int src_pitch_v,
                                 uint8* dst_y, int dst_pitch_y,
                                 uint8* dst_u, int dst_pitch_u,
                                 uint8* dst_v, int dst_pitch_v,
                                 int width, int height) {
  // Negative height means invert the image.
  if (height < 0) {
    height = -height;
    src_y = src_y + (height - 1) * src_pitch_y;
    src_u = src_u + (height - 1) * src_pitch_u;
    src_v = src_v + (height - 1) * src_pitch_v;
    src_pitch_y = -src_pitch_y;
    src_pitch_u = -src_pitch_u;
    src_pitch_v = -src_pitch_v;
  }
  // Copy Y plane
  I420CopyPlane(src_y, src_pitch_y, dst_y, dst_pitch_y, width, height);
  // SubSample UV planes: average each pair of source rows into one
  // destination row, rounding ((a + b + 1) >> 1).
  int x, y;
  int halfwidth = (width + 1) >> 1;
  for (y = 0; y < height; y += 2) {
    const uint8* u0 = src_u;
    const uint8* u1 = src_u + src_pitch_u;
    // For an odd height, the last pair has no second row; reuse the first.
    if ((y + 1) >= height) {
      u1 = u0;
    }
    for (x = 0; x < halfwidth; ++x) {
      dst_u[x] = (u0[x] + u1[x] + 1) >> 1;
    }
    src_u += src_pitch_u * 2;
    dst_u += dst_pitch_u;
  }
  for (y = 0; y < height; y += 2) {
    const uint8* v0 = src_v;
    const uint8* v1 = src_v + src_pitch_v;
    if ((y + 1) >= height) {
      v1 = v0;
    }
    for (x = 0; x < halfwidth; ++x) {
      dst_v[x] = (v0[x] + v1[x] + 1) >> 1;
    }
    src_v += src_pitch_v * 2;
    dst_v += dst_pitch_v;
  }
}
// Convert M420 to I420.
// M420 interleaves two Y rows with one VU row, all with pitch_m420:
// Y rows alternate strides pitch_m420 and pitch_m420 * 2 (skipping the VU
// row), the first VU row starts two rows in, and consecutive VU rows are
// pitch_m420 * 3 apart (one full Y/Y/VU group).
void PlanarFunctions::M420ToI420(uint8* dst_y, int dst_pitch_y,
                                 uint8* dst_u, int dst_pitch_u,
                                 uint8* dst_v, int dst_pitch_v,
                                 const uint8* m420, int pitch_m420,
                                 int width, int height) {
  X420ToI420(dst_y, dst_pitch_y, dst_u, dst_pitch_u, dst_v, dst_pitch_v,
             m420, pitch_m420, pitch_m420 * 2,
             m420 + pitch_m420 * 2, pitch_m420 * 3,
             width, height);
}
// Convert NV12 to I420.
// NV12 has a planar Y surface and a separate interleaved UV surface, both
// with the same stride, so every pitch argument is src_pitch.
void PlanarFunctions::NV12ToI420(uint8* dst_y, int dst_pitch_y,
                                 uint8* dst_u, int dst_pitch_u,
                                 uint8* dst_v, int dst_pitch_v,
                                 const uint8* src_y,
                                 const uint8* src_uv,
                                 int src_pitch,
                                 int width, int height) {
  X420ToI420(dst_y, dst_pitch_y, dst_u, dst_pitch_u, dst_v, dst_pitch_v,
             src_y, src_pitch, src_pitch,
             src_uv, src_pitch,
             width, height);
}
} // namespace libyuv
/*
* Copyright (c) 2011 The LibYuv project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#ifndef LIBYUV_SOURCE_ROW_H_
#define LIBYUV_SOURCE_ROW_H_
#include "basic_types.h"
extern "C" {
// Can only do 1x.
// This is the second fastest of the scalers.
void FastConvertYUVToRGB32Row(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width);
#if defined(_MSC_VER)
#define SIMD_ALIGNED(var) __declspec(align(16)) var
#else
#define SIMD_ALIGNED(var) var __attribute__((aligned(16)))
#endif
#ifdef OSX
extern SIMD_ALIGNED(const int16 kCoefficientsRgbY[768][4]);
#else
extern SIMD_ALIGNED(const int16 _kCoefficientsRgbY[768][4]);
#endif
// Method to force C version.
//#define USE_MMX 0
//#define USE_SSE2 0
#if !defined(USE_MMX)
// Windows, Mac and Linux use MMX
#if defined(__i386__) || defined(_MSC_VER)
#define USE_MMX 1
#else
#define USE_MMX 0
#endif
#endif
#if !defined(USE_SSE2)
#if defined(__SSE2__) || defined(ARCH_CPU_X86_64) || _M_IX86_FP==2
#define USE_SSE2 1
#else
#define USE_SSE2 0
#endif
#endif
// x64 uses MMX2 (SSE) so emms is not required.
// Warning C4799: function has no EMMS instruction.
// EMMS() is slow and should be called by the calling function once per image.
#if USE_MMX && !defined(ARCH_CPU_X86_64)
#if defined(_MSC_VER)
#define EMMS() __asm emms
#pragma warning(disable: 4799)
#else
#define EMMS() asm("emms")
#endif
#else
#define EMMS()
#endif
} // extern "C"
#endif // LIBYUV_SOURCE_ROW_H_
/*
* Copyright (c) 2011 The LibYuv project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "row.h"
extern "C" {
#if defined(__x86_64__)
// 64 bit linux gcc version
// Converts two pixels per iteration: one U and one V sample are looked up in
// the coefficient table (U entries at +2048 bytes, V at +4096, Y at the
// base), summed with saturation, descaled by 6 bits and packed to ARGB.
// NOTE(review): the 'ja' loop assumes |width| is a positive even number —
// confirm callers.
void FastConvertYUVToRGB32Row(const uint8* y_buf,  // rdi
                              const uint8* u_buf,  // rsi
                              const uint8* v_buf,  // rdx
                              uint8* rgb_buf,      // rcx
                              int width) {         // r8
  asm(
  "1:"
  "movzb  (%1),%%r10\n"
  "lea    1(%1),%1\n"
  "movzb  (%2),%%r11\n"
  "lea    1(%2),%2\n"
  "movq   2048(%5,%%r10,8),%%xmm0\n"
  "movzb  (%0),%%r10\n"
  "movq   4096(%5,%%r11,8),%%xmm1\n"
  "movzb  0x1(%0),%%r11\n"
  "paddsw %%xmm1,%%xmm0\n"
  "movq   (%5,%%r10,8),%%xmm2\n"
  "lea    2(%0),%0\n"
  "movq   (%5,%%r11,8),%%xmm3\n"
  "paddsw %%xmm0,%%xmm2\n"
  "paddsw %%xmm0,%%xmm3\n"
  "shufps $0x44,%%xmm3,%%xmm2\n"
  "psraw  $0x6,%%xmm2\n"
  "packuswb %%xmm2,%%xmm2\n"
  "movq   %%xmm2,0x0(%3)\n"
  "lea    8(%3),%3\n"
  "sub    $0x2,%4\n"
  "ja     1b\n"
  :
  : "r"(y_buf),  // %0
    "r"(u_buf),  // %1
    "r"(v_buf),  // %2
    "r"(rgb_buf),  // %3
    "r"(width),  // %4
    "r" (_kCoefficientsRgbY)  // %5
  : "memory", "r10", "r11", "xmm0", "xmm1", "xmm2", "xmm3"
);
}
#elif defined(__i386__)
// 32 bit gcc version
// 32-bit MMX version, declared in C and defined as a file-scope asm block.
// Same two-pixels-per-iteration scheme as the x64 version; uses movntq
// (non-temporal store), so callers are expected to call EMMS() once per
// image (see row.h). The '_kCoefficientsRgbY' spelling resolves to the
// underscore-prefixed Mach-O symbol on OSX and to the literally-underscored
// ELF variable elsewhere.
void FastConvertYUVToRGB32Row(const uint8* y_buf,
                              const uint8* u_buf,
                              const uint8* v_buf,
                              uint8* rgb_buf,
                              int width);
  asm(
  ".text\n"
#if defined(OSX) || defined(IOS)
  ".globl _FastConvertYUVToRGB32Row\n"
"_FastConvertYUVToRGB32Row:\n"
#else
  ".global FastConvertYUVToRGB32Row\n"
"FastConvertYUVToRGB32Row:\n"
#endif
  // pusha saves all registers; arguments are then at 0x24+(%esp).
  "pusha\n"
  "mov    0x24(%esp),%edx\n"
  "mov    0x28(%esp),%edi\n"
  "mov    0x2c(%esp),%esi\n"
  "mov    0x30(%esp),%ebp\n"
  "mov    0x34(%esp),%ecx\n"
"1:"
  "movzbl (%edi),%eax\n"
  "lea    1(%edi),%edi\n"
  "movzbl (%esi),%ebx\n"
  "lea    1(%esi),%esi\n"
  "movq   _kCoefficientsRgbY+2048(,%eax,8),%mm0\n"
  "movzbl (%edx),%eax\n"
  "paddsw _kCoefficientsRgbY+4096(,%ebx,8),%mm0\n"
  "movzbl 0x1(%edx),%ebx\n"
  "movq   _kCoefficientsRgbY(,%eax,8),%mm1\n"
  "lea    2(%edx),%edx\n"
  "movq   _kCoefficientsRgbY(,%ebx,8),%mm2\n"
  "paddsw %mm0,%mm1\n"
  "paddsw %mm0,%mm2\n"
  "psraw  $0x6,%mm1\n"
  "psraw  $0x6,%mm2\n"
  "packuswb %mm2,%mm1\n"
  "movntq %mm1,0x0(%ebp)\n"
  "lea    8(%ebp),%ebp\n"
  "sub    $0x2,%ecx\n"
  "ja     1b\n"
  "popa\n"
  "ret\n"
);
#else
// C reference code that mimic the YUV assembly.
#define packuswb(x) ((x) < 0 ? 0 : ((x) > 255 ? 255 : (x)))
#define paddsw(x, y) (((x) + (y)) < -32768 ? -32768 : \
(((x) + (y)) > 32767 ? 32767 : ((x) + (y))))
// Converts one YUV pixel to ARGB using the shared coefficient table,
// mirroring the MMX path exactly: table rows 256..511 hold U terms,
// 512..767 hold V terms, 0..255 hold Y terms; components are summed with
// 16-bit saturation (paddsw), descaled by 6 fractional bits, then clamped
// to a byte (packuswb) and packed as B | G<<8 | R<<16 | A<<24.
static inline void YuvPixel(uint8 y,
                            uint8 u,
                            uint8 v,
                            uint8* rgb_buf) {
  int b = _kCoefficientsRgbY[256+u][0];
  int g = _kCoefficientsRgbY[256+u][1];
  int r = _kCoefficientsRgbY[256+u][2];
  int a = _kCoefficientsRgbY[256+u][3];
  b = paddsw(b, _kCoefficientsRgbY[512+v][0]);
  g = paddsw(g, _kCoefficientsRgbY[512+v][1]);
  r = paddsw(r, _kCoefficientsRgbY[512+v][2]);
  a = paddsw(a, _kCoefficientsRgbY[512+v][3]);
  b = paddsw(b, _kCoefficientsRgbY[y][0]);
  g = paddsw(g, _kCoefficientsRgbY[y][1]);
  r = paddsw(r, _kCoefficientsRgbY[y][2]);
  a = paddsw(a, _kCoefficientsRgbY[y][3]);
  b >>= 6;
  g >>= 6;
  r >>= 6;
  a >>= 6;
  *reinterpret_cast<uint32*>(rgb_buf) = (packuswb(b)) |
                                        (packuswb(g) << 8) |
                                        (packuswb(r) << 16) |
                                        (packuswb(a) << 24);
}
// C reference row converter: processes pixels in pairs, since each U/V
// sample is shared by two Y samples (4:2:2 layout). Handles an odd trailing
// pixel, unlike the assembly versions.
void FastConvertYUVToRGB32Row(const uint8* y_buf,
                              const uint8* u_buf,
                              const uint8* v_buf,
                              uint8* rgb_buf,
                              int width) {
  for (int i = 0; i < width; i += 2) {
    const uint8 u = u_buf[i >> 1];
    const uint8 v = v_buf[i >> 1];
    YuvPixel(y_buf[i], u, v, rgb_buf);
    if (i + 1 < width) {
      YuvPixel(y_buf[i + 1], u, v, rgb_buf + 4);
    }
    rgb_buf += 8;  // Advance 2 pixels.
  }
}
#endif
} // extern "C"
/*
* Copyright (c) 2011 The LibYuv project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "row.h"
extern "C" {
// Fixed-point YUV->RGB coefficients with 6 fractional bits (scale 64),
// laid out as {B, G, R, A} rows so the table can be summed per channel.
// The multipliers (1.164, 2.018, -0.391, -0.813, 1.596) match ITU-R BT.601
// video-range conversion. NOTE(review): the "+ 0.5" rounds toward +inf for
// the negative U/G and V/G terms rather than rounding to nearest — confirm
// this matches the reference the table was generated from.
#define RGBY(i) { \
  static_cast<int16>(1.164 * 64 * (i - 16) + 0.5), \
  static_cast<int16>(1.164 * 64 * (i - 16) + 0.5), \
  static_cast<int16>(1.164 * 64 * (i - 16) + 0.5), \
  0 \
}
// The U row's alpha term (256 * 64 - 1) makes the summed, descaled alpha
// channel come out as 0xFF.
#define RGBU(i) { \
  static_cast<int16>(2.018 * 64 * (i - 128) + 0.5), \
  static_cast<int16>(-0.391 * 64 * (i - 128) + 0.5), \
  0, \
  static_cast<int16>(256 * 64 - 1) \
}
#define RGBV(i) { \
  0, \
  static_cast<int16>(-0.813 * 64 * (i - 128) + 0.5), \
  static_cast<int16>(1.596 * 64 * (i - 128) + 0.5), \
  0 \
}
#ifdef OSX
SIMD_ALIGNED(const int16 kCoefficientsRgbY[256 * 3][4]) = {
#else
SIMD_ALIGNED(const int16 _kCoefficientsRgbY[256 * 3][4]) = {
#endif
// Luminance table.
RGBY(0x00), RGBY(0x01), RGBY(0x02), RGBY(0x03),
RGBY(0x04), RGBY(0x05), RGBY(0x06), RGBY(0x07),
RGBY(0x08), RGBY(0x09), RGBY(0x0A), RGBY(0x0B),
RGBY(0x0C), RGBY(0x0D), RGBY(0x0E), RGBY(0x0F),
RGBY(0x10), RGBY(0x11), RGBY(0x12), RGBY(0x13),
RGBY(0x14), RGBY(0x15), RGBY(0x16), RGBY(0x17),
RGBY(0x18), RGBY(0x19), RGBY(0x1A), RGBY(0x1B),
RGBY(0x1C), RGBY(0x1D), RGBY(0x1E), RGBY(0x1F),
RGBY(0x20), RGBY(0x21), RGBY(0x22), RGBY(0x23),
RGBY(0x24), RGBY(0x25), RGBY(0x26), RGBY(0x27),
RGBY(0x28), RGBY(0x29), RGBY(0x2A), RGBY(0x2B),
RGBY(0x2C), RGBY(0x2D), RGBY(0x2E), RGBY(0x2F),
RGBY(0x30), RGBY(0x31), RGBY(0x32), RGBY(0x33),
RGBY(0x34), RGBY(0x35), RGBY(0x36), RGBY(0x37),
RGBY(0x38), RGBY(0x39), RGBY(0x3A), RGBY(0x3B),
RGBY(0x3C), RGBY(0x3D), RGBY(0x3E), RGBY(0x3F),
RGBY(0x40), RGBY(0x41), RGBY(0x42), RGBY(0x43),
RGBY(0x44), RGBY(0x45), RGBY(0x46), RGBY(0x47),
RGBY(0x48), RGBY(0x49), RGBY(0x4A), RGBY(0x4B),
RGBY(0x4C), RGBY(0x4D), RGBY(0x4E), RGBY(0x4F),
RGBY(0x50), RGBY(0x51), RGBY(0x52), RGBY(0x53),
RGBY(0x54), RGBY(0x55), RGBY(0x56), RGBY(0x57),
RGBY(0x58), RGBY(0x59), RGBY(0x5A), RGBY(0x5B),
RGBY(0x5C), RGBY(0x5D), RGBY(0x5E), RGBY(0x5F),
RGBY(0x60), RGBY(0x61), RGBY(0x62), RGBY(0x63),
RGBY(0x64), RGBY(0x65), RGBY(0x66), RGBY(0x67),
RGBY(0x68), RGBY(0x69), RGBY(0x6A), RGBY(0x6B),
RGBY(0x6C), RGBY(0x6D), RGBY(0x6E), RGBY(0x6F),
RGBY(0x70), RGBY(0x71), RGBY(0x72), RGBY(0x73),
RGBY(0x74), RGBY(0x75), RGBY(0x76), RGBY(0x77),
RGBY(0x78), RGBY(0x79), RGBY(0x7A), RGBY(0x7B),
RGBY(0x7C), RGBY(0x7D), RGBY(0x7E), RGBY(0x7F),
RGBY(0x80), RGBY(0x81), RGBY(0x82), RGBY(0x83),
RGBY(0x84), RGBY(0x85), RGBY(0x86), RGBY(0x87),
RGBY(0x88), RGBY(0x89), RGBY(0x8A), RGBY(0x8B),
RGBY(0x8C), RGBY(0x8D), RGBY(0x8E), RGBY(0x8F),
RGBY(0x90), RGBY(0x91), RGBY(0x92), RGBY(0x93),
RGBY(0x94), RGBY(0x95), RGBY(0x96), RGBY(0x97),
RGBY(0x98), RGBY(0x99), RGBY(0x9A), RGBY(0x9B),
RGBY(0x9C), RGBY(0x9D), RGBY(0x9E), RGBY(0x9F),
RGBY(0xA0), RGBY(0xA1), RGBY(0xA2), RGBY(0xA3),
RGBY(0xA4), RGBY(0xA5), RGBY(0xA6), RGBY(0xA7),
RGBY(0xA8), RGBY(0xA9), RGBY(0xAA), RGBY(0xAB),
RGBY(0xAC), RGBY(0xAD), RGBY(0xAE), RGBY(0xAF),
RGBY(0xB0), RGBY(0xB1), RGBY(0xB2), RGBY(0xB3),
RGBY(0xB4), RGBY(0xB5), RGBY(0xB6), RGBY(0xB7),
RGBY(0xB8), RGBY(0xB9), RGBY(0xBA), RGBY(0xBB),
RGBY(0xBC), RGBY(0xBD), RGBY(0xBE), RGBY(0xBF),
RGBY(0xC0), RGBY(0xC1), RGBY(0xC2), RGBY(0xC3),
RGBY(0xC4), RGBY(0xC5), RGBY(0xC6), RGBY(0xC7),
RGBY(0xC8), RGBY(0xC9), RGBY(0xCA), RGBY(0xCB),
RGBY(0xCC), RGBY(0xCD), RGBY(0xCE), RGBY(0xCF),
RGBY(0xD0), RGBY(0xD1), RGBY(0xD2), RGBY(0xD3),
RGBY(0xD4), RGBY(0xD5), RGBY(0xD6), RGBY(0xD7),
RGBY(0xD8), RGBY(0xD9), RGBY(0xDA), RGBY(0xDB),
RGBY(0xDC), RGBY(0xDD), RGBY(0xDE), RGBY(0xDF),
RGBY(0xE0), RGBY(0xE1), RGBY(0xE2), RGBY(0xE3),
RGBY(0xE4), RGBY(0xE5), RGBY(0xE6), RGBY(0xE7),
RGBY(0xE8), RGBY(0xE9), RGBY(0xEA), RGBY(0xEB),
RGBY(0xEC), RGBY(0xED), RGBY(0xEE), RGBY(0xEF),
RGBY(0xF0), RGBY(0xF1), RGBY(0xF2), RGBY(0xF3),
RGBY(0xF4), RGBY(0xF5), RGBY(0xF6), RGBY(0xF7),
RGBY(0xF8), RGBY(0xF9), RGBY(0xFA), RGBY(0xFB),
RGBY(0xFC), RGBY(0xFD), RGBY(0xFE), RGBY(0xFF),
// Chroma U table.
RGBU(0x00), RGBU(0x01), RGBU(0x02), RGBU(0x03),
RGBU(0x04), RGBU(0x05), RGBU(0x06), RGBU(0x07),
RGBU(0x08), RGBU(0x09), RGBU(0x0A), RGBU(0x0B),
RGBU(0x0C), RGBU(0x0D), RGBU(0x0E), RGBU(0x0F),
RGBU(0x10), RGBU(0x11), RGBU(0x12), RGBU(0x13),
RGBU(0x14), RGBU(0x15), RGBU(0x16), RGBU(0x17),
RGBU(0x18), RGBU(0x19), RGBU(0x1A), RGBU(0x1B),
RGBU(0x1C), RGBU(0x1D), RGBU(0x1E), RGBU(0x1F),
RGBU(0x20), RGBU(0x21), RGBU(0x22), RGBU(0x23),
RGBU(0x24), RGBU(0x25), RGBU(0x26), RGBU(0x27),
RGBU(0x28), RGBU(0x29), RGBU(0x2A), RGBU(0x2B),
RGBU(0x2C), RGBU(0x2D), RGBU(0x2E), RGBU(0x2F),
RGBU(0x30), RGBU(0x31), RGBU(0x32), RGBU(0x33),
RGBU(0x34), RGBU(0x35), RGBU(0x36), RGBU(0x37),
RGBU(0x38), RGBU(0x39), RGBU(0x3A), RGBU(0x3B),
RGBU(0x3C), RGBU(0x3D), RGBU(0x3E), RGBU(0x3F),
RGBU(0x40), RGBU(0x41), RGBU(0x42), RGBU(0x43),
RGBU(0x44), RGBU(0x45), RGBU(0x46), RGBU(0x47),
RGBU(0x48), RGBU(0x49), RGBU(0x4A), RGBU(0x4B),
RGBU(0x4C), RGBU(0x4D), RGBU(0x4E), RGBU(0x4F),
RGBU(0x50), RGBU(0x51), RGBU(0x52), RGBU(0x53),
RGBU(0x54), RGBU(0x55), RGBU(0x56), RGBU(0x57),
RGBU(0x58), RGBU(0x59), RGBU(0x5A), RGBU(0x5B),
RGBU(0x5C), RGBU(0x5D), RGBU(0x5E), RGBU(0x5F),
RGBU(0x60), RGBU(0x61), RGBU(0x62), RGBU(0x63),
RGBU(0x64), RGBU(0x65), RGBU(0x66), RGBU(0x67),
RGBU(0x68), RGBU(0x69), RGBU(0x6A), RGBU(0x6B),
RGBU(0x6C), RGBU(0x6D), RGBU(0x6E), RGBU(0x6F),
RGBU(0x70), RGBU(0x71), RGBU(0x72), RGBU(0x73),
RGBU(0x74), RGBU(0x75), RGBU(0x76), RGBU(0x77),
RGBU(0x78), RGBU(0x79), RGBU(0x7A), RGBU(0x7B),
RGBU(0x7C), RGBU(0x7D), RGBU(0x7E), RGBU(0x7F),
RGBU(0x80), RGBU(0x81), RGBU(0x82), RGBU(0x83),
RGBU(0x84), RGBU(0x85), RGBU(0x86), RGBU(0x87),
RGBU(0x88), RGBU(0x89), RGBU(0x8A), RGBU(0x8B),
RGBU(0x8C), RGBU(0x8D), RGBU(0x8E), RGBU(0x8F),
RGBU(0x90), RGBU(0x91), RGBU(0x92), RGBU(0x93),
RGBU(0x94), RGBU(0x95), RGBU(0x96), RGBU(0x97),
RGBU(0x98), RGBU(0x99), RGBU(0x9A), RGBU(0x9B),
RGBU(0x9C), RGBU(0x9D), RGBU(0x9E), RGBU(0x9F),
RGBU(0xA0), RGBU(0xA1), RGBU(0xA2), RGBU(0xA3),
RGBU(0xA4), RGBU(0xA5), RGBU(0xA6), RGBU(0xA7),
RGBU(0xA8), RGBU(0xA9), RGBU(0xAA), RGBU(0xAB),
RGBU(0xAC), RGBU(0xAD), RGBU(0xAE), RGBU(0xAF),
RGBU(0xB0), RGBU(0xB1), RGBU(0xB2), RGBU(0xB3),
RGBU(0xB4), RGBU(0xB5), RGBU(0xB6), RGBU(0xB7),
RGBU(0xB8), RGBU(0xB9), RGBU(0xBA), RGBU(0xBB),
RGBU(0xBC), RGBU(0xBD), RGBU(0xBE), RGBU(0xBF),
RGBU(0xC0), RGBU(0xC1), RGBU(0xC2), RGBU(0xC3),
RGBU(0xC4), RGBU(0xC5), RGBU(0xC6), RGBU(0xC7),
RGBU(0xC8), RGBU(0xC9), RGBU(0xCA), RGBU(0xCB),
RGBU(0xCC), RGBU(0xCD), RGBU(0xCE), RGBU(0xCF),
RGBU(0xD0), RGBU(0xD1), RGBU(0xD2), RGBU(0xD3),
RGBU(0xD4), RGBU(0xD5), RGBU(0xD6), RGBU(0xD7),
RGBU(0xD8), RGBU(0xD9), RGBU(0xDA), RGBU(0xDB),
RGBU(0xDC), RGBU(0xDD), RGBU(0xDE), RGBU(0xDF),
RGBU(0xE0), RGBU(0xE1), RGBU(0xE2), RGBU(0xE3),
RGBU(0xE4), RGBU(0xE5), RGBU(0xE6), RGBU(0xE7),
RGBU(0xE8), RGBU(0xE9), RGBU(0xEA), RGBU(0xEB),
RGBU(0xEC), RGBU(0xED), RGBU(0xEE), RGBU(0xEF),
RGBU(0xF0), RGBU(0xF1), RGBU(0xF2), RGBU(0xF3),
RGBU(0xF4), RGBU(0xF5), RGBU(0xF6), RGBU(0xF7),
RGBU(0xF8), RGBU(0xF9), RGBU(0xFA), RGBU(0xFB),
RGBU(0xFC), RGBU(0xFD), RGBU(0xFE), RGBU(0xFF),
// Chroma V table.
RGBV(0x00), RGBV(0x01), RGBV(0x02), RGBV(0x03),
RGBV(0x04), RGBV(0x05), RGBV(0x06), RGBV(0x07),
RGBV(0x08), RGBV(0x09), RGBV(0x0A), RGBV(0x0B),
RGBV(0x0C), RGBV(0x0D), RGBV(0x0E), RGBV(0x0F),
RGBV(0x10), RGBV(0x11), RGBV(0x12), RGBV(0x13),
RGBV(0x14), RGBV(0x15), RGBV(0x16), RGBV(0x17),
RGBV(0x18), RGBV(0x19), RGBV(0x1A), RGBV(0x1B),
RGBV(0x1C), RGBV(0x1D), RGBV(0x1E), RGBV(0x1F),
RGBV(0x20), RGBV(0x21), RGBV(0x22), RGBV(0x23),
RGBV(0x24), RGBV(0x25), RGBV(0x26), RGBV(0x27),
RGBV(0x28), RGBV(0x29), RGBV(0x2A), RGBV(0x2B),
RGBV(0x2C), RGBV(0x2D), RGBV(0x2E), RGBV(0x2F),
RGBV(0x30), RGBV(0x31), RGBV(0x32), RGBV(0x33),
RGBV(0x34), RGBV(0x35), RGBV(0x36), RGBV(0x37),
RGBV(0x38), RGBV(0x39), RGBV(0x3A), RGBV(0x3B),
RGBV(0x3C), RGBV(0x3D), RGBV(0x3E), RGBV(0x3F),
RGBV(0x40), RGBV(0x41), RGBV(0x42), RGBV(0x43),
RGBV(0x44), RGBV(0x45), RGBV(0x46), RGBV(0x47),
RGBV(0x48), RGBV(0x49), RGBV(0x4A), RGBV(0x4B),
RGBV(0x4C), RGBV(0x4D), RGBV(0x4E), RGBV(0x4F),
RGBV(0x50), RGBV(0x51), RGBV(0x52), RGBV(0x53),
RGBV(0x54), RGBV(0x55), RGBV(0x56), RGBV(0x57),
RGBV(0x58), RGBV(0x59), RGBV(0x5A), RGBV(0x5B),
RGBV(0x5C), RGBV(0x5D), RGBV(0x5E), RGBV(0x5F),
RGBV(0x60), RGBV(0x61), RGBV(0x62), RGBV(0x63),
RGBV(0x64), RGBV(0x65), RGBV(0x66), RGBV(0x67),
RGBV(0x68), RGBV(0x69), RGBV(0x6A), RGBV(0x6B),
RGBV(0x6C), RGBV(0x6D), RGBV(0x6E), RGBV(0x6F),
RGBV(0x70), RGBV(0x71), RGBV(0x72), RGBV(0x73),
RGBV(0x74), RGBV(0x75), RGBV(0x76), RGBV(0x77),
RGBV(0x78), RGBV(0x79), RGBV(0x7A), RGBV(0x7B),
RGBV(0x7C), RGBV(0x7D), RGBV(0x7E), RGBV(0x7F),
RGBV(0x80), RGBV(0x81), RGBV(0x82), RGBV(0x83),
RGBV(0x84), RGBV(0x85), RGBV(0x86), RGBV(0x87),
RGBV(0x88), RGBV(0x89), RGBV(0x8A), RGBV(0x8B),
RGBV(0x8C), RGBV(0x8D), RGBV(0x8E), RGBV(0x8F),
RGBV(0x90), RGBV(0x91), RGBV(0x92), RGBV(0x93),
RGBV(0x94), RGBV(0x95), RGBV(0x96), RGBV(0x97),
RGBV(0x98), RGBV(0x99), RGBV(0x9A), RGBV(0x9B),
RGBV(0x9C), RGBV(0x9D), RGBV(0x9E), RGBV(0x9F),
RGBV(0xA0), RGBV(0xA1), RGBV(0xA2), RGBV(0xA3),
RGBV(0xA4), RGBV(0xA5), RGBV(0xA6), RGBV(0xA7),
RGBV(0xA8), RGBV(0xA9), RGBV(0xAA), RGBV(0xAB),
RGBV(0xAC), RGBV(0xAD), RGBV(0xAE), RGBV(0xAF),
RGBV(0xB0), RGBV(0xB1), RGBV(0xB2), RGBV(0xB3),
RGBV(0xB4), RGBV(0xB5), RGBV(0xB6), RGBV(0xB7),
RGBV(0xB8), RGBV(0xB9), RGBV(0xBA), RGBV(0xBB),
RGBV(0xBC), RGBV(0xBD), RGBV(0xBE), RGBV(0xBF),
RGBV(0xC0), RGBV(0xC1), RGBV(0xC2), RGBV(0xC3),
RGBV(0xC4), RGBV(0xC5), RGBV(0xC6), RGBV(0xC7),
RGBV(0xC8), RGBV(0xC9), RGBV(0xCA), RGBV(0xCB),
RGBV(0xCC), RGBV(0xCD), RGBV(0xCE), RGBV(0xCF),
RGBV(0xD0), RGBV(0xD1), RGBV(0xD2), RGBV(0xD3),
RGBV(0xD4), RGBV(0xD5), RGBV(0xD6), RGBV(0xD7),
RGBV(0xD8), RGBV(0xD9), RGBV(0xDA), RGBV(0xDB),
RGBV(0xDC), RGBV(0xDD), RGBV(0xDE), RGBV(0xDF),
RGBV(0xE0), RGBV(0xE1), RGBV(0xE2), RGBV(0xE3),
RGBV(0xE4), RGBV(0xE5), RGBV(0xE6), RGBV(0xE7),
RGBV(0xE8), RGBV(0xE9), RGBV(0xEA), RGBV(0xEB),
RGBV(0xEC), RGBV(0xED), RGBV(0xEE), RGBV(0xEF),
RGBV(0xF0), RGBV(0xF1), RGBV(0xF2), RGBV(0xF3),
RGBV(0xF4), RGBV(0xF5), RGBV(0xF6), RGBV(0xF7),
RGBV(0xF8), RGBV(0xF9), RGBV(0xFA), RGBV(0xFB),
RGBV(0xFC), RGBV(0xFD), RGBV(0xFE), RGBV(0xFF),
};
#undef RGBY
#undef RGBU
#undef RGBV
} // extern "C"
/*
* Copyright (c) 2011 The LibYuv project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "scale.h"
#include <string.h>
#include "common.h"
#include "cpu_id.h"
// Note: A Neon reference manual
// http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.dui0204j/CJAJIIGG.html
// Note: Some SSE2 reference manuals
// cpuvol1.pdf agner_instruction_tables.pdf 253666.pdf 253667.pdf
// TODO(fbarchard): Remove once performance is known
//#define TEST_RSTSC
#if defined(TEST_RSTSC)
#include <iomanip>
#include <iostream>
#ifdef _MSC_VER
#include <emmintrin.h>
#endif
#if defined(__GNUC__) && defined(__i386__)
// Reads the x86 time-stamp counter on 32-bit GCC builds, mirroring the
// MSVC __rdtsc intrinsic. Only compiled when TEST_RSTSC timing is enabled.
static inline uint64 __rdtsc(void) {
  uint32_t a, d;  // rdtsc returns the 64-bit counter split across edx:eax
  __asm__ volatile("rdtsc" : "=a" (a), "=d" (d));
  return ((uint64)d << 32) + a;
}
#endif
#endif
namespace libyuv {
// Set the following flag to true to revert to only
// using the reference implementation ScalePlaneBox(), and
// NOT the optimized versions. Useful for debugging and
// when comparing the quality of the resulting YUV planes
// as produced by the optimized and non-optimized versions.
// NOTE(review): static member definition; presumably flipped by a setter on
// YuvScaler declared elsewhere — confirm against the class declaration.
bool YuvScaler::use_reference_impl_ = false;
/**
* NEON downscalers with interpolation.
*
* Provided by Fritz Koenig
*
*/
#if defined(__ARM_NEON__) && !defined(COVERAGE_ENABLED)
#define HAS_SCALEROWDOWN2_NEON
// Point-samples a row to half width: reads 32 interleaved pixels per
// iteration, keeps the 16 even ones. owidth must be a multiple of 16.
// istride is unused (no vertical filtering in this path).
void ScaleRowDown2_NEON(const uint8* iptr, int32 /* istride */,
                        uint8* dst, int32 owidth) {
  __asm__ volatile
  (
    "1:\n"
    "vld2.u8    {q0,q1}, [%0]!    \n"  // load even pixels into q0, odd into q1
    "vst1.u8    {q0}, [%1]!       \n"  // store even pixels
    "subs       %2, %2, #16       \n"  // 16 processed per loop
    "bhi        1b                \n"
    // The asm post-increments both pointers and decrements the counter, so
    // these operands must be read-write ("+r"); declaring them input-only is
    // undefined behavior in GCC extended asm. "memory" tells the compiler
    // that dst is written; "cc" covers the flags set by subs.
    : "+r"(iptr), "+r"(dst), "+r"(owidth)  // In/out registers
    :                                      // Input-only registers
    : "q0", "q1", "cc", "memory"           // Clobber list
  );
}
// Box-filters a 32x2 block down to 16x1: averages each 2x2 quad with
// rounding ((a+b+c+d+2)>>2). owidth must be a multiple of 16.
void ScaleRowDown2Int_NEON(const uint8* iptr, int32 istride,
                           uint8* dst, int32 owidth) {
  __asm__ volatile
  (
    "mov        r4, #2            \n"  // rounding constant
    "add        %1, %0            \n"  // %1 = pointer to second input row
    "vdup.16    q4, r4            \n"
    "1:\n"
    "vld1.u8    {q0,q1}, [%0]!    \n"  // load row 1 and post increment
    "vld1.u8    {q2,q3}, [%1]!    \n"  // load row 2 and post increment
    "vpaddl.u8  q0, q0            \n"  // row 1: add adjacent byte pairs
    "vpaddl.u8  q1, q1            \n"
    "vpadal.u8  q0, q2            \n"  // row 2: add adjacent, accumulate into row 1
    "vpadal.u8  q1, q3            \n"
    "vadd.u16   q0, q4            \n"  // rounding
    "vadd.u16   q1, q4            \n"
    "vshrn.u16  d0, q0, #2        \n"  // downshift and pack
    "vshrn.u16  d1, q1, #2        \n"
    "vst1.u8    {q0}, [%2]!       \n"
    "subs       %3, %3, #16       \n"  // 16 processed per loop
    "bhi        1b                \n"
    // All four operands are modified by the asm (pointer post-increment, the
    // "add %1, %0" turning istride into a pointer, and the loop counter), so
    // they must be read-write ("+r") rather than input-only, which would be
    // undefined behavior in GCC extended asm. "memory"/"cc" cover the store
    // through dst and the flags set by subs.
    : "+r"(iptr), "+r"(istride), "+r"(dst), "+r"(owidth)  // In/out registers
    :                                                     // Input-only registers
    : "r4", "q0", "q1", "q2", "q3", "q4", "cc", "memory"  // Clobber list
  );
}
/**
* SSE2 downscalers with interpolation.
*
* Provided by Frank Barchard (fbarchard@google.com)
*
*/
// Constants for SSE2 code
#elif (defined(WIN32) || defined(__i386__)) && !defined(COVERAGE_ENABLED) && \
!defined(__PIC__) && !TARGET_IPHONE_SIMULATOR
#if defined(_MSC_VER)
#define TALIGN16(t, var) __declspec(align(16)) t _ ## var
#elif defined(OSX)
#define TALIGN16(t, var) t var __attribute__((aligned(16)))
#else
#define TALIGN16(t, var) t _ ## var __attribute__((aligned(16)))
#endif
// Shuffle/multiply-add constants for the SSSE3 3/4 and 3/8 downscalers
// below. A shuffle index of 128 (bit 7 set) makes pshufb write zero.
// Offsets for source bytes 0 to 9
extern "C" TALIGN16(const uint8, shuf0[16]) =
  { 0, 1, 3, 4, 5, 7, 8, 9, 128, 128, 128, 128, 128, 128, 128, 128 };
// Offsets for source bytes 11 to 20 with 8 subtracted = 3 to 12.
extern "C" TALIGN16(const uint8, shuf1[16]) =
  { 3, 4, 5, 7, 8, 9, 11, 12, 128, 128, 128, 128, 128, 128, 128, 128 };
// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 15.
extern "C" TALIGN16(const uint8, shuf2[16]) =
  { 5, 7, 8, 9, 11, 12, 13, 15, 128, 128, 128, 128, 128, 128, 128, 128 };
// Offsets for source bytes 0 to 10
extern "C" TALIGN16(const uint8, shuf01[16]) =
  { 0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10 };
// Offsets for source bytes 10 to 21 with 8 subtracted = 3 to 13.
extern "C" TALIGN16(const uint8, shuf11[16]) =
  { 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13 };
// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 15.
extern "C" TALIGN16(const uint8, shuf21[16]) =
  { 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13, 13, 14, 14, 15 };
// Coefficients for source bytes 0 to 10
extern "C" TALIGN16(const uint8, madd01[16]) =
  { 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2 };
// Coefficients for source bytes 10 to 21
extern "C" TALIGN16(const uint8, madd11[16]) =
  { 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1 };
// Coefficients for source bytes 21 to 31
extern "C" TALIGN16(const uint8, madd21[16]) =
  { 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3 };
// Rounding constant added before the >>2 in the 3/4 filtering paths.
extern "C" TALIGN16(const int16, round34[8]) =
  { 2, 2, 2, 2, 2, 2, 2, 2 };
// Pick bytes 0,3,6,8,11,14 of a 16-byte block into output bytes 0..5.
extern "C" TALIGN16(const uint8, shuf38a[16]) =
  { 0, 3, 6, 8, 11, 14, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 };
// Same selection, placed into output bytes 6..11.
extern "C" TALIGN16(const uint8, shuf38b[16]) =
  { 128, 128, 128, 128, 128, 128, 0, 3, 6, 8, 11, 14, 128, 128, 128, 128 };
// Arrange words 0,3,6 into 0,1,2
extern "C" TALIGN16(const uint8, shufac0[16]) =
  { 0, 1, 6, 7, 12, 13, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 };
// Arrange words 0,3,6 into 3,4,5
extern "C" TALIGN16(const uint8, shufac3[16]) =
  { 128, 128, 128, 128, 128, 128, 0, 1, 6, 7, 12, 13, 128, 128, 128, 128 };
// Scaling values for boxes of 3x3 and 2x3 (fixed-point reciprocals used
// with pmulhuw, i.e. divide by 9,9,6, 9,9,6).
extern "C" TALIGN16(const uint16, scaleac3[8]) =
  { 65536 / 9, 65536 / 9, 65536 / 6, 65536 / 9, 65536 / 9, 65536 / 6, 0, 0 };
// Arrange first value for pixels 0,1,2,3,4,5
extern "C" TALIGN16(const uint8, shufab0[16]) =
  { 0, 128, 3, 128, 6, 128, 8, 128, 11, 128, 14, 128, 128, 128, 128, 128 };
// Arrange second value for pixels 0,1,2,3,4,5
extern "C" TALIGN16(const uint8, shufab1[16]) =
  { 1, 128, 4, 128, 7, 128, 9, 128, 12, 128, 15, 128, 128, 128, 128, 128 };
// Arrange third value for pixels 0,1,2,3,4,5
extern "C" TALIGN16(const uint8, shufab2[16]) =
  { 2, 128, 5, 128, 128, 128, 10, 128, 13, 128, 128, 128, 128, 128, 128, 128 };
// Scaling values for boxes of 3x2 and 2x2 (divide by 3,3,2, 3,3,2).
extern "C" TALIGN16(const uint16, scaleab2[8]) =
  { 65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3, 65536 / 3, 65536 / 2, 0, 0 };
#endif
#if defined(WIN32) && !defined(COVERAGE_ENABLED)
#define HAS_SCALEROWDOWN2_SSE2
// Reads 32 pixels, throws half away and writes 16 pixels (point sample:
// masks to the even bytes, then packs). owidth must be a multiple of 16.
// Alignment requirement: iptr 16 byte aligned, optr 16 byte aligned.
// naked: no compiler prologue/epilogue; arguments are read off esp directly.
__declspec(naked)
static void ScaleRowDown2_SSE2(const uint8* iptr, int32 istride,
                               uint8* optr, int32 owidth) {
  __asm {
    mov        eax, [esp + 4]        // iptr
                                     // istride ignored
    mov        edx, [esp + 12]       // optr
    mov        ecx, [esp + 16]       // owidth
    pcmpeqb    xmm7, xmm7            // generate mask 0x00ff00ff
    psrlw      xmm7, 8

  wloop:
    movdqa     xmm0, [eax]
    movdqa     xmm1, [eax + 16]
    lea        eax, [eax + 32]
    pand       xmm0, xmm7            // keep even bytes of each word
    pand       xmm1, xmm7
    packuswb   xmm0, xmm1            // 32 words -> 16 bytes
    movdqa     [edx], xmm0
    lea        edx, [edx + 16]
    sub        ecx, 16
    ja         wloop

    ret
  }
}
// Blends 32x2 rectangle to 16x1: pavgb averages the two rows, then each
// pair of columns is averaged (pavgw of high/low bytes of each word).
// Alignment requirement: iptr 16 byte aligned, optr 16 byte aligned.
__declspec(naked)
static void ScaleRowDown2Int_SSE2(const uint8* iptr, int32 istride,
                                  uint8* optr, int32 owidth) {
  __asm {
    push       esi
    mov        eax, [esp + 4 + 4]    // iptr
    mov        esi, [esp + 4 + 8]    // istride
    mov        edx, [esp + 4 + 12]   // optr
    mov        ecx, [esp + 4 + 16]   // owidth
    pcmpeqb    xmm7, xmm7            // generate mask 0x00ff00ff
    psrlw      xmm7, 8

  wloop:
    movdqa     xmm0, [eax]
    movdqa     xmm1, [eax + 16]
    movdqa     xmm2, [eax + esi]     // second row = iptr + istride
    movdqa     xmm3, [eax + esi + 16]
    lea        eax, [eax + 32]
    pavgb      xmm0, xmm2            // average rows
    pavgb      xmm1, xmm3
    movdqa     xmm2, xmm0            // average columns (32 to 16 pixels)
    psrlw      xmm0, 8               // odd bytes down into the low half
    movdqa     xmm3, xmm1
    psrlw      xmm1, 8
    pand       xmm2, xmm7            // even bytes
    pand       xmm3, xmm7
    pavgw      xmm0, xmm2
    pavgw      xmm1, xmm3
    packuswb   xmm0, xmm1
    movdqa     [edx], xmm0
    lea        edx, [edx + 16]
    sub        ecx, 16
    ja         wloop

    pop        esi
    ret
  }
}
#define HAS_SCALEROWDOWN4_SSE2
// Point samples 32 pixels to 8 pixels: keeps byte 0 of every dword.
// Alignment requirement: iptr 16 byte aligned, optr 8 byte aligned.
__declspec(naked)
static void ScaleRowDown4_SSE2(const uint8* iptr, int32 istride,
                               uint8* orow, int32 owidth) {
  __asm {
    pushad
    mov        esi, [esp + 32 + 4]   // iptr
                                     // istride ignored
    mov        edi, [esp + 32 + 12]  // orow
    mov        ecx, [esp + 32 + 16]  // owidth
    pcmpeqb    xmm7, xmm7            // generate mask 0x000000ff
    psrld      xmm7, 24

  wloop:
    movdqa     xmm0, [esi]
    movdqa     xmm1, [esi + 16]
    lea        esi, [esi + 32]
    pand       xmm0, xmm7
    pand       xmm1, xmm7
    packuswb   xmm0, xmm1            // dwords -> words
    packuswb   xmm0, xmm0            // words -> bytes
    movq       qword ptr [edi], xmm0
    lea        edi, [edi + 8]
    sub        ecx, 8
    ja         wloop

    popad
    ret
  }
}
// Blends 32x4 rectangle to 8x1: averages the 4 rows pairwise, then
// averages columns twice (32 -> 16 -> 8 pixels).
// Alignment requirement: iptr 16 byte aligned, optr 8 byte aligned.
__declspec(naked)
static void ScaleRowDown4Int_SSE2(const uint8* iptr, int32 istride,
                                  uint8* orow, int32 owidth) {
  __asm {
    pushad
    mov        esi, [esp + 32 + 4]   // iptr
    mov        ebx, [esp + 32 + 8]   // istride
    mov        edi, [esp + 32 + 12]  // orow
    mov        ecx, [esp + 32 + 16]  // owidth
    pcmpeqb    xmm7, xmm7            // generate mask 0x00ff00ff
    psrlw      xmm7, 8
    lea        edx, [ebx + ebx * 2]  // istride * 3

  wloop:
    movdqa     xmm0, [esi]           // rows 0 and 1
    movdqa     xmm1, [esi + 16]
    movdqa     xmm2, [esi + ebx]
    movdqa     xmm3, [esi + ebx + 16]
    pavgb      xmm0, xmm2            // average rows
    pavgb      xmm1, xmm3
    movdqa     xmm2, [esi + ebx * 2] // rows 2 and 3
    movdqa     xmm3, [esi + ebx * 2 + 16]
    movdqa     xmm4, [esi + edx]
    movdqa     xmm5, [esi + edx + 16]
    lea        esi, [esi + 32]
    pavgb      xmm2, xmm4
    pavgb      xmm3, xmm5
    pavgb      xmm0, xmm2            // combine row pairs
    pavgb      xmm1, xmm3
    movdqa     xmm2, xmm0            // average columns (32 to 16 pixels)
    psrlw      xmm0, 8
    movdqa     xmm3, xmm1
    psrlw      xmm1, 8
    pand       xmm2, xmm7
    pand       xmm3, xmm7
    pavgw      xmm0, xmm2
    pavgw      xmm1, xmm3
    packuswb   xmm0, xmm1
    movdqa     xmm2, xmm0            // average columns (16 to 8 pixels)
    psrlw      xmm0, 8
    pand       xmm2, xmm7
    pavgw      xmm0, xmm2
    packuswb   xmm0, xmm0
    movq       qword ptr [edi], xmm0
    lea        edi, [edi + 8]
    sub        ecx, 8
    ja         wloop

    popad
    ret
  }
}
#define HAS_SCALEROWDOWN8_SSE2
// Point samples 32 pixels to 4 pixels: keeps byte 0 of every qword.
// Alignment requirement: iptr 16 byte aligned, optr 4 byte aligned.
__declspec(naked)
static void ScaleRowDown8_SSE2(const uint8* iptr, int32 istride,
                               uint8* orow, int32 owidth) {
  __asm {
    pushad
    mov        esi, [esp + 32 + 4]   // iptr
                                     // istride ignored
    mov        edi, [esp + 32 + 12]  // orow
    mov        ecx, [esp + 32 + 16]  // owidth
    pcmpeqb    xmm7, xmm7            // generate mask isolating 1 in 8 bytes
    psrlq      xmm7, 56

  wloop:
    movdqa     xmm0, [esi]
    movdqa     xmm1, [esi + 16]
    lea        esi, [esi + 32]
    pand       xmm0, xmm7
    pand       xmm1, xmm7
    packuswb   xmm0, xmm1            // 32->16
    packuswb   xmm0, xmm0            // 16->8
    packuswb   xmm0, xmm0            // 8->4
    movd       dword ptr [edi], xmm0
    lea        edi, [edi + 4]
    sub        ecx, 4
    ja         wloop

    popad
    ret
  }
}
// Blends 32x8 rectangle to 4x1: averages 8 rows down to 1 with a pavgb
// tree, then psadbw sums each 8-byte group so each output pixel is the
// mean of an 8x8 box (sum >> 3 via psrlw after the horizontal add).
// Alignment requirement: iptr 16 byte aligned, optr 4 byte aligned.
__declspec(naked)
static void ScaleRowDown8Int_SSE2(const uint8* iptr, int32 istride,
                                  uint8* orow, int32 owidth) {
  __asm {
    pushad
    mov        esi, [esp + 32 + 4]   // iptr
    mov        ebx, [esp + 32 + 8]   // istride
    mov        edi, [esp + 32 + 12]  // orow
    mov        ecx, [esp + 32 + 16]  // owidth
    lea        edx, [ebx + ebx * 2]  // istride * 3
    pxor       xmm7, xmm7            // zero for psadbw

  wloop:
    movdqa     xmm0, [esi]           // average 8 rows to 1
    movdqa     xmm1, [esi + 16]
    movdqa     xmm2, [esi + ebx]
    movdqa     xmm3, [esi + ebx + 16]
    pavgb      xmm0, xmm2
    pavgb      xmm1, xmm3
    movdqa     xmm2, [esi + ebx * 2]
    movdqa     xmm3, [esi + ebx * 2 + 16]
    movdqa     xmm4, [esi + edx]
    movdqa     xmm5, [esi + edx + 16]
    lea        ebp, [esi + ebx * 4]  // ebp -> rows 4..7 (restored by popad)
    lea        esi, [esi + 32]
    pavgb      xmm2, xmm4
    pavgb      xmm3, xmm5
    pavgb      xmm0, xmm2
    pavgb      xmm1, xmm3
    movdqa     xmm2, [ebp]
    movdqa     xmm3, [ebp + 16]
    movdqa     xmm4, [ebp + ebx]
    movdqa     xmm5, [ebp + ebx + 16]
    pavgb      xmm2, xmm4
    pavgb      xmm3, xmm5
    movdqa     xmm4, [ebp + ebx * 2]
    movdqa     xmm5, [ebp + ebx * 2 + 16]
    movdqa     xmm6, [ebp + edx]
    pavgb      xmm4, xmm6
    movdqa     xmm6, [ebp + edx + 16]
    pavgb      xmm5, xmm6
    pavgb      xmm2, xmm4
    pavgb      xmm3, xmm5
    pavgb      xmm0, xmm2
    pavgb      xmm1, xmm3
    psadbw     xmm0, xmm7            // average 32 pixels to 4
    psadbw     xmm1, xmm7
    pshufd     xmm0, xmm0, 0xd8      // x1x0 -> xx01
    pshufd     xmm1, xmm1, 0x8d      // x3x2 -> 32xx
    por        xmm0, xmm1            // -> 3201
    psrlw      xmm0, 3               // sum of 8 bytes -> average
    packuswb   xmm0, xmm0
    packuswb   xmm0, xmm0
    movd       dword ptr [edi], xmm0
    lea        edi, [edi + 4]
    sub        ecx, 4
    ja         wloop

    popad
    ret
  }
}
#define HAS_SCALEROWDOWN34_SSSE3
// Point samples 32 pixels to 24 pixels (3/4 scale, no filtering).
// Produces three 8 byte values. For each 8 bytes, 16 bytes are read.
// Then shuffled to do the scaling.
// Note that movdqa+palign may be better than movdqu.
// Alignment requirement: iptr 16 byte aligned, optr 8 byte aligned.
__declspec(naked)
static void ScaleRowDown34_SSSE3(const uint8* iptr, int32 istride,
                                 uint8* orow, int32 owidth) {
  __asm {
    pushad
    mov        esi, [esp + 32 + 4]   // iptr
                                     // istride ignored
    mov        edi, [esp + 32 + 12]  // orow
    mov        ecx, [esp + 32 + 16]  // owidth
    movdqa     xmm3, _shuf0          // shuffle tables defined above
    movdqa     xmm4, _shuf1
    movdqa     xmm5, _shuf2

  wloop:
    movdqa     xmm0, [esi]
    movdqa     xmm2, [esi + 16]
    lea        esi, [esi + 32]
    movdqa     xmm1, xmm2
    palignr    xmm1, xmm0, 8         // xmm1 = source bytes 8..23
    pshufb     xmm0, xmm3
    pshufb     xmm1, xmm4
    pshufb     xmm2, xmm5
    movq       qword ptr [edi], xmm0
    movq       qword ptr [edi + 8], xmm1
    movq       qword ptr [edi + 16], xmm2
    lea        edi, [edi + 24]
    sub        ecx, 24
    ja         wloop

    popad
    ret
  }
}
// Blends 32x2 rectangle to 24x1 (3/4 scale with vertical blend of the two
// rows at equal 1:1 weight, then horizontal 3:1/2:2/1:3 filtering via
// pshufb + pmaddubsw, rounded and shifted down by 2).
// Produces three 8 byte values. For each 8 bytes, 16 bytes are read.
// Then shuffled to do the scaling.
// Register usage:
// xmm0 irow 0
// xmm1 irow 1
// xmm2 shuf 0
// xmm3 shuf 1
// xmm4 shuf 2
// xmm5 madd 0
// xmm6 madd 1
// xmm7 round34
// Note that movdqa+palign may be better than movdqu.
// Alignment requirement: iptr 16 byte aligned, optr 8 byte aligned.
__declspec(naked)
static void ScaleRowDown34_1_Int_SSSE3(const uint8* iptr, int32 istride,
                                       uint8* orow, int32 owidth) {
  __asm {
    pushad
    mov        esi, [esp + 32 + 4]   // iptr
    mov        ebx, [esp + 32 + 8]   // istride
    mov        edi, [esp + 32 + 12]  // orow
    mov        ecx, [esp + 32 + 16]  // owidth
    movdqa     xmm2, _shuf01
    movdqa     xmm3, _shuf11
    movdqa     xmm4, _shuf21
    movdqa     xmm5, _madd01
    movdqa     xmm6, _madd11
    movdqa     xmm7, _round34

  wloop:
    movdqa     xmm0, [esi]           // pixels 0..7
    movdqa     xmm1, [esi+ebx]
    pavgb      xmm0, xmm1            // 1:1 vertical blend
    pshufb     xmm0, xmm2
    pmaddubsw  xmm0, xmm5
    paddsw     xmm0, xmm7            // + rounding
    psrlw      xmm0, 2
    packuswb   xmm0, xmm0
    movq       qword ptr [edi], xmm0
    movdqu     xmm0, [esi+8]         // pixels 8..15
    movdqu     xmm1, [esi+ebx+8]
    pavgb      xmm0, xmm1
    pshufb     xmm0, xmm3
    pmaddubsw  xmm0, xmm6
    paddsw     xmm0, xmm7
    psrlw      xmm0, 2
    packuswb   xmm0, xmm0
    movq       qword ptr [edi+8], xmm0
    movdqa     xmm0, [esi+16]        // pixels 16..23
    movdqa     xmm1, [esi+ebx+16]
    lea        esi, [esi+32]
    pavgb      xmm0, xmm1
    pshufb     xmm0, xmm4
    movdqa     xmm1, _madd21         // no free register; reload each pass
    pmaddubsw  xmm0, xmm1
    paddsw     xmm0, xmm7
    psrlw      xmm0, 2
    packuswb   xmm0, xmm0
    movq       qword ptr [edi+16], xmm0
    lea        edi, [edi+24]
    sub        ecx, 24
    ja         wloop

    popad
    ret
  }
}
// Blends 32x2 rectangle to 24x1 like ScaleRowDown34_1_Int_SSSE3, but the
// double pavgb gives the rows a 3:1 weighting (row0*3 + row1) / 4 instead
// of the 1:1 blend, for output rows nearer the first source row.
// Note that movdqa+palign may be better than movdqu.
// Alignment requirement: iptr 16 byte aligned, optr 8 byte aligned.
__declspec(naked)
static void ScaleRowDown34_0_Int_SSSE3(const uint8* iptr, int32 istride,
                                       uint8* orow, int32 owidth) {
  __asm {
    pushad
    mov        esi, [esp + 32 + 4]   // iptr
    mov        ebx, [esp + 32 + 8]   // istride
    mov        edi, [esp + 32 + 12]  // orow
    mov        ecx, [esp + 32 + 16]  // owidth
    movdqa     xmm2, _shuf01
    movdqa     xmm3, _shuf11
    movdqa     xmm4, _shuf21
    movdqa     xmm5, _madd01
    movdqa     xmm6, _madd11
    movdqa     xmm7, _round34

  wloop:
    movdqa     xmm0, [esi]           // pixels 0..7
    movdqa     xmm1, [esi+ebx]
    pavgb      xmm1, xmm0            // xmm1 = (r0 + r1) / 2
    pavgb      xmm0, xmm1            // xmm0 = (3*r0 + r1) / 4
    pshufb     xmm0, xmm2
    pmaddubsw  xmm0, xmm5
    paddsw     xmm0, xmm7
    psrlw      xmm0, 2
    packuswb   xmm0, xmm0
    movq       qword ptr [edi], xmm0
    movdqu     xmm0, [esi+8]         // pixels 8..15
    movdqu     xmm1, [esi+ebx+8]
    pavgb      xmm1, xmm0
    pavgb      xmm0, xmm1
    pshufb     xmm0, xmm3
    pmaddubsw  xmm0, xmm6
    paddsw     xmm0, xmm7
    psrlw      xmm0, 2
    packuswb   xmm0, xmm0
    movq       qword ptr [edi+8], xmm0
    movdqa     xmm0, [esi+16]        // pixels 16..23
    movdqa     xmm1, [esi+ebx+16]
    lea        esi, [esi+32]
    pavgb      xmm1, xmm0
    pavgb      xmm0, xmm1
    pshufb     xmm0, xmm4
    movdqa     xmm1, _madd21
    pmaddubsw  xmm0, xmm1
    paddsw     xmm0, xmm7
    psrlw      xmm0, 2
    packuswb   xmm0, xmm0
    movq       qword ptr [edi+16], xmm0
    lea        edi, [edi+24]
    sub        ecx, 24
    ja         wloop

    popad
    ret
  }
}
#define HAS_SCALEROWDOWN38_SSSE3
// 3/8 point sampler
// Scale 32 pixels to 12: picks every ~8/3rd byte via two pshufb tables
// whose results are OR-combined (paddusb on disjoint lanes).
// istride is loaded but unused in this point-sampling path.
__declspec(naked)
static void ScaleRowDown38_SSSE3(const uint8* iptr, int32 istride,
                                 uint8* optr, int32 owidth) {
  __asm {
    pushad
    mov        esi, [esp + 32 + 4]   // iptr
    mov        edx, [esp + 32 + 8]   // istride
    mov        edi, [esp + 32 + 12]  // optr
    mov        ecx, [esp + 32 + 16]  // owidth
    movdqa     xmm5, _shuf38a
    movdqa     xmm6, _shuf38b
    pxor       xmm7, xmm7

  xloop:
    movdqa     xmm0, [esi]           // 16 pixels -> 0,1,2,3,4,5
    movdqa     xmm1, [esi + 16]      // 16 pixels -> 6,7,8,9,10,11
    lea        esi, [esi + 32]
    pshufb     xmm0, xmm5
    pshufb     xmm1, xmm6
    paddusb    xmm0, xmm1            // lanes are disjoint: acts as OR
    movq       qword ptr [edi], xmm0 // write 12 pixels
    movhlps    xmm1, xmm0
    movd       [edi + 8], xmm1
    lea        edi, [edi + 12]
    sub        ecx, 12
    ja         xloop

    popad
    ret
  }
}
// Scale 16x3 pixels to 6x1 with interpolation: sums 3 rows as words,
// box-sums groups of 3 (or 2 for the last) columns, then multiplies by
// fixed-point reciprocals (_scaleac3) to divide by the box area.
__declspec(naked)
static void ScaleRowDown38_3_Int_SSSE3(const uint8* iptr, int32 istride,
                                       uint8* optr, int32 owidth) {
  __asm {
    pushad
    mov        esi, [esp + 32 + 4]   // iptr
    mov        edx, [esp + 32 + 8]   // istride
    mov        edi, [esp + 32 + 12]  // optr
    mov        ecx, [esp + 32 + 16]  // owidth
    movdqa     xmm4, _shufac0
    movdqa     xmm5, _shufac3
    movdqa     xmm6, _scaleac3
    pxor       xmm7, xmm7            // zero for byte -> word unpack

  xloop:
    movdqa     xmm0, [esi]           // sum up 3 rows into xmm0/1
    movdqa     xmm2, [esi + edx]
    movhlps    xmm1, xmm0
    movhlps    xmm3, xmm2
    punpcklbw  xmm0, xmm7
    punpcklbw  xmm1, xmm7
    punpcklbw  xmm2, xmm7
    punpcklbw  xmm3, xmm7
    paddusw    xmm0, xmm2
    paddusw    xmm1, xmm3
    movdqa     xmm2, [esi + edx * 2]
    lea        esi, [esi + 16]
    movhlps    xmm3, xmm2
    punpcklbw  xmm2, xmm7
    punpcklbw  xmm3, xmm7
    paddusw    xmm0, xmm2
    paddusw    xmm1, xmm3
    movdqa     xmm2, xmm0            // 8 pixels -> 0,1,2 of xmm2
    psrldq     xmm0, 2               // shift-and-add sums adjacent words
    paddusw    xmm2, xmm0
    psrldq     xmm0, 2
    paddusw    xmm2, xmm0
    pshufb     xmm2, xmm4
    movdqa     xmm3, xmm1            // 8 pixels -> 3,4,5 of xmm2
    psrldq     xmm1, 2
    paddusw    xmm3, xmm1
    psrldq     xmm1, 2
    paddusw    xmm3, xmm1
    pshufb     xmm3, xmm5
    paddusw    xmm2, xmm3
    pmulhuw    xmm2, xmm6            // divide by 9,9,6, 9,9,6
    packuswb   xmm2, xmm2
    movd       [edi], xmm2           // write 6 pixels
    pextrw     eax, xmm2, 2
    mov        [edi + 4], ax
    lea        edi, [edi + 6]
    sub        ecx, 6
    ja         xloop

    popad
    ret
  }
}
// Scale 16x2 pixels to 6x1 with interpolation: pavgb blends the 2 rows,
// three shuffles gather the 3 (or 2) contributing columns per output
// pixel, and pmulhuw by _scaleab2 divides by the column count.
__declspec(naked)
static void ScaleRowDown38_2_Int_SSSE3(const uint8* iptr, int32 istride,
                                       uint8* optr, int32 owidth) {
  __asm {
    pushad
    mov        esi, [esp + 32 + 4]   // iptr
    mov        edx, [esp + 32 + 8]   // istride
    mov        edi, [esp + 32 + 12]  // optr
    mov        ecx, [esp + 32 + 16]  // owidth
    movdqa     xmm4, _shufab0
    movdqa     xmm5, _shufab1
    movdqa     xmm6, _shufab2
    movdqa     xmm7, _scaleab2

  xloop:
    movdqa     xmm2, [esi]           // average 2 rows into xmm2
    pavgb      xmm2, [esi + edx]
    lea        esi, [esi + 16]
    movdqa     xmm0, xmm2            // 16 pixels -> 0,1,2,3,4,5 of xmm0
    pshufb     xmm0, xmm4
    movdqa     xmm1, xmm2
    pshufb     xmm1, xmm5
    paddusw    xmm0, xmm1
    pshufb     xmm2, xmm6
    paddusw    xmm0, xmm2
    pmulhuw    xmm0, xmm7            // divide by 3,3,2, 3,3,2
    packuswb   xmm0, xmm0
    movd       [edi], xmm0           // write 6 pixels
    pextrw     eax, xmm0, 2
    mov        [edi + 4], ax
    lea        edi, [edi + 6]
    sub        ecx, 6
    ja         xloop

    popad
    ret
  }
}
#define HAS_SCALEADDROWS_SSE2
// Reads 16xN bytes and produces 16 word sums at a time: for each 16-pixel
// column strip, sums iheight rows into unsaturating-capped (paddusw)
// 16-bit accumulators, written as 32 bytes of shorts to orow.
__declspec(naked)
static void ScaleAddRows_SSE2(const uint8* iptr, int32 istride,
                              uint16* orow, int32 iwidth, int32 iheight) {
  __asm {
    pushad
    mov        esi, [esp + 32 + 4]   // iptr
    mov        edx, [esp + 32 + 8]   // istride
    mov        edi, [esp + 32 + 12]  // orow
    mov        ecx, [esp + 32 + 16]  // owidth
    mov        ebx, [esp + 32 + 20]  // height
    pxor       xmm7, xmm7
    dec        ebx                   // first row handled outside yloop

  xloop:
    // first row
    movdqa     xmm2, [esi]
    lea        eax, [esi + edx]      // eax walks down the column
    movhlps    xmm3, xmm2
    mov        ebp, ebx              // rows remaining (ebp restored by popad)
    punpcklbw  xmm2, xmm7
    punpcklbw  xmm3, xmm7

    // sum remaining rows
  yloop:
    movdqa     xmm0, [eax]           // read 16 pixels
    lea        eax, [eax + edx]      // advance to next row
    movhlps    xmm1, xmm0
    punpcklbw  xmm0, xmm7
    punpcklbw  xmm1, xmm7
    paddusw    xmm2, xmm0            // sum 16 words
    paddusw    xmm3, xmm1
    sub        ebp, 1
    ja         yloop

    movdqa     [edi], xmm2
    movdqa     [edi + 16], xmm3
    lea        edi, [edi + 32]
    lea        esi, [esi + 16]
    sub        ecx, 16
    ja         xloop

    popad
    ret
  }
}
// Bilinear row filtering combines 16x2 -> 16x1. SSE2 version.
#define HAS_SCALEFILTERROWS_SSE2
// Bilinear row filtering combines 16x2 -> 16x1. SSE2 version.
// Blends two source rows with weights (256 - f) : f where
// f = source_y_fraction in 0..255. Fast paths: f == 0 copies row 0,
// f == 128 uses pavgb. After the loop the last pixel is duplicated one
// byte past owidth (callers rely on this for the following column filter).
__declspec(naked)
static void ScaleFilterRows_SSE2(uint8* optr, const uint8* iptr0, int32 istride,
                                 int owidth, int source_y_fraction) {
  __asm {
    push       esi
    push       edi
    mov        edi, [esp + 8 + 4]    // optr
    mov        esi, [esp + 8 + 8]    // iptr0
    mov        edx, [esp + 8 + 12]   // istride
    mov        ecx, [esp + 8 + 16]   // owidth
    mov        eax, [esp + 8 + 20]   // source_y_fraction (0..255)
    cmp        eax, 0
    je         xloop1                // fraction 0: pure copy of row 0
    cmp        eax, 128
    je         xloop2                // fraction 1/2: single pavgb
    movd       xmm6, eax             // xmm6 = y fraction
    punpcklwd  xmm6, xmm6
    pshufd     xmm6, xmm6, 0
    neg        eax                   // xmm5 = 256 - y fraction
    add        eax, 256
    movd       xmm5, eax
    punpcklwd  xmm5, xmm5
    pshufd     xmm5, xmm5, 0
    pxor       xmm7, xmm7

  xloop:
    movdqa     xmm0, [esi]
    movdqa     xmm2, [esi + edx]
    lea        esi, [esi + 16]
    movdqa     xmm1, xmm0
    movdqa     xmm3, xmm2
    punpcklbw  xmm0, xmm7            // widen bytes to words
    punpcklbw  xmm2, xmm7
    punpckhbw  xmm1, xmm7
    punpckhbw  xmm3, xmm7
    pmullw     xmm0, xmm5            // scale row 0
    pmullw     xmm1, xmm5
    pmullw     xmm2, xmm6            // scale row 1
    pmullw     xmm3, xmm6
    paddusw    xmm0, xmm2            // sum rows
    paddusw    xmm1, xmm3
    psrlw      xmm0, 8
    psrlw      xmm1, 8
    packuswb   xmm0, xmm1
    movdqa     [edi], xmm0
    lea        edi, [edi + 16]
    sub        ecx, 16
    ja         xloop

    mov        al, [edi - 1]         // duplicate last pixel for filtering
    mov        [edi], al
    pop        edi
    pop        esi
    ret

  xloop1:
    movdqa     xmm0, [esi]
    lea        esi, [esi + 16]
    movdqa     [edi], xmm0
    lea        edi, [edi + 16]
    sub        ecx, 16
    ja         xloop1

    mov        al, [edi - 1]
    mov        [edi], al
    pop        edi
    pop        esi
    ret

  xloop2:
    movdqa     xmm0, [esi]
    movdqa     xmm2, [esi + edx]
    lea        esi, [esi + 16]
    pavgb      xmm0, xmm2
    movdqa     [edi], xmm0
    lea        edi, [edi + 16]
    sub        ecx, 16
    ja         xloop2

    mov        al, [edi - 1]
    mov        [edi], al
    pop        edi
    pop        esi
    ret
  }
}
// Bilinear row filtering combines 16x2 -> 16x1. SSSE3 version.
#define HAS_SCALEFILTERROWS_SSSE3
// Bilinear row filtering combines 16x2 -> 16x1. SSSE3 version.
// Packs the 7-bit weights (128 - f/2 : f/2) into byte pairs so pmaddubsw
// blends both rows in one instruction. Same fast paths and last-pixel
// duplication as the SSE2 version.
__declspec(naked)
static void ScaleFilterRows_SSSE3(uint8* optr, const uint8* iptr0, int32 istride,
                                  int owidth, int source_y_fraction) {
  __asm {
    push       esi
    push       edi
    mov        edi, [esp + 8 + 4]    // optr
    mov        esi, [esp + 8 + 8]    // iptr0
    mov        edx, [esp + 8 + 12]   // istride
    mov        ecx, [esp + 8 + 16]   // owidth
    mov        eax, [esp + 8 + 20]   // source_y_fraction (0..255)
    cmp        eax, 0
    je         xloop1                // fraction 0: pure copy of row 0
    cmp        eax, 128
    je         xloop2                // fraction 1/2: single pavgb
    shr        eax, 1                // 7-bit fraction
    mov        ah,al                 // ah = f/2 (row 1 weight)
    neg        al
    add        al, 128               // al = 128 - f/2 (row 0 weight)
    movd       xmm7, eax
    punpcklwd  xmm7, xmm7
    pshufd     xmm7, xmm7, 0

  xloop:
    movdqa     xmm0, [esi]
    movdqa     xmm2, [esi + edx]
    lea        esi, [esi + 16]
    movdqa     xmm1, xmm0
    punpcklbw  xmm0, xmm2            // interleave the two rows byte-wise
    punpckhbw  xmm1, xmm2
    pmaddubsw  xmm0, xmm7            // weighted sum of the row pair
    pmaddubsw  xmm1, xmm7
    psrlw      xmm0, 7
    psrlw      xmm1, 7
    packuswb   xmm0, xmm1
    movdqa     [edi], xmm0
    lea        edi, [edi + 16]
    sub        ecx, 16
    ja         xloop

    mov        al, [edi - 1]         // duplicate last pixel for filtering
    mov        [edi], al
    pop        edi
    pop        esi
    ret

  xloop1:
    movdqa     xmm0, [esi]
    lea        esi, [esi + 16]
    movdqa     [edi], xmm0
    lea        edi, [edi + 16]
    sub        ecx, 16
    ja         xloop1

    mov        al, [edi - 1]
    mov        [edi], al
    pop        edi
    pop        esi
    ret

  xloop2:
    movdqa     xmm0, [esi]
    movdqa     xmm2, [esi + edx]
    lea        esi, [esi + 16]
    pavgb      xmm0, xmm2
    movdqa     [edi], xmm0
    lea        edi, [edi + 16]
    sub        ecx, 16
    ja         xloop2

    mov        al, [edi - 1]
    mov        [edi], al
    pop        edi
    pop        esi
    ret
  }
}
// Horizontal-only 3/4 column filter: 32 source pixels -> 24 output pixels
// using the same shuf/madd tables as the row-pair versions, but with no
// vertical blend (single row in, single row out).
// Note that movdqa+palign may be better than movdqu.
// Alignment requirement: iptr 16 byte aligned, optr 8 byte aligned.
__declspec(naked)
static void ScaleFilterCols34_SSSE3(uint8* optr, const uint8* iptr,
                                    int owidth) {
  __asm {
    mov        edx, [esp + 4]        // optr
    mov        eax, [esp + 8]        // iptr
    mov        ecx, [esp + 12]       // owidth
    movdqa     xmm1, _round34
    movdqa     xmm2, _shuf01
    movdqa     xmm3, _shuf11
    movdqa     xmm4, _shuf21
    movdqa     xmm5, _madd01
    movdqa     xmm6, _madd11
    movdqa     xmm7, _madd21

  wloop:
    movdqa     xmm0, [eax]           // pixels 0..7
    pshufb     xmm0, xmm2
    pmaddubsw  xmm0, xmm5
    paddsw     xmm0, xmm1
    psrlw      xmm0, 2
    packuswb   xmm0, xmm0
    movq       qword ptr [edx], xmm0
    movdqu     xmm0, [eax+8]         // pixels 8..15
    pshufb     xmm0, xmm3
    pmaddubsw  xmm0, xmm6
    paddsw     xmm0, xmm1
    psrlw      xmm0, 2
    packuswb   xmm0, xmm0
    movq       qword ptr [edx+8], xmm0
    movdqa     xmm0, [eax+16]        // pixels 16..23
    lea        eax, [eax+32]
    pshufb     xmm0, xmm4
    pmaddubsw  xmm0, xmm7
    paddsw     xmm0, xmm1
    psrlw      xmm0, 2
    packuswb   xmm0, xmm0
    movq       qword ptr [edx+16], xmm0
    lea        edx, [edx+24]
    sub        ecx, 24
    ja         wloop

    ret
  }
}
#elif defined(__i386__) && !defined(COVERAGE_ENABLED) && \
!TARGET_IPHONE_SIMULATOR
// GCC versions of row functions are verbatim conversions from Visual C.
// Generated using gcc disassembly on Visual C object file:
// objdump -D yuvscaler.obj >yuvscaler.txt
#define HAS_SCALEROWDOWN2_SSE2
// GCC/AT&T conversion of the MSVC ScaleRowDown2_SSE2 above: point samples
// 32 pixels to 16 by masking the even bytes. istride is unused.
extern "C" void ScaleRowDown2_SSE2(const uint8* iptr, int32 istride,
                                   uint8* orow, int32 owidth);
asm(
  ".text\n"
#if defined(OSX)
  ".globl _ScaleRowDown2_SSE2\n"
"_ScaleRowDown2_SSE2:\n"
#else
  ".global ScaleRowDown2_SSE2\n"
"ScaleRowDown2_SSE2:\n"
#endif
  "mov    0x4(%esp),%eax\n"          // iptr
  "mov    0xc(%esp),%edx\n"          // orow
  "mov    0x10(%esp),%ecx\n"         // owidth
  "pcmpeqb %xmm7,%xmm7\n"            // mask 0x00ff00ff
  "psrlw  $0x8,%xmm7\n"
"1:"
  "movdqa (%eax),%xmm0\n"
  "movdqa 0x10(%eax),%xmm1\n"
  "lea    0x20(%eax),%eax\n"
  "pand   %xmm7,%xmm0\n"
  "pand   %xmm7,%xmm1\n"
  "packuswb %xmm1,%xmm0\n"
  "movdqa %xmm0,(%edx)\n"
  "lea    0x10(%edx),%edx\n"
  "sub    $0x10,%ecx\n"
  "ja     1b\n"
  "ret\n"
);
// GCC/AT&T conversion of the MSVC ScaleRowDown2Int_SSE2 above: box-filters
// a 32x2 rectangle to 16x1 (row pavgb, then column pavgw).
extern "C" void ScaleRowDown2Int_SSE2(const uint8* iptr, int32 istride,
                                      uint8* orow, int32 owidth);
asm(
  ".text\n"
#if defined(OSX)
  ".globl _ScaleRowDown2Int_SSE2\n"
"_ScaleRowDown2Int_SSE2:\n"
#else
  ".global ScaleRowDown2Int_SSE2\n"
"ScaleRowDown2Int_SSE2:\n"
#endif
  "push   %esi\n"
  "mov    0x8(%esp),%eax\n"          // iptr
  "mov    0xc(%esp),%esi\n"          // istride
  "mov    0x10(%esp),%edx\n"         // orow
  "mov    0x14(%esp),%ecx\n"         // owidth
  "pcmpeqb %xmm7,%xmm7\n"            // mask 0x00ff00ff
  "psrlw  $0x8,%xmm7\n"
"1:"
  "movdqa (%eax),%xmm0\n"
  "movdqa 0x10(%eax),%xmm1\n"
  "movdqa (%eax,%esi,1),%xmm2\n"     // second row
  "movdqa 0x10(%eax,%esi,1),%xmm3\n"
  "lea    0x20(%eax),%eax\n"
  "pavgb  %xmm2,%xmm0\n"             // average rows
  "pavgb  %xmm3,%xmm1\n"
  "movdqa %xmm0,%xmm2\n"             // average columns
  "psrlw  $0x8,%xmm0\n"
  "movdqa %xmm1,%xmm3\n"
  "psrlw  $0x8,%xmm1\n"
  "pand   %xmm7,%xmm2\n"
  "pand   %xmm7,%xmm3\n"
  "pavgw  %xmm2,%xmm0\n"
  "pavgw  %xmm3,%xmm1\n"
  "packuswb %xmm1,%xmm0\n"
  "movdqa %xmm0,(%edx)\n"
  "lea    0x10(%edx),%edx\n"
  "sub    $0x10,%ecx\n"
  "ja     1b\n"
  "pop    %esi\n"
  "ret\n"
);
#define HAS_SCALEROWDOWN4_SSE2
// GCC/AT&T conversion of the MSVC ScaleRowDown4_SSE2 above: point samples
// 32 pixels to 8 (keeps byte 0 of every dword). istride is unused.
extern "C" void ScaleRowDown4_SSE2(const uint8* iptr, int32 istride,
                                   uint8* orow, int32 owidth);
asm(
  ".text\n"
#if defined(OSX)
  ".globl _ScaleRowDown4_SSE2\n"
"_ScaleRowDown4_SSE2:\n"
#else
  ".global ScaleRowDown4_SSE2\n"
"ScaleRowDown4_SSE2:\n"
#endif
  "pusha\n"
  "mov    0x24(%esp),%esi\n"         // iptr
  "mov    0x2c(%esp),%edi\n"         // orow
  "mov    0x30(%esp),%ecx\n"         // owidth
  "pcmpeqb %xmm7,%xmm7\n"            // mask 0x000000ff
  "psrld  $0x18,%xmm7\n"
"1:"
  "movdqa (%esi),%xmm0\n"
  "movdqa 0x10(%esi),%xmm1\n"
  "lea    0x20(%esi),%esi\n"
  "pand   %xmm7,%xmm0\n"
  "pand   %xmm7,%xmm1\n"
  "packuswb %xmm1,%xmm0\n"
  "packuswb %xmm0,%xmm0\n"
  "movq   %xmm0,(%edi)\n"
  "lea    0x8(%edi),%edi\n"
  "sub    $0x8,%ecx\n"
  "ja     1b\n"
  "popa\n"
  "ret\n"
);
// GCC/AT&T conversion of the MSVC ScaleRowDown4Int_SSE2 above: blends a
// 32x4 rectangle to 8x1 (row pavgb tree, then two column-average passes).
extern "C" void ScaleRowDown4Int_SSE2(const uint8* iptr, int32 istride,
                                      uint8* orow, int32 owidth);
asm(
  ".text\n"
#if defined(OSX)
  ".globl _ScaleRowDown4Int_SSE2\n"
"_ScaleRowDown4Int_SSE2:\n"
#else
  ".global ScaleRowDown4Int_SSE2\n"
"ScaleRowDown4Int_SSE2:\n"
#endif
  "pusha\n"
  "mov    0x24(%esp),%esi\n"         // iptr
  "mov    0x28(%esp),%ebx\n"         // istride
  "mov    0x2c(%esp),%edi\n"         // orow
  "mov    0x30(%esp),%ecx\n"         // owidth
  "pcmpeqb %xmm7,%xmm7\n"            // mask 0x00ff00ff
  "psrlw  $0x8,%xmm7\n"
  "lea    (%ebx,%ebx,2),%edx\n"      // istride * 3
"1:"
  "movdqa (%esi),%xmm0\n"
  "movdqa 0x10(%esi),%xmm1\n"
  "movdqa (%esi,%ebx,1),%xmm2\n"
  "movdqa 0x10(%esi,%ebx,1),%xmm3\n"
  "pavgb  %xmm2,%xmm0\n"             // average rows
  "pavgb  %xmm3,%xmm1\n"
  "movdqa (%esi,%ebx,2),%xmm2\n"
  "movdqa 0x10(%esi,%ebx,2),%xmm3\n"
  "movdqa (%esi,%edx,1),%xmm4\n"
  "movdqa 0x10(%esi,%edx,1),%xmm5\n"
  "lea    0x20(%esi),%esi\n"
  "pavgb  %xmm4,%xmm2\n"
  "pavgb  %xmm2,%xmm0\n"
  "pavgb  %xmm5,%xmm3\n"
  "pavgb  %xmm3,%xmm1\n"
  "movdqa %xmm0,%xmm2\n"             // average columns (32 to 16)
  "psrlw  $0x8,%xmm0\n"
  "movdqa %xmm1,%xmm3\n"
  "psrlw  $0x8,%xmm1\n"
  "pand   %xmm7,%xmm2\n"
  "pand   %xmm7,%xmm3\n"
  "pavgw  %xmm2,%xmm0\n"
  "pavgw  %xmm3,%xmm1\n"
  "packuswb %xmm1,%xmm0\n"
  "movdqa %xmm0,%xmm2\n"             // average columns (16 to 8)
  "psrlw  $0x8,%xmm0\n"
  "pand   %xmm7,%xmm2\n"
  "pavgw  %xmm2,%xmm0\n"
  "packuswb %xmm0,%xmm0\n"
  "movq   %xmm0,(%edi)\n"
  "lea    0x8(%edi),%edi\n"
  "sub    $0x8,%ecx\n"
  "ja     1b\n"
  "popa\n"
  "ret\n"
);
#define HAS_SCALEROWDOWN8_SSE2
// GCC/AT&T conversion of the MSVC ScaleRowDown8_SSE2 above: point samples
// 32 pixels to 4 (keeps byte 0 of every qword). istride is unused.
extern "C" void ScaleRowDown8_SSE2(const uint8* iptr, int32 istride,
                                   uint8* orow, int32 owidth);
asm(
  ".text\n"
#if defined(OSX)
  ".globl _ScaleRowDown8_SSE2\n"
"_ScaleRowDown8_SSE2:\n"
#else
  ".global ScaleRowDown8_SSE2\n"
"ScaleRowDown8_SSE2:\n"
#endif
  "pusha\n"
  "mov    0x24(%esp),%esi\n"         // iptr
  "mov    0x2c(%esp),%edi\n"         // orow
  "mov    0x30(%esp),%ecx\n"         // owidth
  "pcmpeqb %xmm7,%xmm7\n"            // mask isolating 1 in 8 bytes
  "psrlq  $0x38,%xmm7\n"
"1:"
  "movdqa (%esi),%xmm0\n"
  "movdqa 0x10(%esi),%xmm1\n"
  "lea    0x20(%esi),%esi\n"
  "pand   %xmm7,%xmm0\n"
  "pand   %xmm7,%xmm1\n"
  "packuswb %xmm1,%xmm0\n"           // 32->16
  "packuswb %xmm0,%xmm0\n"           // 16->8
  "packuswb %xmm0,%xmm0\n"           // 8->4
  "movd   %xmm0,(%edi)\n"
  "lea    0x4(%edi),%edi\n"
  "sub    $0x4,%ecx\n"
  "ja     1b\n"
  "popa\n"
  "ret\n"
);
// GCC/AT&T conversion of the MSVC ScaleRowDown8Int_SSE2 above: blends a
// 32x8 rectangle to 4x1 (row pavgb tree, then psadbw horizontal sums).
extern "C" void ScaleRowDown8Int_SSE2(const uint8* iptr, int32 istride,
                                      uint8* orow, int32 owidth);
asm(
  ".text\n"
#if defined(OSX)
  ".globl _ScaleRowDown8Int_SSE2\n"
"_ScaleRowDown8Int_SSE2:\n"
#else
  ".global ScaleRowDown8Int_SSE2\n"
"ScaleRowDown8Int_SSE2:\n"
#endif
  "pusha\n"
  "mov    0x24(%esp),%esi\n"         // iptr
  "mov    0x28(%esp),%ebx\n"         // istride
  "mov    0x2c(%esp),%edi\n"         // orow
  "mov    0x30(%esp),%ecx\n"         // owidth
  "lea    (%ebx,%ebx,2),%edx\n"      // istride * 3
  "pxor   %xmm7,%xmm7\n"             // zero for psadbw
"1:"
  "movdqa (%esi),%xmm0\n"
  "movdqa 0x10(%esi),%xmm1\n"
  "movdqa (%esi,%ebx,1),%xmm2\n"
  "movdqa 0x10(%esi,%ebx,1),%xmm3\n"
  "pavgb  %xmm2,%xmm0\n"
  "pavgb  %xmm3,%xmm1\n"
  "movdqa (%esi,%ebx,2),%xmm2\n"
  "movdqa 0x10(%esi,%ebx,2),%xmm3\n"
  "movdqa (%esi,%edx,1),%xmm4\n"
  "movdqa 0x10(%esi,%edx,1),%xmm5\n"
  "lea    (%esi,%ebx,4),%ebp\n"      // ebp -> rows 4..7 (restored by popa)
  "lea    0x20(%esi),%esi\n"
  "pavgb  %xmm4,%xmm2\n"
  "pavgb  %xmm5,%xmm3\n"
  "pavgb  %xmm2,%xmm0\n"
  "pavgb  %xmm3,%xmm1\n"
  "movdqa 0x0(%ebp),%xmm2\n"
  "movdqa 0x10(%ebp),%xmm3\n"
  "movdqa 0x0(%ebp,%ebx,1),%xmm4\n"
  "movdqa 0x10(%ebp,%ebx,1),%xmm5\n"
  "pavgb  %xmm4,%xmm2\n"
  "pavgb  %xmm5,%xmm3\n"
  "movdqa 0x0(%ebp,%ebx,2),%xmm4\n"
  "movdqa 0x10(%ebp,%ebx,2),%xmm5\n"
  "movdqa 0x0(%ebp,%edx,1),%xmm6\n"
  "pavgb  %xmm6,%xmm4\n"
  "movdqa 0x10(%ebp,%edx,1),%xmm6\n"
  "pavgb  %xmm6,%xmm5\n"
  "pavgb  %xmm4,%xmm2\n"
  "pavgb  %xmm5,%xmm3\n"
  "pavgb  %xmm2,%xmm0\n"
  "pavgb  %xmm3,%xmm1\n"
  "psadbw %xmm7,%xmm0\n"             // average 32 pixels to 4
  "psadbw %xmm7,%xmm1\n"
  "pshufd $0xd8,%xmm0,%xmm0\n"       // x1x0 -> xx01
  "pshufd $0x8d,%xmm1,%xmm1\n"       // x3x2 -> 32xx
  "por    %xmm1,%xmm0\n"             // -> 3201
  "psrlw  $0x3,%xmm0\n"
  "packuswb %xmm0,%xmm0\n"
  "packuswb %xmm0,%xmm0\n"
  "movd   %xmm0,(%edi)\n"
  "lea    0x4(%edi),%edi\n"
  "sub    $0x4,%ecx\n"
  "ja     1b\n"
  "popa\n"
  "ret\n"
);
// fpic is used for magiccam plugin
#if !defined(__PIC__)
#define HAS_SCALEROWDOWN34_SSSE3
// GCC/AT&T conversion of the MSVC ScaleRowDown34_SSSE3 above: point
// samples 32 pixels to 24 using the _shuf0/1/2 tables. istride is unused.
// Only built without -fPIC: the absolute _shuf references are not
// position independent.
extern "C" void ScaleRowDown34_SSSE3(const uint8* iptr, int32 istride,
                                     uint8* orow, int32 owidth);
asm(
  ".text\n"
#if defined(OSX)
  ".globl _ScaleRowDown34_SSSE3\n"
"_ScaleRowDown34_SSSE3:\n"
#else
  ".global ScaleRowDown34_SSSE3\n"
"ScaleRowDown34_SSSE3:\n"
#endif
  "pusha\n"
  "mov    0x24(%esp),%esi\n"         // iptr
  "mov    0x2c(%esp),%edi\n"         // orow
  "mov    0x30(%esp),%ecx\n"         // owidth
  "movdqa _shuf0,%xmm3\n"
  "movdqa _shuf1,%xmm4\n"
  "movdqa _shuf2,%xmm5\n"
"1:"
  "movdqa (%esi),%xmm0\n"
  "movdqa 0x10(%esi),%xmm2\n"
  "lea    0x20(%esi),%esi\n"
  "movdqa %xmm2,%xmm1\n"
  "palignr $0x8,%xmm0,%xmm1\n"       // xmm1 = source bytes 8..23
  "pshufb %xmm3,%xmm0\n"
  "pshufb %xmm4,%xmm1\n"
  "pshufb %xmm5,%xmm2\n"
  "movq   %xmm0,(%edi)\n"
  "movq   %xmm1,0x8(%edi)\n"
  "movq   %xmm2,0x10(%edi)\n"
  "lea    0x18(%edi),%edi\n"
  "sub    $0x18,%ecx\n"
  "ja     1b\n"
  "popa\n"
  "ret\n"
);
// GCC/AT&T conversion of the MSVC ScaleRowDown34_1_Int_SSSE3 above:
// blends 32x2 to 24x1 with a 1:1 vertical pavgb and 3/4 horizontal filter.
extern "C" void ScaleRowDown34_1_Int_SSSE3(const uint8* iptr, int32 istride,
                                           uint8* orow, int32 owidth);
asm(
  ".text\n"
#if defined(OSX)
  ".globl _ScaleRowDown34_1_Int_SSSE3\n"
"_ScaleRowDown34_1_Int_SSSE3:\n"
#else
  ".global ScaleRowDown34_1_Int_SSSE3\n"
"ScaleRowDown34_1_Int_SSSE3:\n"
#endif
  "pusha\n"
  "mov    0x24(%esp),%esi\n"         // iptr
  "mov    0x28(%esp),%ebp\n"         // istride (ebp restored by popa)
  "mov    0x2c(%esp),%edi\n"         // orow
  "mov    0x30(%esp),%ecx\n"         // owidth
  "movdqa _shuf01,%xmm2\n"
  "movdqa _shuf11,%xmm3\n"
  "movdqa _shuf21,%xmm4\n"
  "movdqa _madd01,%xmm5\n"
  "movdqa _madd11,%xmm6\n"
  "movdqa _round34,%xmm7\n"
"1:"
  "movdqa (%esi),%xmm0\n"            // pixels 0..7
  "movdqa (%esi,%ebp),%xmm1\n"
  "pavgb  %xmm1,%xmm0\n"             // 1:1 vertical blend
  "pshufb %xmm2,%xmm0\n"
  "pmaddubsw %xmm5,%xmm0\n"
  "paddsw %xmm7,%xmm0\n"
  "psrlw  $0x2,%xmm0\n"
  "packuswb %xmm0,%xmm0\n"
  "movq   %xmm0,(%edi)\n"
  "movdqu 0x8(%esi),%xmm0\n"         // pixels 8..15
  "movdqu 0x8(%esi,%ebp),%xmm1\n"
  "pavgb  %xmm1,%xmm0\n"
  "pshufb %xmm3,%xmm0\n"
  "pmaddubsw %xmm6,%xmm0\n"
  "paddsw %xmm7,%xmm0\n"
  "psrlw  $0x2,%xmm0\n"
  "packuswb %xmm0,%xmm0\n"
  "movq   %xmm0,0x8(%edi)\n"
  "movdqa 0x10(%esi),%xmm0\n"        // pixels 16..23
  "movdqa 0x10(%esi,%ebp),%xmm1\n"
  "lea    0x20(%esi),%esi\n"
  "pavgb  %xmm1,%xmm0\n"
  "pshufb %xmm4,%xmm0\n"
  "movdqa _madd21,%xmm1\n"           // no free register; reload each pass
  "pmaddubsw %xmm1,%xmm0\n"
  "paddsw %xmm7,%xmm0\n"
  "psrlw  $0x2,%xmm0\n"
  "packuswb %xmm0,%xmm0\n"
  "movq   %xmm0,0x10(%edi)\n"
  "lea    0x18(%edi),%edi\n"
  "sub    $0x18,%ecx\n"
  "ja     1b\n"
  "popa\n"
  "ret\n"
);
// GCC/AT&T conversion of the MSVC ScaleRowDown34_0_Int_SSSE3 above:
// same as the _1_ variant but the double pavgb weights the rows 3:1.
extern "C" void ScaleRowDown34_0_Int_SSSE3(const uint8* iptr, int32 istride,
                                           uint8* orow, int32 owidth);
asm(
  ".text\n"
#if defined(OSX)
  ".globl _ScaleRowDown34_0_Int_SSSE3\n"
"_ScaleRowDown34_0_Int_SSSE3:\n"
#else
  ".global ScaleRowDown34_0_Int_SSSE3\n"
"ScaleRowDown34_0_Int_SSSE3:\n"
#endif
  "pusha\n"
  "mov    0x24(%esp),%esi\n"         // iptr
  "mov    0x28(%esp),%ebp\n"         // istride (ebp restored by popa)
  "mov    0x2c(%esp),%edi\n"         // orow
  "mov    0x30(%esp),%ecx\n"         // owidth
  "movdqa _shuf01,%xmm2\n"
  "movdqa _shuf11,%xmm3\n"
  "movdqa _shuf21,%xmm4\n"
  "movdqa _madd01,%xmm5\n"
  "movdqa _madd11,%xmm6\n"
  "movdqa _round34,%xmm7\n"
"1:"
  "movdqa (%esi),%xmm0\n"            // pixels 0..7
  "movdqa (%esi,%ebp,1),%xmm1\n"
  "pavgb  %xmm0,%xmm1\n"             // xmm1 = (r0 + r1) / 2
  "pavgb  %xmm1,%xmm0\n"             // xmm0 = (3*r0 + r1) / 4
  "pshufb %xmm2,%xmm0\n"
  "pmaddubsw %xmm5,%xmm0\n"
  "paddsw %xmm7,%xmm0\n"
  "psrlw  $0x2,%xmm0\n"
  "packuswb %xmm0,%xmm0\n"
  "movq   %xmm0,(%edi)\n"
  "movdqu 0x8(%esi),%xmm0\n"         // pixels 8..15
  "movdqu 0x8(%esi,%ebp,1),%xmm1\n"
  "pavgb  %xmm0,%xmm1\n"
  "pavgb  %xmm1,%xmm0\n"
  "pshufb %xmm3,%xmm0\n"
  "pmaddubsw %xmm6,%xmm0\n"
  "paddsw %xmm7,%xmm0\n"
  "psrlw  $0x2,%xmm0\n"
  "packuswb %xmm0,%xmm0\n"
  "movq   %xmm0,0x8(%edi)\n"
  "movdqa 0x10(%esi),%xmm0\n"        // pixels 16..23
  "movdqa 0x10(%esi,%ebp,1),%xmm1\n"
  "lea    0x20(%esi),%esi\n"
  "pavgb  %xmm0,%xmm1\n"
  "pavgb  %xmm1,%xmm0\n"
  "pshufb %xmm4,%xmm0\n"
  "movdqa _madd21,%xmm1\n"
  "pmaddubsw %xmm1,%xmm0\n"
  "paddsw %xmm7,%xmm0\n"
  "psrlw  $0x2,%xmm0\n"
  "packuswb %xmm0,%xmm0\n"
  "movq   %xmm0,0x10(%edi)\n"
  "lea    0x18(%edi),%edi\n"
  "sub    $0x18,%ecx\n"
  "ja     1b\n"
  "popa\n"
  "ret\n"
);
#define HAS_SCALEROWDOWN38_SSSE3
extern "C" void ScaleRowDown38_SSSE3(const uint8* iptr, int32 istride,
                                     uint8* optr, int32 owidth);
// 3/8 point-sampling scale (no filtering; istride is loaded but unused).
// Two shuffles gather the sampled pixels from each 16-byte half of 32
// source bytes; paddusb merges the halves into 12 output pixels per loop.
asm(
  ".text\n"
#if defined(OSX)
  ".globl _ScaleRowDown38_SSSE3\n"
  "_ScaleRowDown38_SSSE3:\n"
#else
  ".global ScaleRowDown38_SSSE3\n"
  "ScaleRowDown38_SSSE3:\n"
#endif
  "pusha\n"
  "mov 0x24(%esp),%esi\n"          // esi = iptr
  "mov 0x28(%esp),%edx\n"          // edx = istride (unused here)
  "mov 0x2c(%esp),%edi\n"          // edi = optr
  "mov 0x30(%esp),%ecx\n"          // ecx = owidth (counts down by 12)
  "movdqa _shuf38a ,%xmm5\n"       // gather shuffle for the low 16 bytes
  "movdqa _shuf38b ,%xmm6\n"       // gather shuffle for the high 16 bytes
  "pxor %xmm7,%xmm7\n"
  "1:"
  "movdqa (%esi),%xmm0\n"
  "movdqa 0x10(%esi),%xmm1\n"
  "lea 0x20(%esi),%esi\n"          // advance source by 32
  "pshufb %xmm5,%xmm0\n"
  "pshufb %xmm6,%xmm1\n"
  "paddusb %xmm1,%xmm0\n"          // combine the two gathered halves
  "movq %xmm0,(%edi)\n"            // 8 output pixels
  "movhlps %xmm0,%xmm1\n"
  "movd %xmm1,0x8(%edi)\n"         // 4 more output pixels
  "lea 0xc(%edi),%edi\n"           // advance dest by 12
  "sub $0xc,%ecx\n"
  "ja 1b\n"
  "popa\n"
  "ret\n"
);
extern "C" void ScaleRowDown38_3_Int_SSSE3(const uint8* iptr, int32 istride,
                                           uint8* optr, int32 owidth);
// 3/8 scale with a 3-row box filter: widens 3 rows of 16 pixels to 16-bit,
// sums them vertically, folds adjacent columns via shifted adds/shuffles,
// then multiplies by fixed-point reciprocals (_scaleac3) to average.
// 16 source pixels -> 6 output pixels per iteration.
asm(
  ".text\n"
#if defined(OSX)
  ".globl _ScaleRowDown38_3_Int_SSSE3\n"
  "_ScaleRowDown38_3_Int_SSSE3:\n"
#else
  ".global ScaleRowDown38_3_Int_SSSE3\n"
  "ScaleRowDown38_3_Int_SSSE3:\n"
#endif
  "pusha\n"
  "mov 0x24(%esp),%esi\n"          // esi = iptr
  "mov 0x28(%esp),%edx\n"          // edx = istride
  "mov 0x2c(%esp),%edi\n"          // edi = optr
  "mov 0x30(%esp),%ecx\n"          // ecx = owidth (counts down by 6)
  "movdqa _shufac0,%xmm4\n"
  "movdqa _shufac3,%xmm5\n"
  "movdqa _scaleac3,%xmm6\n"       // 16-bit reciprocals of the box sizes
  "pxor %xmm7,%xmm7\n"             // zero, for byte -> word unpacks
  "1:"
  "movdqa (%esi),%xmm0\n"          // row 0
  "movdqa (%esi,%edx,1),%xmm2\n"   // row 1
  "movhlps %xmm0,%xmm1\n"
  "movhlps %xmm2,%xmm3\n"
  "punpcklbw %xmm7,%xmm0\n"        // widen both rows to 16-bit
  "punpcklbw %xmm7,%xmm1\n"
  "punpcklbw %xmm7,%xmm2\n"
  "punpcklbw %xmm7,%xmm3\n"
  "paddusw %xmm2,%xmm0\n"          // row0 + row1
  "paddusw %xmm3,%xmm1\n"
  "movdqa (%esi,%edx,2),%xmm2\n"   // row 2
  "lea 0x10(%esi),%esi\n"          // advance source by 16
  "movhlps %xmm2,%xmm3\n"
  "punpcklbw %xmm7,%xmm2\n"
  "punpcklbw %xmm7,%xmm3\n"
  "paddusw %xmm2,%xmm0\n"          // + row2
  "paddusw %xmm3,%xmm1\n"
  "movdqa %xmm0,%xmm2\n"
  "psrldq $0x2,%xmm0\n"            // fold neighboring column sums together
  "paddusw %xmm0,%xmm2\n"
  "psrldq $0x2,%xmm0\n"
  "paddusw %xmm0,%xmm2\n"
  "pshufb %xmm4,%xmm2\n"
  "movdqa %xmm1,%xmm3\n"
  "psrldq $0x2,%xmm1\n"
  "paddusw %xmm1,%xmm3\n"
  "psrldq $0x2,%xmm1\n"
  "paddusw %xmm1,%xmm3\n"
  "pshufb %xmm5,%xmm3\n"
  "paddusw %xmm3,%xmm2\n"
  "pmulhuw %xmm6,%xmm2\n"          // sum * reciprocal = average
  "packuswb %xmm2,%xmm2\n"
  "movd %xmm2,(%edi)\n"            // 4 output pixels
  "pextrw $0x2,%xmm2,%eax\n"
  "mov %ax,0x4(%edi)\n"            // 2 more output pixels
  "lea 0x6(%edi),%edi\n"           // advance dest by 6
  "sub $0x6,%ecx\n"
  "ja 1b\n"
  "popa\n"
  "ret\n"
);
extern "C" void ScaleRowDown38_2_Int_SSSE3(const uint8* iptr, int32 istride,
                                           uint8* optr, int32 owidth);
// 3/8 scale with a 2-row box filter: pavgb averages the two rows, three
// shuffles split the result into the columns of each output box, and the
// column sums are scaled by fixed-point reciprocals (_scaleab2).
// 16 source pixels -> 6 output pixels per iteration.
asm(
  ".text\n"
#if defined(OSX)
  ".globl _ScaleRowDown38_2_Int_SSSE3\n"
  "_ScaleRowDown38_2_Int_SSSE3:\n"
#else
  ".global ScaleRowDown38_2_Int_SSSE3\n"
  "ScaleRowDown38_2_Int_SSSE3:\n"
#endif
  "pusha\n"
  "mov 0x24(%esp),%esi\n"          // esi = iptr
  "mov 0x28(%esp),%edx\n"          // edx = istride
  "mov 0x2c(%esp),%edi\n"          // edi = optr
  "mov 0x30(%esp),%ecx\n"          // ecx = owidth (counts down by 6)
  "movdqa _shufab0,%xmm4\n"
  "movdqa _shufab1,%xmm5\n"
  "movdqa _shufab2,%xmm6\n"
  "movdqa _scaleab2,%xmm7\n"       // 16-bit reciprocals of the box sizes
  "1:"
  "movdqa (%esi),%xmm2\n"
  "pavgb (%esi,%edx,1),%xmm2\n"    // vertical 1:1 average of the two rows
  "lea 0x10(%esi),%esi\n"          // advance source by 16
  "movdqa %xmm2,%xmm0\n"
  "pshufb %xmm4,%xmm0\n"           // first column of each output box
  "movdqa %xmm2,%xmm1\n"
  "pshufb %xmm5,%xmm1\n"           // second column
  "paddusw %xmm1,%xmm0\n"
  "pshufb %xmm6,%xmm2\n"           // third column
  "paddusw %xmm2,%xmm0\n"
  "pmulhuw %xmm7,%xmm0\n"          // sum * reciprocal = average
  "packuswb %xmm0,%xmm0\n"
  "movd %xmm0,(%edi)\n"            // 4 output pixels
  "pextrw $0x2,%xmm0,%eax\n"
  "mov %ax,0x4(%edi)\n"            // 2 more output pixels
  "lea 0x6(%edi),%edi\n"           // advance dest by 6
  "sub $0x6,%ecx\n"
  "ja 1b\n"
  "popa\n"
  "ret\n"
);
#endif // __PIC__
#define HAS_SCALEADDROWS_SSE2
extern "C" void ScaleAddRows_SSE2(const uint8* iptr, int32 istride,
                                  uint16* orow, int32 iwidth, int32 iheight);
// Sums iheight rows vertically into a row of 16-bit totals (feeds the box
// filter's column pass). The outer loop covers 16 pixels at a time; the
// inner loop adds the remaining iheight - 1 rows. Saturating adds clamp
// totals at 65535 instead of wrapping.
// NOTE(review): the inner loop body executes before the count check, so
// iheight == 1 would still add one extra row; callers appear to pass
// boxheight >= 2 on this path — verify.
asm(
  ".text\n"
#if defined(OSX)
  ".globl _ScaleAddRows_SSE2\n"
  "_ScaleAddRows_SSE2:\n"
#else
  ".global ScaleAddRows_SSE2\n"
  "ScaleAddRows_SSE2:\n"
#endif
  "pusha\n"
  "mov 0x24(%esp),%esi\n"          // esi = iptr
  "mov 0x28(%esp),%edx\n"          // edx = istride
  "mov 0x2c(%esp),%edi\n"          // edi = orow
  "mov 0x30(%esp),%ecx\n"          // ecx = iwidth (counts down by 16)
  "mov 0x34(%esp),%ebx\n"          // ebx = iheight
  "pxor %xmm7,%xmm7\n"             // zero, for byte -> word unpacks
  "1:"
  "movdqa (%esi),%xmm2\n"          // first row of this 16-pixel column
  "lea (%esi,%edx,1),%eax\n"       // eax = start of the next row
  "movhlps %xmm2,%xmm3\n"
  "lea -0x1(%ebx),%ebp\n"          // ebp = remaining row count
  "punpcklbw %xmm7,%xmm2\n"        // widen to 16-bit accumulators
  "punpcklbw %xmm7,%xmm3\n"
  "2:"
  "movdqa (%eax),%xmm0\n"
  "lea (%eax,%edx,1),%eax\n"       // step down one row
  "movhlps %xmm0,%xmm1\n"
  "punpcklbw %xmm7,%xmm0\n"
  "punpcklbw %xmm7,%xmm1\n"
  "paddusw %xmm0,%xmm2\n"          // accumulate 16-bit sums
  "paddusw %xmm1,%xmm3\n"
  "sub $0x1,%ebp\n"
  "ja 2b\n"
  "movdqa %xmm2,(%edi)\n"          // store 16 uint16 column sums
  "movdqa %xmm3,0x10(%edi)\n"
  "lea 0x20(%edi),%edi\n"
  "lea 0x10(%esi),%esi\n"
  "sub $0x10,%ecx\n"
  "ja 1b\n"
  "popa\n"
  "ret\n"
);
// Bilinear row filtering combines 16x2 -> 16x1. SSE2 version
#define HAS_SCALEFILTERROWS_SSE2
extern "C" void ScaleFilterRows_SSE2(uint8* optr,
                                     const uint8* iptr0, int32 istride,
                                     int owidth, int source_y_fraction);
// Vertically blends two adjacent rows: out = (row0 * (256 - f) +
// row1 * f) >> 8, 16 pixels per iteration. Special cases: f == 0 copies
// row0 (label 2), f == 128 uses pavgb (label 3). After each loop the last
// output pixel is replicated one byte past the end, because the horizontal
// filter pass (ScaleFilterCols_C) reads one pixel beyond the row.
// Only esi/edi are saved, so the first argument is at 0xc(%esp).
asm(
  ".text\n"
#if defined(OSX)
  ".globl _ScaleFilterRows_SSE2\n"
  "_ScaleFilterRows_SSE2:\n"
#else
  ".global ScaleFilterRows_SSE2\n"
  "ScaleFilterRows_SSE2:\n"
#endif
  "push %esi\n"
  "push %edi\n"
  "mov 0xc(%esp),%edi\n"           // edi = optr
  "mov 0x10(%esp),%esi\n"          // esi = iptr0
  "mov 0x14(%esp),%edx\n"          // edx = istride
  "mov 0x18(%esp),%ecx\n"          // ecx = owidth (counts down by 16)
  "mov 0x1c(%esp),%eax\n"          // eax = source_y_fraction (0..255)
  "cmp $0x0,%eax\n"
  "je 2f\n"                        // f == 0: plain copy of row 0
  "cmp $0x80,%eax\n"
  "je 3f\n"                        // f == 128: 1:1 average
  "movd %eax,%xmm6\n"              // xmm6 = 8 words of f
  "punpcklwd %xmm6,%xmm6\n"
  "pshufd $0x0,%xmm6,%xmm6\n"
  "neg %eax\n"
  "add $0x100,%eax\n"              // eax = 256 - f
  "movd %eax,%xmm5\n"              // xmm5 = 8 words of (256 - f)
  "punpcklwd %xmm5,%xmm5\n"
  "pshufd $0x0,%xmm5,%xmm5\n"
  "pxor %xmm7,%xmm7\n"             // zero, for byte -> word unpacks
  "1:"
  "movdqa (%esi),%xmm0\n"          // 16 pixels of row 0
  "movdqa (%esi,%edx,1),%xmm2\n"   // 16 pixels of row 1
  "lea 0x10(%esi),%esi\n"
  "movdqa %xmm0,%xmm1\n"
  "movdqa %xmm2,%xmm3\n"
  "punpcklbw %xmm7,%xmm0\n"        // widen both rows to 16-bit
  "punpcklbw %xmm7,%xmm2\n"
  "punpckhbw %xmm7,%xmm1\n"
  "punpckhbw %xmm7,%xmm3\n"
  "pmullw %xmm5,%xmm0\n"           // row0 * (256 - f)
  "pmullw %xmm5,%xmm1\n"
  "pmullw %xmm6,%xmm2\n"           // row1 * f
  "pmullw %xmm6,%xmm3\n"
  "paddusw %xmm2,%xmm0\n"
  "paddusw %xmm3,%xmm1\n"
  "psrlw $0x8,%xmm0\n"             // >> 8 back to pixel range
  "psrlw $0x8,%xmm1\n"
  "packuswb %xmm1,%xmm0\n"
  "movdqa %xmm0,(%edi)\n"
  "lea 0x10(%edi),%edi\n"
  "sub $0x10,%ecx\n"
  "ja 1b\n"
  "mov -0x1(%edi),%al\n"           // replicate last pixel past the end
  "mov %al,(%edi)\n"
  "pop %edi\n"
  "pop %esi\n"
  "ret\n"
  "2:"                             // f == 0: copy row 0
  "movdqa (%esi),%xmm0\n"
  "lea 0x10(%esi),%esi\n"
  "movdqa %xmm0,(%edi)\n"
  "lea 0x10(%edi),%edi\n"
  "sub $0x10,%ecx\n"
  "ja 2b\n"
  "mov -0x1(%edi),%al\n"
  "mov %al,(%edi)\n"
  "pop %edi\n"
  "pop %esi\n"
  "ret\n"
  "3:"                             // f == 128: average the two rows
  "movdqa (%esi),%xmm0\n"
  "movdqa (%esi,%edx,1),%xmm2\n"
  "lea 0x10(%esi),%esi\n"
  "pavgb %xmm2,%xmm0\n"
  "movdqa %xmm0,(%edi)\n"
  "lea 0x10(%edi),%edi\n"
  "sub $0x10,%ecx\n"
  "ja 3b\n"
  "mov -0x1(%edi),%al\n"
  "mov %al,(%edi)\n"
  "pop %edi\n"
  "pop %esi\n"
  "ret\n"
);
// Bilinear row filtering combines 16x2 -> 16x1. SSSE3 version
#define HAS_SCALEFILTERROWS_SSSE3
extern "C" void ScaleFilterRows_SSSE3(uint8* optr,
                                      const uint8* iptr0, int32 istride,
                                      int owidth, int source_y_fraction);
// Same contract as ScaleFilterRows_SSE2, but blends with one pmaddubsw per
// 8 pixels: the fraction is halved to 7 bits and packed as the byte pair
// (128 - f/2, f/2), applied to interleaved row0/row1 bytes, then >> 7.
// Special cases f == 0 (copy) and f == 128 (pavgb) as in the SSE2 version;
// the last output pixel is replicated one byte past the end.
asm(
  ".text\n"
#if defined(OSX)
  ".globl _ScaleFilterRows_SSSE3\n"
  "_ScaleFilterRows_SSSE3:\n"
#else
  ".global ScaleFilterRows_SSSE3\n"
  "ScaleFilterRows_SSSE3:\n"
#endif
  "push %esi\n"
  "push %edi\n"
  "mov 0xc(%esp),%edi\n"           // edi = optr
  "mov 0x10(%esp),%esi\n"          // esi = iptr0
  "mov 0x14(%esp),%edx\n"          // edx = istride
  "mov 0x18(%esp),%ecx\n"          // ecx = owidth (counts down by 16)
  "mov 0x1c(%esp),%eax\n"          // eax = source_y_fraction (0..255)
  "cmp $0x0,%eax\n"
  "je 2f\n"                        // f == 0: plain copy of row 0
  "cmp $0x80,%eax\n"
  "je 3f\n"                        // f == 128: 1:1 average
  "shr %eax\n"                     // halve f so the weights fit signed bytes
  "mov %al,%ah\n"                  // ah = f/2 (row 1 weight)
  "neg %al\n"
  "add $0x80,%al\n"                // al = 128 - f/2 (row 0 weight)
  "movd %eax,%xmm7\n"              // broadcast the weight pair
  "punpcklwd %xmm7,%xmm7\n"
  "pshufd $0x0,%xmm7,%xmm7\n"
  "1:"
  "movdqa (%esi),%xmm0\n"          // 16 pixels of row 0
  "movdqa (%esi,%edx,1),%xmm2\n"   // 16 pixels of row 1
  "lea 0x10(%esi),%esi\n"
  "movdqa %xmm0,%xmm1\n"
  "punpcklbw %xmm2,%xmm0\n"        // interleave row0/row1 bytes
  "punpckhbw %xmm2,%xmm1\n"
  "pmaddubsw %xmm7,%xmm0\n"        // weighted blend of each byte pair
  "pmaddubsw %xmm7,%xmm1\n"
  "psrlw $0x7,%xmm0\n"             // >> 7 back to pixel range
  "psrlw $0x7,%xmm1\n"
  "packuswb %xmm1,%xmm0\n"
  "movdqa %xmm0,(%edi)\n"
  "lea 0x10(%edi),%edi\n"
  "sub $0x10,%ecx\n"
  "ja 1b\n"
  "mov -0x1(%edi),%al\n"           // replicate last pixel past the end
  "mov %al,(%edi)\n"
  "pop %edi\n"
  "pop %esi\n"
  "ret\n"
  "2:"                             // f == 0: copy row 0
  "movdqa (%esi),%xmm0\n"
  "lea 0x10(%esi),%esi\n"
  "movdqa %xmm0,(%edi)\n"
  "lea 0x10(%edi),%edi\n"
  "sub $0x10,%ecx\n"
  "ja 2b\n"
  "mov -0x1(%edi),%al\n"
  "mov %al,(%edi)\n"
  "pop %edi\n"
  "pop %esi\n"
  "ret\n"
  "3:"                             // f == 128: average the two rows
  "movdqa (%esi),%xmm0\n"
  "movdqa (%esi,%edx,1),%xmm2\n"
  "lea 0x10(%esi),%esi\n"
  "pavgb %xmm2,%xmm0\n"
  "movdqa %xmm0,(%edi)\n"
  "lea 0x10(%edi),%edi\n"
  "sub $0x10,%ecx\n"
  "ja 3b\n"
  "mov -0x1(%edi),%al\n"
  "mov %al,(%edi)\n"
  "pop %edi\n"
  "pop %esi\n"
  "ret\n"
);
#endif
// CPU agnostic row functions
static void ScaleRowDown2_C(const uint8* iptr, int32,
uint8* dst, int32 owidth) {
for (int x = 0; x < owidth; ++x) {
*dst++ = *iptr;
iptr += 2;
}
}
static void ScaleRowDown2Int_C(const uint8* iptr, int32 istride,
uint8* dst, int32 owidth) {
for (int x = 0; x < owidth; ++x) {
*dst++ = (iptr[0] + iptr[1] +
iptr[istride] + iptr[istride + 1] + 2) >> 2;
iptr += 2;
}
}
// Point-sample every 4th source pixel (no filtering). The unnamed second
// parameter keeps the common row-function signature.
static void ScaleRowDown4_C(const uint8* iptr, int32,
                            uint8* dst, int32 owidth) {
  for (int i = 0; i < owidth; ++i) {
    dst[i] = iptr[i * 4];
  }
}
static void ScaleRowDown4Int_C(const uint8* iptr, int32 istride,
uint8* dst, int32 owidth) {
for (int x = 0; x < owidth; ++x) {
*dst++ = (iptr[0] + iptr[1] + iptr[2] + iptr[3] +
iptr[istride + 0] + iptr[istride + 1] +
iptr[istride + 2] + iptr[istride + 3] +
iptr[istride * 2 + 0] + iptr[istride * 2 + 1] +
iptr[istride * 2 + 2] + iptr[istride * 2 + 3] +
iptr[istride * 3 + 0] + iptr[istride * 3 + 1] +
iptr[istride * 3 + 2] + iptr[istride * 3 + 3] + 8) >> 4;
iptr += 4;
}
}
// 640 output pixels is enough to allow 5120 input pixels with 1/8 scale down.
// Keeping the total buffer under 4096 bytes avoids a stackcheck, saving 4% cpu.
static const int kMaxOutputWidth = 640;
static const int kMaxRow12 = kMaxOutputWidth * 2;
static void ScaleRowDown8_C(const uint8* iptr, int32,
uint8* dst, int32 owidth) {
for (int x = 0; x < owidth; ++x) {
*dst++ = *iptr;
iptr += 8;
}
}
// Note calling code checks width is less than max and if not
// uses ScaleRowDown8_C instead.
// 8x8 box filter built by cascading smaller filters: two 4x4 box passes
// produce a top-half and a bottom-half intermediate row (each owidth * 2
// wide), then a final 2x2 pass over the intermediate yields the output.
static void ScaleRowDown8Int_C(const uint8* iptr, int32 istride,
                               uint8* dst, int32 owidth) {
  ALIGN16(uint8 irow[kMaxRow12 * 2]);  // two intermediate rows, kMaxOutputWidth apart
  ASSERT(owidth <= kMaxOutputWidth);
  ScaleRowDown4Int_C(iptr, istride, irow, owidth * 2);
  ScaleRowDown4Int_C(iptr + istride * 4, istride, irow + kMaxOutputWidth,
                     owidth * 2);
  ScaleRowDown2Int_C(irow, kMaxOutputWidth, dst, owidth);
}
// 3/4 point sampling: from every 4 source pixels keep pixels 0, 1 and 3.
static void ScaleRowDown34_C(const uint8* iptr, int32,
                             uint8* dst, int32 owidth) {
  ASSERT((owidth % 3 == 0) && (owidth > 0));
  for (int i = 0; i < owidth; i += 3, iptr += 4) {
    dst[i + 0] = iptr[0];
    dst[i + 1] = iptr[1];
    dst[i + 2] = iptr[3];
  }
}
// Filter rows 0 and 1 together, 3 : 1
static void ScaleRowDown34_0_Int_C(const uint8* iptr, int32 istride,
uint8* d, int32 owidth) {
ASSERT((owidth % 3 == 0) && (owidth > 0));
uint8* dend = d + owidth;
const uint8* s = iptr;
const uint8* t = iptr + istride;
do {
uint8 a0 = (s[0] * 3 + s[1] * 1 + 2) >> 2;
uint8 a1 = (s[1] * 1 + s[2] * 1 + 1) >> 1;
uint8 a2 = (s[2] * 1 + s[3] * 3 + 2) >> 2;
uint8 b0 = (t[0] * 3 + t[1] * 1 + 2) >> 2;
uint8 b1 = (t[1] * 1 + t[2] * 1 + 1) >> 1;
uint8 b2 = (t[2] * 1 + t[3] * 3 + 2) >> 2;
d[0] = (a0 * 3 + b0 + 2) >> 2;
d[1] = (a1 * 3 + b1 + 2) >> 2;
d[2] = (a2 * 3 + b2 + 2) >> 2;
d += 3;
s += 4;
t += 4;
} while (d < dend);
}
// Filter rows 1 and 2 together, 1 : 1
static void ScaleRowDown34_1_Int_C(const uint8* iptr, int32 istride,
uint8* d, int32 owidth) {
ASSERT((owidth % 3 == 0) && (owidth > 0));
uint8* dend = d + owidth;
const uint8* s = iptr;
const uint8* t = iptr + istride;
do {
uint8 a0 = (s[0] * 3 + s[1] * 1 + 2) >> 2;
uint8 a1 = (s[1] * 1 + s[2] * 1 + 1) >> 1;
uint8 a2 = (s[2] * 1 + s[3] * 3 + 2) >> 2;
uint8 b0 = (t[0] * 3 + t[1] * 1 + 2) >> 2;
uint8 b1 = (t[1] * 1 + t[2] * 1 + 1) >> 1;
uint8 b2 = (t[2] * 1 + t[3] * 3 + 2) >> 2;
d[0] = (a0 + b0 + 1) >> 1;
d[1] = (a1 + b1 + 1) >> 1;
d[2] = (a2 + b2 + 1) >> 1;
d += 3;
s += 4;
t += 4;
} while (d < dend);
}
#if defined(HAS_SCALEFILTERROWS_SSE2)
// Filter row to 3/4: horizontally shrink every 4 source pixels to 3 with
// 3:1 / 1:1 / 1:3 weighting and rounding.
static void ScaleFilterCols34_C(uint8* optr, const uint8* iptr, int owidth) {
  ASSERT((owidth % 3 == 0) && (owidth > 0));
  const uint8* s = iptr;
  for (uint8* const end = optr + owidth; optr < end; optr += 3, s += 4) {
    optr[0] = (s[0] * 3 + s[1] + 2) >> 2;
    optr[1] = (s[1] + s[2] + 1) >> 1;
    optr[2] = (s[2] + s[3] * 3 + 2) >> 2;
  }
}
#endif
// Horizontal bilinear filter. x and dx are 16.16 fixed point; each output
// pixel blends source pixels xi and xi + 1 by the fractional position, so
// the source row must be readable one pixel past the last sampled index.
static void ScaleFilterCols_C(uint8* optr, const uint8* iptr,
                              int owidth, int dx) {
  int x = 0;
  for (int i = 0; i < owidth; ++i, x += dx) {
    const int xi = x >> 16;
    const int frac = x & 0xffff;
    optr[i] = (iptr[xi] * (65536 - frac) + iptr[xi + 1] * frac) >> 16;
  }
}
#ifdef TEST_RSTSC
uint64 timers34[4] = { 0, };
#endif
static const int kMaxInputWidth = 2560;
#if defined(HAS_SCALEFILTERROWS_SSE2)
#define HAS_SCALEROWDOWN34_SSE2
// Filter rows 0 and 1 together, 3 : 1
// SSE2 3/4 path: vertically filter the two rows into a temp row using the
// SSE2 bilinear row filter with fraction 256/4 (1/4 weight on the lower
// row), then run the horizontal 4 -> 3 filter in C. owidth * 4 / 3 source
// pixels are consumed to produce owidth output pixels.
static void ScaleRowDown34_0_Int_SSE2(const uint8* iptr, int32 istride,
                                      uint8* d, int32 owidth) {
  ASSERT((owidth % 3 == 0) && (owidth > 0));
  ALIGN16(uint8 row[kMaxInputWidth]);  // intermediate vertically-filtered row
#ifdef TEST_RSTSC
  uint64 t1 = __rdtsc();
#endif
  ScaleFilterRows_SSE2(row, iptr, istride, owidth * 4 / 3, 256 / 4);
#ifdef TEST_RSTSC
  uint64 t2 = __rdtsc();
#endif
  ScaleFilterCols34_C(d, row, owidth);
#ifdef TEST_RSTSC
  uint64 t3 = __rdtsc();
  timers34[0] += t2 - t1;
  timers34[1] += t3 - t2;
#endif
}
// Filter rows 1 and 2 together, 1 : 1
// SSE2 3/4 path, 1:1 variant: vertically average the two rows with the
// SSE2 bilinear row filter at fraction 256/2 (equal weights), then run the
// horizontal 4 -> 3 filter in C.
static void ScaleRowDown34_1_Int_SSE2(const uint8* iptr, int32 istride,
                                      uint8* d, int32 owidth) {
  ASSERT((owidth % 3 == 0) && (owidth > 0));
  ALIGN16(uint8 row[kMaxInputWidth]);  // intermediate vertically-filtered row
#ifdef TEST_RSTSC
  uint64 t1 = __rdtsc();
#endif
  ScaleFilterRows_SSE2(row, iptr, istride, owidth * 4 / 3, 256 / 2);
#ifdef TEST_RSTSC
  uint64 t2 = __rdtsc();
#endif
  ScaleFilterCols34_C(d, row, owidth);
#ifdef TEST_RSTSC
  uint64 t3 = __rdtsc();
  timers34[2] += t2 - t1;
  timers34[3] += t3 - t2;
#endif
}
#endif
static void ScaleRowDown38_C(const uint8* iptr, int32,
uint8* dst, int32 owidth) {
ASSERT(owidth % 3 == 0);
for (int x = 0; x < owidth; x += 3) {
dst[0] = iptr[0];
dst[1] = iptr[3];
dst[2] = iptr[6];
dst += 3;
iptr += 8;
}
}
// 8x3 -> 3x1
static void ScaleRowDown38_3_Int_C(const uint8* iptr, int32 istride,
uint8* optr, int32 owidth) {
ASSERT((owidth % 3 == 0) && (owidth > 0));
for (int i = 0; i < owidth; i+=3) {
optr[0] = (iptr[0] + iptr[1] + iptr[2] +
iptr[istride + 0] + iptr[istride + 1] + iptr[istride + 2] +
iptr[istride * 2 + 0] + iptr[istride * 2 + 1] + iptr[istride * 2 + 2]) *
(65536 / 9) >> 16;
optr[1] = (iptr[3] + iptr[4] + iptr[5] +
iptr[istride + 3] + iptr[istride + 4] + iptr[istride + 5] +
iptr[istride * 2 + 3] + iptr[istride * 2 + 4] + iptr[istride * 2 + 5]) *
(65536 / 9) >> 16;
optr[2] = (iptr[6] + iptr[7] +
iptr[istride + 6] + iptr[istride + 7] +
iptr[istride * 2 + 6] + iptr[istride * 2 + 7]) *
(65536 / 6) >> 16;
iptr += 8;
optr += 3;
}
}
// 8x2 -> 3x1
static void ScaleRowDown38_2_Int_C(const uint8* iptr, int32 istride,
uint8* optr, int32 owidth) {
ASSERT((owidth % 3 == 0) && (owidth > 0));
for (int i = 0; i < owidth; i+=3) {
optr[0] = (iptr[0] + iptr[1] + iptr[2] +
iptr[istride + 0] + iptr[istride + 1] + iptr[istride + 2]) *
(65536 / 6) >> 16;
optr[1] = (iptr[3] + iptr[4] + iptr[5] +
iptr[istride + 3] + iptr[istride + 4] + iptr[istride + 5]) *
(65536 / 6) >> 16;
optr[2] = (iptr[6] + iptr[7] +
iptr[istride + 6] + iptr[istride + 7]) *
(65536 / 4) >> 16;
iptr += 8;
optr += 3;
}
}
// C version 8x2 -> 8x1
static void ScaleFilterRows_C(uint8* optr,
const uint8* iptr0, int32 istride,
int owidth, int source_y_fraction) {
ASSERT(owidth > 0);
int y1_fraction = source_y_fraction;
int y0_fraction = 256 - y1_fraction;
const uint8* iptr1 = iptr0 + istride;
uint8* end = optr + owidth;
do {
optr[0] = (iptr0[0] * y0_fraction + iptr1[0] * y1_fraction) >> 8;
optr[1] = (iptr0[1] * y0_fraction + iptr1[1] * y1_fraction) >> 8;
optr[2] = (iptr0[2] * y0_fraction + iptr1[2] * y1_fraction) >> 8;
optr[3] = (iptr0[3] * y0_fraction + iptr1[3] * y1_fraction) >> 8;
optr[4] = (iptr0[4] * y0_fraction + iptr1[4] * y1_fraction) >> 8;
optr[5] = (iptr0[5] * y0_fraction + iptr1[5] * y1_fraction) >> 8;
optr[6] = (iptr0[6] * y0_fraction + iptr1[6] * y1_fraction) >> 8;
optr[7] = (iptr0[7] * y0_fraction + iptr1[7] * y1_fraction) >> 8;
iptr0 += 8;
iptr1 += 8;
optr += 8;
} while (optr < end);
optr[0] = optr[-1];
}
void ScaleAddRows_C(const uint8* iptr, int32 istride,
uint16* orow, int32 iwidth, int32 iheight) {
ASSERT(iwidth > 0);
ASSERT(iheight > 0);
for (int x = 0; x < iwidth; ++x) {
const uint8* s = iptr + x;
int sum = 0;
for (int y = 0; y < iheight; ++y) {
sum += s[0];
s += istride;
}
orow[x] = sum;
}
}
/**
 * Scale plane, 1/2
 *
 * This is an optimized version for scaling down a plane to 1/2 of
 * its original size.
 *
 */
static void ScalePlaneDown2(int32 iwidth, int32 iheight,
                            int32 owidth, int32 oheight,
                            int32 istride, int32 ostride,
                            const uint8 *iptr, uint8 *optr,
                            bool interpolate) {
  ASSERT(iwidth % 2 == 0);
  ASSERT(iheight % 2 == 0);
  // Select a row function once, then apply it to every output row.
  // interpolate == true averages each 2x2 box; false point-samples.
  void (*ScaleRowDown2)(const uint8* iptr, int32 istride,
                        uint8* orow, int32 owidth);
#if defined(HAS_SCALEROWDOWN2_NEON)
  // NEON path needs 16-pixel multiples and 16-byte aligned pointers/strides.
  if (libyuv::CpuInfo::TestCpuFlag(libyuv::CpuInfo::kCpuHasNEON) &&
      (owidth % 16 == 0) && (istride % 16 == 0) && (ostride % 16 == 0) &&
      IS_ALIGNED(iptr, 16) && IS_ALIGNED(optr, 16)) {
    ScaleRowDown2 = interpolate ? ScaleRowDown2Int_NEON : ScaleRowDown2_NEON;
  } else
#endif
#if defined(HAS_SCALEROWDOWN2_SSE2)
  if (libyuv::CpuInfo::TestCpuFlag(libyuv::CpuInfo::kCpuHasSSE2) &&
      (owidth % 16 == 0) && IS_ALIGNED(iptr, 16) && IS_ALIGNED(optr, 16)) {
    ScaleRowDown2 = interpolate ? ScaleRowDown2Int_SSE2 : ScaleRowDown2_SSE2;
  } else
#endif
  {
    // Portable C fallback.
    ScaleRowDown2 = interpolate ? ScaleRowDown2Int_C : ScaleRowDown2_C;
  }
  for (int y = 0; y < oheight; ++y) {
    ScaleRowDown2(iptr, istride, optr, owidth);
    iptr += (istride << 1);  // two source rows per output row
    optr += ostride;
  }
}
/**
 * Scale plane, 1/4
 *
 * This is an optimized version for scaling down a plane to 1/4 of
 * its original size.
 */
static void ScalePlaneDown4(int32 iwidth, int32 iheight,
                            int32 owidth, int32 oheight,
                            int32 istride, int32 ostride,
                            const uint8 *iptr, uint8 *optr,
                            bool interpolate) {
  ASSERT(iwidth % 4 == 0);
  ASSERT(iheight % 4 == 0);
  // Select a row function once, then apply it to every output row.
  // interpolate == true averages each 4x4 box; false point-samples.
  void (*ScaleRowDown4)(const uint8* iptr, int32 istride,
                        uint8* orow, int32 owidth);
#if defined(HAS_SCALEROWDOWN4_SSE2)
  if (libyuv::CpuInfo::TestCpuFlag(libyuv::CpuInfo::kCpuHasSSE2) &&
      (owidth % 8 == 0) && (istride % 16 == 0) && (ostride % 8 == 0) &&
      IS_ALIGNED(iptr, 16) && IS_ALIGNED(optr, 8)) {
    ScaleRowDown4 = interpolate ? ScaleRowDown4Int_SSE2 : ScaleRowDown4_SSE2;
  } else
#endif
  {
    // Portable C fallback.
    ScaleRowDown4 = interpolate ? ScaleRowDown4Int_C : ScaleRowDown4_C;
  }
  for (int y = 0; y < oheight; ++y) {
    ScaleRowDown4(iptr, istride, optr, owidth);
    iptr += (istride << 2);  // four source rows per output row
    optr += ostride;
  }
}
/**
 * Scale plane, 1/8
 *
 * This is an optimized version for scaling down a plane to 1/8
 * of its original size.
 *
 */
static void ScalePlaneDown8(int32 iwidth, int32 iheight,
                            int32 owidth, int32 oheight,
                            int32 istride, int32 ostride,
                            const uint8 *iptr, uint8 *optr,
                            bool interpolate) {
  ASSERT(iwidth % 8 == 0);
  ASSERT(iheight % 8 == 0);
  // Select a row function once, then apply it to every output row.
  void (*ScaleRowDown8)(const uint8* iptr, int32 istride,
                        uint8* orow, int32 owidth);
#if defined(HAS_SCALEROWDOWN8_SSE2)
  // The 8x8 box version also needs owidth to fit its intermediate buffer.
  if (libyuv::CpuInfo::TestCpuFlag(libyuv::CpuInfo::kCpuHasSSE2) &&
      (owidth % 16 == 0) && owidth <= kMaxOutputWidth &&
      (istride % 16 == 0) && (ostride % 16 == 0) &&
      IS_ALIGNED(iptr, 16) && IS_ALIGNED(optr, 16)) {
    ScaleRowDown8 = interpolate ? ScaleRowDown8Int_SSE2 : ScaleRowDown8_SSE2;
  } else
#endif
  {
    // The C box filter also has a width limit (its stack buffer); wider
    // rows fall back to point sampling.
    ScaleRowDown8 = interpolate && (owidth <= kMaxOutputWidth) ?
        ScaleRowDown8Int_C : ScaleRowDown8_C;
  }
  for (int y = 0; y < oheight; ++y) {
    ScaleRowDown8(iptr, istride, optr, owidth);
    iptr += (istride << 3);  // eight source rows per output row
    optr += ostride;
  }
}
/**
 * Scale plane down, 3/4
 *
 * Provided by Frank Barchard (fbarchard@google.com)
 *
 */
static void ScalePlaneDown34(int32 iwidth, int32 iheight,
                             int32 owidth, int32 oheight,
                             int32 istride, int32 ostride,
                             const uint8* iptr, uint8* optr,
                             bool interpolate) {
  ASSERT(owidth % 3 == 0);
  // Two row functions: _0 filters a row pair weighted 3:1, _1 filters a
  // pair 1:1. Without interpolation both point to the point sampler.
  void (*ScaleRowDown34_0)(const uint8* iptr, int32 istride,
                           uint8* orow, int32 owidth);
  void (*ScaleRowDown34_1)(const uint8* iptr, int32 istride,
                           uint8* orow, int32 owidth);
#if defined(HAS_SCALEROWDOWN34_SSSE3)
  if (libyuv::CpuInfo::TestCpuFlag(libyuv::CpuInfo::kCpuHasSSSE3) &&
      (owidth % 24 == 0) && (istride % 16 == 0) && (ostride % 8 == 0) &&
      IS_ALIGNED(iptr, 16) && IS_ALIGNED(optr, 8)) {
    if (!interpolate) {
      ScaleRowDown34_0 = ScaleRowDown34_SSSE3;
      ScaleRowDown34_1 = ScaleRowDown34_SSSE3;
    } else {
      ScaleRowDown34_0 = ScaleRowDown34_0_Int_SSSE3;
      ScaleRowDown34_1 = ScaleRowDown34_1_Int_SSSE3;
    }
  } else
#endif
#if defined(HAS_SCALEROWDOWN34_SSE2)
  // SSE2 fallback only provides the interpolating variants.
  if (libyuv::CpuInfo::TestCpuFlag(libyuv::CpuInfo::kCpuHasSSE2) &&
      (owidth % 24 == 0) && (istride % 16 == 0) && (ostride % 8 == 0) &&
      IS_ALIGNED(iptr, 16) && IS_ALIGNED(optr, 8) &&
      interpolate) {
    ScaleRowDown34_0 = ScaleRowDown34_0_Int_SSE2;
    ScaleRowDown34_1 = ScaleRowDown34_1_Int_SSE2;
  } else
#endif
  {
    if (!interpolate) {
      ScaleRowDown34_0 = ScaleRowDown34_C;
      ScaleRowDown34_1 = ScaleRowDown34_C;
    } else {
      ScaleRowDown34_0 = ScaleRowDown34_0_Int_C;
      ScaleRowDown34_1 = ScaleRowDown34_1_Int_C;
    }
  }
  // Each group of 4 source rows produces 3 output rows:
  //   irow 0: rows 0+1 filtered 3:1 toward row 0
  //   irow 1: rows 1+2 filtered 1:1
  //   irow 2: rows 3+2 filtered 3:1 toward row 3 (negative stride walks up)
  int irow = 0;
  for (int y = 0; y < oheight; ++y) {
    switch (irow) {
      case 0:
        ScaleRowDown34_0(iptr, istride, optr, owidth);
        break;
      case 1:
        ScaleRowDown34_1(iptr, istride, optr, owidth);
        break;
      case 2:
        ScaleRowDown34_0(iptr + istride, -istride, optr, owidth);
        break;
    }
    ++irow;
    iptr += istride;
    optr += ostride;
    if (irow >= 3) {
      iptr += istride;  // skip the 4th source row of this group
      irow = 0;
    }
  }
#ifdef TEST_RSTSC
  std::cout << "Timer34_0 Row " << std::setw(9) << timers34[0]
            << " Column " << std::setw(9) << timers34[1]
            << " Timer34_1 Row " << std::setw(9) << timers34[2]
            << " Column " << std::setw(9) << timers34[3] << std::endl;
#endif
}
/**
 * Scale plane, 3/8
 *
 * This is an optimized version for scaling down a plane to 3/8
 * of its original size.
 *
 * Reduces 16x3 to 6x1
 */
static void ScalePlaneDown38(int32 iwidth, int32 iheight,
                             int32 owidth, int32 oheight,
                             int32 istride, int32 ostride,
                             const uint8* iptr, uint8* optr,
                             bool interpolate) {
  ASSERT(owidth % 3 == 0);
  // Two row functions: _3 box-filters 3 source rows, _2 box-filters 2.
  // Without interpolation both point to the point sampler.
  void (*ScaleRowDown38_3)(const uint8* iptr, int32 istride,
                           uint8* orow, int32 owidth);
  void (*ScaleRowDown38_2)(const uint8* iptr, int32 istride,
                           uint8* orow, int32 owidth);
#if defined(HAS_SCALEROWDOWN38_SSSE3)
  if (libyuv::CpuInfo::TestCpuFlag(libyuv::CpuInfo::kCpuHasSSSE3) &&
      (owidth % 24 == 0) && (istride % 16 == 0) && (ostride % 8 == 0) &&
      IS_ALIGNED(iptr, 16) && IS_ALIGNED(optr, 8)) {
    if (!interpolate) {
      ScaleRowDown38_3 = ScaleRowDown38_SSSE3;
      ScaleRowDown38_2 = ScaleRowDown38_SSSE3;
    } else {
      ScaleRowDown38_3 = ScaleRowDown38_3_Int_SSSE3;
      ScaleRowDown38_2 = ScaleRowDown38_2_Int_SSSE3;
    }
  } else
#endif
  {
    if (!interpolate) {
      ScaleRowDown38_3 = ScaleRowDown38_C;
      ScaleRowDown38_2 = ScaleRowDown38_C;
    } else {
      ScaleRowDown38_3 = ScaleRowDown38_3_Int_C;
      ScaleRowDown38_2 = ScaleRowDown38_2_Int_C;
    }
  }
  // Every 8 source rows become 3 output rows: two 3-row groups followed by
  // one 2-row group (3 + 3 + 2 = 8).
  int irow = 0;
  for (int y = 0; y < oheight; ++y) {
    switch (irow) {
      case 0:
      case 1:
        ScaleRowDown38_3(iptr, istride, optr, owidth);
        iptr += istride * 3;
        ++irow;
        break;
      case 2:
        ScaleRowDown38_2(iptr, istride, optr, owidth);
        iptr += istride * 2;
        irow = 0;
        break;
    }
    optr += ostride;
  }
}
// Sum all pixels of an iboxwidth x iboxheight box starting at iptr; rows
// are istride bytes apart.
inline static uint32 SumBox(int32 iboxwidth, int32 iboxheight,
                            int32 istride, const uint8 *iptr) {
  ASSERT(iboxwidth > 0);
  ASSERT(iboxheight > 0);
  uint32 total = 0u;
  for (int row = 0; row < iboxheight; ++row, iptr += istride) {
    for (int col = 0; col < iboxwidth; ++col) {
      total += iptr[col];
    }
  }
  return total;
}
// Produce one output row by box-averaging: each output pixel covers the
// source columns spanned by one 16.16 fixed-point step of dx and boxheight
// source rows.
static void ScalePlaneBoxRow(int32 owidth, int32 boxheight,
                             int dx, int32 istride,
                             const uint8 *iptr, uint8 *optr) {
  int x = 0;
  for (int i = 0; i < owidth; ++i) {
    const int ix = x >> 16;
    x += dx;
    const int boxwidth = (x >> 16) - ix;  // columns covered by this pixel
    optr[i] = SumBox(boxwidth, boxheight, istride, iptr + ix) /
              (boxwidth * boxheight);
  }
}
// Sum iboxwidth consecutive 16-bit column totals.
inline static uint32 SumPixels(int32 iboxwidth, const uint16 *iptr) {
  ASSERT(iboxwidth > 0);
  uint32 total = 0u;
  const uint16* const end = iptr + iboxwidth;
  while (iptr < end) {
    total += *iptr++;
  }
  return total;
}
// Average the 16-bit column sums in iptr into owidth output pixels when dx
// has a fractional part: each output box then covers either (dx >> 16) or
// (dx >> 16) + 1 source columns, so a two-entry fixed-point reciprocal
// table is used. scaleptr is biased by -minboxwidth so scaleptr[boxwidth]
// selects the matching table entry.
static void ScaleAddCols2_C(int32 owidth, int32 boxheight, int dx,
                            const uint16 *iptr, uint8 *optr) {
  int scaletbl[2];
  int minboxwidth = (dx >> 16);
  scaletbl[0] = 65536 / (minboxwidth * boxheight);
  scaletbl[1] = 65536 / ((minboxwidth + 1) * boxheight);
  int *scaleptr = scaletbl - minboxwidth;  // biased pointer; see above
  int x = 0;
  for (int i = 0; i < owidth; ++i) {
    int ix = x >> 16;
    x += dx;
    int boxwidth = (x >> 16) - ix;  // actual columns covered by this pixel
    *optr++ = SumPixels(boxwidth, iptr + ix) * scaleptr[boxwidth] >> 16;
  }
}
// Average the 16-bit column sums when dx is a whole number of pixels: a
// single fixed-point reciprocal works for every output pixel.
static void ScaleAddCols1_C(int32 owidth, int32 boxheight, int dx,
                            const uint16 *iptr, uint8 *optr) {
  const int boxwidth = dx >> 16;
  const int scaleval = 65536 / (boxwidth * boxheight);
  const uint16* src = iptr;
  for (int i = 0; i < owidth; ++i, src += boxwidth) {
    optr[i] = SumPixels(boxwidth, src) * scaleval >> 16;
  }
}
/**
 * Scale plane down to any dimensions, with interpolation.
 * (boxfilter).
 *
 * Same method as SimpleScale, which is fixed point, outputting
 * one pixel of destination using fixed point (16.16) to step
 * through source, sampling a box of pixel with simple
 * averaging.
 */
static void ScalePlaneBox(int32 iwidth, int32 iheight,
                          int32 owidth, int32 oheight,
                          int32 istride, int32 ostride,
                          const uint8 *iptr, uint8 *optr) {
  ASSERT(owidth > 0);
  ASSERT(oheight > 0);
  // 16.16 fixed-point source step per output pixel / output row.
  // (These were previously shadowed by identical re-declarations inside the
  // slow path below; the redundant copies have been removed.)
  int dy = (iheight << 16) / oheight;
  int dx = (iwidth << 16) / owidth;
  if ((iwidth % 16 != 0) || (iwidth > kMaxInputWidth) ||
      oheight * 2 > iheight) {
    // Generic pixel-at-a-time path: widths the row functions cannot handle,
    // or a vertical scale of more than 1/2 (boxes may be a single row).
    uint8 *dst = optr;
    int y = 0;
    for (int j = 0; j < oheight; ++j) {
      int iy = y >> 16;
      const uint8 *const src = iptr + iy * istride;
      y += dy;
      if (y > (iheight << 16)) {
        y = (iheight << 16);  // clamp so boxes never read past the plane
      }
      int boxheight = (y >> 16) - iy;
      ScalePlaneBoxRow(owidth, boxheight,
                       dx, istride,
                       src, dst);
      dst += ostride;
    }
  } else {
    // Fast path: sum whole rows into a uint16 row, then average columns.
    ALIGN16(uint16 row[kMaxInputWidth]);
    void (*ScaleAddRows)(const uint8* iptr, int32 istride,
                         uint16* orow, int32 iwidth, int32 iheight);
    void (*ScaleAddCols)(int32 owidth, int32 boxheight, int dx,
                         const uint16 *iptr, uint8 *optr);
#if defined(HAS_SCALEADDROWS_SSE2)
    if (libyuv::CpuInfo::TestCpuFlag(libyuv::CpuInfo::kCpuHasSSE2) &&
        (istride % 16 == 0) && IS_ALIGNED(iptr, 16) && (iwidth % 16) == 0) {
      ScaleAddRows = ScaleAddRows_SSE2;
    } else
#endif
    {
      ScaleAddRows = ScaleAddRows_C;
    }
    // A fractional box width needs the two-entry reciprocal-table version.
    if (dx & 0xffff) {
      ScaleAddCols = ScaleAddCols2_C;
    } else {
      ScaleAddCols = ScaleAddCols1_C;
    }
    int y = 0;
    for (int j = 0; j < oheight; ++j) {
      int iy = y >> 16;
      const uint8 *const src = iptr + iy * istride;
      y += dy;
      if (y > (iheight << 16)) {
        y = (iheight << 16);  // clamp so boxes never read past the plane
      }
      int boxheight = (y >> 16) - iy;
      ScaleAddRows(src, istride, row, iwidth, boxheight);
      ScaleAddCols(owidth, boxheight, dx, row, optr);
      optr += ostride;
    }
  }
}
/**
 * Scale plane to/from any dimensions, with interpolation.
 */
static void ScalePlaneBilinearSimple(int32 iwidth, int32 iheight,
                                     int32 owidth, int32 oheight,
                                     int32 istride, int32 ostride,
                                     const uint8 *iptr, uint8 *optr) {
  uint8 *dst = optr;
  // 16.16 fixed-point source step per output pixel / row.
  int dx = (iwidth << 16) / owidth;
  int dy = (iheight << 16) / oheight;
  // Clamp limits: the bilinear taps read pixel xi + 1 and row yi + 1, so
  // the position is capped just below the last pixel/row boundary.
  int maxx = ((iwidth - 1) << 16) - 1;
  int maxy = ((iheight - 1) << 16) - 1;
  // Start half a destination pixel in (±32768) to center the sampling.
  int y = (oheight < iheight) ? 32768 : (iheight << 16) / oheight - 32768;
  for (int i = 0; i < oheight; ++i) {
    int cy = (y < 0) ? 0 : y;      // clamp negative start positions
    int yi = cy >> 16;             // integer source row
    int yf = cy & 0xffff;          // vertical fraction
    const uint8 *const src = iptr + yi * istride;
    int x = (owidth < iwidth) ? 32768 : (iwidth << 16) / owidth - 32768;
    for (int j = 0; j < owidth; ++j) {
      int cx = (x < 0) ? 0 : x;
      int xi = cx >> 16;           // integer source column
      int xf = cx & 0xffff;        // horizontal fraction
      // Blend horizontally on both rows, then vertically.
      int r0 = (src[xi] * (65536 - xf) + src[xi + 1] * xf) >> 16;
      int r1 = (src[xi + istride] * (65536 - xf) + src[xi + istride + 1] * xf)
          >> 16;
      *dst++ = (r0 * (65536 - yf) + r1 * yf) >> 16;
      x += dx;
      if (x > maxx)
        x = maxx;
    }
    dst += ostride - owidth;  // advance to the start of the next output row
    y += dy;
    if (y > maxy)
      y = maxy;
  }
}
/**
 * Scale plane to/from any dimensions, with bilinear
 * interpolation.
 */
static void ScalePlaneBilinear(int32 iwidth, int32 iheight,
                               int32 owidth, int32 oheight,
                               int32 istride, int32 ostride,
                               const uint8 *iptr, uint8 *optr) {
  ASSERT(owidth > 0);
  ASSERT(oheight > 0);
  // 16.16 fixed-point source step per output row / pixel.
  int dy = (iheight << 16) / oheight;
  int dx = (iwidth << 16) / owidth;
  if ((iwidth % 8 != 0) || (iwidth > kMaxInputWidth)) {
    // Rows the SIMD/row-buffer path cannot handle: do it pixel by pixel.
    ScalePlaneBilinearSimple(iwidth, iheight, owidth, oheight, istride, ostride,
                             iptr, optr);
  } else {
    // +1: the row filter replicates one pixel past the end, which the
    // column filter then reads.
    ALIGN16(uint8 row[kMaxInputWidth + 1]);
    void (*ScaleFilterRows)(uint8* optr, const uint8* iptr0, int32 istride,
                            int owidth, int source_y_fraction);
    void (*ScaleFilterCols)(uint8* optr, const uint8* iptr,
                            int owidth, int dx);
#if defined(HAS_SCALEFILTERROWS_SSSE3)
    if (libyuv::CpuInfo::TestCpuFlag(libyuv::CpuInfo::kCpuHasSSSE3) &&
        (istride % 16 == 0) && IS_ALIGNED(iptr, 16) && (iwidth % 16) == 0) {
      ScaleFilterRows = ScaleFilterRows_SSSE3;
    } else
#endif
#if defined(HAS_SCALEFILTERROWS_SSE2)
    if (libyuv::CpuInfo::TestCpuFlag(libyuv::CpuInfo::kCpuHasSSE2) &&
        (istride % 16 == 0) && IS_ALIGNED(iptr, 16) && (iwidth % 16) == 0) {
      ScaleFilterRows = ScaleFilterRows_SSE2;
    } else
#endif
    {
      ScaleFilterRows = ScaleFilterRows_C;
    }
    ScaleFilterCols = ScaleFilterCols_C;
    int y = 0;
    int maxy = ((iheight - 1) << 16) - 1;  // max is filter of last 2 rows.
    for (int j = 0; j < oheight; ++j) {
      int iy = y >> 16;            // integer source row
      int fy = (y >> 8) & 255;     // 8-bit vertical fraction
      const uint8 *const src = iptr + iy * istride;
      // Vertically filter two source rows into a temp row, then
      // horizontally filter the temp row into the output row.
      ScaleFilterRows(row, src, istride, iwidth, fy);
      ScaleFilterCols(optr, row, owidth, dx);
      optr += ostride;
      y += dy;
      if (y > maxy) {
        y = maxy;  // clamp so the row filter never reads past the plane
      }
    }
  }
}
/**
 * Scale plane to/from any dimensions, without interpolation.
 * Fixed point math is used for performance: The upper 16 bits
 * of x and dx is the integer part of the source position and
 * the lower 16 bits are the fixed decimal part.
 */
static void ScalePlaneSimple(int32 iwidth, int32 iheight,
                             int32 owidth, int32 oheight,
                             int32 istride, int32 ostride,
                             const uint8 *iptr, uint8 *optr) {
  const int dx = (iwidth << 16) / owidth;
  for (int y = 0; y < oheight; ++y) {
    // Nearest source row for this output row.
    const uint8 *const src = iptr + (y * iheight / oheight) * istride;
    uint8 *const dst = optr + y * ostride;
    // TODO(fbarchard): Round X coordinate by setting x=0x8000.
    int x = 0;
    for (int i = 0; i < owidth; ++i) {
      dst[i] = src[x >> 16];
      x += dx;
    }
  }
}
/**
 * Scale plane to/from any dimensions.
 */
static void ScalePlaneAnySize(int32 iwidth, int32 iheight,
                              int32 owidth, int32 oheight,
                              int32 istride, int32 ostride,
                              const uint8 *iptr, uint8 *optr,
                              bool interpolate) {
  // Bilinear handles arbitrary up/down scales; point sampling otherwise.
  if (interpolate) {
    // fall back to non-optimized version
    ScalePlaneBilinear(iwidth, iheight, owidth, oheight, istride, ostride,
                       iptr, optr);
  } else {
    ScalePlaneSimple(iwidth, iheight, owidth, oheight, istride, ostride,
                     iptr, optr);
  }
}
/**
 * Scale plane down, any size
 *
 * This is an optimized version for scaling down a plane to any size.
 * The current implementation is ~10 times faster compared to the
 * reference implementation for e.g. XGA->LowResPAL
 *
 */
static void ScalePlaneDown(int32 iwidth, int32 iheight,
                           int32 owidth, int32 oheight,
                           int32 istride, int32 ostride,
                           const uint8 *iptr, uint8 *optr,
                           bool interpolate) {
  if (!interpolate) {
    // Point sampling.
    ScalePlaneSimple(iwidth, iheight, owidth, oheight, istride, ostride,
                     iptr, optr);
  } else if (oheight * 2 > iheight) {  // between 1/2x and 1x use bilinear
    // BUG FIX: this test was "iheight * 2 > oheight", which holds for every
    // downscale (iheight >= oheight) and made the box filter unreachable.
    ScalePlaneBilinear(iwidth, iheight, owidth, oheight, istride, ostride,
                       iptr, optr);
  } else {
    // 1/2x or smaller: box filter averages all covered source pixels.
    ScalePlaneBox(iwidth, iheight, owidth, oheight, istride, ostride,
                  iptr, optr);
  }
}
/**
 * Copy plane, no scaling
 *
 * This simply copies the given plane without scaling.
 * The current implementation is ~115 times faster
 * compared to the reference implementation.
 *
 */
static void CopyPlane(int32 iwidth, int32 iheight,
                      int32 owidth, int32 oheight,
                      int32 istride, int32 ostride,
                      const uint8 *iptr, uint8 *optr) {
  if (istride == iwidth && ostride == owidth) {
    // Both planes are contiguous: one bulk copy.
    memcpy(optr, iptr, iwidth * iheight);
  } else {
    // Padded strides: copy one scanline at a time.
    for (int y = 0; y < iheight; ++y) {
      memcpy(optr + y * ostride, iptr + y * istride, iwidth);
    }
  }
}
// Dispatch a single plane to the scaler best suited for the requested
// ratio. Specialized scalers (3/4, 1/2, 3/8, 1/4, 1/8) are used for
// common resolutions; everything else goes through the generic paths.
static void ScalePlane(const uint8 *in, int32 istride,
                       int32 iwidth, int32 iheight,
                       uint8 *out, int32 ostride,
                       int32 owidth, int32 oheight,
                       bool interpolate, bool use_ref) {
  if (owidth == iwidth && oheight == iheight) {
    // Identical sizes: straight copy.
    CopyPlane(iwidth, iheight, owidth, oheight, istride, ostride, in, out);
    return;
  }
  if (owidth > iwidth || oheight > iheight) {
    // At least one dimension grows: arbitrary scale up and/or down.
    ScalePlaneAnySize(iwidth, iheight, owidth, oheight, istride, ostride,
                      in, out, interpolate);
    return;
  }
  // Scale down from here on.
  if (use_ref) {
    // For testing, allow the optimized versions to be disabled.
    ScalePlaneDown(iwidth, iheight, owidth, oheight, istride, ostride,
                   in, out, interpolate);
    return;
  }
  if (4 * owidth == 3 * iwidth && 4 * oheight == 3 * iheight) {
    // optimized, 3/4
    ScalePlaneDown34(iwidth, iheight, owidth, oheight, istride, ostride,
                     in, out, interpolate);
    return;
  }
  if (2 * owidth == iwidth && 2 * oheight == iheight) {
    // optimized, 1/2
    ScalePlaneDown2(iwidth, iheight, owidth, oheight, istride, ostride,
                    in, out, interpolate);
    return;
  }
  if (8 * owidth == 3 * iwidth && oheight == ((iheight * 3 + 7) / 8)) {
    // optimized, 3/8; height rounded up for odd sized chroma planes.
    ScalePlaneDown38(iwidth, iheight, owidth, oheight, istride, ostride,
                     in, out, interpolate);
    return;
  }
  if (4 * owidth == iwidth && 4 * oheight == iheight) {
    // optimized, 1/4
    ScalePlaneDown4(iwidth, iheight, owidth, oheight, istride, ostride,
                    in, out, interpolate);
    return;
  }
  if (8 * owidth == iwidth && 8 * oheight == iheight) {
    // optimized, 1/8
    ScalePlaneDown8(iwidth, iheight, owidth, oheight, istride, ostride,
                    in, out, interpolate);
    return;
  }
  // Arbitrary downsample.
  ScalePlaneDown(iwidth, iheight, owidth, oheight, istride, ostride,
                 in, out, interpolate);
}
/**
* Scale a plane.
*
* This function in turn calls a scaling function
* suitable for handling the desired resolutions.
*
*/
bool YuvScaler::Scale(const uint8 *inY, const uint8 *inU, const uint8 *inV,
int32 istrideY, int32 istrideU, int32 istrideV,
int32 iwidth, int32 iheight,
uint8 *outY, uint8 *outU, uint8 *outV,
int32 ostrideY, int32 ostrideU, int32 ostrideV,
int32 owidth, int32 oheight,
bool interpolate) {
if (!inY || !inU || !inV || iwidth <= 0 || iheight <= 0 ||
!outY || !outU || !outV || owidth <= 0 || oheight <= 0) {
return false;
}
int32 halfiwidth = (iwidth + 1) >> 1;
int32 halfiheight = (iheight + 1) >> 1;
int32 halfowidth = (owidth + 1) >> 1;
int32 halfoheight = (oheight + 1) >> 1;
ScalePlane(inY, istrideY, iwidth, iheight,
outY, ostrideY, owidth, oheight,
interpolate, use_reference_impl_);
ScalePlane(inU, istrideU, halfiwidth, halfiheight,
outU, ostrideU, halfowidth, halfoheight,
interpolate, use_reference_impl_);
ScalePlane(inV, istrideV, halfiwidth, halfiheight,
outV, ostrideV, halfowidth, halfoheight,
interpolate, use_reference_impl_);
return true;
}
// Scale an I420 frame held in a single contiguous buffer (Y plane,
// then U, then V), writing into a contiguous output buffer of
// owidth x oheight. The scaled image starts ooffset rows from the top
// and covers oheight - 2 * ooffset rows, leaving ooffset untouched rows
// at the top and bottom (letterboxing). Returns false on invalid args.
// NOTE(review): the pointer math assumes both buffers are tightly
// packed (stride == width) -- confirm with callers.
bool YuvScaler::Scale(const uint8 *in, int32 iwidth, int32 iheight,
                      uint8 *out, int32 owidth, int32 oheight, int32 ooffset,
                      bool interpolate) {
  if (!in || iwidth <= 0 || iheight <= 0 ||
      !out || owidth <= 0 || oheight <= 0 || ooffset < 0 ||
      ooffset >= oheight) {
    return false;
  }
  // Round the offset down to even so the chroma offset (ooffset / 2)
  // stays row-aligned in the half-height chroma planes.
  ooffset = ooffset & ~1;  // chroma requires offset to multiple of 2.
  // Chroma planes are half the luma size, rounded up for odd dimensions.
  int32 halfiwidth = (iwidth + 1) >> 1;
  int32 halfiheight = (iheight + 1) >> 1;
  int32 halfowidth = (owidth + 1) >> 1;
  int32 halfoheight = (oheight + 1) >> 1;
  int32 aheight = oheight - ooffset * 2;  // actual output height
  // Plane base pointers inside the packed I420 buffers; the output
  // pointers additionally skip ooffset (luma) / ooffset>>1 (chroma) rows.
  const uint8 *const iyptr = in;
  uint8 *oyptr = out + ooffset * owidth;
  const uint8 *const iuptr = in + iwidth * iheight;
  uint8 *ouptr = out + owidth * oheight + (ooffset >> 1) * halfowidth;
  const uint8 *const ivptr = in + iwidth * iheight +
                             halfiwidth * halfiheight;
  uint8 *ovptr = out + owidth * oheight + halfowidth * halfoheight +
                 (ooffset >> 1) * halfowidth;
  // Delegate to the three-plane overload with packed strides.
  return Scale(iyptr, iuptr, ivptr, iwidth, halfiwidth, halfiwidth,
               iwidth, iheight, oyptr, ouptr, ovptr, owidth,
               halfowidth, halfowidth, owidth, aheight, interpolate);
}
} // namespace libyuv
/*
* Copyright (c) 2011 The LibYuv project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "video_common.h"
#include <sstream>
#include "common.h"
namespace libyuv {
// One row of the alias table: maps a non-canonical fourcc to the
// canonical code the rest of the library understands.
struct FourCCAliasEntry {
  uint32 alias;      // fourcc as reported by a source/OS.
  uint32 canonical;  // equivalent canonical fourcc.
};

// Lookup table consumed by CanonicalFourCC() below.
static const FourCCAliasEntry kFourCCAliases[] = {
  {FOURCC_IYUV, FOURCC_I420},
  {FOURCC_YU12, FOURCC_I420},
  {FOURCC_YUYV, FOURCC_YUY2},
  {FOURCC_YUVS, FOURCC_YUY2},
  {FOURCC_HDYC, FOURCC_UYVY},
  {FOURCC_2VUY, FOURCC_UYVY},
  {FOURCC_BA81, FOURCC_BGGR},
  {FOURCC_JPEG, FOURCC_MJPG},  // Note: JPEG has DHT while MJPG does not.
  {FOURCC_RGB3, FOURCC_RAW},
  {FOURCC_BGR3, FOURCC_24BG},
};
// Translate a fourcc to its canonical form via kFourCCAliases.
// Unknown codes are returned unchanged.
uint32 CanonicalFourCC(uint32 fourcc) {
  const int num_aliases = ARRAY_SIZE(kFourCCAliases);
  for (int index = 0; index < num_aliases; ++index) {
    const FourCCAliasEntry& entry = kFourCCAliases[index];
    if (entry.alias == fourcc) {
      return entry.canonical;
    }
  }
  // Not an alias, so return it as-is.
  return fourcc;
}
// Render the format as "FOURCC widthxheightxfps"; the fourcc prefix is
// dropped entirely if any of its characters is non-printable.
std::string VideoFormat::ToString() const {
  std::string fourcc_name = GetFourccName(fourcc) + " ";
  bool printable = true;
  for (size_t pos = 0; pos < fourcc_name.size(); ++pos) {
    const char ch = fourcc_name[pos];
    // Test character is printable; avoid isprint() which asserts on
    // negative values.
    if (ch < 32 || ch >= 127) {
      printable = false;
      break;
    }
  }
  if (!printable) {
    fourcc_name = "";
  }
  std::ostringstream ss;
  ss << fourcc_name << width << "x" << height << "x" << IntervalToFps(interval);
  return ss.str();
}
} // namespace libyuv
/*
* Copyright (c) 2011 The LibYuv project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
/*
* Common definitions for video, including fourcc and VideoFormat
*/
#ifndef LIBYUV_SOURCE_VIDEO_COMMON_H_
#define LIBYUV_SOURCE_VIDEO_COMMON_H_
#include <string>
#include "basic_types.h"
namespace libyuv {
//////////////////////////////////////////////////////////////////////////////
// Definition of fourcc.
//////////////////////////////////////////////////////////////////////////////
// Convert four characters to a fourcc code.
// Needs to be a macro otherwise the OS X compiler complains when the kFormat*
// constants are used in a switch.
// The first character lands in the least-significant byte, matching the
// byte order GetFourccName() extracts below.
#define FOURCC(a, b, c, d) (\
    (static_cast<uint32>(a)) | (static_cast<uint32>(b) << 8) | \
    (static_cast<uint32>(c) << 16) | (static_cast<uint32>(d) << 24))
// Get the name, that is, string with four characters, of a fourcc code.
// Get the name, that is, the four-character string, of a fourcc code.
// Byte 0 (least significant) becomes the first character.
inline std::string GetFourccName(uint32 fourcc) {
  char chars[4];
  for (int i = 0; i < 4; ++i) {
    chars[i] = static_cast<char>((fourcc >> (8 * i)) & 0xFF);
  }
  return std::string(chars, 4);
}
// Some good pages discussing FourCC codes:
// http://developer.apple.com/quicktime/icefloe/dispatch020.html
// http://www.fourcc.org/yuv.php
// All values are built with the FOURCC() macro above, so comparisons
// against codes read from capture devices are direct uint32 compares.
enum FourCC {
  // Canonical fourcc codes used in our code.
  FOURCC_I420 = FOURCC('I', '4', '2', '0'),
  FOURCC_YV12 = FOURCC('Y', 'V', '1', '2'),
  FOURCC_YUY2 = FOURCC('Y', 'U', 'Y', '2'),
  FOURCC_UYVY = FOURCC('U', 'Y', 'V', 'Y'),
  FOURCC_M420 = FOURCC('M', '4', '2', '0'),
  FOURCC_24BG = FOURCC('2', '4', 'B', 'G'),
  FOURCC_ABGR = FOURCC('A', 'B', 'G', 'R'),
  FOURCC_BGRA = FOURCC('B', 'G', 'R', 'A'),
  FOURCC_ARGB = FOURCC('A', 'R', 'G', 'B'),
  FOURCC_MJPG = FOURCC('M', 'J', 'P', 'G'),
  FOURCC_RAW = FOURCC('r', 'a', 'w', ' '),
  FOURCC_NV21 = FOURCC('N', 'V', '2', '1'),
  FOURCC_NV12 = FOURCC('N', 'V', '1', '2'),
  // Next four are Bayer RGB formats. The four characters define the order of
  // the colours in each 2x2 pixel grid, going left-to-right and top-to-bottom.
  FOURCC_RGGB = FOURCC('R', 'G', 'G', 'B'),
  FOURCC_BGGR = FOURCC('B', 'G', 'G', 'R'),
  FOURCC_GRBG = FOURCC('G', 'R', 'B', 'G'),
  FOURCC_GBRG = FOURCC('G', 'B', 'R', 'G'),
  // Aliases for canonical fourcc codes, replaced with their canonical
  // equivalents by CanonicalFourCC().
  FOURCC_IYUV = FOURCC('I', 'Y', 'U', 'V'),  // Alias for I420
  FOURCC_YU12 = FOURCC('Y', 'U', '1', '2'),  // Alias for I420
  FOURCC_YUYV = FOURCC('Y', 'U', 'Y', 'V'),  // Alias for YUY2
  FOURCC_YUVS = FOURCC('y', 'u', 'v', 's'),  // Alias for YUY2 on Mac
  FOURCC_HDYC = FOURCC('H', 'D', 'Y', 'C'),  // Alias for UYVY
  FOURCC_2VUY = FOURCC('2', 'v', 'u', 'y'),  // Alias for UYVY
  FOURCC_JPEG = FOURCC('J', 'P', 'E', 'G'),  // Alias for MJPG
  FOURCC_BA81 = FOURCC('B', 'A', '8', '1'),  // Alias for BGGR
  FOURCC_RGB3 = FOURCC('R', 'G', 'B', '3'),  // Alias for RAW
  FOURCC_BGR3 = FOURCC('B', 'G', 'R', '3'),  // Alias for 24BG
  // Match any fourcc.
  FOURCC_ANY = 0xFFFFFFFF,
};
// Converts fourcc aliases into canonical ones (see kFourCCAliases in
// video_common.cc); non-alias codes are returned unchanged.
uint32 CanonicalFourCC(uint32 fourcc);
//////////////////////////////////////////////////////////////////////////////
// Definition of VideoFormat.
//////////////////////////////////////////////////////////////////////////////
// Nanoseconds per second; VideoFormat frame intervals are in nanoseconds.
static const int64 kNumNanosecsPerSec = 1000000000;
// Describes one video capture/render format: pixel dimensions, frame
// interval (nanoseconds per frame) and color space as a fourcc code.
struct VideoFormat {
  // Smallest supported frame interval, i.e. a cap of 10000 fps.
  static const int64 kMinimumInterval = kNumNanosecsPerSec / 10000;  // 10k fps

  VideoFormat() : width(0), height(0), interval(0), fourcc(0) {}

  VideoFormat(int w, int h, int64 interval_ns, uint32 cc)
      : width(w),
        height(h),
        interval(interval_ns),
        fourcc(cc) {
  }

  VideoFormat(const VideoFormat& format)
      : width(format.width),
        height(format.height),
        interval(format.interval),
        fourcc(format.fourcc) {
  }

  // Convert frames-per-second to a frame interval in nanoseconds.
  // fps == 0 maps to kMinimumInterval instead of dividing by zero.
  static int64 FpsToInterval(int fps) {
    return fps ? kNumNanosecsPerSec / fps : kMinimumInterval;
  }

  // Convert a frame interval in nanoseconds to frames-per-second.
  static int IntervalToFps(int64 interval) {
    // Normalize the interval first, so fps never exceeds 10000.
    interval = libyuv::_max(interval, kMinimumInterval);
    return static_cast<int>(kNumNanosecsPerSec / interval);
  }

  bool operator==(const VideoFormat& format) const {
    return width == format.width && height == format.height &&
        interval == format.interval && fourcc == format.fourcc;
  }

  bool operator!=(const VideoFormat& format) const {
    return !(*this == format);
  }

  // Lexicographic order on (fourcc, width, height, interval). Note the
  // interval comparison is deliberately inverted: a smaller interval
  // (higher frame rate) sorts as "less", i.e. is preferred first.
  bool operator<(const VideoFormat& format) const {
    return (fourcc < format.fourcc) ||
        (fourcc == format.fourcc && width < format.width) ||
        (fourcc == format.fourcc && width == format.width &&
            height < format.height) ||
        (fourcc == format.fourcc && width == format.width &&
            height == format.height && interval > format.interval);
  }

  // Frame rate in frames per second, derived from interval.
  int framerate() const { return IntervalToFps(interval); }

  // Check if both width and height are 0.
  bool IsSize0x0() const { return 0 == width && 0 == height; }

  // Check if this format is less than another one by comparing the
  // resolution and frame rate (total pixels per second).
  bool IsPixelRateLess(const VideoFormat& format) const {
    // BUG FIX: compute in 64-bit. width * height * framerate() overflows
    // 32-bit int for large formats (e.g. 8K at high frame rates).
    return static_cast<int64>(width) * height * framerate() <
        static_cast<int64>(format.width) * format.height *
            format.framerate();
  }

  // Get a string presentation in the form of "fourcc width x height x fps"
  std::string ToString() const;

  int width;       // in number of pixels
  int height;      // in number of pixels
  int64 interval;  // in nanoseconds
  uint32 fourcc;   // color space. FOURCC_ANY means that any color space is OK.
};
// Result of video capturer start.
enum CaptureResult {
  CR_SUCCESS,    // The capturer started successfully.
  CR_PENDING,    // The capturer is still starting the capture device.
  CR_FAILURE,    // The capturer failed to start.
  CR_NO_DEVICE,  // The capturer has no device and failed to start.
};
} // namespace libyuv
#endif // LIBYUV_SOURCE_VIDEO_COMMON_H_
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment