ARGBInterpolateRow_SSSE3 for motion blur. Used to use bilinear row filter,…

ARGBInterpolateRow_SSSE3 for motion blur. Used to use bilinear row filter, which extrudes edges. This branches off the code so the extrude can be removed for Interpolate. BUG=none TEST=build\release\libyuv_unittest.exe --gtest_catch_exceptions=0 --gtest_filter=* Review URL: https://webrtc-codereview.appspot.com/786007 git-svn-id: http://libyuv.googlecode.com/svn/trunk@354 16f28f9a-4ce2-e073-06de-1de4eb20be90

ARGBInterpolateRow_SSSE3 for motion blur. Used to use bilinear row filter,…
ARGBInterpolateRow_SSSE3 for motion blur. Used to use bilinear row filter, which extrudes edges. This branches off the code so the extrude can be removed for Interpolate. BUG=none TEST=build\release\libyuv_unittest.exe --gtest_catch_exceptions=0 --gtest_filter=* Review URL: https://webrtc-codereview.appspot.com/786007 git-svn-id: http://libyuv.googlecode.com/svn/trunk@354 16f28f9a-4ce2-e073-06de-1de4eb20be90
9bcc9a25 · fbarchard@google.com · a2cc341b · 9bcc9a25 · 9bcc9a25 · 9bcc9a25
Commit 9bcc9a25 authored Sep 16, 2012 by fbarchard@google.com
20 changed files
--- a/README.chromium
+++ b/README.chromium
 Name: libyuv
 URL: http://code.google.com/p/libyuv/
-Version: 353
+Version: 354
 License: BSD
 License File: LICENSE

--- a/include/libyuv/basic_types.h
+++ b/include/libyuv/basic_types.h
@@ -65,6 +65,10 @@ typedef signed char int8;
    defined(__i386__) || defined(_M_IX86)
 #define CPU_X86 1
 #endif
+// Detect compiler is for arm.
+#if defined(__arm__) || defined(_M_ARM)
+#define CPU_ARM 1
+#endif
 #define ALIGNP(p, t) \
  (reinterpret_cast<uint8*>(((reinterpret_cast<uintptr_t>(p) + \

--- a/include/libyuv/compare.h
+++ b/include/libyuv/compare.h
@@ -21,7 +21,7 @@ extern "C" {
 // Compute a hash for specified memory.  Seed of 5381 recommended.
 uint32 HashDjb2(const uint8* src, uint64 count, uint32 seed);
-// Sum Square Error - used to compute Mean Square Error or PSNR
+// Sum Square Error - used to compute Mean Square Error or PSNR.
 uint64 ComputeSumSquareError(const uint8* src_a,
                             const uint8* src_b, int count);

--- a/include/libyuv/convert.h
+++ b/include/libyuv/convert.h
@@ -12,7 +12,7 @@
 #define INCLUDE_LIBYUV_CONVERT_H_
 #include "libyuv/basic_types.h"
-// TODO(fbarchard): Remove the following headers includes
+// TODO(fbarchard): Remove the following headers includes.
 #include "libyuv/convert_from.h"
 #include "libyuv/planar_functions.h"
 #include "libyuv/rotate.h"
@@ -22,7 +22,7 @@ namespace libyuv {
 extern "C" {
 #endif
-// Alias
+// Alias.
 #define I420ToI420 I420Copy
 // Copy I420 to I420.
@@ -112,56 +112,63 @@ int V210ToI420(const uint8* src_uyvy, int src_stride_uyvy,
               uint8* dst_v, int dst_stride_v,
               int width, int height);
-// ARGB little endian (bgra in memory) to I420
+// ARGB little endian (bgra in memory) to I420.
 int ARGBToI420(const uint8* src_frame, int src_stride_frame,
               uint8* dst_y, int dst_stride_y,
               uint8* dst_u, int dst_stride_u,
               uint8* dst_v, int dst_stride_v,
               int width, int height);
-// BGRA little endian (argb in memory) to I420
+// BGRA little endian (argb in memory) to I420.
 int BGRAToI420(const uint8* src_frame, int src_stride_frame,
               uint8* dst_y, int dst_stride_y,
               uint8* dst_u, int dst_stride_u,
               uint8* dst_v, int dst_stride_v,
               int width, int height);
-// ABGR little endian (rgba in memory) to I420
+// ABGR little endian (rgba in memory) to I420.
 int ABGRToI420(const uint8* src_frame, int src_stride_frame,
               uint8* dst_y, int dst_stride_y,
               uint8* dst_u, int dst_stride_u,
               uint8* dst_v, int dst_stride_v,
               int width, int height);
-// RGB little endian (bgr in memory) to I420
+// RGBA little endian (abgr in memory) to I420.
+int RGBAToI420(const uint8* src_frame, int src_stride_frame,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_u, int dst_stride_u,
+               uint8* dst_v, int dst_stride_v,
+               int width, int height);
+// RGB little endian (bgr in memory) to I420.
 int RGB24ToI420(const uint8* src_frame, int src_stride_frame,
                uint8* dst_y, int dst_stride_y,
                uint8* dst_u, int dst_stride_u,
                uint8* dst_v, int dst_stride_v,
                int width, int height);
-// RGB big endian (rgb in memory) to I420
+// RGB big endian (rgb in memory) to I420.
 int RAWToI420(const uint8* src_frame, int src_stride_frame,
              uint8* dst_y, int dst_stride_y,
              uint8* dst_u, int dst_stride_u,
              uint8* dst_v, int dst_stride_v,
              int width, int height);
-// RGB16 (RGBP fourcc) little endian to I420
+// RGB16 (RGBP fourcc) little endian to I420.
 int RGB565ToI420(const uint8* src_frame, int src_stride_frame,
                 uint8* dst_y, int dst_stride_y,
                 uint8* dst_u, int dst_stride_u,
                 uint8* dst_v, int dst_stride_v,
                 int width, int height);
-// RGB15 (RGBO fourcc) little endian to I420
+// RGB15 (RGBO fourcc) little endian to I420.
 int ARGB1555ToI420(const uint8* src_frame, int src_stride_frame,
                   uint8* dst_y, int dst_stride_y,
                   uint8* dst_u, int dst_stride_u,
                   uint8* dst_v, int dst_stride_v,
                   int width, int height);
-// RGB12 (R444 fourcc) little endian to I420
+// RGB12 (R444 fourcc) little endian to I420.
 int ARGB4444ToI420(const uint8* src_frame, int src_stride_frame,
                   uint8* dst_y, int dst_stride_y,
                   uint8* dst_u, int dst_stride_u,
@@ -169,7 +176,7 @@ int ARGB4444ToI420(const uint8* src_frame, int src_stride_frame,
                   int width, int height);
 #ifdef HAVE_JPEG
-// src_width/height provided by capture
+// src_width/height provided by capture.
 // dst_width/height for clipping determine final size.
 int MJPGToI420(const uint8* sample, size_t sample_size,
               uint8* dst_y, int dst_stride_y,

--- a/include/libyuv/convert_argb.h
+++ b/include/libyuv/convert_argb.h
@@ -28,7 +28,7 @@ namespace libyuv {
 extern "C" {
 #endif
-// Alias
+// Alias.
 #define ARGBToARGB ARGBCopy
 // Copy ARGB to ARGB.
@@ -112,17 +112,17 @@ int UYVYToARGB(const uint8* src_uyvy, int src_stride_uyvy,
 //                uint8* dst_argb, int dst_stride_argb,
 //                int width, int height);
-// BGRA little endian (argb in memory) to ARGB
+// BGRA little endian (argb in memory) to ARGB.
 int BGRAToARGB(const uint8* src_frame, int src_stride_frame,
               uint8* dst_argb, int dst_stride_argb,
               int width, int height);
-// ABGR little endian (rgba in memory) to ARGB
+// ABGR little endian (rgba in memory) to ARGB.
 int ABGRToARGB(const uint8* src_frame, int src_stride_frame,
               uint8* dst_argb, int dst_stride_argb,
               int width, int height);
-// RGBA little endian (abgr in memory) to ARGB
+// RGBA little endian (abgr in memory) to ARGB.
 int RGBAToARGB(const uint8* src_frame, int src_stride_frame,
               uint8* dst_argb, int dst_stride_argb,
               int width, int height);
@@ -130,27 +130,27 @@ int RGBAToARGB(const uint8* src_frame, int src_stride_frame,
 // Deprecated function name.
 #define BG24ToARGB RGB24ToARGB
-// RGB little endian (bgr in memory) to ARGB
+// RGB little endian (bgr in memory) to ARGB.
 int RGB24ToARGB(const uint8* src_frame, int src_stride_frame,
                uint8* dst_argb, int dst_stride_argb,
                int width, int height);
-// RGB big endian (rgb in memory) to ARGB
+// RGB big endian (rgb in memory) to ARGB.
 int RAWToARGB(const uint8* src_frame, int src_stride_frame,
              uint8* dst_argb, int dst_stride_argb,
              int width, int height);
-// RGB16 (RGBP fourcc) little endian to ARGB
+// RGB16 (RGBP fourcc) little endian to ARGB.
 int RGB565ToARGB(const uint8* src_frame, int src_stride_frame,
                 uint8* dst_argb, int dst_stride_argb,
                 int width, int height);
-// RGB15 (RGBO fourcc) little endian to ARGB
+// RGB15 (RGBO fourcc) little endian to ARGB.
 int ARGB1555ToARGB(const uint8* src_frame, int src_stride_frame,
                   uint8* dst_argb, int dst_stride_argb,
                   int width, int height);
-// RGB12 (R444 fourcc) little endian to ARGB
+// RGB12 (R444 fourcc) little endian to ARGB.
 int ARGB4444ToARGB(const uint8* src_frame, int src_stride_frame,
                   uint8* dst_argb, int dst_stride_argb,
                   int width, int height);
@@ -164,7 +164,7 @@ int MJPGToARGB(const uint8* sample, size_t sample_size,
               int dst_width, int dst_height);
 #endif
-// Note Bayer formats (BGGR) to ARGB are in format_conversion.h
+// Note Bayer formats (BGGR) to ARGB are in format_conversion.h.
 // Convert camera sample to ARGB with cropping, rotation and vertical flip.
 // "src_size" is needed to parse MJPG.

--- a/include/libyuv/convert_from.h
+++ b/include/libyuv/convert_from.h
@@ -19,9 +19,9 @@ namespace libyuv {
 extern "C" {
 #endif
-// See Also convert.h for conversions from formats to I420
+// See Also convert.h for conversions from formats to I420.
-// I420Copy in convert to I420ToI420
+// I420Copy in convert to I420ToI420.
 int I420ToI422(const uint8* src_y, int src_stride_y,
               const uint8* src_u, int src_stride_u,
@@ -47,7 +47,7 @@ int I420ToI411(const uint8* src_y, int src_stride_y,
               uint8* dst_v, int dst_stride_v,
               int width, int height);
-// Copy to I400.  Source can be I420,422,444,400,NV12,NV21
+// Copy to I400.  Source can be I420, I422, I444, I400, NV12 or NV21.
 int I400Copy(const uint8* src_y, int src_stride_y,
             uint8* dst_y, int dst_stride_y,
             int width, int height);
@@ -92,6 +92,12 @@ int I420ToABGR(const uint8* src_y, int src_stride_y,
               uint8* dst_argb, int dst_stride_argb,
               int width, int height);
+int I420ToRGBA(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
+               uint8* dst_rgba, int dst_stride_rgba,
+               int width, int height);
 int I420ToRGB24(const uint8* src_y, int src_stride_y,
                const uint8* src_u, int src_stride_u,
                const uint8* src_v, int src_stride_v,
@@ -122,7 +128,7 @@ int I420ToARGB4444(const uint8* src_y, int src_stride_y,
                   uint8* dst_frame, int dst_stride_frame,
                   int width, int height);
-// Note Bayer formats (BGGR) To I420 are in format_conversion.h
+// Note Bayer formats (BGGR) To I420 are in format_conversion.h.
 // Convert I420 to specified format.
 // "dst_sample_stride" is bytes in a row for the destination. Pass 0 if the

--- a/include/libyuv/format_conversion.h
+++ b/include/libyuv/format_conversion.h
@@ -43,7 +43,7 @@ int BayerRGGBToI420(const uint8* src_bayer, int src_stride_bayer,
                    uint8* dst_v, int dst_stride_v,
                    int width, int height);
-// Temporary API mapper
+// Temporary API mapper.
 #define BayerRGBToI420(b, bs, f, y, ys, u, us, v, vs, w, h) \
    BayerToI420(b, bs, y, ys, u, us, v, vs, w, h, f)
@@ -79,7 +79,7 @@ int I420ToBayerRGGB(const uint8* src_y, int src_stride_y,
                    uint8* dst_frame, int dst_stride_frame,
                    int width, int height);
-// Temporary API mapper
+// Temporary API mapper.
 #define I420ToBayerRGB(y, ys, u, us, v, vs, b, bs, f, w, h) \
    I420ToBayer(y, ys, u, us, v, vs, b, bs, w, h, f)
@@ -107,7 +107,7 @@ int BayerRGGBToARGB(const uint8* src_bayer, int src_stride_bayer,
                    uint8* dst_argb, int dst_stride_argb,
                    int width, int height);
-// Temporary API mapper
+// Temporary API mapper.
 #define BayerRGBToARGB(b, bs, f, a, as, w, h) BayerToARGB(b, bs, a, as, w, h, f)
 int BayerToARGB(const uint8* src_bayer, int src_stride_bayer,
@@ -132,7 +132,7 @@ int ARGBToBayerRGGB(const uint8* src_argb, int src_stride_argb,
                    uint8* dst_bayer, int dst_stride_bayer,
                    int width, int height);
-// Temporary API mapper
+// Temporary API mapper.
 #define ARGBToBayerRGB(a, as, b, bs, f, w, h) ARGBToBayer(b, bs, a, as, w, h, f)
 int ARGBToBayer(const uint8* src_argb, int src_stride_argb,

--- a/include/libyuv/mjpeg_decoder.h
+++ b/include/libyuv/mjpeg_decoder.h
@@ -13,6 +13,8 @@
 #include "libyuv/basic_types.h"
+// NOTE: For a simplified public API use convert.h MJPGToI420().
 struct jpeg_common_struct;
 struct jpeg_decompress_struct;
 struct jpeg_source_mgr;
@@ -85,10 +87,10 @@ class MJpegDecoder {
  int GetVertSubSampFactor(int component);
-  // Public for testability
+  // Public for testability.
  int GetImageScanlinesPerImcuRow();
-  // Public for testability
+  // Public for testability.
  int GetComponentScanlinesPerImcuRow(int component);
  // Width of a component in bytes.

--- a/include/libyuv/planar_functions.h
+++ b/include/libyuv/planar_functions.h
@@ -13,7 +13,7 @@
 #include "libyuv/basic_types.h"
-// TODO(fbarchard): Remove the following headers includes
+// TODO(fbarchard): Remove the following headers includes.
 #include "libyuv/convert.h"
 #include "libyuv/convert_argb.h"
@@ -31,7 +31,7 @@ void CopyPlane(const uint8* src_y, int src_stride_y,
               uint8* dst_y, int dst_stride_y,
               int width, int height);
-// Convert I420 to I400.  (calls CopyPlane ignoring u/v)
+// Convert I420 to I400.  (calls CopyPlane ignoring u/v).
 int I420ToI400(const uint8* src_y, int src_stride_y,
               uint8* dst_y, int dst_stride_y,
               uint8* dst_u, int dst_stride_u,
@@ -103,7 +103,7 @@ int ARGBToI400(const uint8* src_argb, int src_stride_argb,
               uint8* dst_y, int dst_stride_y,
               int width, int height);
-// ARGB little endian (bgra in memory) to I422
+// ARGB little endian (bgra in memory) to I422.
 int ARGBToI422(const uint8* src_frame, int src_stride_frame,
               uint8* dst_y, int dst_stride_y,
               uint8* dst_u, int dst_stride_u,

--- a/include/libyuv/rotate.h
+++ b/include/libyuv/rotate.h
@@ -31,7 +31,7 @@ enum RotationMode {
  kRotateCounterClockwise = 270,
 };
-// Rotate I420 frame
+// Rotate I420 frame.
 int I420Rotate(const uint8* src_y, int src_stride_y,
               const uint8* src_u, int src_stride_u,
               const uint8* src_v, int src_stride_v,
@@ -40,7 +40,7 @@ int I420Rotate(const uint8* src_y, int src_stride_y,
               uint8* dst_v, int dst_stride_v,
               int src_width, int src_height, RotationMode mode);
-// Rotate NV12 input and store in I420
+// Rotate NV12 input and store in I420.
 int NV12ToI420Rotate(const uint8* src_y, int src_stride_y,
                     const uint8* src_uv, int src_stride_uv,
                     uint8* dst_y, int dst_stride_y,

--- a/include/libyuv/scale.h
+++ b/include/libyuv/scale.h
@@ -20,9 +20,9 @@ extern "C" {
 // Supported filtering
 enum FilterMode {
-  kFilterNone = 0,  // Point sample; Fastest
+  kFilterNone = 0,  // Point sample; Fastest.
  kFilterBilinear = 1,  // Faster than box, but lower quality scaling down.
-  kFilterBox = 2  // Highest quality
+  kFilterBox = 2  // Highest quality.
 };
 // Scale a YUV plane.
@@ -52,7 +52,7 @@ int I420Scale(const uint8* src_y, int src_stride_y,
              int dst_width, int dst_height,
              FilterMode filtering);
-// Legacy API.  Deprecated
+// Legacy API.  Deprecated.
 int Scale(const uint8* src_y, const uint8* src_u, const uint8* src_v,
          int src_stride_y, int src_stride_u, int src_stride_v,
          int src_width, int src_height,
@@ -61,12 +61,12 @@ int Scale(const uint8* src_y, const uint8* src_u, const uint8* src_v,
          int dst_width, int dst_height,
          bool interpolate);
-// Legacy API.  Deprecated
+// Legacy API.  Deprecated.
 int ScaleOffset(const uint8* src, int src_width, int src_height,
                uint8* dst, int dst_width, int dst_height, int dst_yoffset,
                bool interpolate);
-// For testing, allow disabling of optimizations.
+// For testing, allow disabling of specialized scalers.
 void SetUseReferenceImpl(bool use);
 #ifdef __cplusplus

--- a/include/libyuv/version.h
+++ b/include/libyuv/version.h
@@ -11,6 +11,6 @@
 #ifndef INCLUDE_LIBYUV_VERSION_H_  // NOLINT
 #define INCLUDE_LIBYUV_VERSION_H_
-#define LIBYUV_VERSION 353
+#define LIBYUV_VERSION 354
 #endif  // INCLUDE_LIBYUV_VERSION_H_  NOLINT
--- a/include/libyuv/video_common.h
+++ b/include/libyuv/video_common.h
@@ -8,7 +8,7 @@
 *  be found in the AUTHORS file in the root of the source tree.
 */
-// Common definitions for video, including fourcc and VideoFormat
+// Common definitions for video, including fourcc and VideoFormat.
 #ifndef INCLUDE_LIBYUV_VIDEO_COMMON_H_  // NOLINT
 #define INCLUDE_LIBYUV_VIDEO_COMMON_H_
@@ -107,7 +107,7 @@ enum FourCCBpp {
  FOURCC_BPP_UYVY = 16,
  FOURCC_BPP_M420 = 12,
  FOURCC_BPP_Q420 = 12,
-  FOURCC_BPP_V210 = 22,  // 22.5 actually
+  FOURCC_BPP_V210 = 22,  // 128 / 6 actually.
  FOURCC_BPP_24BG = 24,
  FOURCC_BPP_ARGB = 32,
  FOURCC_BPP_BGRA = 32,

--- a/source/planar_functions.cc
+++ b/source/planar_functions.cc
@@ -598,7 +598,7 @@ int NV21ToRGB565(const uint8* src_y, int src_stride_y,
 #if !defined(YUV_DISABLE_ASM) && defined(__ARM_NEON__)
 #define HAS_SETROW_NEON
 static void SetRow8_NEON(uint8* dst, uint32 v32, int count) {
-  asm volatile (
+  asm volatile (  // NOLINT
    "vdup.u32  q0, %2                          \n"  // duplicate 4 ints
    "1:                                        \n"
    "subs      %1, %1, #16                     \n"  // 16 bytes per loop
@@ -669,7 +669,7 @@ static void SetRows32_X86(uint8* dst, uint32 v32, int width,
 #define HAS_SETROW_X86
 static void SetRow8_X86(uint8* dst, uint32 v32, int width) {
  size_t width_tmp = static_cast<size_t>(width);
-  asm volatile (
+  asm volatile (  // NOLINT
    "shr       $0x2,%1                         \n"
    "rep stosl                                 \n"
    : "+D"(dst),       // %0
@@ -683,7 +683,7 @@ static void SetRows32_X86(uint8* dst, uint32 v32, int width,
  for (int y = 0; y < height; ++y) {
    size_t width_tmp = static_cast<size_t>(width);
    uint32* d = reinterpret_cast<uint32*>(dst);
-    asm volatile (
+    asm volatile (  // NOLINT
      "rep stosl                               \n"
      : "+D"(d),         // %0
        "+c"(width_tmp)  // %1
@@ -1176,17 +1176,6 @@ int ARGBShade(const uint8* src_argb, int src_stride_argb,
  return 0;
 }
-#if !defined(YUV_DISABLE_ASM) && (defined(_M_IX86) || \
-    (defined(__x86_64__) || defined(__i386__)))
-#define HAS_SCALEARGBFILTERROWS_SSSE3
-#endif
-void ScaleARGBFilterRows_C(uint8* dst_ptr,
-                           const uint8* src_ptr, ptrdiff_t src_stride,
-                           int dst_width, int source_y_fraction);
-void ScaleARGBFilterRows_SSSE3(uint8* dst_ptr,
-                               const uint8* src_ptr, ptrdiff_t src_stride,
-                               int dst_width, int source_y_fraction);
 // Interpolate 2 ARGB images by specified amount (0 to 255).
 int ARGBInterpolate(const uint8* src_argb0, int src_stride_argb0,
                    const uint8* src_argb1, int src_stride_argb1,
@@ -1201,24 +1190,20 @@ int ARGBInterpolate(const uint8* src_argb0, int src_stride_argb0,
    dst_argb = dst_argb + (height - 1) * dst_stride_argb;
    dst_stride_argb = -dst_stride_argb;
  }
-  void (*ScaleARGBFilterRows)(uint8* dst_ptr, const uint8* src_ptr,
+  void (*ARGBInterpolateRow)(uint8* dst_ptr, const uint8* src_ptr,
                              ptrdiff_t src_stride, int dst_width,
-                              int source_y_fraction) = ScaleARGBFilterRows_C;
+                              int source_y_fraction) = ARGBInterpolateRow_C;
-#if defined(HAS_SCALEARGBFILTERROWS_SSSE3)
+#if defined(HAS_ARGBINTERPOLATEROW_SSSE3)
  if (TestCpuFlag(kCpuHasSSSE3) &&
      IS_ALIGNED(src_argb0, 16) && IS_ALIGNED(src_stride_argb0, 16) &&
      IS_ALIGNED(src_argb1, 16) && IS_ALIGNED(src_stride_argb1, 16) &&
      IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
-    ScaleARGBFilterRows = ScaleARGBFilterRows_SSSE3;
+    ARGBInterpolateRow = ARGBInterpolateRow_SSSE3;
  }
 #endif
-  uint8 last16[16];
  for (int y = 0; y < height; ++y) {
-    // Filter extrudes edge for its scaling purpose.
+    ARGBInterpolateRow(dst_argb, src_argb0, src_argb1 - src_argb0,
-    memcpy(last16, dst_argb + width * 4, 16);  // Save last 16 beyond end.
+                       width, interpolation);
-    ScaleARGBFilterRows(dst_argb, src_argb0, src_argb1 - src_argb0,
-                        width, interpolation);
-    memcpy(dst_argb + width * 4, last16, 16);  // Restore last 16 beyond end.
    src_argb0 += src_stride_argb0;
    src_argb1 += src_stride_argb1;
    dst_argb += dst_stride_argb;

--- a/source/row.h
+++ b/source/row.h
@@ -83,9 +83,7 @@ extern "C" {
 #define HAS_CUMULATIVESUMTOAVERAGE_SSE2
 #define HAS_ARGBSHADE_SSE2
 #define HAS_ARGBAFFINEROW_SSE2
-// HAS_ARGBBLENDROW_SSE2 may be faster than SSSE3 version on some CPUs, so
+#define HAS_ARGBINTERPOLATEROW_SSSE3
-// enable it here instead of LIBYUV_SSSE3_ONLY section.
-#define HAS_ARGBBLENDROW_SSE2
 #endif
 // The following are Windows only:
@@ -102,6 +100,7 @@ extern "C" {
    !defined(LIBYUV_SSSE3_ONLY)
 #define HAS_MIRRORROW_SSE2
 #define HAS_ARGBATTENUATE_SSE2
+#define HAS_ARGBBLENDROW_SSE2
 #endif
 // The following are available on Neon platforms
@@ -553,6 +552,13 @@ void ARGBAffineRow_C(const uint8* src_argb, int src_argb_stride,
 void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
                        uint8* dst_argb, const float* uv_dudv, int width);
+void ARGBInterpolateRow_C(uint8* dst_ptr, const uint8* src_ptr,
+                          ptrdiff_t src_stride,
+                          int dst_width, int source_y_fraction);
+void ARGBInterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
+                              ptrdiff_t src_stride, int dst_width,
+                              int source_y_fraction);
 #ifdef __cplusplus
 }  // extern "C"
 }  // namespace libyuv

--- a/source/row_common.cc
+++ b/source/row_common.cc
@@ -1081,6 +1081,29 @@ void ARGBAffineRow_C(const uint8* src_argb, int src_argb_stride,
  }
 }
+// Blends two ARGB rows into one.  C reference version.
+//
+// dst_ptr:           output row; dst_width ARGB pixels (4 bytes each) are
+//                    written.
+// src_ptr:           first input row.
+// src_stride:        byte offset from src_ptr to the second input row.
+// dst_width:         number of pixels to produce; zero or negative writes
+//                    nothing.
+// source_y_fraction: weight of the second row in 1/256 units; the first row
+//                    is weighted by 256 - source_y_fraction.
+//
+// Every channel byte is blended independently, so the row is walked one byte
+// at a time.  That keeps odd widths in bounds: stepping two pixels (8 bytes)
+// per pass would read and write one pixel beyond the end of the rows whenever
+// dst_width is odd.
+void ARGBInterpolateRow_C(uint8* dst_ptr, const uint8* src_ptr,
+                          ptrdiff_t src_stride,
+                          int dst_width, int source_y_fraction) {
+  const int y1_fraction = source_y_fraction;
+  const int y0_fraction = 256 - y1_fraction;
+  const uint8* src_ptr1 = src_ptr + src_stride;
+  const int dst_bytes = dst_width * 4;  // 4 bytes per ARGB pixel.
+  for (int i = 0; i < dst_bytes; ++i) {
+    // Weighted sum of the two rows, scaled back down from 1/256 units.
+    dst_ptr[i] = (src_ptr[i] * y0_fraction + src_ptr1[i] * y1_fraction) >> 8;
+  }
+}
 #ifdef __cplusplus
 }  // extern "C"
 }  // namespace libyuv

--- a/source/row_posix.cc
+++ b/source/row_posix.cc
@@ -3560,6 +3560,71 @@ void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
 }
 #endif  // HAS_ARGBAFFINEROW_SSE2
+// Interpolates two ARGB rows, 4x2 -> 4x1 pixels per pass. SSSE3 version.
+void ARGBInterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
+                              ptrdiff_t src_stride, int dst_width,
+                              int source_y_fraction) {
+  asm volatile (
+    "sub       %1,%0                           \n"  // %0 = dst - src offset
+    "shr       %3                              \n"  // fraction >>= 1
+    "cmp       $0x0,%3                         \n"
+    "je        2f                              \n"  // 0: copy first row
+    "cmp       $0x40,%3                        \n"
+    "je        3f                              \n"  // 64: average the rows
+    "movd      %3,%%xmm0                       \n"  // f: second row weight
+    "neg       %3                              \n"
+    "add       $0x80,%3                        \n"  // %3 = 128 - f
+    "movd      %3,%%xmm5                       \n"  // first row weight
+    "punpcklbw %%xmm0,%%xmm5                   \n"  // byte pair (128-f, f)
+    "punpcklwd %%xmm5,%%xmm5                   \n"
+    "pshufd    $0x0,%%xmm5,%%xmm5              \n"  // pair in every word
+    ".p2align  4                               \n"
+  "1:                                          \n"  // general blend loop
+    "movdqa    (%1),%%xmm0                     \n"  // 16 bytes of row 0
+    "movdqa    (%1,%4,1),%%xmm2                \n"  // 16 bytes of row 1
+    "movdqa    %%xmm0,%%xmm1                   \n"
+    "punpcklbw %%xmm2,%%xmm0                   \n"  // interleave rows, low
+    "punpckhbw %%xmm2,%%xmm1                   \n"  // interleave rows, high
+    "pmaddubsw %%xmm5,%%xmm0                   \n"  // r0*(128-f) + r1*f
+    "pmaddubsw %%xmm5,%%xmm1                   \n"
+    "psrlw     $0x7,%%xmm0                     \n"  // divide by 128
+    "psrlw     $0x7,%%xmm1                     \n"
+    "packuswb  %%xmm1,%%xmm0                   \n"  // words back to bytes
+    "sub       $0x4,%2                         \n"  // 4 pixels per pass
+    "movdqa    %%xmm0,(%1,%0,1)                \n"  // store at src + offset
+    "lea       0x10(%1),%1                     \n"  // src += 16, flags kept
+    "jg        1b                              \n"  // flags from sub above
+    "jmp       4f                              \n"
+    ".p2align  4                               \n"
+  "2:                                          \n"  // copy loop
+    "movdqa    (%1),%%xmm0                     \n"
+    "sub       $0x4,%2                         \n"
+    "movdqa    %%xmm0,(%1,%0,1)                \n"
+    "lea       0x10(%1),%1                     \n"
+    "jg        2b                              \n"
+    "jmp       4f                              \n"
+    ".p2align  4                               \n"
+  "3:                                          \n"  // average loop
+    "movdqa    (%1),%%xmm0                     \n"
+    "pavgb     (%1,%4,1),%%xmm0                \n"  // rounded byte average
+    "sub       $0x4,%2                         \n"
+    "movdqa    %%xmm0,(%1,%0,1)                \n"
+    "lea       0x10(%1),%1                     \n"
+    "jg        3b                              \n"
+  "4:                                          \n"  // common exit
+    ".p2align  4                               \n"
+  : "+r"(dst_ptr),     // %0
+    "+r"(src_ptr),     // %1
+    "+r"(dst_width),   // %2
+    "+r"(source_y_fraction)  // %3
+  : "r"(static_cast<intptr_t>(src_stride))  // %4
+  : "memory", "cc"
+#if defined(__SSE2__)
+    , "xmm0", "xmm1", "xmm2", "xmm5"
+#endif
+  );
+}
 #endif  // defined(__x86_64__) || defined(__i386__)
 #ifdef __cplusplus

--- a/source/row_win.cc
+++ b/source/row_win.cc
@@ -3664,6 +3664,81 @@ void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
 }
 #endif  // HAS_ARGBAFFINEROW_SSE2
+// Interpolates two ARGB rows, 4x2 -> 4x1 pixels per pass. SSSE3 version.
+__declspec(naked) __declspec(align(16))
+void ARGBInterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
+                              ptrdiff_t src_stride, int dst_width,
+                              int source_y_fraction) {
+  __asm {
+    push       esi
+    push       edi
+    mov        edi, [esp + 8 + 4]   // dst_ptr
+    mov        esi, [esp + 8 + 8]   // src_ptr
+    mov        edx, [esp + 8 + 12]  // src_stride
+    mov        ecx, [esp + 8 + 16]  // dst_width
+    mov        eax, [esp + 8 + 20]  // source_y_fraction (0..255)
+    sub        edi, esi  // edi = dst - src offset
+    shr        eax, 1  // halve the fraction
+    cmp        eax, 0
+    je         xloop1  // 0: copy first row
+    cmp        eax, 64
+    je         xloop2  // 64: average the rows
+    movd       xmm0, eax  // high fraction 0..127
+    neg        eax
+    add        eax, 128
+    movd       xmm5, eax  // low fraction 128..1
+    punpcklbw  xmm5, xmm0  // byte pair (low fraction, high fraction)
+    punpcklwd  xmm5, xmm5
+    pshufd     xmm5, xmm5, 0  // pair repeated in every word
+    align      16
+  xloop:
+    movdqa     xmm0, [esi]  // 16 bytes of row 0
+    movdqa     xmm2, [esi + edx]  // 16 bytes of row 1
+    movdqa     xmm1, xmm0
+    punpcklbw  xmm0, xmm2  // interleave row 0 / row 1 bytes, low half
+    punpckhbw  xmm1, xmm2  // interleave row 0 / row 1 bytes, high half
+    pmaddubsw  xmm0, xmm5  // row0 * low fraction + row1 * high fraction
+    pmaddubsw  xmm1, xmm5
+    psrlw      xmm0, 7  // divide by 128
+    psrlw      xmm1, 7
+    packuswb   xmm0, xmm1  // words back to bytes
+    sub        ecx, 4  // 4 pixels per pass
+    movdqa     [esi + edi], xmm0  // store at src + offset, i.e. dst
+    lea        esi, [esi + 16]  // advance src; lea leaves flags alone
+    jg         xloop  // uses flags from the sub above
+    pop        edi  // restore registers pushed on entry
+    pop        esi
+    ret
+    align      16
+  xloop1:  // copy loop
+    movdqa     xmm0, [esi]
+    sub        ecx, 4
+    movdqa     [esi + edi], xmm0
+    lea        esi, [esi + 16]
+    jg         xloop1
+    pop        edi
+    pop        esi
+    ret
+    align      16
+  xloop2:  // average loop
+    movdqa     xmm0, [esi]
+    pavgb      xmm0, [esi + edx]  // rounded byte average of the two rows
+    sub        ecx, 4
+    movdqa     [esi + edi], xmm0
+    lea        esi, [esi + 16]
+    jg         xloop2
+    pop        edi
+    pop        esi
+    ret
+  }
+}
 #endif  // _M_IX86
 #ifdef __cplusplus

--- a/unit_test/planar_test.cc
+++ b/unit_test/planar_test.cc
@@ -80,7 +80,7 @@ TEST_F(libyuvTest, FMT_PLANAR##To##FMT_B##N##_OptVsC) {                        \
 }
 #define TESTPLANARTOB(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B)          \
-    TESTPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ,)          \
+    TESTPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, , +)        \
    TESTPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, Invert, -)
 TESTPLANARTOB(I420, 2, 2, ARGB, 4)
@@ -151,7 +151,7 @@ TEST_F(libyuvTest, FMT_PLANAR##To##FMT_B##N##_OptVsC) {                        \
 }
 #define TESTBIPLANARTOB(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B)        \
-    TESTBIPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ,)        \
+    TESTBIPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, , +)      \
    TESTBIPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, Invert, -)
 TESTBIPLANARTOB(NV12, 2, 2, ARGB, 4)
@@ -233,7 +233,7 @@ TEST_F(libyuvTest, FMT_A##To##FMT_PLANAR##N##_OptVsC) {                        \
 }
 #define TESTATOPLANAR(FMT_A, BPP_A, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y)          \
-    TESTATOPLANARI(FMT_A, BPP_A, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, ,)          \
+    TESTATOPLANARI(FMT_A, BPP_A, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, , +)        \
    TESTATOPLANARI(FMT_A, BPP_A, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, Invert, -)
 TESTATOPLANAR(ARGB, 4, I420, 2, 2)
@@ -293,7 +293,7 @@ TEST_F(libyuvTest, FMT_A##To##FMT_B##N##_OptVsC) {                             \
  free_aligned_buffer_16(dst_argb_opt)                                         \
 }
 #define TESTATOB(FMT_A, BPP_A, STRIDE_A, FMT_B, BPP_B)                         \
-    TESTATOBI(FMT_A, BPP_A, STRIDE_A, FMT_B, BPP_B, ,)                         \
+    TESTATOBI(FMT_A, BPP_A, STRIDE_A, FMT_B, BPP_B, , +)                       \
    TESTATOBI(FMT_A, BPP_A, STRIDE_A, FMT_B, BPP_B, Invert, -)
 TESTATOB(ARGB, 4, 4, ARGB, 4)
@@ -853,14 +853,9 @@ TEST_F(libyuvTest, TestShade) {
 }
 TEST_F(libyuvTest, TestInterpolate) {
-  // Interpolate internally used bilinear filtering, which duplicates the last
-  // value, but the interpolate saves and restores it.  The buffer must be
-  // padded by 16 extra bytes.  TODO(fbarchard): Reimplement interpolate with
-  // code that does not duplicate the last value and remove kPad.
-  const int kPad = 16;
  SIMD_ALIGNED(uint8 orig_pixels_0[256][4]);
  SIMD_ALIGNED(uint8 orig_pixels_1[256][4]);
-  SIMD_ALIGNED(uint8 interpolate_pixels[256 + kPad][4]);
+  SIMD_ALIGNED(uint8 interpolate_pixels[256][4]);
  orig_pixels_0[0][0] = 16u;
  orig_pixels_0[0][1] = 32u;
@@ -930,7 +925,7 @@ TEST_F(libyuvTest, TestInterpolate) {
  EXPECT_EQ(16u, interpolate_pixels[0][2]);
  EXPECT_EQ(32u, interpolate_pixels[0][3]);
-  for (int i = 0; i < benchmark_iterations_ * 1280 * 720 / 256; ++i) {
+  for (int i = 0; i < benchmark_iterations_ * (1280 * 720 / 256); ++i) {
    ARGBInterpolate(&orig_pixels_0[0][0], 0, &orig_pixels_1[0][0], 0,
                    &interpolate_pixels[0][0], 0, 256, 1, 128);
  }

--- a/unit_test/version_test.cc
+++ b/unit_test/version_test.cc
@@ -25,7 +25,9 @@ TEST_F(libyuvTest, TestVersion) {
  printf("LIBYUV_VERSION %d\n", LIBYUV_VERSION);
 #ifdef LIBYUV_SVNREVISION
  const char *ver = strchr(LIBYUV_SVNREVISION, ':');
-  if (!ver) {
+  if (ver) {
+    ++ver;
+  } else {
    ver = LIBYUV_SVNREVISION;
  }
  int svn_revision = atoi(ver);