Commit 000d2fa9 authored by Frank Barchard

Libyuv MIPS DSPR2 optimizations.

Optimized functions:

I444ToARGBRow_DSPR2
I422ToARGB4444Row_DSPR2
I422ToARGB1555Row_DSPR2
NV12ToARGBRow_DSPR2
BGRAToUVRow_DSPR2
BGRAToYRow_DSPR2
ABGRToUVRow_DSPR2
ARGBToYRow_DSPR2
ABGRToYRow_DSPR2
RGBAToUVRow_DSPR2
RGBAToYRow_DSPR2
ARGBToUVRow_DSPR2
RGB24ToARGBRow_DSPR2
RAWToARGBRow_DSPR2
RGB565ToARGBRow_DSPR2
ARGB1555ToARGBRow_DSPR2
ARGB4444ToARGBRow_DSPR2
ScaleAddRow_DSPR2

Bug-fixes in functions:

ScaleRowDown2_DSPR2
ScaleRowDown4_DSPR2

BUG=

Review-Url: https://codereview.chromium.org/2626123003 .
parent 288bfbef
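All of the new kernels are hooked up through libyuv's standard dispatch pattern: a runtime TestCpuFlag(kCpuHasDSPR2) check first selects the width-agnostic _Any_DSPR2 wrapper, then the bare DSPR2 kernel when the row width is a multiple of the kernel's step, as in this hunk from ARGBToI420:

#if defined(HAS_ARGBTOYROW_DSPR2)
  if (TestCpuFlag(kCpuHasDSPR2)) {
    ARGBToYRow = ARGBToYRow_Any_DSPR2;  // any width; tail done via scratch
    if (IS_ALIGNED(width, 8)) {
      ARGBToYRow = ARGBToYRow_DSPR2;  // fast path: 8 pixels per step
    }
  }
#endif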
@@ -364,6 +364,23 @@ extern "C" {
#define HAS_MIRRORROW_DSPR2
#define HAS_MIRRORUVROW_DSPR2
#define HAS_SPLITUVROW_DSPR2
#define HAS_RGB24TOARGBROW_DSPR2
#define HAS_RAWTOARGBROW_DSPR2
#define HAS_RGB565TOARGBROW_DSPR2
#define HAS_ARGB1555TOARGBROW_DSPR2
#define HAS_ARGB4444TOARGBROW_DSPR2
#define HAS_I444TOARGBROW_DSPR2
#define HAS_I422TOARGB4444ROW_DSPR2
#define HAS_I422TOARGB1555ROW_DSPR2
#define HAS_NV12TOARGBROW_DSPR2
#define HAS_BGRATOUVROW_DSPR2
#define HAS_BGRATOYROW_DSPR2
#define HAS_ABGRTOUVROW_DSPR2
#define HAS_ARGBTOYROW_DSPR2
#define HAS_ABGRTOYROW_DSPR2
#define HAS_RGBATOUVROW_DSPR2
#define HAS_RGBATOYROW_DSPR2
#define HAS_ARGBTOUVROW_DSPR2
#endif
#endif
@@ -660,6 +677,30 @@ void UYVYToARGBRow_NEON(const uint8* src_uyvy,
uint8* dst_argb,
const struct YuvConstants* yuvconstants,
int width);
void I444ToARGBRow_DSPR2(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
uint8* dst_argb,
const struct YuvConstants* yuvconstants,
int width);
void I422ToARGB4444Row_DSPR2(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
uint8* dst_argb4444,
const struct YuvConstants* yuvconstants,
int width);
void I422ToARGB1555Row_DSPR2(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
uint8* dst_argb1555,
const struct YuvConstants* yuvconstants,
int width);
void NV12ToARGBRow_DSPR2(const uint8* src_y,
const uint8* src_uv,
uint8* dst_argb,
const struct YuvConstants* yuvconstants,
int width);
void I422ToARGBRow_MSA(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
@@ -789,6 +830,30 @@ void RAWToYRow_NEON(const uint8* src_raw, uint8* dst_y, int width);
void RGB565ToYRow_NEON(const uint8* src_rgb565, uint8* dst_y, int width);
void ARGB1555ToYRow_NEON(const uint8* src_argb1555, uint8* dst_y, int width);
void ARGB4444ToYRow_NEON(const uint8* src_argb4444, uint8* dst_y, int width);
void BGRAToUVRow_DSPR2(const uint8* src_bgra,
int src_stride_bgra,
uint8* dst_u,
uint8* dst_v,
int width);
void BGRAToYRow_DSPR2(const uint8* src_bgra, uint8* dst_y, int width);
void ABGRToUVRow_DSPR2(const uint8* src_abgr,
int src_stride_abgr,
uint8* dst_u,
uint8* dst_v,
int width);
void ARGBToYRow_DSPR2(const uint8* src_argb, uint8* dst_y, int width);
void ABGRToYRow_DSPR2(const uint8* src_abgr, uint8* dst_y, int width);
void RGBAToUVRow_DSPR2(const uint8* src_rgba,
int src_stride_rgba,
uint8* dst_u,
uint8* dst_v,
int width);
void RGBAToYRow_DSPR2(const uint8* src_rgba, uint8* dst_y, int width);
void ARGBToUVRow_DSPR2(const uint8* src_argb,
int src_stride_argb,
uint8* dst_u,
uint8* dst_v,
int width);
void ARGBToYRow_C(const uint8* src_argb, uint8* dst_y, int width);
void ARGBToYJRow_C(const uint8* src_argb, uint8* dst_y, int width);
void BGRAToYRow_C(const uint8* src_bgra, uint8* dst_y, int width);
@@ -817,6 +882,10 @@ void RGB565ToYRow_Any_NEON(const uint8* src_rgb565, uint8* dst_y, int width);
void ARGB1555ToYRow_Any_NEON(const uint8* src_argb1555,
uint8* dst_y,
int width);
void BGRAToYRow_Any_DSPR2(const uint8* src_bgra, uint8* dst_y, int width);
void ARGBToYRow_Any_DSPR2(const uint8* src_argb, uint8* dst_y, int width);
void ABGRToYRow_Any_DSPR2(const uint8* src_abgr, uint8* dst_y, int width);
void RGBAToYRow_Any_DSPR2(const uint8* src_rgba, uint8* dst_y, int width);
void ARGB4444ToYRow_Any_NEON(const uint8* src_argb4444,
uint8* dst_y,
int width);
@@ -955,6 +1024,36 @@ void ARGB4444ToUVRow_Any_NEON(const uint8* src_argb4444,
uint8* dst_u,
uint8* dst_v,
int width);
void BGRAToUVRow_Any_DSPR2(const uint8* src_bgra,
int src_stride_bgra,
uint8* dst_u,
uint8* dst_v,
int width);
void ABGRToUVRow_Any_DSPR2(const uint8* src_abgr,
int src_stride_abgr,
uint8* dst_u,
uint8* dst_v,
int width);
void RGBAToUVRow_Any_DSPR2(const uint8* src_rgba,
int src_stride_rgba,
uint8* dst_u,
uint8* dst_v,
int width);
void ARGBToUVRow_Any_DSPR2(const uint8* src_argb,
int src_stride_argb,
uint8* dst_u,
uint8* dst_v,
int width);
void ARGBToUVRow_C(const uint8* src_argb,
int src_stride_argb,
uint8* dst_u,
uint8* dst_v,
int width);
void ARGBToUVJRow_C(const uint8* src_argb,
int src_stride_argb,
uint8* dst_u,
uint8* dst_v,
int width);
void ARGBToUVRow_C(const uint8* src_argb,
int src_stride_argb,
uint8* dst_u,
@@ -1251,6 +1350,15 @@ void ARGB1555ToARGBRow_NEON(const uint8* src_argb1555,
void ARGB4444ToARGBRow_NEON(const uint8* src_argb4444,
uint8* dst_argb,
int width);
void RGB24ToARGBRow_DSPR2(const uint8* src_rgb24, uint8* dst_argb, int width);
void RAWToARGBRow_DSPR2(const uint8* src_raw, uint8* dst_argb, int width);
void RGB565ToARGBRow_DSPR2(const uint8* src_rgb565, uint8* dst_argb, int width);
void ARGB1555ToARGBRow_DSPR2(const uint8* src_argb1555,
uint8* dst_argb,
int width);
void ARGB4444ToARGBRow_DSPR2(const uint8* src_argb4444,
uint8* dst_argb,
int width);
void ARGB4444ToARGBRow_MSA(const uint8* src_argb4444,
uint8* dst_argb,
int width);
@@ -1299,6 +1407,20 @@ void ARGB1555ToARGBRow_Any_NEON(const uint8* src_argb1555,
void ARGB4444ToARGBRow_Any_NEON(const uint8* src_argb4444,
uint8* dst_argb,
int width);
void RGB24ToARGBRow_Any_DSPR2(const uint8* src_rgb24,
uint8* dst_argb,
int width);
void RAWToARGBRow_Any_DSPR2(const uint8* src_raw, uint8* dst_argb, int width);
void RGB565ToARGBRow_Any_DSPR2(const uint8* src_rgb565,
uint8* dst_argb,
int width);
void ARGB1555ToARGBRow_Any_DSPR2(const uint8* src_argb1555,
uint8* dst_argb,
int width);
void ARGB4444ToARGBRow_Any_DSPR2(const uint8* src_argb4444,
uint8* dst_argb,
int width);
void ARGB4444ToARGBRow_Any_MSA(const uint8* src_argb4444,
uint8* dst_argb,
int width);
@@ -2042,12 +2164,47 @@ void UYVYToARGBRow_Any_NEON(const uint8* src_uyvy,
uint8* dst_argb,
const struct YuvConstants* yuvconstants,
int width);
void I444ToARGBRow_Any_DSPR2(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
uint8* dst_argb,
const struct YuvConstants* yuvconstants,
int width);
void I422ToARGB4444Row_Any_DSPR2(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
uint8* dst_argb,
const struct YuvConstants* yuvconstants,
int width);
void I422ToARGBRow_Any_DSPR2(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
uint8* dst_argb,
const struct YuvConstants* yuvconstants,
int width);
void I422ToARGBRow_DSPR2(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
uint8* dst_argb,
const struct YuvConstants* yuvconstants,
int width);
void I422ToARGB1555Row_Any_DSPR2(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
uint8* dst_argb,
const struct YuvConstants* yuvconstants,
int width);
void I411ToARGBRow_Any_DSPR2(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
uint8* dst_argb,
const struct YuvConstants* yuvconstants,
int width);
void NV12ToARGBRow_Any_DSPR2(const uint8* src_y,
const uint8* src_uv,
uint8* dst_argb,
const struct YuvConstants* yuvconstants,
int width);
void I422ToARGBRow_DSPR2(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
...
@@ -101,6 +101,7 @@ extern "C" {
#define HAS_SCALEROWDOWN4_DSPR2
#define HAS_SCALEROWDOWN34_DSPR2
#define HAS_SCALEROWDOWN38_DSPR2
#define HAS_SCALEADDROW_DSPR2
#endif
#if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa)
@@ -846,6 +847,10 @@ void ScaleRowDown38_3_Box_DSPR2(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr,
int dst_width);
void ScaleAddRow_DSPR2(const uint8* src_ptr, uint16* dst_ptr, int src_width);
void ScaleAddRow_Any_DSPR2(const uint8* src_ptr,
uint16* dst_ptr,
int src_width);
void ScaleRowDown2_MSA(const uint8_t* src_ptr,
ptrdiff_t src_stride,
...
@@ -579,6 +579,14 @@ int ARGBToI420(const uint8* src_argb,
}
}
#endif
#if defined(HAS_ARGBTOYROW_DSPR2)
if (TestCpuFlag(kCpuHasDSPR2)) {
ARGBToYRow = ARGBToYRow_Any_DSPR2;
if (IS_ALIGNED(width, 8)) {
ARGBToYRow = ARGBToYRow_DSPR2;
}
}
#endif
#if defined(HAS_ARGBTOYROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
ARGBToYRow = ARGBToYRow_Any_MSA;
@@ -587,6 +595,14 @@ int ARGBToI420(const uint8* src_argb,
}
}
#endif
#if defined(HAS_ARGBTOUVROW_DSPR2)
if (TestCpuFlag(kCpuHasDSPR2)) {
ARGBToUVRow = ARGBToUVRow_Any_DSPR2;
if (IS_ALIGNED(width, 16)) {
ARGBToUVRow = ARGBToUVRow_DSPR2;
}
}
#endif
#if defined(HAS_ARGBTOUVROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
ARGBToUVRow = ARGBToUVRow_Any_MSA;
@@ -664,6 +680,22 @@ int BGRAToI420(const uint8* src_bgra,
}
}
#endif
#if defined(HAS_BGRATOYROW_DSPR2)
if (TestCpuFlag(kCpuHasDSPR2)) {
BGRAToYRow = BGRAToYRow_Any_DSPR2;
if (IS_ALIGNED(width, 8)) {
BGRAToYRow = BGRAToYRow_DSPR2;
}
}
#endif
#if defined(HAS_BGRATOUVROW_DSPR2)
if (TestCpuFlag(kCpuHasDSPR2)) {
BGRAToUVRow = BGRAToUVRow_Any_DSPR2;
if (IS_ALIGNED(width, 16)) {
BGRAToUVRow = BGRAToUVRow_DSPR2;
}
}
#endif
for (y = 0; y < height - 1; y += 2) {
BGRAToUVRow(src_bgra, src_stride_bgra, dst_u, dst_v, width);
@@ -733,6 +765,22 @@ int ABGRToI420(const uint8* src_abgr,
}
}
#endif
#if defined(HAS_ABGRTOYROW_DSPR2)
if (TestCpuFlag(kCpuHasDSPR2)) {
ABGRToYRow = ABGRToYRow_Any_DSPR2;
if (IS_ALIGNED(width, 8)) {
ABGRToYRow = ABGRToYRow_DSPR2;
}
}
#endif
#if defined(HAS_ABGRTOUVROW_DSPR2)
if (TestCpuFlag(kCpuHasDSPR2)) {
ABGRToUVRow = ABGRToUVRow_Any_DSPR2;
if (IS_ALIGNED(width, 16)) {
ABGRToUVRow = ABGRToUVRow_DSPR2;
}
}
#endif
for (y = 0; y < height - 1; y += 2) {
ABGRToUVRow(src_abgr, src_stride_abgr, dst_u, dst_v, width);
@@ -802,6 +850,22 @@ int RGBAToI420(const uint8* src_rgba,
}
}
#endif
#if defined(HAS_RGBATOYROW_DSPR2)
if (TestCpuFlag(kCpuHasDSPR2)) {
RGBAToYRow = RGBAToYRow_Any_DSPR2;
if (IS_ALIGNED(width, 8)) {
RGBAToYRow = RGBAToYRow_DSPR2;
}
}
#endif
#if defined(HAS_RGBATOUVROW_DSPR2)
if (TestCpuFlag(kCpuHasDSPR2)) {
RGBAToUVRow = RGBAToUVRow_Any_DSPR2;
if (IS_ALIGNED(width, 16)) {
RGBAToUVRow = RGBAToUVRow_DSPR2;
}
}
#endif
for (y = 0; y < height - 1; y += 2) {
RGBAToUVRow(src_rgba, src_stride_rgba, dst_u, dst_v, width);
@@ -1014,6 +1078,14 @@ int RAWToI420(const uint8* src_raw,
ARGBToYRow = ARGBToYRow_AVX2;
}
}
#endif
#if defined(HAS_RAWTOARGBROW_DSPR2)
if (TestCpuFlag(kCpuHasDSPR2)) {
RAWToARGBRow = RAWToARGBRow_Any_DSPR2;
if (IS_ALIGNED(width, 4)) {
RAWToARGBRow = RAWToARGBRow_DSPR2;
}
}
#endif
{
// Allocate 2 rows of ARGB.
@@ -1142,6 +1214,14 @@ int RGB565ToI420(const uint8* src_rgb565,
ARGBToYRow = ARGBToYRow_AVX2;
}
}
#endif
#if defined(HAS_RGB565TOARGBROW_DSPR2)
if (TestCpuFlag(kCpuHasDSPR2)) {
RGB565ToARGBRow = RGB565ToARGBRow_Any_DSPR2;
if (IS_ALIGNED(width, 8)) {
RGB565ToARGBRow = RGB565ToARGBRow_DSPR2;
}
}
#endif
{
// Allocate 2 rows of ARGB.
...
@@ -485,6 +485,14 @@ static int I444ToARGBMatrix(const uint8* src_y,
}
}
#endif
#if defined(HAS_I444TOARGBROW_DSPR2)
if (TestCpuFlag(kCpuHasDSPR2)) {
I444ToARGBRow = I444ToARGBRow_Any_DSPR2;
if (IS_ALIGNED(width, 8)) {
I444ToARGBRow = I444ToARGBRow_DSPR2;
}
}
#endif
for (y = 0; y < height; ++y) {
I444ToARGBRow(src_y, src_u, src_v, dst_argb, yuvconstants, width);
@@ -946,6 +954,14 @@ int RGB24ToARGB(const uint8* src_rgb24,
}
}
#endif
#if defined(HAS_RGB24TOARGBROW_DSPR2)
if (TestCpuFlag(kCpuHasDSPR2)) {
RGB24ToARGBRow = RGB24ToARGBRow_Any_DSPR2;
if (IS_ALIGNED(width, 8)) {
RGB24ToARGBRow = RGB24ToARGBRow_DSPR2;
}
}
#endif
for (y = 0; y < height; ++y) {
RGB24ToARGBRow(src_rgb24, dst_argb, width);
@@ -997,6 +1013,14 @@ int RAWToARGB(const uint8* src_raw,
}
}
#endif
#if defined(HAS_RAWTOARGBROW_DSPR2)
if (TestCpuFlag(kCpuHasDSPR2)) {
RAWToARGBRow = RAWToARGBRow_Any_DSPR2;
if (IS_ALIGNED(width, 8)) {
RAWToARGBRow = RAWToARGBRow_DSPR2;
}
}
#endif
for (y = 0; y < height; ++y) {
RAWToARGBRow(src_raw, dst_argb, width);
@@ -1056,6 +1080,14 @@ int RGB565ToARGB(const uint8* src_rgb565,
}
}
#endif
#if defined(HAS_RGB565TOARGBROW_DSPR2)
if (TestCpuFlag(kCpuHasDSPR2)) {
RGB565ToARGBRow = RGB565ToARGBRow_Any_DSPR2;
if (IS_ALIGNED(width, 8)) {
RGB565ToARGBRow = RGB565ToARGBRow_DSPR2;
}
}
#endif
for (y = 0; y < height; ++y) {
RGB565ToARGBRow(src_rgb565, dst_argb, width);
@@ -1115,6 +1147,14 @@ int ARGB1555ToARGB(const uint8* src_argb1555,
}
}
#endif
#if defined(HAS_ARGB1555TOARGBROW_DSPR2)
if (TestCpuFlag(kCpuHasDSPR2)) {
ARGB1555ToARGBRow = ARGB1555ToARGBRow_Any_DSPR2;
if (IS_ALIGNED(width, 4)) {
ARGB1555ToARGBRow = ARGB1555ToARGBRow_DSPR2;
}
}
#endif
for (y = 0; y < height; ++y) {
ARGB1555ToARGBRow(src_argb1555, dst_argb, width);
@@ -1174,6 +1214,14 @@ int ARGB4444ToARGB(const uint8* src_argb4444,
}
}
#endif
#if defined(HAS_ARGB4444TOARGBROW_DSPR2)
if (TestCpuFlag(kCpuHasDSPR2)) {
ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_DSPR2;
if (IS_ALIGNED(width, 4)) {
ARGB4444ToARGBRow = ARGB4444ToARGBRow_DSPR2;
}
}
#endif
#if defined(HAS_ARGB4444TOARGBROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_MSA;
@@ -1238,6 +1286,14 @@ int NV12ToARGB(const uint8* src_y,
}
}
#endif
#if defined(HAS_NV12TOARGBROW_DSPR2)
if (TestCpuFlag(kCpuHasDSPR2)) {
NV12ToARGBRow = NV12ToARGBRow_Any_DSPR2;
if (IS_ALIGNED(width, 8)) {
NV12ToARGBRow = NV12ToARGBRow_DSPR2;
}
}
#endif
for (y = 0; y < height; ++y) {
NV12ToARGBRow(src_y, src_uv, dst_argb, &kYuvI601Constants, width);
@@ -1354,6 +1410,14 @@ int M420ToARGB(const uint8* src_m420,
}
}
#endif
#if defined(HAS_NV12TOARGBROW_DSPR2)
if (TestCpuFlag(kCpuHasDSPR2)) {
NV12ToARGBRow = NV12ToARGBRow_Any_DSPR2;
if (IS_ALIGNED(width, 8)) {
NV12ToARGBRow = NV12ToARGBRow_DSPR2;
}
}
#endif
for (y = 0; y < height - 1; y += 2) {
NV12ToARGBRow(src_m420, src_m420 + src_stride_m420 * 2, dst_argb,
...
@@ -708,6 +708,14 @@ int I420ToARGB1555(const uint8* src_y,
}
}
#endif
#if defined(HAS_I422TOARGB1555ROW_DSPR2)
if (TestCpuFlag(kCpuHasDSPR2)) {
I422ToARGB1555Row = I422ToARGB1555Row_Any_DSPR2;
if (IS_ALIGNED(width, 4)) {
I422ToARGB1555Row = I422ToARGB1555Row_DSPR2;
}
}
#endif
#if defined(HAS_I422TOARGB1555ROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
I422ToARGB1555Row = I422ToARGB1555Row_Any_MSA;
@@ -781,6 +789,14 @@ int I420ToARGB4444(const uint8* src_y,
}
}
#endif
#if defined(HAS_I422TOARGB4444ROW_DSPR2)
if (TestCpuFlag(kCpuHasDSPR2)) {
I422ToARGB4444Row = I422ToARGB4444Row_Any_DSPR2;
if (IS_ALIGNED(width, 4)) {
I422ToARGB4444Row = I422ToARGB4444Row_DSPR2;
}
}
#endif
#if defined(HAS_I422TOARGB4444ROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
I422ToARGB4444Row = I422ToARGB4444Row_Any_MSA;
...
@@ -100,6 +100,14 @@ int ARGBToI444(const uint8* src_argb,
}
}
#endif
#if defined(HAS_ARGBTOYROW_DSPR2)
if (TestCpuFlag(kCpuHasDSPR2)) {
ARGBToYRow = ARGBToYRow_Any_DSPR2;
if (IS_ALIGNED(width, 8)) {
ARGBToYRow = ARGBToYRow_DSPR2;
}
}
#endif
#if defined(HAS_ARGBTOYROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
ARGBToYRow = ARGBToYRow_Any_MSA;
@@ -189,6 +197,23 @@ int ARGBToI422(const uint8* src_argb,
}
}
#endif
#if defined(HAS_ARGBTOYROW_DSPR2)
if (TestCpuFlag(kCpuHasDSPR2)) {
ARGBToYRow = ARGBToYRow_Any_DSPR2;
if (IS_ALIGNED(width, 8)) {
ARGBToYRow = ARGBToYRow_DSPR2;
}
}
#endif
#if defined(HAS_ARGBTOUVROW_DSPR2)
if (TestCpuFlag(kCpuHasDSPR2)) {
ARGBToUVRow = ARGBToUVRow_Any_DSPR2;
if (IS_ALIGNED(width, 16)) {
ARGBToUVRow = ARGBToUVRow_DSPR2;
}
}
#endif
#if defined(HAS_ARGBTOYROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
ARGBToYRow = ARGBToYRow_Any_MSA;
@@ -318,6 +343,22 @@ int ARGBToNV12(const uint8* src_argb,
MergeUVRow_ = MergeUVRow_NEON;
}
}
#endif
#if defined(HAS_ARGBTOYROW_DSPR2)
if (TestCpuFlag(kCpuHasDSPR2)) {
ARGBToYRow = ARGBToYRow_Any_DSPR2;
if (IS_ALIGNED(width, 8)) {
ARGBToYRow = ARGBToYRow_DSPR2;
}
}
#endif
#if defined(HAS_ARGBTOUVROW_DSPR2)
if (TestCpuFlag(kCpuHasDSPR2)) {
ARGBToUVRow = ARGBToUVRow_Any_DSPR2;
if (IS_ALIGNED(width, 16)) {
ARGBToUVRow = ARGBToUVRow_DSPR2;
}
}
#endif
{
// Allocate a row of uv.
@@ -445,6 +486,22 @@ int ARGBToNV21(const uint8* src_argb,
MergeUVRow_ = MergeUVRow_NEON;
}
}
#endif
#if defined(HAS_ARGBTOYROW_DSPR2)
if (TestCpuFlag(kCpuHasDSPR2)) {
ARGBToYRow = ARGBToYRow_Any_DSPR2;
if (IS_ALIGNED(width, 8)) {
ARGBToYRow = ARGBToYRow_DSPR2;
}
}
#endif
#if defined(HAS_ARGBTOUVROW_DSPR2)
if (TestCpuFlag(kCpuHasDSPR2)) {
ARGBToUVRow = ARGBToUVRow_Any_DSPR2;
if (IS_ALIGNED(width, 16)) {
ARGBToUVRow = ARGBToUVRow_DSPR2;
}
}
#endif
{
// Allocate a row of uv.
@@ -570,6 +627,22 @@ int ARGBToYUY2(const uint8* src_argb,
}
}
#endif
#if defined(HAS_ARGBTOYROW_DSPR2)
if (TestCpuFlag(kCpuHasDSPR2)) {
ARGBToYRow = ARGBToYRow_Any_DSPR2;
if (IS_ALIGNED(width, 8)) {
ARGBToYRow = ARGBToYRow_DSPR2;
}
}
#endif
#if defined(HAS_ARGBTOUVROW_DSPR2)
if (TestCpuFlag(kCpuHasDSPR2)) {
ARGBToUVRow = ARGBToUVRow_Any_DSPR2;
if (IS_ALIGNED(width, 16)) {
ARGBToUVRow = ARGBToUVRow_DSPR2;
}
}
#endif
#if defined(HAS_I422TOYUY2ROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
I422ToYUY2Row = I422ToYUY2Row_Any_MSA;
@@ -698,6 +771,22 @@ int ARGBToUYVY(const uint8* src_argb,
}
}
#endif
#if defined(HAS_ARGBTOYROW_DSPR2)
if (TestCpuFlag(kCpuHasDSPR2)) {
ARGBToYRow = ARGBToYRow_Any_DSPR2;
if (IS_ALIGNED(width, 8)) {
ARGBToYRow = ARGBToYRow_DSPR2;
}
}
#endif
#if defined(HAS_ARGBTOUVROW_DSPR2)
if (TestCpuFlag(kCpuHasDSPR2)) {
ARGBToUVRow = ARGBToUVRow_Any_DSPR2;
if (IS_ALIGNED(width, 16)) {
ARGBToUVRow = ARGBToUVRow_DSPR2;
}
}
#endif
#if defined(HAS_I422TOUYVYROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
I422ToUYVYRow = I422ToUYVYRow_Any_MSA;
@@ -775,6 +864,14 @@ int ARGBToI400(const uint8* src_argb,
}
}
#endif
#if defined(HAS_ARGBTOYROW_DSPR2)
if (TestCpuFlag(kCpuHasDSPR2)) {
ARGBToYRow = ARGBToYRow_Any_DSPR2;
if (IS_ALIGNED(width, 8)) {
ARGBToYRow = ARGBToYRow_DSPR2;
}
}
#endif
#if defined(HAS_ARGBTOYROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
ARGBToYRow = ARGBToYRow_Any_MSA;
...
@@ -167,6 +167,12 @@ ANY31C(I422ToARGB4444Row_Any_NEON, I422ToARGB4444Row_NEON, 1, 0, 2, 7)
ANY31C(I422ToARGB1555Row_Any_NEON, I422ToARGB1555Row_NEON, 1, 0, 2, 7)
ANY31C(I422ToRGB565Row_Any_NEON, I422ToRGB565Row_NEON, 1, 0, 2, 7)
#endif
#ifdef HAS_I422TOARGBROW_DSPR2
ANY31C(I444ToARGBRow_Any_DSPR2, I444ToARGBRow_DSPR2, 0, 0, 4, 7)
ANY31C(I422ToARGBRow_Any_DSPR2, I422ToARGBRow_DSPR2, 1, 0, 4, 7)
ANY31C(I422ToARGB4444Row_Any_DSPR2, I422ToARGB4444Row_DSPR2, 1, 0, 2, 7)
ANY31C(I422ToARGB1555Row_Any_DSPR2, I422ToARGB1555Row_DSPR2, 1, 0, 2, 7)
#endif
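The ANY31C entries above generate those wrappers. Roughly, a wrapper runs the DSPR2 kernel over the largest multiple of its step (MASK + 1 pixels) and redoes the ragged tail through scratch buffers so the kernel never reads or writes past the row. A hedged sketch of the idea, not the exact macro expansion in row_any.cc (buffer layout and copy sizes differ there):

void I422ToARGBRow_Any_DSPR2_sketch(const uint8* src_y, const uint8* src_u,
                                    const uint8* src_v, uint8* dst_argb,
                                    const struct YuvConstants* yuvconstants,
                                    int width) {
  uint8 temp_y[16], temp_u[8], temp_v[8], temp_argb[64];
  int n = width & ~7;  // largest multiple of the 8-pixel step (MASK = 7)
  int r = width & 7;   // leftover pixels
  if (n > 0) {
    I422ToARGBRow_DSPR2(src_y, src_u, src_v, dst_argb, yuvconstants, n);
  }
  if (r > 0) {
    memset(temp_y, 0, sizeof(temp_y));  // the kernel always converts a full 8
    memset(temp_u, 0, sizeof(temp_u));
    memset(temp_v, 0, sizeof(temp_v));
    memcpy(temp_y, src_y + n, r);
    memcpy(temp_u, src_u + n / 2, (r + 1) / 2);  // 2x1 subsampled planes
    memcpy(temp_v, src_v + n / 2, (r + 1) / 2);
    I422ToARGBRow_DSPR2(temp_y, temp_u, temp_v, temp_argb, yuvconstants, 8);
    memcpy(dst_argb + n * 4, temp_argb, r * 4);  // write back only r pixels
  }
}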
#ifdef HAS_I422TOARGBROW_MSA
ANY31C(I422ToARGBRow_Any_MSA, I422ToARGBRow_MSA, 1, 0, 4, 7)
ANY31C(I422ToRGBARow_Any_MSA, I422ToRGBARow_MSA, 1, 0, 4, 7)
@@ -291,6 +297,9 @@ ANY21C(NV12ToARGBRow_Any_AVX2, NV12ToARGBRow_AVX2, 1, 1, 2, 4, 15)
#ifdef HAS_NV12TOARGBROW_NEON
ANY21C(NV12ToARGBRow_Any_NEON, NV12ToARGBRow_NEON, 1, 1, 2, 4, 7)
#endif
#endif #endif
#ifdef HAS_NV12TOARGBROW_DSPR2
ANY21C(NV12ToARGBRow_Any_DSPR2, NV12ToARGBRow_DSPR2, 1, 1, 2, 4, 7)
#endif
#ifdef HAS_NV21TOARGBROW_SSSE3
ANY21C(NV21ToARGBRow_Any_SSSE3, NV21ToARGBRow_SSSE3, 1, 1, 2, 4, 7)
#endif
@@ -484,6 +493,33 @@ ANY11(ARGB1555ToARGBRow_Any_NEON, ARGB1555ToARGBRow_NEON, 0, 2, 4, 7)
#ifdef HAS_ARGB4444TOARGBROW_NEON
ANY11(ARGB4444ToARGBRow_Any_NEON, ARGB4444ToARGBRow_NEON, 0, 2, 4, 7)
#endif
#ifdef HAS_RGB24TOARGBROW_DSPR2
ANY11(RGB24ToARGBRow_Any_DSPR2, RGB24ToARGBRow_DSPR2, 0, 3, 4, 7)
#endif
#ifdef HAS_RAWTOARGBROW_DSPR2
ANY11(RAWToARGBRow_Any_DSPR2, RAWToARGBRow_DSPR2, 0, 3, 4, 7)
#endif
#ifdef HAS_RGB565TOARGBROW_DSPR2
ANY11(RGB565ToARGBRow_Any_DSPR2, RGB565ToARGBRow_DSPR2, 0, 2, 4, 7)
#endif
#ifdef HAS_ARGB1555TOARGBROW_DSPR2
ANY11(ARGB1555ToARGBRow_Any_DSPR2, ARGB1555ToARGBRow_DSPR2, 0, 2, 4, 7)
#endif
#ifdef HAS_ARGB4444TOARGBROW_DSPR2
ANY11(ARGB4444ToARGBRow_Any_DSPR2, ARGB4444ToARGBRow_DSPR2, 0, 2, 4, 7)
#endif
#ifdef HAS_BGRATOYROW_DSPR2
ANY11(BGRAToYRow_Any_DSPR2, BGRAToYRow_DSPR2, 0, 4, 1, 7)
#endif
#ifdef HAS_ARGBTOYROW_DSPR2
ANY11(ARGBToYRow_Any_DSPR2, ARGBToYRow_DSPR2, 0, 4, 1, 7)
#endif
#ifdef HAS_ABGRTOYROW_DSPR2
ANY11(ABGRToYRow_Any_DSPR2, ABGRToYRow_DSPR2, 0, 4, 1, 7)
#endif
#ifdef HAS_RGBATOYROW_DSPR2
ANY11(RGBAToYRow_Any_DSPR2, RGBAToYRow_DSPR2, 0, 4, 1, 7)
#endif
#ifdef HAS_ARGB4444TOARGBROW_MSA
ANY11(ARGB4444ToARGBRow_Any_MSA, ARGB4444ToARGBRow_MSA, 0, 2, 4, 15)
#endif
@@ -904,6 +940,18 @@ ANY12S(YUY2ToUVRow_Any_NEON, YUY2ToUVRow_NEON, 1, 4, 15)
#ifdef HAS_UYVYTOUVROW_NEON
ANY12S(UYVYToUVRow_Any_NEON, UYVYToUVRow_NEON, 1, 4, 15)
#endif
#ifdef HAS_BGRATOUVROW_DSPR2
ANY12S(BGRAToUVRow_Any_DSPR2, BGRAToUVRow_DSPR2, 0, 4, 15)
#endif
#ifdef HAS_ABGRTOUVROW_DSPR2
ANY12S(ABGRToUVRow_Any_DSPR2, ABGRToUVRow_DSPR2, 0, 4, 15)
#endif
#ifdef HAS_RGBATOUVROW_DSPR2
ANY12S(RGBAToUVRow_Any_DSPR2, RGBAToUVRow_DSPR2, 0, 4, 15)
#endif
#ifdef HAS_ARGBTOUVROW_DSPR2
ANY12S(ARGBToUVRow_Any_DSPR2, ARGBToUVRow_DSPR2, 0, 4, 15)
#endif
#ifdef HAS_YUY2TOUVROW_MSA
ANY12S(YUY2ToUVRow_Any_MSA, YUY2ToUVRow_MSA, 1, 4, 31)
#endif
...
@@ -202,8 +202,9 @@ void ARGBToRGB565Row_C(const uint8* src_argb, uint8* dst_rgb, int width) {
uint8 b1 = src_argb[4] >> 3;
uint8 g1 = src_argb[5] >> 2;
uint8 r1 = src_argb[6] >> 3;
WRITEWORD(
    dst_rgb,
    b0 | (g0 << 5) | (r0 << 11) | (b1 << 16) | (g1 << 21) | (r1 << 27));
dst_rgb += 4;
src_argb += 8;
}
@@ -237,8 +238,9 @@ void ARGBToRGB565DitherRow_C(const uint8* src_argb,
uint8 b1 = clamp255(src_argb[4] + dither1) >> 3;
uint8 g1 = clamp255(src_argb[5] + dither1) >> 2;
uint8 r1 = clamp255(src_argb[6] + dither1) >> 3;
WRITEWORD(
    dst_rgb,
    b0 | (g0 << 5) | (r0 << 11) | (b1 << 16) | (g1 << 21) | (r1 << 27));
dst_rgb += 4;
src_argb += 8;
}
...
@@ -585,126 +585,89 @@ void MirrorUVRow_DSPR2(const uint8* src_uv,
: "t0", "t1", "t2", "t3", "t4", "t5", "t7", "t8", "t9");
}
// Removed by this change:
// Convert (4 Y and 2 VU) I422 and arrange RGB values into
// t5 = | 0 | B0 | 0 | b0 |
// t4 = | 0 | B1 | 0 | b1 |
// t9 = | 0 | G0 | 0 | g0 |
// t8 = | 0 | G1 | 0 | g1 |
// t2 = | 0 | R0 | 0 | r0 |
// t1 = | 0 | R1 | 0 | r1 |
#define YUVTORGB \
"lw $t0, 0(%[y_buf]) \n" \
"lhu $t1, 0(%[u_buf]) \n" \
"lhu $t2, 0(%[v_buf]) \n" \
"preceu.ph.qbr $t1, $t1 \n" \
"preceu.ph.qbr $t2, $t2 \n" \
"preceu.ph.qbra $t3, $t0 \n" \
"preceu.ph.qbla $t0, $t0 \n" \
"subu.ph $t1, $t1, $s5 \n" \
"subu.ph $t2, $t2, $s5 \n" \
"subu.ph $t3, $t3, $s4 \n" \
"subu.ph $t0, $t0, $s4 \n" \
"mul.ph $t3, $t3, $s0 \n" \
"mul.ph $t0, $t0, $s0 \n" \
"shll.ph $t4, $t1, 0x7 \n" \
"subu.ph $t4, $t4, $t1 \n" \
"mul.ph $t6, $t1, $s1 \n" \
"mul.ph $t1, $t2, $s2 \n" \
"addq_s.ph $t5, $t4, $t3 \n" \
"addq_s.ph $t4, $t4, $t0 \n" \
"shra.ph $t5, $t5, 6 \n" \
"shra.ph $t4, $t4, 6 \n" \
"addiu %[u_buf], 2 \n" \
"addiu %[v_buf], 2 \n" \
"addu.ph $t6, $t6, $t1 \n" \
"mul.ph $t1, $t2, $s3 \n" \
"addu.ph $t9, $t6, $t3 \n" \
"addu.ph $t8, $t6, $t0 \n" \
"shra.ph $t9, $t9, 6 \n" \
"shra.ph $t8, $t8, 6 \n" \
"addu.ph $t2, $t1, $t3 \n" \
"addu.ph $t1, $t1, $t0 \n" \
"shra.ph $t2, $t2, 6 \n" \
"shra.ph $t1, $t1, 6 \n" \
"subu.ph $t5, $t5, $s5 \n" \
"subu.ph $t4, $t4, $s5 \n" \
"subu.ph $t9, $t9, $s5 \n" \
"subu.ph $t8, $t8, $s5 \n" \
"subu.ph $t2, $t2, $s5 \n" \
"subu.ph $t1, $t1, $s5 \n" \
"shll_s.ph $t5, $t5, 8 \n" \
"shll_s.ph $t4, $t4, 8 \n" \
"shll_s.ph $t9, $t9, 8 \n" \
"shll_s.ph $t8, $t8, 8 \n" \
"shll_s.ph $t2, $t2, 8 \n" \
"shll_s.ph $t1, $t1, 8 \n" \
"shra.ph $t5, $t5, 8 \n" \
"shra.ph $t4, $t4, 8 \n" \
"shra.ph $t9, $t9, 8 \n" \
"shra.ph $t8, $t8, 8 \n" \
"shra.ph $t2, $t2, 8 \n" \
"shra.ph $t1, $t1, 8 \n" \
"addu.ph $t5, $t5, $s5 \n" \
"addu.ph $t4, $t4, $s5 \n" \
"addu.ph $t9, $t9, $s5 \n" \
"addu.ph $t8, $t8, $s5 \n" \
"addu.ph $t2, $t2, $s5 \n" \
"addu.ph $t1, $t1, $s5 \n"
// TODO(fbarchard): accept yuv conversion constants.
void I422ToARGBRow_DSPR2(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
const struct YuvConstants* yuvconstants,
int width) {
__asm__ __volatile__(
".set push \n"
".set noreorder \n"
"beqz %[width], 2f \n"
" repl.ph $s0, 74 \n" // |YG|YG| = |74|74|
"repl.ph $s1, -25 \n" // |UG|UG| = |-25|-25|
"repl.ph $s2, -52 \n" // |VG|VG| = |-52|-52|
"repl.ph $s3, 102 \n" // |VR|VR| = |102|102|
"repl.ph $s4, 16 \n" // |0|16|0|16|
"repl.ph $s5, 128 \n" // |128|128| // clipping
"lui $s6, 0xff00 \n"
"ori $s6, 0xff00 \n" // |ff|00|ff|00|
"1: \n"
YUVTORGB
// Arranging into argb format
"precr.qb.ph $t4, $t8, $t4 \n" // |G1|g1|B1|b1|
"precr.qb.ph $t5, $t9, $t5 \n" // |G0|g0|B0|b0|
"addiu %[width], -4 \n"
"precrq.qb.ph $t8, $t4, $t5 \n" // |G1|B1|G0|B0|
"precr.qb.ph $t9, $t4, $t5 \n" // |g1|b1|g0|b0|
"precr.qb.ph $t2, $t1, $t2 \n" // |R1|r1|R0|r0|
"addiu %[y_buf], 4 \n"
"preceu.ph.qbla $t1, $t2 \n" // |0 |R1|0 |R0|
"preceu.ph.qbra $t2, $t2 \n" // |0 |r1|0 |r0|
"or $t1, $t1, $s6 \n" // |ff|R1|ff|R0|
"or $t2, $t2, $s6 \n" // |ff|r1|ff|r0|
"precrq.ph.w $t0, $t2, $t9 \n" // |ff|r1|g1|b1|
"precrq.ph.w $t3, $t1, $t8 \n" // |ff|R1|G1|B1|
"sll $t9, $t9, 16 \n"
"sll $t8, $t8, 16 \n"
"packrl.ph $t2, $t2, $t9 \n" // |ff|r0|g0|b0|
"packrl.ph $t1, $t1, $t8 \n" // |ff|R0|G0|B0|
// Store results.
"sw $t2, 0(%[rgb_buf]) \n"
"sw $t0, 4(%[rgb_buf]) \n"
"sw $t1, 8(%[rgb_buf]) \n"
"sw $t3, 12(%[rgb_buf]) \n"
"bnez %[width], 1b \n"
" addiu %[rgb_buf], 16 \n"
"2: \n"
".set pop \n"
: [y_buf] "+r"(y_buf), [u_buf] "+r"(u_buf), [v_buf] "+r"(v_buf),
[width] "+r"(width), [rgb_buf] "+r"(rgb_buf)
:
: "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8", "t9", "s0", "s1",
"s2", "s3", "s4", "s5", "s6");
}
// Added by this change:
void I422ToARGBRow_DSPR2(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
uint8* rgb_buf,
const struct YuvConstants* yuvconstants,
int width) {
int x;
uint32 tmp_ub = yuvconstants->kUVToB[0];
uint32 tmp_ug = yuvconstants->kUVToG[0];
uint32 tmp_vg = yuvconstants->kUVToG[1];
uint32 tmp_vr = yuvconstants->kUVToR[1];
uint32 tmp_bb = yuvconstants->kUVBiasB[0];
uint32 tmp_bg = yuvconstants->kUVBiasG[0];
uint32 tmp_br = yuvconstants->kUVBiasR[0];
uint32 yg = yuvconstants->kYToRgb[0];
uint32 tmp_yg;
uint32 tmp_mask = 0x7fff7fff;
tmp_bb = ((uint)(tmp_bb & 0xffff) << 16) | (tmp_bb & 0xffff);
tmp_bg = ((uint)(tmp_bg & 0xffff) << 16) | (tmp_bg & 0xffff);
tmp_br = ((uint)(tmp_br & 0xffff) << 16) | (tmp_br & 0xffff);
tmp_yg = ((uint)(yg & 0xffff) << 16) | (yg & 0xffff);
tmp_ub = ~(((uint)(tmp_ub & 0xffff) << 16) | (tmp_ub & 0xffff)) + 0x00010001;
tmp_ug = ((uint)(tmp_ug & 0xffff) << 16) | (tmp_ug & 0xffff);
tmp_vg = ((uint)(tmp_vg & 0xffff) << 16) | (tmp_vg & 0xffff);
tmp_vr = ~(((uint)(tmp_vr & 0xffff) << 16) | (tmp_vr & 0xffff)) + 0x00010001;
yg = yg * 0x0101;
for (x = 0; x < width - 1; x += 2) {
uint32 tmp_t1, tmp_t2, tmp_t3, tmp_t4, tmp_t5;
uint32 tmp_t6, tmp_t7, tmp_t8, tmp_t9;
__asm__ __volatile__(
".set push \n"
".set noreorder \n"
"lbu %[tmp_t7], 0(%[src_y]) \n"
"lbu %[tmp_t1], 1(%[src_y]) \n"
"mul %[tmp_t7], %[tmp_t7], %[yg] \n"
"mul %[tmp_t1], %[tmp_t1], %[yg] \n"
"lbu %[tmp_t2], 0(%[src_u]) \n"
"lbu %[tmp_t3], 0(%[src_v]) \n"
"replv.ph %[tmp_t2], %[tmp_t2] \n"
"replv.ph %[tmp_t3], %[tmp_t3] \n"
"mul.ph %[tmp_t4], %[tmp_t2], %[tmp_ub] \n"
"mul.ph %[tmp_t5], %[tmp_t2], %[tmp_ug] \n"
"mul.ph %[tmp_t6], %[tmp_t3], %[tmp_vr] \n"
"mul.ph %[tmp_t3], %[tmp_t3], %[tmp_vg] \n"
"srl %[tmp_t7], %[tmp_t7], 16 \n"
"ins %[tmp_t1], %[tmp_t7], 0, 16 \n"
"addq_s.ph %[tmp_t7], %[tmp_t1], %[tmp_bb] \n"
"addq_s.ph %[tmp_t8], %[tmp_t1], %[tmp_bg] \n"
"addq_s.ph %[tmp_t9], %[tmp_t1], %[tmp_br] \n"
"addq_s.ph %[tmp_t5], %[tmp_t5], %[tmp_t3] \n"
"addq_s.ph %[tmp_t7], %[tmp_t7], %[tmp_t4] \n"
"subq_s.ph %[tmp_t8], %[tmp_t8], %[tmp_t5] \n"
"addq_s.ph %[tmp_t9], %[tmp_t9], %[tmp_t6] \n"
"shra.ph %[tmp_t7], %[tmp_t7], 6 \n"
"shra.ph %[tmp_t8], %[tmp_t8], 6 \n"
"shra.ph %[tmp_t9], %[tmp_t9], 6 \n"
"shll_s.ph %[tmp_t7], %[tmp_t7], 7 \n"
"shll_s.ph %[tmp_t8], %[tmp_t8], 7 \n"
"shll_s.ph %[tmp_t9], %[tmp_t9], 7 \n"
"precrqu_s.qb.ph %[tmp_t8], %[tmp_mask], %[tmp_t8] \n"
"precrqu_s.qb.ph %[tmp_t7], %[tmp_t9], %[tmp_t7] \n"
"precrq.ph.w %[tmp_t9], %[tmp_t8], %[tmp_t7] \n"
"ins %[tmp_t7], %[tmp_t8], 16, 16 \n"
"precr.qb.ph %[tmp_t8], %[tmp_t9], %[tmp_t7] \n"
"precrq.qb.ph %[tmp_t7], %[tmp_t9], %[tmp_t7] \n"
"sw %[tmp_t8], 0(%[rgb_buf]) \n"
"sw %[tmp_t7], 4(%[rgb_buf]) \n"
".set pop \n"
: [tmp_t1] "=&r"(tmp_t1), [tmp_t2] "=&r"(tmp_t2),
[tmp_t3] "=&r"(tmp_t3), [tmp_t4] "=&r"(tmp_t4),
[tmp_t5] "=&r"(tmp_t5), [tmp_t6] "=&r"(tmp_t6),
[tmp_t7] "=&r"(tmp_t7), [tmp_t8] "=&r"(tmp_t8), [tmp_t9] "=&r"(tmp_t9)
: [src_y] "r"(src_y), [src_u] "r"(src_u), [src_v] "r"(src_v),
[tmp_ub] "r"(tmp_ub), [tmp_ug] "r"(tmp_ug), [yg] "r"(yg),
[tmp_vg] "r"(tmp_vg), [tmp_vr] "r"(tmp_vr), [tmp_bb] "r"(tmp_bb),
[tmp_bg] "r"(tmp_bg), [tmp_br] "r"(tmp_br), [tmp_yg] "r"(tmp_yg),
[rgb_buf] "r"(rgb_buf), [tmp_mask] "r"(tmp_mask));
src_y += 2;
src_u += 1;
src_v += 1;
rgb_buf += 8; // Advance 2 pixels.
}
}
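One detail all of the new conversion loops share: each 16-bit coefficient and bias from YuvConstants is replicated into both halves of a 32-bit word so a single mul.ph or addq_s.ph applies it to two pixels' lanes at once, and the ~x + 0x00010001 form negates both 16-bit lanes in one go (per-lane two's complement), letting the kernel add the U and V contributions with saturating adds. The idiom, written out as helpers (names are ours, for illustration only):

// Replicate a 16-bit value into both halves of a 32-bit word.
static uint32 repl16(uint32 v) {
  return ((uint32)(v & 0xffff) << 16) | (v & 0xffff);
}
// Negate each 16-bit lane of a packed pair (per-lane two's complement).
static uint32 neg16x2(uint32 v) {
  return ~v + 0x00010001;
}
// Usage, mirroring the setup above:
//   tmp_bb = repl16(yuvconstants->kUVBiasB[0]);
//   tmp_ub = neg16x2(repl16(yuvconstants->kUVToB[0]));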
// Bilinear filter 8x2 -> 8x1
@@ -740,10 +703,10 @@ void InterpolateRow_DSPR2(uint8* dst_ptr,
"addq.ph $t7, $t7, $t9 \n"
"addq.ph $t2, $t2, $t4 \n"
"addq.ph $t3, $t3, $t5 \n"
// Bug fix: rounding shifts (shra_r.ph) replace the truncating shra.ph here.
"shra_r.ph $t6, $t6, 8 \n"
"shra_r.ph $t7, $t7, 8 \n"
"shra_r.ph $t2, $t2, 8 \n"
"shra_r.ph $t3, $t3, 8 \n"
"precr.qb.ph $t6, $t6, $t7 \n"
"precr.qb.ph $t2, $t2, $t3 \n"
"addiu %[src_ptr], %[src_ptr], 8 \n"
@@ -761,6 +724,993 @@ void InterpolateRow_DSPR2(uint8* dst_ptr,
[y0_fraction] "r"(y0_fraction), [src_stride] "r"(src_stride)
: "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8", "t9");
}
void RGB24ToARGBRow_DSPR2(const uint8* src_rgb24, uint8* dst_argb, int width) {
int x;
uint32 tmp_mask = 0xff;
uint32 tmp_t1;
for (x = 0; x < (width - 1); ++x) {
__asm__ __volatile__(
".set push \n"
".set noreorder \n"
"ulw %[tmp_t1], 0(%[src_rgb24]) \n"
"addiu %[dst_argb], %[dst_argb], 4 \n"
"addiu %[src_rgb24], %[src_rgb24], 3 \n"
"ins %[tmp_t1], %[tmp_mask], 24, 8 \n"
"sw %[tmp_t1], -4(%[dst_argb]) \n"
".set pop \n"
: [src_rgb24] "+r"(src_rgb24), [dst_argb] "+r"(dst_argb),
[tmp_t1] "=&r"(tmp_t1)
: [tmp_mask] "r"(tmp_mask)
: "memory");
}
// The unaligned 4-byte load (ulw) above would read one byte past the row
// on the final pixel, so the last RGB24 pixel is converted in plain C.
uint8 b = src_rgb24[0];
uint8 g = src_rgb24[1];
uint8 r = src_rgb24[2];
dst_argb[0] = b;
dst_argb[1] = g;
dst_argb[2] = r;
dst_argb[3] = 255u;
}
void RAWToARGBRow_DSPR2(const uint8* src_raw, uint8* dst_argb, int width) {
int x;
uint32 tmp_mask = 0xff;
uint32 tmp_t1, tmp_t2;
for (x = 0; x < (width - 1); ++x) {
__asm__ __volatile__(
".set push \n"
".set noreorder \n"
"ulw %[tmp_t1], 0(%[src_raw]) \n"
"addiu %[dst_argb], %[dst_argb], 4 \n"
"addiu %[src_raw], %[src_raw], 3 \n"
"srl %[tmp_t2], %[tmp_t1], 16 \n"
"ins %[tmp_t1], %[tmp_mask], 24, 8 \n"
"ins %[tmp_t1], %[tmp_t1], 16, 8 \n"
"ins %[tmp_t1], %[tmp_t2], 0, 8 \n"
"sw %[tmp_t1], -4(%[dst_argb]) \n"
".set pop \n"
: [src_raw] "+r"(src_raw), [dst_argb] "+r"(dst_argb),
[tmp_t1] "=&r"(tmp_t1), [tmp_t2] "=&r"(tmp_t2)
: [tmp_mask] "r"(tmp_mask)
: "memory");
}
// Same tail handling as RGB24ToARGBRow_DSPR2: convert the final RAW pixel
// in C so the 4-byte ulw never reads past the end of the row.
uint8 r = src_raw[0];
uint8 g = src_raw[1];
uint8 b = src_raw[2];
dst_argb[0] = b;
dst_argb[1] = g;
dst_argb[2] = r;
dst_argb[3] = 255u;
}
void RGB565ToARGBRow_DSPR2(const uint8* src_rgb565,
uint8* dst_argb,
int width) {
int x;
uint32 tmp_mask = 0xff;
uint32 tmp_t1, tmp_t2, tmp_t3;
for (x = 0; x < width; ++x) {
__asm__ __volatile__(
".set push \n"
".set noreorder \n"
"lhu %[tmp_t1], 0(%[src_rgb565]) \n"
"addiu %[dst_argb], %[dst_argb], 4 \n"
"addiu %[src_rgb565], %[src_rgb565], 2 \n"
"sll %[tmp_t2], %[tmp_t1], 8 \n"
"ins %[tmp_t2], %[tmp_mask], 24,8 \n"
"ins %[tmp_t2], %[tmp_t1], 3, 16 \n"
"ins %[tmp_t2], %[tmp_t1], 5, 11 \n"
"srl %[tmp_t3], %[tmp_t1], 9 \n"
"ins %[tmp_t2], %[tmp_t3], 8, 2 \n"
"ins %[tmp_t2], %[tmp_t1], 3, 5 \n"
"srl %[tmp_t3], %[tmp_t1], 2 \n"
"ins %[tmp_t2], %[tmp_t3], 0, 3 \n"
"sw %[tmp_t2], -4(%[dst_argb]) \n"
".set pop \n"
: [tmp_t1] "=&r"(tmp_t1), [tmp_t2] "=&r"(tmp_t2),
[tmp_t3] "=&r"(tmp_t3), [src_rgb565] "+r"(src_rgb565),
[dst_argb] "+r"(dst_argb)
: [tmp_mask] "r"(tmp_mask));
}
}
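For reference, the ins/srl sequence above is the usual 565-to-888 expansion: each field's top bits are replicated into its low bits so full-scale values map to 0xff. One pixel in scalar form (a sketch for clarity, not code from this change):

// Scalar sketch of one RGB565 -> ARGB pixel as expanded above.
static void RGB565PixelToARGB(uint16 p, uint8* dst_argb) {
  uint8 b = (uint8)(p & 0x1f);
  uint8 g = (uint8)((p >> 5) & 0x3f);
  uint8 r = (uint8)((p >> 11) & 0x1f);
  dst_argb[0] = (uint8)((b << 3) | (b >> 2));  // replicate top bits
  dst_argb[1] = (uint8)((g << 2) | (g >> 4));
  dst_argb[2] = (uint8)((r << 3) | (r >> 2));
  dst_argb[3] = 255u;
}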
void ARGB1555ToARGBRow_DSPR2(const uint8* src_argb1555,
uint8* dst_argb,
int width) {
int x;
uint32 tmp_t1, tmp_t2, tmp_t3;
for (x = 0; x < width; ++x) {
__asm__ __volatile__(
".set push \n"
".set noreorder \n"
"lh %[tmp_t1], 0(%[src_argb1555]) \n"
"addiu %[dst_argb], %[dst_argb], 4 \n"
"addiu %[src_argb1555], %[src_argb1555], 2 \n"
"sll %[tmp_t2], %[tmp_t1], 9 \n"
"ins %[tmp_t2], %[tmp_t1], 4, 15 \n"
"ins %[tmp_t2], %[tmp_t1], 6, 10 \n"
"srl %[tmp_t3], %[tmp_t1], 7 \n"
"ins %[tmp_t2], %[tmp_t3], 8, 3 \n"
"ins %[tmp_t2], %[tmp_t1], 3, 5 \n"
"srl %[tmp_t3], %[tmp_t1], 2 \n"
"ins %[tmp_t2], %[tmp_t3], 0, 3 \n"
"sw %[tmp_t2], -4(%[dst_argb]) \n"
".set pop \n"
: [tmp_t1] "=&r"(tmp_t1), [tmp_t2] "=&r"(tmp_t2),
[tmp_t3] "=&r"(tmp_t3), [src_argb1555] "+r"(src_argb1555),
[dst_argb] "+r"(dst_argb)
:);
}
}
void ARGB4444ToARGBRow_DSPR2(const uint8* src_argb4444,
uint8* dst_argb,
int width) {
int x;
uint32 tmp_t1;
for (x = 0; x < width; ++x) {
__asm__ __volatile__(
".set push \n"
".set noreorder \n"
"lh %[tmp_t1], 0(%[src_argb4444]) \n"
"addiu %[dst_argb], %[dst_argb], 4 \n"
"addiu %[src_argb4444], %[src_argb4444], 2 \n"
"ins %[tmp_t1], %[tmp_t1], 16, 16 \n"
"ins %[tmp_t1], %[tmp_t1], 12, 16 \n"
"ins %[tmp_t1], %[tmp_t1], 8, 12 \n"
"ins %[tmp_t1], %[tmp_t1], 4, 8 \n"
"sw %[tmp_t1], -4(%[dst_argb]) \n"
".set pop \n"
: [src_argb4444] "+r"(src_argb4444), [dst_argb] "+r"(dst_argb),
[tmp_t1] "=&r"(tmp_t1));
}
}
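Similarly, the chained ins instructions above duplicate every 4-bit field of an ARGB4444 pixel into both nibbles of the output byte, so 0xf becomes 0xff. A scalar sketch of one pixel (illustrative, not from this change):

// Scalar sketch of one ARGB4444 -> ARGB pixel (nibble duplication).
static void ARGB4444PixelToARGB(uint16 p, uint8* dst_argb) {
  dst_argb[0] = (uint8)((p & 0x000f) * 0x11);          // B: (b << 4) | b
  dst_argb[1] = (uint8)(((p >> 4) & 0x000f) * 0x11);   // G
  dst_argb[2] = (uint8)(((p >> 8) & 0x000f) * 0x11);   // R
  dst_argb[3] = (uint8)(((p >> 12) & 0x000f) * 0x11);  // A
}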
void I444ToARGBRow_DSPR2(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
const struct YuvConstants* yuvconstants,
int width) {
int x;
uint32 tmp_ub = yuvconstants->kUVToB[0];
uint32 tmp_ug = yuvconstants->kUVToG[0];
uint32 tmp_vg = yuvconstants->kUVToG[1];
uint32 tmp_vr = yuvconstants->kUVToR[1];
uint32 tmp_bb = yuvconstants->kUVBiasB[0];
uint32 tmp_bg = yuvconstants->kUVBiasG[0];
uint32 tmp_br = yuvconstants->kUVBiasR[0];
uint32 yg = yuvconstants->kYToRgb[0];
uint32 tmp_mask = 0x7fff7fff;
uint32 tmp_yg;
tmp_bb = ((uint)(tmp_bb & 0xffff) << 16) | (tmp_bb & 0xffff);
tmp_bg = ((uint)(tmp_bg & 0xffff) << 16) | (tmp_bg & 0xffff);
tmp_br = ((uint)(tmp_br & 0xffff) << 16) | (tmp_br & 0xffff);
tmp_yg = ((uint)(yg & 0xffff) << 16) | (yg & 0xffff);
tmp_ub = ~(((uint)(tmp_ub & 0xffff) << 16) | (tmp_ub & 0xffff)) + 0x00010001;
tmp_ug = ((uint)(tmp_ug & 0xffff) << 16) | (tmp_ug & 0xffff);
tmp_vg = ((uint)(tmp_vg & 0xffff) << 16) | (tmp_vg & 0xffff);
tmp_vr = ~(((uint)(tmp_vr & 0xffff) << 16) | (tmp_vr & 0xffff)) + 0x00010001;
yg = yg * 0x0101;
for (x = 0; x < width - 1; x += 2) {
uint32 tmp_t1, tmp_t2, tmp_t3, tmp_t4, tmp_t5;
uint32 tmp_t6, tmp_t7, tmp_t8, tmp_t9;
__asm__ __volatile__(
".set push \n"
".set noreorder \n"
"lbu %[tmp_t7], 0(%[y_buf]) \n"
"lbu %[tmp_t1], 1(%[y_buf]) \n"
"mul %[tmp_t7], %[tmp_t7], %[yg] \n"
"mul %[tmp_t1], %[tmp_t1], %[yg] \n"
"lh %[tmp_t2], 0(%[u_buf]) \n"
"lh %[tmp_t3], 0(%[v_buf]) \n"
"preceu.ph.qbr %[tmp_t2], %[tmp_t2] \n"
"preceu.ph.qbr %[tmp_t3], %[tmp_t3] \n"
"mul.ph %[tmp_t4], %[tmp_t2], %[tmp_ub] \n"
"mul.ph %[tmp_t5], %[tmp_t2], %[tmp_ug] \n"
"mul.ph %[tmp_t6], %[tmp_t3], %[tmp_vr] \n"
"mul.ph %[tmp_t3], %[tmp_t3], %[tmp_vg] \n"
"srl %[tmp_t7], %[tmp_t7], 16 \n"
"ins %[tmp_t1], %[tmp_t7], 0, 16 \n"
"addq_s.ph %[tmp_t7], %[tmp_t1], %[tmp_bb] \n"
"addq_s.ph %[tmp_t8], %[tmp_t1], %[tmp_bg] \n"
"addq_s.ph %[tmp_t9], %[tmp_t1], %[tmp_br] \n"
"addq_s.ph %[tmp_t5], %[tmp_t5], %[tmp_t3] \n"
"addq_s.ph %[tmp_t7], %[tmp_t7], %[tmp_t4] \n"
"subq_s.ph %[tmp_t8], %[tmp_t8], %[tmp_t5] \n"
"addq_s.ph %[tmp_t9], %[tmp_t9], %[tmp_t6] \n"
"shra.ph %[tmp_t7], %[tmp_t7], 6 \n"
"shra.ph %[tmp_t8], %[tmp_t8], 6 \n"
"shra.ph %[tmp_t9], %[tmp_t9], 6 \n"
"shll_s.ph %[tmp_t7], %[tmp_t7], 7 \n"
"shll_s.ph %[tmp_t8], %[tmp_t8], 7 \n"
"shll_s.ph %[tmp_t9], %[tmp_t9], 7 \n"
"precrqu_s.qb.ph %[tmp_t8], %[tmp_mask], %[tmp_t8] \n"
"precrqu_s.qb.ph %[tmp_t7], %[tmp_t9], %[tmp_t7] \n"
"precrq.ph.w %[tmp_t2], %[tmp_t8], %[tmp_t7] \n"
"ins %[tmp_t7], %[tmp_t8], 16, 16 \n"
"precr.qb.ph %[tmp_t8], %[tmp_t2], %[tmp_t7] \n"
"precrq.qb.ph %[tmp_t7], %[tmp_t2], %[tmp_t7] \n"
"sw %[tmp_t8], 0(%[rgb_buf]) \n"
"sw %[tmp_t7], 4(%[rgb_buf]) \n"
".set pop \n"
: [tmp_t1] "=&r"(tmp_t1), [tmp_t2] "=&r"(tmp_t2),
[tmp_t3] "=&r"(tmp_t3), [tmp_t4] "=&r"(tmp_t4),
[tmp_t5] "=&r"(tmp_t5), [tmp_t6] "=&r"(tmp_t6),
[tmp_t7] "=&r"(tmp_t7), [tmp_t8] "=&r"(tmp_t8), [tmp_t9] "=&r"(tmp_t9)
: [y_buf] "r"(y_buf), [yg] "r"(yg), [u_buf] "r"(u_buf),
[v_buf] "r"(v_buf), [tmp_ub] "r"(tmp_ub), [tmp_ug] "r"(tmp_ug),
[tmp_vg] "r"(tmp_vg), [tmp_vr] "r"(tmp_vr), [tmp_bb] "r"(tmp_bb),
[tmp_bg] "r"(tmp_bg), [tmp_br] "r"(tmp_br), [tmp_yg] "r"(tmp_yg),
[rgb_buf] "r"(rgb_buf), [tmp_mask] "r"(tmp_mask));
y_buf += 2;
u_buf += 2;
v_buf += 2;
rgb_buf += 8; // Advance 2 pixels.
}
}
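In scalar terms, each 16-bit lane of the arithmetic above is libyuv's usual fixed-point step: scale Y by the replicated yg constant, add the per-channel bias, fold in the (pre-negated) U/V products, arithmetic-shift by 6, and saturate to a byte. A hedged one-pixel sketch along the lines of YuvPixel in row_common.cc, written against the un-negated coefficients (Clamp0255 stands in for libyuv's clamping helper):

static uint8 Clamp0255(int v) {
  return (uint8)(v < 0 ? 0 : (v > 255 ? 255 : v));
}
// One pixel, mirroring one halfword lane of the DSPR2 loops; yg is the
// value already multiplied by 0x0101 as above.
static void YuvPixelSketch(uint8 y, uint8 u, uint8 v,
                           uint8* b, uint8* g, uint8* r,
                           uint32 yg, int ub, int ug, int vg, int vr,
                           int bb, int bg, int br) {
  int y1 = (int)((y * yg) >> 16);
  *b = Clamp0255((y1 + bb - u * ub) >> 6);             // ub and vr were
  *g = Clamp0255((y1 + bg - (u * ug + v * vg)) >> 6);  // negated in the asm,
  *r = Clamp0255((y1 + br - v * vr) >> 6);             // so here we subtract.
}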
void I422ToARGB4444Row_DSPR2(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
uint8* dst_argb4444,
const struct YuvConstants* yuvconstants,
int width) {
int x;
uint32 tmp_ub = yuvconstants->kUVToB[0];
uint32 tmp_ug = yuvconstants->kUVToG[0];
uint32 tmp_vg = yuvconstants->kUVToG[1];
uint32 tmp_vr = yuvconstants->kUVToR[1];
uint32 tmp_bb = yuvconstants->kUVBiasB[0];
uint32 tmp_bg = yuvconstants->kUVBiasG[0];
uint32 tmp_br = yuvconstants->kUVBiasR[0];
uint32 yg = yuvconstants->kYToRgb[0];
uint32 tmp_yg;
uint32 tmp_mask = 0x7fff7fff;
tmp_bb = ((uint)(tmp_bb & 0xffff) << 16) | (tmp_bb & 0xffff);
tmp_bg = ((uint)(tmp_bg & 0xffff) << 16) | (tmp_bg & 0xffff);
tmp_br = ((uint)(tmp_br & 0xffff) << 16) | (tmp_br & 0xffff);
tmp_yg = ((uint)(yg & 0xffff) << 16) | (yg & 0xffff);
tmp_ub = ~(((uint)(tmp_ub & 0xffff) << 16) | (tmp_ub & 0xffff)) + 0x00010001;
tmp_ug = ((uint)(tmp_ug & 0xffff) << 16) | (tmp_ug & 0xffff);
tmp_vg = ((uint)(tmp_vg & 0xffff) << 16) | (tmp_vg & 0xffff);
tmp_vr = ~(((uint)(tmp_vr & 0xffff) << 16) | (tmp_vr & 0xffff)) + 0x00010001;
yg = yg * 0x0101;
for (x = 0; x < width - 1; x += 2) {
uint32 tmp_t1, tmp_t2, tmp_t3, tmp_t4, tmp_t5;
uint32 tmp_t6, tmp_t7, tmp_t8, tmp_t9;
__asm__ __volatile__(
".set push \n"
".set noreorder \n"
"lbu %[tmp_t7], 0(%[src_y]) \n"
"lbu %[tmp_t1], 1(%[src_y]) \n"
"mul %[tmp_t7], %[tmp_t7], %[yg] \n"
"mul %[tmp_t1], %[tmp_t1], %[yg] \n"
"lbu %[tmp_t2], 0(%[src_u]) \n"
"lbu %[tmp_t3], 0(%[src_v]) \n"
"replv.ph %[tmp_t2], %[tmp_t2] \n"
"replv.ph %[tmp_t3], %[tmp_t3] \n"
"mul.ph %[tmp_t4], %[tmp_t2], %[tmp_ub] \n"
"mul.ph %[tmp_t5], %[tmp_t2], %[tmp_ug] \n"
"mul.ph %[tmp_t6], %[tmp_t3], %[tmp_vr] \n"
"mul.ph %[tmp_t3], %[tmp_t3], %[tmp_vg] \n"
"srl %[tmp_t7], %[tmp_t7], 16 \n"
"ins %[tmp_t1], %[tmp_t7], 0, 16 \n"
"addq_s.ph %[tmp_t7], %[tmp_t1], %[tmp_bb] \n"
"addq_s.ph %[tmp_t8], %[tmp_t1], %[tmp_bg] \n"
"addq_s.ph %[tmp_t9], %[tmp_t1], %[tmp_br] \n"
"addq_s.ph %[tmp_t5], %[tmp_t5], %[tmp_t3] \n"
"addq_s.ph %[tmp_t7], %[tmp_t7], %[tmp_t4] \n"
"subq_s.ph %[tmp_t8], %[tmp_t8], %[tmp_t5] \n"
"addq_s.ph %[tmp_t9], %[tmp_t9], %[tmp_t6] \n"
"shra.ph %[tmp_t7], %[tmp_t7], 6 \n"
"shra.ph %[tmp_t8], %[tmp_t8], 6 \n"
"shra.ph %[tmp_t9], %[tmp_t9], 6 \n"
"shll_s.ph %[tmp_t7], %[tmp_t7], 7 \n"
"shll_s.ph %[tmp_t8], %[tmp_t8], 7 \n"
"shll_s.ph %[tmp_t9], %[tmp_t9], 7 \n"
"precrqu_s.qb.ph %[tmp_t8], %[tmp_mask], %[tmp_t8] \n"
"precrqu_s.qb.ph %[tmp_t7], %[tmp_t9], %[tmp_t7] \n"
"precrq.ph.w %[tmp_t2], %[tmp_t8], %[tmp_t7] \n"
"ins %[tmp_t7], %[tmp_t8], 16, 16 \n"
"precr.qb.ph %[tmp_t8], %[tmp_t2], %[tmp_t7] \n"
"precrq.qb.ph %[tmp_t7], %[tmp_t2], %[tmp_t7] \n"
"shrl.qb %[tmp_t1], %[tmp_t8], 4 \n"
"shrl.qb %[tmp_t2], %[tmp_t7], 4 \n"
"shrl.ph %[tmp_t8], %[tmp_t1], 4 \n"
"shrl.ph %[tmp_t7], %[tmp_t2], 4 \n"
"or %[tmp_t8], %[tmp_t8], %[tmp_t1] \n"
"or %[tmp_t7], %[tmp_t7], %[tmp_t2] \n"
"precr.qb.ph %[tmp_t8], %[tmp_t7], %[tmp_t8] \n"
"sw %[tmp_t8], 0(%[dst_argb4444]) \n"
".set pop \n"
: [tmp_t1] "=&r"(tmp_t1), [tmp_t2] "=&r"(tmp_t2),
[tmp_t3] "=&r"(tmp_t3), [tmp_t4] "=&r"(tmp_t4),
[tmp_t5] "=&r"(tmp_t5), [tmp_t6] "=&r"(tmp_t6),
[tmp_t7] "=&r"(tmp_t7), [tmp_t8] "=&r"(tmp_t8), [tmp_t9] "=&r"(tmp_t9)
: [dst_argb4444] "r"(dst_argb4444), [yg] "r"(yg), [src_u] "r"(src_u),
[src_v] "r"(src_v), [src_y] "r"(src_y), [tmp_ub] "r"(tmp_ub),
[tmp_ug] "r"(tmp_ug), [tmp_vg] "r"(tmp_vg), [tmp_vr] "r"(tmp_vr),
[tmp_bb] "r"(tmp_bb), [tmp_bg] "r"(tmp_bg), [tmp_br] "r"(tmp_br),
[tmp_yg] "r"(tmp_yg), [tmp_mask] "r"(tmp_mask));
src_y += 2;
src_u += 1;
src_v += 1;
dst_argb4444 += 4; // Advance 2 pixels.
}
}
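The shrl.qb / shrl.ph / or sequence at the end of that loop packs the two clamped ARGB8888 results down to ARGB4444 by keeping each byte's high nibble. One pixel of that pack in scalar form (a sketch; a, r, g, b are the already-clamped 8-bit channels):

// Scalar sketch of the final 8888 -> 4444 pack used above.
static uint16 PackARGB4444(uint8 a, uint8 r, uint8 g, uint8 b) {
  return (uint16)(((a >> 4) << 12) | ((r >> 4) << 8) | ((g >> 4) << 4) |
                  (b >> 4));
}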
void I422ToARGB1555Row_DSPR2(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
uint8* dst_argb1555,
const struct YuvConstants* yuvconstants,
int width) {
int x;
uint32 tmp_ub = yuvconstants->kUVToB[0];
uint32 tmp_ug = yuvconstants->kUVToG[0];
uint32 tmp_vg = yuvconstants->kUVToG[1];
uint32 tmp_vr = yuvconstants->kUVToR[1];
uint32 tmp_bb = yuvconstants->kUVBiasB[0];
uint32 tmp_bg = yuvconstants->kUVBiasG[0];
uint32 tmp_br = yuvconstants->kUVBiasR[0];
uint32 yg = yuvconstants->kYToRgb[0];
uint32 tmp_yg;
uint32 tmp_mask = 0x80008000;
tmp_bb = ((uint)(tmp_bb & 0xffff) << 16) | (tmp_bb & 0xffff);
tmp_bg = ((uint)(tmp_bg & 0xffff) << 16) | (tmp_bg & 0xffff);
tmp_br = ((uint)(tmp_br & 0xffff) << 16) | (tmp_br & 0xffff);
tmp_yg = ((uint)(yg & 0xffff) << 16) | (yg & 0xffff);
tmp_ub = ~(((uint)(tmp_ub & 0xffff) << 16) | (tmp_ub & 0xffff)) + 0x00010001;
tmp_ug = ((uint)(tmp_ug & 0xffff) << 16) | (tmp_ug & 0xffff);
tmp_vg = ((uint)(tmp_vg & 0xffff) << 16) | (tmp_vg & 0xffff);
tmp_vr = ~(((uint)(tmp_vr & 0xffff) << 16) | (tmp_vr & 0xffff)) + 0x00010001;
yg = yg * 0x0101;
for (x = 0; x < width - 1; x += 2) {
uint32 tmp_t1, tmp_t2, tmp_t3, tmp_t4, tmp_t5;
uint32 tmp_t6, tmp_t7, tmp_t8, tmp_t9;
__asm__ __volatile__(
".set push \n"
".set noreorder \n"
"lbu %[tmp_t7], 0(%[src_y]) \n"
"lbu %[tmp_t1], 1(%[src_y]) \n"
"mul %[tmp_t7], %[tmp_t7], %[yg] \n"
"mul %[tmp_t1], %[tmp_t1], %[yg] \n"
"lbu %[tmp_t2], 0(%[src_u]) \n"
"lbu %[tmp_t3], 0(%[src_v]) \n"
"replv.ph %[tmp_t2], %[tmp_t2] \n"
"replv.ph %[tmp_t3], %[tmp_t3] \n"
"mul.ph %[tmp_t4], %[tmp_t2], %[tmp_ub] \n"
"mul.ph %[tmp_t5], %[tmp_t2], %[tmp_ug] \n"
"mul.ph %[tmp_t6], %[tmp_t3], %[tmp_vr] \n"
"mul.ph %[tmp_t3], %[tmp_t3], %[tmp_vg] \n"
"srl %[tmp_t7], %[tmp_t7], 16 \n"
"ins %[tmp_t1], %[tmp_t7], 0, 16 \n"
"addq_s.ph %[tmp_t7], %[tmp_t1], %[tmp_bb] \n"
"addq_s.ph %[tmp_t8], %[tmp_t1], %[tmp_bg] \n"
"addq_s.ph %[tmp_t9], %[tmp_t1], %[tmp_br] \n"
"addq_s.ph %[tmp_t5], %[tmp_t5], %[tmp_t3] \n"
"addq_s.ph %[tmp_t7], %[tmp_t7], %[tmp_t4] \n"
"subq_s.ph %[tmp_t8], %[tmp_t8], %[tmp_t5] \n"
"addq_s.ph %[tmp_t9], %[tmp_t9], %[tmp_t6] \n"
"shra.ph %[tmp_t7], %[tmp_t7], 6 \n"
"shra.ph %[tmp_t8], %[tmp_t8], 6 \n"
"shra.ph %[tmp_t9], %[tmp_t9], 6 \n"
"shll_s.ph %[tmp_t7], %[tmp_t7], 7 \n"
"shll_s.ph %[tmp_t8], %[tmp_t8], 7 \n"
"shll_s.ph %[tmp_t9], %[tmp_t9], 7 \n"
"precrqu_s.qb.ph %[tmp_t8], %[tmp_mask], %[tmp_t8] \n"
"precrqu_s.qb.ph %[tmp_t7], %[tmp_t9], %[tmp_t7] \n"
"precrq.ph.w %[tmp_t2], %[tmp_t8], %[tmp_t7] \n"
"ins %[tmp_t7], %[tmp_t8], 16, 16 \n"
"precr.qb.ph %[tmp_t8], %[tmp_t2], %[tmp_t7] \n"
"precrq.qb.ph %[tmp_t7], %[tmp_t2], %[tmp_t7] \n"
"ins %[tmp_t3], %[tmp_t8], 7, 24 \n"
"ins %[tmp_t3], %[tmp_t8], 10, 16 \n"
"ins %[tmp_t3], %[tmp_t8], 13, 8 \n"
"ins %[tmp_t4], %[tmp_t7], 7, 24 \n"
"ins %[tmp_t4], %[tmp_t7], 10, 16 \n"
"ins %[tmp_t4], %[tmp_t7], 13, 8 \n"
"precrq.ph.w %[tmp_t8], %[tmp_t4], %[tmp_t3] \n"
"or %[tmp_t8], %[tmp_t8], %[tmp_mask]\n"
"sw %[tmp_t8], 0(%[dst_argb1555]) \n"
".set pop \n"
: [tmp_t1] "=&r"(tmp_t1), [tmp_t2] "=&r"(tmp_t2),
[tmp_t3] "=&r"(tmp_t3), [tmp_t4] "=&r"(tmp_t4),
[tmp_t5] "=&r"(tmp_t5), [tmp_t6] "=&r"(tmp_t6),
[tmp_t7] "=&r"(tmp_t7), [tmp_t8] "=&r"(tmp_t8), [tmp_t9] "=&r"(tmp_t9)
: [dst_argb1555] "r"(dst_argb1555), [yg] "r"(yg), [src_u] "r"(src_u),
[src_v] "r"(src_v), [src_y] "r"(src_y), [tmp_ub] "r"(tmp_ub),
[tmp_ug] "r"(tmp_ug), [tmp_vg] "r"(tmp_vg), [tmp_vr] "r"(tmp_vr),
[tmp_bb] "r"(tmp_bb), [tmp_bg] "r"(tmp_bg), [tmp_br] "r"(tmp_br),
[tmp_yg] "r"(tmp_yg), [tmp_mask] "r"(tmp_mask));
src_y += 2;
src_u += 1;
src_v += 1;
dst_argb1555 += 4; // Advance 2 pixels.
}
}
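Here the final or with tmp_mask (0x80008000) forces the alpha bit on for both packed pixels, matching the 0x8000 in this scalar sketch of the 8888-to-1555 pack (illustrative, channels already clamped):

// Scalar sketch of the final 8888 -> 1555 pack used above.
static uint16 PackARGB1555(uint8 r, uint8 g, uint8 b) {
  return (uint16)(0x8000 | ((r >> 3) << 10) | ((g >> 3) << 5) | (b >> 3));
}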
void NV12ToARGBRow_DSPR2(const uint8* src_y,
const uint8* src_uv,
uint8* rgb_buf,
const struct YuvConstants* yuvconstants,
int width) {
int x;
uint32 tmp_ub = yuvconstants->kUVToB[0];
uint32 tmp_ug = yuvconstants->kUVToG[0];
uint32 tmp_vg = yuvconstants->kUVToG[1];
uint32 tmp_vr = yuvconstants->kUVToR[1];
uint32 tmp_bb = yuvconstants->kUVBiasB[0];
uint32 tmp_bg = yuvconstants->kUVBiasG[0];
uint32 tmp_br = yuvconstants->kUVBiasR[0];
uint32 yg = yuvconstants->kYToRgb[0];
uint32 tmp_mask = 0x7fff7fff;
uint32 tmp_yg;
tmp_bb = ((uint)(tmp_bb & 0xffff) << 16) | (tmp_bb & 0xffff);
tmp_bg = ((uint)(tmp_bg & 0xffff) << 16) | (tmp_bg & 0xffff);
tmp_br = ((uint)(tmp_br & 0xffff) << 16) | (tmp_br & 0xffff);
tmp_yg = ((uint)(yg & 0xffff) << 16) | (yg & 0xffff);
tmp_ub = ~(((uint)(tmp_ub & 0xffff) << 16) | (tmp_ub & 0xffff)) + 0x00010001;
tmp_ug = ((uint)(tmp_ug & 0xffff) << 16) | (tmp_ug & 0xffff);
tmp_vg = ((uint)(tmp_vg & 0xffff) << 16) | (tmp_vg & 0xffff);
tmp_vr = ~(((uint)(tmp_vr & 0xffff) << 16) | (tmp_vr & 0xffff)) + 0x00010001;
yg = yg * 0x0101;
for (x = 0; x < width - 1; x += 2) {
uint32 tmp_t1, tmp_t2, tmp_t3, tmp_t4, tmp_t5;
uint32 tmp_t6, tmp_t7, tmp_t8, tmp_t9;
__asm__ __volatile__(
".set push \n"
".set noreorder \n"
"lbu %[tmp_t7], 0(%[src_y]) \n"
"lbu %[tmp_t1], 1(%[src_y]) \n"
"mul %[tmp_t7], %[tmp_t7], %[yg] \n"
"mul %[tmp_t1], %[tmp_t1], %[yg] \n"
"lbu %[tmp_t2], 0(%[src_uv]) \n"
"lbu %[tmp_t3], 1(%[src_uv]) \n"
"replv.ph %[tmp_t2], %[tmp_t2] \n"
"replv.ph %[tmp_t3], %[tmp_t3] \n"
"mul.ph %[tmp_t4], %[tmp_t2], %[tmp_ub] \n"
"mul.ph %[tmp_t5], %[tmp_t2], %[tmp_ug] \n"
"mul.ph %[tmp_t6], %[tmp_t3], %[tmp_vr] \n"
"mul.ph %[tmp_t3], %[tmp_t3], %[tmp_vg] \n"
"srl %[tmp_t7], %[tmp_t7], 16 \n"
"ins %[tmp_t1], %[tmp_t7], 0, 16 \n"
"addq_s.ph %[tmp_t7], %[tmp_t1], %[tmp_bb] \n"
"addq_s.ph %[tmp_t8], %[tmp_t1], %[tmp_bg] \n"
"addq_s.ph %[tmp_t9], %[tmp_t1], %[tmp_br] \n"
"addq_s.ph %[tmp_t5], %[tmp_t5], %[tmp_t3] \n"
"addq_s.ph %[tmp_t7], %[tmp_t7], %[tmp_t4] \n"
"subq_s.ph %[tmp_t8], %[tmp_t8], %[tmp_t5] \n"
"addq_s.ph %[tmp_t9], %[tmp_t9], %[tmp_t6] \n"
"shra.ph %[tmp_t7], %[tmp_t7], 6 \n"
"shra.ph %[tmp_t8], %[tmp_t8], 6 \n"
"shra.ph %[tmp_t9], %[tmp_t9], 6 \n"
"shll_s.ph %[tmp_t7], %[tmp_t7], 7 \n"
"shll_s.ph %[tmp_t8], %[tmp_t8], 7 \n"
"shll_s.ph %[tmp_t9], %[tmp_t9], 7 \n"
"precrqu_s.qb.ph %[tmp_t8], %[tmp_mask], %[tmp_t8] \n"
"precrqu_s.qb.ph %[tmp_t7], %[tmp_t9], %[tmp_t7] \n"
"precrq.ph.w %[tmp_t2], %[tmp_t8], %[tmp_t7] \n"
"ins %[tmp_t7], %[tmp_t8], 16, 16 \n"
"precr.qb.ph %[tmp_t8], %[tmp_t2], %[tmp_t7] \n"
"precrq.qb.ph %[tmp_t7], %[tmp_t2], %[tmp_t7] \n"
"sw %[tmp_t8], 0(%[rgb_buf]) \n"
"sw %[tmp_t7], 4(%[rgb_buf]) \n"
".set pop \n"
: [tmp_t1] "=&r"(tmp_t1), [tmp_t2] "=&r"(tmp_t2),
[tmp_t3] "=&r"(tmp_t3), [tmp_t4] "=&r"(tmp_t4),
[tmp_t5] "=&r"(tmp_t5), [tmp_t6] "=&r"(tmp_t6),
[tmp_t7] "=&r"(tmp_t7), [tmp_t8] "=&r"(tmp_t8), [tmp_t9] "=&r"(tmp_t9)
: [src_y] "r"(src_y), [src_uv] "r"(src_uv), [yg] "r"(yg),
[tmp_ub] "r"(tmp_ub), [tmp_ug] "r"(tmp_ug), [tmp_vg] "r"(tmp_vg),
[tmp_vr] "r"(tmp_vr), [tmp_bb] "r"(tmp_bb), [tmp_bg] "r"(tmp_bg),
[tmp_br] "r"(tmp_br), [tmp_yg] "r"(tmp_yg), [rgb_buf] "r"(rgb_buf),
[tmp_mask] "r"(tmp_mask));
src_y += 2;
src_uv += 2;
rgb_buf += 8; // Advance 2 pixels.
}
}
void BGRAToUVRow_DSPR2(const uint8* src_rgb0,
int src_stride_rgb,
uint8* dst_u,
uint8* dst_v,
int width) {
const uint8* src_rgb1 = src_rgb0 + src_stride_rgb;
int x;
int const1 = 0xffda0000;
int const2 = 0x0070ffb6;
int const3 = 0x00700000;
int const4 = 0xffeeffa2;
int const5 = 0x100;
for (x = 0; x < width - 1; x += 2) {
int tmp_t1, tmp_t2, tmp_t3, tmp_t4, tmp_t5;
int tmp_t6, tmp_t7, tmp_t8;
__asm__ __volatile__(
".set push \n"
".set noreorder \n"
"lw %[tmp_t1], 0(%[src_rgb0]) \n"
"lw %[tmp_t2], 4(%[src_rgb0]) \n"
"lw %[tmp_t3], 0(%[src_rgb1]) \n"
"lw %[tmp_t4], 4(%[src_rgb1]) \n"
"preceu.ph.qbr %[tmp_t5], %[tmp_t1] \n"
"preceu.ph.qbl %[tmp_t1], %[tmp_t1] \n"
"preceu.ph.qbr %[tmp_t6], %[tmp_t2] \n"
"preceu.ph.qbl %[tmp_t2], %[tmp_t2] \n"
"preceu.ph.qbr %[tmp_t7], %[tmp_t3] \n"
"preceu.ph.qbl %[tmp_t3], %[tmp_t3] \n"
"preceu.ph.qbr %[tmp_t8], %[tmp_t4] \n"
"preceu.ph.qbl %[tmp_t4], %[tmp_t4] \n"
"addu.ph %[tmp_t5], %[tmp_t5], %[tmp_t6] \n"
"addu.ph %[tmp_t7], %[tmp_t7], %[tmp_t8] \n"
"addu.ph %[tmp_t1], %[tmp_t1], %[tmp_t2] \n"
"addu.ph %[tmp_t3], %[tmp_t3], %[tmp_t4] \n"
"addu.ph %[tmp_t5], %[tmp_t5], %[tmp_t7] \n"
"addu.ph %[tmp_t1], %[tmp_t1], %[tmp_t3] \n"
"shrl.ph %[tmp_t5], %[tmp_t5], 2 \n"
"shrl.ph %[tmp_t1], %[tmp_t1], 2 \n"
"mult $ac0, %[const5], %[const5] \n"
"mult $ac1, %[const5], %[const5] \n"
"dpaq_s.w.ph $ac0, %[tmp_t5], %[const1] \n"
"dpaq_s.w.ph $ac1, %[tmp_t5], %[const3] \n"
"dpaq_s.w.ph $ac0, %[tmp_t1], %[const2] \n"
"dpaq_s.w.ph $ac1, %[tmp_t1], %[const4] \n"
"extr_r.w %[tmp_t7], $ac0, 9 \n"
"extr_r.w %[tmp_t8], $ac1, 9 \n"
"addiu %[dst_u], %[dst_u], 1 \n"
"addiu %[dst_v], %[dst_v], 1 \n"
"addiu %[src_rgb0], %[src_rgb0], 8 \n"
"addiu %[src_rgb1], %[src_rgb1], 8 \n"
"sb %[tmp_t7], -1(%[dst_u]) \n"
"sb %[tmp_t8], -1(%[dst_v]) \n"
".set pop \n"
: [tmp_t1] "=&r"(tmp_t1), [tmp_t2] "=&r"(tmp_t2),
[tmp_t3] "=&r"(tmp_t3), [tmp_t4] "=&r"(tmp_t4),
[tmp_t5] "=&r"(tmp_t5), [tmp_t6] "=&r"(tmp_t6),
[tmp_t7] "=&r"(tmp_t7), [tmp_t8] "=&r"(tmp_t8),
[src_rgb0] "+r"(src_rgb0), [src_rgb1] "+r"(src_rgb1),
[dst_u] "+r"(dst_u), [dst_v] "+r"(dst_v)
: [const1] "r"(const1), [const2] "r"(const2), [const3] "r"(const3),
[const4] "r"(const4), [const5] "r"(const5)
: "hi", "lo", "$ac1lo", "$ac1hi");
}
}
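// Illustrative sketch: the addu.ph/shrl.ph ladder above averages a 2x2 block
// per channel, and the two accumulators then evaluate libyuv's standard U/V
// dot products. The coefficient words pack signed halfword pairs
// (0x70 = 112, 0xffda = -38, 0xffb6 = -74, 0xffee = -18, 0xffa2 = -94);
// since dpaq_s.w.ph doubles each product, the mult $ac, 0x100, 0x100 seed
// plus the rounding extr_r.w ..., 9 supply the +0x8080 bias:
//   out = (0x10000 + 2 * dot + 0x100) >> 9
//       = 128 + ((dot + 128) >> 8)   // i.e. (dot + 0x8080) >> 8
static __inline int RGBToU_Sketch(int avg_r, int avg_g, int avg_b) {
  return (112 * avg_b - 74 * avg_g - 38 * avg_r + 0x8080) >> 8;
}
static __inline int RGBToV_Sketch(int avg_r, int avg_g, int avg_b) {
  return (112 * avg_r - 94 * avg_g - 18 * avg_b + 0x8080) >> 8;
}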
void BGRAToYRow_DSPR2(const uint8* src_argb0, uint8* dst_y, int width) {
int x;
int const1 = 0x00420000;
int const2 = 0x00190081;
int const5 = 0x40;
for (x = 0; x < width; x += 4) {
int tmp_t1, tmp_t2, tmp_t3, tmp_t4, tmp_t5;
int tmp_t6, tmp_t7, tmp_t8;
__asm__ __volatile__(
".set push \n"
".set noreorder \n"
"lw %[tmp_t1], 0(%[src_argb0]) \n"
"lw %[tmp_t2], 4(%[src_argb0]) \n"
"lw %[tmp_t3], 8(%[src_argb0]) \n"
"lw %[tmp_t4], 12(%[src_argb0]) \n"
"preceu.ph.qbr %[tmp_t5], %[tmp_t1] \n"
"preceu.ph.qbl %[tmp_t1], %[tmp_t1] \n"
"preceu.ph.qbr %[tmp_t6], %[tmp_t2] \n"
"preceu.ph.qbl %[tmp_t2], %[tmp_t2] \n"
"preceu.ph.qbr %[tmp_t7], %[tmp_t3] \n"
"preceu.ph.qbl %[tmp_t3], %[tmp_t3] \n"
"preceu.ph.qbr %[tmp_t8], %[tmp_t4] \n"
"preceu.ph.qbl %[tmp_t4], %[tmp_t4] \n"
"mult $ac0, %[const5], %[const5] \n"
"mult $ac1, %[const5], %[const5] \n"
"mult $ac2, %[const5], %[const5] \n"
"mult $ac3, %[const5], %[const5] \n"
"dpa.w.ph $ac0, %[tmp_t5], %[const1] \n"
"dpa.w.ph $ac1, %[tmp_t6], %[const1] \n"
"dpa.w.ph $ac2, %[tmp_t7], %[const1] \n"
"dpa.w.ph $ac3, %[tmp_t8], %[const1] \n"
"dpa.w.ph $ac0, %[tmp_t1], %[const2] \n"
"dpa.w.ph $ac1, %[tmp_t2], %[const2] \n"
"dpa.w.ph $ac2, %[tmp_t3], %[const2] \n"
"dpa.w.ph $ac3, %[tmp_t4], %[const2] \n"
"extr_r.w %[tmp_t1], $ac0, 8 \n"
"extr_r.w %[tmp_t2], $ac1, 8 \n"
"extr_r.w %[tmp_t3], $ac2, 8 \n"
"extr_r.w %[tmp_t4], $ac3, 8 \n"
"addiu %[src_argb0],%[src_argb0], 16 \n"
"addiu %[dst_y], %[dst_y], 4 \n"
"sb %[tmp_t1], -4(%[dst_y]) \n"
"sb %[tmp_t2], -3(%[dst_y]) \n"
"sb %[tmp_t3], -2(%[dst_y]) \n"
"sb %[tmp_t4], -1(%[dst_y]) \n"
".set pop \n"
: [tmp_t1] "=&r"(tmp_t1), [tmp_t2] "=&r"(tmp_t2),
[tmp_t3] "=&r"(tmp_t3), [tmp_t4] "=&r"(tmp_t4),
[tmp_t5] "=&r"(tmp_t5), [tmp_t6] "=&r"(tmp_t6),
[tmp_t7] "=&r"(tmp_t7), [tmp_t8] "=&r"(tmp_t8),
[src_argb0] "+r"(src_argb0), [dst_y] "+r"(dst_y)
: [const1] "r"(const1), [const2] "r"(const2), [const5] "r"(const5)
: "hi", "lo", "$ac1lo", "$ac1hi", "$ac2lo", "$ac2hi", "$ac3lo",
"$ac3hi");
}
}
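// Illustrative sketch: every *ToYRow_DSPR2 variant evaluates the same BT.601
// luma polynomial; only the packing of const1/const2 changes to match each
// format's byte order (0x42 = 66, 0x81 = 129, 0x19 = 25). The
// mult $ac, 0x40, 0x40 seed (0x1000) plus the rounding extr_r.w ..., 8
// contribute the usual +0x1080 (+16 offset, +128 rounding):
static __inline int RGBToY_Sketch(uint8 r, uint8 g, uint8 b) {
  return (66 * r + 129 * g + 25 * b + 0x1080) >> 8;
}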
void ABGRToUVRow_DSPR2(const uint8* src_rgb0,
int src_stride_rgb,
uint8* dst_u,
uint8* dst_v,
int width) {
const uint8* src_rgb1 = src_rgb0 + src_stride_rgb;
int x;
int const1 = 0xffb6ffda;
int const2 = 0x00000070;
int const3 = 0xffa20070;
int const4 = 0x0000ffee;
int const5 = 0x100;
for (x = 0; x < width - 1; x += 2) {
int tmp_t1, tmp_t2, tmp_t3, tmp_t4, tmp_t5;
int tmp_t6, tmp_t7, tmp_t8;
__asm__ __volatile__(
".set push \n"
".set noreorder \n"
"lw %[tmp_t1], 0(%[src_rgb0]) \n"
"lw %[tmp_t2], 4(%[src_rgb0]) \n"
"lw %[tmp_t3], 0(%[src_rgb1]) \n"
"lw %[tmp_t4], 4(%[src_rgb1]) \n"
"preceu.ph.qbr %[tmp_t5], %[tmp_t1] \n"
"preceu.ph.qbl %[tmp_t1], %[tmp_t1] \n"
"preceu.ph.qbr %[tmp_t6], %[tmp_t2] \n"
"preceu.ph.qbl %[tmp_t2], %[tmp_t2] \n"
"preceu.ph.qbr %[tmp_t7], %[tmp_t3] \n"
"preceu.ph.qbl %[tmp_t3], %[tmp_t3] \n"
"preceu.ph.qbr %[tmp_t8], %[tmp_t4] \n"
"preceu.ph.qbl %[tmp_t4], %[tmp_t4] \n"
"addu.ph %[tmp_t5], %[tmp_t5], %[tmp_t6] \n"
"addu.ph %[tmp_t7], %[tmp_t7], %[tmp_t8] \n"
"addu.ph %[tmp_t1], %[tmp_t1], %[tmp_t2] \n"
"addu.ph %[tmp_t3], %[tmp_t3], %[tmp_t4] \n"
"addu.ph %[tmp_t5], %[tmp_t5], %[tmp_t7] \n"
"addu.ph %[tmp_t1], %[tmp_t1], %[tmp_t3] \n"
"shrl.ph %[tmp_t5], %[tmp_t5], 2 \n"
"shrl.ph %[tmp_t1], %[tmp_t1], 2 \n"
"mult $ac0, %[const5], %[const5] \n"
"mult $ac1, %[const5], %[const5] \n"
"dpaq_s.w.ph $ac0, %[tmp_t5], %[const1] \n"
"dpaq_s.w.ph $ac1, %[tmp_t5], %[const3] \n"
"dpaq_s.w.ph $ac0, %[tmp_t1], %[const2] \n"
"dpaq_s.w.ph $ac1, %[tmp_t1], %[const4] \n"
"extr_r.w %[tmp_t7], $ac0, 9 \n"
"extr_r.w %[tmp_t8], $ac1, 9 \n"
"addiu %[dst_u], %[dst_u], 1 \n"
"addiu %[dst_v], %[dst_v], 1 \n"
"addiu %[src_rgb0], %[src_rgb0], 8 \n"
"addiu %[src_rgb1], %[src_rgb1], 8 \n"
"sb %[tmp_t7], -1(%[dst_u]) \n"
"sb %[tmp_t8], -1(%[dst_v]) \n"
".set pop \n"
: [tmp_t1] "=&r"(tmp_t1), [tmp_t2] "=&r"(tmp_t2),
[tmp_t3] "=&r"(tmp_t3), [tmp_t4] "=&r"(tmp_t4),
[tmp_t5] "=&r"(tmp_t5), [tmp_t6] "=&r"(tmp_t6),
[tmp_t7] "=&r"(tmp_t7), [tmp_t8] "=&r"(tmp_t8),
[src_rgb0] "+r"(src_rgb0), [src_rgb1] "+r"(src_rgb1),
[dst_u] "+r"(dst_u), [dst_v] "+r"(dst_v)
: [const1] "r"(const1), [const2] "r"(const2), [const3] "r"(const3),
[const4] "r"(const4), [const5] "r"(const5)
: "hi", "lo", "$ac1lo", "$ac1hi");
}
}
void ARGBToYRow_DSPR2(const uint8* src_argb0, uint8* dst_y, int width) {
int x;
int const1 = 0x00810019;
int const2 = 0x00000042;
int const5 = 0x40;
for (x = 0; x < width; x += 4) {
int tmp_t1, tmp_t2, tmp_t3, tmp_t4, tmp_t5;
int tmp_t6, tmp_t7, tmp_t8;
__asm__ __volatile__(
".set push \n"
".set noreorder \n"
"lw %[tmp_t1], 0(%[src_argb0]) \n"
"lw %[tmp_t2], 4(%[src_argb0]) \n"
"lw %[tmp_t3], 8(%[src_argb0]) \n"
"lw %[tmp_t4], 12(%[src_argb0]) \n"
"preceu.ph.qbr %[tmp_t5], %[tmp_t1] \n"
"preceu.ph.qbl %[tmp_t1], %[tmp_t1] \n"
"preceu.ph.qbr %[tmp_t6], %[tmp_t2] \n"
"preceu.ph.qbl %[tmp_t2], %[tmp_t2] \n"
"preceu.ph.qbr %[tmp_t7], %[tmp_t3] \n"
"preceu.ph.qbl %[tmp_t3], %[tmp_t3] \n"
"preceu.ph.qbr %[tmp_t8], %[tmp_t4] \n"
"preceu.ph.qbl %[tmp_t4], %[tmp_t4] \n"
"mult $ac0, %[const5], %[const5] \n"
"mult $ac1, %[const5], %[const5] \n"
"mult $ac2, %[const5], %[const5] \n"
"mult $ac3, %[const5], %[const5] \n"
"dpa.w.ph $ac0, %[tmp_t5], %[const1] \n"
"dpa.w.ph $ac1, %[tmp_t6], %[const1] \n"
"dpa.w.ph $ac2, %[tmp_t7], %[const1] \n"
"dpa.w.ph $ac3, %[tmp_t8], %[const1] \n"
"dpa.w.ph $ac0, %[tmp_t1], %[const2] \n"
"dpa.w.ph $ac1, %[tmp_t2], %[const2] \n"
"dpa.w.ph $ac2, %[tmp_t3], %[const2] \n"
"dpa.w.ph $ac3, %[tmp_t4], %[const2] \n"
"extr_r.w %[tmp_t1], $ac0, 8 \n"
"extr_r.w %[tmp_t2], $ac1, 8 \n"
"extr_r.w %[tmp_t3], $ac2, 8 \n"
"extr_r.w %[tmp_t4], $ac3, 8 \n"
"addiu %[dst_y], %[dst_y], 4 \n"
"addiu %[src_argb0],%[src_argb0], 16 \n"
"sb %[tmp_t1], -4(%[dst_y]) \n"
"sb %[tmp_t2], -3(%[dst_y]) \n"
"sb %[tmp_t3], -2(%[dst_y]) \n"
"sb %[tmp_t4], -1(%[dst_y]) \n"
".set pop \n"
: [tmp_t1] "=&r"(tmp_t1), [tmp_t2] "=&r"(tmp_t2),
[tmp_t3] "=&r"(tmp_t3), [tmp_t4] "=&r"(tmp_t4),
[tmp_t5] "=&r"(tmp_t5), [tmp_t6] "=&r"(tmp_t6),
[tmp_t7] "=&r"(tmp_t7), [tmp_t8] "=&r"(tmp_t8),
[src_argb0] "+r"(src_argb0), [dst_y] "+r"(dst_y)
: [const1] "r"(const1), [const2] "r"(const2), [const5] "r"(const5)
: "hi", "lo", "$ac1lo", "$ac1hi", "$ac2lo", "$ac2hi", "$ac3lo",
"$ac3hi");
}
}
void ABGRToYRow_DSPR2(const uint8* src_argb0, uint8* dst_y, int width) {
int x;
int const1 = 0x00810042;
int const2 = 0x00000019;
int const5 = 0x40;
for (x = 0; x < width; x += 4) {
int tmp_t1, tmp_t2, tmp_t3, tmp_t4, tmp_t5;
int tmp_t6, tmp_t7, tmp_t8;
__asm__ __volatile__(
".set push \n"
".set noreorder \n"
"lw %[tmp_t1], 0(%[src_argb0]) \n"
"lw %[tmp_t2], 4(%[src_argb0]) \n"
"lw %[tmp_t3], 8(%[src_argb0]) \n"
"lw %[tmp_t4], 12(%[src_argb0]) \n"
"preceu.ph.qbr %[tmp_t5], %[tmp_t1] \n"
"preceu.ph.qbl %[tmp_t1], %[tmp_t1] \n"
"preceu.ph.qbr %[tmp_t6], %[tmp_t2] \n"
"preceu.ph.qbl %[tmp_t2], %[tmp_t2] \n"
"preceu.ph.qbr %[tmp_t7], %[tmp_t3] \n"
"preceu.ph.qbl %[tmp_t3], %[tmp_t3] \n"
"preceu.ph.qbr %[tmp_t8], %[tmp_t4] \n"
"preceu.ph.qbl %[tmp_t4], %[tmp_t4] \n"
"mult $ac0, %[const5], %[const5] \n"
"mult $ac1, %[const5], %[const5] \n"
"mult $ac2, %[const5], %[const5] \n"
"mult $ac3, %[const5], %[const5] \n"
"dpa.w.ph $ac0, %[tmp_t5], %[const1] \n"
"dpa.w.ph $ac1, %[tmp_t6], %[const1] \n"
"dpa.w.ph $ac2, %[tmp_t7], %[const1] \n"
"dpa.w.ph $ac3, %[tmp_t8], %[const1] \n"
"dpa.w.ph $ac0, %[tmp_t1], %[const2] \n"
"dpa.w.ph $ac1, %[tmp_t2], %[const2] \n"
"dpa.w.ph $ac2, %[tmp_t3], %[const2] \n"
"dpa.w.ph $ac3, %[tmp_t4], %[const2] \n"
"extr_r.w %[tmp_t1], $ac0, 8 \n"
"extr_r.w %[tmp_t2], $ac1, 8 \n"
"extr_r.w %[tmp_t3], $ac2, 8 \n"
"extr_r.w %[tmp_t4], $ac3, 8 \n"
"addiu %[src_argb0],%[src_argb0], 16 \n"
"addiu %[dst_y], %[dst_y], 4 \n"
"sb %[tmp_t1], -4(%[dst_y]) \n"
"sb %[tmp_t2], -3(%[dst_y]) \n"
"sb %[tmp_t3], -2(%[dst_y]) \n"
"sb %[tmp_t4], -1(%[dst_y]) \n"
".set pop \n"
: [tmp_t1] "=&r"(tmp_t1), [tmp_t2] "=&r"(tmp_t2),
[tmp_t3] "=&r"(tmp_t3), [tmp_t4] "=&r"(tmp_t4),
[tmp_t5] "=&r"(tmp_t5), [tmp_t6] "=&r"(tmp_t6),
[tmp_t7] "=&r"(tmp_t7), [tmp_t8] "=&r"(tmp_t8),
[src_argb0] "+r"(src_argb0), [dst_y] "+r"(dst_y)
: [const1] "r"(const1), [const2] "r"(const2), [const5] "r"(const5)
: "hi", "lo", "$ac1lo", "$ac1hi", "$ac2lo", "$ac2hi", "$ac3lo",
"$ac3hi");
}
}
void RGBAToUVRow_DSPR2(const uint8* src_rgb0,
int src_stride_rgb,
uint8* dst_u,
uint8* dst_v,
int width) {
const uint8* src_rgb1 = src_rgb0 + src_stride_rgb;
int x;
int const1 = 0xffb60070;
int const2 = 0x0000ffda;
int const3 = 0xffa2ffee;
int const4 = 0x00000070;
int const5 = 0x100;
for (x = 0; x < width - 1; x += 2) {
int tmp_t1, tmp_t2, tmp_t3, tmp_t4, tmp_t5;
int tmp_t6, tmp_t7, tmp_t8;
__asm__ __volatile__(
".set push \n"
".set noreorder \n"
"ulw %[tmp_t1], 0+1(%[src_rgb0]) \n"
"ulw %[tmp_t2], 4+1(%[src_rgb0]) \n"
"ulw %[tmp_t3], 0+1(%[src_rgb1]) \n"
"ulw %[tmp_t4], 4+1(%[src_rgb1]) \n"
"preceu.ph.qbr %[tmp_t5], %[tmp_t1] \n"
"preceu.ph.qbl %[tmp_t1], %[tmp_t1] \n"
"preceu.ph.qbr %[tmp_t6], %[tmp_t2] \n"
"preceu.ph.qbl %[tmp_t2], %[tmp_t2] \n"
"preceu.ph.qbr %[tmp_t7], %[tmp_t3] \n"
"preceu.ph.qbl %[tmp_t3], %[tmp_t3] \n"
"preceu.ph.qbr %[tmp_t8], %[tmp_t4] \n"
"preceu.ph.qbl %[tmp_t4], %[tmp_t4] \n"
"addu.ph %[tmp_t5], %[tmp_t5], %[tmp_t6] \n"
"addu.ph %[tmp_t7], %[tmp_t7], %[tmp_t8] \n"
"addu.ph %[tmp_t1], %[tmp_t1], %[tmp_t2] \n"
"addu.ph %[tmp_t3], %[tmp_t3], %[tmp_t4] \n"
"addu.ph %[tmp_t5], %[tmp_t5], %[tmp_t7] \n"
"addu.ph %[tmp_t1], %[tmp_t1], %[tmp_t3] \n"
"shrl.ph %[tmp_t5], %[tmp_t5], 2 \n"
"shrl.ph %[tmp_t1], %[tmp_t1], 2 \n"
"mult $ac0, %[const5], %[const5] \n"
"mult $ac1, %[const5], %[const5] \n"
"dpaq_s.w.ph $ac0, %[tmp_t5], %[const1] \n"
"dpaq_s.w.ph $ac1, %[tmp_t5], %[const3] \n"
"dpaq_s.w.ph $ac0, %[tmp_t1], %[const2] \n"
"dpaq_s.w.ph $ac1, %[tmp_t1], %[const4] \n"
"extr_r.w %[tmp_t7], $ac0, 9 \n"
"extr_r.w %[tmp_t8], $ac1, 9 \n"
"addiu %[src_rgb0], %[src_rgb0], 8 \n"
"addiu %[src_rgb1], %[src_rgb1], 8 \n"
"addiu %[dst_u], %[dst_u], 1 \n"
"addiu %[dst_v], %[dst_v], 1 \n"
"sb %[tmp_t7], -1(%[dst_u]) \n"
"sb %[tmp_t8], -1(%[dst_v]) \n"
".set pop \n"
: [tmp_t1] "=&r"(tmp_t1), [tmp_t2] "=&r"(tmp_t2),
[tmp_t3] "=&r"(tmp_t3), [tmp_t4] "=&r"(tmp_t4),
[tmp_t5] "=&r"(tmp_t5), [tmp_t6] "=&r"(tmp_t6),
[tmp_t7] "=&r"(tmp_t7), [tmp_t8] "=&r"(tmp_t8),
[src_rgb0] "+r"(src_rgb0), [src_rgb1] "+r"(src_rgb1),
[dst_u] "+r"(dst_u), [dst_v] "+r"(dst_v)
: [const1] "r"(const1), [const2] "r"(const2), [const3] "r"(const3),
[const4] "r"(const4), [const5] "r"(const5)
: "hi", "lo", "$ac1lo", "$ac1hi");
}
}
void RGBAToYRow_DSPR2(const uint8* src_argb0, uint8* dst_y, int width) {
int x;
int const1 = 0x00420081;
int const2 = 0x00190000;
int const5 = 0x40;
for (x = 0; x < width; x += 4) {
int tmp_t1, tmp_t2, tmp_t3, tmp_t4, tmp_t5;
int tmp_t6, tmp_t7, tmp_t8;
__asm__ __volatile__(
".set push \n"
".set noreorder \n"
"lw %[tmp_t1], 0(%[src_argb0]) \n"
"lw %[tmp_t2], 4(%[src_argb0]) \n"
"lw %[tmp_t3], 8(%[src_argb0]) \n"
"lw %[tmp_t4], 12(%[src_argb0]) \n"
"preceu.ph.qbl %[tmp_t5], %[tmp_t1] \n"
"preceu.ph.qbr %[tmp_t1], %[tmp_t1] \n"
"preceu.ph.qbl %[tmp_t6], %[tmp_t2] \n"
"preceu.ph.qbr %[tmp_t2], %[tmp_t2] \n"
"preceu.ph.qbl %[tmp_t7], %[tmp_t3] \n"
"preceu.ph.qbr %[tmp_t3], %[tmp_t3] \n"
"preceu.ph.qbl %[tmp_t8], %[tmp_t4] \n"
"preceu.ph.qbr %[tmp_t4], %[tmp_t4] \n"
"mult $ac0, %[const5], %[const5] \n"
"mult $ac1, %[const5], %[const5] \n"
"mult $ac2, %[const5], %[const5] \n"
"mult $ac3, %[const5], %[const5] \n"
"dpa.w.ph $ac0, %[tmp_t5], %[const1] \n"
"dpa.w.ph $ac1, %[tmp_t6], %[const1] \n"
"dpa.w.ph $ac2, %[tmp_t7], %[const1] \n"
"dpa.w.ph $ac3, %[tmp_t8], %[const1] \n"
"dpa.w.ph $ac0, %[tmp_t1], %[const2] \n"
"dpa.w.ph $ac1, %[tmp_t2], %[const2] \n"
"dpa.w.ph $ac2, %[tmp_t3], %[const2] \n"
"dpa.w.ph $ac3, %[tmp_t4], %[const2] \n"
"extr_r.w %[tmp_t1], $ac0, 8 \n"
"extr_r.w %[tmp_t2], $ac1, 8 \n"
"extr_r.w %[tmp_t3], $ac2, 8 \n"
"extr_r.w %[tmp_t4], $ac3, 8 \n"
"addiu %[dst_y], %[dst_y], 4 \n"
"addiu %[src_argb0],%[src_argb0], 16 \n"
"sb %[tmp_t1], -4(%[dst_y]) \n"
"sb %[tmp_t2], -3(%[dst_y]) \n"
"sb %[tmp_t3], -2(%[dst_y]) \n"
"sb %[tmp_t4], -1(%[dst_y]) \n"
".set pop \n"
: [tmp_t1] "=&r"(tmp_t1), [tmp_t2] "=&r"(tmp_t2),
[tmp_t3] "=&r"(tmp_t3), [tmp_t4] "=&r"(tmp_t4),
[tmp_t5] "=&r"(tmp_t5), [tmp_t6] "=&r"(tmp_t6),
[tmp_t7] "=&r"(tmp_t7), [tmp_t8] "=&r"(tmp_t8),
[src_argb0] "+r"(src_argb0), [dst_y] "+r"(dst_y)
: [const1] "r"(const1), [const2] "r"(const2), [const5] "r"(const5)
: "hi", "lo", "$ac1lo", "$ac1hi", "$ac2lo", "$ac2hi", "$ac3lo",
"$ac3hi");
}
}
void ARGBToUVRow_DSPR2(const uint8* src_rgb0,
int src_stride_rgb,
uint8* dst_u,
uint8* dst_v,
int width) {
const uint8* src_rgb1 = src_rgb0 + src_stride_rgb;
int x;
int const1 = 0xffb60070;
int const2 = 0x0000ffda;
int const3 = 0xffa2ffee;
int const4 = 0x00000070;
int const5 = 0x100;
for (x = 0; x < width - 1; x += 2) {
int tmp_t1, tmp_t2, tmp_t3, tmp_t4, tmp_t5;
int tmp_t6, tmp_t7, tmp_t8;
__asm__ __volatile__(
".set push \n"
".set noreorder \n"
"lw %[tmp_t1], 0(%[src_rgb0]) \n"
"lw %[tmp_t2], 4(%[src_rgb0]) \n"
"lw %[tmp_t3], 0(%[src_rgb1]) \n"
"lw %[tmp_t4], 4(%[src_rgb1]) \n"
"preceu.ph.qbr %[tmp_t5], %[tmp_t1] \n"
"preceu.ph.qbl %[tmp_t1], %[tmp_t1] \n"
"preceu.ph.qbr %[tmp_t6], %[tmp_t2] \n"
"preceu.ph.qbl %[tmp_t2], %[tmp_t2] \n"
"preceu.ph.qbr %[tmp_t7], %[tmp_t3] \n"
"preceu.ph.qbl %[tmp_t3], %[tmp_t3] \n"
"preceu.ph.qbr %[tmp_t8], %[tmp_t4] \n"
"preceu.ph.qbl %[tmp_t4], %[tmp_t4] \n"
"addu.ph %[tmp_t5], %[tmp_t5], %[tmp_t6] \n"
"addu.ph %[tmp_t7], %[tmp_t7], %[tmp_t8] \n"
"addu.ph %[tmp_t1], %[tmp_t1], %[tmp_t2] \n"
"addu.ph %[tmp_t3], %[tmp_t3], %[tmp_t4] \n"
"addu.ph %[tmp_t5], %[tmp_t5], %[tmp_t7] \n"
"addu.ph %[tmp_t1], %[tmp_t1], %[tmp_t3] \n"
"shrl.ph %[tmp_t5], %[tmp_t5], 2 \n"
"shrl.ph %[tmp_t1], %[tmp_t1], 2 \n"
"mult $ac0, %[const5], %[const5] \n"
"mult $ac1, %[const5], %[const5] \n"
"dpaq_s.w.ph $ac0, %[tmp_t5], %[const1] \n"
"dpaq_s.w.ph $ac1, %[tmp_t5], %[const3] \n"
"dpaq_s.w.ph $ac0, %[tmp_t1], %[const2] \n"
"dpaq_s.w.ph $ac1, %[tmp_t1], %[const4] \n"
"extr_r.w %[tmp_t7], $ac0, 9 \n"
"extr_r.w %[tmp_t8], $ac1, 9 \n"
"addiu %[src_rgb0], %[src_rgb0], 8 \n"
"addiu %[src_rgb1], %[src_rgb1], 8 \n"
"addiu %[dst_u], %[dst_u], 1 \n"
"addiu %[dst_v], %[dst_v], 1 \n"
"sb %[tmp_t7], -1(%[dst_u]) \n"
"sb %[tmp_t8], -1(%[dst_v]) \n"
".set pop \n"
: [tmp_t1] "=&r"(tmp_t1), [tmp_t2] "=&r"(tmp_t2),
[tmp_t3] "=&r"(tmp_t3), [tmp_t4] "=&r"(tmp_t4),
[tmp_t5] "=&r"(tmp_t5), [tmp_t6] "=&r"(tmp_t6),
[tmp_t7] "=&r"(tmp_t7), [tmp_t8] "=&r"(tmp_t8),
[src_rgb0] "+r"(src_rgb0), [src_rgb1] "+r"(src_rgb1),
[dst_u] "+r"(dst_u), [dst_v] "+r"(dst_v)
: [const1] "r"(const1), [const2] "r"(const2), [const3] "r"(const3),
[const4] "r"(const4), [const5] "r"(const5)
: "hi", "lo", "$ac1lo", "$ac1hi");
}
}
#endif  // __mips_dsp_rev >= 2
#endif  // defined(__mips__)
...
...@@ -894,6 +894,14 @@ static void ScalePlaneBox(int src_width,
    }
  }
#endif
#if defined(HAS_SCALEADDROW_DSPR2)
if (TestCpuFlag(kCpuHasDSPR2)) {
ScaleAddRow = ScaleAddRow_Any_DSPR2;
if (IS_ALIGNED(src_width, 16)) {
ScaleAddRow = ScaleAddRow_DSPR2;
}
}
#endif
  for (j = 0; j < dst_height; ++j) {
    int boxheight;
...
...@@ -421,6 +421,9 @@ SAANY(ScaleAddRow_Any_NEON, ScaleAddRow_NEON, ScaleAddRow_C, 15)
#ifdef HAS_SCALEADDROW_MSA
SAANY(ScaleAddRow_Any_MSA, ScaleAddRow_MSA, ScaleAddRow_C, 15)
#endif
#ifdef HAS_SCALEADDROW_DSPR2
SAANY(ScaleAddRow_Any_DSPR2, ScaleAddRow_DSPR2, ScaleAddRow_C, 15)
#endif
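// Illustrative sketch of what SAANY(ScaleAddRow_Any_DSPR2, ...) expands to
// under the mask-15 pattern (paraphrased, not the literal macro body): run
// the DSPR2 kernel on the multiple-of-16 prefix, then let the portable C row
// finish the remainder.
void ScaleAddRow_Any_DSPR2_Sketch(const uint8* src_ptr,
                                  uint16* dst_ptr,
                                  int src_width) {
  int n = src_width & ~15;  // largest multiple of 16 <= src_width
  if (n > 0) {
    ScaleAddRow_DSPR2(src_ptr, dst_ptr, n);
  }
  ScaleAddRow_C(src_ptr + n, dst_ptr + n, src_width & 15);
}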
#undef SAANY
#ifdef __cplusplus
...
...@@ -42,10 +42,10 @@ void ScaleRowDown2_DSPR2(const uint8* src_ptr,
      "lw        $t6, 24(%[src_ptr])        \n"  // |27|26|25|24|
      "lw        $t7, 28(%[src_ptr])        \n"  // |31|30|29|28|
      // TODO(fbarchard): Use odd pixels instead of even.
-     "precr.qb.ph   $t8, $t1, $t0          \n"  // |6|4|2|0|
-     "precr.qb.ph   $t0, $t3, $t2          \n"  // |14|12|10|8|
-     "precr.qb.ph   $t1, $t5, $t4          \n"  // |22|20|18|16|
-     "precr.qb.ph   $t2, $t7, $t6          \n"  // |30|28|26|24|
+     "precrq.qb.ph  $t8, $t1, $t0          \n"  // |7|5|3|1|
+     "precrq.qb.ph  $t0, $t3, $t2          \n"  // |15|13|11|9|
+     "precrq.qb.ph  $t1, $t5, $t4          \n"  // |23|21|19|17|
+     "precrq.qb.ph  $t2, $t7, $t6          \n"  // |31|29|27|25|
      "addiu     %[src_ptr], %[src_ptr], 32 \n"
      "addiu     $t9, $t9, -1               \n"
      "sw        $t8, 0(%[dst])             \n"
...@@ -61,7 +61,7 @@ void ScaleRowDown2_DSPR2(const uint8* src_ptr,
      " nop                                 \n"
      "21:                                  \n"
-     "lbu       $t0, 0(%[src_ptr])         \n"
+     "lbu       $t0, 1(%[src_ptr])         \n"
      "addiu     %[src_ptr], %[src_ptr], 2  \n"
      "addiu     $t9, $t9, -1               \n"
      "sb        $t0, 0(%[dst])             \n"
...@@ -198,8 +198,8 @@ void ScaleRowDown4_DSPR2(const uint8* src_ptr,
      "precr.qb.ph   $t2, $t4, $t3          \n"  // |14|12|10|8|
      "precr.qb.ph   $t5, $t6, $t5          \n"  // |22|20|18|16|
      "precr.qb.ph   $t6, $t8, $t7          \n"  // |30|28|26|24|
-     "precr.qb.ph   $t1, $t2, $t1          \n"  // |12|8|4|0|
-     "precr.qb.ph   $t5, $t6, $t5          \n"  // |28|24|20|16|
+     "precrq.qb.ph  $t1, $t2, $t1          \n"  // |14|10|6|2|
+     "precrq.qb.ph  $t5, $t6, $t5          \n"  // |30|26|22|18|
      "addiu     %[src_ptr], %[src_ptr], 32 \n"
      "addiu     $t9, $t9, -1               \n"
      "sw        $t1, 0(%[dst])             \n"
...@@ -213,7 +213,7 @@ void ScaleRowDown4_DSPR2(const uint8* src_ptr,
      " nop                                 \n"
      "21:                                  \n"
-     "lbu       $t1, 0(%[src_ptr])         \n"
+     "lbu       $t1, 2(%[src_ptr])         \n"
      "addiu     %[src_ptr], %[src_ptr], 4  \n"
      "addiu     $t9, $t9, -1               \n"
      "sb        $t1, 0(%[dst])             \n"
...@@ -615,6 +615,51 @@ void ScaleRowDown38_3_Box_DSPR2(const uint8* src_ptr,
      : "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8");
}
void ScaleAddRow_DSPR2(const uint8* src_ptr, uint16* dst_ptr, int src_width) {
int x;
  for (x = 0; x < (src_width & ~7); x += 8) {
uint32 tmp_t1, tmp_t2, tmp_t3, tmp_t4;
uint32 tmp_t5, tmp_t6, tmp_t7, tmp_t8;
__asm__ __volatile__(
".set push \n"
".set noreorder \n"
"lw %[tmp_t5], 0(%[src_ptr]) \n"
"lw %[tmp_t6], 4(%[src_ptr]) \n"
"lw %[tmp_t1], 0(%[dst_ptr]) \n"
"lw %[tmp_t2], 4(%[dst_ptr]) \n"
"lw %[tmp_t3], 8(%[dst_ptr]) \n"
"lw %[tmp_t4], 12(%[dst_ptr]) \n"
"preceu.ph.qbr %[tmp_t7], %[tmp_t5] \n"
"preceu.ph.qbl %[tmp_t8], %[tmp_t5] \n"
"addu.ph %[tmp_t1], %[tmp_t1], %[tmp_t7] \n"
"addu.ph %[tmp_t2], %[tmp_t2], %[tmp_t8] \n"
"preceu.ph.qbr %[tmp_t7], %[tmp_t6] \n"
"preceu.ph.qbl %[tmp_t8], %[tmp_t6] \n"
"addu.ph %[tmp_t3], %[tmp_t3], %[tmp_t7] \n"
"addu.ph %[tmp_t4], %[tmp_t4], %[tmp_t8] \n"
"sw %[tmp_t1], 0(%[dst_ptr]) \n"
"sw %[tmp_t2], 4(%[dst_ptr]) \n"
"sw %[tmp_t3], 8(%[dst_ptr]) \n"
"sw %[tmp_t4], 12(%[dst_ptr]) \n"
".set pop \n"
:
[tmp_t1] "=&r"(tmp_t1), [tmp_t2] "=&r"(tmp_t2), [tmp_t3] "=&r"(tmp_t3),
[tmp_t4] "=&r"(tmp_t4), [tmp_t5] "=&r"(tmp_t5), [tmp_t6] "=&r"(tmp_t6),
[tmp_t7] "=&r"(tmp_t7), [tmp_t8] "=&r"(tmp_t8), [src_ptr] "+r"(src_ptr)
: [dst_ptr] "r"(dst_ptr));
src_ptr += 8;
dst_ptr += 8;
}
  if (src_width & 7) {
    for (x = 0; x < (src_width & 7); x += 1) {
      dst_ptr[0] += src_ptr[0];
      src_ptr += 1;
      dst_ptr += 1;
    }
  }
}
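// The row above is a widening accumulate done eight bytes per iteration
// (preceu.ph.qbr/qbl unpack, addu.ph adds pairs). Its portable equivalent,
// essentially the ScaleAddRow_C fallback, is simply:
void ScaleAddRow_Sketch(const uint8* src_ptr, uint16* dst_ptr,
                        int src_width) {
  int x;
  for (x = 0; x < src_width; ++x) {
    dst_ptr[x] += src_ptr[x];
  }
}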
#endif  // defined(__mips_dsp) && (__mips_dsp_rev >= 2)
#ifdef __cplusplus
...
...@@ -36,22 +36,28 @@ namespace libyuv {
  const int kWidth = ((W1280) > 0) ? (W1280) : 1; \
  const int kHeight = benchmark_height_; \
  align_buffer_page_end(src_y, kWidth* kHeight + OFF); \
- align_buffer_page_end(src_u, SUBSAMPLE(kWidth, SRC_SUBSAMP_X) * \
-                                  SUBSAMPLE(kHeight, SRC_SUBSAMP_Y) + \
-                                  OFF); \
- align_buffer_page_end(src_v, SUBSAMPLE(kWidth, SRC_SUBSAMP_X) * \
-                                  SUBSAMPLE(kHeight, SRC_SUBSAMP_Y) + \
-                                  OFF); \
+ align_buffer_page_end( \
+     src_u, \
+     SUBSAMPLE(kWidth, SRC_SUBSAMP_X) * SUBSAMPLE(kHeight, SRC_SUBSAMP_Y) + \
+         OFF); \
+ align_buffer_page_end( \
+     src_v, \
+     SUBSAMPLE(kWidth, SRC_SUBSAMP_X) * SUBSAMPLE(kHeight, SRC_SUBSAMP_Y) + \
+         OFF); \
  align_buffer_page_end(dst_y_c, kWidth* kHeight); \
- align_buffer_page_end(dst_u_c, SUBSAMPLE(kWidth, SUBSAMP_X) * \
-                                    SUBSAMPLE(kHeight, SUBSAMP_Y)); \
- align_buffer_page_end(dst_v_c, SUBSAMPLE(kWidth, SUBSAMP_X) * \
-                                    SUBSAMPLE(kHeight, SUBSAMP_Y)); \
+ align_buffer_page_end( \
+     dst_u_c, \
+     SUBSAMPLE(kWidth, SUBSAMP_X) * SUBSAMPLE(kHeight, SUBSAMP_Y)); \
+ align_buffer_page_end( \
+     dst_v_c, \
+     SUBSAMPLE(kWidth, SUBSAMP_X) * SUBSAMPLE(kHeight, SUBSAMP_Y)); \
  align_buffer_page_end(dst_y_opt, kWidth* kHeight); \
- align_buffer_page_end(dst_u_opt, SUBSAMPLE(kWidth, SUBSAMP_X) * \
-                                      SUBSAMPLE(kHeight, SUBSAMP_Y)); \
- align_buffer_page_end(dst_v_opt, SUBSAMPLE(kWidth, SUBSAMP_X) * \
-                                      SUBSAMPLE(kHeight, SUBSAMP_Y)); \
+ align_buffer_page_end( \
+     dst_u_opt, \
+     SUBSAMPLE(kWidth, SUBSAMP_X) * SUBSAMPLE(kHeight, SUBSAMP_Y)); \
+ align_buffer_page_end( \
+     dst_v_opt, \
+     SUBSAMPLE(kWidth, SUBSAMP_X) * SUBSAMPLE(kHeight, SUBSAMP_Y)); \
  for (int i = 0; i < kHeight; ++i) \
    for (int j = 0; j < kWidth; ++j) \
      src_y[i * kWidth + j + OFF] = (fastrand() & 0xff); \
...@@ -166,15 +172,19 @@ TESTPLANARTOP(I444, 1, 1, I444, 1, 1)
  align_buffer_page_end(src_uv, \
                        kSizeUV*((PIXEL_STRIDE == 3) ? 3 : 2) + OFF); \
  align_buffer_page_end(dst_y_c, kWidth* kHeight); \
- align_buffer_page_end(dst_u_c, SUBSAMPLE(kWidth, SUBSAMP_X) * \
-                                    SUBSAMPLE(kHeight, SUBSAMP_Y)); \
- align_buffer_page_end(dst_v_c, SUBSAMPLE(kWidth, SUBSAMP_X) * \
-                                    SUBSAMPLE(kHeight, SUBSAMP_Y)); \
+ align_buffer_page_end( \
+     dst_u_c, \
+     SUBSAMPLE(kWidth, SUBSAMP_X) * SUBSAMPLE(kHeight, SUBSAMP_Y)); \
+ align_buffer_page_end( \
+     dst_v_c, \
+     SUBSAMPLE(kWidth, SUBSAMP_X) * SUBSAMPLE(kHeight, SUBSAMP_Y)); \
  align_buffer_page_end(dst_y_opt, kWidth* kHeight); \
- align_buffer_page_end(dst_u_opt, SUBSAMPLE(kWidth, SUBSAMP_X) * \
-                                      SUBSAMPLE(kHeight, SUBSAMP_Y)); \
- align_buffer_page_end(dst_v_opt, SUBSAMPLE(kWidth, SUBSAMP_X) * \
-                                      SUBSAMPLE(kHeight, SUBSAMP_Y)); \
+ align_buffer_page_end( \
+     dst_u_opt, \
+     SUBSAMPLE(kWidth, SUBSAMP_X) * SUBSAMPLE(kHeight, SUBSAMP_Y)); \
+ align_buffer_page_end( \
+     dst_v_opt, \
+     SUBSAMPLE(kWidth, SUBSAMP_X) * SUBSAMPLE(kHeight, SUBSAMP_Y)); \
  uint8* src_u = src_uv + OFF_U; \
  uint8* src_v = src_uv + (PIXEL_STRIDE == 1 ? kSizeUV : OFF_V); \
  int src_stride_uv = SUBSAMPLE(kWidth, SUBSAMP_X) * PIXEL_STRIDE; \
...@@ -284,18 +294,22 @@ TESTAPLANARTOP(Android420, NV21, 2, 1, 0, 2, 2, I420, 2, 2)
  const int kWidth = ((W1280) > 0) ? (W1280) : 1; \
  const int kHeight = benchmark_height_; \
  align_buffer_page_end(src_y, kWidth* kHeight + OFF); \
- align_buffer_page_end(src_u, SUBSAMPLE(kWidth, SRC_SUBSAMP_X) * \
-                                  SUBSAMPLE(kHeight, SRC_SUBSAMP_Y) + \
-                                  OFF); \
- align_buffer_page_end(src_v, SUBSAMPLE(kWidth, SRC_SUBSAMP_X) * \
-                                  SUBSAMPLE(kHeight, SRC_SUBSAMP_Y) + \
-                                  OFF); \
+ align_buffer_page_end( \
+     src_u, \
+     SUBSAMPLE(kWidth, SRC_SUBSAMP_X) * SUBSAMPLE(kHeight, SRC_SUBSAMP_Y) + \
+         OFF); \
+ align_buffer_page_end( \
+     src_v, \
+     SUBSAMPLE(kWidth, SRC_SUBSAMP_X) * SUBSAMPLE(kHeight, SRC_SUBSAMP_Y) + \
+         OFF); \
  align_buffer_page_end(dst_y_c, kWidth* kHeight); \
- align_buffer_page_end(dst_uv_c, SUBSAMPLE(kWidth * 2, SUBSAMP_X) * \
-                                     SUBSAMPLE(kHeight, SUBSAMP_Y)); \
+ align_buffer_page_end( \
+     dst_uv_c, \
+     SUBSAMPLE(kWidth * 2, SUBSAMP_X) * SUBSAMPLE(kHeight, SUBSAMP_Y)); \
  align_buffer_page_end(dst_y_opt, kWidth* kHeight); \
- align_buffer_page_end(dst_uv_opt, SUBSAMPLE(kWidth * 2, SUBSAMP_X) * \
-                                       SUBSAMPLE(kHeight, SUBSAMP_Y)); \
+ align_buffer_page_end( \
+     dst_uv_opt, \
+     SUBSAMPLE(kWidth * 2, SUBSAMP_X) * SUBSAMPLE(kHeight, SUBSAMP_Y)); \
  for (int i = 0; i < kHeight; ++i) \
    for (int j = 0; j < kWidth; ++j) \
      src_y[i * kWidth + j + OFF] = (fastrand() & 0xff); \
...@@ -379,19 +393,24 @@ TESTPLANARTOBP(I420, 2, 2, NV21, 2, 2)
  const int kWidth = ((W1280) > 0) ? (W1280) : 1; \
  const int kHeight = benchmark_height_; \
  align_buffer_page_end(src_y, kWidth* kHeight + OFF); \
- align_buffer_page_end(src_uv, 2 * SUBSAMPLE(kWidth, SRC_SUBSAMP_X) * \
-                                   SUBSAMPLE(kHeight, SRC_SUBSAMP_Y) + \
-                                   OFF); \
+ align_buffer_page_end(src_uv, \
+                       2 * SUBSAMPLE(kWidth, SRC_SUBSAMP_X) * \
+                               SUBSAMPLE(kHeight, SRC_SUBSAMP_Y) + \
+                           OFF); \
  align_buffer_page_end(dst_y_c, kWidth* kHeight); \
- align_buffer_page_end(dst_u_c, SUBSAMPLE(kWidth, SUBSAMP_X) * \
-                                    SUBSAMPLE(kHeight, SUBSAMP_Y)); \
- align_buffer_page_end(dst_v_c, SUBSAMPLE(kWidth, SUBSAMP_X) * \
-                                    SUBSAMPLE(kHeight, SUBSAMP_Y)); \
+ align_buffer_page_end( \
+     dst_u_c, \
+     SUBSAMPLE(kWidth, SUBSAMP_X) * SUBSAMPLE(kHeight, SUBSAMP_Y)); \
+ align_buffer_page_end( \
+     dst_v_c, \
+     SUBSAMPLE(kWidth, SUBSAMP_X) * SUBSAMPLE(kHeight, SUBSAMP_Y)); \
  align_buffer_page_end(dst_y_opt, kWidth* kHeight); \
- align_buffer_page_end(dst_u_opt, SUBSAMPLE(kWidth, SUBSAMP_X) * \
-                                      SUBSAMPLE(kHeight, SUBSAMP_Y)); \
- align_buffer_page_end(dst_v_opt, SUBSAMPLE(kWidth, SUBSAMP_X) * \
-                                      SUBSAMPLE(kHeight, SUBSAMP_Y)); \
+ align_buffer_page_end( \
+     dst_u_opt, \
+     SUBSAMPLE(kWidth, SUBSAMP_X) * SUBSAMPLE(kHeight, SUBSAMP_Y)); \
+ align_buffer_page_end( \
+     dst_v_opt, \
+     SUBSAMPLE(kWidth, SUBSAMP_X) * SUBSAMPLE(kHeight, SUBSAMP_Y)); \
  for (int i = 0; i < kHeight; ++i) \
    for (int j = 0; j < kWidth; ++j) \
      src_y[i * kWidth + j + OFF] = (fastrand() & 0xff); \
...@@ -1369,10 +1388,12 @@ TEST_F(LibYUVConvertTest, MJPGToI420) {
  const int kSize = kImageSize + kOff;
  align_buffer_page_end(orig_pixels, kSize);
  align_buffer_page_end(dst_y_opt, benchmark_width_ * benchmark_height_);
- align_buffer_page_end(dst_u_opt, SUBSAMPLE(benchmark_width_, 2) *
-                                      SUBSAMPLE(benchmark_height_, 2));
- align_buffer_page_end(dst_v_opt, SUBSAMPLE(benchmark_width_, 2) *
-                                      SUBSAMPLE(benchmark_height_, 2));
+ align_buffer_page_end(
+     dst_u_opt,
+     SUBSAMPLE(benchmark_width_, 2) * SUBSAMPLE(benchmark_height_, 2));
+ align_buffer_page_end(
+     dst_v_opt,
+     SUBSAMPLE(benchmark_width_, 2) * SUBSAMPLE(benchmark_height_, 2));
  // EOI, SOI to make MJPG appear valid.
  memset(orig_pixels, 0, kSize);
...@@ -1444,16 +1465,20 @@ TEST_F(LibYUVConvertTest, NV12Crop) {
  uint8* src_uv = src_y + kWidth * kHeight;
  align_buffer_page_end(dst_y, kDestWidth * kDestHeight);
- align_buffer_page_end(dst_u, SUBSAMPLE(kDestWidth, SUBSAMP_X) *
-                                  SUBSAMPLE(kDestHeight, SUBSAMP_Y));
- align_buffer_page_end(dst_v, SUBSAMPLE(kDestWidth, SUBSAMP_X) *
-                                  SUBSAMPLE(kDestHeight, SUBSAMP_Y));
+ align_buffer_page_end(
+     dst_u,
+     SUBSAMPLE(kDestWidth, SUBSAMP_X) * SUBSAMPLE(kDestHeight, SUBSAMP_Y));
+ align_buffer_page_end(
+     dst_v,
+     SUBSAMPLE(kDestWidth, SUBSAMP_X) * SUBSAMPLE(kDestHeight, SUBSAMP_Y));
  align_buffer_page_end(dst_y_2, kDestWidth * kDestHeight);
- align_buffer_page_end(dst_u_2, SUBSAMPLE(kDestWidth, SUBSAMP_X) *
-                                    SUBSAMPLE(kDestHeight, SUBSAMP_Y));
- align_buffer_page_end(dst_v_2, SUBSAMPLE(kDestWidth, SUBSAMP_X) *
-                                    SUBSAMPLE(kDestHeight, SUBSAMP_Y));
+ align_buffer_page_end(
+     dst_u_2,
+     SUBSAMPLE(kDestWidth, SUBSAMP_X) * SUBSAMPLE(kDestHeight, SUBSAMP_Y));
+ align_buffer_page_end(
+     dst_v_2,
+     SUBSAMPLE(kDestWidth, SUBSAMP_X) * SUBSAMPLE(kDestHeight, SUBSAMP_Y));
  for (int i = 0; i < kHeight * kWidth; ++i) {
    src_y[i] = (fastrand() & 0xff);
...
...@@ -356,16 +356,18 @@ int main(int argc, const char* argv[]) {
  const int uv_size = ((image_width + 1) / 2) * ((image_height + 1) / 2);
  const size_t total_size = y_size + 2 * uv_size;  // NOLINT
#if defined(_MSC_VER)
- _fseeki64(file_org, static_cast<__int64>(num_skip_org) *
-                         static_cast<__int64>(total_size),
-           SEEK_SET);
+ _fseeki64(
+     file_org,
+     static_cast<__int64>(num_skip_org) * static_cast<__int64>(total_size),
+     SEEK_SET);
#else
  fseek(file_org, num_skip_org * total_size, SEEK_SET);
#endif
  for (int cur_rec = 0; cur_rec < num_rec; ++cur_rec) {
#if defined(_MSC_VER)
- _fseeki64(file_rec[cur_rec], static_cast<__int64>(num_skip_rec) *
-                                  static_cast<__int64>(total_size),
-           SEEK_SET);
+ _fseeki64(
+     file_rec[cur_rec],
+     static_cast<__int64>(num_skip_rec) * static_cast<__int64>(total_size),
+     SEEK_SET);
#else
  fseek(file_rec[cur_rec], num_skip_rec * total_size, SEEK_SET);
...