/*
 *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
#include "libyuv/row.h"

#include <string.h>  // For memcpy and memset.

#include "libyuv/basic_types.h"

#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif

// This module is for Mips MMI.
#if !defined(LIBYUV_DISABLE_MMI) && defined(_MIPS_ARCH_LOONGSON3A)

void RGB24ToARGBRow_MMI(const uint8_t* src_rgb24,
                        uint8_t* dst_argb,
                        int width) {
  uint64_t src0, src1, dest;
  const uint64_t mask = 0xff000000ULL;

  __asm__ volatile(
      "1:                                                           \n\t"
      "gslwlc1    %[src0],         0x03(%[src_ptr])                 \n\t"
      "gslwrc1    %[src0],         0x00(%[src_ptr])                 \n\t"
      "gslwlc1    %[src1],         0x06(%[src_ptr])                 \n\t"
      "gslwrc1    %[src1],         0x03(%[src_ptr])                 \n\t"

      "or         %[src0],         %[src0],           %[mask]       \n\t"
      "or         %[src1],         %[src1],           %[mask]       \n\t"
      "punpcklwd  %[dest],         %[src0],           %[src1]       \n\t"
      "gssdlc1    %[dest],         0x07(%[dst_ptr])                 \n\t"
      "gssdrc1    %[dest],         0x00(%[dst_ptr])                 \n\t"

      "gslwlc1    %[src0],         0x09(%[src_ptr])                 \n\t"
      "gslwrc1    %[src0],         0x06(%[src_ptr])                 \n\t"
      "gslwlc1    %[src1],         0x0c(%[src_ptr])                 \n\t"
      "gslwrc1    %[src1],         0x09(%[src_ptr])                 \n\t"

      "or         %[src0],         %[src0],           %[mask]       \n\t"
      "or         %[src1],         %[src1],           %[mask]       \n\t"
      "punpcklwd  %[dest],         %[src0],           %[src1]       \n\t"
      "gssdlc1    %[dest],         0x0f(%[dst_ptr])                 \n\t"
      "gssdrc1    %[dest],         0x08(%[dst_ptr])                 \n\t"

      "daddiu     %[src_ptr],      %[src_ptr],        0x0c          \n\t"
      "daddiu     %[dst_ptr],      %[dst_ptr],        0x10          \n\t"
      "daddi      %[width],        %[width],         -0x04          \n\t"
      "bnez       %[width],        1b                               \n\t"
      : [src0] "=&f"(src0), [src1] "=&f"(src1), [dest] "=&f"(dest)
      : [src_ptr] "r"(src_rgb24), [dst_ptr] "r"(dst_argb), [width] "r"(width),
        [mask] "f"(mask)
      : "memory");
}

void RAWToARGBRow_MMI(const uint8_t* src_raw, uint8_t* dst_argb, int width) {
  uint64_t src0, src1, dest;
  const uint64_t mask0 = 0x0;
  const uint64_t mask1 = 0xff000000ULL;
  const uint64_t mask2 = 0xc6;

  __asm__ volatile(
      "1:                                                           \n\t"
      "gslwlc1    %[src0],         0x03(%[src_ptr])                 \n\t"
      "gslwrc1    %[src0],         0x00(%[src_ptr])                 \n\t"
      "gslwlc1    %[src1],         0x06(%[src_ptr])                 \n\t"
      "gslwrc1    %[src1],         0x03(%[src_ptr])                 \n\t"

      "or         %[src0],         %[src0],           %[mask1]      \n\t"
      "punpcklbh  %[src0],         %[src0],           %[mask0]      \n\t"
      "pshufh     %[src0],         %[src0],           %[mask2]      \n\t"
      "or         %[src1],         %[src1],           %[mask1]      \n\t"
      "punpcklbh  %[src1],         %[src1],           %[mask0]      \n\t"
      "pshufh     %[src1],         %[src1],           %[mask2]      \n\t"
      "packushb   %[dest],         %[src0],           %[src1]       \n\t"
      "gssdlc1    %[dest],         0x07(%[dst_ptr])                 \n\t"
      "gssdrc1    %[dest],         0x00(%[dst_ptr])                 \n\t"

      "gslwlc1    %[src0],         0x09(%[src_ptr])                 \n\t"
      "gslwrc1    %[src0],         0x06(%[src_ptr])                 \n\t"
      "gslwlc1    %[src1],         0x0c(%[src_ptr])                 \n\t"
      "gslwrc1    %[src1],         0x09(%[src_ptr])                 \n\t"

      "or         %[src0],         %[src0],           %[mask1]      \n\t"
      "punpcklbh  %[src0],         %[src0],           %[mask0]      \n\t"
      "pshufh     %[src0],         %[src0],           %[mask2]      \n\t"
      "or         %[src1],         %[src1],           %[mask1]      \n\t"
      "punpcklbh  %[src1],         %[src1],           %[mask0]      \n\t"
      "pshufh     %[src1],         %[src1],           %[mask2]      \n\t"
      "packushb   %[dest],         %[src0],           %[src1]       \n\t"
      "gssdlc1    %[dest],         0x0f(%[dst_ptr])                 \n\t"
      "gssdrc1    %[dest],         0x08(%[dst_ptr])                 \n\t"

      "daddiu     %[src_ptr],      %[src_ptr],        0x0c          \n\t"
      "daddiu     %[dst_ptr],      %[dst_ptr],        0x10          \n\t"
      "daddi      %[width],        %[width],         -0x04          \n\t"
      "bnez       %[width],        1b                               \n\t"
      : [src0] "=&f"(src0), [src1] "=&f"(src1), [dest] "=&f"(dest)
      : [src_ptr] "r"(src_raw), [dst_ptr] "r"(dst_argb), [mask0] "f"(mask0),
        [mask1] "f"(mask1), [mask2] "f"(mask2), [width] "r"(width)
      : "memory");
}

void RAWToRGB24Row_MMI(const uint8_t* src_raw, uint8_t* dst_rgb24, int width) {
  uint64_t src0, src1;
  uint64_t ftmp[4];
  uint64_t mask0 = 0xc6;
  uint64_t mask1 = 0x6c;

  __asm__ volatile(
      "1:                                                           \n\t"
      "gsldrc1    %[src0],         0x00(%[src_raw])                 \n\t"
      "gsldlc1    %[src0],         0x07(%[src_raw])                 \n\t"
      "gslwrc1    %[src1],         0x08(%[src_raw])                 \n\t"
      "gslwlc1    %[src1],         0x0b(%[src_raw])                 \n\t"

      "punpcklbh  %[ftmp0],        %[src0],           %[zero]       \n\t"
      "pshufh     %[ftmp0],        %[ftmp0],          %[mask0]      \n\t"
      "punpckhbh  %[ftmp1],        %[src0],           %[zero]       \n\t"
      "punpcklbh  %[src1],         %[src1],           %[zero]       \n\t"
      "pextrh     %[ftmp2],        %[ftmp0],          %[three]      \n\t"
      "pextrh     %[ftmp3],        %[ftmp1],          %[one]        \n\t"
      "pinsrh_3   %[ftmp0],        %[ftmp0],          %[ftmp3]      \n\t"
      "pextrh     %[ftmp3],        %[ftmp1],          %[two]        \n\t"
      "pinsrh_1   %[ftmp1],        %[ftmp1],          %[ftmp2]      \n\t"
      "pshufh     %[src1],         %[src1],           %[mask1]      \n\t"
      "pextrh     %[ftmp2],        %[src1],           %[zero]       \n\t"
      "pinsrh_2   %[ftmp1],        %[ftmp1],          %[ftmp2]      \n\t"
      "pinsrh_0   %[src1],         %[src1],           %[ftmp3]      \n\t"
      "packushb   %[ftmp0],        %[ftmp0],          %[ftmp1]      \n\t"
      "packushb   %[src1],         %[src1],           %[zero]       \n\t"

      "gssdrc1    %[ftmp0],        0x00(%[dst_rgb24])               \n\t"
      "gssdlc1    %[ftmp0],        0x07(%[dst_rgb24])               \n\t"
      "gsswrc1    %[src1],         0x08(%[dst_rgb24])               \n\t"
      "gsswlc1    %[src1],         0x0b(%[dst_rgb24])               \n\t"

      "daddiu     %[src_raw],      %[src_raw],        0x0c          \n\t"
      "daddiu     %[dst_rgb24],    %[dst_rgb24],      0x0c          \n\t"
      "daddiu     %[width],        %[width],         -0x04          \n\t"
      "bgtz       %[width],        1b                               \n\t"
      : [src0] "=&f"(src0), [src1] "=&f"(src1), [ftmp0] "=&f"(ftmp[0]),
        [ftmp1] "=&f"(ftmp[1]), [ftmp2] "=&f"(ftmp[2]), [ftmp3] "=&f"(ftmp[3])
      : [src_raw] "r"(src_raw), [dst_rgb24] "r"(dst_rgb24), [width] "r"(width),
        [mask0] "f"(mask0), [mask1] "f"(mask1), [zero] "f"(0x00),
        [one] "f"(0x01), [two] "f"(0x02), [three] "f"(0x03)
      : "memory");
}

void RGB565ToARGBRow_MMI(const uint8_t* src_rgb565,
                         uint8_t* dst_argb,
                         int width) {
  uint64_t ftmp[5];
  uint64_t c0 = 0x001f001f001f001f;
  uint64_t c1 = 0x00ff00ff00ff00ff;
  uint64_t c2 = 0x0007000700070007;
  __asm__ volatile(
      "1:                                                      \n\t"
      "gsldrc1   %[src0],       0x00(%[src_rgb565])            \n\t"
      "gsldlc1   %[src0],       0x07(%[src_rgb565])            \n\t"
      "psrlh     %[src1],       %[src0],             %[eight]  \n\t"
      "and       %[b],          %[src0],             %[c0]     \n\t"
      "and       %[src0],       %[src0],             %[c1]     \n\t"
      "psrlh     %[src0],       %[src0],             %[five]   \n\t"
      "and       %[g],          %[src1],             %[c2]     \n\t"
      "psllh     %[g],          %[g],                %[three]  \n\t"
      "or        %[g],          %[src0],             %[g]      \n\t"
      "psrlh     %[r],          %[src1],             %[three]  \n\t"
      "psllh     %[src0],       %[b],                %[three]  \n\t"
      "psrlh     %[src1],       %[b],                %[two]    \n\t"
      "or        %[b],          %[src0],             %[src1]   \n\t"
      "psllh     %[src0],       %[g],                %[two]    \n\t"
      "psrlh     %[src1],       %[g],                %[four]   \n\t"
      "or        %[g],          %[src0],             %[src1]   \n\t"
      "psllh     %[src0],       %[r],                %[three]  \n\t"
      "psrlh     %[src1],       %[r],                %[two]    \n\t"
      "or        %[r],          %[src0],             %[src1]   \n\t"
      "packushb  %[b],          %[b],                %[r]      \n\t"
      "packushb  %[g],          %[g],                %[c1]     \n\t"
      "punpcklbh %[src0],       %[b],                %[g]      \n\t"
      "punpckhbh %[src1],       %[b],                %[g]      \n\t"
      "punpcklhw %[r],          %[src0],             %[src1]   \n\t"
      "gssdrc1   %[r],          0x00(%[dst_argb])              \n\t"
      "gssdlc1   %[r],          0x07(%[dst_argb])              \n\t"
      "punpckhhw %[r],          %[src0],             %[src1]   \n\t"
      "gssdrc1   %[r],          0x08(%[dst_argb])              \n\t"
      "gssdlc1   %[r],          0x0f(%[dst_argb])              \n\t"
      "daddiu    %[src_rgb565], %[src_rgb565],       0x08      \n\t"
      "daddiu    %[dst_argb],   %[dst_argb],         0x10      \n\t"
      "daddiu    %[width],      %[width],           -0x04      \n\t"
      "bgtz      %[width],     1b                              \n\t"
      : [src0] "=&f"(ftmp[0]), [src1] "=&f"(ftmp[1]), [b] "=&f"(ftmp[2]),
        [g] "=&f"(ftmp[3]), [r] "=&f"(ftmp[4])
      : [src_rgb565] "r"(src_rgb565), [dst_argb] "r"(dst_argb),
        [width] "r"(width), [c0] "f"(c0), [c1] "f"(c1), [c2] "f"(c2),
        [eight] "f"(0x08), [five] "f"(0x05), [three] "f"(0x03), [two] "f"(0x02),
        [four] "f"(0x04)
      : "memory");
}

void ARGB1555ToARGBRow_MMI(const uint8_t* src_argb1555,
                           uint8_t* dst_argb,
                           int width) {
  uint64_t ftmp[6];
  uint64_t c0 = 0x001f001f001f001f;
  uint64_t c1 = 0x00ff00ff00ff00ff;
  uint64_t c2 = 0x0003000300030003;
  uint64_t c3 = 0x007c007c007c007c;
  uint64_t c4 = 0x0001000100010001;
  __asm__ volatile(
      "1:                                                         \n\t"
      "gsldrc1   %[src0],         0x00(%[src_argb1555])           \n\t"
      "gsldlc1   %[src0],         0x07(%[src_argb1555])           \n\t"
      "psrlh     %[src1],         %[src0],              %[eight]  \n\t"
      "and       %[b],            %[src0],              %[c0]     \n\t"
      "and       %[src0],         %[src0],              %[c1]     \n\t"
      "psrlh     %[src0],         %[src0],              %[five]   \n\t"
      "and       %[g],            %[src1],              %[c2]     \n\t"
      "psllh     %[g],            %[g],                 %[three]  \n\t"
      "or        %[g],            %[src0],              %[g]      \n\t"
      "and       %[r],            %[src1],              %[c3]     \n\t"
      "psrlh     %[r],            %[r],                 %[two]    \n\t"
      "psrlh     %[a],            %[src1],              %[seven]  \n\t"
      "psllh     %[src0],         %[b],                 %[three]  \n\t"
      "psrlh     %[src1],         %[b],                 %[two]    \n\t"
      "or        %[b],            %[src0],              %[src1]   \n\t"
      "psllh     %[src0],         %[g],                 %[three]  \n\t"
      "psrlh     %[src1],         %[g],                 %[two]    \n\t"
      "or        %[g],            %[src0],              %[src1]   \n\t"
      "psllh     %[src0],         %[r],                 %[three]  \n\t"
      "psrlh     %[src1],         %[r],                 %[two]    \n\t"
      "or        %[r],            %[src0],              %[src1]   \n\t"
      "xor       %[a],            %[a],                 %[c1]     \n\t"
      "paddb     %[a],            %[a],                 %[c4]     \n\t"
      "packushb  %[b],            %[b],                 %[r]      \n\t"
      "packushb  %[g],            %[g],                 %[a]      \n\t"
      "punpcklbh %[src0],         %[b],                 %[g]      \n\t"
      "punpckhbh %[src1],         %[b],                 %[g]      \n\t"
      "punpcklhw %[r],            %[src0],              %[src1]   \n\t"
      "gssdrc1   %[r],            0x00(%[dst_argb])               \n\t"
      "gssdlc1   %[r],            0x07(%[dst_argb])               \n\t"
      "punpckhhw %[r],            %[src0],              %[src1]   \n\t"
      "gssdrc1   %[r],            0x08(%[dst_argb])               \n\t"
      "gssdlc1   %[r],            0x0f(%[dst_argb])               \n\t"
      "daddiu    %[src_argb1555], %[src_argb1555],      0x08      \n\t"
      "daddiu    %[dst_argb],     %[dst_argb],          0x10      \n\t"
      "daddiu    %[width],        %[width],            -0x04      \n\t"
      "bgtz      %[width],        1b                              \n\t"
      : [src0] "=&f"(ftmp[0]), [src1] "=&f"(ftmp[1]), [b] "=&f"(ftmp[2]),
        [g] "=&f"(ftmp[3]), [r] "=&f"(ftmp[4]), [a] "=&f"(ftmp[5])
      : [src_argb1555] "r"(src_argb1555), [dst_argb] "r"(dst_argb),
        [width] "r"(width), [c0] "f"(c0), [c1] "f"(c1), [c2] "f"(c2),
        [c3] "f"(c3), [c4] "f"(c4), [eight] "f"(0x08), [five] "f"(0x05),
        [three] "f"(0x03), [two] "f"(0x02), [seven] "f"(0x07)
      : "memory");
}

void ARGB4444ToARGBRow_MMI(const uint8_t* src_argb4444,
                           uint8_t* dst_argb,
                           int width) {
  uint64_t ftmp[6];
  uint64_t c0 = 0x000f000f000f000f;
  uint64_t c1 = 0x00ff00ff00ff00ff;
  __asm__ volatile(
      "1:                                                          \n\t"
      "gsldrc1   %[src0],         0x00(%[src_argb4444])            \n\t"
      "gsldlc1   %[src0],         0x07(%[src_argb4444])            \n\t"
      "psrlh     %[src1],         %[src0],              %[eight]   \n\t"
      "and       %[b],            %[src0],              %[c0]      \n\t"
      "and       %[src0],         %[src0],              %[c1]      \n\t"
      "psrlh     %[g],            %[src0],              %[four]    \n\t"
      "and       %[r],            %[src1],              %[c0]      \n\t"
      "psrlh     %[a],            %[src1],              %[four]    \n\t"
      "psllh     %[src0],         %[b],                 %[four]    \n\t"
      "or        %[b],            %[src0],              %[b]       \n\t"
      "psllh     %[src0],         %[g],                 %[four]    \n\t"
      "or        %[g],            %[src0],              %[g]       \n\t"
      "psllh     %[src0],         %[r],                 %[four]    \n\t"
      "or        %[r],            %[src0],              %[r]       \n\t"
      "psllh     %[src0],         %[a],                 %[four]    \n\t"
      "or        %[a],            %[src0],              %[a]       \n\t"
      "packushb  %[b],            %[b],                 %[r]       \n\t"
      "packushb  %[g],            %[g],                 %[a]       \n\t"
      "punpcklbh %[src0],         %[b],                 %[g]       \n\t"
      "punpckhbh %[src1],         %[b],                 %[g]       \n\t"
      "punpcklhw %[r],            %[src0],              %[src1]    \n\t"
      "gssdrc1   %[r],            0x00(%[dst_argb])                \n\t"
      "gssdlc1   %[r],            0x07(%[dst_argb])                \n\t"
      "punpckhhw %[r],            %[src0],              %[src1]    \n\t"
      "gssdrc1   %[r],            0x08(%[dst_argb])                \n\t"
      "gssdlc1   %[r],            0x0f(%[dst_argb])                \n\t"
      "daddiu    %[src_argb4444], %[src_argb4444],      0x08       \n\t"
      "daddiu    %[dst_argb],     %[dst_argb],          0x10       \n\t"
      "daddiu    %[width],        %[width],            -0x04       \n\t"
      "bgtz      %[width],        1b                               \n\t"
      : [src0] "=&f"(ftmp[0]), [src1] "=&f"(ftmp[1]), [b] "=&f"(ftmp[2]),
        [g] "=&f"(ftmp[3]), [r] "=&f"(ftmp[4]), [a] "=&f"(ftmp[5])
      : [src_argb4444] "r"(src_argb4444), [dst_argb] "r"(dst_argb),
        [width] "r"(width), [c0] "f"(c0), [c1] "f"(c1), [eight] "f"(0x08),
        [four] "f"(0x04)
      : "memory");
}

void ARGBToRGB24Row_MMI(const uint8_t* src_argb, uint8_t* dst_rgb, int width) {
  uint64_t src;

  __asm__ volatile(
      "1:                                                           \n\t"
      "gslwlc1    %[src],          0x03(%[src_ptr])                 \n\t"
      "gslwrc1    %[src],          0x00(%[src_ptr])                 \n\t"
      "gsswlc1    %[src],          0x03(%[dst_ptr])                 \n\t"
      "gsswrc1    %[src],          0x00(%[dst_ptr])                 \n\t"

      "gslwlc1    %[src],          0x07(%[src_ptr])                 \n\t"
      "gslwrc1    %[src],          0x04(%[src_ptr])                 \n\t"
      "gsswlc1    %[src],          0x06(%[dst_ptr])                 \n\t"
      "gsswrc1    %[src],          0x03(%[dst_ptr])                 \n\t"

      "gslwlc1    %[src],          0x0b(%[src_ptr])                 \n\t"
      "gslwrc1    %[src],          0x08(%[src_ptr])                 \n\t"
      "gsswlc1    %[src],          0x09(%[dst_ptr])                 \n\t"
      "gsswrc1    %[src],          0x06(%[dst_ptr])                 \n\t"

      "gslwlc1    %[src],          0x0f(%[src_ptr])                 \n\t"
      "gslwrc1    %[src],          0x0c(%[src_ptr])                 \n\t"
      "gsswlc1    %[src],          0x0c(%[dst_ptr])                 \n\t"
      "gsswrc1    %[src],          0x09(%[dst_ptr])                 \n\t"

      "daddiu     %[src_ptr],      %[src_ptr],        0x10          \n\t"
      "daddiu     %[dst_ptr],      %[dst_ptr],        0x0c          \n\t"
      "daddi      %[width],        %[width],         -0x04          \n\t"
      "bnez       %[width],        1b                               \n\t"
      : [src] "=&f"(src)
      : [src_ptr] "r"(src_argb), [dst_ptr] "r"(dst_rgb), [width] "r"(width)
      : "memory");
}

void ARGBToRAWRow_MMI(const uint8_t* src_argb, uint8_t* dst_rgb, int width) {
  uint64_t src0, src1;
  uint64_t ftmp[3];
  uint64_t mask0 = 0xc6;
  uint64_t mask1 = 0x18;

  __asm__ volatile(
      "1:                                                           \n\t"
      "gsldrc1    %[src0],         0x00(%[src_argb])                \n\t"
      "gsldlc1    %[src0],         0x07(%[src_argb])                \n\t"
      "gsldrc1    %[src1],         0x08(%[src_argb])                \n\t"
      "gsldlc1    %[src1],         0x0f(%[src_argb])                \n\t"

      "punpcklbh  %[ftmp0],        %[src0],           %[zero]       \n\t"
      "pshufh     %[ftmp0],        %[ftmp0],          %[mask0]      \n\t"
      "punpckhbh  %[ftmp1],        %[src0],           %[zero]       \n\t"
      "punpcklbh  %[ftmp2],        %[src1],           %[zero]       \n\t"
      "punpckhbh  %[src1],         %[src1],           %[zero]       \n\t"

      "pextrh     %[src0],         %[ftmp1],          %[two]        \n\t"
      "pinsrh_3   %[ftmp0],        %[ftmp0],          %[src0]       \n\t"
      "pshufh     %[ftmp1],        %[ftmp1],          %[one]        \n\t"

      "pextrh     %[src0],         %[ftmp2],          %[two]        \n\t"
      "pinsrh_2   %[ftmp1],        %[ftmp1],          %[src0]       \n\t"
      "pextrh     %[src0],         %[ftmp2],          %[one]        \n\t"
      "pinsrh_3   %[ftmp1],        %[ftmp1],          %[src0]       \n\t"
      "pextrh     %[src0],         %[ftmp2],          %[zero]       \n\t"
      "pshufh     %[src1],         %[src1],           %[mask1]      \n\t"
      "pinsrh_0   %[src1],         %[src1],           %[src0]       \n\t"
      "packushb   %[ftmp0],        %[ftmp0],          %[ftmp1]      \n\t"
      "packushb   %[src1],         %[src1],           %[zero]       \n\t"

      "gssdrc1    %[ftmp0],        0x00(%[dst_rgb])                 \n\t"
      "gssdlc1    %[ftmp0],        0x07(%[dst_rgb])                 \n\t"
      "gsswrc1    %[src1],         0x08(%[dst_rgb])                 \n\t"
      "gsswlc1    %[src1],         0x0b(%[dst_rgb])                 \n\t"

      "daddiu     %[src_argb],     %[src_argb],       0x10          \n\t"
      "daddiu     %[dst_rgb],      %[dst_rgb],        0x0c          \n\t"
      "daddiu     %[width],        %[width],         -0x04          \n\t"
      "bgtz       %[width],        1b                               \n\t"
      : [src0] "=&f"(src0), [src1] "=&f"(src1), [ftmp0] "=&f"(ftmp[0]),
        [ftmp1] "=&f"(ftmp[1]), [ftmp2] "=&f"(ftmp[2])
      : [src_argb] "r"(src_argb), [dst_rgb] "r"(dst_rgb), [width] "r"(width),
        [mask0] "f"(mask0), [mask1] "f"(mask1), [zero] "f"(0x00),
        [one] "f"(0x01), [two] "f"(0x02)
      : "memory");
}

void ARGBToRGB565Row_MMI(const uint8_t* src_argb, uint8_t* dst_rgb, int width) {
  uint64_t src0, src1;
  uint64_t ftmp[3];

  __asm__ volatile(
      "1:                                                           \n\t"
      "gsldrc1    %[src0],         0x00(%[src_argb])                \n\t"
      "gsldlc1    %[src0],         0x07(%[src_argb])                \n\t"
      "gsldrc1    %[src1],         0x08(%[src_argb])                \n\t"
      "gsldlc1    %[src1],         0x0f(%[src_argb])                \n\t"

      "punpcklbh  %[b],            %[src0],           %[src1]       \n\t"
      "punpckhbh  %[g],            %[src0],           %[src1]       \n\t"
      "punpcklbh  %[src0],         %[b],              %[g]          \n\t"
      "punpckhbh  %[src1],         %[b],              %[g]          \n\t"
      "punpcklbh  %[b],            %[src0],           %[zero]       \n\t"
      "punpckhbh  %[g],            %[src0],           %[zero]       \n\t"
      "punpcklbh  %[r],            %[src1],           %[zero]       \n\t"

      "psrlh      %[b],            %[b],              %[three]      \n\t"
      "psrlh      %[g],            %[g],              %[two]        \n\t"
      "psrlh      %[r],            %[r],              %[three]      \n\t"

      "psllh      %[g],            %[g],              %[five]       \n\t"
      "psllh      %[r],            %[r],              %[eleven]     \n\t"
      "or         %[b],            %[b],              %[g]          \n\t"
      "or         %[b],            %[b],              %[r]          \n\t"

      "gssdrc1    %[b],            0x00(%[dst_rgb])                 \n\t"
      "gssdlc1    %[b],            0x07(%[dst_rgb])                 \n\t"

      "daddiu     %[src_argb],     %[src_argb],       0x10          \n\t"
      "daddiu     %[dst_rgb],      %[dst_rgb],        0x08          \n\t"
      "daddiu     %[width],        %[width],         -0x04          \n\t"
      "bgtz       %[width],        1b                               \n\t"
      : [src0] "=&f"(src0), [src1] "=&f"(src1), [b] "=&f"(ftmp[0]),
        [g] "=&f"(ftmp[1]), [r] "=&f"(ftmp[2])
      : [src_argb] "r"(src_argb), [dst_rgb] "r"(dst_rgb), [width] "r"(width),
        [zero] "f"(0x00), [two] "f"(0x02), [three] "f"(0x03), [five] "f"(0x05),
        [eleven] "f"(0x0b)
      : "memory");
}

// dither4 is a row of 4 values from 4x4 dither matrix.
// The 4x4 matrix contains values to increase RGB.  When converting to
// fewer bits (565) this provides an ordered dither.
// The order in the 4x4 matrix in first byte is upper left.
// The 4 values are passed as an int, then referenced as an array, so
// endian will not affect order of the original matrix.  But the dither4
// will containing the first pixel in the lower byte for little endian
// or the upper byte for big endian.
void ARGBToRGB565DitherRow_MMI(const uint8_t* src_argb,
                               uint8_t* dst_rgb,
                               const uint32_t dither4,
                               int width) {
  uint64_t src0, src1;
  uint64_t ftmp[3];
  uint64_t c0 = 0x00ff00ff00ff00ff;

  __asm__ volatile(
      "punpcklbh  %[dither],       %[dither],         %[zero]       \n\t"
      "1:                                                           \n\t"
      "gsldrc1    %[src0],         0x00(%[src_argb])                \n\t"
      "gsldlc1    %[src0],         0x07(%[src_argb])                \n\t"
      "gsldrc1    %[src1],         0x08(%[src_argb])                \n\t"
      "gsldlc1    %[src1],         0x0f(%[src_argb])                \n\t"

      "punpcklbh  %[b],            %[src0],           %[src1]       \n\t"
      "punpckhbh  %[g],            %[src0],           %[src1]       \n\t"
      "punpcklbh  %[src0],         %[b],              %[g]          \n\t"
      "punpckhbh  %[src1],         %[b],              %[g]          \n\t"
      "punpcklbh  %[b],            %[src0],           %[zero]       \n\t"
      "punpckhbh  %[g],            %[src0],           %[zero]       \n\t"
      "punpcklbh  %[r],            %[src1],           %[zero]       \n\t"

      "paddh      %[b],            %[b],              %[dither]     \n\t"
      "paddh      %[g],            %[g],              %[dither]     \n\t"
      "paddh      %[r],            %[r],              %[dither]     \n\t"
      "pcmpgth    %[src0],         %[b],              %[c0]         \n\t"
      "or         %[src0],         %[src0],           %[b]          \n\t"
      "and        %[b],            %[src0],           %[c0]         \n\t"
      "pcmpgth    %[src0],         %[g],              %[c0]         \n\t"
      "or         %[src0],         %[src0],           %[g]          \n\t"
      "and        %[g],            %[src0],           %[c0]         \n\t"
      "pcmpgth    %[src0],         %[r],              %[c0]         \n\t"
      "or         %[src0],         %[src0],           %[r]          \n\t"
      "and        %[r],            %[src0],           %[c0]         \n\t"

      "psrlh      %[b],            %[b],              %[three]      \n\t"
      "psrlh      %[g],            %[g],              %[two]        \n\t"
      "psrlh      %[r],            %[r],              %[three]      \n\t"

      "psllh      %[g],            %[g],              %[five]       \n\t"
      "psllh      %[r],            %[r],              %[eleven]     \n\t"
      "or         %[b],            %[b],              %[g]          \n\t"
      "or         %[b],            %[b],              %[r]          \n\t"

      "gssdrc1    %[b],            0x00(%[dst_rgb])                 \n\t"
      "gssdlc1    %[b],            0x07(%[dst_rgb])                 \n\t"

      "daddiu     %[src_argb],     %[src_argb],       0x10          \n\t"
      "daddiu     %[dst_rgb],      %[dst_rgb],        0x08          \n\t"
      "daddiu     %[width],        %[width],         -0x04          \n\t"
      "bgtz       %[width],        1b                               \n\t"
      : [src0] "=&f"(src0), [src1] "=&f"(src1), [b] "=&f"(ftmp[0]),
        [g] "=&f"(ftmp[1]), [r] "=&f"(ftmp[2])
      : [src_argb] "r"(src_argb), [dst_rgb] "r"(dst_rgb), [width] "r"(width),
        [dither] "f"(dither4), [c0] "f"(c0), [zero] "f"(0x00), [two] "f"(0x02),
        [three] "f"(0x03), [five] "f"(0x05), [eleven] "f"(0x0b)
      : "memory");
}

void ARGBToARGB1555Row_MMI(const uint8_t* src_argb,
                           uint8_t* dst_rgb,
                           int width) {
  uint64_t src0, src1;
  uint64_t ftmp[4];

  __asm__ volatile(
      "1:                                                           \n\t"
      "gsldrc1    %[src0],         0x00(%[src_argb])                \n\t"
      "gsldlc1    %[src0],         0x07(%[src_argb])                \n\t"
      "gsldrc1    %[src1],         0x08(%[src_argb])                \n\t"
      "gsldlc1    %[src1],         0x0f(%[src_argb])                \n\t"

      "punpcklbh  %[b],            %[src0],           %[src1]       \n\t"
      "punpckhbh  %[g],            %[src0],           %[src1]       \n\t"
      "punpcklbh  %[src0],         %[b],              %[g]          \n\t"
      "punpckhbh  %[src1],         %[b],              %[g]          \n\t"
      "punpcklbh  %[b],            %[src0],           %[zero]       \n\t"
      "punpckhbh  %[g],            %[src0],           %[zero]       \n\t"
      "punpcklbh  %[r],            %[src1],           %[zero]       \n\t"
      "punpckhbh  %[a],            %[src1],           %[zero]       \n\t"

      "psrlh      %[b],            %[b],              %[three]      \n\t"
      "psrlh      %[g],            %[g],              %[three]      \n\t"
      "psrlh      %[r],            %[r],              %[three]      \n\t"
      "psrlh      %[a],            %[a],              %[seven]      \n\t"

      "psllh      %[g],            %[g],              %[five]       \n\t"
      "psllh      %[r],            %[r],              %[ten]        \n\t"
      "psllh      %[a],            %[a],              %[fifteen]    \n\t"
      "or         %[b],            %[b],              %[g]          \n\t"
      "or         %[b],            %[b],              %[r]          \n\t"
      "or         %[b],            %[b],              %[a]          \n\t"

      "gssdrc1    %[b],            0x00(%[dst_rgb])                 \n\t"
      "gssdlc1    %[b],            0x07(%[dst_rgb])                 \n\t"

      "daddiu     %[src_argb],     %[src_argb],       0x10          \n\t"
      "daddiu     %[dst_rgb],      %[dst_rgb],        0x08          \n\t"
      "daddiu     %[width],        %[width],         -0x04          \n\t"
      "bgtz       %[width],        1b                               \n\t"
      : [src0] "=&f"(src0), [src1] "=&f"(src1), [b] "=&f"(ftmp[0]),
        [g] "=&f"(ftmp[1]), [r] "=&f"(ftmp[2]), [a] "=&f"(ftmp[3])
      : [src_argb] "r"(src_argb), [dst_rgb] "r"(dst_rgb), [width] "r"(width),
        [zero] "f"(0x00), [three] "f"(0x03), [five] "f"(0x05),
        [seven] "f"(0x07), [ten] "f"(0x0a), [fifteen] "f"(0x0f)
      : "memory");
}

void ARGBToARGB4444Row_MMI(const uint8_t* src_argb,
                           uint8_t* dst_rgb,
                           int width) {
  uint64_t src0, src1;
  uint64_t ftmp[4];

  __asm__ volatile(
      "1:                                                           \n\t"
      "gsldrc1    %[src0],         0x00(%[src_argb])                \n\t"
      "gsldlc1    %[src0],         0x07(%[src_argb])                \n\t"
      "gsldrc1    %[src1],         0x08(%[src_argb])                \n\t"
      "gsldlc1    %[src1],         0x0f(%[src_argb])                \n\t"

      "punpcklbh  %[b],            %[src0],           %[src1]       \n\t"
      "punpckhbh  %[g],            %[src0],           %[src1]       \n\t"
      "punpcklbh  %[src0],         %[b],              %[g]          \n\t"
      "punpckhbh  %[src1],         %[b],              %[g]          \n\t"
      "punpcklbh  %[b],            %[src0],           %[zero]       \n\t"
      "punpckhbh  %[g],            %[src0],           %[zero]       \n\t"
      "punpcklbh  %[r],            %[src1],           %[zero]       \n\t"
      "punpckhbh  %[a],            %[src1],           %[zero]       \n\t"

      "psrlh      %[b],            %[b],              %[four]       \n\t"
      "psrlh      %[g],            %[g],              %[four]       \n\t"
      "psrlh      %[r],            %[r],              %[four]       \n\t"
      "psrlh      %[a],            %[a],              %[four]       \n\t"

      "psllh      %[g],            %[g],              %[four]       \n\t"
      "psllh      %[r],            %[r],              %[eight]      \n\t"
      "psllh      %[a],            %[a],              %[twelve]     \n\t"
      "or         %[b],            %[b],              %[g]          \n\t"
      "or         %[b],            %[b],              %[r]          \n\t"
      "or         %[b],            %[b],              %[a]          \n\t"

      "gssdrc1    %[b],            0x00(%[dst_rgb])                 \n\t"
      "gssdlc1    %[b],            0x07(%[dst_rgb])                 \n\t"

      "daddiu     %[src_argb],     %[src_argb],       0x10          \n\t"
      "daddiu     %[dst_rgb],      %[dst_rgb],        0x08          \n\t"
      "daddiu     %[width],        %[width],         -0x04          \n\t"
      "bgtz       %[width],        1b                               \n\t"
      : [src0] "=&f"(src0), [src1] "=&f"(src1), [b] "=&f"(ftmp[0]),
        [g] "=&f"(ftmp[1]), [r] "=&f"(ftmp[2]), [a] "=&f"(ftmp[3])
      : [src_argb] "r"(src_argb), [dst_rgb] "r"(dst_rgb), [width] "r"(width),
        [zero] "f"(0x00), [four] "f"(0x04), [eight] "f"(0x08),
        [twelve] "f"(0x0c)
      : "memory");
}

void ARGBToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) {
  uint64_t src, src_hi, src_lo;
  uint64_t dest0, dest1, dest2, dest3;
  const uint64_t value = 0x1080;
  const uint64_t mask = 0x0001004200810019;

  __asm__ volatile(
      "1:                                                           \n\t"
      "gsldlc1    %[src],          0x07(%[src_argb0])               \n\t"
      "gsldrc1    %[src],          0x00(%[src_argb0])               \n\t"
      "punpcklbh  %[src_lo],       %[src],            %[zero]       \n\t"
      "pinsrh_3   %[src_lo],       %[src_lo],         %[value]      \n\t"
      "pmaddhw    %[src_lo],       %[src_lo],         %[mask]       \n\t"
      "punpckhbh  %[src_hi],       %[src],            %[zero]       \n\t"
      "pinsrh_3   %[src_hi],       %[src_hi],         %[value]      \n\t"
      "pmaddhw    %[src_hi],       %[src_hi],         %[mask]       \n\t"
      "punpcklwd  %[src],          %[src_lo],         %[src_hi]     \n\t"
      "punpckhwd  %[dest0],        %[src_lo],         %[src_hi]     \n\t"
      "paddw      %[dest0],        %[dest0],          %[src]        \n\t"
      "psrlw      %[dest0],        %[dest0],          %[eight]      \n\t"

      "gsldlc1    %[src],          0x0f(%[src_argb0])               \n\t"
      "gsldrc1    %[src],          0x08(%[src_argb0])               \n\t"
      "punpcklbh  %[src_lo],       %[src],            %[zero]       \n\t"
      "pinsrh_3   %[src_lo],       %[src_lo],         %[value]      \n\t"
      "pmaddhw    %[src_lo],       %[src_lo],         %[mask]       \n\t"
      "punpckhbh  %[src_hi],       %[src],            %[zero]       \n\t"
      "pinsrh_3   %[src_hi],       %[src_hi],         %[value]      \n\t"
      "pmaddhw    %[src_hi],       %[src_hi],         %[mask]       \n\t"
      "punpcklwd  %[src],          %[src_lo],         %[src_hi]     \n\t"
      "punpckhwd  %[dest1],        %[src_lo],         %[src_hi]     \n\t"
      "paddw      %[dest1],        %[dest1],          %[src]        \n\t"
      "psrlw      %[dest1],        %[dest1],          %[eight]      \n\t"

      "gsldlc1    %[src],          0x17(%[src_argb0])               \n\t"
      "gsldrc1    %[src],          0x10(%[src_argb0])               \n\t"
      "punpcklbh  %[src_lo],       %[src],            %[zero]       \n\t"
      "pinsrh_3   %[src_lo],       %[src_lo],         %[value]      \n\t"
      "pmaddhw    %[src_lo],       %[src_lo],         %[mask]       \n\t"
      "punpckhbh  %[src_hi],       %[src],            %[zero]       \n\t"
      "pinsrh_3   %[src_hi],       %[src_hi],         %[value]      \n\t"
      "pmaddhw    %[src_hi],       %[src_hi],         %[mask]       \n\t"
      "punpcklwd  %[src],          %[src_lo],         %[src_hi]     \n\t"
      "punpckhwd  %[dest2],        %[src_lo],         %[src_hi]     \n\t"
      "paddw      %[dest2],        %[dest2],          %[src]        \n\t"
      "psrlw      %[dest2],        %[dest2],          %[eight]      \n\t"

      "gsldlc1    %[src],          0x1f(%[src_argb0])               \n\t"
      "gsldrc1    %[src],          0x18(%[src_argb0])               \n\t"
      "punpcklbh  %[src_lo],       %[src],            %[zero]       \n\t"
      "pinsrh_3   %[src_lo],       %[src_lo],         %[value]      \n\t"
      "pmaddhw    %[src_lo],       %[src_lo],         %[mask]       \n\t"
      "punpckhbh  %[src_hi],       %[src],            %[zero]       \n\t"
      "pinsrh_3   %[src_hi],       %[src_hi],         %[value]      \n\t"
      "pmaddhw    %[src_hi],       %[src_hi],         %[mask]       \n\t"
      "punpcklwd  %[src],          %[src_lo],         %[src_hi]     \n\t"
      "punpckhwd  %[dest3],        %[src_lo],         %[src_hi]     \n\t"
      "paddw      %[dest3],        %[dest3],          %[src]        \n\t"
      "psrlw      %[dest3],        %[dest3],          %[eight]      \n\t"

      "packsswh   %[src_lo],       %[dest0],          %[dest1]      \n\t"
      "packsswh   %[src_hi],       %[dest2],          %[dest3]      \n\t"
      "packushb   %[dest0],        %[src_lo],         %[src_hi]     \n\t"
      "gssdlc1    %[dest0],        0x07(%[dst_y])                   \n\t"
      "gssdrc1    %[dest0],        0x00(%[dst_y])                   \n\t"

      "daddiu     %[src_argb0],    %[src_argb0],      0x20          \n\t"
      "daddiu     %[dst_y],        %[dst_y],          0x08          \n\t"
      "daddi      %[width],        %[width],         -0x08          \n\t"
      "bnez       %[width],        1b                               \n\t"
      : [src] "=&f"(src), [src_hi] "=&f"(src_hi), [src_lo] "=&f"(src_lo),
        [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [dest2] "=&f"(dest2),
        [dest3] "=&f"(dest3)
      : [src_argb0] "r"(src_argb0), [dst_y] "r"(dst_y), [width] "r"(width),
        [mask] "f"(mask), [value] "f"(value), [eight] "f"(0x08),
        [zero] "f"(0x00)
      : "memory");
}

void ARGBToUVRow_MMI(const uint8_t* src_rgb0,
                     int src_stride_rgb,
                     uint8_t* dst_u,
                     uint8_t* dst_v,
                     int width) {
  uint64_t src_rgb1;
  uint64_t ftmp[12];
  const uint64_t value = 0x4040;
  const uint64_t mask_u = 0x0026004a00700002;
  const uint64_t mask_v = 0x00020070005e0012;

  __asm__ volatile(
      "1:                                                               \n\t"
      "daddu      %[src_rgb1],     %[src_rgb0],       %[src_stride_rgb] \n\t"
      "gsldrc1    %[src0],         0x00(%[src_rgb0])                    \n\t"
      "gsldlc1    %[src0],         0x07(%[src_rgb0])                    \n\t"
      "gsldrc1    %[src1],         0x00(%[src_rgb1])                    \n\t"
      "gsldlc1    %[src1],         0x07(%[src_rgb1])                    \n\t"
      "punpcklbh  %[src_lo],       %[src0],           %[zero]           \n\t"
      "punpckhbh  %[src_hi],       %[src0],           %[zero]           \n\t"
      "paddh      %[src0],         %[src_lo],         %[src_hi]         \n\t"
      "punpcklbh  %[src_lo],       %[src1],           %[zero]           \n\t"
      "paddh      %[src0],         %[src0],           %[src_lo]         \n\t"
      "punpckhbh  %[src_hi],       %[src1],           %[zero]           \n\t"
      "paddh      %[src0],         %[src0],           %[src_hi]         \n\t"
      "psrlh      %[src0],         %[src0],           %[two]            \n\t"
      "dsll       %[dest0_u],      %[src0],           %[sixteen]        \n\t"
      "pinsrh_0   %[dest0_u],      %[dest0_u],        %[value]          \n\t"
      "pinsrh_3   %[dest0_v],      %[src0],           %[value]          \n\t"
      "pmaddhw    %[dest0_u],      %[dest0_u],        %[mask_u]         \n\t"
      "pmaddhw    %[dest0_v],      %[dest0_v],        %[mask_v]         \n\t"

      "gsldrc1    %[src0],         0x08(%[src_rgb0])                    \n\t"
      "gsldlc1    %[src0],         0x0f(%[src_rgb0])                    \n\t"
      "gsldrc1    %[src1],         0x08(%[src_rgb1])                    \n\t"
      "gsldlc1    %[src1],         0x0f(%[src_rgb1])                    \n\t"
      "punpcklbh  %[src_lo],       %[src0],           %[zero]           \n\t"
      "punpckhbh  %[src_hi],       %[src0],           %[zero]           \n\t"
      "paddh      %[src0],         %[src_lo],         %[src_hi]         \n\t"
      "punpcklbh  %[src_lo],       %[src1],           %[zero]           \n\t"
      "paddh      %[src0],         %[src0],           %[src_lo]         \n\t"
      "punpckhbh  %[src_hi],       %[src1],           %[zero]           \n\t"
      "paddh      %[src0],         %[src0],           %[src_hi]         \n\t"
      "psrlh      %[src0],         %[src0],           %[two]            \n\t"
      "dsll       %[src_lo],       %[src0],           %[sixteen]        \n\t"
      "pinsrh_0   %[src_lo],       %[src_lo],         %[value]          \n\t"
      "pinsrh_3   %[src_hi],       %[src0],           %[value]          \n\t"
      "pmaddhw    %[src_lo],       %[src_lo],         %[mask_u]         \n\t"
      "pmaddhw    %[src_hi],       %[src_hi],         %[mask_v]         \n\t"

      "punpcklwd  %[src0],         %[dest0_u],        %[src_lo]         \n\t"
      "punpckhwd  %[src1],         %[dest0_u],        %[src_lo]         \n\t"
      "psubw      %[dest0_u],      %[src0],           %[src1]           \n\t"
      "psraw      %[dest0_u],      %[dest0_u],        %[eight]          \n\t"
      "punpcklwd  %[src0],         %[dest0_v],        %[src_hi]         \n\t"
      "punpckhwd  %[src1],         %[dest0_v],        %[src_hi]         \n\t"
      "psubw      %[dest0_v],      %[src1],           %[src0]           \n\t"
      "psraw      %[dest0_v],      %[dest0_v],        %[eight]          \n\t"

      "gsldrc1    %[src0],         0x10(%[src_rgb0])                    \n\t"
      "gsldlc1    %[src0],         0x17(%[src_rgb0])                    \n\t"
      "gsldrc1    %[src1],         0x10(%[src_rgb1])                    \n\t"
      "gsldlc1    %[src1],         0x17(%[src_rgb1])                    \n\t"
      "punpcklbh  %[src_lo],       %[src0],           %[zero]           \n\t"
      "punpckhbh  %[src_hi],       %[src0],           %[zero]           \n\t"
      "paddh      %[src0],         %[src_lo],         %[src_hi]         \n\t"
      "punpcklbh  %[src_lo],       %[src1],           %[zero]           \n\t"
      "paddh      %[src0],         %[src0],           %[src_lo]         \n\t"
      "punpckhbh  %[src_hi],       %[src1],           %[zero]           \n\t"
      "paddh      %[src0],         %[src0],           %[src_hi]         \n\t"
      "psrlh      %[src0],         %[src0],           %[two]            \n\t"
      "dsll       %[dest1_u],      %[src0],           %[sixteen]        \n\t"
      "pinsrh_0   %[dest1_u],      %[dest1_u],        %[value]          \n\t"
      "pinsrh_3   %[dest1_v],      %[src0],           %[value]          \n\t"
      "pmaddhw    %[dest1_u],      %[dest1_u],        %[mask_u]         \n\t"
      "pmaddhw    %[dest1_v],      %[dest1_v],        %[mask_v]         \n\t"

      "gsldrc1    %[src0],         0x18(%[src_rgb0])                    \n\t"
      "gsldlc1    %[src0],         0x1f(%[src_rgb0])                    \n\t"
      "gsldrc1    %[src1],         0x18(%[src_rgb1])                    \n\t"
      "gsldlc1    %[src1],         0x1f(%[src_rgb1])                    \n\t"
      "punpcklbh  %[src_lo],       %[src0],           %[zero]           \n\t"
      "punpckhbh  %[src_hi],       %[src0],           %[zero]           \n\t"
      "paddh      %[src0],         %[src_lo],         %[src_hi]         \n\t"
      "punpcklbh  %[src_lo],       %[src1],           %[zero]           \n\t"
      "paddh      %[src0],         %[src0],           %[src_lo]         \n\t"
      "punpckhbh  %[src_hi],       %[src1],           %[zero]           \n\t"
      "paddh      %[src0],         %[src0],           %[src_hi]         \n\t"
      "psrlh      %[src0],         %[src0],           %[two]            \n\t"
      "dsll       %[src_lo],       %[src0],           %[sixteen]        \n\t"
      "pinsrh_0   %[src_lo],       %[src_lo],         %[value]          \n\t"
      "pinsrh_3   %[src_hi],       %[src0],           %[value]          \n\t"
      "pmaddhw    %[src_lo],       %[src_lo],         %[mask_u]         \n\t"
      "pmaddhw    %[src_hi],       %[src_hi],         %[mask_v]         \n\t"

      "punpcklwd  %[src0],         %[dest1_u],        %[src_lo]         \n\t"
      "punpckhwd  %[src1],         %[dest1_u],        %[src_lo]         \n\t"
      "psubw      %[dest1_u],      %[src0],           %[src1]           \n\t"
      "psraw      %[dest1_u],      %[dest1_u],        %[eight]          \n\t"
      "punpcklwd  %[src0],         %[dest1_v],        %[src_hi]         \n\t"
      "punpckhwd  %[src1],         %[dest1_v],        %[src_hi]         \n\t"
      "psubw      %[dest1_v],      %[src1],           %[src0]           \n\t"
      "psraw      %[dest1_v],      %[dest1_v],        %[eight]          \n\t"

      "gsldrc1    %[src0],         0x20(%[src_rgb0])                    \n\t"
      "gsldlc1    %[src0],         0x27(%[src_rgb0])                    \n\t"
      "gsldrc1    %[src1],         0x20(%[src_rgb1])                    \n\t"
      "gsldlc1    %[src1],         0x27(%[src_rgb1])                    \n\t"
      "punpcklbh  %[src_lo],       %[src0],           %[zero]           \n\t"
      "punpckhbh  %[src_hi],       %[src0],           %[zero]           \n\t"
      "paddh      %[src0],         %[src_lo],         %[src_hi]         \n\t"
      "punpcklbh  %[src_lo],       %[src1],           %[zero]           \n\t"
      "paddh      %[src0],         %[src0],           %[src_lo]         \n\t"
      "punpckhbh  %[src_hi],       %[src1],           %[zero]           \n\t"
      "paddh      %[src0],         %[src0],           %[src_hi]         \n\t"
      "psrlh      %[src0],         %[src0],           %[two]            \n\t"
      "dsll       %[dest2_u],      %[src0],           %[sixteen]        \n\t"
      "pinsrh_0   %[dest2_u],      %[dest2_u],        %[value]          \n\t"
      "pinsrh_3   %[dest2_v],      %[src0],           %[value]          \n\t"
      "pmaddhw    %[dest2_u],      %[dest2_u],        %[mask_u]         \n\t"
      "pmaddhw    %[dest2_v],      %[dest2_v],        %[mask_v]         \n\t"

      "gsldrc1    %[src0],         0x28(%[src_rgb0])                    \n\t"
      "gsldlc1    %[src0],         0x2f(%[src_rgb0])                    \n\t"
      "gsldrc1    %[src1],         0x28(%[src_rgb1])                    \n\t"
      "gsldlc1    %[src1],         0x2f(%[src_rgb1])                    \n\t"
      "punpcklbh  %[src_lo],       %[src0],           %[zero]           \n\t"
      "punpckhbh  %[src_hi],       %[src0],           %[zero]           \n\t"
      "paddh      %[src0],         %[src_lo],         %[src_hi]         \n\t"
      "punpcklbh  %[src_lo],       %[src1],           %[zero]           \n\t"
      "paddh      %[src0],         %[src0],           %[src_lo]         \n\t"
      "punpckhbh  %[src_hi],       %[src1],           %[zero]           \n\t"
      "paddh      %[src0],         %[src0],           %[src_hi]         \n\t"
      "psrlh      %[src0],         %[src0],           %[two]            \n\t"
      "dsll       %[src_lo],       %[src0],           %[sixteen]        \n\t"
      "pinsrh_0   %[src_lo],       %[src_lo],         %[value]          \n\t"
      "pinsrh_3   %[src_hi],       %[src0],           %[value]          \n\t"
      "pmaddhw    %[src_lo],       %[src_lo],         %[mask_u]         \n\t"
      "pmaddhw    %[src_hi],       %[src_hi],         %[mask_v]         \n\t"

      "punpcklwd  %[src0],         %[dest2_u],        %[src_lo]         \n\t"
      "punpckhwd  %[src1],         %[dest2_u],        %[src_lo]         \n\t"
      "psubw      %[dest2_u],      %[src0],           %[src1]           \n\t"
      "psraw      %[dest2_u],      %[dest2_u],        %[eight]          \n\t"
      "punpcklwd  %[src0],         %[dest2_v],        %[src_hi]         \n\t"
      "punpckhwd  %[src1],         %[dest2_v],        %[src_hi]         \n\t"
      "psubw      %[dest2_v],      %[src1],           %[src0]           \n\t"
      "psraw      %[dest2_v],      %[dest2_v],        %[eight]          \n\t"

      "gsldrc1    %[src0],         0x30(%[src_rgb0])                    \n\t"
      "gsldlc1    %[src0],         0x37(%[src_rgb0])                    \n\t"
      "gsldrc1    %[src1],         0x30(%[src_rgb1])                    \n\t"
      "gsldlc1    %[src1],         0x37(%[src_rgb1])                    \n\t"
      "punpcklbh  %[src_lo],       %[src0],           %[zero]           \n\t"
      "punpckhbh  %[src_hi],       %[src0],           %[zero]           \n\t"
      "paddh      %[src0],         %[src_lo],         %[src_hi]         \n\t"
      "punpcklbh  %[src_lo],       %[src1],           %[zero]           \n\t"
      "paddh      %[src0],         %[src0],           %[src_lo]         \n\t"
      "punpckhbh  %[src_hi],       %[src1],           %[zero]           \n\t"
      "paddh      %[src0],         %[src0],           %[src_hi]         \n\t"
      "psrlh      %[src0],         %[src0],           %[two]            \n\t"
      "dsll       %[dest3_u],      %[src0],           %[sixteen]        \n\t"
      "pinsrh_0   %[dest3_u],      %[dest3_u],        %[value]          \n\t"
      "pinsrh_3   %[dest3_v],      %[src0],           %[value]          \n\t"
      "pmaddhw    %[dest3_u],      %[dest3_u],        %[mask_u]         \n\t"
      "pmaddhw    %[dest3_v],      %[dest3_v],        %[mask_v]         \n\t"

      "gsldrc1    %[src0],         0x38(%[src_rgb0])                    \n\t"
      "gsldlc1    %[src0],         0x3f(%[src_rgb0])                    \n\t"
      "gsldrc1    %[src1],         0x38(%[src_rgb1])                    \n\t"
      "gsldlc1    %[src1],         0x3f(%[src_rgb1])                    \n\t"
      "punpcklbh  %[src_lo],       %[src0],           %[zero]           \n\t"
      "punpckhbh  %[src_hi],       %[src0],           %[zero]           \n\t"
      "paddh      %[src0],         %[src_lo],         %[src_hi]         \n\t"
      "punpcklbh  %[src_lo],       %[src1],           %[zero]           \n\t"
      "paddh      %[src0],         %[src0],           %[src_lo]         \n\t"
      "punpckhbh  %[src_hi],       %[src1],           %[zero]           \n\t"
      "paddh      %[src0],         %[src0],           %[src_hi]         \n\t"
      "psrlh      %[src0],         %[src0],           %[two]            \n\t"
      "dsll       %[src_lo],       %[src0],           %[sixteen]        \n\t"
      "pinsrh_0   %[src_lo],       %[src_lo],         %[value]          \n\t"
      "pinsrh_3   %[src_hi],       %[src0],           %[value]          \n\t"
      "pmaddhw    %[src_lo],       %[src_lo],         %[mask_u]         \n\t"
      "pmaddhw    %[src_hi],       %[src_hi],         %[mask_v]         \n\t"

      "punpcklwd  %[src0],         %[dest3_u],        %[src_lo]         \n\t"
      "punpckhwd  %[src1],         %[dest3_u],        %[src_lo]         \n\t"
      "psubw      %[dest3_u],      %[src0],           %[src1]           \n\t"
      "psraw      %[dest3_u],      %[dest3_u],        %[eight]          \n\t"
      "punpcklwd  %[src0],         %[dest3_v],        %[src_hi]         \n\t"
      "punpckhwd  %[src1],         %[dest3_v],        %[src_hi]         \n\t"
      "psubw      %[dest3_v],      %[src1],           %[src0]           \n\t"
      "psraw      %[dest3_v],      %[dest3_v],        %[eight]          \n\t"

      "packsswh   %[src0],         %[dest0_u],        %[dest1_u]        \n\t"
      "packsswh   %[src1],         %[dest2_u],        %[dest3_u]        \n\t"
      "packushb   %[dest0_u],      %[src0],           %[src1]           \n\t"
      "gssdlc1    %[dest0_u],      0x07(%[dst_u])                       \n\t"
      "gssdrc1    %[dest0_u],      0x00(%[dst_u])                       \n\t"

      "packsswh   %[src0],         %[dest0_v],        %[dest1_v]        \n\t"
      "packsswh   %[src1],         %[dest2_v],        %[dest3_v]        \n\t"
      "packushb   %[dest0_v],      %[src0],           %[src1]           \n\t"
      "gssdlc1    %[dest0_v],      0x07(%[dst_v])                       \n\t"
      "gssdrc1    %[dest0_v],      0x00(%[dst_v])                       \n\t"

      "daddiu     %[src_rgb0],     %[src_rgb0],       0x40              \n\t"
      "daddiu     %[dst_u],        %[dst_u],          0x08              \n\t"
      "daddiu     %[dst_v],        %[dst_v],          0x08              \n\t"
      "daddi      %[width],        %[width],         -0x10              \n\t"
      "bgtz       %[width],        1b                                   \n\t"
      : [src_rgb1] "=&r"(src_rgb1), [src0] "=&f"(ftmp[0]),
        [src1] "=&f"(ftmp[1]), [src_lo] "=&f"(ftmp[2]), [src_hi] "=&f"(ftmp[3]),
        [dest0_u] "=&f"(ftmp[4]), [dest0_v] "=&f"(ftmp[5]),
        [dest1_u] "=&f"(ftmp[6]), [dest1_v] "=&f"(ftmp[7]),
        [dest2_u] "=&f"(ftmp[8]), [dest2_v] "=&f"(ftmp[9]),
        [dest3_u] "=&f"(ftmp[10]), [dest3_v] "=&f"(ftmp[11])
      : [src_rgb0] "r"(src_rgb0), [src_stride_rgb] "r"(src_stride_rgb),
        [dst_u] "r"(dst_u), [dst_v] "r"(dst_v), [width] "r"(width),
        [mask_u] "f"(mask_u), [mask_v] "f"(mask_v), [value] "f"(value),
        [zero] "f"(0x00), [eight] "f"(0x08), [two] "f"(0x02),
        [sixteen] "f"(0x10)
      : "memory");
}

void BGRAToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) {
  uint64_t src, src_hi, src_lo;
  uint64_t dest0, dest1, dest2, dest3;
  const uint64_t value = 0x1080;
  const uint64_t mask = 0x0019008100420001;

  __asm__ volatile(
      "1:                                                           \n\t"
      "gsldlc1    %[src],          0x07(%[src_argb0])               \n\t"
      "gsldrc1    %[src],          0x00(%[src_argb0])               \n\t"
      "punpcklbh  %[src_lo],       %[src],            %[zero]       \n\t"
      "pinsrh_0   %[src_lo],       %[src_lo],         %[value]      \n\t"
      "pmaddhw    %[src_lo],       %[src_lo],         %[mask]       \n\t"
      "punpckhbh  %[src_hi],       %[src],            %[zero]       \n\t"
      "pinsrh_0   %[src_hi],       %[src_hi],         %[value]      \n\t"
      "pmaddhw    %[src_hi],       %[src_hi],         %[mask]       \n\t"
      "punpcklwd  %[src],          %[src_lo],         %[src_hi]     \n\t"
      "punpckhwd  %[dest0],        %[src_lo],         %[src_hi]     \n\t"
      "paddw      %[dest0],        %[dest0],          %[src]        \n\t"
      "psrlw      %[dest0],        %[dest0],          %[eight]      \n\t"

      "gsldlc1    %[src],          0x0f(%[src_argb0])               \n\t"
      "gsldrc1    %[src],          0x08(%[src_argb0])               \n\t"
      "punpcklbh  %[src_lo],       %[src],            %[zero]       \n\t"
      "pinsrh_0   %[src_lo],       %[src_lo],         %[value]      \n\t"
      "pmaddhw    %[src_lo],       %[src_lo],         %[mask]       \n\t"
      "punpckhbh  %[src_hi],       %[src],            %[zero]       \n\t"
      "pinsrh_0   %[src_hi],       %[src_hi],         %[value]      \n\t"
      "pmaddhw    %[src_hi],       %[src_hi],         %[mask]       \n\t"
      "punpcklwd  %[src],          %[src_lo],         %[src_hi]     \n\t"
      "punpckhwd  %[dest1],        %[src_lo],         %[src_hi]     \n\t"
      "paddw      %[dest1],        %[dest1],          %[src]        \n\t"
      "psrlw      %[dest1],        %[dest1],          %[eight]      \n\t"

      "gsldlc1    %[src],          0x17(%[src_argb0])               \n\t"
      "gsldrc1    %[src],          0x10(%[src_argb0])               \n\t"
      "punpcklbh  %[src_lo],       %[src],            %[zero]       \n\t"
      "pinsrh_0   %[src_lo],       %[src_lo],         %[value]      \n\t"
      "pmaddhw    %[src_lo],       %[src_lo],         %[mask]       \n\t"
      "punpckhbh  %[src_hi],       %[src],            %[zero]       \n\t"
      "pinsrh_0   %[src_hi],       %[src_hi],         %[value]      \n\t"
      "pmaddhw    %[src_hi],       %[src_hi],         %[mask]       \n\t"
      "punpcklwd  %[src],          %[src_lo],         %[src_hi]     \n\t"
      "punpckhwd  %[dest2],        %[src_lo],         %[src_hi]     \n\t"
      "paddw      %[dest2],        %[dest2],          %[src]        \n\t"
      "psrlw      %[dest2],        %[dest2],          %[eight]      \n\t"

      "gsldlc1    %[src],          0x1f(%[src_argb0])               \n\t"
      "gsldrc1    %[src],          0x18(%[src_argb0])               \n\t"
      "punpcklbh  %[src_lo],       %[src],            %[zero]       \n\t"
      "pinsrh_0   %[src_lo],       %[src_lo],         %[value]      \n\t"
      "pmaddhw    %[src_lo],       %[src_lo],         %[mask]       \n\t"
      "punpckhbh  %[src_hi],       %[src],            %[zero]       \n\t"
      "pinsrh_0   %[src_hi],       %[src_hi],         %[value]      \n\t"
      "pmaddhw    %[src_hi],       %[src_hi],         %[mask]       \n\t"
      "punpcklwd  %[src],          %[src_lo],         %[src_hi]     \n\t"
      "punpckhwd  %[dest3],        %[src_lo],         %[src_hi]     \n\t"
      "paddw      %[dest3],        %[dest3],          %[src]        \n\t"
      "psrlw      %[dest3],        %[dest3],          %[eight]      \n\t"

      "packsswh   %[src_lo],       %[dest0],          %[dest1]      \n\t"
      "packsswh   %[src_hi],       %[dest2],          %[dest3]      \n\t"
      "packushb   %[dest0],        %[src_lo],         %[src_hi]     \n\t"
      "gssdlc1    %[dest0],        0x07(%[dst_y])                   \n\t"
      "gssdrc1    %[dest0],        0x00(%[dst_y])                   \n\t"

      "daddiu     %[src_argb0],    %[src_argb0],      0x20          \n\t"
      "daddiu     %[dst_y],        %[dst_y],          0x08          \n\t"
      "daddi      %[width],        %[width],         -0x08          \n\t"
      "bnez       %[width],        1b                               \n\t"
      : [src] "=&f"(src), [src_hi] "=&f"(src_hi), [src_lo] "=&f"(src_lo),
        [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [dest2] "=&f"(dest2),
        [dest3] "=&f"(dest3)
      : [src_argb0] "r"(src_argb0), [dst_y] "r"(dst_y), [width] "r"(width),
        [mask] "f"(mask), [value] "f"(value), [eight] "f"(0x08),
        [zero] "f"(0x00)
      : "memory");
}

void BGRAToUVRow_MMI(const uint8_t* src_rgb0,
                     int src_stride_rgb,
                     uint8_t* dst_u,
                     uint8_t* dst_v,
                     int width) {
  uint64_t src_rgb1;
  uint64_t ftmp[12];
  const uint64_t value = 0x4040;
  const uint64_t mask_u = 0x00020070004a0026;
  const uint64_t mask_v = 0x0012005e00700002;

  __asm__ volatile(
      "1:                                                               \n\t"
      "daddu      %[src_rgb1],     %[src_rgb0],       %[src_stride_rgb] \n\t"
      "gsldrc1    %[src0],         0x00(%[src_rgb0])                    \n\t"
      "gsldlc1    %[src0],         0x07(%[src_rgb0])                    \n\t"
      "gsldrc1    %[src1],         0x00(%[src_rgb1])                    \n\t"
      "gsldlc1    %[src1],         0x07(%[src_rgb1])                    \n\t"
      "punpcklbh  %[src_lo],       %[src0],           %[zero]           \n\t"
      "punpckhbh  %[src_hi],       %[src0],           %[zero]           \n\t"
      "paddh      %[src0],         %[src_lo],         %[src_hi]         \n\t"
      "punpcklbh  %[src_lo],       %[src1],           %[zero]           \n\t"
      "paddh      %[src0],         %[src0],           %[src_lo]         \n\t"
      "punpckhbh  %[src_hi],       %[src1],           %[zero]           \n\t"
      "paddh      %[src0],         %[src0],           %[src_hi]         \n\t"
      "psrlh      %[src0],         %[src0],           %[two]            \n\t"
      "dsrl       %[dest0_u],      %[src0],           %[sixteen]        \n\t"
      "pinsrh_3   %[dest0_u],      %[dest0_u],        %[value]          \n\t"
      "pinsrh_0   %[dest0_v],      %[src0],           %[value]          \n\t"
      "pmaddhw    %[dest0_u],      %[dest0_u],        %[mask_u]         \n\t"
      "pmaddhw    %[dest0_v],      %[dest0_v],        %[mask_v]         \n\t"

      "gsldrc1    %[src0],         0x08(%[src_rgb0])                    \n\t"
      "gsldlc1    %[src0],         0x0f(%[src_rgb0])                    \n\t"
      "gsldrc1    %[src1],         0x08(%[src_rgb1])                    \n\t"
      "gsldlc1    %[src1],         0x0f(%[src_rgb1])                    \n\t"
      "punpcklbh  %[src_lo],       %[src0],           %[zero]           \n\t"
      "punpckhbh  %[src_hi],       %[src0],           %[zero]           \n\t"
      "paddh      %[src0],         %[src_lo],         %[src_hi]         \n\t"
      "punpcklbh  %[src_lo],       %[src1],           %[zero]           \n\t"
      "paddh      %[src0],         %[src0],           %[src_lo]         \n\t"
      "punpckhbh  %[src_hi],       %[src1],           %[zero]           \n\t"
      "paddh      %[src0],         %[src0],           %[src_hi]         \n\t"
      "psrlh      %[src0],         %[src0],           %[two]            \n\t"
      "dsrl       %[src_lo],       %[src0],           %[sixteen]        \n\t"
      "pinsrh_3   %[src_lo],       %[src_lo],         %[value]          \n\t"
      "pinsrh_0   %[src_hi],       %[src0],           %[value]          \n\t"
      "pmaddhw    %[src_lo],       %[src_lo],         %[mask_u]         \n\t"
      "pmaddhw    %[src_hi],       %[src_hi],         %[mask_v]         \n\t"

      "punpcklwd  %[src0],         %[dest0_u],        %[src_lo]         \n\t"
      "punpckhwd  %[src1],         %[dest0_u],        %[src_lo]         \n\t"
      "psubw      %[dest0_u],      %[src1],           %[src0]           \n\t"
      "psraw      %[dest0_u],      %[dest0_u],        %[eight]          \n\t"
      "punpcklwd  %[src0],         %[dest0_v],        %[src_hi]         \n\t"
      "punpckhwd  %[src1],         %[dest0_v],        %[src_hi]         \n\t"
      "psubw      %[dest0_v],      %[src0],           %[src1]           \n\t"
      "psraw      %[dest0_v],      %[dest0_v],        %[eight]          \n\t"

      "gsldrc1    %[src0],         0x10(%[src_rgb0])                    \n\t"
      "gsldlc1    %[src0],         0x17(%[src_rgb0])                    \n\t"
      "gsldrc1    %[src1],         0x10(%[src_rgb1])                    \n\t"
      "gsldlc1    %[src1],         0x17(%[src_rgb1])                    \n\t"
      "punpcklbh  %[src_lo],       %[src0],           %[zero]           \n\t"
      "punpckhbh  %[src_hi],       %[src0],           %[zero]           \n\t"
      "paddh      %[src0],         %[src_lo],         %[src_hi]         \n\t"
      "punpcklbh  %[src_lo],       %[src1],           %[zero]           \n\t"
      "paddh      %[src0],         %[src0],           %[src_lo]         \n\t"
      "punpckhbh  %[src_hi],       %[src1],           %[zero]           \n\t"
      "paddh      %[src0],         %[src0],           %[src_hi]         \n\t"
      "psrlh      %[src0],         %[src0],           %[two]            \n\t"
      "dsrl       %[dest1_u],      %[src0],           %[sixteen]        \n\t"
      "pinsrh_3   %[dest1_u],      %[dest1_u],        %[value]          \n\t"
      "pinsrh_0   %[dest1_v],      %[src0],           %[value]          \n\t"
      "pmaddhw    %[dest1_u],      %[dest1_u],        %[mask_u]         \n\t"
      "pmaddhw    %[dest1_v],      %[dest1_v],        %[mask_v]         \n\t"

      "gsldrc1    %[src0],         0x18(%[src_rgb0])                    \n\t"
      "gsldlc1    %[src0],         0x1f(%[src_rgb0])                    \n\t"
      "gsldrc1    %[src1],         0x18(%[src_rgb1])                    \n\t"
      "gsldlc1    %[src1],         0x1f(%[src_rgb1])                    \n\t"
      "punpcklbh  %[src_lo],       %[src0],           %[zero]           \n\t"
      "punpckhbh  %[src_hi],       %[src0],           %[zero]           \n\t"
      "paddh      %[src0],         %[src_lo],         %[src_hi]         \n\t"
      "punpcklbh  %[src_lo],       %[src1],           %[zero]           \n\t"
      "paddh      %[src0],         %[src0],           %[src_lo]         \n\t"
      "punpckhbh  %[src_hi],       %[src1],           %[zero]           \n\t"
      "paddh      %[src0],         %[src0],           %[src_hi]         \n\t"
      "psrlh      %[src0],         %[src0],           %[two]            \n\t"
      "dsrl       %[src_lo],       %[src0],           %[sixteen]        \n\t"
      "pinsrh_3   %[src_lo],       %[src_lo],         %[value]          \n\t"
      "pinsrh_0   %[src_hi],       %[src0],           %[value]          \n\t"
      "pmaddhw    %[src_lo],       %[src_lo],         %[mask_u]         \n\t"
      "pmaddhw    %[src_hi],       %[src_hi],         %[mask_v]         \n\t"

      "punpcklwd  %[src0],         %[dest1_u],        %[src_lo]         \n\t"
      "punpckhwd  %[src1],         %[dest1_u],        %[src_lo]         \n\t"
      "psubw      %[dest1_u],      %[src1],           %[src0]           \n\t"
      "psraw      %[dest1_u],      %[dest1_u],        %[eight]          \n\t"
      "punpcklwd  %[src0],         %[dest1_v],        %[src_hi]         \n\t"
      "punpckhwd  %[src1],         %[dest1_v],        %[src_hi]         \n\t"
      "psubw      %[dest1_v],      %[src0],           %[src1]           \n\t"
      "psraw      %[dest1_v],      %[dest1_v],        %[eight]          \n\t"

      "gsldrc1    %[src0],         0x20(%[src_rgb0])                    \n\t"
      "gsldlc1    %[src0],         0x27(%[src_rgb0])                    \n\t"
      "gsldrc1    %[src1],         0x20(%[src_rgb1])                    \n\t"
      "gsldlc1    %[src1],         0x27(%[src_rgb1])                    \n\t"
      "punpcklbh  %[src_lo],       %[src0],           %[zero]           \n\t"
      "punpckhbh  %[src_hi],       %[src0],           %[zero]           \n\t"
      "paddh      %[src0],         %[src_lo],         %[src_hi]         \n\t"
      "punpcklbh  %[src_lo],       %[src1],           %[zero]           \n\t"
      "paddh      %[src0],         %[src0],           %[src_lo]         \n\t"
      "punpckhbh  %[src_hi],       %[src1],           %[zero]           \n\t"
      "paddh      %[src0],         %[src0],           %[src_hi]         \n\t"
      "psrlh      %[src0],         %[src0],           %[two]            \n\t"
      "dsrl       %[dest2_u],      %[src0],           %[sixteen]        \n\t"
      "pinsrh_3   %[dest2_u],      %[dest2_u],        %[value]          \n\t"
      "pinsrh_0   %[dest2_v],      %[src0],           %[value]          \n\t"
      "pmaddhw    %[dest2_u],      %[dest2_u],        %[mask_u]         \n\t"
      "pmaddhw    %[dest2_v],      %[dest2_v],        %[mask_v]         \n\t"

      "gsldrc1    %[src0],         0x28(%[src_rgb0])                    \n\t"
      "gsldlc1    %[src0],         0x2f(%[src_rgb0])                    \n\t"
      "gsldrc1    %[src1],         0x28(%[src_rgb1])                    \n\t"
      "gsldlc1    %[src1],         0x2f(%[src_rgb1])                    \n\t"
      "punpcklbh  %[src_lo],       %[src0],           %[zero]           \n\t"
      "punpckhbh  %[src_hi],       %[src0],           %[zero]           \n\t"
      "paddh      %[src0],         %[src_lo],         %[src_hi]         \n\t"
      "punpcklbh  %[src_lo],       %[src1],           %[zero]           \n\t"
      "paddh      %[src0],         %[src0],           %[src_lo]        \n\t"
      "punpckhbh  %[src_hi],       %[src1],           %[zero]           \n\t"
      "paddh      %[src0],         %[src0],           %[src_hi]         \n\t"
      "psrlh      %[src0],         %[src0],           %[two]            \n\t"
      "dsrl       %[src_lo],       %[src0],           %[sixteen]        \n\t"
      "pinsrh_3   %[src_lo],       %[src_lo],         %[value]          \n\t"
      "pinsrh_0   %[src_hi],       %[src0],           %[value]          \n\t"
      "pmaddhw    %[src_lo],       %[src_lo],         %[mask_u]         \n\t"
      "pmaddhw    %[src_hi],       %[src_hi],         %[mask_v]         \n\t"

      "punpcklwd  %[src0],         %[dest2_u],        %[src_lo]         \n\t"
      "punpckhwd  %[src1],         %[dest2_u],        %[src_lo]         \n\t"
      "psubw      %[dest2_u],      %[src1],           %[src0]           \n\t"
      "psraw      %[dest2_u],      %[dest2_u],        %[eight]          \n\t"
      "punpcklwd  %[src0],         %[dest2_v],        %[src_hi]         \n\t"
      "punpckhwd  %[src1],         %[dest2_v],        %[src_hi]         \n\t"
      "psubw      %[dest2_v],      %[src0],           %[src1]           \n\t"
      "psraw      %[dest2_v],      %[dest2_v],        %[eight]          \n\t"

      "gsldrc1    %[src0],         0x30(%[src_rgb0])                    \n\t"
      "gsldlc1    %[src0],         0x37(%[src_rgb0])                    \n\t"
      "gsldrc1    %[src1],         0x30(%[src_rgb1])                    \n\t"
      "gsldlc1    %[src1],         0x37(%[src_rgb1])                    \n\t"
      "punpcklbh  %[src_lo],       %[src0],           %[zero]           \n\t"
      "punpckhbh  %[src_hi],       %[src0],           %[zero]           \n\t"
      "paddh      %[src0],         %[src_lo],         %[src_hi]         \n\t"
      "punpcklbh  %[src_lo],       %[src1],           %[zero]           \n\t"
      "paddh      %[src0],         %[src0],           %[src_lo]         \n\t"
      "punpckhbh  %[src_hi],       %[src1],           %[zero]           \n\t"
      "paddh      %[src0],         %[src0],           %[src_hi]         \n\t"
      "psrlh      %[src0],         %[src0],           %[two]            \n\t"
      "dsrl       %[dest3_u],      %[src0],           %[sixteen]        \n\t"
      "pinsrh_3   %[dest3_u],      %[dest3_u],        %[value]          \n\t"
      "pinsrh_0   %[dest3_v],      %[src0],           %[value]          \n\t"
      "pmaddhw    %[dest3_u],      %[dest3_u],        %[mask_u]         \n\t"
      "pmaddhw    %[dest3_v],      %[dest3_v],        %[mask_v]         \n\t"

      "gsldrc1    %[src0],         0x38(%[src_rgb0])                    \n\t"
      "gsldlc1    %[src0],         0x3f(%[src_rgb0])                    \n\t"
      "gsldrc1    %[src1],         0x38(%[src_rgb1])                    \n\t"
      "gsldlc1    %[src1],         0x3f(%[src_rgb1])                    \n\t"
      "punpcklbh  %[src_lo],       %[src0],           %[zero]           \n\t"
      "punpckhbh  %[src_hi],       %[src0],           %[zero]           \n\t"
      "paddh      %[src0],         %[src_lo],         %[src_hi]         \n\t"
      "punpcklbh  %[src_lo],       %[src1],           %[zero]           \n\t"
      "paddh      %[src0],         %[src0],           %[src_lo]         \n\t"
      "punpckhbh  %[src_hi],       %[src1],           %[zero]           \n\t"
      "paddh      %[src0],         %[src0],           %[src_hi]         \n\t"
      "psrlh      %[src0],         %[src0],           %[two]            \n\t"
      "dsrl       %[src_lo],       %[src0],           %[sixteen]        \n\t"
      "pinsrh_3   %[src_lo],       %[src_lo],         %[value]          \n\t"
      "pinsrh_0   %[src_hi],       %[src0],           %[value]          \n\t"
      "pmaddhw    %[src_lo],       %[src_lo],         %[mask_u]         \n\t"
      "pmaddhw    %[src_hi],       %[src_hi],         %[mask_v]         \n\t"

      "punpcklwd  %[src0],         %[dest3_u],        %[src_lo]         \n\t"
      "punpckhwd  %[src1],         %[dest3_u],        %[src_lo]         \n\t"
      "psubw      %[dest3_u],      %[src1],           %[src0]           \n\t"
      "psraw      %[dest3_u],      %[dest3_u],        %[eight]          \n\t"
      "punpcklwd  %[src0],         %[dest3_v],        %[src_hi]         \n\t"
      "punpckhwd  %[src1],         %[dest3_v],        %[src_hi]         \n\t"
      "psubw      %[dest3_v],      %[src0],           %[src1]           \n\t"
      "psraw      %[dest3_v],      %[dest3_v],        %[eight]          \n\t"

      "packsswh   %[src0],         %[dest0_u],        %[dest1_u]        \n\t"
      "packsswh   %[src1],         %[dest2_u],        %[dest3_u]        \n\t"
      "packushb   %[dest0_u],      %[src0],           %[src1]           \n\t"
      "gssdlc1    %[dest0_u],      0x07(%[dst_u])                       \n\t"
      "gssdrc1    %[dest0_u],      0x00(%[dst_u])                       \n\t"

      "packsswh   %[src0],         %[dest0_v],        %[dest1_v]        \n\t"
      "packsswh   %[src1],         %[dest2_v],        %[dest3_v]        \n\t"
      "packushb   %[dest0_v],      %[src0],           %[src1]           \n\t"
      "gssdlc1    %[dest0_v],      0x07(%[dst_v])                       \n\t"
      "gssdrc1    %[dest0_v],      0x00(%[dst_v])                       \n\t"

      "daddiu     %[src_rgb0],     %[src_rgb0],       0x40              \n\t"
      "daddiu     %[dst_u],        %[dst_u],          0x08              \n\t"
      "daddiu     %[dst_v],        %[dst_v],          0x08              \n\t"
      "daddi      %[width],        %[width],         -0x10              \n\t"
      "bgtz       %[width],        1b                                   \n\t"
      : [src_rgb1] "=&r"(src_rgb1), [src0] "=&f"(ftmp[0]),
        [src1] "=&f"(ftmp[1]), [src_lo] "=&f"(ftmp[2]), [src_hi] "=&f"(ftmp[3]),
        [dest0_u] "=&f"(ftmp[4]), [dest0_v] "=&f"(ftmp[5]),
        [dest1_u] "=&f"(ftmp[6]), [dest1_v] "=&f"(ftmp[7]),
        [dest2_u] "=&f"(ftmp[8]), [dest2_v] "=&f"(ftmp[9]),
        [dest3_u] "=&f"(ftmp[10]), [dest3_v] "=&f"(ftmp[11])
      : [src_rgb0] "r"(src_rgb0), [src_stride_rgb] "r"(src_stride_rgb),
        [dst_u] "r"(dst_u), [dst_v] "r"(dst_v), [width] "r"(width),
        [mask_u] "f"(mask_u), [mask_v] "f"(mask_v), [value] "f"(value),
        [zero] "f"(0x00), [eight] "f"(0x08), [two] "f"(0x02),
        [sixteen] "f"(0x10)
      : "memory");
}

void ABGRToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) {
  uint64_t src, src_hi, src_lo;
  uint64_t dest0, dest1, dest2, dest3;
  const uint64_t value = 0x1080;
  const uint64_t mask = 0x0001001900810042;

  __asm__ volatile(
      "1:                                                           \n\t"
      "gsldlc1    %[src],          0x07(%[src_argb0])               \n\t"
      "gsldrc1    %[src],          0x00(%[src_argb0])               \n\t"
      "punpcklbh  %[src_lo],       %[src],            %[zero]       \n\t"
      "pinsrh_3   %[src_lo],       %[src_lo],         %[value]      \n\t"
      "pmaddhw    %[src_lo],       %[src_lo],         %[mask]       \n\t"
      "punpckhbh  %[src_hi],       %[src],            %[zero]       \n\t"
      "pinsrh_3   %[src_hi],       %[src_hi],         %[value]      \n\t"
      "pmaddhw    %[src_hi],       %[src_hi],         %[mask]       \n\t"
      "punpcklwd  %[src],          %[src_lo],         %[src_hi]     \n\t"
      "punpckhwd  %[dest0],        %[src_lo],         %[src_hi]     \n\t"
      "paddw      %[dest0],        %[dest0],          %[src]        \n\t"
      "psrlw      %[dest0],        %[dest0],          %[eight]      \n\t"

      "gsldlc1    %[src],          0x0f(%[src_argb0])               \n\t"
      "gsldrc1    %[src],          0x08(%[src_argb0])               \n\t"
      "punpcklbh  %[src_lo],       %[src],            %[zero]       \n\t"
      "pinsrh_3   %[src_lo],       %[src_lo],         %[value]      \n\t"
      "pmaddhw    %[src_lo],       %[src_lo],         %[mask]       \n\t"
      "punpckhbh  %[src_hi],       %[src],            %[zero]       \n\t"
      "pinsrh_3   %[src_hi],       %[src_hi],         %[value]      \n\t"
      "pmaddhw    %[src_hi],       %[src_hi],         %[mask]       \n\t"
      "punpcklwd  %[src],          %[src_lo],         %[src_hi]     \n\t"
      "punpckhwd  %[dest1],        %[src_lo],         %[src_hi]     \n\t"
      "paddw      %[dest1],        %[dest1],          %[src]        \n\t"
      "psrlw      %[dest1],        %[dest1],          %[eight]      \n\t"

      "gsldlc1    %[src],          0x17(%[src_argb0])               \n\t"
      "gsldrc1    %[src],          0x10(%[src_argb0])               \n\t"
      "punpcklbh  %[src_lo],       %[src],            %[zero]       \n\t"
      "pinsrh_3   %[src_lo],       %[src_lo],         %[value]      \n\t"
      "pmaddhw    %[src_lo],       %[src_lo],         %[mask]       \n\t"
      "punpckhbh  %[src_hi],       %[src],            %[zero]       \n\t"
      "pinsrh_3   %[src_hi],       %[src_hi],         %[value]      \n\t"
      "pmaddhw    %[src_hi],       %[src_hi],         %[mask]       \n\t"
      "punpcklwd  %[src],          %[src_lo],         %[src_hi]     \n\t"
      "punpckhwd  %[dest2],        %[src_lo],         %[src_hi]     \n\t"
      "paddw      %[dest2],        %[dest2],          %[src]        \n\t"
      "psrlw      %[dest2],        %[dest2],          %[eight]      \n\t"

      "gsldlc1    %[src],          0x1f(%[src_argb0])               \n\t"
      "gsldrc1    %[src],          0x18(%[src_argb0])               \n\t"
      "punpcklbh  %[src_lo],       %[src],            %[zero]       \n\t"
      "pinsrh_3   %[src_lo],       %[src_lo],         %[value]      \n\t"
      "pmaddhw    %[src_lo],       %[src_lo],         %[mask]       \n\t"
      "punpckhbh  %[src_hi],       %[src],            %[zero]       \n\t"
      "pinsrh_3   %[src_hi],       %[src_hi],         %[value]      \n\t"
      "pmaddhw    %[src_hi],       %[src_hi],         %[mask]       \n\t"
      "punpcklwd  %[src],          %[src_lo],         %[src_hi]     \n\t"
      "punpckhwd  %[dest3],        %[src_lo],         %[src_hi]     \n\t"
      "paddw      %[dest3],        %[dest3],          %[src]        \n\t"
      "psrlw      %[dest3],        %[dest3],          %[eight]      \n\t"

      "packsswh   %[src_lo],       %[dest0],          %[dest1]      \n\t"
      "packsswh   %[src_hi],       %[dest2],          %[dest3]      \n\t"
      "packushb   %[dest0],        %[src_lo],         %[src_hi]     \n\t"
      "gssdlc1    %[dest0],        0x07(%[dst_y])                   \n\t"
      "gssdrc1    %[dest0],        0x00(%[dst_y])                   \n\t"

      "daddiu     %[src_argb0],    %[src_argb0],      0x20          \n\t"
      "daddiu     %[dst_y],        %[dst_y],          0x08          \n\t"
      "daddi      %[width],        %[width],         -0x08          \n\t"
      "bnez       %[width],        1b                               \n\t"
      : [src] "=&f"(src), [src_hi] "=&f"(src_hi), [src_lo] "=&f"(src_lo),
        [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [dest2] "=&f"(dest2),
        [dest3] "=&f"(dest3)
      : [src_argb0] "r"(src_argb0), [dst_y] "r"(dst_y), [width] "r"(width),
        [mask] "f"(mask), [value] "f"(value), [eight] "f"(0x08),
        [zero] "f"(0x00)
      : "memory");
}

void ABGRToUVRow_MMI(const uint8_t* src_rgb0,
                     int src_stride_rgb,
                     uint8_t* dst_u,
                     uint8_t* dst_v,
                     int width) {
  uint64_t src_rgb1;
  uint64_t ftmp[12];
  const uint64_t value = 0x4040;
  const uint64_t mask_u = 0x00020070004a0026;
  const uint64_t mask_v = 0x0012005e00700002;

  __asm__ volatile(
      "1:                                                               \n\t"
      "daddu      %[src_rgb1],     %[src_rgb0],       %[src_stride_rgb] \n\t"
      "gsldrc1    %[src0],         0x00(%[src_rgb0])                    \n\t"
      "gsldlc1    %[src0],         0x07(%[src_rgb0])                    \n\t"
      "gsldrc1    %[src1],         0x00(%[src_rgb1])                    \n\t"
      "gsldlc1    %[src1],         0x07(%[src_rgb1])                    \n\t"
      "punpcklbh  %[src_lo],       %[src0],           %[zero]           \n\t"
      "punpckhbh  %[src_hi],       %[src0],           %[zero]           \n\t"
      "paddh      %[src0],         %[src_lo],         %[src_hi]         \n\t"
      "punpcklbh  %[src_lo],       %[src1],           %[zero]           \n\t"
      "paddh      %[src0],         %[src0],           %[src_lo]         \n\t"
      "punpckhbh  %[src_hi],       %[src1],           %[zero]           \n\t"
      "paddh      %[src0],         %[src0],           %[src_hi]         \n\t"
      "psrlh      %[src0],         %[src0],           %[two]            \n\t"
      "pinsrh_3   %[dest0_u],      %[src0],           %[value]          \n\t"
      "dsll       %[dest0_v],      %[src0],           %[sixteen]        \n\t"
      "pinsrh_0   %[dest0_v],      %[dest0_v],        %[value]          \n\t"
      "pmaddhw    %[dest0_u],      %[dest0_u],        %[mask_u]         \n\t"
      "pmaddhw    %[dest0_v],      %[dest0_v],        %[mask_v]         \n\t"

      "gsldrc1    %[src0],         0x08(%[src_rgb0])                    \n\t"
      "gsldlc1    %[src0],         0x0f(%[src_rgb0])                    \n\t"
      "gsldrc1    %[src1],         0x08(%[src_rgb1])                    \n\t"
      "gsldlc1    %[src1],         0x0f(%[src_rgb1])                    \n\t"
      "punpcklbh  %[src_lo],       %[src0],           %[zero]           \n\t"
      "punpckhbh  %[src_hi],       %[src0],           %[zero]           \n\t"
      "paddh      %[src0],         %[src_lo],         %[src_hi]         \n\t"
      "punpcklbh  %[src_lo],       %[src1],           %[zero]           \n\t"
      "paddh      %[src0],         %[src0],           %[src_lo]         \n\t"
      "punpckhbh  %[src_hi],       %[src1],           %[zero]           \n\t"
      "paddh      %[src0],         %[src0],           %[src_hi]         \n\t"
      "psrlh      %[src0],         %[src0],           %[two]            \n\t"
      "pinsrh_3   %[src_lo],       %[src0],           %[value]          \n\t"
      "dsll       %[src_hi],       %[src0],           %[sixteen]        \n\t"
      "pinsrh_0   %[src_hi],       %[src_hi],         %[value]          \n\t"
      "pmaddhw    %[src_lo],       %[src_lo],         %[mask_u]         \n\t"
      "pmaddhw    %[src_hi],       %[src_hi],         %[mask_v]         \n\t"

      "punpcklwd  %[src0],         %[dest0_u],        %[src_lo]         \n\t"
      "punpckhwd  %[src1],         %[dest0_u],        %[src_lo]         \n\t"
      "psubw      %[dest0_u],      %[src1],           %[src0]           \n\t"
      "psraw      %[dest0_u],      %[dest0_u],        %[eight]          \n\t"
      "punpcklwd  %[src0],         %[dest0_v],        %[src_hi]         \n\t"
      "punpckhwd  %[src1],         %[dest0_v],        %[src_hi]         \n\t"
      "psubw      %[dest0_v],      %[src0],           %[src1]           \n\t"
      "psraw      %[dest0_v],      %[dest0_v],        %[eight]          \n\t"

      "gsldrc1    %[src0],         0x10(%[src_rgb0])                    \n\t"
      "gsldlc1    %[src0],         0x17(%[src_rgb0])                    \n\t"
      "gsldrc1    %[src1],         0x10(%[src_rgb1])                    \n\t"
      "gsldlc1    %[src1],         0x17(%[src_rgb1])                    \n\t"
      "punpcklbh  %[src_lo],       %[src0],           %[zero]           \n\t"
      "punpckhbh  %[src_hi],       %[src0],           %[zero]           \n\t"
      "paddh      %[src0],         %[src_lo],         %[src_hi]         \n\t"
      "punpcklbh  %[src_lo],       %[src1],           %[zero]           \n\t"
      "paddh      %[src0],         %[src0],           %[src_lo]         \n\t"
      "punpckhbh  %[src_hi],       %[src1],           %[zero]           \n\t"
      "paddh      %[src0],         %[src0],           %[src_hi]         \n\t"
      "psrlh      %[src0],         %[src0],           %[two]            \n\t"
      "pinsrh_3   %[dest1_u],      %[src0],           %[value]          \n\t"
      "dsll       %[dest1_v],      %[src0],           %[sixteen]        \n\t"
      "pinsrh_0   %[dest1_v],      %[dest1_v],        %[value]          \n\t"
      "pmaddhw    %[dest1_u],      %[dest1_u],        %[mask_u]         \n\t"
      "pmaddhw    %[dest1_v],      %[dest1_v],        %[mask_v]         \n\t"

      "gsldrc1    %[src0],         0x18(%[src_rgb0])                    \n\t"
      "gsldlc1    %[src0],         0x1f(%[src_rgb0])                    \n\t"
      "gsldrc1    %[src1],         0x18(%[src_rgb1])                    \n\t"
      "gsldlc1    %[src1],         0x1f(%[src_rgb1])                    \n\t"
      "punpcklbh  %[src_lo],       %[src0],           %[zero]           \n\t"
      "punpckhbh  %[src_hi],       %[src0],           %[zero]           \n\t"
      "paddh      %[src0],         %[src_lo],         %[src_hi]         \n\t"
      "punpcklbh  %[src_lo],       %[src1],           %[zero]           \n\t"
      "paddh      %[src0],         %[src0],           %[src_lo]         \n\t"
      "punpckhbh  %[src_hi],       %[src1],           %[zero]           \n\t"
      "paddh      %[src0],         %[src0],           %[src_hi]         \n\t"
      "psrlh      %[src0],         %[src0],           %[two]            \n\t"
      "pinsrh_3   %[src_lo],       %[src0],           %[value]          \n\t"
      "dsll       %[src_hi],       %[src0],           %[sixteen]        \n\t"
      "pinsrh_0   %[src_hi],       %[src_hi],         %[value]          \n\t"
      "pmaddhw    %[src_lo],       %[src_lo],         %[mask_u]         \n\t"
      "pmaddhw    %[src_hi],       %[src_hi],         %[mask_v]         \n\t"

      "punpcklwd  %[src0],         %[dest1_u],        %[src_lo]         \n\t"
      "punpckhwd  %[src1],         %[dest1_u],        %[src_lo]         \n\t"
      "psubw      %[dest1_u],      %[src1],           %[src0]           \n\t"
      "psraw      %[dest1_u],      %[dest1_u],        %[eight]          \n\t"
      "punpcklwd  %[src0],         %[dest1_v],        %[src_hi]         \n\t"
      "punpckhwd  %[src1],         %[dest1_v],        %[src_hi]         \n\t"
      "psubw      %[dest1_v],      %[src0],           %[src1]           \n\t"
      "psraw      %[dest1_v],      %[dest1_v],        %[eight]          \n\t"

      "gsldrc1    %[src0],         0x20(%[src_rgb0])                    \n\t"
      "gsldlc1    %[src0],         0x27(%[src_rgb0])                    \n\t"
      "gsldrc1    %[src1],         0x20(%[src_rgb1])                    \n\t"
      "gsldlc1    %[src1],         0x27(%[src_rgb1])                    \n\t"
      "punpcklbh  %[src_lo],       %[src0],           %[zero]           \n\t"
      "punpckhbh  %[src_hi],       %[src0],           %[zero]           \n\t"
      "paddh      %[src0],         %[src_lo],         %[src_hi]         \n\t"
      "punpcklbh  %[src_lo],       %[src1],           %[zero]           \n\t"
      "paddh      %[src0],         %[src0],           %[src_lo]         \n\t"
      "punpckhbh  %[src_hi],       %[src1],           %[zero]           \n\t"
      "paddh      %[src0],         %[src0],           %[src_hi]         \n\t"
      "psrlh      %[src0],         %[src0],           %[two]            \n\t"
      "pinsrh_3   %[dest2_u],      %[src0],           %[value]          \n\t"
      "dsll       %[dest2_v],      %[src0],           %[sixteen]        \n\t"
      "pinsrh_0   %[dest2_v],      %[dest2_v],        %[value]          \n\t"
      "pmaddhw    %[dest2_u],      %[dest2_u],        %[mask_u]         \n\t"
      "pmaddhw    %[dest2_v],      %[dest2_v],        %[mask_v]         \n\t"

      "gsldrc1    %[src0],         0x28(%[src_rgb0])                    \n\t"
      "gsldlc1    %[src0],         0x2f(%[src_rgb0])                    \n\t"
      "gsldrc1    %[src1],         0x28(%[src_rgb1])                    \n\t"
      "gsldlc1    %[src1],         0x2f(%[src_rgb1])                    \n\t"
      "punpcklbh  %[src_lo],       %[src0],           %[zero]           \n\t"
      "punpckhbh  %[src_hi],       %[src0],           %[zero]           \n\t"
      "paddh      %[src0],         %[src_lo],         %[src_hi]         \n\t"
      "punpcklbh  %[src_lo],       %[src1],           %[zero]           \n\t"
      "paddh      %[src0],         %[src0],           %[src_lo]         \n\t"
      "punpckhbh  %[src_hi],       %[src1],           %[zero]           \n\t"
      "paddh      %[src0],         %[src0],           %[src_hi]         \n\t"
      "psrlh      %[src0],         %[src0],           %[two]            \n\t"
      "pinsrh_3   %[src_lo],       %[src0],           %[value]          \n\t"
      "dsll       %[src_hi],       %[src0],           %[sixteen]        \n\t"
      "pinsrh_0   %[src_hi],       %[src_hi],         %[value]          \n\t"
      "pmaddhw    %[src_lo],       %[src_lo],         %[mask_u]         \n\t"
      "pmaddhw    %[src_hi],       %[src_hi],         %[mask_v]         \n\t"

      "punpcklwd  %[src0],         %[dest2_u],        %[src_lo]         \n\t"
      "punpckhwd  %[src1],         %[dest2_u],        %[src_lo]         \n\t"
      "psubw      %[dest2_u],      %[src1],           %[src0]           \n\t"
      "psraw      %[dest2_u],      %[dest2_u],        %[eight]          \n\t"
      "punpcklwd  %[src0],         %[dest2_v],        %[src_hi]         \n\t"
      "punpckhwd  %[src1],         %[dest2_v],        %[src_hi]         \n\t"
      "psubw      %[dest2_v],      %[src0],           %[src1]           \n\t"
      "psraw      %[dest2_v],      %[dest2_v],        %[eight]          \n\t"

      "gsldrc1    %[src0],         0x30(%[src_rgb0])                    \n\t"
      "gsldlc1    %[src0],         0x37(%[src_rgb0])                    \n\t"
      "gsldrc1    %[src1],         0x30(%[src_rgb1])                    \n\t"
      "gsldlc1    %[src1],         0x37(%[src_rgb1])                    \n\t"
      "punpcklbh  %[src_lo],       %[src0],           %[zero]           \n\t"
      "punpckhbh  %[src_hi],       %[src0],           %[zero]           \n\t"
      "paddh      %[src0],         %[src_lo],         %[src_hi]         \n\t"
      "punpcklbh  %[src_lo],       %[src1],           %[zero]           \n\t"
      "paddh      %[src0],         %[src0],           %[src_lo]         \n\t"
      "punpckhbh  %[src_hi],       %[src1],           %[zero]           \n\t"
      "paddh      %[src0],         %[src0],           %[src_hi]         \n\t"
      "psrlh      %[src0],         %[src0],           %[two]            \n\t"
      "pinsrh_3   %[dest3_u],      %[src0],           %[value]          \n\t"
      "dsll       %[dest3_v],      %[src0],           %[sixteen]        \n\t"
      "pinsrh_0   %[dest3_v],      %[dest3_v],        %[value]          \n\t"
      "pmaddhw    %[dest3_u],      %[dest3_u],        %[mask_u]         \n\t"
      "pmaddhw    %[dest3_v],      %[dest3_v],        %[mask_v]         \n\t"

      "gsldrc1    %[src0],         0x38(%[src_rgb0])                    \n\t"
      "gsldlc1    %[src0],         0x3f(%[src_rgb0])                    \n\t"
      "gsldrc1    %[src1],         0x38(%[src_rgb1])                    \n\t"
      "gsldlc1    %[src1],         0x3f(%[src_rgb1])                    \n\t"
      "punpcklbh  %[src_lo],       %[src0],           %[zero]           \n\t"
      "punpckhbh  %[src_hi],       %[src0],           %[zero]           \n\t"
      "paddh      %[src0],         %[src_lo],         %[src_hi]         \n\t"
      "punpcklbh  %[src_lo],       %[src1],           %[zero]           \n\t"
      "paddh      %[src0],         %[src0],           %[src_lo]         \n\t"
      "punpckhbh  %[src_hi],       %[src1],           %[zero]           \n\t"
      "paddh      %[src0],         %[src0],           %[src_hi]         \n\t"
      "psrlh      %[src0],         %[src0],           %[two]            \n\t"
      "pinsrh_3   %[src_lo],       %[src0],           %[value]          \n\t"
      "dsll       %[src_hi],       %[src0],           %[sixteen]        \n\t"
      "pinsrh_0   %[src_hi],       %[src_hi],         %[value]          \n\t"
      "pmaddhw    %[src_lo],       %[src_lo],         %[mask_u]         \n\t"
      "pmaddhw    %[src_hi],       %[src_hi],         %[mask_v]         \n\t"

      "punpcklwd  %[src0],         %[dest3_u],        %[src_lo]         \n\t"
      "punpckhwd  %[src1],         %[dest3_u],        %[src_lo]         \n\t"
      "psubw      %[dest3_u],      %[src1],           %[src0]           \n\t"
      "psraw      %[dest3_u],      %[dest3_u],        %[eight]          \n\t"
      "punpcklwd  %[src0],         %[dest3_v],        %[src_hi]         \n\t"
      "punpckhwd  %[src1],         %[dest3_v],        %[src_hi]         \n\t"
      "psubw      %[dest3_v],      %[src0],           %[src1]           \n\t"
      "psraw      %[dest3_v],      %[dest3_v],        %[eight]          \n\t"

      "packsswh   %[src0],         %[dest0_u],        %[dest1_u]        \n\t"
      "packsswh   %[src1],         %[dest2_u],        %[dest3_u]        \n\t"
      "packushb   %[dest0_u],      %[src0],           %[src1]           \n\t"
      "gssdlc1    %[dest0_u],      0x07(%[dst_u])                       \n\t"
      "gssdrc1    %[dest0_u],      0x00(%[dst_u])                       \n\t"

      "packsswh   %[src0],         %[dest0_v],        %[dest1_v]        \n\t"
      "packsswh   %[src1],         %[dest2_v],        %[dest3_v]        \n\t"
      "packushb   %[dest0_v],      %[src0],           %[src1]           \n\t"
      "gssdlc1    %[dest0_v],      0x07(%[dst_v])                       \n\t"
      "gssdrc1    %[dest0_v],      0x00(%[dst_v])                       \n\t"

      "daddiu     %[src_rgb0],     %[src_rgb0],       0x40              \n\t"
      "daddiu     %[dst_u],        %[dst_u],          0x08              \n\t"
      "daddiu     %[dst_v],        %[dst_v],          0x08              \n\t"
      "daddi      %[width],        %[width],         -0x10              \n\t"
      "bgtz       %[width],        1b                                   \n\t"
      : [src_rgb1] "=&r"(src_rgb1), [src0] "=&f"(ftmp[0]),
        [src1] "=&f"(ftmp[1]), [src_lo] "=&f"(ftmp[2]), [src_hi] "=&f"(ftmp[3]),
        [dest0_u] "=&f"(ftmp[4]), [dest0_v] "=&f"(ftmp[5]),
        [dest1_u] "=&f"(ftmp[6]), [dest1_v] "=&f"(ftmp[7]),
        [dest2_u] "=&f"(ftmp[8]), [dest2_v] "=&f"(ftmp[9]),
        [dest3_u] "=&f"(ftmp[10]), [dest3_v] "=&f"(ftmp[11])
      : [src_rgb0] "r"(src_rgb0), [src_stride_rgb] "r"(src_stride_rgb),
        [dst_u] "r"(dst_u), [dst_v] "r"(dst_v), [width] "r"(width),
        [mask_u] "f"(mask_u), [mask_v] "f"(mask_v), [value] "f"(value),
        [zero] "f"(0x00), [eight] "f"(0x08), [two] "f"(0x02),
        [sixteen] "f"(0x10)
      : "memory");
}

void RGBAToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) {
  uint64_t src, src_hi, src_lo;
  uint64_t dest0, dest1, dest2, dest3;
  const uint64_t value = 0x1080;
  const uint64_t mask = 0x0042008100190001;

  __asm__ volatile(
      "1:                                                           \n\t"
      "gsldlc1    %[src],          0x07(%[src_argb0])               \n\t"
      "gsldrc1    %[src],          0x00(%[src_argb0])               \n\t"
      "punpcklbh  %[src_lo],       %[src],            %[zero]       \n\t"
      "pinsrh_0   %[src_lo],       %[src_lo],         %[value]      \n\t"
      "pmaddhw    %[src_lo],       %[src_lo],         %[mask]       \n\t"
      "punpckhbh  %[src_hi],       %[src],            %[zero]       \n\t"
      "pinsrh_0   %[src_hi],       %[src_hi],         %[value]      \n\t"
      "pmaddhw    %[src_hi],       %[src_hi],         %[mask]       \n\t"
      "punpcklwd  %[src],          %[src_lo],         %[src_hi]     \n\t"
      "punpckhwd  %[dest0],        %[src_lo],         %[src_hi]     \n\t"
      "paddw      %[dest0],        %[dest0],          %[src]        \n\t"
      "psrlw      %[dest0],        %[dest0],          %[eight]      \n\t"

      "gsldlc1    %[src],          0x0f(%[src_argb0])               \n\t"
      "gsldrc1    %[src],          0x08(%[src_argb0])               \n\t"
      "punpcklbh  %[src_lo],       %[src],            %[zero]       \n\t"
      "pinsrh_0   %[src_lo],       %[src_lo],         %[value]      \n\t"
      "pmaddhw    %[src_lo],       %[src_lo],         %[mask]       \n\t"
      "punpckhbh  %[src_hi],       %[src],            %[zero]       \n\t"
      "pinsrh_0   %[src_hi],       %[src_hi],         %[value]      \n\t"
      "pmaddhw    %[src_hi],       %[src_hi],         %[mask]       \n\t"
      "punpcklwd  %[src],          %[src_lo],         %[src_hi]     \n\t"
      "punpckhwd  %[dest1],        %[src_lo],         %[src_hi]     \n\t"
      "paddw      %[dest1],        %[dest1],          %[src]        \n\t"
      "psrlw      %[dest1],        %[dest1],          %[eight]      \n\t"

      "gsldlc1    %[src],          0x17(%[src_argb0])               \n\t"
      "gsldrc1    %[src],          0x10(%[src_argb0])               \n\t"
      "punpcklbh  %[src_lo],       %[src],            %[zero]       \n\t"
      "pinsrh_0   %[src_lo],       %[src_lo],         %[value]      \n\t"
      "pmaddhw    %[src_lo],       %[src_lo],         %[mask]       \n\t"
      "punpckhbh  %[src_hi],       %[src],            %[zero]       \n\t"
      "pinsrh_0   %[src_hi],       %[src_hi],         %[value]      \n\t"
      "pmaddhw    %[src_hi],       %[src_hi],         %[mask]       \n\t"
      "punpcklwd  %[src],          %[src_lo],         %[src_hi]     \n\t"
      "punpckhwd  %[dest2],        %[src_lo],         %[src_hi]     \n\t"
      "paddw      %[dest2],        %[dest2],          %[src]        \n\t"
      "psrlw      %[dest2],        %[dest2],          %[eight]      \n\t"

      "gsldlc1    %[src],          0x1f(%[src_argb0])               \n\t"
      "gsldrc1    %[src],          0x18(%[src_argb0])               \n\t"
      "punpcklbh  %[src_lo],       %[src],            %[zero]       \n\t"
      "pinsrh_0   %[src_lo],       %[src_lo],         %[value]      \n\t"
      "pmaddhw    %[src_lo],       %[src_lo],         %[mask]       \n\t"
      "punpckhbh  %[src_hi],       %[src],            %[zero]       \n\t"
      "pinsrh_0   %[src_hi],       %[src_hi],         %[value]      \n\t"
      "pmaddhw    %[src_hi],       %[src_hi],         %[mask]       \n\t"
      "punpcklwd  %[src],          %[src_lo],         %[src_hi]     \n\t"
      "punpckhwd  %[dest3],        %[src_lo],         %[src_hi]     \n\t"
      "paddw      %[dest3],        %[dest3],          %[src]        \n\t"
      "psrlw      %[dest3],        %[dest3],          %[eight]      \n\t"

      "packsswh   %[src_lo],       %[dest0],          %[dest1]      \n\t"
      "packsswh   %[src_hi],       %[dest2],          %[dest3]      \n\t"
      "packushb   %[dest0],        %[src_lo],         %[src_hi]     \n\t"
      "gssdlc1    %[dest0],        0x07(%[dst_y])                   \n\t"
      "gssdrc1    %[dest0],        0x00(%[dst_y])                   \n\t"

      "daddiu     %[src_argb0],    %[src_argb0],      0x20          \n\t"
      "daddiu     %[dst_y],        %[dst_y],          0x08          \n\t"
      "daddi      %[width],        %[width],         -0x08          \n\t"
      "bnez       %[width],        1b                               \n\t"
      : [src] "=&f"(src), [src_hi] "=&f"(src_hi), [src_lo] "=&f"(src_lo),
        [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [dest2] "=&f"(dest2),
        [dest3] "=&f"(dest3)
      : [src_argb0] "r"(src_argb0), [dst_y] "r"(dst_y), [width] "r"(width),
        [mask] "f"(mask), [value] "f"(value), [eight] "f"(0x08),
        [zero] "f"(0x00)
      : "memory");
}

void RGBAToUVRow_MMI(const uint8_t* src_rgb0,
                     int src_stride_rgb,
                     uint8_t* dst_u,
                     uint8_t* dst_v,
                     int width) {
  uint64_t src_rgb1;
  uint64_t ftmp[12];
  const uint64_t value = 0x4040;
  const uint64_t mask_u = 0x0026004a00700002;
  const uint64_t mask_v = 0x00020070005e0012;

  __asm__ volatile(
      "1:                                                               \n\t"
      "daddu      %[src_rgb1],     %[src_rgb0],       %[src_stride_rgb] \n\t"
      "gsldrc1    %[src0],         0x00(%[src_rgb0])                    \n\t"
      "gsldlc1    %[src0],         0x07(%[src_rgb0])                    \n\t"
      "gsldrc1    %[src1],         0x00(%[src_rgb1])                    \n\t"
      "gsldlc1    %[src1],         0x07(%[src_rgb1])                    \n\t"
      "punpcklbh  %[src_lo],       %[src0],           %[zero]           \n\t"
      "punpckhbh  %[src_hi],       %[src0],           %[zero]           \n\t"
      "paddh      %[src0],         %[src_lo],         %[src_hi]         \n\t"
      "punpcklbh  %[src_lo],       %[src1],           %[zero]           \n\t"
      "paddh      %[src0],         %[src0],           %[src_lo]         \n\t"
      "punpckhbh  %[src_hi],       %[src1],           %[zero]           \n\t"
      "paddh      %[src0],         %[src0],           %[src_hi]         \n\t"
      "psrlh      %[src0],         %[src0],           %[two]            \n\t"
      "pinsrh_0   %[dest0_u],      %[src0],           %[value]          \n\t"
      "dsrl       %[dest0_v],      %[src0],           %[sixteen]        \n\t"
      "pinsrh_3   %[dest0_v],      %[dest0_v],        %[value]          \n\t"
      "pmaddhw    %[dest0_u],      %[dest0_u],        %[mask_u]         \n\t"
      "pmaddhw    %[dest0_v],      %[dest0_v],        %[mask_v]         \n\t"

      "gsldrc1    %[src0],         0x08(%[src_rgb0])                    \n\t"
      "gsldlc1    %[src0],         0x0f(%[src_rgb0])                    \n\t"
      "gsldrc1    %[src1],         0x08(%[src_rgb1])                    \n\t"
      "gsldlc1    %[src1],         0x0f(%[src_rgb1])                    \n\t"
      "punpcklbh  %[src_lo],       %[src0],           %[zero]           \n\t"
      "punpckhbh  %[src_hi],       %[src0],           %[zero]           \n\t"
      "paddh      %[src0],         %[src_lo],         %[src_hi]         \n\t"
      "punpcklbh  %[src_lo],       %[src1],           %[zero]           \n\t"
      "paddh      %[src0],         %[src0],           %[src_lo]         \n\t"
      "punpckhbh  %[src_hi],       %[src1],           %[zero]           \n\t"
      "paddh      %[src0],         %[src0],           %[src_hi]         \n\t"
      "psrlh      %[src0],         %[src0],           %[two]            \n\t"
      "pinsrh_0   %[src_lo],       %[src0],           %[value]          \n\t"
      "dsrl       %[src_hi],       %[src0],           %[sixteen]        \n\t"
      "pinsrh_3   %[src_hi],       %[src_hi],         %[value]          \n\t"
      "pmaddhw    %[src_lo],       %[src_lo],         %[mask_u]         \n\t"
      "pmaddhw    %[src_hi],       %[src_hi],         %[mask_v]         \n\t"

      "punpcklwd  %[src0],         %[dest0_u],        %[src_lo]         \n\t"
      "punpckhwd  %[src1],         %[dest0_u],        %[src_lo]         \n\t"
      "psubw      %[dest0_u],      %[src0],           %[src1]           \n\t"
      "psraw      %[dest0_u],      %[dest0_u],        %[eight]          \n\t"
      "punpcklwd  %[src0],         %[dest0_v],        %[src_hi]         \n\t"
      "punpckhwd  %[src1],         %[dest0_v],        %[src_hi]         \n\t"
      "psubw      %[dest0_v],      %[src1],           %[src0]           \n\t"
      "psraw      %[dest0_v],      %[dest0_v],        %[eight]          \n\t"

      "gsldrc1    %[src0],         0x10(%[src_rgb0])                    \n\t"
      "gsldlc1    %[src0],         0x17(%[src_rgb0])                    \n\t"
      "gsldrc1    %[src1],         0x10(%[src_rgb1])                    \n\t"
      "gsldlc1    %[src1],         0x17(%[src_rgb1])                    \n\t"
      "punpcklbh  %[src_lo],       %[src0],           %[zero]           \n\t"
      "punpckhbh  %[src_hi],       %[src0],           %[zero]           \n\t"
      "paddh      %[src0],         %[src_lo],         %[src_hi]         \n\t"
      "punpcklbh  %[src_lo],       %[src1],           %[zero]           \n\t"
      "paddh      %[src0],         %[src0],           %[src_lo]         \n\t"
      "punpckhbh  %[src_hi],       %[src1],           %[zero]           \n\t"
      "paddh      %[src0],         %[src0],           %[src_hi]         \n\t"
      "psrlh      %[src0],         %[src0],           %[two]            \n\t"
      "pinsrh_0   %[dest1_u],      %[src0],           %[value]          \n\t"
      "dsrl       %[dest1_v],      %[src0],           %[sixteen]        \n\t"
      "pinsrh_3   %[dest1_v],      %[dest1_v],        %[value]          \n\t"
      "pmaddhw    %[dest1_u],      %[dest1_u],        %[mask_u]         \n\t"
      "pmaddhw    %[dest1_v],      %[dest1_v],        %[mask_v]         \n\t"

      "gsldrc1    %[src0],         0x18(%[src_rgb0])                    \n\t"
      "gsldlc1    %[src0],         0x1f(%[src_rgb0])                    \n\t"
      "gsldrc1    %[src1],         0x18(%[src_rgb1])                    \n\t"
      "gsldlc1    %[src1],         0x1f(%[src_rgb1])                    \n\t"
      "punpcklbh  %[src_lo],       %[src0],           %[zero]           \n\t"
      "punpckhbh  %[src_hi],       %[src0],           %[zero]           \n\t"
      "paddh      %[src0],         %[src_lo],         %[src_hi]         \n\t"
      "punpcklbh  %[src_lo],       %[src1],           %[zero]           \n\t"
      "paddh      %[src0],         %[src0],           %[src_lo]         \n\t"
      "punpckhbh  %[src_hi],       %[src1],           %[zero]           \n\t"
      "paddh      %[src0],         %[src0],           %[src_hi]         \n\t"
      "psrlh      %[src0],         %[src0],           %[two]            \n\t"
      "pinsrh_0   %[src_lo],       %[src0],           %[value]          \n\t"
      "dsrl       %[src_hi],       %[src0],           %[sixteen]        \n\t"
      "pinsrh_3   %[src_hi],       %[src_hi],         %[value]          \n\t"
      "pmaddhw    %[src_lo],       %[src_lo],         %[mask_u]         \n\t"
      "pmaddhw    %[src_hi],       %[src_hi],         %[mask_v]         \n\t"

      "punpcklwd  %[src0],         %[dest1_u],        %[src_lo]         \n\t"
      "punpckhwd  %[src1],         %[dest1_u],        %[src_lo]         \n\t"
      "psubw      %[dest1_u],      %[src0],           %[src1]           \n\t"
      "psraw      %[dest1_u],      %[dest1_u],        %[eight]          \n\t"
      "punpcklwd  %[src0],         %[dest1_v],        %[src_hi]         \n\t"
      "punpckhwd  %[src1],         %[dest1_v],        %[src_hi]         \n\t"
      "psubw      %[dest1_v],      %[src1],           %[src0]           \n\t"
      "psraw      %[dest1_v],      %[dest1_v],        %[eight]          \n\t"

      "gsldrc1    %[src0],         0x20(%[src_rgb0])                    \n\t"
      "gsldlc1    %[src0],         0x27(%[src_rgb0])                    \n\t"
      "gsldrc1    %[src1],         0x20(%[src_rgb1])                    \n\t"
      "gsldlc1    %[src1],         0x27(%[src_rgb1])                    \n\t"
      "punpcklbh  %[src_lo],       %[src0],           %[zero]           \n\t"
      "punpckhbh  %[src_hi],       %[src0],           %[zero]           \n\t"
      "paddh      %[src0],         %[src_lo],         %[src_hi]         \n\t"
      "punpcklbh  %[src_lo],       %[src1],           %[zero]           \n\t"
      "paddh      %[src0],         %[src0],           %[src_lo]         \n\t"
      "punpckhbh  %[src_hi],       %[src1],           %[zero]           \n\t"
      "paddh      %[src0],         %[src0],           %[src_hi]         \n\t"
      "psrlh      %[src0],         %[src0],           %[two]            \n\t"
      "pinsrh_0   %[dest2_u],      %[src0],           %[value]          \n\t"
      "dsrl       %[dest2_v],      %[src0],           %[sixteen]        \n\t"
      "pinsrh_3   %[dest2_v],      %[dest2_v],        %[value]          \n\t"
      "pmaddhw    %[dest2_u],      %[dest2_u],        %[mask_u]         \n\t"
      "pmaddhw    %[dest2_v],      %[dest2_v],        %[mask_v]         \n\t"

      "gsldrc1    %[src0],         0x28(%[src_rgb0])                    \n\t"
      "gsldlc1    %[src0],         0x2f(%[src_rgb0])                    \n\t"
      "gsldrc1    %[src1],         0x28(%[src_rgb1])                    \n\t"
      "gsldlc1    %[src1],         0x2f(%[src_rgb1])                    \n\t"
      "punpcklbh  %[src_lo],       %[src0],           %[zero]           \n\t"
      "punpckhbh  %[src_hi],       %[src0],           %[zero]           \n\t"
      "paddh      %[src0],         %[src_lo],         %[src_hi]         \n\t"
      "punpcklbh  %[src_lo],       %[src1],           %[zero]           \n\t"
      "paddh      %[src0],         %[src0],           %[src_lo]         \n\t"
      "punpckhbh  %[src_hi],       %[src1],           %[zero]           \n\t"
      "paddh      %[src0],         %[src0],           %[src_hi]         \n\t"
      "psrlh      %[src0],         %[src0],           %[two]            \n\t"
      "pinsrh_0   %[src_lo],       %[src0],           %[value]          \n\t"
      "dsrl       %[src_hi],       %[src0],           %[sixteen]        \n\t"
      "pinsrh_3   %[src_hi],       %[src_hi],         %[value]          \n\t"
      "pmaddhw    %[src_lo],       %[src_lo],         %[mask_u]         \n\t"
      "pmaddhw    %[src_hi],       %[src_hi],         %[mask_v]         \n\t"

      "punpcklwd  %[src0],         %[dest2_u],        %[src_lo]         \n\t"
      "punpckhwd  %[src1],         %[dest2_u],        %[src_lo]         \n\t"
      "psubw      %[dest2_u],      %[src0],           %[src1]           \n\t"
      "psraw      %[dest2_u],      %[dest2_u],        %[eight]          \n\t"
      "punpcklwd  %[src0],         %[dest2_v],        %[src_hi]         \n\t"
      "punpckhwd  %[src1],         %[dest2_v],        %[src_hi]         \n\t"
      "psubw      %[dest2_v],      %[src1],           %[src0]           \n\t"
      "psraw      %[dest2_v],      %[dest2_v],        %[eight]          \n\t"

      "gsldrc1    %[src0],         0x30(%[src_rgb0])                    \n\t"
      "gsldlc1    %[src0],         0x37(%[src_rgb0])                    \n\t"
      "gsldrc1    %[src1],         0x30(%[src_rgb1])                    \n\t"
      "gsldlc1    %[src1],         0x37(%[src_rgb1])                    \n\t"
      "punpcklbh  %[src_lo],       %[src0],           %[zero]           \n\t"
      "punpckhbh  %[src_hi],       %[src0],           %[zero]           \n\t"
      "paddh      %[src0],         %[src_lo],         %[src_hi]         \n\t"
      "punpcklbh  %[src_lo],       %[src1],           %[zero]           \n\t"
      "paddh      %[src0],         %[src0],           %[src_lo]         \n\t"
      "punpckhbh  %[src_hi],       %[src1],           %[zero]           \n\t"
      "paddh      %[src0],         %[src0],           %[src_hi]         \n\t"
      "psrlh      %[src0],         %[src0],           %[two]            \n\t"
      "pinsrh_0   %[dest3_u],      %[src0],           %[value]          \n\t"
      "dsrl       %[dest3_v],      %[src0],           %[sixteen]        \n\t"
      "pinsrh_3   %[dest3_v],      %[dest3_v],        %[value]          \n\t"
      "pmaddhw    %[dest3_u],      %[dest3_u],        %[mask_u]         \n\t"
      "pmaddhw    %[dest3_v],      %[dest3_v],        %[mask_v]         \n\t"

      "gsldrc1    %[src0],         0x38(%[src_rgb0])                    \n\t"
      "gsldlc1    %[src0],         0x3f(%[src_rgb0])                    \n\t"
      "gsldrc1    %[src1],         0x38(%[src_rgb1])                    \n\t"
      "gsldlc1    %[src1],         0x3f(%[src_rgb1])                    \n\t"
      "punpcklbh  %[src_lo],       %[src0],           %[zero]           \n\t"
      "punpckhbh  %[src_hi],       %[src0],           %[zero]           \n\t"
      "paddh      %[src0],         %[src_lo],         %[src_hi]         \n\t"
      "punpcklbh  %[src_lo],       %[src1],           %[zero]           \n\t"
      "paddh      %[src0],         %[src0],           %[src_lo]         \n\t"
      "punpckhbh  %[src_hi],       %[src1],           %[zero]           \n\t"
      "paddh      %[src0],         %[src0],           %[src_hi]         \n\t"
      "psrlh      %[src0],         %[src0],           %[two]            \n\t"
      "pinsrh_0   %[src_lo],       %[src0],           %[value]          \n\t"
      "dsrl       %[src_hi],       %[src0],           %[sixteen]        \n\t"
      "pinsrh_3   %[src_hi],       %[src_hi],         %[value]          \n\t"
      "pmaddhw    %[src_lo],       %[src_lo],         %[mask_u]         \n\t"
      "pmaddhw    %[src_hi],       %[src_hi],         %[mask_v]         \n\t"

      "punpcklwd  %[src0],         %[dest3_u],        %[src_lo]         \n\t"
      "punpckhwd  %[src1],         %[dest3_u],        %[src_lo]         \n\t"
      "psubw      %[dest3_u],      %[src0],           %[src1]           \n\t"
      "psraw      %[dest3_u],      %[dest3_u],        %[eight]          \n\t"
      "punpcklwd  %[src0],         %[dest3_v],        %[src_hi]         \n\t"
      "punpckhwd  %[src1],         %[dest3_v],        %[src_hi]         \n\t"
      "psubw      %[dest3_v],      %[src1],           %[src0]           \n\t"
      "psraw      %[dest3_v],      %[dest3_v],        %[eight]          \n\t"

      "packsswh   %[src0],         %[dest0_u],        %[dest1_u]        \n\t"
      "packsswh   %[src1],         %[dest2_u],        %[dest3_u]        \n\t"
      "packushb   %[dest0_u],      %[src0],           %[src1]           \n\t"
      "gssdlc1    %[dest0_u],      0x07(%[dst_u])                       \n\t"
      "gssdrc1    %[dest0_u],      0x00(%[dst_u])                       \n\t"

      "packsswh   %[src0],         %[dest0_v],        %[dest1_v]        \n\t"
      "packsswh   %[src1],         %[dest2_v],        %[dest3_v]        \n\t"
      "packushb   %[dest0_v],      %[src0],           %[src1]           \n\t"
      "gssdlc1    %[dest0_v],      0x07(%[dst_v])                       \n\t"
      "gssdrc1    %[dest0_v],      0x00(%[dst_v])                       \n\t"

      "daddiu     %[src_rgb0],     %[src_rgb0],       0x40              \n\t"
      "daddiu     %[dst_u],        %[dst_u],          0x08              \n\t"
      "daddiu     %[dst_v],        %[dst_v],          0x08              \n\t"
      "daddi      %[width],        %[width],         -0x10              \n\t"
      "bgtz       %[width],        1b                                   \n\t"
      : [src_rgb1] "=&r"(src_rgb1), [src0] "=&f"(ftmp[0]),
        [src1] "=&f"(ftmp[1]), [src_lo] "=&f"(ftmp[2]), [src_hi] "=&f"(ftmp[3]),
        [dest0_u] "=&f"(ftmp[4]), [dest0_v] "=&f"(ftmp[5]),
        [dest1_u] "=&f"(ftmp[6]), [dest1_v] "=&f"(ftmp[7]),
        [dest2_u] "=&f"(ftmp[8]), [dest2_v] "=&f"(ftmp[9]),
        [dest3_u] "=&f"(ftmp[10]), [dest3_v] "=&f"(ftmp[11])
      : [src_rgb0] "r"(src_rgb0), [src_stride_rgb] "r"(src_stride_rgb),
        [dst_u] "r"(dst_u), [dst_v] "r"(dst_v), [width] "r"(width),
        [mask_u] "f"(mask_u), [mask_v] "f"(mask_v), [value] "f"(value),
        [zero] "f"(0x00), [eight] "f"(0x08), [two] "f"(0x02),
        [sixteen] "f"(0x10)
      : "memory");
}

void RGB24ToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) {
  uint64_t src, src_hi, src_lo;
  uint64_t dest0, dest1, dest2, dest3;
  const uint64_t value = 0x1080;
  const uint64_t mask = 0x0001004200810019;

  __asm__ volatile(
      "1:                                                           \n\t"
      "gsldlc1    %[src],          0x07(%[src_argb0])               \n\t"
      "gsldrc1    %[src],          0x00(%[src_argb0])               \n\t"
      "punpcklbh  %[src_lo],       %[src],            %[zero]       \n\t"
      "pinsrh_3   %[src_lo],       %[src_lo],         %[value]      \n\t"
      "pmaddhw    %[src_lo],       %[src_lo],         %[mask]       \n\t"
      "dsll       %[src],          %[src],            %[eight]      \n\t"
      "punpckhbh  %[src_hi],       %[src],            %[zero]       \n\t"
      "pinsrh_3   %[src_hi],       %[src_hi],         %[value]      \n\t"
      "pmaddhw    %[src_hi],       %[src_hi],         %[mask]       \n\t"
      "punpcklwd  %[src],          %[src_lo],         %[src_hi]     \n\t"
      "punpckhwd  %[dest0],        %[src_lo],         %[src_hi]     \n\t"
      "paddw      %[dest0],        %[dest0],          %[src]        \n\t"
      "psrlw      %[dest0],        %[dest0],          %[eight]      \n\t"

      "gsldlc1    %[src],          0x0d(%[src_argb0])               \n\t"
      "gsldrc1    %[src],          0x06(%[src_argb0])               \n\t"
      "punpcklbh  %[src_lo],       %[src],            %[zero]       \n\t"
      "pinsrh_3   %[src_lo],       %[src_lo],         %[value]      \n\t"
      "pmaddhw    %[src_lo],       %[src_lo],         %[mask]       \n\t"
      "dsll       %[src],          %[src],            %[eight]      \n\t"
      "punpckhbh  %[src_hi],       %[src],            %[zero]       \n\t"
      "pinsrh_3   %[src_hi],       %[src_hi],         %[value]      \n\t"
      "pmaddhw    %[src_hi],       %[src_hi],         %[mask]       \n\t"
      "punpcklwd  %[src],          %[src_lo],         %[src_hi]     \n\t"
      "punpckhwd  %[dest1],        %[src_lo],         %[src_hi]     \n\t"
      "paddw      %[dest1],        %[dest1],          %[src]        \n\t"
      "psrlw      %[dest1],        %[dest1],          %[eight]      \n\t"

      "gsldlc1    %[src],          0x13(%[src_argb0])               \n\t"
      "gsldrc1    %[src],          0x0c(%[src_argb0])               \n\t"
      "punpcklbh  %[src_lo],       %[src],            %[zero]       \n\t"
      "pinsrh_3   %[src_lo],       %[src_lo],         %[value]      \n\t"
      "pmaddhw    %[src_lo],       %[src_lo],         %[mask]       \n\t"
      "dsll       %[src],          %[src],            %[eight]      \n\t"
      "punpckhbh  %[src_hi],       %[src],            %[zero]       \n\t"
      "pinsrh_3   %[src_hi],       %[src_hi],         %[value]      \n\t"
      "pmaddhw    %[src_hi],       %[src_hi],         %[mask]       \n\t"
      "punpcklwd  %[src],          %[src_lo],         %[src_hi]     \n\t"
      "punpckhwd  %[dest2],        %[src_lo],         %[src_hi]     \n\t"
      "paddw      %[dest2],        %[dest2],          %[src]        \n\t"
      "psrlw      %[dest2],        %[dest2],          %[eight]      \n\t"

      "gsldlc1    %[src],          0x19(%[src_argb0])               \n\t"
      "gsldrc1    %[src],          0x12(%[src_argb0])               \n\t"
      "punpcklbh  %[src_lo],       %[src],            %[zero]       \n\t"
      "pinsrh_3   %[src_lo],       %[src_lo],         %[value]      \n\t"
      "pmaddhw    %[src_lo],       %[src_lo],         %[mask]       \n\t"
      "dsll       %[src],          %[src],            %[eight]      \n\t"
      "punpckhbh  %[src_hi],       %[src],            %[zero]       \n\t"
      "pinsrh_3   %[src_hi],       %[src_hi],         %[value]      \n\t"
      "pmaddhw    %[src_hi],       %[src_hi],         %[mask]       \n\t"
      "punpcklwd  %[src],          %[src_lo],         %[src_hi]     \n\t"
      "punpckhwd  %[dest3],        %[src_lo],         %[src_hi]     \n\t"
      "paddw      %[dest3],        %[dest3],          %[src]        \n\t"
      "psrlw      %[dest3],        %[dest3],          %[eight]      \n\t"

      "packsswh   %[src_lo],       %[dest0],          %[dest1]      \n\t"
      "packsswh   %[src_hi],       %[dest2],          %[dest3]      \n\t"
      "packushb   %[dest0],        %[src_lo],         %[src_hi]     \n\t"
      "gssdlc1    %[dest0],        0x07(%[dst_y])                   \n\t"
      "gssdrc1    %[dest0],        0x00(%[dst_y])                   \n\t"

      "daddiu     %[src_argb0],    %[src_argb0],      0x18          \n\t"
      "daddiu     %[dst_y],        %[dst_y],          0x08          \n\t"
      "daddi      %[width],        %[width],         -0x08          \n\t"
      "bnez       %[width],        1b                               \n\t"
      : [src] "=&f"(src), [src_hi] "=&f"(src_hi), [src_lo] "=&f"(src_lo),
        [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [dest2] "=&f"(dest2),
        [dest3] "=&f"(dest3)
      : [src_argb0] "r"(src_argb0), [dst_y] "r"(dst_y), [width] "r"(width),
        [mask] "f"(mask), [value] "f"(value), [eight] "f"(0x08),
        [zero] "f"(0x00)
      : "memory");
}

void RGB24ToUVRow_MMI(const uint8_t* src_rgb0,
                      int src_stride_rgb,
                      uint8_t* dst_u,
                      uint8_t* dst_v,
                      int width) {
  uint64_t src_rgb1;
  uint64_t ftmp[12];
  const uint64_t value = 0x4040;
  const uint64_t mask_u = 0x0026004a00700002;
  const uint64_t mask_v = 0x00020070005e0012;

  __asm__ volatile(
      "1:                                                               \n\t"
      "daddu      %[src_rgb1],     %[src_rgb0],       %[src_stride_rgb] \n\t"
      "gsldrc1    %[src0],         0x00(%[src_rgb0])                    \n\t"
      "gsldlc1    %[src0],         0x07(%[src_rgb0])                    \n\t"
      "gsldrc1    %[src1],         0x00(%[src_rgb1])                    \n\t"
      "gsldlc1    %[src1],         0x07(%[src_rgb1])                    \n\t"
      "punpcklbh  %[src_lo],       %[src0],           %[zero]           \n\t"
      "dsll       %[src0],         %[src0],           %[eight]          \n\t"
      "punpckhbh  %[src_hi],       %[src0],           %[zero]           \n\t"
      "paddh      %[src0],         %[src_lo],         %[src_hi]         \n\t"
      "punpcklbh  %[src_lo],       %[src1],           %[zero]           \n\t"
      "paddh      %[src0],         %[src0],           %[src_lo]         \n\t"
      "dsll       %[src1],         %[src1],           %[eight]          \n\t"
      "punpckhbh  %[src_hi],       %[src1],           %[zero]           \n\t"
      "paddh      %[src0],         %[src0],           %[src_hi]         \n\t"
      "psrlh      %[src0],         %[src0],           %[two]            \n\t"
      "dsll       %[dest0_u],      %[src0],           %[sixteen]        \n\t"
      "pinsrh_0   %[dest0_u],      %[dest0_u],        %[value]          \n\t"
      "pinsrh_3   %[dest0_v],      %[src0],           %[value]          \n\t"
      "pmaddhw    %[dest0_u],      %[dest0_u],        %[mask_u]         \n\t"
      "pmaddhw    %[dest0_v],      %[dest0_v],        %[mask_v]         \n\t"

      "gsldrc1    %[src0],         0x06(%[src_rgb0])                    \n\t"
      "gsldlc1    %[src0],         0x0d(%[src_rgb0])                    \n\t"
      "gsldrc1    %[src1],         0x06(%[src_rgb1])                    \n\t"
      "gsldlc1    %[src1],         0x0d(%[src_rgb1])                    \n\t"
      "punpcklbh  %[src_lo],       %[src0],           %[zero]           \n\t"
      "dsll       %[src0],         %[src0],           %[eight]          \n\t"
      "punpckhbh  %[src_hi],       %[src0],           %[zero]           \n\t"
      "paddh      %[src0],         %[src_lo],         %[src_hi]         \n\t"
      "punpcklbh  %[src_lo],       %[src1],           %[zero]           \n\t"
      "paddh      %[src0],         %[src0],           %[src_lo]         \n\t"
      "dsll       %[src1],         %[src1],           %[eight]          \n\t"
      "punpckhbh  %[src_hi],       %[src1],           %[zero]           \n\t"
      "paddh      %[src0],         %[src0],           %[src_hi]         \n\t"
      "psrlh      %[src0],         %[src0],           %[two]            \n\t"
      "dsll       %[src_lo],       %[src0],           %[sixteen]        \n\t"
      "pinsrh_0   %[src_lo],       %[src_lo],         %[value]          \n\t"
      "pinsrh_3   %[src_hi],       %[src0],           %[value]          \n\t"
      "pmaddhw    %[src_lo],       %[src_lo],         %[mask_u]         \n\t"
      "pmaddhw    %[src_hi],       %[src_hi],         %[mask_v]         \n\t"

      "punpcklwd  %[src0],         %[dest0_u],        %[src_lo]         \n\t"
      "punpckhwd  %[src1],         %[dest0_u],        %[src_lo]         \n\t"
      "psubw      %[dest0_u],      %[src0],           %[src1]           \n\t"
      "psraw      %[dest0_u],      %[dest0_u],        %[eight]          \n\t"
      "punpcklwd  %[src0],         %[dest0_v],        %[src_hi]         \n\t"
      "punpckhwd  %[src1],         %[dest0_v],        %[src_hi]         \n\t"
      "psubw      %[dest0_v],      %[src1],           %[src0]           \n\t"
      "psraw      %[dest0_v],      %[dest0_v],        %[eight]          \n\t"

      "gsldrc1    %[src0],         0x0c(%[src_rgb0])                    \n\t"
      "gsldlc1    %[src0],         0x13(%[src_rgb0])                    \n\t"
      "gsldrc1    %[src1],         0x0c(%[src_rgb1])                    \n\t"
      "gsldlc1    %[src1],         0x13(%[src_rgb1])                    \n\t"
      "punpcklbh  %[src_lo],       %[src0],           %[zero]           \n\t"
      "dsll       %[src0],         %[src0],           %[eight]          \n\t"
      "punpckhbh  %[src_hi],       %[src0],           %[zero]           \n\t"
      "paddh      %[src0],         %[src_lo],         %[src_hi]         \n\t"
      "punpcklbh  %[src_lo],       %[src1],           %[zero]           \n\t"
      "paddh      %[src0],         %[src0],           %[src_lo]         \n\t"
      "dsll       %[src1],         %[src1],           %[eight]          \n\t"
      "punpckhbh  %[src_hi],       %[src1],           %[zero]           \n\t"
      "paddh      %[src0],         %[src0],           %[src_hi]         \n\t"
      "psrlh      %[src0],         %[src0],           %[two]            \n\t"
      "dsll       %[dest1_u],      %[src0],           %[sixteen]        \n\t"
      "pinsrh_0   %[dest1_u],      %[dest1_u],        %[value]          \n\t"
      "pinsrh_3   %[dest1_v],      %[src0],           %[value]          \n\t"
      "pmaddhw    %[dest1_u],      %[dest1_u],        %[mask_u]         \n\t"
      "pmaddhw    %[dest1_v],      %[dest1_v],        %[mask_v]         \n\t"

      "gsldrc1    %[src0],         0x12(%[src_rgb0])                    \n\t"
      "gsldlc1    %[src0],         0x19(%[src_rgb0])                    \n\t"
      "gsldrc1    %[src1],         0x12(%[src_rgb1])                    \n\t"
      "gsldlc1    %[src1],         0x19(%[src_rgb1])                    \n\t"
      "punpcklbh  %[src_lo],       %[src0],           %[zero]           \n\t"
      "dsll       %[src0],         %[src0],           %[eight]          \n\t"
      "punpckhbh  %[src_hi],       %[src0],           %[zero]           \n\t"
      "paddh      %[src0],         %[src_lo],         %[src_hi]         \n\t"
      "punpcklbh  %[src_lo],       %[src1],           %[zero]           \n\t"
      "paddh      %[src0],         %[src0],           %[src_lo]         \n\t"
      "dsll       %[src1],         %[src1],           %[eight]          \n\t"
      "punpckhbh  %[src_hi],       %[src1],           %[zero]           \n\t"
      "paddh      %[src0],         %[src0],           %[src_hi]         \n\t"
      "psrlh      %[src0],         %[src0],           %[two]            \n\t"
      "dsll       %[src_lo],       %[src0],           %[sixteen]        \n\t"
      "pinsrh_0   %[src_lo],       %[src_lo],         %[value]          \n\t"
      "pinsrh_3   %[src_hi],       %[src0],           %[value]          \n\t"
      "pmaddhw    %[src_lo],       %[src_lo],         %[mask_u]         \n\t"
      "pmaddhw    %[src_hi],       %[src_hi],         %[mask_v]         \n\t"

      "punpcklwd  %[src0],         %[dest1_u],        %[src_lo]         \n\t"
      "punpckhwd  %[src1],         %[dest1_u],        %[src_lo]         \n\t"
      "psubw      %[dest1_u],      %[src0],           %[src1]           \n\t"
      "psraw      %[dest1_u],      %[dest1_u],        %[eight]          \n\t"
      "punpcklwd  %[src0],         %[dest1_v],        %[src_hi]         \n\t"
      "punpckhwd  %[src1],         %[dest1_v],        %[src_hi]         \n\t"
      "psubw      %[dest1_v],      %[src1],           %[src0]           \n\t"
      "psraw      %[dest1_v],      %[dest1_v],        %[eight]          \n\t"

      "gsldrc1    %[src0],         0x18(%[src_rgb0])                    \n\t"
      "gsldlc1    %[src0],         0x1f(%[src_rgb0])                    \n\t"
      "gsldrc1    %[src1],         0x18(%[src_rgb1])                    \n\t"
      "gsldlc1    %[src1],         0x1f(%[src_rgb1])                    \n\t"
      "punpcklbh  %[src_lo],       %[src0],           %[zero]           \n\t"
      "dsll       %[src0],         %[src0],           %[eight]          \n\t"
      "punpckhbh  %[src_hi],       %[src0],           %[zero]           \n\t"
      "paddh      %[src0],         %[src_lo],         %[src_hi]         \n\t"
      "punpcklbh  %[src_lo],       %[src1],           %[zero]           \n\t"
      "paddh      %[src0],         %[src0],           %[src_lo]         \n\t"
      "dsll       %[src1],         %[src1],           %[eight]          \n\t"
      "punpckhbh  %[src_hi],       %[src1],           %[zero]           \n\t"
      "paddh      %[src0],         %[src0],           %[src_hi]         \n\t"
      "psrlh      %[src0],         %[src0],           %[two]            \n\t"
      "dsll       %[dest2_u],      %[src0],           %[sixteen]        \n\t"
      "pinsrh_0   %[dest2_u],      %[dest2_u],        %[value]          \n\t"
      "pinsrh_3   %[dest2_v],      %[src0],           %[value]          \n\t"
      "pmaddhw    %[dest2_u],      %[dest2_u],        %[mask_u]         \n\t"
      "pmaddhw    %[dest2_v],      %[dest2_v],        %[mask_v]         \n\t"

      "gsldrc1    %[src0],         0x1e(%[src_rgb0])                    \n\t"
      "gsldlc1    %[src0],         0x25(%[src_rgb0])                    \n\t"
      "gsldrc1    %[src1],         0x1e(%[src_rgb1])                    \n\t"
      "gsldlc1    %[src1],         0x25(%[src_rgb1])                    \n\t"
      "punpcklbh  %[src_lo],       %[src0],           %[zero]           \n\t"
      "dsll       %[src0],         %[src0],           %[eight]          \n\t"
      "punpckhbh  %[src_hi],       %[src0],           %[zero]           \n\t"
      "paddh      %[src0],         %[src_lo],         %[src_hi]         \n\t"
      "punpcklbh  %[src_lo],       %[src1],           %[zero]           \n\t"
      "paddh      %[src0],         %[src0],           %[src_lo]         \n\t"
      "dsll       %[src1],         %[src1],           %[eight]          \n\t"
      "punpckhbh  %[src_hi],       %[src1],           %[zero]           \n\t"
      "paddh      %[src0],         %[src0],           %[src_hi]         \n\t"
      "psrlh      %[src0],         %[src0],           %[two]            \n\t"
      "dsll       %[src_lo],       %[src0],           %[sixteen]        \n\t"
      "pinsrh_0   %[src_lo],       %[src_lo],         %[value]          \n\t"
      "pinsrh_3   %[src_hi],       %[src0],           %[value]          \n\t"
      "pmaddhw    %[src_lo],       %[src_lo],         %[mask_u]         \n\t"
      "pmaddhw    %[src_hi],       %[src_hi],         %[mask_v]         \n\t"

      "punpcklwd  %[src0],         %[dest2_u],        %[src_lo]         \n\t"
      "punpckhwd  %[src1],         %[dest2_u],        %[src_lo]         \n\t"
      "psubw      %[dest2_u],      %[src0],           %[src1]           \n\t"
      "psraw      %[dest2_u],      %[dest2_u],        %[eight]          \n\t"
      "punpcklwd  %[src0],         %[dest2_v],        %[src_hi]         \n\t"
      "punpckhwd  %[src1],         %[dest2_v],        %[src_hi]         \n\t"
      "psubw      %[dest2_v],      %[src1],           %[src0]           \n\t"
      "psraw      %[dest2_v],      %[dest2_v],        %[eight]          \n\t"

      "gsldrc1    %[src0],         0x24(%[src_rgb0])                    \n\t"
      "gsldlc1    %[src0],         0x2b(%[src_rgb0])                    \n\t"
      "gsldrc1    %[src1],         0x24(%[src_rgb1])                    \n\t"
      "gsldlc1    %[src1],         0x2b(%[src_rgb1])                    \n\t"
      "punpcklbh  %[src_lo],       %[src0],           %[zero]           \n\t"
      "dsll       %[src0],         %[src0],           %[eight]          \n\t"
      "punpckhbh  %[src_hi],       %[src0],           %[zero]           \n\t"
      "paddh      %[src0],         %[src_lo],         %[src_hi]         \n\t"
      "punpcklbh  %[src_lo],       %[src1],           %[zero]           \n\t"
      "paddh      %[src0],         %[src0],           %[src_lo]         \n\t"
      "dsll       %[src1],         %[src1],           %[eight]          \n\t"
      "punpckhbh  %[src_hi],       %[src1],           %[zero]           \n\t"
      "paddh      %[src0],         %[src0],           %[src_hi]         \n\t"
      "psrlh      %[src0],         %[src0],           %[two]            \n\t"
      "dsll       %[dest3_u],      %[src0],           %[sixteen]        \n\t"
      "pinsrh_0   %[dest3_u],      %[dest3_u],        %[value]          \n\t"
      "pinsrh_3   %[dest3_v],      %[src0],           %[value]          \n\t"
      "pmaddhw    %[dest3_u],      %[dest3_u],        %[mask_u]         \n\t"
      "pmaddhw    %[dest3_v],      %[dest3_v],        %[mask_v]         \n\t"

      "gsldrc1    %[src0],         0x2a(%[src_rgb0])                    \n\t"
      "gsldlc1    %[src0],         0x31(%[src_rgb0])                    \n\t"
      "gsldrc1    %[src1],         0x2a(%[src_rgb1])                    \n\t"
      "gsldlc1    %[src1],         0x31(%[src_rgb1])                    \n\t"
      "punpcklbh  %[src_lo],       %[src0],           %[zero]           \n\t"
      "dsll       %[src0],         %[src0],           %[eight]          \n\t"
      "punpckhbh  %[src_hi],       %[src0],           %[zero]           \n\t"
      "paddh      %[src0],         %[src_lo],         %[src_hi]         \n\t"
      "punpcklbh  %[src_lo],       %[src1],           %[zero]           \n\t"
      "paddh      %[src0],         %[src0],           %[src_lo]         \n\t"
      "dsll       %[src1],         %[src1],           %[eight]          \n\t"
      "punpckhbh  %[src_hi],       %[src1],           %[zero]           \n\t"
      "paddh      %[src0],         %[src0],           %[src_hi]         \n\t"
      "psrlh      %[src0],         %[src0],           %[two]            \n\t"
      "dsll       %[src_lo],       %[src0],           %[sixteen]        \n\t"
      "pinsrh_0   %[src_lo],       %[src_lo],         %[value]          \n\t"
      "pinsrh_3   %[src_hi],       %[src0],           %[value]          \n\t"
      "pmaddhw    %[src_lo],       %[src_lo],         %[mask_u]         \n\t"
      "pmaddhw    %[src_hi],       %[src_hi],         %[mask_v]         \n\t"

      "punpcklwd  %[src0],         %[dest3_u],        %[src_lo]         \n\t"
      "punpckhwd  %[src1],         %[dest3_u],        %[src_lo]         \n\t"
      "psubw      %[dest3_u],      %[src0],           %[src1]           \n\t"
      "psraw      %[dest3_u],      %[dest3_u],        %[eight]          \n\t"
      "punpcklwd  %[src0],         %[dest3_v],        %[src_hi]         \n\t"
      "punpckhwd  %[src1],         %[dest3_v],        %[src_hi]         \n\t"
      "psubw      %[dest3_v],      %[src1],           %[src0]           \n\t"
      "psraw      %[dest3_v],      %[dest3_v],        %[eight]          \n\t"

      "packsswh   %[src0],         %[dest0_u],        %[dest1_u]        \n\t"
      "packsswh   %[src1],         %[dest2_u],        %[dest3_u]        \n\t"
      "packushb   %[dest0_u],      %[src0],           %[src1]           \n\t"
      "gssdlc1    %[dest0_u],      0x07(%[dst_u])                       \n\t"
      "gssdrc1    %[dest0_u],      0x00(%[dst_u])                       \n\t"

      "packsswh   %[src0],         %[dest0_v],        %[dest1_v]        \n\t"
      "packsswh   %[src1],         %[dest2_v],        %[dest3_v]        \n\t"
      "packushb   %[dest0_v],      %[src0],           %[src1]           \n\t"
      "gssdlc1    %[dest0_v],      0x07(%[dst_v])                       \n\t"
      "gssdrc1    %[dest0_v],      0x00(%[dst_v])                       \n\t"

      "daddiu     %[src_rgb0],     %[src_rgb0],       0x30              \n\t"
      "daddiu     %[dst_u],        %[dst_u],          0x08              \n\t"
      "daddiu     %[dst_v],        %[dst_v],          0x08              \n\t"
      "daddi      %[width],        %[width],         -0x10              \n\t"
      "bgtz       %[width],        1b                                   \n\t"
      : [src_rgb1] "=&r"(src_rgb1), [src0] "=&f"(ftmp[0]),
        [src1] "=&f"(ftmp[1]), [src_lo] "=&f"(ftmp[2]), [src_hi] "=&f"(ftmp[3]),
        [dest0_u] "=&f"(ftmp[4]), [dest0_v] "=&f"(ftmp[5]),
        [dest1_u] "=&f"(ftmp[6]), [dest1_v] "=&f"(ftmp[7]),
        [dest2_u] "=&f"(ftmp[8]), [dest2_v] "=&f"(ftmp[9]),
        [dest3_u] "=&f"(ftmp[10]), [dest3_v] "=&f"(ftmp[11])
      : [src_rgb0] "r"(src_rgb0), [src_stride_rgb] "r"(src_stride_rgb),
        [dst_u] "r"(dst_u), [dst_v] "r"(dst_v), [width] "r"(width),
        [mask_u] "f"(mask_u), [mask_v] "f"(mask_v), [value] "f"(value),
        [zero] "f"(0x00), [eight] "f"(0x08), [two] "f"(0x02),
        [sixteen] "f"(0x10)
      : "memory");
}

void RAWToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) {
  uint64_t src, src_hi, src_lo;
  uint64_t dest0, dest1, dest2, dest3;
  const uint64_t value = 0x1080;
  const uint64_t mask = 0x0001001900810042;

  __asm__ volatile(
      "1:                                                           \n\t"
      "gsldlc1    %[src],          0x07(%[src_argb0])               \n\t"
      "gsldrc1    %[src],          0x00(%[src_argb0])               \n\t"
      "punpcklbh  %[src_lo],       %[src],            %[zero]       \n\t"
      "pinsrh_3   %[src_lo],       %[src_lo],         %[value]      \n\t"
      "pmaddhw    %[src_lo],       %[src_lo],         %[mask]       \n\t"
      "dsll       %[src],          %[src],            %[eight]      \n\t"
      "punpckhbh  %[src_hi],       %[src],            %[zero]       \n\t"
      "pinsrh_3   %[src_hi],       %[src_hi],         %[value]      \n\t"
      "pmaddhw    %[src_hi],       %[src_hi],         %[mask]       \n\t"
      "punpcklwd  %[src],          %[src_lo],         %[src_hi]     \n\t"
      "punpckhwd  %[dest0],        %[src_lo],         %[src_hi]     \n\t"
      "paddw      %[dest0],        %[dest0],          %[src]        \n\t"
      "psrlw      %[dest0],        %[dest0],          %[eight]      \n\t"

      "gsldlc1    %[src],          0x0d(%[src_argb0])               \n\t"
      "gsldrc1    %[src],          0x06(%[src_argb0])               \n\t"
      "punpcklbh  %[src_lo],       %[src],            %[zero]       \n\t"
      "pinsrh_3   %[src_lo],       %[src_lo],         %[value]      \n\t"
      "pmaddhw    %[src_lo],       %[src_lo],         %[mask]       \n\t"
      "dsll       %[src],          %[src],            %[eight]      \n\t"
      "punpckhbh  %[src_hi],       %[src],            %[zero]       \n\t"
      "pinsrh_3   %[src_hi],       %[src_hi],         %[value]      \n\t"
      "pmaddhw    %[src_hi],       %[src_hi],         %[mask]       \n\t"
      "punpcklwd  %[src],          %[src_lo],         %[src_hi]     \n\t"
      "punpckhwd  %[dest1],        %[src_lo],         %[src_hi]     \n\t"
      "paddw      %[dest1],        %[dest1],          %[src]        \n\t"
      "psrlw      %[dest1],        %[dest1],          %[eight]      \n\t"

      "gsldlc1    %[src],          0x13(%[src_argb0])               \n\t"
      "gsldrc1    %[src],          0x0c(%[src_argb0])               \n\t"
      "punpcklbh  %[src_lo],       %[src],            %[zero]       \n\t"
      "pinsrh_3   %[src_lo],       %[src_lo],         %[value]      \n\t"
      "pmaddhw    %[src_lo],       %[src_lo],         %[mask]       \n\t"
      "dsll       %[src],          %[src],            %[eight]      \n\t"
      "punpckhbh  %[src_hi],       %[src],            %[zero]       \n\t"
      "pinsrh_3   %[src_hi],       %[src_hi],         %[value]      \n\t"
      "pmaddhw    %[src_hi],       %[src_hi],         %[mask]       \n\t"
      "punpcklwd  %[src],          %[src_lo],         %[src_hi]     \n\t"
      "punpckhwd  %[dest2],        %[src_lo],         %[src_hi]     \n\t"
      "paddw      %[dest2],        %[dest2],          %[src]        \n\t"
      "psrlw      %[dest2],        %[dest2],          %[eight]      \n\t"

      "gsldlc1    %[src],          0x19(%[src_argb0])               \n\t"
      "gsldrc1    %[src],          0x12(%[src_argb0])               \n\t"
      "punpcklbh  %[src_lo],       %[src],            %[zero]       \n\t"
      "pinsrh_3   %[src_lo],       %[src_lo],         %[value]      \n\t"
      "pmaddhw    %[src_lo],       %[src_lo],         %[mask]       \n\t"
      "dsll       %[src],          %[src],            %[eight]      \n\t"
      "punpckhbh  %[src_hi],       %[src],            %[zero]       \n\t"
      "pinsrh_3   %[src_hi],       %[src_hi],         %[value]      \n\t"
      "pmaddhw    %[src_hi],       %[src_hi],         %[mask]       \n\t"
      "punpcklwd  %[src],          %[src_lo],         %[src_hi]     \n\t"
      "punpckhwd  %[dest3],        %[src_lo],         %[src_hi]     \n\t"
      "paddw      %[dest3],        %[dest3],          %[src]        \n\t"
      "psrlw      %[dest3],        %[dest3],          %[eight]      \n\t"

      "packsswh   %[src_lo],       %[dest0],          %[dest1]      \n\t"
      "packsswh   %[src_hi],       %[dest2],          %[dest3]      \n\t"
      "packushb   %[dest0],        %[src_lo],         %[src_hi]     \n\t"
      "gssdlc1    %[dest0],        0x07(%[dst_y])                   \n\t"
      "gssdrc1    %[dest0],        0x00(%[dst_y])                   \n\t"

      "daddiu     %[src_argb0],    %[src_argb0],      0x18          \n\t"
      "daddiu     %[dst_y],        %[dst_y],          0x08          \n\t"
      "daddi      %[width],        %[width],         -0x08          \n\t"
      "bnez       %[width],        1b                               \n\t"
      : [src] "=&f"(src), [src_hi] "=&f"(src_hi), [src_lo] "=&f"(src_lo),
        [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [dest2] "=&f"(dest2),
        [dest3] "=&f"(dest3)
      : [src_argb0] "r"(src_argb0), [dst_y] "r"(dst_y), [width] "r"(width),
        [mask] "f"(mask), [value] "f"(value), [eight] "f"(0x08),
        [zero] "f"(0x00)
      : "memory");
}

void RAWToUVRow_MMI(const uint8_t* src_rgb0,
                    int src_stride_rgb,
                    uint8_t* dst_u,
                    uint8_t* dst_v,
                    int width) {
  uint64_t src_rgb1;
  uint64_t ftmp[12];
  const uint64_t value = 0x4040;
  const uint64_t mask_u = 0x00020070004a0026;
  const uint64_t mask_v = 0x0012005e00700002;

  __asm__ volatile(
      "1:                                                               \n\t"
      "daddu      %[src_rgb1],     %[src_rgb0],       %[src_stride_rgb] \n\t"
      "gsldrc1    %[src0],         0x00(%[src_rgb0])                    \n\t"
      "gsldlc1    %[src0],         0x07(%[src_rgb0])                    \n\t"
      "gsldrc1    %[src1],         0x00(%[src_rgb1])                    \n\t"
      "gsldlc1    %[src1],         0x07(%[src_rgb1])                    \n\t"
      "punpcklbh  %[src_lo],       %[src0],           %[zero]           \n\t"
      "dsll       %[src0],         %[src0],           %[eight]          \n\t"
      "punpckhbh  %[src_hi],       %[src0],           %[zero]           \n\t"
      "paddh      %[src0],         %[src_lo],         %[src_hi]         \n\t"
      "punpcklbh  %[src_lo],       %[src1],           %[zero]           \n\t"
      "paddh      %[src0],         %[src0],           %[src_lo]         \n\t"
      "dsll       %[src1],         %[src1],           %[eight]          \n\t"
      "punpckhbh  %[src_hi],       %[src1],           %[zero]           \n\t"
      "paddh      %[src0],         %[src0],           %[src_hi]         \n\t"
      "psrlh      %[src0],         %[src0],           %[two]            \n\t"
      "pinsrh_3   %[dest0_u],      %[src0],           %[value]          \n\t"
      "dsll       %[dest0_v],      %[src0],           %[sixteen]        \n\t"
      "pinsrh_0   %[dest0_v],      %[dest0_v],        %[value]          \n\t"
      "pmaddhw    %[dest0_u],      %[dest0_u],        %[mask_u]         \n\t"
      "pmaddhw    %[dest0_v],      %[dest0_v],        %[mask_v]         \n\t"

      "gsldrc1    %[src0],         0x06(%[src_rgb0])                    \n\t"
      "gsldlc1    %[src0],         0x0d(%[src_rgb0])                    \n\t"
      "gsldrc1    %[src1],         0x06(%[src_rgb1])                    \n\t"
      "gsldlc1    %[src1],         0x0d(%[src_rgb1])                    \n\t"
      "punpcklbh  %[src_lo],       %[src0],           %[zero]           \n\t"
      "dsll       %[src0],         %[src0],           %[eight]          \n\t"
      "punpckhbh  %[src_hi],       %[src0],           %[zero]           \n\t"
      "paddh      %[src0],         %[src_lo],         %[src_hi]         \n\t"
      "punpcklbh  %[src_lo],       %[src1],           %[zero]           \n\t"
      "paddh      %[src0],         %[src0],           %[src_lo]         \n\t"
      "dsll       %[src1],         %[src1],           %[eight]          \n\t"
      "punpckhbh  %[src_hi],       %[src1],           %[zero]           \n\t"
      "paddh      %[src0],         %[src0],           %[src_hi]         \n\t"
      "psrlh      %[src0],         %[src0],           %[two]            \n\t"
      "pinsrh_3   %[src_lo],       %[src0],           %[value]          \n\t"
      "dsll       %[src_hi],       %[src0],           %[sixteen]        \n\t"
      "pinsrh_0   %[src_hi],       %[src_hi],         %[value]          \n\t"
      "pmaddhw    %[src_lo],       %[src_lo],         %[mask_u]         \n\t"
      "pmaddhw    %[src_hi],       %[src_hi],         %[mask_v]         \n\t"

      "punpcklwd  %[src0],         %[dest0_u],        %[src_lo]         \n\t"
      "punpckhwd  %[src1],         %[dest0_u],        %[src_lo]         \n\t"
      "psubw      %[dest0_u],      %[src1],           %[src0]           \n\t"
      "psraw      %[dest0_u],      %[dest0_u],        %[eight]          \n\t"
      "punpcklwd  %[src0],         %[dest0_v],        %[src_hi]         \n\t"
      "punpckhwd  %[src1],         %[dest0_v],        %[src_hi]         \n\t"
      "psubw      %[dest0_v],      %[src0],           %[src1]           \n\t"
      "psraw      %[dest0_v],      %[dest0_v],        %[eight]          \n\t"

      "gsldrc1    %[src0],         0x0c(%[src_rgb0])                    \n\t"
      "gsldlc1    %[src0],         0x13(%[src_rgb0])                    \n\t"
      "gsldrc1    %[src1],         0x0c(%[src_rgb1])                    \n\t"
      "gsldlc1    %[src1],         0x13(%[src_rgb1])                    \n\t"
      "punpcklbh  %[src_lo],       %[src0],           %[zero]           \n\t"
      "dsll       %[src0],         %[src0],           %[eight]          \n\t"
      "punpckhbh  %[src_hi],       %[src0],           %[zero]           \n\t"
      "paddh      %[src0],         %[src_lo],         %[src_hi]         \n\t"
      "punpcklbh  %[src_lo],       %[src1],           %[zero]           \n\t"
      "paddh      %[src0],         %[src0],           %[src_lo]         \n\t"
      "dsll       %[src1],         %[src1],           %[eight]          \n\t"
      "punpckhbh  %[src_hi],       %[src1],           %[zero]           \n\t"
      "paddh      %[src0],         %[src0],           %[src_hi]         \n\t"
      "psrlh      %[src0],         %[src0],           %[two]            \n\t"
      "pinsrh_3   %[dest1_u],      %[src0],           %[value]          \n\t"
      "dsll       %[dest1_v],      %[src0],           %[sixteen]        \n\t"
      "pinsrh_0   %[dest1_v],      %[dest1_v],        %[value]          \n\t"
      "pmaddhw    %[dest1_u],      %[dest1_u],        %[mask_u]         \n\t"
      "pmaddhw    %[dest1_v],      %[dest1_v],        %[mask_v]         \n\t"

      "gsldrc1    %[src0],         0x12(%[src_rgb0])                    \n\t"
      "gsldlc1    %[src0],         0x19(%[src_rgb0])                    \n\t"
      "gsldrc1    %[src1],         0x12(%[src_rgb1])                    \n\t"
      "gsldlc1    %[src1],         0x19(%[src_rgb1])                    \n\t"
      "punpcklbh  %[src_lo],       %[src0],           %[zero]           \n\t"
      "dsll       %[src0],         %[src0],           %[eight]          \n\t"
      "punpckhbh  %[src_hi],       %[src0],           %[zero]           \n\t"
      "paddh      %[src0],         %[src_lo],         %[src_hi]         \n\t"
      "punpcklbh  %[src_lo],       %[src1],           %[zero]           \n\t"
      "paddh      %[src0],         %[src0],           %[src_lo]         \n\t"
      "dsll       %[src1],         %[src1],           %[eight]          \n\t"
      "punpckhbh  %[src_hi],       %[src1],           %[zero]           \n\t"
      "paddh      %[src0],         %[src0],           %[src_hi]         \n\t"
      "psrlh      %[src0],         %[src0],           %[two]            \n\t"
      "pinsrh_3   %[src_lo],       %[src0],           %[value]          \n\t"
      "dsll       %[src_hi],       %[src0],           %[sixteen]        \n\t"
      "pinsrh_0   %[src_hi],       %[src_hi],         %[value]          \n\t"
      "pmaddhw    %[src_lo],       %[src_lo],         %[mask_u]         \n\t"
      "pmaddhw    %[src_hi],       %[src_hi],         %[mask_v]         \n\t"

      "punpcklwd  %[src0],         %[dest1_u],        %[src_lo]         \n\t"
      "punpckhwd  %[src1],         %[dest1_u],        %[src_lo]         \n\t"
      "psubw      %[dest1_u],      %[src1],           %[src0]           \n\t"
      "psraw      %[dest1_u],      %[dest1_u],        %[eight]          \n\t"
      "punpcklwd  %[src0],         %[dest1_v],        %[src_hi]         \n\t"
      "punpckhwd  %[src1],         %[dest1_v],        %[src_hi]         \n\t"
      "psubw      %[dest1_v],      %[src0],           %[src1]           \n\t"
      "psraw      %[dest1_v],      %[dest1_v],        %[eight]          \n\t"

      "gsldrc1    %[src0],         0x18(%[src_rgb0])                    \n\t"
      "gsldlc1    %[src0],         0x1f(%[src_rgb0])                    \n\t"
      "gsldrc1    %[src1],         0x18(%[src_rgb1])                    \n\t"
      "gsldlc1    %[src1],         0x1f(%[src_rgb1])                    \n\t"
      "punpcklbh  %[src_lo],       %[src0],           %[zero]           \n\t"
      "dsll       %[src0],         %[src0],           %[eight]          \n\t"
      "punpckhbh  %[src_hi],       %[src0],           %[zero]           \n\t"
      "paddh      %[src0],         %[src_lo],         %[src_hi]         \n\t"
      "punpcklbh  %[src_lo],       %[src1],           %[zero]           \n\t"
      "paddh      %[src0],         %[src0],           %[src_lo]         \n\t"
      "dsll       %[src1],         %[src1],           %[eight]          \n\t"
      "punpckhbh  %[src_hi],       %[src1],           %[zero]           \n\t"
      "paddh      %[src0],         %[src0],           %[src_hi]         \n\t"
      "psrlh      %[src0],         %[src0],           %[two]            \n\t"
      "pinsrh_3   %[dest2_u],      %[src0],           %[value]          \n\t"
      "dsll       %[dest2_v],      %[src0],           %[sixteen]        \n\t"
      "pinsrh_0   %[dest2_v],      %[dest2_v],        %[value]          \n\t"
      "pmaddhw    %[dest2_u],      %[dest2_u],        %[mask_u]         \n\t"
      "pmaddhw    %[dest2_v],      %[dest2_v],        %[mask_v]         \n\t"

      "gsldrc1    %[src0],         0x1e(%[src_rgb0])                    \n\t"
      "gsldlc1    %[src0],         0x25(%[src_rgb0])                    \n\t"
      "gsldrc1    %[src1],         0x1e(%[src_rgb1])                    \n\t"
      "gsldlc1    %[src1],         0x25(%[src_rgb1])                    \n\t"
      "punpcklbh  %[src_lo],       %[src0],           %[zero]           \n\t"
      "dsll       %[src0],         %[src0],           %[eight]          \n\t"
      "punpckhbh  %[src_hi],       %[src0],           %[zero]           \n\t"
      "paddh      %[src0],         %[src_lo],         %[src_hi]         \n\t"
      "punpcklbh  %[src_lo],       %[src1],           %[zero]           \n\t"
      "paddh      %[src0],         %[src0],           %[src_lo]         \n\t"
      "dsll       %[src1],         %[src1],           %[eight]          \n\t"
      "punpckhbh  %[src_hi],       %[src1],           %[zero]           \n\t"
      "paddh      %[src0],         %[src0],           %[src_hi]         \n\t"
      "psrlh      %[src0],         %[src0],           %[two]            \n\t"
      "pinsrh_3   %[src_lo],       %[src0],           %[value]          \n\t"
      "dsll       %[src_hi],       %[src0],           %[sixteen]        \n\t"
      "pinsrh_0   %[src_hi],       %[src_hi],         %[value]          \n\t"
      "pmaddhw    %[src_lo],       %[src_lo],         %[mask_u]         \n\t"
      "pmaddhw    %[src_hi],       %[src_hi],         %[mask_v]         \n\t"

      "punpcklwd  %[src0],         %[dest2_u],        %[src_lo]         \n\t"
      "punpckhwd  %[src1],         %[dest2_u],        %[src_lo]         \n\t"
      "psubw      %[dest2_u],      %[src1],           %[src0]           \n\t"
      "psraw      %[dest2_u],      %[dest2_u],        %[eight]          \n\t"
      "punpcklwd  %[src0],         %[dest2_v],        %[src_hi]         \n\t"
      "punpckhwd  %[src1],         %[dest2_v],        %[src_hi]         \n\t"
      "psubw      %[dest2_v],      %[src0],           %[src1]           \n\t"
      "psraw      %[dest2_v],      %[dest2_v],        %[eight]          \n\t"

      "gsldrc1    %[src0],         0x24(%[src_rgb0])                    \n\t"
      "gsldlc1    %[src0],         0x2b(%[src_rgb0])                    \n\t"
      "gsldrc1    %[src1],         0x24(%[src_rgb1])                    \n\t"
      "gsldlc1    %[src1],         0x2b(%[src_rgb1])                    \n\t"
      "punpcklbh  %[src_lo],       %[src0],           %[zero]           \n\t"
      "dsll       %[src0],         %[src0],           %[eight]          \n\t"
      "punpckhbh  %[src_hi],       %[src0],           %[zero]           \n\t"
      "paddh      %[src0],         %[src_lo],         %[src_hi]         \n\t"
      "punpcklbh  %[src_lo],       %[src1],           %[zero]           \n\t"
      "paddh      %[src0],         %[src0],           %[src_lo]         \n\t"
      "dsll       %[src1],         %[src1],           %[eight]          \n\t"
      "punpckhbh  %[src_hi],       %[src1],           %[zero]           \n\t"
      "paddh      %[src0],         %[src0],           %[src_hi]         \n\t"
      "psrlh      %[src0],         %[src0],           %[two]            \n\t"
      "pinsrh_3   %[dest3_u],      %[src0],           %[value]          \n\t"
      "dsll       %[dest3_v],      %[src0],           %[sixteen]        \n\t"
      "pinsrh_0   %[dest3_v],      %[dest3_v],        %[value]          \n\t"
      "pmaddhw    %[dest3_u],      %[dest3_u],        %[mask_u]         \n\t"
      "pmaddhw    %[dest3_v],      %[dest3_v],        %[mask_v]         \n\t"

      "gsldrc1    %[src0],         0x2a(%[src_rgb0])                    \n\t"
      "gsldlc1    %[src0],         0x31(%[src_rgb0])                    \n\t"
      "gsldrc1    %[src1],         0x2a(%[src_rgb1])                    \n\t"
      "gsldlc1    %[src1],         0x31(%[src_rgb1])                    \n\t"
      "punpcklbh  %[src_lo],       %[src0],           %[zero]           \n\t"
      "dsll       %[src0],         %[src0],           %[eight]          \n\t"
      "punpckhbh  %[src_hi],       %[src0],           %[zero]           \n\t"
      "paddh      %[src0],         %[src_lo],         %[src_hi]         \n\t"
      "punpcklbh  %[src_lo],       %[src1],           %[zero]           \n\t"
      "paddh      %[src0],         %[src0],           %[src_lo]         \n\t"
      "dsll       %[src1],         %[src1],           %[eight]          \n\t"
      "punpckhbh  %[src_hi],       %[src1],           %[zero]           \n\t"
      "paddh      %[src0],         %[src0],           %[src_hi]         \n\t"
      "psrlh      %[src0],         %[src0],           %[two]            \n\t"
      "pinsrh_3   %[src_lo],       %[src0],           %[value]          \n\t"
      "dsll       %[src_hi],       %[src0],           %[sixteen]        \n\t"
      "pinsrh_0   %[src_hi],       %[src_hi],         %[value]          \n\t"
      "pmaddhw    %[src_lo],       %[src_lo],         %[mask_u]         \n\t"
      "pmaddhw    %[src_hi],       %[src_hi],         %[mask_v]         \n\t"

      "punpcklwd  %[src0],         %[dest3_u],        %[src_lo]         \n\t"
      "punpckhwd  %[src1],         %[dest3_u],        %[src_lo]         \n\t"
      "psubw      %[dest3_u],      %[src1],           %[src0]           \n\t"
      "psraw      %[dest3_u],      %[dest3_u],        %[eight]          \n\t"
      "punpcklwd  %[src0],         %[dest3_v],        %[src_hi]         \n\t"
      "punpckhwd  %[src1],         %[dest3_v],        %[src_hi]         \n\t"
      "psubw      %[dest3_v],      %[src0],           %[src1]           \n\t"
      "psraw      %[dest3_v],      %[dest3_v],        %[eight]          \n\t"

      "packsswh   %[src0],         %[dest0_u],        %[dest1_u]        \n\t"
      "packsswh   %[src1],         %[dest2_u],        %[dest3_u]        \n\t"
      "packushb   %[dest0_u],      %[src0],           %[src1]           \n\t"
      "gssdlc1    %[dest0_u],      0x07(%[dst_u])                       \n\t"
      "gssdrc1    %[dest0_u],      0x00(%[dst_u])                       \n\t"

      "packsswh   %[src0],         %[dest0_v],        %[dest1_v]        \n\t"
      "packsswh   %[src1],         %[dest2_v],        %[dest3_v]        \n\t"
      "packushb   %[dest0_v],      %[src0],           %[src1]           \n\t"
      "gssdlc1    %[dest0_v],      0x07(%[dst_v])                       \n\t"
      "gssdrc1    %[dest0_v],      0x00(%[dst_v])                       \n\t"

      "daddiu     %[src_rgb0],     %[src_rgb0],       0x30              \n\t"
      "daddiu     %[dst_u],        %[dst_u],          0x08              \n\t"
      "daddiu     %[dst_v],        %[dst_v],          0x08              \n\t"
      "daddi      %[width],        %[width],         -0x10              \n\t"
      "bgtz       %[width],        1b                                   \n\t"
      : [src_rgb1] "=&r"(src_rgb1), [src0] "=&f"(ftmp[0]),
        [src1] "=&f"(ftmp[1]), [src_lo] "=&f"(ftmp[2]), [src_hi] "=&f"(ftmp[3]),
        [dest0_u] "=&f"(ftmp[4]), [dest0_v] "=&f"(ftmp[5]),
        [dest1_u] "=&f"(ftmp[6]), [dest1_v] "=&f"(ftmp[7]),
        [dest2_u] "=&f"(ftmp[8]), [dest2_v] "=&f"(ftmp[9]),
        [dest3_u] "=&f"(ftmp[10]), [dest3_v] "=&f"(ftmp[11])
      : [src_rgb0] "r"(src_rgb0), [src_stride_rgb] "r"(src_stride_rgb),
        [dst_u] "r"(dst_u), [dst_v] "r"(dst_v), [width] "r"(width),
        [mask_u] "f"(mask_u), [mask_v] "f"(mask_v), [value] "f"(value),
        [zero] "f"(0x00), [eight] "f"(0x08), [two] "f"(0x02),
        [sixteen] "f"(0x10)
      : "memory");
}

void ARGBToYJRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) {
  uint64_t src, src_hi, src_lo;
  uint64_t dest, dest0, dest1, dest2, dest3;
  uint64_t tmp0, tmp1;
  const uint64_t shift = 0x07;
  const uint64_t value = 0x0040;
  const uint64_t mask0 = 0x0;
  const uint64_t mask1 = 0x00010026004B000FULL;

  __asm__ volatile(
      "1:                                                           \n\t"
      "gsldlc1    %[src],          0x07(%[src_ptr])                 \n\t"
      "gsldrc1    %[src],          0x00(%[src_ptr])                 \n\t"
      "punpcklbh  %[src_lo],       %[src],            %[mask0]      \n\t"
      "pinsrh_3   %[src_lo],       %[src_lo],         %[value]      \n\t"
      "pmaddhw    %[src_lo],       %[src_lo],         %[mask1]      \n\t"
      "punpckhbh  %[src_hi],       %[src],            %[mask0]      \n\t"
      "pinsrh_3   %[src_hi],       %[src_hi],         %[value]      \n\t"
      "pmaddhw    %[src_hi],       %[src_hi],         %[mask1]      \n\t"
      "punpcklwd  %[tmp0],         %[src_lo],         %[src_hi]     \n\t"
      "punpckhwd  %[tmp1],         %[src_lo],         %[src_hi]     \n\t"
      "paddw      %[dest0],        %[tmp0],           %[tmp1]       \n\t"
      "psrlw      %[dest0],        %[dest0],          %[shift]      \n\t"

      "gsldlc1    %[src],          0x0f(%[src_ptr])                 \n\t"
      "gsldrc1    %[src],          0x08(%[src_ptr])                 \n\t"
      "punpcklbh  %[src_lo],       %[src],            %[mask0]      \n\t"
      "pinsrh_3   %[src_lo],       %[src_lo],         %[value]      \n\t"
      "pmaddhw    %[src_lo],       %[src_lo],         %[mask1]      \n\t"
      "punpckhbh  %[src_hi],       %[src],            %[mask0]      \n\t"
      "pinsrh_3   %[src_hi],       %[src_hi],         %[value]      \n\t"
      "pmaddhw    %[src_hi],       %[src_hi],         %[mask1]      \n\t"
      "punpcklwd  %[tmp0],         %[src_lo],         %[src_hi]     \n\t"
      "punpckhwd  %[tmp1],         %[src_lo],         %[src_hi]     \n\t"
      "paddw      %[dest1],        %[tmp0],           %[tmp1]       \n\t"
      "psrlw      %[dest1],        %[dest1],          %[shift]      \n\t"

      "gsldlc1    %[src],          0x17(%[src_ptr])                 \n\t"
      "gsldrc1    %[src],          0x10(%[src_ptr])                 \n\t"
      "punpcklbh  %[src_lo],       %[src],            %[mask0]      \n\t"
      "pinsrh_3   %[src_lo],       %[src_lo],         %[value]      \n\t"
      "pmaddhw    %[src_lo],       %[src_lo],         %[mask1]      \n\t"
      "punpckhbh  %[src_hi],       %[src],            %[mask0]      \n\t"
      "pinsrh_3   %[src_hi],       %[src_hi],         %[value]      \n\t"
      "pmaddhw    %[src_hi],       %[src_hi],         %[mask1]      \n\t"
      "punpcklwd  %[tmp0],         %[src_lo],         %[src_hi]     \n\t"
      "punpckhwd  %[tmp1],         %[src_lo],         %[src_hi]     \n\t"
      "paddw      %[dest2],        %[tmp0],           %[tmp1]       \n\t"
      "psrlw      %[dest2],        %[dest2],          %[shift]      \n\t"

      "gsldlc1    %[src],          0x1f(%[src_ptr])                 \n\t"
      "gsldrc1    %[src],          0x18(%[src_ptr])                 \n\t"
      "punpcklbh  %[src_lo],       %[src],            %[mask0]      \n\t"
      "pinsrh_3   %[src_lo],       %[src_lo],         %[value]      \n\t"
      "pmaddhw    %[src_lo],       %[src_lo],         %[mask1]      \n\t"
      "punpckhbh  %[src_hi],       %[src],            %[mask0]      \n\t"
      "pinsrh_3   %[src_hi],       %[src_hi],         %[value]      \n\t"
      "pmaddhw    %[src_hi],       %[src_hi],         %[mask1]      \n\t"
      "punpcklwd  %[tmp0],         %[src_lo],         %[src_hi]     \n\t"
      "punpckhwd  %[tmp1],         %[src_lo],         %[src_hi]     \n\t"
      "paddw      %[dest3],        %[tmp0],           %[tmp1]       \n\t"
      "psrlw      %[dest3],        %[dest3],          %[shift]      \n\t"

      "packsswh   %[tmp0],         %[dest0],          %[dest1]      \n\t"
      "packsswh   %[tmp1],         %[dest2],          %[dest3]      \n\t"
      "packushb   %[dest],         %[tmp0],           %[tmp1]       \n\t"
      "gssdlc1    %[dest],         0x07(%[dst_ptr])                 \n\t"
      "gssdrc1    %[dest],         0x00(%[dst_ptr])                 \n\t"

      "daddiu     %[src_ptr],      %[src_ptr],        0x20          \n\t"
      "daddiu     %[dst_ptr],      %[dst_ptr],        0x08          \n\t"
      "daddi      %[width],        %[width],         -0x08          \n\t"
      "bnez       %[width],        1b                               \n\t"
      : [src] "=&f"(src), [dest] "=&f"(dest), [src_hi] "=&f"(src_hi),
        [src_lo] "=&f"(src_lo), [dest0] "=&f"(dest0), [dest1] "=&f"(dest1),
        [dest2] "=&f"(dest2), [dest3] "=&f"(dest3), [tmp0] "=&f"(tmp0),
        [tmp1] "=&f"(tmp1)
      : [src_ptr] "r"(src_argb0), [dst_ptr] "r"(dst_y), [mask0] "f"(mask0),
        [mask1] "f"(mask1), [shift] "f"(shift), [value] "f"(value),
        [width] "r"(width)
      : "memory");
}

void ARGBToUVJRow_MMI(const uint8_t* src_rgb0,
                      int src_stride_rgb,
                      uint8_t* dst_u,
                      uint8_t* dst_v,
                      int width) {
  uint64_t src_rgb1;
  uint64_t ftmp[12];
  const uint64_t value = 0x4040;
  const uint64_t mask_u = 0x002b0054007f0002;
  const uint64_t mask_v = 0x0002007f006b0014;

  __asm__ volatile(
      "1:                                                               \n\t"
      "daddu      %[src_rgb1],     %[src_rgb0],       %[src_stride_rgb] \n\t"
      "gsldrc1    %[src0],         0x00(%[src_rgb0])                    \n\t"
      "gsldlc1    %[src0],         0x07(%[src_rgb0])                    \n\t"
      "gsldrc1    %[src1],         0x00(%[src_rgb1])                    \n\t"
      "gsldlc1    %[src1],         0x07(%[src_rgb1])                    \n\t"
      "punpcklbh  %[src_lo],       %[src0],           %[zero]           \n\t"
      "punpckhbh  %[src_hi],       %[src0],           %[zero]           \n\t"
      "punpcklbh  %[src0],         %[src1],           %[zero]           \n\t"
      "punpckhbh  %[src1],         %[src1],           %[zero]           \n\t"
      "pavgh      %[src0],         %[src_lo],         %[src0]           \n\t"
      "pavgh      %[src1],         %[src_hi],         %[src1]           \n\t"
      "pavgh      %[src0],         %[src0],           %[src1]           \n\t"
      "dsll       %[dest0_u],      %[src0],           %[sixteen]        \n\t"
      "pinsrh_0   %[dest0_u],      %[dest0_u],        %[value]          \n\t"
      "pinsrh_3   %[dest0_v],      %[src0],           %[value]          \n\t"
      "pmaddhw    %[dest0_u],      %[dest0_u],        %[mask_u]         \n\t"
      "pmaddhw    %[dest0_v],      %[dest0_v],        %[mask_v]         \n\t"

      "gsldrc1    %[src0],         0x08(%[src_rgb0])                    \n\t"
      "gsldlc1    %[src0],         0x0f(%[src_rgb0])                    \n\t"
      "gsldrc1    %[src1],         0x08(%[src_rgb1])                    \n\t"
      "gsldlc1    %[src1],         0x0f(%[src_rgb1])                    \n\t"
      "punpcklbh  %[src_lo],       %[src0],           %[zero]           \n\t"
      "punpckhbh  %[src_hi],       %[src0],           %[zero]           \n\t"
      "punpcklbh  %[src0],         %[src1],           %[zero]           \n\t"
      "punpckhbh  %[src1],         %[src1],           %[zero]           \n\t"
      "pavgh      %[src0],         %[src_lo],         %[src0]           \n\t"
      "pavgh      %[src1],         %[src_hi],         %[src1]           \n\t"
      "pavgh      %[src0],         %[src0],           %[src1]           \n\t"
      "dsll       %[src_lo],       %[src0],           %[sixteen]        \n\t"
      "pinsrh_0   %[src_lo],       %[src_lo],         %[value]          \n\t"
      "pinsrh_3   %[src_hi],       %[src0],           %[value]          \n\t"
      "pmaddhw    %[src_lo],       %[src_lo],         %[mask_u]         \n\t"
      "pmaddhw    %[src_hi],       %[src_hi],         %[mask_v]         \n\t"

      "punpcklwd  %[src0],         %[dest0_u],        %[src_lo]         \n\t"
      "punpckhwd  %[src1],         %[dest0_u],        %[src_lo]         \n\t"
      "psubw      %[dest0_u],      %[src0],           %[src1]           \n\t"
      "psraw      %[dest0_u],      %[dest0_u],        %[eight]          \n\t"
      "punpcklwd  %[src0],         %[dest0_v],        %[src_hi]         \n\t"
      "punpckhwd  %[src1],         %[dest0_v],        %[src_hi]         \n\t"
      "psubw      %[dest0_v],      %[src1],           %[src0]           \n\t"
      "psraw      %[dest0_v],      %[dest0_v],        %[eight]          \n\t"

      "gsldrc1    %[src0],         0x10(%[src_rgb0])                    \n\t"
      "gsldlc1    %[src0],         0x17(%[src_rgb0])                    \n\t"
      "gsldrc1    %[src1],         0x10(%[src_rgb1])                    \n\t"
      "gsldlc1    %[src1],         0x17(%[src_rgb1])                    \n\t"
      "punpcklbh  %[src_lo],       %[src0],           %[zero]           \n\t"
      "punpckhbh  %[src_hi],       %[src0],           %[zero]           \n\t"
      "punpcklbh  %[src0],         %[src1],           %[zero]           \n\t"
      "punpckhbh  %[src1],         %[src1],           %[zero]           \n\t"
      "pavgh      %[src0],         %[src_lo],         %[src0]           \n\t"
      "pavgh      %[src1],         %[src_hi],         %[src1]           \n\t"
      "pavgh      %[src0],         %[src0],           %[src1]           \n\t"
      "dsll       %[dest1_u],      %[src0],           %[sixteen]        \n\t"
      "pinsrh_0   %[dest1_u],      %[dest1_u],        %[value]          \n\t"
      "pinsrh_3   %[dest1_v],      %[src0],           %[value]          \n\t"
      "pmaddhw    %[dest1_u],      %[dest1_u],        %[mask_u]         \n\t"
      "pmaddhw    %[dest1_v],      %[dest1_v],        %[mask_v]         \n\t"

      "gsldrc1    %[src0],         0x18(%[src_rgb0])                    \n\t"
      "gsldlc1    %[src0],         0x1f(%[src_rgb0])                    \n\t"
      "gsldrc1    %[src1],         0x18(%[src_rgb1])                    \n\t"
      "gsldlc1    %[src1],         0x1f(%[src_rgb1])                    \n\t"
      "punpcklbh  %[src_lo],       %[src0],           %[zero]           \n\t"
      "punpckhbh  %[src_hi],       %[src0],           %[zero]           \n\t"
      "punpcklbh  %[src0],         %[src1],           %[zero]           \n\t"
      "punpckhbh  %[src1],         %[src1],           %[zero]           \n\t"
      "pavgh      %[src0],         %[src_lo],         %[src0]           \n\t"
      "pavgh      %[src1],         %[src_hi],         %[src1]           \n\t"
      "pavgh      %[src0],         %[src0],           %[src1]           \n\t"
      "dsll       %[src_lo],       %[src0],           %[sixteen]        \n\t"
      "pinsrh_0   %[src_lo],       %[src_lo],         %[value]          \n\t"
      "pinsrh_3   %[src_hi],       %[src0],           %[value]          \n\t"
      "pmaddhw    %[src_lo],       %[src_lo],         %[mask_u]         \n\t"
      "pmaddhw    %[src_hi],       %[src_hi],         %[mask_v]         \n\t"

      "punpcklwd  %[src0],         %[dest1_u],        %[src_lo]         \n\t"
      "punpckhwd  %[src1],         %[dest1_u],        %[src_lo]         \n\t"
      "psubw      %[dest1_u],      %[src0],           %[src1]           \n\t"
      "psraw      %[dest1_u],      %[dest1_u],        %[eight]          \n\t"
      "punpcklwd  %[src0],         %[dest1_v],        %[src_hi]         \n\t"
      "punpckhwd  %[src1],         %[dest1_v],        %[src_hi]         \n\t"
      "psubw      %[dest1_v],      %[src1],           %[src0]           \n\t"
      "psraw      %[dest1_v],      %[dest1_v],        %[eight]          \n\t"

      "gsldrc1    %[src0],         0x20(%[src_rgb0])                    \n\t"
      "gsldlc1    %[src0],         0x27(%[src_rgb0])                    \n\t"
      "gsldrc1    %[src1],         0x20(%[src_rgb1])                    \n\t"
      "gsldlc1    %[src1],         0x27(%[src_rgb1])                    \n\t"
      "punpcklbh  %[src_lo],       %[src0],           %[zero]           \n\t"
      "punpckhbh  %[src_hi],       %[src0],           %[zero]           \n\t"
      "punpcklbh  %[src0],         %[src1],           %[zero]           \n\t"
      "punpckhbh  %[src1],         %[src1],           %[zero]           \n\t"
      "pavgh      %[src0],         %[src_lo],         %[src0]           \n\t"
      "pavgh      %[src1],         %[src_hi],         %[src1]           \n\t"
      "pavgh      %[src0],         %[src0],           %[src1]           \n\t"
      "dsll       %[dest2_u],      %[src0],           %[sixteen]        \n\t"
      "pinsrh_0   %[dest2_u],      %[dest2_u],        %[value]          \n\t"
      "pinsrh_3   %[dest2_v],      %[src0],           %[value]          \n\t"
      "pmaddhw    %[dest2_u],      %[dest2_u],        %[mask_u]         \n\t"
      "pmaddhw    %[dest2_v],      %[dest2_v],        %[mask_v]         \n\t"

      "gsldrc1    %[src0],         0x28(%[src_rgb0])                    \n\t"
      "gsldlc1    %[src0],         0x2f(%[src_rgb0])                    \n\t"
      "gsldrc1    %[src1],         0x28(%[src_rgb1])                    \n\t"
      "gsldlc1    %[src1],         0x2f(%[src_rgb1])                    \n\t"
      "punpcklbh  %[src_lo],       %[src0],           %[zero]           \n\t"
      "punpckhbh  %[src_hi],       %[src0],           %[zero]           \n\t"
      "punpcklbh  %[src0],         %[src1],           %[zero]           \n\t"
      "punpckhbh  %[src1],         %[src1],           %[zero]           \n\t"
      "pavgh      %[src0],         %[src_lo],         %[src0]           \n\t"
      "pavgh      %[src1],         %[src_hi],         %[src1]           \n\t"
      "pavgh      %[src0],         %[src0],           %[src1]           \n\t"
      "dsll       %[src_lo],       %[src0],           %[sixteen]        \n\t"
      "pinsrh_0   %[src_lo],       %[src_lo],         %[value]          \n\t"
      "pinsrh_3   %[src_hi],       %[src0],           %[value]          \n\t"
      "pmaddhw    %[src_lo],       %[src_lo],         %[mask_u]         \n\t"
      "pmaddhw    %[src_hi],       %[src_hi],         %[mask_v]         \n\t"

      "punpcklwd  %[src0],         %[dest2_u],        %[src_lo]         \n\t"
      "punpckhwd  %[src1],         %[dest2_u],        %[src_lo]         \n\t"
      "psubw      %[dest2_u],      %[src0],           %[src1]           \n\t"
      "psraw      %[dest2_u],      %[dest2_u],        %[eight]          \n\t"
      "punpcklwd  %[src0],         %[dest2_v],        %[src_hi]         \n\t"
      "punpckhwd  %[src1],         %[dest2_v],        %[src_hi]         \n\t"
      "psubw      %[dest2_v],      %[src1],           %[src0]           \n\t"
      "psraw      %[dest2_v],      %[dest2_v],        %[eight]          \n\t"

      "gsldrc1    %[src0],         0x30(%[src_rgb0])                    \n\t"
      "gsldlc1    %[src0],         0x37(%[src_rgb0])                    \n\t"
      "gsldrc1    %[src1],         0x30(%[src_rgb1])                    \n\t"
      "gsldlc1    %[src1],         0x37(%[src_rgb1])                    \n\t"
      "punpcklbh  %[src_lo],       %[src0],           %[zero]           \n\t"
      "punpckhbh  %[src_hi],       %[src0],           %[zero]           \n\t"
      "punpcklbh  %[src0],         %[src1],           %[zero]           \n\t"
      "punpckhbh  %[src1],         %[src1],           %[zero]           \n\t"
      "pavgh      %[src0],         %[src_lo],         %[src0]           \n\t"
      "pavgh      %[src1],         %[src_hi],         %[src1]           \n\t"
      "pavgh      %[src0],         %[src0],           %[src1]           \n\t"
      "dsll       %[dest3_u],      %[src0],           %[sixteen]        \n\t"
      "pinsrh_0   %[dest3_u],      %[dest3_u],        %[value]          \n\t"
      "pinsrh_3   %[dest3_v],      %[src0],           %[value]          \n\t"
      "pmaddhw    %[dest3_u],      %[dest3_u],        %[mask_u]         \n\t"
      "pmaddhw    %[dest3_v],      %[dest3_v],        %[mask_v]         \n\t"

      "gsldrc1    %[src0],         0x38(%[src_rgb0])                    \n\t"
      "gsldlc1    %[src0],         0x3f(%[src_rgb0])                    \n\t"
      "gsldrc1    %[src1],         0x38(%[src_rgb1])                    \n\t"
      "gsldlc1    %[src1],         0x3f(%[src_rgb1])                    \n\t"
      "punpcklbh  %[src_lo],       %[src0],           %[zero]           \n\t"
      "punpckhbh  %[src_hi],       %[src0],           %[zero]           \n\t"
      "punpcklbh  %[src0],         %[src1],           %[zero]           \n\t"
      "punpckhbh  %[src1],         %[src1],           %[zero]           \n\t"
      "pavgh      %[src0],         %[src_lo],         %[src0]           \n\t"
      "pavgh      %[src1],         %[src_hi],         %[src1]           \n\t"
      "pavgh      %[src0],         %[src0],           %[src1]           \n\t"
      "dsll       %[src_lo],       %[src0],           %[sixteen]        \n\t"
      "pinsrh_0   %[src_lo],       %[src_lo],         %[value]          \n\t"
      "pinsrh_3   %[src_hi],       %[src0],           %[value]          \n\t"
      "pmaddhw    %[src_lo],       %[src_lo],         %[mask_u]         \n\t"
      "pmaddhw    %[src_hi],       %[src_hi],         %[mask_v]         \n\t"

      "punpcklwd  %[src0],         %[dest3_u],        %[src_lo]         \n\t"
      "punpckhwd  %[src1],         %[dest3_u],        %[src_lo]         \n\t"
      "psubw      %[dest3_u],      %[src0],           %[src1]           \n\t"
      "psraw      %[dest3_u],      %[dest3_u],        %[eight]          \n\t"
      "punpcklwd  %[src0],         %[dest3_v],        %[src_hi]         \n\t"
      "punpckhwd  %[src1],         %[dest3_v],        %[src_hi]         \n\t"
      "psubw      %[dest3_v],      %[src1],           %[src0]           \n\t"
      "psraw      %[dest3_v],      %[dest3_v],        %[eight]          \n\t"

      "packsswh   %[src0],         %[dest0_u],        %[dest1_u]        \n\t"
      "packsswh   %[src1],         %[dest2_u],        %[dest3_u]        \n\t"
      "packushb   %[dest0_u],      %[src0],           %[src1]           \n\t"
      "gssdlc1    %[dest0_u],      0x07(%[dst_u])                       \n\t"
      "gssdrc1    %[dest0_u],      0x00(%[dst_u])                       \n\t"

      "packsswh   %[src0],         %[dest0_v],        %[dest1_v]        \n\t"
      "packsswh   %[src1],         %[dest2_v],        %[dest3_v]        \n\t"
      "packushb   %[dest0_v],      %[src0],           %[src1]           \n\t"
      "gssdlc1    %[dest0_v],      0x07(%[dst_v])                       \n\t"
      "gssdrc1    %[dest0_v],      0x00(%[dst_v])                       \n\t"

      "daddiu     %[src_rgb0],     %[src_rgb0],       0x40              \n\t"
      "daddiu     %[dst_u],        %[dst_u],          0x08              \n\t"
      "daddiu     %[dst_v],        %[dst_v],          0x08              \n\t"
      "daddi      %[width],        %[width],         -0x10              \n\t"
      "bgtz       %[width],        1b                                   \n\t"
      : [src_rgb1] "=&r"(src_rgb1), [src0] "=&f"(ftmp[0]),
        [src1] "=&f"(ftmp[1]), [src_lo] "=&f"(ftmp[2]), [src_hi] "=&f"(ftmp[3]),
        [dest0_u] "=&f"(ftmp[4]), [dest0_v] "=&f"(ftmp[5]),
        [dest1_u] "=&f"(ftmp[6]), [dest1_v] "=&f"(ftmp[7]),
        [dest2_u] "=&f"(ftmp[8]), [dest2_v] "=&f"(ftmp[9]),
        [dest3_u] "=&f"(ftmp[10]), [dest3_v] "=&f"(ftmp[11])
      : [src_rgb0] "r"(src_rgb0), [src_stride_rgb] "r"(src_stride_rgb),
        [dst_u] "r"(dst_u), [dst_v] "r"(dst_v), [width] "r"(width),
        [mask_u] "f"(mask_u), [mask_v] "f"(mask_v), [value] "f"(value),
        [zero] "f"(0x00), [eight] "f"(0x08), [two] "f"(0x02),
        [sixteen] "f"(0x10)
      : "memory");
}

void RGB565ToYRow_MMI(const uint8_t* src_rgb565, uint8_t* dst_y, int width) {
  uint64_t ftmp[11];
  const uint64_t value = 0x1080108010801080;
  const uint64_t mask = 0x0001004200810019;
  uint64_t c0 = 0x001f001f001f001f;
  uint64_t c1 = 0x00ff00ff00ff00ff;
  uint64_t c2 = 0x0007000700070007;
  __asm__ volatile(
      "1:                                                            \n\t"
      "gsldrc1    %[src0],        0x00(%[src_rgb565])                \n\t"
      "gsldlc1    %[src0],        0x07(%[src_rgb565])                \n\t"
      "psrlh      %[src1],        %[src0],             %[eight]      \n\t"
      "and        %[b],           %[src0],             %[c0]         \n\t"
      "and        %[src0],        %[src0],             %[c1]         \n\t"
      "psrlh      %[src0],        %[src0],             %[five]       \n\t"
      "and        %[g],           %[src1],             %[c2]         \n\t"
      "psllh      %[g],           %[g],                %[three]      \n\t"
      "or         %[g],           %[src0],             %[g]          \n\t"
      "psrlh      %[r],           %[src1],             %[three]      \n\t"
      "psllh      %[src0],        %[b],                %[three]      \n\t"
      "psrlh      %[src1],        %[b],                %[two]        \n\t"
      "or         %[b],           %[src0],             %[src1]       \n\t"
      "psllh      %[src0],        %[g],                %[two]        \n\t"
      "psrlh      %[src1],        %[g],                %[four]       \n\t"
      "or         %[g],           %[src0],             %[src1]       \n\t"
      "psllh      %[src0],        %[r],                %[three]      \n\t"
      "psrlh      %[src1],        %[r],                %[two]        \n\t"
      "or         %[r],           %[src0],             %[src1]       \n\t"
      "punpcklhw  %[src0],        %[b],                %[r]          \n\t"
      "punpcklhw  %[src1],        %[g],                %[value]      \n\t"
      "punpcklhw  %[src_lo],      %[src0],             %[src1]       \n\t"
      "punpckhhw  %[src_hi],      %[src0],             %[src1]       \n\t"
      "pmaddhw    %[src_lo],      %[src_lo],           %[mask]       \n\t"
      "pmaddhw    %[src_hi],      %[src_hi],           %[mask]       \n\t"
      "punpcklwd  %[src0],        %[src_lo],           %[src_hi]     \n\t"
      "punpckhwd  %[src1],        %[src_lo],           %[src_hi]     \n\t"
      "paddw      %[dest0],       %[src0],             %[src1]       \n\t"
      "psrlw      %[dest0],       %[dest0],            %[eight]      \n\t"

      "punpckhhw  %[src0],        %[b],                %[r]          \n\t"
      "punpckhhw  %[src1],        %[g],                %[value]      \n\t"
      "punpcklhw  %[src_lo],      %[src0],             %[src1]       \n\t"
      "punpckhhw  %[src_hi],      %[src0],             %[src1]       \n\t"
      "pmaddhw    %[src_lo],      %[src_lo],           %[mask]       \n\t"
      "pmaddhw    %[src_hi],      %[src_hi],           %[mask]       \n\t"
      "punpcklwd  %[src0],        %[src_lo],           %[src_hi]     \n\t"
      "punpckhwd  %[src1],        %[src_lo],           %[src_hi]     \n\t"
      "paddw      %[dest1],       %[src0],             %[src1]       \n\t"
      "psrlw      %[dest1],       %[dest1],            %[eight]      \n\t"

      "gsldrc1    %[src0],        0x08(%[src_rgb565])                \n\t"
      "gsldlc1    %[src0],        0x0f(%[src_rgb565])                \n\t"
      "psrlh      %[src1],        %[src0],             %[eight]      \n\t"
      "and        %[b],           %[src0],             %[c0]         \n\t"
      "and        %[src0],        %[src0],             %[c1]         \n\t"
      "psrlh      %[src0],        %[src0],             %[five]       \n\t"
      "and        %[g],           %[src1],             %[c2]         \n\t"
      "psllh      %[g],           %[g],                %[three]      \n\t"
      "or         %[g],           %[src0],             %[g]          \n\t"
      "psrlh      %[r],           %[src1],             %[three]      \n\t"
      "psllh      %[src0],        %[b],                %[three]      \n\t"
      "psrlh      %[src1],        %[b],                %[two]        \n\t"
      "or         %[b],           %[src0],             %[src1]       \n\t"
      "psllh      %[src0],        %[g],                %[two]        \n\t"
      "psrlh      %[src1],        %[g],                %[four]       \n\t"
      "or         %[g],           %[src0],             %[src1]       \n\t"
      "psllh      %[src0],        %[r],                %[three]      \n\t"
      "psrlh      %[src1],        %[r],                %[two]        \n\t"
      "or         %[r],           %[src0],             %[src1]       \n\t"
      "punpcklhw  %[src0],        %[b],                %[r]          \n\t"
      "punpcklhw  %[src1],        %[g],                %[value]      \n\t"
      "punpcklhw  %[src_lo],      %[src0],             %[src1]       \n\t"
      "punpckhhw  %[src_hi],      %[src0],             %[src1]       \n\t"
      "pmaddhw    %[src_lo],      %[src_lo],           %[mask]       \n\t"
      "pmaddhw    %[src_hi],      %[src_hi],           %[mask]       \n\t"
      "punpcklwd  %[src0],        %[src_lo],           %[src_hi]     \n\t"
      "punpckhwd  %[src1],        %[src_lo],           %[src_hi]     \n\t"
      "paddw      %[dest2],       %[src0],             %[src1]       \n\t"
      "psrlw      %[dest2],       %[dest2],            %[eight]      \n\t"

      "punpckhhw  %[src0],        %[b],                %[r]          \n\t"
      "punpckhhw  %[src1],        %[g],                %[value]      \n\t"
      "punpcklhw  %[src_lo],      %[src0],             %[src1]       \n\t"
      "punpckhhw  %[src_hi],      %[src0],             %[src1]       \n\t"
      "pmaddhw    %[src_lo],      %[src_lo],           %[mask]       \n\t"
      "pmaddhw    %[src_hi],      %[src_hi],           %[mask]       \n\t"
      "punpcklwd  %[src0],        %[src_lo],           %[src_hi]     \n\t"
      "punpckhwd  %[src1],        %[src_lo],           %[src_hi]     \n\t"
      "paddw      %[dest3],       %[src0],             %[src1]       \n\t"
      "psrlw      %[dest3],       %[dest3],            %[eight]      \n\t"

      "packsswh   %[src_lo],      %[dest0],            %[dest1]      \n\t"
      "packsswh   %[src_hi],      %[dest2],            %[dest3]      \n\t"
      "packushb   %[dest0],       %[src_lo],           %[src_hi]     \n\t"
      "gssdlc1    %[dest0],       0x07(%[dst_y])                     \n\t"
      "gssdrc1    %[dest0],       0x00(%[dst_y])                     \n\t"

      "daddiu    %[src_rgb565],   %[src_rgb565],       0x10          \n\t"
      "daddiu    %[dst_y],        %[dst_y],            0x08          \n\t"
      "daddiu    %[width],        %[width],           -0x08          \n\t"
      "bgtz      %[width],        1b                                 \n\t"
      : [src0] "=&f"(ftmp[0]), [src1] "=&f"(ftmp[1]), [src_lo] "=&f"(ftmp[2]),
        [src_hi] "=&f"(ftmp[3]), [b] "=&f"(ftmp[4]), [g] "=&f"(ftmp[5]),
        [r] "=&f"(ftmp[6]), [dest0] "=&f"(ftmp[7]), [dest1] "=&f"(ftmp[8]),
        [dest2] "=&f"(ftmp[9]), [dest3] "=&f"(ftmp[10])
      : [src_rgb565] "r"(src_rgb565), [dst_y] "r"(dst_y), [value] "f"(value),
        [width] "r"(width), [c0] "f"(c0), [c1] "f"(c1), [c2] "f"(c2),
        [mask] "f"(mask), [eight] "f"(0x08), [five] "f"(0x05),
        [three] "f"(0x03), [two] "f"(0x02), [four] "f"(0x04)
      : "memory");
}

void ARGB1555ToYRow_MMI(const uint8_t* src_argb1555,
                        uint8_t* dst_y,
                        int width) {
  uint64_t ftmp[11];
  const uint64_t value = 0x1080108010801080;
  const uint64_t mask = 0x0001004200810019;
  uint64_t c0 = 0x001f001f001f001f;
  uint64_t c1 = 0x00ff00ff00ff00ff;
  uint64_t c2 = 0x0003000300030003;
  uint64_t c3 = 0x007c007c007c007c;
  __asm__ volatile(
      "1:                                                            \n\t"
      "gsldrc1    %[src0],         0x00(%[src_argb1555])             \n\t"
      "gsldlc1    %[src0],         0x07(%[src_argb1555])             \n\t"
      "psrlh      %[src1],         %[src0],              %[eight]    \n\t"
      "and        %[b],            %[src0],              %[c0]       \n\t"
      "and        %[src0],         %[src0],              %[c1]       \n\t"
      "psrlh      %[src0],         %[src0],              %[five]     \n\t"
      "and        %[g],            %[src1],              %[c2]       \n\t"
      "psllh      %[g],            %[g],                 %[three]    \n\t"
      "or         %[g],            %[src0],              %[g]        \n\t"
      "and        %[r],            %[src1],              %[c3]       \n\t"
      "psrlh      %[r],            %[r],                 %[two]      \n\t"
      "psllh      %[src0],         %[b],                 %[three]    \n\t"
      "psrlh      %[src1],         %[b],                 %[two]      \n\t"
      "or         %[b],            %[src0],              %[src1]     \n\t"
      "psllh      %[src0],         %[g],                 %[three]    \n\t"
      "psrlh      %[src1],         %[g],                 %[two]      \n\t"
      "or         %[g],            %[src0],              %[src1]     \n\t"
      "psllh      %[src0],         %[r],                 %[three]    \n\t"
      "psrlh      %[src1],         %[r],                 %[two]      \n\t"
      "or         %[r],            %[src0],              %[src1]     \n\t"
      "punpcklhw  %[src0],         %[b],                 %[r]        \n\t"
      "punpcklhw  %[src1],         %[g],                 %[value]    \n\t"
      "punpcklhw  %[src_lo],       %[src0],              %[src1]     \n\t"
      "punpckhhw  %[src_hi],       %[src0],              %[src1]     \n\t"
      "pmaddhw    %[src_lo],       %[src_lo],            %[mask]     \n\t"
      "pmaddhw    %[src_hi],       %[src_hi],            %[mask]     \n\t"
      "punpcklwd  %[src0],         %[src_lo],            %[src_hi]   \n\t"
      "punpckhwd  %[src1],         %[src_lo],            %[src_hi]   \n\t"
      "paddw      %[dest0],        %[src0],              %[src1]     \n\t"
      "psrlw      %[dest0],        %[dest0],             %[eight]    \n\t"

      "punpckhhw  %[src0],         %[b],                 %[r]        \n\t"
      "punpckhhw  %[src1],         %[g],                 %[value]    \n\t"
      "punpcklhw  %[src_lo],       %[src0],              %[src1]     \n\t"
      "punpckhhw  %[src_hi],       %[src0],              %[src1]     \n\t"
      "pmaddhw    %[src_lo],       %[src_lo],            %[mask]     \n\t"
      "pmaddhw    %[src_hi],       %[src_hi],            %[mask]     \n\t"
      "punpcklwd  %[src0],         %[src_lo],            %[src_hi]   \n\t"
      "punpckhwd  %[src1],         %[src_lo],            %[src_hi]   \n\t"
      "paddw      %[dest1],        %[src0],              %[src1]     \n\t"
      "psrlw      %[dest1],        %[dest1],             %[eight]    \n\t"

      "gsldrc1    %[src0],         0x08(%[src_argb1555])             \n\t"
      "gsldlc1    %[src0],         0x0f(%[src_argb1555])             \n\t"
      "psrlh      %[src1],         %[src0],              %[eight]    \n\t"
      "and        %[b],            %[src0],              %[c0]       \n\t"
      "and        %[src0],         %[src0],              %[c1]       \n\t"
      "psrlh      %[src0],         %[src0],              %[five]     \n\t"
      "and        %[g],            %[src1],              %[c2]       \n\t"
      "psllh      %[g],            %[g],                 %[three]    \n\t"
      "or         %[g],            %[src0],              %[g]        \n\t"
      "and        %[r],            %[src1],              %[c3]       \n\t"
      "psrlh      %[r],            %[r],                 %[two]      \n\t"
      "psllh      %[src0],         %[b],                 %[three]    \n\t"
      "psrlh      %[src1],         %[b],                 %[two]      \n\t"
      "or         %[b],            %[src0],              %[src1]     \n\t"
      "psllh      %[src0],         %[g],                 %[three]    \n\t"
      "psrlh      %[src1],         %[g],                 %[two]      \n\t"
      "or         %[g],            %[src0],              %[src1]     \n\t"
      "psllh      %[src0],         %[r],                 %[three]    \n\t"
      "psrlh      %[src1],         %[r],                 %[two]      \n\t"
      "or         %[r],            %[src0],              %[src1]     \n\t"
      "punpcklhw  %[src0],         %[b],                 %[r]        \n\t"
      "punpcklhw  %[src1],         %[g],                 %[value]    \n\t"
      "punpcklhw  %[src_lo],       %[src0],              %[src1]     \n\t"
      "punpckhhw  %[src_hi],       %[src0],              %[src1]     \n\t"
      "pmaddhw    %[src_lo],       %[src_lo],            %[mask]     \n\t"
      "pmaddhw    %[src_hi],       %[src_hi],            %[mask]     \n\t"
      "punpcklwd  %[src0],         %[src_lo],            %[src_hi]   \n\t"
      "punpckhwd  %[src1],         %[src_lo],            %[src_hi]   \n\t"
      "paddw      %[dest2],        %[src0],              %[src1]     \n\t"
      "psrlw      %[dest2],        %[dest2],             %[eight]    \n\t"

      "punpckhhw  %[src0],         %[b],                 %[r]        \n\t"
      "punpckhhw  %[src1],         %[g],                 %[value]    \n\t"
      "punpcklhw  %[src_lo],       %[src0],              %[src1]     \n\t"
      "punpckhhw  %[src_hi],       %[src0],              %[src1]     \n\t"
      "pmaddhw    %[src_lo],       %[src_lo],            %[mask]     \n\t"
      "pmaddhw    %[src_hi],       %[src_hi],            %[mask]     \n\t"
      "punpcklwd  %[src0],         %[src_lo],            %[src_hi]   \n\t"
      "punpckhwd  %[src1],         %[src_lo],            %[src_hi]   \n\t"
      "paddw      %[dest3],        %[src0],              %[src1]     \n\t"
      "psrlw      %[dest3],        %[dest3],             %[eight]    \n\t"

      "packsswh   %[src_lo],       %[dest0],             %[dest1]    \n\t"
      "packsswh   %[src_hi],       %[dest2],             %[dest3]    \n\t"
      "packushb   %[dest0],        %[src_lo],            %[src_hi]   \n\t"
      "gssdlc1    %[dest0],        0x07(%[dst_y])                    \n\t"
      "gssdrc1    %[dest0],        0x00(%[dst_y])                    \n\t"

      "daddiu     %[src_argb1555], %[src_argb1555],      0x10        \n\t"
      "daddiu     %[dst_y],        %[dst_y],             0x08        \n\t"
      "daddiu     %[width],        %[width],            -0x08        \n\t"
      "bgtz       %[width],        1b                                \n\t"
      : [src0] "=&f"(ftmp[0]), [src1] "=&f"(ftmp[1]), [src_lo] "=&f"(ftmp[2]),
        [src_hi] "=&f"(ftmp[3]), [b] "=&f"(ftmp[4]), [g] "=&f"(ftmp[5]),
        [r] "=&f"(ftmp[6]), [dest0] "=&f"(ftmp[7]), [dest1] "=&f"(ftmp[8]),
        [dest2] "=&f"(ftmp[9]), [dest3] "=&f"(ftmp[10])
      : [src_argb1555] "r"(src_argb1555), [dst_y] "r"(dst_y),
        [width] "r"(width), [value] "f"(value), [mask] "f"(mask), [c0] "f"(c0),
        [c1] "f"(c1), [c2] "f"(c2), [c3] "f"(c3), [eight] "f"(0x08),
        [five] "f"(0x05), [three] "f"(0x03), [two] "f"(0x02), [seven] "f"(0x07)
      : "memory");
}

void ARGB4444ToYRow_MMI(const uint8_t* src_argb4444,
                        uint8_t* dst_y,
                        int width) {
  uint64_t ftmp[11];
  uint64_t value = 0x1080108010801080;
  uint64_t mask = 0x0001004200810019;
  uint64_t c0 = 0x000f000f000f000f;
  uint64_t c1 = 0x00ff00ff00ff00ff;
  __asm__ volatile(
      "1:                                                            \n\t"
      "gsldrc1    %[src0],         0x00(%[src_argb4444])             \n\t"
      "gsldlc1    %[src0],         0x07(%[src_argb4444])             \n\t"
      "psrlh      %[src1],         %[src0],              %[eight]    \n\t"
      "and        %[b],            %[src0],              %[c0]       \n\t"
      "and        %[src0],         %[src0],              %[c1]       \n\t"
      "psrlh      %[g],            %[src0],              %[four]     \n\t"
      "and        %[r],            %[src1],              %[c0]       \n\t"
      "psllh      %[src0],         %[b],                 %[four]     \n\t"
      "or         %[b],            %[src0],              %[b]        \n\t"
      "psllh      %[src0],         %[g],                 %[four]     \n\t"
      "or         %[g],            %[src0],              %[g]        \n\t"
      "psllh      %[src0],         %[r],                 %[four]     \n\t"
      "or         %[r],            %[src0],              %[r]        \n\t"
      "punpcklhw  %[src0],         %[b],                 %[r]        \n\t"
      "punpcklhw  %[src1],         %[g],                 %[value]    \n\t"
      "punpcklhw  %[src_lo],       %[src0],              %[src1]     \n\t"
      "punpckhhw  %[src_hi],       %[src0],              %[src1]     \n\t"
      "pmaddhw    %[src_lo],       %[src_lo],            %[mask]     \n\t"
      "pmaddhw    %[src_hi],       %[src_hi],            %[mask]     \n\t"
      "punpcklwd  %[src0],         %[src_lo],            %[src_hi]   \n\t"
      "punpckhwd  %[src1],         %[src_lo],            %[src_hi]   \n\t"
      "paddw      %[dest0],        %[src0],              %[src1]     \n\t"
      "psrlw      %[dest0],        %[dest0],             %[eight]    \n\t"

      "punpckhhw  %[src0],         %[b],                 %[r]        \n\t"
      "punpckhhw  %[src1],         %[g],                 %[value]    \n\t"
      "punpcklhw  %[src_lo],       %[src0],              %[src1]     \n\t"
      "punpckhhw  %[src_hi],       %[src0],              %[src1]     \n\t"
      "pmaddhw    %[src_lo],       %[src_lo],            %[mask]     \n\t"
      "pmaddhw    %[src_hi],       %[src_hi],            %[mask]     \n\t"
      "punpcklwd  %[src0],         %[src_lo],            %[src_hi]   \n\t"
      "punpckhwd  %[src1],         %[src_lo],            %[src_hi]   \n\t"
      "paddw      %[dest1],        %[src0],              %[src1]     \n\t"
      "psrlw      %[dest1],        %[dest1],             %[eight]    \n\t"

      "gsldrc1    %[src0],         0x08(%[src_argb4444])             \n\t"
      "gsldlc1    %[src0],         0x0f(%[src_argb4444])             \n\t"
      "psrlh      %[src1],         %[src0],              %[eight]    \n\t"
      "and        %[b],            %[src0],              %[c0]       \n\t"
      "and        %[src0],         %[src0],              %[c1]       \n\t"
      "psrlh      %[g],            %[src0],              %[four]     \n\t"
      "and        %[r],            %[src1],              %[c0]       \n\t"
      "psllh      %[src0],         %[b],                 %[four]     \n\t"
      "or         %[b],            %[src0],              %[b]        \n\t"
      "psllh      %[src0],         %[g],                 %[four]     \n\t"
      "or         %[g],            %[src0],              %[g]        \n\t"
      "psllh      %[src0],         %[r],                 %[four]     \n\t"
      "or         %[r],            %[src0],              %[r]        \n\t"
      "punpcklhw  %[src0],         %[b],                 %[r]        \n\t"
      "punpcklhw  %[src1],         %[g],                 %[value]    \n\t"
      "punpcklhw  %[src_lo],       %[src0],              %[src1]     \n\t"
      "punpckhhw  %[src_hi],       %[src0],              %[src1]     \n\t"
      "pmaddhw    %[src_lo],       %[src_lo],            %[mask]     \n\t"
      "pmaddhw    %[src_hi],       %[src_hi],            %[mask]     \n\t"
      "punpcklwd  %[src0],         %[src_lo],            %[src_hi]   \n\t"
      "punpckhwd  %[src1],         %[src_lo],            %[src_hi]   \n\t"
      "paddw      %[dest2],        %[src0],              %[src1]     \n\t"
      "psrlw      %[dest2],        %[dest2],             %[eight]    \n\t"

      "punpckhhw  %[src0],         %[b],                 %[r]        \n\t"
      "punpckhhw  %[src1],         %[g],                 %[value]    \n\t"
      "punpcklhw  %[src_lo],       %[src0],              %[src1]     \n\t"
      "punpckhhw  %[src_hi],       %[src0],              %[src1]     \n\t"
      "pmaddhw    %[src_lo],       %[src_lo],            %[mask]     \n\t"
      "pmaddhw    %[src_hi],       %[src_hi],            %[mask]     \n\t"
      "punpcklwd  %[src0],         %[src_lo],            %[src_hi]   \n\t"
      "punpckhwd  %[src1],         %[src_lo],            %[src_hi]   \n\t"
      "paddw      %[dest3],        %[src0],              %[src1]     \n\t"
      "psrlw      %[dest3],        %[dest3],             %[eight]    \n\t"

      "packsswh   %[src_lo],       %[dest0],             %[dest1]    \n\t"
      "packsswh   %[src_hi],       %[dest2],             %[dest3]    \n\t"
      "packushb   %[dest0],        %[src_lo],            %[src_hi]   \n\t"
      "gssdlc1    %[dest0],        0x07(%[dst_y])                    \n\t"
      "gssdrc1    %[dest0],        0x00(%[dst_y])                    \n\t"

      "daddiu     %[src_argb4444], %[src_argb4444],      0x10        \n\t"
      "daddiu     %[dst_y],        %[dst_y],             0x08        \n\t"
      "daddiu     %[width],        %[width],            -0x08        \n\t"
      "bgtz       %[width],        1b                                \n\t"
      : [src0] "=&f"(ftmp[0]), [src1] "=&f"(ftmp[1]), [src_lo] "=&f"(ftmp[2]),
        [src_hi] "=&f"(ftmp[3]), [b] "=&f"(ftmp[4]), [g] "=&f"(ftmp[5]),
        [r] "=&f"(ftmp[6]), [dest0] "=&f"(ftmp[7]), [dest1] "=&f"(ftmp[8]),
        [dest2] "=&f"(ftmp[9]), [dest3] "=&f"(ftmp[10])
      : [src_argb4444] "r"(src_argb4444), [dst_y] "r"(dst_y),
        [width] "r"(width), [value] "f"(value), [mask] "f"(mask), [c0] "f"(c0),
        [c1] "f"(c1), [eight] "f"(0x08), [four] "f"(0x04)
      : "memory");
}

void RGB565ToUVRow_MMI(const uint8_t* src_rgb565,
                       int src_stride_rgb565,
                       uint8_t* dst_u,
                       uint8_t* dst_v,
                       int width) {
  uint64_t ftmp[13];
  uint64_t value = 0x2020202020202020;
  uint64_t mask_u = 0x0026004a00700002;
  uint64_t mask_v = 0x00020070005e0012;
  uint64_t mask = 0x93;
  uint64_t c0 = 0x001f001f001f001f;
  uint64_t c1 = 0x00ff00ff00ff00ff;
  uint64_t c2 = 0x0007000700070007;
  __asm__ volatile(
      "daddu      %[next_rgb565], %[src_rgb565],       %[next_rgb565]   \n\t"
      "1:                                                               \n\t"
      "gsldrc1    %[src0],        0x00(%[src_rgb565])                   \n\t"
      "gsldlc1    %[src0],        0x07(%[src_rgb565])                   \n\t"
      "gsldrc1    %[src1],        0x00(%[next_rgb565])                  \n\t"
      "gsldlc1    %[src1],        0x07(%[next_rgb565])                  \n\t"
      "psrlh      %[dest0_u],     %[src0],             %[eight]         \n\t"
      "and        %[b0],          %[src0],             %[c0]            \n\t"
      "and        %[src0],        %[src0],             %[c1]            \n\t"
      "psrlh      %[src0],        %[src0],             %[five]          \n\t"
      "and        %[g0],          %[dest0_u],          %[c2]            \n\t"
      "psllh      %[g0],          %[g0],               %[three]         \n\t"
      "or         %[g0],          %[src0],             %[g0]            \n\t"
      "psrlh      %[r0],          %[dest0_u],          %[three]         \n\t"
      "psrlh      %[src0],        %[src1],             %[eight]         \n\t"
      "and        %[dest0_u],     %[src1],             %[c0]            \n\t"
      "and        %[src1],        %[src1],             %[c1]            \n\t"
      "psrlh      %[src1],        %[src1],             %[five]          \n\t"
      "and        %[dest0_v],     %[src0],             %[c2]            \n\t"
      "psllh      %[dest0_v],     %[dest0_v],          %[three]         \n\t"
      "or         %[dest0_v],     %[src1],             %[dest0_v]       \n\t"
      "psrlh      %[src0],        %[src0],             %[three]         \n\t"
      "paddh      %[b0],          %[b0],               %[dest0_u]       \n\t"
      "paddh      %[g0],          %[g0],               %[dest0_v]       \n\t"
      "paddh      %[r0],          %[r0],               %[src0]          \n\t"
      "punpcklhw  %[src0],        %[b0],               %[r0]            \n\t"
      "punpckhhw  %[src1],        %[b0],               %[r0]            \n\t"
      "punpcklwd  %[dest0_u],     %[src0],             %[src1]          \n\t"
      "punpckhwd  %[dest0_v],     %[src0],             %[src1]          \n\t"
      "paddh      %[src0],        %[dest0_u],          %[dest0_v]       \n\t"
      "psrlh      %[b0],          %[src0],             %[six]           \n\t"
      "psllh      %[r0],          %[src0],             %[one]           \n\t"
      "or         %[b0],          %[b0],               %[r0]            \n\t"
      "punpcklhw  %[src0],        %[g0],               %[value]         \n\t"
      "punpckhhw  %[src1],        %[g0],               %[value]         \n\t"
      "punpcklwd  %[dest0_u],     %[src0],             %[src1]          \n\t"
      "punpckhwd  %[dest0_v],     %[src0],             %[src1]          \n\t"
      "paddh      %[g0],          %[dest0_u],          %[dest0_v]       \n\t"
      "punpcklhw  %[src0],        %[b0],               %[g0]            \n\t"
      "punpckhhw  %[src1],        %[b0],               %[g0]            \n\t"

      "pmaddhw    %[dest0_v],     %[src0],             %[mask_v]        \n\t"
      "pshufh     %[dest0_u],     %[src0],             %[mask]          \n\t"
      "pmaddhw    %[dest0_u],     %[dest0_u],          %[mask_u]        \n\t"
      "pmaddhw    %[g0],          %[src1],             %[mask_v]        \n\t"
      "pshufh     %[b0],          %[src1],             %[mask]          \n\t"
      "pmaddhw    %[b0],          %[b0],               %[mask_u]        \n\t"

      "punpcklwd  %[src0],        %[dest0_u],          %[b0]            \n\t"
      "punpckhwd  %[src1],        %[dest0_u],          %[b0]            \n\t"
      "psubw      %[dest0_u],     %[src0],             %[src1]          \n\t"
      "psraw      %[dest0_u],     %[dest0_u],          %[eight]         \n\t"
      "punpcklwd  %[src0],        %[dest0_v],          %[g0]            \n\t"
      "punpckhwd  %[src1],        %[dest0_v],          %[g0]            \n\t"
      "psubw      %[dest0_v],     %[src1],             %[src0]          \n\t"
      "psraw      %[dest0_v],     %[dest0_v],          %[eight]         \n\t"

      "gsldrc1    %[src0],        0x08(%[src_rgb565])                   \n\t"
      "gsldlc1    %[src0],        0x0f(%[src_rgb565])                   \n\t"
      "gsldrc1    %[src1],        0x08(%[next_rgb565])                  \n\t"
      "gsldlc1    %[src1],        0x0f(%[next_rgb565])                  \n\t"
      "psrlh      %[dest1_u],     %[src0],             %[eight]         \n\t"
      "and        %[b0],          %[src0],             %[c0]            \n\t"
      "and        %[src0],        %[src0],             %[c1]            \n\t"
      "psrlh      %[src0],        %[src0],             %[five]          \n\t"
      "and        %[g0],          %[dest1_u],          %[c2]            \n\t"
      "psllh      %[g0],          %[g0],               %[three]         \n\t"
      "or         %[g0],          %[src0],             %[g0]            \n\t"
      "psrlh      %[r0],          %[dest1_u],          %[three]         \n\t"
      "psrlh      %[src0],        %[src1],             %[eight]         \n\t"
      "and        %[dest1_u],     %[src1],             %[c0]            \n\t"
      "and        %[src1],        %[src1],             %[c1]            \n\t"
      "psrlh      %[src1],        %[src1],             %[five]          \n\t"
      "and        %[dest1_v],     %[src0],             %[c2]            \n\t"
      "psllh      %[dest1_v],     %[dest1_v],          %[three]         \n\t"
      "or         %[dest1_v],     %[src1],             %[dest1_v]       \n\t"
      "psrlh      %[src0],        %[src0],             %[three]         \n\t"
      "paddh      %[b0],          %[b0],               %[dest1_u]       \n\t"
      "paddh      %[g0],          %[g0],               %[dest1_v]       \n\t"
      "paddh      %[r0],          %[r0],               %[src0]          \n\t"
      "punpcklhw  %[src0],        %[b0],               %[r0]            \n\t"
      "punpckhhw  %[src1],        %[b0],               %[r0]            \n\t"
      "punpcklwd  %[dest1_u],     %[src0],             %[src1]          \n\t"
      "punpckhwd  %[dest1_v],     %[src0],             %[src1]          \n\t"
      "paddh      %[src0],        %[dest1_u],          %[dest1_v]       \n\t"
      "psrlh      %[b0],          %[src0],             %[six]           \n\t"
      "psllh      %[r0],          %[src0],             %[one]           \n\t"
      "or         %[b0],          %[b0],               %[r0]            \n\t"
      "punpcklhw  %[src0],        %[g0],               %[value]         \n\t"
      "punpckhhw  %[src1],        %[g0],               %[value]         \n\t"
      "punpcklwd  %[dest1_u],     %[src0],             %[src1]          \n\t"
      "punpckhwd  %[dest1_v],     %[src0],             %[src1]          \n\t"
      "paddh      %[g0],          %[dest1_u],          %[dest1_v]       \n\t"
      "punpcklhw  %[src0],        %[b0],               %[g0]            \n\t"
      "punpckhhw  %[src1],        %[b0],               %[g0]            \n\t"

      "pmaddhw    %[dest1_v],     %[src0],             %[mask_v]        \n\t"
      "pshufh     %[dest1_u],     %[src0],             %[mask]          \n\t"
      "pmaddhw    %[dest1_u],     %[dest1_u],          %[mask_u]        \n\t"
      "pmaddhw    %[g0],          %[src1],             %[mask_v]        \n\t"
      "pshufh     %[b0],          %[src1],             %[mask]          \n\t"
      "pmaddhw    %[b0],          %[b0],               %[mask_u]        \n\t"

      "punpcklwd  %[src0],        %[dest1_u],          %[b0]            \n\t"
      "punpckhwd  %[src1],        %[dest1_u],          %[b0]            \n\t"
      "psubw      %[dest1_u],     %[src0],             %[src1]          \n\t"
      "psraw      %[dest1_u],     %[dest1_u],          %[eight]         \n\t"
      "punpcklwd  %[src0],        %[dest1_v],          %[g0]            \n\t"
      "punpckhwd  %[src1],        %[dest1_v],          %[g0]            \n\t"
      "psubw      %[dest1_v],     %[src1],             %[src0]          \n\t"
      "psraw      %[dest1_v],     %[dest1_v],          %[eight]         \n\t"

      "gsldrc1    %[src0],        0x10(%[src_rgb565])                   \n\t"
      "gsldlc1    %[src0],        0x17(%[src_rgb565])                   \n\t"
      "gsldrc1    %[src1],        0x10(%[next_rgb565])                  \n\t"
      "gsldlc1    %[src1],        0x17(%[next_rgb565])                  \n\t"
      "psrlh      %[dest2_u],     %[src0],             %[eight]         \n\t"
      "and        %[b0],          %[src0],             %[c0]            \n\t"
      "and        %[src0],        %[src0],             %[c1]            \n\t"
      "psrlh      %[src0],        %[src0],             %[five]          \n\t"
      "and        %[g0],          %[dest2_u],          %[c2]            \n\t"
      "psllh      %[g0],          %[g0],               %[three]         \n\t"
      "or         %[g0],          %[src0],             %[g0]            \n\t"
      "psrlh      %[r0],          %[dest2_u],          %[three]         \n\t"
      "psrlh      %[src0],        %[src1],             %[eight]         \n\t"
      "and        %[dest2_u],     %[src1],             %[c0]            \n\t"
      "and        %[src1],        %[src1],             %[c1]            \n\t"
      "psrlh      %[src1],        %[src1],             %[five]          \n\t"
      "and        %[dest2_v],     %[src0],             %[c2]            \n\t"
      "psllh      %[dest2_v],     %[dest2_v],          %[three]         \n\t"
      "or         %[dest2_v],     %[src1],             %[dest2_v]       \n\t"
      "psrlh      %[src0],        %[src0],             %[three]         \n\t"
      "paddh      %[b0],          %[b0],               %[dest2_u]       \n\t"
      "paddh      %[g0],          %[g0],               %[dest2_v]       \n\t"
      "paddh      %[r0],          %[r0],               %[src0]          \n\t"
      "punpcklhw  %[src0],        %[b0],               %[r0]            \n\t"
      "punpckhhw  %[src1],        %[b0],               %[r0]            \n\t"
      "punpcklwd  %[dest2_u],     %[src0],             %[src1]          \n\t"
      "punpckhwd  %[dest2_v],     %[src0],             %[src1]          \n\t"
      "paddh      %[src0],        %[dest2_u],          %[dest2_v]       \n\t"
      "psrlh      %[b0],          %[src0],             %[six]           \n\t"
      "psllh      %[r0],          %[src0],             %[one]           \n\t"
      "or         %[b0],          %[b0],               %[r0]            \n\t"
      "punpcklhw  %[src0],        %[g0],               %[value]         \n\t"
      "punpckhhw  %[src1],        %[g0],               %[value]         \n\t"
      "punpcklwd  %[dest2_u],     %[src0],             %[src1]          \n\t"
      "punpckhwd  %[dest2_v],     %[src0],             %[src1]          \n\t"
      "paddh      %[g0],          %[dest2_u],          %[dest2_v]       \n\t"
      "punpcklhw  %[src0],        %[b0],               %[g0]            \n\t"
      "punpckhhw  %[src1],        %[b0],               %[g0]            \n\t"

      "pmaddhw    %[dest2_v],     %[src0],             %[mask_v]        \n\t"
      "pshufh     %[dest2_u],     %[src0],             %[mask]          \n\t"
      "pmaddhw    %[dest2_u],     %[dest2_u],          %[mask_u]        \n\t"
      "pmaddhw    %[g0],          %[src1],             %[mask_v]        \n\t"
      "pshufh     %[b0],          %[src1],             %[mask]          \n\t"
      "pmaddhw    %[b0],          %[b0],               %[mask_u]        \n\t"

      "punpcklwd  %[src0],        %[dest2_u],          %[b0]            \n\t"
      "punpckhwd  %[src1],        %[dest2_u],          %[b0]            \n\t"
      "psubw      %[dest2_u],     %[src0],             %[src1]          \n\t"
      "psraw      %[dest2_u],     %[dest2_u],          %[eight]         \n\t"
      "punpcklwd  %[src0],        %[dest2_v],          %[g0]            \n\t"
      "punpckhwd  %[src1],        %[dest2_v],          %[g0]            \n\t"
      "psubw      %[dest2_v],     %[src1],             %[src0]          \n\t"
      "psraw      %[dest2_v],     %[dest2_v],          %[eight]         \n\t"

      "gsldrc1    %[src0],        0x18(%[src_rgb565])                   \n\t"
      "gsldlc1    %[src0],        0x1f(%[src_rgb565])                   \n\t"
      "gsldrc1    %[src1],        0x18(%[next_rgb565])                  \n\t"
      "gsldlc1    %[src1],        0x1f(%[next_rgb565])                  \n\t"
      "psrlh      %[dest3_u],     %[src0],             %[eight]         \n\t"
      "and        %[b0],          %[src0],             %[c0]            \n\t"
      "and        %[src0],        %[src0],             %[c1]            \n\t"
      "psrlh      %[src0],        %[src0],             %[five]          \n\t"
      "and        %[g0],          %[dest3_u],          %[c2]            \n\t"
      "psllh      %[g0],          %[g0],               %[three]         \n\t"
      "or         %[g0],          %[src0],             %[g0]            \n\t"
      "psrlh      %[r0],          %[dest3_u],          %[three]         \n\t"
      "psrlh      %[src0],        %[src1],             %[eight]         \n\t"
      "and        %[dest3_u],     %[src1],             %[c0]            \n\t"
      "and        %[src1],        %[src1],             %[c1]            \n\t"
      "psrlh      %[src1],        %[src1],             %[five]          \n\t"
      "and        %[dest3_v],     %[src0],             %[c2]            \n\t"
      "psllh      %[dest3_v],     %[dest3_v],          %[three]         \n\t"
      "or         %[dest3_v],     %[src1],             %[dest3_v]       \n\t"
      "psrlh      %[src0],        %[src0],             %[three]         \n\t"
      "paddh      %[b0],          %[b0],               %[dest3_u]       \n\t"
      "paddh      %[g0],          %[g0],               %[dest3_v]       \n\t"
      "paddh      %[r0],          %[r0],               %[src0]          \n\t"
      "punpcklhw  %[src0],        %[b0],               %[r0]            \n\t"
      "punpckhhw  %[src1],        %[b0],               %[r0]            \n\t"
      "punpcklwd  %[dest3_u],     %[src0],             %[src1]          \n\t"
      "punpckhwd  %[dest3_v],     %[src0],             %[src1]          \n\t"
      "paddh      %[src0],        %[dest3_u],          %[dest3_v]       \n\t"
      "psrlh      %[b0],          %[src0],             %[six]           \n\t"
      "psllh      %[r0],          %[src0],             %[one]           \n\t"
      "or         %[b0],          %[b0],               %[r0]            \n\t"
      "punpcklhw  %[src0],        %[g0],               %[value]         \n\t"
      "punpckhhw  %[src1],        %[g0],               %[value]         \n\t"
      "punpcklwd  %[dest3_u],     %[src0],             %[src1]          \n\t"
      "punpckhwd  %[dest3_v],     %[src0],             %[src1]          \n\t"
      "paddh      %[g0],          %[dest3_u],          %[dest3_v]       \n\t"
      "punpcklhw  %[src0],        %[b0],               %[g0]            \n\t"
      "punpckhhw  %[src1],        %[b0],               %[g0]            \n\t"

      "pmaddhw    %[dest3_v],     %[src0],             %[mask_v]        \n\t"
      "pshufh     %[dest3_u],     %[src0],             %[mask]          \n\t"
      "pmaddhw    %[dest3_u],     %[dest3_u],          %[mask_u]        \n\t"
      "pmaddhw    %[g0],          %[src1],             %[mask_v]        \n\t"
      "pshufh     %[b0],          %[src1],             %[mask]          \n\t"
      "pmaddhw    %[b0],          %[b0],               %[mask_u]        \n\t"

      "punpcklwd  %[src0],        %[dest3_u],          %[b0]            \n\t"
      "punpckhwd  %[src1],        %[dest3_u],          %[b0]            \n\t"
      "psubw      %[dest3_u],     %[src0],             %[src1]          \n\t"
      "psraw      %[dest3_u],     %[dest3_u],          %[eight]         \n\t"
      "punpcklwd  %[src0],        %[dest3_v],          %[g0]            \n\t"
      "punpckhwd  %[src1],        %[dest3_v],          %[g0]            \n\t"
      "psubw      %[dest3_v],     %[src1],             %[src0]          \n\t"
      "psraw      %[dest3_v],     %[dest3_v],          %[eight]         \n\t"

      "packsswh   %[src0],        %[dest0_u],          %[dest1_u]       \n\t"
      "packsswh   %[src1],        %[dest2_u],          %[dest3_u]       \n\t"
      "packushb   %[dest0_u],     %[src0],             %[src1]          \n\t"
      "gssdlc1    %[dest0_u],     0x07(%[dst_u])                        \n\t"
      "gssdrc1    %[dest0_u],     0x00(%[dst_u])                        \n\t"
      "packsswh   %[src0],        %[dest0_v],          %[dest1_v]       \n\t"
      "packsswh   %[src1],        %[dest2_v],          %[dest3_v]       \n\t"
      "packushb   %[dest0_v],     %[src0],             %[src1]          \n\t"
      "gssdlc1    %[dest0_v],     0x07(%[dst_v])                        \n\t"
      "gssdrc1    %[dest0_v],     0x00(%[dst_v])                        \n\t"

      "daddiu    %[src_rgb565],   %[src_rgb565],       0x20             \n\t"
      "daddiu    %[next_rgb565],  %[next_rgb565],      0x20             \n\t"
      "daddiu    %[dst_u],        %[dst_u],            0x08             \n\t"
      "daddiu    %[dst_v],        %[dst_v],            0x08             \n\t"
      "daddiu    %[width],        %[width],           -0x10             \n\t"
      "bgtz      %[width],        1b                                    \n\t"
      : [src0] "=&f"(ftmp[0]), [src1] "=&f"(ftmp[1]), [b0] "=&f"(ftmp[2]),
        [g0] "=&f"(ftmp[3]), [r0] "=&f"(ftmp[4]), [dest0_u] "=&f"(ftmp[5]),
        [dest1_u] "=&f"(ftmp[6]), [dest2_u] "=&f"(ftmp[7]),
        [dest3_u] "=&f"(ftmp[8]), [dest0_v] "=&f"(ftmp[9]),
        [dest1_v] "=&f"(ftmp[10]), [dest2_v] "=&f"(ftmp[11]),
        [dest3_v] "=&f"(ftmp[12])
      : [src_rgb565] "r"(src_rgb565), [next_rgb565] "r"(src_stride_rgb565),
        [dst_u] "r"(dst_u), [dst_v] "r"(dst_v), [width] "r"(width),
        [value] "f"(value), [c0] "f"(c0), [c1] "f"(c1), [c2] "f"(c2),
        [mask] "f"(mask), [mask_u] "f"(mask_u), [mask_v] "f"(mask_v),
        [eight] "f"(0x08), [six] "f"(0x06), [five] "f"(0x05), [three] "f"(0x03),
        [one] "f"(0x01)
      : "memory");
}

void ARGB1555ToUVRow_MMI(const uint8_t* src_argb1555,
                         int src_stride_argb1555,
                         uint8_t* dst_u,
                         uint8_t* dst_v,
                         int width) {
  uint64_t ftmp[11];
  uint64_t value = 0x2020202020202020;
  uint64_t mask_u = 0x0026004a00700002;
  uint64_t mask_v = 0x00020070005e0012;
  uint64_t mask = 0x93;
  uint64_t c0 = 0x001f001f001f001f;
  uint64_t c1 = 0x00ff00ff00ff00ff;
  uint64_t c2 = 0x0003000300030003;
  uint64_t c3 = 0x007c007c007c007c;
  __asm__ volatile(
      "daddu      %[next_argb1555], %[src_argb1555],      %[next_argb1555] \n\t"
      "1:                                                                  \n\t"
      "gsldrc1    %[src0],          0x00(%[src_argb1555])                  \n\t"
      "gsldlc1    %[src0],          0x07(%[src_argb1555])                  \n\t"
      "gsldrc1    %[src1],          0x00(%[next_argb1555])                 \n\t"
      "gsldlc1    %[src1],          0x07(%[next_argb1555])                 \n\t"
      "psrlh      %[dest0_u],       %[src0],               %[eight]        \n\t"
      "and        %[b0],            %[src0],               %[c0]           \n\t"
      "and        %[src0],          %[src0],               %[c1]           \n\t"
      "psrlh      %[src0],          %[src0],               %[five]         \n\t"
      "and        %[g0],            %[dest0_u],            %[c2]           \n\t"
      "psllh      %[g0],            %[g0],                 %[three]        \n\t"
      "or         %[g0],            %[src0],               %[g0]           \n\t"
      "and        %[r0],            %[dest0_u],            %[c3]           \n\t"
      "psrlh      %[r0],            %[r0],                 %[two]          \n\t"
      "psrlh      %[src0],          %[src1],               %[eight]        \n\t"
      "and        %[dest0_u],       %[src1],               %[c0]           \n\t"
      "and        %[src1],          %[src1],               %[c1]           \n\t"
      "psrlh      %[src1],          %[src1],               %[five]         \n\t"
      "and        %[dest0_v],       %[src0],               %[c2]           \n\t"
      "psllh      %[dest0_v],       %[dest0_v],            %[three]        \n\t"
      "or         %[dest0_v],       %[src1],               %[dest0_v]      \n\t"
      "and        %[src0],          %[src0],               %[c3]           \n\t"
      "psrlh      %[src0],          %[src0],               %[two]          \n\t"
      "paddh      %[b0],            %[b0],                 %[dest0_u]      \n\t"
      "paddh      %[g0],            %[g0],                 %[dest0_v]      \n\t"
      "paddh      %[r0],            %[r0],                 %[src0]         \n\t"
      "punpcklhw  %[src0],          %[b0],                 %[r0]           \n\t"
      "punpckhhw  %[src1],          %[b0],                 %[r0]           \n\t"
      "punpcklwd  %[dest0_u],       %[src0],               %[src1]         \n\t"
      "punpckhwd  %[dest0_v],       %[src0],               %[src1]         \n\t"
      "paddh      %[src0],          %[dest0_u],            %[dest0_v]      \n\t"
      "psrlh      %[b0],            %[src0],               %[six]          \n\t"
      "psllh      %[r0],            %[src0],               %[one]          \n\t"
      "or         %[b0],            %[b0],                 %[r0]           \n\t"
      "psrlh      %[r0],            %[g0],                 %[six]          \n\t"
      "psllh      %[g0],            %[g0],                 %[one]          \n\t"
      "or         %[g0],            %[g0],                 %[r0]           \n\t"
      "punpcklhw  %[src0],          %[g0],                 %[value]        \n\t"
      "punpckhhw  %[src1],          %[g0],                 %[value]        \n\t"
      "punpcklwd  %[dest0_u],       %[src0],               %[src1]         \n\t"
      "punpckhwd  %[dest0_v],       %[src0],               %[src1]         \n\t"
      "paddh      %[g0],            %[dest0_u],            %[dest0_v]      \n\t"
      "punpcklhw  %[src0],          %[b0],                 %[g0]           \n\t"
      "punpckhhw  %[src1],          %[b0],                 %[g0]           \n\t"

      "pmaddhw    %[dest0_v],       %[src0],               %[mask_v]       \n\t"
      "pshufh     %[dest0_u],       %[src0],               %[mask]         \n\t"
      "pmaddhw    %[dest0_u],       %[dest0_u],            %[mask_u]       \n\t"
      "pmaddhw    %[g0],            %[src1],               %[mask_v]       \n\t"
      "pshufh     %[b0],            %[src1],               %[mask]         \n\t"
      "pmaddhw    %[b0],            %[b0],                 %[mask_u]       \n\t"

      "punpcklwd  %[src0],          %[dest0_u],            %[b0]           \n\t"
      "punpckhwd  %[src1],          %[dest0_u],            %[b0]           \n\t"
      "psubw      %[dest0_u],       %[src0],               %[src1]         \n\t"
      "psraw      %[dest0_u],       %[dest0_u],            %[eight]        \n\t"
      "punpcklwd  %[src0],          %[dest0_v],            %[g0]           \n\t"
      "punpckhwd  %[src1],          %[dest0_v],            %[g0]           \n\t"
      "psubw      %[dest0_v],       %[src1],               %[src0]         \n\t"
      "psraw      %[dest0_v],       %[dest0_v],            %[eight]        \n\t"

      "gsldrc1    %[src0],          0x08(%[src_argb1555])                  \n\t"
      "gsldlc1    %[src0],          0x0f(%[src_argb1555])                  \n\t"
      "gsldrc1    %[src1],          0x08(%[next_argb1555])                 \n\t"
      "gsldlc1    %[src1],          0x0f(%[next_argb1555])                 \n\t"
      "psrlh      %[dest1_u],       %[src0],               %[eight]        \n\t"
      "and        %[b0],            %[src0],               %[c0]           \n\t"
      "and        %[src0],          %[src0],               %[c1]           \n\t"
      "psrlh      %[src0],          %[src0],               %[five]         \n\t"
      "and        %[g0],            %[dest1_u],            %[c2]           \n\t"
      "psllh      %[g0],            %[g0],                 %[three]        \n\t"
      "or         %[g0],            %[src0],               %[g0]           \n\t"
      "and        %[r0],            %[dest1_u],            %[c3]           \n\t"
      "psrlh      %[r0],            %[r0],                 %[two]          \n\t"
      "psrlh      %[src0],          %[src1],               %[eight]        \n\t"
      "and        %[dest1_u],       %[src1],               %[c0]           \n\t"
      "and        %[src1],          %[src1],               %[c1]           \n\t"
      "psrlh      %[src1],          %[src1],               %[five]         \n\t"
      "and        %[dest1_v],       %[src0],               %[c2]           \n\t"
      "psllh      %[dest1_v],       %[dest1_v],            %[three]        \n\t"
      "or         %[dest1_v],       %[src1],               %[dest1_v]      \n\t"
      "and        %[src0],          %[src0],               %[c3]           \n\t"
      "psrlh      %[src0],          %[src0],               %[two]          \n\t"
      "paddh      %[b0],            %[b0],                 %[dest1_u]      \n\t"
      "paddh      %[g0],            %[g0],                 %[dest1_v]      \n\t"
      "paddh      %[r0],            %[r0],                 %[src0]         \n\t"
      "punpcklhw  %[src0],          %[b0],                 %[r0]           \n\t"
      "punpckhhw  %[src1],          %[b0],                 %[r0]           \n\t"
      "punpcklwd  %[dest1_u],       %[src0],               %[src1]         \n\t"
      "punpckhwd  %[dest1_v],       %[src0],               %[src1]         \n\t"
      "paddh      %[src0],          %[dest1_u],            %[dest1_v]      \n\t"
      "psrlh      %[b0],            %[src0],               %[six]          \n\t"
      "psllh      %[r0],            %[src0],               %[one]          \n\t"
      "or         %[b0],            %[b0],                 %[r0]           \n\t"
      "psrlh      %[r0],            %[g0],                 %[six]          \n\t"
      "psllh      %[g0],            %[g0],                 %[one]          \n\t"
      "or         %[g0],            %[g0],                 %[r0]           \n\t"
      "punpcklhw  %[src0],          %[g0],                 %[value]        \n\t"
      "punpckhhw  %[src1],          %[g0],                 %[value]        \n\t"
      "punpcklwd  %[dest1_u],       %[src0],               %[src1]         \n\t"
      "punpckhwd  %[dest1_v],       %[src0],               %[src1]         \n\t"
      "paddh      %[g0],            %[dest1_u],            %[dest1_v]      \n\t"
      "punpcklhw  %[src0],          %[b0],                 %[g0]           \n\t"
      "punpckhhw  %[src1],          %[b0],                 %[g0]           \n\t"

      "pmaddhw    %[dest1_v],       %[src0],               %[mask_v]       \n\t"
      "pshufh     %[dest1_u],       %[src0],               %[mask]         \n\t"
      "pmaddhw    %[dest1_u],       %[dest1_u],            %[mask_u]       \n\t"
      "pmaddhw    %[g0],            %[src1],               %[mask_v]       \n\t"
      "pshufh     %[b0],            %[src1],               %[mask]         \n\t"
      "pmaddhw    %[b0],            %[b0],                 %[mask_u]       \n\t"

      "punpcklwd  %[src0],          %[dest1_u],            %[b0]           \n\t"
      "punpckhwd  %[src1],          %[dest1_u],            %[b0]           \n\t"
      "psubw      %[dest1_u],       %[src0],               %[src1]         \n\t"
      "psraw      %[dest1_u],       %[dest1_u],            %[eight]        \n\t"
      "punpcklwd  %[src0],          %[dest1_v],            %[g0]           \n\t"
      "punpckhwd  %[src1],          %[dest1_v],            %[g0]           \n\t"
      "psubw      %[dest1_v],       %[src1],               %[src0]         \n\t"
      "psraw      %[dest1_v],       %[dest1_v],            %[eight]        \n\t"

      "packsswh   %[dest0_u],       %[dest0_u],            %[dest1_u]      \n\t"
      "packsswh   %[dest1_u],       %[dest0_v],            %[dest1_v]      \n\t"

      "gsldrc1    %[src0],          0x10(%[src_argb1555])                  \n\t"
      "gsldlc1    %[src0],          0x17(%[src_argb1555])                  \n\t"
      "gsldrc1    %[src1],          0x10(%[next_argb1555])                 \n\t"
      "gsldlc1    %[src1],          0x17(%[next_argb1555])                 \n\t"
      "psrlh      %[dest2_u],       %[src0],               %[eight]        \n\t"
      "and        %[b0],            %[src0],               %[c0]           \n\t"
      "and        %[src0],          %[src0],               %[c1]           \n\t"
      "psrlh      %[src0],          %[src0],               %[five]         \n\t"
      "and        %[g0],            %[dest2_u],            %[c2]           \n\t"
      "psllh      %[g0],            %[g0],                 %[three]        \n\t"
      "or         %[g0],            %[src0],               %[g0]           \n\t"
      "and        %[r0],            %[dest2_u],            %[c3]           \n\t"
      "psrlh      %[r0],            %[r0],                 %[two]          \n\t"
      "psrlh      %[src0],          %[src1],               %[eight]        \n\t"
      "and        %[dest2_u],       %[src1],               %[c0]           \n\t"
      "and        %[src1],          %[src1],               %[c1]           \n\t"
      "psrlh      %[src1],          %[src1],               %[five]         \n\t"
      "and        %[dest0_v],       %[src0],               %[c2]           \n\t"
      "psllh      %[dest0_v],       %[dest0_v],            %[three]        \n\t"
      "or         %[dest0_v],       %[src1],               %[dest0_v]      \n\t"
      "and        %[src0],          %[src0],               %[c3]           \n\t"
      "psrlh      %[src0],          %[src0],               %[two]          \n\t"
      "paddh      %[b0],            %[b0],                 %[dest2_u]      \n\t"
      "paddh      %[g0],            %[g0],                 %[dest0_v]      \n\t"
      "paddh      %[r0],            %[r0],                 %[src0]         \n\t"
      "punpcklhw  %[src0],          %[b0],                 %[r0]           \n\t"
      "punpckhhw  %[src1],          %[b0],                 %[r0]           \n\t"
      "punpcklwd  %[dest2_u],       %[src0],               %[src1]         \n\t"
      "punpckhwd  %[dest0_v],       %[src0],               %[src1]         \n\t"
      "paddh      %[src0],          %[dest2_u],            %[dest0_v]      \n\t"
      "psrlh      %[b0],            %[src0],               %[six]          \n\t"
      "psllh      %[r0],            %[src0],               %[one]          \n\t"
      "or         %[b0],            %[b0],                 %[r0]           \n\t"
      "psrlh      %[r0],            %[g0],                 %[six]          \n\t"
      "psllh      %[g0],            %[g0],                 %[one]          \n\t"
      "or         %[g0],            %[g0],                 %[r0]           \n\t"
      "punpcklhw  %[src0],          %[g0],                 %[value]        \n\t"
      "punpckhhw  %[src1],          %[g0],                 %[value]        \n\t"
      "punpcklwd  %[dest2_u],       %[src0],               %[src1]         \n\t"
      "punpckhwd  %[dest0_v],       %[src0],               %[src1]         \n\t"
      "paddh      %[g0],            %[dest2_u],            %[dest0_v]      \n\t"
      "punpcklhw  %[src0],          %[b0],                 %[g0]           \n\t"
      "punpckhhw  %[src1],          %[b0],                 %[g0]           \n\t"

      "pmaddhw    %[dest0_v],       %[src0],               %[mask_v]       \n\t"
      "pshufh     %[dest2_u],       %[src0],               %[mask]         \n\t"
      "pmaddhw    %[dest2_u],       %[dest2_u],            %[mask_u]       \n\t"
      "pmaddhw    %[g0],            %[src1],               %[mask_v]       \n\t"
      "pshufh     %[b0],            %[src1],               %[mask]         \n\t"
      "pmaddhw    %[b0],            %[b0],                 %[mask_u]       \n\t"

      "punpcklwd  %[src0],          %[dest2_u],            %[b0]           \n\t"
      "punpckhwd  %[src1],          %[dest2_u],            %[b0]           \n\t"
      "psubw      %[dest2_u],       %[src0],               %[src1]         \n\t"
      "psraw      %[dest2_u],       %[dest2_u],            %[eight]        \n\t"
      "punpcklwd  %[src0],          %[dest0_v],            %[g0]           \n\t"
      "punpckhwd  %[src1],          %[dest0_v],            %[g0]           \n\t"
      "psubw      %[dest0_v],       %[src1],               %[src0]         \n\t"
      "psraw      %[dest0_v],       %[dest0_v],            %[eight]        \n\t"

      "gsldrc1    %[src0],          0x18(%[src_argb1555])                  \n\t"
      "gsldlc1    %[src0],          0x1f(%[src_argb1555])                  \n\t"
      "gsldrc1    %[src1],          0x18(%[next_argb1555])                 \n\t"
      "gsldlc1    %[src1],          0x1f(%[next_argb1555])                 \n\t"
      "psrlh      %[dest3_u],       %[src0],               %[eight]        \n\t"
      "and        %[b0],            %[src0],               %[c0]           \n\t"
      "and        %[src0],          %[src0],               %[c1]           \n\t"
      "psrlh      %[src0],          %[src0],               %[five]         \n\t"
      "and        %[g0],            %[dest3_u],            %[c2]           \n\t"
      "psllh      %[g0],            %[g0],                 %[three]        \n\t"
      "or         %[g0],            %[src0],               %[g0]           \n\t"
      "and        %[r0],            %[dest3_u],            %[c3]           \n\t"
      "psrlh      %[r0],            %[r0],                 %[two]          \n\t"
      "psrlh      %[src0],          %[src1],               %[eight]        \n\t"
      "and        %[dest3_u],       %[src1],               %[c0]           \n\t"
      "and        %[src1],          %[src1],               %[c1]           \n\t"
      "psrlh      %[src1],          %[src1],               %[five]         \n\t"
      "and        %[dest1_v],       %[src0],               %[c2]           \n\t"
      "psllh      %[dest1_v],       %[dest1_v],            %[three]        \n\t"
      "or         %[dest1_v],       %[src1],               %[dest1_v]      \n\t"
      "and        %[src0],          %[src0],               %[c3]           \n\t"
      "psrlh      %[src0],          %[src0],               %[two]          \n\t"
      "paddh      %[b0],            %[b0],                 %[dest3_u]      \n\t"
      "paddh      %[g0],            %[g0],                 %[dest1_v]      \n\t"
      "paddh      %[r0],            %[r0],                 %[src0]         \n\t"
      "punpcklhw  %[src0],          %[b0],                 %[r0]           \n\t"
      "punpckhhw  %[src1],          %[b0],                 %[r0]           \n\t"
      "punpcklwd  %[dest3_u],       %[src0],               %[src1]         \n\t"
      "punpckhwd  %[dest1_v],       %[src0],               %[src1]         \n\t"
      "paddh      %[src0],          %[dest3_u],            %[dest1_v]      \n\t"
      "psrlh      %[b0],            %[src0],               %[six]          \n\t"
      "psllh      %[r0],            %[src0],               %[one]          \n\t"
      "or         %[b0],            %[b0],                 %[r0]           \n\t"
      "psrlh      %[r0],            %[g0],                 %[six]          \n\t"
      "psllh      %[g0],            %[g0],                 %[one]          \n\t"
      "or         %[g0],            %[g0],                 %[r0]           \n\t"
      "punpcklhw  %[src0],          %[g0],                 %[value]        \n\t"
      "punpckhhw  %[src1],          %[g0],                 %[value]        \n\t"
      "punpcklwd  %[dest3_u],       %[src0],               %[src1]         \n\t"
      "punpckhwd  %[dest1_v],       %[src0],               %[src1]         \n\t"
      "paddh      %[g0],            %[dest3_u],            %[dest1_v]      \n\t"
      "punpcklhw  %[src0],          %[b0],                 %[g0]           \n\t"
      "punpckhhw  %[src1],          %[b0],                 %[g0]           \n\t"

      "pmaddhw    %[dest1_v],       %[src0],               %[mask_v]       \n\t"
      "pshufh     %[dest3_u],       %[src0],               %[mask]         \n\t"
      "pmaddhw    %[dest3_u],       %[dest3_u],            %[mask_u]       \n\t"
      "pmaddhw    %[g0],            %[src1],               %[mask_v]       \n\t"
      "pshufh     %[b0],            %[src1],               %[mask]         \n\t"
      "pmaddhw    %[b0],            %[b0],                 %[mask_u]       \n\t"

      "punpcklwd  %[src0],          %[dest3_u],            %[b0]           \n\t"
      "punpckhwd  %[src1],          %[dest3_u],            %[b0]           \n\t"
      "psubw      %[dest3_u],       %[src0],               %[src1]         \n\t"
      "psraw      %[dest3_u],       %[dest3_u],            %[eight]        \n\t"
      "punpcklwd  %[src0],          %[dest1_v],            %[g0]           \n\t"
      "punpckhwd  %[src1],          %[dest1_v],            %[g0]           \n\t"
      "psubw      %[dest1_v],       %[src1],               %[src0]         \n\t"
      "psraw      %[dest1_v],       %[dest1_v],            %[eight]        \n\t"

      "packsswh   %[src1],          %[dest2_u],            %[dest3_u]      \n\t"
      "packushb   %[dest0_u],       %[dest0_u],            %[src1]         \n\t"
      "gssdlc1    %[dest0_u],       0x07(%[dst_u])                         \n\t"
      "gssdrc1    %[dest0_u],       0x00(%[dst_u])                         \n\t"
      "packsswh   %[src1],          %[dest0_v],            %[dest1_v]      \n\t"
      "packushb   %[dest0_v],       %[dest1_u],            %[src1]         \n\t"
      "gssdlc1    %[dest0_v],       0x07(%[dst_v])                         \n\t"
      "gssdrc1    %[dest0_v],       0x00(%[dst_v])                         \n\t"

      "daddiu    %[src_argb1555],   %[src_argb1555],       0x20            \n\t"
      "daddiu    %[next_argb1555],  %[next_argb1555],      0x20            \n\t"
      "daddiu    %[dst_u],          %[dst_u],              0x08            \n\t"
      "daddiu    %[dst_v],          %[dst_v],              0x08            \n\t"
      "daddiu    %[width],          %[width],             -0x10            \n\t"
      "bgtz      %[width],          1b                                     \n\t"
      : [src0] "=&f"(ftmp[0]), [src1] "=&f"(ftmp[1]), [b0] "=&f"(ftmp[2]),
        [g0] "=&f"(ftmp[3]), [r0] "=&f"(ftmp[4]), [dest0_u] "=&f"(ftmp[5]),
        [dest1_u] "=&f"(ftmp[6]), [dest2_u] "=&f"(ftmp[7]),
        [dest3_u] "=&f"(ftmp[8]), [dest0_v] "=&f"(ftmp[9]),
        [dest1_v] "=&f"(ftmp[10])
      : [src_argb1555] "r"(src_argb1555),
        [next_argb1555] "r"(src_stride_argb1555), [dst_u] "r"(dst_u),
        [dst_v] "r"(dst_v), [width] "r"(width), [value] "f"(value),
        [c0] "f"(c0), [c1] "f"(c1), [c2] "f"(c2), [c3] "f"(c3),
        [mask] "f"(mask), [mask_u] "f"(mask_u), [mask_v] "f"(mask_v),
        [eight] "f"(0x08), [six] "f"(0x06), [five] "f"(0x05), [three] "f"(0x03),
        [two] "f"(0x02), [one] "f"(0x01)
      : "memory");
}

void ARGB4444ToUVRow_MMI(const uint8_t* src_argb4444,
                         int src_stride_argb4444,
                         uint8_t* dst_u,
                         uint8_t* dst_v,
                         int width) {
  uint64_t ftmp[13];
  uint64_t value = 0x2020202020202020;
  uint64_t mask_u = 0x0026004a00700002;
  uint64_t mask_v = 0x00020070005e0012;
  uint64_t mask = 0x93;
  uint64_t c0 = 0x000f000f000f000f;
  uint64_t c1 = 0x00ff00ff00ff00ff;
  __asm__ volatile(
      "daddu      %[next_argb4444], %[src_argb4444],      %[next_argb4444] \n\t"
      "1:                                                                  \n\t"
      "gsldrc1    %[src0],          0x00(%[src_argb4444])                  \n\t"
      "gsldlc1    %[src0],          0x07(%[src_argb4444])                  \n\t"
      "gsldrc1    %[src1],          0x00(%[next_argb4444])                 \n\t"
      "gsldlc1    %[src1],          0x07(%[next_argb4444])                 \n\t"
      "psrlh      %[dest0_u],       %[src0],               %[eight]        \n\t"
      "and        %[b0],            %[src0],               %[c0]           \n\t"
      "and        %[src0],          %[src0],               %[c1]           \n\t"
      "psrlh      %[g0],            %[src0],               %[four]         \n\t"
      "and        %[r0],            %[dest0_u],            %[c0]           \n\t"
      "psrlh      %[src0],          %[src1],               %[eight]        \n\t"
      "and        %[dest0_u],       %[src1],               %[c0]           \n\t"
      "and        %[src1],          %[src1],               %[c1]           \n\t"
      "psrlh      %[dest0_v],       %[src1],               %[four]         \n\t"
      "and        %[src0],          %[src0],               %[c0]           \n\t"
      "paddh      %[b0],            %[b0],                 %[dest0_u]      \n\t"
      "paddh      %[g0],            %[g0],                 %[dest0_v]      \n\t"
      "paddh      %[r0],            %[r0],                 %[src0]         \n\t"
      "punpcklhw  %[src0],          %[b0],                 %[r0]           \n\t"
      "punpckhhw  %[src1],          %[b0],                 %[r0]           \n\t"
      "punpcklwd  %[dest0_u],       %[src0],               %[src1]         \n\t"
      "punpckhwd  %[dest0_v],       %[src0],               %[src1]         \n\t"
      "paddh      %[src0],          %[dest0_u],            %[dest0_v]      \n\t"
      "psrlh      %[b0],            %[src0],               %[four]         \n\t"
      "psllh      %[r0],            %[src0],               %[two]          \n\t"
      "or         %[b0],            %[b0],                 %[r0]           \n\t"
      "psrlh      %[r0],            %[g0],                 %[four]         \n\t"
      "psllh      %[g0],            %[g0],                 %[two]          \n\t"
      "or         %[g0],            %[g0],                 %[r0]           \n\t"
      "punpcklhw  %[src0],          %[g0],                 %[value]        \n\t"
      "punpckhhw  %[src1],          %[g0],                 %[value]        \n\t"
      "punpcklwd  %[dest0_u],       %[src0],               %[src1]         \n\t"
      "punpckhwd  %[dest0_v],       %[src0],               %[src1]         \n\t"
      "paddh      %[g0],            %[dest0_u],            %[dest0_v]      \n\t"
      "punpcklhw  %[src0],          %[b0],                 %[g0]           \n\t"
      "punpckhhw  %[src1],          %[b0],                 %[g0]           \n\t"

      "pmaddhw    %[dest0_v],       %[src0],               %[mask_v]       \n\t"
      "pshufh     %[dest0_u],       %[src0],               %[mask]         \n\t"
      "pmaddhw    %[dest0_u],       %[dest0_u],            %[mask_u]       \n\t"
      "pmaddhw    %[g0],            %[src1],               %[mask_v]       \n\t"
      "pshufh     %[b0],            %[src1],               %[mask]         \n\t"
      "pmaddhw    %[b0],            %[b0],                 %[mask_u]       \n\t"

      "punpcklwd  %[src0],          %[dest0_u],            %[b0]           \n\t"
      "punpckhwd  %[src1],          %[dest0_u],            %[b0]           \n\t"
      "psubw      %[dest0_u],       %[src0],               %[src1]         \n\t"
      "psraw      %[dest0_u],       %[dest0_u],            %[eight]        \n\t"
      "punpcklwd  %[src0],          %[dest0_v],            %[g0]           \n\t"
      "punpckhwd  %[src1],          %[dest0_v],            %[g0]           \n\t"
      "psubw      %[dest0_v],       %[src1],               %[src0]         \n\t"
      "psraw      %[dest0_v],       %[dest0_v],            %[eight]        \n\t"

      "gsldrc1    %[src0],          0x08(%[src_argb4444])                  \n\t"
      "gsldlc1    %[src0],          0x0f(%[src_argb4444])                  \n\t"
      "gsldrc1    %[src1],          0x08(%[next_argb4444])                 \n\t"
      "gsldlc1    %[src1],          0x0f(%[next_argb4444])                 \n\t"
      "psrlh      %[dest1_u],       %[src0],               %[eight]        \n\t"
      "and        %[b0],            %[src0],               %[c0]           \n\t"
      "and        %[src0],          %[src0],               %[c1]           \n\t"
      "psrlh      %[g0],            %[src0],               %[four]         \n\t"
      "and        %[r0],            %[dest1_u],            %[c0]           \n\t"
      "psrlh      %[src0],          %[src1],               %[eight]        \n\t"
      "and        %[dest1_u],       %[src1],               %[c0]           \n\t"
      "and        %[src1],          %[src1],               %[c1]           \n\t"
      "psrlh      %[dest1_v],       %[src1],               %[four]         \n\t"
      "and        %[src0],          %[src0],               %[c0]           \n\t"
      "paddh      %[b0],            %[b0],                 %[dest1_u]      \n\t"
      "paddh      %[g0],            %[g0],                 %[dest1_v]      \n\t"
      "paddh      %[r0],            %[r0],                 %[src0]         \n\t"
      "punpcklhw  %[src0],          %[b0],                 %[r0]           \n\t"
      "punpckhhw  %[src1],          %[b0],                 %[r0]           \n\t"
      "punpcklwd  %[dest1_u],       %[src0],               %[src1]         \n\t"
      "punpckhwd  %[dest1_v],       %[src0],               %[src1]         \n\t"
      "paddh      %[src0],          %[dest1_u],            %[dest1_v]      \n\t"
      "psrlh      %[b0],            %[src0],               %[four]         \n\t"
      "psllh      %[r0],            %[src0],               %[two]          \n\t"
      "or         %[b0],            %[b0],                 %[r0]           \n\t"
      "psrlh      %[r0],            %[g0],                 %[four]         \n\t"
      "psllh      %[g0],            %[g0],                 %[two]          \n\t"
      "or         %[g0],            %[g0],                 %[r0]           \n\t"
      "punpcklhw  %[src0],          %[g0],                 %[value]        \n\t"
      "punpckhhw  %[src1],          %[g0],                 %[value]        \n\t"
      "punpcklwd  %[dest1_u],       %[src0],               %[src1]         \n\t"
      "punpckhwd  %[dest1_v],       %[src0],               %[src1]         \n\t"
      "paddh      %[g0],            %[dest1_u],            %[dest1_v]      \n\t"
      "punpcklhw  %[src0],          %[b0],                 %[g0]           \n\t"
      "punpckhhw  %[src1],          %[b0],                 %[g0]           \n\t"

      "pmaddhw    %[dest1_v],       %[src0],               %[mask_v]       \n\t"
      "pshufh     %[dest1_u],       %[src0],               %[mask]         \n\t"
      "pmaddhw    %[dest1_u],       %[dest1_u],            %[mask_u]       \n\t"
      "pmaddhw    %[g0],            %[src1],               %[mask_v]       \n\t"
      "pshufh     %[b0],            %[src1],               %[mask]         \n\t"
      "pmaddhw    %[b0],            %[b0],                 %[mask_u]       \n\t"

      "punpcklwd  %[src0],          %[dest1_u],            %[b0]           \n\t"
      "punpckhwd  %[src1],          %[dest1_u],            %[b0]           \n\t"
      "psubw      %[dest1_u],       %[src0],               %[src1]         \n\t"
      "psraw      %[dest1_u],       %[dest1_u],            %[eight]        \n\t"
      "punpcklwd  %[src0],          %[dest1_v],            %[g0]           \n\t"
      "punpckhwd  %[src1],          %[dest1_v],            %[g0]           \n\t"
      "psubw      %[dest1_v],       %[src1],               %[src0]         \n\t"
      "psraw      %[dest1_v],       %[dest1_v],            %[eight]        \n\t"

      "gsldrc1    %[src0],          0x10(%[src_argb4444])                  \n\t"
      "gsldlc1    %[src0],          0x17(%[src_argb4444])                  \n\t"
      "gsldrc1    %[src1],          0x10(%[next_argb4444])                 \n\t"
      "gsldlc1    %[src1],          0x17(%[next_argb4444])                 \n\t"
      "psrlh      %[dest2_u],       %[src0],               %[eight]        \n\t"
      "and        %[b0],            %[src0],               %[c0]           \n\t"
      "and        %[src0],          %[src0],               %[c1]           \n\t"
      "psrlh      %[g0],            %[src0],               %[four]         \n\t"
      "and        %[r0],            %[dest2_u],            %[c0]           \n\t"
      "psrlh      %[src0],          %[src1],               %[eight]        \n\t"
      "and        %[dest2_u],       %[src1],               %[c0]           \n\t"
      "and        %[src1],          %[src1],               %[c1]           \n\t"
      "psrlh      %[dest2_v],       %[src1],               %[four]         \n\t"
      "and        %[src0],          %[src0],               %[c0]           \n\t"
      "paddh      %[b0],            %[b0],                 %[dest2_u]      \n\t"
      "paddh      %[g0],            %[g0],                 %[dest2_v]      \n\t"
      "paddh      %[r0],            %[r0],                 %[src0]         \n\t"
      "punpcklhw  %[src0],          %[b0],                 %[r0]           \n\t"
      "punpckhhw  %[src1],          %[b0],                 %[r0]           \n\t"
      "punpcklwd  %[dest2_u],       %[src0],               %[src1]         \n\t"
      "punpckhwd  %[dest2_v],       %[src0],               %[src1]         \n\t"
      "paddh      %[src0],          %[dest2_u],            %[dest2_v]      \n\t"
      "psrlh      %[b0],            %[src0],               %[four]         \n\t"
      "psllh      %[r0],            %[src0],               %[two]          \n\t"
      "or         %[b0],            %[b0],                 %[r0]           \n\t"
      "psrlh      %[r0],            %[g0],                 %[four]         \n\t"
      "psllh      %[g0],            %[g0],                 %[two]          \n\t"
      "or         %[g0],            %[g0],                 %[r0]           \n\t"
      "punpcklhw  %[src0],          %[g0],                 %[value]        \n\t"
      "punpckhhw  %[src1],          %[g0],                 %[value]        \n\t"
      "punpcklwd  %[dest2_u],       %[src0],               %[src1]         \n\t"
      "punpckhwd  %[dest2_v],       %[src0],               %[src1]         \n\t"
      "paddh      %[g0],            %[dest2_u],            %[dest2_v]      \n\t"
      "punpcklhw  %[src0],          %[b0],                 %[g0]           \n\t"
      "punpckhhw  %[src1],          %[b0],                 %[g0]           \n\t"

      "pmaddhw    %[dest2_v],       %[src0],               %[mask_v]       \n\t"
      "pshufh     %[dest2_u],       %[src0],               %[mask]         \n\t"
      "pmaddhw    %[dest2_u],       %[dest2_u],            %[mask_u]       \n\t"
      "pmaddhw    %[g0],            %[src1],               %[mask_v]       \n\t"
      "pshufh     %[b0],            %[src1],               %[mask]         \n\t"
      "pmaddhw    %[b0],            %[b0],                 %[mask_u]       \n\t"

      "punpcklwd  %[src0],          %[dest2_u],            %[b0]           \n\t"
      "punpckhwd  %[src1],          %[dest2_u],            %[b0]           \n\t"
      "psubw      %[dest2_u],       %[src0],               %[src1]         \n\t"
      "psraw      %[dest2_u],       %[dest2_u],            %[eight]        \n\t"
      "punpcklwd  %[src0],          %[dest2_v],            %[g0]           \n\t"
      "punpckhwd  %[src1],          %[dest2_v],            %[g0]           \n\t"
      "psubw      %[dest2_v],       %[src1],               %[src0]         \n\t"
      "psraw      %[dest2_v],       %[dest2_v],            %[eight]        \n\t"

      "gsldrc1    %[src0],          0x18(%[src_argb4444])                  \n\t"
      "gsldlc1    %[src0],          0x1f(%[src_argb4444])                  \n\t"
      "gsldrc1    %[src1],          0x18(%[next_argb4444])                 \n\t"
      "gsldlc1    %[src1],          0x1f(%[next_argb4444])                 \n\t"
      "psrlh      %[dest3_u],       %[src0],               %[eight]        \n\t"
      "and        %[b0],            %[src0],               %[c0]           \n\t"
      "and        %[src0],          %[src0],               %[c1]           \n\t"
      "psrlh      %[g0],            %[src0],               %[four]         \n\t"
      "and        %[r0],            %[dest3_u],            %[c0]           \n\t"
      "psrlh      %[src0],          %[src1],               %[eight]        \n\t"
      "and        %[dest3_u],       %[src1],               %[c0]           \n\t"
      "and        %[src1],          %[src1],               %[c1]           \n\t"
      "psrlh      %[dest3_v],       %[src1],               %[four]         \n\t"
      "and        %[src0],          %[src0],               %[c0]           \n\t"
      "paddh      %[b0],            %[b0],                 %[dest3_u]      \n\t"
      "paddh      %[g0],            %[g0],                 %[dest3_v]      \n\t"
      "paddh      %[r0],            %[r0],                 %[src0]         \n\t"
      "punpcklhw  %[src0],          %[b0],                 %[r0]           \n\t"
      "punpckhhw  %[src1],          %[b0],                 %[r0]           \n\t"
      "punpcklwd  %[dest3_u],       %[src0],               %[src1]         \n\t"
      "punpckhwd  %[dest3_v],       %[src0],               %[src1]         \n\t"
      "paddh      %[src0],          %[dest3_u],            %[dest3_v]      \n\t"
      "psrlh      %[b0],            %[src0],               %[four]         \n\t"
      "psllh      %[r0],            %[src0],               %[two]          \n\t"
      "or         %[b0],            %[b0],                 %[r0]           \n\t"
      "psrlh      %[r0],            %[g0],                 %[four]         \n\t"
      "psllh      %[g0],            %[g0],                 %[two]          \n\t"
      "or         %[g0],            %[g0],                 %[r0]           \n\t"
      "punpcklhw  %[src0],          %[g0],                 %[value]        \n\t"
      "punpckhhw  %[src1],          %[g0],                 %[value]        \n\t"
      "punpcklwd  %[dest3_u],       %[src0],               %[src1]         \n\t"
      "punpckhwd  %[dest3_v],       %[src0],               %[src1]         \n\t"
      "paddh      %[g0],            %[dest3_u],            %[dest3_v]      \n\t"
      "punpcklhw  %[src0],          %[b0],                 %[g0]           \n\t"
      "punpckhhw  %[src1],          %[b0],                 %[g0]           \n\t"

      "pmaddhw    %[dest3_v],       %[src0],               %[mask_v]       \n\t"
      "pshufh     %[dest3_u],       %[src0],               %[mask]         \n\t"
      "pmaddhw    %[dest3_u],       %[dest3_u],            %[mask_u]       \n\t"
      "pmaddhw    %[g0],            %[src1],               %[mask_v]       \n\t"
      "pshufh     %[b0],            %[src1],               %[mask]         \n\t"
      "pmaddhw    %[b0],            %[b0],                 %[mask_u]       \n\t"

      "punpcklwd  %[src0],          %[dest3_u],            %[b0]           \n\t"
      "punpckhwd  %[src1],          %[dest3_u],            %[b0]           \n\t"
      "psubw      %[dest3_u],       %[src0],               %[src1]         \n\t"
      "psraw      %[dest3_u],       %[dest3_u],            %[eight]        \n\t"
      "punpcklwd  %[src0],          %[dest3_v],            %[g0]           \n\t"
      "punpckhwd  %[src1],          %[dest3_v],            %[g0]           \n\t"
      "psubw      %[dest3_v],       %[src1],               %[src0]         \n\t"
      "psraw      %[dest3_v],       %[dest3_v],            %[eight]        \n\t"

      "packsswh   %[src0],          %[dest0_u],            %[dest1_u]      \n\t"
      "packsswh   %[src1],          %[dest2_u],            %[dest3_u]      \n\t"
      "packushb   %[dest0_u],       %[src0],               %[src1]         \n\t"
      "gssdlc1    %[dest0_u],       0x07(%[dst_u])                         \n\t"
      "gssdrc1    %[dest0_u],       0x00(%[dst_u])                         \n\t"
      "packsswh   %[src0],          %[dest0_v],            %[dest1_v]      \n\t"
      "packsswh   %[src1],          %[dest2_v],            %[dest3_v]      \n\t"
      "packushb   %[dest0_v],       %[src0],               %[src1]         \n\t"
      "gssdlc1    %[dest0_v],       0x07(%[dst_v])                         \n\t"
      "gssdrc1    %[dest0_v],       0x00(%[dst_v])                         \n\t"

      "daddiu    %[src_argb4444],   %[src_argb4444],       0x20            \n\t"
      "daddiu    %[next_argb4444],  %[next_argb4444],      0x20            \n\t"
      "daddiu    %[dst_u],          %[dst_u],              0x08            \n\t"
      "daddiu    %[dst_v],          %[dst_v],              0x08            \n\t"
      "daddiu    %[width],          %[width],             -0x10            \n\t"
      "bgtz      %[width],          1b                                     \n\t"
      : [src0] "=&f"(ftmp[0]), [src1] "=&f"(ftmp[1]), [b0] "=&f"(ftmp[2]),
        [g0] "=&f"(ftmp[3]), [r0] "=&f"(ftmp[4]), [dest0_u] "=&f"(ftmp[5]),
        [dest1_u] "=&f"(ftmp[6]), [dest2_u] "=&f"(ftmp[7]),
        [dest3_u] "=&f"(ftmp[8]), [dest0_v] "=&f"(ftmp[9]),
        [dest1_v] "=&f"(ftmp[10]), [dest2_v] "=&f"(ftmp[11]),
        [dest3_v] "=&f"(ftmp[12])
      : [src_argb4444] "r"(src_argb4444),
        [next_argb4444] "r"(src_stride_argb4444), [dst_u] "r"(dst_u),
        [dst_v] "r"(dst_v), [width] "r"(width), [value] "f"(value),
        [c0] "f"(c0), [c1] "f"(c1), [mask] "f"(mask), [mask_u] "f"(mask_u),
        [mask_v] "f"(mask_v), [eight] "f"(0x08), [four] "f"(0x04),
        [two] "f"(0x02)
      : "memory");
}

void ARGBToUV444Row_MMI(const uint8_t* src_argb,
                        uint8_t* dst_u,
                        uint8_t* dst_v,
                        int width) {
  uint64_t ftmp[12];
  const uint64_t value = 0x4040;
  const uint64_t mask_u = 0x0026004a00700002;
  const uint64_t mask_v = 0x00020070005e0012;

  __asm__ volatile(
      "1:                                                               \n\t"
      "gsldrc1    %[src0],         0x00(%[src_argb])                    \n\t"
      "gsldlc1    %[src0],         0x07(%[src_argb])                    \n\t"
      "punpcklbh  %[src_lo],       %[src0],           %[zero]           \n\t"
      "punpckhbh  %[src_hi],       %[src0],           %[zero]           \n\t"
      "dsll       %[dest0_u],      %[src_lo],         %[sixteen]        \n\t"
      "pinsrh_0   %[dest0_u],      %[dest0_u],        %[value]          \n\t"
      "pinsrh_3   %[dest0_v],      %[src_lo],         %[value]          \n\t"
      "pmaddhw    %[dest0_u],      %[dest0_u],        %[mask_u]         \n\t"
      "pmaddhw    %[dest0_v],      %[dest0_v],        %[mask_v]         \n\t"

      "dsll       %[src_lo],       %[src_hi],         %[sixteen]        \n\t"
      "pinsrh_0   %[src_lo],       %[src_lo],         %[value]          \n\t"
      "pinsrh_3   %[src_hi],       %[src_hi],         %[value]          \n\t"
      "pmaddhw    %[src_lo],       %[src_lo],         %[mask_u]         \n\t"
      "pmaddhw    %[src_hi],       %[src_hi],         %[mask_v]         \n\t"

      "punpcklwd  %[src0],         %[dest0_u],        %[src_lo]         \n\t"
      "punpckhwd  %[src1],         %[dest0_u],        %[src_lo]         \n\t"
      "psubw      %[dest0_u],      %[src0],           %[src1]           \n\t"
      "psraw      %[dest0_u],      %[dest0_u],        %[eight]          \n\t"
      "punpcklwd  %[src0],         %[dest0_v],        %[src_hi]         \n\t"
      "punpckhwd  %[src1],         %[dest0_v],        %[src_hi]         \n\t"
      "psubw      %[dest0_v],      %[src1],           %[src0]           \n\t"
      "psraw      %[dest0_v],      %[dest0_v],        %[eight]          \n\t"

      "gsldrc1    %[src0],         0x08(%[src_argb])                    \n\t"
      "gsldlc1    %[src0],         0x0f(%[src_argb])                    \n\t"
      "punpcklbh  %[src_lo],       %[src0],           %[zero]           \n\t"
      "punpckhbh  %[src_hi],       %[src0],           %[zero]           \n\t"
      "dsll       %[dest1_u],      %[src_lo],         %[sixteen]        \n\t"
      "pinsrh_0   %[dest1_u],      %[dest1_u],        %[value]          \n\t"
      "pinsrh_3   %[dest1_v],      %[src_lo],         %[value]          \n\t"
      "pmaddhw    %[dest1_u],      %[dest1_u],        %[mask_u]         \n\t"
      "pmaddhw    %[dest1_v],      %[dest1_v],        %[mask_v]         \n\t"
      "dsll       %[src_lo],       %[src_hi],         %[sixteen]        \n\t"
      "pinsrh_0   %[src_lo],       %[src_lo],         %[value]          \n\t"
      "pinsrh_3   %[src_hi],       %[src_hi],         %[value]          \n\t"
      "pmaddhw    %[src_lo],       %[src_lo],         %[mask_u]         \n\t"
      "pmaddhw    %[src_hi],       %[src_hi],         %[mask_v]         \n\t"

      "punpcklwd  %[src0],         %[dest1_u],        %[src_lo]         \n\t"
      "punpckhwd  %[src1],         %[dest1_u],        %[src_lo]         \n\t"
      "psubw      %[dest1_u],      %[src0],           %[src1]           \n\t"
      "psraw      %[dest1_u],      %[dest1_u],        %[eight]          \n\t"
      "punpcklwd  %[src0],         %[dest1_v],        %[src_hi]         \n\t"
      "punpckhwd  %[src1],         %[dest1_v],        %[src_hi]         \n\t"
      "psubw      %[dest1_v],      %[src1],           %[src0]           \n\t"
      "psraw      %[dest1_v],      %[dest1_v],        %[eight]          \n\t"

      "gsldrc1    %[src0],         0x10(%[src_argb])                    \n\t"
      "gsldlc1    %[src0],         0x17(%[src_argb])                    \n\t"
      "punpcklbh  %[src_lo],       %[src0],           %[zero]           \n\t"
      "punpckhbh  %[src_hi],       %[src0],           %[zero]           \n\t"
      "dsll       %[dest2_u],      %[src_lo],         %[sixteen]        \n\t"
      "pinsrh_0   %[dest2_u],      %[dest2_u],        %[value]          \n\t"
      "pinsrh_3   %[dest2_v],      %[src_lo],         %[value]          \n\t"
      "pmaddhw    %[dest2_u],      %[dest2_u],        %[mask_u]         \n\t"
      "pmaddhw    %[dest2_v],      %[dest2_v],        %[mask_v]         \n\t"
      "dsll       %[src_lo],       %[src_hi],         %[sixteen]        \n\t"
      "pinsrh_0   %[src_lo],       %[src_lo],         %[value]          \n\t"
      "pinsrh_3   %[src_hi],       %[src_hi],         %[value]          \n\t"
      "pmaddhw    %[src_lo],       %[src_lo],         %[mask_u]         \n\t"
      "pmaddhw    %[src_hi],       %[src_hi],         %[mask_v]         \n\t"

      "punpcklwd  %[src0],         %[dest2_u],        %[src_lo]         \n\t"
      "punpckhwd  %[src1],         %[dest2_u],        %[src_lo]         \n\t"
      "psubw      %[dest2_u],      %[src0],           %[src1]           \n\t"
      "psraw      %[dest2_u],      %[dest2_u],        %[eight]          \n\t"
      "punpcklwd  %[src0],         %[dest2_v],        %[src_hi]         \n\t"
      "punpckhwd  %[src1],         %[dest2_v],        %[src_hi]         \n\t"
      "psubw      %[dest2_v],      %[src1],           %[src0]           \n\t"
      "psraw      %[dest2_v],      %[dest2_v],        %[eight]          \n\t"

      "gsldrc1    %[src0],         0x18(%[src_argb])                    \n\t"
      "gsldlc1    %[src0],         0x1f(%[src_argb])                    \n\t"
      "punpcklbh  %[src_lo],       %[src0],           %[zero]           \n\t"
      "punpckhbh  %[src_hi],       %[src0],           %[zero]           \n\t"
      "dsll       %[dest3_u],      %[src_lo],         %[sixteen]        \n\t"
      "pinsrh_0   %[dest3_u],      %[dest3_u],        %[value]          \n\t"
      "pinsrh_3   %[dest3_v],      %[src_lo],         %[value]          \n\t"
      "pmaddhw    %[dest3_u],      %[dest3_u],        %[mask_u]         \n\t"
      "pmaddhw    %[dest3_v],      %[dest3_v],        %[mask_v]         \n\t"
      "dsll       %[src_lo],       %[src_hi],         %[sixteen]        \n\t"
      "pinsrh_0   %[src_lo],       %[src_lo],         %[value]          \n\t"
      "pinsrh_3   %[src_hi],       %[src_hi],         %[value]          \n\t"
      "pmaddhw    %[src_lo],       %[src_lo],         %[mask_u]         \n\t"
      "pmaddhw    %[src_hi],       %[src_hi],         %[mask_v]         \n\t"

      "punpcklwd  %[src0],         %[dest3_u],        %[src_lo]         \n\t"
      "punpckhwd  %[src1],         %[dest3_u],        %[src_lo]         \n\t"
      "psubw      %[dest3_u],      %[src0],           %[src1]           \n\t"
      "psraw      %[dest3_u],      %[dest3_u],        %[eight]          \n\t"
      "punpcklwd  %[src0],         %[dest3_v],        %[src_hi]         \n\t"
      "punpckhwd  %[src1],         %[dest3_v],        %[src_hi]         \n\t"
      "psubw      %[dest3_v],      %[src1],           %[src0]           \n\t"
      "psraw      %[dest3_v],      %[dest3_v],        %[eight]          \n\t"

      "packsswh   %[src0],         %[dest0_u],        %[dest1_u]        \n\t"
      "packsswh   %[src1],         %[dest2_u],        %[dest3_u]        \n\t"
      "packushb   %[dest0_u],      %[src0],           %[src1]           \n\t"
      "gssdlc1    %[dest0_u],      0x07(%[dst_u])                       \n\t"
      "gssdrc1    %[dest0_u],      0x00(%[dst_u])                       \n\t"

      "packsswh   %[src0],         %[dest0_v],        %[dest1_v]        \n\t"
      "packsswh   %[src1],         %[dest2_v],        %[dest3_v]        \n\t"
      "packushb   %[dest0_v],      %[src0],           %[src1]           \n\t"
      "gssdlc1    %[dest0_v],      0x07(%[dst_v])                       \n\t"
      "gssdrc1    %[dest0_v],      0x00(%[dst_v])                       \n\t"

      "daddiu     %[src_argb],     %[src_argb],       0x20              \n\t"
      "daddiu     %[dst_u],        %[dst_u],          0x08              \n\t"
      "daddiu     %[dst_v],        %[dst_v],          0x08              \n\t"
      "daddi      %[width],        %[width],         -0x08              \n\t"
      "bgtz       %[width],        1b                                   \n\t"
      : [src0] "=&f"(ftmp[0]), [src1] "=&f"(ftmp[1]), [src_lo] "=&f"(ftmp[2]),
        [src_hi] "=&f"(ftmp[3]), [dest0_u] "=&f"(ftmp[4]),
        [dest0_v] "=&f"(ftmp[5]), [dest1_u] "=&f"(ftmp[6]),
        [dest1_v] "=&f"(ftmp[7]), [dest2_u] "=&f"(ftmp[8]),
        [dest2_v] "=&f"(ftmp[9]), [dest3_u] "=&f"(ftmp[10]),
        [dest3_v] "=&f"(ftmp[11])
      : [src_argb] "r"(src_argb), [dst_u] "r"(dst_u), [dst_v] "r"(dst_v),
        [width] "r"(width), [mask_u] "f"(mask_u), [mask_v] "f"(mask_v),
        [value] "f"(value), [zero] "f"(0x00), [sixteen] "f"(0x10),
        [eight] "f"(0x08)
      : "memory");
}

void ARGBGrayRow_MMI(const uint8_t* src_argb, uint8_t* dst_argb, int width) {
  uint64_t src, src_lo, src_hi, src37, dest, dest_lo, dest_hi;
  uint64_t tmp0, tmp1;
  const uint64_t mask0 = 0x0;
  const uint64_t mask1 = 0x01;
  const uint64_t mask2 = 0x00400026004B000FULL;
  const uint64_t mask3 = 0xFF000000FF000000ULL;
  const uint64_t mask4 = ~mask3;
  const uint64_t shift = 0x07;

  __asm__ volatile(
      "1:                                                           \n\t"
      "gsldlc1    %[src],          0x07(%[src_ptr])                 \n\t"
      "gsldrc1    %[src],          0x00(%[src_ptr])                 \n\t"

      "and        %[src37],        %[src],            %[mask3]      \n\t"

      "punpcklbh  %[src_lo],       %[src],            %[mask0]      \n\t"
      "pinsrh_3   %[src_lo],       %[src_lo],         %[mask1]      \n\t"
      "pmaddhw    %[dest_lo],      %[src_lo],         %[mask2]      \n\t"
      "punpcklwd  %[tmp0],         %[dest_lo],        %[dest_lo]    \n\t"
      "punpckhwd  %[tmp1],         %[dest_lo],        %[dest_lo]    \n\t"
      "paddw      %[dest_lo],      %[tmp0],           %[tmp1]       \n\t"
      "psrlw      %[dest_lo],      %[dest_lo],        %[shift]      \n\t"
      "packsswh   %[dest_lo],      %[dest_lo],        %[dest_lo]    \n\t"

      "punpckhbh  %[src_hi],       %[src],            %[mask0]      \n\t"
      "pinsrh_3   %[src_hi],       %[src_hi],         %[mask1]      \n\t"
      "pmaddhw    %[dest_hi],      %[src_hi],         %[mask2]      \n\t"
      "punpcklwd  %[tmp0],         %[dest_hi],        %[dest_hi]    \n\t"
      "punpckhwd  %[tmp1],         %[dest_hi],        %[dest_hi]    \n\t"
      "paddw      %[dest_hi],      %[tmp0],           %[tmp1]       \n\t"
      "psrlw      %[dest_hi],      %[dest_hi],        %[shift]      \n\t"
      "packsswh   %[dest_hi],      %[dest_hi],        %[dest_hi]    \n\t"

      "packushb   %[dest],         %[dest_lo],        %[dest_hi]    \n\t"
      "and        %[dest],         %[dest],           %[mask4]      \n\t"
      "or         %[dest],         %[dest],           %[src37]      \n\t"

      "gssdlc1    %[dest],         0x07(%[dst_ptr])                 \n\t"
      "gssdrc1    %[dest],         0x00(%[dst_ptr])                 \n\t"

      "daddiu     %[src_ptr],      %[src_ptr],        0x08          \n\t"
      "daddiu     %[dst_ptr],      %[dst_ptr],        0x08          \n\t"
      "daddi      %[width],        %[width],         -0x02          \n\t"
      "bnez       %[width],        1b                               \n\t"
      : [dest_hi] "=&f"(dest_hi), [dest_lo] "=&f"(dest_lo),
        [src_hi] "=&f"(src_hi), [src_lo] "=&f"(src_lo), [tmp0] "=&f"(tmp0),
        [tmp1] "=&f"(tmp1), [src] "=&f"(src), [dest] "=&f"(dest),
        [src37] "=&f"(src37)
      : [src_ptr] "r"(src_argb), [dst_ptr] "r"(dst_argb), [width] "r"(width),
        [shift] "f"(shift), [mask0] "f"(mask0), [mask1] "f"(mask1),
        [mask2] "f"(mask2), [mask3] "f"(mask3), [mask4] "f"(mask4)
      : "memory");
}

// Convert a row of image to Sepia tone.
void ARGBSepiaRow_MMI(uint8_t* dst_argb, int width) {
  uint64_t dest, dest_lo, dest_hi, dest37, dest0, dest1, dest2;
  uint64_t tmp0, tmp1;
  const uint64_t mask0 = 0x0;
  const uint64_t mask1 = 0x002300440011ULL;
  const uint64_t mask2 = 0x002D00580016ULL;
  const uint64_t mask3 = 0x003200620018ULL;
  const uint64_t mask4 = 0xFF000000FF000000ULL;
  const uint64_t shift = 0x07;

  __asm__ volatile(
      "1:                                                           \n\t"
      "gsldlc1    %[dest],         0x07(%[dst_ptr])                 \n\t"
      "gsldrc1    %[dest],         0x00(%[dst_ptr])                 \n\t"

      "and        %[dest37],       %[dest],           %[mask4]      \n\t"

      "punpcklbh  %[dest_lo],      %[dest],           %[mask0]      \n\t"
      "pmaddhw    %[dest0],        %[dest_lo],        %[mask1]      \n\t"
      "pmaddhw    %[dest1],        %[dest_lo],        %[mask2]      \n\t"
      "pmaddhw    %[dest2],        %[dest_lo],        %[mask3]      \n\t"
      "punpcklwd  %[tmp0],         %[dest0],          %[dest1]      \n\t"
      "punpckhwd  %[tmp1],         %[dest0],          %[dest1]      \n\t"
      "paddw      %[dest0],        %[tmp0],           %[tmp1]       \n\t"
      "psrlw      %[dest0],        %[dest0],          %[shift]      \n\t"
      "punpcklwd  %[tmp0],         %[dest2],          %[mask0]      \n\t"
      "punpckhwd  %[tmp1],         %[dest2],          %[mask0]      \n\t"
      "paddw      %[dest1],        %[tmp0],           %[tmp1]       \n\t"
      "psrlw      %[dest1],        %[dest1],          %[shift]      \n\t"
      "packsswh   %[dest_lo],      %[dest0],          %[dest1]      \n\t"

      "punpckhbh  %[dest_hi],      %[dest],           %[mask0]      \n\t"
      "pmaddhw    %[dest0],        %[dest_hi],        %[mask1]      \n\t"
      "pmaddhw    %[dest1],        %[dest_hi],        %[mask2]      \n\t"
      "pmaddhw    %[dest2],        %[dest_hi],        %[mask3]      \n\t"
      "punpcklwd  %[tmp0],         %[dest0],          %[dest1]      \n\t"
      "punpckhwd  %[tmp1],         %[dest0],          %[dest1]      \n\t"
      "paddw      %[dest0],        %[tmp0],           %[tmp1]       \n\t"
      "psrlw      %[dest0],        %[dest0],          %[shift]      \n\t"
      "punpcklwd  %[tmp0],         %[dest2],          %[mask0]      \n\t"
      "punpckhwd  %[tmp1],         %[dest2],          %[mask0]      \n\t"
      "paddw      %[dest1],        %[tmp0],           %[tmp1]       \n\t"
      "psrlw      %[dest1],        %[dest1],          %[shift]      \n\t"
      "packsswh   %[dest_hi],      %[dest0],          %[dest1]      \n\t"

      "packushb   %[dest],         %[dest_lo],        %[dest_hi]    \n\t"
      "or         %[dest],         %[dest],           %[dest37]     \n\t"

      "gssdlc1    %[dest],         0x07(%[dst_ptr])                 \n\t"
      "gssdrc1    %[dest],         0x00(%[dst_ptr])                 \n\t"

      "daddiu     %[dst_ptr],      %[dst_ptr],        0x08          \n\t"
      "daddi      %[width],        %[width],         -0x02          \n\t"
      "bnez       %[width],        1b                               \n\t"
      : [dest_hi] "=&f"(dest_hi), [dest_lo] "=&f"(dest_lo),
        [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [dest2] "=&f"(dest2),
        [dest37] "=&f"(dest37), [tmp0] "=&f"(tmp0), [tmp1] "=&f"(tmp1),
        [dest] "=&f"(dest)
      : [dst_ptr] "r"(dst_argb), [width] "r"(width), [mask0] "f"(mask0),
        [mask1] "f"(mask1), [mask2] "f"(mask2), [mask3] "f"(mask3),
        [mask4] "f"(mask4), [shift] "f"(shift)
      : "memory");
}

// Apply color matrix to a row of image. Matrix is signed.
// TODO(fbarchard): Consider adding rounding (+32).
void ARGBColorMatrixRow_MMI(const uint8_t* src_argb,
                            uint8_t* dst_argb,
                            const int8_t* matrix_argb,
                            int width) {
  uint64_t src, src_hi, src_lo, dest, dest_lo, dest_hi, dest0, dest1, dest2,
      dest3;
  uint64_t matrix, matrix_hi, matrix_lo;
  uint64_t tmp0, tmp1;
  const uint64_t shift0 = 0x06;
  const uint64_t shift1 = 0x08;
  const uint64_t mask0 = 0x0;
  const uint64_t mask1 = 0x08;

  __asm__ volatile(
      "1:                                                           \n\t"
      "gsldlc1    %[src],          0x07(%[src_ptr])                 \n\t"
      "gsldrc1    %[src],          0x00(%[src_ptr])                 \n\t"

      "punpcklbh  %[src_lo],       %[src],            %[mask0]      \n\t"

      "gsldlc1    %[matrix],       0x07(%[matrix_ptr])              \n\t"
      "gsldrc1    %[matrix],       0x00(%[matrix_ptr])              \n\t"
      "punpcklbh  %[matrix_lo],    %[matrix],         %[mask0]      \n\t"
      "psllh      %[matrix_lo],    %[matrix_lo],      %[shift1]     \n\t"
      "psrah      %[matrix_lo],    %[matrix_lo],      %[shift1]     \n\t"
      "punpckhbh  %[matrix_hi],    %[matrix],         %[mask0]      \n\t"
      "psllh      %[matrix_hi],    %[matrix_hi],      %[shift1]     \n\t"
      "psrah      %[matrix_hi],    %[matrix_hi],      %[shift1]     \n\t"
      "pmaddhw    %[dest_lo],      %[src_lo],         %[matrix_lo]  \n\t"
      "pmaddhw    %[dest_hi],      %[src_lo],         %[matrix_hi]  \n\t"
      "punpcklwd  %[tmp0],         %[dest_lo],        %[dest_hi]    \n\t"
      "punpckhwd  %[tmp1],         %[dest_lo],        %[dest_hi]    \n\t"
      "paddw      %[dest0],        %[tmp0],           %[tmp1]       \n\t"
      "psraw      %[dest0],        %[dest0],          %[shift0]     \n\t"

      "gsldlc1    %[matrix],       0x0f(%[matrix_ptr])              \n\t"
      "gsldrc1    %[matrix],       0x08(%[matrix_ptr])              \n\t"
      "punpcklbh  %[matrix_lo],    %[matrix],         %[mask0]      \n\t"
      "psllh      %[matrix_lo],    %[matrix_lo],      %[shift1]     \n\t"
      "psrah      %[matrix_lo],    %[matrix_lo],      %[shift1]     \n\t"
      "punpckhbh  %[matrix_hi],    %[matrix],         %[mask0]      \n\t"
      "psllh      %[matrix_hi],    %[matrix_hi],      %[shift1]     \n\t"
      "psrah      %[matrix_hi],    %[matrix_hi],      %[shift1]     \n\t"
      "pmaddhw    %[dest_lo],      %[src_lo],         %[matrix_lo]  \n\t"
      "pmaddhw    %[dest_hi],      %[src_lo],         %[matrix_hi]  \n\t"
      "punpcklwd  %[tmp0],         %[dest_lo],        %[dest_hi]    \n\t"
      "punpckhwd  %[tmp1],         %[dest_lo],        %[dest_hi]    \n\t"
      "paddw      %[dest1],        %[tmp0],           %[tmp1]       \n\t"
      "psraw      %[dest1],        %[dest1],          %[shift0]     \n\t"

      "punpckhbh  %[src_hi],       %[src],            %[mask0]      \n\t"

      "gsldlc1    %[matrix],       0x07(%[matrix_ptr])              \n\t"
      "gsldrc1    %[matrix],       0x00(%[matrix_ptr])              \n\t"
      "punpcklbh  %[matrix_lo],    %[matrix],         %[mask0]      \n\t"
      "psllh      %[matrix_lo],    %[matrix_lo],      %[shift1]     \n\t"
      "psrah      %[matrix_lo],    %[matrix_lo],      %[shift1]     \n\t"
      "punpckhbh  %[matrix_hi],    %[matrix],         %[mask0]      \n\t"
      "psllh      %[matrix_hi],    %[matrix_hi],      %[shift1]     \n\t"
      "psrah      %[matrix_hi],    %[matrix_hi],      %[shift1]     \n\t"
      "pmaddhw    %[dest_lo],      %[src_hi],         %[matrix_lo]  \n\t"
      "pmaddhw    %[dest_hi],      %[src_hi],         %[matrix_hi]  \n\t"
      "punpcklwd  %[tmp0],         %[dest_lo],        %[dest_hi]    \n\t"
      "punpckhwd  %[tmp1],         %[dest_lo],        %[dest_hi]    \n\t"
      "paddw      %[dest2],        %[tmp0],           %[tmp1]       \n\t"
      "psraw      %[dest2],        %[dest2],          %[shift0]     \n\t"

      "gsldlc1    %[matrix],       0x0f(%[matrix_ptr])              \n\t"
      "gsldrc1    %[matrix],       0x08(%[matrix_ptr])              \n\t"
      "punpcklbh  %[matrix_lo],    %[matrix],         %[mask0]      \n\t"
      "psllh      %[matrix_lo],    %[matrix_lo],      %[shift1]     \n\t"
      "psrah      %[matrix_lo],    %[matrix_lo],      %[shift1]     \n\t"
      "punpckhbh  %[matrix_hi],    %[matrix],         %[mask0]      \n\t"
      "psllh      %[matrix_hi],    %[matrix_hi],      %[shift1]     \n\t"
      "psrah      %[matrix_hi],    %[matrix_hi],      %[shift1]     \n\t"
      "pmaddhw    %[dest_lo],      %[src_hi],         %[matrix_lo]  \n\t"
      "pmaddhw    %[dest_hi],      %[src_hi],         %[matrix_hi]  \n\t"
      "punpcklwd  %[tmp0],         %[dest_lo],        %[dest_hi]    \n\t"
      "punpckhwd  %[tmp1],         %[dest_lo],        %[dest_hi]    \n\t"
      "paddw      %[dest3],        %[tmp0],           %[tmp1]       \n\t"
      "psraw      %[dest3],        %[dest3],          %[shift0]     \n\t"

      "packsswh   %[tmp0],         %[dest0],          %[dest1]      \n\t"
      "packsswh   %[tmp1],         %[dest2],          %[dest3]      \n\t"
      "packushb   %[dest],         %[tmp0],           %[tmp1]       \n\t"

      "gssdlc1    %[dest],         0x07(%[dst_ptr])                 \n\t"
      "gssdrc1    %[dest],         0x00(%[dst_ptr])                 \n\t"

      "daddiu     %[src_ptr],      %[src_ptr],        0x08          \n\t"
      "daddiu     %[dst_ptr],      %[dst_ptr],        0x08          \n\t"
      "daddi      %[width],        %[width],         -0x02          \n\t"
      "bnez       %[width],        1b                               \n\t"
      : [src_hi] "=&f"(src_hi), [src_lo] "=&f"(src_lo),
        [dest_hi] "=&f"(dest_hi), [dest_lo] "=&f"(dest_lo),
        [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [dest2] "=&f"(dest2),
        [dest3] "=&f"(dest3), [src] "=&f"(src), [dest] "=&f"(dest),
        [tmp0] "=&f"(tmp0), [tmp1] "=&f"(tmp1), [matrix_hi] "=&f"(matrix_hi),
        [matrix_lo] "=&f"(matrix_lo), [matrix] "=&f"(matrix)
      : [src_ptr] "r"(src_argb), [matrix_ptr] "r"(matrix_argb),
        [dst_ptr] "r"(dst_argb), [width] "r"(width), [shift0] "f"(shift0),
        [shift1] "f"(shift1), [mask0] "f"(mask0), [mask1] "f"(mask1)
      : "memory");
}

void ARGBShadeRow_MMI(const uint8_t* src_argb,
                      uint8_t* dst_argb,
                      int width,
                      uint32_t value) {
  uint64_t src, src_hi, src_lo, dest, dest_lo, dest_hi;
  const uint64_t shift = 0x08;

  __asm__ volatile(
      "1:                                                           \n\t"
      "gsldlc1    %[src],          0x07(%[src_ptr])                 \n\t"
      "gsldrc1    %[src],          0x00(%[src_ptr])                 \n\t"
      "punpcklbh  %[src_lo],       %[src],            %[src]        \n\t"
      "punpckhbh  %[src_hi],       %[src],            %[src]        \n\t"

      "punpcklbh  %[value],        %[value],          %[value]      \n\t"

      "pmulhuh    %[dest_lo],      %[src_lo],         %[value]      \n\t"
      "psrlh      %[dest_lo],      %[dest_lo],        %[shift]      \n\t"
      "pmulhuh    %[dest_hi],      %[src_hi],         %[value]      \n\t"
      "psrlh      %[dest_hi],      %[dest_hi],        %[shift]      \n\t"
      "packushb   %[dest],         %[dest_lo],        %[dest_hi]    \n\t"

      "gssdlc1    %[dest],         0x07(%[dst_ptr])                 \n\t"
      "gssdrc1    %[dest],         0x00(%[dst_ptr])                 \n\t"

      "daddiu     %[src_ptr],      %[src_ptr],        0x08          \n\t"
      "daddiu     %[dst_ptr],      %[dst_ptr],        0x08          \n\t"
      "daddi      %[width],        %[width],         -0x02          \n\t"
      "bnez       %[width],        1b                               \n\t"
      : [src_hi] "=&f"(src_hi), [src_lo] "=&f"(src_lo),
        [dest_hi] "=&f"(dest_hi), [dest_lo] "=&f"(dest_lo), [src] "=&f"(src),
        [dest] "=&f"(dest)
      : [src_ptr] "r"(src_argb), [dst_ptr] "r"(dst_argb), [width] "r"(width),
        [value] "f"(value), [shift] "f"(shift)
      : "memory");
}

void ARGBMultiplyRow_MMI(const uint8_t* src_argb0,
                         const uint8_t* src_argb1,
                         uint8_t* dst_argb,
                         int width) {
  uint64_t src0, src0_hi, src0_lo, src1, src1_hi, src1_lo;
  uint64_t dest, dest_lo, dest_hi;
  const uint64_t mask = 0x0;

  __asm__ volatile(
      "1:                                                           \n\t"
      "gsldlc1    %[src0],         0x07(%[src0_ptr])                \n\t"
      "gsldrc1    %[src0],         0x00(%[src0_ptr])                \n\t"
      "punpcklbh  %[src0_lo],      %[src0],           %[src0]       \n\t"
      "punpckhbh  %[src0_hi],      %[src0],           %[src0]       \n\t"

      "gsldlc1    %[src1],         0x07(%[src1_ptr])                \n\t"
      "gsldrc1    %[src1],         0x00(%[src1_ptr])                \n\t"
      "punpcklbh  %[src1_lo],      %[src1],           %[mask]       \n\t"
      "punpckhbh  %[src1_hi],      %[src1],           %[mask]       \n\t"

      "pmulhuh    %[dest_lo],      %[src0_lo],        %[src1_lo]    \n\t"
      "pmulhuh    %[dest_hi],      %[src0_hi],        %[src1_hi]    \n\t"
      "packushb   %[dest],         %[dest_lo],        %[dest_hi]    \n\t"

      "gssdlc1    %[dest],         0x07(%[dst_ptr])                 \n\t"
      "gssdrc1    %[dest],         0x00(%[dst_ptr])                 \n\t"

      "daddiu     %[src0_ptr],     %[src0_ptr],       0x08          \n\t"
      "daddiu     %[src1_ptr],     %[src1_ptr],       0x08          \n\t"
      "daddiu     %[dst_ptr],      %[dst_ptr],        0x08          \n\t"
      "daddi      %[width],        %[width],         -0x02          \n\t"
      "bnez       %[width],        1b                               \n\t"
      : [src0_hi] "=&f"(src0_hi), [src0_lo] "=&f"(src0_lo),
        [src1_hi] "=&f"(src1_hi), [src1_lo] "=&f"(src1_lo),
        [dest_hi] "=&f"(dest_hi), [dest_lo] "=&f"(dest_lo), [src0] "=&f"(src0),
        [src1] "=&f"(src1), [dest] "=&f"(dest)
      : [src0_ptr] "r"(src_argb0), [src1_ptr] "r"(src_argb1),
        [dst_ptr] "r"(dst_argb), [width] "r"(width), [mask] "f"(mask)
      : "memory");
}

void ARGBAddRow_MMI(const uint8_t* src_argb0,
                    const uint8_t* src_argb1,
                    uint8_t* dst_argb,
                    int width) {
  uint64_t src0, src1, dest;

  __asm__ volatile(
      "1:                                                           \n\t"
      "gsldlc1    %[src0],         0x07(%[src0_ptr])                \n\t"
      "gsldrc1    %[src0],         0x00(%[src0_ptr])                \n\t"
      "gsldlc1    %[src1],         0x07(%[src1_ptr])                \n\t"
      "gsldrc1    %[src1],         0x00(%[src1_ptr])                \n\t"
      "paddusb    %[dest],         %[src0],           %[src1]       \n\t"
      "gssdlc1    %[dest],         0x07(%[dst_ptr])                 \n\t"
      "gssdrc1    %[dest],         0x00(%[dst_ptr])                 \n\t"

      "daddiu     %[src0_ptr],     %[src0_ptr],       0x08          \n\t"
      "daddiu     %[src1_ptr],     %[src1_ptr],       0x08          \n\t"
      "daddiu     %[dst_ptr],      %[dst_ptr],        0x08          \n\t"
      "daddi      %[width],        %[width],         -0x02          \n\t"
      "bnez       %[width],        1b                               \n\t"
      : [src0] "=&f"(src0), [src1] "=&f"(src1), [dest] "=&f"(dest)
      : [src0_ptr] "r"(src_argb0), [src1_ptr] "r"(src_argb1),
        [dst_ptr] "r"(dst_argb), [width] "r"(width)
      : "memory");
}

void ARGBSubtractRow_MMI(const uint8_t* src_argb0,
                         const uint8_t* src_argb1,
                         uint8_t* dst_argb,
                         int width) {
  uint64_t src0, src1, dest;

  __asm__ volatile(
      "1:                                                           \n\t"
      "gsldlc1    %[src0],         0x07(%[src0_ptr])                \n\t"
      "gsldrc1    %[src0],         0x00(%[src0_ptr])                \n\t"
      "gsldlc1    %[src1],         0x07(%[src1_ptr])                \n\t"
      "gsldrc1    %[src1],         0x00(%[src1_ptr])                \n\t"
      "psubusb    %[dest],         %[src0],           %[src1]       \n\t"
      "gssdlc1    %[dest],         0x07(%[dst_ptr])                 \n\t"
      "gssdrc1    %[dest],         0x00(%[dst_ptr])                 \n\t"

      "daddiu     %[src0_ptr],     %[src0_ptr],       0x08          \n\t"
      "daddiu     %[src1_ptr],     %[src1_ptr],       0x08          \n\t"
      "daddiu     %[dst_ptr],      %[dst_ptr],        0x08          \n\t"
      "daddi      %[width],        %[width],         -0x02          \n\t"
      "bnez       %[width],        1b                               \n\t"
      : [src0] "=&f"(src0), [src1] "=&f"(src1), [dest] "=&f"(dest)
      : [src0_ptr] "r"(src_argb0), [src1_ptr] "r"(src_argb1),
        [dst_ptr] "r"(dst_argb), [width] "r"(width)
      : "memory");
}

// Sobel functions which mimics SSSE3.
void SobelXRow_MMI(const uint8_t* src_y0,
                   const uint8_t* src_y1,
                   const uint8_t* src_y2,
                   uint8_t* dst_sobelx,
                   int width) {
  uint64_t y00 = 0, y10 = 0, y20 = 0;
  uint64_t y02 = 0, y12 = 0, y22 = 0;
  uint64_t zero = 0x0;
  uint64_t sobel = 0x0;
  __asm__ volatile(
      "1:	                                         \n\t"
      "gsldlc1   %[y00],        0x07(%[src_y0])          \n\t"  // a=src_y0[i]
      "gsldrc1   %[y00],        0x00(%[src_y0])          \n\t"
      "gsldlc1   %[y02],        0x09(%[src_y0])          \n\t"  // a_sub=src_y0[i+2]
      "gsldrc1   %[y02],        0x02(%[src_y0])          \n\t"

      "gsldlc1   %[y10],        0x07(%[src_y1])          \n\t"  // b=src_y1[i]
      "gsldrc1   %[y10],        0x00(%[src_y1])          \n\t"
      "gsldlc1   %[y12],        0x09(%[src_y1])          \n\t"  // b_sub=src_y1[i+2]
      "gsldrc1   %[y12],        0x02(%[src_y1])          \n\t"

      "gsldlc1   %[y20],        0x07(%[src_y2])          \n\t"  // c=src_y2[i]
      "gsldrc1   %[y20],        0x00(%[src_y2])          \n\t"
      "gsldlc1   %[y22],        0x09(%[src_y2])          \n\t"  // c_sub=src_y2[i+2]
      "gsldrc1   %[y22],        0x02(%[src_y2])          \n\t"

      "punpcklbh %[y00],        %[y00],          %[zero] \n\t"
      "punpcklbh %[y10],        %[y10],          %[zero] \n\t"
      "punpcklbh %[y20],        %[y20],          %[zero] \n\t"

      "punpcklbh %[y02],        %[y02],          %[zero] \n\t"
      "punpcklbh %[y12],        %[y12],          %[zero] \n\t"
      "punpcklbh %[y22],        %[y22],          %[zero] \n\t"

      "paddh     %[y00],        %[y00],          %[y10]  \n\t"  // a+b
      "paddh     %[y20],        %[y20],          %[y10]  \n\t"  // c+b
      "paddh     %[y00],        %[y00],          %[y20]  \n\t"  // a+2b+c

      "paddh     %[y02],        %[y02],          %[y12]  \n\t"  // a_sub+b_sub
      "paddh     %[y22],        %[y22],          %[y12]  \n\t"  // c_sub+b_sub
      "paddh     %[y02],        %[y02],          %[y22]  \n\t"  // a_sub+2b_sub+c_sub

      "pmaxsh    %[y10],        %[y00],          %[y02]  \n\t"
      "pminsh    %[y20],        %[y00],          %[y02]  \n\t"
      "psubh     %[sobel],      %[y10],          %[y20]  \n\t"  // Abs

      "gsldlc1   %[y00],        0x0B(%[src_y0])          \n\t"
      "gsldrc1   %[y00],        0x04(%[src_y0])          \n\t"
      "gsldlc1   %[y02],        0x0D(%[src_y0])          \n\t"
      "gsldrc1   %[y02],        0x06(%[src_y0])          \n\t"

      "gsldlc1   %[y10],        0x0B(%[src_y1])          \n\t"
      "gsldrc1   %[y10],        0x04(%[src_y1])          \n\t"
      "gsldlc1   %[y12],        0x0D(%[src_y1])          \n\t"
      "gsldrc1   %[y12],        0x06(%[src_y1])          \n\t"

      "gsldlc1   %[y20],        0x0B(%[src_y2])          \n\t"
      "gsldrc1   %[y20],        0x04(%[src_y2])          \n\t"
      "gsldlc1   %[y22],        0x0D(%[src_y2])          \n\t"
      "gsldrc1   %[y22],        0x06(%[src_y2])          \n\t"

      "punpcklbh %[y00],        %[y00],          %[zero] \n\t"
      "punpcklbh %[y10],        %[y10],          %[zero] \n\t"
      "punpcklbh %[y20],        %[y20],          %[zero] \n\t"

      "punpcklbh %[y02],        %[y02],          %[zero] \n\t"
      "punpcklbh %[y12],        %[y12],          %[zero] \n\t"
      "punpcklbh %[y22],        %[y22],          %[zero] \n\t"

      "paddh     %[y00],        %[y00],          %[y10]  \n\t"
      "paddh     %[y20],        %[y20],          %[y10]  \n\t"
      "paddh     %[y00],        %[y00],          %[y20]  \n\t"

      "paddh     %[y02],        %[y02],          %[y12]  \n\t"
      "paddh     %[y22],        %[y22],          %[y12]  \n\t"
      "paddh     %[y02],        %[y02],          %[y22]  \n\t"

      "pmaxsh    %[y10],        %[y00],          %[y02]  \n\t"
      "pminsh    %[y20],        %[y00],          %[y02]  \n\t"
      "psubh     %[y00],        %[y10],          %[y20]  \n\t"

      "packushb  %[sobel],      %[sobel],        %[y00]  \n\t"  // clamp255
      "gssdrc1   %[sobel],      0(%[dst_sobelx])         \n\t"
      "gssdlc1   %[sobel],      7(%[dst_sobelx])         \n\t"

      "daddiu    %[src_y0],     %[src_y0],      8        \n\t"
      "daddiu    %[src_y1],     %[src_y1],      8        \n\t"
      "daddiu    %[src_y2],     %[src_y2],      8        \n\t"
      "daddiu    %[dst_sobelx], %[dst_sobelx],  8        \n\t"
      "daddiu    %[width],      %[width],      -8        \n\t"
      "bgtz      %[width],      1b                       \n\t"
      "nop                                               \n\t"
      : [sobel] "=&f"(sobel), [y00] "=&f"(y00), [y10] "=&f"(y10),
        [y20] "=&f"(y20), [y02] "=&f"(y02), [y12] "=&f"(y12), [y22] "=&f"(y22)
      : [src_y0] "r"(src_y0), [src_y1] "r"(src_y1), [src_y2] "r"(src_y2),
        [dst_sobelx] "r"(dst_sobelx), [width] "r"(width), [zero] "f"(zero)
      : "memory");
}

void SobelYRow_MMI(const uint8_t* src_y0,
                   const uint8_t* src_y1,
                   uint8_t* dst_sobely,
                   int width) {
  uint64_t y00 = 0, y01 = 0, y02 = 0;
  uint64_t y10 = 0, y11 = 0, y12 = 0;
  uint64_t zero = 0x0;
  uint64_t sobel = 0x0;
  __asm__ volatile(
      "1:	                                        \n\t"
      "gsldlc1   %[y00],        0x07(%[src_y0])         \n\t"  // a=src_y0[i]
      "gsldrc1   %[y00],        0x00(%[src_y0])         \n\t"
      "gsldlc1   %[y01],        0x08(%[src_y0])         \n\t"  // b=src_y0[i+1]
      "gsldrc1   %[y01],        0x01(%[src_y0])         \n\t"
      "gsldlc1   %[y02],        0x09(%[src_y0])         \n\t"  // c=src_y0[i+2]
      "gsldrc1   %[y02],        0x02(%[src_y0])         \n\t"

      "gsldlc1   %[y10],        0x07(%[src_y1])         \n\t"  // a_sub=src_y1[i]
      "gsldrc1   %[y10],        0x00(%[src_y1])         \n\t"
      "gsldlc1   %[y11],        0x08(%[src_y1])         \n\t"  // b_sub=src_y1[i+1]
      "gsldrc1   %[y11],        0x01(%[src_y1])         \n\t"
      "gsldlc1   %[y12],        0x09(%[src_y1])         \n\t"  // c_sub=src_y1[i+2]
      "gsldrc1   %[y12],        0x02(%[src_y1])         \n\t"

      "punpcklbh %[y00],        %[y00],         %[zero] \n\t"
      "punpcklbh %[y01],        %[y01],         %[zero] \n\t"
      "punpcklbh %[y02],        %[y02],         %[zero] \n\t"

      "punpcklbh %[y10],        %[y10],         %[zero] \n\t"
      "punpcklbh %[y11],        %[y11],         %[zero] \n\t"
      "punpcklbh %[y12],        %[y12],         %[zero] \n\t"

      "paddh     %[y00],        %[y00],         %[y01]  \n\t"  // a+b
      "paddh     %[y02],        %[y02],         %[y01]  \n\t"  // c+b
      "paddh     %[y00],        %[y00],         %[y02]  \n\t"  // a+2b+c

      "paddh     %[y10],        %[y10],         %[y11]  \n\t"  // a_sub+b_sub
      "paddh     %[y12],        %[y12],         %[y11]  \n\t"  // c_sub+b_sub
      "paddh     %[y10],        %[y10],         %[y12]  \n\t"  // a_sub+2b_sub+c_sub

      "pmaxsh    %[y02],        %[y00],         %[y10]  \n\t"
      "pminsh    %[y12],        %[y00],         %[y10]  \n\t"
      "psubh     %[sobel],      %[y02],         %[y12]  \n\t"  // Abs

      "gsldlc1   %[y00],        0x0B(%[src_y0])         \n\t"
      "gsldrc1   %[y00],        0x04(%[src_y0])         \n\t"
      "gsldlc1   %[y01],        0x0C(%[src_y0])         \n\t"
      "gsldrc1   %[y01],        0x05(%[src_y0])         \n\t"
      "gsldlc1   %[y02],        0x0D(%[src_y0])         \n\t"
      "gsldrc1   %[y02],        0x06(%[src_y0])         \n\t"

      "gsldlc1   %[y10],        0x0B(%[src_y1])         \n\t"
      "gsldrc1   %[y10],        0x04(%[src_y1])         \n\t"
      "gsldlc1   %[y11],        0x0C(%[src_y1])         \n\t"
      "gsldrc1   %[y11],        0x05(%[src_y1])         \n\t"
      "gsldlc1   %[y12],        0x0D(%[src_y1])         \n\t"
      "gsldrc1   %[y12],        0x06(%[src_y1])         \n\t"

      "punpcklbh %[y00],        %[y00],         %[zero] \n\t"
      "punpcklbh %[y01],        %[y01],         %[zero] \n\t"
      "punpcklbh %[y02],        %[y02],         %[zero] \n\t"

      "punpcklbh %[y10],        %[y10],         %[zero] \n\t"
      "punpcklbh %[y11],        %[y11],         %[zero] \n\t"
      "punpcklbh %[y12],        %[y12],         %[zero] \n\t"

      "paddh     %[y00],        %[y00],         %[y01]  \n\t"
      "paddh     %[y02],        %[y02],         %[y01]  \n\t"
      "paddh     %[y00],        %[y00],         %[y02]  \n\t"

      "paddh     %[y10],        %[y10],         %[y11]  \n\t"
      "paddh     %[y12],        %[y12],         %[y11]  \n\t"
      "paddh     %[y10],        %[y10],         %[y12]  \n\t"

      "pmaxsh    %[y02],        %[y00],         %[y10]  \n\t"
      "pminsh    %[y12],        %[y00],         %[y10]  \n\t"
      "psubh     %[y00],        %[y02],         %[y12]  \n\t"

      "packushb  %[sobel],      %[sobel],       %[y00]  \n\t"  // clamp255
      "gssdrc1   %[sobel],      0(%[dst_sobely])        \n\t"
      "gssdlc1   %[sobel],      7(%[dst_sobely])        \n\t"

      "daddiu    %[src_y0],     %[src_y0],      8       \n\t"
      "daddiu    %[src_y1],     %[src_y1],      8       \n\t"
      "daddiu    %[dst_sobely], %[dst_sobely],  8       \n\t"
      "daddiu    %[width],      %[width],      -8       \n\t"
      "bgtz      %[width],      1b                      \n\t"
      "nop                                              \n\t"
      : [sobel] "=&f"(sobel), [y00] "=&f"(y00), [y01] "=&f"(y01),
        [y02] "=&f"(y02), [y10] "=&f"(y10), [y11] "=&f"(y11), [y12] "=&f"(y12)
      : [src_y0] "r"(src_y0), [src_y1] "r"(src_y1),
        [dst_sobely] "r"(dst_sobely), [width] "r"(width), [zero] "f"(zero)
      : "memory");
}

void SobelRow_MMI(const uint8_t* src_sobelx,
                  const uint8_t* src_sobely,
                  uint8_t* dst_argb,
                  int width) {
  double temp[3];
  uint64_t c1 = 0xff000000ff000000;
  __asm__ volatile(
      "1:	                                          \n\t"
      "gsldlc1   %[t0],         0x07(%[src_sobelx])       \n\t"  // a=src_sobelx[i]
      "gsldrc1   %[t0],         0x00(%[src_sobelx])       \n\t"
      "gsldlc1   %[t1],         0x07(%[src_sobely])       \n\t"  // b=src_sobely[i]
      "gsldrc1   %[t1],         0x00(%[src_sobely])       \n\t"
      // s7 s6 s5 s4 s3 s2 s1 s0 = a+b
      "paddusb   %[t2] ,        %[t0],              %[t1] \n\t"

      // s3 s2 s1 s0->s3 s3 s2 s2 s1 s1 s0 s0
      "punpcklbh %[t0],         %[t2],              %[t2] \n\t"

      // s1 s1 s0 s0->s1 s2 s1 s1 s0 s0 s0 s0
      "punpcklbh %[t1],         %[t0],              %[t0] \n\t"
      "or        %[t1],         %[t1],              %[c1] \n\t"
      // 255 s1 s1 s1 s55 s0 s0 s0
      "gssdrc1   %[t1],         0x00(%[dst_argb])	  \n\t"
      "gssdlc1   %[t1],         0x07(%[dst_argb])         \n\t"

      // s3 s3 s2 s2->s3 s3 s3 s3 s2 s2 s2 s2
      "punpckhbh %[t1],         %[t0],              %[t0] \n\t"
      "or        %[t1],         %[t1],              %[c1] \n\t"
      // 255 s3 s3 s3 255 s2 s2 s2
      "gssdrc1   %[t1],         0x08(%[dst_argb])	  \n\t"
      "gssdlc1   %[t1],         0x0f(%[dst_argb])         \n\t"

      // s7 s6 s5 s4->s7 s7 s6 s6 s5 s5 s4 s4
      "punpckhbh %[t0],         %[t2],              %[t2] \n\t"

      // s5 s5 s4 s4->s5 s5 s5 s5 s4 s4 s4 s4
      "punpcklbh %[t1],         %[t0],              %[t0] \n\t"
      "or        %[t1],         %[t1],              %[c1] \n\t"
      "gssdrc1   %[t1],         0x10(%[dst_argb])	  \n\t"
      "gssdlc1   %[t1],         0x17(%[dst_argb])         \n\t"

      // s7 s7 s6 s6->s7 s7 s7 s7 s6 s6 s6 s6
      "punpckhbh %[t1],         %[t0],              %[t0] \n\t"
      "or        %[t1],         %[t1],              %[c1] \n\t"
      "gssdrc1   %[t1],         0x18(%[dst_argb])	  \n\t"
      "gssdlc1   %[t1],         0x1f(%[dst_argb])         \n\t"

      "daddiu    %[dst_argb],   %[dst_argb],        32    \n\t"
      "daddiu    %[src_sobelx], %[src_sobelx],      8     \n\t"
      "daddiu    %[src_sobely], %[src_sobely],      8     \n\t"
      "daddiu    %[width],      %[width],          -8     \n\t"
      "bgtz      %[width],      1b                        \n\t"
      "nop                                                \n\t"
      : [t0] "=&f"(temp[0]), [t1] "=&f"(temp[1]), [t2] "=&f"(temp[2])
      : [src_sobelx] "r"(src_sobelx), [src_sobely] "r"(src_sobely),
        [dst_argb] "r"(dst_argb), [width] "r"(width), [c1] "f"(c1)
      : "memory");
}

void SobelToPlaneRow_MMI(const uint8_t* src_sobelx,
                         const uint8_t* src_sobely,
                         uint8_t* dst_y,
                         int width) {
  uint64_t tr = 0;
  uint64_t tb = 0;
  __asm__ volatile(
      "1:	                                       \n\t"
      "gsldrc1 %[tr],         0x0(%[src_sobelx])       \n\t"
      "gsldlc1 %[tr],         0x7(%[src_sobelx])       \n\t"  // r=src_sobelx[i]
      "gsldrc1 %[tb],         0x0(%[src_sobely])       \n\t"
      "gsldlc1 %[tb],         0x7(%[src_sobely])       \n\t"  // b=src_sobely[i]
      "paddusb %[tr],         %[tr],             %[tb] \n\t"  // g
      "gssdrc1 %[tr],         0x0(%[dst_y])	       \n\t"
      "gssdlc1 %[tr],         0x7(%[dst_y])            \n\t"

      "daddiu  %[dst_y],      %[dst_y],          8     \n\t"
      "daddiu  %[src_sobelx], %[src_sobelx],     8     \n\t"
      "daddiu  %[src_sobely], %[src_sobely],     8     \n\t"
      "daddiu  %[width],      %[width],         -8     \n\t"
      "bgtz    %[width],      1b                       \n\t"
      "nop                                             \n\t"
      : [tr] "=&f"(tr), [tb] "=&f"(tb)
      : [src_sobelx] "r"(src_sobelx), [src_sobely] "r"(src_sobely),
        [dst_y] "r"(dst_y), [width] "r"(width)
      : "memory");
}

void SobelXYRow_MMI(const uint8_t* src_sobelx,
                    const uint8_t* src_sobely,
                    uint8_t* dst_argb,
                    int width) {
  uint64_t temp[3];
  uint64_t result = 0;
  uint64_t gb = 0;
  uint64_t cr = 0;
  uint64_t c1 = 0xffffffffffffffff;
  __asm__ volatile(
      "1:	                                          \n\t"
      "gsldlc1   %[tr],         0x07(%[src_sobelx])       \n\t"  // r=src_sobelx[i]
      "gsldrc1   %[tr],         0x00(%[src_sobelx])       \n\t"
      "gsldlc1   %[tb],         0x07(%[src_sobely])       \n\t"  // b=src_sobely[i]
      "gsldrc1   %[tb],         0x00(%[src_sobely])       \n\t"
      "paddusb   %[tg] ,        %[tr],              %[tb] \n\t"  // g

      // g3 b3 g2 b2 g1 b1 g0 b0
      "punpcklbh %[gb],         %[tb],              %[tg] \n\t"
      // c3 r3 r2 r2 c1 r1 c0 r0
      "punpcklbh %[cr],         %[tr],              %[c1] \n\t"
      // c1 r1 g1 b1 c0 r0 g0 b0
      "punpcklhw %[result],     %[gb],              %[cr] \n\t"
      "gssdrc1   %[result],     0x00(%[dst_argb])	  \n\t"
      "gssdlc1   %[result],     0x07(%[dst_argb])         \n\t"
      // c3 r3 g3 b3 c2 r2 g2 b2
      "punpckhhw %[result],     %[gb],              %[cr] \n\t"
      "gssdrc1   %[result],     0x08(%[dst_argb])	  \n\t"
      "gssdlc1   %[result],     0x0f(%[dst_argb])         \n\t"

      // g7 b7 g6 b6 g5 b5 g4 b4
      "punpckhbh %[gb],         %[tb],              %[tg] \n\t"
      // c7 r7 c6 r6 c5 r5 c4 r4
      "punpckhbh %[cr],         %[tr],              %[c1] \n\t"
      // c5 r5 g5 b5 c4 r4 g4 b4
      "punpcklhw %[result],     %[gb],              %[cr] \n\t"
      "gssdrc1   %[result],     0x10(%[dst_argb])	  \n\t"
      "gssdlc1   %[result],     0x17(%[dst_argb])         \n\t"
      // c7 r7 g7 b7 c6 r6 g6 b6
      "punpckhhw %[result],     %[gb],              %[cr] \n\t"
      "gssdrc1   %[result],     0x18(%[dst_argb])	  \n\t"
      "gssdlc1   %[result],     0x1f(%[dst_argb])         \n\t"

      "daddiu    %[dst_argb],   %[dst_argb],        32    \n\t"
      "daddiu    %[src_sobelx], %[src_sobelx],      8     \n\t"
      "daddiu    %[src_sobely], %[src_sobely],      8     \n\t"
      "daddiu    %[width],      %[width],          -8     \n\t"
      "bgtz      %[width],      1b                        \n\t"
      "nop                                                \n\t"
      : [tr] "=&f"(temp[0]), [tb] "=&f"(temp[1]), [tg] "=&f"(temp[2]),
        [gb] "=&f"(gb), [cr] "=&f"(cr), [result] "=&f"(result)
      : [src_sobelx] "r"(src_sobelx), [src_sobely] "r"(src_sobely),
        [dst_argb] "r"(dst_argb), [width] "r"(width), [c1] "f"(c1)
      : "memory");
}

void J400ToARGBRow_MMI(const uint8_t* src_y, uint8_t* dst_argb, int width) {
  // Copy a Y to RGB.
  uint64_t src, dest;
  const uint64_t mask0 = 0x00ffffff00ffffffULL;
  const uint64_t mask1 = ~mask0;

  __asm__ volatile(
      "1:                                                           \n\t"
      "gslwlc1    %[src],          0x03(%[src_ptr])                 \n\t"
      "gslwrc1    %[src],          0x00(%[src_ptr])                 \n\t"
      "punpcklbh  %[src],          %[src],            %[src]        \n\t"
      "punpcklhw  %[dest],         %[src],            %[src]        \n\t"
      "and        %[dest],         %[dest],           %[mask0]      \n\t"
      "or         %[dest],         %[dest],           %[mask1]      \n\t"
      "gssdrc1    %[dest],         0x00(%[dst_ptr])                 \n\t"
      "gssdlc1    %[dest],         0x07(%[dst_ptr])                 \n\t"

      "punpckhhw  %[dest],         %[src],            %[src]        \n\t"
      "and        %[dest],         %[dest],           %[mask0]      \n\t"
      "or         %[dest],         %[dest],           %[mask1]      \n\t"
      "gssdrc1    %[dest],         0x08(%[dst_ptr])                 \n\t"
      "gssdlc1    %[dest],         0x0f(%[dst_ptr])                 \n\t"

      "daddiu     %[src_ptr],      %[src_ptr],        0x04          \n\t"
      "daddiu     %[dst_ptr],      %[dst_ptr],        0x10          \n\t"
      "daddi      %[width],        %[width],         -0x04          \n\t"
      "bnez       %[width],        1b                               \n\t"
      : [src] "=&f"(src), [dest] "=&f"(dest)
      : [src_ptr] "r"(src_y), [dst_ptr] "r"(dst_argb), [mask0] "f"(mask0),
        [mask1] "f"(mask1), [width] "r"(width)
      : "memory");
}

void I400ToARGBRow_MMI(const uint8_t* src_y, uint8_t* rgb_buf, int width) {
  uint64_t src, src_lo, src_hi, dest, dest_lo, dest_hi;
  const uint64_t mask0 = 0x0;
  const uint64_t mask1 = 0x55;
  const uint64_t mask2 = 0xAA;
  const uint64_t mask3 = 0xFF;
  const uint64_t mask4 = 0x4A354A354A354A35ULL;
  const uint64_t mask5 = 0x0488048804880488ULL;
  const uint64_t shift0 = 0x08;
  const uint64_t shift1 = 0x06;

  __asm__ volatile(
      "1:                                                           \n\t"
      "gsldlc1    %[src],          0x07(%[src_ptr])                 \n\t"
      "gsldrc1    %[src],          0x00(%[src_ptr])                 \n\t"
      "punpcklbh  %[src_lo],       %[src],            %[mask0]      \n\t"
      "punpckhbh  %[src_hi],       %[src],            %[mask0]      \n\t"

      "pshufh     %[src],          %[src_lo],         %[mask0]      \n\t"
      "psllh      %[dest_lo],      %[src],            %[shift0]     \n\t"
      "paddush    %[dest_lo],      %[dest_lo],        %[src]        \n\t"
      "pmulhuh    %[dest_lo],      %[dest_lo],        %[mask4]      \n\t"
      "psubh      %[dest_lo],      %[dest_lo],        %[mask5]      \n\t"
      "psrah      %[dest_lo],      %[dest_lo],        %[shift1]     \n\t"
      "pinsrh_3   %[dest_lo],      %[dest_lo],        %[mask3]      \n\t"
      "pshufh     %[src],          %[src_lo],         %[mask1]      \n\t"
      "psllh      %[dest_hi],      %[src],            %[shift0]     \n\t"
      "paddush    %[dest_hi],      %[dest_hi],        %[src]        \n\t"
      "pmulhuh    %[dest_hi],      %[dest_hi],        %[mask4]      \n\t"
      "psubh      %[dest_hi],      %[dest_hi],        %[mask5]      \n\t"
      "psrah      %[dest_hi],      %[dest_hi],        %[shift1]     \n\t"
      "pinsrh_3   %[dest_hi],      %[dest_hi],        %[mask3]      \n\t"
      "packushb   %[dest],         %[dest_lo],        %[dest_hi]    \n\t"
      "gssdlc1    %[dest],         0x07(%[dst_ptr])                 \n\t"
      "gssdrc1    %[dest],         0x00(%[dst_ptr])                 \n\t"

      "pshufh     %[src],          %[src_lo],         %[mask2]      \n\t"
      "psllh      %[dest_lo],      %[src],            %[shift0]     \n\t"
      "paddush    %[dest_lo],      %[dest_lo],        %[src]        \n\t"
      "pmulhuh    %[dest_lo],      %[dest_lo],        %[mask4]      \n\t"
      "psubh      %[dest_lo],      %[dest_lo],        %[mask5]      \n\t"
      "psrah      %[dest_lo],      %[dest_lo],        %[shift1]     \n\t"
      "pinsrh_3   %[dest_lo],      %[dest_lo],        %[mask3]      \n\t"
      "pshufh     %[src],          %[src_lo],         %[mask3]      \n\t"
      "psllh      %[dest_hi],      %[src],            %[shift0]     \n\t"
      "paddush    %[dest_hi],      %[dest_hi],        %[src]        \n\t"
      "pmulhuh    %[dest_hi],      %[dest_hi],        %[mask4]      \n\t"
      "psubh      %[dest_hi],      %[dest_hi],        %[mask5]      \n\t"
      "psrah      %[dest_hi],      %[dest_hi],        %[shift1]     \n\t"
      "pinsrh_3   %[dest_hi],      %[dest_hi],        %[mask3]      \n\t"
      "packushb   %[dest],         %[dest_lo],        %[dest_hi]    \n\t"
      "gssdlc1    %[dest],         0x0f(%[dst_ptr])                 \n\t"
      "gssdrc1    %[dest],         0x08(%[dst_ptr])                 \n\t"

      "pshufh     %[src],          %[src_hi],         %[mask0]      \n\t"
      "psllh      %[dest_lo],      %[src],            %[shift0]     \n\t"
      "paddush    %[dest_lo],      %[dest_lo],        %[src]        \n\t"
      "pmulhuh    %[dest_lo],      %[dest_lo],        %[mask4]      \n\t"
      "psubh      %[dest_lo],      %[dest_lo],        %[mask5]      \n\t"
      "psrah      %[dest_lo],      %[dest_lo],        %[shift1]     \n\t"
      "pinsrh_3   %[dest_lo],      %[dest_lo],        %[mask3]      \n\t"
      "pshufh     %[src],          %[src_hi],         %[mask1]      \n\t"
      "psllh      %[dest_hi],      %[src],            %[shift0]     \n\t"
      "paddush    %[dest_hi],      %[dest_hi],        %[src]        \n\t"
      "pmulhuh    %[dest_hi],      %[dest_hi],        %[mask4]      \n\t"
      "psubh      %[dest_hi],      %[dest_hi],        %[mask5]      \n\t"
      "psrah      %[dest_hi],      %[dest_hi],        %[shift1]     \n\t"
      "pinsrh_3   %[dest_hi],      %[dest_hi],        %[mask3]      \n\t"
      "packushb   %[dest],         %[dest_lo],        %[dest_hi]    \n\t"
      "gssdlc1    %[dest],         0x17(%[dst_ptr])                 \n\t"
      "gssdrc1    %[dest],         0x10(%[dst_ptr])                 \n\t"

      "pshufh     %[src],          %[src_hi],         %[mask2]      \n\t"
      "psllh      %[dest_lo],      %[src],            %[shift0]     \n\t"
      "paddush    %[dest_lo],      %[dest_lo],        %[src]        \n\t"
      "pmulhuh    %[dest_lo],      %[dest_lo],        %[mask4]      \n\t"
      "psubh      %[dest_lo],      %[dest_lo],        %[mask5]      \n\t"
      "psrah      %[dest_lo],      %[dest_lo],        %[shift1]     \n\t"
      "pinsrh_3   %[dest_lo],      %[dest_lo],        %[mask3]      \n\t"
      "pshufh     %[src],          %[src_hi],         %[mask3]      \n\t"
      "psllh      %[dest_hi],      %[src],            %[shift0]     \n\t"
      "paddush    %[dest_hi],      %[dest_hi],        %[src]        \n\t"
      "pmulhuh    %[dest_hi],      %[dest_hi],        %[mask4]      \n\t"
      "psubh      %[dest_hi],      %[dest_hi],        %[mask5]      \n\t"
      "psrah      %[dest_hi],      %[dest_hi],        %[shift1]     \n\t"
      "pinsrh_3   %[dest_hi],      %[dest_hi],        %[mask3]      \n\t"
      "packushb   %[dest],         %[dest_lo],        %[dest_hi]    \n\t"
      "gssdlc1    %[dest],         0x1f(%[dst_ptr])                 \n\t"
      "gssdrc1    %[dest],         0x18(%[dst_ptr])                 \n\t"

      "daddi      %[src_ptr],      %[src_ptr],        0x08          \n\t"
      "daddiu     %[dst_ptr],      %[dst_ptr],        0x20          \n\t"
      "daddi      %[width],        %[width],         -0x08          \n\t"
      "bnez       %[width],        1b                               \n\t"
      : [src] "=&f"(src), [dest] "=&f"(dest), [src_hi] "=&f"(src_hi),
        [src_lo] "=&f"(src_lo), [dest_hi] "=&f"(dest_hi),
        [dest_lo] "=&f"(dest_lo)
      : [src_ptr] "r"(src_y), [dst_ptr] "r"(rgb_buf), [mask0] "f"(mask0),
        [mask1] "f"(mask1), [mask2] "f"(mask2), [mask3] "f"(mask3),
        [mask4] "f"(mask4), [mask5] "f"(mask5), [shift0] "f"(shift0),
        [shift1] "f"(shift1), [width] "r"(width)
      : "memory");
}

void MirrorRow_MMI(const uint8_t* src, uint8_t* dst, int width) {
  uint64_t source, src0, src1, dest;
  const uint64_t mask0 = 0x0;
  const uint64_t mask1 = 0x1b;

  src += width - 1;
  __asm__ volatile(
      "1:                                                           \n\t"
      "gsldlc1    %[source],       0(%[src_ptr])                    \n\t"
      "gsldrc1    %[source],       -7(%[src_ptr])                   \n\t"
      "punpcklbh  %[src0],         %[source],         %[mask0]      \n\t"
      "pshufh     %[src0],         %[src0],           %[mask1]      \n\t"
      "punpckhbh  %[src1],         %[source],         %[mask0]      \n\t"
      "pshufh     %[src1],         %[src1],           %[mask1]      \n\t"
      "packushb   %[dest],         %[src1],           %[src0]       \n\t"

      "gssdlc1    %[dest],         0x07(%[dst_ptr])                 \n\t"
      "gssdrc1    %[dest],         0x00(%[dst_ptr])                 \n\t"

      "daddi      %[src_ptr],      %[src_ptr],       -0x08          \n\t"
      "daddiu     %[dst_ptr],      %[dst_ptr],        0x08          \n\t"
      "daddi      %[width],        %[width],         -0x08          \n\t"
      "bnez       %[width],        1b                               \n\t"
      : [source] "=&f"(source), [dest] "=&f"(dest), [src0] "=&f"(src0),
        [src1] "=&f"(src1)
      : [src_ptr] "r"(src), [dst_ptr] "r"(dst), [mask0] "f"(mask0),
        [mask1] "f"(mask1), [width] "r"(width)
      : "memory");
}

void MirrorUVRow_MMI(const uint8_t* src_uv,
                     uint8_t* dst_u,
                     uint8_t* dst_v,
                     int width) {
  uint64_t src0, src1, dest0, dest1;
  const uint64_t mask0 = 0x00ff00ff00ff00ffULL;
  const uint64_t mask1 = 0x1b;
  const uint64_t shift = 0x08;

  src_uv += (width - 1) << 1;

  __asm__ volatile(
      "1:                                                           \n\t"
      "gsldlc1    %[src0],         1(%[src_ptr])                    \n\t"
      "gsldrc1    %[src0],         -6(%[src_ptr])                   \n\t"
      "gsldlc1    %[src1],         -7(%[src_ptr])                   \n\t"
      "gsldrc1    %[src1],         -14(%[src_ptr])                  \n\t"

      "and        %[dest0],        %[src0],           %[mask0]      \n\t"
      "pshufh     %[dest0],        %[dest0],          %[mask1]      \n\t"
      "and        %[dest1],        %[src1],           %[mask0]      \n\t"
      "pshufh     %[dest1],        %[dest1],          %[mask1]      \n\t"
      "packushb   %[dest0],        %[dest0],          %[dest1]      \n\t"
      "gssdlc1    %[dest0],        0x07(%[dstu_ptr])                \n\t"
      "gssdrc1    %[dest0],        0x00(%[dstu_ptr])                \n\t"

      "psrlh      %[dest0],        %[src0],           %[shift]      \n\t"
      "pshufh     %[dest0],        %[dest0],          %[mask1]      \n\t"
      "psrlh      %[dest1],        %[src1],           %[shift]      \n\t"
      "pshufh     %[dest1],        %[dest1],          %[mask1]      \n\t"
      "packushb   %[dest0],        %[dest0],          %[dest1]      \n\t"
      "gssdlc1    %[dest0],        0x07(%[dstv_ptr])                \n\t"
      "gssdrc1    %[dest0],        0x00(%[dstv_ptr])                \n\t"

      "daddi      %[src_ptr],      %[src_ptr],       -0x10          \n\t"
      "daddiu     %[dstu_ptr],     %[dstu_ptr],       0x08          \n\t"
      "daddiu     %[dstv_ptr],     %[dstv_ptr],       0x08          \n\t"
      "daddi      %[width],        %[width],         -0x08          \n\t"
      "bnez       %[width],        1b                               \n\t"
      : [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [src0] "=&f"(src0),
        [src1] "=&f"(src1)
      : [src_ptr] "r"(src_uv), [dstu_ptr] "r"(dst_u), [dstv_ptr] "r"(dst_v),
        [width] "r"(width), [mask0] "f"(mask0), [mask1] "f"(mask1),
        [shift] "f"(shift)
      : "memory");
}

void ARGBMirrorRow_MMI(const uint8_t* src, uint8_t* dst, int width) {
  src += (width - 1) * 4;
  uint64_t temp = 0x0;
  uint64_t shuff = 0x4e;  // 01 00 11 10
  __asm__ volatile(
      "1:                                      \n\t"
      "gsldlc1 %[temp],  3(%[src])     	       \n\t"
      "gsldrc1 %[temp], -4(%[src])     	       \n\t"
      "pshufh  %[temp],  %[temp],    %[shuff]  \n\t"
      "gssdrc1 %[temp],  0x0(%[dst])           \n\t"
      "gssdlc1 %[temp],  0x7(%[dst])           \n\t"

      "daddiu  %[src],   %[src],    -0x08      \n\t"
      "daddiu  %[dst],   %[dst],     0x08      \n\t"
      "daddiu  %[width], %[width],  -0x02      \n\t"
      "bnez    %[width], 1b                    \n\t"
      : [temp] "=&f"(temp)
      : [src] "r"(src), [dst] "r"(dst), [width] "r"(width), [shuff] "f"(shuff)
      : "memory");
}

void SplitUVRow_MMI(const uint8_t* src_uv,
                    uint8_t* dst_u,
                    uint8_t* dst_v,
                    int width) {
  uint64_t c0 = 0x00ff00ff00ff00ff;
  uint64_t temp[4];
  uint64_t shift = 0x08;
  __asm__ volatile(
      "1:	                                    \n\t"
      "gsldrc1  %[t0],     0x00(%[src_uv])          \n\t"
      "gsldlc1  %[t0],     0x07(%[src_uv])          \n\t"
      "gsldrc1  %[t1],     0x08(%[src_uv])          \n\t"
      "gsldlc1  %[t1],     0x0f(%[src_uv])          \n\t"

      "and      %[t2],     %[t0],          %[c0]    \n\t"
      "and      %[t3],     %[t1],          %[c0]    \n\t"
      "packushb %[t2],     %[t2],          %[t3]    \n\t"
      "gssdrc1  %[t2],     0x0(%[dst_u])	    \n\t"
      "gssdlc1  %[t2],     0x7(%[dst_u])            \n\t"

      "psrlh    %[t2],     %[t0],          %[shift] \n\t"
      "psrlh    %[t3],     %[t1],          %[shift] \n\t"
      "packushb %[t2],     %[t2],          %[t3]    \n\t"
      "gssdrc1  %[t2],     0x0(%[dst_v])            \n\t"
      "gssdlc1  %[t2],     0x7(%[dst_v])            \n\t"

      "daddiu   %[src_uv], %[src_uv],      16       \n\t"
      "daddiu   %[dst_u],  %[dst_u],       8        \n\t"
      "daddiu   %[dst_v],  %[dst_v],       8        \n\t"
      "daddiu   %[width],  %[width],      -8        \n\t"
      "bgtz     %[width],  1b                       \n\t"
      "nop                                          \n\t"
      : [t0] "=&f"(temp[0]), [t1] "=&f"(temp[1]), [t2] "=&f"(temp[2]),
        [t3] "=&f"(temp[3])
      : [src_uv] "r"(src_uv), [dst_u] "r"(dst_u), [dst_v] "r"(dst_v),
        [width] "r"(width), [c0] "f"(c0), [shift] "f"(shift)
      : "memory");
}

void MergeUVRow_MMI(const uint8_t* src_u,
                    const uint8_t* src_v,
                    uint8_t* dst_uv,
                    int width) {
  uint64_t temp[3];
  __asm__ volatile(
      "1:	                                 \n\t"
      "gsldrc1   %[t0],     0x0(%[src_u])        \n\t"
      "gsldlc1   %[t0],     0x7(%[src_u])        \n\t"
      "gsldrc1   %[t1],     0x0(%[src_v])        \n\t"
      "gsldlc1   %[t1],     0x7(%[src_v])        \n\t"
      "punpcklbh %[t2],     %[t0],         %[t1] \n\t"
      "gssdrc1   %[t2],     0x0(%[dst_uv])	 \n\t"
      "gssdlc1   %[t2],     0x7(%[dst_uv])       \n\t"
      "punpckhbh %[t2],     %[t0],         %[t1] \n\t"
      "gssdrc1   %[t2],     0x8(%[dst_uv])	 \n\t"
      "gssdlc1   %[t2],     0xf(%[dst_uv])       \n\t"

      "daddiu    %[src_u],  %[src_u],      8     \n\t"
      "daddiu    %[src_v],  %[src_v],      8     \n\t"
      "daddiu    %[dst_uv], %[dst_uv],     16    \n\t"
      "daddiu    %[width],  %[width],     -8     \n\t"
      "bgtz      %[width],  1b                   \n\t"
      "nop                                       \n\t"
      : [t0] "=&f"(temp[0]), [t1] "=&f"(temp[1]), [t2] "=&f"(temp[2])
      : [dst_uv] "r"(dst_uv), [src_u] "r"(src_u), [src_v] "r"(src_v),
        [width] "r"(width)
      : "memory");
}

void SplitRGBRow_MMI(const uint8_t* src_rgb,
                     uint8_t* dst_r,
                     uint8_t* dst_g,
                     uint8_t* dst_b,
                     int width) {
  uint64_t src[4];
  uint64_t dest_hi, dest_lo, dest;

  __asm__ volatile(
      "1:                                                           \n\t"
      "gslwlc1    %[src0],         0x03(%[src_ptr])                 \n\t"
      "gslwrc1    %[src0],         0x00(%[src_ptr])                 \n\t"
      "gslwlc1    %[src1],         0x06(%[src_ptr])                 \n\t"
      "gslwrc1    %[src1],         0x03(%[src_ptr])                 \n\t"
      "punpcklbh  %[dest_lo],      %[src0],           %[src1]       \n\t"
      "gslwlc1    %[src2],         0x09(%[src_ptr])                 \n\t"
      "gslwrc1    %[src2],         0x06(%[src_ptr])                 \n\t"
      "gslwlc1    %[src3],         0x0c(%[src_ptr])                 \n\t"
      "gslwrc1    %[src3],         0x09(%[src_ptr])                 \n\t"
      "punpcklbh  %[dest_hi],      %[src2],           %[src3]       \n\t"

      "punpcklhw  %[dest],         %[dest_lo],        %[dest_hi]    \n\t"
      "gsswlc1    %[dest],         0x03(%[dstr_ptr])                \n\t"
      "gsswrc1    %[dest],         0x00(%[dstr_ptr])                \n\t"
      "punpckhwd  %[dest],         %[dest],           %[dest]       \n\t"
      "gsswlc1    %[dest],         0x03(%[dstg_ptr])                \n\t"
      "gsswrc1    %[dest],         0x00(%[dstg_ptr])                \n\t"
      "punpckhhw  %[dest],         %[dest_lo],        %[dest_hi]    \n\t"
      "gsswlc1    %[dest],         0x03(%[dstb_ptr])                \n\t"
      "gsswrc1    %[dest],         0x00(%[dstb_ptr])                \n\t"

      "daddiu     %[src_ptr],      %[src_ptr],        0x0c          \n\t"
      "daddiu     %[dstr_ptr],     %[dstr_ptr],       0x04          \n\t"
      "daddiu     %[dstg_ptr],     %[dstg_ptr],       0x04          \n\t"
      "daddiu     %[dstb_ptr],     %[dstb_ptr],       0x04          \n\t"
      "daddi      %[width],        %[width],         -0x04          \n\t"
      "bnez       %[width],        1b                               \n\t"
      : [src0] "=&f"(src[0]), [src1] "=&f"(src[1]), [src2] "=&f"(src[2]),
        [src3] "=&f"(src[3]), [dest_hi] "=&f"(dest_hi),
        [dest_lo] "=&f"(dest_lo), [dest] "=&f"(dest)
      : [src_ptr] "r"(src_rgb), [dstr_ptr] "r"(dst_r), [dstg_ptr] "r"(dst_g),
        [dstb_ptr] "r"(dst_b), [width] "r"(width)
      : "memory");
}

void MergeRGBRow_MMI(const uint8_t* src_r,
                     const uint8_t* src_g,
                     const uint8_t* src_b,
                     uint8_t* dst_rgb,
                     int width) {
  uint64_t srcr, srcg, srcb, dest;
  uint64_t srcrg_hi, srcrg_lo, srcbz_hi, srcbz_lo;
  const uint64_t temp = 0x0;

  __asm__ volatile(
      "1:                                                           \n\t"
      "gsldlc1    %[srcr],         0x07(%[srcr_ptr])                \n\t"
      "gsldrc1    %[srcr],         0x00(%[srcr_ptr])                \n\t"
      "gsldlc1    %[srcg],         0x07(%[srcg_ptr])                \n\t"
      "gsldrc1    %[srcg],         0x00(%[srcg_ptr])                \n\t"
      "punpcklbh  %[srcrg_lo],     %[srcr],           %[srcg]       \n\t"
      "punpckhbh  %[srcrg_hi],     %[srcr],           %[srcg]       \n\t"

      "gsldlc1    %[srcb],         0x07(%[srcb_ptr])                \n\t"
      "gsldrc1    %[srcb],         0x00(%[srcb_ptr])                \n\t"
      "punpcklbh  %[srcbz_lo],     %[srcb],           %[temp]       \n\t"
      "punpckhbh  %[srcbz_hi],     %[srcb],           %[temp]       \n\t"

      "punpcklhw  %[dest],         %[srcrg_lo],       %[srcbz_lo]   \n\t"
      "gsswlc1    %[dest],         0x03(%[dst_ptr])                 \n\t"
      "gsswrc1    %[dest],         0x00(%[dst_ptr])                 \n\t"
      "punpckhwd  %[dest],         %[dest],           %[dest]       \n\t"
      "gsswlc1    %[dest],         0x06(%[dst_ptr])                 \n\t"
      "gsswrc1    %[dest],         0x03(%[dst_ptr])                 \n\t"
      "punpckhhw  %[dest],         %[srcrg_lo],       %[srcbz_lo]   \n\t"
      "gsswlc1    %[dest],         0x09(%[dst_ptr])                 \n\t"
      "gsswrc1    %[dest],         0x06(%[dst_ptr])                 \n\t"
      "punpckhwd  %[dest],         %[dest],           %[dest]       \n\t"
      "gsswlc1    %[dest],         0x0c(%[dst_ptr])                 \n\t"
      "gsswrc1    %[dest],         0x09(%[dst_ptr])                 \n\t"
      "punpcklhw  %[dest],         %[srcrg_hi],       %[srcbz_hi]   \n\t"
      "gsswlc1    %[dest],         0x0f(%[dst_ptr])                 \n\t"
      "gsswrc1    %[dest],         0x0c(%[dst_ptr])                 \n\t"
      "punpckhwd  %[dest],         %[dest],           %[dest]       \n\t"
      "gsswlc1    %[dest],         0x12(%[dst_ptr])                 \n\t"
      "gsswrc1    %[dest],         0x0f(%[dst_ptr])                 \n\t"
      "punpckhhw  %[dest],         %[srcrg_hi],       %[srcbz_hi]   \n\t"
      "gsswlc1    %[dest],         0x15(%[dst_ptr])                 \n\t"
      "gsswrc1    %[dest],         0x12(%[dst_ptr])                 \n\t"
      "punpckhwd  %[dest],         %[dest],           %[dest]       \n\t"
      "gsswlc1    %[dest],         0x18(%[dst_ptr])                 \n\t"
      "gsswrc1    %[dest],         0x15(%[dst_ptr])                 \n\t"

      "daddiu     %[srcr_ptr],     %[srcr_ptr],       0x08          \n\t"
      "daddiu     %[srcg_ptr],     %[srcg_ptr],       0x08          \n\t"
      "daddiu     %[srcb_ptr],     %[srcb_ptr],       0x08          \n\t"
      "daddiu     %[dst_ptr],      %[dst_ptr],        0x18          \n\t"
      "daddi      %[width],        %[width],         -0x08          \n\t"
      "bnez       %[width],        1b                               \n\t"
      : [srcr] "=&f"(srcr), [srcg] "=&f"(srcg), [srcb] "=&f"(srcb),
        [dest] "=&f"(dest), [srcrg_hi] "=&f"(srcrg_hi),
        [srcrg_lo] "=&f"(srcrg_lo), [srcbz_hi] "=&f"(srcbz_hi),
        [srcbz_lo] "=&f"(srcbz_lo)
      : [srcr_ptr] "r"(src_r), [srcg_ptr] "r"(src_g), [srcb_ptr] "r"(src_b),
        [dst_ptr] "r"(dst_rgb), [width] "r"(width), [temp] "f"(temp)
      : "memory");
}

// Filter 2 rows of YUY2 UV's (422) into U and V (420).
void YUY2ToUVRow_MMI(const uint8_t* src_yuy2,
                     int src_stride_yuy2,
                     uint8_t* dst_u,
                     uint8_t* dst_v,
                     int width) {
  uint64_t c0 = 0xff00ff00ff00ff00;
  uint64_t c1 = 0x00ff00ff00ff00ff;
  uint64_t temp[3];
  uint64_t data[4];
  uint64_t shift = 0x08;
  uint64_t src_stride = 0x0;
  __asm__ volatile(
      "1:	                                                     \n\t"
      "gsldrc1  %[t0],         0x00(%[src_yuy2])                     \n\t"
      "gsldlc1  %[t0],         0x07(%[src_yuy2])                     \n\t"
      "daddu    %[src_stride], %[src_yuy2],       %[src_stride_yuy2] \n\t"
      "gsldrc1  %[t1],         0x00(%[src_stride])                   \n\t"
      "gsldlc1  %[t1],         0x07(%[src_stride])                   \n\t"
      "pavgb    %[t0],         %[t0],             %[t1]              \n\t"

      "gsldrc1  %[t2],         0x08(%[src_yuy2])                     \n\t"
      "gsldlc1  %[t2],         0x0f(%[src_yuy2])                     \n\t"
      "gsldrc1  %[t1],         0x08(%[src_stride])                   \n\t"
      "gsldlc1  %[t1],         0x0f(%[src_stride])                   \n\t"
      "pavgb    %[t1],         %[t2],             %[t1]              \n\t"

      "and      %[t0],         %[t0],             %[c0]              \n\t"
      "and      %[t1],         %[t1],             %[c0]              \n\t"
      "psrlh    %[t0],         %[t0],             %[shift]           \n\t"
      "psrlh    %[t1],         %[t1],             %[shift]           \n\t"
      "packushb %[t0],         %[t0],             %[t1]              \n\t"
      "mov.s    %[t1],         %[t0]                                 \n\t"
      "and      %[d0],         %[t0],             %[c1]              \n\t"
      "psrlh    %[d1],         %[t1],             %[shift]           \n\t"

      "gsldrc1  %[t0],         0x10(%[src_yuy2])                     \n\t"
      "gsldlc1  %[t0],         0x17(%[src_yuy2])                     \n\t"
      "gsldrc1  %[t1],         0x10(%[src_stride])                   \n\t"
      "gsldlc1  %[t1],         0x17(%[src_stride])                   \n\t"
      "pavgb    %[t0],         %[t0],              %[t1]             \n\t"

      "gsldrc1  %[t2],         0x18(%[src_yuy2])                     \n\t"
      "gsldlc1  %[t2],         0x1f(%[src_yuy2])                     \n\t"
      "gsldrc1  %[t1],         0x18(%[src_stride])                   \n\t"
      "gsldlc1  %[t1],         0x1f(%[src_stride])                   \n\t"
      "pavgb    %[t1],         %[t2],              %[t1]             \n\t"

      "and      %[t0],         %[t0],              %[c0]             \n\t"
      "and      %[t1],         %[t1],              %[c0]             \n\t"
      "psrlh    %[t0],         %[t0],              %[shift]          \n\t"
      "psrlh    %[t1],         %[t1],              %[shift]          \n\t"
      "packushb %[t0],         %[t0],              %[t1]             \n\t"
      "mov.s    %[t1],         %[t0]                                 \n\t"
      "and      %[d2],         %[t0],              %[c1]             \n\t"
      "psrlh    %[d3],         %[t1],              %[shift]          \n\t"

      "packushb %[d0],         %[d0],              %[d2]             \n\t"
      "packushb %[d1],         %[d1],              %[d3]             \n\t"
      "gssdrc1  %[d0],         0x0(%[dst_u])	                     \n\t"
      "gssdlc1  %[d0],         0x7(%[dst_u])                         \n\t"
      "gssdrc1  %[d1],         0x0(%[dst_v])	                     \n\t"
      "gssdlc1  %[d1],         0x7(%[dst_v])                         \n\t"
      "daddiu   %[src_yuy2],   %[src_yuy2],        32                \n\t"
      "daddiu   %[dst_u],      %[dst_u],           8                 \n\t"
      "daddiu   %[dst_v],      %[dst_v],           8                 \n\t"
      "daddiu   %[width],      %[width],          -16                \n\t"
      "bgtz     %[width],      1b                                    \n\t"
      "nop                                                           \n\t"
      : [t0] "=&f"(temp[0]), [t1] "=&f"(temp[1]), [t2] "=&f"(temp[2]),
        [d0] "=&f"(data[0]), [d1] "=&f"(data[1]), [d2] "=&f"(data[2]),
        [d3] "=&f"(data[3]), [src_stride] "=&r"(src_stride)
      : [src_yuy2] "r"(src_yuy2), [src_stride_yuy2] "r"(src_stride_yuy2),
        [dst_u] "r"(dst_u), [dst_v] "r"(dst_v), [width] "r"(width),
        [c0] "f"(c0), [c1] "f"(c1), [shift] "f"(shift)
      : "memory");
}

// Copy row of YUY2 UV's (422) into U and V (422).
void YUY2ToUV422Row_MMI(const uint8_t* src_yuy2,
                        uint8_t* dst_u,
                        uint8_t* dst_v,
                        int width) {
  uint64_t c0 = 0xff00ff00ff00ff00;
  uint64_t c1 = 0x00ff00ff00ff00ff;
  uint64_t temp[2];
  uint64_t data[4];
  uint64_t shift = 0x08;
  __asm__ volatile(
      "1:	                                        \n\t"
      "gsldrc1  %[t0],       0x00(%[src_yuy2])          \n\t"
      "gsldlc1  %[t0],       0x07(%[src_yuy2])          \n\t"
      "gsldrc1  %[t1],       0x08(%[src_yuy2])          \n\t"
      "gsldlc1  %[t1],       0x0f(%[src_yuy2])          \n\t"
      "and      %[t0],       %[t0],            %[c0]    \n\t"
      "and      %[t1],       %[t1],            %[c0]    \n\t"
      "psrlh    %[t0],       %[t0],            %[shift] \n\t"
      "psrlh    %[t1],       %[t1],            %[shift] \n\t"
      "packushb %[t0],       %[t0],            %[t1]    \n\t"
      "mov.s    %[t1],       %[t0]                      \n\t"
      "and      %[d0],       %[t0],            %[c1]    \n\t"
      "psrlh    %[d1],       %[t1],            %[shift] \n\t"

      "gsldrc1  %[t0],       0x10(%[src_yuy2])          \n\t"
      "gsldlc1  %[t0],       0x17(%[src_yuy2])          \n\t"
      "gsldrc1  %[t1],       0x18(%[src_yuy2])          \n\t"
      "gsldlc1  %[t1],       0x1f(%[src_yuy2])          \n\t"
      "and      %[t0],       %[t0],            %[c0]    \n\t"
      "and      %[t1],       %[t1],            %[c0]    \n\t"
      "psrlh    %[t0],       %[t0],            %[shift] \n\t"
      "psrlh    %[t1],       %[t1],            %[shift] \n\t"
      "packushb %[t0],       %[t0],            %[t1]    \n\t"
      "mov.s    %[t1],       %[t0]                      \n\t"
      "and      %[d2],       %[t0],            %[c1]    \n\t"
      "psrlh    %[d3],       %[t1],            %[shift] \n\t"

      "packushb %[d0],       %[d0],            %[d2]    \n\t"
      "packushb %[d1],       %[d1],            %[d3]    \n\t"
      "gssdrc1  %[d0],       0x0(%[dst_u])	        \n\t"
      "gssdlc1  %[d0],       0x7(%[dst_u])              \n\t"
      "gssdrc1  %[d1],       0x0(%[dst_v])	        \n\t"
      "gssdlc1  %[d1],       0x7(%[dst_v])              \n\t"
      "daddiu   %[src_yuy2], %[src_yuy2],      32       \n\t"
      "daddiu   %[dst_u],    %[dst_u],         8        \n\t"
      "daddiu   %[dst_v],    %[dst_v],         8        \n\t"
      "daddiu   %[width],    %[width],        -16       \n\t"
      "bgtz     %[width],    1b                         \n\t"
      "nop                                              \n\t"
      : [t0] "=&f"(temp[0]), [t1] "=&f"(temp[1]), [d0] "=&f"(data[0]),
        [d1] "=&f"(data[1]), [d2] "=&f"(data[2]), [d3] "=&f"(data[3])
      : [src_yuy2] "r"(src_yuy2), [dst_u] "r"(dst_u), [dst_v] "r"(dst_v),
        [width] "r"(width), [c0] "f"(c0), [c1] "f"(c1), [shift] "f"(shift)
      : "memory");
}

// Copy row of YUY2 Y's (422) into Y (420/422).
void YUY2ToYRow_MMI(const uint8_t* src_yuy2, uint8_t* dst_y, int width) {
  uint64_t c0 = 0x00ff00ff00ff00ff;
  uint64_t temp[2];
  __asm__ volatile(
      "1:	                                     \n\t"
      "gsldrc1  %[t0],       0x00(%[src_yuy2])       \n\t"
      "gsldlc1  %[t0],       0x07(%[src_yuy2])       \n\t"
      "gsldrc1  %[t1],       0x08(%[src_yuy2])       \n\t"
      "gsldlc1  %[t1],       0x0f(%[src_yuy2])       \n\t"
      "and      %[t0],       %[t0],            %[c0] \n\t"
      "and      %[t1],       %[t1],            %[c0] \n\t"
      "packushb %[t0],       %[t0],            %[t1] \n\t"
      "gssdrc1  %[t0],       0x0(%[dst_y])	     \n\t"
      "gssdlc1  %[t0],       0x7(%[dst_y])           \n\t"
      "daddiu   %[src_yuy2], %[src_yuy2],      16    \n\t"
      "daddiu   %[dst_y],    %[dst_y],         8     \n\t"
      "daddiu   %[width],    %[width],        -8     \n\t"
      "bgtz     %[width],    1b                      \n\t"
      "nop                                           \n\t"
      : [t0] "=&f"(temp[0]), [t1] "=&f"(temp[1])
      : [src_yuy2] "r"(src_yuy2), [dst_y] "r"(dst_y), [width] "r"(width),
        [c0] "f"(c0)
      : "memory");
}

// Filter 2 rows of UYVY UV's (422) into U and V (420).
void UYVYToUVRow_MMI(const uint8_t* src_uyvy,
                     int src_stride_uyvy,
                     uint8_t* dst_u,
                     uint8_t* dst_v,
                     int width) {
  // Output a row of UV values.
  uint64_t c0 = 0x00ff00ff00ff00ff;
  uint64_t temp[3];
  uint64_t data[4];
  uint64_t shift = 0x08;
  uint64_t src_stride = 0x0;
  __asm__ volatile(
      "1:	                                                      \n\t"
      "gsldrc1  %[t0],         0x00(%[src_uyvy])                      \n\t"
      "gsldlc1  %[t0],         0x07(%[src_uyvy])                      \n\t"
      "daddu    %[src_stride], %[src_uyvy],        %[src_stride_uyvy] \n\t"
      "gsldrc1  %[t1],         0x00(%[src_stride])                    \n\t"
      "gsldlc1  %[t1],         0x07(%[src_stride])                    \n\t"
      "pavgb    %[t0],         %[t0],              %[t1]              \n\t"

      "gsldrc1  %[t2],         0x08(%[src_uyvy])                      \n\t"
      "gsldlc1  %[t2],         0x0f(%[src_uyvy])                      \n\t"
      "gsldrc1  %[t1],         0x08(%[src_stride])                    \n\t"
      "gsldlc1  %[t1],         0x0f(%[src_stride])                    \n\t"
      "pavgb    %[t1],         %[t2],              %[t1]              \n\t"

      "and      %[t0],         %[t0],              %[c0]              \n\t"
      "and      %[t1],         %[t1],              %[c0]              \n\t"
      "packushb %[t0],         %[t0],              %[t1]              \n\t"
      "mov.s    %[t1],         %[t0]                                  \n\t"
      "and      %[d0],         %[t0],              %[c0]              \n\t"
      "psrlh    %[d1],         %[t1],              %[shift]           \n\t"

      "gsldrc1  %[t0],         0x10(%[src_uyvy])                      \n\t"
      "gsldlc1  %[t0],         0x17(%[src_uyvy])                      \n\t"
      "gsldrc1  %[t1],         0x10(%[src_stride])                    \n\t"
      "gsldlc1  %[t1],         0x17(%[src_stride])                    \n\t"
      "pavgb    %[t0],         %[t0],              %[t1]              \n\t"

      "gsldrc1  %[t2],         0x18(%[src_uyvy])                      \n\t"
      "gsldlc1  %[t2],         0x1f(%[src_uyvy])                      \n\t"
      "gsldrc1  %[t1],         0x18(%[src_stride])                    \n\t"
      "gsldlc1  %[t1],         0x1f(%[src_stride])                    \n\t"
      "pavgb    %[t1],         %[t2],              %[t1]              \n\t"

      "and      %[t0],         %[t0],              %[c0]              \n\t"
      "and      %[t1],         %[t1],              %[c0]              \n\t"
      "packushb %[t0],         %[t0],              %[t1]              \n\t"
      "mov.s    %[t1],         %[t0]                                  \n\t"
      "and      %[d2],         %[t0],              %[c0]              \n\t"
      "psrlh    %[d3],         %[t1],              %[shift]           \n\t"

      "packushb %[d0],         %[d0],              %[d2]              \n\t"
      "packushb %[d1],         %[d1],              %[d3]              \n\t"
      "gssdrc1  %[d0],         0x0(%[dst_u])	                      \n\t"
      "gssdlc1  %[d0],         0x7(%[dst_u])                          \n\t"
      "gssdrc1  %[d1],         0x0(%[dst_v])	                      \n\t"
      "gssdlc1  %[d1],         0x7(%[dst_v])                          \n\t"
      "daddiu   %[src_uyvy],   %[src_uyvy],        32                 \n\t"
      "daddiu   %[dst_u],      %[dst_u],           8                  \n\t"
      "daddiu   %[dst_v],      %[dst_v],           8                  \n\t"
      "daddiu   %[width],      %[width],          -16                 \n\t"
      "bgtz     %[width],      1b                                     \n\t"
      "nop                                                            \n\t"
      : [t0] "=&f"(temp[0]), [t1] "=&f"(temp[1]), [t2] "=&f"(temp[2]),
        [d0] "=&f"(data[0]), [d1] "=&f"(data[1]), [d2] "=&f"(data[2]),
        [d3] "=&f"(data[3]), [src_stride] "=&r"(src_stride)
      : [src_uyvy] "r"(src_uyvy), [src_stride_uyvy] "r"(src_stride_uyvy),
        [dst_u] "r"(dst_u), [dst_v] "r"(dst_v), [width] "r"(width),
        [c0] "f"(c0), [shift] "f"(shift)
      : "memory");
}

// Copy row of UYVY UV's (422) into U and V (422).
void UYVYToUV422Row_MMI(const uint8_t* src_uyvy,
                        uint8_t* dst_u,
                        uint8_t* dst_v,
                        int width) {
  // Output a row of UV values.
  uint64_t c0 = 0x00ff00ff00ff00ff;
  uint64_t temp[2];
  uint64_t data[4];
  uint64_t shift = 0x08;
  __asm__ volatile(
      "1:	                                        \n\t"
      "gsldrc1  %[t0],       0x00(%[src_uyvy])          \n\t"
      "gsldlc1  %[t0],       0x07(%[src_uyvy])          \n\t"
      "gsldrc1  %[t1],       0x08(%[src_uyvy])          \n\t"
      "gsldlc1  %[t1],       0x0f(%[src_uyvy])          \n\t"
      "and      %[t0],       %[t0],            %[c0]    \n\t"
      "and      %[t1],       %[t1],            %[c0]    \n\t"
      "packushb %[t0],       %[t0],            %[t1]    \n\t"
      "mov.s    %[t1],       %[t0]                      \n\t"
      "and      %[d0],       %[t0],            %[c0]    \n\t"
      "psrlh    %[d1],       %[t1],            %[shift] \n\t"

      "gsldrc1  %[t0],       0x10(%[src_uyvy])          \n\t"
      "gsldlc1  %[t0],       0x17(%[src_uyvy])          \n\t"
      "gsldrc1  %[t1],       0x18(%[src_uyvy])          \n\t"
      "gsldlc1  %[t1],       0x1f(%[src_uyvy])          \n\t"
      "and      %[t0],       %[t0],            %[c0]    \n\t"
      "and      %[t1],       %[t1],            %[c0]    \n\t"
      "packushb %[t0],       %[t0],            %[t1]    \n\t"
      "mov.s    %[t1],       %[t0]                      \n\t"
      "and      %[d2],       %[t0],            %[c0]    \n\t"
      "psrlh    %[d3],       %[t1],            %[shift] \n\t"

      "packushb %[d0],       %[d0],            %[d2]    \n\t"
      "packushb %[d1],       %[d1],            %[d3]    \n\t"
      "gssdrc1  %[d0],       0x0(%[dst_u])	        \n\t"
      "gssdlc1  %[d0],       0x7(%[dst_u])              \n\t"
      "gssdrc1  %[d1],       0x0(%[dst_v])	        \n\t"
      "gssdlc1  %[d1],       0x7(%[dst_v])              \n\t"
      "daddiu   %[src_uyvy], %[src_uyvy],      32       \n\t"
      "daddiu   %[dst_u],    %[dst_u],         8        \n\t"
      "daddiu   %[dst_v],    %[dst_v],         8        \n\t"
      "daddiu   %[width],    %[width],        -16       \n\t"
      "bgtz     %[width],    1b                         \n\t"
      "nop                                              \n\t"
      : [t0] "=&f"(temp[0]), [t1] "=&f"(temp[1]), [d0] "=&f"(data[0]),
        [d1] "=&f"(data[1]), [d2] "=&f"(data[2]), [d3] "=&f"(data[3])
      : [src_uyvy] "r"(src_uyvy), [dst_u] "r"(dst_u), [dst_v] "r"(dst_v),
        [width] "r"(width), [c0] "f"(c0), [shift] "f"(shift)
      : "memory");
}

// Copy row of UYVY Y's (422) into Y (420/422).
void UYVYToYRow_MMI(const uint8_t* src_uyvy, uint8_t* dst_y, int width) {
  // Output a row of Y values.
  uint64_t c0 = 0x00ff00ff00ff00ff;
  uint64_t shift = 0x08;
  uint64_t temp[2];
  __asm__ volatile(
      "1:	                                        \n\t"
      "gsldrc1  %[t0],       0x00(%[src_uyvy])          \n\t"
      "gsldlc1  %[t0],       0x07(%[src_uyvy])          \n\t"
      "gsldrc1  %[t1],       0x08(%[src_uyvy])          \n\t"
      "gsldlc1  %[t1],       0x0f(%[src_uyvy])          \n\t"
      "dsrl     %[t0],       %[t0],            %[shift] \n\t"
      "dsrl     %[t1],       %[t1],            %[shift] \n\t"
      "and      %[t0],       %[t0],            %[c0]    \n\t"
      "and      %[t1],       %[t1],            %[c0]    \n\t"
      "and      %[t1],       %[t1],            %[c0]    \n\t"
      "packushb %[t0],       %[t0],            %[t1]    \n\t"
      "gssdrc1  %[t0],       0x0(%[dst_y])	        \n\t"
      "gssdlc1  %[t0],       0x7(%[dst_y])              \n\t"
      "daddiu   %[src_uyvy], %[src_uyvy],      16       \n\t"
      "daddiu   %[dst_y],    %[dst_y],         8        \n\t"
      "daddiu   %[width],    %[width],        -8        \n\t"
      "bgtz     %[width],    1b                         \n\t"
      "nop                                              \n\t"
      : [t0] "=&f"(temp[0]), [t1] "=&f"(temp[1])
      : [src_uyvy] "r"(src_uyvy), [dst_y] "r"(dst_y), [width] "r"(width),
        [c0] "f"(c0), [shift] "f"(shift)
      : "memory");
}

// Blend src_argb0 over src_argb1 and store to dst_argb.
// dst_argb may be src_argb0 or src_argb1.
// This code mimics the SSSE3 version for better testability.
void ARGBBlendRow_MMI(const uint8_t* src_argb0,
                      const uint8_t* src_argb1,
                      uint8_t* dst_argb,
                      int width) {
  uint64_t src0, src1, dest, alpha, src0_hi, src0_lo, src1_hi, src1_lo, dest_hi,
      dest_lo;
  const uint64_t mask0 = 0x0;
  const uint64_t mask1 = 0x00FFFFFF00FFFFFFULL;
  const uint64_t mask2 = 0x00FF00FF00FF00FFULL;
  const uint64_t mask3 = 0xFF;
  const uint64_t mask4 = ~mask1;
  const uint64_t shift = 0x08;

  __asm__ volatile(
      "1:                                                           \n\t"
      "gsldlc1    %[src0],         0x07(%[src0_ptr])                \n\t"
      "gsldrc1    %[src0],         0x00(%[src0_ptr])                \n\t"
      "punpcklbh  %[src0_lo],      %[src0],           %[mask0]      \n\t"

      "gsldlc1    %[src1],         0x07(%[src1_ptr])                \n\t"
      "gsldrc1    %[src1],         0x00(%[src1_ptr])                \n\t"
      "punpcklbh  %[src1_lo],      %[src1],           %[mask0]      \n\t"

      "psubush    %[alpha],        %[mask2],          %[src0_lo]    \n\t"
      "pshufh     %[alpha],        %[alpha],          %[mask3]      \n\t"
      "pmullh     %[dest_lo],      %[src1_lo],        %[alpha]      \n\t"
      "psrlh      %[dest_lo],      %[dest_lo],        %[shift]      \n\t"
      "paddush    %[dest_lo],      %[dest_lo],        %[src0_lo]    \n\t"

      "punpckhbh  %[src0_hi],      %[src0],           %[mask0]      \n\t"
      "punpckhbh  %[src1_hi],      %[src1],           %[mask0]      \n\t"

      "psubush    %[alpha],        %[mask2],          %[src0_hi]    \n\t"
      "pshufh     %[alpha],        %[alpha],          %[mask3]      \n\t"
      "pmullh     %[dest_hi],      %[src1_hi],        %[alpha]      \n\t"
      "psrlh      %[dest_hi],      %[dest_hi],        %[shift]      \n\t"
      "paddush    %[dest_hi],      %[dest_hi],        %[src0_hi]    \n\t"

      "packushb   %[dest],         %[dest_lo],        %[dest_hi]    \n\t"
      "and        %[dest],         %[dest],           %[mask1]      \n\t"
      "or         %[dest],         %[dest],           %[mask4]      \n\t"
      "gssdlc1    %[dest],         0x07(%[dst_ptr])                 \n\t"
      "gssdrc1    %[dest],         0x00(%[dst_ptr])                 \n\t"

      "daddiu     %[src0_ptr],     %[src0_ptr],       0x08          \n\t"
      "daddiu     %[src1_ptr],     %[src1_ptr],       0x08          \n\t"
      "daddiu     %[dst_ptr],      %[dst_ptr],        0x08          \n\t"
      "daddi      %[width],        %[width],         -0x02          \n\t"
      "bnez       %[width],        1b                               \n\t"
      : [src0] "=&f"(src0), [src1] "=&f"(src1), [alpha] "=&f"(alpha),
        [dest] "=&f"(dest), [src0_hi] "=&f"(src0_hi), [src0_lo] "=&f"(src0_lo),
        [src1_hi] "=&f"(src1_hi), [src1_lo] "=&f"(src1_lo),
        [dest_hi] "=&f"(dest_hi), [dest_lo] "=&f"(dest_lo)
      : [src0_ptr] "r"(src_argb0), [src1_ptr] "r"(src_argb1),
        [dst_ptr] "r"(dst_argb), [mask0] "f"(mask0), [mask1] "f"(mask1),
        [mask2] "f"(mask2), [mask3] "f"(mask3), [mask4] "f"(mask4),
        [shift] "f"(shift), [width] "r"(width)
      : "memory");
}

void BlendPlaneRow_MMI(const uint8_t* src0,
                       const uint8_t* src1,
                       const uint8_t* alpha,
                       uint8_t* dst,
                       int width) {
  uint64_t source0, source1, dest, alph;
  uint64_t src0_hi, src0_lo, src1_hi, src1_lo, alpha_hi, alpha_lo, dest_hi,
      dest_lo;
  uint64_t alpha_rev, alpha_rev_lo, alpha_rev_hi;
  const uint64_t mask0 = 0x0;
  const uint64_t mask1 = 0xFFFFFFFFFFFFFFFFULL;
  const uint64_t mask2 = 0x00FF00FF00FF00FFULL;
  const uint64_t shift = 0x08;

  __asm__ volatile(
      "1:                                                           \n\t"
      "gsldlc1    %[src0],         0x07(%[src0_ptr])                \n\t"
      "gsldrc1    %[src0],         0x00(%[src0_ptr])                \n\t"
      "punpcklbh  %[src0_lo],      %[src0],           %[mask0]      \n\t"
      "punpckhbh  %[src0_hi],      %[src0],           %[mask0]      \n\t"

      "gsldlc1    %[src1],         0x07(%[src1_ptr])                \n\t"
      "gsldrc1    %[src1],         0x00(%[src1_ptr])                \n\t"
      "punpcklbh  %[src1_lo],      %[src1],           %[mask0]      \n\t"
      "punpckhbh  %[src1_hi],      %[src1],           %[mask0]      \n\t"

      "gsldlc1    %[alpha],        0x07(%[alpha_ptr])               \n\t"
      "gsldrc1    %[alpha],        0x00(%[alpha_ptr])               \n\t"
      "psubusb    %[alpha_r],      %[mask1],          %[alpha]      \n\t"
      "punpcklbh  %[alpha_lo],     %[alpha],          %[mask0]      \n\t"
      "punpckhbh  %[alpha_hi],     %[alpha],          %[mask0]      \n\t"
      "punpcklbh  %[alpha_rlo],    %[alpha_r],        %[mask0]      \n\t"
      "punpckhbh  %[alpha_rhi],    %[alpha_r],        %[mask0]      \n\t"

      "pmullh     %[dest_lo],      %[src0_lo],        %[alpha_lo]   \n\t"
      "pmullh     %[dest],         %[src1_lo],        %[alpha_rlo]  \n\t"
      "paddush    %[dest_lo],      %[dest_lo],        %[dest]       \n\t"
      "paddush    %[dest_lo],      %[dest_lo],        %[mask2]      \n\t"
      "psrlh      %[dest_lo],      %[dest_lo],        %[shift]      \n\t"

      "pmullh     %[dest_hi],      %[src0_hi],        %[alpha_hi]   \n\t"
      "pmullh     %[dest],         %[src1_hi],        %[alpha_rhi]  \n\t"
      "paddush    %[dest_hi],      %[dest_hi],        %[dest]       \n\t"
      "paddush    %[dest_hi],      %[dest_hi],        %[mask2]      \n\t"
      "psrlh      %[dest_hi],      %[dest_hi],        %[shift]      \n\t"

      "packushb   %[dest],         %[dest_lo],        %[dest_hi]    \n\t"
      "gssdlc1    %[dest],         0x07(%[dst_ptr])                 \n\t"
      "gssdrc1    %[dest],         0x00(%[dst_ptr])                 \n\t"

      "daddiu     %[src0_ptr],     %[src0_ptr],       0x08          \n\t"
      "daddiu     %[src1_ptr],     %[src1_ptr],       0x08          \n\t"
      "daddiu     %[alpha_ptr],    %[alpha_ptr],      0x08          \n\t"
      "daddiu     %[dst_ptr],      %[dst_ptr],        0x08          \n\t"
      "daddi      %[width],        %[width],         -0x08          \n\t"
      "bnez       %[width],        1b                               \n\t"
      : [src0] "=&f"(source0), [src1] "=&f"(source1), [alpha] "=&f"(alph),
        [dest] "=&f"(dest), [src0_hi] "=&f"(src0_hi), [src0_lo] "=&f"(src0_lo),
        [src1_hi] "=&f"(src1_hi), [src1_lo] "=&f"(src1_lo),
        [alpha_hi] "=&f"(alpha_hi), [alpha_lo] "=&f"(alpha_lo),
        [dest_hi] "=&f"(dest_hi), [dest_lo] "=&f"(dest_lo),
        [alpha_rlo] "=&f"(alpha_rev_lo), [alpha_rhi] "=&f"(alpha_rev_hi),
        [alpha_r] "=&f"(alpha_rev)
      : [src0_ptr] "r"(src0), [src1_ptr] "r"(src1), [alpha_ptr] "r"(alpha),
        [dst_ptr] "r"(dst), [mask0] "f"(mask0), [mask1] "f"(mask1),
        [mask2] "f"(mask2), [shift] "f"(shift), [width] "r"(width)
      : "memory");
}

// Multiply source RGB by alpha and store to destination.
// This code mimics the SSSE3 version for better testability.
void ARGBAttenuateRow_MMI(const uint8_t* src_argb,
                          uint8_t* dst_argb,
                          int width) {
  uint64_t src, src_hi, src_lo, dest, dest_hi, dest_lo, alpha;
  const uint64_t mask0 = 0xFF;
  const uint64_t mask1 = 0xFF000000FF000000ULL;
  const uint64_t mask2 = ~mask1;
  const uint64_t shift = 0x08;

  __asm__ volatile(
      "1:                                                           \n\t"
      "gsldlc1    %[src],          0x07(%[src_ptr])                 \n\t"
      "gsldrc1    %[src],          0x00(%[src_ptr])                 \n\t"
      "punpcklbh  %[src_lo],       %[src],            %[src]        \n\t"
      "punpckhbh  %[src_hi],       %[src],            %[src]        \n\t"

      "pshufh     %[alpha],        %[src_lo],         %[mask0]      \n\t"
      "pmulhuh    %[dest_lo],      %[alpha],          %[src_lo]     \n\t"
      "psrlh      %[dest_lo],      %[dest_lo],        %[shift]      \n\t"
      "pshufh     %[alpha],        %[src_hi],         %[mask0]      \n\t"
      "pmulhuh    %[dest_hi],      %[alpha],          %[src_hi]     \n\t"
      "psrlh      %[dest_hi],      %[dest_hi],        %[shift]      \n\t"

      "packushb   %[dest],         %[dest_lo],        %[dest_hi]    \n\t"
      "and        %[dest],         %[dest],           %[mask2]      \n\t"
      "and        %[src],          %[src],            %[mask1]      \n\t"
      "or         %[dest],         %[dest],           %[src]        \n\t"
      "gssdlc1    %[dest],         0x07(%[dst_ptr])                 \n\t"
      "gssdrc1    %[dest],         0x00(%[dst_ptr])                 \n\t"

      "daddiu     %[src_ptr],      %[src_ptr],        0x08          \n\t"
      "daddiu     %[dst_ptr],      %[dst_ptr],        0x08          \n\t"
      "daddi      %[width],        %[width],         -0x02          \n\t"
      "bnez       %[width],        1b                               \n\t"
      : [src] "=&f"(src), [dest] "=&f"(dest), [src_hi] "=&f"(src_hi),
        [src_lo] "=&f"(src_lo), [dest_hi] "=&f"(dest_hi),
        [dest_lo] "=&f"(dest_lo), [alpha] "=&f"(alpha)
      : [src_ptr] "r"(src_argb), [dst_ptr] "r"(dst_argb), [mask0] "f"(mask0),
        [mask1] "f"(mask1), [mask2] "f"(mask2), [shift] "f"(shift),
        [width] "r"(width)
      : "memory");
}

void ComputeCumulativeSumRow_MMI(const uint8_t* row,
                                 int32_t* cumsum,
                                 const int32_t* previous_cumsum,
                                 int width) {
  int64_t row_sum[2] = {0, 0};
  uint64_t src, dest0, dest1, presrc0, presrc1, dest;
  const uint64_t mask = 0x0;

  __asm__ volatile(
      "xor        %[row_sum0],     %[row_sum0],       %[row_sum0]   \n\t"
      "xor        %[row_sum1],     %[row_sum1],       %[row_sum1]   \n\t"

      "1:                                                           \n\t"
      "gslwlc1    %[src],          0x03(%[row_ptr])                 \n\t"
      "gslwrc1    %[src],          0x00(%[row_ptr])                 \n\t"

      "punpcklbh  %[src],          %[src],            %[mask]       \n\t"
      "punpcklhw  %[dest0],        %[src],            %[mask]       \n\t"
      "punpckhhw  %[dest1],        %[src],            %[mask]       \n\t"

      "paddw      %[row_sum0],     %[row_sum0],       %[dest0]      \n\t"
      "paddw      %[row_sum1],     %[row_sum1],       %[dest1]      \n\t"

      "gsldlc1    %[presrc0],      0x07(%[pre_ptr])                 \n\t"
      "gsldrc1    %[presrc0],      0x00(%[pre_ptr])                 \n\t"
      "gsldlc1    %[presrc1],      0x0f(%[pre_ptr])                 \n\t"
      "gsldrc1    %[presrc1],      0x08(%[pre_ptr])                 \n\t"

      "paddw      %[dest0],        %[row_sum0],       %[presrc0]    \n\t"
      "paddw      %[dest1],        %[row_sum1],       %[presrc1]    \n\t"

      "gssdlc1    %[dest0],        0x07(%[dst_ptr])                 \n\t"
      "gssdrc1    %[dest0],        0x00(%[dst_ptr])                 \n\t"
      "gssdlc1    %[dest1],        0x0f(%[dst_ptr])                 \n\t"
      "gssdrc1    %[dest1],        0x08(%[dst_ptr])                 \n\t"

      "daddiu     %[row_ptr],      %[row_ptr],        0x04          \n\t"
      "daddiu     %[pre_ptr],      %[pre_ptr],        0x10          \n\t"
      "daddiu     %[dst_ptr],      %[dst_ptr],        0x10          \n\t"
      "daddi      %[width],        %[width],         -0x01          \n\t"
      "bnez       %[width],        1b                               \n\t"
      : [src] "=&f"(src), [dest] "=&f"(dest), [dest0] "=&f"(dest0),
        [dest1] "=&f"(dest1), [row_sum0] "+&f"(row_sum[0]),
        [row_sum1] "+&f"(row_sum[1]), [presrc0] "=&f"(presrc0),
        [presrc1] "=&f"(presrc1)
      : [row_ptr] "r"(row), [pre_ptr] "r"(previous_cumsum),
        [dst_ptr] "r"(cumsum), [width] "r"(width), [mask] "f"(mask)
      : "memory");
}

// C version 2x2 -> 2x1.
void InterpolateRow_MMI(uint8_t* dst_ptr,
                        const uint8_t* src_ptr,
                        ptrdiff_t src_stride,
                        int width,
                        int source_y_fraction) {
  if (source_y_fraction == 0) {
    __asm__ volatile(
        "1:	                              \n\t"
        "ld     $t0,        0x0(%[src_ptr])   \n\t"
        "sd     $t0,        0x0(%[dst_ptr])   \n\t"
        "daddiu %[src_ptr], %[src_ptr],     8 \n\t"
        "daddiu %[dst_ptr], %[dst_ptr],     8 \n\t"
        "daddiu %[width],   %[width],      -8 \n\t"
        "bgtz   %[width],   1b                \n\t"
        "nop                                  \n\t"
        :
        : [dst_ptr] "r"(dst_ptr), [src_ptr] "r"(src_ptr), [width] "r"(width)
        : "memory");
    return;
  }
  if (source_y_fraction == 128) {
    uint64_t uv = 0x0;
    uint64_t uv_stride = 0x0;
    __asm__ volatile(
        "1:	                                            \n\t"
        "gsldrc1 %[uv],        0x0(%[src_ptr])              \n\t"
        "gsldlc1 %[uv],        0x7(%[src_ptr])              \n\t"
        "daddu   $t0,          %[src_ptr],     %[stride]    \n\t"
        "gsldrc1 %[uv_stride], 0x0($t0)                     \n\t"
        "gsldlc1 %[uv_stride], 0x7($t0)                     \n\t"

        "pavgb   %[uv],        %[uv],          %[uv_stride] \n\t"
        "gssdrc1 %[uv],        0x0(%[dst_ptr])              \n\t"
        "gssdlc1 %[uv],        0x7(%[dst_ptr])              \n\t"

        "daddiu  %[src_ptr],   %[src_ptr],     8            \n\t"
        "daddiu  %[dst_ptr],   %[dst_ptr],     8            \n\t"
        "daddiu  %[width],     %[width],      -8            \n\t"
        "bgtz    %[width],     1b                           \n\t"
        "nop                                                \n\t"
        : [uv] "=&f"(uv), [uv_stride] "=&f"(uv_stride)
        : [src_ptr] "r"(src_ptr), [dst_ptr] "r"(dst_ptr), [width] "r"(width),
          [stride] "r"((int64_t)src_stride)
        : "memory");
    return;
  }
  const uint8_t* src_ptr1 = src_ptr + src_stride;
  uint64_t temp;
  uint64_t data[4];
  uint64_t zero = 0x0;
  uint64_t c0 = 0x0080008000800080;
  uint64_t fy0 = 0x0100010001000100;
  uint64_t shift = 0x8;
  __asm__ volatile(
      "pshufh    %[fy1],      %[fy1],          %[zero]  \n\t"
      "psubh     %[fy0],      %[fy0],          %[fy1]   \n\t"
      "1:	                                        \n\t"
      "gsldrc1   %[t0],       0x0(%[src_ptr])           \n\t"
      "gsldlc1   %[t0],       0x7(%[src_ptr])           \n\t"
      "punpcklbh %[d0],       %[t0],           %[zero]  \n\t"
      "punpckhbh %[d1],       %[t0],           %[zero]  \n\t"
      "gsldrc1   %[t0],       0x0(%[src_ptr1])          \n\t"
      "gsldlc1   %[t0],       0x7(%[src_ptr1])          \n\t"
      "punpcklbh %[d2],       %[t0],           %[zero]  \n\t"
      "punpckhbh %[d3],       %[t0],           %[zero]  \n\t"

      "pmullh    %[d0],       %[d0],           %[fy0]   \n\t"
      "pmullh    %[d2],       %[d2],           %[fy1]   \n\t"
      "paddh     %[d0],       %[d0],           %[d2]    \n\t"
      "paddh     %[d0],       %[d0],           %[c0]    \n\t"
      "psrlh     %[d0],       %[d0],           %[shift] \n\t"

      "pmullh    %[d1],       %[d1],           %[fy0]   \n\t"
      "pmullh    %[d3],       %[d3],           %[fy1]   \n\t"
      "paddh     %[d1],       %[d1],           %[d3]    \n\t"
      "paddh     %[d1],       %[d1],           %[c0]    \n\t"
      "psrlh     %[d1],       %[d1],           %[shift] \n\t"

      "packushb  %[d0],       %[d0],           %[d1]    \n\t"
      "gssdrc1   %[d0],       0x0(%[dst_ptr])           \n\t"
      "gssdlc1   %[d0],       0x7(%[dst_ptr])           \n\t"
      "daddiu    %[src_ptr],  %[src_ptr],      8        \n\t"
      "daddiu    %[src_ptr1], %[src_ptr1],     8        \n\t"
      "daddiu    %[dst_ptr],  %[dst_ptr],      8        \n\t"
      "daddiu    %[width],    %[width],       -8        \n\t"
      "bgtz      %[width],    1b                        \n\t"
      "nop                                              \n\t"
      : [t0] "=&f"(temp), [d0] "=&f"(data[0]), [d1] "=&f"(data[1]),
        [d2] "=&f"(data[2]), [d3] "=&f"(data[3])
      : [src_ptr] "r"(src_ptr), [src_ptr1] "r"(src_ptr1),
        [dst_ptr] "r"(dst_ptr), [width] "r"(width),
        [fy1] "f"(source_y_fraction), [fy0] "f"(fy0), [c0] "f"(c0),
        [shift] "f"(shift), [zero] "f"(zero)
      : "memory");
}

// Use first 4 shuffler values to reorder ARGB channels.
void ARGBShuffleRow_MMI(const uint8_t* src_argb,
                        uint8_t* dst_argb,
                        const uint8_t* shuffler,
                        int width) {
  uint64_t source, dest0, dest1, dest;
  const uint64_t mask0 = 0x0;
  const uint64_t mask1 = (shuffler[0] & 0x03) | ((shuffler[1] & 0x03) << 2) |
                         ((shuffler[2] & 0x03) << 4) |
                         ((shuffler[3] & 0x03) << 6);

  __asm__ volatile(
      "1:                                                           \n\t"
      "gsldlc1    %[src],          0x07(%[src_ptr])                 \n\t"
      "gsldrc1    %[src],          0x00(%[src_ptr])                 \n\t"

      "punpcklbh  %[dest0],        %[src],            %[mask0]      \n\t"
      "pshufh     %[dest0],        %[dest0],          %[mask1]      \n\t"
      "punpckhbh  %[dest1],        %[src],            %[mask0]      \n\t"
      "pshufh     %[dest1],        %[dest1],          %[mask1]      \n\t"
      "packushb   %[dest],         %[dest0],          %[dest1]      \n\t"

      "gssdlc1    %[dest],         0x07(%[dst_ptr])                 \n\t"
      "gssdrc1    %[dest],         0x00(%[dst_ptr])                 \n\t"

      "daddiu     %[src_ptr],      %[src_ptr],        0x08          \n\t"
      "daddiu     %[dst_ptr],      %[dst_ptr],        0x08          \n\t"
      "daddi      %[width],        %[width],         -0x02          \n\t"
      "bnez       %[width],        1b                               \n\t"
      : [src] "=&f"(source), [dest] "=&f"(dest), [dest0] "=&f"(dest0),
        [dest1] "=&f"(dest1)
      : [src_ptr] "r"(src_argb), [dst_ptr] "r"(dst_argb), [mask0] "f"(mask0),
        [mask1] "f"(mask1), [width] "r"(width)
      : "memory");
}

void I422ToYUY2Row_MMI(const uint8_t* src_y,
                       const uint8_t* src_u,
                       const uint8_t* src_v,
                       uint8_t* dst_frame,
                       int width) {
  uint64_t temp[3];
  uint64_t vu = 0x0;
  __asm__ volatile(
      "1:	                                        \n\t"
      "gsldlc1   %[ty],        0x7(%[src_y])            \n\t"  // r=src_sobelx[i]
      "gsldrc1   %[ty],        0x0(%[src_y])            \n\t"  // r=src_sobelx[i]
      "gslwlc1   %[tu],        0x3(%[src_u])            \n\t"  // b=src_sobely[i]
      "gslwrc1   %[tu],        0x0(%[src_u])            \n\t"  // b=src_sobely[i]
      "gslwlc1   %[tv],        0x3(%[src_v])            \n\t"  // b=src_sobely[i]
      "gslwrc1   %[tv],        0x0(%[src_v])            \n\t"  // b=src_sobely[i]
      "punpcklbh %[vu],        %[tu],             %[tv]	\n\t"  // g
      "punpcklbh %[tu],        %[ty],             %[vu]	\n\t"  // g
      "gssdlc1   %[tu],        0x7(%[dst_frame])        \n\t"
      "gssdrc1   %[tu],        0x0(%[dst_frame])        \n\t"
      "punpckhbh %[tu],        %[ty],             %[vu]	\n\t"  // g
      "gssdlc1   %[tu],        0x0F(%[dst_frame])       \n\t"
      "gssdrc1   %[tu],        0x08(%[dst_frame])       \n\t"
      "daddiu    %[src_y],     %[src_y],          8     \n\t"
      "daddiu    %[src_u],     %[src_u],          4     \n\t"
      "daddiu    %[src_v],     %[src_v],          4     \n\t"
      "daddiu    %[dst_frame], %[dst_frame],      16    \n\t"
      "daddiu    %[width],     %[width],         -8     \n\t"
      "bgtz      %[width],     1b                       \n\t"
      "nop                                              \n\t"
      : [ty] "=&f"(temp[1]), [tu] "=&f"(temp[1]), [tv] "=&f"(temp[1]),
        [vu] "=&f"(vu)
      : [src_y] "r"(src_y), [src_u] "r"(src_u), [src_v] "r"(src_v),
        [dst_frame] "r"(dst_frame), [width] "r"(width)
      : "memory");
}

void I422ToUYVYRow_MMI(const uint8_t* src_y,
                       const uint8_t* src_u,
                       const uint8_t* src_v,
                       uint8_t* dst_frame,
                       int width) {
  uint64_t temp[3];
  uint64_t vu = 0x0;
  __asm__ volatile(
      "1:	                                        \n\t"
      "gsldlc1   %[ty],        0x7(%[src_y])            \n\t"  // r=src_sobelx[i]
      "gsldrc1   %[ty],        0x0(%[src_y])            \n\t"  // r=src_sobelx[i]
      "gslwlc1   %[tu],        0x3(%[src_u])            \n\t"  // b=src_sobely[i]
      "gslwrc1   %[tu],        0x0(%[src_u])            \n\t"  // b=src_sobely[i]
      "gslwlc1   %[tv],        0x3(%[src_v])            \n\t"  // b=src_sobely[i]
      "gslwrc1   %[tv],        0x0(%[src_v])            \n\t"  // b=src_sobely[i]
      "punpcklbh %[vu],        %[tu],             %[tv]	\n\t"  // g
      "punpcklbh %[tu],        %[vu],             %[ty]	\n\t"  // g
      "gssdlc1   %[tu],        0x7(%[dst_frame])        \n\t"
      "gssdrc1   %[tu],        0x0(%[dst_frame])        \n\t"
      "punpckhbh %[tu],        %[vu],             %[ty]	\n\t"  // g
      "gssdlc1   %[tu],        0x0F(%[dst_frame])       \n\t"
      "gssdrc1   %[tu],        0x08(%[dst_frame])       \n\t"
      "daddiu    %[src_y],     %[src_y],          8     \n\t"
      "daddiu    %[src_u],     %[src_u],          4     \n\t"
      "daddiu    %[src_v],     %[src_v],          4     \n\t"
      "daddiu    %[dst_frame], %[dst_frame],      16    \n\t"
      "daddiu    %[width],     %[width],         -8     \n\t"
      "bgtz      %[width],     1b                       \n\t"
      "nop                                              \n\t"
      : [ty] "=&f"(temp[1]), [tu] "=&f"(temp[1]), [tv] "=&f"(temp[1]),
        [vu] "=&f"(vu)
      : [src_y] "r"(src_y), [src_u] "r"(src_u), [src_v] "r"(src_v),
        [dst_frame] "r"(dst_frame), [width] "r"(width)
      : "memory");
}

void ARGBCopyAlphaRow_MMI(const uint8_t* src, uint8_t* dst, int width) {
  uint64_t source, dest;
  const uint64_t mask0 = 0xff000000ff000000ULL;
  const uint64_t mask1 = ~mask0;

  __asm__ volatile(
      "1:                                                           \n\t"
      "gsldlc1    %[src],          0x07(%[src_ptr])                 \n\t"
      "gsldrc1    %[src],          0x00(%[src_ptr])                 \n\t"
      "gsldlc1    %[dest],         0x07(%[dst_ptr])                 \n\t"
      "gsldrc1    %[dest],         0x00(%[dst_ptr])                 \n\t"

      "and        %[src],          %[src],            %[mask0]      \n\t"
      "and        %[dest],         %[dest],           %[mask1]      \n\t"
      "or         %[dest],         %[src],            %[dest]       \n\t"
      "gssdlc1    %[dest],         0x07(%[dst_ptr])                 \n\t"
      "gssdrc1    %[dest],         0x00(%[dst_ptr])                 \n\t"

      "daddiu     %[src_ptr],      %[src_ptr],        0x08          \n\t"
      "daddiu     %[dst_ptr],      %[dst_ptr],        0x08          \n\t"
      "daddi      %[width],        %[width],         -0x02          \n\t"
      "bnez       %[width],        1b                               \n\t"
      : [src] "=&f"(source), [dest] "=&f"(dest)
      : [src_ptr] "r"(src), [dst_ptr] "r"(dst), [mask0] "f"(mask0),
        [mask1] "f"(mask1), [width] "r"(width)
      : "memory");
}

void ARGBExtractAlphaRow_MMI(const uint8_t* src_argb,
                             uint8_t* dst_a,
                             int width) {
  uint64_t src, dest0, dest1, dest_lo, dest_hi, dest;
  const uint64_t mask = 0xff000000ff000000ULL;
  const uint64_t shift = 0x18;

  __asm__ volatile(
      "1:                                                           \n\t"
      "gsldlc1    %[src],          0x07(%[src_ptr])                 \n\t"
      "gsldrc1    %[src],          0x00(%[src_ptr])                 \n\t"
      "and        %[dest0],        %[src],            %[mask]       \n\t"
      "psrlw      %[dest0],        %[dest0],          %[shift]      \n\t"
      "gsldlc1    %[src],          0x0f(%[src_ptr])                 \n\t"
      "gsldrc1    %[src],          0x08(%[src_ptr])                 \n\t"
      "and        %[dest1],        %[src],            %[mask]       \n\t"
      "psrlw      %[dest1],        %[dest1],          %[shift]      \n\t"
      "packsswh   %[dest_lo],      %[dest0],          %[dest1]      \n\t"

      "gsldlc1    %[src],          0x17(%[src_ptr])                 \n\t"
      "gsldrc1    %[src],          0x10(%[src_ptr])                 \n\t"
      "and        %[dest0],        %[src],            %[mask]       \n\t"
      "psrlw      %[dest0],        %[dest0],          %[shift]      \n\t"
      "gsldlc1    %[src],          0x1f(%[src_ptr])                 \n\t"
      "gsldrc1    %[src],          0x18(%[src_ptr])                 \n\t"
      "and        %[dest1],        %[src],            %[mask]       \n\t"
      "psrlw      %[dest1],        %[dest1],          %[shift]      \n\t"
      "packsswh   %[dest_hi],      %[dest0],          %[dest1]      \n\t"

      "packushb   %[dest],         %[dest_lo],        %[dest_hi]    \n\t"

      "gssdlc1    %[dest],         0x07(%[dst_ptr])                 \n\t"
      "gssdrc1    %[dest],         0x00(%[dst_ptr])                 \n\t"

      "daddiu     %[src_ptr],      %[src_ptr],        0x20          \n\t"
      "daddiu     %[dst_ptr],      %[dst_ptr],        0x08          \n\t"
      "daddi      %[width],        %[width],         -0x08          \n\t"
      "bnez       %[width],        1b                               \n\t"
      : [src] "=&f"(src), [dest] "=&f"(dest), [dest0] "=&f"(dest0),
        [dest1] "=&f"(dest1), [dest_lo] "=&f"(dest_lo), [dest_hi] "=&f"(dest_hi)
      : [src_ptr] "r"(src_argb), [dst_ptr] "r"(dst_a), [mask] "f"(mask),
        [shift] "f"(shift), [width] "r"(width)
      : "memory");
}

void ARGBCopyYToAlphaRow_MMI(const uint8_t* src, uint8_t* dst, int width) {
  uint64_t source, dest0, dest1, dest;
  const uint64_t mask0 = 0x0;
  const uint64_t mask1 = 0x00ffffff00ffffffULL;

  __asm__ volatile(
      "1:                                                           \n\t"
      "gsldlc1    %[src],          0x07(%[src_ptr])                 \n\t"
      "gsldrc1    %[src],          0x00(%[src_ptr])                 \n\t"

      "punpcklbh  %[dest0],        %[mask0],          %[src]        \n\t"
      "punpcklhw  %[dest1],        %[mask0],          %[dest0]      \n\t"
      "gsldlc1    %[dest],         0x07(%[dst_ptr])                 \n\t"
      "gsldrc1    %[dest],         0x00(%[dst_ptr])                 \n\t"
      "and        %[dest],         %[dest],           %[mask1]      \n\t"
      "or         %[dest],         %[dest],           %[dest1]      \n\t"
      "gssdlc1    %[dest],         0x07(%[dst_ptr])                 \n\t"
      "gssdrc1    %[dest],         0x00(%[dst_ptr])                 \n\t"
      "punpckhhw  %[dest1],        %[mask0],          %[dest0]      \n\t"
      "gsldlc1    %[dest],         0x0f(%[dst_ptr])                 \n\t"
      "gsldrc1    %[dest],         0x08(%[dst_ptr])                 \n\t"
      "and        %[dest],         %[dest],           %[mask1]      \n\t"
      "or         %[dest],         %[dest],           %[dest1]      \n\t"
      "gssdlc1    %[dest],         0x0f(%[dst_ptr])                 \n\t"
      "gssdrc1    %[dest],         0x08(%[dst_ptr])                 \n\t"

      "punpckhbh  %[dest0],        %[mask0],          %[src]        \n\t"
      "punpcklhw  %[dest1],        %[mask0],          %[dest0]      \n\t"
      "gsldlc1    %[dest],         0x17(%[dst_ptr])                 \n\t"
      "gsldrc1    %[dest],         0x10(%[dst_ptr])                 \n\t"
      "and        %[dest],         %[dest],           %[mask1]      \n\t"
      "or         %[dest],         %[dest],           %[dest1]      \n\t"
      "gssdlc1    %[dest],         0x17(%[dst_ptr])                 \n\t"
      "gssdrc1    %[dest],         0x10(%[dst_ptr])                 \n\t"
      "punpckhhw  %[dest1],        %[mask0],          %[dest0]      \n\t"
      "gsldlc1    %[dest],         0x1f(%[dst_ptr])                 \n\t"
      "gsldrc1    %[dest],         0x18(%[dst_ptr])                 \n\t"
      "and        %[dest],         %[dest],           %[mask1]      \n\t"
      "or         %[dest],         %[dest],           %[dest1]      \n\t"
      "gssdlc1    %[dest],         0x1f(%[dst_ptr])                 \n\t"
      "gssdrc1    %[dest],         0x18(%[dst_ptr])                 \n\t"

      "daddiu     %[src_ptr],      %[src_ptr],        0x08          \n\t"
      "daddiu     %[dst_ptr],      %[dst_ptr],        0x20          \n\t"
      "daddi      %[width],        %[width],         -0x08          \n\t"
      "bnez       %[width],        1b                               \n\t"
      : [src] "=&f"(source), [dest] "=&f"(dest), [dest0] "=&f"(dest0),
        [dest1] "=&f"(dest1)
      : [src_ptr] "r"(src), [dst_ptr] "r"(dst), [mask0] "f"(mask0),
        [mask1] "f"(mask1), [width] "r"(width)
      : "memory");
}

#endif  // !defined(LIBYUV_DISABLE_MMI) && defined(_MIPS_ARCH_LOONGSON3A)

#ifdef __cplusplus
}  // extern "C"
}  // namespace libyuv
#endif