Commit b5b27d13 authored by fbarchard@google.com

ARGBToYUV with SSSE3 on any size/alignment

BUG=none
TEST=none
Review URL: https://webrtc-codereview.appspot.com/366011

git-svn-id: http://libyuv.googlecode.com/svn/trunk@161 16f28f9a-4ce2-e073-06de-1de4eb20be90
parent caf39525
Name: libyuv
URL: http://code.google.com/p/libyuv/
Version: 160
Version: 161
License: BSD
License File: LICENSE
......
......@@ -16,7 +16,7 @@ namespace libyuv {
extern "C" {
#endif
#define LIBYUV_VERSION 160
#define LIBYUV_VERSION 161
#ifdef __cplusplus
} // extern "C"
......
......@@ -365,6 +365,11 @@ int ARGBToI420(const uint8* src_frame, int src_stride_frame,
IS_ALIGNED(src_frame, 16) && IS_ALIGNED(src_stride_frame, 16) &&
IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
ARGBToYRow = ARGBToYRow_SSSE3;
} else if (TestCpuFlag(kCpuHasSSSE3) && width <= kMaxStride) {
ARGBToYRow = ARGBToYAnyRow_SSSE3;
if (IS_ALIGNED(width, 16)) {
ARGBToYRow = ARGBToYRow_Unaligned_SSSE3;
}
} else
#endif
{
......@@ -375,6 +380,12 @@ int ARGBToI420(const uint8* src_frame, int src_stride_frame,
IS_ALIGNED(width, 16) &&
IS_ALIGNED(src_frame, 16) && IS_ALIGNED(src_stride_frame, 16)) {
ARGBToUVRow = ARGBToUVRow_SSSE3;
} else if (TestCpuFlag(kCpuHasSSSE3) &&
IS_ALIGNED(width, 2) && width <= kMaxStride) {
ARGBToUVRow = ARGBToUVAnyRow_SSSE3;
if (IS_ALIGNED(width, 16)) {
ARGBToUVRow = ARGBToUVRow_Unaligned_SSSE3;
}
} else
#endif
{
......@@ -416,6 +427,11 @@ int BGRAToI420(const uint8* src_frame, int src_stride_frame,
IS_ALIGNED(src_frame, 16) && IS_ALIGNED(src_stride_frame, 16) &&
IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
ARGBToYRow = BGRAToYRow_SSSE3;
} else if (TestCpuFlag(kCpuHasSSSE3) && width <= kMaxStride) {
ARGBToYRow = BGRAToYAnyRow_SSSE3;
if (IS_ALIGNED(width, 16)) {
ARGBToYRow = BGRAToYRow_Unaligned_SSSE3;
}
} else
#endif
{
......@@ -426,6 +442,12 @@ int BGRAToI420(const uint8* src_frame, int src_stride_frame,
IS_ALIGNED(width, 16) &&
IS_ALIGNED(src_frame, 16) && IS_ALIGNED(src_stride_frame, 16)) {
ARGBToUVRow = BGRAToUVRow_SSSE3;
} else if (TestCpuFlag(kCpuHasSSSE3) &&
IS_ALIGNED(width, 2) && width <= kMaxStride) {
ARGBToUVRow = BGRAToUVAnyRow_SSSE3;
if (IS_ALIGNED(width, 16)) {
ARGBToUVRow = BGRAToUVRow_Unaligned_SSSE3;
}
} else
#endif
{
......@@ -467,6 +489,11 @@ int ABGRToI420(const uint8* src_frame, int src_stride_frame,
IS_ALIGNED(src_frame, 16) && IS_ALIGNED(src_stride_frame, 16) &&
IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
ARGBToYRow = ABGRToYRow_SSSE3;
} else if (TestCpuFlag(kCpuHasSSSE3) && width <= kMaxStride) {
ARGBToYRow = ABGRToYAnyRow_SSSE3;
if (IS_ALIGNED(width, 16)) {
ARGBToYRow = ABGRToYRow_Unaligned_SSSE3;
}
} else
#endif
{
......@@ -477,6 +504,12 @@ int ABGRToI420(const uint8* src_frame, int src_stride_frame,
IS_ALIGNED(width, 16) &&
IS_ALIGNED(src_frame, 16) && IS_ALIGNED(src_stride_frame, 16)) {
ARGBToUVRow = ABGRToUVRow_SSSE3;
} else if (TestCpuFlag(kCpuHasSSSE3) &&
IS_ALIGNED(width, 2) && width <= kMaxStride) {
ARGBToUVRow = ABGRToUVAnyRow_SSSE3;
if (IS_ALIGNED(width, 16)) {
ARGBToUVRow = ABGRToUVRow_Unaligned_SSSE3;
}
} else
#endif
{
......
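Taken together, the convert.cc hunks above all follow the same three-way dispatch: prefer the fully aligned SSSE3 kernel, otherwise use the new unaligned or "Any" SSSE3 paths when the width fits in a row buffer, and fall back to the C row function last. A condensed sketch of that selection logic for the Y plane, with ARGBToYRow_C as the assumed scalar fallback name (not shown in this diff) and the helper itself purely illustrative:

```cpp
// Illustrative only: condenses the selection logic from the hunks above.
// ARGBToYRow_C is the assumed scalar fallback; the other names are declared
// in row.h below.
typedef void (*ARGBToYRowFn)(const uint8* src_argb, uint8* dst_y, int pix);

static ARGBToYRowFn SelectARGBToYRow(const uint8* src_frame, int src_stride_frame,
                                     uint8* dst_y, int dst_stride_y, int width) {
#if defined(HAS_ARGBTOYROW_SSSE3)
  if (TestCpuFlag(kCpuHasSSSE3)) {
    if (IS_ALIGNED(width, 16) &&
        IS_ALIGNED(src_frame, 16) && IS_ALIGNED(src_stride_frame, 16) &&
        IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
      return ARGBToYRow_SSSE3;              // everything aligned: movdqa path
    }
    if (width <= kMaxStride) {
      if (IS_ALIGNED(width, 16)) {
        return ARGBToYRow_Unaligned_SSSE3;  // good width, unaligned pointers: movdqu path
      }
      return ARGBToYAnyRow_SSSE3;           // arbitrary width: scratch-row wrapper
    }
  }
#endif
  return ARGBToYRow_C;                      // portable fallback
}
```

The UV dispatch is the same pattern with the extra IS_ALIGNED(width, 2) requirement, since each row kernel emits one U and one V sample per pair of pixels.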
......@@ -100,12 +100,22 @@ void FastConvertYUVToABGRRow_NEON(const uint8* y_buf,
void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix);
void BGRAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix);
void ABGRToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix);
void ARGBToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix);
void BGRAToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix);
void ABGRToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix);
void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int width);
void BGRAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int width);
void ABGRToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int width);
void ARGBToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int width);
void BGRAToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int width);
void ABGRToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int width);
void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width);
void MirrorRow_SSE2(const uint8* src, uint8* dst, int width);
......@@ -235,6 +245,16 @@ void ARGBToRGB565AnyRow_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix);
void ARGBToARGB1555AnyRow_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix);
void ARGBToARGB4444AnyRow_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix);
void ARGBToYAnyRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix);
void BGRAToYAnyRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix);
void ABGRToYAnyRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix);
void ARGBToUVAnyRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int width);
void BGRAToUVAnyRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int width);
void ABGRToUVAnyRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int width);
void FastConvertYUVToARGBAnyRow_NEON(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
......
......@@ -380,8 +380,17 @@ void NAMEANY(const uint8* y_buf, \
memcpy(rgb_buf, row, width << 2); \
}
#if defined(HAS_FASTCONVERTYUVTOARGBROW_SSSE3)
MAKEYUVANY(FastConvertYUVToARGBAnyRow_SSSE3, FastConvertYUVToARGBRow_SSSE3)
MAKEYUVANY(FastConvertYUVToBGRAAnyRow_SSSE3, FastConvertYUVToBGRARow_SSSE3)
MAKEYUVANY(FastConvertYUVToABGRAnyRow_SSSE3, FastConvertYUVToABGRRow_SSSE3)
#endif
#if defined(HAS_FASTCONVERTYUVTOARGBROW_NEON)
MAKEYUVANY(FastConvertYUVToARGBAnyRow_NEON, FastConvertYUVToARGBRow_NEON)
MAKEYUVANY(FastConvertYUVToBGRAAnyRow_NEON, FastConvertYUVToBGRARow_NEON)
MAKEYUVANY(FastConvertYUVToABGRAnyRow_NEON, FastConvertYUVToABGRRow_NEON)
#endif
// Wrappers to handle odd sizes/alignments
#define MAKEYUVANYRGB(NAMEANY, ARGBTORGB, BPP) \
void NAMEANY(const uint8* argb_buf, \
uint8* rgb_buf, \
......@@ -391,20 +400,40 @@ void NAMEANY(const uint8* argb_buf, \
memcpy(rgb_buf, row, width * BPP); \
}
#if defined(HAS_FASTCONVERTYUVTOARGBROW_SSSE3)
MAKEYUVANY(FastConvertYUVToARGBAnyRow_SSSE3, FastConvertYUVToARGBRow_SSSE3)
MAKEYUVANY(FastConvertYUVToBGRAAnyRow_SSSE3, FastConvertYUVToBGRARow_SSSE3)
MAKEYUVANY(FastConvertYUVToABGRAnyRow_SSSE3, FastConvertYUVToABGRRow_SSSE3)
#if defined(HAS_ARGBTORGB24ROW_SSSE3)
MAKEYUVANYRGB(ARGBToRGB24AnyRow_SSSE3, ARGBToRGB24Row_SSSE3, 3)
MAKEYUVANYRGB(ARGBToRAWAnyRow_SSSE3, ARGBToRAWRow_SSSE3, 3)
MAKEYUVANYRGB(ARGBToRGB565AnyRow_SSE2, ARGBToRGB565Row_SSE2, 2)
MAKEYUVANYRGB(ARGBToARGB1555AnyRow_SSE2, ARGBToARGB1555Row_SSE2, 2)
MAKEYUVANYRGB(ARGBToARGB4444AnyRow_SSE2, ARGBToARGB4444Row_SSE2, 2)
#endif
#if defined(HAS_FASTCONVERTYUVTOARGBROW_NEON)
MAKEYUVANY(FastConvertYUVToARGBAnyRow_NEON, FastConvertYUVToARGBRow_NEON)
MAKEYUVANY(FastConvertYUVToBGRAAnyRow_NEON, FastConvertYUVToBGRARow_NEON)
MAKEYUVANY(FastConvertYUVToABGRAnyRow_NEON, FastConvertYUVToABGRRow_NEON)
#ifdef HAS_ARGBTOYROW_SSSE3
#define MAKEARGBTOYANY(NAMEANY, ARGBTOY) \
void NAMEANY(const uint8* src_argb, uint8* dst_y, int width) { \
SIMD_ALIGNED(uint8 row[kMaxStride]); \
ARGBTOY(src_argb, row, width); \
memcpy(dst_y, row, width); \
}
MAKEARGBTOYANY(ARGBToYAnyRow_SSSE3, ARGBToYRow_Unaligned_SSSE3)
MAKEARGBTOYANY(BGRAToYAnyRow_SSSE3, BGRAToYRow_Unaligned_SSSE3)
MAKEARGBTOYANY(ABGRToYAnyRow_SSSE3, ABGRToYRow_Unaligned_SSSE3)
#define MAKEARGBTOUVANY(NAMEANY, ARGBTOUV) \
void NAMEANY(const uint8* src_argb0, int src_stride_argb, \
uint8* dst_u, uint8* dst_v, int width) { \
SIMD_ALIGNED(uint8 row[kMaxStride * 2]); \
ARGBTOUV(src_argb0, src_stride_argb, row, row + kMaxStride, width); \
int halfwidth = (width + 1) >> 1; \
memcpy(dst_u, row, halfwidth); \
memcpy(dst_v, row + kMaxStride, halfwidth); \
}
MAKEARGBTOUVANY(ARGBToUVAnyRow_SSSE3, ARGBToUVRow_Unaligned_SSSE3)
MAKEARGBTOUVANY(BGRAToUVAnyRow_SSSE3, BGRAToUVRow_Unaligned_SSSE3)
MAKEARGBTOUVANY(ABGRToUVAnyRow_SSSE3, ABGRToUVRow_Unaligned_SSSE3)
#endif
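Expanded, MAKEARGBTOYANY generates a wrapper along the lines of the sketch below: the SSSE3 kernel rounds the width up to a whole number of 16-pixel groups and writes into an aligned scratch row (which is why the callers above require width <= kMaxStride), and only the valid width bytes are copied to the real, possibly unaligned destination.

```cpp
// Roughly what MAKEARGBTOYANY(ARGBToYAnyRow_SSSE3, ARGBToYRow_Unaligned_SSSE3)
// expands to. SIMD_ALIGNED and kMaxStride come from libyuv's row headers.
void ARGBToYAnyRow_SSSE3(const uint8* src_argb, uint8* dst_y, int width) {
  SIMD_ALIGNED(uint8 row[kMaxStride]);              // 16-byte aligned scratch row
  // The kernel may write a partial tail group past 'width', but only into
  // the scratch row, never into the caller's destination.
  ARGBToYRow_Unaligned_SSSE3(src_argb, row, width);
  memcpy(dst_y, row, width);                        // copy only the valid bytes
}
```

MAKEARGBTOUVANY is the analogous wrapper with a double-height scratch buffer: U is produced in the first kMaxStride bytes, V in the second, and (width + 1) >> 1 bytes of each are copied out.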
#ifdef __cplusplus
......
......@@ -257,6 +257,43 @@ void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
, "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
);
}
void ARGBToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
asm volatile (
"movdqa %4,%%xmm5 \n"
"movdqa %3,%%xmm4 \n"
"1: \n"
"movdqu (%0),%%xmm0 \n"
"movdqu 0x10(%0),%%xmm1 \n"
"movdqu 0x20(%0),%%xmm2 \n"
"movdqu 0x30(%0),%%xmm3 \n"
"pmaddubsw %%xmm4,%%xmm0 \n"
"pmaddubsw %%xmm4,%%xmm1 \n"
"pmaddubsw %%xmm4,%%xmm2 \n"
"pmaddubsw %%xmm4,%%xmm3 \n"
"lea 0x40(%0),%0 \n"
"phaddw %%xmm1,%%xmm0 \n"
"phaddw %%xmm3,%%xmm2 \n"
"psrlw $0x7,%%xmm0 \n"
"psrlw $0x7,%%xmm2 \n"
"packuswb %%xmm2,%%xmm0 \n"
"paddb %%xmm5,%%xmm0 \n"
"movdqu %%xmm0,(%1) \n"
"lea 0x10(%1),%1 \n"
"sub $0x10,%2 \n"
"ja 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_y), // %1
"+r"(pix) // %2
: "m"(kARGBToY), // %3
"m"(kAddY16) // %4
: "memory", "cc"
#if defined(__SSE2__)
, "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
);
}
#endif
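The only difference from ARGBToYRow_SSSE3 is that loads and stores use movdqu instead of movdqa; the arithmetic is identical. As a reference for that arithmetic, a scalar sketch of one pixel, with placeholder coefficient names since the kARGBToY byte constants themselves are not part of this diff:

```cpp
// Scalar model of the per-pixel math in ARGBToYRow_*_SSSE3.
// The pixel's bytes are b, g, r, a in memory (alpha has a zero coefficient);
// YB/YG/YR stand in for the values packed into kARGBToY.
static inline uint8 ARGBPixelToY(uint8 b, uint8 g, uint8 r,
                                 int YB, int YG, int YR) {
  int y = (b * YB + g * YG + r * YR) >> 7;  // pmaddubsw + phaddw, psrlw $0x7
  return static_cast<uint8>(y + 16);        // paddb kAddY16 (packuswb saturation omitted)
}
```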
......@@ -325,6 +362,74 @@ void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
#endif
);
}
void ARGBToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int width) {
asm volatile (
"movdqa %0,%%xmm4 \n"
"movdqa %1,%%xmm3 \n"
"movdqa %2,%%xmm5 \n"
:
: "m"(kARGBToU), // %0
"m"(kARGBToV), // %1
"m"(kAddUV128) // %2
:
#if defined(__SSE2__)
"xmm3", "xmm4", "xmm5"
#endif
);
asm volatile (
"sub %1,%2 \n"
"1: \n"
"movdqu (%0),%%xmm0 \n"
"movdqu 0x10(%0),%%xmm1 \n"
"movdqu 0x20(%0),%%xmm2 \n"
"movdqu 0x30(%0),%%xmm6 \n"
"movdqu (%0,%4,1),%%xmm7 \n"
"pavgb %%xmm7,%%xmm0 \n"
"movdqu 0x10(%0,%4,1),%%xmm7 \n"
"pavgb %%xmm7,%%xmm1 \n"
"movdqu 0x20(%0,%4,1),%%xmm7 \n"
"pavgb %%xmm7,%%xmm2 \n"
"movdqu 0x30(%0,%4,1),%%xmm7 \n"
"pavgb %%xmm7,%%xmm6 \n"
"lea 0x40(%0),%0 \n"
"movdqa %%xmm0,%%xmm7 \n"
"shufps $0x88,%%xmm1,%%xmm0 \n"
"shufps $0xdd,%%xmm1,%%xmm7 \n"
"pavgb %%xmm7,%%xmm0 \n"
"movdqa %%xmm2,%%xmm7 \n"
"shufps $0x88,%%xmm6,%%xmm2 \n"
"shufps $0xdd,%%xmm6,%%xmm7 \n"
"pavgb %%xmm7,%%xmm2 \n"
"movdqa %%xmm0,%%xmm1 \n"
"movdqa %%xmm2,%%xmm6 \n"
"pmaddubsw %%xmm4,%%xmm0 \n"
"pmaddubsw %%xmm4,%%xmm2 \n"
"pmaddubsw %%xmm3,%%xmm1 \n"
"pmaddubsw %%xmm3,%%xmm6 \n"
"phaddw %%xmm2,%%xmm0 \n"
"phaddw %%xmm6,%%xmm1 \n"
"psraw $0x8,%%xmm0 \n"
"psraw $0x8,%%xmm1 \n"
"packsswb %%xmm1,%%xmm0 \n"
"paddb %%xmm5,%%xmm0 \n"
"movlps %%xmm0,(%1) \n"
"movhps %%xmm0,(%1,%2,1) \n"
"lea 0x8(%1),%1 \n"
"sub $0x10,%3 \n"
"ja 1b \n"
: "+r"(src_argb0), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
"+rm"(width) // %3
: "r"(static_cast<intptr_t>(src_stride_argb))
: "memory", "cc"
#if defined(__SSE2__)
, "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
#endif
);
}
#endif
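Likewise, the unaligned UV kernel performs the aligned kernel's math on movdqu loads: the two input rows are averaged with pavgb, horizontally paired with shufps/pavgb so each 2x2 block collapses to one pixel, then multiplied against the kARGBToU and kARGBToV vectors, shifted right by 8, and biased by kAddUV128. A scalar sketch of one 2x2 block, again with placeholder coefficients:

```cpp
// Scalar model of the per-block math in ARGBToUVRow_*_SSSE3.
// UB/UG/UR and VB/VG/VR stand in for kARGBToU / kARGBToV (not in this diff).
struct Pixel { uint8 b, g, r; };  // alpha ignored (zero coefficient)

static inline void ARGBBlockToUV(Pixel p00, Pixel p01, Pixel p10, Pixel p11,
                                 int UB, int UG, int UR,
                                 int VB, int VG, int VR,
                                 uint8* dst_u, uint8* dst_v) {
  // Two pavgb passes reduce the 2x2 block to one pixel (rounding simplified).
  int b = (p00.b + p01.b + p10.b + p11.b + 2) >> 2;
  int g = (p00.g + p01.g + p10.g + p11.g + 2) >> 2;
  int r = (p00.r + p01.r + p10.r + p11.r + 2) >> 2;
  // pmaddubsw/phaddw dot products, psraw $0x8, then +128 from kAddUV128.
  *dst_u = static_cast<uint8>(((b * UB + g * UG + r * UR) >> 8) + 128);
  *dst_v = static_cast<uint8>(((b * VB + g * VG + r * VR) >> 8) + 128);
}
```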
#ifdef HAS_FASTCONVERTYUVTOARGBROW_SSSE3
......@@ -624,6 +729,18 @@ void BGRAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
BGRAToARGBRow_SSSE3(src_argb, row, pix);
ARGBToYRow_SSSE3(row, dst_y, pix);
}
void ABGRToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
SIMD_ALIGNED(uint8 row[kMaxStride]);
ABGRToARGBRow_C(src_argb, row, pix);
ARGBToYRow_SSSE3(row, dst_y, pix);
}
void BGRAToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
SIMD_ALIGNED(uint8 row[kMaxStride]);
BGRAToARGBRow_C(src_argb, row, pix);
ARGBToYRow_SSSE3(row, dst_y, pix);
}
#endif
#ifdef HAS_ARGBTOUVROW_SSSE3
......@@ -642,6 +759,22 @@ void BGRAToUVRow_SSSE3(const uint8* src_argb, int src_stride_argb,
BGRAToARGBRow_SSSE3(src_argb + src_stride_argb, row + kMaxStride, pix);
ARGBToUVRow_SSSE3(row, kMaxStride, dst_u, dst_v, pix);
}
void ABGRToUVRow_Unaligned_SSSE3(const uint8* src_argb, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int pix) {
SIMD_ALIGNED(uint8 row[kMaxStride * 2]);
ABGRToARGBRow_C(src_argb, row, pix);
ABGRToARGBRow_C(src_argb + src_stride_argb, row + kMaxStride, pix);
ARGBToUVRow_SSSE3(row, kMaxStride, dst_u, dst_v, pix);
}
void BGRAToUVRow_Unaligned_SSSE3(const uint8* src_argb, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int pix) {
SIMD_ALIGNED(uint8 row[kMaxStride * 2]);
BGRAToARGBRow_C(src_argb, row, pix);
BGRAToARGBRow_C(src_argb + src_stride_argb, row + kMaxStride, pix);
ARGBToUVRow_SSSE3(row, kMaxStride, dst_u, dst_v, pix);
}
#endif
#ifdef HAS_MIRRORROW_SSSE3
......
......@@ -611,6 +611,39 @@ __asm {
}
}
__declspec(naked)
void ARGBToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
__asm {
mov eax, [esp + 4] /* src_argb */
mov edx, [esp + 8] /* dst_y */
mov ecx, [esp + 12] /* pix */
movdqa xmm5, kAddY16
movdqa xmm4, kARGBToY
convertloop:
movdqu xmm0, [eax]
movdqu xmm1, [eax + 16]
movdqu xmm2, [eax + 32]
movdqu xmm3, [eax + 48]
pmaddubsw xmm0, xmm4
pmaddubsw xmm1, xmm4
pmaddubsw xmm2, xmm4
pmaddubsw xmm3, xmm4
lea eax, [eax + 64]
phaddw xmm0, xmm1
phaddw xmm2, xmm3
psrlw xmm0, 7
psrlw xmm2, 7
packuswb xmm0, xmm2
paddb xmm0, xmm5
movdqu [edx], xmm0
lea edx, [edx + 16]
sub ecx, 16
ja convertloop
ret
}
}
__declspec(naked)
void BGRAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
__asm {
......@@ -644,6 +677,39 @@ __asm {
}
}
__declspec(naked)
void BGRAToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
__asm {
mov eax, [esp + 4] /* src_argb */
mov edx, [esp + 8] /* dst_y */
mov ecx, [esp + 12] /* pix */
movdqa xmm5, kAddY16
movdqa xmm4, kBGRAToY
convertloop:
movdqu xmm0, [eax]
movdqu xmm1, [eax + 16]
movdqu xmm2, [eax + 32]
movdqu xmm3, [eax + 48]
pmaddubsw xmm0, xmm4
pmaddubsw xmm1, xmm4
pmaddubsw xmm2, xmm4
pmaddubsw xmm3, xmm4
lea eax, [eax + 64]
phaddw xmm0, xmm1
phaddw xmm2, xmm3
psrlw xmm0, 7
psrlw xmm2, 7
packuswb xmm0, xmm2
paddb xmm0, xmm5
movdqu [edx], xmm0
lea edx, [edx + 16]
sub ecx, 16
ja convertloop
ret
}
}
__declspec(naked)
void ABGRToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
__asm {
......@@ -677,6 +743,39 @@ __asm {
}
}
__declspec(naked)
void ABGRToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
__asm {
mov eax, [esp + 4] /* src_argb */
mov edx, [esp + 8] /* dst_y */
mov ecx, [esp + 12] /* pix */
movdqa xmm5, kAddY16
movdqa xmm4, kABGRToY
convertloop:
movdqu xmm0, [eax]
movdqu xmm1, [eax + 16]
movdqu xmm2, [eax + 32]
movdqu xmm3, [eax + 48]
pmaddubsw xmm0, xmm4
pmaddubsw xmm1, xmm4
pmaddubsw xmm2, xmm4
pmaddubsw xmm3, xmm4
lea eax, [eax + 64]
phaddw xmm0, xmm1
phaddw xmm2, xmm3
psrlw xmm0, 7
psrlw xmm2, 7
packuswb xmm0, xmm2
paddb xmm0, xmm5
movdqu [edx], xmm0
lea edx, [edx + 16]
sub ecx, 16
ja convertloop
ret
}
}
__declspec(naked)
void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int width) {
......@@ -741,6 +840,75 @@ __asm {
}
}
__declspec(naked)
void ARGBToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int width) {
__asm {
push esi
push edi
mov eax, [esp + 8 + 4] // src_argb
mov esi, [esp + 8 + 8] // src_stride_argb
mov edx, [esp + 8 + 12] // dst_u
mov edi, [esp + 8 + 16] // dst_v
mov ecx, [esp + 8 + 20] // pix
movdqa xmm7, kARGBToU
movdqa xmm6, kARGBToV
movdqa xmm5, kAddUV128
sub edi, edx // stride from u to v
convertloop:
/* step 1 - subsample 16x2 argb pixels to 8x1 */
movdqu xmm0, [eax]
movdqu xmm1, [eax + 16]
movdqu xmm2, [eax + 32]
movdqu xmm3, [eax + 48]
movdqu xmm4, [eax + esi]
pavgb xmm0, xmm4
movdqu xmm4, [eax + esi + 16]
pavgb xmm1, xmm4
movdqu xmm4, [eax + esi + 32]
pavgb xmm2, xmm4
movdqu xmm4, [eax + esi + 48]
pavgb xmm3, xmm4
lea eax, [eax + 64]
movdqa xmm4, xmm0
shufps xmm0, xmm1, 0x88
shufps xmm4, xmm1, 0xdd
pavgb xmm0, xmm4
movdqa xmm4, xmm2
shufps xmm2, xmm3, 0x88
shufps xmm4, xmm3, 0xdd
pavgb xmm2, xmm4
// step 2 - convert to U and V
// from here down is very similar to Y code except
// instead of 16 different pixels, it's 8 pixels of U and 8 of V
movdqa xmm1, xmm0
movdqa xmm3, xmm2
pmaddubsw xmm0, xmm7 // U
pmaddubsw xmm2, xmm7
pmaddubsw xmm1, xmm6 // V
pmaddubsw xmm3, xmm6
phaddw xmm0, xmm2
phaddw xmm1, xmm3
psraw xmm0, 8
psraw xmm1, 8
packsswb xmm0, xmm1
paddb xmm0, xmm5 // -> unsigned
// step 3 - store 8 U and 8 V values
movlps qword ptr [edx], xmm0 // U
movhps qword ptr [edx + edi], xmm0 // V
lea edx, [edx + 8]
sub ecx, 16
ja convertloop
pop edi
pop esi
ret
}
}
__declspec(naked)
void BGRAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int width) {
......@@ -805,6 +973,74 @@ __asm {
}
}
__declspec(naked)
void BGRAToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int width) {
__asm {
push esi
push edi
mov eax, [esp + 8 + 4] // src_argb
mov esi, [esp + 8 + 8] // src_stride_argb
mov edx, [esp + 8 + 12] // dst_u
mov edi, [esp + 8 + 16] // dst_v
mov ecx, [esp + 8 + 20] // pix
movdqa xmm7, kBGRAToU
movdqa xmm6, kBGRAToV
movdqa xmm5, kAddUV128
sub edi, edx // stride from u to v
convertloop:
/* step 1 - subsample 16x2 argb pixels to 8x1 */
movdqu xmm0, [eax]
movdqu xmm1, [eax + 16]
movdqu xmm2, [eax + 32]
movdqu xmm3, [eax + 48]
movdqu xmm4, [eax + esi]
pavgb xmm0, xmm4
movdqu xmm4, [eax + esi + 16]
pavgb xmm1, xmm4
movdqu xmm4, [eax + esi + 32]
pavgb xmm2, xmm4
movdqu xmm4, [eax + esi + 48]
pavgb xmm3, xmm4
lea eax, [eax + 64]
movdqa xmm4, xmm0
shufps xmm0, xmm1, 0x88
shufps xmm4, xmm1, 0xdd
pavgb xmm0, xmm4
movdqa xmm4, xmm2
shufps xmm2, xmm3, 0x88
shufps xmm4, xmm3, 0xdd
pavgb xmm2, xmm4
// step 2 - convert to U and V
// from here down is very similar to Y code except
// instead of 16 different pixels, it's 8 pixels of U and 8 of V
movdqa xmm1, xmm0
movdqa xmm3, xmm2
pmaddubsw xmm0, xmm7 // U
pmaddubsw xmm2, xmm7
pmaddubsw xmm1, xmm6 // V
pmaddubsw xmm3, xmm6
phaddw xmm0, xmm2
phaddw xmm1, xmm3
psraw xmm0, 8
psraw xmm1, 8
packsswb xmm0, xmm1
paddb xmm0, xmm5 // -> unsigned
// step 3 - store 8 U and 8 V values
movlps qword ptr [edx], xmm0 // U
movhps qword ptr [edx + edi], xmm0 // V
lea edx, [edx + 8]
sub ecx, 16
ja convertloop
pop edi
pop esi
ret
}
}
__declspec(naked)
void ABGRToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int width) {
......@@ -869,6 +1105,75 @@ __asm {
}
}
__declspec(naked)
void ABGRToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int width) {
__asm {
push esi
push edi
mov eax, [esp + 8 + 4] // src_argb
mov esi, [esp + 8 + 8] // src_stride_argb
mov edx, [esp + 8 + 12] // dst_u
mov edi, [esp + 8 + 16] // dst_v
mov ecx, [esp + 8 + 20] // pix
movdqa xmm7, kABGRToU
movdqa xmm6, kABGRToV
movdqa xmm5, kAddUV128
sub edi, edx // stride from u to v
convertloop:
/* step 1 - subsample 16x2 argb pixels to 8x1 */
movdqu xmm0, [eax]
movdqu xmm1, [eax + 16]
movdqu xmm2, [eax + 32]
movdqu xmm3, [eax + 48]
movdqu xmm4, [eax + esi]
pavgb xmm0, xmm4
movdqu xmm4, [eax + esi + 16]
pavgb xmm1, xmm4
movdqu xmm4, [eax + esi + 32]
pavgb xmm2, xmm4
movdqu xmm4, [eax + esi + 48]
pavgb xmm3, xmm4
lea eax, [eax + 64]
movdqa xmm4, xmm0
shufps xmm0, xmm1, 0x88
shufps xmm4, xmm1, 0xdd
pavgb xmm0, xmm4
movdqa xmm4, xmm2
shufps xmm2, xmm3, 0x88
shufps xmm4, xmm3, 0xdd
pavgb xmm2, xmm4
// step 2 - convert to U and V
// from here down is very similar to Y code except
// instead of 16 different pixels, it's 8 pixels of U and 8 of V
movdqa xmm1, xmm0
movdqa xmm3, xmm2
pmaddubsw xmm0, xmm7 // U
pmaddubsw xmm2, xmm7
pmaddubsw xmm1, xmm6 // V
pmaddubsw xmm3, xmm6
phaddw xmm0, xmm2
phaddw xmm1, xmm3
psraw xmm0, 8
psraw xmm1, 8
packsswb xmm0, xmm1
paddb xmm0, xmm5 // -> unsigned
// step 3 - store 8 U and 8 V values
movlps qword ptr [edx], xmm0 // U
movhps qword ptr [edx + edi], xmm0 // V
lea edx, [edx + 8]
sub ecx, 16
ja convertloop
pop edi
pop esi
ret
}
}
#ifdef HAS_FASTCONVERTYUVTOARGBROW_SSSE3
#define YG 74 /* static_cast<int8>(1.164 * 64 + 0.5) */
......
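With these paths wired up, the planar converters no longer need aligned, multiple-of-16 inputs to use SSSE3. A usage sketch: the full ARGBToI420 parameter list below follows libyuv's per-plane stride convention and is assumed rather than shown in this diff, so treat the header name and argument order as illustrative.

```cpp
#include "libyuv.h"  // umbrella header name assumed

// Convert an odd-width, possibly unaligned ARGB frame to I420.
// Odd widths now hit the *AnyRow_SSSE3 wrappers; multiple-of-16 widths with
// unaligned pointers hit the *_Unaligned_SSSE3 kernels.
bool ArgbFrameToI420(const uint8* argb, int argb_stride,
                     uint8* y, uint8* u, uint8* v,
                     int width, int height) {
  int half_width = (width + 1) / 2;
  return libyuv::ARGBToI420(argb, argb_stride,
                            y, width,        // dst_y, dst_stride_y
                            u, half_width,   // dst_u, dst_stride_u
                            v, half_width,   // dst_v, dst_stride_v
                            width, height) == 0;
}
```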