Commit 9de8867a authored by fbarchard@google.com's avatar fbarchard@google.com

I422ToYUY2_NEON

BUG=none
TEST=convert_test *I422ToYUY2*
Review URL: https://webrtc-codereview.appspot.com/869008

git-svn-id: http://libyuv.googlecode.com/svn/trunk@413 16f28f9a-4ce2-e073-06de-1de4eb20be90
parent 7d3786c6
Name: libyuv
URL: http://code.google.com/p/libyuv/
Version: 410
Version: 413
License: BSD
License File: LICENSE
......
......@@ -84,6 +84,8 @@ extern "C" {
#define HAS_YUY2TOUV422ROW_SSE2
#define HAS_YUY2TOUVROW_SSE2
#define HAS_YUY2TOYROW_SSE2
#define HAS_I422TOYUY2ROW_SSE2
#define HAS_I422TOUYVYROW_SSE2
// Effects
#define HAS_ARGBAFFINEROW_SSE2
......@@ -119,6 +121,9 @@ extern "C" {
#if !defined(YUV_DISABLE_ASM) && (defined(__ARM_NEON__) || defined(LIBYUV_NEON))
#define HAS_ABGRTOARGBROW_NEON
#define HAS_ARGBTOBAYERROW_NEON
#define HAS_ARGBTORAWROW_NEON
#define HAS_ARGBTORGB24ROW_NEON
#define HAS_ARGBTORGBAROW_NEON
#define HAS_BGRATOARGBROW_NEON
#define HAS_COPYROW_NEON
#define HAS_HALFROW_NEON
......@@ -130,6 +135,8 @@ extern "C" {
#define HAS_I422TORGBAROW_NEON
#define HAS_MIRRORROW_NEON
#define HAS_MIRRORROWUV_NEON
#define HAS_NV12TOARGBROW_NEON
#define HAS_NV21TOARGBROW_NEON
#define HAS_RAWTOARGBROW_NEON
#define HAS_RGB24TOARGBROW_NEON
#define HAS_RGBATOARGBROW_NEON
......@@ -141,13 +148,9 @@ extern "C" {
#define HAS_YUY2TOUV422ROW_NEON
#define HAS_YUY2TOUVROW_NEON
#define HAS_YUY2TOYROW_NEON
#define HAS_I422TOYUY2ROW_NEON
#define HAS_I422TOUYVYROW_NEON
// TODO(fbarchard): Hook these up to calling functions.
#define HAS_ARGBTORAWROW_NEON
#define HAS_ARGBTORGB24ROW_NEON
#define HAS_ARGBTORGBAROW_NEON
#define HAS_NV12TOARGBROW_NEON
#define HAS_NV21TOARGBROW_NEON
#endif
#if defined(_MSC_VER) && !defined(__CLR_VER)
......@@ -768,6 +771,31 @@ void ARGBToBayerRow_SSSE3(const uint8* src_argb,
void ARGBToBayerRow_NEON(const uint8* src_argb,
uint8* dst_bayer, uint32 selector, int pix);
void I422ToYUY2Row_C(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
uint8* dst_frame, int width);
void I422ToUYVYRow_C(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
uint8* dst_frame, int width);
void I422ToYUY2Row_SSE2(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
uint8* dst_frame, int width);
void I422ToUYVYRow_SSE2(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
uint8* dst_frame, int width);
void I422ToYUY2Row_NEON(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
uint8* dst_frame, int width);
void I422ToUYVYRow_NEON(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
uint8* dst_frame, int width);
#ifdef __cplusplus
} // extern "C"
} // namespace libyuv
......
......@@ -11,6 +11,6 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT
#define INCLUDE_LIBYUV_VERSION_H_
#define LIBYUV_VERSION 410
#define LIBYUV_VERSION 413
#endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT
......@@ -222,208 +222,6 @@ int I400Copy(const uint8* src_y, int src_stride_y,
return 0;
}
// YUY2 - Macro-pixel = 2 image pixels
// Y0U0Y1V0....Y2U2Y3V2...Y4U4Y5V4....
// UYVY - Macro-pixel = 2 image pixels
// U0Y0V0Y1
#if !defined(YUV_DISABLE_ASM) && defined(_M_IX86)
#define HAS_I422TOYUY2ROW_SSE2
// Visual C (x86) SSE2: pack one row of planar I422 (Y, U, V) into
// packed YUY2 (macro-pixel = Y0 U0 Y1 V0 = 2 image pixels).
// Per loop iteration: reads 16 Y, 8 U, 8 V; writes 32 bytes (16 pixels).
// Call sites guard with IS_ALIGNED(width, 16) and 16-byte alignment of
// src_y/dst_frame, which movdqa requires.
__declspec(naked) __declspec(align(16))
static void I422ToYUY2Row_SSE2(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
uint8* dst_frame, int width) {
__asm {
push esi
push edi
mov eax, [esp + 8 + 4] // src_y
mov esi, [esp + 8 + 8] // src_u
mov edx, [esp + 8 + 12] // src_v
mov edi, [esp + 8 + 16] // dst_frame
mov ecx, [esp + 8 + 20] // width
sub edx, esi // edx = src_v - src_u; V is read at [esi + edx]
align 16
convertloop:
movq xmm2, qword ptr [esi] // U
movq xmm3, qword ptr [esi + edx] // V
lea esi, [esi + 8]
punpcklbw xmm2, xmm3 // UV
movdqa xmm0, [eax] // Y
lea eax, [eax + 16]
movdqa xmm1, xmm0
punpcklbw xmm0, xmm2 // YUYV
punpckhbw xmm1, xmm2
movdqa [edi], xmm0
movdqa [edi + 16], xmm1
lea edi, [edi + 32]
sub ecx, 16 // 16 pixels consumed per pass
jg convertloop
pop edi
pop esi
ret
}
}
#define HAS_I422TOUYVYROW_SSE2
// Visual C (x86) SSE2: pack one row of planar I422 (Y, U, V) into
// packed UYVY (macro-pixel = U0 Y0 V0 Y1 = 2 image pixels).
// Per loop iteration: reads 16 Y, 8 U, 8 V; writes 32 bytes (16 pixels).
// Call sites guard with IS_ALIGNED(width, 16) and 16-byte alignment of
// src_y/dst_frame, which movdqa requires.
__declspec(naked) __declspec(align(16))
static void I422ToUYVYRow_SSE2(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
uint8* dst_frame, int width) {
__asm {
push esi
push edi
mov eax, [esp + 8 + 4] // src_y
mov esi, [esp + 8 + 8] // src_u
mov edx, [esp + 8 + 12] // src_v
mov edi, [esp + 8 + 16] // dst_frame
mov ecx, [esp + 8 + 20] // width
sub edx, esi // edx = src_v - src_u; V is read at [esi + edx]
align 16
convertloop:
movq xmm2, qword ptr [esi] // U
movq xmm3, qword ptr [esi + edx] // V
lea esi, [esi + 8]
punpcklbw xmm2, xmm3 // UV
movdqa xmm0, [eax] // Y
movdqa xmm1, xmm2
lea eax, [eax + 16]
punpcklbw xmm1, xmm0 // UYVY
punpckhbw xmm2, xmm0
movdqa [edi], xmm1
movdqa [edi + 16], xmm2
lea edi, [edi + 32]
sub ecx, 16 // 16 pixels consumed per pass
jg convertloop
pop edi
pop esi
ret
}
}
#elif !defined(YUV_DISABLE_ASM) && (defined(__x86_64__) || defined(__i386__))
#define HAS_I422TOYUY2ROW_SSE2
// GCC/clang inline asm (x86/x86-64) SSE2: pack one row of I422 into
// packed YUY2 (Y0 U0 Y1 V0). Operands: %0=src_y, %1=src_u, %2=src_v,
// %3=dst_frame, %4=width. Each iteration consumes 16 Y, 8 U, 8 V and
// stores 32 bytes. Call sites guarantee width % 16 == 0 and 16-byte
// alignment of src_y/dst_frame (movdqa is used for those accesses).
static void I422ToYUY2Row_SSE2(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
uint8* dst_frame, int width) {
asm volatile (
"sub %1,%2 \n" // %2 = src_v - src_u; V loads use (%1,%2,1)
".p2align 4 \n"
"1: \n"
"movq (%1),%%xmm2 \n" // 8 U bytes
"movq (%1,%2,1),%%xmm3 \n" // 8 V bytes
"lea 0x8(%1),%1 \n"
"punpcklbw %%xmm3,%%xmm2 \n" // interleave to UVUV...
"movdqa (%0),%%xmm0 \n" // 16 Y bytes
"lea 0x10(%0),%0 \n"
"movdqa %%xmm0,%%xmm1 \n"
"punpcklbw %%xmm2,%%xmm0 \n" // low 8 pixels -> YUYV
"punpckhbw %%xmm2,%%xmm1 \n" // high 8 pixels -> YUYV
"movdqa %%xmm0,(%3) \n"
"movdqa %%xmm1,0x10(%3) \n"
"lea 0x20(%3),%3 \n"
"sub $0x10,%4 \n" // 16 pixels done
"jg 1b \n"
: "+r"(src_y), // %0
"+r"(src_u), // %1
"+r"(src_v), // %2
"+r"(dst_frame), // %3
"+rm"(width) // %4
:
: "memory", "cc"
#if defined(__SSE2__)
, "xmm0", "xmm1", "xmm2", "xmm3"
#endif
);
}
#define HAS_I422TOUYVYROW_SSE2
// GCC/clang inline asm (x86/x86-64) SSE2: pack one row of I422 into
// packed UYVY (U0 Y0 V0 Y1). Operands: %0=src_y, %1=src_u, %2=src_v,
// %3=dst_frame, %4=width. Each iteration consumes 16 Y, 8 U, 8 V and
// stores 32 bytes. Call sites guarantee width % 16 == 0 and 16-byte
// alignment of src_y/dst_frame (movdqa is used for those accesses).
static void I422ToUYVYRow_SSE2(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
uint8* dst_frame, int width) {
asm volatile (
"sub %1,%2 \n" // %2 = src_v - src_u; V loads use (%1,%2,1)
".p2align 4 \n"
"1: \n"
"movq (%1),%%xmm2 \n" // 8 U bytes
"movq (%1,%2,1),%%xmm3 \n" // 8 V bytes
"lea 0x8(%1),%1 \n"
"punpcklbw %%xmm3,%%xmm2 \n" // interleave to UVUV...
"movdqa (%0),%%xmm0 \n" // 16 Y bytes
"movdqa %%xmm2,%%xmm1 \n"
"lea 0x10(%0),%0 \n"
"punpcklbw %%xmm0,%%xmm1 \n" // low 8 pixels -> UYVY
"punpckhbw %%xmm0,%%xmm2 \n" // high 8 pixels -> UYVY
"movdqa %%xmm1,(%3) \n"
"movdqa %%xmm2,0x10(%3) \n"
"lea 0x20(%3),%3 \n"
"sub $0x10,%4 \n" // 16 pixels done
"jg 1b \n"
: "+r"(src_y), // %0
"+r"(src_u), // %1
"+r"(src_v), // %2
"+r"(dst_frame), // %3
"+rm"(width) // %4
:
: "memory", "cc"
#if defined(__SSE2__)
, "xmm0", "xmm1", "xmm2", "xmm3"
#endif
);
}
#endif
// Reference C path: pack one row of planar I422 into YUY2 macro-pixels
// (Y0 U0 Y1 V0 = 2 image pixels). Pixels are handled in pairs; an odd
// trailing pixel duplicates its Y so a full macro-pixel is emitted.
static void I422ToYUY2Row_C(const uint8* src_y,
                            const uint8* src_u,
                            const uint8* src_v,
                            uint8* dst_frame, int width) {
  int pairs = width >> 1;
  while (pairs-- > 0) {
    *dst_frame++ = *src_y++;
    *dst_frame++ = *src_u++;
    *dst_frame++ = *src_y++;
    *dst_frame++ = *src_v++;
  }
  if (width & 1) {
    dst_frame[0] = src_y[0];
    dst_frame[1] = src_u[0];
    dst_frame[2] = src_y[0];  // duplicate last y
    dst_frame[3] = src_v[0];
  }
}
// Reference C path: pack one row of planar I422 into UYVY macro-pixels
// (U0 Y0 V0 Y1 = 2 image pixels). Pixels are handled in pairs; an odd
// trailing pixel duplicates its Y so a full macro-pixel is emitted.
static void I422ToUYVYRow_C(const uint8* src_y,
                            const uint8* src_u,
                            const uint8* src_v,
                            uint8* dst_frame, int width) {
  int pairs = width >> 1;
  while (pairs-- > 0) {
    *dst_frame++ = *src_u++;
    *dst_frame++ = *src_y++;
    *dst_frame++ = *src_v++;
    *dst_frame++ = *src_y++;
  }
  if (width & 1) {
    dst_frame[0] = src_u[0];
    dst_frame[1] = src_y[0];
    dst_frame[2] = src_v[0];
    dst_frame[3] = src_y[0];  // duplicate last y
  }
}
// Visual C x86 or GCC little endian.
#if defined(__x86_64__) || defined(_M_X64) || \
defined(__i386__) || defined(_M_IX86) || \
......@@ -463,7 +261,6 @@ static void UYVYToV210Row_C(const uint8* src_uyvy, uint8* dst_v210, int width) {
}
}
// TODO(fbarchard): Deprecate, move or expand 422 support?
LIBYUV_API
int I422ToYUY2(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
......@@ -490,6 +287,10 @@ int I422ToYUY2(const uint8* src_y, int src_stride_y,
IS_ALIGNED(dst_frame, 16) && IS_ALIGNED(dst_stride_frame, 16)) {
I422ToYUY2Row = I422ToYUY2Row_SSE2;
}
#elif defined(HAS_I422TOYUY2ROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 16)) {
I422ToYUY2Row = I422ToYUY2Row_NEON;
}
#endif
for (int y = 0; y < height; ++y) {
......@@ -528,6 +329,10 @@ int I420ToYUY2(const uint8* src_y, int src_stride_y,
IS_ALIGNED(dst_frame, 16) && IS_ALIGNED(dst_stride_frame, 16)) {
I422ToYUY2Row = I422ToYUY2Row_SSE2;
}
#elif defined(HAS_I422TOYUY2ROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 16)) {
I422ToYUY2Row = I422ToYUY2Row_NEON;
}
#endif
for (int y = 0; y < height - 1; y += 2) {
......@@ -572,6 +377,10 @@ int I422ToUYVY(const uint8* src_y, int src_stride_y,
IS_ALIGNED(dst_frame, 16) && IS_ALIGNED(dst_stride_frame, 16)) {
I422ToUYVYRow = I422ToUYVYRow_SSE2;
}
#elif defined(HAS_I422TOUYVYROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 16)) {
I422ToUYVYRow = I422ToUYVYRow_NEON;
}
#endif
for (int y = 0; y < height; ++y) {
......@@ -610,6 +419,10 @@ int I420ToUYVY(const uint8* src_y, int src_stride_y,
IS_ALIGNED(dst_frame, 16) && IS_ALIGNED(dst_stride_frame, 16)) {
I422ToUYVYRow = I422ToUYVYRow_SSE2;
}
#elif defined(HAS_I422TOUYVYROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 16)) {
I422ToUYVYRow = I422ToUYVYRow_NEON;
}
#endif
for (int y = 0; y < height - 1; y += 2) {
......
......@@ -1279,6 +1279,50 @@ void ARGBToBayerRow_C(const uint8* src_argb,
}
}
// Pack one row of planar I422 (Y, U, V) into YUY2 (Y0 U0 Y1 V0).
// Two image pixels share one U and one V; an odd final pixel repeats
// its Y value so the last macro-pixel is complete.
void I422ToYUY2Row_C(const uint8* src_y,
                     const uint8* src_u,
                     const uint8* src_v,
                     uint8* dst_frame, int width) {
  int pairs = width >> 1;
  while (pairs-- > 0) {
    *dst_frame++ = *src_y++;
    *dst_frame++ = *src_u++;
    *dst_frame++ = *src_y++;
    *dst_frame++ = *src_v++;
  }
  if (width & 1) {
    dst_frame[0] = src_y[0];
    dst_frame[1] = src_u[0];
    dst_frame[2] = src_y[0];  // duplicate last y
    dst_frame[3] = src_v[0];
  }
}
// Pack one row of planar I422 (Y, U, V) into UYVY (U0 Y0 V0 Y1).
// Two image pixels share one U and one V; an odd final pixel repeats
// its Y value so the last macro-pixel is complete.
void I422ToUYVYRow_C(const uint8* src_y,
                     const uint8* src_u,
                     const uint8* src_v,
                     uint8* dst_frame, int width) {
  int pairs = width >> 1;
  while (pairs-- > 0) {
    *dst_frame++ = *src_u++;
    *dst_frame++ = *src_y++;
    *dst_frame++ = *src_v++;
    *dst_frame++ = *src_y++;
  }
  if (width & 1) {
    dst_frame[0] = src_u[0];
    dst_frame[1] = src_y[0];
    dst_frame[2] = src_v[0];
    dst_frame[3] = src_y[0];  // duplicate last y
  }
}
#ifdef __cplusplus
} // extern "C"
} // namespace libyuv
......
......@@ -75,10 +75,10 @@ static const vec8 kUVToG = { -25, -25, -25, -25, -52, -52, -52, -52,
#endif
#ifdef HAS_I422TOARGBROW_NEON
void I422ToARGBRow_NEON(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
void I422ToARGBRow_NEON(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
uint8* dst_argb,
int width) {
asm volatile (
"vld1.u8 {d24}, [%5] \n"
......@@ -94,13 +94,13 @@ void I422ToARGBRow_NEON(const uint8* y_buf,
"vmov.u8 d23, #255 \n"
"vst4.8 {d20, d21, d22, d23}, [%3]! \n"
"bgt 1b \n"
: "+r"(y_buf), // %0
"+r"(u_buf), // %1
"+r"(v_buf), // %2
"+r"(rgb_buf), // %3
"+r"(width) // %4
: "r"(&kUVToRB), // %5
"r"(&kUVToG) // %6
: "+r"(src_y), // %0
"+r"(src_u), // %1
"+r"(src_v), // %2
"+r"(dst_argb), // %3
"+r"(width) // %4
: "r"(&kUVToRB), // %5
"r"(&kUVToG) // %6
: "cc", "memory", "q0", "q1", "q2", "q3",
"q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
);
......@@ -108,10 +108,10 @@ void I422ToARGBRow_NEON(const uint8* y_buf,
#endif // HAS_I422TOARGBROW_NEON
#ifdef HAS_I422TOBGRAROW_NEON
void I422ToBGRARow_NEON(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
void I422ToBGRARow_NEON(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
uint8* dst_bgra,
int width) {
asm volatile (
"vld1.u8 {d24}, [%5] \n"
......@@ -128,13 +128,13 @@ void I422ToBGRARow_NEON(const uint8* y_buf,
"vmov.u8 d19, #255 \n"
"vst4.8 {d19, d20, d21, d22}, [%3]! \n"
"bgt 1b \n"
: "+r"(y_buf), // %0
"+r"(u_buf), // %1
"+r"(v_buf), // %2
"+r"(rgb_buf), // %3
"+r"(width) // %4
: "r"(&kUVToRB), // %5
"r"(&kUVToG) // %6
: "+r"(src_y), // %0
"+r"(src_u), // %1
"+r"(src_v), // %2
"+r"(dst_bgra), // %3
"+r"(width) // %4
: "r"(&kUVToRB), // %5
"r"(&kUVToG) // %6
: "cc", "memory", "q0", "q1", "q2", "q3",
"q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
);
......@@ -142,10 +142,10 @@ void I422ToBGRARow_NEON(const uint8* y_buf,
#endif // HAS_I422TOBGRAROW_NEON
#ifdef HAS_I422TOABGRROW_NEON
void I422ToABGRRow_NEON(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
void I422ToABGRRow_NEON(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
uint8* dst_abgr,
int width) {
asm volatile (
"vld1.u8 {d24}, [%5] \n"
......@@ -162,13 +162,13 @@ void I422ToABGRRow_NEON(const uint8* y_buf,
"vmov.u8 d23, #255 \n"
"vst4.8 {d20, d21, d22, d23}, [%3]! \n"
"bgt 1b \n"
: "+r"(y_buf), // %0
"+r"(u_buf), // %1
"+r"(v_buf), // %2
"+r"(rgb_buf), // %3
"+r"(width) // %4
: "r"(&kUVToRB), // %5
"r"(&kUVToG) // %6
: "+r"(src_y), // %0
"+r"(src_u), // %1
"+r"(src_v), // %2
"+r"(dst_abgr), // %3
"+r"(width) // %4
: "r"(&kUVToRB), // %5
"r"(&kUVToG) // %6
: "cc", "memory", "q0", "q1", "q2", "q3",
"q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
);
......@@ -176,10 +176,10 @@ void I422ToABGRRow_NEON(const uint8* y_buf,
#endif // HAS_I422TOABGRROW_NEON
#ifdef HAS_I422TORGBAROW_NEON
void I422ToRGBARow_NEON(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
void I422ToRGBARow_NEON(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
uint8* dst_rgba,
int width) {
asm volatile (
"vld1.u8 {d24}, [%5] \n"
......@@ -195,13 +195,13 @@ void I422ToRGBARow_NEON(const uint8* y_buf,
"vmov.u8 d19, #255 \n"
"vst4.8 {d19, d20, d21, d22}, [%3]! \n"
"bgt 1b \n"
: "+r"(y_buf), // %0
"+r"(u_buf), // %1
"+r"(v_buf), // %2
"+r"(rgb_buf), // %3
"+r"(width) // %4
: "r"(&kUVToRB), // %5
"r"(&kUVToG) // %6
: "+r"(src_y), // %0
"+r"(src_u), // %1
"+r"(src_v), // %2
"+r"(dst_rgba), // %3
"+r"(width) // %4
: "r"(&kUVToRB), // %5
"r"(&kUVToG) // %6
: "cc", "memory", "q0", "q1", "q2", "q3",
"q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
);
......@@ -209,10 +209,10 @@ void I422ToRGBARow_NEON(const uint8* y_buf,
#endif // HAS_I422TORGBAROW_NEON
#ifdef HAS_I422TORGB24ROW_NEON
void I422ToRGB24Row_NEON(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
void I422ToRGB24Row_NEON(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
uint8* dst_rgb24,
int width) {
asm volatile (
"vld1.u8 {d24}, [%5] \n"
......@@ -227,13 +227,13 @@ void I422ToRGB24Row_NEON(const uint8* y_buf,
"subs %4, %4, #8 \n"
"vst3.8 {d20, d21, d22}, [%3]! \n"
"bgt 1b \n"
: "+r"(y_buf), // %0
"+r"(u_buf), // %1
"+r"(v_buf), // %2
"+r"(rgb_buf), // %3
"+r"(width) // %4
: "r"(&kUVToRB), // %5
"r"(&kUVToG) // %6
: "+r"(src_y), // %0
"+r"(src_u), // %1
"+r"(src_v), // %2
"+r"(dst_rgb24), // %3
"+r"(width) // %4
: "r"(&kUVToRB), // %5
"r"(&kUVToG) // %6
: "cc", "memory", "q0", "q1", "q2", "q3",
"q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
);
......@@ -241,10 +241,10 @@ void I422ToRGB24Row_NEON(const uint8* y_buf,
#endif // HAS_I422TORGB24ROW_NEON
#ifdef HAS_I422TORAWROW_NEON
void I422ToRAWRow_NEON(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
void I422ToRAWRow_NEON(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
uint8* dst_raw,
int width) {
asm volatile (
"vld1.u8 {d24}, [%5] \n"
......@@ -260,10 +260,10 @@ void I422ToRAWRow_NEON(const uint8* y_buf,
"vswp.u8 d20, d22 \n"
"vst3.8 {d20, d21, d22}, [%3]! \n"
"bgt 1b \n"
: "+r"(y_buf), // %0
"+r"(u_buf), // %1
"+r"(v_buf), // %2
"+r"(rgb_buf), // %3
: "+r"(src_y), // %0
"+r"(src_u), // %1
"+r"(src_v), // %2
"+r"(dst_raw), // %3
"+r"(width) // %4
: "r"(&kUVToRB), // %5
"r"(&kUVToG) // %6
......@@ -274,9 +274,9 @@ void I422ToRAWRow_NEON(const uint8* y_buf,
#endif // HAS_I422TORAWROW_NEON
#ifdef HAS_NV12TOARGBROW_NEON
void NV12ToARGBRow_NEON(const uint8* y_buf,
const uint8* uv_buf,
uint8* rgb_buf,
void NV12ToARGBRow_NEON(const uint8* src_y,
const uint8* src_uv,
uint8* dst_argb,
int width) {
asm volatile (
"vld1.u8 {d24}, [%4] \n"
......@@ -292,12 +292,12 @@ void NV12ToARGBRow_NEON(const uint8* y_buf,
"vmov.u8 d23, #255 \n"
"vst4.8 {d20, d21, d22, d23}, [%2]! \n"
"bgt 1b \n"
: "+r"(y_buf), // %0
"+r"(uv_buf), // %1
"+r"(rgb_buf), // %2
"+r"(width) // %3
: "r"(&kUVToRB), // %4
"r"(&kUVToG) // %5
: "+r"(src_y), // %0
"+r"(src_uv), // %1
"+r"(dst_argb), // %2
"+r"(width) // %3
: "r"(&kUVToRB), // %4
"r"(&kUVToG) // %5
: "cc", "memory", "q0", "q1", "q2", "q3",
"q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
);
......@@ -305,9 +305,9 @@ void NV12ToARGBRow_NEON(const uint8* y_buf,
#endif // HAS_NV12TOARGBROW_NEON
#ifdef HAS_NV21TOARGBROW_NEON
void NV21ToARGBRow_NEON(const uint8* y_buf,
const uint8* uv_buf,
uint8* rgb_buf,
void NV21ToARGBRow_NEON(const uint8* src_y,
const uint8* src_uv,
uint8* dst_argb,
int width) {
asm volatile (
"vld1.u8 {d24}, [%4] \n"
......@@ -323,12 +323,12 @@ void NV21ToARGBRow_NEON(const uint8* y_buf,
"vmov.u8 d23, #255 \n"
"vst4.8 {d20, d21, d22, d23}, [%2]! \n"
"bgt 1b \n"
: "+r"(y_buf), // %0
"+r"(uv_buf), // %1
"+r"(rgb_buf), // %2
"+r"(width) // %3
: "r"(&kUVToRB), // %4
"r"(&kUVToG) // %5
: "+r"(src_y), // %0
"+r"(src_uv), // %1
"+r"(dst_argb), // %2
"+r"(width) // %3
: "r"(&kUVToRB), // %4
"r"(&kUVToG) // %5
: "cc", "memory", "q0", "q1", "q2", "q3",
"q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
);
......@@ -862,6 +862,52 @@ void ARGBToBayerRow_NEON(const uint8* src_argb,
);
}
// ARM NEON: pack one row of planar I422 into YUY2. vld2.8 de-interleaves
// 16 Y bytes into even Ys (d0) and odd Ys (d2); vst4.8 then interleaves
// d0..d3 as Y0 U0 Y1 V0 ..., i.e. 32 output bytes = 16 pixels per pass.
// Call sites guard with IS_ALIGNED(width, 16), so no remainder handling.
// Operands: %0=src_y, %1=src_u, %2=src_v, %3=dst_yuy2, %4=width.
void I422ToYUY2Row_NEON(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
uint8* dst_yuy2, int width) {
asm volatile (
".p2align 2 \n"
"1: \n"
"vld2.8 {d0, d2}, [%0]! \n" // load 16 Ys
"vld1.8 {d1}, [%1]! \n" // load 8 Us
"vld1.8 {d3}, [%2]! \n" // load 8 Vs
"subs %4, %4, #16 \n" // 16 pixels
"vst4.u8 {d0, d1, d2, d3}, [%3]! \n" // Store 8 YUY2/16 pixels.
"bgt 1b \n"
: "+r"(src_y), // %0
"+r"(src_u), // %1
"+r"(src_v), // %2
"+r"(dst_yuy2), // %3
"+r"(width) // %4
:
: "cc", "memory", "d0", "d1", "d2", "d3"
);
}
// ARM NEON: pack one row of planar I422 into UYVY. Same scheme as the
// YUY2 variant but with U/V loaded into d0/d2 and Ys into d1/d3, so
// vst4.8 emits U0 Y0 V0 Y1 ... (32 output bytes = 16 pixels per pass).
// Call sites guard with IS_ALIGNED(width, 16), so no remainder handling.
// Operands: %0=src_y, %1=src_u, %2=src_v, %3=dst_uyvy, %4=width.
void I422ToUYVYRow_NEON(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
uint8* dst_uyvy, int width) {
asm volatile (
".p2align 2 \n"
"1: \n"
"vld2.8 {d1, d3}, [%0]! \n" // load 16 Ys
"vld1.8 {d0}, [%1]! \n" // load 8 Us
"vld1.8 {d2}, [%2]! \n" // load 8 Vs
"subs %4, %4, #16 \n" // 16 pixels
"vst4.u8 {d0, d1, d2, d3}, [%3]! \n" // Store 8 UYVY/16 pixels.
"bgt 1b \n"
: "+r"(src_y), // %0
"+r"(src_u), // %1
"+r"(src_v), // %2
"+r"(dst_uyvy), // %3
"+r"(width) // %4
:
: "cc", "memory", "d0", "d1", "d2", "d3"
);
}
#endif // __ARM_NEON__
#ifdef __cplusplus
......
......@@ -4109,6 +4109,77 @@ void ARGBToBayerRow_SSSE3(const uint8* src_argb, uint8* dst_bayer,
#endif
);
}
// GCC/clang inline asm (x86/x86-64) SSE2: pack one row of I422 into
// packed YUY2 (Y0 U0 Y1 V0). Operands: %0=src_y, %1=src_u, %2=src_v,
// %3=dst_frame, %4=width. Each iteration consumes 16 Y, 8 U, 8 V and
// stores 32 bytes. Call sites guarantee width % 16 == 0 and 16-byte
// alignment of src_y/dst_frame (movdqa is used for those accesses).
void I422ToYUY2Row_SSE2(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
uint8* dst_frame, int width) {
asm volatile (
"sub %1,%2 \n" // %2 = src_v - src_u; V loads use (%1,%2,1)
".p2align 4 \n"
"1: \n"
"movq (%1),%%xmm2 \n" // 8 U bytes
"movq (%1,%2,1),%%xmm3 \n" // 8 V bytes
"lea 0x8(%1),%1 \n"
"punpcklbw %%xmm3,%%xmm2 \n" // interleave to UVUV...
"movdqa (%0),%%xmm0 \n" // 16 Y bytes
"lea 0x10(%0),%0 \n"
"movdqa %%xmm0,%%xmm1 \n"
"punpcklbw %%xmm2,%%xmm0 \n" // low 8 pixels -> YUYV
"punpckhbw %%xmm2,%%xmm1 \n" // high 8 pixels -> YUYV
"movdqa %%xmm0,(%3) \n"
"movdqa %%xmm1,0x10(%3) \n"
"lea 0x20(%3),%3 \n"
"sub $0x10,%4 \n" // 16 pixels done
"jg 1b \n"
: "+r"(src_y), // %0
"+r"(src_u), // %1
"+r"(src_v), // %2
"+r"(dst_frame), // %3
"+rm"(width) // %4
:
: "memory", "cc"
#if defined(__SSE2__)
, "xmm0", "xmm1", "xmm2", "xmm3"
#endif
);
}
// GCC/clang inline asm (x86/x86-64) SSE2: pack one row of I422 into
// packed UYVY (U0 Y0 V0 Y1). Operands: %0=src_y, %1=src_u, %2=src_v,
// %3=dst_frame, %4=width. Each iteration consumes 16 Y, 8 U, 8 V and
// stores 32 bytes. Call sites guarantee width % 16 == 0 and 16-byte
// alignment of src_y/dst_frame (movdqa is used for those accesses).
void I422ToUYVYRow_SSE2(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
uint8* dst_frame, int width) {
asm volatile (
"sub %1,%2 \n" // %2 = src_v - src_u; V loads use (%1,%2,1)
".p2align 4 \n"
"1: \n"
"movq (%1),%%xmm2 \n" // 8 U bytes
"movq (%1,%2,1),%%xmm3 \n" // 8 V bytes
"lea 0x8(%1),%1 \n"
"punpcklbw %%xmm3,%%xmm2 \n" // interleave to UVUV...
"movdqa (%0),%%xmm0 \n" // 16 Y bytes
"movdqa %%xmm2,%%xmm1 \n"
"lea 0x10(%0),%0 \n"
"punpcklbw %%xmm0,%%xmm1 \n" // low 8 pixels -> UYVY
"punpckhbw %%xmm0,%%xmm2 \n" // high 8 pixels -> UYVY
"movdqa %%xmm1,(%3) \n"
"movdqa %%xmm2,0x10(%3) \n"
"lea 0x20(%3),%3 \n"
"sub $0x10,%4 \n" // 16 pixels done
"jg 1b \n"
: "+r"(src_y), // %0
"+r"(src_u), // %1
"+r"(src_v), // %2
"+r"(dst_frame), // %3
"+rm"(width) // %4
:
: "memory", "cc"
#if defined(__SSE2__)
, "xmm0", "xmm1", "xmm2", "xmm3"
#endif
);
}
#endif // defined(__x86_64__) || defined(__i386__)
#ifdef __cplusplus
......
......@@ -4239,6 +4239,87 @@ void ARGBToBayerRow_SSSE3(const uint8* src_argb,
}
}
// YUY2 - Macro-pixel = 2 image pixels
// Y0U0Y1V0....Y2U2Y3V2...Y4U4Y5V4....
// UYVY - Macro-pixel = 2 image pixels
// U0Y0V0Y1
// Visual C (x86) SSE2: pack one row of planar I422 (Y, U, V) into
// packed YUY2 (macro-pixel = Y0 U0 Y1 V0 = 2 image pixels).
// Per loop iteration: reads 16 Y, 8 U, 8 V; writes 32 bytes (16 pixels).
// Call sites guard with IS_ALIGNED(width, 16) and 16-byte alignment of
// src_y/dst_frame, which movdqa requires.
__declspec(naked) __declspec(align(16))
void I422ToYUY2Row_SSE2(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
uint8* dst_frame, int width) {
__asm {
push esi
push edi
mov eax, [esp + 8 + 4] // src_y
mov esi, [esp + 8 + 8] // src_u
mov edx, [esp + 8 + 12] // src_v
mov edi, [esp + 8 + 16] // dst_frame
mov ecx, [esp + 8 + 20] // width
sub edx, esi // edx = src_v - src_u; V is read at [esi + edx]
align 16
convertloop:
movq xmm2, qword ptr [esi] // U
movq xmm3, qword ptr [esi + edx] // V
lea esi, [esi + 8]
punpcklbw xmm2, xmm3 // UV
movdqa xmm0, [eax] // Y
lea eax, [eax + 16]
movdqa xmm1, xmm0
punpcklbw xmm0, xmm2 // YUYV
punpckhbw xmm1, xmm2
movdqa [edi], xmm0
movdqa [edi + 16], xmm1
lea edi, [edi + 32]
sub ecx, 16 // 16 pixels consumed per pass
jg convertloop
pop edi
pop esi
ret
}
}
// Visual C (x86) SSE2: pack one row of planar I422 (Y, U, V) into
// packed UYVY (macro-pixel = U0 Y0 V0 Y1 = 2 image pixels).
// Per loop iteration: reads 16 Y, 8 U, 8 V; writes 32 bytes (16 pixels).
// Call sites guard with IS_ALIGNED(width, 16) and 16-byte alignment of
// src_y/dst_frame, which movdqa requires.
__declspec(naked) __declspec(align(16))
void I422ToUYVYRow_SSE2(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
uint8* dst_frame, int width) {
__asm {
push esi
push edi
mov eax, [esp + 8 + 4] // src_y
mov esi, [esp + 8 + 8] // src_u
mov edx, [esp + 8 + 12] // src_v
mov edi, [esp + 8 + 16] // dst_frame
mov ecx, [esp + 8 + 20] // width
sub edx, esi // edx = src_v - src_u; V is read at [esi + edx]
align 16
convertloop:
movq xmm2, qword ptr [esi] // U
movq xmm3, qword ptr [esi + edx] // V
lea esi, [esi + 8]
punpcklbw xmm2, xmm3 // UV
movdqa xmm0, [eax] // Y
movdqa xmm1, xmm2
lea eax, [eax + 16]
punpcklbw xmm1, xmm0 // UYVY
punpckhbw xmm2, xmm0
movdqa [edi], xmm1
movdqa [edi + 16], xmm2
lea edi, [edi + 32]
sub ecx, 16 // 16 pixels consumed per pass
jg convertloop
pop edi
pop esi
ret
}
}
#endif // _M_IX86
#ifdef __cplusplus
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment