Commit 1e16cb5c authored by Frank Barchard

SplitRGBPlane and MergeRGBPlane functions added

Converts packed RGB to planar and back.

TBR=kjellander@chromium.org
BUG=libyuv:728
TEST=MergeRGBPlane_Opt and SplitRGBPlane_Opt unittests added

Change-Id: Ida59af940afcb1fc4a48bbf62c714f592665c3cc
Reviewed-on: https://chromium-review.googlesource.com/658069
Reviewed-by: Frank Barchard <fbarchard@google.com>
Reviewed-by: Cheng Wang <wangcheng@google.com>
parent 367c0d8f
Name: libyuv
URL: http://code.google.com/p/libyuv/
-Version: 1667
+Version: 1668
License: BSD
License File: LICENSE
@@ -69,6 +69,32 @@ void MergeUVPlane(const uint8* src_u,
int width,
int height);
// Split interleaved RGB plane into separate R, G and B planes.
LIBYUV_API
void SplitRGBPlane(const uint8* src_rgb,
int src_stride_rgb,
uint8* dst_r,
int dst_stride_r,
uint8* dst_g,
int dst_stride_g,
uint8* dst_b,
int dst_stride_b,
int width,
int height);
// Merge separate R, G and B planes into one interleaved RGB plane.
LIBYUV_API
void MergeRGBPlane(const uint8* src_r,
int src_stride_r,
const uint8* src_g,
int src_stride_g,
const uint8* src_b,
int src_stride_b,
uint8* dst_rgb,
int dst_stride_rgb,
int width,
int height);
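For orientation, here is a minimal round-trip sketch of the two new entry points. The buffer handling and the function name RoundTripRGB24 are illustrative, not part of the commit; uint8 is libyuv's byte typedef.

#include <stdlib.h>  // malloc/free for the illustrative buffers

// Split a tightly packed RGB24 image into planes, then re-interleave it.
void RoundTripRGB24(const uint8* rgb, int width, int height) {
  uint8* r = (uint8*)malloc(width * height);
  uint8* g = (uint8*)malloc(width * height);
  uint8* b = (uint8*)malloc(width * height);
  uint8* out = (uint8*)malloc(width * height * 3);
  SplitRGBPlane(rgb, width * 3, r, width, g, width, b, width, width, height);
  MergeRGBPlane(r, width, g, width, b, width, out, width * 3, width, height);
  // out should now match rgb byte-for-byte.
  free(r);
  free(g);
  free(b);
  free(out);
}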
// Copy I400. Supports inverting.
LIBYUV_API
int I400ToI400(const uint8* src_y,
@@ -271,6 +271,14 @@ extern "C" {
#define HAS_I422TOARGBROW_SSSE3
#endif
// The following are available for gcc/clang x86 platforms:
// TODO(fbarchard): Port to Visual C
#if !defined(LIBYUV_DISABLE_X86) && \
(defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER)))
#define HAS_MERGERGBROW_SSSE3
#define HAS_SPLITRGBROW_SSSE3
#endif
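These defines are only the compile-time half of the gate: the plane-level functions added in this change still check CPU features at run time before installing a SIMD row function. A condensed sketch of that two-level dispatch, taken from the SplitRGBPlane body later in this commit:

#if defined(HAS_SPLITRGBROW_SSSE3)        // compile-time: toolchain/arch
  if (TestCpuFlag(kCpuHasSSSE3)) {        // run-time: CPU reports SSSE3
    SplitRGBRow = SplitRGBRow_Any_SSSE3;  // safe for any width
    if (IS_ALIGNED(width, 16)) {
      SplitRGBRow = SplitRGBRow_SSSE3;    // fast path, width a multiple of 16
    }
  }
#endif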
// The following are available on Neon platforms:
#if !defined(LIBYUV_DISABLE_NEON) && \
(defined(__aarch64__) || defined(__ARM_NEON__) || defined(LIBYUV_NEON))
@@ -330,6 +338,7 @@ extern "C" {
#define HAS_RGBATOUVROW_NEON
#define HAS_RGBATOYROW_NEON
#define HAS_SETROW_NEON
#define HAS_SPLITRGBROW_NEON
#define HAS_SPLITUVROW_NEON
#define HAS_UYVYTOARGBROW_NEON
#define HAS_UYVYTOUV422ROW_NEON
@@ -1462,6 +1471,58 @@ void MergeUVRow_Any_MSA(const uint8* src_u,
uint8* dst_uv,
int width);
void SplitRGBRow_C(const uint8* src_rgb,
uint8* dst_r,
uint8* dst_g,
uint8* dst_b,
int width);
void SplitRGBRow_SSSE3(const uint8* src_rgb,
uint8* dst_r,
uint8* dst_g,
uint8* dst_b,
int width);
void SplitRGBRow_NEON(const uint8* src_rgb,
uint8* dst_r,
uint8* dst_g,
uint8* dst_b,
int width);
void SplitRGBRow_Any_SSSE3(const uint8* src_rgb,
uint8* dst_r,
uint8* dst_g,
uint8* dst_b,
int width);
void SplitRGBRow_Any_NEON(const uint8* src_rgb,
uint8* dst_r,
uint8* dst_g,
uint8* dst_b,
int width);
void MergeRGBRow_C(const uint8* src_r,
const uint8* src_g,
const uint8* src_b,
uint8* dst_rgb,
int width);
void MergeRGBRow_SSSE3(const uint8* src_r,
const uint8* src_g,
const uint8* src_b,
uint8* dst_rgb,
int width);
void MergeRGBRow_NEON(const uint8* src_r,
const uint8* src_g,
const uint8* src_b,
uint8* dst_rgb,
int width);
void MergeRGBRow_Any_SSSE3(const uint8* src_r,
const uint8* src_g,
const uint8* src_b,
uint8* dst_rgb,
int width);
void MergeRGBRow_Any_NEON(const uint8* src_r,
const uint8* src_g,
const uint8* src_b,
uint8* dst_rgb,
int width);
void CopyRow_SSE2(const uint8* src, uint8* dst, int count);
void CopyRow_AVX(const uint8* src, uint8* dst, int count);
void CopyRow_ERMS(const uint8* src, uint8* dst, int count);
@@ -11,6 +11,6 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_
#define INCLUDE_LIBYUV_VERSION_H_
-#define LIBYUV_VERSION 1667
+#define LIBYUV_VERSION 1668
#endif // INCLUDE_LIBYUV_VERSION_H_
@@ -407,6 +407,122 @@ void MergeUVPlane(const uint8* src_u,
}
}
// Split an interleaved RGB plane into separate R, G and B planes.
// Width and height are in pixels of the source RGB plane.
LIBYUV_API
void SplitRGBPlane(const uint8* src_rgb,
int src_stride_rgb,
uint8* dst_r,
int dst_stride_r,
uint8* dst_g,
int dst_stride_g,
uint8* dst_b,
int dst_stride_b,
int width,
int height) {
int y;
void (*SplitRGBRow)(const uint8* src_rgb, uint8* dst_r, uint8* dst_g,
uint8* dst_b, int width) = SplitRGBRow_C;
// Negative height means invert the image.
if (height < 0) {
height = -height;
dst_r = dst_r + (height - 1) * dst_stride_r;
dst_g = dst_g + (height - 1) * dst_stride_g;
dst_b = dst_b + (height - 1) * dst_stride_b;
dst_stride_r = -dst_stride_r;
dst_stride_g = -dst_stride_g;
dst_stride_b = -dst_stride_b;
}
// Coalesce rows.
if (src_stride_rgb == width * 3 && dst_stride_r == width &&
dst_stride_g == width && dst_stride_b == width) {
width *= height;
height = 1;
src_stride_rgb = dst_stride_r = dst_stride_g = dst_stride_b = 0;
}
#if defined(HAS_SPLITRGBROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
SplitRGBRow = SplitRGBRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
SplitRGBRow = SplitRGBRow_SSSE3;
}
}
#endif
#if defined(HAS_SPLITRGBROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
SplitRGBRow = SplitRGBRow_Any_NEON;
if (IS_ALIGNED(width, 16)) {
SplitRGBRow = SplitRGBRow_NEON;
}
}
#endif
for (y = 0; y < height; ++y) {
// Split a row of RGB into rows of R, G and B.
SplitRGBRow(src_rgb, dst_r, dst_g, dst_b, width);
dst_r += dst_stride_r;
dst_g += dst_stride_g;
dst_b += dst_stride_b;
src_rgb += src_stride_rgb;
}
}
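A quick worked example of the coalesce step above, with illustrative numbers: a tightly packed 64x4 image has src_stride_rgb equal to 64 * 3 and every destination stride equal to 64, so the rows collapse into one long row.

// width = 64, height = 4, src_stride_rgb = 192, all dst strides = 64.
// Every stride matches its row width, so the buffers are contiguous:
//   width *= height;   // width becomes 256
//   height = 1;        // the y loop runs once
//   src_stride_rgb = dst_stride_r = dst_stride_g = dst_stride_b = 0;
// SplitRGBRow is then called once on 256 pixels instead of 4 times on 64.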
LIBYUV_API
void MergeRGBPlane(const uint8* src_r,
int src_stride_r,
const uint8* src_g,
int src_stride_g,
const uint8* src_b,
int src_stride_b,
uint8* dst_rgb,
int dst_stride_rgb,
int width,
int height) {
int y;
void (*MergeRGBRow)(const uint8* src_r, const uint8* src_g,
const uint8* src_b, uint8* dst_rgb, int width) =
MergeRGBRow_C;
// Negative height means invert the image.
if (height < 0) {
height = -height;
dst_rgb = dst_rgb + (height - 1) * dst_stride_rgb;
dst_stride_rgb = -dst_stride_rgb;
}
// Coalesce rows.
if (src_stride_r == width && src_stride_g == width && src_stride_b == width &&
dst_stride_rgb == width * 3) {
width *= height;
height = 1;
src_stride_r = src_stride_g = src_stride_b = dst_stride_rgb = 0;
}
#if defined(HAS_MERGERGBROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
MergeRGBRow = MergeRGBRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
MergeRGBRow = MergeRGBRow_SSSE3;
}
}
#endif
#if defined(HAS_MERGERGBROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
MergeRGBRow = MergeRGBRow_Any_NEON;
if (IS_ALIGNED(width, 16)) {
MergeRGBRow = MergeRGBRow_NEON;
}
}
#endif
for (y = 0; y < height; ++y) {
// Merge a row of R, G and B into a row of RGB.
MergeRGBRow(src_r, src_g, src_b, dst_rgb, width);
src_r += src_stride_r;
src_g += src_stride_g;
src_b += src_stride_b;
dst_rgb += dst_stride_rgb;
}
}
// Mirror a plane of data.
void MirrorPlane(const uint8* src_y,
int src_stride_y,
@@ -84,6 +84,14 @@ ANY41C(I422AlphaToARGBRow_Any_MSA, I422AlphaToARGBRow_MSA, 1, 0, 4, 7)
memcpy(dst_ptr + (n >> DUVSHIFT) * BPP, temp + 192, \
SS(r, DUVSHIFT) * BPP); \
}
// Merge functions.
#ifdef HAS_MERGERGBROW_SSSE3
ANY31(MergeRGBRow_Any_SSSE3, MergeRGBRow_SSSE3, 0, 0, 3, 15)
#endif
#ifdef HAS_MERGERGBROW_NEON
ANY31(MergeRGBRow_Any_NEON, MergeRGBRow_NEON, 0, 0, 3, 15)
#endif
#ifdef HAS_I422TOYUY2ROW_SSE2
ANY31(I422ToYUY2Row_Any_SSE2, I422ToYUY2Row_SSE2, 1, 1, 4, 15)
ANY31(I422ToUYVYRow_Any_SSE2, I422ToUYVYRow_SSE2, 1, 1, 4, 15)
@@ -943,6 +951,31 @@ ANY12(UYVYToUV422Row_Any_MSA, UYVYToUV422Row_MSA, 1, 4, 1, 31)
#endif
#undef ANY12
// Any 1 to 3. Outputs RGB planes.
#define ANY13(NAMEANY, ANY_SIMD, BPP, MASK) \
void NAMEANY(const uint8* src_ptr, uint8* dst_r, uint8* dst_g, uint8* dst_b, \
int width) { \
SIMD_ALIGNED(uint8 temp[16 * 6]); \
memset(temp, 0, 16 * 3); /* for msan */ \
int r = width & MASK; \
int n = width & ~MASK; \
if (n > 0) { \
ANY_SIMD(src_ptr, dst_r, dst_g, dst_b, n); \
} \
memcpy(temp, src_ptr + n * BPP, r * BPP); \
ANY_SIMD(temp, temp + 16 * 3, temp + 16 * 4, temp + 16 * 5, MASK + 1); \
memcpy(dst_r + n, temp + 16 * 3, r); \
memcpy(dst_g + n, temp + 16 * 4, r); \
memcpy(dst_b + n, temp + 16 * 5, r); \
}
#ifdef HAS_SPLITRGBROW_SSSE3
ANY13(SplitRGBRow_Any_SSSE3, SplitRGBRow_SSSE3, 3, 15)
#endif
#ifdef HAS_SPLITRGBROW_NEON
ANY13(SplitRGBRow_Any_NEON, SplitRGBRow_NEON, 3, 15)
#endif
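The masking arithmetic in ANY13 is easiest to follow with concrete numbers (illustrative):

// With MASK = 15 (16-pixel SIMD rows) and width = 100:
//   n = width & ~MASK = 96   // pixels handed straight to ANY_SIMD
//   r = width & MASK  = 4    // leftover pixels bounced through temp
// The tail call still runs ANY_SIMD on MASK + 1 = 16 pixels of the zeroed,
// aligned temp buffer, then only the first r = 4 bytes of each output plane
// are copied back, so the SIMD kernel never touches memory past the
// caller's buffers.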
// Any 1 to 2 with source stride (2 rows of source). Outputs UV planes.
// 128 byte row allows for 32 avx ARGB pixels.
#define ANY12S(NAMEANY, ANY_SIMD, UVSHIFT, BPP, MASK) \
@@ -1770,6 +1770,34 @@ void MergeUVRow_C(const uint8* src_u,
}
}
void SplitRGBRow_C(const uint8* src_rgb,
uint8* dst_r,
uint8* dst_g,
uint8* dst_b,
int width) {
int x;
for (x = 0; x < width; ++x) {
dst_r[x] = src_rgb[0];
dst_g[x] = src_rgb[1];
dst_b[x] = src_rgb[2];
src_rgb += 3;
}
}
void MergeRGBRow_C(const uint8* src_r,
const uint8* src_g,
const uint8* src_b,
uint8* dst_rgb,
int width) {
int x;
for (x = 0; x < width; ++x) {
dst_rgb[0] = src_r[x];
dst_rgb[1] = src_g[x];
dst_rgb[2] = src_b[x];
dst_rgb += 3;
}
}
void CopyRow_C(const uint8* src, uint8* dst, int count) {
memcpy(dst, src, count);
}
@@ -38,9 +38,8 @@ static vec8 kARGBToU = {112, -74, -38, 0, 112, -74, -38, 0,
static vec8 kARGBToUJ = {127, -84, -43, 0, 127, -84, -43, 0,
127, -84, -43, 0, 127, -84, -43, 0};
-static vec8 kARGBToV = {
-    -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0,
-};
+static vec8 kARGBToV = {-18, -94, 112, 0, -18, -94, 112, 0,
+                        -18, -94, 112, 0, -18, -94, 112, 0};
static vec8 kARGBToVJ = {-20, -107, 127, 0, -20, -107, 127, 0,
-20, -107, 127, 0, -20, -107, 127, 0};
@@ -2754,6 +2753,199 @@ void MergeUVRow_SSE2(const uint8* src_u,
}
#endif // HAS_MERGEUVROW_SSE2
#ifdef HAS_SPLITRGBROW_SSSE3
// Shuffle table for converting RGB to Planar.
static uvec8 kShuffleMaskRGBToR0 = {0u, 3u, 6u, 9u, 12u, 15u,
128u, 128u, 128u, 128u, 128u, 128u,
128u, 128u, 128u, 128u};
static uvec8 kShuffleMaskRGBToR1 = {128u, 128u, 128u, 128u, 128u, 128u,
2u, 5u, 8u, 11u, 14u, 128u,
128u, 128u, 128u, 128u};
static uvec8 kShuffleMaskRGBToR2 = {128u, 128u, 128u, 128u, 128u, 128u,
128u, 128u, 128u, 128u, 128u, 1u,
4u, 7u, 10u, 13u};
static uvec8 kShuffleMaskRGBToG0 = {1u, 4u, 7u, 10u, 13u, 128u,
128u, 128u, 128u, 128u, 128u, 128u,
128u, 128u, 128u, 128u};
static uvec8 kShuffleMaskRGBToG1 = {128u, 128u, 128u, 128u, 128u, 0u,
3u, 6u, 9u, 12u, 15u, 128u,
128u, 128u, 128u, 128u};
static uvec8 kShuffleMaskRGBToG2 = {128u, 128u, 128u, 128u, 128u, 128u,
128u, 128u, 128u, 128u, 128u, 2u,
5u, 8u, 11u, 14u};
static uvec8 kShuffleMaskRGBToB0 = {2u, 5u, 8u, 11u, 14u, 128u,
128u, 128u, 128u, 128u, 128u, 128u,
128u, 128u, 128u, 128u};
static uvec8 kShuffleMaskRGBToB1 = {128u, 128u, 128u, 128u, 128u, 1u,
4u, 7u, 10u, 13u, 128u, 128u,
128u, 128u, 128u, 128u};
static uvec8 kShuffleMaskRGBToB2 = {128u, 128u, 128u, 128u, 128u, 128u,
128u, 128u, 128u, 128u, 0u, 3u,
6u, 9u, 12u, 15u};
void SplitRGBRow_SSSE3(const uint8* src_rgb,
uint8* dst_r,
uint8* dst_g,
uint8* dst_b,
int width) {
asm volatile (
LABELALIGN
"1: \n"
"movdqu " MEMACCESS(0) ",%%xmm0 \n"
"movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
"movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
"pshufb %5, %%xmm0 \n"
"pshufb %6, %%xmm1 \n"
"pshufb %7, %%xmm2 \n"
"por %%xmm1,%%xmm0 \n"
"por %%xmm2,%%xmm0 \n"
"movdqu %%xmm0," MEMACCESS(1) " \n"
"lea " MEMLEA(0x10,1) ",%1 \n"
"movdqu " MEMACCESS(0) ",%%xmm0 \n"
"movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
"movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
"pshufb %8, %%xmm0 \n"
"pshufb %9, %%xmm1 \n"
"pshufb %10, %%xmm2 \n"
"por %%xmm1,%%xmm0 \n"
"por %%xmm2,%%xmm0 \n"
"movdqu %%xmm0," MEMACCESS(2) " \n"
"lea " MEMLEA(0x10,2) ",%2 \n"
"movdqu " MEMACCESS(0) ",%%xmm0 \n"
"movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
"movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
"pshufb %11, %%xmm0 \n"
"pshufb %12, %%xmm1 \n"
"pshufb %13, %%xmm2 \n"
"por %%xmm1,%%xmm0 \n"
"por %%xmm2,%%xmm0 \n"
"movdqu %%xmm0," MEMACCESS(3) " \n"
"lea " MEMLEA(0x10,3) ",%3 \n"
"lea " MEMLEA(0x30,0) ",%0 \n"
"sub $0x10,%4 \n"
"jg 1b \n"
: "+r"(src_rgb), // %0
"+r"(dst_r), // %1
"+r"(dst_g), // %2
"+r"(dst_b), // %3
"+r"(width) // %4
: "m"(kShuffleMaskRGBToR0), // %5
"m"(kShuffleMaskRGBToR1), // %6
"m"(kShuffleMaskRGBToR2), // %7
"m"(kShuffleMaskRGBToG0), // %8
"m"(kShuffleMaskRGBToG1), // %9
"m"(kShuffleMaskRGBToG2), // %10
"m"(kShuffleMaskRGBToB0), // %11
"m"(kShuffleMaskRGBToB1), // %12
"m"(kShuffleMaskRGBToB2) // %13
: "memory", "cc", NACL_R14
"xmm0", "xmm1", "xmm2"
);
}
#endif // HAS_SPLITRGBROW_SSSE3
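The tables above lean on the pshufb convention that an index byte with its high bit set (128 here) produces zero in that lane; each output plane is then three shuffles OR'd together. A scalar model of one 16-byte R-plane round, under that assumption:

// Scalar model of the pshufb/por sequence for the R plane (illustrative).
static void PshufbModel(const uint8* in, const uint8* mask, uint8* out) {
  for (int i = 0; i < 16; ++i) {
    out[i] = (mask[i] & 0x80) ? 0 : in[mask[i]];  // 128u lanes become 0
  }
}
// One R-plane round ORs three shuffles of the 48 input bytes:
//   r0 = shuffle(src_rgb +  0, kShuffleMaskRGBToR0)  -> R of pixels 0-5
//   r1 = shuffle(src_rgb + 16, kShuffleMaskRGBToR1)  -> R of pixels 6-10
//   r2 = shuffle(src_rgb + 32, kShuffleMaskRGBToR2)  -> R of pixels 11-15
//   dst_r[0..15] = r0 | r1 | r2 bytewise, i.e. the two por instructions.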
#ifdef HAS_MERGERGBROW_SSSE3
// Shuffle table for converting Planar to RGB.
static uvec8 kShuffleMaskRToRGB0 = {0u, 128u, 128u, 1u, 128u, 128u,
2u, 128u, 128u, 3u, 128u, 128u,
4u, 128u, 128u, 5u};
static uvec8 kShuffleMaskGToRGB0 = {128u, 0u, 128u, 128u, 1u, 128u,
128u, 2u, 128u, 128u, 3u, 128u,
128u, 4u, 128u, 128u};
static uvec8 kShuffleMaskBToRGB0 = {128u, 128u, 0u, 128u, 128u, 1u,
128u, 128u, 2u, 128u, 128u, 3u,
128u, 128u, 4u, 128u};
static uvec8 kShuffleMaskGToRGB1 = {5u, 128u, 128u, 6u, 128u, 128u,
7u, 128u, 128u, 8u, 128u, 128u,
9u, 128u, 128u, 10u};
static uvec8 kShuffleMaskBToRGB1 = {128u, 5u, 128u, 128u, 6u, 128u,
128u, 7u, 128u, 128u, 8u, 128u,
128u, 9u, 128u, 128u};
static uvec8 kShuffleMaskRToRGB1 = {128u, 128u, 6u, 128u, 128u, 7u,
128u, 128u, 8u, 128u, 128u, 9u,
128u, 128u, 10u, 128u};
static uvec8 kShuffleMaskBToRGB2 = {10u, 128u, 128u, 11u, 128u, 128u,
12u, 128u, 128u, 13u, 128u, 128u,
14u, 128u, 128u, 15u};
static uvec8 kShuffleMaskRToRGB2 = {128u, 11u, 128u, 128u, 12u, 128u,
128u, 13u, 128u, 128u, 14u, 128u,
128u, 15u, 128u, 128u};
static uvec8 kShuffleMaskGToRGB2 = {128u, 128u, 11u, 128u, 128u, 12u,
128u, 128u, 13u, 128u, 128u, 14u,
128u, 128u, 15u, 128u};
void MergeRGBRow_SSSE3(const uint8* src_r,
const uint8* src_g,
const uint8* src_b,
uint8* dst_rgb,
int width) {
asm volatile (
LABELALIGN
"1: \n"
"movdqu " MEMACCESS(0) ",%%xmm0 \n"
"movdqu " MEMACCESS(1) ",%%xmm1 \n"
"movdqu " MEMACCESS(2) ",%%xmm2 \n"
"pshufb %5, %%xmm0 \n"
"pshufb %6, %%xmm1 \n"
"pshufb %7, %%xmm2 \n"
"por %%xmm1,%%xmm0 \n"
"por %%xmm2,%%xmm0 \n"
"movdqu %%xmm0," MEMACCESS(3) " \n"
"movdqu " MEMACCESS(0) ",%%xmm0 \n"
"movdqu " MEMACCESS(1) ",%%xmm1 \n"
"movdqu " MEMACCESS(2) ",%%xmm2 \n"
"pshufb %8, %%xmm0 \n"
"pshufb %9, %%xmm1 \n"
"pshufb %10, %%xmm2 \n"
"por %%xmm1,%%xmm0 \n"
"por %%xmm2,%%xmm0 \n"
"movdqu %%xmm0," MEMACCESS2(16, 3) " \n"
"movdqu " MEMACCESS(0) ",%%xmm0 \n"
"movdqu " MEMACCESS(1) ",%%xmm1 \n"
"movdqu " MEMACCESS(2) ",%%xmm2 \n"
"pshufb %11, %%xmm0 \n"
"pshufb %12, %%xmm1 \n"
"pshufb %13, %%xmm2 \n"
"por %%xmm1,%%xmm0 \n"
"por %%xmm2,%%xmm0 \n"
"movdqu %%xmm0," MEMACCESS2(32, 3) " \n"
"lea " MEMLEA(0x10,0) ",%0 \n"
"lea " MEMLEA(0x10,1) ",%1 \n"
"lea " MEMLEA(0x10,2) ",%2 \n"
"lea " MEMLEA(0x30,3) ",%3 \n"
"sub $0x10,%4 \n"
"jg 1b \n"
: "+r"(src_r), // %0
"+r"(src_g), // %1
"+r"(src_b), // %2
"+r"(dst_rgb), // %3
"+r"(width) // %4
: "m"(kShuffleMaskRToRGB0), // %5
"m"(kShuffleMaskGToRGB0), // %6
"m"(kShuffleMaskBToRGB0), // %7
"m"(kShuffleMaskRToRGB1), // %8
"m"(kShuffleMaskGToRGB1), // %9
"m"(kShuffleMaskBToRGB1), // %10
"m"(kShuffleMaskRToRGB2), // %11
"m"(kShuffleMaskGToRGB2), // %12
"m"(kShuffleMaskBToRGB2) // %13
: "memory", "cc", NACL_R14
"xmm0", "xmm1", "xmm2"
);
}
#endif // HAS_MERGERGBROW_SSSE3
#ifdef HAS_COPYROW_SSE2
void CopyRow_SSE2(const uint8* src, uint8* dst, int count) {
asm volatile (
@@ -526,7 +526,7 @@ void MergeUVRow_NEON(const uint8* src_u,
"vld1.8 {q0}, [%0]! \n" // load U
"vld1.8 {q1}, [%1]! \n" // load V
"subs %3, %3, #16 \n" // 16 processed per loop
"vst2.u8 {q0, q1}, [%2]! \n" // store 16 pairs of UV
"vst2.8 {q0, q1}, [%2]! \n" // store 16 pairs of UV
"bgt 1b \n"
: "+r"(src_u), // %0
"+r"(src_v), // %1
@@ -537,6 +537,56 @@
);
}
// Reads 16 packed RGB pixels and writes to planar dst_r, dst_g and dst_b.
void SplitRGBRow_NEON(const uint8* src_rgb,
uint8* dst_r,
uint8* dst_g,
uint8* dst_b,
int width) {
asm volatile(
"1: \n"
"vld3.8 {d0, d2, d4}, [%0]! \n" // load 8 RGB
"vld3.8 {d1, d3, d5}, [%0]! \n" // next 8 RGB
"subs %4, %4, #16 \n" // 16 processed per loop
"vst1.8 {q0}, [%1]! \n" // store R
"vst1.8 {q1}, [%2]! \n" // store G
"vst1.8 {q2}, [%3]! \n" // store B
"bgt 1b \n"
: "+r"(src_rgb), // %0
"+r"(dst_r), // %1
"+r"(dst_g), // %2
"+r"(dst_b), // %3
"+r"(width) // %4
: // Input registers
: "cc", "memory", "d0", "d1", "d2" // Clobber List
);
}
// Reads 16 planar R, G and B values and writes 16 packed RGB pixels at a time.
void MergeRGBRow_NEON(const uint8* src_r,
const uint8* src_g,
const uint8* src_b,
uint8* dst_rgb,
int width) {
asm volatile(
"1: \n"
"vld1.8 {q0}, [%0]! \n" // load R
"vld1.8 {q1}, [%1]! \n" // load G
"vld1.8 {q2}, [%2]! \n" // load B
"subs %4, %4, #16 \n" // 16 processed per loop
"vst3.8 {d0, d2, d4}, [%3]! \n" // store 8 RGB
"vst3.8 {d1, d3, d5}, [%3]! \n" // next 8 RGB
"bgt 1b \n"
: "+r"(src_r), // %0
"+r"(src_g), // %1
"+r"(src_b), // %2
"+r"(dst_rgb), // %3
"+r"(width) // %4
: // Input registers
: "cc", "memory", "q0", "q1", "q2" // Clobber List
);
}
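The d-register pairing in these two loops is the non-obvious detail: on AArch32, q0 aliases d0:d1, q1 aliases d2:d3 and q2 aliases d4:d5, so the two vld3.8 loads de-interleave 16 pixels into whole q registers:

// Register layout sketch for SplitRGBRow_NEON (illustrative):
//   vld3.8 {d0, d2, d4}  -> pixels 0-7 : R in d0, G in d2, B in d4
//   vld3.8 {d1, d3, d5}  -> pixels 8-15: R in d1, G in d3, B in d5
// q0 = d0:d1 now holds 16 R bytes, q1 = d2:d3 the 16 G bytes and
// q2 = d4:d5 the 16 B bytes, ready for the three vst1.8 stores;
// MergeRGBRow_NEON runs the same pairing in reverse with vst3.8.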
// Copy multiples of 32. vld4.8 allows unaligned access and is fastest on A15.
void CopyRow_NEON(const uint8* src, uint8* dst, int count) {
asm volatile(
@@ -580,6 +580,54 @@ void MergeUVRow_NEON(const uint8* src_u,
);
}
// Reads 16 packed RGB pixels and writes to planar dst_r, dst_g and dst_b.
void SplitRGBRow_NEON(const uint8* src_rgb,
uint8* dst_r,
uint8* dst_g,
uint8* dst_b,
int width) {
asm volatile(
"1: \n"
"ld3 {v0.16b,v1.16b,v2.16b}, [%0], #48 \n" // load 16 RGB
"subs %w4, %w4, #16 \n" // 16 processed per loop
"st1 {v0.16b}, [%1], #16 \n" // store R
"st1 {v1.16b}, [%2], #16 \n" // store G
"st1 {v2.16b}, [%3], #16 \n" // store B
"b.gt 1b \n"
: "+r"(src_rgb), // %0
"+r"(dst_r), // %1
"+r"(dst_g), // %2
"+r"(dst_b), // %3
"+r"(width) // %4
: // Input registers
: "cc", "memory", "v0", "v1", "v2" // Clobber List
);
}
// Reads 16 planar R, G and B values and writes 16 packed RGB pixels at a time.
void MergeRGBRow_NEON(const uint8* src_r,
const uint8* src_g,
const uint8* src_b,
uint8* dst_rgb,
int width) {
asm volatile(
"1: \n"
"ld1 {v0.16b}, [%0], #16 \n" // load R
"ld1 {v1.16b}, [%1], #16 \n" // load G
"ld1 {v2.16b}, [%2], #16 \n" // load B
"subs %w4, %w4, #16 \n" // 16 processed per loop
"st3 {v0.16b,v1.16b,v2.16b}, [%3], #48 \n" // store 16 RGB
"b.gt 1b \n"
: "+r"(src_r), // %0
"+r"(src_g), // %1
"+r"(src_b), // %2
"+r"(dst_rgb), // %3
"+r"(width) // %4
: // Input registers
: "cc", "memory", "v0", "v1", "v2" // Clobber List
);
}
// Copy multiples of 32. vld4.8 allows unaligned access and is fastest on A15.
void CopyRow_NEON(const uint8* src, uint8* dst, int count) {
asm volatile(
@@ -1054,7 +1054,7 @@ void ScaleRowUp2_16_NEON(const uint16* src_ptr,
"+r"(dst_width) // %3
: "r"(2LL), // %4
"r"(14LL) // %5
: "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18",
: "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18",
"v19" // Clobber List
);
}
@@ -2521,6 +2521,101 @@ TEST_F(LibYUVPlanarTest, SplitUVPlane_Opt) {
free_aligned_buffer_page_end(dst_pixels_c);
}
TEST_F(LibYUVPlanarTest, MergeRGBPlane_Opt) {
const int kPixels = benchmark_width_ * benchmark_height_;
align_buffer_page_end(src_pixels, kPixels * 3);
align_buffer_page_end(tmp_pixels_r, kPixels);
align_buffer_page_end(tmp_pixels_g, kPixels);
align_buffer_page_end(tmp_pixels_b, kPixels);
align_buffer_page_end(dst_pixels_opt, kPixels * 3);
align_buffer_page_end(dst_pixels_c, kPixels * 3);
MemRandomize(src_pixels, kPixels * 3);
MemRandomize(tmp_pixels_r, kPixels);
MemRandomize(tmp_pixels_g, kPixels);
MemRandomize(tmp_pixels_b, kPixels);
MemRandomize(dst_pixels_opt, kPixels * 3);
MemRandomize(dst_pixels_c, kPixels * 3);
MaskCpuFlags(disable_cpu_flags_);
SplitRGBPlane(src_pixels, benchmark_width_ * 3, tmp_pixels_r,
benchmark_width_, tmp_pixels_g, benchmark_width_, tmp_pixels_b,
benchmark_width_, benchmark_width_, benchmark_height_);
MergeRGBPlane(tmp_pixels_r, benchmark_width_, tmp_pixels_g, benchmark_width_,
tmp_pixels_b, benchmark_width_, dst_pixels_c,
benchmark_width_ * 3, benchmark_width_, benchmark_height_);
MaskCpuFlags(benchmark_cpu_info_);
SplitRGBPlane(src_pixels, benchmark_width_ * 3, tmp_pixels_r,
benchmark_width_, tmp_pixels_g, benchmark_width_, tmp_pixels_b,
benchmark_width_, benchmark_width_, benchmark_height_);
for (int i = 0; i < benchmark_iterations_; ++i) {
MergeRGBPlane(tmp_pixels_r, benchmark_width_, tmp_pixels_g,
benchmark_width_, tmp_pixels_b, benchmark_width_,
dst_pixels_opt, benchmark_width_ * 3, benchmark_width_,
benchmark_height_);
}
for (int i = 0; i < kPixels * 3; ++i) {
EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]);
}
free_aligned_buffer_page_end(src_pixels);
free_aligned_buffer_page_end(tmp_pixels_r);
free_aligned_buffer_page_end(tmp_pixels_g);
free_aligned_buffer_page_end(tmp_pixels_b);
free_aligned_buffer_page_end(dst_pixels_opt);
free_aligned_buffer_page_end(dst_pixels_c);
}
TEST_F(LibYUVPlanarTest, SplitRGBPlane_Opt) {
const int kPixels = benchmark_width_ * benchmark_height_;
align_buffer_page_end(src_pixels, kPixels * 3);
align_buffer_page_end(tmp_pixels_r, kPixels);
align_buffer_page_end(tmp_pixels_g, kPixels);
align_buffer_page_end(tmp_pixels_b, kPixels);
align_buffer_page_end(dst_pixels_opt, kPixels * 3);
align_buffer_page_end(dst_pixels_c, kPixels * 3);
MemRandomize(src_pixels, kPixels * 3);
MemRandomize(tmp_pixels_r, kPixels);
MemRandomize(tmp_pixels_g, kPixels);
MemRandomize(tmp_pixels_b, kPixels);
MemRandomize(dst_pixels_opt, kPixels * 3);
MemRandomize(dst_pixels_c, kPixels * 3);
MaskCpuFlags(disable_cpu_flags_);
SplitRGBPlane(src_pixels, benchmark_width_ * 3, tmp_pixels_r,
benchmark_width_, tmp_pixels_g, benchmark_width_, tmp_pixels_b,
benchmark_width_, benchmark_width_, benchmark_height_);
MergeRGBPlane(tmp_pixels_r, benchmark_width_, tmp_pixels_g, benchmark_width_,
tmp_pixels_b, benchmark_width_, dst_pixels_c,
benchmark_width_ * 3, benchmark_width_, benchmark_height_);
MaskCpuFlags(benchmark_cpu_info_);
for (int i = 0; i < benchmark_iterations_; ++i) {
SplitRGBPlane(src_pixels, benchmark_width_ * 3, tmp_pixels_r,
benchmark_width_, tmp_pixels_g, benchmark_width_,
tmp_pixels_b, benchmark_width_, benchmark_width_,
benchmark_height_);
}
MergeRGBPlane(tmp_pixels_r, benchmark_width_, tmp_pixels_g, benchmark_width_,
tmp_pixels_b, benchmark_width_, dst_pixels_opt,
benchmark_width_ * 3, benchmark_width_, benchmark_height_);
for (int i = 0; i < kPixels * 3; ++i) {
EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]);
}
free_aligned_buffer_page_end(src_pixels);
free_aligned_buffer_page_end(tmp_pixels_r);
free_aligned_buffer_page_end(tmp_pixels_g);
free_aligned_buffer_page_end(tmp_pixels_b);
free_aligned_buffer_page_end(dst_pixels_opt);
free_aligned_buffer_page_end(dst_pixels_c);
}
float TestScaleMaxSamples(int benchmark_width,
int benchmark_height,
int benchmark_iterations,
......