ARGBToRGBA_NEON and ARGBToRGB24_NEON

BUG=68 TEST=none Review URL: https://webrtc-codereview.appspot.com/816004 git-svn-id: http://libyuv.googlecode.com/svn/trunk@367 16f28f9a-4ce2-e073-06de-1de4eb20be90

ARGBToRGBA_NEON and ARGBToRGB24_NEON
BUG=68 TEST=none Review URL: https://webrtc-codereview.appspot.com/816004 git-svn-id: http://libyuv.googlecode.com/svn/trunk@367 16f28f9a-4ce2-e073-06de-1de4eb20be90
64961c01 · fbarchard@google.com · 6fd84a8a · 64961c01 · 64961c01 · 64961c01
Commit 64961c01 authored Sep 19, 2012 by fbarchard@google.com
8 changed files
--- a/README.chromium
+++ b/README.chromium
 Name: libyuv
 URL: http://code.google.com/p/libyuv/
-Version: 366
+Version: 367
 License: BSD
 License File: LICENSE

--- a/include/libyuv/row.h
+++ b/include/libyuv/row.h
@@ -115,6 +115,8 @@ extern "C" {
 #define HAS_I422TOBGRAROW_NEON
 #define HAS_I422TOABGRROW_NEON
 #define HAS_I422TORGBAROW_NEON
+#define HAS_ARGBTORGBAROW_NEON
+#define HAS_ARGBTORGB24ROW_NEON
 #endif
 #if defined(_MSC_VER) && !defined(__CLR_VER)
@@ -256,6 +258,9 @@ void ARGBToRGB565Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix);
 void ARGBToARGB1555Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix);
 void ARGBToARGB4444Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix);
+void ARGBToRGBARow_NEON(const uint8* src_argb, uint8* dst_rgb, int pix);
+void ARGBToRGB24Row_NEON(const uint8* src_argb, uint8* dst_rgb, int pix);
 void ARGBToRGBARow_C(const uint8* src_argb, uint8* dst_rgb, int pix);
 void ARGBToRGB24Row_C(const uint8* src_argb, uint8* dst_rgb, int pix);
 void ARGBToRAWRow_C(const uint8* src_argb, uint8* dst_rgb, int pix);
@@ -472,6 +477,8 @@ void ARGBToRGB565Row_Any_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix);
 void ARGBToARGB1555Row_Any_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix);
 void ARGBToARGB4444Row_Any_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix);
+void ARGBToRGB24Row_Any_NEON(const uint8* src_argb, uint8* dst_rgb, int pix);
 void ARGBToYRow_Any_SSSE3(const uint8* src_argb, uint8* dst_y, int pix);
 void BGRAToYRow_Any_SSSE3(const uint8* src_argb, uint8* dst_y, int pix);
 void ABGRToYRow_Any_SSSE3(const uint8* src_argb, uint8* dst_y, int pix);

--- a/include/libyuv/version.h
+++ b/include/libyuv/version.h
@@ -11,6 +11,6 @@
 #ifndef INCLUDE_LIBYUV_VERSION_H_  // NOLINT
 #define INCLUDE_LIBYUV_VERSION_H_
-#define LIBYUV_VERSION 366
+#define LIBYUV_VERSION 367
 #endif  // INCLUDE_LIBYUV_VERSION_H_  NOLINT
--- a/source/convert.cc
+++ b/source/convert.cc
@@ -744,6 +744,24 @@ int YUY2ToI420(const uint8* src_yuy2, int src_stride_yuy2,
      }
    }
  }
+#endif
+#if defined(HAS_YUY2TOYROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    if (width > 16) {
+      YUY2ToUVRow = YUY2ToUVRow_Any_NEON;
+      YUY2ToYRow = YUY2ToYRow_Any_NEON;
+    }
+    if (IS_ALIGNED(width, 16)) {
+      YUY2ToUVRow = YUY2ToUVRow_Unaligned_NEON;
+      YUY2ToYRow = YUY2ToYRow_Unaligned_NEON;
+      if (IS_ALIGNED(src_yuy2, 16) && IS_ALIGNED(src_stride_yuy2, 16)) {
+        YUY2ToUVRow = YUY2ToUVRow_NEON;
+        if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
+          YUY2ToYRow = YUY2ToYRow_NEON;
+        }
+      }
+    }
+  }
 #endif
  for (int y = 0; y < height - 1; y += 2) {
    YUY2ToUVRow(src_yuy2, src_stride_yuy2, dst_u, dst_v, width);

--- a/source/convert_from.cc
+++ b/source/convert_from.cc
@@ -927,6 +927,17 @@ int I420ToRGB24(const uint8* src_y, int src_stride_y,
    }
  }
 #endif
+#if defined(HAS_ARGBTORGB24ROW_NEON)
+  // TODO(fbarchard): One step I420ToRGB24Row_NEON.
+  if (TestCpuFlag(kCpuHasNEON)) {
+    if (width * 3 <= kMaxStride) {
+      ARGBToRGB24Row = ARGBToRGB24Row_Any_NEON;
+    }
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToRGB24Row = ARGBToRGB24Row_NEON;
+    }
+  }
+#endif
  for (int y = 0; y < height; ++y) {
    I422ToARGBRow(src_y, src_u, src_v, row, width);

--- a/source/planar_functions.cc
+++ b/source/planar_functions.cc
@@ -320,6 +320,12 @@ int ARGBToRGBA(const uint8* src_argb, int src_stride_argb,
    ARGBToRGBARow = ARGBToRGBARow_SSSE3;
  }
 #endif
+#if defined(HAS_ARGBTORGBAROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON) &&
+      IS_ALIGNED(width, 16)) {
+    ARGBToRGBARow = ARGBToRGBARow_NEON;
+  }
+#endif
  for (int y = 0; y < height; ++y) {
    ARGBToRGBARow(src_argb, dst_rgba, width);
@@ -355,6 +361,16 @@ int ARGBToRGB24(const uint8* src_argb, int src_stride_argb,
    }
  }
 #endif
+#if defined(HAS_ARGBTORGB24ROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    if (width * 3 <= kMaxStride) {
+      ARGBToRGB24Row = ARGBToRGB24Row_Any_NEON;
+    }
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToRGB24Row = ARGBToRGB24Row_NEON;
+    }
+  }
+#endif
  for (int y = 0; y < height; ++y) {
    ARGBToRGB24Row(src_argb, dst_rgb24, width);

--- a/source/row_common.cc
+++ b/source/row_common.cc
@@ -979,6 +979,9 @@ RGBANY(ARGBToRGB565Row_Any_SSE2, ARGBToRGB565Row_SSE2, 2)
 RGBANY(ARGBToARGB1555Row_Any_SSE2, ARGBToARGB1555Row_SSE2, 2)
 RGBANY(ARGBToARGB4444Row_Any_SSE2, ARGBToARGB4444Row_SSE2, 2)
 #endif
+#if defined(HAS_ARGBTORGB24ROW_NEON)
+RGBANY(ARGBToRGB24Row_Any_NEON, ARGBToRGB24Row_NEON, 3)
+#endif
 #undef RGBANY
 #ifdef HAS_ARGBTOYROW_SSSE3

--- a/source/row_neon.cc
+++ b/source/row_neon.cc
@@ -18,6 +18,8 @@ extern "C" {
 // This module is for GCC Neon
 #if !defined(YUV_DISABLE_ASM) && defined(__ARM_NEON__)
+// TODO(fbarchard): Make a fetch macro so different subsamples can be done.
+// TODO(fbarchard): Rework register usage to produce RGB in d21 - d23.
 #define YUV422TORGB                                                            \
    "vld1.u8    {d0}, [%0]!                    \n"                             \
    "vld1.u32   {d2[0]}, [%1]!                 \n"                             \
@@ -358,6 +360,41 @@ void MirrorRowUV_NEON(const uint8* src, uint8* dst_a, uint8* dst_b, int width) {
 }
 #endif  // HAS_MIRRORROWUV_NEON
+#ifdef HAS_ARGBTORGBAROW_NEON
+// Reorders a row of 4-byte pixels from ARGB to RGBA layout: the fourth byte
+// of every source pixel becomes the first byte of the destination pixel and
+// the other three bytes keep their relative order.
+//
+// src_argb: source row, 4 bytes per pixel.
+// dst_rgba: destination row, 4 bytes per pixel.
+// pix:      pixel count. The loop body runs before the count is tested and
+//           handles 8 pixels per pass, so at least 8 pixels are always read
+//           and written and the count is effectively rounded up to a
+//           multiple of 8.
+void ARGBToRGBARow_NEON(const uint8* src_argb, uint8* dst_rgba, int pix) {
+  asm volatile (
+  "1:                                          \n"
+    // vld4/vst4 accept a list of at most four D registers (Q registers are
+    // not encodable), so each pass de-interleaves 8 pixels into d1-d4.
+    "vld4.u8    {d1, d2, d3, d4}, [%0]!        \n"  // load 8 pixels of ARGB.
+    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
+    "vmov.u8    d0, d4                         \n"  // 4th byte goes first.
+    "vst4.u8    {d0, d1, d2, d3}, [%1]!        \n"  // store 8 pixels of RGBA.
+    "bgt        1b                             \n"
+  : "+r"(src_argb),  // %0
+    "+r"(dst_rgba),  // %1
+    "+r"(pix)        // %2
+  :
+  : "memory", "cc", "d0", "d1", "d2", "d3", "d4"  // Clobber List
+  );
+}
+#endif  // HAS_ARGBTORGBAROW_NEON
+#ifdef HAS_ARGBTORGB24ROW_NEON
+// Packs a row of 4-byte ARGB pixels into 3-byte RGB24 pixels by keeping the
+// first three bytes of every source pixel and dropping the fourth.
+//
+// src_argb:  source row, 4 bytes per pixel.
+// dst_rgb24: destination row, 3 bytes per pixel.
+// pix:       pixel count. The loop body runs before the count is tested and
+//            handles 8 pixels per pass, so at least 8 pixels are always read
+//            and written and the count is effectively rounded up to a
+//            multiple of 8.
+void ARGBToRGB24Row_NEON(const uint8* src_argb, uint8* dst_rgb24, int pix) {
+  asm volatile (
+  "1:                                          \n"
+    // vld4/vst3 accept only D-register lists (Q registers are not
+    // encodable), so each pass de-interleaves 8 pixels into d1-d4 and
+    // stores d1-d3; d4 (the 4th byte of each pixel) is discarded.
+    "vld4.u8    {d1, d2, d3, d4}, [%0]!        \n"  // load 8 pixels of ARGB.
+    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
+    "vst3.u8    {d1, d2, d3}, [%1]!            \n"  // store 8 pixels of RGB24.
+    "bgt        1b                             \n"
+  : "+r"(src_argb),   // %0
+    "+r"(dst_rgb24),  // %1
+    "+r"(pix)         // %2
+  :
+  : "memory", "cc", "d1", "d2", "d3", "d4"  // Clobber List
+  );
+}
+#endif  // HAS_ARGBTORGB24ROW_NEON
 #endif  // __ARM_NEON__
 #ifdef __cplusplus