Color Matrix for Neon

BUG=176 TESTED=*Matrix* Review URL: https://webrtc-codereview.appspot.com/966033 git-svn-id: http://libyuv.googlecode.com/svn/trunk@512 16f28f9a-4ce2-e073-06de-1de4eb20be90

Color Matrix for Neon
BUG=176 TESTED=*Matrix* Review URL: https://webrtc-codereview.appspot.com/966033 git-svn-id: http://libyuv.googlecode.com/svn/trunk@512 16f28f9a-4ce2-e073-06de-1de4eb20be90
62154e53 · fbarchard@google.com · c247625d · 62154e53 · 62154e53 · 62154e53
Commit 62154e53 authored Dec 04, 2012 by fbarchard@google.com
6 changed files
--- a/README.chromium
+++ b/README.chromium
 Name: libyuv
 URL: http://code.google.com/p/libyuv/
-Version: 511
+Version: 512
 License: BSD
 License File: LICENSE


--- a/include/libyuv/row.h
+++ b/include/libyuv/row.h
@@ -226,6 +226,7 @@ extern "C" {
 #define HAS_ARGBSHADEROW_NEON
 #define HAS_ARGBGRAYROW_NEON
 #define HAS_ARGBSEPIAROW_NEON
+#define HAS_ARGBCOLORMATRIXROW_NEON
 #endif

 // The following are available on Mips platforms
@@ -1229,6 +1230,8 @@ void ARGBSepiaRow_NEON(uint8* dst_argb, int width);
 void ARGBColorMatrixRow_C(uint8* dst_argb, const int8* matrix_argb, int width);
 void ARGBColorMatrixRow_SSSE3(uint8* dst_argb, const int8* matrix_argb,
                              int width);
+void ARGBColorMatrixRow_NEON(uint8* dst_argb, const int8* matrix_argb,
+                             int width);

 void ARGBColorTableRow_C(uint8* dst_argb, const uint8* table_argb, int width);
 void ARGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, int width);

--- a/include/libyuv/version.h
+++ b/include/libyuv/version.h
@@ -11,6 +11,6 @@
 #ifndef INCLUDE_LIBYUV_VERSION_H_  // NOLINT
 #define INCLUDE_LIBYUV_VERSION_H_

-#define LIBYUV_VERSION 511
+#define LIBYUV_VERSION 512

 #endif  // INCLUDE_LIBYUV_VERSION_H_  NOLINT
--- a/source/planar_functions.cc
+++ b/source/planar_functions.cc
@@ -940,6 +940,10 @@ int ARGBColorMatrix(uint8* dst_argb, int dst_stride_argb,
      IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
    ARGBColorMatrixRow = ARGBColorMatrixRow_SSSE3;
  }
+#elif defined(HAS_ARGBCOLORMATRIXROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) {
+    ARGBColorMatrixRow = ARGBColorMatrixRow_NEON;
+  }
 #endif
  uint8* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4;
  for (int y = 0; y < height; ++y) {

--- a/source/row_neon.cc
+++ b/source/row_neon.cc
@@ -2619,6 +2619,48 @@ void ARGBSepiaRow_NEON(uint8* dst_argb, int width) {
  );
 }

+// Transform ARGB pixels in place with a color matrix, 8 pixels (32 bytes) per
+// loop pass. Same as Sepia except matrix is provided; reads 16 matrix bytes.
+void ARGBColorMatrixRow_NEON(uint8* dst_argb, const int8* matrix_argb,
+                             int width) {
+  asm volatile (
+    "vld1.u8    {q2}, [%2]                     \n"  // load 16 coefficients.
+    "vmovl.s8   q0, d4                         \n"  // B,G coefficients s16.
+    "vmovl.s8   q1, d5                         \n"  // R coefficients s16 in d2; d3 unused.
+
+    ".p2align  2                               \n"  // 4 byte alignment.
+  "1:                                          \n"
+    "vld4.8     {d16, d18, d20, d22}, [%0]     \n"  // load 8 ARGB pixels.
+    "subs       %1, %1, #8                     \n"  // 8 processed per loop.
+    "vmovl.u8   q2, d16                        \n"  // b (0 .. 255) 16 bit
+    "vmovl.u8   q3, d18                        \n"  // g
+    "vmovl.u8   q8, d20                        \n"  // r (q8 reuses d16/d17)
+    "vmovl.u8   q9, d22                        \n"  // a (q9 reuses d18/d19)
+    "vmul.s16   q12, q2, d0[0]                 \n"  // B to Matrix B
+    "vmla.s16   q12, q3, d0[1]                 \n"  // G
+    "vmla.s16   q12, q8, d0[2]                 \n"  // R
+    "vmla.s16   q12, q9, d0[3]                 \n"  // A
+    "vmul.s16   q13, q2, d1[0]                 \n"  // B to Matrix G
+    "vmla.s16   q13, q3, d1[1]                 \n"  // G
+    "vmla.s16   q13, q8, d1[2]                 \n"  // R
+    "vmla.s16   q13, q9, d1[3]                 \n"  // A
+    "vmul.s16   q14, q2, d2[0]                 \n"  // B to Matrix R
+    "vmla.s16   q14, q3, d2[1]                 \n"  // G
+    "vmla.s16   q14, q8, d2[2]                 \n"  // R
+    "vmla.s16   q14, q9, d2[3]                 \n"  // A
+    "vqshrun.s16 d16, q12, #7                  \n"  // 16 bit to 8 bit B
+    "vqshrun.s16 d18, q13, #7                  \n"  // 16 bit to 8 bit G
+    "vqshrun.s16 d20, q14, #7                  \n"  // 16 bit to 8 bit R
+    "vst4.8     {d16, d18, d20, d22}, [%0]!    \n"  // store 8 ARGB pixels; d22 (alpha) is as loaded.
+    "bgt        1b                             \n"  // repeat while remaining width > 0.
+  : "+r"(dst_argb),   // %0
+    "+r"(width)       // %1
+  : "r"(matrix_argb)  // %2
+  : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9",
+    "q10", "q11", "q12", "q13", "q14"
+  );
+}
+
 #endif  // __ARM_NEON__

 #ifdef __cplusplus

--- a/unit_test/planar_test.cc
+++ b/unit_test/planar_test.cc
@@ -362,6 +362,7 @@ TEST_F(libyuvTest, TestARGBColorMatrix) {
    17, 68, 35, 0,
    22, 88, 45, 0,
    24, 98, 50, 0,
+    0, 0, 0, 0,  // Unused but makes matrix 16 bytes.
  };

  // Test blue