Neon 1 step conversion of NV12 to RGB565

BUG=136 TEST=unittest on nv12 Review URL: https://webrtc-codereview.appspot.com/939011 git-svn-id: http://libyuv.googlecode.com/svn/trunk@461 16f28f9a-4ce2-e073-06de-1de4eb20be90

Neon 1 step conversion of NV12 to RGB565
BUG=136 TEST=unittest on nv12 Review URL: https://webrtc-codereview.appspot.com/939011 git-svn-id: http://libyuv.googlecode.com/svn/trunk@461 16f28f9a-4ce2-e073-06de-1de4eb20be90
9f2d4041 · fbarchard@google.com · 11c6d32a · 9f2d4041 · 9f2d4041 · 9f2d4041
Commit 9f2d4041 authored Oct 31, 2012 by fbarchard@google.com
Showing with 241 additions and 65 deletions

row.h include/libyuv/row.h +47 -0

planar_functions.cc source/planar_functions.cc +30 -59

row_any.cc source/row_any.cc +16 -6

row_common.cc source/row_common.cc +86 -0

row_neon.cc source/row_neon.cc +62 -0

No files found.
--- a/include/libyuv/row.h
+++ b/include/libyuv/row.h
@@ -69,6 +69,8 @@ extern "C" {
 #define HAS_MIRRORROWUV_SSSE3
 #define HAS_NV12TOARGBROW_SSSE3
 #define HAS_NV21TOARGBROW_SSSE3
+#define HAS_NV12TORGB565ROW_SSSE3
+#define HAS_NV21TORGB565ROW_SSSE3
 #define HAS_RAWTOARGBROW_SSSE3
 #define HAS_RGB24TOARGBROW_SSSE3
 #define HAS_RGB565TOARGBROW_SSE2
@@ -160,6 +162,8 @@ extern "C" {
 #define HAS_MIRRORROWUV_NEON
 #define HAS_NV12TOARGBROW_NEON
 #define HAS_NV21TOARGBROW_NEON
+#define HAS_NV12TORGB565ROW_NEON
+#define HAS_NV21TORGB565ROW_NEON
 #define HAS_RAWTOARGBROW_NEON
 #define HAS_RGB24TOARGBROW_NEON
 #define HAS_RGBATOARGBROW_NEON
@@ -278,6 +282,14 @@ void NV21ToARGBRow_NEON(const uint8* y_buf,
                        const uint8* uv_buf,
                        uint8* rgb_buf,
                        int width);
+void NV12ToRGB565Row_NEON(const uint8* y_buf,
+                          const uint8* uv_buf,
+                          uint8* rgb_buf,
+                          int width);
+void NV21ToRGB565Row_NEON(const uint8* y_buf,
+                          const uint8* uv_buf,
+                          uint8* rgb_buf,
+                          int width);

 void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix);
 void BGRAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix);
@@ -463,6 +475,15 @@ void NV12ToARGBRow_C(const uint8* y_buf,
                     uint8* argb_buf,
                     int width);

+void NV21ToRGB565Row_C(const uint8* y_buf,
+                       const uint8* vu_buf,
+                       uint8* argb_buf,
+                       int width);
+void NV12ToRGB565Row_C(const uint8* y_buf,
+                       const uint8* uv_buf,
+                       uint8* argb_buf,
+                       int width);
+
 void NV21ToARGBRow_C(const uint8* y_buf,
                     const uint8* vu_buf,
                     uint8* argb_buf,
@@ -543,6 +564,16 @@ void NV21ToARGBRow_SSSE3(const uint8* y_buf,
                         uint8* argb_buf,
                         int width);

+void NV12ToRGB565Row_SSSE3(const uint8* y_buf,
+                           const uint8* uv_buf,
+                           uint8* argb_buf,
+                           int width);
+
+void NV21ToRGB565Row_SSSE3(const uint8* y_buf,
+                           const uint8* vu_buf,
+                           uint8* argb_buf,
+                           int width);
+
 void I422ToBGRARow_SSSE3(const uint8* y_buf,
                         const uint8* u_buf,
                         const uint8* v_buf,
@@ -650,6 +681,14 @@ void NV21ToARGBRow_Any_SSSE3(const uint8* y_buf,
                             const uint8* vu_buf,
                             uint8* argb_buf,
                             int width);
+void NV12ToRGB565Row_Any_SSSE3(const uint8* y_buf,
+                               const uint8* uv_buf,
+                               uint8* argb_buf,
+                               int width);
+void NV21ToRGB565Row_Any_SSSE3(const uint8* y_buf,
+                               const uint8* vu_buf,
+                               uint8* argb_buf,
+                               int width);
 void I422ToBGRARow_Any_SSSE3(const uint8* y_buf,
                             const uint8* u_buf,
                             const uint8* v_buf,
@@ -785,6 +824,14 @@ void NV21ToARGBRow_Any_NEON(const uint8* y_buf,
                            const uint8* uv_buf,
                            uint8* argb_buf,
                            int width);
+void NV12ToRGB565Row_Any_NEON(const uint8* y_buf,
+                              const uint8* uv_buf,
+                              uint8* argb_buf,
+                              int width);
+void NV21ToRGB565Row_Any_NEON(const uint8* y_buf,
+                              const uint8* uv_buf,
+                              uint8* argb_buf,
+                              int width);
 void I422ToARGBRow_MIPS_DSPR2(const uint8* y_buf,
                              const uint8* u_buf,
                              const uint8* v_buf,

--- a/source/planar_functions.cc
+++ b/source/planar_functions.cc
@@ -554,13 +554,13 @@ int I422ToRGBA(const uint8* src_y, int src_stride_y,
 }

 // Convert NV12 to RGB565.
-// TODO(fbarchard): One pass conversion.
 LIBYUV_API
 int NV12ToRGB565(const uint8* src_y, int src_stride_y,
                 const uint8* src_uv, int src_stride_uv,
                 uint8* dst_rgb565, int dst_stride_rgb565,
                 int width, int height) {
-  if (!src_y || !src_uv || !dst_rgb565 || width <= 0 || height == 0) {
+  if (!src_y || !src_uv || !dst_rgb565 ||
+      width <= 0 || height == 0) {
    return -1;
  }
  // Negative height means invert the image.
@@ -569,43 +569,28 @@ int NV12ToRGB565(const uint8* src_y, int src_stride_y,
    dst_rgb565 = dst_rgb565 + (height - 1) * dst_stride_rgb565;
    dst_stride_rgb565 = -dst_stride_rgb565;
  }
-  void (*NV12ToARGBRow)(const uint8* y_buf,
-                        const uint8* uv_buf,
-                        uint8* rgb_buf,
-                        int width) = NV12ToARGBRow_C;
-#if defined(HAS_NV12TOARGBROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
-    NV12ToARGBRow = NV12ToARGBRow_Any_SSSE3;
+  void (*NV12ToRGB565Row)(const uint8* y_buf,
+                          const uint8* uv_buf,
+                          uint8* rgb_buf,
+                          int width) = NV12ToRGB565Row_C;
+#if defined(HAS_NV12TORGB565ROW_SSSE3)
+  // NV12ToRGB565Row_SSSE3 stages ARGB (4 bytes per pixel) in a
+  // kMaxStride-byte row buffer, so the SIMD path is only safe when
+  // width * 4 <= kMaxStride. Written as a division to avoid overflowing
+  // width * 4. Wider rows keep the C row function selected above.
+  if (TestCpuFlag(kCpuHasSSSE3) && width >= 8 && width <= kMaxStride / 4) {
+    NV12ToRGB565Row = NV12ToRGB565Row_Any_SSSE3;
    if (IS_ALIGNED(width, 8)) {
-      NV12ToARGBRow = NV12ToARGBRow_SSSE3;
+      NV12ToRGB565Row = NV12ToRGB565Row_SSSE3;
    }
  }
-#elif defined(HAS_NV12TOARGBROW_NEON)
+#elif defined(HAS_NV12TORGB565ROW_NEON)
  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
-    NV12ToARGBRow = NV12ToARGBRow_Any_NEON;
+    NV12ToRGB565Row = NV12ToRGB565Row_Any_NEON;
    if (IS_ALIGNED(width, 8)) {
-      NV12ToARGBRow = NV12ToARGBRow_NEON;
-    }
-  }
-#endif
-  if (width * 4 > kMaxStride) {
-    return -1;
-  }
-  SIMD_ALIGNED(uint8 row[kMaxStride]);
-  void (*ARGBToRGB565Row)(const uint8* src_argb, uint8* dst_rgb, int pix) =
-      ARGBToRGB565Row_C;
-#if defined(HAS_ARGBTORGB565ROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2) && width >= 4) {
-    ARGBToRGB565Row = ARGBToRGB565Row_Any_SSE2;
-    if (IS_ALIGNED(width, 4)) {
-      ARGBToRGB565Row = ARGBToRGB565Row_SSE2;
+      NV12ToRGB565Row = NV12ToRGB565Row_NEON;
    }
  }
 #endif

  for (int y = 0; y < height; ++y) {
-    NV12ToARGBRow(src_y, src_uv, row, width);
-    ARGBToRGB565Row(row, dst_rgb565, width);
+    NV12ToRGB565Row(src_y, src_uv, dst_rgb565, width);
    dst_rgb565 += dst_stride_rgb565;
    src_y += src_stride_y;
    if (y & 1) {
@@ -618,10 +603,11 @@ int NV12ToRGB565(const uint8* src_y, int src_stride_y,
 // Convert NV21 to RGB565.
 LIBYUV_API
 int NV21ToRGB565(const uint8* src_y, int src_stride_y,
-                 const uint8* src_uv, int src_stride_uv,
+                 const uint8* src_vu, int src_stride_vu,
                 uint8* dst_rgb565, int dst_stride_rgb565,
                 int width, int height) {
-  if (!src_y || !src_uv || !dst_rgb565 || width <= 0 || height == 0) {
+  if (!src_y || !src_vu || !dst_rgb565 ||
+      width <= 0 || height == 0) {
    return -1;
  }
  // Negative height means invert the image.
@@ -630,47 +616,32 @@ int NV21ToRGB565(const uint8* src_y, int src_stride_y,
    dst_rgb565 = dst_rgb565 + (height - 1) * dst_stride_rgb565;
    dst_stride_rgb565 = -dst_stride_rgb565;
  }
-  void (*NV21ToARGBRow)(const uint8* y_buf,
-                        const uint8* uv_buf,
-                        uint8* rgb_buf,
-                        int width) = NV21ToARGBRow_C;
-#if defined(HAS_NV21TOARGBROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
-    NV21ToARGBRow = NV21ToARGBRow_Any_SSSE3;
+  void (*NV21ToRGB565Row)(const uint8* y_buf,
+                          const uint8* vu_buf,
+                          uint8* rgb_buf,
+                          int width) = NV21ToRGB565Row_C;
+#if defined(HAS_NV21TORGB565ROW_SSSE3)
+  // NV21ToRGB565Row_SSSE3 stages ARGB (4 bytes per pixel) in a
+  // kMaxStride-byte row buffer, so the SIMD path is only safe when
+  // width * 4 <= kMaxStride. Written as a division to avoid overflowing
+  // width * 4. Wider rows keep the C row function selected above.
+  if (TestCpuFlag(kCpuHasSSSE3) && width >= 8 && width <= kMaxStride / 4) {
+    NV21ToRGB565Row = NV21ToRGB565Row_Any_SSSE3;
    if (IS_ALIGNED(width, 8)) {
-      NV21ToARGBRow = NV21ToARGBRow_SSSE3;
+      NV21ToRGB565Row = NV21ToRGB565Row_SSSE3;
    }
  }
-#elif defined(HAS_NV21TOARGBROW_NEON)
+#elif defined(HAS_NV21TORGB565ROW_NEON)
  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
-    NV21ToARGBRow = NV21ToARGBRow_Any_NEON;
+    NV21ToRGB565Row = NV21ToRGB565Row_Any_NEON;
    if (IS_ALIGNED(width, 8)) {
-      NV21ToARGBRow = NV21ToARGBRow_NEON;
-    }
-  }
-#endif
-  if (width * 4 > kMaxStride) {
-    return -1;
-  }
-  SIMD_ALIGNED(uint8 row[kMaxStride]);
-  void (*ARGBToRGB565Row)(const uint8* src_argb, uint8* dst_rgb, int pix) =
-      ARGBToRGB565Row_C;
-#if defined(HAS_ARGBTORGB565ROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2) && width >= 4) {
-    ARGBToRGB565Row = ARGBToRGB565Row_Any_SSE2;
-    if (IS_ALIGNED(width, 4)) {
-      ARGBToRGB565Row = ARGBToRGB565Row_SSE2;
+      NV21ToRGB565Row = NV21ToRGB565Row_NEON;
    }
  }
 #endif

  for (int y = 0; y < height; ++y) {
-    NV21ToARGBRow(src_y, src_uv, row, width);
-    ARGBToRGB565Row(row, dst_rgb565, width);
+    NV21ToRGB565Row(src_y, src_vu, dst_rgb565, width);
    dst_rgb565 += dst_stride_rgb565;
    src_y += src_stride_y;
    if (y & 1) {
-      src_uv += src_stride_uv;
+      src_vu += src_stride_vu;
    }
  }
  return 0;

--- a/source/row_any.cc
+++ b/source/row_any.cc
@@ -90,16 +90,26 @@ YANY(I422ToUYVYRow_Any_NEON, I422ToUYVYRow_NEON, I422ToUYVYRow_C, 1, 2, 15)
                  rgb_buf + n * BPP, width & 7);                               \
    }

-#ifdef HAS_I422TOARGBROW_SSSE3
+#ifdef HAS_NV12TOARGBROW_SSSE3
 NV2NY(NV12ToARGBRow_Any_SSSE3, NV12ToARGBRow_Unaligned_SSSE3, NV12ToARGBRow_C,
-     0, 4)
+      0, 4)
 NV2NY(NV21ToARGBRow_Any_SSSE3, NV21ToARGBRow_Unaligned_SSSE3, NV21ToARGBRow_C,
-     0, 4)
-#endif  // HAS_I422TOARGBROW_SSSE3
-#ifdef HAS_I422TOARGBROW_NEON
+      0, 4)
+#endif  // HAS_NV12TOARGBROW_SSSE3
+#ifdef HAS_NV12TOARGBROW_NEON
 NV2NY(NV12ToARGBRow_Any_NEON, NV12ToARGBRow_NEON, NV12ToARGBRow_C, 0, 4)
 NV2NY(NV21ToARGBRow_Any_NEON, NV21ToARGBRow_NEON, NV21ToARGBRow_C, 0, 4)
-#endif  // HAS_I422TOARGBROW_NEON
+#endif  // HAS_NV12TOARGBROW_NEON
+#ifdef HAS_NV12TORGB565ROW_SSSE3
+NV2NY(NV12ToRGB565Row_Any_SSSE3, NV12ToRGB565Row_SSSE3, NV12ToRGB565Row_C,
+      0, 2)
+NV2NY(NV21ToRGB565Row_Any_SSSE3, NV21ToRGB565Row_SSSE3, NV21ToRGB565Row_C,
+      0, 2)
+#endif  // HAS_NV12TORGB565ROW_SSSE3
+#ifdef HAS_NV12TORGB565ROW_NEON
+NV2NY(NV12ToRGB565Row_Any_NEON, NV12ToRGB565Row_NEON, NV12ToRGB565Row_C, 0, 2)
+NV2NY(NV21ToRGB565Row_Any_NEON, NV21ToRGB565Row_NEON, NV21ToRGB565Row_C, 0, 2)
+#endif  // HAS_NV12TORGB565ROW_NEON
 #undef NVANY

 // RGB to RGB does multiple of 16 pixels with SIMD and remainder with C.

--- a/source/row_common.cc
+++ b/source/row_common.cc
@@ -710,6 +710,74 @@ void NV21ToARGBRow_C(const uint8* y_buf,
  }
 }

+// Converts one row of NV12 to RGB565 in portable C.
+//   y_buf:      one luma byte per pixel.
+//   uv_buf:     interleaved chroma; each 2-byte pair (byte 0, byte 1) is
+//               shared by two horizontally adjacent pixels.
+//   dst_rgb565: output, 2 bytes per pixel, low byte first.
+//   width:      number of pixels; an odd trailing pixel is handled separately.
+// Pixels are written with byte stores, so dst_rgb565 needs no alignment and
+// the output does not depend on host byte order.
+void NV12ToRGB565Row_C(const uint8* y_buf,
+                       const uint8* uv_buf,
+                       uint8* dst_rgb565,
+                       int width) {
+  uint8 b0;
+  uint8 g0;
+  uint8 r0;
+  uint8 b1;
+  uint8 g1;
+  uint8 r1;
+  for (int x = 0; x < width - 1; x += 2) {
+    // Both pixels of the pair use the same two chroma bytes.
+    YuvPixel2(y_buf[0], uv_buf[0], uv_buf[1], &b0, &g0, &r0);
+    YuvPixel2(y_buf[1], uv_buf[0], uv_buf[1], &b1, &g1, &r1);
+    // Reduce 8-bit channels to 5 (blue), 6 (green) and 5 (red) bits.
+    b0 = b0 >> 3;
+    g0 = g0 >> 2;
+    r0 = r0 >> 3;
+    b1 = b1 >> 3;
+    g1 = g1 >> 2;
+    r1 = r1 >> 3;
+    // Pixel layout: blue in bits 0-4, green in bits 5-10, red in bits 11-15.
+    // Low byte = blue | low 3 green bits; high byte = high 3 green bits | red.
+    dst_rgb565[0] = static_cast<uint8>(b0 | (g0 << 5));
+    dst_rgb565[1] = static_cast<uint8>((g0 >> 3) | (r0 << 3));
+    dst_rgb565[2] = static_cast<uint8>(b1 | (g1 << 5));
+    dst_rgb565[3] = static_cast<uint8>((g1 >> 3) | (r1 << 3));
+    y_buf += 2;
+    uv_buf += 2;
+    dst_rgb565 += 4;  // Advance 2 pixels.
+  }
+  if (width & 1) {
+    // Odd width: convert the final pixel with the next chroma pair.
+    YuvPixel2(y_buf[0], uv_buf[0], uv_buf[1], &b0, &g0, &r0);
+    b0 = b0 >> 3;
+    g0 = g0 >> 2;
+    r0 = r0 >> 3;
+    dst_rgb565[0] = static_cast<uint8>(b0 | (g0 << 5));
+    dst_rgb565[1] = static_cast<uint8>((g0 >> 3) | (r0 << 3));
+  }
+}
+
+// Converts one row of NV21 to RGB565 in portable C.
+//   y_buf:      one luma byte per pixel.
+//   vu_buf:     interleaved chroma; each 2-byte pair is shared by two
+//               horizontally adjacent pixels. The pair is passed to YuvPixel2
+//               as (byte 1, byte 0), the reverse of the NV12 variant.
+//   dst_rgb565: output, 2 bytes per pixel, low byte first.
+//   width:      number of pixels; an odd trailing pixel is handled separately.
+// Pixels are written with byte stores, so dst_rgb565 needs no alignment and
+// the output does not depend on host byte order.
+void NV21ToRGB565Row_C(const uint8* y_buf,
+                       const uint8* vu_buf,
+                       uint8* dst_rgb565,
+                       int width) {
+  uint8 b0;
+  uint8 g0;
+  uint8 r0;
+  uint8 b1;
+  uint8 g1;
+  uint8 r1;
+  for (int x = 0; x < width - 1; x += 2) {
+    // Both pixels of the pair use the same two chroma bytes.
+    YuvPixel2(y_buf[0], vu_buf[1], vu_buf[0], &b0, &g0, &r0);
+    YuvPixel2(y_buf[1], vu_buf[1], vu_buf[0], &b1, &g1, &r1);
+    // Reduce 8-bit channels to 5 (blue), 6 (green) and 5 (red) bits.
+    b0 = b0 >> 3;
+    g0 = g0 >> 2;
+    r0 = r0 >> 3;
+    b1 = b1 >> 3;
+    g1 = g1 >> 2;
+    r1 = r1 >> 3;
+    // Pixel layout: blue in bits 0-4, green in bits 5-10, red in bits 11-15.
+    // Low byte = blue | low 3 green bits; high byte = high 3 green bits | red.
+    dst_rgb565[0] = static_cast<uint8>(b0 | (g0 << 5));
+    dst_rgb565[1] = static_cast<uint8>((g0 >> 3) | (r0 << 3));
+    dst_rgb565[2] = static_cast<uint8>(b1 | (g1 << 5));
+    dst_rgb565[3] = static_cast<uint8>((g1 >> 3) | (r1 << 3));
+    y_buf += 2;
+    vu_buf += 2;
+    dst_rgb565 += 4;  // Advance 2 pixels.
+  }
+  if (width & 1) {
+    // Odd width: convert the final pixel with the next chroma pair.
+    YuvPixel2(y_buf[0], vu_buf[1], vu_buf[0], &b0, &g0, &r0);
+    b0 = b0 >> 3;
+    g0 = g0 >> 2;
+    r0 = r0 >> 3;
+    dst_rgb565[0] = static_cast<uint8>(b0 | (g0 << 5));
+    dst_rgb565[1] = static_cast<uint8>((g0 >> 3) | (r0 << 3));
+  }
+}
+
 void I422ToBGRARow_C(const uint8* y_buf,
                     const uint8* u_buf,
                     const uint8* v_buf,
@@ -1311,6 +1379,24 @@ void I422ToARGB4444Row_SSSE3(const uint8* y_buf,
  I422ToARGBRow_SSSE3(y_buf, u_buf, v_buf, row, width);
  ARGBToARGB4444Row_SSE2(row, rgb_buf, width);
 }
+// Converts a row of NV12 to RGB565 in two passes: NV12ToARGBRow_SSSE3 writes
+// ARGB into a kMaxStride-byte SIMD_ALIGNED stack buffer, then
+// ARGBToRGB565Row_SSE2 packs that buffer into dst_rgb565.
+// The staging buffer holds 4 bytes per pixel, so the caller must guarantee
+// width * 4 <= kMaxStride; nothing in this function checks it.
+void NV12ToRGB565Row_SSSE3(const uint8* src_y,
+                           const uint8* src_uv,
+                           uint8* dst_rgb565,
+                           int width) {
+  SIMD_ALIGNED(uint8 row[kMaxStride]);  // Intermediate ARGB row.
+  NV12ToARGBRow_SSSE3(src_y, src_uv, row, width);
+  ARGBToRGB565Row_SSE2(row, dst_rgb565, width);
+}
+
+// Converts a row of NV21 to RGB565 in two passes: NV21ToARGBRow_SSSE3 writes
+// ARGB into a kMaxStride-byte SIMD_ALIGNED stack buffer, then
+// ARGBToRGB565Row_SSE2 packs that buffer into dst_rgb565.
+// The staging buffer holds 4 bytes per pixel, so the caller must guarantee
+// width * 4 <= kMaxStride; nothing in this function checks it.
+void NV21ToRGB565Row_SSSE3(const uint8* src_y,
+                           const uint8* src_vu,
+                           uint8* dst_rgb565,
+                           int width) {
+  SIMD_ALIGNED(uint8 row[kMaxStride]);  // Intermediate ARGB row.
+  NV21ToARGBRow_SSSE3(src_y, src_vu, row, width);
+  ARGBToRGB565Row_SSE2(row, dst_rgb565, width);
+}
+
 #endif  // defined(_M_IX86) || defined(__x86_64__) || defined(__i386__)
 #endif  // !defined(YUV_DISABLE_ASM)


--- a/source/row_neon.cc
+++ b/source/row_neon.cc
@@ -473,6 +473,68 @@ void NV21ToARGBRow_NEON(const uint8* src_y,
 }
 #endif  // HAS_NV21TOARGBROW_NEON

+#ifdef HAS_NV12TORGB565ROW_NEON
+// NEON row function: converts NV12 (src_y plus src_uv) to RGB565 in
+// dst_rgb565, 8 pixels (16 output bytes) per loop iteration.
+// The loop subtracts 8 from width and repeats while the result is positive,
+// so it always runs at least once and works in whole groups of 8 pixels.
+// NOTE(review): READNV12, YUV422TORGB and ARGBTORGB565 are macros defined
+// elsewhere in this file; the register roles noted below are presumed from
+// their names and the constants loaded here - confirm against the macros.
+void NV12ToRGB565Row_NEON(const uint8* src_y,
+                        const uint8* src_uv,
+                        uint8* dst_rgb565,
+                        int width) {
+  asm volatile (
+    "vld1.u8    {d24}, [%4]                    \n"  // d24 = 8 bytes at kUVToRB.
+    "vld1.u8    {d25}, [%5]                    \n"  // d25 = 8 bytes at kUVToG.
+    "vmov.u8    d26, #128                      \n"  // presumably the UV bias.
+    "vmov.u16   q14, #74                       \n"  // presumably the Y scale.
+    "vmov.u16   q15, #16                       \n"  // presumably the Y offset.
+    ".p2align  2                               \n"
+  "1:                                          \n"
+    READNV12
+    YUV422TORGB
+    "subs       %3, %3, #8                     \n"  // width -= 8, sets flags.
+    ARGBTORGB565
+    "vst1.8     {q0}, [%2]!                    \n"  // store 8 pixels RGB565.
+    "bgt        1b                             \n"  // loop while width > 0.
+    : "+r"(src_y),     // %0
+      "+r"(src_uv),    // %1
+      "+r"(dst_rgb565),  // %2
+      "+r"(width)      // %3
+    : "r"(&kUVToRB),   // %4
+      "r"(&kUVToG)     // %5
+    : "cc", "memory", "q0", "q1", "q2", "q3",
+      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+  );
+}
+#endif  // HAS_NV12TORGB565ROW_NEON
+
+#ifdef HAS_NV21TORGB565ROW_NEON
+// NEON row function: converts NV21 (src_y plus an interleaved chroma plane) to
+// RGB565 in dst_rgb565, 8 pixels (16 output bytes) per loop iteration.
+// The loop subtracts 8 from width and repeats while the result is positive,
+// so it always runs at least once and works in whole groups of 8 pixels.
+// NOTE(review): the chroma parameter is named src_uv here, but it is read
+// with READNV21, so it presumably holds VU-ordered data - confirm.
+// NOTE(review): READNV21, YUV422TORGB and ARGBTORGB565 are macros defined
+// elsewhere in this file; the register roles noted below are presumed from
+// their names and the constants loaded here - confirm against the macros.
+void NV21ToRGB565Row_NEON(const uint8* src_y,
+                        const uint8* src_uv,
+                        uint8* dst_rgb565,
+                        int width) {
+  asm volatile (
+    "vld1.u8    {d24}, [%4]                    \n"  // d24 = 8 bytes at kUVToRB.
+    "vld1.u8    {d25}, [%5]                    \n"  // d25 = 8 bytes at kUVToG.
+    "vmov.u8    d26, #128                      \n"  // presumably the UV bias.
+    "vmov.u16   q14, #74                       \n"  // presumably the Y scale.
+    "vmov.u16   q15, #16                       \n"  // presumably the Y offset.
+    ".p2align  2                               \n"
+  "1:                                          \n"
+    READNV21
+    YUV422TORGB
+    "subs       %3, %3, #8                     \n"  // width -= 8, sets flags.
+    ARGBTORGB565
+    "vst1.8     {q0}, [%2]!                    \n"  // store 8 pixels RGB565.
+    "bgt        1b                             \n"  // loop while width > 0.
+    : "+r"(src_y),     // %0
+      "+r"(src_uv),    // %1
+      "+r"(dst_rgb565),  // %2
+      "+r"(width)      // %3
+    : "r"(&kUVToRB),   // %4
+      "r"(&kUVToG)     // %5
+    : "cc", "memory", "q0", "q1", "q2", "q3",
+      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+  );
+}
+#endif  // HAS_NV21TORGB565ROW_NEON
+
 #ifdef HAS_SPLITUV_NEON
 // Reads 16 pairs of UV and write even values to dst_u and odd to dst_v
 // Alignment requirement: 16 bytes for pointers, and multiple of 16 pixels.