Switch SSSE3 row wrappers from variable sized malloc to fixed size array with…

Switch the SSSE3 row wrappers from a variable-sized malloc to a fixed-size array with a loop that processes a portion of the row at a time. This helps performance in the case where the image has been coalesced into a single large row and the allocator, although only called once, is slow to clear the pages. Also, the smaller temporary buffer fits in cache, further improving performance. BUG=403 TESTED=YUY2ToARGB unittest R=harryjin@google.com Review URL: https://webrtc-codereview.appspot.com/40849004 git-svn-id: http://libyuv.googlecode.com/svn/trunk@1286 16f28f9a-4ce2-e073-06de-1de4eb20be90

Switch SSSE3 row wrappers from variable sized malloc to fixed size array with…
Switch the SSSE3 row wrappers from a variable-sized malloc to a fixed-size array with a loop that processes a portion of the row at a time. This helps performance in the case where the image has been coalesced into a single large row and the allocator, although only called once, is slow to clear the pages. Also, the smaller temporary buffer fits in cache, further improving performance. BUG=403 TESTED=YUY2ToARGB unittest R=harryjin@google.com Review URL: https://webrtc-codereview.appspot.com/40849004 git-svn-id: http://libyuv.googlecode.com/svn/trunk@1286 16f28f9a-4ce2-e073-06de-1de4eb20be90
6a192487 · fbarchard@google.com · 194f740d · 6a192487 · 6a192487 · 6a192487
Commit 6a192487 authored Feb 20, 2015 by fbarchard@google.com
Hide whitespace changes
Inline Side-by-side

Showing with 102 additions and 66 deletions

README.chromium README.chromium +1 -1

version.h include/libyuv/version.h +1 -1

row_common.cc source/row_common.cc +100 -64

No files found.
--- a/README.chromium
+++ b/README.chromium
 Name: libyuv
 URL: http://code.google.com/p/libyuv/
-Version: 1285
+Version: 1286
 License: BSD
 License File: LICENSE

--- a/include/libyuv/version.h
+++ b/include/libyuv/version.h
@@ -11,6 +11,6 @@
 #ifndef INCLUDE_LIBYUV_VERSION_H_  // NOLINT
 #define INCLUDE_LIBYUV_VERSION_H_
-#define LIBYUV_VERSION 1285
+#define LIBYUV_VERSION 1286
 #endif  // INCLUDE_LIBYUV_VERSION_H_  NOLINT
--- a/source/row_common.cc
+++ b/source/row_common.cc
@@ -2122,6 +2122,9 @@ void I422ToUYVYRow_C(const uint8* src_y,
  }
 }
+// Maximum temporary width for wrappers to process at a time, in pixels.
+#define MAXTWIDTH 4096
 #if !defined(LIBYUV_DISABLE_X86) && defined(HAS_I422TOARGBROW_SSSE3)
 // row_win.cc has asm version, but GCC uses 2 step wrapper.
 #if !defined(_MSC_VER) && (defined(__x86_64__) || defined(__i386__))
@@ -2130,11 +2133,17 @@ void I422ToRGB565Row_SSSE3(const uint8* src_y,
                            const uint8* src_v,
                            uint8* rgb_buf,
                            int width) {
-  // Allocate a row of ARGB.
+  SIMD_ALIGNED(uint8 row[MAXTWIDTH * 4]);
-  align_buffer_64(row, width * 4);
+  while (width > 0) {
-  I422ToARGBRow_SSSE3(src_y, src_u, src_v, row, width);
+    int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
-  ARGBToRGB565Row_SSE2(row, rgb_buf, width);
+    I422ToARGBRow_SSSE3(src_y, src_u, src_v, row, twidth);
-  free_aligned_buffer_64(row);
+    ARGBToRGB565Row_SSE2(row, rgb_buf, twidth);
+    src_y += twidth;
+    src_u += twidth / 2;
+    src_v += twidth / 2;
+    rgb_buf += twidth * 2;
+    width -= twidth;
+  }
 }
 #endif  // !defined(_MSC_VER) && (defined(__x86_64__) || defined(__i386__))
@@ -2144,11 +2153,18 @@ void I422ToARGB1555Row_SSSE3(const uint8* src_y,
                             const uint8* src_v,
                             uint8* rgb_buf,
                             int width) {
-  // Allocate a row of ARGB.
+  // Row buffer for intermediate ARGB pixels.
-  align_buffer_64(row, width * 4);
+  SIMD_ALIGNED(uint8 row[MAXTWIDTH * 4]);
-  I422ToARGBRow_SSSE3(src_y, src_u, src_v, row, width);
+  while (width > 0) {
-  ARGBToARGB1555Row_SSE2(row, rgb_buf, width);
+    int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
-  free_aligned_buffer_64(row);
+    I422ToARGBRow_SSSE3(src_y, src_u, src_v, row, twidth);
+    ARGBToARGB1555Row_SSE2(row, rgb_buf, twidth);
+    src_y += twidth;
+    src_u += twidth / 2;
+    src_v += twidth / 2;
+    rgb_buf += twidth * 2;
+    width -= twidth;
+  }
 }
 void I422ToARGB4444Row_SSSE3(const uint8* src_y,
@@ -2156,61 +2172,81 @@ void I422ToARGB4444Row_SSSE3(const uint8* src_y,
                             const uint8* src_v,
                             uint8* rgb_buf,
                             int width) {
-  // Allocate a row of ARGB.
+  // Row buffer for intermediate ARGB pixels.
-  align_buffer_64(row, width * 4);
+  SIMD_ALIGNED(uint8 row[MAXTWIDTH * 4]);
-  I422ToARGBRow_SSSE3(src_y, src_u, src_v, row, width);
+  while (width > 0) {
-  ARGBToARGB4444Row_SSE2(row, rgb_buf, width);
+    int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
-  free_aligned_buffer_64(row);
+    I422ToARGBRow_SSSE3(src_y, src_u, src_v, row, twidth);
-}
+    ARGBToARGB4444Row_SSE2(row, rgb_buf, twidth);
+    src_y += twidth;
-void NV12ToRGB565Row_SSSE3(const uint8* src_y,
+    src_u += twidth / 2;
-                           const uint8* src_uv,
+    src_v += twidth / 2;
-                           uint8* dst_rgb565,
+    rgb_buf += twidth * 2;
-                           int width) {
+    width -= twidth;
-  // Allocate a row of ARGB.
+  }
-  align_buffer_64(row, width * 4);
+}
-  NV12ToARGBRow_SSSE3(src_y, src_uv, row, width);
-  ARGBToRGB565Row_SSE2(row, dst_rgb565, width);
+void NV12ToRGB565Row_SSSE3(const uint8* src_y, const uint8* src_uv,
-  free_aligned_buffer_64(row);
+                           uint8* dst_rgb565, int width) {
-}
+  // Row buffer for intermediate ARGB pixels.
+  SIMD_ALIGNED(uint8 row[MAXTWIDTH * 4]);
-void NV21ToRGB565Row_SSSE3(const uint8* src_y,
+  while (width > 0) {
-                           const uint8* src_vu,
+    int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
-                           uint8* dst_rgb565,
+    NV12ToARGBRow_SSSE3(src_y, src_uv, row, twidth);
-                           int width) {
+    ARGBToRGB565Row_SSE2(row, dst_rgb565, twidth);
-  // Allocate a row of ARGB.
+    src_y += twidth;
-  align_buffer_64(row, width * 4);
+    src_uv += twidth;
-  NV21ToARGBRow_SSSE3(src_y, src_vu, row, width);
+    dst_rgb565 += twidth * 2;
-  ARGBToRGB565Row_SSE2(row, dst_rgb565, width);
+    width -= twidth;
-  free_aligned_buffer_64(row);
+  }
 }
-void YUY2ToARGBRow_SSSE3(const uint8* src_yuy2,
+void NV21ToRGB565Row_SSSE3(const uint8* src_y, const uint8* src_vu,
-                         uint8* dst_argb,
+                           uint8* dst_rgb565, int width) {
-                         int width) {
+  // Row buffer for intermediate ARGB pixels.
-  // Allocate a rows of yuv.
+  SIMD_ALIGNED(uint8 row[MAXTWIDTH * 4]);
-  align_buffer_64(row_y, ((width + 63) & ~63) * 2);
+  while (width > 0) {
-  uint8* row_u = row_y + ((width + 63) & ~63);
+    int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
-  uint8* row_v = row_u + ((width + 63) & ~63) / 2;
+    NV21ToARGBRow_SSSE3(src_y, src_vu, row, twidth);
-  YUY2ToUV422Row_SSE2(src_yuy2, row_u, row_v, width);
+    ARGBToRGB565Row_SSE2(row, dst_rgb565, twidth);
-  YUY2ToYRow_SSE2(src_yuy2, row_y, width);
+    src_y += twidth;
-  I422ToARGBRow_SSSE3(row_y, row_u, row_v, dst_argb, width);
+    src_vu += twidth;
-  free_aligned_buffer_64(row_y);
+    dst_rgb565 += twidth * 2;
-}
+    width -= twidth;
+  }
-void UYVYToARGBRow_SSSE3(const uint8* src_uyvy,
+}
-                         uint8* dst_argb,
-                         int width) {
+void YUY2ToARGBRow_SSSE3(const uint8* src_yuy2, uint8* dst_argb, int width) {
-  // Allocate a rows of yuv.
+  // Row buffers for intermediate YUV pixels.
-  align_buffer_64(row_y, ((width + 63) & ~63) * 2);
+  SIMD_ALIGNED(uint8 row_y[MAXTWIDTH]);
-  uint8* row_u = row_y + ((width + 63) & ~63);
+  SIMD_ALIGNED(uint8 row_u[MAXTWIDTH / 2]);
-  uint8* row_v = row_u + ((width + 63) & ~63) / 2;
+  SIMD_ALIGNED(uint8 row_v[MAXTWIDTH / 2]);
-  UYVYToUV422Row_SSE2(src_uyvy, row_u, row_v, width);
+  while (width > 0) {
-  UYVYToYRow_SSE2(src_uyvy, row_y, width);
+    int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
-  I422ToARGBRow_SSSE3(row_y, row_u, row_v, dst_argb, width);
+    YUY2ToUV422Row_SSE2(src_yuy2, row_u, row_v, twidth);
-  free_aligned_buffer_64(row_y);
+    YUY2ToYRow_SSE2(src_yuy2, row_y, twidth);
+    I422ToARGBRow_SSSE3(row_y, row_u, row_v, dst_argb, twidth);
+    src_yuy2 += twidth * 2;
+    dst_argb += twidth * 4;
+    width -= twidth;
+  }
+}
+void UYVYToARGBRow_SSSE3(const uint8* src_uyvy, uint8* dst_argb, int width) {
+  // Row buffers for intermediate YUV pixels.
+  SIMD_ALIGNED(uint8 row_y[MAXTWIDTH]);
+  SIMD_ALIGNED(uint8 row_u[MAXTWIDTH / 2]);
+  SIMD_ALIGNED(uint8 row_v[MAXTWIDTH / 2]);
+  while (width > 0) {
+    int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
+    UYVYToUV422Row_SSE2(src_uyvy, row_u, row_v, twidth);
+    UYVYToYRow_SSE2(src_uyvy, row_y, twidth);
+    I422ToARGBRow_SSSE3(row_y, row_u, row_v, dst_argb, twidth);
+    src_uyvy += twidth * 2;
+    dst_argb += twidth * 4;
+    width -= twidth;
+  }
 }
 #endif  // defined(_M_IX86) || defined(__x86_64__) || defined(__i386__)
 #endif  // !defined(LIBYUV_DISABLE_X86)