Commit 6a192487 authored by fbarchard@google.com

Switch SSSE3 row wrappers from variable sized malloc to fixed size array with…

Switch SSSE3 row wrappers from a variable-sized malloc to a fixed-size array with a loop that processes a portion of the row at a time. This helps performance in the case where the image has been coalesced into a single large row and the allocator, although only called once, is slow to clear the pages. Also, the smaller temporary buffer fits in cache, further improving performance.
BUG=403
TESTED=YUY2ToARGB unittest
R=harryjin@google.com

Review URL: https://webrtc-codereview.appspot.com/40849004

git-svn-id: http://libyuv.googlecode.com/svn/trunk@1286 16f28f9a-4ce2-e073-06de-1de4eb20be90
parent 194f740d
Name: libyuv Name: libyuv
URL: http://code.google.com/p/libyuv/ URL: http://code.google.com/p/libyuv/
Version: 1285 Version: 1286
License: BSD License: BSD
License File: LICENSE License File: LICENSE
......
...@@ -11,6 +11,6 @@ ...@@ -11,6 +11,6 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT #ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT
#define INCLUDE_LIBYUV_VERSION_H_ #define INCLUDE_LIBYUV_VERSION_H_
#define LIBYUV_VERSION 1285 #define LIBYUV_VERSION 1286
#endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT #endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT
...@@ -2122,6 +2122,9 @@ void I422ToUYVYRow_C(const uint8* src_y, ...@@ -2122,6 +2122,9 @@ void I422ToUYVYRow_C(const uint8* src_y,
} }
} }
// Maximum temporary width for wrappers to process at a time, in pixels.
#define MAXTWIDTH 4096
#if !defined(LIBYUV_DISABLE_X86) && defined(HAS_I422TOARGBROW_SSSE3) #if !defined(LIBYUV_DISABLE_X86) && defined(HAS_I422TOARGBROW_SSSE3)
// row_win.cc has asm version, but GCC uses 2 step wrapper. // row_win.cc has asm version, but GCC uses 2 step wrapper.
#if !defined(_MSC_VER) && (defined(__x86_64__) || defined(__i386__)) #if !defined(_MSC_VER) && (defined(__x86_64__) || defined(__i386__))
...@@ -2130,11 +2133,17 @@ void I422ToRGB565Row_SSSE3(const uint8* src_y, ...@@ -2130,11 +2133,17 @@ void I422ToRGB565Row_SSSE3(const uint8* src_y,
const uint8* src_v, const uint8* src_v,
uint8* rgb_buf, uint8* rgb_buf,
int width) { int width) {
// Allocate a row of ARGB. SIMD_ALIGNED(uint8 row[MAXTWIDTH * 4]);
align_buffer_64(row, width * 4); while (width > 0) {
I422ToARGBRow_SSSE3(src_y, src_u, src_v, row, width); int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
ARGBToRGB565Row_SSE2(row, rgb_buf, width); I422ToARGBRow_SSSE3(src_y, src_u, src_v, row, twidth);
free_aligned_buffer_64(row); ARGBToRGB565Row_SSE2(row, rgb_buf, twidth);
src_y += twidth;
src_u += twidth / 2;
src_v += twidth / 2;
rgb_buf += twidth * 2;
width -= twidth;
}
} }
#endif // !defined(_MSC_VER) && (defined(__x86_64__) || defined(__i386__)) #endif // !defined(_MSC_VER) && (defined(__x86_64__) || defined(__i386__))
...@@ -2144,11 +2153,18 @@ void I422ToARGB1555Row_SSSE3(const uint8* src_y, ...@@ -2144,11 +2153,18 @@ void I422ToARGB1555Row_SSSE3(const uint8* src_y,
const uint8* src_v, const uint8* src_v,
uint8* rgb_buf, uint8* rgb_buf,
int width) { int width) {
// Allocate a row of ARGB. // Row buffer for intermediate ARGB pixels.
align_buffer_64(row, width * 4); SIMD_ALIGNED(uint8 row[MAXTWIDTH * 4]);
I422ToARGBRow_SSSE3(src_y, src_u, src_v, row, width); while (width > 0) {
ARGBToARGB1555Row_SSE2(row, rgb_buf, width); int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
free_aligned_buffer_64(row); I422ToARGBRow_SSSE3(src_y, src_u, src_v, row, twidth);
ARGBToARGB1555Row_SSE2(row, rgb_buf, twidth);
src_y += twidth;
src_u += twidth / 2;
src_v += twidth / 2;
rgb_buf += twidth * 2;
width -= twidth;
}
} }
void I422ToARGB4444Row_SSSE3(const uint8* src_y, void I422ToARGB4444Row_SSSE3(const uint8* src_y,
...@@ -2156,61 +2172,81 @@ void I422ToARGB4444Row_SSSE3(const uint8* src_y, ...@@ -2156,61 +2172,81 @@ void I422ToARGB4444Row_SSSE3(const uint8* src_y,
const uint8* src_v, const uint8* src_v,
uint8* rgb_buf, uint8* rgb_buf,
int width) { int width) {
// Allocate a row of ARGB. // Row buffer for intermediate ARGB pixels.
align_buffer_64(row, width * 4); SIMD_ALIGNED(uint8 row[MAXTWIDTH * 4]);
I422ToARGBRow_SSSE3(src_y, src_u, src_v, row, width); while (width > 0) {
ARGBToARGB4444Row_SSE2(row, rgb_buf, width); int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
free_aligned_buffer_64(row); I422ToARGBRow_SSSE3(src_y, src_u, src_v, row, twidth);
} ARGBToARGB4444Row_SSE2(row, rgb_buf, twidth);
src_y += twidth;
void NV12ToRGB565Row_SSSE3(const uint8* src_y, src_u += twidth / 2;
const uint8* src_uv, src_v += twidth / 2;
uint8* dst_rgb565, rgb_buf += twidth * 2;
int width) { width -= twidth;
// Allocate a row of ARGB. }
align_buffer_64(row, width * 4); }
NV12ToARGBRow_SSSE3(src_y, src_uv, row, width);
ARGBToRGB565Row_SSE2(row, dst_rgb565, width); void NV12ToRGB565Row_SSSE3(const uint8* src_y, const uint8* src_uv,
free_aligned_buffer_64(row); uint8* dst_rgb565, int width) {
} // Row buffer for intermediate ARGB pixels.
SIMD_ALIGNED(uint8 row[MAXTWIDTH * 4]);
void NV21ToRGB565Row_SSSE3(const uint8* src_y, while (width > 0) {
const uint8* src_vu, int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
uint8* dst_rgb565, NV12ToARGBRow_SSSE3(src_y, src_uv, row, twidth);
int width) { ARGBToRGB565Row_SSE2(row, dst_rgb565, twidth);
// Allocate a row of ARGB. src_y += twidth;
align_buffer_64(row, width * 4); src_uv += twidth;
NV21ToARGBRow_SSSE3(src_y, src_vu, row, width); dst_rgb565 += twidth * 2;
ARGBToRGB565Row_SSE2(row, dst_rgb565, width); width -= twidth;
free_aligned_buffer_64(row); }
} }
void YUY2ToARGBRow_SSSE3(const uint8* src_yuy2, void NV21ToRGB565Row_SSSE3(const uint8* src_y, const uint8* src_vu,
uint8* dst_argb, uint8* dst_rgb565, int width) {
int width) { // Row buffer for intermediate ARGB pixels.
// Allocate a rows of yuv. SIMD_ALIGNED(uint8 row[MAXTWIDTH * 4]);
align_buffer_64(row_y, ((width + 63) & ~63) * 2); while (width > 0) {
uint8* row_u = row_y + ((width + 63) & ~63); int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
uint8* row_v = row_u + ((width + 63) & ~63) / 2; NV21ToARGBRow_SSSE3(src_y, src_vu, row, twidth);
YUY2ToUV422Row_SSE2(src_yuy2, row_u, row_v, width); ARGBToRGB565Row_SSE2(row, dst_rgb565, twidth);
YUY2ToYRow_SSE2(src_yuy2, row_y, width); src_y += twidth;
I422ToARGBRow_SSSE3(row_y, row_u, row_v, dst_argb, width); src_vu += twidth;
free_aligned_buffer_64(row_y); dst_rgb565 += twidth * 2;
} width -= twidth;
}
void UYVYToARGBRow_SSSE3(const uint8* src_uyvy, }
uint8* dst_argb,
int width) { void YUY2ToARGBRow_SSSE3(const uint8* src_yuy2, uint8* dst_argb, int width) {
// Allocate a rows of yuv. // Row buffers for intermediate YUV pixels.
align_buffer_64(row_y, ((width + 63) & ~63) * 2); SIMD_ALIGNED(uint8 row_y[MAXTWIDTH]);
uint8* row_u = row_y + ((width + 63) & ~63); SIMD_ALIGNED(uint8 row_u[MAXTWIDTH / 2]);
uint8* row_v = row_u + ((width + 63) & ~63) / 2; SIMD_ALIGNED(uint8 row_v[MAXTWIDTH / 2]);
UYVYToUV422Row_SSE2(src_uyvy, row_u, row_v, width); while (width > 0) {
UYVYToYRow_SSE2(src_uyvy, row_y, width); int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
I422ToARGBRow_SSSE3(row_y, row_u, row_v, dst_argb, width); YUY2ToUV422Row_SSE2(src_yuy2, row_u, row_v, twidth);
free_aligned_buffer_64(row_y); YUY2ToYRow_SSE2(src_yuy2, row_y, twidth);
I422ToARGBRow_SSSE3(row_y, row_u, row_v, dst_argb, twidth);
src_yuy2 += twidth * 2;
dst_argb += twidth * 4;
width -= twidth;
}
}
void UYVYToARGBRow_SSSE3(const uint8* src_uyvy, uint8* dst_argb, int width) {
// Row buffers for intermediate YUV pixels.
SIMD_ALIGNED(uint8 row_y[MAXTWIDTH]);
SIMD_ALIGNED(uint8 row_u[MAXTWIDTH / 2]);
SIMD_ALIGNED(uint8 row_v[MAXTWIDTH / 2]);
while (width > 0) {
int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
UYVYToUV422Row_SSE2(src_uyvy, row_u, row_v, twidth);
UYVYToYRow_SSE2(src_uyvy, row_y, twidth);
I422ToARGBRow_SSSE3(row_y, row_u, row_v, dst_argb, twidth);
src_uyvy += twidth * 2;
dst_argb += twidth * 4;
width -= twidth;
}
} }
#endif // defined(_M_IX86) || defined(__x86_64__) || defined(__i386__) #endif // defined(_M_IX86) || defined(__x86_64__) || defined(__i386__)
#endif // !defined(LIBYUV_DISABLE_X86) #endif // !defined(LIBYUV_DISABLE_X86)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment