Commit b2a6af1b authored by fbarchard@google.com

Change rectangle low level functions to use more conventional row functions…

Change rectangle low-level functions to use more conventional row functions, including 'any' variations. Previously the YUV function SetPlane stored 32-bit values; now a more conventional memset()-style row function that stores bytes is used for YUV. On Haswell, rep stosb is used for YUV. The overall benefit of this CL is improved performance for 'any' width, and simpler row assembly instead of full-image assembly. Previously ARGBRect used a low-level function that handled a whole rectangle in assembly; now it uses a row function and relies on row coalescing to combine the rows into a single low-level call.
BUG=371
TESTED=untested
R=brucedawson@google.com, harryjin@google.com

Review URL: https://webrtc-codereview.appspot.com/35689004

git-svn-id: http://libyuv.googlecode.com/svn/trunk@1222 16f28f9a-4ce2-e073-06de-1de4eb20be90
parent 89671c4d
Name: libyuv Name: libyuv
URL: http://code.google.com/p/libyuv/ URL: http://code.google.com/p/libyuv/
Version: 1220 Version: 1222
License: BSD License: BSD
License File: LICENSE License File: LICENSE
......
...@@ -114,7 +114,8 @@ extern "C" { ...@@ -114,7 +114,8 @@ extern "C" {
#define HAS_RGBATOUVROW_SSSE3 #define HAS_RGBATOUVROW_SSSE3
#define HAS_RGBATOYROW_SSSE3 #define HAS_RGBATOYROW_SSSE3
#define HAS_SETROW_X86 #define HAS_SETROW_X86
#define HAS_ARGBSETROWS_X86 #define HAS_SETROW_ERMS
#define HAS_ARGBSETROW_X86
#define HAS_SPLITUVROW_SSE2 #define HAS_SPLITUVROW_SSE2
#define HAS_UYVYTOARGBROW_SSSE3 #define HAS_UYVYTOARGBROW_SSSE3
#define HAS_UYVYTOUV422ROW_SSE2 #define HAS_UYVYTOUV422ROW_SSE2
...@@ -302,7 +303,8 @@ extern "C" { ...@@ -302,7 +303,8 @@ extern "C" {
#define HAS_RGB565TOYROW_NEON #define HAS_RGB565TOYROW_NEON
#define HAS_RGBATOUVROW_NEON #define HAS_RGBATOUVROW_NEON
#define HAS_RGBATOYROW_NEON #define HAS_RGBATOYROW_NEON
// #define HAS_SETROW_NEON #define HAS_SETROW_NEON
#define HAS_ARGBSETROW_NEON
#define HAS_SPLITUVROW_NEON #define HAS_SPLITUVROW_NEON
#define HAS_UYVYTOARGBROW_NEON #define HAS_UYVYTOARGBROW_NEON
#define HAS_UYVYTOUV422ROW_NEON #define HAS_UYVYTOUV422ROW_NEON
...@@ -332,7 +334,6 @@ extern "C" { ...@@ -332,7 +334,6 @@ extern "C" {
#define HAS_SOBELXYROW_NEON #define HAS_SOBELXYROW_NEON
#define HAS_SOBELYROW_NEON #define HAS_SOBELYROW_NEON
#define HAS_ARGBCOLORMATRIXROW_NEON #define HAS_ARGBCOLORMATRIXROW_NEON
// #define HAS_ARGBSETROWS_NEON
#define HAS_ARGBSHUFFLEROW_NEON #define HAS_ARGBSHUFFLEROW_NEON
#endif #endif
...@@ -800,15 +801,17 @@ void ARGBCopyYToAlphaRow_C(const uint8* src_y, uint8* dst_argb, int width); ...@@ -800,15 +801,17 @@ void ARGBCopyYToAlphaRow_C(const uint8* src_y, uint8* dst_argb, int width);
void ARGBCopyYToAlphaRow_SSE2(const uint8* src_y, uint8* dst_argb, int width); void ARGBCopyYToAlphaRow_SSE2(const uint8* src_y, uint8* dst_argb, int width);
void ARGBCopyYToAlphaRow_AVX2(const uint8* src_y, uint8* dst_argb, int width); void ARGBCopyYToAlphaRow_AVX2(const uint8* src_y, uint8* dst_argb, int width);
void SetRow_C(uint8* dst, uint32 v32, int count); void SetRow_C(uint8* dst, uint8 v8, int count);
void SetRow_X86(uint8* dst, uint32 v32, int count); void SetRow_X86(uint8* dst, uint8 v8, int count);
void SetRow_NEON(uint8* dst, uint32 v32, int count); void SetRow_ERMS(uint8* dst, uint8 v8, int count);
void ARGBSetRows_C(uint8* dst, uint32 v32, int width, int dst_stride, void SetRow_NEON(uint8* dst, uint8 v8, int count);
int height); void SetRow_Any_X86(uint8* dst, uint8 v8, int count);
void ARGBSetRows_X86(uint8* dst, uint32 v32, int width, void SetRow_Any_NEON(uint8* dst, uint8 v8, int count);
int dst_stride, int height);
void ARGBSetRows_NEON(uint8* dst, uint32 v32, int width, void ARGBSetRow_C(uint8* dst_argb, uint32 v32, int count);
int dst_stride, int height); void ARGBSetRow_X86(uint8* dst_argb, uint32 v32, int count);
void ARGBSetRow_NEON(uint8* dst_argb, uint32 v32, int count);
void ARGBSetRow_Any_NEON(uint8* dst_argb, uint32 v32, int count);
// ARGBShufflers for BGRAToARGB etc. // ARGBShufflers for BGRAToARGB etc.
void ARGBShuffleRow_C(const uint8* src_argb, uint8* dst_argb, void ARGBShuffleRow_C(const uint8* src_argb, uint8* dst_argb,
......
...@@ -11,6 +11,6 @@ ...@@ -11,6 +11,6 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT #ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT
#define INCLUDE_LIBYUV_VERSION_H_ #define INCLUDE_LIBYUV_VERSION_H_
#define LIBYUV_VERSION 1220 #define LIBYUV_VERSION 1222
#endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT #endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT
...@@ -1094,8 +1094,7 @@ void SetPlane(uint8* dst_y, int dst_stride_y, ...@@ -1094,8 +1094,7 @@ void SetPlane(uint8* dst_y, int dst_stride_y,
int width, int height, int width, int height,
uint32 value) { uint32 value) {
int y; int y;
uint32 v32 = value | (value << 8) | (value << 16) | (value << 24); void (*SetRow)(uint8* dst, uint8 value, int pix) = SetRow_C;
void (*SetRow)(uint8* dst, uint32 value, int pix) = SetRow_C;
if (height < 0) { if (height < 0) {
height = -height; height = -height;
dst_y = dst_y + (height - 1) * dst_stride_y; dst_y = dst_y + (height - 1) * dst_stride_y;
...@@ -1108,19 +1107,30 @@ void SetPlane(uint8* dst_y, int dst_stride_y, ...@@ -1108,19 +1107,30 @@ void SetPlane(uint8* dst_y, int dst_stride_y,
dst_stride_y = 0; dst_stride_y = 0;
} }
#if defined(HAS_SETROW_NEON) #if defined(HAS_SETROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 16)) { if (TestCpuFlag(kCpuHasNEON)) {
SetRow = SetRow_NEON; SetRow = SetRow_Any_NEON;
if (IS_ALIGNED(width, 16)) {
SetRow = SetRow_NEON;
}
} }
#endif #endif
#if defined(HAS_SETROW_X86) #if defined(HAS_SETROW_X86)
if (TestCpuFlag(kCpuHasX86) && IS_ALIGNED(width, 4)) { if (TestCpuFlag(kCpuHasX86)) {
SetRow = SetRow_X86; SetRow = SetRow_Any_X86;
if (IS_ALIGNED(width, 4)) {
SetRow = SetRow_X86;
}
}
#endif
#if defined(HAS_SETROW_ERMS)
if (TestCpuFlag(kCpuHasERMS)) {
SetRow = SetRow_ERMS;
} }
#endif #endif
// Set plane // Set plane
for (y = 0; y < height; ++y) { for (y = 0; y < height; ++y) {
SetRow(dst_y, v32, width); SetRow(dst_y, value, width);
dst_y += dst_stride_y; dst_y += dst_stride_y;
} }
} }
...@@ -1139,7 +1149,7 @@ int I420Rect(uint8* dst_y, int dst_stride_y, ...@@ -1139,7 +1149,7 @@ int I420Rect(uint8* dst_y, int dst_stride_y,
uint8* start_u = dst_u + (y / 2) * dst_stride_u + (x / 2); uint8* start_u = dst_u + (y / 2) * dst_stride_u + (x / 2);
uint8* start_v = dst_v + (y / 2) * dst_stride_v + (x / 2); uint8* start_v = dst_v + (y / 2) * dst_stride_v + (x / 2);
if (!dst_y || !dst_u || !dst_v || if (!dst_y || !dst_u || !dst_v ||
width <= 0 || height <= 0 || width <= 0 || height == 0 ||
x < 0 || y < 0 || x < 0 || y < 0 ||
value_y < 0 || value_y > 255 || value_y < 0 || value_y > 255 ||
value_u < 0 || value_u > 255 || value_u < 0 || value_u > 255 ||
...@@ -1159,6 +1169,8 @@ int ARGBRect(uint8* dst_argb, int dst_stride_argb, ...@@ -1159,6 +1169,8 @@ int ARGBRect(uint8* dst_argb, int dst_stride_argb,
int dst_x, int dst_y, int dst_x, int dst_y,
int width, int height, int width, int height,
uint32 value) { uint32 value) {
int y;
void (*ARGBSetRow)(uint8* dst_argb, uint32 value, int pix) = ARGBSetRow_C;
if (!dst_argb || if (!dst_argb ||
width <= 0 || height == 0 || width <= 0 || height == 0 ||
dst_x < 0 || dst_y < 0) { dst_x < 0 || dst_y < 0) {
...@@ -1176,19 +1188,26 @@ int ARGBRect(uint8* dst_argb, int dst_stride_argb, ...@@ -1176,19 +1188,26 @@ int ARGBRect(uint8* dst_argb, int dst_stride_argb,
height = 1; height = 1;
dst_stride_argb = 0; dst_stride_argb = 0;
} }
#if defined(HAS_ARGBSETROWS_NEON)
if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 16)) { #if defined(HAS_ARGBSETROW_NEON)
ARGBSetRows_NEON(dst_argb, value, width, dst_stride_argb, height); if (TestCpuFlag(kCpuHasNEON)) {
return 0; ARGBSetRow = ARGBSetRow_Any_NEON;
if (IS_ALIGNED(width, 4)) {
ARGBSetRow = ARGBSetRow_NEON;
}
} }
#endif #endif
#if defined(HAS_ARGBSETROWS_X86) #if defined(HAS_ARGBSETROW_X86)
if (TestCpuFlag(kCpuHasX86)) { if (TestCpuFlag(kCpuHasX86)) {
ARGBSetRows_X86(dst_argb, value, width, dst_stride_argb, height); ARGBSetRow = ARGBSetRow_X86;
return 0;
} }
#endif #endif
ARGBSetRows_C(dst_argb, value, width, dst_stride_argb, height);
// Set plane
for (y = 0; y < height; ++y) {
ARGBSetRow(dst_argb, value, width);
dst_argb += dst_stride_argb;
}
return 0; return 0;
} }
......
...@@ -681,6 +681,27 @@ MANY(CopyRow_Any_NEON, CopyRow_NEON, CopyRow_C, 1, 31) ...@@ -681,6 +681,27 @@ MANY(CopyRow_Any_NEON, CopyRow_NEON, CopyRow_C, 1, 31)
#endif #endif
#undef MANY #undef MANY
#define SETANY(NAMEANY, SET_SIMD, SET_C, T, BPP, MASK) \
void NAMEANY(uint8* dst_y, T v8, int width) { \
int n = width & ~MASK; \
int r = width & MASK; \
if (n > 0) { \
SET_SIMD(dst_y, v8, n); \
} \
SET_C(dst_y + n * BPP, v8, r); \
}
#ifdef HAS_SETROW_X86
SETANY(SetRow_Any_X86, SetRow_X86, SetRow_ERMS, uint8, 1, 3)
#endif
#ifdef HAS_SETROW_NEON
SETANY(SetRow_Any_NEON, SetRow_NEON, SetRow_C, uint8, 1, 15)
#endif
#ifdef HAS_ARGBSETROW_NEON
SETANY(ARGBSetRow_Any_NEON, ARGBSetRow_NEON, ARGBSetRow_C, uint32, 4, 3)
#endif
#undef SETANY
#ifdef __cplusplus #ifdef __cplusplus
} // extern "C" } // extern "C"
} // namespace libyuv } // namespace libyuv
......
...@@ -1623,28 +1623,15 @@ void CopyRow_16_C(const uint16* src, uint16* dst, int count) { ...@@ -1623,28 +1623,15 @@ void CopyRow_16_C(const uint16* src, uint16* dst, int count) {
memcpy(dst, src, count * 2); memcpy(dst, src, count * 2);
} }
void SetRow_C(uint8* dst, uint32 v8, int count) { void SetRow_C(uint8* dst, uint8 v8, int width) {
#ifdef _MSC_VER memset(dst, v8, width);
// VisualC will generate rep stosb.
int x;
for (x = 0; x < count; ++x) {
dst[x] = v8;
}
#else
memset(dst, v8, count);
#endif
} }
void ARGBSetRows_C(uint8* dst, uint32 v32, int width, void ARGBSetRow_C(uint8* dst_argb, uint32 v32, int width) {
int dst_stride, int height) { uint32* d = (uint32*)(dst_argb);
int y; int x;
for (y = 0; y < height; ++y) { for (x = 0; x < width; ++x) {
uint32* d = (uint32*)(dst); d[x] = v32;
int x;
for (x = 0; x < width; ++x) {
d[x] = v32;
}
dst += dst_stride;
} }
} }
......
...@@ -846,7 +846,7 @@ void CopyRow_NEON(const uint8* src, uint8* dst, int count) { ...@@ -846,7 +846,7 @@ void CopyRow_NEON(const uint8* src, uint8* dst, int count) {
} }
// SetRow writes 'count' bytes using an 8 bit value repeated. // SetRow writes 'count' bytes using an 8 bit value repeated.
void SetRow_NEON(uint8* dst, uint32 v8, int count) { void SetRow_NEON(uint8* dst, uint8 v8, int count) {
asm volatile ( asm volatile (
"vdup.8 q0, %2 \n" // duplicate 16 bytes "vdup.8 q0, %2 \n" // duplicate 16 bytes
"1: \n" "1: \n"
......
...@@ -736,7 +736,7 @@ void CopyRow_NEON(const uint8* src, uint8* dst, int count) { ...@@ -736,7 +736,7 @@ void CopyRow_NEON(const uint8* src, uint8* dst, int count) {
#endif // HAS_COPYROW_NEON #endif // HAS_COPYROW_NEON
// SetRow writes 'count' bytes using an 8 bit value repeated. // SetRow writes 'count' bytes using an 8 bit value repeated.
void SetRow_NEON(uint8* dst, uint32 v8, int count) { void SetRow_NEON(uint8* dst, uint8 v8, int count) {
asm volatile ( asm volatile (
"dup v0.16b, %w2 \n" // duplicate 16 bytes "dup v0.16b, %w2 \n" // duplicate 16 bytes
"1: \n" "1: \n"
......
...@@ -2893,10 +2893,10 @@ void ARGBCopyYToAlphaRow_AVX2(const uint8* src, uint8* dst, int width) { ...@@ -2893,10 +2893,10 @@ void ARGBCopyYToAlphaRow_AVX2(const uint8* src, uint8* dst, int width) {
#endif // HAS_ARGBCOPYYTOALPHAROW_AVX2 #endif // HAS_ARGBCOPYYTOALPHAROW_AVX2
#ifdef HAS_SETROW_X86 #ifdef HAS_SETROW_X86
void SetRow_X86(uint8* dst, uint32 v32, int width) { void SetRow_X86(uint8* dst, uint8 v8, int width) {
size_t width_tmp = (size_t)(width); size_t width_tmp = (size_t)(width >> 2);
const uint32 v32 = v8 * 0x01010101; // Duplicate byte to all bytes.
asm volatile ( asm volatile (
"shr $0x2,%1 \n"
"rep stosl " MEMSTORESTRING(eax,0) " \n" "rep stosl " MEMSTORESTRING(eax,0) " \n"
: "+D"(dst), // %0 : "+D"(dst), // %0
"+c"(width_tmp) // %1 "+c"(width_tmp) // %1
...@@ -2904,19 +2904,24 @@ void SetRow_X86(uint8* dst, uint32 v32, int width) { ...@@ -2904,19 +2904,24 @@ void SetRow_X86(uint8* dst, uint32 v32, int width) {
: "memory", "cc"); : "memory", "cc");
} }
void ARGBSetRows_X86(uint8* dst, uint32 v32, int width, void SetRow_ERMS(uint8* dst, uint8 v8, int width) {
int dst_stride, int height) { size_t width_tmp = (size_t)(width);
for (int y = 0; y < height; ++y) { asm volatile (
size_t width_tmp = (size_t)(width); "rep stosb " MEMSTORESTRING(al,0) " \n"
uint32* d = (uint32*)(dst); : "+D"(dst), // %0
asm volatile ( "+c"(width_tmp) // %1
"rep stosl " MEMSTORESTRING(eax,0) " \n" : "a"(v8) // %2
: "+D"(d), // %0 : "memory", "cc");
"+c"(width_tmp) // %1 }
: "a"(v32) // %2
: "memory", "cc"); void ARGBSetRow_X86(uint8* dst_argb, uint32 v32, int width) {
dst += dst_stride; size_t width_tmp = (size_t)(width);
} asm volatile (
"rep stosl " MEMSTORESTRING(eax,0) " \n"
: "+D"(dst_argb), // %0
"+c"(width_tmp) // %1
: "a"(v32) // %2
: "memory", "cc");
} }
#endif // HAS_SETROW_X86 #endif // HAS_SETROW_X86
......
...@@ -2848,13 +2848,16 @@ void ARGBCopyYToAlphaRow_AVX2(const uint8* src, uint8* dst, int width) { ...@@ -2848,13 +2848,16 @@ void ARGBCopyYToAlphaRow_AVX2(const uint8* src, uint8* dst, int width) {
#endif // HAS_ARGBCOPYYTOALPHAROW_AVX2 #endif // HAS_ARGBCOPYYTOALPHAROW_AVX2
#ifdef HAS_SETROW_X86 #ifdef HAS_SETROW_X86
// SetRow writes 'count' bytes using a 32 bit value repeated. // Write 'count' bytes using an 8 bit value repeated.
// Count should be multiple of 4.
__declspec(naked) __declspec(align(16)) __declspec(naked) __declspec(align(16))
void SetRow_X86(uint8* dst, uint32 v32, int count) { void SetRow_X86(uint8* dst, uint8 v8, int count) {
__asm { __asm {
movzx eax, byte ptr [esp + 8] // v8
mov edx, 0x01010101 // Duplicate byte to all bytes.
mul edx // overwrites edx with upper part of result.
mov edx, edi mov edx, edi
mov edi, [esp + 4] // dst mov edi, [esp + 4] // dst
mov eax, [esp + 8] // v32
mov ecx, [esp + 12] // count mov ecx, [esp + 12] // count
shr ecx, 2 shr ecx, 2
rep stosd rep stosd
...@@ -2863,32 +2866,30 @@ void SetRow_X86(uint8* dst, uint32 v32, int count) { ...@@ -2863,32 +2866,30 @@ void SetRow_X86(uint8* dst, uint32 v32, int count) {
} }
} }
// SetRow32 writes 'count' words using a 32 bit value repeated. // Write 'count' bytes using an 8 bit value repeated.
__declspec(naked) __declspec(align(16)) __declspec(naked) __declspec(align(16))
void ARGBSetRows_X86(uint8* dst, uint32 v32, int width, void SetRow_ERMS(uint8* dst, uint8 v8, int count) {
int dst_stride, int height) {
__asm { __asm {
push esi mov edx, edi
push edi mov edi, [esp + 4] // dst
push ebp mov eax, [esp + 8] // v8
mov edi, [esp + 12 + 4] // dst mov ecx, [esp + 12] // count
mov eax, [esp + 12 + 8] // v32 rep stosb
mov ebp, [esp + 12 + 12] // width mov edi, edx
mov edx, [esp + 12 + 16] // dst_stride ret
mov esi, [esp + 12 + 20] // height }
lea ecx, [ebp * 4] }
sub edx, ecx // stride - width * 4
convertloop: // Write 'count' 32 bit values.
mov ecx, ebp __declspec(naked) __declspec(align(16))
void ARGBSetRow_X86(uint8* dst_argb, uint32 v32, int count) {
__asm {
mov edx, edi
mov edi, [esp + 4] // dst
mov eax, [esp + 8] // v32
mov ecx, [esp + 12] // count
rep stosd rep stosd
add edi, edx mov edi, edx
sub esi, 1
jg convertloop
pop ebp
pop edi
pop esi
ret ret
} }
} }
......
Markdown is supported
0% Try again or attach a new file
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment