Commit b2a6af1b authored by fbarchard@google.com's avatar fbarchard@google.com

Change rectangle low level functions to use more conventional row functions…

Change the rectangle low-level functions to use more conventional row functions, including 'any' variations. Previously, the YUV function SetPlane stored 32-bit values. Now a more conventional memset()-style row function that stores bytes is used for YUV. On Haswell, a rep stosb is used for YUV. The overall benefit of this CL is improved performance for 'any' width, and simpler row assembly instead of full-image assembly. Previously, ARGBRect used a low-level function that handled a whole rectangle in assembly. Now it uses a row function and relies on row coalescing to combine the rows into a single low-level call.
BUG=371
TESTED=untested
R=brucedawson@google.com, harryjin@google.com

Review URL: https://webrtc-codereview.appspot.com/35689004

git-svn-id: http://libyuv.googlecode.com/svn/trunk@1222 16f28f9a-4ce2-e073-06de-1de4eb20be90
parent 89671c4d
Name: libyuv
URL: http://code.google.com/p/libyuv/
Version: 1220
Version: 1222
License: BSD
License File: LICENSE
......
......@@ -114,7 +114,8 @@ extern "C" {
#define HAS_RGBATOUVROW_SSSE3
#define HAS_RGBATOYROW_SSSE3
#define HAS_SETROW_X86
#define HAS_ARGBSETROWS_X86
#define HAS_SETROW_ERMS
#define HAS_ARGBSETROW_X86
#define HAS_SPLITUVROW_SSE2
#define HAS_UYVYTOARGBROW_SSSE3
#define HAS_UYVYTOUV422ROW_SSE2
......@@ -302,7 +303,8 @@ extern "C" {
#define HAS_RGB565TOYROW_NEON
#define HAS_RGBATOUVROW_NEON
#define HAS_RGBATOYROW_NEON
// #define HAS_SETROW_NEON
#define HAS_SETROW_NEON
#define HAS_ARGBSETROW_NEON
#define HAS_SPLITUVROW_NEON
#define HAS_UYVYTOARGBROW_NEON
#define HAS_UYVYTOUV422ROW_NEON
......@@ -332,7 +334,6 @@ extern "C" {
#define HAS_SOBELXYROW_NEON
#define HAS_SOBELYROW_NEON
#define HAS_ARGBCOLORMATRIXROW_NEON
// #define HAS_ARGBSETROWS_NEON
#define HAS_ARGBSHUFFLEROW_NEON
#endif
......@@ -800,15 +801,17 @@ void ARGBCopyYToAlphaRow_C(const uint8* src_y, uint8* dst_argb, int width);
void ARGBCopyYToAlphaRow_SSE2(const uint8* src_y, uint8* dst_argb, int width);
void ARGBCopyYToAlphaRow_AVX2(const uint8* src_y, uint8* dst_argb, int width);
void SetRow_C(uint8* dst, uint32 v32, int count);
void SetRow_X86(uint8* dst, uint32 v32, int count);
void SetRow_NEON(uint8* dst, uint32 v32, int count);
void ARGBSetRows_C(uint8* dst, uint32 v32, int width, int dst_stride,
int height);
void ARGBSetRows_X86(uint8* dst, uint32 v32, int width,
int dst_stride, int height);
void ARGBSetRows_NEON(uint8* dst, uint32 v32, int width,
int dst_stride, int height);
// SetRow: write 'count' bytes of the 8 bit value v8 to dst.
// SetRow_X86 expects count to be a multiple of 4.  The _Any_ variants accept
// any count: they run the SIMD version on the aligned part and a fallback on
// the remainder.
void SetRow_C(uint8* dst, uint8 v8, int count);
void SetRow_X86(uint8* dst, uint8 v8, int count);
void SetRow_ERMS(uint8* dst, uint8 v8, int count);
void SetRow_NEON(uint8* dst, uint8 v8, int count);
void SetRow_Any_X86(uint8* dst, uint8 v8, int count);
void SetRow_Any_NEON(uint8* dst, uint8 v8, int count);
// ARGBSetRow: write 'count' 32 bit values of v32 to dst_argb.
void ARGBSetRow_C(uint8* dst_argb, uint32 v32, int count);
void ARGBSetRow_X86(uint8* dst_argb, uint32 v32, int count);
void ARGBSetRow_NEON(uint8* dst_argb, uint32 v32, int count);
void ARGBSetRow_Any_NEON(uint8* dst_argb, uint32 v32, int count);
// ARGBShufflers for BGRAToARGB etc.
void ARGBShuffleRow_C(const uint8* src_argb, uint8* dst_argb,
......
......@@ -11,6 +11,6 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT
#define INCLUDE_LIBYUV_VERSION_H_
#define LIBYUV_VERSION 1220
#define LIBYUV_VERSION 1222
#endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT
......@@ -1094,8 +1094,7 @@ void SetPlane(uint8* dst_y, int dst_stride_y,
int width, int height,
uint32 value) {
int y;
uint32 v32 = value | (value << 8) | (value << 16) | (value << 24);
void (*SetRow)(uint8* dst, uint32 value, int pix) = SetRow_C;
void (*SetRow)(uint8* dst, uint8 value, int pix) = SetRow_C;
if (height < 0) {
height = -height;
dst_y = dst_y + (height - 1) * dst_stride_y;
......@@ -1108,19 +1107,30 @@ void SetPlane(uint8* dst_y, int dst_stride_y,
dst_stride_y = 0;
}
#if defined(HAS_SETROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 16)) {
SetRow = SetRow_NEON;
if (TestCpuFlag(kCpuHasNEON)) {
SetRow = SetRow_Any_NEON;
if (IS_ALIGNED(width, 16)) {
SetRow = SetRow_NEON;
}
}
#endif
#if defined(HAS_SETROW_X86)
if (TestCpuFlag(kCpuHasX86) && IS_ALIGNED(width, 4)) {
SetRow = SetRow_X86;
if (TestCpuFlag(kCpuHasX86)) {
SetRow = SetRow_Any_X86;
if (IS_ALIGNED(width, 4)) {
SetRow = SetRow_X86;
}
}
#endif
#if defined(HAS_SETROW_ERMS)
if (TestCpuFlag(kCpuHasERMS)) {
SetRow = SetRow_ERMS;
}
#endif
// Set plane
for (y = 0; y < height; ++y) {
SetRow(dst_y, v32, width);
SetRow(dst_y, value, width);
dst_y += dst_stride_y;
}
}
......@@ -1139,7 +1149,7 @@ int I420Rect(uint8* dst_y, int dst_stride_y,
uint8* start_u = dst_u + (y / 2) * dst_stride_u + (x / 2);
uint8* start_v = dst_v + (y / 2) * dst_stride_v + (x / 2);
if (!dst_y || !dst_u || !dst_v ||
width <= 0 || height <= 0 ||
width <= 0 || height == 0 ||
x < 0 || y < 0 ||
value_y < 0 || value_y > 255 ||
value_u < 0 || value_u > 255 ||
......@@ -1159,6 +1169,8 @@ int ARGBRect(uint8* dst_argb, int dst_stride_argb,
int dst_x, int dst_y,
int width, int height,
uint32 value) {
int y;
void (*ARGBSetRow)(uint8* dst_argb, uint32 value, int pix) = ARGBSetRow_C;
if (!dst_argb ||
width <= 0 || height == 0 ||
dst_x < 0 || dst_y < 0) {
......@@ -1176,19 +1188,26 @@ int ARGBRect(uint8* dst_argb, int dst_stride_argb,
height = 1;
dst_stride_argb = 0;
}
#if defined(HAS_ARGBSETROWS_NEON)
if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 16)) {
ARGBSetRows_NEON(dst_argb, value, width, dst_stride_argb, height);
return 0;
#if defined(HAS_ARGBSETROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
ARGBSetRow = ARGBSetRow_Any_NEON;
if (IS_ALIGNED(width, 4)) {
ARGBSetRow = ARGBSetRow_NEON;
}
}
#endif
#if defined(HAS_ARGBSETROWS_X86)
#if defined(HAS_ARGBSETROW_X86)
if (TestCpuFlag(kCpuHasX86)) {
ARGBSetRows_X86(dst_argb, value, width, dst_stride_argb, height);
return 0;
ARGBSetRow = ARGBSetRow_X86;
}
#endif
ARGBSetRows_C(dst_argb, value, width, dst_stride_argb, height);
// Set plane
for (y = 0; y < height; ++y) {
ARGBSetRow(dst_argb, value, width);
dst_argb += dst_stride_argb;
}
return 0;
}
......
......@@ -681,6 +681,27 @@ MANY(CopyRow_Any_NEON, CopyRow_NEON, CopyRow_C, 1, 31)
#endif
#undef MANY
// SETANY defines NAMEANY(dst_y, v8, width), a set-row wrapper that accepts
// any width.  MASK must be one less than a power of two: the largest multiple
// of (MASK + 1) pixels is written with SET_SIMD, and the remaining
// (width & MASK) pixels are written with SET_C.  BPP is the number of bytes
// per pixel, used to locate the start of the remainder.  T is the type of the
// fill value.  MASK and BPP are parenthesized so that expression arguments
// expand with the intended precedence.
#define SETANY(NAMEANY, SET_SIMD, SET_C, T, BPP, MASK)                         \
  void NAMEANY(uint8* dst_y, T v8, int width) {                                \
    int n = width & ~(MASK);                                                   \
    int r = width & (MASK);                                                    \
    if (n > 0) {                                                               \
      SET_SIMD(dst_y, v8, n);                                                  \
    }                                                                          \
    SET_C(dst_y + n * (BPP), v8, r);                                           \
  }
// Any-width byte fill for x86: SetRow_X86 covers the multiple-of-4 part and
// SetRow_ERMS writes the remaining 0..3 bytes.
#ifdef HAS_SETROW_X86
SETANY(SetRow_Any_X86, SetRow_X86, SetRow_ERMS, uint8, 1, 3)
#endif
// Any-width byte fill for NEON: SetRow_NEON covers the multiple-of-16 part
// and SetRow_C writes the remaining 0..15 bytes.
#ifdef HAS_SETROW_NEON
SETANY(SetRow_Any_NEON, SetRow_NEON, SetRow_C, uint8, 1, 15)
#endif
// Any-width ARGB fill for NEON: ARGBSetRow_NEON covers the multiple-of-4
// pixel part and ARGBSetRow_C writes the remaining 0..3 pixels (4 bytes each).
#ifdef HAS_ARGBSETROW_NEON
SETANY(ARGBSetRow_Any_NEON, ARGBSetRow_NEON, ARGBSetRow_C, uint32, 4, 3)
#endif
// The generator macro is only needed for the instantiations above.
#undef SETANY
#ifdef __cplusplus
} // extern "C"
} // namespace libyuv
......
......@@ -1623,28 +1623,15 @@ void CopyRow_16_C(const uint16* src, uint16* dst, int count) {
memcpy(dst, src, count * 2);
}
void SetRow_C(uint8* dst, uint32 v8, int count) {
#ifdef _MSC_VER
// VisualC will generate rep stosb.
int x;
for (x = 0; x < count; ++x) {
dst[x] = v8;
}
#else
memset(dst, v8, count);
#endif
// Portable byte fill: stores the value v8 into each of the first 'width'
// bytes at dst.
void SetRow_C(uint8* dst, uint8 v8, int width) {
  const size_t num_bytes = (size_t)(width);
  memset(dst, v8, num_bytes);
}
void ARGBSetRows_C(uint8* dst, uint32 v32, int width,
int dst_stride, int height) {
int y;
for (y = 0; y < height; ++y) {
uint32* d = (uint32*)(dst);
int x;
for (x = 0; x < width; ++x) {
d[x] = v32;
}
dst += dst_stride;
// Writes 'width' 32 bit values of v32 to dst_argb (4 bytes per pixel).
// The bytes land in memory in the host's native byte order, exactly as a
// uint32 store would place them.  memcpy is used instead of storing through a
// uint32* cast because dst_argb is a byte pointer with no alignment
// guarantee, and a store through a misaligned or type-punned pointer is
// undefined behavior.  Compilers lower the fixed-size memcpy to a plain store.
void ARGBSetRow_C(uint8* dst_argb, uint32 v32, int width) {
  int x;
  for (x = 0; x < width; ++x) {
    memcpy(dst_argb, &v32, sizeof(v32));
    dst_argb += sizeof(v32);  // Advance one pixel.
  }
}
......
......@@ -846,7 +846,7 @@ void CopyRow_NEON(const uint8* src, uint8* dst, int count) {
}
// SetRow writes 'count' bytes using an 8 bit value repeated.
void SetRow_NEON(uint8* dst, uint32 v8, int count) {
void SetRow_NEON(uint8* dst, uint8 v8, int count) {
asm volatile (
"vdup.8 q0, %2 \n" // duplicate 16 bytes
"1: \n"
......
......@@ -736,7 +736,7 @@ void CopyRow_NEON(const uint8* src, uint8* dst, int count) {
#endif // HAS_COPYROW_NEON
// SetRow writes 'count' bytes using an 8 bit value repeated.
void SetRow_NEON(uint8* dst, uint32 v8, int count) {
void SetRow_NEON(uint8* dst, uint8 v8, int count) {
asm volatile (
"dup v0.16b, %w2 \n" // duplicate 16 bytes
"1: \n"
......
......@@ -2893,10 +2893,10 @@ void ARGBCopyYToAlphaRow_AVX2(const uint8* src, uint8* dst, int width) {
#endif // HAS_ARGBCOPYYTOALPHAROW_AVX2
#ifdef HAS_SETROW_X86
void SetRow_X86(uint8* dst, uint32 v32, int width) {
size_t width_tmp = (size_t)(width);
void SetRow_X86(uint8* dst, uint8 v8, int width) {
size_t width_tmp = (size_t)(width >> 2);
const uint32 v32 = v8 * 0x01010101; // Duplicate byte to all bytes.
asm volatile (
"shr $0x2,%1 \n"
"rep stosl " MEMSTORESTRING(eax,0) " \n"
: "+D"(dst), // %0
"+c"(width_tmp) // %1
......@@ -2904,19 +2904,24 @@ void SetRow_X86(uint8* dst, uint32 v32, int width) {
: "memory", "cc");
}
void ARGBSetRows_X86(uint8* dst, uint32 v32, int width,
int dst_stride, int height) {
for (int y = 0; y < height; ++y) {
size_t width_tmp = (size_t)(width);
uint32* d = (uint32*)(dst);
asm volatile (
"rep stosl " MEMSTORESTRING(eax,0) " \n"
: "+D"(d), // %0
"+c"(width_tmp) // %1
: "a"(v32) // %2
: "memory", "cc");
dst += dst_stride;
}
// Writes 'width' bytes of the value v8 to dst using a single 'rep stosb'.
// NOTE(review): presumably named after the ERMS (enhanced rep movsb/stosb)
// CPU feature, on which this instruction is fast -- confirm against the
// kCpuHasERMS dispatch in the caller.
void SetRow_ERMS(uint8* dst, uint8 v8, int width) {
// The repeat count is passed in the count register via the "+c" constraint.
size_t width_tmp = (size_t)(width);
// NOTE(review): MEMSTORESTRING is defined elsewhere; verify the operand text
// it expands to for this build.
asm volatile (
"rep stosb " MEMSTORESTRING(al,0) " \n"
// "+D": destination pointer in the di register, advanced by the instruction.
: "+D"(dst), // %0
"+c"(width_tmp) // %1
// "a": fill value in the a register; stosb stores its low byte.
: "a"(v8) // %2
: "memory", "cc");
}
// Writes 'width' 32 bit values of v32 to dst_argb using a single 'rep stosl'
// (one 4 byte store per count).
void ARGBSetRow_X86(uint8* dst_argb, uint32 v32, int width) {
// The repeat count (in 32 bit units) is passed in the count register via the
// "+c" constraint.
size_t width_tmp = (size_t)(width);
// NOTE(review): MEMSTORESTRING is defined elsewhere; verify the operand text
// it expands to for this build.
asm volatile (
"rep stosl " MEMSTORESTRING(eax,0) " \n"
// "+D": destination pointer in the di register, advanced by the instruction.
: "+D"(dst_argb), // %0
"+c"(width_tmp) // %1
// "a": the 32 bit fill value in eax.
: "a"(v32) // %2
: "memory", "cc");
}
#endif // HAS_SETROW_X86
......
......@@ -2848,13 +2848,16 @@ void ARGBCopyYToAlphaRow_AVX2(const uint8* src, uint8* dst, int width) {
#endif // HAS_ARGBCOPYYTOALPHAROW_AVX2
#ifdef HAS_SETROW_X86
// SetRow writes 'count' bytes using a 32 bit value repeated.
// Write 'count' bytes using an 8 bit value repeated.
// Count should be multiple of 4.
__declspec(naked) __declspec(align(16))
void SetRow_X86(uint8* dst, uint32 v32, int count) {
void SetRow_X86(uint8* dst, uint8 v8, int count) {
__asm {
movzx eax, byte ptr [esp + 8] // v8
mov edx, 0x01010101 // Duplicate byte to all bytes.
mul edx // overwrites edx with upper part of result.
mov edx, edi
mov edi, [esp + 4] // dst
mov eax, [esp + 8] // v32
mov ecx, [esp + 12] // count
shr ecx, 2
rep stosd
......@@ -2863,32 +2866,30 @@ void SetRow_X86(uint8* dst, uint32 v32, int count) {
}
}
// SetRow32 writes 'count' words using a 32 bit value repeated.
// Write 'count' bytes using an 8 bit value repeated.
__declspec(naked) __declspec(align(16))
void ARGBSetRows_X86(uint8* dst, uint32 v32, int width,
int dst_stride, int height) {
void SetRow_ERMS(uint8* dst, uint8 v8, int count) {
__asm {
push esi
push edi
push ebp
mov edi, [esp + 12 + 4] // dst
mov eax, [esp + 12 + 8] // v32
mov ebp, [esp + 12 + 12] // width
mov edx, [esp + 12 + 16] // dst_stride
mov esi, [esp + 12 + 20] // height
lea ecx, [ebp * 4]
sub edx, ecx // stride - width * 4
mov edx, edi
mov edi, [esp + 4] // dst
mov eax, [esp + 8] // v8
mov ecx, [esp + 12] // count
rep stosb
mov edi, edx
ret
}
}
convertloop:
mov ecx, ebp
// Write 'count' 32 bit values.
__declspec(naked) __declspec(align(16))
void ARGBSetRow_X86(uint8* dst_argb, uint32 v32, int count) {
__asm {
mov edx, edi
mov edi, [esp + 4] // dst
mov eax, [esp + 8] // v32
mov ecx, [esp + 12] // count
rep stosd
add edi, edx
sub esi, 1
jg convertloop
pop ebp
pop edi
pop esi
mov edi, edx
ret
}
}
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment