Commit 810cd910 authored by fbarchard@google.com's avatar fbarchard@google.com

ARGBUnattenuateRow_SSE2: use a reciprocal table and pmulhuw

BUG=none
TEST=none
Review URL: https://webrtc-codereview.appspot.com/497001

git-svn-id: http://libyuv.googlecode.com/svn/trunk@244 16f28f9a-4ce2-e073-06de-1de4eb20be90
parent f2c86d01
...@@ -10,6 +10,7 @@ ...@@ -10,6 +10,7 @@
#include "libyuv/planar_functions.h" #include "libyuv/planar_functions.h"
#include <stdio.h> // printf()
#include <string.h> // for memset() #include <string.h> // for memset()
#include "libyuv/cpu_id.h" #include "libyuv/cpu_id.h"
...@@ -909,80 +910,6 @@ int ARGBAttenuate(const uint8* src_argb, int src_stride_argb, ...@@ -909,80 +910,6 @@ int ARGBAttenuate(const uint8* src_argb, int src_stride_argb,
return 0; return 0;
} }
// Divide source RGB by alpha and store to destination.
// b = (b * 255 + (a / 2)) / a;
// g = (g * 255 + (a / 2)) / a;
// r = (r * 255 + (a / 2)) / a;
// Reciprocal method is off by 1 on some values. ie 125
// 8.16 fixed point inverse table
// T(a) = 2^24 / a.  Consumers compute (x * T(a) + 0x8000) >> 16, which is
// approximately x * 256 / a rather than x * 255 / a -- hence the off-by-1
// note above; results are clamped to 255 by the caller.
// Entry 0 is 0 because T(0) would divide by zero; the a == 0 case must be
// special-cased by the caller (see ARGBUnattenuateRow_C).
#define T(a) 0x1000000 / a
static uint32 fixed_invtbl[256] = {
0, T(0x01), T(0x02), T(0x03), T(0x04), T(0x05), T(0x06), T(0x07),
T(0x08), T(0x09), T(0x0a), T(0x0b), T(0x0c), T(0x0d), T(0x0e), T(0x0f),
T(0x10), T(0x11), T(0x12), T(0x13), T(0x14), T(0x15), T(0x16), T(0x17),
T(0x18), T(0x19), T(0x1a), T(0x1b), T(0x1c), T(0x1d), T(0x1e), T(0x1f),
T(0x20), T(0x21), T(0x22), T(0x23), T(0x24), T(0x25), T(0x26), T(0x27),
T(0x28), T(0x29), T(0x2a), T(0x2b), T(0x2c), T(0x2d), T(0x2e), T(0x2f),
T(0x30), T(0x31), T(0x32), T(0x33), T(0x34), T(0x35), T(0x36), T(0x37),
T(0x38), T(0x39), T(0x3a), T(0x3b), T(0x3c), T(0x3d), T(0x3e), T(0x3f),
T(0x40), T(0x41), T(0x42), T(0x43), T(0x44), T(0x45), T(0x46), T(0x47),
T(0x48), T(0x49), T(0x4a), T(0x4b), T(0x4c), T(0x4d), T(0x4e), T(0x4f),
T(0x50), T(0x51), T(0x52), T(0x53), T(0x54), T(0x55), T(0x56), T(0x57),
T(0x58), T(0x59), T(0x5a), T(0x5b), T(0x5c), T(0x5d), T(0x5e), T(0x5f),
T(0x60), T(0x61), T(0x62), T(0x63), T(0x64), T(0x65), T(0x66), T(0x67),
T(0x68), T(0x69), T(0x6a), T(0x6b), T(0x6c), T(0x6d), T(0x6e), T(0x6f),
T(0x70), T(0x71), T(0x72), T(0x73), T(0x74), T(0x75), T(0x76), T(0x77),
T(0x78), T(0x79), T(0x7a), T(0x7b), T(0x7c), T(0x7d), T(0x7e), T(0x7f),
T(0x80), T(0x81), T(0x82), T(0x83), T(0x84), T(0x85), T(0x86), T(0x87),
T(0x88), T(0x89), T(0x8a), T(0x8b), T(0x8c), T(0x8d), T(0x8e), T(0x8f),
T(0x90), T(0x91), T(0x92), T(0x93), T(0x94), T(0x95), T(0x96), T(0x97),
T(0x98), T(0x99), T(0x9a), T(0x9b), T(0x9c), T(0x9d), T(0x9e), T(0x9f),
T(0xa0), T(0xa1), T(0xa2), T(0xa3), T(0xa4), T(0xa5), T(0xa6), T(0xa7),
T(0xa8), T(0xa9), T(0xaa), T(0xab), T(0xac), T(0xad), T(0xae), T(0xaf),
T(0xb0), T(0xb1), T(0xb2), T(0xb3), T(0xb4), T(0xb5), T(0xb6), T(0xb7),
T(0xb8), T(0xb9), T(0xba), T(0xbb), T(0xbc), T(0xbd), T(0xbe), T(0xbf),
T(0xc0), T(0xc1), T(0xc2), T(0xc3), T(0xc4), T(0xc5), T(0xc6), T(0xc7),
T(0xc8), T(0xc9), T(0xca), T(0xcb), T(0xcc), T(0xcd), T(0xce), T(0xcf),
T(0xd0), T(0xd1), T(0xd2), T(0xd3), T(0xd4), T(0xd5), T(0xd6), T(0xd7),
T(0xd8), T(0xd9), T(0xda), T(0xdb), T(0xdc), T(0xdd), T(0xde), T(0xdf),
T(0xe0), T(0xe1), T(0xe2), T(0xe3), T(0xe4), T(0xe5), T(0xe6), T(0xe7),
T(0xe8), T(0xe9), T(0xea), T(0xeb), T(0xec), T(0xed), T(0xee), T(0xef),
T(0xf0), T(0xf1), T(0xf2), T(0xf3), T(0xf4), T(0xf5), T(0xf6), T(0xf7),
T(0xf8), T(0xf9), T(0xfa), T(0xfb), T(0xfc), T(0xfd), T(0xfe), T(0xff) };
#undef T
static void ARGBUnattenuateRow_C(const uint8* src_argb, uint8* dst_argb,
int width) {
for (int i = 0; i < width; ++i) {
uint32 b = src_argb[0];
uint32 g = src_argb[1];
uint32 r = src_argb[2];
const uint32 a = src_argb[3];
if (a) {
const uint32 ia = fixed_invtbl[a]; // 8.16 fixed point
b = (b * ia + 0x8000) >> 16;
g = (g * ia + 0x8000) >> 16;
r = (r * ia + 0x8000) >> 16;
// Clamping should not be necessary but is free in assembly.
if (b > 255) {
b = 255;
}
if (g > 255) {
g = 255;
}
if (r > 255) {
r = 255;
}
}
dst_argb[0] = b;
dst_argb[1] = g;
dst_argb[2] = r;
dst_argb[3] = a;
src_argb += 4;
dst_argb += 4;
}
}
// Convert unattentuated ARGB values to preattenuated ARGB. // Convert unattentuated ARGB values to preattenuated ARGB.
int ARGBUnattenuate(const uint8* src_argb, int src_stride_argb, int ARGBUnattenuate(const uint8* src_argb, int src_stride_argb,
uint8* dst_argb, int dst_stride_argb, uint8* dst_argb, int dst_stride_argb,
...@@ -1010,7 +937,6 @@ int ARGBUnattenuate(const uint8* src_argb, int src_stride_argb, ...@@ -1010,7 +937,6 @@ int ARGBUnattenuate(const uint8* src_argb, int src_stride_argb,
return 0; return 0;
} }
#ifdef __cplusplus #ifdef __cplusplus
} // extern "C" } // extern "C"
} // namespace libyuv } // namespace libyuv
......
...@@ -67,11 +67,8 @@ extern "C" { ...@@ -67,11 +67,8 @@ extern "C" {
#define HAS_ARGBBLENDROW_SSE2 #define HAS_ARGBBLENDROW_SSE2
#define HAS_ARGBBLENDROW_SSSE3 #define HAS_ARGBBLENDROW_SSSE3
#define HAS_ARGBATTENUATE_SSE2 #define HAS_ARGBATTENUATE_SSE2
#endif
// The following are available on Windows 32 bit
#if !defined(YUV_DISABLE_ASM) && defined(_M_IX86)
#define HAS_ARGBATTENUATE_SSSE3 #define HAS_ARGBATTENUATE_SSSE3
#define HAS_ARGBUNATTENUATE_SSE2
#endif #endif
// The following are available on Neon platforms // The following are available on Neon platforms
...@@ -312,11 +309,11 @@ void ARGBToYRow_Any_SSSE3(const uint8* src_argb, uint8* dst_y, int pix); ...@@ -312,11 +309,11 @@ void ARGBToYRow_Any_SSSE3(const uint8* src_argb, uint8* dst_y, int pix);
void BGRAToYRow_Any_SSSE3(const uint8* src_argb, uint8* dst_y, int pix); void BGRAToYRow_Any_SSSE3(const uint8* src_argb, uint8* dst_y, int pix);
void ABGRToYRow_Any_SSSE3(const uint8* src_argb, uint8* dst_y, int pix); void ABGRToYRow_Any_SSSE3(const uint8* src_argb, uint8* dst_y, int pix);
void ARGBToUVRow_Any_SSSE3(const uint8* src_argb0, int src_stride_argb, void ARGBToUVRow_Any_SSSE3(const uint8* src_argb0, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int width); uint8* dst_u, uint8* dst_v, int width);
void BGRAToUVRow_Any_SSSE3(const uint8* src_argb0, int src_stride_argb, void BGRAToUVRow_Any_SSSE3(const uint8* src_argb0, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int width); uint8* dst_u, uint8* dst_v, int width);
void ABGRToUVRow_Any_SSSE3(const uint8* src_argb0, int src_stride_argb, void ABGRToUVRow_Any_SSSE3(const uint8* src_argb0, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int width); uint8* dst_u, uint8* dst_v, int width);
void I420ToARGBRow_Any_NEON(const uint8* y_buf, void I420ToARGBRow_Any_NEON(const uint8* y_buf,
const uint8* u_buf, const uint8* u_buf,
...@@ -370,6 +367,9 @@ void ARGBAttenuateRow_C(const uint8* src_argb, uint8* dst_argb, int width); ...@@ -370,6 +367,9 @@ void ARGBAttenuateRow_C(const uint8* src_argb, uint8* dst_argb, int width);
void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width); void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width);
void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width); void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width);
void ARGBUnattenuateRow_C(const uint8* src_argb, uint8* dst_argb, int width);
void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width);
#ifdef __cplusplus #ifdef __cplusplus
} // extern "C" } // extern "C"
} // namespace libyuv } // namespace libyuv
......
...@@ -700,6 +700,79 @@ void ARGBAttenuateRow_C(const uint8* src_argb, uint8* dst_argb, int width) { ...@@ -700,6 +700,79 @@ void ARGBAttenuateRow_C(const uint8* src_argb, uint8* dst_argb, int width) {
} }
} }
// Divide source RGB by alpha and store to destination.
// b = (b * 255 + (a / 2)) / a;
// g = (g * 255 + (a / 2)) / a;
// r = (r * 255 + (a / 2)) / a;
// Reciprocal method is off by 1 on some values. ie 125
// 8.16 fixed point inverse table
// T(a) = 2^24 / a.  (x * T(a) + 0x8000) >> 16 approximates x * 256 / a,
// slightly overshooting x * 255 / a; ARGBUnattenuateRow_C clamps to 255.
// Entry 0 is 0 (T(0) would divide by zero); a == 0 is special-cased by
// the caller.
#define T(a) 0x1000000 / a
static uint32 fixed_invtbl[256] = {
0, T(0x01), T(0x02), T(0x03), T(0x04), T(0x05), T(0x06), T(0x07),
T(0x08), T(0x09), T(0x0a), T(0x0b), T(0x0c), T(0x0d), T(0x0e), T(0x0f),
T(0x10), T(0x11), T(0x12), T(0x13), T(0x14), T(0x15), T(0x16), T(0x17),
T(0x18), T(0x19), T(0x1a), T(0x1b), T(0x1c), T(0x1d), T(0x1e), T(0x1f),
T(0x20), T(0x21), T(0x22), T(0x23), T(0x24), T(0x25), T(0x26), T(0x27),
T(0x28), T(0x29), T(0x2a), T(0x2b), T(0x2c), T(0x2d), T(0x2e), T(0x2f),
T(0x30), T(0x31), T(0x32), T(0x33), T(0x34), T(0x35), T(0x36), T(0x37),
T(0x38), T(0x39), T(0x3a), T(0x3b), T(0x3c), T(0x3d), T(0x3e), T(0x3f),
T(0x40), T(0x41), T(0x42), T(0x43), T(0x44), T(0x45), T(0x46), T(0x47),
T(0x48), T(0x49), T(0x4a), T(0x4b), T(0x4c), T(0x4d), T(0x4e), T(0x4f),
T(0x50), T(0x51), T(0x52), T(0x53), T(0x54), T(0x55), T(0x56), T(0x57),
T(0x58), T(0x59), T(0x5a), T(0x5b), T(0x5c), T(0x5d), T(0x5e), T(0x5f),
T(0x60), T(0x61), T(0x62), T(0x63), T(0x64), T(0x65), T(0x66), T(0x67),
T(0x68), T(0x69), T(0x6a), T(0x6b), T(0x6c), T(0x6d), T(0x6e), T(0x6f),
T(0x70), T(0x71), T(0x72), T(0x73), T(0x74), T(0x75), T(0x76), T(0x77),
T(0x78), T(0x79), T(0x7a), T(0x7b), T(0x7c), T(0x7d), T(0x7e), T(0x7f),
T(0x80), T(0x81), T(0x82), T(0x83), T(0x84), T(0x85), T(0x86), T(0x87),
T(0x88), T(0x89), T(0x8a), T(0x8b), T(0x8c), T(0x8d), T(0x8e), T(0x8f),
T(0x90), T(0x91), T(0x92), T(0x93), T(0x94), T(0x95), T(0x96), T(0x97),
T(0x98), T(0x99), T(0x9a), T(0x9b), T(0x9c), T(0x9d), T(0x9e), T(0x9f),
T(0xa0), T(0xa1), T(0xa2), T(0xa3), T(0xa4), T(0xa5), T(0xa6), T(0xa7),
T(0xa8), T(0xa9), T(0xaa), T(0xab), T(0xac), T(0xad), T(0xae), T(0xaf),
T(0xb0), T(0xb1), T(0xb2), T(0xb3), T(0xb4), T(0xb5), T(0xb6), T(0xb7),
T(0xb8), T(0xb9), T(0xba), T(0xbb), T(0xbc), T(0xbd), T(0xbe), T(0xbf),
T(0xc0), T(0xc1), T(0xc2), T(0xc3), T(0xc4), T(0xc5), T(0xc6), T(0xc7),
T(0xc8), T(0xc9), T(0xca), T(0xcb), T(0xcc), T(0xcd), T(0xce), T(0xcf),
T(0xd0), T(0xd1), T(0xd2), T(0xd3), T(0xd4), T(0xd5), T(0xd6), T(0xd7),
T(0xd8), T(0xd9), T(0xda), T(0xdb), T(0xdc), T(0xdd), T(0xde), T(0xdf),
T(0xe0), T(0xe1), T(0xe2), T(0xe3), T(0xe4), T(0xe5), T(0xe6), T(0xe7),
T(0xe8), T(0xe9), T(0xea), T(0xeb), T(0xec), T(0xed), T(0xee), T(0xef),
T(0xf0), T(0xf1), T(0xf2), T(0xf3), T(0xf4), T(0xf5), T(0xf6), T(0xf7),
T(0xf8), T(0xf9), T(0xfa), T(0xfb), T(0xfc), T(0xfd), T(0xfe), T(0xff) };
#undef T
void ARGBUnattenuateRow_C(const uint8* src_argb, uint8* dst_argb, int width) {
for (int i = 0; i < width; ++i) {
uint32 b = src_argb[0];
uint32 g = src_argb[1];
uint32 r = src_argb[2];
const uint32 a = src_argb[3];
if (a) {
const uint32 ia = fixed_invtbl[a]; // 8.16 fixed point
b = (b * ia + 0x8000) >> 16;
g = (g * ia + 0x8000) >> 16;
r = (r * ia + 0x8000) >> 16;
// Clamping should not be necessary but is free in assembly.
if (b > 255) {
b = 255;
}
if (g > 255) {
g = 255;
}
if (r > 255) {
r = 255;
}
}
dst_argb[0] = b;
dst_argb[1] = g;
dst_argb[2] = r;
dst_argb[3] = a;
src_argb += 4;
dst_argb += 4;
}
}
#ifdef __cplusplus #ifdef __cplusplus
} // extern "C" } // extern "C"
} // namespace libyuv } // namespace libyuv
......
...@@ -1730,6 +1730,7 @@ void SplitUV_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) { ...@@ -1730,6 +1730,7 @@ void SplitUV_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {
void CopyRow_SSE2(const uint8* src, uint8* dst, int count) { void CopyRow_SSE2(const uint8* src, uint8* dst, int count) {
asm volatile ( asm volatile (
"sub %0,%1 \n" "sub %0,%1 \n"
".p2align 4 \n"
"1: \n" "1: \n"
"movdqa (%0),%%xmm0 \n" "movdqa (%0),%%xmm0 \n"
"movdqa 0x10(%0),%%xmm1 \n" "movdqa 0x10(%0),%%xmm1 \n"
...@@ -2192,9 +2193,9 @@ void ARGBBlendRow_Aligned_SSSE3(const uint8* src_argb0, const uint8* src_argb1, ...@@ -2192,9 +2193,9 @@ void ARGBBlendRow_Aligned_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
"movdqu 0x10(%0),%%xmm3 \n" "movdqu 0x10(%0),%%xmm3 \n"
"lea 0x20(%0),%0 \n" "lea 0x20(%0),%0 \n"
"psrlw $0x8,%%xmm2 \n" "psrlw $0x8,%%xmm2 \n"
"paddusb %%xmm2,%%xmm0 \n" "paddusb %%xmm2,%%xmm0 \n"
"pand %%xmm5,%%xmm1 \n" "pand %%xmm5,%%xmm1 \n"
"paddusb %%xmm1,%%xmm0 \n" "paddusb %%xmm1,%%xmm0 \n"
"sub $0x4,%3 \n" "sub $0x4,%3 \n"
"movdqa %%xmm0,(%2) \n" "movdqa %%xmm0,(%2) \n"
"jle 9f \n" "jle 9f \n"
...@@ -2242,6 +2243,7 @@ void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) { ...@@ -2242,6 +2243,7 @@ void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) {
"pslld $0x18,%%xmm4 \n" "pslld $0x18,%%xmm4 \n"
"pcmpeqb %%xmm5,%%xmm5 \n" "pcmpeqb %%xmm5,%%xmm5 \n"
"psrld $0x8,%%xmm5 \n" "psrld $0x8,%%xmm5 \n"
// 4 pixel loop // 4 pixel loop
"1: \n" "1: \n"
"movdqa (%0),%%xmm0 \n" "movdqa (%0),%%xmm0 \n"
...@@ -2254,13 +2256,13 @@ void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) { ...@@ -2254,13 +2256,13 @@ void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) {
"pshufhw $0xff,%%xmm1,%%xmm2 \n" "pshufhw $0xff,%%xmm1,%%xmm2 \n"
"pshuflw $0xff,%%xmm2,%%xmm2 \n" "pshuflw $0xff,%%xmm2,%%xmm2 \n"
"pmulhuw %%xmm2,%%xmm1 \n" "pmulhuw %%xmm2,%%xmm1 \n"
"movdqa (%0),%%xmm3 \n" "movdqa (%0),%%xmm2 \n"
"psrlw $0x8,%%xmm0 \n" "psrlw $0x8,%%xmm0 \n"
"pand %%xmm4,%%xmm3 \n" "pand %%xmm4,%%xmm2 \n"
"psrlw $0x8,%%xmm1 \n" "psrlw $0x8,%%xmm1 \n"
"packuswb %%xmm1,%%xmm0 \n" "packuswb %%xmm1,%%xmm0 \n"
"pand %%xmm5,%%xmm0 \n" "pand %%xmm5,%%xmm0 \n"
"por %%xmm3,%%xmm0 \n" "por %%xmm2,%%xmm0 \n"
"sub $0x4,%2 \n" "sub $0x4,%2 \n"
"movdqa %%xmm0,(%0,%1,1) \n" "movdqa %%xmm0,(%0,%1,1) \n"
"lea 0x10(%0),%0 \n" "lea 0x10(%0),%0 \n"
...@@ -2277,6 +2279,156 @@ void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) { ...@@ -2277,6 +2279,156 @@ void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) {
} }
#endif // HAS_ARGBATTENUATE_SSE2 #endif // HAS_ARGBATTENUATE_SSE2
#ifdef HAS_ARGBATTENUATE_SSSE3
// Shuffle table duplicating alpha
// Broadcasts the alpha byte of pixels 0 and 1 (source offsets 3 and 7)
// across the six color bytes of each pixel; 128 (bit 7 set) zeroes the
// destination byte, so the alpha word positions become zero.
CONST uvec8 kShuffleAlpha0 = {
3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u, 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u,
};
// Same for pixels 2 and 3 (alpha bytes at offsets 11 and 15).
CONST uvec8 kShuffleAlpha1 = {
11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u,
15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u,
};
// Attenuate 4 pixels at a time.
// aligned to 16 bytes
// Premultiplies each color channel by its alpha; the original alpha byte is
// kept (masked with 0xff000000 and OR'ed back).  src/dst must be 16-byte
// aligned (movdqa); width is presumably a positive multiple of 4 -- no tail
// handling here, TODO confirm with callers.
void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
asm volatile (
"sub %0,%1 \n"
// xmm3 = 0xff000000 mask used to preserve original alpha bytes.
"pcmpeqb %%xmm3,%%xmm3 \n"
"pslld $0x18,%%xmm3 \n"
"movdqa %3,%%xmm4 \n"
"movdqa %4,%%xmm5 \n"
// 4 pixel loop
"1: \n"
// First 2 pixels: shuffle alphas into the color word lanes, widen the
// pixel bytes (punpcklbw with itself), then take the high product.
"movdqa (%0),%%xmm0 \n"
"pshufb %%xmm4,%%xmm0 \n"
"movdqa (%0),%%xmm1 \n"
"punpcklbw %%xmm1,%%xmm1 \n"
"pmulhuw %%xmm1,%%xmm0 \n"
// Next 2 pixels, same pattern via the high-half shuffle table.
"movdqa (%0),%%xmm1 \n"
"pshufb %%xmm5,%%xmm1 \n"
"movdqa (%0),%%xmm2 \n"
"punpckhbw %%xmm2,%%xmm2 \n"
"pmulhuw %%xmm2,%%xmm1 \n"
// Reload the source to recover the untouched alpha bytes.
"movdqa (%0),%%xmm2 \n"
"pand %%xmm3,%%xmm2 \n"
"psrlw $0x8,%%xmm0 \n"
"psrlw $0x8,%%xmm1 \n"
"packuswb %%xmm1,%%xmm0 \n"
"por %%xmm2,%%xmm0 \n"
"sub $0x4,%2 \n"
"movdqa %%xmm0,(%0,%1,1) \n"
"lea 0x10(%0),%0 \n"
"jg 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
: "m"(kShuffleAlpha0), // %3
"m"(kShuffleAlpha1) // %4
: "memory", "cc"
#if defined(__SSE2__)
, "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
);
}
#endif  // HAS_ARGBATTENUATE_SSSE3
#ifdef HAS_ARGBUNATTENUATE_SSE2
// Divide source RGB by alpha and store to destination.
// b = (b * 255 + (a / 2)) / a;
// g = (g * 255 + (a / 2)) / a;
// r = (r * 255 + (a / 2)) / a;
// Reciprocal method is off by 1 on some values. ie 125
// 8.16 fixed point inverse table
// NOTE(review): unlike the C version's 2^24/a table, entries here are
// 0x10000 / a so they fit the 16-bit pmulhuw multiplier; the "8.16" label
// above appears inherited from that version.  Entry 0 is 0x100 so a fully
// transparent pixel's color bytes pass through roughly unchanged, and
// entry 1 is saturated to 0xffff because 0x10000 does not fit in 16 bits.
#define T(a) 0x10000 / a
CONST uint32 fixed_invtbl8[256] = {
0x100, 0xffff, T(0x02), T(0x03), T(0x04), T(0x05), T(0x06), T(0x07),
T(0x08), T(0x09), T(0x0a), T(0x0b), T(0x0c), T(0x0d), T(0x0e), T(0x0f),
T(0x10), T(0x11), T(0x12), T(0x13), T(0x14), T(0x15), T(0x16), T(0x17),
T(0x18), T(0x19), T(0x1a), T(0x1b), T(0x1c), T(0x1d), T(0x1e), T(0x1f),
T(0x20), T(0x21), T(0x22), T(0x23), T(0x24), T(0x25), T(0x26), T(0x27),
T(0x28), T(0x29), T(0x2a), T(0x2b), T(0x2c), T(0x2d), T(0x2e), T(0x2f),
T(0x30), T(0x31), T(0x32), T(0x33), T(0x34), T(0x35), T(0x36), T(0x37),
T(0x38), T(0x39), T(0x3a), T(0x3b), T(0x3c), T(0x3d), T(0x3e), T(0x3f),
T(0x40), T(0x41), T(0x42), T(0x43), T(0x44), T(0x45), T(0x46), T(0x47),
T(0x48), T(0x49), T(0x4a), T(0x4b), T(0x4c), T(0x4d), T(0x4e), T(0x4f),
T(0x50), T(0x51), T(0x52), T(0x53), T(0x54), T(0x55), T(0x56), T(0x57),
T(0x58), T(0x59), T(0x5a), T(0x5b), T(0x5c), T(0x5d), T(0x5e), T(0x5f),
T(0x60), T(0x61), T(0x62), T(0x63), T(0x64), T(0x65), T(0x66), T(0x67),
T(0x68), T(0x69), T(0x6a), T(0x6b), T(0x6c), T(0x6d), T(0x6e), T(0x6f),
T(0x70), T(0x71), T(0x72), T(0x73), T(0x74), T(0x75), T(0x76), T(0x77),
T(0x78), T(0x79), T(0x7a), T(0x7b), T(0x7c), T(0x7d), T(0x7e), T(0x7f),
T(0x80), T(0x81), T(0x82), T(0x83), T(0x84), T(0x85), T(0x86), T(0x87),
T(0x88), T(0x89), T(0x8a), T(0x8b), T(0x8c), T(0x8d), T(0x8e), T(0x8f),
T(0x90), T(0x91), T(0x92), T(0x93), T(0x94), T(0x95), T(0x96), T(0x97),
T(0x98), T(0x99), T(0x9a), T(0x9b), T(0x9c), T(0x9d), T(0x9e), T(0x9f),
T(0xa0), T(0xa1), T(0xa2), T(0xa3), T(0xa4), T(0xa5), T(0xa6), T(0xa7),
T(0xa8), T(0xa9), T(0xaa), T(0xab), T(0xac), T(0xad), T(0xae), T(0xaf),
T(0xb0), T(0xb1), T(0xb2), T(0xb3), T(0xb4), T(0xb5), T(0xb6), T(0xb7),
T(0xb8), T(0xb9), T(0xba), T(0xbb), T(0xbc), T(0xbd), T(0xbe), T(0xbf),
T(0xc0), T(0xc1), T(0xc2), T(0xc3), T(0xc4), T(0xc5), T(0xc6), T(0xc7),
T(0xc8), T(0xc9), T(0xca), T(0xcb), T(0xcc), T(0xcd), T(0xce), T(0xcf),
T(0xd0), T(0xd1), T(0xd2), T(0xd3), T(0xd4), T(0xd5), T(0xd6), T(0xd7),
T(0xd8), T(0xd9), T(0xda), T(0xdb), T(0xdc), T(0xdd), T(0xde), T(0xdf),
T(0xe0), T(0xe1), T(0xe2), T(0xe3), T(0xe4), T(0xe5), T(0xe6), T(0xe7),
T(0xe8), T(0xe9), T(0xea), T(0xeb), T(0xec), T(0xed), T(0xee), T(0xef),
T(0xf0), T(0xf1), T(0xf2), T(0xf3), T(0xf4), T(0xf5), T(0xf6), T(0xf7),
T(0xf8), T(0xf9), T(0xfa), T(0xfb), T(0xfc), T(0xfd), T(0xfe), 0x100 };
#undef T
// Unattenuate 4 pixels at a time.
// aligned to 16 bytes
// Divides each color channel by its alpha using fixed_invtbl8 reciprocals:
// color bytes are widened with punpck*bw and multiplied by the per-pixel
// reciprocal via pmulhuw.  Original alpha bytes are preserved (masked and
// OR'ed back).  src/dst must be 16-byte aligned; width presumably a
// positive multiple of 4 -- TODO confirm with callers.
void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb,
int width) {
uintptr_t alpha = 0;
asm volatile (
"sub %0,%1 \n"
// xmm4 = 0xff000000 mask used to re-insert original alpha bytes.
"pcmpeqb %%xmm4,%%xmm4 \n"
"pslld $0x18,%%xmm4 \n"
// 4 pixel loop
"1: \n"
// Pixels 0 and 1: fetch each alpha byte, look up its reciprocal, and
// broadcast it over the b/g/r words (pshuflw 0xc0 -> [w0,w0,w0,w3]).
"movdqa (%0),%%xmm0 \n"
"movzb 0x3(%0),%3 \n"
"punpcklbw %%xmm0,%%xmm0 \n"
"movd 0x0(%4,%3,4),%%xmm2 \n"
"movzb 0x7(%0),%3 \n"
"movd 0x0(%4,%3,4),%%xmm3 \n"
"pshuflw $0xc0,%%xmm2,%%xmm2 \n"
"pshuflw $0xc0,%%xmm3,%%xmm3 \n"
"movlhps %%xmm3,%%xmm2 \n"
"pmulhuw %%xmm2,%%xmm0 \n"
// Pixels 2 and 3: same pattern using the high half of the register.
"movdqa (%0),%%xmm1 \n"
"movzb 0xb(%0),%3 \n"
"punpckhbw %%xmm1,%%xmm1 \n"
"movd 0x0(%4,%3,4),%%xmm2 \n"
"movzb 0xf(%0),%3 \n"
"movd 0x0(%4,%3,4),%%xmm3 \n"
"pshuflw $0xc0,%%xmm2,%%xmm2 \n"
"pshuflw $0xc0,%%xmm3,%%xmm3 \n"
"movlhps %%xmm3,%%xmm2 \n"
"pmulhuw %%xmm2,%%xmm1 \n"
// Reload the source to recover untouched alpha bytes, then merge.
"movdqa (%0),%%xmm2 \n"
"pand %%xmm4,%%xmm2 \n"
"packuswb %%xmm1,%%xmm0 \n"
"por %%xmm2,%%xmm0 \n"
"sub $0x4,%2 \n"
"movdqa %%xmm0,(%0,%1,1) \n"
"lea 0x10(%0),%0 \n"
"jg 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
"+r"(width), // %2
"+r"(alpha) // %3
: "r"(fixed_invtbl8) // %4
: "memory", "cc"
// NOTE(review): xmm5 appears unused in this routine but is declared
// clobbered -- harmless, though it could be dropped.
#if defined(__SSE2__)
, "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
);
}
#endif  // HAS_ARGBUNATTENUATE_SSE2
#endif // defined(__x86_64__) || defined(__i386__) #endif // defined(__x86_64__) || defined(__i386__)
#ifdef __cplusplus #ifdef __cplusplus
......
...@@ -2319,13 +2319,13 @@ void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) { ...@@ -2319,13 +2319,13 @@ void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) {
pshufhw xmm2, xmm1,0FFh // 8 alpha words pshufhw xmm2, xmm1,0FFh // 8 alpha words
pshuflw xmm2, xmm2,0FFh pshuflw xmm2, xmm2,0FFh
pmulhuw xmm1, xmm2 // rgb * a pmulhuw xmm1, xmm2 // rgb * a
movdqa xmm3, [eax] // alphas movdqa xmm2, [eax] // alphas
psrlw xmm0, 8 psrlw xmm0, 8
pand xmm3, xmm4 pand xmm2, xmm4
psrlw xmm1, 8 psrlw xmm1, 8
packuswb xmm0, xmm1 packuswb xmm0, xmm1
pand xmm0, xmm5 // keep original alphas pand xmm0, xmm5 // keep original alphas
por xmm0, xmm3 por xmm0, xmm2
sub ecx, 4 sub ecx, 4
movdqa [eax + edx], xmm0 movdqa [eax + edx], xmm0
lea eax, [eax + 16] lea eax, [eax + 16]
...@@ -2347,7 +2347,6 @@ static const uvec8 kShuffleAlpha1 = { ...@@ -2347,7 +2347,6 @@ static const uvec8 kShuffleAlpha1 = {
}; };
__declspec(naked) __declspec(align(16)) __declspec(naked) __declspec(align(16))
void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) { void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
__asm {
mov eax, [esp + 4] // src_argb0 mov eax, [esp + 4] // src_argb0
mov edx, [esp + 8] // dst_argb mov edx, [esp + 8] // dst_argb
mov ecx, [esp + 12] // width mov ecx, [esp + 12] // width
...@@ -2360,7 +2359,7 @@ void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) { ...@@ -2360,7 +2359,7 @@ void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
align 16 align 16
convertloop: convertloop:
movdqa xmm0, [eax] // read 4 pixels movdqa xmm0, [eax] // read 4 pixels
pshufb xmm0, xmm4 // isolate first 2 alphas pshufb xmm0, xmm4 // isolate first 2 alphas
movdqa xmm1, [eax] // read 4 pixels movdqa xmm1, [eax] // read 4 pixels
punpcklbw xmm1, xmm1 // first 2 pixel rgbs punpcklbw xmm1, xmm1 // first 2 pixel rgbs
pmulhuw xmm0, xmm1 // rgb * a pmulhuw xmm0, xmm1 // rgb * a
...@@ -2383,9 +2382,105 @@ void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) { ...@@ -2383,9 +2382,105 @@ void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
ret ret
} }
} }
#endif // HAS_ARGBATTENUATE_SSSE3 #endif // HAS_ARGBATTENUATE_SSSE3
#ifdef HAS_ARGBUNATTENUATE_SSE2
// Divide source RGB by alpha and store to destination.
// b = (b * 255 + (a / 2)) / a;
// g = (g * 255 + (a / 2)) / a;
// r = (r * 255 + (a / 2)) / a;
// Reciprocal method is off by 1 on some values. ie 125
// 8.16 fixed point inverse table
// NOTE(review): entries are 0x10000 / a, sized to fit the 16-bit pmulhuw
// multiplier (the "8.16" label is inherited from the C version's 2^24/a
// table).  Entry 0 is 0x100 so a == 0 pixels pass color bytes through
// roughly unchanged; entry 1 is saturated to 0xffff since 0x10000 does
// not fit in 16 bits.
#define T(a) 0x10000 / a
static uint32 fixed_invtbl8[256] = {
0x100, 0xffff, T(0x02), T(0x03), T(0x04), T(0x05), T(0x06), T(0x07),
T(0x08), T(0x09), T(0x0a), T(0x0b), T(0x0c), T(0x0d), T(0x0e), T(0x0f),
T(0x10), T(0x11), T(0x12), T(0x13), T(0x14), T(0x15), T(0x16), T(0x17),
T(0x18), T(0x19), T(0x1a), T(0x1b), T(0x1c), T(0x1d), T(0x1e), T(0x1f),
T(0x20), T(0x21), T(0x22), T(0x23), T(0x24), T(0x25), T(0x26), T(0x27),
T(0x28), T(0x29), T(0x2a), T(0x2b), T(0x2c), T(0x2d), T(0x2e), T(0x2f),
T(0x30), T(0x31), T(0x32), T(0x33), T(0x34), T(0x35), T(0x36), T(0x37),
T(0x38), T(0x39), T(0x3a), T(0x3b), T(0x3c), T(0x3d), T(0x3e), T(0x3f),
T(0x40), T(0x41), T(0x42), T(0x43), T(0x44), T(0x45), T(0x46), T(0x47),
T(0x48), T(0x49), T(0x4a), T(0x4b), T(0x4c), T(0x4d), T(0x4e), T(0x4f),
T(0x50), T(0x51), T(0x52), T(0x53), T(0x54), T(0x55), T(0x56), T(0x57),
T(0x58), T(0x59), T(0x5a), T(0x5b), T(0x5c), T(0x5d), T(0x5e), T(0x5f),
T(0x60), T(0x61), T(0x62), T(0x63), T(0x64), T(0x65), T(0x66), T(0x67),
T(0x68), T(0x69), T(0x6a), T(0x6b), T(0x6c), T(0x6d), T(0x6e), T(0x6f),
T(0x70), T(0x71), T(0x72), T(0x73), T(0x74), T(0x75), T(0x76), T(0x77),
T(0x78), T(0x79), T(0x7a), T(0x7b), T(0x7c), T(0x7d), T(0x7e), T(0x7f),
T(0x80), T(0x81), T(0x82), T(0x83), T(0x84), T(0x85), T(0x86), T(0x87),
T(0x88), T(0x89), T(0x8a), T(0x8b), T(0x8c), T(0x8d), T(0x8e), T(0x8f),
T(0x90), T(0x91), T(0x92), T(0x93), T(0x94), T(0x95), T(0x96), T(0x97),
T(0x98), T(0x99), T(0x9a), T(0x9b), T(0x9c), T(0x9d), T(0x9e), T(0x9f),
T(0xa0), T(0xa1), T(0xa2), T(0xa3), T(0xa4), T(0xa5), T(0xa6), T(0xa7),
T(0xa8), T(0xa9), T(0xaa), T(0xab), T(0xac), T(0xad), T(0xae), T(0xaf),
T(0xb0), T(0xb1), T(0xb2), T(0xb3), T(0xb4), T(0xb5), T(0xb6), T(0xb7),
T(0xb8), T(0xb9), T(0xba), T(0xbb), T(0xbc), T(0xbd), T(0xbe), T(0xbf),
T(0xc0), T(0xc1), T(0xc2), T(0xc3), T(0xc4), T(0xc5), T(0xc6), T(0xc7),
T(0xc8), T(0xc9), T(0xca), T(0xcb), T(0xcc), T(0xcd), T(0xce), T(0xcf),
T(0xd0), T(0xd1), T(0xd2), T(0xd3), T(0xd4), T(0xd5), T(0xd6), T(0xd7),
T(0xd8), T(0xd9), T(0xda), T(0xdb), T(0xdc), T(0xdd), T(0xde), T(0xdf),
T(0xe0), T(0xe1), T(0xe2), T(0xe3), T(0xe4), T(0xe5), T(0xe6), T(0xe7),
T(0xe8), T(0xe9), T(0xea), T(0xeb), T(0xec), T(0xed), T(0xee), T(0xef),
T(0xf0), T(0xf1), T(0xf2), T(0xf3), T(0xf4), T(0xf5), T(0xf6), T(0xf7),
T(0xf8), T(0xf9), T(0xfa), T(0xfb), T(0xfc), T(0xfd), T(0xfe), 0x100 };
#undef T
// Unattenuate 4 pixels at a time.
// aligned to 16 bytes
// Divides each color channel by its alpha via the fixed_invtbl8 reciprocal
// table and pmulhuw; original alpha bytes are preserved.  src/dst must be
// 16-byte aligned; width presumably a positive multiple of 4 -- TODO
// confirm with callers.
__declspec(naked) __declspec(align(16))
void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb,
int width) {
__asm {
push esi
push edi
mov eax, [esp + 8 + 4] // src_argb0
mov edx, [esp + 8 + 8] // dst_argb
mov ecx, [esp + 8 + 12] // width
sub edx, eax
pcmpeqb xmm4, xmm4 // generate mask 0xff000000
pslld xmm4, 24
align 16
convertloop:
movdqa xmm0, [eax] // read 4 pixels
movzx esi, byte ptr [eax + 3] // first alpha
movzx edi, byte ptr [eax + 7] // second alpha
punpcklbw xmm0, xmm0 // first 2
movd xmm2, dword ptr fixed_invtbl8[esi * 4]
movd xmm3, dword ptr fixed_invtbl8[edi * 4]
pshuflw xmm2, xmm2,0C0h // broadcast inv_alpha to the b/g/r words
pshuflw xmm3, xmm3,0C0h // same for the second pixel
movlhps xmm2, xmm3
pmulhuw xmm0, xmm2 // rgb * inv_a (fixed point divide by alpha)
movdqa xmm1, [eax] // read 4 pixels
movzx esi, byte ptr [eax + 11] // third alpha
movzx edi, byte ptr [eax + 15] // fourth alpha
punpckhbw xmm1, xmm1 // next 2
movd xmm2, dword ptr fixed_invtbl8[esi * 4]
movd xmm3, dword ptr fixed_invtbl8[edi * 4]
pshuflw xmm2, xmm2,0C0h // broadcast inv_alpha to the b/g/r words
pshuflw xmm3, xmm3,0C0h // same for the fourth pixel
movlhps xmm2, xmm3
pmulhuw xmm1, xmm2 // rgb * inv_a (fixed point divide by alpha)
movdqa xmm2, [eax] // alphas
pand xmm2, xmm4
packuswb xmm0, xmm1
por xmm0, xmm2 // restore the original alpha bytes
sub ecx, 4
movdqa [eax + edx], xmm0
lea eax, [eax + 16]
jg convertloop

pop edi
pop esi
ret
}
}
#endif  // HAS_ARGBUNATTENUATE_SSE2
#endif // _M_IX86 #endif // _M_IX86
#ifdef __cplusplus #ifdef __cplusplus
......
...@@ -63,7 +63,7 @@ TEST_F(libyuvTest, BenchmakDjb2_C) { ...@@ -63,7 +63,7 @@ TEST_F(libyuvTest, BenchmakDjb2_C) {
uint32 h2 = ReferenceHashDjb2(src_a, kMaxTest, 5381); uint32 h2 = ReferenceHashDjb2(src_a, kMaxTest, 5381);
uint32 h1; uint32 h1;
MaskCpuFlags(kCpuInitialized); MaskCpuFlags(kCpuInitialized);
for (int i = 0; i < _benchmark_iterations; ++i) { for (int i = 0; i < benchmark_iterations_; ++i) {
h1 = HashDjb2(src_a, kMaxTest, 5381); h1 = HashDjb2(src_a, kMaxTest, 5381);
} }
MaskCpuFlags(-1); MaskCpuFlags(-1);
...@@ -80,7 +80,7 @@ TEST_F(libyuvTest, BenchmakDjb2_OPT) { ...@@ -80,7 +80,7 @@ TEST_F(libyuvTest, BenchmakDjb2_OPT) {
} }
uint32 h2 = ReferenceHashDjb2(src_a, kMaxTest, 5381); uint32 h2 = ReferenceHashDjb2(src_a, kMaxTest, 5381);
uint32 h1; uint32 h1;
for (int i = 0; i < _benchmark_iterations; ++i) { for (int i = 0; i < benchmark_iterations_; ++i) {
h1 = HashDjb2(src_a, kMaxTest, 5381); h1 = HashDjb2(src_a, kMaxTest, 5381);
} }
EXPECT_EQ(h1, h2); EXPECT_EQ(h1, h2);
...@@ -96,7 +96,7 @@ TEST_F(libyuvTest, BenchmakDjb2_Unaligned_OPT) { ...@@ -96,7 +96,7 @@ TEST_F(libyuvTest, BenchmakDjb2_Unaligned_OPT) {
} }
uint32 h2 = ReferenceHashDjb2(src_a + 1, kMaxTest, 5381); uint32 h2 = ReferenceHashDjb2(src_a + 1, kMaxTest, 5381);
uint32 h1; uint32 h1;
for (int i = 0; i < _benchmark_iterations; ++i) { for (int i = 0; i < benchmark_iterations_; ++i) {
h1 = HashDjb2(src_a + 1, kMaxTest, 5381); h1 = HashDjb2(src_a + 1, kMaxTest, 5381);
} }
EXPECT_EQ(h1, h2); EXPECT_EQ(h1, h2);
...@@ -110,7 +110,7 @@ TEST_F(libyuvTest, BenchmarkSumSquareError_C) { ...@@ -110,7 +110,7 @@ TEST_F(libyuvTest, BenchmarkSumSquareError_C) {
align_buffer_16(src_b, max_width) align_buffer_16(src_b, max_width)
MaskCpuFlags(kCpuInitialized); MaskCpuFlags(kCpuInitialized);
for (int i = 0; i < _benchmark_iterations; ++i) { for (int i = 0; i < benchmark_iterations_; ++i) {
ComputeSumSquareError(src_a, src_b, max_width); ComputeSumSquareError(src_a, src_b, max_width);
} }
...@@ -128,7 +128,7 @@ TEST_F(libyuvTest, BenchmarkSumSquareError_OPT) { ...@@ -128,7 +128,7 @@ TEST_F(libyuvTest, BenchmarkSumSquareError_OPT) {
align_buffer_16(src_a, max_width) align_buffer_16(src_a, max_width)
align_buffer_16(src_b, max_width) align_buffer_16(src_b, max_width)
for (int i = 0; i < _benchmark_iterations; ++i) { for (int i = 0; i < benchmark_iterations_; ++i) {
ComputeSumSquareError(src_a, src_b, max_width); ComputeSumSquareError(src_a, src_b, max_width);
} }
...@@ -183,18 +183,18 @@ TEST_F(libyuvTest, SumSquareError) { ...@@ -183,18 +183,18 @@ TEST_F(libyuvTest, SumSquareError) {
} }
TEST_F(libyuvTest, BenchmarkPsnr_C) { TEST_F(libyuvTest, BenchmarkPsnr_C) {
align_buffer_16(src_a, _benchmark_width * _benchmark_height) align_buffer_16(src_a, benchmark_width_ * benchmark_height_)
align_buffer_16(src_b, _benchmark_width * _benchmark_height) align_buffer_16(src_b, benchmark_width_ * benchmark_height_)
MaskCpuFlags(kCpuInitialized); MaskCpuFlags(kCpuInitialized);
double c_time = get_time(); double c_time = get_time();
for (int i = 0; i < _benchmark_iterations; ++i) for (int i = 0; i < benchmark_iterations_; ++i)
CalcFramePsnr(src_a, _benchmark_width, CalcFramePsnr(src_a, benchmark_width_,
src_b, _benchmark_width, src_b, benchmark_width_,
_benchmark_width, _benchmark_height); benchmark_width_, benchmark_height_);
c_time = (get_time() - c_time) / _benchmark_iterations; c_time = (get_time() - c_time) / benchmark_iterations_;
printf("BenchmarkPsnr_C - %8.2f us c\n", c_time * 1e6); printf("BenchmarkPsnr_C - %8.2f us c\n", c_time * 1e6);
MaskCpuFlags(-1); MaskCpuFlags(-1);
...@@ -206,18 +206,18 @@ TEST_F(libyuvTest, BenchmarkPsnr_C) { ...@@ -206,18 +206,18 @@ TEST_F(libyuvTest, BenchmarkPsnr_C) {
} }
TEST_F(libyuvTest, BenchmarkPsnr_OPT) { TEST_F(libyuvTest, BenchmarkPsnr_OPT) {
align_buffer_16(src_a, _benchmark_width * _benchmark_height) align_buffer_16(src_a, benchmark_width_ * benchmark_height_)
align_buffer_16(src_b, _benchmark_width * _benchmark_height) align_buffer_16(src_b, benchmark_width_ * benchmark_height_)
MaskCpuFlags(-1); MaskCpuFlags(-1);
double opt_time = get_time(); double opt_time = get_time();
for (int i = 0; i < _benchmark_iterations; ++i) for (int i = 0; i < benchmark_iterations_; ++i)
CalcFramePsnr(src_a, _benchmark_width, CalcFramePsnr(src_a, benchmark_width_,
src_b, _benchmark_width, src_b, benchmark_width_,
_benchmark_width, _benchmark_height); benchmark_width_, benchmark_height_);
opt_time = (get_time() - opt_time) / _benchmark_iterations; opt_time = (get_time() - opt_time) / benchmark_iterations_;
printf("BenchmarkPsnr_OPT - %8.2f us opt\n", opt_time * 1e6); printf("BenchmarkPsnr_OPT - %8.2f us opt\n", opt_time * 1e6);
EXPECT_EQ(0, 0); EXPECT_EQ(0, 0);
...@@ -304,18 +304,18 @@ TEST_F(libyuvTest, Psnr) { ...@@ -304,18 +304,18 @@ TEST_F(libyuvTest, Psnr) {
} }
TEST_F(libyuvTest, BenchmarkSsim_C) { TEST_F(libyuvTest, BenchmarkSsim_C) {
align_buffer_16(src_a, _benchmark_width * _benchmark_height) align_buffer_16(src_a, benchmark_width_ * benchmark_height_)
align_buffer_16(src_b, _benchmark_width * _benchmark_height) align_buffer_16(src_b, benchmark_width_ * benchmark_height_)
MaskCpuFlags(kCpuInitialized); MaskCpuFlags(kCpuInitialized);
double c_time = get_time(); double c_time = get_time();
for (int i = 0; i < _benchmark_iterations; ++i) for (int i = 0; i < benchmark_iterations_; ++i)
CalcFrameSsim(src_a, _benchmark_width, CalcFrameSsim(src_a, benchmark_width_,
src_b, _benchmark_width, src_b, benchmark_width_,
_benchmark_width, _benchmark_height); benchmark_width_, benchmark_height_);
c_time = (get_time() - c_time) / _benchmark_iterations; c_time = (get_time() - c_time) / benchmark_iterations_;
printf("BenchmarkSsim_C - %8.2f us c\n", c_time * 1e6); printf("BenchmarkSsim_C - %8.2f us c\n", c_time * 1e6);
MaskCpuFlags(-1); MaskCpuFlags(-1);
...@@ -327,18 +327,18 @@ TEST_F(libyuvTest, BenchmarkSsim_C) { ...@@ -327,18 +327,18 @@ TEST_F(libyuvTest, BenchmarkSsim_C) {
} }
TEST_F(libyuvTest, BenchmarkSsim_OPT) { TEST_F(libyuvTest, BenchmarkSsim_OPT) {
align_buffer_16(src_a, _benchmark_width * _benchmark_height) align_buffer_16(src_a, benchmark_width_ * benchmark_height_)
align_buffer_16(src_b, _benchmark_width * _benchmark_height) align_buffer_16(src_b, benchmark_width_ * benchmark_height_)
MaskCpuFlags(-1); MaskCpuFlags(-1);
double opt_time = get_time(); double opt_time = get_time();
for (int i = 0; i < _benchmark_iterations; ++i) for (int i = 0; i < benchmark_iterations_; ++i)
CalcFrameSsim(src_a, _benchmark_width, CalcFrameSsim(src_a, benchmark_width_,
src_b, _benchmark_width, src_b, benchmark_width_,
_benchmark_width, _benchmark_height); benchmark_width_, benchmark_height_);
opt_time = (get_time() - opt_time) / _benchmark_iterations; opt_time = (get_time() - opt_time) / benchmark_iterations_;
printf("BenchmarkPsnr_OPT - %8.2f us opt\n", opt_time * 1e6); printf("BenchmarkPsnr_OPT - %8.2f us opt\n", opt_time * 1e6);
EXPECT_EQ(0, 0); EXPECT_EQ(0, 0);
......
...@@ -8,14 +8,13 @@ ...@@ -8,14 +8,13 @@
* be found in the AUTHORS file in the root of the source tree. * be found in the AUTHORS file in the root of the source tree.
*/ */
#include "unit_test.h"
#include <stdlib.h> #include <stdlib.h>
#include <string.h> #include <string.h>
#include "libyuv/basic_types.h" #include "libyuv/basic_types.h"
#include "libyuv/cpu_id.h" #include "libyuv/cpu_id.h"
#include "libyuv/version.h" #include "libyuv/version.h"
#include "unit_test/unit_test.h"
namespace libyuv { namespace libyuv {
......
...@@ -8,8 +8,6 @@ ...@@ -8,8 +8,6 @@
* be found in the AUTHORS file in the root of the source tree. * be found in the AUTHORS file in the root of the source tree.
*/ */
#include "unit_test.h"
#include <stdlib.h> #include <stdlib.h>
#include <time.h> #include <time.h>
...@@ -17,6 +15,7 @@ ...@@ -17,6 +15,7 @@
#include "libyuv/cpu_id.h" #include "libyuv/cpu_id.h"
#include "libyuv/planar_functions.h" #include "libyuv/planar_functions.h"
#include "libyuv/rotate.h" #include "libyuv/rotate.h"
#include "unit_test/unit_test.h"
#if defined(_MSC_VER) #if defined(_MSC_VER)
#define SIMD_ALIGNED(var) __declspec(align(16)) var #define SIMD_ALIGNED(var) __declspec(align(16)) var
...@@ -26,20 +25,20 @@ ...@@ -26,20 +25,20 @@
namespace libyuv { namespace libyuv {
TEST_F (libyuvTest, BenchmarkI420ToARGB_C) { TEST_F(libyuvTest, BenchmarkI420ToARGB_C) {
align_buffer_16(src_y, _benchmark_width * _benchmark_height); align_buffer_16(src_y, benchmark_width_ * benchmark_height_);
align_buffer_16(src_u, ((_benchmark_width * _benchmark_height) >> 2)); align_buffer_16(src_u, (benchmark_width_ * benchmark_height_) >> 2);
align_buffer_16(src_v, ((_benchmark_width * _benchmark_height) >> 2)); align_buffer_16(src_v, (benchmark_width_ * benchmark_height_) >> 2);
align_buffer_16(dst_argb, ((_benchmark_width << 2) * _benchmark_height)); align_buffer_16(dst_argb, (benchmark_width_ << 2) * benchmark_height_);
MaskCpuFlags(kCpuInitialized); MaskCpuFlags(kCpuInitialized);
for (int i = 0; i < _benchmark_iterations; ++i) for (int i = 0; i < benchmark_iterations_; ++i)
I420ToARGB(src_y, _benchmark_width, I420ToARGB(src_y, benchmark_width_,
src_u, _benchmark_width >> 1, src_u, benchmark_width_ >> 1,
src_v, _benchmark_width >> 1, src_v, benchmark_width_ >> 1,
dst_argb, _benchmark_width << 2, dst_argb, benchmark_width_ << 2,
_benchmark_width, _benchmark_height); benchmark_width_, benchmark_height_);
MaskCpuFlags(-1); MaskCpuFlags(-1);
...@@ -51,18 +50,18 @@ TEST_F (libyuvTest, BenchmarkI420ToARGB_C) { ...@@ -51,18 +50,18 @@ TEST_F (libyuvTest, BenchmarkI420ToARGB_C) {
free_aligned_buffer_16(dst_argb) free_aligned_buffer_16(dst_argb)
} }
TEST_F (libyuvTest, BenchmarkI420ToARGB_OPT) { TEST_F(libyuvTest, BenchmarkI420ToARGB_OPT) {
align_buffer_16(src_y, _benchmark_width * _benchmark_height); align_buffer_16(src_y, benchmark_width_ * benchmark_height_);
align_buffer_16(src_u, (_benchmark_width * _benchmark_height) >> 2); align_buffer_16(src_u, (benchmark_width_ * benchmark_height_) >> 2);
align_buffer_16(src_v, (_benchmark_width * _benchmark_height) >> 2); align_buffer_16(src_v, (benchmark_width_ * benchmark_height_) >> 2);
align_buffer_16(dst_argb, (_benchmark_width << 2) * _benchmark_height); align_buffer_16(dst_argb, (benchmark_width_ << 2) * benchmark_height_);
for (int i = 0; i < _benchmark_iterations; ++i) for (int i = 0; i < benchmark_iterations_; ++i)
I420ToARGB(src_y, _benchmark_width, I420ToARGB(src_y, benchmark_width_,
src_u, _benchmark_width >> 1, src_u, benchmark_width_ >> 1,
src_v, _benchmark_width >> 1, src_v, benchmark_width_ >> 1,
dst_argb, _benchmark_width << 2, dst_argb, benchmark_width_ << 2,
_benchmark_width, _benchmark_height); benchmark_width_, benchmark_height_);
free_aligned_buffer_16(src_y) free_aligned_buffer_16(src_y)
free_aligned_buffer_16(src_u) free_aligned_buffer_16(src_u)
...@@ -71,7 +70,7 @@ TEST_F (libyuvTest, BenchmarkI420ToARGB_OPT) { ...@@ -71,7 +70,7 @@ TEST_F (libyuvTest, BenchmarkI420ToARGB_OPT) {
} }
#define TESTI420TO(FMT) \ #define TESTI420TO(FMT) \
TEST_F (libyuvTest, I420To##FMT##_CvsOPT) { \ TEST_F(libyuvTest, I420To##FMT##_CvsOPT) { \
const int src_width = 1280; \ const int src_width = 1280; \
const int src_height = 720; \ const int src_height = 720; \
align_buffer_16(src_y, src_width * src_height); \ align_buffer_16(src_y, src_width * src_height); \
...@@ -103,8 +102,8 @@ TEST_F (libyuvTest, I420To##FMT##_CvsOPT) { \ ...@@ -103,8 +102,8 @@ TEST_F (libyuvTest, I420To##FMT##_CvsOPT) { \
int err = 0; \ int err = 0; \
for (int i = 0; i < src_height; ++i) { \ for (int i = 0; i < src_height; ++i) { \
for (int j = 0; j < src_width << 2; ++j) { \ for (int j = 0; j < src_width << 2; ++j) { \
int diff = (int)(dst_rgb_c[i * src_height + j]) - \ int diff = static_cast<int>(dst_rgb_c[i * src_height + j]) - \
(int)(dst_rgb_opt[i * src_height + j]); \ static_cast<int>(dst_rgb_opt[i * src_height + j]); \
if (abs(diff) > 2) \ if (abs(diff) > 2) \
err++; \ err++; \
} \ } \
...@@ -121,11 +120,48 @@ TESTI420TO(ARGB) ...@@ -121,11 +120,48 @@ TESTI420TO(ARGB)
TESTI420TO(BGRA) TESTI420TO(BGRA)
TESTI420TO(ABGR) TESTI420TO(ABGR)
TEST_F (libyuvTest, TestAttenuate) { TEST_F(libyuvTest, TestAttenuate) {
SIMD_ALIGNED(uint8 orig_pixels[256][4]); SIMD_ALIGNED(uint8 orig_pixels[256][4]);
SIMD_ALIGNED(uint8 atten_pixels[256][4]); SIMD_ALIGNED(uint8 atten_pixels[256][4]);
SIMD_ALIGNED(uint8 unatten_pixels[256][4]); SIMD_ALIGNED(uint8 unatten_pixels[256][4]);
SIMD_ALIGNED(uint8 atten2_pixels[256][4]); SIMD_ALIGNED(uint8 atten2_pixels[256][4]);
// Test unattenuation clamps
orig_pixels[0][0] = 200u;
orig_pixels[0][1] = 129u;
orig_pixels[0][2] = 127u;
orig_pixels[0][3] = 128u;
// Test unattenuation transparent and opaque are unaffected
orig_pixels[1][0] = 16u;
orig_pixels[1][1] = 64u;
orig_pixels[1][2] = 192u;
orig_pixels[1][3] = 0u;
orig_pixels[2][0] = 16u;
orig_pixels[2][1] = 64u;
orig_pixels[2][2] = 192u;
orig_pixels[2][3] = 255u;
orig_pixels[3][0] = 16u;
orig_pixels[3][1] = 64u;
orig_pixels[3][2] = 192u;
orig_pixels[3][3] = 128u;
ARGBUnattenuate(&orig_pixels[0][0], 0, &unatten_pixels[0][0], 0, 4, 1);
EXPECT_EQ(255u, unatten_pixels[0][0]);
EXPECT_EQ(255u, unatten_pixels[0][1]);
EXPECT_EQ(254u, unatten_pixels[0][2]);
EXPECT_EQ(128u, unatten_pixels[0][3]);
EXPECT_EQ(16u, unatten_pixels[1][0]);
EXPECT_EQ(64u, unatten_pixels[1][1]);
EXPECT_EQ(192u, unatten_pixels[1][2]);
EXPECT_EQ(0u, unatten_pixels[1][3]);
EXPECT_EQ(16u, unatten_pixels[2][0]);
EXPECT_EQ(64u, unatten_pixels[2][1]);
EXPECT_EQ(192u, unatten_pixels[2][2]);
EXPECT_EQ(255u, unatten_pixels[2][3]);
EXPECT_EQ(32u, unatten_pixels[3][0]);
EXPECT_EQ(128u, unatten_pixels[3][1]);
EXPECT_EQ(255u, unatten_pixels[3][2]);
EXPECT_EQ(128u, unatten_pixels[3][3]);
for (int i = 0; i < 256; ++i) { for (int i = 0; i < 256; ++i) {
orig_pixels[i][0] = i; orig_pixels[i][0] = i;
orig_pixels[i][1] = i / 2; orig_pixels[i][1] = i / 2;
...@@ -156,17 +192,5 @@ TEST_F (libyuvTest, TestAttenuate) { ...@@ -156,17 +192,5 @@ TEST_F (libyuvTest, TestAttenuate) {
EXPECT_EQ(127, atten_pixels[255][1]); EXPECT_EQ(127, atten_pixels[255][1]);
EXPECT_EQ(85, atten_pixels[255][2]); EXPECT_EQ(85, atten_pixels[255][2]);
EXPECT_EQ(255, atten_pixels[255][3]); EXPECT_EQ(255, atten_pixels[255][3]);
// Test unattenuation clamps
orig_pixels[0][0] = 200;
orig_pixels[0][1] = 129;
orig_pixels[0][2] = 127;
orig_pixels[0][3] = 128;
ARGBUnattenuate(&orig_pixels[0][0], 0, &unatten_pixels[0][0], 0, 1, 1);
EXPECT_EQ(255, unatten_pixels[0][0]);
EXPECT_EQ(255, unatten_pixels[0][1]);
EXPECT_EQ(254, unatten_pixels[0][2]);
EXPECT_EQ(128, unatten_pixels[0][3]);
} }
} }
...@@ -8,13 +8,12 @@ ...@@ -8,13 +8,12 @@
* be found in the AUTHORS file in the root of the source tree. * be found in the AUTHORS file in the root of the source tree.
*/ */
#include "unit_test.h"
#include <stdlib.h> #include <stdlib.h>
#include <time.h> #include <time.h>
#include "libyuv/rotate.h"
#include "../source/rotate_priv.h" #include "../source/rotate_priv.h"
#include "libyuv/rotate.h"
#include "unit_test/unit_test.h"
namespace libyuv { namespace libyuv {
...@@ -33,8 +32,8 @@ TEST_F(libyuvTest, Transpose) { ...@@ -33,8 +32,8 @@ TEST_F(libyuvTest, Transpose) {
int iw, ih, ow, oh; int iw, ih, ow, oh;
int err = 0; int err = 0;
for (iw = 8; iw < _rotate_max_w && !err; ++iw) for (iw = 8; iw < rotate_max_w_ && !err; ++iw)
for (ih = 8; ih < _rotate_max_h && !err; ++ih) { for (ih = 8; ih < rotate_max_h_ && !err; ++ih) {
int i; int i;
ow = ih; ow = ih;
oh = iw; oh = iw;
...@@ -77,8 +76,8 @@ TEST_F(libyuvTest, TransposeUV) { ...@@ -77,8 +76,8 @@ TEST_F(libyuvTest, TransposeUV) {
int iw, ih, ow, oh; int iw, ih, ow, oh;
int err = 0; int err = 0;
for (iw = 16; iw < _rotate_max_w && !err; iw += 2) for (iw = 16; iw < rotate_max_w_ && !err; iw += 2)
for (ih = 8; ih < _rotate_max_h && !err; ++ih) { for (ih = 8; ih < rotate_max_h_ && !err; ++ih) {
int i; int i;
ow = ih; ow = ih;
...@@ -134,8 +133,8 @@ TEST_F(libyuvTest, RotatePlane90) { ...@@ -134,8 +133,8 @@ TEST_F(libyuvTest, RotatePlane90) {
int iw, ih, ow, oh; int iw, ih, ow, oh;
int err = 0; int err = 0;
for (iw = 8; iw < _rotate_max_w && !err; ++iw) for (iw = 8; iw < rotate_max_w_ && !err; ++iw)
for (ih = 8; ih < _rotate_max_h && !err; ++ih) { for (ih = 8; ih < rotate_max_h_ && !err; ++ih) {
int i; int i;
ow = ih; ow = ih;
...@@ -191,8 +190,8 @@ TEST_F(libyuvTest, RotateUV90) { ...@@ -191,8 +190,8 @@ TEST_F(libyuvTest, RotateUV90) {
int iw, ih, ow, oh; int iw, ih, ow, oh;
int err = 0; int err = 0;
for (iw = 16; iw < _rotate_max_w && !err; iw += 2) for (iw = 16; iw < rotate_max_w_ && !err; iw += 2)
for (ih = 8; ih < _rotate_max_h && !err; ++ih) { for (ih = 8; ih < rotate_max_h_ && !err; ++ih) {
int i; int i;
ow = ih; ow = ih;
...@@ -265,8 +264,8 @@ TEST_F(libyuvTest, RotateUV180) { ...@@ -265,8 +264,8 @@ TEST_F(libyuvTest, RotateUV180) {
int iw, ih, ow, oh; int iw, ih, ow, oh;
int err = 0; int err = 0;
for (iw = 16; iw < _rotate_max_w && !err; iw += 2) for (iw = 16; iw < rotate_max_w_ && !err; iw += 2)
for (ih = 8; ih < _rotate_max_h && !err; ++ih) { for (ih = 8; ih < rotate_max_h_ && !err; ++ih) {
int i; int i;
ow = iw >> 1; ow = iw >> 1;
...@@ -339,8 +338,8 @@ TEST_F(libyuvTest, RotateUV270) { ...@@ -339,8 +338,8 @@ TEST_F(libyuvTest, RotateUV270) {
int iw, ih, ow, oh; int iw, ih, ow, oh;
int err = 0; int err = 0;
for (iw = 16; iw < _rotate_max_w && !err; iw += 2) for (iw = 16; iw < rotate_max_w_ && !err; iw += 2)
for (ih = 8; ih < _rotate_max_h && !err; ++ih) { for (ih = 8; ih < rotate_max_h_ && !err; ++ih) {
int i; int i;
ow = ih; ow = ih;
...@@ -414,8 +413,8 @@ TEST_F(libyuvTest, RotatePlane180) { ...@@ -414,8 +413,8 @@ TEST_F(libyuvTest, RotatePlane180) {
int iw, ih, ow, oh; int iw, ih, ow, oh;
int err = 0; int err = 0;
for (iw = 8; iw < _rotate_max_w && !err; ++iw) for (iw = 8; iw < rotate_max_w_ && !err; ++iw)
for (ih = 8; ih < _rotate_max_h && !err; ++ih) { for (ih = 8; ih < rotate_max_h_ && !err; ++ih) {
int i; int i;
ow = iw; ow = iw;
...@@ -459,8 +458,8 @@ TEST_F(libyuvTest, RotatePlane270) { ...@@ -459,8 +458,8 @@ TEST_F(libyuvTest, RotatePlane270) {
int iw, ih, ow, oh; int iw, ih, ow, oh;
int err = 0; int err = 0;
for (iw = 8; iw < _rotate_max_w && !err; ++iw) for (iw = 8; iw < rotate_max_w_ && !err; ++iw)
for (ih = 8; ih < _rotate_max_h && !err; ++ih) { for (ih = 8; ih < rotate_max_h_ && !err; ++ih) {
int i; int i;
ow = ih; ow = ih;
...@@ -516,8 +515,8 @@ TEST_F(libyuvTest, RotatePlane90and270) { ...@@ -516,8 +515,8 @@ TEST_F(libyuvTest, RotatePlane90and270) {
int iw, ih, ow, oh; int iw, ih, ow, oh;
int err = 0; int err = 0;
for (iw = 16; iw < _rotate_max_w && !err; iw += 4) for (iw = 16; iw < rotate_max_w_ && !err; iw += 4)
for (ih = 16; ih < _rotate_max_h && !err; ih += 4) { for (ih = 16; ih < rotate_max_h_ && !err; ih += 4) {
int i; int i;
ow = ih; ow = ih;
...@@ -561,8 +560,8 @@ TEST_F(libyuvTest, RotatePlane90Pitch) { ...@@ -561,8 +560,8 @@ TEST_F(libyuvTest, RotatePlane90Pitch) {
int iw, ih; int iw, ih;
int err = 0; int err = 0;
for (iw = 16; iw < _rotate_max_w && !err; iw += 4) for (iw = 16; iw < rotate_max_w_ && !err; iw += 4)
for (ih = 16; ih < _rotate_max_h && !err; ih += 4) { for (ih = 16; ih < rotate_max_h_ && !err; ih += 4) {
int i; int i;
int ow = ih; int ow = ih;
...@@ -618,8 +617,8 @@ TEST_F(libyuvTest, RotatePlane270Pitch) { ...@@ -618,8 +617,8 @@ TEST_F(libyuvTest, RotatePlane270Pitch) {
int iw, ih, ow, oh; int iw, ih, ow, oh;
int err = 0; int err = 0;
for (iw = 16; iw < _rotate_max_w && !err; iw += 4) for (iw = 16; iw < rotate_max_w_ && !err; iw += 4)
for (ih = 16; ih < _rotate_max_h && !err; ih += 4) { for (ih = 16; ih < rotate_max_h_ && !err; ih += 4) {
int i; int i;
ow = ih; ow = ih;
......
...@@ -8,13 +8,12 @@ ...@@ -8,13 +8,12 @@
* be found in the AUTHORS file in the root of the source tree. * be found in the AUTHORS file in the root of the source tree.
*/ */
#include "unit_test.h"
#include <stdlib.h> #include <stdlib.h>
#include <time.h> #include <time.h>
#include "libyuv/cpu_id.h" #include "libyuv/cpu_id.h"
#include "libyuv/scale.h" #include "libyuv/scale.h"
#include "unit_test/unit_test.h"
namespace libyuv { namespace libyuv {
......
...@@ -8,15 +8,13 @@ ...@@ -8,15 +8,13 @@
* be found in the AUTHORS file in the root of the source tree. * be found in the AUTHORS file in the root of the source tree.
*/ */
#include "unit_test/unit_test.h"
#include <cstring> #include <cstring>
#include "unit_test.h"
libyuvTest::libyuvTest() : libyuvTest::libyuvTest() : rotate_max_w_(128), rotate_max_h_(128),
_rotate_max_w(128), benchmark_iterations_(1000), benchmark_width_(1280),
_rotate_max_h(128), benchmark_height_(720) {
_benchmark_iterations(1000),
_benchmark_width(1280),
_benchmark_height(720) {
} }
int main(int argc, char** argv) { int main(int argc, char** argv) {
......
...@@ -8,17 +8,17 @@ ...@@ -8,17 +8,17 @@
* be found in the AUTHORS file in the root of the source tree. * be found in the AUTHORS file in the root of the source tree.
*/ */
#ifndef UINIT_TEST_H_ #ifndef UNIT_TEST_UNIT_TEST_H_
#define UINIT_TEST_H_ #define UNIT_TEST_UNIT_TEST_H_
#include <gtest/gtest.h> #include <gtest/gtest.h>
#define align_buffer_16(var, size) \ #define align_buffer_16(var, size) \
uint8 *var; \ uint8* var; \
uint8 *var##_mem; \ uint8* var##_mem; \
var##_mem = reinterpret_cast<uint8*>(calloc((size)+15, sizeof(uint8))); \ var##_mem = reinterpret_cast<uint8*>(calloc((size) + 15, sizeof(uint8))); \
var = reinterpret_cast<uint8*> \ var = reinterpret_cast<uint8*> \
((reinterpret_cast<intptr_t>(var##_mem) + 15) & (~0x0f)); ((reinterpret_cast<intptr_t>(var##_mem) + 15) & (~0x0f)); \
#define free_aligned_buffer_16(var) \ #define free_aligned_buffer_16(var) \
free(var##_mem); \ free(var##_mem); \
...@@ -27,12 +27,11 @@ ...@@ -27,12 +27,11 @@
#ifdef WIN32 #ifdef WIN32
#include <windows.h> #include <windows.h>
static double get_time() static double get_time() {
{ LARGE_INTEGER t, f;
LARGE_INTEGER t, f; QueryPerformanceCounter(&t);
QueryPerformanceCounter(&t); QueryPerformanceFrequency(&f);
QueryPerformanceFrequency(&f); return static_cast<double>(t.QuadPart) / static_cast<double>(f.QuadPart);
return double(t.QuadPart)/double(f.QuadPart);
} }
#define random rand #define random rand
...@@ -46,7 +45,7 @@ static double get_time() { ...@@ -46,7 +45,7 @@ static double get_time() {
struct timeval t; struct timeval t;
struct timezone tzp; struct timezone tzp;
gettimeofday(&t, &tzp); gettimeofday(&t, &tzp);
return t.tv_sec + t.tv_usec*1e-6; return t.tv_sec + t.tv_usec * 1e-6;
} }
#endif #endif
...@@ -55,13 +54,12 @@ class libyuvTest : public ::testing::Test { ...@@ -55,13 +54,12 @@ class libyuvTest : public ::testing::Test {
protected: protected:
libyuvTest(); libyuvTest();
const int _rotate_max_w; const int rotate_max_w_;
const int _rotate_max_h; const int rotate_max_h_;
const int _benchmark_iterations;
const int _benchmark_width;
const int _benchmark_height;
const int benchmark_iterations_;
const int benchmark_width_;
const int benchmark_height_;
}; };
#endif // UNIT_TEST_H_ #endif // UNIT_TEST_UNIT_TEST_H_
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment