Commit d93d4486 authored by fbarchard@google.com

Row functions: Windows uses SSSE3 for YUV-to-RGB conversion; Mac uses SSSE3 for RGB-to-YUV conversion.

Review URL: http://webrtc-codereview.appspot.com/267007

git-svn-id: http://libyuv.googlecode.com/svn/trunk@66 16f28f9a-4ce2-e073-06de-1de4eb20be90
parent 82ba1b77
...@@ -36,7 +36,7 @@ static void SplitUV_NEON(const uint8* src_uv, ...@@ -36,7 +36,7 @@ static void SplitUV_NEON(const uint8* src_uv,
"+r"(dst_v), "+r"(dst_v),
"+r"(pix) // Output registers "+r"(pix) // Output registers
: // Input registers : // Input registers
: "q0", "q1" // Clobber List : "memory", "cc", "q0", "q1" // Clobber List
); );
} }
...@@ -1080,6 +1080,13 @@ int I420ToARGB(const uint8* src_y, int src_stride_y, ...@@ -1080,6 +1080,13 @@ int I420ToARGB(const uint8* src_y, int src_stride_y,
const uint8* v_buf, const uint8* v_buf,
uint8* rgb_buf, uint8* rgb_buf,
int width); int width);
#if defined(HAS_FASTCONVERTYUVTOARGBROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3) &&
(width % 8 == 0) &&
IS_ALIGNED(dst_argb, 16) && (dst_stride_argb % 16 == 0)) {
FastConvertYUVToARGBRow = FastConvertYUVToARGBRow_SSSE3;
} else
#endif
#if defined(HAS_FASTCONVERTYUVTOARGBROW_SSE2) #if defined(HAS_FASTCONVERTYUVTOARGBROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2) && if (TestCpuFlag(kCpuHasSSE2) &&
(width % 4 == 0) && (width % 4 == 0) &&
...@@ -1132,6 +1139,13 @@ int I420ToBGRA(const uint8* src_y, int src_stride_y, ...@@ -1132,6 +1139,13 @@ int I420ToBGRA(const uint8* src_y, int src_stride_y,
const uint8* v_buf, const uint8* v_buf,
uint8* rgb_buf, uint8* rgb_buf,
int width); int width);
#if defined(HAS_FASTCONVERTYUVTOBGRAROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3) &&
(width % 8 == 0) &&
IS_ALIGNED(dst_argb, 16) && (dst_stride_argb % 16 == 0)) {
FastConvertYUVToBGRARow = FastConvertYUVToBGRARow_SSSE3;
} else
#endif
#if defined(HAS_FASTCONVERTYUVTOBGRAROW_SSE2) #if defined(HAS_FASTCONVERTYUVTOBGRAROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2) && if (TestCpuFlag(kCpuHasSSE2) &&
(width % 2 == 0)) { (width % 2 == 0)) {
...@@ -1176,6 +1190,13 @@ int I420ToABGR(const uint8* src_y, int src_stride_y, ...@@ -1176,6 +1190,13 @@ int I420ToABGR(const uint8* src_y, int src_stride_y,
const uint8* v_buf, const uint8* v_buf,
uint8* rgb_buf, uint8* rgb_buf,
int width); int width);
#if defined(HAS_FASTCONVERTYUVTOABGRROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3) &&
(width % 8 == 0) &&
IS_ALIGNED(dst_argb, 16) && (dst_stride_argb % 16 == 0)) {
FastConvertYUVToABGRRow = FastConvertYUVToABGRRow_SSSE3;
} else
#endif
#if defined(HAS_FASTCONVERTYUVTOABGRROW_SSE2) #if defined(HAS_FASTCONVERTYUVTOABGRROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2) && if (TestCpuFlag(kCpuHasSSE2) &&
(width % 2 == 0)) { (width % 2 == 0)) {
...@@ -1220,6 +1241,13 @@ int I422ToARGB(const uint8* src_y, int src_stride_y, ...@@ -1220,6 +1241,13 @@ int I422ToARGB(const uint8* src_y, int src_stride_y,
const uint8* v_buf, const uint8* v_buf,
uint8* rgb_buf, uint8* rgb_buf,
int width); int width);
#if defined(HAS_FASTCONVERTYUVTOARGBROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3) &&
(width % 8 == 0) &&
IS_ALIGNED(dst_argb, 16) && (dst_stride_argb % 16 == 0)) {
FastConvertYUVToARGBRow = FastConvertYUVToARGBRow_SSSE3;
} else
#endif
#if defined(HAS_FASTCONVERTYUVTOARGBROW_SSE2) #if defined(HAS_FASTCONVERTYUVTOARGBROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2) && if (TestCpuFlag(kCpuHasSSE2) &&
(width % 2 == 0)) { (width % 2 == 0)) {
...@@ -1263,6 +1291,13 @@ int I444ToARGB(const uint8* src_y, int src_stride_y, ...@@ -1263,6 +1291,13 @@ int I444ToARGB(const uint8* src_y, int src_stride_y,
const uint8* v_buf, const uint8* v_buf,
uint8* rgb_buf, uint8* rgb_buf,
int width); int width);
#if defined(HAS_FASTCONVERTYUV444TOARGBROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3) &&
(width % 8 == 0) &&
IS_ALIGNED(dst_argb, 16) && (dst_stride_argb % 16 == 0)) {
FastConvertYUV444ToARGBRow = FastConvertYUV444ToARGBRow_SSSE3;
} else
#endif
#if defined(HAS_FASTCONVERTYUVTOARGBROW_SSE2) #if defined(HAS_FASTCONVERTYUVTOARGBROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) { if (TestCpuFlag(kCpuHasSSE2)) {
FastConvertYUV444ToARGBRow = FastConvertYUV444ToARGBRow_SSE2; FastConvertYUV444ToARGBRow = FastConvertYUV444ToARGBRow_SSE2;
...@@ -1300,10 +1335,10 @@ int I400ToARGB_Reference(const uint8* src_y, int src_stride_y, ...@@ -1300,10 +1335,10 @@ int I400ToARGB_Reference(const uint8* src_y, int src_stride_y,
void (*FastConvertYToARGBRow)(const uint8* y_buf, void (*FastConvertYToARGBRow)(const uint8* y_buf,
uint8* rgb_buf, uint8* rgb_buf,
int width); int width);
#if defined(HAS_FASTCONVERTYUVTOARGBROW_SSE2) #if defined(HAS_FASTCONVERTYTOARGBROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2) && if (TestCpuFlag(kCpuHasSSSE3) &&
(width % 2 == 0) && (width % 8 == 0) &&
IS_ALIGNED(dst_argb, 8) && (dst_stride_argb % 8 == 0)) { IS_ALIGNED(dst_argb, 16) && (dst_stride_argb % 16 == 0)) {
FastConvertYToARGBRow = FastConvertYToARGBRow_SSE2; FastConvertYToARGBRow = FastConvertYToARGBRow_SSE2;
} else } else
#endif #endif
......
...@@ -15,51 +15,61 @@ ...@@ -15,51 +15,61 @@
#define kMaxStride (2048 * 4) #define kMaxStride (2048 * 4)
// Assembly row functions are switched off for coverage builds and for the
// iPhone simulator. The HAS_* guards below test LIBYUV_DISABLE_ASM, so that
// is the name that must be defined for the opt-out to take effect.
// YUV_DISABLE_ASM is still defined so code testing the shorter name keeps
// seeing it.
#if defined(COVERAGE_ENABLED) || defined(TARGET_IPHONE_SIMULATOR)
#define YUV_DISABLE_ASM
#if !defined(LIBYUV_DISABLE_ASM)
#define LIBYUV_DISABLE_ASM
#endif
#endif
// The following are available on all x86 platforms // The following are available on all x86 platforms
#if (defined(WIN32) || defined(__x86_64__) || defined(__i386__)) && \ #if (defined(WIN32) || defined(__x86_64__) || defined(__i386__)) && \
!defined(COVERAGE_ENABLED) && !defined(TARGET_IPHONE_SIMULATOR) !defined(LIBYUV_DISABLE_ASM)
#define HAS_ABGRTOARGBROW_SSSE3 #define HAS_ABGRTOARGBROW_SSSE3
#define HAS_BGRATOARGBROW_SSSE3 #define HAS_BGRATOARGBROW_SSSE3
#define HAS_ARGBTOYROW_SSSE3
#define HAS_BG24TOARGBROW_SSSE3 #define HAS_BG24TOARGBROW_SSSE3
#define HAS_RAWTOARGBROW_SSSE3 #define HAS_RAWTOARGBROW_SSSE3
#define HAS_RGB24TOYROW_SSSE3 #define HAS_RGB24TOYROW_SSSE3
#define HAS_RAWTOYROW_SSSE3 #define HAS_RAWTOYROW_SSSE3
#define HAS_RGB24TOUVROW_SSSE3 #define HAS_RGB24TOUVROW_SSSE3
#define HAS_RAWTOUVROW_SSSE3 #define HAS_RAWTOUVROW_SSSE3
#define HAS_ARGBTOYROW_SSSE3
#define HAS_BGRATOYROW_SSSE3 #define HAS_BGRATOYROW_SSSE3
#define HAS_ABGRTOYROW_SSSE3 #define HAS_ABGRTOYROW_SSSE3
#define HAS_I400TOARGBROW_SSE2
#endif
// The following are available on Windows and Linux
#if (defined(WIN32) || defined(__x86_64__) || \
(defined(__i386__) && !defined(__pic__))) && \
!defined(COVERAGE_ENABLED) && !defined(TARGET_IPHONE_SIMULATOR)
#define HAS_ARGBTOUVROW_SSSE3 #define HAS_ARGBTOUVROW_SSSE3
#define HAS_BGRATOUVROW_SSSE3 #define HAS_BGRATOUVROW_SSSE3
#define HAS_ABGRTOUVROW_SSSE3 #define HAS_ABGRTOUVROW_SSSE3
#define HAS_I400TOARGBROW_SSE2
#endif #endif
// The following are available on Linux (32/64 bit) // The following are available on Linux (32/64 bit)
// TODO(fbarchard): enable for fpic on linux // TODO(fbarchard): enable for fpic on linux
#if (defined(__x86_64__) || \ #if (defined(__x86_64__) || \
(defined(__i386__) && !defined(__pic__))) && \ (defined(__i386__) && !defined(__pic__))) && \
!defined(COVERAGE_ENABLED) && !defined(TARGET_IPHONE_SIMULATOR) !defined(LIBYUV_DISABLE_ASM)
#define HAS_FASTCONVERTYUVTOARGBROW_SSE2 #define HAS_FASTCONVERTYUVTOARGBROW_SSE2
#define HAS_FASTCONVERTYUVTOBGRAROW_SSE2 #define HAS_FASTCONVERTYUVTOBGRAROW_SSE2
#define HAS_FASTCONVERTYUVTOABGRROW_SSE2 #define HAS_FASTCONVERTYUVTOABGRROW_SSE2
#define HAS_FASTCONVERTYUV444TOARGBROW_SSE2
#define HAS_FASTCONVERTYTOARGBROW_SSE2
#endif #endif
// The following are available on Windows and GCC 32 bit // The following are available on Windows and GCC 32 bit
#if (defined(WIN32) || \ #if (defined(WIN32) || \
defined(__i386__)) && \ defined(__i386__)) && \
!defined(COVERAGE_ENABLED) && !defined(TARGET_IPHONE_SIMULATOR) !defined(LIBYUV_DISABLE_ASM)
#define HAS_FASTCONVERTYUVTOARGBROW_MMX #define HAS_FASTCONVERTYUVTOARGBROW_MMX
#define HAS_FASTCONVERTYUVTOBGRAROW_MMX #define HAS_FASTCONVERTYUVTOBGRAROW_MMX
#define HAS_FASTCONVERTYUVTOABGRROW_MMX #define HAS_FASTCONVERTYUVTOABGRROW_MMX
#endif #endif
// The following are available on Windows
#if defined(WIN32) && \
!defined(LIBYUV_DISABLE_ASM)
#define HAS_FASTCONVERTYUVTOARGBROW_SSSE3
#define HAS_FASTCONVERTYUVTOBGRAROW_SSSE3
#define HAS_FASTCONVERTYUVTOABGRROW_SSSE3
#define HAS_FASTCONVERTYUV444TOARGBROW_SSSE3
#define HAS_FASTCONVERTYTOARGBROW_SSE2
#endif
extern "C" { extern "C" {
#ifdef HAS_ARGBTOYROW_SSSE3 #ifdef HAS_ARGBTOYROW_SSSE3
...@@ -224,6 +234,40 @@ void FastConvertYToARGBRow_MMX(const uint8* y_buf, ...@@ -224,6 +234,40 @@ void FastConvertYToARGBRow_MMX(const uint8* y_buf,
int width); int width);
#endif #endif
#ifdef HAS_FASTCONVERTYUVTOARGBROW_SSSE3
void FastConvertYUVToARGBRow_SSSE3(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width);
void FastConvertYUVToBGRARow_SSSE3(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width);
void FastConvertYUVToABGRRow_SSSE3(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width);
void FastConvertYUV444ToARGBRow_SSSE3(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width);
#endif
#ifdef HAS_FASTCONVERTYTOARGBROW_SSE2
void FastConvertYToARGBRow_SSE2(const uint8* y_buf,
uint8* rgb_buf,
int width);
#endif
// Method to force C version. // Method to force C version.
//#define USE_MMX 0 //#define USE_MMX 0
//#define USE_SSE2 0 //#define USE_SSE2 0
......
...@@ -254,36 +254,46 @@ void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { ...@@ -254,36 +254,46 @@ void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int width) { uint8* dst_u, uint8* dst_v, int width) {
asm volatile( asm volatile(
"movdqa %5,%%xmm7\n" "movdqa %0,%%xmm4\n"
"movdqa %6,%%xmm6\n" "movdqa %1,%%xmm3\n"
"movdqa %7,%%xmm5\n" "movdqa %2,%%xmm5\n"
:
: "m"(kARGBToU), // %0
"m"(kARGBToV), // %1
"m"(kAddUV128) // %2
:
#if defined(__SSE2__)
"xmm3", "xmm4", "xmm5"
#endif
);
asm volatile(
"sub %1,%2\n" "sub %1,%2\n"
"1:" "1:"
"movdqa (%0),%%xmm0\n" "movdqa (%0),%%xmm0\n"
"movdqa 0x10(%0),%%xmm1\n" "movdqa 0x10(%0),%%xmm1\n"
"movdqa 0x20(%0),%%xmm2\n" "movdqa 0x20(%0),%%xmm2\n"
"movdqa 0x30(%0),%%xmm3\n" "movdqa 0x30(%0),%%xmm6\n"
"pavgb (%0,%4,1),%%xmm0\n" "pavgb (%0,%4,1),%%xmm0\n"
"pavgb 0x10(%0,%4,1),%%xmm1\n" "pavgb 0x10(%0,%4,1),%%xmm1\n"
"pavgb 0x20(%0,%4,1),%%xmm2\n" "pavgb 0x20(%0,%4,1),%%xmm2\n"
"pavgb 0x30(%0,%4,1),%%xmm3\n" "pavgb 0x30(%0,%4,1),%%xmm6\n"
"lea 0x40(%0),%0\n" "lea 0x40(%0),%0\n"
"movdqa %%xmm0,%%xmm4\n" "movdqa %%xmm0,%%xmm7\n"
"shufps $0x88,%%xmm1,%%xmm0\n" "shufps $0x88,%%xmm1,%%xmm0\n"
"shufps $0xdd,%%xmm1,%%xmm4\n" "shufps $0xdd,%%xmm1,%%xmm7\n"
"pavgb %%xmm4,%%xmm0\n" "pavgb %%xmm7,%%xmm0\n"
"movdqa %%xmm2,%%xmm4\n" "movdqa %%xmm2,%%xmm7\n"
"shufps $0x88,%%xmm3,%%xmm2\n" "shufps $0x88,%%xmm6,%%xmm2\n"
"shufps $0xdd,%%xmm3,%%xmm4\n" "shufps $0xdd,%%xmm6,%%xmm7\n"
"pavgb %%xmm4,%%xmm2\n" "pavgb %%xmm7,%%xmm2\n"
"movdqa %%xmm0,%%xmm1\n" "movdqa %%xmm0,%%xmm1\n"
"movdqa %%xmm2,%%xmm3\n" "movdqa %%xmm2,%%xmm6\n"
"pmaddubsw %%xmm7,%%xmm0\n" "pmaddubsw %%xmm4,%%xmm0\n"
"pmaddubsw %%xmm7,%%xmm2\n" "pmaddubsw %%xmm4,%%xmm2\n"
"pmaddubsw %%xmm6,%%xmm1\n" "pmaddubsw %%xmm3,%%xmm1\n"
"pmaddubsw %%xmm6,%%xmm3\n" "pmaddubsw %%xmm3,%%xmm6\n"
"phaddw %%xmm2,%%xmm0\n" "phaddw %%xmm2,%%xmm0\n"
"phaddw %%xmm3,%%xmm1\n" "phaddw %%xmm6,%%xmm1\n"
"psraw $0x8,%%xmm0\n" "psraw $0x8,%%xmm0\n"
"psraw $0x8,%%xmm1\n" "psraw $0x8,%%xmm1\n"
"packsswb %%xmm1,%%xmm0\n" "packsswb %%xmm1,%%xmm0\n"
...@@ -297,13 +307,10 @@ void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, ...@@ -297,13 +307,10 @@ void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
"+r"(dst_u), // %1 "+r"(dst_u), // %1
"+r"(dst_v), // %2 "+r"(dst_v), // %2
"+rm"(width) // %3 "+rm"(width) // %3
: "r"(static_cast<intptr_t>(src_stride_argb)), // %4 : "r"(static_cast<intptr_t>(src_stride_argb))
"m"(kARGBToU), // %5
"m"(kARGBToV), // %6
"m"(kAddUV128) // %7
: "memory", "cc" : "memory", "cc"
#if defined(__SSE2__) #if defined(__SSE2__)
, "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
#endif #endif
); );
} }
......
...@@ -208,25 +208,27 @@ SIMD_ALIGNED(const int16 NAME[256 * 3][4]) = {\ ...@@ -208,25 +208,27 @@ SIMD_ALIGNED(const int16 NAME[256 * 3][4]) = {\
RGBV(0xFC), RGBV(0xFD), RGBV(0xFE), RGBV(0xFF), \ RGBV(0xFC), RGBV(0xFD), RGBV(0xFE), RGBV(0xFF), \
}; };
#define CS(v) static_cast<int16>(v)
// ARGB table // ARGB table
#define RGBY(i) { \ #define RGBY(i) { \
static_cast<int16>(1.164 * 64 * (i - 16) + 0.5), \ CS(1.164 * 64 * (i - 16) + 0.5), \
static_cast<int16>(1.164 * 64 * (i - 16) + 0.5), \ CS(1.164 * 64 * (i - 16) + 0.5), \
static_cast<int16>(1.164 * 64 * (i - 16) + 0.5), \ CS(1.164 * 64 * (i - 16) + 0.5), \
static_cast<int16>(256 * 64 - 1) \ CS(256 * 64 - 1) \
} }
#define RGBU(i) { \ #define RGBU(i) { \
static_cast<int16>(2.018 * 64 * (i - 128) + 0.5), \ CS(2.018 * 64 * (i - 128) + 0.5), \
static_cast<int16>(-0.391 * 64 * (i - 128) + 0.5), \ CS(-0.391 * 64 * (i - 128) - 0.5), \
0, \ 0, \
0 \ 0 \
} }
#define RGBV(i) { \ #define RGBV(i) { \
0, \ 0, \
static_cast<int16>(-0.813 * 64 * (i - 128) + 0.5), \ CS(-0.813 * 64 * (i - 128) - 0.5), \
static_cast<int16>(1.596 * 64 * (i - 128) + 0.5), \ CS(1.596 * 64 * (i - 128) + 0.5), \
0 \ 0 \
} }
...@@ -238,23 +240,23 @@ MAKETABLE(kCoefficientsRgbY) ...@@ -238,23 +240,23 @@ MAKETABLE(kCoefficientsRgbY)
// BGRA table // BGRA table
#define RGBY(i) { \ #define RGBY(i) { \
static_cast<int16>(256 * 64 - 1), \ CS(256 * 64 - 1), \
static_cast<int16>(1.164 * 64 * (i - 16) + 0.5), \ CS(1.164 * 64 * (i - 16) + 0.5), \
static_cast<int16>(1.164 * 64 * (i - 16) + 0.5), \ CS(1.164 * 64 * (i - 16) + 0.5), \
static_cast<int16>(1.164 * 64 * (i - 16) + 0.5) \ CS(1.164 * 64 * (i - 16) + 0.5) \
} }
#define RGBU(i) { \ #define RGBU(i) { \
0, \ 0, \
0, \ 0, \
static_cast<int16>(-0.391 * 64 * (i - 128) + 0.5), \ CS(-0.391 * 64 * (i - 128) - 0.5), \
static_cast<int16>(2.018 * 64 * (i - 128) + 0.5) \ CS(2.018 * 64 * (i - 128) + 0.5) \
} }
#define RGBV(i) { \ #define RGBV(i) { \
0, \ 0, \
static_cast<int16>(1.596 * 64 * (i - 128) + 0.5), \ CS(1.596 * 64 * (i - 128) + 0.5), \
static_cast<int16>(-0.813 * 64 * (i - 128) + 0.5), \ CS(-0.813 * 64 * (i - 128) - 0.5), \
0 \ 0 \
} }
...@@ -266,22 +268,22 @@ MAKETABLE(kCoefficientsBgraY) ...@@ -266,22 +268,22 @@ MAKETABLE(kCoefficientsBgraY)
// ABGR table // ABGR table
#define RGBY(i) { \ #define RGBY(i) { \
static_cast<int16>(1.164 * 64 * (i - 16) + 0.5), \ CS(1.164 * 64 * (i - 16) + 0.5), \
static_cast<int16>(1.164 * 64 * (i - 16) + 0.5), \ CS(1.164 * 64 * (i - 16) + 0.5), \
static_cast<int16>(1.164 * 64 * (i - 16) + 0.5), \ CS(1.164 * 64 * (i - 16) + 0.5), \
static_cast<int16>(256 * 64 - 1) \ CS(256 * 64 - 1) \
} }
#define RGBU(i) { \ #define RGBU(i) { \
0, \ 0, \
static_cast<int16>(-0.391 * 64 * (i - 128) + 0.5), \ CS(-0.391 * 64 * (i - 128) - 0.5), \
static_cast<int16>(2.018 * 64 * (i - 128) + 0.5), \ CS(2.018 * 64 * (i - 128) + 0.5), \
0 \ 0 \
} }
#define RGBV(i) { \ #define RGBV(i) { \
static_cast<int16>(1.596 * 64 * (i - 128) + 0.5), \ CS(1.596 * 64 * (i - 128) + 0.5), \
static_cast<int16>(-0.813 * 64 * (i - 128) + 0.5), \ CS(-0.813 * 64 * (i - 128) - 0.5), \
0, \ 0, \
0 \ 0 \
} }
......
This diff is collapsed.
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment