Commit d93d4486 authored by fbarchard@google.com's avatar fbarchard@google.com

Row functions for Windows use SSSE3 for YUV to RGB; Mac uses SSE3 for RGB to YUV.

Review URL: http://webrtc-codereview.appspot.com/267007

git-svn-id: http://libyuv.googlecode.com/svn/trunk@66 16f28f9a-4ce2-e073-06de-1de4eb20be90
parent 82ba1b77
......@@ -36,7 +36,7 @@ static void SplitUV_NEON(const uint8* src_uv,
"+r"(dst_v),
"+r"(pix) // Output registers
: // Input registers
: "q0", "q1" // Clobber List
: "memory", "cc", "q0", "q1" // Clobber List
);
}
......@@ -1080,6 +1080,13 @@ int I420ToARGB(const uint8* src_y, int src_stride_y,
const uint8* v_buf,
uint8* rgb_buf,
int width);
#if defined(HAS_FASTCONVERTYUVTOARGBROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3) &&
(width % 8 == 0) &&
IS_ALIGNED(dst_argb, 16) && (dst_stride_argb % 16 == 0)) {
FastConvertYUVToARGBRow = FastConvertYUVToARGBRow_SSSE3;
} else
#endif
#if defined(HAS_FASTCONVERTYUVTOARGBROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2) &&
(width % 4 == 0) &&
......@@ -1132,6 +1139,13 @@ int I420ToBGRA(const uint8* src_y, int src_stride_y,
const uint8* v_buf,
uint8* rgb_buf,
int width);
#if defined(HAS_FASTCONVERTYUVTOBGRAROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3) &&
(width % 8 == 0) &&
IS_ALIGNED(dst_argb, 16) && (dst_stride_argb % 16 == 0)) {
FastConvertYUVToBGRARow = FastConvertYUVToBGRARow_SSSE3;
} else
#endif
#if defined(HAS_FASTCONVERTYUVTOBGRAROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2) &&
(width % 2 == 0)) {
......@@ -1176,6 +1190,13 @@ int I420ToABGR(const uint8* src_y, int src_stride_y,
const uint8* v_buf,
uint8* rgb_buf,
int width);
#if defined(HAS_FASTCONVERTYUVTOABGRROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3) &&
(width % 8 == 0) &&
IS_ALIGNED(dst_argb, 16) && (dst_stride_argb % 16 == 0)) {
FastConvertYUVToABGRRow = FastConvertYUVToABGRRow_SSSE3;
} else
#endif
#if defined(HAS_FASTCONVERTYUVTOABGRROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2) &&
(width % 2 == 0)) {
......@@ -1220,6 +1241,13 @@ int I422ToARGB(const uint8* src_y, int src_stride_y,
const uint8* v_buf,
uint8* rgb_buf,
int width);
#if defined(HAS_FASTCONVERTYUVTOARGBROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3) &&
(width % 8 == 0) &&
IS_ALIGNED(dst_argb, 16) && (dst_stride_argb % 16 == 0)) {
FastConvertYUVToARGBRow = FastConvertYUVToARGBRow_SSSE3;
} else
#endif
#if defined(HAS_FASTCONVERTYUVTOARGBROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2) &&
(width % 2 == 0)) {
......@@ -1263,6 +1291,13 @@ int I444ToARGB(const uint8* src_y, int src_stride_y,
const uint8* v_buf,
uint8* rgb_buf,
int width);
#if defined(HAS_FASTCONVERTYUV444TOARGBROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3) &&
(width % 8 == 0) &&
IS_ALIGNED(dst_argb, 16) && (dst_stride_argb % 16 == 0)) {
FastConvertYUV444ToARGBRow = FastConvertYUV444ToARGBRow_SSSE3;
} else
#endif
#if defined(HAS_FASTCONVERTYUVTOARGBROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) {
FastConvertYUV444ToARGBRow = FastConvertYUV444ToARGBRow_SSE2;
......@@ -1300,10 +1335,10 @@ int I400ToARGB_Reference(const uint8* src_y, int src_stride_y,
void (*FastConvertYToARGBRow)(const uint8* y_buf,
uint8* rgb_buf,
int width);
#if defined(HAS_FASTCONVERTYUVTOARGBROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2) &&
(width % 2 == 0) &&
IS_ALIGNED(dst_argb, 8) && (dst_stride_argb % 8 == 0)) {
#if defined(HAS_FASTCONVERTYTOARGBROW_SSE2)
if (TestCpuFlag(kCpuHasSSSE3) &&
(width % 8 == 0) &&
IS_ALIGNED(dst_argb, 16) && (dst_stride_argb % 16 == 0)) {
FastConvertYToARGBRow = FastConvertYToARGBRow_SSE2;
} else
#endif
......
......@@ -15,51 +15,61 @@
#define kMaxStride (2048 * 4)
#if defined(COVERAGE_ENABLED) || defined(TARGET_IPHONE_SIMULATOR)
#define YUV_DISABLE_ASM
#endif
// The following are available on all x86 platforms
#if (defined(WIN32) || defined(__x86_64__) || defined(__i386__)) && \
!defined(COVERAGE_ENABLED) && !defined(TARGET_IPHONE_SIMULATOR)
!defined(LIBYUV_DISABLE_ASM)
#define HAS_ABGRTOARGBROW_SSSE3
#define HAS_BGRATOARGBROW_SSSE3
#define HAS_ARGBTOYROW_SSSE3
#define HAS_BG24TOARGBROW_SSSE3
#define HAS_RAWTOARGBROW_SSSE3
#define HAS_RGB24TOYROW_SSSE3
#define HAS_RAWTOYROW_SSSE3
#define HAS_RGB24TOUVROW_SSSE3
#define HAS_RAWTOUVROW_SSSE3
#define HAS_ARGBTOYROW_SSSE3
#define HAS_BGRATOYROW_SSSE3
#define HAS_ABGRTOYROW_SSSE3
#define HAS_I400TOARGBROW_SSE2
#endif
// The following are available on Windows and Linux
#if (defined(WIN32) || defined(__x86_64__) || \
(defined(__i386__) && !defined(__pic__))) && \
!defined(COVERAGE_ENABLED) && !defined(TARGET_IPHONE_SIMULATOR)
#define HAS_ARGBTOUVROW_SSSE3
#define HAS_BGRATOUVROW_SSSE3
#define HAS_ABGRTOUVROW_SSSE3
#define HAS_I400TOARGBROW_SSE2
#endif
// The following are available on Linux (32/64 bit)
// TODO(fbarchard): enable for fpic on linux
#if (defined(__x86_64__) || \
(defined(__i386__) && !defined(__pic__))) && \
!defined(COVERAGE_ENABLED) && !defined(TARGET_IPHONE_SIMULATOR)
!defined(LIBYUV_DISABLE_ASM)
#define HAS_FASTCONVERTYUVTOARGBROW_SSE2
#define HAS_FASTCONVERTYUVTOBGRAROW_SSE2
#define HAS_FASTCONVERTYUVTOABGRROW_SSE2
#define HAS_FASTCONVERTYUV444TOARGBROW_SSE2
#define HAS_FASTCONVERTYTOARGBROW_SSE2
#endif
// The following are available on Windows and GCC 32 bit
#if (defined(WIN32) || \
defined(__i386__)) && \
!defined(COVERAGE_ENABLED) && !defined(TARGET_IPHONE_SIMULATOR)
!defined(LIBYUV_DISABLE_ASM)
#define HAS_FASTCONVERTYUVTOARGBROW_MMX
#define HAS_FASTCONVERTYUVTOBGRAROW_MMX
#define HAS_FASTCONVERTYUVTOABGRROW_MMX
#endif
// The following are available on Windows
#if defined(WIN32) && \
!defined(LIBYUV_DISABLE_ASM)
#define HAS_FASTCONVERTYUVTOARGBROW_SSSE3
#define HAS_FASTCONVERTYUVTOBGRAROW_SSSE3
#define HAS_FASTCONVERTYUVTOABGRROW_SSSE3
#define HAS_FASTCONVERTYUV444TOARGBROW_SSSE3
#define HAS_FASTCONVERTYTOARGBROW_SSE2
#endif
extern "C" {
#ifdef HAS_ARGBTOYROW_SSSE3
......@@ -224,6 +234,40 @@ void FastConvertYToARGBRow_MMX(const uint8* y_buf,
int width);
#endif
#ifdef HAS_FASTCONVERTYUVTOARGBROW_SSSE3
void FastConvertYUVToARGBRow_SSSE3(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width);
void FastConvertYUVToBGRARow_SSSE3(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width);
void FastConvertYUVToABGRRow_SSSE3(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width);
void FastConvertYUV444ToARGBRow_SSSE3(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width);
#endif
#ifdef HAS_FASTCONVERTYTOARGBROW_SSE2
void FastConvertYToARGBRow_SSE2(const uint8* y_buf,
uint8* rgb_buf,
int width);
#endif
// Method to force C version.
//#define USE_MMX 0
//#define USE_SSE2 0
......
......@@ -253,37 +253,47 @@ void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
#ifdef HAS_ARGBTOUVROW_SSSE3
void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int width) {
asm volatile(
"movdqa %5,%%xmm7\n"
"movdqa %6,%%xmm6\n"
"movdqa %7,%%xmm5\n"
asm volatile(
"movdqa %0,%%xmm4\n"
"movdqa %1,%%xmm3\n"
"movdqa %2,%%xmm5\n"
:
: "m"(kARGBToU), // %0
"m"(kARGBToV), // %1
"m"(kAddUV128) // %2
:
#if defined(__SSE2__)
"xmm3", "xmm4", "xmm5"
#endif
);
asm volatile(
"sub %1,%2\n"
"1:"
"movdqa (%0),%%xmm0\n"
"movdqa 0x10(%0),%%xmm1\n"
"movdqa 0x20(%0),%%xmm2\n"
"movdqa 0x30(%0),%%xmm3\n"
"movdqa 0x30(%0),%%xmm6\n"
"pavgb (%0,%4,1),%%xmm0\n"
"pavgb 0x10(%0,%4,1),%%xmm1\n"
"pavgb 0x20(%0,%4,1),%%xmm2\n"
"pavgb 0x30(%0,%4,1),%%xmm3\n"
"pavgb 0x30(%0,%4,1),%%xmm6\n"
"lea 0x40(%0),%0\n"
"movdqa %%xmm0,%%xmm4\n"
"movdqa %%xmm0,%%xmm7\n"
"shufps $0x88,%%xmm1,%%xmm0\n"
"shufps $0xdd,%%xmm1,%%xmm4\n"
"pavgb %%xmm4,%%xmm0\n"
"movdqa %%xmm2,%%xmm4\n"
"shufps $0x88,%%xmm3,%%xmm2\n"
"shufps $0xdd,%%xmm3,%%xmm4\n"
"pavgb %%xmm4,%%xmm2\n"
"shufps $0xdd,%%xmm1,%%xmm7\n"
"pavgb %%xmm7,%%xmm0\n"
"movdqa %%xmm2,%%xmm7\n"
"shufps $0x88,%%xmm6,%%xmm2\n"
"shufps $0xdd,%%xmm6,%%xmm7\n"
"pavgb %%xmm7,%%xmm2\n"
"movdqa %%xmm0,%%xmm1\n"
"movdqa %%xmm2,%%xmm3\n"
"pmaddubsw %%xmm7,%%xmm0\n"
"pmaddubsw %%xmm7,%%xmm2\n"
"pmaddubsw %%xmm6,%%xmm1\n"
"pmaddubsw %%xmm6,%%xmm3\n"
"movdqa %%xmm2,%%xmm6\n"
"pmaddubsw %%xmm4,%%xmm0\n"
"pmaddubsw %%xmm4,%%xmm2\n"
"pmaddubsw %%xmm3,%%xmm1\n"
"pmaddubsw %%xmm3,%%xmm6\n"
"phaddw %%xmm2,%%xmm0\n"
"phaddw %%xmm3,%%xmm1\n"
"phaddw %%xmm6,%%xmm1\n"
"psraw $0x8,%%xmm0\n"
"psraw $0x8,%%xmm1\n"
"packsswb %%xmm1,%%xmm0\n"
......@@ -297,13 +307,10 @@ void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
"+r"(dst_u), // %1
"+r"(dst_v), // %2
"+rm"(width) // %3
: "r"(static_cast<intptr_t>(src_stride_argb)), // %4
"m"(kARGBToU), // %5
"m"(kARGBToV), // %6
"m"(kAddUV128) // %7
: "r"(static_cast<intptr_t>(src_stride_argb))
: "memory", "cc"
#if defined(__SSE2__)
, "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
, "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
#endif
);
}
......
......@@ -208,25 +208,27 @@ SIMD_ALIGNED(const int16 NAME[256 * 3][4]) = {\
RGBV(0xFC), RGBV(0xFD), RGBV(0xFE), RGBV(0xFF), \
};
#define CS(v) static_cast<int16>(v)
// ARGB table
#define RGBY(i) { \
static_cast<int16>(1.164 * 64 * (i - 16) + 0.5), \
static_cast<int16>(1.164 * 64 * (i - 16) + 0.5), \
static_cast<int16>(1.164 * 64 * (i - 16) + 0.5), \
static_cast<int16>(256 * 64 - 1) \
CS(1.164 * 64 * (i - 16) + 0.5), \
CS(1.164 * 64 * (i - 16) + 0.5), \
CS(1.164 * 64 * (i - 16) + 0.5), \
CS(256 * 64 - 1) \
}
#define RGBU(i) { \
static_cast<int16>(2.018 * 64 * (i - 128) + 0.5), \
static_cast<int16>(-0.391 * 64 * (i - 128) + 0.5), \
CS(2.018 * 64 * (i - 128) + 0.5), \
CS(-0.391 * 64 * (i - 128) - 0.5), \
0, \
0 \
}
#define RGBV(i) { \
0, \
static_cast<int16>(-0.813 * 64 * (i - 128) + 0.5), \
static_cast<int16>(1.596 * 64 * (i - 128) + 0.5), \
CS(-0.813 * 64 * (i - 128) - 0.5), \
CS(1.596 * 64 * (i - 128) + 0.5), \
0 \
}
......@@ -238,23 +240,23 @@ MAKETABLE(kCoefficientsRgbY)
// BGRA table
#define RGBY(i) { \
static_cast<int16>(256 * 64 - 1), \
static_cast<int16>(1.164 * 64 * (i - 16) + 0.5), \
static_cast<int16>(1.164 * 64 * (i - 16) + 0.5), \
static_cast<int16>(1.164 * 64 * (i - 16) + 0.5) \
CS(256 * 64 - 1), \
CS(1.164 * 64 * (i - 16) + 0.5), \
CS(1.164 * 64 * (i - 16) + 0.5), \
CS(1.164 * 64 * (i - 16) + 0.5) \
}
#define RGBU(i) { \
0, \
0, \
static_cast<int16>(-0.391 * 64 * (i - 128) + 0.5), \
static_cast<int16>(2.018 * 64 * (i - 128) + 0.5) \
CS(-0.391 * 64 * (i - 128) - 0.5), \
CS(2.018 * 64 * (i - 128) + 0.5) \
}
#define RGBV(i) { \
0, \
static_cast<int16>(1.596 * 64 * (i - 128) + 0.5), \
static_cast<int16>(-0.813 * 64 * (i - 128) + 0.5), \
CS(1.596 * 64 * (i - 128) + 0.5), \
CS(-0.813 * 64 * (i - 128) - 0.5), \
0 \
}
......@@ -266,22 +268,22 @@ MAKETABLE(kCoefficientsBgraY)
// ABGR table
#define RGBY(i) { \
static_cast<int16>(1.164 * 64 * (i - 16) + 0.5), \
static_cast<int16>(1.164 * 64 * (i - 16) + 0.5), \
static_cast<int16>(1.164 * 64 * (i - 16) + 0.5), \
static_cast<int16>(256 * 64 - 1) \
CS(1.164 * 64 * (i - 16) + 0.5), \
CS(1.164 * 64 * (i - 16) + 0.5), \
CS(1.164 * 64 * (i - 16) + 0.5), \
CS(256 * 64 - 1) \
}
#define RGBU(i) { \
0, \
static_cast<int16>(-0.391 * 64 * (i - 128) + 0.5), \
static_cast<int16>(2.018 * 64 * (i - 128) + 0.5), \
CS(-0.391 * 64 * (i - 128) - 0.5), \
CS(2.018 * 64 * (i - 128) + 0.5), \
0 \
}
#define RGBV(i) { \
static_cast<int16>(1.596 * 64 * (i - 128) + 0.5), \
static_cast<int16>(-0.813 * 64 * (i - 128) + 0.5), \
CS(1.596 * 64 * (i - 128) + 0.5), \
CS(-0.813 * 64 * (i - 128) - 0.5), \
0, \
0 \
}
......
This diff is collapsed.
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment