Commit d93d4486 authored by fbarchard@google.com

Row functions for Windows use SSSE3 for YUV to RGB; Mac uses SSSE3 for RGB to YUV.

Review URL: http://webrtc-codereview.appspot.com/267007

git-svn-id: http://libyuv.googlecode.com/svn/trunk@66 16f28f9a-4ce2-e073-06de-1de4eb20be90
parent 82ba1b77
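The bulk of this change is the run-time row-function selection that the conversion functions below repeat for each format. For reference, a minimal C++ sketch of that dispatch pattern (TestCpuFlag, IS_ALIGNED and the _SSSE3/_SSE2 row names follow the diff; the C fallback name is an assumption):

void (*FastConvertYUVToARGBRow)(const uint8* y_buf, const uint8* u_buf,
                                const uint8* v_buf, uint8* rgb_buf, int width);
#if defined(HAS_FASTCONVERTYUVTOARGBROW_SSSE3)
  // Fastest path: needs SSSE3, a multiple-of-8 width and a 16-byte aligned
  // destination, because the row stores with movdqa.
  if (TestCpuFlag(kCpuHasSSSE3) && (width % 8 == 0) &&
      IS_ALIGNED(dst_argb, 16) && (dst_stride_argb % 16 == 0)) {
    FastConvertYUVToARGBRow = FastConvertYUVToARGBRow_SSSE3;
  } else
#endif
#if defined(HAS_FASTCONVERTYUVTOARGBROW_SSE2)
  // Next best: the SSE2 table-based row built for Linux/GCC in this revision.
  if (TestCpuFlag(kCpuHasSSE2) && (width % 4 == 0)) {
    FastConvertYUVToARGBRow = FastConvertYUVToARGBRow_SSE2;
  } else
#endif
  {
    FastConvertYUVToARGBRow = FastConvertYUVToARGBRow_C;  // C fallback (name assumed)
  }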
......@@ -36,7 +36,7 @@ static void SplitUV_NEON(const uint8* src_uv,
"+r"(dst_v),
"+r"(pix) // Output registers
: // Input registers
: "q0", "q1" // Clobber List
: "memory", "cc", "q0", "q1" // Clobber List
);
}
......@@ -1080,6 +1080,13 @@ int I420ToARGB(const uint8* src_y, int src_stride_y,
const uint8* v_buf,
uint8* rgb_buf,
int width);
#if defined(HAS_FASTCONVERTYUVTOARGBROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3) &&
(width % 8 == 0) &&
IS_ALIGNED(dst_argb, 16) && (dst_stride_argb % 16 == 0)) {
FastConvertYUVToARGBRow = FastConvertYUVToARGBRow_SSSE3;
} else
#endif
#if defined(HAS_FASTCONVERTYUVTOARGBROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2) &&
(width % 4 == 0) &&
......@@ -1132,6 +1139,13 @@ int I420ToBGRA(const uint8* src_y, int src_stride_y,
const uint8* v_buf,
uint8* rgb_buf,
int width);
#if defined(HAS_FASTCONVERTYUVTOBGRAROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3) &&
(width % 8 == 0) &&
IS_ALIGNED(dst_argb, 16) && (dst_stride_argb % 16 == 0)) {
FastConvertYUVToBGRARow = FastConvertYUVToBGRARow_SSSE3;
} else
#endif
#if defined(HAS_FASTCONVERTYUVTOBGRAROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2) &&
(width % 2 == 0)) {
......@@ -1176,6 +1190,13 @@ int I420ToABGR(const uint8* src_y, int src_stride_y,
const uint8* v_buf,
uint8* rgb_buf,
int width);
#if defined(HAS_FASTCONVERTYUVTOABGRROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3) &&
(width % 8 == 0) &&
IS_ALIGNED(dst_argb, 16) && (dst_stride_argb % 16 == 0)) {
FastConvertYUVToABGRRow = FastConvertYUVToABGRRow_SSSE3;
} else
#endif
#if defined(HAS_FASTCONVERTYUVTOABGRROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2) &&
(width % 2 == 0)) {
......@@ -1220,6 +1241,13 @@ int I422ToARGB(const uint8* src_y, int src_stride_y,
const uint8* v_buf,
uint8* rgb_buf,
int width);
#if defined(HAS_FASTCONVERTYUVTOARGBROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3) &&
(width % 8 == 0) &&
IS_ALIGNED(dst_argb, 16) && (dst_stride_argb % 16 == 0)) {
FastConvertYUVToARGBRow = FastConvertYUVToARGBRow_SSSE3;
} else
#endif
#if defined(HAS_FASTCONVERTYUVTOARGBROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2) &&
(width % 2 == 0)) {
......@@ -1263,6 +1291,13 @@ int I444ToARGB(const uint8* src_y, int src_stride_y,
const uint8* v_buf,
uint8* rgb_buf,
int width);
#if defined(HAS_FASTCONVERTYUV444TOARGBROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3) &&
(width % 8 == 0) &&
IS_ALIGNED(dst_argb, 16) && (dst_stride_argb % 16 == 0)) {
FastConvertYUV444ToARGBRow = FastConvertYUV444ToARGBRow_SSSE3;
} else
#endif
#if defined(HAS_FASTCONVERTYUVTOARGBROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) {
FastConvertYUV444ToARGBRow = FastConvertYUV444ToARGBRow_SSE2;
......@@ -1300,10 +1335,10 @@ int I400ToARGB_Reference(const uint8* src_y, int src_stride_y,
void (*FastConvertYToARGBRow)(const uint8* y_buf,
uint8* rgb_buf,
int width);
#if defined(HAS_FASTCONVERTYUVTOARGBROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2) &&
(width % 2 == 0) &&
IS_ALIGNED(dst_argb, 8) && (dst_stride_argb % 8 == 0)) {
#if defined(HAS_FASTCONVERTYTOARGBROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2) &&
(width % 8 == 0) &&
IS_ALIGNED(dst_argb, 16) && (dst_stride_argb % 16 == 0)) {
FastConvertYToARGBRow = FastConvertYToARGBRow_SSE2;
} else
#endif
......
......@@ -15,51 +15,61 @@
#define kMaxStride (2048 * 4)
#if defined(COVERAGE_ENABLED) || defined(TARGET_IPHONE_SIMULATOR)
#define LIBYUV_DISABLE_ASM
#endif
// The following are available on all x86 platforms
#if (defined(WIN32) || defined(__x86_64__) || defined(__i386__)) && \
!defined(COVERAGE_ENABLED) && !defined(TARGET_IPHONE_SIMULATOR)
!defined(LIBYUV_DISABLE_ASM)
#define HAS_ABGRTOARGBROW_SSSE3
#define HAS_BGRATOARGBROW_SSSE3
#define HAS_ARGBTOYROW_SSSE3
#define HAS_BG24TOARGBROW_SSSE3
#define HAS_RAWTOARGBROW_SSSE3
#define HAS_RGB24TOYROW_SSSE3
#define HAS_RAWTOYROW_SSSE3
#define HAS_RGB24TOUVROW_SSSE3
#define HAS_RAWTOUVROW_SSSE3
#define HAS_ARGBTOYROW_SSSE3
#define HAS_BGRATOYROW_SSSE3
#define HAS_ABGRTOYROW_SSSE3
#define HAS_I400TOARGBROW_SSE2
#endif
// The following are available on Windows and Linux
#if (defined(WIN32) || defined(__x86_64__) || \
(defined(__i386__) && !defined(__pic__))) && \
!defined(COVERAGE_ENABLED) && !defined(TARGET_IPHONE_SIMULATOR)
#define HAS_ARGBTOUVROW_SSSE3
#define HAS_BGRATOUVROW_SSSE3
#define HAS_ABGRTOUVROW_SSSE3
#define HAS_I400TOARGBROW_SSE2
#endif
// The following are available on Linux (32/64 bit)
// TODO(fbarchard): enable for fpic on linux
#if (defined(__x86_64__) || \
(defined(__i386__) && !defined(__pic__))) && \
!defined(COVERAGE_ENABLED) && !defined(TARGET_IPHONE_SIMULATOR)
!defined(LIBYUV_DISABLE_ASM)
#define HAS_FASTCONVERTYUVTOARGBROW_SSE2
#define HAS_FASTCONVERTYUVTOBGRAROW_SSE2
#define HAS_FASTCONVERTYUVTOABGRROW_SSE2
#define HAS_FASTCONVERTYUV444TOARGBROW_SSE2
#define HAS_FASTCONVERTYTOARGBROW_SSE2
#endif
// The following are available on Windows and GCC 32 bit
#if (defined(WIN32) || \
defined(__i386__)) && \
!defined(COVERAGE_ENABLED) && !defined(TARGET_IPHONE_SIMULATOR)
!defined(LIBYUV_DISABLE_ASM)
#define HAS_FASTCONVERTYUVTOARGBROW_MMX
#define HAS_FASTCONVERTYUVTOBGRAROW_MMX
#define HAS_FASTCONVERTYUVTOABGRROW_MMX
#endif
// The following are available on Windows
#if defined(WIN32) && \
!defined(LIBYUV_DISABLE_ASM)
#define HAS_FASTCONVERTYUVTOARGBROW_SSSE3
#define HAS_FASTCONVERTYUVTOBGRAROW_SSSE3
#define HAS_FASTCONVERTYUVTOABGRROW_SSSE3
#define HAS_FASTCONVERTYUV444TOARGBROW_SSSE3
#define HAS_FASTCONVERTYTOARGBROW_SSE2
#endif
extern "C" {
#ifdef HAS_ARGBTOYROW_SSSE3
......@@ -224,6 +234,40 @@ void FastConvertYToARGBRow_MMX(const uint8* y_buf,
int width);
#endif
#ifdef HAS_FASTCONVERTYUVTOARGBROW_SSSE3
void FastConvertYUVToARGBRow_SSSE3(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width);
void FastConvertYUVToBGRARow_SSSE3(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width);
void FastConvertYUVToABGRRow_SSSE3(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width);
void FastConvertYUV444ToARGBRow_SSSE3(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width);
#endif
#ifdef HAS_FASTCONVERTYTOARGBROW_SSE2
void FastConvertYToARGBRow_SSE2(const uint8* y_buf,
uint8* rgb_buf,
int width);
#endif
// Method to force C version.
//#define USE_MMX 0
//#define USE_SSE2 0
......
......@@ -253,37 +253,47 @@ void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
#ifdef HAS_ARGBTOUVROW_SSSE3
void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int width) {
asm volatile(
"movdqa %5,%%xmm7\n"
"movdqa %6,%%xmm6\n"
"movdqa %7,%%xmm5\n"
asm volatile(
"movdqa %0,%%xmm4\n"
"movdqa %1,%%xmm3\n"
"movdqa %2,%%xmm5\n"
:
: "m"(kARGBToU), // %0
"m"(kARGBToV), // %1
"m"(kAddUV128) // %2
:
#if defined(__SSE2__)
"xmm3", "xmm4", "xmm5"
#endif
);
asm volatile(
"sub %1,%2\n"
"1:"
"movdqa (%0),%%xmm0\n"
"movdqa 0x10(%0),%%xmm1\n"
"movdqa 0x20(%0),%%xmm2\n"
"movdqa 0x30(%0),%%xmm3\n"
"movdqa 0x30(%0),%%xmm6\n"
"pavgb (%0,%4,1),%%xmm0\n"
"pavgb 0x10(%0,%4,1),%%xmm1\n"
"pavgb 0x20(%0,%4,1),%%xmm2\n"
"pavgb 0x30(%0,%4,1),%%xmm3\n"
"pavgb 0x30(%0,%4,1),%%xmm6\n"
"lea 0x40(%0),%0\n"
"movdqa %%xmm0,%%xmm4\n"
"movdqa %%xmm0,%%xmm7\n"
"shufps $0x88,%%xmm1,%%xmm0\n"
"shufps $0xdd,%%xmm1,%%xmm4\n"
"pavgb %%xmm4,%%xmm0\n"
"movdqa %%xmm2,%%xmm4\n"
"shufps $0x88,%%xmm3,%%xmm2\n"
"shufps $0xdd,%%xmm3,%%xmm4\n"
"pavgb %%xmm4,%%xmm2\n"
"shufps $0xdd,%%xmm1,%%xmm7\n"
"pavgb %%xmm7,%%xmm0\n"
"movdqa %%xmm2,%%xmm7\n"
"shufps $0x88,%%xmm6,%%xmm2\n"
"shufps $0xdd,%%xmm6,%%xmm7\n"
"pavgb %%xmm7,%%xmm2\n"
"movdqa %%xmm0,%%xmm1\n"
"movdqa %%xmm2,%%xmm3\n"
"pmaddubsw %%xmm7,%%xmm0\n"
"pmaddubsw %%xmm7,%%xmm2\n"
"pmaddubsw %%xmm6,%%xmm1\n"
"pmaddubsw %%xmm6,%%xmm3\n"
"movdqa %%xmm2,%%xmm6\n"
"pmaddubsw %%xmm4,%%xmm0\n"
"pmaddubsw %%xmm4,%%xmm2\n"
"pmaddubsw %%xmm3,%%xmm1\n"
"pmaddubsw %%xmm3,%%xmm6\n"
"phaddw %%xmm2,%%xmm0\n"
"phaddw %%xmm3,%%xmm1\n"
"phaddw %%xmm6,%%xmm1\n"
"psraw $0x8,%%xmm0\n"
"psraw $0x8,%%xmm1\n"
"packsswb %%xmm1,%%xmm0\n"
......@@ -297,13 +307,10 @@ void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
"+r"(dst_u), // %1
"+r"(dst_v), // %2
"+rm"(width) // %3
: "r"(static_cast<intptr_t>(src_stride_argb)), // %4
"m"(kARGBToU), // %5
"m"(kARGBToV), // %6
"m"(kAddUV128) // %7
: "r"(static_cast<intptr_t>(src_stride_argb))
: "memory", "cc"
#if defined(__SSE2__)
, "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
, "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
#endif
);
}
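The restructured GCC block above loads the U/V coefficients once in a separate asm statement, which frees operands and registers for the main loop (the Mac part of this change). As a reading aid, a scalar model of what the loop computes per 2x2 block of ARGB (byte order B,G,R,A in memory); the numeric coefficients are assumptions standing in for kARGBToU/kARGBToV, which this diff does not show:

static inline uint8 Clamp255(int v) { return v < 0 ? 0 : (v > 255 ? 255 : v); }

// Average a 2x2 block of ARGB (the pavgb/shufps steps), then take a signed
// dot product with the U/V coefficients and re-bias around 128 (kAddUV128).
void ARGBToUVRow_Sketch(const uint8* row0, const uint8* row1,
                        uint8* dst_u, uint8* dst_v, int width) {
  for (int x = 0; x < width; x += 2) {
    int b = (row0[0] + row0[4] + row1[0] + row1[4] + 2) >> 2;
    int g = (row0[1] + row0[5] + row1[1] + row1[5] + 2) >> 2;
    int r = (row0[2] + row0[6] + row1[2] + row1[6] + 2) >> 2;
    *dst_u++ = Clamp255(((112 * b - 74 * g - 38 * r) >> 8) + 128);  // assumed BT.601
    *dst_v++ = Clamp255(((112 * r - 94 * g - 18 * b) >> 8) + 128);  // assumed BT.601
    row0 += 8;
    row1 += 8;
  }
}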
......
......@@ -208,25 +208,27 @@ SIMD_ALIGNED(const int16 NAME[256 * 3][4]) = {\
RGBV(0xFC), RGBV(0xFD), RGBV(0xFE), RGBV(0xFF), \
};
#define CS(v) static_cast<int16>(v)
// ARGB table
#define RGBY(i) { \
static_cast<int16>(1.164 * 64 * (i - 16) + 0.5), \
static_cast<int16>(1.164 * 64 * (i - 16) + 0.5), \
static_cast<int16>(1.164 * 64 * (i - 16) + 0.5), \
static_cast<int16>(256 * 64 - 1) \
CS(1.164 * 64 * (i - 16) + 0.5), \
CS(1.164 * 64 * (i - 16) + 0.5), \
CS(1.164 * 64 * (i - 16) + 0.5), \
CS(256 * 64 - 1) \
}
#define RGBU(i) { \
static_cast<int16>(2.018 * 64 * (i - 128) + 0.5), \
static_cast<int16>(-0.391 * 64 * (i - 128) + 0.5), \
CS(2.018 * 64 * (i - 128) + 0.5), \
CS(-0.391 * 64 * (i - 128) - 0.5), \
0, \
0 \
}
#define RGBV(i) { \
0, \
static_cast<int16>(-0.813 * 64 * (i - 128) + 0.5), \
static_cast<int16>(1.596 * 64 * (i - 128) + 0.5), \
CS(-0.813 * 64 * (i - 128) - 0.5), \
CS(1.596 * 64 * (i - 128) + 0.5), \
0 \
}
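These macros build the 256 x 3 entry lookup table used by the MMX/SSE2 rows: every possible Y, U or V byte gets a 4-lane int16 entry holding that sample's contribution to B, G, R and A, pre-scaled by 64. (The diff also switches the rounding constant from +0.5 to -0.5 on the negative coefficients, so truncation rounds those products to nearest as well.) A scalar model of how a pixel is assembled from the table, under the layout implied by the macros; Clamp255 is this sketch's own helper:

static inline uint8 Clamp255(int v) { return v < 0 ? 0 : (v > 255 ? 255 : v); }

// One ARGB pixel from the table: add the Y, U and V entries lane by lane
// (mirroring paddsw), shift out the x64 scale (psraw 6) and saturate to
// bytes (packuswb).
void YuvPixelFromTable(const int16 table[256 * 3][4],
                       uint8 y, uint8 u, uint8 v, uint8* argb) {
  for (int lane = 0; lane < 4; ++lane) {
    int sum = table[y][lane] + table[256 + u][lane] + table[512 + v][lane];
    argb[lane] = Clamp255(sum >> 6);
  }
}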
......@@ -238,23 +240,23 @@ MAKETABLE(kCoefficientsRgbY)
// BGRA table
#define RGBY(i) { \
static_cast<int16>(256 * 64 - 1), \
static_cast<int16>(1.164 * 64 * (i - 16) + 0.5), \
static_cast<int16>(1.164 * 64 * (i - 16) + 0.5), \
static_cast<int16>(1.164 * 64 * (i - 16) + 0.5) \
CS(256 * 64 - 1), \
CS(1.164 * 64 * (i - 16) + 0.5), \
CS(1.164 * 64 * (i - 16) + 0.5), \
CS(1.164 * 64 * (i - 16) + 0.5) \
}
#define RGBU(i) { \
0, \
0, \
static_cast<int16>(-0.391 * 64 * (i - 128) + 0.5), \
static_cast<int16>(2.018 * 64 * (i - 128) + 0.5) \
CS(-0.391 * 64 * (i - 128) - 0.5), \
CS(2.018 * 64 * (i - 128) + 0.5) \
}
#define RGBV(i) { \
0, \
static_cast<int16>(1.596 * 64 * (i - 128) + 0.5), \
static_cast<int16>(-0.813 * 64 * (i - 128) + 0.5), \
CS(1.596 * 64 * (i - 128) + 0.5), \
CS(-0.813 * 64 * (i - 128) - 0.5), \
0 \
}
......@@ -266,22 +268,22 @@ MAKETABLE(kCoefficientsBgraY)
// ABGR table
#define RGBY(i) { \
static_cast<int16>(1.164 * 64 * (i - 16) + 0.5), \
static_cast<int16>(1.164 * 64 * (i - 16) + 0.5), \
static_cast<int16>(1.164 * 64 * (i - 16) + 0.5), \
static_cast<int16>(256 * 64 - 1) \
CS(1.164 * 64 * (i - 16) + 0.5), \
CS(1.164 * 64 * (i - 16) + 0.5), \
CS(1.164 * 64 * (i - 16) + 0.5), \
CS(256 * 64 - 1) \
}
#define RGBU(i) { \
0, \
static_cast<int16>(-0.391 * 64 * (i - 128) + 0.5), \
static_cast<int16>(2.018 * 64 * (i - 128) + 0.5), \
CS(-0.391 * 64 * (i - 128) - 0.5), \
CS(2.018 * 64 * (i - 128) + 0.5), \
0 \
}
#define RGBV(i) { \
static_cast<int16>(1.596 * 64 * (i - 128) + 0.5), \
static_cast<int16>(-0.813 * 64 * (i - 128) + 0.5), \
CS(1.596 * 64 * (i - 128) + 0.5), \
CS(-0.813 * 64 * (i - 128) - 0.5), \
0, \
0 \
}
......
......@@ -520,7 +520,7 @@ __asm {
}
}
#define YUVTORGB(TABLE) __asm { \
#define YUVTORGB_MMX(TABLE) __asm { \
__asm convertloop : \
__asm movzx eax, byte ptr [edi] \
__asm lea edi, [edi + 1] \
......@@ -561,7 +561,7 @@ void FastConvertYUVToARGBRow_MMX(const uint8* y_buf,
mov ebp, [esp + 16 + 16]
mov ecx, [esp + 16 + 20]
YUVTORGB(kCoefficientsRgbY)
YUVTORGB_MMX(kCoefficientsRgbY)
pop ebp
pop edi
......@@ -588,7 +588,7 @@ void FastConvertYUVToBGRARow_MMX(const uint8* y_buf,
mov ebp, [esp + 16 + 16]
mov ecx, [esp + 16 + 20]
YUVTORGB(kCoefficientsBgraY)
YUVTORGB_MMX(kCoefficientsBgraY)
pop ebp
pop edi
......@@ -615,7 +615,7 @@ void FastConvertYUVToABGRRow_MMX(const uint8* y_buf,
mov ebp, [esp + 16 + 16]
mov ecx, [esp + 16 + 20]
YUVTORGB(kCoefficientsAbgrY)
YUVTORGB_MMX(kCoefficientsAbgrY)
pop ebp
pop edi
......@@ -696,6 +696,321 @@ void FastConvertYToARGBRow_MMX(const uint8* y_buf,
}
}
#ifdef HAS_FASTCONVERTYUVTOARGBROW_SSSE3
#define YG 74 /* static_cast<int8>(1.164 * 64 + 0.5) */
#define UB 127 /* 2.018 * 64 = 129, clamped to int8 max of 127 */
#define UG -25 /* static_cast<int8>(-0.391 * 64 - 0.5) */
#define UR 0
#define VB 0
#define VG -52 /* static_cast<int8>(-0.813 * 64 - 0.5) */
#define VR 102 /* static_cast<int8>(1.596 * 64 + 0.5) */
// Bias
#define BB UB * 128 + VB * 128
#define BG UG * 128 + VG * 128
#define BR UR * 128 + VR * 128
extern "C" TALIGN16(const int8, kUVToB[16]) = {
UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB
};
extern "C" TALIGN16(const int8, kUVToR[16]) = {
UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR
};
extern "C" TALIGN16(const int8, kUVToG[16]) = {
UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG
};
extern "C" TALIGN16(const int16, kYToRgb[8]) = {
YG, YG, YG, YG, YG, YG, YG, YG
};
extern "C" TALIGN16(const int16, kYSub16[8]) = {
16, 16, 16, 16, 16, 16, 16, 16
};
extern "C" TALIGN16(const int16, kUVBiasB[8]) = {
BB, BB, BB, BB, BB, BB, BB, BB
};
extern "C" TALIGN16(const int16, kUVBiasG[8]) = {
BG, BG, BG, BG, BG, BG, BG, BG
};
extern "C" TALIGN16(const int16, kUVBiasR[8]) = {
BR, BR, BR, BR, BR, BR, BR, BR
};
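The Windows SSSE3 path replaces the big lookup tables with these small vector constants. A scalar model of the fixed-point math they encode (byte order B,G,R,A; the bias tables exist because pmaddubsw treats U and V as unsigned, so subtracting coef * 128 afterwards is equivalent to centering them on 128 first):

static inline uint8 Clamp255(int v) { return v < 0 ? 0 : (v > 255 ? 255 : v); }

// One ARGB pixel using the YG/UB/UG/VG/VR constants above; every term is
// scaled by 64, and the final >> 6 matches the psraw 6 in the asm.
void YuvToArgbPixel_Sketch(uint8 y, uint8 u, uint8 v, uint8* argb) {
  const int y1 = (y - 16) * YG;                                      // 74 ~= 1.164 * 64
  argb[0] = Clamp255(((u - 128) * UB + (v - 128) * VB + y1) >> 6);   // B
  argb[1] = Clamp255(((u - 128) * UG + (v - 128) * VG + y1) >> 6);   // G
  argb[2] = Clamp255(((u - 128) * UR + (v - 128) * VR + y1) >> 6);   // R
  argb[3] = 0xff;                                                    // opaque alpha
}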
#define YUVTORGB_SSSE3 __asm { \
/* Step 1: Find 4 UV contributions to 8 R,G,B values */ \
__asm movd xmm0, [esi] /* U */ \
__asm movd xmm1, [esi + edi] /* V */ \
__asm lea esi, [esi + 4] \
__asm punpcklbw xmm0, xmm1 /* UV */ \
__asm punpcklwd xmm0, xmm0 /* UVUV (upsample) */ \
__asm movdqa xmm1, xmm0 \
__asm movdqa xmm2, xmm0 \
__asm pmaddubsw xmm0, _kUVToB /* scale B UV */ \
__asm pmaddubsw xmm1, _kUVToG /* scale G UV */ \
__asm pmaddubsw xmm2, _kUVToR /* scale R UV */ \
__asm psubw xmm0, _kUVBiasB /* unbias back to signed */ \
__asm psubw xmm1, _kUVBiasG \
__asm psubw xmm2, _kUVBiasR \
/* Step 2: Find Y contribution to 8 R,G,B values */ \
__asm movq xmm3, qword ptr [eax] \
__asm lea eax, [eax + 8] \
__asm punpcklbw xmm3, xmm4 \
__asm psubsw xmm3, _kYSub16 \
__asm pmullw xmm3, _kYToRgb \
__asm paddw xmm0, xmm3 /* B += Y */ \
__asm paddw xmm1, xmm3 /* G += Y */ \
__asm paddw xmm2, xmm3 /* R += Y */ \
__asm psraw xmm0, 6 \
__asm psraw xmm1, 6 \
__asm psraw xmm2, 6 \
__asm packuswb xmm0, xmm0 /* B */ \
__asm packuswb xmm1, xmm1 /* G */ \
__asm packuswb xmm2, xmm2 /* R */ \
}
__declspec(naked)
void FastConvertYUVToARGBRow_SSSE3(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width) {
__asm {
push esi
push edi
mov eax, [esp + 8 + 4] // Y
mov esi, [esp + 8 + 8] // U
mov edi, [esp + 8 + 12] // V
mov edx, [esp + 8 + 16] // rgb
mov ecx, [esp + 8 + 20] // width
sub edi, esi
pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
pxor xmm4, xmm4
convertloop :
YUVTORGB_SSSE3
// Step 3: Weave into ARGB
punpcklbw xmm0, xmm1 // BG
punpcklbw xmm2, xmm5 // RA
movdqa xmm1, xmm0
punpcklwd xmm0, xmm2 // BGRA first 4 pixels
movdqa [edx], xmm0
punpckhwd xmm1, xmm2 // BGRA next 4 pixels
movdqa [edx + 16], xmm1
lea edx, [edx + 32]
sub ecx, 8
ja convertloop
pop edi
pop esi
ret
}
}
__declspec(naked)
void FastConvertYUVToBGRARow_SSSE3(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width) {
__asm {
push esi
push edi
mov eax, [esp + 8 + 4] // Y
mov esi, [esp + 8 + 8] // U
mov edi, [esp + 8 + 12] // V
mov edx, [esp + 8 + 16] // rgb
mov ecx, [esp + 8 + 20] // width
sub edi, esi
pxor xmm4, xmm4
convertloop :
YUVTORGB_SSSE3
// Step 3: Weave into BGRA
pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
punpcklbw xmm1, xmm0 // GB
punpcklbw xmm5, xmm2 // AR
movdqa xmm0, xmm5
punpcklwd xmm5, xmm1 // BGRA first 4 pixels
movdqa [edx], xmm5
punpckhwd xmm0, xmm1 // BGRA next 4 pixels
movdqa [edx + 16], xmm0
lea edx, [edx + 32]
sub ecx, 8
ja convertloop
pop edi
pop esi
ret
}
}
__declspec(naked)
void FastConvertYUVToABGRRow_SSSE3(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width) {
__asm {
push esi
push edi
mov eax, [esp + 8 + 4] // Y
mov esi, [esp + 8 + 8] // U
mov edi, [esp + 8 + 12] // V
mov edx, [esp + 8 + 16] // rgb
mov ecx, [esp + 8 + 20] // width
sub edi, esi
pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
pxor xmm4, xmm4
convertloop :
YUVTORGB_SSSE3
// Step 3: Weave into ARGB
punpcklbw xmm2, xmm1 // RG
punpcklbw xmm0, xmm5 // BA
movdqa xmm1, xmm2
punpcklwd xmm2, xmm0 // RGBA first 4 pixels
movdqa [edx], xmm2
punpckhwd xmm1, xmm0 // RGBA next 4 pixels
movdqa [edx + 16], xmm1
lea edx, [edx + 32]
sub ecx, 8
ja convertloop
pop edi
pop esi
ret
}
}
__declspec(naked)
void FastConvertYUV444ToARGBRow_SSSE3(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width) {
__asm {
push esi
push edi
mov eax, [esp + 8 + 4] // Y
mov esi, [esp + 8 + 8] // U
mov edi, [esp + 8 + 12] // V
mov edx, [esp + 8 + 16] // rgb
mov ecx, [esp + 8 + 20] // width
sub edi, esi
pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
pxor xmm4, xmm4
convertloop :
// Step 1: Find 4 UV contributions to 4 R,G,B values
movd xmm0, [esi] // U
movd xmm1, [esi + edi] // V
lea esi, [esi + 4]
punpcklbw xmm0, xmm1 // UV
movdqa xmm1, xmm0
movdqa xmm2, xmm0
pmaddubsw xmm0, _kUVToB // scale B UV
pmaddubsw xmm1, _kUVToG // scale G UV
pmaddubsw xmm2, _kUVToR // scale R UV
psubw xmm0, _kUVBiasB // unbias back to signed
psubw xmm1, _kUVBiasG
psubw xmm2, _kUVBiasR
// Step 2: Find Y contribution to 4 R,G,B values
movd xmm3, [eax]
lea eax, [eax + 4]
punpcklbw xmm3, xmm4
psubsw xmm3, _kYSub16
pmullw xmm3, _kYToRgb
paddw xmm0, xmm3 // B += Y
paddw xmm1, xmm3 // G += Y
paddw xmm2, xmm3 // R += Y
psraw xmm0, 6
psraw xmm1, 6
psraw xmm2, 6
packuswb xmm0, xmm0 // B
packuswb xmm1, xmm1 // G
packuswb xmm2, xmm2 // R
// Step 3: Weave into ARGB
punpcklbw xmm0, xmm1 // BG
punpcklbw xmm2, xmm5 // RA
punpcklwd xmm0, xmm2 // BGRA 4 pixels
movdqa [edx], xmm0
lea edx, [edx + 16]
sub ecx, 4
ja convertloop
pop edi
pop esi
ret
}
}
#endif
#ifdef HAS_FASTCONVERTYTOARGBROW_SSE2
__declspec(naked)
void FastConvertYToARGBRow_SSE2(const uint8* y_buf,
uint8* rgb_buf,
int width) {
__asm {
mov eax, [esp + 4] // Y
mov edx, [esp + 8] // rgb
mov ecx, [esp + 12] // width
pcmpeqb xmm5, xmm5 // generate mask 0xff000000
pslld xmm5, 24
pxor xmm4, xmm4
movdqa xmm3, _kYSub16
movdqa xmm2, _kYToRgb
convertloop :
// Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164
movq xmm0, qword ptr [eax]
lea eax, [eax + 8]
punpcklbw xmm0, xmm4
psubsw xmm0, xmm3
pmullw xmm0, xmm2
psraw xmm0, 6
packuswb xmm0, xmm0 // G
// Step 2: Weave into ARGB
punpcklbw xmm0, xmm0 // GG
movdqa xmm1, xmm0
punpcklwd xmm0, xmm0 // BGRA first 4 pixels
por xmm0, xmm5
movdqa [edx], xmm0
punpckhwd xmm1, xmm1 // BGRA next 4 pixels
por xmm1, xmm5
movdqa [edx + 16], xmm1
lea edx, [edx + 32]
sub ecx, 8
ja convertloop
ret
}
}
#endif
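A scalar model of the Y-only row above, assuming the same fixed-point constants: the luma sample is range-expanded and replicated into B, G and R, with alpha forced to 0xFF (the por with the 0xff000000 mask):

static inline uint8 Clamp255(int v) { return v < 0 ? 0 : (v > 255 ? 255 : v); }

// FastConvertYToARGBRow, scalar model: gray = clamp((y - 16) * 74 >> 6).
void YToARGBRow_Sketch(const uint8* y_buf, uint8* rgb_buf, int width) {
  for (int x = 0; x < width; ++x) {
    const uint8 gray = Clamp255(((y_buf[x] - 16) * 74) >> 6);
    rgb_buf[0] = gray;  // B
    rgb_buf[1] = gray;  // G
    rgb_buf[2] = gray;  // R
    rgb_buf[3] = 0xff;  // A
    rgb_buf += 4;
  }
}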
#endif
} // extern "C"