Commit 228bdc24 authored by fbarchard@google.com

port yuv to rgb ssse3 to gcc

BUG=none
TEST=media_unittest
Review URL: http://webrtc-codereview.appspot.com/269015

git-svn-id: http://libyuv.googlecode.com/svn/trunk@80 16f28f9a-4ce2-e073-06de-1de4eb20be90
parent 4cf70bd6
 Name: libyuv
 URL: http://code.google.com/p/libyuv/
-Version: 79
+Version: 80
 License: BSD
 License File: LICENSE
......
@@ -1136,19 +1136,6 @@ int I420ToARGB(const uint8* src_y, int src_stride_y,
       IS_ALIGNED(dst_argb, 16) && (dst_stride_argb % 16 == 0)) {
     FastConvertYUVToARGBRow = FastConvertYUVToARGBRow_SSSE3;
   } else
-#endif
-#if defined(HAS_FASTCONVERTYUVTOARGBROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2) &&
-      (width % 4 == 0) &&
-      IS_ALIGNED(dst_argb, 16) && (dst_stride_argb % 16 == 0)) {
-    FastConvertYUVToARGBRow = FastConvertYUVToARGBRow4_SSE2;
-  } else
-#endif
-#if defined(HAS_FASTCONVERTYUVTOARGBROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2) &&
-      (width % 2 == 0)) {
-    FastConvertYUVToARGBRow = FastConvertYUVToARGBRow_SSE2;
-  } else
 #endif
   {
     FastConvertYUVToARGBRow = FastConvertYUVToARGBRow_C;
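For readers skimming the diff: each converter picks a row function once, up front, then calls it for every row, and this change simply drops the slower SSE2 candidates from that selection. A minimal sketch of the pattern, using the names visible above (the surrounding row loop and the I420 chroma stepping are illustrative assumptions, not code copied from this file):

  // Sketch: pick the fastest row function the CPU and buffers allow, else fall back to C.
  void (*FastConvertYUVToARGBRow)(const uint8* y_buf, const uint8* u_buf,
                                  const uint8* v_buf, uint8* rgb_buf, int width) =
      FastConvertYUVToARGBRow_C;
#if defined(HAS_FASTCONVERTYUVTOARGBROW_SSSE3)
  if (TestCpuFlag(kCpuHasSSSE3) &&
      IS_ALIGNED(dst_argb, 16) && (dst_stride_argb % 16 == 0)) {  // movdqa-safe destination
    FastConvertYUVToARGBRow = FastConvertYUVToARGBRow_SSSE3;      // 8 pixels per iteration
  }
#endif
  for (int y = 0; y < height; ++y) {
    FastConvertYUVToARGBRow(src_y, src_u, src_v, dst_argb, width);
    dst_argb += dst_stride_argb;
    src_y += src_stride_y;
    if (y & 1) {  // I420: U and V planes are half height, so step them every other row
      src_u += src_stride_u;
      src_v += src_stride_v;
    }
  }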
@@ -1188,12 +1175,6 @@ int I420ToBGRA(const uint8* src_y, int src_stride_y,
       IS_ALIGNED(dst_argb, 16) && (dst_stride_argb % 16 == 0)) {
     FastConvertYUVToBGRARow = FastConvertYUVToBGRARow_SSSE3;
   } else
-#endif
-#if defined(HAS_FASTCONVERTYUVTOBGRAROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2) &&
-      (width % 2 == 0)) {
-    FastConvertYUVToBGRARow = FastConvertYUVToBGRARow_SSE2;
-  } else
 #endif
   {
     FastConvertYUVToBGRARow = FastConvertYUVToBGRARow_C;

@@ -1233,12 +1214,6 @@ int I420ToABGR(const uint8* src_y, int src_stride_y,
       IS_ALIGNED(dst_argb, 16) && (dst_stride_argb % 16 == 0)) {
     FastConvertYUVToABGRRow = FastConvertYUVToABGRRow_SSSE3;
   } else
-#endif
-#if defined(HAS_FASTCONVERTYUVTOABGRROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2) &&
-      (width % 2 == 0)) {
-    FastConvertYUVToABGRRow = FastConvertYUVToABGRRow_SSE2;
-  } else
 #endif
   {
     FastConvertYUVToABGRRow = FastConvertYUVToABGRRow_C;

@@ -1278,12 +1253,6 @@ int I422ToARGB(const uint8* src_y, int src_stride_y,
       IS_ALIGNED(dst_argb, 16) && (dst_stride_argb % 16 == 0)) {
     FastConvertYUVToARGBRow = FastConvertYUVToARGBRow_SSSE3;
   } else
-#endif
-#if defined(HAS_FASTCONVERTYUVTOARGBROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2) &&
-      (width % 2 == 0)) {
-    FastConvertYUVToARGBRow = FastConvertYUVToARGBRow_SSE2;
-  } else
 #endif
   {
     FastConvertYUVToARGBRow = FastConvertYUVToARGBRow_C;

@@ -1321,11 +1290,6 @@ int I444ToARGB(const uint8* src_y, int src_stride_y,
       IS_ALIGNED(dst_argb, 16) && (dst_stride_argb % 16 == 0)) {
     FastConvertYUV444ToARGBRow = FastConvertYUV444ToARGBRow_SSSE3;
   } else
-#endif
-#if defined(HAS_FASTCONVERTYUVTOARGBROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2)) {
-    FastConvertYUV444ToARGBRow = FastConvertYUV444ToARGBRow_SSE2;
-  } else
 #endif
   {
     FastConvertYUV444ToARGBRow = FastConvertYUV444ToARGBRow_C;

@@ -1354,7 +1318,7 @@ int I400ToARGB_Reference(const uint8* src_y, int src_stride_y,
                                   uint8* rgb_buf,
                                   int width);
 #if defined(HAS_FASTCONVERTYTOARGBROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSSE3) &&
+  if (TestCpuFlag(kCpuHasSSE2) &&
      (width % 8 == 0) &&
      IS_ALIGNED(dst_argb, 16) && (dst_stride_argb % 16 == 0)) {
    FastConvertYToARGBRow = FastConvertYToARGBRow_SSE2;
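A hedged usage sketch for the planar converters touched above. The parameter order is inferred from the I420ToARGB signature visible in the hunk headers; the umbrella header name and the libyuv namespace are assumptions, not confirmed by this diff:

#include "libyuv.h"

// Convert one I420 frame (full-size Y plane, half-size U and V planes) to 32-bit ARGB.
void ConvertFrame(const uint8* y, const uint8* u, const uint8* v,
                  uint8* argb, int width, int height) {
  libyuv::I420ToARGB(y, width,             // src_y, src_stride_y
                     u, (width + 1) / 2,   // src_u, src_stride_u
                     v, (width + 1) / 2,   // src_v, src_stride_v
                     argb, width * 4,      // dst_argb, dst_stride_argb
                     width, height);
  // The SSSE3 row kicks in when argb is 16-byte aligned and the stride is a
  // multiple of 16; otherwise the C row is used, as selected above.
}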
......
@@ -37,28 +37,17 @@
 #define HAS_BGRATOUVROW_SSSE3
 #define HAS_ABGRTOUVROW_SSSE3
 #define HAS_I400TOARGBROW_SSE2
-#endif
-
-// The following are available on Linux (32/64 bit)
-// TODO(fbarchard): enable for fpic on linux
-#if (defined(__x86_64__) || \
-    (defined(__i386__) && !defined(__pic__))) && \
-    !defined(LIBYUV_DISABLE_ASM)
-#define HAS_FASTCONVERTYUVTOARGBROW_SSE2
-#define HAS_FASTCONVERTYUVTOBGRAROW_SSE2
-#define HAS_FASTCONVERTYUVTOABGRROW_SSE2
-#define HAS_FASTCONVERTYUV444TOARGBROW_SSE2
 #define HAS_FASTCONVERTYTOARGBROW_SSE2
 #endif
 
-// The following are available on Windows
-#if defined(WIN32) && \
+// The following are available on all x86 platforms except 32 bit OSX
+#if (defined(WIN32) || defined(__x86_64__) || \
+    (defined(__i386__) && !defined(__APPLE__))) && \
     !defined(LIBYUV_DISABLE_ASM)
 #define HAS_FASTCONVERTYUVTOARGBROW_SSSE3
 #define HAS_FASTCONVERTYUVTOBGRAROW_SSSE3
 #define HAS_FASTCONVERTYUVTOABGRROW_SSSE3
 #define HAS_FASTCONVERTYUV444TOARGBROW_SSSE3
-#define HAS_FASTCONVERTYTOARGBROW_SSE2
 #endif
 
 extern "C" {
......
@@ -14,49 +14,49 @@
 extern "C" {
 
-#ifdef HAS_ARGBTOYROW_SSSE3
-// Constant multiplication table for converting ARGB to I400.
-static const vec8 kARGBToY = {
-  13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0
-};
-static const uvec8 kAddY16 = {
-  16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u
-};
-
 #ifdef HAS_ARGBTOUVROW_SSSE3
-static const vec8 kARGBToU = {
+vec8 kARGBToU = {
   112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0
 };
-static const uvec8 kARGBToV = {
+uvec8 kARGBToV = {
   -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0
 };
-static const uvec8 kAddUV128 = {
+uvec8 kAddUV128 = {
   128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
   128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u
 };
 #endif
 
+#ifdef HAS_ARGBTOYROW_SSSE3
+// Constant multiplication table for converting ARGB to I400.
+vec8 kARGBToY = {
+  13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0
+};
+uvec8 kAddY16 = {
+  16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u
+};
+
 // Shuffle table for converting BG24 to ARGB.
-static const uvec8 kShuffleMaskBG24ToARGB = {
+uvec8 kShuffleMaskBG24ToARGB = {
   0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u
 };
 // Shuffle table for converting RAW to ARGB.
-static const uvec8 kShuffleMaskRAWToARGB = {
+uvec8 kShuffleMaskRAWToARGB = {
   2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u
 };
 // Shuffle table for converting ABGR to ARGB.
-static const uvec8 kShuffleMaskABGRToARGB = {
+uvec8 kShuffleMaskABGRToARGB = {
   2u, 1u, 0u, 3u, 6u, 5u, 4u, 7u, 10u, 9u, 8u, 11u, 14u, 13u, 12u, 15u
 };
 // Shuffle table for converting BGRA to ARGB.
-static const uvec8 kShuffleMaskBGRAToARGB = {
+uvec8 kShuffleMaskBGRAToARGB = {
   3u, 2u, 1u, 0u, 7u, 6u, 5u, 4u, 11u, 10u, 9u, 8u, 15u, 14u, 13u, 12u
 };
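The kShuffleMask* tables are pshufb control vectors: byte i of the result is taken from source byte mask[i], and a mask byte with its high bit set zeroes that result byte. A self-contained scalar model (a sketch, not libyuv code):

#include <stdint.h>

// Scalar model of SSSE3 pshufb with one of the masks above.
static void ShuffleBytes(const uint8_t src[16], const uint8_t mask[16],
                         uint8_t dst[16]) {
  for (int i = 0; i < 16; ++i) {
    dst[i] = (mask[i] & 0x80) ? 0 : src[mask[i] & 0x0f];
  }
}

With kShuffleMaskABGRToARGB = {2,1,0,3, 6,5,4,7, ...} every 4-byte pixel has its first and third bytes swapped (R and B exchanged) while G and alpha stay in place, which is exactly the ABGR to ARGB conversion.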
@@ -145,17 +145,17 @@ void BG24ToARGBRow_SSSE3(const uint8* src_bg24, uint8* dst_argb, int pix) {
   "movdqa 0x20(%0),%%xmm3 \n"
   "lea 0x30(%0),%0 \n"
   "movdqa %%xmm3,%%xmm2 \n"
-  "palignr $0x8,%%xmm1,%%xmm2 \n"  // xmm2 = { xmm3[0:3] xmm1[8:15] }
+  "palignr $0x8,%%xmm1,%%xmm2 \n"
   "pshufb %%xmm4,%%xmm2 \n"
   "por %%xmm5,%%xmm2 \n"
-  "palignr $0xc,%%xmm0,%%xmm1 \n"  // xmm1 = { xmm3[0:7] xmm0[12:15] }
+  "palignr $0xc,%%xmm0,%%xmm1 \n"
   "pshufb %%xmm4,%%xmm0 \n"
   "movdqa %%xmm2,0x20(%1) \n"
   "por %%xmm5,%%xmm0 \n"
   "pshufb %%xmm4,%%xmm1 \n"
   "movdqa %%xmm0,(%1) \n"
   "por %%xmm5,%%xmm1 \n"
-  "palignr $0x4,%%xmm3,%%xmm3 \n"  // xmm3 = { xmm3[4:15] }
+  "palignr $0x4,%%xmm3,%%xmm3 \n"
   "pshufb %%xmm4,%%xmm3 \n"
   "movdqa %%xmm1,0x10(%1) \n"
   "por %%xmm5,%%xmm3 \n"

@@ -185,17 +185,17 @@ void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, int pix) {
   "movdqa 0x20(%0),%%xmm3 \n"
   "lea 0x30(%0),%0 \n"
   "movdqa %%xmm3,%%xmm2 \n"
-  "palignr $0x8,%%xmm1,%%xmm2 \n"  // xmm2 = { xmm3[0:3] xmm1[8:15] }
+  "palignr $0x8,%%xmm1,%%xmm2 \n"
   "pshufb %%xmm4,%%xmm2 \n"
   "por %%xmm5,%%xmm2 \n"
-  "palignr $0xc,%%xmm0,%%xmm1 \n"  // xmm1 = { xmm3[0:7] xmm0[12:15] }
+  "palignr $0xc,%%xmm0,%%xmm1 \n"
   "pshufb %%xmm4,%%xmm0 \n"
   "movdqa %%xmm2,0x20(%1) \n"
   "por %%xmm5,%%xmm0 \n"
   "pshufb %%xmm4,%%xmm1 \n"
   "movdqa %%xmm0,(%1) \n"
   "por %%xmm5,%%xmm1 \n"
-  "palignr $0x4,%%xmm3,%%xmm3 \n"  // xmm3 = { xmm3[4:15] }
+  "palignr $0x4,%%xmm3,%%xmm3 \n"
   "pshufb %%xmm4,%%xmm3 \n"
   "movdqa %%xmm1,0x10(%1) \n"
   "por %%xmm5,%%xmm3 \n"
@@ -318,229 +318,320 @@ void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
 }
 #endif
 
-// The following code requires 6 registers and prefers 7 registers.
-// 7 registers requires -fpic to be off, and -fomit-frame-pointer
-#ifdef HAS_FASTCONVERTYUVTOARGBROW_SSE2
-#if defined(__x86_64__)
-#define REG_a "rax"
-#define REG_d "rdx"
-#else
-#define REG_a "eax"
-#define REG_d "edx"
-#endif
+#ifdef HAS_FASTCONVERTYTOARGBROW_SSE2
+#define YG 74 /* static_cast<int8>(1.164 * 64 + 0.5) */
+
+vec16 kYToRgb = { YG, YG, YG, YG, YG, YG, YG, YG };
+vec16 kYSub16 = { 16, 16, 16, 16, 16, 16, 16, 16 };
+#endif
+#ifdef HAS_FASTCONVERTYUVTOARGBROW_SSSE3
+#define UB 127 /* min(63,static_cast<int8>(2.018 * 64)) */
+#define UG -25 /* static_cast<int8>(-0.391 * 64 - 0.5) */
+#define UR 0
+#define VB 0
+#define VG -52 /* static_cast<int8>(-0.813 * 64 - 0.5) */
+#define VR 102 /* static_cast<int8>(1.596 * 64 + 0.5) */
+
+// Bias
+#define BB UB * 128 + VB * 128
+#define BG UG * 128 + VG * 128
+#define BR UR * 128 + VR * 128
+
+vec8 kUVToB = {
+  UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB
+};
+vec8 kUVToR = {
+  UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR
+};
+vec8 kUVToG = {
+  UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG
+};
+
+vec16 kUVBiasB = { BB, BB, BB, BB, BB, BB, BB, BB };
+vec16 kUVBiasG = { BG, BG, BG, BG, BG, BG, BG, BG };
+vec16 kUVBiasR = { BR, BR, BR, BR, BR, BR, BR, BR };
+
 #if defined(__APPLE__) || defined(__x86_64__)
 #define OMITFP
 #else
 #define OMITFP __attribute__((optimize("omit-frame-pointer")))
 #endif
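In plain terms, the constants above encode the BT.601 YUV-to-RGB matrix in 6-bit fixed point: YG = 74 is roughly 1.164 * 64, VR = 102 is roughly 1.596 * 64, and UB is pinned at 127 (the int8 maximum) rather than 2.018 * 64. A self-contained scalar sketch of the per-pixel math the row functions below implement; packuswb's unsigned saturation is modeled by the clamp:

#include <stdint.h>

static uint8_t Clamp(int v) { return v < 0 ? 0 : (v > 255 ? 255 : (uint8_t)v); }

// One pixel of the fixed-point conversion: coefficients scaled by 64, shifted out by >> 6.
static void YuvPixel(uint8_t y, uint8_t u, uint8_t v,
                     uint8_t* b, uint8_t* g, uint8_t* r) {
  int y1 = ((int)y - 16) * 74;                                        // kYSub16, kYToRgb
  *b = Clamp((y1 + 127 * ((int)u - 128)) >> 6);                       // UB
  *g = Clamp((y1 - 25 * ((int)u - 128) - 52 * ((int)v - 128)) >> 6);  // UG, VG
  *r = Clamp((y1 + 102 * ((int)v - 128)) >> 6);                       // VR
}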
-#define CLOBBER "%"REG_a, "%"REG_d
-// This version produces 2 pixels
+// This version produces 8 pixels
 #define YUVTORGB \
-  "1: \n" \
-  "movzb (%1),%%"REG_a" \n" \
-  "lea 1(%1),%1 \n" \
-  "movzb (%2),%%"REG_d" \n" \
-  "lea 1(%2),%2 \n" \
-  "movq 2048(%5,%%"REG_a",8),%%xmm0 \n" \
-  "movzb 0(%0),%%"REG_a" \n" \
-  "movq 4096(%5,%%"REG_d",8),%%xmm1 \n" \
-  "paddsw %%xmm1,%%xmm0 \n" \
-  "movzb 1(%0),%%"REG_d" \n" \
-  "punpcklqdq %%xmm0,%%xmm0 \n" \
-  "lea 2(%0),%0 \n" \
-  "movq 0(%5,%%"REG_a",8),%%xmm1 \n" \
-  "movhps 0(%5,%%"REG_d",8),%%xmm1 \n" \
-  "paddsw %%xmm0,%%xmm1 \n" \
-  "psraw $6,%%xmm1 \n" \
-  "packuswb %%xmm1,%%xmm1 \n" \
-  "movq %%xmm1,0(%3) \n" \
-  "lea 8(%3),%3 \n" \
-  "sub $0x2,%4 \n" \
-  "ja 1b \n"
-// This version produces 4 pixels
-#define YUVTORGB4 \
-  "1: \n" \
-  "movzb 0(%1),%%"REG_a" \n" \
-  "movzb 0(%2),%%"REG_d" \n" \
-  "movq 2048(%5,%%"REG_a",8),%%xmm0 \n" \
-  "movzb 0(%0),%%"REG_a" \n" \
-  "movq 4096(%5,%%"REG_d",8),%%xmm1 \n" \
-  "paddsw %%xmm1,%%xmm0 \n" \
-  "movzb 1(%0),%%"REG_d" \n" \
-  "punpcklqdq %%xmm0,%%xmm0 \n" \
-  "movq 0(%5,%%"REG_a",8),%%xmm2 \n" \
-  "movhps 0(%5,%%"REG_d",8),%%xmm2 \n" \
-  "paddsw %%xmm0,%%xmm2 \n" \
-  "psraw $6,%%xmm2 \n" \
-  "movzb 1(%1),%%"REG_a" \n" \
-  "movzb 1(%2),%%"REG_d" \n" \
-  "movq 2048(%5,%%"REG_a",8),%%xmm0 \n" \
-  "movzb 2(%0),%%"REG_a" \n" \
-  "movq 4096(%5,%%"REG_d",8),%%xmm1 \n" \
-  "paddsw %%xmm1,%%xmm0 \n" \
-  "movzb 3(%0),%%"REG_d" \n" \
-  "punpcklqdq %%xmm0,%%xmm0 \n" \
-  "movq 0(%5,%%"REG_a",8),%%xmm3 \n" \
-  "movhps 0(%5,%%"REG_d",8),%%xmm3 \n" \
-  "paddsw %%xmm0,%%xmm3 \n" \
-  "psraw $6,%%xmm3 \n" \
-  "lea 2(%1),%1 \n" \
-  "lea 2(%2),%2 \n" \
-  "lea 4(%0),%0 \n" \
-  "packuswb %%xmm3,%%xmm2 \n" \
-  "movdqa %%xmm2,0(%3) \n" \
-  "lea 16(%3),%3 \n" \
-  "sub $0x4,%4 \n" \
-  "ja 1b \n" \
-
-// 6 or 7 registers
-void OMITFP FastConvertYUVToARGBRow_SSE2(const uint8* y_buf, // rdi
+  "movd (%1),%%xmm0 \n" \
+  "movd (%1,%2,1),%%xmm1 \n" \
+  "lea 0x4(%1),%1 \n" \
+  "punpcklbw %%xmm1,%%xmm0 \n" \
+  "punpcklwd %%xmm0,%%xmm0 \n" \
+  "movdqa %%xmm0,%%xmm1 \n" \
+  "movdqa %%xmm0,%%xmm2 \n" \
+  "pmaddubsw %5,%%xmm0 \n" \
+  "pmaddubsw %6,%%xmm1 \n" \
+  "pmaddubsw %7,%%xmm2 \n" \
+  "psubw %8,%%xmm0 \n" \
+  "psubw %9,%%xmm1 \n" \
+  "psubw %10,%%xmm2 \n" \
+  "movq (%0),%%xmm3 \n" \
+  "lea 0x8(%0),%0 \n" \
+  "punpcklbw %%xmm4,%%xmm3 \n" \
+  "psubsw %11,%%xmm3 \n" \
+  "pmullw %12,%%xmm3 \n" \
+  "paddw %%xmm3,%%xmm0 \n" \
+  "paddw %%xmm3,%%xmm1 \n" \
+  "paddw %%xmm3,%%xmm2 \n" \
+  "psraw $0x6,%%xmm0 \n" \
+  "psraw $0x6,%%xmm1 \n" \
+  "psraw $0x6,%%xmm2 \n" \
+  "packuswb %%xmm0,%%xmm0 \n" \
+  "packuswb %%xmm1,%%xmm1 \n" \
+  "packuswb %%xmm2,%%xmm2 \n"
+
+void OMITFP FastConvertYUVToARGBRow_SSSE3(const uint8* y_buf, // rdi
                                           const uint8* u_buf, // rsi
                                           const uint8* v_buf, // rdx
                                           uint8* rgb_buf, // rcx
                                           int width) { // r8
   asm volatile (
+  "sub %1,%2 \n"
+  "pcmpeqb %%xmm5,%%xmm5 \n"
+  "pxor %%xmm4,%%xmm4 \n"
+"1: \n"
   YUVTORGB
+  "punpcklbw %%xmm1,%%xmm0 \n"
+  "punpcklbw %%xmm5,%%xmm2 \n"
+  "movdqa %%xmm0,%%xmm1 \n"
+  "punpcklwd %%xmm2,%%xmm0 \n"
+  "movdqa %%xmm0,(%3) \n"
+  "punpckhwd %%xmm2,%%xmm1 \n"
+  "movdqa %%xmm1,0x10(%3) \n"
+  "lea 0x20(%3),%3 \n"
+  "sub $0x8,%4 \n"
+  "ja 1b \n"
   : "+r"(y_buf), // %0
     "+r"(u_buf), // %1
     "+r"(v_buf), // %2
     "+r"(rgb_buf), // %3
     "+rm"(width) // %4
-  : "r" (kCoefficientsRgbY) // %5
-  : "memory", "cc", CLOBBER
+  : "m" (kUVToB), // %5
+    "m" (kUVToG), // %6
+    "m" (kUVToR), // %7
+    "m" (kUVBiasB), // %8
+    "m" (kUVBiasG), // %9
+    "m" (kUVBiasR), // %10
+    "m" (kYSub16), // %11
+    "m" (kYToRgb) // %12
+  : "memory", "cc"
 #if defined(__SSE2__)
-  , "xmm0", "xmm1", "xmm2", "xmm3"
+  , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
 #endif
   );
 }
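Putting the macro and the weave together, one iteration of the loop above converts 8 pixels: it loads 4 U and 4 V bytes (each chroma sample covers two pixels), computes B, G and R as 16-bit words, adds the scaled Y term, packs to bytes, and interleaves them with a 0xFF alpha. A scalar sketch of the same row, reusing the YuvPixel helper from the sketch further up:

// Scalar equivalent of FastConvertYUVToARGBRow_SSSE3 (sketch; two pixels share one U,V).
static void YuvToArgbRowSketch(const uint8_t* y_buf, const uint8_t* u_buf,
                               const uint8_t* v_buf, uint8_t* argb, int width) {
  for (int x = 0; x < width; x += 2) {
    uint8_t u = u_buf[x / 2];
    uint8_t v = v_buf[x / 2];
    for (int i = 0; i < 2; ++i) {
      uint8_t b, g, r;
      YuvPixel(y_buf[x + i], u, v, &b, &g, &r);
      argb[4 * (x + i) + 0] = b;    // libyuv ARGB byte order in memory: B, G, R, A
      argb[4 * (x + i) + 1] = g;
      argb[4 * (x + i) + 2] = r;
      argb[4 * (x + i) + 3] = 255;  // alpha comes from the pcmpeqb %%xmm5 above
    }
  }
}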
-// 6 or 7 registers
-void OMITFP FastConvertYUVToARGBRow4_SSE2(const uint8* y_buf, // rdi
+void OMITFP FastConvertYUVToBGRARow_SSSE3(const uint8* y_buf, // rdi
                                           const uint8* u_buf, // rsi
                                           const uint8* v_buf, // rdx
                                           uint8* rgb_buf, // rcx
                                           int width) { // r8
   asm volatile (
-  YUVTORGB4
-  : "+r"(y_buf), // %0
-    "+r"(u_buf), // %1
-    "+r"(v_buf), // %2
-    "+r"(rgb_buf), // %3
-    "+rm"(width) // %4
-  : "r" (kCoefficientsRgbY) // %5
-  : "memory", "cc", CLOBBER
-#if defined(__SSE2__)
-    , "xmm0", "xmm1", "xmm2", "xmm3"
-#endif
-  );
-}
-
-void OMITFP FastConvertYUVToBGRARow_SSE2(const uint8* y_buf, // rdi
-                                         const uint8* u_buf, // rsi
-                                         const uint8* v_buf, // rdx
-                                         uint8* rgb_buf, // rcx
-                                         int width) { // r8
-  asm volatile (
+  "sub %1,%2 \n"
+  "pcmpeqb %%xmm5,%%xmm5 \n"
+  "pxor %%xmm4,%%xmm4 \n"
+"1: \n"
   YUVTORGB
+  "pcmpeqb %%xmm5,%%xmm5 \n"
+  "punpcklbw %%xmm0,%%xmm1 \n"
+  "punpcklbw %%xmm2,%%xmm5 \n"
+  "movdqa %%xmm5,%%xmm0 \n"
+  "punpcklwd %%xmm1,%%xmm5 \n"
+  "movdqa %%xmm5,(%3) \n"
+  "punpckhwd %%xmm1,%%xmm0 \n"
+  "movdqa %%xmm0,0x10(%3) \n"
+  "lea 0x20(%3),%3 \n"
+  "sub $0x8,%4 \n"
+  "ja 1b \n"
   : "+r"(y_buf), // %0
     "+r"(u_buf), // %1
     "+r"(v_buf), // %2
     "+r"(rgb_buf), // %3
    "+rm"(width) // %4
-  : "r" (kCoefficientsBgraY) // %5
-  : "memory", "cc", CLOBBER
+  : "m" (kUVToB), // %5
+    "m" (kUVToG), // %6
+    "m" (kUVToR), // %7
+    "m" (kUVBiasB), // %8
+    "m" (kUVBiasG), // %9
+    "m" (kUVBiasR), // %10
+    "m" (kYSub16), // %11
+    "m" (kYToRgb) // %12
+  : "memory", "cc"
 #if defined(__SSE2__)
-  , "xmm0", "xmm1", "xmm2", "xmm3"
+  , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
 #endif
   );
 }
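One idiom worth calling out in the new rows: "sub %1,%2" turns the V pointer into an offset from the U pointer, so "movd (%1,%2,1)" reads V and a single "lea 0x4(%1),%1" advances both chroma planes. Spelled out in C (a sketch that only models the pointer walk, not the conversion itself):

#include <stddef.h>
#include <stdint.h>

// The u/v single-increment trick from the asm above.
static void WalkChroma(const uint8_t* u_buf, const uint8_t* v_buf, int width) {
  ptrdiff_t v_minus_u = v_buf - u_buf;   // "sub %1,%2"
  const uint8_t* u = u_buf;
  for (int x = 0; x < width; x += 8) {   // 8 pixels per iteration
    const uint8_t* u4 = u;               // "movd (%1),%%xmm0": 4 U bytes
    const uint8_t* v4 = u + v_minus_u;   // "movd (%1,%2,1),%%xmm1": 4 V bytes
    (void)u4;
    (void)v4;
    u += 4;                              // "lea 0x4(%1),%1" advances both planes
  }
}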
-void OMITFP FastConvertYUVToABGRRow_SSE2(const uint8* y_buf, // rdi
+void OMITFP FastConvertYUVToABGRRow_SSSE3(const uint8* y_buf, // rdi
                                           const uint8* u_buf, // rsi
                                           const uint8* v_buf, // rdx
                                           uint8* rgb_buf, // rcx
                                           int width) { // r8
   asm volatile (
+  "sub %1,%2 \n"
+  "pcmpeqb %%xmm5,%%xmm5 \n"
+  "pxor %%xmm4,%%xmm4 \n"
+"1: \n"
   YUVTORGB
+  "packuswb %%xmm0,%%xmm0 \n"
+  "packuswb %%xmm1,%%xmm1 \n"
+  "packuswb %%xmm2,%%xmm2 \n"
+  "punpcklbw %%xmm1,%%xmm2 \n"
+  "punpcklbw %%xmm5,%%xmm0 \n"
+  "movdqa %%xmm2,%%xmm1 \n"
+  "punpcklwd %%xmm0,%%xmm2 \n"
+  "movdqa %%xmm2,(%3) \n"
+  "punpckhwd %%xmm0,%%xmm1 \n"
+  "movdqa %%xmm1,0x10(%3) \n"
+  "lea 0x20(%3),%3 \n"
+  "sub $0x8,%4 \n"
+  "ja 1b \n"
   : "+r"(y_buf), // %0
     "+r"(u_buf), // %1
     "+r"(v_buf), // %2
     "+r"(rgb_buf), // %3
     "+rm"(width) // %4
-  : "r" (kCoefficientsAbgrY) // %5
-  : "memory", "cc", CLOBBER
+  : "m" (kUVToB), // %5
+    "m" (kUVToG), // %6
+    "m" (kUVToR), // %7
+    "m" (kUVBiasB), // %8
+    "m" (kUVBiasG), // %9
+    "m" (kUVBiasR), // %10
+    "m" (kYSub16), // %11
+    "m" (kYToRgb) // %12
+  : "memory", "cc"
 #if defined(__SSE2__)
-  , "xmm0", "xmm1", "xmm2", "xmm3"
+  , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
 #endif
   );
 }
-// 6 registers
-void OMITFP FastConvertYUV444ToARGBRow_SSE2(const uint8* y_buf, // rdi
-                                            const uint8* u_buf, // rsi
-                                            const uint8* v_buf, // rdx
-                                            uint8* rgb_buf, // rcx
-                                            int width) { // r8
+void OMITFP FastConvertYUV444ToARGBRow_SSSE3(const uint8* y_buf, // rdi
+                                             const uint8* u_buf, // rsi
+                                             const uint8* v_buf, // rdx
+                                             uint8* rgb_buf, // rcx
+                                             int width) { // r8
   asm volatile (
-  "1: \n"
-  "movzb (%1),%%"REG_a" \n"
-  "lea 1(%1),%1 \n"
-  "movq 2048(%5,%%"REG_a",8),%%xmm0 \n"
-  "movzb (%2),%%"REG_a" \n"
-  "lea 1(%2),%2 \n"
-  "movq 4096(%5,%%"REG_a",8),%%xmm1 \n"
-  "paddsw %%xmm1,%%xmm0 \n"
-  "movzb (%0),%%"REG_a" \n"
-  "lea 1(%0),%0 \n"
-  "movq 0(%5,%%"REG_a",8),%%xmm2 \n"
-  "paddsw %%xmm0,%%xmm2 \n"
-  "shufps $0x44,%%xmm2,%%xmm2 \n"
-  "psraw $0x6,%%xmm2 \n"
-  "packuswb %%xmm2,%%xmm2 \n"
-  "movd %%xmm2,0x0(%3) \n"
-  "lea 4(%3),%3 \n"
-  "sub $0x1,%4 \n"
-  "ja 1b \n"
+  "sub %1,%2 \n"
+  "pcmpeqb %%xmm5,%%xmm5 \n"
+  "pxor %%xmm4,%%xmm4 \n"
+"1: \n"
+  "movd (%1),%%xmm0 \n"
+  "movd (%1,%2,1),%%xmm1 \n"
+  "lea 0x4(%1),%1 \n"
+  "punpcklbw %%xmm1,%%xmm0 \n"
+  "movdqa %%xmm0,%%xmm1 \n"
+  "movdqa %%xmm0,%%xmm2 \n"
+  "pmaddubsw %5,%%xmm0 \n"
+  "pmaddubsw %6,%%xmm1 \n"
+  "pmaddubsw %7,%%xmm2 \n"
+  "psubw %8,%%xmm0 \n"
+  "psubw %9,%%xmm1 \n"
+  "psubw %10,%%xmm2 \n"
+  "movd (%0),%%xmm3 \n"
+  "lea 0x4(%0),%0 \n"
+  "punpcklbw %%xmm4,%%xmm3 \n"
+  "psubsw %11,%%xmm3 \n"
+  "pmullw %12,%%xmm3 \n"
+  "paddw %%xmm3,%%xmm0 \n"
+  "paddw %%xmm3,%%xmm1 \n"
+  "paddw %%xmm3,%%xmm2 \n"
+  "psraw $0x6,%%xmm0 \n"
+  "psraw $0x6,%%xmm1 \n"
+  "psraw $0x6,%%xmm2 \n"
+  "packuswb %%xmm0,%%xmm0 \n"
+  "packuswb %%xmm1,%%xmm1 \n"
+  "packuswb %%xmm2,%%xmm2 \n"
+  "punpcklbw %%xmm1,%%xmm0 \n"
+  "punpcklbw %%xmm5,%%xmm2 \n"
+  "punpcklwd %%xmm2,%%xmm0 \n"
+  "movdqa %%xmm0,(%3) \n"
+  "lea 0x10(%3),%3 \n"
+  "sub $0x4,%4 \n"
+  "ja 1b \n"
   : "+r"(y_buf), // %0
     "+r"(u_buf), // %1
     "+r"(v_buf), // %2
     "+r"(rgb_buf), // %3
     "+rm"(width) // %4
-  : "r" (kCoefficientsRgbY) // %5
-  : "memory", "cc", "%"REG_a
+  : "m" (kUVToB), // %5
+    "m" (kUVToG), // %6
+    "m" (kUVToR), // %7
+    "m" (kUVBiasB), // %8
+    "m" (kUVBiasG), // %9
+    "m" (kUVBiasR), // %10
+    "m" (kYSub16), // %11
+    "m" (kYToRgb) // %12
+  : "memory", "cc"
 #if defined(__SSE2__)
-  , "xmm0", "xmm1", "xmm2"
+  , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
 #endif
   );
 }
-// 5 registers
+#endif
+
+#ifdef HAS_FASTCONVERTYTOARGBROW_SSE2
 void FastConvertYToARGBRow_SSE2(const uint8* y_buf, // rdi
                                 uint8* rgb_buf, // rcx
                                 int width) { // r8
   asm volatile (
-  "1: \n"
-  "movzb (%0),%%"REG_a" \n"
-  "movzb 0x1(%0),%%"REG_d" \n"
-  "movq (%3,%%"REG_a",8),%%xmm2 \n"
-  "lea 2(%0),%0 \n"
-  "movhps (%3,%%"REG_d",8),%%xmm2 \n"
-  "psraw $0x6,%%xmm2 \n"
-  "packuswb %%xmm2,%%xmm2 \n"
-  "movq %%xmm2,0x0(%1) \n"
-  "lea 8(%1),%1 \n"
-  "sub $0x2,%2 \n"
-  "ja 1b \n"
+  "pcmpeqb %%xmm5,%%xmm5 \n"
+  "pslld $0x18,%%xmm5 \n"
+  "pxor %%xmm4,%%xmm4 \n"
+  "movdqa %3,%%xmm3 \n"
+  "movdqa %4,%%xmm2 \n"
+"1: \n"
+  // Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164
+  "movq (%0),%%xmm0 \n"
+  "lea 0x8(%0),%0 \n"
+  "punpcklbw %%xmm4,%%xmm0 \n"
+  "psubsw %%xmm3,%%xmm0 \n"
+  "pmullw %%xmm2,%%xmm0 \n"
+  "psraw $0x6,%%xmm0 \n"
+  "packuswb %%xmm0,%%xmm0 \n"
+  // Step 2: Weave into ARGB
+  "punpcklbw %%xmm0,%%xmm0 \n"
+  "movdqa %%xmm0,%%xmm1 \n"
+  "punpcklwd %%xmm0,%%xmm0 \n"
+  "por %%xmm5,%%xmm0 \n"
+  "movdqa %%xmm0,(%1) \n"
+  "punpckhwd %%xmm1,%%xmm1 \n"
+  "por %%xmm5,%%xmm1 \n"
+  "movdqa %%xmm1,16(%1) \n"
+  "lea 32(%1),%1 \n"
+  "sub $0x8,%2 \n"
+  "ja 1b \n"
   : "+r"(y_buf), // %0
     "+r"(rgb_buf), // %1
     "+rm"(width) // %2
-  : "r" (kCoefficientsRgbY) // %3
-  : "memory", "cc", "%"REG_a, "%"REG_d
+  : "m" (kYSub16), // %3
+    "m" (kYToRgb) // %4
+  : "memory", "cc"
 #if defined(__SSE2__)
-  , "xmm0", "xmm1", "xmm2"
+  , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
 #endif
   );
 }
 #endif
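The Y-only row above applies just the luma scaling and replicates the result into all three color channels. A self-contained scalar sketch of what it computes per pixel:

#include <stdint.h>

// Scalar sketch of FastConvertYToARGBRow_SSE2: gray = (y - 16) * 1.164, written to
// B, G and R, with alpha forced to 255 (the "por %%xmm5" of 0xff000000 above).
static void YToArgbRowSketch(const uint8_t* y_buf, uint8_t* argb, int width) {
  for (int x = 0; x < width; ++x) {
    int g = (((int)y_buf[x] - 16) * 74) >> 6;   // kYSub16, kYToRgb (YG = 74)
    uint8_t gray = g < 0 ? 0 : (g > 255 ? 255 : (uint8_t)g);
    argb[4 * x + 0] = gray;
    argb[4 * x + 1] = gray;
    argb[4 * x + 2] = gray;
    argb[4 * x + 3] = 255;
  }
}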
......
@@ -54,8 +54,7 @@ static const vec8 kABGRToV = {
 };
 static const uvec8 kAddY16 = {
-  16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u,
-  16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u,
+  16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u
 };
 
 static const uvec8 kAddUV128 = {

@@ -548,27 +547,13 @@ static const vec8 kUVToG = {
   UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG
 };
 
-static const vec16 kYToRgb = {
-  YG, YG, YG, YG, YG, YG, YG, YG
-};
-static const vec16 kYSub16 = {
-  16, 16, 16, 16, 16, 16, 16, 16
-};
-static const vec16 kUVBiasB = {
-  BB, BB, BB, BB, BB, BB, BB, BB
-};
-static const vec16 kUVBiasG = {
-  BG, BG, BG, BG, BG, BG, BG, BG
-};
-static const vec16 kUVBiasR = {
-  BR, BR, BR, BR, BR, BR, BR, BR
-};
+static const vec16 kYToRgb = { YG, YG, YG, YG, YG, YG, YG, YG };
+static const vec16 kYSub16 = { 16, 16, 16, 16, 16, 16, 16, 16 };
+static const vec16 kUVBiasB = { BB, BB, BB, BB, BB, BB, BB, BB };
+static const vec16 kUVBiasG = { BG, BG, BG, BG, BG, BG, BG, BG };
+static const vec16 kUVBiasR = { BR, BR, BR, BR, BR, BR, BR, BR };
 
-#define YUVTORGB_SSSE3 __asm { \
+#define YUVTORGB __asm { \
     /* Step 1: Find 4 UV contributions to 8 R,G,B values */ \
     __asm movd xmm0, [esi] /* U */ \
     __asm movd xmm1, [esi + edi] /* V */ \

@@ -619,7 +604,7 @@ void FastConvertYUVToARGBRow_SSSE3(const uint8* y_buf,
     pxor xmm4, xmm4
 
  convertloop:
-    YUVTORGB_SSSE3
+    YUVTORGB
 
     // Step 3: Weave into ARGB
     punpcklbw xmm0, xmm1 // BG

@@ -658,7 +643,7 @@ void FastConvertYUVToBGRARow_SSSE3(const uint8* y_buf,
     pxor xmm4, xmm4
 
  convertloop:
-    YUVTORGB_SSSE3
+    YUVTORGB
 
     // Step 3: Weave into BGRA
     pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha

@@ -699,7 +684,7 @@ void FastConvertYUVToABGRRow_SSSE3(const uint8* y_buf,
     pxor xmm4, xmm4
 
  convertloop:
-    YUVTORGB_SSSE3
+    YUVTORGB
 
     // Step 3: Weave into ARGB
     punpcklbw xmm2, xmm1 // RG

@@ -787,7 +772,6 @@ void FastConvertYUV444ToARGBRow_SSSE3(const uint8* y_buf,
 #endif
 
 #ifdef HAS_FASTCONVERTYTOARGBROW_SSE2
-__declspec(naked)
 void FastConvertYToARGBRow_SSE2(const uint8* y_buf,
                                 uint8* rgb_buf,

@@ -829,8 +813,8 @@ void FastConvertYToARGBRow_SSE2(const uint8* y_buf,
     ret
   }
 }
 #endif
 #endif
 
 } // extern "C"
......