Commit eaedc1d7 authored by fbarchard@google.com

Remove MMX functions

BUG=none
TEST=builds
Review URL: http://webrtc-codereview.appspot.com/269010

git-svn-id: http://libyuv.googlecode.com/svn/trunk@77 16f28f9a-4ce2-e073-06de-1de4eb20be90
parent c82af4a5
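
With the MMX row functions gone, each converter falls through directly from its SSE2/SSSE3 path to the portable C row, and the per-image EMMS() call disappears. A minimal sketch of the resulting shape, abbreviated from the I422ToARGB hunks below (the CPU-feature and alignment checks are elided; uint8 and the row-function names are taken from this diff):

    int I422ToARGB(const uint8* src_y, int src_stride_y,
                   const uint8* src_u, int src_stride_u,
                   const uint8* src_v, int src_stride_v,
                   uint8* dst_argb, int dst_stride_argb,
                   int width, int height) {
      // Pick the fastest available row function; default to the C version.
      void (*FastConvertYUVToARGBRow)(const uint8* y_buf, const uint8* u_buf,
                                      const uint8* v_buf, uint8* rgb_buf,
                                      int width) = FastConvertYUVToARGBRow_C;
    #if defined(HAS_FASTCONVERTYUVTOARGBROW_SSE2)
      if (/* SSE2 present, buffers suitably aligned */ (width % 2 == 0)) {
        FastConvertYUVToARGBRow = FastConvertYUVToARGBRow_SSE2;
      }
    #endif
      for (int y = 0; y < height; ++y) {
        FastConvertYUVToARGBRow(src_y, src_u, src_v, dst_argb, width);
        dst_argb += dst_stride_argb;
        src_y += src_stride_y;
        src_u += src_stride_u;
        src_v += src_stride_v;
      }
      // No EMMS() here any more: only the removed MMX rows left MMX/x87 state dirty.
      return 0;
    }
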
Name: libyuv
URL: http://code.google.com/p/libyuv/
Version: 69
Version: 77
License: BSD
License File: LICENSE
@@ -1149,11 +1149,6 @@ int I420ToARGB(const uint8* src_y, int src_stride_y,
(width % 2 == 0)) {
FastConvertYUVToARGBRow = FastConvertYUVToARGBRow_SSE2;
} else
#endif
#if defined(HAS_FASTCONVERTYUVTOARGBROW_MMX)
if (width % 2 == 0) {
FastConvertYUVToARGBRow = FastConvertYUVToARGBRow_MMX;
} else
#endif
{
FastConvertYUVToARGBRow = FastConvertYUVToARGBRow_C;
@@ -1167,8 +1162,6 @@ int I420ToARGB(const uint8* src_y, int src_stride_y,
src_v += src_stride_v;
}
}
// MMX used for FastConvertYUVToARGBRow requires an emms instruction.
EMMS();
return 0;
}
@@ -1201,11 +1194,6 @@ int I420ToBGRA(const uint8* src_y, int src_stride_y,
(width % 2 == 0)) {
FastConvertYUVToBGRARow = FastConvertYUVToBGRARow_SSE2;
} else
#endif
#if defined(HAS_FASTCONVERTYUVTOBGRAROW_MMX)
if (width % 2 == 0) {
FastConvertYUVToBGRARow = FastConvertYUVToBGRARow_MMX;
} else
#endif
{
FastConvertYUVToBGRARow = FastConvertYUVToBGRARow_C;
@@ -1219,7 +1207,6 @@ int I420ToBGRA(const uint8* src_y, int src_stride_y,
src_v += src_stride_v;
}
}
EMMS();
return 0;
}
@@ -1252,11 +1239,6 @@ int I420ToABGR(const uint8* src_y, int src_stride_y,
(width % 2 == 0)) {
FastConvertYUVToABGRRow = FastConvertYUVToABGRRow_SSE2;
} else
#endif
#if defined(HAS_FASTCONVERTYUVTOABGRROW_MMX)
if (width % 2 == 0) {
FastConvertYUVToABGRRow = FastConvertYUVToABGRRow_MMX;
} else
#endif
{
FastConvertYUVToABGRRow = FastConvertYUVToABGRRow_C;
@@ -1270,7 +1252,6 @@ int I420ToABGR(const uint8* src_y, int src_stride_y,
src_v += src_stride_v;
}
}
EMMS();
return 0;
}
@@ -1303,11 +1284,6 @@ int I422ToARGB(const uint8* src_y, int src_stride_y,
(width % 2 == 0)) {
FastConvertYUVToARGBRow = FastConvertYUVToARGBRow_SSE2;
} else
#endif
#if defined(HAS_FASTCONVERTYUVTOARGBROW_MMX)
if (width % 2 == 0) {
FastConvertYUVToARGBRow = FastConvertYUVToARGBRow_MMX;
} else
#endif
{
FastConvertYUVToARGBRow = FastConvertYUVToARGBRow_C;
@@ -1319,8 +1295,6 @@ int I422ToARGB(const uint8* src_y, int src_stride_y,
src_u += src_stride_u;
src_v += src_stride_v;
}
// MMX used for FastConvertYUVToARGBRow requires an emms instruction.
EMMS();
return 0;
}
@@ -1353,13 +1327,9 @@ int I444ToARGB(const uint8* src_y, int src_stride_y,
FastConvertYUV444ToARGBRow = FastConvertYUV444ToARGBRow_SSE2;
} else
#endif
#if defined(HAS_FASTCONVERTYUVTOARGBROW_MMX)
FastConvertYUV444ToARGBRow = FastConvertYUV444ToARGBRow_MMX;
#else
{
FastConvertYUV444ToARGBRow = FastConvertYUV444ToARGBRow_C;
}
#endif
for (int y = 0; y < height; ++y) {
FastConvertYUV444ToARGBRow(src_y, src_u, src_v, dst_argb, width);
dst_argb += dst_stride_argb;
@@ -1367,8 +1337,6 @@ int I444ToARGB(const uint8* src_y, int src_stride_y,
src_u += src_stride_u;
src_v += src_stride_v;
}
// MMX used for FastConvertYUVToARGBRow requires an emms instruction.
EMMS();
return 0;
}
@@ -1391,11 +1359,6 @@ int I400ToARGB_Reference(const uint8* src_y, int src_stride_y,
IS_ALIGNED(dst_argb, 16) && (dst_stride_argb % 16 == 0)) {
FastConvertYToARGBRow = FastConvertYToARGBRow_SSE2;
} else
#endif
#if defined(HAS_FASTCONVERTYUVTOARGBROW_MMX)
if (width % 2 == 0) {
FastConvertYToARGBRow = FastConvertYToARGBRow_MMX;
} else
#endif
{
FastConvertYToARGBRow = FastConvertYToARGBRow_C;
@@ -1405,8 +1368,6 @@ int I400ToARGB_Reference(const uint8* src_y, int src_stride_y,
dst_argb += dst_stride_argb;
src_y += src_stride_y;
}
// MMX used for FastConvertYUVToARGBRow requires an emms instruction.
EMMS();
return 0;
}
@@ -13,21 +13,19 @@
#include "libyuv/cpu_id.h"
#include "libyuv/planar_functions.h"
#include "rotate_priv.h"
#include "row.h"
namespace libyuv {
#if (defined(WIN32) || defined(__x86_64__) || defined(__i386__)) \
&& !defined(COVERAGE_ENABLED) && !defined(TARGET_IPHONE_SIMULATOR)
#if defined(_MSC_VER)
#define TALIGN16(t, var) static __declspec(align(16)) t _ ## var
#else
#define TALIGN16(t, var) t var __attribute__((aligned(16)))
#endif
#if (defined(WIN32) || defined(__x86_64__) || defined(__i386__)) && \
!defined(__APPLE__) && \
!defined(COVERAGE_ENABLED) && !defined(TARGET_IPHONE_SIMULATOR)
// Shuffle table for reversing the bytes.
extern "C" TALIGN16(const uint8, kShuffleReverse[16]) =
static const uvec8 kShuffleReverse =
{ 15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u };
// Shuffle table for reversing the bytes of UV channels.
extern "C" TALIGN16(const uint8, kShuffleReverseUV[16]) =
static const uvec8 kShuffleReverseUV =
{ 14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u };
#endif
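// The shuffle tables above rely on pshufb using each mask byte as a source
// index: kShuffleReverse ({15, 14, ..., 0}) reverses all 16 bytes of a
// register in one instruction, while kShuffleReverseUV gathers the reversed
// U samples (even source bytes) into the low 8 lanes and the reversed V
// samples (odd source bytes) into the high 8 lanes, so ReverseLineUV can
// store them to separate planes with movlpd/movhpd.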
@@ -73,7 +71,7 @@ __asm {
mov edx, [esp + 12 + 12] // dst
mov esi, [esp + 12 + 16] // dst_stride
mov ecx, [esp + 12 + 20] // width
convertloop :
convertloop:
// Read in the data from the source pointer.
// First round of bit swap.
movq xmm0, qword ptr [eax]
@@ -172,7 +170,7 @@ __asm {
and esp, ~15
mov [esp + 16], ecx
mov ecx, [ecx + 16 + 28] // w
convertloop :
convertloop:
// Read in the data from the source pointer.
// First round of bit swap.
movdqa xmm0, [eax]
@@ -863,9 +861,9 @@ __asm {
mov eax, [esp + 4] // src
mov edx, [esp + 8] // dst
mov ecx, [esp + 12] // width
movdqa xmm5, _kShuffleReverse
movdqa xmm5, kShuffleReverse
lea eax, [eax + ecx - 16]
convertloop :
convertloop:
movdqa xmm0, [eax]
lea eax, [eax - 16]
pshufb xmm0, xmm5
@@ -878,12 +876,16 @@ __asm {
}
#elif (defined(__i386__) || defined(__x86_64__)) && \
!defined(__APPLE__) && \
!defined(COVERAGE_ENABLED) && !defined(TARGET_IPHONE_SIMULATOR)
#define HAS_REVERSE_LINE_SSSE3
static void ReverseLine_SSSE3(const uint8* src, uint8* dst, int width) {
intptr_t temp_width = static_cast<intptr_t>(width);
asm volatile (
"movdqa (%3),%%xmm5 \n"
"movdqa %0,%%xmm5 \n"
:: "m"(kShuffleReverse)
);
asm volatile (
"lea -0x10(%0,%2,1),%0 \n"
"1: \n"
"movdqa (%0),%%xmm0 \n"
@@ -896,12 +898,12 @@ static void ReverseLine_SSSE3(const uint8* src, uint8* dst, int width) {
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(temp_width) // %2
: "r"(kShuffleReverse) // %3
:
: "memory", "cc"
#if defined(__SSE2__)
, "xmm0", "xmm5"
#endif
);
);
}
#endif
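// The GCC ReverseLine_SSSE3 above now loads the shuffle constant through an
// "m" memory operand in its own asm statement, letting the compiler form the
// address instead of passing it in via "r"(kShuffleReverse). A minimal sketch
// of the pattern (assuming kShuffleReverse is visible in this translation unit):
//
//   asm volatile("movdqa %0,%%xmm5 \n" :: "m"(kShuffleReverse));
//
// The following asm block then uses xmm5, relying on it not being clobbered
// between the two statements.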
@@ -1066,10 +1068,10 @@ __asm {
mov edx, [esp + 4 + 8] // dst_a
mov edi, [esp + 4 + 12] // dst_b
mov ecx, [esp + 4 + 16] // width
movdqa xmm5, _kShuffleReverseUV
movdqa xmm5, kShuffleReverseUV
lea eax, [eax + ecx * 2 - 16]
convertloop :
convertloop:
movdqa xmm0, [eax]
lea eax, [eax - 16]
pshufb xmm0, xmm5
@@ -1085,6 +1087,7 @@ __asm {
}
#elif (defined(__i386__) || defined(__x86_64__)) && \
!defined(__APPLE__) && \
!defined(COVERAGE_ENABLED) && !defined(TARGET_IPHONE_SIMULATOR)
#define HAS_REVERSE_LINE_UV_SSSE3
void ReverseLineUV_SSSE3(const uint8* src,
@@ -1092,28 +1095,31 @@ void ReverseLineUV_SSSE3(const uint8* src,
int width) {
intptr_t temp_width = static_cast<intptr_t>(width);
asm volatile (
"movdqa (%4),%%xmm5 \n"
"lea -0x10(%0,%3,2),%0 \n"
"movdqa %0,%%xmm5 \n"
:: "m"(kShuffleReverseUV)
);
asm volatile (
"lea -16(%0,%3,2),%0 \n"
"1: \n"
"movdqa (%0),%%xmm0 \n"
"lea -0x10(%0),%0 \n"
"lea -16(%0),%0 \n"
"pshufb %%xmm5,%%xmm0 \n"
"movlpd %%xmm0,(%1) \n"
"lea 0x8(%1),%1 \n"
"lea 8(%1),%1 \n"
"movhpd %%xmm0,(%2) \n"
"lea 0x8(%2),%2 \n"
"sub $0x8,%3 \n"
"lea 8(%2),%2 \n"
"sub $8,%3 \n"
"ja 1b \n"
: "+r"(src), // %0
"+r"(dst_a), // %1
"+r"(dst_b), // %2
"+r"(temp_width) // %3
: "r"(kShuffleReverseUV) // %4
:
: "memory", "cc"
#if defined(__SSE2__)
, "xmm0", "xmm5"
#endif
);
);
}
#endif
@@ -51,15 +51,6 @@
#define HAS_FASTCONVERTYTOARGBROW_SSE2
#endif
// The following are available on Windows and GCC 32 bit
#if (defined(WIN32) || \
defined(__i386__)) && \
!defined(LIBYUV_DISABLE_ASM)
#define HAS_FASTCONVERTYUVTOARGBROW_MMX
#define HAS_FASTCONVERTYUVTOBGRAROW_MMX
#define HAS_FASTCONVERTYUVTOABGRROW_MMX
#endif
// The following are available on Windows
#if defined(WIN32) && \
!defined(LIBYUV_DISABLE_ASM)
@@ -128,12 +119,14 @@ void I400ToARGBRow_C(const uint8* src_y, uint8* dst_argb, int pix);
#if defined(_MSC_VER)
#define SIMD_ALIGNED(var) __declspec(align(16)) var
#define TALIGN16(t, var) static __declspec(align(16)) t _ ## var
typedef __declspec(align(16)) signed char vec8[16];
typedef __declspec(align(16)) unsigned char uvec8[16];
typedef __declspec(align(16)) signed short vec16[8];
#else // __GNUC__
#define SIMD_ALIGNED(var) var __attribute__((aligned(16)))
#define TALIGN16(t, var) t var __attribute__((aligned(16)))
typedef signed char __attribute__((vector_size(16))) vec8;
typedef unsigned char __attribute__((vector_size(16))) uvec8;
typedef signed short __attribute__((vector_size(16))) vec16;
#endif
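// The vec8/uvec8/vec16 typedefs above give 16-byte-aligned vector types under
// both MSVC (__declspec(align(16)) arrays) and GCC (vector_size attributes),
// so SIMD constants can be declared directly instead of through the
// TALIGN16/SIMD_ALIGNED array macros. A hypothetical declaration using them:
//
//   static const uvec8 kExampleShuffle =  // name is illustrative only
//     { 0u, 1u, 2u, 3u, 4u, 5u, 6u, 7u, 8u, 9u, 10u, 11u, 12u, 13u, 14u, 15u };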
extern "C" SIMD_ALIGNED(const int16 kCoefficientsRgbY[768][4]);
@@ -204,36 +197,6 @@ void FastConvertYToARGBRow_SSE2(const uint8* y_buf,
int width);
#endif
#ifdef HAS_FASTCONVERTYUVTOARGBROW_MMX
void FastConvertYUVToARGBRow_MMX(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width);
void FastConvertYUVToBGRARow_MMX(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width);
void FastConvertYUVToABGRRow_MMX(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width);
void FastConvertYUV444ToARGBRow_MMX(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width);
void FastConvertYToARGBRow_MMX(const uint8* y_buf,
uint8* rgb_buf,
int width);
#endif
#ifdef HAS_FASTCONVERTYUVTOARGBROW_SSSE3
void FastConvertYUVToARGBRow_SSSE3(const uint8* y_buf,
const uint8* u_buf,
@@ -268,42 +231,6 @@ void FastConvertYToARGBRow_SSE2(const uint8* y_buf,
#endif
// Method to force C version.
//#define USE_MMX 0
//#define USE_SSE2 0
#if !defined(USE_MMX)
// Windows, Mac and Linux use MMX
#if defined(__i386__) || defined(_MSC_VER)
#define USE_MMX 1
#else
#define USE_MMX 0
#endif
#endif
#if !defined(USE_SSE2)
#if defined(__SSE2__) || defined(ARCH_CPU_X86_64) || _M_IX86_FP==2
#define USE_SSE2 1
#else
#define USE_SSE2 0
#endif
#endif
// x64 uses MMX2 (SSE) so emms is not required.
// Warning C4799: function has no EMMS instruction.
// EMMS() is slow and should be called by the calling function once per image.
#if USE_MMX && !defined(ARCH_CPU_X86_64)
#if defined(_MSC_VER)
#define EMMS() __asm emms
#pragma warning(disable: 4799)
#else
#define EMMS() asm("emms")
#endif
#else
#define EMMS()
#endif
} // extern "C"
#endif // LIBYUV_SOURCE_ROW_H_
@@ -542,231 +542,6 @@ void FastConvertYToARGBRow_SSE2(const uint8* y_buf, // rdi
#endif
);
}
#endif
#ifdef HAS_FASTCONVERTYUVTOARGBROW_MMX
// 32 bit mmx gcc version
#ifdef OSX
#define UNDERSCORE "_"
#else
#define UNDERSCORE ""
#endif
void FastConvertYUVToARGBRow_MMX(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width);
asm(
".text \n"
#if defined(OSX) || defined(IOS)
".globl _FastConvertYUVToARGBRow_MMX \n"
"_FastConvertYUVToARGBRow_MMX: \n"
#else
".global FastConvertYUVToARGBRow_MMX \n"
"FastConvertYUVToARGBRow_MMX: \n"
#endif
"pusha \n"
"mov 0x24(%esp),%edx \n"
"mov 0x28(%esp),%edi \n"
"mov 0x2c(%esp),%esi \n"
"mov 0x30(%esp),%ebp \n"
"mov 0x34(%esp),%ecx \n"
"1: \n"
"movzbl (%edi),%eax \n"
"lea 1(%edi),%edi \n"
"movzbl (%esi),%ebx \n"
"lea 1(%esi),%esi \n"
"movq " UNDERSCORE "kCoefficientsRgbY+2048(,%eax,8),%mm0\n"
"movzbl (%edx),%eax \n"
"paddsw " UNDERSCORE "kCoefficientsRgbY+4096(,%ebx,8),%mm0\n"
"movzbl 0x1(%edx),%ebx \n"
"movq " UNDERSCORE "kCoefficientsRgbY(,%eax,8),%mm1\n"
"lea 2(%edx),%edx \n"
"movq " UNDERSCORE "kCoefficientsRgbY(,%ebx,8),%mm2\n"
"paddsw %mm0,%mm1 \n"
"paddsw %mm0,%mm2 \n"
"psraw $0x6,%mm1 \n"
"psraw $0x6,%mm2 \n"
"packuswb %mm2,%mm1 \n"
"movq %mm1,0x0(%ebp) \n"
"lea 8(%ebp),%ebp \n"
"sub $0x2,%ecx \n"
"ja 1b \n"
"popa \n"
"ret \n"
);
void FastConvertYUVToBGRARow_MMX(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width);
asm(
".text \n"
#if defined(OSX) || defined(IOS)
".globl _FastConvertYUVToBGRARow_MMX \n"
"_FastConvertYUVToBGRARow_MMX: \n"
#else
".global FastConvertYUVToBGRARow_MMX \n"
"FastConvertYUVToBGRARow_MMX: \n"
#endif
"pusha \n"
"mov 0x24(%esp),%edx \n"
"mov 0x28(%esp),%edi \n"
"mov 0x2c(%esp),%esi \n"
"mov 0x30(%esp),%ebp \n"
"mov 0x34(%esp),%ecx \n"
"1: \n"
"movzbl (%edi),%eax \n"
"lea 1(%edi),%edi \n"
"movzbl (%esi),%ebx \n"
"lea 1(%esi),%esi \n"
"movq " UNDERSCORE "kCoefficientsBgraY+2048(,%eax,8),%mm0\n"
"movzbl (%edx),%eax \n"
"paddsw " UNDERSCORE "kCoefficientsBgraY+4096(,%ebx,8),%mm0\n"
"movzbl 0x1(%edx),%ebx \n"
"movq " UNDERSCORE "kCoefficientsBgraY(,%eax,8),%mm1\n"
"lea 2(%edx),%edx \n"
"movq " UNDERSCORE "kCoefficientsBgraY(,%ebx,8),%mm2\n"
"paddsw %mm0,%mm1 \n"
"paddsw %mm0,%mm2 \n"
"psraw $0x6,%mm1 \n"
"psraw $0x6,%mm2 \n"
"packuswb %mm2,%mm1 \n"
"movq %mm1,0x0(%ebp) \n"
"lea 8(%ebp),%ebp \n"
"sub $0x2,%ecx \n"
"ja 1b \n"
"popa \n"
"ret \n"
);
void FastConvertYUVToABGRRow_MMX(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width);
asm(
".text \n"
#if defined(OSX) || defined(IOS)
".globl _FastConvertYUVToABGRRow_MMX \n"
"_FastConvertYUVToABGRRow_MMX: \n"
#else
".global FastConvertYUVToABGRRow_MMX \n"
"FastConvertYUVToABGRRow_MMX: \n"
#endif
"pusha \n"
"mov 0x24(%esp),%edx \n"
"mov 0x28(%esp),%edi \n"
"mov 0x2c(%esp),%esi \n"
"mov 0x30(%esp),%ebp \n"
"mov 0x34(%esp),%ecx \n"
"1: \n"
"movzbl (%edi),%eax \n"
"lea 1(%edi),%edi \n"
"movzbl (%esi),%ebx \n"
"lea 1(%esi),%esi \n"
"movq " UNDERSCORE "kCoefficientsAbgrY+2048(,%eax,8),%mm0\n"
"movzbl (%edx),%eax \n"
"paddsw " UNDERSCORE "kCoefficientsAbgrY+4096(,%ebx,8),%mm0\n"
"movzbl 0x1(%edx),%ebx \n"
"movq " UNDERSCORE "kCoefficientsAbgrY(,%eax,8),%mm1\n"
"lea 2(%edx),%edx \n"
"movq " UNDERSCORE "kCoefficientsAbgrY(,%ebx,8),%mm2\n"
"paddsw %mm0,%mm1 \n"
"paddsw %mm0,%mm2 \n"
"psraw $0x6,%mm1 \n"
"psraw $0x6,%mm2 \n"
"packuswb %mm2,%mm1 \n"
"movq %mm1,0x0(%ebp) \n"
"lea 8(%ebp),%ebp \n"
"sub $0x2,%ecx \n"
"ja 1b \n"
"popa \n"
"ret \n"
);
void FastConvertYUV444ToARGBRow_MMX(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width);
asm(
".text \n"
#if defined(OSX) || defined(IOS)
".globl _FastConvertYUV444ToARGBRow_MMX \n"
"_FastConvertYUV444ToARGBRow_MMX: \n"
#else
".global FastConvertYUV444ToARGBRow_MMX \n"
"FastConvertYUV444ToARGBRow_MMX: \n"
#endif
"pusha \n"
"mov 0x24(%esp),%edx \n"
"mov 0x28(%esp),%edi \n"
"mov 0x2c(%esp),%esi \n"
"mov 0x30(%esp),%ebp \n"
"mov 0x34(%esp),%ecx \n"
"1: \n"
"movzbl (%edi),%eax \n"
"lea 1(%edi),%edi \n"
"movzbl (%esi),%ebx \n"
"lea 1(%esi),%esi \n"
"movq " UNDERSCORE "kCoefficientsRgbY+2048(,%eax,8),%mm0\n"
"movzbl (%edx),%eax \n"
"paddsw " UNDERSCORE "kCoefficientsRgbY+4096(,%ebx,8),%mm0\n"
"lea 1(%edx),%edx \n"
"paddsw " UNDERSCORE "kCoefficientsRgbY(,%eax,8),%mm0\n"
"psraw $0x6,%mm0 \n"
"packuswb %mm0,%mm0 \n"
"movd %mm0,0x0(%ebp) \n"
"lea 4(%ebp),%ebp \n"
"sub $0x1,%ecx \n"
"ja 1b \n"
"popa \n"
"ret \n"
);
void FastConvertYToARGBRow_MMX(const uint8* y_buf,
uint8* rgb_buf,
int width);
asm(
".text \n"
#if defined(OSX) || defined(IOS)
".globl _FastConvertYToARGBRow_MMX \n"
"_FastConvertYToARGBRow_MMX: \n"
#else
".global FastConvertYToARGBRow_MMX \n"
"FastConvertYToARGBRow_MMX: \n"
#endif
"push %ebx \n"
"mov 0x8(%esp),%eax \n"
"mov 0xc(%esp),%edx \n"
"mov 0x10(%esp),%ecx \n"
"1: \n"
"movzbl (%eax),%ebx \n"
"movq " UNDERSCORE "kCoefficientsRgbY(,%ebx,8),%mm0\n"
"psraw $0x6,%mm0 \n"
"movzbl 0x1(%eax),%ebx \n"
"movq " UNDERSCORE "kCoefficientsRgbY(,%ebx,8),%mm1\n"
"psraw $0x6,%mm1 \n"
"packuswb %mm1,%mm0 \n"
"lea 0x2(%eax),%eax \n"
"movq %mm0,(%edx) \n"
"lea 0x8(%edx),%edx \n"
"sub $0x2,%ecx \n"
"ja 1b \n"
"pop %ebx \n"
"ret \n"
);
#endif
#ifdef HAS_ARGBTOYROW_SSSE3
@@ -15,71 +15,71 @@ extern "C" {
#ifdef HAS_ARGBTOYROW_SSSE3
// Constant multiplication table for converting ARGB to I400.
SIMD_ALIGNED(const int8 kARGBToY[16]) = {
static const vec8 kARGBToY = {
13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0
};
SIMD_ALIGNED(const int8 kARGBToU[16]) = {
static const vec8 kARGBToU = {
112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0
};
SIMD_ALIGNED(const int8 kARGBToV[16]) = {
static const vec8 kARGBToV = {
-18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0,
};
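// The three tables above appear to be BT.601 studio-range coefficients laid
// out in B,G,R,A byte order: the Y weights 13/65/33 are roughly
// 0.098/0.504/0.257 scaled by 128, and the U/V weights (e.g. 112 = 0.439*256,
// -74 ~= -0.291*256) are scaled by 256, with the +16 and +128 offsets applied
// later via kAddY16 and kAddUV128 below.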
// Constants for BGRA
SIMD_ALIGNED(const int8 kBGRAToY[16]) = {
static const vec8 kBGRAToY = {
0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13
};
SIMD_ALIGNED(const int8 kBGRAToU[16]) = {
static const vec8 kBGRAToU = {
0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112
};
SIMD_ALIGNED(const int8 kBGRAToV[16]) = {
static const vec8 kBGRAToV = {
0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18
};
// Constants for ABGR
SIMD_ALIGNED(const int8 kABGRToY[16]) = {
static const vec8 kABGRToY = {
33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0
};
SIMD_ALIGNED(const int8 kABGRToU[16]) = {
static const vec8 kABGRToU = {
-38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0
};
SIMD_ALIGNED(const int8 kABGRToV[16]) = {
static const vec8 kABGRToV = {
112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0
};
SIMD_ALIGNED(const uint8 kAddY16[16]) = {
static const uvec8 kAddY16 = {
16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u,
16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u,
};
SIMD_ALIGNED(const uint8 kAddUV128[16]) = {
static const uvec8 kAddUV128 = {
128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u
};
// Shuffle table for converting BG24 to ARGB.
SIMD_ALIGNED(const uint8 kShuffleMaskBG24ToARGB[16]) = {
static const uvec8 kShuffleMaskBG24ToARGB = {
0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u
};
// Shuffle table for converting RAW to ARGB.
SIMD_ALIGNED(const uint8 kShuffleMaskRAWToARGB[16]) = {
static const uvec8 kShuffleMaskRAWToARGB = {
2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u
};
// Shuffle table for converting ABGR to ARGB.
SIMD_ALIGNED(const uint8 kShuffleMaskABGRToARGB[16]) = {
static const uvec8 kShuffleMaskABGRToARGB = {
2u, 1u, 0u, 3u, 6u, 5u, 4u, 7u, 10u, 9u, 8u, 11u, 14u, 13u, 12u, 15u
};
// Shuffle table for converting BGRA to ARGB.
SIMD_ALIGNED(const uint8 kShuffleMaskBGRAToARGB[16]) = {
static const uvec8 kShuffleMaskBGRAToARGB = {
3u, 2u, 1u, 0u, 7u, 6u, 5u, 4u, 11u, 10u, 9u, 8u, 15u, 14u, 13u, 12u
};
@@ -118,7 +118,7 @@ __asm {
mov ecx, [esp + 12] // pix
movdqa xmm5, kShuffleMaskABGRToARGB
convertloop :
convertloop:
movdqa xmm0, [eax]
lea eax, [eax + 16]
pshufb xmm0, xmm5
@@ -138,7 +138,7 @@ __asm {
mov ecx, [esp + 12] // pix
movdqa xmm5, kShuffleMaskBGRAToARGB
convertloop :
convertloop:
movdqa xmm0, [eax]
lea eax, [eax + 16]
pshufb xmm0, xmm5
@@ -160,7 +160,7 @@ __asm {
pslld xmm5, 24
movdqa xmm4, kShuffleMaskBG24ToARGB
convertloop :
convertloop:
movdqa xmm0, [eax]
movdqa xmm1, [eax + 16]
movdqa xmm3, [eax + 32]
@@ -199,7 +199,7 @@ __asm {
pslld xmm5, 24
movdqa xmm4, kShuffleMaskRAWToARGB
convertloop :
convertloop:
movdqa xmm0, [eax]
movdqa xmm1, [eax + 16]
movdqa xmm3, [eax + 32]
@@ -237,7 +237,7 @@ __asm {
movdqa xmm5, kAddY16
movdqa xmm4, kARGBToY
convertloop :
convertloop:
movdqa xmm0, [eax]
movdqa xmm1, [eax + 16]
movdqa xmm2, [eax + 32]
@@ -270,7 +270,7 @@ __asm {
movdqa xmm5, kAddY16
movdqa xmm4, kBGRAToY
convertloop :
convertloop:
movdqa xmm0, [eax]
movdqa xmm1, [eax + 16]
movdqa xmm2, [eax + 32]
@@ -303,7 +303,7 @@ __asm {
movdqa xmm5, kAddY16
movdqa xmm4, kABGRToY
convertloop :
convertloop:
movdqa xmm0, [eax]
movdqa xmm1, [eax + 16]
movdqa xmm2, [eax + 32]
@@ -343,7 +343,7 @@ __asm {
movdqa xmm5, kAddUV128
sub edi, edx // stride from u to v
convertloop :
convertloop:
/* step 1 - subsample 16x2 argb pixels to 8x1 */
movdqa xmm0, [eax]
movdqa xmm1, [eax + 16]
@@ -407,7 +407,7 @@ __asm {
movdqa xmm5, kAddUV128
sub edi, edx // stride from u to v
convertloop :
convertloop:
/* step 1 - subsample 16x2 argb pixels to 8x1 */
movdqa xmm0, [eax]
movdqa xmm1, [eax + 16]
@@ -471,7 +471,7 @@ __asm {
movdqa xmm5, kAddUV128
sub edi, edx // stride from u to v
convertloop :
convertloop:
/* step 1 - subsample 16x2 argb pixels to 8x1 */
movdqa xmm0, [eax]
movdqa xmm1, [eax + 16]
@@ -519,182 +519,6 @@ __asm {
}
}
#define YUVTORGB_MMX(TABLE) __asm { \
__asm convertloop : \
__asm movzx eax, byte ptr [edi] \
__asm lea edi, [edi + 1] \
__asm movzx ebx, byte ptr [esi] \
__asm lea esi, [esi + 1] \
__asm movq mm0, [TABLE + 2048 + 8 * eax] \
__asm movzx eax, byte ptr [edx] \
__asm paddsw mm0, [TABLE + 4096 + 8 * ebx] \
__asm movzx ebx, byte ptr [edx + 1] \
__asm movq mm1, [TABLE + 8 * eax] \
__asm lea edx, [edx + 2] \
__asm movq mm2, [TABLE + 8 * ebx] \
__asm paddsw mm1, mm0 \
__asm paddsw mm2, mm0 \
__asm psraw mm1, 6 \
__asm psraw mm2, 6 \
__asm packuswb mm1, mm2 \
__asm movq [ebp], mm1 \
__asm lea ebp, [ebp + 8] \
__asm sub ecx, 2 \
__asm ja convertloop \
}
__declspec(naked)
void FastConvertYUVToARGBRow_MMX(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width) {
__asm {
push ebx
push esi
push edi
push ebp
mov edx, [esp + 16 + 4]
mov edi, [esp + 16 + 8]
mov esi, [esp + 16 + 12]
mov ebp, [esp + 16 + 16]
mov ecx, [esp + 16 + 20]
YUVTORGB_MMX(kCoefficientsRgbY)
pop ebp
pop edi
pop esi
pop ebx
ret
}
}
__declspec(naked)
void FastConvertYUVToBGRARow_MMX(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width) {
__asm {
push ebx
push esi
push edi
push ebp
mov edx, [esp + 16 + 4]
mov edi, [esp + 16 + 8]
mov esi, [esp + 16 + 12]
mov ebp, [esp + 16 + 16]
mov ecx, [esp + 16 + 20]
YUVTORGB_MMX(kCoefficientsBgraY)
pop ebp
pop edi
pop esi
pop ebx
ret
}
}
__declspec(naked)
void FastConvertYUVToABGRRow_MMX(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width) {
__asm {
push ebx
push esi
push edi
push ebp
mov edx, [esp + 16 + 4]
mov edi, [esp + 16 + 8]
mov esi, [esp + 16 + 12]
mov ebp, [esp + 16 + 16]
mov ecx, [esp + 16 + 20]
YUVTORGB_MMX(kCoefficientsAbgrY)
pop ebp
pop edi
pop esi
pop ebx
ret
}
}
__declspec(naked)
void FastConvertYUV444ToARGBRow_MMX(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width) {
__asm {
push ebx
push esi
push edi
push ebp
mov edx, [esp + 16 + 4]
mov edi, [esp + 16 + 8]
mov esi, [esp + 16 + 12]
mov ebp, [esp + 16 + 16]
mov ecx, [esp + 16 + 20]
convertloop :
movzx eax, byte ptr [edi]
lea edi, [edi + 1]
movzx ebx, byte ptr [esi]
lea esi, [esi + 1]
movq mm0, [kCoefficientsRgbY + 2048 + 8 * eax]
movzx eax, byte ptr [edx]
paddsw mm0, [kCoefficientsRgbY + 4096 + 8 * ebx]
lea edx, [edx + 1]
paddsw mm0, [kCoefficientsRgbY + 8 * eax]
psraw mm0, 6
packuswb mm0, mm0
movd [ebp], mm0
lea ebp, [ebp + 4]
sub ecx, 1
ja convertloop
pop ebp
pop edi
pop esi
pop ebx
ret
}
}
__declspec(naked)
void FastConvertYToARGBRow_MMX(const uint8* y_buf,
uint8* rgb_buf,
int width) {
__asm {
push ebx
mov eax, [esp + 4 + 4] // Y
mov edx, [esp + 4 + 8] // rgb
mov ecx, [esp + 4 + 12] // width
convertloop :
movzx ebx, byte ptr [eax]
movq mm0, [kCoefficientsRgbY + 8 * ebx]
psraw mm0, 6
movzx ebx, byte ptr [eax + 1]
movq mm1, [kCoefficientsRgbY + 8 * ebx]
psraw mm1, 6
packuswb mm0, mm1
lea eax, [eax + 2]
movq [edx], mm0
lea edx, [edx + 8]
sub ecx, 2
ja convertloop
pop ebx
ret
}
}
#ifdef HAS_FASTCONVERTYUVTOARGBROW_SSSE3
#define YG 74 /* static_cast<int8>(1.164 * 64 + 0.5) */
@@ -712,35 +536,35 @@ void FastConvertYToARGBRow_MMX(const uint8* y_buf,
#define BG UG * 128 + VG * 128
#define BR UR * 128 + VR * 128
SIMD_ALIGNED(const int8 kUVToB[16]) = {
static const vec8 kUVToB = {
UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB
};
SIMD_ALIGNED(const int8 kUVToR[16]) = {
static const vec8 kUVToR = {
UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR
};
SIMD_ALIGNED(const int8 kUVToG[16]) = {
static const vec8 kUVToG = {
UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG
};
SIMD_ALIGNED(const int16 kYToRgb[8]) = {
static const vec16 kYToRgb = {
YG, YG, YG, YG, YG, YG, YG, YG
};
SIMD_ALIGNED(const int16 kYSub16[8]) = {
static const vec16 kYSub16 = {
16, 16, 16, 16, 16, 16, 16, 16
};
SIMD_ALIGNED(const int16 kUVBiasB[8]) = {
static const vec16 kUVBiasB = {
BB, BB, BB, BB, BB, BB, BB, BB
};
SIMD_ALIGNED(const int16 kUVBiasG[8]) = {
static const vec16 kUVBiasG = {
BG, BG, BG, BG, BG, BG, BG, BG
};
SIMD_ALIGNED(const int16 kUVBiasR[8]) = {
static const vec16 kUVBiasR = {
BR, BR, BR, BR, BR, BR, BR, BR
};
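// Per the YG comment above, these are 6-bit fixed-point constants:
// 1.164 * 64 = 74.496, plus 0.5 and the int8 cast gives 74. The UB/UG/UR and
// VB/VG/VR terms and the BB/BG/BR biases are presumably built the same way,
// so the row code recovers the result with an arithmetic shift right by 6
// (the psraw 6 visible in the removed MMX rows).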
......@@ -794,7 +618,7 @@ void FastConvertYUVToARGBRow_SSSE3(const uint8* y_buf,
pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
pxor xmm4, xmm4
convertloop :
convertloop:
YUVTORGB_SSSE3
// Step 3: Weave into ARGB
@@ -833,7 +657,7 @@ void FastConvertYUVToBGRARow_SSSE3(const uint8* y_buf,
sub edi, esi
pxor xmm4, xmm4
convertloop :
convertloop:
YUVTORGB_SSSE3
// Step 3: Weave into BGRA
@@ -874,7 +698,7 @@ void FastConvertYUVToABGRRow_SSSE3(const uint8* y_buf,
pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
pxor xmm4, xmm4
convertloop :
convertloop:
YUVTORGB_SSSE3
// Step 3: Weave into ARGB
@@ -914,7 +738,7 @@ void FastConvertYUV444ToARGBRow_SSSE3(const uint8* y_buf,
pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
pxor xmm4, xmm4
convertloop :
convertloop:
// Step 1: Find 4 UV contributions to 4 R,G,B values
movd xmm0, [esi] // U
movd xmm1, [esi + edi] // V
@@ -978,7 +802,7 @@ void FastConvertYToARGBRow_SSE2(const uint8* y_buf,
movdqa xmm3, kYSub16
movdqa xmm2, kYToRgb
convertloop :
convertloop:
// Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164
movq xmm0, qword ptr [eax]
lea eax, [eax + 8]
@@ -14,6 +14,7 @@
#include <string.h>
#include "libyuv/cpu_id.h"
#include "row.h"
#if defined(_MSC_VER)
#define ALIGN16(var) __declspec(align(16)) var
@@ -21,6 +22,7 @@
#define ALIGN16(var) var __attribute__((aligned(16)))
#endif
// Note: A Neon reference manual
// http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.dui0204j/CJAJIIGG.html
// Note: Some SSE2 reference manuals