Commit f69e90a1 authored by fbarchard@google.com

Mirror: reduce register munging and avoid a wait. ScaleAddRows: use 6 registers for Mac.

BUG=none
TEST=none
Review URL: https://webrtc-codereview.appspot.com/426007

git-svn-id: http://libyuv.googlecode.com/svn/trunk@200 16f28f9a-4ce2-e073-06de-1de4eb20be90
parent 2bc55fa3
Name: libyuv
URL: http://code.google.com/p/libyuv/
Version: 199
Version: 200
License: BSD
License File: LICENSE
@@ -11,7 +11,7 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_
#define INCLUDE_LIBYUV_VERSION_H_
#define LIBYUV_VERSION 199
#define LIBYUV_VERSION 200
#endif // INCLUDE_LIBYUV_VERSION_H_
@@ -298,87 +298,87 @@ static void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
static void TransposeWx8_SSSE3(const uint8* src, int src_stride,
uint8* dst, int dst_stride, int width) {
asm volatile (
// Read in the data from the source pointer.
// First round of bit swap.
"1: \n"
"movq (%0),%%xmm0 \n"
"movq (%0,%3),%%xmm1 \n"
"lea (%0,%3,2),%0 \n"
"punpcklbw %%xmm1,%%xmm0 \n"
"movq (%0),%%xmm2 \n"
"movdqa %%xmm0,%%xmm1 \n"
"palignr $0x8,%%xmm1,%%xmm1 \n"
"movq (%0,%3),%%xmm3 \n"
"lea (%0,%3,2),%0 \n"
"punpcklbw %%xmm3,%%xmm2 \n"
"movdqa %%xmm2,%%xmm3 \n"
"movq (%0),%%xmm4 \n"
"palignr $0x8,%%xmm3,%%xmm3 \n"
"movq (%0,%3),%%xmm5 \n"
"lea (%0,%3,2),%0 \n"
"punpcklbw %%xmm5,%%xmm4 \n"
"movdqa %%xmm4,%%xmm5 \n"
"movq (%0),%%xmm6 \n"
"palignr $0x8,%%xmm5,%%xmm5 \n"
"movq (%0,%3),%%xmm7 \n"
"lea (%0,%3,2),%0 \n"
"punpcklbw %%xmm7,%%xmm6 \n"
"neg %3 \n"
"movdqa %%xmm6,%%xmm7 \n"
"lea 0x8(%0,%3,8),%0 \n"
"palignr $0x8,%%xmm7,%%xmm7 \n"
"neg %3 \n"
// Second round of bit swap.
"punpcklwd %%xmm2,%%xmm0 \n"
"punpcklwd %%xmm3,%%xmm1 \n"
"movdqa %%xmm0,%%xmm2 \n"
"movdqa %%xmm1,%%xmm3 \n"
"palignr $0x8,%%xmm2,%%xmm2 \n"
"palignr $0x8,%%xmm3,%%xmm3 \n"
"punpcklwd %%xmm6,%%xmm4 \n"
"punpcklwd %%xmm7,%%xmm5 \n"
"movdqa %%xmm4,%%xmm6 \n"
"movdqa %%xmm5,%%xmm7 \n"
"palignr $0x8,%%xmm6,%%xmm6 \n"
"palignr $0x8,%%xmm7,%%xmm7 \n"
// Third round of bit swap.
// Write to the destination pointer.
"punpckldq %%xmm4,%%xmm0 \n"
"movq %%xmm0,(%1) \n"
"movdqa %%xmm0,%%xmm4 \n"
"palignr $0x8,%%xmm4,%%xmm4 \n"
"movq %%xmm4,(%1,%4) \n"
"lea (%1,%4,2),%1 \n"
"punpckldq %%xmm6,%%xmm2 \n"
"movdqa %%xmm2,%%xmm6 \n"
"movq %%xmm2,(%1) \n"
"palignr $0x8,%%xmm6,%%xmm6 \n"
"punpckldq %%xmm5,%%xmm1 \n"
"movq %%xmm6,(%1,%4) \n"
"lea (%1,%4,2),%1 \n"
"movdqa %%xmm1,%%xmm5 \n"
"movq %%xmm1,(%1) \n"
"palignr $0x8,%%xmm5,%%xmm5 \n"
"movq %%xmm5,(%1,%4) \n"
"lea (%1,%4,2),%1 \n"
"punpckldq %%xmm7,%%xmm3 \n"
"movq %%xmm3,(%1) \n"
"movdqa %%xmm3,%%xmm7 \n"
"palignr $0x8,%%xmm7,%%xmm7 \n"
"sub $0x8,%2 \n"
"movq %%xmm7,(%1,%4) \n"
"lea (%1,%4,2),%1 \n"
"ja 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2
: "r"(static_cast<intptr_t>(src_stride)), // %3
"r"(static_cast<intptr_t>(dst_stride)) // %4
: "memory", "cc"
#if defined(__SSE2__)
, "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
#endif
);
}
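For reference, a minimal C sketch of the transpose this routine performs (the SSSE3 code above does the same work on 8x8-byte tiles via three rounds of byte/word/dword interleaves). uint8_t stands in for libyuv's uint8 typedef and the function name is illustrative only:

#include <stdint.h>

static void TransposeWx8_C_sketch(const uint8_t* src, int src_stride,
                                  uint8_t* dst, int dst_stride, int width) {
  // Each of the `width` source columns of 8 bytes becomes one 8-byte
  // destination row: dst[i][j] = src[j][i].
  for (int i = 0; i < width; ++i) {
    for (int j = 0; j < 8; ++j) {
      dst[i * dst_stride + j] = src[j * src_stride + i];
    }
  }
}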
#if defined (__i386__)
@@ -755,6 +755,7 @@ static void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
#endif
#endif
static void TransposeWx8_C(const uint8* src, int src_stride,
uint8* dst, int dst_stride,
int w) {
@@ -1007,28 +1008,28 @@ void RotateUV270(const uint8* src, int src_stride,
#if defined(_M_IX86) && !defined(YUV_DISABLE_ASM)
#define HAS_MIRRORROW_UV_SSSE3
__declspec(naked)
void MirrorRowUV_SSSE3(const uint8* src,
uint8* dst_a, uint8* dst_b,
int width) {
__asm {
void MirrorRowUV_SSSE3(const uint8* src, uint8* dst_a, uint8* dst_b,
int width) {
__asm {
push edi
mov eax, [esp + 4 + 4] // src
mov edx, [esp + 4 + 8] // dst_a
mov edi, [esp + 4 + 12] // dst_b
mov ecx, [esp + 4 + 16] // width
movdqa xmm5, kShuffleMirrorUV
movdqa xmm1, kShuffleMirrorUV
lea eax, [eax + ecx * 2 - 16]
sub edi, edx
convertloop:
movdqa xmm0, [eax]
lea eax, [eax - 16]
pshufb xmm0, xmm5
pshufb xmm0, xmm1
sub ecx, 8
movlpd qword ptr [edx], xmm0
movhpd qword ptr [edx + edi], xmm0
lea edx, [edx + 8]
movhpd qword ptr [edi], xmm0
lea edi, [edi + 8]
ja convertloop
pop edi
ret
}
@@ -1037,22 +1038,21 @@ __asm {
#elif (defined(__i386__) || defined(__x86_64__)) && \
!defined(YUV_DISABLE_ASM)
#define HAS_MIRRORROW_UV_SSSE3
void MirrorRowUV_SSSE3(const uint8* src,
uint8* dst_a, uint8* dst_b,
int width) {
void MirrorRowUV_SSSE3(const uint8* src, uint8* dst_a, uint8* dst_b,
int width) {
intptr_t temp_width = static_cast<intptr_t>(width);
asm volatile (
"movdqa %4,%%xmm5 \n"
"movdqa %4,%%xmm1 \n"
"lea -16(%0,%3,2),%0 \n"
"sub %1,%2 \n"
"1: \n"
"movdqa (%0),%%xmm0 \n"
"lea -16(%0),%0 \n"
"pshufb %%xmm5,%%xmm0 \n"
"pshufb %%xmm1,%%xmm0 \n"
"sub $8,%3 \n"
"movlpd %%xmm0,(%1) \n"
"movhpd %%xmm0,(%1,%2) \n"
"lea 8(%1),%1 \n"
"movhpd %%xmm0,(%2) \n"
"lea 8(%2),%2 \n"
"ja 1b \n"
: "+r"(src), // %0
"+r"(dst_a), // %1
@@ -1061,7 +1061,7 @@ void MirrorRowUV_SSSE3(const uint8* src,
: "m"(kShuffleMirrorUV) // %4
: "memory", "cc"
#if defined(__SSE2__)
, "xmm0", "xmm5"
, "xmm0", "xmm1"
#endif
);
}
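A hedged intrinsics sketch of the same operation, assuming kShuffleMirrorUV is a pshufb mask that reverses the row's 8 UV pairs and splits the U bytes into the low qword and the V bytes into the high qword; the mask value and names below are illustrative, not copied from the source:

#include <stdint.h>
#include <tmmintrin.h>  // SSSE3

static void MirrorRowUV_SSSE3_sketch(const uint8_t* src, uint8_t* dst_a,
                                     uint8_t* dst_b, int width) {
  // Assumed mask: mirrored U bytes (even offsets) land in lanes 0..7,
  // mirrored V bytes (odd offsets) in lanes 8..15.
  const __m128i kMask = _mm_setr_epi8(14, 12, 10, 8, 6, 4, 2, 0,
                                      15, 13, 11, 9, 7, 5, 3, 1);
  src += (width << 1) - 16;  // start at the last 8 UV pairs of the row
  for (int i = 0; i < width; i += 8) {
    // Unaligned load for simplicity; the asm uses an aligned movdqa.
    __m128i uv = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src));
    uv = _mm_shuffle_epi8(uv, kMask);                                // pshufb
    _mm_storel_epi64(reinterpret_cast<__m128i*>(dst_a + i), uv);     // movlpd
    _mm_storel_epi64(reinterpret_cast<__m128i*>(dst_b + i),
                     _mm_srli_si128(uv, 8));                         // movhpd
    src -= 16;
  }
}

As in the assembly, width is assumed to be a multiple of 8 pixels per iteration.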
@@ -1070,12 +1070,11 @@ void MirrorRowUV_SSSE3(const uint8* src,
static void MirrorRowUV_C(const uint8* src,
uint8* dst_a, uint8* dst_b,
int width) {
int i;
src += width << 1;
for (i = 0; i < width; ++i) {
src -= 2;
src += (width << 1) - 2;
for (int i = 0; i < width; ++i) {
dst_a[i] = src[0];
dst_b[i] = src[1];
src -= 2;
}
}
@@ -1083,7 +1082,6 @@ void RotateUV180(const uint8* src, int src_stride,
uint8* dst_a, int dst_stride_a,
uint8* dst_b, int dst_stride_b,
int width, int height) {
int i;
mirror_uv_func MirrorRow;
#if defined(HAS_MIRRORROW_UV_NEON)
@@ -1105,12 +1103,11 @@ void RotateUV180(const uint8* src, int src_stride,
dst_a += dst_stride_a * (height - 1);
dst_b += dst_stride_b * (height - 1);
for (i = 0; i < height; ++i) {
for (int i = 0; i < height; ++i) {
MirrorRow(src, dst_a, dst_b, width);
src += src_stride; // down one line at a time
dst_a -= dst_stride_a; // nominally up one line at a time
dst_b -= dst_stride_b; // nominally up one line at a time
src += src_stride;
dst_a -= dst_stride_a;
dst_b -= dst_stride_b;
}
}
@@ -1701,15 +1701,15 @@ static void ScaleAddRows_SSE2(const uint8* src_ptr, int src_stride,
intptr_t tmp_src = 0;
asm volatile (
"pxor %%xmm4,%%xmm4 \n"
"sub $0x1,%3 \n"
"sub $0x1,%5 \n"
"1: \n"
"movdqa (%0),%%xmm0 \n"
"mov %0,%5 \n"
"mov %0,%3 \n"
"add %6,%0 \n"
"movdqa %%xmm0,%%xmm1 \n"
"punpcklbw %%xmm4,%%xmm0 \n"
"punpckhbw %%xmm4,%%xmm1 \n"
"mov %3,%4 \n"
"mov %5,%2 \n"
"2: \n"
"movdqa (%0),%%xmm2 \n"
"add %6,%0 \n"
@@ -1718,21 +1718,21 @@ static void ScaleAddRows_SSE2(const uint8* src_ptr, int src_stride,
"punpckhbw %%xmm4,%%xmm3 \n"
"paddusw %%xmm2,%%xmm0 \n"
"paddusw %%xmm3,%%xmm1 \n"
"sub $0x1,%4 \n"
"sub $0x1,%2 \n"
"ja 2b \n"
"movdqa %%xmm0,(%1) \n"
"movdqa %%xmm1,0x10(%1) \n"
"lea 0x10(%5),%0 \n"
"lea 0x10(%3),%0 \n"
"lea 0x20(%1),%1 \n"
"sub $0x10,%2 \n"
"sub $0x10,%4 \n"
"ja 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+rm"(src_width), // %2
"+rm"(src_height), // %3
"+r"(tmp_height), // %4
"+r"(tmp_src) // %5
: "rm"(static_cast<intptr_t>(src_stride)) // %6
"+r"(tmp_height), // %2
"+r"(tmp_src), // %3
"+rm"(src_width), // %4
"+rm"(src_height) // %5
: "r"(static_cast<intptr_t>(src_stride)) // %6
: "memory", "cc"
#if defined(__SSE2__)
, "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
@@ -1740,6 +1740,7 @@ static void ScaleAddRows_SSE2(const uint8* src_ptr, int src_stride,
);
}
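A minimal C sketch of what this routine computes, assuming it accumulates src_height rows of bytes into 16-bit per-column sums (the SSE2 version uses saturating adds via paddusw; uint8_t/uint16_t stand in for libyuv's uint8/uint16 and the name is illustrative):

#include <stdint.h>

static void ScaleAddRows_C_sketch(const uint8_t* src_ptr, int src_stride,
                                  uint16_t* dst_ptr, int src_width,
                                  int src_height) {
  for (int x = 0; x < src_width; ++x) {
    const uint8_t* s = src_ptr + x;
    uint16_t sum = 0;
    for (int y = 0; y < src_height; ++y) {
      sum += s[0];      // the asm saturates here with paddusw
      s += src_stride;
    }
    dst_ptr[x] = sum;
  }
}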
#if defined(__i386__)
extern "C" void ScaleRowDown8Int_SSE2(const uint8* src_ptr, int src_stride,
uint8* dst_ptr, int dst_width);
@@ -2886,7 +2887,6 @@ static void ScaleFilterCols34_C(uint8* dst_ptr, const uint8* src_ptr,
// (1-f)a + fb can be replaced with a + f(b-a)
#define BLENDER(a, b, f) ((int)(a) + ((f) * ((int)(b) - (int)(a)) >> 16))
// TODO(fbarchard): consider +0x8000 for rounding if it can be done for free.
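The macro relies on the algebraic identity (1-f)*a + f*b == a + f*(b-a), with f carried as a 16.16 fixed-point fraction so the blend costs a single multiply. A small sketch with a worked value (name is illustrative):

#include <stdint.h>

static inline int BlendSketch(int a, int b, int f /* 16.16 fraction */) {
  return a + ((f * (b - a)) >> 16);  // same form as BLENDER above
}
// Example: a = 10, b = 20, f = 0x8000 (0.5) -> 10 + ((0x8000 * 10) >> 16) = 15.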
static void ScaleFilterCols_C(uint8* dst_ptr, const uint8* src_ptr,
int dst_width, int x, int dx) {
for (int j = 0; j < dst_width - 1; j += 2) {