Commit 8b54a8f9 authored by fbarchard@google.com

Adjust the specialized 1/2-size scale-down sampling to match the general-purpose code, which uses the odd pixel (rounded up, nearest neighbor). A short C++ sketch of the new sampling rule follows the commit header below.
BUG=223
TEST=out\Debug\convert.exe -f 0 faces_640x480_P420.yuv face2_320x240_P420.yuv
R=johannkoenig@google.com

Review URL: https://webrtc-codereview.appspot.com/1583005

git-svn-id: http://libyuv.googlecode.com/svn/trunk@708 16f28f9a-4ce2-e073-06de-1de4eb20be90
parent 83408b85
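For reference, a minimal C++ sketch (not part of this patch; the function name is illustrative) of the sampling rule the patch adopts: each destination pixel x takes source pixel 2 * x + 1, the odd pixel, which is the round-up nearest neighbor of the source position 2 * x + 0.5. This mirrors the updated ScaleRowDown2_C behavior further down in the diff.

// Sketch only: half-width nearest-neighbor downscale that keeps the odd
// source pixel, mirroring the updated ScaleRowDown2_C behavior below.
#include <stdint.h>

static void HalfWidthNearestOdd(const uint8_t* src, uint8_t* dst,
                                int dst_width) {
  for (int x = 0; x < dst_width; ++x) {
    dst[x] = src[2 * x + 1];  // odd source pixel = rounded-up nearest neighbor
  }
}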
Name: libyuv
URL: http://code.google.com/p/libyuv/
Version: 707
Version: 708
License: BSD
License File: LICENSE
@@ -11,6 +11,6 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT
#define INCLUDE_LIBYUV_VERSION_H_
#define LIBYUV_VERSION 707
#define LIBYUV_VERSION 708
#endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT
@@ -196,16 +196,14 @@ static void ScaleRowDown2_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
// src_stride ignored
mov edx, [esp + 12] // dst_ptr
mov ecx, [esp + 16] // dst_width
pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
psrlw xmm5, 8
align 16
wloop:
movdqa xmm0, [eax]
movdqa xmm1, [eax + 16]
lea eax, [eax + 32]
pand xmm0, xmm5
pand xmm1, xmm5
psrlw xmm0, 8 // isolate odd pixels.
psrlw xmm1, 8
packuswb xmm0, xmm1
sub ecx, 16
movdqa [edx], xmm0
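This hunk (and the matching GCC inline-asm hunks further down) drops the 0x00ff00ff mask setup (pcmpeqb + psrlw into xmm5, then pand), which kept the even byte of each 16-bit pair, and instead shifts each word right by 8 so the odd byte survives before packuswb repacks to bytes; xmm5 also disappears from the clobber lists. A rough SSE2 intrinsics sketch of the new path (illustrative helper name; dst_width assumed to be a multiple of 16):

#include <emmintrin.h>  // SSE2
#include <stdint.h>

// Sketch only: keep the odd byte of every 16-bit pair, then pack back to bytes.
static void HalfWidthOddBytes_SSE2(const uint8_t* src, uint8_t* dst,
                                   int dst_width) {
  for (int x = 0; x < dst_width; x += 16) {
    __m128i a = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src));
    __m128i b = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src) + 1);
    a = _mm_srli_epi16(a, 8);  // psrlw: odd byte -> low byte (old code used pand)
    b = _mm_srli_epi16(b, 8);
    _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), _mm_packus_epi16(a, b));
    src += 32;
    dst += 16;
  }
}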
@@ -271,16 +269,14 @@ static void ScaleRowDown2_Unaligned_SSE2(const uint8* src_ptr,
// src_stride ignored
mov edx, [esp + 12] // dst_ptr
mov ecx, [esp + 16] // dst_width
pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
psrlw xmm5, 8
align 16
wloop:
movdqu xmm0, [eax]
movdqu xmm1, [eax + 16]
lea eax, [eax + 32]
pand xmm0, xmm5
pand xmm1, xmm5
psrlw xmm0, 8 // isolate odd pixels.
psrlw xmm1, 8
packuswb xmm0, xmm1
sub ecx, 16
movdqu [edx], xmm0
@@ -1269,15 +1265,13 @@ static void ScaleFilterRows_Unaligned_SSSE3(uint8* dst_ptr,
static void ScaleRowDown2_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
asm volatile (
"pcmpeqb %%xmm5,%%xmm5 \n"
"psrlw $0x8,%%xmm5 \n"
".p2align 4 \n"
"1: \n"
"movdqa (%0),%%xmm0 \n"
"movdqa 0x10(%0),%%xmm1 \n"
"lea 0x20(%0),%0 \n"
"pand %%xmm5,%%xmm0 \n"
"pand %%xmm5,%%xmm1 \n"
"psrlw $0x8,%%xmm0 \n"
"psrlw $0x8,%%xmm1 \n"
"packuswb %%xmm1,%%xmm0 \n"
"movdqa %%xmm0,(%1) \n"
"lea 0x10(%1),%1 \n"
@@ -1289,7 +1283,7 @@ static void ScaleRowDown2_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
:
: "memory", "cc"
#if defined(__SSE2__)
, "xmm0", "xmm1", "xmm5"
, "xmm0", "xmm1"
#endif
);
}
@@ -1336,15 +1330,13 @@ static void ScaleRowDown2_Unaligned_SSE2(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
asm volatile (
"pcmpeqb %%xmm5,%%xmm5 \n"
"psrlw $0x8,%%xmm5 \n"
".p2align 4 \n"
"1: \n"
"movdqu (%0),%%xmm0 \n"
"movdqu 0x10(%0),%%xmm1 \n"
"lea 0x20(%0),%0 \n"
"pand %%xmm5,%%xmm0 \n"
"pand %%xmm5,%%xmm1 \n"
"psrlw $0x8,%%xmm0 \n"
"psrlw $0x8,%%xmm1 \n"
"packuswb %%xmm1,%%xmm0 \n"
"movdqu %%xmm0,(%1) \n"
"lea 0x10(%1),%1 \n"
@@ -1356,7 +1348,7 @@ static void ScaleRowDown2_Unaligned_SSE2(const uint8* src_ptr,
:
: "memory", "cc"
#if defined(__SSE2__)
, "xmm0", "xmm1", "xmm5"
, "xmm0", "xmm1"
#endif
);
}
@@ -2324,13 +2316,13 @@ static void ScaleRowDown2_C(const uint8* src_ptr, ptrdiff_t /* src_stride */,
uint8* dst, int dst_width) {
uint8* dend = dst + dst_width - 1;
do {
dst[0] = src_ptr[0];
dst[1] = src_ptr[2];
dst[0] = src_ptr[1];
dst[1] = src_ptr[3];
dst += 2;
src_ptr += 4;
} while (dst < dend);
if (dst_width & 1) {
dst[0] = src_ptr[0];
dst[0] = src_ptr[1];
}
}
@@ -2689,6 +2681,7 @@ static void ScalePlaneDown2(int /* src_width */, int /* src_height */,
}
#endif
src_ptr += src_stride; // Point to odd rows.
// TODO(fbarchard): Loop through source height to allow odd height.
for (int y = 0; y < dst_height; ++y) {
ScaleRowDown2(src_ptr, src_stride, dst_ptr, dst_width);
@@ -62,7 +62,7 @@ static void ScaleARGBRowDown2_SSE2(const uint8* src_argb,
movdqa xmm0, [eax]
movdqa xmm1, [eax + 16]
lea eax, [eax + 32]
shufps xmm0, xmm1, 0x88
shufps xmm0, xmm1, 0xdd
sub ecx, 4
movdqa [edx], xmm0
lea edx, [edx + 16]
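The shufps immediate selects two 32-bit lanes from each source: 0x88 (binary 10 00 10 00) picks lanes 0 and 2, the even ARGB pixels, while 0xdd (binary 11 01 11 01) picks lanes 1 and 3, the odd pixels. A hedged intrinsics sketch of the same selection (illustrative helper name; dst_width assumed to be a multiple of 4):

#include <emmintrin.h>  // SSE2 (_mm_shuffle_ps comes from SSE)
#include <stdint.h>

// Sketch only: pick the odd ARGB pixel from each pair of 32-bit pixels.
static void HalfWidthOddARGB_SSE2(const uint8_t* src_argb, uint8_t* dst_argb,
                                  int dst_width) {
  for (int x = 0; x < dst_width; x += 4) {
    __m128i a = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src_argb));
    __m128i b = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src_argb) + 1);
    __m128 odd = _mm_shuffle_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b), 0xdd);
    _mm_storeu_si128(reinterpret_cast<__m128i*>(dst_argb), _mm_castps_si128(odd));
    src_argb += 32;  // 8 source ARGB pixels consumed
    dst_argb += 16;  // 4 destination ARGB pixels produced
  }
}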
@@ -350,7 +350,7 @@ static void ScaleARGBRowDown2_SSE2(const uint8* src_argb,
"movdqa (%0),%%xmm0 \n"
"movdqa 0x10(%0),%%xmm1 \n"
"lea 0x20(%0),%0 \n"
"shufps $0x88,%%xmm1,%%xmm0 \n"
"shufps $0xdd,%%xmm1,%%xmm0 \n"
"sub $0x4,%2 \n"
"movdqa %%xmm0,(%1) \n"
"lea 0x10(%1),%1 \n"
@@ -634,13 +634,13 @@ static void ScaleARGBRowDown2_C(const uint8* src_argb,
uint32* dst = reinterpret_cast<uint32*>(dst_argb);
for (int x = 0; x < dst_width - 1; x += 2) {
dst[0] = src[0];
dst[1] = src[2];
dst[0] = src[1];
dst[1] = src[3];
src += 4;
dst += 2;
}
if (dst_width & 1) {
dst[0] = src[0];
dst[0] = src[1];
}
}
@@ -743,25 +743,26 @@ static void ScaleARGBDown2(int /* src_width */, int /* src_height */,
FilterMode filtering) {
assert(dx == 65536 * 2); // Test scale factor of 2.
assert((dy & 0x1ffff) == 0); // Test vertical scale is multiple of 2.
// Advance to odd row / even column.
src_argb += (y >> 16) * src_stride + ((x >> 16) - 1) * 4;
int row_stride = src_stride * (dy >> 16);
void (*ScaleARGBRowDown2)(const uint8* src_argb, ptrdiff_t src_stride,
uint8* dst_argb, int dst_width) =
filtering ? ScaleARGBRowDown2Int_C : ScaleARGBRowDown2_C;
#if defined(HAS_SCALEARGBROWDOWN2_SSE2)
if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 4) &&
IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride, 16) &&
IS_ALIGNED(src_argb, 16) && IS_ALIGNED(row_stride, 16) &&
IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride, 16)) {
ScaleARGBRowDown2 = filtering ? ScaleARGBRowDown2Int_SSE2 :
ScaleARGBRowDown2_SSE2;
}
#elif defined(HAS_SCALEARGBROWDOWN2_NEON)
if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(dst_width, 8) &&
IS_ALIGNED(src_argb, 4) && IS_ALIGNED(src_stride, 4)) {
IS_ALIGNED(src_argb, 4) && IS_ALIGNED(row_stride, 4)) {
ScaleARGBRowDown2 = filtering ? ScaleARGBRowDown2Int_NEON :
ScaleARGBRowDown2_NEON;
}
#endif
src_argb += (y >> 16) * src_stride + (x >> 16) * 4;
int row_stride = src_stride * (dy >> 16);
// TODO(fbarchard): Loop through source height to allow odd height.
for (int y = 0; y < dst_height; ++y) {
@@ -782,6 +783,9 @@ static void ScaleARGBDownEven(int src_width, int src_height,
FilterMode filtering) {
assert(IS_ALIGNED(src_width, 2));
assert(IS_ALIGNED(src_height, 2));
int col_step = dx >> 16;
int row_stride = (dy >> 16) * src_stride;
src_argb += (y >> 16) * src_stride + (x >> 16) * 4;
void (*ScaleARGBRowDownEven)(const uint8* src_argb, ptrdiff_t src_stride,
int src_step, uint8* dst_argb, int dst_width) =
filtering ? ScaleARGBRowDownEvenInt_C : ScaleARGBRowDownEven_C;
@@ -798,9 +802,6 @@ static void ScaleARGBDownEven(int src_width, int src_height,
ScaleARGBRowDownEven_NEON;
}
#endif
int col_step = dx >> 16;
int row_stride = (dy >> 16) * src_stride;
src_argb += (y >> 16) * src_stride + (x >> 16) * 4;
for (int y = 0; y < dst_height; ++y) {
ScaleARGBRowDownEven(src_argb, src_stride, col_step, dst_argb, dst_width);
@@ -27,8 +27,8 @@ void ScaleARGBRowDown2_NEON(const uint8* src_ptr, ptrdiff_t /* src_stride */,
"vld2.u32 {q0, q1}, [%0]! \n"
"vld2.u32 {q2, q3}, [%0]! \n"
"subs %2, %2, #8 \n" // 8 processed per loop
"vst1.u8 {q0}, [%1]! \n" // store even pixels
"vst1.u8 {q2}, [%1]! \n"
"vst1.u8 {q1}, [%1]! \n" // store odd pixels
"vst1.u8 {q3}, [%1]! \n"
"bgt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst), // %1
@@ -78,6 +78,7 @@ void ScaleARGBRowDownEven_NEON(const uint8* src_argb, ptrdiff_t,
int src_stepx,
uint8* dst_argb, int dst_width) {
asm volatile (
"add %0, #4 \n" // point to odd pixels.
"mov r12, %3, lsl #2 \n"
".p2align 2 \n"
"1: \n"
@@ -39,6 +39,7 @@ void ScaleRowDown2_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t /* src_stride */,
"lw $t5, 20(%[src_ptr]) \n" // |23|22|21|20|
"lw $t6, 24(%[src_ptr]) \n" // |27|26|25|24|
"lw $t7, 28(%[src_ptr]) \n" // |31|30|29|28|
// TODO(fbarchard): Use odd pixels instead of even.
"precr.qb.ph $t8, $t1, $t0 \n" // |6|4|2|0|
"precr.qb.ph $t0, $t3, $t2 \n" // |14|12|10|8|
"precr.qb.ph $t1, $t5, $t4 \n" // |22|20|18|16|
@@ -29,7 +29,7 @@ void ScaleRowDown2_NEON(const uint8* src_ptr, ptrdiff_t /* src_stride */,
// load even pixels into q0, odd into q1
"vld2.u8 {q0,q1}, [%0]! \n"
"subs %2, %2, #16 \n" // 16 processed per loop
"vst1.u8 {q0}, [%1]! \n" // store even pixels
"vst1.u8 {q1}, [%1]! \n" // store odd pixels
"bgt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst), // %1