Commit 8b54a8f9 authored by fbarchard@google.com

Specialized scale down sample to 1 / 2 size adjust to match general purpose code…

Adjust the specialized scale-down samplers for 1/2 size to match the general-purpose code, which uses the odd pixel (rounded up - nearest neighbor).
BUG=223
TEST=out\Debug\convert.exe -f 0 faces_640x480_P420.yuv face2_320x240_P420.yuv
R=johannkoenig@google.com

Review URL: https://webrtc-codereview.appspot.com/1583005

git-svn-id: http://libyuv.googlecode.com/svn/trunk@708 16f28f9a-4ce2-e073-06de-1de4eb20be90
parent 83408b85
Name: libyuv Name: libyuv
URL: http://code.google.com/p/libyuv/ URL: http://code.google.com/p/libyuv/
Version: 707 Version: 708
License: BSD License: BSD
License File: LICENSE License File: LICENSE
......
...@@ -11,6 +11,6 @@ ...@@ -11,6 +11,6 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT #ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT
#define INCLUDE_LIBYUV_VERSION_H_ #define INCLUDE_LIBYUV_VERSION_H_
#define LIBYUV_VERSION 707 #define LIBYUV_VERSION 708
#endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT #endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT
...@@ -196,16 +196,14 @@ static void ScaleRowDown2_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, ...@@ -196,16 +196,14 @@ static void ScaleRowDown2_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
// src_stride ignored // src_stride ignored
mov edx, [esp + 12] // dst_ptr mov edx, [esp + 12] // dst_ptr
mov ecx, [esp + 16] // dst_width mov ecx, [esp + 16] // dst_width
pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
psrlw xmm5, 8
align 16 align 16
wloop: wloop:
movdqa xmm0, [eax] movdqa xmm0, [eax]
movdqa xmm1, [eax + 16] movdqa xmm1, [eax + 16]
lea eax, [eax + 32] lea eax, [eax + 32]
pand xmm0, xmm5 psrlw xmm0, 8 // isolate odd pixels.
pand xmm1, xmm5 psrlw xmm1, 8
packuswb xmm0, xmm1 packuswb xmm0, xmm1
sub ecx, 16 sub ecx, 16
movdqa [edx], xmm0 movdqa [edx], xmm0
...@@ -271,16 +269,14 @@ static void ScaleRowDown2_Unaligned_SSE2(const uint8* src_ptr, ...@@ -271,16 +269,14 @@ static void ScaleRowDown2_Unaligned_SSE2(const uint8* src_ptr,
// src_stride ignored // src_stride ignored
mov edx, [esp + 12] // dst_ptr mov edx, [esp + 12] // dst_ptr
mov ecx, [esp + 16] // dst_width mov ecx, [esp + 16] // dst_width
pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
psrlw xmm5, 8
align 16 align 16
wloop: wloop:
movdqu xmm0, [eax] movdqu xmm0, [eax]
movdqu xmm1, [eax + 16] movdqu xmm1, [eax + 16]
lea eax, [eax + 32] lea eax, [eax + 32]
pand xmm0, xmm5 psrlw xmm0, 8 // isolate odd pixels.
pand xmm1, xmm5 psrlw xmm1, 8
packuswb xmm0, xmm1 packuswb xmm0, xmm1
sub ecx, 16 sub ecx, 16
movdqu [edx], xmm0 movdqu [edx], xmm0
...@@ -1269,15 +1265,13 @@ static void ScaleFilterRows_Unaligned_SSSE3(uint8* dst_ptr, ...@@ -1269,15 +1265,13 @@ static void ScaleFilterRows_Unaligned_SSSE3(uint8* dst_ptr,
static void ScaleRowDown2_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, static void ScaleRowDown2_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) { uint8* dst_ptr, int dst_width) {
asm volatile ( asm volatile (
"pcmpeqb %%xmm5,%%xmm5 \n"
"psrlw $0x8,%%xmm5 \n"
".p2align 4 \n" ".p2align 4 \n"
"1: \n" "1: \n"
"movdqa (%0),%%xmm0 \n" "movdqa (%0),%%xmm0 \n"
"movdqa 0x10(%0),%%xmm1 \n" "movdqa 0x10(%0),%%xmm1 \n"
"lea 0x20(%0),%0 \n" "lea 0x20(%0),%0 \n"
"pand %%xmm5,%%xmm0 \n" "psrlw $0x8,%%xmm0 \n"
"pand %%xmm5,%%xmm1 \n" "psrlw $0x8,%%xmm1 \n"
"packuswb %%xmm1,%%xmm0 \n" "packuswb %%xmm1,%%xmm0 \n"
"movdqa %%xmm0,(%1) \n" "movdqa %%xmm0,(%1) \n"
"lea 0x10(%1),%1 \n" "lea 0x10(%1),%1 \n"
...@@ -1289,7 +1283,7 @@ static void ScaleRowDown2_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, ...@@ -1289,7 +1283,7 @@ static void ScaleRowDown2_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
: :
: "memory", "cc" : "memory", "cc"
#if defined(__SSE2__) #if defined(__SSE2__)
, "xmm0", "xmm1", "xmm5" , "xmm0", "xmm1"
#endif #endif
); );
} }
...@@ -1336,15 +1330,13 @@ static void ScaleRowDown2_Unaligned_SSE2(const uint8* src_ptr, ...@@ -1336,15 +1330,13 @@ static void ScaleRowDown2_Unaligned_SSE2(const uint8* src_ptr,
ptrdiff_t src_stride, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) { uint8* dst_ptr, int dst_width) {
asm volatile ( asm volatile (
"pcmpeqb %%xmm5,%%xmm5 \n"
"psrlw $0x8,%%xmm5 \n"
".p2align 4 \n" ".p2align 4 \n"
"1: \n" "1: \n"
"movdqu (%0),%%xmm0 \n" "movdqu (%0),%%xmm0 \n"
"movdqu 0x10(%0),%%xmm1 \n" "movdqu 0x10(%0),%%xmm1 \n"
"lea 0x20(%0),%0 \n" "lea 0x20(%0),%0 \n"
"pand %%xmm5,%%xmm0 \n" "psrlw $0x8,%%xmm0 \n"
"pand %%xmm5,%%xmm1 \n" "psrlw $0x8,%%xmm1 \n"
"packuswb %%xmm1,%%xmm0 \n" "packuswb %%xmm1,%%xmm0 \n"
"movdqu %%xmm0,(%1) \n" "movdqu %%xmm0,(%1) \n"
"lea 0x10(%1),%1 \n" "lea 0x10(%1),%1 \n"
...@@ -1356,7 +1348,7 @@ static void ScaleRowDown2_Unaligned_SSE2(const uint8* src_ptr, ...@@ -1356,7 +1348,7 @@ static void ScaleRowDown2_Unaligned_SSE2(const uint8* src_ptr,
: :
: "memory", "cc" : "memory", "cc"
#if defined(__SSE2__) #if defined(__SSE2__)
, "xmm0", "xmm1", "xmm5" , "xmm0", "xmm1"
#endif #endif
); );
} }
...@@ -2324,13 +2316,13 @@ static void ScaleRowDown2_C(const uint8* src_ptr, ptrdiff_t /* src_stride */, ...@@ -2324,13 +2316,13 @@ static void ScaleRowDown2_C(const uint8* src_ptr, ptrdiff_t /* src_stride */,
uint8* dst, int dst_width) { uint8* dst, int dst_width) {
uint8* dend = dst + dst_width - 1; uint8* dend = dst + dst_width - 1;
do { do {
dst[0] = src_ptr[0]; dst[0] = src_ptr[1];
dst[1] = src_ptr[2]; dst[1] = src_ptr[3];
dst += 2; dst += 2;
src_ptr += 4; src_ptr += 4;
} while (dst < dend); } while (dst < dend);
if (dst_width & 1) { if (dst_width & 1) {
dst[0] = src_ptr[0]; dst[0] = src_ptr[1];
} }
} }
...@@ -2689,6 +2681,7 @@ static void ScalePlaneDown2(int /* src_width */, int /* src_height */, ...@@ -2689,6 +2681,7 @@ static void ScalePlaneDown2(int /* src_width */, int /* src_height */,
} }
#endif #endif
src_ptr += src_stride; // Point to odd rows.
// TODO(fbarchard): Loop through source height to allow odd height. // TODO(fbarchard): Loop through source height to allow odd height.
for (int y = 0; y < dst_height; ++y) { for (int y = 0; y < dst_height; ++y) {
ScaleRowDown2(src_ptr, src_stride, dst_ptr, dst_width); ScaleRowDown2(src_ptr, src_stride, dst_ptr, dst_width);
......
...@@ -62,7 +62,7 @@ static void ScaleARGBRowDown2_SSE2(const uint8* src_argb, ...@@ -62,7 +62,7 @@ static void ScaleARGBRowDown2_SSE2(const uint8* src_argb,
movdqa xmm0, [eax] movdqa xmm0, [eax]
movdqa xmm1, [eax + 16] movdqa xmm1, [eax + 16]
lea eax, [eax + 32] lea eax, [eax + 32]
shufps xmm0, xmm1, 0x88 shufps xmm0, xmm1, 0xdd
sub ecx, 4 sub ecx, 4
movdqa [edx], xmm0 movdqa [edx], xmm0
lea edx, [edx + 16] lea edx, [edx + 16]
...@@ -350,7 +350,7 @@ static void ScaleARGBRowDown2_SSE2(const uint8* src_argb, ...@@ -350,7 +350,7 @@ static void ScaleARGBRowDown2_SSE2(const uint8* src_argb,
"movdqa (%0),%%xmm0 \n" "movdqa (%0),%%xmm0 \n"
"movdqa 0x10(%0),%%xmm1 \n" "movdqa 0x10(%0),%%xmm1 \n"
"lea 0x20(%0),%0 \n" "lea 0x20(%0),%0 \n"
"shufps $0x88,%%xmm1,%%xmm0 \n" "shufps $0xdd,%%xmm1,%%xmm0 \n"
"sub $0x4,%2 \n" "sub $0x4,%2 \n"
"movdqa %%xmm0,(%1) \n" "movdqa %%xmm0,(%1) \n"
"lea 0x10(%1),%1 \n" "lea 0x10(%1),%1 \n"
...@@ -634,13 +634,13 @@ static void ScaleARGBRowDown2_C(const uint8* src_argb, ...@@ -634,13 +634,13 @@ static void ScaleARGBRowDown2_C(const uint8* src_argb,
uint32* dst = reinterpret_cast<uint32*>(dst_argb); uint32* dst = reinterpret_cast<uint32*>(dst_argb);
for (int x = 0; x < dst_width - 1; x += 2) { for (int x = 0; x < dst_width - 1; x += 2) {
dst[0] = src[0]; dst[0] = src[1];
dst[1] = src[2]; dst[1] = src[3];
src += 4; src += 4;
dst += 2; dst += 2;
} }
if (dst_width & 1) { if (dst_width & 1) {
dst[0] = src[0]; dst[0] = src[1];
} }
} }
...@@ -743,25 +743,26 @@ static void ScaleARGBDown2(int /* src_width */, int /* src_height */, ...@@ -743,25 +743,26 @@ static void ScaleARGBDown2(int /* src_width */, int /* src_height */,
FilterMode filtering) { FilterMode filtering) {
assert(dx == 65536 * 2); // Test scale factor of 2. assert(dx == 65536 * 2); // Test scale factor of 2.
assert((dy & 0x1ffff) == 0); // Test vertical scale is multiple of 2. assert((dy & 0x1ffff) == 0); // Test vertical scale is multiple of 2.
// Advance to odd row / even column.
src_argb += (y >> 16) * src_stride + ((x >> 16) - 1) * 4;
int row_stride = src_stride * (dy >> 16);
void (*ScaleARGBRowDown2)(const uint8* src_argb, ptrdiff_t src_stride, void (*ScaleARGBRowDown2)(const uint8* src_argb, ptrdiff_t src_stride,
uint8* dst_argb, int dst_width) = uint8* dst_argb, int dst_width) =
filtering ? ScaleARGBRowDown2Int_C : ScaleARGBRowDown2_C; filtering ? ScaleARGBRowDown2Int_C : ScaleARGBRowDown2_C;
#if defined(HAS_SCALEARGBROWDOWN2_SSE2) #if defined(HAS_SCALEARGBROWDOWN2_SSE2)
if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 4) && if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 4) &&
IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride, 16) && IS_ALIGNED(src_argb, 16) && IS_ALIGNED(row_stride, 16) &&
IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride, 16)) { IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride, 16)) {
ScaleARGBRowDown2 = filtering ? ScaleARGBRowDown2Int_SSE2 : ScaleARGBRowDown2 = filtering ? ScaleARGBRowDown2Int_SSE2 :
ScaleARGBRowDown2_SSE2; ScaleARGBRowDown2_SSE2;
} }
#elif defined(HAS_SCALEARGBROWDOWN2_NEON) #elif defined(HAS_SCALEARGBROWDOWN2_NEON)
if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(dst_width, 8) && if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(dst_width, 8) &&
IS_ALIGNED(src_argb, 4) && IS_ALIGNED(src_stride, 4)) { IS_ALIGNED(src_argb, 4) && IS_ALIGNED(row_stride, 4)) {
ScaleARGBRowDown2 = filtering ? ScaleARGBRowDown2Int_NEON : ScaleARGBRowDown2 = filtering ? ScaleARGBRowDown2Int_NEON :
ScaleARGBRowDown2_NEON; ScaleARGBRowDown2_NEON;
} }
#endif #endif
src_argb += (y >> 16) * src_stride + (x >> 16) * 4;
int row_stride = src_stride * (dy >> 16);
// TODO(fbarchard): Loop through source height to allow odd height. // TODO(fbarchard): Loop through source height to allow odd height.
for (int y = 0; y < dst_height; ++y) { for (int y = 0; y < dst_height; ++y) {
...@@ -782,6 +783,9 @@ static void ScaleARGBDownEven(int src_width, int src_height, ...@@ -782,6 +783,9 @@ static void ScaleARGBDownEven(int src_width, int src_height,
FilterMode filtering) { FilterMode filtering) {
assert(IS_ALIGNED(src_width, 2)); assert(IS_ALIGNED(src_width, 2));
assert(IS_ALIGNED(src_height, 2)); assert(IS_ALIGNED(src_height, 2));
int col_step = dx >> 16;
int row_stride = (dy >> 16) * src_stride;
src_argb += (y >> 16) * src_stride + (x >> 16) * 4;
void (*ScaleARGBRowDownEven)(const uint8* src_argb, ptrdiff_t src_stride, void (*ScaleARGBRowDownEven)(const uint8* src_argb, ptrdiff_t src_stride,
int src_step, uint8* dst_argb, int dst_width) = int src_step, uint8* dst_argb, int dst_width) =
filtering ? ScaleARGBRowDownEvenInt_C : ScaleARGBRowDownEven_C; filtering ? ScaleARGBRowDownEvenInt_C : ScaleARGBRowDownEven_C;
...@@ -798,9 +802,6 @@ static void ScaleARGBDownEven(int src_width, int src_height, ...@@ -798,9 +802,6 @@ static void ScaleARGBDownEven(int src_width, int src_height,
ScaleARGBRowDownEven_NEON; ScaleARGBRowDownEven_NEON;
} }
#endif #endif
int col_step = dx >> 16;
int row_stride = (dy >> 16) * src_stride;
src_argb += (y >> 16) * src_stride + (x >> 16) * 4;
for (int y = 0; y < dst_height; ++y) { for (int y = 0; y < dst_height; ++y) {
ScaleARGBRowDownEven(src_argb, src_stride, col_step, dst_argb, dst_width); ScaleARGBRowDownEven(src_argb, src_stride, col_step, dst_argb, dst_width);
......
...@@ -27,8 +27,8 @@ void ScaleARGBRowDown2_NEON(const uint8* src_ptr, ptrdiff_t /* src_stride */, ...@@ -27,8 +27,8 @@ void ScaleARGBRowDown2_NEON(const uint8* src_ptr, ptrdiff_t /* src_stride */,
"vld2.u32 {q0, q1}, [%0]! \n" "vld2.u32 {q0, q1}, [%0]! \n"
"vld2.u32 {q2, q3}, [%0]! \n" "vld2.u32 {q2, q3}, [%0]! \n"
"subs %2, %2, #8 \n" // 8 processed per loop "subs %2, %2, #8 \n" // 8 processed per loop
"vst1.u8 {q0}, [%1]! \n" // store even pixels "vst1.u8 {q1}, [%1]! \n" // store odd pixels
"vst1.u8 {q2}, [%1]! \n" "vst1.u8 {q3}, [%1]! \n"
"bgt 1b \n" "bgt 1b \n"
: "+r"(src_ptr), // %0 : "+r"(src_ptr), // %0
"+r"(dst), // %1 "+r"(dst), // %1
...@@ -78,6 +78,7 @@ void ScaleARGBRowDownEven_NEON(const uint8* src_argb, ptrdiff_t, ...@@ -78,6 +78,7 @@ void ScaleARGBRowDownEven_NEON(const uint8* src_argb, ptrdiff_t,
int src_stepx, int src_stepx,
uint8* dst_argb, int dst_width) { uint8* dst_argb, int dst_width) {
asm volatile ( asm volatile (
"add %0, #4 \n" // point to odd pixels.
"mov r12, %3, lsl #2 \n" "mov r12, %3, lsl #2 \n"
".p2align 2 \n" ".p2align 2 \n"
"1: \n" "1: \n"
......
...@@ -39,6 +39,7 @@ void ScaleRowDown2_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t /* src_stride */, ...@@ -39,6 +39,7 @@ void ScaleRowDown2_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t /* src_stride */,
"lw $t5, 20(%[src_ptr]) \n" // |23|22|21|20| "lw $t5, 20(%[src_ptr]) \n" // |23|22|21|20|
"lw $t6, 24(%[src_ptr]) \n" // |27|26|25|24| "lw $t6, 24(%[src_ptr]) \n" // |27|26|25|24|
"lw $t7, 28(%[src_ptr]) \n" // |31|30|29|28| "lw $t7, 28(%[src_ptr]) \n" // |31|30|29|28|
// TODO(fbarchard): Use odd pixels instead of even.
"precr.qb.ph $t8, $t1, $t0 \n" // |6|4|2|0| "precr.qb.ph $t8, $t1, $t0 \n" // |6|4|2|0|
"precr.qb.ph $t0, $t3, $t2 \n" // |14|12|10|8| "precr.qb.ph $t0, $t3, $t2 \n" // |14|12|10|8|
"precr.qb.ph $t1, $t5, $t4 \n" // |22|20|18|16| "precr.qb.ph $t1, $t5, $t4 \n" // |22|20|18|16|
......
...@@ -29,7 +29,7 @@ void ScaleRowDown2_NEON(const uint8* src_ptr, ptrdiff_t /* src_stride */, ...@@ -29,7 +29,7 @@ void ScaleRowDown2_NEON(const uint8* src_ptr, ptrdiff_t /* src_stride */,
// load even pixels into q0, odd into q1 // load even pixels into q0, odd into q1
"vld2.u8 {q0,q1}, [%0]! \n" "vld2.u8 {q0,q1}, [%0]! \n"
"subs %2, %2, #16 \n" // 16 processed per loop "subs %2, %2, #16 \n" // 16 processed per loop
"vst1.u8 {q0}, [%1]! \n" // store even pixels "vst1.u8 {q1}, [%1]! \n" // store odd pixels
"bgt 1b \n" "bgt 1b \n"
: "+r"(src_ptr), // %0 : "+r"(src_ptr), // %0
"+r"(dst), // %1 "+r"(dst), // %1
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment