Commit 4339f09d authored by fbarchard@google.com

bilinear scaling improvements

- Subpixel-accurate scaling down: the starting x is now passed in along with dx.
- Bilinear blend uses a single multiply instead of 2 multiplies.
- Column filter processes 2 pixels at a time.
- Assembly: address munging (destination is addressed relative to the source pointer).
- Assembly: avoid waiting on the store on Core 2.
BUG=none
TEST=none
Review URL: https://webrtc-codereview.appspot.com/397012

git-svn-id: http://libyuv.googlecode.com/svn/trunk@187 16f28f9a-4ce2-e073-06de-1de4eb20be90
parent 145998b6
Name: libyuv Name: libyuv
URL: http://code.google.com/p/libyuv/ URL: http://code.google.com/p/libyuv/
Version: 185 Version: 186
License: BSD License: BSD
License File: LICENSE License File: LICENSE
......
...@@ -11,7 +11,7 @@ ...@@ -11,7 +11,7 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_ #ifndef INCLUDE_LIBYUV_VERSION_H_
#define INCLUDE_LIBYUV_VERSION_H_ #define INCLUDE_LIBYUV_VERSION_H_
#define LIBYUV_VERSION 185 #define LIBYUV_VERSION 186
#endif // INCLUDE_LIBYUV_VERSION_H_ #endif // INCLUDE_LIBYUV_VERSION_H_
...@@ -1316,6 +1316,7 @@ static void ScaleFilterRows_SSE2(uint8* dst_ptr, const uint8* src_ptr, ...@@ -1316,6 +1316,7 @@ static void ScaleFilterRows_SSE2(uint8* dst_ptr, const uint8* src_ptr,
mov edx, [esp + 8 + 12] // src_stride mov edx, [esp + 8 + 12] // src_stride
mov ecx, [esp + 8 + 16] // dst_width mov ecx, [esp + 8 + 16] // dst_width
mov eax, [esp + 8 + 20] // source_y_fraction (0..255) mov eax, [esp + 8 + 20] // source_y_fraction (0..255)
sub edi, esi
cmp eax, 0 cmp eax, 0
je xloop1 je xloop1
cmp eax, 128 cmp eax, 128
...@@ -1334,7 +1335,6 @@ static void ScaleFilterRows_SSE2(uint8* dst_ptr, const uint8* src_ptr, ...@@ -1334,7 +1335,6 @@ static void ScaleFilterRows_SSE2(uint8* dst_ptr, const uint8* src_ptr,
xloop: xloop:
movdqa xmm0, [esi] movdqa xmm0, [esi]
movdqa xmm2, [esi + edx] movdqa xmm2, [esi + edx]
lea esi, [esi + 16]
movdqa xmm1, xmm0 movdqa xmm1, xmm0
movdqa xmm3, xmm2 movdqa xmm3, xmm2
punpcklbw xmm0, xmm7 punpcklbw xmm0, xmm7
...@@ -1350,43 +1350,40 @@ static void ScaleFilterRows_SSE2(uint8* dst_ptr, const uint8* src_ptr, ...@@ -1350,43 +1350,40 @@ static void ScaleFilterRows_SSE2(uint8* dst_ptr, const uint8* src_ptr,
psrlw xmm0, 8 psrlw xmm0, 8
psrlw xmm1, 8 psrlw xmm1, 8
packuswb xmm0, xmm1 packuswb xmm0, xmm1
movdqa [edi], xmm0
lea edi, [edi + 16]
sub ecx, 16 sub ecx, 16
movdqa [esi + edi], xmm0
lea esi, [esi + 16]
ja xloop ja xloop
mov al, [edi - 1] mov al, [esi + edi - 1]
mov [edi], al mov [esi + edi], al
pop edi pop edi
pop esi pop esi
ret ret
xloop1: xloop1:
movdqa xmm0, [esi] movdqa xmm0, [esi]
lea esi, [esi + 16]
movdqa [edi], xmm0
lea edi, [edi + 16]
sub ecx, 16 sub ecx, 16
movdqa [esi + edi], xmm0
lea esi, [esi + 16]
ja xloop1 ja xloop1
mov al, [edi - 1] mov al, [esi + edi - 1]
mov [edi], al mov [esi + edi], al
pop edi pop edi
pop esi pop esi
ret ret
xloop2: xloop2:
movdqa xmm0, [esi] movdqa xmm0, [esi]
movdqa xmm2, [esi + edx] pavgb xmm0, [esi + edx]
lea esi, [esi + 16]
pavgb xmm0, xmm2
movdqa [edi], xmm0
lea edi, [edi + 16]
sub ecx, 16 sub ecx, 16
movdqa [esi + edi], xmm0
lea esi, [esi + 16]
ja xloop2 ja xloop2
mov al, [edi - 1] mov al, [esi + edi - 1]
mov [edi], al mov [esi + edi], al
pop edi pop edi
pop esi pop esi
ret ret
...@@ -1407,6 +1404,7 @@ static void ScaleFilterRows_SSSE3(uint8* dst_ptr, const uint8* src_ptr, ...@@ -1407,6 +1404,7 @@ static void ScaleFilterRows_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
mov edx, [esp + 8 + 12] // src_stride mov edx, [esp + 8 + 12] // src_stride
mov ecx, [esp + 8 + 16] // dst_width mov ecx, [esp + 8 + 16] // dst_width
mov eax, [esp + 8 + 20] // source_y_fraction (0..255) mov eax, [esp + 8 + 20] // source_y_fraction (0..255)
sub edi, esi
cmp eax, 0 cmp eax, 0
je xloop1 je xloop1
cmp eax, 128 cmp eax, 128
...@@ -1423,7 +1421,6 @@ static void ScaleFilterRows_SSSE3(uint8* dst_ptr, const uint8* src_ptr, ...@@ -1423,7 +1421,6 @@ static void ScaleFilterRows_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
xloop: xloop:
movdqa xmm0, [esi] movdqa xmm0, [esi]
movdqa xmm2, [esi + edx] movdqa xmm2, [esi + edx]
lea esi, [esi + 16]
movdqa xmm1, xmm0 movdqa xmm1, xmm0
punpcklbw xmm0, xmm2 punpcklbw xmm0, xmm2
punpckhbw xmm1, xmm2 punpckhbw xmm1, xmm2
...@@ -1432,47 +1429,43 @@ static void ScaleFilterRows_SSSE3(uint8* dst_ptr, const uint8* src_ptr, ...@@ -1432,47 +1429,43 @@ static void ScaleFilterRows_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
psrlw xmm0, 7 psrlw xmm0, 7
psrlw xmm1, 7 psrlw xmm1, 7
packuswb xmm0, xmm1 packuswb xmm0, xmm1
movdqa [edi], xmm0
lea edi, [edi + 16]
sub ecx, 16 sub ecx, 16
movdqa [esi + edi], xmm0
lea esi, [esi + 16]
ja xloop ja xloop
mov al, [edi - 1] mov al, [esi + edi - 1]
mov [edi], al mov [esi + edi], al
pop edi pop edi
pop esi pop esi
ret ret
xloop1: xloop1:
movdqa xmm0, [esi] movdqa xmm0, [esi]
lea esi, [esi + 16]
movdqa [edi], xmm0
lea edi, [edi + 16]
sub ecx, 16 sub ecx, 16
movdqa [esi + edi], xmm0
lea esi, [esi + 16]
ja xloop1 ja xloop1
mov al, [edi - 1] mov al, [esi + edi - 1]
mov [edi], al mov [esi + edi], al
pop edi pop edi
pop esi pop esi
ret ret
xloop2: xloop2:
movdqa xmm0, [esi] movdqa xmm0, [esi]
movdqa xmm2, [esi + edx] pavgb xmm0, [esi + edx]
lea esi, [esi + 16]
pavgb xmm0, xmm2
movdqa [edi], xmm0
lea edi, [edi + 16]
sub ecx, 16 sub ecx, 16
movdqa [esi + edi], xmm0
lea esi, [esi + 16]
ja xloop2 ja xloop2
mov al, [edi - 1] mov al, [esi + edi - 1]
mov [edi], al mov [esi + edi], al
pop edi pop edi
pop esi pop esi
ret ret
} }
} }
...@@ -2081,6 +2074,7 @@ extern "C" void ScaleFilterRows_SSE2(uint8* dst_ptr, ...@@ -2081,6 +2074,7 @@ extern "C" void ScaleFilterRows_SSE2(uint8* dst_ptr,
"mov 0x14(%esp),%edx \n" "mov 0x14(%esp),%edx \n"
"mov 0x18(%esp),%ecx \n" "mov 0x18(%esp),%ecx \n"
"mov 0x1c(%esp),%eax \n" "mov 0x1c(%esp),%eax \n"
"sub %esi, %edi \n"
"cmp $0x0,%eax \n" "cmp $0x0,%eax \n"
"je 2f \n" "je 2f \n"
"cmp $0x80,%eax \n" "cmp $0x80,%eax \n"
...@@ -2098,7 +2092,6 @@ extern "C" void ScaleFilterRows_SSE2(uint8* dst_ptr, ...@@ -2098,7 +2092,6 @@ extern "C" void ScaleFilterRows_SSE2(uint8* dst_ptr,
"1:" "1:"
"movdqa (%esi),%xmm0 \n" "movdqa (%esi),%xmm0 \n"
"movdqa (%esi,%edx,1),%xmm2 \n" "movdqa (%esi,%edx,1),%xmm2 \n"
"lea 0x10(%esi),%esi \n"
"movdqa %xmm0,%xmm1 \n" "movdqa %xmm0,%xmm1 \n"
"movdqa %xmm2,%xmm3 \n" "movdqa %xmm2,%xmm3 \n"
"punpcklbw %xmm7,%xmm0 \n" "punpcklbw %xmm7,%xmm0 \n"
...@@ -2114,42 +2107,40 @@ extern "C" void ScaleFilterRows_SSE2(uint8* dst_ptr, ...@@ -2114,42 +2107,40 @@ extern "C" void ScaleFilterRows_SSE2(uint8* dst_ptr,
"psrlw $0x8,%xmm0 \n" "psrlw $0x8,%xmm0 \n"
"psrlw $0x8,%xmm1 \n" "psrlw $0x8,%xmm1 \n"
"packuswb %xmm1,%xmm0 \n" "packuswb %xmm1,%xmm0 \n"
"movdqa %xmm0,(%edi) \n"
"lea 0x10(%edi),%edi \n"
"sub $0x10,%ecx \n" "sub $0x10,%ecx \n"
"movdqa %xmm0,(%esi,%edi,1) \n"
"lea 0x10(%esi),%esi \n"
"ja 1b \n" "ja 1b \n"
"mov -0x1(%edi),%al \n"
"mov %al,(%edi) \n" "mov -0x1(%esi,%edi,1),%al \n"
"mov %al,(%esi,%edi,1) \n"
"pop %edi \n" "pop %edi \n"
"pop %esi \n" "pop %esi \n"
"ret \n" "ret \n"
"2:" "2:"
"movdqa (%esi),%xmm0 \n" "movdqa (%esi),%xmm0 \n"
"lea 0x10(%esi),%esi \n"
"movdqa %xmm0,(%edi) \n"
"lea 0x10(%edi),%edi \n"
"sub $0x10,%ecx \n" "sub $0x10,%ecx \n"
"movdqa %xmm0,(%esi,%edi,1) \n"
"lea 0x10(%esi),%esi \n"
"ja 2b \n" "ja 2b \n"
"mov -0x1(%edi),%al \n" "mov -0x1(%esi,%edi,1),%al \n"
"mov %al,(%edi) \n" "mov %al,(%esi,%edi,1) \n"
"pop %edi \n" "pop %edi \n"
"pop %esi \n" "pop %esi \n"
"ret \n" "ret \n"
"3:" "3:"
"movdqa (%esi),%xmm0 \n" "movdqa (%esi),%xmm0 \n"
"movdqa (%esi,%edx,1),%xmm2 \n" "pavgb (%esi,%edx,1),%xmm0 \n"
"lea 0x10(%esi),%esi \n"
"pavgb %xmm2,%xmm0 \n"
"movdqa %xmm0,(%edi) \n"
"lea 0x10(%edi),%edi \n"
"sub $0x10,%ecx \n" "sub $0x10,%ecx \n"
"movdqa %xmm0,(%esi,%edi,1) \n"
"lea 0x10(%esi),%esi \n"
"ja 3b \n" "ja 3b \n"
"mov -0x1(%edi),%al \n" "mov -0x1(%esi,%edi,1),%al \n"
"mov %al,(%edi) \n" "mov %al,(%esi,%edi,1) \n"
"pop %edi \n" "pop %edi \n"
"pop %esi \n" "pop %esi \n"
"ret \n" "ret \n"
...@@ -2169,6 +2160,7 @@ extern "C" void ScaleFilterRows_SSSE3(uint8* dst_ptr, ...@@ -2169,6 +2160,7 @@ extern "C" void ScaleFilterRows_SSSE3(uint8* dst_ptr,
"mov 0x14(%esp),%edx \n" "mov 0x14(%esp),%edx \n"
"mov 0x18(%esp),%ecx \n" "mov 0x18(%esp),%ecx \n"
"mov 0x1c(%esp),%eax \n" "mov 0x1c(%esp),%eax \n"
"sub %esi, %edi \n"
"cmp $0x0,%eax \n" "cmp $0x0,%eax \n"
"je 2f \n" "je 2f \n"
"cmp $0x80,%eax \n" "cmp $0x80,%eax \n"
...@@ -2184,7 +2176,6 @@ extern "C" void ScaleFilterRows_SSSE3(uint8* dst_ptr, ...@@ -2184,7 +2176,6 @@ extern "C" void ScaleFilterRows_SSSE3(uint8* dst_ptr,
"1:" "1:"
"movdqa (%esi),%xmm0 \n" "movdqa (%esi),%xmm0 \n"
"movdqa (%esi,%edx,1),%xmm2 \n" "movdqa (%esi,%edx,1),%xmm2 \n"
"lea 0x10(%esi),%esi \n"
"movdqa %xmm0,%xmm1 \n" "movdqa %xmm0,%xmm1 \n"
"punpcklbw %xmm2,%xmm0 \n" "punpcklbw %xmm2,%xmm0 \n"
"punpckhbw %xmm2,%xmm1 \n" "punpckhbw %xmm2,%xmm1 \n"
...@@ -2193,40 +2184,40 @@ extern "C" void ScaleFilterRows_SSSE3(uint8* dst_ptr, ...@@ -2193,40 +2184,40 @@ extern "C" void ScaleFilterRows_SSSE3(uint8* dst_ptr,
"psrlw $0x7,%xmm0 \n" "psrlw $0x7,%xmm0 \n"
"psrlw $0x7,%xmm1 \n" "psrlw $0x7,%xmm1 \n"
"packuswb %xmm1,%xmm0 \n" "packuswb %xmm1,%xmm0 \n"
"movdqa %xmm0,(%edi) \n"
"lea 0x10(%edi),%edi \n"
"sub $0x10,%ecx \n" "sub $0x10,%ecx \n"
"movdqa %xmm0,(%esi,%edi,1) \n"
"lea 0x10(%esi),%esi \n"
"ja 1b \n" "ja 1b \n"
"mov -0x1(%edi),%al \n"
"mov %al,(%edi) \n" "mov -0x1(%esi,%edi,1),%al \n"
"mov %al,(%esi,%edi,1) \n"
"pop %edi \n" "pop %edi \n"
"pop %esi \n" "pop %esi \n"
"ret \n" "ret \n"
"2:" "2:"
"movdqa (%esi),%xmm0 \n" "movdqa (%esi),%xmm0 \n"
"lea 0x10(%esi),%esi \n"
"movdqa %xmm0,(%edi) \n"
"lea 0x10(%edi),%edi \n"
"sub $0x10,%ecx \n" "sub $0x10,%ecx \n"
"movdqa %xmm0,(%esi,%edi,1) \n"
"lea 0x10(%esi),%esi \n"
"ja 2b \n" "ja 2b \n"
"mov -0x1(%edi),%al \n"
"mov %al,(%edi) \n" "mov -0x1(%esi,%edi,1),%al \n"
"mov %al,(%esi,%edi,1) \n"
"pop %edi \n" "pop %edi \n"
"pop %esi \n" "pop %esi \n"
"ret \n" "ret \n"
"3:" "3:"
"movdqa (%esi),%xmm0 \n" "movdqa (%esi),%xmm0 \n"
"movdqa (%esi,%edx,1),%xmm2 \n" "pavgb (%esi,%edx,1),%xmm0 \n"
"lea 0x10(%esi),%esi \n"
"pavgb %xmm2,%xmm0 \n"
"movdqa %xmm0,(%edi) \n"
"lea 0x10(%edi),%edi \n"
"sub $0x10,%ecx \n" "sub $0x10,%ecx \n"
"movdqa %xmm0,(%esi,%edi,1) \n"
"lea 0x10(%esi),%esi \n"
"ja 3b \n" "ja 3b \n"
"mov -0x1(%edi),%al \n"
"mov %al,(%edi) \n" "mov -0x1(%esi,%edi,1),%al \n"
"mov %al,(%esi,%edi,1) \n"
"pop %edi \n" "pop %edi \n"
"pop %esi \n" "pop %esi \n"
"ret \n" "ret \n"
...@@ -2921,16 +2912,30 @@ static void ScaleFilterCols34_C(uint8* dst_ptr, const uint8* src_ptr, ...@@ -2921,16 +2912,30 @@ static void ScaleFilterCols34_C(uint8* dst_ptr, const uint8* src_ptr,
} }
#endif #endif
// As rendered here, each line shows the old revision on the left and the new
// revision on the right; the comments added below describe the new (right) side.
//
// BLENDER(a, b, f): linear blend of a toward b using a single multiply.
// f is the fraction; callers below pass x & 0xffff (the low 16 bits of x).
// (1-f)a + fb can be replaced with a + f(b-a)
#define BLENDER(a, b, f) ((int)(a) + ((f) * ((int)(b) - (int)(a)) >> 16))
// TODO(fbarchard): consider +0x8000 for rounding if it can be done for free.
// Horizontally filters one row: each output pixel blends two neighbouring
// source pixels.
//   dst_ptr   - output row, dst_width bytes are written.
//   src_ptr   - input row; src_ptr[xi] and src_ptr[xi + 1] are both read, so
//               one readable byte is needed after the last sampled position.
//   dst_width - number of output pixels.
//   x         - starting source position; x >> 16 is the integer pixel index
//               and x & 0xffff is the blend fraction.
//   dx        - step added to x after every output pixel.
static void ScaleFilterCols_C(uint8* dst_ptr, const uint8* src_ptr, static void ScaleFilterCols_C(uint8* dst_ptr, const uint8* src_ptr,
int dst_width, int dx) { int dst_width, int x, int dx) {
// New side: main loop produces two output pixels per iteration.
int x = 0; for (int j = 0; j < dst_width - 1; j += 2) {
for (int j = 0; j < dst_width; ++j) {
int xi = x >> 16; int xi = x >> 16;
int xf1 = x & 0xffff; int a = src_ptr[xi];
int xf0 = 65536 - xf1; int b = src_ptr[xi + 1];
dst_ptr[0] = BLENDER(a, b, x & 0xffff);
*dst_ptr++ = (src_ptr[xi] * xf0 + src_ptr[xi + 1] * xf1) >> 16; x += dx;
// Second pixel of the pair.
xi = x >> 16;
a = src_ptr[xi];
b = src_ptr[xi + 1];
dst_ptr[1] = BLENDER(a, b, x & 0xffff);
x += dx; x += dx;
dst_ptr += 2;
}
// Odd dst_width: one remaining pixel that the paired loop did not produce.
if (dst_width & 1) {
int xi = x >> 16;
int a = src_ptr[xi];
int b = src_ptr[xi + 1];
dst_ptr[0] = BLENDER(a, b, x & 0xffff);
} }
} }
...@@ -3340,10 +3345,9 @@ static __inline uint32 SumBox(int iboxwidth, int iboxheight, ...@@ -3340,10 +3345,9 @@ static __inline uint32 SumBox(int iboxwidth, int iboxheight,
return sum; return sum;
} }
static void ScalePlaneBoxRow(int dst_width, int boxheight, static void ScalePlaneBoxRow_C(int dst_width, int boxheight,
int dx, int src_stride, int x, int dx, int src_stride,
const uint8* src_ptr, uint8* dst_ptr) { const uint8* src_ptr, uint8* dst_ptr) {
int x = 0;
for (int i = 0; i < dst_width; ++i) { for (int i = 0; i < dst_width; ++i) {
int ix = x >> 16; int ix = x >> 16;
x += dx; x += dx;
...@@ -3362,14 +3366,13 @@ static __inline uint32 SumPixels(int iboxwidth, const uint16* src_ptr) { ...@@ -3362,14 +3366,13 @@ static __inline uint32 SumPixels(int iboxwidth, const uint16* src_ptr) {
return sum; return sum;
} }
static void ScaleAddCols2_C(int dst_width, int boxheight, int dx, static void ScaleAddCols2_C(int dst_width, int boxheight, int x, int dx,
const uint16* src_ptr, uint8* dst_ptr) { const uint16* src_ptr, uint8* dst_ptr) {
int scaletbl[2]; int scaletbl[2];
int minboxwidth = (dx >> 16); int minboxwidth = (dx >> 16);
scaletbl[0] = 65536 / (minboxwidth * boxheight); scaletbl[0] = 65536 / (minboxwidth * boxheight);
scaletbl[1] = 65536 / ((minboxwidth + 1) * boxheight); scaletbl[1] = 65536 / ((minboxwidth + 1) * boxheight);
int *scaleptr = scaletbl - minboxwidth; int *scaleptr = scaletbl - minboxwidth;
int x = 0;
for (int i = 0; i < dst_width; ++i) { for (int i = 0; i < dst_width; ++i) {
int ix = x >> 16; int ix = x >> 16;
x += dx; x += dx;
...@@ -3378,11 +3381,10 @@ static void ScaleAddCols2_C(int dst_width, int boxheight, int dx, ...@@ -3378,11 +3381,10 @@ static void ScaleAddCols2_C(int dst_width, int boxheight, int dx,
} }
} }
// Box-filter column pass for a whole-number box width: each output pixel is
// the sum of boxwidth consecutive 16-bit entries of src_ptr, scaled by
// 65536 / (boxwidth * boxheight) and shifted down by 16.
// NOTE(review): on the new (right-hand) side x is a parameter instead of a
// local starting at 0. ScalePlaneBox computes that x with (dx >> 1) - 32768 or
// (dx >> 1), the same form the other scalers split with x >> 16, but here it
// is added to src_ptr directly and advanced by boxwidth, i.e. used as a plain
// element index. That looks like it needs an x >>= 16 first - confirm against
// the caller before relying on this path.
static void ScaleAddCols1_C(int dst_width, int boxheight, int dx, static void ScaleAddCols1_C(int dst_width, int boxheight, int x, int dx,
const uint16* src_ptr, uint8* dst_ptr) { const uint16* src_ptr, uint8* dst_ptr) {
int boxwidth = (dx >> 16); int boxwidth = (dx >> 16);
int scaleval = 65536 / (boxwidth * boxheight); int scaleval = 65536 / (boxwidth * boxheight);
int x = 0;
for (int i = 0; i < dst_width; ++i) { for (int i = 0; i < dst_width; ++i) {
*dst_ptr++ = SumPixels(boxwidth, src_ptr + x) * scaleval >> 16; *dst_ptr++ = SumPixels(boxwidth, src_ptr + x) * scaleval >> 16;
x += boxwidth; x += boxwidth;
...@@ -3404,33 +3406,32 @@ static void ScalePlaneBox(int src_width, int src_height, ...@@ -3404,33 +3406,32 @@ static void ScalePlaneBox(int src_width, int src_height,
const uint8* src_ptr, uint8* dst_ptr) { const uint8* src_ptr, uint8* dst_ptr) {
assert(dst_width > 0); assert(dst_width > 0);
assert(dst_height > 0); assert(dst_height > 0);
int dy = (src_height << 16) / dst_height;
int dx = (src_width << 16) / dst_width; int dx = (src_width << 16) / dst_width;
int dy = (src_height << 16) / dst_height;
int x = (dx >= 65536) ? ((dx >> 1) - 32768) : (dx >> 1);
int y = (dy >= 65536) ? ((dy >> 1) - 32768) : (dy >> 1);
int maxy = (src_height << 16);
if (!IS_ALIGNED(src_width, 16) || (src_width > kMaxInputWidth) || if (!IS_ALIGNED(src_width, 16) || (src_width > kMaxInputWidth) ||
dst_height * 2 > src_height) { dst_height * 2 > src_height) {
uint8* dst = dst_ptr; uint8* dst = dst_ptr;
int dy = (src_height << 16) / dst_height;
int dx = (src_width << 16) / dst_width;
int y = 0;
for (int j = 0; j < dst_height; ++j) { for (int j = 0; j < dst_height; ++j) {
int iy = y >> 16; int iy = y >> 16;
const uint8* const src = src_ptr + iy * src_stride; const uint8* src = src_ptr + iy * src_stride;
y += dy; y += dy;
if (y > (src_height << 16)) { if (y > maxy) {
y = (src_height << 16); y = maxy;
} }
int boxheight = (y >> 16) - iy; int boxheight = (y >> 16) - iy;
ScalePlaneBoxRow(dst_width, boxheight, ScalePlaneBoxRow_C(dst_width, boxheight,
dx, src_stride, x, dx, src_stride,
src, dst); src, dst);
dst += dst_stride; dst += dst_stride;
} }
} else { } else {
ALIGN16(uint16 row[kMaxInputWidth]); ALIGN16(uint16 row[kMaxInputWidth]);
void (*ScaleAddRows)(const uint8* src_ptr, int src_stride, void (*ScaleAddRows)(const uint8* src_ptr, int src_stride,
uint16* dst_ptr, int src_width, int src_height); uint16* dst_ptr, int src_width, int src_height);
void (*ScaleAddCols)(int dst_width, int boxheight, int dx, void (*ScaleAddCols)(int dst_width, int boxheight, int x, int dx,
const uint16* src_ptr, uint8* dst_ptr); const uint16* src_ptr, uint8* dst_ptr);
#if defined(HAS_SCALEADDROWS_SSE2) #if defined(HAS_SCALEADDROWS_SSE2)
if (TestCpuFlag(kCpuHasSSE2) && if (TestCpuFlag(kCpuHasSSE2) &&
...@@ -3447,17 +3448,16 @@ static void ScalePlaneBox(int src_width, int src_height, ...@@ -3447,17 +3448,16 @@ static void ScalePlaneBox(int src_width, int src_height,
ScaleAddCols = ScaleAddCols1_C; ScaleAddCols = ScaleAddCols1_C;
} }
int y = 0;
for (int j = 0; j < dst_height; ++j) { for (int j = 0; j < dst_height; ++j) {
int iy = y >> 16; int iy = y >> 16;
const uint8* const src = src_ptr + iy * src_stride; const uint8* src = src_ptr + iy * src_stride;
y += dy; y += dy;
if (y > (src_height << 16)) { if (y > (src_height << 16)) {
y = (src_height << 16); y = (src_height << 16);
} }
int boxheight = (y >> 16) - iy; int boxheight = (y >> 16) - iy;
ScaleAddRows(src, src_stride, row, src_width, boxheight); ScaleAddRows(src, src_stride, row, src_width, boxheight);
ScaleAddCols(dst_width, boxheight, dx, row, dst_ptr); ScaleAddCols(dst_width, boxheight, x, dx, row, dst_ptr);
dst_ptr += dst_stride; dst_ptr += dst_stride;
} }
} }
...@@ -3470,33 +3470,34 @@ static void ScalePlaneBilinearSimple(int src_width, int src_height, ...@@ -3470,33 +3470,34 @@ static void ScalePlaneBilinearSimple(int src_width, int src_height,
int dst_width, int dst_height, int dst_width, int dst_height,
int src_stride, int dst_stride, int src_stride, int dst_stride,
const uint8* src_ptr, uint8* dst_ptr) { const uint8* src_ptr, uint8* dst_ptr) {
uint8* dst = dst_ptr;
int dx = (src_width << 16) / dst_width; int dx = (src_width << 16) / dst_width;
int dy = (src_height << 16) / dst_height; int dy = (src_height << 16) / dst_height;
int maxx = ((src_width - 1) << 16) - 1; int y = (dy >= 65536) ? ((dy >> 1) - 32768) : (dy >> 1);
int maxy = ((src_height - 1) << 16) - 1; int maxx = (src_width > 1) ? ((src_width - 1) << 16) - 1 : 0;
int y = (dst_height < src_height) ? 32768 : int maxy = (src_height > 1) ? ((src_height - 1) << 16) - 1 : 0;
(src_height << 16) / dst_height - 32768;
for (int i = 0; i < dst_height; ++i) { for (int i = 0; i < dst_height; ++i) {
int cy = (y < 0) ? 0 : y; int x = (dx >= 65536) ? ((dx >> 1) - 32768) : (dx >> 1);
int yi = cy >> 16; int yi = y >> 16;
int yf = cy & 0xffff; int yf = y & 0xffff;
const uint8* const src = src_ptr + yi * src_stride; const uint8* src0 = src_ptr + yi * src_stride;
int x = (dst_width < src_width) ? 32768 : const uint8* src1 = (yi < src_height - 1) ? src0 + src_stride : src0;
(src_width << 16) / dst_width - 32768; uint8* dst = dst_ptr;
for (int j = 0; j < dst_width; ++j) { for (int j = 0; j < dst_width; ++j) {
int cx = (x < 0) ? 0 : x; int xi = x >> 16;
int xi = cx >> 16; int xf = x & 0xffff;
int xf = cx & 0xffff; int x1 = (xi < src_width - 1) ? xi + 1 : xi;
int r0 = (src[xi] * (65536 - xf) + src[xi + 1] * xf) >> 16; int a = src0[xi];
int r1 = (src[xi + src_stride] * (65536 - xf) + int b = src0[x1];
src[xi + src_stride + 1] * xf) >> 16; int r0 = BLENDER(a, b, xf);
*dst++ = (r0 * (65536 - yf) + r1 * yf) >> 16; a = src1[xi];
b = src1[x1];
int r1 = BLENDER(a, b, xf);
*dst++ = BLENDER(r0, r1, yf);
x += dx; x += dx;
if (x > maxx) if (x > maxx)
x = maxx; x = maxx;
} }
dst += dst_stride - dst_width; dst_ptr += dst_stride;
y += dy; y += dy;
if (y > maxy) if (y > maxy)
y = maxy; y = maxy;
...@@ -3513,8 +3514,6 @@ void ScalePlaneBilinear(int src_width, int src_height, ...@@ -3513,8 +3514,6 @@ void ScalePlaneBilinear(int src_width, int src_height,
const uint8* src_ptr, uint8* dst_ptr) { const uint8* src_ptr, uint8* dst_ptr) {
assert(dst_width > 0); assert(dst_width > 0);
assert(dst_height > 0); assert(dst_height > 0);
int dy = (src_height << 16) / dst_height;
int dx = (src_width << 16) / dst_width;
if (!IS_ALIGNED(src_width, 8) || (src_width > kMaxInputWidth)) { if (!IS_ALIGNED(src_width, 8) || (src_width > kMaxInputWidth)) {
ScalePlaneBilinearSimple(src_width, src_height, dst_width, dst_height, ScalePlaneBilinearSimple(src_width, src_height, dst_width, dst_height,
src_stride, dst_stride, src_ptr, dst_ptr); src_stride, dst_stride, src_ptr, dst_ptr);
...@@ -3524,8 +3523,6 @@ void ScalePlaneBilinear(int src_width, int src_height, ...@@ -3524,8 +3523,6 @@ void ScalePlaneBilinear(int src_width, int src_height,
void (*ScaleFilterRows)(uint8* dst_ptr, const uint8* src_ptr, void (*ScaleFilterRows)(uint8* dst_ptr, const uint8* src_ptr,
int src_stride, int src_stride,
int dst_width, int source_y_fraction); int dst_width, int source_y_fraction);
void (*ScaleFilterCols)(uint8* dst_ptr, const uint8* src_ptr,
int dst_width, int dx);
#if defined(HAS_SCALEFILTERROWS_NEON) #if defined(HAS_SCALEFILTERROWS_NEON)
if (TestCpuFlag(kCpuHasNEON)) { if (TestCpuFlag(kCpuHasNEON)) {
ScaleFilterRows = ScaleFilterRows_NEON; ScaleFilterRows = ScaleFilterRows_NEON;
...@@ -3546,16 +3543,18 @@ void ScalePlaneBilinear(int src_width, int src_height, ...@@ -3546,16 +3543,18 @@ void ScalePlaneBilinear(int src_width, int src_height,
{ {
ScaleFilterRows = ScaleFilterRows_C; ScaleFilterRows = ScaleFilterRows_C;
} }
ScaleFilterCols = ScaleFilterCols_C;
int y = 0; int dx = (src_width << 16) / dst_width;
int maxy = ((src_height - 1) << 16) - 1; // max is filter of last 2 rows. int dy = (src_height << 16) / dst_height;
int x = (dx >= 65536) ? ((dx >> 1) - 32768) : (dx >> 1);
int y = (dy >= 65536) ? ((dy >> 1) - 32768) : (dy >> 1);
int maxy = (src_height > 1) ? ((src_height - 1) << 16) - 1 : 0;
for (int j = 0; j < dst_height; ++j) { for (int j = 0; j < dst_height; ++j) {
int iy = y >> 16; int yi = y >> 16;
int fy = (y >> 8) & 255; int yf = (y >> 8) & 255;
const uint8* const src = src_ptr + iy * src_stride; const uint8* src = src_ptr + yi * src_stride;
ScaleFilterRows(row, src, src_stride, src_width, fy); ScaleFilterRows(row, src, src_stride, src_width, yf);
ScaleFilterCols(dst_ptr, row, dst_width, dx); ScaleFilterCols_C(dst_ptr, row, dst_width, x, dx);
dst_ptr += dst_stride; dst_ptr += dst_stride;
y += dy; y += dy;
if (y > maxy) { if (y > maxy) {
...@@ -3575,18 +3574,20 @@ static void ScalePlaneSimple(int src_width, int src_height, ...@@ -3575,18 +3574,20 @@ static void ScalePlaneSimple(int src_width, int src_height,
int dst_width, int dst_height, int dst_width, int dst_height,
int src_stride, int dst_stride, int src_stride, int dst_stride,
const uint8* src_ptr, uint8* dst_ptr) { const uint8* src_ptr, uint8* dst_ptr) {
uint8* dst = dst_ptr;
int dx = (src_width << 16) / dst_width; int dx = (src_width << 16) / dst_width;
for (int y = 0; y < dst_height; ++y) { int dy = (src_height << 16) / dst_height;
const uint8* const src = src_ptr + (y * src_height / dst_height) * int y = (dy >= 65536) ? ((dy >> 1) - 32768) : (dy >> 1);
src_stride; for (int j = 0; j < dst_height; ++j) {
// TODO(fbarchard): Round X coordinate by setting x=0x8000. int x = (dx >= 65536) ? ((dx >> 1) - 32768) : (dx >> 1);
int x = 0; int yi = y >> 16;
const uint8* src = src_ptr + yi * src_stride;
uint8* dst = dst_ptr;
for (int i = 0; i < dst_width; ++i) { for (int i = 0; i < dst_width; ++i) {
*dst++ = src[x >> 16]; *dst++ = src[x >> 16];
x += dx; x += dx;
} }
dst += dst_stride - dst_width; dst_ptr += dst_stride;
y += dy;
} }
} }
...@@ -3790,9 +3791,9 @@ int ScaleOffset(const uint8* src, int src_width, int src_height, ...@@ -3790,9 +3791,9 @@ int ScaleOffset(const uint8* src, int src_width, int src_height,
int dst_halfwidth = (dst_width + 1) >> 1; int dst_halfwidth = (dst_width + 1) >> 1;
int dst_halfheight = (dst_height + 1) >> 1; int dst_halfheight = (dst_height + 1) >> 1;
int aheight = dst_height - dst_yoffset * 2; // actual output height int aheight = dst_height - dst_yoffset * 2; // actual output height
const uint8* const src_y = src; const uint8* src_y = src;
const uint8* const src_u = src + src_width * src_height; const uint8* src_u = src + src_width * src_height;
const uint8* const src_v = src + src_width * src_height + const uint8* src_v = src + src_width * src_height +
src_halfwidth * src_halfheight; src_halfwidth * src_halfheight;
uint8* dst_y = dst + dst_yoffset * dst_width; uint8* dst_y = dst + dst_yoffset * dst_width;
uint8* dst_u = dst + dst_width * dst_height + uint8* dst_u = dst + dst_width * dst_height +
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment