Commit b5ea79d8 authored by fbarchard@google.com

Make ScaleAddRows handle a height of 1 by using a more general while-style loop.

BUG=none
TESTED=try bots

Review URL: https://webrtc-codereview.appspot.com/45999004

git-svn-id: http://libyuv.googlecode.com/svn/trunk@1366 16f28f9a-4ce2-e073-06de-1de4eb20be90
parent c7161d1c
Name: libyuv Name: libyuv
URL: http://code.google.com/p/libyuv/ URL: http://code.google.com/p/libyuv/
Version: 1365 Version: 1366
License: BSD License: BSD
License File: LICENSE License File: LICENSE
......
...@@ -11,6 +11,6 @@ ...@@ -11,6 +11,6 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT #ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT
#define INCLUDE_LIBYUV_VERSION_H_ #define INCLUDE_LIBYUV_VERSION_H_
#define LIBYUV_VERSION 1365 #define LIBYUV_VERSION 1366
#endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT #endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT
...@@ -745,8 +745,8 @@ static void ScalePlaneBox(int src_width, int src_height, ...@@ -745,8 +745,8 @@ static void ScalePlaneBox(int src_width, int src_height,
ScaleSlope(src_width, src_height, dst_width, dst_height, kFilterBox, ScaleSlope(src_width, src_height, dst_width, dst_height, kFilterBox,
&x, &y, &dx, &dy); &x, &y, &dx, &dy);
src_width = Abs(src_width); src_width = Abs(src_width);
// TODO(fbarchard): Remove this and make AddRows handle boxheight 1. // TODO(fbarchard): Remove this and make AddRows handle odd width.
if (!IS_ALIGNED(src_width, 16) || dst_height * 2 > src_height) { if (!IS_ALIGNED(src_width, 16)) {
uint8* dst = dst_ptr; uint8* dst = dst_ptr;
int j; int j;
for (j = 0; j < dst_height; ++j) { for (j = 0; j < dst_height; ++j) {
......
...@@ -573,44 +573,38 @@ void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr, ...@@ -573,44 +573,38 @@ void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr,
); );
} }
// Reads 16xN bytes and produces 16 shorts at a time.
void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
uint16* dst_ptr, int src_width, int src_height) { uint16* dst_ptr, int src_width, int src_height) {
int tmp_height = 0; int tmp_height = 0;
intptr_t tmp_src = 0; intptr_t tmp_src = 0;
asm volatile ( asm volatile (
"pxor %%xmm4,%%xmm4 \n" "pxor %%xmm4,%%xmm4 \n"
"sub $0x1,%5 \n" "mov %0,%3 \n" // row pointer
"mov %5,%2 \n" // height
"pxor %%xmm0,%%xmm0 \n" // clear accumulators
"pxor %%xmm1,%%xmm1 \n"
LABELALIGN LABELALIGN
"1: \n" "1: \n"
"movdqu " MEMACCESS(0) ",%%xmm0 \n" "movdqu " MEMACCESS(3) ",%%xmm2 \n"
"mov %0,%3 \n" "add %6,%3 \n"
"add %6,%0 \n"
"movdqa %%xmm0,%%xmm1 \n"
"punpcklbw %%xmm4,%%xmm0 \n"
"punpckhbw %%xmm4,%%xmm1 \n"
"mov %5,%2 \n"
"test %2,%2 \n"
"je 3f \n"
LABELALIGN
"2: \n"
"movdqu " MEMACCESS(0) ",%%xmm2 \n"
"add %6,%0 \n"
"movdqa %%xmm2,%%xmm3 \n" "movdqa %%xmm2,%%xmm3 \n"
"punpcklbw %%xmm4,%%xmm2 \n" "punpcklbw %%xmm4,%%xmm2 \n"
"punpckhbw %%xmm4,%%xmm3 \n" "punpckhbw %%xmm4,%%xmm3 \n"
"paddusw %%xmm2,%%xmm0 \n" "paddusw %%xmm2,%%xmm0 \n"
"paddusw %%xmm3,%%xmm1 \n" "paddusw %%xmm3,%%xmm1 \n"
"sub $0x1,%2 \n" "sub $0x1,%2 \n"
"jg 2b \n" "jg 1b \n"
LABELALIGN
"3: \n"
"movdqu %%xmm0," MEMACCESS(1) " \n" "movdqu %%xmm0," MEMACCESS(1) " \n"
"movdqu %%xmm1," MEMACCESS2(0x10,1) " \n" "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n"
"lea " MEMLEA(0x10,3) ",%0 \n"
"lea " MEMLEA(0x20,1) ",%1 \n" "lea " MEMLEA(0x20,1) ",%1 \n"
"lea " MEMLEA(0x10,0) ",%0 \n" // src_ptr += 16
"mov %0,%3 \n" // row pointer
"mov %5,%2 \n" // height
"pxor %%xmm0,%%xmm0 \n" // clear accumulators
"pxor %%xmm1,%%xmm1 \n"
"sub $0x10,%4 \n" "sub $0x10,%4 \n"
"jg 1b \n" "jg 1b \n"
: "+r"(src_ptr), // %0 : "+r"(src_ptr), // %0
...@@ -799,8 +793,7 @@ void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb, ...@@ -799,8 +793,7 @@ void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb,
// Reads 4 pixels at a time. // Reads 4 pixels at a time.
// Alignment requirement: dst_argb 16 byte aligned. // Alignment requirement: dst_argb 16 byte aligned.
void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride, void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
int src_stepx, int src_stepx, uint8* dst_argb, int dst_width) {
uint8* dst_argb, int dst_width) {
intptr_t src_stepx_x4 = (intptr_t)(src_stepx); intptr_t src_stepx_x4 = (intptr_t)(src_stepx);
intptr_t src_stepx_x12 = 0; intptr_t src_stepx_x12 = 0;
asm volatile ( asm volatile (
......
...@@ -708,11 +708,9 @@ void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr, ...@@ -708,11 +708,9 @@ void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr,
} }
// Reads 16xN bytes and produces 16 shorts at a time. // Reads 16xN bytes and produces 16 shorts at a time.
// TODO(fbarchard): Make this handle 4xN bytes for any width ARGB.
__declspec(naked) __declspec(naked)
void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
uint16* dst_ptr, int src_width, uint16* dst_ptr, int src_width, int src_height) {
int src_height) {
__asm { __asm {
push esi push esi
push edi push edi
...@@ -724,21 +722,14 @@ void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, ...@@ -724,21 +722,14 @@ void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
mov ecx, [esp + 16 + 16] // dst_width mov ecx, [esp + 16 + 16] // dst_width
mov ebx, [esp + 16 + 20] // height mov ebx, [esp + 16 + 20] // height
pxor xmm4, xmm4 pxor xmm4, xmm4
dec ebx mov eax, esi // row pointer
mov ebp, ebx // height
pxor xmm0, xmm0 // clear accumulators
pxor xmm1, xmm1
xloop: xloop:
// first row
movdqu xmm0, [esi] // sum rows
lea eax, [esi + edx]
movdqa xmm1, xmm0
punpcklbw xmm0, xmm4
punpckhbw xmm1, xmm4
lea esi, [esi + 16]
mov ebp, ebx
test ebp, ebp
je ydone
// sum remaining rows
yloop: yloop:
movdqu xmm2, [eax] // read 16 pixels movdqu xmm2, [eax] // read 16 pixels
lea eax, [eax + edx] // advance to next row lea eax, [eax + edx] // advance to next row
...@@ -750,11 +741,14 @@ void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, ...@@ -750,11 +741,14 @@ void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
sub ebp, 1 sub ebp, 1
jg yloop jg yloop
ydone:
movdqu [edi], xmm0 movdqu [edi], xmm0
movdqu [edi + 16], xmm1 movdqu [edi + 16], xmm1
lea edi, [edi + 32] lea edi, [edi + 32] // dst_ptr += 16
lea esi, [esi + 16] // src_ptr += 16
mov eax, esi // row pointer
mov ebp, ebx // height
pxor xmm0, xmm0 // clear accumulators
pxor xmm1, xmm1
sub ecx, 16 sub ecx, 16
jg xloop jg xloop
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment