Commit b5ea79d8 authored by fbarchard@google.com

Make ScaleAddRows handle a height of 1 by using a more general while-style loop.

BUG=none
TESTED=try bots

Review URL: https://webrtc-codereview.appspot.com/45999004

git-svn-id: http://libyuv.googlecode.com/svn/trunk@1366 16f28f9a-4ce2-e073-06de-1de4eb20be90
parent c7161d1c
Name: libyuv
URL: http://code.google.com/p/libyuv/
Version: 1365
Version: 1366
License: BSD
License File: LICENSE
......
......@@ -11,6 +11,6 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT
#define INCLUDE_LIBYUV_VERSION_H_
#define LIBYUV_VERSION 1365
#define LIBYUV_VERSION 1366
#endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT
......@@ -745,8 +745,8 @@ static void ScalePlaneBox(int src_width, int src_height,
ScaleSlope(src_width, src_height, dst_width, dst_height, kFilterBox,
&x, &y, &dx, &dy);
src_width = Abs(src_width);
// TODO(fbarchard): Remove this and make AddRows handle boxheight 1.
if (!IS_ALIGNED(src_width, 16) || dst_height * 2 > src_height) {
// TODO(fbarchard): Remove this and make AddRows handle odd width.
if (!IS_ALIGNED(src_width, 16)) {
uint8* dst = dst_ptr;
int j;
for (j = 0; j < dst_height; ++j) {
......
......@@ -573,44 +573,38 @@ void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr,
);
}
// Reads 16xN bytes and produces 16 shorts at a time.
void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
uint16* dst_ptr, int src_width, int src_height) {
int tmp_height = 0;
intptr_t tmp_src = 0;
asm volatile (
"pxor %%xmm4,%%xmm4 \n"
"sub $0x1,%5 \n"
"mov %0,%3 \n" // row pointer
"mov %5,%2 \n" // height
"pxor %%xmm0,%%xmm0 \n" // clear accumulators
"pxor %%xmm1,%%xmm1 \n"
LABELALIGN
"1: \n"
"movdqu " MEMACCESS(0) ",%%xmm0 \n"
"mov %0,%3 \n"
"add %6,%0 \n"
"movdqa %%xmm0,%%xmm1 \n"
"punpcklbw %%xmm4,%%xmm0 \n"
"punpckhbw %%xmm4,%%xmm1 \n"
"mov %5,%2 \n"
"test %2,%2 \n"
"je 3f \n"
LABELALIGN
"2: \n"
"movdqu " MEMACCESS(0) ",%%xmm2 \n"
"add %6,%0 \n"
"movdqu " MEMACCESS(3) ",%%xmm2 \n"
"add %6,%3 \n"
"movdqa %%xmm2,%%xmm3 \n"
"punpcklbw %%xmm4,%%xmm2 \n"
"punpckhbw %%xmm4,%%xmm3 \n"
"paddusw %%xmm2,%%xmm0 \n"
"paddusw %%xmm3,%%xmm1 \n"
"sub $0x1,%2 \n"
"jg 2b \n"
"jg 1b \n"
LABELALIGN
"3: \n"
"movdqu %%xmm0," MEMACCESS(1) " \n"
"movdqu %%xmm1," MEMACCESS2(0x10,1) " \n"
"lea " MEMLEA(0x10,3) ",%0 \n"
"lea " MEMLEA(0x20,1) ",%1 \n"
"lea " MEMLEA(0x10,0) ",%0 \n" // src_ptr += 16
"mov %0,%3 \n" // row pointer
"mov %5,%2 \n" // height
"pxor %%xmm0,%%xmm0 \n" // clear accumulators
"pxor %%xmm1,%%xmm1 \n"
"sub $0x10,%4 \n"
"jg 1b \n"
: "+r"(src_ptr), // %0
......@@ -799,8 +793,7 @@ void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb,
// Reads 4 pixels at a time.
// Alignment requirement: dst_argb 16 byte aligned.
void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
int src_stepx,
uint8* dst_argb, int dst_width) {
int src_stepx, uint8* dst_argb, int dst_width) {
intptr_t src_stepx_x4 = (intptr_t)(src_stepx);
intptr_t src_stepx_x12 = 0;
asm volatile (
......
......@@ -708,11 +708,9 @@ void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr,
}
// Reads 16xN bytes and produces 16 shorts at a time.
// TODO(fbarchard): Make this handle 4xN bytes for any width ARGB.
__declspec(naked)
void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
uint16* dst_ptr, int src_width,
int src_height) {
uint16* dst_ptr, int src_width, int src_height) {
__asm {
push esi
push edi
......@@ -724,21 +722,14 @@ void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
mov ecx, [esp + 16 + 16] // dst_width
mov ebx, [esp + 16 + 20] // height
pxor xmm4, xmm4
dec ebx
mov eax, esi // row pointer
mov ebp, ebx // height
pxor xmm0, xmm0 // clear accumulators
pxor xmm1, xmm1
xloop:
// first row
movdqu xmm0, [esi]
lea eax, [esi + edx]
movdqa xmm1, xmm0
punpcklbw xmm0, xmm4
punpckhbw xmm1, xmm4
lea esi, [esi + 16]
mov ebp, ebx
test ebp, ebp
je ydone
// sum remaining rows
// sum rows
yloop:
movdqu xmm2, [eax] // read 16 pixels
lea eax, [eax + edx] // advance to next row
......@@ -750,11 +741,14 @@ void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
sub ebp, 1
jg yloop
ydone:
movdqu [edi], xmm0
movdqu [edi + 16], xmm1
lea edi, [edi + 32]
lea edi, [edi + 32] // dst_ptr += 16
lea esi, [esi + 16] // src_ptr += 16
mov eax, esi // row pointer
mov ebp, ebx // height
pxor xmm0, xmm0 // clear accumulators
pxor xmm1, xmm1
sub ecx, 16
jg xloop
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment