Commit 8ffe78ab authored by fbarchard@google.com

Scale down by 4 uses the 3rd pixel

BUG=232
TEST=convert.exe -f 0 faces_640x480_P420.yuv face2_160x120_P420.yuv
R=changjun.yang@intel.com

Review URL: https://webrtc-codereview.appspot.com/1579005

git-svn-id: http://libyuv.googlecode.com/svn/trunk@709 16f28f9a-4ce2-e073-06de-1de4eb20be90
parent 8b54a8f9
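
Note: the behavioral change in this commit is that unfiltered 1/4 point sampling now reads the 3rd pixel of each 4x4 block (column index 2, and row 2 via ScalePlaneDown4) instead of the top-left pixel, so the sample sits nearer the block center. A minimal C sketch of the row kernel before and after (illustrative only; these helper names are hypothetical and are not libyuv API):

#include <stdint.h>

// Before: point-sample the 1st pixel (index 0) of each group of 4.
static void ScaleRowDown4Point_Before(const uint8_t* src, uint8_t* dst, int dst_width) {
  for (int x = 0; x < dst_width; ++x) {
    dst[x] = src[x * 4 + 0];
  }
}

// After: point-sample the 3rd pixel (index 2) of each group of 4.
static void ScaleRowDown4Point_After(const uint8_t* src, uint8_t* dst, int dst_width) {
  for (int x = 0; x < dst_width; ++x) {
    dst[x] = src[x * 4 + 2];
  }
}

The SSE2, NEON and C hunks below make the same selection: the SSE2 mask changes to 0x00ff0000 plus an extra shift, the NEON path stores d2 after a vld4 de-interleave, and the C path reads src_ptr[2]/src_ptr[6].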
Name: libyuv
URL: http://code.google.com/p/libyuv/
Version: 708
Version: 709
License: BSD
License File: LICENSE
......
......@@ -11,6 +11,6 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT
#define INCLUDE_LIBYUV_VERSION_H_
#define LIBYUV_VERSION 708
#define LIBYUV_VERSION 709
#endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT
......@@ -137,15 +137,15 @@ int I422ToI420(const uint8* src_y, int src_stride_y,
#if !defined(LIBYUV_DISABLE_NEON) && \
(defined(__ARM_NEON__) || defined(LIBYUV_NEON))
#define HAS_SCALEROWDOWN2_NEON
void ScaleRowDown2Int_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
void ScaleRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width);
#elif !defined(LIBYUV_DISABLE_X86) && \
(defined(_M_IX86) || defined(__x86_64__) || defined(__i386__))
void ScaleRowDown2Int_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
void ScaleRowDown2Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
#endif
void ScaleRowDown2Int_C(const uint8* src_ptr, ptrdiff_t src_stride,
void ScaleRowDown2Box_C(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
LIBYUV_API
......@@ -173,11 +173,11 @@ int I444ToI420(const uint8* src_y, int src_stride_y,
}
int halfwidth = (width + 1) >> 1;
void (*ScaleRowDown2)(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) = ScaleRowDown2Int_C;
uint8* dst_ptr, int dst_width) = ScaleRowDown2Box_C;
#if defined(HAS_SCALEROWDOWN2_NEON)
if (TestCpuFlag(kCpuHasNEON) &&
IS_ALIGNED(halfwidth, 16)) {
ScaleRowDown2 = ScaleRowDown2Int_NEON;
ScaleRowDown2 = ScaleRowDown2Box_NEON;
}
#elif defined(HAS_SCALEROWDOWN2_SSE2)
if (TestCpuFlag(kCpuHasSSE2) &&
......@@ -186,7 +186,7 @@ int I444ToI420(const uint8* src_y, int src_stride_y,
IS_ALIGNED(src_v, 16) && IS_ALIGNED(src_stride_v, 16) &&
IS_ALIGNED(dst_u, 16) && IS_ALIGNED(dst_stride_u, 16) &&
IS_ALIGNED(dst_v, 16) && IS_ALIGNED(dst_stride_v, 16)) {
ScaleRowDown2 = ScaleRowDown2Int_SSE2;
ScaleRowDown2 = ScaleRowDown2Box_SSE2;
}
#endif
......
......@@ -55,13 +55,13 @@ void SetUseReferenceImpl(bool use) {
void ScaleRowDown2_NEON(const uint8* src_ptr, ptrdiff_t /* src_stride */,
uint8* dst, int dst_width);
void ScaleRowDown2Int_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
void ScaleRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width);
#define HAS_SCALEROWDOWN4_NEON
void ScaleRowDown4_NEON(const uint8* src_ptr, ptrdiff_t /* src_stride */,
uint8* dst_ptr, int dst_width);
void ScaleRowDown4Int_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
void ScaleRowDown4Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
#define HAS_SCALEROWDOWN34_NEON
......@@ -71,10 +71,10 @@ void ScaleRowDown4Int_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
void ScaleRowDown34_NEON(const uint8* src_ptr,
ptrdiff_t /* src_stride */,
uint8* dst_ptr, int dst_width);
void ScaleRowDown34_0_Int_NEON(const uint8* src_ptr,
void ScaleRowDown34_0_Box_NEON(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
void ScaleRowDown34_1_Int_NEON(const uint8* src_ptr,
void ScaleRowDown34_1_Box_NEON(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
......@@ -84,11 +84,11 @@ void ScaleRowDown38_NEON(const uint8* src_ptr,
ptrdiff_t /* src_stride */,
uint8* dst_ptr, int dst_width);
// 32x3 -> 12x1
void ScaleRowDown38_3_Int_NEON(const uint8* src_ptr,
void ScaleRowDown38_3_Box_NEON(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
// 32x2 -> 12x1
void ScaleRowDown38_2_Int_NEON(const uint8* src_ptr,
void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
// 16x2 -> 16x1
......@@ -217,7 +217,7 @@ static void ScaleRowDown2_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
// Blends 32x2 rectangle to 16x1.
// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned.
__declspec(naked) __declspec(align(16))
void ScaleRowDown2Int_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
void ScaleRowDown2Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
__asm {
push esi
......@@ -290,7 +290,7 @@ static void ScaleRowDown2_Unaligned_SSE2(const uint8* src_ptr,
// Blends 32x2 rectangle to 16x1.
// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned.
__declspec(naked) __declspec(align(16))
static void ScaleRowDown2Int_Unaligned_SSE2(const uint8* src_ptr,
static void ScaleRowDown2Box_Unaligned_SSE2(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
__asm {
......@@ -343,8 +343,9 @@ static void ScaleRowDown4_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
// src_stride ignored
mov edx, [esp + 12] // dst_ptr
mov ecx, [esp + 16] // dst_width
pcmpeqb xmm5, xmm5 // generate mask 0x000000ff
pcmpeqb xmm5, xmm5 // generate mask 0x00ff0000
psrld xmm5, 24
pslld xmm5, 16
align 16
wloop:
......@@ -354,6 +355,7 @@ static void ScaleRowDown4_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
pand xmm0, xmm5
pand xmm1, xmm5
packuswb xmm0, xmm1
psrlw xmm0, 8
packuswb xmm0, xmm0
sub ecx, 8
movq qword ptr [edx], xmm0
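
For readers following the SSE2 hunks: with the new mask, pand keeps only byte 2 of every dword; after the first packuswb each 16-bit lane holds that byte in its high half, so the added psrlw by 8 moves it to the low half before the final pack emits it. A scalar model of one dword lane (illustrative only, not libyuv code):

#include <stdint.h>

// Scalar model of what the SSE2 path computes for one 32-bit group of 4 pixels.
static uint8_t SelectThirdPixelOfDword(uint32_t four_pixels) {
  uint32_t masked = four_pixels & 0x00ff0000u;    // pand with mask 0x00ff0000: keep p2
  uint16_t p2     = (uint16_t)(masked >> 16);     // the surviving byte
  uint16_t packed = (uint16_t)(p2 << 8);          // packuswb leaves p2 in the high byte of the word
  return (uint8_t)(packed >> 8);                  // psrlw 8 + final packuswb emit p2
}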
......@@ -367,7 +369,7 @@ static void ScaleRowDown4_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
// Blends 32x4 rectangle to 8x1.
// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
__declspec(naked) __declspec(align(16))
static void ScaleRowDown4Int_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
static void ScaleRowDown4Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
__asm {
push esi
......@@ -425,112 +427,6 @@ static void ScaleRowDown4Int_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
}
}
#define HAS_SCALEROWDOWN8_SSE2
// Point samples 32 pixels to 4 pixels.
// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 4 byte aligned.
__declspec(naked) __declspec(align(16))
static void ScaleRowDown8_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
__asm {
mov eax, [esp + 4] // src_ptr
// src_stride ignored
mov edx, [esp + 12] // dst_ptr
mov ecx, [esp + 16] // dst_width
pcmpeqb xmm5, xmm5 // generate mask isolating 1 src 8 bytes
psrlq xmm5, 56
align 16
wloop:
movdqa xmm0, [eax]
movdqa xmm1, [eax + 16]
lea eax, [eax + 32]
pand xmm0, xmm5
pand xmm1, xmm5
packuswb xmm0, xmm1 // 32->16
packuswb xmm0, xmm0 // 16->8
packuswb xmm0, xmm0 // 8->4
sub ecx, 4
movd dword ptr [edx], xmm0
lea edx, [edx + 4]
jg wloop
ret
}
}
// Blends 32x8 rectangle to 4x1.
// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 4 byte aligned.
__declspec(naked) __declspec(align(16))
static void ScaleRowDown8Int_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
__asm {
push esi
push edi
push ebp
mov eax, [esp + 12 + 4] // src_ptr
mov esi, [esp + 12 + 8] // src_stride
mov edx, [esp + 12 + 12] // dst_ptr
mov ecx, [esp + 12 + 16] // dst_width
lea edi, [esi + esi * 2] // src_stride * 3
pxor xmm7, xmm7
align 16
wloop:
movdqa xmm0, [eax] // average 8 rows to 1
movdqa xmm1, [eax + 16]
movdqa xmm2, [eax + esi]
movdqa xmm3, [eax + esi + 16]
pavgb xmm0, xmm2
pavgb xmm1, xmm3
movdqa xmm2, [eax + esi * 2]
movdqa xmm3, [eax + esi * 2 + 16]
movdqa xmm4, [eax + edi]
movdqa xmm5, [eax + edi + 16]
lea ebp, [eax + esi * 4]
lea eax, [eax + 32]
pavgb xmm2, xmm4
pavgb xmm3, xmm5
pavgb xmm0, xmm2
pavgb xmm1, xmm3
movdqa xmm2, [ebp]
movdqa xmm3, [ebp + 16]
movdqa xmm4, [ebp + esi]
movdqa xmm5, [ebp + esi + 16]
pavgb xmm2, xmm4
pavgb xmm3, xmm5
movdqa xmm4, [ebp + esi * 2]
movdqa xmm5, [ebp + esi * 2 + 16]
movdqa xmm6, [ebp + edi]
pavgb xmm4, xmm6
movdqa xmm6, [ebp + edi + 16]
pavgb xmm5, xmm6
pavgb xmm2, xmm4
pavgb xmm3, xmm5
pavgb xmm0, xmm2
pavgb xmm1, xmm3
psadbw xmm0, xmm7 // average 32 pixels to 4
psadbw xmm1, xmm7
pshufd xmm0, xmm0, 0xd8 // x1x0 -> xx01
pshufd xmm1, xmm1, 0x8d // x3x2 -> 32xx
por xmm0, xmm1 // -> 3201
psrlw xmm0, 3
packuswb xmm0, xmm0
packuswb xmm0, xmm0
sub ecx, 4
movd dword ptr [edx], xmm0
lea edx, [edx + 4]
jg wloop
pop ebp
pop edi
pop esi
ret
}
}
#define HAS_SCALEROWDOWN34_SSSE3
// Point samples 32 pixels to 24 pixels.
// Produces three 8 byte values. For each 8 bytes, 16 bytes are read.
......@@ -588,7 +484,7 @@ static void ScaleRowDown34_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
// Note that movdqa+palign may be better than movdqu.
// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
__declspec(naked) __declspec(align(16))
static void ScaleRowDown34_1_Int_SSSE3(const uint8* src_ptr,
static void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
__asm {
......@@ -647,7 +543,7 @@ static void ScaleRowDown34_1_Int_SSSE3(const uint8* src_ptr,
// Note that movdqa+palign may be better than movdqu.
// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
__declspec(naked) __declspec(align(16))
static void ScaleRowDown34_0_Int_SSSE3(const uint8* src_ptr,
static void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
__asm {
......@@ -743,7 +639,7 @@ static void ScaleRowDown38_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
// Scale 16x3 pixels to 6x1 with interpolation
__declspec(naked) __declspec(align(16))
static void ScaleRowDown38_3_Int_SSSE3(const uint8* src_ptr,
static void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
__asm {
......@@ -809,7 +705,7 @@ static void ScaleRowDown38_3_Int_SSSE3(const uint8* src_ptr,
// Scale 16x2 pixels to 6x1 with interpolation
__declspec(naked) __declspec(align(16))
static void ScaleRowDown38_2_Int_SSSE3(const uint8* src_ptr,
static void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
__asm {
......@@ -1288,7 +1184,7 @@ static void ScaleRowDown2_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
);
}
void ScaleRowDown2Int_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
void ScaleRowDown2Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
asm volatile (
"pcmpeqb %%xmm5,%%xmm5 \n"
......@@ -1353,7 +1249,7 @@ static void ScaleRowDown2_Unaligned_SSE2(const uint8* src_ptr,
);
}
static void ScaleRowDown2Int_Unaligned_SSE2(const uint8* src_ptr,
static void ScaleRowDown2Box_Unaligned_SSE2(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
asm volatile (
......@@ -1398,6 +1294,7 @@ static void ScaleRowDown4_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
asm volatile (
"pcmpeqb %%xmm5,%%xmm5 \n"
"psrld $0x18,%%xmm5 \n"
"pslld $0x10,%%xmm5 \n"
".p2align 4 \n"
"1: \n"
"movdqa (%0),%%xmm0 \n"
......@@ -1406,6 +1303,7 @@ static void ScaleRowDown4_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
"pand %%xmm5,%%xmm0 \n"
"pand %%xmm5,%%xmm1 \n"
"packuswb %%xmm1,%%xmm0 \n"
"psrlw $0x8,%%xmm0 \n"
"packuswb %%xmm0,%%xmm0 \n"
"movq %%xmm0,(%1) \n"
"lea 0x8(%1),%1 \n"
......@@ -1422,7 +1320,7 @@ static void ScaleRowDown4_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
);
}
static void ScaleRowDown4Int_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
static void ScaleRowDown4Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
intptr_t stridex3 = 0;
asm volatile (
......@@ -1476,103 +1374,6 @@ static void ScaleRowDown4Int_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
);
}
#define HAS_SCALEROWDOWN8_SSE2
static void ScaleRowDown8_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
asm volatile (
"pcmpeqb %%xmm5,%%xmm5 \n"
"psrlq $0x38,%%xmm5 \n"
".p2align 4 \n"
"1: \n"
"movdqa (%0),%%xmm0 \n"
"movdqa 0x10(%0),%%xmm1 \n"
"lea 0x20(%0),%0 \n"
"pand %%xmm5,%%xmm0 \n"
"pand %%xmm5,%%xmm1 \n"
"packuswb %%xmm1,%%xmm0 \n"
"packuswb %%xmm0,%%xmm0 \n"
"packuswb %%xmm0,%%xmm0 \n"
"movd %%xmm0,(%1) \n"
"lea 0x4(%1),%1 \n"
"sub $0x4,%2 \n"
"jg 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
:
: "memory", "cc"
#if defined(__SSE2__)
, "xmm0", "xmm1", "xmm5"
#endif
);
}
static void ScaleRowDown8Int_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
intptr_t stridex3 = 0;
intptr_t row4 = 0;
asm volatile (
"lea (%5,%5,2),%3 \n"
"pxor %%xmm7,%%xmm7 \n"
".p2align 4 \n"
"1: \n"
"movdqa (%0),%%xmm0 \n"
"movdqa 0x10(%0),%%xmm1 \n"
"movdqa (%0,%5,1),%%xmm2 \n"
"movdqa 0x10(%0,%5,1),%%xmm3 \n"
"pavgb %%xmm2,%%xmm0 \n"
"pavgb %%xmm3,%%xmm1 \n"
"movdqa (%0,%5,2),%%xmm2 \n"
"movdqa 0x10(%0,%5,2),%%xmm3 \n"
"movdqa (%0,%3,1),%%xmm4 \n"
"movdqa 0x10(%0,%3,1),%%xmm5 \n"
"lea (%0,%5,4),%4 \n"
"lea 0x20(%0),%0 \n"
"pavgb %%xmm4,%%xmm2 \n"
"pavgb %%xmm5,%%xmm3 \n"
"pavgb %%xmm2,%%xmm0 \n"
"pavgb %%xmm3,%%xmm1 \n"
"movdqa 0x0(%4),%%xmm2 \n"
"movdqa 0x10(%4),%%xmm3 \n"
"movdqa 0x0(%4,%5,1),%%xmm4 \n"
"movdqa 0x10(%4,%5,1),%%xmm5 \n"
"pavgb %%xmm4,%%xmm2 \n"
"pavgb %%xmm5,%%xmm3 \n"
"movdqa 0x0(%4,%5,2),%%xmm4 \n"
"movdqa 0x10(%4,%5,2),%%xmm5 \n"
"movdqa 0x0(%4,%3,1),%%xmm6 \n"
"pavgb %%xmm6,%%xmm4 \n"
"movdqa 0x10(%4,%3,1),%%xmm6 \n"
"pavgb %%xmm6,%%xmm5 \n"
"pavgb %%xmm4,%%xmm2 \n"
"pavgb %%xmm5,%%xmm3 \n"
"pavgb %%xmm2,%%xmm0 \n"
"pavgb %%xmm3,%%xmm1 \n"
"psadbw %%xmm7,%%xmm0 \n"
"psadbw %%xmm7,%%xmm1 \n"
"pshufd $0xd8,%%xmm0,%%xmm0 \n"
"pshufd $0x8d,%%xmm1,%%xmm1 \n"
"por %%xmm1,%%xmm0 \n"
"psrlw $0x3,%%xmm0 \n"
"packuswb %%xmm0,%%xmm0 \n"
"packuswb %%xmm0,%%xmm0 \n"
"movd %%xmm0,(%1) \n"
"lea 0x4(%1),%1 \n"
"sub $0x4,%2 \n"
"jg 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+rm"(dst_width), // %2
"+r"(stridex3), // %3
"+r"(row4) // %4
: "r"(static_cast<intptr_t>(src_stride)) // %5
: "memory", "cc"
#if defined(__SSE2__)
, "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
#endif
);
}
#define HAS_SCALEROWDOWN34_SSSE3
static void ScaleRowDown34_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
......@@ -1613,7 +1414,7 @@ static void ScaleRowDown34_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
);
}
static void ScaleRowDown34_1_Int_SSSE3(const uint8* src_ptr,
static void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
asm volatile (
......@@ -1680,7 +1481,7 @@ static void ScaleRowDown34_1_Int_SSSE3(const uint8* src_ptr,
);
}
static void ScaleRowDown34_0_Int_SSSE3(const uint8* src_ptr,
static void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
asm volatile (
......@@ -1783,7 +1584,7 @@ static void ScaleRowDown38_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
);
}
static void ScaleRowDown38_2_Int_SSSE3(const uint8* src_ptr,
static void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
asm volatile (
......@@ -1829,7 +1630,7 @@ static void ScaleRowDown38_2_Int_SSSE3(const uint8* src_ptr,
);
}
static void ScaleRowDown38_3_Int_SSSE3(const uint8* src_ptr,
static void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
asm volatile (
......@@ -2282,7 +2083,7 @@ static void ScaleFilterRows_Unaligned_SSSE3(uint8* dst_ptr,
#define HAS_SCALEROWDOWN2_MIPS_DSPR2
void ScaleRowDown2_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t /* src_stride */,
uint8* dst, int dst_width);
void ScaleRowDown2Int_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
void ScaleRowDown2Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width);
#define HAS_SCALEFILTERROWS_MIPS_DSPR2
void ScaleFilterRows_MIPS_DSPR2(unsigned char *dst_ptr,
......@@ -2292,21 +2093,21 @@ void ScaleFilterRows_MIPS_DSPR2(unsigned char *dst_ptr,
#define HAS_SCALEROWDOWN4_MIPS_DSPR2
void ScaleRowDown4_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t /* src_stride */,
uint8* dst, int dst_width);
void ScaleRowDown4Int_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
void ScaleRowDown4Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width);
#define HAS_SCALEROWDOWN34_MIPS_DSPR2
void ScaleRowDown34_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t /* src_stride */,
uint8* dst, int dst_width);
void ScaleRowDown34_0_Int_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
void ScaleRowDown34_0_Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* d, int dst_width);
void ScaleRowDown34_1_Int_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
void ScaleRowDown34_1_Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* d, int dst_width);
#define HAS_SCALEROWDOWN38_MIPS_DSPR2
void ScaleRowDown38_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t /* src_stride */,
uint8* dst, int dst_width);
void ScaleRowDown38_2_Int_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
void ScaleRowDown38_2_Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
void ScaleRowDown38_3_Int_MIPS_DSPR2(const uint8* src_ptr,
void ScaleRowDown38_3_Box_MIPS_DSPR2(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
#endif // defined(__mips_dsp) && (__mips_dsp_rev >= 2)
......@@ -2326,7 +2127,7 @@ static void ScaleRowDown2_C(const uint8* src_ptr, ptrdiff_t /* src_stride */,
}
}
void ScaleRowDown2Int_C(const uint8* src_ptr, ptrdiff_t src_stride,
void ScaleRowDown2Box_C(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width) {
const uint8* s = src_ptr;
const uint8* t = src_ptr + src_stride;
......@@ -2347,17 +2148,17 @@ static void ScaleRowDown4_C(const uint8* src_ptr, ptrdiff_t /* src_stride */,
uint8* dst, int dst_width) {
uint8* dend = dst + dst_width - 1;
do {
dst[0] = src_ptr[0];
dst[1] = src_ptr[4];
dst[0] = src_ptr[2];
dst[1] = src_ptr[6];
dst += 2;
src_ptr += 8;
} while (dst < dend);
if (dst_width & 1) {
dst[0] = src_ptr[0];
dst[0] = src_ptr[2];
}
}
static void ScaleRowDown4Int_C(const uint8* src_ptr, ptrdiff_t src_stride,
static void ScaleRowDown4Box_C(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width) {
intptr_t stride = src_stride;
uint8* dend = dst + dst_width - 1;
......@@ -2393,33 +2194,6 @@ static void ScaleRowDown4Int_C(const uint8* src_ptr, ptrdiff_t src_stride,
}
}
static void ScaleRowDown8_C(const uint8* src_ptr, ptrdiff_t /* src_stride */,
uint8* dst, int dst_width) {
uint8* dend = dst + dst_width - 1;
do {
dst[0] = src_ptr[0];
dst[1] = src_ptr[8];
dst += 2;
src_ptr += 16;
} while (dst < dend);
if (dst_width & 1) {
dst[0] = src_ptr[0];
}
}
// Note calling code checks width is less than max and if not
// uses ScaleRowDown8_C instead.
static void ScaleRowDown8Int_C(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width) {
SIMD_ALIGNED(uint8 src_row[kMaxStride * 2]);
assert(dst_width <= kMaxStride);
ScaleRowDown4Int_C(src_ptr, src_stride, src_row, dst_width * 2);
ScaleRowDown4Int_C(src_ptr + src_stride * 4, src_stride,
src_row + kMaxStride,
dst_width * 2);
ScaleRowDown2Int_C(src_row, kMaxStride, dst, dst_width);
}
static void ScaleRowDown34_C(const uint8* src_ptr, ptrdiff_t /* src_stride */,
uint8* dst, int dst_width) {
assert((dst_width % 3 == 0) && (dst_width > 0));
......@@ -2434,7 +2208,7 @@ static void ScaleRowDown34_C(const uint8* src_ptr, ptrdiff_t /* src_stride */,
}
// Filter rows 0 and 1 together, 3 : 1
static void ScaleRowDown34_0_Int_C(const uint8* src_ptr, ptrdiff_t src_stride,
static void ScaleRowDown34_0_Box_C(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* d, int dst_width) {
assert((dst_width % 3 == 0) && (dst_width > 0));
const uint8* s = src_ptr;
......@@ -2457,7 +2231,7 @@ static void ScaleRowDown34_0_Int_C(const uint8* src_ptr, ptrdiff_t src_stride,
}
// Filter rows 1 and 2 together, 1 : 1
static void ScaleRowDown34_1_Int_C(const uint8* src_ptr, ptrdiff_t src_stride,
static void ScaleRowDown34_1_Box_C(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* d, int dst_width) {
assert((dst_width % 3 == 0) && (dst_width > 0));
const uint8* s = src_ptr;
......@@ -2524,7 +2298,7 @@ static void ScaleFilterCols34_C(uint8* dst_ptr, const uint8* src_ptr,
#define HAS_SCALEROWDOWN34_SSE2
// Filter rows 0 and 1 together, 3 : 1
static void ScaleRowDown34_0_Int_SSE2(const uint8* src_ptr,
static void ScaleRowDown34_0_Box_SSE2(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
assert((dst_width % 3 == 0) && (dst_width > 0));
......@@ -2534,7 +2308,7 @@ static void ScaleRowDown34_0_Int_SSE2(const uint8* src_ptr,
}
// Filter rows 1 and 2 together, 1 : 1
static void ScaleRowDown34_1_Int_SSE2(const uint8* src_ptr,
static void ScaleRowDown34_1_Box_SSE2(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
assert((dst_width % 3 == 0) && (dst_width > 0));
......@@ -2557,7 +2331,7 @@ static void ScaleRowDown38_C(const uint8* src_ptr, ptrdiff_t /* src_stride */,
}
// 8x3 -> 3x1
static void ScaleRowDown38_3_Int_C(const uint8* src_ptr,
static void ScaleRowDown38_3_Box_C(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
assert((dst_width % 3 == 0) && (dst_width > 0));
......@@ -2583,7 +2357,7 @@ static void ScaleRowDown38_3_Int_C(const uint8* src_ptr,
}
// 8x2 -> 3x1
static void ScaleRowDown38_2_Int_C(const uint8* src_ptr, ptrdiff_t src_stride,
static void ScaleRowDown38_2_Box_C(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
assert((dst_width % 3 == 0) && (dst_width > 0));
intptr_t stride = src_stride;
......@@ -2657,35 +2431,40 @@ static void ScalePlaneDown2(int /* src_width */, int /* src_height */,
FilterMode filtering) {
void (*ScaleRowDown2)(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) =
filtering ? ScaleRowDown2Int_C : ScaleRowDown2_C;
filtering ? ScaleRowDown2Box_C : ScaleRowDown2_C;
int row_stride = src_stride << 1;
if (!filtering) {
src_ptr += src_stride; // Point to odd rows.
src_stride = 0;
}
#if defined(HAS_SCALEROWDOWN2_NEON)
if (TestCpuFlag(kCpuHasNEON) &&
IS_ALIGNED(dst_width, 16)) {
ScaleRowDown2 = filtering ? ScaleRowDown2Int_NEON : ScaleRowDown2_NEON;
if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(dst_width, 16)) {
ScaleRowDown2 = filtering ? ScaleRowDown2Box_NEON : ScaleRowDown2_NEON;
}
#elif defined(HAS_SCALEROWDOWN2_SSE2)
if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 16)) {
ScaleRowDown2 = filtering ? ScaleRowDown2Int_Unaligned_SSE2 :
ScaleRowDown2 = filtering ? ScaleRowDown2Box_Unaligned_SSE2 :
ScaleRowDown2_Unaligned_SSE2;
if (IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16) &&
if (IS_ALIGNED(src_ptr, 16) &&
IS_ALIGNED(src_stride, 16) && IS_ALIGNED(row_stride, 16) &&
IS_ALIGNED(dst_ptr, 16) && IS_ALIGNED(dst_stride, 16)) {
ScaleRowDown2 = filtering ? ScaleRowDown2Int_SSE2 : ScaleRowDown2_SSE2;
ScaleRowDown2 = filtering ? ScaleRowDown2Box_SSE2 : ScaleRowDown2_SSE2;
}
}
#elif defined(HAS_SCALEROWDOWN2_MIPS_DSPR2)
if (TestCpuFlag(kCpuHasMIPS_DSPR2) &&
IS_ALIGNED(src_ptr, 4) && IS_ALIGNED(src_stride, 4) &&
if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(src_ptr, 4) &&
IS_ALIGNED(src_stride, 4) && IS_ALIGNED(row_stride, 4) &&
IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) {
ScaleRowDown2 = filtering ?
ScaleRowDown2Int_MIPS_DSPR2 : ScaleRowDown2_MIPS_DSPR2;
ScaleRowDown2Box_MIPS_DSPR2 : ScaleRowDown2_MIPS_DSPR2;
}
#endif
src_ptr += src_stride; // Point to odd rows.
// TODO(fbarchard): Loop through source height to allow odd height.
for (int y = 0; y < dst_height; ++y) {
ScaleRowDown2(src_ptr, src_stride, dst_ptr, dst_width);
src_ptr += (src_stride << 1);
src_ptr += row_stride;
dst_ptr += dst_stride;
}
}
......@@ -2701,58 +2480,34 @@ static void ScalePlaneDown4(int /* src_width */, int /* src_height */,
FilterMode filtering) {
void (*ScaleRowDown4)(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) =
filtering ? ScaleRowDown4Int_C : ScaleRowDown4_C;
filtering ? ScaleRowDown4Box_C : ScaleRowDown4_C;
int row_stride = src_stride << 2;
if (!filtering) {
src_ptr += src_stride * 2; // Point to row 2.
src_stride = 0;
}
#if defined(HAS_SCALEROWDOWN4_NEON)
if (TestCpuFlag(kCpuHasNEON) &&
IS_ALIGNED(dst_width, 4)) {
ScaleRowDown4 = filtering ? ScaleRowDown4Int_NEON : ScaleRowDown4_NEON;
if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(dst_width, 8)) {
ScaleRowDown4 = filtering ? ScaleRowDown4Box_NEON : ScaleRowDown4_NEON;
}
#elif defined(HAS_SCALEROWDOWN4_SSE2)
if (TestCpuFlag(kCpuHasSSE2) &&
IS_ALIGNED(dst_width, 8) &&
IS_ALIGNED(dst_width, 8) && IS_ALIGNED(row_stride, 16) &&
IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16)) {
ScaleRowDown4 = filtering ? ScaleRowDown4Int_SSE2 : ScaleRowDown4_SSE2;
ScaleRowDown4 = filtering ? ScaleRowDown4Box_SSE2 : ScaleRowDown4_SSE2;
}
#elif defined(HAS_SCALEROWDOWN4_MIPS_DSPR2)
if (TestCpuFlag(kCpuHasMIPS_DSPR2) &&
if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(row_stride, 4) &&
IS_ALIGNED(src_ptr, 4) && IS_ALIGNED(src_stride, 4) &&
IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) {
ScaleRowDown4 = filtering ?
ScaleRowDown4Int_MIPS_DSPR2 : ScaleRowDown4_MIPS_DSPR2;
ScaleRowDown4Box_MIPS_DSPR2 : ScaleRowDown4_MIPS_DSPR2;
}
#endif
for (int y = 0; y < dst_height; ++y) {
ScaleRowDown4(src_ptr, src_stride, dst_ptr, dst_width);
src_ptr += (src_stride << 2);
dst_ptr += dst_stride;
}
}
// Scale plane, 1/8
// This is an optimized version for scaling down a plane to 1/8
// of its original size.
static void ScalePlaneDown8(int /* src_width */, int /* src_height */,
int dst_width, int dst_height,
int src_stride, int dst_stride,
const uint8* src_ptr, uint8* dst_ptr,
FilterMode filtering) {
void (*ScaleRowDown8)(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) =
filtering && (dst_width <= kMaxStride) ?
ScaleRowDown8Int_C : ScaleRowDown8_C;
#if defined(HAS_SCALEROWDOWN8_SSE2)
if (TestCpuFlag(kCpuHasSSE2) &&
IS_ALIGNED(dst_width, 4) &&
IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16)) {
ScaleRowDown8 = filtering ? ScaleRowDown8Int_SSE2 : ScaleRowDown8_SSE2;
}
#endif
for (int y = 0; y < dst_height; ++y) {
ScaleRowDown8(src_ptr, src_stride, dst_ptr, dst_width);
src_ptr += (src_stride << 3);
src_ptr += row_stride;
dst_ptr += dst_stride;
}
}
......@@ -2773,8 +2528,8 @@ static void ScalePlaneDown34(int /* src_width */, int /* src_height */,
ScaleRowDown34_0 = ScaleRowDown34_C;
ScaleRowDown34_1 = ScaleRowDown34_C;
} else {
ScaleRowDown34_0 = ScaleRowDown34_0_Int_C;
ScaleRowDown34_1 = ScaleRowDown34_1_Int_C;
ScaleRowDown34_0 = ScaleRowDown34_0_Box_C;
ScaleRowDown34_1 = ScaleRowDown34_1_Box_C;
}
#if defined(HAS_SCALEROWDOWN34_NEON)
if (TestCpuFlag(kCpuHasNEON) && (dst_width % 24 == 0)) {
......@@ -2782,16 +2537,16 @@ static void ScalePlaneDown34(int /* src_width */, int /* src_height */,
ScaleRowDown34_0 = ScaleRowDown34_NEON;
ScaleRowDown34_1 = ScaleRowDown34_NEON;
} else {
ScaleRowDown34_0 = ScaleRowDown34_0_Int_NEON;
ScaleRowDown34_1 = ScaleRowDown34_1_Int_NEON;
ScaleRowDown34_0 = ScaleRowDown34_0_Box_NEON;
ScaleRowDown34_1 = ScaleRowDown34_1_Box_NEON;
}
}
#endif
#if defined(HAS_SCALEROWDOWN34_SSE2)
if (TestCpuFlag(kCpuHasSSE2) && (dst_width % 24 == 0) &&
IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16) && filtering) {
ScaleRowDown34_0 = ScaleRowDown34_0_Int_SSE2;
ScaleRowDown34_1 = ScaleRowDown34_1_Int_SSE2;
ScaleRowDown34_0 = ScaleRowDown34_0_Box_SSE2;
ScaleRowDown34_1 = ScaleRowDown34_1_Box_SSE2;
}
#endif
#if defined(HAS_SCALEROWDOWN34_SSSE3)
......@@ -2801,8 +2556,8 @@ static void ScalePlaneDown34(int /* src_width */, int /* src_height */,
ScaleRowDown34_0 = ScaleRowDown34_SSSE3;
ScaleRowDown34_1 = ScaleRowDown34_SSSE3;
} else {
ScaleRowDown34_0 = ScaleRowDown34_0_Int_SSSE3;
ScaleRowDown34_1 = ScaleRowDown34_1_Int_SSSE3;
ScaleRowDown34_0 = ScaleRowDown34_0_Box_SSSE3;
ScaleRowDown34_1 = ScaleRowDown34_1_Box_SSSE3;
}
}
#endif
......@@ -2814,8 +2569,8 @@ static void ScalePlaneDown34(int /* src_width */, int /* src_height */,
ScaleRowDown34_0 = ScaleRowDown34_MIPS_DSPR2;
ScaleRowDown34_1 = ScaleRowDown34_MIPS_DSPR2;
} else {
ScaleRowDown34_0 = ScaleRowDown34_0_Int_MIPS_DSPR2;
ScaleRowDown34_1 = ScaleRowDown34_1_Int_MIPS_DSPR2;
ScaleRowDown34_0 = ScaleRowDown34_0_Box_MIPS_DSPR2;
ScaleRowDown34_1 = ScaleRowDown34_1_Box_MIPS_DSPR2;
}
}
#endif
......@@ -2874,8 +2629,8 @@ static void ScalePlaneDown38(int /* src_width */, int /* src_height */,
ScaleRowDown38_3 = ScaleRowDown38_C;
ScaleRowDown38_2 = ScaleRowDown38_C;
} else {
ScaleRowDown38_3 = ScaleRowDown38_3_Int_C;
ScaleRowDown38_2 = ScaleRowDown38_2_Int_C;
ScaleRowDown38_3 = ScaleRowDown38_3_Box_C;
ScaleRowDown38_2 = ScaleRowDown38_2_Box_C;
}
#if defined(HAS_SCALEROWDOWN38_NEON)
if (TestCpuFlag(kCpuHasNEON) && (dst_width % 12 == 0)) {
......@@ -2883,8 +2638,8 @@ static void ScalePlaneDown38(int /* src_width */, int /* src_height */,
ScaleRowDown38_3 = ScaleRowDown38_NEON;
ScaleRowDown38_2 = ScaleRowDown38_NEON;
} else {
ScaleRowDown38_3 = ScaleRowDown38_3_Int_NEON;
ScaleRowDown38_2 = ScaleRowDown38_2_Int_NEON;
ScaleRowDown38_3 = ScaleRowDown38_3_Box_NEON;
ScaleRowDown38_2 = ScaleRowDown38_2_Box_NEON;
}
}
#elif defined(HAS_SCALEROWDOWN38_SSSE3)
......@@ -2894,8 +2649,8 @@ static void ScalePlaneDown38(int /* src_width */, int /* src_height */,
ScaleRowDown38_3 = ScaleRowDown38_SSSE3;
ScaleRowDown38_2 = ScaleRowDown38_SSSE3;
} else {
ScaleRowDown38_3 = ScaleRowDown38_3_Int_SSSE3;
ScaleRowDown38_2 = ScaleRowDown38_2_Int_SSSE3;
ScaleRowDown38_3 = ScaleRowDown38_3_Box_SSSE3;
ScaleRowDown38_2 = ScaleRowDown38_2_Box_SSSE3;
}
}
#elif defined(HAS_SCALEROWDOWN38_MIPS_DSPR2)
......@@ -2906,8 +2661,8 @@ static void ScalePlaneDown38(int /* src_width */, int /* src_height */,
ScaleRowDown38_3 = ScaleRowDown38_MIPS_DSPR2;
ScaleRowDown38_2 = ScaleRowDown38_MIPS_DSPR2;
} else {
ScaleRowDown38_3 = ScaleRowDown38_3_Int_MIPS_DSPR2;
ScaleRowDown38_2 = ScaleRowDown38_2_Int_MIPS_DSPR2;
ScaleRowDown38_3 = ScaleRowDown38_3_Box_MIPS_DSPR2;
ScaleRowDown38_2 = ScaleRowDown38_2_Box_MIPS_DSPR2;
}
}
#endif
......@@ -3330,11 +3085,6 @@ void ScalePlane(const uint8* src, int src_stride,
// optimized, 1/4
ScalePlaneDown4(src_width, src_height, dst_width, dst_height,
src_stride, dst_stride, src, dst, filtering);
} else if (8 * dst_width == src_width && 8 * dst_height == src_height &&
filtering != kFilterBilinear) {
// optimized, 1/8
ScalePlaneDown8(src_width, src_height, dst_width, dst_height,
src_stride, dst_stride, src, dst, filtering);
} else {
// Arbitrary downsample
ScalePlaneDown(src_width, src_height, dst_width, dst_height,
......
......@@ -34,12 +34,12 @@ static __inline int Abs(int v) {
void ScaleARGBRowDownEven_NEON(const uint8* src_argb, int src_stride,
int src_stepx,
uint8* dst_argb, int dst_width);
void ScaleARGBRowDownEvenInt_NEON(const uint8* src_argb, int src_stride,
void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb, int src_stride,
int src_stepx,
uint8* dst_argb, int dst_width);
void ScaleARGBRowDown2_NEON(const uint8* src_ptr, ptrdiff_t /* src_stride */,
uint8* dst, int dst_width);
void ScaleARGBRowDown2Int_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width);
#endif
......@@ -75,7 +75,7 @@ static void ScaleARGBRowDown2_SSE2(const uint8* src_argb,
// Blends 8x2 rectangle to 4x1.
// Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned.
__declspec(naked) __declspec(align(16))
static void ScaleARGBRowDown2Int_SSE2(const uint8* src_argb,
static void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb,
ptrdiff_t src_stride,
uint8* dst_argb, int dst_width) {
__asm {
......@@ -150,7 +150,7 @@ void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
// Blends four 2x2 to 4x1.
// Alignment requirement: dst_argb 16 byte aligned.
__declspec(naked) __declspec(align(16))
static void ScaleARGBRowDownEvenInt_SSE2(const uint8* src_argb,
static void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb,
ptrdiff_t src_stride,
int src_stepx,
uint8* dst_argb, int dst_width) {
......@@ -366,7 +366,7 @@ static void ScaleARGBRowDown2_SSE2(const uint8* src_argb,
);
}
static void ScaleARGBRowDown2Int_SSE2(const uint8* src_argb,
static void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb,
ptrdiff_t src_stride,
uint8* dst_argb, int dst_width) {
asm volatile (
......@@ -438,7 +438,7 @@ void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
// Blends four 2x2 to 4x1.
// Alignment requirement: dst_argb 16 byte aligned.
static void ScaleARGBRowDownEvenInt_SSE2(const uint8* src_argb,
static void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb,
ptrdiff_t src_stride, int src_stepx,
uint8* dst_argb, int dst_width) {
intptr_t src_stepx_x4 = static_cast<intptr_t>(src_stepx);
......@@ -644,7 +644,7 @@ static void ScaleARGBRowDown2_C(const uint8* src_argb,
}
}
static void ScaleARGBRowDown2Int_C(const uint8* src_argb, ptrdiff_t src_stride,
static void ScaleARGBRowDown2Box_C(const uint8* src_argb, ptrdiff_t src_stride,
uint8* dst_argb, int dst_width) {
for (int x = 0; x < dst_width; ++x) {
dst_argb[0] = (src_argb[0] + src_argb[4] +
......@@ -677,7 +677,7 @@ void ScaleARGBRowDownEven_C(const uint8* src_argb, ptrdiff_t /* src_stride */,
}
}
static void ScaleARGBRowDownEvenInt_C(const uint8* src_argb,
static void ScaleARGBRowDownEvenBox_C(const uint8* src_argb,
ptrdiff_t src_stride,
int src_stepx,
uint8* dst_argb, int dst_width) {
......@@ -748,18 +748,18 @@ static void ScaleARGBDown2(int /* src_width */, int /* src_height */,
int row_stride = src_stride * (dy >> 16);
void (*ScaleARGBRowDown2)(const uint8* src_argb, ptrdiff_t src_stride,
uint8* dst_argb, int dst_width) =
filtering ? ScaleARGBRowDown2Int_C : ScaleARGBRowDown2_C;
filtering ? ScaleARGBRowDown2Box_C : ScaleARGBRowDown2_C;
#if defined(HAS_SCALEARGBROWDOWN2_SSE2)
if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 4) &&
IS_ALIGNED(src_argb, 16) && IS_ALIGNED(row_stride, 16) &&
IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride, 16)) {
ScaleARGBRowDown2 = filtering ? ScaleARGBRowDown2Int_SSE2 :
ScaleARGBRowDown2 = filtering ? ScaleARGBRowDown2Box_SSE2 :
ScaleARGBRowDown2_SSE2;
}
#elif defined(HAS_SCALEARGBROWDOWN2_NEON)
if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(dst_width, 8) &&
IS_ALIGNED(src_argb, 4) && IS_ALIGNED(row_stride, 4)) {
ScaleARGBRowDown2 = filtering ? ScaleARGBRowDown2Int_NEON :
ScaleARGBRowDown2 = filtering ? ScaleARGBRowDown2Box_NEON :
ScaleARGBRowDown2_NEON;
}
#endif
......@@ -788,17 +788,17 @@ static void ScaleARGBDownEven(int src_width, int src_height,
src_argb += (y >> 16) * src_stride + (x >> 16) * 4;
void (*ScaleARGBRowDownEven)(const uint8* src_argb, ptrdiff_t src_stride,
int src_step, uint8* dst_argb, int dst_width) =
filtering ? ScaleARGBRowDownEvenInt_C : ScaleARGBRowDownEven_C;
filtering ? ScaleARGBRowDownEvenBox_C : ScaleARGBRowDownEven_C;
#if defined(HAS_SCALEARGBROWDOWNEVEN_SSE2)
if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 4) &&
IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride, 16)) {
ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenInt_SSE2 :
ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_SSE2 :
ScaleARGBRowDownEven_SSE2;
}
#elif defined(HAS_SCALEARGBROWDOWNEVEN_NEON)
if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(dst_width, 4) &&
IS_ALIGNED(src_argb, 4)) {
ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenInt_NEON :
ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_NEON :
ScaleARGBRowDownEven_NEON;
}
#endif
......
......@@ -38,11 +38,11 @@ void ScaleARGBRowDown2_NEON(const uint8* src_ptr, ptrdiff_t /* src_stride */,
);
}
void ScaleARGBRowDown2Int_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width) {
asm volatile (
// change the stride to row 2 pointer
"add %1, %0 \n"
"add %1, %1, %0 \n"
"1: \n"
"vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
"vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels.
......@@ -74,11 +74,9 @@ void ScaleARGBRowDown2Int_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
// Reads 4 pixels at a time.
// Alignment requirement: src_argb 4 byte aligned.
void ScaleARGBRowDownEven_NEON(const uint8* src_argb, ptrdiff_t,
int src_stepx,
void ScaleARGBRowDownEven_NEON(const uint8* src_argb, ptrdiff_t, int src_stepx,
uint8* dst_argb, int dst_width) {
asm volatile (
"add %0, #4 \n" // point to odd pixels.
"mov r12, %3, lsl #2 \n"
".p2align 2 \n"
"1: \n"
......@@ -86,7 +84,7 @@ void ScaleARGBRowDownEven_NEON(const uint8* src_argb, ptrdiff_t,
"vld1.32 {d0[1]}, [%0], r12 \n"
"vld1.32 {d1[0]}, [%0], r12 \n"
"vld1.32 {d1[1]}, [%0], r12 \n"
"subs %2, #4 \n" // 4 pixels per loop.
"subs %2, %2, #4 \n" // 4 pixels per loop.
"vst1.8 {q0}, [%1]! \n"
"bgt 1b \n"
: "+r"(src_argb), // %0
......@@ -99,12 +97,12 @@ void ScaleARGBRowDownEven_NEON(const uint8* src_argb, ptrdiff_t,
// Reads 4 pixels at a time.
// Alignment requirement: src_argb 4 byte aligned.
void ScaleARGBRowDownEvenInt_NEON(const uint8* src_argb, ptrdiff_t src_stride,
void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb, ptrdiff_t src_stride,
int src_stepx,
uint8* dst_argb, int dst_width) {
asm volatile (
"mov r12, %4, lsl #2 \n"
"add %1, %0 \n"
"add %1, %1, %0 \n"
".p2align 2 \n"
"1: \n"
"vld1.8 {d0}, [%0], r12 \n" // Read 4 2x2 blocks -> 2x1
......@@ -125,7 +123,7 @@ void ScaleARGBRowDownEvenInt_NEON(const uint8* src_argb, ptrdiff_t src_stride,
"vadd.u16 q2, q2, q3 \n" // (e+f)_(g+h)
"vrshrn.u16 d0, q0, #2 \n" // first 2 pixels.
"vrshrn.u16 d1, q2, #2 \n" // next 2 pixels.
"subs %3, #4 \n" // 4 pixels per loop.
"subs %3, %3, #4 \n" // 4 pixels per loop.
"vst1.8 {q0}, [%2]! \n"
"bgt 1b \n"
: "+r"(src_argb), // %0
......
......@@ -76,7 +76,7 @@ void ScaleRowDown2_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t /* src_stride */,
);
}
void ScaleRowDown2Int_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
void ScaleRowDown2Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width) {
const uint8* t = src_ptr + src_stride;
......@@ -230,7 +230,7 @@ void ScaleRowDown4_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t /* src_stride */,
);
}
void ScaleRowDown4Int_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
void ScaleRowDown4Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width) {
intptr_t stride = src_stride;
const uint8* s1 = src_ptr + stride;
......@@ -355,7 +355,7 @@ void ScaleRowDown34_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t /* src_stride */,
);
}
void ScaleRowDown34_0_Int_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
void ScaleRowDown34_0_Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* d, int dst_width) {
__asm__ __volatile__ (
".set push \n"
......@@ -410,7 +410,7 @@ void ScaleRowDown34_0_Int_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
);
}
void ScaleRowDown34_1_Int_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
void ScaleRowDown34_1_Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* d, int dst_width) {
__asm__ __volatile__ (
".set push \n"
......@@ -506,7 +506,7 @@ void ScaleRowDown38_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t /* src_stride */,
);
}
void ScaleRowDown38_2_Int_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
void ScaleRowDown38_2_Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
intptr_t stride = src_stride;
const uint8* t = src_ptr + stride;
......@@ -558,7 +558,7 @@ void ScaleRowDown38_2_Int_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
);
}
void ScaleRowDown38_3_Int_MIPS_DSPR2(const uint8* src_ptr,
void ScaleRowDown38_3_Box_MIPS_DSPR2(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
intptr_t stride = src_stride;
......
......@@ -27,7 +27,7 @@ void ScaleRowDown2_NEON(const uint8* src_ptr, ptrdiff_t /* src_stride */,
asm volatile (
"1: \n"
// load even pixels into q0, odd into q1
"vld2.u8 {q0,q1}, [%0]! \n"
"vld2.u8 {q0, q1}, [%0]! \n"
"subs %2, %2, #16 \n" // 16 processed per loop
"vst1.u8 {q1}, [%1]! \n" // store odd pixels
"bgt 1b \n"
......@@ -39,14 +39,14 @@ void ScaleRowDown2_NEON(const uint8* src_ptr, ptrdiff_t /* src_stride */,
);
}
void ScaleRowDown2Int_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
void ScaleRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width) {
asm volatile (
// change the stride to row 2 pointer
"add %1, %0 \n"
"1: \n"
"vld1.u8 {q0,q1}, [%0]! \n" // load row 1 and post inc
"vld1.u8 {q2,q3}, [%1]! \n" // load row 2 and post inc
"vld1.u8 {q0, q1}, [%0]! \n" // load row 1 and post inc
"vld1.u8 {q2, q3}, [%1]! \n" // load row 2 and post inc
"subs %3, %3, #16 \n" // 16 processed per loop
"vpaddl.u8 q0, q0 \n" // row 1 add adjacent
"vpaddl.u8 q1, q1 \n"
......@@ -69,12 +69,10 @@ void ScaleRowDown4_NEON(const uint8* src_ptr, ptrdiff_t /* src_stride */,
uint8* dst_ptr, int dst_width) {
asm volatile (
"1: \n"
"vld2.u8 {d0, d1}, [%0]! \n"
"subs %2, #4 \n"
"vtrn.u8 d1, d0 \n"
"vshrn.u16 d0, q0, #8 \n"
"vst1.u32 {d0[1]}, [%1]! \n"
"bgt 1b \n"
"vld4.u8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
"subs %2, %2, #8 \n" // 8 processed per loop
"vst1.u8 {d2}, [%1]! \n"
"bgt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
......@@ -83,7 +81,7 @@ void ScaleRowDown4_NEON(const uint8* src_ptr, ptrdiff_t /* src_stride */,
);
}
void ScaleRowDown4Int_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
void ScaleRowDown4Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
asm volatile (
"add r4, %0, %3 \n"
......@@ -94,7 +92,7 @@ void ScaleRowDown4Int_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
"vld1.u8 {q1}, [r4]! \n"
"vld1.u8 {q2}, [r5]! \n"
"vld1.u8 {q3}, [%3]! \n"
"subs %2, #4 \n"
"subs %2, %2, #4 \n"
"vpaddl.u8 q0, q0 \n"
"vpadal.u8 q0, q1 \n"
"vpadal.u8 q0, q2 \n"
......@@ -121,7 +119,7 @@ void ScaleRowDown34_NEON(const uint8* src_ptr,
asm volatile (
"1: \n"
"vld4.u8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
"subs %2, #24 \n"
"subs %2, %2, #24 \n"
"vmov d2, d3 \n" // order d0, d1, d2
"vst3.u8 {d0, d1, d2}, [%1]! \n"
"bgt 1b \n"
......@@ -133,7 +131,7 @@ void ScaleRowDown34_NEON(const uint8* src_ptr,
);
}
void ScaleRowDown34_0_Int_NEON(const uint8* src_ptr,
void ScaleRowDown34_0_Box_NEON(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
asm volatile (
......@@ -142,7 +140,7 @@ void ScaleRowDown34_0_Int_NEON(const uint8* src_ptr,
"1: \n"
"vld4.u8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
"vld4.u8 {d4, d5, d6, d7}, [%3]! \n" // src line 1
"subs %2, #24 \n"
"subs %2, %2, #24 \n"
// filter src line 0 with src line 1
// expand chars to shorts to allow for room
......@@ -189,7 +187,7 @@ void ScaleRowDown34_0_Int_NEON(const uint8* src_ptr,
);
}
void ScaleRowDown34_1_Int_NEON(const uint8* src_ptr,
void ScaleRowDown34_1_Box_NEON(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
asm volatile (
......@@ -198,7 +196,7 @@ void ScaleRowDown34_1_Int_NEON(const uint8* src_ptr,
"1: \n"
"vld4.u8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
"vld4.u8 {d4, d5, d6, d7}, [%3]! \n" // src line 1
"subs %2, #24 \n"
"subs %2, %2, #24 \n"
// average src line 0 with src line 1
"vrhadd.u8 q0, q0, q2 \n"
"vrhadd.u8 q1, q1, q3 \n"
......@@ -247,7 +245,7 @@ void ScaleRowDown38_NEON(const uint8* src_ptr,
"vld1.u8 {q3}, [%3] \n"
"1: \n"
"vld1.u8 {d0, d1, d2, d3}, [%0]! \n"
"subs %2, #12 \n"
"subs %2, %2, #12 \n"
"vtbl.u8 d4, {d0, d1, d2, d3}, d6 \n"
"vtbl.u8 d5, {d0, d1, d2, d3}, d7 \n"
"vst1.u8 {d4}, [%1]! \n"
......@@ -262,7 +260,7 @@ void ScaleRowDown38_NEON(const uint8* src_ptr,
}
// 32x3 -> 12x1
void OMITFP ScaleRowDown38_3_Int_NEON(const uint8* src_ptr,
void OMITFP ScaleRowDown38_3_Box_NEON(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
asm volatile (
......@@ -280,7 +278,7 @@ void OMITFP ScaleRowDown38_3_Int_NEON(const uint8* src_ptr,
"vld4.u8 {d0, d1, d2, d3}, [%0]! \n"
"vld4.u8 {d4, d5, d6, d7}, [%3]! \n"
"vld4.u8 {d16, d17, d18, d19}, [r4]! \n"
"subs %2, #12 \n"
"subs %2, %2, #12 \n"
// Shuffle the input data around to get align the data
// so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7
......@@ -372,7 +370,7 @@ void OMITFP ScaleRowDown38_3_Int_NEON(const uint8* src_ptr,
}
// 32x2 -> 12x1
void ScaleRowDown38_2_Int_NEON(const uint8* src_ptr,
void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
asm volatile (
......@@ -387,7 +385,7 @@ void ScaleRowDown38_2_Int_NEON(const uint8* src_ptr,
// d3 = 30 70 31 71 32 72 33 73
"vld4.u8 {d0, d1, d2, d3}, [%0]! \n"
"vld4.u8 {d4, d5, d6, d7}, [%3]! \n"
"subs %2, #12 \n"
"subs %2, %2, #12 \n"
// Shuffle the input data around to get align the data
// so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7
......@@ -487,7 +485,7 @@ void ScaleFilterRows_NEON(uint8* dst_ptr,
"1: \n"
"vld1.u8 {q0}, [%1]! \n"
"vld1.u8 {q1}, [%2]! \n"
"subs %3, #16 \n"
"subs %3, %3, #16 \n"
"vmull.u8 q13, d0, d4 \n"
"vmull.u8 q14, d1, d4 \n"
"vmlal.u8 q13, d2, d5 \n"
......@@ -502,7 +500,7 @@ void ScaleFilterRows_NEON(uint8* dst_ptr,
"25: \n"
"vld1.u8 {q0}, [%1]! \n"
"vld1.u8 {q1}, [%2]! \n"
"subs %3, #16 \n"
"subs %3, %3, #16 \n"
"vrhadd.u8 q0, q1 \n"
"vrhadd.u8 q0, q1 \n"
"vst1.u8 {q0}, [%0]! \n"
......@@ -513,7 +511,7 @@ void ScaleFilterRows_NEON(uint8* dst_ptr,
"50: \n"
"vld1.u8 {q0}, [%1]! \n"
"vld1.u8 {q1}, [%2]! \n"
"subs %3, #16 \n"
"subs %3, %3, #16 \n"
"vrhadd.u8 q0, q1 \n"
"vst1.u8 {q0}, [%0]! \n"
"bgt 50b \n"
......@@ -523,7 +521,7 @@ void ScaleFilterRows_NEON(uint8* dst_ptr,
"75: \n"
"vld1.u8 {q1}, [%1]! \n"
"vld1.u8 {q0}, [%2]! \n"
"subs %3, #16 \n"
"subs %3, %3, #16 \n"
"vrhadd.u8 q0, q1 \n"
"vrhadd.u8 q0, q1 \n"
"vst1.u8 {q0}, [%0]! \n"
......@@ -533,7 +531,7 @@ void ScaleFilterRows_NEON(uint8* dst_ptr,
// Blend 100 / 0 - Copy row unchanged.
"100: \n"
"vld1.u8 {q0}, [%1]! \n"
"subs %3, #16 \n"
"subs %3, %3, #16 \n"
"vst1.u8 {q0}, [%0]! \n"
"bgt 100b \n"
......
......@@ -165,7 +165,7 @@ TEST_F(libyuvTest, BenchmarkSumSquareError_Opt) {
memset(src_b, 0, kMaxWidth);
int count = benchmark_iterations_ *
(benchmark_width_ * benchmark_height_ + kMaxWidth - 1) / kMaxWidth;
((benchmark_width_ * benchmark_height_ + kMaxWidth - 1) / kMaxWidth);
for (int i = 0; i < count; ++i) {
h1 = ComputeSumSquareError(src_a, src_b, kMaxWidth);
}
......