Commit 8ffe78ab authored by fbarchard@google.com

Scale down by 4 used 3rd pixel

BUG=232
TEST=convert.exe -f 0 faces_640x480_P420.yuv face2_160x120_P420.yuv
R=changjun.yang@intel.com

Review URL: https://webrtc-codereview.appspot.com/1579005

git-svn-id: http://libyuv.googlecode.com/svn/trunk@709 16f28f9a-4ce2-e073-06de-1de4eb20be90
parent 8b54a8f9
Name: libyuv Name: libyuv
URL: http://code.google.com/p/libyuv/ URL: http://code.google.com/p/libyuv/
Version: 708 Version: 709
License: BSD License: BSD
License File: LICENSE License File: LICENSE
......
...@@ -11,6 +11,6 @@ ...@@ -11,6 +11,6 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT #ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT
#define INCLUDE_LIBYUV_VERSION_H_ #define INCLUDE_LIBYUV_VERSION_H_
#define LIBYUV_VERSION 708 #define LIBYUV_VERSION 709
#endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT #endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT
...@@ -137,15 +137,15 @@ int I422ToI420(const uint8* src_y, int src_stride_y, ...@@ -137,15 +137,15 @@ int I422ToI420(const uint8* src_y, int src_stride_y,
#if !defined(LIBYUV_DISABLE_NEON) && \ #if !defined(LIBYUV_DISABLE_NEON) && \
(defined(__ARM_NEON__) || defined(LIBYUV_NEON)) (defined(__ARM_NEON__) || defined(LIBYUV_NEON))
#define HAS_SCALEROWDOWN2_NEON #define HAS_SCALEROWDOWN2_NEON
void ScaleRowDown2Int_NEON(const uint8* src_ptr, ptrdiff_t src_stride, void ScaleRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width); uint8* dst, int dst_width);
#elif !defined(LIBYUV_DISABLE_X86) && \ #elif !defined(LIBYUV_DISABLE_X86) && \
(defined(_M_IX86) || defined(__x86_64__) || defined(__i386__)) (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__))
void ScaleRowDown2Int_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, void ScaleRowDown2Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width); uint8* dst_ptr, int dst_width);
#endif #endif
void ScaleRowDown2Int_C(const uint8* src_ptr, ptrdiff_t src_stride, void ScaleRowDown2Box_C(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width); uint8* dst_ptr, int dst_width);
LIBYUV_API LIBYUV_API
...@@ -173,11 +173,11 @@ int I444ToI420(const uint8* src_y, int src_stride_y, ...@@ -173,11 +173,11 @@ int I444ToI420(const uint8* src_y, int src_stride_y,
} }
int halfwidth = (width + 1) >> 1; int halfwidth = (width + 1) >> 1;
void (*ScaleRowDown2)(const uint8* src_ptr, ptrdiff_t src_stride, void (*ScaleRowDown2)(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) = ScaleRowDown2Int_C; uint8* dst_ptr, int dst_width) = ScaleRowDown2Box_C;
#if defined(HAS_SCALEROWDOWN2_NEON) #if defined(HAS_SCALEROWDOWN2_NEON)
if (TestCpuFlag(kCpuHasNEON) && if (TestCpuFlag(kCpuHasNEON) &&
IS_ALIGNED(halfwidth, 16)) { IS_ALIGNED(halfwidth, 16)) {
ScaleRowDown2 = ScaleRowDown2Int_NEON; ScaleRowDown2 = ScaleRowDown2Box_NEON;
} }
#elif defined(HAS_SCALEROWDOWN2_SSE2) #elif defined(HAS_SCALEROWDOWN2_SSE2)
if (TestCpuFlag(kCpuHasSSE2) && if (TestCpuFlag(kCpuHasSSE2) &&
...@@ -186,7 +186,7 @@ int I444ToI420(const uint8* src_y, int src_stride_y, ...@@ -186,7 +186,7 @@ int I444ToI420(const uint8* src_y, int src_stride_y,
IS_ALIGNED(src_v, 16) && IS_ALIGNED(src_stride_v, 16) && IS_ALIGNED(src_v, 16) && IS_ALIGNED(src_stride_v, 16) &&
IS_ALIGNED(dst_u, 16) && IS_ALIGNED(dst_stride_u, 16) && IS_ALIGNED(dst_u, 16) && IS_ALIGNED(dst_stride_u, 16) &&
IS_ALIGNED(dst_v, 16) && IS_ALIGNED(dst_stride_v, 16)) { IS_ALIGNED(dst_v, 16) && IS_ALIGNED(dst_stride_v, 16)) {
ScaleRowDown2 = ScaleRowDown2Int_SSE2; ScaleRowDown2 = ScaleRowDown2Box_SSE2;
} }
#endif #endif
......
This diff is collapsed.
...@@ -34,12 +34,12 @@ static __inline int Abs(int v) { ...@@ -34,12 +34,12 @@ static __inline int Abs(int v) {
void ScaleARGBRowDownEven_NEON(const uint8* src_argb, int src_stride, void ScaleARGBRowDownEven_NEON(const uint8* src_argb, int src_stride,
int src_stepx, int src_stepx,
uint8* dst_argb, int dst_width); uint8* dst_argb, int dst_width);
void ScaleARGBRowDownEvenInt_NEON(const uint8* src_argb, int src_stride, void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb, int src_stride,
int src_stepx, int src_stepx,
uint8* dst_argb, int dst_width); uint8* dst_argb, int dst_width);
void ScaleARGBRowDown2_NEON(const uint8* src_ptr, ptrdiff_t /* src_stride */, void ScaleARGBRowDown2_NEON(const uint8* src_ptr, ptrdiff_t /* src_stride */,
uint8* dst, int dst_width); uint8* dst, int dst_width);
void ScaleARGBRowDown2Int_NEON(const uint8* src_ptr, ptrdiff_t src_stride, void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width); uint8* dst, int dst_width);
#endif #endif
...@@ -75,7 +75,7 @@ static void ScaleARGBRowDown2_SSE2(const uint8* src_argb, ...@@ -75,7 +75,7 @@ static void ScaleARGBRowDown2_SSE2(const uint8* src_argb,
// Blends 8x2 rectangle to 4x1. // Blends 8x2 rectangle to 4x1.
// Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned. // Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned.
__declspec(naked) __declspec(align(16)) __declspec(naked) __declspec(align(16))
static void ScaleARGBRowDown2Int_SSE2(const uint8* src_argb, static void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb,
ptrdiff_t src_stride, ptrdiff_t src_stride,
uint8* dst_argb, int dst_width) { uint8* dst_argb, int dst_width) {
__asm { __asm {
...@@ -150,7 +150,7 @@ void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride, ...@@ -150,7 +150,7 @@ void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
// Blends four 2x2 to 4x1. // Blends four 2x2 to 4x1.
// Alignment requirement: dst_argb 16 byte aligned. // Alignment requirement: dst_argb 16 byte aligned.
__declspec(naked) __declspec(align(16)) __declspec(naked) __declspec(align(16))
static void ScaleARGBRowDownEvenInt_SSE2(const uint8* src_argb, static void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb,
ptrdiff_t src_stride, ptrdiff_t src_stride,
int src_stepx, int src_stepx,
uint8* dst_argb, int dst_width) { uint8* dst_argb, int dst_width) {
...@@ -366,7 +366,7 @@ static void ScaleARGBRowDown2_SSE2(const uint8* src_argb, ...@@ -366,7 +366,7 @@ static void ScaleARGBRowDown2_SSE2(const uint8* src_argb,
); );
} }
static void ScaleARGBRowDown2Int_SSE2(const uint8* src_argb, static void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb,
ptrdiff_t src_stride, ptrdiff_t src_stride,
uint8* dst_argb, int dst_width) { uint8* dst_argb, int dst_width) {
asm volatile ( asm volatile (
...@@ -438,7 +438,7 @@ void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride, ...@@ -438,7 +438,7 @@ void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
// Blends four 2x2 to 4x1. // Blends four 2x2 to 4x1.
// Alignment requirement: dst_argb 16 byte aligned. // Alignment requirement: dst_argb 16 byte aligned.
static void ScaleARGBRowDownEvenInt_SSE2(const uint8* src_argb, static void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb,
ptrdiff_t src_stride, int src_stepx, ptrdiff_t src_stride, int src_stepx,
uint8* dst_argb, int dst_width) { uint8* dst_argb, int dst_width) {
intptr_t src_stepx_x4 = static_cast<intptr_t>(src_stepx); intptr_t src_stepx_x4 = static_cast<intptr_t>(src_stepx);
...@@ -644,7 +644,7 @@ static void ScaleARGBRowDown2_C(const uint8* src_argb, ...@@ -644,7 +644,7 @@ static void ScaleARGBRowDown2_C(const uint8* src_argb,
} }
} }
static void ScaleARGBRowDown2Int_C(const uint8* src_argb, ptrdiff_t src_stride, static void ScaleARGBRowDown2Box_C(const uint8* src_argb, ptrdiff_t src_stride,
uint8* dst_argb, int dst_width) { uint8* dst_argb, int dst_width) {
for (int x = 0; x < dst_width; ++x) { for (int x = 0; x < dst_width; ++x) {
dst_argb[0] = (src_argb[0] + src_argb[4] + dst_argb[0] = (src_argb[0] + src_argb[4] +
...@@ -677,7 +677,7 @@ void ScaleARGBRowDownEven_C(const uint8* src_argb, ptrdiff_t /* src_stride */, ...@@ -677,7 +677,7 @@ void ScaleARGBRowDownEven_C(const uint8* src_argb, ptrdiff_t /* src_stride */,
} }
} }
static void ScaleARGBRowDownEvenInt_C(const uint8* src_argb, static void ScaleARGBRowDownEvenBox_C(const uint8* src_argb,
ptrdiff_t src_stride, ptrdiff_t src_stride,
int src_stepx, int src_stepx,
uint8* dst_argb, int dst_width) { uint8* dst_argb, int dst_width) {
...@@ -748,18 +748,18 @@ static void ScaleARGBDown2(int /* src_width */, int /* src_height */, ...@@ -748,18 +748,18 @@ static void ScaleARGBDown2(int /* src_width */, int /* src_height */,
int row_stride = src_stride * (dy >> 16); int row_stride = src_stride * (dy >> 16);
void (*ScaleARGBRowDown2)(const uint8* src_argb, ptrdiff_t src_stride, void (*ScaleARGBRowDown2)(const uint8* src_argb, ptrdiff_t src_stride,
uint8* dst_argb, int dst_width) = uint8* dst_argb, int dst_width) =
filtering ? ScaleARGBRowDown2Int_C : ScaleARGBRowDown2_C; filtering ? ScaleARGBRowDown2Box_C : ScaleARGBRowDown2_C;
#if defined(HAS_SCALEARGBROWDOWN2_SSE2) #if defined(HAS_SCALEARGBROWDOWN2_SSE2)
if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 4) && if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 4) &&
IS_ALIGNED(src_argb, 16) && IS_ALIGNED(row_stride, 16) && IS_ALIGNED(src_argb, 16) && IS_ALIGNED(row_stride, 16) &&
IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride, 16)) { IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride, 16)) {
ScaleARGBRowDown2 = filtering ? ScaleARGBRowDown2Int_SSE2 : ScaleARGBRowDown2 = filtering ? ScaleARGBRowDown2Box_SSE2 :
ScaleARGBRowDown2_SSE2; ScaleARGBRowDown2_SSE2;
} }
#elif defined(HAS_SCALEARGBROWDOWN2_NEON) #elif defined(HAS_SCALEARGBROWDOWN2_NEON)
if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(dst_width, 8) && if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(dst_width, 8) &&
IS_ALIGNED(src_argb, 4) && IS_ALIGNED(row_stride, 4)) { IS_ALIGNED(src_argb, 4) && IS_ALIGNED(row_stride, 4)) {
ScaleARGBRowDown2 = filtering ? ScaleARGBRowDown2Int_NEON : ScaleARGBRowDown2 = filtering ? ScaleARGBRowDown2Box_NEON :
ScaleARGBRowDown2_NEON; ScaleARGBRowDown2_NEON;
} }
#endif #endif
...@@ -788,17 +788,17 @@ static void ScaleARGBDownEven(int src_width, int src_height, ...@@ -788,17 +788,17 @@ static void ScaleARGBDownEven(int src_width, int src_height,
src_argb += (y >> 16) * src_stride + (x >> 16) * 4; src_argb += (y >> 16) * src_stride + (x >> 16) * 4;
void (*ScaleARGBRowDownEven)(const uint8* src_argb, ptrdiff_t src_stride, void (*ScaleARGBRowDownEven)(const uint8* src_argb, ptrdiff_t src_stride,
int src_step, uint8* dst_argb, int dst_width) = int src_step, uint8* dst_argb, int dst_width) =
filtering ? ScaleARGBRowDownEvenInt_C : ScaleARGBRowDownEven_C; filtering ? ScaleARGBRowDownEvenBox_C : ScaleARGBRowDownEven_C;
#if defined(HAS_SCALEARGBROWDOWNEVEN_SSE2) #if defined(HAS_SCALEARGBROWDOWNEVEN_SSE2)
if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 4) && if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 4) &&
IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride, 16)) { IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride, 16)) {
ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenInt_SSE2 : ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_SSE2 :
ScaleARGBRowDownEven_SSE2; ScaleARGBRowDownEven_SSE2;
} }
#elif defined(HAS_SCALEARGBROWDOWNEVEN_NEON) #elif defined(HAS_SCALEARGBROWDOWNEVEN_NEON)
if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(dst_width, 4) && if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(dst_width, 4) &&
IS_ALIGNED(src_argb, 4)) { IS_ALIGNED(src_argb, 4)) {
ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenInt_NEON : ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_NEON :
ScaleARGBRowDownEven_NEON; ScaleARGBRowDownEven_NEON;
} }
#endif #endif
......
...@@ -38,11 +38,11 @@ void ScaleARGBRowDown2_NEON(const uint8* src_ptr, ptrdiff_t /* src_stride */, ...@@ -38,11 +38,11 @@ void ScaleARGBRowDown2_NEON(const uint8* src_ptr, ptrdiff_t /* src_stride */,
); );
} }
void ScaleARGBRowDown2Int_NEON(const uint8* src_ptr, ptrdiff_t src_stride, void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width) { uint8* dst, int dst_width) {
asm volatile ( asm volatile (
// change the stride to row 2 pointer // change the stride to row 2 pointer
"add %1, %0 \n" "add %1, %1, %0 \n"
"1: \n" "1: \n"
"vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
"vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels. "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels.
...@@ -74,11 +74,9 @@ void ScaleARGBRowDown2Int_NEON(const uint8* src_ptr, ptrdiff_t src_stride, ...@@ -74,11 +74,9 @@ void ScaleARGBRowDown2Int_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
// Reads 4 pixels at a time. // Reads 4 pixels at a time.
// Alignment requirement: src_argb 4 byte aligned. // Alignment requirement: src_argb 4 byte aligned.
void ScaleARGBRowDownEven_NEON(const uint8* src_argb, ptrdiff_t, void ScaleARGBRowDownEven_NEON(const uint8* src_argb, ptrdiff_t, int src_stepx,
int src_stepx,
uint8* dst_argb, int dst_width) { uint8* dst_argb, int dst_width) {
asm volatile ( asm volatile (
"add %0, #4 \n" // point to odd pixels.
"mov r12, %3, lsl #2 \n" "mov r12, %3, lsl #2 \n"
".p2align 2 \n" ".p2align 2 \n"
"1: \n" "1: \n"
...@@ -86,7 +84,7 @@ void ScaleARGBRowDownEven_NEON(const uint8* src_argb, ptrdiff_t, ...@@ -86,7 +84,7 @@ void ScaleARGBRowDownEven_NEON(const uint8* src_argb, ptrdiff_t,
"vld1.32 {d0[1]}, [%0], r12 \n" "vld1.32 {d0[1]}, [%0], r12 \n"
"vld1.32 {d1[0]}, [%0], r12 \n" "vld1.32 {d1[0]}, [%0], r12 \n"
"vld1.32 {d1[1]}, [%0], r12 \n" "vld1.32 {d1[1]}, [%0], r12 \n"
"subs %2, #4 \n" // 4 pixels per loop. "subs %2, %2, #4 \n" // 4 pixels per loop.
"vst1.8 {q0}, [%1]! \n" "vst1.8 {q0}, [%1]! \n"
"bgt 1b \n" "bgt 1b \n"
: "+r"(src_argb), // %0 : "+r"(src_argb), // %0
...@@ -99,12 +97,12 @@ void ScaleARGBRowDownEven_NEON(const uint8* src_argb, ptrdiff_t, ...@@ -99,12 +97,12 @@ void ScaleARGBRowDownEven_NEON(const uint8* src_argb, ptrdiff_t,
// Reads 4 pixels at a time. // Reads 4 pixels at a time.
// Alignment requirement: src_argb 4 byte aligned. // Alignment requirement: src_argb 4 byte aligned.
void ScaleARGBRowDownEvenInt_NEON(const uint8* src_argb, ptrdiff_t src_stride, void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb, ptrdiff_t src_stride,
int src_stepx, int src_stepx,
uint8* dst_argb, int dst_width) { uint8* dst_argb, int dst_width) {
asm volatile ( asm volatile (
"mov r12, %4, lsl #2 \n" "mov r12, %4, lsl #2 \n"
"add %1, %0 \n" "add %1, %1, %0 \n"
".p2align 2 \n" ".p2align 2 \n"
"1: \n" "1: \n"
"vld1.8 {d0}, [%0], r12 \n" // Read 4 2x2 blocks -> 2x1 "vld1.8 {d0}, [%0], r12 \n" // Read 4 2x2 blocks -> 2x1
...@@ -125,7 +123,7 @@ void ScaleARGBRowDownEvenInt_NEON(const uint8* src_argb, ptrdiff_t src_stride, ...@@ -125,7 +123,7 @@ void ScaleARGBRowDownEvenInt_NEON(const uint8* src_argb, ptrdiff_t src_stride,
"vadd.u16 q2, q2, q3 \n" // (e+f)_(g+h) "vadd.u16 q2, q2, q3 \n" // (e+f)_(g+h)
"vrshrn.u16 d0, q0, #2 \n" // first 2 pixels. "vrshrn.u16 d0, q0, #2 \n" // first 2 pixels.
"vrshrn.u16 d1, q2, #2 \n" // next 2 pixels. "vrshrn.u16 d1, q2, #2 \n" // next 2 pixels.
"subs %3, #4 \n" // 4 pixels per loop. "subs %3, %3, #4 \n" // 4 pixels per loop.
"vst1.8 {q0}, [%2]! \n" "vst1.8 {q0}, [%2]! \n"
"bgt 1b \n" "bgt 1b \n"
: "+r"(src_argb), // %0 : "+r"(src_argb), // %0
......
...@@ -76,7 +76,7 @@ void ScaleRowDown2_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t /* src_stride */, ...@@ -76,7 +76,7 @@ void ScaleRowDown2_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t /* src_stride */,
); );
} }
void ScaleRowDown2Int_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, void ScaleRowDown2Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width) { uint8* dst, int dst_width) {
const uint8* t = src_ptr + src_stride; const uint8* t = src_ptr + src_stride;
...@@ -230,7 +230,7 @@ void ScaleRowDown4_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t /* src_stride */, ...@@ -230,7 +230,7 @@ void ScaleRowDown4_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t /* src_stride */,
); );
} }
void ScaleRowDown4Int_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, void ScaleRowDown4Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width) { uint8* dst, int dst_width) {
intptr_t stride = src_stride; intptr_t stride = src_stride;
const uint8* s1 = src_ptr + stride; const uint8* s1 = src_ptr + stride;
...@@ -355,7 +355,7 @@ void ScaleRowDown34_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t /* src_stride */, ...@@ -355,7 +355,7 @@ void ScaleRowDown34_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t /* src_stride */,
); );
} }
void ScaleRowDown34_0_Int_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, void ScaleRowDown34_0_Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* d, int dst_width) { uint8* d, int dst_width) {
__asm__ __volatile__ ( __asm__ __volatile__ (
".set push \n" ".set push \n"
...@@ -410,7 +410,7 @@ void ScaleRowDown34_0_Int_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, ...@@ -410,7 +410,7 @@ void ScaleRowDown34_0_Int_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
); );
} }
void ScaleRowDown34_1_Int_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, void ScaleRowDown34_1_Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* d, int dst_width) { uint8* d, int dst_width) {
__asm__ __volatile__ ( __asm__ __volatile__ (
".set push \n" ".set push \n"
...@@ -506,7 +506,7 @@ void ScaleRowDown38_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t /* src_stride */, ...@@ -506,7 +506,7 @@ void ScaleRowDown38_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t /* src_stride */,
); );
} }
void ScaleRowDown38_2_Int_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, void ScaleRowDown38_2_Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) { uint8* dst_ptr, int dst_width) {
intptr_t stride = src_stride; intptr_t stride = src_stride;
const uint8* t = src_ptr + stride; const uint8* t = src_ptr + stride;
...@@ -558,7 +558,7 @@ void ScaleRowDown38_2_Int_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, ...@@ -558,7 +558,7 @@ void ScaleRowDown38_2_Int_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
); );
} }
void ScaleRowDown38_3_Int_MIPS_DSPR2(const uint8* src_ptr, void ScaleRowDown38_3_Box_MIPS_DSPR2(const uint8* src_ptr,
ptrdiff_t src_stride, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) { uint8* dst_ptr, int dst_width) {
intptr_t stride = src_stride; intptr_t stride = src_stride;
......
...@@ -27,7 +27,7 @@ void ScaleRowDown2_NEON(const uint8* src_ptr, ptrdiff_t /* src_stride */, ...@@ -27,7 +27,7 @@ void ScaleRowDown2_NEON(const uint8* src_ptr, ptrdiff_t /* src_stride */,
asm volatile ( asm volatile (
"1: \n" "1: \n"
// load even pixels into q0, odd into q1 // load even pixels into q0, odd into q1
"vld2.u8 {q0,q1}, [%0]! \n" "vld2.u8 {q0, q1}, [%0]! \n"
"subs %2, %2, #16 \n" // 16 processed per loop "subs %2, %2, #16 \n" // 16 processed per loop
"vst1.u8 {q1}, [%1]! \n" // store odd pixels "vst1.u8 {q1}, [%1]! \n" // store odd pixels
"bgt 1b \n" "bgt 1b \n"
...@@ -39,14 +39,14 @@ void ScaleRowDown2_NEON(const uint8* src_ptr, ptrdiff_t /* src_stride */, ...@@ -39,14 +39,14 @@ void ScaleRowDown2_NEON(const uint8* src_ptr, ptrdiff_t /* src_stride */,
); );
} }
void ScaleRowDown2Int_NEON(const uint8* src_ptr, ptrdiff_t src_stride, void ScaleRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width) { uint8* dst, int dst_width) {
asm volatile ( asm volatile (
// change the stride to row 2 pointer // change the stride to row 2 pointer
"add %1, %0 \n" "add %1, %0 \n"
"1: \n" "1: \n"
"vld1.u8 {q0,q1}, [%0]! \n" // load row 1 and post inc "vld1.u8 {q0, q1}, [%0]! \n" // load row 1 and post inc
"vld1.u8 {q2,q3}, [%1]! \n" // load row 2 and post inc "vld1.u8 {q2, q3}, [%1]! \n" // load row 2 and post inc
"subs %3, %3, #16 \n" // 16 processed per loop "subs %3, %3, #16 \n" // 16 processed per loop
"vpaddl.u8 q0, q0 \n" // row 1 add adjacent "vpaddl.u8 q0, q0 \n" // row 1 add adjacent
"vpaddl.u8 q1, q1 \n" "vpaddl.u8 q1, q1 \n"
...@@ -69,12 +69,10 @@ void ScaleRowDown4_NEON(const uint8* src_ptr, ptrdiff_t /* src_stride */, ...@@ -69,12 +69,10 @@ void ScaleRowDown4_NEON(const uint8* src_ptr, ptrdiff_t /* src_stride */,
uint8* dst_ptr, int dst_width) { uint8* dst_ptr, int dst_width) {
asm volatile ( asm volatile (
"1: \n" "1: \n"
"vld2.u8 {d0, d1}, [%0]! \n" "vld4.u8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
"subs %2, #4 \n" "subs %2, %2, #8 \n" // 8 processed per loop
"vtrn.u8 d1, d0 \n" "vst1.u8 {d2}, [%1]! \n"
"vshrn.u16 d0, q0, #8 \n" "bgt 1b \n"
"vst1.u32 {d0[1]}, [%1]! \n"
"bgt 1b \n"
: "+r"(src_ptr), // %0 : "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1 "+r"(dst_ptr), // %1
"+r"(dst_width) // %2 "+r"(dst_width) // %2
...@@ -83,7 +81,7 @@ void ScaleRowDown4_NEON(const uint8* src_ptr, ptrdiff_t /* src_stride */, ...@@ -83,7 +81,7 @@ void ScaleRowDown4_NEON(const uint8* src_ptr, ptrdiff_t /* src_stride */,
); );
} }
void ScaleRowDown4Int_NEON(const uint8* src_ptr, ptrdiff_t src_stride, void ScaleRowDown4Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) { uint8* dst_ptr, int dst_width) {
asm volatile ( asm volatile (
"add r4, %0, %3 \n" "add r4, %0, %3 \n"
...@@ -94,7 +92,7 @@ void ScaleRowDown4Int_NEON(const uint8* src_ptr, ptrdiff_t src_stride, ...@@ -94,7 +92,7 @@ void ScaleRowDown4Int_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
"vld1.u8 {q1}, [r4]! \n" "vld1.u8 {q1}, [r4]! \n"
"vld1.u8 {q2}, [r5]! \n" "vld1.u8 {q2}, [r5]! \n"
"vld1.u8 {q3}, [%3]! \n" "vld1.u8 {q3}, [%3]! \n"
"subs %2, #4 \n" "subs %2, %2, #4 \n"
"vpaddl.u8 q0, q0 \n" "vpaddl.u8 q0, q0 \n"
"vpadal.u8 q0, q1 \n" "vpadal.u8 q0, q1 \n"
"vpadal.u8 q0, q2 \n" "vpadal.u8 q0, q2 \n"
...@@ -121,7 +119,7 @@ void ScaleRowDown34_NEON(const uint8* src_ptr, ...@@ -121,7 +119,7 @@ void ScaleRowDown34_NEON(const uint8* src_ptr,
asm volatile ( asm volatile (
"1: \n" "1: \n"
"vld4.u8 {d0, d1, d2, d3}, [%0]! \n" // src line 0 "vld4.u8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
"subs %2, #24 \n" "subs %2, %2, #24 \n"
"vmov d2, d3 \n" // order d0, d1, d2 "vmov d2, d3 \n" // order d0, d1, d2
"vst3.u8 {d0, d1, d2}, [%1]! \n" "vst3.u8 {d0, d1, d2}, [%1]! \n"
"bgt 1b \n" "bgt 1b \n"
...@@ -133,7 +131,7 @@ void ScaleRowDown34_NEON(const uint8* src_ptr, ...@@ -133,7 +131,7 @@ void ScaleRowDown34_NEON(const uint8* src_ptr,
); );
} }
void ScaleRowDown34_0_Int_NEON(const uint8* src_ptr, void ScaleRowDown34_0_Box_NEON(const uint8* src_ptr,
ptrdiff_t src_stride, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) { uint8* dst_ptr, int dst_width) {
asm volatile ( asm volatile (
...@@ -142,7 +140,7 @@ void ScaleRowDown34_0_Int_NEON(const uint8* src_ptr, ...@@ -142,7 +140,7 @@ void ScaleRowDown34_0_Int_NEON(const uint8* src_ptr,
"1: \n" "1: \n"
"vld4.u8 {d0, d1, d2, d3}, [%0]! \n" // src line 0 "vld4.u8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
"vld4.u8 {d4, d5, d6, d7}, [%3]! \n" // src line 1 "vld4.u8 {d4, d5, d6, d7}, [%3]! \n" // src line 1
"subs %2, #24 \n" "subs %2, %2, #24 \n"
// filter src line 0 with src line 1 // filter src line 0 with src line 1
// expand chars to shorts to allow for room // expand chars to shorts to allow for room
...@@ -189,7 +187,7 @@ void ScaleRowDown34_0_Int_NEON(const uint8* src_ptr, ...@@ -189,7 +187,7 @@ void ScaleRowDown34_0_Int_NEON(const uint8* src_ptr,
); );
} }
void ScaleRowDown34_1_Int_NEON(const uint8* src_ptr, void ScaleRowDown34_1_Box_NEON(const uint8* src_ptr,
ptrdiff_t src_stride, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) { uint8* dst_ptr, int dst_width) {
asm volatile ( asm volatile (
...@@ -198,7 +196,7 @@ void ScaleRowDown34_1_Int_NEON(const uint8* src_ptr, ...@@ -198,7 +196,7 @@ void ScaleRowDown34_1_Int_NEON(const uint8* src_ptr,
"1: \n" "1: \n"
"vld4.u8 {d0, d1, d2, d3}, [%0]! \n" // src line 0 "vld4.u8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
"vld4.u8 {d4, d5, d6, d7}, [%3]! \n" // src line 1 "vld4.u8 {d4, d5, d6, d7}, [%3]! \n" // src line 1
"subs %2, #24 \n" "subs %2, %2, #24 \n"
// average src line 0 with src line 1 // average src line 0 with src line 1
"vrhadd.u8 q0, q0, q2 \n" "vrhadd.u8 q0, q0, q2 \n"
"vrhadd.u8 q1, q1, q3 \n" "vrhadd.u8 q1, q1, q3 \n"
...@@ -247,7 +245,7 @@ void ScaleRowDown38_NEON(const uint8* src_ptr, ...@@ -247,7 +245,7 @@ void ScaleRowDown38_NEON(const uint8* src_ptr,
"vld1.u8 {q3}, [%3] \n" "vld1.u8 {q3}, [%3] \n"
"1: \n" "1: \n"
"vld1.u8 {d0, d1, d2, d3}, [%0]! \n" "vld1.u8 {d0, d1, d2, d3}, [%0]! \n"
"subs %2, #12 \n" "subs %2, %2, #12 \n"
"vtbl.u8 d4, {d0, d1, d2, d3}, d6 \n" "vtbl.u8 d4, {d0, d1, d2, d3}, d6 \n"
"vtbl.u8 d5, {d0, d1, d2, d3}, d7 \n" "vtbl.u8 d5, {d0, d1, d2, d3}, d7 \n"
"vst1.u8 {d4}, [%1]! \n" "vst1.u8 {d4}, [%1]! \n"
...@@ -262,7 +260,7 @@ void ScaleRowDown38_NEON(const uint8* src_ptr, ...@@ -262,7 +260,7 @@ void ScaleRowDown38_NEON(const uint8* src_ptr,
} }
// 32x3 -> 12x1 // 32x3 -> 12x1
void OMITFP ScaleRowDown38_3_Int_NEON(const uint8* src_ptr, void OMITFP ScaleRowDown38_3_Box_NEON(const uint8* src_ptr,
ptrdiff_t src_stride, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) { uint8* dst_ptr, int dst_width) {
asm volatile ( asm volatile (
...@@ -280,7 +278,7 @@ void OMITFP ScaleRowDown38_3_Int_NEON(const uint8* src_ptr, ...@@ -280,7 +278,7 @@ void OMITFP ScaleRowDown38_3_Int_NEON(const uint8* src_ptr,
"vld4.u8 {d0, d1, d2, d3}, [%0]! \n" "vld4.u8 {d0, d1, d2, d3}, [%0]! \n"
"vld4.u8 {d4, d5, d6, d7}, [%3]! \n" "vld4.u8 {d4, d5, d6, d7}, [%3]! \n"
"vld4.u8 {d16, d17, d18, d19}, [r4]! \n" "vld4.u8 {d16, d17, d18, d19}, [r4]! \n"
"subs %2, #12 \n" "subs %2, %2, #12 \n"
// Shuffle the input data around to get align the data // Shuffle the input data around to get align the data
// so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7 // so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7
...@@ -372,7 +370,7 @@ void OMITFP ScaleRowDown38_3_Int_NEON(const uint8* src_ptr, ...@@ -372,7 +370,7 @@ void OMITFP ScaleRowDown38_3_Int_NEON(const uint8* src_ptr,
} }
// 32x2 -> 12x1 // 32x2 -> 12x1
void ScaleRowDown38_2_Int_NEON(const uint8* src_ptr, void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr,
ptrdiff_t src_stride, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) { uint8* dst_ptr, int dst_width) {
asm volatile ( asm volatile (
...@@ -387,7 +385,7 @@ void ScaleRowDown38_2_Int_NEON(const uint8* src_ptr, ...@@ -387,7 +385,7 @@ void ScaleRowDown38_2_Int_NEON(const uint8* src_ptr,
// d3 = 30 70 31 71 32 72 33 73 // d3 = 30 70 31 71 32 72 33 73
"vld4.u8 {d0, d1, d2, d3}, [%0]! \n" "vld4.u8 {d0, d1, d2, d3}, [%0]! \n"
"vld4.u8 {d4, d5, d6, d7}, [%3]! \n" "vld4.u8 {d4, d5, d6, d7}, [%3]! \n"
"subs %2, #12 \n" "subs %2, %2, #12 \n"
// Shuffle the input data around to get align the data // Shuffle the input data around to get align the data
// so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7 // so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7
...@@ -487,7 +485,7 @@ void ScaleFilterRows_NEON(uint8* dst_ptr, ...@@ -487,7 +485,7 @@ void ScaleFilterRows_NEON(uint8* dst_ptr,
"1: \n" "1: \n"
"vld1.u8 {q0}, [%1]! \n" "vld1.u8 {q0}, [%1]! \n"
"vld1.u8 {q1}, [%2]! \n" "vld1.u8 {q1}, [%2]! \n"
"subs %3, #16 \n" "subs %3, %3, #16 \n"
"vmull.u8 q13, d0, d4 \n" "vmull.u8 q13, d0, d4 \n"
"vmull.u8 q14, d1, d4 \n" "vmull.u8 q14, d1, d4 \n"
"vmlal.u8 q13, d2, d5 \n" "vmlal.u8 q13, d2, d5 \n"
...@@ -502,7 +500,7 @@ void ScaleFilterRows_NEON(uint8* dst_ptr, ...@@ -502,7 +500,7 @@ void ScaleFilterRows_NEON(uint8* dst_ptr,
"25: \n" "25: \n"
"vld1.u8 {q0}, [%1]! \n" "vld1.u8 {q0}, [%1]! \n"
"vld1.u8 {q1}, [%2]! \n" "vld1.u8 {q1}, [%2]! \n"
"subs %3, #16 \n" "subs %3, %3, #16 \n"
"vrhadd.u8 q0, q1 \n" "vrhadd.u8 q0, q1 \n"
"vrhadd.u8 q0, q1 \n" "vrhadd.u8 q0, q1 \n"
"vst1.u8 {q0}, [%0]! \n" "vst1.u8 {q0}, [%0]! \n"
...@@ -513,7 +511,7 @@ void ScaleFilterRows_NEON(uint8* dst_ptr, ...@@ -513,7 +511,7 @@ void ScaleFilterRows_NEON(uint8* dst_ptr,
"50: \n" "50: \n"
"vld1.u8 {q0}, [%1]! \n" "vld1.u8 {q0}, [%1]! \n"
"vld1.u8 {q1}, [%2]! \n" "vld1.u8 {q1}, [%2]! \n"
"subs %3, #16 \n" "subs %3, %3, #16 \n"
"vrhadd.u8 q0, q1 \n" "vrhadd.u8 q0, q1 \n"
"vst1.u8 {q0}, [%0]! \n" "vst1.u8 {q0}, [%0]! \n"
"bgt 50b \n" "bgt 50b \n"
...@@ -523,7 +521,7 @@ void ScaleFilterRows_NEON(uint8* dst_ptr, ...@@ -523,7 +521,7 @@ void ScaleFilterRows_NEON(uint8* dst_ptr,
"75: \n" "75: \n"
"vld1.u8 {q1}, [%1]! \n" "vld1.u8 {q1}, [%1]! \n"
"vld1.u8 {q0}, [%2]! \n" "vld1.u8 {q0}, [%2]! \n"
"subs %3, #16 \n" "subs %3, %3, #16 \n"
"vrhadd.u8 q0, q1 \n" "vrhadd.u8 q0, q1 \n"
"vrhadd.u8 q0, q1 \n" "vrhadd.u8 q0, q1 \n"
"vst1.u8 {q0}, [%0]! \n" "vst1.u8 {q0}, [%0]! \n"
...@@ -533,7 +531,7 @@ void ScaleFilterRows_NEON(uint8* dst_ptr, ...@@ -533,7 +531,7 @@ void ScaleFilterRows_NEON(uint8* dst_ptr,
// Blend 100 / 0 - Copy row unchanged. // Blend 100 / 0 - Copy row unchanged.
"100: \n" "100: \n"
"vld1.u8 {q0}, [%1]! \n" "vld1.u8 {q0}, [%1]! \n"
"subs %3, #16 \n" "subs %3, %3, #16 \n"
"vst1.u8 {q0}, [%0]! \n" "vst1.u8 {q0}, [%0]! \n"
"bgt 100b \n" "bgt 100b \n"
......
...@@ -165,7 +165,7 @@ TEST_F(libyuvTest, BenchmarkSumSquareError_Opt) { ...@@ -165,7 +165,7 @@ TEST_F(libyuvTest, BenchmarkSumSquareError_Opt) {
memset(src_b, 0, kMaxWidth); memset(src_b, 0, kMaxWidth);
int count = benchmark_iterations_ * int count = benchmark_iterations_ *
(benchmark_width_ * benchmark_height_ + kMaxWidth - 1) / kMaxWidth; ((benchmark_width_ * benchmark_height_ + kMaxWidth - 1) / kMaxWidth);
for (int i = 0; i < count; ++i) { for (int i = 0; i < count; ++i) {
h1 = ComputeSumSquareError(src_a, src_b, kMaxWidth); h1 = ComputeSumSquareError(src_a, src_b, kMaxWidth);
} }
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment