Commit 05416e2d authored by fbarchard@google.com

Box filter for YUV: use rows with an accumulation buffer for better memory behavior.

Box filter for YUV: use rows with an accumulation buffer for better memory behavior.  The old code walked columns, accumulating into registers and storing each result once; reading down columns was slow from a memory point of view.  The new code processes one row of source at a time, adding it into an accumulation buffer after every row.  The accumulation buffer is small and should fit in cache.  Before each accumulation of N rows, the buffer needs to be reset to zero.  If the memset is a bottleneck, it would be faster to do the first row without an add, storing directly to the accumulation buffer, and then add for the remaining rows.
BUG=425
TESTED=out\release\libyuv_unittest --gtest_filter=*ScaleTo1x1*
R=harryjin@google.com

Review URL: https://webrtc-codereview.appspot.com/52659004

git-svn-id: http://libyuv.googlecode.com/svn/trunk@1428 16f28f9a-4ce2-e073-06de-1de4eb20be90
parent b07de879
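
The heart of the change, before reading the diff: each destination row is produced by clearing a small uint16 accumulator row, adding boxheight source rows into it, then averaging columns. A minimal standalone sketch of that flow (illustrative only; BoxFilterBandSketch is a hypothetical name, and the real code uses the per-platform ScaleAddRow kernels shown below):

    #include <stdint.h>
    #include <string.h>

    // Accumulate 'boxheight' 8-bit source rows into a uint16 row buffer.
    // Each pass reads one row sequentially, which is the cache-friendly
    // access pattern the commit message describes.
    static void BoxFilterBandSketch(const uint8_t* src, int src_stride,
                                    int src_width, int boxheight,
                                    uint16_t* row_accum) {
      memset(row_accum, 0, src_width * sizeof(uint16_t));  // reset per box
      for (int k = 0; k < boxheight; ++k) {
        for (int x = 0; x < src_width; ++x) {
          row_accum[x] += src[x];  // add this row's samples
        }
        src += src_stride;  // advance to the next source row
      }
      // A column pass (ScaleAddCols in libyuv) then averages row_accum
      // down to dst_width output pixels.
    }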
@@ -30,13 +30,11 @@ extern "C" {
 #define VISUALC_HAS_AVX2 1
 #endif  // VisualStudio >= 2012
 // The following are available on all x86 platforms:
 #if !defined(LIBYUV_DISABLE_X86) && \
     (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__))
 #define HAS_FIXEDDIV1_X86
 #define HAS_FIXEDDIV_X86
-#define HAS_SCALEADDROWS_SSE2
 #define HAS_SCALEARGBCOLS_SSE2
 #define HAS_SCALEARGBCOLSUP2_SSE2
 #define HAS_SCALEARGBFILTERCOLS_SSSE3
@@ -50,17 +48,21 @@ extern "C" {
 #define HAS_SCALEROWDOWN4_SSE2
 #endif
-// The following are available on VS2012.
+// The following are available on VS2012:
 #if !defined(LIBYUV_DISABLE_X86) && defined(VISUALC_HAS_AVX2)
-#define HAS_SCALEADDROWS_AVX2
+#define HAS_SCALEADDROW_AVX2
 #define HAS_SCALEROWDOWN2_AVX2
 #define HAS_SCALEROWDOWN4_AVX2
 #endif
+// The following are available on Visual C:
+#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && !defined(__clang__)
+#define HAS_SCALEADDROW_SSE2
+#endif
 // The following are available on Neon platforms:
 #if !defined(LIBYUV_DISABLE_NEON) && !defined(__native_client__) && \
     (defined(__ARM_NEON__) || defined(LIBYUV_NEON) || defined(__aarch64__))
-#define HAS_SCALEADDROWS_NEON
+#define HAS_SCALEADDROW_NEON
 #define HAS_SCALEARGBCOLS_NEON
 #define HAS_SCALEARGBROWDOWN2_NEON
 #define HAS_SCALEARGBROWDOWNEVEN_NEON
@@ -183,10 +185,8 @@ void ScaleRowDown38_2_Box_C(const uint8* src_ptr, ptrdiff_t src_stride,
                             uint8* dst_ptr, int dst_width);
 void ScaleRowDown38_2_Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
                                uint16* dst_ptr, int dst_width);
-void ScaleAddRows_C(const uint8* src_ptr, ptrdiff_t src_stride,
-                    uint16* dst_ptr, int src_width, int src_height);
-void ScaleAddRows_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
-                       uint32* dst_ptr, int src_width, int src_height);
+void ScaleAddRow_C(const uint8* src_ptr, uint16* dst_ptr, int src_width);
+void ScaleAddRow_16_C(const uint16* src_ptr, uint32* dst_ptr, int src_width);
 void ScaleARGBRowDown2_C(const uint8* src_argb,
                          ptrdiff_t src_stride,
                          uint8* dst_argb, int dst_width);
@@ -289,14 +289,10 @@ void ScaleRowDown38_2_Box_Any_SSSE3(const uint8* src_ptr,
                                     ptrdiff_t src_stride,
                                     uint8* dst_ptr, int dst_width);
-void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
-                       uint16* dst_ptr, int src_width, int src_height);
-void ScaleAddRows_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
-                       uint16* dst_ptr, int src_width, int src_height);
-void ScaleAddRows_Any_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
-                           uint16* dst_ptr, int src_width, int src_height);
-void ScaleAddRows_Any_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
-                           uint16* dst_ptr, int src_width, int src_height);
+void ScaleAddRow_SSE2(const uint8* src_ptr, uint16* dst_ptr, int src_width);
+void ScaleAddRow_AVX2(const uint8* src_ptr, uint16* dst_ptr, int src_width);
+void ScaleAddRow_Any_SSE2(const uint8* src_ptr, uint16* dst_ptr, int src_width);
+void ScaleAddRow_Any_AVX2(const uint8* src_ptr, uint16* dst_ptr, int src_width);
 void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
                            int dst_width, int x, int dx);
@@ -442,10 +438,8 @@ void ScaleRowDown38_3_Box_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
 void ScaleRowDown38_2_Box_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
                                    uint8* dst_ptr, int dst_width);
-void ScaleAddRows_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
-                       uint16* dst_ptr, int src_width, int src_height);
-void ScaleAddRows_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
-                           uint16* dst_ptr, int src_width, int src_height);
+void ScaleAddRow_NEON(const uint8* src_ptr, uint16* dst_ptr, int src_width);
+void ScaleAddRow_Any_NEON(const uint8* src_ptr, uint16* dst_ptr, int src_width);
 void ScaleFilterCols_NEON(uint8* dst_ptr, const uint8* src_ptr,
                           int dst_width, int x, int dx);
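
Note the shape of this API change: the old ScaleAddRows kernels took src_stride and src_height and owned the entire 2-D accumulation, while the new ScaleAddRow kernels touch exactly one source row, so the caller (ScalePlaneBox below) owns the row loop and the accumulator reset.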
...
@@ -733,7 +733,7 @@ static void ScalePlaneBox(int src_width, int src_height,
                           int dst_width, int dst_height,
                           int src_stride, int dst_stride,
                           const uint8* src_ptr, uint8* dst_ptr) {
-  int j;
+  int j, k;
   // Initial source x/y coordinate and step values as 16.16 fixed point.
   int x = 0;
   int y = 0;
@@ -750,29 +750,29 @@ static void ScalePlaneBox(int src_width, int src_height,
                        const uint16* src_ptr, uint8* dst_ptr) =
       (dx & 0xffff) ? ScaleAddCols2_C:
       ((dx != 0x10000) ? ScaleAddCols1_C : ScaleAddCols0_C);
-  void (*ScaleAddRows)(const uint8* src_ptr, ptrdiff_t src_stride,
-      uint16* dst_ptr, int src_width, int src_height) = ScaleAddRows_C;
-#if defined(HAS_SCALEADDROWS_SSE2)
+  void (*ScaleAddRow)(const uint8* src_ptr, uint16* dst_ptr, int src_width) =
+      ScaleAddRow_C;
+#if defined(HAS_SCALEADDROW_SSE2)
   if (TestCpuFlag(kCpuHasSSE2)) {
-    ScaleAddRows = ScaleAddRows_Any_SSE2;
+    ScaleAddRow = ScaleAddRow_Any_SSE2;
     if (IS_ALIGNED(src_width, 16)) {
-      ScaleAddRows = ScaleAddRows_SSE2;
+      ScaleAddRow = ScaleAddRow_SSE2;
     }
   }
 #endif
-#if defined(HAS_SCALEADDROWS_AVX2)
+#if defined(HAS_SCALEADDROW_AVX2)
   if (TestCpuFlag(kCpuHasAVX2)) {
-    ScaleAddRows = ScaleAddRows_Any_AVX2;
+    ScaleAddRow = ScaleAddRow_Any_AVX2;
     if (IS_ALIGNED(src_width, 32)) {
-      ScaleAddRows = ScaleAddRows_AVX2;
+      ScaleAddRow = ScaleAddRow_AVX2;
     }
   }
 #endif
-#if defined(HAS_SCALEADDROWS_NEON)
+#if defined(HAS_SCALEADDROW_NEON)
   if (TestCpuFlag(kCpuHasNEON)) {
-    ScaleAddRows = ScaleAddRows_Any_NEON;
+    ScaleAddRow = ScaleAddRow_Any_NEON;
     if (IS_ALIGNED(src_width, 16)) {
-      ScaleAddRows = ScaleAddRows_NEON;
+      ScaleAddRow = ScaleAddRow_NEON;
     }
   }
 #endif
@@ -786,7 +786,11 @@ static void ScalePlaneBox(int src_width, int src_height,
       y = max_y;
     }
     boxheight = MIN1((y >> 16) - iy);
-    ScaleAddRows(src, src_stride, (uint16*)(row16), src_width, boxheight);
+    memset(row16, 0, src_width * 2);
+    for (k = 0; k < boxheight; ++k) {
+      ScaleAddRow(src, (uint16 *)(row16), src_width);
+      src += src_stride;
+    }
     ScaleAddCols(dst_width, boxheight, x, dx, (uint16*)(row16), dst_ptr);
     dst_ptr += dst_stride;
   }
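
If the per-box memset ever shows up in profiles, the commit message suggests storing the first row instead of clearing. A sketch of that variant against the loop above (ScaleStoreRow is a hypothetical helper, not part of libyuv; MIN1 guarantees boxheight >= 1, so the first row always exists):

    // Hypothetical: widen one 8-bit row into the uint16 accumulator,
    // overwriting old contents, which makes the memset unnecessary.
    static void ScaleStoreRow(const uint8* src, uint16* dst, int width) {
      int x;
      for (x = 0; x < width; ++x) {
        dst[x] = src[x];
      }
    }

    // Per-box loop without the memset:
    ScaleStoreRow(src, (uint16*)(row16), src_width);
    src += src_stride;
    for (k = 1; k < boxheight; ++k) {
      ScaleAddRow(src, (uint16*)(row16), src_width);
      src += src_stride;
    }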
@@ -798,7 +802,7 @@ static void ScalePlaneBox_16(int src_width, int src_height,
                              int dst_width, int dst_height,
                              int src_stride, int dst_stride,
                              const uint16* src_ptr, uint16* dst_ptr) {
-  int j;
+  int j, k;
   // Initial source x/y coordinate and step values as 16.16 fixed point.
   int x = 0;
   int y = 0;
@@ -814,12 +818,12 @@ static void ScalePlaneBox_16(int src_width, int src_height,
   void (*ScaleAddCols)(int dst_width, int boxheight, int x, int dx,
                        const uint32* src_ptr, uint16* dst_ptr) =
       (dx & 0xffff) ? ScaleAddCols2_16_C: ScaleAddCols1_16_C;
-  void (*ScaleAddRows)(const uint16* src_ptr, ptrdiff_t src_stride,
-      uint32* dst_ptr, int src_width, int src_height) = ScaleAddRows_16_C;
-#if defined(HAS_SCALEADDROWS_16_SSE2)
+  void (*ScaleAddRow)(const uint16* src_ptr, uint32* dst_ptr, int src_width) =
+      ScaleAddRow_16_C;
+#if defined(HAS_SCALEADDROW_16_SSE2)
   if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(src_width, 16)) {
-    ScaleAddRows = ScaleAddRows_16_SSE2;
+    ScaleAddRow = ScaleAddRow_16_SSE2;
   }
 #endif
@@ -832,7 +836,11 @@ static void ScalePlaneBox_16(int src_width, int src_height,
       y = max_y;
     }
     boxheight = MIN1((y >> 16) - iy);
-    ScaleAddRows(src, src_stride, (uint32*)(row32), src_width, boxheight);
+    memset(row32, 0, src_width * 4);
+    for (k = 0; k < boxheight; ++k) {
+      ScaleAddRow(src, (uint32 *)(row32), src_width);
+      src += src_stride;
+    }
     ScaleAddCols(dst_width, boxheight, x, dx, (uint32*)(row32), dst_ptr);
     dst_ptr += dst_stride;
   }
...
@@ -169,25 +169,23 @@ SDAANY(ScaleARGBRowDownEvenBox_Any_NEON, ScaleARGBRowDownEvenBox_NEON,
 #endif
 // Add rows box filter scale down.
-#define SAANY(NAMEANY, SCALEADDROWS_SIMD, SCALEADDROWS_C, MASK) \
-    void NAMEANY(const uint8* src_ptr, ptrdiff_t src_stride, \
-                 uint16* dst_ptr, int src_width, int src_height) { \
+#define SAANY(NAMEANY, SCALEADDROW_SIMD, SCALEADDROW_C, MASK) \
+    void NAMEANY(const uint8* src_ptr, uint16* dst_ptr, int src_width) { \
       int n = src_width & ~MASK; \
       if (n > 0) { \
-        SCALEADDROWS_SIMD(src_ptr, src_stride, dst_ptr, n, src_height); \
+        SCALEADDROW_SIMD(src_ptr, dst_ptr, n); \
      } \
-      SCALEADDROWS_C(src_ptr + n, src_stride, \
-                     dst_ptr + n, src_width & MASK, src_height); \
+      SCALEADDROW_C(src_ptr + n, dst_ptr + n, src_width & MASK); \
     }
-#ifdef HAS_SCALEADDROWS_SSE2
-SAANY(ScaleAddRows_Any_SSE2, ScaleAddRows_SSE2, ScaleAddRows_C, 15)
+#ifdef HAS_SCALEADDROW_SSE2
+SAANY(ScaleAddRow_Any_SSE2, ScaleAddRow_SSE2, ScaleAddRow_C, 15)
 #endif
-#ifdef HAS_SCALEADDROWS_AVX2
-SAANY(ScaleAddRows_Any_AVX2, ScaleAddRows_AVX2, ScaleAddRows_C, 31)
+#ifdef HAS_SCALEADDROW_AVX2
+SAANY(ScaleAddRow_Any_AVX2, ScaleAddRow_AVX2, ScaleAddRow_C, 31)
 #endif
-#ifdef HAS_SCALEADDROWS_NEON
-SAANY(ScaleAddRows_Any_NEON, ScaleAddRows_NEON, ScaleAddRows_C, 15)
+#ifdef HAS_SCALEADDROW_NEON
+SAANY(ScaleAddRow_Any_NEON, ScaleAddRow_NEON, ScaleAddRow_C, 15)
 #endif
 #undef SAANY
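
For reference, expanding SAANY(ScaleAddRow_Any_SSE2, ScaleAddRow_SSE2, ScaleAddRow_C, 15) yields, modulo whitespace:

    void ScaleAddRow_Any_SSE2(const uint8* src_ptr, uint16* dst_ptr,
                              int src_width) {
      int n = src_width & ~15;  // round width down to a multiple of 16
      if (n > 0) {
        ScaleAddRow_SSE2(src_ptr, dst_ptr, n);  // SIMD on the bulk
      }
      ScaleAddRow_C(src_ptr + n, dst_ptr + n, src_width & 15);  // C tail
    }

So a src_width of 100, for example, runs the SSE2 kernel on 96 pixels and the C kernel on the remaining 4, which is why ScalePlaneBox can pick the Any variant for widths that are not multiples of 16.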
...
@@ -621,39 +621,31 @@ void ScaleRowDown38_2_Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
   }
 }
 
-void ScaleAddRows_C(const uint8* src_ptr, ptrdiff_t src_stride,
-                    uint16* dst_ptr, int src_width, int src_height) {
+void ScaleAddRow_C(const uint8* src_ptr, uint16* dst_ptr, int src_width) {
   int x;
   assert(src_width > 0);
-  assert(src_height > 0);
-  for (x = 0; x < src_width; ++x) {
-    const uint8* s = src_ptr + x;
-    unsigned int sum = 0u;
-    int y;
-    for (y = 0; y < src_height; ++y) {
-      sum += s[0];
-      s += src_stride;
-    }
-    // TODO(fbarchard): Consider limiting height to 256 to avoid overflow.
-    dst_ptr[x] = sum < 65535u ? sum : 65535u;
+  for (x = 0; x < src_width - 1; x += 2) {
+    dst_ptr[0] += src_ptr[0];
+    dst_ptr[1] += src_ptr[1];
+    src_ptr += 2;
+    dst_ptr += 2;
+  }
+  if (src_width & 1) {
+    dst_ptr[0] += src_ptr[0];
   }
 }
 
-void ScaleAddRows_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
-                       uint32* dst_ptr, int src_width, int src_height) {
+void ScaleAddRow_16_C(const uint16* src_ptr, uint32* dst_ptr, int src_width) {
   int x;
   assert(src_width > 0);
-  assert(src_height > 0);
-  for (x = 0; x < src_width; ++x) {
-    const uint16* s = src_ptr + x;
-    unsigned int sum = 0u;
-    int y;
-    for (y = 0; y < src_height; ++y) {
-      sum += s[0];
-      s += src_stride;
-    }
-    // No risk of overflow here now
-    dst_ptr[x] = sum;
+  for (x = 0; x < src_width - 1; x += 2) {
+    dst_ptr[0] += src_ptr[0];
+    dst_ptr[1] += src_ptr[1];
+    src_ptr += 2;
+    dst_ptr += 2;
+  }
+  if (src_width & 1) {
+    dst_ptr[0] += src_ptr[0];
   }
 }
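
One property of the new C kernels worth noting: dst_ptr[0] += src_ptr[0] on a uint16 accumulator does not saturate, whereas the old code clamped to 65535 and the SSE2 kernel uses paddusw. With 8-bit sources the accumulator can absorb at most 65535 / 255 = 257 rows before it could wrap, which is the same concern the deleted TODO raised. A defensive usage sketch (hypothetical wrapper, assuming libyuv's uint8/uint16 typedefs and <assert.h>):

    // Accumulate 'height' rows into 'accum', asserting the uint16 headroom
    // bound: 255 * 257 == 65535 is the largest sum that still fits.
    static void ScaleAddRowsChecked(const uint8* src, ptrdiff_t stride,
                                    uint16* accum, int width, int height) {
      int k;
      assert(height <= 257);
      for (k = 0; k < height; ++k) {
        ScaleAddRow_C(src, accum, width);
        src += stride;
      }
    }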
...
@@ -800,104 +800,61 @@ void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr,
   }
 }
 
-// Reads 16xN bytes and produces 16 shorts at a time.
+// Reads 16 bytes and accumulates to 16 shorts at a time.
 __declspec(naked)
-void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
-                       uint16* dst_ptr, int src_width, int src_height) {
+void ScaleAddRow_SSE2(const uint8* src_ptr, uint16* dst_ptr, int src_width) {
   __asm {
-    push       esi
-    push       edi
-    push       ebx
-    push       ebp
-    mov        esi, [esp + 16 + 4]   // src_ptr
-    mov        edx, [esp + 16 + 8]   // src_stride
-    mov        edi, [esp + 16 + 12]  // dst_ptr
-    mov        ecx, [esp + 16 + 16]  // dst_width
-    mov        ebx, [esp + 16 + 20]  // height
-    mov        eax, esi              // row pointer
-    mov        ebp, ebx              // height
-    pxor       xmm0, xmm0            // clear accumulators
-    pxor       xmm1, xmm1
-    pxor       xmm4, xmm4
+    mov        eax, [esp + 4]   // src_ptr
+    mov        edx, [esp + 8]   // dst_ptr
+    mov        ecx, [esp + 12]  // src_width
+    pxor       xmm5, xmm5
 
   // sum rows
   xloop:
-    movdqu     xmm2, [eax]       // read 16 pixels
-    lea        eax, [eax + edx]  // advance to next row
-    movdqa     xmm3, xmm2
-    punpcklbw  xmm2, xmm4
-    punpckhbw  xmm3, xmm4
+    movdqu     xmm3, [eax]  // read 16 bytes
+    lea        eax, [eax + 16]
+    movdqu     xmm0, [edx]  // read 16 words from destination
+    movdqu     xmm1, [edx + 16]
+    movdqa     xmm2, xmm3
+    punpcklbw  xmm2, xmm5
+    punpckhbw  xmm3, xmm5
     paddusw    xmm0, xmm2  // sum 16 words
     paddusw    xmm1, xmm3
-    sub        ebp, 1
-    jg         xloop
-    movdqu     [edi], xmm0
-    movdqu     [edi + 16], xmm1
-    lea        edi, [edi + 32]  // dst_ptr += 16
-    lea        esi, [esi + 16]  // src_ptr += 16
-    mov        eax, esi         // row pointer
-    mov        ebp, ebx         // height
-    pxor       xmm0, xmm0       // clear accumulators
-    pxor       xmm1, xmm1
+    movdqu     [edx], xmm0  // write 16 words to destination
+    movdqu     [edx + 16], xmm1
+    lea        edx, [edx + 32]
    sub        ecx, 16
    jg         xloop
-    pop        ebp
-    pop        ebx
-    pop        edi
-    pop        esi
     ret
   }
 }
 
-// Reads 32xN bytes and produces 32 shorts at a time.
+// Reads 32 bytes and accumulates to 32 shorts at a time.
 __declspec(naked)
-void ScaleAddRows_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
-                       uint16* dst_ptr, int src_width, int src_height) {
+void ScaleAddRow_AVX2(const uint8* src_ptr, uint16* dst_ptr, int src_width) {
   __asm {
-    push       esi
-    push       edi
-    push       ebx
-    push       ebp
-    mov        esi, [esp + 16 + 4]   // src_ptr
-    mov        edx, [esp + 16 + 8]   // src_stride
-    mov        edi, [esp + 16 + 12]  // dst_ptr
-    mov        ecx, [esp + 16 + 16]  // dst_width
-    mov        ebx, [esp + 16 + 20]  // height
-    mov        eax, esi              // row pointer
-    mov        ebp, ebx              // height
-    vpxor      ymm0, ymm0, ymm0      // clear accumulators
-    vpxor      ymm1, ymm1, ymm1
-    vpxor      ymm4, ymm4, ymm4
+    mov        eax, [esp + 4]   // src_ptr
+    mov        edx, [esp + 8]   // dst_ptr
+    mov        ecx, [esp + 12]  // src_width
+    vpxor      ymm5, ymm5, ymm5
 
   // sum rows
   xloop:
-    vmovdqu    ymm2, [eax]       // read 16 pixels
-    vpermq     ymm2, ymm2, 0xd8  // unmutate for vpunpck
-    lea        eax, [eax + edx]  // advance to next row
-    vpunpckhbw ymm3, ymm2, ymm4
-    vpunpcklbw ymm2, ymm2, ymm4
+    vmovdqu    ymm3, [eax]  // read 32 bytes
+    vpermq     ymm3, ymm3, 0xd8  // unmutate for vpunpck
+    lea        eax, [eax + 32]
+    vmovdqu    ymm0, [edx]  // read 32 words from destination
+    vmovdqu    ymm1, [edx + 32]
+    vpunpcklbw ymm2, ymm3, ymm5
+    vpunpckhbw ymm3, ymm3, ymm5
     vpaddusw   ymm0, ymm0, ymm2  // sum 16 words
     vpaddusw   ymm1, ymm1, ymm3
-    sub        ebp, 1
-    jg         xloop
-    vmovdqu    [edi], ymm0
-    vmovdqu    [edi + 32], ymm1
-    lea        edi, [edi + 64]  // dst_ptr
-    lea        esi, [esi + 32]  // src_ptr
-    mov        eax, esi         // row pointer
-    mov        ebp, ebx         // height
-    vpxor      ymm0, ymm0, ymm0  // clear accumulators
-    vpxor      ymm1, ymm1, ymm1
+    vmovdqu    [edx], ymm0  // write 32 words to destination
+    vmovdqu    [edx + 32], ymm1
+    lea        edx, [edx + 64]
     sub        ecx, 32
     jg         xloop
-    pop        ebp
-    pop        ebx
-    pop        edi
-    pop        esi
     vzeroupper
     ret
   }
 }
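
For readers who prefer intrinsics to MASM, the new ScaleAddRow_SSE2 is equivalent to the following sketch (for clarity only; libyuv ships the __asm version above, and this assumes src_width is a positive multiple of 16):

    #include <emmintrin.h>  // SSE2
    #include <stdint.h>

    void ScaleAddRow_SSE2_Sketch(const uint8_t* src_ptr, uint16_t* dst_ptr,
                                 int src_width) {
      const __m128i zero = _mm_setzero_si128();
      for (int x = 0; x < src_width; x += 16) {
        __m128i s = _mm_loadu_si128((const __m128i*)(src_ptr + x));
        __m128i lo = _mm_unpacklo_epi8(s, zero);  // bytes 0..7 -> words
        __m128i hi = _mm_unpackhi_epi8(s, zero);  // bytes 8..15 -> words
        __m128i d0 = _mm_loadu_si128((const __m128i*)(dst_ptr + x));
        __m128i d1 = _mm_loadu_si128((const __m128i*)(dst_ptr + x + 8));
        d0 = _mm_adds_epu16(d0, lo);  // saturating add, like paddusw
        d1 = _mm_adds_epu16(d1, hi);
        _mm_storeu_si128((__m128i*)(dst_ptr + x), d0);
        _mm_storeu_si128((__m128i*)(dst_ptr + x + 8), d1);
      }
    }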
...
@@ -690,6 +690,8 @@ TEST_F(libyuvTest, FMT_A##To##FMT_PLANAR##N) { \
 TESTATOPLANAR(ARGB, 4, 1, I420, 2, 2, 4)
 #if defined(__arm__) || defined (__aarch64__)
+// arm version subsamples by summing 4 pixels then multiplying by matrix with
+// 4x smaller coefficients which are rounded to nearest integer.
 TESTATOPLANAR(ARGB, 4, 1, J420, 2, 2, 4)
 #else
 TESTATOPLANAR(ARGB, 4, 1, J420, 2, 2, 0)
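
As a rough illustration of why the looser tolerance is plausible (coefficient value hypothetical): if a Y coefficient is 29/256, the arm path applies round(29/4) = 7 to the 4-pixel sum, so the effective coefficient is 28/256 rather than 29/256; on a saturated channel that is an error of about 255/256, roughly 1 LSB per coefficient, and with three coefficients plus final rounding a per-pixel difference of up to 4 is credible.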