Commit ae55e418 authored by Frank Barchard

use rounding in scale down by 2

When scaling down by 2, the box filter should round consistently:
(a + b + c + d + 2) / 4
The C version did, but the SSE2 version computed two nested averages:
avg(avg(a, b), avg(c, d))
This change computes the full sum, then rounds once. The new row
functions use pmaddubsw, so they require SSSE3 rather than SSE2.

R=dhrosa@google.com, harryjin@google.com
BUG=libyuv:447,libyuv:527

Review URL: https://codereview.chromium.org/1513183004 .
parent 71b60123
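
For illustration, here is a scalar C sketch (not part of the commit) contrasting the two formulas. The pavgb/pavgw instructions round up, so nesting two of them can overshoot the correctly rounded average by one:

#include <stdint.h>
#include <stdio.h>

// pavgb/pavgw semantics: average with rounding up.
static uint16_t avg_round(uint16_t a, uint16_t b) { return (a + b + 1) >> 1; }

// Old SSE2 box filter: two nested averages, each rounding up.
static uint8_t box_old(uint8_t a, uint8_t b, uint8_t c, uint8_t d) {
  return (uint8_t)avg_round(avg_round(a, b), avg_round(c, d));
}

// New SSSE3 box filter: exact 16-bit sum, then one consistent rounding,
// matching the C reference (a + b + c + d + 2) / 4.
static uint8_t box_new(uint8_t a, uint8_t b, uint8_t c, uint8_t d) {
  uint16_t sum = (uint16_t)(a + b + c + d);  // pmaddubsw + paddw
  return (uint8_t)avg_round(sum >> 1, 0);    // psrlw by 1, then pavgw with 0
}

int main(void) {
  // (0 + 0 + 0 + 1 + 2) / 4 = 0, but the nested averages produce 1.
  printf("old=%d new=%d\n", (int)box_old(0, 0, 0, 1), (int)box_new(0, 0, 0, 1));
  return 0;
}

This off-by-one is what the unit test below previously tolerated with EXPECT_NEAR(..., 1) and can now check exactly with EXPECT_EQ.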
 Name: libyuv
 URL: http://code.google.com/p/libyuv/
-Version: 1553
+Version: 1554
 License: BSD
 License File: LICENSE
...
@@ -56,7 +56,7 @@ extern "C" {
 #define HAS_SCALEARGBROWDOWNEVEN_SSE2
 #define HAS_SCALECOLSUP2_SSE2
 #define HAS_SCALEFILTERCOLS_SSSE3
-#define HAS_SCALEROWDOWN2_SSE2
+#define HAS_SCALEROWDOWN2_SSSE3
 #define HAS_SCALEROWDOWN34_SSSE3
 #define HAS_SCALEROWDOWN38_SSSE3
 #define HAS_SCALEROWDOWN4_SSE2
@@ -232,11 +232,11 @@ void ScaleARGBFilterCols64_C(uint8* dst_argb, const uint8* src_argb,
                              int dst_width, int x, int dx);

 // Specialized scalers for x86.
-void ScaleRowDown2_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
-                        uint8* dst_ptr, int dst_width);
-void ScaleRowDown2Linear_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
-                              uint8* dst_ptr, int dst_width);
-void ScaleRowDown2Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
-                           uint8* dst_ptr, int dst_width);
+void ScaleRowDown2_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
+                         uint8* dst_ptr, int dst_width);
+void ScaleRowDown2Linear_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
+                               uint8* dst_ptr, int dst_width);
+void ScaleRowDown2Box_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
+                            uint8* dst_ptr, int dst_width);
 void ScaleRowDown2_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
                         uint8* dst_ptr, int dst_width);
@@ -269,11 +269,11 @@ void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr,
 void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr,
                                 ptrdiff_t src_stride,
                                 uint8* dst_ptr, int dst_width);
-void ScaleRowDown2_Any_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
-                            uint8* dst_ptr, int dst_width);
-void ScaleRowDown2Linear_Any_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
-                                  uint8* dst_ptr, int dst_width);
-void ScaleRowDown2Box_Any_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
-                               uint8* dst_ptr, int dst_width);
+void ScaleRowDown2_Any_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
+                             uint8* dst_ptr, int dst_width);
+void ScaleRowDown2Linear_Any_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
+                                   uint8* dst_ptr, int dst_width);
+void ScaleRowDown2Box_Any_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
+                                uint8* dst_ptr, int dst_width);
 void ScaleRowDown2_Any_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
                             uint8* dst_ptr, int dst_width);
...
@@ -11,6 +11,6 @@
 #ifndef INCLUDE_LIBYUV_VERSION_H_  // NOLINT
 #define INCLUDE_LIBYUV_VERSION_H_

-#define LIBYUV_VERSION 1553
+#define LIBYUV_VERSION 1554

 #endif  // INCLUDE_LIBYUV_VERSION_H_  NOLINT
@@ -699,11 +699,11 @@ int I420Blend(const uint8* src_y0, int src_stride_y0,
     }
   }
 #endif
-#if defined(HAS_SCALEROWDOWN2_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2)) {
-    ScaleRowDown2 = ScaleRowDown2Box_Any_SSE2;
+#if defined(HAS_SCALEROWDOWN2_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    ScaleRowDown2 = ScaleRowDown2Box_Any_SSSE3;
     if (IS_ALIGNED(halfwidth, 16)) {
-      ScaleRowDown2 = ScaleRowDown2Box_SSE2;
+      ScaleRowDown2 = ScaleRowDown2Box_SSSE3;
     }
   }
 #endif
...
@@ -61,15 +61,15 @@ static void ScalePlaneDown2(int src_width, int src_height,
     }
   }
 #endif
-#if defined(HAS_SCALEROWDOWN2_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2)) {
-    ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_Any_SSE2 :
-        (filtering == kFilterLinear ? ScaleRowDown2Linear_Any_SSE2 :
-        ScaleRowDown2Box_Any_SSE2);
+#if defined(HAS_SCALEROWDOWN2_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_Any_SSSE3 :
+        (filtering == kFilterLinear ? ScaleRowDown2Linear_Any_SSSE3 :
+        ScaleRowDown2Box_Any_SSSE3);
     if (IS_ALIGNED(dst_width, 16)) {
-      ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_SSE2 :
-          (filtering == kFilterLinear ? ScaleRowDown2Linear_SSE2 :
-          ScaleRowDown2Box_SSE2);
+      ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_SSSE3 :
+          (filtering == kFilterLinear ? ScaleRowDown2Linear_SSSE3 :
+          ScaleRowDown2Box_SSSE3);
     }
   }
 #endif
...
@@ -55,11 +55,11 @@ CANY(ScaleARGBFilterCols_Any_NEON, ScaleARGBFilterCols_NEON,
                    dst_ptr + n * BPP, r);                                    \
   }

-#ifdef HAS_SCALEROWDOWN2_SSE2
-SDANY(ScaleRowDown2_Any_SSE2, ScaleRowDown2_SSE2, ScaleRowDown2_C, 2, 1, 15)
-SDANY(ScaleRowDown2Linear_Any_SSE2, ScaleRowDown2Linear_SSE2,
+#ifdef HAS_SCALEROWDOWN2_SSSE3
+SDANY(ScaleRowDown2_Any_SSSE3, ScaleRowDown2_SSSE3, ScaleRowDown2_C, 2, 1, 15)
+SDANY(ScaleRowDown2Linear_Any_SSSE3, ScaleRowDown2Linear_SSSE3,
       ScaleRowDown2Linear_C, 2, 1, 15)
-SDANY(ScaleRowDown2Box_Any_SSE2, ScaleRowDown2Box_SSE2, ScaleRowDown2Box_C,
+SDANY(ScaleRowDown2Box_Any_SSSE3, ScaleRowDown2Box_SSSE3, ScaleRowDown2Box_C,
       2, 1, 15)
 #endif
 #ifdef HAS_SCALEROWDOWN2_AVX2
...
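
As the SDANY fragment above shows, an Any wrapper (here FACTOR = 2 source pixels per destination pixel, BPP = 1, MASK = 15) runs the SIMD kernel on the bulk of the row and hands the remainder to the C kernel. Roughly, as a sketch rather than the literal macro expansion, and with uint8_t standing in for libyuv's uint8:

#include <stddef.h>
#include <stdint.h>

// Row kernels as declared in the header above.
void ScaleRowDown2_SSSE3(const uint8_t* src_ptr, ptrdiff_t src_stride,
                         uint8_t* dst_ptr, int dst_width);
void ScaleRowDown2_C(const uint8_t* src_ptr, ptrdiff_t src_stride,
                     uint8_t* dst_ptr, int dst_width);

// Approximate shape of SDANY(ScaleRowDown2_Any_SSSE3, ScaleRowDown2_SSSE3,
//                            ScaleRowDown2_C, 2, 1, 15).
void ScaleRowDown2_Any_SSSE3_sketch(const uint8_t* src_ptr,
                                    ptrdiff_t src_stride,
                                    uint8_t* dst_ptr, int dst_width) {
  int r = dst_width & 15;  // remainder pixels, finished by the C kernel
  int n = dst_width - r;   // multiple-of-16 bulk, done by the SIMD kernel
  if (n > 0) {
    ScaleRowDown2_SSSE3(src_ptr, src_stride, dst_ptr, n);
  }
  ScaleRowDown2_C(src_ptr + n * 2, src_stride, dst_ptr + n, r);
}

This is why ScalePlaneDown2 and I420Blend above select the non-Any kernels only when the width is a multiple of 16.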
@@ -98,7 +98,7 @@ static uvec16 kScaleAb2 =
 // Generated using gcc disassembly on Visual C object file:
 // objdump -D yuvscaler.obj >yuvscaler.txt
-void ScaleRowDown2_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
-                        uint8* dst_ptr, int dst_width) {
+void ScaleRowDown2_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
+                         uint8* dst_ptr, int dst_width) {
   asm volatile (
     LABELALIGN
@@ -120,25 +120,23 @@ void ScaleRowDown2_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
   );
 }

-void ScaleRowDown2Linear_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
-                              uint8* dst_ptr, int dst_width) {
+void ScaleRowDown2Linear_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
+                               uint8* dst_ptr, int dst_width) {
   asm volatile (
-    "pcmpeqb   %%xmm5,%%xmm5                   \n"
-    "psrlw     $0x8,%%xmm5                     \n"
+    "pcmpeqb   %%xmm4,%%xmm4                   \n"
+    "psrlw     $0xf,%%xmm4                     \n"
+    "packuswb  %%xmm4,%%xmm4                   \n"
+    "pxor      %%xmm5,%%xmm5                   \n"
     LABELALIGN
   "1:                                          \n"
     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
     "movdqu    " MEMACCESS2(0x10, 0) ",%%xmm1  \n"
     "lea       " MEMLEA(0x20,0) ",%0           \n"
-    "movdqa    %%xmm0,%%xmm2                   \n"
-    "psrlw     $0x8,%%xmm0                     \n"
-    "movdqa    %%xmm1,%%xmm3                   \n"
-    "psrlw     $0x8,%%xmm1                     \n"
-    "pand      %%xmm5,%%xmm2                   \n"
-    "pand      %%xmm5,%%xmm3                   \n"
-    "pavgw     %%xmm2,%%xmm0                   \n"
-    "pavgw     %%xmm3,%%xmm1                   \n"
+    "pmaddubsw %%xmm4,%%xmm0                   \n"
+    "pmaddubsw %%xmm4,%%xmm1                   \n"
+    "pavgw     %%xmm5,%%xmm0                   \n"
+    "pavgw     %%xmm5,%%xmm1                   \n"
     "packuswb  %%xmm1,%%xmm0                   \n"
     "movdqu    %%xmm0," MEMACCESS(1) "         \n"
     "lea       " MEMLEA(0x10,1) ",%1           \n"
@@ -147,15 +145,17 @@ void ScaleRowDown2Linear_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
   : "+r"(src_ptr),    // %0
     "+r"(dst_ptr),    // %1
     "+r"(dst_width)   // %2
-  :: "memory", "cc", "xmm0", "xmm1", "xmm5"
+  :: "memory", "cc", "xmm0", "xmm1", "xmm4", "xmm5"
   );
 }

-void ScaleRowDown2Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
-                           uint8* dst_ptr, int dst_width) {
+void ScaleRowDown2Box_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
+                            uint8* dst_ptr, int dst_width) {
   asm volatile (
-    "pcmpeqb   %%xmm5,%%xmm5                   \n"
-    "psrlw     $0x8,%%xmm5                     \n"
+    "pcmpeqb   %%xmm4,%%xmm4                   \n"
+    "psrlw     $0xf,%%xmm4                     \n"
+    "packuswb  %%xmm4,%%xmm4                   \n"
+    "pxor      %%xmm5,%%xmm5                   \n"
     LABELALIGN
   "1:                                          \n"
@@ -164,16 +164,16 @@ void ScaleRowDown2Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
     MEMOPREG(movdqu,0x00,0,3,1,xmm2)           //  movdqu  (%0,%3,1),%%xmm2
     MEMOPREG(movdqu,0x10,0,3,1,xmm3)           //  movdqu  0x10(%0,%3,1),%%xmm3
     "lea       " MEMLEA(0x20,0) ",%0           \n"
-    "pavgb     %%xmm2,%%xmm0                   \n"
-    "pavgb     %%xmm3,%%xmm1                   \n"
-    "movdqa    %%xmm0,%%xmm2                   \n"
-    "psrlw     $0x8,%%xmm0                     \n"
-    "movdqa    %%xmm1,%%xmm3                   \n"
-    "psrlw     $0x8,%%xmm1                     \n"
-    "pand      %%xmm5,%%xmm2                   \n"
-    "pand      %%xmm5,%%xmm3                   \n"
-    "pavgw     %%xmm2,%%xmm0                   \n"
-    "pavgw     %%xmm3,%%xmm1                   \n"
+    "pmaddubsw %%xmm4,%%xmm0                   \n"
+    "pmaddubsw %%xmm4,%%xmm1                   \n"
+    "pmaddubsw %%xmm4,%%xmm2                   \n"
+    "pmaddubsw %%xmm4,%%xmm3                   \n"
+    "paddw     %%xmm2,%%xmm0                   \n"
+    "paddw     %%xmm3,%%xmm1                   \n"
+    "psrlw     $0x1,%%xmm0                     \n"
+    "psrlw     $0x1,%%xmm1                     \n"
+    "pavgw     %%xmm5,%%xmm0                   \n"
+    "pavgw     %%xmm5,%%xmm1                   \n"
     "packuswb  %%xmm1,%%xmm0                   \n"
     "movdqu    %%xmm0," MEMACCESS(1) "         \n"
     "lea       " MEMLEA(0x10,1) ",%1           \n"
...
@@ -95,7 +95,7 @@ static uvec16 kScaleAb2 =
 // Reads 32 pixels, throws half away and writes 16 pixels.
 __declspec(naked)
-void ScaleRowDown2_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
-                        uint8* dst_ptr, int dst_width) {
+void ScaleRowDown2_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
+                         uint8* dst_ptr, int dst_width) {
   __asm {
     mov        eax, [esp + 4]        // src_ptr
@@ -121,31 +121,28 @@ void ScaleRowDown2_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
 // Blends 32x1 rectangle to 16x1.
 __declspec(naked)
-void ScaleRowDown2Linear_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
-                              uint8* dst_ptr, int dst_width) {
+void ScaleRowDown2Linear_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
+                               uint8* dst_ptr, int dst_width) {
   __asm {
     mov        eax, [esp + 4]        // src_ptr
                                      // src_stride
     mov        edx, [esp + 12]       // dst_ptr
     mov        ecx, [esp + 16]       // dst_width
-    pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
-    psrlw      xmm5, 8
+    pcmpeqb    xmm4, xmm4            // constant 0x0101
+    psrlw      xmm4, 15
+    packuswb   xmm4, xmm4
+    pxor       xmm5, xmm5            // constant 0

   wloop:
     movdqu     xmm0, [eax]
     movdqu     xmm1, [eax + 16]
     lea        eax,  [eax + 32]
-    movdqa     xmm2, xmm0            // average columns (32 to 16 pixels)
-    psrlw      xmm0, 8
-    movdqa     xmm3, xmm1
-    psrlw      xmm1, 8
-    pand       xmm2, xmm5
-    pand       xmm3, xmm5
-    pavgw      xmm0, xmm2
-    pavgw      xmm1, xmm3
+    pmaddubsw  xmm0, xmm4            // horizontal add
+    pmaddubsw  xmm1, xmm4
+    pavgw      xmm0, xmm5            // (x + 1) / 2
+    pavgw      xmm1, xmm5
     packuswb   xmm0, xmm1
     movdqu     [edx], xmm0
     lea        edx, [edx + 16]
     sub        ecx, 16
@@ -157,7 +154,7 @@ void ScaleRowDown2Linear_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
 // Blends 32x2 rectangle to 16x1.
 __declspec(naked)
-void ScaleRowDown2Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
-                           uint8* dst_ptr, int dst_width) {
+void ScaleRowDown2Box_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
+                            uint8* dst_ptr, int dst_width) {
   __asm {
     push       esi
@@ -165,8 +162,11 @@ void ScaleRowDown2Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
     mov        esi, [esp + 4 + 8]    // src_stride
     mov        edx, [esp + 4 + 12]   // dst_ptr
     mov        ecx, [esp + 4 + 16]   // dst_width
-    pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
-    psrlw      xmm5, 8
+    pcmpeqb    xmm4, xmm4            // constant 0x0101
+    psrlw      xmm4, 15
+    packuswb   xmm4, xmm4
+    pxor       xmm5, xmm5            // constant 0

   wloop:
     movdqu     xmm0, [eax]
@@ -174,19 +174,17 @@ void ScaleRowDown2Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
     movdqu     xmm2, [eax + esi]
     movdqu     xmm3, [eax + esi + 16]
     lea        eax,  [eax + 32]
-    pavgb      xmm0, xmm2            // average rows
-    pavgb      xmm1, xmm3
-    movdqa     xmm2, xmm0            // average columns (32 to 16 pixels)
-    psrlw      xmm0, 8
-    movdqa     xmm3, xmm1
-    psrlw      xmm1, 8
-    pand       xmm2, xmm5
-    pand       xmm3, xmm5
-    pavgw      xmm0, xmm2
-    pavgw      xmm1, xmm3
+    pmaddubsw  xmm0, xmm4            // horizontal add
+    pmaddubsw  xmm1, xmm4
+    pmaddubsw  xmm2, xmm4
+    pmaddubsw  xmm3, xmm4
+    paddw      xmm0, xmm2            // vertical add
+    paddw      xmm1, xmm3
+    psrlw      xmm0, 1
+    psrlw      xmm1, 1
+    pavgw      xmm0, xmm5            // (x + 1) / 2
+    pavgw      xmm1, xmm5
     packuswb   xmm0, xmm1
     movdqu     [edx], xmm0
     lea        edx, [edx + 16]
     sub        ecx, 16
@@ -245,14 +243,12 @@ void ScaleRowDown2Linear_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
     vmovdqu    ymm0, [eax]
     vmovdqu    ymm1, [eax + 32]
     lea        eax,  [eax + 64]
-    vpmaddubsw ymm0, ymm0, ymm4      // average horizontally
+    vpmaddubsw ymm0, ymm0, ymm4      // horizontal add
     vpmaddubsw ymm1, ymm1, ymm4
     vpavgw     ymm0, ymm0, ymm5      // (x + 1) / 2
     vpavgw     ymm1, ymm1, ymm5
     vpackuswb  ymm0, ymm0, ymm1
     vpermq     ymm0, ymm0, 0xd8      // unmutate vpackuswb
     vmovdqu    [edx], ymm0
     lea        edx, [edx + 32]
     sub        ecx, 32
@@ -263,6 +259,8 @@ void ScaleRowDown2Linear_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
   }
 }

+// For rounding, average = (sum + 2) / 4
+// becomes average((sum >> 1), 0)
 // Blends 64x2 rectangle to 32x1.
 __declspec(naked)
 void ScaleRowDown2Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
@@ -280,19 +278,23 @@ void ScaleRowDown2Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
     vpxor      ymm5, ymm5, ymm5      // constant 0

   wloop:
-    vmovdqu    ymm0, [eax]           // average rows
+    vmovdqu    ymm0, [eax]
     vmovdqu    ymm1, [eax + 32]
-    vpavgb     ymm0, ymm0, [eax + esi]
-    vpavgb     ymm1, ymm1, [eax + esi + 32]
+    vmovdqu    ymm2, [eax + esi]
+    vmovdqu    ymm3, [eax + esi + 32]
     lea        eax,  [eax + 64]
-    vpmaddubsw ymm0, ymm0, ymm4      // average horizontally
+    vpmaddubsw ymm0, ymm0, ymm4      // horizontal add
     vpmaddubsw ymm1, ymm1, ymm4
+    vpmaddubsw ymm2, ymm2, ymm4
+    vpmaddubsw ymm3, ymm3, ymm4
+    vpaddw     ymm0, ymm0, ymm2      // vertical add
+    vpaddw     ymm1, ymm1, ymm3
+    vpsrlw     ymm0, ymm0, 1
+    vpsrlw     ymm1, ymm1, 1
     vpavgw     ymm0, ymm0, ymm5      // (x + 1) / 2
     vpavgw     ymm1, ymm1, ymm5
     vpackuswb  ymm0, ymm0, ymm1
     vpermq     ymm0, ymm0, 0xd8      // unmutate vpackuswb
     vmovdqu    [edx], ymm0
     lea        edx, [edx + 32]
     sub        ecx, 32
...
@@ -1422,8 +1422,8 @@ static void TestI420Blend(int width, int height, int benchmark_iterations,
     EXPECT_EQ(dst_y_c[i + off], dst_y_opt[i + off]);
   }
   for (int i = 0; i < kSizeUV; ++i) {
-    EXPECT_NEAR(dst_u_c[i + off], dst_u_opt[i + off], 1);  // Subsample off by 1
-    EXPECT_NEAR(dst_v_c[i + off], dst_v_opt[i + off], 1);
+    EXPECT_EQ(dst_u_c[i + off], dst_u_opt[i + off]);
+    EXPECT_EQ(dst_v_c[i + off], dst_v_opt[i + off]);
   }
   free_aligned_buffer_64(src_y0);
   free_aligned_buffer_64(src_u0);
...
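
The comment introduced above, "average = (sum + 2) / 4 becomes average((sum >> 1), 0)", is an identity of pavgw, which computes (x + y + 1) >> 1. A standalone exhaustive check (illustrative, not part of the commit) over every possible sum of four 8-bit pixels:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

// pavgw semantics: 16-bit average with rounding up.
static uint16_t pavgw(uint16_t x, uint16_t y) { return (x + y + 1) >> 1; }

int main(void) {
  // Four 8-bit pixels sum to at most 4 * 255 = 1020, so 16 bits suffice.
  for (uint16_t sum = 0; sum <= 1020; ++sum) {
    assert(pavgw(sum >> 1, 0) == (sum + 2) / 4);
  }
  printf("identity holds for all sums 0..1020\n");
  return 0;
}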