Commit ae55e418 authored by Frank Barchard

Use rounding in scale down by 2

When scaling down by 2, the box filter should round consistently:
(a + b + c + d + 2) / 4
The C version did, but the SSE2 version was computing two nested averages instead:
avg(avg(a, b), avg(c, d))
This change sums the four pixels first, then rounds once.
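
To illustrate the difference, here is a minimal scalar sketch in C (not part of the patch; the helper names are hypothetical) comparing the rounded box filter with the nested-average form the old SSE2 path used:

#include <stdio.h>
#include <stdint.h>

/* Rounded box filter, matching the C reference path: (a + b + c + d + 2) / 4. */
static uint8_t box_rounded(uint8_t a, uint8_t b, uint8_t c, uint8_t d) {
  return (uint8_t)((a + b + c + d + 2) >> 2);
}

/* avg(x, y) = (x + y + 1) >> 1, the rounding average that pavgb computes. */
static uint8_t avg2(uint8_t x, uint8_t y) {
  return (uint8_t)((x + y + 1) >> 1);
}

/* Nested averages, as the old SSE2 path computed: avg(avg(a, b), avg(c, d)). */
static uint8_t box_nested(uint8_t a, uint8_t b, uint8_t c, uint8_t d) {
  return avg2(avg2(a, b), avg2(c, d));
}

int main(void) {
  /* Example where the two forms disagree: pixels (1, 0, 0, 0).
     Rounded: (1 + 2) / 4 = 0.  Nested: avg(avg(1,0), avg(0,0)) = avg(1, 0) = 1. */
  printf("rounded=%d nested=%d\n", box_rounded(1, 0, 0, 0), box_nested(1, 0, 0, 0));
  return 0;
}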

R=dhrosa@google.com, harryjin@google.com
BUG=libyuv:447,libyuv:527

Review URL: https://codereview.chromium.org/1513183004 .
parent 71b60123
Name: libyuv
URL: http://code.google.com/p/libyuv/
Version: 1553
Version: 1554
License: BSD
License File: LICENSE
......
......@@ -56,7 +56,7 @@ extern "C" {
#define HAS_SCALEARGBROWDOWNEVEN_SSE2
#define HAS_SCALECOLSUP2_SSE2
#define HAS_SCALEFILTERCOLS_SSSE3
#define HAS_SCALEROWDOWN2_SSE2
#define HAS_SCALEROWDOWN2_SSSE3
#define HAS_SCALEROWDOWN34_SSSE3
#define HAS_SCALEROWDOWN38_SSSE3
#define HAS_SCALEROWDOWN4_SSE2
......@@ -232,12 +232,12 @@ void ScaleARGBFilterCols64_C(uint8* dst_argb, const uint8* src_argb,
int dst_width, int x, int dx);
// Specialized scalers for x86.
void ScaleRowDown2_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
void ScaleRowDown2Linear_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
void ScaleRowDown2Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
void ScaleRowDown2_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
void ScaleRowDown2Linear_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
void ScaleRowDown2Box_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
void ScaleRowDown2_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
void ScaleRowDown2Linear_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
......@@ -269,11 +269,11 @@ void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr,
void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
void ScaleRowDown2_Any_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
void ScaleRowDown2Linear_Any_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
void ScaleRowDown2Box_Any_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
void ScaleRowDown2_Any_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
void ScaleRowDown2Linear_Any_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
void ScaleRowDown2Box_Any_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
void ScaleRowDown2_Any_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
......
......@@ -11,6 +11,6 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT
#define INCLUDE_LIBYUV_VERSION_H_
#define LIBYUV_VERSION 1553
#define LIBYUV_VERSION 1554
#endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT
......@@ -699,11 +699,11 @@ int I420Blend(const uint8* src_y0, int src_stride_y0,
}
}
#endif
#if defined(HAS_SCALEROWDOWN2_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) {
ScaleRowDown2 = ScaleRowDown2Box_Any_SSE2;
#if defined(HAS_SCALEROWDOWN2_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
ScaleRowDown2 = ScaleRowDown2Box_Any_SSSE3;
if (IS_ALIGNED(halfwidth, 16)) {
ScaleRowDown2 = ScaleRowDown2Box_SSE2;
ScaleRowDown2 = ScaleRowDown2Box_SSSE3;
}
}
#endif
......
......@@ -61,15 +61,15 @@ static void ScalePlaneDown2(int src_width, int src_height,
}
}
#endif
#if defined(HAS_SCALEROWDOWN2_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) {
ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_Any_SSE2 :
(filtering == kFilterLinear ? ScaleRowDown2Linear_Any_SSE2 :
ScaleRowDown2Box_Any_SSE2);
#if defined(HAS_SCALEROWDOWN2_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_Any_SSSE3 :
(filtering == kFilterLinear ? ScaleRowDown2Linear_Any_SSSE3 :
ScaleRowDown2Box_Any_SSSE3);
if (IS_ALIGNED(dst_width, 16)) {
ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_SSE2 :
(filtering == kFilterLinear ? ScaleRowDown2Linear_SSE2 :
ScaleRowDown2Box_SSE2);
ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_SSSE3 :
(filtering == kFilterLinear ? ScaleRowDown2Linear_SSSE3 :
ScaleRowDown2Box_SSSE3);
}
}
#endif
......
......@@ -55,11 +55,11 @@ CANY(ScaleARGBFilterCols_Any_NEON, ScaleARGBFilterCols_NEON,
dst_ptr + n * BPP, r); \
}
#ifdef HAS_SCALEROWDOWN2_SSE2
SDANY(ScaleRowDown2_Any_SSE2, ScaleRowDown2_SSE2, ScaleRowDown2_C, 2, 1, 15)
SDANY(ScaleRowDown2Linear_Any_SSE2, ScaleRowDown2Linear_SSE2,
#ifdef HAS_SCALEROWDOWN2_SSSE3
SDANY(ScaleRowDown2_Any_SSSE3, ScaleRowDown2_SSSE3, ScaleRowDown2_C, 2, 1, 15)
SDANY(ScaleRowDown2Linear_Any_SSSE3, ScaleRowDown2Linear_SSSE3,
ScaleRowDown2Linear_C, 2, 1, 15)
SDANY(ScaleRowDown2Box_Any_SSE2, ScaleRowDown2Box_SSE2, ScaleRowDown2Box_C,
SDANY(ScaleRowDown2Box_Any_SSSE3, ScaleRowDown2Box_SSSE3, ScaleRowDown2Box_C,
2, 1, 15)
#endif
#ifdef HAS_SCALEROWDOWN2_AVX2
......
......@@ -98,8 +98,8 @@ static uvec16 kScaleAb2 =
// Generated using gcc disassembly on Visual C object file:
// objdump -D yuvscaler.obj >yuvscaler.txt
void ScaleRowDown2_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
void ScaleRowDown2_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
asm volatile (
LABELALIGN
"1: \n"
......@@ -120,26 +120,24 @@ void ScaleRowDown2_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
);
}
void ScaleRowDown2Linear_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
void ScaleRowDown2Linear_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
asm volatile (
"pcmpeqb %%xmm5,%%xmm5 \n"
"psrlw $0x8,%%xmm5 \n"
"pcmpeqb %%xmm4,%%xmm4 \n"
"psrlw $0xf,%%xmm4 \n"
"packuswb %%xmm4,%%xmm4 \n"
"pxor %%xmm5,%%xmm5 \n"
LABELALIGN
"1: \n"
"movdqu " MEMACCESS(0) ",%%xmm0 \n"
"movdqu " MEMACCESS2(0x10, 0) ",%%xmm1 \n"
"lea " MEMLEA(0x20,0) ",%0 \n"
"movdqa %%xmm0,%%xmm2 \n"
"psrlw $0x8,%%xmm0 \n"
"movdqa %%xmm1,%%xmm3 \n"
"psrlw $0x8,%%xmm1 \n"
"pand %%xmm5,%%xmm2 \n"
"pand %%xmm5,%%xmm3 \n"
"pavgw %%xmm2,%%xmm0 \n"
"pavgw %%xmm3,%%xmm1 \n"
"packuswb %%xmm1,%%xmm0 \n"
"pmaddubsw %%xmm4,%%xmm0 \n"
"pmaddubsw %%xmm4,%%xmm1 \n"
"pavgw %%xmm5,%%xmm0 \n"
"pavgw %%xmm5,%%xmm1 \n"
"packuswb %%xmm1,%%xmm0 \n"
"movdqu %%xmm0," MEMACCESS(1) " \n"
"lea " MEMLEA(0x10,1) ",%1 \n"
"sub $0x10,%2 \n"
......@@ -147,15 +145,17 @@ void ScaleRowDown2Linear_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
:: "memory", "cc", "xmm0", "xmm1", "xmm5"
:: "memory", "cc", "xmm0", "xmm1", "xmm4", "xmm5"
);
}
void ScaleRowDown2Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
void ScaleRowDown2Box_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
asm volatile (
"pcmpeqb %%xmm5,%%xmm5 \n"
"psrlw $0x8,%%xmm5 \n"
"pcmpeqb %%xmm4,%%xmm4 \n"
"psrlw $0xf,%%xmm4 \n"
"packuswb %%xmm4,%%xmm4 \n"
"pxor %%xmm5,%%xmm5 \n"
LABELALIGN
"1: \n"
......@@ -164,17 +164,17 @@ void ScaleRowDown2Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
MEMOPREG(movdqu,0x00,0,3,1,xmm2) // movdqu (%0,%3,1),%%xmm2
MEMOPREG(movdqu,0x10,0,3,1,xmm3) // movdqu 0x10(%0,%3,1),%%xmm3
"lea " MEMLEA(0x20,0) ",%0 \n"
"pavgb %%xmm2,%%xmm0 \n"
"pavgb %%xmm3,%%xmm1 \n"
"movdqa %%xmm0,%%xmm2 \n"
"psrlw $0x8,%%xmm0 \n"
"movdqa %%xmm1,%%xmm3 \n"
"psrlw $0x8,%%xmm1 \n"
"pand %%xmm5,%%xmm2 \n"
"pand %%xmm5,%%xmm3 \n"
"pavgw %%xmm2,%%xmm0 \n"
"pavgw %%xmm3,%%xmm1 \n"
"packuswb %%xmm1,%%xmm0 \n"
"pmaddubsw %%xmm4,%%xmm0 \n"
"pmaddubsw %%xmm4,%%xmm1 \n"
"pmaddubsw %%xmm4,%%xmm2 \n"
"pmaddubsw %%xmm4,%%xmm3 \n"
"paddw %%xmm2,%%xmm0 \n"
"paddw %%xmm3,%%xmm1 \n"
"psrlw $0x1,%%xmm0 \n"
"psrlw $0x1,%%xmm1 \n"
"pavgw %%xmm5,%%xmm0 \n"
"pavgw %%xmm5,%%xmm1 \n"
"packuswb %%xmm1,%%xmm0 \n"
"movdqu %%xmm0," MEMACCESS(1) " \n"
"lea " MEMLEA(0x10,1) ",%1 \n"
"sub $0x10,%2 \n"
......
......@@ -95,8 +95,8 @@ static uvec16 kScaleAb2 =
// Reads 32 pixels, throws half away and writes 16 pixels.
__declspec(naked)
void ScaleRowDown2_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
void ScaleRowDown2_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
__asm {
mov eax, [esp + 4] // src_ptr
// src_stride ignored
......@@ -121,31 +121,28 @@ void ScaleRowDown2_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
// Blends 32x1 rectangle to 16x1.
__declspec(naked)
void ScaleRowDown2Linear_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
void ScaleRowDown2Linear_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
__asm {
mov eax, [esp + 4] // src_ptr
// src_stride
mov edx, [esp + 12] // dst_ptr
mov ecx, [esp + 16] // dst_width
pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
psrlw xmm5, 8
pcmpeqb xmm4, xmm4 // constant 0x0101
psrlw xmm4, 15
packuswb xmm4, xmm4
pxor xmm5, xmm5 // constant 0
wloop:
movdqu xmm0, [eax]
movdqu xmm1, [eax + 16]
lea eax, [eax + 32]
movdqa xmm2, xmm0 // average columns (32 to 16 pixels)
psrlw xmm0, 8
movdqa xmm3, xmm1
psrlw xmm1, 8
pand xmm2, xmm5
pand xmm3, xmm5
pavgw xmm0, xmm2
pavgw xmm1, xmm3
pmaddubsw xmm0, xmm4 // horizontal add
pmaddubsw xmm1, xmm4
pavgw xmm0, xmm5 // (x + 1) / 2
pavgw xmm1, xmm5
packuswb xmm0, xmm1
movdqu [edx], xmm0
lea edx, [edx + 16]
sub ecx, 16
......@@ -157,16 +154,19 @@ void ScaleRowDown2Linear_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
// Blends 32x2 rectangle to 16x1.
__declspec(naked)
void ScaleRowDown2Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
void ScaleRowDown2Box_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
__asm {
push esi
mov eax, [esp + 4 + 4] // src_ptr
mov esi, [esp + 4 + 8] // src_stride
mov edx, [esp + 4 + 12] // dst_ptr
mov ecx, [esp + 4 + 16] // dst_width
pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
psrlw xmm5, 8
pcmpeqb xmm4, xmm4 // constant 0x0101
psrlw xmm4, 15
packuswb xmm4, xmm4
pxor xmm5, xmm5 // constant 0
wloop:
movdqu xmm0, [eax]
......@@ -174,19 +174,17 @@ void ScaleRowDown2Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
movdqu xmm2, [eax + esi]
movdqu xmm3, [eax + esi + 16]
lea eax, [eax + 32]
pavgb xmm0, xmm2 // average rows
pavgb xmm1, xmm3
movdqa xmm2, xmm0 // average columns (32 to 16 pixels)
psrlw xmm0, 8
movdqa xmm3, xmm1
psrlw xmm1, 8
pand xmm2, xmm5
pand xmm3, xmm5
pavgw xmm0, xmm2
pavgw xmm1, xmm3
pmaddubsw xmm0, xmm4 // horizontal add
pmaddubsw xmm1, xmm4
pmaddubsw xmm2, xmm4
pmaddubsw xmm3, xmm4
paddw xmm0, xmm2 // vertical add
paddw xmm1, xmm3
psrlw xmm0, 1
psrlw xmm1, 1
pavgw xmm0, xmm5 // (x + 1) / 2
pavgw xmm1, xmm5
packuswb xmm0, xmm1
movdqu [edx], xmm0
lea edx, [edx + 16]
sub ecx, 16
......@@ -245,14 +243,12 @@ void ScaleRowDown2Linear_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
vmovdqu ymm0, [eax]
vmovdqu ymm1, [eax + 32]
lea eax, [eax + 64]
vpmaddubsw ymm0, ymm0, ymm4 // average horizontally
vpmaddubsw ymm0, ymm0, ymm4 // horizontal add
vpmaddubsw ymm1, ymm1, ymm4
vpavgw ymm0, ymm0, ymm5 // (x + 1) / 2
vpavgw ymm1, ymm1, ymm5
vpackuswb ymm0, ymm0, ymm1
vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb
vmovdqu [edx], ymm0
lea edx, [edx + 32]
sub ecx, 32
......@@ -263,6 +259,8 @@ void ScaleRowDown2Linear_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
}
}
// For rounding, average = (sum + 2) / 4
// becomes average((sum >> 1), 0)
// Blends 64x2 rectangle to 32x1.
__declspec(naked)
void ScaleRowDown2Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
......@@ -280,19 +278,23 @@ void ScaleRowDown2Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
vpxor ymm5, ymm5, ymm5 // constant 0
wloop:
vmovdqu ymm0, [eax] // average rows
vmovdqu ymm0, [eax]
vmovdqu ymm1, [eax + 32]
vpavgb ymm0, ymm0, [eax + esi]
vpavgb ymm1, ymm1, [eax + esi + 32]
vmovdqu ymm2, [eax + esi]
vmovdqu ymm3, [eax + esi + 32]
lea eax, [eax + 64]
vpmaddubsw ymm0, ymm0, ymm4 // average horizontally
vpmaddubsw ymm0, ymm0, ymm4 // horizontal add
vpmaddubsw ymm1, ymm1, ymm4
vpmaddubsw ymm2, ymm2, ymm4
vpmaddubsw ymm3, ymm3, ymm4
vpaddw ymm0, ymm0, ymm2 // vertical add
vpaddw ymm1, ymm1, ymm3
vpsrlw ymm0, ymm0, 1
vpsrlw ymm1, ymm1, 1
vpavgw ymm0, ymm0, ymm5 // (x + 1) / 2
vpavgw ymm1, ymm1, ymm5
vpackuswb ymm0, ymm0, ymm1
vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb
vmovdqu [edx], ymm0
lea edx, [edx + 32]
sub ecx, 32
......
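As a sanity check on the comment added above ("average = (sum + 2) / 4 becomes average((sum >> 1), 0)"): pavgw computes (x + y + 1) >> 1, so averaging against the zero register evaluates ((sum >> 1) + 1) >> 1, which equals (sum + 2) >> 2 for every non-negative sum. A small standalone C check of that identity over the full range of a four-pixel sum (0..1020), not part of the patch:

#include <assert.h>
#include <stdio.h>

int main(void) {
  /* Verify (s + 2) >> 2 == ((s >> 1) + 1) >> 1 for all possible sums of
     four 8-bit pixels (0..4*255), i.e. the rounding used by the new code. */
  for (int s = 0; s <= 4 * 255; ++s) {
    assert(((s + 2) >> 2) == (((s >> 1) + 1) >> 1));
  }
  printf("identity holds for all sums 0..1020\n");
  return 0;
}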
......@@ -1422,8 +1422,8 @@ static void TestI420Blend(int width, int height, int benchmark_iterations,
EXPECT_EQ(dst_y_c[i + off], dst_y_opt[i + off]);
}
for (int i = 0; i < kSizeUV; ++i) {
EXPECT_NEAR(dst_u_c[i + off], dst_u_opt[i + off], 1); // Subsample off by 1
EXPECT_NEAR(dst_v_c[i + off], dst_v_opt[i + off], 1);
EXPECT_EQ(dst_u_c[i + off], dst_u_opt[i + off]);
EXPECT_EQ(dst_v_c[i + off], dst_v_opt[i + off]);
}
free_aligned_buffer_64(src_y0);
free_aligned_buffer_64(src_u0);
......