Commit cc88adc6 authored by Frank Barchard's avatar Frank Barchard

YUV scale filter columns improved filtering accuracy

upscale a YUV image.  observe change in hue.. green especially.
disable ScaleFilterCols_SSSE3, falling back on ScaleFilterCols_C
observe hue.. green especially, is better.

was ScaleFrom1280x720_Bilinear (1620 ms)
now ScaleFrom1280x720_Bilinear (1907 ms)

BUG=libyuv:605
TEST=try bots
R=harryjin@google.com, wangcheng@google.com

Review URL: https://codereview.chromium.org/2084533006 .
parent 24b9fa66
Name: libyuv
URL: http://code.google.com/p/libyuv/
Version: 1599
Version: 1600
License: BSD
License File: LICENSE
......
......@@ -61,8 +61,7 @@ extern "C" {
#define HAS_SCALEARGBROWDOWN2_SSE2
#define HAS_SCALEARGBROWDOWNEVEN_SSE2
#define HAS_SCALECOLSUP2_SSE2
// TODO(fbarchard): HAS_SCALEFILTERCOLS_SSSE3 doesnt match C very well.
// #define HAS_SCALEFILTERCOLS_SSSE3
#define HAS_SCALEFILTERCOLS_SSSE3
#define HAS_SCALEROWDOWN2_SSSE3
#define HAS_SCALEROWDOWN34_SSSE3
#define HAS_SCALEROWDOWN38_SSSE3
......
......@@ -11,6 +11,6 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT
#define INCLUDE_LIBYUV_VERSION_H_
#define LIBYUV_VERSION 1599
#define LIBYUV_VERSION 1600
#endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT
......@@ -417,8 +417,16 @@ void ScaleColsUp2_16_C(uint16* dst_ptr, const uint16* src_ptr,
}
// (1-f)a + fb can be replaced with a + f(b-a)
#if defined(__arm__)
// arm uses 16 bit math with truncation.
// TODO(fbarchard): add rounding.
#define BLENDER(a, b, f) (uint8)((int)(a) + \
((int)(f) * ((int)(b) - (int)(a)) >> 16))
(((int)((f)) * ((int)(b) - (int)(a))) >> 16))
#else
// inteluses 7 bit math with rounding.
#define BLENDER(a, b, f) (uint8)((int)(a) + \
(((int)((f) >> 9) * ((int)(b) - (int)(a)) + 0x40) >> 7))
#endif
void ScaleFilterCols_C(uint8* dst_ptr, const uint8* src_ptr,
int dst_width, int x, int dx) {
......@@ -470,8 +478,9 @@ void ScaleFilterCols64_C(uint8* dst_ptr, const uint8* src_ptr,
}
#undef BLENDER
// Same as 8 bit arm blender but return is cast to uint16
#define BLENDER(a, b, f) (uint16)((int)(a) + \
((int)(f) * ((int)(b) - (int)(a)) >> 16))
(((int)((f)) * ((int)(b) - (int)(a))) >> 16))
void ScaleFilterCols_16_C(uint16* dst_ptr, const uint16* src_ptr,
int dst_width, int x, int dx) {
......@@ -809,6 +818,7 @@ void ScaleARGBColsUp2_C(uint8* dst_argb, const uint8* src_argb,
}
}
// TODO(fbarchard): Replace 0x7f ^ f with 128-f. bug=605.
// Mimics SSSE3 blender
#define BLENDER1(a, b, f) ((a) * (0x7f ^ f) + (b) * f) >> 7
#define BLENDERC(a, b, f, s) (uint32)( \
......
......@@ -821,6 +821,16 @@ void ScaleAddRow_AVX2(const uint8* src_ptr, uint16* dst_ptr, int src_width) {
}
#endif // HAS_SCALEADDROW_AVX2
// Constant for making pixels signed to avoid pmaddubsw
// saturation.
static uvec8 kFsub80 =
{ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 };
// Constant for making pixels unsigned and adding .5 for rounding.
static uvec16 kFadd40 =
{ 0x4040, 0x4040, 0x4040, 0x4040, 0x4040, 0x4040, 0x4040, 0x4040 };
// Bilinear column filtering. SSSE3 version.
void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
int dst_width, int x, int dx) {
......@@ -831,7 +841,10 @@ void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
"movl $0x04040000,%k2 \n"
"movd %k2,%%xmm5 \n"
"pcmpeqb %%xmm6,%%xmm6 \n"
"psrlw $0x9,%%xmm6 \n"
"psrlw $0x9,%%xmm6 \n" // 0x007f007f
"pcmpeqb %%xmm7,%%xmm7 \n"
"psrlw $15,%%xmm7 \n" // 0x00010001
"pextrw $0x1,%%xmm2,%k3 \n"
"subl $0x2,%5 \n"
"jl 29f \n"
......@@ -853,13 +866,16 @@ void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
"movd %k2,%%xmm4 \n"
"pshufb %%xmm5,%%xmm1 \n"
"punpcklwd %%xmm4,%%xmm0 \n"
"pxor %%xmm6,%%xmm1 \n"
"pmaddubsw %%xmm1,%%xmm0 \n"
"psubb %8,%%xmm0 \n" // make pixels signed.
"pxor %%xmm6,%%xmm1 \n" // 128 -f = (f ^ 127 ) + 1
"paddusb %%xmm7,%%xmm1 \n"
"pmaddubsw %%xmm0,%%xmm1 \n"
"pextrw $0x1,%%xmm2,%k3 \n"
"pextrw $0x3,%%xmm2,%k4 \n"
"psrlw $0x7,%%xmm0 \n"
"packuswb %%xmm0,%%xmm0 \n"
"movd %%xmm0,%k2 \n"
"paddw %9,%%xmm1 \n" // make pixels unsigned.
"psrlw $0x7,%%xmm1 \n"
"packuswb %%xmm1,%%xmm1 \n"
"movd %%xmm1,%k2 \n"
"mov %w2," MEMACCESS(0) " \n"
"lea " MEMLEA(0x2,0) ",%0 \n"
"sub $0x2,%5 \n"
......@@ -873,11 +889,14 @@ void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
"movd %k2,%%xmm0 \n"
"psrlw $0x9,%%xmm2 \n"
"pshufb %%xmm5,%%xmm2 \n"
"psubb %8,%%xmm0 \n" // make pixels signed.
"pxor %%xmm6,%%xmm2 \n"
"pmaddubsw %%xmm2,%%xmm0 \n"
"psrlw $0x7,%%xmm0 \n"
"packuswb %%xmm0,%%xmm0 \n"
"movd %%xmm0,%k2 \n"
"paddusb %%xmm7,%%xmm2 \n"
"pmaddubsw %%xmm0,%%xmm2 \n"
"paddw %9,%%xmm2 \n" // make pixels unsigned.
"psrlw $0x7,%%xmm2 \n"
"packuswb %%xmm2,%%xmm2 \n"
"movd %%xmm2,%k2 \n"
"mov %b2," MEMACCESS(0) " \n"
"99: \n"
: "+r"(dst_ptr), // %0
......@@ -887,9 +906,16 @@ void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
"=&r"(x1), // %4
"+rm"(dst_width) // %5
: "rm"(x), // %6
"rm"(dx) // %7
"rm"(dx), // %7
#if defined(__x86_64__)
"x"(kFsub80), // %8
"x"(kFadd40) // %9
#else
"m"(kFsub80), // %8
"m"(kFadd40) // %9
#endif
: "memory", "cc", NACL_R14
"xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
"xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
);
}
......
......@@ -572,6 +572,10 @@ void ScaleAddRows_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
MEMACCESS(6) \
"vld2.8 {d6["#n"], d7["#n"]}, [%6] \n"
// The NEON version mimics this formula:
// #define BLENDER(a, b, f) (uint8)((int)(a) +
// ((int)(f) * ((int)(b) - (int)(a)) >> 16))
void ScaleFilterCols_NEON(uint8* dst_ptr, const uint8* src_ptr,
int dst_width, int x, int dx) {
int dx_offset[4] = {0, 1, 2, 3};
......
......@@ -860,6 +860,16 @@ void ScaleAddRow_AVX2(const uint8* src_ptr, uint16* dst_ptr, int src_width) {
}
#endif // HAS_SCALEADDROW_AVX2
// Constant for making pixels signed to avoid pmaddubsw
// saturation.
static uvec8 kFsub80 =
{ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 };
// Constant for making pixels unsigned and adding .5 for rounding.
static uvec16 kFadd40 =
{ 0x4040, 0x4040, 0x4040, 0x4040, 0x4040, 0x4040, 0x4040, 0x4040 };
// Bilinear column filtering. SSSE3 version.
__declspec(naked)
void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
......@@ -877,6 +887,8 @@ void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
movd xmm5, eax
pcmpeqb xmm6, xmm6 // generate 0x007f for inverting fraction.
psrlw xmm6, 9
pcmpeqb xmm7, xmm7 // generate 0x0001
psrlw xmm7, 15
pextrw eax, xmm2, 1 // get x0 integer. preroll
sub ecx, 2
jl xloop29
......@@ -899,20 +911,22 @@ void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
movd xmm4, ebx
pshufb xmm1, xmm5 // 0011
punpcklwd xmm0, xmm4
psubb xmm0, xmmword ptr kFsub80 // make pixels signed.
pxor xmm1, xmm6 // 0..7f and 7f..0
pmaddubsw xmm0, xmm1 // 16 bit, 2 pixels.
paddusb xmm1, xmm7 // +1 so 0..7f and 80..1
pmaddubsw xmm1, xmm0 // 16 bit, 2 pixels.
pextrw eax, xmm2, 1 // get x0 integer. next iteration.
pextrw edx, xmm2, 3 // get x1 integer. next iteration.
psrlw xmm0, 7 // 8.7 fixed point to low 8 bits.
packuswb xmm0, xmm0 // 8 bits, 2 pixels.
movd ebx, xmm0
paddw xmm1, xmmword ptr kFadd40 // make pixels unsigned and round.
psrlw xmm1, 7 // 8.7 fixed point to low 8 bits.
packuswb xmm1, xmm1 // 8 bits, 2 pixels.
movd ebx, xmm1
mov [edi], bx
lea edi, [edi + 2]
sub ecx, 2 // 2 pixels
jge xloop2
xloop29:
add ecx, 2 - 1
jl xloop99
......@@ -921,11 +935,14 @@ void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
movd xmm0, ebx
psrlw xmm2, 9 // 7 bit fractions.
pshufb xmm2, xmm5 // 0011
psubb xmm0, xmmword ptr kFsub80 // make pixels signed.
pxor xmm2, xmm6 // 0..7f and 7f..0
pmaddubsw xmm0, xmm2 // 16 bit
psrlw xmm0, 7 // 8.7 fixed point to low 8 bits.
packuswb xmm0, xmm0 // 8 bits
movd ebx, xmm0
paddusb xmm2, xmm7 // +1 so 0..7f and 80..1
pmaddubsw xmm2, xmm0 // 16 bit
paddw xmm2, xmmword ptr kFadd40 // make pixels unsigned and round.
psrlw xmm2, 7 // 8.7 fixed point to low 8 bits.
packuswb xmm2, xmm2 // 8 bits
movd ebx, xmm2
mov [edi], bl
xloop99:
......
......@@ -314,10 +314,10 @@ static int TestFilter_16(int src_width, int src_height,
TEST_FACTOR(2, 1, 2, 0)
TEST_FACTOR(4, 1, 4, 0)
TEST_FACTOR(8, 1, 8, 3)
TEST_FACTOR(8, 1, 8, 0)
TEST_FACTOR(3by4, 3, 4, 1)
TEST_FACTOR(3by8, 3, 8, 1)
TEST_FACTOR(3, 1, 3, 3)
TEST_FACTOR(3, 1, 3, 0)
#undef TEST_FACTOR1
#undef TEST_FACTOR
#undef SX
......@@ -356,9 +356,9 @@ TEST_FACTOR(3, 1, 3, 3)
// Test scale to a specified size with all 4 filters.
#define TEST_SCALETO(name, width, height) \
TEST_SCALETO1(name, width, height, None, 0) \
TEST_SCALETO1(name, width, height, Linear, 3) \
TEST_SCALETO1(name, width, height, Bilinear, 3) \
TEST_SCALETO1(name, width, height, Box, 3)
TEST_SCALETO1(name, width, height, Linear, 0) \
TEST_SCALETO1(name, width, height, Bilinear, 0) \
TEST_SCALETO1(name, width, height, Box, 0)
TEST_SCALETO(Scale, 1, 1)
TEST_SCALETO(Scale, 320, 240)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment