Commit cc88adc6 authored by Frank Barchard

YUV scale filter columns improved filtering accuracy

To reproduce: upscale a YUV image and observe the change in hue, especially in green.
Disable ScaleFilterCols_SSSE3 so that scaling falls back on ScaleFilterCols_C.
Observe that the hue, especially green, is better.

was ScaleFrom1280x720_Bilinear (1620 ms)
now ScaleFrom1280x720_Bilinear (1907 ms)

BUG=libyuv:605
TEST=try bots
R=harryjin@google.com, wangcheng@google.com

Review URL: https://codereview.chromium.org/2084533006 .
parent 24b9fa66
Name: libyuv Name: libyuv
URL: http://code.google.com/p/libyuv/ URL: http://code.google.com/p/libyuv/
Version: 1599 Version: 1600
License: BSD License: BSD
License File: LICENSE License File: LICENSE
......
...@@ -61,8 +61,7 @@ extern "C" { ...@@ -61,8 +61,7 @@ extern "C" {
#define HAS_SCALEARGBROWDOWN2_SSE2 #define HAS_SCALEARGBROWDOWN2_SSE2
#define HAS_SCALEARGBROWDOWNEVEN_SSE2 #define HAS_SCALEARGBROWDOWNEVEN_SSE2
#define HAS_SCALECOLSUP2_SSE2 #define HAS_SCALECOLSUP2_SSE2
// TODO(fbarchard): HAS_SCALEFILTERCOLS_SSSE3 doesnt match C very well. #define HAS_SCALEFILTERCOLS_SSSE3
// #define HAS_SCALEFILTERCOLS_SSSE3
#define HAS_SCALEROWDOWN2_SSSE3 #define HAS_SCALEROWDOWN2_SSSE3
#define HAS_SCALEROWDOWN34_SSSE3 #define HAS_SCALEROWDOWN34_SSSE3
#define HAS_SCALEROWDOWN38_SSSE3 #define HAS_SCALEROWDOWN38_SSSE3
......
...@@ -11,6 +11,6 @@ ...@@ -11,6 +11,6 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT #ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT
#define INCLUDE_LIBYUV_VERSION_H_ #define INCLUDE_LIBYUV_VERSION_H_
#define LIBYUV_VERSION 1599 #define LIBYUV_VERSION 1600
#endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT #endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT
...@@ -417,8 +417,16 @@ void ScaleColsUp2_16_C(uint16* dst_ptr, const uint16* src_ptr, ...@@ -417,8 +417,16 @@ void ScaleColsUp2_16_C(uint16* dst_ptr, const uint16* src_ptr,
} }
// (1-f)a + fb can be replaced with a + f(b-a) // (1-f)a + fb can be replaced with a + f(b-a)
#if defined(__arm__)
// arm uses 16 bit math with truncation.
// TODO(fbarchard): add rounding.
#define BLENDER(a, b, f) (uint8)((int)(a) + \ #define BLENDER(a, b, f) (uint8)((int)(a) + \
((int)(f) * ((int)(b) - (int)(a)) >> 16)) (((int)((f)) * ((int)(b) - (int)(a))) >> 16))
#else
// intel uses 7 bit math with rounding.
#define BLENDER(a, b, f) (uint8)((int)(a) + \
(((int)((f) >> 9) * ((int)(b) - (int)(a)) + 0x40) >> 7))
#endif
void ScaleFilterCols_C(uint8* dst_ptr, const uint8* src_ptr, void ScaleFilterCols_C(uint8* dst_ptr, const uint8* src_ptr,
int dst_width, int x, int dx) { int dst_width, int x, int dx) {
...@@ -470,8 +478,9 @@ void ScaleFilterCols64_C(uint8* dst_ptr, const uint8* src_ptr, ...@@ -470,8 +478,9 @@ void ScaleFilterCols64_C(uint8* dst_ptr, const uint8* src_ptr,
} }
#undef BLENDER #undef BLENDER
// Same as 8 bit arm blender but return is cast to uint16
#define BLENDER(a, b, f) (uint16)((int)(a) + \ #define BLENDER(a, b, f) (uint16)((int)(a) + \
((int)(f) * ((int)(b) - (int)(a)) >> 16)) (((int)((f)) * ((int)(b) - (int)(a))) >> 16))
void ScaleFilterCols_16_C(uint16* dst_ptr, const uint16* src_ptr, void ScaleFilterCols_16_C(uint16* dst_ptr, const uint16* src_ptr,
int dst_width, int x, int dx) { int dst_width, int x, int dx) {
...@@ -809,6 +818,7 @@ void ScaleARGBColsUp2_C(uint8* dst_argb, const uint8* src_argb, ...@@ -809,6 +818,7 @@ void ScaleARGBColsUp2_C(uint8* dst_argb, const uint8* src_argb,
} }
} }
// TODO(fbarchard): Replace 0x7f ^ f with 128-f. bug=605.
// Mimics SSSE3 blender // Mimics SSSE3 blender
#define BLENDER1(a, b, f) ((a) * (0x7f ^ f) + (b) * f) >> 7 #define BLENDER1(a, b, f) ((a) * (0x7f ^ f) + (b) * f) >> 7
#define BLENDERC(a, b, f, s) (uint32)( \ #define BLENDERC(a, b, f, s) (uint32)( \
......
...@@ -821,6 +821,16 @@ void ScaleAddRow_AVX2(const uint8* src_ptr, uint16* dst_ptr, int src_width) { ...@@ -821,6 +821,16 @@ void ScaleAddRow_AVX2(const uint8* src_ptr, uint16* dst_ptr, int src_width) {
} }
#endif // HAS_SCALEADDROW_AVX2 #endif // HAS_SCALEADDROW_AVX2
// Constant for making pixels signed to avoid pmaddubsw
// saturation.
static uvec8 kFsub80 =
{ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 };
// Constant for making pixels unsigned and adding .5 for rounding.
static uvec16 kFadd40 =
{ 0x4040, 0x4040, 0x4040, 0x4040, 0x4040, 0x4040, 0x4040, 0x4040 };
// Bilinear column filtering. SSSE3 version. // Bilinear column filtering. SSSE3 version.
void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr, void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
int dst_width, int x, int dx) { int dst_width, int x, int dx) {
...@@ -831,7 +841,10 @@ void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr, ...@@ -831,7 +841,10 @@ void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
"movl $0x04040000,%k2 \n" "movl $0x04040000,%k2 \n"
"movd %k2,%%xmm5 \n" "movd %k2,%%xmm5 \n"
"pcmpeqb %%xmm6,%%xmm6 \n" "pcmpeqb %%xmm6,%%xmm6 \n"
"psrlw $0x9,%%xmm6 \n" "psrlw $0x9,%%xmm6 \n" // 0x007f007f
"pcmpeqb %%xmm7,%%xmm7 \n"
"psrlw $15,%%xmm7 \n" // 0x00010001
"pextrw $0x1,%%xmm2,%k3 \n" "pextrw $0x1,%%xmm2,%k3 \n"
"subl $0x2,%5 \n" "subl $0x2,%5 \n"
"jl 29f \n" "jl 29f \n"
...@@ -853,13 +866,16 @@ void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr, ...@@ -853,13 +866,16 @@ void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
"movd %k2,%%xmm4 \n" "movd %k2,%%xmm4 \n"
"pshufb %%xmm5,%%xmm1 \n" "pshufb %%xmm5,%%xmm1 \n"
"punpcklwd %%xmm4,%%xmm0 \n" "punpcklwd %%xmm4,%%xmm0 \n"
"pxor %%xmm6,%%xmm1 \n" "psubb %8,%%xmm0 \n" // make pixels signed.
"pmaddubsw %%xmm1,%%xmm0 \n" "pxor %%xmm6,%%xmm1 \n" // 128 -f = (f ^ 127 ) + 1
"paddusb %%xmm7,%%xmm1 \n"
"pmaddubsw %%xmm0,%%xmm1 \n"
"pextrw $0x1,%%xmm2,%k3 \n" "pextrw $0x1,%%xmm2,%k3 \n"
"pextrw $0x3,%%xmm2,%k4 \n" "pextrw $0x3,%%xmm2,%k4 \n"
"psrlw $0x7,%%xmm0 \n" "paddw %9,%%xmm1 \n" // make pixels unsigned.
"packuswb %%xmm0,%%xmm0 \n" "psrlw $0x7,%%xmm1 \n"
"movd %%xmm0,%k2 \n" "packuswb %%xmm1,%%xmm1 \n"
"movd %%xmm1,%k2 \n"
"mov %w2," MEMACCESS(0) " \n" "mov %w2," MEMACCESS(0) " \n"
"lea " MEMLEA(0x2,0) ",%0 \n" "lea " MEMLEA(0x2,0) ",%0 \n"
"sub $0x2,%5 \n" "sub $0x2,%5 \n"
...@@ -873,11 +889,14 @@ void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr, ...@@ -873,11 +889,14 @@ void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
"movd %k2,%%xmm0 \n" "movd %k2,%%xmm0 \n"
"psrlw $0x9,%%xmm2 \n" "psrlw $0x9,%%xmm2 \n"
"pshufb %%xmm5,%%xmm2 \n" "pshufb %%xmm5,%%xmm2 \n"
"psubb %8,%%xmm0 \n" // make pixels signed.
"pxor %%xmm6,%%xmm2 \n" "pxor %%xmm6,%%xmm2 \n"
"pmaddubsw %%xmm2,%%xmm0 \n" "paddusb %%xmm7,%%xmm2 \n"
"psrlw $0x7,%%xmm0 \n" "pmaddubsw %%xmm0,%%xmm2 \n"
"packuswb %%xmm0,%%xmm0 \n" "paddw %9,%%xmm2 \n" // make pixels unsigned.
"movd %%xmm0,%k2 \n" "psrlw $0x7,%%xmm2 \n"
"packuswb %%xmm2,%%xmm2 \n"
"movd %%xmm2,%k2 \n"
"mov %b2," MEMACCESS(0) " \n" "mov %b2," MEMACCESS(0) " \n"
"99: \n" "99: \n"
: "+r"(dst_ptr), // %0 : "+r"(dst_ptr), // %0
...@@ -887,9 +906,16 @@ void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr, ...@@ -887,9 +906,16 @@ void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
"=&r"(x1), // %4 "=&r"(x1), // %4
"+rm"(dst_width) // %5 "+rm"(dst_width) // %5
: "rm"(x), // %6 : "rm"(x), // %6
"rm"(dx) // %7 "rm"(dx), // %7
#if defined(__x86_64__)
"x"(kFsub80), // %8
"x"(kFadd40) // %9
#else
"m"(kFsub80), // %8
"m"(kFadd40) // %9
#endif
: "memory", "cc", NACL_R14 : "memory", "cc", NACL_R14
"xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6" "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
); );
} }
......
...@@ -572,6 +572,10 @@ void ScaleAddRows_NEON(const uint8* src_ptr, ptrdiff_t src_stride, ...@@ -572,6 +572,10 @@ void ScaleAddRows_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
MEMACCESS(6) \ MEMACCESS(6) \
"vld2.8 {d6["#n"], d7["#n"]}, [%6] \n" "vld2.8 {d6["#n"], d7["#n"]}, [%6] \n"
// The NEON version mimics this formula:
// #define BLENDER(a, b, f) (uint8)((int)(a) +
// ((int)(f) * ((int)(b) - (int)(a)) >> 16))
void ScaleFilterCols_NEON(uint8* dst_ptr, const uint8* src_ptr, void ScaleFilterCols_NEON(uint8* dst_ptr, const uint8* src_ptr,
int dst_width, int x, int dx) { int dst_width, int x, int dx) {
int dx_offset[4] = {0, 1, 2, 3}; int dx_offset[4] = {0, 1, 2, 3};
......
...@@ -860,6 +860,16 @@ void ScaleAddRow_AVX2(const uint8* src_ptr, uint16* dst_ptr, int src_width) { ...@@ -860,6 +860,16 @@ void ScaleAddRow_AVX2(const uint8* src_ptr, uint16* dst_ptr, int src_width) {
} }
#endif // HAS_SCALEADDROW_AVX2 #endif // HAS_SCALEADDROW_AVX2
// Constant for making pixels signed to avoid pmaddubsw
// saturation.
static uvec8 kFsub80 =
{ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 };
// Constant for making pixels unsigned and adding .5 for rounding.
static uvec16 kFadd40 =
{ 0x4040, 0x4040, 0x4040, 0x4040, 0x4040, 0x4040, 0x4040, 0x4040 };
// Bilinear column filtering. SSSE3 version. // Bilinear column filtering. SSSE3 version.
__declspec(naked) __declspec(naked)
void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr, void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
...@@ -877,6 +887,8 @@ void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr, ...@@ -877,6 +887,8 @@ void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
movd xmm5, eax movd xmm5, eax
pcmpeqb xmm6, xmm6 // generate 0x007f for inverting fraction. pcmpeqb xmm6, xmm6 // generate 0x007f for inverting fraction.
psrlw xmm6, 9 psrlw xmm6, 9
pcmpeqb xmm7, xmm7 // generate 0x0001
psrlw xmm7, 15
pextrw eax, xmm2, 1 // get x0 integer. preroll pextrw eax, xmm2, 1 // get x0 integer. preroll
sub ecx, 2 sub ecx, 2
jl xloop29 jl xloop29
...@@ -899,20 +911,22 @@ void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr, ...@@ -899,20 +911,22 @@ void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
movd xmm4, ebx movd xmm4, ebx
pshufb xmm1, xmm5 // 0011 pshufb xmm1, xmm5 // 0011
punpcklwd xmm0, xmm4 punpcklwd xmm0, xmm4
psubb xmm0, xmmword ptr kFsub80 // make pixels signed.
pxor xmm1, xmm6 // 0..7f and 7f..0 pxor xmm1, xmm6 // 0..7f and 7f..0
pmaddubsw xmm0, xmm1 // 16 bit, 2 pixels. paddusb xmm1, xmm7 // +1 so 0..7f and 80..1
pmaddubsw xmm1, xmm0 // 16 bit, 2 pixels.
pextrw eax, xmm2, 1 // get x0 integer. next iteration. pextrw eax, xmm2, 1 // get x0 integer. next iteration.
pextrw edx, xmm2, 3 // get x1 integer. next iteration. pextrw edx, xmm2, 3 // get x1 integer. next iteration.
psrlw xmm0, 7 // 8.7 fixed point to low 8 bits. paddw xmm1, xmmword ptr kFadd40 // make pixels unsigned and round.
packuswb xmm0, xmm0 // 8 bits, 2 pixels. psrlw xmm1, 7 // 8.7 fixed point to low 8 bits.
movd ebx, xmm0 packuswb xmm1, xmm1 // 8 bits, 2 pixels.
movd ebx, xmm1
mov [edi], bx mov [edi], bx
lea edi, [edi + 2] lea edi, [edi + 2]
sub ecx, 2 // 2 pixels sub ecx, 2 // 2 pixels
jge xloop2 jge xloop2
xloop29: xloop29:
add ecx, 2 - 1 add ecx, 2 - 1
jl xloop99 jl xloop99
...@@ -921,11 +935,14 @@ void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr, ...@@ -921,11 +935,14 @@ void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
movd xmm0, ebx movd xmm0, ebx
psrlw xmm2, 9 // 7 bit fractions. psrlw xmm2, 9 // 7 bit fractions.
pshufb xmm2, xmm5 // 0011 pshufb xmm2, xmm5 // 0011
psubb xmm0, xmmword ptr kFsub80 // make pixels signed.
pxor xmm2, xmm6 // 0..7f and 7f..0 pxor xmm2, xmm6 // 0..7f and 7f..0
pmaddubsw xmm0, xmm2 // 16 bit paddusb xmm2, xmm7 // +1 so 0..7f and 80..1
psrlw xmm0, 7 // 8.7 fixed point to low 8 bits. pmaddubsw xmm2, xmm0 // 16 bit
packuswb xmm0, xmm0 // 8 bits paddw xmm2, xmmword ptr kFadd40 // make pixels unsigned and round.
movd ebx, xmm0 psrlw xmm2, 7 // 8.7 fixed point to low 8 bits.
packuswb xmm2, xmm2 // 8 bits
movd ebx, xmm2
mov [edi], bl mov [edi], bl
xloop99: xloop99:
......
...@@ -314,10 +314,10 @@ static int TestFilter_16(int src_width, int src_height, ...@@ -314,10 +314,10 @@ static int TestFilter_16(int src_width, int src_height,
TEST_FACTOR(2, 1, 2, 0) TEST_FACTOR(2, 1, 2, 0)
TEST_FACTOR(4, 1, 4, 0) TEST_FACTOR(4, 1, 4, 0)
TEST_FACTOR(8, 1, 8, 3) TEST_FACTOR(8, 1, 8, 0)
TEST_FACTOR(3by4, 3, 4, 1) TEST_FACTOR(3by4, 3, 4, 1)
TEST_FACTOR(3by8, 3, 8, 1) TEST_FACTOR(3by8, 3, 8, 1)
TEST_FACTOR(3, 1, 3, 3) TEST_FACTOR(3, 1, 3, 0)
#undef TEST_FACTOR1 #undef TEST_FACTOR1
#undef TEST_FACTOR #undef TEST_FACTOR
#undef SX #undef SX
...@@ -356,9 +356,9 @@ TEST_FACTOR(3, 1, 3, 3) ...@@ -356,9 +356,9 @@ TEST_FACTOR(3, 1, 3, 3)
// Test scale to a specified size with all 4 filters. // Test scale to a specified size with all 4 filters.
#define TEST_SCALETO(name, width, height) \ #define TEST_SCALETO(name, width, height) \
TEST_SCALETO1(name, width, height, None, 0) \ TEST_SCALETO1(name, width, height, None, 0) \
TEST_SCALETO1(name, width, height, Linear, 3) \ TEST_SCALETO1(name, width, height, Linear, 0) \
TEST_SCALETO1(name, width, height, Bilinear, 3) \ TEST_SCALETO1(name, width, height, Bilinear, 0) \
TEST_SCALETO1(name, width, height, Box, 3) TEST_SCALETO1(name, width, height, Box, 0)
TEST_SCALETO(Scale, 1, 1) TEST_SCALETO(Scale, 1, 1)
TEST_SCALETO(Scale, 320, 240) TEST_SCALETO(Scale, 320, 240)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment