Commit 98a1fbf5 authored by fbarchard@google.com

Scale up columns 2 pixels at a time

BUG=208
TEST=out\release\libyuv_unittest --gtest_filter=*Scale*640*
Review URL: https://webrtc-codereview.appspot.com/1294004

git-svn-id: http://libyuv.googlecode.com/svn/trunk@648 16f28f9a-4ce2-e073-06de-1de4eb20be90
parent a0070461
Name: libyuv
URL: http://code.google.com/p/libyuv/
Version: 646
Version: 648
License: BSD
License File: LICENSE
......
......@@ -11,6 +11,6 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT
#define INCLUDE_LIBYUV_VERSION_H_
#define LIBYUV_VERSION 646
#define LIBYUV_VERSION 648
#endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT
......@@ -3043,12 +3043,12 @@ void YToARGBRow_SSE2(const uint8* y_buf,
pxor xmm5, xmm5
pcmpeqb xmm4, xmm4 // generate mask 0xff000000
pslld xmm4, 24
mov eax,0x00100010
movd xmm3,eax
pshufd xmm3,xmm3,0
mov eax,0x004a004a // 74
movd xmm2,eax
pshufd xmm2,xmm2,0
mov eax, 0x00100010
movd xmm3, eax
pshufd xmm3, xmm3, 0
mov eax, 0x004a004a // 74
movd xmm2, eax
pshufd xmm2, xmm2,0
mov eax, [esp + 4] // Y
mov edx, [esp + 8] // rgb
mov ecx, [esp + 12] // width
......@@ -4267,8 +4267,8 @@ void ARGBBlendRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
pxor xmm3, xmm4 // ~alpha
movd xmm2, [esi] // _r_b
psrlw xmm3, 8 // alpha
pshufhw xmm3, xmm3,0F5h // 8 alpha words
pshuflw xmm3, xmm3,0F5h
pshufhw xmm3, xmm3, 0F5h // 8 alpha words
pshuflw xmm3, xmm3, 0F5h
pand xmm2, xmm6 // _r_b
paddw xmm3, xmm7 // 256 - alpha
pmullw xmm2, xmm3 // _r_b * alpha
......@@ -4298,8 +4298,8 @@ void ARGBBlendRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
pxor xmm3, xmm4 // ~alpha
movdqu xmm2, [esi] // _r_b
psrlw xmm3, 8 // alpha
pshufhw xmm3, xmm3,0F5h // 8 alpha words
pshuflw xmm3, xmm3,0F5h
pshufhw xmm3, xmm3, 0F5h // 8 alpha words
pshuflw xmm3, xmm3, 0F5h
pand xmm2, xmm6 // _r_b
paddw xmm3, xmm7 // 256 - alpha
pmullw xmm2, xmm3 // _r_b * alpha
......@@ -4329,8 +4329,8 @@ void ARGBBlendRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
pxor xmm3, xmm4 // ~alpha
movd xmm2, [esi] // _r_b
psrlw xmm3, 8 // alpha
pshufhw xmm3, xmm3,0F5h // 8 alpha words
pshuflw xmm3, xmm3,0F5h
pshufhw xmm3, xmm3, 0F5h // 8 alpha words
pshuflw xmm3, xmm3, 0F5h
pand xmm2, xmm6 // _r_b
paddw xmm3, xmm7 // 256 - alpha
pmullw xmm2, xmm3 // _r_b * alpha
......@@ -4363,8 +4363,8 @@ static const uvec8 kShuffleAlpha = {
};
// Same as SSE2, but replaces:
// psrlw xmm3, 8 // alpha
// pshufhw xmm3, xmm3,0F5h // 8 alpha words
// pshuflw xmm3, xmm3,0F5h
// pshufhw xmm3, xmm3, 0F5h // 8 alpha words
// pshuflw xmm3, xmm3, 0F5h
// with..
// pshufb xmm3, kShuffleAlpha // alpha
// Blend 8 pixels at a time.
......@@ -4533,13 +4533,13 @@ void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) {
convertloop:
movdqa xmm0, [eax] // read 4 pixels
punpcklbw xmm0, xmm0 // first 2
pshufhw xmm2, xmm0,0FFh // 8 alpha words
pshuflw xmm2, xmm2,0FFh
pshufhw xmm2, xmm0, 0FFh // 8 alpha words
pshuflw xmm2, xmm2, 0FFh
pmulhuw xmm0, xmm2 // rgb * a
movdqa xmm1, [eax] // read 4 pixels
punpckhbw xmm1, xmm1 // next 2 pixels
pshufhw xmm2, xmm1,0FFh // 8 alpha words
pshuflw xmm2, xmm2,0FFh
pshufhw xmm2, xmm1, 0FFh // 8 alpha words
pshuflw xmm2, xmm2, 0FFh
pmulhuw xmm1, xmm2 // rgb * a
movdqa xmm2, [eax] // alphas
psrlw xmm0, 8
......@@ -4673,8 +4673,8 @@ void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb,
punpcklbw xmm0, xmm0 // first 2
movd xmm2, dword ptr fixed_invtbl8[esi * 4]
movd xmm3, dword ptr fixed_invtbl8[edi * 4]
pshuflw xmm2, xmm2,040h // first 4 inv_alpha words. 1, a, a, a
pshuflw xmm3, xmm3,040h // next 4 inv_alpha words
pshuflw xmm2, xmm2, 040h // first 4 inv_alpha words. 1, a, a, a
pshuflw xmm3, xmm3, 040h // next 4 inv_alpha words
movlhps xmm2, xmm3
pmulhuw xmm0, xmm2 // rgb * a
......@@ -4684,8 +4684,8 @@ void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb,
punpckhbw xmm1, xmm1 // next 2
movd xmm2, dword ptr fixed_invtbl8[esi * 4]
movd xmm3, dword ptr fixed_invtbl8[edi * 4]
pshuflw xmm2, xmm2,040h // first 4 inv_alpha words
pshuflw xmm3, xmm3,040h // next 4 inv_alpha words
pshuflw xmm2, xmm2, 040h // first 4 inv_alpha words
pshuflw xmm3, xmm3, 040h // next 4 inv_alpha words
movlhps xmm2, xmm3
pmulhuw xmm1, xmm2 // rgb * a
......
......@@ -424,46 +424,86 @@ void ScaleARGBFilterRows_SSSE3(uint8* dst_argb, const uint8* src_argb,
// Bilinear row filtering combines 2x1 -> 1x1. SSSE3 version.
// TODO(fbarchard): Port to Neon
// TODO(fbarchard): Port to Posix
// TODO(fbarchard): Unroll for 2 pixels for better pairing and memory access.
// TODO(fbarchard): Consider lea to get 2nd pixel without incrementing.
// Shuffle table for arranging 2 pixels into pairs for pmaddubsw.
// Used as a pshufb mask. Within each 8 byte half, the two 4 byte pixels are
// interleaved byte by byte (byte 0 with byte 4, byte 1 with byte 5, ...), so
// every 16 bit lane ends up holding the same channel of both pixels.
static const uvec8 kShuffleColARGB = {
0u, 4u, 1u, 5u, 2u, 6u, 3u, 7u, // bbggrraa 1st pixel
8u, 12u, 9u, 13u, 10u, 14u, 11u, 15u // bbggrraa 2nd pixel
};
// Shuffle table for duplicating 2 fractions into 8 bytes each.
// Used as a pshufb mask: source byte 0 is copied to the low 8 bytes and
// source byte 2 to the high 8 bytes. Bytes 0 and 2 are the low bytes of the
// first two 16 bit lanes, which is where the column filter keeps its two
// 7 bit x fractions.
static const uvec8 kShuffleFractions = {
0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 2u, 2u, 2u, 2u, 2u, 2u, 2u, 2u,
};
#define HAS_SCALEARGBFILTERCOLS_SSSE3
__declspec(naked) __declspec(align(16))
static void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb,
int dst_width, int x, int dx) {
__asm {
push ebx
push ebp
push esi
push edi
mov edi, [esp + 12 + 4] // dst_argb
mov esi, [esp + 12 + 8] // src_argb
mov ecx, [esp + 12 + 12] // dst_width
mov edx, [esp + 12 + 16] // x
mov ebx, [esp + 12 + 20] // dx
mov edi, [esp + 16 + 4] // dst_argb
mov esi, [esp + 16 + 8] // src_argb
mov ecx, [esp + 16 + 12] // dst_width
mov edx, [esp + 16 + 16] // x
mov ebx, [esp + 16 + 20] // dx
movdqa xmm3, kShuffleFractions
movdqa xmm4, kShuffleColARGB
pcmpeqb xmm5, xmm5 // generate 0x007f for inverting fraction.
psrlw xmm5, 9
sub ecx, 2
jl xloop29
align 16
xloop:
mov eax, edx // get x integer offset
shr eax, 16
movq xmm0, qword ptr [esi + eax * 4] // 2 source pixels
pshufd xmm1, xmm0, 1 // second pixel
punpcklbw xmm0, xmm1 // aarrggbb
movd xmm2, edx // get x fraction
psrlw xmm2, 9 // 7 bit fraction
punpcklbw xmm2, xmm2
punpcklwd xmm2, xmm2
pshufd xmm2, xmm2, 0
pxor xmm2, xmm5 // 0..7f and 7f..0
pmaddubsw xmm0, xmm2
xloop2:
mov eax, edx // get x0 integer
movd xmm1, edx // get x0 fraction
lea ebp, [edx + ebx] // get x1 integer (x + dx)
movd xmm2, ebp // get x1 fraction
shr eax, 16 // x0
punpcklwd xmm1, xmm2 // x0x1 fractions
lea edx, [edx + ebx * 2] // x += dx * 2
shr ebp, 16 // x1
movq xmm0, qword ptr [esi + eax * 4] // 2 source x0 pixels
movhps xmm0, qword ptr [esi + ebp * 4] // 2 source x1 pixels
psrlw xmm1, 9 // 7 bit fractions.
pshufb xmm1, xmm3 // 0000000011111111
sub ecx, 2
pshufb xmm0, xmm4 // arrange pixels into pairs
pxor xmm1, xmm5 // 0..7f and 7f..0
pmaddubsw xmm0, xmm1 // argb_argb 16 bit, 2 pixels.
psrlw xmm0, 7
packuswb xmm0, xmm0 // argb_argb 8 bits, 2 pixels.
movq qword ptr [edi], xmm0
lea edi, [edi + 8]
jge xloop2
xloop29:
add ecx, 2 - 1
jl xloop99
// 1 pixel remainder
mov eax, edx // get x0 integer
movd xmm1, edx // get x0 fraction
shr eax, 16 // x0
movq xmm0, qword ptr [esi + eax * 4] // 2 source x0 pixels
psrlw xmm1, 9 // 7 bit fractions.
pshufb xmm1, xmm3 // 00000000
pshufb xmm0, xmm4 // arrange pixels into pairs
pxor xmm1, xmm5 // 0..7f and 7f..0
pmaddubsw xmm0, xmm1 // argb 16 bit, 1 pixel.
psrlw xmm0, 7
packuswb xmm0, xmm0
add edx, ebx // x += dx
sub ecx, 1
packuswb xmm0, xmm0 // argb 8 bits, 1 pixel.
movd [edi], xmm0
lea edi, [edi + 4]
jg xloop
xloop99:
pop edi
pop esi
pop ebp
pop ebx
ret
}
......@@ -1104,8 +1144,6 @@ static void ScaleARGBBilinear(int src_width, int src_height,
ScaleARGBFilterRows = ScaleARGBFilterRows_NEON;
}
#endif
int dx = (src_width << 16) / dst_width;
int dy = (src_height << 16) / dst_height;
int x = (dx >= 65536) ? ((dx >> 1) - 32768) : (dx >> 1);
......
......@@ -410,7 +410,7 @@ TEST_F(libyuvTest, ARGBScaleTo853x480_Bilinear) {
dst_width, dst_height,
kFilterBilinear,
benchmark_iterations_);
EXPECT_LE(max_diff, 1);
EXPECT_LE(max_diff, 3);
}
TEST_F(libyuvTest, ARGBScaleFrom640x360_None) {
......@@ -436,7 +436,7 @@ TEST_F(libyuvTest, ARGBScaleFrom640x360_Bilinear) {
dst_width, dst_height,
kFilterBilinear,
benchmark_iterations_);
EXPECT_LE(max_diff, 2);
EXPECT_LE(max_diff, 3);
}
} // namespace libyuv
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment