Commit 6532a1a8, authored Jul 19, 2014 by Michael Niedermayer

sws/x86: split mmxext fast bilinear scaler out

Signed-off-by: Michael Niedermayer <michaelni@gmx.at>
Parent: e9f7c7ae

Showing 5 changed files with 389 additions and 363 deletions (+389 -363)
libswscale/swscale_internal.h               +9    -0
libswscale/utils.c                          +4    -170
libswscale/x86/Makefile                     +2    -1
libswscale/x86/hscale_fast_bilinear_simd.c  +372  -0
libswscale/x86/swscale_template.c           +2    -192
libswscale/swscale_internal.h

@@ -864,6 +864,15 @@ void ff_hyscale_fast_c(SwsContext *c, int16_t *dst, int dstWidth,
 void ff_hcscale_fast_c(SwsContext *c, int16_t *dst1, int16_t *dst2,
                        int dstWidth, const uint8_t *src1,
                        const uint8_t *src2, int srcW, int xInc);
+int ff_init_hscaler_mmxext(int dstW, int xInc, uint8_t *filterCode,
+                           int16_t *filter, int32_t *filterPos,
+                           int numSplits);
+void ff_hyscale_fast_mmxext(SwsContext *c, int16_t *dst,
+                            int dstWidth, const uint8_t *src,
+                            int srcW, int xInc);
+void ff_hcscale_fast_mmxext(SwsContext *c, int16_t *dst1, int16_t *dst2,
+                            int dstWidth, const uint8_t *src1,
+                            const uint8_t *src2, int srcW, int xInc);
 
 static inline void fillPlane16(uint8_t *plane, int stride, int width, int height, int y,
                                int alpha, int bits, const int big_endian)
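Note on the new entry point above: ff_init_hscaler_mmxext() is called twice, as the libswscale/utils.c hunks below show. With NULL buffers it only reports how many bytes of scaler code it would generate; with real buffers it emits the code fragments and fills the filter tables. A minimal sketch of that calling pattern for the luma plane (allocation and error handling are simplified here; the real logic, including the mmap/av_malloc choice under USE_MMAP, lives in sws_init_context()):

    if (c->canMMXEXTBeUsed && (flags & SWS_FAST_BILINEAR)) {
        /* pass 1: NULL buffers, only the generated-code size is returned */
        c->lumMmxextFilterCodeSize = ff_init_hscaler_mmxext(dstW, c->lumXInc,
                                                            NULL, NULL, NULL, 8);

        /* ... allocate executable c->lumMmxextFilterCode and the filter tables ... */

        /* pass 2: emit the code fragments and fill hLumFilter / hLumFilterPos */
        ff_init_hscaler_mmxext(dstW, c->lumXInc, c->lumMmxextFilterCode,
                               c->hLumFilter, (uint32_t *) c->hLumFilterPos, 8);
    }

The chroma plane follows the same pattern with c->chrXInc, the hChr* tables and numSplits = 4.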
libswscale/utils.c

@@ -681,172 +681,6 @@ fail:
     return ret;
 }
 
-#if HAVE_MMXEXT_INLINE
-static av_cold int init_hscaler_mmxext(int dstW, int xInc, uint8_t *filterCode,
-                                       int16_t *filter, int32_t *filterPos,
-                                       int numSplits)
-{
-    uint8_t *fragmentA;
-    x86_reg imm8OfPShufW1A;
-    x86_reg imm8OfPShufW2A;
-    x86_reg fragmentLengthA;
-    uint8_t *fragmentB;
-    x86_reg imm8OfPShufW1B;
-    x86_reg imm8OfPShufW2B;
-    x86_reg fragmentLengthB;
-    int fragmentPos;
-
-    int xpos, i;
-
-    // create an optimized horizontal scaling routine
-    /* This scaler is made of runtime-generated MMXEXT code using specially tuned
-     * pshufw instructions. For every four output pixels, if four input pixels
-     * are enough for the fast bilinear scaling, then a chunk of fragmentB is
-     * used. If five input pixels are needed, then a chunk of fragmentA is used.
-     */
-
-    // code fragment
-
-    __asm__ volatile (
-        "jmp 9f \n\t"
-        // Begin
-        "0: \n\t"
-        "movq (%%"REG_d", %%"REG_a"), %%mm3 \n\t"
-        "movd (%%"REG_c", %%"REG_S"), %%mm0 \n\t"
-        "movd 1(%%"REG_c", %%"REG_S"), %%mm1 \n\t"
-        "punpcklbw %%mm7, %%mm1 \n\t"
-        "punpcklbw %%mm7, %%mm0 \n\t"
-        "pshufw $0xFF, %%mm1, %%mm1 \n\t"
-        "1: \n\t"
-        "pshufw $0xFF, %%mm0, %%mm0 \n\t"
-        "2: \n\t"
-        "psubw %%mm1, %%mm0 \n\t"
-        "movl 8(%%"REG_b", %%"REG_a"), %%esi \n\t"
-        "pmullw %%mm3, %%mm0 \n\t"
-        "psllw $7, %%mm1 \n\t"
-        "paddw %%mm1, %%mm0 \n\t"
-        "movq %%mm0, (%%"REG_D", %%"REG_a") \n\t"
-        "add $8, %%"REG_a" \n\t"
-        // End
-        "9: \n\t"
-        // "int $3 \n\t"
-        "lea " LOCAL_MANGLE(0b) ", %0 \n\t"
-        "lea " LOCAL_MANGLE(1b) ", %1 \n\t"
-        "lea " LOCAL_MANGLE(2b) ", %2 \n\t"
-        "dec %1 \n\t"
-        "dec %2 \n\t"
-        "sub %0, %1 \n\t"
-        "sub %0, %2 \n\t"
-        "lea " LOCAL_MANGLE(9b) ", %3 \n\t"
-        "sub %0, %3 \n\t"
-
-        : "=r" (fragmentA), "=r" (imm8OfPShufW1A), "=r" (imm8OfPShufW2A),
-          "=r" (fragmentLengthA)
-    );
-
-    __asm__ volatile (
-        "jmp 9f \n\t"
-        // Begin
-        "0: \n\t"
-        "movq (%%"REG_d", %%"REG_a"), %%mm3 \n\t"
-        "movd (%%"REG_c", %%"REG_S"), %%mm0 \n\t"
-        "punpcklbw %%mm7, %%mm0 \n\t"
-        "pshufw $0xFF, %%mm0, %%mm1 \n\t"
-        "1: \n\t"
-        "pshufw $0xFF, %%mm0, %%mm0 \n\t"
-        "2: \n\t"
-        "psubw %%mm1, %%mm0 \n\t"
-        "movl 8(%%"REG_b", %%"REG_a"), %%esi \n\t"
-        "pmullw %%mm3, %%mm0 \n\t"
-        "psllw $7, %%mm1 \n\t"
-        "paddw %%mm1, %%mm0 \n\t"
-        "movq %%mm0, (%%"REG_D", %%"REG_a") \n\t"
-        "add $8, %%"REG_a" \n\t"
-        // End
-        "9: \n\t"
-        // "int $3 \n\t"
-        "lea " LOCAL_MANGLE(0b) ", %0 \n\t"
-        "lea " LOCAL_MANGLE(1b) ", %1 \n\t"
-        "lea " LOCAL_MANGLE(2b) ", %2 \n\t"
-        "dec %1 \n\t"
-        "dec %2 \n\t"
-        "sub %0, %1 \n\t"
-        "sub %0, %2 \n\t"
-        "lea " LOCAL_MANGLE(9b) ", %3 \n\t"
-        "sub %0, %3 \n\t"
-
-        : "=r" (fragmentB), "=r" (imm8OfPShufW1B), "=r" (imm8OfPShufW2B),
-          "=r" (fragmentLengthB)
-    );
-
-    xpos        = 0; // lumXInc/2 - 0x8000; // difference between pixel centers
-    fragmentPos = 0;
-
-    for (i = 0; i < dstW / numSplits; i++) {
-        int xx = xpos >> 16;
-
-        if ((i & 3) == 0) {
-            int a                  = 0;
-            int b                  = ((xpos + xInc) >> 16) - xx;
-            int c                  = ((xpos + xInc * 2) >> 16) - xx;
-            int d                  = ((xpos + xInc * 3) >> 16) - xx;
-            int inc                = (d + 1 < 4);
-            uint8_t *fragment      = inc ? fragmentB : fragmentA;
-            x86_reg imm8OfPShufW1  = inc ? imm8OfPShufW1B : imm8OfPShufW1A;
-            x86_reg imm8OfPShufW2  = inc ? imm8OfPShufW2B : imm8OfPShufW2A;
-            x86_reg fragmentLength = inc ? fragmentLengthB : fragmentLengthA;
-            int maxShift           = 3 - (d + inc);
-            int shift              = 0;
-
-            if (filterCode) {
-                filter[i]        = ((xpos & 0xFFFF) ^ 0xFFFF) >> 9;
-                filter[i + 1]    = (((xpos + xInc) & 0xFFFF) ^ 0xFFFF) >> 9;
-                filter[i + 2]    = (((xpos + xInc * 2) & 0xFFFF) ^ 0xFFFF) >> 9;
-                filter[i + 3]    = (((xpos + xInc * 3) & 0xFFFF) ^ 0xFFFF) >> 9;
-                filterPos[i / 2] = xx;
-
-                memcpy(filterCode + fragmentPos, fragment, fragmentLength);
-
-                filterCode[fragmentPos + imm8OfPShufW1] = (a + inc) |
-                                                          ((b + inc) << 2) |
-                                                          ((c + inc) << 4) |
-                                                          ((d + inc) << 6);
-                filterCode[fragmentPos + imm8OfPShufW2] = a | (b << 2) |
-                                                          (c << 4) | (d << 6);
-
-                if (i + 4 - inc >= dstW)
-                    shift = maxShift;               // avoid overread
-                else if ((filterPos[i / 2] & 3) <= maxShift)
-                    shift = filterPos[i / 2] & 3;   // align
-
-                if (shift && i >= shift) {
-                    filterCode[fragmentPos + imm8OfPShufW1] += 0x55 * shift;
-                    filterCode[fragmentPos + imm8OfPShufW2] += 0x55 * shift;
-                    filterPos[i / 2]                        -= shift;
-                }
-            }
-
-            fragmentPos += fragmentLength;
-
-            if (filterCode)
-                filterCode[fragmentPos] = RET;
-        }
-        xpos += xInc;
-    }
-    if (filterCode)
-        filterPos[((i / 2) + 1) & (~1)] = xpos >> 16; // needed to jump to the next part
-
-    return fragmentPos + 1;
-}
-#endif /* HAVE_MMXEXT_INLINE */
-
 static void fill_rgb2yuv_table(SwsContext *c, const int table[4], int dstRange)
 {
     int64_t W, V, Z, Cy, Cu, Cv;
@@ -1400,9 +1234,9 @@ av_cold int sws_init_context(SwsContext *c, SwsFilter *srcFilter,
 #if HAVE_MMXEXT_INLINE
     // can't downscale !!!
     if (c->canMMXEXTBeUsed && (flags & SWS_FAST_BILINEAR)) {
-        c->lumMmxextFilterCodeSize = init_hscaler_mmxext(dstW, c->lumXInc, NULL,
-                                                         NULL, NULL, 8);
-        c->chrMmxextFilterCodeSize = init_hscaler_mmxext(c->chrDstW, c->chrXInc,
-                                                         NULL, NULL, NULL, 4);
+        c->lumMmxextFilterCodeSize = ff_init_hscaler_mmxext(dstW, c->lumXInc, NULL,
+                                                            NULL, NULL, 8);
+        c->chrMmxextFilterCodeSize = ff_init_hscaler_mmxext(c->chrDstW, c->chrXInc,
+                                                            NULL, NULL, NULL, 4);
 
 #if USE_MMAP
@@ -1443,9 +1277,9 @@ av_cold int sws_init_context(SwsContext *c, SwsFilter *srcFilter,
         FF_ALLOCZ_OR_GOTO(c, c->hLumFilterPos, (dstW / 2 / 8 + 8) * sizeof(int32_t), fail);
         FF_ALLOCZ_OR_GOTO(c, c->hChrFilterPos, (c->chrDstW / 2 / 4 + 8) * sizeof(int32_t), fail);
 
-        init_hscaler_mmxext(dstW, c->lumXInc, c->lumMmxextFilterCode,
-                            c->hLumFilter, (uint32_t *) c->hLumFilterPos, 8);
-        init_hscaler_mmxext(c->chrDstW, c->chrXInc, c->chrMmxextFilterCode,
-                            c->hChrFilter, (uint32_t *) c->hChrFilterPos, 4);
+        ff_init_hscaler_mmxext(dstW, c->lumXInc, c->lumMmxextFilterCode,
+                               c->hLumFilter, (uint32_t *) c->hLumFilterPos, 8);
+        ff_init_hscaler_mmxext(c->chrDstW, c->chrXInc, c->chrMmxextFilterCode,
+                               c->hChrFilter, (uint32_t *) c->hChrFilterPos, 4);
 
 #if USE_MMAP
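For comparison with the generated code used above, the portable fallback ff_hyscale_fast_c() (declared in the swscale_internal.h hunk) performs the same fast bilinear interpolation in plain C. A sketch of its inner loop follows, written as a hypothetical free-standing helper for illustration; xpos/xInc are 16.16 fixed point and the result is scaled by 128, matching what the MMXEXT fragments produce (they store the weight in complemented form, ((xpos & 0xFFFF) ^ 0xFFFF) >> 9, but compute an equivalent value):

    /* Scalar sketch of the fast bilinear horizontal scaler (one plane). */
    static void hyscale_fast_sketch(int16_t *dst, int dstWidth,
                                    const uint8_t *src, int xInc)
    {
        int i;
        unsigned int xpos = 0;

        for (i = 0; i < dstWidth; i++) {
            unsigned int xx     = xpos >> 16;           /* integer source position */
            unsigned int xalpha = (xpos & 0xFFFF) >> 9; /* 7-bit fractional weight  */

            dst[i] = (src[xx] << 7) + (src[xx + 1] - src[xx]) * xalpha;
            xpos  += xInc;
        }
    }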
libswscale/x86/Makefile

 $(SUBDIR)x86/swscale_mmx.o: CFLAGS += $(NOREDZONE_FLAGS)
 
-OBJS += x86/rgb2rgb.o                   \
+OBJS += x86/hscale_fast_bilinear_simd.o \
+        x86/rgb2rgb.o                   \
         x86/swscale.o                   \
         x86/yuv2rgb.o                   \
libswscale/x86/hscale_fast_bilinear_simd.c (new file, 0 → 100644)

/*
* Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "../swscale_internal.h"
#include "libavutil/x86/asm.h"
#include "libavutil/x86/cpu.h"
#define RET 0xC3 // near return opcode for x86
#define PREFETCH "prefetchnta"

av_cold int ff_init_hscaler_mmxext(int dstW, int xInc, uint8_t *filterCode,
                                   int16_t *filter, int32_t *filterPos,
                                   int numSplits)
{
    uint8_t *fragmentA;
    x86_reg imm8OfPShufW1A;
    x86_reg imm8OfPShufW2A;
    x86_reg fragmentLengthA;
    uint8_t *fragmentB;
    x86_reg imm8OfPShufW1B;
    x86_reg imm8OfPShufW2B;
    x86_reg fragmentLengthB;
    int fragmentPos;

    int xpos, i;

    // create an optimized horizontal scaling routine
    /* This scaler is made of runtime-generated MMXEXT code using specially tuned
     * pshufw instructions. For every four output pixels, if four input pixels
     * are enough for the fast bilinear scaling, then a chunk of fragmentB is
     * used. If five input pixels are needed, then a chunk of fragmentA is used.
     */

    // code fragment

    __asm__ volatile (
        "jmp 9f \n\t"
        // Begin
        "0: \n\t"
        "movq (%%"REG_d", %%"REG_a"), %%mm3 \n\t"
        "movd (%%"REG_c", %%"REG_S"), %%mm0 \n\t"
        "movd 1(%%"REG_c", %%"REG_S"), %%mm1 \n\t"
        "punpcklbw %%mm7, %%mm1 \n\t"
        "punpcklbw %%mm7, %%mm0 \n\t"
        "pshufw $0xFF, %%mm1, %%mm1 \n\t"
        "1: \n\t"
        "pshufw $0xFF, %%mm0, %%mm0 \n\t"
        "2: \n\t"
        "psubw %%mm1, %%mm0 \n\t"
        "movl 8(%%"REG_b", %%"REG_a"), %%esi \n\t"
        "pmullw %%mm3, %%mm0 \n\t"
        "psllw $7, %%mm1 \n\t"
        "paddw %%mm1, %%mm0 \n\t"
        "movq %%mm0, (%%"REG_D", %%"REG_a") \n\t"
        "add $8, %%"REG_a" \n\t"
        // End
        "9: \n\t"
        // "int $3 \n\t"
        "lea " LOCAL_MANGLE(0b) ", %0 \n\t"
        "lea " LOCAL_MANGLE(1b) ", %1 \n\t"
        "lea " LOCAL_MANGLE(2b) ", %2 \n\t"
        "dec %1 \n\t"
        "dec %2 \n\t"
        "sub %0, %1 \n\t"
        "sub %0, %2 \n\t"
        "lea " LOCAL_MANGLE(9b) ", %3 \n\t"
        "sub %0, %3 \n\t"

        : "=r" (fragmentA), "=r" (imm8OfPShufW1A), "=r" (imm8OfPShufW2A),
          "=r" (fragmentLengthA)
    );

    __asm__ volatile (
        "jmp 9f \n\t"
        // Begin
        "0: \n\t"
        "movq (%%"REG_d", %%"REG_a"), %%mm3 \n\t"
        "movd (%%"REG_c", %%"REG_S"), %%mm0 \n\t"
        "punpcklbw %%mm7, %%mm0 \n\t"
        "pshufw $0xFF, %%mm0, %%mm1 \n\t"
        "1: \n\t"
        "pshufw $0xFF, %%mm0, %%mm0 \n\t"
        "2: \n\t"
        "psubw %%mm1, %%mm0 \n\t"
        "movl 8(%%"REG_b", %%"REG_a"), %%esi \n\t"
        "pmullw %%mm3, %%mm0 \n\t"
        "psllw $7, %%mm1 \n\t"
        "paddw %%mm1, %%mm0 \n\t"
        "movq %%mm0, (%%"REG_D", %%"REG_a") \n\t"
        "add $8, %%"REG_a" \n\t"
        // End
        "9: \n\t"
        // "int $3 \n\t"
        "lea " LOCAL_MANGLE(0b) ", %0 \n\t"
        "lea " LOCAL_MANGLE(1b) ", %1 \n\t"
        "lea " LOCAL_MANGLE(2b) ", %2 \n\t"
        "dec %1 \n\t"
        "dec %2 \n\t"
        "sub %0, %1 \n\t"
        "sub %0, %2 \n\t"
        "lea " LOCAL_MANGLE(9b) ", %3 \n\t"
        "sub %0, %3 \n\t"

        : "=r" (fragmentB), "=r" (imm8OfPShufW1B), "=r" (imm8OfPShufW2B),
          "=r" (fragmentLengthB)
    );

    xpos        = 0; // lumXInc/2 - 0x8000; // difference between pixel centers
    fragmentPos = 0;

    for (i = 0; i < dstW / numSplits; i++) {
        int xx = xpos >> 16;

        if ((i & 3) == 0) {
            int a                  = 0;
            int b                  = ((xpos + xInc) >> 16) - xx;
            int c                  = ((xpos + xInc * 2) >> 16) - xx;
            int d                  = ((xpos + xInc * 3) >> 16) - xx;
            int inc                = (d + 1 < 4);
            uint8_t *fragment      = inc ? fragmentB : fragmentA;
            x86_reg imm8OfPShufW1  = inc ? imm8OfPShufW1B : imm8OfPShufW1A;
            x86_reg imm8OfPShufW2  = inc ? imm8OfPShufW2B : imm8OfPShufW2A;
            x86_reg fragmentLength = inc ? fragmentLengthB : fragmentLengthA;
            int maxShift           = 3 - (d + inc);
            int shift              = 0;

            if (filterCode) {
                filter[i]        = ((xpos & 0xFFFF) ^ 0xFFFF) >> 9;
                filter[i + 1]    = (((xpos + xInc) & 0xFFFF) ^ 0xFFFF) >> 9;
                filter[i + 2]    = (((xpos + xInc * 2) & 0xFFFF) ^ 0xFFFF) >> 9;
                filter[i + 3]    = (((xpos + xInc * 3) & 0xFFFF) ^ 0xFFFF) >> 9;
                filterPos[i / 2] = xx;

                memcpy(filterCode + fragmentPos, fragment, fragmentLength);

                filterCode[fragmentPos + imm8OfPShufW1] = (a + inc) |
                                                          ((b + inc) << 2) |
                                                          ((c + inc) << 4) |
                                                          ((d + inc) << 6);
                filterCode[fragmentPos + imm8OfPShufW2] = a | (b << 2) |
                                                          (c << 4) | (d << 6);

                if (i + 4 - inc >= dstW)
                    shift = maxShift;               // avoid overread
                else if ((filterPos[i / 2] & 3) <= maxShift)
                    shift = filterPos[i / 2] & 3;   // align

                if (shift && i >= shift) {
                    filterCode[fragmentPos + imm8OfPShufW1] += 0x55 * shift;
                    filterCode[fragmentPos + imm8OfPShufW2] += 0x55 * shift;
                    filterPos[i / 2]                        -= shift;
                }
            }

            fragmentPos += fragmentLength;

            if (filterCode)
                filterCode[fragmentPos] = RET;
        }
        xpos += xInc;
    }
    if (filterCode)
        filterPos[((i / 2) + 1) & (~1)] = xpos >> 16; // needed to jump to the next part

    return fragmentPos + 1;
}

void ff_hyscale_fast_mmxext(SwsContext *c, int16_t *dst,
                            int dstWidth, const uint8_t *src,
                            int srcW, int xInc)
{
    int32_t *filterPos = c->hLumFilterPos;
    int16_t *filter    = c->hLumFilter;
    void    *mmxextFilterCode = c->lumMmxextFilterCode;
    int i;
#if defined(PIC)
    uint64_t ebxsave;
#endif
#if ARCH_X86_64
    uint64_t retsave;
#endif

    __asm__ volatile(
#if defined(PIC)
        "mov %%"REG_b", %5 \n\t"
#if ARCH_X86_64
        "mov -8(%%rsp), %%"REG_a" \n\t"
        "mov %%"REG_a", %6 \n\t"
#endif
#else
#if ARCH_X86_64
        "mov -8(%%rsp), %%"REG_a" \n\t"
        "mov %%"REG_a", %5 \n\t"
#endif
#endif
        "pxor %%mm7, %%mm7 \n\t"
        "mov %0, %%"REG_c" \n\t"
        "mov %1, %%"REG_D" \n\t"
        "mov %2, %%"REG_d" \n\t"
        "mov %3, %%"REG_b" \n\t"
        "xor %%"REG_a", %%"REG_a" \n\t" // i
        PREFETCH" (%%"REG_c") \n\t"
        PREFETCH" 32(%%"REG_c") \n\t"
        PREFETCH" 64(%%"REG_c") \n\t"

#if ARCH_X86_64
#define CALL_MMXEXT_FILTER_CODE \
        "movl (%%"REG_b"), %%esi \n\t"\
        "call *%4 \n\t"\
        "movl (%%"REG_b", %%"REG_a"), %%esi \n\t"\
        "add %%"REG_S", %%"REG_c" \n\t"\
        "add %%"REG_a", %%"REG_D" \n\t"\
        "xor %%"REG_a", %%"REG_a" \n\t"\

#else
#define CALL_MMXEXT_FILTER_CODE \
        "movl (%%"REG_b"), %%esi \n\t"\
        "call *%4 \n\t"\
        "addl (%%"REG_b", %%"REG_a"), %%"REG_c" \n\t"\
        "add %%"REG_a", %%"REG_D" \n\t"\
        "xor %%"REG_a", %%"REG_a" \n\t"\

#endif /* ARCH_X86_64 */

        CALL_MMXEXT_FILTER_CODE
        CALL_MMXEXT_FILTER_CODE
        CALL_MMXEXT_FILTER_CODE
        CALL_MMXEXT_FILTER_CODE
        CALL_MMXEXT_FILTER_CODE
        CALL_MMXEXT_FILTER_CODE
        CALL_MMXEXT_FILTER_CODE
        CALL_MMXEXT_FILTER_CODE

#if defined(PIC)
        "mov %5, %%"REG_b" \n\t"
#if ARCH_X86_64
        "mov %6, %%"REG_a" \n\t"
        "mov %%"REG_a", -8(%%rsp) \n\t"
#endif
#else
#if ARCH_X86_64
        "mov %5, %%"REG_a" \n\t"
        "mov %%"REG_a", -8(%%rsp) \n\t"
#endif
#endif
        :: "m" (src), "m" (dst), "m" (filter), "m" (filterPos),
           "m" (mmxextFilterCode)
#if defined(PIC)
          ,"m" (ebxsave)
#endif
#if ARCH_X86_64
          ,"m"(retsave)
#endif
        : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
#if !defined(PIC)
         ,"%"REG_b
#endif
    );

    for (i = dstWidth - 1; (i * xInc) >> 16 >= srcW - 1; i--)
        dst[i] = src[srcW - 1] * 128;
}

void ff_hcscale_fast_mmxext(SwsContext *c, int16_t *dst1, int16_t *dst2,
                            int dstWidth, const uint8_t *src1,
                            const uint8_t *src2, int srcW, int xInc)
{
    int32_t *filterPos = c->hChrFilterPos;
    int16_t *filter    = c->hChrFilter;
    void    *mmxextFilterCode = c->chrMmxextFilterCode;
    int i;
#if defined(PIC)
    DECLARE_ALIGNED(8, uint64_t, ebxsave);
#endif
#if ARCH_X86_64
    DECLARE_ALIGNED(8, uint64_t, retsave);
#endif

    __asm__ volatile(
#if defined(PIC)
        "mov %%"REG_b", %7 \n\t"
#if ARCH_X86_64
        "mov -8(%%rsp), %%"REG_a" \n\t"
        "mov %%"REG_a", %8 \n\t"
#endif
#else
#if ARCH_X86_64
        "mov -8(%%rsp), %%"REG_a" \n\t"
        "mov %%"REG_a", %7 \n\t"
#endif
#endif
        "pxor %%mm7, %%mm7 \n\t"
        "mov %0, %%"REG_c" \n\t"
        "mov %1, %%"REG_D" \n\t"
        "mov %2, %%"REG_d" \n\t"
        "mov %3, %%"REG_b" \n\t"
        "xor %%"REG_a", %%"REG_a" \n\t" // i
        PREFETCH" (%%"REG_c") \n\t"
        PREFETCH" 32(%%"REG_c") \n\t"
        PREFETCH" 64(%%"REG_c") \n\t"

        CALL_MMXEXT_FILTER_CODE
        CALL_MMXEXT_FILTER_CODE
        CALL_MMXEXT_FILTER_CODE
        CALL_MMXEXT_FILTER_CODE
        "xor %%"REG_a", %%"REG_a" \n\t" // i
        "mov %5, %%"REG_c" \n\t" // src
        "mov %6, %%"REG_D" \n\t" // buf2
        PREFETCH" (%%"REG_c") \n\t"
        PREFETCH" 32(%%"REG_c") \n\t"
        PREFETCH" 64(%%"REG_c") \n\t"

        CALL_MMXEXT_FILTER_CODE
        CALL_MMXEXT_FILTER_CODE
        CALL_MMXEXT_FILTER_CODE
        CALL_MMXEXT_FILTER_CODE

#if defined(PIC)
        "mov %7, %%"REG_b" \n\t"
#if ARCH_X86_64
        "mov %8, %%"REG_a" \n\t"
        "mov %%"REG_a", -8(%%rsp) \n\t"
#endif
#else
#if ARCH_X86_64
        "mov %7, %%"REG_a" \n\t"
        "mov %%"REG_a", -8(%%rsp) \n\t"
#endif
#endif
        :: "m" (src1), "m" (dst1), "m" (filter), "m" (filterPos),
           "m" (mmxextFilterCode), "m" (src2), "m"(dst2)
#if defined(PIC)
          ,"m" (ebxsave)
#endif
#if ARCH_X86_64
          ,"m"(retsave)
#endif
        : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
#if !defined(PIC)
         ,"%"REG_b
#endif
    );

    for (i = dstWidth - 1; (i * xInc) >> 16 >= srcW - 1; i--) {
        dst1[i] = src1[srcW - 1] * 128;
        dst2[i] = src2[srcW - 1] * 128;
    }
}
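How the pieces above fit together: ff_init_hscaler_mmxext() concatenates one copy of fragmentA or fragmentB per group of four output pixels into filterCode, patches the two pshufw immediates so each copy picks the right source bytes, and appends the single-byte RET opcode after each copy; ff_hyscale_fast_mmxext() and ff_hcscale_fast_mmxext() then repeatedly "call *%4" into that buffer with REG_c pointing at the source line, REG_D at the destination, REG_d at the filter coefficients and REG_b at filterPos. As a rough C model of what one generated fragment computes (illustration only, not code from this commit; off[k] stands for the per-pixel offsets a, b, c, d and coeff[k] for the complemented weights stored in the filter table):

    for (k = 0; k < 4; k++)
        dst[i + k] = src[xx + off[k] + 1] * 128
                   + (src[xx + off[k]] - src[xx + off[k] + 1]) * coeff[k];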
libswscale/x86/swscale_template.c

@@ -28,11 +28,6 @@
 #undef MOVNTQ2
 #undef PREFETCH
 
-#if COMPILE_TEMPLATE_MMXEXT
-#define PREFETCH "prefetchnta"
-#else
-#define PREFETCH " # nop"
-#endif
 
 #if COMPILE_TEMPLATE_MMXEXT
 #define REAL_MOVNTQ(a,b) "movntq " #a ", " #b " \n\t"
@@ -1470,191 +1465,6 @@ static void RENAME(yuv2yuyv422_1)(SwsContext *c, const int16_t *buf0,
         );
     }
 }
 
-#if COMPILE_TEMPLATE_MMXEXT
-static void RENAME(hyscale_fast)(SwsContext *c, int16_t *dst,
-                                 int dstWidth, const uint8_t *src,
-                                 int srcW, int xInc)
-{
-    int32_t *filterPos = c->hLumFilterPos;
-    int16_t *filter    = c->hLumFilter;
-    void    *mmxextFilterCode = c->lumMmxextFilterCode;
-    int i;
-#if defined(PIC)
-    uint64_t ebxsave;
-#endif
-#if ARCH_X86_64
-    uint64_t retsave;
-#endif
-
-    __asm__ volatile(
-#if defined(PIC)
-        "mov %%"REG_b", %5 \n\t"
-#if ARCH_X86_64
-        "mov -8(%%rsp), %%"REG_a" \n\t"
-        "mov %%"REG_a", %6 \n\t"
-#endif
-#else
-#if ARCH_X86_64
-        "mov -8(%%rsp), %%"REG_a" \n\t"
-        "mov %%"REG_a", %5 \n\t"
-#endif
-#endif
-        "pxor %%mm7, %%mm7 \n\t"
-        "mov %0, %%"REG_c" \n\t"
-        "mov %1, %%"REG_D" \n\t"
-        "mov %2, %%"REG_d" \n\t"
-        "mov %3, %%"REG_b" \n\t"
-        "xor %%"REG_a", %%"REG_a" \n\t" // i
-        PREFETCH" (%%"REG_c") \n\t"
-        PREFETCH" 32(%%"REG_c") \n\t"
-        PREFETCH" 64(%%"REG_c") \n\t"
-
-#if ARCH_X86_64
-#define CALL_MMXEXT_FILTER_CODE \
-        "movl (%%"REG_b"), %%esi \n\t"\
-        "call *%4 \n\t"\
-        "movl (%%"REG_b", %%"REG_a"), %%esi \n\t"\
-        "add %%"REG_S", %%"REG_c" \n\t"\
-        "add %%"REG_a", %%"REG_D" \n\t"\
-        "xor %%"REG_a", %%"REG_a" \n\t"\
-
-#else
-#define CALL_MMXEXT_FILTER_CODE \
-        "movl (%%"REG_b"), %%esi \n\t"\
-        "call *%4 \n\t"\
-        "addl (%%"REG_b", %%"REG_a"), %%"REG_c" \n\t"\
-        "add %%"REG_a", %%"REG_D" \n\t"\
-        "xor %%"REG_a", %%"REG_a" \n\t"\
-
-#endif /* ARCH_X86_64 */
-
-        CALL_MMXEXT_FILTER_CODE
-        CALL_MMXEXT_FILTER_CODE
-        CALL_MMXEXT_FILTER_CODE
-        CALL_MMXEXT_FILTER_CODE
-        CALL_MMXEXT_FILTER_CODE
-        CALL_MMXEXT_FILTER_CODE
-        CALL_MMXEXT_FILTER_CODE
-        CALL_MMXEXT_FILTER_CODE
-
-#if defined(PIC)
-        "mov %5, %%"REG_b" \n\t"
-#if ARCH_X86_64
-        "mov %6, %%"REG_a" \n\t"
-        "mov %%"REG_a", -8(%%rsp) \n\t"
-#endif
-#else
-#if ARCH_X86_64
-        "mov %5, %%"REG_a" \n\t"
-        "mov %%"REG_a", -8(%%rsp) \n\t"
-#endif
-#endif
-        :: "m" (src), "m" (dst), "m" (filter), "m" (filterPos),
-           "m" (mmxextFilterCode)
-#if defined(PIC)
-          ,"m" (ebxsave)
-#endif
-#if ARCH_X86_64
-          ,"m"(retsave)
-#endif
-        : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
-#if !defined(PIC)
-         ,"%"REG_b
-#endif
-    );
-
-    for (i = dstWidth - 1; (i * xInc) >> 16 >= srcW - 1; i--)
-        dst[i] = src[srcW - 1] * 128;
-}
-
-static void RENAME(hcscale_fast)(SwsContext *c, int16_t *dst1, int16_t *dst2,
-                                 int dstWidth, const uint8_t *src1,
-                                 const uint8_t *src2, int srcW, int xInc)
-{
-    int32_t *filterPos = c->hChrFilterPos;
-    int16_t *filter    = c->hChrFilter;
-    void    *mmxextFilterCode = c->chrMmxextFilterCode;
-    int i;
-#if defined(PIC)
-    DECLARE_ALIGNED(8, uint64_t, ebxsave);
-#endif
-#if ARCH_X86_64
-    DECLARE_ALIGNED(8, uint64_t, retsave);
-#endif
-
-    __asm__ volatile(
-#if defined(PIC)
-        "mov %%"REG_b", %7 \n\t"
-#if ARCH_X86_64
-        "mov -8(%%rsp), %%"REG_a" \n\t"
-        "mov %%"REG_a", %8 \n\t"
-#endif
-#else
-#if ARCH_X86_64
-        "mov -8(%%rsp), %%"REG_a" \n\t"
-        "mov %%"REG_a", %7 \n\t"
-#endif
-#endif
-        "pxor %%mm7, %%mm7 \n\t"
-        "mov %0, %%"REG_c" \n\t"
-        "mov %1, %%"REG_D" \n\t"
-        "mov %2, %%"REG_d" \n\t"
-        "mov %3, %%"REG_b" \n\t"
-        "xor %%"REG_a", %%"REG_a" \n\t" // i
-        PREFETCH" (%%"REG_c") \n\t"
-        PREFETCH" 32(%%"REG_c") \n\t"
-        PREFETCH" 64(%%"REG_c") \n\t"
-
-        CALL_MMXEXT_FILTER_CODE
-        CALL_MMXEXT_FILTER_CODE
-        CALL_MMXEXT_FILTER_CODE
-        CALL_MMXEXT_FILTER_CODE
-        "xor %%"REG_a", %%"REG_a" \n\t" // i
-        "mov %5, %%"REG_c" \n\t" // src
-        "mov %6, %%"REG_D" \n\t" // buf2
-        PREFETCH" (%%"REG_c") \n\t"
-        PREFETCH" 32(%%"REG_c") \n\t"
-        PREFETCH" 64(%%"REG_c") \n\t"
-
-        CALL_MMXEXT_FILTER_CODE
-        CALL_MMXEXT_FILTER_CODE
-        CALL_MMXEXT_FILTER_CODE
-        CALL_MMXEXT_FILTER_CODE
-
-#if defined(PIC)
-        "mov %7, %%"REG_b" \n\t"
-#if ARCH_X86_64
-        "mov %8, %%"REG_a" \n\t"
-        "mov %%"REG_a", -8(%%rsp) \n\t"
-#endif
-#else
-#if ARCH_X86_64
-        "mov %7, %%"REG_a" \n\t"
-        "mov %%"REG_a", -8(%%rsp) \n\t"
-#endif
-#endif
-        :: "m" (src1), "m" (dst1), "m" (filter), "m" (filterPos),
-           "m" (mmxextFilterCode), "m" (src2), "m"(dst2)
-#if defined(PIC)
-          ,"m" (ebxsave)
-#endif
-#if ARCH_X86_64
-          ,"m"(retsave)
-#endif
-        : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
-#if !defined(PIC)
-         ,"%"REG_b
-#endif
-    );
-
-    for (i = dstWidth - 1; (i * xInc) >> 16 >= srcW - 1; i--) {
-        dst1[i] = src1[srcW - 1] * 128;
-        dst2[i] = src2[srcW - 1] * 128;
-    }
-}
-#endif /* COMPILE_TEMPLATE_MMXEXT */
-
 static av_cold void RENAME(sws_init_swscale)(SwsContext *c)
 {
     enum AVPixelFormat dstFormat = c->dstFormat;
@@ -1723,8 +1533,8 @@ static av_cold void RENAME(sws_init_swscale)(SwsContext *c)
     // Use the new MMX scaler if the MMXEXT one can't be used (it is faster than the x86 ASM one).
 #if COMPILE_TEMPLATE_MMXEXT
     if (c->flags & SWS_FAST_BILINEAR && c->canMMXEXTBeUsed) {
-        c->hyscale_fast = RENAME(hyscale_fast);
-        c->hcscale_fast = RENAME(hcscale_fast);
+        c->hyscale_fast = ff_hyscale_fast_mmxext;
+        c->hcscale_fast = ff_hcscale_fast_mmxext;
     } else {
 #endif /* COMPILE_TEMPLATE_MMXEXT */
         c->hyscale_fast = NULL;
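With this hunk the template no longer carries its own fast bilinear routines; sws_init_swscale() simply points c->hyscale_fast and c->hcscale_fast at the new extern functions. The consuming side in libswscale/swscale.c is not part of this diff and is unchanged; schematically (a sketch, not the literal code) the horizontal scaling step does:

    if (c->hyscale_fast) {
        /* fast bilinear path: run the runtime-generated MMXEXT code */
        c->hyscale_fast(c, dst, dstWidth, src, srcW, c->lumXInc);
    } else {
        /* generic path: the full FIR horizontal scaler driven by
         * hLumFilter / hLumFilterPos / hLumFilterSize */
    }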