Skip to content
Projects
Groups
Snippets
Help
Loading...
Sign in / Register
Toggle navigation
F
ffmpeg
Project
Project
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Packages
Packages
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
submodule
ffmpeg
Commits
3c172a41
Commit
3c172a41
authored
Jan 13, 2012
by
Ronald S. Bultje
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
swscale: change yuv2yuvX code to use cpuflag().
parent
57facb73
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
59 additions
and
60 deletions
+59
-60
output.asm
libswscale/x86/output.asm
+59
-60
No files found.
libswscale/x86/output.asm
View file @
3c172a41
...
@@ -56,7 +56,7 @@ SECTION .text
...
@@ -56,7 +56,7 @@ SECTION .text
; of 2. $offset is either 0 or 3. $dither holds 8 values.
; of 2. $offset is either 0 or 3. $dither holds 8 values.
;-----------------------------------------------------------------------------
;-----------------------------------------------------------------------------
%macro
yuv2planeX_fn
4
%macro
yuv2planeX_fn
3
%ifdef
ARCH_X86_32
%ifdef
ARCH_X86_32
%define
cntr_reg
r1
%define
cntr_reg
r1
...
@@ -66,12 +66,12 @@ SECTION .text
...
@@ -66,12 +66,12 @@ SECTION .text
%define
movsx
movsxd
%define
movsx
movsxd
%endif
%endif
cglobal
yuv2planeX_
%
2
_
%1
,
%4
,
7
,
%3
cglobal
yuv2planeX_
%
1
,
%3
,
7
,
%2
%if
%
2
==
8
||
%2
==
9
||
%2
==
10
%if
%
1
==
8
||
%1
==
9
||
%1
==
10
pxor
m6
,
m6
pxor
m6
,
m6
%endif
; %
2
== 8/9/10
%endif
; %
1
== 8/9/10
%if
%
2
==
8
%if
%
1
==
8
%ifdef
ARCH_X86_32
%ifdef
ARCH_X86_32
%assign
pad
0x2c
-
(
stack_offset
&
15
)
%assign
pad
0x2c
-
(
stack_offset
&
15
)
SUB
rsp
,
pad
SUB
rsp
,
pad
...
@@ -120,7 +120,7 @@ cglobal yuv2planeX_%2_%1, %4, 7, %3
...
@@ -120,7 +120,7 @@ cglobal yuv2planeX_%2_%1, %4, 7, %3
mova
[
rsp
+
16
]
,
m3
mova
[
rsp
+
16
]
,
m3
mova
[
rsp
+
24
]
,
m_dith
mova
[
rsp
+
24
]
,
m_dith
%endif
; mmsize == 8/16
%endif
; mmsize == 8/16
%endif
; %
2
== 8
%endif
; %
1
== 8
xor
r5
,
r5
xor
r5
,
r5
...
@@ -130,11 +130,11 @@ cglobal yuv2planeX_%2_%1, %4, 7, %3
...
@@ -130,11 +130,11 @@ cglobal yuv2planeX_%2_%1, %4, 7, %3
; 8 pixels but we can only handle 2 pixels per register, and thus 4
; 8 pixels but we can only handle 2 pixels per register, and thus 4
; pixels per iteration. In order to not have to keep track of where
; pixels per iteration. In order to not have to keep track of where
; we are w.r.t. dithering, we unroll the mmx/8bit loop x2.
; we are w.r.t. dithering, we unroll the mmx/8bit loop x2.
%if
%
2
==
8
%if
%
1
==
8
%rep
16
/
mmsize
%rep
16
/
mmsize
%endif
; %
2
== 8
%endif
; %
1
== 8
%if
%
2
==
8
%if
%
1
==
8
%ifdef
ARCH_X86_32
%ifdef
ARCH_X86_32
mova
m2
,
[
rsp
+
mmsize
*
(
0
+
%%
i
)
]
mova
m2
,
[
rsp
+
mmsize
*
(
0
+
%%
i
)
]
mova
m1
,
[
rsp
+
mmsize
*
(
1
+
%%
i
)
]
mova
m1
,
[
rsp
+
mmsize
*
(
1
+
%%
i
)
]
...
@@ -142,31 +142,31 @@ cglobal yuv2planeX_%2_%1, %4, 7, %3
...
@@ -142,31 +142,31 @@ cglobal yuv2planeX_%2_%1, %4, 7, %3
mova
m2
,
m8
mova
m2
,
m8
mova
m1
,
m_dith
mova
m1
,
m_dith
%endif
; x86-32/64
%endif
; x86-32/64
%else
; %
2
== 9/10/16
%else
; %
1
== 9/10/16
mova
m1
,
[
yuv2yuvX_
%
2
_start
]
mova
m1
,
[
yuv2yuvX_
%
1
_start
]
mova
m2
,
m1
mova
m2
,
m1
%endif
; %
2
== 8/9/10/16
%endif
; %
1
== 8/9/10/16
movsx
cntr_reg
,
r1m
movsx
cntr_reg
,
r1m
.
filterloop_
%
+
%%
i
:
.
filterloop_
%
+
%%
i
:
; input pixels
; input pixels
mov
r6
,
[
r2
+
gprsize
*
cntr_reg
-
2
*
gprsize
]
mov
r6
,
[
r2
+
gprsize
*
cntr_reg
-
2
*
gprsize
]
%if
%
2
==
16
%if
%
1
==
16
mova
m3
,
[
r6
+
r5
*
4
]
mova
m3
,
[
r6
+
r5
*
4
]
mova
m5
,
[
r6
+
r5
*
4
+
mmsize
]
mova
m5
,
[
r6
+
r5
*
4
+
mmsize
]
%else
; %
2
== 8/9/10
%else
; %
1
== 8/9/10
mova
m3
,
[
r6
+
r5
*
2
]
mova
m3
,
[
r6
+
r5
*
2
]
%endif
; %
2
== 8/9/10/16
%endif
; %
1
== 8/9/10/16
mov
r6
,
[
r2
+
gprsize
*
cntr_reg
-
gprsize
]
mov
r6
,
[
r2
+
gprsize
*
cntr_reg
-
gprsize
]
%if
%
2
==
16
%if
%
1
==
16
mova
m4
,
[
r6
+
r5
*
4
]
mova
m4
,
[
r6
+
r5
*
4
]
mova
m6
,
[
r6
+
r5
*
4
+
mmsize
]
mova
m6
,
[
r6
+
r5
*
4
+
mmsize
]
%else
; %
2
== 8/9/10
%else
; %
1
== 8/9/10
mova
m4
,
[
r6
+
r5
*
2
]
mova
m4
,
[
r6
+
r5
*
2
]
%endif
; %
2
== 8/9/10/16
%endif
; %
1
== 8/9/10/16
; coefficients
; coefficients
movd
m0
,
[
r0
+
2
*
cntr_reg
-
4
]
; coeff[0], coeff[1]
movd
m0
,
[
r0
+
2
*
cntr_reg
-
4
]
; coeff[0], coeff[1]
%if
%
2
==
16
%if
%
1
==
16
pshuflw
m7
,
m0
,
0
; coeff[0]
pshuflw
m7
,
m0
,
0
; coeff[0]
pshuflw
m0
,
m0
,
0x55
; coeff[1]
pshuflw
m0
,
m0
,
0x55
; coeff[1]
pmovsxwd
m7
,
m7
; word -> dword
pmovsxwd
m7
,
m7
; word -> dword
...
@@ -181,7 +181,7 @@ cglobal yuv2planeX_%2_%1, %4, 7, %3
...
@@ -181,7 +181,7 @@ cglobal yuv2planeX_%2_%1, %4, 7, %3
paddd
m1
,
m5
paddd
m1
,
m5
paddd
m2
,
m4
paddd
m2
,
m4
paddd
m1
,
m6
paddd
m1
,
m6
%else
; %
2
== 10/9/8
%else
; %
1
== 10/9/8
punpcklwd
m5
,
m3
,
m4
punpcklwd
m5
,
m3
,
m4
punpckhwd
m3
,
m4
punpckhwd
m3
,
m4
SPLATD
m0
,
m0
SPLATD
m0
,
m0
...
@@ -191,84 +191,83 @@ cglobal yuv2planeX_%2_%1, %4, 7, %3
...
@@ -191,84 +191,83 @@ cglobal yuv2planeX_%2_%1, %4, 7, %3
paddd
m2
,
m5
paddd
m2
,
m5
paddd
m1
,
m3
paddd
m1
,
m3
%endif
; %
2
== 8/9/10/16
%endif
; %
1
== 8/9/10/16
sub
cntr_reg
,
2
sub
cntr_reg
,
2
jg
.
filterloop_
%
+
%%
i
jg
.
filterloop_
%
+
%%
i
%if
%
2
==
16
%if
%
1
==
16
psrad
m2
,
31
-
%
2
psrad
m2
,
31
-
%
1
psrad
m1
,
31
-
%
2
psrad
m1
,
31
-
%
1
%else
; %
2
== 10/9/8
%else
; %
1
== 10/9/8
psrad
m2
,
27
-
%
2
psrad
m2
,
27
-
%
1
psrad
m1
,
27
-
%
2
psrad
m1
,
27
-
%
1
%endif
; %
2
== 8/9/10/16
%endif
; %
1
== 8/9/10/16
%if
%
2
==
8
%if
%
1
==
8
packssdw
m2
,
m1
packssdw
m2
,
m1
packuswb
m2
,
m2
packuswb
m2
,
m2
movh
[
r3
+
r5
*
1
]
,
m2
movh
[
r3
+
r5
*
1
]
,
m2
%else
; %
2
== 9/10/16
%else
; %
1
== 9/10/16
%if
%
2
==
16
%if
%
1
==
16
packssdw
m2
,
m1
packssdw
m2
,
m1
paddw
m2
,
[minshort]
paddw
m2
,
[minshort]
%else
; %2 == 9/10
%else
; %1 == 9/10
%ifidn
%1
,
sse4
%if
cpuflag
(
sse4
)
packusdw
m2
,
m1
%elifidn
%1
,
avx
packusdw
m2
,
m1
packusdw
m2
,
m1
%else
; mmx2/sse2
%else
; mmx2/sse2
packssdw
m2
,
m1
packssdw
m2
,
m1
pmaxsw
m2
,
m6
pmaxsw
m2
,
m6
%endif
; mmx2/sse2/sse4/avx
%endif
; mmx2/sse2/sse4/avx
pminsw
m2
,
[
yuv2yuvX_
%
2
_upper
]
pminsw
m2
,
[
yuv2yuvX_
%
1
_upper
]
%endif
; %
2
== 9/10/16
%endif
; %
1
== 9/10/16
mova
[
r3
+
r5
*
2
]
,
m2
mova
[
r3
+
r5
*
2
]
,
m2
%endif
; %
2
== 8/9/10/16
%endif
; %
1
== 8/9/10/16
add
r5
,
mmsize
/
2
add
r5
,
mmsize
/
2
sub
r4d
,
mmsize
/
2
sub
r4d
,
mmsize
/
2
%if
%
2
==
8
%if
%
1
==
8
%assign
%%
i
%%
i
+
2
%assign
%%
i
%%
i
+
2
%endrep
%endrep
%endif
; %
2
== 8
%endif
; %
1
== 8
jg
.
pixelloop
jg
.
pixelloop
%if
%
2
==
8
%if
%
1
==
8
%ifdef
ARCH_X86_32
%ifdef
ARCH_X86_32
ADD
rsp
,
pad
ADD
rsp
,
pad
RET
RET
%else
; x86-64
%else
; x86-64
REP_RET
REP_RET
%endif
; x86-32/64
%endif
; x86-32/64
%else
; %
2
== 9/10/16
%else
; %
1
== 9/10/16
REP_RET
REP_RET
%endif
; %
2
== 8/9/10/16
%endif
; %
1
== 8/9/10/16
%endmacro
%endmacro
%define
PALIGNR
PALIGNR_MMX
%define
PALIGNR
PALIGNR_MMX
%ifdef
ARCH_X86_32
%ifdef
ARCH_X86_32
INIT_MMX
INIT_MMX
mmx2
yuv2planeX_fn
mmx2
,
8
,
0
,
7
yuv2planeX_fn
8
,
0
,
7
yuv2planeX_fn
mmx2
,
9
,
0
,
5
yuv2planeX_fn
9
,
0
,
5
yuv2planeX_fn
mmx2
,
10
,
0
,
5
yuv2planeX_fn
10
,
0
,
5
%endif
%endif
INIT_XMM
INIT_XMM
sse2
yuv2planeX_fn
sse2
,
8
,
10
,
7
yuv2planeX_fn
8
,
10
,
7
yuv2planeX_fn
sse2
,
9
,
7
,
5
yuv2planeX_fn
9
,
7
,
5
yuv2planeX_fn
sse2
,
10
,
7
,
5
yuv2planeX_fn
10
,
7
,
5
%define
PALIGNR
PALIGNR_SSSE3
%define
PALIGNR
PALIGNR_SSSE3
yuv2planeX_fn
sse4
,
8
,
10
,
7
INIT_XMM
sse4
yuv2planeX_fn
sse4
,
9
,
7
,
5
yuv2planeX_fn
8
,
10
,
7
yuv2planeX_fn
sse4
,
10
,
7
,
5
yuv2planeX_fn
9
,
7
,
5
yuv2planeX_fn
sse4
,
16
,
8
,
5
yuv2planeX_fn
10
,
7
,
5
yuv2planeX_fn
16
,
8
,
5
INIT_AVX
yuv2planeX_fn
avx
,
8
,
10
,
7
INIT_XMM
avx
yuv2planeX_fn
avx
,
9
,
7
,
5
yuv2planeX_fn
8
,
10
,
7
yuv2planeX_fn
avx
,
10
,
7
,
5
yuv2planeX_fn
9
,
7
,
5
yuv2planeX_fn
10
,
7
,
5
; %1=outout-bpc, %2=alignment (u/a)
; %1=outout-bpc, %2=alignment (u/a)
%macro
yuv2plane1_mainloop
2
%macro
yuv2plane1_mainloop
2
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment