Skip to content
Projects
Groups
Snippets
Help
Loading...
Sign in / Register
Toggle navigation
F
ffmpeg
Project
Project
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Packages
Packages
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
submodule
ffmpeg
Commits
82992604
Commit
82992604
authored
Jun 23, 2012
by
Mans Rullgard
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
x86: fft: convert sse inline asm to yasm
parent
8123e090
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
129 additions
and
121 deletions
+129
-121
Makefile
libavcodec/x86/Makefile
+0
-1
fft_mmx.asm
libavcodec/x86/fft_mmx.asm
+129
-10
fft_sse.c
libavcodec/x86/fft_sse.c
+0
-110
No files found.
libavcodec/x86/Makefile
View file @
82992604
...
@@ -39,7 +39,6 @@ YASM-OBJS-$(CONFIG_DCT) += x86/dct32_sse.o
...
@@ -39,7 +39,6 @@ YASM-OBJS-$(CONFIG_DCT) += x86/dct32_sse.o
YASM-OBJS-$(CONFIG_ENCODERS)
+=
x86/dsputilenc_yasm.o
YASM-OBJS-$(CONFIG_ENCODERS)
+=
x86/dsputilenc_yasm.o
YASM-OBJS-FFT-$(HAVE_AMD3DNOW)
+=
x86/fft_3dn.o
YASM-OBJS-FFT-$(HAVE_AMD3DNOW)
+=
x86/fft_3dn.o
YASM-OBJS-FFT-$(HAVE_AMD3DNOWEXT)
+=
x86/fft_3dn2.o
YASM-OBJS-FFT-$(HAVE_AMD3DNOWEXT)
+=
x86/fft_3dn2.o
YASM-OBJS-FFT-$(HAVE_SSE)
+=
x86/fft_sse.o
YASM-OBJS-$(CONFIG_FFT)
+=
x86/fft_mmx.o
\
YASM-OBJS-$(CONFIG_FFT)
+=
x86/fft_mmx.o
\
$(YASM-OBJS-FFT-yes)
$(YASM-OBJS-FFT-yes)
YASM-OBJS-$(CONFIG_H264CHROMA)
+=
x86/h264_chromamc.o
\
YASM-OBJS-$(CONFIG_H264CHROMA)
+=
x86/h264_chromamc.o
\
...
...
libavcodec/x86/fft_mmx.asm
View file @
82992604
...
@@ -45,6 +45,10 @@ struc FFTContext
...
@@ -45,6 +45,10 @@ struc FFTContext
.
mdctbits
:
resd
1
.
mdctbits
:
resd
1
.
tcos
:
pointer
1
.
tcos
:
pointer
1
.
tsin
:
pointer
1
.
tsin
:
pointer
1
.
fftperm
:
pointer
1
.
fftcalc
:
pointer
1
.
imdctcalc
:
pointer
1
.
imdcthalf
:
pointer
1
endstruc
endstruc
SECTION_RODATA
SECTION_RODATA
...
@@ -65,6 +69,7 @@ perm1: dd 0x00, 0x02, 0x03, 0x01, 0x03, 0x00, 0x02, 0x01
...
@@ -65,6 +69,7 @@ perm1: dd 0x00, 0x02, 0x03, 0x01, 0x03, 0x00, 0x02, 0x01
perm2
:
dd
0x00
,
0x01
,
0x02
,
0x03
,
0x01
,
0x00
,
0x02
,
0x03
perm2
:
dd
0x00
,
0x01
,
0x02
,
0x03
,
0x01
,
0x00
,
0x02
,
0x03
ps_p1p1m1p1root2
:
dd
1
.
0
,
1
.
0
,
-
1
.
0
,
1
.
0
,
M_SQRT1_2
,
M_SQRT1_2
,
M_SQRT1_2
,
M_SQRT1_2
ps_p1p1m1p1root2
:
dd
1
.
0
,
1
.
0
,
-
1
.
0
,
1
.
0
,
M_SQRT1_2
,
M_SQRT1_2
,
M_SQRT1_2
,
M_SQRT1_2
ps_m1m1p1m1p1m1m1m1
:
dd
1
<<
31
,
1
<<
31
,
0
,
1
<<
31
,
0
,
1
<<
31
,
1
<<
31
,
1
<<
31
ps_m1m1p1m1p1m1m1m1
:
dd
1
<<
31
,
1
<<
31
,
0
,
1
<<
31
,
0
,
1
<<
31
,
1
<<
31
,
1
<<
31
ps_m1m1m1m1
:
times
4
dd
1
<<
31
ps_m1p1
:
dd
1
<<
31
,
0
ps_m1p1
:
dd
1
<<
31
,
0
%assign
i
16
%assign
i
16
...
@@ -532,6 +537,16 @@ DEFINE_ARGS z, w, n, o1, o3
...
@@ -532,6 +537,16 @@ DEFINE_ARGS z, w, n, o1, o3
rep
ret
rep
ret
%endmacro
%endmacro
;-----------------------------------------------------------------------
; FFT_DISPATCH table_suffix, nbits_reg
;   %1 = suffix appended to "dispatch_tab" to select the jump table
;   %2 = register name (without size suffix) holding nbits
; Loads entry (nbits - 2) of the selected table and calls it.
; When PIC is defined, the address of the start of the current section
; ($$) is added to the loaded entry before the call (the table entries
; are presumably section-relative in that configuration -- confirm
; against the table definition).
; Uses r2 (and r3 under PIC) as scratch.
;-----------------------------------------------------------------------
%macro FFT_DISPATCH 2 ; clobbers 5 GPRs, 8 XMMs
    lea     r2, [dispatch_tab%1]            ; r2 = base of the dispatch table
    mov     r2, [r2 + (%2q - 2) * gprsize]  ; r2 = table[nbits - 2]
%ifdef PIC
    lea     r3, [$$]                        ; r3 = start of current section
    add     r2, r3                          ; rebase the loaded entry
%endif
    call    r2
%endmacro ; FFT_DISPATCH
INIT_YMM
avx
INIT_YMM
avx
%if
HAVE_AVX
%if
HAVE_AVX
...
@@ -548,6 +563,14 @@ INIT_YMM avx
...
@@ -548,6 +563,14 @@ INIT_YMM avx
DECL_PASS
pass_avx
,
PASS_BIG
1
DECL_PASS
pass_avx
,
PASS_BIG
1
DECL_PASS
pass_interleave_avx
,
PASS_BIG
0
DECL_PASS
pass_interleave_avx
,
PASS_BIG
0
;-----------------------------------------------------------------------
; fft_calc (variant selected by the active INIT_* / SUFFIX)
; In:  r0 = FFTContext pointer, r1 = data pointer (z)
; Reads FFTContext.nbits, then hands (z, nbits) to the interleaving
; dispatch table for the current instruction-set suffix.
;-----------------------------------------------------------------------
cglobal fft_calc, 2,5,8
    mov     r3d, [r0 + FFTContext.nbits]    ; r3 = s->nbits
    mov     r0, r1                          ; r0 = z
    mov     r1, r3                          ; r1 = nbits
    FFT_DISPATCH _interleave %+ SUFFIX, r1
    REP_RET
%endif
%endif
INIT_XMM
sse
INIT_XMM
sse
...
@@ -565,6 +588,112 @@ INIT_XMM sse
...
@@ -565,6 +588,112 @@ INIT_XMM sse
DECL_PASS
pass_sse
,
PASS_BIG
1
DECL_PASS
pass_sse
,
PASS_BIG
1
DECL_PASS
pass_interleave_sse
,
PASS_BIG
0
DECL_PASS
pass_interleave_sse
,
PASS_BIG
0
;-----------------------------------------------------------------------
; fft_calc (SSE variant)
; In:  r0 = FFTContext pointer, r1 = data pointer (z)
;
; 1. Dispatches (z, nbits) through the interleaving dispatch table.
; 2. If nbits <= 4, additionally walks the buffer in 32-byte steps and
;    interleaves the two 16-byte halves of every step with
;    unpcklps/unpckhps (a0 a1 a2 a3 | b0 b1 b2 b3 ->
;    a0 b0 a1 b1 | a2 b2 a3 b3).
;
; z and nbits are saved on the stack across the dispatch call because
; the dispatched routine may clobber the registers holding them.
; movaps is used on z, so z has to be 16-byte aligned.
;-----------------------------------------------------------------------
cglobal fft_calc, 2,5,8
    mov     r3d, [r0 + FFTContext.nbits]    ; r3 = s->nbits
    PUSH    r1                              ; save z
    PUSH    r3                              ; save nbits
    mov     r0, r1                          ; r0 = z
    mov     r1, r3                          ; r1 = nbits
    FFT_DISPATCH _interleave %+ SUFFIX, r1
    POP     rcx                             ; rcx = nbits (needed in cl for the shift)
    POP     r4                              ; r4  = z
    cmp     rcx, 4
    jg      .end                            ; nbits > 4: nothing more to do
    mov     r2, -1
    add     rcx, 3
    shl     r2, cl                          ; r2 = -(8 << nbits) = -(buffer size in bytes)
    sub     r4, r2                          ; r4 = one past the end of z
.loop:                                      ; r2 runs from -size up to 0
    movaps   xmm0, [r4 + r2]
    movaps   xmm1, xmm0
    unpcklps xmm0, [r4 + r2 + 16]
    unpckhps xmm1, [r4 + r2 + 16]
    movaps   [r4 + r2],      xmm0
    movaps   [r4 + r2 + 16], xmm1
    add      r2, 32
    jl       .loop
.end:
    REP_RET
;-----------------------------------------------------------------------
; fft_permute
; In:  r0 = FFTContext pointer, r1 = data pointer (z)
;
; Scatters the n = 1 << nbits 8-byte elements of z into
; FFTContext.tmpbuf at the positions given by the 16-bit entries of
; FFTContext.revtab (tmpbuf[revtab[i]] = z[i]), two elements per
; iteration, then copies tmpbuf back over z.
;
; The copy-back is done inline, 16 bytes per iteration, so the function
; makes no external call: no stack alignment, Win64 shadow space or
; PLT/PIC handling is required. Like the scatter loop, it assumes n is
; even (nbits >= 1).
;
; z is accessed with movaps and must be 16-byte aligned. tmpbuf is read
; with movups because its alignment is not established in this function.
;-----------------------------------------------------------------------
cglobal fft_permute, 2,7,1
    mov     r4,  [r0 + FFTContext.revtab]   ; r4 = revtab
    mov     r5,  [r0 + FFTContext.tmpbuf]   ; r5 = tmpbuf
    mov     ecx, [r0 + FFTContext.nbits]    ; shift count must be in cl
    mov     r2, 1
    shl     r2, cl                          ; r2 = n
    xor     r0, r0                          ; r0 = i = 0
%if ARCH_X86_32
    mov     r1, r1m                         ; reload z from its stack slot (presumably r1
                                            ; aliases ecx on x86-32 -- confirm in x86inc)
%endif
.loop:
    movaps  xmm0, [r1 + 8*r0]               ; xmm0 = z[i], z[i+1]
    movzx   r6, word [r4 + 2*r0]            ; r6 = revtab[i]
    movzx   r3, word [r4 + 2*r0 + 2]        ; r3 = revtab[i+1]
    movlps  [r5 + 8*r6], xmm0               ; tmpbuf[revtab[i]]   = z[i]
    movhps  [r5 + 8*r3], xmm0               ; tmpbuf[revtab[i+1]] = z[i+1]
    add     r0, 2
    cmp     r0, r2
    jl      .loop

    shl     r2, 3                           ; r2 = n * 8 = size in bytes
    add     r1, r2                          ; r1 = end of z
    add     r5, r2                          ; r5 = end of tmpbuf
    neg     r2                              ; r2 runs from -size up to 0
.copy:
    movups  xmm0, [r5 + r2]
    movaps  [r1 + r2], xmm0
    add     r2, 16
    jl      .copy
    REP_RET
;-----------------------------------------------------------------------
; imdct_calc
; In:  r0 = FFTContext pointer, r1 = output pointer, r2 = input pointer
;
; 1. Calls the function stored in FFTContext.imdcthalf with
;    (s, output + mdctsize bytes, input).
; 2. Mirrors the result in 16-byte steps: each vector is reversed with
;    shufps 0x1b; the vector written to the lower part additionally has
;    its sign bits flipped (xorps with ps_m1m1m1m1).
;
; The advanced output pointer and mdctsize are saved on the stack across
; the call. On x86-32 the three arguments are pushed for the callee; on
; x86-64 they are expected to still be in the argument registers. On
; Win64 the caller must reserve 32 bytes of shadow space below the saved
; values, otherwise the callee may overwrite them.
;-----------------------------------------------------------------------
cglobal imdct_calc, 3,5,3
    mov     r3d, [r0 + FFTContext.mdctsize]     ; r3 = n (mdct size)
    mov     r4,  [r0 + FFTContext.imdcthalf]    ; r4 = s->imdct_half
    add     r1,  r3                             ; r1 = output + n bytes
    PUSH    r3
    PUSH    r1
%if ARCH_X86_32
    push    r2
    push    r1
    push    r0
%else
    sub     rsp, 8 + 32*WIN64                   ; padding (+ Win64 shadow space)
%endif
    call    r4
%if ARCH_X86_32
    add     esp, 12
%else
    add     rsp, 8 + 32*WIN64
%endif
    POP     r1                                  ; r1 = output + n bytes
    POP     r3                                  ; r3 = n
    lea     r0, [r1 + 2*r3]                     ; r0 = output + 3*n bytes
    mov     r2, r3
    sub     r3, 16                              ; r3 = k = n - 16, counts down
    neg     r2                                  ; r2 = j = -n, counts up to 0
    movaps  xmm2, [ps_m1m1m1m1]                 ; sign-bit mask
.loop:
    movaps  xmm0, [r1 + r3]
    movaps  xmm1, [r0 + r2]
    shufps  xmm0, xmm0, 0x1b                    ; reverse the four floats
    shufps  xmm1, xmm1, 0x1b
    xorps   xmm0, xmm2                          ; negate
    movaps  [r0 + r3], xmm1
    movaps  [r1 + r2], xmm0
    sub     r3, 16
    add     r2, 16
    jl      .loop
    REP_RET
INIT_MMX
3
dnow
INIT_MMX
3
dnow
%define
mulps
pfmul
%define
mulps
pfmul
%define
addps
pfadd
%define
addps
pfadd
...
@@ -582,16 +711,6 @@ DECL_PASS pass_interleave_3dnow, PASS_BIG 0
...
@@ -582,16 +711,6 @@ DECL_PASS pass_interleave_3dnow, PASS_BIG 0
%define
SECTION_REL
%define
SECTION_REL
%endif
%endif
;-----------------------------------------------------------------------
; FFT_DISPATCH table_suffix, nbits_reg
;   %1 = suffix appended to "dispatch_tab" to select the jump table
;   %2 = register name (without size suffix) holding nbits
; Loads entry (nbits - 2) of the selected table and calls it.
; When PIC is defined, the address of the start of the current section
; ($$) is added to the loaded entry before the call.
; Uses r2 (and r3 under PIC) as scratch.
;-----------------------------------------------------------------------
%macro FFT_DISPATCH 2 ; clobbers 5 GPRs, 8 XMMs
    lea     r2, [dispatch_tab%1]            ; r2 = base of the dispatch table
    mov     r2, [r2 + (%2q - 2) * gprsize]  ; r2 = table[nbits - 2]
%ifdef PIC
    lea     r3, [$$]                        ; r3 = start of current section
    add     r2, r3                          ; rebase the loaded entry
%endif
    call    r2
%endmacro ; FFT_DISPATCH
%macro
DECL_FFT
1
-
2
; nbits, suffix
%macro
DECL_FFT
1
-
2
; nbits, suffix
%ifidn
%0
,
1
%ifidn
%0
,
1
%xdefine
fullsuffix
SUFFIX
%xdefine
fullsuffix
SUFFIX
...
...
libavcodec/x86/fft_sse.c
deleted
100644 → 0
View file @
8123e090
/*
* FFT/MDCT transform with SSE optimizations
* Copyright (c) 2008 Loren Merritt
*
* This file is part of Libav.
*
* Libav is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* Libav is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with Libav; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/x86_cpu.h"
#include "libavcodec/dsputil.h"
#include "fft.h"
#include "config.h"
/* Four 32-bit words with only the top bit set; used as an xorps mask
 * by the inline asm in this file. Declared with 16-byte alignment. */
DECLARE_ASM_CONST(16, unsigned int, ff_m1m1m1m1)[4] = {
    0x80000000U, 0x80000000U, 0x80000000U, 0x80000000U
};
void
ff_fft_dispatch_sse
(
FFTComplex
*
z
,
int
nbits
);
void
ff_fft_dispatch_interleave_sse
(
FFTComplex
*
z
,
int
nbits
);
void
ff_fft_dispatch_interleave_avx
(
FFTComplex
*
z
,
int
nbits
);
#if HAVE_AVX
/**
 * Run the AVX interleaving FFT dispatcher on z, using the transform
 * size exponent stored in the context.
 */
void ff_fft_calc_avx(FFTContext *s, FFTComplex *z)
{
    const int nbits = s->nbits;

    ff_fft_dispatch_interleave_avx(z, nbits);
}
#endif
/**
 * Run the SSE interleaving FFT dispatcher on z. For transforms of at
 * most 16 points, an extra pass then walks the buffer in 32-byte steps
 * and interleaves the two 16-byte halves of each step
 * (unpcklps/unpckhps).
 */
void ff_fft_calc_sse(FFTContext *s, FFTComplex *z)
{
    const int nbits = s->nbits;
    const int n     = 1 << nbits;
    x86_reg i;

    ff_fft_dispatch_interleave_sse(z, nbits);

    if (n > 16)
        return;

    /* i is a negative byte offset from the end of z, counted up to 0 */
    i = -8 * n;
    __asm__ volatile (
        "1: \n"
        "movaps (%0,%1), %%xmm0 \n"
        "movaps %%xmm0, %%xmm1 \n"
        "unpcklps 16(%0,%1), %%xmm0 \n"
        "unpckhps 16(%0,%1), %%xmm1 \n"
        "movaps %%xmm0, (%0,%1) \n"
        "movaps %%xmm1, 16(%0,%1) \n"
        "add $32, %0 \n"
        "jl 1b \n"
        : "+r"(i)
        : "r"(z + n)
        : "memory"
    );
}
/**
 * Reorder z according to s->revtab: every element z[i] is written to
 * s->tmp_buf[s->revtab[i]] (two elements per iteration), then tmp_buf
 * is copied back over z.
 */
void ff_fft_permute_sse(FFTContext *s, FFTComplex *z)
{
    const int n = 1 << s->nbits;
    int i = 0;

    while (i < n) {
        /* load z[i], z[i+1]; store low half and high half separately */
        __asm__ volatile (
            "movaps %2, %%xmm0 \n"
            "movlps %%xmm0, %0 \n"
            "movhps %%xmm0, %1 \n"
            : "=m"(s->tmp_buf[s->revtab[i]]),
              "=m"(s->tmp_buf[s->revtab[i + 1]])
            : "m"(z[i])
        );
        i += 2;
    }

    memcpy(z, s->tmp_buf, n * sizeof(FFTComplex));
}
/**
 * Full inverse MDCT built on top of s->imdct_half.
 *
 * The half transform is written to output + n/4; the asm loop then
 * mirrors it in 16-byte steps: each vector is reversed (shufps 0x1b),
 * and the one stored at the ascending offset additionally has its sign
 * bits flipped with the ff_m1m1m1m1 mask.
 */
void ff_imdct_calc_sse(FFTContext *s, FFTSample *output, const FFTSample *input)
{
    long n  = s->mdct_size;
    long n4 = n >> 2;
    x86_reg j = -n;       /* ascending offset, runs up to 0 */
    x86_reg k = n - 16;   /* descending offset              */

    s->imdct_half(s, output + n4, input);

    __asm__ volatile (
        "movaps " MANGLE(ff_m1m1m1m1) ", %%xmm7 \n"
        "1: \n"
        "movaps (%2,%1), %%xmm0 \n"
        "movaps (%3,%0), %%xmm1 \n"
        "shufps $0x1b, %%xmm0, %%xmm0 \n"
        "shufps $0x1b, %%xmm1, %%xmm1 \n"
        "xorps %%xmm7, %%xmm0 \n"
        "movaps %%xmm1, (%3,%1) \n"
        "movaps %%xmm0, (%2,%0) \n"
        "sub $16, %1 \n"
        "add $16, %0 \n"
        "jl 1b \n"
        : "+r"(j), "+r"(k)
        : "r"(output + n4), "r"(output + n4 * 3)
        XMM_CLOBBERS_ONLY("%xmm0", "%xmm1", "%xmm7")
    );
}
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment