Skip to content
Projects
Groups
Snippets
Help
Loading...
Sign in / Register
Toggle navigation
F
ffmpeg
Project
Project
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Packages
Packages
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
submodule
ffmpeg
Commits
e229df94
Commit
e229df94
authored
Jun 19, 2017
by
James Almer
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
x86/aacpsdsp: add ff_ps_hybrid_synthesis_deint_{sse,sse4}
About 2x faster than the C version.
parent
3c5a53cd
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
140 additions
and
6 deletions
+140
-6
aacpsdsp.asm
libavcodec/x86/aacpsdsp.asm
+123
-0
aacpsdsp_init.c
libavcodec/x86/aacpsdsp_init.c
+8
-0
x86util.asm
libavutil/x86/x86util.asm
+9
-6
No files found.
libavcodec/x86/aacpsdsp.asm
View file @
e229df94
...
...
@@ -166,6 +166,129 @@ align 16
jl
.
loop
REP_RET
;*******************************************************************
;void ff_ps_hybrid_synthesis_deint_<opt>(float out[2][38][64],
;                                        float (*in)[32][2],
;                                        int i, int len)
;
; De-interleaves in[i..63][n][re,im] into out[0][n][i..63] (re) and
; out[1][n][i..63] (im).
;
; Syntax: NASM + x86inc (cglobal/RET/SWAP abstraction layer).
;
; Register roles after the prologue:
;   outq  = &out[0][0][col]   (col = current column, starts at i)
;   inq   = &in[col][0][0]
;   id    = number of columns still to process (64 - i on entry)
;   lenq  = 32 << 3 = 256 bytes: the size of one in[col] row
;           (32 * 2 floats) and of one out[x][n] row (64 floats).
;           It doubles as the inner loop byte count, so the incoming
;           value of the len argument is never read.
;   out0q = running pointer into out[0], out1q = same into out[1]
;   tmpd  = bytes of the current in[col] row left to consume
;
; NOTE(review): every loop body runs before its counter is tested, so
; this presumably relies on callers passing i < 64 -- confirm.
; NOTE(review): movaps is used on [inq] and, in the 4-column loop, on
; the out pointers; this assumes 16-byte aligned buffers -- confirm.
;*******************************************************************

; Start of a column group: aim out0q/out1q at row 0 of the current
; column in out[0] / out[1] and reload the per-row byte counter.
%macro PS_DEINT_COLUMN_SETUP 0
    mov              out0q, outq
    mov              out1q, 38*64*4         ; sizeof(out[0])
    add              out1q, out0q
    mov               tmpd, lend
%endmacro

; End of one inner iteration: two out rows and one vector of input were
; consumed. Leaves the flags of the tmpd decrement for the caller's jg.
%macro PS_DEINT_NEXT_ROWS 0
    lea              out0q, [out0q+lenq*2]
    lea              out1q, [out1q+lenq*2]
    add                inq, mmsize
    sub               tmpd, mmsize
%endmacro

%macro HYBRID_SYNTHESIS_DEINT 0
cglobal ps_hybrid_synthesis_deint, 3, 7, 5, out, in, i, len, out0, out1, tmp
%if cpuflag(sse4)
%define MOVH movsd
%else
%define MOVH movlps
%endif
    movsxdifnidn        iq, id
    mov               lend, 32 << 3         ; row size in bytes, see header
    lea               outq, [outq+iq*4]     ; outq = &out[0][0][i]
    mov               tmpd, id
    shl               tmpd, 8
    add                inq, tmpq            ; inq  = &in[i][0][0] (i * 256 bytes)
    mov               tmpd, 64
    sub               tmpd, id
    mov                 id, tmpd            ; id   = 64 - i columns remaining

    ; Peel one and/or two columns first so that the 4-column loop is
    ; entered with a remaining count that is a multiple of four.
    test                id, 1
    jne .col1
    test                id, 2
    jne .cols2

; ---- four columns per outer iteration --------------------------------
align 16
.cols4:
    PS_DEINT_COLUMN_SETUP

.cols4_rows:
    movaps              m0, [inq]               ; column col
    movaps              m1, [inq+lenq]          ; column col+1
    movaps              m2, [inq+lenq*2]        ; column col+2
    movaps              m3, [inq+3*32*2*4]      ; column col+3
    TRANSPOSE4x4PS 0, 1, 2, 3, 4
    movaps         [out0q], m0
    movaps         [out1q], m1
    movaps    [out0q+lenq], m2
    movaps    [out1q+lenq], m3
    PS_DEINT_NEXT_ROWS
    jg .cols4_rows
    add               outq, 16                  ; four floats to the right
    add                inq, 3*32*2*4            ; inner loop already skipped one row
    sub                 id, 4
    jg .cols4
    RET

; ---- two columns ------------------------------------------------------
align 16
.cols2:
    PS_DEINT_COLUMN_SETUP

.cols2_rows:
    movaps              m0, [inq]               ; column col
    movaps              m1, [inq+lenq]          ; column col+1
    SBUTTERFLYPS 0, 1, 2
    SBUTTERFLYPD 0, 1, 2
    MOVH           [out0q], m0                  ; low halves  -> row n
    MOVH           [out1q], m1
    movhps    [out0q+lenq], m0                  ; high halves -> row n+1
    movhps    [out1q+lenq], m1
    PS_DEINT_NEXT_ROWS
    jg .cols2_rows
    add               outq, 8                   ; two floats to the right
    add                inq, lenq                ; skip the second consumed row
    sub                 id, 2
    jg .cols4
    RET

; ---- single column ----------------------------------------------------
align 16
.col1:
    PS_DEINT_COLUMN_SETUP

.col1_rows:
    movaps              m0, [inq]
    movss          [out0q], m0                  ; lane 0
%if cpuflag(sse4)
    extractps      [out1q], m0, 1
    extractps [out0q+lenq], m0, 2
    extractps [out1q+lenq], m0, 3
%else
    movhlps             m1, m0
    movss     [out0q+lenq], m1                  ; lane 2
    shufps              m0, m0, 0xb1            ; swap lanes within each pair
    movss          [out1q], m0                  ; lane 1
    movhlps             m1, m0
    movss     [out1q+lenq], m1                  ; lane 3
%endif
    PS_DEINT_NEXT_ROWS
    jg .col1_rows
    add               outq, 4                   ; one float to the right
    sub                 id, 1
    test                id, 2
    jne .cols2
    cmp                 id, 4
    jge .cols4
    RET
%endmacro

INIT_XMM sse
HYBRID_SYNTHESIS_DEINT
INIT_XMM sse4
HYBRID_SYNTHESIS_DEINT
;*******************************************************************
;void ff_ps_hybrid_analysis_<opt>(float (*out)[2], float (*in)[2],
; const float (*filter)[8][2],
...
...
libavcodec/x86/aacpsdsp_init.c
View file @
e229df94
...
...
@@ -40,6 +40,10 @@ void ff_ps_stereo_interpolate_sse3(float (*l)[2], float (*r)[2],
void
ff_ps_stereo_interpolate_ipdopd_sse3
(
float
(
*
l
)[
2
],
float
(
*
r
)[
2
],
float
h
[
2
][
4
],
float
h_step
[
2
][
4
],
int
len
);
/* Prototypes of the x86 assembly versions of hybrid_synthesis_deint.
 * ff_psdsp_init_x86() stores them in PSDSPContext.hybrid_synthesis_deint
 * when the EXTERNAL_SSE / EXTERNAL_SSE4 checks on cpu_flags succeed. */
void
ff_ps_hybrid_synthesis_deint_sse
(
float
out
[
2
][
38
][
64
],
float
(
*
in
)[
32
][
2
],
int
i
,
int
len
);
void
ff_ps_hybrid_synthesis_deint_sse4
(
float
out
[
2
][
38
][
64
],
float
(
*
in
)[
32
][
2
],
int
i
,
int
len
);
av_cold
void
ff_psdsp_init_x86
(
PSDSPContext
*
s
)
{
...
...
@@ -48,6 +52,7 @@ av_cold void ff_psdsp_init_x86(PSDSPContext *s)
if
(
EXTERNAL_SSE
(
cpu_flags
))
{
s
->
add_squares
=
ff_ps_add_squares_sse
;
s
->
mul_pair_single
=
ff_ps_mul_pair_single_sse
;
s
->
hybrid_synthesis_deint
=
ff_ps_hybrid_synthesis_deint_sse
;
s
->
hybrid_analysis
=
ff_ps_hybrid_analysis_sse
;
}
if
(
EXTERNAL_SSE3
(
cpu_flags
))
{
...
...
@@ -56,4 +61,7 @@ av_cold void ff_psdsp_init_x86(PSDSPContext *s)
s
->
stereo_interpolate
[
1
]
=
ff_ps_stereo_interpolate_ipdopd_sse3
;
s
->
hybrid_analysis
=
ff_ps_hybrid_analysis_sse3
;
}
if
(
EXTERNAL_SSE4
(
cpu_flags
))
{
s
->
hybrid_synthesis_deint
=
ff_ps_hybrid_synthesis_deint_sse4
;
}
}
libavutil/x86/x86util.asm
View file @
e229df94
...
...
@@ -71,6 +71,12 @@
SWAP
%1
,
%3
,
%2
%endmacro
; SBUTTERFLYPD a, b, tmp
; Exchanges 64-bit halves between two registers:
;   a <- { low half of a,  low half of b  }
;   b <- { high half of a, high half of b }
; m<tmp> is clobbered; on exit the name tmp refers to the register that
; held a on entry (SWAP renames registers, it emits no instruction).
%macro SBUTTERFLYPD 3
    movlhps m%3, m%1, m%2
    movhlps m%2, m%2, m%1
    SWAP %1, %3
%endmacro
%macro
TRANSPOSE4x4B
5
SBUTTERFLY
bw
,
%1
,
%2
,
%5
SBUTTERFLY
bw
,
%3
,
%4
,
%5
...
...
@@ -117,12 +123,9 @@
; TRANSPOSE4x4PS r0, r1, r2, r3, tmp
; In-place 4x4 transpose of the float matrix whose rows are m<r0>..m<r3>;
; m<tmp> is a scratch register and is clobbered.
;
; Two stages: SBUTTERFLYPS combines rows pairwise (macro defined outside
; this view -- presumably a 32-bit unpack low/high, confirm), then
; SBUTTERFLYPD exchanges 64-bit halves between the intermediate results.
; The final SWAP puts the middle two rows back in order.
;
; This is the post-commit body only. The hunk this was taken from
; (-117,12 +123,9) also carried the six replaced movlhps/movhlps/SWAP
; lines without +/- markers; keeping both sequences would permute the
; registers a second time and corrupt the result.
%macro TRANSPOSE4x4PS 5
    SBUTTERFLYPS %1, %2, %5
    SBUTTERFLYPS %3, %4, %5
    SBUTTERFLYPD %1, %3, %5
    SBUTTERFLYPD %2, %4, %5
    SWAP %2, %3
%endmacro
%macro
TRANSPOSE8x4D
9
-
11
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment