Skip to content
Projects
Groups
Snippets
Help
Loading...
Sign in / Register
Toggle navigation
F
ffmpeg
Project
Project
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Packages
Packages
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
submodule
ffmpeg
Commits
4dcc4f8e
Commit
4dcc4f8e
authored
Jul 06, 2010
by
Vitor Sessak
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
SSE optimized 32-point DCT
Originally committed as revision 24077 to
svn://svn.ffmpeg.org/ffmpeg/trunk
parent
defb0009
Hide whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
278 additions
and
0 deletions
+278
-0
dct.c
libavcodec/dct.c
+2
-0
fft.h
libavcodec/fft.h
+1
-0
fft.c
libavcodec/x86/fft.c
+8
-0
fft.h
libavcodec/x86/fft.h
+1
-0
fft_sse.c
libavcodec/x86/fft_sse.c
+266
-0
No files found.
libavcodec/dct.c
View file @
4dcc4f8e
...
...
@@ -30,6 +30,7 @@
#include <math.h>
#include "libavutil/mathematics.h"
#include "fft.h"
#include "x86/fft.h"
#define DCT32_FLOAT
#include "dct32.c"
...
...
@@ -213,6 +214,7 @@ av_cold int ff_dct_init(DCTContext *s, int nbits, enum DCTTransformType inverse)
s
->
dct_calc
=
dct32_func
;
s
->
dct32
=
dct32
;
if
(
HAVE_MMX
)
ff_dct_init_mmx
(
s
);
return
0
;
}
...
...
libavcodec/fft.h
View file @
4dcc4f8e
...
...
@@ -112,6 +112,7 @@ void ff_fft_calc_c(FFTContext *s, FFTComplex *z);
void
ff_fft_init_altivec
(
FFTContext
*
s
);
void
ff_fft_init_mmx
(
FFTContext
*
s
);
void
ff_fft_init_arm
(
FFTContext
*
s
);
void
ff_dct_init_mmx
(
DCTContext
*
s
);
/**
* Do the permutation needed BEFORE calling ff_fft_calc().
...
...
libavcodec/x86/fft.c
View file @
4dcc4f8e
...
...
@@ -42,3 +42,11 @@ av_cold void ff_fft_init_mmx(FFTContext *s)
}
#endif
}
/**
 * Select the x86 32-point DCT routine for a DCT context.
 *
 * Queries mm_support() once and, when the build has HAVE_SSE set and the
 * returned flags contain FF_MM_SSE, points s->dct32 at ff_dct32_float_sse.
 * In every other case the context is left untouched.
 *
 * @param s DCT context whose dct32 callback may be replaced
 */
av_cold void ff_dct_init_mmx(DCTContext *s)
{
    const int cpu_flags = mm_support();

    if (HAVE_SSE && (cpu_flags & FF_MM_SSE)) {
        s->dct32 = ff_dct32_float_sse;
    }
}
libavcodec/x86/fft.h
View file @
4dcc4f8e
...
...
@@ -32,5 +32,6 @@ void ff_imdct_calc_3dn2(FFTContext *s, FFTSample *output, const FFTSample *input
void
ff_imdct_half_3dn2
(
FFTContext
*
s
,
FFTSample
*
output
,
const
FFTSample
*
input
);
void
ff_imdct_calc_sse
(
FFTContext
*
s
,
FFTSample
*
output
,
const
FFTSample
*
input
);
void
ff_imdct_half_sse
(
FFTContext
*
s
,
FFTSample
*
output
,
const
FFTSample
*
input
);
void
ff_dct32_float_sse
(
FFTSample
*
out
,
const
FFTSample
*
in
);
#endif
libavcodec/x86/fft_sse.c
View file @
4dcc4f8e
...
...
@@ -20,6 +20,7 @@
*/
#include "libavutil/x86_cpu.h"
#include "libavutil/common.h"
#include "libavcodec/dsputil.h"
#include "fft.h"
...
...
@@ -201,3 +202,268 @@ void ff_imdct_calc_sse(FFTContext *s, FFTSample *output, const FFTSample *input)
);
}
/*
 * Multiplier table used by ff_dct32_float_sse().
 *
 * Nine rows of four floats; the assembly addresses row r as one 16-byte
 * vector at byte offset 16 * r from the start of the table and uses it as
 * the multiplier operand of the BUTTERFLY* macros.
 */
DECLARE_ALIGNED(16, static const float, b1)[] = {
    /* offsets 0..48: pass 1 */
      0.500603,  0.505471,  0.515447,  0.531043,
      0.553104,  0.582935,  0.622504,  0.674808,
     -1.169440, -0.972568, -0.839350, -0.744536,
    -10.190008, -3.407609, -2.057781, -1.484165,
    /* offsets 64..80: pass 2 */
      0.502419,  0.522499,  0.566944,  0.646822,
      0.788155,  1.060678,  1.722447,  5.101149,
    /* offset 96: pass 3 */
      0.509796,  0.601345,  0.899976,  2.562916,
    /* offset 112: pass 4 */
      1.000000,  1.000000,  1.306563,  0.541196,
    /* offset 128: pass 5 */
      1.000000,  0.707107,  1.000000, -0.707107
};
/*
 * XOR mask loaded by ff_dct32_float_sse() before pass 4: lanes 0 and 1 are
 * zero, lanes 2 and 3 have only bit 31 set (the "flip signs" operand of
 * BUTTERFLY0).
 *
 * The elements are unsigned because 0x80000000 is not representable in
 * int32_t; storing it in a signed element relies on an
 * implementation-defined conversion. The table is only ever read as raw
 * 16-byte data through its address, so the bit pattern is what matters.
 */
DECLARE_ALIGNED(16, static const uint32_t, smask)[4] = {
    0, 0, 0x80000000, 0x80000000
};
/*
 * Butterfly operator on two vector registers.
 *
 * Emits, in order: tmp = a; a -= b; b += tmp; a *= c.
 * Afterwards b holds the sum of the two inputs and a holds their
 * difference multiplied by c. c is pasted verbatim, so it may be a memory
 * operand such as 16(%2) or a register written as %%xmmN.
 */
#define BUTTERFLY(a,b,c,tmp) \
    "movaps %%" #a ", %%" #tmp " \n\t" \
    "subps %%" #b ", %%" #a " \n\t" \
    "addps %%" #tmp ", %%" #b " \n\t" \
    "mulps " #c ", %%" #a " \n\t"

/*
 * Same as BUTTERFLY when vectors a and b overlap.
 *
 * Emits, in order: tmp = val; val = shufps(val, val, shuf); tmp ^= mask;
 * val += tmp; val *= cos. Every line of the expansion must end with a
 * continuation backslash, including the one carrying the trailing comment.
 */
#define BUTTERFLY0(val, mask, cos, tmp, shuf) \
    "movaps %%" #val ", %%" #tmp " \n\t" \
    "shufps " #shuf ", %%" #val ",%%" #val " \n\t" \
    "xorps %%" #mask ", %%" #tmp " \n\t" /* flip signs */ \
    "addps %%" #tmp ", %%" #val " \n\t" \
    "mulps %%" #cos ", %%" #val " \n\t"

/* BUTTERFLY0 with shuffle immediate 0x1b. */
#define BUTTERFLY2(val, mask, cos, tmp) BUTTERFLY0(val, mask, cos, tmp, $0x1b)

/* BUTTERFLY0 with shuffle immediate 0xb1. */
#define BUTTERFLY3(val, mask, cos, tmp) BUTTERFLY0(val, mask, cos, tmp, $0xb1)
/**
 * SSE inline-assembly implementation installed as the dct32 callback by
 * ff_dct_init_mmx().
 *
 * The routine reads 128 bytes from in (eight 16-byte vector loads in the
 * "pass 1" sections) and leaves its result in the first 128 bytes of out,
 * which it also uses as scratch storage between passes. Passes 1-5 are
 * vector butterflies built from the BUTTERFLY* macros with multipliers
 * taken from b1; pass 6 is a sequence of scalar adds and moves.
 *
 * Asm operands: %0 = tmp1 (scratch general-purpose register),
 * %1 = out, %2 = b1, %3 = smask, %4 = in.
 *
 * All vector accesses use movaps, so in and out (like b1 and smask) must
 * be 16-byte aligned.
 *
 * NOTE(review): the asm writes xmm0-xmm7 but the clobber list names only
 * "memory"; confirm this is acceptable for every target ABI this file is
 * built for.
 *
 * @param out destination buffer, also used as intermediate storage
 * @param in  source buffer
 */
void ff_dct32_float_sse(FFTSample *out, const FFTSample *in)
{
    /* Scratch register used in pass 6 to move 32-bit words within out. */
    int32_t tmp1 = 0;
    __asm__ volatile(
        /* pass 1 */
        "movaps (%4), %%xmm0 \n\t"
        "movaps 112(%4), %%xmm1 \n\t"
        "shufps $0x1b, %%xmm1, %%xmm1 \n\t"
        BUTTERFLY(xmm0, xmm1, (%2), xmm3)

        "movaps 64(%4), %%xmm7 \n\t"
        "movaps 48(%4), %%xmm4 \n\t"
        "shufps $0x1b, %%xmm4, %%xmm4 \n\t"
        BUTTERFLY(xmm7, xmm4, 48(%2), xmm3)

        /* pass 2 */
        "movaps 64(%2), %%xmm2 \n\t"
        BUTTERFLY(xmm1, xmm4, %%xmm2, xmm3)
        /* spill two results to out to free registers */
        "movaps %%xmm1, 48(%1) \n\t"
        "movaps %%xmm4, (%1) \n\t"

        /* pass 1 */
        "movaps 16(%4), %%xmm1 \n\t"
        "movaps 96(%4), %%xmm6 \n\t"
        "shufps $0x1b, %%xmm6, %%xmm6 \n\t"
        BUTTERFLY(xmm1, xmm6, 16(%2), xmm3)

        "movaps 80(%4), %%xmm4 \n\t"
        "movaps 32(%4), %%xmm5 \n\t"
        "shufps $0x1b, %%xmm5, %%xmm5 \n\t"
        BUTTERFLY(xmm4, xmm5, 32(%2), xmm3)

        /* pass 2 */
        BUTTERFLY(xmm0, xmm7, %%xmm2, xmm3)

        "movaps 80(%2), %%xmm2 \n\t"
        BUTTERFLY(xmm6, xmm5, %%xmm2, xmm3)

        BUTTERFLY(xmm1, xmm4, %%xmm2, xmm3)

        /* pass 3 */
        "movaps 96(%2), %%xmm2 \n\t"
        "shufps $0x1b, %%xmm1, %%xmm1 \n\t"
        BUTTERFLY(xmm0, xmm1, %%xmm2, xmm3)
        "movaps %%xmm0, 112(%1) \n\t"
        "movaps %%xmm1, 96(%1) \n\t"

        /* reload the vector spilled to out during pass 2 */
        "movaps 0(%1), %%xmm0 \n\t"
        "shufps $0x1b, %%xmm5, %%xmm5 \n\t"
        BUTTERFLY(xmm0, xmm5, %%xmm2, xmm3)

        "movaps 48(%1), %%xmm1 \n\t"
        "shufps $0x1b, %%xmm6, %%xmm6 \n\t"
        BUTTERFLY(xmm1, xmm6, %%xmm2, xmm3)
        "movaps %%xmm1, 48(%1) \n\t"

        "shufps $0x1b, %%xmm4, %%xmm4 \n\t"
        BUTTERFLY(xmm7, xmm4, %%xmm2, xmm3)

        /* pass 4 */
        /* xmm3 now holds smask and xmm2 the pass-4 multipliers */
        "movaps (%3), %%xmm3 \n\t"
        "movaps 112(%2), %%xmm2 \n\t"

        BUTTERFLY2(xmm5, xmm3, xmm2, xmm1)

        BUTTERFLY2(xmm0, xmm3, xmm2, xmm1)
        "movaps %%xmm0, 16(%1) \n\t"

        BUTTERFLY2(xmm6, xmm3, xmm2, xmm1)
        "movaps %%xmm6, 32(%1) \n\t"

        "movaps 48(%1), %%xmm0 \n\t"
        BUTTERFLY2(xmm0, xmm3, xmm2, xmm1)
        "movaps %%xmm0, 48(%1) \n\t"

        BUTTERFLY2(xmm4, xmm3, xmm2, xmm1)

        BUTTERFLY2(xmm7, xmm3, xmm2, xmm1)

        "movaps 96(%1), %%xmm6 \n\t"
        BUTTERFLY2(xmm6, xmm3, xmm2, xmm1)

        "movaps 112(%1), %%xmm0 \n\t"
        BUTTERFLY2(xmm0, xmm3, xmm2, xmm1)

        /* pass 5 */
        "movaps 128(%2), %%xmm2 \n\t"
        /* rearrange the mask lanes for the pass-5 shuffle pattern */
        "shufps $0xCC, %%xmm3,%%xmm3 \n\t"

        BUTTERFLY3(xmm5, xmm3, xmm2, xmm1)
        "movaps %%xmm5, (%1) \n\t"

        "movaps 16(%1), %%xmm1 \n\t"
        BUTTERFLY3(xmm1, xmm3, xmm2, xmm5)
        "movaps %%xmm1, 16(%1) \n\t"

        BUTTERFLY3(xmm4, xmm3, xmm2, xmm5)
        "movaps %%xmm4, 64(%1) \n\t"

        BUTTERFLY3(xmm7, xmm3, xmm2, xmm5)
        "movaps %%xmm7, 80(%1) \n\t"

        "movaps 32(%1), %%xmm5 \n\t"
        BUTTERFLY3(xmm5, xmm3, xmm2, xmm7)
        "movaps %%xmm5, 32(%1) \n\t"

        "movaps 48(%1), %%xmm4 \n\t"
        BUTTERFLY3(xmm4, xmm3, xmm2, xmm7)
        "movaps %%xmm4, 48(%1) \n\t"

        BUTTERFLY3(xmm6, xmm3, xmm2, xmm7)
        "movaps %%xmm6, 96(%1) \n\t"

        BUTTERFLY3(xmm0, xmm3, xmm2, xmm7)
        "movaps %%xmm0, 112(%1) \n\t"

        /* pass 6, no SIMD... */
        /*
         * Scalar adds and moves entirely within out. Loads and stores to
         * the same offsets are interleaved, so the instruction order below
         * is significant. %0 carries plain 32-bit copies between slots.
         */
        "movss 56(%1), %%xmm3 \n\t"
        "movl 4(%1), %0 \n\t"
        "addss 60(%1), %%xmm3 \n\t"
        "movss 72(%1), %%xmm7 \n\t"
        "addss %%xmm3, %%xmm4 \n\t"
        "movss 52(%1), %%xmm2 \n\t"
        "addss %%xmm3, %%xmm2 \n\t"
        "movss 24(%1), %%xmm3 \n\t"
        "addss 28(%1), %%xmm3 \n\t"
        "addss 76(%1), %%xmm7 \n\t"
        "addss %%xmm3, %%xmm1 \n\t"
        "addss %%xmm4, %%xmm5 \n\t"
        "movss %%xmm1, 16(%1) \n\t"
        "movss 20(%1), %%xmm1 \n\t"
        "addss %%xmm3, %%xmm1 \n\t"
        "movss 40(%1), %%xmm3 \n\t"
        "movss %%xmm1, 48(%1) \n\t"
        "addss 44(%1), %%xmm3 \n\t"
        "movss 20(%1), %%xmm1 \n\t"
        "addss %%xmm3, %%xmm4 \n\t"
        "addss %%xmm2, %%xmm3 \n\t"
        "addss 28(%1), %%xmm1 \n\t"
        "movss %%xmm3, 40(%1) \n\t"
        "addss 36(%1), %%xmm2 \n\t"
        "movss 8(%1), %%xmm3 \n\t"
        "movss %%xmm2, 56(%1) \n\t"
        "addss 12(%1), %%xmm3 \n\t"
        "movss %%xmm5, 8(%1) \n\t"
        "movss %%xmm3, 32(%1) \n\t"
        "movss 52(%1), %%xmm2 \n\t"
        "movss 80(%1), %%xmm3 \n\t"
        "movss 120(%1), %%xmm5 \n\t"
        "movss %%xmm1, 80(%1) \n\t"
        "movss %%xmm4, 24(%1) \n\t"
        "addss 124(%1), %%xmm5 \n\t"
        "movss 64(%1), %%xmm1 \n\t"
        "addss 60(%1), %%xmm2 \n\t"
        "addss %%xmm5, %%xmm0 \n\t"
        "addss 116(%1), %%xmm5 \n\t"
        "movl %0, 64(%1) \n\t"
        "addss %%xmm0, %%xmm6 \n\t"
        "addss %%xmm6, %%xmm1 \n\t"
        "movl 12(%1), %0 \n\t"
        "movss %%xmm1, 4(%1) \n\t"
        "movss 88(%1), %%xmm1 \n\t"
        "movl %0, 96(%1) \n\t"
        "addss 92(%1), %%xmm1 \n\t"
        "movss 104(%1), %%xmm4 \n\t"
        "movl 28(%1), %0 \n\t"
        "addss 108(%1), %%xmm4 \n\t"
        "addss %%xmm4, %%xmm0 \n\t"
        "addss %%xmm1, %%xmm3 \n\t"
        "addss 84(%1), %%xmm1 \n\t"
        "addss %%xmm5, %%xmm4 \n\t"
        "addss %%xmm3, %%xmm6 \n\t"
        "addss %%xmm0, %%xmm3 \n\t"
        "addss %%xmm7, %%xmm0 \n\t"
        "addss 100(%1), %%xmm5 \n\t"
        "addss %%xmm4, %%xmm7 \n\t"
        "movl %0, 112(%1) \n\t"
        "movss %%xmm0, 28(%1) \n\t"
        "movss 36(%1), %%xmm0 \n\t"
        "movss %%xmm7, 36(%1) \n\t"
        "addss %%xmm1, %%xmm4 \n\t"
        "movss 116(%1), %%xmm7 \n\t"
        "addss %%xmm2, %%xmm0 \n\t"
        "addss 124(%1), %%xmm7 \n\t"
        "movss %%xmm0, 72(%1) \n\t"
        "movss 44(%1), %%xmm0 \n\t"
        "movss %%xmm6, 12(%1) \n\t"
        "movss %%xmm3, 20(%1) \n\t"
        "addss %%xmm0, %%xmm2 \n\t"
        "movss %%xmm4, 44(%1) \n\t"
        "movss %%xmm2, 88(%1) \n\t"
        "addss 60(%1), %%xmm0 \n\t"
        "movl 60(%1), %0 \n\t"
        "movl %0, 120(%1) \n\t"
        "movss %%xmm0, 104(%1) \n\t"
        "addss %%xmm5, %%xmm1 \n\t"
        "addss 68(%1), %%xmm5 \n\t"
        "movss %%xmm1, 52(%1) \n\t"
        "movss %%xmm5, 60(%1) \n\t"
        "movss 68(%1), %%xmm1 \n\t"
        "movss 100(%1), %%xmm5 \n\t"
        "addss %%xmm7, %%xmm5 \n\t"
        "addss 108(%1), %%xmm7 \n\t"
        "addss %%xmm5, %%xmm1 \n\t"
        "movss 84(%1), %%xmm2 \n\t"
        "addss 92(%1), %%xmm2 \n\t"
        "addss %%xmm2, %%xmm5 \n\t"
        "movss %%xmm1, 68(%1) \n\t"
        "addss %%xmm7, %%xmm2 \n\t"
        "movss 76(%1), %%xmm1 \n\t"
        "movss %%xmm2, 84(%1) \n\t"
        "movss %%xmm5, 76(%1) \n\t"
        "movss 108(%1), %%xmm2 \n\t"
        "addss %%xmm1, %%xmm7 \n\t"
        "addss 124(%1), %%xmm2 \n\t"
        "addss %%xmm2, %%xmm1 \n\t"
        "addss 92(%1), %%xmm2 \n\t"
        "movss %%xmm1, 100(%1) \n\t"
        "movss %%xmm2, 108(%1) \n\t"
        "movss 92(%1), %%xmm2 \n\t"
        "movss %%xmm7, 92(%1) \n\t"
        "addss 124(%1), %%xmm2 \n\t"
        "movss %%xmm2, 116(%1) \n\t"
        /* %0: read-write, early-clobber scratch register */
        :"+&r"(tmp1)
        /* %1..%4: base pointers, see the function comment */
        :"r"(out), "r"(b1), "r"(smask), "r"(in)
        /* out is written through %1, which is only an input operand */
        :"memory"
    );
}
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment