Skip to content
Projects
Groups
Snippets
Help
Loading...
Sign in / Register
Toggle navigation
F
ffmpeg
Project
Project
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Packages
Packages
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
submodule
ffmpeg
Commits
edc0f5dc
Commit
edc0f5dc
authored
Sep 11, 2009
by
Måns Rullgård
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
ARM: NEON optimised MDCT
Originally committed as revision 19819 to
svn://svn.ffmpeg.org/ffmpeg/trunk
parent
46c32e26
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
162 additions
and
0 deletions
+162
-0
mdct_neon.S
libavcodec/arm/mdct_neon.S
+160
-0
dsputil.h
libavcodec/dsputil.h
+1
-0
fft.c
libavcodec/fft.c
+1
-0
No files found.
libavcodec/arm/mdct_neon.S
View file @
edc0f5dc
...
@@ -164,3 +164,163 @@ function ff_imdct_calc_neon, export=1
...
@@ -164,3 +164,163 @@ function ff_imdct_calc_neon, export=1
pop {r4-r6,pc}
pop {r4-r6,pc}
.endfunc
.endfunc
function ff_mdct_calc_neon, export=1
push {r4-r10,lr}
mov r12, #1
ldr lr, [r0, #4] @ nbits
ldr r4, [r0, #8] @ tcos
ldr r5, [r0, #12] @ tsin
ldr r3, [r0, #24] @ revtab
lsl lr, r12, lr @ n = 1 << nbits
add r7, r2, lr @ in4u
sub r9, r7, #16 @ in4d
add r2, r7, lr, lsl #1 @ in3u
add r8, r9, lr, lsl #1 @ in3d
mov r12, #-16
vld2.32 {d16,d18},[r9,:128],r12 @ x,x in4d1,in4d0
vld2.32 {d17,d19},[r8,:128],r12 @ x,x in3d1,in3d0
vld2.32 {d20,d21},[r7,:128]! @ in4u0,in4u1 x,x
vrev64.32 q9, q9 @ in4d0,in4d1 in3d0,in3d1
vld2.32 {d0, d1}, [r2,:128]! @ in3u0,in3u1 x,x
vsub.f32 d20, d18, d20 @ in4d-in4u I
vld1.32 {d2}, [r4,:64]! @ c0,c1
vadd.f32 d0, d0, d19 @ in3u+in3d -R
vld1.32 {d3}, [r5,:64]! @ s0,s1
1:
vmul.f32 d7, d20, d3 @ I*s
vmul.f32 d6, d0, d2 @ -R*c
ldr r6, [r3], #4
vmul.f32 d4, d0, d3 @ -R*s
vmul.f32 d5, d20, d2 @ I*c
subs lr, lr, #16
vsub.f32 d6, d6, d7 @ -R*c-I*s
vadd.f32 d7, d4, d5 @ -R*s+I*c
uxtah r10, r1, r6, ror #16
uxtah r6, r1, r6
beq 1f
vld2.32 {d16,d18},[r9,:128],r12 @ x,x in4d1,in4d0
vld2.32 {d17,d19},[r8,:128],r12 @ x,x in3d1,in3d0
vneg.f32 d7, d7 @ R*s-I*c
vld2.32 {d20,d21},[r7,:128]! @ in4u0,in4u1 x,x
vrev64.32 q9, q9 @ in4d0,in4d1 in3d0,in3d1
vld2.32 {d0, d1}, [r2,:128]! @ in3u0,in3u1 x,x
vsub.f32 d20, d18, d20 @ in4d-in4u I
vld1.32 {d2}, [r4,:64]! @ c0,c1
vadd.f32 d0, d0, d19 @ in3u+in3d -R
vld1.32 {d3}, [r5,:64]! @ s0,s1
vst2.32 {d6[0],d7[0]}, [r6,:64]
vst2.32 {d6[1],d7[1]}, [r10,:64]
b 1b
1:
vneg.f32 d7, d7 @ R*s-I*c
vst2.32 {d6[0],d7[0]}, [r6,:64]
vst2.32 {d6[1],d7[1]}, [r10,:64]
mov r12, #1
ldr lr, [r0, #4] @ nbits
lsl lr, r12, lr @ n = 1 << nbits
sub r8, r2, #16 @ in1d
add r2, r9, #16 @ in0u
sub r9, r7, #16 @ in2d
mov r12, #-16
vld2.32 {d16,d18},[r9,:128],r12 @ x,x in2d1,in2d0
vld2.32 {d17,d19},[r8,:128],r12 @ x,x in1d1,in1d0
vld2.32 {d20,d21},[r7,:128]! @ in2u0,in2u1 x,x
vrev64.32 q9, q9 @ in2d0,in2d1 in1d0,in1d1
vld2.32 {d0, d1}, [r2,:128]! @ in0u0,in0u1 x,x
vsub.f32 d0, d0, d18 @ in0u-in2d R
vld1.32 {d2}, [r4,:64]! @ c0,c1
vadd.f32 d20, d20, d19 @ in2u+in1d -I
vld1.32 {d3}, [r5,:64]! @ s0,s1
1:
vmul.f32 d6, d0, d2 @ R*c
vmul.f32 d7, d20, d3 @ -I*s
ldr r6, [r3], #4
vmul.f32 d4, d0, d3 @ R*s
vmul.f32 d5, d20, d2 @ I*c
subs lr, lr, #16
vsub.f32 d6, d7, d6 @ I*s-R*c
vadd.f32 d7, d4, d5 @ R*s-I*c
uxtah r10, r1, r6, ror #16
uxtah r6, r1, r6
beq 1f
vld2.32 {d16,d18},[r9,:128],r12 @ x,x in2d1,in2d0
vld2.32 {d17,d19},[r8,:128],r12 @ x,x in1d1,in1d0
vld2.32 {d20,d21},[r7,:128]! @ in2u0,in2u1 x,x
vrev64.32 q9, q9 @ in2d0,in2d1 in1d0,in1d1
vld2.32 {d0, d1}, [r2,:128]! @ in0u0,in0u1 x,x
vsub.f32 d0, d0, d18 @ in0u-in2d R
vld1.32 {d2}, [r4,:64]! @ c0,c1
vadd.f32 d20, d20, d19 @ in2u+in1d -I
vld1.32 {d3}, [r5,:64]! @ s0,s1
vst2.32 {d6[0],d7[0]}, [r6,:64]
vst2.32 {d6[1],d7[1]}, [r10,:64]
b 1b
1:
vst2.32 {d6[0],d7[0]}, [r6,:64]
vst2.32 {d6[1],d7[1]}, [r10,:64]
mov r4, r0
mov r6, r1
add r0, r0, #16
bl ff_fft_calc_neon
mov r12, #1
ldr lr, [r4, #4] @ nbits
ldr r5, [r4, #12] @ tsin
ldr r4, [r4, #8] @ tcos
lsl r12, r12, lr @ n = 1 << nbits
lsr lr, r12, #3 @ n8 = n >> 3
add r4, r4, lr, lsl #2
add r5, r5, lr, lsl #2
add r6, r6, lr, lsl #3
sub r1, r4, #8
sub r2, r5, #8
sub r3, r6, #16
mov r7, #-16
mov r12, #-8
mov r8, r6
mov r0, r3
vld2.32 {d0-d1}, [r3,:128], r7 @ d0 =r1,i1 d1 =r0,i0
vld2.32 {d20-d21},[r6,:128]! @ d20=r2,i2 d21=r3,i3
vld1.32 {d18}, [r2,:64], r12 @ d18=s1,s0
1:
subs lr, lr, #2
vmul.f32 d7, d0, d18 @ r1*s1,r0*s0
vld1.32 {d19}, [r5,:64]! @ s2,s3
vmul.f32 d4, d1, d18 @ i1*s1,i0*s0
vld1.32 {d16}, [r1,:64], r12 @ c1,c0
vmul.f32 d5, d21, d19 @ i2*s2,i3*s3
vld1.32 {d17}, [r4,:64]! @ c2,c3
vmul.f32 d6, d20, d19 @ r2*s2,r3*s3
vmul.f32 d24, d0, d16 @ r1*c1,r0*c0
vmul.f32 d25, d20, d17 @ r2*c2,r3*c3
vmul.f32 d22, d21, d17 @ i2*c2,i3*c3
vmul.f32 d23, d1, d16 @ i1*c1,i0*c0
vadd.f32 d4, d4, d24 @ i1*s1+r1*c1,i0*s0+r0*c0
vadd.f32 d5, d5, d25 @ i2*s2+r2*c2,i3*s3+r3*c3
vsub.f32 d6, d22, d6 @ i2*c2-r2*s2,i3*c3-r3*s3
vsub.f32 d7, d23, d7 @ i1*c1-r1*s1,i0*c0-r0*s0
vneg.f32 q2, q2
beq 1f
vld2.32 {d0-d1}, [r3,:128], r7
vld2.32 {d20-d21},[r6,:128]!
vld1.32 {d18}, [r2,:64], r12
vrev64.32 q3, q3
vst2.32 {d4,d6}, [r0,:128], r7
vst2.32 {d5,d7}, [r8,:128]!
b 1b
1:
vrev64.32 q3, q3
vst2.32 {d4,d6}, [r0,:128]
vst2.32 {d5,d7}, [r8,:128]
pop {r4-r10,pc}
.endfunc
libavcodec/dsputil.h
View file @
edc0f5dc
...
@@ -780,6 +780,7 @@ void ff_imdct_calc_sse(MDCTContext *s, FFTSample *output, const FFTSample *input
...
@@ -780,6 +780,7 @@ void ff_imdct_calc_sse(MDCTContext *s, FFTSample *output, const FFTSample *input
void
ff_imdct_half_sse
(
MDCTContext
*
s
,
FFTSample
*
output
,
const
FFTSample
*
input
);
void
ff_imdct_half_sse
(
MDCTContext
*
s
,
FFTSample
*
output
,
const
FFTSample
*
input
);
void
ff_imdct_calc_neon
(
MDCTContext
*
s
,
FFTSample
*
output
,
const
FFTSample
*
input
);
void
ff_imdct_calc_neon
(
MDCTContext
*
s
,
FFTSample
*
output
,
const
FFTSample
*
input
);
void
ff_imdct_half_neon
(
MDCTContext
*
s
,
FFTSample
*
output
,
const
FFTSample
*
input
);
void
ff_imdct_half_neon
(
MDCTContext
*
s
,
FFTSample
*
output
,
const
FFTSample
*
input
);
void
ff_mdct_calc_neon
(
MDCTContext
*
s
,
FFTSample
*
output
,
const
FFTSample
*
input
);
void
ff_mdct_end
(
MDCTContext
*
s
);
void
ff_mdct_end
(
MDCTContext
*
s
);
/* Real Discrete Fourier Transform */
/* Real Discrete Fourier Transform */
...
...
libavcodec/fft.c
View file @
edc0f5dc
...
@@ -119,6 +119,7 @@ av_cold int ff_fft_init(FFTContext *s, int nbits, int inverse)
...
@@ -119,6 +119,7 @@ av_cold int ff_fft_init(FFTContext *s, int nbits, int inverse)
s
->
fft_calc
=
ff_fft_calc_neon
;
s
->
fft_calc
=
ff_fft_calc_neon
;
s
->
imdct_calc
=
ff_imdct_calc_neon
;
s
->
imdct_calc
=
ff_imdct_calc_neon
;
s
->
imdct_half
=
ff_imdct_half_neon
;
s
->
imdct_half
=
ff_imdct_half_neon
;
s
->
mdct_calc
=
ff_mdct_calc_neon
;
revtab_shift
=
3
;
revtab_shift
=
3
;
#endif
#endif
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment