Skip to content
Projects
Groups
Snippets
Help
Loading...
Sign in / Register
Toggle navigation
F
ffmpeg
Project
Project
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Packages
Packages
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
submodule
ffmpeg
Commits
b30a3633
Commit
b30a3633
authored
Sep 25, 2012
by
Justin Ruggles
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
x86: af_volume: add SSE2/SSSE3/AVX-optimized s32 volume scaling
parent
f96f1e06
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
106 additions
and
3 deletions
+106
-3
af_volume.asm
libavfilter/x86/af_volume.asm
+85
-3
af_volume_init.c
libavfilter/x86/af_volume_init.c
+20
-0
x86inc.asm
libavutil/x86/x86inc.asm
+1
-0
No files found.
libavfilter/x86/af_volume.asm
View file @
b30a3633
...
...
@@ -19,12 +19,15 @@
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
%include
"libavutil/x86/x86
inc
.asm"
%include
"libavutil/x86/x86
util
.asm"
SECTION_RODATA
32
pw_1
:
times
8
dw
1
pw_128
:
times
8
dw
128
pd_1_256
:
times
4
dq
0x3F70000000000000
pd_int32_max
:
times
4
dq
0x41DFFFFFFFC00000
pw_1
:
times
8
dw
1
pw_128
:
times
8
dw
128
pq_128
:
times
2
dq
128
SECTION_TEXT
...
...
@@ -54,3 +57,82 @@ cglobal scale_samples_s16, 4,4,4, dst, src, len, volume
sub
lenq
,
mmsize
jge
.
loop
REP_RET
;------------------------------------------------------------------------------
; void ff_scale_samples_s32(uint8_t *dst, const uint8_t *src, int len,
; int volume)
;------------------------------------------------------------------------------
%macro
SCALE_SAMPLES_S32
0
cglobal
scale_samples_s32
,
4
,
4
,
4
,
dst
,
src
,
len
,
volume
%if
ARCH_X86_32
&&
cpuflag
(
avx
)
vbroadcastss
xmm2
,
volumem
%else
movd
xmm2
,
volumed
pshufd
xmm2
,
xmm2
,
0
%endif
CVTDQ2PD
m2
,
xmm2
mulpd
m2
,
m2
,
[
pd_1_256
]
mova
m3
,
[
pd_int32_max
]
lea
lenq
,
[
lend
*
4
-
mmsize
]
.
loop
:
CVTDQ2PD
m0
,
[
srcq
+
lenq
]
CVTDQ2PD
m1
,
[
srcq
+
lenq
+
mmsize
/
2
]
mulpd
m0
,
m0
,
m2
mulpd
m1
,
m1
,
m2
minpd
m0
,
m0
,
m3
minpd
m1
,
m1
,
m3
cvtpd2dq
xmm0
,
m0
cvtpd2dq
xmm1
,
m1
%if
cpuflag
(
avx
)
vmovdqa
[
dstq
+
lenq
]
,
xmm0
vmovdqa
[
dstq
+
lenq
+
mmsize
/
2
]
,
xmm1
%else
movq
[
dstq
+
lenq
]
,
xmm0
movq
[
dstq
+
lenq
+
mmsize
/
2
]
,
xmm1
%endif
sub
lenq
,
mmsize
jge
.
loop
REP_RET
%endmacro
INIT_XMM
sse2
%define
CVTDQ2PD
cvtdq2pd
SCALE_SAMPLES_S32
%define
CVTDQ2PD
vcvtdq2pd
INIT_YMM
avx
SCALE_SAMPLES_S32
%undef
CVTDQ2PD
; NOTE: This is not bit-identical with the C version because it clips to
; [-INT_MAX, INT_MAX] instead of [INT_MIN, INT_MAX]
INIT_XMM
ssse3
,
atom
cglobal
scale_samples_s32
,
4
,
4
,
8
,
dst
,
src
,
len
,
volume
movd
m4
,
volumem
pshufd
m4
,
m4
,
0
mova
m5
,
[
pq_128
]
pxor
m6
,
m6
lea
lenq
,
[
lend
*
4
-
mmsize
]
.
loop
:
; src[i] = av_clipl_int32((src[i] * volume + 128) >> 8);
mova
m7
,
[
srcq
+
lenq
]
pabsd
m3
,
m7
pshufd
m0
,
m3
,
q0100
pshufd
m1
,
m3
,
q0302
pmuludq
m0
,
m4
pmuludq
m1
,
m4
paddq
m0
,
m5
paddq
m1
,
m5
psrlq
m0
,
7
psrlq
m1
,
7
shufps
m2
,
m0
,
m1
,
q3131
shufps
m0
,
m0
,
m1
,
q2020
pcmpgtd
m2
,
m6
por
m0
,
m2
psrld
m0
,
1
psignd
m0
,
m7
mova
[
dstq
+
lenq
]
,
m0
sub
lenq
,
mmsize
jge
.
loop
REP_RET
libavfilter/x86/af_volume_init.c
View file @
b30a3633
...
...
@@ -25,6 +25,13 @@
void
ff_scale_samples_s16_sse2
(
uint8_t
*
dst
,
const
uint8_t
*
src
,
int
len
,
int
volume
);
void
ff_scale_samples_s32_sse2
(
uint8_t
*
dst
,
const
uint8_t
*
src
,
int
len
,
int
volume
);
void
ff_scale_samples_s32_ssse3_atom
(
uint8_t
*
dst
,
const
uint8_t
*
src
,
int
len
,
int
volume
);
void
ff_scale_samples_s32_avx
(
uint8_t
*
dst
,
const
uint8_t
*
src
,
int
len
,
int
volume
);
void
ff_volume_init_x86
(
VolumeContext
*
vol
)
{
int
mm_flags
=
av_get_cpu_flags
();
...
...
@@ -35,5 +42,18 @@ void ff_volume_init_x86(VolumeContext *vol)
vol
->
scale_samples
=
ff_scale_samples_s16_sse2
;
vol
->
samples_align
=
8
;
}
}
else
if
(
sample_fmt
==
AV_SAMPLE_FMT_S32
)
{
if
(
EXTERNAL_SSE2
(
mm_flags
))
{
vol
->
scale_samples
=
ff_scale_samples_s32_sse2
;
vol
->
samples_align
=
4
;
}
if
(
EXTERNAL_SSSE3
(
mm_flags
)
&&
mm_flags
&
AV_CPU_FLAG_ATOM
)
{
vol
->
scale_samples
=
ff_scale_samples_s32_ssse3_atom
;
vol
->
samples_align
=
4
;
}
if
(
EXTERNAL_AVX
(
mm_flags
))
{
vol
->
scale_samples
=
ff_scale_samples_s32_avx
;
vol
->
samples_align
=
8
;
}
}
}
libavutil/x86/x86inc.asm
View file @
b30a3633
...
...
@@ -956,6 +956,7 @@ AVX_INSTR cmpps, 1, 0, 0
AVX_INSTR
cmpsd
,
1
,
0
,
0
AVX_INSTR
cmpss
,
1
,
0
,
0
AVX_INSTR
cvtdq2ps
,
1
,
0
,
0
AVX_INSTR
cvtpd2dq
,
1
,
0
,
0
AVX_INSTR
cvtps2dq
,
1
,
0
,
0
AVX_INSTR
divpd
,
1
,
0
,
0
AVX_INSTR
divps
,
1
,
0
,
0
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment