ffmpeg / Commits / 8c3849bc

Commit 8c3849bc, authored Jul 15, 2012 by Diego Biurrun

    x86: dsputil: port to cpuflags

Parent: 8ff0f776
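The cpuflags scheme this commit ports to replaces hand-suffixed function names: instead of passing the instruction set as a macro argument and appending it by hand (cglobal scalarproduct_int16_%1), INIT_MMX mmxext or INIT_XMM sse2 selects the active instruction set, cglobal appends the suffix automatically, and %if cpuflag(...) branches inside a single macro body. A loose C-preprocessor analogy of the same idea (all names below are illustrative, not part of the commit):

    /* Loose C analogy of the x264asm cpuflags scheme (illustrative only):
     * one templated body, instantiated once per instruction set, with the
     * ISA suffix appended automatically -- INIT_MMX mmxext / INIT_XMM sse2
     * play the role of the macro argument below. */
    #include <stdint.h>

    #define PASTE2(name, isa) name ## _ ## isa
    #define PASTE(name, isa)  PASTE2(name, isa)

    #define DEFINE_SCALARPRODUCT(isa)                                   \
        int32_t PASTE(scalarproduct_int16, isa)(const int16_t *v1,      \
                                                const int16_t *v2,      \
                                                int order)              \
        {                                                               \
            int32_t res = 0;                                            \
            while (order--)                                             \
                res += *v1++ * *v2++; /* what the SIMD kernels compute */ \
            return res;                                                 \
        }

    DEFINE_SCALARPRODUCT(mmxext) /* like INIT_MMX mmxext + SCALARPRODUCT */
    DEFINE_SCALARPRODUCT(sse2)   /* like INIT_XMM sse2   + SCALARPRODUCT */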
Showing 2 changed files with 119 additions and 127 deletions:

    libavcodec/x86/dsputil.asm    +106  -114
    libavcodec/x86/dsputil_mmx.c   +13   -13
libavcodec/x86/dsputil.asm
@@ -33,9 +33,9 @@ pb_bswap32: db 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12

 SECTION_TEXT

-%macro SCALARPRODUCT 1
+%macro SCALARPRODUCT 0
 ; int scalarproduct_int16(int16_t *v1, int16_t *v2, int order)
-cglobal scalarproduct_int16_%1, 3,3,3, v1, v2, order
+cglobal scalarproduct_int16, 3,3,3, v1, v2, order
     shl orderq, 1
     add v1q, orderq
     add v2q, orderq
@@ -62,7 +62,7 @@ cglobal scalarproduct_int16_%1, 3,3,3, v1, v2, order
     RET

 ; int scalarproduct_and_madd_int16(int16_t *v1, int16_t *v2, int16_t *v3, int order, int mul)
-cglobal scalarproduct_and_madd_int16_%1, 4,4,8, v1, v2, v3, order, mul
+cglobal scalarproduct_and_madd_int16, 4,4,8, v1, v2, v3, order, mul
     shl orderq, 1
     movd    m7, mulm
 %if mmsize == 16
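For reference, what the kernel in this macro computes, as a plain-C sketch (modeled on the scalar fallback; not part of this commit): it returns the dot product of v1 and v2 while simultaneously updating v1 in place.

    #include <stdint.h>

    /* Plain-C sketch of scalarproduct_and_madd_int16 (illustrative). */
    static int32_t scalarproduct_and_madd_int16(int16_t *v1, const int16_t *v2,
                                                const int16_t *v3, int order,
                                                int mul)
    {
        int32_t res = 0;

        while (order--) {
            res   += *v1 * *v2++;   /* accumulate the dot product          */
            *v1++ += mul * *v3++;   /* madd back into v1, wraps mod 2^16   */
        }
        return res;
    }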
@@ -107,10 +107,10 @@ cglobal scalarproduct_and_madd_int16_%1, 4,4,8, v1, v2, v3, order, mul
     RET
 %endmacro

-INIT_MMX
-SCALARPRODUCT mmxext
-INIT_XMM
-SCALARPRODUCT sse2
+INIT_MMX mmxext
+SCALARPRODUCT
+INIT_XMM sse2
+SCALARPRODUCT

 %macro SCALARPRODUCT_LOOP 1
 align 16
@@ -158,7 +158,8 @@ align 16
 %endmacro

 ; int scalarproduct_and_madd_int16(int16_t *v1, int16_t *v2, int16_t *v3, int order, int mul)
-cglobal scalarproduct_and_madd_int16_ssse3, 4,5,10, v1, v2, v3, order, mul
+INIT_XMM ssse3
+cglobal scalarproduct_and_madd_int16, 4,5,10, v1, v2, v3, order, mul
     shl orderq, 1
     movd    m7, mulm
     pshuflw m7, m7, 0
@@ -207,48 +208,60 @@ SCALARPRODUCT_LOOP 0
 ;                            const int16_t *window, unsigned int len)
 ;-----------------------------------------------------------------------------

-%macro REVERSE_WORDS_MMXEXT 1-2
-    pshufw   %1, %1, 0x1B
-%endmacro
-
-%macro REVERSE_WORDS_SSE2 1-2
+%macro REVERSE_WORDS 1-2
+%if cpuflag(ssse3) && notcpuflag(atom)
+    pshufb  %1, %2
+%elif cpuflag(sse2)
     pshuflw  %1, %1, 0x1B
     pshufhw  %1, %1, 0x1B
     pshufd   %1, %1, 0x4E
+%elif cpuflag(mmxext)
+    pshufw   %1, %1, 0x1B
+%endif
 %endmacro

-%macro REVERSE_WORDS_SSSE3 2
-    pshufb  %1, %2
-%endmacro
-
+%macro MUL16FIXED 3
+%if cpuflag(ssse3) ; dst, src, unused
+; dst = ((dst * src) + (1<<14)) >> 15
+    pmulhrsw   %1, %2
+%elif cpuflag(mmxext) ; dst, src, temp
 ; dst = (dst * src) >> 15
 ; pmulhw cuts off the bottom bit, so we have to lshift by 1 and add it back
 ; in from the pmullw result.
-%macro MUL16FIXED_MMXEXT 3 ; dst, src, temp
     mova    %3, %1
     pmulhw  %1, %2
     pmullw  %3, %2
     psrlw   %3, 15
     psllw   %1, 1
     por     %1, %3
+%endif
 %endmacro

-; dst = ((dst * src) + (1<<14)) >> 15
-%macro MUL16FIXED_SSSE3 3 ; dst, src, unused
-    pmulhrsw   %1, %2
-%endmacro
-
-%macro APPLY_WINDOW_INT16 3 ; %1=instruction set, %2=mmxext/sse2 bit exact version, %3=has_ssse3
-cglobal apply_window_int16_%1, 4,5,6, output, input, window, offset, offset2
+%macro APPLY_WINDOW_INT16 1 ; %1 bitexact version
+%if %1
+cglobal apply_window_int16, 4,5,6, output, input, window, offset, offset2
+%else
+cglobal apply_window_int16_round, 4,5,6, output, input, window, offset, offset2
+%endif
     lea     offset2q, [offsetq-mmsize]
-%if %2
-    mova          m5, [pd_16384]
-%elifidn %1, ssse3
+%if cpuflag(ssse3) && notcpuflag(atom)
     mova          m5, [pb_revwords]
     ALIGN 16
+%elif %1
+    mova          m5, [pd_16384]
 %endif
 .loop:
-%if %2
+%if cpuflag(ssse3)
+    ; This version does the 16x16->16 multiplication in-place without expanding
+    ; to 32-bit. The ssse3 version is bit-identical.
+    mova          m0, [windowq+offset2q]
+    mova          m1, [inputq+offset2q]
+    pmulhrsw      m1, m0
+    REVERSE_WORDS m0, m5
+    pmulhrsw      m0, [inputq+offsetq]
+    mova  [outputq+offset2q], m1
+    mova  [outputq+offsetq ], m0
+%elif %1
     ; This version expands 16-bit to 32-bit, multiplies by the window,
     ; adds 16384 for rounding, right shifts 15, then repacks back to words to
     ; save to the output. The window is reversed for the second half.
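The pmulhw comment above is worth unpacking: pmulhw yields (a*b) >> 16, one bit coarser than the wanted (a*b) >> 15, and the missing bit is exactly the top bit of the pmullw low half. A small self-checking C sketch of both fixed-point paths (illustrative, not part of the commit):

    #include <assert.h>
    #include <stdint.h>

    /* The mmxext MUL16FIXED recombination: shift the high half left by 1
     * and OR in bit 15 of the low half (psllw/psrlw/por in the asm). */
    static int16_t mul16fixed_mmxext(int16_t a, int16_t b)
    {
        int32_t  prod = (int32_t)a * b;
        int16_t  hi   = (int16_t)(prod >> 16);     /* pmulhw */
        uint16_t lo   = (uint16_t)prod;            /* pmullw */
        return (int16_t)((hi << 1) | (lo >> 15));
    }

    /* The ssse3 path uses pmulhrsw instead, which also rounds: */
    static int16_t mul16fixed_ssse3(int16_t a, int16_t b)
    {
        return (int16_t)(((int32_t)a * b + (1 << 14)) >> 15);
    }

    int main(void)
    {
        /* Spot-check the recombination identity over a sample grid. */
        for (int a = -32768; a < 32768; a += 257)
            for (int b = -32768; b < 32768; b += 263)
                assert(mul16fixed_mmxext(a, b) ==
                       (int16_t)(((int32_t)a * b) >> 15));
        return 0;
    }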
@@ -284,16 +297,6 @@ cglobal apply_window_int16_%1, 4,5,6, output, input, window, offset, offset2
     psrad         m2, 15
     packssdw      m0, m2
     mova  [outputq+offsetq], m0
-%elif %3
-    ; This version does the 16x16->16 multiplication in-place without expanding
-    ; to 32-bit. The ssse3 version is bit-identical.
-    mova          m0, [windowq+offset2q]
-    mova          m1, [inputq+offset2q]
-    pmulhrsw      m1, m0
-    REVERSE_WORDS m0, m5
-    pmulhrsw      m0, [inputq+offsetq]
-    mova  [outputq+offset2q], m1
-    mova  [outputq+offsetq ], m0
 %else
     ; This version does the 16x16->16 multiplication in-place without expanding
     ; to 32-bit. The mmxext and sse2 versions do not use rounding, and
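For orientation, a rough C reference of the windowing operation itself, modeled on the scalar fallback (illustrative; the bit-exact assembly paths reproduce this rounding, while the non-bit-exact mmxext/sse2 paths skip the +16384 rounding add and may differ in the low bit):

    #include <stdint.h>

    /* Rough C reference for apply_window_int16 (illustrative).  The
     * window is symmetric, so window[i] is applied to both ends. */
    static void apply_window_int16(int16_t *output, const int16_t *input,
                                   const int16_t *window, unsigned int len)
    {
        unsigned int i;
        unsigned int len2 = len >> 1;

        for (i = 0; i < len2; i++) {
            int16_t w = window[i];
            output[i]           = ((int32_t)input[i]           * w + (1 << 14)) >> 15;
            output[len - i - 1] = ((int32_t)input[len - i - 1] * w + (1 << 14)) >> 15;
        }
    }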
@@ -313,22 +316,24 @@ cglobal apply_window_int16_%1, 4,5,6, output, input, window, offset, offset2
     REP_RET
 %endmacro

-INIT_MMX
-%define REVERSE_WORDS REVERSE_WORDS_MMXEXT
-%define MUL16FIXED MUL16FIXED_MMXEXT
-APPLY_WINDOW_INT16 mmxext, 0, 0
-APPLY_WINDOW_INT16 mmxext_ba, 1, 0
-INIT_XMM
-%define REVERSE_WORDS REVERSE_WORDS_SSE2
-APPLY_WINDOW_INT16 sse2, 0, 0
-APPLY_WINDOW_INT16 sse2_ba, 1, 0
-APPLY_WINDOW_INT16 ssse3_atom, 0, 1
-%define REVERSE_WORDS REVERSE_WORDS_SSSE3
-APPLY_WINDOW_INT16 ssse3, 0, 1
+INIT_MMX mmxext
+APPLY_WINDOW_INT16 0
+INIT_XMM sse2
+APPLY_WINDOW_INT16 0
+INIT_MMX mmxext
+APPLY_WINDOW_INT16 1
+INIT_XMM sse2
+APPLY_WINDOW_INT16 1
+INIT_XMM ssse3
+APPLY_WINDOW_INT16 1
+INIT_XMM ssse3, atom
+APPLY_WINDOW_INT16 1

 ; void add_hfyu_median_prediction_mmxext(uint8_t *dst, const uint8_t *top, const uint8_t *diff, int w, int *left, int *left_top)
-cglobal add_hfyu_median_prediction_mmxext, 6,6,0, dst, top, diff, w, left, left_top
+INIT_MMX mmxext
+cglobal add_hfyu_median_prediction, 6,6,0, dst, top, diff, w, left, left_top
     movq    mm0, [topq]
     movq    mm2, mm0
     movd    mm4, [left_topq]
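The HuffYUV median predictor being renamed here works per byte: predict from the median of left, top, and left + top - topleft, then add the coded difference. A plain-C sketch, modeled on the scalar fallback (illustrative):

    #include <stdint.h>

    /* Median of three values, as used by HuffYUV prediction. */
    static int mid_pred(int a, int b, int c)
    {
        if (a > b) { int t = a; a = b; b = t; }  /* now a <= b  */
        if (b > c) b = c;                        /* b = min(b, c) */
        return a > b ? a : b;                    /* max(a, b)     */
    }

    static void add_hfyu_median_prediction(uint8_t *dst, const uint8_t *top,
                                           const uint8_t *diff, int w,
                                           int *left, int *left_top)
    {
        int i;
        uint8_t l  = *left;
        uint8_t lt = *left_top;

        for (i = 0; i < w; i++) {
            /* predict, then add the coded difference (all mod 256) */
            l      = mid_pred(l, top[i], (l + top[i] - lt) & 0xFF) + diff[i];
            lt     = top[i];
            dst[i] = l;
        }
        *left     = l;
        *left_top = lt;
    }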
@@ -430,8 +435,8 @@ cglobal add_hfyu_median_prediction_mmxext, 6,6,0, dst, top, diff, w, left, left_top
 %endmacro

 ; int add_hfyu_left_prediction(uint8_t *dst, const uint8_t *src, int w, int left)
-INIT_MMX
-cglobal add_hfyu_left_prediction_ssse3, 3,3,7, dst, src, w, left
+INIT_MMX ssse3
+cglobal add_hfyu_left_prediction, 3,3,7, dst, src, w, left
 .skip_prologue:
     mova    m5, [pb_7]
     mova    m4, [pb_zzzz3333zzzzbbbb]
@@ -440,8 +445,8 @@ cglobal add_hfyu_left_prediction_ssse3, 3,3,7, dst, src, w, left
     psllq   m0, 56
     ADD_HFYU_LEFT_LOOP 1, 1

-INIT_XMM
-cglobal add_hfyu_left_prediction_sse4, 3,3,7, dst, src, w, left
+INIT_XMM sse4
+cglobal add_hfyu_left_prediction, 3,3,7, dst, src, w, left
     mova    m5, [pb_f]
     mova    m6, [pb_zzzzzzzz77777777]
     mova    m4, [pb_zzzz3333zzzzbbbb]
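Left prediction is simply a running modulo-256 prefix sum; a plain-C sketch of what the ssse3/sse4 kernels compute (illustrative):

    #include <stdint.h>

    /* Rough C reference for add_hfyu_left_prediction: running mod-256
     * prefix sum, seeded with and returning the carried "left" value. */
    static int add_hfyu_left_prediction(uint8_t *dst, const uint8_t *src,
                                        int w, int left)
    {
        int i;
        uint8_t acc = left;

        for (i = 0; i < w; i++) {
            acc   += src[i];   /* wraps mod 256 */
            dst[i] = acc;
        }
        return acc;
    }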
@@ -460,7 +465,8 @@ cglobal add_hfyu_left_prediction_sse4, 3,3,7, dst, src, w, left

 ; float scalarproduct_float_sse(const float *v1, const float *v2, int len)
-cglobal scalarproduct_float_sse, 3,3,2, v1, v2, offset
+INIT_XMM sse
+cglobal scalarproduct_float, 3,3,2, v1, v2, offset
     neg offsetq
     shl offsetq, 2
     sub v1q, offsetq
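And the float dot product, for completeness (plain-C sketch, illustrative):

    /* Rough C reference for scalarproduct_float. */
    static float scalarproduct_float(const float *v1, const float *v2, int len)
    {
        float p = 0.0f;
        int i;

        for (i = 0; i < len; i++)
            p += v1[i] * v2[i];
        return p;
    }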
@@ -1243,15 +1249,20 @@ BUTTERFLIES_FLOAT_INTERLEAVE
 INIT_YMM avx
 BUTTERFLIES_FLOAT_INTERLEAVE

-INIT_XMM sse2
 ; %1 = aligned/unaligned
-%macro BSWAP_LOOPS_SSE2  1
+%macro BSWAP_LOOPS  1
     mov      r3, r2
     sar      r2, 3
     jz       .left4_%1
 .loop8_%1:
     mov%1    m0, [r1 +  0]
     mov%1    m1, [r1 + 16]
+%if cpuflag(ssse3)
+    pshufb   m0, m2
+    pshufb   m1, m2
+    mova     [r0 +  0], m0
+    mova     [r0 + 16], m1
+%else
     pshuflw  m0, m0, 10110001b
     pshuflw  m1, m1, 10110001b
     pshufhw  m0, m0, 10110001b
@@ -1266,8 +1277,9 @@ INIT_XMM sse2
     por      m3, m1
     mova     [r0 +  0], m2
     mova     [r0 + 16], m3
-    add      r1, 32
+%endif
     add      r0, 32
+    add      r1, 32
     dec      r2
     jnz      .loop8_%1
 .left4_%1:
@@ -1275,6 +1287,10 @@ INIT_XMM sse2
     and      r3, 4
     jz       .left
     mov%1    m0, [r1]
+%if cpuflag(ssse3)
+    pshufb   m0, m2
+    mova     [r0], m0
+%else
     pshuflw  m0, m0, 10110001b
     pshufhw  m0, m0, 10110001b
     mova     m2, m0
@@ -1282,72 +1298,29 @@ INIT_XMM sse2
     psrlw    m2, 8
     por      m2, m0
     mova     [r0], m2
+%endif
     add      r1, 16
     add      r0, 16
 %endmacro

 ; void bswap_buf(uint32_t *dst, const uint32_t *src, int w);
+%macro BSWAP32_BUF 0
+%if cpuflag(ssse3)
+cglobal bswap32_buf, 3,4,3
+    mov      r3, r1
+    mova     m2, [pb_bswap32]
+%else
 cglobal bswap32_buf, 3,4,5
     mov      r3, r1
+%endif
     and      r3, 15
     jz       .start_align
-    BSWAP_LOOPS_SSE2  u
+    BSWAP_LOOPS  u
     jmp      .left
 .start_align:
-    BSWAP_LOOPS_SSE2  a
+    BSWAP_LOOPS  a
 .left:
-    and      r2, 3
-    jz       .end
-.loop2:
-    mov      r3d, [r1]
-    bswap    r3d
-    mov      [r0], r3d
-    add      r1, 4
-    add      r0, 4
-    dec      r2
-    jnz      .loop2
-.end:
-    RET
-
-; %1 = aligned/unaligned
-%macro BSWAP_LOOPS_SSSE3  1
-    mov      r3, r2
-    sar      r2, 3
-    jz       .left4_%1
-.loop8_%1:
-    mov%1    m0, [r1 +  0]
-    mov%1    m1, [r1 + 16]
-    pshufb   m0, m2
-    pshufb   m1, m2
-    mova     [r0 +  0], m0
-    mova     [r0 + 16], m1
-    add      r0, 32
-    add      r1, 32
-    dec      r2
-    jnz      .loop8_%1
-.left4_%1:
-    mov      r2, r3
-    and      r3, 4
-    jz       .left2
-    mov%1    m0, [r1]
-    pshufb   m0, m2
-    mova     [r0], m0
-    add      r1, 16
-    add      r0, 16
-%endmacro
-
-INIT_XMM ssse3
-; void bswap_buf(uint32_t *dst, const uint32_t *src, int w);
-cglobal bswap32_buf, 3,4,3
-    mov      r3, r1
-    mova     m2, [pb_bswap32]
-    and      r3, 15
-    jz       .start_align
-    BSWAP_LOOPS_SSSE3  u
-    jmp      .left2
-.start_align:
-    BSWAP_LOOPS_SSSE3  a
-.left2:
+%if cpuflag(ssse3)
     mov      r3, r2
     and      r2, 2
     jz       .left1
@@ -1362,5 +1335,24 @@ cglobal bswap32_buf, 3,4,3
     mov      r2d, [r1]
     bswap    r2d
     mov      [r0], r2d
+%else
+    and      r2, 3
+    jz       .end
+.loop2:
+    mov      r3d, [r1]
+    bswap    r3d
+    mov      [r0], r3d
+    add      r1, 4
+    add      r0, 4
+    dec      r2
+    jnz      .loop2
+%endif
 .end:
     RET
+%endmacro
+
+INIT_XMM sse2
+BSWAP32_BUF
+INIT_XMM ssse3
+BSWAP32_BUF
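What BSWAP32_BUF computes, as a plain-C sketch (illustrative): byte-swap every 32-bit word. The SSE2 path emulates this with word shuffles (the 10110001b immediate to pshuflw/pshufhw swaps adjacent 16-bit words) plus byte shifts and OR; the SSSE3 path does the whole swap in one pshufb against pb_bswap32.

    #include <stdint.h>

    /* Rough C reference for bswap_buf. */
    static void bswap_buf(uint32_t *dst, const uint32_t *src, int w)
    {
        int i;

        for (i = 0; i < w; i++) {
            uint32_t x = src[i];
            dst[i] = (x >> 24) | ((x >> 8) & 0x0000FF00) |
                     ((x << 8) & 0x00FF0000) | (x << 24);
        }
    }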
libavcodec/x86/dsputil_mmx.c
@@ -2297,16 +2297,16 @@ int32_t ff_scalarproduct_and_madd_int16_ssse3(int16_t *v1, const int16_t *v2,
                                               const int16_t *v3,
                                               int order, int mul);

-void ff_apply_window_int16_mmxext(int16_t *output, const int16_t *input,
-                                  const int16_t *window, unsigned int len);
-void ff_apply_window_int16_mmxext_ba(int16_t *output, const int16_t *input,
-                                     const int16_t *window, unsigned int len);
-void ff_apply_window_int16_sse2(int16_t *output, const int16_t *input,
-                                const int16_t *window, unsigned int len);
-void ff_apply_window_int16_sse2_ba(int16_t *output, const int16_t *input,
-                                   const int16_t *window, unsigned int len);
-void ff_apply_window_int16_ssse3(int16_t *output, const int16_t *input,
-                                 const int16_t *window, unsigned int len);
+void ff_apply_window_int16_round_mmxext(int16_t *output, const int16_t *input,
+                                        const int16_t *window, unsigned int len);
+void ff_apply_window_int16_round_sse2(int16_t *output, const int16_t *input,
+                                      const int16_t *window, unsigned int len);
+void ff_apply_window_int16_mmxext(int16_t *output, const int16_t *input,
+                                  const int16_t *window, unsigned int len);
+void ff_apply_window_int16_sse2(int16_t *output, const int16_t *input,
+                                const int16_t *window, unsigned int len);
+void ff_apply_window_int16_ssse3(int16_t *output, const int16_t *input,
+                                 const int16_t *window, unsigned int len);
 void ff_apply_window_int16_ssse3_atom(int16_t *output, const int16_t *input,
                                       const int16_t *window, unsigned int len);
@@ -2568,9 +2568,9 @@ static void dsputil_init_mmxext(DSPContext *c, AVCodecContext *avctx,
         c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_mmxext;

         if (avctx->flags & CODEC_FLAG_BITEXACT) {
-            c->apply_window_int16 = ff_apply_window_int16_mmxext_ba;
+            c->apply_window_int16 = ff_apply_window_int16_mmxext;
         } else {
-            c->apply_window_int16 = ff_apply_window_int16_mmxext;
+            c->apply_window_int16 = ff_apply_window_int16_round_mmxext;
         }
 #endif /* HAVE_YASM */
 }
@@ -2758,9 +2758,9 @@ static void dsputil_init_sse2(DSPContext *c, AVCodecContext *avctx,
             c->vector_clip_int32 = ff_vector_clip_int32_sse2;
         }
         if (avctx->flags & CODEC_FLAG_BITEXACT) {
-            c->apply_window_int16 = ff_apply_window_int16_sse2_ba;
+            c->apply_window_int16 = ff_apply_window_int16_sse2;
         } else if (!(mm_flags & AV_CPU_FLAG_SSE2SLOW)) {
-            c->apply_window_int16 = ff_apply_window_int16_sse2;
+            c->apply_window_int16 = ff_apply_window_int16_round_sse2;
         }
         c->bswap_buf = ff_bswap32_buf_sse2;
 #endif /* HAVE_YASM */