Skip to content
Projects
Groups
Snippets
Help
Loading...
Sign in / Register
Toggle navigation
F
ffmpeg
Project
Project
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Packages
Packages
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
submodule
ffmpeg
Commits
2e89aeed
Commit
2e89aeed
authored
Jul 15, 2012
by
Diego Biurrun
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
x86: h264_idct: port to cpuflags
parent
490df522
Show whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
70 additions
and
69 deletions
+70
-69
h264_idct.asm
libavcodec/x86/h264_idct.asm
+70
-69
No files found.
libavcodec/x86/h264_idct.asm
View file @
2e89aeed
...
...
@@ -76,9 +76,9 @@ SECTION .text
STORE_DIFFx2
m2
,
m3
,
m4
,
m5
,
m7
,
6
,
%1
,
%3
%endmacro
INIT_MMX
INIT_MMX
mmx
; ff_h264_idct_add_mmx(uint8_t *dst, int16_t *block, int stride)
cglobal
h264_idct_add_8
_mmx
,
3
,
3
,
0
cglobal
h264_idct_add_8
,
3
,
3
,
0
IDCT4_ADD
r0
,
r1
,
r2
RET
...
...
@@ -180,9 +180,9 @@ cglobal h264_idct_add_8_mmx, 3, 3, 0
STORE_DIFFx2
m1
,
m2
,
m5
,
m6
,
m7
,
6
,
%1
,
%3
%endmacro
INIT_MMX
INIT_MMX
mmx
; ff_h264_idct8_add_mmx(uint8_t *dst, int16_t *block, int stride)
cglobal
h264_idct8_add_8
_mmx
,
3
,
4
,
0
cglobal
h264_idct8_add_8
,
3
,
4
,
0
%
assign
pad
128
+
4
-
(
stack_offset
&
7
)
SUB
rsp
,
pad
...
...
@@ -240,9 +240,9 @@ cglobal h264_idct8_add_8_mmx, 3, 4, 0
STORE_DIFF
m1
,
m6
,
m7
,
[
%1
+
%4
]
%endmacro
INIT_XMM
INIT_XMM
sse2
; ff_h264_idct8_add_sse2(uint8_t *dst, int16_t *block, int stride)
cglobal
h264_idct8_add_8
_sse2
,
3
,
4
,
10
cglobal
h264_idct8_add_8
,
3
,
4
,
10
IDCT8_ADD_SSE
r0
,
r1
,
r2
,
r3
RET
...
...
@@ -285,24 +285,25 @@ cglobal h264_idct8_add_8_sse2, 3, 4, 10
%1
[
%2
+
%4
]
,
m5
%endmacro
INIT_MMX
INIT_MMX
mmxext
; ff_h264_idct_dc_add_mmxext(uint8_t *dst, int16_t *block, int stride)
cglobal
h264_idct_dc_add_8
_mmxext
,
3
,
3
,
0
cglobal
h264_idct_dc_add_8
,
3
,
3
,
0
DC_ADD_MMXEXT_INIT
r1
,
r2
DC_ADD_MMXEXT_OP
movh
,
r0
,
r2
,
r1
RET
; ff_h264_idct8_dc_add_mmxext(uint8_t *dst, int16_t *block, int stride)
cglobal
h264_idct8_dc_add_8
_mmxext
,
3
,
3
,
0
cglobal
h264_idct8_dc_add_8
,
3
,
3
,
0
DC_ADD_MMXEXT_INIT
r1
,
r2
DC_ADD_MMXEXT_OP
mova
,
r0
,
r2
,
r1
lea
r0
,
[
r0
+
r2
*
4
]
DC_ADD_MMXEXT_OP
mova
,
r0
,
r2
,
r1
RET
INIT_MMX
mmx
; ff_h264_idct_add16_mmx(uint8_t *dst, const int *block_offset,
; DCTELEM *block, int stride, const uint8_t nnzc[6*8])
cglobal
h264_idct_add16_8
_mmx
,
5
,
7
+
npicregs
,
0
,
dst
,
block_offset
,
block
,
stride
,
nnzc
,
cntr
,
coeff
,
picreg
cglobal
h264_idct_add16_8
,
5
,
7
+
npicregs
,
0
,
dst
,
block_offset
,
block
,
stride
,
nnzc
,
cntr
,
coeff
,
picreg
xor
r5
,
r5
%ifdef
PIC
lea
picregq
,
[
scan8_mem
]
...
...
@@ -324,7 +325,7 @@ cglobal h264_idct_add16_8_mmx, 5, 7 + npicregs, 0, dst, block_offset, block, str
; ff_h264_idct8_add4_mmx(uint8_t *dst, const int *block_offset,
; DCTELEM *block, int stride, const uint8_t nnzc[6*8])
cglobal
h264_idct8_add4_8
_mmx
,
5
,
7
+
npicregs
,
0
,
dst
,
block_offset
,
block
,
stride
,
nnzc
,
cntr
,
coeff
,
picreg
cglobal
h264_idct8_add4_8
,
5
,
7
+
npicregs
,
0
,
dst
,
block_offset
,
block
,
stride
,
nnzc
,
cntr
,
coeff
,
picreg
%
assign
pad
128
+
4
-
(
stack_offset
&
7
)
SUB
rsp
,
pad
...
...
@@ -354,9 +355,10 @@ cglobal h264_idct8_add4_8_mmx, 5, 7 + npicregs, 0, dst, block_offset, block, str
ADD
rsp
,
pad
RET
INIT_MMX
mmxext
; ff_h264_idct_add16_mmxext(uint8_t *dst, const int *block_offset,
; DCTELEM *block, int stride, const uint8_t nnzc[6*8])
cglobal
h264_idct_add16_8
_mmxext
,
5
,
8
+
npicregs
,
0
,
dst1
,
block_offset
,
block
,
stride
,
nnzc
,
cntr
,
coeff
,
dst2
,
picreg
cglobal
h264_idct_add16_8
,
5
,
8
+
npicregs
,
0
,
dst1
,
block_offset
,
block
,
stride
,
nnzc
,
cntr
,
coeff
,
dst2
,
picreg
xor
r5
,
r5
%ifdef
PIC
lea
picregq
,
[
scan8_mem
]
...
...
@@ -398,9 +400,10 @@ cglobal h264_idct_add16_8_mmxext, 5, 8 + npicregs, 0, dst1, block_offset, block,
jl
.
nextblock
REP_RET
INIT_MMX
mmx
; ff_h264_idct_add16intra_mmx(uint8_t *dst, const int *block_offset,
; DCTELEM *block, int stride, const uint8_t nnzc[6*8])
cglobal
h264_idct_add16intra_8
_mmx
,
5
,
7
+
npicregs
,
0
,
dst
,
block_offset
,
block
,
stride
,
nnzc
,
cntr
,
coeff
,
picreg
cglobal
h264_idct_add16intra_8
,
5
,
7
+
npicregs
,
0
,
dst
,
block_offset
,
block
,
stride
,
nnzc
,
cntr
,
coeff
,
picreg
xor
r5
,
r5
%ifdef
PIC
lea
picregq
,
[
scan8_mem
]
...
...
@@ -421,10 +424,11 @@ cglobal h264_idct_add16intra_8_mmx, 5, 7 + npicregs, 0, dst, block_offset, block
jl
.
nextblock
REP_RET
INIT_MMX
mmxext
; ff_h264_idct_add16intra_mmxext(uint8_t *dst, const int *block_offset,
; DCTELEM *block, int stride,
; const uint8_t nnzc[6*8])
cglobal
h264_idct_add16intra_8
_mmxext
,
5
,
8
+
npicregs
,
0
,
dst1
,
block_offset
,
block
,
stride
,
nnzc
,
cntr
,
coeff
,
dst2
,
picreg
cglobal
h264_idct_add16intra_8
,
5
,
8
+
npicregs
,
0
,
dst1
,
block_offset
,
block
,
stride
,
nnzc
,
cntr
,
coeff
,
dst2
,
picreg
xor
r5
,
r5
%ifdef
PIC
lea
picregq
,
[
scan8_mem
]
...
...
@@ -467,7 +471,7 @@ cglobal h264_idct_add16intra_8_mmxext, 5, 8 + npicregs, 0, dst1, block_offset, b
; ff_h264_idct8_add4_mmxext(uint8_t *dst, const int *block_offset,
; DCTELEM *block, int stride,
; const uint8_t nnzc[6*8])
cglobal
h264_idct8_add4_8
_mmxext
,
5
,
8
+
npicregs
,
0
,
dst1
,
block_offset
,
block
,
stride
,
nnzc
,
cntr
,
coeff
,
dst2
,
picreg
cglobal
h264_idct8_add4_8
,
5
,
8
+
npicregs
,
0
,
dst1
,
block_offset
,
block
,
stride
,
nnzc
,
cntr
,
coeff
,
dst2
,
picreg
%
assign
pad
128
+
4
-
(
stack_offset
&
7
)
SUB
rsp
,
pad
...
...
@@ -524,10 +528,10 @@ cglobal h264_idct8_add4_8_mmxext, 5, 8 + npicregs, 0, dst1, block_offset, block,
ADD
rsp
,
pad
RET
INIT_XMM
INIT_XMM
sse2
; ff_h264_idct8_add4_sse2(uint8_t *dst, const int *block_offset,
; DCTELEM *block, int stride, const uint8_t nnzc[6*8])
cglobal
h264_idct8_add4_8
_sse2
,
5
,
8
+
npicregs
,
10
,
dst1
,
block_offset
,
block
,
stride
,
nnzc
,
cntr
,
coeff
,
dst2
,
picreg
cglobal
h264_idct8_add4_8
,
5
,
8
+
npicregs
,
10
,
dst1
,
block_offset
,
block
,
stride
,
nnzc
,
cntr
,
coeff
,
dst2
,
picreg
xor
r5
,
r5
%ifdef
PIC
lea
picregq
,
[
scan8_mem
]
...
...
@@ -542,7 +546,7 @@ cglobal h264_idct8_add4_8_sse2, 5, 8 + npicregs, 10, dst1, block_offset, block,
movsx
r6
,
word
[r2]
test
r6
,
r6
jz
.
no_dc
INIT_MMX
INIT_MMX
cpuname
DC_ADD_MMXEXT_INIT
r2
,
r3
,
r6
%if
ARCH_X86_64
==
0
%define
dst2q
r1
...
...
@@ -562,7 +566,7 @@ INIT_MMX
jl
.
nextblock
REP_RET
.
no_dc
:
INIT_XMM
INIT_XMM
cpuname
mov
dst2d
,
dword
[
r1
+
r5
*
4
]
add
dst2q
,
r0
IDCT8_ADD_SSE
dst2q
,
r2
,
r3
,
r6
...
...
@@ -576,7 +580,7 @@ INIT_XMM
jl
.
nextblock
REP_RET
INIT_MMX
INIT_MMX
mmx
h264_idct_add8_mmx_plane
:
.
nextblock
:
movzx
r6
,
byte
[
scan8
+
r5
]
...
...
@@ -602,7 +606,7 @@ h264_idct_add8_mmx_plane:
; ff_h264_idct_add8_mmx(uint8_t **dest, const int *block_offset,
; DCTELEM *block, int stride, const uint8_t nnzc[6*8])
cglobal
h264_idct_add8_8
_mmx
,
5
,
8
+
npicregs
,
0
,
dst1
,
block_offset
,
block
,
stride
,
nnzc
,
cntr
,
coeff
,
dst2
,
picreg
cglobal
h264_idct_add8_8
,
5
,
8
+
npicregs
,
0
,
dst1
,
block_offset
,
block
,
stride
,
nnzc
,
cntr
,
coeff
,
dst2
,
picreg
mov
r5
,
16
add
r2
,
512
%ifdef
PIC
...
...
@@ -663,9 +667,10 @@ h264_idct_add8_mmxext_plane:
jnz
.
nextblock
rep
ret
INIT_MMX
mmxext
; ff_h264_idct_add8_mmxext(uint8_t **dest, const int *block_offset,
; DCTELEM *block, int stride, const uint8_t nnzc[6*8])
cglobal
h264_idct_add8_8
_mmxext
,
5
,
8
+
npicregs
,
0
,
dst1
,
block_offset
,
block
,
stride
,
nnzc
,
cntr
,
coeff
,
dst2
,
picreg
cglobal
h264_idct_add8_8
,
5
,
8
+
npicregs
,
0
,
dst1
,
block_offset
,
block
,
stride
,
nnzc
,
cntr
,
coeff
,
dst2
,
picreg
mov
r5
,
16
add
r2
,
512
%if
ARCH_X86_64
...
...
@@ -685,7 +690,6 @@ cglobal h264_idct_add8_8_mmxext, 5, 8 + npicregs, 0, dst1, block_offset, block,
call
h264_idct_add8_mmxext_plane
RET
INIT_MMX
; r0 = uint8_t *dst, r2 = int16_t *block, r3 = int stride, r6=clobbered
h264_idct_dc_add8_mmxext
:
movd
m0
,
[
r2
]
; 0 0 X D
...
...
@@ -703,7 +707,7 @@ h264_idct_dc_add8_mmxext:
ret
ALIGN
16
INIT_XMM
INIT_XMM
sse2
; r0 = uint8_t *dst (clobbered), r2 = int16_t *block, r3 = int stride
h264_add8x4_idct_sse2
:
movq
m0
,
[
r2
+
0
]
...
...
@@ -743,7 +747,7 @@ h264_add8x4_idct_sse2:
; ff_h264_idct_add16_sse2(uint8_t *dst, const int *block_offset,
; DCTELEM *block, int stride, const uint8_t nnzc[6*8])
cglobal
h264_idct_add16_8
_sse2
,
5
,
5
+
ARCH_X86_64
,
8
cglobal
h264_idct_add16_8
,
5
,
5
+
ARCH_X86_64
,
8
%if
ARCH_X86_64
mov
r5
,
r0
%endif
...
...
@@ -790,7 +794,7 @@ cglobal h264_idct_add16_8_sse2, 5, 5 + ARCH_X86_64, 8
; ff_h264_idct_add16intra_sse2(uint8_t *dst, const int *block_offset,
; DCTELEM *block, int stride, const uint8_t nnzc[6*8])
cglobal
h264_idct_add16intra_8
_sse2
,
5
,
7
+
ARCH_X86_64
,
8
cglobal
h264_idct_add16intra_8
,
5
,
7
+
ARCH_X86_64
,
8
%if
ARCH_X86_64
mov
r7
,
r0
%endif
...
...
@@ -841,7 +845,7 @@ cglobal h264_idct_add16intra_8_sse2, 5, 7 + ARCH_X86_64, 8
; ff_h264_idct_add8_sse2(uint8_t **dest, const int *block_offset,
; DCTELEM *block, int stride, const uint8_t nnzc[6*8])
cglobal
h264_idct_add8_8
_sse2
,
5
,
7
+
ARCH_X86_64
,
8
cglobal
h264_idct_add8_8
,
5
,
7
+
ARCH_X86_64
,
8
add
r2
,
512
%if
ARCH_X86_64
mov
r7
,
r0
...
...
@@ -887,29 +891,8 @@ cglobal h264_idct_add8_8_sse2, 5, 7 + ARCH_X86_64, 8
packssdw
%2
,
m5
%endmacro
%macro
STORE_WORDS_MMX
5
movd
t0d
,
%1
psrlq
%1
,
32
movd
t1d
,
%1
mov
[
t2
+
%2
*
32
]
,
t0w
mov
[
t2
+
%4
*
32
]
,
t1w
shr
t0d
,
16
shr
t1d
,
16
mov
[
t2
+
%3
*
32
]
,
t0w
mov
[
t2
+
%5
*
32
]
,
t1w
%endmacro
%macro
DEQUANT_STORE_MMX
1
DEQUANT_MMX
m0
,
m1
,
%1
STORE_WORDS_MMX
m0
,
0
,
1
,
4
,
5
STORE_WORDS_MMX
m1
,
2
,
3
,
6
,
7
DEQUANT_MMX
m2
,
m3
,
%1
STORE_WORDS_MMX
m2
,
8
,
9
,
12
,
13
STORE_WORDS_MMX
m3
,
10
,
11
,
14
,
15
%endmacro
%macro
STORE_WORDS_SSE
9
%macro
STORE_WORDS
5
-
9
%if
cpuflag
(
sse
)
movd
t0d
,
%1
psrldq
%1
,
4
movd
t1d
,
%1
...
...
@@ -929,9 +912,21 @@ cglobal h264_idct_add8_8_sse2, 5, 7 + ARCH_X86_64, 8
shr
t1d
,
16
mov
[
t2
+
%7
*
32
]
,
t0w
mov
[
t2
+
%9
*
32
]
,
t1w
%else
movd
t0d
,
%1
psrlq
%1
,
32
movd
t1d
,
%1
mov
[
t2
+
%2
*
32
]
,
t0w
mov
[
t2
+
%4
*
32
]
,
t1w
shr
t0d
,
16
shr
t1d
,
16
mov
[
t2
+
%3
*
32
]
,
t0w
mov
[
t2
+
%5
*
32
]
,
t1w
%endif
%endmacro
%macro
DEQUANT_STORE_SSE2
1
%macro
DEQUANT_STORE
1
%if
cpuflag
(
sse2
)
movd
xmm4
,
t3d
movq
xmm5
,
[
pw_1
]
pshufd
xmm4
,
xmm4
,
0
...
...
@@ -953,15 +948,24 @@ cglobal h264_idct_add8_8_sse2, 5, 7 + ARCH_X86_64, 8
psrad
xmm3
,
%1
packssdw
xmm0
,
xmm1
packssdw
xmm2
,
xmm3
STORE_WORDS_SSE
xmm0
,
0
,
1
,
4
,
5
,
2
,
3
,
6
,
7
STORE_WORDS_SSE
xmm2
,
8
,
9
,
12
,
13
,
10
,
11
,
14
,
15
STORE_WORDS
xmm0
,
0
,
1
,
4
,
5
,
2
,
3
,
6
,
7
STORE_WORDS
xmm2
,
8
,
9
,
12
,
13
,
10
,
11
,
14
,
15
%else
DEQUANT_MMX
m0
,
m1
,
%1
STORE_WORDS
m0
,
0
,
1
,
4
,
5
STORE_WORDS
m1
,
2
,
3
,
6
,
7
DEQUANT_MMX
m2
,
m3
,
%1
STORE_WORDS
m2
,
8
,
9
,
12
,
13
STORE_WORDS
m3
,
10
,
11
,
14
,
15
%endif
%endmacro
%macro
IDCT_DC_DEQUANT
2
cglobal
h264_luma_dc_dequant_idct
_
%1
,
3
,
4
,
%2
%macro
IDCT_DC_DEQUANT
1
cglobal
h264_luma_dc_dequant_idct
,
3
,
4
,
%1
; manually spill XMM registers for Win64 because
; the code here is initialized with INIT_MMX
WIN64_SPILL_XMM
%
2
WIN64_SPILL_XMM
%
1
movq
m3
,
[
r1
+
24
]
movq
m2
,
[
r1
+
16
]
movq
m1
,
[
r1
+
8
]
...
...
@@ -984,11 +988,7 @@ cglobal h264_luma_dc_dequant_idct_%1, 3,4,%2
cmp
t3d
,
32767
jg
.
big_qmul
add
t3d
,
128
<<
16
%ifidn
%1
,
mmx
DEQUANT_STORE_MMX
8
%else
DEQUANT_STORE_SSE2
8
%endif
DEQUANT_STORE
8
RET
.
big_qmul
:
bsr
t0d
,
t3d
...
...
@@ -999,16 +999,17 @@ cglobal h264_luma_dc_dequant_idct_%1, 3,4,%2
inc
t1d
shr
t3d
,
t0b
sub
t1d
,
t0d
%ifidn
%1
,
mmx
movd
m6
,
t1d
DEQUANT_STORE_MMX
m6
%else
%if
cpuflag
(
sse2
)
movd
xmm6
,
t1d
DEQUANT_STORE_SSE2
xmm6
DEQUANT_STORE
xmm6
%else
movd
m6
,
t1d
DEQUANT_STORE
m6
%endif
RET
%endmacro
INIT_MMX
IDCT_DC_DEQUANT
mmx
,
0
IDCT_DC_DEQUANT
sse2
,
7
INIT_MMX
mmx
IDCT_DC_DEQUANT
0
INIT_MMX
sse2
IDCT_DC_DEQUANT
7
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment