Skip to content
Projects
Groups
Snippets
Help
Loading...
Sign in / Register
Toggle navigation
F
ffmpeg
Project
Project
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Packages
Packages
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
submodule
ffmpeg
Commits
decd5193
Commit
decd5193
authored
Mar 10, 2015
by
Christophe Gisquet
Committed by
Michael Niedermayer
Mar 14, 2015
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
x86: xvid_idct: merged idct_put SSE2 versions
Signed-off-by:
Michael Niedermayer
<
michaelni@gmx.at
>
parent
8200575d
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
140 additions
and
70 deletions
+140
-70
xvididct.asm
libavcodec/x86/xvididct.asm
+138
-64
xvididct_init.c
libavcodec/x86/xvididct_init.c
+2
-6
No files found.
libavcodec/x86/xvididct.asm
View file @
decd5193
...
@@ -292,13 +292,13 @@ SECTION .text
...
@@ -292,13 +292,13 @@ SECTION .text
%define
TAN3
xmm13
%define
TAN3
xmm13
%define
TAN1
xmm14
%define
TAN1
xmm14
%else
%else
%define
ROW0
[
r0
+
0
*
16
]
%define
ROW0
[
BLOCK
+
0
*
16
]
%define
REG0
xmm4
%define
REG0
xmm4
%define
ROW2
[
r0
+
2
*
16
]
%define
ROW2
[
BLOCK
+
2
*
16
]
%define
REG2
xmm4
%define
REG2
xmm4
%define
ROW4
[
r0
+
4
*
16
]
%define
ROW4
[
BLOCK
+
4
*
16
]
%define
REG4
xmm6
%define
REG4
xmm6
%define
ROW6
[
r0
+
6
*
16
]
%define
ROW6
[
BLOCK
+
6
*
16
]
%define
REG6
xmm6
%define
REG6
xmm6
%define
XMMS
xmm2
%define
XMMS
xmm2
%define
SREG2
xmm7
%define
SREG2
xmm7
...
@@ -369,8 +369,71 @@ SECTION .text
...
@@ -369,8 +369,71 @@ SECTION .text
movdqa
TAN1
,
[tan1]
movdqa
TAN1
,
[tan1]
%endmacro
%endmacro
%macro
FIRST_HALF
2
; %1=dct %2=type (0=normal, 1=put, 2=add)
psraw
xmm5
,
6
psraw
REG0
,
6
psraw
TAN3
,
6
psraw
xmm3
,
6
; dct coeffs must still be written for AC prediction
%if
%2
==
0
movdqa
[
%1
+
1
*
16
]
,
TAN3
movdqa
[
%1
+
2
*
16
]
,
xmm3
movdqa
[
%1
+
5
*
16
]
,
REG0
movdqa
[
%1
+
6
*
16
]
,
xmm5
%else
; Must now load args as gprs are no longer used for masks
; DEST is set to where address of dest was loaded
%
if
ARCH_X86_32
%
xdefine
DEST
r2q
; BLOCK is r0, stride r1
movifnidn
DEST
,
destm
movifnidn
strideq
,
stridem
%
else
%
xdefine
DEST
r0q
%
endif
lea
r3q
,
[
3
*
strideq
]
%
if
%2
==
1
packuswb
TAN3
,
xmm3
packuswb
xmm5
,
REG0
movq
[
DEST
+
strideq
]
,
TAN3
movhps
[
DEST
+
2
*
strideq
]
,
TAN3
; REG0 and TAN3 are now available (and likely used in second half)
%
else
%
warning
Unimplemented
%
endif
%endif
%endmacro
%macro
SECOND_HALF
6
; %1=dct %2=type (0=normal, 1=put, 2=add) %3-%6: xmm registers
psraw
%3
,
6
psraw
%4
,
6
psraw
%5
,
6
psraw
%6
,
6
; dct coeffs must still be written for AC prediction
%if
%2
==
0
movdqa
[
%1
+
0
*
16
]
,
%3
movdqa
[
%1
+
3
*
16
]
,
%5
movdqa
[
%1
+
4
*
16
]
,
%6
movdqa
[
%1
+
7
*
16
]
,
%4
%elif
%2
==
1
packuswb
%3
,
%5
packuswb
%6
,
%4
; address of dest may have been loaded
movq
[DEST],
%3
movhps
[
DEST
+
r3q
]
,
%3
lea
DEST
,
[
DEST
+
4
*
strideq
]
movq
[DEST],
%6
movhps
[
DEST
+
r3q
]
,
%6
; and now write remainder of first half
movq
[
DEST
+
2
*
strideq
]
,
xmm5
movhps
[
DEST
+
strideq
]
,
xmm5
%elif
%2
==
2
%warning
Unimplemented
%endif
%endmacro
; IDCT pass on columns.
; IDCT pass on columns.
%macro
iLLM_PASS
1
;dct
%macro
iLLM_PASS
2
; %1=dct %2=type (0=normal, 1=put, 2=add)
movdqa
xmm1
,
TAN3
movdqa
xmm1
,
TAN3
movdqa
xmm3
,
TAN1
movdqa
xmm3
,
TAN1
pmulhw
TAN3
,
xmm4
pmulhw
TAN3
,
xmm4
...
@@ -407,7 +470,7 @@ SECTION .text
...
@@ -407,7 +470,7 @@ SECTION .text
psubsw
xmm5
,
REG6
psubsw
xmm5
,
REG6
MOV32
ROW0
,
REG0
MOV32
ROW0
,
REG0
MOV32
ROW4
,
REG4
MOV32
ROW4
,
REG4
MOV32
TAN1
,
[
r0
]
MOV32
TAN1
,
[
BLOCK
]
movdqa
XMMS
,
REG0
movdqa
XMMS
,
REG0
psubsw
REG0
,
REG4
psubsw
REG0
,
REG4
paddsw
REG4
,
XMMS
paddsw
REG4
,
XMMS
...
@@ -423,33 +486,22 @@ SECTION .text
...
@@ -423,33 +486,22 @@ SECTION .text
movdqa
XMMS
,
REG0
movdqa
XMMS
,
REG0
psubsw
REG0
,
xmm3
psubsw
REG0
,
xmm3
paddsw
xmm3
,
XMMS
paddsw
xmm3
,
XMMS
MOV32
[r0],
TAN1
MOV32
[BLOCK],
TAN1
psraw
xmm5
,
6
psraw
REG0
,
6
FIRST_HALF
%1
,
%2
psraw
TAN3
,
6
psraw
xmm3
,
6
movdqa
[
%1
+
1
*
16
]
,
TAN3
movdqa
[
%1
+
2
*
16
]
,
xmm3
movdqa
[
%1
+
5
*
16
]
,
REG0
movdqa
[
%1
+
6
*
16
]
,
xmm5
movdqa
xmm0
,
xmm7
movdqa
xmm0
,
xmm7
movdqa
xmm4
,
REG4
movdqa
xmm4
,
REG4
psubsw
xmm7
,
xmm1
psubsw
xmm7
,
xmm1
psubsw
REG4
,
TAN1
psubsw
REG4
,
TAN1
paddsw
xmm1
,
xmm0
paddsw
xmm1
,
xmm0
paddsw
TAN1
,
xmm4
paddsw
TAN1
,
xmm4
psraw
xmm1
,
6
psraw
xmm7
,
6
SECOND_HALF
%1
,
%2
,
xmm1
,
xmm7
,
TAN1
,
REG4
psraw
TAN1
,
6
psraw
REG4
,
6
movdqa
[
%1
+
0
*
16
]
,
xmm1
movdqa
[
%1
+
3
*
16
]
,
TAN1
movdqa
[
%1
+
4
*
16
]
,
REG4
movdqa
[
%1
+
7
*
16
]
,
xmm7
%endmacro
%endmacro
; IDCT pass on columns, assuming rows 4-7 are zero
; IDCT pass on columns, assuming rows 4-7 are zero
%macro
iLLM_PASS_SPARSE
1
;dct
%macro
iLLM_PASS_SPARSE
2
; %1=dct %2=type(normal,put,add)
pmulhw
TAN3
,
xmm4
pmulhw
TAN3
,
xmm4
paddsw
TAN3
,
xmm4
paddsw
TAN3
,
xmm4
movdqa
xmm3
,
xmm6
movdqa
xmm3
,
xmm6
...
@@ -475,7 +527,7 @@ SECTION .text
...
@@ -475,7 +527,7 @@ SECTION .text
movdqa
xmm6
,
REG0
movdqa
xmm6
,
REG0
psubsw
xmm6
,
SREG2
psubsw
xmm6
,
SREG2
paddsw
SREG2
,
REG0
paddsw
SREG2
,
REG0
MOV32
TAN1
,
[
r0
]
MOV32
TAN1
,
[
BLOCK
]
movdqa
XMMS
,
REG0
movdqa
XMMS
,
REG0
psubsw
REG0
,
xmm5
psubsw
REG0
,
xmm5
paddsw
xmm5
,
XMMS
paddsw
xmm5
,
XMMS
...
@@ -485,70 +537,92 @@ SECTION .text
...
@@ -485,70 +537,92 @@ SECTION .text
movdqa
XMMS
,
REG0
movdqa
XMMS
,
REG0
psubsw
REG0
,
xmm3
psubsw
REG0
,
xmm3
paddsw
xmm3
,
XMMS
paddsw
xmm3
,
XMMS
MOV32
[r0],
TAN1
MOV32
[BLOCK],
TAN1
psraw
xmm5
,
6
psraw
REG0
,
6
FIRST_HALF
%1
,
%2
psraw
TAN3
,
6
psraw
xmm3
,
6
movdqa
[
%1
+
1
*
16
]
,
TAN3
movdqa
[
%1
+
2
*
16
]
,
xmm3
movdqa
[
%1
+
5
*
16
]
,
REG0
movdqa
[
%1
+
6
*
16
]
,
xmm5
movdqa
xmm0
,
SREG2
movdqa
xmm0
,
SREG2
movdqa
xmm4
,
xmm6
movdqa
xmm4
,
xmm6
psubsw
SREG2
,
xmm1
psubsw
SREG2
,
xmm1
psubsw
xmm6
,
TAN1
psubsw
xmm6
,
TAN1
paddsw
xmm1
,
xmm0
paddsw
xmm1
,
xmm0
paddsw
TAN1
,
xmm4
paddsw
TAN1
,
xmm4
psraw
xmm1
,
6
psraw
SREG2
,
6
SECOND_HALF
%1
,
%2
,
xmm1
,
SREG2
,
TAN1
,
xmm6
psraw
TAN1
,
6
psraw
xmm6
,
6
movdqa
[
%1
+
0
*
16
]
,
xmm1
movdqa
[
%1
+
3
*
16
]
,
TAN1
movdqa
[
%1
+
4
*
16
]
,
xmm6
movdqa
[
%1
+
7
*
16
]
,
SREG2
%endmacro
%endmacro
INIT_XMM
sse2
%macro
IDCT_SSE2
1
; 0=normal 1=put 2=add
cglobal
xvid_idct
,
1
,
5
,
8
+
7
*
ARCH_X86_64
,
block
%if
%1
==
0
||
ARCH_X86_32
%
define
GPR0
r1d
%
define
GPR1
r2d
%
define
GPR2
r3d
%
define
GPR3
r4d
%
define
NUM_GPRS
5
%else
%
define
GPR0
r3d
%
define
GPR1
r4d
%
define
GPR2
r5d
%
define
GPR3
r6d
%
define
NUM_GPRS
7
%endif
%if
%1
==
0
cglobal
xvid_idct
,
1
,
NUM_GPRS
,
8
+
7
*
ARCH_X86_64
,
block
%xdefine
BLOCK
blockq
%else
%
if
%1
==
1
cglobal
xvid_idct_put
,
0
,
NUM_GPRS
,
8
+
7
*
ARCH_X86_64
,
dest
,
stride
,
block
%
else
cglobal
xvid_idct_add
,
0
,
NUM_GPRS
,
8
+
7
*
ARCH_X86_64
,
dest
,
stride
,
block
%
endif
%
if
ARCH_X86_64
%
xdefine
BLOCK
blockq
%
else
mov
r0q
,
blockm
%
xdefine
BLOCK
r0q
%
endif
%endif
movq
mm0
,
[
pb_127
]
movq
mm0
,
[
pb_127
]
iMTX_MULT
r0
+
0
*
16
,
iTab1
,
PUT_EVEN
,
ROW0
,
0
*
16
iMTX_MULT
BLOCK
+
0
*
16
,
iTab1
,
PUT_EVEN
,
ROW0
,
0
*
16
iMTX_MULT
r0
+
1
*
16
,
iTab2
,
PUT_ODD
,
ROW1
,
1
*
16
iMTX_MULT
BLOCK
+
1
*
16
,
iTab2
,
PUT_ODD
,
ROW1
,
1
*
16
iMTX_MULT
r0
+
2
*
16
,
iTab3
,
PUT_EVEN
,
ROW2
,
2
*
16
iMTX_MULT
BLOCK
+
2
*
16
,
iTab3
,
PUT_EVEN
,
ROW2
,
2
*
16
TEST_TWO_ROWS
r0
+
3
*
16
,
r0
+
4
*
16
,
r1d
,
r2d
,
CLEAR_ODD
,
ROW3
,
CLEAR_EVEN
,
ROW4
; a, c
TEST_TWO_ROWS
BLOCK
+
3
*
16
,
BLOCK
+
4
*
16
,
GPR0
,
GPR1
,
CLEAR_ODD
,
ROW3
,
CLEAR_EVEN
,
ROW4
; a, c
JZ
r1d
,
col1
JZ
GPR0
,
col1
iMTX_MULT
r0
+
3
*
16
,
iTab4
,
PUT_ODD
,
ROW3
,
3
*
16
iMTX_MULT
BLOCK
+
3
*
16
,
iTab4
,
PUT_ODD
,
ROW3
,
3
*
16
.
col1
:
.
col1
:
TEST_TWO_ROWS
r0
+
5
*
16
,
r0
+
6
*
16
,
r1d
,
r3d
,
CLEAR_ODD
,
ROW5
,
CLEAR_EVEN
,
ROW6
; a, d
TEST_TWO_ROWS
BLOCK
+
5
*
16
,
BLOCK
+
6
*
16
,
GPR0
,
GPR2
,
CLEAR_ODD
,
ROW5
,
CLEAR_EVEN
,
ROW6
; a, d
TEST_ONE_ROW
r0
+
7
*
16
,
r4d
,
CLEAR_ODD
,
ROW7
; esi
TEST_ONE_ROW
BLOCK
+
7
*
16
,
GPR3
,
CLEAR_ODD
,
ROW7
; esi
iLLM_HEAD
iLLM_HEAD
JNZ
r2d
,
2
JNZ
GPR1
,
2
JNZ
r1d
,
3
JNZ
GPR0
,
3
JNZ
r3d
,
4
JNZ
GPR2
,
4
JNZ
r4d
,
5
JNZ
GPR3
,
5
iLLM_PASS_SPARSE
r0
iLLM_PASS_SPARSE
BLOCK
,
%1
jmp
.
6
jmp
.
6
.
2
:
.
2
:
iMTX_MULT
r0
+
4
*
16
,
iTab1
,
PUT_EVEN
,
ROW4
iMTX_MULT
BLOCK
+
4
*
16
,
iTab1
,
PUT_EVEN
,
ROW4
.
3
:
.
3
:
iMTX_MULT
r0
+
5
*
16
,
iTab4
,
PUT_ODD
,
ROW5
,
4
*
16
iMTX_MULT
BLOCK
+
5
*
16
,
iTab4
,
PUT_ODD
,
ROW5
,
4
*
16
JZ
r3d
,
col2
JZ
GPR2
,
col2
.
4
:
.
4
:
iMTX_MULT
r0
+
6
*
16
,
iTab3
,
PUT_EVEN
,
ROW6
,
5
*
16
iMTX_MULT
BLOCK
+
6
*
16
,
iTab3
,
PUT_EVEN
,
ROW6
,
5
*
16
.
col2
:
.
col2
:
JZ
r4d
,
col3
JZ
GPR3
,
col3
.
5
:
.
5
:
iMTX_MULT
r0
+
7
*
16
,
iTab2
,
PUT_ODD
,
ROW7
,
5
*
16
iMTX_MULT
BLOCK
+
7
*
16
,
iTab2
,
PUT_ODD
,
ROW7
,
5
*
16
.
col3
:
.
col3
:
%if
ARCH_X86_32
%if
ARCH_X86_32
iLLM_HEAD
iLLM_HEAD
%endif
%endif
iLLM_PASS
r0
iLLM_PASS
BLOCK
,
%1
.
6
:
.
6
:
RET
RET
%endmacro
INIT_XMM
sse2
IDCT_SSE2
0
IDCT_SSE2
1
%if
ARCH_X86_32
%if
ARCH_X86_32
...
...
libavcodec/x86/xvididct_init.c
View file @
decd5193
...
@@ -26,11 +26,7 @@
...
@@ -26,11 +26,7 @@
#include "idctdsp.h"
#include "idctdsp.h"
#include "xvididct.h"
#include "xvididct.h"
static
void
xvid_idct_sse2_put
(
uint8_t
*
dest
,
int
line_size
,
short
*
block
)
void
ff_xvid_idct_put_sse2
(
uint8_t
*
dest
,
int
line_size
,
short
*
block
);
{
ff_xvid_idct_sse2
(
block
);
ff_put_pixels_clamped
(
block
,
dest
,
line_size
);
}
static
void
xvid_idct_sse2_add
(
uint8_t
*
dest
,
int
line_size
,
short
*
block
)
static
void
xvid_idct_sse2_add
(
uint8_t
*
dest
,
int
line_size
,
short
*
block
)
{
{
...
@@ -91,7 +87,7 @@ av_cold void ff_xvid_idct_init_x86(IDCTDSPContext *c, AVCodecContext *avctx,
...
@@ -91,7 +87,7 @@ av_cold void ff_xvid_idct_init_x86(IDCTDSPContext *c, AVCodecContext *avctx,
#endif
#endif
if
(
EXTERNAL_SSE2
(
cpu_flags
))
{
if
(
EXTERNAL_SSE2
(
cpu_flags
))
{
c
->
idct_put
=
xvid_idct_sse2_put
;
c
->
idct_put
=
ff_xvid_idct_put_sse2
;
c
->
idct_add
=
xvid_idct_sse2_add
;
c
->
idct_add
=
xvid_idct_sse2_add
;
c
->
idct
=
ff_xvid_idct_sse2
;
c
->
idct
=
ff_xvid_idct_sse2
;
c
->
perm_type
=
FF_IDCT_PERM_SSE2
;
c
->
perm_type
=
FF_IDCT_PERM_SSE2
;
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment