Skip to content
Projects
Groups
Snippets
Help
Loading...
Sign in / Register
Toggle navigation
F
ffmpeg
Project
Project
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Packages
Packages
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
submodule
ffmpeg
Commits
f76423d0
Commit
f76423d0
authored
Oct 06, 2015
by
Ronald S. Bultje
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
vp9: add x86 simd (sse2/ssse3) for iadst4 10bpp functions.
parent
6b579cf5
Hide whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
157 additions
and
82 deletions
+157
-82
vp9dsp_init.h
libavcodec/x86/vp9dsp_init.h
+6
-0
vp9dsp_init_16bpp_template.c
libavcodec/x86/vp9dsp_init_16bpp_template.c
+18
-3
vp9itxfm.asm
libavcodec/x86/vp9itxfm.asm
+0
-58
vp9itxfm_16bpp.asm
libavcodec/x86/vp9itxfm_16bpp.asm
+75
-21
vp9itxfm_template.asm
libavcodec/x86/vp9itxfm_template.asm
+58
-0
No files found.
libavcodec/x86/vp9dsp_init.h
View file @
f76423d0
...
...
@@ -62,6 +62,12 @@ void cat(ff_vp9_##typea##_##typeb##_##size##x##size##_add_, bpp, _##opt)(uint8_t
int16_t *block, \
int eob)
/*
 * Declare the four idct/iadst combinations of one inverse-transform size
 * for a given bit depth and instruction-set suffix. Each line expands
 * decl_itxfm_func() once; the caller supplies the terminating semicolon.
 */
#define decl_itxfm_funcs(size, bpp, opt)              \
    decl_itxfm_func(idct,  idct,  size, bpp, opt);    \
    decl_itxfm_func(iadst, idct,  size, bpp, opt);    \
    decl_itxfm_func(idct,  iadst, size, bpp, opt);    \
    decl_itxfm_func(iadst, iadst, size, bpp, opt)
#define mc_rep_func(avg, sz, hsz, hszb, dir, opt, type, f_sz, bpp) \
static av_always_inline void \
ff_vp9_##avg##_8tap_1d_##dir##_##sz##_##bpp##_##opt(uint8_t *dst, ptrdiff_t dst_stride, \
...
...
libavcodec/x86/vp9dsp_init_16bpp_template.c
View file @
f76423d0
...
...
@@ -126,8 +126,11 @@ decl_ipred_fns(tm, BPC, mmxext, sse2);
decl_itxfm_func
(
iwht
,
iwht
,
4
,
BPC
,
mmxext
);
#if BPC == 10
decl_itxfm_func
(
idct
,
idct
,
4
,
BPC
,
mmxext
);
decl_itxfm_func
(
idct
,
idct
,
4
,
BPC
,
ssse3
);
decl_itxfm_func
(
idct
,
idct
,
4
,
BPC
,
mmxext
);
decl_itxfm_func
(
idct
,
iadst
,
4
,
BPC
,
sse2
);
decl_itxfm_func
(
iadst
,
idct
,
4
,
BPC
,
sse2
);
decl_itxfm_func
(
iadst
,
iadst
,
4
,
BPC
,
sse2
);
decl_itxfm_funcs
(
4
,
BPC
,
ssse3
);
#endif
#endif
/* HAVE_YASM */
...
...
@@ -169,6 +172,11 @@ av_cold void INIT_FUNC(VP9DSPContext *dsp, int bitexact)
init_itx_func(idx, ADST_DCT, typea, typeb, size, bpp, opt); \
init_itx_func(idx, DCT_ADST, typea, typeb, size, bpp, opt); \
init_itx_func(idx, ADST_ADST, typea, typeb, size, bpp, opt)
/*
 * Install all four transform-type entries (DCT_DCT, ADST_DCT, DCT_ADST,
 * ADST_ADST) for transform-size index `idx`, pairing each entry with the
 * matching idct/iadst function name for the given size, bit depth and
 * instruction-set suffix.
 *
 * The last line deliberately carries neither a trailing ';' nor a line
 * continuation: a dangling '\' would splice whatever source line follows
 * the macro into its body. Invoke as a statement: init_itx_funcs(...);
 */
#define init_itx_funcs(idx, size, bpp, opt) \
    init_itx_func(idx, DCT_DCT,   idct,  idct,  size, bpp, opt); \
    init_itx_func(idx, ADST_DCT,  idct,  iadst, size, bpp, opt); \
    init_itx_func(idx, DCT_ADST,  iadst, idct,  size, bpp, opt); \
    init_itx_func(idx, ADST_ADST, iadst, iadst, size, bpp, opt)
if
(
EXTERNAL_MMXEXT
(
cpu_flags
))
{
init_ipred_func
(
tm
,
TM_VP8
,
4
,
BPC
,
mmxext
);
...
...
@@ -185,13 +193,20 @@ av_cold void INIT_FUNC(VP9DSPContext *dsp, int bitexact)
init_subpel3
(
1
,
avg
,
BPC
,
sse2
);
init_lpf_funcs
(
BPC
,
sse2
);
init_8_16_32_ipred_funcs
(
tm
,
TM_VP8
,
BPC
,
sse2
);
#if BPC == 10
if
(
!
bitexact
)
{
init_itx_func
(
TX_4X4
,
ADST_DCT
,
idct
,
iadst
,
4
,
10
,
sse2
);
init_itx_func
(
TX_4X4
,
DCT_ADST
,
iadst
,
idct
,
4
,
10
,
sse2
);
init_itx_func
(
TX_4X4
,
ADST_ADST
,
iadst
,
iadst
,
4
,
10
,
sse2
);
}
#endif
}
if
(
EXTERNAL_SSSE3
(
cpu_flags
))
{
init_lpf_funcs
(
BPC
,
ssse3
);
#if BPC == 10
if
(
!
bitexact
)
{
init_itx_func
(
TX_4X4
,
DCT_DCT
,
idct
,
idct
,
4
,
10
,
ssse3
);
init_itx_func
s
(
TX_4X4
,
4
,
BPC
,
ssse3
);
}
#endif
}
...
...
libavcodec/x86/vp9itxfm.asm
View file @
f76423d0
...
...
@@ -289,64 +289,6 @@ IDCT_4x4_FN ssse3
; void vp9_iadst_iadst_4x4_add_<opt>(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob);
;-------------------------------------------------------------------------------------------
;-----------------------------------------------------------------------
; VP9_IADST4_1D
; One 1-D pass of the 4-point inverse ADST (per the macro name) over the
; four word vectors held in MMX registers m0..m3.
;
; In:    m0..m3 = in0..in3 (4 words each)
;        xmm5   = dword rounding term, added before the >>14; must be
;                 preloaded by the caller
; Out:   m0=out3, m1=out0, m2=out1, m3=out2 before the final SWAP
;        NOTE(review): SWAP 0,1,2,3 presumably renames these so that
;        m0..m3 = out0..out3 - confirm against x86inc's SWAP semantics.
; Clobb: xmm0-xmm4; additionally xmm6/xmm7 when SSSE3 is not available
;-----------------------------------------------------------------------
%macro VP9_IADST4_1D 0
    ; widen the work into XMM registers so pmaddwd yields 4 dword lanes
    movq2dq           xmm0, m0
    movq2dq           xmm1, m1
    movq2dq           xmm2, m2
    movq2dq           xmm3, m3
%if cpuflag(ssse3)
    paddw               m3, m0                  ; m3 = in0 + in3 (in2 subtracted below)
%endif

    punpcklwd         xmm0, xmm1                ; word pairs (in0, in1)
    punpcklwd         xmm2, xmm3                ; word pairs (in2, in3)

    pmaddwd           xmm1, xmm0, [pw_5283_13377]
    pmaddwd           xmm4, xmm0, [pw_9929_13377]
%if notcpuflag(ssse3)
    pmaddwd           xmm6, xmm0, [pw_13377_0]          ; 13377 * in0
%endif
    pmaddwd           xmm0, [pw_15212_m13377]
    pmaddwd           xmm3, xmm2, [pw_15212_9929]
%if notcpuflag(ssse3)
    pmaddwd           xmm7, xmm2, [pw_m13377_13377]     ; 13377 * (in3 - in2)
%endif
    pmaddwd           xmm2, [pw_m5283_m15212]

%if cpuflag(ssse3)
    psubw               m3, m2                  ; m3 = in0 - in2 + in3
%else
    paddd             xmm6, xmm7                ; 13377 * (in0 - in2 + in3)
%endif

    ; combine partial products; xmm5 contributes the rounding term once
    ; to every value that is shifted below
    paddd             xmm0, xmm2
    paddd             xmm3, xmm5
    paddd             xmm2, xmm5
%if notcpuflag(ssse3)
    paddd             xmm6, xmm5
%endif
    paddd             xmm1, xmm3
    paddd             xmm0, xmm3
    paddd             xmm4, xmm2

    psrad             xmm1, 14
    psrad             xmm0, 14
    psrad             xmm4, 14
%if cpuflag(ssse3)
    pmulhrsw            m3, [pw_13377x2]        ; out2
%else
    psrad             xmm6, 14
%endif

    ; narrow back to words (signed saturation) and return to MMX registers
    packssdw          xmm0, xmm0
    packssdw          xmm1, xmm1
    packssdw          xmm4, xmm4
%if notcpuflag(ssse3)
    packssdw          xmm6, xmm6
%endif
    movdq2q             m0, xmm0                ; out3
    movdq2q             m1, xmm1                ; out0
    movdq2q             m2, xmm4                ; out1
%if notcpuflag(ssse3)
    movdq2q             m3, xmm6                ; out2
%endif
    SWAP                 0, 1, 2, 3
%endmacro
%macro
IADST4_FN
5
INIT_MMX
%5
cglobal
vp9_
%1
_
%3
_4x4_add
,
3
,
3
,
0
,
dst
,
stride
,
block
,
eob
...
...
libavcodec/x86/vp9itxfm_16bpp.asm
View file @
f76423d0
...
...
@@ -38,6 +38,15 @@ pw_m15137_6270: times 4 dw -15137, 6270
; Word constants. The "a_b" tables repeat the pair (a, b) four times so a
; single pmaddwd against interleaved inputs (x, y) produces a*x + b*y in
; each dword lane; the "x2" tables hold a doubled coefficient in all
; eight words and are used as pmulhrsw multipliers.
pw_6270_15137:   times 4 dw   6270,  15137
pw_11585x2:      times 8 dw  11585*2

; coefficients consumed by VP9_IADST4_1D
pw_5283_13377:   times 4 dw   5283,  13377
pw_9929_13377:   times 4 dw   9929,  13377
pw_15212_m13377: times 4 dw  15212, -13377
pw_15212_9929:   times 4 dw  15212,   9929
pw_m5283_m15212: times 4 dw  -5283, -15212
pw_13377x2:      times 8 dw  13377*2
pw_m13377_13377: times 4 dw -13377,  13377
pw_13377_0:      times 4 dw  13377,      0
SECTION
.
text
%macro
VP9_STORE_2X
6
-
7
dstq
; reg1, reg2, tmp1, tmp2, min, max, dst
...
...
@@ -129,6 +138,30 @@ IWHT4_FN 10, 1023
INIT_MMX
mmxext
IWHT4_FN
12
,
4095
;-----------------------------------------------------------------------
; VP9_IDCT4_WRITEOUT
; Final rounding and store stage shared by the 4x4 10-bit transforms.
;
; In:    m0..m3 = four rows of transform output (words)
;        m4     = passed to VP9_STORE_2X as its "min" operand; callers
;                 visible here zero it with pxor before invoking
;        dstq, strideq = destination pointer and stride
; Clobb: m5 (rounding constant, then the "max" operand); m6/m7 are handed
;        to VP9_STORE_2X as temporaries; dstq is advanced by 2*strideq
;
; NOTE(review): pw_8, pw_2048 and pw_1023 are defined outside this view;
; presumably words of 8, 2048 and 1023, which would make both branches a
; rounded >>4 followed by a clamp to the 10-bit pixel range - confirm.
;-----------------------------------------------------------------------
%macro VP9_IDCT4_WRITEOUT 0
%if cpuflag(ssse3)
    ; single-multiply rounding via pmulhrsw
    mova                m5, [pw_2048]
    pmulhrsw            m0, m5
    pmulhrsw            m1, m5
    pmulhrsw            m2, m5
    pmulhrsw            m3, m5
%else
    ; add the bias, then arithmetic shift right by 4
    mova                m5, [pw_8]
    paddw               m0, m5
    paddw               m1, m5
    paddw               m2, m5
    paddw               m3, m5
    psraw               m0, 4
    psraw               m1, 4
    psraw               m2, 4
    psraw               m3, 4
%endif

    mova                m5, [pw_1023]           ; "max" operand for the stores
    VP9_STORE_2X         0, 1, 6, 7, 4, 5       ; rows 0 and 1
    lea               dstq, [dstq+2*strideq]
    VP9_STORE_2X         2, 3, 6, 7, 4, 5       ; rows 2 and 3
%endmacro
; 4x4 coefficients are 5+depth+sign bits, so for 10bpp, everything still fits
; in 15+1 words without additional effort, since the coefficients are 15bpp.
...
...
@@ -186,27 +219,7 @@ cglobal vp9_idct_idct_4x4_add_10, 4, 4, 8, dst, stride, block, eob
pxor
m4
,
m4
ZERO_BLOCK
blockq
,
16
,
4
,
m4
%if
cpuflag
(
ssse3
)
mova
m5
,
[
pw_2048
]
pmulhrsw
m0
,
m5
pmulhrsw
m1
,
m5
pmulhrsw
m2
,
m5
pmulhrsw
m3
,
m5
%else
mova
m5
,
[
pw_8
]
paddw
m0
,
m5
paddw
m1
,
m5
paddw
m2
,
m5
paddw
m3
,
m5
psraw
m0
,
4
psraw
m1
,
4
psraw
m2
,
4
psraw
m3
,
4
%endif
mova
m5
,
[
pw_1023
]
VP9_STORE_2X
0
,
1
,
6
,
7
,
4
,
5
lea
dstq
,
[
dstq
+
2
*
strideq
]
VP9_STORE_2X
2
,
3
,
6
,
7
,
4
,
5
VP9_IDCT4_WRITEOUT
RET
%endmacro
...
...
@@ -214,3 +227,44 @@ INIT_MMX mmxext
IDCT4_10_FN
INIT_MMX
ssse3
IDCT4_10_FN
;-----------------------------------------------------------------------
; IADST4_FN name1, MACRO1, name2, MACRO2
; Emits vp9_<name1>_<name2>_4x4_add_10(dst, stride, block, eob).
;   %1 / %3 = type names spliced into the function symbol
;   %2 / %4 = stems of the 1-D macros run as first and second pass
;             (VP9_<stem>_1D)
; Expected to be expanded under INIT_MMX, as the instantiations in this
; file do: m0-m7 are 8-byte MMX registers, and the IADST pass addresses
; xmm registers explicitly.
;-----------------------------------------------------------------------
%macro IADST4_FN 4
cglobal vp9_%1_%3_4x4_add_10, 3, 3, 0, dst, stride, block, eob
%if WIN64 && notcpuflag(ssse3)
    ; the non-SSSE3 IADST pass uses xmm6/xmm7
    WIN64_SPILL_XMM 8
%endif
    movdqa            xmm5, [pd_8192]           ; rounding term used by VP9_IADST4_1D

    ; each 16-byte row of dword coefficients is narrowed to 4 words:
    ; load the low 8 bytes, then packssdw pulls in the high 8 bytes
    mova                m0, [blockq+0*16+0]
    mova                m1, [blockq+1*16+0]
    packssdw            m0, [blockq+0*16+8]
    packssdw            m1, [blockq+1*16+8]
    mova                m2, [blockq+2*16+0]
    mova                m3, [blockq+3*16+0]
    packssdw            m2, [blockq+2*16+8]
    packssdw            m3, [blockq+3*16+8]

%if cpuflag(ssse3)
    mova                m6, [pw_11585x2]
%endif
%ifnidn %1%3, iadstiadst
    ; at least one pass is not IADST: copy the rounding term into m7
    movdq2q             m7, xmm5
%endif

    VP9_%2_1D                                   ; first pass
    TRANSPOSE4x4W        0, 1, 2, 3, 4
    VP9_%4_1D                                   ; second pass

    pxor                m4, m4                  ; zero: clears the block, then serves as "min"
    ZERO_BLOCK      blockq, 16, 4, m4
    VP9_IDCT4_WRITEOUT
    RET
%endmacro
; Instantiate the three 4x4 10-bit combinations that involve an IADST
; pass, once per instruction set.
INIT_MMX sse2
IADST4_FN idct,  IDCT4,  iadst, IADST4
IADST4_FN iadst, IADST4, idct,  IDCT4
IADST4_FN iadst, IADST4, iadst, IADST4

INIT_MMX ssse3
IADST4_FN idct,  IDCT4,  iadst, IADST4
IADST4_FN iadst, IADST4, idct,  IDCT4
IADST4_FN iadst, IADST4, iadst, IADST4
libavcodec/x86/vp9itxfm_template.asm
View file @
f76423d0
...
...
@@ -82,3 +82,61 @@
VP9_UNPACK_MULSUB_2W_4X
1
,
3
,
15137
,
6270
,
m7
,
4
,
5
; m1=t2, m3=t3
VP9_IDCT4_1D_FINALIZE
%endmacro
;-----------------------------------------------------------------------
; VP9_IADST4_1D
; One 1-D pass of the 4-point inverse ADST (per the macro name) over the
; four word vectors held in MMX registers m0..m3.
;
; In:    m0..m3 = in0..in3 (4 words each)
;        xmm5   = dword rounding term, added before the >>14; must be
;                 preloaded by the caller
; Out:   m0=out3, m1=out0, m2=out1, m3=out2 before the final SWAP
;        NOTE(review): SWAP 0,1,2,3 presumably renames these so that
;        m0..m3 = out0..out3 - confirm against x86inc's SWAP semantics.
; Clobb: xmm0-xmm4; additionally xmm6/xmm7 when SSSE3 is not available
;-----------------------------------------------------------------------
%macro VP9_IADST4_1D 0
    ; widen the work into XMM registers so pmaddwd yields 4 dword lanes
    movq2dq           xmm0, m0
    movq2dq           xmm1, m1
    movq2dq           xmm2, m2
    movq2dq           xmm3, m3
%if cpuflag(ssse3)
    paddw               m3, m0                  ; m3 = in0 + in3 (in2 subtracted below)
%endif

    punpcklwd         xmm0, xmm1                ; word pairs (in0, in1)
    punpcklwd         xmm2, xmm3                ; word pairs (in2, in3)

    pmaddwd           xmm1, xmm0, [pw_5283_13377]
    pmaddwd           xmm4, xmm0, [pw_9929_13377]
%if notcpuflag(ssse3)
    pmaddwd           xmm6, xmm0, [pw_13377_0]          ; 13377 * in0
%endif
    pmaddwd           xmm0, [pw_15212_m13377]
    pmaddwd           xmm3, xmm2, [pw_15212_9929]
%if notcpuflag(ssse3)
    pmaddwd           xmm7, xmm2, [pw_m13377_13377]     ; 13377 * (in3 - in2)
%endif
    pmaddwd           xmm2, [pw_m5283_m15212]

%if cpuflag(ssse3)
    psubw               m3, m2                  ; m3 = in0 - in2 + in3
%else
    paddd             xmm6, xmm7                ; 13377 * (in0 - in2 + in3)
%endif

    ; combine partial products; xmm5 contributes the rounding term once
    ; to every value that is shifted below
    paddd             xmm0, xmm2
    paddd             xmm3, xmm5
    paddd             xmm2, xmm5
%if notcpuflag(ssse3)
    paddd             xmm6, xmm5
%endif
    paddd             xmm1, xmm3
    paddd             xmm0, xmm3
    paddd             xmm4, xmm2

    psrad             xmm1, 14
    psrad             xmm0, 14
    psrad             xmm4, 14
%if cpuflag(ssse3)
    pmulhrsw            m3, [pw_13377x2]        ; out2
%else
    psrad             xmm6, 14
%endif

    ; narrow back to words (signed saturation) and return to MMX registers
    packssdw          xmm0, xmm0
    packssdw          xmm1, xmm1
    packssdw          xmm4, xmm4
%if notcpuflag(ssse3)
    packssdw          xmm6, xmm6
%endif
    movdq2q             m0, xmm0                ; out3
    movdq2q             m1, xmm1                ; out0
    movdq2q             m2, xmm4                ; out1
%if notcpuflag(ssse3)
    movdq2q             m3, xmm6                ; out2
%endif
    SWAP                 0, 1, 2, 3
%endmacro
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment