Skip to content
Projects
Groups
Snippets
Help
Loading...
Sign in / Register
Toggle navigation
F
ffmpeg
Project
Project
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Packages
Packages
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
submodule
ffmpeg
Commits
6b579cf5
Commit
6b579cf5
authored
Oct 06, 2015
by
Ronald S. Bultje
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
vp9: add 10bpp simd (mmxext/ssse3) for idct_idct_4x4.
parent
1c3be325
Hide whitespace changes
Inline
Side-by-side
Showing
6 changed files
with
159 additions
and
49 deletions
+159
-49
constants.c
libavcodec/x86/constants.c
+2
-0
constants.h
libavcodec/x86/constants.h
+1
-0
vp9dsp_init_16bpp_template.c
libavcodec/x86/vp9dsp_init_16bpp_template.c
+12
-0
vp9itxfm.asm
libavcodec/x86/vp9itxfm.asm
+1
-49
vp9itxfm_16bpp.asm
libavcodec/x86/vp9itxfm_16bpp.asm
+96
-0
vp9itxfm_template.asm
libavcodec/x86/vp9itxfm_template.asm
+47
-0
No files found.
libavcodec/x86/constants.c
View file @
6b579cf5
...
...
@@ -85,6 +85,8 @@ DECLARE_ALIGNED(32, const ymm_reg, ff_pd_16) = { 0x0000001000000010ULL, 0x000
0x0000001000000010ULL
,
0x0000001000000010ULL
};
DECLARE_ALIGNED
(
32
,
const
ymm_reg
,
ff_pd_32
)
=
{
0x0000002000000020ULL
,
0x0000002000000020ULL
,
0x0000002000000020ULL
,
0x0000002000000020ULL
};
DECLARE_ALIGNED
(
32
,
const
ymm_reg
,
ff_pd_8192
)
=
{
0x0000200000002000ULL
,
0x0000200000002000ULL
,
0x0000200000002000ULL
,
0x0000200000002000ULL
};
DECLARE_ALIGNED
(
32
,
const
ymm_reg
,
ff_pd_65535
)
=
{
0x0000ffff0000ffffULL
,
0x0000ffff0000ffffULL
,
0x0000ffff0000ffffULL
,
0x0000ffff0000ffffULL
};
...
...
libavcodec/x86/constants.h
View file @
6b579cf5
...
...
@@ -65,6 +65,7 @@ extern const xmm_reg ff_ps_neg;
extern
const
ymm_reg
ff_pd_1
;
extern
const
ymm_reg
ff_pd_16
;
extern
const
ymm_reg
ff_pd_32
;
extern
const
ymm_reg
ff_pd_8192
;
extern
const
ymm_reg
ff_pd_65535
;
# if ARCH_X86_64
...
...
libavcodec/x86/vp9dsp_init_16bpp_template.c
View file @
6b579cf5
...
...
@@ -125,6 +125,10 @@ lpf_mix2_wrappers_set(BPC, avx);
decl_ipred_fns
(
tm
,
BPC
,
mmxext
,
sse2
);
decl_itxfm_func
(
iwht
,
iwht
,
4
,
BPC
,
mmxext
);
#if BPC == 10
decl_itxfm_func
(
idct
,
idct
,
4
,
BPC
,
mmxext
);
decl_itxfm_func
(
idct
,
idct
,
4
,
BPC
,
ssse3
);
#endif
#endif
/* HAVE_YASM */
av_cold
void
INIT_FUNC
(
VP9DSPContext
*
dsp
,
int
bitexact
)
...
...
@@ -170,6 +174,9 @@ av_cold void INIT_FUNC(VP9DSPContext *dsp, int bitexact)
init_ipred_func
(
tm
,
TM_VP8
,
4
,
BPC
,
mmxext
);
if
(
!
bitexact
)
{
init_itx_func_one
(
4
/* lossless */
,
iwht
,
iwht
,
4
,
BPC
,
mmxext
);
#if BPC == 10
init_itx_func
(
TX_4X4
,
DCT_DCT
,
idct
,
idct
,
4
,
10
,
mmxext
);
#endif
}
}
...
...
@@ -182,6 +189,11 @@ av_cold void INIT_FUNC(VP9DSPContext *dsp, int bitexact)
if
(
EXTERNAL_SSSE3
(
cpu_flags
))
{
init_lpf_funcs
(
BPC
,
ssse3
);
#if BPC == 10
if
(
!
bitexact
)
{
init_itx_func
(
TX_4X4
,
DCT_DCT
,
idct
,
idct
,
4
,
10
,
ssse3
);
}
#endif
}
if
(
EXTERNAL_AVX
(
cpu_flags
))
{
...
...
libavcodec/x86/vp9itxfm.asm
View file @
6b579cf5
...
...
@@ -71,8 +71,6 @@ pw_13377x2: times 8 dw 13377*2
pw_m13377_13377
:
times
4
dw
-
13377
,
13377
pw_13377_0
:
times
4
dw
13377
,
0
pd_8192
:
times
4
dd
8192
cextern
pw_8
cextern
pw_16
cextern
pw_32
...
...
@@ -80,38 +78,10 @@ cextern pw_512
cextern
pw_1024
cextern
pw_2048
cextern
pw_m1
cextern
pd_8192
SECTION
.
text
; (a*x + b*y + round) >> shift
%macro
VP9_MULSUB_2W_2X
5
; dst1, dst2/src, round, coefs1, coefs2
pmaddwd
m%1
,
m%2
,
%4
pmaddwd
m%2
,
%5
paddd
m%1
,
%3
paddd
m%2
,
%3
psrad
m%1
,
14
psrad
m%2
,
14
%endmacro
%macro
VP9_MULSUB_2W_4X
7
; dst1, dst2, coef1, coef2, rnd, tmp1/src, tmp2
VP9_MULSUB_2W_2X
%7
,
%6
,
%5
,
[
pw_m
%3
_
%4
]
,
[
pw_
%4
_
%3
]
VP9_MULSUB_2W_2X
%1
,
%2
,
%5
,
[
pw_m
%3
_
%4
]
,
[
pw_
%4
_
%3
]
packssdw
m%1
,
m%7
packssdw
m%2
,
m%6
%endmacro
%macro
VP9_UNPACK_MULSUB_2W_4X
7
-
9
; dst1, dst2, (src1, src2,) coef1, coef2, rnd, tmp1, tmp2
%if
%0
==
7
punpckhwd
m%6
,
m%2
,
m%1
punpcklwd
m%2
,
m%1
VP9_MULSUB_2W_4X
%1
,
%2
,
%3
,
%4
,
%5
,
%6
,
%7
%else
punpckhwd
m%8
,
m%4
,
m%3
punpcklwd
m%2
,
m%4
,
m%3
VP9_MULSUB_2W_4X
%1
,
%2
,
%5
,
%6
,
%7
,
%8
,
%9
%endif
%endmacro
%macro
VP9_UNPACK_MULSUB_2D_4X
6
; dst1 [src1], dst2 [src2], dst3, dst4, mul1, mul2
punpckhwd
m%4
,
m%2
,
m%1
punpcklwd
m%2
,
m%1
...
...
@@ -191,24 +161,6 @@ cglobal vp9_iwht_iwht_4x4_add, 3, 3, 0, dst, stride, block, eob
; void vp9_idct_idct_4x4_add_<opt>(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob);
;-------------------------------------------------------------------------------------------
%macro
VP9_IDCT4_1D_FINALIZE
0
SUMSUB_BA
w
,
3
,
2
,
4
; m3=t3+t0, m2=-t3+t0
SUMSUB_BA
w
,
1
,
0
,
4
; m1=t2+t1, m0=-t2+t1
SWAP
0
,
3
,
2
; 3102 -> 0123
%endmacro
%macro
VP9_IDCT4_1D
0
%if
cpuflag
(
ssse3
)
SUMSUB_BA
w
,
2
,
0
,
4
; m2=IN(0)+IN(2) m0=IN(0)-IN(2)
pmulhrsw
m2
,
m6
; m2=t0
pmulhrsw
m0
,
m6
; m0=t1
%else
; <= sse2
VP9_UNPACK_MULSUB_2W_4X
0
,
2
,
11585
,
11585
,
m7
,
4
,
5
; m0=t1, m1=t0
%endif
VP9_UNPACK_MULSUB_2W_4X
1
,
3
,
15137
,
6270
,
m7
,
4
,
5
; m1=t2, m3=t3
VP9_IDCT4_1D_FINALIZE
%endmacro
; 2x2 top left corner
%macro
VP9_IDCT4_2x2_1D
0
pmulhrsw
m0
,
m5
; m0=t1
...
...
libavcodec/x86/vp9itxfm_16bpp.asm
View file @
6b579cf5
...
...
@@ -25,8 +25,18 @@
SECTION_RODATA
cextern
pw_8
cextern
pw_1023
cextern
pw_2048
cextern
pw_4095
cextern
pd_8192
; FIXME these should probably be shared between 8bpp and 10/12bpp
pw_m11585_11585
:
times
4
dw
-
11585
,
11585
pw_11585_11585
:
times
8
dw
11585
pw_m15137_6270
:
times
4
dw
-
15137
,
6270
pw_6270_15137
:
times
4
dw
6270
,
15137
pw_11585x2
:
times
8
dw
11585
*
2
SECTION
.
text
...
...
@@ -118,3 +128,89 @@ INIT_MMX mmxext
IWHT4_FN
10
,
1023
INIT_MMX
mmxext
IWHT4_FN
12
,
4095
; 4x4 coefficients are 5+depth+sign bits, so for 10bpp, everything still fits
; in 15+1 words without additional effort, since the coefficients are 15bpp.
%macro
IDCT4_10_FN
0
cglobal
vp9_idct_idct_4x4_add_10
,
4
,
4
,
8
,
dst
,
stride
,
block
,
eob
cmp
eobd
,
1
jg
.
idctfull
; dc-only
%if
cpuflag
(
ssse3
)
movd
m0
,
[blockq]
mova
m5
,
[
pw_11585x2
]
pmulhrsw
m0
,
m5
pmulhrsw
m0
,
m5
%else
DEFINE_ARGS
dst
,
stride
,
block
,
coef
mov
coefd
,
dword
[blockq]
imul
coefd
,
11585
add
coefd
,
8192
sar
coefd
,
14
imul
coefd
,
11585
add
coefd
,
(
8
<<
14
)
+
8192
sar
coefd
,
14
+
4
movd
m0
,
coefd
%endif
pshufw
m0
,
m0
,
0
pxor
m4
,
m4
mova
m5
,
[
pw_1023
]
movh
[blockq],
m4
%if
cpuflag
(
ssse3
)
pmulhrsw
m0
,
[
pw_2048
]
; (x*2048 + (1<<14))>>15 <=> (x+8)>>4
%endif
VP9_STORE_2X
0
,
0
,
6
,
7
,
4
,
5
lea
dstq
,
[
dstq
+
2
*
strideq
]
VP9_STORE_2X
0
,
0
,
6
,
7
,
4
,
5
RET
.
idctfull
:
mova
m0
,
[
blockq
+
0
*
16
+
0
]
mova
m1
,
[
blockq
+
1
*
16
+
0
]
packssdw
m0
,
[
blockq
+
0
*
16
+
8
]
packssdw
m1
,
[
blockq
+
1
*
16
+
8
]
mova
m2
,
[
blockq
+
2
*
16
+
0
]
mova
m3
,
[
blockq
+
3
*
16
+
0
]
packssdw
m2
,
[
blockq
+
2
*
16
+
8
]
packssdw
m3
,
[
blockq
+
3
*
16
+
8
]
%if
cpuflag
(
ssse3
)
mova
m6
,
[
pw_11585x2
]
%endif
mova
m7
,
[
pd_8192
]
; rounding
VP9_IDCT4_1D
TRANSPOSE4x4W
0
,
1
,
2
,
3
,
4
VP9_IDCT4_1D
pxor
m4
,
m4
ZERO_BLOCK
blockq
,
16
,
4
,
m4
%if
cpuflag
(
ssse3
)
mova
m5
,
[
pw_2048
]
pmulhrsw
m0
,
m5
pmulhrsw
m1
,
m5
pmulhrsw
m2
,
m5
pmulhrsw
m3
,
m5
%else
mova
m5
,
[
pw_8
]
paddw
m0
,
m5
paddw
m1
,
m5
paddw
m2
,
m5
paddw
m3
,
m5
psraw
m0
,
4
psraw
m1
,
4
psraw
m2
,
4
psraw
m3
,
4
%endif
mova
m5
,
[
pw_1023
]
VP9_STORE_2X
0
,
1
,
6
,
7
,
4
,
5
lea
dstq
,
[
dstq
+
2
*
strideq
]
VP9_STORE_2X
2
,
3
,
6
,
7
,
4
,
5
RET
%endmacro
INIT_MMX
mmxext
IDCT4_10_FN
INIT_MMX
ssse3
IDCT4_10_FN
libavcodec/x86/vp9itxfm_template.asm
View file @
6b579cf5
...
...
@@ -35,3 +35,50 @@
paddw
m3
,
m2
SWAP
3
,
2
,
1
%endmacro
; (a*x + b*y + round) >> shift
%macro
VP9_MULSUB_2W_2X
5
; dst1, dst2/src, round, coefs1, coefs2
pmaddwd
m%1
,
m%2
,
%4
pmaddwd
m%2
,
%5
paddd
m%1
,
%3
paddd
m%2
,
%3
psrad
m%1
,
14
psrad
m%2
,
14
%endmacro
%macro
VP9_MULSUB_2W_4X
7
; dst1, dst2, coef1, coef2, rnd, tmp1/src, tmp2
VP9_MULSUB_2W_2X
%7
,
%6
,
%5
,
[
pw_m
%3
_
%4
]
,
[
pw_
%4
_
%3
]
VP9_MULSUB_2W_2X
%1
,
%2
,
%5
,
[
pw_m
%3
_
%4
]
,
[
pw_
%4
_
%3
]
packssdw
m%1
,
m%7
packssdw
m%2
,
m%6
%endmacro
%macro
VP9_UNPACK_MULSUB_2W_4X
7
-
9
; dst1, dst2, (src1, src2,) coef1, coef2, rnd, tmp1, tmp2
%if
%0
==
7
punpckhwd
m%6
,
m%2
,
m%1
punpcklwd
m%2
,
m%1
VP9_MULSUB_2W_4X
%1
,
%2
,
%3
,
%4
,
%5
,
%6
,
%7
%else
punpckhwd
m%8
,
m%4
,
m%3
punpcklwd
m%2
,
m%4
,
m%3
VP9_MULSUB_2W_4X
%1
,
%2
,
%5
,
%6
,
%7
,
%8
,
%9
%endif
%endmacro
%macro
VP9_IDCT4_1D_FINALIZE
0
SUMSUB_BA
w
,
3
,
2
,
4
; m3=t3+t0, m2=-t3+t0
SUMSUB_BA
w
,
1
,
0
,
4
; m1=t2+t1, m0=-t2+t1
SWAP
0
,
3
,
2
; 3102 -> 0123
%endmacro
%macro
VP9_IDCT4_1D
0
%if
cpuflag
(
ssse3
)
SUMSUB_BA
w
,
2
,
0
,
4
; m2=IN(0)+IN(2) m0=IN(0)-IN(2)
pmulhrsw
m2
,
m6
; m2=t0
pmulhrsw
m0
,
m6
; m0=t1
%else
; <= sse2
VP9_UNPACK_MULSUB_2W_4X
0
,
2
,
11585
,
11585
,
m7
,
4
,
5
; m0=t1, m1=t0
%endif
VP9_UNPACK_MULSUB_2W_4X
1
,
3
,
15137
,
6270
,
m7
,
4
,
5
; m1=t2, m3=t3
VP9_IDCT4_1D_FINALIZE
%endmacro
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment