Skip to content
Projects
Groups
Snippets
Help
Loading...
Sign in / Register
Toggle navigation
F
ffmpeg
Project
Project
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Packages
Packages
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
submodule
ffmpeg
Commits
26ece7a5
Commit
26ece7a5
authored
Sep 25, 2015
by
Ronald S. Bultje
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
vp9: 16bpp tm/dc/h/v intra pred simd (mostly sse2) functions.
parent
db7786e8
Hide whitespace changes
Inline
Side-by-side
Showing
9 changed files
with
669 additions
and
5 deletions
+669
-5
Makefile
libavcodec/x86/Makefile
+1
-0
constants.c
libavcodec/x86/constants.c
+4
-0
constants.h
libavcodec/x86/constants.h
+2
-0
h264_idct_10bit.asm
libavcodec/x86/h264_idct_10bit.asm
+1
-4
h264_intrapred_10bit.asm
libavcodec/x86/h264_intrapred_10bit.asm
+1
-1
vp9dsp_init.h
libavcodec/x86/vp9dsp_init.h
+23
-0
vp9dsp_init_16bpp.c
libavcodec/x86/vp9dsp_init_16bpp.c
+15
-0
vp9dsp_init_16bpp_template.c
libavcodec/x86/vp9dsp_init_16bpp_template.c
+7
-0
vp9intrapred_16bpp.asm
libavcodec/x86/vp9intrapred_16bpp.asm
+615
-0
No files found.
libavcodec/x86/Makefile
View file @
26ece7a5
...
...
@@ -158,6 +158,7 @@ YASM-OBJS-$(CONFIG_VC1_DECODER) += x86/vc1dsp.o
YASM-OBJS-$(CONFIG_VORBIS_DECODER)
+=
x86/vorbisdsp.o
YASM-OBJS-$(CONFIG_VP6_DECODER)
+=
x86/vp6dsp.o
YASM-OBJS-$(CONFIG_VP9_DECODER)
+=
x86/vp9intrapred.o
\
x86/vp9intrapred_16bpp.o
\
x86/vp9itxfm.o
\
x86/vp9lpf.o
\
x86/vp9lpf_16bpp.o
\
...
...
libavcodec/x86/constants.c
View file @
26ece7a5
...
...
@@ -81,3 +81,7 @@ DECLARE_ALIGNED(16, const xmm_reg, ff_ps_neg) = { 0x8000000080000000ULL, 0x800
/* 32-byte-aligned constants whose 32-bit lanes all hold the same value:
 * every 64-bit initializer below packs two identical dwords. */

/* Eight dwords equal to 1. */
DECLARE_ALIGNED(32, const ymm_reg,  ff_pd_1)    = { 0x0000000100000001ULL, 0x0000000100000001ULL,
                                                    0x0000000100000001ULL, 0x0000000100000001ULL };
/* Eight dwords equal to 16 (0x10). */
DECLARE_ALIGNED(32, const ymm_reg,  ff_pd_16)   = { 0x0000001000000010ULL, 0x0000001000000010ULL,
                                                    0x0000001000000010ULL, 0x0000001000000010ULL };
/* Eight dwords equal to 32 (0x20). */
DECLARE_ALIGNED(32, const ymm_reg,  ff_pd_32)   = { 0x0000002000000020ULL, 0x0000002000000020ULL,
                                                    0x0000002000000020ULL, 0x0000002000000020ULL };
libavcodec/x86/constants.h
View file @
26ece7a5
...
...
@@ -63,5 +63,7 @@ extern const uint64_t ff_pb_FC;
/* Shared SIMD constants; the definitions live in constants.c. */
extern const xmm_reg  ff_ps_neg;

/* Packed-dword constants (the numeric suffix is the value of every 32-bit lane). */
extern const ymm_reg  ff_pd_1;
extern const ymm_reg  ff_pd_16;
extern const ymm_reg  ff_pd_32;
#endif
/* AVCODEC_X86_CONSTANTS_H */
libavcodec/x86/h264_idct_10bit.asm
View file @
26ece7a5
...
...
@@ -24,14 +24,11 @@
%include
"libavutil/x86/x86util.asm"
SECTION_RODATA
pd_32
:
times
4
dd
32
SECTION
.
text
cextern
pw_1023
%define
pw_pixel_max
pw_1023
cextern
pd_32
;-----------------------------------------------------------------------------
; void ff_h264_idct_add_10(pixel *dst, int16_t *block, int stride)
...
...
libavcodec/x86/h264_intrapred_10bit.asm
View file @
26ece7a5
...
...
@@ -34,11 +34,11 @@ cextern pw_8
cextern
pw_4
cextern
pw_2
cextern
pw_1
cextern
pd_16
pw_m32101234
:
dw
-
3
,
-
2
,
-
1
,
0
,
1
,
2
,
3
,
4
pw_m3
:
times
8
dw
-
3
pd_17
:
times
4
dd
17
pd_16
:
times
4
dd
16
SECTION
.
text
...
...
libavcodec/x86/vp9dsp_init.h
View file @
26ece7a5
...
...
@@ -41,6 +41,18 @@ decl_mc_func(avg, sz, h, opt, type, fsz, bpp); \
decl_mc_func(put, sz, v, opt, type, fsz, bpp); \
decl_mc_func(avg, sz, v, opt, type, fsz, bpp)
/*
 * Declare one intra-prediction routine named
 * ff_vp9_ipred_<type>_<sz>x<sz>_<bpp>_<opt>, taking the destination pointer,
 * its stride and the "l" and "a" edge pointers.
 * No trailing semicolon is emitted; the user supplies it.
 */
#define decl_ipred_fn(type, sz, bpp, opt) \
void ff_vp9_ipred_##type##_##sz##x##sz##_##bpp##_##opt(uint8_t *dst, \
ptrdiff_t stride, \
const uint8_t *l, \
const uint8_t *a)
/*
 * Declare the 4x4, 8x8, 16x16 and 32x32 variants of one predictor type.
 * opt4 is the instruction-set suffix used for the 4x4 routine; opt8_16_32 is
 * the suffix shared by the three larger sizes.
 */
#define decl_ipred_fns(type, bpp, opt4, opt8_16_32) \
decl_ipred_fn(type, 4, bpp, opt4); \
decl_ipred_fn(type, 8, bpp, opt8_16_32); \
decl_ipred_fn(type, 16, bpp, opt8_16_32); \
decl_ipred_fn(type, 32, bpp, opt8_16_32)
#define mc_rep_func(avg, sz, hsz, hszb, dir, opt, type, f_sz, bpp) \
static av_always_inline void \
ff_vp9_##avg##_8tap_1d_##dir##_##sz##_##bpp##_##opt(uint8_t *dst, ptrdiff_t dst_stride, \
...
...
@@ -142,6 +154,17 @@ filters_8tap_2d_fn(op, 4, align, bpp, bytes, opt4, f_opt)
init_subpel3_8to64(idx, type, bpp, opt); \
init_subpel2(4, idx, 4, type, bpp, opt)
/*
 * Paste three tokens together. Going through this helper (instead of pasting
 * directly in the caller) lets the caller's bpp argument be macro-expanded
 * first, so a macro such as BPC can be passed as the bit depth.
 */
#define cat(a, bpp, b) a##bpp##b
/*
 * Install ff_vp9_ipred_<type>_<sz>x<sz>_<bpp>_<opt> into
 * dsp->intra_pred[TX_<sz>X<sz>][<enum>_PRED].
 * Expects a variable named dsp to be in scope at the expansion site.
 */
#define init_ipred_func(type, enum, sz, bpp, opt) \
dsp->intra_pred[TX_##sz##X##sz][enum##_PRED] = \
cat(ff_vp9_ipred_##type##_##sz##x##sz##_, bpp, _##opt)
/*
 * Install the 8x8, 16x16 and 32x32 routines of one predictor type.
 * NOTE: expands to three separate statements, so it must only be used inside
 * a braced block (never as the sole body of an unbraced if/else/loop).
 */
#define init_8_16_32_ipred_funcs(type, enum, bpp, opt) \
init_ipred_func(type, enum, 8, bpp, opt); \
init_ipred_func(type, enum, 16, bpp, opt); \
init_ipred_func(type, enum, 32, bpp, opt)
/*
 * x86 initializers for the high-bit-depth VP9 DSP function tables.
 * ff_vp9dsp_init_16bpp_x86() is defined in vp9dsp_init_16bpp.c.
 * NOTE(review): the 10bpp/12bpp variants are presumably the INIT_FUNC
 * instantiations of vp9dsp_init_16bpp_template.c -- confirm.
 */
void ff_vp9dsp_init_10bpp_x86(VP9DSPContext *dsp);
void ff_vp9dsp_init_12bpp_x86(VP9DSPContext *dsp);
void ff_vp9dsp_init_16bpp_x86(VP9DSPContext *dsp);
...
...
libavcodec/x86/vp9dsp_init_16bpp.c
View file @
26ece7a5
...
...
@@ -46,6 +46,11 @@ decl_fpel_func(avg, 32, _16, avx2);
decl_fpel_func
(
avg
,
64
,
_16
,
avx2
);
decl_fpel_func
(
avg
,
128
,
_16
,
avx2
);
decl_ipred_fns
(
v
,
16
,
mmx
,
sse
);
decl_ipred_fns
(
h
,
16
,
mmxext
,
sse2
);
decl_ipred_fns
(
dc
,
16
,
mmxext
,
sse2
);
decl_ipred_fns
(
dc_top
,
16
,
mmxext
,
sse2
);
decl_ipred_fns
(
dc_left
,
16
,
mmxext
,
sse2
);
#endif
/* HAVE_YASM */
av_cold
void
ff_vp9dsp_init_16bpp_x86
(
VP9DSPContext
*
dsp
)
...
...
@@ -55,10 +60,15 @@ av_cold void ff_vp9dsp_init_16bpp_x86(VP9DSPContext *dsp)
if
(
EXTERNAL_MMX
(
cpu_flags
))
{
init_fpel_func
(
4
,
0
,
8
,
put
,
,
mmx
);
init_ipred_func
(
v
,
VERT
,
4
,
16
,
mmx
);
}
if
(
EXTERNAL_MMXEXT
(
cpu_flags
))
{
init_fpel_func
(
4
,
1
,
8
,
avg
,
_16
,
mmxext
);
init_ipred_func
(
h
,
HOR
,
4
,
16
,
mmxext
);
init_ipred_func
(
dc
,
DC
,
4
,
16
,
mmxext
);
init_ipred_func
(
dc_top
,
TOP_DC
,
4
,
16
,
mmxext
);
init_ipred_func
(
dc_left
,
LEFT_DC
,
4
,
16
,
mmxext
);
}
if
(
EXTERNAL_SSE
(
cpu_flags
))
{
...
...
@@ -66,6 +76,7 @@ av_cold void ff_vp9dsp_init_16bpp_x86(VP9DSPContext *dsp)
init_fpel_func
(
2
,
0
,
32
,
put
,
,
sse
);
init_fpel_func
(
1
,
0
,
64
,
put
,
,
sse
);
init_fpel_func
(
0
,
0
,
128
,
put
,
,
sse
);
init_8_16_32_ipred_funcs
(
v
,
VERT
,
16
,
sse
);
}
if
(
EXTERNAL_SSE2
(
cpu_flags
))
{
...
...
@@ -73,6 +84,10 @@ av_cold void ff_vp9dsp_init_16bpp_x86(VP9DSPContext *dsp)
init_fpel_func
(
2
,
1
,
32
,
avg
,
_16
,
sse2
);
init_fpel_func
(
1
,
1
,
64
,
avg
,
_16
,
sse2
);
init_fpel_func
(
0
,
1
,
128
,
avg
,
_16
,
sse2
);
init_8_16_32_ipred_funcs
(
h
,
HOR
,
16
,
sse2
);
init_8_16_32_ipred_funcs
(
dc
,
DC
,
16
,
sse2
);
init_8_16_32_ipred_funcs
(
dc_top
,
TOP_DC
,
16
,
sse2
);
init_8_16_32_ipred_funcs
(
dc_left
,
LEFT_DC
,
16
,
sse2
);
}
if
(
EXTERNAL_AVX_FAST
(
cpu_flags
))
{
...
...
libavcodec/x86/vp9dsp_init_16bpp_template.c
View file @
26ece7a5
...
...
@@ -121,6 +121,8 @@ lpf_mix2_wrappers(8, 8, bpp, opt); \
lpf_mix2_wrappers_set
(
BPC
,
sse2
);
lpf_mix2_wrappers_set
(
BPC
,
ssse3
);
lpf_mix2_wrappers_set
(
BPC
,
avx
);
decl_ipred_fns
(
tm
,
BPC
,
mmxext
,
sse2
);
#endif
/* HAVE_YASM */
av_cold
void
INIT_FUNC
(
VP9DSPContext
*
dsp
)
...
...
@@ -153,10 +155,15 @@ av_cold void INIT_FUNC(VP9DSPContext *dsp)
init_lpf_mix2_func(1, 0, 1, v, 8, 4, bpp, opt); \
init_lpf_mix2_func(1, 1, 1, v, 8, 8, bpp, opt)
if
(
EXTERNAL_MMXEXT
(
cpu_flags
))
{
init_ipred_func
(
tm
,
TM_VP8
,
4
,
BPC
,
mmxext
);
}
if
(
EXTERNAL_SSE2
(
cpu_flags
))
{
init_subpel3
(
0
,
put
,
BPC
,
sse2
);
init_subpel3
(
1
,
avg
,
BPC
,
sse2
);
init_lpf_funcs
(
BPC
,
sse2
);
init_8_16_32_ipred_funcs
(
tm
,
TM_VP8
,
BPC
,
sse2
);
}
if
(
EXTERNAL_SSSE3
(
cpu_flags
))
{
...
...
libavcodec/x86/vp9intrapred_16bpp.asm
0 → 100644
View file @
26ece7a5
;******************************************************************************
;* VP9 Intra prediction SIMD optimizations
;*
;* Copyright (c) 2015 Ronald S. Bultje <rsbultje gmail com>
;* Copyright (c) 2015 Henrik Gramner <henrik gramner com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
%include
"libavutil/x86/x86util.asm"
SECTION_RODATA 32

; Packed-dword rounding offsets added before the final right shift in the DC
; predictors below (each is eight dwords, i.e. 32 bytes).
pd_2: times 8 dd 2
pd_4: times 8 dd 4
pd_8: times 8 dd 8

; Constants defined elsewhere in the project.
; NOTE(review): pw_1 is used as the pmaddwd multiplier and pw_1023/pw_4095 as
; the upper clamp of the TM predictors; the names suggest words of 1, 1023 and
; 4095 -- confirm against constants.c.
cextern pw_1
cextern pw_1023
cextern pw_4095
cextern pd_16
cextern pd_32

SECTION .text
; Vertical prediction, 4x4: loads one MMX register from a and stores that same
; register to four consecutive rows of dst.
INIT_MMX mmx
cglobal vp9_ipred_v_4x4_16, 2, 4, 1, dst, stride, l, a
    ; only two arguments are auto-loaded, so fetch a explicitly if needed
    movifnidn               aq, amp
    mova                    m0, [aq]
    ; l is never read, so its register is renamed and reused for stride*3
    DEFINE_ARGS dst, stride, stride3
    lea               stride3q, [strideq*3]
    mova      [dstq+strideq*0], m0
    mova      [dstq+strideq*1], m0
    mova      [dstq+strideq*2], m0
    mova      [dstq+stride3q], m0
    RET
; Vertical prediction, 8x8: loads one XMM register from a and stores it to
; eight consecutive rows of dst (two groups of four rows).
INIT_XMM sse
cglobal vp9_ipred_v_8x8_16, 2, 4, 1, dst, stride, l, a
    ; only two arguments are auto-loaded, so fetch a explicitly if needed
    movifnidn               aq, amp
    mova                    m0, [aq]
    ; l is never read, so its register is renamed and reused for stride*3
    DEFINE_ARGS dst, stride, stride3
    lea               stride3q, [strideq*3]
    mova      [dstq+strideq*0], m0
    mova      [dstq+strideq*1], m0
    mova      [dstq+strideq*2], m0
    mova      [dstq+stride3q], m0
    ; advance to rows 4..7
    lea                   dstq, [dstq+strideq*4]
    mova      [dstq+strideq*0], m0
    mova      [dstq+strideq*1], m0
    mova      [dstq+strideq*2], m0
    mova      [dstq+stride3q], m0
    RET
; Vertical prediction, 16x16: loads two XMM registers (32 bytes) from a and
; replicates them down 16 rows, four rows per loop iteration.
INIT_XMM sse
cglobal vp9_ipred_v_16x16_16, 2, 4, 2, dst, stride, l, a
    ; only two arguments are auto-loaded, so fetch a explicitly if needed
    movifnidn               aq, amp
    mova                    m0, [aq]
    mova                    m1, [aq+mmsize]
    ; a and l are no longer needed: their registers become stride3 and cnt
    DEFINE_ARGS dst, stride, stride3, cnt
    lea               stride3q, [strideq*3]
    mov                   cntd, 4               ; 4 iterations x 4 rows
.loop:
    mova   [dstq+strideq*0+ 0], m0
    mova   [dstq+strideq*0+16], m1
    mova   [dstq+strideq*1+ 0], m0
    mova   [dstq+strideq*1+16], m1
    mova   [dstq+strideq*2+ 0], m0
    mova   [dstq+strideq*2+16], m1
    mova   [dstq+stride3q + 0], m0
    mova   [dstq+stride3q +16], m1
    lea                   dstq, [dstq+strideq*4]
    dec               cntd
    jg .loop
    RET
; Vertical prediction, 32x32: loads four XMM registers (64 bytes) from a and
; replicates them down 32 rows, two rows per loop iteration.
INIT_XMM sse
cglobal vp9_ipred_v_32x32_16, 2, 4, 4, dst, stride, l, a
    ; only two arguments are auto-loaded, so fetch a explicitly if needed
    movifnidn               aq, amp
    mova                    m0, [aq+mmsize*0]
    mova                    m1, [aq+mmsize*1]
    mova                    m2, [aq+mmsize*2]
    mova                    m3, [aq+mmsize*3]
    ; l is never read, so its register is renamed and reused as the counter
    DEFINE_ARGS dst, stride, cnt
    mov                   cntd, 16              ; 16 iterations x 2 rows
.loop:
    mova   [dstq+strideq*0+ 0], m0
    mova   [dstq+strideq*0+16], m1
    mova   [dstq+strideq*0+32], m2
    mova   [dstq+strideq*0+48], m3
    mova   [dstq+strideq*1+ 0], m0
    mova   [dstq+strideq*1+16], m1
    mova   [dstq+strideq*1+32], m2
    mova   [dstq+strideq*1+48], m3
    lea                   dstq, [dstq+strideq*2]
    dec               cntd
    jg .loop
    RET
; Horizontal prediction, 4x4: each output row is one 16-bit word of l
; broadcast across the row. Word 3 of l fills the first row and word 0 the
; last one.
INIT_MMX mmxext
cglobal vp9_ipred_h_4x4_16, 3, 3, 4, dst, stride, l, a
    mova                    m3, [lq]
    ; l has been consumed: reuse its register for stride*3
    DEFINE_ARGS dst, stride, stride3
    lea               stride3q, [strideq*3]
    pshufw                  m0, m3, q3333
    pshufw                  m1, m3, q2222
    pshufw                  m2, m3, q1111
    pshufw                  m3, m3, q0000
    mova      [dstq+strideq*0], m0
    mova      [dstq+strideq*1], m1
    mova      [dstq+strideq*2], m2
    mova      [dstq+stride3q], m3
    RET
; Horizontal prediction, 8x8: each output row is one 16-bit word of l
; broadcast across the row. Word 7 of l fills the first row and word 0 the
; last one.
INIT_XMM sse2
cglobal vp9_ipred_h_8x8_16, 3, 3, 4, dst, stride, l, a
    mova                    m2, [lq]
    ; l has been consumed: reuse its register for stride*3
    DEFINE_ARGS dst, stride, stride3
    lea               stride3q, [strideq*3]
    ; duplicate words 4..7 so every dword holds one word twice; a pshufd
    ; broadcast of that dword then fills a whole row
    punpckhwd               m3, m2, m2
    pshufd                  m0, m3, q3333
    pshufd                  m1, m3, q2222
    mova      [dstq+strideq*0], m0
    mova      [dstq+strideq*1], m1
    pshufd                  m0, m3, q1111
    pshufd                  m1, m3, q0000
    mova      [dstq+strideq*2], m0
    mova      [dstq+stride3q], m1
    ; rows 4..7 come from words 3..0
    lea                   dstq, [dstq+strideq*4]
    punpcklwd               m2, m2
    pshufd                  m0, m2, q3333
    pshufd                  m1, m2, q2222
    mova      [dstq+strideq*0], m0
    mova      [dstq+strideq*1], m1
    pshufd                  m0, m2, q1111
    pshufd                  m1, m2, q0000
    mova      [dstq+strideq*2], m0
    mova      [dstq+stride3q], m1
    RET
; Horizontal prediction, 16x16: processes four rows per iteration, reading
; four words of l at a time from the end of the array towards its start
; (cnt runs 3..0). Each word is broadcast over a 32-byte row.
; The fourth argument is not read, so its register is named stride3.
INIT_XMM sse2
cglobal vp9_ipred_h_16x16_16, 3, 5, 4, dst, stride, l, stride3, cnt
    mov                   cntd, 3
    lea               stride3q, [strideq*3]
.loop:
    movh                    m3, [lq+cntq*8]     ; 4 words (8 bytes) of l
    punpcklwd               m3, m3              ; each word duplicated into a dword
    pshufd                  m0, m3, q3333
    pshufd                  m1, m3, q2222
    pshufd                  m2, m3, q1111
    pshufd                  m3, m3, q0000
    mova   [dstq+strideq*0+ 0], m0
    mova   [dstq+strideq*0+16], m0
    mova   [dstq+strideq*1+ 0], m1
    mova   [dstq+strideq*1+16], m1
    mova   [dstq+strideq*2+ 0], m2
    mova   [dstq+strideq*2+16], m2
    mova   [dstq+stride3q + 0], m3
    mova   [dstq+stride3q +16], m3
    lea                   dstq, [dstq+strideq*4]
    dec                   cntd
    jge .loop                                   ; runs for cnt = 3, 2, 1, 0
    RET
; Horizontal prediction, 32x32: processes four rows per iteration, reading
; four words of l at a time from the end of the array towards its start
; (cnt runs 7..0). Each word is broadcast over a 64-byte row.
; The fourth argument is not read, so its register is named stride3.
INIT_XMM sse2
cglobal vp9_ipred_h_32x32_16, 3, 5, 4, dst, stride, l, stride3, cnt
    mov                   cntd, 7
    lea               stride3q, [strideq*3]
.loop:
    movh                    m3, [lq+cntq*8]     ; 4 words (8 bytes) of l
    punpcklwd               m3, m3              ; each word duplicated into a dword
    pshufd                  m0, m3, q3333
    pshufd                  m1, m3, q2222
    pshufd                  m2, m3, q1111
    pshufd                  m3, m3, q0000
    mova   [dstq+strideq*0+ 0], m0
    mova   [dstq+strideq*0+16], m0
    mova   [dstq+strideq*0+32], m0
    mova   [dstq+strideq*0+48], m0
    mova   [dstq+strideq*1+ 0], m1
    mova   [dstq+strideq*1+16], m1
    mova   [dstq+strideq*1+32], m1
    mova   [dstq+strideq*1+48], m1
    mova   [dstq+strideq*2+ 0], m2
    mova   [dstq+strideq*2+16], m2
    mova   [dstq+strideq*2+32], m2
    mova   [dstq+strideq*2+48], m2
    mova   [dstq+stride3q + 0], m3
    mova   [dstq+stride3q +16], m3
    mova   [dstq+stride3q +32], m3
    mova   [dstq+stride3q +48], m3
    lea                   dstq, [dstq+strideq*4]
    dec                   cntd
    jge .loop                                   ; runs for cnt = 7 .. 0
    RET
; DC prediction, 4x4: fills the block with (sum(l[0..3]) + sum(a[0..3]) + 4) >> 3.
; NOTE(review): relies on pw_1 being words of 1 so that pmaddwd acts as a
; pairwise word -> dword add -- confirm against constants.c.
INIT_MMX mmxext
cglobal vp9_ipred_dc_4x4_16, 4, 4, 2, dst, stride, l, a
    mova                    m0, [lq]
    paddw                   m0, [aq]            ; word-wise l + a
    ; both edges consumed: reuse the l register for stride*3
    DEFINE_ARGS dst, stride, stride3
    lea               stride3q, [strideq*3]
    pmaddwd                 m0, [pw_1]          ; two dword partial sums
    pshufw                  m1, m0, q3232       ; upper dword moved to the low half
    paddd                   m0, [pd_4]          ; rounding offset
    paddd                   m0, m1              ; low dword = total + 4
    psrad                   m0, 3
    pshufw                  m0, m0, q0000       ; broadcast the low word
    mova      [dstq+strideq*0], m0
    mova      [dstq+strideq*1], m0
    mova      [dstq+strideq*2], m0
    mova      [dstq+stride3q], m0
    RET
; DC prediction, 8x8: fills the block with (sum(l[0..7]) + sum(a[0..7]) + 8) >> 4.
; NOTE(review): relies on pw_1 being words of 1 so that pmaddwd acts as a
; pairwise word -> dword add -- confirm against constants.c.
INIT_XMM sse2
cglobal vp9_ipred_dc_8x8_16, 4, 4, 2, dst, stride, l, a
    mova                    m0, [lq]
    paddw                   m0, [aq]            ; word-wise l + a
    ; both edges consumed: reuse the l register for stride*3
    DEFINE_ARGS dst, stride, stride3
    lea               stride3q, [strideq*3]
    pmaddwd                 m0, [pw_1]          ; four dword partial sums
    pshufd                  m1, m0, q3232
    paddd                   m0, m1              ; fold high half onto low half
    pshufd                  m1, m0, q1111
    paddd                   m0, [pd_8]          ; rounding offset
    paddd                   m0, m1              ; low dword = total + 8
    psrad                   m0, 4
    pshuflw                 m0, m0, q0000       ; broadcast the low word ...
    punpcklqdq              m0, m0              ; ... to all eight words
    mova      [dstq+strideq*0], m0
    mova      [dstq+strideq*1], m0
    mova      [dstq+strideq*2], m0
    mova      [dstq+stride3q], m0
    lea                   dstq, [dstq+strideq*4]
    mova      [dstq+strideq*0], m0
    mova      [dstq+strideq*1], m0
    mova      [dstq+strideq*2], m0
    mova      [dstq+stride3q], m0
    RET
; DC prediction, 16x16: fills the block with
; (sum(l[0..15]) + sum(a[0..15]) + 16) >> 5, four rows per loop iteration.
; NOTE(review): relies on pw_1 being words of 1 and on the external pd_16
; being dwords of 16 -- confirm against constants.c.
INIT_XMM sse2
cglobal vp9_ipred_dc_16x16_16, 4, 4, 2, dst, stride, l, a
    mova                    m0, [lq]
    paddw                   m0, [lq+mmsize]
    paddw                   m0, [aq]
    paddw                   m0, [aq+mmsize]     ; eight word partial sums
    ; both edges consumed: their registers become stride3 and cnt
    DEFINE_ARGS dst, stride, stride3, cnt
    lea               stride3q, [strideq*3]
    mov                   cntd, 4               ; 4 iterations x 4 rows
    pmaddwd                 m0, [pw_1]          ; four dword partial sums
    pshufd                  m1, m0, q3232
    paddd                   m0, m1
    pshufd                  m1, m0, q1111
    paddd                   m0, [pd_16]         ; rounding offset
    paddd                   m0, m1
    psrad                   m0, 5
    pshuflw                 m0, m0, q0000       ; broadcast the low word ...
    punpcklqdq              m0, m0              ; ... to all eight words
.loop:
    mova   [dstq+strideq*0+ 0], m0
    mova   [dstq+strideq*0+16], m0
    mova   [dstq+strideq*1+ 0], m0
    mova   [dstq+strideq*1+16], m0
    mova   [dstq+strideq*2+ 0], m0
    mova   [dstq+strideq*2+16], m0
    mova   [dstq+stride3q + 0], m0
    mova   [dstq+stride3q +16], m0
    lea                   dstq, [dstq+strideq*4]
    dec                   cntd
    jg .loop
    RET
; DC prediction, 32x32: fills the block with
; (sum(l[0..31]) + sum(a[0..31]) + 32) >> 6, two rows per loop iteration.
; The store loop only addresses strideq*0 and strideq*1, so no stride*3
; register is set up (same layout as the 32x32 routine in DC_1D_FNS).
; NOTE(review): relies on pw_1 being words of 1 and on the external pd_32
; being dwords of 32 -- confirm against constants.c.
INIT_XMM sse2
cglobal vp9_ipred_dc_32x32_16, 4, 4, 2, dst, stride, l, a
    mova                    m0, [lq+mmsize*0]
    paddw                   m0, [lq+mmsize*1]
    paddw                   m0, [lq+mmsize*2]
    paddw                   m0, [lq+mmsize*3]
    paddw                   m0, [aq+mmsize*0]
    paddw                   m0, [aq+mmsize*1]
    paddw                   m0, [aq+mmsize*2]
    paddw                   m0, [aq+mmsize*3]   ; eight word partial sums
    ; both edges consumed: reuse the l register as the loop counter
    DEFINE_ARGS dst, stride, cnt
    mov                   cntd, 16              ; 16 iterations x 2 rows
    pmaddwd                 m0, [pw_1]          ; four dword partial sums
    pshufd                  m1, m0, q3232
    paddd                   m0, m1
    pshufd                  m1, m0, q1111
    paddd                   m0, [pd_32]         ; rounding offset
    paddd                   m0, m1
    psrad                   m0, 6
    pshuflw                 m0, m0, q0000       ; broadcast the low word ...
    punpcklqdq              m0, m0              ; ... to all eight words
.loop:
    mova   [dstq+strideq*0+ 0], m0
    mova   [dstq+strideq*0+16], m0
    mova   [dstq+strideq*0+32], m0
    mova   [dstq+strideq*0+48], m0
    mova   [dstq+strideq*1+ 0], m0
    mova   [dstq+strideq*1+16], m0
    mova   [dstq+strideq*1+32], m0
    mova   [dstq+strideq*1+48], m0
    lea                   dstq, [dstq+strideq*2]
    dec                   cntd
    jg .loop
    RET
; One-sided DC predictors. %1 is the name fragment ("top" / "left") and %2 is
; the register holding the edge to average (aq / lq). Each routine sums N
; edge samples, adds N/2 and shifts right by log2(N), then fills the block
; with the result.
; NOTE(review): relies on pw_1 being words of 1 and on the external pd_16
; being dwords of 16 -- confirm against constants.c.
%macro DC_1D_FNS 2
; 4x4: (sum of 4 samples + 2) >> 2
INIT_MMX mmxext
cglobal vp9_ipred_dc_%1_4x4_16, 4, 4, 2, dst, stride, l, a
    mova                    m0, [%2]
    ; edge consumed: reuse the l register for stride*3
    DEFINE_ARGS dst, stride, stride3
    lea               stride3q, [strideq*3]
    pmaddwd                 m0, [pw_1]          ; two dword partial sums
    pshufw                  m1, m0, q3232
    paddd                   m0, [pd_2]          ; rounding offset
    paddd                   m0, m1
    psrad                   m0, 2
    pshufw                  m0, m0, q0000       ; broadcast the low word
    mova      [dstq+strideq*0], m0
    mova      [dstq+strideq*1], m0
    mova      [dstq+strideq*2], m0
    mova      [dstq+stride3q], m0
    RET

; 8x8: (sum of 8 samples + 4) >> 3
INIT_XMM sse2
cglobal vp9_ipred_dc_%1_8x8_16, 4, 4, 2, dst, stride, l, a
    mova                    m0, [%2]
    ; edge consumed: reuse the l register for stride*3
    DEFINE_ARGS dst, stride, stride3
    lea               stride3q, [strideq*3]
    pmaddwd                 m0, [pw_1]          ; four dword partial sums
    pshufd                  m1, m0, q3232
    paddd                   m0, m1
    pshufd                  m1, m0, q1111
    paddd                   m0, [pd_4]          ; rounding offset
    paddd                   m0, m1
    psrad                   m0, 3
    pshuflw                 m0, m0, q0000       ; broadcast the low word ...
    punpcklqdq              m0, m0              ; ... to all eight words
    mova      [dstq+strideq*0], m0
    mova      [dstq+strideq*1], m0
    mova      [dstq+strideq*2], m0
    mova      [dstq+stride3q], m0
    lea                   dstq, [dstq+strideq*4]
    mova      [dstq+strideq*0], m0
    mova      [dstq+strideq*1], m0
    mova      [dstq+strideq*2], m0
    mova      [dstq+stride3q], m0
    RET

; 16x16: (sum of 16 samples + 8) >> 4, four rows per iteration
INIT_XMM sse2
cglobal vp9_ipred_dc_%1_16x16_16, 4, 4, 2, dst, stride, l, a
    mova                    m0, [%2]
    paddw                   m0, [%2+mmsize]
    ; edge consumed: the l and a registers become stride3 and cnt
    DEFINE_ARGS dst, stride, stride3, cnt
    lea               stride3q, [strideq*3]
    mov                   cntd, 4
    pmaddwd                 m0, [pw_1]
    pshufd                  m1, m0, q3232
    paddd                   m0, m1
    pshufd                  m1, m0, q1111
    paddd                   m0, [pd_8]          ; rounding offset
    paddd                   m0, m1
    psrad                   m0, 4
    pshuflw                 m0, m0, q0000
    punpcklqdq              m0, m0
.loop:
    mova   [dstq+strideq*0+ 0], m0
    mova   [dstq+strideq*0+16], m0
    mova   [dstq+strideq*1+ 0], m0
    mova   [dstq+strideq*1+16], m0
    mova   [dstq+strideq*2+ 0], m0
    mova   [dstq+strideq*2+16], m0
    mova   [dstq+stride3q + 0], m0
    mova   [dstq+stride3q +16], m0
    lea                   dstq, [dstq+strideq*4]
    dec                   cntd
    jg .loop
    RET

; 32x32: (sum of 32 samples + 16) >> 5, two rows per iteration
INIT_XMM sse2
cglobal vp9_ipred_dc_%1_32x32_16, 4, 4, 2, dst, stride, l, a
    mova                    m0, [%2+mmsize*0]
    paddw                   m0, [%2+mmsize*1]
    paddw                   m0, [%2+mmsize*2]
    paddw                   m0, [%2+mmsize*3]
    ; edge consumed: reuse the l register as the loop counter
    DEFINE_ARGS dst, stride, cnt
    mov                   cntd, 16
    pmaddwd                 m0, [pw_1]
    pshufd                  m1, m0, q3232
    paddd                   m0, m1
    pshufd                  m1, m0, q1111
    paddd                   m0, [pd_16]         ; rounding offset
    paddd                   m0, m1
    psrad                   m0, 5
    pshuflw                 m0, m0, q0000
    punpcklqdq              m0, m0
.loop:
    mova   [dstq+strideq*0+ 0], m0
    mova   [dstq+strideq*0+16], m0
    mova   [dstq+strideq*0+32], m0
    mova   [dstq+strideq*0+48], m0
    mova   [dstq+strideq*1+ 0], m0
    mova   [dstq+strideq*1+16], m0
    mova   [dstq+strideq*1+32], m0
    mova   [dstq+strideq*1+48], m0
    lea                   dstq, [dstq+strideq*2]
    dec                   cntd
    jg .loop
    RET
%endmacro

; Instantiate the dc_top (averages a) and dc_left (averages l) families.
DC_1D_FNS top,  aq
DC_1D_FNS left, lq
; TM prediction, 4x4. Every output word is
;   clamp(l[y] + a[x] - a[-1], 0, max)
; where max is the word constant loaded into m5 before .body. Word 3 of l
; produces the first row and word 0 the last one.
; The _12 entry point below loads a different max and jumps to .body.
INIT_MMX mmxext
cglobal vp9_ipred_tm_4x4_10, 4, 4, 6, dst, stride, l, a
    mova                    m5, [pw_1023]       ; upper clamp
.body:
    mova                    m4, [aq]
    mova                    m3, [lq]
    movd                    m0, [aq-4]          ; dword ending just before a[0]
    pshufw                  m0, m0, q1111       ; broadcast its upper word, a[-1]
    psubw                   m4, m0              ; m4 = a[x] - a[-1]
    ; edges consumed: reuse the l register for stride*3
    DEFINE_ARGS dst, stride, stride3
    lea               stride3q, [strideq*3]
    pshufw                  m0, m3, q3333
    pshufw                  m1, m3, q2222
    pshufw                  m2, m3, q1111
    pshufw                  m3, m3, q0000
    paddw                   m0, m4
    paddw                   m1, m4
    paddw                   m2, m4
    paddw                   m3, m4
    pxor                    m4, m4              ; m4 now serves as the zero lower clamp
    pmaxsw                  m0, m4
    pmaxsw                  m1, m4
    pmaxsw                  m2, m4
    pmaxsw                  m3, m4
    pminsw                  m0, m5
    pminsw                  m1, m5
    pminsw                  m2, m5
    pminsw                  m3, m5
    mova      [dstq+strideq*0], m0
    mova      [dstq+strideq*1], m1
    mova      [dstq+strideq*2], m2
    mova      [dstq+stride3q], m3
    RET

; Same routine with pw_4095 as the upper clamp; shares the _10 body.
cglobal vp9_ipred_tm_4x4_12, 4, 4, 6, dst, stride, l, a
    mova                    m5, [pw_4095]
    jmp mangle(private_prefix %+ _ %+ vp9_ipred_tm_4x4_10 %+ SUFFIX).body
; TM prediction, 8x8. Every output word is
;   clamp(l[y] + a[x] - a[-1], 0, max)
; with max in m4 and zero in m6. Four rows are produced per iteration, reading
; l four words at a time from its end towards its start (cnt runs 1..0).
; The _12 entry point below loads a different max and jumps to .body.
INIT_XMM sse2
cglobal vp9_ipred_tm_8x8_10, 4, 5, 7, dst, stride, l, a
    mova                    m4, [pw_1023]       ; upper clamp
.body:
    pxor                    m6, m6              ; lower clamp (zero)
    mova                    m5, [aq]
    movd                    m0, [aq-4]          ; dword ending just before a[0]
    pshuflw                 m0, m0, q1111       ; broadcast its upper word, a[-1] ...
    punpcklqdq              m0, m0              ; ... to all eight words
    psubw                   m5, m0              ; m5 = a[x] - a[-1]
    ; a consumed: its register becomes stride3; l is still needed
    DEFINE_ARGS dst, stride, l, stride3, cnt
    lea               stride3q, [strideq*3]
    mov                   cntd, 1
.loop:
    movh                    m3, [lq+cntq*8]     ; 4 words (8 bytes) of l
    punpcklwd               m3, m3              ; each word duplicated into a dword
    pshufd                  m0, m3, q3333
    pshufd                  m1, m3, q2222
    pshufd                  m2, m3, q1111
    pshufd                  m3, m3, q0000
    paddw                   m0, m5
    paddw                   m1, m5
    paddw                   m2, m5
    paddw                   m3, m5
    pmaxsw                  m0, m6
    pmaxsw                  m1, m6
    pmaxsw                  m2, m6
    pmaxsw                  m3, m6
    pminsw                  m0, m4
    pminsw                  m1, m4
    pminsw                  m2, m4
    pminsw                  m3, m4
    mova      [dstq+strideq*0], m0
    mova      [dstq+strideq*1], m1
    mova      [dstq+strideq*2], m2
    mova      [dstq+stride3q], m3
    lea                   dstq, [dstq+strideq*4]
    dec                   cntd
    jge .loop                                   ; runs for cnt = 1, 0
    RET

; Same routine with pw_4095 as the upper clamp; shares the _10 body.
cglobal vp9_ipred_tm_8x8_12, 4, 5, 7, dst, stride, l, a
    mova                    m4, [pw_4095]
    jmp mangle(private_prefix %+ _ %+ vp9_ipred_tm_8x8_10 %+ SUFFIX).body
; TM prediction, 16x16. Every output word is
;   clamp(l[y] + a[x] - a[-1], 0, max)
; with max in m7 and zero in m6. Two rows are produced per iteration, reading
; l two words at a time from its end towards its start (cnt runs 7..0).
; The _12 entry point below loads a different max and jumps to .body.
INIT_XMM sse2
cglobal vp9_ipred_tm_16x16_10, 4, 4, 8, dst, stride, l, a
    mova                    m7, [pw_1023]       ; upper clamp
.body:
    pxor                    m6, m6              ; lower clamp (zero)
    mova                    m4, [aq]
    mova                    m5, [aq+mmsize]
    movd                    m0, [aq-4]          ; dword ending just before a[0]
    pshuflw                 m0, m0, q1111       ; broadcast its upper word, a[-1] ...
    punpcklqdq              m0, m0              ; ... to all eight words
    psubw                   m4, m0              ; m4/m5 = a[x] - a[-1]
    psubw                   m5, m0
    ; a consumed: its register becomes the loop counter
    DEFINE_ARGS dst, stride, l, cnt
    mov                   cntd, 7
.loop:
    movd                    m3, [lq+cntq*4]     ; 2 words (4 bytes) of l
    punpcklwd               m3, m3
    pshufd                  m2, m3, q1111       ; upper word of the pair -> first row
    pshufd                  m3, m3, q0000       ; lower word of the pair -> second row
    paddw                   m0, m2, m4          ; first row, left half
    paddw                   m2, m5              ; first row, right half
    paddw                   m1, m3, m4          ; second row, left half
    paddw                   m3, m5              ; second row, right half
    pmaxsw                  m0, m6
    pmaxsw                  m2, m6
    pmaxsw                  m1, m6
    pmaxsw                  m3, m6
    pminsw                  m0, m7
    pminsw                  m2, m7
    pminsw                  m1, m7
    pminsw                  m3, m7
    mova   [dstq+strideq*0+ 0], m0
    mova   [dstq+strideq*0+16], m2
    mova   [dstq+strideq*1+ 0], m1
    mova   [dstq+strideq*1+16], m3
    lea                   dstq, [dstq+strideq*2]
    dec                   cntd
    jge .loop                                   ; runs for cnt = 7 .. 0
    RET

; Same routine with pw_4095 as the upper clamp; shares the _10 body.
cglobal vp9_ipred_tm_16x16_12, 4, 4, 8, dst, stride, l, a
    mova                    m7, [pw_4095]
    jmp mangle(private_prefix %+ _ %+ vp9_ipred_tm_16x16_10 %+ SUFFIX).body
; TM prediction, 32x32. Every output word is
;   clamp(l[y] + a[x] - a[-1], 0, max)
; One row is produced per iteration, reading l one word at a time from its
; end towards its start (cnt runs 31..0).
; m0..m7 are all needed for data, so the two clamp constants are kept in
; m8/m9 on x86-64 and in 32 bytes of stack on x86-32 (reg_max / reg_min).
; The _12 entry point below loads a different max and jumps to .body.
INIT_XMM sse2
cglobal vp9_ipred_tm_32x32_10, 4, 4, 10, 32 * ARCH_X86_32, dst, stride, l, a
    mova                    m0, [pw_1023]       ; upper clamp
.body:
    pxor                    m1, m1              ; lower clamp (zero)
%if ARCH_X86_64
    ; rename so the constants live in m8/m9 and m0/m1 are free again
    SWAP 0, 8
    SWAP 1, 9
%define reg_min m9
%define reg_max m8
%else
    ; spill the constants to the stack area requested in cglobal
    mova              [rsp+ 0], m0
    mova              [rsp+16], m1
%define reg_min [rsp+16]
%define reg_max [rsp+ 0]
%endif

    mova                    m4, [aq+mmsize*0]
    mova                    m5, [aq+mmsize*1]
    mova                    m6, [aq+mmsize*2]
    mova                    m7, [aq+mmsize*3]
    movd                    m0, [aq-4]          ; dword ending just before a[0]
    pshuflw                 m0, m0, q1111       ; broadcast its upper word, a[-1] ...
    punpcklqdq              m0, m0              ; ... to all eight words
    psubw                   m4, m0              ; m4..m7 = a[x] - a[-1]
    psubw                   m5, m0
    psubw                   m6, m0
    psubw                   m7, m0
    ; a consumed: its register becomes the loop counter
    DEFINE_ARGS dst, stride, l, cnt
    mov                   cntd, 31
.loop:
    pinsrw                  m3, [lq+cntq*2], 0  ; one word of l into word 0
    punpcklwd               m3, m3
    pshufd                  m3, m3, q0000       ; broadcast it to all eight words
    paddw                   m0, m3, m4
    paddw                   m1, m3, m5
    paddw                   m2, m3, m6
    paddw                   m3, m7
    pmaxsw                  m0, reg_min
    pmaxsw                  m1, reg_min
    pmaxsw                  m2, reg_min
    pmaxsw                  m3, reg_min
    pminsw                  m0, reg_max
    pminsw                  m1, reg_max
    pminsw                  m2, reg_max
    pminsw                  m3, reg_max
    mova   [dstq+strideq*0+ 0], m0
    mova   [dstq+strideq*0+16], m1
    mova   [dstq+strideq*0+32], m2
    mova   [dstq+strideq*0+48], m3
    add                   dstq, strideq
    dec                   cntd
    jge .loop                                   ; runs for cnt = 31 .. 0
    RET

; Same routine with pw_4095 as the upper clamp; shares the _10 body.
cglobal vp9_ipred_tm_32x32_12, 4, 4, 10, 32 * ARCH_X86_32, dst, stride, l, a
    mova                    m0, [pw_4095]
    jmp mangle(private_prefix %+ _ %+ vp9_ipred_tm_32x32_10 %+ SUFFIX).body
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment