Commit 061b67fb, authored Oct 02, 2015 by Ronald S. Bultje
vp9: 10/12bpp SIMD (sse2/ssse3/avx) for directional intra prediction.
parent 26ece7a5
Showing 6 changed files with 1562 additions and 2 deletions:
libavcodec/x86/constants.c               +2    -0
libavcodec/x86/constants.h               +1    -0
libavcodec/x86/h264_qpel_10bit.asm       +2    -2
libavcodec/x86/vp9dsp_init.h             +4    -0
libavcodec/x86/vp9dsp_init_16bpp.c      +33    -0
libavcodec/x86/vp9intrapred_16bpp.asm  +1520   -0
libavcodec/x86/constants.c
@@ -85,3 +85,5 @@ DECLARE_ALIGNED(32, const ymm_reg, ff_pd_16) = { 0x0000001000000010ULL, 0x000
                                                     0x0000001000000010ULL, 0x0000001000000010ULL };
 DECLARE_ALIGNED(32, const ymm_reg, ff_pd_32)    = { 0x0000002000000020ULL, 0x0000002000000020ULL,
                                                     0x0000002000000020ULL, 0x0000002000000020ULL };
+DECLARE_ALIGNED(32, const ymm_reg, ff_pd_65535) = { 0x0000ffff0000ffffULL, 0x0000ffff0000ffffULL,
+                                                    0x0000ffff0000ffffULL, 0x0000ffff0000ffffULL };
libavcodec/x86/constants.h
@@ -65,5 +65,6 @@ extern const xmm_reg ff_ps_neg;
 extern const ymm_reg ff_pd_1;
 extern const ymm_reg ff_pd_16;
 extern const ymm_reg ff_pd_32;
+extern const ymm_reg ff_pd_65535;
 
 #endif /* AVCODEC_X86_CONSTANTS_H */
libavcodec/x86/h264_qpel_10bit.asm
@@ -26,6 +26,7 @@
 SECTION_RODATA 32
 
+cextern pd_65535
 cextern pw_1023
 %define pw_pixel_max pw_1023
 cextern pw_16
@@ -42,7 +43,6 @@ unpad: times 8 dw 16*1022/32 ; needs to be mod 16
 tap1: times 4 dw  1, -5
 tap2: times 4 dw 20, 20
 tap3: times 4 dw -5,  1
-pd_0f: times 4 dd 0xffff
 
 SECTION .text
@@ -708,7 +708,7 @@ h%1_loop_op:
     psrad      m1, 10
     psrad      m2, 10
     pslld      m2, 16
-    pand       m1, [pd_0f]
+    pand       m1, [pd_65535]
     por        m1, m2
 %if num_mmregs <= 8
     pxor       m0, m0
...
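The switch from the local pd_0f constant to the shared pd_65535 keeps the same packing trick working: after the two psrad shifts, each dword lane of m1 and m2 holds one 16-bit result, and the pand/pslld/por triple merges them into packed words. A minimal scalar sketch of that packing (my illustration, not code from the commit):

#include <stdint.h>

/* Pack two 32-bit lanes, each already reduced to a 16-bit result
 * (as the psrad by 10 above does), into one dword: r1 goes in the
 * low half (pand with pd_65535), r2 in the high half (pslld + por). */
static uint32_t pack_lanes(uint32_t r1, uint32_t r2)
{
    return (r1 & 0xffff) | (r2 << 16);
}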
libavcodec/x86/vp9dsp_init.h
@@ -165,6 +165,10 @@ filters_8tap_2d_fn(op, 4, align, bpp, bytes, opt4, f_opt)
     init_ipred_func(type, enum, 16, bpp, opt); \
     init_ipred_func(type, enum, 32, bpp, opt)
 
+#define init_ipred_funcs(type, enum, bpp, opt) \
+    init_ipred_func(type, enum, 4, bpp, opt); \
+    init_8_16_32_ipred_funcs(type, enum, bpp, opt)
+
 void ff_vp9dsp_init_10bpp_x86(VP9DSPContext *dsp);
 void ff_vp9dsp_init_12bpp_x86(VP9DSPContext *dsp);
 void ff_vp9dsp_init_16bpp_x86(VP9DSPContext *dsp);
...
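For context, init_ipred_funcs layers on the per-size init_ipred_func helper defined earlier in this header; each invocation installs one predictor per transform size into the VP9DSPContext function table. Roughly what one expansion amounts to (a sketch; the exact helper text is not shown in this hunk):

/* Sketch: init_ipred_funcs(dl, DIAG_DOWN_LEFT, 16, sse2) ends up doing
 * one assignment per transform size, along the lines of: */
dsp->intra_pred[TX_4X4][DIAG_DOWN_LEFT_PRED] = ff_vp9_ipred_dl_4x4_16_sse2;
/* ...and likewise for TX_8X8, TX_16X16 and TX_32X32. */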
libavcodec/x86/vp9dsp_init_16bpp.c
@@ -51,6 +51,18 @@ decl_ipred_fns(h, 16, mmxext, sse2);
 decl_ipred_fns(dc,      16, mmxext, sse2);
 decl_ipred_fns(dc_top,  16, mmxext, sse2);
 decl_ipred_fns(dc_left, 16, mmxext, sse2);
 
+#define decl_ipred_dir_funcs(type) \
+decl_ipred_fns(type, 16, sse2,  sse2); \
+decl_ipred_fns(type, 16, ssse3, ssse3); \
+decl_ipred_fns(type, 16, avx,   avx)
+
+decl_ipred_dir_funcs(dl);
+decl_ipred_dir_funcs(dr);
+decl_ipred_dir_funcs(vl);
+decl_ipred_dir_funcs(vr);
+decl_ipred_dir_funcs(hu);
+decl_ipred_dir_funcs(hd);
+
 #endif /* HAVE_YASM */
 
 av_cold void ff_vp9dsp_init_16bpp_x86(VP9DSPContext *dsp)
@@ -88,12 +100,33 @@ av_cold void ff_vp9dsp_init_16bpp_x86(VP9DSPContext *dsp)
         init_8_16_32_ipred_funcs(dc,      DC,      16, sse2);
         init_8_16_32_ipred_funcs(dc_top,  TOP_DC,  16, sse2);
         init_8_16_32_ipred_funcs(dc_left, LEFT_DC, 16, sse2);
+        init_ipred_funcs(dl, DIAG_DOWN_LEFT,  16, sse2);
+        init_ipred_funcs(dr, DIAG_DOWN_RIGHT, 16, sse2);
+        init_ipred_funcs(vl, VERT_LEFT,       16, sse2);
+        init_ipred_funcs(vr, VERT_RIGHT,      16, sse2);
+        init_ipred_funcs(hu, HOR_UP,          16, sse2);
+        init_ipred_funcs(hd, HOR_DOWN,        16, sse2);
+    }
+
+    if (EXTERNAL_SSSE3(cpu_flags)) {
+        init_ipred_funcs(dl, DIAG_DOWN_LEFT,  16, ssse3);
+        init_ipred_funcs(dr, DIAG_DOWN_RIGHT, 16, ssse3);
+        init_ipred_funcs(vl, VERT_LEFT,       16, ssse3);
+        init_ipred_funcs(vr, VERT_RIGHT,      16, ssse3);
+        init_ipred_funcs(hu, HOR_UP,          16, ssse3);
+        init_ipred_funcs(hd, HOR_DOWN,        16, ssse3);
     }
 
     if (EXTERNAL_AVX_FAST(cpu_flags)) {
         init_fpel_func(2, 0,  32, put, , avx);
         init_fpel_func(1, 0,  64, put, , avx);
         init_fpel_func(0, 0, 128, put, , avx);
+        init_ipred_funcs(dl, DIAG_DOWN_LEFT,  16, avx);
+        init_ipred_funcs(dr, DIAG_DOWN_RIGHT, 16, avx);
+        init_ipred_funcs(vl, VERT_LEFT,       16, avx);
+        init_ipred_funcs(vr, VERT_RIGHT,      16, avx);
+        init_ipred_funcs(hu, HOR_UP,          16, avx);
+        init_ipred_funcs(hd, HOR_DOWN,        16, avx);
     }
 
     if (EXTERNAL_AVX2(cpu_flags)) {
...
libavcodec/x86/vp9intrapred_16bpp.asm
@@ -29,14 +29,59 @@ pd_2: times 8 dd 2
...
@@ -29,14 +29,59 @@ pd_2: times 8 dd 2
pd_4
:
times
8
dd
4
pd_4
:
times
8
dd
4
pd_8
:
times
8
dd
8
pd_8
:
times
8
dd
8
pb_2to15_14_15
:
db
2
,
3
,
4
,
5
,
6
,
7
,
8
,
9
,
10
,
11
,
12
,
13
,
14
,
15
,
14
,
15
pb_4_5_8to13_8x0
:
db
4
,
5
,
8
,
9
,
10
,
11
,
12
,
13
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
pb_0to7_67x4
:
db
0
,
1
,
2
,
3
,
4
,
5
,
6
,
7
,
6
,
7
,
6
,
7
,
6
,
7
,
6
,
7
cextern
pw_1
cextern
pw_1
cextern
pw_1023
cextern
pw_1023
cextern
pw_4095
cextern
pw_4095
cextern
pd_16
cextern
pd_16
cextern
pd_32
cextern
pd_32
cextern
pd_65535
;
; FIXME most top-only functions (ddl, vl, v, dc_top) can be modified to take
; only 3 registers on x86-32, which would make it one cycle faster, but that
; would make the code quite a bit uglier...
SECTION
.
text
SECTION
.
text
%macro
SCRATCH
3
-
4
%if
ARCH_X86_64
SWAP
%1
,
%2
%if
%0
==
4
%define
reg_
%4
m%2
%endif
%else
mova
[
%3
]
,
m%1
%if
%0
==
4
%define
reg_
%4
[
%3
]
%endif
%endif
%endmacro
%macro
UNSCRATCH
3
-
4
%if
ARCH_X86_64
SWAP
%1
,
%2
%else
mova
m%1
,
[
%3
]
%endif
%if
%0
==
4
%undef
reg_
%4
%endif
%endmacro
%macro
PRELOAD
2
-
3
%if
ARCH_X86_64
mova
m%1
,
[
%2
]
%if
%0
==
3
%define
reg_
%3
m%1
%endif
%elif
%0
==
3
%define
reg_
%3
[
%2
]
%endif
%endmacro
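; Illustrative note (not part of the commit): SCRATCH parks a register
; either in one of the x86-64-only xmm registers (via SWAP) or, on
; x86-32, in the stack slot passed as the third argument, e.g.
;   SCRATCH    1, 8, rsp+0*mmsize  ; m1 -> m8 (x86-64) or [rsp] (x86-32)
;   UNSCRATCH  1, 8, rsp+0*mmsize  ; and back again
; which is why the 32x32 functions below reserve stack space only when
; ARCH_X86_32 is set.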
+
 INIT_MMX mmx
 cglobal vp9_ipred_v_4x4_16, 2, 4, 1, dst, stride, l, a
     movifnidn           aq, amp
...
@@ -613,3 +658,1478 @@ cglobal vp9_ipred_tm_32x32_10, 4, 4, 10, 32 * ARCH_X86_32, dst, stride, l, a
 cglobal vp9_ipred_tm_32x32_12, 4, 4, 10, 32 * ARCH_X86_32, dst, stride, l, a
     mova                m0, [pw_4095]
     jmp mangle(private_prefix %+ _ %+ vp9_ipred_tm_32x32_10 %+ SUFFIX).body
+
+; Directional intra prediction functions
+;
+; in the functions below, 'abcdefgh' refers to above data (sometimes simply
+; abbreviated as a[N-M]). 'stuvwxyz' refers to left data (sometimes simply
+; abbreviated as l[N-M]). * is top-left data. ABCDEFG or A[N-M] is filtered
+; above data, STUVWXYZ or L[N-M] is filtered left data, and # is filtered
+; top-left data.
+
+; left=(left+2*center+right+2)>>2
+%macro LOWPASS 3 ; left [dst], center, right
+    paddw              m%1, m%3
+    psraw              m%1, 1
+    pavgw              m%1, m%2
+%endmacro
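; Scalar model of LOWPASS (illustrative, not part of the commit). Per
; 16-bit lane, with the destination pre-loaded with the 'left' sample:
;   t   = (left + right) >> 1      ; paddw + psraw (safe for 12-bit input)
;   out = (t + center + 1) >> 1    ; pavgw rounds upward
; This matches (left + 2*center + right + 2) >> 2 whenever left+right is
; even; the intermediate floor can otherwise land one LSB lower.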
+
+; abcdefgh (src) -> bcdefghh (dst)
+; dst/src can be the same register
+%macro SHIFT_RIGHT 2-3 [pb_2to15_14_15] ; dst, src, [ssse3_shift_reg]
+%if cpuflag(ssse3)
+    pshufb              %1, %2, %3              ; abcdefgh -> bcdefghh
+%else
+    psrldq              %1, %2, 2               ; abcdefgh -> bcdefgh.
+    pshufhw             %1, %1, q2210           ; bcdefgh. -> bcdefghh
+%endif
+%endmacro
+
+; abcdefgh (src) -> bcdefghh (dst1) and cdefghhh (dst2)
+%macro SHIFT_RIGHTx2 3-4 [pb_2to15_14_15] ; dst1, dst2, src, [ssse3_shift_reg]
+%if cpuflag(ssse3)
+    pshufb              %1, %3, %4              ; abcdefgh -> bcdefghh
+    pshufb              %2, %1, %4              ; bcdefghh -> cdefghhh
+%else
+    psrldq              %1, %3, 2               ; abcdefgh -> bcdefgh.
+    psrldq              %2, %3, 4               ; abcdefgh -> cdefgh..
+    pshufhw             %1, %1, q2210           ; bcdefgh. -> bcdefghh
+    pshufhw             %2, %2, q1110           ; cdefgh.. -> cdefghhh
+%endif
+%endmacro
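; Scalar model of SHIFT_RIGHT (illustrative, not part of the commit):
; drop the first 16-bit element and repeat the last,
;   dst[i] = src[i+1] for i < 7;  dst[7] = src[7]   ; abcdefgh -> bcdefghh
; pb_2to15_14_15 encodes exactly that as a byte shuffle (bytes 2..15,
; then 14,15 again); the pre-ssse3 path gets the same result with psrldq
; plus a pshufhw that re-duplicates the top word.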
+
+%macro DL_FUNCS 0
+cglobal vp9_ipred_dl_4x4_16, 2, 4, 3, dst, stride, l, a
+    movifnidn           aq, amp
+    movu                m1, [aq]                ; abcdefgh
+    pshufhw             m0, m1, q3310           ; abcdefhh
+    SHIFT_RIGHT         m1, m1                  ; bcdefghh
+    psrldq              m2, m1, 2               ; cdefghh.
+    LOWPASS              0,  1,  2              ; BCDEFGh.
+    pshufd              m1, m0, q3321           ; DEFGh...
+    movh   [dstq+strideq*0], m0
+    movh   [dstq+strideq*2], m1
+    add               dstq, strideq
+    psrldq              m0, 2                   ; CDEFGh..
+    psrldq              m1, 2                   ; EFGh....
+    movh   [dstq+strideq*0], m0
+    movh   [dstq+strideq*2], m1
+    RET
+
+cglobal vp9_ipred_dl_8x8_16, 2, 4, 5, dst, stride, l, a
+    movifnidn           aq, amp
+    mova                m0, [aq]                ; abcdefgh
+%if cpuflag(ssse3)
+    mova                m4, [pb_2to15_14_15]
+%endif
+    SHIFT_RIGHTx2       m1, m2, m0, m4          ; bcdefghh/cdefghhh
+    LOWPASS              0,  1,  2              ; BCDEFGHh
+    shufps              m1, m0, m2, q3332       ; FGHhhhhh
+    shufps              m3, m0, m1, q2121       ; DEFGHhhh
+    DEFINE_ARGS dst, stride, stride5
+    lea           stride5q, [strideq*5]
+    mova   [dstq+strideq*0], m0
+    mova   [dstq+strideq*4], m1
+    SHIFT_RIGHT         m0, m0, m4              ; CDEFGHhh
+    pshuflw             m1, m1, q3321           ; GHhhhhhh
+    pshufd              m2, m0, q3321           ; EFGHhhhh
+    mova   [dstq+strideq*1], m0
+    mova   [dstq+stride5q ], m1
+    lea               dstq, [dstq+strideq*2]
+    pshuflw             m1, m1, q3321           ; Hhhhhhhh
+    mova   [dstq+strideq*0], m3
+    mova   [dstq+strideq*4], m1
+    pshuflw             m1, m1, q3321           ; hhhhhhhh
+    mova   [dstq+strideq*1], m2
+    mova   [dstq+stride5q ], m1
+    RET
+
+cglobal vp9_ipred_dl_16x16_16, 2, 4, 5, dst, stride, l, a
+    movifnidn           aq, amp
+    mova                m0, [aq]                ; abcdefgh
+    mova                m3, [aq+mmsize]         ; ijklmnop
+    PALIGNR             m1, m3, m0, 2, m4       ; bcdefghi
+    PALIGNR             m2, m3, m0, 4, m4       ; cdefghij
+    LOWPASS              0,  1,  2              ; BCDEFGHI
+%if cpuflag(ssse3)
+    mova                m4, [pb_2to15_14_15]
+%endif
+    SHIFT_RIGHTx2       m2, m1, m3, m4          ; jklmnopp/klmnoppp
+    LOWPASS              1,  2,  3              ; JKLMNOPp
+    pshufd              m2, m2, q3333           ; pppppppp
+    DEFINE_ARGS dst, stride, cnt
+    mov               cntd, 8
+.loop:
+    mova  [dstq+strideq*0+ 0], m0
+    mova  [dstq+strideq*0+16], m1
+    mova  [dstq+strideq*8+ 0], m1
+    mova  [dstq+strideq*8+16], m2
+    add               dstq, strideq
+%if cpuflag(avx)
+    vpalignr            m0, m1, m0, 2
+%else
+    PALIGNR             m3, m1, m0, 2, m4
+    mova                m0, m3
+%endif
+    SHIFT_RIGHT         m1, m1, m4
+    dec               cntd
+    jg .loop
+    RET
+
+cglobal vp9_ipred_dl_32x32_16, 2, 5, 7, dst, stride, l, a
+    movifnidn           aq, amp
+    mova                m0, [aq+mmsize*0]       ; abcdefgh
+    mova                m1, [aq+mmsize*1]       ; ijklmnop
+    mova                m2, [aq+mmsize*2]       ; qrstuvwx
+    mova                m3, [aq+mmsize*3]       ; yz012345
+    PALIGNR             m4, m1, m0, 2, m6
+    PALIGNR             m5, m1, m0, 4, m6
+    LOWPASS              0,  4,  5              ; BCDEFGHI
+    PALIGNR             m4, m2, m1, 2, m6
+    PALIGNR             m5, m2, m1, 4, m6
+    LOWPASS              1,  4,  5              ; JKLMNOPQ
+    PALIGNR             m4, m3, m2, 2, m6
+    PALIGNR             m5, m3, m2, 4, m6
+    LOWPASS              2,  4,  5              ; RSTUVWXY
+%if cpuflag(ssse3)
+    mova                m6, [pb_2to15_14_15]
+%endif
+    SHIFT_RIGHTx2       m4, m5, m3, m6
+    LOWPASS              3,  4,  5              ; Z0123455
+    pshufd              m4, m4, q3333           ; 55555555
+    DEFINE_ARGS dst, stride, stride8, stride24, cnt
+    mov               cntd, 8
+    lea           stride8q, [strideq*8]
+    lea          stride24q, [stride8q*3]
+.loop:
+    mova  [dstq+stride8q*0+ 0], m0
+    mova  [dstq+stride8q*0+16], m1
+    mova  [dstq+stride8q*0+32], m2
+    mova  [dstq+stride8q*0+48], m3
+    mova  [dstq+stride8q*1+ 0], m1
+    mova  [dstq+stride8q*1+16], m2
+    mova  [dstq+stride8q*1+32], m3
+    mova  [dstq+stride8q*1+48], m4
+    mova  [dstq+stride8q*2+ 0], m2
+    mova  [dstq+stride8q*2+16], m3
+    mova  [dstq+stride8q*2+32], m4
+    mova  [dstq+stride8q*2+48], m4
+    mova  [dstq+stride24q + 0], m3
+    mova  [dstq+stride24q +16], m4
+    mova  [dstq+stride24q +32], m4
+    mova  [dstq+stride24q +48], m4
+    add               dstq, strideq
+%if cpuflag(avx)
+    vpalignr            m0, m1, m0, 2
+    vpalignr            m1, m2, m1, 2
+    vpalignr            m2, m3, m2, 2
+%else
+    PALIGNR             m5, m1, m0, 2, m6
+    mova                m0, m5
+    PALIGNR             m5, m2, m1, 2, m6
+    mova                m1, m5
+    PALIGNR             m5, m3, m2, 2, m6
+    mova                m2, m5
+%endif
+    SHIFT_RIGHT         m3, m3, m6
+    dec               cntd
+    jg .loop
+    RET
+%endmacro
+
+INIT_XMM sse2
+DL_FUNCS
+INIT_XMM ssse3
+DL_FUNCS
+INIT_XMM avx
+DL_FUNCS
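; Scalar scheme of the dl (diagonal-down-left) predictor implemented
; above (illustrative, edge handling simplified):
;   dst[y][x] = (a[i] + 2*a[i+1] + a[i+2] + 2) >> 2,  with i = x + y
; where reads past the last above sample are clamped to it (the repeated
; 'h' lanes in the shuffles), so each row is the previous row shifted
; left by one sample - which is all the store loops have to do.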
+
+%macro DR_FUNCS 1 ; stack_mem_for_32x32_32bit_function
+cglobal vp9_ipred_dr_4x4_16, 4, 4, 3, dst, stride, l, a
+    movh                m0, [lq]                ; wxyz....
+    movhps              m0, [aq-2]              ; wxyz*abc
+    movd                m1, [aq+6]              ; d.......
+    PALIGNR             m1, m0, 2, m2           ; xyz*abcd
+    psrldq              m2, m1, 2               ; yz*abcd.
+    LOWPASS              0,  1,  2              ; XYZ#ABC.
+    DEFINE_ARGS dst, stride, stride3
+    lea           stride3q, [strideq*3]
+    movh  [dstq+stride3q ], m0
+    psrldq              m0, 2                   ; YZ#ABC..
+    movh  [dstq+strideq*2], m0
+    psrldq              m0, 2                   ; Z#ABC...
+    movh  [dstq+strideq*1], m0
+    psrldq              m0, 2                   ; #ABC....
+    movh  [dstq+strideq*0], m0
+    RET
+
+cglobal vp9_ipred_dr_8x8_16, 4, 4, 5, dst, stride, l, a
+    mova                m0, [lq]                ; stuvwxyz
+    movu                m1, [aq-2]              ; *abcdefg
+    mova                m2, [aq]                ; abcdefgh
+    psrldq              m3, m2, 2               ; bcdefgh.
+    LOWPASS              3,  2,  1              ; ABCDEFG.
+    PALIGNR             m1, m0, 2, m4           ; tuvwxyz*
+    PALIGNR             m2, m1, 2, m4           ; uvwxyz*a
+    LOWPASS              2,  1,  0              ; TUVWXYZ#
+    DEFINE_ARGS dst, stride, dst4, stride3
+    lea           stride3q, [strideq*3]
+    lea              dst4q, [dstq+strideq*4]
+    movhps [dstq +stride3q +0], m2
+    movh   [dstq +stride3q +8], m3
+    mova   [dst4q+stride3q +0], m2
+    PALIGNR             m1, m3, m2, 2, m0
+    psrldq              m3, 2
+    movhps [dstq +strideq*2+0], m1
+    movh   [dstq +strideq*2+8], m3
+    mova   [dst4q+strideq*2+0], m1
+    PALIGNR             m2, m3, m1, 2, m0
+    psrldq              m3, 2
+    movhps [dstq +strideq*1+0], m2
+    movh   [dstq +strideq*1+8], m3
+    mova   [dst4q+strideq*1+0], m2
+    PALIGNR             m1, m3, m2, 2, m0
+    psrldq              m3, 2
+    movhps [dstq +strideq*0+0], m1
+    movh   [dstq +strideq*0+8], m3
+    mova   [dst4q+strideq*0+0], m1
+    RET
+
+cglobal vp9_ipred_dr_16x16_16, 4, 4, 7, dst, stride, l, a
+    mova                m0, [lq]                ; klmnopqr
+    mova                m1, [lq+mmsize]         ; stuvwxyz
+    movu                m2, [aq-2]              ; *abcdefg
+    movu                m3, [aq+mmsize-2]       ; hijklmno
+    mova                m4, [aq]                ; abcdefgh
+    mova                m5, [aq+mmsize]         ; ijklmnop
+    psrldq              m6, m5, 2               ; jklmnop.
+    LOWPASS              6,  5,  3              ; IJKLMNO.
+    PALIGNR             m5, m4, 2, m3           ; bcdefghi
+    LOWPASS              5,  4,  2              ; ABCDEFGH
+    PALIGNR             m2, m1, 2, m3           ; tuvwxyz*
+    PALIGNR             m4, m2, 2, m3           ; uvwxyz*a
+    LOWPASS              4,  2,  1              ; TUVWXYZ#
+    PALIGNR             m1, m0, 2, m3           ; lmnopqrs
+    PALIGNR             m2, m1, 2, m3           ; mnopqrst
+    LOWPASS              2,  1,  0              ; LMNOPQRS
+    DEFINE_ARGS dst, stride, dst8, cnt
+    lea              dst8q, [dstq+strideq*8]
+    mov               cntd, 8
+.loop:
+    sub              dst8q, strideq
+    mova  [dst8q+strideq*0+ 0], m4
+    mova  [dst8q+strideq*0+16], m5
+    mova  [dst8q+strideq*8+ 0], m2
+    mova  [dst8q+strideq*8+16], m4
+%if cpuflag(avx)
+    vpalignr            m2, m4, m2, 2
+    vpalignr            m4, m5, m4, 2
+    vpalignr            m5, m6, m5, 2
+%else
+    PALIGNR             m0, m4, m2, 2, m1
+    mova                m2, m0
+    PALIGNR             m0, m5, m4, 2, m1
+    mova                m4, m0
+    PALIGNR             m0, m6, m5, 2, m1
+    mova                m5, m0
+%endif
+    psrldq              m6, 2
+    dec               cntd
+    jg .loop
+    RET
+
+cglobal vp9_ipred_dr_32x32_16, 4, 5, 10 + notcpuflag(ssse3), \
+                               %1 * ARCH_X86_32 * mmsize, dst, stride, l, a
+    mova                m0, [aq+mmsize*3]       ; a[24-31]
+    movu                m1, [aq+mmsize*3-2]     ; a[23-30]
+    psrldq              m2, m0, 2               ; a[25-31].
+    LOWPASS              2,  0,  1              ; A[24-30].
+    mova                m1, [aq+mmsize*2]       ; a[16-23]
+    movu                m3, [aq+mmsize*2-2]     ; a[15-22]
+    PALIGNR             m0, m1, 2, m4           ; a[17-24]
+    LOWPASS              0,  1,  3              ; A[16-23]
+    mova                m3, [aq+mmsize*1]       ; a[8-15]
+    movu                m4, [aq+mmsize*1-2]     ; a[7-14]
+    PALIGNR             m1, m3, 2, m5           ; a[9-16]
+    LOWPASS              1,  3,  4              ; A[8-15]
+    mova                m4, [aq+mmsize*0]       ; a[0-7]
+    movu                m5, [aq+mmsize*0-2]     ; *a[0-6]
+    PALIGNR             m3, m4, 2, m6           ; a[1-8]
+    LOWPASS              3,  4,  5              ; A[0-7]
+    SCRATCH              1,  8, rsp+0*mmsize
+    SCRATCH              3,  9, rsp+1*mmsize
+%if notcpuflag(ssse3)
+    SCRATCH              0, 10, rsp+2*mmsize
+%endif
+    mova                m6, [lq+mmsize*3]       ; l[24-31]
+    PALIGNR             m5, m6, 2, m0           ; l[25-31]*
+    PALIGNR             m4, m5, 2, m0           ; l[26-31]*a
+    LOWPASS              4,  5,  6              ; L[25-31]#
+    mova                m7, [lq+mmsize*2]       ; l[16-23]
+    PALIGNR             m6, m7, 2, m0           ; l[17-24]
+    PALIGNR             m5, m6, 2, m0           ; l[18-25]
+    LOWPASS              5,  6,  7              ; L[17-24]
+    mova                m1, [lq+mmsize*1]       ; l[8-15]
+    PALIGNR             m7, m1, 2, m0           ; l[9-16]
+    PALIGNR             m6, m7, 2, m0           ; l[10-17]
+    LOWPASS              6,  7,  1              ; L[9-16]
+    mova                m3, [lq+mmsize*0]       ; l[0-7]
+    PALIGNR             m1, m3, 2, m0           ; l[1-8]
+    PALIGNR             m7, m1, 2, m0           ; l[2-9]
+    LOWPASS              7,  1,  3              ; L[1-8]
+%if cpuflag(ssse3)
+%if cpuflag(avx)
+    UNSCRATCH            1,  8, rsp+0*mmsize
+%endif
+    UNSCRATCH            3,  9, rsp+1*mmsize
+%else
+    UNSCRATCH            0, 10, rsp+2*mmsize
+%endif
+    DEFINE_ARGS dst8, stride, stride8, stride24, cnt
+    lea           stride8q, [strideq*8]
+    lea          stride24q, [stride8q*3]
+    lea              dst8q, [dst8q+strideq*8]
+    mov               cntd, 8
+.loop:
+    sub              dst8q, strideq
+%if notcpuflag(avx)
+    UNSCRATCH            1,  8, rsp+0*mmsize
+%if notcpuflag(ssse3)
+    UNSCRATCH            3,  9, rsp+1*mmsize
+%endif
+%endif
+    mova  [dst8q+stride8q*0+ 0], m4
+    mova  [dst8q+stride8q*0+16], m3
+    mova  [dst8q+stride8q*0+32], m1
+    mova  [dst8q+stride8q*0+48], m0
+    mova  [dst8q+stride8q*1+ 0], m5
+    mova  [dst8q+stride8q*1+16], m4
+    mova  [dst8q+stride8q*1+32], m3
+    mova  [dst8q+stride8q*1+48], m1
+    mova  [dst8q+stride8q*2+ 0], m6
+    mova  [dst8q+stride8q*2+16], m5
+    mova  [dst8q+stride8q*2+32], m4
+    mova  [dst8q+stride8q*2+48], m3
+    mova  [dst8q+stride24q + 0], m7
+    mova  [dst8q+stride24q +16], m6
+    mova  [dst8q+stride24q +32], m5
+    mova  [dst8q+stride24q +48], m4
+%if cpuflag(avx)
+    vpalignr            m7, m6, m7, 2
+    vpalignr            m6, m5, m6, 2
+    vpalignr            m5, m4, m5, 2
+    vpalignr            m4, m3, m4, 2
+    vpalignr            m3, m1, m3, 2
+    vpalignr            m1, m0, m1, 2
+    vpalignr            m0, m2, m0, 2
+%else
+    SCRATCH              2,  8, rsp+0*mmsize
+%if notcpuflag(ssse3)
+    SCRATCH              0,  9, rsp+1*mmsize
+%endif
+    PALIGNR             m2, m6, m7, 2, m0
+    mova                m7, m2
+    PALIGNR             m2, m5, m6, 2, m0
+    mova                m6, m2
+    PALIGNR             m2, m4, m5, 2, m0
+    mova                m5, m2
+    PALIGNR             m2, m3, m4, 2, m0
+    mova                m4, m2
+    PALIGNR             m2, m1, m3, 2, m0
+    mova                m3, m2
+%if notcpuflag(ssse3)
+    UNSCRATCH            0,  9, rsp+1*mmsize
+    SCRATCH              3,  9, rsp+1*mmsize
+%endif
+    PALIGNR             m2, m0, m1, 2, m3
+    mova                m1, m2
+    UNSCRATCH            2,  8, rsp+0*mmsize
+    SCRATCH              1,  8, rsp+0*mmsize
+    PALIGNR             m1, m2, m0, 2, m3
+    mova                m0, m1
+%endif
+    psrldq              m2, 2
+    dec               cntd
+    jg .loop
+    RET
+%endmacro
+
+INIT_XMM sse2
+DR_FUNCS 3
+INIT_XMM ssse3
+DR_FUNCS 2
+INIT_XMM avx
+DR_FUNCS 2
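; Scalar scheme of the dr (diagonal-down-right) predictor (illustrative):
; build one array of 2N-1 filtered edge samples,
;   e = { filtered left column, filtered corner '#', filtered top row }
; then every row is a sliding window into it:
;   dst[y][x] = e[N - 1 - y + x]
; so the loop only shifts each register by one sample per row
; (PALIGNR/vpalignr by 2 bytes) instead of recomputing anything.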
+
+%macro VL_FUNCS 1 ; stack_mem_for_32x32_32bit_function
+cglobal vp9_ipred_vl_4x4_16, 2, 4, 3, dst, stride, l, a
+    movifnidn           aq, amp
+    movu                m0, [aq]                ; abcdefgh
+    psrldq              m1, m0, 2               ; bcdefgh.
+    psrldq              m2, m0, 4               ; cdefgh..
+    LOWPASS              2,  1,  0              ; BCDEFGH.
+    pavgw               m1, m0                  ; ABCDEFG.
+    DEFINE_ARGS dst, stride, stride3
+    lea           stride3q, [strideq*3]
+    movh   [dstq+strideq*0], m1
+    movh   [dstq+strideq*1], m2
+    psrldq              m1, 2
+    psrldq              m2, 2
+    movh   [dstq+strideq*2], m1
+    movh   [dstq+stride3q ], m2
+    RET
+
+cglobal vp9_ipred_vl_8x8_16, 2, 4, 4, dst, stride, l, a
+    movifnidn           aq, amp
+    mova                m0, [aq]                ; abcdefgh
+%if cpuflag(ssse3)
+    mova                m3, [pb_2to15_14_15]
+%endif
+    SHIFT_RIGHTx2       m1, m2, m0, m3          ; bcdefghh/cdefghhh
+    LOWPASS              2,  1,  0              ; BCDEFGHh
+    pavgw               m1, m0                  ; ABCDEFGh
+    DEFINE_ARGS dst, stride, stride3
+    lea           stride3q, [strideq*3]
+    mova   [dstq+strideq*0], m1
+    mova   [dstq+strideq*1], m2
+    SHIFT_RIGHT         m1, m1, m3
+    SHIFT_RIGHT         m2, m2, m3
+    mova   [dstq+strideq*2], m1
+    mova   [dstq+stride3q ], m2
+    lea               dstq, [dstq+strideq*4]
+    SHIFT_RIGHT         m1, m1, m3
+    SHIFT_RIGHT         m2, m2, m3
+    mova   [dstq+strideq*0], m1
+    mova   [dstq+strideq*1], m2
+    SHIFT_RIGHT         m1, m1, m3
+    SHIFT_RIGHT         m2, m2, m3
+    mova   [dstq+strideq*2], m1
+    mova   [dstq+stride3q ], m2
+    RET
+
+cglobal vp9_ipred_vl_16x16_16, 2, 4, 6, dst, stride, l, a
+    movifnidn           aq, amp
+    mova                m0, [aq]
+    mova                m1, [aq+mmsize]
+    PALIGNR             m2, m1, m0, 2, m3
+    PALIGNR             m3, m1, m0, 4, m4
+    LOWPASS              3,  2,  0
+    pavgw               m2, m0
+%if cpuflag(ssse3)
+    mova                m4, [pb_2to15_14_15]
+%endif
+    SHIFT_RIGHTx2       m5, m0, m1, m4
+    LOWPASS              0,  5,  1
+    pavgw               m1, m5
+    DEFINE_ARGS dst, stride, cnt
+    mov               cntd, 8
+.loop:
+    mova  [dstq+strideq*0+ 0], m2
+    mova  [dstq+strideq*0+16], m1
+    mova  [dstq+strideq*1+ 0], m3
+    mova  [dstq+strideq*1+16], m0
+    lea               dstq, [dstq+strideq*2]
+%if cpuflag(avx)
+    vpalignr            m2, m1, m2, 2
+    vpalignr            m3, m0, m3, 2
+%else
+    PALIGNR             m5, m1, m2, 2, m4
+    mova                m2, m5
+    PALIGNR             m5, m0, m3, 2, m4
+    mova                m3, m5
+%endif
+    SHIFT_RIGHT         m1, m1, m4
+    SHIFT_RIGHT         m0, m0, m4
+    dec               cntd
+    jg .loop
+    RET
+
+cglobal vp9_ipred_vl_32x32_16, 2, 5, 11, %1 * mmsize * ARCH_X86_32, dst, stride, l, a
+    movifnidn           aq, amp
+    mova                m0, [aq+mmsize*0]
+    mova                m1, [aq+mmsize*1]
+    mova                m2, [aq+mmsize*2]
+    PALIGNR             m6, m1, m0, 2, m5
+    PALIGNR             m7, m1, m0, 4, m5
+    LOWPASS              7,  6,  0
+    pavgw               m6, m0
+    SCRATCH              6,  8, rsp+0*mmsize
+    PALIGNR             m4, m2, m1, 2, m0
+    PALIGNR             m5, m2, m1, 4, m0
+    LOWPASS              5,  4,  1
+    pavgw               m4, m1
+    mova                m0, [aq+mmsize*3]
+    PALIGNR             m1, m0, m2, 2, m6
+    PALIGNR             m3, m0, m2, 4, m6
+    LOWPASS              3,  1,  2
+    pavgw               m2, m1
+%if cpuflag(ssse3)
+    PRELOAD             10, pb_2to15_14_15, shuf
+%endif
+    SHIFT_RIGHTx2       m6, m1, m0, reg_shuf
+    LOWPASS              1,  6,  0
+    pavgw               m0, m6
+%if ARCH_X86_64
+    pshufd              m9, m6, q3333
+%endif
+%if cpuflag(avx)
+    UNSCRATCH            6,  8, rsp+0*mmsize
+%endif
+    DEFINE_ARGS dst, stride, cnt, stride16, stride17
+    mov          stride16q, strideq
+    mov               cntd, 8
+    shl          stride16q, 4
+    lea          stride17q, [stride16q+strideq]
+    ; FIXME m8 is unused for avx, so we could save one register here for win64
+.loop:
+%if notcpuflag(avx)
+    UNSCRATCH            6,  8, rsp+0*mmsize
+%endif
+    mova  [dstq+strideq*0 + 0], m6
+    mova  [dstq+strideq*0 +16], m4
+    mova  [dstq+strideq*0 +32], m2
+    mova  [dstq+strideq*0 +48], m0
+    mova  [dstq+strideq*1 + 0], m7
+    mova  [dstq+strideq*1 +16], m5
+    mova  [dstq+strideq*1 +32], m3
+    mova  [dstq+strideq*1 +48], m1
+    mova  [dstq+stride16q + 0], m4
+    mova  [dstq+stride16q +16], m2
+    mova  [dstq+stride16q +32], m0
+%if ARCH_X86_64
+    mova  [dstq+stride16q +48], m9
+%endif
+    mova  [dstq+stride17q + 0], m5
+    mova  [dstq+stride17q +16], m3
+    mova  [dstq+stride17q +32], m1
+%if ARCH_X86_64
+    mova  [dstq+stride17q +48], m9
+%endif
+    lea               dstq, [dstq+strideq*2]
+%if cpuflag(avx)
+    vpalignr            m6, m4, m6, 2
+    vpalignr            m4, m2, m4, 2
+    vpalignr            m2, m0, m2, 2
+    vpalignr            m7, m5, m7, 2
+    vpalignr            m5, m3, m5, 2
+    vpalignr            m3, m1, m3, 2
+%else
+    SCRATCH              3,  8, rsp+0*mmsize
+%if notcpuflag(ssse3)
+    SCRATCH              1, 10, rsp+1*mmsize
+%endif
+    PALIGNR             m3, m4, m6, 2, m1
+    mova                m6, m3
+    PALIGNR             m3, m2, m4, 2, m1
+    mova                m4, m3
+    PALIGNR             m3, m0, m2, 2, m1
+    mova                m2, m3
+    PALIGNR             m3, m5, m7, 2, m1
+    mova                m7, m3
+    UNSCRATCH            3,  8, rsp+0*mmsize
+    SCRATCH              6,  8, rsp+0*mmsize
+%if notcpuflag(ssse3)
+    UNSCRATCH            1, 10, rsp+1*mmsize
+    SCRATCH              7, 10, rsp+1*mmsize
+%endif
+    PALIGNR             m6, m3, m5, 2, m7
+    mova                m5, m6
+    PALIGNR             m6, m1, m3, 2, m7
+    mova                m3, m6
+%if notcpuflag(ssse3)
+    UNSCRATCH            7, 10, rsp+1*mmsize
+%endif
+%endif
+    SHIFT_RIGHT         m1, m1, reg_shuf
+    SHIFT_RIGHT         m0, m0, reg_shuf
+    dec               cntd
+    jg .loop
+%if ARCH_X86_32
+    DEFINE_ARGS dst, stride, stride3
+    lea           stride3q, [strideq*3]
+%assign %%n 0
+%rep 4
+    mova  [dstq+strideq*0+48], m0
+    mova  [dstq+strideq*1+48], m0
+    mova  [dstq+strideq*2+48], m0
+    mova  [dstq+stride3q +48], m0
+%if %%n < 3
+    lea               dstq, [dstq+strideq*4]
+%endif
+%assign %%n (%%n+1)
+%endrep
+%endif
+    RET
+%endmacro
+
+INIT_XMM sse2
+VL_FUNCS 2
+INIT_XMM ssse3
+VL_FUNCS 1
+INIT_XMM avx
+VL_FUNCS 1
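; Scalar scheme of the vl (vertical-left) predictor (illustrative):
; rows alternate between the 2-tap and 3-tap filters of the above row,
; sliding right by one sample every two rows:
;   dst[2k  ][x] = (a[x+k] + a[x+k+1] + 1) >> 1               ; pavgw rows
;   dst[2k+1][x] = (a[x+k] + 2*a[x+k+1] + a[x+k+2] + 2) >> 2  ; LOWPASS rows
; with overreads clamped to the last above sample, as in dl.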
+
+%macro VR_FUNCS 0
+cglobal vp9_ipred_vr_4x4_16, 4, 4, 3, dst, stride, l, a
+    movu                m0, [aq-2]
+    movhps              m1, [lq]
+    PALIGNR             m0, m1, 10, m2          ; xyz*abcd
+    pslldq              m1, m0, 2               ; .xyz*abc
+    pslldq              m2, m0, 4               ; ..xyz*ab
+    LOWPASS              2,  1,  0              ; ..YZ#ABC
+    pavgw               m1, m0                  ; ....#ABC
+    DEFINE_ARGS dst, stride, stride3
+    lea           stride3q, [strideq*3]
+    movhps [dstq+strideq*0], m1
+    movhps [dstq+strideq*1], m2
+    shufps              m0, m2, m1, q3210
+%if cpuflag(ssse3)
+    pshufb              m2, [pb_4_5_8to13_8x0]
+%else
+    pshuflw             m2, m2, q2222
+    psrldq              m2, 6
+%endif
+    psrldq              m0, 6
+    movh   [dstq+strideq*2], m0
+    movh   [dstq+stride3q ], m2
+    RET
+
+cglobal vp9_ipred_vr_8x8_16, 4, 4, 5, dst, stride, l, a
+    movu                m1, [aq-2]              ; *abcdefg
+    movu                m2, [lq]                ; stuvwxyz
+    mova                m0, [aq]                ; abcdefgh
+    PALIGNR             m3, m1, m2, 14, m4      ; z*abcdef
+    LOWPASS              3,  1,  0
+    pavgw               m0, m1
+    PALIGNR             m1, m2, 2, m4           ; tuvwxyz*
+    pslldq              m4, m2, 2               ; .stuvwxy
+    LOWPASS              4,  2,  1
+    DEFINE_ARGS dst, stride, stride3
+    lea           stride3q, [strideq*3]
+    mova   [dstq+strideq*0], m0
+    mova   [dstq+strideq*1], m3
+    PALIGNR             m0, m4, 14, m1
+    pslldq              m4, 2
+    PALIGNR             m3, m4, 14, m1
+    pslldq              m4, 2
+    mova   [dstq+strideq*2], m0
+    mova   [dstq+stride3q ], m3
+    lea               dstq, [dstq+strideq*4]
+    PALIGNR             m0, m4, 14, m1
+    pslldq              m4, 2
+    PALIGNR             m3, m4, 14, m1
+    pslldq              m4, 2
+    mova   [dstq+strideq*0], m0
+    mova   [dstq+strideq*1], m3
+    PALIGNR             m0, m4, 14, m1
+    pslldq              m4, 2
+    PALIGNR             m3, m4, 14, m4
+    mova   [dstq+strideq*2], m0
+    mova   [dstq+stride3q ], m3
+    RET
+
+cglobal vp9_ipred_vr_16x16_16, 4, 4, 8, dst, stride, l, a
+    movu                m1, [aq-2]              ; *abcdefg
+    movu                m2, [aq+mmsize-2]       ; hijklmno
+    mova                m3, [aq]                ; abcdefgh
+    mova                m4, [aq+mmsize]         ; ijklmnop
+    mova                m5, [lq+mmsize]         ; stuvwxyz
+    PALIGNR             m0, m1, m5, 14, m6      ; z*abcdef
+    movu                m6, [aq+mmsize-4]       ; ghijklmn
+    LOWPASS              6,  2,  4
+    pavgw               m2, m4
+    LOWPASS              0,  1,  3
+    pavgw               m3, m1
+    PALIGNR             m1, m5, 2, m7           ; tuvwxyz*
+    movu                m7, [lq+mmsize-2]       ; rstuvwxy
+    LOWPASS              1,  5,  7
+    movu                m5, [lq+2]              ; lmnopqrs
+    pslldq              m4, m5, 2               ; .lmnopqr
+    pslldq              m7, m5, 4               ; ..lmnopq
+    LOWPASS              5,  4,  7
+    psrld               m4, m1, 16
+    psrld               m7, m5, 16
+    pand                m1, [pd_65535]
+    pand                m5, [pd_65535]
+    packssdw            m7, m4
+    packssdw            m5, m1
+    DEFINE_ARGS dst, stride, cnt
+    mov               cntd, 8
+.loop:
+    mova  [dstq+strideq*0+ 0], m3
+    mova  [dstq+strideq*0+16], m2
+    mova  [dstq+strideq*1+ 0], m0
+    mova  [dstq+strideq*1+16], m6
+    lea               dstq, [dstq+strideq*2]
+    PALIGNR             m2, m3, 14, m4
+    PALIGNR             m3, m7, 14, m4
+    pslldq              m7, 2
+    PALIGNR             m6, m0, 14, m4
+    PALIGNR             m0, m5, 14, m4
+    pslldq              m5, 2
+    dec               cntd
+    jg .loop
+    RET
+
+cglobal vp9_ipred_vr_32x32_16, 4, 5, 14, 6 * mmsize * ARCH_X86_32, dst, stride, l, a
+    movu                m0, [aq+mmsize*0-2]     ; *a[0-6]
+    movu                m1, [aq+mmsize*1-2]     ; a[7-14]
+    movu                m2, [aq+mmsize*2-2]     ; a[15-22]
+    movu                m3, [aq+mmsize*3-2]     ; a[23-30]
+    mova                m4, [aq+mmsize*3+0]     ; a[24-31]
+    movu                m5, [aq+mmsize*3-4]     ; a[22-29]
+    LOWPASS              5,  3,  4              ; A[23-30]
+    SCRATCH              5,  8, rsp+0*mmsize
+    pavgw               m3, m4
+    mova                m4, [aq+mmsize*2+0]     ; a[16-23]
+    movu                m6, [aq+mmsize*2-4]     ; a[14-21]
+    LOWPASS              6,  2,  4              ; A[15-22]
+    SCRATCH              6,  9, rsp+1*mmsize
+    pavgw               m2, m4
+    mova                m4, [aq+mmsize*1+0]     ; a[8-15]
+    movu                m7, [aq+mmsize*1-4]     ; a[6-13]
+    LOWPASS              7,  1,  4              ; A[7-14]
+    SCRATCH              7, 10, rsp+2*mmsize
+    pavgw               m1, m4
+    mova                m4, [aq+mmsize*0+0]     ; a[0-7]
+    mova                m5, [lq+mmsize*3+0]     ; l[24-31]
+    PALIGNR             m6, m0, m5, 14, m7      ; l[31]*a[0-5]
+    LOWPASS              6,  0,  4              ; #A[0-6]
+    SCRATCH              6, 11, rsp+3*mmsize
+    pavgw               m4, m0
+    PALIGNR             m0, m5, 2, m7           ; l[25-31]*
+    movu                m7, [lq+mmsize*3-2]     ; l[23-30]
+    LOWPASS              0,  5,  7              ; L[24-31]
+    movu                m5, [lq+mmsize*2-2]     ; l[15-22]
+    mova                m7, [lq+mmsize*2+0]     ; l[16-23]
+    movu                m6, [lq+mmsize*2+2]     ; l[17-24]
+    LOWPASS              5,  7,  6              ; L[16-23]
+    psrld               m7, m0, 16
+    psrld               m6, m5, 16
+    pand                m0, [pd_65535]
+    pand                m5, [pd_65535]
+    packssdw            m6, m7
+    packssdw            m5, m0
+    SCRATCH              5, 12, rsp+4*mmsize
+    SCRATCH              6, 13, rsp+5*mmsize
+    movu                m6, [lq+mmsize*1-2]     ; l[7-14]
+    mova                m0, [lq+mmsize*1+0]     ; l[8-15]
+    movu                m5, [lq+mmsize*1+2]     ; l[9-16]
+    LOWPASS              6,  0,  5              ; L[8-15]
+    movu                m0, [lq+mmsize*0+2]     ; l[1-8]
+    pslldq              m5, m0, 2               ; .l[1-7]
+    pslldq              m7, m0, 4               ; ..l[1-6]
+    LOWPASS              0,  5,  7
+    psrld               m5, m6, 16
+    psrld               m7, m0, 16
+    pand                m6, [pd_65535]
+    pand                m0, [pd_65535]
+    packssdw            m7, m5
+    packssdw            m0, m6
+    UNSCRATCH            6, 13, rsp+5*mmsize
+    DEFINE_ARGS dst, stride, stride16, cnt, stride17
+    mov          stride16q, strideq
+    mov               cntd, 8
+    shl          stride16q, 4
+%if ARCH_X86_64
+    lea          stride17q, [stride16q+strideq]
+%endif
+.loop:
+    mova  [dstq+strideq*0 + 0], m4
+    mova  [dstq+strideq*0 +16], m1
+    mova  [dstq+strideq*0 +32], m2
+    mova  [dstq+strideq*0 +48], m3
+%if ARCH_X86_64
+    mova  [dstq+strideq*1 + 0], m11
+    mova  [dstq+strideq*1 +16], m10
+    mova  [dstq+strideq*1 +32], m9
+    mova  [dstq+strideq*1 +48], m8
+%endif
+    mova  [dstq+stride16q + 0], m6
+    mova  [dstq+stride16q +16], m4
+    mova  [dstq+stride16q +32], m1
+    mova  [dstq+stride16q +48], m2
+%if ARCH_X86_64
+    mova  [dstq+stride17q + 0], m12
+    mova  [dstq+stride17q +16], m11
+    mova  [dstq+stride17q +32], m10
+    mova  [dstq+stride17q +48], m9
+%endif
+    lea               dstq, [dstq+strideq*2]
+    PALIGNR             m3, m2, 14, m5
+    PALIGNR             m2, m1, 14, m5
+    PALIGNR             m1, m4, 14, m5
+    PALIGNR             m4, m6, 14, m5
+    PALIGNR             m6, m7, 14, m5
+    pslldq              m7, 2
+%if ARCH_X86_64
+    PALIGNR             m8, m9,  14, m5
+    PALIGNR             m9, m10, 14, m5
+    PALIGNR            m10, m11, 14, m5
+    PALIGNR            m11, m12, 14, m5
+    PALIGNR            m12, m0,  14, m5
+    pslldq              m0, 2
+%endif
+    dec               cntd
+    jg .loop
+%if ARCH_X86_32
+    UNSCRATCH            5, 12, rsp+4*mmsize
+    UNSCRATCH            4, 11, rsp+3*mmsize
+    UNSCRATCH            3, 10, rsp+2*mmsize
+    UNSCRATCH            2,  9, rsp+1*mmsize
+    UNSCRATCH            1,  8, rsp+0*mmsize
+    mov               dstq, dstm
+    mov               cntd, 8
+    add               dstq, strideq
+.loop2:
+    mova  [dstq+strideq*0 + 0], m4
+    mova  [dstq+strideq*0 +16], m3
+    mova  [dstq+strideq*0 +32], m2
+    mova  [dstq+strideq*0 +48], m1
+    mova  [dstq+stride16q + 0], m5
+    mova  [dstq+stride16q +16], m4
+    mova  [dstq+stride16q +32], m3
+    mova  [dstq+stride16q +48], m2
+    lea               dstq, [dstq+strideq*2]
+    PALIGNR             m1, m2, 14, m6
+    PALIGNR             m2, m3, 14, m6
+    PALIGNR             m3, m4, 14, m6
+    PALIGNR             m4, m5, 14, m6
+    PALIGNR             m5, m0, 14, m6
+    pslldq              m0, 2
+    dec               cntd
+    jg .loop2
+%endif
+    RET
+%endmacro
+
+INIT_XMM sse2
+VR_FUNCS
+INIT_XMM ssse3
+VR_FUNCS
+INIT_XMM avx
+VR_FUNCS
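; Illustrative note (not part of the commit): vr steps its left-edge
; extension by two samples per row, so the psrld/pand/packssdw blocks
; above first split a filtered row into its even and odd 16-bit lanes,
; with pd_65535 masking the low word of each dword lane; the row loop
; then only needs one PALIGNR by a single sample per output row.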
+
+%macro HU_FUNCS 1 ; stack_mem_for_32x32_32bit_function
+cglobal vp9_ipred_hu_4x4_16, 3, 3, 3, dst, stride, l, a
+    movh                m0, [lq]                ; abcd
+%if cpuflag(ssse3)
+    pshufb              m0, [pb_0to7_67x4]      ; abcddddd
+%else
+    punpcklqdq          m0, m0
+    pshufhw             m0, m0, q3333           ; abcddddd
+%endif
+    psrldq              m1, m0, 2               ; bcddddd.
+    psrldq              m2, m0, 4               ; cddddd..
+    LOWPASS              2,  1,  0              ; BCDddd..
+    pavgw               m1, m0                  ; abcddddd
+    SBUTTERFLY          wd,  1,  2,  0          ; aBbCcDdd, dddddddd
+    PALIGNR             m2, m1, 4, m0           ; bCcDdddd
+    DEFINE_ARGS dst, stride, stride3
+    lea           stride3q, [strideq*3]
+    movh   [dstq+strideq*0], m1                 ; aBbC
+    movh   [dstq+strideq*1], m2                 ; bCcD
+    movhps [dstq+strideq*2], m1                 ; cDdd
+    movhps [dstq+stride3q ], m2                 ; dddd
+    RET
+
+cglobal vp9_ipred_hu_8x8_16, 3, 3, 4, dst, stride, l, a
+    mova                m0, [lq]
+%if cpuflag(ssse3)
+    mova                m3, [pb_2to15_14_15]
+%endif
+    SHIFT_RIGHTx2       m1, m2, m0, m3
+    LOWPASS              2,  1,  0
+    pavgw               m1, m0
+    SBUTTERFLY          wd,  1,  2,  0
+    shufps              m0, m1, m2, q1032
+    pshufd              m3, m2, q3332
+    DEFINE_ARGS dst, stride, stride3
+    lea           stride3q, [strideq*3]
+    mova  [dstq+strideq *0], m1
+    mova  [dstq+strideq *2], m0
+    mova  [dstq+strideq *4], m2
+    mova  [dstq+stride3q*2], m3
+    add               dstq, strideq
+%if cpuflag(avx)
+    vpalignr            m1, m2, m1, 4
+%else
+    PALIGNR             m0, m2, m1, 4, m3
+    mova                m1, m0
+%endif
+    pshufd              m2, m2, q3321
+    shufps              m0, m1, m2, q1032
+    pshufd              m3, m2, q3332
+    mova  [dstq+strideq *0], m1
+    mova  [dstq+strideq *2], m0
+    mova  [dstq+strideq *4], m2
+    mova  [dstq+stride3q*2], m3
+    RET
+
+cglobal vp9_ipred_hu_16x16_16, 3, 4, 6 + notcpuflag(ssse3), dst, stride, l, a
+    mova                m0, [lq]
+    mova                m3, [lq+mmsize]
+    movu                m1, [lq+2]
+    movu                m2, [lq+4]
+    LOWPASS              2,  1,  0
+    pavgw               m1, m0
+    SBUTTERFLY          wd,  1,  2,  0
+%if cpuflag(ssse3)
+    mova                m5, [pb_2to15_14_15]
+%endif
+    SHIFT_RIGHTx2       m0, m4, m3, m5
+    LOWPASS              4,  0,  3
+    pavgw               m3, m0
+    SBUTTERFLY          wd,  3,  4,  5
+    pshufd              m0, m0, q3333
+    DEFINE_ARGS dst, stride, stride3, cnt
+    lea           stride3q, [strideq*3]
+    mov               cntd, 4
+.loop:
+    mova  [dstq+strideq *0+ 0], m1
+    mova  [dstq+strideq *0+16], m2
+    mova  [dstq+strideq *4+ 0], m2
+    mova  [dstq+strideq *4+16], m3
+    mova  [dstq+strideq *8+ 0], m3
+    mova  [dstq+strideq *8+16], m4
+    mova  [dstq+stride3q*4+ 0], m4
+    mova  [dstq+stride3q*4+16], m0
+    add               dstq, strideq
+%if cpuflag(avx)
+    vpalignr            m1, m2, m1, 4
+    vpalignr            m2, m3, m2, 4
+    vpalignr            m3, m4, m3, 4
+    vpalignr            m4, m0, m4, 4
+%else
+    PALIGNR             m5, m2, m1, 4, m6
+    mova                m1, m5
+    PALIGNR             m5, m3, m2, 4, m6
+    mova                m2, m5
+    PALIGNR             m5, m4, m3, 4, m6
+    mova                m3, m5
+    PALIGNR             m5, m0, m4, 4, m6
+    mova                m4, m5
+%endif
+    dec               cntd
+    jg .loop
+    RET
+
+cglobal vp9_ipred_hu_32x32_16, 3, 7, 10 + notcpuflag(ssse3), \
+                               %1 * mmsize * ARCH_X86_32, dst, stride, l, a
+    mova                m2, [lq+mmsize*0+0]
+    movu                m1, [lq+mmsize*0+2]
+    movu                m0, [lq+mmsize*0+4]
+    LOWPASS              0,  1,  2
+    pavgw               m1, m2
+    SBUTTERFLY          wd,  1,  0,  2
+    SCRATCH              1,  8, rsp+0*mmsize
+    mova                m4, [lq+mmsize*1+0]
+    movu                m3, [lq+mmsize*1+2]
+    movu                m2, [lq+mmsize*1+4]
+    LOWPASS              2,  3,  4
+    pavgw               m3, m4
+    SBUTTERFLY          wd,  3,  2,  4
+    mova                m6, [lq+mmsize*2+0]
+    movu                m5, [lq+mmsize*2+2]
+    movu                m4, [lq+mmsize*2+4]
+    LOWPASS              4,  5,  6
+    pavgw               m5, m6
+    SBUTTERFLY          wd,  5,  4,  6
+    mova                m7, [lq+mmsize*3+0]
+    SCRATCH              0,  9, rsp+1*mmsize
+%if cpuflag(ssse3)
+    mova                m0, [pb_2to15_14_15]
+%endif
+    SHIFT_RIGHTx2       m1, m6, m7, m0
+    LOWPASS              6,  1,  7
+    pavgw               m7, m1
+    SBUTTERFLY          wd,  7,  6,  0
+    pshufd              m1, m1, q3333
+    UNSCRATCH            0,  9, rsp+1*mmsize
+    DEFINE_ARGS dst, stride, cnt, stride3, stride4, stride20, stride28
+    lea           stride3q, [strideq*3]
+    lea           stride4q, [strideq*4]
+    lea          stride28q, [stride4q*8]
+    lea          stride20q, [stride4q*5]
+    sub          stride28q, stride4q
+    mov               cntd, 4
+.loop:
+%if ARCH_X86_64
+    SWAP                 1,  8
+%else
+    mova  [rsp+1*mmsize], m1
+    mova                m1, [rsp+0*mmsize]
+%endif
+    mova  [dstq+strideq *0+ 0], m1
+    mova  [dstq+strideq *0+16], m0
+    mova  [dstq+strideq *0+32], m3
+    mova  [dstq+strideq *0+48], m2
+    mova  [dstq+stride4q*1+ 0], m0
+    mova  [dstq+stride4q*1+16], m3
+    mova  [dstq+stride4q*1+32], m2
+    mova  [dstq+stride4q*1+48], m5
+    mova  [dstq+stride4q*2+ 0], m3
+    mova  [dstq+stride4q*2+16], m2
+    mova  [dstq+stride4q*2+32], m5
+    mova  [dstq+stride4q*2+48], m4
+%if cpuflag(avx)
+    vpalignr            m1, m0, m1, 4
+    vpalignr            m0, m3, m0, 4
+    vpalignr            m3, m2, m3, 4
+%else
+    SCRATCH              6,  9, rsp+2*mmsize
+%if notcpuflag(ssse3)
+    SCRATCH              7, 10, rsp+3*mmsize
+%endif
+    PALIGNR             m6, m0, m1, 4, m7
+    mova                m1, m6
+    PALIGNR             m6, m3, m0, 4, m7
+    mova                m0, m6
+    PALIGNR             m6, m2, m3, 4, m7
+    mova                m3, m6
+    UNSCRATCH            6,  9, rsp+2*mmsize
+    SCRATCH              0,  9, rsp+2*mmsize
+%if notcpuflag(ssse3)
+    UNSCRATCH            7, 10, rsp+3*mmsize
+    SCRATCH              3, 10, rsp+3*mmsize
+%endif
+%endif
+%if ARCH_X86_64
+    SWAP                 1,  8
+%else
+    mova  [rsp+0*mmsize], m1
+    mova                m1, [rsp+1*mmsize]
+%endif
+    mova  [dstq+stride3q*4+ 0], m2
+    mova  [dstq+stride3q*4+16], m5
+    mova  [dstq+stride3q*4+32], m4
+    mova  [dstq+stride3q*4+48], m7
+    mova  [dstq+stride4q*4+ 0], m5
+    mova  [dstq+stride4q*4+16], m4
+    mova  [dstq+stride4q*4+32], m7
+    mova  [dstq+stride4q*4+48], m6
+    mova  [dstq+stride20q + 0], m4
+    mova  [dstq+stride20q +16], m7
+    mova  [dstq+stride20q +32], m6
+    mova  [dstq+stride20q +48], m1
+    mova  [dstq+stride3q*8+ 0], m7
+    mova  [dstq+stride3q*8+16], m6
+    mova  [dstq+stride3q*8+32], m1
+    mova  [dstq+stride3q*8+48], m1
+    mova  [dstq+stride28q + 0], m6
+    mova  [dstq+stride28q +16], m1
+    mova  [dstq+stride28q +32], m1
+    mova  [dstq+stride28q +48], m1
+%if cpuflag(avx)
+    vpalignr            m2, m5, m2, 4
+    vpalignr            m5, m4, m5, 4
+    vpalignr            m4, m7, m4, 4
+    vpalignr            m7, m6, m7, 4
+    vpalignr            m6, m1, m6, 4
+%else
+    PALIGNR             m0, m5, m2, 4, m3
+    mova                m2, m0
+    PALIGNR             m0, m4, m5, 4, m3
+    mova                m5, m0
+    PALIGNR             m0, m7, m4, 4, m3
+    mova                m4, m0
+    PALIGNR             m0, m6, m7, 4, m3
+    mova                m7, m0
+    PALIGNR             m0, m1, m6, 4, m3
+    mova                m6, m0
+    UNSCRATCH            0,  9, rsp+2*mmsize
+%if notcpuflag(ssse3)
+    UNSCRATCH            3, 10, rsp+3*mmsize
+%endif
+%endif
+    add               dstq, strideq
+    dec               cntd
+    jg .loop
+    RET
+%endmacro
+
+INIT_XMM sse2
+HU_FUNCS 4
+INIT_XMM ssse3
+HU_FUNCS 3
+INIT_XMM avx
+HU_FUNCS 2
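; Scalar scheme of the hu (horizontal-up) predictor (illustrative):
; the same 2-tap/3-tap filter pair, but walking down the left column and
; interleaved horizontally (the SBUTTERFLY wd step):
;   dst[y][2k  ] = (l[y+k] + l[y+k+1] + 1) >> 1
;   dst[y][2k+1] = (l[y+k] + 2*l[y+k+1] + l[y+k+2] + 2) >> 2
; with everything past the last left sample flattening to that sample.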
+
+%macro HD_FUNCS 0
+cglobal vp9_ipred_hd_4x4_16, 4, 4, 4, dst, stride, l, a
+    movh                m0, [lq]
+    movhps              m0, [aq-2]
+    psrldq              m1, m0, 2
+    psrldq              m2, m0, 4
+    LOWPASS              2,  1,  0
+    pavgw               m1, m0
+    punpcklwd           m1, m2
+    DEFINE_ARGS dst, stride, stride3
+    lea           stride3q, [strideq*3]
+    movh   [dstq+stride3q ], m1
+    movhps [dstq+strideq*1], m1
+    movhlps             m2, m2
+    PALIGNR             m2, m1, 4, m0
+    movh   [dstq+strideq*2], m2
+    movhps [dstq+strideq*0], m2
+    RET
+
+cglobal vp9_ipred_hd_8x8_16, 4, 4, 5, dst, stride, l, a
+    mova                m0, [lq]
+    movu                m1, [aq-2]
+    PALIGNR             m2, m1, m0, 2, m3
+    PALIGNR             m3, m1, m0, 4, m4
+    LOWPASS              3,  2,  0
+    pavgw               m2, m0
+    SBUTTERFLY          wd,  2,  3,  0
+    psrldq              m0, m1, 2
+    psrldq              m4, m1, 4
+    LOWPASS              1,  0,  4
+    DEFINE_ARGS dst8, mstride, cnt
+    lea              dst8q, [dst8q+mstrideq*8]
+    neg           mstrideq
+    mov               cntd, 4
+.loop:
+    add              dst8q, mstrideq
+    mova  [dst8q+mstrideq*0], m2
+    mova  [dst8q+mstrideq*4], m3
+%if cpuflag(avx)
+    vpalignr            m2, m3, m2, 4
+    vpalignr            m3, m1, m3, 4
+%else
+    PALIGNR             m0, m3, m2, 4, m4
+    mova                m2, m0
+    PALIGNR             m0, m1, m3, 4, m4
+    mova                m3, m0
+%endif
+    psrldq              m1, 4
+    dec               cntd
+    jg .loop
+    RET
+
+cglobal vp9_ipred_hd_16x16_16, 4, 4, 8, dst, stride, l, a
+    mova                m2, [lq]
+    movu                m1, [lq+2]
+    movu                m0, [lq+4]
+    LOWPASS              0,  1,  2
+    pavgw               m1, m2
+    mova                m4, [lq+mmsize]
+    movu                m5, [aq-2]
+    PALIGNR             m3, m5, m4, 2, m6
+    PALIGNR             m2, m5, m4, 4, m6
+    LOWPASS              2,  3,  4
+    pavgw               m3, m4
+    SBUTTERFLY          wd,  1,  0,  4
+    SBUTTERFLY          wd,  3,  2,  4
+    mova                m6, [aq]
+    movu                m4, [aq+2]
+    LOWPASS              4,  6,  5
+    movu                m5, [aq+mmsize-2]
+    psrldq              m6, m5, 2
+    psrldq              m7, m5, 4
+    LOWPASS              5,  6,  7
+    DEFINE_ARGS dst, mstride, mstride3, cnt
+    lea               dstq, [dstq+mstrideq*8]
+    lea               dstq, [dstq+mstrideq*8]
+    neg           mstrideq
+    lea          mstride3q, [mstrideq*3]
+    mov               cntd, 4
+.loop:
+    add               dstq, mstrideq
+    mova  [dstq+mstride3q*4+ 0], m2
+    mova  [dstq+mstride3q*4+16], m4
+    mova  [dstq+mstrideq *8+ 0], m3
+    mova  [dstq+mstrideq *8+16], m2
+    mova  [dstq+mstrideq *4+ 0], m0
+    mova  [dstq+mstrideq *4+16], m3
+    mova  [dstq+mstrideq *0+ 0], m1
+    mova  [dstq+mstrideq *0+16], m0
+%if cpuflag(avx)
+    vpalignr            m1, m0, m1, 4
+    vpalignr            m0, m3, m0, 4
+    vpalignr            m3, m2, m3, 4
+    vpalignr            m2, m4, m2, 4
+    vpalignr            m4, m5, m4, 4
+%else
+    PALIGNR             m6, m0, m1, 4, m7
+    mova                m1, m6
+    PALIGNR             m6, m3, m0, 4, m7
+    mova                m0, m6
+    PALIGNR             m6, m2, m3, 4, m7
+    mova                m3, m6
+    PALIGNR             m6, m4, m2, 4, m7
+    mova                m2, m6
+    PALIGNR             m6, m5, m4, 4, m7
+    mova                m4, m6
+%endif
+    psrldq              m5, 4
+    dec               cntd
+    jg .loop
+    RET
+
+cglobal vp9_ipred_hd_32x32_16, 4, 4 + 3 * ARCH_X86_64, 14, \
+                               10 * mmsize * ARCH_X86_32, dst, stride, l, a
+    mova                m2, [lq+mmsize*0+0]
+    movu                m1, [lq+mmsize*0+2]
+    movu                m0, [lq+mmsize*0+4]
+    LOWPASS              0,  1,  2
+    pavgw               m1, m2
+    SBUTTERFLY          wd,  1,  0,  2
+    mova                m4, [lq+mmsize*1+0]
+    movu                m3, [lq+mmsize*1+2]
+    movu                m2, [lq+mmsize*1+4]
+    LOWPASS              2,  3,  4
+    pavgw               m3, m4
+    SBUTTERFLY          wd,  3,  2,  4
+    SCRATCH              0,  8, rsp+0*mmsize
+    SCRATCH              1,  9, rsp+1*mmsize
+    SCRATCH              2, 10, rsp+2*mmsize
+    SCRATCH              3, 11, rsp+3*mmsize
+    mova                m6, [lq+mmsize*2+0]
+    movu                m5, [lq+mmsize*2+2]
+    movu                m4, [lq+mmsize*2+4]
+    LOWPASS              4,  5,  6
+    pavgw               m5, m6
+    SBUTTERFLY          wd,  5,  4,  6
+    mova                m0, [lq+mmsize*3+0]
+    movu                m1, [aq+mmsize*0-2]
+    PALIGNR             m7, m1, m0, 2, m2
+    PALIGNR             m6, m1, m0, 4, m2
+    LOWPASS              6,  7,  0
+    pavgw               m7, m0
+    SBUTTERFLY          wd,  7,  6,  0
+    mova                m2, [aq+mmsize*0+0]
+    movu                m0, [aq+mmsize*0+2]
+    LOWPASS              0,  2,  1
+    movu                m1, [aq+mmsize*1-2]
+    mova                m2, [aq+mmsize*1+0]
+    movu                m3, [aq+mmsize*1+2]
+    LOWPASS              1,  2,  3
+    SCRATCH              6, 12, rsp+6*mmsize
+    SCRATCH              7, 13, rsp+7*mmsize
+    movu                m2, [aq+mmsize*2-2]
+    mova                m3, [aq+mmsize*2+0]
+    movu                m6, [aq+mmsize*2+2]
+    LOWPASS              2,  3,  6
+    movu                m3, [aq+mmsize*3-2]
+    psrldq              m6, m3, 2
+    psrldq              m7, m3, 4
+    LOWPASS              3,  6,  7
+    UNSCRATCH            6, 12, rsp+6*mmsize
+    UNSCRATCH            7, 13, rsp+7*mmsize
+%if ARCH_X86_32
+    mova  [rsp+4*mmsize], m4
+    mova  [rsp+5*mmsize], m5
+    ; we already backed up m6/m7 earlier on x86-32 in SCRATCH, so we don't need
+    ; to do it again here
+%endif
+    DEFINE_ARGS dst, stride, cnt, stride3, stride4, stride20, stride28
+    mov               cntd, 4
+    lea           stride3q, [strideq*3]
+%if ARCH_X86_64
+    lea           stride4q, [strideq*4]
+    lea          stride28q, [stride4q*8]
+    lea          stride20q, [stride4q*5]
+    sub          stride28q, stride4q
+%endif
+    add               dstq, stride3q
+    ; x86-32 doesn't have enough registers, so on that platform, we split
+    ; the loop in 2... Otherwise you spend most of the loop (un)scratching
+.loop:
+%if ARCH_X86_64
+    mova  [dstq+stride28q + 0], m9
+    mova  [dstq+stride28q +16], m8
+    mova  [dstq+stride28q +32], m11
+    mova  [dstq+stride28q +48], m10
+    mova  [dstq+stride3q*8+ 0], m8
+    mova  [dstq+stride3q*8+16], m11
+    mova  [dstq+stride3q*8+32], m10
+    mova  [dstq+stride3q*8+48], m5
+    mova  [dstq+stride20q + 0], m11
+    mova  [dstq+stride20q +16], m10
+    mova  [dstq+stride20q +32], m5
+    mova  [dstq+stride20q +48], m4
+    mova  [dstq+stride4q*4+ 0], m10
+    mova  [dstq+stride4q*4+16], m5
+    mova  [dstq+stride4q*4+32], m4
+    mova  [dstq+stride4q*4+48], m7
+%endif
+    mova  [dstq+stride3q*4+ 0], m5
+    mova  [dstq+stride3q*4+16], m4
+    mova  [dstq+stride3q*4+32], m7
+    mova  [dstq+stride3q*4+48], m6
+    mova  [dstq+strideq *8+ 0], m4
+    mova  [dstq+strideq *8+16], m7
+    mova  [dstq+strideq *8+32], m6
+    mova  [dstq+strideq *8+48], m0
+    mova  [dstq+strideq *4+ 0], m7
+    mova  [dstq+strideq *4+16], m6
+    mova  [dstq+strideq *4+32], m0
+    mova  [dstq+strideq *4+48], m1
+    mova  [dstq+strideq *0+ 0], m6
+    mova  [dstq+strideq *0+16], m0
+    mova  [dstq+strideq *0+32], m1
+    mova  [dstq+strideq *0+48], m2
+    sub               dstq, strideq
+%if cpuflag(avx)
+%if ARCH_X86_64
+    vpalignr            m9, m8,  m9,  4
+    vpalignr            m8, m11, m8,  4
+    vpalignr           m11, m10, m11, 4
+    vpalignr           m10, m5,  m10, 4
+%endif
+    vpalignr            m5, m4, m5, 4
+    vpalignr            m4, m7, m4, 4
+    vpalignr            m7, m6, m7, 4
+    vpalignr            m6, m0, m6, 4
+    vpalignr            m0, m1, m0, 4
+    vpalignr            m1, m2, m1, 4
+    vpalignr            m2, m3, m2, 4
+%else
+%if ARCH_X86_64
+    PALIGNR            m12, m8,  m9,  4, m13
+    mova                m9, m12
+    PALIGNR            m12, m11, m8,  4, m13
+    mova                m8, m12
+    PALIGNR            m12, m10, m11, 4, m13
+    mova               m11, m12
+    PALIGNR            m12, m5,  m10, 4, m13
+    mova               m10, m12
+%endif
+    SCRATCH              3, 12, rsp+8*mmsize, sh
+%if notcpuflag(ssse3)
+    SCRATCH              2, 13, rsp+9*mmsize
+%endif
+    PALIGNR             m3, m4, m5, 4, m2
+    mova                m5, m3
+    PALIGNR             m3, m7, m4, 4, m2
+    mova                m4, m3
+    PALIGNR             m3, m6, m7, 4, m2
+    mova                m7, m3
+    PALIGNR             m3, m0, m6, 4, m2
+    mova                m6, m3
+    PALIGNR             m3, m1, m0, 4, m2
+    mova                m0, m3
+%if notcpuflag(ssse3)
+    UNSCRATCH            2, 13, rsp+9*mmsize
+    SCRATCH              0, 13, rsp+9*mmsize
+%endif
+    PALIGNR             m3, m2, m1, 4, m0
+    mova                m1, m3
+    PALIGNR             m3, reg_sh, m2, 4, m0
+    mova                m2, m3
+%if notcpuflag(ssse3)
+    UNSCRATCH            0, 13, rsp+9*mmsize
+%endif
+    UNSCRATCH            3, 12, rsp+8*mmsize, sh
+%endif
+    psrldq              m3, 4
+    dec               cntd
+    jg .loop
+%if ARCH_X86_32
+    UNSCRATCH            0,  8, rsp+0*mmsize
+    UNSCRATCH            1,  9, rsp+1*mmsize
+    UNSCRATCH            2, 10, rsp+2*mmsize
+    UNSCRATCH            3, 11, rsp+3*mmsize
+    mova                m4, [rsp+4*mmsize]
+    mova                m5, [rsp+5*mmsize]
+    mova                m6, [rsp+6*mmsize]
+    mova                m7, [rsp+7*mmsize]
+    DEFINE_ARGS dst, stride, stride5, stride3
+    lea           stride5q, [strideq*5]
+    lea               dstq, [dstq+stride5q*4]
+    DEFINE_ARGS dst, stride, cnt, stride3
+    mov               cntd, 4
+.loop_2:
+    mova  [dstq+stride3q*4+ 0], m1
+    mova  [dstq+stride3q*4+16], m0
+    mova  [dstq+stride3q*4+32], m3
+    mova  [dstq+stride3q*4+48], m2
+    mova  [dstq+strideq *8+ 0], m0
+    mova  [dstq+strideq *8+16], m3
+    mova  [dstq+strideq *8+32], m2
+    mova  [dstq+strideq *8+48], m5
+    mova  [dstq+strideq *4+ 0], m3
+    mova  [dstq+strideq *4+16], m2
+    mova  [dstq+strideq *4+32], m5
+    mova  [dstq+strideq *4+48], m4
+    mova  [dstq+strideq *0+ 0], m2
+    mova  [dstq+strideq *0+16], m5
+    mova  [dstq+strideq *0+32], m4
+    mova  [dstq+strideq *0+48], m7
+    sub               dstq, strideq
+%if cpuflag(avx)
+    vpalignr            m1, m0, m1, 4
+    vpalignr            m0, m3, m0, 4
+    vpalignr            m3, m2, m3, 4
+    vpalignr            m2, m5, m2, 4
+    vpalignr            m5, m4, m5, 4
+    vpalignr            m4, m7, m4, 4
+    vpalignr            m7, m6, m7, 4
+%else
+    SCRATCH              6, 12, rsp+8*mmsize, sh
+%if notcpuflag(ssse3)
+    SCRATCH              7, 13, rsp+9*mmsize
+%endif
+    PALIGNR             m6, m0, m1, 4, m7
+    mova                m1, m6
+    PALIGNR             m6, m3, m0, 4, m7
+    mova                m0, m6
+    PALIGNR             m6, m2, m3, 4, m7
+    mova                m3, m6
+    PALIGNR             m6, m5, m2, 4, m7
+    mova                m2, m6
+    PALIGNR             m6, m4, m5, 4, m7
+    mova                m5, m6
+%if notcpuflag(ssse3)
+    UNSCRATCH            7, 13, rsp+9*mmsize
+    SCRATCH              5, 13, rsp+9*mmsize
+%endif
+    PALIGNR             m6, m7, m4, 4, m5
+    mova                m4, m6
+    PALIGNR             m6, reg_sh, m7, 4, m5
+    mova                m7, m6
+%if notcpuflag(ssse3)
+    UNSCRATCH            5, 13, rsp+9*mmsize
+%endif
+    UNSCRATCH            6, 12, rsp+8*mmsize, sh
+%endif
+    psrldq              m6, 4
+    dec               cntd
+    jg .loop_2
+%endif
+    RET
+%endmacro
+
+INIT_XMM sse2
+HD_FUNCS
+INIT_XMM ssse3
+HD_FUNCS
+INIT_XMM avx
+HD_FUNCS
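Once ff_vp9dsp_init_16bpp_x86() has run, the decoder reaches these functions through the VP9DSPContext table. A hedged sketch of the call path (decoder plumbing simplified; dst, stride, left and above are placeholders for 16-bit pixel buffers, and the argument order follows the dst, stride, l, a signatures above):

#include "libavcodec/vp9dsp.h"

/* Sketch: in the real decoder this dispatch happens inside
 * ff_vp9dsp_init() and the block reconstruction code. */
VP9DSPContext dsp;
ff_vp9dsp_init_16bpp_x86(&dsp);  /* fills in sse2/ssse3/avx entries */
dsp.intra_pred[TX_8X8][DIAG_DOWN_LEFT_PRED](dst, stride, left, above);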