Commit db7786e8 authored Sep 30, 2015 by Ronald S. Bultje
vp9: sse2/ssse3/avx 16bpp loopfilter x86 simd.
parent 254c64c5
Showing 6 changed files with 918 additions and 1 deletion (+918 -1):

libavcodec/x86/Makefile (+1 -0)
libavcodec/x86/constants.c (+2 -0)
libavcodec/x86/constants.h (+1 -0)
libavcodec/x86/vp9dsp_init_16bpp_template.c (+90 -0)
libavcodec/x86/vp9lpf_16bpp.asm (+823 -0)
libavcodec/x86/vp9mc_16bpp.asm (+1 -1)
libavcodec/x86/Makefile
...
@@ -160,6 +160,7 @@ YASM-OBJS-$(CONFIG_VP6_DECODER) += x86/vp6dsp.o
YASM-OBJS-$(CONFIG_VP9_DECODER)        += x86/vp9intrapred.o      \
                                          x86/vp9itxfm.o          \
                                          x86/vp9lpf.o            \
                                          x86/vp9lpf_16bpp.o      \
                                          x86/vp9mc.o             \
                                          x86/vp9mc_16bpp.o
YASM-OBJS-$(CONFIG_WEBP_DECODER)       += x86/vp8dsp.o
libavcodec/x86/constants.c
...
@@ -55,6 +55,8 @@ DECLARE_ALIGNED(32, const ymm_reg, ff_pw_1024) = { 0x0400040004000400ULL, 0x040
                                                    0x0400040004000400ULL, 0x0400040004000400ULL };
DECLARE_ALIGNED(32, const ymm_reg,  ff_pw_2048) = { 0x0800080008000800ULL, 0x0800080008000800ULL,
                                                    0x0800080008000800ULL, 0x0800080008000800ULL };
DECLARE_ALIGNED(32, const ymm_reg,  ff_pw_4095) = { 0x0fff0fff0fff0fffULL, 0x0fff0fff0fff0fffULL,
                                                    0x0fff0fff0fff0fffULL, 0x0fff0fff0fff0fffULL };
DECLARE_ALIGNED(32, const ymm_reg,  ff_pw_4096) = { 0x1000100010001000ULL, 0x1000100010001000ULL,
                                                    0x1000100010001000ULL, 0x1000100010001000ULL };
DECLARE_ALIGNED(32, const ymm_reg,  ff_pw_8192) = { 0x2000200020002000ULL, 0x2000200020002000ULL,
...
libavcodec/x86/constants.h
...
@@ -47,6 +47,7 @@ extern const ymm_reg ff_pw_512;
extern const ymm_reg ff_pw_1023;
extern const ymm_reg ff_pw_1024;
extern const ymm_reg ff_pw_2048;
extern const ymm_reg ff_pw_4095;
extern const ymm_reg ff_pw_4096;
extern const ymm_reg ff_pw_8192;
extern const ymm_reg ff_pw_m1;
...
libavcodec/x86/vp9dsp_init_16bpp_template.c
...
@@ -65,6 +65,62 @@ filters_8tap_1d_fn2(put, 16, BPC, avx2, 16bpp)
filters_8tap_1d_fn2(avg, 16, BPC, avx2, 16bpp)
#endif

#define decl_lpf_func(dir, wd, bpp, opt) \
void ff_vp9_loop_filter_##dir##_##wd##_##bpp##_##opt(uint8_t *dst, ptrdiff_t stride, \
                                                     int E, int I, int H)

#define decl_lpf_funcs(dir, wd, bpp) \
decl_lpf_func(dir, wd, bpp, sse2); \
decl_lpf_func(dir, wd, bpp, ssse3); \
decl_lpf_func(dir, wd, bpp, avx)

#define decl_lpf_funcs_wd(dir) \
decl_lpf_funcs(dir, 4, BPC); \
decl_lpf_funcs(dir, 8, BPC); \
decl_lpf_funcs(dir, 16, BPC)

decl_lpf_funcs_wd(h);
decl_lpf_funcs_wd(v);

#define lpf_16_wrapper(dir, off, bpp, opt) \
static void loop_filter_##dir##_16_##bpp##_##opt(uint8_t *dst, ptrdiff_t stride, \
                                                 int E, int I, int H) \
{ \
    ff_vp9_loop_filter_##dir##_16_##bpp##_##opt(dst,       stride, E, I, H); \
    ff_vp9_loop_filter_##dir##_16_##bpp##_##opt(dst + off, stride, E, I, H); \
}

#define lpf_16_wrappers(bpp, opt) \
lpf_16_wrapper(h, 8 * stride, bpp, opt); \
lpf_16_wrapper(v, 16, bpp, opt)

lpf_16_wrappers(BPC, sse2);
lpf_16_wrappers(BPC, ssse3);
lpf_16_wrappers(BPC, avx);
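[Editorial note] For readability, here is roughly what one of the lpf_16_wrappers instantiations above expands to, sketched for dir = v, BPC = 10, opt = sse2. The 16-sample edge is filtered as two 8-sample halves; for v the second call starts 16 bytes further along (8 samples at 2 bytes each in this 16 bpp template), for h it starts 8 rows further down. The expansion below is an illustration of the macro, not code from the commit:

    /* Hypothetical expansion sketch of lpf_16_wrapper(v, 16, 10, sse2);
     * the real code is produced by the preprocessor from the macro above. */
    void ff_vp9_loop_filter_v_16_10_sse2(uint8_t *dst, ptrdiff_t stride,
                                         int E, int I, int H);

    static void loop_filter_v_16_10_sse2(uint8_t *dst, ptrdiff_t stride,
                                         int E, int I, int H)
    {
        /* first 8 pixels of the 16-pixel edge */
        ff_vp9_loop_filter_v_16_10_sse2(dst,      stride, E, I, H);
        /* remaining 8 pixels: 16 bytes = 8 samples * 2 bytes/sample */
        ff_vp9_loop_filter_v_16_10_sse2(dst + 16, stride, E, I, H);
    }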
#define lpf_mix2_wrapper(dir, off, wd1, wd2, bpp, opt) \
static void loop_filter_##dir##_##wd1##wd2##_##bpp##_##opt(uint8_t *dst, ptrdiff_t stride, \
                                                           int E, int I, int H) \
{ \
    ff_vp9_loop_filter_##dir##_##wd1##_##bpp##_##opt(dst,       stride, \
                                                     E & 0xff, I & 0xff, H & 0xff); \
    ff_vp9_loop_filter_##dir##_##wd2##_##bpp##_##opt(dst + off, stride, \
                                                     E >> 8, I >> 8, H >> 8); \
}

#define lpf_mix2_wrappers(wd1, wd2, bpp, opt) \
lpf_mix2_wrapper(h, 8 * stride, wd1, wd2, bpp, opt); \
lpf_mix2_wrapper(v, 16, wd1, wd2, bpp, opt)

#define lpf_mix2_wrappers_set(bpp, opt) \
lpf_mix2_wrappers(4, 4, bpp, opt); \
lpf_mix2_wrappers(4, 8, bpp, opt); \
lpf_mix2_wrappers(8, 4, bpp, opt); \
lpf_mix2_wrappers(8, 8, bpp, opt); \

lpf_mix2_wrappers_set(BPC, sse2);
lpf_mix2_wrappers_set(BPC, ssse3);
lpf_mix2_wrappers_set(BPC, avx);

#endif /* HAVE_YASM */
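[Editorial note] The mix2 wrappers above rely on the caller packing the loop-filter levels of two adjacent 8-pixel edges into one int per parameter: the low byte of E/I/H belongs to the first edge, the high byte to the second, matching the "& 0xff" / ">> 8" unpacking in lpf_mix2_wrapper. A minimal sketch of that convention (the helper below is hypothetical, not part of the commit):

    #include <stdint.h>

    /* Hypothetical helper: pack the filter levels of two adjacent edges into
     * one int (low byte = first edge, high byte = second edge). */
    static inline int pack_lpf_levels(uint8_t first_edge, uint8_t second_edge)
    {
        return first_edge | (second_edge << 8);
    }

A caller would then pass, for example, pack_lpf_levels(E0, E1) as the E argument of a loop_filter_mix2 function.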
av_cold void INIT_FUNC(VP9DSPContext *dsp)
...
@@ -72,9 +128,43 @@ av_cold void INIT_FUNC(VP9DSPContext *dsp)
#if HAVE_YASM
    int cpu_flags = av_get_cpu_flags();

#define init_lpf_8_func(idx1, idx2, dir, wd, bpp, opt) \
    dsp->loop_filter_8[idx1][idx2] = ff_vp9_loop_filter_##dir##_##wd##_##bpp##_##opt
#define init_lpf_16_func(idx, dir, bpp, opt) \
    dsp->loop_filter_16[idx] = loop_filter_##dir##_16_##bpp##_##opt
#define init_lpf_mix2_func(idx1, idx2, idx3, dir, wd1, wd2, bpp, opt) \
    dsp->loop_filter_mix2[idx1][idx2][idx3] = loop_filter_##dir##_##wd1##wd2##_##bpp##_##opt

#define init_lpf_funcs(bpp, opt) \
    init_lpf_8_func(0, 0, h,  4, bpp, opt); \
    init_lpf_8_func(0, 1, v,  4, bpp, opt); \
    init_lpf_8_func(1, 0, h,  8, bpp, opt); \
    init_lpf_8_func(1, 1, v,  8, bpp, opt); \
    init_lpf_8_func(2, 0, h, 16, bpp, opt); \
    init_lpf_8_func(2, 1, v, 16, bpp, opt); \
    init_lpf_16_func(0, h, bpp, opt); \
    init_lpf_16_func(1, v, bpp, opt); \
    init_lpf_mix2_func(0, 0, 0, h, 4, 4, bpp, opt); \
    init_lpf_mix2_func(0, 1, 0, h, 4, 8, bpp, opt); \
    init_lpf_mix2_func(1, 0, 0, h, 8, 4, bpp, opt); \
    init_lpf_mix2_func(1, 1, 0, h, 8, 8, bpp, opt); \
    init_lpf_mix2_func(0, 0, 1, v, 4, 4, bpp, opt); \
    init_lpf_mix2_func(0, 1, 1, v, 4, 8, bpp, opt); \
    init_lpf_mix2_func(1, 0, 1, v, 8, 4, bpp, opt); \
    init_lpf_mix2_func(1, 1, 1, v, 8, 8, bpp, opt)

    if (EXTERNAL_SSE2(cpu_flags)) {
        init_subpel3(0, put, BPC, sse2);
        init_subpel3(1, avg, BPC, sse2);
        init_lpf_funcs(BPC, sse2);
    }

    if (EXTERNAL_SSSE3(cpu_flags)) {
        init_lpf_funcs(BPC, ssse3);
    }

    if (EXTERNAL_AVX(cpu_flags)) {
        init_lpf_funcs(BPC, avx);
    }

    if (EXTERNAL_AVX2(cpu_flags)) {
...
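[Editorial note] The init blocks above follow the usual FFmpeg dsp-init pattern: each EXTERNAL_*() check overwrites the same function pointers, so after the last matching check the context holds the best available implementation. A minimal sketch of that idea (names below are illustrative, not FFmpeg's real API):

    /* Illustrative sketch: later ISA checks overwrite earlier assignments. */
    typedef void (*lpf_fn)(unsigned char *dst, long stride, int E, int I, int H);

    extern lpf_fn lpf_c, lpf_sse2, lpf_ssse3, lpf_avx;  /* assumed to exist elsewhere */

    static lpf_fn select_lpf(int have_sse2, int have_ssse3, int have_avx)
    {
        lpf_fn fn = lpf_c;
        if (have_sse2)  fn = lpf_sse2;
        if (have_ssse3) fn = lpf_ssse3;  /* overrides sse2  */
        if (have_avx)   fn = lpf_avx;    /* overrides ssse3 */
        return fn;
    }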
libavcodec/x86/vp9lpf_16bpp.asm (new file, mode 100644)
;******************************************************************************
;* VP9 loop filter SIMD optimizations
;*
;* Copyright (C) 2015 Ronald S. Bultje <rsbultje@gmail.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
%include "libavutil/x86/x86util.asm"

SECTION_RODATA

pw_511:   times 16 dw   511
pw_2047:  times 16 dw  2047
pw_16384: times 16 dw 16384
pw_m512:  times 16 dw  -512
pw_m2048: times 16 dw -2048

cextern pw_1
cextern pw_3
cextern pw_4
cextern pw_8
cextern pw_16
cextern pw_256
cextern pw_1023
cextern pw_4095
cextern pw_m1

SECTION .text
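[Editorial note] A quick key to the word constants above as they are used later in this file. The mapping is inferred from the %%maxsgn/%%minsgn/%%maxusgn defines and the pmulhrsw rounding further down, so treat it as an annotation rather than part of the commit:

    /* Annotation sketch: per-lane meaning of the RODATA words above. */
    enum {
        PW_511   =   511,  /* signed clip bound used by filter_4 at 10 bpp  */
        PW_M512  =  -512,  /* matching lower bound at 10 bpp                */
        PW_2047  =  2047,  /* signed clip bound used by filter_4 at 12 bpp  */
        PW_M2048 = -2048,  /* matching lower bound at 12 bpp                */
        PW_1023  =  1023,  /* maximum pixel value at 10 bpp (shared ff_pw_1023) */
        PW_4095  =  4095,  /* maximum pixel value at 12 bpp (shared ff_pw_4095) */
        PW_16384 = 16384   /* pmulhrsw by 16384 implements a rounded >> 1   */
    };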
%macro SCRATCH 3-4
%if ARCH_X86_64
    SWAP                %1, %2
%if %0 == 4
%define reg_%4 m%2
%endif
%else
    mova              [%3], m%1
%if %0 == 4
%define reg_%4 [%3]
%endif
%endif
%endmacro

%macro UNSCRATCH 3-4
%if ARCH_X86_64
    SWAP                %1, %2
%else
    mova               m%1, [%3]
%endif
%if %0 == 4
%undef reg_%4
%endif
%endmacro

%macro PRELOAD 2-3
%if ARCH_X86_64
    mova               m%1, [%2]
%if %0 == 3
%define reg_%3 m%1
%endif
%elif %0 == 3
%define reg_%3 [%2]
%endif
%endmacro
; calculate p or q portion of flat8out
%macro FLAT8OUT_HALF 0
    psubw               m4, m0              ; q4-q0
    psubw               m5, m0              ; q5-q0
    psubw               m6, m0              ; q6-q0
    psubw               m7, m0              ; q7-q0
    ABS2                m4, m5, m2, m3      ; abs(q4-q0) | abs(q5-q0)
    ABS2                m6, m7, m2, m3      ; abs(q6-q0) | abs(q7-q0)
    pcmpgtw             m4, reg_F           ; abs(q4-q0) > F
    pcmpgtw             m5, reg_F           ; abs(q5-q0) > F
    pcmpgtw             m6, reg_F           ; abs(q6-q0) > F
    pcmpgtw             m7, reg_F           ; abs(q7-q0) > F
    por                 m5, m4
    por                 m7, m6
    por                 m7, m5              ; !flat8out, q portion
%endmacro
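[Editorial note] In scalar terms, the macro above computes (for the q side; the p side is symmetric) the negation of the flat8out flatness test of the VP9 loop filter, with F being the flatness threshold already scaled to the bit depth (%%maxf below: 4 at 10 bpp, 16 at 12 bpp). A hedged C sketch of the same test:

    #include <stdlib.h>

    /* Scalar sketch of FLAT8OUT_HALF for one pixel position on the q side.
     * Returns nonzero for "not flat", matching the inverted SIMD mask. */
    static int not_flat8out_q(int q0, int q4, int q5, int q6, int q7, int F)
    {
        return abs(q4 - q0) > F || abs(q5 - q0) > F ||
               abs(q6 - q0) > F || abs(q7 - q0) > F;
    }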
; calculate p or q portion of flat8in/hev/fm (excluding mb_edge condition)
%macro FLAT8IN_HALF 1
%if %1 > 4
    psubw               m4, m3, m0          ; q3-q0
    psubw               m5, m2, m0          ; q2-q0
    ABS2                m4, m5, m6, m7      ; abs(q3-q0) | abs(q2-q0)
    pcmpgtw             m4, reg_F           ; abs(q3-q0) > F
    pcmpgtw             m5, reg_F           ; abs(q2-q0) > F
%endif
    psubw               m3, m2              ; q3-q2
    psubw               m2, m1              ; q2-q1
    ABS2                m3, m2, m6, m7      ; abs(q3-q2) | abs(q2-q1)
    pcmpgtw             m3, reg_I           ; abs(q3-q2) > I
    pcmpgtw             m2, reg_I           ; abs(q2-q1) > I
%if %1 > 4
    por                 m4, m5
%endif
    por                 m2, m3
    psubw               m3, m1, m0          ; q1-q0
    ABS1                m3, m5              ; abs(q1-q0)
%if %1 > 4
    pcmpgtw             m6, m3, reg_F       ; abs(q1-q0) > F
%endif
    pcmpgtw             m7, m3, reg_H       ; abs(q1-q0) > H
    pcmpgtw             m3, reg_I           ; abs(q1-q0) > I
%if %1 > 4
    por                 m4, m6
%endif
    por                 m2, m3
%endmacro
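[Editorial note] The macro above computes three masks at once for one side of the edge: part of the flat8in test against F (only for wd > 4), the inner-edge tests against I that feed the fm mask (the mb_edge term against E is added later), and the high-edge-variance test against H. A scalar C sketch for the q side, with I, H and F already shifted by (bit_depth - 8) as done at the top of LOOP_FILTER; this is an illustration, not code from the commit:

    #include <stdlib.h>

    /* Scalar sketch of FLAT8IN_HALF for the q side; "not_*" mirrors the
     * inverted sense of the SIMD masks. */
    static void flat8in_half_q(int q0, int q1, int q2, int q3,
                               int I, int H, int F,
                               int *not_flat8in, int *not_fm, int *hev)
    {
        *not_flat8in = abs(q3 - q0) > F || abs(q2 - q0) > F || abs(q1 - q0) > F;
        *not_fm      = abs(q3 - q2) > I || abs(q2 - q1) > I || abs(q1 - q0) > I;
        *hev         = abs(q1 - q0) > H;
    }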
; one step in filter_14/filter_6
;
; take sum $reg, downshift, apply mask and write into dst
;
; if sub2/add1-2 are present, add/sub as appropriate to prepare for the next
; step's sum $reg. This is omitted for the last row in each filter.
;
; if dont_store is set, don't write the result into memory, instead keep the
; values in register so we can write it out later
%macro FILTER_STEP 6-10 "", "", "", 0 ; tmp, reg, mask, shift, dst, \
                                      ; src/sub1, sub2, add1, add2, dont_store
    psrlw               %1, %2, %4
    psubw               %1, %6              ; abs->delta
%ifnidn %7, ""
    psubw               %2, %6
    psubw               %2, %7
    paddw               %2, %8
    paddw               %2, %9
%endif
    pand                %1, reg_%3          ; apply mask
%if %10 == 1
    paddw               %6, %1              ; delta->abs
%else
    paddw               %1, %6              ; delta->abs
    mova              [%5], %1
%endif
%endmacro
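[Editorial note] FILTER_STEP is one tap of the running-sum smoothing used by filter_6 and filter_14: the accumulated sum is downshifted, converted into a delta against the pre-filter pixel, masked, and added back, and the sum is optionally updated for the next output by subtracting two samples (src doubles as sub1) and adding two others. The dont_store variant keeps the result in a register instead of writing it out. A scalar C sketch of the stored case, assuming uint16_t pixels and an all-or-nothing integer mask:

    #include <stdint.h>

    /* Scalar sketch of one FILTER_STEP; mask is ~0 to filter this pixel or 0
     * to leave it unchanged, mirroring the pand against reg_F8M/reg_F16M. */
    static int filter_step(int sum, int shift, int mask,
                           uint16_t *dst, int src,
                           int sub2, int add1, int add2)
    {
        int delta = (sum >> shift) - src;        /* abs -> delta               */
        delta    &= mask;                        /* apply filter mask          */
        *dst      = (uint16_t)(src + delta);     /* delta -> abs, store        */
        return sum - src - sub2 + add1 + add2;   /* slide window for next tap  */
    }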
; FIXME avx2 versions for 16_16 and mix2_{4,8}{4,8}

%macro LOOP_FILTER 3 ; dir[h/v], wd[4/8/16], bpp[10/12]

%if ARCH_X86_64
%if %2 == 16
%assign %%num_xmm_regs 16
%elif %2 == 8
%assign %%num_xmm_regs 15
%else ; %2 == 4
%assign %%num_xmm_regs 14
%endif ; %2
%assign %%bak_mem 0
%else ; ARCH_X86_32
%assign %%num_xmm_regs 8
%if %2 == 16
%assign %%bak_mem 7
%elif %2 == 8
%assign %%bak_mem 6
%else ; %2 == 4
%assign %%bak_mem 5
%endif ; %2
%endif ; ARCH_X86_64/32

%if %2 == 16
%ifidn %1, v
%assign %%num_gpr_regs 6
%else ; %1 == h
%assign %%num_gpr_regs 5
%endif ; %1
%assign %%wd_mem 6
%else ; %2 == 8/4
%assign %%num_gpr_regs 5
%if ARCH_X86_32 && %2 == 8
%assign %%wd_mem 2
%else ; ARCH_X86_64 || %2 == 4
%assign %%wd_mem 0
%endif ; ARCH_X86_64/32 etc.
%endif ; %2

%ifidn %1, v
%assign %%tsp_mem 0
%elif %2 == 16 ; && %1 == h
%assign %%tsp_mem 16
%else ; %1 == h && %1 == 8/4
%assign %%tsp_mem 8
%endif ; %1/%2

%assign %%off %%wd_mem
%assign %%tspoff %%bak_mem + %%wd_mem
%assign %%stack_mem ((%%bak_mem + %%wd_mem + %%tsp_mem) * mmsize)

%if %3 == 10
%define %%maxsgn 511
%define %%minsgn m512
%define %%maxusgn 1023
%define %%maxf 4
%else ; %3 == 12
%define %%maxsgn 2047
%define %%minsgn m2048
%define %%maxusgn 4095
%define %%maxf 16
%endif ; %3

cglobal vp9_loop_filter_%1_%2_%3, 5, %%num_gpr_regs, %%num_xmm_regs, %%stack_mem, dst, stride, E, I, H
    ; prepare E, I and H masks
    shl                 Ed, %3 - 8
    shl                 Id, %3 - 8
    shl                 Hd, %3 - 8
%if cpuflag(ssse3)
    mova                m0, [pw_256]
%endif
    movd                m1, Ed
    movd                m2, Id
    movd                m3, Hd
%if cpuflag(ssse3)
    pshufb              m1, m0              ; E << (bit_depth - 8)
    pshufb              m2, m0              ; I << (bit_depth - 8)
    pshufb              m3, m0              ; H << (bit_depth - 8)
%else
    punpcklwd           m1, m1
    punpcklwd           m2, m2
    punpcklwd           m3, m3
    pshufd              m1, m1, q0000
    pshufd              m2, m2, q0000
    pshufd              m3, m3, q0000
%endif
    SCRATCH              1,  8, rsp + (%%off + 0) * mmsize, E
    SCRATCH              2,  9, rsp + (%%off + 1) * mmsize, I
    SCRATCH              3, 10, rsp + (%%off + 2) * mmsize, H
%if %2 > 4
    PRELOAD             11, pw_ %+ %%maxf, F
%endif
    ; set up variables to load data
%ifidn %1, v
    DEFINE_ARGS dst8, stride, stride3, dst0, dst4, dst12
    lea           stride3q, [strideq*3]
    neg            strideq
%if %2 == 16
    lea              dst0q, [dst8q+strideq*8]
%else
    lea              dst4q, [dst8q+strideq*4]
%endif
    neg            strideq
%if %2 == 16
    lea             dst12q, [dst8q+strideq*4]
    lea              dst4q, [dst0q+strideq*4]
%endif

%if %2 == 16
%define %%p7 dst0q
%define %%p6 dst0q + strideq
%define %%p5 dst0q + strideq * 2
%define %%p4 dst0q + stride3q
%endif
%define %%p3 dst4q
%define %%p2 dst4q + strideq
%define %%p1 dst4q + strideq * 2
%define %%p0 dst4q + stride3q
%define %%q0 dst8q
%define %%q1 dst8q + strideq
%define %%q2 dst8q + strideq * 2
%define %%q3 dst8q + stride3q
%if %2 == 16
%define %%q4 dst12q
%define %%q5 dst12q + strideq
%define %%q6 dst12q + strideq * 2
%define %%q7 dst12q + stride3q
%endif
%else ; %1 == h
    DEFINE_ARGS dst0, stride, stride3, dst4
    lea           stride3q, [strideq*3]
    lea              dst4q, [dst0q+strideq*4]

%define %%p3 rsp + (%%tspoff + 0) * mmsize
%define %%p2 rsp + (%%tspoff + 1) * mmsize
%define %%p1 rsp + (%%tspoff + 2) * mmsize
%define %%p0 rsp + (%%tspoff + 3) * mmsize
%define %%q0 rsp + (%%tspoff + 4) * mmsize
%define %%q1 rsp + (%%tspoff + 5) * mmsize
%define %%q2 rsp + (%%tspoff + 6) * mmsize
%define %%q3 rsp + (%%tspoff + 7) * mmsize
%if %2 < 16
    movu                m0, [dst0q+strideq*0-8]
    movu                m1, [dst0q+strideq*1-8]
    movu                m2, [dst0q+strideq*2-8]
    movu                m3, [dst0q+stride3q-8]
    movu                m4, [dst4q+strideq*0-8]
    movu                m5, [dst4q+strideq*1-8]
    movu                m6, [dst4q+strideq*2-8]
    movu                m7, [dst4q+stride3q-8]
%if ARCH_X86_64
    TRANSPOSE8x8W        0, 1, 2, 3, 4, 5, 6, 7, 12
%else
    TRANSPOSE8x8W        0, 1, 2, 3, 4, 5, 6, 7, [%%p0], [%%q0]
%endif
    mova            [%%p3], m0
    mova            [%%p2], m1
    mova            [%%p1], m2
    mova            [%%p0], m3
%if ARCH_X86_64
    mova            [%%q0], m4
%endif
    mova            [%%q1], m5
    mova            [%%q2], m6
    mova            [%%q3], m7
    ; FIXME investigate if we can _not_ load q0-3 below if h, and adjust register
    ; order here accordingly
%else ; %2 == 16
%define %%p7 rsp + (%%tspoff +  8) * mmsize
%define %%p6 rsp + (%%tspoff +  9) * mmsize
%define %%p5 rsp + (%%tspoff + 10) * mmsize
%define %%p4 rsp + (%%tspoff + 11) * mmsize
%define %%q4 rsp + (%%tspoff + 12) * mmsize
%define %%q5 rsp + (%%tspoff + 13) * mmsize
%define %%q6 rsp + (%%tspoff + 14) * mmsize
%define %%q7 rsp + (%%tspoff + 15) * mmsize

    mova                m0, [dst0q+strideq*0-16]
    mova                m1, [dst0q+strideq*1-16]
    mova                m2, [dst0q+strideq*2-16]
    mova                m3, [dst0q+stride3q-16]
    mova                m4, [dst4q+strideq*0-16]
    mova                m5, [dst4q+strideq*1-16]
%if ARCH_X86_64
    mova                m6, [dst4q+strideq*2-16]
%endif
    mova                m7, [dst4q+stride3q-16]
%if ARCH_X86_64
    TRANSPOSE8x8W        0, 1, 2, 3, 4, 5, 6, 7, 12
%else
    TRANSPOSE8x8W        0, 1, 2, 3, 4, 5, 6, 7, [dst4q+strideq*2-16], [%%p3], 1
%endif
    mova            [%%p7], m0
    mova            [%%p6], m1
    mova            [%%p5], m2
    mova            [%%p4], m3
%if ARCH_X86_64
    mova            [%%p3], m4
%endif
    mova            [%%p2], m5
    mova            [%%p1], m6
    mova            [%%p0], m7

    mova                m0, [dst0q+strideq*0]
    mova                m1, [dst0q+strideq*1]
    mova                m2, [dst0q+strideq*2]
    mova                m3, [dst0q+stride3q]
    mova                m4, [dst4q+strideq*0]
    mova                m5, [dst4q+strideq*1]
%if ARCH_X86_64
    mova                m6, [dst4q+strideq*2]
%endif
    mova                m7, [dst4q+stride3q]
%if ARCH_X86_64
    TRANSPOSE8x8W        0, 1, 2, 3, 4, 5, 6, 7, 12
%else
    TRANSPOSE8x8W        0, 1, 2, 3, 4, 5, 6, 7, [dst4q+strideq*2], [%%q4], 1
%endif
    mova            [%%q0], m0
    mova            [%%q1], m1
    mova            [%%q2], m2
    mova            [%%q3], m3
%if ARCH_X86_64
    mova            [%%q4], m4
%endif
    mova            [%%q5], m5
    mova            [%%q6], m6
    mova            [%%q7], m7
    ; FIXME investigate if we can _not_ load q0|q4-7 below if h, and adjust register
    ; order here accordingly
%endif ; %2
%endif ; %1
    ; load q0|q4-7 data
    mova                m0, [%%q0]
%if %2 == 16
    mova                m4, [%%q4]
    mova                m5, [%%q5]
    mova                m6, [%%q6]
    mova                m7, [%%q7]

    ; flat8out q portion
    FLAT8OUT_HALF
    SCRATCH              7, 15, rsp + (%%off + 6) * mmsize, F8O
%endif

    ; load q1-3 data
    mova                m1, [%%q1]
    mova                m2, [%%q2]
    mova                m3, [%%q3]

    ; r6-8|pw_4[m8-11]=reg_E/I/H/F
    ; r9[m15]=!flatout[q]
    ; m12-14=free
    ; m0-3=q0-q3
    ; m4-7=free

    ; flat8in|fm|hev q portion
    FLAT8IN_HALF        %2
    SCRATCH              7, 13, rsp + (%%off + 4) * mmsize, HEV
%if %2 > 4
    SCRATCH              4, 14, rsp + (%%off + 5) * mmsize, F8I
%endif

    ; r6-8|pw_4[m8-11]=reg_E/I/H/F
    ; r9[m15]=!flat8out[q]
    ; r10[m13]=hev[q]
    ; r11[m14]=!flat8in[q]
    ; m2=!fm[q]
    ; m0,1=q0-q1
    ; m2-7=free
    ; m12=free

    ; load p0-1
    mova                m3, [%%p0]
    mova                m4, [%%p1]

    ; fm mb_edge portion
    psubw               m5, m3, m0          ; q0-p0
    psubw               m6, m4, m1          ; q1-p1
%if ARCH_X86_64
    ABS2                m5, m6, m7, m12     ; abs(q0-p0) | abs(q1-p1)
%else
    ABS1                m5, m7              ; abs(q0-p0)
    ABS1                m6, m7              ; abs(q1-p1)
%endif
    paddw               m5, m5
    psraw               m6, 1
    paddw               m6, m5              ; abs(q0-p0)*2+(abs(q1-p1)>>1)
    pcmpgtw             m6, reg_E
    por                 m2, m6
    SCRATCH              2, 12, rsp + (%%off + 3) * mmsize, FM

    ; r6-8|pw_4[m8-11]=reg_E/I/H/F
    ; r9[m15]=!flat8out[q]
    ; r10[m13]=hev[q]
    ; r11[m14]=!flat8in[q]
    ; r12[m12]=!fm[q]
    ; m3-4=q0-1
    ; m0-2/5-7=free

    ; load p4-7 data
    SWAP                 3, 0                ; p0
    SWAP                 4, 1                ; p1
%if %2 == 16
    mova                m7, [%%p7]
    mova                m6, [%%p6]
    mova                m5, [%%p5]
    mova                m4, [%%p4]

    ; flat8out p portion
    FLAT8OUT_HALF
    por                 m7, reg_F8O
    SCRATCH              7, 15, rsp + (%%off + 6) * mmsize, F8O
%endif

    ; r6-8|pw_4[m8-11]=reg_E/I/H/F
    ; r9[m15]=!flat8out
    ; r10[m13]=hev[q]
    ; r11[m14]=!flat8in[q]
    ; r12[m12]=!fm[q]
    ; m0=p0
    ; m1-7=free

    ; load p2-3 data
    mova                m2, [%%p2]
    mova                m3, [%%p3]

    ; flat8in|fm|hev p portion
    FLAT8IN_HALF        %2
    por                 m7, reg_HEV
%if %2 > 4
    por                 m4, reg_F8I
%endif
    por                 m2, reg_FM
%if %2 > 4
    por                 m4, m2              ; !flat8|!fm
%if %2 == 16
    por                 m5, m4, reg_F8O     ; !flat16|!fm
    pandn               m2, m4              ; filter4_mask
    pandn               m4, m5              ; filter8_mask
    pxor                m5, [pw_m1]         ; filter16_mask
    SCRATCH              5, 15, rsp + (%%off + 6) * mmsize, F16M
%else
    pandn               m2, m4              ; filter4_mask
    pxor                m4, [pw_m1]         ; filter8_mask
%endif
    SCRATCH              4, 14, rsp + (%%off + 5) * mmsize, F8M
%else
    pxor                m2, [pw_m1]         ; filter4_mask
%endif
    SCRATCH              7, 13, rsp + (%%off + 4) * mmsize, HEV
    SCRATCH              2, 12, rsp + (%%off + 3) * mmsize, F4M

    ; r9[m15]=filter16_mask
    ; r10[m13]=hev
    ; r11[m14]=filter8_mask
    ; r12[m12]=filter4_mask
    ; m0,1=p0-p1
    ; m2-7=free
    ; m8-11=free
%if %2 > 4
%if %2 == 16
    ; filter_14
    mova                m2, [%%p7]
    mova                m3, [%%p6]
    mova                m6, [%%p5]
    mova                m7, [%%p4]
    PRELOAD              8, %%p3, P3
    PRELOAD              9, %%p2, P2
%endif
    PRELOAD             10, %%q0, Q0
    PRELOAD             11, %%q1, Q1
%if %2 == 16
    psllw               m4, m2, 3
    paddw               m5, m3, m3
    paddw               m4, m6
    paddw               m5, m7
    paddw               m4, reg_P3
    paddw               m5, reg_P2
    paddw               m4, m1
    paddw               m5, m0
    paddw               m4, reg_Q0          ; q0+p1+p3+p5+p7*8
    psubw               m5, m2              ; p0+p2+p4+p6*2-p7
    paddw               m4, [pw_8]
    paddw               m5, m4              ; q0+p0+p1+p2+p3+p4+p5+p6*2+p7*7+8

    ; below, we use r0-5 for storing pre-filter pixels for subsequent subtraction
    ; at the end of the filter

    mova    [rsp+0*mmsize], m3
    FILTER_STEP         m4, m5, F16M, 4, %%p6, m3, m2, m6, reg_Q1
%endif
    mova                m3, [%%q2]
%if %2 == 16
    mova    [rsp+1*mmsize], m6
    FILTER_STEP         m4, m5, F16M, 4, %%p5, m6, m2, m7, m3
%endif
    mova                m6, [%%q3]
%if %2 == 16
    mova    [rsp+2*mmsize], m7
    FILTER_STEP         m4, m5, F16M, 4, %%p4, m7, m2, reg_P3, m6
    mova                m7, [%%q4]
%if ARCH_X86_64
    mova    [rsp+3*mmsize], reg_P3
%else
    mova                m4, reg_P3
    mova    [rsp+3*mmsize], m4
%endif
    FILTER_STEP         m4, m5, F16M, 4, %%p3, reg_P3, m2, reg_P2, m7
    PRELOAD              8, %%q5, Q5
%if ARCH_X86_64
    mova    [rsp+4*mmsize], reg_P2
%else
    mova                m4, reg_P2
    mova    [rsp+4*mmsize], m4
%endif
    FILTER_STEP         m4, m5, F16M, 4, %%p2, reg_P2, m2, m1, reg_Q5
    PRELOAD              9, %%q6, Q6
    mova    [rsp+5*mmsize], m1
    FILTER_STEP         m4, m5, F16M, 4, %%p1, m1, m2, m0, reg_Q6
    mova                m1, [%%q7]
    FILTER_STEP         m4, m5, F16M, 4, %%p0, m0, m2, reg_Q0, m1, 1
    FILTER_STEP         m4, m5, F16M, 4, %%q0, reg_Q0, [rsp+0*mmsize], reg_Q1, m1, ARCH_X86_64
    FILTER_STEP         m4, m5, F16M, 4, %%q1, reg_Q1, [rsp+1*mmsize], m3, m1, ARCH_X86_64
    FILTER_STEP         m4, m5, F16M, 4, %%q2, m3, [rsp+2*mmsize], m6, m1, 1
    FILTER_STEP         m4, m5, F16M, 4, %%q3, m6, [rsp+3*mmsize], m7, m1
    FILTER_STEP         m4, m5, F16M, 4, %%q4, m7, [rsp+4*mmsize], reg_Q5, m1
    FILTER_STEP         m4, m5, F16M, 4, %%q5, reg_Q5, [rsp+5*mmsize], reg_Q6, m1
    FILTER_STEP         m4, m5, F16M, 4, %%q6, reg_Q6
    mova                m7, [%%p1]
%else
    SWAP                 1, 7
%endif
    mova                m2, [%%p3]
    mova                m1, [%%p2]

    ; reg_Q0-1 (m10-m11)
    ; m0=p0
    ; m1=p2
    ; m2=p3
    ; m3=q2
    ; m4-5=free
    ; m6=q3
    ; m7=p1
    ; m8-9 unused

    ; filter_6
    psllw               m4, m2, 2
    paddw               m5, m1, m1
    paddw               m4, m7
    psubw               m5, m2
    paddw               m4, m0
    paddw               m5, reg_Q0
    paddw               m4, [pw_4]
    paddw               m5, m4
%if ARCH_X86_64
    mova                m8, m1
    mova                m9, m7
%else
    mova    [rsp+0*mmsize], m1
    mova    [rsp+1*mmsize], m7
%endif
%ifidn %1, v
    FILTER_STEP         m4, m5, F8M, 3, %%p2, m1, m2, m7, reg_Q1
%else
    FILTER_STEP         m4, m5, F8M, 3, %%p2, m1, m2, m7, reg_Q1, 1
%endif
    FILTER_STEP         m4, m5, F8M, 3, %%p1, m7, m2, m0, m3, 1
    FILTER_STEP         m4, m5, F8M, 3, %%p0, m0, m2, reg_Q0, m6, 1
%if ARCH_X86_64
    FILTER_STEP         m4, m5, F8M, 3, %%q0, reg_Q0, m8, reg_Q1, m6, ARCH_X86_64
    FILTER_STEP         m4, m5, F8M, 3, %%q1, reg_Q1, m9, m3, m6, ARCH_X86_64
%else
    FILTER_STEP         m4, m5, F8M, 3, %%q0, reg_Q0, [rsp+0*mmsize], reg_Q1, m6, ARCH_X86_64
    FILTER_STEP         m4, m5, F8M, 3, %%q1, reg_Q1, [rsp+1*mmsize], m3, m6, ARCH_X86_64
%endif
    FILTER_STEP         m4, m5, F8M, 3, %%q2, m3

    UNSCRATCH            2, 10, %%q0
    UNSCRATCH            6, 11, %%q1
%else
    SWAP                 1, 7
    mova                m2, [%%q0]
    mova                m6, [%%q1]
%endif
    UNSCRATCH            3, 13, rsp + (%%off + 4) * mmsize, HEV

    ; m0=p0
    ; m1=p2
    ; m2=q0
    ; m3=hev_mask
    ; m4-5=free
    ; m6=q1
    ; m7=p1

    ; filter_4
    psubw               m4, m7, m6          ; p1-q1
    psubw               m5, m2, m0          ; q0-p0
    pand                m4, m3
    pminsw              m4, [pw_ %+ %%maxsgn]
    pmaxsw              m4, [pw_ %+ %%minsgn]  ; clip_intp2(p1-q1, 9) -> f
    paddw               m4, m5
    paddw               m5, m5
    paddw               m4, m5              ; 3*(q0-p0)+f
    pminsw              m4, [pw_ %+ %%maxsgn]
    pmaxsw              m4, [pw_ %+ %%minsgn]  ; clip_intp2(3*(q0-p0)+f, 9) -> f
    pand                m4, reg_F4M
    paddw               m5, m4, [pw_4]
    paddw               m4, [pw_3]
    pminsw              m5, [pw_ %+ %%maxsgn]
    pminsw              m4, [pw_ %+ %%maxsgn]
    psraw               m5, 3               ; min_intp2(f+4, 9)>>3 -> f1
    psraw               m4, 3               ; min_intp2(f+3, 9)>>3 -> f2
    psubw               m2, m5              ; q0-f1
    paddw               m0, m4              ; p0+f2
    pandn               m3, m5              ; f1 & !hev (for p1/q1 adj)
    pxor                m4, m4
    mova                m5, [pw_ %+ %%maxusgn]
    pmaxsw              m2, m4
    pmaxsw              m0, m4
    pminsw              m2, m5
    pminsw              m0, m5
%if cpuflag(ssse3)
    pmulhrsw            m3, [pw_16384]      ; (f1+1)>>1
%else
    paddw               m3, [pw_1]
    psraw               m3, 1
%endif
    paddw               m7, m3              ; p1+f
    psubw               m6, m3              ; q1-f
    pmaxsw              m7, m4
    pmaxsw              m6, m4
    pminsw              m7, m5
    pminsw              m6, m5

    ; store
%ifidn %1, v
    mova            [%%p1], m7
    mova            [%%p0], m0
    mova            [%%q0], m2
    mova            [%%q1], m6
%else ; %1 == h
%if %2 == 4
    TRANSPOSE4x4W        7, 0, 2, 6, 1
    movh   [dst0q+strideq*0-4], m7
    movhps [dst0q+strideq*1-4], m7
    movh   [dst0q+strideq*2-4], m0
    movhps [dst0q+stride3q-4], m0
    movh   [dst4q+strideq*0-4], m2
    movhps [dst4q+strideq*1-4], m2
    movh   [dst4q+strideq*2-4], m6
    movhps [dst4q+stride3q-4], m6
%elif %2 == 8
    mova                m3, [%%p3]
    mova                m4, [%%q2]
    mova                m5, [%%q3]
%if ARCH_X86_64
    TRANSPOSE8x8W        3, 1, 7, 0, 2, 6, 4, 5, 8
%else
    TRANSPOSE8x8W        3, 1, 7, 0, 2, 6, 4, 5, [%%q2], [%%q0], 1
    mova                m2, [%%q0]
%endif
    movu   [dst0q+strideq*0-8], m3
    movu   [dst0q+strideq*1-8], m1
    movu   [dst0q+strideq*2-8], m7
    movu   [dst0q+stride3q-8], m0
    movu   [dst4q+strideq*0-8], m2
    movu   [dst4q+strideq*1-8], m6
    movu   [dst4q+strideq*2-8], m4
    movu   [dst4q+stride3q-8], m5
%else ; %2 == 16
    SCRATCH              2, 8, %%q0
    SCRATCH              6, 9, %%q1
    mova                m2, [%%p7]
    mova                m3, [%%p6]
    mova                m4, [%%p5]
    mova                m5, [%%p4]
    mova                m6, [%%p3]
%if ARCH_X86_64
    TRANSPOSE8x8W        2, 3, 4, 5, 6, 1, 7, 0, 10
%else
    mova            [%%p1], m7
    TRANSPOSE8x8W        2, 3, 4, 5, 6, 1, 7, 0, [%%p1], [dst4q+strideq*0-16], 1
%endif
    mova  [dst0q+strideq*0-16], m2
    mova  [dst0q+strideq*1-16], m3
    mova  [dst0q+strideq*2-16], m4
    mova  [dst0q+stride3q-16], m5
%if ARCH_X86_64
    mova  [dst4q+strideq*0-16], m6
%endif
    mova  [dst4q+strideq*1-16], m1
    mova  [dst4q+strideq*2-16], m7
    mova  [dst4q+stride3q-16], m0

    UNSCRATCH            2, 8, %%q0
    UNSCRATCH            6, 9, %%q1
    mova                m0, [%%q2]
    mova                m1, [%%q3]
    mova                m3, [%%q4]
    mova                m4, [%%q5]
%if ARCH_X86_64
    mova                m5, [%%q6]
%endif
    mova                m7, [%%q7]
%if ARCH_X86_64
    TRANSPOSE8x8W        2, 6, 0, 1, 3, 4, 5, 7, 8
%else
    TRANSPOSE8x8W        2, 6, 0, 1, 3, 4, 5, 7, [%%q6], [dst4q+strideq*0], 1
%endif
    mova    [dst0q+strideq*0], m2
    mova    [dst0q+strideq*1], m6
    mova    [dst0q+strideq*2], m0
    mova    [dst0q+stride3q], m1
%if ARCH_X86_64
    mova    [dst4q+strideq*0], m3
%endif
    mova    [dst4q+strideq*1], m4
    mova    [dst4q+strideq*2], m5
    mova    [dst4q+stride3q], m7
%endif ; %2
%endif ; %1

    RET
%endmacro
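[Editorial note] The filter_4 section of the macro above is the bit-depth-scaled version of the standard VP9 4-tap loop filter. A scalar C sketch of the arithmetic, following the asm comments; maxsgn and maxusgn are the signed clip bound and the pixel maximum chosen per bit depth at the top of LOOP_FILTER (511/1023 at 10 bpp, 2047/4095 at 12 bpp). The function below is an illustration, not FFmpeg API:

    #include <stdint.h>

    static int clip(int v, int lo, int hi) { return v < lo ? lo : v > hi ? hi : v; }

    /* Scalar sketch of filter_4 for one pixel position. hev and mask are 0 or 1,
     * mirroring the SIMD hev/filter4 masks. An arithmetic right shift is
     * assumed for negative values, as with psraw in the asm. */
    static void filter_4(uint16_t *p1, uint16_t *p0, uint16_t *q0, uint16_t *q1,
                         int hev, int mask, int maxsgn, int maxusgn)
    {
        int minsgn = -maxsgn - 1;
        int f  = hev ? clip(*p1 - *q1, minsgn, maxsgn) : 0;
        int f1, f2;

        f  = clip(3 * (*q0 - *p0) + f, minsgn, maxsgn);
        f &= -mask;                                     /* only filter where mask is set */
        f1 = (f + 4 < maxsgn ? f + 4 : maxsgn) >> 3;
        f2 = (f + 3 < maxsgn ? f + 3 : maxsgn) >> 3;
        *q0 = clip(*q0 - f1, 0, maxusgn);
        *p0 = clip(*p0 + f2, 0, maxusgn);
        if (!hev) {                                     /* p1/q1 adjusted only without high edge variance */
            int a = (f1 + 1) >> 1;
            *p1 = clip(*p1 + a, 0, maxusgn);
            *q1 = clip(*q1 - a, 0, maxusgn);
        }
    }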
%macro LOOP_FILTER_CPUSETS 3
INIT_XMM sse2
LOOP_FILTER %1, %2, %3
INIT_XMM ssse3
LOOP_FILTER %1, %2, %3
INIT_XMM avx
LOOP_FILTER %1, %2, %3
%endmacro

%macro LOOP_FILTER_WDSETS 2
LOOP_FILTER_CPUSETS %1,  4, %2
LOOP_FILTER_CPUSETS %1,  8, %2
LOOP_FILTER_CPUSETS %1, 16, %2
%endmacro

LOOP_FILTER_WDSETS h, 10
LOOP_FILTER_WDSETS v, 10
LOOP_FILTER_WDSETS h, 12
LOOP_FILTER_WDSETS v, 12
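[Editorial note] Expanding the two macro layers above instantiates 36 loop-filter entry points: {h, v} x {4, 8, 16} x {10, 12} bpp x {sse2, ssse3, avx}, named exactly as the init template declares them (the ff_ prefix and the ISA suffix come from cglobal/INIT_XMM). A small illustrative snippet that enumerates those symbol names, purely for documentation and not part of the build:

    #include <stdio.h>

    /* Enumerate the symbols generated by LOOP_FILTER_WDSETS above. */
    int main(void)
    {
        static const char *dirs[] = { "h", "v" };
        static const int   wds[]  = { 4, 8, 16 };
        static const int   bpps[] = { 10, 12 };
        static const char *opts[] = { "sse2", "ssse3", "avx" };

        for (int d = 0; d < 2; d++)
            for (int w = 0; w < 3; w++)
                for (int b = 0; b < 2; b++)
                    for (int o = 0; o < 3; o++)
                        printf("ff_vp9_loop_filter_%s_%d_%d_%s\n",
                               dirs[d], wds[w], bpps[b], opts[o]);
        return 0;
    }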
libavcodec/x86/vp9mc_16bpp.asm
...
@@ -24,10 +24,10 @@
 SECTION_RODATA 32

-pw_4095: times 16 dw 0xfff
 pd_64: times 8 dd 64

 cextern pw_1023
+cextern pw_4095

 SECTION .text
...