Skip to content
Projects
Groups
Snippets
Help
Loading...
Sign in / Register
Toggle navigation
F
ffmpeg
Project
Project
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Packages
Packages
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
submodule
ffmpeg
Commits
1c3be325
Commit
1c3be325
authored
Oct 06, 2015
by
Ronald S. Bultje
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
vp9: add 10/12bpp mmxext-optimized iwht_iwht_4x4 function.
parent
b6594a96
Hide whitespace changes
Inline
Side-by-side
Showing
7 changed files
with
187 additions
and
22 deletions
+187
-22
Makefile
libavcodec/x86/Makefile
+1
-0
vp9dsp_init.c
libavcodec/x86/vp9dsp_init.c
+2
-2
vp9dsp_init.h
libavcodec/x86/vp9dsp_init.h
+11
-4
vp9dsp_init_16bpp_template.c
libavcodec/x86/vp9dsp_init_16bpp_template.c
+15
-1
vp9itxfm.asm
libavcodec/x86/vp9itxfm.asm
+1
-15
vp9itxfm_16bpp.asm
libavcodec/x86/vp9itxfm_16bpp.asm
+120
-0
vp9itxfm_template.asm
libavcodec/x86/vp9itxfm_template.asm
+37
-0
No files found.
libavcodec/x86/Makefile
View file @
1c3be325
...
...
@@ -165,6 +165,7 @@ YASM-OBJS-$(CONFIG_VP6_DECODER) += x86/vp6dsp.o
YASM-OBJS-$(CONFIG_VP9_DECODER)
+=
x86/vp9intrapred.o
\
x86/vp9intrapred_16bpp.o
\
x86/vp9itxfm.o
\
x86/vp9itxfm_16bpp.o
\
x86/vp9lpf.o
\
x86/vp9lpf_16bpp.o
\
x86/vp9mc.o
\
...
...
libavcodec/x86/vp9dsp_init.c
View file @
1c3be325
...
...
@@ -216,10 +216,10 @@ av_cold void ff_vp9dsp_init_x86(VP9DSPContext *dsp, int bpp, int bitexact)
int
cpu_flags
;
if
(
bpp
==
10
)
{
ff_vp9dsp_init_10bpp_x86
(
dsp
);
ff_vp9dsp_init_10bpp_x86
(
dsp
,
bitexact
);
return
;
}
else
if
(
bpp
==
12
)
{
ff_vp9dsp_init_12bpp_x86
(
dsp
);
ff_vp9dsp_init_12bpp_x86
(
dsp
,
bitexact
);
return
;
}
...
...
libavcodec/x86/vp9dsp_init.h
View file @
1c3be325
...
...
@@ -25,6 +25,9 @@
#include "libavcodec/vp9dsp.h"
/* Three-way token paste. When this is invoked from inside another macro, the
 * middle argument has already been macro-expanded by the time it gets here,
 * so an object-like macro such as BPC is pasted as its value rather than as
 * the literal identifier. */
#define cat(prefix, bpp, suffix) prefix##bpp##suffix
#define decl_fpel_func(avg, sz, bpp, opt) \
void ff_vp9_##avg##sz##bpp##_##opt(uint8_t *dst, ptrdiff_t dst_stride, \
const uint8_t *src, ptrdiff_t src_stride, \
...
...
@@ -53,6 +56,12 @@ decl_ipred_fn(type, 8, bpp, opt8_16_32); \
decl_ipred_fn(type, 16, bpp, opt8_16_32); \
decl_ipred_fn(type, 32, bpp, opt8_16_32)
/* Declare the prototype of one inverse-transform "add" routine whose name is
 * ff_vp9_<typea>_<typeb>_<size>x<size>_add_<bpp>_<opt>. bpp is routed through
 * cat() so that a macro passed as bpp (e.g. BPC) is expanded before pasting.
 * The expansion carries no trailing semicolon; the user supplies it. */
#define decl_itxfm_func(typea, typeb, size, bpp, opt) \
void cat(ff_vp9_##typea##_##typeb##_##size##x##size##_add_, bpp, _##opt)(uint8_t *dst, \
ptrdiff_t stride, \
int16_t *block, \
int eob)
#define mc_rep_func(avg, sz, hsz, hszb, dir, opt, type, f_sz, bpp) \
static av_always_inline void \
ff_vp9_##avg##_8tap_1d_##dir##_##sz##_##bpp##_##opt(uint8_t *dst, ptrdiff_t dst_stride, \
...
...
@@ -154,8 +163,6 @@ filters_8tap_2d_fn(op, 4, align, bpp, bytes, opt4, f_opt)
init_subpel3_8to64(idx, type, bpp, opt); \
init_subpel2(4, idx, 4, type, bpp, opt)
#define cat(a, bpp, b) a##bpp##b
/* Store the intra predictor ff_vp9_ipred_<type>_<sz>x<sz>_<bpp>_<opt> into
 * dsp->intra_pred[TX_<sz>X<sz>][<enum>_PRED]. Expects a variable named dsp to
 * be in scope at the point of use; bpp goes through cat() so a macro argument
 * is expanded before it is pasted into the function name. */
#define init_ipred_func(type, enum, sz, bpp, opt) \
dsp->intra_pred[TX_##sz##X##sz][enum##_PRED] = \
cat(ff_vp9_ipred_##type##_##sz##x##sz##_, bpp, _##opt)
...
...
@@ -169,8 +176,8 @@ filters_8tap_2d_fn(op, 4, align, bpp, bytes, opt4, f_opt)
init_ipred_func(type, enum, 4, bpp, opt); \
init_8_16_32_ipred_funcs(type, enum, bpp, opt)
void
ff_vp9dsp_init_10bpp_x86
(
VP9DSPContext
*
dsp
);
void
ff_vp9dsp_init_12bpp_x86
(
VP9DSPContext
*
dsp
);
void
ff_vp9dsp_init_10bpp_x86
(
VP9DSPContext
*
dsp
,
int
bitexact
);
void
ff_vp9dsp_init_12bpp_x86
(
VP9DSPContext
*
dsp
,
int
bitexact
);
void
ff_vp9dsp_init_16bpp_x86
(
VP9DSPContext
*
dsp
);
#endif
/* AVCODEC_X86_VP9DSP_INIT_H */
libavcodec/x86/vp9dsp_init_16bpp_template.c
View file @
1c3be325
...
...
@@ -123,9 +123,11 @@ lpf_mix2_wrappers_set(BPC, ssse3);
lpf_mix2_wrappers_set
(
BPC
,
avx
);
decl_ipred_fns
(
tm
,
BPC
,
mmxext
,
sse2
);
decl_itxfm_func
(
iwht
,
iwht
,
4
,
BPC
,
mmxext
);
#endif
/* HAVE_YASM */
av_cold
void
INIT_FUNC
(
VP9DSPContext
*
dsp
)
av_cold
void
INIT_FUNC
(
VP9DSPContext
*
dsp
,
int
bitexact
)
{
#if HAVE_YASM
int
cpu_flags
=
av_get_cpu_flags
();
...
...
@@ -155,8 +157,20 @@ av_cold void INIT_FUNC(VP9DSPContext *dsp)
init_lpf_mix2_func(1, 0, 1, v, 8, 4, bpp, opt); \
init_lpf_mix2_func(1, 1, 1, v, 8, 8, bpp, opt)
/* Install one inverse-transform routine:
 *   dsp->itxfm_add[idxa][idxb] = ff_vp9_<typea>_<typeb>_<size>x<size>_add_<bpp>_<opt>
 * Expects a variable named dsp in scope. bpp is pasted directly, so it must
 * reach this macro already expanded to its literal value (passing it through
 * init_itx_func_one does that). The expansion already ends in ';', which is
 * kept so that existing call sites continue to compile unchanged. */
#define init_itx_func(idxa, idxb, typea, typeb, size, bpp, opt) \
    dsp->itxfm_add[idxa][idxb] = \
        ff_vp9_##typea##_##typeb##_##size##x##size##_add_##bpp##_##opt;
/* Install the same routine in all four transform-type slots (DCT_DCT,
 * ADST_DCT, DCT_ADST, ADST_ADST) of dsp->itxfm_add[idx].
 * Wrapped in do { } while (0) so the four assignments act as one statement:
 * without it, an unbraced `if (cond) init_itx_func_one(...);` would guard only
 * the first assignment. Invoke it with a trailing semicolon. */
#define init_itx_func_one(idx, typea, typeb, size, bpp, opt) \
    do { \
        init_itx_func(idx, DCT_DCT,   typea, typeb, size, bpp, opt); \
        init_itx_func(idx, ADST_DCT,  typea, typeb, size, bpp, opt); \
        init_itx_func(idx, DCT_ADST,  typea, typeb, size, bpp, opt); \
        init_itx_func(idx, ADST_ADST, typea, typeb, size, bpp, opt); \
    } while (0)
if
(
EXTERNAL_MMXEXT
(
cpu_flags
))
{
init_ipred_func
(
tm
,
TM_VP8
,
4
,
BPC
,
mmxext
);
if
(
!
bitexact
)
{
init_itx_func_one
(
4
/* lossless */
,
iwht
,
iwht
,
4
,
BPC
,
mmxext
);
}
}
if
(
EXTERNAL_SSE2
(
cpu_flags
))
{
...
...
libavcodec/x86/vp9itxfm.asm
View file @
1c3be325
...
...
@@ -22,6 +22,7 @@
;******************************************************************************
%include
"libavutil/x86/x86util.asm"
%include
"vp9itxfm_template.asm"
SECTION_RODATA
...
...
@@ -164,21 +165,6 @@ SECTION .text
; void vp9_iwht_iwht_4x4_add_<opt>(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob);
;-------------------------------------------------------------------------------------------
; One 1-D pass of the 4-point IWHT over the word lanes held in m0-m3.
; Takes no macro arguments; m4 and m5 are overwritten as scratch.
; NOTE(review): SWAP is presumably x86inc's register-renaming macro (no
; instructions emitted), so an m<N> name may denote a different physical
; register before and after each SWAP - confirm against x86inc.asm.
; NOTE(review): the three-operand psubw forms are presumably expanded by
; x86inc's instruction wrappers into a copy plus a two-operand op - confirm.
%macro VP9_IWHT4_1D 0
    SWAP                 1, 2, 3
    paddw               m0, m2              ; m0 += m2
    psubw               m3, m1              ; m3 -= m1
    psubw               m4, m0, m3          ; m4  = m0 - m3
    psraw               m4, 1               ; m4 >>= 1 (arithmetic)
    psubw               m5, m4, m1          ; m5  = m4 - m1
    SWAP                 5, 1
    psubw               m4, m2              ; m4 -= m2
    SWAP                 4, 2
    psubw               m0, m1              ; m0 -= m1
    paddw               m3, m2              ; m3 += m2
    SWAP                 3, 2, 1
%endmacro
INIT_MMX
mmx
cglobal
vp9_iwht_iwht_4x4_add
,
3
,
3
,
0
,
dst
,
stride
,
block
,
eob
mova
m0
,
[
blockq
+
0
*
8
]
...
...
libavcodec/x86/vp9itxfm_16bpp.asm
0 → 100644
View file @
1c3be325
;******************************************************************************
;* VP9 inverse transform x86 SIMD optimizations
;*
;* Copyright (C) 2015 Ronald S. Bultje <rsbultje gmail com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
%include
"libavutil/x86/x86util.asm"
%include
"vp9itxfm_template.asm"
SECTION_RODATA
cextern
pw_1023
cextern
pw_4095
SECTION
.
text
; Add two rows of word residuals to the destination and store them clamped.
;   %1, %2 : register numbers holding the residuals for the two rows
;   %3, %4 : register numbers used as temporaries (overwritten)
;   %5     : register number holding the lower clamp bound (pmaxsw)
;   %6     : register number holding the upper clamp bound (pminsw)
;   %7     : destination address of the first row; optional, defaults to dstq
; The second row is addressed as [%7+strideq], so strideq must be defined
; where the macro is used.
%macro VP9_STORE_2X 6-7 dstq ; reg1, reg2, tmp1, tmp2, min, max, dst
    mova               m%3, [%7]            ; current pixels, row 0
    mova               m%4, [%7+strideq]    ; current pixels, row 1
    paddw              m%3, m%1             ; + residual
    paddw              m%4, m%2
    pmaxsw             m%3, m%5             ; clamp from below
    pmaxsw             m%4, m%5
    pminsw             m%3, m%6             ; clamp from above
    pminsw             m%4, m%6
    mova              [%7], m%3
    mova      [%7+strideq], m%4
%endmacro
; Fill a coefficient block with the contents of register %4.
;   %1 : base address of the block
;   %2 : distance in bytes between consecutive rows
;   %3 : number of rows; each row is %3*4 bytes wide
;   %4 : register to store (the visible caller passes a zeroed register)
; Fully unrolled at assembly time: %3 rows, each written with %3*4/mmsize
; stores of mmsize bytes.
%macro ZERO_BLOCK 4 ; mem, stride, nnzcpl, zero_reg
%assign %%y 0
%rep %3
%assign %%x 0
%rep %3*4/mmsize
    mova      [%1+%%y+%%x], %4
%assign %%x (%%x+mmsize)
%endrep
%assign %%y (%%y+%2)
%endrep
%endmacro
; The input coefficients are scaled up by 2 bits (which we downscale immediately
; in the iwht), and are otherwise orthonormally increased by 1 bit per iwht_1d.
; Therefore, a diff of 10-12+sign bits will fit in 12-14+sign bits after scaling,
; i.e. everything can be done in 15+1bpp words. Since the quant fractional bits
; add 2 bits, we need to scale before converting to words in 12bpp, since the
; input will be 16+sign bits, which doesn't fit in 15+sign words; in 10bpp,
; however, we can scale after converting to words (which is half the
; instructions), since the input is only 14+sign bits, which fits in 15+sign
; words directly.
; Private helper for IWHT4_FN: load rows %2 and %3 of the 4x4 block of dword
; coefficients at blockq (16 bytes per row) into m%2 and m%3 as packed words,
; shifting out the 2 fractional bits on the way.
;   %1     : bpp; selects where the downscale happens
;   %2, %3 : row index, also the destination register number
;   %4, %5 : temporary register numbers (overwritten only when bpp >= 12)
%macro IWHT4_LOAD_2ROWS 5 ; bpp, row_a, row_b, tmp_a, tmp_b
    mova               m%2, [blockq+%2*16+0]
    mova               m%3, [blockq+%3*16+0]
%if %1 >= 12
    ; shift while still dwords, then narrow to words
    mova               m%4, [blockq+%2*16+8]
    mova               m%5, [blockq+%3*16+8]
    psrad              m%2, 2
    psrad              m%3, 2
    psrad              m%4, 2
    psrad              m%5, 2
    packssdw           m%2, m%4
    packssdw           m%3, m%5
%else
    ; narrow to words straight from memory, then shift the words
    packssdw           m%2, [blockq+%2*16+8]
    packssdw           m%3, [blockq+%3*16+8]
    psraw              m%2, 2
    psraw              m%3, 2
%endif
%endmacro

; Emit vp9_iwht_iwht_4x4_add_<bpp> for the currently selected instruction set.
;   %1 : bpp, used in the function name and to pick the downscale strategy
;   %2 : maximum pixel value; the constant pw_<%2> is loaded as the upper
;        clamp bound
; Flow: load and downscale the four rows, run VP9_IWHT4_1D, transpose, run
; VP9_IWHT4_1D again, add the result to dst with clamping to [0, pw_<%2>],
; then overwrite the coefficient block with zeros.
%macro IWHT4_FN 2 ; bpp, max
cglobal vp9_iwht_iwht_4x4_add_%1, 3, 3, 8, dst, stride, block, eob
    mova                m7, [pw_%2]         ; upper clamp bound
    IWHT4_LOAD_2ROWS    %1, 0, 1, 4, 5
    IWHT4_LOAD_2ROWS    %1, 2, 3, 4, 5

    VP9_IWHT4_1D
    TRANSPOSE4x4W        0, 1, 2, 3, 4
    VP9_IWHT4_1D

    pxor                m6, m6              ; zero: lower clamp bound and fill value
    VP9_STORE_2X         0, 1, 4, 5, 6, 7
    lea               dstq, [dstq+strideq*2]
    VP9_STORE_2X         2, 3, 4, 5, 6, 7
    ZERO_BLOCK      blockq, 16, 4, m6
    RET
%endmacro

INIT_MMX mmxext
IWHT4_FN 10, 1023
INIT_MMX mmxext
IWHT4_FN 12, 4095
libavcodec/x86/vp9itxfm_template.asm
0 → 100644
View file @
1c3be325
;******************************************************************************
;* VP9 IDCT SIMD optimizations
;*
;* Copyright (C) 2013 Clément Bœsch <u pkh me>
;* Copyright (C) 2013 Ronald S. Bultje <rsbultje gmail com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
; One 1-D pass of the 4-point IWHT over the word lanes held in m0-m3.
; Takes no macro arguments; m4 and m5 are overwritten as scratch.
; NOTE(review): SWAP is presumably x86inc's register-renaming macro (no
; instructions emitted), so an m<N> name may denote a different physical
; register before and after each SWAP - confirm against x86inc.asm.
; NOTE(review): the three-operand psubw forms are presumably expanded by
; x86inc's instruction wrappers into a copy plus a two-operand op - confirm.
%macro VP9_IWHT4_1D 0
    SWAP                 1, 2, 3
    paddw               m0, m2              ; m0 += m2
    psubw               m3, m1              ; m3 -= m1
    psubw               m4, m0, m3          ; m4  = m0 - m3
    psraw               m4, 1               ; m4 >>= 1 (arithmetic)
    psubw               m5, m4, m1          ; m5  = m4 - m1
    SWAP                 5, 1
    psubw               m4, m2              ; m4 -= m2
    SWAP                 4, 2
    psubw               m0, m1              ; m0 -= m1
    paddw               m3, m2              ; m3 += m2
    SWAP                 3, 2, 1
%endmacro
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment