Skip to content
Projects
Groups
Snippets
Help
Loading...
Sign in / Register
Toggle navigation
F
ffmpeg
Project
Project
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Packages
Packages
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
submodule
ffmpeg
Commits
e7078e84
Commit
e7078e84
authored
Jul 24, 2015
by
Anton Khirnov
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
hevcdsp: add x86 SIMD for MC
parent
0cef06df
Hide whitespace changes
Inline
Side-by-side
Showing
8 changed files
with
1125 additions
and
15 deletions
+1125
-15
hevc.c
libavcodec/hevc.c
+3
-3
hevc.h
libavcodec/hevc.h
+1
-1
hevcdsp.c
libavcodec/hevcdsp.c
+23
-1
hevcdsp.h
libavcodec/hevcdsp.h
+4
-1
hevcdsp_template.c
libavcodec/hevcdsp_template.c
+4
-4
Makefile
libavcodec/x86/Makefile
+2
-1
hevc_mc.asm
libavcodec/x86/hevc_mc.asm
+851
-0
hevcdsp_init.c
libavcodec/x86/hevcdsp_init.c
+237
-4
No files found.
libavcodec/hevc.c
View file @
e7078e84
...
@@ -38,9 +38,9 @@
...
@@ -38,9 +38,9 @@
#include "golomb.h"
#include "golomb.h"
#include "hevc.h"
#include "hevc.h"
const
uint8_t
ff_hevc_qpel_extra_before
[
4
]
=
{
0
,
3
,
3
,
2
};
const
uint8_t
ff_hevc_qpel_extra_before
[
4
]
=
{
0
,
3
,
3
,
3
};
const
uint8_t
ff_hevc_qpel_extra_after
[
4
]
=
{
0
,
3
,
4
,
4
};
const
uint8_t
ff_hevc_qpel_extra_after
[
4
]
=
{
0
,
4
,
4
,
4
};
const
uint8_t
ff_hevc_qpel_extra
[
4
]
=
{
0
,
6
,
7
,
6
};
const
uint8_t
ff_hevc_qpel_extra
[
4
]
=
{
0
,
7
,
7
,
7
};
static
const
uint8_t
scan_1x1
[
1
]
=
{
0
};
static
const
uint8_t
scan_1x1
[
1
]
=
{
0
};
...
...
libavcodec/hevc.h
View file @
e7078e84
...
@@ -740,7 +740,7 @@ typedef struct HEVCPredContext {
...
@@ -740,7 +740,7 @@ typedef struct HEVCPredContext {
}
HEVCPredContext
;
}
HEVCPredContext
;
typedef
struct
HEVCLocalContext
{
typedef
struct
HEVCLocalContext
{
DECLARE_ALIGNED
(
16
,
int16_t
,
mc_buffer
[(
MAX_PB_SIZE
+
7
)
*
MAX_PB_SIZE
]);
DECLARE_ALIGNED
(
16
,
int16_t
,
mc_buffer
[(
MAX_PB_SIZE
+
24
)
*
MAX_PB_SIZE
]);
uint8_t
cabac_state
[
HEVC_CONTEXTS
];
uint8_t
cabac_state
[
HEVC_CONTEXTS
];
uint8_t
first_qp_group
;
uint8_t
first_qp_group
;
...
...
libavcodec/hevcdsp.c
View file @
e7078e84
...
@@ -89,7 +89,7 @@ static const int8_t transform[32][32] = {
...
@@ -89,7 +89,7 @@ static const int8_t transform[32][32] = {
90
,
-
90
,
88
,
-
85
,
82
,
-
78
,
73
,
-
67
,
61
,
-
54
,
46
,
-
38
,
31
,
-
22
,
13
,
-
4
},
90
,
-
90
,
88
,
-
85
,
82
,
-
78
,
73
,
-
67
,
61
,
-
54
,
46
,
-
38
,
31
,
-
22
,
13
,
-
4
},
};
};
DECLARE_ALIGNED
(
16
,
const
int
8_t
,
ff_hevc_epel_filter
s
[
7
][
16
])
=
{
DECLARE_ALIGNED
(
16
,
const
int
16_t
,
ff_hevc_epel_coeff
s
[
7
][
16
])
=
{
{
-
2
,
58
,
10
,
-
2
,
-
2
,
58
,
10
,
-
2
,
-
2
,
58
,
10
,
-
2
,
-
2
,
58
,
10
,
-
2
},
{
-
2
,
58
,
10
,
-
2
,
-
2
,
58
,
10
,
-
2
,
-
2
,
58
,
10
,
-
2
,
-
2
,
58
,
10
,
-
2
},
{
-
4
,
54
,
16
,
-
2
,
-
4
,
54
,
16
,
-
2
,
-
4
,
54
,
16
,
-
2
,
-
4
,
54
,
16
,
-
2
},
{
-
4
,
54
,
16
,
-
2
,
-
4
,
54
,
16
,
-
2
,
-
4
,
54
,
16
,
-
2
,
-
4
,
54
,
16
,
-
2
},
{
-
6
,
46
,
28
,
-
4
,
-
6
,
46
,
28
,
-
4
,
-
6
,
46
,
28
,
-
4
,
-
6
,
46
,
28
,
-
4
},
{
-
6
,
46
,
28
,
-
4
,
-
6
,
46
,
28
,
-
4
,
-
6
,
46
,
28
,
-
4
,
-
6
,
46
,
28
,
-
4
},
...
@@ -99,6 +99,28 @@ DECLARE_ALIGNED(16, const int8_t, ff_hevc_epel_filters[7][16]) = {
...
@@ -99,6 +99,28 @@ DECLARE_ALIGNED(16, const int8_t, ff_hevc_epel_filters[7][16]) = {
{
-
2
,
10
,
58
,
-
2
,
-
2
,
10
,
58
,
-
2
,
-
2
,
10
,
58
,
-
2
,
-
2
,
10
,
58
,
-
2
},
{
-
2
,
10
,
58
,
-
2
,
-
2
,
10
,
58
,
-
2
,
-
2
,
10
,
58
,
-
2
,
-
2
,
10
,
58
,
-
2
},
};
};
DECLARE_ALIGNED
(
16
,
const
int8_t
,
ff_hevc_epel_coeffs8
[
7
][
16
])
=
{
{
-
2
,
58
,
10
,
-
2
,
-
2
,
58
,
10
,
-
2
,
-
2
,
58
,
10
,
-
2
,
-
2
,
58
,
10
,
-
2
},
{
-
4
,
54
,
16
,
-
2
,
-
4
,
54
,
16
,
-
2
,
-
4
,
54
,
16
,
-
2
,
-
4
,
54
,
16
,
-
2
},
{
-
6
,
46
,
28
,
-
4
,
-
6
,
46
,
28
,
-
4
,
-
6
,
46
,
28
,
-
4
,
-
6
,
46
,
28
,
-
4
},
{
-
4
,
36
,
36
,
-
4
,
-
4
,
36
,
36
,
-
4
,
-
4
,
36
,
36
,
-
4
,
-
4
,
36
,
36
,
-
4
},
{
-
4
,
28
,
46
,
-
6
,
-
4
,
28
,
46
,
-
6
,
-
4
,
28
,
46
,
-
6
,
-
4
,
28
,
46
,
-
6
},
{
-
2
,
16
,
54
,
-
4
,
-
2
,
16
,
54
,
-
4
,
-
2
,
16
,
54
,
-
4
,
-
2
,
16
,
54
,
-
4
},
{
-
2
,
10
,
58
,
-
2
,
-
2
,
10
,
58
,
-
2
,
-
2
,
10
,
58
,
-
2
,
-
2
,
10
,
58
,
-
2
},
};
DECLARE_ALIGNED
(
16
,
const
int16_t
,
ff_hevc_qpel_coeffs
[
3
][
8
])
=
{
{
-
1
,
4
,
-
10
,
58
,
17
,
-
5
,
1
,
0
},
{
-
1
,
4
,
-
11
,
40
,
40
,
-
11
,
4
,
-
1
},
{
0
,
1
,
-
5
,
17
,
58
,
-
10
,
4
,
-
1
},
};
DECLARE_ALIGNED
(
16
,
const
int8_t
,
ff_hevc_qpel_coeffs8
[
3
][
16
])
=
{
{
-
1
,
4
,
-
10
,
58
,
17
,
-
5
,
1
,
0
,
-
1
,
4
,
-
10
,
58
,
17
,
-
5
,
1
,
0
},
{
-
1
,
4
,
-
11
,
40
,
40
,
-
11
,
4
,
-
1
,
-
1
,
4
,
-
11
,
40
,
40
,
-
11
,
4
,
-
1
},
{
0
,
1
,
-
5
,
17
,
58
,
-
10
,
4
,
-
1
,
0
,
1
,
-
5
,
17
,
58
,
-
10
,
4
,
-
1
},
};
#define BIT_DEPTH 8
#define BIT_DEPTH 8
#include "hevcdsp_template.c"
#include "hevcdsp_template.c"
#undef BIT_DEPTH
#undef BIT_DEPTH
...
...
libavcodec/hevcdsp.h
View file @
e7078e84
...
@@ -118,6 +118,9 @@ void ff_hevc_dsp_init(HEVCDSPContext *hpc, int bit_depth);
...
@@ -118,6 +118,9 @@ void ff_hevc_dsp_init(HEVCDSPContext *hpc, int bit_depth);
void
ff_hevc_dsp_init_x86
(
HEVCDSPContext
*
c
,
const
int
bit_depth
);
void
ff_hevc_dsp_init_x86
(
HEVCDSPContext
*
c
,
const
int
bit_depth
);
extern
const
int8_t
ff_hevc_epel_filters
[
7
][
16
];
extern
const
int16_t
ff_hevc_epel_coeffs
[
7
][
16
];
extern
const
int8_t
ff_hevc_epel_coeffs8
[
7
][
16
];
extern
const
int16_t
ff_hevc_qpel_coeffs
[
3
][
8
];
extern
const
int8_t
ff_hevc_qpel_coeffs8
[
3
][
16
];
#endif
/* AVCODEC_HEVCDSP_H */
#endif
/* AVCODEC_HEVCDSP_H */
libavcodec/hevcdsp_template.c
View file @
e7078e84
...
@@ -1018,7 +1018,7 @@ static inline void FUNC(put_hevc_epel_h)(int16_t *dst, ptrdiff_t dststride,
...
@@ -1018,7 +1018,7 @@ static inline void FUNC(put_hevc_epel_h)(int16_t *dst, ptrdiff_t dststride,
int
x
,
y
;
int
x
,
y
;
pixel
*
src
=
(
pixel
*
)
_src
;
pixel
*
src
=
(
pixel
*
)
_src
;
ptrdiff_t
srcstride
=
_srcstride
/
sizeof
(
pixel
);
ptrdiff_t
srcstride
=
_srcstride
/
sizeof
(
pixel
);
const
int
8_t
*
filter
=
ff_hevc_epel_filter
s
[
mx
-
1
];
const
int
16_t
*
filter
=
ff_hevc_epel_coeff
s
[
mx
-
1
];
int8_t
filter_0
=
filter
[
0
];
int8_t
filter_0
=
filter
[
0
];
int8_t
filter_1
=
filter
[
1
];
int8_t
filter_1
=
filter
[
1
];
int8_t
filter_2
=
filter
[
2
];
int8_t
filter_2
=
filter
[
2
];
...
@@ -1040,7 +1040,7 @@ static inline void FUNC(put_hevc_epel_v)(int16_t *dst, ptrdiff_t dststride,
...
@@ -1040,7 +1040,7 @@ static inline void FUNC(put_hevc_epel_v)(int16_t *dst, ptrdiff_t dststride,
int
x
,
y
;
int
x
,
y
;
pixel
*
src
=
(
pixel
*
)
_src
;
pixel
*
src
=
(
pixel
*
)
_src
;
ptrdiff_t
srcstride
=
_srcstride
/
sizeof
(
pixel
);
ptrdiff_t
srcstride
=
_srcstride
/
sizeof
(
pixel
);
const
int
8_t
*
filter
=
ff_hevc_epel_filter
s
[
my
-
1
];
const
int
16_t
*
filter
=
ff_hevc_epel_coeff
s
[
my
-
1
];
int8_t
filter_0
=
filter
[
0
];
int8_t
filter_0
=
filter
[
0
];
int8_t
filter_1
=
filter
[
1
];
int8_t
filter_1
=
filter
[
1
];
int8_t
filter_2
=
filter
[
2
];
int8_t
filter_2
=
filter
[
2
];
...
@@ -1063,8 +1063,8 @@ static inline void FUNC(put_hevc_epel_hv)(int16_t *dst, ptrdiff_t dststride,
...
@@ -1063,8 +1063,8 @@ static inline void FUNC(put_hevc_epel_hv)(int16_t *dst, ptrdiff_t dststride,
int
x
,
y
;
int
x
,
y
;
pixel
*
src
=
(
pixel
*
)
_src
;
pixel
*
src
=
(
pixel
*
)
_src
;
ptrdiff_t
srcstride
=
_srcstride
/
sizeof
(
pixel
);
ptrdiff_t
srcstride
=
_srcstride
/
sizeof
(
pixel
);
const
int
8_t
*
filter_h
=
ff_hevc_epel_filter
s
[
mx
-
1
];
const
int
16_t
*
filter_h
=
ff_hevc_epel_coeff
s
[
mx
-
1
];
const
int
8_t
*
filter_v
=
ff_hevc_epel_filter
s
[
my
-
1
];
const
int
16_t
*
filter_v
=
ff_hevc_epel_coeff
s
[
my
-
1
];
int8_t
filter_0
=
filter_h
[
0
];
int8_t
filter_0
=
filter_h
[
0
];
int8_t
filter_1
=
filter_h
[
1
];
int8_t
filter_1
=
filter_h
[
1
];
int8_t
filter_2
=
filter_h
[
2
];
int8_t
filter_2
=
filter_h
[
2
];
...
...
libavcodec/x86/Makefile
View file @
e7078e84
...
@@ -113,7 +113,8 @@ YASM-OBJS-$(CONFIG_VP8DSP) += x86/vp8dsp.o \
...
@@ -113,7 +113,8 @@ YASM-OBJS-$(CONFIG_VP8DSP) += x86/vp8dsp.o \
YASM-OBJS-$(CONFIG_AAC_DECODER)
+=
x86/sbrdsp.o
YASM-OBJS-$(CONFIG_AAC_DECODER)
+=
x86/sbrdsp.o
YASM-OBJS-$(CONFIG_APE_DECODER)
+=
x86/apedsp.o
YASM-OBJS-$(CONFIG_APE_DECODER)
+=
x86/apedsp.o
YASM-OBJS-$(CONFIG_DCA_DECODER)
+=
x86/dcadsp.o
YASM-OBJS-$(CONFIG_DCA_DECODER)
+=
x86/dcadsp.o
YASM-OBJS-$(CONFIG_HEVC_DECODER)
+=
x86/hevc_deblock.o
YASM-OBJS-$(CONFIG_HEVC_DECODER)
+=
x86/hevc_deblock.o
\
x86/hevc_mc.o
YASM-OBJS-$(CONFIG_PNG_DECODER)
+=
x86/pngdsp.o
YASM-OBJS-$(CONFIG_PNG_DECODER)
+=
x86/pngdsp.o
YASM-OBJS-$(CONFIG_PRORES_DECODER)
+=
x86/proresdsp.o
YASM-OBJS-$(CONFIG_PRORES_DECODER)
+=
x86/proresdsp.o
YASM-OBJS-$(CONFIG_RV40_DECODER)
+=
x86/rv40dsp.o
YASM-OBJS-$(CONFIG_RV40_DECODER)
+=
x86/rv40dsp.o
...
...
libavcodec/x86/hevc_mc.asm
0 → 100644
View file @
e7078e84
;*****************************************************************************
;* x86-optimized HEVC MC
;* Copyright 2015 Anton Khirnov
;*
;* This file is part of Libav.
;*
;* Libav is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* Libav is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with Libav; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
%include
"libavutil/x86/x86util.asm"
SECTION
.
rodata
pw_1023
:
times
8
dw
1023
cextern
hevc_qpel_coeffs
cextern
hevc_qpel_coeffs8
cextern
hevc_epel_coeffs
cextern
hevc_epel_coeffs8
cextern
pw_8
cextern
pw_16
cextern
pw_32
cextern
pw_64
SECTION
.
text
; COMMON_DEFS: per-function constants shared by the MC kernels below.
; %1: width
; %2: bit depth
;
; Defines:
;   blocksize            - samples handled per full block (8)
;   nb_blocks            - ceil(width / blocksize)
;   last_block_truncated - nonzero when the width is not a multiple of blocksize
;   pixelsize            - source bytes per sample (1 for <= 8 bits, 2 otherwise)
;   LOAD_BLOCK/LOAD_HALFBLOCK   - moves used to read a full/half block of input
;   STORE_BLOCK/STORE_HALFBLOCK - moves used to write a full/half block of
;                                 16-bit output (mova: destination rows must be
;                                 16-byte aligned)
%macro COMMON_DEFS 2
    %assign blocksize 8
    %assign nb_blocks ((%1 + blocksize - 1) / blocksize)
    %define last_block_truncated (blocksize * nb_blocks > %1)
    %if %2 > 8
        %define LOAD_BLOCK     movu
        %define LOAD_HALFBLOCK movq
        %assign pixelsize      2
    %else
        %define LOAD_BLOCK     movq
        %define LOAD_HALFBLOCK movd
        %assign pixelsize      1
    %endif
    %define STORE_BLOCK     mova
    %define STORE_HALFBLOCK movq
%endmacro
; BLOCK_DEFS: select the load/store flavour for one block of a row.
; %1: block index
;
; Requires COMMON_DEFS to have been expanded first. The last block of a row
; whose width is not a multiple of blocksize uses the half-block moves and sets
; block_truncated to 1; every other block uses the full-block moves.
%macro BLOCK_DEFS 1
    %if last_block_truncated && %1 == nb_blocks - 1
        %define block_truncated 1
        %define LOAD  LOAD_HALFBLOCK
        %define STORE STORE_HALFBLOCK
    %else
        %define block_truncated 0
        %define LOAD  LOAD_BLOCK
        %define STORE STORE_BLOCK
    %endif
%endmacro
; hevc_get_pixels_<w>_<d>(int16_t *dst, ptrdiff_t dststride,
;                         pixel   *src, ptrdiff_t srcstride,
;                         int height, int mx, int my, int *mcbuffer)
;
; Copies a block of pixels into the 16-bit intermediate buffer, left-shifting
; every sample by (14 - bit depth). 8-bit input is zero-extended to words first.
;
; %1: block width
; %2: bit depth
; %3: log2 of height unroll
;
; NOTE(review): height is shifted right by %3 before the loop, and each loop
; iteration emits (1 << %3) rows, so this assumes height is a nonzero multiple
; of (1 << %3) -- confirm against the callers.
%macro GET_PIXELS 3
cglobal hevc_get_pixels_ %+ %1 %+ _ %+ %2, 5, 5, 2, dst, dststride, src, srcstride, height ; rest of the args unused

    %assign shift 14 - %2
    COMMON_DEFS %1, %2

%if pixelsize == 1
    pxor      m0, m0                    ; zero register for byte -> word unpack
%endif

    shr       heightd, %3               ; loop counter in units of unrolled rows

.loop:

%assign i 0
%rep (1 << %3)                          ; unrolled rows

%assign j 0
%rep nb_blocks                          ; blocks within one row

    BLOCK_DEFS j

    LOAD      m1, [srcq + j * pixelsize * blocksize]
%if pixelsize == 1
    punpcklbw m1, m0
%endif
    psllw     m1, shift
    STORE     [dstq + j * 2 * blocksize], m1

%assign j (j + 1)
%endrep

    add       dstq, dststrideq
    add       srcq, srcstrideq

%assign i (i + 1)
%endrep

    dec       heightd
    jg        .loop
    RET
%endmacro

INIT_XMM sse2
GET_PIXELS 4,  8, 1
GET_PIXELS 8,  8, 1
GET_PIXELS 12, 8, 3
GET_PIXELS 16, 8, 2
GET_PIXELS 24, 8, 3
GET_PIXELS 32, 8, 3
GET_PIXELS 48, 8, 3
GET_PIXELS 64, 8, 3

GET_PIXELS 4,  10, 1
GET_PIXELS 8,  10, 1
GET_PIXELS 12, 10, 3
GET_PIXELS 16, 10, 2
GET_PIXELS 24, 10, 3
GET_PIXELS 32, 10, 3
GET_PIXELS 48, 10, 3
GET_PIXELS 64, 10, 3
; hevc_qpel_h/v_<w>_8(int16_t *dst, ptrdiff_t dststride,
;                     uint8_t *src, ptrdiff_t srcstride,
;                     int height, int mx, int my, int *mcbuffer)

; 8-bit qpel interpolation
; %1: block width
; %2: 0 - horizontal; 1 - vertical
;
; The 8-tap filter is evaluated as four byte pairs, each multiplied with one
; splatted coefficient pair by pmaddubsw and accumulated with paddsw:
;   (src-3, src-2)*m0 + (src-1, src)*m1 + (src+1, src+2)*m2 + (src+3, src+4)*m3
; where the offsets are in units of pixstride (1 byte horizontally, one source
; row vertically).
;
; The table row is selected with ((mv & 3) - 1) * 16.
; NOTE(review): a zero fraction would index one row before hevc_qpel_coeffs8;
; presumably callers only dispatch here for a nonzero fraction -- confirm.
;
; Register reuse (deliberate): the register holding the unused mv component
; serves as the table address, and in the vertical variant r5/r6 are recycled
; as 3*stride and src - 3*stride once the coefficients have been loaded.
%macro QPEL_8 2
%if %2
    %define postfix    v
    %define mvfrac     myq
    %define coeffsaddr r5q
    %define pixstride  srcstrideq
    %define pixstride3 r5q
    %define src_m3     r6q
%else
    %define postfix    h
    %define mvfrac     mxq
    %define coeffsaddr r6q
    %define pixstride  1
    %define pixstride3 3
    %define src_m3     (srcq - 3)
%endif

    COMMON_DEFS %1, 8

cglobal hevc_qpel_ %+ postfix %+ _ %+ %1 %+ _8, 7, 7, 7, dst, dststride, src, srcstride, height, mx, my
    and       mvfrac, 0x3
    dec       mvfrac
    shl       mvfrac, 4
    lea       coeffsaddr, [hevc_qpel_coeffs8]
    mova      m0, [coeffsaddr + mvfrac]

    SPLATW    m1, m0, 1
    SPLATW    m2, m0, 2
    SPLATW    m3, m0, 3
    SPLATW    m0, m0, 0

%if %2
    lea       pixstride3, [srcstrideq + 2 * srcstrideq]
    mov       src_m3, srcq
    sub       src_m3, pixstride3
%endif

.loop

%assign i 0
%rep nb_blocks

    BLOCK_DEFS i

    ; taps 0 and 1
    LOAD      m4, [src_m3 + i * blocksize]
    LOAD      m5, [src_m3 + i * blocksize + 1 * pixstride]
    punpcklbw m4, m5
    pmaddubsw m4, m0

    ; taps 2 and 3
    LOAD      m5, [src_m3 + i * blocksize + 2 * pixstride]
    LOAD      m6, [srcq + i * blocksize]
    punpcklbw m5, m6
    pmaddubsw m5, m1
    paddsw    m4, m5

    ; taps 4 and 5
    LOAD      m5, [srcq + i * blocksize + 1 * pixstride]
    LOAD      m6, [srcq + i * blocksize + 2 * pixstride]
    punpcklbw m5, m6
    pmaddubsw m5, m2
    paddsw    m4, m5

    ; taps 6 and 7
    LOAD      m5, [srcq + i * blocksize + pixstride3]
    LOAD      m6, [srcq + i * blocksize + 4 * pixstride]
    punpcklbw m5, m6
    pmaddubsw m5, m3
    paddsw    m4, m5

    STORE     [dstq + i * 2 * blocksize], m4

%assign i (i + 1)
%endrep

    add       dstq, dststrideq
    add       srcq, srcstrideq
%if %2
    add       src_m3, srcstrideq
%endif

    dec       heightd
    jg        .loop
    RET
%endmacro

INIT_XMM ssse3
QPEL_8 4,  0
QPEL_8 8,  0
QPEL_8 12, 0
QPEL_8 16, 0
QPEL_8 24, 0
QPEL_8 32, 0
QPEL_8 48, 0
QPEL_8 64, 0

QPEL_8 4,  1
QPEL_8 8,  1
QPEL_8 12, 1
QPEL_8 16, 1
QPEL_8 24, 1
QPEL_8 32, 1
QPEL_8 48, 1
QPEL_8 64, 1
; 16-bit qpel interpolation
; %1: block width
; %2: shift applied to the result
; %3: 0 - horizontal; 1 - vertical
;
; Function body only: the expanding macro must issue cglobal first and provide
; the named registers used here (mx/my, coeffsreg, and for the vertical variant
; sstride3 and srcm3). Uses m0-m13 and three-operand punpcklwd, so it is only
; instantiated for AVX on x86-64 below.
;
; Eight 16-bit taps are applied as four word pairs via pmaddwd; the 32-bit sums
; are shifted right by %2 and packed back to words with signed saturation.
; For a truncated last block only the low four results are computed; the high
; half of the packed register is not meaningful and is not stored.
%macro QPEL_16 3
%if %3
    %define mvfrac     myq
    %define pixstride  srcstrideq
    %define pixstride3 sstride3q
    %define src_m3     srcm3q
%else
    %define mvfrac     mxq
    %define pixstride  2
    %define pixstride3 6
    %define src_m3     (srcq - 6)
%endif

    COMMON_DEFS %1, 16

    and       mvfrac, 0x3
    dec       mvfrac
    shl       mvfrac, 4
    lea       coeffsregq, [hevc_qpel_coeffs]
    mova      m0, [coeffsregq + mvfrac]

    ; broadcast each coefficient pair to a full register
    pshufd    m1, m0, 0x55
    pshufd    m2, m0, 0xaa
    pshufd    m3, m0, 0xff
    pshufd    m0, m0, 0x00

%if %3
    lea       sstride3q, [srcstrideq + 2 * srcstrideq]
    mov       srcm3q, srcq
    sub       srcm3q, sstride3q
%endif

.loop

%assign i 0
%rep nb_blocks

    BLOCK_DEFS i

    LOAD      m4,  [src_m3 + i * 2 * blocksize]
    LOAD      m5,  [src_m3 + i * 2 * blocksize + 1 * pixstride]
    LOAD      m6,  [src_m3 + i * 2 * blocksize + 2 * pixstride]
    LOAD      m7,  [srcq + i * 2 * blocksize + 0 * pixstride]
    LOAD      m8,  [srcq + i * 2 * blocksize + 1 * pixstride]
    LOAD      m9,  [srcq + i * 2 * blocksize + 2 * pixstride]
    LOAD      m10, [srcq + i * 2 * blocksize + pixstride3]
    LOAD      m11, [srcq + i * 2 * blocksize + 4 * pixstride]

    ; low four samples
    punpcklwd m12, m4, m5
    pmaddwd   m12, m0

    punpcklwd m13, m6, m7
    pmaddwd   m13, m1
    paddd     m12, m13

    punpcklwd m13, m8, m9
    pmaddwd   m13, m2
    paddd     m12, m13

    punpcklwd m13, m10, m11
    pmaddwd   m13, m3
    paddd     m12, m13
    psrad     m12, %2

    ; high four samples, skipped for a truncated block
    %if block_truncated == 0
        punpckhwd m4, m5
        pmaddwd   m4, m0

        punpckhwd m6, m7
        pmaddwd   m6, m1
        paddd     m4, m6

        punpckhwd m8, m9
        pmaddwd   m8, m2
        paddd     m4, m8

        punpckhwd m10, m11
        pmaddwd   m10, m3
        paddd     m4, m10

        psrad     m4, %2
    %endif
    packssdw  m12, m4
    STORE     [dstq + i * 2 * blocksize], m12

%assign i (i + 1)
%endrep

    add       dstq, dststrideq
    add       srcq, srcstrideq
%if %3
    add       srcm3q, srcstrideq
%endif

    dec       heightd
    jg        .loop
    RET
%endmacro
; 16-bit qpel entry points. QPEL_16 needs 14 vector registers, hence x86-64 only.
%if ARCH_X86_64

; hevc_qpel_h_<w>_10: horizontal pass over 10-bit pixels, result shifted by 2.
; %1: block width
%macro QPEL_H_10 1
cglobal hevc_qpel_h_ %+ %1 %+ _10, 7, 9, 14, dst, dststride, src, srcstride, height, mx, my, mcbuffer, coeffsreg
QPEL_16 %1, 2, 0
%endmacro

INIT_XMM avx
QPEL_H_10 4
QPEL_H_10 8
QPEL_H_10 12
QPEL_H_10 16
QPEL_H_10 24
QPEL_H_10 32
QPEL_H_10 48
QPEL_H_10 64

; hevc_qpel_v_<w>_10: vertical pass over 10-bit pixels, result shifted by 2.
; %1: block width
%macro QPEL_V_10 1
cglobal hevc_qpel_v_ %+ %1 %+ _10, 7, 10, 14, dst, dststride, src, srcstride, height, mx, my, sstride3, srcm3, coeffsreg
QPEL_16 %1, 2, 1
%endmacro

INIT_XMM avx
QPEL_V_10 4
QPEL_V_10 8
QPEL_V_10 12
QPEL_V_10 16
QPEL_V_10 24
QPEL_V_10 32
QPEL_V_10 48
QPEL_V_10 64

; hevc_qpel_hv_<w>(int16_t *dst, ptrdiff_t dststride,
;                  uint8_t *src, ptrdiff_t srcstride,
;                  int height, int mx, int my, int *mcbuffer)
; Vertical pass over 16-bit samples, result shifted by 6. The body is QPEL_16,
; so src is read as 16-bit data despite the uint8_t in the prototype above.
; %1: block width
%macro QPEL_HV 1
cglobal hevc_qpel_hv_ %+ %1, 7, 10, 14, dst, dststride, src, srcstride, height, mx, my, sstride3, srcm3, coeffsreg
QPEL_16 %1, 6, 1
%endmacro

INIT_XMM avx
QPEL_HV 4
QPEL_HV 8
QPEL_HV 12
QPEL_HV 16
QPEL_HV 24
QPEL_HV 32
QPEL_HV 48
QPEL_HV 64

%endif ; ARCH_X86_64
; hevc_epel_h/v_<w>_8(int16_t *dst, ptrdiff_t dststride,
;                     uint8_t *src, ptrdiff_t srcstride,
;                     int height, int mx, int my, int *mcbuffer)

; 8-bit epel interpolation
; %1: block width
; %2: 0 - horizontal; 1 - vertical
;
; The 4-tap filter is evaluated as two byte pairs:
;   (src-1, src)*m0 + (src+1, src+2)*m1
; in units of pixstride; srcq is moved back by one pixstride before the loop so
; the loads use offsets 0..3.
;
; The table row is selected with ((mv & 7) - 1) * 16.
; NOTE(review): a zero fraction would index one row before hevc_epel_coeffs8;
; presumably callers only dispatch here for a nonzero fraction -- confirm.
%macro EPEL_8 2
%if %2
    %define postfix    v
    %define mvfrac     myq
    %define coeffsaddr r5q
    %define pixstride  srcstrideq
    %define pixstride3 r5q
%else
    %define postfix    h
    %define mvfrac     mxq
    %define coeffsaddr r6q
    %define pixstride  1
    %define pixstride3 3
%endif

    COMMON_DEFS %1, 8

cglobal hevc_epel_ %+ postfix %+ _ %+ %1 %+ _8, 7, 7, 6, dst, dststride, src, srcstride, height, mx, my
    and       mvfrac, 0x7
    dec       mvfrac
    shl       mvfrac, 4
    lea       coeffsaddr, [hevc_epel_coeffs8]
    movq      m0, [coeffsaddr + mvfrac]

    SPLATW    m1, m0, 1
    SPLATW    m0, m0, 0

%if %2
    ; r5 held the table address until the load above; reuse it as 3 * stride
    lea       pixstride3, [srcstrideq + 2 * srcstrideq]
%endif
    sub       srcq, pixstride

.loop

%assign i 0
%rep nb_blocks

    BLOCK_DEFS i

    LOAD      m2, [srcq + i * blocksize + 0 * pixstride]
    LOAD      m3, [srcq + i * blocksize + 1 * pixstride]
    LOAD      m4, [srcq + i * blocksize + 2 * pixstride]
    LOAD      m5, [srcq + i * blocksize + pixstride3]
    punpcklbw m2, m3
    punpcklbw m4, m5

    pmaddubsw m2, m0
    pmaddubsw m4, m1

    paddsw    m2, m4

    STORE     [dstq + i * 2 * blocksize], m2

%assign i (i + 1)
%endrep

    add       dstq, dststrideq
    add       srcq, srcstrideq

    dec       heightd
    jg        .loop
    RET
%endmacro

INIT_XMM ssse3
EPEL_8 4,  0
EPEL_8 8,  0
EPEL_8 12, 0
EPEL_8 16, 0
EPEL_8 24, 0
EPEL_8 32, 0

EPEL_8 4,  1
EPEL_8 8,  1
EPEL_8 12, 1
EPEL_8 16, 1
EPEL_8 24, 1
EPEL_8 32, 1
; 16-bit epel interpolation
; %1: block width
; %2: shift applied to the result
; %3: 0 - horizontal; 1 - vertical
;
; Function body only: the expanding macro must issue cglobal first and provide
; the named registers used here (mx/my, coeffsreg, and sstride3 for the
; vertical variant).
;
; Four 16-bit taps are applied as two word pairs via pmaddwd; the 32-bit sums
; are shifted right by %2 and packed to words with signed saturation. Rows of
; hevc_epel_coeffs are 32 bytes, hence the shift by 5 when forming the offset.
%macro EPEL_16 3
%if %3
    %define mvfrac     myq
    %define pixstride  srcstrideq
    %define pixstride3 sstride3q
%else
    %define mvfrac     mxq
    %define pixstride  2
    %define pixstride3 6
%endif

    COMMON_DEFS %1, 16

    and       mvfrac, 0x7
    dec       mvfrac
    shl       mvfrac, 5
    lea       coeffsregq, [hevc_epel_coeffs]
    mova      m0, [coeffsregq + mvfrac]

    pshufd    m1, m0, 0x55
    pshufd    m0, m0, 0x00

%if %3
    lea       sstride3q, [srcstrideq + 2 * srcstrideq]
%endif
    sub       srcq, pixstride

.loop

%assign i 0
%rep nb_blocks

    BLOCK_DEFS i

    LOAD      m2, [srcq + i * 2 * blocksize + 0 * pixstride]
    LOAD      m3, [srcq + i * 2 * blocksize + 1 * pixstride]
    LOAD      m4, [srcq + i * 2 * blocksize + 2 * pixstride]
    LOAD      m5, [srcq + i * 2 * blocksize + pixstride3]

    ; low four samples
    punpcklwd m6, m2, m3
    punpcklwd m7, m4, m5
    pmaddwd   m6, m0
    pmaddwd   m7, m1
    paddd     m6, m7
    psrad     m6, %2

    ; high four samples, skipped for a truncated block
    %if block_truncated == 0
        punpckhwd m2, m3
        punpckhwd m4, m5
        pmaddwd   m2, m0
        pmaddwd   m4, m1
        paddd     m2, m4
        psrad     m2, %2
    %endif
    packssdw  m6, m2
    STORE     [dstq + i * 2 * blocksize], m6

%assign i (i + 1)
%endrep

    add       dstq, dststrideq
    add       srcq, srcstrideq

    dec       heightd
    jg        .loop
    RET
%endmacro
; 16-bit epel entry points; instantiated only on x86-64, like the qpel ones.
%if ARCH_X86_64

; hevc_epel_h_<w>_10: horizontal pass over 10-bit pixels, result shifted by 2.
; %1: block width
%macro EPEL_H_10 1
cglobal hevc_epel_h_ %+ %1 %+ _10, 8, 9, 8, dst, dststride, src, srcstride, height, mx, my, sstride3, coeffsreg
EPEL_16 %1, 2, 0
%endmacro

INIT_XMM avx
EPEL_H_10 4
EPEL_H_10 8
EPEL_H_10 12
EPEL_H_10 16
EPEL_H_10 24
EPEL_H_10 32

; hevc_epel_v_<w>_10: vertical pass over 10-bit pixels, result shifted by 2.
; %1: block width
%macro EPEL_V_10 1
cglobal hevc_epel_v_ %+ %1 %+ _10, 8, 9, 8, dst, dststride, src, srcstride, height, mx, my, sstride3, coeffsreg
EPEL_16 %1, 2, 1
%endmacro

INIT_XMM avx
EPEL_V_10 4
EPEL_V_10 8
EPEL_V_10 12
EPEL_V_10 16
EPEL_V_10 24
EPEL_V_10 32

; hevc_epel_hv_<w>_8(int16_t *dst, ptrdiff_t dststride,
;                    int16_t *src, ptrdiff_t srcstride,
;                    int height, int mx, int my, int *mcbuffer)
; Vertical pass over 16-bit samples, result shifted by 6.
; %1: block width
%macro EPEL_HV 1
cglobal hevc_epel_hv_ %+ %1, 8, 9, 8, dst, dststride, src, srcstride, height, mx, my, sstride3, coeffsreg
EPEL_16 %1, 6, 1
%endmacro

INIT_XMM avx
EPEL_HV 4
EPEL_HV 8
EPEL_HV 12
EPEL_HV 16
EPEL_HV 24
EPEL_HV 32

%endif ; ARCH_X86_64
; hevc_put_unweighted_pred_<w>_<d>(pixel *dst, ptrdiff_t dststride,
; int16_t *src, ptrdiff_t srcstride,
; int height)
; AVG: optionally add a second source to an accumulator with signed saturation.
; %1: destination/accumulator register
; %2: second source (memory operand)
; %3: nonzero to emit the addition; zero makes the macro expand to nothing
; %4: when equal to 4, the source is fetched with an 8-byte movq into %5 first;
;     otherwise %2 is used directly as the paddsw operand
; %5: scratch register for the movq path
%macro AVG 5
    %if %3
        %if %4 == 4
            movq   %5, %2
            paddsw %1, %5
        %else
            paddsw %1, %2
        %endif
    %endif
%endmacro
; PUT_PRED: final unweighted prediction. Adds the rounding offset to the 16-bit
; intermediate samples (summing two sources first in the avg variant), shifts
; right by (14 + %1 - bit depth) and stores the result as pixels: packed to
; bytes with unsigned saturation for 8 bits, clipped to [0, pixel_max]
; otherwise.
;
; %1: 0 - one source; 1 - two sources
; %2: width
; %3: bit depth
;
; The fourth AVG argument is the number of samples left in the row
; (%2 - i * 8). It used to be derived from the bit depth (%3), which can never
; equal 4, so the 8-byte load path in AVG was dead and a 4-sample tail of the
; second source was added through a full 16-byte memory operand. The avg
; variant now also declares the scratch register m4 in its cglobal line.
%macro PUT_PRED 3
%if %1
cglobal hevc_put_unweighted_pred_avg_ %+ %2 %+ _ %+ %3, 6, 6, 5, dst, dststride, src, src2, srcstride, height
%else
cglobal hevc_put_unweighted_pred_ %+ %2 %+ _ %+ %3, 5, 5, 4, dst, dststride, src, srcstride, height
%endif

%assign shift       14 + %1 - %3
%assign offset      (1 << (shift - 1))
%define offset_data pw_ %+ offset

    mova      m0, [offset_data]         ; rounding offset, one per word

%if %3 > 8
    %define STORE_BLOCK movu
    %define STORE_HALF  movq

    %assign pixel_max ((1 << %3) - 1)
    %define pw_pixel_max pw_ %+ pixel_max
    pxor      m1, m1                    ; clip lower bound
    mova      m2, [pw_pixel_max]        ; clip upper bound
%else
    %define STORE_BLOCK movq
    %define STORE_HALF  movd
%endif

.loop
%assign i 0
%rep (%2 + 7) / 8

    ; the last block of a row that is not a multiple of 8 wide is a half block
    %if (i + 1) * 8 > %2
        %define LOAD  movq
        %define STORE STORE_HALF
    %else
        %define LOAD  mova
        %define STORE STORE_BLOCK
    %endif

    LOAD      m3, [srcq + 16 * i]
    AVG       m3, [src2q + 16 * i], %1, %2 - i * 8, m4

    paddsw    m3, m0
    psraw     m3, shift

    %if %3 == 8
        packuswb  m3, m3
        STORE     [dstq + 8 * i], m3
    %else
        CLIPW     m3, m1, m2
        STORE     [dstq + 16 * i], m3
    %endif
%assign i (i + 1)
%endrep

    add       dstq, dststrideq
    add       srcq, srcstrideq
%if %1
    add       src2q, srcstrideq
%endif

    dec       heightd
    jg        .loop
    RET
%endmacro

INIT_XMM sse2
PUT_PRED 0, 4,  8
PUT_PRED 1, 4,  8
PUT_PRED 0, 8,  8
PUT_PRED 1, 8,  8
PUT_PRED 0, 12, 8
PUT_PRED 1, 12, 8
PUT_PRED 0, 16, 8
PUT_PRED 1, 16, 8
PUT_PRED 0, 24, 8
PUT_PRED 1, 24, 8
PUT_PRED 0, 32, 8
PUT_PRED 1, 32, 8
PUT_PRED 0, 48, 8
PUT_PRED 1, 48, 8
PUT_PRED 0, 64, 8
PUT_PRED 1, 64, 8

PUT_PRED 0, 4,  10
PUT_PRED 1, 4,  10
PUT_PRED 0, 8,  10
PUT_PRED 1, 8,  10
PUT_PRED 0, 12, 10
PUT_PRED 1, 12, 10
PUT_PRED 0, 16, 10
PUT_PRED 1, 16, 10
PUT_PRED 0, 24, 10
PUT_PRED 1, 24, 10
PUT_PRED 0, 32, 10
PUT_PRED 1, 32, 10
PUT_PRED 0, 48, 10
PUT_PRED 1, 48, 10
PUT_PRED 0, 64, 10
PUT_PRED 1, 64, 10
; PUT_WEIGHTED_PRED: final weighted prediction, four samples per step in 32-bit
; precision.
; %1: 0 - one source; 1 - two sources
; %2: width
; %3: bit depth
;
; Per sample: (src0 * weight0 [+ src1 * weight1] + round) >> shift, where
;   shift = (denom & 0xff) + 14 + %1 - bit depth
;   round = ((offset0 + offset1 + 1) << shift) >> 1      (two sources)
;           ((2 * offset0 + 1)       << shift) >> 1      (one source)
; For more than 8 bits the offsets are first scaled by << (bit depth - 8).
; The result is packed to words, then packed to bytes (8 bits) or clipped to
; [0, pixel_max].
;
; Weights and offsets arrive as 16-bit values and are sign-extended; denom is
; masked to 8 bits. Instantiated for x86-64 only (up to 11 argument registers).
%macro PUT_WEIGHTED_PRED 3
%if %1
cglobal hevc_put_weighted_pred_avg_ %+ %2 %+ _ %+ %3, 11, 11, 8, denom, weight0, weight1, offset0, offset1, dst, dststride, src0, src1, srcstride, height
%else
cglobal hevc_put_weighted_pred_ %+ %2 %+ _ %+ %3, 8, 8, 8, denom, weight0, offset0, dst, dststride, src0, srcstride, height
%endif

    and       denomd, 0xff
    movsx     weight0d, weight0w
    movsx     offset0d, offset0w
%if %1
    movsx     weight1d, weight1w
    movsx     offset1d, offset1w
%endif

    add       denomd, 14 + %1 - %3
    movd      m0, denomd                ; m0 = shift count

%if %3 > 8
    %assign pixel_max ((1 << %3) - 1)
    %define pw_pixel_max pw_ %+ pixel_max
    pxor      m4, m4                    ; clip lower bound
    mova      m5, [pw_pixel_max]        ; clip upper bound

    shl       offset0d, %3 - 8
%if %1
    shl       offset1d, %3 - 8
%endif
%endif

%if %1
    lea       offset0d, [offset0d + offset1d + 1]
%else
    lea       offset0d, [2 * offset0d + 1]
%endif
    movd      m1, offset0d
    SPLATD    m1
    pslld     m1, m0
    psrad     m1, 1                     ; m1 = rounding term

    movd      m2, weight0d
    SPLATD    m2
%if %1
    movd      m3, weight1d
    SPLATD    m3
%endif

.loop
%assign i 0
%rep (%2 + 3) / 4

    pmovsxwd  m6, [src0q + 8 * i]
    pmulld    m6, m2

%if %1
    pmovsxwd  m7, [src1q + 8 * i]
    pmulld    m7, m3
    paddd     m6, m7
%endif

    paddd     m6, m1
    psrad     m6, m0

    packssdw  m6, m6

%if %3 > 8
    CLIPW     m6, m4, m5
    movq      [dstq + 8 * i], m6
%else
    packuswb  m6, m6
    movd      [dstq + 4 * i], m6
%endif

%assign i (i + 1)
%endrep

    add       dstq, dststrideq
    add       src0q, srcstrideq
%if %1
    add       src1q, srcstrideq
%endif

    dec       heightd
    jg        .loop
    RET
%endmacro

%if ARCH_X86_64
INIT_XMM sse4
PUT_WEIGHTED_PRED 0, 4,  8
PUT_WEIGHTED_PRED 1, 4,  8
PUT_WEIGHTED_PRED 0, 8,  8
PUT_WEIGHTED_PRED 1, 8,  8
PUT_WEIGHTED_PRED 0, 12, 8
PUT_WEIGHTED_PRED 1, 12, 8
PUT_WEIGHTED_PRED 0, 16, 8
PUT_WEIGHTED_PRED 1, 16, 8
PUT_WEIGHTED_PRED 0, 24, 8
PUT_WEIGHTED_PRED 1, 24, 8
PUT_WEIGHTED_PRED 0, 32, 8
PUT_WEIGHTED_PRED 1, 32, 8
PUT_WEIGHTED_PRED 0, 48, 8
PUT_WEIGHTED_PRED 1, 48, 8
PUT_WEIGHTED_PRED 0, 64, 8
PUT_WEIGHTED_PRED 1, 64, 8

PUT_WEIGHTED_PRED 0, 4,  10
PUT_WEIGHTED_PRED 1, 4,  10
PUT_WEIGHTED_PRED 0, 8,  10
PUT_WEIGHTED_PRED 1, 8,  10
PUT_WEIGHTED_PRED 0, 12, 10
PUT_WEIGHTED_PRED 1, 12, 10
PUT_WEIGHTED_PRED 0, 16, 10
PUT_WEIGHTED_PRED 1, 16, 10
PUT_WEIGHTED_PRED 0, 24, 10
PUT_WEIGHTED_PRED 1, 24, 10
PUT_WEIGHTED_PRED 0, 32, 10
PUT_WEIGHTED_PRED 1, 32, 10
PUT_WEIGHTED_PRED 0, 48, 10
PUT_WEIGHTED_PRED 1, 48, 10
PUT_WEIGHTED_PRED 0, 64, 10
PUT_WEIGHTED_PRED 1, 64, 10

%endif ; ARCH_X86_64
libavcodec/x86/hevcdsp_init.c
View file @
e7078e84
...
@@ -45,27 +45,260 @@ LFC_FUNCS(uint8_t, 10)
...
@@ -45,27 +45,260 @@ LFC_FUNCS(uint8_t, 10)
LFL_FUNCS
(
uint8_t
,
8
)
LFL_FUNCS
(
uint8_t
,
8
)
LFL_FUNCS
(
uint8_t
,
10
)
LFL_FUNCS
(
uint8_t
,
10
)
/*
 * Declare the prototype of ff_hevc_get_pixels_<width>_<depth>_<cf>().
 * Declaration only: no definition is generated here. The matching symbols are
 * presumably the ones emitted by the GET_PIXELS macro in x86/hevc_mc.asm
 * (TODO confirm the ff_ prefix and cpu-flag suffix against x86inc's cglobal).
 */
#define GET_PIXELS(width, depth, cf) \
void ff_hevc_get_pixels_ ## width ## _ ## depth ## _ ## cf(int16_t *dst, ptrdiff_t dststride, \
uint8_t *src, ptrdiff_t srcstride, \
int height, int mx, int my, int16_t *mcbuffer);
GET_PIXELS
(
4
,
8
,
sse2
)
GET_PIXELS
(
8
,
8
,
sse2
)
GET_PIXELS
(
12
,
8
,
sse2
)
GET_PIXELS
(
16
,
8
,
sse2
)
GET_PIXELS
(
24
,
8
,
sse2
)
GET_PIXELS
(
32
,
8
,
sse2
)
GET_PIXELS
(
48
,
8
,
sse2
)
GET_PIXELS
(
64
,
8
,
sse2
)
GET_PIXELS
(
4
,
10
,
sse2
)
GET_PIXELS
(
8
,
10
,
sse2
)
GET_PIXELS
(
12
,
10
,
sse2
)
GET_PIXELS
(
16
,
10
,
sse2
)
GET_PIXELS
(
24
,
10
,
sse2
)
GET_PIXELS
(
32
,
10
,
sse2
)
GET_PIXELS
(
48
,
10
,
sse2
)
GET_PIXELS
(
64
,
10
,
sse2
)
/*
 * Declare the second-pass (vertical) hv functions ff_hevc_qpel_hv_<width>_<cf>()
 * and ff_hevc_epel_hv_<width>_<cf>(). They read int16_t intermediate samples,
 * so they are independent of the bit depth and are declared separately from
 * the per-depth h/v functions.
 * NOTE(review): this is instantiated for widths up to 64, while the assembly
 * added by this commit defines the epel hv functions only up to width 32; the
 * extra epel prototypes appear to be unused -- confirm.
 */
#define INTERP_HV_FUNC(width, cf) \
void ff_hevc_qpel_hv_ ## width ## _ ## cf(int16_t *dst, ptrdiff_t dststride, \
int16_t *src, ptrdiff_t srcstride, \
int height, int mx, int my, int16_t *mcbuffer); \
void ff_hevc_epel_hv_ ## width ## _ ## cf(int16_t *dst, ptrdiff_t dststride, \
int16_t *src, ptrdiff_t srcstride, \
int height, int mx, int my, int16_t *mcbuffer);
INTERP_HV_FUNC
(
4
,
avx
)
INTERP_HV_FUNC
(
8
,
avx
)
INTERP_HV_FUNC
(
12
,
avx
)
INTERP_HV_FUNC
(
16
,
avx
)
INTERP_HV_FUNC
(
24
,
avx
)
INTERP_HV_FUNC
(
32
,
avx
)
INTERP_HV_FUNC
(
48
,
avx
)
INTERP_HV_FUNC
(
64
,
avx
)
/*
 * Define the static wrapper hevc_qpel_hv_<width>_<depth>_<cf_hv>(), a two-pass
 * qpel interpolation that uses mcbuffer as the intermediate:
 *   1. the horizontal function filters height + 7 rows, starting 3 rows above
 *      src, into mcbuffer;
 *   2. the hv (vertical) function filters the intermediate into dst, starting
 *      at intermediate row 3 (mcbuffer + 3 * stride).
 * stride is the intermediate row pitch in int16_t elements (width + 7 rounded
 * up to a multiple of 8); the callees receive it in bytes, hence 2 * stride.
 * cf_v is accepted for symmetry with QPEL_FUNCS and is not used.
 * On targets other than x86-64 the macro expands to nothing.
 */
#if ARCH_X86_64
#define QPEL_FUNC_HV(width, depth, cf_h, cf_v, cf_hv) \
static void hevc_qpel_hv_ ## width ## _ ## depth ## _ ## cf_hv(int16_t *dst, ptrdiff_t dststride, \
uint8_t *src, ptrdiff_t srcstride, \
int height, int mx, int my, int16_t *mcbuffer) \
{ \
const ptrdiff_t stride = FFALIGN(width + 7, 8); \
ff_hevc_qpel_h_ ## width ## _ ## depth ## _ ## cf_h(mcbuffer, 2 * stride, src - 3 * srcstride, srcstride, \
height + 7, mx, my, mcbuffer); \
ff_hevc_qpel_hv_ ## width ## _ ## cf_hv(dst, dststride, mcbuffer + 3 * stride, 2 * stride, \
height, mx, my, mcbuffer); \
}
#else
#define QPEL_FUNC_HV(width, depth, cf_h, cf_v, cf_hv)
#endif
/*
 * For one block width and bit depth: declare the horizontal and vertical qpel
 * functions ff_hevc_qpel_h_<width>_<depth>_<cf_h>() and
 * ff_hevc_qpel_v_<width>_<depth>_<cf_v>(), then expand QPEL_FUNC_HV to define
 * the combined hv wrapper (x86-64 only).
 */
#define QPEL_FUNCS(width, depth, cf_h, cf_v, cf_hv) \
void ff_hevc_qpel_h_ ## width ## _ ## depth ## _ ## cf_h(int16_t *dst, ptrdiff_t dststride, \
uint8_t *src, ptrdiff_t srcstride, \
int height, int mx, int my, int16_t *mcbuffer); \
void ff_hevc_qpel_v_ ## width ## _ ## depth ## _ ## cf_v(int16_t *dst, ptrdiff_t dststride, \
uint8_t *src, ptrdiff_t srcstride, \
int height, int mx, int my, int16_t *mcbuffer); \
QPEL_FUNC_HV(width, depth, cf_h, cf_v, cf_hv)
QPEL_FUNCS
(
4
,
8
,
ssse3
,
ssse3
,
avx
)
QPEL_FUNCS
(
8
,
8
,
ssse3
,
ssse3
,
avx
)
QPEL_FUNCS
(
12
,
8
,
ssse3
,
ssse3
,
avx
)
QPEL_FUNCS
(
16
,
8
,
ssse3
,
ssse3
,
avx
)
QPEL_FUNCS
(
24
,
8
,
ssse3
,
ssse3
,
avx
)
QPEL_FUNCS
(
32
,
8
,
ssse3
,
ssse3
,
avx
)
QPEL_FUNCS
(
48
,
8
,
ssse3
,
ssse3
,
avx
)
QPEL_FUNCS
(
64
,
8
,
ssse3
,
ssse3
,
avx
)
QPEL_FUNCS
(
4
,
10
,
avx
,
avx
,
avx
)
QPEL_FUNCS
(
8
,
10
,
avx
,
avx
,
avx
)
QPEL_FUNCS
(
12
,
10
,
avx
,
avx
,
avx
)
QPEL_FUNCS
(
16
,
10
,
avx
,
avx
,
avx
)
QPEL_FUNCS
(
24
,
10
,
avx
,
avx
,
avx
)
QPEL_FUNCS
(
32
,
10
,
avx
,
avx
,
avx
)
QPEL_FUNCS
(
48
,
10
,
avx
,
avx
,
avx
)
QPEL_FUNCS
(
64
,
10
,
avx
,
avx
,
avx
)
/*
 * Define the static wrapper hevc_epel_hv_<width>_<depth>_<cf_hv>(), a two-pass
 * epel interpolation that uses mcbuffer as the intermediate:
 *   1. the horizontal function filters height + 3 rows, starting 1 row above
 *      src, into mcbuffer;
 *   2. the hv (vertical) function filters the intermediate into dst, starting
 *      at intermediate row 1 (mcbuffer + stride).
 * stride is the intermediate row pitch in int16_t elements (width + 3 rounded
 * up to a multiple of 8); the callees receive it in bytes, hence 2 * stride.
 * cf_v is accepted for symmetry with EPEL_FUNCS and is not used.
 * On targets other than x86-64 the macro expands to nothing.
 */
#if ARCH_X86_64
#define EPEL_FUNC_HV(width, depth, cf_h, cf_v, cf_hv) \
static void hevc_epel_hv_ ## width ## _ ## depth ## _ ## cf_hv(int16_t *dst, ptrdiff_t dststride, \
uint8_t *src, ptrdiff_t srcstride, \
int height, int mx, int my, int16_t *mcbuffer) \
{ \
const ptrdiff_t stride = FFALIGN(width + 3, 8); \
ff_hevc_epel_h_ ## width ## _ ## depth ## _ ## cf_h(mcbuffer, 2 * stride, src - srcstride, srcstride, \
height + 3, mx, my, mcbuffer); \
ff_hevc_epel_hv_ ## width ## _ ## cf_hv(dst, dststride, mcbuffer + stride, 2 * stride, \
height, mx, my, mcbuffer); \
}
#else
#define EPEL_FUNC_HV(width, depth, cf_h, cf_v, cf_hv)
#endif
/*
 * For one block width and bit depth: declare the horizontal and vertical epel
 * functions ff_hevc_epel_h_<width>_<depth>_<cf_h>() and
 * ff_hevc_epel_v_<width>_<depth>_<cf_v>(), then expand EPEL_FUNC_HV to define
 * the combined hv wrapper (x86-64 only).
 */
#define EPEL_FUNCS(width, depth, cf_h, cf_v, cf_hv) \
void ff_hevc_epel_h_ ## width ## _ ## depth ## _ ## cf_h(int16_t *dst, ptrdiff_t dststride, \
uint8_t *src, ptrdiff_t srcstride, \
int height, int mx, int my, int16_t *mcbuffer); \
void ff_hevc_epel_v_ ## width ## _ ## depth ## _ ## cf_v(int16_t *dst, ptrdiff_t dststride, \
uint8_t *src, ptrdiff_t srcstride, \
int height, int mx, int my, int16_t *mcbuffer); \
EPEL_FUNC_HV(width, depth, cf_h, cf_v, cf_hv)
EPEL_FUNCS
(
4
,
8
,
ssse3
,
ssse3
,
avx
)
EPEL_FUNCS
(
8
,
8
,
ssse3
,
ssse3
,
avx
)
EPEL_FUNCS
(
12
,
8
,
ssse3
,
ssse3
,
avx
)
EPEL_FUNCS
(
16
,
8
,
ssse3
,
ssse3
,
avx
)
EPEL_FUNCS
(
24
,
8
,
ssse3
,
ssse3
,
avx
)
EPEL_FUNCS
(
32
,
8
,
ssse3
,
ssse3
,
avx
)
EPEL_FUNCS
(
4
,
10
,
avx
,
avx
,
avx
)
EPEL_FUNCS
(
8
,
10
,
avx
,
avx
,
avx
)
EPEL_FUNCS
(
12
,
10
,
avx
,
avx
,
avx
)
EPEL_FUNCS
(
16
,
10
,
avx
,
avx
,
avx
)
EPEL_FUNCS
(
24
,
10
,
avx
,
avx
,
avx
)
EPEL_FUNCS
(
32
,
10
,
avx
,
avx
,
avx
)
/*
 * Declare the four output-stage prototypes for one block width and bit depth:
 *   ff_hevc_put_unweighted_pred_<width>_<depth>_<cf_uw>()      one source
 *   ff_hevc_put_unweighted_pred_avg_<width>_<depth>_<cf_uw>()  two sources
 *   ff_hevc_put_weighted_pred_<width>_<depth>_<cf_w>()         one source
 *   ff_hevc_put_weighted_pred_avg_<width>_<depth>_<cf_w>()     two sources
 * cf_uw and cf_w are the cpu-flag suffixes of the unweighted and weighted
 * variants respectively. Declarations only; nothing is defined here.
 */
#define PUT_PRED(width, depth, cf_uw, cf_w) \
void ff_hevc_put_unweighted_pred_ ## width ## _ ## depth ## _ ## cf_uw(uint8_t *dst, ptrdiff_t dststride, \
int16_t *src, ptrdiff_t srcstride, \
int height); \
void ff_hevc_put_unweighted_pred_avg_ ## width ## _ ## depth ## _ ## cf_uw(uint8_t *dst, ptrdiff_t dststride, \
int16_t *src1, int16_t *src2, \
ptrdiff_t srcstride, int height); \
void ff_hevc_put_weighted_pred_ ## width ## _ ## depth ## _ ## cf_w(uint8_t denom, int16_t weight, int16_t offset, \
uint8_t *dst, ptrdiff_t dststride, \
int16_t *src, ptrdiff_t srcstride, \
int height); \
void ff_hevc_put_weighted_pred_avg_ ## width ## _ ## depth ## _ ## cf_w(uint8_t denom, int16_t weight0, int16_t weight1, \
int16_t offset0, int16_t offset1, \
uint8_t *dst, ptrdiff_t dststride, \
int16_t *src0, int16_t *src1, ptrdiff_t srcstride, \
int height);
PUT_PRED
(
4
,
8
,
sse2
,
sse4
)
PUT_PRED
(
8
,
8
,
sse2
,
sse4
)
PUT_PRED
(
12
,
8
,
sse2
,
sse4
)
PUT_PRED
(
16
,
8
,
sse2
,
sse4
)
PUT_PRED
(
24
,
8
,
sse2
,
sse4
)
PUT_PRED
(
32
,
8
,
sse2
,
sse4
)
PUT_PRED
(
48
,
8
,
sse2
,
sse4
)
PUT_PRED
(
64
,
8
,
sse2
,
sse4
)
PUT_PRED
(
4
,
10
,
sse2
,
sse4
)
PUT_PRED
(
8
,
10
,
sse2
,
sse4
)
PUT_PRED
(
12
,
10
,
sse2
,
sse4
)
PUT_PRED
(
16
,
10
,
sse2
,
sse4
)
PUT_PRED
(
24
,
10
,
sse2
,
sse4
)
PUT_PRED
(
32
,
10
,
sse2
,
sse4
)
PUT_PRED
(
48
,
10
,
sse2
,
sse4
)
PUT_PRED
(
64
,
10
,
sse2
,
sse4
)
void
ff_hevc_dsp_init_x86
(
HEVCDSPContext
*
c
,
const
int
bit_depth
)
void
ff_hevc_dsp_init_x86
(
HEVCDSPContext
*
c
,
const
int
bit_depth
)
{
{
int
cpu_flags
=
av_get_cpu_flags
();
int
cpu_flags
=
av_get_cpu_flags
();
/*
 * Fill the eight entries of c->tabname with the routines for widths
 * 4, 8, 12, 16, 24, 32, 48 and 64 (table indices 0..7, in that order).
 * Each symbol is built by token pasting: funcname _<width>_ depth _ cf.
 *
 * Relies on a pointer named `c` being in scope at the point of use.
 * Wrapped in do { } while (0) so the expansion is a single statement and
 * stays correct under an unbraced if/else; invoke it with a trailing ';'.
 */
#define SET_LUMA_FUNCS(tabname, funcname, depth, cf)                  \
    do {                                                              \
        c->tabname[0] = funcname ## _4_  ## depth ## _ ## cf;         \
        c->tabname[1] = funcname ## _8_  ## depth ## _ ## cf;         \
        c->tabname[2] = funcname ## _12_ ## depth ## _ ## cf;         \
        c->tabname[3] = funcname ## _16_ ## depth ## _ ## cf;         \
        c->tabname[4] = funcname ## _24_ ## depth ## _ ## cf;         \
        c->tabname[5] = funcname ## _32_ ## depth ## _ ## cf;         \
        c->tabname[6] = funcname ## _48_ ## depth ## _ ## cf;         \
        c->tabname[7] = funcname ## _64_ ## depth ## _ ## cf;         \
    } while (0)
/*
 * Fill c->tabname with the routines for widths 4, 8, 12, 16, 24 and 32.
 * These go to table indices 1, 3, 4, 5, 6 and 7; indices 0 and 2 are left
 * untouched.  Each symbol is built by token pasting:
 * funcname _<width>_ depth _ cf.
 *
 * Relies on a pointer named `c` being in scope at the point of use.
 * Wrapped in do { } while (0) so the expansion is a single statement and
 * stays correct under an unbraced if/else; invoke it with a trailing ';'.
 */
#define SET_CHROMA_FUNCS(tabname, funcname, depth, cf)                \
    do {                                                              \
        c->tabname[1] = funcname ## _4_  ## depth ## _ ## cf;         \
        c->tabname[3] = funcname ## _8_  ## depth ## _ ## cf;         \
        c->tabname[4] = funcname ## _12_ ## depth ## _ ## cf;         \
        c->tabname[5] = funcname ## _16_ ## depth ## _ ## cf;         \
        c->tabname[6] = funcname ## _24_ ## depth ## _ ## cf;         \
        c->tabname[7] = funcname ## _32_ ## depth ## _ ## cf;         \
    } while (0)
/*
 * Convenience wrappers that select the [v][h] slot of the qpel / epel
 * tables and forward to the luma / chroma table fillers.
 * In the calls below, (0,0) is paired with the get_pixels routines, (0,1)
 * with the *_h routines, (1,0) with *_v and (1,1) with *_hv.
 */
#define SET_QPEL_FUNCS(v, h, depth, cf, funcname) \
    SET_LUMA_FUNCS(put_hevc_qpel[v][h], funcname, depth, cf)
#define SET_EPEL_FUNCS(v, h, depth, cf, funcname) \
    SET_CHROMA_FUNCS(put_hevc_epel[v][h], funcname, depth, cf)
if
(
bit_depth
==
8
)
{
if
(
bit_depth
==
8
)
{
if
(
EXTERNAL_SSE2
(
cpu_flags
))
{
if
(
EXTERNAL_SSE2
(
cpu_flags
))
{
c
->
hevc_v_loop_filter_chroma
=
ff_hevc_v_loop_filter_chroma_8_sse2
;
c
->
hevc_v_loop_filter_chroma
=
ff_hevc_v_loop_filter_chroma_8_sse2
;
c
->
hevc_h_loop_filter_chroma
=
ff_hevc_h_loop_filter_chroma_8_sse2
;
c
->
hevc_h_loop_filter_chroma
=
ff_hevc_h_loop_filter_chroma_8_sse2
;
SET_QPEL_FUNCS
(
0
,
0
,
8
,
sse2
,
ff_hevc_get_pixels
);
SET_EPEL_FUNCS
(
0
,
0
,
8
,
sse2
,
ff_hevc_get_pixels
);
SET_LUMA_FUNCS
(
put_unweighted_pred
,
ff_hevc_put_unweighted_pred
,
8
,
sse2
);
SET_LUMA_FUNCS
(
put_unweighted_pred_avg
,
ff_hevc_put_unweighted_pred_avg
,
8
,
sse2
);
SET_CHROMA_FUNCS
(
put_unweighted_pred_chroma
,
ff_hevc_put_unweighted_pred
,
8
,
sse2
);
SET_CHROMA_FUNCS
(
put_unweighted_pred_avg_chroma
,
ff_hevc_put_unweighted_pred_avg
,
8
,
sse2
);
}
}
if
(
EXTERNAL_SSSE3
(
cpu_flags
)
&&
ARCH_X86_64
)
{
if
(
EXTERNAL_SSSE3
(
cpu_flags
))
{
c
->
hevc_v_loop_filter_luma
=
ff_hevc_v_loop_filter_luma_8_ssse3
;
SET_QPEL_FUNCS
(
0
,
1
,
8
,
ssse3
,
ff_hevc_qpel_h
);
c
->
hevc_h_loop_filter_luma
=
ff_hevc_h_loop_filter_luma_8_ssse3
;
SET_QPEL_FUNCS
(
1
,
0
,
8
,
ssse3
,
ff_hevc_qpel_v
);
SET_EPEL_FUNCS
(
0
,
1
,
8
,
ssse3
,
ff_hevc_epel_h
);
SET_EPEL_FUNCS
(
1
,
0
,
8
,
ssse3
,
ff_hevc_epel_v
);
}
}
}
else
if
(
bit_depth
==
10
)
{
}
else
if
(
bit_depth
==
10
)
{
if
(
EXTERNAL_SSE2
(
cpu_flags
))
{
if
(
EXTERNAL_SSE2
(
cpu_flags
))
{
c
->
hevc_v_loop_filter_chroma
=
ff_hevc_v_loop_filter_chroma_10_sse2
;
c
->
hevc_v_loop_filter_chroma
=
ff_hevc_v_loop_filter_chroma_10_sse2
;
c
->
hevc_h_loop_filter_chroma
=
ff_hevc_h_loop_filter_chroma_10_sse2
;
c
->
hevc_h_loop_filter_chroma
=
ff_hevc_h_loop_filter_chroma_10_sse2
;
SET_QPEL_FUNCS
(
0
,
0
,
10
,
sse2
,
ff_hevc_get_pixels
);
SET_EPEL_FUNCS
(
0
,
0
,
10
,
sse2
,
ff_hevc_get_pixels
);
SET_LUMA_FUNCS
(
put_unweighted_pred
,
ff_hevc_put_unweighted_pred
,
10
,
sse2
);
SET_LUMA_FUNCS
(
put_unweighted_pred_avg
,
ff_hevc_put_unweighted_pred_avg
,
10
,
sse2
);
SET_CHROMA_FUNCS
(
put_unweighted_pred_chroma
,
ff_hevc_put_unweighted_pred
,
10
,
sse2
);
SET_CHROMA_FUNCS
(
put_unweighted_pred_avg_chroma
,
ff_hevc_put_unweighted_pred_avg
,
10
,
sse2
);
}
}
#if ARCH_X86_64
if
(
bit_depth
==
8
)
{
if
(
EXTERNAL_SSSE3
(
cpu_flags
))
{
c
->
hevc_v_loop_filter_luma
=
ff_hevc_v_loop_filter_luma_8_ssse3
;
c
->
hevc_h_loop_filter_luma
=
ff_hevc_h_loop_filter_luma_8_ssse3
;
}
if
(
EXTERNAL_SSE4
(
cpu_flags
))
{
SET_LUMA_FUNCS
(
weighted_pred
,
ff_hevc_put_weighted_pred
,
8
,
sse4
);
SET_CHROMA_FUNCS
(
weighted_pred_chroma
,
ff_hevc_put_weighted_pred
,
8
,
sse4
);
SET_LUMA_FUNCS
(
weighted_pred_avg
,
ff_hevc_put_weighted_pred_avg
,
8
,
sse4
);
SET_CHROMA_FUNCS
(
weighted_pred_avg_chroma
,
ff_hevc_put_weighted_pred_avg
,
8
,
sse4
);
}
}
if
(
EXTERNAL_SSSE3
(
cpu_flags
)
&&
ARCH_X86_64
)
{
if
(
EXTERNAL_AVX
(
cpu_flags
))
{
SET_QPEL_FUNCS
(
1
,
1
,
8
,
avx
,
hevc_qpel_hv
);
SET_EPEL_FUNCS
(
1
,
1
,
8
,
avx
,
hevc_epel_hv
);
}
}
else
if
(
bit_depth
==
10
)
{
if
(
EXTERNAL_SSSE3
(
cpu_flags
))
{
c
->
hevc_v_loop_filter_luma
=
ff_hevc_v_loop_filter_luma_10_ssse3
;
c
->
hevc_v_loop_filter_luma
=
ff_hevc_v_loop_filter_luma_10_ssse3
;
c
->
hevc_h_loop_filter_luma
=
ff_hevc_h_loop_filter_luma_10_ssse3
;
c
->
hevc_h_loop_filter_luma
=
ff_hevc_h_loop_filter_luma_10_ssse3
;
}
}
if
(
EXTERNAL_SSE4
(
cpu_flags
))
{
SET_LUMA_FUNCS
(
weighted_pred
,
ff_hevc_put_weighted_pred
,
10
,
sse4
);
SET_CHROMA_FUNCS
(
weighted_pred_chroma
,
ff_hevc_put_weighted_pred
,
10
,
sse4
);
SET_LUMA_FUNCS
(
weighted_pred_avg
,
ff_hevc_put_weighted_pred_avg
,
10
,
sse4
);
SET_CHROMA_FUNCS
(
weighted_pred_avg_chroma
,
ff_hevc_put_weighted_pred_avg
,
10
,
sse4
);
}
if
(
EXTERNAL_AVX
(
cpu_flags
))
{
SET_QPEL_FUNCS
(
0
,
1
,
10
,
avx
,
ff_hevc_qpel_h
);
SET_QPEL_FUNCS
(
1
,
0
,
10
,
avx
,
ff_hevc_qpel_v
);
SET_QPEL_FUNCS
(
1
,
1
,
10
,
avx
,
hevc_qpel_hv
);
SET_EPEL_FUNCS
(
0
,
1
,
10
,
avx
,
ff_hevc_epel_h
);
SET_EPEL_FUNCS
(
1
,
0
,
10
,
avx
,
ff_hevc_epel_v
);
SET_EPEL_FUNCS
(
1
,
1
,
10
,
avx
,
hevc_epel_hv
);
}
}
}
#endif
/* ARCH_X86_64 */
}
}
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment