Skip to content
Projects
Groups
Snippets
Help
Loading...
Sign in / Register
Toggle navigation
F
ffmpeg
Project
Project
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Packages
Packages
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
submodule
ffmpeg
Commits
2cba1825
Commit
2cba1825
authored
Jan 15, 2016
by
James Darnley
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
avcodec/v210: add avx2 version of the 10-bit line encoder
Around 25% faster than the ssse3 version.
parent
3836f404
Hide whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
31 additions
and
9 deletions
+31
-9
v210enc.c
libavcodec/v210enc.c
+9
-2
constants.c
libavcodec/x86/constants.c
+2
-1
constants.h
libavcodec/x86/constants.h
+1
-1
v210enc.asm
libavcodec/x86/v210enc.asm
+15
-5
v210enc_init.c
libavcodec/x86/v210enc_init.c
+4
-0
No files found.
libavcodec/v210enc.c
View file @
2cba1825
...
...
@@ -135,13 +135,20 @@ static int encode_frame(AVCodecContext *avctx, AVPacket *pkt,
const
uint16_t
*
v
=
(
const
uint16_t
*
)
pic
->
data
[
2
];
for
(
h
=
0
;
h
<
avctx
->
height
;
h
++
)
{
uint32_t
val
;
w
=
(
avctx
->
width
/
6
)
*
6
;
w
=
(
avctx
->
width
/
(
6
*
s
->
sample_factor
))
*
6
*
s
->
sample_factor
;
s
->
pack_line_10
(
y
,
u
,
v
,
dst
,
w
);
y
+=
w
;
u
+=
w
>>
1
;
v
+=
w
>>
1
;
dst
+=
(
w
/
6
)
*
16
;
dst
+=
(
w
/
(
6
*
s
->
sample_factor
))
*
16
*
s
->
sample_factor
;
for
(;
w
<
avctx
->
width
-
5
;
w
+=
6
)
{
WRITE_PIXELS
(
u
,
y
,
v
);
WRITE_PIXELS
(
y
,
u
,
y
);
WRITE_PIXELS
(
v
,
y
,
u
);
WRITE_PIXELS
(
y
,
v
,
y
);
}
if
(
w
<
avctx
->
width
-
1
)
{
WRITE_PIXELS
(
u
,
y
,
v
);
...
...
libavcodec/x86/constants.c
View file @
2cba1825
...
...
@@ -27,7 +27,8 @@ DECLARE_ALIGNED(32, const ymm_reg, ff_pw_1) = { 0x0001000100010001ULL, 0x000
DECLARE_ALIGNED
(
32
,
const
ymm_reg
,
ff_pw_2
)
=
{
0x0002000200020002ULL
,
0x0002000200020002ULL
,
0x0002000200020002ULL
,
0x0002000200020002ULL
};
DECLARE_ALIGNED
(
16
,
const
xmm_reg
,
ff_pw_3
)
=
{
0x0003000300030003ULL
,
0x0003000300030003ULL
};
DECLARE_ALIGNED
(
16
,
const
xmm_reg
,
ff_pw_4
)
=
{
0x0004000400040004ULL
,
0x0004000400040004ULL
};
DECLARE_ALIGNED
(
32
,
const
ymm_reg
,
ff_pw_4
)
=
{
0x0004000400040004ULL
,
0x0004000400040004ULL
,
0x0004000400040004ULL
,
0x0004000400040004ULL
};
DECLARE_ALIGNED
(
16
,
const
xmm_reg
,
ff_pw_5
)
=
{
0x0005000500050005ULL
,
0x0005000500050005ULL
};
DECLARE_ALIGNED
(
16
,
const
xmm_reg
,
ff_pw_8
)
=
{
0x0008000800080008ULL
,
0x0008000800080008ULL
};
DECLARE_ALIGNED
(
16
,
const
xmm_reg
,
ff_pw_9
)
=
{
0x0009000900090009ULL
,
0x0009000900090009ULL
};
...
...
libavcodec/x86/constants.h
View file @
2cba1825
...
...
@@ -28,7 +28,7 @@
extern
const
ymm_reg
ff_pw_1
;
extern
const
ymm_reg
ff_pw_2
;
extern
const
xmm_reg
ff_pw_3
;
extern
const
xmm_reg
ff_pw_4
;
extern
const
ymm_reg
ff_pw_4
;
extern
const
xmm_reg
ff_pw_5
;
extern
const
xmm_reg
ff_pw_8
;
extern
const
xmm_reg
ff_pw_9
;
...
...
libavcodec/x86/v210enc.asm
View file @
2cba1825
...
...
@@ -51,7 +51,7 @@ SECTION .text
%macro
v210_planar_pack_10
0
; v210_planar_pack_10(const uint16_t *y, const uint16_t *u, const uint16_t *v, uint8_t *dst, ptrdiff_t width)
cglobal
v210_planar_pack_10
,
5
,
5
,
4
,
y
,
u
,
v
,
dst
,
width
cglobal
v210_planar_pack_10
,
5
,
5
,
4
+
cpuflag
(
avx2
)
,
y
,
u
,
v
,
dst
,
width
lea
r0
,
[
yq
+
2
*
widthq
]
add
uq
,
widthq
add
vq
,
widthq
...
...
@@ -61,11 +61,19 @@ cglobal v210_planar_pack_10, 5, 5, 4, y, u, v, dst, width
mova
m3
,
[
v210_enc_max_10
]
.
loop
:
movu
m0
,
[
yq
+
2
*
widthq
]
movu
xm0
,
[
yq
+
2
*
widthq
]
%if
cpuflag
(
avx2
)
vinserti128
m0
,
m0
,
[
yq
+
widthq
*
2
+
12
]
,
1
%endif
CLIPW
m0
,
m2
,
m3
movq
m1
,
[
uq
+
widthq
]
movhps
m1
,
[
vq
+
widthq
]
movq
xm1
,
[
uq
+
widthq
]
movhps
xm1
,
[
vq
+
widthq
]
%if
cpuflag
(
avx2
)
movq
xm4
,
[
uq
+
widthq
+
6
]
movhps
xm4
,
[
vq
+
widthq
+
6
]
vinserti128
m1
,
m1
,
xm4
,
1
%endif
CLIPW
m1
,
m2
,
m3
pmullw
m0
,
[
v210_enc_luma_mult_10
]
...
...
@@ -79,7 +87,7 @@ cglobal v210_planar_pack_10, 5, 5, 4, y, u, v, dst, width
movu
[dstq],
m0
add
dstq
,
mmsize
add
widthq
,
6
add
widthq
,
(
mmsize
*
3
)
/
8
jl
.
loop
RET
...
...
@@ -87,6 +95,8 @@ cglobal v210_planar_pack_10, 5, 5, 4, y, u, v, dst, width
INIT_XMM
ssse3
v210_planar_pack_10
INIT_YMM
avx2
v210_planar_pack_10
%macro
v210_planar_pack_8
0
...
...
libavcodec/x86/v210enc_init.c
View file @
2cba1825
...
...
@@ -29,6 +29,9 @@ void ff_v210_planar_pack_8_avx2(const uint8_t *y, const uint8_t *u,
void
ff_v210_planar_pack_10_ssse3
(
const
uint16_t
*
y
,
const
uint16_t
*
u
,
const
uint16_t
*
v
,
uint8_t
*
dst
,
ptrdiff_t
width
);
void
ff_v210_planar_pack_10_avx2
(
const
uint16_t
*
y
,
const
uint16_t
*
u
,
const
uint16_t
*
v
,
uint8_t
*
dst
,
ptrdiff_t
width
);
av_cold
void
ff_v210enc_init_x86
(
V210EncContext
*
s
)
{
...
...
@@ -44,6 +47,7 @@ av_cold void ff_v210enc_init_x86(V210EncContext *s)
if
(
EXTERNAL_AVX2
(
cpu_flags
))
{
s
->
pack_line_8
=
ff_v210_planar_pack_8_avx2
;
s
->
pack_line_10
=
ff_v210_planar_pack_10_avx2
;
s
->
sample_factor
=
2
;
}
}
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment