Skip to content
Projects
Groups
Snippets
Help
Loading...
Sign in / Register
Toggle navigation
F
ffmpeg
Project
Project
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Packages
Packages
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
submodule
ffmpeg
Commits
5ce703a6
Commit
5ce703a6
authored
Apr 06, 2016
by
Ronald S. Bultje
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
vf_colorspace: x86-64 SIMD (SSE2) optimizations.
parent
2e2e08a3
Hide whitespace changes
Inline
Side-by-side
Showing
9 changed files
with
1543 additions
and
0 deletions
+1543
-0
colorspacedsp.c
libavfilter/colorspacedsp.c
+3
-0
colorspacedsp.h
libavfilter/colorspacedsp.h
+3
-0
Makefile
libavfilter/x86/Makefile
+2
-0
colorspacedsp.asm
libavfilter/x86/colorspacedsp.asm
+1097
-0
colorspacedsp_init.c
libavfilter/x86/colorspacedsp_init.c
+119
-0
Makefile
tests/checkasm/Makefile
+1
-0
checkasm.c
tests/checkasm/checkasm.c
+3
-0
checkasm.h
tests/checkasm/checkasm.h
+1
-0
vf_colorspace.c
tests/checkasm/vf_colorspace.c
+314
-0
No files found.
libavfilter/colorspacedsp.c
View file @
5ce703a6
...
@@ -128,4 +128,7 @@ void ff_colorspacedsp_init(ColorSpaceDSPContext *dsp)
...
@@ -128,4 +128,7 @@ void ff_colorspacedsp_init(ColorSpaceDSPContext *dsp)
init_yuv2yuv_fns
(
2
,
12
);
init_yuv2yuv_fns
(
2
,
12
);
dsp
->
multiply3x3
=
multiply3x3_c
;
dsp
->
multiply3x3
=
multiply3x3_c
;
if
(
ARCH_X86
)
ff_colorspacedsp_x86_init
(
dsp
);
}
}
libavfilter/colorspacedsp.h
View file @
5ce703a6
...
@@ -48,4 +48,7 @@ typedef struct ColorSpaceDSPContext {
...
@@ -48,4 +48,7 @@ typedef struct ColorSpaceDSPContext {
void
ff_colorspacedsp_init
(
ColorSpaceDSPContext
*
dsp
);
void
ff_colorspacedsp_init
(
ColorSpaceDSPContext
*
dsp
);
/* internal */
void
ff_colorspacedsp_x86_init
(
ColorSpaceDSPContext
*
dsp
);
#endif
/* AVFILTER_COLORSPACEDSP_H */
#endif
/* AVFILTER_COLORSPACEDSP_H */
libavfilter/x86/Makefile
View file @
5ce703a6
OBJS-$(CONFIG_BLEND_FILTER)
+=
x86/vf_blend_init.o
OBJS-$(CONFIG_BLEND_FILTER)
+=
x86/vf_blend_init.o
OBJS-$(CONFIG_BWDIF_FILTER)
+=
x86/vf_bwdif_init.o
OBJS-$(CONFIG_BWDIF_FILTER)
+=
x86/vf_bwdif_init.o
OBJS-$(CONFIG_COLORSPACE_FILTER)
+=
x86/colorspacedsp_init.o
OBJS-$(CONFIG_EQ_FILTER)
+=
x86/vf_eq.o
OBJS-$(CONFIG_EQ_FILTER)
+=
x86/vf_eq.o
OBJS-$(CONFIG_FSPP_FILTER)
+=
x86/vf_fspp_init.o
OBJS-$(CONFIG_FSPP_FILTER)
+=
x86/vf_fspp_init.o
OBJS-$(CONFIG_GRADFUN_FILTER)
+=
x86/vf_gradfun_init.o
OBJS-$(CONFIG_GRADFUN_FILTER)
+=
x86/vf_gradfun_init.o
...
@@ -23,6 +24,7 @@ OBJS-$(CONFIG_YADIF_FILTER) += x86/vf_yadif_init.o
...
@@ -23,6 +24,7 @@ OBJS-$(CONFIG_YADIF_FILTER) += x86/vf_yadif_init.o
YASM-OBJS-$(CONFIG_BLEND_FILTER)
+=
x86/vf_blend.o
YASM-OBJS-$(CONFIG_BLEND_FILTER)
+=
x86/vf_blend.o
YASM-OBJS-$(CONFIG_BWDIF_FILTER)
+=
x86/vf_bwdif.o
YASM-OBJS-$(CONFIG_BWDIF_FILTER)
+=
x86/vf_bwdif.o
YASM-OBJS-$(CONFIG_COLORSPACE_FILTER)
+=
x86/colorspacedsp.o
YASM-OBJS-$(CONFIG_FSPP_FILTER)
+=
x86/vf_fspp.o
YASM-OBJS-$(CONFIG_FSPP_FILTER)
+=
x86/vf_fspp.o
YASM-OBJS-$(CONFIG_GRADFUN_FILTER)
+=
x86/vf_gradfun.o
YASM-OBJS-$(CONFIG_GRADFUN_FILTER)
+=
x86/vf_gradfun.o
YASM-OBJS-$(CONFIG_HQDN3D_FILTER)
+=
x86/vf_hqdn3d.o
YASM-OBJS-$(CONFIG_HQDN3D_FILTER)
+=
x86/vf_hqdn3d.o
...
...
libavfilter/x86/colorspacedsp.asm
0 → 100644
View file @
5ce703a6
;*****************************************************************************
;* x86-optimized functions for colorspace filter
;*
;* Copyright (C) 2016 Ronald S. Bultje <rsbultje@gmail.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
%include
"libavutil/x86/x86util.asm"
SECTION_RODATA

; word (dw) constants, 8 lanes wide — used as rounding terms, multipliers
; and clip limits (pw_1023/pw_4095 are the 10/12-bit max pixel values)
pw_1:      times 8 dw 1
pw_2:      times 8 dw 2
pw_4:      times 8 dw 4
pw_8:      times 8 dw 8
pw_16:     times 8 dw 16
pw_64:     times 8 dw 64
pw_128:    times 8 dw 128
pw_256:    times 8 dw 256
pw_512:    times 8 dw 512
pw_1023:   times 8 dw 1023
pw_1024:   times 8 dw 1024
pw_2048:   times 8 dw 2048
pw_4095:   times 8 dw 4095
pw_8192:   times 8 dw 8192
pw_16384:  times 8 dw 16384

; dword (dd) constants, 4 lanes wide — rounding biases for the
; intermediate 32-bit accumulators before the final arithmetic shift
pd_1:      times 4 dd 1
pd_2:      times 4 dd 2
pd_128:    times 4 dd 128
pd_512:    times 4 dd 512
pd_2048:   times 4 dd 2048
pd_8192:   times 4 dd 8192
pd_32768:  times 4 dd 32768
pd_131072: times 4 dd 131072

SECTION .text
; void ff_yuv2yuv_420p8to8_sse2(uint8_t *yuv_out[3], ptrdiff_t yuv_out_stride[3],
; uint8_t *yuv_in[3], ptrdiff_t yuv_in_stride[3],
; int w, int h, const int16_t yuv2yuv_coeffs[3][3][8],
; const int16_t yuv_offset[2][8])
%if
ARCH_X86_64
; Emit one yuv2yuv_<ss>p<in>to<out>_sse2 function: converts planar YUV
; between bit depths / colorspaces using the 3x3 fixed-point coefficient
; matrix in cq and the input/output luma offsets in yoffq.
%macro YUV2YUV_FN 4 ; in_bitdepth, out_bitdepth, log2_chroma_w (horiz), log2_chroma_h (vert)

%assign %%sh (14 + %1 - %2)
%assign %%rnd (1 << (%%sh - 1))
%assign %%uvinoff (128 << (%1 - 8))
%assign %%uvoutoff (128 << (%2 - 8))
%if %3 == 0
%assign %%ss 444
%elif %4 == 0
%assign %%ss 422
%else ; %4 == 1
%assign %%ss 420
%endif ; %3/%4
%if %2 != 8
%assign %%maxval (1 << %2) - 1
%endif ; %2 != 8

%assign %%ypsh %%sh - 1
%if %%ypsh > 14
%assign %%yoffsh %%ypsh - 13
%assign %%ypsh 14
%else
%assign %%yoffsh 1
%endif
%assign %%yprnd (1 << (%%yoffsh - 1))
%assign %%ypmul (1 << %%ypsh)

cglobal yuv2yuv_%+%%ss%+p%1to%2, 8, 14, 16, 0 - (4 * mmsize), \
                                 yo, yos, yi, yis, w, h, c, yoff, ui, vi, uo, vo
%if %3 == 1
    inc            wd
    sar            wd, 1
%if %4 == 1
    inc            hd
    sar            hd, 1
%endif ; %4 == 1
%endif ; %3 == 1
    ; spill chroma-resolution width/height; all GPRs are needed for pointers
    mov [rsp+3*mmsize+0], wd
    mov [rsp+3*mmsize+4], hd
    mova          m10, [cq]
    pxor          m11, m11
    mova          m12, [pd_%+%%uvoutoff]
    pslld         m12, %%sh
    paddd         m12, [pd_%+%%rnd]
    mova          m13, [pw_%+%%uvinoff]
    mova          m14, [yoffq+ 0]          ; y_off_in
    mova          m15, [yoffq+16]          ; y_off_out
%if %%yoffsh != 0
    psllw         m15, %%yoffsh
%endif
    paddw         m15, [pw_%+%%yprnd]
    punpcklwd     m10, m15
    mova          m15, [pw_%+%%ypmul]
    movh           m0, [cq+1*16]           ; cyu
    movh           m1, [cq+2*16]           ; cyv
    movh           m2, [cq+4*16]           ; cuu
    movh           m3, [cq+5*16]           ; cuv
    movh           m4, [cq+7*16]           ; cvu
    movh           m5, [cq+8*16]           ; cvv
    punpcklwd      m0, m1
    punpcklwd      m2, m3
    punpcklwd      m4, m5
    mova [rsp+0*mmsize], m0
    mova [rsp+1*mmsize], m2
    mova [rsp+2*mmsize], m4

    ; unpack the plane-pointer/stride triplets into individual registers
    DEFINE_ARGS yo, yos, yi, yis, ui, vi, uo, vo, uis, vis, uos, vos, x, tmp
    mov           uiq, [yiq+gprsize*1]
    mov           viq, [yiq+gprsize*2]
    mov           yiq, [yiq+gprsize*0]
    mov           uoq, [yoq+gprsize*1]
    mov           voq, [yoq+gprsize*2]
    mov           yoq, [yoq+gprsize*0]
    mov          uisq, [yisq+gprsize*1]
    mov          visq, [yisq+gprsize*2]
    mov          yisq, [yisq+gprsize*0]
    mov          uosq, [yosq+gprsize*1]
    mov          vosq, [yosq+gprsize*2]
    mov          yosq, [yosq+gprsize*0]

.loop_v:
    xor            xq, xq

.loop_h:
%if %4 == 1
    lea          tmpq, [yiq+yisq]
%endif ; %4 == 1
%if %1 == 8
    movu           m0, [yiq+xq*(1<<%3)]    ; y00/01
%if %4 == 1
    movu           m2, [tmpq+xq*2]         ; y10/11
%endif ; %4 == 1
%if %3 == 1
    movh           m4, [uiq+xq]            ; u
    movh           m5, [viq+xq]            ; v
%else ; %3 != 1
    movu           m4, [uiq+xq]            ; u
    movu           m5, [viq+xq]            ; v
%endif ; %3 ==/!= 1
    punpckhbw      m1, m0, m11
    punpcklbw      m0, m11
%if %4 == 1
    punpckhbw      m3, m2, m11
    punpcklbw      m2, m11
%endif ; %4 == 1
%if %3 == 0
    punpckhbw      m2, m4, m11
    punpckhbw      m3, m5, m11
%endif ; %3 == 0
    punpcklbw      m4, m11
    punpcklbw      m5, m11
%else ; %1 != 8
    movu           m0, [yiq+xq*(2<<%3)]    ; y00/01
    movu           m1, [yiq+xq*(2<<%3)+mmsize] ; y00/01
%if %4 == 1
    movu           m2, [tmpq+xq*4]         ; y10/11
    movu           m3, [tmpq+xq*4+mmsize]  ; y10/11
%endif ; %4 == 1
    movu           m4, [uiq+xq*2]          ; u
    movu           m5, [viq+xq*2]          ; v
%if %3 == 0
    movu           m2, [uiq+xq*2+mmsize]
    movu           m3, [viq+xq*2+mmsize]
%endif ; %3 == 0
%endif ; %1 ==/!= 8
    psubw          m0, m14
    psubw          m1, m14
%if %4 == 1
    psubw          m2, m14
    psubw          m3, m14
%endif ; %4 == 1
    psubw          m4, m13
    psubw          m5, m13
%if %3 == 0
    psubw          m2, m13
    psubw          m3, m13
%endif ; %3 == 0

    ; first chroma output plane (u)
    SBUTTERFLY     wd, 4, 5, 6
    pmaddwd        m6, m4, [rsp+1*mmsize]
    pmaddwd        m7, m5, [rsp+1*mmsize]
%if %3 == 0
    SBUTTERFLY     wd, 2, 3, 8
    pmaddwd        m8, m2, [rsp+1*mmsize]
    pmaddwd        m9, m3, [rsp+1*mmsize]
%else ; %3 != 0
    pmaddwd        m8, m4, [rsp+2*mmsize]
    pmaddwd        m9, m5, [rsp+2*mmsize]
%endif
    paddd          m6, m12
    paddd          m7, m12
    paddd          m8, m12
    paddd          m9, m12
    psrad          m6, %%sh
    psrad          m7, %%sh
    psrad          m8, %%sh
    psrad          m9, %%sh
    packssdw       m6, m7
    packssdw       m8, m9
%if %2 == 8
    packuswb       m6, m8
%if %3 == 0
    movu    [uoq+xq], m6
%else ; %3 != 0
    movh    [uoq+xq], m6
    movhps  [voq+xq], m6
%endif ; %3 ==/!= 0
%else ; %2 != 8
    CLIPW          m6, m11, [pw_%+%%maxval]
    CLIPW          m8, m11, [pw_%+%%maxval]
    movu  [uoq+xq*2], m6
%if %3 == 0
    movu [uoq+xq*2+mmsize], m8
%else ; %3 != 0
    movu  [voq+xq*2], m8
%endif ; %3 ==/!= 0
%endif ; %2 ==/!= 8

%if %3 == 0
    ; second chroma output plane (v), 4:4:4 only
    pmaddwd        m6, m4, [rsp+2*mmsize]
    pmaddwd        m7, m5, [rsp+2*mmsize]
    pmaddwd        m8, m2, [rsp+2*mmsize]
    pmaddwd        m9, m3, [rsp+2*mmsize]
    paddd          m6, m12
    paddd          m7, m12
    paddd          m8, m12
    paddd          m9, m12
    psrad          m6, %%sh
    psrad          m7, %%sh
    psrad          m8, %%sh
    psrad          m9, %%sh
    packssdw       m6, m7
    packssdw       m8, m9
%if %2 == 8
    packuswb       m6, m8
    movu    [voq+xq], m6
%else ; %2 != 8
    CLIPW          m6, m11, [pw_%+%%maxval]
    CLIPW          m8, m11, [pw_%+%%maxval]
    movu  [voq+xq*2], m6
    movu [voq+xq*2+mmsize], m8
%endif ; %2 ==/!= 8
%endif ; %3 == 0

    pmaddwd        m4, [rsp+0*mmsize]
    pmaddwd        m5, [rsp+0*mmsize]      ; uv_val
%if %3 == 0
    pmaddwd        m2, [rsp+0*mmsize]
    pmaddwd        m3, [rsp+0*mmsize]
%endif ; %3 == 0

    ; unpack y pixels with m15 (shifted round + offset), then multiply
    ; by m10, add uv pixels, and we're done!
%if %3 == 1
    punpckhdq      m8, m4, m4
    punpckldq      m4, m4
    punpckhdq      m9, m5, m5
    punpckldq      m5, m5
%else ; %3 != 1
    SWAP            8, 5, 2
    SWAP            3, 9
%endif ; %3 ==/!= 1
%if %4 == 1
    punpckhwd      m6, m2, m15
    punpcklwd      m2, m15
    punpckhwd      m7, m3, m15
    punpcklwd      m3, m15
    pmaddwd        m2, m10
    pmaddwd        m6, m10
    pmaddwd        m3, m10
    pmaddwd        m7, m10
    paddd          m2, m4
    paddd          m6, m8
    paddd          m3, m5
    paddd          m7, m9
    psrad          m2, %%sh
    psrad          m6, %%sh
    psrad          m3, %%sh
    psrad          m7, %%sh
    packssdw       m2, m6
    packssdw       m3, m7
    lea          tmpq, [yoq+yosq]
%if %2 == 8
    packuswb       m2, m3
    movu [tmpq+xq*2], m2
%else ; %2 != 8
    CLIPW          m2, m11, [pw_%+%%maxval]
    CLIPW          m3, m11, [pw_%+%%maxval]
    movu [tmpq+xq*4], m2
    movu [tmpq+xq*4+mmsize], m3
%endif ; %2 ==/!= 8
%endif ; %4 == 1

    punpckhwd      m6, m0, m15
    punpcklwd      m0, m15
    punpckhwd      m7, m1, m15
    punpcklwd      m1, m15
    pmaddwd        m0, m10
    pmaddwd        m6, m10
    pmaddwd        m1, m10
    pmaddwd        m7, m10
    paddd          m0, m4
    paddd          m6, m8
    paddd          m1, m5
    paddd          m7, m9
    psrad          m0, %%sh
    psrad          m6, %%sh
    psrad          m1, %%sh
    psrad          m7, %%sh
    packssdw       m0, m6
    packssdw       m1, m7
%if %2 == 8
    packuswb       m0, m1
    movu [yoq+xq*(1<<%3)], m0
%else ; %2 != 8
    CLIPW          m0, m11, [pw_%+%%maxval]
    CLIPW          m1, m11, [pw_%+%%maxval]
    movu [yoq+xq*(2<<%3)], m0
    movu [yoq+xq*(2<<%3)+mmsize], m1
%endif ; %2 ==/!= 8

    add            xq, mmsize >> %3
    cmp            xd, dword [rsp+3*mmsize+0]
    jl .loop_h

%if %4 == 1
    lea           yiq, [yiq+yisq*2]
    lea           yoq, [yoq+yosq*2]
%else ; %4 != 1
    add           yiq, yisq
    add           yoq, yosq
%endif ; %4 ==/!= 1
    add           uiq, uisq
    add           viq, visq
    add           uoq, uosq
    add           voq, vosq
    dec dword [rsp+3*mmsize+4]
    jg .loop_v

    RET
%endmacro
; Instantiate yuv2yuv for one subsampling mode, over all 3x3 combinations
; of input/output bit depth (8, 10, 12).
%macro YUV2YUV_FNS 2 ; ss_w, ss_h
YUV2YUV_FN  8,  8, %1, %2
YUV2YUV_FN 10,  8, %1, %2
YUV2YUV_FN 12,  8, %1, %2
YUV2YUV_FN  8, 10, %1, %2
YUV2YUV_FN 10, 10, %1, %2
YUV2YUV_FN 12, 10, %1, %2
YUV2YUV_FN  8, 12, %1, %2
YUV2YUV_FN 10, 12, %1, %2
YUV2YUV_FN 12, 12, %1, %2
%endmacro
; SSE2 instantiations: 4:4:4, 4:2:2 and 4:2:0
INIT_XMM sse2
YUV2YUV_FNS 0, 0
YUV2YUV_FNS 1, 0
YUV2YUV_FNS 1, 1
; void ff_yuv2rgb_420p8_sse2(int16_t *rgb[3], ptrdiff_t rgb_stride,
; uint8_t *yuv[3], ptrdiff_t yuv_stride[3],
; int w, int h, const int16_t yuv2rgb_coeffs[3][3][8],
; const int16_t yuv_offset[8])
; Emit one yuv2rgb_<ss>p<depth>_sse2 function: converts planar YUV to
; planar 15-bit signed RGB using the coefficient matrix in cq and the
; luma offset in yoffq.
%macro YUV2RGB_FN 3 ; depth, log2_chroma_w (horiz), log2_chroma_h (vert)
%assign %%sh (%1 - 1)
%assign %%rnd (1 << (%%sh - 1))
%assign %%uvoff (1 << (%1 - 1))
%if %2 == 0
%assign %%ss 444
%elif %3 == 0
%assign %%ss 422
%else ; %3 == 1
%assign %%ss 420
%endif ; %2/%3

cglobal yuv2rgb_%+%%ss%+p%1, 8, 14, 16, 0 - 8 * mmsize, \
                             rgb, rgbs, yuv, yuvs, ww, h, c, yoff
%if %2 == 1
    inc           wwd
    sar           wwd, 1
%endif ; %2 == 1
%if %3 == 1
    inc            hd
    sar            hd, 1
%endif ; %3 == 1
    pxor          m11, m11
    mova          m15, [yoffq]             ; yoff
    movh          m14, [cq+  0]            ; cy
    movh          m10, [cq+ 32]            ; crv
    movh          m13, [cq+112]            ; cbu
    movh          m12, [cq+ 64]            ; cgu
    movh           m9, [cq+ 80]            ; cgv
    punpcklwd     m14, [pw_%+%%rnd]        ; cy, rnd
    punpcklwd     m13, m11                 ; cbu, 0
    punpcklwd     m11, m10                 ; 0, crv
    punpcklwd     m12, m9                  ; cgu, cgv
    mova [rsp+0*mmsize], m11
    mova [rsp+1*mmsize], m12
    mova [rsp+2*mmsize], m13
    mova [rsp+3*mmsize], m14
    pxor          m14, m14

    DEFINE_ARGS r, rgbs, y, ys, ww, h, g, b, u, v, us, vs, x, tmp
    mov            gq, [rq+1*gprsize]
    mov            bq, [rq+2*gprsize]
    mov            rq, [rq+0*gprsize]
    mov            uq, [yq+1*gprsize]
    mov            vq, [yq+2*gprsize]
    mov            yq, [yq+0*gprsize]
    mov           usq, [ysq+1*gprsize]
    mov           vsq, [ysq+2*gprsize]
    mov           ysq, [ysq+0*gprsize]

.loop_v:
    xor            xq, xq

.loop_h:
%if %3 == 1
    lea          tmpq, [yq+ysq]
%endif ; %3 == 1
%if %1 == 8
    movu           m0, [yq+xq*(1<<%2)]
%if %3 == 1
    movu           m2, [tmpq+xq*2]
%endif ; %3 == 1
%if %2 == 1
    movh           m4, [uq+xq]
    movh           m5, [vq+xq]
%else ; %2 != 1
    movu           m4, [uq+xq]
    movu           m5, [vq+xq]
%endif ; %2 ==/!= 1
    punpckhbw      m1, m0, m14
    punpcklbw      m0, m14
%if %3 == 1
    punpckhbw      m3, m2, m14
    punpcklbw      m2, m14
%endif ; %3 == 1
%if %2 == 0
    punpckhbw      m2, m4, m14
    punpckhbw      m3, m5, m14
%endif ; %2 == 0
    punpcklbw      m4, m14
    punpcklbw      m5, m14
%else ; %1 != 8
    movu           m0, [yq+xq*(2<<%2)]
    movu           m1, [yq+xq*(2<<%2)+mmsize]
%if %3 == 1
    movu           m2, [tmpq+xq*4]
    movu           m3, [tmpq+xq*4+mmsize]
%endif ; %3 == 1
    movu           m4, [uq+xq*2]
    movu           m5, [vq+xq*2]
%if %2 == 0
    movu           m2, [uq+xq*2+mmsize]
    movu           m3, [vq+xq*2+mmsize]
%endif ; %2 == 0
%endif ; %1 ==/!= 8
    psubw          m0, m15
    psubw          m1, m15
%if %3 == 1
    psubw          m2, m15
    psubw          m3, m15
%endif ; %3 == 1
    psubw          m4, [pw_%+%%uvoff]
    psubw          m5, [pw_%+%%uvoff]
    SBUTTERFLY     wd, 4, 5, 6
%if %2 == 0
    psubw          m2, [pw_%+%%uvoff]
    psubw          m3, [pw_%+%%uvoff]
    SBUTTERFLY     wd, 2, 3, 6
%endif ; %2 == 0

    ; calculate y+rnd full-resolution [0-3,6-9]
    punpckhwd      m6, m0, [pw_1]          ; y, 1
    punpcklwd      m0, [pw_1]              ; y, 1
    punpckhwd      m7, m1, [pw_1]          ; y, 1
    punpcklwd      m1, [pw_1]              ; y, 1
    pmaddwd        m0, [rsp+3*mmsize]
    pmaddwd        m6, [rsp+3*mmsize]
    pmaddwd        m1, [rsp+3*mmsize]
    pmaddwd        m7, [rsp+3*mmsize]
%if %3 == 1
    punpckhwd      m8, m2, [pw_1]          ; y, 1
    punpcklwd      m2, [pw_1]              ; y, 1
    punpckhwd      m9, m3, [pw_1]          ; y, 1
    punpcklwd      m3, [pw_1]              ; y, 1
    pmaddwd        m2, [rsp+3*mmsize]
    pmaddwd        m8, [rsp+3*mmsize]
    pmaddwd        m3, [rsp+3*mmsize]
    pmaddwd        m9, [rsp+3*mmsize]
    mova [rsp+4*mmsize], m2
    mova [rsp+5*mmsize], m8
    mova [rsp+6*mmsize], m3
    mova [rsp+7*mmsize], m9
%endif ; %3 == 1

    ; calculate r offsets (un-subsampled, then duplicate)
    pmaddwd       m10, m4, [rsp+0*mmsize]
%if %2 == 1
    pmaddwd       m12, m5, [rsp+0*mmsize]
    punpckhdq     m11, m10, m10
    punpckldq     m10, m10
    punpckhdq     m13, m12, m12
    punpckldq     m12, m12
%else ; %2 != 1
    pmaddwd       m11, m5, [rsp+0*mmsize]
    pmaddwd       m12, m2, [rsp+0*mmsize]
    pmaddwd       m13, m3, [rsp+0*mmsize]
%endif ; %2 ==/!= 1
%if %3 == 1
    paddd          m2, m10, [rsp+4*mmsize]
    paddd          m3, m11, [rsp+5*mmsize]
    paddd          m8, m12, [rsp+6*mmsize]
    paddd          m9, m13, [rsp+7*mmsize]
%endif
    paddd         m10, m0
    paddd         m11, m6
    paddd         m12, m1
    paddd         m13, m7
%if %3 == 1
    psrad          m2, %%sh
    psrad          m3, %%sh
    psrad          m8, %%sh
    psrad          m9, %%sh
%endif ; %3 == 1
    psrad         m10, %%sh
    psrad         m11, %%sh
    psrad         m12, %%sh
    psrad         m13, %%sh
%if %3 == 1
    lea          tmpq, [rq+rgbsq*2]
    packssdw       m2, m3
    packssdw       m8, m9
    mova [tmpq+xq*4], m2
    mova [tmpq+xq*4+mmsize], m8
%endif ; %3 == 1
    packssdw      m10, m11
    packssdw      m12, m13
    mova [rq+xq*(2<<%2)], m10
    mova [rq+xq*(2<<%2)+mmsize], m12

    ; calculate g offsets (un-subsampled, then duplicate)
    pmaddwd       m10, m4, [rsp+1*mmsize]
%if %2 == 1
    pmaddwd       m12, m5, [rsp+1*mmsize]
    punpckhdq     m11, m10, m10
    punpckldq     m10, m10
    punpckhdq     m13, m12, m12
    punpckldq     m12, m12
%else ; %2 != 1
    pmaddwd       m11, m5, [rsp+1*mmsize]
    pmaddwd       m12, m2, [rsp+1*mmsize]
    pmaddwd       m13, m3, [rsp+1*mmsize]
%endif ; %2 ==/!= 1
%if %3 == 1
    paddd          m2, m10, [rsp+4*mmsize]
    paddd          m3, m11, [rsp+5*mmsize]
    paddd          m8, m12, [rsp+6*mmsize]
    paddd          m9, m13, [rsp+7*mmsize]
%endif ; %3 == 1
    paddd         m10, m0
    paddd         m11, m6
    paddd         m12, m1
    paddd         m13, m7
%if %3 == 1
    psrad          m2, %%sh
    psrad          m3, %%sh
    psrad          m8, %%sh
    psrad          m9, %%sh
%endif ; %3 == 1
    psrad         m10, %%sh
    psrad         m11, %%sh
    psrad         m12, %%sh
    psrad         m13, %%sh
%if %3 == 1
    lea          tmpq, [gq+rgbsq*2]
    packssdw       m2, m3
    packssdw       m8, m9
    mova [tmpq+xq*4], m2
    mova [tmpq+xq*4+mmsize], m8
%endif ; %3 == 1
    packssdw      m10, m11
    packssdw      m12, m13
    mova [gq+xq*(2<<%2)], m10
    mova [gq+xq*(2<<%2)+mmsize], m12

    ; calculate b offsets (un-subsampled, then duplicate)
    pmaddwd        m4, [rsp+2*mmsize]
    pmaddwd        m5, [rsp+2*mmsize]
%if %2 == 1
    punpckhdq      m2, m4, m4
    punpckldq      m4, m4
    punpckhdq      m3, m5, m5
    punpckldq      m5, m5
%else ; %2 != 1
    pmaddwd        m2, [rsp+2*mmsize]
    pmaddwd        m3, [rsp+2*mmsize]
    SWAP            2, 5
%endif ; %2 ==/!= 1
    paddd          m0, m4
    paddd          m6, m2
    paddd          m1, m5
    paddd          m7, m3
%if %3 == 1
    paddd          m4, [rsp+4*mmsize]
    paddd          m2, [rsp+5*mmsize]
    paddd          m5, [rsp+6*mmsize]
    paddd          m3, [rsp+7*mmsize]
%endif ; %3 == 1
    psrad          m0, %%sh
    psrad          m6, %%sh
    psrad          m1, %%sh
    psrad          m7, %%sh
%if %3 == 1
    psrad          m4, %%sh
    psrad          m2, %%sh
    psrad          m5, %%sh
    psrad          m3, %%sh
%endif ; %3 == 1
    packssdw       m0, m6
    packssdw       m1, m7
    movu [bq+xq*(2<<%2)], m0
    movu [bq+xq*(2<<%2)+mmsize], m1
%if %3 == 1
    lea          tmpq, [bq+rgbsq*2]
    packssdw       m4, m2
    packssdw       m5, m3
    movu [tmpq+xq*4], m4
    movu [tmpq+xq*4+mmsize], m5
%endif ; %3 == 1

    add            xd, mmsize >> %2
    cmp            xd, wwd
    jl .loop_h

    lea            rq, [rq+rgbsq*(2<<%3)]
    lea            gq, [gq+rgbsq*(2<<%3)]
    lea            bq, [bq+rgbsq*(2<<%3)]
%if %3 == 1
    lea            yq, [yq+ysq*2]
%else ; %3 != 0
    add            yq, ysq
%endif ; %3 ==/!= 1
    add            uq, usq
    add            vq, vsq
    dec            hd
    jg .loop_v

    RET
%endmacro
; Instantiate yuv2rgb for one subsampling mode at all supported depths.
%macro YUV2RGB_FNS 2
YUV2RGB_FN  8, %1, %2
YUV2RGB_FN 10, %1, %2
YUV2RGB_FN 12, %1, %2
%endmacro
; SSE2 instantiations: 4:4:4, 4:2:2 and 4:2:0
INIT_XMM sse2
YUV2RGB_FNS 0, 0
YUV2RGB_FNS 1, 0
YUV2RGB_FNS 1, 1
; Emit one rgb2yuv_<ss>p<depth>_sse2 function: converts planar 15-bit
; signed RGB to planar YUV using the coefficient matrix in cq and the
; luma offset in offq.
%macro RGB2YUV_FN 3 ; depth, log2_chroma_w (horiz), log2_chroma_h (vert)
%assign %%sh 29 - %1
%assign %%rnd (1 << (%%sh - 15))
%assign %%uvrnd ((128 << (%1 - 8)) << (%%sh - 14))
%if %1 != 8
%assign %%maxval ((1 << %1) - 1)
%endif ; %1 != 8
%if %2 == 0
%assign %%ss 444
%elif %3 == 0
%assign %%ss 422
%else ; %3 == 1
%assign %%ss 420
%endif ; %2/%3

cglobal rgb2yuv_%+%%ss%+p%1, 8, 14, 16, 0 - 6 * mmsize, \
                             yuv, yuvs, rgb, rgbs, ww, h, c, off
%if %2 == 1
    inc           wwd
    sar           wwd, 1
%endif ; %2 == 1
%if %3 == 1
    inc            hd
    sar            hd, 1
%endif ; %3 == 1

    ; prepare coeffs
    movh           m8, [offq]
    movh           m9, [pw_%+%%uvrnd]
    psllw          m8, %%sh - 14
    paddw          m9, [pw_%+%%rnd]
    paddw          m8, [pw_%+%%rnd]
    movh           m0, [cq+  0]
    movh           m1, [cq+ 16]
    movh           m2, [cq+ 32]
    movh           m3, [cq+ 48]
    movh           m4, [cq+ 64]
    movh           m5, [cq+ 80]
    movh           m6, [cq+112]
    movh           m7, [cq+128]
    punpcklwd      m0, m1
    punpcklwd      m2, m8
    punpcklwd      m3, m4
    punpcklwd      m4, m5, m9
    punpcklwd      m5, m6
    punpcklwd      m7, m9
    mova [rsp+0*mmsize], m0                ; cry, cgy
    mova [rsp+1*mmsize], m2                ; cby, off + rnd
    mova [rsp+2*mmsize], m3                ; cru, cgu
    mova [rsp+3*mmsize], m4                ; cburv, uvoff + rnd
    mova [rsp+4*mmsize], m5                ; cburv, cgv
    mova [rsp+5*mmsize], m7                ; cbv, uvoff + rnd

    DEFINE_ARGS y, ys, r, rgbs, ww, h, u, v, us, vs, g, b, tmp, x
    mov            gq, [rq+gprsize*1]
    mov            bq, [rq+gprsize*2]
    mov            rq, [rq+gprsize*0]
    mov            uq, [yq+gprsize*1]
    mov            vq, [yq+gprsize*2]
    mov            yq, [yq+gprsize*0]
    mov           usq, [ysq+gprsize*1]
    mov           vsq, [ysq+gprsize*2]
    mov           ysq, [ysq+gprsize*0]
    pxor          m15, m15

.loop_v:
    xor            xd, xd

.loop_h:
    ; top line y
    mova           m0, [rq+xq*(2<<%2)]
    mova           m3, [rq+xq*(2<<%2)+mmsize]
    mova           m1, [gq+xq*(2<<%2)]
    mova           m4, [gq+xq*(2<<%2)+mmsize]
    mova           m2, [bq+xq*(2<<%2)]
    mova           m5, [bq+xq*(2<<%2)+mmsize]
    punpcklwd      m6, m0, m1
    punpckhwd      m7, m0, m1
    punpcklwd      m8, m3, m4
    punpckhwd      m9, m3, m4
    punpcklwd     m10, m2, [pw_16384]
    punpckhwd     m11, m2, [pw_16384]
    punpcklwd     m12, m5, [pw_16384]
    punpckhwd     m13, m5, [pw_16384]
    pmaddwd        m6, [rsp+0*mmsize]
    pmaddwd        m7, [rsp+0*mmsize]
    pmaddwd        m8, [rsp+0*mmsize]
    pmaddwd        m9, [rsp+0*mmsize]
    pmaddwd       m10, [rsp+1*mmsize]
    pmaddwd       m11, [rsp+1*mmsize]
    pmaddwd       m12, [rsp+1*mmsize]
    pmaddwd       m13, [rsp+1*mmsize]
    paddd          m6, m10
    paddd          m7, m11
    paddd          m8, m12
    paddd          m9, m13
    psrad          m6, %%sh
    psrad          m7, %%sh
    psrad          m8, %%sh
    psrad          m9, %%sh
    packssdw       m6, m7
    packssdw       m8, m9
%if %1 == 8
    packuswb       m6, m8
    movu [yq+xq*(1<<%2)], m6
%else
    CLIPW          m6, m15, [pw_%+%%maxval]
    CLIPW          m8, m15, [pw_%+%%maxval]
    movu [yq+xq*(2<<%2)], m6
    movu [yq+xq*(2<<%2)+mmsize], m8
%endif

%if %2 == 1
    ; subsampling cached data
    pmaddwd        m0, [pw_1]
    pmaddwd        m1, [pw_1]
    pmaddwd        m2, [pw_1]
    pmaddwd        m3, [pw_1]
    pmaddwd        m4, [pw_1]
    pmaddwd        m5, [pw_1]
%if %3 == 1
    ; bottom line y, r/g portion only
    lea          tmpq, [rgbsq+xq*2]
    mova           m6, [rq+tmpq*2]
    mova           m9, [rq+tmpq*2+mmsize]
    mova           m7, [gq+tmpq*2]
    mova          m10, [gq+tmpq*2+mmsize]
    mova           m8, [bq+tmpq*2]
    mova          m11, [bq+tmpq*2+mmsize]
    punpcklwd     m12, m6, m7
    punpckhwd     m13, m6, m7
    punpcklwd     m14, m9, m10
    punpckhwd     m15, m9, m10

    ; release two more registers
    pmaddwd        m6, [pw_1]
    pmaddwd        m7, [pw_1]
    pmaddwd        m9, [pw_1]
    pmaddwd       m10, [pw_1]
    paddd          m0, m6
    paddd          m3, m9
    paddd          m1, m7
    paddd          m4, m10

    ; bottom line y, b/rnd portion only
    punpcklwd      m6, m8, [pw_16384]
    punpckhwd      m7, m8, [pw_16384]
    punpcklwd      m9, m11, [pw_16384]
    punpckhwd     m10, m11, [pw_16384]
    pmaddwd       m12, [rsp+0*mmsize]
    pmaddwd       m13, [rsp+0*mmsize]
    pmaddwd       m14, [rsp+0*mmsize]
    pmaddwd       m15, [rsp+0*mmsize]
    pmaddwd        m6, [rsp+1*mmsize]
    pmaddwd        m7, [rsp+1*mmsize]
    pmaddwd        m9, [rsp+1*mmsize]
    pmaddwd       m10, [rsp+1*mmsize]
    paddd         m12, m6
    paddd         m13, m7
    paddd         m14, m9
    paddd         m15, m10
    psrad         m12, %%sh
    psrad         m13, %%sh
    psrad         m14, %%sh
    psrad         m15, %%sh
    packssdw      m12, m13
    packssdw      m14, m15
    lea          tmpq, [yq+ysq]
%if %1 == 8
    packuswb      m12, m14
    movu [tmpq+xq*2], m12
%else
    pxor          m15, m15
    CLIPW         m12, m15, [pw_%+%%maxval]
    CLIPW         m14, m15, [pw_%+%%maxval]
    movu [tmpq+xq*4], m12
    movu [tmpq+xq*4+mmsize], m14
%endif

    ; complete subsampling of r/g/b pixels for u/v
    pmaddwd        m8, [pw_1]
    pmaddwd       m11, [pw_1]
    paddd          m2, m8
    paddd          m5, m11
    paddd          m0, [pd_2]
    paddd          m1, [pd_2]
    paddd          m2, [pd_2]
    paddd          m3, [pd_2]
    paddd          m4, [pd_2]
    paddd          m5, [pd_2]
    psrad          m0, 2
    psrad          m1, 2
    psrad          m2, 2
    psrad          m3, 2
    psrad          m4, 2
    psrad          m5, 2
%else ; %3 != 1
    paddd          m0, [pd_1]
    paddd          m1, [pd_1]
    paddd          m2, [pd_1]
    paddd          m3, [pd_1]
    paddd          m4, [pd_1]
    paddd          m5, [pd_1]
    psrad          m0, 1
    psrad          m1, 1
    psrad          m2, 1
    psrad          m3, 1
    psrad          m4, 1
    psrad          m5, 1
%endif ; %3 ==/!= 1
    packssdw       m0, m3
    packssdw       m1, m4
    packssdw       m2, m5
%endif ; %2 == 1

    ; convert u/v pixels
    SBUTTERFLY     wd, 0, 1, 6
    punpckhwd      m6, m2, [pw_16384]
    punpcklwd      m2, [pw_16384]
    pmaddwd        m7, m0, [rsp+2*mmsize]
    pmaddwd        m8, m1, [rsp+2*mmsize]
    pmaddwd        m9, m2, [rsp+3*mmsize]
    pmaddwd       m10, m6, [rsp+3*mmsize]
    pmaddwd        m0, [rsp+4*mmsize]
    pmaddwd        m1, [rsp+4*mmsize]
    pmaddwd        m2, [rsp+5*mmsize]
    pmaddwd        m6, [rsp+5*mmsize]
    paddd          m7, m9
    paddd          m8, m10
    paddd          m0, m2
    paddd          m1, m6
    psrad          m7, %%sh
    psrad          m8, %%sh
    psrad          m0, %%sh
    psrad          m1, %%sh
    packssdw       m7, m8
    packssdw       m0, m1
%if %2 == 1
%if %1 == 8
    packuswb       m7, m0
    movh     [uq+xq], m7
    movhps   [vq+xq], m7
%else
    CLIPW          m7, m15, [pw_%+%%maxval]
    CLIPW          m0, m15, [pw_%+%%maxval]
    movu   [uq+xq*2], m7
    movu   [vq+xq*2], m0
%endif
%else ; %2 != 1
    ; second set of u/v pixels
    SBUTTERFLY     wd, 3, 4, 6
    punpckhwd      m6, m5, [pw_16384]
    punpcklwd      m5, [pw_16384]
    pmaddwd        m8, m3, [rsp+2*mmsize]
    pmaddwd        m9, m4, [rsp+2*mmsize]
    pmaddwd       m10, m5, [rsp+3*mmsize]
    pmaddwd       m11, m6, [rsp+3*mmsize]
    pmaddwd        m3, [rsp+4*mmsize]
    pmaddwd        m4, [rsp+4*mmsize]
    pmaddwd        m5, [rsp+5*mmsize]
    pmaddwd        m6, [rsp+5*mmsize]
    paddd          m8, m10
    paddd          m9, m11
    paddd          m3, m5
    paddd          m4, m6
    psrad          m8, %%sh
    psrad          m9, %%sh
    psrad          m3, %%sh
    psrad          m4, %%sh
    packssdw       m8, m9
    packssdw       m3, m4
%if %1 == 8
    packuswb       m7, m8
    packuswb       m0, m3
    movu     [uq+xq], m7
    movu     [vq+xq], m0
%else
    CLIPW          m7, m15, [pw_%+%%maxval]
    CLIPW          m0, m15, [pw_%+%%maxval]
    CLIPW          m8, m15, [pw_%+%%maxval]
    CLIPW          m3, m15, [pw_%+%%maxval]
    movu   [uq+xq*2], m7
    movu [uq+xq*2+mmsize], m8
    movu   [vq+xq*2], m0
    movu [vq+xq*2+mmsize], m3
%endif
%endif ; %2 ==/!= 1

    add            xq, mmsize >> %2
    cmp            xd, wwd
    jl .loop_h

%if %3 == 0
    add            yq, ysq
%else ; %3 != 0
    lea            yq, [yq+ysq*2]
%endif ; %3 ==/!= 0
    add            uq, usq
    add            vq, vsq
    lea            rq, [rq+rgbsq*(2<<%3)]
    lea            gq, [gq+rgbsq*(2<<%3)]
    lea            bq, [bq+rgbsq*(2<<%3)]
    dec            hd
    jg .loop_v

    RET
%endmacro
; Instantiate rgb2yuv for one subsampling mode at all supported depths.
%macro RGB2YUV_FNS 2
RGB2YUV_FN  8, %1, %2
RGB2YUV_FN 10, %1, %2
RGB2YUV_FN 12, %1, %2
%endmacro
; SSE2 instantiations: 4:4:4, 4:2:2 and 4:2:0
INIT_XMM sse2
RGB2YUV_FNS 0, 0
RGB2YUV_FNS 1, 0
RGB2YUV_FNS 1, 1
; void ff_multiply3x3_sse2(int16_t *data[3], ptrdiff_t stride,
;                          int w, int h, const int16_t coeff[3][3][8])
; In-place 3x3 fixed-point matrix multiply over three 16-bit planes.
INIT_XMM sse2
cglobal multiply3x3, 5, 7, 16, data, stride, ww, h, c
    ; interleave coefficient pairs; pw_8192 supplies the Q14 rounding term
    movh           m0, [cq+  0]
    movh           m1, [cq+ 32]
    movh           m2, [cq+ 48]
    movh           m3, [cq+ 80]
    movh           m4, [cq+ 96]
    movh           m5, [cq+128]
    punpcklwd      m0, [cq+ 16]
    punpcklwd      m1, [pw_8192]
    punpcklwd      m2, [cq+ 64]
    punpcklwd      m3, [pw_8192]
    punpcklwd      m4, [cq+112]
    punpcklwd      m5, [pw_8192]

    DEFINE_ARGS data0, stride, ww, h, data1, data2, x
    shl       strideq, 1
    mov        data1q, [data0q+gprsize*1]
    mov        data2q, [data0q+gprsize*2]
    mov        data0q, [data0q+gprsize*0]

.loop_v:
    xor            xd, xd

.loop_h:
    mova           m6, [data0q+xq*2]
    mova           m7, [data1q+xq*2]
    mova           m8, [data2q+xq*2]
    SBUTTERFLY     wd, 6, 7, 9
    punpckhwd      m9, m8, [pw_1]
    punpcklwd      m8, [pw_1]

    ; first output row
    pmaddwd       m10, m6, m0
    pmaddwd       m11, m7, m0
    pmaddwd       m12, m8, m1
    pmaddwd       m13, m9, m1
    paddd         m10, m12
    paddd         m11, m13
    psrad         m10, 14
    psrad         m11, 14

    ; second output row
    pmaddwd       m12, m6, m2
    pmaddwd       m13, m7, m2
    pmaddwd       m14, m8, m3
    pmaddwd       m15, m9, m3
    paddd         m12, m14
    paddd         m13, m15
    psrad         m12, 14
    psrad         m13, 14

    ; third output row
    pmaddwd        m6, m4
    pmaddwd        m7, m4
    pmaddwd        m8, m5
    pmaddwd        m9, m5
    paddd          m6, m8
    paddd          m7, m9
    psrad          m6, 14
    psrad          m7, 14

    packssdw      m10, m11
    packssdw      m12, m13
    packssdw       m6, m7
    mova [data0q+xq*2], m10
    mova [data1q+xq*2], m12
    mova [data2q+xq*2], m6

    add            xd, mmsize / 2
    cmp            xd, wwd
    jl .loop_h

    add        data0q, strideq
    add        data1q, strideq
    add        data2q, strideq
    dec            hd
    jg .loop_v

    RET
%endif
libavfilter/x86/colorspacedsp_init.c
0 → 100644
View file @
5ce703a6
/*
* Copyright (c) 2016 Ronald S. Bultje <rsbultje@gmail.com>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/x86/cpu.h"
#include "libavfilter/colorspacedsp.h"
#define decl_yuv2yuv_fn(t) \
void ff_yuv2yuv_##t##_sse2(uint8_t *yuv_out[3], ptrdiff_t yuv_out_stride[3], \
uint8_t *yuv_in[3], ptrdiff_t yuv_in_stride[3], \
int w, int h, const int16_t yuv2yuv_coeffs[3][3][8], \
const int16_t yuv_offset[2][8])
#define decl_yuv2yuv_fns(ss) \
decl_yuv2yuv_fn(ss##p8to8); \
decl_yuv2yuv_fn(ss##p10to8); \
decl_yuv2yuv_fn(ss##p12to8); \
decl_yuv2yuv_fn(ss##p8to10); \
decl_yuv2yuv_fn(ss##p10to10); \
decl_yuv2yuv_fn(ss##p12to10); \
decl_yuv2yuv_fn(ss##p8to12); \
decl_yuv2yuv_fn(ss##p10to12); \
decl_yuv2yuv_fn(ss##p12to12)
/* Prototypes for the SSE2 yuv2yuv conversions, one per subsampling mode. */
decl_yuv2yuv_fns(420);
decl_yuv2yuv_fns(422);
decl_yuv2yuv_fns(444);

#define decl_yuv2rgb_fn(t) \
void ff_yuv2rgb_##t##_sse2(int16_t *rgb_out[3], ptrdiff_t rgb_stride, \
                           uint8_t *yuv_in[3], ptrdiff_t yuv_stride[3], \
                           int w, int h, const int16_t coeff[3][3][8], \
                           const int16_t yuv_offset[8])

#define decl_yuv2rgb_fns(ss) \
decl_yuv2rgb_fn(ss##p8); \
decl_yuv2rgb_fn(ss##p10); \
decl_yuv2rgb_fn(ss##p12)

/* Prototypes for the SSE2 yuv2rgb conversions, one per subsampling mode. */
decl_yuv2rgb_fns(420);
decl_yuv2rgb_fns(422);
decl_yuv2rgb_fns(444);

#define decl_rgb2yuv_fn(t) \
void ff_rgb2yuv_##t##_sse2(uint8_t *yuv_out[3], ptrdiff_t yuv_stride[3], \
                           int16_t *rgb_in[3], ptrdiff_t rgb_stride, \
                           int w, int h, const int16_t coeff[3][3][8], \
                           const int16_t yuv_offset[8])

#define decl_rgb2yuv_fns(ss) \
decl_rgb2yuv_fn(ss##p8); \
decl_rgb2yuv_fn(ss##p10); \
decl_rgb2yuv_fn(ss##p12)

/* Prototypes for the SSE2 rgb2yuv conversions, one per subsampling mode. */
decl_rgb2yuv_fns(420);
decl_rgb2yuv_fns(422);
decl_rgb2yuv_fns(444);

/* In-place 3x3 fixed-point matrix multiply over three 16-bit planes. */
void ff_multiply3x3_sse2(int16_t *data[3], ptrdiff_t stride, int w, int h,
                         const int16_t coeff[3][3][8]);
/*
 * Install the x86 (SSE2, x86-64 only) implementations into the DSP
 * context when the host CPU supports them; otherwise leave the C
 * fallbacks set by ff_colorspacedsp_init() untouched.
 */
void ff_colorspacedsp_x86_init(ColorSpaceDSPContext *dsp)
{
    int cpu_flags = av_get_cpu_flags();

    /* the assembly uses 14 GPRs / 16 XMM registers, so it is 64-bit only */
    if (ARCH_X86_64 && EXTERNAL_SSE2(cpu_flags)) {
#define assign_yuv2yuv_fns(idx, ss) \
    dsp->yuv2yuv[0][0][idx] = ff_yuv2yuv_##ss##p8to8_sse2; \
    dsp->yuv2yuv[0][1][idx] = ff_yuv2yuv_##ss##p8to10_sse2; \
    dsp->yuv2yuv[0][2][idx] = ff_yuv2yuv_##ss##p8to12_sse2; \
    dsp->yuv2yuv[1][0][idx] = ff_yuv2yuv_##ss##p10to8_sse2; \
    dsp->yuv2yuv[1][1][idx] = ff_yuv2yuv_##ss##p10to10_sse2; \
    dsp->yuv2yuv[1][2][idx] = ff_yuv2yuv_##ss##p10to12_sse2; \
    dsp->yuv2yuv[2][0][idx] = ff_yuv2yuv_##ss##p12to8_sse2; \
    dsp->yuv2yuv[2][1][idx] = ff_yuv2yuv_##ss##p12to10_sse2; \
    dsp->yuv2yuv[2][2][idx] = ff_yuv2yuv_##ss##p12to12_sse2

        assign_yuv2yuv_fns(2, 420);
        assign_yuv2yuv_fns(1, 422);
        assign_yuv2yuv_fns(0, 444);

#define assign_yuv2rgb_fns(idx, ss) \
    dsp->yuv2rgb[0][idx] = ff_yuv2rgb_##ss##p8_sse2; \
    dsp->yuv2rgb[1][idx] = ff_yuv2rgb_##ss##p10_sse2; \
    dsp->yuv2rgb[2][idx] = ff_yuv2rgb_##ss##p12_sse2

        assign_yuv2rgb_fns(2, 420);
        assign_yuv2rgb_fns(1, 422);
        assign_yuv2rgb_fns(0, 444);

#define assign_rgb2yuv_fns(idx, ss) \
    dsp->rgb2yuv[0][idx] = ff_rgb2yuv_##ss##p8_sse2; \
    dsp->rgb2yuv[1][idx] = ff_rgb2yuv_##ss##p10_sse2; \
    dsp->rgb2yuv[2][idx] = ff_rgb2yuv_##ss##p12_sse2

        assign_rgb2yuv_fns(2, 420);
        assign_rgb2yuv_fns(1, 422);
        assign_rgb2yuv_fns(0, 444);

        dsp->multiply3x3 = ff_multiply3x3_sse2;
    }
}
tests/checkasm/Makefile
View file @
5ce703a6
...
@@ -16,6 +16,7 @@ CHECKASMOBJS-$(CONFIG_AVCODEC) += $(AVCODECOBJS-yes)
...
@@ -16,6 +16,7 @@ CHECKASMOBJS-$(CONFIG_AVCODEC) += $(AVCODECOBJS-yes)
# libavfilter tests
# libavfilter tests
AVFILTEROBJS-$(CONFIG_BLEND_FILTER)
+=
vf_blend.o
AVFILTEROBJS-$(CONFIG_BLEND_FILTER)
+=
vf_blend.o
AVFILTEROBJS-$(CONFIG_COLORSPACE_FILTER)
+=
vf_colorspace.o
CHECKASMOBJS-$(CONFIG_AVFILTER)
+=
$(AVFILTEROBJS-yes)
CHECKASMOBJS-$(CONFIG_AVFILTER)
+=
$(AVFILTEROBJS-yes)
...
...
tests/checkasm/checkasm.c
View file @
5ce703a6
...
@@ -106,6 +106,9 @@ static const struct {
...
@@ -106,6 +106,9 @@ static const struct {
#if CONFIG_BLEND_FILTER
#if CONFIG_BLEND_FILTER
{
"vf_blend"
,
checkasm_check_blend
},
{
"vf_blend"
,
checkasm_check_blend
},
#endif
#endif
#if CONFIG_COLORSPACE_FILTER
{
"vf_colorspace"
,
checkasm_check_colorspace
},
#endif
#endif
#endif
{
NULL
}
{
NULL
}
};
};
...
...
tests/checkasm/checkasm.h
View file @
5ce703a6
...
@@ -33,6 +33,7 @@
...
@@ -33,6 +33,7 @@
void
checkasm_check_alacdsp
(
void
);
void
checkasm_check_alacdsp
(
void
);
void
checkasm_check_blend
(
void
);
void
checkasm_check_blend
(
void
);
void
checkasm_check_bswapdsp
(
void
);
void
checkasm_check_bswapdsp
(
void
);
void
checkasm_check_colorspace
(
void
);
void
checkasm_check_flacdsp
(
void
);
void
checkasm_check_flacdsp
(
void
);
void
checkasm_check_fmtconvert
(
void
);
void
checkasm_check_fmtconvert
(
void
);
void
checkasm_check_h264pred
(
void
);
void
checkasm_check_h264pred
(
void
);
...
...
tests/checkasm/vf_colorspace.c
0 → 100644
View file @
5ce703a6
/*
* Copyright (c) 2016 Ronald S. Bultje <rsbultje@gmail.com>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include <string.h>
#include "checkasm.h"
#include "libavfilter/colorspacedsp.h"
#include "libavutil/common.h"
#include "libavutil/internal.h"
#include "libavutil/intreadwrite.h"
#define W 64
#define H 64
#define randomize_buffers() \
do { \
unsigned mask = bpp_mask[idepth]; \
int n, m; \
int bpp = 1 + (!!idepth); \
int buf_size = W * H * bpp; \
for (m = 0; m < 3; m++) { \
int ss = m ? ss_w + ss_h : 0; \
int plane_sz = buf_size >> ss; \
for (n = 0; n < plane_sz; n += 4) { \
unsigned r = rnd() & mask; \
AV_WN32A(&src[m][n], r); \
} \
} \
} while (0)
static
const
char
*
format_string
[]
=
{
"444"
,
"422"
,
"420"
};
static
unsigned
bpp_mask
[]
=
{
0xffffffff
,
0x03ff03ff
,
0x0fff0fff
};
static
void
check_yuv2yuv
(
void
)
{
declare_func
(
void
,
uint8_t
*
dst
[
3
],
ptrdiff_t
dst_stride
[
3
],
uint8_t
*
src
[
3
],
ptrdiff_t
src_stride
[
3
],
int
w
,
int
h
,
const
int16_t
coeff
[
3
][
3
][
8
],
const
int16_t
off
[
2
][
8
]);
ColorSpaceDSPContext
dsp
;
int
idepth
,
odepth
,
fmt
,
n
;
LOCAL_ALIGNED_32
(
uint8_t
,
src_y
,
[
W
*
H
*
2
]);
LOCAL_ALIGNED_32
(
uint8_t
,
src_u
,
[
W
*
H
*
2
]);
LOCAL_ALIGNED_32
(
uint8_t
,
src_v
,
[
W
*
H
*
2
]);
uint8_t
*
src
[
3
]
=
{
src_y
,
src_u
,
src_v
};
LOCAL_ALIGNED_32
(
uint8_t
,
dst0_y
,
[
W
*
H
*
2
]);
LOCAL_ALIGNED_32
(
uint8_t
,
dst0_u
,
[
W
*
H
*
2
]);
LOCAL_ALIGNED_32
(
uint8_t
,
dst0_v
,
[
W
*
H
*
2
]);
LOCAL_ALIGNED_32
(
uint8_t
,
dst1_y
,
[
W
*
H
*
2
]);
LOCAL_ALIGNED_32
(
uint8_t
,
dst1_u
,
[
W
*
H
*
2
]);
LOCAL_ALIGNED_32
(
uint8_t
,
dst1_v
,
[
W
*
H
*
2
]);
uint8_t
*
dst0
[
3
]
=
{
dst0_y
,
dst0_u
,
dst0_v
},
*
dst1
[
3
]
=
{
dst1_y
,
dst1_u
,
dst1_v
};
LOCAL_ALIGNED_32
(
int16_t
,
offset_buf
,
[
16
]);
LOCAL_ALIGNED_32
(
int16_t
,
coeff_buf
,
[
3
*
3
*
8
]);
int16_t
(
*
offset
)[
8
]
=
(
int16_t
(
*
)[
8
])
offset_buf
;
int16_t
(
*
coeff
)[
3
][
8
]
=
(
int16_t
(
*
)[
3
][
8
])
coeff_buf
;
ff_colorspacedsp_init
(
&
dsp
);
for
(
n
=
0
;
n
<
8
;
n
++
)
{
offset
[
0
][
n
]
=
offset
[
1
][
n
]
=
16
;
coeff
[
0
][
0
][
n
]
=
(
1
<<
14
)
+
(
1
<<
7
)
+
1
;
coeff
[
0
][
1
][
n
]
=
(
1
<<
7
)
-
1
;
coeff
[
0
][
2
][
n
]
=
-
(
1
<<
8
);
coeff
[
1
][
0
][
n
]
=
coeff
[
2
][
0
][
n
]
=
0
;
coeff
[
1
][
1
][
n
]
=
(
1
<<
14
)
+
(
1
<<
7
);
coeff
[
1
][
2
][
n
]
=
-
(
1
<<
7
);
coeff
[
2
][
2
][
n
]
=
(
1
<<
14
)
-
(
1
<<
6
);
coeff
[
2
][
1
][
n
]
=
1
<<
6
;
}
for
(
idepth
=
0
;
idepth
<
3
;
idepth
++
)
{
for
(
odepth
=
0
;
odepth
<
3
;
odepth
++
)
{
for
(
fmt
=
0
;
fmt
<
3
;
fmt
++
)
{
if
(
check_func
(
dsp
.
yuv2yuv
[
idepth
][
odepth
][
fmt
],
"ff_colorspacedsp_yuv2yuv_%sp%dto%d"
,
format_string
[
fmt
],
idepth
*
2
+
8
,
odepth
*
2
+
8
))
{
int
ss_w
=
!!
fmt
,
ss_h
=
fmt
==
2
;
int
y_src_stride
=
W
<<
!!
idepth
,
y_dst_stride
=
W
<<
!!
odepth
;
int
uv_src_stride
=
y_src_stride
>>
ss_w
,
uv_dst_stride
=
y_dst_stride
>>
ss_w
;
randomize_buffers
();
call_ref
(
dst0
,
(
ptrdiff_t
[
3
])
{
y_dst_stride
,
uv_dst_stride
,
uv_dst_stride
},
src
,
(
ptrdiff_t
[
3
])
{
y_src_stride
,
uv_src_stride
,
uv_src_stride
},
W
,
H
,
coeff
,
offset
);
call_new
(
dst1
,
(
ptrdiff_t
[
3
])
{
y_dst_stride
,
uv_dst_stride
,
uv_dst_stride
},
src
,
(
ptrdiff_t
[
3
])
{
y_src_stride
,
uv_src_stride
,
uv_src_stride
},
W
,
H
,
coeff
,
offset
);
if
(
memcmp
(
dst0
[
0
],
dst1
[
0
],
y_dst_stride
*
H
)
||
memcmp
(
dst0
[
1
],
dst1
[
1
],
uv_dst_stride
*
H
>>
ss_h
)
||
memcmp
(
dst0
[
2
],
dst1
[
2
],
uv_dst_stride
*
H
>>
ss_h
))
{
fail
();
}
}
}
}
}
report
(
"yuv2yuv"
);
}
static
void
check_yuv2rgb
(
void
)
{
declare_func
(
void
,
int16_t
*
dst
[
3
],
ptrdiff_t
dst_stride
,
uint8_t
*
src
[
3
],
ptrdiff_t
src_stride
[
3
],
int
w
,
int
h
,
const
int16_t
coeff
[
3
][
3
][
8
],
const
int16_t
off
[
8
]);
ColorSpaceDSPContext
dsp
;
int
idepth
,
fmt
,
n
;
LOCAL_ALIGNED_32
(
uint8_t
,
src_y
,
[
W
*
H
*
2
]);
LOCAL_ALIGNED_32
(
uint8_t
,
src_u
,
[
W
*
H
*
2
]);
LOCAL_ALIGNED_32
(
uint8_t
,
src_v
,
[
W
*
H
*
2
]);
uint8_t
*
src
[
3
]
=
{
src_y
,
src_u
,
src_v
};
LOCAL_ALIGNED_32
(
int16_t
,
dst0_y
,
[
W
*
H
]);
LOCAL_ALIGNED_32
(
int16_t
,
dst0_u
,
[
W
*
H
]);
LOCAL_ALIGNED_32
(
int16_t
,
dst0_v
,
[
W
*
H
]);
LOCAL_ALIGNED_32
(
int16_t
,
dst1_y
,
[
W
*
H
]);
LOCAL_ALIGNED_32
(
int16_t
,
dst1_u
,
[
W
*
H
]);
LOCAL_ALIGNED_32
(
int16_t
,
dst1_v
,
[
W
*
H
]);
int16_t
*
dst0
[
3
]
=
{
dst0_y
,
dst0_u
,
dst0_v
},
*
dst1
[
3
]
=
{
dst1_y
,
dst1_u
,
dst1_v
};
LOCAL_ALIGNED_32
(
int16_t
,
offset
,
[
8
]);
LOCAL_ALIGNED_32
(
int16_t
,
coeff_buf
,
[
3
*
3
*
8
]);
int16_t
(
*
coeff
)[
3
][
8
]
=
(
int16_t
(
*
)[
3
][
8
])
coeff_buf
;
ff_colorspacedsp_init
(
&
dsp
);
for
(
n
=
0
;
n
<
8
;
n
++
)
{
offset
[
n
]
=
16
;
coeff
[
0
][
0
][
n
]
=
coeff
[
1
][
0
][
n
]
=
coeff
[
2
][
0
][
n
]
=
(
1
<<
14
)
|
1
;
coeff
[
0
][
1
][
n
]
=
coeff
[
2
][
2
][
n
]
=
0
;
coeff
[
0
][
2
][
n
]
=
1
<<
13
;
coeff
[
1
][
1
][
n
]
=
-
(
1
<<
12
);
coeff
[
1
][
2
][
n
]
=
1
<<
12
;
coeff
[
2
][
1
][
n
]
=
1
<<
11
;
}
for
(
idepth
=
0
;
idepth
<
3
;
idepth
++
)
{
for
(
fmt
=
0
;
fmt
<
3
;
fmt
++
)
{
if
(
check_func
(
dsp
.
yuv2rgb
[
idepth
][
fmt
],
"ff_colorspacedsp_yuv2rgb_%sp%d"
,
format_string
[
fmt
],
idepth
*
2
+
8
))
{
int
ss_w
=
!!
fmt
,
ss_h
=
fmt
==
2
;
int
y_src_stride
=
W
<<
!!
idepth
;
int
uv_src_stride
=
y_src_stride
>>
ss_w
;
randomize_buffers
();
call_ref
(
dst0
,
W
,
src
,
(
ptrdiff_t
[
3
])
{
y_src_stride
,
uv_src_stride
,
uv_src_stride
},
W
,
H
,
coeff
,
offset
);
call_new
(
dst1
,
W
,
src
,
(
ptrdiff_t
[
3
])
{
y_src_stride
,
uv_src_stride
,
uv_src_stride
},
W
,
H
,
coeff
,
offset
);
if
(
memcmp
(
dst0
[
0
],
dst1
[
0
],
W
*
H
*
sizeof
(
int16_t
))
||
memcmp
(
dst0
[
1
],
dst1
[
1
],
W
*
H
*
sizeof
(
int16_t
))
||
memcmp
(
dst0
[
2
],
dst1
[
2
],
W
*
H
*
sizeof
(
int16_t
)))
{
fail
();
}
}
}
}
report
(
"yuv2rgb"
);
}
#undef randomize_buffers
#define randomize_buffers() \
do { \
int y, x, p; \
for (p = 0; p < 3; p++) { \
for (y = 0; y < H; y++) { \
for (x = 0; x < W; x++) { \
int r = rnd() & 0x7fff; \
r -= (32768 - 28672) >> 1; \
src[p][y * W + x] = r; \
} \
} \
} \
} while (0)
static
void
check_rgb2yuv
(
void
)
{
declare_func
(
void
,
uint8_t
*
dst
[
3
],
ptrdiff_t
dst_stride
[
3
],
int16_t
*
src
[
3
],
ptrdiff_t
src_stride
,
int
w
,
int
h
,
const
int16_t
coeff
[
3
][
3
][
8
],
const
int16_t
off
[
8
]);
ColorSpaceDSPContext
dsp
;
int
odepth
,
fmt
,
n
;
LOCAL_ALIGNED_32
(
int16_t
,
src_y
,
[
W
*
H
*
2
]);
LOCAL_ALIGNED_32
(
int16_t
,
src_u
,
[
W
*
H
*
2
]);
LOCAL_ALIGNED_32
(
int16_t
,
src_v
,
[
W
*
H
*
2
]);
int16_t
*
src
[
3
]
=
{
src_y
,
src_u
,
src_v
};
LOCAL_ALIGNED_32
(
uint8_t
,
dst0_y
,
[
W
*
H
]);
LOCAL_ALIGNED_32
(
uint8_t
,
dst0_u
,
[
W
*
H
]);
LOCAL_ALIGNED_32
(
uint8_t
,
dst0_v
,
[
W
*
H
]);
LOCAL_ALIGNED_32
(
uint8_t
,
dst1_y
,
[
W
*
H
]);
LOCAL_ALIGNED_32
(
uint8_t
,
dst1_u
,
[
W
*
H
]);
LOCAL_ALIGNED_32
(
uint8_t
,
dst1_v
,
[
W
*
H
]);
uint8_t
*
dst0
[
3
]
=
{
dst0_y
,
dst0_u
,
dst0_v
},
*
dst1
[
3
]
=
{
dst1_y
,
dst1_u
,
dst1_v
};
LOCAL_ALIGNED_32
(
int16_t
,
offset
,
[
8
]);
LOCAL_ALIGNED_32
(
int16_t
,
coeff_buf
,
[
3
*
3
*
8
]);
int16_t
(
*
coeff
)[
3
][
8
]
=
(
int16_t
(
*
)[
3
][
8
])
coeff_buf
;
ff_colorspacedsp_init
(
&
dsp
);
for
(
n
=
0
;
n
<
8
;
n
++
)
{
offset
[
n
]
=
16
;
// these somewhat resemble bt601/smpte170m coefficients
coeff
[
0
][
0
][
n
]
=
lrint
(
0
.
3
*
(
1
<<
14
));
coeff
[
0
][
1
][
n
]
=
lrint
(
0
.
6
*
(
1
<<
14
));
coeff
[
0
][
2
][
n
]
=
lrint
(
0
.
1
*
(
1
<<
14
));
coeff
[
1
][
0
][
n
]
=
lrint
(
-
0
.
15
*
(
1
<<
14
));
coeff
[
1
][
1
][
n
]
=
lrint
(
-
0
.
35
*
(
1
<<
14
));
coeff
[
1
][
2
][
n
]
=
lrint
(
0
.
5
*
(
1
<<
14
));
coeff
[
2
][
0
][
n
]
=
lrint
(
0
.
5
*
(
1
<<
14
));
coeff
[
2
][
1
][
n
]
=
lrint
(
-
0
.
42
*
(
1
<<
14
));
coeff
[
2
][
2
][
n
]
=
lrint
(
-
0
.
08
*
(
1
<<
14
));
}
for
(
odepth
=
0
;
odepth
<
3
;
odepth
++
)
{
for
(
fmt
=
0
;
fmt
<
3
;
fmt
++
)
{
if
(
check_func
(
dsp
.
rgb2yuv
[
odepth
][
fmt
],
"ff_colorspacedsp_rgb2yuv_%sp%d"
,
format_string
[
fmt
],
odepth
*
2
+
8
))
{
int
ss_w
=
!!
fmt
,
ss_h
=
fmt
==
2
;
int
y_dst_stride
=
W
<<
!!
odepth
;
int
uv_dst_stride
=
y_dst_stride
>>
ss_w
;
randomize_buffers
();
call_ref
(
dst0
,
(
ptrdiff_t
[
3
])
{
y_dst_stride
,
uv_dst_stride
,
uv_dst_stride
},
src
,
W
,
W
,
H
,
coeff
,
offset
);
call_new
(
dst1
,
(
ptrdiff_t
[
3
])
{
y_dst_stride
,
uv_dst_stride
,
uv_dst_stride
},
src
,
W
,
W
,
H
,
coeff
,
offset
);
if
(
memcmp
(
dst0
[
0
],
dst1
[
0
],
H
*
y_dst_stride
)
||
memcmp
(
dst0
[
1
],
dst1
[
1
],
H
*
uv_dst_stride
>>
ss_h
)
||
memcmp
(
dst0
[
2
],
dst1
[
2
],
H
*
uv_dst_stride
>>
ss_h
))
{
fail
();
}
}
}
}
report
(
"rgb2yuv"
);
}
static
void
check_multiply3x3
(
void
)
{
declare_func
(
void
,
int16_t
*
data
[
3
],
ptrdiff_t
stride
,
int
w
,
int
h
,
const
int16_t
coeff
[
3
][
3
][
8
]);
ColorSpaceDSPContext
dsp
;
LOCAL_ALIGNED_32
(
int16_t
,
dst0_y
,
[
W
*
H
]);
LOCAL_ALIGNED_32
(
int16_t
,
dst0_u
,
[
W
*
H
]);
LOCAL_ALIGNED_32
(
int16_t
,
dst0_v
,
[
W
*
H
]);
LOCAL_ALIGNED_32
(
int16_t
,
dst1_y
,
[
W
*
H
]);
LOCAL_ALIGNED_32
(
int16_t
,
dst1_u
,
[
W
*
H
]);
LOCAL_ALIGNED_32
(
int16_t
,
dst1_v
,
[
W
*
H
]);
int16_t
*
dst0
[
3
]
=
{
dst0_y
,
dst0_u
,
dst0_v
},
*
dst1
[
3
]
=
{
dst1_y
,
dst1_u
,
dst1_v
};
int16_t
**
src
=
dst0
;
LOCAL_ALIGNED_32
(
int16_t
,
coeff_buf
,
[
3
*
3
*
8
]);
int16_t
(
*
coeff
)[
3
][
8
]
=
(
int16_t
(
*
)[
3
][
8
])
coeff_buf
;
int
n
;
ff_colorspacedsp_init
(
&
dsp
);
for
(
n
=
0
;
n
<
8
;
n
++
)
{
coeff
[
0
][
0
][
n
]
=
lrint
(
0
.
85
*
(
1
<<
14
));
coeff
[
0
][
1
][
n
]
=
lrint
(
0
.
10
*
(
1
<<
14
));
coeff
[
0
][
2
][
n
]
=
lrint
(
0
.
05
*
(
1
<<
14
));
coeff
[
1
][
0
][
n
]
=
lrint
(
-
0
.
1
*
(
1
<<
14
));
coeff
[
1
][
1
][
n
]
=
lrint
(
0
.
95
*
(
1
<<
14
));
coeff
[
1
][
2
][
n
]
=
lrint
(
0
.
15
*
(
1
<<
14
));
coeff
[
2
][
0
][
n
]
=
lrint
(
-
0
.
2
*
(
1
<<
14
));
coeff
[
2
][
1
][
n
]
=
lrint
(
0
.
30
*
(
1
<<
14
));
coeff
[
2
][
2
][
n
]
=
lrint
(
0
.
90
*
(
1
<<
14
));
}
if
(
check_func
(
dsp
.
multiply3x3
,
"ff_colorspacedsp_multiply3x3"
))
{
randomize_buffers
();
memcpy
(
dst1_y
,
dst0_y
,
W
*
H
*
sizeof
(
*
dst1_y
));
memcpy
(
dst1_u
,
dst0_u
,
W
*
H
*
sizeof
(
*
dst1_u
));
memcpy
(
dst1_v
,
dst0_v
,
W
*
H
*
sizeof
(
*
dst1_v
));
call_ref
(
dst0
,
W
,
W
,
H
,
coeff
);
call_new
(
dst1
,
W
,
W
,
H
,
coeff
);
if
(
memcmp
(
dst0
[
0
],
dst1
[
0
],
H
*
W
*
sizeof
(
*
dst0_y
))
||
memcmp
(
dst0
[
1
],
dst1
[
1
],
H
*
W
*
sizeof
(
*
dst0_u
))
||
memcmp
(
dst0
[
2
],
dst1
[
2
],
H
*
W
*
sizeof
(
*
dst0_v
)))
{
fail
();
}
}
report
(
"multiply3x3"
);
}
void
checkasm_check_colorspace
(
void
)
{
check_yuv2yuv
();
check_yuv2rgb
();
check_rgb2yuv
();
check_multiply3x3
();
}
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment