Commit c9d98c56 authored Apr 04, 2017 by Ronald S. Bultje
cavs: convert idct from inline asm to yasm.
parent b51d7d89
Showing 3 changed files with 171 additions and 160 deletions
libavcodec/x86/Makefile       +1    -0
libavcodec/x86/cavsdsp.c      +5    -160
libavcodec/x86/cavsidct.asm   +165  -0
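In outline, the conversion replaces a GCC inline-asm implementation with a standalone yasm source file plus a plain C prototype and a thin wrapper. The following minimal C sketch shows that resulting shape only; it is not the commit itself (the real hunks follow below). It assumes the ff_add_pixels_clamped prototype from its call site and uses a plain array where the real file uses LOCAL_ALIGNED.

    #include <stdint.h>
    #include <stddef.h>

    /* Implemented in the new libavcodec/x86/cavsidct.asm, assembled by yasm. */
    void ff_cavs_idct8_mmx(int16_t *out, const int16_t *in);

    /* Existing FFmpeg helper; prototype inferred here from the call site. */
    void ff_add_pixels_clamped(const int16_t *block, uint8_t *pixels,
                               ptrdiff_t line_size);

    /* The wrapper that previously held ~160 lines of inline asm now just
     * forwards to the external asm routine and adds the clamped result
     * to the destination plane. */
    static void cavs_idct8_add_mmx(uint8_t *dst, int16_t *block, ptrdiff_t stride)
    {
        int16_t b2[64];                  /* the real code 16-byte aligns this */
        ff_cavs_idct8_mmx(b2, block);    /* full 8x8 inverse transform */
        ff_add_pixels_clamped(b2, dst, stride);
    }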
libavcodec/x86/Makefile
@@ -142,6 +142,7 @@ YASM-OBJS-$(CONFIG_ADPCM_G722_DECODER) += x86/g722dsp.o
YASM-OBJS-$(CONFIG_ADPCM_G722_ENCODER) += x86/g722dsp.o
YASM-OBJS-$(CONFIG_ALAC_DECODER)       += x86/alacdsp.o
YASM-OBJS-$(CONFIG_APNG_DECODER)       += x86/pngdsp.o
YASM-OBJS-$(CONFIG_CAVS_DECODER)       += x86/cavsidct.o
YASM-OBJS-$(CONFIG_DCA_DECODER)        += x86/dcadsp.o x86/synth_filter.o
YASM-OBJS-$(CONFIG_DIRAC_DECODER)      += x86/diracdsp.o             \
                                          x86/dirac_dwt.o
libavcodec/x86/cavsdsp.c
@@ -34,172 +34,19 @@
#include "idctdsp.h"
#include "config.h"
#if HAVE_MMX_INLINE
/* in/out: mma=mma+mmb, mmb=mmb-mma */
#define SUMSUB_BA( a, b ) \
"paddw "#b", "#a" \n\t"\
"paddw "#b", "#b" \n\t"\
"psubw "#a", "#b" \n\t"
/*****************************************************************************
*
* inverse transform
*
****************************************************************************/
static
inline
void
cavs_idct8_1d
(
int16_t
*
block
,
uint64_t
bias
)
{
__asm__
volatile
(
"movq 112(%0), %%mm4
\n\t
"
/* mm4 = src7 */
"movq 16(%0), %%mm5
\n\t
"
/* mm5 = src1 */
"movq 80(%0), %%mm2
\n\t
"
/* mm2 = src5 */
"movq 48(%0), %%mm7
\n\t
"
/* mm7 = src3 */
"movq %%mm4, %%mm0
\n\t
"
"movq %%mm5, %%mm3
\n\t
"
"movq %%mm2, %%mm6
\n\t
"
"movq %%mm7, %%mm1
\n\t
"
"paddw %%mm4, %%mm4
\n\t
"
/* mm4 = 2*src7 */
"paddw %%mm3, %%mm3
\n\t
"
/* mm3 = 2*src1 */
"paddw %%mm6, %%mm6
\n\t
"
/* mm6 = 2*src5 */
"paddw %%mm1, %%mm1
\n\t
"
/* mm1 = 2*src3 */
"paddw %%mm4, %%mm0
\n\t
"
/* mm0 = 3*src7 */
"paddw %%mm3, %%mm5
\n\t
"
/* mm5 = 3*src1 */
"paddw %%mm6, %%mm2
\n\t
"
/* mm2 = 3*src5 */
"paddw %%mm1, %%mm7
\n\t
"
/* mm7 = 3*src3 */
"psubw %%mm4, %%mm5
\n\t
"
/* mm5 = 3*src1 - 2*src7 = a0 */
"paddw %%mm6, %%mm7
\n\t
"
/* mm7 = 3*src3 + 2*src5 = a1 */
"psubw %%mm2, %%mm1
\n\t
"
/* mm1 = 2*src3 - 3*src5 = a2 */
"paddw %%mm0, %%mm3
\n\t
"
/* mm3 = 2*src1 + 3*src7 = a3 */
"movq %%mm5, %%mm4
\n\t
"
"movq %%mm7, %%mm6
\n\t
"
"movq %%mm3, %%mm0
\n\t
"
"movq %%mm1, %%mm2
\n\t
"
SUMSUB_BA
(
%%
mm7
,
%%
mm5
)
/* mm7 = a0 + a1 mm5 = a0 - a1 */
"paddw %%mm3, %%mm7
\n\t
"
/* mm7 = a0 + a1 + a3 */
"paddw %%mm1, %%mm5
\n\t
"
/* mm5 = a0 - a1 + a2 */
"paddw %%mm7, %%mm7
\n\t
"
"paddw %%mm5, %%mm5
\n\t
"
"paddw %%mm6, %%mm7
\n\t
"
/* mm7 = b4 */
"paddw %%mm4, %%mm5
\n\t
"
/* mm5 = b5 */
SUMSUB_BA
(
%%
mm1
,
%%
mm3
)
/* mm1 = a3 + a2 mm3 = a3 - a2 */
"psubw %%mm1, %%mm4
\n\t
"
/* mm4 = a0 - a2 - a3 */
"movq %%mm4, %%mm1
\n\t
"
/* mm1 = a0 - a2 - a3 */
"psubw %%mm6, %%mm3
\n\t
"
/* mm3 = a3 - a2 - a1 */
"paddw %%mm1, %%mm1
\n\t
"
"paddw %%mm3, %%mm3
\n\t
"
"psubw %%mm2, %%mm1
\n\t
"
/* mm1 = b7 */
"paddw %%mm0, %%mm3
\n\t
"
/* mm3 = b6 */
"movq 32(%0), %%mm2
\n\t
"
/* mm2 = src2 */
"movq 96(%0), %%mm6
\n\t
"
/* mm6 = src6 */
"movq %%mm2, %%mm4
\n\t
"
"movq %%mm6, %%mm0
\n\t
"
"psllw $2, %%mm4
\n\t
"
/* mm4 = 4*src2 */
"psllw $2, %%mm6
\n\t
"
/* mm6 = 4*src6 */
"paddw %%mm4, %%mm2
\n\t
"
/* mm2 = 5*src2 */
"paddw %%mm6, %%mm0
\n\t
"
/* mm0 = 5*src6 */
"paddw %%mm2, %%mm2
\n\t
"
"paddw %%mm0, %%mm0
\n\t
"
"psubw %%mm0, %%mm4
\n\t
"
/* mm4 = 4*src2 - 10*src6 = a7 */
"paddw %%mm2, %%mm6
\n\t
"
/* mm6 = 4*src6 + 10*src2 = a6 */
"movq (%0), %%mm2
\n\t
"
/* mm2 = src0 */
"movq 64(%0), %%mm0
\n\t
"
/* mm0 = src4 */
SUMSUB_BA
(
%%
mm0
,
%%
mm2
)
/* mm0 = src0+src4 mm2 = src0-src4 */
"psllw $3, %%mm0
\n\t
"
"psllw $3, %%mm2
\n\t
"
"paddw %1, %%mm0
\n\t
"
/* add rounding bias */
"paddw %1, %%mm2
\n\t
"
/* add rounding bias */
SUMSUB_BA
(
%%
mm6
,
%%
mm0
)
/* mm6 = a4 + a6 mm0 = a4 - a6 */
SUMSUB_BA
(
%%
mm4
,
%%
mm2
)
/* mm4 = a5 + a7 mm2 = a5 - a7 */
SUMSUB_BA
(
%%
mm7
,
%%
mm6
)
/* mm7 = dst0 mm6 = dst7 */
SUMSUB_BA
(
%%
mm5
,
%%
mm4
)
/* mm5 = dst1 mm4 = dst6 */
SUMSUB_BA
(
%%
mm3
,
%%
mm2
)
/* mm3 = dst2 mm2 = dst5 */
SUMSUB_BA
(
%%
mm1
,
%%
mm0
)
/* mm1 = dst3 mm0 = dst4 */
::
"r"
(
block
),
"m"
(
bias
)
);
}
#define SBUTTERFLY(a,b,t,n,m)\
    "mov" #m " " #a ", " #t "       \n\t" /* abcd */\
    "punpckl" #n " " #b ", " #a "   \n\t" /* aebf */\
    "punpckh" #n " " #b ", " #t "   \n\t" /* cgdh */\

#if HAVE_MMX_EXTERNAL

#define TRANSPOSE4(a,b,c,d,t)\
    SBUTTERFLY(a,b,t,wd,q)  /* a=aebf t=cgdh */\
    SBUTTERFLY(c,d,b,wd,q)  /* c=imjn b=kolp */\
    SBUTTERFLY(a,c,d,dq,q)  /* a=aeim d=bfjn */\
    SBUTTERFLY(t,b,c,dq,q)  /* t=cgko c=dhlp */

void ff_cavs_idct8_mmx(int16_t *out, const int16_t *in);

static void cavs_idct8_add_mmx(uint8_t *dst, int16_t *block, ptrdiff_t stride)
{
    int i;
    LOCAL_ALIGNED(16, int16_t, b2, [64]);
    for (i = 0; i < 2; i++) {
        cavs_idct8_1d(block + 4 * i, ff_pw_4.a);

        __asm__ volatile(
            "psraw     $3, %%mm7    \n\t"
            "psraw     $3, %%mm6    \n\t"
            "psraw     $3, %%mm5    \n\t"
            "psraw     $3, %%mm4    \n\t"
            "psraw     $3, %%mm3    \n\t"
            "psraw     $3, %%mm2    \n\t"
            "psraw     $3, %%mm1    \n\t"
            "psraw     $3, %%mm0    \n\t"
            "movq   %%mm7,    (%0)  \n\t"
            TRANSPOSE4( %%mm0, %%mm2, %%mm4, %%mm6, %%mm7 )
            "movq   %%mm0,   8(%0)  \n\t"
            "movq   %%mm6,  24(%0)  \n\t"
            "movq   %%mm7,  40(%0)  \n\t"
            "movq   %%mm4,  56(%0)  \n\t"
            "movq    (%0), %%mm7    \n\t"
            TRANSPOSE4( %%mm7, %%mm5, %%mm3, %%mm1, %%mm0 )
            "movq   %%mm7,    (%0)  \n\t"
            "movq   %%mm1,  16(%0)  \n\t"
            "movq   %%mm0,  32(%0)  \n\t"
            "movq   %%mm3,  48(%0)  \n\t"
            : : "r"(b2 + 32 * i) : "memory"
        );
    }
    for (i = 0; i < 2; i++) {
        cavs_idct8_1d(b2 + 4 * i, ff_pw_64.a);

        __asm__ volatile(
            "psraw     $7, %%mm7    \n\t"
            "psraw     $7, %%mm6    \n\t"
            "psraw     $7, %%mm5    \n\t"
            "psraw     $7, %%mm4    \n\t"
            "psraw     $7, %%mm3    \n\t"
            "psraw     $7, %%mm2    \n\t"
            "psraw     $7, %%mm1    \n\t"
            "psraw     $7, %%mm0    \n\t"
            "movq   %%mm7,    (%0)  \n\t"
            "movq   %%mm5,  16(%0)  \n\t"
            "movq   %%mm3,  32(%0)  \n\t"
            "movq   %%mm1,  48(%0)  \n\t"
            "movq   %%mm0,  64(%0)  \n\t"
            "movq   %%mm2,  80(%0)  \n\t"
            "movq   %%mm4,  96(%0)  \n\t"
            "movq   %%mm6, 112(%0)  \n\t"
            :: "r"(b2 + 4 * i) : "memory"
        );
    }

    ff_cavs_idct8_mmx(b2, block);
    ff_add_pixels_clamped(b2, dst, stride);
}

#endif /* HAVE_MMX_INLINE */
#endif /* HAVE_MMX_EXTERNAL */
#if (HAVE_MMXEXT_INLINE || HAVE_AMD3DNOW_INLINE)
@@ -529,12 +376,10 @@ static av_cold void cavsdsp_init_mmx(CAVSDSPContext *c,
    c->put_cavs_qpel_pixels_tab[1][0] = put_cavs_qpel8_mc00_mmx;
    c->avg_cavs_qpel_pixels_tab[0][0] = avg_cavs_qpel16_mc00_mmx;
    c->avg_cavs_qpel_pixels_tab[1][0] = avg_cavs_qpel8_mc00_mmx;
#endif

#if HAVE_MMX_INLINE
    c->cavs_idct8_add = cavs_idct8_add_mmx;
    c->idct_perm      = FF_IDCT_PERM_TRANSPOSE;
#endif /* HAVE_MMX_INLINE */
#endif /* HAVE_MMX_EXTERNAL */
}
#define DSPFUNC(PFX, IDX, NUM, EXT) \
libavcodec/x86/cavsidct.asm  0 → 100644 (new file)
; Chinese AVS video (AVS1-P2, JiZhun profile) decoder
; Copyright (c) 2006 Stefan Gehrer <stefan.gehrer@gmx.de>
;
; MMX-optimized DSP functions, based on H.264 optimizations by
; Michael Niedermayer and Loren Merritt
; Conversion from gcc syntax to x264asm syntax with modifications
; by Ronald S. Bultje <rsbultje@gmail.com>
;
; This file is part of FFmpeg.
;
; FFmpeg is free software; you can redistribute it and/or
; modify it under the terms of the GNU Lesser General Public
; License as published by the Free Software Foundation; either
; version 2.1 of the License, or (at your option) any later version.
;
; FFmpeg is distributed in the hope that it will be useful,
; but WITHOUT ANY WARRANTY; without even the implied warranty of
; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
; Lesser General Public License for more details.
;
; You should have received a copy of the GNU Lesser General Public License
; along with FFmpeg; if not, write to the Free Software Foundation,
; Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
%include "libavutil/x86/x86util.asm"

cextern pw_4
cextern pw_64

SECTION .text

%macro CAVS_IDCT8_1D 2 ; source, round
    mova          m4, [%1+7*16]   ; m4 = src7
    mova          m5, [%1+1*16]   ; m5 = src1
    mova          m2, [%1+5*16]   ; m2 = src5
    mova          m7, [%1+3*16]   ; m7 = src3
    mova          m0, m4
    mova          m3, m5
    mova          m6, m2
    mova          m1, m7
    paddw         m4, m4          ; m4 = 2*src7
    paddw         m3, m3          ; m3 = 2*src1
    paddw         m6, m6          ; m6 = 2*src5
    paddw         m1, m1          ; m1 = 2*src3
    paddw         m0, m4          ; m0 = 3*src7
    paddw         m5, m3          ; m5 = 3*src1
    paddw         m2, m6          ; m2 = 3*src5
    paddw         m7, m1          ; m7 = 3*src3
    psubw         m5, m4          ; m5 = 3*src1 - 2*src7 = a0
    paddw         m7, m6          ; m7 = 3*src3 - 2*src5 = a1
    psubw         m1, m2          ; m1 = 2*src3 - 3*src5 = a2
    paddw         m3, m0          ; m3 = 2*src1 - 3*src7 = a3
    mova          m4, m5
    mova          m6, m7
    mova          m0, m3
    mova          m2, m1
    SUMSUB_BA      w, 7, 5        ; m7 = a0 + a1, m5 = a0 - a1
    paddw         m7, m3          ; m7 = a0 + a1 + a3
    paddw         m5, m1          ; m5 = a0 - a1 + a2
    paddw         m7, m7
    paddw         m5, m5
    paddw         m7, m6          ; m7 = b4
    paddw         m5, m4          ; m5 = b5
    SUMSUB_BA      w, 1, 3        ; m1 = a3 + a2, m3 = a3 - a2
    psubw         m4, m1          ; m4 = a0 - a2 - a3
    mova          m1, m4          ; m1 = a0 - a2 - a3
    psubw         m3, m6          ; m3 = a3 - a2 - a1
    paddw         m1, m1
    paddw         m3, m3
    psubw         m1, m2          ; m1 = b7
    paddw         m3, m0          ; m3 = b6
    mova          m2, [%1+2*16]   ; m2 = src2
    mova          m6, [%1+6*16]   ; m6 = src6
    mova          m4, m2
    mova          m0, m6
    psllw         m4, 2           ; m4 = 4*src2
    psllw         m6, 2           ; m6 = 4*src6
    paddw         m2, m4          ; m2 = 5*src2
    paddw         m0, m6          ; m0 = 5*src6
    paddw         m2, m2
    paddw         m0, m0
    psubw         m4, m0          ; m4 = 4*src2 - 10*src6 = a7
    paddw         m6, m2          ; m6 = 4*src6 + 10*src2 = a6
    mova          m2, [%1+0*16]   ; m2 = src0
    mova          m0, [%1+4*16]   ; m0 = src4
    SUMSUB_BA      w, 0, 2        ; m0 = src0 + src4, m2 = src0 - src4
    psllw         m0, 3
    psllw         m2, 3
    paddw         m0, %2          ; add rounding bias
    paddw         m2, %2          ; add rounding bias
    SUMSUB_BA      w, 6, 0        ; m6 = a4 + a6, m0 = a4 - a6
    SUMSUB_BA      w, 4, 2        ; m4 = a5 + a7, m2 = a5 - a7
    SUMSUB_BA      w, 7, 6        ; m7 = dst0, m6 = dst7
    SUMSUB_BA      w, 5, 4        ; m5 = dst1, m4 = dst6
    SUMSUB_BA      w, 3, 2        ; m3 = dst2, m2 = dst5
    SUMSUB_BA      w, 1, 0        ; m1 = dst3, m0 = dst4
%endmacro
INIT_MMX mmx
cglobal cavs_idct8, 2, 4, 8, 8 * 16, out, in, cnt, tmp
    mov         cntd, 2
    mov         tmpq, rsp
.loop_1:
    CAVS_IDCT8_1D inq, [pw_4]

    psraw         m7, 3
    psraw         m6, 3
    psraw         m5, 3
    psraw         m4, 3
    psraw         m3, 3
    psraw         m2, 3
    psraw         m1, 3
    psraw         m0, 3
    mova      [tmpq], m7
    TRANSPOSE4x4W  0, 2, 4, 6, 7
    mova [tmpq+1*8], m0
    mova [tmpq+3*8], m2
    mova [tmpq+5*8], m4
    mova [tmpq+7*8], m6
    mova          m7, [tmpq]
    TRANSPOSE4x4W  7, 5, 3, 1, 0
    mova [tmpq+0*8], m7
    mova [tmpq+2*8], m5
    mova [tmpq+4*8], m3
    mova [tmpq+6*8], m1

    add          inq, mmsize
    add         tmpq, 64
    dec         cntd
    jg .loop_1

    mov         cntd, 2
    mov         tmpq, rsp
.loop_2:
    CAVS_IDCT8_1D tmpq, [pw_64]

    psraw         m7, 7
    psraw         m6, 7
    psraw         m5, 7
    psraw         m4, 7
    psraw         m3, 7
    psraw         m2, 7
    psraw         m1, 7
    psraw         m0, 7
    mova [outq+0*16], m7
    mova [outq+1*16], m5
    mova [outq+2*16], m3
    mova [outq+3*16], m1
    mova [outq+4*16], m0
    mova [outq+5*16], m2
    mova [outq+6*16], m4
    mova [outq+7*16], m6

    add         outq, mmsize
    add         tmpq, mmsize
    dec         cntd
    jg .loop_2
    RET
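For orientation only (not part of the commit): a minimal caller sketch for the new entry point, assuming an x86 build with MMX where the assembled cavsidct object is linked in. The real decoder also guarantees buffer alignment and manages the MMX/FPU state; this sketch skips both.

    #include <stdint.h>
    #include <stdio.h>

    /* Provided by libavcodec/x86/cavsidct.asm (ff_cavs_idct8_mmx above). */
    void ff_cavs_idct8_mmx(int16_t *out, const int16_t *in);

    int main(void)
    {
        /* 8x8 block of 16-bit coefficients, row-major; only a DC value here. */
        int16_t coeffs[64]   = { 0 };
        int16_t residual[64] = { 0 };
        coeffs[0] = 64;

        ff_cavs_idct8_mmx(residual, coeffs);   /* inverse transform into residual */

        printf("residual[0] = %d\n", residual[0]);
        return 0;
    }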