ffmpeg / Commits / 3ced55d5

Commit 3ced55d5, authored Mar 10, 2013 by Ronald S. Bultje, committed by Michael Niedermayer, Mar 13, 2013.
Move x86 half-pel assembly from dsputil to hpeldsp.
parent e0a8f315
Showing 9 changed files with 967 additions and 660 deletions (+967 -660):
libavcodec/hpeldsp.c                   +1    -1
libavcodec/x86/Makefile                +7    -3
libavcodec/x86/dsputil_mmx.c           +10   -203
libavcodec/x86/dsputil_rnd_template.c  +0    -373
libavcodec/x86/fpelbase.asm            +106  -0
libavcodec/x86/hpeldsp_avg_template.c  +0    -0
libavcodec/x86/hpeldsp_init.c          +415  -0
libavcodec/x86/hpeldsp_rnd_template.c  +428  -0
libavcodec/x86/qpelbase.asm            +0    -80
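For orientation before the per-file diffs: after this commit the half-pel put/avg functions live in their own HpelDSPContext instead of DSPContext. A minimal sketch of a consumer, assuming a caller that already has the codec flags (the mc_block wrapper and the 16x16 call are illustrative, not part of the diff; ff_hpeldsp_init and the table layout are from the hunks below):

#include "libavcodec/hpeldsp.h"

/* illustrative only: copy one 16x16 block at a full-pel position */
static void mc_block(uint8_t *dst, const uint8_t *src,
                     ptrdiff_t stride, int flags)
{
    HpelDSPContext hdsp;

    /* installs the C implementations, then (per the hpeldsp.c hunk
       below) lets ff_hpeldsp_init_x86() override them by CPU flags */
    ff_hpeldsp_init(&hdsp, flags);

    /* first index: 0 = 16-wide, 1 = 8-wide; second: half-pel position */
    hdsp.put_pixels_tab[0][0](dst, src, stride, 16);
}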
libavcodec/hpeldsp.c

@@ -53,8 +53,8 @@ av_cold void ff_hpeldsp_init(HpelDSPContext* c, int flags)
     hpel_funcs(avg, [3], 2);
     hpel_funcs(avg_no_rnd,, 16);
-#if 0
     if (ARCH_X86)
         ff_hpeldsp_init_x86(c, flags);
+#if 0
     if (ARCH_ARM)        ff_hpeldsp_init_arm(c, flags);
     if (HAVE_VIS)        ff_hpeldsp_init_vis(c, flags);
     if (ARCH_ALPHA)      ff_hpeldsp_init_alpha(c, flags);
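The second table index selects the half-pel phase. As a hedged illustration (this selection logic lives in the motion-compensation callers, not in this diff), the index is typically derived from the low bits of the motion vector:

/* 0 = full-pel, 1 = x half-pel, 2 = y half-pel, 3 = both (xy) */
int dxy = ((motion_y & 1) << 1) | (motion_x & 1);
hdsp.put_pixels_tab[is_8x8][dxy](dst, src, stride, height);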
libavcodec/x86/Makefile

@@ -10,6 +10,7 @@ OBJS-$(CONFIG_H264CHROMA) += x86/h264chroma_init.o
 OBJS-$(CONFIG_H264DSP)                 += x86/h264dsp_init.o
 OBJS-$(CONFIG_H264PRED)                += x86/h264_intrapred_init.o
 OBJS-$(CONFIG_H264QPEL)                += x86/h264_qpel.o
+OBJS-$(CONFIG_HPELDSP)                 += x86/hpeldsp_init.o
 OBJS-$(CONFIG_LPC)                     += x86/lpc.o
 OBJS-$(CONFIG_MLP_DECODER)             += x86/mlpdsp.o
 OBJS-$(CONFIG_MPEGAUDIODSP)            += x86/mpegaudiodec.o
@@ -66,7 +67,10 @@ YASM-OBJS-$(CONFIG_H264PRED) += x86/h264_intrapred.o \
                                           x86/h264_intrapred_10bit.o
 YASM-OBJS-$(CONFIG_H264QPEL)           += x86/h264_qpel_8bit.o   \
                                           x86/h264_qpel_10bit.o  \
-                                          x86/qpelbase.o
+                                          x86/qpelbase.o         \
+                                          x86/fpelbase.o
+YASM-OBJS-$(CONFIG_HPELDSP)            += x86/hpeldsp.o          \
+                                          x86/fpelbase.o
 YASM-OBJS-$(CONFIG_MPEGAUDIODSP)       += x86/imdct36.o
 YASM-OBJS-$(CONFIG_PNG_DECODER)        += x86/pngdsp.o
 YASM-OBJS-$(CONFIG_PRORES_DECODER)     += x86/proresdsp.o
@@ -83,9 +87,9 @@ YASM-OBJS-$(CONFIG_VP6_DECODER) += x86/vp56dsp.o
 YASM-OBJS-$(CONFIG_VP8_DECODER)        += x86/vp8dsp.o
 YASM-OBJS-$(CONFIG_DSPUTIL)            += x86/dsputil.o        \
+                                          x86/hpeldsp.o        \
                                           x86/mpeg4qpel.o      \
-                                          x86/qpelbase.o
+                                          x86/qpelbase.o       \
+                                          x86/fpelbase.o
 YASM-OBJS                              += x86/deinterlace.o    \
                                           x86/fmtconvert.o
libavcodec/x86/dsputil_mmx.c

@@ -60,10 +60,6 @@ DECLARE_ALIGNED(16, const double, ff_pd_2)[2] = { 2.0, 2.0 };
 #if HAVE_YASM
-void ff_put_pixels8_x2_mmxext(uint8_t *block, const uint8_t *pixels,
-                              ptrdiff_t line_size, int h);
-void ff_put_pixels8_x2_3dnow(uint8_t *block, const uint8_t *pixels,
-                             ptrdiff_t line_size, int h);
 void ff_put_pixels8_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2,
                               int dstStride, int src1Stride, int h);
 void ff_put_no_rnd_pixels8_l2_mmxext(uint8_t *dst, uint8_t *src1,
@@ -71,54 +67,14 @@ void ff_put_no_rnd_pixels8_l2_mmxext(uint8_t *dst, uint8_t *src1,
                                      int src1Stride, int h);
 void ff_avg_pixels8_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2,
                               int dstStride, int src1Stride, int h);
-void ff_put_pixels16_x2_mmxext(uint8_t *block, const uint8_t *pixels,
-                               ptrdiff_t line_size, int h);
-void ff_put_pixels16_x2_3dnow(uint8_t *block, const uint8_t *pixels,
-                              ptrdiff_t line_size, int h);
 void ff_put_pixels16_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2,
                                int dstStride, int src1Stride, int h);
 void ff_avg_pixels16_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2,
                                int dstStride, int src1Stride, int h);
 void ff_put_no_rnd_pixels16_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2,
                                       int dstStride, int src1Stride, int h);
-void ff_put_no_rnd_pixels8_x2_mmxext(uint8_t *block, const uint8_t *pixels,
-                                     ptrdiff_t line_size, int h);
-void ff_put_no_rnd_pixels8_x2_3dnow(uint8_t *block, const uint8_t *pixels,
-                                    ptrdiff_t line_size, int h);
-void ff_put_no_rnd_pixels8_x2_exact_mmxext(uint8_t *block, const uint8_t *pixels,
-                                           ptrdiff_t line_size, int h);
-void ff_put_no_rnd_pixels8_x2_exact_3dnow(uint8_t *block, const uint8_t *pixels,
-                                          ptrdiff_t line_size, int h);
-void ff_put_pixels8_y2_mmxext(uint8_t *block, const uint8_t *pixels,
-                              ptrdiff_t line_size, int h);
-void ff_put_pixels8_y2_3dnow(uint8_t *block, const uint8_t *pixels,
-                             ptrdiff_t line_size, int h);
-void ff_put_no_rnd_pixels8_y2_mmxext(uint8_t *block, const uint8_t *pixels,
-                                     ptrdiff_t line_size, int h);
-void ff_put_no_rnd_pixels8_y2_3dnow(uint8_t *block, const uint8_t *pixels,
-                                    ptrdiff_t line_size, int h);
-void ff_put_no_rnd_pixels8_y2_exact_mmxext(uint8_t *block, const uint8_t *pixels,
-                                           ptrdiff_t line_size, int h);
-void ff_put_no_rnd_pixels8_y2_exact_3dnow(uint8_t *block, const uint8_t *pixels,
-                                          ptrdiff_t line_size, int h);
-void ff_avg_pixels8_3dnow(uint8_t *block, const uint8_t *pixels,
-                          ptrdiff_t line_size, int h);
-void ff_avg_pixels8_x2_mmxext(uint8_t *block, const uint8_t *pixels,
-                              ptrdiff_t line_size, int h);
-void ff_avg_pixels8_x2_3dnow(uint8_t *block, const uint8_t *pixels,
-                             ptrdiff_t line_size, int h);
-void ff_avg_pixels8_y2_mmxext(uint8_t *block, const uint8_t *pixels,
-                              ptrdiff_t line_size, int h);
-void ff_avg_pixels8_y2_3dnow(uint8_t *block, const uint8_t *pixels,
-                             ptrdiff_t line_size, int h);
-void ff_avg_pixels8_xy2_mmxext(uint8_t *block, const uint8_t *pixels,
-                               ptrdiff_t line_size, int h);
-void ff_avg_pixels8_xy2_3dnow(uint8_t *block, const uint8_t *pixels,
-                              ptrdiff_t line_size, int h);
+void ff_avg_pixels8_mmxext(uint8_t *block, const uint8_t *pixels,
+                           ptrdiff_t line_size, int h);

 void ff_put_pixels8_mmxext(uint8_t *block, const uint8_t *pixels,
                            ptrdiff_t line_size, int h);

 static void ff_put_pixels16_mmxext(uint8_t *block, const uint8_t *pixels,
@@ -192,14 +148,6 @@ void ff_put_no_rnd_mpeg4_qpel8_v_lowpass_mmxext(uint8_t *dst, uint8_t *src,
 // using regr as temporary and for the output result
 // first argument is unmodifed and second is trashed
 // regfe is supposed to contain 0xfefefefefefefefe
-#define PAVGB_MMX_NO_RND(rega, regb, regr, regfe)   \
-    "movq  "#rega", "#regr"             \n\t"       \
-    "pand  "#regb", "#regr"             \n\t"       \
-    "pxor  "#rega", "#regb"             \n\t"       \
-    "pand "#regfe", "#regb"             \n\t"       \
-    "psrlq      $1, "#regb"             \n\t"       \
-    "paddb "#regb", "#regr"             \n\t"
 #define PAVGB_MMX(rega, regb, regr, regfe)          \
     "movq  "#rega", "#regr"             \n\t"       \
     "por   "#regb", "#regr"             \n\t"
@@ -209,20 +157,6 @@ void ff_put_no_rnd_mpeg4_qpel8_v_lowpass_mmxext(uint8_t *dst, uint8_t *src,
     "psubb "#regb", "#regr"             \n\t"

 // mm6 is supposed to contain 0xfefefefefefefefe
-#define PAVGBP_MMX_NO_RND(rega, regb, regr, regc, regd, regp)   \
-    "movq  "#rega", "#regr"             \n\t"                   \
-    "movq  "#regc", "#regp"             \n\t"                   \
-    "pand  "#regb", "#regr"             \n\t"                   \
-    "pand  "#regd", "#regp"             \n\t"                   \
-    "pxor  "#rega", "#regb"             \n\t"                   \
-    "pxor  "#regc", "#regd"             \n\t"                   \
-    "pand    %%mm6, "#regb"             \n\t"                   \
-    "pand    %%mm6, "#regd"             \n\t"                   \
-    "psrlq      $1, "#regb"             \n\t"                   \
-    "psrlq      $1, "#regd"             \n\t"                   \
-    "paddb "#regb", "#regr"             \n\t"                   \
-    "paddb "#regd", "#regp"             \n\t"
 #define PAVGBP_MMX(rega, regb, regr, regc, regd, regp)          \
     "movq  "#rega", "#regr"             \n\t"                   \
     "movq  "#regc", "#regp"             \n\t"
@@ -237,22 +171,6 @@ void ff_put_no_rnd_mpeg4_qpel8_v_lowpass_mmxext(uint8_t *dst, uint8_t *src,
     "psubb "#regb", "#regr"             \n\t"                   \
     "psubb "#regd", "#regp"             \n\t"

-/***********************************/
-/* MMX no rounding */
-#define NO_RND 1
-#define DEF(x, y) x ## _no_rnd_ ## y ## _mmx
-#define SET_RND  MOVQ_WONE
-#define PAVGBP(a, b, c, d, e, f)        PAVGBP_MMX_NO_RND(a, b, c, d, e, f)
-#define PAVGB(a, b, c, e)               PAVGB_MMX_NO_RND(a, b, c, e)
-#define OP_AVG(a, b, c, e)              PAVGB_MMX(a, b, c, e)
-
-#include "dsputil_rnd_template.c"
-
-#undef DEF
-#undef SET_RND
-#undef PAVGBP
-#undef PAVGB
-#undef NO_RND
-
 /***********************************/
 /* MMX rounding */
@@ -260,6 +178,7 @@ void ff_put_no_rnd_mpeg4_qpel8_v_lowpass_mmxext(uint8_t *dst, uint8_t *src,
 #define SET_RND  MOVQ_WTWO
 #define PAVGBP(a, b, c, d, e, f)        PAVGBP_MMX(a, b, c, d, e, f)
 #define PAVGB(a, b, c, e)               PAVGB_MMX(a, b, c, e)
+#define OP_AVG(a, b, c, e)              PAVGB_MMX(a, b, c, e)

 #include "dsputil_rnd_template.c"
@@ -274,31 +193,21 @@ void ff_put_no_rnd_mpeg4_qpel8_v_lowpass_mmxext(uint8_t *dst, uint8_t *src,
 #if HAVE_YASM

-/***********************************/
-/* 3Dnow specific */
-
-#define DEF(x) x ## _3dnow
-
-#include "dsputil_avg_template.c"
-
-#undef DEF
-
 /***********************************/
 /* MMXEXT specific */

-#define DEF(x) x ## _mmxext
-
-#include "dsputil_avg_template.c"
-
-#undef DEF
+//FIXME the following could be optimized too ...
+static void ff_avg_pixels16_mmxext(uint8_t *block, const uint8_t *pixels,
+                                   int line_size, int h)
+{
+    ff_avg_pixels8_mmxext(block,     pixels,     line_size, h);
+    ff_avg_pixels8_mmxext(block + 8, pixels + 8, line_size, h);
+}

 #endif /* HAVE_YASM */

 #if HAVE_INLINE_ASM

-#define put_no_rnd_pixels16_mmx put_pixels16_mmx
-#define put_no_rnd_pixels8_mmx  put_pixels8_mmx
-
 /***********************************/
 /* standard MMX */
@@ -1520,14 +1429,6 @@ void ff_vector_clip_int32_sse4(int32_t *dst, const int32_t *src,
     c->PFX ## _pixels_tab[IDX][15] = PREFIX ## PFX ## SIZE ## _mc33_ ## CPU; \
     } while (0)

-#define SET_HPEL_FUNCS(PFX, IDX, SIZE, CPU)                                     \
-    do {                                                                        \
-        c->PFX ## _pixels_tab IDX [0] = PFX ## _pixels ## SIZE ## _ ## CPU;     \
-        c->PFX ## _pixels_tab IDX [1] = PFX ## _pixels ## SIZE ## _x2_ ## CPU;  \
-        c->PFX ## _pixels_tab IDX [2] = PFX ## _pixels ## SIZE ## _y2_ ## CPU;  \
-        c->PFX ## _pixels_tab IDX [3] = PFX ## _pixels ## SIZE ## _xy2_ ## CPU; \
-    } while (0)
-
 static av_cold void dsputil_init_mmx(DSPContext *c, AVCodecContext *avctx,
                                      int mm_flags)
 {
@@ -1542,14 +1443,6 @@ static av_cold void dsputil_init_mmx(DSPContext *c, AVCodecContext *avctx,
         c->clear_block  = clear_block_mmx;
         c->clear_blocks = clear_blocks_mmx;
         c->draw_edges   = draw_edges_mmx;
-
-        SET_HPEL_FUNCS(put,        [0], 16, mmx);
-        SET_HPEL_FUNCS(put_no_rnd, [0], 16, mmx);
-        SET_HPEL_FUNCS(avg,        [0], 16, mmx);
-        SET_HPEL_FUNCS(avg_no_rnd,    , 16, mmx);
-        SET_HPEL_FUNCS(put,        [1],  8, mmx);
-        SET_HPEL_FUNCS(put_no_rnd, [1],  8, mmx);
-        SET_HPEL_FUNCS(avg,        [1],  8, mmx);
     }

 #if CONFIG_VIDEODSP && (ARCH_X86_32 || !HAVE_YASM)
@@ -1584,43 +1477,9 @@ static av_cold void dsputil_init_mmxext(DSPContext *c, AVCodecContext *avctx,
     SET_QPEL_FUNCS(put_qpel,        1,  8, mmxext, );
     SET_QPEL_FUNCS(put_no_rnd_qpel, 0, 16, mmxext, );
     SET_QPEL_FUNCS(put_no_rnd_qpel, 1,  8, mmxext, );

-    if (!high_bit_depth) {
-        c->put_pixels_tab[0][1] = ff_put_pixels16_x2_mmxext;
-        c->put_pixels_tab[0][2] = ff_put_pixels16_y2_mmxext;
-
-        c->avg_pixels_tab[0][0] = ff_avg_pixels16_mmxext;
-        c->avg_pixels_tab[0][1] = ff_avg_pixels16_x2_mmxext;
-        c->avg_pixels_tab[0][2] = ff_avg_pixels16_y2_mmxext;
-
-        c->put_pixels_tab[1][1] = ff_put_pixels8_x2_mmxext;
-        c->put_pixels_tab[1][2] = ff_put_pixels8_y2_mmxext;
-
-        c->avg_pixels_tab[1][0] = ff_avg_pixels8_mmxext;
-        c->avg_pixels_tab[1][1] = ff_avg_pixels8_x2_mmxext;
-        c->avg_pixels_tab[1][2] = ff_avg_pixels8_y2_mmxext;
-    }
-
-    if (!(avctx->flags & CODEC_FLAG_BITEXACT)) {
-        if (!high_bit_depth) {
-            c->put_no_rnd_pixels_tab[0][1] = ff_put_no_rnd_pixels16_x2_mmxext;
-            c->put_no_rnd_pixels_tab[0][2] = ff_put_no_rnd_pixels16_y2_mmxext;
-            c->put_no_rnd_pixels_tab[1][1] = ff_put_no_rnd_pixels8_x2_mmxext;
-            c->put_no_rnd_pixels_tab[1][2] = ff_put_no_rnd_pixels8_y2_mmxext;
-
-            c->avg_pixels_tab[0][3] = ff_avg_pixels16_xy2_mmxext;
-            c->avg_pixels_tab[1][3] = ff_avg_pixels8_xy2_mmxext;
-        }
-    }
-
 #endif /* HAVE_YASM */

 #if HAVE_MMXEXT_EXTERNAL
-    if (CONFIG_VP3_DECODER && (avctx->codec_id == AV_CODEC_ID_VP3 ||
-                               avctx->codec_id == AV_CODEC_ID_THEORA)) {
-        c->put_no_rnd_pixels_tab[1][1] = ff_put_no_rnd_pixels8_x2_exact_mmxext;
-        c->put_no_rnd_pixels_tab[1][2] = ff_put_no_rnd_pixels8_y2_exact_mmxext;
-    }
-
     /* slower than cmov version on AMD */
     if (!(mm_flags & AV_CPU_FLAG_3DNOW))
         c->add_hfyu_median_prediction = ff_add_hfyu_median_prediction_mmxext;
@@ -1636,46 +1495,6 @@ static av_cold void dsputil_init_mmxext(DSPContext *c, AVCodecContext *avctx,
 #endif /* HAVE_MMXEXT_EXTERNAL */
 }

-static av_cold void dsputil_init_3dnow(DSPContext *c, AVCodecContext *avctx,
-                                       int mm_flags)
-{
-    const int high_bit_depth = avctx->bits_per_raw_sample > 8;
-
-#if HAVE_YASM
-    if (!high_bit_depth) {
-        c->put_pixels_tab[0][1] = ff_put_pixels16_x2_3dnow;
-        c->put_pixels_tab[0][2] = ff_put_pixels16_y2_3dnow;
-
-        c->avg_pixels_tab[0][0] = ff_avg_pixels16_3dnow;
-        c->avg_pixels_tab[0][1] = ff_avg_pixels16_x2_3dnow;
-        c->avg_pixels_tab[0][2] = ff_avg_pixels16_y2_3dnow;
-
-        c->put_pixels_tab[1][1] = ff_put_pixels8_x2_3dnow;
-        c->put_pixels_tab[1][2] = ff_put_pixels8_y2_3dnow;
-
-        c->avg_pixels_tab[1][0] = ff_avg_pixels8_3dnow;
-        c->avg_pixels_tab[1][1] = ff_avg_pixels8_x2_3dnow;
-        c->avg_pixels_tab[1][2] = ff_avg_pixels8_y2_3dnow;
-
-        if (!(avctx->flags & CODEC_FLAG_BITEXACT)){
-            c->put_no_rnd_pixels_tab[0][1] = ff_put_no_rnd_pixels16_x2_3dnow;
-            c->put_no_rnd_pixels_tab[0][2] = ff_put_no_rnd_pixels16_y2_3dnow;
-            c->put_no_rnd_pixels_tab[1][1] = ff_put_no_rnd_pixels8_x2_3dnow;
-            c->put_no_rnd_pixels_tab[1][2] = ff_put_no_rnd_pixels8_y2_3dnow;
-
-            c->avg_pixels_tab[0][3] = ff_avg_pixels16_xy2_3dnow;
-            c->avg_pixels_tab[1][3] = ff_avg_pixels8_xy2_3dnow;
-        }
-    }
-
-    if (CONFIG_VP3_DECODER && (avctx->codec_id == AV_CODEC_ID_VP3 ||
-                               avctx->codec_id == AV_CODEC_ID_THEORA)) {
-        c->put_no_rnd_pixels_tab[1][1] = ff_put_no_rnd_pixels8_x2_exact_3dnow;
-        c->put_no_rnd_pixels_tab[1][2] = ff_put_no_rnd_pixels8_y2_exact_3dnow;
-    }
-#endif /* HAVE_YASM */
-}
-
 static av_cold void dsputil_init_sse(DSPContext *c, AVCodecContext *avctx,
                                      int mm_flags)
 {
@@ -1716,15 +1535,6 @@ static av_cold void dsputil_init_sse2(DSPContext *c, AVCodecContext *avctx,
 #endif /* HAVE_SSE2_INLINE */

 #if HAVE_SSE2_EXTERNAL
-    if (!(mm_flags & AV_CPU_FLAG_SSE2SLOW)) {
-        // these functions are slower than mmx on AMD, but faster on Intel
-        if (!high_bit_depth) {
-            c->put_pixels_tab[0][0]        = ff_put_pixels16_sse2;
-            c->put_no_rnd_pixels_tab[0][0] = ff_put_pixels16_sse2;
-            c->avg_pixels_tab[0][0]        = ff_avg_pixels16_sse2;
-        }
-    }
-
     c->scalarproduct_int16          = ff_scalarproduct_int16_sse2;
     c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_sse2;
     if (mm_flags & AV_CPU_FLAG_ATOM) {
@@ -1811,9 +1621,6 @@ av_cold void ff_dsputil_init_mmx(DSPContext *c, AVCodecContext *avctx)
     if (mm_flags & AV_CPU_FLAG_MMXEXT)
         dsputil_init_mmxext(c, avctx, mm_flags);

-    if (mm_flags & AV_CPU_FLAG_3DNOW)
-        dsputil_init_3dnow(c, avctx, mm_flags);
-
     if (mm_flags & AV_CPU_FLAG_SSE)
         dsputil_init_sse(c, avctx, mm_flags);
libavcodec/x86/dsputil_rnd_template.c

@@ -25,212 +25,6 @@
  */

 // put_pixels
-static void DEF(put, pixels8_x2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
-{
-    MOVQ_BFE(mm6);
-    __asm__ volatile(
-        "lea    (%3, %3), %%"REG_a"     \n\t"
-        ".p2align 3                     \n\t"
-        "1:                             \n\t"
-        "movq   (%1), %%mm0             \n\t"
-        "movq   1(%1), %%mm1            \n\t"
-        "movq   (%1, %3), %%mm2         \n\t"
-        "movq   1(%1, %3), %%mm3        \n\t"
-        PAVGBP(%%mm0, %%mm1, %%mm4,   %%mm2, %%mm3, %%mm5)
-        "movq   %%mm4, (%2)             \n\t"
-        "movq   %%mm5, (%2, %3)         \n\t"
-        "add    %%"REG_a", %1           \n\t"
-        "add    %%"REG_a", %2           \n\t"
-        "movq   (%1), %%mm0             \n\t"
-        "movq   1(%1), %%mm1            \n\t"
-        "movq   (%1, %3), %%mm2         \n\t"
-        "movq   1(%1, %3), %%mm3        \n\t"
-        PAVGBP(%%mm0, %%mm1, %%mm4,   %%mm2, %%mm3, %%mm5)
-        "movq   %%mm4, (%2)             \n\t"
-        "movq   %%mm5, (%2, %3)         \n\t"
-        "add    %%"REG_a", %1           \n\t"
-        "add    %%"REG_a", %2           \n\t"
-        "subl   $4, %0                  \n\t"
-        "jnz    1b                      \n\t"
-        :"+g"(h), "+S"(pixels), "+D"(block)
-        :"r"((x86_reg)line_size)
-        :REG_a, "memory");
-}
-
-static void av_unused DEF(put, pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
-{
-    MOVQ_BFE(mm6);
-    __asm__ volatile(
-        "testl $1, %0                   \n\t"
-        " jz 1f                         \n\t"
-        "movq   (%1), %%mm0             \n\t"
-        "movq   (%2), %%mm1             \n\t"
-        "add    %4, %1                  \n\t"
-        "add    $8, %2                  \n\t"
-        PAVGB(%%mm0, %%mm1, %%mm4, %%mm6)
-        "movq   %%mm4, (%3)             \n\t"
-        "add    %5, %3                  \n\t"
-        "decl   %0                      \n\t"
-        ".p2align 3                     \n\t"
-        "1:                             \n\t"
-        "movq   (%1), %%mm0             \n\t"
-        "movq   (%2), %%mm1             \n\t"
-        "add    %4, %1                  \n\t"
-        "movq   (%1), %%mm2             \n\t"
-        "movq   8(%2), %%mm3            \n\t"
-        "add    %4, %1                  \n\t"
-        PAVGBP(%%mm0, %%mm1, %%mm4,   %%mm2, %%mm3, %%mm5)
-        "movq   %%mm4, (%3)             \n\t"
-        "add    %5, %3                  \n\t"
-        "movq   %%mm5, (%3)             \n\t"
-        "add    %5, %3                  \n\t"
-        "movq   (%1), %%mm0             \n\t"
-        "movq   16(%2), %%mm1           \n\t"
-        "add    %4, %1                  \n\t"
-        "movq   (%1), %%mm2             \n\t"
-        "movq   24(%2), %%mm3           \n\t"
-        "add    %4, %1                  \n\t"
-        "add    $32, %2                 \n\t"
-        PAVGBP(%%mm0, %%mm1, %%mm4,   %%mm2, %%mm3, %%mm5)
-        "movq   %%mm4, (%3)             \n\t"
-        "add    %5, %3                  \n\t"
-        "movq   %%mm5, (%3)             \n\t"
-        "add    %5, %3                  \n\t"
-        "subl   $4, %0                  \n\t"
-        "jnz    1b                      \n\t"
-#if !HAVE_EBX_AVAILABLE //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used
-        :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst)
-#else
-        :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
-#endif
-        :"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride)
-        :"memory");
-}
-
-static void DEF(put, pixels16_x2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
-{
-    MOVQ_BFE(mm6);
-    __asm__ volatile(
-        "lea    (%3, %3), %%"REG_a"     \n\t"
-        ".p2align 3                     \n\t"
-        "1:                             \n\t"
-        "movq   (%1), %%mm0             \n\t"
-        "movq   1(%1), %%mm1            \n\t"
-        "movq   (%1, %3), %%mm2         \n\t"
-        "movq   1(%1, %3), %%mm3        \n\t"
-        PAVGBP(%%mm0, %%mm1, %%mm4,   %%mm2, %%mm3, %%mm5)
-        "movq   %%mm4, (%2)             \n\t"
-        "movq   %%mm5, (%2, %3)         \n\t"
-        "movq   8(%1), %%mm0            \n\t"
-        "movq   9(%1), %%mm1            \n\t"
-        "movq   8(%1, %3), %%mm2        \n\t"
-        "movq   9(%1, %3), %%mm3        \n\t"
-        PAVGBP(%%mm0, %%mm1, %%mm4,   %%mm2, %%mm3, %%mm5)
-        "movq   %%mm4, 8(%2)            \n\t"
-        "movq   %%mm5, 8(%2, %3)        \n\t"
-        "add    %%"REG_a", %1           \n\t"
-        "add    %%"REG_a", %2           \n\t"
-        "movq   (%1), %%mm0             \n\t"
-        "movq   1(%1), %%mm1            \n\t"
-        "movq   (%1, %3), %%mm2         \n\t"
-        "movq   1(%1, %3), %%mm3        \n\t"
-        PAVGBP(%%mm0, %%mm1, %%mm4,   %%mm2, %%mm3, %%mm5)
-        "movq   %%mm4, (%2)             \n\t"
-        "movq   %%mm5, (%2, %3)         \n\t"
-        "movq   8(%1), %%mm0            \n\t"
-        "movq   9(%1), %%mm1            \n\t"
-        "movq   8(%1, %3), %%mm2        \n\t"
-        "movq   9(%1, %3), %%mm3        \n\t"
-        PAVGBP(%%mm0, %%mm1, %%mm4,   %%mm2, %%mm3, %%mm5)
-        "movq   %%mm4, 8(%2)            \n\t"
-        "movq   %%mm5, 8(%2, %3)        \n\t"
-        "add    %%"REG_a", %1           \n\t"
-        "add    %%"REG_a", %2           \n\t"
-        "subl   $4, %0                  \n\t"
-        "jnz    1b                      \n\t"
-        :"+g"(h), "+S"(pixels), "+D"(block)
-        :"r"((x86_reg)line_size)
-        :REG_a, "memory");
-}
-
-static void av_unused DEF(put, pixels16_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
-{
-    MOVQ_BFE(mm6);
-    __asm__ volatile(
-        "testl $1, %0                   \n\t"
-        " jz 1f                         \n\t"
-        "movq   (%1), %%mm0             \n\t"
-        "movq   (%2), %%mm1             \n\t"
-        "movq   8(%1), %%mm2            \n\t"
-        "movq   8(%2), %%mm3            \n\t"
-        "add    %4, %1                  \n\t"
-        "add    $16, %2                 \n\t"
-        PAVGBP(%%mm0, %%mm1, %%mm4,   %%mm2, %%mm3, %%mm5)
-        "movq   %%mm4, (%3)             \n\t"
-        "movq   %%mm5, 8(%3)            \n\t"
-        "add    %5, %3                  \n\t"
-        "decl   %0                      \n\t"
-        ".p2align 3                     \n\t"
-        "1:                             \n\t"
-        "movq   (%1), %%mm0             \n\t"
-        "movq   (%2), %%mm1             \n\t"
-        "movq   8(%1), %%mm2            \n\t"
-        "movq   8(%2), %%mm3            \n\t"
-        "add    %4, %1                  \n\t"
-        PAVGBP(%%mm0, %%mm1, %%mm4,   %%mm2, %%mm3, %%mm5)
-        "movq   %%mm4, (%3)             \n\t"
-        "movq   %%mm5, 8(%3)            \n\t"
-        "add    %5, %3                  \n\t"
-        "movq   (%1), %%mm0             \n\t"
-        "movq   16(%2), %%mm1           \n\t"
-        "movq   8(%1), %%mm2            \n\t"
-        "movq   24(%2), %%mm3           \n\t"
-        "add    %4, %1                  \n\t"
-        PAVGBP(%%mm0, %%mm1, %%mm4,   %%mm2, %%mm3, %%mm5)
-        "movq   %%mm4, (%3)             \n\t"
-        "movq   %%mm5, 8(%3)            \n\t"
-        "add    %5, %3                  \n\t"
-        "add    $32, %2                 \n\t"
-        "subl   $2, %0                  \n\t"
-        "jnz    1b                      \n\t"
-#if !HAVE_EBX_AVAILABLE //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used
-        :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst)
-#else
-        :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
-#endif
-        :"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride)
-        :"memory");
-}
-
-static void DEF(put, pixels8_y2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
-{
-    MOVQ_BFE(mm6);
-    __asm__ volatile(
-        "lea    (%3, %3), %%"REG_a"     \n\t"
-        "movq   (%1), %%mm0             \n\t"
-        ".p2align 3                     \n\t"
-        "1:                             \n\t"
-        "movq   (%1, %3), %%mm1         \n\t"
-        "movq   (%1, %%"REG_a"),%%mm2   \n\t"
-        PAVGBP(%%mm1, %%mm0, %%mm4,   %%mm2, %%mm1, %%mm5)
-        "movq   %%mm4, (%2)             \n\t"
-        "movq   %%mm5, (%2, %3)         \n\t"
-        "add    %%"REG_a", %1           \n\t"
-        "add    %%"REG_a", %2           \n\t"
-        "movq   (%1, %3), %%mm1         \n\t"
-        "movq   (%1, %%"REG_a"),%%mm0   \n\t"
-        PAVGBP(%%mm1, %%mm2, %%mm4,   %%mm0, %%mm1, %%mm5)
-        "movq   %%mm4, (%2)             \n\t"
-        "movq   %%mm5, (%2, %3)         \n\t"
-        "add    %%"REG_a", %1           \n\t"
-        "add    %%"REG_a", %2           \n\t"
-        "subl   $4, %0                  \n\t"
-        "jnz    1b                      \n\t"
-        :"+g"(h), "+S"(pixels), "+D"(block)
-        :"r"((x86_reg)line_size)
-        :REG_a, "memory");
-}
-
 static void DEF(put, pixels8_xy2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
 {
     MOVQ_ZERO(mm7);
@@ -297,27 +91,6 @@ static void DEF(put, pixels8_xy2)(uint8_t *block, const uint8_t *pixels, ptrdiff
         :REG_a, "memory");
 }

-// avg_pixels
-static void av_unused DEF(avg, pixels4)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
-{
-    MOVQ_BFE(mm6);
-    JUMPALIGN();
-    do {
-        __asm__ volatile(
-            "movd  %0, %%mm0        \n\t"
-            "movd  %1, %%mm1        \n\t"
-            OP_AVG(%%mm0, %%mm1, %%mm2, %%mm6)
-            "movd  %%mm2, %0        \n\t"
-            :"+m"(*block)
-            :"m"(*pixels)
-            :"memory");
-        pixels += line_size;
-        block += line_size;
-    }
-    while (--h);
-}
-
-#ifndef NO_RND
 // in case more speed is needed - unroling would certainly help
 static void DEF(avg, pixels8)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
 {
@@ -337,7 +110,6 @@ static void DEF(avg, pixels8)(uint8_t *block, const uint8_t *pixels, ptrdiff_t l
     }
     while (--h);
 }
-#endif // NO_RND

 static void DEF(avg, pixels16)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
 {
@@ -362,141 +134,6 @@ static void DEF(avg, pixels16)(uint8_t *block, const uint8_t *pixels, ptrdiff_t
     }
     while (--h);
 }

-#ifndef NO_RND
-static void DEF(avg, pixels8_x2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
-{
-    MOVQ_BFE(mm6);
-    JUMPALIGN();
-    do {
-        __asm__ volatile(
-            "movq  %1, %%mm0            \n\t"
-            "movq  1%1, %%mm1           \n\t"
-            "movq  %0, %%mm3            \n\t"
-            PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
-            OP_AVG(%%mm3, %%mm2, %%mm0, %%mm6)
-            "movq  %%mm0, %0            \n\t"
-            :"+m"(*block)
-            :"m"(*pixels)
-            :"memory");
-        pixels += line_size;
-        block += line_size;
-    }
-    while (--h);
-}
-#endif // NO_RND
-
-static av_unused void DEF(avg, pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
-{
-    MOVQ_BFE(mm6);
-    JUMPALIGN();
-    do {
-        __asm__ volatile(
-            "movq  %1, %%mm0            \n\t"
-            "movq  %2, %%mm1            \n\t"
-            "movq  %0, %%mm3            \n\t"
-            PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
-            OP_AVG(%%mm3, %%mm2, %%mm0, %%mm6)
-            "movq  %%mm0, %0            \n\t"
-            :"+m"(*dst)
-            :"m"(*src1), "m"(*src2)
-            :"memory");
-        dst += dstStride;
-        src1 += src1Stride;
-        src2 += 8;
-    }
-    while (--h);
-}
-
-static void DEF(avg, pixels16_x2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
-{
-    MOVQ_BFE(mm6);
-    JUMPALIGN();
-    do {
-        __asm__ volatile(
-            "movq  %1, %%mm0            \n\t"
-            "movq  1%1, %%mm1           \n\t"
-            "movq  %0, %%mm3            \n\t"
-            PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
-            OP_AVG(%%mm3, %%mm2, %%mm0, %%mm6)
-            "movq  %%mm0, %0            \n\t"
-            "movq  8%1, %%mm0           \n\t"
-            "movq  9%1, %%mm1           \n\t"
-            "movq  8%0, %%mm3           \n\t"
-            PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
-            OP_AVG(%%mm3, %%mm2, %%mm0, %%mm6)
-            "movq  %%mm0, 8%0           \n\t"
-            :"+m"(*block)
-            :"m"(*pixels)
-            :"memory");
-        pixels += line_size;
-        block += line_size;
-    }
-    while (--h);
-}
-
-static av_unused void DEF(avg, pixels16_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
-{
-    MOVQ_BFE(mm6);
-    JUMPALIGN();
-    do {
-        __asm__ volatile(
-            "movq  %1, %%mm0            \n\t"
-            "movq  %2, %%mm1            \n\t"
-            "movq  %0, %%mm3            \n\t"
-            PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
-            OP_AVG(%%mm3, %%mm2, %%mm0, %%mm6)
-            "movq  %%mm0, %0            \n\t"
-            "movq  8%1, %%mm0           \n\t"
-            "movq  8%2, %%mm1           \n\t"
-            "movq  8%0, %%mm3           \n\t"
-            PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
-            OP_AVG(%%mm3, %%mm2, %%mm0, %%mm6)
-            "movq  %%mm0, 8%0           \n\t"
-            :"+m"(*dst)
-            :"m"(*src1), "m"(*src2)
-            :"memory");
-        dst += dstStride;
-        src1 += src1Stride;
-        src2 += 16;
-    }
-    while (--h);
-}
-
-static void DEF(avg, pixels8_y2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
-{
-    MOVQ_BFE(mm6);
-    __asm__ volatile(
-        "lea    (%3, %3), %%"REG_a"     \n\t"
-        "movq   (%1), %%mm0             \n\t"
-        ".p2align 3                     \n\t"
-        "1:                             \n\t"
-        "movq   (%1, %3), %%mm1         \n\t"
-        "movq   (%1, %%"REG_a"), %%mm2  \n\t"
-        PAVGBP(%%mm1, %%mm0, %%mm4,   %%mm2, %%mm1, %%mm5)
-        "movq   (%2), %%mm3             \n\t"
-        OP_AVG(%%mm3, %%mm4, %%mm0, %%mm6)
-        "movq   (%2, %3), %%mm3         \n\t"
-        OP_AVG(%%mm3, %%mm5, %%mm1, %%mm6)
-        "movq   %%mm0, (%2)             \n\t"
-        "movq   %%mm1, (%2, %3)         \n\t"
-        "add    %%"REG_a", %1           \n\t"
-        "add    %%"REG_a", %2           \n\t"
-        "movq   (%1, %3), %%mm1         \n\t"
-        "movq   (%1, %%"REG_a"), %%mm0  \n\t"
-        PAVGBP(%%mm1, %%mm2, %%mm4,   %%mm0, %%mm1, %%mm5)
-        "movq   (%2), %%mm3             \n\t"
-        OP_AVG(%%mm3, %%mm4, %%mm2, %%mm6)
-        "movq   (%2, %3), %%mm3         \n\t"
-        OP_AVG(%%mm3, %%mm5, %%mm1, %%mm6)
-        "movq   %%mm2, (%2)             \n\t"
-        "movq   %%mm1, (%2, %3)         \n\t"
-        "add    %%"REG_a", %1           \n\t"
-        "add    %%"REG_a", %2           \n\t"
-        "subl   $4, %0                  \n\t"
-        "jnz    1b                      \n\t"
-        :"+g"(h), "+S"(pixels), "+D"(block)
-        :"r"((x86_reg)line_size)
-        :REG_a, "memory");
-}
-
 // this routine is 'slightly' suboptimal but mostly unused
 static void DEF(avg, pixels8_xy2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
 {
@@ -573,21 +210,11 @@ static void DEF(avg, pixels8_xy2)(uint8_t *block, const uint8_t *pixels, ptrdiff
 }

 //FIXME optimize
-static void DEF(put, pixels16_y2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h){
-    DEF(put, pixels8_y2)(block, pixels, line_size, h);
-    DEF(put, pixels8_y2)(block + 8, pixels + 8, line_size, h);
-}
-
 static void DEF(put, pixels16_xy2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h){
     DEF(put, pixels8_xy2)(block, pixels, line_size, h);
     DEF(put, pixels8_xy2)(block + 8, pixels + 8, line_size, h);
 }

-static void DEF(avg, pixels16_y2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h){
-    DEF(avg, pixels8_y2)(block, pixels, line_size, h);
-    DEF(avg, pixels8_y2)(block + 8, pixels + 8, line_size, h);
-}
-
 static void DEF(avg, pixels16_xy2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h){
     DEF(avg, pixels8_xy2)(block, pixels, line_size, h);
     DEF(avg, pixels8_xy2)(block + 8, pixels + 8, line_size, h);
libavcodec/x86/fpelbase.asm  (new file, mode 100644)
;******************************************************************************
;* MMX optimized DSP utils
;* Copyright (c) 2008 Loren Merritt
;* Copyright (c) 2003-2013 Michael Niedermayer
;* Copyright (c) 2013 Daniel Kang
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
%include "libavutil/x86/x86util.asm"

SECTION .text

INIT_MMX mmxext
; void pixels(uint8_t *block, const uint8_t *pixels, int line_size, int h)
%macro PIXELS48 2
%if %2 == 4
%define OP movh
%else
%define OP mova
%endif
cglobal %1_pixels%2, 4,5
    movsxdifnidn r2, r2d
    lea          r4, [r2*3]
.loop:
    OP           m0, [r1]
    OP           m1, [r1+r2]
    OP           m2, [r1+r2*2]
    OP           m3, [r1+r4]
    lea          r1, [r1+r2*4]
%ifidn %1, avg
    pavgb        m0, [r0]
    pavgb        m1, [r0+r2]
    pavgb        m2, [r0+r2*2]
    pavgb        m3, [r0+r4]
%endif
    OP    [r0],      m0
    OP    [r0+r2],   m1
    OP    [r0+r2*2], m2
    OP    [r0+r4],   m3
    sub         r3d, 4
    lea          r0, [r0+r2*4]
    jne .loop
    RET
%endmacro

PIXELS48 put, 4
PIXELS48 avg, 4
PIXELS48 put, 8
PIXELS48 avg, 8


INIT_XMM sse2
; void put_pixels16_sse2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
cglobal put_pixels16, 4,5,4
    lea          r4, [r2*3]
.loop:
    movu         m0, [r1]
    movu         m1, [r1+r2]
    movu         m2, [r1+r2*2]
    movu         m3, [r1+r4]
    lea          r1, [r1+r2*4]
    mova  [r0],      m0
    mova  [r0+r2],   m1
    mova  [r0+r2*2], m2
    mova  [r0+r4],   m3
    sub         r3d, 4
    lea          r0, [r0+r2*4]
    jnz .loop
    REP_RET

; void avg_pixels16_sse2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
cglobal avg_pixels16, 4,5,4
    lea          r4, [r2*3]
.loop:
    movu         m0, [r1]
    movu         m1, [r1+r2]
    movu         m2, [r1+r2*2]
    movu         m3, [r1+r4]
    lea          r1, [r1+r2*4]
    pavgb        m0, [r0]
    pavgb        m1, [r0+r2]
    pavgb        m2, [r0+r2*2]
    pavgb        m3, [r0+r4]
    mova  [r0],      m0
    mova  [r0+r2],   m1
    mova  [r0+r2*2], m2
    mova  [r0+r4],   m3
    sub         r3d, 4
    lea          r0, [r0+r2*4]
    jnz .loop
    REP_RET
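A scalar C model of what PIXELS48 generates may help when reading the macro — a sketch under the assumption that "put" is a plain row copy and "avg" follows pavgb's round-to-nearest semantics, shown for the 8-pixel case (the _ref names are hypothetical, not FFmpeg symbols):

#include <stdint.h>
#include <stddef.h>
#include <string.h>

/* C model of "PIXELS48 put, 8": straight copy, h rows */
static void put_pixels8_ref(uint8_t *block, const uint8_t *pixels,
                            ptrdiff_t line_size, int h)
{
    for (int y = 0; y < h; y++) {
        memcpy(block, pixels, 8);
        block  += line_size;
        pixels += line_size;
    }
}

/* C model of "PIXELS48 avg, 8": pavgb computes (a + b + 1) >> 1 per byte */
static void avg_pixels8_ref(uint8_t *block, const uint8_t *pixels,
                            ptrdiff_t line_size, int h)
{
    for (int y = 0; y < h; y++) {
        for (int x = 0; x < 8; x++)
            block[x] = (block[x] + pixels[x] + 1) >> 1;
        block  += line_size;
        pixels += line_size;
    }
}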
libavcodec/x86/dsputil_avg_template.c → libavcodec/x86/hpeldsp_avg_template.c  (file moved)
libavcodec/x86/hpeldsp_init.c  (new file, mode 100644)
/*
* MMX optimized DSP utils
* Copyright (c) 2000, 2001 Fabrice Bellard
* Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
* MMX optimization by Nick Kurshev <nickols_k@mail.ru>
*/
#include "libavutil/cpu.h"
#include "libavutil/x86/asm.h"
#include "libavcodec/hpeldsp.h"
#include "dsputil_mmx.h"
//#undef NDEBUG
//#include <assert.h>
#if HAVE_YASM
void ff_put_pixels8_x2_mmxext(uint8_t *block, const uint8_t *pixels,
                              ptrdiff_t line_size, int h);
void ff_put_pixels8_x2_3dnow(uint8_t *block, const uint8_t *pixels,
                             ptrdiff_t line_size, int h);
void ff_put_pixels16_x2_mmxext(uint8_t *block, const uint8_t *pixels,
                               ptrdiff_t line_size, int h);
void ff_put_pixels16_x2_3dnow(uint8_t *block, const uint8_t *pixels,
                              ptrdiff_t line_size, int h);
void ff_put_no_rnd_pixels8_x2_mmxext(uint8_t *block, const uint8_t *pixels,
                                     ptrdiff_t line_size, int h);
void ff_put_no_rnd_pixels8_x2_3dnow(uint8_t *block, const uint8_t *pixels,
                                    ptrdiff_t line_size, int h);
void ff_put_no_rnd_pixels8_x2_exact_mmxext(uint8_t *block, const uint8_t *pixels,
                                           ptrdiff_t line_size, int h);
void ff_put_no_rnd_pixels8_x2_exact_3dnow(uint8_t *block, const uint8_t *pixels,
                                          ptrdiff_t line_size, int h);
void ff_put_pixels8_y2_mmxext(uint8_t *block, const uint8_t *pixels,
                              ptrdiff_t line_size, int h);
void ff_put_pixels8_y2_3dnow(uint8_t *block, const uint8_t *pixels,
                             ptrdiff_t line_size, int h);
void ff_put_no_rnd_pixels8_y2_mmxext(uint8_t *block, const uint8_t *pixels,
                                     ptrdiff_t line_size, int h);
void ff_put_no_rnd_pixels8_y2_3dnow(uint8_t *block, const uint8_t *pixels,
                                    ptrdiff_t line_size, int h);
void ff_put_no_rnd_pixels8_y2_exact_mmxext(uint8_t *block, const uint8_t *pixels,
                                           ptrdiff_t line_size, int h);
void ff_put_no_rnd_pixels8_y2_exact_3dnow(uint8_t *block, const uint8_t *pixels,
                                          ptrdiff_t line_size, int h);
void ff_avg_pixels8_3dnow(uint8_t *block, const uint8_t *pixels,
                          ptrdiff_t line_size, int h);
void ff_avg_pixels8_x2_mmxext(uint8_t *block, const uint8_t *pixels,
                              ptrdiff_t line_size, int h);
void ff_avg_pixels8_x2_3dnow(uint8_t *block, const uint8_t *pixels,
                             ptrdiff_t line_size, int h);
void ff_avg_pixels8_y2_mmxext(uint8_t *block, const uint8_t *pixels,
                              ptrdiff_t line_size, int h);
void ff_avg_pixels8_y2_3dnow(uint8_t *block, const uint8_t *pixels,
                             ptrdiff_t line_size, int h);
void ff_avg_pixels8_xy2_mmxext(uint8_t *block, const uint8_t *pixels,
                               ptrdiff_t line_size, int h);
void ff_avg_pixels8_xy2_3dnow(uint8_t *block, const uint8_t *pixels,
                              ptrdiff_t line_size, int h);
#endif /* HAVE_YASM */
#if HAVE_INLINE_ASM
#define JUMPALIGN() __asm__ volatile (".p2align 3"::)
#define MOVQ_ZERO(regd) __asm__ volatile ("pxor %%"#regd", %%"#regd ::)
#define MOVQ_BFE(regd) \
__asm__ volatile ( \
"pcmpeqd %%"#regd", %%"#regd" \n\t" \
"paddb %%"#regd", %%"#regd" \n\t" ::)
#ifndef PIC
#define MOVQ_BONE(regd) __asm__ volatile ("movq %0, %%"#regd" \n\t" :: "m"(ff_bone))
#define MOVQ_WTWO(regd) __asm__ volatile ("movq %0, %%"#regd" \n\t" :: "m"(ff_wtwo))
#else
// for shared library it's better to use this way for accessing constants
// pcmpeqd -> -1
#define MOVQ_BONE(regd) \
__asm__ volatile ( \
"pcmpeqd %%"#regd", %%"#regd" \n\t" \
"psrlw $15, %%"#regd" \n\t" \
"packuswb %%"#regd", %%"#regd" \n\t" ::)
#define MOVQ_WTWO(regd) \
__asm__ volatile ( \
"pcmpeqd %%"#regd", %%"#regd" \n\t" \
"psrlw $15, %%"#regd" \n\t" \
"psllw $1, %%"#regd" \n\t"::)
#endif
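The PIC-safe variants synthesize their constants in registers instead of loading them from memory. Per 16-bit word, the MOVQ_WTWO sequence computes the following (a scalar sketch of the three instructions, not FFmpeg code):

/* pcmpeqd: compare-equal with itself -> all bits set            */
uint16_t w = 0xFFFF;
/* psrlw $15: logical right shift per word -> 0x0001             */
w >>= 15;
/* psllw $1: left shift per word -> 0x0002                       */
w <<= 1;
/* the mm register now holds 0x0002000200020002, the "+2"
   rounding bias that SET_RND feeds to the xy2 kernels below     */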
// using regr as temporary and for the output result
// first argument is unmodifed and second is trashed
// regfe is supposed to contain 0xfefefefefefefefe
#define PAVGB_MMX_NO_RND(rega, regb, regr, regfe) \
"movq "#rega", "#regr" \n\t" \
"pand "#regb", "#regr" \n\t" \
"pxor "#rega", "#regb" \n\t" \
"pand "#regfe", "#regb" \n\t" \
"psrlq $1, "#regb" \n\t" \
"paddb "#regb", "#regr" \n\t"
#define PAVGB_MMX(rega, regb, regr, regfe) \
"movq "#rega", "#regr" \n\t" \
"por "#regb", "#regr" \n\t" \
"pxor "#rega", "#regb" \n\t" \
"pand "#regfe", "#regb" \n\t" \
"psrlq $1, "#regb" \n\t" \
"psubb "#regb", "#regr" \n\t"
// mm6 is supposed to contain 0xfefefefefefefefe
#define PAVGBP_MMX_NO_RND(rega, regb, regr, regc, regd, regp) \
"movq "#rega", "#regr" \n\t" \
"movq "#regc", "#regp" \n\t" \
"pand "#regb", "#regr" \n\t" \
"pand "#regd", "#regp" \n\t" \
"pxor "#rega", "#regb" \n\t" \
"pxor "#regc", "#regd" \n\t" \
"pand %%mm6, "#regb" \n\t" \
"pand %%mm6, "#regd" \n\t" \
"psrlq $1, "#regb" \n\t" \
"psrlq $1, "#regd" \n\t" \
"paddb "#regb", "#regr" \n\t" \
"paddb "#regd", "#regp" \n\t"
#define PAVGBP_MMX(rega, regb, regr, regc, regd, regp) \
"movq "#rega", "#regr" \n\t" \
"movq "#regc", "#regp" \n\t" \
"por "#regb", "#regr" \n\t" \
"por "#regd", "#regp" \n\t" \
"pxor "#rega", "#regb" \n\t" \
"pxor "#regc", "#regd" \n\t" \
"pand %%mm6, "#regb" \n\t" \
"pand %%mm6, "#regd" \n\t" \
"psrlq $1, "#regd" \n\t" \
"psrlq $1, "#regb" \n\t" \
"psubb "#regb", "#regr" \n\t" \
"psubb "#regd", "#regp" \n\t"
/***********************************/
/* MMX no rounding */
#define NO_RND 1
#define DEF(x, y) x ## _no_rnd_ ## y ## _mmx
#define SET_RND MOVQ_WONE
#define PAVGBP(a, b, c, d, e, f) PAVGBP_MMX_NO_RND(a, b, c, d, e, f)
#define PAVGB(a, b, c, e) PAVGB_MMX_NO_RND(a, b, c, e)
#define OP_AVG(a, b, c, e) PAVGB_MMX(a, b, c, e)
#include "hpeldsp_rnd_template.c"
#undef DEF
#undef SET_RND
#undef PAVGBP
#undef PAVGB
#undef NO_RND
/***********************************/
/* MMX rounding */
#define DEF(x, y) x ## _ ## y ## _mmx
#define SET_RND MOVQ_WTWO
#define PAVGBP(a, b, c, d, e, f) PAVGBP_MMX(a, b, c, d, e, f)
#define PAVGB(a, b, c, e) PAVGB_MMX(a, b, c, e)
#include "hpeldsp_rnd_template.c"
#undef DEF
#undef SET_RND
#undef PAVGBP
#undef PAVGB
#undef OP_AVG
#endif
/* HAVE_INLINE_ASM */
#if HAVE_YASM
#define ff_put_pixels8_mmx ff_put_pixels8_mmxext
/***********************************/
/* 3Dnow specific */
#define DEF(x) x ## _3dnow
#include "hpeldsp_avg_template.c"
#undef DEF
/***********************************/
/* MMXEXT specific */
#define DEF(x) x ## _mmxext
#include "hpeldsp_avg_template.c"
#undef DEF
#endif
/* HAVE_YASM */
#if HAVE_INLINE_ASM
#define put_no_rnd_pixels16_mmx put_pixels16_mmx
#define put_no_rnd_pixels8_mmx put_pixels8_mmx
#define put_pixels16_mmxext put_pixels16_mmx
#define put_pixels8_mmxext put_pixels8_mmx
#define put_pixels4_mmxext put_pixels4_mmx
#define put_no_rnd_pixels16_mmxext put_no_rnd_pixels16_mmx
#define put_no_rnd_pixels8_mmxext put_no_rnd_pixels8_mmx
static void put_pixels8_mmx(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
{
    __asm__ volatile(
        "lea    (%3, %3), %%"REG_a"     \n\t"
        ".p2align 3                     \n\t"
        "1:                             \n\t"
        "movq   (%1    ), %%mm0         \n\t"
        "movq   (%1, %3), %%mm1         \n\t"
        "movq   %%mm0, (%2)             \n\t"
        "movq   %%mm1, (%2, %3)         \n\t"
        "add    %%"REG_a", %1           \n\t"
        "add    %%"REG_a", %2           \n\t"
        "movq   (%1    ), %%mm0         \n\t"
        "movq   (%1, %3), %%mm1         \n\t"
        "movq   %%mm0, (%2)             \n\t"
        "movq   %%mm1, (%2, %3)         \n\t"
        "add    %%"REG_a", %1           \n\t"
        "add    %%"REG_a", %2           \n\t"
        "subl   $4, %0                  \n\t"
        "jnz    1b                      \n\t"
        : "+g"(h), "+r"(pixels), "+r"(block)
        : "r"((x86_reg)line_size)
        : "%"REG_a, "memory"
        );
}

static void put_pixels16_mmx(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
{
    __asm__ volatile(
        "lea    (%3, %3), %%"REG_a"     \n\t"
        ".p2align 3                     \n\t"
        "1:                             \n\t"
        "movq   (%1    ), %%mm0         \n\t"
        "movq  8(%1    ), %%mm4         \n\t"
        "movq   (%1, %3), %%mm1         \n\t"
        "movq  8(%1, %3), %%mm5         \n\t"
        "movq   %%mm0,  (%2)            \n\t"
        "movq   %%mm4, 8(%2)            \n\t"
        "movq   %%mm1,  (%2, %3)        \n\t"
        "movq   %%mm5, 8(%2, %3)        \n\t"
        "add    %%"REG_a", %1           \n\t"
        "add    %%"REG_a", %2           \n\t"
        "movq   (%1    ), %%mm0         \n\t"
        "movq  8(%1    ), %%mm4         \n\t"
        "movq   (%1, %3), %%mm1         \n\t"
        "movq  8(%1, %3), %%mm5         \n\t"
        "movq   %%mm0,  (%2)            \n\t"
        "movq   %%mm4, 8(%2)            \n\t"
        "movq   %%mm1,  (%2, %3)        \n\t"
        "movq   %%mm5, 8(%2, %3)        \n\t"
        "add    %%"REG_a", %1           \n\t"
        "add    %%"REG_a", %2           \n\t"
        "subl   $4, %0                  \n\t"
        "jnz    1b                      \n\t"
        : "+g"(h), "+r"(pixels), "+r"(block)
        : "r"((x86_reg)line_size)
        : "%"REG_a, "memory"
        );
}

#endif /* HAVE_INLINE_ASM */

void ff_put_pixels16_sse2(uint8_t *block, const uint8_t *pixels,
                          ptrdiff_t line_size, int h);
void ff_avg_pixels16_sse2(uint8_t *block, const uint8_t *pixels,
                          ptrdiff_t line_size, int h);
#define SET_HPEL_FUNCS(PFX, IDX, SIZE, CPU) \
do { \
c->PFX ## _pixels_tab IDX [0] = PFX ## _pixels ## SIZE ## _ ## CPU; \
c->PFX ## _pixels_tab IDX [1] = PFX ## _pixels ## SIZE ## _x2_ ## CPU; \
c->PFX ## _pixels_tab IDX [2] = PFX ## _pixels ## SIZE ## _y2_ ## CPU; \
c->PFX ## _pixels_tab IDX [3] = PFX ## _pixels ## SIZE ## _xy2_ ## CPU; \
} while (0)
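For readability, one invocation of this macro, SET_HPEL_FUNCS(put, [0], 16, mmx), token-pastes to:

do {
    c->put_pixels_tab[0][0] = put_pixels16_mmx;
    c->put_pixels_tab[0][1] = put_pixels16_x2_mmx;
    c->put_pixels_tab[0][2] = put_pixels16_y2_mmx;
    c->put_pixels_tab[0][3] = put_pixels16_xy2_mmx;
} while (0)

The avg_no_rnd call passes an empty IDX argument, so its assignments index avg_no_rnd_pixels_tab directly with [0]..[3].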
static void hpeldsp_init_mmx(HpelDSPContext *c, int flags, int mm_flags)
{
#if HAVE_INLINE_ASM
    SET_HPEL_FUNCS(put,        [0], 16, mmx);
    SET_HPEL_FUNCS(put_no_rnd, [0], 16, mmx);
    SET_HPEL_FUNCS(avg,        [0], 16, mmx);
    SET_HPEL_FUNCS(avg_no_rnd,    , 16, mmx);
    SET_HPEL_FUNCS(put,        [1],  8, mmx);
    SET_HPEL_FUNCS(put_no_rnd, [1],  8, mmx);
    SET_HPEL_FUNCS(avg,        [1],  8, mmx);
#endif /* HAVE_INLINE_ASM */
}

static void hpeldsp_init_mmxext(HpelDSPContext *c, int flags, int mm_flags)
{
#if HAVE_YASM
    c->put_pixels_tab[0][1] = ff_put_pixels16_x2_mmxext;
    c->put_pixels_tab[0][2] = ff_put_pixels16_y2_mmxext;

    c->avg_pixels_tab[0][0] = ff_avg_pixels16_mmxext;
    c->avg_pixels_tab[0][1] = ff_avg_pixels16_x2_mmxext;
    c->avg_pixels_tab[0][2] = ff_avg_pixels16_y2_mmxext;

    c->put_pixels_tab[1][1] = ff_put_pixels8_x2_mmxext;
    c->put_pixels_tab[1][2] = ff_put_pixels8_y2_mmxext;

    c->avg_pixels_tab[1][0] = ff_avg_pixels8_mmxext;
    c->avg_pixels_tab[1][1] = ff_avg_pixels8_x2_mmxext;
    c->avg_pixels_tab[1][2] = ff_avg_pixels8_y2_mmxext;

    if (!(flags & CODEC_FLAG_BITEXACT)) {
        c->put_no_rnd_pixels_tab[0][1] = ff_put_no_rnd_pixels16_x2_mmxext;
        c->put_no_rnd_pixels_tab[0][2] = ff_put_no_rnd_pixels16_y2_mmxext;
        c->put_no_rnd_pixels_tab[1][1] = ff_put_no_rnd_pixels8_x2_mmxext;
        c->put_no_rnd_pixels_tab[1][2] = ff_put_no_rnd_pixels8_y2_mmxext;

        c->avg_pixels_tab[0][3] = ff_avg_pixels16_xy2_mmxext;
        c->avg_pixels_tab[1][3] = ff_avg_pixels8_xy2_mmxext;
    }
#endif /* HAVE_YASM */

#if HAVE_MMXEXT_EXTERNAL
    if (flags & CODEC_FLAG_BITEXACT && CONFIG_VP3_DECODER) {
        c->put_no_rnd_pixels_tab[1][1] = ff_put_no_rnd_pixels8_x2_exact_mmxext;
        c->put_no_rnd_pixels_tab[1][2] = ff_put_no_rnd_pixels8_y2_exact_mmxext;
    }
#endif /* HAVE_MMXEXT_EXTERNAL */
}

static void hpeldsp_init_3dnow(HpelDSPContext *c, int flags, int mm_flags)
{
#if HAVE_YASM
    c->put_pixels_tab[0][1] = ff_put_pixels16_x2_3dnow;
    c->put_pixels_tab[0][2] = ff_put_pixels16_y2_3dnow;

    c->avg_pixels_tab[0][0] = ff_avg_pixels16_3dnow;
    c->avg_pixels_tab[0][1] = ff_avg_pixels16_x2_3dnow;
    c->avg_pixels_tab[0][2] = ff_avg_pixels16_y2_3dnow;

    c->put_pixels_tab[1][1] = ff_put_pixels8_x2_3dnow;
    c->put_pixels_tab[1][2] = ff_put_pixels8_y2_3dnow;

    c->avg_pixels_tab[1][0] = ff_avg_pixels8_3dnow;
    c->avg_pixels_tab[1][1] = ff_avg_pixels8_x2_3dnow;
    c->avg_pixels_tab[1][2] = ff_avg_pixels8_y2_3dnow;

    if (!(flags & CODEC_FLAG_BITEXACT)){
        c->put_no_rnd_pixels_tab[0][1] = ff_put_no_rnd_pixels16_x2_3dnow;
        c->put_no_rnd_pixels_tab[0][2] = ff_put_no_rnd_pixels16_y2_3dnow;
        c->put_no_rnd_pixels_tab[1][1] = ff_put_no_rnd_pixels8_x2_3dnow;
        c->put_no_rnd_pixels_tab[1][2] = ff_put_no_rnd_pixels8_y2_3dnow;

        c->avg_pixels_tab[0][3] = ff_avg_pixels16_xy2_3dnow;
        c->avg_pixels_tab[1][3] = ff_avg_pixels8_xy2_3dnow;
    }

    if (flags & CODEC_FLAG_BITEXACT && CONFIG_VP3_DECODER) {
        c->put_no_rnd_pixels_tab[1][1] = ff_put_no_rnd_pixels8_x2_exact_3dnow;
        c->put_no_rnd_pixels_tab[1][2] = ff_put_no_rnd_pixels8_y2_exact_3dnow;
    }
#endif /* HAVE_YASM */
}

static void hpeldsp_init_sse2(HpelDSPContext *c, int flags, int mm_flags)
{
#if HAVE_SSE2_EXTERNAL
    if (!(mm_flags & AV_CPU_FLAG_SSE2SLOW)) {
        // these functions are slower than mmx on AMD, but faster on Intel
        c->put_pixels_tab[0][0]        = ff_put_pixels16_sse2;
        c->put_no_rnd_pixels_tab[0][0] = ff_put_pixels16_sse2;
        c->avg_pixels_tab[0][0]        = ff_avg_pixels16_sse2;
    }
#endif /* HAVE_SSE2_EXTERNAL */
}

void ff_hpeldsp_init_x86(HpelDSPContext *c, int flags)
{
    int mm_flags = av_get_cpu_flags();

    if (mm_flags & AV_CPU_FLAG_MMX)
        hpeldsp_init_mmx(c, flags, mm_flags);

    if (mm_flags & AV_CPU_FLAG_MMXEXT)
        hpeldsp_init_mmxext(c, flags, mm_flags);

    if (mm_flags & AV_CPU_FLAG_3DNOW)
        hpeldsp_init_3dnow(c, flags, mm_flags);

    if (mm_flags & AV_CPU_FLAG_SSE2)
        hpeldsp_init_sse2(c, flags, mm_flags);
}
libavcodec/x86/hpeldsp_rnd_template.c  (new file, mode 100644)
/*
* DSP utils mmx functions are compiled twice for rnd/no_rnd
* Copyright (c) 2000, 2001 Fabrice Bellard
* Copyright (c) 2003-2004 Michael Niedermayer <michaelni@gmx.at>
*
* MMX optimization by Nick Kurshev <nickols_k@mail.ru>
* mostly rewritten by Michael Niedermayer <michaelni@gmx.at>
* and improved by Zdenek Kabelac <kabi@users.sf.net>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
// put_pixels
static
void
DEF
(
put
,
pixels8_x2
)(
uint8_t
*
block
,
const
uint8_t
*
pixels
,
ptrdiff_t
line_size
,
int
h
)
{
MOVQ_BFE
(
mm6
);
__asm__
volatile
(
"lea (%3, %3), %%"
REG_a
"
\n\t
"
".p2align 3
\n\t
"
"1:
\n\t
"
"movq (%1), %%mm0
\n\t
"
"movq 1(%1), %%mm1
\n\t
"
"movq (%1, %3), %%mm2
\n\t
"
"movq 1(%1, %3), %%mm3
\n\t
"
PAVGBP
(
%%
mm0
,
%%
mm1
,
%%
mm4
,
%%
mm2
,
%%
mm3
,
%%
mm5
)
"movq %%mm4, (%2)
\n\t
"
"movq %%mm5, (%2, %3)
\n\t
"
"add %%"
REG_a
", %1
\n\t
"
"add %%"
REG_a
", %2
\n\t
"
"movq (%1), %%mm0
\n\t
"
"movq 1(%1), %%mm1
\n\t
"
"movq (%1, %3), %%mm2
\n\t
"
"movq 1(%1, %3), %%mm3
\n\t
"
PAVGBP
(
%%
mm0
,
%%
mm1
,
%%
mm4
,
%%
mm2
,
%%
mm3
,
%%
mm5
)
"movq %%mm4, (%2)
\n\t
"
"movq %%mm5, (%2, %3)
\n\t
"
"add %%"
REG_a
", %1
\n\t
"
"add %%"
REG_a
", %2
\n\t
"
"subl $4, %0
\n\t
"
"jnz 1b
\n\t
"
:
"+g"
(
h
),
"+S"
(
pixels
),
"+D"
(
block
)
:
"r"
((
x86_reg
)
line_size
)
:
REG_a
,
"memory"
);
}
static
void
DEF
(
put
,
pixels16_x2
)(
uint8_t
*
block
,
const
uint8_t
*
pixels
,
ptrdiff_t
line_size
,
int
h
)
{
MOVQ_BFE
(
mm6
);
__asm__
volatile
(
"lea (%3, %3), %%"
REG_a
"
\n\t
"
".p2align 3
\n\t
"
"1:
\n\t
"
"movq (%1), %%mm0
\n\t
"
"movq 1(%1), %%mm1
\n\t
"
"movq (%1, %3), %%mm2
\n\t
"
"movq 1(%1, %3), %%mm3
\n\t
"
PAVGBP
(
%%
mm0
,
%%
mm1
,
%%
mm4
,
%%
mm2
,
%%
mm3
,
%%
mm5
)
"movq %%mm4, (%2)
\n\t
"
"movq %%mm5, (%2, %3)
\n\t
"
"movq 8(%1), %%mm0
\n\t
"
"movq 9(%1), %%mm1
\n\t
"
"movq 8(%1, %3), %%mm2
\n\t
"
"movq 9(%1, %3), %%mm3
\n\t
"
PAVGBP
(
%%
mm0
,
%%
mm1
,
%%
mm4
,
%%
mm2
,
%%
mm3
,
%%
mm5
)
"movq %%mm4, 8(%2)
\n\t
"
"movq %%mm5, 8(%2, %3)
\n\t
"
"add %%"
REG_a
", %1
\n\t
"
"add %%"
REG_a
", %2
\n\t
"
"movq (%1), %%mm0
\n\t
"
"movq 1(%1), %%mm1
\n\t
"
"movq (%1, %3), %%mm2
\n\t
"
"movq 1(%1, %3), %%mm3
\n\t
"
PAVGBP
(
%%
mm0
,
%%
mm1
,
%%
mm4
,
%%
mm2
,
%%
mm3
,
%%
mm5
)
"movq %%mm4, (%2)
\n\t
"
"movq %%mm5, (%2, %3)
\n\t
"
"movq 8(%1), %%mm0
\n\t
"
"movq 9(%1), %%mm1
\n\t
"
"movq 8(%1, %3), %%mm2
\n\t
"
"movq 9(%1, %3), %%mm3
\n\t
"
PAVGBP
(
%%
mm0
,
%%
mm1
,
%%
mm4
,
%%
mm2
,
%%
mm3
,
%%
mm5
)
"movq %%mm4, 8(%2)
\n\t
"
"movq %%mm5, 8(%2, %3)
\n\t
"
"add %%"
REG_a
", %1
\n\t
"
"add %%"
REG_a
", %2
\n\t
"
"subl $4, %0
\n\t
"
"jnz 1b
\n\t
"
:
"+g"
(
h
),
"+S"
(
pixels
),
"+D"
(
block
)
:
"r"
((
x86_reg
)
line_size
)
:
REG_a
,
"memory"
);
}
static
void
DEF
(
put
,
pixels8_y2
)(
uint8_t
*
block
,
const
uint8_t
*
pixels
,
ptrdiff_t
line_size
,
int
h
)
{
MOVQ_BFE
(
mm6
);
__asm__
volatile
(
"lea (%3, %3), %%"
REG_a
"
\n\t
"
"movq (%1), %%mm0
\n\t
"
".p2align 3
\n\t
"
"1:
\n\t
"
"movq (%1, %3), %%mm1
\n\t
"
"movq (%1, %%"
REG_a
"),%%mm2
\n\t
"
PAVGBP
(
%%
mm1
,
%%
mm0
,
%%
mm4
,
%%
mm2
,
%%
mm1
,
%%
mm5
)
"movq %%mm4, (%2)
\n\t
"
"movq %%mm5, (%2, %3)
\n\t
"
"add %%"
REG_a
", %1
\n\t
"
"add %%"
REG_a
", %2
\n\t
"
"movq (%1, %3), %%mm1
\n\t
"
"movq (%1, %%"
REG_a
"),%%mm0
\n\t
"
PAVGBP
(
%%
mm1
,
%%
mm2
,
%%
mm4
,
%%
mm0
,
%%
mm1
,
%%
mm5
)
"movq %%mm4, (%2)
\n\t
"
"movq %%mm5, (%2, %3)
\n\t
"
"add %%"
REG_a
", %1
\n\t
"
"add %%"
REG_a
", %2
\n\t
"
"subl $4, %0
\n\t
"
"jnz 1b
\n\t
"
:
"+g"
(
h
),
"+S"
(
pixels
),
"+D"
(
block
)
:
"r"
((
x86_reg
)
line_size
)
:
REG_a
,
"memory"
);
}
static void DEF(put, pixels8_xy2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
{
    MOVQ_ZERO(mm7);
    SET_RND(mm6); // =2 for rnd and =1 for no_rnd version
    __asm__ volatile(
        "movq   (%1), %%mm0             \n\t"
        "movq   1(%1), %%mm4            \n\t"
        "movq   %%mm0, %%mm1            \n\t"
        "movq   %%mm4, %%mm5            \n\t"
        "punpcklbw %%mm7, %%mm0         \n\t"
        "punpcklbw %%mm7, %%mm4         \n\t"
        "punpckhbw %%mm7, %%mm1         \n\t"
        "punpckhbw %%mm7, %%mm5         \n\t"
        "paddusw %%mm0, %%mm4           \n\t"
        "paddusw %%mm1, %%mm5           \n\t"
        "xor    %%"REG_a", %%"REG_a"    \n\t"
        "add    %3, %1                  \n\t"
        ".p2align 3                     \n\t"
        "1:                             \n\t"
        "movq   (%1, %%"REG_a"), %%mm0  \n\t"
        "movq   1(%1, %%"REG_a"), %%mm2 \n\t"
        "movq   %%mm0, %%mm1            \n\t"
        "movq   %%mm2, %%mm3            \n\t"
        "punpcklbw %%mm7, %%mm0         \n\t"
        "punpcklbw %%mm7, %%mm2         \n\t"
        "punpckhbw %%mm7, %%mm1         \n\t"
        "punpckhbw %%mm7, %%mm3         \n\t"
        "paddusw %%mm2, %%mm0           \n\t"
        "paddusw %%mm3, %%mm1           \n\t"
        "paddusw %%mm6, %%mm4           \n\t"
        "paddusw %%mm6, %%mm5           \n\t"
        "paddusw %%mm0, %%mm4           \n\t"
        "paddusw %%mm1, %%mm5           \n\t"
        "psrlw  $2, %%mm4               \n\t"
        "psrlw  $2, %%mm5               \n\t"
        "packuswb  %%mm5, %%mm4         \n\t"
        "movq   %%mm4, (%2, %%"REG_a")  \n\t"
        "add    %3, %%"REG_a"           \n\t"

        "movq   (%1, %%"REG_a"), %%mm2  \n\t" // 0 <-> 2   1 <-> 3
        "movq   1(%1, %%"REG_a"), %%mm4 \n\t"
        "movq   %%mm2, %%mm3            \n\t"
        "movq   %%mm4, %%mm5            \n\t"
        "punpcklbw %%mm7, %%mm2         \n\t"
        "punpcklbw %%mm7, %%mm4         \n\t"
        "punpckhbw %%mm7, %%mm3         \n\t"
        "punpckhbw %%mm7, %%mm5         \n\t"
        "paddusw %%mm2, %%mm4           \n\t"
        "paddusw %%mm3, %%mm5           \n\t"
        "paddusw %%mm6, %%mm0           \n\t"
        "paddusw %%mm6, %%mm1           \n\t"
        "paddusw %%mm4, %%mm0           \n\t"
        "paddusw %%mm5, %%mm1           \n\t"
        "psrlw  $2, %%mm0               \n\t"
        "psrlw  $2, %%mm1               \n\t"
        "packuswb  %%mm1, %%mm0         \n\t"
        "movq   %%mm0, (%2, %%"REG_a")  \n\t"
        "add    %3, %%"REG_a"           \n\t"

        "subl   $2, %0                  \n\t"
        "jnz    1b                      \n\t"
        :"+g"(h), "+S"(pixels)
        :"D"(block), "r"((x86_reg)line_size)
        :REG_a, "memory");
}
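The SET_RND comment above is the key to the xy2 variants: each output byte is the 2x2 average (A + B + C + D + rounder) >> 2, computed in 16-bit lanes by the punpck*bw/paddusw sequence, with rounder = 2 in the rounding build and 1 in no_rnd. A scalar sketch (illustration only, names hypothetical):

    #include <stdint.h>
    #include <stddef.h>

    #define RND 2 /* value loaded by SET_RND; 1 in the no_rnd build */

    /* Hypothetical scalar equivalent of put_pixels8_xy2. */
    static void put_pixels8_xy2_ref(uint8_t *block, const uint8_t *pixels,
                                    ptrdiff_t line_size, int h)
    {
        for (int y = 0; y < h; y++) {
            for (int x = 0; x < 8; x++)
                block[x] = (pixels[x]             + pixels[x + 1] +
                            pixels[line_size + x] + pixels[line_size + x + 1] +
                            RND) >> 2;
            pixels += line_size;
            block  += line_size;
        }
    }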
// avg_pixels
#ifndef NO_RND
// in case more speed is needed - unrolling would certainly help
static void DEF(avg, pixels8)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
{
    MOVQ_BFE(mm6);
    JUMPALIGN();
    do {
        __asm__ volatile(
            "movq   %0, %%mm0           \n\t"
            "movq   %1, %%mm1           \n\t"
            OP_AVG(%%mm0, %%mm1, %%mm2, %%mm6)
            "movq   %%mm2, %0           \n\t"
            :"+m"(*block)
            :"m"(*pixels)
            :"memory");
        pixels += line_size;
        block  += line_size;
    } while (--h);
}
#endif // NO_RND
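The avg_* family differs from put_* only in the store: the freshly computed value is averaged once more with what is already in block (the OP_AVG step), as motion compensation with an averaged prediction requires. A scalar sketch of the simplest case, assuming OP_AVG rounds upward like pavgb:

    #include <stdint.h>
    #include <stddef.h>

    /* Hypothetical scalar equivalent of avg_pixels8 (rounding build). */
    static void avg_pixels8_ref(uint8_t *block, const uint8_t *pixels,
                                ptrdiff_t line_size, int h)
    {
        for (int y = 0; y < h; y++) {
            for (int x = 0; x < 8; x++)
                block[x] = (block[x] + pixels[x] + 1) >> 1;
            pixels += line_size;
            block  += line_size;
        }
    }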
static void DEF(avg, pixels16)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
{
    MOVQ_BFE(mm6);
    JUMPALIGN();
    do {
        __asm__ volatile(
            "movq   %0, %%mm0           \n\t"
            "movq   %1, %%mm1           \n\t"
            OP_AVG(%%mm0, %%mm1, %%mm2, %%mm6)
            "movq   %%mm2, %0           \n\t"
            "movq   8%0, %%mm0          \n\t"
            "movq   8%1, %%mm1          \n\t"
            OP_AVG(%%mm0, %%mm1, %%mm2, %%mm6)
            "movq   %%mm2, 8%0          \n\t"
            :"+m"(*block)
            :"m"(*pixels)
            :"memory");
        pixels += line_size;
        block  += line_size;
    } while (--h);
}
#ifndef NO_RND
static void DEF(avg, pixels8_x2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
{
    MOVQ_BFE(mm6);
    JUMPALIGN();
    do {
        __asm__ volatile(
            "movq   %1, %%mm0           \n\t"
            "movq   1%1, %%mm1          \n\t"
            "movq   %0, %%mm3           \n\t"
            PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
            OP_AVG(%%mm3, %%mm2, %%mm0, %%mm6)
            "movq   %%mm0, %0           \n\t"
            :"+m"(*block)
            :"m"(*pixels)
            :"memory");
        pixels += line_size;
        block  += line_size;
    } while (--h);
}
#endif // NO_RND
static void DEF(avg, pixels16_x2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
{
    MOVQ_BFE(mm6);
    JUMPALIGN();
    do {
        __asm__ volatile(
            "movq   %1, %%mm0           \n\t"
            "movq   1%1, %%mm1          \n\t"
            "movq   %0, %%mm3           \n\t"
            PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
            OP_AVG(%%mm3, %%mm2, %%mm0, %%mm6)
            "movq   %%mm0, %0           \n\t"
            "movq   8%1, %%mm0          \n\t"
            "movq   9%1, %%mm1          \n\t"
            "movq   8%0, %%mm3          \n\t"
            PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
            OP_AVG(%%mm3, %%mm2, %%mm0, %%mm6)
            "movq   %%mm0, 8%0          \n\t"
            :"+m"(*block)
            :"m"(*pixels)
            :"memory");
        pixels += line_size;
        block  += line_size;
    } while (--h);
}
static void DEF(avg, pixels8_y2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
{
    MOVQ_BFE(mm6);
    __asm__ volatile(
        "lea    (%3, %3), %%"REG_a"     \n\t"
        "movq   (%1), %%mm0             \n\t"
        ".p2align 3                     \n\t"
        "1:                             \n\t"
        "movq   (%1, %3), %%mm1         \n\t"
        "movq   (%1, %%"REG_a"), %%mm2  \n\t"
        PAVGBP(%%mm1, %%mm0, %%mm4,   %%mm2, %%mm1, %%mm5)
        "movq   (%2), %%mm3             \n\t"
        OP_AVG(%%mm3, %%mm4, %%mm0, %%mm6)
        "movq   (%2, %3), %%mm3         \n\t"
        OP_AVG(%%mm3, %%mm5, %%mm1, %%mm6)
        "movq   %%mm0, (%2)             \n\t"
        "movq   %%mm1, (%2, %3)         \n\t"
        "add    %%"REG_a", %1           \n\t"
        "add    %%"REG_a", %2           \n\t"

        "movq   (%1, %3), %%mm1         \n\t"
        "movq   (%1, %%"REG_a"), %%mm0  \n\t"
        PAVGBP(%%mm1, %%mm2, %%mm4,   %%mm0, %%mm1, %%mm5)
        "movq   (%2), %%mm3             \n\t"
        OP_AVG(%%mm3, %%mm4, %%mm2, %%mm6)
        "movq   (%2, %3), %%mm3         \n\t"
        OP_AVG(%%mm3, %%mm5, %%mm1, %%mm6)
        "movq   %%mm2, (%2)             \n\t"
        "movq   %%mm1, (%2, %3)         \n\t"
        "add    %%"REG_a", %1           \n\t"
        "add    %%"REG_a", %2           \n\t"

        "subl   $4, %0                  \n\t"
        "jnz    1b                      \n\t"
        :"+g"(h), "+S"(pixels), "+D"(block)
        :"r"((x86_reg)line_size)
        :REG_a, "memory");
}
// this routine is 'slightly' suboptimal but mostly unused
static void DEF(avg, pixels8_xy2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
{
    MOVQ_ZERO(mm7);
    SET_RND(mm6); // =2 for rnd and =1 for no_rnd version
    __asm__ volatile(
        "movq   (%1), %%mm0             \n\t"
        "movq   1(%1), %%mm4            \n\t"
        "movq   %%mm0, %%mm1            \n\t"
        "movq   %%mm4, %%mm5            \n\t"
        "punpcklbw %%mm7, %%mm0         \n\t"
        "punpcklbw %%mm7, %%mm4         \n\t"
        "punpckhbw %%mm7, %%mm1         \n\t"
        "punpckhbw %%mm7, %%mm5         \n\t"
        "paddusw %%mm0, %%mm4           \n\t"
        "paddusw %%mm1, %%mm5           \n\t"
        "xor    %%"REG_a", %%"REG_a"    \n\t"
        "add    %3, %1                  \n\t"
        ".p2align 3                     \n\t"
        "1:                             \n\t"
        "movq   (%1, %%"REG_a"), %%mm0  \n\t"
        "movq   1(%1, %%"REG_a"), %%mm2 \n\t"
        "movq   %%mm0, %%mm1            \n\t"
        "movq   %%mm2, %%mm3            \n\t"
        "punpcklbw %%mm7, %%mm0         \n\t"
        "punpcklbw %%mm7, %%mm2         \n\t"
        "punpckhbw %%mm7, %%mm1         \n\t"
        "punpckhbw %%mm7, %%mm3         \n\t"
        "paddusw %%mm2, %%mm0           \n\t"
        "paddusw %%mm3, %%mm1           \n\t"
        "paddusw %%mm6, %%mm4           \n\t"
        "paddusw %%mm6, %%mm5           \n\t"
        "paddusw %%mm0, %%mm4           \n\t"
        "paddusw %%mm1, %%mm5           \n\t"
        "psrlw  $2, %%mm4               \n\t"
        "psrlw  $2, %%mm5               \n\t"
        "movq   (%2, %%"REG_a"), %%mm3  \n\t"
        "packuswb  %%mm5, %%mm4         \n\t"
        "pcmpeqd %%mm2, %%mm2           \n\t"
        "paddb %%mm2, %%mm2             \n\t"
        OP_AVG(%%mm3, %%mm4, %%mm5, %%mm2)
        "movq   %%mm5, (%2, %%"REG_a")  \n\t"
        "add    %3, %%"REG_a"           \n\t"

        "movq   (%1, %%"REG_a"), %%mm2  \n\t" // 0 <-> 2   1 <-> 3
        "movq   1(%1, %%"REG_a"), %%mm4 \n\t"
        "movq   %%mm2, %%mm3            \n\t"
        "movq   %%mm4, %%mm5            \n\t"
        "punpcklbw %%mm7, %%mm2         \n\t"
        "punpcklbw %%mm7, %%mm4         \n\t"
        "punpckhbw %%mm7, %%mm3         \n\t"
        "punpckhbw %%mm7, %%mm5         \n\t"
        "paddusw %%mm2, %%mm4           \n\t"
        "paddusw %%mm3, %%mm5           \n\t"
        "paddusw %%mm6, %%mm0           \n\t"
        "paddusw %%mm6, %%mm1           \n\t"
        "paddusw %%mm4, %%mm0           \n\t"
        "paddusw %%mm5, %%mm1           \n\t"
        "psrlw  $2, %%mm0               \n\t"
        "psrlw  $2, %%mm1               \n\t"
        "movq   (%2, %%"REG_a"), %%mm3  \n\t"
        "packuswb  %%mm1, %%mm0         \n\t"
        "pcmpeqd %%mm2, %%mm2           \n\t"
        "paddb %%mm2, %%mm2             \n\t"
        OP_AVG(%%mm3, %%mm0, %%mm1, %%mm2)
        "movq   %%mm1, (%2, %%"REG_a")  \n\t"
        "add    %3, %%"REG_a"           \n\t"

        "subl   $2, %0                  \n\t"
        "jnz    1b                      \n\t"
        :"+g"(h), "+S"(pixels)
        :"D"(block), "r"((x86_reg)line_size)
        :REG_a, "memory");
}
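A non-obvious detail in avg_pixels8_xy2: mm6 already holds the xy2 rounder, so the byte mask that the averaging macro normally keeps in mm6 is rebuilt inline. pcmpeqd sets mm2 to all ones (0xFF in every byte) and paddb doubles each byte to 0xFE. That mask is what makes a single wide shift safe per byte lane in the carry-less rounded average; a scalar sketch of the identity follows (this assumes OP_AVG's internals, which are defined elsewhere in this template, use the standard trick):

    #include <stdint.h>

    /* (a + b + 1) >> 1 without widening: the 0xFE mask clears each byte's
     * low bit so one 64-bit shift cannot leak bits between byte lanes. */
    static inline uint8_t avg_round(uint8_t a, uint8_t b)
    {
        return (a | b) - (((a ^ b) & 0xFE) >> 1);
    }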
//FIXME optimize
static void DEF(put, pixels16_y2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h){
    DEF(put, pixels8_y2)(block  , pixels  , line_size, h);
    DEF(put, pixels8_y2)(block+8, pixels+8, line_size, h);
}

static void DEF(put, pixels16_xy2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h){
    DEF(put, pixels8_xy2)(block  , pixels  , line_size, h);
    DEF(put, pixels8_xy2)(block+8, pixels+8, line_size, h);
}

static void DEF(avg, pixels16_y2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h){
    DEF(avg, pixels8_y2)(block  , pixels  , line_size, h);
    DEF(avg, pixels8_y2)(block+8, pixels+8, line_size, h);
}

static void DEF(avg, pixels16_xy2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h){
    DEF(avg, pixels8_xy2)(block  , pixels  , line_size, h);
    DEF(avg, pixels8_xy2)(block+8, pixels+8, line_size, h);
}
libavcodec/x86/qpelbase.asm
View file @ 3ced55d5
...
@@ -174,83 +174,3 @@ cglobal %1_pixels16_l2, 6,6
INIT_MMX mmxext
PIXELS16_L2 put
PIXELS16_L2 avg
INIT_MMX mmxext
; void pixels(uint8_t *block, const uint8_t *pixels, int line_size, int h)
%macro PIXELS48 2
%if %2 == 4
%define OP movh
%else
%define OP mova
%endif
cglobal %1_pixels%2, 4,5
    movsxdifnidn r2, r2d
    lea          r4, [r2*3]
.loop:
    OP           m0, [r1]
    OP           m1, [r1+r2]
    OP           m2, [r1+r2*2]
    OP           m3, [r1+r4]
    lea          r1, [r1+r2*4]
%ifidn %1, avg
    pavgb        m0, [r0]
    pavgb        m1, [r0+r2]
    pavgb        m2, [r0+r2*2]
    pavgb        m3, [r0+r4]
%endif
    OP         [r0], m0
    OP      [r0+r2], m1
    OP    [r0+r2*2], m2
    OP      [r0+r4], m3
    sub         r3d, 4
    lea          r0, [r0+r2*4]
    jne .loop
    RET
%endmacro

PIXELS48 put, 4
PIXELS48 avg, 4
PIXELS48 put, 8
PIXELS48 avg, 8
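PIXELS48 expands to plain fullpel copy/average loops processing four rows per iteration: %2 == 4 moves four bytes per row (movh), %2 == 8 a whole mm register (mova), and the avg instantiations fold the destination in with pavgb, which rounds upward. A scalar reference of what each instantiation computes (names hypothetical, not part of the commit):

    #include <stdint.h>

    /* Hypothetical scalar equivalent of the PIXELS48 instantiations:
     * w is 4 or 8; is_avg selects put (0) or avg (1) behaviour. */
    static void pixels_ref(uint8_t *block, const uint8_t *pixels,
                           int line_size, int h, int w, int is_avg)
    {
        for (int y = 0; y < h; y++) {
            for (int x = 0; x < w; x++)
                block[x] = is_avg ? (block[x] + pixels[x] + 1) >> 1
                                  : pixels[x];
            pixels += line_size;
            block  += line_size;
        }
    }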
INIT_XMM sse2
; void put_pixels16_sse2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
cglobal put_pixels16, 4,5,4
    lea          r4, [r2*3]
.loop:
    movu         m0, [r1]
    movu         m1, [r1+r2]
    movu         m2, [r1+r2*2]
    movu         m3, [r1+r4]
    lea          r1, [r1+r2*4]
    mova       [r0], m0
    mova    [r0+r2], m1
    mova  [r0+r2*2], m2
    mova    [r0+r4], m3
    sub         r3d, 4
    lea          r0, [r0+r2*4]
    jnz .loop
    REP_RET
; void avg_pixels16_sse2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
cglobal avg_pixels16, 4,5,4
    lea          r4, [r2*3]
.loop:
    movu         m0, [r1]
    movu         m1, [r1+r2]
    movu         m2, [r1+r2*2]
    movu         m3, [r1+r4]
    lea          r1, [r1+r2*4]
    pavgb        m0, [r0]
    pavgb        m1, [r0+r2]
    pavgb        m2, [r0+r2*2]
    pavgb        m3, [r0+r4]
    mova       [r0], m0
    mova    [r0+r2], m1
    mova  [r0+r2*2], m2
    mova    [r0+r4], m3
    sub         r3d, 4
    lea          r0, [r0+r2*4]
    jnz .loop
    REP_RET