Skip to content
Projects
Groups
Snippets
Help
Loading...
Sign in / Register
Toggle navigation
F
ffmpeg
Project
Project
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Packages
Packages
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
submodule
ffmpeg
Commits
9f00b1cb
Commit
9f00b1cb
authored
Jan 16, 2013
by
Daniel Kang
Committed by
Luca Barbato
Jan 21, 2013
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
dsputilenc: x86: Convert pixel inline asm to yasm
Signed-off-by:
Luca Barbato
<
lu_zero@gentoo.org
>
parent
c7df1532
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
172 additions
and
181 deletions
+172
-181
dsputilenc.asm
libavcodec/x86/dsputilenc.asm
+152
-0
dsputilenc_mmx.c
libavcodec/x86/dsputilenc_mmx.c
+20
-181
No files found.
libavcodec/x86/dsputilenc.asm
View file @
9f00b1cb
...
@@ -333,3 +333,155 @@ cglobal sse16, 5, 5, 8
...
@@ -333,3 +333,155 @@ cglobal sse16, 5, 5, 8
paddd
m7
,
m1
paddd
m7
,
m1
movd
eax
,
m7
; return value
movd
eax
,
m7
; return value
RET
RET
INIT_MMX mmx
;-----------------------------------------------------------------------------
; void ff_get_pixels_mmx(DCTELEM *block, const uint8_t *pixels, int line_size)
; Expands a 16x8-bit pixel row pair into 16-bit DCT coefficients, two rows
; per iteration, until the -128..0 block offset counter crosses zero.
;-----------------------------------------------------------------------------
cglobal get_pixels, 3, 4
    movsxdifnidn r2, r2d            ; sign-extend line_size on 64-bit ABIs
    add          r0, 128            ; bias block pointer to its midpoint ...
    mov          r3, -128           ; ... so r3 can count -128 -> 0
    pxor         m7, m7             ; m7 = 0, used to zero-extend bytes
.loop:
    mova         m0, [r1]           ; row 0: 8 pixels
    mova         m2, [r1 + r2]      ; row 1: 8 pixels
    mova         m1, m0
    mova         m3, m2
    punpcklbw    m0, m7             ; row 0, low 4 bytes -> 4 words
    punpckhbw    m1, m7             ; row 0, high 4 bytes -> 4 words
    punpcklbw    m2, m7             ; row 1, low 4 bytes -> 4 words
    punpckhbw    m3, m7             ; row 1, high 4 bytes -> 4 words
    mova         [r0 + r3 +  0], m0
    mova         [r0 + r3 +  8], m1
    mova         [r0 + r3 + 16], m2
    mova         [r0 + r3 + 24], m3
    lea          r1, [r1 + r2 * 2]  ; advance source by two rows
    add          r3, 32             ; 32 bytes of DCTELEMs written
    js .loop                        ; loop while offset still negative
    REP_RET
INIT_XMM sse2
;-----------------------------------------------------------------------------
; void ff_get_pixels_sse2(DCTELEM *block, const uint8_t *pixels, int line_size)
; Fully unrolled: loads 8 rows of 8 pixels, zero-extends each row to eight
; 16-bit coefficients and stores them at fixed 16-byte offsets in block.
;-----------------------------------------------------------------------------
cglobal get_pixels, 3, 4
    movsxdifnidn r2, r2d            ; sign-extend line_size on 64-bit ABIs
    lea          r3, [r2 * 3]       ; r3 = 3 * line_size
    pxor         m4, m4             ; zero register for byte->word unpack
    ; rows 0-3
    movh         m0, [r1]
    movh         m1, [r1 + r2]
    movh         m2, [r1 + r2 * 2]
    movh         m3, [r1 + r3]
    lea          r1, [r1 + r2 * 4]  ; advance to rows 4-7
    punpcklbw    m0, m4
    punpcklbw    m1, m4
    punpcklbw    m2, m4
    punpcklbw    m3, m4
    mova         [r0],        m0
    mova         [r0 + 0x10], m1
    mova         [r0 + 0x20], m2
    mova         [r0 + 0x30], m3
    ; rows 4-7
    movh         m0, [r1]
    movh         m1, [r1 + r2 * 1]
    movh         m2, [r1 + r2 * 2]
    movh         m3, [r1 + r3]
    punpcklbw    m0, m4
    punpcklbw    m1, m4
    punpcklbw    m2, m4
    punpcklbw    m3, m4
    mova         [r0 + 0x40], m0
    mova         [r0 + 0x50], m1
    mova         [r0 + 0x60], m2
    mova         [r0 + 0x70], m3
    RET
INIT_MMX mmx
;-----------------------------------------------------------------------------
; void ff_diff_pixels_mmx(DCTELEM *block, const uint8_t *s1,
;                         const uint8_t *s2, int stride)
; block[i] = s1[i] - s2[i] as 16-bit values, one row (8 pixels) per
; iteration, driven by a -128..0 destination offset counter.
;-----------------------------------------------------------------------------
cglobal diff_pixels, 4, 5
    movsxdifnidn r3, r3d            ; sign-extend stride on 64-bit ABIs
    pxor         m7, m7             ; m7 = 0, used to zero-extend bytes
    add          r0, 128            ; bias block pointer to its midpoint ...
    mov          r4, -128           ; ... so r4 can count -128 -> 0
.loop:
    mova         m0, [r1]           ; 8 pixels from s1
    mova         m2, [r2]           ; 8 pixels from s2
    mova         m1, m0
    mova         m3, m2
    punpcklbw    m0, m7             ; s1 low/high halves -> words
    punpckhbw    m1, m7
    punpcklbw    m2, m7             ; s2 low/high halves -> words
    punpckhbw    m3, m7
    psubw        m0, m2             ; word-wise s1 - s2
    psubw        m1, m3
    mova         [r0 + r4 + 0], m0
    mova         [r0 + r4 + 8], m1
    add          r1, r3             ; next row of both sources
    add          r2, r3
    add          r4, 16             ; 16 bytes of DCTELEMs written
    jne .loop
    REP_RET
INIT_MMX mmx
;-----------------------------------------------------------------------------
; int ff_pix_sum16_mmx(uint8_t *pix, int line_size)
; Returns the sum of all pixels in a 16x16 block (result fits in 16 bits,
; so word accumulation is sufficient). eax = sum on return.
;-----------------------------------------------------------------------------
cglobal pix_sum16, 2, 3
    movsxdifnidn r1, r1d            ; sign-extend line_size on 64-bit ABIs
    mov          r2, r1
    neg          r2
    shl          r2, 4              ; r2 = -16 * line_size
    sub          r0, r2             ; point r0 past the block ...
    pxor         m7, m7             ; ... so [r0+r2] walks it as r2 -> 0
    pxor         m6, m6             ; m6 = running word sums
.loop:
    mova         m0, [r0 + r2 + 0]  ; left 8 pixels (loaded twice so the
    mova         m1, [r0 + r2 + 0]  ;   low/high unpacks get separate copies)
    mova         m2, [r0 + r2 + 8]  ; right 8 pixels
    mova         m3, [r0 + r2 + 8]
    punpcklbw    m0, m7
    punpckhbw    m1, m7
    punpcklbw    m2, m7
    punpckhbw    m3, m7
    paddw        m1, m0
    paddw        m3, m2
    paddw        m3, m1             ; m3 = this row's four partial sums
    paddw        m6, m3
    add          r2, r1             ; next row
    js .loop
    ; horizontal reduction of the four word accumulators in m6
    mova         m5, m6
    psrlq        m6, 32
    paddw        m6, m5
    mova         m5, m6
    psrlq        m6, 16
    paddw        m6, m5
    movd         eax, m6
    and          eax, 0xffff        ; keep only the final 16-bit sum
    RET
INIT_MMX mmx
;-----------------------------------------------------------------------------
; int ff_pix_norm1_mmx(uint8_t *pix, int line_size)
; Returns the sum of squares of all pixels in a 16x16 block.
; eax = sum(pix[i]^2) on return.
;-----------------------------------------------------------------------------
cglobal pix_norm1, 2, 4
    movsxdifnidn r1, r1d            ; sign-extend line_size on 64-bit ABIs
    mov          r2, 16             ; row counter
    pxor         m0, m0             ; m0 = 0, used to zero-extend bytes
    pxor         m7, m7             ; m7 = running dword accumulator
.loop:
    mova         m2, [r0 + 0]       ; pix[0-7]
    mova         m3, [r0 + 8]       ; pix[8-15]
    mova         m1, m2
    punpckhbw    m1, m0             ; m1 = words pix[4-7]
    punpcklbw    m2, m0             ; m2 = words pix[0-3]
    mova         m4, m3
    punpckhbw    m3, m0             ; m3 = words pix[12-15]
    punpcklbw    m4, m0             ; m4 = words pix[8-11]
    pmaddwd      m1, m1             ; squares, pairwise-added into dwords
    pmaddwd      m2, m2
    pmaddwd      m3, m3
    pmaddwd      m4, m4
    paddd        m2, m1
    paddd        m4, m3
    paddd        m7, m2
    add          r0, r1             ; next row (interleaved with the adds)
    paddd        m7, m4
    dec          r2
    jne .loop
    ; fold the two dword halves of m7 into one result
    mova         m1, m7
    psrlq        m7, 32
    paddd        m1, m7
    movd         eax, m1
    RET
libavcodec/x86/dsputilenc_mmx.c
View file @
9f00b1cb
...
@@ -30,181 +30,14 @@
...
@@ -30,181 +30,14 @@
#include "libavcodec/mathops.h"
#include "libavcodec/mathops.h"
#include "dsputil_mmx.h"
#include "dsputil_mmx.h"
void
ff_get_pixels_mmx
(
DCTELEM
*
block
,
const
uint8_t
*
pixels
,
int
line_size
);
void
ff_get_pixels_sse2
(
DCTELEM
*
block
,
const
uint8_t
*
pixels
,
int
line_size
);
void
ff_diff_pixels_mmx
(
DCTELEM
*
block
,
const
uint8_t
*
s1
,
const
uint8_t
*
s2
,
int
stride
);
int
ff_pix_sum16_mmx
(
uint8_t
*
pix
,
int
line_size
);
int
ff_pix_norm1_mmx
(
uint8_t
*
pix
,
int
line_size
);
#if HAVE_INLINE_ASM
#if HAVE_INLINE_ASM
/* Expand a 16x8-bit pixel block into 16-bit DCT coefficients, two rows per
 * iteration. REG_a counts the destination offset from -128 up to 0, with
 * the block pointer pre-biased by +64 DCTELEMs (= +128 bytes). */
static void get_pixels_mmx(DCTELEM *block, const uint8_t *pixels,
                           int line_size)
{
    __asm__ volatile (
        "mov $-128, %%"REG_a"           \n\t"
        "pxor %%mm7, %%mm7              \n\t" /* zero reg for unpacking */
        ".p2align 4                     \n\t"
        "1:                             \n\t"
        "movq (%0), %%mm0               \n\t" /* row 0 */
        "movq (%0, %2), %%mm2           \n\t" /* row 1 */
        "movq %%mm0, %%mm1              \n\t"
        "movq %%mm2, %%mm3              \n\t"
        "punpcklbw %%mm7, %%mm0         \n\t" /* bytes -> words */
        "punpckhbw %%mm7, %%mm1         \n\t"
        "punpcklbw %%mm7, %%mm2         \n\t"
        "punpckhbw %%mm7, %%mm3         \n\t"
        "movq %%mm0, (%1, %%"REG_a")    \n\t"
        "movq %%mm1, 8(%1, %%"REG_a")   \n\t"
        "movq %%mm2, 16(%1, %%"REG_a")  \n\t"
        "movq %%mm3, 24(%1, %%"REG_a")  \n\t"
        "add %3, %0                     \n\t" /* pixels += 2 * line_size */
        "add $32, %%"REG_a"             \n\t"
        "js 1b                          \n\t"
        : "+r"(pixels)
        : "r"(block + 64), "r"((x86_reg)line_size),
          "r"((x86_reg)line_size * 2)
        : "%"REG_a);
}
/* SSE2 variant of get_pixels: fully unrolled over the 8 rows. Each movq
 * loads 8 pixels into the low half of an xmm register; punpcklbw against
 * the zeroed xmm4 widens them to eight 16-bit coefficients per store. */
static void get_pixels_sse2(DCTELEM *block, const uint8_t *pixels,
                            int line_size)
{
    __asm__ volatile (
        "pxor %%xmm4, %%xmm4            \n\t" /* zero reg for unpacking */
        "movq (%0), %%xmm0              \n\t" /* rows 0-3 */
        "movq (%0, %2), %%xmm1          \n\t"
        "movq (%0, %2,2), %%xmm2        \n\t"
        "movq (%0, %3), %%xmm3          \n\t" /* %3 = 3 * line_size */
        "lea (%0,%2,4), %0              \n\t" /* advance to rows 4-7 */
        "punpcklbw %%xmm4, %%xmm0       \n\t"
        "punpcklbw %%xmm4, %%xmm1       \n\t"
        "punpcklbw %%xmm4, %%xmm2       \n\t"
        "punpcklbw %%xmm4, %%xmm3       \n\t"
        "movdqa %%xmm0, (%1)            \n\t"
        "movdqa %%xmm1, 16(%1)          \n\t"
        "movdqa %%xmm2, 32(%1)          \n\t"
        "movdqa %%xmm3, 48(%1)          \n\t"
        "movq (%0), %%xmm0              \n\t" /* rows 4-7 */
        "movq (%0, %2), %%xmm1          \n\t"
        "movq (%0, %2,2), %%xmm2        \n\t"
        "movq (%0, %3), %%xmm3          \n\t"
        "punpcklbw %%xmm4, %%xmm0       \n\t"
        "punpcklbw %%xmm4, %%xmm1       \n\t"
        "punpcklbw %%xmm4, %%xmm2       \n\t"
        "punpcklbw %%xmm4, %%xmm3       \n\t"
        "movdqa %%xmm0, 64(%1)          \n\t"
        "movdqa %%xmm1, 80(%1)          \n\t"
        "movdqa %%xmm2, 96(%1)          \n\t"
        "movdqa %%xmm3, 112(%1)         \n\t"
        : "+r"(pixels)
        : "r"(block), "r"((x86_reg)line_size),
          "r"((x86_reg)line_size * 3));
}
/* block[i] = s1[i] - s2[i] as 16-bit values, one 8-pixel row per
 * iteration. REG_a counts the destination offset from -128 up to 0,
 * with block pre-biased by +64 DCTELEMs. */
static inline void diff_pixels_mmx(DCTELEM *block, const uint8_t *s1,
                                   const uint8_t *s2, int stride)
{
    __asm__ volatile (
        "pxor %%mm7, %%mm7              \n\t" /* zero reg for unpacking */
        "mov $-128, %%"REG_a"           \n\t"
        ".p2align 4                     \n\t"
        "1:                             \n\t"
        "movq (%0), %%mm0               \n\t" /* 8 pixels from s1 */
        "movq (%1), %%mm2               \n\t" /* 8 pixels from s2 */
        "movq %%mm0, %%mm1              \n\t"
        "movq %%mm2, %%mm3              \n\t"
        "punpcklbw %%mm7, %%mm0         \n\t" /* bytes -> words */
        "punpckhbw %%mm7, %%mm1         \n\t"
        "punpcklbw %%mm7, %%mm2         \n\t"
        "punpckhbw %%mm7, %%mm3         \n\t"
        "psubw %%mm2, %%mm0             \n\t" /* word-wise s1 - s2 */
        "psubw %%mm3, %%mm1             \n\t"
        "movq %%mm0, (%2, %%"REG_a")    \n\t"
        "movq %%mm1, 8(%2, %%"REG_a")   \n\t"
        "add %3, %0                     \n\t" /* next row of each source */
        "add %3, %1                     \n\t"
        "add $16, %%"REG_a"             \n\t"
        "jnz 1b                         \n\t"
        : "+r"(s1), "+r"(s2)
        : "r"(block + 64), "r"((x86_reg)stride)
        : "%"REG_a);
}
/* Sum all pixels of a 16x16 block. The loop is driven by a negative byte
 * index that rises to zero; pix - index points just past the block so
 * (%2, %1) addresses each row. The total fits in 16 bits, hence the word
 * accumulation and the final mask. */
static int pix_sum16_mmx(uint8_t *pix, int line_size)
{
    const int h = 16;
    int sum;
    x86_reg index = -line_size * h;

    __asm__ volatile (
        "pxor %%mm7, %%mm7              \n\t" /* zero reg for unpacking */
        "pxor %%mm6, %%mm6              \n\t" /* running word sums */
        "1:                             \n\t"
        "movq (%2, %1), %%mm0           \n\t" /* left 8 pixels, twice so */
        "movq (%2, %1), %%mm1           \n\t" /*   low/high unpack both */
        "movq 8(%2, %1), %%mm2          \n\t" /* right 8 pixels */
        "movq 8(%2, %1), %%mm3          \n\t"
        "punpcklbw %%mm7, %%mm0         \n\t"
        "punpckhbw %%mm7, %%mm1         \n\t"
        "punpcklbw %%mm7, %%mm2         \n\t"
        "punpckhbw %%mm7, %%mm3         \n\t"
        "paddw %%mm0, %%mm1             \n\t"
        "paddw %%mm2, %%mm3             \n\t"
        "paddw %%mm1, %%mm3             \n\t"
        "paddw %%mm3, %%mm6             \n\t"
        "add %3, %1                     \n\t" /* next row */
        " js 1b                         \n\t"
        "movq %%mm6, %%mm5              \n\t" /* horizontal reduction */
        "psrlq $32, %%mm6               \n\t"
        "paddw %%mm5, %%mm6             \n\t"
        "movq %%mm6, %%mm5              \n\t"
        "psrlq $16, %%mm6               \n\t"
        "paddw %%mm5, %%mm6             \n\t"
        "movd %%mm6, %0                 \n\t"
        "andl $0xFFFF, %0               \n\t" /* keep 16-bit result */
        : "=&r"(sum), "+r"(index)
        : "r"(pix - index), "r"((x86_reg)line_size));

    return sum;
}
/* Sum of squares of all pixels of a 16x16 block. Each row is widened to
 * words, squared and pairwise-summed with pmaddwd, and accumulated as
 * dwords in mm7; the two dword halves are folded at the end. */
static int pix_norm1_mmx(uint8_t *pix, int line_size)
{
    int tmp;

    __asm__ volatile (
        "movl $16,%%ecx\n"              /* row counter */
        "pxor %%mm0,%%mm0\n"            /* zero reg for unpacking */
        "pxor %%mm7,%%mm7\n"            /* dword accumulator */
        "1:\n"
        "movq (%0),%%mm2\n"             /* mm2 = pix[0-7] */
        "movq 8(%0),%%mm3\n"            /* mm3 = pix[8-15] */
        "movq %%mm2,%%mm1\n"            /* mm1 = mm2 = pix[0-7] */
        "punpckhbw %%mm0,%%mm1\n"       /* mm1 = [pix4-7] */
        "punpcklbw %%mm0,%%mm2\n"       /* mm2 = [pix0-3] */
        "movq %%mm3,%%mm4\n"            /* mm4 = mm3 = pix[8-15] */
        "punpckhbw %%mm0,%%mm3\n"       /* mm3 = [pix12-15] */
        "punpcklbw %%mm0,%%mm4\n"       /* mm4 = [pix8-11] */
        "pmaddwd %%mm1,%%mm1\n"         /* mm1 = (pix0^2+pix1^2,pix2^2+pix3^2) */
        "pmaddwd %%mm2,%%mm2\n"         /* mm2 = (pix4^2+pix5^2,pix6^2+pix7^2) */
        "pmaddwd %%mm3,%%mm3\n"
        "pmaddwd %%mm4,%%mm4\n"
        "paddd %%mm1,%%mm2\n"           /* mm2 = (pix0^2+pix1^2+pix4^2+pix5^2,
                                                  pix2^2+pix3^2+pix6^2+pix7^2) */
        "paddd %%mm3,%%mm4\n"
        "paddd %%mm2,%%mm7\n"
        "add %2, %0\n"                  /* next row */
        "paddd %%mm4,%%mm7\n"
        "dec %%ecx\n"
        "jnz 1b\n"
        "movq %%mm7,%%mm1\n"
        "psrlq $32, %%mm7\n"            /* shift hi dword to lo */
        "paddd %%mm7,%%mm1\n"
        "movd %%mm1,%1\n"
        : "+r"(pix), "=r"(tmp)
        : "r"((x86_reg)line_size)
        : "%ecx");

    return tmp;
}
static
int
sse8_mmx
(
void
*
v
,
uint8_t
*
pix1
,
uint8_t
*
pix2
,
int
line_size
,
int
h
)
{
static
int
sse8_mmx
(
void
*
v
,
uint8_t
*
pix1
,
uint8_t
*
pix2
,
int
line_size
,
int
h
)
{
int
tmp
;
int
tmp
;
__asm__
volatile
(
__asm__
volatile
(
...
@@ -1111,10 +944,23 @@ hadamard_func(ssse3)
...
@@ -1111,10 +944,23 @@ hadamard_func(ssse3)
void
ff_dsputilenc_init_mmx
(
DSPContext
*
c
,
AVCodecContext
*
avctx
)
void
ff_dsputilenc_init_mmx
(
DSPContext
*
c
,
AVCodecContext
*
avctx
)
{
{
int
mm_flags
=
av_get_cpu_flags
();
int
mm_flags
=
av_get_cpu_flags
();
#if HAVE_INLINE_ASM
int
bit_depth
=
avctx
->
bits_per_raw_sample
;
int
bit_depth
=
avctx
->
bits_per_raw_sample
;
#if HAVE_YASM
if
(
EXTERNAL_MMX
(
mm_flags
))
{
if
(
bit_depth
<=
8
)
c
->
get_pixels
=
ff_get_pixels_mmx
;
c
->
diff_pixels
=
ff_diff_pixels_mmx
;
c
->
pix_sum
=
ff_pix_sum16_mmx
;
c
->
pix_norm1
=
ff_pix_norm1_mmx
;
}
if
(
EXTERNAL_SSE2
(
mm_flags
))
if
(
bit_depth
<=
8
)
c
->
get_pixels
=
ff_get_pixels_sse2
;
#endif
/* HAVE_YASM */
#if HAVE_INLINE_ASM
if
(
mm_flags
&
AV_CPU_FLAG_MMX
)
{
if
(
mm_flags
&
AV_CPU_FLAG_MMX
)
{
const
int
dct_algo
=
avctx
->
dct_algo
;
const
int
dct_algo
=
avctx
->
dct_algo
;
if
(
avctx
->
bits_per_raw_sample
<=
8
&&
if
(
avctx
->
bits_per_raw_sample
<=
8
&&
...
@@ -1128,15 +974,10 @@ void ff_dsputilenc_init_mmx(DSPContext* c, AVCodecContext *avctx)
...
@@ -1128,15 +974,10 @@ void ff_dsputilenc_init_mmx(DSPContext* c, AVCodecContext *avctx)
}
}
}
}
if
(
bit_depth
<=
8
)
c
->
get_pixels
=
get_pixels_mmx
;
c
->
diff_pixels
=
diff_pixels_mmx
;
c
->
pix_sum
=
pix_sum16_mmx
;
c
->
diff_bytes
=
diff_bytes_mmx
;
c
->
diff_bytes
=
diff_bytes_mmx
;
c
->
sum_abs_dctelem
=
sum_abs_dctelem_mmx
;
c
->
sum_abs_dctelem
=
sum_abs_dctelem_mmx
;
c
->
pix_norm1
=
pix_norm1_mmx
;
c
->
sse
[
0
]
=
sse16_mmx
;
c
->
sse
[
0
]
=
sse16_mmx
;
c
->
sse
[
1
]
=
sse8_mmx
;
c
->
sse
[
1
]
=
sse8_mmx
;
c
->
vsad
[
4
]
=
vsad_intra16_mmx
;
c
->
vsad
[
4
]
=
vsad_intra16_mmx
;
...
@@ -1166,8 +1007,6 @@ void ff_dsputilenc_init_mmx(DSPContext* c, AVCodecContext *avctx)
...
@@ -1166,8 +1007,6 @@ void ff_dsputilenc_init_mmx(DSPContext* c, AVCodecContext *avctx)
}
}
if
(
mm_flags
&
AV_CPU_FLAG_SSE2
){
if
(
mm_flags
&
AV_CPU_FLAG_SSE2
){
if
(
bit_depth
<=
8
)
c
->
get_pixels
=
get_pixels_sse2
;
c
->
sum_abs_dctelem
=
sum_abs_dctelem_sse2
;
c
->
sum_abs_dctelem
=
sum_abs_dctelem_sse2
;
}
}
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment