Skip to content
Projects
Groups
Snippets
Help
Loading...
Sign in / Register
Toggle navigation
F
ffmpeg
Project
Project
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Packages
Packages
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
submodule
ffmpeg
Commits
de53b906
Commit
de53b906
authored
Feb 02, 2012
by
Ronald S. Bultje
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
swscale: implement MMX, SSE2 and AVX functions for RGB32 input.
parent
38c6bbc1
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
175 additions
and
0 deletions
+175
-0
input.asm
libswscale/x86/input.asm
+159
-0
swscale_mmx.c
libswscale/x86/swscale_mmx.c
+16
-0
No files found.
libswscale/x86/input.asm
View file @
de53b906
...
@@ -51,6 +51,19 @@ bgr_Vcoeff_3x56: times 2 dw RV, 0, GV, RV
...
@@ -51,6 +51,19 @@ bgr_Vcoeff_3x56: times 2 dw RV, 0, GV, RV
rgb_Vcoeff_12x4
:
times
2
dw
RV
,
GV
,
0
,
RV
rgb_Vcoeff_12x4
:
times
2
dw
RV
,
GV
,
0
,
RV
rgb_Vcoeff_3x56
:
times
2
dw
BV
,
0
,
GV
,
BV
rgb_Vcoeff_3x56
:
times
2
dw
BV
,
0
,
GV
,
BV
; Coefficient tables for the RGB32 (4 bytes per pixel) input functions.
; Every table repeats one pair of words four times ("times 4 dw a, b").
; The two-letter suffix names the components the pair is applied to, in
; order: rb/br hold the red and blue coefficients, ga/ag hold the green
; coefficient paired with 0 for the alpha component.
; RGB32_TO_Y_FN / RGB32_TO_UV_FN pick a table by pasting their component
; letters (%2%4 and %3%5) onto rgba_Ycoeff_ / rgba_Ucoeff_ / rgba_Vcoeff_.
rgba_Ycoeff_rb
:
times
4
dw
RY
,
BY
rgba_Ycoeff_br
:
times
4
dw
BY
,
RY
rgba_Ycoeff_ga
:
times
4
dw
GY
,
0
rgba_Ycoeff_ag
:
times
4
dw
0
,
GY
rgba_Ucoeff_rb
:
times
4
dw
RU
,
BU
rgba_Ucoeff_br
:
times
4
dw
BU
,
RU
rgba_Ucoeff_ga
:
times
4
dw
GU
,
0
rgba_Ucoeff_ag
:
times
4
dw
0
,
GU
rgba_Vcoeff_rb
:
times
4
dw
RV
,
BV
rgba_Vcoeff_br
:
times
4
dw
BV
,
RV
rgba_Vcoeff_ga
:
times
4
dw
GV
,
0
rgba_Vcoeff_ag
:
times
4
dw
0
,
GV
shuf_rgb_12x4
:
db
0
,
0x80
,
1
,
0x80
,
2
,
0x80
,
3
,
0x80
,
\
shuf_rgb_12x4
:
db
0
,
0x80
,
1
,
0x80
,
2
,
0x80
,
3
,
0x80
,
\
6
,
0x80
,
7
,
0x80
,
8
,
0x80
,
9
,
0x80
6
,
0x80
,
7
,
0x80
,
8
,
0x80
,
9
,
0x80
shuf_rgb_3x56
:
db
2
,
0x80
,
3
,
0x80
,
4
,
0x80
,
5
,
0x80
,
\
shuf_rgb_3x56
:
db
2
,
0x80
,
3
,
0x80
,
4
,
0x80
,
5
,
0x80
,
\
...
@@ -296,6 +309,152 @@ RGB24_FUNCS 11, 13
...
@@ -296,6 +309,152 @@ RGB24_FUNCS 11, 13
INIT_XMM
avx
INIT_XMM
avx
RGB24_FUNCS
11
,
13
RGB24_FUNCS
11
,
13
; RGB32_TO_Y_FN: emit one <order>ToY function (dst, src, w).
; Each pixel is read as 4 source bytes; one byte per pixel is stored to dst,
; computed as (pmaddwd sums + rgb_Yrnd) >> 15, saturated to unsigned 8 bit.
; %1   = nr. of XMM registers
; %2-5 = rgba, bgra, argb or abgr (in individual characters); %2%4 and %3%5
;        select the rgba_Ycoeff_* tables loaded into m5 and m6
; %6   = (optional) name of the function whose .body loop is reused; when
;        given, only the two coefficient loads are emitted, followed by a
;        jump to that function's .body label
%macro
RGB32_TO_Y_FN
5
-
6
cglobal
%2%3%4%5
%
+
ToY
,
3
,
3
,
%1
,
dst
,
src
,
w
mova
m5
,
[
rgba_Ycoeff_
%2%4
]
mova
m6
,
[
rgba_Ycoeff_
%3%5
]
%if
%0
==
6
jmp
mangle
(
program_name
%
+
_
%
+
%6
%
+
ToY
%
+
SUFFIX
).
body
%else
; %0 == 6
.
body
:
%if
ARCH_X86_64
movsxd
wq
,
wd
%endif
; point srcq/dstq at the end of the row and negate w, so that wq counts
; up from -w towards 0 while [srcq+wq*4] / [dstq+wq] walk the row forwards
lea
srcq
,
[
srcq
+
wq
*
4
]
add
dstq
,
wq
neg
wq
mova
m4
,
[
rgb_Yrnd
]
pcmpeqb
m7
,
m7
psrlw
m7
,
8
; (word) { 0x00ff } in every word lane (mmsize/2 words)
.
loop
:
; NOTE(review): the Bx/Gx/Rx/xx lane names in the comments below describe one
; byte order only; the actual order follows %2-5 through the tables in m5/m6.
; FIXME check alignment and use mova
movu
m0
,
[
srcq
+
wq
*
4
+
0
]
; (byte) { Bx, Gx, Rx, xx }[0-3]
movu
m2
,
[
srcq
+
wq
*
4
+
mmsize
]
; (byte) { Bx, Gx, Rx, xx }[4-7]
DEINTB
1
,
0
,
3
,
2
,
7
; (word) { Gx, xx (m0/m2) or Bx, Rx (m1/m3) }[0-3]/[4-7]
pmaddwd
m1
,
m5
; (dword) { Bx*BY + Rx*RY }[0-3]
pmaddwd
m0
,
m6
; (dword) { Gx*GY }[0-3]
pmaddwd
m3
,
m5
; (dword) { Bx*BY + Rx*RY }[4-7]
pmaddwd
m2
,
m6
; (dword) { Gx*GY }[4-7]
paddd
m0
,
m4
; += rgb_Yrnd
paddd
m2
,
m4
; += rgb_Yrnd
paddd
m0
,
m1
; (dword) { Y[0-3] }
paddd
m2
,
m3
; (dword) { Y[4-7] }
psrad
m0
,
15
psrad
m2
,
15
packssdw
m0
,
m2
; (word) { Y[0-7] }
packuswb
m0
,
m0
; (byte) { Y[0-7] }
movh
[
dstq
+
wq
]
,
m0
add
wq
,
mmsize
/
2
; mmsize/2 pixels (2*mmsize source bytes) are consumed per iteration
jl
.
loop
REP_RET
%endif
; %0 == 6
%endmacro
; RGB32_TO_UV_FN: emit one <order>ToUV function (dstU, dstV, src, w).
; Each pixel is read as 4 source bytes; one byte per pixel is stored to each
; of dstU and dstV, computed as (pmaddwd sums + rgb_UVrnd) >> 15, saturated
; to unsigned 8 bit.
; %1   = nr. of XMM registers
; %2-5 = rgba, bgra, argb or abgr (in individual characters); %2%4 and %3%5
;        select the rgba_Ucoeff_* / rgba_Vcoeff_* tables
; %6   = (optional) name of the function whose .body loop is reused; this is
;        only honoured on x86-64 (see the %if below)
%macro
RGB32_TO_UV_FN
5
-
6
cglobal
%2%3%4%5
%
+
ToUV
,
3
,
4
,
%1
,
dstU
,
dstV
,
src
,
w
; x86-64 keeps the four coefficient vectors in m8-m11; x86-32 uses the tables
; directly as memory operands instead
%if
ARCH_X86_64
mova
m8
,
[
rgba_Ucoeff_
%2%4
]
mova
m9
,
[
rgba_Ucoeff_
%3%5
]
mova
m10
,
[
rgba_Vcoeff_
%2%4
]
mova
m11
,
[
rgba_Vcoeff_
%3%5
]
%define
coeffU1
m8
%define
coeffU2
m9
%define
coeffV1
m10
%define
coeffV2
m11
%else
; x86-32
%define
coeffU1
[
rgba_Ucoeff_
%2%4
]
%define
coeffU2
[
rgba_Ucoeff_
%3%5
]
%define
coeffV1
[
rgba_Vcoeff_
%2%4
]
%define
coeffV2
[
rgba_Vcoeff_
%3%5
]
%endif
; x86-64/32
; The shared body can only be reused when the coefficients live in registers
; (x86-64). On x86-32 they are memory operands written into the loop itself,
; so every byte order emits its own .body even when %6 is given.
%if
ARCH_X86_64
&&
%0
==
6
jmp
mangle
(
program_name
%
+
_
%
+
%6
%
+
ToUV
%
+
SUFFIX
).
body
%else
; ARCH_X86_64 && %0 == 6
.
body
:
; w is fetched explicitly from r4m (sign-extended to 64 bit on x86-64)
%if
ARCH_X86_64
movsxd
wq
,
dword
r4m
%else
; x86-32
mov
wq
,
r4m
%endif
; point dstUq/dstVq/srcq at the end of the row and negate w, so that wq
; counts up from -w towards 0 while the loop walks the row forwards
add
dstUq
,
wq
add
dstVq
,
wq
lea
srcq
,
[
srcq
+
wq
*
4
]
neg
wq
pcmpeqb
m7
,
m7
psrlw
m7
,
8
; (word) { 0x00ff } in every word lane (mmsize/2 words)
mova
m6
,
[
rgb_UVrnd
]
.
loop
:
; NOTE(review): the Bx/Gx/Rx/xx lane names in the comments below describe one
; byte order only; the actual order follows %2-5 through coeffU1/2, coeffV1/2.
; FIXME check alignment and use mova
movu
m0
,
[
srcq
+
wq
*
4
+
0
]
; (byte) { Bx, Gx, Rx, xx }[0-3]
movu
m4
,
[
srcq
+
wq
*
4
+
mmsize
]
; (byte) { Bx, Gx, Rx, xx }[4-7]
DEINTB
1
,
0
,
5
,
4
,
7
; (word) { Gx, xx (m0/m4) or Bx, Rx (m1/m5) }[0-3]/[4-7]
pmaddwd
m3
,
m1
,
coeffV1
; (dword) { Bx*BV + Rx*RV }[0-3]
pmaddwd
m2
,
m0
,
coeffV2
; (dword) { Gx*GV }[0-3]
pmaddwd
m1
,
coeffU1
; (dword) { Bx*BU + Rx*RU }[0-3]
pmaddwd
m0
,
coeffU2
; (dword) { Gx*GU }[0-3]
paddd
m3
,
m6
; += rgb_UVrnd
paddd
m1
,
m6
; += rgb_UVrnd
paddd
m2
,
m3
; (dword) { V[0-3] }
paddd
m0
,
m1
; (dword) { U[0-3] }
pmaddwd
m3
,
m5
,
coeffV1
; (dword) { Bx*BV + Rx*RV }[4-7]
pmaddwd
m1
,
m4
,
coeffV2
; (dword) { Gx*GV }[4-7]
pmaddwd
m5
,
coeffU1
; (dword) { Bx*BU + Rx*RU }[4-7]
pmaddwd
m4
,
coeffU2
; (dword) { Gx*GU }[4-7]
paddd
m3
,
m6
; += rgb_UVrnd
paddd
m5
,
m6
; += rgb_UVrnd
psrad
m0
,
15
paddd
m1
,
m3
; (dword) { V[4-7] }
paddd
m4
,
m5
; (dword) { U[4-7] }
psrad
m2
,
15
psrad
m4
,
15
psrad
m1
,
15
packssdw
m0
,
m4
; (word) { U[0-7] }
packssdw
m2
,
m1
; (word) { V[0-7] }
; 8-byte registers: U and V are packed and stored from separate registers;
; 16-byte registers: both are packed into m0 (U in the low half, V in the
; high half) and stored with movh / movhps
%if
mmsize
==
8
packuswb
m0
,
m0
; (byte) { U[0-7] }
packuswb
m2
,
m2
; (byte) { V[0-7] }
movh
[
dstUq
+
wq
]
,
m0
movh
[
dstVq
+
wq
]
,
m2
%else
; mmsize == 16
packuswb
m0
,
m2
; (byte) { U[0-7], V[0-7] }
movh
[
dstUq
+
wq
]
,
m0
movhps
[
dstVq
+
wq
]
,
m0
%endif
; mmsize == 8/16
add
wq
,
mmsize
/
2
; mmsize/2 pixels (2*mmsize source bytes) are consumed per iteration
jl
.
loop
REP_RET
%endif
; ARCH_X86_64 && %0 == 6
%endmacro
; RGB32_FUNCS: instantiate the ToY and ToUV functions for the four byte
; orders rgba, bgra, argb and abgr under the current INIT_* instruction set.
; %1 = nr. of XMM registers for rgb-to-Y func
; %2 = nr. of XMM registers for rgb-to-UV func
; The rgba variants are emitted without a 6th argument and therefore contain
; the .body loop; the other three orders pass "rgba" as 6th argument so that
; they can jump into that loop after loading their own coefficients (the UV
; macro only does so on x86-64).
%macro
RGB32_FUNCS
2
RGB32_TO_Y_FN
%1
,
r
,
g
,
b
,
a
RGB32_TO_Y_FN
%1
,
b
,
g
,
r
,
a
,
rgba
RGB32_TO_Y_FN
%1
,
a
,
r
,
g
,
b
,
rgba
RGB32_TO_Y_FN
%1
,
a
,
b
,
g
,
r
,
rgba
RGB32_TO_UV_FN
%2
,
r
,
g
,
b
,
a
RGB32_TO_UV_FN
%2
,
b
,
g
,
r
,
a
,
rgba
RGB32_TO_UV_FN
%2
,
a
,
r
,
g
,
b
,
rgba
RGB32_TO_UV_FN
%2
,
a
,
b
,
g
,
r
,
rgba
%endmacro
; MMX versions are only assembled for x86-32 and request no XMM registers.
%if
ARCH_X86_32
INIT_MMX
mmx
RGB32_FUNCS
0
,
0
%endif
; SSE2 and AVX versions: 8 XMM registers for the Y functions (m0-m7) and 12
; for the UV functions (m0-m7 plus the coefficients kept in m8-m11 on x86-64).
INIT_XMM
sse2
RGB32_FUNCS
8
,
12
INIT_XMM
avx
RGB32_FUNCS
8
,
12
;-----------------------------------------------------------------------------
;-----------------------------------------------------------------------------
; YUYV/UYVY/NV12/NV21 packed pixel shuffling.
; YUYV/UYVY/NV12/NV21 packed pixel shuffling.
;
;
...
...
libswscale/x86/swscale_mmx.c
View file @
de53b906
...
@@ -244,6 +244,10 @@ extern void ff_ ## fmt ## ToUV_ ## opt(uint8_t *dstU, uint8_t *dstV, \
...
@@ -244,6 +244,10 @@ extern void ff_ ## fmt ## ToUV_ ## opt(uint8_t *dstU, uint8_t *dstV, \
INPUT_FUNC(yuyv, opt); \
INPUT_FUNC(yuyv, opt); \
INPUT_UV_FUNC(nv12, opt); \
INPUT_UV_FUNC(nv12, opt); \
INPUT_UV_FUNC(nv21, opt); \
INPUT_UV_FUNC(nv21, opt); \
INPUT_FUNC(rgba, opt); \
INPUT_FUNC(bgra, opt); \
INPUT_FUNC(argb, opt); \
INPUT_FUNC(abgr, opt); \
INPUT_FUNC(rgb24, opt); \
INPUT_FUNC(rgb24, opt); \
INPUT_FUNC(bgr24, opt)
INPUT_FUNC(bgr24, opt)
...
@@ -335,6 +339,10 @@ switch(c->dstBpc){ \
...
@@ -335,6 +339,10 @@ switch(c->dstBpc){ \
break
;
break
;
case_rgb
(
rgb24
,
RGB24
,
mmx
);
case_rgb
(
rgb24
,
RGB24
,
mmx
);
case_rgb
(
bgr24
,
BGR24
,
mmx
);
case_rgb
(
bgr24
,
BGR24
,
mmx
);
case_rgb
(
bgra
,
BGRA
,
mmx
);
case_rgb
(
rgba
,
RGBA
,
mmx
);
case_rgb
(
abgr
,
ABGR
,
mmx
);
case_rgb
(
argb
,
ARGB
,
mmx
);
default:
default:
break
;
break
;
}
}
...
@@ -379,6 +387,10 @@ switch(c->dstBpc){ \
...
@@ -379,6 +387,10 @@ switch(c->dstBpc){ \
break
;
break
;
case_rgb
(
rgb24
,
RGB24
,
sse2
);
case_rgb
(
rgb24
,
RGB24
,
sse2
);
case_rgb
(
bgr24
,
BGR24
,
sse2
);
case_rgb
(
bgr24
,
BGR24
,
sse2
);
case_rgb
(
bgra
,
BGRA
,
sse2
);
case_rgb
(
rgba
,
RGBA
,
sse2
);
case_rgb
(
abgr
,
ABGR
,
sse2
);
case_rgb
(
argb
,
ARGB
,
sse2
);
default:
default:
break
;
break
;
}
}
...
@@ -422,6 +434,10 @@ switch(c->dstBpc){ \
...
@@ -422,6 +434,10 @@ switch(c->dstBpc){ \
break
;
break
;
case_rgb
(
rgb24
,
RGB24
,
avx
);
case_rgb
(
rgb24
,
RGB24
,
avx
);
case_rgb
(
bgr24
,
BGR24
,
avx
);
case_rgb
(
bgr24
,
BGR24
,
avx
);
case_rgb
(
bgra
,
BGRA
,
avx
);
case_rgb
(
rgba
,
RGBA
,
avx
);
case_rgb
(
abgr
,
ABGR
,
avx
);
case_rgb
(
argb
,
ARGB
,
avx
);
default:
default:
break
;
break
;
}
}
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment