Skip to content
Projects
Groups
Snippets
Help
Loading...
Sign in / Register
Toggle navigation
F
ffmpeg
Project
Project
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Packages
Packages
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
submodule
ffmpeg
Commits
5aca33c2
Commit
5aca33c2
authored
Dec 07, 2015
by
Matthieu Bouron
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
swscale/arm: add ff_nv{12,21}_to_{argb,rgba,abgr,bgra}_neon_16
parent
102842d5
Show whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
167 additions
and
59 deletions
+167
-59
swscale_unscaled.c
libswscale/arm/swscale_unscaled.c
+32
-27
yuv2rgb_neon.S
libswscale/arm/yuv2rgb_neon.S
+135
-32
No files found.
libswscale/arm/swscale_unscaled.c
View file @
5aca33c2
...
@@ -63,8 +63,8 @@ static int rgbx_to_nv12_neon_16_wrapper(SwsContext *context, const uint8_t *src[
...
@@ -63,8 +63,8 @@ static int rgbx_to_nv12_neon_16_wrapper(SwsContext *context, const uint8_t *src[
}
}
#endif
#endif
#define DECLARE_FF_NVX_TO_RGBX_FUNCS(ifmt, ofmt
)
\
#define DECLARE_FF_NVX_TO_RGBX_FUNCS(ifmt, ofmt
, precision)
\
int ff_##ifmt##_to_##ofmt##_neon
(int w, int h,
\
int ff_##ifmt##_to_##ofmt##_neon
_##precision(int w, int h,
\
uint8_t *dst, int linesize, \
uint8_t *dst, int linesize, \
const uint8_t *srcY, int linesizeY, \
const uint8_t *srcY, int linesizeY, \
const uint8_t *srcC, int linesizeC, \
const uint8_t *srcC, int linesizeC, \
...
@@ -72,59 +72,64 @@ int ff_##ifmt##_to_##ofmt##_neon(int w, int h,
...
@@ -72,59 +72,64 @@ int ff_##ifmt##_to_##ofmt##_neon(int w, int h,
int y_offset, \
int y_offset, \
int y_coeff); \
int y_coeff); \
\
\
static int ifmt##_to_##ofmt##_neon_wrapper
(SwsContext *c, const uint8_t *src[],
\
static int ifmt##_to_##ofmt##_neon_wrapper
_##precision(SwsContext *c, const uint8_t *src[],
\
int srcStride[], int srcSliceY, int srcSliceH, \
int srcStride[], int srcSliceY, int srcSliceH, \
uint8_t *dst[], int dstStride[]) { \
uint8_t *dst[], int dstStride[]) { \
const int16_t yuv2rgb_table[] = { \
const int16_t yuv2rgb_table[] = { \
c->yuv2rgb_v2r_coeff
,
\
c->yuv2rgb_v2r_coeff
/ ((precision) == 16 ? 1 << 7 : 1),
\
c->yuv2rgb_u2g_coeff
,
\
c->yuv2rgb_u2g_coeff
/ ((precision) == 16 ? 1 << 7 : 1),
\
c->yuv2rgb_v2g_coeff
,
\
c->yuv2rgb_v2g_coeff
/ ((precision) == 16 ? 1 << 7 : 1),
\
c->yuv2rgb_u2b_coeff
,
\
c->yuv2rgb_u2b_coeff
/ ((precision) == 16 ? 1 << 7 : 1),
\
}; \
}; \
\
\
ff_##ifmt##_to_##ofmt##_neon
(c->srcW, srcSliceH,
\
ff_##ifmt##_to_##ofmt##_neon
_##precision(c->srcW, srcSliceH,
\
dst[0] + srcSliceY * dstStride[0], dstStride[0], \
dst[0] + srcSliceY * dstStride[0], dstStride[0], \
src[0] + srcSliceY * srcStride[0], srcStride[0], \
src[0] + srcSliceY * srcStride[0], srcStride[0], \
src[1] + (srcSliceY / 2) * srcStride[1], srcStride[1], \
src[1] + (srcSliceY / 2) * srcStride[1], srcStride[1], \
yuv2rgb_table, \
yuv2rgb_table, \
c->yuv2rgb_y_offset >> 9, \
c->yuv2rgb_y_offset >> 9, \
c->yuv2rgb_y_coeff
);
\
c->yuv2rgb_y_coeff
/ ((precision) == 16 ? 1 << 7 : 1));
\
\
\
return 0; \
return 0; \
}
} \
#define DECLARE_FF_NVX_TO_ALL_RGBX_FUNCS(nvx, precision) \
DECLARE_FF_NVX_TO_RGBX_FUNCS(nvx, argb, precision) \
DECLARE_FF_NVX_TO_RGBX_FUNCS(nvx, rgba, precision) \
DECLARE_FF_NVX_TO_RGBX_FUNCS(nvx, abgr, precision) \
DECLARE_FF_NVX_TO_RGBX_FUNCS(nvx, bgra, precision) \
#define DECLARE_FF_NVX_TO_ALL_RGBX_FUNCS(nvx) \
#define DECLARE_FF_NVX_TO_ALL_RGBX_ALL_PRECISION_FUNCS(nvx) \
DECLARE_FF_NVX_TO_RGBX_FUNCS(nvx, argb) \
DECLARE_FF_NVX_TO_ALL_RGBX_FUNCS(nvx, 16) \
DECLARE_FF_NVX_TO_RGBX_FUNCS(nvx, rgba) \
DECLARE_FF_NVX_TO_ALL_RGBX_FUNCS(nvx, 32) \
DECLARE_FF_NVX_TO_RGBX_FUNCS(nvx, abgr) \
DECLARE_FF_NVX_TO_RGBX_FUNCS(nvx, bgra) \
DECLARE_FF_NVX_TO_ALL_RGBX_FUNCS
(
nv12
)
DECLARE_FF_NVX_TO_ALL_RGBX_
ALL_PRECISION_
FUNCS
(
nv12
)
DECLARE_FF_NVX_TO_ALL_RGBX_FUNCS
(
nv21
)
DECLARE_FF_NVX_TO_ALL_RGBX_
ALL_PRECISION_
FUNCS
(
nv21
)
/* We need a 16 pixel width alignment. This constraint can easily be removed
/* We need a 16 pixel width alignment. This constraint can easily be removed
* for input reading but for the output which is 4-bytes per pixel (RGBA) the
* for input reading but for the output which is 4-bytes per pixel (RGBA) the
* assembly might be writing as much as 4*15=60 extra bytes at the end of the
* assembly might be writing as much as 4*15=60 extra bytes at the end of the
* line, which won't fit the 32-bytes buffer alignment. */
* line, which won't fit the 32-bytes buffer alignment. */
#define SET_FF_NVX_TO_RGBX_FUNC(ifmt, IFMT, ofmt, OFMT
) do {
\
#define SET_FF_NVX_TO_RGBX_FUNC(ifmt, IFMT, ofmt, OFMT
, accurate_rnd) do {
\
if (c->srcFormat == AV_PIX_FMT_##IFMT \
if (c->srcFormat == AV_PIX_FMT_##IFMT \
&& c->dstFormat == AV_PIX_FMT_##OFMT \
&& c->dstFormat == AV_PIX_FMT_##OFMT \
&& !(c->srcH & 1) \
&& !(c->srcH & 1) \
&& !(c->srcW & 15)) { \
&& !(c->srcW & 15)) { \
c->swscale = ifmt##_to_##ofmt##_neon_wrapper; \
c->swscale = (accurate_rnd) ? ifmt##_to_##ofmt##_neon_wrapper_32 : \
ifmt##_to_##ofmt##_neon_wrapper_16 ; \
} \
} \
} while (0)
} while (0)
#define SET_FF_NVX_TO_ALL_RGBX_FUNC(nvx, NVX
) do {
\
#define SET_FF_NVX_TO_ALL_RGBX_FUNC(nvx, NVX
, accurate_rnd) do {
\
SET_FF_NVX_TO_RGBX_FUNC(nvx, NVX, argb, ARGB
);
\
SET_FF_NVX_TO_RGBX_FUNC(nvx, NVX, argb, ARGB
, accurate_rnd);
\
SET_FF_NVX_TO_RGBX_FUNC(nvx, NVX, rgba, RGBA
);
\
SET_FF_NVX_TO_RGBX_FUNC(nvx, NVX, rgba, RGBA
, accurate_rnd);
\
SET_FF_NVX_TO_RGBX_FUNC(nvx, NVX, abgr, ABGR
);
\
SET_FF_NVX_TO_RGBX_FUNC(nvx, NVX, abgr, ABGR
, accurate_rnd);
\
SET_FF_NVX_TO_RGBX_FUNC(nvx, NVX, bgra, BGRA
);
\
SET_FF_NVX_TO_RGBX_FUNC(nvx, NVX, bgra, BGRA
, accurate_rnd);
\
} while (0)
} while (0)
static
void
get_unscaled_swscale_neon
(
SwsContext
*
c
)
{
static
void
get_unscaled_swscale_neon
(
SwsContext
*
c
)
{
#if 0
int
accurate_rnd
=
c
->
flags
&
SWS_ACCURATE_RND
;
int
accurate_rnd
=
c
->
flags
&
SWS_ACCURATE_RND
;
#if 0
if (c->srcFormat == AV_PIX_FMT_RGBA
if (c->srcFormat == AV_PIX_FMT_RGBA
&& c->dstFormat == AV_PIX_FMT_NV12
&& c->dstFormat == AV_PIX_FMT_NV12
&& (c->srcW >= 16)) {
&& (c->srcW >= 16)) {
...
@@ -133,8 +138,8 @@ static void get_unscaled_swscale_neon(SwsContext *c) {
...
@@ -133,8 +138,8 @@ static void get_unscaled_swscale_neon(SwsContext *c) {
}
}
#endif
#endif
SET_FF_NVX_TO_ALL_RGBX_FUNC
(
nv12
,
NV12
);
SET_FF_NVX_TO_ALL_RGBX_FUNC
(
nv12
,
NV12
,
accurate_rnd
);
SET_FF_NVX_TO_ALL_RGBX_FUNC
(
nv21
,
NV21
);
SET_FF_NVX_TO_ALL_RGBX_FUNC
(
nv21
,
NV21
,
accurate_rnd
);
}
}
void
ff_get_unscaled_swscale_arm
(
SwsContext
*
c
)
void
ff_get_unscaled_swscale_arm
(
SwsContext
*
c
)
...
...
libswscale/arm/yuv2rgb_neon.S
View file @
5aca33c2
...
@@ -21,7 +21,35 @@
...
@@ -21,7 +21,35 @@
#include "libavutil/arm/asm.S"
#include "libavutil/arm/asm.S"
.macro compute_premult half_u half_v
@ compute_premult_16: build the chroma terms for 16 pixels using 16-bit
@ multiplies.
@   half_u1, half_u2: D registers holding four 16-bit U samples each
@   half_v1, half_v2: D registers holding four 16-bit V samples each
@ Coefficients are taken from the lanes of d1:
@   d1[0] = v2r, d1[1] = u2g, d1[2] = v2g, d1[3] = u2b
@ Results (left 8 pixels / right 8 pixels):
@   q8 / q9 = red term, q10 / q11 = green term, q12 / q13 = blue term
@ Clobbers q1-q4 (d2-d9).
.macro compute_premult_16 half_u1, half_u2, half_v1, half_v2
@ Step 1: place every chroma half in both D halves of a Q register.
vmov d2, \half_u1 @ copy left q14 to left q1
vmov d3, \half_u1 @ copy left q14 to right q1
vmov d4, \half_u2 @ copy right q14 to left q2
vmov d5, \half_u2 @ copy right q14 to right q2
vmov d6, \half_v1 @ copy left q15 to left q3
vmov d7, \half_v1 @ copy left q15 to right q3
vmov d8, \half_v2 @ copy right q15 to left q4
vmov d9, \half_v2 @ copy right q15 to right q4
@ Step 2: zipping two identical D registers repeats each 16-bit sample
@ twice in a row, so one chroma sample lines up with two adjacent pixels.
vzip.16 d2, d3 @ U1U1U2U2U3U3U4U4
vzip.16 d4, d5 @ U5U5U6U6U7U7U8U8
vzip.16 d6, d7 @ V1V1V2V2V3V3V4V4
vzip.16 d8, d9 @ V5V5V6V6V7V7V8V8
@ Step 3: scale by the per-channel coefficients; results stay in 16-bit lanes.
vmul.s16 q8, q3, d1[0] @ V * v2r (left, red)
vmul.s16 q9, q4, d1[0] @ V * v2r (right, red)
vmul.s16 q10, q1, d1[1] @ U * u2g
vmul.s16 q11, q2, d1[1] @ U * u2g
vmla.s16 q10, q3, d1[2] @ U * u2g + V * v2g (left, green)
vmla.s16 q11, q4, d1[2] @ U * u2g + V * v2g (right, green)
vmul.s16 q12, q1, d1[3] @ U * u2b (left, blue)
vmul.s16 q13, q2, d1[3] @ U * u2b (right, blue)
.endm
.macro compute_premult_32 half_u half_v
vmov d2, \half_u @ copy left q14 to left q1
vmov d2, \half_u @ copy left q14 to left q1
vmov d3, \half_u @ copy left q14 to right q1
vmov d3, \half_u @ copy left q14 to right q1
vmov d4, \half_v @ copy left q15 to left q2
vmov d4, \half_v @ copy left q15 to left q2
...
@@ -40,7 +68,14 @@
...
@@ -40,7 +68,14 @@
vmull.s16 q13, d3, d1[3] @ U * u2b (right, blue)
vmull.s16 q13, d3, d1[3] @ U * u2b (right, blue)
.endm
.endm
.macro compute_color dst_comp pre1 pre2
@ compute_color_16: finish one colour channel for 16 pixels.
@   dst_comp1, dst_comp2: D registers receiving 8 output bytes each
@   pre1, pre2: Q registers with the chroma term for the left / right 8 pixels
@ The chroma term is added to q14 / q15 (the scaled luma when invoked through
@ compute_16px_16). Each 16-bit sum is then rounded, shifted right by 6 and
@ saturated to an unsigned byte by vqrshrun.
@ Clobbers q1 and q2.
.macro compute_color_16 dst_comp1 dst_comp2 pre1 pre2
vadd.s16 q1, q14, \pre1
vadd.s16 q2, q15, \pre2
vqrshrun.s16 \dst_comp1, q1, #6
vqrshrun.s16 \dst_comp2, q2, #6
.endm
.macro compute_color_32 dst_comp pre1 pre2
vadd.s32 q3, q1, \pre1
vadd.s32 q3, q1, \pre1
vadd.s32 q4, q2, \pre2
vadd.s32 q4, q2, \pre2
vqrshrun.s32 d10, q3, #13
vqrshrun.s32 d10, q3, #13
...
@@ -48,14 +83,56 @@
...
@@ -48,14 +83,56 @@
vqmovn.u16 \dst_comp, q5 @ saturate 16bit -> 8bit
vqmovn.u16 \dst_comp, q5 @ saturate 16bit -> 8bit
.endm
.endm
.macro compute_rgba r g b a
.macro compute_rgba_16 r1 r2 g1 g2 b1 b2 a1 a2
compute_color \r, q8, q9
compute_color_16 \r1, \r2, q8, q9
compute_color \g, q10, q11
compute_color_16 \g1, \g2, q10, q11
compute_color \b, q12, q13
compute_color_16 \b1, \b2, q12, q13
vmov.u8 \a1, #255
vmov.u8 \a2, #255
.endm
.macro compute_rgba_32 r g b a
compute_color_32 \r, q8, q9
compute_color_32 \g, q10, q11
compute_color_32 \b, q12, q13
vmov.u8 \a, #255
vmov.u8 \a, #255
.endm
.endm
.macro compute_half_line dst half_y ofmt
@ compute_16px_16: convert 16 luma samples of one line into 16 packed
@ 4-byte pixels and store them.
@   dst:    register holding the output pointer; both stores use writeback,
@           so it ends up advanced past the 16 pixels written
@   y0, y1: D registers with 8 luma bytes each
@   ofmt:   argb, rgba, abgr or bgra; selects the byte order in memory
@ Reads y_offset from r9 and y_coeff from r10. The chroma terms are taken
@ from q8-q13 by compute_rgba_16 (they are produced by compute_premult_16).
@ Overwrites q3-q7, q14 and q15, plus q1/q2 through compute_color_16.
.macro compute_16px_16 dst y0 y1 ofmt
vmovl.u8 q14, \y0 @ 8px of y
vmovl.u8 q15, \y1 @ 8px of y
vdup.16 q5, r9 @ q5 = y_offset
vdup.16 q7, r10 @ q7 = y_coeff
vsub.s16 q14, q5
vsub.s16 q15, q5
vmul.s16 q14, q7 @ q14 = (srcY - y_offset) * y_coeff (left)
vmul.s16 q15, q7 @ q15 = (srcY - y_offset) * y_coeff (right)
@ compute_rgba_16 takes its arguments as r1 r2 g1 g2 b1 b2 a1 a2.
@ d6-d9 (q3, q4) hold the first 8 pixels and d10-d13 (q5, q6) the next 8.
@ Within each group the D register number decides the byte position of a
@ component, so every branch below only permutes the registers.
.ifc \ofmt,argb
compute_rgba_16 d7, d11, d8, d12, d9, d13, d6, d10
.endif
.ifc \ofmt,rgba
compute_rgba_16 d6, d10, d7, d11, d8, d12, d9, d13
.endif
.ifc \ofmt,abgr
compute_rgba_16 d9, d13, d8, d12, d7, d11, d6, d10
.endif
.ifc \ofmt,bgra
compute_rgba_16 d8, d12, d7, d11, d6, d10, d9, d13
.endif
@ vst4.8 interleaves the four D registers byte by byte: 8 pixels of 4 bytes
@ per store. The :128 qualifier asserts a 128-bit aligned address.
vst4.8 {q3, q4}, [\dst,:128]!
vst4.8 {q5, q6}, [\dst,:128]!
.endm
.macro compute_8px_32 dst half_y ofmt
vmovl.u8 q7, \half_y @ 8px of Y
vmovl.u8 q7, \half_y @ 8px of Y
vdup.16 q5, r9
vdup.16 q5, r9
vsub.s16 q7, q5
vsub.s16 q7, q5
...
@@ -63,26 +140,51 @@
...
@@ -63,26 +140,51 @@
vmull.s16 q2, d15, d0 @ q2 = (srcY - y_offset) * y_coeff (right)
vmull.s16 q2, d15, d0 @ q2 = (srcY - y_offset) * y_coeff (right)
.ifc \ofmt,argb
.ifc \ofmt,argb
compute_rgba
d13, d14, d15, d12
compute_rgba
_32
d13, d14, d15, d12
.endif
.endif
.ifc \ofmt,rgba
.ifc \ofmt,rgba
compute_rgba
d12, d13, d14, d15
compute_rgba
_32
d12, d13, d14, d15
.endif
.endif
.ifc \ofmt,abgr
.ifc \ofmt,abgr
compute_rgba
d15, d14, d13, d12
compute_rgba
_32
d15, d14, d13, d12
.endif
.endif
.ifc \ofmt,bgra
.ifc \ofmt,bgra
compute_rgba
d14, d13, d12, d15
compute_rgba
_32
d14, d13, d12, d15
.endif
.endif
vst4.8 {q6, q7}, [\dst,:128]!
vst4.8 {q6, q7}, [\dst,:128]!
.endm
.endm
.macro declare_func ifmt ofmt
.macro process_16px_16 ofmt
function ff_\ifmt\()_to_\ofmt\()_neon, export=1
compute_premult_16 d28, d29, d30, d31
vld1.8 {q7}, [r4]! @ first line of luma
compute_16px_16 r2, d14, d15, \ofmt
vld1.8 {q7}, [r12]! @ second line of luma
compute_16px_16 r11, d14, d15, \ofmt
.endm
@ process_16px_32: handle 16 pixels on each of two lines with the 32-bit
@ path, as four groups of 8 pixels.
@   ofmt: output format name, forwarded to compute_8px_32
@ The first line reads luma through r4 and writes through r2; the second
@ line reads through r12 and writes through r11. Both luma loads
@ post-increment their pointer.
@ Statement order matters: d28 / d30 carry the chroma halves for the left
@ 8 pixels until the first compute_premult_32 has consumed them, and are
@ then reused to park the right 8 luma bytes of each line.
.macro process_16px_32 ofmt
compute_premult_32 d28, d30
vld1.8 {q7}, [r4]! @ first line of luma
vmov d28, d15 @ save right of the first line of luma for later use
compute_8px_32 r2, d14, \ofmt
vld1.8 {q7}, [r12]! @ second line of luma
vmov d30, d15 @ save right of the second line of luma for later use
compute_8px_32 r11, d14, \ofmt
@ Right 8 pixels: chroma halves come from d29 / d31, luma from the saved copies.
compute_premult_32 d29, d31
compute_8px_32 r2, d28, \ofmt
compute_8px_32 r11, d30, \ofmt
.endm
.macro load_args
push {r4-r12, lr}
push {r4-r12, lr}
vpush {q4-q7}
vpush {q4-q7}
ldr r4, [sp, #104] @ r4 = srcY
ldr r4, [sp, #104] @ r4 = srcY
...
@@ -102,6 +204,11 @@ function ff_\ifmt\()_to_\ofmt\()_neon, export=1
...
@@ -102,6 +204,11 @@ function ff_\ifmt\()_to_\ofmt\()_neon, export=1
sub r3, r3, r8 @ r3 = linesize * 2 - width * 4 (padding)
sub r3, r3, r8 @ r3 = linesize * 2 - width * 4 (padding)
sub r5, r5, r0 @ r5 = linesizeY * 2 - width (paddingY)
sub r5, r5, r0 @ r5 = linesizeY * 2 - width (paddingY)
sub r7, r7, r0 @ r7 = linesizeC - width (paddingC)
sub r7, r7, r0 @ r7 = linesizeC - width (paddingC)
.endm
.macro declare_func ifmt ofmt precision
function ff_\ifmt\()_to_\ofmt\()_neon_\precision\(), export=1
load_args
1:
1:
mov r8, r0 @ r8 = width
mov r8, r0 @ r8 = width
2:
2:
...
@@ -119,19 +226,13 @@ function ff_\ifmt\()_to_\ofmt\()_neon, export=1
...
@@ -119,19 +226,13 @@ function ff_\ifmt\()_to_\ofmt\()_neon, export=1
vsubl.u8 q15, d2, d10 @ q15 = V - 128
vsubl.u8 q15, d2, d10 @ q15 = V - 128
.endif
.endif
compute_premult d28, d30
.ifc \precision,16
process_16px_16 \ofmt
vld1.8 {q7}, [r4]! @ first line of luma
.endif
vmov d28, d15 @ save right of the first line of luma for later use
compute_half_line r2, d14, \ofmt
vld1.8 {q7}, [r12]! @ second line of luma
vmov d30, d15 @ save right of the second line of luma for later use
compute_half_line r11, d14, \ofmt
compute_premult d29, d31
.ifc \precision,32
compute_half_line r2, d28,
\ofmt
process_16px_32
\ofmt
compute_half_line r11, d30, \ofmt
.endif
subs r8, r8, #16 @ width -= 16
subs r8, r8, #16 @ width -= 16
bgt 2b
bgt 2b
...
@@ -151,12 +252,14 @@ function ff_\ifmt\()_to_\ofmt\()_neon, export=1
...
@@ -151,12 +252,14 @@ function ff_\ifmt\()_to_\ofmt\()_neon, export=1
endfunc
endfunc
.endm
.endm
.macro declare_rgb_funcs ifmt
.macro declare_rgb_funcs ifmt
precision
declare_func \ifmt, argb
declare_func \ifmt, argb
, \precision
declare_func \ifmt, rgba
declare_func \ifmt, rgba
, \precision
declare_func \ifmt, abgr
declare_func \ifmt, abgr
, \precision
declare_func \ifmt, bgra
declare_func \ifmt, bgra
, \precision
.endm
.endm
declare_rgb_funcs nv12
declare_rgb_funcs nv12, 16
declare_rgb_funcs nv21
declare_rgb_funcs nv21, 16
declare_rgb_funcs nv12, 32
declare_rgb_funcs nv21, 32
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment