Commit a51c78c6 authored Dec 24, 2014 by Stefano Sabatini
lavfi/mp: drop mp=fspp filter
It has been ported to libavfilter.
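
For reference, the drop-in replacement is the native libavfilter fspp filter rather than the mp=fspp wrapper. A minimal usage sketch (the filter name follows from this commit message; the option names shown are assumptions about the lavfi port, not something introduced by this commit):

    ffmpeg -i input.mkv -vf fspp=quality=5:strength=0 output.mkv

Here quality is assumed to correspond to the old pp-level/log2_count setting and strength to the threshold bias of the dropped mp filter.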
parent bdc4db0e
Showing 5 changed files with 1 addition and 2129 deletions
doc/filters.texi                    +0 -1
libavfilter/Makefile                +0 -1
libavfilter/libmpcodecs/vf_fspp.c   +0 -2124
libavfilter/version.h               +1 -1
libavfilter/vf_mp.c                 +0 -2
doc/filters.texi — view file @ a51c78c6
...
@@ -6216,7 +6216,6 @@ The list of the currently supported filters follows:
 @table @var
 @item eq2
 @item eq
-@item fspp
 @item ilpack
 @item pp7
 @item softpulldown
...
libavfilter/Makefile — view file @ a51c78c6
...
@@ -225,7 +225,6 @@ OBJS-$(CONFIG_MP_FILTER) += libmpcodecs/mp_image.o
 OBJS-$(CONFIG_MP_FILTER) += libmpcodecs/img_format.o
 OBJS-$(CONFIG_MP_FILTER) += libmpcodecs/vf_eq2.o
 OBJS-$(CONFIG_MP_FILTER) += libmpcodecs/vf_eq.o
-OBJS-$(CONFIG_MP_FILTER) += libmpcodecs/vf_fspp.o
 OBJS-$(CONFIG_MP_FILTER) += libmpcodecs/vf_ilpack.o
 OBJS-$(CONFIG_MP_FILTER) += libmpcodecs/vf_pp7.o
 OBJS-$(CONFIG_MP_FILTER) += libmpcodecs/vf_softpulldown.o
...
libavfilter/libmpcodecs/vf_fspp.c — deleted, 100644 → 0, view file @ bdc4db0e
/*
* Copyright (C) 2003 Michael Niedermayer <michaelni@gmx.at>
* Copyright (C) 2005 Nikolaj Poroshin <porosh3@psu.ru>
*
* This file is part of MPlayer.
*
* MPlayer is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* MPlayer is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License along
* with MPlayer; if not, write to the Free Software Foundation, Inc.,
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
*/
/*
* This implementation is based on an algorithm described in
* "Aria Nosratinia Embedded Post-Processing for
* Enhancement of Compressed Images (1999)"
* (http://citeseer.nj.nec.com/nosratinia99embedded.html)
* Further, with splitting (i)dct into hor/ver passes, one of them can be
* performed once per block, not pixel. This allows for much better speed.
*/
/*
Heavily optimized version of SPP filter by Nikolaj
*/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <inttypes.h>
#include <math.h>
#include "config.h"
#include "mp_msg.h"
#include "cpudetect.h"
#include "img_format.h"
#include "mp_image.h"
#include "vf.h"
#include "av_helpers.h"
#include "libvo/fastmemcpy.h"
#include "libavutil/internal.h"
#include "libavutil/intreadwrite.h"
#include "libavutil/mem.h"
#include "libavutil/x86/asm.h"
#include "libavcodec/avcodec.h"
#undef free
#undef malloc
//===========================================================================//
#define BLOCKSZ 12
static const short custom_threshold[64]=
// values (296) can't be too high
// -it causes too big quant dependence
// or maybe overflow(check), which results in some flashing
{ 71, 296, 295, 237,  71,  40,  38,  19,
 245, 193, 185, 121, 102,  73,  53,  27,
 158, 129, 141, 107,  97,  73,  50,  26,
 102, 116, 109,  98,  82,  66,  45,  23,
  71,  94,  95,  81,  70,  56,  38,  20,
  56,  77,  74,  66,  56,  44,  30,  15,
  38,  53,  50,  45,  38,  30,  21,  11,
  20,  27,  26,  23,  20,  15,  11,   5
};
DECLARE_ALIGNED(32, static const uint8_t, dither)[8][8] = {
    {  0, 48, 12, 60,  3, 51, 15, 63, },
    { 32, 16, 44, 28, 35, 19, 47, 31, },
    {  8, 56,  4, 52, 11, 59,  7, 55, },
    { 40, 24, 36, 20, 43, 27, 39, 23, },
    {  2, 50, 14, 62,  1, 49, 13, 61, },
    { 34, 18, 46, 30, 33, 17, 45, 29, },
    { 10, 58,  6, 54,  9, 57,  5, 53, },
    { 42, 26, 38, 22, 41, 25, 37, 21, },
};
struct vf_priv_s {
    //align 16 !
    uint64_t threshold_mtx_noq[8*2];
    uint64_t threshold_mtx[8*2]; //used in both C & MMX (& later SSE2) versions

    int log2_count;
    int temp_stride;
    int qp;
    int mpeg2;
    int prev_q;
    uint8_t *src;
    int16_t *temp;
    int bframes;
    char *non_b_qp;
};
#if !HAVE_MMX_INLINE
//This func reads from 1 slice, 1 and clears 0 & 1
static void store_slice_c(uint8_t *dst, int16_t *src, int dst_stride, int src_stride, int width, int height, int log2_scale)
{
    int y, x;
#define STORE(pos) \
    temp= (src[x + pos] + (d[pos]>>log2_scale))>>(6-log2_scale); \
    src[x + pos]=src[x + pos - 8*src_stride]=0; \
    if(temp & 0x100) temp= ~(temp>>31); \
    dst[x + pos]= temp;

    for(y=0; y<height; y++){
        const uint8_t *d= dither[y];
        for(x=0; x<width; x+=8){
            int temp;
            STORE(0);
            STORE(1);
            STORE(2);
            STORE(3);
            STORE(4);
            STORE(5);
            STORE(6);
            STORE(7);
        }
        src+=src_stride;
        dst+=dst_stride;
    }
}
//This func reads from 2 slices, 0 & 2 and clears 2-nd
static void store_slice2_c(uint8_t *dst, int16_t *src, int dst_stride, int src_stride, int width, int height, int log2_scale)
{
    int y, x;
#define STORE2(pos) \
    temp= (src[x + pos] + src[x + pos + 16*src_stride] + (d[pos]>>log2_scale))>>(6-log2_scale); \
    src[x + pos + 16*src_stride]=0; \
    if(temp & 0x100) temp= ~(temp>>31); \
    dst[x + pos]= temp;

    for(y=0; y<height; y++){
        const uint8_t *d= dither[y];
        for(x=0; x<width; x+=8){
            int temp;
            STORE2(0);
            STORE2(1);
            STORE2(2);
            STORE2(3);
            STORE2(4);
            STORE2(5);
            STORE2(6);
            STORE2(7);
        }
        src+=src_stride;
        dst+=dst_stride;
    }
}
static void mul_thrmat_c(struct vf_priv_s *p, int q)
{
    int a;
    for(a=0; a<64; a++)
        ((short*)p->threshold_mtx)[a]= q * ((short*)p->threshold_mtx_noq)[a]; //ints faster in C
}
static void column_fidct_c(int16_t *thr_adr, int16_t *data, int16_t *output, int cnt);
static void row_idct_c(int16_t *workspace, int16_t *output_adr, int output_stride, int cnt);
static void row_fdct_c(int16_t *data, const uint8_t *pixels, int line_size, int cnt);
//this is rather ugly, but there is no need for function pointers
#define store_slice_s store_slice_c
#define store_slice2_s store_slice2_c
#define mul_thrmat_s mul_thrmat_c
#define column_fidct_s column_fidct_c
#define row_idct_s row_idct_c
#define row_fdct_s row_fdct_c
#else
/* HAVE_MMX_INLINE */
//This func reads from 1 slice, 1 and clears 0 & 1
static
void
store_slice_mmx
(
uint8_t
*
dst
,
int16_t
*
src
,
long
dst_stride
,
long
src_stride
,
long
width
,
long
height
,
long
log2_scale
)
{
const
uint8_t
*
od
=&
dither
[
0
][
0
];
const
uint8_t
*
end
=&
dither
[
height
][
0
];
width
=
(
width
+
7
)
&~
7
;
dst_stride
-=
width
;
//src_stride=(src_stride-width)*2;
__asm__
volatile
(
"mov %5, %%"
REG_d
"
\n\t
"
"mov %6, %%"
REG_S
"
\n\t
"
"mov %7, %%"
REG_D
"
\n\t
"
"mov %1, %%"
REG_a
"
\n\t
"
"movd %%"
REG_d
", %%mm5
\n\t
"
"xor $-1, %%"
REG_d
"
\n\t
"
"mov %%"
REG_a
", %%"
REG_c
"
\n\t
"
"add $7, %%"
REG_d
"
\n\t
"
"neg %%"
REG_a
"
\n\t
"
"sub %0, %%"
REG_c
"
\n\t
"
"add %%"
REG_c
", %%"
REG_c
"
\n\t
"
"movd %%"
REG_d
", %%mm2
\n\t
"
"mov %%"
REG_c
", %1
\n\t
"
"mov %2, %%"
REG_d
"
\n\t
"
"shl $4, %%"
REG_a
"
\n\t
"
"2:
\n\t
"
"movq (%%"
REG_d
"), %%mm3
\n\t
"
"movq %%mm3, %%mm4
\n\t
"
"pxor %%mm7, %%mm7
\n\t
"
"punpcklbw %%mm7, %%mm3
\n\t
"
"punpckhbw %%mm7, %%mm4
\n\t
"
"mov %0, %%"
REG_c
"
\n\t
"
"psraw %%mm5, %%mm3
\n\t
"
"psraw %%mm5, %%mm4
\n\t
"
"1:
\n\t
"
"movq %%mm7, (%%"
REG_S
",%%"
REG_a
")
\n\t
"
"movq (%%"
REG_S
"), %%mm0
\n\t
"
"movq 8(%%"
REG_S
"), %%mm1
\n\t
"
"movq %%mm7, 8(%%"
REG_S
",%%"
REG_a
")
\n\t
"
"paddw %%mm3, %%mm0
\n\t
"
"paddw %%mm4, %%mm1
\n\t
"
"movq %%mm7, (%%"
REG_S
")
\n\t
"
"psraw %%mm2, %%mm0
\n\t
"
"psraw %%mm2, %%mm1
\n\t
"
"movq %%mm7, 8(%%"
REG_S
")
\n\t
"
"packuswb %%mm1, %%mm0
\n\t
"
"add $16, %%"
REG_S
"
\n\t
"
"movq %%mm0, (%%"
REG_D
")
\n\t
"
"add $8, %%"
REG_D
"
\n\t
"
"sub $8, %%"
REG_c
"
\n\t
"
"jg 1b
\n\t
"
"add %1, %%"
REG_S
"
\n\t
"
"add $8, %%"
REG_d
"
\n\t
"
"add %3, %%"
REG_D
"
\n\t
"
"cmp %4, %%"
REG_d
"
\n\t
"
"jl 2b
\n\t
"
:
:
"m"
(
width
),
"m"
(
src_stride
),
"erm"
(
od
),
"m"
(
dst_stride
),
"erm"
(
end
),
"m"
(
log2_scale
),
"m"
(
src
),
"m"
(
dst
)
//input
:
"%"
REG_a
,
"%"
REG_c
,
"%"
REG_d
,
"%"
REG_S
,
"%"
REG_D
);
}
//This func reads from 2 slices, 0 & 2 and clears 2-nd
static
void
store_slice2_mmx
(
uint8_t
*
dst
,
int16_t
*
src
,
long
dst_stride
,
long
src_stride
,
long
width
,
long
height
,
long
log2_scale
)
{
const
uint8_t
*
od
=&
dither
[
0
][
0
];
const
uint8_t
*
end
=&
dither
[
height
][
0
];
width
=
(
width
+
7
)
&~
7
;
dst_stride
-=
width
;
//src_stride=(src_stride-width)*2;
__asm__
volatile
(
"mov %5, %%"
REG_d
"
\n\t
"
"mov %6, %%"
REG_S
"
\n\t
"
"mov %7, %%"
REG_D
"
\n\t
"
"mov %1, %%"
REG_a
"
\n\t
"
"movd %%"
REG_d
", %%mm5
\n\t
"
"xor $-1, %%"
REG_d
"
\n\t
"
"mov %%"
REG_a
", %%"
REG_c
"
\n\t
"
"add $7, %%"
REG_d
"
\n\t
"
"sub %0, %%"
REG_c
"
\n\t
"
"add %%"
REG_c
", %%"
REG_c
"
\n\t
"
"movd %%"
REG_d
", %%mm2
\n\t
"
"mov %%"
REG_c
", %1
\n\t
"
"mov %2, %%"
REG_d
"
\n\t
"
"shl $5, %%"
REG_a
"
\n\t
"
"2:
\n\t
"
"movq (%%"
REG_d
"), %%mm3
\n\t
"
"movq %%mm3, %%mm4
\n\t
"
"pxor %%mm7, %%mm7
\n\t
"
"punpcklbw %%mm7, %%mm3
\n\t
"
"punpckhbw %%mm7, %%mm4
\n\t
"
"mov %0, %%"
REG_c
"
\n\t
"
"psraw %%mm5, %%mm3
\n\t
"
"psraw %%mm5, %%mm4
\n\t
"
"1:
\n\t
"
"movq (%%"
REG_S
"), %%mm0
\n\t
"
"movq 8(%%"
REG_S
"), %%mm1
\n\t
"
"paddw %%mm3, %%mm0
\n\t
"
"paddw (%%"
REG_S
",%%"
REG_a
"), %%mm0
\n\t
"
"paddw %%mm4, %%mm1
\n\t
"
"movq 8(%%"
REG_S
",%%"
REG_a
"), %%mm6
\n\t
"
"movq %%mm7, (%%"
REG_S
",%%"
REG_a
")
\n\t
"
"psraw %%mm2, %%mm0
\n\t
"
"paddw %%mm6, %%mm1
\n\t
"
"movq %%mm7, 8(%%"
REG_S
",%%"
REG_a
")
\n\t
"
"psraw %%mm2, %%mm1
\n\t
"
"packuswb %%mm1, %%mm0
\n\t
"
"movq %%mm0, (%%"
REG_D
")
\n\t
"
"add $16, %%"
REG_S
"
\n\t
"
"add $8, %%"
REG_D
"
\n\t
"
"sub $8, %%"
REG_c
"
\n\t
"
"jg 1b
\n\t
"
"add %1, %%"
REG_S
"
\n\t
"
"add $8, %%"
REG_d
"
\n\t
"
"add %3, %%"
REG_D
"
\n\t
"
"cmp %4, %%"
REG_d
"
\n\t
"
"jl 2b
\n\t
"
:
:
"m"
(
width
),
"m"
(
src_stride
),
"erm"
(
od
),
"m"
(
dst_stride
),
"erm"
(
end
),
"m"
(
log2_scale
),
"m"
(
src
),
"m"
(
dst
)
//input
:
"%"
REG_a
,
"%"
REG_c
,
"%"
REG_d
,
"%"
REG_D
,
"%"
REG_S
);
}
static
void
mul_thrmat_mmx
(
struct
vf_priv_s
*
p
,
int
q
)
{
uint64_t
*
adr
=&
p
->
threshold_mtx_noq
[
0
];
__asm__
volatile
(
"movd %0, %%mm7
\n\t
"
"add $8*8*2, %%"
REG_D
"
\n\t
"
"movq 0*8(%%"
REG_S
"), %%mm0
\n\t
"
"punpcklwd %%mm7, %%mm7
\n\t
"
"movq 1*8(%%"
REG_S
"), %%mm1
\n\t
"
"punpckldq %%mm7, %%mm7
\n\t
"
"pmullw %%mm7, %%mm0
\n\t
"
"movq 2*8(%%"
REG_S
"), %%mm2
\n\t
"
"pmullw %%mm7, %%mm1
\n\t
"
"movq 3*8(%%"
REG_S
"), %%mm3
\n\t
"
"pmullw %%mm7, %%mm2
\n\t
"
"movq %%mm0, 0*8(%%"
REG_D
")
\n\t
"
"movq 4*8(%%"
REG_S
"), %%mm4
\n\t
"
"pmullw %%mm7, %%mm3
\n\t
"
"movq %%mm1, 1*8(%%"
REG_D
")
\n\t
"
"movq 5*8(%%"
REG_S
"), %%mm5
\n\t
"
"pmullw %%mm7, %%mm4
\n\t
"
"movq %%mm2, 2*8(%%"
REG_D
")
\n\t
"
"movq 6*8(%%"
REG_S
"), %%mm6
\n\t
"
"pmullw %%mm7, %%mm5
\n\t
"
"movq %%mm3, 3*8(%%"
REG_D
")
\n\t
"
"movq 7*8+0*8(%%"
REG_S
"), %%mm0
\n\t
"
"pmullw %%mm7, %%mm6
\n\t
"
"movq %%mm4, 4*8(%%"
REG_D
")
\n\t
"
"movq 7*8+1*8(%%"
REG_S
"), %%mm1
\n\t
"
"pmullw %%mm7, %%mm0
\n\t
"
"movq %%mm5, 5*8(%%"
REG_D
")
\n\t
"
"movq 7*8+2*8(%%"
REG_S
"), %%mm2
\n\t
"
"pmullw %%mm7, %%mm1
\n\t
"
"movq %%mm6, 6*8(%%"
REG_D
")
\n\t
"
"movq 7*8+3*8(%%"
REG_S
"), %%mm3
\n\t
"
"pmullw %%mm7, %%mm2
\n\t
"
"movq %%mm0, 7*8+0*8(%%"
REG_D
")
\n\t
"
"movq 7*8+4*8(%%"
REG_S
"), %%mm4
\n\t
"
"pmullw %%mm7, %%mm3
\n\t
"
"movq %%mm1, 7*8+1*8(%%"
REG_D
")
\n\t
"
"movq 7*8+5*8(%%"
REG_S
"), %%mm5
\n\t
"
"pmullw %%mm7, %%mm4
\n\t
"
"movq %%mm2, 7*8+2*8(%%"
REG_D
")
\n\t
"
"movq 7*8+6*8(%%"
REG_S
"), %%mm6
\n\t
"
"pmullw %%mm7, %%mm5
\n\t
"
"movq %%mm3, 7*8+3*8(%%"
REG_D
")
\n\t
"
"movq 14*8+0*8(%%"
REG_S
"), %%mm0
\n\t
"
"pmullw %%mm7, %%mm6
\n\t
"
"movq %%mm4, 7*8+4*8(%%"
REG_D
")
\n\t
"
"movq 14*8+1*8(%%"
REG_S
"), %%mm1
\n\t
"
"pmullw %%mm7, %%mm0
\n\t
"
"movq %%mm5, 7*8+5*8(%%"
REG_D
")
\n\t
"
"pmullw %%mm7, %%mm1
\n\t
"
"movq %%mm6, 7*8+6*8(%%"
REG_D
")
\n\t
"
"movq %%mm0, 14*8+0*8(%%"
REG_D
")
\n\t
"
"movq %%mm1, 14*8+1*8(%%"
REG_D
")
\n\t
"
:
"+g"
(
q
),
"+S"
(
adr
),
"+D"
(
adr
)
:
);
}
static
void
column_fidct_mmx
(
int16_t
*
thr_adr
,
int16_t
*
data
,
int16_t
*
output
,
int
cnt
);
static
void
row_idct_mmx
(
int16_t
*
workspace
,
int16_t
*
output_adr
,
int
output_stride
,
int
cnt
);
static
void
row_fdct_mmx
(
int16_t
*
data
,
const
uint8_t
*
pixels
,
int
line_size
,
int
cnt
);
#define store_slice_s store_slice_mmx
#define store_slice2_s store_slice2_mmx
#define mul_thrmat_s mul_thrmat_mmx
#define column_fidct_s column_fidct_mmx
#define row_idct_s row_idct_mmx
#define row_fdct_s row_fdct_mmx
#endif // HAVE_MMX_INLINE
static
void
filter
(
struct
vf_priv_s
*
p
,
uint8_t
*
dst
,
uint8_t
*
src
,
int
dst_stride
,
int
src_stride
,
int
width
,
int
height
,
uint8_t
*
qp_store
,
int
qp_stride
,
int
is_luma
)
{
int
x
,
x0
,
y
,
es
,
qy
,
t
;
const
int
stride
=
is_luma
?
p
->
temp_stride
:
(
width
+
16
);
//((width+16+15)&(~15))
const
int
step
=
6
-
p
->
log2_count
;
const
int
qps
=
3
+
is_luma
;
DECLARE_ALIGNED
(
32
,
int32_t
,
block_align
)[
4
*
8
*
BLOCKSZ
+
4
*
8
*
BLOCKSZ
];
int16_t
*
block
=
(
int16_t
*
)
block_align
;
int16_t
*
block3
=
(
int16_t
*
)(
block_align
+
4
*
8
*
BLOCKSZ
);
memset
(
block3
,
0
,
4
*
8
*
BLOCKSZ
);
//p->src=src-src_stride*8-8;//!
if
(
!
src
||
!
dst
)
return
;
// HACK avoid crash for Y8 colourspace
for
(
y
=
0
;
y
<
height
;
y
++
){
int
index
=
8
+
8
*
stride
+
y
*
stride
;
fast_memcpy
(
p
->
src
+
index
,
src
+
y
*
src_stride
,
width
);
//this line can be avoided by using DR & user fr.buffers
for
(
x
=
0
;
x
<
8
;
x
++
){
p
->
src
[
index
-
x
-
1
]
=
p
->
src
[
index
+
x
];
p
->
src
[
index
+
width
+
x
]
=
p
->
src
[
index
+
width
-
x
-
1
];
}
}
for
(
y
=
0
;
y
<
8
;
y
++
){
fast_memcpy
(
p
->
src
+
(
7
-
y
)
*
stride
,
p
->
src
+
(
y
+
8
)
*
stride
,
stride
);
fast_memcpy
(
p
->
src
+
(
height
+
8
+
y
)
*
stride
,
p
->
src
+
(
height
-
y
+
7
)
*
stride
,
stride
);
}
//FIXME (try edge emu)
for
(
y
=
8
;
y
<
24
;
y
++
)
memset
(
p
->
temp
+
8
+
y
*
stride
,
0
,
width
*
sizeof
(
int16_t
));
for
(
y
=
step
;
y
<
height
+
8
;
y
+=
step
){
//step= 1,2
qy
=
y
-
4
;
if
(
qy
>
height
-
1
)
qy
=
height
-
1
;
if
(
qy
<
0
)
qy
=
0
;
qy
=
(
qy
>>
qps
)
*
qp_stride
;
row_fdct_s
(
block
,
p
->
src
+
y
*
stride
+
2
-
(
y
&
1
),
stride
,
2
);
for
(
x0
=
0
;
x0
<
width
+
8
-
8
*
(
BLOCKSZ
-
1
);
x0
+=
8
*
(
BLOCKSZ
-
1
)){
row_fdct_s
(
block
+
8
*
8
,
p
->
src
+
y
*
stride
+
8
+
x0
+
2
-
(
y
&
1
),
stride
,
2
*
(
BLOCKSZ
-
1
));
if
(
p
->
qp
)
column_fidct_s
((
int16_t
*
)(
&
p
->
threshold_mtx
[
0
]),
block
+
0
*
8
,
block3
+
0
*
8
,
8
*
(
BLOCKSZ
-
1
));
//yes, this is a HOTSPOT
else
for
(
x
=
0
;
x
<
8
*
(
BLOCKSZ
-
1
);
x
+=
8
)
{
t
=
x
+
x0
-
2
;
//correct t=x+x0-2-(y&1), but its the same
if
(
t
<
0
)
t
=
0
;
//t always < width-2
t
=
qp_store
[
qy
+
(
t
>>
qps
)];
t
=
norm_qscale
(
t
,
p
->
mpeg2
);
if
(
t
!=
p
->
prev_q
)
p
->
prev_q
=
t
,
mul_thrmat_s
(
p
,
t
);
column_fidct_s
((
int16_t
*
)(
&
p
->
threshold_mtx
[
0
]),
block
+
x
*
8
,
block3
+
x
*
8
,
8
);
//yes, this is a HOTSPOT
}
row_idct_s
(
block3
+
0
*
8
,
p
->
temp
+
(
y
&
15
)
*
stride
+
x0
+
2
-
(
y
&
1
),
stride
,
2
*
(
BLOCKSZ
-
1
));
memmove
(
block
,
block
+
(
BLOCKSZ
-
1
)
*
64
,
8
*
8
*
sizeof
(
int16_t
));
//cycling
memmove
(
block3
,
block3
+
(
BLOCKSZ
-
1
)
*
64
,
6
*
8
*
sizeof
(
int16_t
));
}
//
es
=
width
+
8
-
x0
;
// 8, ...
if
(
es
>
8
)
row_fdct_s
(
block
+
8
*
8
,
p
->
src
+
y
*
stride
+
8
+
x0
+
2
-
(
y
&
1
),
stride
,
(
es
-
4
)
>>
2
);
column_fidct_s
((
int16_t
*
)(
&
p
->
threshold_mtx
[
0
]),
block
,
block3
,
es
&
(
~
1
));
row_idct_s
(
block3
+
0
*
8
,
p
->
temp
+
(
y
&
15
)
*
stride
+
x0
+
2
-
(
y
&
1
),
stride
,
es
>>
2
);
{
const
int
y1
=
y
-
8
+
step
;
//l5-7 l4-6
if
(
!
(
y1
&
7
)
&&
y1
)
{
if
(
y1
&
8
)
store_slice_s
(
dst
+
(
y1
-
8
)
*
dst_stride
,
p
->
temp
+
8
+
8
*
stride
,
dst_stride
,
stride
,
width
,
8
,
5
-
p
->
log2_count
);
else
store_slice2_s
(
dst
+
(
y1
-
8
)
*
dst_stride
,
p
->
temp
+
8
+
0
*
stride
,
dst_stride
,
stride
,
width
,
8
,
5
-
p
->
log2_count
);
}
}
}
if
(
y
&
7
)
{
// == height & 7
if
(
y
&
8
)
store_slice_s
(
dst
+
((
y
-
8
)
&~
7
)
*
dst_stride
,
p
->
temp
+
8
+
8
*
stride
,
dst_stride
,
stride
,
width
,
y
&
7
,
5
-
p
->
log2_count
);
else
store_slice2_s
(
dst
+
((
y
-
8
)
&~
7
)
*
dst_stride
,
p
->
temp
+
8
+
0
*
stride
,
dst_stride
,
stride
,
width
,
y
&
7
,
5
-
p
->
log2_count
);
}
}
static int config(struct vf_instance *vf,
                  int width, int height, int d_width, int d_height,
                  unsigned int flags, unsigned int outfmt)
{
    int h= (height+16+15)&(~15);

    vf->priv->temp_stride= (width+16+15)&(~15);
    vf->priv->temp= (int16_t *)av_mallocz(vf->priv->temp_stride*3*8*sizeof(int16_t));
    //this can also be avoided, see above
    vf->priv->src = (uint8_t*)av_malloc(vf->priv->temp_stride*h*sizeof(uint8_t));

    return ff_vf_next_config(vf,width,height,d_width,d_height,flags,outfmt);
}
static void get_image(struct vf_instance *vf, mp_image_t *mpi)
{
    if(mpi->flags&MP_IMGFLAG_PRESERVE) return; // don't change
    // ok, we can do pp in-place (or pp disabled):
    vf->dmpi=ff_vf_get_image(vf->next,mpi->imgfmt, mpi->type, mpi->flags, mpi->width, mpi->height);

    mpi->planes[0]=vf->dmpi->planes[0];
    mpi->stride[0]=vf->dmpi->stride[0];
    mpi->width=vf->dmpi->width;
    if(mpi->flags&MP_IMGFLAG_PLANAR){
        mpi->planes[1]=vf->dmpi->planes[1];
        mpi->planes[2]=vf->dmpi->planes[2];
        mpi->stride[1]=vf->dmpi->stride[1];
        mpi->stride[2]=vf->dmpi->stride[2];
    }
    mpi->flags|=MP_IMGFLAG_DIRECT;
}
static
int
put_image
(
struct
vf_instance
*
vf
,
mp_image_t
*
mpi
,
double
pts
)
{
mp_image_t
*
dmpi
;
if
(
!
(
mpi
->
flags
&
MP_IMGFLAG_DIRECT
)){
// no DR, so get a new image! hope we'll get DR buffer:
dmpi
=
ff_vf_get_image
(
vf
->
next
,
mpi
->
imgfmt
,
MP_IMGTYPE_TEMP
,
MP_IMGFLAG_ACCEPT_STRIDE
|
MP_IMGFLAG_PREFER_ALIGNED_STRIDE
,
mpi
->
width
,
mpi
->
height
);
ff_vf_clone_mpi_attributes
(
dmpi
,
mpi
);
}
else
{
dmpi
=
vf
->
dmpi
;
}
vf
->
priv
->
mpeg2
=
mpi
->
qscale_type
;
if
(
mpi
->
pict_type
!=
3
&&
mpi
->
qscale
&&
!
vf
->
priv
->
qp
){
int
w
=
mpi
->
qstride
;
int
h
=
(
mpi
->
h
+
15
)
>>
4
;
if
(
!
w
)
{
w
=
(
mpi
->
w
+
15
)
>>
4
;
h
=
1
;
}
if
(
!
vf
->
priv
->
non_b_qp
)
vf
->
priv
->
non_b_qp
=
malloc
(
w
*
h
);
fast_memcpy
(
vf
->
priv
->
non_b_qp
,
mpi
->
qscale
,
w
*
h
);
}
if
(
vf
->
priv
->
log2_count
||
!
(
mpi
->
flags
&
MP_IMGFLAG_DIRECT
)){
char
*
qp_tab
=
vf
->
priv
->
non_b_qp
;
if
(
vf
->
priv
->
bframes
||
!
qp_tab
)
qp_tab
=
mpi
->
qscale
;
if
(
qp_tab
||
vf
->
priv
->
qp
){
filter
(
vf
->
priv
,
dmpi
->
planes
[
0
],
mpi
->
planes
[
0
],
dmpi
->
stride
[
0
],
mpi
->
stride
[
0
],
mpi
->
w
,
mpi
->
h
,
qp_tab
,
mpi
->
qstride
,
1
);
filter
(
vf
->
priv
,
dmpi
->
planes
[
1
],
mpi
->
planes
[
1
],
dmpi
->
stride
[
1
],
mpi
->
stride
[
1
],
mpi
->
w
>>
mpi
->
chroma_x_shift
,
mpi
->
h
>>
mpi
->
chroma_y_shift
,
qp_tab
,
mpi
->
qstride
,
0
);
filter
(
vf
->
priv
,
dmpi
->
planes
[
2
],
mpi
->
planes
[
2
],
dmpi
->
stride
[
2
],
mpi
->
stride
[
2
],
mpi
->
w
>>
mpi
->
chroma_x_shift
,
mpi
->
h
>>
mpi
->
chroma_y_shift
,
qp_tab
,
mpi
->
qstride
,
0
);
}
else
{
memcpy_pic
(
dmpi
->
planes
[
0
],
mpi
->
planes
[
0
],
mpi
->
w
,
mpi
->
h
,
dmpi
->
stride
[
0
],
mpi
->
stride
[
0
]);
memcpy_pic
(
dmpi
->
planes
[
1
],
mpi
->
planes
[
1
],
mpi
->
w
>>
mpi
->
chroma_x_shift
,
mpi
->
h
>>
mpi
->
chroma_y_shift
,
dmpi
->
stride
[
1
],
mpi
->
stride
[
1
]);
memcpy_pic
(
dmpi
->
planes
[
2
],
mpi
->
planes
[
2
],
mpi
->
w
>>
mpi
->
chroma_x_shift
,
mpi
->
h
>>
mpi
->
chroma_y_shift
,
dmpi
->
stride
[
2
],
mpi
->
stride
[
2
]);
}
}
#if HAVE_MMX_INLINE
if
(
ff_gCpuCaps
.
hasMMX
)
__asm__
volatile
(
"emms
\n\t
"
);
#endif
#if HAVE_MMXEXT_INLINE
if
(
ff_gCpuCaps
.
hasMMX2
)
__asm__
volatile
(
"sfence
\n\t
"
);
#endif
return
ff_vf_next_put_image
(
vf
,
dmpi
,
pts
);
}
static void uninit(struct vf_instance *vf)
{
    if(!vf->priv) return;

    av_free(vf->priv->temp);
    vf->priv->temp= NULL;
    av_free(vf->priv->src);
    vf->priv->src= NULL;
    //free(vf->priv->avctx);
    //vf->priv->avctx= NULL;
    free(vf->priv->non_b_qp);
    vf->priv->non_b_qp= NULL;

    av_free(vf->priv);
    vf->priv= NULL;
}
//===========================================================================//
static int query_format(struct vf_instance *vf, unsigned int fmt)
{
    switch(fmt){
    case IMGFMT_YVU9:
    case IMGFMT_IF09:
    case IMGFMT_YV12:
    case IMGFMT_I420:
    case IMGFMT_IYUV:
    case IMGFMT_CLPL:
    case IMGFMT_Y800:
    case IMGFMT_Y8:
    case IMGFMT_444P:
    case IMGFMT_422P:
    case IMGFMT_411P:
        return ff_vf_next_query_format(vf,fmt);
    }
    return 0;
}
static int control(struct vf_instance *vf, int request, void *data)
{
    switch(request){
    case VFCTRL_QUERY_MAX_PP_LEVEL:
        return 5;
    case VFCTRL_SET_PP_LEVEL:
        vf->priv->log2_count= *((unsigned int*)data);
        if (vf->priv->log2_count < 4) vf->priv->log2_count= 4;
        return CONTROL_TRUE;
    }
    return ff_vf_next_control(vf,request,data);
}
static int vf_open(vf_instance_t *vf, char *args)
{
    int i=0, bias;
    int custom_threshold_m[64];
    int log2c=-1;

    vf->config=config;
    vf->put_image=put_image;
    vf->get_image=get_image;
    vf->query_format=query_format;
    vf->uninit=uninit;
    vf->control= control;
    vf->priv=av_mallocz(sizeof(struct vf_priv_s)); //assumes align 16 !

    ff_init_avcodec();

    //vf->priv->avctx= avcodec_alloc_context();
    //dsputil_init(&vf->priv->dsp, vf->priv->avctx);

    vf->priv->log2_count= 4;
    vf->priv->bframes = 0;

    if (args) sscanf(args, "%d:%d:%d:%d", &log2c, &vf->priv->qp, &i, &vf->priv->bframes);

    if( log2c >=4 && log2c <=5 )
        vf->priv->log2_count = log2c;
    else if( log2c >= 6 )
        vf->priv->log2_count = 5;

    if(vf->priv->qp < 0)
        vf->priv->qp = 0;

    if( i < -15 ) i = -15;
    if( i > 32 ) i = 32;

    bias= (1<<4)+i; //regulable
    vf->priv->prev_q=0;
    //
    for(i=0; i<64; i++) //FIXME: tune custom_threshold[] and remove this !
        custom_threshold_m[i]= (int)(custom_threshold[i]*(bias/71.)+ 0.5);
    for(i=0; i<8; i++){
        vf->priv->threshold_mtx_noq[2*i]= (uint64_t)custom_threshold_m[i*8+2]
            |(((uint64_t)custom_threshold_m[i*8+6])<<16)
            |(((uint64_t)custom_threshold_m[i*8+0])<<32)
            |(((uint64_t)custom_threshold_m[i*8+4])<<48);
        vf->priv->threshold_mtx_noq[2*i+1]= (uint64_t)custom_threshold_m[i*8+5]
            |(((uint64_t)custom_threshold_m[i*8+3])<<16)
            |(((uint64_t)custom_threshold_m[i*8+1])<<32)
            |(((uint64_t)custom_threshold_m[i*8+7])<<48);
    }

    if(vf->priv->qp) vf->priv->prev_q=vf->priv->qp, mul_thrmat_s(vf->priv, vf->priv->qp);

    return 1;
}

const vf_info_t ff_vf_info_fspp = {
    "fast simple postprocess",
    "fspp",
    "Michael Niedermayer, Nikolaj Poroshin",
    "",
    vf_open,
    NULL
};
//====================================================================
//Specific spp's dct, idct and threshold functions
//I'd prefer to have them in the separate file.
//#define MANGLE(a) #a
//typedef int16_t int16_t; //! only int16_t
#define DCTSIZE 8
#define DCTSIZE_S "8"
#define FIX(x,s) ((int) ((x) * (1<<s) + 0.5)&0xffff)
#define C64(x) ((uint64_t)((x)|(x)<<16))<<32 | (uint64_t)(x) | (uint64_t)(x)<<16
#define FIX64(x,s) C64(FIX(x,s))
#define MULTIPLY16H(x,k) (((x)*(k))>>16)
#define THRESHOLD(r,x,t) if(((unsigned)((x)+t))>t*2) r=(x);else r=0;
#define DESCALE(x,n) (((x) + (1 << ((n)-1))) >> n)
#if HAVE_MMX_INLINE
DECLARE_ASM_CONST(8, uint64_t, MM_FIX_0_382683433)=FIX64(0.382683433, 14);
extern uint64_t ff_MM_FIX_0_707106781;
extern uint64_t ff_MM_FIX_0_541196100;
DECLARE_ASM_CONST(8, uint64_t, MM_FIX_1_306562965)=FIX64(1.306562965, 14);
DECLARE_ASM_CONST(8, uint64_t, MM_FIX_1_414213562_A)=FIX64(1.414213562, 14);
DECLARE_ASM_CONST(8, uint64_t, MM_FIX_1_847759065)=FIX64(1.847759065, 13);
DECLARE_ASM_CONST(8, uint64_t, MM_FIX_2_613125930)=FIX64(-2.613125930, 13); //-
DECLARE_ASM_CONST(8, uint64_t, MM_FIX_1_414213562)=FIX64(1.414213562, 13);
DECLARE_ASM_CONST(8, uint64_t, MM_FIX_1_082392200)=FIX64(1.082392200, 13);
//for t3,t5,t7 == 0 shortcut
DECLARE_ASM_CONST(8, uint64_t, MM_FIX_0_847759065)=FIX64(0.847759065, 14);
DECLARE_ASM_CONST(8, uint64_t, MM_FIX_0_566454497)=FIX64(0.566454497, 14);
DECLARE_ASM_CONST(8, uint64_t, MM_FIX_0_198912367)=FIX64(0.198912367, 14);
DECLARE_ASM_CONST(8, uint64_t, MM_DESCALE_RND)=C64(4);
DECLARE_ASM_CONST(8, uint64_t, MM_2)=C64(2);
#else
/* !HAVE_MMX_INLINE */
typedef int32_t int_simd16_t;
static const int16_t FIX_0_382683433=FIX(0.382683433, 14);
static const int16_t FIX_0_541196100=FIX(0.541196100, 14);
static const int16_t FIX_0_707106781=FIX(0.707106781, 14);
static const int16_t FIX_1_306562965=FIX(1.306562965, 14);
static const int16_t FIX_1_414213562_A=FIX(1.414213562, 14);
static const int16_t FIX_1_847759065=FIX(1.847759065, 13);
static const int16_t FIX_2_613125930=FIX(-2.613125930, 13); //-
static const int16_t FIX_1_414213562=FIX(1.414213562, 13);
static const int16_t FIX_1_082392200=FIX(1.082392200, 13);
#endif
#if !HAVE_MMX_INLINE
static
void
column_fidct_c
(
int16_t
*
thr_adr
,
int16_t
*
data
,
int16_t
*
output
,
int
cnt
)
{
int_simd16_t
tmp0
,
tmp1
,
tmp2
,
tmp3
,
tmp4
,
tmp5
,
tmp6
,
tmp7
;
int_simd16_t
tmp10
,
tmp11
,
tmp12
,
tmp13
;
int_simd16_t
z1
,
z2
,
z3
,
z4
,
z5
,
z10
,
z11
,
z12
,
z13
;
int_simd16_t
d0
,
d1
,
d2
,
d3
,
d4
,
d5
,
d6
,
d7
;
int16_t
*
dataptr
;
int16_t
*
wsptr
;
int16_t
*
threshold
;
int
ctr
;
dataptr
=
data
;
wsptr
=
output
;
for
(;
cnt
>
0
;
cnt
-=
2
)
{
//start positions
threshold
=
(
int16_t
*
)
thr_adr
;
//threshold_mtx
for
(
ctr
=
DCTSIZE
;
ctr
>
0
;
ctr
--
)
{
// Process columns from input, add to output.
tmp0
=
dataptr
[
DCTSIZE
*
0
]
+
dataptr
[
DCTSIZE
*
7
];
tmp7
=
dataptr
[
DCTSIZE
*
0
]
-
dataptr
[
DCTSIZE
*
7
];
tmp1
=
dataptr
[
DCTSIZE
*
1
]
+
dataptr
[
DCTSIZE
*
6
];
tmp6
=
dataptr
[
DCTSIZE
*
1
]
-
dataptr
[
DCTSIZE
*
6
];
tmp2
=
dataptr
[
DCTSIZE
*
2
]
+
dataptr
[
DCTSIZE
*
5
];
tmp5
=
dataptr
[
DCTSIZE
*
2
]
-
dataptr
[
DCTSIZE
*
5
];
tmp3
=
dataptr
[
DCTSIZE
*
3
]
+
dataptr
[
DCTSIZE
*
4
];
tmp4
=
dataptr
[
DCTSIZE
*
3
]
-
dataptr
[
DCTSIZE
*
4
];
// Even part of FDCT
tmp10
=
tmp0
+
tmp3
;
tmp13
=
tmp0
-
tmp3
;
tmp11
=
tmp1
+
tmp2
;
tmp12
=
tmp1
-
tmp2
;
d0
=
tmp10
+
tmp11
;
d4
=
tmp10
-
tmp11
;
z1
=
MULTIPLY16H
((
tmp12
+
tmp13
)
<<
2
,
FIX_0_707106781
);
d2
=
tmp13
+
z1
;
d6
=
tmp13
-
z1
;
// Even part of IDCT
THRESHOLD
(
tmp0
,
d0
,
threshold
[
0
*
8
]);
THRESHOLD
(
tmp1
,
d2
,
threshold
[
2
*
8
]);
THRESHOLD
(
tmp2
,
d4
,
threshold
[
4
*
8
]);
THRESHOLD
(
tmp3
,
d6
,
threshold
[
6
*
8
]);
tmp0
+=
2
;
tmp10
=
(
tmp0
+
tmp2
)
>>
2
;
tmp11
=
(
tmp0
-
tmp2
)
>>
2
;
tmp13
=
(
tmp1
+
tmp3
)
>>
2
;
//+2 ! (psnr decides)
tmp12
=
MULTIPLY16H
((
tmp1
-
tmp3
),
FIX_1_414213562_A
)
-
tmp13
;
//<<2
tmp0
=
tmp10
+
tmp13
;
//->temps
tmp3
=
tmp10
-
tmp13
;
//->temps
tmp1
=
tmp11
+
tmp12
;
//->temps
tmp2
=
tmp11
-
tmp12
;
//->temps
// Odd part of FDCT
tmp10
=
tmp4
+
tmp5
;
tmp11
=
tmp5
+
tmp6
;
tmp12
=
tmp6
+
tmp7
;
z5
=
MULTIPLY16H
((
tmp10
-
tmp12
)
<<
2
,
FIX_0_382683433
);
z2
=
MULTIPLY16H
(
tmp10
<<
2
,
FIX_0_541196100
)
+
z5
;
z4
=
MULTIPLY16H
(
tmp12
<<
2
,
FIX_1_306562965
)
+
z5
;
z3
=
MULTIPLY16H
(
tmp11
<<
2
,
FIX_0_707106781
);
z11
=
tmp7
+
z3
;
z13
=
tmp7
-
z3
;
d5
=
z13
+
z2
;
d3
=
z13
-
z2
;
d1
=
z11
+
z4
;
d7
=
z11
-
z4
;
// Odd part of IDCT
THRESHOLD
(
tmp4
,
d1
,
threshold
[
1
*
8
]);
THRESHOLD
(
tmp5
,
d3
,
threshold
[
3
*
8
]);
THRESHOLD
(
tmp6
,
d5
,
threshold
[
5
*
8
]);
THRESHOLD
(
tmp7
,
d7
,
threshold
[
7
*
8
]);
//Simd version uses here a shortcut for the tmp5,tmp6,tmp7 == 0
z13
=
tmp6
+
tmp5
;
z10
=
(
tmp6
-
tmp5
)
<<
1
;
z11
=
tmp4
+
tmp7
;
z12
=
(
tmp4
-
tmp7
)
<<
1
;
tmp7
=
(
z11
+
z13
)
>>
2
;
//+2 !
tmp11
=
MULTIPLY16H
((
z11
-
z13
)
<<
1
,
FIX_1_414213562
);
z5
=
MULTIPLY16H
(
z10
+
z12
,
FIX_1_847759065
);
tmp10
=
MULTIPLY16H
(
z12
,
FIX_1_082392200
)
-
z5
;
tmp12
=
MULTIPLY16H
(
z10
,
FIX_2_613125930
)
+
z5
;
// - !!
tmp6
=
tmp12
-
tmp7
;
tmp5
=
tmp11
-
tmp6
;
tmp4
=
tmp10
+
tmp5
;
wsptr
[
DCTSIZE
*
0
]
+=
(
tmp0
+
tmp7
);
wsptr
[
DCTSIZE
*
1
]
+=
(
tmp1
+
tmp6
);
wsptr
[
DCTSIZE
*
2
]
+=
(
tmp2
+
tmp5
);
wsptr
[
DCTSIZE
*
3
]
+=
(
tmp3
-
tmp4
);
wsptr
[
DCTSIZE
*
4
]
+=
(
tmp3
+
tmp4
);
wsptr
[
DCTSIZE
*
5
]
+=
(
tmp2
-
tmp5
);
wsptr
[
DCTSIZE
*
6
]
=
(
tmp1
-
tmp6
);
wsptr
[
DCTSIZE
*
7
]
=
(
tmp0
-
tmp7
);
//
dataptr
++
;
//next column
wsptr
++
;
threshold
++
;
}
dataptr
+=
8
;
//skip each second start pos
wsptr
+=
8
;
}
}
#else
/* HAVE_MMX_INLINE */
static
void
column_fidct_mmx
(
int16_t
*
thr_adr
,
int16_t
*
data
,
int16_t
*
output
,
int
cnt
)
{
DECLARE_ALIGNED
(
8
,
uint64_t
,
temps
)[
4
];
__asm__
volatile
(
ASMALIGN
(
4
)
"1:
\n\t
"
"movq "
DCTSIZE_S
"*0*2(%%"
REG_S
"), %%mm1
\n\t
"
//
"movq "
DCTSIZE_S
"*3*2(%%"
REG_S
"), %%mm7
\n\t
"
"movq %%mm1, %%mm0
\n\t
"
"paddw "
DCTSIZE_S
"*7*2(%%"
REG_S
"), %%mm1
\n\t
"
//t0
"movq %%mm7, %%mm3
\n\t
"
"paddw "
DCTSIZE_S
"*4*2(%%"
REG_S
"), %%mm7
\n\t
"
//t3
"movq %%mm1, %%mm5
\n\t
"
"movq "
DCTSIZE_S
"*1*2(%%"
REG_S
"), %%mm6
\n\t
"
"psubw %%mm7, %%mm1
\n\t
"
//t13
"movq "
DCTSIZE_S
"*2*2(%%"
REG_S
"), %%mm2
\n\t
"
"movq %%mm6, %%mm4
\n\t
"
"paddw "
DCTSIZE_S
"*6*2(%%"
REG_S
"), %%mm6
\n\t
"
//t1
"paddw %%mm7, %%mm5
\n\t
"
//t10
"paddw "
DCTSIZE_S
"*5*2(%%"
REG_S
"), %%mm2
\n\t
"
//t2
"movq %%mm6, %%mm7
\n\t
"
"paddw %%mm2, %%mm6
\n\t
"
//t11
"psubw %%mm2, %%mm7
\n\t
"
//t12
"movq %%mm5, %%mm2
\n\t
"
"paddw %%mm6, %%mm5
\n\t
"
//d0
// i0 t13 t12 i3 i1 d0 - d4
"psubw %%mm6, %%mm2
\n\t
"
//d4
"paddw %%mm1, %%mm7
\n\t
"
"movq 4*16(%%"
REG_d
"), %%mm6
\n\t
"
"psllw $2, %%mm7
\n\t
"
"psubw 0*16(%%"
REG_d
"), %%mm5
\n\t
"
"psubw %%mm6, %%mm2
\n\t
"
"paddusw 0*16(%%"
REG_d
"), %%mm5
\n\t
"
"paddusw %%mm6, %%mm2
\n\t
"
"pmulhw "
MANGLE
(
ff_MM_FIX_0_707106781
)
", %%mm7
\n\t
"
//
"paddw 0*16(%%"
REG_d
"), %%mm5
\n\t
"
"paddw %%mm6, %%mm2
\n\t
"
"psubusw 0*16(%%"
REG_d
"), %%mm5
\n\t
"
"psubusw %%mm6, %%mm2
\n\t
"
//This func is totally compute-bound, operates at huge speed. So, DC shortcut
// at this place isn't worthwhile due to BTB miss penalty (checked on Pent. 3).
//However, typical numbers: nondc - 29%%, dc - 46%%, zero - 25%%. All <> 0 case is very rare.
"paddw "
MANGLE
(
MM_2
)
", %%mm5
\n\t
"
"movq %%mm2, %%mm6
\n\t
"
"paddw %%mm5, %%mm2
\n\t
"
"psubw %%mm6, %%mm5
\n\t
"
"movq %%mm1, %%mm6
\n\t
"
"paddw %%mm7, %%mm1
\n\t
"
//d2
"psubw 2*16(%%"
REG_d
"), %%mm1
\n\t
"
"psubw %%mm7, %%mm6
\n\t
"
//d6
"movq 6*16(%%"
REG_d
"), %%mm7
\n\t
"
"psraw $2, %%mm5
\n\t
"
"paddusw 2*16(%%"
REG_d
"), %%mm1
\n\t
"
"psubw %%mm7, %%mm6
\n\t
"
// t7 d2 /t11 t4 t6 - d6 /t10
"paddw 2*16(%%"
REG_d
"), %%mm1
\n\t
"
"paddusw %%mm7, %%mm6
\n\t
"
"psubusw 2*16(%%"
REG_d
"), %%mm1
\n\t
"
"paddw %%mm7, %%mm6
\n\t
"
"psubw "
DCTSIZE_S
"*4*2(%%"
REG_S
"), %%mm3
\n\t
"
"psubusw %%mm7, %%mm6
\n\t
"
//movq [edi+"DCTSIZE_S"*2*2], mm1
//movq [edi+"DCTSIZE_S"*6*2], mm6
"movq %%mm1, %%mm7
\n\t
"
"psraw $2, %%mm2
\n\t
"
"psubw "
DCTSIZE_S
"*6*2(%%"
REG_S
"), %%mm4
\n\t
"
"psubw %%mm6, %%mm1
\n\t
"
"psubw "
DCTSIZE_S
"*7*2(%%"
REG_S
"), %%mm0
\n\t
"
"paddw %%mm7, %%mm6
\n\t
"
//'t13
"psraw $2, %%mm6
\n\t
"
//paddw mm6, MM_2 !! ---
"movq %%mm2, %%mm7
\n\t
"
"pmulhw "
MANGLE
(
MM_FIX_1_414213562_A
)
", %%mm1
\n\t
"
"paddw %%mm6, %%mm2
\n\t
"
//'t0
"movq %%mm2, 0*8+%3
\n\t
"
//!
"psubw %%mm6, %%mm7
\n\t
"
//'t3
"movq "
DCTSIZE_S
"*2*2(%%"
REG_S
"), %%mm2
\n\t
"
"psubw %%mm6, %%mm1
\n\t
"
//'t12
"psubw "
DCTSIZE_S
"*5*2(%%"
REG_S
"), %%mm2
\n\t
"
//t5
"movq %%mm5, %%mm6
\n\t
"
"movq %%mm7, 3*8+%3
\n\t
"
"paddw %%mm2, %%mm3
\n\t
"
//t10
"paddw %%mm4, %%mm2
\n\t
"
//t11
"paddw %%mm0, %%mm4
\n\t
"
//t12
"movq %%mm3, %%mm7
\n\t
"
"psubw %%mm4, %%mm3
\n\t
"
"psllw $2, %%mm3
\n\t
"
"psllw $2, %%mm7
\n\t
"
//opt for P6
"pmulhw "
MANGLE
(
MM_FIX_0_382683433
)
", %%mm3
\n\t
"
"psllw $2, %%mm4
\n\t
"
"pmulhw "
MANGLE
(
ff_MM_FIX_0_541196100
)
", %%mm7
\n\t
"
"psllw $2, %%mm2
\n\t
"
"pmulhw "
MANGLE
(
MM_FIX_1_306562965
)
", %%mm4
\n\t
"
"paddw %%mm1, %%mm5
\n\t
"
//'t1
"pmulhw "
MANGLE
(
ff_MM_FIX_0_707106781
)
", %%mm2
\n\t
"
"psubw %%mm1, %%mm6
\n\t
"
//'t2
// t7 't12 't11 t4 t6 - 't13 't10 ---
"paddw %%mm3, %%mm7
\n\t
"
//z2
"movq %%mm5, 1*8+%3
\n\t
"
"paddw %%mm3, %%mm4
\n\t
"
//z4
"movq 3*16(%%"
REG_d
"), %%mm3
\n\t
"
"movq %%mm0, %%mm1
\n\t
"
"movq %%mm6, 2*8+%3
\n\t
"
"psubw %%mm2, %%mm1
\n\t
"
//z13
//===
"paddw %%mm2, %%mm0
\n\t
"
//z11
"movq %%mm1, %%mm5
\n\t
"
"movq 5*16(%%"
REG_d
"), %%mm2
\n\t
"
"psubw %%mm7, %%mm1
\n\t
"
//d3
"paddw %%mm7, %%mm5
\n\t
"
//d5
"psubw %%mm3, %%mm1
\n\t
"
"movq 1*16(%%"
REG_d
"), %%mm7
\n\t
"
"psubw %%mm2, %%mm5
\n\t
"
"movq %%mm0, %%mm6
\n\t
"
"paddw %%mm4, %%mm0
\n\t
"
//d1
"paddusw %%mm3, %%mm1
\n\t
"
"psubw %%mm4, %%mm6
\n\t
"
//d7
// d1 d3 - - - d5 d7 -
"movq 7*16(%%"
REG_d
"), %%mm4
\n\t
"
"psubw %%mm7, %%mm0
\n\t
"
"psubw %%mm4, %%mm6
\n\t
"
"paddusw %%mm2, %%mm5
\n\t
"
"paddusw %%mm4, %%mm6
\n\t
"
"paddw %%mm3, %%mm1
\n\t
"
"paddw %%mm2, %%mm5
\n\t
"
"paddw %%mm4, %%mm6
\n\t
"
"psubusw %%mm3, %%mm1
\n\t
"
"psubusw %%mm2, %%mm5
\n\t
"
"psubusw %%mm4, %%mm6
\n\t
"
"movq %%mm1, %%mm4
\n\t
"
"por %%mm5, %%mm4
\n\t
"
"paddusw %%mm7, %%mm0
\n\t
"
"por %%mm6, %%mm4
\n\t
"
"paddw %%mm7, %%mm0
\n\t
"
"packssdw %%mm4, %%mm4
\n\t
"
"psubusw %%mm7, %%mm0
\n\t
"
"movd %%mm4, %%"
REG_a
"
\n\t
"
"or %%"
REG_a
", %%"
REG_a
"
\n\t
"
"jnz 2f
\n\t
"
//movq [edi+"DCTSIZE_S"*3*2], mm1
//movq [edi+"DCTSIZE_S"*5*2], mm5
//movq [edi+"DCTSIZE_S"*1*2], mm0
//movq [edi+"DCTSIZE_S"*7*2], mm6
// t4 t5 - - - t6 t7 -
//--- t4 (mm0) may be <>0; mm1, mm5, mm6 == 0
//Typical numbers: nondc - 19%%, dc - 26%%, zero - 55%%. zero case alone isn't worthwhile
"movq 0*8+%3, %%mm4
\n\t
"
"movq %%mm0, %%mm1
\n\t
"
"pmulhw "
MANGLE
(
MM_FIX_0_847759065
)
", %%mm0
\n\t
"
//tmp6
"movq %%mm1, %%mm2
\n\t
"
"movq "
DCTSIZE_S
"*0*2(%%"
REG_D
"), %%mm5
\n\t
"
"movq %%mm2, %%mm3
\n\t
"
"pmulhw "
MANGLE
(
MM_FIX_0_566454497
)
", %%mm1
\n\t
"
//tmp5
"paddw %%mm4, %%mm5
\n\t
"
"movq 1*8+%3, %%mm6
\n\t
"
//paddw mm3, MM_2
"psraw $2, %%mm3
\n\t
"
//tmp7
"pmulhw "
MANGLE
(
MM_FIX_0_198912367
)
", %%mm2
\n\t
"
//-tmp4
"psubw %%mm3, %%mm4
\n\t
"
"movq "
DCTSIZE_S
"*1*2(%%"
REG_D
"), %%mm7
\n\t
"
"paddw %%mm3, %%mm5
\n\t
"
"movq %%mm4, "
DCTSIZE_S
"*7*2(%%"
REG_D
")
\n\t
"
"paddw %%mm6, %%mm7
\n\t
"
"movq 2*8+%3, %%mm3
\n\t
"
"psubw %%mm0, %%mm6
\n\t
"
"movq "
DCTSIZE_S
"*2*2(%%"
REG_D
"), %%mm4
\n\t
"
"paddw %%mm0, %%mm7
\n\t
"
"movq %%mm5, "
DCTSIZE_S
"*0*2(%%"
REG_D
")
\n\t
"
"paddw %%mm3, %%mm4
\n\t
"
"movq %%mm6, "
DCTSIZE_S
"*6*2(%%"
REG_D
")
\n\t
"
"psubw %%mm1, %%mm3
\n\t
"
"movq "
DCTSIZE_S
"*5*2(%%"
REG_D
"), %%mm5
\n\t
"
"paddw %%mm1, %%mm4
\n\t
"
"movq "
DCTSIZE_S
"*3*2(%%"
REG_D
"), %%mm6
\n\t
"
"paddw %%mm3, %%mm5
\n\t
"
"movq 3*8+%3, %%mm0
\n\t
"
"add $8, %%"
REG_S
"
\n\t
"
"movq %%mm7, "
DCTSIZE_S
"*1*2(%%"
REG_D
")
\n\t
"
"paddw %%mm0, %%mm6
\n\t
"
"movq %%mm4, "
DCTSIZE_S
"*2*2(%%"
REG_D
")
\n\t
"
"psubw %%mm2, %%mm0
\n\t
"
"movq "
DCTSIZE_S
"*4*2(%%"
REG_D
"), %%mm7
\n\t
"
"paddw %%mm2, %%mm6
\n\t
"
"movq %%mm5, "
DCTSIZE_S
"*5*2(%%"
REG_D
")
\n\t
"
"paddw %%mm0, %%mm7
\n\t
"
"movq %%mm6, "
DCTSIZE_S
"*3*2(%%"
REG_D
")
\n\t
"
"movq %%mm7, "
DCTSIZE_S
"*4*2(%%"
REG_D
")
\n\t
"
"add $8, %%"
REG_D
"
\n\t
"
"jmp 4f
\n\t
"
"2:
\n\t
"
//--- non DC2
//psraw mm1, 2 w/o it -> offset. thr1, thr1, thr1 (actually thr1, thr1, thr1-1)
//psraw mm5, 2
//psraw mm0, 2
//psraw mm6, 2
"movq %%mm5, %%mm3
\n\t
"
"psubw %%mm1, %%mm5
\n\t
"
"psllw $1, %%mm5
\n\t
"
//'z10
"paddw %%mm1, %%mm3
\n\t
"
//'z13
"movq %%mm0, %%mm2
\n\t
"
"psubw %%mm6, %%mm0
\n\t
"
"movq %%mm5, %%mm1
\n\t
"
"psllw $1, %%mm0
\n\t
"
//'z12
"pmulhw "
MANGLE
(
MM_FIX_2_613125930
)
", %%mm1
\n\t
"
//-
"paddw %%mm0, %%mm5
\n\t
"
"pmulhw "
MANGLE
(
MM_FIX_1_847759065
)
", %%mm5
\n\t
"
//'z5
"paddw %%mm6, %%mm2
\n\t
"
//'z11
"pmulhw "
MANGLE
(
MM_FIX_1_082392200
)
", %%mm0
\n\t
"
"movq %%mm2, %%mm7
\n\t
"
//---
"movq 0*8+%3, %%mm4
\n\t
"
"psubw %%mm3, %%mm2
\n\t
"
"psllw $1, %%mm2
\n\t
"
"paddw %%mm3, %%mm7
\n\t
"
//'t7
"pmulhw "
MANGLE
(
MM_FIX_1_414213562
)
", %%mm2
\n\t
"
//'t11
"movq %%mm4, %%mm6
\n\t
"
//paddw mm7, MM_2
"psraw $2, %%mm7
\n\t
"
"paddw "
DCTSIZE_S
"*0*2(%%"
REG_D
"), %%mm4
\n\t
"
"psubw %%mm7, %%mm6
\n\t
"
"movq 1*8+%3, %%mm3
\n\t
"
"paddw %%mm7, %%mm4
\n\t
"
"movq %%mm6, "
DCTSIZE_S
"*7*2(%%"
REG_D
")
\n\t
"
"paddw %%mm5, %%mm1
\n\t
"
//'t12
"movq %%mm4, "
DCTSIZE_S
"*0*2(%%"
REG_D
")
\n\t
"
"psubw %%mm7, %%mm1
\n\t
"
//'t6
"movq 2*8+%3, %%mm7
\n\t
"
"psubw %%mm5, %%mm0
\n\t
"
//'t10
"movq 3*8+%3, %%mm6
\n\t
"
"movq %%mm3, %%mm5
\n\t
"
"paddw "
DCTSIZE_S
"*1*2(%%"
REG_D
"), %%mm3
\n\t
"
"psubw %%mm1, %%mm5
\n\t
"
"psubw %%mm1, %%mm2
\n\t
"
//'t5
"paddw %%mm1, %%mm3
\n\t
"
"movq %%mm5, "
DCTSIZE_S
"*6*2(%%"
REG_D
")
\n\t
"
"movq %%mm7, %%mm4
\n\t
"
"paddw "
DCTSIZE_S
"*2*2(%%"
REG_D
"), %%mm7
\n\t
"
"psubw %%mm2, %%mm4
\n\t
"
"paddw "
DCTSIZE_S
"*5*2(%%"
REG_D
"), %%mm4
\n\t
"
"paddw %%mm2, %%mm7
\n\t
"
"movq %%mm3, "
DCTSIZE_S
"*1*2(%%"
REG_D
")
\n\t
"
"paddw %%mm2, %%mm0
\n\t
"
//'t4
// 't4 't6 't5 - - - - 't7
"movq %%mm7, "
DCTSIZE_S
"*2*2(%%"
REG_D
")
\n\t
"
"movq %%mm6, %%mm1
\n\t
"
"paddw "
DCTSIZE_S
"*4*2(%%"
REG_D
"), %%mm6
\n\t
"
"psubw %%mm0, %%mm1
\n\t
"
"paddw "
DCTSIZE_S
"*3*2(%%"
REG_D
"), %%mm1
\n\t
"
"paddw %%mm0, %%mm6
\n\t
"
"movq %%mm4, "
DCTSIZE_S
"*5*2(%%"
REG_D
")
\n\t
"
"add $8, %%"
REG_S
"
\n\t
"
"movq %%mm6, "
DCTSIZE_S
"*4*2(%%"
REG_D
")
\n\t
"
"movq %%mm1, "
DCTSIZE_S
"*3*2(%%"
REG_D
")
\n\t
"
"add $8, %%"
REG_D
"
\n\t
"
"4:
\n\t
"
//=part 2 (the same)===========================================================
"movq "
DCTSIZE_S
"*0*2(%%"
REG_S
"), %%mm1
\n\t
"
//
"movq "
DCTSIZE_S
"*3*2(%%"
REG_S
"), %%mm7
\n\t
"
"movq %%mm1, %%mm0
\n\t
"
"paddw "
DCTSIZE_S
"*7*2(%%"
REG_S
"), %%mm1
\n\t
"
//t0
"movq %%mm7, %%mm3
\n\t
"
"paddw "
DCTSIZE_S
"*4*2(%%"
REG_S
"), %%mm7
\n\t
"
//t3
"movq %%mm1, %%mm5
\n\t
"
"movq "
DCTSIZE_S
"*1*2(%%"
REG_S
"), %%mm6
\n\t
"
"psubw %%mm7, %%mm1
\n\t
"
//t13
"movq "
DCTSIZE_S
"*2*2(%%"
REG_S
"), %%mm2
\n\t
"
"movq %%mm6, %%mm4
\n\t
"
"paddw "
DCTSIZE_S
"*6*2(%%"
REG_S
"), %%mm6
\n\t
"
//t1
"paddw %%mm7, %%mm5
\n\t
"
//t10
"paddw "
DCTSIZE_S
"*5*2(%%"
REG_S
"), %%mm2
\n\t
"
//t2
"movq %%mm6, %%mm7
\n\t
"
"paddw %%mm2, %%mm6
\n\t
"
//t11
"psubw %%mm2, %%mm7
\n\t
"
//t12
"movq %%mm5, %%mm2
\n\t
"
"paddw %%mm6, %%mm5
\n\t
"
//d0
// i0 t13 t12 i3 i1 d0 - d4
"psubw %%mm6, %%mm2
\n\t
"
//d4
"paddw %%mm1, %%mm7
\n\t
"
"movq 1*8+4*16(%%"
REG_d
"), %%mm6
\n\t
"
"psllw $2, %%mm7
\n\t
"
"psubw 1*8+0*16(%%"
REG_d
"), %%mm5
\n\t
"
"psubw %%mm6, %%mm2
\n\t
"
"paddusw 1*8+0*16(%%"
REG_d
"), %%mm5
\n\t
"
"paddusw %%mm6, %%mm2
\n\t
"
"pmulhw "
MANGLE
(
ff_MM_FIX_0_707106781
)
", %%mm7
\n\t
"
//
"paddw 1*8+0*16(%%"
REG_d
"), %%mm5
\n\t
"
"paddw %%mm6, %%mm2
\n\t
"
"psubusw 1*8+0*16(%%"
REG_d
"), %%mm5
\n\t
"
"psubusw %%mm6, %%mm2
\n\t
"
//This func is totally compute-bound, operates at huge speed. So, DC shortcut
// at this place isn't worthwhile due to BTB miss penalty (checked on Pent. 3).
//However, typical numbers: nondc - 29%%, dc - 46%%, zero - 25%%. All <> 0 case is very rare.
"paddw "
MANGLE
(
MM_2
)
", %%mm5
\n\t
"
"movq %%mm2, %%mm6
\n\t
"
"paddw %%mm5, %%mm2
\n\t
"
"psubw %%mm6, %%mm5
\n\t
"
"movq %%mm1, %%mm6
\n\t
"
"paddw %%mm7, %%mm1
\n\t
"
//d2
"psubw 1*8+2*16(%%"
REG_d
"), %%mm1
\n\t
"
"psubw %%mm7, %%mm6
\n\t
"
//d6
"movq 1*8+6*16(%%"
REG_d
"), %%mm7
\n\t
"
"psraw $2, %%mm5
\n\t
"
"paddusw 1*8+2*16(%%"
REG_d
"), %%mm1
\n\t
"
"psubw %%mm7, %%mm6
\n\t
"
// t7 d2 /t11 t4 t6 - d6 /t10
"paddw 1*8+2*16(%%"
REG_d
"), %%mm1
\n\t
"
"paddusw %%mm7, %%mm6
\n\t
"
"psubusw 1*8+2*16(%%"
REG_d
"), %%mm1
\n\t
"
"paddw %%mm7, %%mm6
\n\t
"
"psubw "
DCTSIZE_S
"*4*2(%%"
REG_S
"), %%mm3
\n\t
"
"psubusw %%mm7, %%mm6
\n\t
"
//movq [edi+"DCTSIZE_S"*2*2], mm1
//movq [edi+"DCTSIZE_S"*6*2], mm6
"movq %%mm1, %%mm7
\n\t
"
"psraw $2, %%mm2
\n\t
"
"psubw "
DCTSIZE_S
"*6*2(%%"
REG_S
"), %%mm4
\n\t
"
"psubw %%mm6, %%mm1
\n\t
"
"psubw "
DCTSIZE_S
"*7*2(%%"
REG_S
"), %%mm0
\n\t
"
"paddw %%mm7, %%mm6
\n\t
"
//'t13
"psraw $2, %%mm6
\n\t
"
//paddw mm6, MM_2 !! ---
"movq %%mm2, %%mm7
\n\t
"
"pmulhw "
MANGLE
(
MM_FIX_1_414213562_A
)
", %%mm1
\n\t
"
"paddw %%mm6, %%mm2
\n\t
"
//'t0
"movq %%mm2, 0*8+%3
\n\t
"
//!
"psubw %%mm6, %%mm7
\n\t
"
//'t3
"movq "
DCTSIZE_S
"*2*2(%%"
REG_S
"), %%mm2
\n\t
"
"psubw %%mm6, %%mm1
\n\t
"
//'t12
"psubw "
DCTSIZE_S
"*5*2(%%"
REG_S
"), %%mm2
\n\t
"
//t5
"movq %%mm5, %%mm6
\n\t
"
"movq %%mm7, 3*8+%3
\n\t
"
"paddw %%mm2, %%mm3
\n\t
"
//t10
"paddw %%mm4, %%mm2
\n\t
"
//t11
"paddw %%mm0, %%mm4
\n\t
"
//t12
"movq %%mm3, %%mm7
\n\t
"
"psubw %%mm4, %%mm3
\n\t
"
"psllw $2, %%mm3
\n\t
"
"psllw $2, %%mm7
\n\t
"
//opt for P6
"pmulhw "
MANGLE
(
MM_FIX_0_382683433
)
", %%mm3
\n\t
"
"psllw $2, %%mm4
\n\t
"
"pmulhw "
MANGLE
(
ff_MM_FIX_0_541196100
)
", %%mm7
\n\t
"
"psllw $2, %%mm2
\n\t
"
"pmulhw "
MANGLE
(
MM_FIX_1_306562965
)
", %%mm4
\n\t
"
"paddw %%mm1, %%mm5
\n\t
"
//'t1
"pmulhw "
MANGLE
(
ff_MM_FIX_0_707106781
)
", %%mm2
\n\t
"
"psubw %%mm1, %%mm6
\n\t
"
//'t2
// t7 't12 't11 t4 t6 - 't13 't10 ---
"paddw %%mm3, %%mm7
\n\t
"
//z2
"movq %%mm5, 1*8+%3
\n\t
"
"paddw %%mm3, %%mm4
\n\t
"
//z4
"movq 1*8+3*16(%%"
REG_d
"), %%mm3
\n\t
"
"movq %%mm0, %%mm1
\n\t
"
"movq %%mm6, 2*8+%3
\n\t
"
"psubw %%mm2, %%mm1
\n\t
"
//z13
//===
"paddw %%mm2, %%mm0
\n\t
"
//z11
"movq %%mm1, %%mm5
\n\t
"
"movq 1*8+5*16(%%"
REG_d
"), %%mm2
\n\t
"
"psubw %%mm7, %%mm1
\n\t
"
//d3
"paddw %%mm7, %%mm5
\n\t
"
//d5
"psubw %%mm3, %%mm1
\n\t
"
"movq 1*8+1*16(%%"
REG_d
"), %%mm7
\n\t
"
"psubw %%mm2, %%mm5
\n\t
"
"movq %%mm0, %%mm6
\n\t
"
"paddw %%mm4, %%mm0
\n\t
"
//d1
"paddusw %%mm3, %%mm1
\n\t
"
"psubw %%mm4, %%mm6
\n\t
"
//d7
// d1 d3 - - - d5 d7 -
"movq 1*8+7*16(%%"
REG_d
"), %%mm4
\n\t
"
"psubw %%mm7, %%mm0
\n\t
"
"psubw %%mm4, %%mm6
\n\t
"
"paddusw %%mm2, %%mm5
\n\t
"
"paddusw %%mm4, %%mm6
\n\t
"
"paddw %%mm3, %%mm1
\n\t
"
"paddw %%mm2, %%mm5
\n\t
"
"paddw %%mm4, %%mm6
\n\t
"
"psubusw %%mm3, %%mm1
\n\t
"
"psubusw %%mm2, %%mm5
\n\t
"
"psubusw %%mm4, %%mm6
\n\t
"
"movq %%mm1, %%mm4
\n\t
"
"por %%mm5, %%mm4
\n\t
"
"paddusw %%mm7, %%mm0
\n\t
"
"por %%mm6, %%mm4
\n\t
"
"paddw %%mm7, %%mm0
\n\t
"
"packssdw %%mm4, %%mm4
\n\t
"
"psubusw %%mm7, %%mm0
\n\t
"
"movd %%mm4, %%"
REG_a
"
\n\t
"
"or %%"
REG_a
", %%"
REG_a
"
\n\t
"
"jnz 3f
\n\t
"
//movq [edi+"DCTSIZE_S"*3*2], mm1
//movq [edi+"DCTSIZE_S"*5*2], mm5
//movq [edi+"DCTSIZE_S"*1*2], mm0
//movq [edi+"DCTSIZE_S"*7*2], mm6
// t4 t5 - - - t6 t7 -
//--- t4 (mm0) may be <>0; mm1, mm5, mm6 == 0
//Typical numbers: nondc - 19%%, dc - 26%%, zero - 55%%. zero case alone isn't worthwhile
"movq 0*8+%3, %%mm4
\n\t
"
"movq %%mm0, %%mm1
\n\t
"
"pmulhw "
MANGLE
(
MM_FIX_0_847759065
)
", %%mm0
\n\t
"
//tmp6
"movq %%mm1, %%mm2
\n\t
"
"movq "
DCTSIZE_S
"*0*2(%%"
REG_D
"), %%mm5
\n\t
"
"movq %%mm2, %%mm3
\n\t
"
"pmulhw "
MANGLE
(
MM_FIX_0_566454497
)
", %%mm1
\n\t
"
//tmp5
"paddw %%mm4, %%mm5
\n\t
"
"movq 1*8+%3, %%mm6
\n\t
"
//paddw mm3, MM_2
"psraw $2, %%mm3
\n\t
"
//tmp7
"pmulhw "
MANGLE
(
MM_FIX_0_198912367
)
", %%mm2
\n\t
"
//-tmp4
"psubw %%mm3, %%mm4
\n\t
"
"movq "
DCTSIZE_S
"*1*2(%%"
REG_D
"), %%mm7
\n\t
"
"paddw %%mm3, %%mm5
\n\t
"
"movq %%mm4, "
DCTSIZE_S
"*7*2(%%"
REG_D
")
\n\t
"
"paddw %%mm6, %%mm7
\n\t
"
"movq 2*8+%3, %%mm3
\n\t
"
"psubw %%mm0, %%mm6
\n\t
"
"movq "
DCTSIZE_S
"*2*2(%%"
REG_D
"), %%mm4
\n\t
"
"paddw %%mm0, %%mm7
\n\t
"
"movq %%mm5, "
DCTSIZE_S
"*0*2(%%"
REG_D
")
\n\t
"
"paddw %%mm3, %%mm4
\n\t
"
"movq %%mm6, "
DCTSIZE_S
"*6*2(%%"
REG_D
")
\n\t
"
"psubw %%mm1, %%mm3
\n\t
"
"movq "
DCTSIZE_S
"*5*2(%%"
REG_D
"), %%mm5
\n\t
"
"paddw %%mm1, %%mm4
\n\t
"
"movq "
DCTSIZE_S
"*3*2(%%"
REG_D
"), %%mm6
\n\t
"
"paddw %%mm3, %%mm5
\n\t
"
"movq 3*8+%3, %%mm0
\n\t
"
"add $24, %%"
REG_S
"
\n\t
"
"movq %%mm7, "
DCTSIZE_S
"*1*2(%%"
REG_D
")
\n\t
"
"paddw %%mm0, %%mm6
\n\t
"
"movq %%mm4, "
DCTSIZE_S
"*2*2(%%"
REG_D
")
\n\t
"
"psubw %%mm2, %%mm0
\n\t
"
"movq "
DCTSIZE_S
"*4*2(%%"
REG_D
"), %%mm7
\n\t
"
"paddw %%mm2, %%mm6
\n\t
"
"movq %%mm5, "
DCTSIZE_S
"*5*2(%%"
REG_D
")
\n\t
"
"paddw %%mm0, %%mm7
\n\t
"
"movq %%mm6, "
DCTSIZE_S
"*3*2(%%"
REG_D
")
\n\t
"
"movq %%mm7, "
DCTSIZE_S
"*4*2(%%"
REG_D
")
\n\t
"
"add $24, %%"
REG_D
"
\n\t
"
"sub $2, %%"
REG_c
"
\n\t
"
"jnz 1b
\n\t
"
"jmp 5f
\n\t
"
"3:
\n\t
"
//--- non DC2
//psraw mm1, 2 w/o it -> offset. thr1, thr1, thr1 (actually thr1, thr1, thr1-1)
//psraw mm5, 2
//psraw mm0, 2
//psraw mm6, 2
"movq %%mm5, %%mm3
\n\t
"
"psubw %%mm1, %%mm5
\n\t
"
"psllw $1, %%mm5
\n\t
"
//'z10
"paddw %%mm1, %%mm3
\n\t
"
//'z13
"movq %%mm0, %%mm2
\n\t
"
"psubw %%mm6, %%mm0
\n\t
"
"movq %%mm5, %%mm1
\n\t
"
"psllw $1, %%mm0
\n\t
"
//'z12
"pmulhw "
MANGLE
(
MM_FIX_2_613125930
)
", %%mm1
\n\t
"
//-
"paddw %%mm0, %%mm5
\n\t
"
"pmulhw "
MANGLE
(
MM_FIX_1_847759065
)
", %%mm5
\n\t
"
//'z5
"paddw %%mm6, %%mm2
\n\t
"
//'z11
"pmulhw "
MANGLE
(
MM_FIX_1_082392200
)
", %%mm0
\n\t
"
"movq %%mm2, %%mm7
\n\t
"
//---
"movq 0*8+%3, %%mm4
\n\t
"
"psubw %%mm3, %%mm2
\n\t
"
"psllw $1, %%mm2
\n\t
"
"paddw %%mm3, %%mm7
\n\t
"
//'t7
"pmulhw "
MANGLE
(
MM_FIX_1_414213562
)
", %%mm2
\n\t
"
//'t11
"movq %%mm4, %%mm6
\n\t
"
//paddw mm7, MM_2
"psraw $2, %%mm7
\n\t
"
"paddw "
DCTSIZE_S
"*0*2(%%"
REG_D
"), %%mm4
\n\t
"
"psubw %%mm7, %%mm6
\n\t
"
"movq 1*8+%3, %%mm3
\n\t
"
"paddw %%mm7, %%mm4
\n\t
"
"movq %%mm6, "
DCTSIZE_S
"*7*2(%%"
REG_D
")
\n\t
"
"paddw %%mm5, %%mm1
\n\t
"
//'t12
"movq %%mm4, "
DCTSIZE_S
"*0*2(%%"
REG_D
")
\n\t
"
"psubw %%mm7, %%mm1
\n\t
"
//'t6
"movq 2*8+%3, %%mm7
\n\t
"
"psubw %%mm5, %%mm0
\n\t
"
//'t10
"movq 3*8+%3, %%mm6
\n\t
"
"movq %%mm3, %%mm5
\n\t
"
"paddw "
DCTSIZE_S
"*1*2(%%"
REG_D
"), %%mm3
\n\t
"
"psubw %%mm1, %%mm5
\n\t
"
"psubw %%mm1, %%mm2
\n\t
"
//'t5
"paddw %%mm1, %%mm3
\n\t
"
"movq %%mm5, "
DCTSIZE_S
"*6*2(%%"
REG_D
")
\n\t
"
"movq %%mm7, %%mm4
\n\t
"
"paddw "
DCTSIZE_S
"*2*2(%%"
REG_D
"), %%mm7
\n\t
"
"psubw %%mm2, %%mm4
\n\t
"
"paddw "
DCTSIZE_S
"*5*2(%%"
REG_D
"), %%mm4
\n\t
"
"paddw %%mm2, %%mm7
\n\t
"
"movq %%mm3, "
DCTSIZE_S
"*1*2(%%"
REG_D
")
\n\t
"
"paddw %%mm2, %%mm0
\n\t
"
//'t4
// 't4 't6 't5 - - - - 't7
"movq %%mm7, "
DCTSIZE_S
"*2*2(%%"
REG_D
")
\n\t
"
"movq %%mm6, %%mm1
\n\t
"
"paddw "
DCTSIZE_S
"*4*2(%%"
REG_D
"), %%mm6
\n\t
"
"psubw %%mm0, %%mm1
\n\t
"
"paddw "
DCTSIZE_S
"*3*2(%%"
REG_D
"), %%mm1
\n\t
"
"paddw %%mm0, %%mm6
\n\t
"
"movq %%mm4, "
DCTSIZE_S
"*5*2(%%"
REG_D
")
\n\t
"
"add $24, %%"
REG_S
"
\n\t
"
"movq %%mm6, "
DCTSIZE_S
"*4*2(%%"
REG_D
")
\n\t
"
"movq %%mm1, "
DCTSIZE_S
"*3*2(%%"
REG_D
")
\n\t
"
"add $24, %%"
REG_D
"
\n\t
"
"sub $2, %%"
REG_c
"
\n\t
"
"jnz 1b
\n\t
"
"5:
\n\t
"
:
"+S"
(
data
),
"+D"
(
output
),
"+c"
(
cnt
),
"=o"
(
temps
)
:
"d"
(
thr_adr
)
NAMED_CONSTRAINTS_ADD
(
ff_MM_FIX_0_707106781
,
MM_2
,
MM_FIX_1_414213562_A
,
MM_FIX_1_414213562
,
MM_FIX_0_382683433
,
ff_MM_FIX_0_541196100
,
MM_FIX_1_306562965
,
MM_FIX_0_847759065
)
NAMED_CONSTRAINTS_ADD
(
MM_FIX_0_566454497
,
MM_FIX_0_198912367
,
MM_FIX_2_613125930
,
MM_FIX_1_847759065
,
MM_FIX_1_082392200
)
:
"%"
REG_a
);
}
#endif // HAVE_MMX_INLINE
#if !HAVE_MMX_INLINE
static
void
row_idct_c
(
int16_t
*
workspace
,
int16_t
*
output_adr
,
int
output_stride
,
int
cnt
)
{
int_simd16_t
tmp0
,
tmp1
,
tmp2
,
tmp3
,
tmp4
,
tmp5
,
tmp6
,
tmp7
;
int_simd16_t
tmp10
,
tmp11
,
tmp12
,
tmp13
;
int_simd16_t
z5
,
z10
,
z11
,
z12
,
z13
;
int16_t
*
outptr
;
int16_t
*
wsptr
;
cnt
*=
4
;
wsptr
=
workspace
;
outptr
=
output_adr
;
for
(;
cnt
>
0
;
cnt
--
)
{
// Even part
//Simd version reads 4x4 block and transposes it
tmp10
=
(
wsptr
[
2
]
+
wsptr
[
3
]);
tmp11
=
(
wsptr
[
2
]
-
wsptr
[
3
]);
tmp13
=
(
wsptr
[
0
]
+
wsptr
[
1
]);
tmp12
=
(
MULTIPLY16H
(
wsptr
[
0
]
-
wsptr
[
1
],
FIX_1_414213562_A
)
<<
2
)
-
tmp13
;
//this shift order to avoid overflow
tmp0
=
tmp10
+
tmp13
;
//->temps
tmp3
=
tmp10
-
tmp13
;
//->temps
tmp1
=
tmp11
+
tmp12
;
tmp2
=
tmp11
-
tmp12
;
// Odd part
//Also transpose, with previous:
// ---- ---- ||||
// ---- ---- idct ||||
// ---- ---- ---> ||||
// ---- ---- ||||
z13
=
wsptr
[
4
]
+
wsptr
[
5
];
z10
=
wsptr
[
4
]
-
wsptr
[
5
];
z11
=
wsptr
[
6
]
+
wsptr
[
7
];
z12
=
wsptr
[
6
]
-
wsptr
[
7
];
tmp7
=
z11
+
z13
;
tmp11
=
MULTIPLY16H
(
z11
-
z13
,
FIX_1_414213562
);
z5
=
MULTIPLY16H
(
z10
+
z12
,
FIX_1_847759065
);
tmp10
=
MULTIPLY16H
(
z12
,
FIX_1_082392200
)
-
z5
;
tmp12
=
MULTIPLY16H
(
z10
,
FIX_2_613125930
)
+
z5
;
// - FIX_
tmp6
=
(
tmp12
<<
3
)
-
tmp7
;
tmp5
=
(
tmp11
<<
3
)
-
tmp6
;
tmp4
=
(
tmp10
<<
3
)
+
tmp5
;
// Final output stage: descale and write column
outptr
[
0
*
output_stride
]
+=
DESCALE
(
tmp0
+
tmp7
,
3
);
outptr
[
1
*
output_stride
]
+=
DESCALE
(
tmp1
+
tmp6
,
3
);
outptr
[
2
*
output_stride
]
+=
DESCALE
(
tmp2
+
tmp5
,
3
);
outptr
[
3
*
output_stride
]
+=
DESCALE
(
tmp3
-
tmp4
,
3
);
outptr
[
4
*
output_stride
]
+=
DESCALE
(
tmp3
+
tmp4
,
3
);
outptr
[
5
*
output_stride
]
+=
DESCALE
(
tmp2
-
tmp5
,
3
);
outptr
[
6
*
output_stride
]
+=
DESCALE
(
tmp1
-
tmp6
,
3
);
//no += ?
outptr
[
7
*
output_stride
]
+=
DESCALE
(
tmp0
-
tmp7
,
3
);
//no += ?
outptr
++
;
wsptr
+=
DCTSIZE
;
// advance pointer to next row
}
}
#else
/* HAVE_MMX_INLINE */
static
void
row_idct_mmx
(
int16_t
*
workspace
,
int16_t
*
output_adr
,
int
output_stride
,
int
cnt
)
{
DECLARE_ALIGNED
(
8
,
uint64_t
,
temps
)[
4
];
__asm__
volatile
(
"lea (%%"
REG_a
",%%"
REG_a
",2), %%"
REG_d
"
\n\t
"
"1:
\n\t
"
"movq "
DCTSIZE_S
"*0*2(%%"
REG_S
"), %%mm0
\n\t
"
//
"movq "
DCTSIZE_S
"*1*2(%%"
REG_S
"), %%mm1
\n\t
"
"movq %%mm0, %%mm4
\n\t
"
"movq "
DCTSIZE_S
"*2*2(%%"
REG_S
"), %%mm2
\n\t
"
"punpcklwd %%mm1, %%mm0
\n\t
"
"movq "
DCTSIZE_S
"*3*2(%%"
REG_S
"), %%mm3
\n\t
"
"punpckhwd %%mm1, %%mm4
\n\t
"
//transpose 4x4
"movq %%mm2, %%mm7
\n\t
"
"punpcklwd %%mm3, %%mm2
\n\t
"
"movq %%mm0, %%mm6
\n\t
"
"punpckldq %%mm2, %%mm0
\n\t
"
//0
"punpckhdq %%mm2, %%mm6
\n\t
"
//1
"movq %%mm0, %%mm5
\n\t
"
"punpckhwd %%mm3, %%mm7
\n\t
"
"psubw %%mm6, %%mm0
\n\t
"
"pmulhw "
MANGLE
(
MM_FIX_1_414213562_A
)
", %%mm0
\n\t
"
"movq %%mm4, %%mm2
\n\t
"
"punpckldq %%mm7, %%mm4
\n\t
"
//2
"paddw %%mm6, %%mm5
\n\t
"
"punpckhdq %%mm7, %%mm2
\n\t
"
//3
"movq %%mm4, %%mm1
\n\t
"
"psllw $2, %%mm0
\n\t
"
"paddw %%mm2, %%mm4
\n\t
"
//t10
"movq "
DCTSIZE_S
"*0*2+"
DCTSIZE_S
"(%%"
REG_S
"), %%mm3
\n\t
"
"psubw %%mm2, %%mm1
\n\t
"
//t11
"movq "
DCTSIZE_S
"*1*2+"
DCTSIZE_S
"(%%"
REG_S
"), %%mm2
\n\t
"
"psubw %%mm5, %%mm0
\n\t
"
"movq %%mm4, %%mm6
\n\t
"
"paddw %%mm5, %%mm4
\n\t
"
//t0
"psubw %%mm5, %%mm6
\n\t
"
//t3
"movq %%mm1, %%mm7
\n\t
"
"movq "
DCTSIZE_S
"*2*2+"
DCTSIZE_S
"(%%"
REG_S
"), %%mm5
\n\t
"
"paddw %%mm0, %%mm1
\n\t
"
//t1
"movq %%mm4, 0*8+%3
\n\t
"
//t0
"movq %%mm3, %%mm4
\n\t
"
"movq %%mm6, 1*8+%3
\n\t
"
//t3
"punpcklwd %%mm2, %%mm3
\n\t
"
//transpose 4x4
"movq "
DCTSIZE_S
"*3*2+"
DCTSIZE_S
"(%%"
REG_S
"), %%mm6
\n\t
"
"punpckhwd %%mm2, %%mm4
\n\t
"
"movq %%mm5, %%mm2
\n\t
"
"punpcklwd %%mm6, %%mm5
\n\t
"
"psubw %%mm0, %%mm7
\n\t
"
//t2
"punpckhwd %%mm6, %%mm2
\n\t
"
"movq %%mm3, %%mm0
\n\t
"
"punpckldq %%mm5, %%mm3
\n\t
"
//4
"punpckhdq %%mm5, %%mm0
\n\t
"
//5
"movq %%mm4, %%mm5
\n\t
"
//
"movq %%mm3, %%mm6
\n\t
"
"punpckldq %%mm2, %%mm4
\n\t
"
//6
"psubw %%mm0, %%mm3
\n\t
"
//z10
"punpckhdq %%mm2, %%mm5
\n\t
"
//7
"paddw %%mm0, %%mm6
\n\t
"
//z13
"movq %%mm4, %%mm2
\n\t
"
"movq %%mm3, %%mm0
\n\t
"
"psubw %%mm5, %%mm4
\n\t
"
//z12
"pmulhw "
MANGLE
(
MM_FIX_2_613125930
)
", %%mm0
\n\t
"
//-
"paddw %%mm4, %%mm3
\n\t
"
"pmulhw "
MANGLE
(
MM_FIX_1_847759065
)
", %%mm3
\n\t
"
//z5
"paddw %%mm5, %%mm2
\n\t
"
//z11 >
"pmulhw "
MANGLE
(
MM_FIX_1_082392200
)
", %%mm4
\n\t
"
"movq %%mm2, %%mm5
\n\t
"
"psubw %%mm6, %%mm2
\n\t
"
"paddw %%mm6, %%mm5
\n\t
"
//t7
"pmulhw "
MANGLE
(
MM_FIX_1_414213562
)
", %%mm2
\n\t
"
//t11
"paddw %%mm3, %%mm0
\n\t
"
//t12
"psllw $3, %%mm0
\n\t
"
"psubw %%mm3, %%mm4
\n\t
"
//t10
"movq 0*8+%3, %%mm6
\n\t
"
"movq %%mm1, %%mm3
\n\t
"
"psllw $3, %%mm4
\n\t
"
"psubw %%mm5, %%mm0
\n\t
"
//t6
"psllw $3, %%mm2
\n\t
"
"paddw %%mm0, %%mm1
\n\t
"
//d1
"psubw %%mm0, %%mm2
\n\t
"
//t5
"psubw %%mm0, %%mm3
\n\t
"
//d6
"paddw %%mm2, %%mm4
\n\t
"
//t4
"movq %%mm7, %%mm0
\n\t
"
"paddw %%mm2, %%mm7
\n\t
"
//d2
"psubw %%mm2, %%mm0
\n\t
"
//d5
"movq "
MANGLE
(
MM_DESCALE_RND
)
", %%mm2
\n\t
"
//4
"psubw %%mm5, %%mm6
\n\t
"
//d7
"paddw 0*8+%3, %%mm5
\n\t
"
//d0
"paddw %%mm2, %%mm1
\n\t
"
"paddw %%mm2, %%mm5
\n\t
"
"psraw $3, %%mm1
\n\t
"
"paddw %%mm2, %%mm7
\n\t
"
"psraw $3, %%mm5
\n\t
"
"paddw (%%"
REG_D
"), %%mm5
\n\t
"
"psraw $3, %%mm7
\n\t
"
"paddw (%%"
REG_D
",%%"
REG_a
"), %%mm1
\n\t
"
"paddw %%mm2, %%mm0
\n\t
"
"paddw (%%"
REG_D
",%%"
REG_a
",2), %%mm7
\n\t
"
"paddw %%mm2, %%mm3
\n\t
"
"movq %%mm5, (%%"
REG_D
")
\n\t
"
"paddw %%mm2, %%mm6
\n\t
"
"movq %%mm1, (%%"
REG_D
",%%"
REG_a
")
\n\t
"
"psraw $3, %%mm0
\n\t
"
"movq %%mm7, (%%"
REG_D
",%%"
REG_a
",2)
\n\t
"
"add %%"
REG_d
", %%"
REG_D
"
\n\t
"
//3*ls
"movq 1*8+%3, %%mm5
\n\t
"
//t3
"psraw $3, %%mm3
\n\t
"
"paddw (%%"
REG_D
",%%"
REG_a
",2), %%mm0
\n\t
"
"psubw %%mm4, %%mm5
\n\t
"
//d3
"paddw (%%"
REG_D
",%%"
REG_d
"), %%mm3
\n\t
"
"psraw $3, %%mm6
\n\t
"
"paddw 1*8+%3, %%mm4
\n\t
"
//d4
"paddw %%mm2, %%mm5
\n\t
"
"paddw (%%"
REG_D
",%%"
REG_a
",4), %%mm6
\n\t
"
"paddw %%mm2, %%mm4
\n\t
"
"movq %%mm0, (%%"
REG_D
",%%"
REG_a
",2)
\n\t
"
"psraw $3, %%mm5
\n\t
"
"paddw (%%"
REG_D
"), %%mm5
\n\t
"
"psraw $3, %%mm4
\n\t
"
"paddw (%%"
REG_D
",%%"
REG_a
"), %%mm4
\n\t
"
"add $"
DCTSIZE_S
"*2*4, %%"
REG_S
"
\n\t
"
//4 rows
"movq %%mm3, (%%"
REG_D
",%%"
REG_d
")
\n\t
"
"movq %%mm6, (%%"
REG_D
",%%"
REG_a
",4)
\n\t
"
"movq %%mm5, (%%"
REG_D
")
\n\t
"
"movq %%mm4, (%%"
REG_D
",%%"
REG_a
")
\n\t
"
"sub %%"
REG_d
", %%"
REG_D
"
\n\t
"
"add $8, %%"
REG_D
"
\n\t
"
"dec %%"
REG_c
"
\n\t
"
"jnz 1b
\n\t
"
:
"+S"
(
workspace
),
"+D"
(
output_adr
),
"+c"
(
cnt
),
"=o"
(
temps
)
:
"a"
(
output_stride
*
sizeof
(
short
))
NAMED_CONSTRAINTS_ADD
(
MM_FIX_1_414213562_A
,
MM_FIX_2_613125930
,
MM_FIX_1_847759065
,
MM_FIX_1_082392200
,
MM_FIX_1_414213562
,
MM_DESCALE_RND
)
:
"%"
REG_d
);
}
#endif // HAVE_MMX_INLINE
#if !HAVE_MMX_INLINE
static
void
row_fdct_c
(
int16_t
*
data
,
const
uint8_t
*
pixels
,
int
line_size
,
int
cnt
)
{
int_simd16_t
tmp0
,
tmp1
,
tmp2
,
tmp3
,
tmp4
,
tmp5
,
tmp6
,
tmp7
;
int_simd16_t
tmp10
,
tmp11
,
tmp12
,
tmp13
;
int_simd16_t
z1
,
z2
,
z3
,
z4
,
z5
,
z11
,
z13
;
int16_t
*
dataptr
;
cnt
*=
4
;
// Pass 1: process rows.
dataptr
=
data
;
for
(;
cnt
>
0
;
cnt
--
)
{
tmp0
=
pixels
[
line_size
*
0
]
+
pixels
[
line_size
*
7
];
tmp7
=
pixels
[
line_size
*
0
]
-
pixels
[
line_size
*
7
];
tmp1
=
pixels
[
line_size
*
1
]
+
pixels
[
line_size
*
6
];
tmp6
=
pixels
[
line_size
*
1
]
-
pixels
[
line_size
*
6
];
tmp2
=
pixels
[
line_size
*
2
]
+
pixels
[
line_size
*
5
];
tmp5
=
pixels
[
line_size
*
2
]
-
pixels
[
line_size
*
5
];
tmp3
=
pixels
[
line_size
*
3
]
+
pixels
[
line_size
*
4
];
tmp4
=
pixels
[
line_size
*
3
]
-
pixels
[
line_size
*
4
];
// Even part
tmp10
=
tmp0
+
tmp3
;
tmp13
=
tmp0
-
tmp3
;
tmp11
=
tmp1
+
tmp2
;
tmp12
=
tmp1
-
tmp2
;
//Even columns are written first, this leads to different order of columns
//in column_fidct(), but they are processed independently, so all ok.
//Later in the row_idct() columns readed at the same order.
dataptr
[
2
]
=
tmp10
+
tmp11
;
dataptr
[
3
]
=
tmp10
-
tmp11
;
z1
=
MULTIPLY16H
((
tmp12
+
tmp13
)
<<
2
,
FIX_0_707106781
);
dataptr
[
0
]
=
tmp13
+
z1
;
dataptr
[
1
]
=
tmp13
-
z1
;
// Odd part
tmp10
=
(
tmp4
+
tmp5
)
<<
2
;
tmp11
=
(
tmp5
+
tmp6
)
<<
2
;
tmp12
=
(
tmp6
+
tmp7
)
<<
2
;
z5
=
MULTIPLY16H
(
tmp10
-
tmp12
,
FIX_0_382683433
);
z2
=
MULTIPLY16H
(
tmp10
,
FIX_0_541196100
)
+
z5
;
z4
=
MULTIPLY16H
(
tmp12
,
FIX_1_306562965
)
+
z5
;
z3
=
MULTIPLY16H
(
tmp11
,
FIX_0_707106781
);
z11
=
tmp7
+
z3
;
z13
=
tmp7
-
z3
;
dataptr
[
4
]
=
z13
+
z2
;
dataptr
[
5
]
=
z13
-
z2
;
dataptr
[
6
]
=
z11
+
z4
;
dataptr
[
7
]
=
z11
-
z4
;
pixels
++
;
// advance pointer to next column
dataptr
+=
DCTSIZE
;
}
}
#else
/* HAVE_MMX_INLINE */
static
void
row_fdct_mmx
(
int16_t
*
data
,
const
uint8_t
*
pixels
,
int
line_size
,
int
cnt
)
{
DECLARE_ALIGNED
(
8
,
uint64_t
,
temps
)[
4
];
__asm__
volatile
(
"lea (%%"
REG_a
",%%"
REG_a
",2), %%"
REG_d
"
\n\t
"
"6:
\n\t
"
"movd (%%"
REG_S
"), %%mm0
\n\t
"
"pxor %%mm7, %%mm7
\n\t
"
"movd (%%"
REG_S
",%%"
REG_a
"), %%mm1
\n\t
"
"punpcklbw %%mm7, %%mm0
\n\t
"
"movd (%%"
REG_S
",%%"
REG_a
",2), %%mm2
\n\t
"
"punpcklbw %%mm7, %%mm1
\n\t
"
"punpcklbw %%mm7, %%mm2
\n\t
"
"add %%"
REG_d
", %%"
REG_S
"
\n\t
"
"movq %%mm0, %%mm5
\n\t
"
//
"movd (%%"
REG_S
",%%"
REG_a
",4), %%mm3
\n\t
"
//7 ;prefetch!
"movq %%mm1, %%mm6
\n\t
"
"movd (%%"
REG_S
",%%"
REG_d
"), %%mm4
\n\t
"
//6
"punpcklbw %%mm7, %%mm3
\n\t
"
"psubw %%mm3, %%mm5
\n\t
"
"punpcklbw %%mm7, %%mm4
\n\t
"
"paddw %%mm3, %%mm0
\n\t
"
"psubw %%mm4, %%mm6
\n\t
"
"movd (%%"
REG_S
",%%"
REG_a
",2), %%mm3
\n\t
"
//5
"paddw %%mm4, %%mm1
\n\t
"
"movq %%mm5, %3
\n\t
"
//t7
"punpcklbw %%mm7, %%mm3
\n\t
"
"movq %%mm6, %4
\n\t
"
//t6
"movq %%mm2, %%mm4
\n\t
"
"movd (%%"
REG_S
"), %%mm5
\n\t
"
//3
"paddw %%mm3, %%mm2
\n\t
"
"movd (%%"
REG_S
",%%"
REG_a
"), %%mm6
\n\t
"
//4
"punpcklbw %%mm7, %%mm5
\n\t
"
"psubw %%mm3, %%mm4
\n\t
"
"punpcklbw %%mm7, %%mm6
\n\t
"
"movq %%mm5, %%mm3
\n\t
"
"paddw %%mm6, %%mm5
\n\t
"
//t3
"psubw %%mm6, %%mm3
\n\t
"
//t4 ; t0 t1 t2 t4 t5 t3 - -
"movq %%mm0, %%mm6
\n\t
"
"movq %%mm1, %%mm7
\n\t
"
"psubw %%mm5, %%mm0
\n\t
"
//t13
"psubw %%mm2, %%mm1
\n\t
"
"paddw %%mm2, %%mm7
\n\t
"
//t11
"paddw %%mm0, %%mm1
\n\t
"
"movq %%mm7, %%mm2
\n\t
"
"psllw $2, %%mm1
\n\t
"
"paddw %%mm5, %%mm6
\n\t
"
//t10
"pmulhw "
MANGLE
(
ff_MM_FIX_0_707106781
)
", %%mm1
\n\t
"
"paddw %%mm6, %%mm7
\n\t
"
//d2
"psubw %%mm2, %%mm6
\n\t
"
//d3
"movq %%mm0, %%mm5
\n\t
"
//transpose 4x4
"movq %%mm7, %%mm2
\n\t
"
"punpcklwd %%mm6, %%mm7
\n\t
"
"paddw %%mm1, %%mm0
\n\t
"
//d0
"punpckhwd %%mm6, %%mm2
\n\t
"
"psubw %%mm1, %%mm5
\n\t
"
//d1
"movq %%mm0, %%mm6
\n\t
"
"movq %4, %%mm1
\n\t
"
"punpcklwd %%mm5, %%mm0
\n\t
"
"punpckhwd %%mm5, %%mm6
\n\t
"
"movq %%mm0, %%mm5
\n\t
"
"punpckldq %%mm7, %%mm0
\n\t
"
//0
"paddw %%mm4, %%mm3
\n\t
"
"punpckhdq %%mm7, %%mm5
\n\t
"
//1
"movq %%mm6, %%mm7
\n\t
"
"movq %%mm0, "
DCTSIZE_S
"*0*2(%%"
REG_D
")
\n\t
"
"punpckldq %%mm2, %%mm6
\n\t
"
//2
"movq %%mm5, "
DCTSIZE_S
"*1*2(%%"
REG_D
")
\n\t
"
"punpckhdq %%mm2, %%mm7
\n\t
"
//3
"movq %%mm6, "
DCTSIZE_S
"*2*2(%%"
REG_D
")
\n\t
"
"paddw %%mm1, %%mm4
\n\t
"
"movq %%mm7, "
DCTSIZE_S
"*3*2(%%"
REG_D
")
\n\t
"
"psllw $2, %%mm3
\n\t
"
//t10
"movq %3, %%mm2
\n\t
"
"psllw $2, %%mm4
\n\t
"
//t11
"pmulhw "
MANGLE
(
ff_MM_FIX_0_707106781
)
", %%mm4
\n\t
"
//z3
"paddw %%mm2, %%mm1
\n\t
"
"psllw $2, %%mm1
\n\t
"
//t12
"movq %%mm3, %%mm0
\n\t
"
"pmulhw "
MANGLE
(
ff_MM_FIX_0_541196100
)
", %%mm0
\n\t
"
"psubw %%mm1, %%mm3
\n\t
"
"pmulhw "
MANGLE
(
MM_FIX_0_382683433
)
", %%mm3
\n\t
"
//z5
"movq %%mm2, %%mm5
\n\t
"
"pmulhw "
MANGLE
(
MM_FIX_1_306562965
)
", %%mm1
\n\t
"
"psubw %%mm4, %%mm2
\n\t
"
//z13
"paddw %%mm4, %%mm5
\n\t
"
//z11
"movq %%mm2, %%mm6
\n\t
"
"paddw %%mm3, %%mm0
\n\t
"
//z2
"movq %%mm5, %%mm7
\n\t
"
"paddw %%mm0, %%mm2
\n\t
"
//d4
"psubw %%mm0, %%mm6
\n\t
"
//d5
"movq %%mm2, %%mm4
\n\t
"
"paddw %%mm3, %%mm1
\n\t
"
//z4
//transpose 4x4
"punpcklwd %%mm6, %%mm2
\n\t
"
"paddw %%mm1, %%mm5
\n\t
"
//d6
"punpckhwd %%mm6, %%mm4
\n\t
"
"psubw %%mm1, %%mm7
\n\t
"
//d7
"movq %%mm5, %%mm6
\n\t
"
"punpcklwd %%mm7, %%mm5
\n\t
"
"punpckhwd %%mm7, %%mm6
\n\t
"
"movq %%mm2, %%mm7
\n\t
"
"punpckldq %%mm5, %%mm2
\n\t
"
//4
"sub %%"
REG_d
", %%"
REG_S
"
\n\t
"
"punpckhdq %%mm5, %%mm7
\n\t
"
//5
"movq %%mm4, %%mm5
\n\t
"
"movq %%mm2, "
DCTSIZE_S
"*0*2+"
DCTSIZE_S
"(%%"
REG_D
")
\n\t
"
"punpckldq %%mm6, %%mm4
\n\t
"
//6
"movq %%mm7, "
DCTSIZE_S
"*1*2+"
DCTSIZE_S
"(%%"
REG_D
")
\n\t
"
"punpckhdq %%mm6, %%mm5
\n\t
"
//7
"movq %%mm4, "
DCTSIZE_S
"*2*2+"
DCTSIZE_S
"(%%"
REG_D
")
\n\t
"
"add $4, %%"
REG_S
"
\n\t
"
"movq %%mm5, "
DCTSIZE_S
"*3*2+"
DCTSIZE_S
"(%%"
REG_D
")
\n\t
"
"add $"
DCTSIZE_S
"*2*4, %%"
REG_D
"
\n\t
"
//4 rows
"dec %%"
REG_c
"
\n\t
"
"jnz 6b
\n\t
"
:
"+S"
(
pixels
),
"+D"
(
data
),
"+c"
(
cnt
),
"=o"
(
temps
),
"=o"
(
temps
[
1
])
:
"a"
(
line_size
)
NAMED_CONSTRAINTS_ADD
(
ff_MM_FIX_0_707106781
,
ff_MM_FIX_0_541196100
,
MM_FIX_0_382683433
,
MM_FIX_1_306562965
)
:
"%"
REG_d
);
}
#endif // HAVE_MMX_INLINE
libavfilter/version.h — view file @ a51c78c6
...
@@ -31,7 +31,7 @@
 #define LIBAVFILTER_VERSION_MAJOR   5
 #define LIBAVFILTER_VERSION_MINOR   5
-#define LIBAVFILTER_VERSION_MICRO 100
+#define LIBAVFILTER_VERSION_MICRO 101
 #define LIBAVFILTER_VERSION_INT AV_VERSION_INT(LIBAVFILTER_VERSION_MAJOR, \
                                                LIBAVFILTER_VERSION_MINOR, \
...
libavfilter/vf_mp.c — view file @ a51c78c6
...
@@ -127,7 +127,6 @@ static const struct {
 extern const vf_info_t ff_vf_info_eq2;
 extern const vf_info_t ff_vf_info_eq;
-extern const vf_info_t ff_vf_info_fspp;
 extern const vf_info_t ff_vf_info_ilpack;
 extern const vf_info_t ff_vf_info_pp7;
 extern const vf_info_t ff_vf_info_softpulldown;
...
@@ -135,7 +134,6 @@ extern const vf_info_t ff_vf_info_softpulldown;
 static const vf_info_t *const filters[]={
     &ff_vf_info_eq2,
     &ff_vf_info_eq,
-    &ff_vf_info_fspp,
     &ff_vf_info_ilpack,
     &ff_vf_info_pp7,
     &ff_vf_info_softpulldown,
...