Commit a29e59a7, authored Jun 13, 2019 by Vitaly Tuzov
Rename parameters in AVX512 implementation of v_load_deinterleave and v_store_interleave
parent 1c661e75
Showing 1 changed file with 82 additions and 82 deletions:

modules/core/include/opencv2/core/hal/intrin_avx512.hpp (+82, -82)
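The diff below is a pure rename: the channel-specific parameter names b/g/r (and b/g/r/a) become the neutral a/b/c (and a/b/c/d), since the intrinsics are channel-order agnostic. As a reference for what these functions compute, here is a minimal scalar sketch of the 3-channel deinterleave semantics (a hypothetical helper, not part of the patch):

#include <cstddef>

// Scalar model: split interleaved triples (a0 b0 c0 a1 b1 c1 ...) into
// three planar arrays. The AVX512 code below does this for 64 pixels
// per call using permutes and masked blends instead of a loop.
static void scalar_load_deinterleave3(const unsigned char* ptr,
                                      unsigned char* a, unsigned char* b,
                                      unsigned char* c, size_t n)
{
    for (size_t i = 0; i < n; i++)
    {
        a[i] = ptr[3 * i + 0];
        b[i] = ptr[3 * i + 1];
        c[i] = ptr[3 * i + 2];
    }
}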
@@ -2044,7 +2044,7 @@ inline void v_load_deinterleave( const uint64* ptr, v_uint64x8& a, v_uint64x8& b
     b = v_uint64x8(_mm512_permutex2var_epi64(ab0, mask1, ab1));
 }
-inline void v_load_deinterleave( const uchar* ptr, v_uint8x64& b, v_uint8x64& g, v_uint8x64& r )
+inline void v_load_deinterleave( const uchar* ptr, v_uint8x64& a, v_uint8x64& b, v_uint8x64& c )
 {
     __m512i bgr0 = _mm512_loadu_si512((const __m512i*)ptr);
     __m512i bgr1 = _mm512_loadu_si512((const __m512i*)(ptr + 64));
@@ -2062,22 +2062,22 @@ inline void v_load_deinterleave( const uchar* ptr, v_uint8x64& b, v_uint8x64& g,
         77, 74, 71, 68, 65, 127, 124, 121, 118, 115, 112, 109, 106, 103, 100, 97, 94, 91, 88, 85, 82, 79, 76, 73, 70, 67, 64, 61, 58, 55, 52, 49, 46, 43, 40, 37, 34, 31, 28, 25, 22, 19, 16, 13, 10, 7, 4, 1), bgr2);
-    b = v_uint8x64(_mm512_mask_compress_epi8(r12b2, 0xffffffffffe00000, r0b01));
-    g = v_uint8x64(_mm512_mask_compress_epi8(b1g12, 0x2492492492492492, bgr0));
-    r = v_uint8x64(_mm512_mask_expand_epi8(r0b01, 0xffffffffffe00000, r12b2));
+    a = v_uint8x64(_mm512_mask_compress_epi8(r12b2, 0xffffffffffe00000, r0b01));
+    b = v_uint8x64(_mm512_mask_compress_epi8(b1g12, 0x2492492492492492, bgr0));
+    c = v_uint8x64(_mm512_mask_expand_epi8(r0b01, 0xffffffffffe00000, r12b2));
 #elif CV_AVX_512VBMI
     __m512i b0g0b1 = _mm512_mask_blend_epi8(0xb6db6db6db6db6db, bgr1, bgr0);
     __m512i g1r1g2 = _mm512_mask_blend_epi8(0xb6db6db6db6db6db, bgr2, bgr1);
     __m512i r2b2r0 = _mm512_mask_blend_epi8(0xb6db6db6db6db6db, bgr0, bgr2);
-    b = v_uint8x64(_mm512_permutex2var_epi8(b0g0b1, _v512_set_epu8(125, 122, 119, 116, 113, 110, 107, 104, 101, 98, 95, 92, 89, 86, 83, 80,
+    a = v_uint8x64(_mm512_permutex2var_epi8(b0g0b1, _v512_set_epu8(125, 122, 119, 116, 113, 110, 107, 104, 101, 98, 95, 92, 89, 86, 83, 80,
         77, 74, 71, 68, 65, 63, 61, 60, 58, 57, 55, 54, 52, 51, 49, 48, 46, 45, 43, 42, 40, 39, 37, 36, 34, 33, 31, 30, 28, 27, 25, 24, 23, 21, 20, 18, 17, 15, 14, 12, 11, 9, 8, 6, 5, 3, 2, 0), bgr2));
-    g = v_uint8x64(_mm512_permutex2var_epi8(g1r1g2, _v512_set_epu8(63, 61, 60, 58, 57, 55, 54, 52, 51, 49, 48, 46, 45, 43, 42, 40,
+    b = v_uint8x64(_mm512_permutex2var_epi8(g1r1g2, _v512_set_epu8(63, 61, 60, 58, 57, 55, 54, 52, 51, 49, 48, 46, 45, 43, 42, 40,
         39, 37, 36, 34, 33, 31, 30, 28, 27, 25, 24, 23, 21, 20, 18, 17, 15, 14, 12, 11, 9, 8, 6, 5, 3, 2, 0, 126, 123, 120, 117, 114, 111, 108, 105, 102, 99, 96, 93, 90, 87, 84, 81, 78, 75, 72, 69, 66), bgr0));
-    r = v_uint8x64(_mm512_permutex2var_epi8(r2b2r0, _v512_set_epu8(63, 60, 57, 54, 51, 48, 45, 42, 39, 36, 33, 30, 27, 24, 21, 18,
+    c = v_uint8x64(_mm512_permutex2var_epi8(r2b2r0, _v512_set_epu8(63, 60, 57, 54, 51, 48, 45, 42, 39, 36, 33, 30, 27, 24, 21, 18,
         15, 12, 9, 6, 3, 0, 125, 122, 119, 116, 113, 110, 107, 104, 101, 98, 95, 92, 89, 86, 83, 80, 77, 74, 71, 68, 65, 62, 59, 56, 53, 50, 47, 44, 41, 38, 35, 32, 29, 26, 23, 20, 17, 14, 11, 8, 5, 2), bgr1));
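The magic constants above are periodic lane masks: _mm512_mask_blend_epi8(k, x, y) takes byte i from y when bit i of k is set, and for 3-channel data the pattern repeats every 3 bytes, so 0xb6db6db6db6db6db selects lanes with i % 3 != 2 while 0x2492492492492492 selects i % 3 == 1. A small sketch (hypothetical helper) that reproduces them:

#include <cstdint>
#include <cstdio>

// Expand a 3-bit repeating pattern to a 64-lane mask: bit i of the
// result copies bit (i % 3) of 'keep'.
static uint64_t period3_mask(unsigned keep)
{
    uint64_t m = 0;
    for (int i = 0; i < 64; i++)
        m |= (uint64_t)((keep >> (i % 3)) & 1) << i;
    return m;
}

int main()
{
    printf("%016llx\n", (unsigned long long)period3_mask(3)); // b6db6db6db6db6db
    printf("%016llx\n", (unsigned long long)period3_mask(2)); // 2492492492492492
    printf("%016llx\n", (unsigned long long)period3_mask(1)); // 9249249249249249
    return 0;
}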
@@ -2092,13 +2092,13 @@ inline void v_load_deinterleave( const uchar* ptr, v_uint8x64& b, v_uint8x64& g,
     __m512i r0b1 = _mm512_permutex2var_epi16(bgr1, _v512_set_epu16(42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 29, 26, 23, 20, 17, 14, 11, 8, 5, 2, 53, 52, 51, 50, 49, 48, 47, 46, 45, 44, 43), g20r0);
     __m512i g1r1 = _mm512_alignr_epi32(r12b2, g20r0, 11);
-    b = v_uint8x64(_mm512_mask_blend_epi8(0xAAAAAAAAAAAAAAAA, b0g0, r0b1));
-    r = v_uint8x64(_mm512_mask_blend_epi8(0xAAAAAAAAAAAAAAAA, r0b1, g1r1));
-    g = v_uint8x64(_mm512_shuffle_epi8(_mm512_mask_blend_epi8(0xAAAAAAAAAAAAAAAA, g1r1, b0g0), _mm512_set4_epi32(0x0e0f0c0d, 0x0a0b0809, 0x06070405, 0x02030001)));
+    a = v_uint8x64(_mm512_mask_blend_epi8(0xAAAAAAAAAAAAAAAA, b0g0, r0b1));
+    c = v_uint8x64(_mm512_mask_blend_epi8(0xAAAAAAAAAAAAAAAA, r0b1, g1r1));
+    b = v_uint8x64(_mm512_shuffle_epi8(_mm512_mask_blend_epi8(0xAAAAAAAAAAAAAAAA, g1r1, b0g0), _mm512_set4_epi32(0x0e0f0c0d, 0x0a0b0809, 0x06070405, 0x02030001)));
 #endif
 }
-inline void v_load_deinterleave( const ushort* ptr, v_uint16x32& b, v_uint16x32& g, v_uint16x32& r )
+inline void v_load_deinterleave( const ushort* ptr, v_uint16x32& a, v_uint16x32& b, v_uint16x32& c )
 {
     __m512i bgr0 = _mm512_loadu_si512((const __m512i*)ptr);
     __m512i bgr1 = _mm512_loadu_si512((const __m512i*)(ptr + 32));
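In the non-VBMI branch above, the _mm512_shuffle_epi8 control _mm512_set4_epi32(0x0e0f0c0d, 0x0a0b0809, 0x06070405, 0x02030001) lists byte indices 1, 0, 3, 2, 5, 4, ... within each 16-byte lane, i.e. it swaps the two bytes of every 16-bit element after the 0xAAAA... blends have gathered them in swapped order. A scalar model (hypothetical helper):

#include <stdint.h>
#include <stddef.h>

// Swap the two bytes of each 16-bit element, the effect of the
// byte-shuffle control above.
static void swap_bytes_per_u16(uint8_t* v, size_t nbytes)
{
    for (size_t i = 0; i + 1 < nbytes; i += 2)
    {
        uint8_t t = v[i];
        v[i]     = v[i + 1];
        v[i + 1] = t;
    }
}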
@@ -2110,13 +2110,13 @@ inline void v_load_deinterleave( const ushort* ptr, v_uint16x32& b, v_uint16x32&
     __m512i r12b2 = _mm512_permutex2var_epi16(bgr1, mask0, bgr2);
     __m512i g20r0 = _mm512_permutex2var_epi16(bgr2, mask0, bgr0);
-    b = v_uint16x32(_mm512_mask_blend_epi32(0xf800, b01g1, r12b2));
-    g = v_uint16x32(_mm512_permutex2var_epi16(bgr1, _v512_set_epu16(42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 29, 26, 23, 20, 17,
+    a = v_uint16x32(_mm512_mask_blend_epi32(0xf800, b01g1, r12b2));
+    b = v_uint16x32(_mm512_permutex2var_epi16(bgr1, _v512_set_epu16(42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 29, 26, 23, 20, 17,
         14, 11, 8, 5, 2, 53, 52, 51, 50, 49, 48, 47, 46, 45, 44, 43), g20r0));
-    r = v_uint16x32(_mm512_alignr_epi32(r12b2, g20r0, 11));
+    c = v_uint16x32(_mm512_alignr_epi32(r12b2, g20r0, 11));
 }
-inline void v_load_deinterleave( const unsigned* ptr, v_uint32x16& b, v_uint32x16& g, v_uint32x16& r )
+inline void v_load_deinterleave( const unsigned* ptr, v_uint32x16& a, v_uint32x16& b, v_uint32x16& c )
 {
     __m512i bgr0 = _mm512_loadu_si512((const __m512i*)ptr);
     __m512i bgr1 = _mm512_loadu_si512((const __m512i*)(ptr + 16));
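The _mm512_alignr_epi32(hi, lo, 11) calls above concatenate the two vectors (lo in the low half), shift the 32-element pair right by 11 dwords and keep the low 16, so the result is lo[11..15] followed by hi[0..10]. Scalar model (hypothetical helper):

#include <stdint.h>

// Scalar model of _mm512_alignr_epi32: dst[i] = concat(hi, lo)[i + imm],
// where lo occupies indices 0..15 and hi indices 16..31.
static void alignr_u32(uint32_t dst[16], const uint32_t hi[16],
                       const uint32_t lo[16], int imm /* 0..15 */)
{
    for (int i = 0; i < 16; i++)
    {
        int j = i + imm;
        dst[i] = (j < 16) ? lo[j] : hi[j - 16];
    }
}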
@@ -2127,12 +2127,12 @@ inline void v_load_deinterleave( const unsigned* ptr, v_uint32x16& b, v_uint32x1
     __m512i g12b2 = _mm512_permutex2var_epi32(bgr1, mask0, bgr2);
     __m512i r20g0 = _mm512_permutex2var_epi32(bgr2, mask0, bgr0);
-    b = v_uint32x16(_mm512_mask_blend_epi32(0xf800, b01r1, g12b2));
-    g = v_uint32x16(_mm512_alignr_epi32(g12b2, r20g0, 11));
-    r = v_uint32x16(_mm512_permutex2var_epi32(bgr1, _v512_set_epu32(21, 20, 19, 18, 17, 16, 13, 10, 7, 4, 1, 26, 25, 24, 23, 22), r20g0));
+    a = v_uint32x16(_mm512_mask_blend_epi32(0xf800, b01r1, g12b2));
+    b = v_uint32x16(_mm512_alignr_epi32(g12b2, r20g0, 11));
+    c = v_uint32x16(_mm512_permutex2var_epi32(bgr1, _v512_set_epu32(21, 20, 19, 18, 17, 16, 13, 10, 7, 4, 1, 26, 25, 24, 23, 22), r20g0));
 }
-inline void v_load_deinterleave( const uint64* ptr, v_uint64x8& b, v_uint64x8& g, v_uint64x8& r )
+inline void v_load_deinterleave( const uint64* ptr, v_uint64x8& a, v_uint64x8& b, v_uint64x8& c )
 {
     __m512i bgr0 = _mm512_loadu_si512((const __m512i*)ptr);
     __m512i bgr1 = _mm512_loadu_si512((const __m512i*)(ptr + 8));
@@ -2143,12 +2143,12 @@ inline void v_load_deinterleave( const uint64* ptr, v_uint64x8& b, v_uint64x8& g
     __m512i r12b2 = _mm512_permutex2var_epi64(bgr1, mask0, bgr2);
     __m512i g20r0 = _mm512_permutex2var_epi64(bgr2, mask0, bgr0);
-    b = v_uint64x8(_mm512_mask_blend_epi64(0xc0, b01g1, r12b2));
-    r = v_uint64x8(_mm512_alignr_epi64(r12b2, g20r0, 6));
-    g = v_uint64x8(_mm512_permutex2var_epi64(bgr1, _v512_set_epu64(10, 9, 8, 5, 2, 13, 12, 11), g20r0));
+    a = v_uint64x8(_mm512_mask_blend_epi64(0xc0, b01g1, r12b2));
+    c = v_uint64x8(_mm512_alignr_epi64(r12b2, g20r0, 6));
+    b = v_uint64x8(_mm512_permutex2var_epi64(bgr1, _v512_set_epu64(10, 9, 8, 5, 2, 13, 12, 11), g20r0));
 }
-inline void v_load_deinterleave( const uchar* ptr, v_uint8x64& b, v_uint8x64& g, v_uint8x64& r, v_uint8x64& a )
+inline void v_load_deinterleave( const uchar* ptr, v_uint8x64& a, v_uint8x64& b, v_uint8x64& c, v_uint8x64& d )
 {
     __m512i bgra0 = _mm512_loadu_si512((const __m512i*)ptr);
     __m512i bgra1 = _mm512_loadu_si512((const __m512i*)(ptr + 64));
@@ -2170,10 +2170,10 @@ inline void v_load_deinterleave( const uchar* ptr, v_uint8x64& b, v_uint8x64& g,
     __m512i br23 = _mm512_permutex2var_epi8(bgra2, mask0, bgra3);
     __m512i ga23 = _mm512_permutex2var_epi8(bgra2, mask1, bgra3);
-    b = v_uint8x64(_mm512_permutex2var_epi8(br01, mask0, br23));
-    r = v_uint8x64(_mm512_permutex2var_epi8(br01, mask1, br23));
-    g = v_uint8x64(_mm512_permutex2var_epi8(ga01, mask0, ga23));
-    a = v_uint8x64(_mm512_permutex2var_epi8(ga01, mask1, ga23));
+    a = v_uint8x64(_mm512_permutex2var_epi8(br01, mask0, br23));
+    c = v_uint8x64(_mm512_permutex2var_epi8(br01, mask1, br23));
+    b = v_uint8x64(_mm512_permutex2var_epi8(ga01, mask0, ga23));
+    d = v_uint8x64(_mm512_permutex2var_epi8(ga01, mask1, ga23));
 #else
     __m512i mask = _mm512_set4_epi32(0x0f0b0703, 0x0e0a0602, 0x0d090501, 0x0c080400);
     __m512i b0g0r0a0 = _mm512_shuffle_epi8(bgra0, mask);
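The #else branch that starts here first shuffles within lanes: the control _mm512_set4_epi32(0x0f0b0703, 0x0e0a0602, 0x0d090501, 0x0c080400) reads, per 16-byte lane, byte indices 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15, grouping the four bytes of each channel together (b0g0r0a0) before dword permutes combine lanes. Scalar model of one lane (hypothetical helper):

#include <stdint.h>

// One 16-byte lane of the in-lane channel gather: b g r a quads become
// four contiguous 4-byte channel groups.
static void gather_channels_lane(uint8_t out[16], const uint8_t in[16])
{
    static const uint8_t idx[16] =
        { 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15 };
    for (int i = 0; i < 16; i++)
        out[i] = in[idx[i]];
}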
@@ -2189,14 +2189,14 @@ inline void v_load_deinterleave( const uchar* ptr, v_uint8x64& b, v_uint8x64& g,
     __m512i br23 = _mm512_permutex2var_epi32(b2g2r2a2, mask0, b3g3r3a3);
     __m512i ga23 = _mm512_permutex2var_epi32(b2g2r2a2, mask1, b3g3r3a3);
-    b = v_uint8x64(_mm512_permutex2var_epi32(br01, mask0, br23));
-    r = v_uint8x64(_mm512_permutex2var_epi32(br01, mask1, br23));
-    g = v_uint8x64(_mm512_permutex2var_epi32(ga01, mask0, ga23));
-    a = v_uint8x64(_mm512_permutex2var_epi32(ga01, mask1, ga23));
+    a = v_uint8x64(_mm512_permutex2var_epi32(br01, mask0, br23));
+    c = v_uint8x64(_mm512_permutex2var_epi32(br01, mask1, br23));
+    b = v_uint8x64(_mm512_permutex2var_epi32(ga01, mask0, ga23));
+    d = v_uint8x64(_mm512_permutex2var_epi32(ga01, mask1, ga23));
 #endif
 }
-inline void v_load_deinterleave( const ushort* ptr, v_uint16x32& b, v_uint16x32& g, v_uint16x32& r, v_uint16x32& a )
+inline void v_load_deinterleave( const ushort* ptr, v_uint16x32& a, v_uint16x32& b, v_uint16x32& c, v_uint16x32& d )
 {
     __m512i bgra0 = _mm512_loadu_si512((const __m512i*)ptr);
     __m512i bgra1 = _mm512_loadu_si512((const __m512i*)(ptr + 32));
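The 4-channel variants work in two permute stages: the first gathers channels 0 and 2 into br* and channels 1 and 3 into ga*, and the second splits each pair. The net effect per output plane is a strided gather; a scalar sketch of the overall semantics (hypothetical helper):

#include <stddef.h>

// Overall effect of the two-stage split: four planar outputs from
// interleaved quads (a0 b0 c0 d0 a1 b1 c1 d1 ...).
static void scalar_load_deinterleave4(const unsigned char* ptr,
                                      unsigned char* a, unsigned char* b,
                                      unsigned char* c, unsigned char* d,
                                      size_t n)
{
    for (size_t i = 0; i < n; i++)
    {
        a[i] = ptr[4 * i + 0];
        b[i] = ptr[4 * i + 1];
        c[i] = ptr[4 * i + 2];
        d[i] = ptr[4 * i + 3];
    }
}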
@@ -2213,13 +2213,13 @@ inline void v_load_deinterleave( const ushort* ptr, v_uint16x32& b, v_uint16x32&
     __m512i br23 = _mm512_permutex2var_epi16(bgra2, mask0, bgra3);
     __m512i ga23 = _mm512_permutex2var_epi16(bgra2, mask1, bgra3);
-    b = v_uint16x32(_mm512_permutex2var_epi16(br01, mask0, br23));
-    r = v_uint16x32(_mm512_permutex2var_epi16(br01, mask1, br23));
-    g = v_uint16x32(_mm512_permutex2var_epi16(ga01, mask0, ga23));
-    a = v_uint16x32(_mm512_permutex2var_epi16(ga01, mask1, ga23));
+    a = v_uint16x32(_mm512_permutex2var_epi16(br01, mask0, br23));
+    c = v_uint16x32(_mm512_permutex2var_epi16(br01, mask1, br23));
+    b = v_uint16x32(_mm512_permutex2var_epi16(ga01, mask0, ga23));
+    d = v_uint16x32(_mm512_permutex2var_epi16(ga01, mask1, ga23));
 }
-inline void v_load_deinterleave( const unsigned* ptr, v_uint32x16& b, v_uint32x16& g, v_uint32x16& r, v_uint32x16& a )
+inline void v_load_deinterleave( const unsigned* ptr, v_uint32x16& a, v_uint32x16& b, v_uint32x16& c, v_uint32x16& d )
 {
     __m512i bgra0 = _mm512_loadu_si512((const __m512i*)ptr);
     __m512i bgra1 = _mm512_loadu_si512((const __m512i*)(ptr + 16));
@@ -2234,13 +2234,13 @@ inline void v_load_deinterleave( const unsigned* ptr, v_uint32x16& b, v_uint32x1
     __m512i br23 = _mm512_permutex2var_epi32(bgra2, mask0, bgra3);
     __m512i ga23 = _mm512_permutex2var_epi32(bgra2, mask1, bgra3);
-    b = v_uint32x16(_mm512_permutex2var_epi32(br01, mask0, br23));
-    r = v_uint32x16(_mm512_permutex2var_epi32(br01, mask1, br23));
-    g = v_uint32x16(_mm512_permutex2var_epi32(ga01, mask0, ga23));
-    a = v_uint32x16(_mm512_permutex2var_epi32(ga01, mask1, ga23));
+    a = v_uint32x16(_mm512_permutex2var_epi32(br01, mask0, br23));
+    c = v_uint32x16(_mm512_permutex2var_epi32(br01, mask1, br23));
+    b = v_uint32x16(_mm512_permutex2var_epi32(ga01, mask0, ga23));
+    d = v_uint32x16(_mm512_permutex2var_epi32(ga01, mask1, ga23));
 }
-inline void v_load_deinterleave( const uint64* ptr, v_uint64x8& b, v_uint64x8& g, v_uint64x8& r, v_uint64x8& a )
+inline void v_load_deinterleave( const uint64* ptr, v_uint64x8& a, v_uint64x8& b, v_uint64x8& c, v_uint64x8& d )
 {
     __m512i bgra0 = _mm512_loadu_si512((const __m512i*)ptr);
     __m512i bgra1 = _mm512_loadu_si512((const __m512i*)(ptr + 8));
@@ -2255,10 +2255,10 @@ inline void v_load_deinterleave( const uint64* ptr, v_uint64x8& b, v_uint64x8& g
     __m512i br23 = _mm512_permutex2var_epi64(bgra2, mask0, bgra3);
     __m512i ga23 = _mm512_permutex2var_epi64(bgra2, mask1, bgra3);
-    b = v_uint64x8(_mm512_permutex2var_epi64(br01, mask0, br23));
-    r = v_uint64x8(_mm512_permutex2var_epi64(br01, mask1, br23));
-    g = v_uint64x8(_mm512_permutex2var_epi64(ga01, mask0, ga23));
-    a = v_uint64x8(_mm512_permutex2var_epi64(ga01, mask1, ga23));
+    a = v_uint64x8(_mm512_permutex2var_epi64(br01, mask0, br23));
+    c = v_uint64x8(_mm512_permutex2var_epi64(br01, mask1, br23));
+    b = v_uint64x8(_mm512_permutex2var_epi64(ga01, mask0, ga23));
+    d = v_uint64x8(_mm512_permutex2var_epi64(ga01, mask1, ga23));
 }
 ///////////////////////////// store interleave /////////////////////////////////////
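The store half, which follows, is the inverse: three (or four) planar registers are woven back into interleaved memory. Scalar model of the 3-channel case (hypothetical helper):

#include <stddef.h>

// Inverse of the deinterleave above: write a0 b0 c0 a1 b1 c1 ...
static void scalar_store_interleave3(unsigned char* ptr,
                                     const unsigned char* a,
                                     const unsigned char* b,
                                     const unsigned char* c, size_t n)
{
    for (size_t i = 0; i < n; i++)
    {
        ptr[3 * i + 0] = a[i];
        ptr[3 * i + 1] = b[i];
        ptr[3 * i + 2] = c[i];
    }
}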
@@ -2351,7 +2351,7 @@ inline void v_store_interleave( uint64* ptr, const v_uint64x8& x, const v_uint64
     }
 }
-inline void v_store_interleave( uchar* ptr, const v_uint8x64& b, const v_uint8x64& g, const v_uint8x64& r,
+inline void v_store_interleave( uchar* ptr, const v_uint8x64& a, const v_uint8x64& b, const v_uint8x64& c,
                                 hal::StoreMode mode = hal::STORE_UNALIGNED )
 {
 #if CV_AVX_512VBMI
@@ -2367,18 +2367,18 @@ inline void v_store_interleave( uchar* ptr, const v_uint8x64& b, const v_uint8x6
         122, 58, 100, 121, 57, 99, 120, 56, 98, 119, 55, 97, 118, 54, 96, 117, 53, 95, 116, 52, 94, 115, 51, 93, 114, 50, 92, 113, 49, 91, 112, 48, 90, 111, 47, 89, 110, 46, 88, 109, 45, 87, 108, 44, 86, 107, 43, 85);
-    __m512i r2g0r0 = _mm512_permutex2var_epi8(g.val, mask0, r.val);
-    __m512i b0r1b1 = _mm512_permutex2var_epi8(b.val, mask1, r.val);
-    __m512i g1b2g2 = _mm512_permutex2var_epi8(b.val, mask2, g.val);
+    __m512i r2g0r0 = _mm512_permutex2var_epi8(b.val, mask0, c.val);
+    __m512i b0r1b1 = _mm512_permutex2var_epi8(a.val, mask1, c.val);
+    __m512i g1b2g2 = _mm512_permutex2var_epi8(a.val, mask2, b.val);
     __m512i bgr0 = _mm512_mask_blend_epi8(0x9249249249249249, r2g0r0, b0r1b1);
     __m512i bgr1 = _mm512_mask_blend_epi8(0x9249249249249249, b0r1b1, g1b2g2);
     __m512i bgr2 = _mm512_mask_blend_epi8(0x9249249249249249, g1b2g2, r2g0r0);
 #else
-    __m512i g1g0 = _mm512_shuffle_epi8(g.val, _mm512_set4_epi32(0x0e0f0c0d, 0x0a0b0809, 0x06070405, 0x02030001));
-    __m512i b0g0 = _mm512_mask_blend_epi8(0xAAAAAAAAAAAAAAAA, b.val, g1g0);
-    __m512i r0b1 = _mm512_mask_blend_epi8(0xAAAAAAAAAAAAAAAA, r.val, b.val);
-    __m512i g1r1 = _mm512_mask_blend_epi8(0xAAAAAAAAAAAAAAAA, g1g0, r.val);
+    __m512i g1g0 = _mm512_shuffle_epi8(b.val, _mm512_set4_epi32(0x0e0f0c0d, 0x0a0b0809, 0x06070405, 0x02030001));
+    __m512i b0g0 = _mm512_mask_blend_epi8(0xAAAAAAAAAAAAAAAA, a.val, g1g0);
+    __m512i r0b1 = _mm512_mask_blend_epi8(0xAAAAAAAAAAAAAAAA, c.val, a.val);
+    __m512i g1r1 = _mm512_mask_blend_epi8(0xAAAAAAAAAAAAAAAA, g1g0, c.val);
     __m512i mask0 = _v512_set_epu16(42, 10, 31, 41, 9, 30, 40, 8, 29, 39, 7, 28, 38, 6, 27, 37, 5, 26, 36, 4, 25, 35, 3, 24, 34, 2, 23, 33, 1, 22, 32, 0);
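The VBMI store path above builds three rotated channel triples with permutes and merges them with period-3 byte blends; _mm512_mask_blend_epi8(k, x, y) picks byte i from y where bit i of k is set (0x9249249249249249 marks i % 3 == 0). Scalar model (hypothetical helper):

#include <stdint.h>

// result[i] = bit i of k ? y[i] : x[i], the byte-granular blend used
// to merge the permuted triples into bgr0/bgr1/bgr2.
static void mask_blend_u8(uint8_t dst[64], uint64_t k,
                          const uint8_t x[64], const uint8_t y[64])
{
    for (int i = 0; i < 64; i++)
        dst[i] = ((k >> i) & 1) ? y[i] : x[i];
}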
@@ -2415,7 +2415,7 @@ inline void v_store_interleave( uchar* ptr, const v_uint8x64& b, const v_uint8x6
     }
 }
-inline void v_store_interleave( ushort* ptr, const v_uint16x32& b, const v_uint16x32& g, const v_uint16x32& r,
+inline void v_store_interleave( ushort* ptr, const v_uint16x32& a, const v_uint16x32& b, const v_uint16x32& c,
                                 hal::StoreMode mode = hal::STORE_UNALIGNED )
 {
     __m512i mask0 = _v512_set_epu16(42, 10, 31, 41, 9, 30, 40, 8, 29, 39, 7, 28, 38, 6, 27, 37,
@@ -2424,9 +2424,9 @@ inline void v_store_interleave( ushort* ptr, const v_uint16x32& b, const v_uint1
         47, 36, 15, 46, 35, 14, 45, 34, 13, 44, 33, 12, 43, 32, 11, 42);
     __m512i mask2 = _v512_set_epu16(63, 31, 20, 62, 30, 19, 61, 29, 18, 60, 28, 17, 59, 27, 16, 58, 26, 15, 57, 25, 14, 56, 24, 13, 55, 23, 12, 54, 22, 11, 53, 21);
-    __m512i b0g0b2 = _mm512_permutex2var_epi16(b.val, mask0, g.val);
-    __m512i r1b1r0 = _mm512_permutex2var_epi16(b.val, mask1, r.val);
-    __m512i g2r2g1 = _mm512_permutex2var_epi16(g.val, mask2, r.val);
+    __m512i b0g0b2 = _mm512_permutex2var_epi16(a.val, mask0, b.val);
+    __m512i r1b1r0 = _mm512_permutex2var_epi16(a.val, mask1, c.val);
+    __m512i g2r2g1 = _mm512_permutex2var_epi16(b.val, mask2, c.val);
     __m512i bgr0 = _mm512_mask_blend_epi16(0x24924924, b0g0b2, r1b1r0);
     __m512i bgr1 = _mm512_mask_blend_epi16(0x24924924, r1b1r0, g2r2g1);
@@ -2452,15 +2452,15 @@ inline void v_store_interleave( ushort* ptr, const v_uint16x32& b, const v_uint1
     }
 }
-inline void v_store_interleave( unsigned* ptr, const v_uint32x16& b, const v_uint32x16& g, const v_uint32x16& r,
+inline void v_store_interleave( unsigned* ptr, const v_uint32x16& a, const v_uint32x16& b, const v_uint32x16& c,
                                 hal::StoreMode mode = hal::STORE_UNALIGNED )
 {
     __m512i mask0 = _v512_set_epu32(26, 31, 15, 25, 30, 14, 24, 29, 13, 23, 28, 12, 22, 27, 11, 21);
     __m512i mask1 = _v512_set_epu32(31, 10, 25, 30, 9, 24, 29, 8, 23, 28, 7, 22, 27, 6, 21, 26);
-    __m512i g1b2g2 = _mm512_permutex2var_epi32(b.val, mask0, g.val);
-    __m512i r2r1b1 = _mm512_permutex2var_epi32(b.val, mask1, r.val);
+    __m512i g1b2g2 = _mm512_permutex2var_epi32(a.val, mask0, b.val);
+    __m512i r2r1b1 = _mm512_permutex2var_epi32(a.val, mask1, c.val);
-    __m512i bgr0 = _mm512_mask_expand_epi32(_mm512_mask_expand_epi32(_mm512_maskz_expand_epi32(0x9249, b.val), 0x2492, g.val), 0x4924, r.val);
+    __m512i bgr0 = _mm512_mask_expand_epi32(_mm512_mask_expand_epi32(_mm512_maskz_expand_epi32(0x9249, a.val), 0x2492, b.val), 0x4924, c.val);
     __m512i bgr1 = _mm512_mask_blend_epi32(0x9249, r2r1b1, g1b2g2);
     __m512i bgr2 = _mm512_mask_blend_epi32(0x9249, g1b2g2, r2r1b1);
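bgr0 above is assembled by chaining masked expands: _mm512_maskz_expand_epi32(k, v) scatters the first popcount(k) elements of v to the positions whose mask bit is set. With the 16-bit masks 0x9249, 0x2492 and 0x4924 (set bits at i % 3 == 0, 1 and 2 respectively), the three source planes land interleaved. Scalar model (hypothetical helper):

#include <stdint.h>

// Scalar model of a masked expand: consecutive src elements go to the
// destination slots whose mask bit is set; other slots are left as-is
// (zeroed beforehand for the maskz_ variant).
static void mask_expand_u32(uint32_t dst[16], uint16_t k, const uint32_t src[16])
{
    int j = 0;
    for (int i = 0; i < 16; i++)
        if ((k >> i) & 1)
            dst[i] = src[j++];
}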
@@ -2484,15 +2484,15 @@ inline void v_store_interleave( uint64* ptr, const v_uint64x8& b, const v_uint64
     }
 }
-inline void v_store_interleave( uint64* ptr, const v_uint64x8& b, const v_uint64x8& g, const v_uint64x8& r,
+inline void v_store_interleave( uint64* ptr, const v_uint64x8& a, const v_uint64x8& b, const v_uint64x8& c,
                                 hal::StoreMode mode = hal::STORE_UNALIGNED )
 {
     __m512i mask0 = _v512_set_epu64(5, 12, 7, 4, 11, 6, 3, 10);
     __m512i mask1 = _v512_set_epu64(15, 7, 4, 14, 6, 3, 13, 5);
-    __m512i r1b1b2 = _mm512_permutex2var_epi64(b.val, mask0, r.val);
-    __m512i g2r2g1 = _mm512_permutex2var_epi64(g.val, mask1, r.val);
+    __m512i r1b1b2 = _mm512_permutex2var_epi64(a.val, mask0, c.val);
+    __m512i g2r2g1 = _mm512_permutex2var_epi64(b.val, mask1, c.val);
-    __m512i bgr0 = _mm512_mask_expand_epi64(_mm512_mask_expand_epi64(_mm512_maskz_expand_epi64(0x49, b.val), 0x92, g.val), 0x24, r.val);
+    __m512i bgr0 = _mm512_mask_expand_epi64(_mm512_mask_expand_epi64(_mm512_maskz_expand_epi64(0x49, a.val), 0x92, b.val), 0x24, c.val);
     __m512i bgr1 = _mm512_mask_blend_epi64(0xdb, g2r2g1, r1b1b2);
     __m512i bgr2 = _mm512_mask_blend_epi64(0xdb, r1b1b2, g2r2g1);
@@ -2516,13 +2516,13 @@ inline void v_store_interleave( uint64* ptr, const v_uint64x8& b, const v_uint64
     }
 }
-inline void v_store_interleave( uchar* ptr, const v_uint8x64& b, const v_uint8x64& g, const v_uint8x64& r, const v_uint8x64& a,
+inline void v_store_interleave( uchar* ptr, const v_uint8x64& a, const v_uint8x64& b, const v_uint8x64& c, const v_uint8x64& d,
                                 hal::StoreMode mode = hal::STORE_UNALIGNED )
 {
     v_uint8x64 br01, br23, ga01, ga23;
-    v_zip(b, r, br01, br23);
-    v_zip(g, a, ga01, ga23);
+    v_zip(a, c, br01, br23);
+    v_zip(b, d, ga01, ga23);
     v_uint8x64 bgra0, bgra1, bgra2, bgra3;
     v_zip(br01, ga01, bgra0, bgra1);
     v_zip(br23, ga23, bgra2, bgra3);
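The 4-channel stores need no element masks at all: v_zip(x, y, lo, hi) interleaves two registers pairwise (lo gets the interleaved low halves, hi the high halves), and two rounds of zipping, first (a, c) and (b, d), then the br/ga intermediates, produce fully interleaved a b c d quads. Scalar model of one zip (hypothetical helper):

#include <stdint.h>
#include <stddef.h>

// Scalar model of v_zip on n-lane vectors: lo = x0 y0 x1 y1 ...,
// hi = the same interleave of the upper halves.
static void zip_u8(uint8_t* lo, uint8_t* hi,
                   const uint8_t* x, const uint8_t* y, size_t n)
{
    for (size_t i = 0; i < n / 2; i++)
    {
        lo[2 * i]     = x[i];
        lo[2 * i + 1] = y[i];
        hi[2 * i]     = x[n / 2 + i];
        hi[2 * i + 1] = y[n / 2 + i];
    }
}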
@@ -2550,13 +2550,13 @@ inline void v_store_interleave( uchar* ptr, const v_uint8x64& b, const v_uint8x6
     }
 }
-inline void v_store_interleave( ushort* ptr, const v_uint16x32& b, const v_uint16x32& g, const v_uint16x32& r, const v_uint16x32& a,
+inline void v_store_interleave( ushort* ptr, const v_uint16x32& a, const v_uint16x32& b, const v_uint16x32& c, const v_uint16x32& d,
                                 hal::StoreMode mode = hal::STORE_UNALIGNED )
 {
     v_uint16x32 br01, br23, ga01, ga23;
-    v_zip(b, r, br01, br23);
-    v_zip(g, a, ga01, ga23);
+    v_zip(a, c, br01, br23);
+    v_zip(b, d, ga01, ga23);
     v_uint16x32 bgra0, bgra1, bgra2, bgra3;
     v_zip(br01, ga01, bgra0, bgra1);
     v_zip(br23, ga23, bgra2, bgra3);
@@ -2584,13 +2584,13 @@ inline void v_store_interleave( ushort* ptr, const v_uint16x32& b, const v_uint1
     }
 }
-inline void v_store_interleave( unsigned* ptr, const v_uint32x16& b, const v_uint32x16& g, const v_uint32x16& r, const v_uint32x16& a,
+inline void v_store_interleave( unsigned* ptr, const v_uint32x16& a, const v_uint32x16& b, const v_uint32x16& c, const v_uint32x16& d,
                                 hal::StoreMode mode = hal::STORE_UNALIGNED )
 {
     v_uint32x16 br01, br23, ga01, ga23;
-    v_zip(b, r, br01, br23);
-    v_zip(g, a, ga01, ga23);
+    v_zip(a, c, br01, br23);
+    v_zip(b, d, ga01, ga23);
     v_uint32x16 bgra0, bgra1, bgra2, bgra3;
     v_zip(br01, ga01, bgra0, bgra1);
     v_zip(br23, ga23, bgra2, bgra3);
@@ -2618,13 +2618,13 @@ inline void v_store_interleave( unsigned* ptr, const v_uint32x16& b, const v_uin
     }
 }
-inline void v_store_interleave( uint64* ptr, const v_uint64x8& b, const v_uint64x8& g, const v_uint64x8& r, const v_uint64x8& a,
+inline void v_store_interleave( uint64* ptr, const v_uint64x8& a, const v_uint64x8& b, const v_uint64x8& c, const v_uint64x8& d,
                                 hal::StoreMode mode = hal::STORE_UNALIGNED )
 {
     v_uint64x8 br01, br23, ga01, ga23;
-    v_zip(b, r, br01, br23);
-    v_zip(g, a, ga01, ga23);
+    v_zip(a, c, br01, br23);
+    v_zip(b, d, ga01, ga23);
     v_uint64x8 bgra0, bgra1, bgra2, bgra3;
     v_zip(br01, ga01, bgra0, bgra1);
     v_zip(br23, ga23, bgra2, bgra3);
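After the rename, call sites read channel-neutrally. A hypothetical round-trip, assuming an AVX-512 build where CV_SIMD512 is enabled so the v_uint8x64 overloads from this file are compiled in:

#include <opencv2/core/hal/intrin.hpp>

void roundtrip_3ch(const unsigned char* src, unsigned char* dst)
{
#if CV_SIMD512
    using namespace cv;
    v_uint8x64 a, b, c;
    v_load_deinterleave(src, a, b, c);  // 64 pixels: a0 b0 c0 a1 b1 c1 ...
    v_store_interleave(dst, a, b, c);   // writes the same 192 bytes back
#endif
}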