submodule / opencv / Commits / d43597c1

Commit d43597c1 authored Sep 19, 2018 by Vitaly Tuzov
transform() implementation updated to utilize wide universal intrinsics
parent 73bf1708
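The change replaces OpenCV's fixed 128-bit universal intrinsic types (v_float32x4, v_int16x8, v_load, v_setall_*) with the width-agnostic "wide" aliases (v_float32, v_int16, vx_load, vx_setall_*), whose lane count follows the CV_SIMD_WIDTH the file is compiled for, so the same kernel can run in SSE, AVX2 or AVX-512 registers. A minimal sketch of that pattern, using an illustrative scale_f32 loop rather than code from this commit:

    #include "opencv2/core/hal/intrin.hpp"  // universal intrinsics: CV_SIMD, v_float32, vx_*

    using namespace cv;

    // Illustrative only: scale a float buffer by a constant with width-agnostic types.
    static void scale_f32(const float* src, float* dst, int len, float alpha)
    {
        int x = 0;
    #if CV_SIMD
        v_float32 va = vx_setall_f32(alpha);               // lane count = v_float32::nlanes (4, 8 or 16)
        for( ; x <= len - v_float32::nlanes; x += v_float32::nlanes )
            v_store(dst + x, vx_load(src + x) * va);       // identical source for SSE/AVX2/AVX-512 builds
        vx_cleanup();
    #endif
        for( ; x < len; x++ )                              // scalar tail
            dst[x] = src[x] * alpha;
    }

The scalar tail and the trailing vx_cleanup() call mirror the structure of the kernels in the diff below.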
Showing 1 changed file with 166 additions and 182 deletions

modules/core/src/matmul.simd.hpp  +166 -182
@@ -1451,115 +1451,82 @@ transform_( const T* src, T* dst, const WT* m, int len, int scn, int dcn )
     }
 }
 
-#if CV_SIMD128 && !defined(__aarch64__)
-static inline void
-load3x3Matrix( const float* m, v_float32x4& m0, v_float32x4& m1, v_float32x4& m2, v_float32x4& m3 )
-{
-    m0 = v_float32x4(m[0], m[4], m[8], 0);
-    m1 = v_float32x4(m[1], m[5], m[9], 0);
-    m2 = v_float32x4(m[2], m[6], m[10], 0);
-    m3 = v_float32x4(m[3], m[7], m[11], 0);
-}
-#endif
-
-#if CV_SIMD128
-static inline v_int16x8
-v_matmulvec(const v_int16x8 &v0, const v_int16x8 &m0, const v_int16x8 &m1, const v_int16x8 &m2, const v_int32x4 &m3, const int BITS)
-{
-    // v0 : 0 b0 g0 r0 b1 g1 r1 ?
-    v_int32x4 t0 = v_dotprod(v0, m0); // a0 b0 a1 b1
-    v_int32x4 t1 = v_dotprod(v0, m1); // c0 d0 c1 d1
-    v_int32x4 t2 = v_dotprod(v0, m2); // e0 f0 e1 f1
-    v_int32x4 t3 = v_setzero_s32();
-    v_int32x4 s0, s1, s2, s3;
-    v_transpose4x4(t0, t1, t2, t3, s0, s1, s2, s3);
-    s0 = s0 + s1 + m3; // B0 G0 R0 ?
-    s2 = s2 + s3 + m3; // B1 G1 R1 ?
-
-    s0 = s0 >> BITS;
-    s2 = s2 >> BITS;
-
-    v_int16x8 result = v_pack(s0, v_setzero_s32());                     // B0 G0 R0 0 0 0 0 0
-    result = v_reinterpret_as_s16(v_reinterpret_as_s64(result) << 16);  // 0 B0 G0 R0 0 0 0 0
-    result = result | v_pack(v_setzero_s32(), s2);                      // 0 B0 G0 R0 B1 G1 R1 0
-    return result;
-}
-#endif
-
 static void
 transform_8u( const uchar* src, uchar* dst, const float* m, int len, int scn, int dcn )
 {
-#if CV_SIMD128
+#if CV_SIMD
     const int BITS = 10, SCALE = 1 << BITS;
     const float MAX_M = (float)(1 << (15 - BITS));
 
-    if( hasSIMD128() && scn == 3 && dcn == 3 &&
-        std::abs(m[0]) < MAX_M && std::abs(m[1]) < MAX_M && std::abs(m[ 2]) < MAX_M && std::abs(m[ 3]) < MAX_M*256 &&
-        std::abs(m[4]) < MAX_M && std::abs(m[5]) < MAX_M && std::abs(m[ 6]) < MAX_M && std::abs(m[ 7]) < MAX_M*256 &&
-        std::abs(m[8]) < MAX_M && std::abs(m[9]) < MAX_M && std::abs(m[10]) < MAX_M && std::abs(m[11]) < MAX_M*256 )
+    if( scn == 3 && dcn == 3 &&
+        std::abs(m[0]) < MAX_M && std::abs(m[1]) < MAX_M && std::abs(m[ 2]) < MAX_M*256 && std::abs(m[ 3]) < MAX_M*256 &&
+        std::abs(m[4]) < MAX_M && std::abs(m[5]) < MAX_M && std::abs(m[ 6]) < MAX_M*256 && std::abs(m[ 7]) < MAX_M*256 &&
+        std::abs(m[8]) < MAX_M && std::abs(m[9]) < MAX_M && std::abs(m[10]) < MAX_M*256 && std::abs(m[11]) < MAX_M*256 )
     {
         const int nChannels = 3;
-        const int cWidth = v_int16x8::nlanes;
-        // faster fixed-point transformation
-        short m00 = saturate_cast<short>(m[0] * SCALE), m01 = saturate_cast<short>(m[1] * SCALE),
-            m02 = saturate_cast<short>(m[2] * SCALE), m10 = saturate_cast<short>(m[4] * SCALE),
-            m11 = saturate_cast<short>(m[5] * SCALE), m12 = saturate_cast<short>(m[6] * SCALE),
-            m20 = saturate_cast<short>(m[8] * SCALE), m21 = saturate_cast<short>(m[9] * SCALE),
-            m22 = saturate_cast<short>(m[10] * SCALE);
-        int m03 = saturate_cast<int>((m[3] + 0.5f) * SCALE), m13 = saturate_cast<int>((m[7] + 0.5f) * SCALE),
-            m23 = saturate_cast<int>((m[11] + 0.5f) * SCALE);
-        v_int16x8 m0 = v_int16x8(0, m00, m01, m02, m00, m01, m02, 0);
-        v_int16x8 m1 = v_int16x8(0, m10, m11, m12, m10, m11, m12, 0);
-        v_int16x8 m2 = v_int16x8(0, m20, m21, m22, m20, m21, m22, 0);
-        v_int32x4 m3 = v_int32x4(m03, m13, m23, 0);
-        int x = 0;
-
-        for (; x <= (len - cWidth) * nChannels; x += cWidth * nChannels)
+        union { short s[6]; int p[3]; } m16;
+        m16.s[0] = saturate_cast<short>(m[0] * SCALE); m16.s[1] = saturate_cast<short>(m[1] * SCALE);
+        m16.s[2] = saturate_cast<short>(m[4] * SCALE); m16.s[3] = saturate_cast<short>(m[5] * SCALE);
+        m16.s[4] = saturate_cast<short>(m[8] * SCALE); m16.s[5] = saturate_cast<short>(m[9] * SCALE);
+        int m32[] = {saturate_cast<int>(m[ 2] * SCALE), saturate_cast<int>(m[ 3] * SCALE),
+                     saturate_cast<int>(m[ 6] * SCALE), saturate_cast<int>(m[ 7] * SCALE),
+                     saturate_cast<int>(m[10] * SCALE), saturate_cast<int>(m[11] * SCALE)};
+        v_int16 m01 = v_reinterpret_as_s16(vx_setall_s32(m16.p[0]));
+        v_int32 m2 = vx_setall_s32(m32[0]);
+        v_int32 m3 = vx_setall_s32(m32[1]);
+        v_int16 m45 = v_reinterpret_as_s16(vx_setall_s32(m16.p[1]));
+        v_int32 m6 = vx_setall_s32(m32[2]);
+        v_int32 m7 = vx_setall_s32(m32[3]);
+        v_int16 m89 = v_reinterpret_as_s16(vx_setall_s32(m16.p[2]));
+        v_int32 m10 = vx_setall_s32(m32[4]);
+        v_int32 m11 = vx_setall_s32(m32[5]);
+        int x = 0;
+        for (; x <= (len - v_uint8::nlanes) * nChannels; x += v_uint8::nlanes * nChannels)
         {
-            // load 8 pixels
-            v_int16x8 v0 = v_reinterpret_as_s16(v_load_expand(src + x));
-            v_int16x8 v1 = v_reinterpret_as_s16(v_load_expand(src + x + cWidth));
-            v_int16x8 v2 = v_reinterpret_as_s16(v_load_expand(src + x + cWidth * 2));
-            v_int16x8 v3;
-
-            // rotate and pack
-            v3 = v_rotate_right<1>(v2);     // 0 b6 g6 r6 b7 g7 r7 0
-            v2 = v_rotate_left<5>(v2, v1);  // 0 b4 g4 r4 b5 g5 r5 0
-            v1 = v_rotate_left<3>(v1, v0);  // 0 b2 g2 r2 b3 g3 r3 0
-            v0 = v_rotate_left<1>(v0);      // 0 b0 g0 r0 b1 g1 r1 0
-
-            // multiply with matrix and normalize
-            v0 = v_matmulvec(v0, m0, m1, m2, m3, BITS); // 0 B0 G0 R0 B1 G1 R1 0
-            v1 = v_matmulvec(v1, m0, m1, m2, m3, BITS); // 0 B2 G2 R2 B3 G3 R3 0
-            v2 = v_matmulvec(v2, m0, m1, m2, m3, BITS); // 0 B4 G4 R4 B5 G5 R5 0
-            v3 = v_matmulvec(v3, m0, m1, m2, m3, BITS); // 0 B6 G6 R6 B7 G7 R7 0
-
-            // narrow down as uint8x16
-            v_uint8x16 z0 = v_pack_u(v0, v_setzero_s16()); // 0 B0 G0 R0 B1 G1 R1 0 0 0 0 0 0 0 0 0
-            v_uint8x16 z1 = v_pack_u(v1, v_setzero_s16()); // 0 B2 G2 R2 B3 G3 R3 0 0 0 0 0 0 0 0 0
-            v_uint8x16 z2 = v_pack_u(v2, v_setzero_s16()); // 0 B4 G4 R4 B5 G5 R5 0 0 0 0 0 0 0 0 0
-            v_uint8x16 z3 = v_pack_u(v3, v_setzero_s16()); // 0 B6 G6 R6 B7 G7 R7 0 0 0 0 0 0 0 0 0
-
-            // rotate and pack
-            z0 = v_reinterpret_as_u8(v_reinterpret_as_u64(z0) >>  8) | v_reinterpret_as_u8(v_reinterpret_as_u64(z1) << 40); // B0 G0 R0 B1 G1 R1 B2 G2 0 0 0 0 0 0 0 0
-            z1 = v_reinterpret_as_u8(v_reinterpret_as_u64(z1) >> 24) | v_reinterpret_as_u8(v_reinterpret_as_u64(z2) << 24); // R2 B3 G3 R3 B4 G4 R4 B5 0 0 0 0 0 0 0 0
-            z2 = v_reinterpret_as_u8(v_reinterpret_as_u64(z2) >> 40) | v_reinterpret_as_u8(v_reinterpret_as_u64(z3) <<  8); // G5 R6 B6 G6 R6 B7 G7 R7 0 0 0 0 0 0 0 0
-
-            // store on memory
-            v_store_low(dst + x, z0);
-            v_store_low(dst + x + cWidth, z1);
-            v_store_low(dst + x + cWidth * 2, z2);
+            v_uint8 b, g, r;
+            v_load_deinterleave(src + x, b, g, r);
+            v_uint8 bgl, bgh;
+            v_zip(b, g, bgl, bgh);
+            v_uint16 rl, rh;
+            v_expand(r, rl, rh);
+
+            v_int16 dbl, dbh, dgl, dgh, drl, drh;
+            v_uint16 p0, p2;
+            v_int32 p1, p3;
+            v_expand(bgl, p0, p2);
+            v_expand(v_reinterpret_as_s16(rl), p1, p3);
+            dbl = v_rshr_pack<BITS>(v_dotprod(v_reinterpret_as_s16(p0), m01) + p1 * m2 + m3,
+                                    v_dotprod(v_reinterpret_as_s16(p2), m01) + p3 * m2 + m3);
+            dgl = v_rshr_pack<BITS>(v_dotprod(v_reinterpret_as_s16(p0), m45) + p1 * m6 + m7,
+                                    v_dotprod(v_reinterpret_as_s16(p2), m45) + p3 * m6 + m7);
+            drl = v_rshr_pack<BITS>(v_dotprod(v_reinterpret_as_s16(p0), m89) + p1 * m10 + m11,
+                                    v_dotprod(v_reinterpret_as_s16(p2), m89) + p3 * m10 + m11);
+            v_expand(bgh, p0, p2);
+            v_expand(v_reinterpret_as_s16(rh), p1, p3);
+            dbh = v_rshr_pack<BITS>(v_dotprod(v_reinterpret_as_s16(p0), m01) + p1 * m2 + m3,
+                                    v_dotprod(v_reinterpret_as_s16(p2), m01) + p3 * m2 + m3);
+            dgh = v_rshr_pack<BITS>(v_dotprod(v_reinterpret_as_s16(p0), m45) + p1 * m6 + m7,
+                                    v_dotprod(v_reinterpret_as_s16(p2), m45) + p3 * m6 + m7);
+            drh = v_rshr_pack<BITS>(v_dotprod(v_reinterpret_as_s16(p0), m89) + p1 * m10 + m11,
+                                    v_dotprod(v_reinterpret_as_s16(p2), m89) + p3 * m10 + m11);
+            v_store_interleave(dst + x, v_pack_u(dbl, dbh), v_pack_u(dgl, dgh), v_pack_u(drl, drh));
         }
+        m32[1] = saturate_cast<int>((m[ 3] + 0.5f) * SCALE);
+        m32[3] = saturate_cast<int>((m[ 7] + 0.5f) * SCALE);
+        m32[5] = saturate_cast<int>((m[11] + 0.5f) * SCALE);
         for( ; x < len * nChannels; x += nChannels )
         {
             int v0 = src[x], v1 = src[x + 1], v2 = src[x + 2];
-            uchar t0 = saturate_cast<uchar>((m00 * v0 + m01 * v1 + m02 * v2 + m03) >> BITS);
-            uchar t1 = saturate_cast<uchar>((m10 * v0 + m11 * v1 + m12 * v2 + m13) >> BITS);
-            uchar t2 = saturate_cast<uchar>((m20 * v0 + m21 * v1 + m22 * v2 + m23) >> BITS);
+            uchar t0 = saturate_cast<uchar>((m16.s[0] * v0 + m16.s[1] * v1 + m32[0] * v2 + m32[1]) >> BITS);
+            uchar t1 = saturate_cast<uchar>((m16.s[2] * v0 + m16.s[3] * v1 + m32[2] * v2 + m32[3]) >> BITS);
+            uchar t2 = saturate_cast<uchar>((m16.s[4] * v0 + m16.s[5] * v1 + m32[4] * v2 + m32[5]) >> BITS);
             dst[x] = t0; dst[x + 1] = t1; dst[x + 2] = t2;
         }
+        vx_cleanup();
         return;
     }
 #endif
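Side note on the 8-bit path above: both the removed and the added loops evaluate the same fixed-point scheme, with coefficients pre-scaled by 2^BITS (BITS = 10, SCALE = 1024), +0.5 folded into the offsets for rounding, and a final right shift by BITS. A scalar sketch of that arithmetic for one output channel, assuming mrow is one row of the 3x4 matrix (illustrative, not part of the patch):

    #include <algorithm>
    #include <cstdint>

    // Illustrative scalar version of one output channel:
    //   dst = saturate_u8(mrow[0]*b + mrow[1]*g + mrow[2]*r + mrow[3])
    // computed the way the vector kernel does it, in 2^BITS fixed point.
    static uint8_t transform_channel_fixed(const float mrow[4], int b, int g, int r)
    {
        const int BITS = 10, SCALE = 1 << BITS;              // same constants as in transform_8u
        int c0 = (int)(mrow[0] * SCALE), c1 = (int)(mrow[1] * SCALE), c2 = (int)(mrow[2] * SCALE);
        int c3 = (int)((mrow[3] + 0.5f) * SCALE);            // +0.5 folded into the offset => rounding
        int v  = (c0 * b + c1 * g + c2 * r + c3) >> BITS;    // right shift replaces the division by SCALE
        return (uint8_t)std::min(std::max(v, 0), 255);       // behaves like saturate_cast<uchar>
    }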
@@ -1570,64 +1537,65 @@ transform_8u( const uchar* src, uchar* dst, const float* m, int len, int scn, int dcn )
 static void
 transform_16u( const ushort* src, ushort* dst, const float* m, int len, int scn, int dcn )
 {
-#if CV_SIMD128 && !defined(__aarch64__)
-    if( hasSIMD128() && scn == 3 && dcn == 3 )
+#if CV_SIMD && !defined(__aarch64__)
+    if( scn == 3 && dcn == 3 )
     {
-        const int nChannels = 3;
-        const int cWidth = v_float32x4::nlanes;
-        v_int16x8 delta = v_int16x8(0, -32768, -32768, -32768, -32768, -32768, -32768, 0);
-        v_float32x4 m0, m1, m2, m3;
-        load3x3Matrix(m, m0, m1, m2, m3);
-        m3 -= v_float32x4(32768.f, 32768.f, 32768.f, 0.f);
-
         int x = 0;
-        for( ; x <= (len - cWidth) * nChannels; x += cWidth * nChannels )
+#if CV_SIMD_WIDTH > 16
+        v_float32 m0  = vx_setall_f32(m[ 0]);
+        v_float32 m1  = vx_setall_f32(m[ 1]);
+        v_float32 m2  = vx_setall_f32(m[ 2]);
+        v_float32 m3  = vx_setall_f32(m[ 3] - 32768.f);
+        v_float32 m4  = vx_setall_f32(m[ 4]);
+        v_float32 m5  = vx_setall_f32(m[ 5]);
+        v_float32 m6  = vx_setall_f32(m[ 6]);
+        v_float32 m7  = vx_setall_f32(m[ 7] - 32768.f);
+        v_float32 m8  = vx_setall_f32(m[ 8]);
+        v_float32 m9  = vx_setall_f32(m[ 9]);
+        v_float32 m10 = vx_setall_f32(m[10]);
+        v_float32 m11 = vx_setall_f32(m[11] - 32768.f);
+        v_int16 delta = vx_setall_s16(-32768);
+        for (; x <= (len - v_uint16::nlanes) * 3; x += v_uint16::nlanes * 3)
         {
-            // load 4 pixels
-            v_uint16x8 v0_16 = v_load(src + x);                   // b0 g0 r0 b1 g1 r1 b2 g2
-            v_uint16x8 v2_16 = v_load_low(src + x + cWidth * 2);  // r2 b3 g3 r3 ? ? ? ?
-
-            // expand to 4 vectors
-            v_uint32x4 v0_32, v1_32, v2_32, v3_32, dummy_32;
-            v_expand(v_rotate_right<3>(v0_16), v1_32, dummy_32);         // b1 g1 r1
-            v_expand(v_rotate_right<1>(v2_16), v3_32, dummy_32);         // b3 g3 r3
-            v_expand(v_rotate_right<6>(v0_16, v2_16), v2_32, dummy_32);  // b2 g2 r2
-            v_expand(v0_16, v0_32, dummy_32);                            // b0 g0 r0
-
-            // convert to float32x4
-            v_float32x4 x0 = v_cvt_f32(v_reinterpret_as_s32(v0_32));  // b0 g0 r0
-            v_float32x4 x1 = v_cvt_f32(v_reinterpret_as_s32(v1_32));  // b1 g1 r1
-            v_float32x4 x2 = v_cvt_f32(v_reinterpret_as_s32(v2_32));  // b2 g2 r2
-            v_float32x4 x3 = v_cvt_f32(v_reinterpret_as_s32(v3_32));  // b3 g3 r3
-
-            // multiply and convert back to int32x4
-            v_int32x4 y0, y1, y2, y3;
-            y0 = v_round(v_matmuladd(x0, m0, m1, m2, m3));  // B0 G0 R0
-            y1 = v_round(v_matmuladd(x1, m0, m1, m2, m3));  // B1 G1 R1
-            y2 = v_round(v_matmuladd(x2, m0, m1, m2, m3));  // B2 G2 R2
-            y3 = v_round(v_matmuladd(x3, m0, m1, m2, m3));  // B3 G3 R3
-
-            // narrow down to int16x8
-            v_int16x8 v0 = v_add_wrap(v_pack(v_rotate_left<1>(y0), y1), delta);  // 0 B0 G0 R0 B1 G1 R1 0
-            v_int16x8 v2 = v_add_wrap(v_pack(v_rotate_left<1>(y2), y3), delta);  // 0 B2 G2 R2 B3 G3 R3 0
-
-            // rotate and pack
-            v0 = v_rotate_right<1>(v0) | v_rotate_left<5>(v2);  // B0 G0 R0 B1 G1 R1 B2 G2
-            v2 = v_rotate_right<3>(v2);                         // R2 B3 G3 R3 0 0 0 0
-
-            // store 4 pixels
-            v_store(dst + x, v_reinterpret_as_u16(v0));
-            v_store_low(dst + x + cWidth * 2, v_reinterpret_as_u16(v2));
+            v_uint16 b, g, r;
+            v_load_deinterleave(src + x, b, g, r);
+            v_uint32 bl, bh, gl, gh, rl, rh;
+            v_expand(b, bl, bh);
+            v_expand(g, gl, gh);
+            v_expand(r, rl, rh);
+
+            v_int16 db, dg, dr;
+            db = v_add_wrap(v_pack(v_round(v_muladd(v_cvt_f32(v_reinterpret_as_s32(bl)), m0, v_muladd(v_cvt_f32(v_reinterpret_as_s32(gl)), m1, v_muladd(v_cvt_f32(v_reinterpret_as_s32(rl)), m2, m3)))),
+                                   v_round(v_muladd(v_cvt_f32(v_reinterpret_as_s32(bh)), m0, v_muladd(v_cvt_f32(v_reinterpret_as_s32(gh)), m1, v_muladd(v_cvt_f32(v_reinterpret_as_s32(rh)), m2, m3))))), delta);
+            dg = v_add_wrap(v_pack(v_round(v_muladd(v_cvt_f32(v_reinterpret_as_s32(bl)), m4, v_muladd(v_cvt_f32(v_reinterpret_as_s32(gl)), m5, v_muladd(v_cvt_f32(v_reinterpret_as_s32(rl)), m6, m7)))),
+                                   v_round(v_muladd(v_cvt_f32(v_reinterpret_as_s32(bh)), m4, v_muladd(v_cvt_f32(v_reinterpret_as_s32(gh)), m5, v_muladd(v_cvt_f32(v_reinterpret_as_s32(rh)), m6, m7))))), delta);
+            dr = v_add_wrap(v_pack(v_round(v_muladd(v_cvt_f32(v_reinterpret_as_s32(bl)), m8, v_muladd(v_cvt_f32(v_reinterpret_as_s32(gl)), m9, v_muladd(v_cvt_f32(v_reinterpret_as_s32(rl)), m10, m11)))),
+                                   v_round(v_muladd(v_cvt_f32(v_reinterpret_as_s32(bh)), m8, v_muladd(v_cvt_f32(v_reinterpret_as_s32(gh)), m9, v_muladd(v_cvt_f32(v_reinterpret_as_s32(rh)), m10, m11))))), delta);
+            v_store_interleave(dst + x, v_reinterpret_as_u16(db), v_reinterpret_as_u16(dg), v_reinterpret_as_u16(dr));
         }
-
-        for( ; x < len * nChannels; x += nChannels )
+#endif
+        v_float32x4 _m0l(m[0], m[4], m[ 8], 0.f);
+        v_float32x4 _m1l(m[1], m[5], m[ 9], 0.f);
+        v_float32x4 _m2l(m[2], m[6], m[10], 0.f);
+        v_float32x4 _m3l(m[3] - 32768.f, m[7] - 32768.f, m[11] - 32768.f, 0.f);
+        v_float32x4 _m0h = v_rotate_left<1>(_m0l);
+        v_float32x4 _m1h = v_rotate_left<1>(_m1l);
+        v_float32x4 _m2h = v_rotate_left<1>(_m2l);
+        v_float32x4 _m3h = v_rotate_left<1>(_m3l);
+        v_int16x8 _delta(0, -32768, -32768, -32768, -32768, -32768, -32768, 0);
+        for( ; x <= len * 3 - v_uint16x8::nlanes; x += 3 * v_uint16x8::nlanes / 4 )
+            v_store(dst + x, v_rotate_right<1>(v_reinterpret_as_u16(v_add_wrap(v_pack(
+                v_round(v_matmuladd(v_cvt_f32(v_reinterpret_as_s32(v_load_expand(src + x))), _m0h, _m1h, _m2h, _m3h)),
+                v_round(v_matmuladd(v_cvt_f32(v_reinterpret_as_s32(v_load_expand(src + x + 3))), _m0l, _m1l, _m2l, _m3l))), _delta))));
+        for( ; x < len * 3; x += 3 )
         {
             float v0 = src[x], v1 = src[x + 1], v2 = src[x + 2];
-            ushort t0 = saturate_cast<ushort>(m[0] * v0 + m[1] * v1 + m[2] * v2 + m[3]);
-            ushort t1 = saturate_cast<ushort>(m[4] * v0 + m[5] * v1 + m[6] * v2 + m[7]);
+            ushort t0 = saturate_cast<ushort>(m[0] * v0 + m[1] * v1 + m[ 2] * v2 + m[ 3]);
+            ushort t1 = saturate_cast<ushort>(m[4] * v0 + m[5] * v1 + m[ 6] * v2 + m[ 7]);
             ushort t2 = saturate_cast<ushort>(m[8] * v0 + m[9] * v1 + m[10] * v2 + m[11]);
             dst[x] = t0; dst[x + 1] = t1; dst[x + 2] = t2;
         }
+        vx_cleanup();
         return;
     }
 #endif
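A detail of the 16-bit path worth noting (commentary, not part of the diff): only a saturating signed 32-to-16 pack is used here, so the kernel biases the offsets by -32768, packs to signed 16-bit, and restores the bias with a wrap-around add (v_add_wrap with the -32768 delta) before reinterpreting the lanes as unsigned. A scalar sketch of that bias trick:

    #include <cstdint>

    // Illustrative: saturate a float result to uint16_t using only signed 16-bit saturation,
    // mirroring the -32768 bias plus wrap-around add used by the vector path.
    static uint16_t pack_u16_via_s16(float t)
    {
        float biased = t - 32768.f;                       // target range becomes [-32768, 32767]
        if (biased < -32768.f) biased = -32768.f;         // signed saturation, as v_pack does
        if (biased >  32767.f) biased =  32767.f;
        int16_t s = (int16_t)(int)(biased + (biased >= 0 ? 0.5f : -0.5f));  // round to nearest (the kernel uses v_round)
        return (uint16_t)(s - 32768);                     // wrap-around -32768 == reinterpret as unsigned
    }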
@@ -1638,52 +1606,68 @@ transform_16u( const ushort* src, ushort* dst, const float* m, int len, int scn, int dcn )
 static void
 transform_32f( const float* src, float* dst, const float* m, int len, int scn, int dcn )
 {
-#if CV_SIMD128 && !defined(__aarch64__)
-    if( hasSIMD128() )
+#if CV_SIMD && !defined(__aarch64__)
+    int x = 0;
+    if( scn == 3 && dcn == 3 )
     {
-        int x = 0;
-        if( scn == 3 && dcn == 3 )
+        int idx[v_float32::nlanes/2];
+        for( int i = 0; i < v_float32::nlanes/4; i++ )
         {
-            const int cWidth = 3;
-            v_float32x4 m0, m1, m2, m3;
-            load3x3Matrix(m, m0, m1, m2, m3);
-
-            for( ; x < (len - 1) * cWidth; x += cWidth )
-            {
-                v_float32x4 x0 = v_load(src + x);
-                v_float32x4 y0 = v_matmuladd(x0, m0, m1, m2, m3);
-                v_store_low(dst + x, y0);
-                dst[x + 2] = v_combine_high(y0, y0).get0();
-            }
-
-            for( ; x < len * cWidth; x += cWidth )
-            {
-                float v0 = src[x], v1 = src[x + 1], v2 = src[x + 2];
-                float t0 = saturate_cast<float>(m[0] * v0 + m[1] * v1 + m[2] * v2 + m[3]);
-                float t1 = saturate_cast<float>(m[4] * v0 + m[5] * v1 + m[6] * v2 + m[7]);
-                float t2 = saturate_cast<float>(m[8] * v0 + m[9] * v1 + m[10] * v2 + m[11]);
-                dst[x] = t0; dst[x + 1] = t1; dst[x + 2] = t2;
-            }
-            return;
+            idx[i] = 3*i;
+            idx[i + v_float32::nlanes/4] = 0;
         }
+        float _m[] = { m[0], m[4], m[ 8], 0.f,
+                       m[1], m[5], m[ 9], 0.f,
+                       m[2], m[6], m[10], 0.f,
+                       m[3], m[7], m[11], 0.f };
+        v_float32 m0 = vx_lut_quads(_m     , idx + v_float32::nlanes/4);
+        v_float32 m1 = vx_lut_quads(_m +  4, idx + v_float32::nlanes/4);
+        v_float32 m2 = vx_lut_quads(_m +  8, idx + v_float32::nlanes/4);
+        v_float32 m3 = vx_lut_quads(_m + 12, idx + v_float32::nlanes/4);
+        for( ; x <= len * 3 - v_float32::nlanes; x += 3 * v_float32::nlanes / 4 )
+            v_store(dst + x, v_pack_triplets(v_matmuladd(vx_lut_quads(src + x, idx), m0, m1, m2, m3)));
+        for( ; x < len * 3; x += 3 )
+        {
+            float v0 = src[x], v1 = src[x + 1], v2 = src[x + 2];
+            float t0 = saturate_cast<float>(m[0] * v0 + m[1] * v1 + m[ 2] * v2 + m[ 3]);
+            float t1 = saturate_cast<float>(m[4] * v0 + m[5] * v1 + m[ 6] * v2 + m[ 7]);
+            float t2 = saturate_cast<float>(m[8] * v0 + m[9] * v1 + m[10] * v2 + m[11]);
+            dst[x] = t0; dst[x + 1] = t1; dst[x + 2] = t2;
+        }
+        vx_cleanup();
+        return;
+    }
 
-        if( scn == 4 && dcn == 4 )
+    if( scn == 4 && dcn == 4 )
+    {
+#if CV_SIMD_WIDTH > 16
+        int idx[v_float32::nlanes/4];
+        for( int i = 0; i < v_float32::nlanes/4; i++ )
+            idx[i] = 0;
+        float _m[] = { m[4], m[9], m[14], m[19] };
+        v_float32 m0 = vx_lut_quads(m     , idx);
+        v_float32 m1 = vx_lut_quads(m +  5, idx);
+        v_float32 m2 = vx_lut_quads(m + 10, idx);
+        v_float32 m3 = vx_lut_quads(m + 15, idx);
+        v_float32 m4 = vx_lut_quads(_m, idx);
+        for( ; x <= len * 4 - v_float32::nlanes; x += v_float32::nlanes )
         {
-            const int cWidth = 4;
-            v_float32x4 m0 = v_float32x4(m[0], m[5], m[10], m[15]);
-            v_float32x4 m1 = v_float32x4(m[1], m[6], m[11], m[16]);
-            v_float32x4 m2 = v_float32x4(m[2], m[7], m[12], m[17]);
-            v_float32x4 m3 = v_float32x4(m[3], m[8], m[13], m[18]);
-            v_float32x4 m4 = v_float32x4(m[4], m[9], m[14], m[19]);
-
-            for( ; x < len * cWidth; x += cWidth )
-            {
-                v_float32x4 x0 = v_load(src + x);
-                v_float32x4 y0 = v_matmul(x0, m0, m1, m2, m3) + m4;
-                v_store(dst + x, y0);
-            }
-            return;
+            v_float32 v_src = vx_load(src + x);
+            v_store(dst + x, v_reduce_sum4(v_src * m0, v_src * m1, v_src * m2, v_src * m3) + m4);
         }
+#endif
+        v_float32x4 _m0 = v_load(m     );
+        v_float32x4 _m1 = v_load(m +  5);
+        v_float32x4 _m2 = v_load(m + 10);
+        v_float32x4 _m3 = v_load(m + 15);
+        v_float32x4 _m4(m[4], m[9], m[14], m[19]);
+        for( ; x < len * 4; x += v_float32x4::nlanes )
+        {
+            v_float32x4 v_src = v_load(src + x);
+            v_store(dst + x, v_reduce_sum4(v_src * _m0, v_src * _m1, v_src * _m2, v_src * _m3) + _m4);
+        }
+        vx_cleanup();
         return;
     }
 #endif
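These kernels sit behind cv::transform(); a typical call that would exercise the 3-channel float path, shown here only as a usage example and not as part of the commit:

    #include <opencv2/core.hpp>

    int main()
    {
        cv::Mat src(480, 640, CV_32FC3, cv::Scalar(0.2, 0.5, 0.8));
        // 3x4 matrix: dst_pixel = M(:, 0:2) * src_pixel + M(:, 3)
        cv::Matx34f M(0.5f, 0.f,  0.f,  0.1f,
                      0.f,  0.5f, 0.f,  0.1f,
                      0.f,  0.f,  0.5f, 0.1f);
        cv::Mat dst;
        cv::transform(src, dst, M);   // 3-channel float input dispatches to the transform_32f kernel
        return 0;
    }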