submodule / opencv · Commits · 34df28db

Commit 34df28db, authored Oct 17, 2019 by Alexander Alekhin

Merge pull request #15180 from terfendail:wintr_stereobm

Parents: a2b3cd9a 0a1b9573

Showing 1 changed file with 206 additions and 201 deletions.

modules/calib3d/src/stereobm.cpp (+206 -201)
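Summary of the change: the StereoBM SIMD paths move from the fixed 128-bit universal intrinsics (#if CV_SIMD128, v_uint8x16 / v_int16x8 / v_int32x4, v_load / v_setall_*, hard-coded steps of 8 and 16) to the width-agnostic wide universal intrinsics (#if CV_SIMD, v_uint8 / v_int16 / v_int32, vx_load / vx_setall_*, steps of ::nlanes), so the same source vectorizes to 128-, 256-, or 512-bit registers. Because ndisp is no longer assumed to be a multiple of the vector width, each vector loop gains explicit tail handling. A minimal sketch of the recurring pattern follows; the function and array names are hypothetical illustrations, not from the patch:

    #include <cstdlib>
    #include <opencv2/core/hal/intrin.hpp>

    // Hypothetical example in the style the patch adopts: accumulate |a - b|
    // of two uchar rows into a ushort accumulator.
    static void accumAbsDiff(const uchar* a, const uchar* b, ushort* acc, int n)
    {
        int d = 0;
    #if CV_SIMD
        // Full vectors: v_uint8::nlanes is 16 under SSE2/NEON, 32 under AVX2.
        for (; d <= n - cv::v_uint8::nlanes; d += cv::v_uint8::nlanes)
        {
            cv::v_uint8 diff = cv::v_absdiff(cv::vx_load(a + d), cv::vx_load(b + d));
            cv::v_store(acc + d, cv::vx_load(acc + d) + cv::v_expand_low(diff));
            cv::v_store(acc + d + cv::v_uint16::nlanes,
                        cv::vx_load(acc + d + cv::v_uint16::nlanes) + cv::v_expand_high(diff));
        }
        // Optional half-width step, as the patch adds via vx_load_low.
        if (d <= n - cv::v_uint16::nlanes)
        {
            cv::v_uint8 diff = cv::v_absdiff(cv::vx_load_low(a + d), cv::vx_load_low(b + d));
            cv::v_store(acc + d, cv::vx_load(acc + d) + cv::v_expand_low(diff));
            d += cv::v_uint16::nlanes;
        }
    #endif
        for (; d < n; d++)  // scalar remainder
            acc[d] = (ushort)(acc[d] + std::abs(a[d] - b[d]));
    }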
@@ -216,30 +216,30 @@ prefilterXSobel( const Mat& src, Mat& dst, int ftzero )
         dptr0[0] = dptr0[size.width-1] = dptr1[0] = dptr1[size.width-1] = val0;
         x = 1;
-#if CV_SIMD128
+#if CV_SIMD
         {
-            v_int16x8 ftz = v_setall_s16((short) ftzero);
-            v_int16x8 ftz2 = v_setall_s16((short)(ftzero*2));
-            v_int16x8 z = v_setzero_s16();
+            v_int16 ftz = vx_setall_s16((short) ftzero);
+            v_int16 ftz2 = vx_setall_s16((short)(ftzero*2));
+            v_int16 z = vx_setzero_s16();

-            for(; x <= (size.width - 1) - 8; x += 8)
+            for(; x <= (size.width - 1) - v_int16::nlanes; x += v_int16::nlanes)
             {
-                v_int16x8 s00 = v_reinterpret_as_s16(v_load_expand(srow0 + x + 1));
-                v_int16x8 s01 = v_reinterpret_as_s16(v_load_expand(srow0 + x - 1));
-                v_int16x8 s10 = v_reinterpret_as_s16(v_load_expand(srow1 + x + 1));
-                v_int16x8 s11 = v_reinterpret_as_s16(v_load_expand(srow1 + x - 1));
-                v_int16x8 s20 = v_reinterpret_as_s16(v_load_expand(srow2 + x + 1));
-                v_int16x8 s21 = v_reinterpret_as_s16(v_load_expand(srow2 + x - 1));
-                v_int16x8 s30 = v_reinterpret_as_s16(v_load_expand(srow3 + x + 1));
-                v_int16x8 s31 = v_reinterpret_as_s16(v_load_expand(srow3 + x - 1));
-                v_int16x8 d0 = s00 - s01;
-                v_int16x8 d1 = s10 - s11;
-                v_int16x8 d2 = s20 - s21;
-                v_int16x8 d3 = s30 - s31;
-                v_uint16x8 v0 = v_reinterpret_as_u16(v_max(v_min(d0 + d1 + d1 + d2 + ftz, ftz2), z));
-                v_uint16x8 v1 = v_reinterpret_as_u16(v_max(v_min(d1 + d2 + d2 + d3 + ftz, ftz2), z));
+                v_int16 s00 = v_reinterpret_as_s16(vx_load_expand(srow0 + x + 1));
+                v_int16 s01 = v_reinterpret_as_s16(vx_load_expand(srow0 + x - 1));
+                v_int16 s10 = v_reinterpret_as_s16(vx_load_expand(srow1 + x + 1));
+                v_int16 s11 = v_reinterpret_as_s16(vx_load_expand(srow1 + x - 1));
+                v_int16 s20 = v_reinterpret_as_s16(vx_load_expand(srow2 + x + 1));
+                v_int16 s21 = v_reinterpret_as_s16(vx_load_expand(srow2 + x - 1));
+                v_int16 s30 = v_reinterpret_as_s16(vx_load_expand(srow3 + x + 1));
+                v_int16 s31 = v_reinterpret_as_s16(vx_load_expand(srow3 + x - 1));
+                v_int16 d0 = s00 - s01;
+                v_int16 d1 = s10 - s11;
+                v_int16 d2 = s20 - s21;
+                v_int16 d3 = s30 - s31;
+                v_uint16 v0 = v_reinterpret_as_u16(v_max(v_min(d0 + d1 + d1 + d2 + ftz, ftz2), z));
+                v_uint16 v1 = v_reinterpret_as_u16(v_max(v_min(d1 + d2 + d2 + d3 + ftz, ftz2), z));
                 v_pack_store(dptr0 + x, v0);
                 v_pack_store(dptr1 + x, v1);
@@ -262,10 +262,10 @@ prefilterXSobel( const Mat& src, Mat& dst, int ftzero )
     {
         uchar* dptr = dst.ptr<uchar>(y);
         x = 0;
-#if CV_SIMD128
+#if CV_SIMD
         {
-            v_uint8x16 val0_16 = v_setall_u8(val0);
-            for(; x <= size.width - 16; x += 16)
+            v_uint8 val0_16 = vx_setall_u8(val0);
+            for(; x <= size.width - v_uint8::nlanes; x += v_uint8::nlanes)
                 v_store(dptr + x, val0_16);
         }
 #endif
@@ -309,13 +309,13 @@ inline int dispDescale(int v1, int v2, int d)
     return (int)(v1*256 + (d != 0 ? v2*256/d : 0)); // no need to add 127, this will be converted to float
 }

-#if CV_SIMD128
+#if CV_SIMD
 template <typename dType>
 static void findStereoCorrespondenceBM_SIMD( const Mat& left, const Mat& right,
                                              Mat& disp, Mat& cost, StereoBMParams& state,
                                              uchar* buf, int _dy0, int _dy1 )
 {
-    const int ALIGN = 16;
+    const int ALIGN = CV_SIMD_WIDTH;
     int x, y, d;
     int wsz = state.SADWindowSize, wsz2 = wsz/2;
     int dy0 = MIN(_dy0, wsz2+1), dy1 = MIN(_dy1, wsz2+1);
@@ -345,7 +345,9 @@ static void findStereoCorrespondenceBM_SIMD( const Mat& left, const Mat& right,
     int coststep = cost.data ? (int)(cost.step/sizeof(costbuf)) : 0;
     const int TABSZ = 256;
     uchar tab[TABSZ];
-    const v_int16x8 d0_8 = v_int16x8(0, 1, 2, 3, 4, 5, 6, 7), dd_8 = v_setall_s16(8);
+    short v_seq[v_int16::nlanes];
+    for (short i = 0; i < v_int16::nlanes; ++i)
+        v_seq[i] = i;

     sad = (ushort*)alignPtr(buf + sizeof(sad[0]), ALIGN);
     hsad0 = (ushort*)alignPtr(sad + ndisp + 1 + dy0*ndisp, ALIGN);
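The removed d0_8 / dd_8 constants hard-coded the eight lane indices 0..7; the v_seq array that replaces them is filled at run time, so the lane-index vector matches whatever width the build selects. Roughly, as a hypothetical standalone helper (not code from the patch):

    #include <opencv2/core/hal/intrin.hpp>

    #if CV_SIMD
    // Hypothetical helper: build the lane-index vector 0, 1, ..., nlanes-1
    // at whatever width CV_SIMD provides.
    static cv::v_int16 laneIndices()
    {
        short v_seq[cv::v_int16::nlanes];
        for (short i = 0; i < cv::v_int16::nlanes; ++i)
            v_seq[i] = i;
        return cv::vx_load(v_seq);  // replaces the fixed v_int16x8(0,1,...,7)
    }
    #endif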
@@ -368,20 +370,26 @@ static void findStereoCorrespondenceBM_SIMD( const Mat& left, const Mat& right,
         for( y = -dy0; y < height + dy1; y++, hsad += ndisp, cbuf += ndisp, lptr += sstep, rptr += sstep )
         {
             int lval = lptr[0];
-            v_uint8x16 lv = v_setall_u8((uchar)lval);
-            for( d = 0; d < ndisp; d += 16 )
+            v_uint8 lv = vx_setall_u8((uchar)lval);
+            for( d = 0; d <= ndisp - v_uint8::nlanes; d += v_uint8::nlanes )
             {
-                v_uint8x16 rv = v_load(rptr + d);
-                v_uint16x8 hsad_l = v_load(hsad + d);
-                v_uint16x8 hsad_h = v_load(hsad + d + 8);
-                v_uint8x16 diff = v_absdiff(lv, rv);
+                v_uint8 diff = v_absdiff(lv, vx_load(rptr + d));
                 v_store(cbuf + d, diff);
-                v_uint16x8 diff0, diff1;
-                v_expand(diff, diff0, diff1);
-                hsad_l += diff0;
-                hsad_h += diff1;
-                v_store(hsad + d, hsad_l);
-                v_store(hsad + d + 8, hsad_h);
+                v_store(hsad + d, vx_load(hsad + d) + v_expand_low(diff));
+                v_store(hsad + d + v_uint16::nlanes, vx_load(hsad + d + v_uint16::nlanes) + v_expand_high(diff));
+            }
+            if( d <= ndisp - v_uint16::nlanes )
+            {
+                v_uint8 diff = v_absdiff(lv, vx_load_low(rptr + d));
+                v_store_low(cbuf + d, diff);
+                v_store(hsad + d, vx_load(hsad + d) + v_expand_low(diff));
+                d += v_uint16::nlanes;
+            }
+            for( ; d < ndisp; d++ )
+            {
+                int diff = abs(lval - rptr[d]);
+                cbuf[d] = (uchar)diff;
+                hsad[d] += (ushort)diff;
             }
             htext[y] += tab[lval];
         }
@@ -412,24 +420,27 @@ static void findStereoCorrespondenceBM_SIMD( const Mat& left, const Mat& right,
              hsad += ndisp, lptr += sstep, lptr_sub += sstep, rptr += sstep )
         {
             int lval = lptr[0];
-            v_uint8x16 lv = v_setall_u8((uchar)lval);
-            for( d = 0; d < ndisp; d += 16 )
+            v_uint8 lv = vx_setall_u8((uchar)lval);
+            for( d = 0; d <= ndisp - v_uint8::nlanes; d += v_uint8::nlanes )
             {
-                v_uint8x16 rv = v_load(rptr + d);
-                v_uint16x8 hsad_l = v_load(hsad + d);
-                v_uint16x8 hsad_h = v_load(hsad + d + 8);
-                v_uint8x16 cbs = v_load(cbuf_sub + d);
-                v_uint8x16 diff = v_absdiff(lv, rv);
-                v_int16x8 diff_l, diff_h, cbs_l, cbs_h;
+                v_uint8 diff = v_absdiff(lv, vx_load(rptr + d));
+                v_int8 cbs = v_reinterpret_as_s8(vx_load(cbuf_sub + d));
                 v_store(cbuf + d, diff);
-                v_expand(v_reinterpret_as_s8(diff), diff_l, diff_h);
-                v_expand(v_reinterpret_as_s8(cbs), cbs_l, cbs_h);
-                diff_l -= cbs_l;
-                diff_h -= cbs_h;
-                hsad_h = v_reinterpret_as_u16(v_reinterpret_as_s16(hsad_h) + diff_h);
-                hsad_l = v_reinterpret_as_u16(v_reinterpret_as_s16(hsad_l) + diff_l);
-                v_store(hsad + d, hsad_l);
-                v_store(hsad + d + 8, hsad_h);
+                v_store(hsad + d, v_reinterpret_as_u16(v_reinterpret_as_s16(vx_load(hsad + d) + v_expand_low(diff)) - v_expand_low(cbs)));
+                v_store(hsad + d + v_uint16::nlanes, v_reinterpret_as_u16(v_reinterpret_as_s16(vx_load(hsad + d + v_uint16::nlanes) + v_expand_high(diff)) - v_expand_high(cbs)));
+            }
+            if( d <= ndisp - v_uint16::nlanes )
+            {
+                v_uint8 diff = v_absdiff(lv, vx_load_low(rptr + d));
+                v_store_low(cbuf + d, diff);
+                v_store(hsad + d, v_reinterpret_as_u16(v_reinterpret_as_s16(vx_load(hsad + d) + v_expand_low(diff)) - vx_load_expand((schar*)cbuf_sub + d)));
+                d += v_uint16::nlanes;
+            }
+            for( ; d < ndisp; d++ )
+            {
+                int diff = abs(lval - rptr[d]);
+                cbuf[d] = (uchar)diff;
+                hsad[d] = hsad[d] + (ushort)diff - cbuf_sub[d];
             }
             htext[y] += tab[lval] - tab[lptr_sub[0]];
         }
@@ -446,17 +457,25 @@ static void findStereoCorrespondenceBM_SIMD( const Mat& left, const Mat& right,
         hsad = hsad0 + (1 - dy0)*ndisp;
         for( y = 1 - dy0; y < wsz2; y++, hsad += ndisp )
-            for( d = 0; d <= ndisp-16; d += 16 )
+        {
+            for( d = 0; d <= ndisp - 2*v_uint16::nlanes; d += 2*v_uint16::nlanes )
             {
-                v_uint16x8 s0 = v_load(sad + d);
-                v_uint16x8 s1 = v_load(sad + d + 8);
-                v_uint16x8 t0 = v_load(hsad + d);
-                v_uint16x8 t1 = v_load(hsad + d + 8);
-                s0 = s0 + t0;
-                s1 = s1 + t1;
-                v_store(sad + d, s0);
-                v_store(sad + d + 8, s1);
+                v_store(sad + d, vx_load(sad + d) + vx_load(hsad + d));
+                v_store(sad + d + v_uint16::nlanes, vx_load(sad + d + v_uint16::nlanes) + vx_load(hsad + d + v_uint16::nlanes));
             }
+            if( d <= ndisp - v_uint16::nlanes )
+            {
+                v_store(sad + d, vx_load(sad + d) + vx_load(hsad + d));
+                d += v_uint16::nlanes;
+            }
+            if( d <= ndisp - v_uint16::nlanes/2 )
+            {
+                v_store_low(sad + d, vx_load_low(sad + d) + vx_load_low(hsad + d));
+                d += v_uint16::nlanes/2;
+            }
+            for( ; d < ndisp; d++ )
+                sad[d] = sad[d] + hsad[d];
+        }
         int tsum = 0;
         for( y = -wsz2-1; y < wsz2; y++ )
             tsum += htext[y];
@@ -467,38 +486,41 @@ static void findStereoCorrespondenceBM_SIMD( const Mat& left, const Mat& right,
             int minsad = INT_MAX, mind = -1;
             hsad = hsad0 + MIN(y + wsz2, height+dy1-1)*ndisp;
             hsad_sub = hsad0 + MAX(y - wsz2 - 1, -dy0)*ndisp;
-            v_int16x8 minsad8 = v_setall_s16(SHRT_MAX);
-            v_int16x8 mind8 = v_setall_s16(0), d8 = d0_8;
+            v_int16 minsad8 = vx_setall_s16(SHRT_MAX);
+            v_int16 mind8 = vx_setall_s16(0);

-            for( d = 0; d < ndisp; d += 16 )
+            for( d = 0; d <= ndisp - 2*v_int16::nlanes; d += 2*v_int16::nlanes )
             {
-                v_int16x8 u0 = v_reinterpret_as_s16(v_load(hsad_sub + d));
-                v_int16x8 u1 = v_reinterpret_as_s16(v_load(hsad + d));
-                v_int16x8 v0 = v_reinterpret_as_s16(v_load(hsad_sub + d + 8));
-                v_int16x8 v1 = v_reinterpret_as_s16(v_load(hsad + d + 8));
-                v_int16x8 usad8 = v_reinterpret_as_s16(v_load(sad + d));
-                v_int16x8 vsad8 = v_reinterpret_as_s16(v_load(sad + d + 8));
-                u1 -= u0;
-                v1 -= v0;
-                usad8 += u1;
-                vsad8 += v1;
-                v_int16x8 mask = minsad8 > usad8;
-                minsad8 = v_min(minsad8, usad8);
-                mind8 = v_max(mind8, (mask & d8));
-                v_store(sad + d, v_reinterpret_as_u16(usad8));
-                v_store(sad + d + 8, v_reinterpret_as_u16(vsad8));
-                mask = minsad8 > vsad8;
-                minsad8 = v_min(minsad8, vsad8);
-                d8 = d8 + dd_8;
-                mind8 = v_max(mind8, (mask & d8));
-                d8 = d8 + dd_8;
+                v_int16 sad8 = v_reinterpret_as_s16(vx_load(hsad + d)) - v_reinterpret_as_s16(vx_load(hsad_sub + d)) + v_reinterpret_as_s16(vx_load(sad + d));
+                v_store(sad + d, v_reinterpret_as_u16(sad8));
+                mind8 = v_max(mind8, (minsad8 > sad8) & vx_setall_s16((short)d));
+                minsad8 = v_min(minsad8, sad8);
+                sad8 = v_reinterpret_as_s16(vx_load(hsad + d + v_int16::nlanes)) - v_reinterpret_as_s16(vx_load(hsad_sub + d + v_int16::nlanes)) + v_reinterpret_as_s16(vx_load(sad + d + v_int16::nlanes));
+                v_store(sad + d + v_int16::nlanes, v_reinterpret_as_u16(sad8));
+                mind8 = v_max(mind8, (minsad8 > sad8) & vx_setall_s16((short)d + v_int16::nlanes));
+                minsad8 = v_min(minsad8, sad8);
             }
+            if( d <= ndisp - v_int16::nlanes )
+            {
+                v_int16 sad8 = v_reinterpret_as_s16(vx_load(hsad + d)) - v_reinterpret_as_s16(vx_load(hsad_sub + d)) + v_reinterpret_as_s16(vx_load(sad + d));
+                v_store(sad + d, v_reinterpret_as_u16(sad8));
+                mind8 = v_max(mind8, (minsad8 > sad8) & vx_setall_s16((short)d));
+                minsad8 = v_min(minsad8, sad8);
+                d += v_int16::nlanes;
+            }
+            minsad = v_reduce_min(minsad8);
+            v_int16 v_mask = (vx_setall_s16((short)minsad) == minsad8);
+            mind = v_reduce_min(((mind8 + vx_load(v_seq)) & v_mask) | (vx_setall_s16(SHRT_MAX) & ~v_mask));
+            for( ; d < ndisp; d++ )
+            {
+                int sad8 = (int)(hsad[d]) - hsad_sub[d] + sad[d];
+                sad[d] = (ushort)sad8;
+                if( minsad > sad8 )
+                {
+                    mind = d;
+                    minsad = sad8;
+                }
+            }
             tsum += htext[y + wsz2] - htext[y - wsz2 - 1];
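Instead of spilling per-lane minima to a buffer and scanning it (the block removed in the next hunk), the new code reduces in registers: v_reduce_min gives the global minimum, and an equality mask keeps mind8 + v_seq only in the winning lanes, forcing all others to SHRT_MAX before a second reduction. Distilled into a hypothetical helper with the same minsad8 / mind8 / v_seq roles as in the patch:

    #include <climits>
    #include <opencv2/core/hal/intrin.hpp>

    #if CV_SIMD
    // Hypothetical helper: extract the scalar argmin from per-lane state.
    static int extractArgmin(const cv::v_int16& minsad8, const cv::v_int16& mind8,
                             const short* v_seq)
    {
        short minsad = cv::v_reduce_min(minsad8);
        // Lanes equal to the global minimum keep their disparity index;
        // every other lane is forced to SHRT_MAX before the second reduction.
        cv::v_int16 v_mask = (cv::vx_setall_s16(minsad) == minsad8);
        return cv::v_reduce_min(((mind8 + cv::vx_load(v_seq)) & v_mask)
                              | (cv::vx_setall_s16(SHRT_MAX) & ~v_mask));
    }
    #endif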
@@ -508,41 +530,42 @@ static void findStereoCorrespondenceBM_SIMD( const Mat& left, const Mat& right,
                 continue;
             }

-            ushort CV_DECL_ALIGNED(16) minsad_buf[8], mind_buf[8];
-            v_store(minsad_buf, v_reinterpret_as_u16(minsad8));
-            v_store(mind_buf, v_reinterpret_as_u16(mind8));
-            for( d = 0; d < 8; d++ )
-                if(minsad > (int)minsad_buf[d] || (minsad == (int)minsad_buf[d] && mind > mind_buf[d]))
-                {
-                    minsad = minsad_buf[d];
-                    mind = mind_buf[d];
-                }
-
             if( uniquenessRatio > 0 )
             {
                 int thresh = minsad + (minsad * uniquenessRatio/100);
-                v_int32x4 thresh4 = v_setall_s32(thresh + 1);
-                v_int32x4 d1 = v_setall_s32(mind-1), d2 = v_setall_s32(mind+1);
-                v_int32x4 dd_4 = v_setall_s32(4);
-                v_int32x4 d4 = v_int32x4(0, 1, 2, 3);
-                v_int32x4 mask4;
+                v_int32 thresh4 = vx_setall_s32(thresh + 1);
+                v_int32 d1 = vx_setall_s32(mind-1), d2 = vx_setall_s32(mind+1);
+                v_int32 dd_4 = vx_setall_s32(v_int32::nlanes);
+                v_int32 d4 = vx_load_expand(v_seq);

-                for( d = 0; d < ndisp; d += 8 )
+                for( d = 0; d <= ndisp - v_int16::nlanes; d += v_int16::nlanes )
                 {
-                    v_int16x8 sad8 = v_reinterpret_as_s16(v_load(sad + d));
-                    v_int32x4 sad4_l, sad4_h;
-                    v_expand(sad8, sad4_l, sad4_h);
-                    mask4 = thresh4 > sad4_l;
-                    mask4 = mask4 & ((d1 > d4) | (d4 > d2));
-                    if( v_check_any(mask4) )
+                    v_int32 sad4_l, sad4_h;
+                    v_expand(v_reinterpret_as_s16(vx_load(sad + d)), sad4_l, sad4_h);
+                    if( v_check_any((thresh4 > sad4_l) & ((d1 > d4) | (d4 > d2))) )
                         break;
                     d4 += dd_4;
-                    mask4 = thresh4 > sad4_h;
-                    mask4 = mask4 & ((d1 > d4) | (d4 > d2));
-                    if( v_check_any(mask4) )
+                    if( v_check_any((thresh4 > sad4_h) & ((d1 > d4) | (d4 > d2))) )
                         break;
                     d4 += dd_4;
                 }
+                if( d <= ndisp - v_int16::nlanes )
+                {
+                    dptr[y*dstep] = FILTERED;
+                    continue;
+                }
+                if( d <= ndisp - v_int32::nlanes )
+                {
+                    v_int32 sad4_l = vx_load_expand((short*)sad + d);
+                    if( v_check_any((thresh4 > sad4_l) & ((d1 > d4) | (d4 > d2))) )
+                        continue;
+                    d += v_int16::nlanes;
+                }
+                for( ; d < ndisp; d++ )
+                {
+                    if( (thresh + 1) > sad[d] && ((mind - 1) > d || d > (mind + 1)) )
+                        break;
+                }
                 if( d < ndisp )
                 {
                     dptr[y*dstep] = FILTERED;
@@ -571,7 +594,7 @@ findStereoCorrespondenceBM( const Mat& left, const Mat& right,
                             uchar* buf, int _dy0, int _dy1 )
 {
-    const int ALIGN = 16;
+    const int ALIGN = CV_SIMD_WIDTH;
     int x, y, d;
     int wsz = state.SADWindowSize, wsz2 = wsz/2;
     int dy0 = MIN(_dy0, wsz2+1), dy1 = MIN(_dy1, wsz2+1);
@@ -587,12 +610,6 @@ findStereoCorrespondenceBM( const Mat& left, const Mat& right,
     const int disp_shift = dispShiftTemplate<mType>::value;
     mType FILTERED = (mType)((mindisp - 1) << disp_shift);

-#if CV_SIMD128
-    {
-        CV_Assert( ndisp % 8 == 0 );
-    }
-#endif
-
     int *sad, *hsad0, *hsad, *hsad_sub, *htext;
     uchar *cbuf0, *cbuf;
     const uchar* lptr0 = left.ptr() + lofs;
@@ -607,6 +624,13 @@ findStereoCorrespondenceBM( const Mat& left, const Mat& right,
     const int TABSZ = 256;
     uchar tab[TABSZ];
+#if CV_SIMD
+    int v_seq[v_int32::nlanes];
+    for (int i = 0; i < v_int32::nlanes; ++i)
+        v_seq[i] = i;
+    v_int32 d0_4 = vx_load(v_seq), dd_4 = vx_setall_s32(v_int32::nlanes);
+#endif

     sad = (int*)alignPtr(buf + sizeof(sad[0]), ALIGN);
     hsad0 = (int*)alignPtr(sad + ndisp + 1 + dy0*ndisp, ALIGN);
     htext = (int*)alignPtr((int*)(hsad0 + (height+dy1)*ndisp) + wsz2 + 2, ALIGN);
@@ -628,22 +652,22 @@ findStereoCorrespondenceBM( const Mat& left, const Mat& right,
         {
             int lval = lptr[0];
             d = 0;
-#if CV_SIMD128
+#if CV_SIMD
             {
-                v_uint8x16 lv = v_setall_u8((uchar)lval);
+                v_uint8 lv = vx_setall_u8((uchar)lval);

-                for( ; d <= ndisp - 16; d += 16 )
+                for( ; d <= ndisp - v_uint8::nlanes; d += v_uint8::nlanes )
                 {
-                    v_uint8x16 rv = v_load(rptr + d);
-                    v_int32x4 hsad_0 = v_load(hsad + d);
-                    v_int32x4 hsad_1 = v_load(hsad + d + 4);
-                    v_int32x4 hsad_2 = v_load(hsad + d + 8);
-                    v_int32x4 hsad_3 = v_load(hsad + d + 12);
-                    v_uint8x16 diff = v_absdiff(lv, rv);
+                    v_uint8 rv = vx_load(rptr + d);
+                    v_int32 hsad_0 = vx_load(hsad + d);
+                    v_int32 hsad_1 = vx_load(hsad + d + v_int32::nlanes);
+                    v_int32 hsad_2 = vx_load(hsad + d + 2*v_int32::nlanes);
+                    v_int32 hsad_3 = vx_load(hsad + d + 3*v_int32::nlanes);
+                    v_uint8 diff = v_absdiff(lv, rv);
                     v_store(cbuf + d, diff);

-                    v_uint16x8 diff0, diff1;
-                    v_uint32x4 diff00, diff01, diff10, diff11;
+                    v_uint16 diff0, diff1;
+                    v_uint32 diff00, diff01, diff10, diff11;
                     v_expand(diff, diff0, diff1);
                     v_expand(diff0, diff00, diff01);
                     v_expand(diff1, diff10, diff11);
@@ -654,9 +678,9 @@ findStereoCorrespondenceBM( const Mat& left, const Mat& right,
                     hsad_3 += v_reinterpret_as_s32(diff11);
                     v_store(hsad + d, hsad_0);
-                    v_store(hsad + d + 4, hsad_1);
-                    v_store(hsad + d + 8, hsad_2);
-                    v_store(hsad + d + 12, hsad_3);
+                    v_store(hsad + d + v_int32::nlanes, hsad_1);
+                    v_store(hsad + d + 2*v_int32::nlanes, hsad_2);
+                    v_store(hsad + d + 3*v_int32::nlanes, hsad_3);
                 }
             }
 #endif
@@ -696,22 +720,22 @@ findStereoCorrespondenceBM( const Mat& left, const Mat& right,
         {
             int lval = lptr[0];
             d = 0;
-#if CV_SIMD128
+#if CV_SIMD
             {
-                v_uint8x16 lv = v_setall_u8((uchar)lval);
-                for( ; d <= ndisp - 16; d += 16 )
+                v_uint8 lv = vx_setall_u8((uchar)lval);
+                for( ; d <= ndisp - v_uint8::nlanes; d += v_uint8::nlanes )
                 {
-                    v_uint8x16 rv = v_load(rptr + d);
-                    v_int32x4 hsad_0 = v_load(hsad + d);
-                    v_int32x4 hsad_1 = v_load(hsad + d + 4);
-                    v_int32x4 hsad_2 = v_load(hsad + d + 8);
-                    v_int32x4 hsad_3 = v_load(hsad + d + 12);
-                    v_uint8x16 cbs = v_load(cbuf_sub + d);
-                    v_uint8x16 diff = v_absdiff(lv, rv);
+                    v_uint8 rv = vx_load(rptr + d);
+                    v_int32 hsad_0 = vx_load(hsad + d);
+                    v_int32 hsad_1 = vx_load(hsad + d + v_int32::nlanes);
+                    v_int32 hsad_2 = vx_load(hsad + d + 2*v_int32::nlanes);
+                    v_int32 hsad_3 = vx_load(hsad + d + 3*v_int32::nlanes);
+                    v_uint8 cbs = vx_load(cbuf_sub + d);
+                    v_uint8 diff = v_absdiff(lv, rv);
                     v_store(cbuf + d, diff);

-                    v_uint16x8 diff0, diff1, cbs0, cbs1;
-                    v_int32x4 diff00, diff01, diff10, diff11, cbs00, cbs01, cbs10, cbs11;
+                    v_uint16 diff0, diff1, cbs0, cbs1;
+                    v_int32 diff00, diff01, diff10, diff11, cbs00, cbs01, cbs10, cbs11;
                     v_expand(diff, diff0, diff1);
                     v_expand(cbs, cbs0, cbs1);
                     v_expand(v_reinterpret_as_s16(diff0), diff00, diff01);
@@ -719,19 +743,19 @@ findStereoCorrespondenceBM( const Mat& left, const Mat& right,
                     v_expand(v_reinterpret_as_s16(cbs0), cbs00, cbs01);
                     v_expand(v_reinterpret_as_s16(cbs1), cbs10, cbs11);

-                    v_int32x4 diff_0 = diff00 - cbs00;
-                    v_int32x4 diff_1 = diff01 - cbs01;
-                    v_int32x4 diff_2 = diff10 - cbs10;
-                    v_int32x4 diff_3 = diff11 - cbs11;
+                    v_int32 diff_0 = diff00 - cbs00;
+                    v_int32 diff_1 = diff01 - cbs01;
+                    v_int32 diff_2 = diff10 - cbs10;
+                    v_int32 diff_3 = diff11 - cbs11;
                     hsad_0 += diff_0;
                     hsad_1 += diff_1;
                     hsad_2 += diff_2;
                     hsad_3 += diff_3;

                     v_store(hsad + d, hsad_0);
-                    v_store(hsad + d + 4, hsad_1);
-                    v_store(hsad + d + 8, hsad_2);
-                    v_store(hsad + d + 12, hsad_3);
+                    v_store(hsad + d + v_int32::nlanes, hsad_1);
+                    v_store(hsad + d + 2*v_int32::nlanes, hsad_2);
+                    v_store(hsad + d + 3*v_int32::nlanes, hsad_3);
                 }
             }
 #endif
@@ -758,18 +782,18 @@ findStereoCorrespondenceBM( const Mat& left, const Mat& right,
         for( y = 1 - dy0; y < wsz2; y++, hsad += ndisp )
         {
             d = 0;
-#if CV_SIMD128
+#if CV_SIMD
             {
-                for( d = 0; d <= ndisp-8; d += 8 )
+                for( d = 0; d <= ndisp - 2*v_int32::nlanes; d += 2*v_int32::nlanes )
                 {
-                    v_int32x4 s0 = v_load(sad + d);
-                    v_int32x4 s1 = v_load(sad + d + 4);
-                    v_int32x4 t0 = v_load(hsad + d);
-                    v_int32x4 t1 = v_load(hsad + d + 4);
+                    v_int32 s0 = vx_load(sad + d);
+                    v_int32 s1 = vx_load(sad + d + v_int32::nlanes);
+                    v_int32 t0 = vx_load(hsad + d);
+                    v_int32 t1 = vx_load(hsad + d + v_int32::nlanes);
                     s0 += t0;
                     s1 += t1;
                     v_store(sad + d, s0);
-                    v_store(sad + d + 4, s1);
+                    v_store(sad + d + v_int32::nlanes, s1);
                 }
             }
 #endif
@@ -787,50 +811,31 @@ findStereoCorrespondenceBM( const Mat& left, const Mat& right,
             hsad = hsad0 + MIN(y + wsz2, height+dy1-1)*ndisp;
             hsad_sub = hsad0 + MAX(y - wsz2 - 1, -dy0)*ndisp;
             d = 0;
-#if CV_SIMD128
+#if CV_SIMD
             {
-                v_int32x4 d0_4 = v_int32x4(0, 1, 2, 3);
-                v_int32x4 dd_4 = v_setall_s32(4);
-                v_int32x4 minsad4 = v_setall_s32(INT_MAX);
-                v_int32x4 mind4 = v_setall_s32(0), d4 = d0_4;
+                v_int32 minsad4 = vx_setall_s32(INT_MAX);
+                v_int32 mind4 = vx_setall_s32(0), d4 = d0_4;

-                for( ; d <= ndisp-8; d += 8 )
+                for( ; d <= ndisp - 2*v_int32::nlanes; d += 2*v_int32::nlanes )
                 {
-                    v_int32x4 u0 = v_load(hsad_sub + d);
-                    v_int32x4 u1 = v_load(hsad + d);
-                    v_int32x4 v0 = v_load(hsad_sub + d + 4);
-                    v_int32x4 v1 = v_load(hsad + d + 4);
-                    v_int32x4 usad4 = v_load(sad + d);
-                    v_int32x4 vsad4 = v_load(sad + d + 4);
-                    u1 -= u0;
-                    v1 -= v0;
-                    usad4 += u1;
-                    vsad4 += v1;
-                    v_store(sad + d, usad4);
-                    v_store(sad + d + 4, vsad4);
-                    v_int32x4 mask = minsad4 > usad4;
-                    minsad4 = v_min(minsad4, usad4);
-                    mind4 = v_select(mask, d4, mind4);
+                    v_int32 sad4 = vx_load(sad + d) + vx_load(hsad + d) - vx_load(hsad_sub + d);
+                    v_store(sad + d, sad4);
+                    mind4 = v_select(minsad4 > sad4, d4, mind4);
+                    minsad4 = v_min(minsad4, sad4);
                     d4 += dd_4;
-                    mask = minsad4 > vsad4;
-                    minsad4 = v_min(minsad4, vsad4);
-                    mind4 = v_select(mask, d4, mind4);
+                    sad4 = vx_load(sad + d + v_int32::nlanes) + vx_load(hsad + d + v_int32::nlanes) - vx_load(hsad_sub + d + v_int32::nlanes);
+                    v_store(sad + d + v_int32::nlanes, sad4);
+                    mind4 = v_select(minsad4 > sad4, d4, mind4);
+                    minsad4 = v_min(minsad4, sad4);
                     d4 += dd_4;
                 }

-                int CV_DECL_ALIGNED(16) minsad_buf[4], mind_buf[4];
+                int CV_DECL_ALIGNED(CV_SIMD_WIDTH) minsad_buf[v_int32::nlanes], mind_buf[v_int32::nlanes];
                 v_store(minsad_buf, minsad4);
                 v_store(mind_buf, mind4);
-                if(minsad_buf[0] < minsad || (minsad == minsad_buf[0] && mind_buf[0] < mind)) { minsad = minsad_buf[0]; mind = mind_buf[0]; }
-                if(minsad_buf[1] < minsad || (minsad == minsad_buf[1] && mind_buf[1] < mind)) { minsad = minsad_buf[1]; mind = mind_buf[1]; }
-                if(minsad_buf[2] < minsad || (minsad == minsad_buf[2] && mind_buf[2] < mind)) { minsad = minsad_buf[2]; mind = mind_buf[2]; }
-                if(minsad_buf[3] < minsad || (minsad == minsad_buf[3] && mind_buf[3] < mind)) { minsad = minsad_buf[3]; mind = mind_buf[3]; }
+                for (int i = 0; i < v_int32::nlanes; ++i)
+                    if(minsad_buf[i] < minsad || (minsad == minsad_buf[i] && mind_buf[i] < mind)) { minsad = minsad_buf[i]; mind = mind_buf[i]; }
             }
 #endif
             for( ; d < ndisp; d++ )
@@ -1027,7 +1032,7 @@ struct FindStereoCorrespInvoker : public ParallelLoopBody
         Mat disp_i = disp->rowRange(row0, row1);
         Mat cost_i = state->disp12MaxDiff >= 0 ? cost->rowRange(row0, row1) : Mat();

-#if CV_SIMD128
+#if CV_SIMD
         if( useShorts )
         {
             if( disp_i.type() == CV_16S )