Commit ecb8fb96 authored Oct 28, 2016 by Vadim Pisarevsky

Merge pull request #7572 from tomoaki0705:featureUniversalStereoSgbm

parents 7f2ac764 b823c8e9
Showing 4 changed files with 450 additions and 419 deletions

modules/calib3d/src/stereosgbm.cpp (+380, -403)
modules/core/include/opencv2/core/hal/intrin_neon.hpp (+27, -15)
modules/core/include/opencv2/core/hal/intrin_sse.hpp (+40, -0)
modules/core/test/test_intrin.cpp (+3, -1)
modules/calib3d/src/stereosgbm.cpp
...
@@ -131,6 +131,9 @@ static void calcPixelCostBT( const Mat& img1, const Mat& img2, int y,
     int D = maxD - minD, width1 = maxX1 - minX1, width2 = maxX2 - minX2;
     const PixType *row1 = img1.ptr<PixType>(y), *row2 = img2.ptr<PixType>(y);
     PixType *prow1 = buffer + width2*2, *prow2 = prow1 + width*cn*2;
+#if CV_SIMD128
+    bool useSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON);
+#endif

     tab += tabOfs;
...
@@ -181,7 +184,6 @@ static void calcPixelCostBT( const Mat& img1, const Mat& img2, int y,
     buffer -= minX2;
     cost -= minX1*D + minD; // simplify the cost indices inside the loop
 #if 1
     for( c = 0; c < cn*2; c++, prow1 += width, prow2 += width )
     {
         int diff_scale = c < cn ? 0 : 2;
...
@@ -209,60 +211,27 @@ static void calcPixelCostBT( const Mat& img1, const Mat& img2, int y,
             int u1 = std::max(ul, ur);
             u1 = std::max(u1, u);

+        #if CV_SIMD128
+            v_uint8x16 _u = v_setall_u8((uchar)u), _u0 = v_setall_u8((uchar)u0);
+            v_uint8x16 _u1 = v_setall_u8((uchar)u1);
+
+            for( int d = minD; d < maxD; d += 16 )
+            {
+                v_uint8x16 _v = v_load(prow2 + width-x-1 + d);
+                v_uint8x16 _v0 = v_load(buffer + width-x-1 + d);
+                v_uint8x16 _v1 = v_load(buffer + width-x-1 + d + width2);
+                v_uint8x16 c0 = v_max(_u - _v1, _v0 - _u);
+                v_uint8x16 c1 = v_max(_v - _u1, _u0 - _v);
+                v_uint8x16 diff = v_min(c0, c1);
+
+                v_int16x8 _c0 = v_load_aligned(cost + x*D + d);
+                v_int16x8 _c1 = v_load_aligned(cost + x*D + d + 8);
+
+                v_uint16x8 diff1, diff2;
+                v_expand(diff, diff1, diff2);
+                v_store_aligned(cost + x*D + d, _c0 + v_reinterpret_as_s16(diff1 >> diff_scale));
+                v_store_aligned(cost + x*D + d + 8, _c1 + v_reinterpret_as_s16(diff2 >> diff_scale));
+            }
+        #else
             for( int d = minD; d < maxD; d++ )
             {
                 int v = prow2[width-x-1 + d];
                 int v0 = buffer[width-x-1 + d];
                 int v1 = buffer[width-x-1 + d + width2];
                 int c0 = std::max(0, u - v1); c0 = std::max(c0, v0 - u);
                 int c1 = std::max(0, v - u1); c1 = std::max(c1, u0 - v);

                 cost[x*D + d] = (CostType)(cost[x*D + d] + (std::min(c0, c1) >> diff_scale));
             }
+        #endif
         }
     }
 #else
     for( c = 0; c < cn*2; c++, prow1 += width, prow2 += width )
     {
         for( x = minX1; x < maxX1; x++ )
         {
             int u = prow1[x];
-        #if CV_SSE2
+        #if CV_SIMD128
             if( useSIMD )
             {
-                __m128i _u = _mm_set1_epi8(u), z = _mm_setzero_si128();
+                v_uint8x16 _u = v_setall_u8((uchar)u), _u0 = v_setall_u8((uchar)u0);
+                v_uint8x16 _u1 = v_setall_u8((uchar)u1);
                 for( int d = minD; d < maxD; d += 16 )
                 {
-                    __m128i _v = _mm_loadu_si128((const __m128i*)(prow2 + width-1-x + d));
-                    __m128i diff = _mm_adds_epu8(_mm_subs_epu8(_u, _v), _mm_subs_epu8(_v, _u));
-                    __m128i c0 = _mm_load_si128((__m128i*)(cost + x*D + d));
-                    __m128i c1 = _mm_load_si128((__m128i*)(cost + x*D + d + 8));
-                    _mm_store_si128((__m128i*)(cost + x*D + d), _mm_adds_epi16(c0, _mm_unpacklo_epi8(diff, z)));
-                    _mm_store_si128((__m128i*)(cost + x*D + d + 8), _mm_adds_epi16(c1, _mm_unpackhi_epi8(diff, z)));
+                    v_uint8x16 _v = v_load(prow2 + width-x-1 + d);
+                    v_uint8x16 _v0 = v_load(buffer + width-x-1 + d);
+                    v_uint8x16 _v1 = v_load(buffer + width-x-1 + d + width2);
+                    v_uint8x16 c0 = v_max(_u - _v1, _v0 - _u);
+                    v_uint8x16 c1 = v_max(_v - _u1, _u0 - _v);
+                    v_uint8x16 diff = v_min(c0, c1);
+
+                    v_int16x8 _c0 = v_load_aligned(cost + x*D + d);
+                    v_int16x8 _c1 = v_load_aligned(cost + x*D + d + 8);
+
+                    v_uint16x8 diff1, diff2;
+                    v_expand(diff, diff1, diff2);
+                    v_store_aligned(cost + x*D + d, _c0 + v_reinterpret_as_s16(diff1 >> diff_scale));
+                    v_store_aligned(cost + x*D + d + 8, _c1 + v_reinterpret_as_s16(diff2 >> diff_scale));
                 }
             }
             else
...
@@ -270,13 +239,17 @@ static void calcPixelCostBT( const Mat& img1, const Mat& img2, int y,
             {
                 for( int d = minD; d < maxD; d++ )
                 {
-                    int v = prow2[width-1-x + d];
-                    cost[x*D + d] = (CostType)(cost[x*D + d] + (CostType)std::abs(u - v));
+                    int v = prow2[width-x-1 + d];
+                    int v0 = buffer[width-x-1 + d];
+                    int v1 = buffer[width-x-1 + d + width2];
+                    int c0 = std::max(0, u - v1); c0 = std::max(c0, v0 - u);
+                    int c1 = std::max(0, v - u1); c1 = std::max(c1, u0 - v);
+
+                    cost[x*D + d] = (CostType)(cost[x*D + d] + (std::min(c0, c1) >> diff_scale));
                 }
             }
         }
     }
 #endif
 }
...
@@ -304,7 +277,8 @@ static void computeDisparitySGBM( const Mat& img1, const Mat& img2,
                                  Mat& disp1, const StereoSGBMParams& params,
                                  Mat& buffer )
 {
-#if CV_SSE2
+#if CV_SIMD128
+    // maxDisparity is supposed to be a multiple of 16, so we can forget doing the tail handling
     static const uchar LSBTab[] =
     {
         0, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
...
@@ -316,8 +290,9 @@ static void computeDisparitySGBM( const Mat& img1, const Mat& img2,
         6, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
         5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0
     };
+    static const v_uint16x8 v_LSB = v_uint16x8(0x1, 0x2, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80);

-    volatile bool useSIMD = checkHardwareSupport(CV_CPU_SSE2);
+    bool useSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON);
 #endif

     const int ALIGN = 16;
...
@@ -461,21 +436,20 @@ static void computeDisparitySGBM( const Mat& img1, const Mat& img2,
                 const CostType* pixAdd = pixDiff + std::min(x + SW2*D, (width1-1)*D);
                 const CostType* pixSub = pixDiff + std::max(x - (SW2+1)*D, 0);

-            #if CV_SSE2
+            #if CV_SIMD128
                 if( useSIMD )
                 {
                     for( d = 0; d < D; d += 8 )
                     {
-                        __m128i hv = _mm_load_si128((const __m128i*)(hsumAdd + x - D + d));
-                        __m128i Cx = _mm_load_si128((__m128i*)(Cprev + x + d));
-                        hv = _mm_adds_epi16(_mm_subs_epi16(hv, _mm_load_si128((const __m128i*)(pixSub + d))),
-                                            _mm_load_si128((const __m128i*)(pixAdd + d)));
-                        Cx = _mm_adds_epi16(_mm_subs_epi16(Cx, _mm_load_si128((const __m128i*)(hsumSub + x + d))), hv);
-                        _mm_store_si128((__m128i*)(hsumAdd + x + d), hv);
-                        _mm_store_si128((__m128i*)(C + x + d), Cx);
+                        v_int16x8 hv = v_load(hsumAdd + x - D + d);
+                        v_int16x8 Cx = v_load(Cprev + x + d);
+                        v_int16x8 psub = v_load(pixSub + d);
+                        v_int16x8 padd = v_load(pixAdd + d);
+                        hv = (hv - psub + padd);
+                        psub = v_load(hsumSub + x + d);
+                        Cx = Cx - psub + hv;
+                        v_store(hsumAdd + x + d, hv);
+                        v_store(C + x + d, Cx);
                     }
                 }
                 else
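The hunk above swaps intrinsics but keeps the classic sliding-window update: the horizontal sum at column x is derived from the sum at x - D by adding the entering pixel costs (pixAdd) and subtracting the leaving ones (pixSub). Below is a minimal scalar sketch of that recurrence; all names (cost, win_sum, SW2) are illustrative and not taken from the patch:

// Sketch of the O(1)-per-step box-sum recurrence used by the hunk above.
void slidingWindowSums(const short* cost, short* win_sum, int width, int SW2)
{
    int w = 2*SW2 + 1;          // window width, analogous to the SGBM block size
    int s = 0;
    for (int x = 0; x < w && x < width; x++)   // seed the first full window
        s += cost[x];
    win_sum[SW2] = (short)s;
    for (int x = SW2 + 1; x + SW2 < width; x++)
    {
        // add the column entering the window, drop the one leaving it
        s += cost[x + SW2] - cost[x - SW2 - 1];
        win_sum[x] = (short)s;
    }
}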
...
@@ -558,73 +532,79 @@ static void computeDisparitySGBM( const Mat& img1, const Mat& img2,
                 const CostType* Cp = C + x*D;
                 CostType* Sp = S + x*D;

-            #if CV_SSE2
+            #if CV_SIMD128
                 if( useSIMD )
                 {
-                    __m128i _P1 = _mm_set1_epi16((short)P1);
+                    v_int16x8 _P1 = v_setall_s16((short)P1);

-                    __m128i _delta0 = _mm_set1_epi16((short)delta0);
-                    __m128i _delta1 = _mm_set1_epi16((short)delta1);
-                    __m128i _delta2 = _mm_set1_epi16((short)delta2);
-                    __m128i _delta3 = _mm_set1_epi16((short)delta3);
-                    __m128i _minL0 = _mm_set1_epi16((short)MAX_COST);
+                    v_int16x8 _delta0 = v_setall_s16((short)delta0);
+                    v_int16x8 _delta1 = v_setall_s16((short)delta1);
+                    v_int16x8 _delta2 = v_setall_s16((short)delta2);
+                    v_int16x8 _delta3 = v_setall_s16((short)delta3);
+                    v_int16x8 _minL0 = v_setall_s16((short)MAX_COST);

                     for( d = 0; d < D; d += 8 )
                     {
-                        __m128i Cpd = _mm_load_si128((const __m128i*)(Cp + d));
-                        __m128i L0, L1, L2, L3;
+                        v_int16x8 Cpd = v_load(Cp + d);
+                        v_int16x8 L0, L1, L2, L3;

-                        L0 = _mm_load_si128((const __m128i*)(Lr_p0 + d));
-                        L1 = _mm_load_si128((const __m128i*)(Lr_p1 + d));
-                        L2 = _mm_load_si128((const __m128i*)(Lr_p2 + d));
-                        L3 = _mm_load_si128((const __m128i*)(Lr_p3 + d));
+                        L0 = v_load(Lr_p0 + d);
+                        L1 = v_load(Lr_p1 + d);
+                        L2 = v_load(Lr_p2 + d);
+                        L3 = v_load(Lr_p3 + d);

-                        L0 = _mm_min_epi16(L0, _mm_adds_epi16(_mm_loadu_si128((const __m128i*)(Lr_p0 + d - 1)), _P1));
-                        L0 = _mm_min_epi16(L0, _mm_adds_epi16(_mm_loadu_si128((const __m128i*)(Lr_p0 + d + 1)), _P1));
+                        L0 = v_min(L0, (v_load(Lr_p0 + d - 1) + _P1));
+                        L0 = v_min(L0, (v_load(Lr_p0 + d + 1) + _P1));

-                        L1 = _mm_min_epi16(L1, _mm_adds_epi16(_mm_loadu_si128((const __m128i*)(Lr_p1 + d - 1)), _P1));
-                        L1 = _mm_min_epi16(L1, _mm_adds_epi16(_mm_loadu_si128((const __m128i*)(Lr_p1 + d + 1)), _P1));
+                        L1 = v_min(L1, (v_load(Lr_p1 + d - 1) + _P1));
+                        L1 = v_min(L1, (v_load(Lr_p1 + d + 1) + _P1));

-                        L2 = _mm_min_epi16(L2, _mm_adds_epi16(_mm_loadu_si128((const __m128i*)(Lr_p2 + d - 1)), _P1));
-                        L2 = _mm_min_epi16(L2, _mm_adds_epi16(_mm_loadu_si128((const __m128i*)(Lr_p2 + d + 1)), _P1));
+                        L2 = v_min(L2, (v_load(Lr_p2 + d - 1) + _P1));
+                        L2 = v_min(L2, (v_load(Lr_p2 + d + 1) + _P1));

-                        L3 = _mm_min_epi16(L3, _mm_adds_epi16(_mm_loadu_si128((const __m128i*)(Lr_p3 + d - 1)), _P1));
-                        L3 = _mm_min_epi16(L3, _mm_adds_epi16(_mm_loadu_si128((const __m128i*)(Lr_p3 + d + 1)), _P1));
+                        L3 = v_min(L3, (v_load(Lr_p3 + d - 1) + _P1));
+                        L3 = v_min(L3, (v_load(Lr_p3 + d + 1) + _P1));

-                        L0 = _mm_min_epi16(L0, _delta0);
-                        L0 = _mm_adds_epi16(_mm_subs_epi16(L0, _delta0), Cpd);
+                        L0 = v_min(L0, _delta0);
+                        L0 = ((L0 - _delta0) + Cpd);

-                        L1 = _mm_min_epi16(L1, _delta1);
-                        L1 = _mm_adds_epi16(_mm_subs_epi16(L1, _delta1), Cpd);
+                        L1 = v_min(L1, _delta1);
+                        L1 = ((L1 - _delta1) + Cpd);

-                        L2 = _mm_min_epi16(L2, _delta2);
-                        L2 = _mm_adds_epi16(_mm_subs_epi16(L2, _delta2), Cpd);
+                        L2 = v_min(L2, _delta2);
+                        L2 = ((L2 - _delta2) + Cpd);

-                        L3 = _mm_min_epi16(L3, _delta3);
-                        L3 = _mm_adds_epi16(_mm_subs_epi16(L3, _delta3), Cpd);
+                        L3 = v_min(L3, _delta3);
+                        L3 = ((L3 - _delta3) + Cpd);

-                        _mm_store_si128((__m128i*)(Lr_p + d), L0);
-                        _mm_store_si128((__m128i*)(Lr_p + d + D2), L1);
-                        _mm_store_si128((__m128i*)(Lr_p + d + D2*2), L2);
-                        _mm_store_si128((__m128i*)(Lr_p + d + D2*3), L3);
+                        v_store(Lr_p + d, L0);
+                        v_store(Lr_p + d + D2, L1);
+                        v_store(Lr_p + d + D2*2, L2);
+                        v_store(Lr_p + d + D2*3, L3);

-                        __m128i t0 = _mm_min_epi16(_mm_unpacklo_epi16(L0, L2), _mm_unpackhi_epi16(L0, L2));
-                        __m128i t1 = _mm_min_epi16(_mm_unpacklo_epi16(L1, L3), _mm_unpackhi_epi16(L1, L3));
-                        t0 = _mm_min_epi16(_mm_unpacklo_epi16(t0, t1), _mm_unpackhi_epi16(t0, t1));
-                        _minL0 = _mm_min_epi16(_minL0, t0);
+                        // Get minimum from in L0-L3
+                        v_int16x8 t02L, t02H, t13L, t13H, t0123L, t0123H;
+                        v_zip(L0, L2, t02L, t02H);          // L0[0] L2[0] L0[1] L2[1]...
+                        v_zip(L1, L3, t13L, t13H);          // L1[0] L3[0] L1[1] L3[1]...
+                        v_int16x8 t02 = v_min(t02L, t02H);  // L0[i] L2[i] L0[i] L2[i]...
+                        v_int16x8 t13 = v_min(t13L, t13H);  // L1[i] L3[i] L1[i] L3[i]...
+                        v_zip(t02, t13, t0123L, t0123H);    // L0[i] L1[i] L2[i] L3[i]...
+                        v_int16x8 t0 = v_min(t0123L, t0123H);
+                        _minL0 = v_min(_minL0, t0);

-                        __m128i Sval = _mm_load_si128((const __m128i*)(Sp + d));
+                        v_int16x8 Sval = v_load(Sp + d);

-                        L0 = _mm_adds_epi16(L0, L1);
-                        L2 = _mm_adds_epi16(L2, L3);
-                        Sval = _mm_adds_epi16(Sval, L0);
-                        Sval = _mm_adds_epi16(Sval, L2);
+                        L0 = L0 + L1;
+                        L2 = L2 + L3;
+                        Sval = Sval + L0;
+                        Sval = Sval + L2;

-                        _mm_store_si128((__m128i*)(Sp + d), Sval);
+                        v_store(Sp + d, Sval);
                     }

-                    _minL0 = _mm_min_epi16(_minL0, _mm_srli_si128(_minL0, 8));
-                    _mm_storel_epi64((__m128i*)&minLr[0][xm], _minL0);
+                    v_int32x4 minL, minH;
+                    v_expand(_minL0, minL, minH);
+                    v_pack_store(&minLr[0][xm], v_min(minL, minH));
                 }
                 else
 #endif
...
@@ -686,55 +666,54 @@ static void computeDisparitySGBM( const Mat& img1, const Mat& img2,
                 const CostType* Cp = C + x*D;

-            #if CV_SSE2
+            #if CV_SIMD128
                 if( useSIMD )
                 {
-                    __m128i _P1 = _mm_set1_epi16((short)P1);
-                    __m128i _delta0 = _mm_set1_epi16((short)delta0);
+                    v_int16x8 _P1 = v_setall_s16((short)P1);
+                    v_int16x8 _delta0 = v_setall_s16((short)delta0);

-                    __m128i _minL0 = _mm_set1_epi16((short)minL0);
-                    __m128i _minS = _mm_set1_epi16(MAX_COST), _bestDisp = _mm_set1_epi16(-1);
-                    __m128i _d8 = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7), _8 = _mm_set1_epi16(8);
+                    v_int16x8 _minL0 = v_setall_s16((short)minL0);
+                    v_int16x8 _minS = v_setall_s16(MAX_COST), _bestDisp = v_setall_s16(-1);
+                    v_int16x8 _d8 = v_int16x8(0, 1, 2, 3, 4, 5, 6, 7), _8 = v_setall_s16(8);

                     for( d = 0; d < D; d += 8 )
                     {
-                        __m128i Cpd = _mm_load_si128((const __m128i*)(Cp + d)), L0;
-                        L0 = _mm_load_si128((const __m128i*)(Lr_p0 + d));
-                        L0 = _mm_min_epi16(L0, _mm_adds_epi16(_mm_loadu_si128((const __m128i*)(Lr_p0 + d - 1)), _P1));
-                        L0 = _mm_min_epi16(L0, _mm_adds_epi16(_mm_loadu_si128((const __m128i*)(Lr_p0 + d + 1)), _P1));
-                        L0 = _mm_min_epi16(L0, _delta0);
-                        L0 = _mm_adds_epi16(_mm_subs_epi16(L0, _delta0), Cpd);
-                        _mm_store_si128((__m128i*)(Lr_p + d), L0);
-                        _minL0 = _mm_min_epi16(_minL0, L0);
-                        L0 = _mm_adds_epi16(L0, *(__m128i*)(Sp + d));
-                        _mm_store_si128((__m128i*)(Sp + d), L0);
-
-                        __m128i mask = _mm_cmpgt_epi16(_minS, L0);
-                        _minS = _mm_min_epi16(_minS, L0);
-                        _bestDisp = _mm_xor_si128(_bestDisp, _mm_and_si128(_mm_xor_si128(_bestDisp, _d8), mask));
-                        _d8 = _mm_adds_epi16(_d8, _8);
+                        v_int16x8 Cpd = v_load(Cp + d);
+                        v_int16x8 L0 = v_load(Lr_p0 + d);
+                        L0 = v_min(L0, v_load(Lr_p0 + d - 1) + _P1);
+                        L0 = v_min(L0, v_load(Lr_p0 + d + 1) + _P1);
+                        L0 = v_min(L0, _delta0);
+                        L0 = L0 - _delta0 + Cpd;
+                        v_store(Lr_p + d, L0);
+                        _minL0 = v_min(_minL0, L0);
+                        L0 = L0 + v_load(Sp + d);
+                        v_store(Sp + d, L0);
+
+                        v_int16x8 mask = _minS > L0;
+                        _minS = v_min(_minS, L0);
+                        _bestDisp = _bestDisp ^ ((_bestDisp ^ _d8) & mask);
+                        _d8 += _8;
                     }

-                    short CV_DECL_ALIGNED(16) bestDispBuf[8];
-                    _mm_store_si128((__m128i*)bestDispBuf, _bestDisp);
+                    short bestDispBuf[8];
+                    v_store(bestDispBuf, _bestDisp);

-                    _minL0 = _mm_min_epi16(_minL0, _mm_srli_si128(_minL0, 8));
-                    _minL0 = _mm_min_epi16(_minL0, _mm_srli_si128(_minL0, 4));
-                    _minL0 = _mm_min_epi16(_minL0, _mm_srli_si128(_minL0, 2));
+                    v_int32x4 min32L, min32H;
+                    v_expand(_minL0, min32L, min32H);
+                    minLr[0][xm] = (CostType)std::min(v_reduce_min(min32L), v_reduce_min(min32H));

-                    __m128i qS = _mm_min_epi16(_minS, _mm_srli_si128(_minS, 8));
-                    qS = _mm_min_epi16(qS, _mm_srli_si128(qS, 4));
-                    qS = _mm_min_epi16(qS, _mm_srli_si128(qS, 2));
+                    v_expand(_minS, min32L, min32H);
+                    minS = std::min(v_reduce_min(min32L), v_reduce_min(min32H));

-                    minLr[0][xm] = (CostType)_mm_cvtsi128_si32(_minL0);
-                    minS = (CostType)_mm_cvtsi128_si32(qS);
-                    qS = _mm_shuffle_epi32(_mm_unpacklo_epi16(qS, qS), 0);
-                    qS = _mm_cmpeq_epi16(_minS, qS);
-                    int idx = _mm_movemask_epi8(_mm_packs_epi16(qS, qS)) & 255;
+                    v_int16x8 ss = v_setall_s16((short)minS);
+                    v_uint16x8 minMask = v_reinterpret_as_u16(ss == _minS);
+                    v_uint16x8 minBit = minMask & v_LSB;
+
+                    v_uint32x4 minBitL, minBitH;
+                    v_expand(minBit, minBitL, minBitH);
+                    int idx = v_reduce_sum(minBitL) + v_reduce_sum(minBitH);

                     bestDisp = bestDispBuf[LSBTab[idx]];
                 }
                 else
...
@@ -759,50 +738,41 @@ static void computeDisparitySGBM( const Mat& img1, const Mat& img2,
             }
             else
             {
-            #if CV_SSE2
+            #if CV_SIMD128
                 if( useSIMD )
                 {
-                    __m128i _minS = _mm_set1_epi16(MAX_COST), _bestDisp = _mm_set1_epi16(-1);
-                    __m128i _d8 = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7), _8 = _mm_set1_epi16(8);
+                    v_int16x8 _minS = v_setall_s16(MAX_COST), _bestDisp = v_setall_s16(-1);
+                    v_int16x8 _d8 = v_int16x8(0, 1, 2, 3, 4, 5, 6, 7), _8 = v_setall_s16(8);

                     for( d = 0; d < D; d += 8 )
                     {
-                        __m128i L0 = _mm_load_si128((const __m128i*)(Sp + d));
-                        __m128i mask = _mm_cmplt_epi16(L0, _minS);
-                        _minS = _mm_min_epi16(L0, _minS);
-                        _bestDisp = _mm_xor_si128(_bestDisp, _mm_and_si128(_mm_xor_si128(_bestDisp, _d8), mask));
-                        _d8 = _mm_adds_epi16(_d8, _8);
+                        v_int16x8 L0 = v_load(Sp + d);
+                        v_int16x8 mask = L0 < _minS;
+                        _minS = v_min(L0, _minS);
+                        _bestDisp = _bestDisp ^ ((_bestDisp ^ _d8) & mask);
+                        _d8 = _d8 + _8;
                     }

-                    short CV_DECL_ALIGNED(16) bestDispBuf[8];
-                    _mm_store_si128((__m128i*)bestDispBuf, _bestDisp);
-                    short CV_DECL_ALIGNED(16) minSBuf[8];
-                    _mm_store_si128((__m128i*)minSBuf, _minS);
-
-                    for( int i = 0; i < 8; i++ )
-                    {
-                        int Sval = minSBuf[i];
-                        if( Sval <= minS )
-                        {
-                            if( (Sval < minS) || (bestDispBuf[i] < bestDisp) )
-                            {
-                                bestDisp = bestDispBuf[i];
-                            }
-                            minS = Sval;
-                        }
-                    }
+                    v_int32x4 _d0, _d1;
+                    v_expand(_minS, _d0, _d1);
+                    minS = (int)std::min(v_reduce_min(_d0), v_reduce_min(_d1));
+                    v_int16x8 v_mask = v_setall_s16((short)minS) == _minS;
+
+                    _bestDisp = (_bestDisp & v_mask) | (v_setall_s16(SHRT_MAX) & ~v_mask);
+                    v_expand(_bestDisp, _d0, _d1);
+                    bestDisp = (int)std::min(v_reduce_min(_d0), v_reduce_min(_d1));
                 }
                 else
 #endif
                 {
                     for( d = 0; d < D; d++ )
                     {
                         int Sval = Sp[d];
                         if( Sval < minS )
                         {
                             minS = Sval;
                             bestDisp = d;
                         }
                     }
                 }
             }
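Both the removed SSE path and the new universal-intrinsics path above track the running arg-min branchlessly: best = best ^ ((best ^ cand) & mask) picks the candidate in every lane where the comparison mask is all-ones. A standalone scalar sketch of that select idiom (names are illustrative, not from the patch):

#include <stdint.h>

// XOR-blend select used by the disparity arg-min above: where the mask is
// all-ones take the candidate, where it is zero keep the current value.
// Identity: cur ^ ((cur ^ cand) & mask) == (cand & mask) | (cur & ~mask).
static inline uint16_t blend16(uint16_t cur, uint16_t cand, uint16_t mask)
{
    return (uint16_t)(cur ^ ((cur ^ cand) & mask));
}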
...
@@ -886,6 +856,10 @@ struct SGBM3WayMainLoop : public ParallelLoopBody
     int costBufSize, hsumBufNRows;
     int TAB_OFS, ftzero;
+#if CV_SIMD128
+    bool useSIMD;
+#endif
     PixType* clipTab;

     SGBM3WayMainLoop(Mat* _buffers, const Mat& _img1, const Mat& _img2, Mat* _dst_disp, const StereoSGBMParams& params, PixType* _clipTab, int _nstripes, int _stripe_overlap);
...
@@ -915,6 +889,10 @@ buffers(_buffers), img1(&_img1), img2(&_img2), dst_disp(_dst_disp), clipTab(_cli
     hsumBufNRows = SH2*2 + 2;
     TAB_OFS = 256*4;
     ftzero = std::max(params.preFilterCap, 15) | 1;
+#if CV_SIMD128
+    useSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON);
+#endif
 }

 void getBufferPointers(Mat& buffer, int width, int width1, int D, int num_ch, int SH2, int P2,
...
@@ -1015,20 +993,25 @@ void SGBM3WayMainLoop::getRawMatchingCost(CostType* C, // target cost-volume row
                     const CostType* pixSub = pixDiff + std::max(x - (SW2+1)*D, 0);

 #if CV_SIMD128
-                    v_int16x8 hv_reg;
-                    for( d = 0; d < D; d += 8 )
+                    if( useSIMD )
                     {
-                        hv_reg = v_load_aligned(hsumAdd + x - D + d) + (v_load_aligned(pixAdd + d) - v_load_aligned(pixSub + d));
-                        v_store_aligned(hsumAdd + x + d, hv_reg);
-                        v_store_aligned(C + x + d, v_load_aligned(C + x + d) + (hv_reg - v_load_aligned(hsumSub + x + d)));
+                        v_int16x8 hv_reg;
+                        for( d = 0; d < D; d += 8 )
+                        {
+                            hv_reg = v_load_aligned(hsumAdd + x - D + d) + (v_load_aligned(pixAdd + d) - v_load_aligned(pixSub + d));
+                            v_store_aligned(hsumAdd + x + d, hv_reg);
+                            v_store_aligned(C + x + d, v_load_aligned(C + x + d) + (hv_reg - v_load_aligned(hsumSub + x + d)));
+                        }
                     }
-#else
-                    for( d = 0; d < D; d++ )
+                    else
+#endif
                     {
-                        int hv = hsumAdd[x + d] = (CostType)(hsumAdd[x - D + d] + pixAdd[d] - pixSub[d]);
-                        C[x + d] = (CostType)(C[x + d] + hv - hsumSub[x + d]);
+                        for( d = 0; d < D; d++ )
+                        {
+                            int hv = hsumAdd[x + d] = (CostType)(hsumAdd[x - D + d] + pixAdd[d] - pixSub[d]);
+                            C[x + d] = (CostType)(C[x + d] + hv - hsumSub[x + d]);
+                        }
                     }
-#endif
                 }
             }
             else
...
@@ -1055,34 +1038,13 @@ void SGBM3WayMainLoop::getRawMatchingCost(CostType* C, // target cost-volume row
 #if CV_SIMD128
 // define some additional reduce operations:
-inline short min(const v_int16x8& a)
-{
-    short CV_DECL_ALIGNED(16) buf[8];
-    v_store_aligned(buf, a);
-    short s0 = std::min(buf[0], buf[1]);
-    short s1 = std::min(buf[2], buf[3]);
-    short s2 = std::min(buf[4], buf[5]);
-    short s3 = std::min(buf[6], buf[7]);
-    return std::min(std::min(s0, s1), std::min(s2, s3));
-}
-
-inline short min_pos(const v_int16x8& val, const v_int16x8& pos)
-{
-    short CV_DECL_ALIGNED(16) val_buf[8];
-    v_store_aligned(val_buf, val);
-    short CV_DECL_ALIGNED(16) pos_buf[8];
-    v_store_aligned(pos_buf, pos);
-    short res_pos = 0;
-    short min_val = SHRT_MAX;
-    if( val_buf[0] < min_val ) { min_val = val_buf[0]; res_pos = pos_buf[0]; }
-    if( val_buf[1] < min_val ) { min_val = val_buf[1]; res_pos = pos_buf[1]; }
-    if( val_buf[2] < min_val ) { min_val = val_buf[2]; res_pos = pos_buf[2]; }
-    if( val_buf[3] < min_val ) { min_val = val_buf[3]; res_pos = pos_buf[3]; }
-    if( val_buf[4] < min_val ) { min_val = val_buf[4]; res_pos = pos_buf[4]; }
-    if( val_buf[5] < min_val ) { min_val = val_buf[5]; res_pos = pos_buf[5]; }
-    if( val_buf[6] < min_val ) { min_val = val_buf[6]; res_pos = pos_buf[6]; }
-    if( val_buf[7] < min_val ) { min_val = val_buf[7]; res_pos = pos_buf[7]; }
-    return res_pos;
-}
+inline short min_pos(const v_int16x8& val, const v_int16x8& pos, const short min_val)
+{
+    v_int16x8 v_min = v_setall_s16(min_val);
+    v_int16x8 v_mask = v_min == val;
+    v_int16x8 v_pos = (pos & v_mask) | (v_setall_s16(SHRT_MAX) & ~v_mask);
+    return v_reduce_min(v_pos);
+}
 #endif
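For reference, a scalar rendering of what the new min_pos computes: lanes whose cost does not equal the already-known minimum are masked to SHRT_MAX, so the final reduction yields the smallest position that hits the minimum. This sketch is illustrative, not part of the patch:

#include <climits>

// Scalar equivalent of the vector min_pos above: given 8 lane costs and
// their positions plus the known minimum cost, return the smallest
// position whose cost equals that minimum.
static short min_pos_scalar(const short val[8], const short pos[8], short min_val)
{
    short res = SHRT_MAX;
    for (int i = 0; i < 8; i++)
        if (val[i] == min_val && pos[i] < res)
            res = pos[i];
    return res;
}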
...
@@ -1092,104 +1054,109 @@ inline void accumulateCostsLeftTop(CostType* leftBuf, CostType* leftBuf_prev, Co
                                    CostType& leftMinCost, CostType& topMinCost, int D, int P1, int P2)
 {
 #if CV_SIMD128
-    v_int16x8 P1_reg = v_setall_s16(cv::saturate_cast<CostType>(P1));
+    if( checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON) )
+    {
+        v_int16x8 P1_reg = v_setall_s16(cv::saturate_cast<CostType>(P1));

         v_int16x8 leftMinCostP2_reg = v_setall_s16(cv::saturate_cast<CostType>(leftMinCost + P2));
         v_int16x8 leftMinCost_new_reg = v_setall_s16(SHRT_MAX);
         v_int16x8 src0_leftBuf = v_setall_s16(SHRT_MAX);
         v_int16x8 src1_leftBuf = v_load_aligned(leftBuf_prev);

         v_int16x8 topMinCostP2_reg = v_setall_s16(cv::saturate_cast<CostType>(topMinCost + P2));
         v_int16x8 topMinCost_new_reg = v_setall_s16(SHRT_MAX);
         v_int16x8 src0_topBuf = v_setall_s16(SHRT_MAX);
         v_int16x8 src1_topBuf = v_load_aligned(topBuf);

         v_int16x8 src2;
         v_int16x8 src_shifted_left, src_shifted_right;
         v_int16x8 res;

         for( int i = 0; i < D - 8; i += 8 )
         {
             //process leftBuf:
             //lookahead load:
             src2 = v_load_aligned(leftBuf_prev + i + 8);

             //get shifted versions of the current block and add P1:
             src_shifted_left  = v_extract<7>(src0_leftBuf, src1_leftBuf) + P1_reg;
             src_shifted_right = v_extract<1>(src1_leftBuf, src2) + P1_reg;

             // process and save current block:
             res = v_load_aligned(costs + i) + (v_min(v_min(src_shifted_left, src_shifted_right), v_min(src1_leftBuf, leftMinCostP2_reg)) - leftMinCostP2_reg);
             leftMinCost_new_reg = v_min(leftMinCost_new_reg, res);
             v_store_aligned(leftBuf + i, res);

             //update src buffers:
             src0_leftBuf = src1_leftBuf;
             src1_leftBuf = src2;

             //process topBuf:
             //lookahead load:
             src2 = v_load_aligned(topBuf + i + 8);

             //get shifted versions of the current block and add P1:
             src_shifted_left  = v_extract<7>(src0_topBuf, src1_topBuf) + P1_reg;
             src_shifted_right = v_extract<1>(src1_topBuf, src2) + P1_reg;

             // process and save current block:
             res = v_load_aligned(costs + i) + (v_min(v_min(src_shifted_left, src_shifted_right), v_min(src1_topBuf, topMinCostP2_reg)) - topMinCostP2_reg);
             topMinCost_new_reg = v_min(topMinCost_new_reg, res);
             v_store_aligned(topBuf + i, res);

             //update src buffers:
             src0_topBuf = src1_topBuf;
             src1_topBuf = src2;
         }

         // a bit different processing for the last cycle of the loop:
         //process leftBuf:
         src2 = v_setall_s16(SHRT_MAX);
         src_shifted_left  = v_extract<7>(src0_leftBuf, src1_leftBuf) + P1_reg;
         src_shifted_right = v_extract<1>(src1_leftBuf, src2) + P1_reg;

         res = v_load_aligned(costs + D - 8) + (v_min(v_min(src_shifted_left, src_shifted_right), v_min(src1_leftBuf, leftMinCostP2_reg)) - leftMinCostP2_reg);
-        leftMinCost = min(v_min(leftMinCost_new_reg, res));
+        leftMinCost = v_reduce_min(v_min(leftMinCost_new_reg, res));
         v_store_aligned(leftBuf + D - 8, res);

         //process topBuf:
         src2 = v_setall_s16(SHRT_MAX);
         src_shifted_left  = v_extract<7>(src0_topBuf, src1_topBuf) + P1_reg;
         src_shifted_right = v_extract<1>(src1_topBuf, src2) + P1_reg;

         res = v_load_aligned(costs + D - 8) + (v_min(v_min(src_shifted_left, src_shifted_right), v_min(src1_topBuf, topMinCostP2_reg)) - topMinCostP2_reg);
-        topMinCost = min(v_min(topMinCost_new_reg, res));
+        topMinCost = v_reduce_min(v_min(topMinCost_new_reg, res));
         v_store_aligned(topBuf + D - 8, res);
+    }
+    else
+#endif
-#else
     {
         CostType leftMinCost_new = SHRT_MAX;
         CostType topMinCost_new = SHRT_MAX;
         int leftMinCost_P2 = leftMinCost + P2;
         int topMinCost_P2 = topMinCost + P2;
         CostType leftBuf_prev_i_minus_1 = SHRT_MAX;
         CostType topBuf_i_minus_1 = SHRT_MAX;
         CostType tmp;
         for( int i = 0; i < D - 1; i++ )
         {
             leftBuf[i] = cv::saturate_cast<CostType>(costs[i] + std::min(std::min(leftBuf_prev_i_minus_1 + P1, leftBuf_prev[i+1] + P1), std::min((int)leftBuf_prev[i], leftMinCost_P2)) - leftMinCost_P2);
             leftBuf_prev_i_minus_1 = leftBuf_prev[i];
             leftMinCost_new = std::min(leftMinCost_new, leftBuf[i]);

             tmp = topBuf[i];
             topBuf[i] = cv::saturate_cast<CostType>(costs[i] + std::min(std::min(topBuf_i_minus_1 + P1, topBuf[i+1] + P1), std::min((int)topBuf[i], topMinCost_P2)) - topMinCost_P2);
             topBuf_i_minus_1 = tmp;
             topMinCost_new = std::min(topMinCost_new, topBuf[i]);
         }
         leftBuf[D-1] = cv::saturate_cast<CostType>(costs[D-1] + std::min(leftBuf_prev_i_minus_1 + P1, std::min((int)leftBuf_prev[D-1], leftMinCost_P2)) - leftMinCost_P2);
         leftMinCost = std::min(leftMinCost_new, leftBuf[D-1]);

         topBuf[D-1] = cv::saturate_cast<CostType>(costs[D-1] + std::min(topBuf_i_minus_1 + P1, std::min((int)topBuf[D-1], topMinCost_P2)) - topMinCost_P2);
         topMinCost = std::min(topMinCost_new, topBuf[D-1]);
     }
-#endif
 }
// performing in-place SGM cost accumulation from right to left (the result is stored in rightBuf) and
...
@@ -1199,96 +1166,101 @@ inline void accumulateCostsRight(CostType* rightBuf, CostType* topBuf, CostType*
                                  CostType& rightMinCost, int D, int P1, int P2, int& optimal_disp, CostType& min_cost)
 {
 #if CV_SIMD128
-    v_int16x8 P1_reg = v_setall_s16(cv::saturate_cast<CostType>(P1));
+    if( checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON) )
+    {
+        v_int16x8 P1_reg = v_setall_s16(cv::saturate_cast<CostType>(P1));

         v_int16x8 rightMinCostP2_reg = v_setall_s16(cv::saturate_cast<CostType>(rightMinCost + P2));
         v_int16x8 rightMinCost_new_reg = v_setall_s16(SHRT_MAX);
         v_int16x8 src0_rightBuf = v_setall_s16(SHRT_MAX);
         v_int16x8 src1_rightBuf = v_load(rightBuf);

         v_int16x8 src2;
         v_int16x8 src_shifted_left, src_shifted_right;
         v_int16x8 res;

         v_int16x8 min_sum_cost_reg = v_setall_s16(SHRT_MAX);
         v_int16x8 min_sum_pos_reg = v_setall_s16(0);
         v_int16x8 loop_idx(0, 1, 2, 3, 4, 5, 6, 7);
         v_int16x8 eight_reg = v_setall_s16(8);

         for( int i = 0; i < D - 8; i += 8 )
         {
             //lookahead load:
             src2 = v_load_aligned(rightBuf + i + 8);

             //get shifted versions of the current block and add P1:
             src_shifted_left  = v_extract<7>(src0_rightBuf, src1_rightBuf) + P1_reg;
             src_shifted_right = v_extract<1>(src1_rightBuf, src2) + P1_reg;

             // process and save current block:
             res = v_load_aligned(costs + i) + (v_min(v_min(src_shifted_left, src_shifted_right), v_min(src1_rightBuf, rightMinCostP2_reg)) - rightMinCostP2_reg);
             rightMinCost_new_reg = v_min(rightMinCost_new_reg, res);
             v_store_aligned(rightBuf + i, res);

             // compute and save total cost:
             res = res + v_load_aligned(leftBuf + i) + v_load_aligned(topBuf + i);
             v_store_aligned(leftBuf + i, res);

             // track disparity value with the minimum cost:
             min_sum_cost_reg = v_min(min_sum_cost_reg, res);
             min_sum_pos_reg = min_sum_pos_reg + ((min_sum_cost_reg == res) & (loop_idx - min_sum_pos_reg));
             loop_idx = loop_idx + eight_reg;

             //update src:
             src0_rightBuf = src1_rightBuf;
             src1_rightBuf = src2;
         }

         // a bit different processing for the last cycle of the loop:
         src2 = v_setall_s16(SHRT_MAX);
         src_shifted_left  = v_extract<7>(src0_rightBuf, src1_rightBuf) + P1_reg;
         src_shifted_right = v_extract<1>(src1_rightBuf, src2) + P1_reg;

         res = v_load_aligned(costs + D - 8) + (v_min(v_min(src_shifted_left, src_shifted_right), v_min(src1_rightBuf, rightMinCostP2_reg)) - rightMinCostP2_reg);
-        rightMinCost = min(v_min(rightMinCost_new_reg, res));
+        rightMinCost = v_reduce_min(v_min(rightMinCost_new_reg, res));
         v_store_aligned(rightBuf + D - 8, res);

         res = res + v_load_aligned(leftBuf + D - 8) + v_load_aligned(topBuf + D - 8);
         v_store_aligned(leftBuf + D - 8, res);

         min_sum_cost_reg = v_min(min_sum_cost_reg, res);
-        min_cost = min(min_sum_cost_reg);
+        min_cost = v_reduce_min(min_sum_cost_reg);
         min_sum_pos_reg = min_sum_pos_reg + ((min_sum_cost_reg == res) & (loop_idx - min_sum_pos_reg));
-        optimal_disp = min_pos(min_sum_cost_reg, min_sum_pos_reg);
+        optimal_disp = min_pos(min_sum_cost_reg, min_sum_pos_reg, min_cost);
+    }
+    else
+#endif
-#else
     {
         CostType rightMinCost_new = SHRT_MAX;
         int rightMinCost_P2 = rightMinCost + P2;
         CostType rightBuf_i_minus_1 = SHRT_MAX;
         CostType tmp;
         min_cost = SHRT_MAX;
         for( int i = 0; i < D - 1; i++ )
         {
             tmp = rightBuf[i];
             rightBuf[i] = cv::saturate_cast<CostType>(costs[i] + std::min(std::min(rightBuf_i_minus_1 + P1, rightBuf[i+1] + P1), std::min((int)rightBuf[i], rightMinCost_P2)) - rightMinCost_P2);
             rightBuf_i_minus_1 = tmp;
             rightMinCost_new = std::min(rightMinCost_new, rightBuf[i]);
             leftBuf[i] = cv::saturate_cast<CostType>((int)leftBuf[i] + rightBuf[i] + topBuf[i]);
             if( leftBuf[i] < min_cost )
             {
                 optimal_disp = i;
                 min_cost = leftBuf[i];
             }
         }

         rightBuf[D-1] = cv::saturate_cast<CostType>(costs[D-1] + std::min(rightBuf_i_minus_1 + P1, std::min((int)rightBuf[D-1], rightMinCost_P2)) - rightMinCost_P2);
         rightMinCost = std::min(rightMinCost_new, rightBuf[D-1]);
         leftBuf[D-1] = cv::saturate_cast<CostType>((int)leftBuf[D-1] + rightBuf[D-1] + topBuf[D-1]);
         if( leftBuf[D-1] < min_cost )
         {
             optimal_disp = D - 1;
             min_cost = leftBuf[D-1];
         }
     }
-#endif
 }

 void SGBM3WayMainLoop::operator () (const Range& range) const
...
@@ -1360,42 +1332,47 @@ void SGBM3WayMainLoop::operator () (const Range& range) const
                     if( uniquenessRatio > 0 )
                     {
 #if CV_SIMD128
-                        horPassCostVolume += x;
-                        int thresh = (100*min_cost)/(100-uniquenessRatio);
-                        v_int16x8 thresh_reg = v_setall_s16((short)(thresh + 1));
-                        v_int16x8 d1 = v_setall_s16((short)(best_d - 1));
-                        v_int16x8 d2 = v_setall_s16((short)(best_d + 1));
-                        v_int16x8 eight_reg = v_setall_s16(8);
-                        v_int16x8 cur_d(0, 1, 2, 3, 4, 5, 6, 7);
-                        v_int16x8 mask, cost1, cost2;
-                        for( d = 0; d < D; d += 16 )
+                        if( useSIMD )
                         {
-                            cost1 = v_load_aligned(horPassCostVolume + d);
-                            cost2 = v_load_aligned(horPassCostVolume + d + 8);
+                            horPassCostVolume += x;
+                            int thresh = (100*min_cost)/(100-uniquenessRatio);
+                            v_int16x8 thresh_reg = v_setall_s16((short)(thresh + 1));
+                            v_int16x8 d1 = v_setall_s16((short)(best_d - 1));
+                            v_int16x8 d2 = v_setall_s16((short)(best_d + 1));
+                            v_int16x8 eight_reg = v_setall_s16(8);
+                            v_int16x8 cur_d(0, 1, 2, 3, 4, 5, 6, 7);
+                            v_int16x8 mask, cost1, cost2;
+                            for( d = 0; d < D; d += 16 )
+                            {
+                                cost1 = v_load_aligned(horPassCostVolume + d);
+                                cost2 = v_load_aligned(horPassCostVolume + d + 8);

-                            mask = cost1 < thresh_reg;
-                            mask = mask & ( (cur_d < d1) | (cur_d > d2) );
-                            if( v_signmask(mask) )
-                                break;
+                                mask = cost1 < thresh_reg;
+                                mask = mask & ( (cur_d < d1) | (cur_d > d2) );
+                                if( v_signmask(mask) )
+                                    break;

-                            cur_d = cur_d + eight_reg;
+                                cur_d = cur_d + eight_reg;

-                            mask = cost2 < thresh_reg;
-                            mask = mask & ( (cur_d < d1) | (cur_d > d2) );
-                            if( v_signmask(mask) )
-                                break;
+                                mask = cost2 < thresh_reg;
+                                mask = mask & ( (cur_d < d1) | (cur_d > d2) );
+                                if( v_signmask(mask) )
+                                    break;

-                            cur_d = cur_d + eight_reg;
+                                cur_d = cur_d + eight_reg;
+                            }
+                            horPassCostVolume -= x;
                         }
-                        horPassCostVolume -= x;
-#else
-                        for( d = 0; d < D; d++ )
+                        else
+#endif
                         {
-                            if( horPassCostVolume[x+d]*(100 - uniquenessRatio) < min_cost*100 && std::abs(d - best_d) > 1 )
-                                break;
+                            for( d = 0; d < D; d++ )
+                            {
+                                if( horPassCostVolume[x+d]*(100 - uniquenessRatio) < min_cost*100 && std::abs(d - best_d) > 1 )
+                                    break;
+                            }
                         }
-#endif
                         if( d < D )
                             continue;
                     }
...
modules/core/include/opencv2/core/hal/intrin_neon.hpp
...
@@ -782,25 +782,37 @@ inline void v_store_f16(short* ptr, v_float16x4& a)
 { vst1_f16(ptr, a.val); }
 #endif

-#define OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(_Tpvec, scalartype, func, scalar_func) \
+#define OPENCV_HAL_IMPL_NEON_REDUCE_OP_8(_Tpvec, _Tpnvec, scalartype, func, vectorfunc, suffix) \
 inline scalartype v_reduce_##func(const _Tpvec& a) \
 { \
-    scalartype CV_DECL_ALIGNED(16) buf[4]; \
-    v_store_aligned(buf, a); \
-    scalartype s0 = scalar_func(buf[0], buf[1]); \
-    scalartype s1 = scalar_func(buf[2], buf[3]); \
-    return scalar_func(s0, s1); \
+    _Tpnvec##_t a0 = vp##vectorfunc##_##suffix(vget_low_##suffix(a.val), vget_high_##suffix(a.val)); \
+    a0 = vp##vectorfunc##_##suffix(a0, a0); \
+    return (scalartype)vget_lane_##suffix(vp##vectorfunc##_##suffix(a0, a0),0); \
 }

-OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_uint32x4, unsigned, sum, OPENCV_HAL_ADD)
-OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_uint32x4, unsigned, max, std::max)
-OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_uint32x4, unsigned, min, std::min)
-OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_int32x4, int, sum, OPENCV_HAL_ADD)
-OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_int32x4, int, max, std::max)
-OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_int32x4, int, min, std::min)
-OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_float32x4, float, sum, OPENCV_HAL_ADD)
-OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_float32x4, float, max, std::max)
-OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_float32x4, float, min, std::min)
+OPENCV_HAL_IMPL_NEON_REDUCE_OP_8(v_uint16x8, uint16x4, unsigned short, sum, add, u16)
+OPENCV_HAL_IMPL_NEON_REDUCE_OP_8(v_uint16x8, uint16x4, unsigned short, max, max, u16)
+OPENCV_HAL_IMPL_NEON_REDUCE_OP_8(v_uint16x8, uint16x4, unsigned short, min, min, u16)
+OPENCV_HAL_IMPL_NEON_REDUCE_OP_8(v_int16x8, int16x4, short, sum, add, s16)
+OPENCV_HAL_IMPL_NEON_REDUCE_OP_8(v_int16x8, int16x4, short, max, max, s16)
+OPENCV_HAL_IMPL_NEON_REDUCE_OP_8(v_int16x8, int16x4, short, min, min, s16)
+
+#define OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(_Tpvec, _Tpnvec, scalartype, func, vectorfunc, suffix) \
+inline scalartype v_reduce_##func(const _Tpvec& a) \
+{ \
+    _Tpnvec##_t a0 = vp##vectorfunc##_##suffix(vget_low_##suffix(a.val), vget_high_##suffix(a.val)); \
+    return (scalartype)vget_lane_##suffix(vp##vectorfunc##_##suffix(a0, vget_high_##suffix(a.val)),0); \
+}
+
+OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_uint32x4, uint32x2, unsigned, sum, add, u32)
+OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_uint32x4, uint32x2, unsigned, max, max, u32)
+OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_uint32x4, uint32x2, unsigned, min, min, u32)
+OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_int32x4, int32x2, int, sum, add, s32)
+OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_int32x4, int32x2, int, max, max, s32)
+OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_int32x4, int32x2, int, min, min, s32)
+OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_float32x4, float32x2, float, sum, add, f32)
+OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_float32x4, float32x2, float, max, max, f32)
+OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_float32x4, float32x2, float, min, min, f32)

 inline int v_signmask(const v_uint8x16& a)
 {
...
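A minimal usage sketch for the reductions this hunk rewrites, assuming a build where CV_SIMD128 is enabled (the test file below exercises the same calls):

#include <opencv2/core/hal/intrin.hpp>
#include <cstdio>

int main()
{
    short data[8] = { 3, 1, 4, 1, 5, 9, 2, 6 };
    cv::v_int16x8 a = cv::v_load(data);
    // On NEON these now compile to vpmin/vpmax/vpadd pairs instead of a
    // store-to-buffer fallback; on SSE2 to min/shift ladders.
    short mn = cv::v_reduce_min(a);   // 1
    short mx = cv::v_reduce_max(a);   // 9
    short s  = cv::v_reduce_sum(a);   // 31 (saturating adds on SSE2)
    std::printf("min=%d max=%d sum=%d\n", mn, mx, s);
    return 0;
}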
modules/core/include/opencv2/core/hal/intrin_sse.hpp
...
@@ -1060,6 +1060,46 @@ inline void v_store_f16(short* ptr, v_float16x4& a)
 { _mm_storel_epi64((__m128i*)ptr, a.val); }
 #endif

+#define OPENCV_HAL_IMPL_SSE_REDUCE_OP_8(_Tpvec, scalartype, func, suffix, sbit) \
+inline scalartype v_reduce_##func(const v_##_Tpvec& a) \
+{ \
+    __m128i val = a.val; \
+    val = _mm_##func##_##suffix(val, _mm_srli_si128(val,8)); \
+    val = _mm_##func##_##suffix(val, _mm_srli_si128(val,4)); \
+    val = _mm_##func##_##suffix(val, _mm_srli_si128(val,2)); \
+    return (scalartype)_mm_cvtsi128_si32(val); \
+} \
+inline unsigned scalartype v_reduce_##func(const v_u##_Tpvec& a) \
+{ \
+    __m128i val = a.val; \
+    __m128i smask = _mm_set1_epi16(sbit); \
+    val = _mm_xor_si128(val, smask); \
+    val = _mm_##func##_##suffix(val, _mm_srli_si128(val,8)); \
+    val = _mm_##func##_##suffix(val, _mm_srli_si128(val,4)); \
+    val = _mm_##func##_##suffix(val, _mm_srli_si128(val,2)); \
+    return (unsigned scalartype)(_mm_cvtsi128_si32(val) ^ sbit); \
+}
+#define OPENCV_HAL_IMPL_SSE_REDUCE_OP_8_SUM(_Tpvec, scalartype, suffix) \
+inline scalartype v_reduce_sum(const v_##_Tpvec& a) \
+{ \
+    __m128i val = a.val; \
+    val = _mm_adds_epi##suffix(val, _mm_srli_si128(val, 8)); \
+    val = _mm_adds_epi##suffix(val, _mm_srli_si128(val, 4)); \
+    val = _mm_adds_epi##suffix(val, _mm_srli_si128(val, 2)); \
+    return (scalartype)_mm_cvtsi128_si32(val); \
+} \
+inline unsigned scalartype v_reduce_sum(const v_u##_Tpvec& a) \
+{ \
+    __m128i val = a.val; \
+    val = _mm_adds_epu##suffix(val, _mm_srli_si128(val, 8)); \
+    val = _mm_adds_epu##suffix(val, _mm_srli_si128(val, 4)); \
+    val = _mm_adds_epu##suffix(val, _mm_srli_si128(val, 2)); \
+    return (unsigned scalartype)_mm_cvtsi128_si32(val); \
+}
+
+OPENCV_HAL_IMPL_SSE_REDUCE_OP_8(int16x8, short, max, epi16, (short)-32768)
+OPENCV_HAL_IMPL_SSE_REDUCE_OP_8(int16x8, short, min, epi16, (short)-32768)
+OPENCV_HAL_IMPL_SSE_REDUCE_OP_8_SUM(int16x8, short, 16)
+
 #define OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(_Tpvec, scalartype, func, scalar_func) \
 inline scalartype v_reduce_##func(const _Tpvec& a) \
 { \
...
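The unsigned variants in the new SSE macros reuse the signed epi16 min/max, which is the only 16-bit min/max SSE2 offers: XOR-ing every lane with 0x8000 maps unsigned order onto signed order, and a final XOR undoes the bias. A standalone sketch of that trick (hypothetical helper name, plain SSE2):

#include <emmintrin.h>  // SSE2

// Reduce the minimum of 8 unsigned 16-bit lanes with SSE2 only.
// XOR with 0x8000 turns unsigned comparison into signed comparison,
// so _mm_min_epi16 can be reused; the final XOR restores the value.
static unsigned short reduce_min_u16(__m128i v)
{
    const __m128i smask = _mm_set1_epi16((short)0x8000);
    __m128i x = _mm_xor_si128(v, smask);
    x = _mm_min_epi16(x, _mm_srli_si128(x, 8));  // fold lanes 4..7 onto 0..3
    x = _mm_min_epi16(x, _mm_srli_si128(x, 4));  // fold lanes 2..3 onto 0..1
    x = _mm_min_epi16(x, _mm_srli_si128(x, 2));  // fold lane 1 onto lane 0
    return (unsigned short)(_mm_cvtsi128_si32(x) ^ 0x8000);
}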
modules/core/test/test_intrin.cpp
...
@@ -449,7 +449,7 @@ template<typename R> struct TheTest
         R a = dataA;
         EXPECT_EQ((LaneType)1, v_reduce_min(a));
         EXPECT_EQ((LaneType)R::nlanes, v_reduce_max(a));
-        EXPECT_EQ((LaneType)(1 + R::nlanes)*2, v_reduce_sum(a));
+        EXPECT_EQ((LaneType)((1 + R::nlanes)*R::nlanes/2), v_reduce_sum(a));
         return *this;
     }
...
@@ -842,6 +842,7 @@ TEST(hal_intrin, uint16x8) {
         .test_logic()
         .test_min_max()
         .test_absdiff()
+        .test_reduce()
         .test_mask()
         .test_pack<1>().test_pack<2>().test_pack<7>().test_pack<16>()
         .test_pack_u<1>().test_pack_u<2>().test_pack_u<7>().test_pack_u<16>()
...
@@ -867,6 +868,7 @@ TEST(hal_intrin, int16x8) {
         .test_min_max()
         .test_absdiff()
         .test_abs()
+        .test_reduce()
         .test_mask()
         .test_pack<1>().test_pack<2>().test_pack<7>().test_pack<16>()
         .test_unpack()
...
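A note on the corrected v_reduce_sum expectation: judging by the min/max expectations, the test vector dataA holds the lane values 1 through nlanes, so the exact total is the arithmetic series

$$\sum_{k=1}^{n} k = \frac{n(n+1)}{2},$$

which for an 8-lane v_int16x8 gives 36, matching the new (1 + R::nlanes)*R::nlanes/2 expression.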