Skip to content
Projects
Groups
Snippets
Help
Loading...
Sign in / Register
Toggle navigation
O
opencv
Project
Project
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Packages
Packages
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
submodule
opencv
Commits
429d2805
Commit
429d2805
authored
Jul 30, 2015
by
Maksim Shabunin
Browse files
Options
Browse Files
Download
Plain Diff
Merge pull request #5081 from sbokov:Adding_HAL_v_extract
parents
6922b948
1ef8cf5a
Show whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
18 additions
and
42 deletions
+18
-42
stereosgbm.cpp
modules/calib3d/src/stereosgbm.cpp
+18
-42
No files found.
modules/calib3d/src/stereosgbm.cpp
View file @
429d2805
...
...
@@ -1017,7 +1017,7 @@ void SGBM3WayMainLoop::getRawMatchingCost(CostType* C, // target cost-volume row
}
}
#if CV_SIMD128
&& CV_SSE2
#if CV_SIMD128
// define some additional reduce operations:
inline
short
min
(
const
v_int16x8
&
a
)
{
...
...
@@ -1055,7 +1055,7 @@ inline short min_pos(const v_int16x8& val,const v_int16x8& pos)
inline
void
accumulateCostsLeftTop
(
CostType
*
leftBuf
,
CostType
*
leftBuf_prev
,
CostType
*
topBuf
,
CostType
*
costs
,
CostType
&
leftMinCost
,
CostType
&
topMinCost
,
int
D
,
int
P1
,
int
P2
)
{
#if CV_SIMD128
&& CV_SSE2
#if CV_SIMD128
v_int16x8
P1_reg
=
v_setall_s16
(
cv
::
saturate_cast
<
CostType
>
(
P1
));
v_int16x8
leftMinCostP2_reg
=
v_setall_s16
(
cv
::
saturate_cast
<
CostType
>
(
leftMinCost
+
P2
));
...
...
@@ -1078,13 +1078,9 @@ inline void accumulateCostsLeftTop(CostType* leftBuf, CostType* leftBuf_prev, Co
//lookahead load:
src2
=
v_load_aligned
(
leftBuf_prev
+
i
+
8
);
//get shifted versions of the current block:
src_shifted_left
=
v_int16x8
(
_mm_slli_si128
(
src1_leftBuf
.
val
,
2
));
src_shifted_right
=
v_int16x8
(
_mm_srli_si128
(
src1_leftBuf
.
val
,
2
));
// replace shifted-in zeros with proper values and add P1:
src_shifted_left
=
(
src_shifted_left
|
v_int16x8
(
_mm_srli_si128
(
src0_leftBuf
.
val
,
14
)))
+
P1_reg
;
src_shifted_right
=
(
src_shifted_right
|
v_int16x8
(
_mm_slli_si128
(
src2
.
val
,
14
)))
+
P1_reg
;
//get shifted versions of the current block and add P1:
src_shifted_left
=
v_extract
<
7
>
(
src0_leftBuf
,
src1_leftBuf
)
+
P1_reg
;
src_shifted_right
=
v_extract
<
1
>
(
src1_leftBuf
,
src2
)
+
P1_reg
;
// process and save current block:
res
=
v_load_aligned
(
costs
+
i
)
+
(
v_min
(
v_min
(
src_shifted_left
,
src_shifted_right
),
v_min
(
src1_leftBuf
,
leftMinCostP2_reg
))
-
leftMinCostP2_reg
);
...
...
@@ -1099,13 +1095,9 @@ inline void accumulateCostsLeftTop(CostType* leftBuf, CostType* leftBuf_prev, Co
//lookahead load:
src2
=
v_load_aligned
(
topBuf
+
i
+
8
);
//get shifted versions of the current block:
src_shifted_left
=
v_int16x8
(
_mm_slli_si128
(
src1_topBuf
.
val
,
2
));
src_shifted_right
=
v_int16x8
(
_mm_srli_si128
(
src1_topBuf
.
val
,
2
));
// replace shifted-in zeros with proper values and add P1:
src_shifted_left
=
(
src_shifted_left
|
v_int16x8
(
_mm_srli_si128
(
src0_topBuf
.
val
,
14
)))
+
P1_reg
;
src_shifted_right
=
(
src_shifted_right
|
v_int16x8
(
_mm_slli_si128
(
src2
.
val
,
14
)))
+
P1_reg
;
//get shifted versions of the current block and add P1:
src_shifted_left
=
v_extract
<
7
>
(
src0_topBuf
,
src1_topBuf
)
+
P1_reg
;
src_shifted_right
=
v_extract
<
1
>
(
src1_topBuf
,
src2
)
+
P1_reg
;
// process and save current block:
res
=
v_load_aligned
(
costs
+
i
)
+
(
v_min
(
v_min
(
src_shifted_left
,
src_shifted_right
),
v_min
(
src1_topBuf
,
topMinCostP2_reg
))
-
topMinCostP2_reg
);
...
...
@@ -1119,26 +1111,18 @@ inline void accumulateCostsLeftTop(CostType* leftBuf, CostType* leftBuf_prev, Co
// a bit different processing for the last cycle of the loop:
//process leftBuf:
src_shifted_left
=
v_int16x8
(
_mm_slli_si128
(
src1_leftBuf
.
val
,
2
));
src_shifted_right
=
v_int16x8
(
_mm_srli_si128
(
src1_leftBuf
.
val
,
2
));
src2
=
v_setall_s16
(
SHRT_MAX
);
src_shifted_left
=
(
src_shifted_left
|
v_int16x8
(
_mm_srli_si128
(
src0_leftBuf
.
val
,
14
)))
+
P1_reg
;
src_shifted_right
=
(
src_shifted_right
|
v_int16x8
(
_mm_slli_si128
(
src2
.
val
,
14
)))
+
P1_reg
;
src_shifted_left
=
v_extract
<
7
>
(
src0_leftBuf
,
src1_leftBuf
)
+
P1_reg
;
src_shifted_right
=
v_extract
<
1
>
(
src1_leftBuf
,
src2
)
+
P1_reg
;
res
=
v_load_aligned
(
costs
+
D
-
8
)
+
(
v_min
(
v_min
(
src_shifted_left
,
src_shifted_right
),
v_min
(
src1_leftBuf
,
leftMinCostP2_reg
))
-
leftMinCostP2_reg
);
leftMinCost
=
min
(
v_min
(
leftMinCost_new_reg
,
res
));
v_store_aligned
(
leftBuf
+
D
-
8
,
res
);
//process topBuf:
src_shifted_left
=
v_int16x8
(
_mm_slli_si128
(
src1_topBuf
.
val
,
2
));
src_shifted_right
=
v_int16x8
(
_mm_srli_si128
(
src1_topBuf
.
val
,
2
));
src2
=
v_setall_s16
(
SHRT_MAX
);
src_shifted_left
=
(
src_shifted_left
|
v_int16x8
(
_mm_srli_si128
(
src0_topBuf
.
val
,
14
)))
+
P1_reg
;
src_shifted_right
=
(
src_shifted_right
|
v_int16x8
(
_mm_slli_si128
(
src2
.
val
,
14
)))
+
P1_reg
;
src_shifted_left
=
v_extract
<
7
>
(
src0_topBuf
,
src1_topBuf
)
+
P1_reg
;
src_shifted_right
=
v_extract
<
1
>
(
src1_topBuf
,
src2
)
+
P1_reg
;
res
=
v_load_aligned
(
costs
+
D
-
8
)
+
(
v_min
(
v_min
(
src_shifted_left
,
src_shifted_right
),
v_min
(
src1_topBuf
,
topMinCostP2_reg
))
-
topMinCostP2_reg
);
topMinCost
=
min
(
v_min
(
topMinCost_new_reg
,
res
));
...
...
@@ -1178,7 +1162,7 @@ inline void accumulateCostsLeftTop(CostType* leftBuf, CostType* leftBuf_prev, Co
inline
void
accumulateCostsRight
(
CostType
*
rightBuf
,
CostType
*
topBuf
,
CostType
*
leftBuf
,
CostType
*
costs
,
CostType
&
rightMinCost
,
int
D
,
int
P1
,
int
P2
,
int
&
optimal_disp
,
CostType
&
min_cost
)
{
#if CV_SIMD128
&& CV_SSE2
#if CV_SIMD128
v_int16x8
P1_reg
=
v_setall_s16
(
cv
::
saturate_cast
<
CostType
>
(
P1
));
v_int16x8
rightMinCostP2_reg
=
v_setall_s16
(
cv
::
saturate_cast
<
CostType
>
(
rightMinCost
+
P2
));
...
...
@@ -1200,13 +1184,9 @@ inline void accumulateCostsRight(CostType* rightBuf, CostType* topBuf, CostType*
//lookahead load:
src2
=
v_load_aligned
(
rightBuf
+
i
+
8
);
//get shifted versions of the current block:
src_shifted_left
=
v_int16x8
(
_mm_slli_si128
(
src1_rightBuf
.
val
,
2
));
src_shifted_right
=
v_int16x8
(
_mm_srli_si128
(
src1_rightBuf
.
val
,
2
));
// replace shifted-in zeros with proper values and add P1:
src_shifted_left
=
(
src_shifted_left
|
v_int16x8
(
_mm_srli_si128
(
src0_rightBuf
.
val
,
14
)))
+
P1_reg
;
src_shifted_right
=
(
src_shifted_right
|
v_int16x8
(
_mm_slli_si128
(
src2
.
val
,
14
)))
+
P1_reg
;
//get shifted versions of the current block and add P1:
src_shifted_left
=
v_extract
<
7
>
(
src0_rightBuf
,
src1_rightBuf
)
+
P1_reg
;
src_shifted_right
=
v_extract
<
1
>
(
src1_rightBuf
,
src2
)
+
P1_reg
;
// process and save current block:
res
=
v_load_aligned
(
costs
+
i
)
+
(
v_min
(
v_min
(
src_shifted_left
,
src_shifted_right
),
v_min
(
src1_rightBuf
,
rightMinCostP2_reg
))
-
rightMinCostP2_reg
);
...
...
@@ -1228,13 +1208,9 @@ inline void accumulateCostsRight(CostType* rightBuf, CostType* topBuf, CostType*
}
// a bit different processing for the last cycle of the loop:
src_shifted_left
=
v_int16x8
(
_mm_slli_si128
(
src1_rightBuf
.
val
,
2
));
src_shifted_right
=
v_int16x8
(
_mm_srli_si128
(
src1_rightBuf
.
val
,
2
));
src2
=
v_setall_s16
(
SHRT_MAX
);
src_shifted_left
=
(
src_shifted_left
|
v_int16x8
(
_mm_srli_si128
(
src0_rightBuf
.
val
,
14
)))
+
P1_reg
;
src_shifted_right
=
(
src_shifted_right
|
v_int16x8
(
_mm_slli_si128
(
src2
.
val
,
14
)))
+
P1_reg
;
src_shifted_left
=
v_extract
<
7
>
(
src0_rightBuf
,
src1_rightBuf
)
+
P1_reg
;
src_shifted_right
=
v_extract
<
1
>
(
src1_rightBuf
,
src2
)
+
P1_reg
;
res
=
v_load_aligned
(
costs
+
D
-
8
)
+
(
v_min
(
v_min
(
src_shifted_left
,
src_shifted_right
),
v_min
(
src1_rightBuf
,
rightMinCostP2_reg
))
-
rightMinCostP2_reg
);
rightMinCost
=
min
(
v_min
(
rightMinCost_new_reg
,
res
));
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment