submodule / opencv / Commits / 788ac96f

Commit 788ac96f authored Aug 02, 2010 by Vladislav Vinogradov
added optimized belief propagation implementation (used short for messages)
parent d6bbaea2
Showing 4 changed files with 499 additions and 302 deletions (+499 -302):

    modules/gpu/include/opencv2/gpu/gpu.hpp      +26   -10
    modules/gpu/src/beliefpropagation_gpu.cpp    +131  -65
    modules/gpu/src/cuda/beliefpropagation.cu    +233  -122
    modules/gpu/src/cuda/saturate_cast.hpp       +109  -105
modules/gpu/include/opencv2/gpu/gpu.hpp

@@ -375,20 +375,32 @@ namespace cv
     class CV_EXPORTS StereoBeliefPropagation_GPU
     {
     public:
+        enum { MSG_TYPE_AUTO, MSG_TYPE_FLOAT, MSG_TYPE_SHORT_SCALE_AUTO, MSG_TYPE_SHORT_SCALE_MANUAL };
+
         enum { DEFAULT_NDISP  = 64 };
         enum { DEFAULT_ITERS  = 5 };
         enum { DEFAULT_LEVELS = 5 };

         //! the default constructor
-        explicit StereoBeliefPropagation_GPU(int ndisp = DEFAULT_NDISP, int iters = DEFAULT_ITERS, int levels = DEFAULT_LEVELS);
-        //! the full constructor taking the number of disparities, number of BP iterations on first level,
-        //! number of levels, truncation of discontinuity cost, truncation of data cost and weighting of data cost.
-        StereoBeliefPropagation_GPU(int ndisp, int iters, int levels, float disc_cost, float data_cost, float lambda);
+        explicit StereoBeliefPropagation_GPU(int ndisp_ = DEFAULT_NDISP, int iters_ = DEFAULT_ITERS, int levels_ = DEFAULT_LEVELS, int msg_type_ = MSG_TYPE_AUTO, float msg_scale = 1.0f);
+        //! the full constructor taking the number of disparities, number of BP iterations on each level,
+        //! number of levels, truncation of data cost, data weight,
+        //! truncation of discontinuity cost and discontinuity single jump
+        StereoBeliefPropagation_GPU(int ndisp_, int iters_, int levels_, float max_data_term_, float data_weight_, float max_disc_term_, float disc_single_jump_, int msg_type_ = MSG_TYPE_AUTO, float msg_scale = 1.0f);

         //! the stereo correspondence operator. Finds the disparity for the specified rectified stereo pair,
-        //! if disparity is empty output type will be CV_32S else output type will be disparity.type().
+        //! if disparity is empty output type will be CV_16S else output type will be disparity.type().
         void operator()(const GpuMat& left, const GpuMat& right, GpuMat& disparity);
         //! Async version
...
@@ -404,9 +416,13 @@ namespace cv
         int iters;
         int levels;

-        float disc_cost;
-        float data_cost;
-        float lambda;
+        float max_data_term;
+        float data_weight;
+        float max_disc_term;
+        float disc_single_jump;
+
+        int   msg_type;
+        float msg_scale;

     private:
         GpuMat u, d, l, r, u2, d2, l2, r2;
         std::vector<GpuMat> datas;
...
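For orientation, a minimal host-side sketch of how the reworked class is driven. This is not part of the commit: the image file names are placeholders, and the GpuMat upload/download calls are assumed from the gpu module API of this period.

    #include <opencv2/gpu/gpu.hpp>
    #include <opencv2/highgui/highgui.hpp>

    int main()
    {
        cv::Mat left  = cv::imread("left.png",  0);  // rectified pair, 8U grayscale (placeholder files)
        cv::Mat right = cv::imread("right.png", 0);

        cv::gpu::GpuMat d_left, d_right, d_disp;
        d_left.upload(left);
        d_right.upload(right);

        // MSG_TYPE_AUTO (the default) picks CV_16S messages at the largest scale
        // that passes the overflow check, falling back to CV_32F otherwise.
        cv::gpu::StereoBeliefPropagation_GPU bp(64, 5, 5);
        bp(d_left, d_right, d_disp);                 // empty output -> created as CV_16S

        cv::Mat disp;
        d_disp.download(disp);
        return 0;
    }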
modules/gpu/src/beliefpropagation_gpu.cpp
@@ -48,8 +48,8 @@ using namespace std;
 #if !defined (HAVE_CUDA)

-cv::gpu::StereoBeliefPropagation_GPU::StereoBeliefPropagation_GPU(int, int, int) { throw_nogpu(); }
-cv::gpu::StereoBeliefPropagation_GPU::StereoBeliefPropagation_GPU(int, int, int, float, float, float) { throw_nogpu(); }
+cv::gpu::StereoBeliefPropagation_GPU::StereoBeliefPropagation_GPU(int, int, int, int, float) { throw_nogpu(); }
+cv::gpu::StereoBeliefPropagation_GPU::StereoBeliefPropagation_GPU(int, int, int, float, float, float, float, int, float) { throw_nogpu(); }

 void cv::gpu::StereoBeliefPropagation_GPU::operator()(const GpuMat&, const GpuMat&, GpuMat&) { throw_nogpu(); }
 void cv::gpu::StereoBeliefPropagation_GPU::operator()(const GpuMat&, const GpuMat&, GpuMat&, const CudaStream&) { throw_nogpu(); }
...
...
...
@@ -58,37 +58,52 @@ bool cv::gpu::StereoBeliefPropagation_GPU::checkIfGpuCallReasonable() { throw_no
 #else /* !defined (HAVE_CUDA) */

-static const float DEFAULT_DISC_COST   = 1.7f;
-static const float DEFAULT_DATA_COST   = 10.0f;
-static const float DEFAULT_LAMBDA_COST = 0.07f;
-
-typedef DevMem2D_<float> DevMem2Df;
-typedef DevMem2D_<int>   DevMem2Di;
+const float DEFAULT_MAX_DATA_TERM    = 10.0f;
+const float DEFAULT_DATA_WEIGHT      = 0.07f;
+const float DEFAULT_MAX_DISC_TERM    = 1.7f;
+const float DEFAULT_DISC_SINGLE_JUMP = 1.0f;

 namespace cv { namespace gpu { namespace impl {
-    extern "C" void load_constants(int ndisp, float disc_cost, float data_cost, float lambda);
-    extern "C" void comp_data_caller(const DevMem2D& l, const DevMem2D& r, DevMem2Df mdata, const cudaStream_t& stream);
-    extern "C" void data_down_kernel_caller(int dst_cols, int dst_rows, int src_rows, const DevMem2Df& src, DevMem2Df dst, const cudaStream_t& stream);
-    extern "C" void level_up(int dst_idx, int dst_cols, int dst_rows, int src_rows, DevMem2Df* mu, DevMem2Df* md, DevMem2Df* ml, DevMem2Df* mr, const cudaStream_t& stream);
-    extern "C" void call_all_iterations(int cols, int rows, int iters, DevMem2Df& u, DevMem2Df& d, DevMem2Df& l, DevMem2Df& r, const DevMem2Df& data, const cudaStream_t& stream);
-    extern "C" void output_caller(const DevMem2Df& u, const DevMem2Df& d, const DevMem2Df& l, const DevMem2Df& r, const DevMem2Df& data, DevMem2Di disp, const cudaStream_t& stream);
+    void load_constants(int ndisp, float max_data_term, float data_weight, float max_disc_term, float disc_single_jump);
+    void comp_data(int msgType, const DevMem2D& l, const DevMem2D& r, DevMem2D mdata, const cudaStream_t& stream);
+    void data_step_down(int dst_cols, int dst_rows, int src_rows, int msgType, const DevMem2D& src, DevMem2D dst, const cudaStream_t& stream);
+    void level_up_messages(int dst_idx, int dst_cols, int dst_rows, int src_rows, int msgType, DevMem2D* mus, DevMem2D* mds, DevMem2D* mls, DevMem2D* mrs, const cudaStream_t& stream);
+    void calc_all_iterations(int cols, int rows, int iters, int msgType, DevMem2D& u, DevMem2D& d, DevMem2D& l, DevMem2D& r, const DevMem2D& data, const cudaStream_t& stream);
+    void output(int msgType, const DevMem2D& u, const DevMem2D& d, const DevMem2D& l, const DevMem2D& r, const DevMem2D& data, DevMem2D disp, const cudaStream_t& stream);
 }}}
-cv::gpu::StereoBeliefPropagation_GPU::StereoBeliefPropagation_GPU(int ndisp_, int iters_, int levels_)
-    : ndisp(ndisp_), iters(iters_), levels(levels_),
-      disc_cost(DEFAULT_DISC_COST), data_cost(DEFAULT_DATA_COST), lambda(DEFAULT_LAMBDA_COST), datas(levels_)
+cv::gpu::StereoBeliefPropagation_GPU::StereoBeliefPropagation_GPU(int ndisp_, int iters_, int levels_, int msg_type_, float msg_scale_)
+    : ndisp(ndisp_), iters(iters_), levels(levels_),
+      max_data_term(DEFAULT_MAX_DATA_TERM), data_weight(DEFAULT_DATA_WEIGHT),
+      max_disc_term(DEFAULT_MAX_DISC_TERM), disc_single_jump(DEFAULT_DISC_SINGLE_JUMP),
+      msg_type(msg_type_), msg_scale(msg_scale_), datas(levels_)
 {
     CV_Assert(0 < ndisp && 0 < iters && 0 < levels);
 }

-cv::gpu::StereoBeliefPropagation_GPU::StereoBeliefPropagation_GPU(int ndisp_, int iters_, int levels_, float disc_cost_, float data_cost_, float lambda_)
-    : ndisp(ndisp_), iters(iters_), levels(levels_),
-      disc_cost(disc_cost_), data_cost(data_cost_), lambda(lambda_), datas(levels_)
+cv::gpu::StereoBeliefPropagation_GPU::StereoBeliefPropagation_GPU(int ndisp_, int iters_, int levels_, float max_data_term_, float data_weight_, float max_disc_term_, float disc_single_jump_, int msg_type_, float msg_scale_)
+    : ndisp(ndisp_), iters(iters_), levels(levels_),
+      max_data_term(max_data_term_), data_weight(data_weight_),
+      max_disc_term(max_disc_term_), disc_single_jump(disc_single_jump_),
+      msg_type(msg_type_), msg_scale(msg_scale_), datas(levels_)
 {
-    CV_Assert(0 < ndisp);
-    CV_Assert(ndisp % 8 == 0);
+    CV_Assert(0 < ndisp && 0 < iters && 0 < levels);
 }

+static bool checkMsgOverflow(int levels, float max_data_term, float data_weight, float max_disc_term, float msg_scale)
+{
+    float maxV = ceil(max_disc_term * msg_scale);
+    float maxD = ceil(max_data_term * data_weight * msg_scale);
+
+    float maxMsg = maxV + (maxD * pow(4.0f, (float)levels));
+    maxMsg = maxV + (maxD * pow(4.0f, (float)levels)) + 3 * maxMsg;
+
+    return (maxMsg > numeric_limits<short>::max());
+}
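To see what the bound is doing, plug in the defaults (max_data_term = 10, data_weight = 0.07, max_disc_term = 1.7, levels = 5) with a candidate scale of 100. The following is a standalone restatement of the check, not the committed code:

    #include <cmath>
    #include <cstdio>
    #include <limits>

    int main()
    {
        int   levels = 5;
        float max_data_term = 10.0f, data_weight = 0.07f, max_disc_term = 1.7f;
        float msg_scale = 100.0f;

        float maxV = std::ceil(max_disc_term * msg_scale);               // 170
        float maxD = std::ceil(max_data_term * data_weight * msg_scale); // 70

        float maxMsg = maxV + maxD * std::pow(4.0f, (float)levels);      // 170 + 70*1024 = 71850
        maxMsg = maxV + maxD * std::pow(4.0f, (float)levels) + 3 * maxMsg; // 4 * 71850 = 287400

        // 287400 > 32767, so scale 100 overflows short; with these defaults the
        // same check also rejects 64, 32 and 16, and the AUTO ladder settles at 10.
        std::printf("%g overflows short: %d\n", maxMsg,
                    maxMsg > std::numeric_limits<short>::max());
        return 0;
    }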
-static void stereo_bp_gpu_operator(int ndisp, int iters, int levels, float disc_cost, float data_cost, float lambda,
+static void stereo_bp_gpu_operator(int ndisp, int iters, int levels,
+                                   float max_data_term, float data_weight, float max_disc_term, float disc_single_jump,
+                                   int msg_type, float& msg_scale,
                                    GpuMat& u, GpuMat& d, GpuMat& l, GpuMat& r,
                                    GpuMat& u2, GpuMat& d2, GpuMat& l2, GpuMat& r2,
                                    vector<GpuMat>& datas, GpuMat& out,
...
@@ -108,14 +123,73 @@ static void stereo_bp_gpu_operator(int ndisp, int iters, int levels, float disc_
     const int min_image_dim_size = 2;
     CV_Assert(min(lowest_cols, lowest_rows) > min_image_dim_size);

-    u.create(rows * ndisp, cols, CV_32F);
-    d.create(rows * ndisp, cols, CV_32F);
-    l.create(rows * ndisp, cols, CV_32F);
-    r.create(rows * ndisp, cols, CV_32F);
+    switch (msg_type)
+    {
+    case StereoBeliefPropagation_GPU::MSG_TYPE_AUTO:
+        if (!checkMsgOverflow(levels, max_data_term, data_weight, max_disc_term, 100.0f))
+        {
+            msg_type  = CV_16S;
+            msg_scale = 100.0f;
+        }
+        else if (!checkMsgOverflow(levels, max_data_term, data_weight, max_disc_term, 64.0f))
+        {
+            msg_type  = CV_16S;
+            msg_scale = 64.0f;
+        }
+        else if (!checkMsgOverflow(levels, max_data_term, data_weight, max_disc_term, 32.0f))
+        {
+            msg_type  = CV_16S;
+            msg_scale = 32.0f;
+        }
+        else if (!checkMsgOverflow(levels, max_data_term, data_weight, max_disc_term, 16.0f))
+        {
+            msg_type  = CV_16S;
+            msg_scale = 16.0f;
+        }
+        else if (!checkMsgOverflow(levels, max_data_term, data_weight, max_disc_term, 10.0f))
+        {
+            msg_type  = CV_16S;
+            msg_scale = 10.0f;
+        }
+        else
+        {
+            msg_type  = CV_32F;
+            msg_scale = 1.0f;
+        }
+        break;
+    case StereoBeliefPropagation_GPU::MSG_TYPE_FLOAT:
+        msg_type  = CV_32F;
+        msg_scale = 1.0f;
+        break;
+    case StereoBeliefPropagation_GPU::MSG_TYPE_SHORT_SCALE_AUTO:
+        msg_type = CV_16S;
+        if (!checkMsgOverflow(levels, max_data_term, data_weight, max_disc_term, 100.0f))
+            msg_scale = 100.0f;
+        else if (!checkMsgOverflow(levels, max_data_term, data_weight, max_disc_term, 64.0f))
+            msg_scale = 64.0f;
+        else if (!checkMsgOverflow(levels, max_data_term, data_weight, max_disc_term, 32.0f))
+            msg_scale = 32.0f;
+        else if (!checkMsgOverflow(levels, max_data_term, data_weight, max_disc_term, 16.0f))
+            msg_scale = 16.0f;
+        else
+            msg_scale = 10.0f;
+        break;
+    case StereoBeliefPropagation_GPU::MSG_TYPE_SHORT_SCALE_MANUAL:
+        msg_type = CV_16S;
+        break;
+    default:
+        cv::gpu::error("Unsupported message type", __FILE__, __LINE__);
+    }
+
+    u.create(rows * ndisp, cols, msg_type);
+    d.create(rows * ndisp, cols, msg_type);
+    l.create(rows * ndisp, cols, msg_type);
+    r.create(rows * ndisp, cols, msg_type);

     if (levels & 1)
     {
         //can clear less area
         u = zero;
         d = zero;
         l = zero;
         r = zero;
...
...
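The MSG_TYPE_AUTO branch above is just a descending ladder over candidate scales. The same selection can be written as a loop; this is an equivalent sketch, not the committed form:

    // Try the largest scale first; fall back to float messages when even the
    // smallest short scale would overflow. Mirrors the switch above.
    static void pickMsgType(int levels, float max_data_term, float data_weight,
                            float max_disc_term, int& msg_type, float& msg_scale)
    {
        const float scales[] = { 100.0f, 64.0f, 32.0f, 16.0f, 10.0f };
        for (int i = 0; i < 5; ++i)
        {
            if (!checkMsgOverflow(levels, max_data_term, data_weight, max_disc_term, scales[i]))
            {
                msg_type  = CV_16S;
                msg_scale = scales[i];
                return;
            }
        }
        msg_type  = CV_32F;
        msg_scale = 1.0f;
    }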
@@ -126,10 +200,10 @@ static void stereo_bp_gpu_operator(int ndisp, int iters, int levels, float disc_
     int less_rows = (rows + 1) / 2;
     int less_cols = (cols + 1) / 2;

-    u2.create(less_rows * ndisp, less_cols, CV_32F);
-    d2.create(less_rows * ndisp, less_cols, CV_32F);
-    l2.create(less_rows * ndisp, less_cols, CV_32F);
-    r2.create(less_rows * ndisp, less_cols, CV_32F);
+    u2.create(less_rows * ndisp, less_cols, msg_type);
+    d2.create(less_rows * ndisp, less_cols, msg_type);
+    l2.create(less_rows * ndisp, less_cols, msg_type);
+    r2.create(less_rows * ndisp, less_cols, msg_type);

     if ((levels & 1) == 0)
     {
...
...
@@ -140,72 +214,64 @@ static void stereo_bp_gpu_operator(int ndisp, int iters, int levels, float disc_
         }
     }

-    impl::load_constants(ndisp, disc_cost, data_cost, lambda);
+    impl::load_constants(ndisp, max_data_term, msg_scale * data_weight, msg_scale * max_disc_term, msg_scale * disc_single_jump);

     datas.resize(levels);

-    AutoBuffer<int> cols_all_buf(levels);
-    AutoBuffer<int> rows_all_buf(levels);
-    AutoBuffer<int> iters_all_buf(levels);
-
-    int* cols_all  = cols_all_buf;
-    int* rows_all  = rows_all_buf;
-    int* iters_all = iters_all_buf;
+    AutoBuffer<int> buf(levels << 1);
+
+    int* cols_all = buf;
+    int* rows_all = cols_all + levels;

     cols_all[0] = cols;
     rows_all[0] = rows;
-    iters_all[0] = iters;

-    datas[0].create(rows * ndisp, cols, CV_32F);
-    //datas[0] = Scalar(data_cost); //DOTO did in kernel, but not sure if correct
+    datas[0].create(rows * ndisp, cols, msg_type);

-    impl::comp_data_caller(left, right, datas.front(), stream);
+    impl::comp_data(msg_type, left, right, datas.front(), stream);

     for (int i = 1; i < levels; i++)
     {
         cols_all[i] = (cols_all[i-1] + 1) / 2;
         rows_all[i] = (rows_all[i-1] + 1) / 2;

-        // this is difference from Felzenszwalb algorithm
-        // we reduce iters num for each next level
-        iters_all[i] = max(2 * iters_all[i-1] / 3, 1);
-
-        datas[i].create(rows_all[i] * ndisp, cols_all[i], CV_32F);
-
-        impl::data_down_kernel_caller(cols_all[i], rows_all[i], rows_all[i-1], datas[i-1], datas[i], stream);
+        datas[i].create(rows_all[i] * ndisp, cols_all[i], msg_type);
+
+        impl::data_step_down(cols_all[i], rows_all[i], rows_all[i-1], msg_type, datas[i-1], datas[i], stream);
     }

-    DevMem2D_<float> mus[] = {u, u2};
-    DevMem2D_<float> mds[] = {d, d2};
-    DevMem2D_<float> mrs[] = {r, r2};
-    DevMem2D_<float> mls[] = {l, l2};
+    DevMem2D mus[] = {u, u2};
+    DevMem2D mds[] = {d, d2};
+    DevMem2D mrs[] = {r, r2};
+    DevMem2D mls[] = {l, l2};

     int mem_idx = (levels & 1) ? 0 : 1;

     for (int i = levels - 1; i >= 0; i--)
     {
         // for lower level we have already computed messages by setting to zero
         if (i != levels - 1)
-            impl::level_up(mem_idx, cols_all[i], rows_all[i], rows_all[i+1], mus, mds, mls, mrs, stream);
+            impl::level_up_messages(mem_idx, cols_all[i], rows_all[i], rows_all[i+1], msg_type, mus, mds, mls, mrs, stream);

-        impl::call_all_iterations(cols_all[i], rows_all[i], iters_all[i], mus[mem_idx], mds[mem_idx], mls[mem_idx], mrs[mem_idx], datas[i], stream);
+        impl::calc_all_iterations(cols_all[i], rows_all[i], iters, msg_type, mus[mem_idx], mds[mem_idx], mls[mem_idx], mrs[mem_idx], datas[i], stream);

         mem_idx = (mem_idx + 1) & 1;
     }

     if (disp.empty())
-        disp.create(rows, cols, CV_32S);
+        disp.create(rows, cols, CV_16S);

-    if (disp.type() == CV_32S)
+    if (disp.type() == CV_16S)
     {
         disp = zero;
-        impl::output_caller(u, d, l, r, datas.front(), disp, stream);
+        impl::output(msg_type, u, d, l, r, datas.front(), disp, stream);
     }
     else
     {
-        out.create(rows, cols, CV_32S);
+        out.create(rows, cols, CV_16S);
         out = zero;
-        impl::output_caller(u, d, l, r, datas.front(), out, stream);
+        impl::output(msg_type, u, d, l, r, datas.front(), out, stream);

         out.convertTo(disp, disp.type());
     }
...
...
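Note how the load_constants call above bakes msg_scale into data_weight, max_disc_term and disc_single_jump before uploading them: the kernels then work entirely in scaled units. This is sound because min and addition commute with multiplication by a positive scale, so the argmin disparity that the output stage extracts is unchanged; only the rounding into short differs. A tiny illustrative check (not committed code):

    #include <cassert>
    #include <algorithm>

    int main()
    {
        // For s > 0, scaling costs scales every min-sum message update uniformly,
        // so the winning disparity is preserved under scaling.
        float a = 1.5f, b = 2.0f, s = 100.0f;
        assert(s * std::min(a, b) == std::min(s * a, s * b));
        return 0;
    }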
@@ -213,12 +279,12 @@ static void stereo_bp_gpu_operator(int ndisp, int iters, int levels, float disc_
 void cv::gpu::StereoBeliefPropagation_GPU::operator()(const GpuMat& left, const GpuMat& right, GpuMat& disp)
 {
-    ::stereo_bp_gpu_operator(ndisp, iters, levels, disc_cost, data_cost, lambda, u, d, l, r, u2, d2, l2, r2, datas, out, left, right, disp, 0);
+    ::stereo_bp_gpu_operator(ndisp, iters, levels, max_data_term, data_weight, max_disc_term, disc_single_jump, msg_type, msg_scale, u, d, l, r, u2, d2, l2, r2, datas, out, left, right, disp, 0);
 }

 void cv::gpu::StereoBeliefPropagation_GPU::operator()(const GpuMat& left, const GpuMat& right, GpuMat& disp, const CudaStream& stream)
 {
-    ::stereo_bp_gpu_operator(ndisp, iters, levels, disc_cost, data_cost, lambda, u, d, l, r, u2, d2, l2, r2, datas, out, left, right, disp, StreamAccessor::getStream(stream));
+    ::stereo_bp_gpu_operator(ndisp, iters, levels, max_data_term, data_weight, max_disc_term, disc_single_jump, msg_type, msg_scale, u, d, l, r, u2, d2, l2, r2, datas, out, left, right, disp, StreamAccessor::getStream(stream));
 }

 bool cv::gpu::StereoBeliefPropagation_GPU::checkIfGpuCallReasonable()
...
...
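The second overload threads a user stream through every kernel launch (via StreamAccessor::getStream) and skips the host synchronization that the default path performs. A hedged usage sketch, reusing the GpuMats from the earlier sketch; the CudaStream method name is assumed from the gpu module API of this period:

    cv::gpu::StereoBeliefPropagation_GPU bp;   // defaults, MSG_TYPE_AUTO
    cv::gpu::CudaStream stream;

    bp(d_left, d_right, d_disp, stream);       // enqueued, returns without blocking
    // ... overlap other host or device work here ...
    stream.waitForCompletion();                // results valid only after this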
modules/gpu/src/cuda/beliefpropagation.cu
@@ -41,43 +41,57 @@
//M*/
#include "opencv2/gpu/devmem2d.hpp"
#include "saturate_cast.hpp"
#include "safe_call.hpp"
using namespace cv::gpu;
static inline int divUp(int a, int b) { return (a % b == 0) ? a/b : a/b + 1; }
#ifndef FLT_MAX
#define FLT_MAX 3.402823466e+38F
#endif
typedef unsigned char uchar;
///////////////////////////////////////////////////////////////
/////////////////////// load constants ////////////////////////
///////////////////////////////////////////////////////////////
 namespace beliefpropagation_gpu
 {
     __constant__ int cndisp;

-    __constant__ float cdisc_cost;
-    __constant__ float cdata_cost;
-    __constant__ float clambda;
+    __constant__ float cmax_data_term;
+    __constant__ float cdata_weight;
+    __constant__ float cmax_disc_term;
+    __constant__ float cdisc_single_jump;
 };

+namespace cv { namespace gpu { namespace impl {
+    void load_constants(int ndisp, float max_data_term, float data_weight, float max_disc_term, float disc_single_jump)
+    {
+        cudaSafeCall( cudaMemcpyToSymbol(beliefpropagation_gpu::cndisp,            &ndisp,            sizeof(int  )) );
+        cudaSafeCall( cudaMemcpyToSymbol(beliefpropagation_gpu::cmax_data_term,    &max_data_term,    sizeof(float)) );
+        cudaSafeCall( cudaMemcpyToSymbol(beliefpropagation_gpu::cdata_weight,      &data_weight,      sizeof(float)) );
+        cudaSafeCall( cudaMemcpyToSymbol(beliefpropagation_gpu::cmax_disc_term,    &max_disc_term,    sizeof(float)) );
+        cudaSafeCall( cudaMemcpyToSymbol(beliefpropagation_gpu::cdisc_single_jump, &disc_single_jump, sizeof(float)) );
+    }
+}}}
///////////////////////////////////////////////////////////////
/////////////////////////// comp data //////////////////////////
///////////////////////////////////////////////////////////////
 namespace beliefpropagation_gpu
 {
-    __global__ void comp_data_kernel(uchar* l, uchar* r, size_t step, float* data, size_t data_step, int cols, int rows)
+    template <typename T>
+    __global__ void comp_data(uchar* l, uchar* r, size_t step, T* data, size_t data_step, int cols, int rows)
     {
         int x = blockIdx.x * blockDim.x + threadIdx.x;
         int y = blockIdx.y * blockDim.y + threadIdx.y;

-        if (y > 0 && y < rows - 1 && x > 0 && x < cols - 1)
+        if (y < rows && x < cols)
         {
             uchar* ls = l + y * step + x;
             uchar* rs = r + y * step + x;
-            float* ds = data + y * data_step + x;
+            T*     ds = data + y * data_step + x;

             size_t disp_step = data_step * rows;

             for (int disp = 0; disp < cndisp; disp++)
...
...
@@ -88,11 +102,11 @@ namespace beliefpropagation_gpu
                 int re = rs[-disp];
                 float val = abs(le - re);

-                ds[disp * disp_step] = clambda * fmin(val, cdata_cost);
+                ds[disp * disp_step] = saturate_cast<T>(fmin(cdata_weight * val, cdata_weight * cmax_data_term));
             }
             else
             {
-                ds[disp * disp_step] = cdata_cost;
+                ds[disp * disp_step] = saturate_cast<T>(cdata_weight * cmax_data_term);
             }
         }
     }
...
...
@@ -100,41 +114,52 @@ namespace beliefpropagation_gpu
}
namespace cv { namespace gpu { namespace impl {
extern "C" void load_constants(int ndisp, float disc_cost, float data_cost, float lambda)
{
cudaSafeCall( cudaMemcpyToSymbol(beliefpropagation_gpu::cndisp, &ndisp, sizeof(ndisp)) );
cudaSafeCall( cudaMemcpyToSymbol(beliefpropagation_gpu::cdisc_cost, &disc_cost, sizeof(disc_cost)) );
cudaSafeCall( cudaMemcpyToSymbol(beliefpropagation_gpu::cdata_cost, &data_cost, sizeof(data_cost)) );
cudaSafeCall( cudaMemcpyToSymbol(beliefpropagation_gpu::clambda, &lambda, sizeof(lambda)) );
}
extern "C" void comp_data_caller(const DevMem2D& l, const DevMem2D& r, DevMem2D_<float> mdata, const cudaStream_t& stream)
typedef void (*CompDataFunc)(const DevMem2D& l, const DevMem2D& r, DevMem2D mdata, const cudaStream_t& stream);
template<typename T>
void comp_data_(const DevMem2D& l, const DevMem2D& r, DevMem2D mdata, const cudaStream_t& stream)
{
dim3 threads(32, 8, 1);
dim3 grid(1, 1, 1);
grid.x = divUp(l.cols, threads.x);
grid.y = divUp(l.rows, threads.y);
beliefpropagation_gpu::comp_data<T><<<grid, threads, 0, stream>>>(l.ptr, r.ptr, l.step, (T*)mdata.ptr, mdata.step/sizeof(T), l.cols, l.rows);
if (stream == 0)
cudaSafeCall( cudaThreadSynchronize() );
}
if (stream == 0)
{
beliefpropagation_gpu::comp_data_kernel<<<grid, threads>>>(l.ptr, r.ptr, l.step, mdata.ptr, mdata.step/sizeof(float), l.cols, l.rows);
//cudaSafeCall( cudaThreadSynchronize() );
}
else
{
beliefpropagation_gpu::comp_data_kernel<<<grid, threads, 0, stream>>>(l.ptr, r.ptr, l.step, mdata.ptr, mdata.step/sizeof(float), l.cols, l.rows);
}
void comp_data(int msgType, const DevMem2D& l, const DevMem2D& r, DevMem2D mdata, const cudaStream_t& stream)
{
static CompDataFunc tab[8] =
{
0, // uchar
0, // schar
0, // ushort
comp_data_<short>, // short
0, // int
comp_data_<float>, // float
0, // double
0 // user type
};
CompDataFunc func = tab[msgType];
if (func == 0)
cv::gpu::error("Unsupported message type", __FILE__, __LINE__);
func(l, r, mdata, stream);
}
}}}
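The eight-entry table works because msgType holds an OpenCV depth code: per the core headers, CV_8U=0, CV_8S=1, CV_16U=2, CV_16S=3, CV_32S=4, CV_32F=5, CV_64F=6, with a user slot at 7, so only indices 3 and 5 are populated. A one-line illustration (static_assert is C++11 and used here only to state the assumption):

    #include <opencv2/core/core.hpp>

    // The dispatch tables in this file rely on these depth codes.
    static_assert(CV_16S == 3 && CV_32F == 5, "table layout relies on OpenCV depth codes");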
///////////////////////////////////////////////////////////////
///////////////////////// data step down ///////////////////////
///////////////////////////////////////////////////////////////
 namespace beliefpropagation_gpu
 {
-    __global__ void data_down_kernel(int dst_cols, int dst_rows, int src_rows, float* src, size_t src_step, float* dst, size_t dst_step)
+    template <typename T>
+    __global__ void data_step_down(int dst_cols, int dst_rows, int src_rows, const T* src, size_t src_step, T* dst, size_t dst_step)
     {
         int x = blockIdx.x * blockDim.x + threadIdx.x;
         int y = blockIdx.y * blockDim.y + threadIdx.y;
...
...
@@ -151,14 +176,17 @@ namespace beliefpropagation_gpu
             dst_reg += src[d * src_disp_step + src_step * (2*y+0) + (2*x+1)];
             dst_reg += src[d * src_disp_step + src_step * (2*y+1) + (2*x+1)];

-            dst[d * dst_disp_step + y * dst_step + x] = dst_reg;
+            dst[d * dst_disp_step + y * dst_step + x] = saturate_cast<T>(dst_reg);
         }
     }
 }
 namespace cv { namespace gpu { namespace impl {
-    extern "C" void data_down_kernel_caller(int dst_cols, int dst_rows, int src_rows, const DevMem2D_<float>& src, DevMem2D_<float> dst, const cudaStream_t& stream)
+    typedef void (*DataStepDownFunc)(int dst_cols, int dst_rows, int src_rows, const DevMem2D& src, DevMem2D dst, const cudaStream_t& stream);
+
+    template<typename T>
+    void data_step_down_(int dst_cols, int dst_rows, int src_rows, const DevMem2D& src, DevMem2D dst, const cudaStream_t& stream)
     {
         dim3 threads(32, 8, 1);
         dim3 grid(1, 1, 1);
...
...
@@ -166,26 +194,41 @@ namespace cv { namespace gpu { namespace impl {
         grid.x = divUp(dst_cols, threads.x);
         grid.y = divUp(dst_rows, threads.y);

-        if (stream == 0)
-        {
-            beliefpropagation_gpu::data_down_kernel<<<grid, threads>>>(dst_cols, dst_rows, src_rows, src.ptr, src.step/sizeof(float), dst.ptr, dst.step/sizeof(float));
-            //cudaSafeCall( cudaThreadSynchronize() );
-        }
-        else
-        {
-            beliefpropagation_gpu::data_down_kernel<<<grid, threads, 0, stream>>>(dst_cols, dst_rows, src_rows, src.ptr, src.step/sizeof(float), dst.ptr, dst.step/sizeof(float));
-        }
+        beliefpropagation_gpu::data_step_down<T><<<grid, threads, 0, stream>>>(dst_cols, dst_rows, src_rows, (const T*)src.ptr, src.step/sizeof(T), (T*)dst.ptr, dst.step/sizeof(T));
+
+        if (stream == 0)
+            cudaSafeCall( cudaThreadSynchronize() );
+    }
+
+    void data_step_down(int dst_cols, int dst_rows, int src_rows, int msgType, const DevMem2D& src, DevMem2D dst, const cudaStream_t& stream)
+    {
+        static DataStepDownFunc tab[8] =
+        {
+            0,                      // uchar
+            0,                      // schar
+            0,                      // ushort
+            data_step_down_<short>, // short
+            0,                      // int
+            data_step_down_<float>, // float
+            0,                      // double
+            0                       // user type
+        };
+        DataStepDownFunc func = tab[msgType];
+        if (func == 0)
+            cv::gpu::error("Unsupported message type", __FILE__, __LINE__);
+        func(dst_cols, dst_rows, src_rows, src, dst, stream);
+    }
}}}
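Per disparity layer, the kernel above performs a plain 2x2 box sum from the finer grid, saturated into T on store. A CPU restatement for reference (illustrative, not the committed code; it assumes the source dimensions are exactly twice the destination's for brevity):

    // dst(y, x, d) = sum of the four fine-grid children of (y, x) in layer d.
    template <typename T>
    void dataStepDownRef(int dst_cols, int dst_rows, const float* src, int src_cols,
                         T* dst, int ndisp)
    {
        for (int d = 0; d < ndisp; ++d)
            for (int y = 0; y < dst_rows; ++y)
                for (int x = 0; x < dst_cols; ++x)
                {
                    const float* s = src + (d * 2 * dst_rows + 2 * y) * src_cols + 2 * x;
                    float sum = s[0] + s[1] + s[src_cols] + s[src_cols + 1];
                    dst[(d * dst_rows + y) * dst_cols + x] = (T)sum; // saturate_cast in the kernel
                }
    }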
///////////////////////////////////////////////////////////////
//////////////////// level up messages ////////////////////////
///////////////////////////////////////////////////////////////
 namespace beliefpropagation_gpu
 {
-    __global__ void level_up_kernel(int dst_cols, int dst_rows, int src_rows, float* src, size_t src_step, float* dst, size_t dst_step)
+    template <typename T>
+    __global__ void level_up_message(int dst_cols, int dst_rows, int src_rows, const T* src, size_t src_step, T* dst, size_t dst_step)
     {
         int x = blockIdx.x * blockDim.x + threadIdx.x;
         int y = blockIdx.y * blockDim.y + threadIdx.y;
...
...
@@ -195,8 +238,8 @@ namespace beliefpropagation_gpu
         const size_t dst_disp_step = dst_step * dst_rows;
         const size_t src_disp_step = src_step * src_rows;

-        float* dstr = dst + y   * dst_step + x;
-        float* srcr = src + y/2 * src_step + x/2;
+        T*       dstr = dst + y   * dst_step + x;
+        const T* srcr = src + y/2 * src_step + x/2;

         for (int d = 0; d < cndisp; ++d)
             dstr[d * dst_disp_step] = srcr[d * src_disp_step];
...
...
@@ -205,7 +248,10 @@ namespace beliefpropagation_gpu
 }

 namespace cv { namespace gpu { namespace impl {
-    extern "C" void level_up(int dst_idx, int dst_cols, int dst_rows, int src_rows, DevMem2D_<float>* mu, DevMem2D_<float>* md, DevMem2D_<float>* ml, DevMem2D_<float>* mr, const cudaStream_t& stream)
+    typedef void (*LevelUpMessagesFunc)(int dst_idx, int dst_cols, int dst_rows, int src_rows, DevMem2D* mus, DevMem2D* mds, DevMem2D* mls, DevMem2D* mrs, const cudaStream_t& stream);
+
+    template<typename T>
+    void level_up_messages_(int dst_idx, int dst_cols, int dst_rows, int src_rows, DevMem2D* mus, DevMem2D* mds, DevMem2D* mls, DevMem2D* mrs, const cudaStream_t& stream)
     {
         dim3 threads(32, 8, 1);
         dim3 grid(1, 1, 1);
...
...
@@ -215,74 +261,94 @@ namespace cv { namespace gpu { namespace impl {
         int src_idx = (dst_idx + 1) & 1;

-        if (stream == 0)
-        {
-            beliefpropagation_gpu::level_up_kernel<<<grid, threads>>>(dst_cols, dst_rows, src_rows, mu[src_idx].ptr, mu[src_idx].step/sizeof(float), mu[dst_idx].ptr, mu[dst_idx].step/sizeof(float));
-            beliefpropagation_gpu::level_up_kernel<<<grid, threads>>>(dst_cols, dst_rows, src_rows, md[src_idx].ptr, md[src_idx].step/sizeof(float), md[dst_idx].ptr, md[dst_idx].step/sizeof(float));
-            beliefpropagation_gpu::level_up_kernel<<<grid, threads>>>(dst_cols, dst_rows, src_rows, ml[src_idx].ptr, ml[src_idx].step/sizeof(float), ml[dst_idx].ptr, ml[dst_idx].step/sizeof(float));
-            beliefpropagation_gpu::level_up_kernel<<<grid, threads>>>(dst_cols, dst_rows, src_rows, mr[src_idx].ptr, mr[src_idx].step/sizeof(float), mr[dst_idx].ptr, mr[dst_idx].step/sizeof(float));
-            //cudaSafeCall( cudaThreadSynchronize() );
-        }
-        else
-        {
-            beliefpropagation_gpu::level_up_kernel<<<grid, threads, 0, stream>>>(dst_cols, dst_rows, src_rows, mu[src_idx].ptr, mu[src_idx].step/sizeof(float), mu[dst_idx].ptr, mu[dst_idx].step/sizeof(float));
-            beliefpropagation_gpu::level_up_kernel<<<grid, threads, 0, stream>>>(dst_cols, dst_rows, src_rows, md[src_idx].ptr, md[src_idx].step/sizeof(float), md[dst_idx].ptr, md[dst_idx].step/sizeof(float));
-            beliefpropagation_gpu::level_up_kernel<<<grid, threads, 0, stream>>>(dst_cols, dst_rows, src_rows, ml[src_idx].ptr, ml[src_idx].step/sizeof(float), ml[dst_idx].ptr, ml[dst_idx].step/sizeof(float));
-            beliefpropagation_gpu::level_up_kernel<<<grid, threads, 0, stream>>>(dst_cols, dst_rows, src_rows, mr[src_idx].ptr, mr[src_idx].step/sizeof(float), mr[dst_idx].ptr, mr[dst_idx].step/sizeof(float));
-        }
+        beliefpropagation_gpu::level_up_message<T><<<grid, threads, 0, stream>>>(dst_cols, dst_rows, src_rows, (const T*)mus[src_idx].ptr, mus[src_idx].step/sizeof(T), (T*)mus[dst_idx].ptr, mus[dst_idx].step/sizeof(T));
+        beliefpropagation_gpu::level_up_message<T><<<grid, threads, 0, stream>>>(dst_cols, dst_rows, src_rows, (const T*)mds[src_idx].ptr, mds[src_idx].step/sizeof(T), (T*)mds[dst_idx].ptr, mds[dst_idx].step/sizeof(T));
+        beliefpropagation_gpu::level_up_message<T><<<grid, threads, 0, stream>>>(dst_cols, dst_rows, src_rows, (const T*)mls[src_idx].ptr, mls[src_idx].step/sizeof(T), (T*)mls[dst_idx].ptr, mls[dst_idx].step/sizeof(T));
+        beliefpropagation_gpu::level_up_message<T><<<grid, threads, 0, stream>>>(dst_cols, dst_rows, src_rows, (const T*)mrs[src_idx].ptr, mrs[src_idx].step/sizeof(T), (T*)mrs[dst_idx].ptr, mrs[dst_idx].step/sizeof(T));

+        if (stream == 0)
+            cudaSafeCall( cudaThreadSynchronize() );
+    }
+
+    void level_up_messages(int dst_idx, int dst_cols, int dst_rows, int src_rows, int msgType, DevMem2D* mus, DevMem2D* mds, DevMem2D* mls, DevMem2D* mrs, const cudaStream_t& stream)
+    {
+        static LevelUpMessagesFunc tab[8] =
+        {
+            0,                         // uchar
+            0,                         // schar
+            0,                         // ushort
+            level_up_messages_<short>, // short
+            0,                         // int
+            level_up_messages_<float>, // float
+            0,                         // double
+            0                          // user type
+        };
+        LevelUpMessagesFunc func = tab[msgType];
+        if (func == 0)
+            cv::gpu::error("Unsupported message type", __FILE__, __LINE__);
+        func(dst_idx, dst_cols, dst_rows, src_rows, mus, mds, mls, mrs, stream);
+    }
}}}
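level_up_message is the inverse of data_step_down: srcr = src + y/2 * src_step + x/2 means each coarse message is replicated to its four children, i.e. nearest-neighbor upsampling per disparity layer, which seeds the finer level's iterations. A one-layer CPU restatement (illustrative only):

    template <typename T>
    void levelUpRef(const T* coarse, int ccols, T* fine, int fcols, int frows)
    {
        // Four fine pixels share one coarse parent message.
        for (int y = 0; y < frows; ++y)
            for (int x = 0; x < fcols; ++x)
                fine[y * fcols + x] = coarse[(y / 2) * ccols + x / 2];
    }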
///////////////////////////////////////////////////////////////
/////////////////// calc all iterations ///////////////////////
///////////////////////////////////////////////////////////////
 namespace beliefpropagation_gpu
 {
-    __device__ void calc_min_linear_penalty(float* dst, size_t step)
+    template <typename T>
+    __device__ void calc_min_linear_penalty(T* dst, size_t step)
     {
         float prev = dst[0];
         float cur;
         for (int disp = 1; disp < cndisp; ++disp)
         {
-            prev += 1.0f;
+            prev += cdisc_single_jump;
             cur = dst[step * disp];
             if (prev < cur)
             {
                 cur = prev;
-                dst[step * disp] = prev = cur;
+                dst[step * disp] = saturate_cast<T>(prev);
             }
             prev = cur;
         }

         prev = dst[(cndisp - 1) * step];
         for (int disp = cndisp - 2; disp >= 0; disp--)
         {
-            prev += 1.0f;
+            prev += cdisc_single_jump;
             cur = dst[step * disp];
             if (prev < cur)
             {
                 cur = prev;
-                dst[step * disp] = prev = cur;
+                dst[step * disp] = saturate_cast<T>(prev);
             }
             prev = cur;
         }
     }
-    __device__ void message(float* msg1, float* msg2, float* msg3, float* data, float* dst, size_t msg_disp_step, size_t data_disp_step)
+    template <typename T>
+    __device__ void message(const T* msg1, const T* msg2, const T* msg3, const T* data, T* dst, size_t msg_disp_step, size_t data_disp_step)
     {
         float minimum = FLT_MAX;

         for(int i = 0; i < cndisp; ++i)
         {
-            float dst_reg = msg1[msg_disp_step * i] + msg2[msg_disp_step * i] + msg3[msg_disp_step * i] + data[data_disp_step * i];
+            float dst_reg = msg1[msg_disp_step * i];
+            dst_reg += msg2[msg_disp_step * i];
+            dst_reg += msg3[msg_disp_step * i];
+            dst_reg += data[data_disp_step * i];

             if (dst_reg < minimum)
                 minimum = dst_reg;

-            dst[msg_disp_step * i] = dst_reg;
+            dst[msg_disp_step * i] = saturate_cast<T>(dst_reg);
         }

         calc_min_linear_penalty(dst, msg_disp_step);

-        minimum += cdisc_cost;
+        minimum += cmax_disc_term;

         float sum = 0;
         for(int i = 0; i < cndisp; ++i)
...
...
@@ -290,7 +356,8 @@ namespace beliefpropagation_gpu
             float dst_reg = dst[msg_disp_step * i];
             if (dst_reg > minimum)
             {
-                dst[msg_disp_step * i] = dst_reg = minimum;
+                dst_reg = minimum;
+                dst[msg_disp_step * i] = saturate_cast<T>(minimum);
             }
             sum += dst_reg;
         }
...
...
@@ -300,18 +367,20 @@ namespace beliefpropagation_gpu
             dst[msg_disp_step * i] -= sum;
     }

-    __global__ void one_iteration(int t, float* u, float* d, float* l, float* r, size_t msg_step, float* data, size_t data_step, int cols, int rows)
+    template <typename T>
+    __global__ void one_iteration(int t, T* u, T* d, T* l, T* r, size_t msg_step, const T* data, size_t data_step, int cols, int rows)
     {
         int y = blockIdx.y * blockDim.y + threadIdx.y;
         int x = ((blockIdx.x * blockDim.x + threadIdx.x) << 1) + ((y + t) & 1);

         if ( (y > 0) && (y < rows - 1) && (x > 0) && (x < cols - 1))
         {
-            float* us = u + y * msg_step + x;
-            float* ds = d + y * msg_step + x;
-            float* ls = l + y * msg_step + x;
-            float* rs = r + y * msg_step + x;
-            float* dt = data + y * data_step + x;
+            T* us = u + y * msg_step + x;
+            T* ds = d + y * msg_step + x;
+            T* ls = l + y * msg_step + x;
+            T* rs = r + y * msg_step + x;
+            const T* dt = data + y * data_step + x;

             size_t msg_disp_step = msg_step * rows;
             size_t data_disp_step = data_step * rows;
...
...
@@ -324,7 +393,10 @@ namespace beliefpropagation_gpu
 }

 namespace cv { namespace gpu { namespace impl {
-    extern "C" void call_all_iterations(int cols, int rows, int iters, DevMem2D_<float>& u, DevMem2D_<float>& d, DevMem2D_<float>& l, DevMem2D_<float>& r, const DevMem2D_<float>& data, const cudaStream_t& stream)
+    typedef void (*CalcAllIterationFunc)(int cols, int rows, int iters, DevMem2D& u, DevMem2D& d, DevMem2D& l, DevMem2D& r, const DevMem2D& data, const cudaStream_t& stream);
+
+    template<typename T>
+    void calc_all_iterations_(int cols, int rows, int iters, DevMem2D& u, DevMem2D& d, DevMem2D& l, DevMem2D& r, const DevMem2D& data, const cudaStream_t& stream)
     {
         dim3 threads(32, 8, 1);
         dim3 grid(1, 1, 1);
...
...
@@ -332,39 +404,55 @@ namespace cv { namespace gpu { namespace impl {
         grid.x = divUp(cols, threads.x << 1);
         grid.y = divUp(rows, threads.y);

-        if (stream == 0)
-        {
-            for(int t = 0; t < iters; ++t)
-                beliefpropagation_gpu::one_iteration<<<grid, threads>>>(t, u.ptr, d.ptr, l.ptr, r.ptr, u.step/sizeof(float), data.ptr, data.step/sizeof(float), cols, rows);
-            //cudaSafeCall( cudaThreadSynchronize() );
-        }
-        else
-        {
-            for(int t = 0; t < iters; ++t)
-                beliefpropagation_gpu::one_iteration<<<grid, threads, 0, stream>>>(t, u.ptr, d.ptr, l.ptr, r.ptr, u.step/sizeof(float), data.ptr, data.step/sizeof(float), cols, rows);
-        }
+        for(int t = 0; t < iters; ++t)
+        {
+            beliefpropagation_gpu::one_iteration<T><<<grid, threads, 0, stream>>>(t, (T*)u.ptr, (T*)d.ptr, (T*)l.ptr, (T*)r.ptr, u.step/sizeof(T), (const T*)data.ptr, data.step/sizeof(T), cols, rows);
+
+            if (stream == 0)
+                cudaSafeCall( cudaThreadSynchronize() );
+        }
+    }
+
+    void calc_all_iterations(int cols, int rows, int iters, int msgType, DevMem2D& u, DevMem2D& d, DevMem2D& l, DevMem2D& r, const DevMem2D& data, const cudaStream_t& stream)
+    {
+        static CalcAllIterationFunc tab[8] =
+        {
+            0,                           // uchar
+            0,                           // schar
+            0,                           // ushort
+            calc_all_iterations_<short>, // short
+            0,                           // int
+            calc_all_iterations_<float>, // float
+            0,                           // double
+            0                            // user type
+        };
+        CalcAllIterationFunc func = tab[msgType];
+        if (func == 0)
+            cv::gpu::error("Unsupported message type", __FILE__, __LINE__);
+        func(cols, rows, iters, u, d, l, r, data, stream);
+    }
}}}
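Note that one_iteration updates only half the pixels per launch: x is doubled and offset by ((y + t) & 1), so consecutive iterations alternate between the two colors of a checkerboard (red-black ordering), which lets all four message directions update in place. A host-side restatement of the indexing (illustrative):

    // Thread (tx, ty) at iteration t handles pixel (2*tx + ((ty + t) & 1), ty).
    int pixel_x(int tx, int ty, int t) { return (tx << 1) + ((ty + t) & 1); }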
///////////////////////////////////////////////////////////////
//////////////////////////// output ////////////////////////////
///////////////////////////////////////////////////////////////
 namespace beliefpropagation_gpu
 {
-    __global__ void output(int cols, int rows, float* u, float* d, float* l, float* r, float* data, size_t step, int* disp, size_t res_step)
+    template <typename T>
+    __global__ void output(int cols, int rows, const T* u, const T* d, const T* l, const T* r, const T* data, size_t step, short* disp, size_t res_step)
     {
         int x = blockIdx.x * blockDim.x + threadIdx.x;
         int y = blockIdx.y * blockDim.y + threadIdx.y;

         if (y > 0 && y < rows - 1 && x > 0 && x < cols - 1)
         {
-            float* us = u + (y + 1) * step + x;
-            float* ds = d + (y - 1) * step + x;
-            float* ls = l + y * step + (x + 1);
-            float* rs = r + y * step + (x - 1);
-            float* dt = data + y * step + x;
+            const T* us = u + (y + 1) * step + x;
+            const T* ds = d + (y - 1) * step + x;
+            const T* ls = l + y * step + (x + 1);
+            const T* rs = r + y * step + (x - 1);
+            const T* dt = data + y * step + x;

             size_t disp_step = rows * step;
...
...
@@ -372,7 +460,11 @@ namespace beliefpropagation_gpu
             float best_val = FLT_MAX;
             for (int d = 0; d < cndisp; ++d)
             {
-                float val = us[d * disp_step] + ds[d * disp_step] + ls[d * disp_step] + rs[d * disp_step] + dt[d * disp_step];
+                float val = us[d * disp_step];
+                val += ds[d * disp_step];
+                val += ls[d * disp_step];
+                val += rs[d * disp_step];
+                val += dt[d * disp_step];

                 if (val < best_val)
                 {
...
...
@@ -381,28 +473,46 @@ namespace beliefpropagation_gpu
                 }
             }

-            disp[res_step * y + x] = best;
+            disp[res_step * y + x] = saturate_cast<short>(best);
         }
     }
 }

 namespace cv { namespace gpu { namespace impl {
-    extern "C" void output_caller(const DevMem2D_<float>& u, const DevMem2D_<float>& d, const DevMem2D_<float>& l, const DevMem2D_<float>& r, const DevMem2D_<float>& data, DevMem2D_<int> disp, const cudaStream_t& stream)
+    typedef void (*OutputFunc)(const DevMem2D& u, const DevMem2D& d, const DevMem2D& l, const DevMem2D& r, const DevMem2D& data, DevMem2D disp, const cudaStream_t& stream);
+
+    template<typename T>
+    void output_(const DevMem2D& u, const DevMem2D& d, const DevMem2D& l, const DevMem2D& r, const DevMem2D& data, DevMem2D disp, const cudaStream_t& stream)
     {
         dim3 threads(32, 8, 1);
         dim3 grid(1, 1, 1);

         grid.x = divUp(disp.cols, threads.x);
         grid.y = divUp(disp.rows, threads.y);

-        if (stream == 0)
-        {
-            beliefpropagation_gpu::output<<<grid, threads>>>(disp.cols, disp.rows, u.ptr, d.ptr, l.ptr, r.ptr, data.ptr, u.step/sizeof(float), disp.ptr, disp.step/sizeof(int));
-            cudaSafeCall( cudaThreadSynchronize() );
-        }
-        else
-        {
-            beliefpropagation_gpu::output<<<grid, threads, 0, stream>>>(disp.cols, disp.rows, u.ptr, d.ptr, l.ptr, r.ptr, data.ptr, u.step/sizeof(float), disp.ptr, disp.step/sizeof(int));
-        }
+        beliefpropagation_gpu::output<T><<<grid, threads, 0, stream>>>(disp.cols, disp.rows, (const T*)u.ptr, (const T*)d.ptr, (const T*)l.ptr, (const T*)r.ptr, (const T*)data.ptr, u.step/sizeof(T), (short*)disp.ptr, disp.step/sizeof(short));
+
+        if (stream == 0)
+            cudaSafeCall( cudaThreadSynchronize() );
+    }
+
+    void output(int msgType, const DevMem2D& u, const DevMem2D& d, const DevMem2D& l, const DevMem2D& r, const DevMem2D& data, DevMem2D disp, const cudaStream_t& stream)
+    {
+        static OutputFunc tab[8] =
+        {
+            0,              // uchar
+            0,              // schar
+            0,              // ushort
+            output_<short>, // short
+            0,              // int
+            output_<float>, // float
+            0,              // double
+            0               // user type
+        };
+        OutputFunc func = tab[msgType];
+        if (func == 0)
+            cv::gpu::error("Unsupported message type", __FILE__, __LINE__);
+        func(u, d, l, r, data, disp, stream);
+    }
}}}
\ No newline at end of file
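The output kernel is plain winner-take-all: for each pixel it sums the four incoming messages plus the data term per disparity and writes the argmin, now saturated into a CV_16S disparity map. A CPU restatement for one pixel (illustrative, not the committed code):

    #include <cfloat>

    template <typename T>
    short bestDisparity(const T* us, const T* ds, const T* ls, const T* rs,
                        const T* dt, size_t disp_step, int ndisp)
    {
        int best = 0;
        float best_val = FLT_MAX;
        for (int d = 0; d < ndisp; ++d)
        {
            float val = (float)us[d * disp_step] + ds[d * disp_step]
                      + ls[d * disp_step] + rs[d * disp_step] + dt[d * disp_step];
            if (val < best_val) { best_val = val; best = d; }
        }
        return (short)best; // the kernel uses saturate_cast<short>
    }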
modules/gpu/src/cuda/saturate_cast.hpp
@@ -49,119 +49,123 @@ namespace cv
 {
     namespace gpu
     {
+        // To fix link error: this func already defined in other obj file
+        namespace
+        {
         (whitespace-only re-indentation of the definitions below is hidden; the substantive change in this hunk is the unnamed-namespace wrapper)

         template<typename _Tp> __device__ _Tp saturate_cast(uchar v)  { return _Tp(v); }
         template<typename _Tp> __device__ _Tp saturate_cast(schar v)  { return _Tp(v); }
         template<typename _Tp> __device__ _Tp saturate_cast(ushort v) { return _Tp(v); }
         template<typename _Tp> __device__ _Tp saturate_cast(short v)  { return _Tp(v); }
         template<typename _Tp> __device__ _Tp saturate_cast(uint v)   { return _Tp(v); }
         template<typename _Tp> __device__ _Tp saturate_cast(int v)    { return _Tp(v); }
         template<typename _Tp> __device__ _Tp saturate_cast(float v)  { return _Tp(v); }
         template<typename _Tp> __device__ _Tp saturate_cast(double v) { return _Tp(v); }

         template<> __device__ uchar saturate_cast<uchar>(schar v)  { return (uchar)max((int)v, 0); }
         template<> __device__ uchar saturate_cast<uchar>(ushort v) { return (uchar)min((uint)v, (uint)UCHAR_MAX); }
         template<> __device__ uchar saturate_cast<uchar>(int v)    { return (uchar)((uint)v <= UCHAR_MAX ? v : v > 0 ? UCHAR_MAX : 0); }
         template<> __device__ uchar saturate_cast<uchar>(uint v)   { return (uchar)min(v, (uint)UCHAR_MAX); }
         template<> __device__ uchar saturate_cast<uchar>(short v)  { return saturate_cast<uchar>((uint)v); }
         template<> __device__ uchar saturate_cast<uchar>(float v)  { int iv = __float2int_rn(v); return saturate_cast<uchar>(iv); }
         template<> __device__ uchar saturate_cast<uchar>(double v)
         {
         #if defined (__CUDA_ARCH__) && __CUDA_ARCH__ >= 130
             int iv = __double2int_rn(v);
             return saturate_cast<uchar>(iv);
         #else
             return saturate_cast<uchar>((float)v);
         #endif
         }

         template<> __device__ schar saturate_cast<schar>(uchar v)  { return (schar)min((int)v, SCHAR_MAX); }
         template<> __device__ schar saturate_cast<schar>(ushort v) { return (schar)min((uint)v, (uint)SCHAR_MAX); }
         template<> __device__ schar saturate_cast<schar>(int v)    { return (schar)((uint)(v - SCHAR_MIN) <= (uint)UCHAR_MAX ? v : v > 0 ? SCHAR_MAX : SCHAR_MIN); }
         template<> __device__ schar saturate_cast<schar>(short v)  { return saturate_cast<schar>((int)v); }
         template<> __device__ schar saturate_cast<schar>(uint v)   { return (schar)min(v, (uint)SCHAR_MAX); }
         template<> __device__ schar saturate_cast<schar>(float v)  { int iv = __float2int_rn(v); return saturate_cast<schar>(iv); }
         template<> __device__ schar saturate_cast<schar>(double v)
         {
         #if defined (__CUDA_ARCH__) && __CUDA_ARCH__ >= 130
             int iv = __double2int_rn(v);
             return saturate_cast<schar>(iv);
         #else
             return saturate_cast<schar>((float)v);
         #endif
         }

         template<> __device__ ushort saturate_cast<ushort>(schar v) { return (ushort)max((int)v, 0); }
         template<> __device__ ushort saturate_cast<ushort>(short v) { return (ushort)max((int)v, 0); }
         template<> __device__ ushort saturate_cast<ushort>(int v)   { return (ushort)((uint)v <= (uint)USHRT_MAX ? v : v > 0 ? USHRT_MAX : 0); }
         template<> __device__ ushort saturate_cast<ushort>(uint v)  { return (ushort)min(v, (uint)USHRT_MAX); }
         template<> __device__ ushort saturate_cast<ushort>(float v) { int iv = __float2int_rn(v); return saturate_cast<ushort>(iv); }
         template<> __device__ ushort saturate_cast<ushort>(double v)
         {
         #if defined (__CUDA_ARCH__) && __CUDA_ARCH__ >= 130
             int iv = __double2int_rn(v);
             return saturate_cast<ushort>(iv);
         #else
             return saturate_cast<ushort>((float)v);
         #endif
         }

         template<> __device__ short saturate_cast<short>(ushort v) { return (short)min((int)v, SHRT_MAX); }
         template<> __device__ short saturate_cast<short>(int v)    { return (short)((uint)(v - SHRT_MIN) <= (uint)USHRT_MAX ? v : v > 0 ? SHRT_MAX : SHRT_MIN); }
         template<> __device__ short saturate_cast<short>(uint v)   { return (short)min(v, (uint)SHRT_MAX); }
         template<> __device__ short saturate_cast<short>(float v)  { int iv = __float2int_rn(v); return saturate_cast<short>(iv); }
         template<> __device__ short saturate_cast<short>(double v)
         {
         #if defined (__CUDA_ARCH__) && __CUDA_ARCH__ >= 130
             int iv = __double2int_rn(v);
             return saturate_cast<short>(iv);
         #else
             return saturate_cast<short>((float)v);
         #endif
         }

         template<> __device__ int saturate_cast<int>(float v) { return __float2int_rn(v); }
         template<> __device__ int saturate_cast<int>(double v)
         {
         #if defined (__CUDA_ARCH__) && __CUDA_ARCH__ >= 130
             return __double2int_rn(v);
         #else
             return saturate_cast<int>((float)v);
         #endif
         }

         template<> __device__ uint saturate_cast<uint>(float v) { return __float2uint_rn(v); }
         template<> __device__ uint saturate_cast<uint>(double v)
         {
         #if defined (__CUDA_ARCH__) && __CUDA_ARCH__ >= 130
             return __double2uint_rn(v);
         #else
             return saturate_cast<uint>((float)v);
         #endif
         }
+        }
     }
 }
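These overloads mirror OpenCV's host-side cv::saturate_cast: convert with clamping to the destination range, rounding floating-point sources to nearest (via __float2int_rn / __double2int_rn on the device). A host-side illustration of the intended semantics:

    #include <opencv2/core/core.hpp>
    #include <cstdio>

    int main()
    {
        // Host-side cv::saturate_cast has the same contract as the device overloads:
        std::printf("%d\n", (int)cv::saturate_cast<short>(40000.7f)); // 32767 (rounded, then clamped)
        std::printf("%d\n", (int)cv::saturate_cast<uchar>(-3));       // 0
        std::printf("%d\n", (int)cv::saturate_cast<uchar>(300));      // 255
        return 0;
    }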