Skip to content
Projects
Groups
Snippets
Help
Loading...
Sign in / Register
Toggle navigation
O
opencv
Project
Project
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Packages
Packages
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
submodule
opencv
Commits
69e6d001
Commit
69e6d001
authored
May 08, 2013
by
peng xiao
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
Optimize stereobm a bit.
Speedup about 30% on 6730M GPU.
parent
ed2199a4
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
43 additions
and
39 deletions
+43
-39
stereobm.cl
modules/ocl/src/opencl/stereobm.cl
+43
-39
No files found.
modules/ocl/src/opencl/stereobm.cl
View file @
69e6d001
...
...
@@ -60,8 +60,9 @@ unsigned int CalcSSD(__local unsigned int *col_ssd)
{
unsigned
int
cache
=
col_ssd[0]
;
for
(
int
i
=
1
,
j
=
radius
+
1
; i <= radius; i++, j++)
cache
+=
col_ssd[i]
+
col_ssd[j]
;
#
pragma
unroll
for
(
int
i
=
1
; i <= (radius << 1); i++)
cache
+=
col_ssd[i]
;
return
cache
;
}
...
...
@@ -69,20 +70,22 @@ unsigned int CalcSSD(__local unsigned int *col_ssd)
uint2
MinSSD
(
__local
unsigned
int
*col_ssd
)
{
unsigned
int
ssd[N_DISPARITIES]
;
//See
above:
#
define
COL_SSD_SIZE
(
BLOCK_W
+
2
*
radius
)
ssd[0]
=
CalcSSD
(
col_ssd
+
0
*
(
BLOCK_W
+
2
*
radius
))
;
ssd[1]
=
CalcSSD
(
col_ssd
+
1
*
(
BLOCK_W
+
2
*
radius
))
;
ssd[2]
=
CalcSSD
(
col_ssd
+
2
*
(
BLOCK_W
+
2
*
radius
))
;
ssd[3]
=
CalcSSD
(
col_ssd
+
3
*
(
BLOCK_W
+
2
*
radius
))
;
ssd[4]
=
CalcSSD
(
col_ssd
+
4
*
(
BLOCK_W
+
2
*
radius
))
;
ssd[5]
=
CalcSSD
(
col_ssd
+
5
*
(
BLOCK_W
+
2
*
radius
))
;
ssd[6]
=
CalcSSD
(
col_ssd
+
6
*
(
BLOCK_W
+
2
*
radius
))
;
ssd[7]
=
CalcSSD
(
col_ssd
+
7
*
(
BLOCK_W
+
2
*
radius
))
;
const
int
win_size
=
(
radius
<<
1
)
;
//See
above:
#
define
COL_SSD_SIZE
(
BLOCK_W
+
WIN_SIZE
)
ssd[0]
=
CalcSSD
(
col_ssd
+
0
*
(
BLOCK_W
+
win_size
))
;
ssd[1]
=
CalcSSD
(
col_ssd
+
1
*
(
BLOCK_W
+
win_size
))
;
ssd[2]
=
CalcSSD
(
col_ssd
+
2
*
(
BLOCK_W
+
win_size
))
;
ssd[3]
=
CalcSSD
(
col_ssd
+
3
*
(
BLOCK_W
+
win_size
))
;
ssd[4]
=
CalcSSD
(
col_ssd
+
4
*
(
BLOCK_W
+
win_size
))
;
ssd[5]
=
CalcSSD
(
col_ssd
+
5
*
(
BLOCK_W
+
win_size
))
;
ssd[6]
=
CalcSSD
(
col_ssd
+
6
*
(
BLOCK_W
+
win_size
))
;
ssd[7]
=
CalcSSD
(
col_ssd
+
7
*
(
BLOCK_W
+
win_size
))
;
unsigned
int
mssd
=
min
(
min
(
min
(
ssd[0],
ssd[1]
)
,
min
(
ssd[4],
ssd[5]
))
,
min
(
min
(
ssd[2],
ssd[3]
)
,
min
(
ssd[6],
ssd[7]
)))
;
int
bestIdx
=
0
;
for
(
int
i
=
0
; i < N_DISPARITIES; i++)
{
if
(
mssd
==
ssd[i]
)
...
...
@@ -100,14 +103,15 @@ void StepDown(int idx1, int idx2, __global unsigned char* imageL,
uint8
diff1
=
(
uint8
)(
imageL[idx1]
)
-
imgR1
;
uint8
diff2
=
(
uint8
)(
imageL[idx2]
)
-
imgR2
;
uint8
res
=
diff2
*
diff2
-
diff1
*
diff1
;
col_ssd[0
*
(
BLOCK_W
+
2
*
radius
)
]
+=
res.s7
;
col_ssd[1
*
(
BLOCK_W
+
2
*
radius
)
]
+=
res.s6
;
col_ssd[2
*
(
BLOCK_W
+
2
*
radius
)
]
+=
res.s5
;
col_ssd[3
*
(
BLOCK_W
+
2
*
radius
)
]
+=
res.s4
;
col_ssd[4
*
(
BLOCK_W
+
2
*
radius
)
]
+=
res.s3
;
col_ssd[5
*
(
BLOCK_W
+
2
*
radius
)
]
+=
res.s2
;
col_ssd[6
*
(
BLOCK_W
+
2
*
radius
)
]
+=
res.s1
;
col_ssd[7
*
(
BLOCK_W
+
2
*
radius
)
]
+=
res.s0
;
const
int
win_size
=
(
radius
<<
1
)
;
col_ssd[0
*
(
BLOCK_W
+
win_size
)
]
+=
res.s7
;
col_ssd[1
*
(
BLOCK_W
+
win_size
)
]
+=
res.s6
;
col_ssd[2
*
(
BLOCK_W
+
win_size
)
]
+=
res.s5
;
col_ssd[3
*
(
BLOCK_W
+
win_size
)
]
+=
res.s4
;
col_ssd[4
*
(
BLOCK_W
+
win_size
)
]
+=
res.s3
;
col_ssd[5
*
(
BLOCK_W
+
win_size
)
]
+=
res.s2
;
col_ssd[6
*
(
BLOCK_W
+
win_size
)
]
+=
res.s1
;
col_ssd[7
*
(
BLOCK_W
+
win_size
)
]
+=
res.s0
;
}
void
InitColSSD
(
int
x_tex,
int
y_tex,
int
im_pitch,
__global
unsigned
char*
imageL,
...
...
@@ -115,28 +119,27 @@ void InitColSSD(int x_tex, int y_tex, int im_pitch, __global unsigned char* imag
__local
unsigned
int
*col_ssd
)
{
uint8
leftPixel1
;
int
idx
;
uint8
diffa
=
0
;
for
(
int
i
=
0
; i < (2 * radius + 1); i++)
int
idx
=
y_tex
*
im_pitch
+
x_tex
;
const
int
win_size
=
(
radius
<<
1
)
;
for
(
int
i
=
0
; i < (win_size + 1); i++)
{
idx
=
y_tex
*
im_pitch
+
x_tex
;
leftPixel1
=
(
uint8
)(
imageL[idx]
)
;
uint8
imgR
=
convert_uint8
(
vload8
(
0
,
imageR
+
(
idx
-
d
-
7
)))
;
uint8
res
=
leftPixel1
-
imgR
;
diffa
+=
res
*
res
;
y_tex
+=
1
;
idx
+=
im_pitch
;
}
//See
above:
#
define
COL_SSD_SIZE
(
BLOCK_W
+
2
*
radius
)
col_ssd[0
*
(
BLOCK_W
+
2
*
radius
)
]
=
diffa.s7
;
col_ssd[1
*
(
BLOCK_W
+
2
*
radius
)
]
=
diffa.s6
;
col_ssd[2
*
(
BLOCK_W
+
2
*
radius
)
]
=
diffa.s5
;
col_ssd[3
*
(
BLOCK_W
+
2
*
radius
)
]
=
diffa.s4
;
col_ssd[4
*
(
BLOCK_W
+
2
*
radius
)
]
=
diffa.s3
;
col_ssd[5
*
(
BLOCK_W
+
2
*
radius
)
]
=
diffa.s2
;
col_ssd[6
*
(
BLOCK_W
+
2
*
radius
)
]
=
diffa.s1
;
col_ssd[7
*
(
BLOCK_W
+
2
*
radius
)
]
=
diffa.s0
;
//See
above:
#
define
COL_SSD_SIZE
(
BLOCK_W
+
WIN_SIZE
)
col_ssd[0
*
(
BLOCK_W
+
win_size
)
]
=
diffa.s7
;
col_ssd[1
*
(
BLOCK_W
+
win_size
)
]
=
diffa.s6
;
col_ssd[2
*
(
BLOCK_W
+
win_size
)
]
=
diffa.s5
;
col_ssd[3
*
(
BLOCK_W
+
win_size
)
]
=
diffa.s4
;
col_ssd[4
*
(
BLOCK_W
+
win_size
)
]
=
diffa.s3
;
col_ssd[5
*
(
BLOCK_W
+
win_size
)
]
=
diffa.s2
;
col_ssd[6
*
(
BLOCK_W
+
win_size
)
]
=
diffa.s1
;
col_ssd[7
*
(
BLOCK_W
+
win_size
)
]
=
diffa.s0
;
}
__kernel
void
stereoKernel
(
__global
unsigned
char
*left,
__global
unsigned
char
*right,
...
...
@@ -145,15 +148,14 @@ __kernel void stereoKernel(__global unsigned char *left, __global unsigned char
int
img_step,
int
maxdisp,
__local
unsigned
int
*col_ssd_cache
)
{
__local
unsigned
int
*col_ssd
=
col_ssd_cache
+
get_local_id
(
0
)
;
__local
unsigned
int
*col_ssd_extra
=
get_local_id
(
0
)
<
(
2
*
radius
)
?
col_ssd
+
BLOCK_W
:
0
;
__local
unsigned
int
*col_ssd
=
col_ssd_cache
+
get_local_id
(
0
)
;
__local
unsigned
int
*col_ssd_extra
=
get_local_id
(
0
)
<
(
radius
<<
1
)
?
col_ssd
+
BLOCK_W
:
0
;
int
X
=
get_group_id
(
0
)
*
BLOCK_W
+
get_local_id
(
0
)
+
maxdisp
+
radius
;
//
int
Y
=
get_group_id
(
1
)
*
ROWSperTHREAD
+
radius
;
#
define
Y
(
get_group_id
(
1
)
*
ROWSperTHREAD
+
radius
)
__global
unsigned
int*
minSSDImage
=
cminSSDImage
+
X
+
Y
*
cminSSD_step
;
__global
unsigned
int*
minSSDImage
=
cminSSDImage
+
X
+
Y
*
cminSSD_step
;
__global
unsigned
char*
disparImage
=
disp
+
X
+
Y
*
disp_step
;
int
end_row
=
ROWSperTHREAD
<
(
cheight
-
Y
)
?
ROWSperTHREAD:
(
cheight
-
Y
)
;
...
...
@@ -187,7 +189,9 @@ __kernel void stereoKernel(__global unsigned char *left, __global unsigned char
for
(
int
row
=
1
; row < end_row; row++)
{
int
idx1
=
y_tex
*
img_step
+
x_tex
;
int
idx2
=
min
(
y_tex
+
(
2
*
radius
+
1
)
,
cheight
-
1
)
*
img_step
+
x_tex
;
int
idx2
=
min
(
y_tex
+
((
radius
<<
1
)
+
1
)
,
cheight
-
1
)
*
img_step
+
x_tex
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
StepDown
(
idx1,
idx2,
left,
right,
d,
col_ssd
)
;
if
(
col_ssd_extra
>
0
)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment