Skip to content
Projects
Groups
Snippets
Help
Loading...
Sign in / Register
Toggle navigation
O
opencv
Project
Project
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Packages
Packages
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
submodule
opencv
Commits
24ca620c
Commit
24ca620c
authored
May 28, 2013
by
Vadim Pisarevsky
Committed by
OpenCV Buildbot
May 28, 2013
Browse files
Options
Browse Files
Download
Plain Diff
Merge pull request #916 from bitwangyaoyao:2.4_fixPyrLK
parents
d3c7ae6a
d81c145f
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
119 additions
and
404 deletions
+119
-404
hog.cpp
modules/ocl/src/hog.cpp
+3
-1
pyrlk.cl
modules/ocl/src/opencl/pyrlk.cl
+100
-332
pyrlk.cpp
modules/ocl/src/pyrlk.cpp
+16
-71
No files found.
modules/ocl/src/hog.cpp
View file @
24ca620c
...
...
@@ -1578,7 +1578,9 @@ static void openCLExecuteKernel_hog(Context *clCxt , const char **source, string
size_t
globalThreads
[
3
],
size_t
localThreads
[
3
],
vector
<
pair
<
size_t
,
const
void
*>
>
&
args
)
{
size_t
wave_size
=
queryDeviceInfo
<
WAVEFRONT_SIZE
,
size_t
>
();
cl_kernel
kernel
=
openCLGetKernelFromSource
(
clCxt
,
source
,
kernelName
);
size_t
wave_size
=
queryDeviceInfo
<
WAVEFRONT_SIZE
,
size_t
>
(
kernel
);
openCLSafeCall
(
clReleaseKernel
(
kernel
));
if
(
wave_size
<=
16
)
{
char
build_options
[
64
];
...
...
modules/ocl/src/opencl/pyrlk.cl
View file @
24ca620c
...
...
@@ -46,145 +46,10 @@
//#pragma
OPENCL
EXTENSION
cl_amd_printf
:
enable
__kernel
void
calcSharrDeriv_vertical_C1_D0
(
__global
const
uchar*
src,
int
srcStep,
int
rows,
int
cols,
int
cn,
__global
short*
dx_buf,
int
dx_bufStep,
__global
short*
dy_buf,
int
dy_bufStep
)
{
const
int
x
=
get_global_id
(
0
)
;
const
int
y
=
get_global_id
(
1
)
;
if
(
y
<
rows
&&
x
<
cols
*
cn
)
{
const
uchar
src_val0
=
(
src
+
(
y
>
0
?
y-1
:
rows
>
1
?
1
:
0
)
*
srcStep
)
[x]
;
const
uchar
src_val1
=
(
src
+
y
*
srcStep
)
[x]
;
const
uchar
src_val2
=
(
src
+
(
y
<
rows-1
?
y+1
:
rows
>
1
?
rows-2
:
0
)
*
srcStep
)
[x]
;
((
__global
short*
)((
__global
char*
)
dx_buf
+
y
*
dx_bufStep
/
2
))
[x]
=
(
src_val0
+
src_val2
)
*
3
+
src_val1
*
10
;
((
__global
short*
)((
__global
char*
)
dy_buf
+
y
*
dy_bufStep
/
2
))
[x]
=
src_val2
-
src_val0
;
}
}
__kernel
void
calcSharrDeriv_vertical_C4_D0
(
__global
const
uchar*
src,
int
srcStep,
int
rows,
int
cols,
int
cn,
__global
short*
dx_buf,
int
dx_bufStep,
__global
short*
dy_buf,
int
dy_bufStep
)
{
const
int
x
=
get_global_id
(
0
)
;
const
int
y
=
get_global_id
(
1
)
;
if
(
y
<
rows
&&
x
<
cols
*
cn
)
{
const
uchar
src_val0
=
(
src
+
(
y
>
0
?
y
-
1
:
1
)
*
srcStep
)
[x]
;
const
uchar
src_val1
=
(
src
+
y
*
srcStep
)
[x]
;
const
uchar
src_val2
=
(
src
+
(
y
<
rows
-
1
?
y
+
1
:
rows
-
2
)
*
srcStep
)
[x]
;
((
__global
short*
)((
__global
char*
)
dx_buf
+
y
*
dx_bufStep
/
2
))
[x]
=
(
src_val0
+
src_val2
)
*
3
+
src_val1
*
10
;
((
__global
short*
)((
__global
char*
)
dy_buf
+
y
*
dy_bufStep
/
2
))
[x]
=
src_val2
-
src_val0
;
}
}
__kernel
void
calcSharrDeriv_horizontal_C1_D0
(
int
rows,
int
cols,
int
cn,
__global
const
short*
dx_buf,
int
dx_bufStep,
__global
const
short*
dy_buf,
int
dy_bufStep,
__global
short*
dIdx,
int
dIdxStep,
__global
short*
dIdy,
int
dIdyStep
)
{
const
int
x
=
get_global_id
(
0
)
;
const
int
y
=
get_global_id
(
1
)
;
const
int
colsn
=
cols
*
cn
;
if
(
y
<
rows
&&
x
<
colsn
)
{
__global
const
short*
dx_buf_row
=
dx_buf
+
y
*
dx_bufStep
;
__global
const
short*
dy_buf_row
=
dy_buf
+
y
*
dy_bufStep
;
const
int
xr
=
x
+
cn
<
colsn
?
x
+
cn
:
(
cols
-
2
)
*
cn
+
x
+
cn
-
colsn
;
const
int
xl
=
x
-
cn
>=
0
?
x
-
cn
:
cn
+
x
;
((
__global
short*
)((
__global
char*
)
dIdx
+
y
*
dIdxStep
/
2
))
[x]
=
dx_buf_row[xr]
-
dx_buf_row[xl]
;
((
__global
short*
)((
__global
char*
)
dIdy
+
y
*
dIdyStep
/
2
))
[x]
=
(
dy_buf_row[xr]
+
dy_buf_row[xl]
)
*
3
+
dy_buf_row[x]
*
10
;
}
}
__kernel
void
calcSharrDeriv_horizontal_C4_D0
(
int
rows,
int
cols,
int
cn,
__global
const
short*
dx_buf,
int
dx_bufStep,
__global
const
short*
dy_buf,
int
dy_bufStep,
__global
short*
dIdx,
int
dIdxStep,
__global
short*
dIdy,
int
dIdyStep
)
{
const
int
x
=
get_global_id
(
0
)
;
const
int
y
=
get_global_id
(
1
)
;
const
int
colsn
=
cols
*
cn
;
if
(
y
<
rows
&&
x
<
colsn
)
{
__global
const
short*
dx_buf_row
=
dx_buf
+
y
*
dx_bufStep
;
__global
const
short*
dy_buf_row
=
dy_buf
+
y
*
dy_bufStep
;
const
int
xr
=
x
+
cn
<
colsn
?
x
+
cn
:
(
cols
-
2
)
*
cn
+
x
+
cn
-
colsn
;
const
int
xl
=
x
-
cn
>=
0
?
x
-
cn
:
cn
+
x
;
((
__global
short*
)((
__global
char*
)
dIdx
+
y
*
dIdxStep
/
2
))
[x]
=
dx_buf_row[xr]
-
dx_buf_row[xl]
;
((
__global
short*
)((
__global
char*
)
dIdy
+
y
*
dIdyStep
/
2
))
[x]
=
(
dy_buf_row[xr]
+
dy_buf_row[xl]
)
*
3
+
dy_buf_row[x]
*
10
;
}
}
#
define
W_BITS
14
#
define
W_BITS1
14
#
define
CV_DESCALE
(
x,
n
)
(((
x
)
+
(
1
<<
((
n
)
-1
)))
>>
(
n
))
int
linearFilter_uchar
(
__global
const
uchar*
src,
int
srcStep,
int
cn,
float2
pt,
int
x,
int
y
)
{
int2
ipt
;
ipt.x
=
convert_int_sat_rtn
(
pt.x
)
;
ipt.y
=
convert_int_sat_rtn
(
pt.y
)
;
float
a
=
pt.x
-
ipt.x
;
float
b
=
pt.y
-
ipt.y
;
int
iw00
=
convert_int_sat_rte
((
1.0f
-
a
)
*
(
1.0f
-
b
)
*
(
1
<<
W_BITS
))
;
int
iw01
=
convert_int_sat_rte
(
a
*
(
1.0f
-
b
)
*
(
1
<<
W_BITS
))
;
int
iw10
=
convert_int_sat_rte
((
1.0f
-
a
)
*
b
*
(
1
<<
W_BITS
))
;
int
iw11
=
(
1
<<
W_BITS
)
-
iw00
-
iw01
-
iw10
;
__global
const
uchar*
src_row
=
src
+
(
ipt.y
+
y
)
*
srcStep
+
ipt.x
*
cn
;
__global
const
uchar*
src_row1
=
src
+
(
ipt.y
+
y
+
1
)
*
srcStep
+
ipt.x
*
cn
;
return
CV_DESCALE
(
src_row[x]
*
iw00
+
src_row[x
+
cn]
*
iw01
+
src_row1[x]
*
iw10
+
src_row1[x
+
cn]
*
iw11,
W_BITS1
-
5
)
;
}
int
linearFilter_short
(
__global
const
short*
src,
int
srcStep,
int
cn,
float2
pt,
int
x,
int
y
)
{
int2
ipt
;
ipt.x
=
convert_int_sat_rtn
(
pt.x
)
;
ipt.y
=
convert_int_sat_rtn
(
pt.y
)
;
float
a
=
pt.x
-
ipt.x
;
float
b
=
pt.y
-
ipt.y
;
int
iw00
=
convert_int_sat_rte
((
1.0f
-
a
)
*
(
1.0f
-
b
)
*
(
1
<<
W_BITS
))
;
int
iw01
=
convert_int_sat_rte
(
a
*
(
1.0f
-
b
)
*
(
1
<<
W_BITS
))
;
int
iw10
=
convert_int_sat_rte
((
1.0f
-
a
)
*
b
*
(
1
<<
W_BITS
))
;
int
iw11
=
(
1
<<
W_BITS
)
-
iw00
-
iw01
-
iw10
;
__global
const
short*
src_row
=
src
+
(
ipt.y
+
y
)
*
srcStep
+
ipt.x
*
cn
;
__global
const
short*
src_row1
=
src
+
(
ipt.y
+
y
+
1
)
*
srcStep
+
ipt.x
*
cn
;
return
CV_DESCALE
(
src_row[x]
*
iw00
+
src_row[x
+
cn]
*
iw01
+
src_row1[x]
*
iw10
+
src_row1[x
+
cn]
*
iw11,
W_BITS1
)
;
}
float
linearFilter_float
(
__global
const
float*
src,
int
srcStep,
int
cn,
float2
pt,
float
x,
float
y
)
{
int2
ipt
;
ipt.x
=
convert_int_sat_rtn
(
pt.x
)
;
ipt.y
=
convert_int_sat_rtn
(
pt.y
)
;
float
a
=
pt.x
-
ipt.x
;
float
b
=
pt.y
-
ipt.y
;
float
iw00
=
((
1.0f
-
a
)
*
(
1.0f
-
b
)
*
(
1
<<
W_BITS
))
;
float
iw01
=
(
a
*
(
1.0f
-
b
)
*
(
1
<<
W_BITS
))
;
float
iw10
=
((
1.0f
-
a
)
*
b
*
(
1
<<
W_BITS
))
;
float
iw11
=
(
1
<<
W_BITS
)
-
iw00
-
iw01
-
iw10
;
__global
const
float*
src_row
=
src
+
(
int
)(
ipt.y
+
y
)
*
srcStep
/
4
+
ipt.x
*
cn
;
__global
const
float*
src_row1
=
src
+
(
int
)(
ipt.y
+
y
+
1
)
*
srcStep
/
4
+
ipt.x
*
cn
;
return
src_row[
(
int
)
x]
*
iw00
+
src_row[
(
int
)
x
+
cn]
*
iw01
+
src_row1[
(
int
)
x]
*
iw10
+
src_row1[
(
int
)
x
+
cn]
*
iw11,
W_BITS1
-
5
;
}
#
define
BUFFER
64
#
ifndef
WAVE_SIZE
#
define
WAVE_SIZE
1
#
endif
#
ifdef
CPU
void
reduce3
(
float
val1,
float
val2,
float
val3,
__local
float*
smem1,
__local
float*
smem2,
__local
float*
smem3,
int
tid
)
{
...
...
@@ -193,71 +58,51 @@ void reduce3(float val1, float val2, float val3, __local float* smem1, __local
smem3[tid]
=
val3
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
#
if
BUFFER
>
128
if
(
tid
<
128
)
{
smem1[tid]
=
val1
+=
smem1[tid
+
128]
;
smem2[tid]
=
val2
+=
smem2[tid
+
128]
;
smem3[tid]
=
val3
+=
smem3[tid
+
128]
;
}
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
#
endif
#
if
BUFFER
>
64
if
(
tid
<
64
)
{
smem1[tid]
=
val1
+=
smem1[tid
+
64]
;
smem2[tid]
=
val2
+=
smem2[tid
+
64]
;
smem3[tid]
=
val3
+=
smem3[tid
+
64]
;
}
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
#
endif
if
(
tid
<
32
)
{
smem1[tid]
=
val1
+=
smem1[tid
+
32]
;
smem2[tid]
=
val2
+=
smem2[tid
+
32]
;
smem3[tid]
=
val3
+=
smem3[tid
+
32]
;
smem1[tid]
+=
smem1[tid
+
32]
;
smem2[tid]
+=
smem2[tid
+
32]
;
smem3[tid]
+=
smem3[tid
+
32]
;
}
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
if
(
tid
<
16
)
{
smem1[tid]
=
val1
+=
smem1[tid
+
16]
;
smem2[tid]
=
val2
+=
smem2[tid
+
16]
;
smem3[tid]
=
val3
+=
smem3[tid
+
16]
;
smem1[tid]
+=
smem1[tid
+
16]
;
smem2[tid]
+=
smem2[tid
+
16]
;
smem3[tid]
+=
smem3[tid
+
16]
;
}
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
if
(
tid
<
8
)
{
smem1[tid]
=
val1
+=
smem1[tid
+
8]
;
smem2[tid]
=
val2
+=
smem2[tid
+
8]
;
smem3[tid]
=
val3
+=
smem3[tid
+
8]
;
smem1[tid]
+=
smem1[tid
+
8]
;
smem2[tid]
+=
smem2[tid
+
8]
;
smem3[tid]
+=
smem3[tid
+
8]
;
}
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
if
(
tid
<
4
)
{
smem1[tid]
=
val1
+=
smem1[tid
+
4]
;
smem2[tid]
=
val2
+=
smem2[tid
+
4]
;
smem3[tid]
=
val3
+=
smem3[tid
+
4]
;
smem1[tid]
+=
smem1[tid
+
4]
;
smem2[tid]
+=
smem2[tid
+
4]
;
smem3[tid]
+=
smem3[tid
+
4]
;
}
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
if
(
tid
<
2
)
{
smem1[tid]
=
val1
+=
smem1[tid
+
2]
;
smem2[tid]
=
val2
+=
smem2[tid
+
2]
;
smem3[tid]
=
val3
+=
smem3[tid
+
2]
;
smem1[tid]
+=
smem1[tid
+
2]
;
smem2[tid]
+=
smem2[tid
+
2]
;
smem3[tid]
+=
smem3[tid
+
2]
;
}
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
if
(
tid
<
1
)
{
smem1[BUFFER]
=
val1
+=
smem1[tid
+
1]
;
smem2[BUFFER]
=
val2
+=
smem2[tid
+
1]
;
smem3[BUFFER]
=
val3
+=
smem3[tid
+
1]
;
smem1[BUFFER]
=
smem1[tid]
+
smem1[tid
+
1]
;
smem2[BUFFER]
=
smem2[tid]
+
smem2[tid
+
1]
;
smem3[BUFFER]
=
smem3[tid]
+
smem3[tid
+
1]
;
}
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
}
...
...
@@ -268,63 +113,45 @@ void reduce2(float val1, float val2, volatile __local float* smem1, volatile __l
smem2[tid]
=
val2
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
#
if
BUFFER
>
128
if
(
tid
<
128
)
{
smem1[tid]
=
(
val1
+=
smem1[tid
+
128]
)
;
smem2[tid]
=
(
val2
+=
smem2[tid
+
128]
)
;
}
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
#
endif
#
if
BUFFER
>
64
if
(
tid
<
64
)
{
smem1[tid]
=
(
val1
+=
smem1[tid
+
64]
)
;
smem2[tid]
=
(
val2
+=
smem2[tid
+
64]
)
;
}
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
#
endif
if
(
tid
<
32
)
{
smem1[tid]
=
(
val1
+=
smem1[tid
+
32]
)
;
smem2[tid]
=
(
val2
+=
smem2[tid
+
32]
)
;
smem1[tid]
+=
smem1[tid
+
32]
;
smem2[tid]
+=
smem2[tid
+
32]
;
}
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
if
(
tid
<
16
)
{
smem1[tid]
=
(
val1
+=
smem1[tid
+
16]
)
;
smem2[tid]
=
(
val2
+=
smem2[tid
+
16]
)
;
smem1[tid]
+=
smem1[tid
+
16]
;
smem2[tid]
+=
smem2[tid
+
16]
;
}
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
if
(
tid
<
8
)
{
smem1[tid]
=
(
val1
+=
smem1[tid
+
8]
)
;
smem2[tid]
=
(
val2
+=
smem2[tid
+
8]
)
;
smem1[tid]
+=
smem1[tid
+
8]
;
smem2[tid]
+=
smem2[tid
+
8]
;
}
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
if
(
tid
<
4
)
{
smem1[tid]
=
(
val1
+=
smem1[tid
+
4]
)
;
smem2[tid]
=
(
val2
+=
smem2[tid
+
4]
)
;
smem1[tid]
+=
smem1[tid
+
4]
;
smem2[tid]
+=
smem2[tid
+
4]
;
}
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
if
(
tid
<
2
)
{
smem1[tid]
=
(
val1
+=
smem1[tid
+
2]
)
;
smem2[tid]
=
(
val2
+=
smem2[tid
+
2]
)
;
smem1[tid]
+=
smem1[tid
+
2]
;
smem2[tid]
+=
smem2[tid
+
2]
;
}
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
if
(
tid
<
1
)
{
smem1[BUFFER]
=
(
val1
+=
smem1[tid
+
1]
)
;
smem2[BUFFER]
=
(
val2
+=
smem2[tid
+
1]
)
;
smem1[BUFFER]
=
smem1[tid]
+
smem1[tid
+
1]
;
smem2[BUFFER]
=
smem2[tid]
+
smem2[tid
+
1]
;
}
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
}
...
...
@@ -334,205 +161,146 @@ void reduce1(float val1, volatile __local float* smem1, int tid)
smem1[tid]
=
val1
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
#
if
BUFFER
>
128
if
(
tid
<
128
)
{
smem1[tid]
=
(
val1
+=
smem1[tid
+
128]
)
;
}
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
#
endif
#
if
BUFFER
>
64
if
(
tid
<
64
)
{
smem1[tid]
=
(
val1
+=
smem1[tid
+
64]
)
;
}
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
#
endif
if
(
tid
<
32
)
{
smem1[tid]
=
(
val1
+=
smem1[tid
+
32]
)
;
smem1[tid]
+=
smem1[tid
+
32]
;
}
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
if
(
tid
<
16
)
{
smem1[tid]
=
(
val1
+=
smem1[tid
+
16]
)
;
smem1[tid]
+=
smem1[tid
+
16]
;
}
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
if
(
tid
<
8
)
{
smem1[tid]
=
(
val1
+=
smem1[tid
+
8]
)
;
smem1[tid]
+=
smem1[tid
+
8]
;
}
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
if
(
tid
<
4
)
{
smem1[tid]
=
(
val1
+=
smem1[tid
+
4]
)
;
smem1[tid]
+=
smem1[tid
+
4]
;
}
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
if
(
tid
<
2
)
{
smem1[tid]
=
(
val1
+=
smem1[tid
+
2]
)
;
smem1[tid]
+=
smem1[tid
+
2]
;
}
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
if
(
tid
<
1
)
{
smem1[BUFFER]
=
(
val1
+=
smem1[tid
+
1]
)
;
smem1[BUFFER]
=
smem1[tid]
+
smem1[tid
+
1]
;
}
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
}
#
else
void
reduce3
(
float
val1,
float
val2,
float
val3,
__local
float*
smem1,
__local
float*
smem2,
__local
float*
smem3,
int
tid
)
void
reduce3
(
float
val1,
float
val2,
float
val3,
__local
volatile
float*
smem1,
__local
volatile
float*
smem2,
__local
volatile
float*
smem3,
int
tid
)
{
smem1[tid]
=
val1
;
smem2[tid]
=
val2
;
smem3[tid]
=
val3
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
#
if
BUFFER
>
128
if
(
tid
<
128
)
if
(
tid
<
32
)
{
smem1[tid]
=
val1
+=
smem1[tid
+
128]
;
smem2[tid]
=
val2
+=
smem2[tid
+
128]
;
smem3[tid]
=
val3
+=
smem3[tid
+
128]
;
}
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
smem1[tid]
+=
smem1[tid
+
32]
;
smem2[tid]
+=
smem2[tid
+
32]
;
smem3[tid]
+=
smem3[tid
+
32]
;
#
if
WAVE_SIZE
<
32
}
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
if
(
tid
<
16
)
{
#
endif
#
if
BUFFER
>
64
if
(
tid
<
64
)
{
smem1[tid]
=
val1
+=
smem1[tid
+
64]
;
smem2[tid]
=
val2
+=
smem2[tid
+
64]
;
smem3[tid]
=
val3
+=
smem3[tid
+
64]
;
}
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
smem1[tid]
+=
smem1[tid
+
16]
;
smem2[tid]
+=
smem2[tid
+
16]
;
smem3[tid]
+=
smem3[tid
+
16]
;
#
if
WAVE_SIZE
<16
}
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
if
(
tid
<
8
)
{
#
endif
smem1[tid]
+=
smem1[tid
+
8]
;
smem2[tid]
+=
smem2[tid
+
8]
;
smem3[tid]
+=
smem3[tid
+
8]
;
if
(
tid
<
32
)
{
volatile
__local
float*
vmem1
=
smem1
;
volatile
__local
float*
vmem2
=
smem2
;
volatile
__local
float*
vmem3
=
smem3
;
vmem1[tid]
=
val1
+=
vmem1[tid
+
32]
;
vmem2[tid]
=
val2
+=
vmem2[tid
+
32]
;
vmem3[tid]
=
val3
+=
vmem3[tid
+
32]
;
vmem1[tid]
=
val1
+=
vmem1[tid
+
16]
;
vmem2[tid]
=
val2
+=
vmem2[tid
+
16]
;
vmem3[tid]
=
val3
+=
vmem3[tid
+
16]
;
vmem1[tid]
=
val1
+=
vmem1[tid
+
8]
;
vmem2[tid]
=
val2
+=
vmem2[tid
+
8]
;
vmem3[tid]
=
val3
+=
vmem3[tid
+
8]
;
smem1[tid]
+=
smem1[tid
+
4]
;
smem2[tid]
+=
smem2[tid
+
4]
;
smem3[tid]
+=
smem3[tid
+
4]
;
vmem1[tid]
=
val1
+=
vmem1[tid
+
4
]
;
vmem2[tid]
=
val2
+=
vmem2[tid
+
4
]
;
vmem3[tid]
=
val3
+=
vmem3[tid
+
4
]
;
smem1[tid]
+=
smem1[tid
+
2
]
;
smem2[tid]
+=
smem2[tid
+
2
]
;
smem3[tid]
+=
smem3[tid
+
2
]
;
vmem1[tid]
=
val1
+=
vmem1[tid
+
2]
;
vmem2[tid]
=
val2
+=
vmem2[tid
+
2]
;
vmem3[tid]
=
val3
+=
vmem3[tid
+
2]
;
vmem1[tid]
=
val1
+=
vmem1[tid
+
1]
;
vmem2[tid]
=
val2
+=
vmem2[tid
+
1]
;
vmem3[tid]
=
val3
+=
vmem3[tid
+
1]
;
smem1[tid]
+=
smem1[tid
+
1]
;
smem2[tid]
+=
smem2[tid
+
1]
;
smem3[tid]
+=
smem3[tid
+
1]
;
}
}
void
reduce2
(
float
val1,
float
val2,
__local
float*
smem1,
__local
float*
smem2,
int
tid
)
void
reduce2
(
float
val1,
float
val2,
__local
volatile
float*
smem1,
__local
volatile
float*
smem2,
int
tid
)
{
smem1[tid]
=
val1
;
smem2[tid]
=
val2
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
#
if
BUFFER
>
128
if
(
tid
<
128
)
if
(
tid
<
32
)
{
smem1[tid]
=
val1
+=
smem1[tid
+
128]
;
smem2[tid]
=
val2
+=
smem2[tid
+
128]
;
}
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
smem1[tid]
+=
smem1[tid
+
32]
;
smem2[tid]
+=
smem2[tid
+
32]
;
#
if
WAVE_SIZE
<
32
}
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
if
(
tid
<
16
)
{
#
endif
#
if
BUFFER
>
64
if
(
tid
<
64
)
{
smem1[tid]
=
val1
+=
smem1[tid
+
64]
;
smem2[tid]
=
val2
+=
smem2[tid
+
64]
;
}
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
smem1[tid]
+=
smem1[tid
+
16]
;
smem2[tid]
+=
smem2[tid
+
16]
;
#
if
WAVE_SIZE
<16
}
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
if
(
tid
<
8
)
{
#
endif
smem1[tid]
+=
smem1[tid
+
8]
;
smem2[tid]
+=
smem2[tid
+
8]
;
if
(
tid
<
32
)
{
volatile
__local
float*
vmem1
=
smem1
;
volatile
__local
float*
vmem2
=
smem2
;
vmem1[tid]
=
val1
+=
vmem1[tid
+
32]
;
vmem2[tid]
=
val2
+=
vmem2[tid
+
32]
;
vmem1[tid]
=
val1
+=
vmem1[tid
+
16]
;
vmem2[tid]
=
val2
+=
vmem2[tid
+
16]
;
smem1[tid]
+=
smem1[tid
+
4]
;
smem2[tid]
+=
smem2[tid
+
4]
;
vmem1[tid]
=
val1
+=
vmem1[tid
+
8
]
;
vmem2[tid]
=
val2
+=
vmem2[tid
+
8
]
;
smem1[tid]
+=
smem1[tid
+
2
]
;
smem2[tid]
+=
smem2[tid
+
2
]
;
vmem1[tid]
=
val1
+=
vmem1[tid
+
4]
;
vmem2[tid]
=
val2
+=
vmem2[tid
+
4]
;
vmem1[tid]
=
val1
+=
vmem1[tid
+
2]
;
vmem2[tid]
=
val2
+=
vmem2[tid
+
2]
;
vmem1[tid]
=
val1
+=
vmem1[tid
+
1]
;
vmem2[tid]
=
val2
+=
vmem2[tid
+
1]
;
smem1[tid]
+=
smem1[tid
+
1]
;
smem2[tid]
+=
smem2[tid
+
1]
;
}
}
void
reduce1
(
float
val1,
__local
float*
smem1,
int
tid
)
void
reduce1
(
float
val1,
__local
volatile
float*
smem1,
int
tid
)
{
smem1[tid]
=
val1
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
#
if
BUFFER
>
128
if
(
tid
<
128
)
if
(
tid
<
32
)
{
smem1[tid]
=
val1
+=
smem1[tid
+
128]
;
}
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
smem1[tid]
+=
smem1[tid
+
32]
;
#
if
WAVE_SIZE
<
32
}
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
if
(
tid
<
16
)
{
#
endif
#
if
BUFFER
>
64
if
(
tid
<
64
)
{
smem1[tid]
=
val1
+=
smem1[tid
+
64]
;
}
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
smem1[tid]
+=
smem1[tid
+
16]
;
#
if
WAVE_SIZE
<16
}
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
if
(
tid
<
8
)
{
#
endif
if
(
tid
<
32
)
{
volatile
__local
float*
vmem1
=
smem1
;
vmem1[tid]
=
val1
+=
vmem1[tid
+
32]
;
vmem1[tid]
=
val1
+=
vmem1[tid
+
16]
;
vmem1[tid]
=
val1
+=
vmem1[tid
+
8]
;
vmem1[tid]
=
val1
+=
vmem1[tid
+
4]
;
vmem1[tid]
=
val1
+=
vmem1[tid
+
2]
;
vmem1[tid]
=
val1
+=
vmem1[tid
+
1]
;
smem1[tid]
+=
smem1[tid
+
8]
;
smem1[tid]
+=
smem1[tid
+
4]
;
smem1[tid]
+=
smem1[tid
+
2]
;
smem1[tid]
+=
smem1[tid
+
1]
;
}
}
#
endif
#
define
SCALE
(
1.0f
/
(
1
<<
20
))
#
define
THRESHOLD
0.01f
#
define
DIMENSION
21
//
Image
read
mode
__constant
sampler_t
sampler
=
CLK_NORMALIZED_COORDS_FALSE
| CLK_ADDRESS_CLAMP_TO_EDGE |
CLK_FILTER_LINEAR
;
...
...
modules/ocl/src/pyrlk.cpp
View file @
24ca620c
...
...
@@ -15,8 +15,8 @@
// Third party copyrights are property of their respective owners.
//
// @Authors
//
Dachuan Zhao, dachuan@multicorewareinc.com
//
Yao Wang, bitwangyaoyao@gmail.com
//
Dachuan Zhao, dachuan@multicorewareinc.com
//
Yao Wang, bitwangyaoyao@gmail.com
// Nathan, liujun@multicorewareinc.com
//
// Redistribution and use in source and binary forms, with or without modification,
...
...
@@ -56,31 +56,16 @@ namespace cv
{
namespace
ocl
{
///////////////////////////OpenCL kernel strings///////////////////////////
extern
const
char
*
pyrlk
;
extern
const
char
*
pyrlk_no_image
;
extern
const
char
*
arithm_mul
;
}
}
struct
dim3
{
unsigned
int
x
,
y
,
z
;
};
struct
float2
{
float
x
,
y
;
};
struct
int2
{
int
x
,
y
;
};
namespace
{
void
calcPatchSize
(
cv
::
Size
winSize
,
int
cn
,
dim3
&
block
,
dim3
&
patch
,
bool
isDeviceArch11
)
static
void
calcPatchSize
(
cv
::
Size
winSize
,
int
cn
,
dim3
&
block
,
dim3
&
patch
,
bool
isDeviceArch11
)
{
winSize
.
width
*=
cn
;
...
...
@@ -100,45 +85,6 @@ void calcPatchSize(cv::Size winSize, int cn, dim3 &block, dim3 &patch, bool isDe
block
.
z
=
patch
.
z
=
1
;
}
}
static
void
multiply_cus
(
const
oclMat
&
src1
,
oclMat
&
dst
,
float
scalar
)
{
if
(
!
src1
.
clCxt
->
supportsFeature
(
Context
::
CL_DOUBLE
)
&&
src1
.
type
()
==
CV_64F
)
{
CV_Error
(
CV_GpuNotSupported
,
"Selected device don't support double
\r\n
"
);
return
;
}
CV_Assert
(
src1
.
cols
==
dst
.
cols
&&
src1
.
rows
==
dst
.
rows
);
CV_Assert
(
src1
.
type
()
==
dst
.
type
());
CV_Assert
(
src1
.
depth
()
!=
CV_8S
);
Context
*
clCxt
=
src1
.
clCxt
;
size_t
localThreads
[
3
]
=
{
16
,
16
,
1
};
size_t
globalThreads
[
3
]
=
{
src1
.
cols
,
src1
.
rows
,
1
};
int
dst_step1
=
dst
.
cols
*
dst
.
elemSize
();
vector
<
pair
<
size_t
,
const
void
*>
>
args
;
args
.
push_back
(
make_pair
(
sizeof
(
cl_mem
),
(
void
*
)
&
src1
.
data
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
),
(
void
*
)
&
src1
.
step
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
),
(
void
*
)
&
src1
.
offset
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_mem
),
(
void
*
)
&
dst
.
data
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
),
(
void
*
)
&
dst
.
step
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
),
(
void
*
)
&
dst
.
offset
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
),
(
void
*
)
&
src1
.
rows
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
),
(
void
*
)
&
src1
.
cols
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
),
(
void
*
)
&
dst_step1
));
args
.
push_back
(
make_pair
(
sizeof
(
float
),
(
float
*
)
&
scalar
));
openCLExecuteKernel
(
clCxt
,
&
arithm_mul
,
"arithm_muls"
,
globalThreads
,
localThreads
,
args
,
-
1
,
src1
.
depth
());
}
static
void
lkSparse_run
(
oclMat
&
I
,
oclMat
&
J
,
const
oclMat
&
prevPts
,
oclMat
&
nextPts
,
oclMat
&
status
,
oclMat
&
err
,
bool
/*GET_MIN_EIGENVALS*/
,
int
ptcount
,
...
...
@@ -151,15 +97,7 @@ static void lkSparse_run(oclMat &I, oclMat &J,
size_t
localThreads
[
3
]
=
{
8
,
isImageSupported
?
8
:
32
,
1
};
size_t
globalThreads
[
3
]
=
{
8
*
ptcount
,
isImageSupported
?
8
:
32
,
1
};
int
cn
=
I
.
oclchannels
();
char
calcErr
;
if
(
level
==
0
)
{
calcErr
=
1
;
}
else
{
calcErr
=
0
;
}
char
calcErr
=
level
==
0
?
1
:
0
;
vector
<
pair
<
size_t
,
const
void
*>
>
args
;
...
...
@@ -198,7 +136,17 @@ static void lkSparse_run(oclMat &I, oclMat &J,
{
if
(
isImageSupported
)
{
openCLExecuteKernel
(
clCxt
,
&
pyrlk
,
kernelName
,
globalThreads
,
localThreads
,
args
,
I
.
oclchannels
(),
I
.
depth
());
stringstream
idxStr
;
idxStr
<<
kernelName
<<
"_C"
<<
I
.
oclchannels
()
<<
"_D"
<<
I
.
depth
();
cl_kernel
kernel
=
openCLGetKernelFromSource
(
clCxt
,
&
pyrlk
,
idxStr
.
str
());
int
wave_size
=
queryDeviceInfo
<
WAVEFRONT_SIZE
,
int
>
(
kernel
);
openCLSafeCall
(
clReleaseKernel
(
kernel
));
static
char
opt
[
16
]
=
{
0
};
sprintf
(
opt
,
" -D WAVE_SIZE=%d"
,
wave_size
);
openCLExecuteKernel
(
clCxt
,
&
pyrlk
,
kernelName
,
globalThreads
,
localThreads
,
args
,
I
.
oclchannels
(),
I
.
depth
(),
opt
);
releaseTexture
(
ITex
);
releaseTexture
(
JTex
);
}
...
...
@@ -241,8 +189,7 @@ void cv::ocl::PyrLKOpticalFlow::sparse(const oclMat &prevImg, const oclMat &next
oclMat
temp1
=
(
useInitialFlow
?
nextPts
:
prevPts
).
reshape
(
1
);
oclMat
temp2
=
nextPts
.
reshape
(
1
);
multiply_cus
(
temp1
,
temp2
,
1.0
f
/
(
1
<<
maxLevel
)
/
2.0
f
);
//::multiply(temp1, 1.0f / (1 << maxLevel) / 2.0f, temp2);
multiply
(
1.0
f
/
(
1
<<
maxLevel
)
/
2.0
f
,
temp1
,
temp2
);
ensureSizeIsEnough
(
1
,
prevPts
.
cols
,
CV_8UC1
,
status
);
status
.
setTo
(
Scalar
::
all
(
1
));
...
...
@@ -257,7 +204,6 @@ void cv::ocl::PyrLKOpticalFlow::sparse(const oclMat &prevImg, const oclMat &next
ensureSizeIsEnough
(
1
,
prevPts
.
cols
,
CV_32FC1
,
*
err
);
// build the image pyramids.
prevPyr_
.
resize
(
maxLevel
+
1
);
nextPyr_
.
resize
(
maxLevel
+
1
);
...
...
@@ -274,7 +220,6 @@ void cv::ocl::PyrLKOpticalFlow::sparse(const oclMat &prevImg, const oclMat &next
}
// dI/dx ~ Ix, dI/dy ~ Iy
for
(
int
level
=
maxLevel
;
level
>=
0
;
level
--
)
{
lkSparse_run
(
prevPyr_
[
level
],
nextPyr_
[
level
],
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment