Skip to content
Projects
Groups
Snippets
Help
Loading...
Sign in / Register
Toggle navigation
O
opencv
Project
Project
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Packages
Packages
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
submodule
opencv
Commits
ac8506db
Commit
ac8506db
authored
Aug 16, 2013
by
Roman Donchenko
Committed by
OpenCV Buildbot
Aug 16, 2013
Browse files
Options
Browse Files
Download
Plain Diff
Merge pull request #1210 from pengx17:2.4_ocl_surf_intel_fix
parents
35cb59bb
fd77a49e
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
255 additions
and
228 deletions
+255
-228
surf.cl
modules/nonfree/src/opencl/surf.cl
+255
-228
No files found.
modules/nonfree/src/opencl/surf.cl
View file @
ac8506db
...
...
@@ -16,6 +16,7 @@
//
//
@Authors
//
Peng
Xiao,
pengxiao@multicorewareinc.com
//
Sen
Liu,
swjtuls1987@126.com
//
//
Redistribution
and
use
in
source
and
binary
forms,
with
or
without
modification,
//
are
permitted
provided
that
the
following
conditions
are
met:
...
...
@@ -43,9 +44,6 @@
//
//M*/
#
pragma
OPENCL
EXTENSION
cl_khr_global_int32_base_atomics
:
enable
#
pragma
OPENCL
EXTENSION
cl_khr_local_int32_base_atomics
:
enable
//
specialized
for
non-image2d_t
supported
platform,
intel
HD4000,
for
example
#
ifdef
DISABLE_IMAGE2D
#
define
IMAGE_INT32
__global
uint
*
...
...
@@ -105,7 +103,7 @@ __constant sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAM
//
for
simple
haar
paatern
float
icvCalcHaarPatternSum_2
(
IMAGE_INT32
sumTex,
__constant
float
src[2][5]
,
__constant
float
2
*src
,
int
oldSize,
int
newSize,
int
y,
int
x,
...
...
@@ -116,21 +114,24 @@ float icvCalcHaarPatternSum_2(
F
d
=
0
;
#
pragma
unroll
for
(
int
k
=
0
; k < 2; ++k)
{
int
dx1
=
convert_int_rte
(
ratio
*
src[k][0]
)
;
int
dy1
=
convert_int_rte
(
ratio
*
src[k][1]
)
;
int
dx2
=
convert_int_rte
(
ratio
*
src[k][2]
)
;
int
dy2
=
convert_int_rte
(
ratio
*
src[k][3]
)
;
F
t
=
0
;
t
+=
read_sumTex
(
sumTex,
sampler,
(
int2
)(
x
+
dx1,
y
+
dy1
)
,
rows,
cols,
elemPerRow
)
;
t
-=
read_sumTex
(
sumTex,
sampler,
(
int2
)(
x
+
dx1,
y
+
dy2
)
,
rows,
cols,
elemPerRow
)
;
t
-=
read_sumTex
(
sumTex,
sampler,
(
int2
)(
x
+
dx2,
y
+
dy1
)
,
rows,
cols,
elemPerRow
)
;
t
+=
read_sumTex
(
sumTex,
sampler,
(
int2
)(
x
+
dx2,
y
+
dy2
)
,
rows,
cols,
elemPerRow
)
;
d
+=
t
*
src[k][4]
/
((
dx2
-
dx1
)
*
(
dy2
-
dy1
))
;
}
int2
dx1
=
convert_int2_rte
(
ratio
*
src[0]
)
;
int2
dy1
=
convert_int2_rte
(
ratio
*
src[1]
)
;
int2
dx2
=
convert_int2_rte
(
ratio
*
src[2]
)
;
int2
dy2
=
convert_int2_rte
(
ratio
*
src[3]
)
;
F
t
=
0
;
t
+=
read_sumTex
(
sumTex,
sampler,
(
int2
)(
x
+
dx1.x,
y
+
dy1.x
)
,
rows,
cols,
elemPerRow
)
;
t
-=
read_sumTex
(
sumTex,
sampler,
(
int2
)(
x
+
dx1.x,
y
+
dy2.x
)
,
rows,
cols,
elemPerRow
)
;
t
-=
read_sumTex
(
sumTex,
sampler,
(
int2
)(
x
+
dx2.x,
y
+
dy1.x
)
,
rows,
cols,
elemPerRow
)
;
t
+=
read_sumTex
(
sumTex,
sampler,
(
int2
)(
x
+
dx2.x,
y
+
dy2.x
)
,
rows,
cols,
elemPerRow
)
;
d
+=
t
*
src[4].x
/
((
dx2.x
-
dx1.x
)
*
(
dy2.x
-
dy1.x
))
;
t
=
0
;
t
+=
read_sumTex
(
sumTex,
sampler,
(
int2
)(
x
+
dx1.y,
y
+
dy1.y
)
,
rows,
cols,
elemPerRow
)
;
t
-=
read_sumTex
(
sumTex,
sampler,
(
int2
)(
x
+
dx1.y,
y
+
dy2.y
)
,
rows,
cols,
elemPerRow
)
;
t
-=
read_sumTex
(
sumTex,
sampler,
(
int2
)(
x
+
dx2.y,
y
+
dy1.y
)
,
rows,
cols,
elemPerRow
)
;
t
+=
read_sumTex
(
sumTex,
sampler,
(
int2
)(
x
+
dx2.y,
y
+
dy2.y
)
,
rows,
cols,
elemPerRow
)
;
d
+=
t
*
src[4].y
/
((
dx2.y
-
dx1.y
)
*
(
dy2.y
-
dy1.y
))
;
return
(
float
)
d
;
}
...
...
@@ -138,7 +139,7 @@ float icvCalcHaarPatternSum_2(
//
N
=
3
float
icvCalcHaarPatternSum_3
(
IMAGE_INT32
sumTex,
__constant
float
src[2][5]
,
__constant
float
4
*src
,
int
oldSize,
int
newSize,
int
y,
int
x,
...
...
@@ -149,21 +150,31 @@ float icvCalcHaarPatternSum_3(
F
d
=
0
;
#
pragma
unroll
for
(
int
k
=
0
; k < 3; ++k)
{
int
dx1
=
convert_int_rte
(
ratio
*
src[k][0]
)
;
int
dy1
=
convert_int_rte
(
ratio
*
src[k][1]
)
;
int
dx2
=
convert_int_rte
(
ratio
*
src[k][2]
)
;
int
dy2
=
convert_int_rte
(
ratio
*
src[k][3]
)
;
F
t
=
0
;
t
+=
read_sumTex
(
sumTex,
sampler,
(
int2
)(
x
+
dx1,
y
+
dy1
)
,
rows,
cols,
elemPerRow
)
;
t
-=
read_sumTex
(
sumTex,
sampler,
(
int2
)(
x
+
dx1,
y
+
dy2
)
,
rows,
cols,
elemPerRow
)
;
t
-=
read_sumTex
(
sumTex,
sampler,
(
int2
)(
x
+
dx2,
y
+
dy1
)
,
rows,
cols,
elemPerRow
)
;
t
+=
read_sumTex
(
sumTex,
sampler,
(
int2
)(
x
+
dx2,
y
+
dy2
)
,
rows,
cols,
elemPerRow
)
;
d
+=
t
*
src[k][4]
/
((
dx2
-
dx1
)
*
(
dy2
-
dy1
))
;
}
int4
dx1
=
convert_int4_rte
(
ratio
*
src[0]
)
;
int4
dy1
=
convert_int4_rte
(
ratio
*
src[1]
)
;
int4
dx2
=
convert_int4_rte
(
ratio
*
src[2]
)
;
int4
dy2
=
convert_int4_rte
(
ratio
*
src[3]
)
;
F
t
=
0
;
t
+=
read_sumTex
(
sumTex,
sampler,
(
int2
)(
x
+
dx1.x,
y
+
dy1.x
)
,
rows,
cols,
elemPerRow
)
;
t
-=
read_sumTex
(
sumTex,
sampler,
(
int2
)(
x
+
dx1.x,
y
+
dy2.x
)
,
rows,
cols,
elemPerRow
)
;
t
-=
read_sumTex
(
sumTex,
sampler,
(
int2
)(
x
+
dx2.x,
y
+
dy1.x
)
,
rows,
cols,
elemPerRow
)
;
t
+=
read_sumTex
(
sumTex,
sampler,
(
int2
)(
x
+
dx2.x,
y
+
dy2.x
)
,
rows,
cols,
elemPerRow
)
;
d
+=
t
*
src[4].x
/
((
dx2.x
-
dx1.x
)
*
(
dy2.x
-
dy1.x
))
;
t
=
0
;
t
+=
read_sumTex
(
sumTex,
sampler,
(
int2
)(
x
+
dx1.y,
y
+
dy1.y
)
,
rows,
cols,
elemPerRow
)
;
t
-=
read_sumTex
(
sumTex,
sampler,
(
int2
)(
x
+
dx1.y,
y
+
dy2.y
)
,
rows,
cols,
elemPerRow
)
;
t
-=
read_sumTex
(
sumTex,
sampler,
(
int2
)(
x
+
dx2.y,
y
+
dy1.y
)
,
rows,
cols,
elemPerRow
)
;
t
+=
read_sumTex
(
sumTex,
sampler,
(
int2
)(
x
+
dx2.y,
y
+
dy2.y
)
,
rows,
cols,
elemPerRow
)
;
d
+=
t
*
src[4].y
/
((
dx2.y
-
dx1.y
)
*
(
dy2.y
-
dy1.y
))
;
t
=
0
;
t
+=
read_sumTex
(
sumTex,
sampler,
(
int2
)(
x
+
dx1.z,
y
+
dy1.z
)
,
rows,
cols,
elemPerRow
)
;
t
-=
read_sumTex
(
sumTex,
sampler,
(
int2
)(
x
+
dx1.z,
y
+
dy2.z
)
,
rows,
cols,
elemPerRow
)
;
t
-=
read_sumTex
(
sumTex,
sampler,
(
int2
)(
x
+
dx2.z,
y
+
dy1.z
)
,
rows,
cols,
elemPerRow
)
;
t
+=
read_sumTex
(
sumTex,
sampler,
(
int2
)(
x
+
dx2.z,
y
+
dy2.z
)
,
rows,
cols,
elemPerRow
)
;
d
+=
t
*
src[4].z
/
((
dx2.z
-
dx1.z
)
*
(
dy2.z
-
dy1.z
))
;
return
(
float
)
d
;
}
...
...
@@ -171,7 +182,7 @@ float icvCalcHaarPatternSum_3(
//
N
=
4
float
icvCalcHaarPatternSum_4
(
IMAGE_INT32
sumTex,
__constant
float
src[2][5]
,
__constant
float
4
*src
,
int
oldSize,
int
newSize,
int
y,
int
x,
...
...
@@ -182,21 +193,38 @@ float icvCalcHaarPatternSum_4(
F
d
=
0
;
#
pragma
unroll
for
(
int
k
=
0
; k < 4; ++k)
{
int
dx1
=
convert_int_rte
(
ratio
*
src[k][0]
)
;
int
dy1
=
convert_int_rte
(
ratio
*
src[k][1]
)
;
int
dx2
=
convert_int_rte
(
ratio
*
src[k][2]
)
;
int
dy2
=
convert_int_rte
(
ratio
*
src[k][3]
)
;
F
t
=
0
;
t
+=
read_sumTex
(
sumTex,
sampler,
(
int2
)(
x
+
dx1,
y
+
dy1
)
,
rows,
cols,
elemPerRow
)
;
t
-=
read_sumTex
(
sumTex,
sampler,
(
int2
)(
x
+
dx1,
y
+
dy2
)
,
rows,
cols,
elemPerRow
)
;
t
-=
read_sumTex
(
sumTex,
sampler,
(
int2
)(
x
+
dx2,
y
+
dy1
)
,
rows,
cols,
elemPerRow
)
;
t
+=
read_sumTex
(
sumTex,
sampler,
(
int2
)(
x
+
dx2,
y
+
dy2
)
,
rows,
cols,
elemPerRow
)
;
d
+=
t
*
src[k][4]
/
((
dx2
-
dx1
)
*
(
dy2
-
dy1
))
;
}
int4
dx1
=
convert_int4_rte
(
ratio
*
src[0]
)
;
int4
dy1
=
convert_int4_rte
(
ratio
*
src[1]
)
;
int4
dx2
=
convert_int4_rte
(
ratio
*
src[2]
)
;
int4
dy2
=
convert_int4_rte
(
ratio
*
src[3]
)
;
F
t
=
0
;
t
+=
read_sumTex
(
sumTex,
sampler,
(
int2
)(
x
+
dx1.x,
y
+
dy1.x
)
,
rows,
cols,
elemPerRow
)
;
t
-=
read_sumTex
(
sumTex,
sampler,
(
int2
)(
x
+
dx1.x,
y
+
dy2.x
)
,
rows,
cols,
elemPerRow
)
;
t
-=
read_sumTex
(
sumTex,
sampler,
(
int2
)(
x
+
dx2.x,
y
+
dy1.x
)
,
rows,
cols,
elemPerRow
)
;
t
+=
read_sumTex
(
sumTex,
sampler,
(
int2
)(
x
+
dx2.x,
y
+
dy2.x
)
,
rows,
cols,
elemPerRow
)
;
d
+=
t
*
src[4].x
/
((
dx2.x
-
dx1.x
)
*
(
dy2.x
-
dy1.x
))
;
t
=
0
;
t
+=
read_sumTex
(
sumTex,
sampler,
(
int2
)(
x
+
dx1.y,
y
+
dy1.y
)
,
rows,
cols,
elemPerRow
)
;
t
-=
read_sumTex
(
sumTex,
sampler,
(
int2
)(
x
+
dx1.y,
y
+
dy2.y
)
,
rows,
cols,
elemPerRow
)
;
t
-=
read_sumTex
(
sumTex,
sampler,
(
int2
)(
x
+
dx2.y,
y
+
dy1.y
)
,
rows,
cols,
elemPerRow
)
;
t
+=
read_sumTex
(
sumTex,
sampler,
(
int2
)(
x
+
dx2.y,
y
+
dy2.y
)
,
rows,
cols,
elemPerRow
)
;
d
+=
t
*
src[4].y
/
((
dx2.y
-
dx1.y
)
*
(
dy2.y
-
dy1.y
))
;
t
=
0
;
t
+=
read_sumTex
(
sumTex,
sampler,
(
int2
)(
x
+
dx1.z,
y
+
dy1.z
)
,
rows,
cols,
elemPerRow
)
;
t
-=
read_sumTex
(
sumTex,
sampler,
(
int2
)(
x
+
dx1.z,
y
+
dy2.z
)
,
rows,
cols,
elemPerRow
)
;
t
-=
read_sumTex
(
sumTex,
sampler,
(
int2
)(
x
+
dx2.z,
y
+
dy1.z
)
,
rows,
cols,
elemPerRow
)
;
t
+=
read_sumTex
(
sumTex,
sampler,
(
int2
)(
x
+
dx2.z,
y
+
dy2.z
)
,
rows,
cols,
elemPerRow
)
;
d
+=
t
*
src[4].z
/
((
dx2.z
-
dx1.z
)
*
(
dy2.z
-
dy1.z
))
;
t
=
0
;
t
+=
read_sumTex
(
sumTex,
sampler,
(
int2
)(
x
+
dx1.w,
y
+
dy1.w
)
,
rows,
cols,
elemPerRow
)
;
t
-=
read_sumTex
(
sumTex,
sampler,
(
int2
)(
x
+
dx1.w,
y
+
dy2.w
)
,
rows,
cols,
elemPerRow
)
;
t
-=
read_sumTex
(
sumTex,
sampler,
(
int2
)(
x
+
dx2.w,
y
+
dy1.w
)
,
rows,
cols,
elemPerRow
)
;
t
+=
read_sumTex
(
sumTex,
sampler,
(
int2
)(
x
+
dx2.w,
y
+
dy2.w
)
,
rows,
cols,
elemPerRow
)
;
d
+=
t
*
src[4].w
/
((
dx2.w
-
dx1.w
)
*
(
dy2.w
-
dy1.w
))
;
return
(
float
)
d
;
}
...
...
@@ -204,9 +232,9 @@ float icvCalcHaarPatternSum_4(
////////////////////////////////////////////////////////////////////////
//
Hessian
__constant
float
c_DX
[3][5]
=
{
{0,
2
,
3
,
7
,
1},
{3,
2
,
6
,
7
,
-2},
{6,
2
,
9
,
7
,
1}
}
;
__constant
float
c_DY
[3][5]
=
{
{2,
0
,
7
,
3
,
1},
{2,
3
,
7
,
6
,
-2},
{2,
6
,
7
,
9
,
1}
}
;
__constant
float
c_DXY[4][5]
=
{
{1,
1
,
4
,
4
,
1},
{5,
1
,
8
,
4
,
-1},
{1,
5
,
4
,
8
,
-1},
{5,
5
,
8
,
8
,
1}
}
;
__constant
float
4
c_DX[5]
=
{
(
float4
)(
0
,
3
,
6
,
0
)
,
(
float4
)(
2
,
2
,
2
,
0
)
,
(
float4
)(
3
,
6
,
9
,
0
)
,
(
float4
)(
7
,
7
,
7
,
0
)
,
(
float4
)(
1
,
-2
,
1
,
0
)
}
;
__constant
float
4
c_DY[5]
=
{
(
float4
)(
2
,
2
,
2
,
0
)
,
(
float4
)(
0
,
3
,
6
,
0
)
,
(
float4
)(
7
,
7
,
7
,
0
)
,
(
float4
)(
3
,
6
,
9
,
0
)
,
(
float4
)(
1
,
-2
,
1
,
0
)
}
;
__constant
float
4
c_DXY[5]
=
{
(
float4
)(
1
,
5
,
1
,
5
)
,
(
float4
)(
1
,
1
,
5
,
5
)
,
(
float4
)(
4
,
8
,
4
,
8
)
,
(
float4
)(
4
,
4
,
8
,
8
)
,
(
float4
)(
1
,
-1
,
-1
,
1
)
}
;// Use integral image to calculate haar wavelets.
__inline
int
calcSize
(
int
octave,
int
layer
)
{
...
...
@@ -236,7 +264,7 @@ __kernel void icvCalcLayerDetAndTrace(
int
c_octave,
int
c_layer_rows,
int
sumTex_step
)
)
{
det_step
/=
sizeof
(
*det
)
;
trace_step
/=
sizeof
(
*trace
)
;
...
...
@@ -300,7 +328,7 @@ bool within_check(IMAGE_INT32 maskSumTex, int sum_i, int sum_j, int size, int ro
//
Non-maximal
suppression
to
further
filtering
the
candidates
from
previous
step
__kernel
void
icvFindMaximaInLayer_withmask
(
void
icvFindMaximaInLayer_withmask
(
__global
const
float
*
det,
__global
const
float
*
trace,
__global
int4
*
maxPosBuffer,
...
...
@@ -318,7 +346,7 @@ __kernel
float
c_hessianThreshold,
IMAGE_INT32
maskSumTex,
int
mask_step
)
)
{
volatile
__local
float
N9[768]
; // threads.x * threads.y * 3
...
...
@@ -347,26 +375,26 @@ __kernel
const
int
localLin
=
get_local_id
(
0
)
+
get_local_id
(
1
)
*
get_local_size
(
0
)
+
zoff
;
N9[localLin
-
zoff]
=
det[det_step
*
(
c_layer_rows
*
(
layer
-
1
)
+
min
(
max
(
i,
0
)
,
c_img_rows
-
1
))
//
y
+
min
(
max
(
j,
0
)
,
c_img_cols
-
1
)
]
; // x
(
c_layer_rows
*
(
layer
-
1
)
+
min
(
max
(
i,
0
)
,
c_img_rows
-
1
))
//
y
+
min
(
max
(
j,
0
)
,
c_img_cols
-
1
)
]
; // x
N9[localLin
]
=
det[det_step
*
(
c_layer_rows
*
(
layer
)
+
min
(
max
(
i,
0
)
,
c_img_rows
-
1
))
//
y
+
min
(
max
(
j,
0
)
,
c_img_cols
-
1
)
]
; // x
(
c_layer_rows
*
(
layer
)
+
min
(
max
(
i,
0
)
,
c_img_rows
-
1
))
//
y
+
min
(
max
(
j,
0
)
,
c_img_cols
-
1
)
]
; // x
N9[localLin
+
zoff]
=
det[det_step
*
(
c_layer_rows
*
(
layer
+
1
)
+
min
(
max
(
i,
0
)
,
c_img_rows
-
1
))
//
y
+
min
(
max
(
j,
0
)
,
c_img_cols
-
1
)
]
; // x
(
c_layer_rows
*
(
layer
+
1
)
+
min
(
max
(
i,
0
)
,
c_img_rows
-
1
))
//
y
+
min
(
max
(
j,
0
)
,
c_img_cols
-
1
)
]
; // x
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
if
(
i
<
c_layer_rows
-
margin
&&
j
<
c_layer_cols
-
margin
&&
get_local_id
(
0
)
>
0
&&
get_local_id
(
0
)
<
get_local_size
(
0
)
-
1
&&
get_local_id
(
1
)
>
0
&&
get_local_id
(
1
)
<
get_local_size
(
1
)
-
1
//
these
are
unnecessary
conditions
ported
from
CUDA
)
&&
j
<
c_layer_cols
-
margin
&&
get_local_id
(
0
)
>
0
&&
get_local_id
(
0
)
<
get_local_size
(
0
)
-
1
&&
get_local_id
(
1
)
>
0
&&
get_local_id
(
1
)
<
get_local_size
(
1
)
-
1
//
these
are
unnecessary
conditions
ported
from
CUDA
)
{
float
val0
=
N9[localLin]
;
...
...
@@ -382,34 +410,34 @@ __kernel
{
//
Check
to
see
if
we
have
a
max
(
in
its
26
neighbours
)
const
bool
condmax
=
val0
>
N9[localLin
-
1
-
get_local_size
(
0
)
-
zoff]
&&
val0
>
N9[localLin
-
get_local_size
(
0
)
-
zoff]
&&
val0
>
N9[localLin
+
1
-
get_local_size
(
0
)
-
zoff]
&&
val0
>
N9[localLin
-
1
-
zoff]
&&
val0
>
N9[localLin
-
zoff]
&&
val0
>
N9[localLin
+
1
-
zoff]
&&
val0
>
N9[localLin
-
1
+
get_local_size
(
0
)
-
zoff]
&&
val0
>
N9[localLin
+
get_local_size
(
0
)
-
zoff]
&&
val0
>
N9[localLin
+
1
+
get_local_size
(
0
)
-
zoff]
&&
val0
>
N9[localLin
-
1
-
get_local_size
(
0
)
]
&&
val0
>
N9[localLin
-
get_local_size
(
0
)
]
&&
val0
>
N9[localLin
+
1
-
get_local_size
(
0
)
]
&&
val0
>
N9[localLin
-
1
]
&&
val0
>
N9[localLin
+
1
]
&&
val0
>
N9[localLin
-
1
+
get_local_size
(
0
)
]
&&
val0
>
N9[localLin
+
get_local_size
(
0
)
]
&&
val0
>
N9[localLin
+
1
+
get_local_size
(
0
)
]
&&
val0
>
N9[localLin
-
1
-
get_local_size
(
0
)
+
zoff]
&&
val0
>
N9[localLin
-
get_local_size
(
0
)
+
zoff]
&&
val0
>
N9[localLin
+
1
-
get_local_size
(
0
)
+
zoff]
&&
val0
>
N9[localLin
-
1
+
zoff]
&&
val0
>
N9[localLin
+
zoff]
&&
val0
>
N9[localLin
+
1
+
zoff]
&&
val0
>
N9[localLin
-
1
+
get_local_size
(
0
)
+
zoff]
&&
val0
>
N9[localLin
+
get_local_size
(
0
)
+
zoff]
&&
val0
>
N9[localLin
+
1
+
get_local_size
(
0
)
+
zoff]
;
&&
val0
>
N9[localLin
-
get_local_size
(
0
)
-
zoff]
&&
val0
>
N9[localLin
+
1
-
get_local_size
(
0
)
-
zoff]
&&
val0
>
N9[localLin
-
1
-
zoff]
&&
val0
>
N9[localLin
-
zoff]
&&
val0
>
N9[localLin
+
1
-
zoff]
&&
val0
>
N9[localLin
-
1
+
get_local_size
(
0
)
-
zoff]
&&
val0
>
N9[localLin
+
get_local_size
(
0
)
-
zoff]
&&
val0
>
N9[localLin
+
1
+
get_local_size
(
0
)
-
zoff]
&&
val0
>
N9[localLin
-
1
-
get_local_size
(
0
)
]
&&
val0
>
N9[localLin
-
get_local_size
(
0
)
]
&&
val0
>
N9[localLin
+
1
-
get_local_size
(
0
)
]
&&
val0
>
N9[localLin
-
1
]
&&
val0
>
N9[localLin
+
1
]
&&
val0
>
N9[localLin
-
1
+
get_local_size
(
0
)
]
&&
val0
>
N9[localLin
+
get_local_size
(
0
)
]
&&
val0
>
N9[localLin
+
1
+
get_local_size
(
0
)
]
&&
val0
>
N9[localLin
-
1
-
get_local_size
(
0
)
+
zoff]
&&
val0
>
N9[localLin
-
get_local_size
(
0
)
+
zoff]
&&
val0
>
N9[localLin
+
1
-
get_local_size
(
0
)
+
zoff]
&&
val0
>
N9[localLin
-
1
+
zoff]
&&
val0
>
N9[localLin
+
zoff]
&&
val0
>
N9[localLin
+
1
+
zoff]
&&
val0
>
N9[localLin
-
1
+
get_local_size
(
0
)
+
zoff]
&&
val0
>
N9[localLin
+
get_local_size
(
0
)
+
zoff]
&&
val0
>
N9[localLin
+
1
+
get_local_size
(
0
)
+
zoff]
;
if
(
condmax
)
{
...
...
@@ -428,7 +456,7 @@ __kernel
}
__kernel
void
icvFindMaximaInLayer
(
void
icvFindMaximaInLayer
(
__global
float
*
det,
__global
float
*
trace,
__global
int4
*
maxPosBuffer,
...
...
@@ -444,7 +472,7 @@ __kernel
int
c_layer_cols,
int
c_max_candidates,
float
c_hessianThreshold
)
)
{
volatile
__local
float
N9[768]
; // threads.x * threads.y * 3
...
...
@@ -483,12 +511,12 @@ __kernel
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
if
(
i
<
c_layer_rows
-
margin
&&
j
<
c_layer_cols
-
margin
&&
get_local_id
(
0
)
>
0
&&
get_local_id
(
0
)
<
get_local_size
(
0
)
-
1
&&
get_local_id
(
1
)
>
0
&&
get_local_id
(
1
)
<
get_local_size
(
1
)
-
1
//
these
are
unnecessary
conditions
ported
from
CUDA
)
&&
j
<
c_layer_cols
-
margin
&&
get_local_id
(
0
)
>
0
&&
get_local_id
(
0
)
<
get_local_size
(
0
)
-
1
&&
get_local_id
(
1
)
>
0
&&
get_local_id
(
1
)
<
get_local_size
(
1
)
-
1
//
these
are
unnecessary
conditions
ported
from
CUDA
)
{
float
val0
=
N9[localLin]
;
if
(
val0
>
c_hessianThreshold
)
...
...
@@ -499,38 +527,38 @@ __kernel
//
Check
to
see
if
we
have
a
max
(
in
its
26
neighbours
)
const
bool
condmax
=
val0
>
N9[localLin
-
1
-
get_local_size
(
0
)
-
zoff]
&&
val0
>
N9[localLin
-
get_local_size
(
0
)
-
zoff]
&&
val0
>
N9[localLin
+
1
-
get_local_size
(
0
)
-
zoff]
&&
val0
>
N9[localLin
-
1
-
zoff]
&&
val0
>
N9[localLin
-
zoff]
&&
val0
>
N9[localLin
+
1
-
zoff]
&&
val0
>
N9[localLin
-
1
+
get_local_size
(
0
)
-
zoff]
&&
val0
>
N9[localLin
+
get_local_size
(
0
)
-
zoff]
&&
val0
>
N9[localLin
+
1
+
get_local_size
(
0
)
-
zoff]
&&
val0
>
N9[localLin
-
1
-
get_local_size
(
0
)
]
&&
val0
>
N9[localLin
-
get_local_size
(
0
)
]
&&
val0
>
N9[localLin
+
1
-
get_local_size
(
0
)
]
&&
val0
>
N9[localLin
-
1
]
&&
val0
>
N9[localLin
+
1
]
&&
val0
>
N9[localLin
-
1
+
get_local_size
(
0
)
]
&&
val0
>
N9[localLin
+
get_local_size
(
0
)
]
&&
val0
>
N9[localLin
+
1
+
get_local_size
(
0
)
]
&&
val0
>
N9[localLin
-
1
-
get_local_size
(
0
)
+
zoff]
&&
val0
>
N9[localLin
-
get_local_size
(
0
)
+
zoff]
&&
val0
>
N9[localLin
+
1
-
get_local_size
(
0
)
+
zoff]
&&
val0
>
N9[localLin
-
1
+
zoff]
&&
val0
>
N9[localLin
+
zoff]
&&
val0
>
N9[localLin
+
1
+
zoff]
&&
val0
>
N9[localLin
-
1
+
get_local_size
(
0
)
+
zoff]
&&
val0
>
N9[localLin
+
get_local_size
(
0
)
+
zoff]
&&
val0
>
N9[localLin
+
1
+
get_local_size
(
0
)
+
zoff]
;
&&
val0
>
N9[localLin
-
get_local_size
(
0
)
-
zoff]
&&
val0
>
N9[localLin
+
1
-
get_local_size
(
0
)
-
zoff]
&&
val0
>
N9[localLin
-
1
-
zoff]
&&
val0
>
N9[localLin
-
zoff]
&&
val0
>
N9[localLin
+
1
-
zoff]
&&
val0
>
N9[localLin
-
1
+
get_local_size
(
0
)
-
zoff]
&&
val0
>
N9[localLin
+
get_local_size
(
0
)
-
zoff]
&&
val0
>
N9[localLin
+
1
+
get_local_size
(
0
)
-
zoff]
&&
val0
>
N9[localLin
-
1
-
get_local_size
(
0
)
]
&&
val0
>
N9[localLin
-
get_local_size
(
0
)
]
&&
val0
>
N9[localLin
+
1
-
get_local_size
(
0
)
]
&&
val0
>
N9[localLin
-
1
]
&&
val0
>
N9[localLin
+
1
]
&&
val0
>
N9[localLin
-
1
+
get_local_size
(
0
)
]
&&
val0
>
N9[localLin
+
get_local_size
(
0
)
]
&&
val0
>
N9[localLin
+
1
+
get_local_size
(
0
)
]
&&
val0
>
N9[localLin
-
1
-
get_local_size
(
0
)
+
zoff]
&&
val0
>
N9[localLin
-
get_local_size
(
0
)
+
zoff]
&&
val0
>
N9[localLin
+
1
-
get_local_size
(
0
)
+
zoff]
&&
val0
>
N9[localLin
-
1
+
zoff]
&&
val0
>
N9[localLin
+
zoff]
&&
val0
>
N9[localLin
+
1
+
zoff]
&&
val0
>
N9[localLin
-
1
+
get_local_size
(
0
)
+
zoff]
&&
val0
>
N9[localLin
+
get_local_size
(
0
)
+
zoff]
&&
val0
>
N9[localLin
+
1
+
get_local_size
(
0
)
+
zoff]
;
if
(
condmax
)
{
int
ind
=
atomic_inc
(
maxCounter
)
;
int
ind
=
atomic_inc
(
maxCounter
)
;
if
(
ind
<
c_max_candidates
)
{
...
...
@@ -544,30 +572,30 @@ __kernel
}
//
solve
3x3
linear
system
Ax=b
for
floating
point
input
inline
bool
solve3x3_float
(
volatile
__local
const
float
A[3][3],
volatile
__local
const
float
b[3],
volatile
__local
float
x[3]
)
inline
bool
solve3x3_float
(
volatile
__local
const
float
4
*A,
volatile
__local
const
float
*b,
volatile
__local
float
*x
)
{
float
det
=
A[0]
[0]
*
(
A[1][1]
*
A[2][2]
-
A[1][2]
*
A[2][1]
)
-
A[0][1]
*
(
A[1][0]
*
A[2][2]
-
A[1][2]
*
A[2][0]
)
+
A[0][2]
*
(
A[1][0]
*
A[2][1]
-
A[1][1]
*
A[2][0]
)
;
float
det
=
A[0]
.x
*
(
A[1].y
*
A[2].z
-
A[1].z
*
A[2].y
)
-
A[0].y
*
(
A[1].x
*
A[2].z
-
A[1].z
*
A[2].x
)
+
A[0].z
*
(
A[1].x
*
A[2].y
-
A[1].y
*
A[2].x
)
;
if
(
det
!=
0
)
{
F
invdet
=
1.0
/
det
;
x[0]
=
invdet
*
(
b[0]
*
(
A[1][1]
*
A[2][2]
-
A[1][2]
*
A[2][1]
)
-
A[0][1]
*
(
b[1]
*
A[2][2]
-
A[1][2]
*
b[2]
)
+
A[0][2]
*
(
b[1]
*
A[2][1]
-
A[1][1]
*
b[2]
))
;
(
b[0]
*
(
A[1].y
*
A[2].z
-
A[1].z
*
A[2].y
)
-
A[0].y
*
(
b[1]
*
A[2].z
-
A[1].z
*
b[2]
)
+
A[0].z
*
(
b[1]
*
A[2].y
-
A[1].y
*
b[2]
))
;
x[1]
=
invdet
*
(
A[0][0]
*
(
b[1]
*
A[2][2]
-
A[1][2]
*
b[2]
)
-
b[0]
*
(
A[1][0]
*
A[2][2]
-
A[1][2]
*
A[2][0]
)
+
A[0][2]
*
(
A[1][0]
*
b[2]
-
b[1]
*
A[2][0]
))
;
(
A[0].x
*
(
b[1]
*
A[2].z
-
A[1].z
*
b[2]
)
-
b[0]
*
(
A[1].x
*
A[2].z
-
A[1].z
*
A[2].x
)
+
A[0].z
*
(
A[1].x
*
b[2]
-
b[1]
*
A[2].x
))
;
x[2]
=
invdet
*
(
A[0][0]
*
(
A[1][1]
*
b[2]
-
b[1]
*
A[2][1]
)
-
A[0][1]
*
(
A[1][0]
*
b[2]
-
b[1]
*
A[2][0]
)
+
b[0]
*
(
A[1][0]
*
A[2][1]
-
A[1][1]
*
A[2][0]
))
;
(
A[0].x
*
(
A[1].y
*
b[2]
-
b[1]
*
A[2].y
)
-
A[0].y
*
(
A[1].x
*
b[2]
-
b[1]
*
A[2].x
)
+
b[0]
*
(
A[1].x
*
A[2].y
-
A[1].y
*
A[2].x
))
;
return
true
;
}
...
...
@@ -586,7 +614,7 @@ inline bool solve3x3_float(volatile __local const float A[3][3], volatile __loc
////////////////////////////////////////////////////////////////////////
//
INTERPOLATION
__kernel
void
icvInterpolateKeypoint
(
void
icvInterpolateKeypoint
(
__global
const
float
*
det,
__global
const
int4
*
maxPosBuffer,
__global
float
*
keypoints,
...
...
@@ -598,7 +626,7 @@ __kernel
int
c_octave,
int
c_layer_rows,
int
c_max_features
)
)
{
det_step
/=
sizeof
(
*det
)
;
keypoints_step
/=
sizeof
(
*keypoints
)
;
...
...
@@ -632,26 +660,26 @@ __kernel
//ds
dD[2]
=
-0.5f
*
(
N9[2][1][1]
-
N9[0][1][1]
)
;
volatile
__local
float
H[3]
[3]
;
volatile
__local
float
4
H
[3]
;
//dxx
H[0]
[0]
=
N9[1][1][0]
-
2.0f
*
N9[1][1][1]
+
N9[1][1][2]
;
H[0]
.x
=
N9[1][1][0]
-
2.0f
*
N9[1][1][1]
+
N9[1][1][2]
;
//dxy
H[0]
[1]
=
0.25f
*
(
N9[1][2][2]
-
N9[1][2][0]
-
N9[1][0][2]
+
N9[1][0][0]
)
;
H[0]
.y
=
0.25f
*
(
N9[1][2][2]
-
N9[1][2][0]
-
N9[1][0][2]
+
N9[1][0][0]
)
;
//dxs
H[0]
[2]
=
0.25f
*
(
N9[2][1][2]
-
N9[2][1][0]
-
N9[0][1][2]
+
N9[0][1][0]
)
;
H[0]
.z
=
0.25f
*
(
N9[2][1][2]
-
N9[2][1][0]
-
N9[0][1][2]
+
N9[0][1][0]
)
;
//dyx
=
dxy
H[1]
[0]
=
H[0][1]
;
H[1]
.x
=
H[0].y
;
//dyy
H[1]
[1]
=
N9[1][0][1]
-
2.0f
*
N9[1][1][1]
+
N9[1][2][1]
;
H[1]
.y
=
N9[1][0][1]
-
2.0f
*
N9[1][1][1]
+
N9[1][2][1]
;
//dys
H[1]
[2]
=
0.25f
*
(
N9[2][2][1]
-
N9[2][0][1]
-
N9[0][2][1]
+
N9[0][0][1]
)
;
H[1]
.z
=
0.25f
*
(
N9[2][2][1]
-
N9[2][0][1]
-
N9[0][2][1]
+
N9[0][0][1]
)
;
//dsx
=
dxs
H[2]
[0]
=
H[0][2]
;
H[2]
.x
=
H[0].z
;
//dsy
=
dys
H[2]
[1]
=
H[1][2]
;
H[2]
.y
=
H[1].z
;
//dss
H[2]
[2]
=
N9[0][1][1]
-
2.0f
*
N9[1][1][1]
+
N9[2][1][1]
;
H[2]
.z
=
N9[0][1][1]
-
2.0f
*
N9[1][1][1]
+
N9[2][1][1]
;
volatile
__local
float
x[3]
;
...
...
@@ -689,7 +717,7 @@ __kernel
if
((
c_img_rows
+
1
)
>=
grad_wav_size
&&
(
c_img_cols
+
1
)
>=
grad_wav_size
)
{
//
Get
a
new
feature
index.
int
ind
=
atomic_inc
(
featureCounter
)
;
int
ind
=
atomic_inc
(
featureCounter
)
;
if
(
ind
<
c_max_features
)
{
...
...
@@ -716,31 +744,32 @@ __kernel
__constant
float
c_aptX[ORI_SAMPLES]
=
{-6,
-5
,
-5
,
-5
,
-5
,
-5
,
-5
,
-5
,
-4
,
-4
,
-4
,
-4
,
-4
,
-4
,
-4
,
-4
,
-4
,
-3
,
-3
,
-3
,
-3
,
-3
,
-3
,
-3
,
-3
,
-3
,
-3
,
-3
,
-2
,
-2
,
-2
,
-2
,
-2
,
-2
,
-2
,
-2
,
-2
,
-2
,
-2
,
-1
,
-1
,
-1
,
-1
,
-1
,
-1
,
-1
,
-1
,
-1
,
-1
,
-1
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
2
,
2
,
2
,
2
,
2
,
2
,
2
,
2
,
2
,
2
,
2
,
3
,
3
,
3
,
3
,
3
,
3
,
3
,
3
,
3
,
3
,
3
,
4
,
4
,
4
,
4
,
4
,
4
,
4
,
4
,
4
,
5
,
5
,
5
,
5
,
5
,
5
,
5
,
6}
;
__constant
float
c_aptY[ORI_SAMPLES]
=
{0,
-3
,
-2
,
-1
,
0
,
1
,
2
,
3
,
-4
,
-3
,
-2
,
-1
,
0
,
1
,
2
,
3
,
4
,
-5
,
-4
,
-3
,
-2
,
-1
,
0
,
1
,
2
,
3
,
4
,
5
,
-5
,
-4
,
-3
,
-2
,
-1
,
0
,
1
,
2
,
3
,
4
,
5
,
-5
,
-4
,
-3
,
-2
,
-1
,
0
,
1
,
2
,
3
,
4
,
5
,
-6
,
-5
,
-4
,
-3
,
-2
,
-1
,
0
,
1
,
2
,
3
,
4
,
5
,
6
,
-5
,
-4
,
-3
,
-2
,
-1
,
0
,
1
,
2
,
3
,
4
,
5
,
-5
,
-4
,
-3
,
-2
,
-1
,
0
,
1
,
2
,
3
,
4
,
5
,
-5
,
-4
,
-3
,
-2
,
-1
,
0
,
1
,
2
,
3
,
4
,
5
,
-4
,
-3
,
-2
,
-1
,
0
,
1
,
2
,
3
,
4
,
-3
,
-2
,
-1
,
0
,
1
,
2
,
3
,
0}
;
__constant
float
c_aptW[ORI_SAMPLES]
=
{0.001455130288377404f,
0.001707611023448408f,
0.002547456417232752f,
0.003238451667129993f,
0.0035081731621176f,
0.003238451667129993f,
0.002547456417232752f,
0.001707611023448408f,
0.002003900473937392f,
0.0035081731621176f,
0.005233579315245152f,
0.00665318313986063f,
0.00720730796456337f,
0.00665318313986063f,
0.005233579315245152f,
0.0035081731621176f,
0.002003900473937392f,
0.001707611023448408f,
0.0035081731621176f,
0.006141661666333675f,
0.009162282571196556f,
0.01164754293859005f,
0.01261763460934162f,
0.01164754293859005f,
0.009162282571196556f,
0.006141661666333675f,
0.0035081731621176f,
0.001707611023448408f,
0.002547456417232752f,
0.005233579315245152f,
0.009162282571196556f,
0.01366852037608624f,
0.01737609319388866f,
0.0188232995569706f,
0.01737609319388866f,
0.01366852037608624f,
0.009162282571196556f,
0.005233579315245152f,
0.002547456417232752f,
0.003238451667129993f,
0.00665318313986063f,
0.01164754293859005f,
0.01737609319388866f,
0.02208934165537357f,
0.02392910048365593f,
0.02208934165537357f,
0.01737609319388866f,
0.01164754293859005f,
0.00665318313986063f,
0.003238451667129993f,
0.001455130288377404f,
0.0035081731621176f,
0.00720730796456337f,
0.01261763460934162f,
0.0188232995569706f,
0.02392910048365593f,
0.02592208795249462f,
0.02392910048365593f,
0.0188232995569706f,
0.01261763460934162f,
0.00720730796456337f,
0.0035081731621176f,
0.001455130288377404f,
0.003238451667129993f,
0.00665318313986063f,
0.01164754293859005f,
0.01737609319388866f,
0.02208934165537357f,
0.02392910048365593f,
0.02208934165537357f,
0.01737609319388866f,
0.01164754293859005f,
0.00665318313986063f,
0.003238451667129993f,
0.002547456417232752f,
0.005233579315245152f,
0.009162282571196556f,
0.01366852037608624f,
0.01737609319388866f,
0.0188232995569706f,
0.01737609319388866f,
0.01366852037608624f,
0.009162282571196556f,
0.005233579315245152f,
0.002547456417232752f,
0.001707611023448408f,
0.0035081731621176f,
0.006141661666333675f,
0.009162282571196556f,
0.01164754293859005f,
0.01261763460934162f,
0.01164754293859005f,
0.009162282571196556f,
0.006141661666333675f,
0.0035081731621176f,
0.001707611023448408f,
0.002003900473937392f,
0.0035081731621176f,
0.005233579315245152f,
0.00665318313986063f,
0.00720730796456337f,
0.00665318313986063f,
0.005233579315245152f,
0.0035081731621176f,
0.002003900473937392f,
0.001707611023448408f,
0.002547456417232752f,
0.003238451667129993f,
0.0035081731621176f,
0.003238451667129993f,
0.002547456417232752f,
0.001707611023448408f,
0.001455130288377404f}
;
__constant
float
c_NX[2][5]
=
{{0,
0
,
2
,
4
,
-1},
{2,
0
,
4
,
4
,
1}}
;
__constant
float
c_NY[2][5]
=
{{0,
0
,
4
,
2
,
1},
{0,
2
,
4
,
4
,
-1}}
;
0.003238451667129993f,
0.002547456417232752f,
0.001707611023448408f,
0.002003900473937392f,
0.0035081731621176f,
0.005233579315245152f,
0.00665318313986063f,
0.00720730796456337f,
0.00665318313986063f,
0.005233579315245152f,
0.0035081731621176f,
0.002003900473937392f,
0.001707611023448408f,
0.0035081731621176f,
0.006141661666333675f,
0.009162282571196556f,
0.01164754293859005f,
0.01261763460934162f,
0.01164754293859005f,
0.009162282571196556f,
0.006141661666333675f,
0.0035081731621176f,
0.001707611023448408f,
0.002547456417232752f,
0.005233579315245152f,
0.009162282571196556f,
0.01366852037608624f,
0.01737609319388866f,
0.0188232995569706f,
0.01737609319388866f,
0.01366852037608624f,
0.009162282571196556f,
0.005233579315245152f,
0.002547456417232752f,
0.003238451667129993f,
0.00665318313986063f,
0.01164754293859005f,
0.01737609319388866f,
0.02208934165537357f,
0.02392910048365593f,
0.02208934165537357f,
0.01737609319388866f,
0.01164754293859005f,
0.00665318313986063f,
0.003238451667129993f,
0.001455130288377404f,
0.0035081731621176f,
0.00720730796456337f,
0.01261763460934162f,
0.0188232995569706f,
0.02392910048365593f,
0.02592208795249462f,
0.02392910048365593f,
0.0188232995569706f,
0.01261763460934162f,
0.00720730796456337f,
0.0035081731621176f,
0.001455130288377404f,
0.003238451667129993f,
0.00665318313986063f,
0.01164754293859005f,
0.01737609319388866f,
0.02208934165537357f,
0.02392910048365593f,
0.02208934165537357f,
0.01737609319388866f,
0.01164754293859005f,
0.00665318313986063f,
0.003238451667129993f,
0.002547456417232752f,
0.005233579315245152f,
0.009162282571196556f,
0.01366852037608624f,
0.01737609319388866f,
0.0188232995569706f,
0.01737609319388866f,
0.01366852037608624f,
0.009162282571196556f,
0.005233579315245152f,
0.002547456417232752f,
0.001707611023448408f,
0.0035081731621176f,
0.006141661666333675f,
0.009162282571196556f,
0.01164754293859005f,
0.01261763460934162f,
0.01164754293859005f,
0.009162282571196556f,
0.006141661666333675f,
0.0035081731621176f,
0.001707611023448408f,
0.002003900473937392f,
0.0035081731621176f,
0.005233579315245152f,
0.00665318313986063f,
0.00720730796456337f,
0.00665318313986063f,
0.005233579315245152f,
0.0035081731621176f,
0.002003900473937392f,
0.001707611023448408f,
0.002547456417232752f,
0.003238451667129993f,
0.0035081731621176f,
0.003238451667129993f,
0.002547456417232752f,
0.001707611023448408f,
0.001455130288377404f
}
;
__constant
float2
c_NX[5]
=
{
(
float2
)(
0
,
2
)
,
(
float2
)(
0
,
0
)
,
(
float2
)(
2
,
4
)
,
(
float2
)(
4
,
4
)
,
(
float2
)(
-1
,
1
)
}
;
__constant
float2
c_NY[5]
=
{
(
float2
)(
0
,
0
)
,
(
float2
)(
0
,
2
)
,
(
float2
)(
4
,
4
)
,
(
float2
)(
2
,
4
)
,
(
float2
)(
1
,
-1
)
}
;
void
reduce_32_sum
(
volatile
__local
float
*
data,
volatile
float*
partial_reduction,
int
tid
)
{
...
...
@@ -759,14 +788,14 @@ void reduce_32_sum(volatile __local float * data, volatile float* partial_reduc
if
(
tid
<
8
)
{
#
endif
data[tid]
=
*partial_reduction
=
op
(
partial_reduction,
data[tid
+
8
]
)
;
data[tid]
=
*partial_reduction
=
op
(
partial_reduction,
data[tid
+
8]
)
;
#
if
WAVE_SIZE
<
8
}
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
if
(
tid
<
4
)
{
#
endif
data[tid]
=
*partial_reduction
=
op
(
partial_reduction,
data[tid
+
4
]
)
;
data[tid]
=
*partial_reduction
=
op
(
partial_reduction,
data[tid
+
4]
)
;
#
if
WAVE_SIZE
<
4
}
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
...
...
@@ -787,14 +816,14 @@ void reduce_32_sum(volatile __local float * data, volatile float* partial_reduc
}
__kernel
void
icvCalcOrientation
(
void
icvCalcOrientation
(
IMAGE_INT32
sumTex,
__global
float
*
keypoints,
int
keypoints_step,
int
c_img_rows,
int
c_img_cols,
int
sum_step
)
)
{
keypoints_step
/=
sizeof
(
*keypoints
)
;
sum_step
/=
sizeof
(
uint
)
;
...
...
@@ -838,7 +867,7 @@ __kernel
const int y = convert_int_rte(featureY[get_group_id(0)] + c_aptY[tid] * s - margin);
if (y >= 0 && y < (c_img_rows + 1) - grad_wav_size &&
x >= 0 && x < (c_img_cols + 1) - grad_wav_size)
x >= 0 && x < (c_img_cols + 1) - grad_wav_size)
{
X = c_aptW[tid] * icvCalcHaarPatternSum_2(sumTex, c_NX, 4, grad_wav_size, y, x, c_img_rows, c_img_cols, sum_step);
Y = c_aptW[tid] * icvCalcHaarPatternSum_2(sumTex, c_NY, 4, grad_wav_size, y, x, c_img_rows, c_img_cols, sum_step);
...
...
@@ -934,11 +963,11 @@ __kernel
__kernel
void icvSetUpright(
void icvSetUpright(
__global float * keypoints,
int keypoints_step,
int nFeatures
)
)
{
keypoints_step /= sizeof(*keypoints);
__global float* featureDir = keypoints + ANGLE_ROW * keypoints_step;
...
...
@@ -988,7 +1017,7 @@ inline uchar readerGet(
IMAGE_INT8 src,
const float centerX, const float centerY, const float win_offset, const float cos_dir, const float sin_dir,
int i, int j, int rows, int cols, int elemPerRow
)
)
{
float pixel_x = centerX + (win_offset + j) * cos_dir + (win_offset + i) * sin_dir;
float pixel_y = centerY - (win_offset + j) * sin_dir + (win_offset + i) * cos_dir;
...
...
@@ -999,7 +1028,7 @@ inline float linearFilter(
IMAGE_INT8 src,
const float centerX, const float centerY, const float win_offset, const float cos_dir, const float sin_dir,
float y, float x, int rows, int cols, int elemPerRow
)
)
{
x -= 0.5f;
y -= 0.5f;
...
...
@@ -1028,9 +1057,9 @@ inline float linearFilter(
void calc_dx_dy(
IMAGE_INT8 imgTex,
volatile __local float
s_dx_bin[25]
,
volatile __local float
s_dy_bin[25]
,
volatile __local float
s_PATCH[6][6]
,
volatile __local float
*s_dx_bin
,
volatile __local float
*s_dy_bin
,
volatile __local float
*s_PATCH
,
__global const float* featureX,
__global const float* featureY,
__global const float* featureSize,
...
...
@@ -1038,7 +1067,7 @@ void calc_dx_dy(
int rows,
int cols,
int elemPerRow
)
)
{
const float centerX = featureX[get_group_id(0)];
const float centerY = featureY[get_group_id(0)];
...
...
@@ -1048,6 +1077,7 @@ void calc_dx_dy(
{
descriptor_dir = 0.0f;
}
descriptor_dir *= (float)(CV_PI_F / 180.0f);
/* The sampling intervals and wavelet sized for selecting an orientation
...
...
@@ -1074,7 +1104,7 @@ void calc_dx_dy(
const float icoo = ((float)yIndex / (PATCH_SZ + 1)) * win_size;
const float jcoo = ((float)xIndex / (PATCH_SZ + 1)) * win_size;
s_PATCH[get_local_id(1)
][
get_local_id(0)] = linearFilter(imgTex, centerX, centerY, win_offset, cos_dir, sin_dir, icoo, jcoo, rows, cols, elemPerRow);
s_PATCH[get_local_id(1)
* 6 +
get_local_id(0)] = linearFilter(imgTex, centerX, centerY, win_offset, cos_dir, sin_dir, icoo, jcoo, rows, cols, elemPerRow);
barrier(CLK_LOCAL_MEM_FENCE);
...
...
@@ -1085,17 +1115,17 @@ void calc_dx_dy(
const float dw = c_DW[yIndex * PATCH_SZ + xIndex];
const float vx = (
s_PATCH[get_local_id(1) ][
get_local_id(0) + 1] -
s_PATCH[get_local_id(1) ][
get_local_id(0) ] +
s_PATCH[get_local_id(1) + 1][
get_local_id(0) + 1] -
s_PATCH[get_local_id(1) + 1][
get_local_id(0) ])
* dw;
s_PATCH[ get_local_id(1) * 6 +
get_local_id(0) + 1] -
s_PATCH[ get_local_id(1) * 6 +
get_local_id(0) ] +
s_PATCH[(get_local_id(1) + 1) * 6 +
get_local_id(0) + 1] -
s_PATCH[(get_local_id(1) + 1) * 6 +
get_local_id(0) ])
* dw;
const float vy = (
s_PATCH[get_local_id(1) + 1][
get_local_id(0) ] -
s_PATCH[get_local_id(1) ][
get_local_id(0) ] +
s_PATCH[get_local_id(1) + 1][
get_local_id(0) + 1] -
s_PATCH[get_local_id(1) ][
get_local_id(0) + 1])
* dw;
s_PATCH[(get_local_id(1) + 1) * 6 +
get_local_id(0) ] -
s_PATCH[ get_local_id(1) * 6 +
get_local_id(0) ] +
s_PATCH[(get_local_id(1) + 1) * 6 +
get_local_id(0) + 1] -
s_PATCH[ get_local_id(1) * 6 +
get_local_id(0) + 1])
* dw;
s_dx_bin[tid] = vx;
s_dy_bin[tid] = vy;
}
...
...
@@ -1106,7 +1136,7 @@ void reduce_sum25(
volatile __local float* sdata3,
volatile __local float* sdata4,
int tid
)
)
{
#ifndef WAVE_SIZE
#define WAVE_SIZE 1
...
...
@@ -1125,11 +1155,8 @@ void reduce_sum25(
{
#endif
sdata1[tid] += sdata1[tid + 8];
sdata2[tid] += sdata2[tid + 8];
sdata3[tid] += sdata3[tid + 8];
sdata4[tid] += sdata4[tid + 8];
#if WAVE_SIZE < 8
}
...
...
@@ -1166,7 +1193,7 @@ void reduce_sum25(
}
__kernel
void compute_descriptors64(
void compute_descriptors64(
IMAGE_INT8 imgTex,
__global float * descriptors,
__global const float * keypoints,
...
...
@@ -1175,7 +1202,7 @@ __kernel
int rows,
int cols,
int img_step
)
)
{
descriptors_step /= sizeof(float);
keypoints_step /= sizeof(float);
...
...
@@ -1189,7 +1216,7 @@ __kernel
volatile __local float sdy[25];
volatile __local float sdxabs[25];
volatile __local float sdyabs[25];
volatile __local float s_PATCH[6
][
6];
volatile __local float s_PATCH[6
*
6];
calc_dx_dy(imgTex, sdx, sdy, s_PATCH, featureX, featureY, featureSize, featureDir, rows, cols, img_step);
barrier(CLK_LOCAL_MEM_FENCE);
...
...
@@ -1203,7 +1230,7 @@ __kernel
}
barrier(CLK_LOCAL_MEM_FENCE);
reduce_sum25(sdx, sdy, sdxabs, sdyabs, tid);
reduce_sum25(sdx, sdy, sdxabs, sdyabs, tid);
barrier(CLK_LOCAL_MEM_FENCE);
if (tid < 25)
...
...
@@ -1221,7 +1248,7 @@ __kernel
}
}
__kernel
void compute_descriptors128(
void compute_descriptors128(
IMAGE_INT8 imgTex,
__global float * descriptors,
__global float * keypoints,
...
...
@@ -1230,7 +1257,7 @@ __kernel
int rows,
int cols,
int img_step
)
)
{
descriptors_step /= sizeof(*descriptors);
keypoints_step /= sizeof(*keypoints);
...
...
@@ -1249,7 +1276,7 @@ __kernel
volatile __local float sd2[25];
volatile __local float sdabs1[25];
volatile __local float sdabs2[25];
volatile __local float s_PATCH[6
][
6];
volatile __local float s_PATCH[6
*
6];
calc_dx_dy(imgTex, sdx, sdy, s_PATCH, featureX, featureY, featureSize, featureDir, rows, cols, img_step);
barrier(CLK_LOCAL_MEM_FENCE);
...
...
@@ -1275,7 +1302,7 @@ __kernel
}
barrier(CLK_LOCAL_MEM_FENCE);
reduce_sum25(sd1, sd2, sdabs1, sdabs2, tid);
reduce_sum25(sd1, sd2, sdabs1, sdabs2, tid);
barrier(CLK_LOCAL_MEM_FENCE);
__global float* descriptors_block = descriptors + descriptors_step * get_group_id(0) + (get_group_id(1) << 3);
...
...
@@ -1306,8 +1333,7 @@ __kernel
}
}
barrier(CLK_LOCAL_MEM_FENCE);
reduce_sum25(sd1, sd2, sdabs1, sdabs2, tid);
reduce_sum25(sd1, sd2, sdabs1, sdabs2, tid);
barrier(CLK_LOCAL_MEM_FENCE);
if (tid < 25)
...
...
@@ -1322,11 +1348,13 @@ __kernel
}
}
}
void
reduce_sum128
(
volatile
__local
float*
smem,
int
tid
)
{
#
ifndef
WAVE_SIZE
#
define
WAVE_SIZE
1
#
endif
if
(
tid
<
64
)
{
smem[tid]
+=
smem[tid
+
64]
;
...
...
@@ -1374,6 +1402,8 @@ void reduce_sum128(volatile __local float* smem, int tid)
smem[tid]
+=
smem[tid
+
1]
;
}
}
void
reduce_sum64
(
volatile
__local
float*
smem,
int
tid
)
{
#
ifndef
WAVE_SIZE
...
...
@@ -1421,7 +1451,7 @@ void reduce_sum64(volatile __local float* smem, int tid)
}
__kernel
void
normalize_descriptors128
(
__global
float
*
descriptors,
int
descriptors_step
)
void
normalize_descriptors128
(
__global
float
*
descriptors,
int
descriptors_step
)
{
descriptors_step
/=
sizeof
(
*descriptors
)
;
//
no
need
for
thread
ID
...
...
@@ -1436,8 +1466,6 @@ __kernel
reduce_sum128
(
sqDesc,
get_local_id
(
0
))
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
//
compute
length
(
square
root
)
volatile
__local
float
len
;
if
(
get_local_id
(
0
)
==
0
)
...
...
@@ -1450,7 +1478,7 @@ __kernel
descriptor_base[get_local_id
(
0
)
]
=
lookup
/
len
;
}
__kernel
void
normalize_descriptors64
(
__global
float
*
descriptors,
int
descriptors_step
)
void
normalize_descriptors64
(
__global
float
*
descriptors,
int
descriptors_step
)
{
descriptors_step
/=
sizeof
(
*descriptors
)
;
//
no
need
for
thread
ID
...
...
@@ -1462,7 +1490,6 @@ __kernel
sqDesc[get_local_id
(
0
)
]
=
lookup
*
lookup
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
reduce_sum64
(
sqDesc,
get_local_id
(
0
))
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment