Skip to content
Projects
Groups
Snippets
Help
Loading...
Sign in / Register
Toggle navigation
O
opencv
Project
Project
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Packages
Packages
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
submodule
opencv
Commits
3dcddad8
Commit
3dcddad8
authored
Dec 05, 2013
by
Alexander Alekhin
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
ocl: added workaround into Haar kernels
parent
d8a4d3a2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
100 additions
and
87 deletions
+100
-87
haarobjectdetect.cl
modules/ocl/src/opencl/haarobjectdetect.cl
+47
-39
haarobjectdetect_scaled2.cl
modules/ocl/src/opencl/haarobjectdetect_scaled2.cl
+53
-48
No files found.
modules/ocl/src/opencl/haarobjectdetect.cl
View file @
3dcddad8
...
...
@@ -62,13 +62,13 @@ typedef struct __attribute__((aligned (128) )) GpuHidHaarTreeNode
GpuHidHaarTreeNode
;
typedef
struct
__attribute__
((
aligned
(
32
)))
GpuHidHaarClassifier
{
int
count
__attribute__
((
aligned
(
4
)))
;
GpuHidHaarTreeNode*
node
__attribute__
((
aligned
(
8
)))
;
float*
alpha
__attribute__
((
aligned
(
8
)))
;
}
GpuHidHaarClassifier
;
//
typedef
struct
__attribute__
((
aligned
(
32
)))
GpuHidHaarClassifier
//
{
//
int
count
__attribute__
((
aligned
(
4
)))
;
//
GpuHidHaarTreeNode*
node
__attribute__
((
aligned
(
8
)))
;
//
float*
alpha
__attribute__
((
aligned
(
8
)))
;
//
}
//
GpuHidHaarClassifier
;
typedef
struct
__attribute__
((
aligned
(
64
)))
GpuHidHaarStageClassifier
...
...
@@ -84,22 +84,22 @@ typedef struct __attribute__((aligned (64))) GpuHidHaarStageClassifier
GpuHidHaarStageClassifier
;
typedef
struct
__attribute__
((
aligned
(
64
)))
GpuHidHaarClassifierCascade
{
int
count
__attribute__
((
aligned
(
4
)))
;
int
is_stump_based
__attribute__
((
aligned
(
4
)))
;
int
has_tilted_features
__attribute__
((
aligned
(
4
)))
;
int
is_tree
__attribute__
((
aligned
(
4
)))
;
int
pq0
__attribute__
((
aligned
(
4
)))
;
int
pq1
__attribute__
((
aligned
(
4
)))
;
int
pq2
__attribute__
((
aligned
(
4
)))
;
int
pq3
__attribute__
((
aligned
(
4
)))
;
int
p0
__attribute__
((
aligned
(
4
)))
;
int
p1
__attribute__
((
aligned
(
4
)))
;
int
p2
__attribute__
((
aligned
(
4
)))
;
int
p3
__attribute__
((
aligned
(
4
)))
;
float
inv_window_area
__attribute__
((
aligned
(
4
)))
;
}
GpuHidHaarClassifierCascade
;
//
typedef
struct
__attribute__
((
aligned
(
64
)))
GpuHidHaarClassifierCascade
//
{
//
int
count
__attribute__
((
aligned
(
4
)))
;
//
int
is_stump_based
__attribute__
((
aligned
(
4
)))
;
//
int
has_tilted_features
__attribute__
((
aligned
(
4
)))
;
//
int
is_tree
__attribute__
((
aligned
(
4
)))
;
//
int
pq0
__attribute__
((
aligned
(
4
)))
;
//
int
pq1
__attribute__
((
aligned
(
4
)))
;
//
int
pq2
__attribute__
((
aligned
(
4
)))
;
//
int
pq3
__attribute__
((
aligned
(
4
)))
;
//
int
p0
__attribute__
((
aligned
(
4
)))
;
//
int
p1
__attribute__
((
aligned
(
4
)))
;
//
int
p2
__attribute__
((
aligned
(
4
)))
;
//
int
p3
__attribute__
((
aligned
(
4
)))
;
//
float
inv_window_area
__attribute__
((
aligned
(
4
)))
;
//
}
GpuHidHaarClassifierCascade
;
#
ifdef
PACKED_CLASSIFIER
...
...
@@ -196,10 +196,12 @@ __kernel void gpuRunHaarClassifierCascadePacked(
for
(
int
stageloop
=
start_stage
; (stageloop < end_stage) && result; stageloop++ )
{//
iterate
until
candidate
is
exist
float
stage_sum
=
0.0f
;
int2
stageinfo
=
*
(
global
int2*
)(
stagecascadeptr+stageloop
)
;
float
stagethreshold
=
as_float
(
stageinfo.y
)
;
__global
GpuHidHaarStageClassifier*
stageinfo
=
(
__global
GpuHidHaarStageClassifier*
)
((
__global
uchar*
)
stagecascadeptr+stageloop*sizeof
(
GpuHidHaarStageClassifier
))
;
int
stagecount
=
stageinfo->count
;
float
stagethreshold
=
stageinfo->threshold
;
int
lcl_off
=
(
lid_y*DATA_SIZE_X
)
+
(
lid_x
)
;
for
(
int
nodeloop
=
0
; nodeloop < stage
info.x
; nodecounter++,nodeloop++ )
for
(
int
nodeloop
=
0
; nodeloop < stage
count
; nodecounter++,nodeloop++ )
{
//
simple
macro
to
extract
shorts
from
int
#
define
M0
(
_t
)
((
_t
)
&0xFFFF
)
...
...
@@ -355,14 +357,17 @@ __kernel void __attribute__((reqd_work_group_size(8,8,1)))gpuRunHaarClassifierCa
variance_norm_factor
=
variance_norm_factor
*
correction
-
mean
*
mean
;
variance_norm_factor
=
variance_norm_factor
>=0.f
?
sqrt
(
variance_norm_factor
)
:
1.f
;
for
(
int
stageloop
=
start_stage
; (stageloop < split_stage)
&& result; stageloop++ )
for
(
int
stageloop
=
start_stage
; (stageloop < split_stage) && result; stageloop++ )
{
float
stage_sum
=
0.f
;
int2
stageinfo
=
*
(
global
int2*
)(
stagecascadeptr+stageloop
)
;
float
stagethreshold
=
as_float
(
stageinfo.y
)
;
for
(
int
nodeloop
=
0
; nodeloop < stageinfo.x; )
__global
GpuHidHaarStageClassifier*
stageinfo
=
(
__global
GpuHidHaarStageClassifier*
)
((
__global
uchar*
)
stagecascadeptr+stageloop*sizeof
(
GpuHidHaarStageClassifier
))
;
int
stagecount
=
stageinfo->count
;
float
stagethreshold
=
stageinfo->threshold
;
for
(
int
nodeloop
=
0
; nodeloop < stagecount; )
{
__global
GpuHidHaarTreeNode*
currentnodeptr
=
(
nodeptr
+
nodecounter
)
;
__global
GpuHidHaarTreeNode*
currentnodeptr
=
(
__global
GpuHidHaarTreeNode*
)
(((
__global
uchar*
)
nodeptr
)
+
nodecounter
*
sizeof
(
GpuHidHaarTreeNode
))
;
int4
info1
=
*
(
__global
int4*
)(
&
(
currentnodeptr->p[0][0]
))
;
int4
info2
=
*
(
__global
int4*
)(
&
(
currentnodeptr->p[1][0]
))
;
...
...
@@ -418,7 +423,7 @@ __kernel void __attribute__((reqd_work_group_size(8,8,1)))gpuRunHaarClassifierCa
#endif
}
result = (stage_sum >= stagethreshold);
result = (stage_sum >= stagethreshold)
? 1 : 0
;
}
if(factor < 2)
{
...
...
@@ -447,14 +452,17 @@ __kernel void __attribute__((reqd_work_group_size(8,8,1)))gpuRunHaarClassifierCa
lclcount[0]=0;
barrier(CLK_LOCAL_MEM_FENCE);
int2 stageinfo = *(global int2*)(stagecascadeptr+stageloop);
float stagethreshold = as_float(stageinfo.y);
//int2 stageinfo = *(global int2*)(stagecascadeptr+stageloop);
__global GpuHidHaarStageClassifier* stageinfo = (__global GpuHidHaarStageClassifier*)
((__global uchar*)stagecascadeptr+stageloop*sizeof(GpuHidHaarStageClassifier));
int stagecount = stageinfo->count;
float stagethreshold = stageinfo->threshold;
int perfscale = queuecount > 4 ? 3 : 2;
int queuecount_loop = (queuecount + (1<<perfscale)-1) >> perfscale;
int lcl_compute_win = lcl_sz >> perfscale;
int lcl_compute_win_id = (lcl_id >>(6-perfscale));
int lcl_loops = (stage
info.x
+ lcl_compute_win -1) >> (6-perfscale);
int lcl_loops = (stage
count
+ lcl_compute_win -1) >> (6-perfscale);
int lcl_compute_id = lcl_id - (lcl_compute_win_id << (6-perfscale));
for(int queueloop=0; queueloop<queuecount_loop; queueloop++)
{
...
...
@@ -469,10 +477,10 @@ __kernel void __attribute__((reqd_work_group_size(8,8,1)))gpuRunHaarClassifierCa
float part_sum = 0.f;
const int stump_factor = STUMP_BASED ? 1 : 2;
int root_offset = 0;
for(int lcl_loop=0; lcl_loop<lcl_loops && tempnodecounter<stage
info.x
;)
for(int lcl_loop=0; lcl_loop<lcl_loops && tempnodecounter<stage
count
;)
{
__global GpuHidHaarTreeNode* currentnodeptr =
nodeptr + (nodecounter + tempnodecounter) * stump_factor + root_offset
;
__global GpuHidHaarTreeNode* currentnodeptr =
(__global GpuHidHaarTreeNode*)
(((__global uchar*)nodeptr) + sizeof(GpuHidHaarTreeNode) * ((nodecounter + tempnodecounter) * stump_factor + root_offset))
;
int4 info1 = *(__global int4*)(&(currentnodeptr->p[0][0]));
int4 info2 = *(__global int4*)(&(currentnodeptr->p[1][0]));
...
...
@@ -549,7 +557,7 @@ __kernel void __attribute__((reqd_work_group_size(8,8,1)))gpuRunHaarClassifierCa
queuecount
=
lclcount[0]
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
nodecounter
+=
stage
info.x
;
nodecounter
+=
stage
count
;
}//end
for
(
int
stageloop
=
splitstage
; stageloop< endstage && queuecount>0;stageloop++)
if
(
lcl_id<queuecount
)
...
...
modules/ocl/src/opencl/haarobjectdetect_scaled2.cl
View file @
3dcddad8
...
...
@@ -59,13 +59,13 @@ typedef struct __attribute__((aligned(128))) GpuHidHaarTreeNode
int
right
__attribute__
((
aligned
(
4
)))
;
}
GpuHidHaarTreeNode
;
typedef
struct
__attribute__
((
aligned
(
32
)))
GpuHidHaarClassifier
{
int
count
__attribute__
((
aligned
(
4
)))
;
GpuHidHaarTreeNode
*node
__attribute__
((
aligned
(
8
)))
;
float
*alpha
__attribute__
((
aligned
(
8
)))
;
}
GpuHidHaarClassifier
;
//
typedef
struct
__attribute__
((
aligned
(
32
)))
GpuHidHaarClassifier
//
{
//
int
count
__attribute__
((
aligned
(
4
)))
;
//
GpuHidHaarTreeNode
*node
__attribute__
((
aligned
(
8
)))
;
//
float
*alpha
__attribute__
((
aligned
(
8
)))
;
//
}
//
GpuHidHaarClassifier
;
typedef
struct
__attribute__
((
aligned
(
64
)))
GpuHidHaarStageClassifier
{
int
count
__attribute__
((
aligned
(
4
)))
;
...
...
@@ -77,29 +77,29 @@ typedef struct __attribute__((aligned(64))) GpuHidHaarStageClassifier
int
reserved3
__attribute__
((
aligned
(
8
)))
;
}
GpuHidHaarStageClassifier
;
typedef
struct
__attribute__
((
aligned
(
64
)))
GpuHidHaarClassifierCascade
{
int
count
__attribute__
((
aligned
(
4
)))
;
int
is_stump_based
__attribute__
((
aligned
(
4
)))
;
int
has_tilted_features
__attribute__
((
aligned
(
4
)))
;
int
is_tree
__attribute__
((
aligned
(
4
)))
;
int
pq0
__attribute__
((
aligned
(
4
)))
;
int
pq1
__attribute__
((
aligned
(
4
)))
;
int
pq2
__attribute__
((
aligned
(
4
)))
;
int
pq3
__attribute__
((
aligned
(
4
)))
;
int
p0
__attribute__
((
aligned
(
4
)))
;
int
p1
__attribute__
((
aligned
(
4
)))
;
int
p2
__attribute__
((
aligned
(
4
)))
;
int
p3
__attribute__
((
aligned
(
4
)))
;
float
inv_window_area
__attribute__
((
aligned
(
4
)))
;
}
GpuHidHaarClassifierCascade
;
//
typedef
struct
__attribute__
((
aligned
(
64
)))
GpuHidHaarClassifierCascade
//
{
//
int
count
__attribute__
((
aligned
(
4
)))
;
//
int
is_stump_based
__attribute__
((
aligned
(
4
)))
;
//
int
has_tilted_features
__attribute__
((
aligned
(
4
)))
;
//
int
is_tree
__attribute__
((
aligned
(
4
)))
;
//
int
pq0
__attribute__
((
aligned
(
4
)))
;
//
int
pq1
__attribute__
((
aligned
(
4
)))
;
//
int
pq2
__attribute__
((
aligned
(
4
)))
;
//
int
pq3
__attribute__
((
aligned
(
4
)))
;
//
int
p0
__attribute__
((
aligned
(
4
)))
;
//
int
p1
__attribute__
((
aligned
(
4
)))
;
//
int
p2
__attribute__
((
aligned
(
4
)))
;
//
int
p3
__attribute__
((
aligned
(
4
)))
;
//
float
inv_window_area
__attribute__
((
aligned
(
4
)))
;
//
}
GpuHidHaarClassifierCascade
;
__kernel
void
gpuRunHaarClassifierCascade_scaled2
(
global
GpuHidHaarStageClassifier
*stagecascadeptr,
global
GpuHidHaarStageClassifier
*stagecascadeptr
_
,
global
int4
*info,
global
GpuHidHaarTreeNode
*nodeptr,
global
GpuHidHaarTreeNode
*nodeptr
_
,
global
const
int
*restrict
sum,
global
const
float
*restrict
sqsum,
global
const
float
*restrict
sqsum,
global
int4
*candidate,
const
int
rows,
const
int
cols,
...
...
@@ -132,8 +132,7 @@ __kernel void gpuRunHaarClassifierCascade_scaled2(
int
max_idx
=
rows
*
cols
-
1
;
for
(
int
scalei
=
0
; scalei < loopcount; scalei++)
{
int4
scaleinfo1
;
scaleinfo1
=
info[scalei]
;
int4
scaleinfo1
=
info[scalei]
;
int
grpnumperline
=
(
scaleinfo1.y
&
0xffff0000
)
>>
16
;
int
totalgrp
=
scaleinfo1.y
&
0xffff
;
float
factor
=
as_float
(
scaleinfo1.w
)
;
...
...
@@ -174,15 +173,18 @@ __kernel void gpuRunHaarClassifierCascade_scaled2(
for
(
int
stageloop
=
start_stage
; (stageloop < end_stage) && result; stageloop++)
{
float
stage_sum
=
0.f
;
int
stagecount
=
stagecascadeptr[stageloop].count
;
__global
GpuHidHaarStageClassifier*
stageinfo
=
(
__global
GpuHidHaarStageClassifier*
)
(((
__global
uchar*
)
stagecascadeptr_
)
+stageloop*sizeof
(
GpuHidHaarStageClassifier
))
;
int
stagecount
=
stageinfo->count
;
for
(
int
nodeloop
=
0
; nodeloop < stagecount;)
{
__global
GpuHidHaarTreeNode
*currentnodeptr
=
(
nodeptr
+
nodecounter
)
;
__global
GpuHidHaarTreeNode*
currentnodeptr
=
(
__global
GpuHidHaarTreeNode*
)
(((
__global
uchar*
)
nodeptr_
)
+
nodecounter
*
sizeof
(
GpuHidHaarTreeNode
))
;
int4
info1
=
*
(
__global
int4
*
)(
&
(
currentnodeptr->p[0][0]
))
;
int4
info2
=
*
(
__global
int4
*
)(
&
(
currentnodeptr->p[1][0]
))
;
int4
info3
=
*
(
__global
int4
*
)(
&
(
currentnodeptr->p[2][0]
))
;
float4
w
=
*
(
__global
float4
*
)(
&
(
currentnodeptr->weight[0]
))
;
float3
alpha3
=
*
(
__global
float3
*
)(
&
(
currentnodeptr->alpha[0]
))
;
float3
alpha3
=
*
(
__global
float3*
)(
&
(
currentnodeptr->alpha[0]
))
;
float
nodethreshold
=
w.w
*
variance_norm_factor
;
info1.x
+=
p_offset
;
...
...
@@ -204,7 +206,7 @@ __kernel void gpuRunHaarClassifierCascade_scaled2(
sum[clamp
(
mad24
(
info3.w,
step,
info3.x
)
,
0
,
max_idx
)
]
+
sum[clamp
(
mad24
(
info3.w,
step,
info3.z
)
,
0
,
max_idx
)
]
)
*
w.z
;
bool
passThres
=
classsum
>=
nodethreshold
;
bool
passThres
=
(
classsum
>=
nodethreshold
)
?
1
:
0
;
#
if
STUMP_BASED
stage_sum
+=
passThres
?
alpha3.y
:
alpha3.x
;
...
...
@@ -234,7 +236,8 @@ __kernel void gpuRunHaarClassifierCascade_scaled2(
}
#endif
}
result = (int)(stage_sum >= stagecascadeptr[stageloop].threshold);
result = (stage_sum >= stageinfo->threshold) ? 1 : 0;
}
barrier(CLK_LOCAL_MEM_FENCE);
...
...
@@ -281,11 +284,14 @@ __kernel void gpuRunHaarClassifierCascade_scaled2(
}
}
}
__kernel
void
gpuscaleclassifier
(
global
GpuHidHaarTreeNode
*orinode,
global
GpuHidHaarTreeNode
*newnode,
float
scale,
float
weight_scale,
int
nodenum
)
__kernel
void
gpuscaleclassifier
(
global
GpuHidHaarTreeNode
*orinode,
global
GpuHidHaarTreeNode
*newnode,
float
scale,
float
weight_scale,
const
int
nodenum
)
{
int
counter
=
get_global_id
(
0
)
;
const
int
counter
=
get_global_id
(
0
)
;
int
tr_x[3],
tr_y[3],
tr_h[3],
tr_w[3],
i
=
0
;
GpuHidHaarTreeNode
t1
=
*
(
orinode
+
counter
)
;
GpuHidHaarTreeNode
t1
=
*
(
__global
GpuHidHaarTreeNode*
)
(((
__global
uchar*
)
orinode
)
+
counter
*
sizeof
(
GpuHidHaarTreeNode
))
;
__global
GpuHidHaarTreeNode*
pNew
=
(
__global
GpuHidHaarTreeNode*
)
(((
__global
uchar*
)
newnode
)
+
(
counter
+
nodenum
)
*
sizeof
(
GpuHidHaarTreeNode
))
;
#
pragma
unroll
for
(
i
=
0
; i < 3; i++)
...
...
@@ -297,22 +303,21 @@ __kernel void gpuscaleclassifier(global GpuHidHaarTreeNode *orinode, global GpuH
}
t1.weight[0]
=
-
(
t1.weight[1]
*
tr_h[1]
*
tr_w[1]
+
t1.weight[2]
*
tr_h[2]
*
tr_w[2]
)
/
(
tr_h[0]
*
tr_w[0]
)
;
counter
+=
nodenum
;
#
pragma
unroll
for
(
i
=
0
; i < 3; i++)
{
newnode[counter].
p[i][0]
=
tr_x[i]
;
newnode[counter].
p[i][1]
=
tr_y[i]
;
newnode[counter].
p[i][2]
=
tr_x[i]
+
tr_w[i]
;
newnode[counter].
p[i][3]
=
tr_y[i]
+
tr_h[i]
;
newnode[counter].
weight[i]
=
t1.weight[i]
*
weight_scale
;
pNew->
p[i][0]
=
tr_x[i]
;
pNew->
p[i][1]
=
tr_y[i]
;
pNew->
p[i][2]
=
tr_x[i]
+
tr_w[i]
;
pNew->
p[i][3]
=
tr_y[i]
+
tr_h[i]
;
pNew->
weight[i]
=
t1.weight[i]
*
weight_scale
;
}
newnode[counter].
left
=
t1.left
;
newnode[counter].
right
=
t1.right
;
newnode[counter].
threshold
=
t1.threshold
;
newnode[counter].
alpha[0]
=
t1.alpha[0]
;
newnode[counter].
alpha[1]
=
t1.alpha[1]
;
newnode[counter].
alpha[2]
=
t1.alpha[2]
;
pNew->
left
=
t1.left
;
pNew->
right
=
t1.right
;
pNew->
threshold
=
t1.threshold
;
pNew->
alpha[0]
=
t1.alpha[0]
;
pNew->
alpha[1]
=
t1.alpha[1]
;
pNew->
alpha[2]
=
t1.alpha[2]
;
}
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment