Skip to content
Projects
Groups
Snippets
Help
Loading...
Sign in / Register
Toggle navigation
O
opencv
Project
Project
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Packages
Packages
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
submodule
opencv
Commits
720eaf1e
Commit
720eaf1e
authored
Feb 23, 2013
by
yao
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
fix the haar kernel problems on Nvidia and Intel OCL
parent
e6dd4e84
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
208 additions
and
203 deletions
+208
-203
haar.cpp
modules/ocl/src/haar.cpp
+1
-1
haarobjectdetect.cl
modules/ocl/src/kernels/haarobjectdetect.cl
+207
-202
No files found.
modules/ocl/src/haar.cpp
View file @
720eaf1e
...
...
@@ -926,7 +926,7 @@ CvSeq *cv::ocl::OclCascadeClassifier::oclHaarDetectObjects( oclMat &gimg, CvMemS
if
(
gimg
.
cols
<
minSize
.
width
||
gimg
.
rows
<
minSize
.
height
)
CV_Error
(
CV_StsError
,
"Image too small"
);
if
(
(
flags
&
CV_HAAR_SCALE_IMAGE
)
&&
gimg
.
clCxt
->
impl
->
devName
.
find
(
"Intel(R) HD Graphics"
)
==
string
::
npos
)
if
(
(
flags
&
CV_HAAR_SCALE_IMAGE
)
)
{
CvSize
winSize0
=
cascade
->
orig_window_size
;
//float scalefactor = 1.1f;
...
...
modules/ocl/src/kernels/haarobjectdetect.cl
View file @
720eaf1e
...
...
@@ -9,6 +9,7 @@
//
Niko
Li,
newlife20080214@gmail.com
//
Wang
Weiyan,
wangweiyanster@gmail.com
//
Jia
Haipeng,
jiahaipeng95@gmail.com
//
Nathan,
liujun@multicorewareinc.com
//
Redistribution
and
use
in
source
and
binary
forms,
with
or
without
modification,
//
are
permitted
provided
that
the
following
conditions
are
met:
//
...
...
@@ -47,14 +48,14 @@ typedef float sqsumtype;
typedef
struct
__attribute__
((
aligned
(
128
)))
GpuHidHaarFeature
{
struct
__attribute__
((
aligned
(
32
)))
{
int
p0
__attribute__
((
aligned
(
4
)))
;
int
p1
__attribute__
((
aligned
(
4
)))
;
int
p2
__attribute__
((
aligned
(
4
)))
;
int
p3
__attribute__
((
aligned
(
4
)))
;
float
weight
__attribute__
((
aligned
(
4
)))
;
}
rect[CV_HAAR_FEATURE_MAX]
__attribute__
((
aligned
(
32
)))
;
{
int
p0
__attribute__
((
aligned
(
4
)))
;
int
p1
__attribute__
((
aligned
(
4
)))
;
int
p2
__attribute__
((
aligned
(
4
)))
;
int
p3
__attribute__
((
aligned
(
4
)))
;
float
weight
__attribute__
((
aligned
(
4
)))
;
}
rect[CV_HAAR_FEATURE_MAX]
__attribute__
((
aligned
(
32
)))
;
}
GpuHidHaarFeature
;
...
...
@@ -108,31 +109,31 @@ typedef struct __attribute__((aligned (64))) GpuHidHaarClassifierCascade
int
p2
__attribute__
((
aligned
(
4
)))
;
int
p3
__attribute__
((
aligned
(
4
)))
;
float
inv_window_area
__attribute__
((
aligned
(
4
)))
;
}GpuHidHaarClassifierCascade
;
}
GpuHidHaarClassifierCascade
;
__kernel
void
__attribute__
((
reqd_work_group_size
(
8
,
8
,
1
)))
gpuRunHaarClassifierCascade
(
//constant
GpuHidHaarClassifierCascade
*
cascade,
global
GpuHidHaarStageClassifier
*
stagecascadeptr,
global
int4
*
info,
global
GpuHidHaarTreeNode
*
nodeptr,
global
const
int
*
restrict
sum1,
global
const
float
*
restrict
sqsum1,
global
int4
*
candidate,
const
int
pixelstep,
const
int
loopcount,
const
int
start_stage,
const
int
split_stage,
const
int
end_stage,
const
int
startnode,
const
int
splitnode,
const
int4
p,
const
int4
pq,
const
float
correction
//const
int
width,
//const
int
height,
//const
int
grpnumperline,
//const
int
totalgrp
)
global
GpuHidHaarStageClassifier
*
stagecascadeptr,
global
int4
*
info,
global
GpuHidHaarTreeNode
*
nodeptr,
global
const
int
*
restrict
sum1,
global
const
float
*
restrict
sqsum1,
global
int4
*
candidate,
const
int
pixelstep,
const
int
loopcount,
const
int
start_stage,
const
int
split_stage,
const
int
end_stage,
const
int
startnode,
const
int
splitnode,
const
int4
p,
const
int4
pq,
const
float
correction
//const
int
width,
//const
int
height,
//const
int
grpnumperline,
//const
int
totalgrp
)
{
int
grpszx
=
get_local_size
(
0
)
;
int
grpszy
=
get_local_size
(
1
)
;
...
...
@@ -184,7 +185,7 @@ __kernel void __attribute__((reqd_work_group_size(8,8,1)))gpuRunHaarClassifierCa
__global
const
int
*
sum
=
sum1
+
imgoff
;
__global
const
float
*
sqsum
=
sqsum1
+
imgoff
;
for
(
int
grploop=grpidx
;
grploop<totalgrp;
grploop+=grpnumx)
for
(
int
grploop=grpidx
;
grploop<totalgrp;
grploop+=grpnumx)
{
int
grpidy
=
grploop
/
grpnumperline
;
int
grpidx
=
grploop
-
mul24
(
grpidy,
grpnumperline
)
;
...
...
@@ -195,7 +196,7 @@ __kernel void __attribute__((reqd_work_group_size(8,8,1)))gpuRunHaarClassifierCa
int
grpoffx
=
x-lclidx
;
int
grpoffy
=
y-lclidy
;
for
(
int
i=0
;
i<read_loop;
i++)
for
(
int
i=0
;
i<read_loop;
i++)
{
int
pos_id
=
mad24
(
i,lcl_sz,lcl_id
)
;
pos_id
=
pos_id
<
total_read
?
pos_id
:
0
;
...
...
@@ -234,15 +235,15 @@ __kernel void __attribute__((reqd_work_group_size(8,8,1)))gpuRunHaarClassifierCa
cascadeinfo1.x
+=lcl_off
;
cascadeinfo1.z
+=lcl_off
;
mean
=
(
lcldata[mad24
(
cascadeinfo1.y,readwidth,cascadeinfo1.x
)
]
-
lcldata[mad24
(
cascadeinfo1.y,readwidth,cascadeinfo1.z
)
]
-
lcldata[mad24
(
cascadeinfo1.w,readwidth,cascadeinfo1.x
)
]
+
lcldata[mad24
(
cascadeinfo1.w,readwidth,cascadeinfo1.z
)
]
)
*correction
;
lcldata[mad24
(
cascadeinfo1.w,readwidth,cascadeinfo1.x
)
]
+
lcldata[mad24
(
cascadeinfo1.w,readwidth,cascadeinfo1.z
)
]
)
*correction
;
int
p_offset
=
mad24
(
y,
pixelstep,
x
)
;
cascadeinfo2.x
+=p_offset
;
cascadeinfo2.z
+=p_offset
;
variance_norm_factor
=sqsum[mad24
(
cascadeinfo2.y,
pixelstep,
cascadeinfo2.x
)
]
-
sqsum[mad24
(
cascadeinfo2.y,
pixelstep,
cascadeinfo2.z
)
]
-
sqsum[mad24
(
cascadeinfo2.w,
pixelstep,
cascadeinfo2.x
)
]
+
sqsum[mad24
(
cascadeinfo2.w,
pixelstep,
cascadeinfo2.z
)
]
;
sqsum[mad24
(
cascadeinfo2.w,
pixelstep,
cascadeinfo2.x
)
]
+
sqsum[mad24
(
cascadeinfo2.w,
pixelstep,
cascadeinfo2.z
)
]
;
variance_norm_factor
=
variance_norm_factor
*
correction
-
mean
*
mean
;
variance_norm_factor
=
variance_norm_factor
>=0.f
?
sqrt
(
variance_norm_factor
)
:
1.f
;
...
...
@@ -270,19 +271,19 @@ __kernel void __attribute__((reqd_work_group_size(8,8,1)))gpuRunHaarClassifierCa
info2.z
+=lcl_off
;
float
classsum
=
(
lcldata[mad24
(
info1.y,readwidth,info1.x
)
]
-
lcldata[mad24
(
info1.y,readwidth,info1.z
)
]
-
lcldata[mad24
(
info1.w,readwidth,info1.x
)
]
+
lcldata[mad24
(
info1.w,readwidth,info1.z
)
]
)
*
w.x
;
lcldata[mad24
(
info1.w,readwidth,info1.x
)
]
+
lcldata[mad24
(
info1.w,readwidth,info1.z
)
]
)
*
w.x
;
classsum
+=
(
lcldata[mad24
(
info2.y,readwidth,info2.x
)
]
-
lcldata[mad24
(
info2.y,readwidth,info2.z
)
]
-
lcldata[mad24
(
info2.w,readwidth,info2.x
)
]
+
lcldata[mad24
(
info2.w,readwidth,info2.z
)
]
)
*
w.y
;
lcldata[mad24
(
info2.w,readwidth,info2.x
)
]
+
lcldata[mad24
(
info2.w,readwidth,info2.z
)
]
)
*
w.y
;
//if
((
info3.z
-
info3.x
)
&&
(
!stageinfo.z
))
//{
info3.x
+=lcl_off
;
info3.z
+=lcl_off
;
classsum
+=
(
lcldata[mad24
(
info3.y,readwidth,info3.x
)
]
-
lcldata[mad24
(
info3.y,readwidth,info3.z
)
]
-
lcldata[mad24
(
info3.w,readwidth,info3.x
)
]
+
lcldata[mad24
(
info3.w,readwidth,info3.z
)
]
)
*
w.z
;
info3.x
+=lcl_off
;
info3.z
+=lcl_off
;
classsum
+=
(
lcldata[mad24
(
info3.y,readwidth,info3.x
)
]
-
lcldata[mad24
(
info3.y,readwidth,info3.z
)
]
-
lcldata[mad24
(
info3.w,readwidth,info3.x
)
]
+
lcldata[mad24
(
info3.w,readwidth,info3.z
)
]
)
*
w.z
;
//}
stage_sum
+=
classsum
>=
nodethreshold
?
alpha2.y
:
alpha2.x
;
nodecounter++
;
...
...
@@ -299,12 +300,13 @@ __kernel void __attribute__((reqd_work_group_size(8,8,1)))gpuRunHaarClassifierCa
}
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
int
queuecount
=
lclcount[0]
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
nodecounter
=
splitnode
;
for
(
int
stageloop
=
split_stage
; stageloop< end_stage && queuecount>0;stageloop++)
for
(
int
stageloop
=
split_stage
; stageloop< end_stage && queuecount>0;
stageloop++)
{
//barrier
(
CLK_LOCAL_MEM_FENCE
)
;
//barrier
(
CLK_LOCAL_MEM_FENCE
)
;
//if
(
lcl_id
==
0
)
lclcount[0]=0
;
lclcount[0]=0
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
int2
stageinfo
=
*
(
global
int2*
)(
stagecascadeptr+stageloop
)
;
...
...
@@ -316,70 +318,73 @@ __kernel void __attribute__((reqd_work_group_size(8,8,1)))gpuRunHaarClassifierCa
int
lcl_compute_win_id
=
(
lcl_id
>>
(
6-perfscale
))
;
int
lcl_loops
=
(
stageinfo.x
+
lcl_compute_win
-1
)
>>
(
6-perfscale
)
;
int
lcl_compute_id
=
lcl_id
-
(
lcl_compute_win_id
<<
(
6-perfscale
))
;
for
(
int
queueloop=0
;
queueloop<queuecount_loop/* && lcl_compute_win_id < queuecount*/;
queueloop++)
for
(
int
queueloop=0
;
queueloop<queuecount_loop/* && lcl_compute_win_id < queuecount*/;
queueloop++)
{
float
stage_sum
=
0.f
;
int
temp_coord
=
lcloutindex[lcl_compute_win_id<<1]
;
float
variance_norm_factor
=
as_float
(
lcloutindex[
(
lcl_compute_win_id<<1
)
+1]
)
;
int
queue_pixel
=
mad24
(((
temp_coord
&
(
int
)
0xffff0000
)
>>16
)
,
readwidth,temp_coord
&
0xffff
)
;
//barrier
(
CLK_LOCAL_MEM_FENCE
)
;
if
(
lcl_compute_win_id
<
queuecount
)
{
int
tempnodecounter
=
lcl_compute_id
;
float
part_sum
=
0.f
;
for
(
int
lcl_loop=0
;lcl_loop<lcl_loops && tempnodecounter<stageinfo.x;lcl_loop++)
//barrier
(
CLK_LOCAL_MEM_FENCE
)
;
if
(
lcl_compute_win_id
<
queuecount
)
{
__global
GpuHidHaarTreeNode*
currentnodeptr
=
(
nodeptr
+
nodecounter
+
tempnodecounter
)
;
int4
info1
=
*
(
__global
int4*
)(
&
(
currentnodeptr->p[0][0]
))
;
int4
info2
=
*
(
__global
int4*
)(
&
(
currentnodeptr->p[1][0]
))
;
int4
info3
=
*
(
__global
int4*
)(
&
(
currentnodeptr->p[2][0]
))
;
float4
w
=
*
(
__global
float4*
)(
&
(
currentnodeptr->weight[0]
))
;
float2
alpha2
=
*
(
__global
float2*
)(
&
(
currentnodeptr->alpha[0]
))
;
float
nodethreshold
=
w.w
*
variance_norm_factor
;
int
tempnodecounter
=
lcl_compute_id
;
float
part_sum
=
0.f
;
for
(
int
lcl_loop=0
; lcl_loop<lcl_loops && tempnodecounter<stageinfo.x; lcl_loop++)
{
__global
GpuHidHaarTreeNode*
currentnodeptr
=
(
nodeptr
+
nodecounter
+
tempnodecounter
)
;
info1.x
+=queue_pixel
;
info1.z
+=queue_pixel
;
info2.x
+=queue_pixel
;
info2.z
+=queue_pixel
;
int4
info1
=
*
(
__global
int4*
)(
&
(
currentnodeptr->p[0][0]
))
;
int4
info2
=
*
(
__global
int4*
)(
&
(
currentnodeptr->p[1][0]
))
;
int4
info3
=
*
(
__global
int4*
)(
&
(
currentnodeptr->p[2][0]
))
;
float4
w
=
*
(
__global
float4*
)(
&
(
currentnodeptr->weight[0]
))
;
float2
alpha2
=
*
(
__global
float2*
)(
&
(
currentnodeptr->alpha[0]
))
;
float
nodethreshold
=
w.w
*
variance_norm_factor
;
float
classsum
=
(
lcldata[mad24
(
info1.y,readwidth,info1.x
)
]
-
lcldata[mad24
(
info1.y,readwidth,info1.z
)
]
-
lcldata[mad24
(
info1.w,readwidth,info1.x
)
]
+
lcldata[mad24
(
info1.w,readwidth,info1.z
)
]
)
*
w.x
;
info1.x
+=queue_pixel
;
info1.z
+=queue_pixel
;
info2.x
+=queue_pixel
;
info2.z
+=queue_pixel
;
float
classsum
=
(
lcldata[mad24
(
info1.y,readwidth,info1.x
)
]
-
lcldata[mad24
(
info1.y,readwidth,info1.z
)
]
-
lcldata[mad24
(
info1.w,readwidth,info1.x
)
]
+
lcldata[mad24
(
info1.w,readwidth,info1.z
)
]
)
*
w.x
;
classsum
+=
(
lcldata[mad24
(
info2.y,readwidth,info2.x
)
]
-
lcldata[mad24
(
info2.y,readwidth,info2.z
)
]
-
lcldata[mad24
(
info2.w,readwidth,info2.x
)
]
+
lcldata[mad24
(
info2.w,readwidth,info2.z
)
]
)
*
w.y
;
//if
((
info3.z
-
info3.x
)
&&
(
!stageinfo.z
))
//{
classsum
+=
(
lcldata[mad24
(
info2.y,readwidth,info2.x
)
]
-
lcldata[mad24
(
info2.y,readwidth,info2.z
)
]
-
lcldata[mad24
(
info2.w,readwidth,info2.x
)
]
+
lcldata[mad24
(
info2.w,readwidth,info2.z
)
]
)
*
w.y
;
//if
((
info3.z
-
info3.x
)
&&
(
!stageinfo.z
))
//{
info3.x
+=queue_pixel
;
info3.z
+=queue_pixel
;
classsum
+=
(
lcldata[mad24
(
info3.y,readwidth,info3.x
)
]
-
lcldata[mad24
(
info3.y,readwidth,info3.z
)
]
-
lcldata[mad24
(
info3.w,readwidth,info3.x
)
]
+
lcldata[mad24
(
info3.w,readwidth,info3.z
)
]
)
*
w.z
;
//}
part_sum
+=
classsum
>=
nodethreshold
?
alpha2.y
:
alpha2.x
;
tempnodecounter
+=lcl_compute_win
;
}//end
for
(
int
lcl_loop=0
;lcl_loop<lcl_loops;lcl_loop++)
partialsum[lcl_id]=part_sum
;
}
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
if
(
lcl_compute_win_id
<
queuecount
)
{
for
(
int
i=0
;i<lcl_compute_win && (lcl_compute_id==0);i++)
{
stage_sum
+=
partialsum[lcl_id+i]
;
lcldata[mad24
(
info3.w,readwidth,info3.x
)
]
+
lcldata[mad24
(
info3.w,readwidth,info3.z
)
]
)
*
w.z
;
//}
part_sum
+=
classsum
>=
nodethreshold
?
alpha2.y
:
alpha2.x
;
tempnodecounter
+=lcl_compute_win
;
}//end
for
(
int
lcl_loop=0
;lcl_loop<lcl_loops;lcl_loop++)
partialsum[lcl_id]=part_sum
;
}
if
(
stage_sum
>=
stagethreshold
&&
(
lcl_compute_id==0
))
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
if
(
lcl_compute_win_id
<
queuecount
)
{
int
queueindex
=
atomic_inc
(
lclcount
)
;
lcloutindex[queueindex<<1]
=
temp_coord
;
lcloutindex[
(
queueindex<<1
)
+1]
=
as_int
(
variance_norm_factor
)
;
for
(
int
i=0
; i<lcl_compute_win && (lcl_compute_id==0); i++)
{
stage_sum
+=
partialsum[lcl_id+i]
;
}
if
(
stage_sum
>=
stagethreshold
&&
(
lcl_compute_id==0
))
{
int
queueindex
=
atomic_inc
(
lclcount
)
;
lcloutindex[queueindex<<1]
=
temp_coord
;
lcloutindex[
(
queueindex<<1
)
+1]
=
as_int
(
variance_norm_factor
)
;
}
lcl_compute_win_id
+=
(
1<<perfscale
)
;
}
lcl_compute_win_id
+=
(
1<<perfscale
)
;
}
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
}//end
for
(
int
queueloop=0
;queueloop<queuecount_loop;queueloop++)
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
//
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
queuecount
=
lclcount[0]
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
nodecounter
+=
stageinfo.x
;
}//end
for
(
int
stageloop
=
splitstage
; stageloop< endstage && queuecount>0;stageloop++)
//barrier
(
CLK_LOCAL_MEM_FENCE
)
;
...
...
@@ -420,138 +425,138 @@ __kernel void __attribute__((reqd_work_group_size(8,8,1)))gpuRunHaarClassifierCa
/*
if
(
stagecascade->two_rects
)
{
#
pragma
unroll
for
(
n
=
0
; n < stagecascade->count; n++ )
{
t1
=
*
(
node
+
counter
)
;
t
=
t1.threshold
*
variance_norm_factor
;
classsum
=
calc_sum1
(
t1,p_offset,0
)
*
t1.weight[0]
;
/*
if
(
stagecascade->two_rects
)
{
#
pragma
unroll
for
(
n
=
0
; n < stagecascade->count; n++ )
{
t1
=
*
(
node
+
counter
)
;
t
=
t1.threshold
*
variance_norm_factor
;
classsum
=
calc_sum1
(
t1,p_offset,0
)
*
t1.weight[0]
;
classsum
+=
calc_sum1
(
t1,
p_offset,1
)
*
t1.weight[1]
;
stage_sum
+=
classsum
>=
t
?
t1.alpha[1]:t1.alpha[0]
;
classsum
+=
calc_sum1
(
t1,
p_offset,1
)
*
t1.weight[1]
;
stage_sum
+=
classsum
>=
t
?
t1.alpha[1]:t1.alpha[0]
;
counter++
;
}
}
else
{
#
pragma
unroll
for
(
n
=
0
; n < stagecascade->count; n++ )
{
t
=
node[counter].threshold*variance_norm_factor
;
classsum
=
calc_sum1
(
node[counter],p_offset,0
)
*
node[counter].weight[0]
;
classsum
+=
calc_sum1
(
node[counter],p_offset,1
)
*
node[counter].weight[1]
;
counter++
;
}
}
else
{
#
pragma
unroll
for
(
n
=
0
; n < stagecascade->count; n++ )
{
t
=
node[counter].threshold*variance_norm_factor
;
classsum
=
calc_sum1
(
node[counter],p_offset,0
)
*
node[counter].weight[0]
;
classsum
+=
calc_sum1
(
node[counter],p_offset,1
)
*
node[counter].weight[1]
;
if
(
node[counter].p0[2]
)
classsum
+=
calc_sum1
(
node[counter],p_offset,2
)
*
node[counter].weight[2]
;
if
(
node[counter].p0[2]
)
classsum
+=
calc_sum1
(
node[counter],p_offset,2
)
*
node[counter].weight[2]
;
stage_sum
+=
classsum
>=
t
?
node[counter].alpha[1]:node[counter].alpha[0]
;// modify
stage_sum
+=
classsum
>=
t
?
node[counter].alpha[1]:node[counter].alpha[0]
;// modify
counter++
;
}
}
*/
/*
counter++
;
}
}
*/
/*
__kernel
void
gpuRunHaarClassifierCascade_ScaleWindow
(
constant
GpuHidHaarClassifierCascade
*
_cascade,
global
GpuHidHaarStageClassifier
*
stagecascadeptr,
//global
GpuHidHaarClassifier
*
classifierptr,
global
GpuHidHaarTreeNode
*
nodeptr,
global
int
*
sum,
global
float
*
sqsum,
global
int
*
_candidate,
int
pixel_step,
int
cols,
int
rows,
int
start_stage,
int
end_stage,
//int
counts,
int
nodenum,
int
ystep,
int
detect_width,
//int
detect_height,
int
loopcount,
int
outputstep
)
//float
scalefactor
)
constant
GpuHidHaarClassifierCascade
*
_cascade,
global
GpuHidHaarStageClassifier
*
stagecascadeptr,
//global
GpuHidHaarClassifier
*
classifierptr,
global
GpuHidHaarTreeNode
*
nodeptr,
global
int
*
sum,
global
float
*
sqsum,
global
int
*
_candidate,
int
pixel_step,
int
cols,
int
rows,
int
start_stage,
int
end_stage,
//int
counts,
int
nodenum,
int
ystep,
int
detect_width,
//int
detect_height,
int
loopcount,
int
outputstep
)
//float
scalefactor
)
{
unsigned
int
x1
=
get_global_id
(
0
)
;
unsigned
int
y1
=
get_global_id
(
1
)
;
int
p_offset
;
int
m,
n
;
int
result
;
int
counter
;
float
mean,
variance_norm_factor
;
for
(
int
i=0
;i<loopcount;i++)
{
constant
GpuHidHaarClassifierCascade
*
cascade
=
_cascade
+
i
;
global
int
*
candidate
=
_candidate
+
i*outputstep
;
int
window_width
=
cascade->p1
-
cascade->p0
;
int
window_height
=
window_width
;
result
=
1
;
counter
=
0
;
unsigned
int
x
=
mul24
(
x1,ystep
)
;
unsigned
int
y
=
mul24
(
y1,ystep
)
;
if
((
x
<
cols
-
window_width
-
1
)
&&
(
y
<
rows
-
window_height
-1
))
{
global
GpuHidHaarStageClassifier
*stagecascade
=
stagecascadeptr
+cascade->count*i+
start_stage
;
//global
GpuHidHaarClassifier
*classifier
=
classifierptr
;
global
GpuHidHaarTreeNode
*node
=
nodeptr
+
nodenum*i
;
unsigned
int
x1
=
get_global_id
(
0
)
;
unsigned
int
y1
=
get_global_id
(
1
)
;
int
p_offset
;
int
m,
n
;
int
result
;
int
counter
;
float
mean,
variance_norm_factor
;
for
(
int
i=0
;i<loopcount;i++)
{
constant
GpuHidHaarClassifierCascade
*
cascade
=
_cascade
+
i
;
global
int
*
candidate
=
_candidate
+
i*outputstep
;
int
window_width
=
cascade->p1
-
cascade->p0
;
int
window_height
=
window_width
;
result
=
1
;
counter
=
0
;
unsigned
int
x
=
mul24
(
x1,ystep
)
;
unsigned
int
y
=
mul24
(
y1,ystep
)
;
if
((
x
<
cols
-
window_width
-
1
)
&&
(
y
<
rows
-
window_height
-1
))
{
global
GpuHidHaarStageClassifier
*stagecascade
=
stagecascadeptr
+cascade->count*i+
start_stage
;
//global
GpuHidHaarClassifier
*classifier
=
classifierptr
;
global
GpuHidHaarTreeNode
*node
=
nodeptr
+
nodenum*i
;
p_offset
=
mad24
(
y,
pixel_step,
x
)
;// modify
p_offset
=
mad24
(
y,
pixel_step,
x
)
;// modify
mean
=
(
*
(
sum
+
p_offset
+
(
int
)
cascade->p0
)
-
*
(
sum
+
p_offset
+
(
int
)
cascade->p1
)
-
*
(
sum
+
p_offset
+
(
int
)
cascade->p2
)
+
*
(
sum
+
p_offset
+
(
int
)
cascade->p3
))
*cascade->inv_window_area
;
mean
=
(
*
(
sum
+
p_offset
+
(
int
)
cascade->p0
)
-
*
(
sum
+
p_offset
+
(
int
)
cascade->p1
)
-
*
(
sum
+
p_offset
+
(
int
)
cascade->p2
)
+
*
(
sum
+
p_offset
+
(
int
)
cascade->p3
))
*cascade->inv_window_area
;
variance_norm_factor
=
*
(
sqsum
+
p_offset
+
cascade->p0
)
-
*
(
sqsum
+
cascade->p1
+
p_offset
)
-
*
(
sqsum
+
p_offset
+
cascade->p2
)
+
*
(
sqsum
+
cascade->p3
+
p_offset
)
;
variance_norm_factor
=
variance_norm_factor
*
cascade->inv_window_area
-
mean
*
mean
;
variance_norm_factor
=
variance_norm_factor
>=0.f
?
sqrt
(
variance_norm_factor
)
:
1
;//modify
variance_norm_factor
=
*
(
sqsum
+
p_offset
+
cascade->p0
)
-
*
(
sqsum
+
cascade->p1
+
p_offset
)
-
*
(
sqsum
+
p_offset
+
cascade->p2
)
+
*
(
sqsum
+
cascade->p3
+
p_offset
)
;
variance_norm_factor
=
variance_norm_factor
*
cascade->inv_window_area
-
mean
*
mean
;
variance_norm_factor
=
variance_norm_factor
>=0.f
?
sqrt
(
variance_norm_factor
)
:
1
;//modify
//
if
(
cascade->is_stump_based
)
//{
for
(
m
=
start_stage
; m < end_stage; m++ )
{
float
stage_sum
=
0.f
;
float
t
,
classsum
;
GpuHidHaarTreeNode
t1
;
//
if
(
cascade->is_stump_based
)
//{
for
(
m
=
start_stage
; m < end_stage; m++ )
{
float
stage_sum
=
0.f
;
float
t
,
classsum
;
GpuHidHaarTreeNode
t1
;
//#pragma
unroll
for
(
n
=
0
; n < stagecascade->count; n++ )
{
t1
=
*
(
node
+
counter
)
;
t
=
t1.threshold
*
variance_norm_factor
;
classsum
=
calc_sum1
(
t1,
p_offset
,
0
)
*
t1.weight[0]
+
calc_sum1
(
t1,
p_offset
,
1
)
*
t1.weight[1]
;
//#pragma
unroll
for
(
n
=
0
; n < stagecascade->count; n++ )
{
t1
=
*
(
node
+
counter
)
;
t
=
t1.threshold
*
variance_norm_factor
;
classsum
=
calc_sum1
(
t1,
p_offset
,
0
)
*
t1.weight[0]
+
calc_sum1
(
t1,
p_offset
,
1
)
*
t1.weight[1]
;
if
((
t1.p0[2]
)
&&
(
!stagecascade->two_rects
))
classsum
+=
calc_sum1
(
t1,
p_offset,
2
)
*
t1.weight[2]
;
if
((
t1.p0[2]
)
&&
(
!stagecascade->two_rects
))
classsum
+=
calc_sum1
(
t1,
p_offset,
2
)
*
t1.weight[2]
;
stage_sum
+=
classsum
>=
t
?
t1.alpha[1]
:
t1.alpha[0]
;// modify
counter++
;
}
stage_sum
+=
classsum
>=
t
?
t1.alpha[1]
:
t1.alpha[0]
;// modify
counter++
;
}
if
(
stage_sum
<
stagecascade->threshold
)
{
result
=
0
;
break
;
}
if
(
stage_sum
<
stagecascade->threshold
)
{
result
=
0
;
break
;
}
stagecascade++
;
stagecascade++
;
}
if
(
result
)
{
candidate[4
*
(
y1
*
detect_width
+
x1
)
]
=
x
;
candidate[4
*
(
y1
*
detect_width
+
x1
)
+
1]
=
y
;
candidate[4
*
(
y1
*
detect_width
+
x1
)
+2]
=
window_width
;
candidate[4
*
(
y1
*
detect_width
+
x1
)
+
3]
=
window_height
;
}
//}
}
}
}
if
(
result
)
{
candidate[4
*
(
y1
*
detect_width
+
x1
)
]
=
x
;
candidate[4
*
(
y1
*
detect_width
+
x1
)
+
1]
=
y
;
candidate[4
*
(
y1
*
detect_width
+
x1
)
+2]
=
window_width
;
candidate[4
*
(
y1
*
detect_width
+
x1
)
+
3]
=
window_height
;
}
//}
}
}
}
*/
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment