Skip to content
Projects
Groups
Snippets
Help
Loading...
Sign in / Register
Toggle navigation
O
opencv
Project
Project
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Packages
Packages
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
submodule
opencv
Commits
b674cd85
Commit
b674cd85
authored
Dec 19, 2013
by
Andrey Pavlenko
Committed by
OpenCV Buildbot
Dec 19, 2013
Browse files
Options
Browse Files
Download
Plain Diff
Merge pull request #2007 from krodyush:pullreq/2.4-opt-131202-haar
parents
8c91a1af
917b883c
Show whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
61 additions
and
32 deletions
+61
-32
haar.cpp
modules/ocl/src/haar.cpp
+48
-19
haarobjectdetect.cl
modules/ocl/src/opencl/haarobjectdetect.cl
+13
-13
No files found.
modules/ocl/src/haar.cpp
View file @
b674cd85
...
...
@@ -866,16 +866,17 @@ CvSeq *cv::ocl::OclCascadeClassifier::oclHaarDetectObjects( oclMat &gimg, CvMemS
if
(
gcascade
->
is_stump_based
&&
gsum
.
clCxt
->
supportsFeature
(
FEATURE_CL_INTEL_DEVICE
))
{
//setup local group size
localThreads
[
0
]
=
8
;
localThreads
[
1
]
=
16
;
//setup local group size
for "pixel step" = 1
localThreads
[
0
]
=
16
;
localThreads
[
1
]
=
32
;
localThreads
[
2
]
=
1
;
//
init
maximal number of workgroups
//
calc
maximal number of workgroups
int
WGNumX
=
1
+
(
sizev
[
0
].
width
/
(
localThreads
[
0
]));
int
WGNumY
=
1
+
(
sizev
[
0
].
height
/
(
localThreads
[
1
]));
int
WGNumZ
=
loopcount
;
int
WGNum
=
0
;
//accurate number of non -empty workgroups
int
WGNumTotal
=
0
;
//accurate number of non-empty workgroups
int
WGNumSampled
=
0
;
//accurate number of workgroups processed only 1/4 part of all pixels. it is made for large images with scale <= 2
oclMat
oclWGInfo
(
1
,
sizeof
(
cl_int4
)
*
WGNumX
*
WGNumY
*
WGNumZ
,
CV_8U
);
{
cl_int4
*
pWGInfo
=
(
cl_int4
*
)
clEnqueueMapBuffer
(
getClCommandQueue
(
oclWGInfo
.
clCxt
),(
cl_mem
)
oclWGInfo
.
datastart
,
true
,
CL_MAP_WRITE
,
0
,
oclWGInfo
.
step
,
0
,
0
,
0
,
&
status
);
...
...
@@ -895,12 +896,16 @@ CvSeq *cv::ocl::OclCascadeClassifier::oclHaarDetectObjects( oclMat &gimg, CvMemS
if
(
gx
>=
(
Width
-
cascade
->
orig_window_size
.
width
))
continue
;
// no data to process
if
(
scaleinfo
[
z
].
factor
<=
2
)
{
WGNumSampled
++
;
}
// save no-empty workgroup info into array
pWGInfo
[
WGNum
].
s
[
0
]
=
scaleinfo
[
z
].
width_height
;
pWGInfo
[
WGNum
].
s
[
1
]
=
(
gx
<<
16
)
|
gy
;
pWGInfo
[
WGNum
].
s
[
2
]
=
scaleinfo
[
z
].
imgoff
;
memcpy
(
&
(
pWGInfo
[
WGNum
].
s
[
3
]),
&
(
scaleinfo
[
z
].
factor
),
sizeof
(
float
));
WGNum
++
;
pWGInfo
[
WGNum
Total
].
s
[
0
]
=
scaleinfo
[
z
].
width_height
;
pWGInfo
[
WGNum
Total
].
s
[
1
]
=
(
gx
<<
16
)
|
gy
;
pWGInfo
[
WGNum
Total
].
s
[
2
]
=
scaleinfo
[
z
].
imgoff
;
memcpy
(
&
(
pWGInfo
[
WGNum
Total
].
s
[
3
]),
&
(
scaleinfo
[
z
].
factor
),
sizeof
(
float
));
WGNum
Total
++
;
}
}
}
...
...
@@ -908,13 +913,8 @@ CvSeq *cv::ocl::OclCascadeClassifier::oclHaarDetectObjects( oclMat &gimg, CvMemS
pWGInfo
=
NULL
;
}
// setup global sizes to have linear array of workgroups with WGNum size
globalThreads
[
0
]
=
localThreads
[
0
]
*
WGNum
;
globalThreads
[
1
]
=
localThreads
[
1
];
globalThreads
[
2
]
=
1
;
#define NODE_SIZE 12
// pack node info to have less memory loads
// pack node info to have less memory loads
on the device side
oclMat
oclNodesPK
(
1
,
sizeof
(
cl_int
)
*
NODE_SIZE
*
nodenum
,
CV_8U
);
{
cl_int
status
;
...
...
@@ -963,8 +963,6 @@ CvSeq *cv::ocl::OclCascadeClassifier::oclHaarDetectObjects( oclMat &gimg, CvMemS
options
+=
format
(
" -D WND_SIZE_X=%d"
,
cascade
->
orig_window_size
.
width
);
options
+=
format
(
" -D WND_SIZE_Y=%d"
,
cascade
->
orig_window_size
.
height
);
options
+=
format
(
" -D STUMP_BASED=%d"
,
gcascade
->
is_stump_based
);
options
+=
format
(
" -D LSx=%d"
,
localThreads
[
0
]);
options
+=
format
(
" -D LSy=%d"
,
localThreads
[
1
]);
options
+=
format
(
" -D SPLITNODE=%d"
,
splitnode
);
options
+=
format
(
" -D SPLITSTAGE=%d"
,
splitstage
);
options
+=
format
(
" -D OUTPUTSZ=%d"
,
outputsz
);
...
...
@@ -972,8 +970,39 @@ CvSeq *cv::ocl::OclCascadeClassifier::oclHaarDetectObjects( oclMat &gimg, CvMemS
// init candiate global count by 0
int
pattern
=
0
;
openCLSafeCall
(
clEnqueueWriteBuffer
(
qu
,
candidatebuffer
,
1
,
0
,
1
*
sizeof
(
pattern
),
&
pattern
,
0
,
NULL
,
NULL
));
if
(
WGNumTotal
>
WGNumSampled
)
{
// small images and each pixel is processed
// setup global sizes to have linear array of workgroups with WGNum size
int
pixelstep
=
1
;
size_t
LS
[
3
]
=
{
localThreads
[
0
]
/
pixelstep
,
localThreads
[
1
]
/
pixelstep
,
1
};
globalThreads
[
0
]
=
LS
[
0
]
*
(
WGNumTotal
-
WGNumSampled
);
globalThreads
[
1
]
=
LS
[
1
];
globalThreads
[
2
]
=
1
;
string
options1
=
options
;
options1
+=
format
(
" -D PIXEL_STEP=%d"
,
pixelstep
);
options1
+=
format
(
" -D WGSTART=%d"
,
WGNumSampled
);
options1
+=
format
(
" -D LSx=%d"
,
LS
[
0
]);
options1
+=
format
(
" -D LSy=%d"
,
LS
[
1
]);
// execute face detector
openCLExecuteKernel
(
gsum
.
clCxt
,
&
haarobjectdetect
,
"gpuRunHaarClassifierCascadePacked"
,
globalThreads
,
LS
,
args
,
-
1
,
-
1
,
options1
.
c_str
());
}
if
(
WGNumSampled
>
0
)
{
// large images each 4th pixel is processed
// setup global sizes to have linear array of workgroups with WGNum size
int
pixelstep
=
2
;
size_t
LS
[
3
]
=
{
localThreads
[
0
]
/
pixelstep
,
localThreads
[
1
]
/
pixelstep
,
1
};
globalThreads
[
0
]
=
LS
[
0
]
*
WGNumSampled
;
globalThreads
[
1
]
=
LS
[
1
];
globalThreads
[
2
]
=
1
;
string
options2
=
options
;
options2
+=
format
(
" -D PIXEL_STEP=%d"
,
pixelstep
);
options2
+=
format
(
" -D WGSTART=%d"
,
0
);
options2
+=
format
(
" -D LSx=%d"
,
LS
[
0
]);
options2
+=
format
(
" -D LSy=%d"
,
LS
[
1
]);
// execute face detector
openCLExecuteKernel
(
gsum
.
clCxt
,
&
haarobjectdetect
,
"gpuRunHaarClassifierCascadePacked"
,
globalThreads
,
localThreads
,
args
,
-
1
,
-
1
,
options
.
c_str
());
openCLExecuteKernel
(
gsum
.
clCxt
,
&
haarobjectdetect
,
"gpuRunHaarClassifierCascadePacked"
,
globalThreads
,
LS
,
args
,
-
1
,
-
1
,
options2
.
c_str
());
}
//read candidate buffer back and put it into host list
openCLReadBuffer
(
gsum
.
clCxt
,
candidatebuffer
,
candidate
,
4
*
sizeof
(
int
)
*
outputsz
);
assert
(
candidate
[
0
]
<
outputsz
);
...
...
modules/ocl/src/opencl/haarobjectdetect.cl
View file @
b674cd85
...
...
@@ -126,13 +126,11 @@ __kernel void gpuRunHaarClassifierCascadePacked(
)
{
//
this
version
used
information
provided
for
each
workgroup
//
no
empty
WG
int
gid
=
(
int
)
get_group_id
(
0
)
;
int
lid_x
=
(
int
)
get_local_id
(
0
)
;
int
lid_y
=
(
int
)
get_local_id
(
1
)
;
int
lid
=
lid_y*LSx+lid_x
;
int4
WGInfo
=
pWGInfo[gid]
;
int4
WGInfo
=
pWGInfo[
WGSTART+
gid]
;
int
GroupX
=
(
WGInfo.y
>>
16
)
&0xFFFF
;
int
GroupY
=
(
WGInfo.y
>>
0
)
&
0xFFFF
;
int
Width
=
(
WGInfo.x
>>
16
)
&0xFFFF
;
...
...
@@ -140,8 +138,8 @@ __kernel void gpuRunHaarClassifierCascadePacked(
int
ImgOffset
=
WGInfo.z
;
float
ScaleFactor
=
as_float
(
WGInfo.w
)
;
#
define
DATA_SIZE_X
(
LSx+WND_SIZE_X
)
#
define
DATA_SIZE_Y
(
LSy+WND_SIZE_Y
)
#
define
DATA_SIZE_X
(
PIXEL_STEP*
LSx+WND_SIZE_X
)
#
define
DATA_SIZE_Y
(
PIXEL_STEP*
LSy+WND_SIZE_Y
)
#
define
DATA_SIZE
(
DATA_SIZE_X*DATA_SIZE_Y
)
local
int
SumL[DATA_SIZE]
;
...
...
@@ -165,9 +163,11 @@ __kernel void gpuRunHaarClassifierCascadePacked(
int4
info1
=
p
;
int4
info2
=
pq
;
{
int
xl
=
lid_x
;
int
yl
=
lid_y
;
//
calc
processed
ROI
coordinate
in
local
mem
int
xl
=
lid_x*PIXEL_STEP
;
int
yl
=
lid_y*PIXEL_STEP
;
{//
calc
variance_norm_factor
for
all
stages
int
OffsetLocal
=
yl
*
DATA_SIZE_X
+
xl
;
int
OffsetGlobal
=
(
GroupY+yl
)
*
pixelstep
+
(
GroupX+xl
)
;
...
...
@@ -194,13 +194,13 @@ __kernel void gpuRunHaarClassifierCascadePacked(
int
result
=
(
1.0f>0.0f
)
;
for
(
int
stageloop
=
start_stage
; (stageloop < end_stage) && result; stageloop++ )
{//
iterate
until
candidate
is
exist
{//
iterate
until
candidate
is
valid
float
stage_sum
=
0.0f
;
__global
GpuHidHaarStageClassifier*
stageinfo
=
(
__global
GpuHidHaarStageClassifier*
)
((
__global
uchar*
)
stagecascadeptr+stageloop*sizeof
(
GpuHidHaarStageClassifier
))
;
int
lcl_off
=
(
yl*DATA_SIZE_X
)
+
(
xl
)
;
int
stagecount
=
stageinfo->count
;
float
stagethreshold
=
stageinfo->threshold
;
int
lcl_off
=
(
lid_y*DATA_SIZE_X
)
+
(
lid_x
)
;
for
(
int
nodeloop
=
0
; nodeloop < stagecount; nodecounter++,nodeloop++ )
{
//
simple
macro
to
extract
shorts
from
int
...
...
@@ -212,7 +212,7 @@ __kernel void gpuRunHaarClassifierCascadePacked(
int4
n1
=
pN[1]
;
int4
n2
=
pN[2]
;
float
nodethreshold
=
as_float
(
n2.y
)
*
variance_norm_factor
;
//
calc
sum
of
intensity
pixels
according
to
node
information
//
calc
sum
of
intensity
pixels
according
to
classifier
node
information
float
classsum
=
(
SumL[M0
(
n0.x
)
+lcl_off]
-
SumL[M1
(
n0.x
)
+lcl_off]
-
SumL[M0
(
n0.y
)
+lcl_off]
+
SumL[M1
(
n0.y
)
+lcl_off]
)
*
as_float
(
n1.z
)
+
(
SumL[M0
(
n0.z
)
+lcl_off]
-
SumL[M1
(
n0.z
)
+lcl_off]
-
SumL[M0
(
n0.w
)
+lcl_off]
+
SumL[M1
(
n0.w
)
+lcl_off]
)
*
as_float
(
n1.w
)
+
...
...
@@ -228,8 +228,8 @@ __kernel void gpuRunHaarClassifierCascadePacked(
int
index
=
1+atomic_inc
((
volatile
global
int*
)
candidate
)
; //get index to write global data with face info
if
(
index<OUTPUTSZ
)
{
int
x
=
GroupX+
lid_x
;
int
y
=
GroupY+
lid_y
;
int
x
=
GroupX+
xl
;
int
y
=
GroupY+
yl
;
int4
candidate_result
;
candidate_result.x
=
convert_int_rtn
(
x*ScaleFactor
)
;
candidate_result.y
=
convert_int_rtn
(
y*ScaleFactor
)
;
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment