Skip to content
Projects
Groups
Snippets
Help
Loading...
Sign in / Register
Toggle navigation
O
opencv
Project
Project
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Packages
Packages
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
submodule
opencv
Commits
69a0b5dd
Commit
69a0b5dd
authored
Apr 15, 2013
by
yao
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
Add OclCascadeClassifierBuf interface
parent
abe2ea59
Hide whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
956 additions
and
499 deletions
+956
-499
ocl.hpp
modules/ocl/include/opencv2/ocl/ocl.hpp
+38
-0
haar.cpp
modules/ocl/src/haar.cpp
+587
-128
haarobjectdetect.cl
modules/ocl/src/opencl/haarobjectdetect.cl
+142
-176
haarobjectdetect_scaled2.cl
modules/ocl/src/opencl/haarobjectdetect_scaled2.cl
+141
-145
test_haar.cpp
modules/ocl/test/test_haar.cpp
+48
-50
No files found.
modules/ocl/include/opencv2/ocl/ocl.hpp
View file @
69a0b5dd
...
...
@@ -802,6 +802,44 @@ namespace cv
int
minNeighbors
,
int
flags
,
CvSize
minSize
=
cvSize
(
0
,
0
),
CvSize
maxSize
=
cvSize
(
0
,
0
));
};
class
CV_EXPORTS
OclCascadeClassifierBuf
:
public
cv
::
CascadeClassifier
{
public
:
OclCascadeClassifierBuf
()
:
m_flags
(
0
),
initialized
(
false
),
m_scaleFactor
(
0
),
buffers
(
NULL
)
{}
~
OclCascadeClassifierBuf
()
{}
void
detectMultiScale
(
oclMat
&
image
,
CV_OUT
std
::
vector
<
cv
::
Rect
>&
faces
,
double
scaleFactor
=
1.1
,
int
minNeighbors
=
3
,
int
flags
=
0
,
Size
minSize
=
Size
(),
Size
maxSize
=
Size
());
void
release
();
private
:
void
Init
(
const
int
rows
,
const
int
cols
,
double
scaleFactor
,
int
flags
,
const
int
outputsz
,
const
size_t
localThreads
[],
CvSize
minSize
,
CvSize
maxSize
);
void
CreateBaseBufs
(
const
int
datasize
,
const
int
totalclassifier
,
const
int
flags
,
const
int
outputsz
);
void
CreateFactorRelatedBufs
(
const
int
rows
,
const
int
cols
,
const
int
flags
,
const
double
scaleFactor
,
const
size_t
localThreads
[],
CvSize
minSize
,
CvSize
maxSize
);
void
GenResult
(
CV_OUT
std
::
vector
<
cv
::
Rect
>&
faces
,
const
std
::
vector
<
cv
::
Rect
>
&
rectList
,
const
std
::
vector
<
int
>
&
rweights
);
int
m_rows
;
int
m_cols
;
int
m_flags
;
int
m_loopcount
;
int
m_nodenum
;
bool
findBiggestObject
;
bool
initialized
;
double
m_scaleFactor
;
Size
m_minSize
;
Size
m_maxSize
;
vector
<
CvSize
>
sizev
;
vector
<
float
>
scalev
;
oclMat
gimg1
,
gsum
,
gsqsum
;
void
*
buffers
;
};
/////////////////////////////// Pyramid /////////////////////////////////////
...
...
modules/ocl/src/haar.cpp
View file @
69a0b5dd
...
...
@@ -20,6 +20,7 @@
// Jia Haipeng, jiahaipeng95@gmail.com
// Wu Xinglong, wxl370@126.com
// Wang Yao, bitwangyaoyao@gmail.com
// Sen Liu, swjtuls1987@126.com
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
...
...
@@ -842,15 +843,13 @@ static void gpuSetHaarClassifierCascade( CvHaarClassifierCascade *_cascade
}
/* j */
}
}
CvSeq
*
cv
::
ocl
::
OclCascadeClassifier
::
oclHaarDetectObjects
(
oclMat
&
gimg
,
CvMemStorage
*
storage
,
double
scaleFactor
,
int
minNeighbors
,
int
flags
,
CvSize
minSize
,
CvSize
maxSize
)
{
CvHaarClassifierCascade
*
cascade
=
oldCascade
;
//double alltime = (double)cvGetTickCount();
//double t = (double)cvGetTickCount();
const
double
GROUP_EPS
=
0.2
;
oclMat
gtemp
,
gsum1
,
gtilted1
,
gsqsum1
,
gnormImg
,
gsumcanny
;
CvSeq
*
result_seq
=
0
;
cv
::
Ptr
<
CvMemStorage
>
temp_storage
;
...
...
@@ -861,7 +860,6 @@ CvSeq *cv::ocl::OclCascadeClassifier::oclHaarDetectObjects( oclMat &gimg, CvMemS
int
datasize
=
0
;
int
totalclassifier
=
0
;
//void *out;
GpuHidHaarClassifierCascade
*
gcascade
;
GpuHidHaarStageClassifier
*
stage
;
GpuHidHaarClassifier
*
classifier
;
...
...
@@ -870,11 +868,8 @@ CvSeq *cv::ocl::OclCascadeClassifier::oclHaarDetectObjects( oclMat &gimg, CvMemS
int
*
candidate
;
cl_int
status
;
// bool doCannyPruning = (flags & CV_HAAR_DO_CANNY_PRUNING) != 0;
bool
findBiggestObject
=
(
flags
&
CV_HAAR_FIND_BIGGEST_OBJECT
)
!=
0
;
// bool roughSearch = (flags & CV_HAAR_DO_ROUGH_SEARCH) != 0;
//double t = 0;
if
(
maxSize
.
height
==
0
||
maxSize
.
width
==
0
)
{
maxSize
.
height
=
gimg
.
rows
;
...
...
@@ -896,27 +891,20 @@ CvSeq *cv::ocl::OclCascadeClassifier::oclHaarDetectObjects( oclMat &gimg, CvMemS
if
(
findBiggestObject
)
flags
&=
~
CV_HAAR_SCALE_IMAGE
;
//gtemp = oclMat( gimg.rows, gimg.cols, CV_8UC1);
//gsum1 = oclMat( gimg.rows + 1, gimg.cols + 1, CV_32SC1 );
//gsqsum1 = oclMat( gimg.rows + 1, gimg.cols + 1, CV_32FC1 );
if
(
!
cascade
->
hid_cascade
)
/*out = (void *)*/
gpuCreateHidHaarClassifierCascade
(
cascade
,
&
datasize
,
&
totalclassifier
);
if
(
cascade
->
hid_cascade
->
has_tilted_features
)
gtilted1
=
oclMat
(
gimg
.
rows
+
1
,
gimg
.
cols
+
1
,
CV_32SC1
);
gpuCreateHidHaarClassifierCascade
(
cascade
,
&
datasize
,
&
totalclassifier
);
result_seq
=
cvCreateSeq
(
0
,
sizeof
(
CvSeq
),
sizeof
(
CvAvgComp
),
storage
);
if
(
CV_MAT_CN
(
gimg
.
type
())
>
1
)
{
oclMat
gtemp
;
cvtColor
(
gimg
,
gtemp
,
CV_BGR2GRAY
);
gimg
=
gtemp
;
}
if
(
findBiggestObject
)
flags
&=
~
(
CV_HAAR_SCALE_IMAGE
|
CV_HAAR_DO_CANNY_PRUNING
);
//t = (double)cvGetTickCount() - t;
//printf( "before if time = %g ms\n", t/((double)cvGetTickFrequency()*1000.) );
if
(
gimg
.
cols
<
minSize
.
width
||
gimg
.
rows
<
minSize
.
height
)
CV_Error
(
CV_StsError
,
"Image too small"
);
...
...
@@ -924,12 +912,9 @@ CvSeq *cv::ocl::OclCascadeClassifier::oclHaarDetectObjects( oclMat &gimg, CvMemS
if
(
(
flags
&
CV_HAAR_SCALE_IMAGE
)
)
{
CvSize
winSize0
=
cascade
->
orig_window_size
;
//float scalefactor = 1.1f;
//float factor = 1.f;
int
totalheight
=
0
;
int
indexy
=
0
;
CvSize
sz
;
//t = (double)cvGetTickCount();
vector
<
CvSize
>
sizev
;
vector
<
float
>
scalev
;
for
(
factor
=
1.
f
;;
factor
*=
scaleFactor
)
...
...
@@ -950,20 +935,15 @@ CvSeq *cv::ocl::OclCascadeClassifier::oclHaarDetectObjects( oclMat &gimg, CvMemS
sizev
.
push_back
(
sz
);
scalev
.
push_back
(
factor
);
}
//int flag = 0;
oclMat
gimg1
(
gimg
.
rows
,
gimg
.
cols
,
CV_8UC1
);
oclMat
gsum
(
totalheight
+
4
,
gimg
.
cols
+
1
,
CV_32SC1
);
oclMat
gsqsum
(
totalheight
+
4
,
gimg
.
cols
+
1
,
CV_32FC1
);
//cl_mem cascadebuffer;
cl_mem
stagebuffer
;
//cl_mem classifierbuffer;
cl_mem
nodebuffer
;
cl_mem
candidatebuffer
;
cl_mem
scaleinfobuffer
;
//cl_kernel kernel;
//kernel = openCLGetKernelFromSource(gimg.clCxt, &haarobjectdetect, "gpuRunHaarClassifierCascade");
cv
::
Rect
roi
,
roi2
;
cv
::
Mat
imgroi
,
imgroisq
;
cv
::
ocl
::
oclMat
resizeroi
,
gimgroi
,
gimgroisq
;
...
...
@@ -971,18 +951,13 @@ CvSeq *cv::ocl::OclCascadeClassifier::oclHaarDetectObjects( oclMat &gimg, CvMemS
size_t
blocksize
=
8
;
size_t
localThreads
[
3
]
=
{
blocksize
,
blocksize
,
1
};
size_t
globalThreads
[
3
]
=
{
grp_per_CU
*
((
gsum
.
clCxt
)
->
computeUnits
()
)
*
localThreads
[
0
],
size_t
globalThreads
[
3
]
=
{
grp_per_CU
*
gsum
.
clCxt
->
computeUnits
(
)
*
localThreads
[
0
],
localThreads
[
1
],
1
};
int
outputsz
=
256
*
globalThreads
[
0
]
/
localThreads
[
0
];
int
loopcount
=
sizev
.
size
();
detect_piramid_info
*
scaleinfo
=
(
detect_piramid_info
*
)
malloc
(
sizeof
(
detect_piramid_info
)
*
loopcount
);
//t = (double)cvGetTickCount() - t;
// printf( "pre time = %g ms\n", t/((double)cvGetTickFrequency()*1000.) );
//int *it =scaleinfo;
// t = (double)cvGetTickCount();
for
(
int
i
=
0
;
i
<
loopcount
;
i
++
)
{
sz
=
sizev
[
i
];
...
...
@@ -992,7 +967,6 @@ CvSeq *cv::ocl::OclCascadeClassifier::oclHaarDetectObjects( oclMat &gimg, CvMemS
resizeroi
=
gimg1
(
roi2
);
gimgroi
=
gsum
(
roi
);
gimgroisq
=
gsqsum
(
roi
);
//scaleinfo[i].rows = gimgroi.rows;
int
width
=
gimgroi
.
cols
-
1
-
cascade
->
orig_window_size
.
width
;
int
height
=
gimgroi
.
rows
-
1
-
cascade
->
orig_window_size
.
height
;
scaleinfo
[
i
].
width_height
=
(
width
<<
16
)
|
height
;
...
...
@@ -1000,76 +974,40 @@ CvSeq *cv::ocl::OclCascadeClassifier::oclHaarDetectObjects( oclMat &gimg, CvMemS
int
grpnumperline
=
(
width
+
localThreads
[
0
]
-
1
)
/
localThreads
[
0
];
int
totalgrp
=
((
height
+
localThreads
[
1
]
-
1
)
/
localThreads
[
1
])
*
grpnumperline
;
//outputsz +=width*height;
scaleinfo
[
i
].
grpnumperline_totalgrp
=
(
grpnumperline
<<
16
)
|
totalgrp
;
scaleinfo
[
i
].
imgoff
=
gimgroi
.
offset
>>
2
;
scaleinfo
[
i
].
factor
=
factor
;
//printf("rows = %d,ystep = %d,width = %d,height = %d,grpnumperline = %d,totalgrp = %d,imgoff = %d,factor = %f\n",
// scaleinfo[i].rows,scaleinfo[i].ystep,scaleinfo[i].width,scaleinfo[i].height,scaleinfo[i].grpnumperline,
// scaleinfo[i].totalgrp,scaleinfo[i].imgoff,scaleinfo[i].factor);
cv
::
ocl
::
resize
(
gimg
,
resizeroi
,
Size
(
sz
.
width
-
1
,
sz
.
height
-
1
),
0
,
0
,
INTER_LINEAR
);
//cv::imwrite("D:\\1.jpg",gimg1);
cv
::
ocl
::
integral
(
resizeroi
,
gimgroi
,
gimgroisq
);
//cv::ocl::oclMat chk(sz.height,sz.width,CV_32SC1),chksq(sz.height,sz.width,CV_32FC1);
//cv::ocl::integral(gimg1, chk, chksq);
//double r = cv::norm(chk,gimgroi,NORM_INF);
//if(r > std::numeric_limits<double>::epsilon())
//{
// printf("failed");
//}
indexy
+=
sz
.
height
;
}
//int ystep = factor > 2 ? 1 : 2;
// t = (double)cvGetTickCount() - t;
//printf( "resize integral time = %g ms\n", t/((double)cvGetTickFrequency()*1000.) );
//t = (double)cvGetTickCount();
gcascade
=
(
GpuHidHaarClassifierCascade
*
)
cascade
->
hid_cascade
;
stage
=
(
GpuHidHaarStageClassifier
*
)(
gcascade
+
1
);
classifier
=
(
GpuHidHaarClassifier
*
)(
stage
+
gcascade
->
count
);
node
=
(
GpuHidHaarTreeNode
*
)(
classifier
->
node
);
//int m,n;
//m = (gsum.cols - 1 - cascade->orig_window_size.width + ystep - 1)/ystep;
//n = (gsum.rows - 1 - cascade->orig_window_size.height + ystep - 1)/ystep;
//int counter = m*n;
int
nodenum
=
(
datasize
-
sizeof
(
GpuHidHaarClassifierCascade
)
-
sizeof
(
GpuHidHaarStageClassifier
)
*
gcascade
->
count
-
sizeof
(
GpuHidHaarClassifier
)
*
totalclassifier
)
/
sizeof
(
GpuHidHaarTreeNode
);
//if(flag == 0){
candidate
=
(
int
*
)
malloc
(
4
*
sizeof
(
int
)
*
outputsz
);
//memset((char*)candidate,0,4*sizeof(int)*outputsz);
gpuSetImagesForHaarClassifierCascade
(
cascade
,
/* &sum1, &sqsum1, _tilted,*/
1.
,
gsum
.
step
/
4
);
//cascadebuffer = clCreateBuffer(gsum.clCxt->clContext,CL_MEM_READ_ONLY,sizeof(GpuHidHaarClassifierCascade),NULL,&status);
//openCLVerifyCall(status);
//openCLSafeCall(clEnqueueWriteBuffer(gsum.clCxt->clCmdQueue,cascadebuffer,1,0,sizeof(GpuHidHaarClassifierCascade),gcascade,0,NULL,NULL));
gpuSetImagesForHaarClassifierCascade
(
cascade
,
1.
,
gsum
.
step
/
4
);
stagebuffer
=
openCLCreateBuffer
(
gsum
.
clCxt
,
CL_MEM_READ_ONLY
,
sizeof
(
GpuHidHaarStageClassifier
)
*
gcascade
->
count
);
//openCLVerifyCall(status);
openCLSafeCall
(
clEnqueueWriteBuffer
((
cl_command_queue
)
gsum
.
clCxt
->
oclCommandQueue
(),
stagebuffer
,
1
,
0
,
sizeof
(
GpuHidHaarStageClassifier
)
*
gcascade
->
count
,
stage
,
0
,
NULL
,
NULL
));
//classifierbuffer = clCreateBuffer(gsum.clCxt->clContext,CL_MEM_READ_ONLY,sizeof(GpuHidHaarClassifier)*totalclassifier,NULL,&status);
//status = clEnqueueWriteBuffer(gsum.clCxt->clCmdQueue,classifierbuffer,1,0,sizeof(GpuHidHaarClassifier)*totalclassifier,classifier,0,NULL,NULL);
cl_command_queue
qu
=
(
cl_command_queue
)
gsum
.
clCxt
->
oclCommandQueue
();
openCLSafeCall
(
clEnqueueWriteBuffer
(
qu
,
stagebuffer
,
1
,
0
,
sizeof
(
GpuHidHaarStageClassifier
)
*
gcascade
->
count
,
stage
,
0
,
NULL
,
NULL
));
nodebuffer
=
openCLCreateBuffer
(
gsum
.
clCxt
,
CL_MEM_READ_ONLY
,
nodenum
*
sizeof
(
GpuHidHaarTreeNode
));
//openCLVerifyCall(status);
openCLSafeCall
(
clEnqueueWriteBuffer
((
cl_command_queue
)
gsum
.
clCxt
->
oclCommandQueue
(),
nodebuffer
,
1
,
0
,
nodenum
*
sizeof
(
GpuHidHaarTreeNode
),
openCLSafeCall
(
clEnqueueWriteBuffer
(
qu
,
nodebuffer
,
1
,
0
,
nodenum
*
sizeof
(
GpuHidHaarTreeNode
),
node
,
0
,
NULL
,
NULL
));
candidatebuffer
=
openCLCreateBuffer
(
gsum
.
clCxt
,
CL_MEM_WRITE_ONLY
,
4
*
sizeof
(
int
)
*
outputsz
);
//openCLVerifyCall(status);
scaleinfobuffer
=
openCLCreateBuffer
(
gsum
.
clCxt
,
CL_MEM_READ_ONLY
,
sizeof
(
detect_piramid_info
)
*
loopcount
);
//openCLVerifyCall(status);
openCLSafeCall
(
clEnqueueWriteBuffer
((
cl_command_queue
)
gsum
.
clCxt
->
oclCommandQueue
(),
scaleinfobuffer
,
1
,
0
,
sizeof
(
detect_piramid_info
)
*
loopcount
,
scaleinfo
,
0
,
NULL
,
NULL
));
//flag = 1;
//}
//t = (double)cvGetTickCount() - t
;
//printf( "update time = %g ms\n", t/((double)cvGetTickFrequency()*1000.)
);
scaleinfobuffer
=
openCLCreateBuffer
(
gsum
.
clCxt
,
CL_MEM_READ_ONLY
,
sizeof
(
detect_piramid_info
)
*
loopcount
)
;
openCLSafeCall
(
clEnqueueWriteBuffer
(
qu
,
scaleinfobuffer
,
1
,
0
,
sizeof
(
detect_piramid_info
)
*
loopcount
,
scaleinfo
,
0
,
NULL
,
NULL
)
);
//size_t globalThreads[3] = { counter+blocksize*blocksize-counter%(blocksize*blocksize),1,1};
//t = (double)cvGetTickCount();
int
startstage
=
0
;
int
endstage
=
gcascade
->
count
;
int
startnode
=
0
;
...
...
@@ -1087,11 +1025,6 @@ CvSeq *cv::ocl::OclCascadeClassifier::oclHaarDetectObjects( oclMat &gimg, CvMemS
pq
.
s
[
3
]
=
gcascade
->
pq3
;
float
correction
=
gcascade
->
inv_window_area
;
//int grpnumperline = ((m + localThreads[0] - 1) / localThreads[0]);
//int totalgrp = ((n + localThreads[1] - 1) / localThreads[1])*grpnumperline;
// openCLVerifyKernel(gsum.clCxt, kernel, &blocksize, globalThreads, localThreads);
//openCLSafeCall(clSetKernelArg(kernel,argcount++,sizeof(cl_mem),(void*)&cascadebuffer));
vector
<
pair
<
size_t
,
const
void
*>
>
args
;
args
.
push_back
(
make_pair
(
sizeof
(
cl_mem
)
,
(
void
*
)
&
stagebuffer
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_mem
)
,
(
void
*
)
&
scaleinfobuffer
));
...
...
@@ -1111,28 +1044,20 @@ CvSeq *cv::ocl::OclCascadeClassifier::oclHaarDetectObjects( oclMat &gimg, CvMemS
args
.
push_back
(
make_pair
(
sizeof
(
cl_float
)
,
(
void
*
)
&
correction
));
openCLExecuteKernel
(
gsum
.
clCxt
,
&
haarobjectdetect
,
"gpuRunHaarClassifierCascade"
,
globalThreads
,
localThreads
,
args
,
-
1
,
-
1
);
//t = (double)cvGetTickCount() - t;
//printf( "detection time = %g ms\n", t/((double)cvGetTickFrequency()*1000.) );
//t = (double)cvGetTickCount();
//openCLSafeCall(clEnqueueReadBuffer(gsum.clCxt->impl->clCmdQueue, candidatebuffer, 1, 0, 4 * sizeof(int)*outputsz, candidate, 0, NULL, NULL));
openCLReadBuffer
(
gsum
.
clCxt
,
candidatebuffer
,
candidate
,
4
*
sizeof
(
int
)
*
outputsz
);
for
(
int
i
=
0
;
i
<
outputsz
;
i
++
)
if
(
candidate
[
4
*
i
+
2
]
!=
0
)
allCandidates
.
push_back
(
Rect
(
candidate
[
4
*
i
],
candidate
[
4
*
i
+
1
],
candidate
[
4
*
i
+
2
],
candidate
[
4
*
i
+
3
]));
// t = (double)cvGetTickCount() - t;
//printf( "post time = %g ms\n", t/((double)cvGetTickFrequency()*1000.) );
//t = (double)cvGetTickCount();
allCandidates
.
push_back
(
Rect
(
candidate
[
4
*
i
],
candidate
[
4
*
i
+
1
],
candidate
[
4
*
i
+
2
],
candidate
[
4
*
i
+
3
]));
free
(
scaleinfo
);
free
(
candidate
);
//openCLSafeCall(clReleaseMemObject(cascadebuffer));
openCLSafeCall
(
clReleaseMemObject
(
stagebuffer
));
openCLSafeCall
(
clReleaseMemObject
(
scaleinfobuffer
));
openCLSafeCall
(
clReleaseMemObject
(
nodebuffer
));
openCLSafeCall
(
clReleaseMemObject
(
candidatebuffer
));
// openCLSafeCall(clReleaseKernel(kernel));
//t = (double)cvGetTickCount() - t;
//printf( "release time = %g ms\n", t/((double)cvGetTickFrequency()*1000.) );
}
else
{
...
...
@@ -1150,7 +1075,6 @@ CvSeq *cv::ocl::OclCascadeClassifier::oclHaarDetectObjects( oclMat &gimg, CvMemS
classifier
=
(
GpuHidHaarClassifier
*
)(
stage
+
gcascade
->
count
);
node
=
(
GpuHidHaarTreeNode
*
)(
classifier
->
node
);
cl_mem
stagebuffer
;
//cl_mem classifierbuffer;
cl_mem
nodebuffer
;
cl_mem
candidatebuffer
;
cl_mem
scaleinfobuffer
;
...
...
@@ -1187,24 +1111,20 @@ CvSeq *cv::ocl::OclCascadeClassifier::oclHaarDetectObjects( oclMat &gimg, CvMemS
size_t
blocksize
=
8
;
size_t
localThreads
[
3
]
=
{
blocksize
,
blocksize
,
1
};
size_t
globalThreads
[
3
]
=
{
grp_per_CU
*
gsum
.
clCxt
->
computeUnits
()
*
localThreads
[
0
],
localThreads
[
1
],
1
};
localThreads
[
1
],
1
};
int
outputsz
=
256
*
globalThreads
[
0
]
/
localThreads
[
0
];
int
nodenum
=
(
datasize
-
sizeof
(
GpuHidHaarClassifierCascade
)
-
sizeof
(
GpuHidHaarStageClassifier
)
*
gcascade
->
count
-
sizeof
(
GpuHidHaarClassifier
)
*
totalclassifier
)
/
sizeof
(
GpuHidHaarTreeNode
);
nodebuffer
=
openCLCreateBuffer
(
gsum
.
clCxt
,
CL_MEM_READ_ONLY
,
nodenum
*
sizeof
(
GpuHidHaarTreeNode
));
//openCLVerifyCall(status
);
openCLSafeCall
(
clEnqueueWriteBuffer
(
(
cl_command_queue
)
gsum
.
clCxt
->
oclCommandQueue
()
,
nodebuffer
,
1
,
0
,
cl_command_queue
qu
=
(
cl_command_queue
)
gsum
.
clCxt
->
oclCommandQueue
(
);
openCLSafeCall
(
clEnqueueWriteBuffer
(
qu
,
nodebuffer
,
1
,
0
,
nodenum
*
sizeof
(
GpuHidHaarTreeNode
),
node
,
0
,
NULL
,
NULL
));
cl_mem
newnodebuffer
=
openCLCreateBuffer
(
gsum
.
clCxt
,
CL_MEM_READ_WRITE
,
loopcount
*
nodenum
*
sizeof
(
GpuHidHaarTreeNode
));
int
startstage
=
0
;
int
endstage
=
gcascade
->
count
;
//cl_kernel kernel;
//kernel = openCLGetKernelFromSource(gsum.clCxt, &haarobjectdetect_scaled2, "gpuRunHaarClassifierCascade_scaled2");
//cl_kernel kernel2 = openCLGetKernelFromSource(gimg.clCxt, &haarobjectdetect_scaled2, "gpuscaleclassifier");
for
(
int
i
=
0
;
i
<
loopcount
;
i
++
)
{
sz
=
sizev
[
i
];
...
...
@@ -1223,7 +1143,7 @@ CvSeq *cv::ocl::OclCascadeClassifier::oclHaarDetectObjects( oclMat &gimg, CvMemS
int
height
=
(
gsum
.
rows
-
1
-
sz
.
height
+
ystep
-
1
)
/
ystep
;
int
grpnumperline
=
(
width
+
localThreads
[
0
]
-
1
)
/
localThreads
[
0
];
int
totalgrp
=
((
height
+
localThreads
[
1
]
-
1
)
/
localThreads
[
1
])
*
grpnumperline
;
//outputsz +=width*height;
scaleinfo
[
i
].
width_height
=
(
width
<<
16
)
|
height
;
scaleinfo
[
i
].
grpnumperline_totalgrp
=
(
grpnumperline
<<
16
)
|
totalgrp
;
scaleinfo
[
i
].
imgoff
=
0
;
...
...
@@ -1241,28 +1161,20 @@ CvSeq *cv::ocl::OclCascadeClassifier::oclHaarDetectObjects( oclMat &gimg, CvMemS
size_t
globalThreads2
[
3
]
=
{
nodenum
,
1
,
1
};
openCLExecuteKernel
(
gsum
.
clCxt
,
&
haarobjectdetect_scaled2
,
"gpuscaleclassifier"
,
globalThreads2
,
NULL
/*localThreads2*/
,
args1
,
-
1
,
-
1
);
//clEnqueueNDRangeKernel(gsum.clCxt->impl->clCmdQueue, kernel2, 1, NULL, globalThreads2, 0, 0, NULL, NULL);
//clFinish(gsum.clCxt->impl->clCmdQueue);
}
//clReleaseKernel(kernel2);
int
step
=
gsum
.
step
/
4
;
int
startnode
=
0
;
int
splitstage
=
3
;
int
splitnode
=
stage
[
0
].
count
+
stage
[
1
].
count
+
stage
[
2
].
count
;
stagebuffer
=
openCLCreateBuffer
(
gsum
.
clCxt
,
CL_MEM_READ_ONLY
,
sizeof
(
GpuHidHaarStageClassifier
)
*
gcascade
->
count
);
//openCLVerifyCall(status);
openCLSafeCall
(
clEnqueueWriteBuffer
((
cl_command_queue
)
gsum
.
clCxt
->
oclCommandQueue
(),
stagebuffer
,
1
,
0
,
sizeof
(
GpuHidHaarStageClassifier
)
*
gcascade
->
count
,
stage
,
0
,
NULL
,
NULL
));
openCLSafeCall
(
clEnqueueWriteBuffer
(
qu
,
stagebuffer
,
1
,
0
,
sizeof
(
GpuHidHaarStageClassifier
)
*
gcascade
->
count
,
stage
,
0
,
NULL
,
NULL
));
candidatebuffer
=
openCLCreateBuffer
(
gsum
.
clCxt
,
CL_MEM_WRITE_ONLY
|
CL_MEM_ALLOC_HOST_PTR
,
4
*
sizeof
(
int
)
*
outputsz
);
//openCLVerifyCall(status);
scaleinfobuffer
=
openCLCreateBuffer
(
gsum
.
clCxt
,
CL_MEM_READ_ONLY
,
sizeof
(
detect_piramid_info
)
*
loopcount
);
//openCLVerifyCall(status);
openCLSafeCall
(
clEnqueueWriteBuffer
((
cl_command_queue
)
gsum
.
clCxt
->
oclCommandQueue
(),
scaleinfobuffer
,
1
,
0
,
sizeof
(
detect_piramid_info
)
*
loopcount
,
scaleinfo
,
0
,
NULL
,
NULL
));
openCLSafeCall
(
clEnqueueWriteBuffer
(
qu
,
scaleinfobuffer
,
1
,
0
,
sizeof
(
detect_piramid_info
)
*
loopcount
,
scaleinfo
,
0
,
NULL
,
NULL
));
pbuffer
=
openCLCreateBuffer
(
gsum
.
clCxt
,
CL_MEM_READ_ONLY
,
sizeof
(
cl_int4
)
*
loopcount
);
openCLSafeCall
(
clEnqueueWriteBuffer
(
(
cl_command_queue
)
gsum
.
clCxt
->
oclCommandQueue
()
,
pbuffer
,
1
,
0
,
sizeof
(
cl_int4
)
*
loopcount
,
p
,
0
,
NULL
,
NULL
));
openCLSafeCall
(
clEnqueueWriteBuffer
(
qu
,
pbuffer
,
1
,
0
,
sizeof
(
cl_int4
)
*
loopcount
,
p
,
0
,
NULL
,
NULL
));
correctionbuffer
=
openCLCreateBuffer
(
gsum
.
clCxt
,
CL_MEM_READ_ONLY
,
sizeof
(
cl_float
)
*
loopcount
);
openCLSafeCall
(
clEnqueueWriteBuffer
((
cl_command_queue
)
gsum
.
clCxt
->
oclCommandQueue
(),
correctionbuffer
,
1
,
0
,
sizeof
(
cl_float
)
*
loopcount
,
correction
,
0
,
NULL
,
NULL
));
//int argcount = 0;
openCLSafeCall
(
clEnqueueWriteBuffer
(
qu
,
correctionbuffer
,
1
,
0
,
sizeof
(
cl_float
)
*
loopcount
,
correction
,
0
,
NULL
,
NULL
));
vector
<
pair
<
size_t
,
const
void
*>
>
args
;
args
.
push_back
(
make_pair
(
sizeof
(
cl_mem
)
,
(
void
*
)
&
stagebuffer
));
...
...
@@ -1271,22 +1183,21 @@ CvSeq *cv::ocl::OclCascadeClassifier::oclHaarDetectObjects( oclMat &gimg, CvMemS
args
.
push_back
(
make_pair
(
sizeof
(
cl_mem
)
,
(
void
*
)
&
gsum
.
data
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_mem
)
,
(
void
*
)
&
gsqsum
.
data
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_mem
)
,
(
void
*
)
&
candidatebuffer
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
)
,
(
void
*
)
&
gsum
.
rows
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
)
,
(
void
*
)
&
gsum
.
cols
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
)
,
(
void
*
)
&
step
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
)
,
(
void
*
)
&
loopcount
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
)
,
(
void
*
)
&
startstage
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
)
,
(
void
*
)
&
splitstage
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
)
,
(
void
*
)
&
endstage
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
)
,
(
void
*
)
&
startnode
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
)
,
(
void
*
)
&
splitnode
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_mem
)
,
(
void
*
)
&
pbuffer
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_mem
)
,
(
void
*
)
&
correctionbuffer
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
)
,
(
void
*
)
&
nodenum
));
openCLExecuteKernel
(
gsum
.
clCxt
,
&
haarobjectdetect_scaled2
,
"gpuRunHaarClassifierCascade_scaled2"
,
globalThreads
,
localThreads
,
args
,
-
1
,
-
1
);
//openCLSafeCall(clEnqueueReadBuffer(gsum.clCxt->clCmdQueue,candidatebuffer,1,0,4*sizeof(int)*outputsz,candidate,0,NULL,NULL));
candidate
=
(
int
*
)
clEnqueueMapBuffer
((
cl_command_queue
)
gsum
.
clCxt
->
oclCommandQueue
(),
candidatebuffer
,
1
,
CL_MAP_READ
,
0
,
4
*
sizeof
(
int
),
0
,
0
,
0
,
&
status
);
candidate
=
(
int
*
)
clEnqueueMapBuffer
(
qu
,
candidatebuffer
,
1
,
CL_MAP_READ
,
0
,
4
*
sizeof
(
int
)
*
outputsz
,
0
,
0
,
0
,
&
status
);
for
(
int
i
=
0
;
i
<
outputsz
;
i
++
)
{
...
...
@@ -1297,7 +1208,7 @@ CvSeq *cv::ocl::OclCascadeClassifier::oclHaarDetectObjects( oclMat &gimg, CvMemS
free
(
scaleinfo
);
free
(
p
);
free
(
correction
);
clEnqueueUnmapMemObject
(
(
cl_command_queue
)
gsum
.
clCxt
->
oclCommandQueue
()
,
candidatebuffer
,
candidate
,
0
,
0
,
0
);
clEnqueueUnmapMemObject
(
qu
,
candidatebuffer
,
candidate
,
0
,
0
,
0
);
openCLSafeCall
(
clReleaseMemObject
(
stagebuffer
));
openCLSafeCall
(
clReleaseMemObject
(
scaleinfobuffer
));
openCLSafeCall
(
clReleaseMemObject
(
nodebuffer
));
...
...
@@ -1306,20 +1217,547 @@ CvSeq *cv::ocl::OclCascadeClassifier::oclHaarDetectObjects( oclMat &gimg, CvMemS
openCLSafeCall
(
clReleaseMemObject
(
pbuffer
));
openCLSafeCall
(
clReleaseMemObject
(
correctionbuffer
));
}
//t = (double)cvGetTickCount() ;
cvFree
(
&
cascade
->
hid_cascade
);
// printf("%d\n",globalcounter);
rectList
.
resize
(
allCandidates
.
size
());
if
(
!
allCandidates
.
empty
())
std
::
copy
(
allCandidates
.
begin
(),
allCandidates
.
end
(),
rectList
.
begin
());
//cout << "count = " << rectList.size()<< endl;
if
(
minNeighbors
!=
0
||
findBiggestObject
)
groupRectangles
(
rectList
,
rweights
,
std
::
max
(
minNeighbors
,
1
),
GROUP_EPS
);
else
rweights
.
resize
(
rectList
.
size
(),
0
);
if
(
findBiggestObject
&&
rectList
.
size
()
)
{
CvAvgComp
result_comp
=
{{
0
,
0
,
0
,
0
},
0
};
for
(
size_t
i
=
0
;
i
<
rectList
.
size
();
i
++
)
{
cv
::
Rect
r
=
rectList
[
i
];
if
(
r
.
area
()
>
cv
::
Rect
(
result_comp
.
rect
).
area
()
)
{
result_comp
.
rect
=
r
;
result_comp
.
neighbors
=
rweights
[
i
];
}
}
cvSeqPush
(
result_seq
,
&
result_comp
);
}
else
{
for
(
size_t
i
=
0
;
i
<
rectList
.
size
();
i
++
)
{
CvAvgComp
c
;
c
.
rect
=
rectList
[
i
];
c
.
neighbors
=
rweights
[
i
];
cvSeqPush
(
result_seq
,
&
c
);
}
}
return
result_seq
;
}
struct
OclBuffers
{
cl_mem
stagebuffer
;
cl_mem
nodebuffer
;
cl_mem
candidatebuffer
;
cl_mem
scaleinfobuffer
;
cl_mem
pbuffer
;
cl_mem
correctionbuffer
;
cl_mem
newnodebuffer
;
};
struct
getRect
{
Rect
operator
()(
const
CvAvgComp
&
e
)
const
{
return
e
.
rect
;
}
};
void
cv
::
ocl
::
OclCascadeClassifierBuf
::
detectMultiScale
(
oclMat
&
gimg
,
CV_OUT
std
::
vector
<
cv
::
Rect
>&
faces
,
double
scaleFactor
,
int
minNeighbors
,
int
flags
,
Size
minSize
,
Size
maxSize
)
{
int
blocksize
=
8
;
int
grp_per_CU
=
12
;
size_t
localThreads
[
3
]
=
{
blocksize
,
blocksize
,
1
};
size_t
globalThreads
[
3
]
=
{
grp_per_CU
*
Context
::
getContext
()
->
computeUnits
()
*
localThreads
[
0
],
localThreads
[
1
],
1
};
int
outputsz
=
256
*
globalThreads
[
0
]
/
localThreads
[
0
];
Init
(
gimg
.
rows
,
gimg
.
cols
,
scaleFactor
,
flags
,
outputsz
,
localThreads
,
minSize
,
maxSize
);
const
double
GROUP_EPS
=
0.2
;
cv
::
ConcurrentRectVector
allCandidates
;
std
::
vector
<
cv
::
Rect
>
rectList
;
std
::
vector
<
int
>
rweights
;
CvHaarClassifierCascade
*
cascade
=
oldCascade
;
GpuHidHaarClassifierCascade
*
gcascade
;
GpuHidHaarStageClassifier
*
stage
;
GpuHidHaarClassifier
*
classifier
;
GpuHidHaarTreeNode
*
node
;
if
(
CV_MAT_DEPTH
(
gimg
.
type
())
!=
CV_8U
)
CV_Error
(
CV_StsUnsupportedFormat
,
"Only 8-bit images are supported"
);
if
(
CV_MAT_CN
(
gimg
.
type
())
>
1
)
{
oclMat
gtemp
;
cvtColor
(
gimg
,
gtemp
,
CV_BGR2GRAY
);
gimg
=
gtemp
;
}
int
*
candidate
;
if
(
(
flags
&
CV_HAAR_SCALE_IMAGE
)
)
{
int
indexy
=
0
;
CvSize
sz
;
cv
::
Rect
roi
,
roi2
;
cv
::
Mat
imgroi
,
imgroisq
;
cv
::
ocl
::
oclMat
resizeroi
,
gimgroi
,
gimgroisq
;
for
(
int
i
=
0
;
i
<
m_loopcount
;
i
++
)
{
sz
=
sizev
[
i
];
roi
=
Rect
(
0
,
indexy
,
sz
.
width
,
sz
.
height
);
roi2
=
Rect
(
0
,
0
,
sz
.
width
-
1
,
sz
.
height
-
1
);
resizeroi
=
gimg1
(
roi2
);
gimgroi
=
gsum
(
roi
);
gimgroisq
=
gsqsum
(
roi
);
cv
::
ocl
::
resize
(
gimg
,
resizeroi
,
Size
(
sz
.
width
-
1
,
sz
.
height
-
1
),
0
,
0
,
INTER_LINEAR
);
cv
::
ocl
::
integral
(
resizeroi
,
gimgroi
,
gimgroisq
);
indexy
+=
sz
.
height
;
}
gcascade
=
(
GpuHidHaarClassifierCascade
*
)(
cascade
->
hid_cascade
);
stage
=
(
GpuHidHaarStageClassifier
*
)(
gcascade
+
1
);
classifier
=
(
GpuHidHaarClassifier
*
)(
stage
+
gcascade
->
count
);
node
=
(
GpuHidHaarTreeNode
*
)(
classifier
->
node
);
gpuSetImagesForHaarClassifierCascade
(
cascade
,
1.
,
gsum
.
step
/
4
);
cl_command_queue
qu
=
(
cl_command_queue
)
gsum
.
clCxt
->
oclCommandQueue
();
openCLSafeCall
(
clEnqueueWriteBuffer
(
qu
,
((
OclBuffers
*
)
buffers
)
->
stagebuffer
,
1
,
0
,
sizeof
(
GpuHidHaarStageClassifier
)
*
gcascade
->
count
,
stage
,
0
,
NULL
,
NULL
));
openCLSafeCall
(
clEnqueueWriteBuffer
(
qu
,
((
OclBuffers
*
)
buffers
)
->
nodebuffer
,
1
,
0
,
m_nodenum
*
sizeof
(
GpuHidHaarTreeNode
),
node
,
0
,
NULL
,
NULL
));
int
startstage
=
0
;
int
endstage
=
gcascade
->
count
;
int
startnode
=
0
;
int
pixelstep
=
gsum
.
step
/
4
;
int
splitstage
=
3
;
int
splitnode
=
stage
[
0
].
count
+
stage
[
1
].
count
+
stage
[
2
].
count
;
cl_int4
p
,
pq
;
p
.
s
[
0
]
=
gcascade
->
p0
;
p
.
s
[
1
]
=
gcascade
->
p1
;
p
.
s
[
2
]
=
gcascade
->
p2
;
p
.
s
[
3
]
=
gcascade
->
p3
;
pq
.
s
[
0
]
=
gcascade
->
pq0
;
pq
.
s
[
1
]
=
gcascade
->
pq1
;
pq
.
s
[
2
]
=
gcascade
->
pq2
;
pq
.
s
[
3
]
=
gcascade
->
pq3
;
float
correction
=
gcascade
->
inv_window_area
;
vector
<
pair
<
size_t
,
const
void
*>
>
args
;
args
.
push_back
(
make_pair
(
sizeof
(
cl_mem
)
,
(
void
*
)
&
((
OclBuffers
*
)
buffers
)
->
stagebuffer
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_mem
)
,
(
void
*
)
&
((
OclBuffers
*
)
buffers
)
->
scaleinfobuffer
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_mem
)
,
(
void
*
)
&
((
OclBuffers
*
)
buffers
)
->
nodebuffer
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_mem
)
,
(
void
*
)
&
gsum
.
data
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_mem
)
,
(
void
*
)
&
gsqsum
.
data
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_mem
)
,
(
void
*
)
&
((
OclBuffers
*
)
buffers
)
->
candidatebuffer
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
)
,
(
void
*
)
&
pixelstep
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
)
,
(
void
*
)
&
m_loopcount
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
)
,
(
void
*
)
&
startstage
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
)
,
(
void
*
)
&
splitstage
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
)
,
(
void
*
)
&
endstage
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
)
,
(
void
*
)
&
startnode
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
)
,
(
void
*
)
&
splitnode
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int4
)
,
(
void
*
)
&
p
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int4
)
,
(
void
*
)
&
pq
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_float
)
,
(
void
*
)
&
correction
));
openCLExecuteKernel
(
gsum
.
clCxt
,
&
haarobjectdetect
,
"gpuRunHaarClassifierCascade"
,
globalThreads
,
localThreads
,
args
,
-
1
,
-
1
);
candidate
=
(
int
*
)
malloc
(
4
*
sizeof
(
int
)
*
outputsz
);
memset
(
candidate
,
0
,
4
*
sizeof
(
int
)
*
outputsz
);
openCLReadBuffer
(
gsum
.
clCxt
,
((
OclBuffers
*
)
buffers
)
->
candidatebuffer
,
candidate
,
4
*
sizeof
(
int
)
*
outputsz
);
for
(
int
i
=
0
;
i
<
outputsz
;
i
++
)
if
(
candidate
[
4
*
i
+
2
]
!=
0
)
allCandidates
.
push_back
(
Rect
(
candidate
[
4
*
i
],
candidate
[
4
*
i
+
1
],
candidate
[
4
*
i
+
2
],
candidate
[
4
*
i
+
3
]));
free
((
void
*
)
candidate
);
candidate
=
NULL
;
}
else
{
cv
::
ocl
::
integral
(
gimg
,
gsum
,
gsqsum
);
gpuSetHaarClassifierCascade
(
cascade
);
gcascade
=
(
GpuHidHaarClassifierCascade
*
)
cascade
->
hid_cascade
;
stage
=
(
GpuHidHaarStageClassifier
*
)(
gcascade
+
1
);
classifier
=
(
GpuHidHaarClassifier
*
)(
stage
+
gcascade
->
count
);
node
=
(
GpuHidHaarTreeNode
*
)(
classifier
->
node
);
cl_command_queue
qu
=
(
cl_command_queue
)
gsum
.
clCxt
->
oclCommandQueue
();
openCLSafeCall
(
clEnqueueWriteBuffer
(
qu
,
((
OclBuffers
*
)
buffers
)
->
nodebuffer
,
1
,
0
,
m_nodenum
*
sizeof
(
GpuHidHaarTreeNode
),
node
,
0
,
NULL
,
NULL
));
cl_int4
*
p
=
(
cl_int4
*
)
malloc
(
sizeof
(
cl_int4
)
*
m_loopcount
);
float
*
correction
=
(
float
*
)
malloc
(
sizeof
(
float
)
*
m_loopcount
);
int
startstage
=
0
;
int
endstage
=
gcascade
->
count
;
double
factor
;
for
(
int
i
=
0
;
i
<
m_loopcount
;
i
++
)
{
factor
=
scalev
[
i
];
int
equRect_x
=
(
int
)(
factor
*
gcascade
->
p0
+
0.5
);
int
equRect_y
=
(
int
)(
factor
*
gcascade
->
p1
+
0.5
);
int
equRect_w
=
(
int
)(
factor
*
gcascade
->
p3
+
0.5
);
int
equRect_h
=
(
int
)(
factor
*
gcascade
->
p2
+
0.5
);
p
[
i
].
s
[
0
]
=
equRect_x
;
p
[
i
].
s
[
1
]
=
equRect_y
;
p
[
i
].
s
[
2
]
=
equRect_x
+
equRect_w
;
p
[
i
].
s
[
3
]
=
equRect_y
+
equRect_h
;
correction
[
i
]
=
1.
/
(
equRect_w
*
equRect_h
);
int
startnodenum
=
m_nodenum
*
i
;
float
factor2
=
(
float
)
factor
;
vector
<
pair
<
size_t
,
const
void
*>
>
args1
;
args1
.
push_back
(
make_pair
(
sizeof
(
cl_mem
)
,
(
void
*
)
&
((
OclBuffers
*
)
buffers
)
->
nodebuffer
));
args1
.
push_back
(
make_pair
(
sizeof
(
cl_mem
)
,
(
void
*
)
&
((
OclBuffers
*
)
buffers
)
->
newnodebuffer
));
args1
.
push_back
(
make_pair
(
sizeof
(
cl_float
)
,
(
void
*
)
&
factor2
));
args1
.
push_back
(
make_pair
(
sizeof
(
cl_float
)
,
(
void
*
)
&
correction
[
i
]
));
args1
.
push_back
(
make_pair
(
sizeof
(
cl_int
)
,
(
void
*
)
&
startnodenum
));
size_t
globalThreads2
[
3
]
=
{
m_nodenum
,
1
,
1
};
openCLExecuteKernel
(
gsum
.
clCxt
,
&
haarobjectdetect_scaled2
,
"gpuscaleclassifier"
,
globalThreads2
,
NULL
/*localThreads2*/
,
args1
,
-
1
,
-
1
);
}
int
step
=
gsum
.
step
/
4
;
int
startnode
=
0
;
int
splitstage
=
3
;
openCLSafeCall
(
clEnqueueWriteBuffer
(
qu
,
((
OclBuffers
*
)
buffers
)
->
stagebuffer
,
1
,
0
,
sizeof
(
GpuHidHaarStageClassifier
)
*
gcascade
->
count
,
stage
,
0
,
NULL
,
NULL
));
openCLSafeCall
(
clEnqueueWriteBuffer
(
qu
,
((
OclBuffers
*
)
buffers
)
->
pbuffer
,
1
,
0
,
sizeof
(
cl_int4
)
*
m_loopcount
,
p
,
0
,
NULL
,
NULL
));
openCLSafeCall
(
clEnqueueWriteBuffer
(
qu
,
((
OclBuffers
*
)
buffers
)
->
correctionbuffer
,
1
,
0
,
sizeof
(
cl_float
)
*
m_loopcount
,
correction
,
0
,
NULL
,
NULL
));
vector
<
pair
<
size_t
,
const
void
*>
>
args
;
args
.
push_back
(
make_pair
(
sizeof
(
cl_mem
)
,
(
void
*
)
&
((
OclBuffers
*
)
buffers
)
->
stagebuffer
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_mem
)
,
(
void
*
)
&
((
OclBuffers
*
)
buffers
)
->
scaleinfobuffer
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_mem
)
,
(
void
*
)
&
((
OclBuffers
*
)
buffers
)
->
newnodebuffer
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_mem
)
,
(
void
*
)
&
gsum
.
data
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_mem
)
,
(
void
*
)
&
gsqsum
.
data
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_mem
)
,
(
void
*
)
&
((
OclBuffers
*
)
buffers
)
->
candidatebuffer
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
)
,
(
void
*
)
&
gsum
.
rows
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
)
,
(
void
*
)
&
gsum
.
cols
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
)
,
(
void
*
)
&
step
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
)
,
(
void
*
)
&
m_loopcount
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
)
,
(
void
*
)
&
startstage
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
)
,
(
void
*
)
&
splitstage
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
)
,
(
void
*
)
&
endstage
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
)
,
(
void
*
)
&
startnode
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_mem
)
,
(
void
*
)
&
((
OclBuffers
*
)
buffers
)
->
pbuffer
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_mem
)
,
(
void
*
)
&
((
OclBuffers
*
)
buffers
)
->
correctionbuffer
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
)
,
(
void
*
)
&
m_nodenum
));
openCLExecuteKernel
(
gsum
.
clCxt
,
&
haarobjectdetect_scaled2
,
"gpuRunHaarClassifierCascade_scaled2"
,
globalThreads
,
localThreads
,
args
,
-
1
,
-
1
);
candidate
=
(
int
*
)
clEnqueueMapBuffer
(
qu
,
((
OclBuffers
*
)
buffers
)
->
candidatebuffer
,
1
,
CL_MAP_READ
,
0
,
4
*
sizeof
(
int
)
*
outputsz
,
0
,
0
,
0
,
NULL
);
for
(
int
i
=
0
;
i
<
outputsz
;
i
++
)
{
if
(
candidate
[
4
*
i
+
2
]
!=
0
)
allCandidates
.
push_back
(
Rect
(
candidate
[
4
*
i
],
candidate
[
4
*
i
+
1
],
candidate
[
4
*
i
+
2
],
candidate
[
4
*
i
+
3
]));
}
free
(
p
);
free
(
correction
);
clEnqueueUnmapMemObject
(
qu
,
((
OclBuffers
*
)
buffers
)
->
candidatebuffer
,
candidate
,
0
,
0
,
0
);
}
rectList
.
resize
(
allCandidates
.
size
());
if
(
!
allCandidates
.
empty
())
std
::
copy
(
allCandidates
.
begin
(),
allCandidates
.
end
(),
rectList
.
begin
());
if
(
minNeighbors
!=
0
||
findBiggestObject
)
groupRectangles
(
rectList
,
rweights
,
std
::
max
(
minNeighbors
,
1
),
GROUP_EPS
);
else
rweights
.
resize
(
rectList
.
size
(),
0
);
GenResult
(
faces
,
rectList
,
rweights
);
}
void
cv
::
ocl
::
OclCascadeClassifierBuf
::
Init
(
const
int
rows
,
const
int
cols
,
double
scaleFactor
,
int
flags
,
const
int
outputsz
,
const
size_t
localThreads
[],
CvSize
minSize
,
CvSize
maxSize
)
{
CvHaarClassifierCascade
*
cascade
=
oldCascade
;
if
(
!
CV_IS_HAAR_CLASSIFIER
(
cascade
)
)
CV_Error
(
!
cascade
?
CV_StsNullPtr
:
CV_StsBadArg
,
"Invalid classifier cascade"
);
if
(
scaleFactor
<=
1
)
CV_Error
(
CV_StsOutOfRange
,
"scale factor must be > 1"
);
if
(
cols
<
minSize
.
width
||
rows
<
minSize
.
height
)
CV_Error
(
CV_StsError
,
"Image too small"
);
int
datasize
=
0
;
int
totalclassifier
=
0
;
if
(
!
cascade
->
hid_cascade
)
gpuCreateHidHaarClassifierCascade
(
cascade
,
&
datasize
,
&
totalclassifier
);
if
(
maxSize
.
height
==
0
||
maxSize
.
width
==
0
)
{
maxSize
.
height
=
rows
;
maxSize
.
width
=
cols
;
}
findBiggestObject
=
(
flags
&
CV_HAAR_FIND_BIGGEST_OBJECT
)
!=
0
;
if
(
findBiggestObject
)
flags
&=
~
(
CV_HAAR_SCALE_IMAGE
|
CV_HAAR_DO_CANNY_PRUNING
);
CreateBaseBufs
(
datasize
,
totalclassifier
,
flags
,
outputsz
);
CreateFactorRelatedBufs
(
rows
,
cols
,
flags
,
scaleFactor
,
localThreads
,
minSize
,
maxSize
);
m_scaleFactor
=
scaleFactor
;
m_rows
=
rows
;
m_cols
=
cols
;
m_flags
=
flags
;
m_minSize
=
minSize
;
m_maxSize
=
maxSize
;
initialized
=
true
;
}
void
cv
::
ocl
::
OclCascadeClassifierBuf
::
CreateBaseBufs
(
const
int
datasize
,
const
int
totalclassifier
,
const
int
flags
,
const
int
outputsz
)
{
if
(
!
initialized
)
{
buffers
=
malloc
(
sizeof
(
OclBuffers
));
size_t
tempSize
=
sizeof
(
GpuHidHaarStageClassifier
)
*
((
GpuHidHaarClassifierCascade
*
)
oldCascade
->
hid_cascade
)
->
count
;
m_nodenum
=
(
datasize
-
sizeof
(
GpuHidHaarClassifierCascade
)
-
tempSize
-
sizeof
(
GpuHidHaarClassifier
)
*
totalclassifier
)
/
sizeof
(
GpuHidHaarTreeNode
);
((
OclBuffers
*
)
buffers
)
->
stagebuffer
=
openCLCreateBuffer
(
cv
::
ocl
::
Context
::
getContext
(),
CL_MEM_READ_ONLY
,
tempSize
);
((
OclBuffers
*
)
buffers
)
->
nodebuffer
=
openCLCreateBuffer
(
cv
::
ocl
::
Context
::
getContext
(),
CL_MEM_READ_ONLY
,
m_nodenum
*
sizeof
(
GpuHidHaarTreeNode
));
}
if
(
initialized
&&
((
m_flags
&
CV_HAAR_SCALE_IMAGE
)
^
(
flags
&
CV_HAAR_SCALE_IMAGE
)))
{
openCLSafeCall
(
clReleaseMemObject
(((
OclBuffers
*
)
buffers
)
->
candidatebuffer
));
}
if
(
flags
&
CV_HAAR_SCALE_IMAGE
)
{
((
OclBuffers
*
)
buffers
)
->
candidatebuffer
=
openCLCreateBuffer
(
cv
::
ocl
::
Context
::
getContext
(),
CL_MEM_WRITE_ONLY
,
4
*
sizeof
(
int
)
*
outputsz
);
}
else
{
((
OclBuffers
*
)
buffers
)
->
candidatebuffer
=
openCLCreateBuffer
(
cv
::
ocl
::
Context
::
getContext
(),
CL_MEM_WRITE_ONLY
|
CL_MEM_ALLOC_HOST_PTR
,
4
*
sizeof
(
int
)
*
outputsz
);
}
}
void
cv
::
ocl
::
OclCascadeClassifierBuf
::
CreateFactorRelatedBufs
(
const
int
rows
,
const
int
cols
,
const
int
flags
,
const
double
scaleFactor
,
const
size_t
localThreads
[],
CvSize
minSize
,
CvSize
maxSize
)
{
if
(
initialized
)
{
if
((
m_flags
&
CV_HAAR_SCALE_IMAGE
)
&&
!
(
flags
&
CV_HAAR_SCALE_IMAGE
))
{
gimg1
.
release
();
gsum
.
release
();
gsqsum
.
release
();
}
else
if
(
!
(
m_flags
&
CV_HAAR_SCALE_IMAGE
)
&&
(
flags
&
CV_HAAR_SCALE_IMAGE
))
{
openCLSafeCall
(
clReleaseMemObject
(((
OclBuffers
*
)
buffers
)
->
newnodebuffer
));
openCLSafeCall
(
clReleaseMemObject
(((
OclBuffers
*
)
buffers
)
->
correctionbuffer
));
openCLSafeCall
(
clReleaseMemObject
(((
OclBuffers
*
)
buffers
)
->
pbuffer
));
}
else
if
((
m_flags
&
CV_HAAR_SCALE_IMAGE
)
&&
(
flags
&
CV_HAAR_SCALE_IMAGE
))
{
if
(
fabs
(
m_scaleFactor
-
scaleFactor
)
<
1e-6
&&
(
rows
==
m_rows
&&
cols
==
m_cols
)
&&
(
minSize
.
width
==
m_minSize
.
width
)
&&
(
minSize
.
height
==
m_minSize
.
height
)
&&
(
maxSize
.
width
==
m_maxSize
.
width
)
&&
(
maxSize
.
height
==
m_maxSize
.
height
))
{
return
;
}
}
else
{
if
(
fabs
(
m_scaleFactor
-
scaleFactor
)
<
1e-6
&&
(
rows
==
m_rows
&&
cols
==
m_cols
)
&&
(
minSize
.
width
==
m_minSize
.
width
)
&&
(
minSize
.
height
==
m_minSize
.
height
)
&&
(
maxSize
.
width
==
m_maxSize
.
width
)
&&
(
maxSize
.
height
==
m_maxSize
.
height
))
{
return
;
}
else
{
openCLSafeCall
(
clReleaseMemObject
(((
OclBuffers
*
)
buffers
)
->
newnodebuffer
));
openCLSafeCall
(
clReleaseMemObject
(((
OclBuffers
*
)
buffers
)
->
correctionbuffer
));
openCLSafeCall
(
clReleaseMemObject
(((
OclBuffers
*
)
buffers
)
->
pbuffer
));
}
}
}
int
loopcount
;
int
indexy
=
0
;
int
totalheight
=
0
;
double
factor
;
Rect
roi
;
CvSize
sz
;
CvSize
winSize0
=
oldCascade
->
orig_window_size
;
detect_piramid_info
*
scaleinfo
;
if
(
flags
&
CV_HAAR_SCALE_IMAGE
)
{
for
(
factor
=
1.
f
;;
factor
*=
scaleFactor
)
{
CvSize
winSize
=
{
cvRound
(
winSize0
.
width
*
factor
),
cvRound
(
winSize0
.
height
*
factor
)
};
sz
.
width
=
cvRound
(
cols
/
factor
)
+
1
;
sz
.
height
=
cvRound
(
rows
/
factor
)
+
1
;
CvSize
sz1
=
{
sz
.
width
-
winSize0
.
width
-
1
,
sz
.
height
-
winSize0
.
height
-
1
};
if
(
sz1
.
width
<=
0
||
sz1
.
height
<=
0
)
break
;
if
(
winSize
.
width
>
maxSize
.
width
||
winSize
.
height
>
maxSize
.
height
)
break
;
if
(
winSize
.
width
<
minSize
.
width
||
winSize
.
height
<
minSize
.
height
)
continue
;
totalheight
+=
sz
.
height
;
sizev
.
push_back
(
sz
);
scalev
.
push_back
(
static_cast
<
float
>
(
factor
));
}
loopcount
=
sizev
.
size
();
gimg1
.
create
(
rows
,
cols
,
CV_8UC1
);
gsum
.
create
(
totalheight
+
4
,
cols
+
1
,
CV_32SC1
);
gsqsum
.
create
(
totalheight
+
4
,
cols
+
1
,
CV_32FC1
);
scaleinfo
=
(
detect_piramid_info
*
)
malloc
(
sizeof
(
detect_piramid_info
)
*
loopcount
);
for
(
int
i
=
0
;
i
<
loopcount
;
i
++
)
{
sz
=
sizev
[
i
];
roi
=
Rect
(
0
,
indexy
,
sz
.
width
,
sz
.
height
);
int
width
=
sz
.
width
-
1
-
oldCascade
->
orig_window_size
.
width
;
int
height
=
sz
.
height
-
1
-
oldCascade
->
orig_window_size
.
height
;
int
grpnumperline
=
(
width
+
localThreads
[
0
]
-
1
)
/
localThreads
[
0
];
int
totalgrp
=
((
height
+
localThreads
[
1
]
-
1
)
/
localThreads
[
1
])
*
grpnumperline
;
((
detect_piramid_info
*
)
scaleinfo
)[
i
].
width_height
=
(
width
<<
16
)
|
height
;
((
detect_piramid_info
*
)
scaleinfo
)[
i
].
grpnumperline_totalgrp
=
(
grpnumperline
<<
16
)
|
totalgrp
;
((
detect_piramid_info
*
)
scaleinfo
)[
i
].
imgoff
=
gsum
(
roi
).
offset
>>
2
;
((
detect_piramid_info
*
)
scaleinfo
)[
i
].
factor
=
scalev
[
i
];
indexy
+=
sz
.
height
;
}
}
else
{
for
(
factor
=
1
;
cvRound
(
factor
*
winSize0
.
width
)
<
cols
-
10
&&
cvRound
(
factor
*
winSize0
.
height
)
<
rows
-
10
;
factor
*=
scaleFactor
)
{
CvSize
winSize
=
{
cvRound
(
winSize0
.
width
*
factor
),
cvRound
(
winSize0
.
height
*
factor
)
};
if
(
winSize
.
width
<
minSize
.
width
||
winSize
.
height
<
minSize
.
height
)
{
continue
;
}
sizev
.
push_back
(
winSize
);
scalev
.
push_back
(
factor
);
}
loopcount
=
scalev
.
size
();
if
(
loopcount
==
0
)
{
loopcount
=
1
;
sizev
.
push_back
(
minSize
);
scalev
.
push_back
(
min
(
cvRound
(
minSize
.
width
/
winSize0
.
width
),
cvRound
(
minSize
.
height
/
winSize0
.
height
))
);
}
((
OclBuffers
*
)
buffers
)
->
pbuffer
=
openCLCreateBuffer
(
cv
::
ocl
::
Context
::
getContext
(),
CL_MEM_READ_ONLY
,
sizeof
(
cl_int4
)
*
loopcount
);
((
OclBuffers
*
)
buffers
)
->
correctionbuffer
=
openCLCreateBuffer
(
cv
::
ocl
::
Context
::
getContext
(),
CL_MEM_READ_ONLY
,
sizeof
(
cl_float
)
*
loopcount
);
((
OclBuffers
*
)
buffers
)
->
newnodebuffer
=
openCLCreateBuffer
(
cv
::
ocl
::
Context
::
getContext
(),
CL_MEM_READ_WRITE
,
loopcount
*
m_nodenum
*
sizeof
(
GpuHidHaarTreeNode
));
scaleinfo
=
(
detect_piramid_info
*
)
malloc
(
sizeof
(
detect_piramid_info
)
*
loopcount
);
for
(
int
i
=
0
;
i
<
loopcount
;
i
++
)
{
sz
=
sizev
[
i
];
factor
=
scalev
[
i
];
int
ystep
=
cvRound
(
std
::
max
(
2.
,
factor
));
int
width
=
(
cols
-
1
-
sz
.
width
+
ystep
-
1
)
/
ystep
;
int
height
=
(
rows
-
1
-
sz
.
height
+
ystep
-
1
)
/
ystep
;
int
grpnumperline
=
(
width
+
localThreads
[
0
]
-
1
)
/
localThreads
[
0
];
int
totalgrp
=
((
height
+
localThreads
[
1
]
-
1
)
/
localThreads
[
1
])
*
grpnumperline
;
((
detect_piramid_info
*
)
scaleinfo
)[
i
].
width_height
=
(
width
<<
16
)
|
height
;
((
detect_piramid_info
*
)
scaleinfo
)[
i
].
grpnumperline_totalgrp
=
(
grpnumperline
<<
16
)
|
totalgrp
;
((
detect_piramid_info
*
)
scaleinfo
)[
i
].
imgoff
=
0
;
((
detect_piramid_info
*
)
scaleinfo
)[
i
].
factor
=
factor
;
}
}
if
(
loopcount
!=
m_loopcount
)
{
if
(
initialized
)
{
openCLSafeCall
(
clReleaseMemObject
(((
OclBuffers
*
)
buffers
)
->
scaleinfobuffer
));
}
((
OclBuffers
*
)
buffers
)
->
scaleinfobuffer
=
openCLCreateBuffer
(
cv
::
ocl
::
Context
::
getContext
(),
CL_MEM_READ_ONLY
,
sizeof
(
detect_piramid_info
)
*
loopcount
);
}
openCLSafeCall
(
clEnqueueWriteBuffer
((
cl_command_queue
)
cv
::
ocl
::
Context
::
getContext
()
->
oclCommandQueue
(),
((
OclBuffers
*
)
buffers
)
->
scaleinfobuffer
,
1
,
0
,
sizeof
(
detect_piramid_info
)
*
loopcount
,
scaleinfo
,
0
,
NULL
,
NULL
));
free
(
scaleinfo
);
m_loopcount
=
loopcount
;
}
void
cv
::
ocl
::
OclCascadeClassifierBuf
::
GenResult
(
CV_OUT
std
::
vector
<
cv
::
Rect
>&
faces
,
const
std
::
vector
<
cv
::
Rect
>
&
rectList
,
const
std
::
vector
<
int
>
&
rweights
)
{
CvSeq
*
result_seq
=
cvCreateSeq
(
0
,
sizeof
(
CvSeq
),
sizeof
(
CvAvgComp
),
cvCreateMemStorage
(
0
)
);
if
(
findBiggestObject
&&
rectList
.
size
()
)
{
...
...
@@ -1346,13 +1784,34 @@ CvSeq *cv::ocl::OclCascadeClassifier::oclHaarDetectObjects( oclMat &gimg, CvMemS
cvSeqPush
(
result_seq
,
&
c
);
}
}
//t = (double)cvGetTickCount() - t;
//printf( "get face time = %g ms\n", t/((double)cvGetTickFrequency()*1000.) )
;
//alltime = (double)cvGetTickCount() - alltime
;
//printf( "all time = %g ms\n", alltime/((double)cvGetTickFrequency()*1000.)
);
return
result_seq
;
vector
<
CvAvgComp
>
vecAvgComp
;
Seq
<
CvAvgComp
>
(
result_seq
).
copyTo
(
vecAvgComp
)
;
faces
.
resize
(
vecAvgComp
.
size
()
);
std
::
transform
(
vecAvgComp
.
begin
(),
vecAvgComp
.
end
(),
faces
.
begin
(),
getRect
())
;
}
void
cv
::
ocl
::
OclCascadeClassifierBuf
::
release
()
{
openCLSafeCall
(
clReleaseMemObject
(((
OclBuffers
*
)
buffers
)
->
stagebuffer
));
openCLSafeCall
(
clReleaseMemObject
(((
OclBuffers
*
)
buffers
)
->
scaleinfobuffer
));
openCLSafeCall
(
clReleaseMemObject
(((
OclBuffers
*
)
buffers
)
->
nodebuffer
));
openCLSafeCall
(
clReleaseMemObject
(((
OclBuffers
*
)
buffers
)
->
candidatebuffer
));
if
(
(
m_flags
&
CV_HAAR_SCALE_IMAGE
)
)
{
cvFree
(
&
oldCascade
->
hid_cascade
);
}
else
{
openCLSafeCall
(
clReleaseMemObject
(((
OclBuffers
*
)
buffers
)
->
newnodebuffer
));
openCLSafeCall
(
clReleaseMemObject
(((
OclBuffers
*
)
buffers
)
->
correctionbuffer
));
openCLSafeCall
(
clReleaseMemObject
(((
OclBuffers
*
)
buffers
)
->
pbuffer
));
}
free
(
buffers
);
buffers
=
NULL
;
}
#ifndef _MAX_PATH
#define _MAX_PATH 1024
...
...
modules/ocl/src/opencl/haarobjectdetect.cl
View file @
69a0b5dd
...
...
@@ -112,7 +112,7 @@ typedef struct __attribute__((aligned (64))) GpuHidHaarClassifierCascade
}
GpuHidHaarClassifierCascade
;
__kernel
void
__attribute__
((
reqd_work_group_size
(
8
,
8
,
1
)))
gpuRunHaarClassifierCascade
(
//constant
GpuHidHaarClassifierCascade
*
cascade,
__kernel
void
__attribute__
((
reqd_work_group_size
(
8
,
8
,
1
)))
gpuRunHaarClassifierCascade
(
global
GpuHidHaarStageClassifier
*
stagecascadeptr,
global
int4
*
info,
global
GpuHidHaarTreeNode
*
nodeptr,
...
...
@@ -128,12 +128,7 @@ __kernel void __attribute__((reqd_work_group_size(8,8,1)))gpuRunHaarClassifierCa
const
int
splitnode,
const
int4
p,
const
int4
pq,
const
float
correction
//const
int
width,
//const
int
height,
//const
int
grpnumperline,
//const
int
totalgrp
)
const
float
correction
)
{
int
grpszx
=
get_local_size
(
0
)
;
int
grpszy
=
get_local_size
(
1
)
;
...
...
@@ -145,13 +140,8 @@ __kernel void __attribute__((reqd_work_group_size(8,8,1)))gpuRunHaarClassifierCa
int
lcl_sz
=
mul24
(
grpszx,grpszy
)
;
int
lcl_id
=
mad24
(
lclidy,grpszx,lclidx
)
;
//assume
lcl_sz
==
256
or
128
or
64
//int
lcl_sz_shift
=
(
lcl_sz
==
256
)
?
8
:
7
;
//lcl_sz_shift
=
(
lcl_sz
==
64
)
?
6
:
lcl_sz_shift
;
__local
int
lclshare[1024]
;
#
define
OFF
0
__local
int*
lcldata
=
lclshare
+
OFF
;//for save win data
__local
int*
lcldata
=
lclshare
;//for save win data
__local
int*
glboutindex
=
lcldata
+
28*28
;//for save global out index
__local
int*
lclcount
=
glboutindex
+
1
;//for save the numuber of temp pass pixel
__local
int*
lcloutindex
=
lclcount
+
1
;//for save info of temp pass pixel
...
...
@@ -181,7 +171,6 @@ __kernel void __attribute__((reqd_work_group_size(8,8,1)))gpuRunHaarClassifierCa
int
totalgrp
=
scaleinfo1.y
&
0xffff
;
int
imgoff
=
scaleinfo1.z
;
float
factor
=
as_float
(
scaleinfo1.w
)
;
//int
ystep
=1
;// factor > 2.0 ? 1 : 2;
__global
const
int
*
sum
=
sum1
+
imgoff
;
__global
const
float
*
sqsum
=
sqsum1
+
imgoff
;
...
...
@@ -191,8 +180,6 @@ __kernel void __attribute__((reqd_work_group_size(8,8,1)))gpuRunHaarClassifierCa
int
grpidx
=
grploop
-
mul24
(
grpidy,
grpnumperline
)
;
int
x
=
mad24
(
grpidx,grpszx,lclidx
)
;
int
y
=
mad24
(
grpidy,grpszy,lclidy
)
;
//candidate_result.x
=
convert_int_rtn
(
x*factor
)
;
//candidate_result.y
=
convert_int_rtn
(
y*factor
)
;
int
grpoffx
=
x-lclidx
;
int
grpoffy
=
y-lclidy
;
...
...
@@ -211,14 +198,7 @@ __kernel void __attribute__((reqd_work_group_size(8,8,1)))gpuRunHaarClassifierCa
int4
data
=
*
(
__global
int4*
)
&sum[glb_off]
;
int
lcl_off
=
mad24
(
lcl_y,
readwidth,
lcl_x<<2
)
;
#
if
OFF
lcldata[lcl_off]
=
data.x
;
lcldata[lcl_off+1]
=
data.y
;
lcldata[lcl_off+2]
=
data.z
;
lcldata[lcl_off+3]
=
data.w
;
#
else
vstore4
(
data,
0
,
&lcldata[lcl_off]
)
;
#
endif
}
lcloutindex[lcl_id]
=
0
;
...
...
@@ -231,184 +211,170 @@ __kernel void __attribute__((reqd_work_group_size(8,8,1)))gpuRunHaarClassifierCa
int
lcl_off
=
mad24
(
lclidy,readwidth,lclidx
)
;
int4
cascadeinfo1,
cascadeinfo2
;
cascadeinfo1
=
p
;
cascadeinfo2
=
pq
;
// + mad24(y, pixelstep, x);
cascadeinfo2
=
pq
;
cascadeinfo1.x
+=lcl_off
;
cascadeinfo1.z
+=lcl_off
;
mean
=
(
lcldata[mad24
(
cascadeinfo1.y,readwidth,cascadeinfo1.x
)
]
-
lcldata[mad24
(
cascadeinfo1.y,readwidth,cascadeinfo1.z
)
]
-
lcldata[mad24
(
cascadeinfo1.w,readwidth,cascadeinfo1.x
)
]
+
lcldata[mad24
(
cascadeinfo1.w,readwidth,cascadeinfo1.z
)
]
)
*correction
;
//if
((
x
<
width
)
&&
(
y
<
height
))
{
cascadeinfo1.x
+=lcl_off
;
cascadeinfo1.z
+=lcl_off
;
mean
=
(
lcldata[mad24
(
cascadeinfo1.y,readwidth,cascadeinfo1.x
)
]
-
lcldata[mad24
(
cascadeinfo1.y,readwidth,cascadeinfo1.z
)
]
-
lcldata[mad24
(
cascadeinfo1.w,readwidth,cascadeinfo1.x
)
]
+
lcldata[mad24
(
cascadeinfo1.w,readwidth,cascadeinfo1.z
)
]
)
*correction
;
int
p_offset
=
mad24
(
y,
pixelstep,
x
)
;
cascadeinfo2.x
+=p_offset
;
cascadeinfo2.z
+=p_offset
;
variance_norm_factor
=sqsum[mad24
(
cascadeinfo2.y,
pixelstep,
cascadeinfo2.x
)
]
-
sqsum[mad24
(
cascadeinfo2.y,
pixelstep,
cascadeinfo2.z
)
]
-
sqsum[mad24
(
cascadeinfo2.w,
pixelstep,
cascadeinfo2.x
)
]
+
sqsum[mad24
(
cascadeinfo2.w,
pixelstep,
cascadeinfo2.z
)
]
;
variance_norm_factor
=
variance_norm_factor
*
correction
-
mean
*
mean
;
variance_norm_factor
=
variance_norm_factor
>=0.f
?
sqrt
(
variance_norm_factor
)
:
1.f
;
//if
(
cascade->is_stump_based
)
//{
for
(
int
stageloop
=
start_stage
; (stageloop < split_stage) && result; stageloop++ )
{
float
stage_sum
=
0.f
;
int2
stageinfo
=
*
(
global
int2*
)(
stagecascadeptr+stageloop
)
;
float
stagethreshold
=
as_float
(
stageinfo.y
)
;
for
(
int
nodeloop
=
0
; nodeloop < stageinfo.x; nodeloop++ )
{
__global
GpuHidHaarTreeNode*
currentnodeptr
=
(
nodeptr
+
nodecounter
)
;
int
p_offset
=
mad24
(
y,
pixelstep,
x
)
;
int4
info1
=
*
(
__global
int4*
)(
&
(
currentnodeptr->p[0][0]
))
;
int4
info2
=
*
(
__global
int4*
)(
&
(
currentnodeptr->p[1][0]
))
;
int4
info3
=
*
(
__global
int4*
)(
&
(
currentnodeptr->p[2][0]
))
;
float4
w
=
*
(
__global
float4*
)(
&
(
currentnodeptr->weight[0]
))
;
float2
alpha2
=
*
(
__global
float2*
)(
&
(
currentnodeptr->alpha[0]
))
;
float
nodethreshold
=
w.w
*
variance_norm_factor
;
cascadeinfo2.x
+=p_offset
;
cascadeinfo2.z
+=p_offset
;
variance_norm_factor
=sqsum[mad24
(
cascadeinfo2.y,
pixelstep,
cascadeinfo2.x
)
]
-
sqsum[mad24
(
cascadeinfo2.y,
pixelstep,
cascadeinfo2.z
)
]
-
sqsum[mad24
(
cascadeinfo2.w,
pixelstep,
cascadeinfo2.x
)
]
+
sqsum[mad24
(
cascadeinfo2.w,
pixelstep,
cascadeinfo2.z
)
]
;
info1.x
+=lcl_off
;
info1.z
+=lcl_off
;
info2.x
+=lcl_off
;
info2.z
+=lcl_off
;
variance_norm_factor
=
variance_norm_factor
*
correction
-
mean
*
mean
;
variance_norm_factor
=
variance_norm_factor
>=0.f
?
sqrt
(
variance_norm_factor
)
:
1.f
;
float
classsum
=
(
lcldata[mad24
(
info1.y,readwidth,info1.x
)
]
-
lcldata[mad24
(
info1.y,readwidth,info1.z
)
]
-
lcldata[mad24
(
info1.w,readwidth,info1.x
)
]
+
lcldata[mad24
(
info1.w,readwidth,info1.z
)
]
)
*
w.x
;
for
(
int
stageloop
=
start_stage
; (stageloop < split_stage) && result; stageloop++ )
{
float
stage_sum
=
0.f
;
int2
stageinfo
=
*
(
global
int2*
)(
stagecascadeptr+stageloop
)
;
float
stagethreshold
=
as_float
(
stageinfo.y
)
;
for
(
int
nodeloop
=
0
; nodeloop < stageinfo.x; nodeloop++ )
{
__global
GpuHidHaarTreeNode*
currentnodeptr
=
(
nodeptr
+
nodecounter
)
;
int4
info1
=
*
(
__global
int4*
)(
&
(
currentnodeptr->p[0][0]
))
;
int4
info2
=
*
(
__global
int4*
)(
&
(
currentnodeptr->p[1][0]
))
;
int4
info3
=
*
(
__global
int4*
)(
&
(
currentnodeptr->p[2][0]
))
;
float4
w
=
*
(
__global
float4*
)(
&
(
currentnodeptr->weight[0]
))
;
float2
alpha2
=
*
(
__global
float2*
)(
&
(
currentnodeptr->alpha[0]
))
;
float
nodethreshold
=
w.w
*
variance_norm_factor
;
classsum
+=
(
lcldata[mad24
(
info2.y,readwidth,info2.x
)
]
-
lcldata[mad24
(
info2.y,readwidth,info2.z
)
]
-
lcldata[mad24
(
info2.w,readwidth,info2.x
)
]
+
lcldata[mad24
(
info2.w,readwidth,info2.z
)
]
)
*
w.y
;
info1.x
+=lcl_off
;
info1.z
+=lcl_off
;
info2.x
+=lcl_off
;
info2.z
+=lcl_off
;
float
classsum
=
(
lcldata[mad24
(
info1.y,readwidth,info1.x
)
]
-
lcldata[mad24
(
info1.y,readwidth,info1.z
)
]
-
lcldata[mad24
(
info1.w,readwidth,info1.x
)
]
+
lcldata[mad24
(
info1.w,readwidth,info1.z
)
]
)
*
w.x
;
//if
((
info3.z
-
info3.x
)
&&
(
!stageinfo.z
))
//{
info3.x
+=lcl_off
;
info3.z
+=lcl_off
;
classsum
+=
(
lcldata[mad24
(
info3.y,readwidth,info3.x
)
]
-
lcldata[mad24
(
info3.y,readwidth,info3.z
)
]
-
lcldata[mad24
(
info3.w,readwidth,info3.x
)
]
+
lcldata[mad24
(
info3.w,readwidth,info3.z
)
]
)
*
w.z
;
//}
stage_sum
+=
classsum
>=
nodethreshold
?
alpha2.y
:
alpha2.x
;
nodecounter++
;
}
classsum
+=
(
lcldata[mad24
(
info2.y,readwidth,info2.x
)
]
-
lcldata[mad24
(
info2.y,readwidth,info2.z
)
]
-
lcldata[mad24
(
info2.w,readwidth,info2.x
)
]
+
lcldata[mad24
(
info2.w,readwidth,info2.z
)
]
)
*
w.y
;
result
=
(
stage_sum
>=
stagethreshold
)
;
}
info3.x
+=lcl_off
;
info3.z
+=lcl_off
;
classsum
+=
(
lcldata[mad24
(
info3.y,readwidth,info3.x
)
]
-
lcldata[mad24
(
info3.y,readwidth,info3.z
)
]
-
lcldata[mad24
(
info3.w,readwidth,info3.x
)
]
+
lcldata[mad24
(
info3.w,readwidth,info3.z
)
]
)
*
w.z
;
if
(
result
&&
(
x
<
width
)
&&
(
y
<
height
))
{
int
queueindex
=
atomic_inc
(
lclcount
)
;
lcloutindex[queueindex<<1]
=
(
lclidy
<<
16
)
|
lclidx
;
lcloutindex[
(
queueindex<<1
)
+1]
=
as_int
(
variance_norm_factor
)
;
stage_sum
+=
classsum
>=
nodethreshold
?
alpha2.y
:
alpha2.x
;
nodecounter++
;
}
result
=
(
stage_sum
>=
stagethreshold
)
;
}
if
(
result
&&
(
x
<
width
)
&&
(
y
<
height
))
{
int
queueindex
=
atomic_inc
(
lclcount
)
;
lcloutindex[queueindex<<1]
=
(
lclidy
<<
16
)
|
lclidx
;
lcloutindex[
(
queueindex<<1
)
+1]
=
as_int
(
variance_norm_factor
)
;
}
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
int
queuecount
=
lclcount[0]
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
nodecounter
=
splitnode
;
for
(
int
stageloop
=
split_stage
; stageloop< end_stage && queuecount>0; stageloop++)
{
lclcount[0]=0
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
int
queuecount
=
lclcount[0]
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
nodecounter
=
splitnode
;
for
(
int
stageloop
=
split_stage
; stageloop< end_stage && queuecount>0; stageloop++)
{
//barrier
(
CLK_LOCAL_MEM_FENCE
)
;
//if
(
lcl_id
==
0
)
lclcount[0]=0
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
int2
stageinfo
=
*
(
global
int2*
)(
stagecascadeptr+stageloop
)
;
float
stagethreshold
=
as_float
(
stageinfo.y
)
;
int2
stageinfo
=
*
(
global
int2*
)(
stagecascadeptr+stageloop
)
;
float
stagethreshold
=
as_float
(
stageinfo.y
)
;
int
perfscale
=
queuecount
>
4
?
3
:
2
;
int
queuecount_loop
=
(
queuecount
+
(
1<<perfscale
)
-1
)
>>
perfscale
;
int
lcl_compute_win
=
lcl_sz
>>
perfscale
;
int
lcl_compute_win_id
=
(
lcl_id
>>
(
6-perfscale
))
;
int
lcl_loops
=
(
stageinfo.x
+
lcl_compute_win
-1
)
>>
(
6-perfscale
)
;
int
lcl_compute_id
=
lcl_id
-
(
lcl_compute_win_id
<<
(
6-perfscale
))
;
for
(
int
queueloop=0
; queueloop<queuecount_loop/* && lcl_compute_win_id < queuecount*/; queueloop++)
int
perfscale
=
queuecount
>
4
?
3
:
2
;
int
queuecount_loop
=
(
queuecount
+
(
1<<perfscale
)
-1
)
>>
perfscale
;
int
lcl_compute_win
=
lcl_sz
>>
perfscale
;
int
lcl_compute_win_id
=
(
lcl_id
>>
(
6-perfscale
))
;
int
lcl_loops
=
(
stageinfo.x
+
lcl_compute_win
-1
)
>>
(
6-perfscale
)
;
int
lcl_compute_id
=
lcl_id
-
(
lcl_compute_win_id
<<
(
6-perfscale
))
;
for
(
int
queueloop=0
; queueloop<queuecount_loop; queueloop++)
{
float
stage_sum
=
0.f
;
int
temp_coord
=
lcloutindex[lcl_compute_win_id<<1]
;
float
variance_norm_factor
=
as_float
(
lcloutindex[
(
lcl_compute_win_id<<1
)
+1]
)
;
int
queue_pixel
=
mad24
(((
temp_coord
&
(
int
)
0xffff0000
)
>>16
)
,
readwidth,temp_coord
&
0xffff
)
;
if
(
lcl_compute_win_id
<
queuecount
)
{
float
stage_sum
=
0.f
;
int
temp_coord
=
lcloutindex[lcl_compute_win_id<<1]
;
float
variance_norm_factor
=
as_float
(
lcloutindex[
(
lcl_compute_win_id<<1
)
+1]
)
;
int
queue_pixel
=
mad24
(((
temp_coord
&
(
int
)
0xffff0000
)
>>16
)
,
readwidth,temp_coord
&
0xffff
)
;
//barrier
(
CLK_LOCAL_MEM_FENCE
)
;
if
(
lcl_compute_win_id
<
queuecount
)
int
tempnodecounter
=
lcl_compute_id
;
float
part_sum
=
0.f
;
for
(
int
lcl_loop=0
; lcl_loop<lcl_loops && tempnodecounter<stageinfo.x; lcl_loop++)
{
__global
GpuHidHaarTreeNode*
currentnodeptr
=
(
nodeptr
+
nodecounter
+
tempnodecounter
)
;
int4
info1
=
*
(
__global
int4*
)(
&
(
currentnodeptr->p[0][0]
))
;
int4
info2
=
*
(
__global
int4*
)(
&
(
currentnodeptr->p[1][0]
))
;
int4
info3
=
*
(
__global
int4*
)(
&
(
currentnodeptr->p[2][0]
))
;
float4
w
=
*
(
__global
float4*
)(
&
(
currentnodeptr->weight[0]
))
;
float2
alpha2
=
*
(
__global
float2*
)(
&
(
currentnodeptr->alpha[0]
))
;
float
nodethreshold
=
w.w
*
variance_norm_factor
;
info1.x
+=queue_pixel
;
info1.z
+=queue_pixel
;
info2.x
+=queue_pixel
;
info2.z
+=queue_pixel
;
float
classsum
=
(
lcldata[mad24
(
info1.y,readwidth,info1.x
)
]
-
lcldata[mad24
(
info1.y,readwidth,info1.z
)
]
-
lcldata[mad24
(
info1.w,readwidth,info1.x
)
]
+
lcldata[mad24
(
info1.w,readwidth,info1.z
)
]
)
*
w.x
;
int
tempnodecounter
=
lcl_compute_id
;
float
part_sum
=
0.f
;
for
(
int
lcl_loop=0
; lcl_loop<lcl_loops && tempnodecounter<stageinfo.x; lcl_loop++)
{
__global
GpuHidHaarTreeNode*
currentnodeptr
=
(
nodeptr
+
nodecounter
+
tempnodecounter
)
;
int4
info1
=
*
(
__global
int4*
)(
&
(
currentnodeptr->p[0][0]
))
;
int4
info2
=
*
(
__global
int4*
)(
&
(
currentnodeptr->p[1][0]
))
;
int4
info3
=
*
(
__global
int4*
)(
&
(
currentnodeptr->p[2][0]
))
;
float4
w
=
*
(
__global
float4*
)(
&
(
currentnodeptr->weight[0]
))
;
float2
alpha2
=
*
(
__global
float2*
)(
&
(
currentnodeptr->alpha[0]
))
;
float
nodethreshold
=
w.w
*
variance_norm_factor
;
info1.x
+=queue_pixel
;
info1.z
+=queue_pixel
;
info2.x
+=queue_pixel
;
info2.z
+=queue_pixel
;
float
classsum
=
(
lcldata[mad24
(
info1.y,readwidth,info1.x
)
]
-
lcldata[mad24
(
info1.y,readwidth,info1.z
)
]
-
lcldata[mad24
(
info1.w,readwidth,info1.x
)
]
+
lcldata[mad24
(
info1.w,readwidth,info1.z
)
]
)
*
w.x
;
classsum
+=
(
lcldata[mad24
(
info2.y,readwidth,info2.x
)
]
-
lcldata[mad24
(
info2.y,readwidth,info2.z
)
]
-
lcldata[mad24
(
info2.w,readwidth,info2.x
)
]
+
lcldata[mad24
(
info2.w,readwidth,info2.z
)
]
)
*
w.y
;
//if
((
info3.z
-
info3.x
)
&&
(
!stageinfo.z
))
//{
info3.x
+=queue_pixel
;
info3.z
+=queue_pixel
;
classsum
+=
(
lcldata[mad24
(
info3.y,readwidth,info3.x
)
]
-
lcldata[mad24
(
info3.y,readwidth,info3.z
)
]
-
lcldata[mad24
(
info3.w,readwidth,info3.x
)
]
+
lcldata[mad24
(
info3.w,readwidth,info3.z
)
]
)
*
w.z
;
//}
part_sum
+=
classsum
>=
nodethreshold
?
alpha2.y
:
alpha2.x
;
tempnodecounter
+=lcl_compute_win
;
}//end
for
(
int
lcl_loop=0
;lcl_loop<lcl_loops;lcl_loop++)
partialsum[lcl_id]=part_sum
;
classsum
+=
(
lcldata[mad24
(
info2.y,readwidth,info2.x
)
]
-
lcldata[mad24
(
info2.y,readwidth,info2.z
)
]
-
lcldata[mad24
(
info2.w,readwidth,info2.x
)
]
+
lcldata[mad24
(
info2.w,readwidth,info2.z
)
]
)
*
w.y
;
info3.x
+=queue_pixel
;
info3.z
+=queue_pixel
;
classsum
+=
(
lcldata[mad24
(
info3.y,readwidth,info3.x
)
]
-
lcldata[mad24
(
info3.y,readwidth,info3.z
)
]
-
lcldata[mad24
(
info3.w,readwidth,info3.x
)
]
+
lcldata[mad24
(
info3.w,readwidth,info3.z
)
]
)
*
w.z
;
part_sum
+=
classsum
>=
nodethreshold
?
alpha2.y
:
alpha2.x
;
tempnodecounter
+=lcl_compute_win
;
}//end
for
(
int
lcl_loop=0
;lcl_loop<lcl_loops;lcl_loop++)
partialsum[lcl_id]=part_sum
;
}
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
if
(
lcl_compute_win_id
<
queuecount
)
{
for
(
int
i=0
; i<lcl_compute_win && (lcl_compute_id==0); i++)
{
stage_sum
+=
partialsum[lcl_id+i]
;
}
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
if
(
lcl_compute_win_id
<
queuecount
)
if
(
stage_sum
>=
stagethreshold
&&
(
lcl_compute_id==0
))
{
for
(
int
i=0
; i<lcl_compute_win && (lcl_compute_id==0); i++)
{
stage_sum
+=
partialsum[lcl_id+i]
;
}
if
(
stage_sum
>=
stagethreshold
&&
(
lcl_compute_id==0
))
{
int
queueindex
=
atomic_inc
(
lclcount
)
;
lcloutindex[queueindex<<1]
=
temp_coord
;
lcloutindex[
(
queueindex<<1
)
+1]
=
as_int
(
variance_norm_factor
)
;
}
lcl_compute_win_id
+=
(
1<<perfscale
)
;
int
queueindex
=
atomic_inc
(
lclcount
)
;
lcloutindex[queueindex<<1]
=
temp_coord
;
lcloutindex[
(
queueindex<<1
)
+1]
=
as_int
(
variance_norm_factor
)
;
}
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
}//end
for
(
int
queueloop=0
;queueloop<queuecount_loop;queueloop++)
//barrier
(
CLK_LOCAL_MEM_FENCE
)
;
queuecount
=
lclcount[0]
;
lcl_compute_win_id
+=
(
1<<perfscale
)
;
}
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
nodecounter
+=
stageinfo.x
;
}//end
for
(
int
stageloop
=
splitstage
; stageloop< endstage && queuecount>0;stageloop++)
//barrier
(
CLK_LOCAL_MEM_FENCE
)
;
if
(
lcl_id<queuecount
)
{
int
temp
=
lcloutindex[lcl_id<<1]
;
int
x
=
mad24
(
grpidx,grpszx,temp
&
0xffff
)
;
int
y
=
mad24
(
grpidy,grpszy,
((
temp
&
(
int
)
0xffff0000
)
>>
16
))
;
temp
=
glboutindex[0]
;
int4
candidate_result
;
candidate_result.zw
=
(
int2
)
convert_int_rtn
(
factor*20.f
)
;
candidate_result.x
=
convert_int_rtn
(
x*factor
)
;
candidate_result.y
=
convert_int_rtn
(
y*factor
)
;
atomic_inc
(
glboutindex
)
;
candidate[outputoff+temp+lcl_id]
=
candidate_result
;
}
}//end
for
(
int
queueloop=0
;queueloop<queuecount_loop;queueloop++)
queuecount
=
lclcount[0]
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
}//end
if
((
x
<
width
)
&&
(
y
<
height
))
nodecounter
+=
stageinfo.x
;
}//end
for
(
int
stageloop
=
splitstage
; stageloop< endstage && queuecount>0;stageloop++)
if
(
lcl_id<queuecount
)
{
int
temp
=
lcloutindex[lcl_id<<1]
;
int
x
=
mad24
(
grpidx,grpszx,temp
&
0xffff
)
;
int
y
=
mad24
(
grpidy,grpszy,
((
temp
&
(
int
)
0xffff0000
)
>>
16
))
;
temp
=
glboutindex[0]
;
int4
candidate_result
;
candidate_result.zw
=
(
int2
)
convert_int_rtn
(
factor*20.f
)
;
candidate_result.x
=
convert_int_rtn
(
x*factor
)
;
candidate_result.y
=
convert_int_rtn
(
y*factor
)
;
atomic_inc
(
glboutindex
)
;
candidate[outputoff+temp+lcl_id]
=
candidate_result
;
}
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
}//end
for
(
int
grploop=grpidx
;grploop<totalgrp;grploop+=grpnumx)
//outputoff
+=mul24
(
width,height
)
;
}//end
for
(
int
scalei
=
0
; scalei <loopcount; scalei++)
}
...
...
modules/ocl/src/opencl/haarobjectdetect_scaled2.cl
View file @
69a0b5dd
...
...
@@ -16,6 +16,7 @@
//
//
@Authors
//
Wu
Xinglong,
wxl370@126.com
//
Sen
Liu,
swjtuls1987@126.com
//
//
Redistribution
and
use
in
source
and
binary
forms,
with
or
without
modification,
//
are
permitted
provided
that
the
following
conditions
are
met:
...
...
@@ -52,11 +53,11 @@ typedef struct __attribute__((aligned(128))) GpuHidHaarFeature
{
struct
__attribute__
((
aligned
(
32
)))
{
int
p0
__attribute__
((
aligned
(
4
)))
;
int
p1
__attribute__
((
aligned
(
4
)))
;
int
p2
__attribute__
((
aligned
(
4
)))
;
int
p3
__attribute__
((
aligned
(
4
)))
;
float
weight
__attribute__
((
aligned
(
4
)))
;
int
p0
__attribute__
((
aligned
(
4
)))
;
int
p1
__attribute__
((
aligned
(
4
)))
;
int
p2
__attribute__
((
aligned
(
4
)))
;
int
p3
__attribute__
((
aligned
(
4
)))
;
float
weight
__attribute__
((
aligned
(
4
)))
;
}
rect[CV_HAAR_FEATURE_MAX]
__attribute__
((
aligned
(
32
)))
;
}
...
...
@@ -113,173 +114,168 @@ __kernel void gpuRunHaarClassifierCascade_scaled2(
global
const
int
*restrict
sum,
global
const
float
*restrict
sqsum,
global
int4
*candidate,
const
int
rows,
const
int
cols,
const
int
step,
const
int
loopcount,
const
int
start_stage,
const
int
split_stage,
const
int
end_stage,
const
int
startnode,
const
int
splitnode,
global
int4
*p,
//const
int4
*
pq,
global
float
*correction,
const
int
nodecount
)
{
int
grpszx
=
get_local_size
(
0
)
;
int
grpszy
=
get_local_size
(
1
)
;
int
grpnumx
=
get_num_groups
(
0
)
;
int
grpidx
=
get_group_id
(
0
)
;
int
lclidx
=
get_local_id
(
0
)
;
int
lclidy
=
get_local_id
(
1
)
;
int
lcl_sz
=
mul24
(
grpszx,
grpszy
)
;
int
lcl_id
=
mad24
(
lclidy,
grpszx,
lclidx
)
;
__local
int
lclshare[1024]
;
__local
int
*glboutindex
=
lclshare
+
0
;
__local
int
*lclcount
=
glboutindex
+
1
;
__local
int
*lcloutindex
=
lclcount
+
1
;
__local
float
*partialsum
=
(
__local
float
*
)(
lcloutindex
+
(
lcl_sz
<<
1
))
;
glboutindex[0]
=
0
;
int
outputoff
=
mul24
(
grpidx,
256
)
;
candidate[outputoff
+
(
lcl_id
<<
2
)
]
=
(
int4
)
0
;
candidate[outputoff
+
(
lcl_id
<<
2
)
+
1]
=
(
int4
)
0
;
candidate[outputoff
+
(
lcl_id
<<
2
)
+
2]
=
(
int4
)
0
;
candidate[outputoff
+
(
lcl_id
<<
2
)
+
3]
=
(
int4
)
0
;
int
grpszx
=
get_local_size
(
0
)
;
int
grpszy
=
get_local_size
(
1
)
;
int
grpnumx
=
get_num_groups
(
0
)
;
int
grpidx
=
get_group_id
(
0
)
;
int
lclidx
=
get_local_id
(
0
)
;
int
lclidy
=
get_local_id
(
1
)
;
int
lcl_sz
=
mul24
(
grpszx,
grpszy
)
;
int
lcl_id
=
mad24
(
lclidy,
grpszx,
lclidx
)
;
__local
int
glboutindex[1]
;
__local
int
lclcount[1]
;
__local
int
lcloutindex[64]
;
glboutindex[0]
=
0
;
int
outputoff
=
mul24
(
grpidx,
256
)
;
candidate[outputoff
+
(
lcl_id
<<
2
)
]
=
(
int4
)
0
;
candidate[outputoff
+
(
lcl_id
<<
2
)
+
1]
=
(
int4
)
0
;
candidate[outputoff
+
(
lcl_id
<<
2
)
+
2]
=
(
int4
)
0
;
candidate[outputoff
+
(
lcl_id
<<
2
)
+
3]
=
(
int4
)
0
;
int
max_idx
=
rows
*
cols
-
1
;
for
(
int
scalei
=
0
; scalei < loopcount; scalei++)
{
int4
scaleinfo1
;
scaleinfo1
=
info[scalei]
;
int
width
=
(
scaleinfo1.x
&
0xffff0000
)
>>
16
;
int
height
=
scaleinfo1.x
&
0xffff
;
int
grpnumperline
=
(
scaleinfo1.y
&
0xffff0000
)
>>
16
;
int
totalgrp
=
scaleinfo1.y
&
0xffff
;
float
factor
=
as_float
(
scaleinfo1.w
)
;
float
correction_t
=
correction[scalei]
;
int
ystep
=
(
int
)(
max
(
2.0f,
factor
)
+
0.5f
)
;
for
(
int
scalei
=
0
; scalei < loopcount; scalei++
)
for
(
int
grploop
=
get_group_id
(
0
)
; grploop < totalgrp; grploop += grpnumx
)
{
int4
scaleinfo1
;
scaleinfo1
=
info[scalei]
;
int
width
=
(
scaleinfo1.x
&
0xffff0000
)
>>
16
;
int
height
=
scaleinfo1.x
&
0xffff
;
int
grpnumperline
=
(
scaleinfo1.y
&
0xffff0000
)
>>
16
;
int
totalgrp
=
scaleinfo1.y
&
0xffff
;
float
factor
=
as_float
(
scaleinfo1.w
)
;
float
correction_t
=
correction[scalei]
;
int
ystep
=
(
int
)(
max
(
2.0f,
factor
)
+
0.5f
)
;
int4
cascadeinfo
=
p[scalei]
;
int
grpidy
=
grploop
/
grpnumperline
;
int
grpidx
=
grploop
-
mul24
(
grpidy,
grpnumperline
)
;
int
ix
=
mad24
(
grpidx,
grpszx,
lclidx
)
;
int
iy
=
mad24
(
grpidy,
grpszy,
lclidy
)
;
int
x
=
ix
*
ystep
;
int
y
=
iy
*
ystep
;
lcloutindex[lcl_id]
=
0
;
lclcount[0]
=
0
;
int
nodecounter
;
float
mean,
variance_norm_factor
;
//if
((
ix
<
width
)
&&
(
iy
<
height
))
{
const
int
p_offset
=
mad24
(
y,
step,
x
)
;
cascadeinfo.x
+=
p_offset
;
cascadeinfo.z
+=
p_offset
;
mean
=
(
sum[clamp
(
mad24
(
cascadeinfo.y,
step,
cascadeinfo.x
)
,
0
,
max_idx
)
]
-
sum[clamp
(
mad24
(
cascadeinfo.y,
step,
cascadeinfo.z
)
,
0
,
max_idx
)
]
-
sum[clamp
(
mad24
(
cascadeinfo.w,
step,
cascadeinfo.x
)
,
0
,
max_idx
)
]
+
sum[clamp
(
mad24
(
cascadeinfo.w,
step,
cascadeinfo.z
)
,
0
,
max_idx
)
]
)
*
correction_t
;
variance_norm_factor
=
sqsum[clamp
(
mad24
(
cascadeinfo.y,
step,
cascadeinfo.x
)
,
0
,
max_idx
)
]
-
sqsum[clamp
(
mad24
(
cascadeinfo.y,
step,
cascadeinfo.z
)
,
0
,
max_idx
)
]
-
sqsum[clamp
(
mad24
(
cascadeinfo.w,
step,
cascadeinfo.x
)
,
0
,
max_idx
)
]
+
sqsum[clamp
(
mad24
(
cascadeinfo.w,
step,
cascadeinfo.z
)
,
0
,
max_idx
)
]
;
variance_norm_factor
=
variance_norm_factor
*
correction_t
-
mean
*
mean
;
variance_norm_factor
=
variance_norm_factor
>=
0.f
?
sqrt
(
variance_norm_factor
)
:
1.f
;
bool
result
=
true
;
nodecounter
=
startnode
+
nodecount
*
scalei
;
for
(
int
grploop
=
get_group_id
(
0
)
; grploop < totalgrp; grploop += grpnumx
)
for
(
int
stageloop
=
start_stage
; (stageloop < end_stage) && result; stageloop++
)
{
int4
cascadeinfo
=
p[scalei]
;
int
grpidy
=
grploop
/
grpnumperline
;
int
grpidx
=
grploop
-
mul24
(
grpidy,
grpnumperline
)
;
int
ix
=
mad24
(
grpidx,
grpszx,
lclidx
)
;
int
iy
=
mad24
(
grpidy,
grpszy,
lclidy
)
;
int
x
=
ix
*
ystep
;
int
y
=
iy
*
ystep
;
lcloutindex[lcl_id]
=
0
;
lclcount[0]
=
0
;
int
result
=
1
,
nodecounter
;
float
mean,
variance_norm_factor
;
//if
((
ix
<
width
)
&&
(
iy
<
height
))
{
const
int
p_offset
=
mad24
(
y,
step,
x
)
;
cascadeinfo.x
+=
p_offset
;
cascadeinfo.z
+=
p_offset
;
mean
=
(
sum[mad24
(
cascadeinfo.y,
step,
cascadeinfo.x
)
]
-
sum[mad24
(
cascadeinfo.y,
step,
cascadeinfo.z
)
]
-
sum[mad24
(
cascadeinfo.w,
step,
cascadeinfo.x
)
]
+
sum[mad24
(
cascadeinfo.w,
step,
cascadeinfo.z
)
]
)
*
correction_t
;
variance_norm_factor
=
sqsum[mad24
(
cascadeinfo.y,
step,
cascadeinfo.x
)
]
-
sqsum[mad24
(
cascadeinfo.y,
step,
cascadeinfo.z
)
]
-
sqsum[mad24
(
cascadeinfo.w,
step,
cascadeinfo.x
)
]
+
sqsum[mad24
(
cascadeinfo.w,
step,
cascadeinfo.z
)
]
;
variance_norm_factor
=
variance_norm_factor
*
correction_t
-
mean
*
mean
;
variance_norm_factor
=
variance_norm_factor
>=
0.f
?
sqrt
(
variance_norm_factor
)
:
1.f
;
result
=
1
;
nodecounter
=
startnode
+
nodecount
*
scalei
;
for
(
int
stageloop
=
start_stage
; stageloop < end_stage && result; stageloop++)
{
float
stage_sum
=
0.f
;
int4
stageinfo
=
*
(
global
int4
*
)(
stagecascadeptr
+
stageloop
)
;
float
stagethreshold
=
as_float
(
stageinfo.y
)
;
for
(
int
nodeloop
=
0
; nodeloop < stageinfo.x; nodeloop++)
{
__global
GpuHidHaarTreeNode
*currentnodeptr
=
(
nodeptr
+
nodecounter
)
;
int4
info1
=
*
(
__global
int4
*
)(
&
(
currentnodeptr->p[0][0]
))
;
int4
info2
=
*
(
__global
int4
*
)(
&
(
currentnodeptr->p[1][0]
))
;
int4
info3
=
*
(
__global
int4
*
)(
&
(
currentnodeptr->p[2][0]
))
;
float4
w
=
*
(
__global
float4
*
)(
&
(
currentnodeptr->weight[0]
))
;
float2
alpha2
=
*
(
__global
float2
*
)(
&
(
currentnodeptr->alpha[0]
))
;
float
nodethreshold
=
w.w
*
variance_norm_factor
;
info1.x
+=
p_offset
;
info1.z
+=
p_offset
;
info2.x
+=
p_offset
;
info2.z
+=
p_offset
;
float
classsum
=
(
sum[mad24
(
info1.y,
step,
info1.x
)
]
-
sum[mad24
(
info1.y,
step,
info1.z
)
]
-
sum[mad24
(
info1.w,
step,
info1.x
)
]
+
sum[mad24
(
info1.w,
step,
info1.z
)
]
)
*
w.x
;
classsum
+=
(
sum[mad24
(
info2.y,
step,
info2.x
)
]
-
sum[mad24
(
info2.y,
step,
info2.z
)
]
-
sum[mad24
(
info2.w,
step,
info2.x
)
]
+
sum[mad24
(
info2.w,
step,
info2.z
)
]
)
*
w.y
;
info3.x
+=
p_offset
;
info3.z
+=
p_offset
;
classsum
+=
(
sum[mad24
(
info3.y,
step,
info3.x
)
]
-
sum[mad24
(
info3.y,
step,
info3.z
)
]
-
sum[mad24
(
info3.w,
step,
info3.x
)
]
+
sum[mad24
(
info3.w,
step,
info3.z
)
]
)
*
w.z
;
stage_sum
+=
classsum
>=
nodethreshold
?
alpha2.y
:
alpha2.x
;
nodecounter++
;
}
result
=
(
stage_sum
>=
stagethreshold
)
;
}
float
stage_sum
=
0.f
;
int
stagecount
=
stagecascadeptr[stageloop].count
;
for
(
int
nodeloop
=
0
; nodeloop < stagecount; nodeloop++)
{
__global
GpuHidHaarTreeNode
*currentnodeptr
=
(
nodeptr
+
nodecounter
)
;
int4
info1
=
*
(
__global
int4
*
)(
&
(
currentnodeptr->p[0][0]
))
;
int4
info2
=
*
(
__global
int4
*
)(
&
(
currentnodeptr->p[1][0]
))
;
int4
info3
=
*
(
__global
int4
*
)(
&
(
currentnodeptr->p[2][0]
))
;
float4
w
=
*
(
__global
float4
*
)(
&
(
currentnodeptr->weight[0]
))
;
float2
alpha2
=
*
(
__global
float2
*
)(
&
(
currentnodeptr->alpha[0]
))
;
float
nodethreshold
=
w.w
*
variance_norm_factor
;
info1.x
+=
p_offset
;
info1.z
+=
p_offset
;
info2.x
+=
p_offset
;
info2.z
+=
p_offset
;
float
classsum
=
(
sum[clamp
(
mad24
(
info1.y,
step,
info1.x
)
,
0
,
max_idx
)
]
-
sum[clamp
(
mad24
(
info1.y,
step,
info1.z
)
,
0
,
max_idx
)
]
-
sum[clamp
(
mad24
(
info1.w,
step,
info1.x
)
,
0
,
max_idx
)
]
+
sum[clamp
(
mad24
(
info1.w,
step,
info1.z
)
,
0
,
max_idx
)
]
)
*
w.x
;
classsum
+=
(
sum[clamp
(
mad24
(
info2.y,
step,
info2.x
)
,
0
,
max_idx
)
]
-
sum[clamp
(
mad24
(
info2.y,
step,
info2.z
)
,
0
,
max_idx
)
]
-
sum[clamp
(
mad24
(
info2.w,
step,
info2.x
)
,
0
,
max_idx
)
]
+
sum[clamp
(
mad24
(
info2.w,
step,
info2.z
)
,
0
,
max_idx
)
]
)
*
w.y
;
info3.x
+=
p_offset
;
info3.z
+=
p_offset
;
classsum
+=
(
sum[clamp
(
mad24
(
info3.y,
step,
info3.x
)
,
0
,
max_idx
)
]
-
sum[clamp
(
mad24
(
info3.y,
step,
info3.z
)
,
0
,
max_idx
)
]
-
sum[clamp
(
mad24
(
info3.w,
step,
info3.x
)
,
0
,
max_idx
)
]
+
sum[clamp
(
mad24
(
info3.w,
step,
info3.z
)
,
0
,
max_idx
)
]
)
*
w.z
;
stage_sum
+=
classsum
>=
nodethreshold
?
alpha2.y
:
alpha2.x
;
nodecounter++
;
}
result
=
(
bool
)(
stage_sum
>=
stagecascadeptr[stageloop].threshold
)
;
}
if
(
result
&&
(
ix
<
width
)
&&
(
iy
<
height
))
{
int
queueindex
=
atomic_inc
(
lclcount
)
;
lcloutindex[queueindex
<<
1]
=
(
y
<<
16
)
|
x
;
lcloutindex[
(
queueindex
<<
1
)
+
1]
=
as_int
(
variance_norm_factor
)
;
}
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
int
queuecount
=
lclcount[0]
;
nodecounter
=
splitnode
+
nodecount
*
scalei
;
if
(
result
&&
(
ix
<
width
)
&&
(
iy
<
height
))
{
int
queueindex
=
atomic_inc
(
lclcount
)
;
lcloutindex[queueindex]
=
(
y
<<
16
)
|
x
;
}
if
(
lcl_id
<
queuecount
)
{
int
temp
=
lcloutindex[lcl_id
<<
1]
;
int
x
=
temp
&
0xffff
;
int
y
=
(
temp
&
(
int
)
0xffff0000
)
>>
16
;
temp
=
glboutindex[0]
;
int4
candidate_result
;
candidate_result.zw
=
(
int2
)
convert_int_rtn
(
factor
*
20.f
)
;
candidate_result.x
=
x
;
candidate_result.y
=
y
;
atomic_inc
(
glboutindex
)
;
candidate[outputoff
+
temp
+
lcl_id]
=
candidate_result
;
}
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
int
queuecount
=
lclcount[0]
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
}
if
(
lcl_id
<
queuecount
)
{
int
temp
=
lcloutindex[lcl_id]
;
int
x
=
temp
&
0xffff
;
int
y
=
(
temp
&
(
int
)
0xffff0000
)
>>
16
;
temp
=
atomic_inc
(
glboutindex
)
;
int4
candidate_result
;
candidate_result.zw
=
(
int2
)
convert_int_rtn
(
factor
*
20.f
)
;
candidate_result.x
=
x
;
candidate_result.y
=
y
;
candidate[outputoff
+
temp
+
lcl_id]
=
candidate_result
;
}
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
}
}
}
}
__kernel
void
gpuscaleclassifier
(
global
GpuHidHaarTreeNode
*orinode,
global
GpuHidHaarTreeNode
*newnode,
float
scale,
float
weight_scale,
int
nodenum
)
{
int
counter
=
get_global_id
(
0
)
;
int
tr_x[3],
tr_y[3],
tr_h[3],
tr_w[3],
i
=
0
;
GpuHidHaarTreeNode
t1
=
*
(
orinode
+
counter
)
;
int
counter
=
get_global_id
(
0
)
;
int
tr_x[3],
tr_y[3],
tr_h[3],
tr_w[3],
i
=
0
;
GpuHidHaarTreeNode
t1
=
*
(
orinode
+
counter
)
;
#
pragma
unroll
for
(
i
=
0
; i < 3; i++)
{
tr_x[i]
=
(
int
)(
t1.p[i][0]
*
scale
+
0.5f
)
;
tr_y[i]
=
(
int
)(
t1.p[i][1]
*
scale
+
0.5f
)
;
tr_w[i]
=
(
int
)(
t1.p[i][2]
*
scale
+
0.5f
)
;
tr_h[i]
=
(
int
)(
t1.p[i][3]
*
scale
+
0.5f
)
;
}
for
(
i
=
0
; i < 3; i++)
{
tr_x[i]
=
(
int
)(
t1.p[i][0]
*
scale
+
0.5f
)
;
tr_y[i]
=
(
int
)(
t1.p[i][1]
*
scale
+
0.5f
)
;
tr_w[i]
=
(
int
)(
t1.p[i][2]
*
scale
+
0.5f
)
;
tr_h[i]
=
(
int
)(
t1.p[i][3]
*
scale
+
0.5f
)
;
}
t1.weight[0]
=
t1.p[2][0]
?
-
(
t1.weight[1]
*
tr_h[1]
*
tr_w[1]
+
t1.weight[2]
*
tr_h[2]
*
tr_w[2]
)
/
(
tr_h[0]
*
tr_w[0]
)
:
-t1.weight[1]
*
tr_h[1]
*
tr_w[1]
/
(
tr_h[0]
*
tr_w[0]
)
;
counter
+=
nodenum
;
t1.weight[0]
=
t1.p[2][0]
?
-
(
t1.weight[1]
*
tr_h[1]
*
tr_w[1]
+
t1.weight[2]
*
tr_h[2]
*
tr_w[2]
)
/
(
tr_h[0]
*
tr_w[0]
)
:
-t1.weight[1]
*
tr_h[1]
*
tr_w[1]
/
(
tr_h[0]
*
tr_w[0]
)
;
counter
+=
nodenum
;
#
pragma
unroll
for
(
i
=
0
; i < 3; i++)
{
newnode[counter].p[i][0]
=
tr_x[i]
;
newnode[counter].p[i][1]
=
tr_y[i]
;
newnode[counter].p[i][2]
=
tr_x[i]
+
tr_w[i]
;
newnode[counter].p[i][3]
=
tr_y[i]
+
tr_h[i]
;
newnode[counter].weight[i]
=
t1.weight[i]
*
weight_scale
;
}
for
(
i
=
0
; i < 3; i++)
{
newnode[counter].p[i][0]
=
tr_x[i]
;
newnode[counter].p[i][1]
=
tr_y[i]
;
newnode[counter].p[i][2]
=
tr_x[i]
+
tr_w[i]
;
newnode[counter].p[i][3]
=
tr_y[i]
+
tr_h[i]
;
newnode[counter].weight[i]
=
t1.weight[i]
*
weight_scale
;
}
newnode[counter].left
=
t1.left
;
newnode[counter].right
=
t1.right
;
newnode[counter].threshold
=
t1.threshold
;
newnode[counter].alpha[0]
=
t1.alpha[0]
;
newnode[counter].alpha[1]
=
t1.alpha[1]
;
newnode[counter].left
=
t1.left
;
newnode[counter].right
=
t1.right
;
newnode[counter].threshold
=
t1.threshold
;
newnode[counter].alpha[0]
=
t1.alpha[0]
;
newnode[counter].alpha[1]
=
t1.alpha[1]
;
}
modules/ocl/test/test_haar.cpp
View file @
69a0b5dd
...
...
@@ -16,6 +16,7 @@
//
// @Authors
// Jia Haipeng, jiahaipeng95@gmail.com
// Sen Liu, swjutls1987@126.com
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
...
...
@@ -61,40 +62,31 @@ struct getRect
}
};
PARAM_TEST_CASE
(
Haar
TestBase
,
int
,
int
)
PARAM_TEST_CASE
(
Haar
,
double
,
int
)
{
//std::vector<cv::ocl::Info> oclinfo;
cv
::
ocl
::
OclCascadeClassifier
cascade
,
nestedCascade
;
cv
::
ocl
::
OclCascadeClassifierBuf
cascadebuf
;
cv
::
CascadeClassifier
cpucascade
,
cpunestedCascade
;
// Mat img;
double
scale
;
int
index
;
int
flags
;
virtual
void
SetUp
()
{
scale
=
1.0
;
index
=
0
;
scale
=
GET_PARAM
(
0
)
;
flags
=
GET_PARAM
(
1
)
;
string
cascadeName
=
workdir
+
"../../data/haarcascades/haarcascade_frontalface_alt.xml"
;
if
(
(
!
cascade
.
load
(
cascadeName
))
||
(
!
cpucascade
.
load
(
cascadeName
)))
if
(
(
!
cascade
.
load
(
cascadeName
))
||
(
!
cpucascade
.
load
(
cascadeName
))
||
(
!
cascadebuf
.
load
(
cascadeName
))
)
{
cout
<<
"ERROR: Could not load classifier cascade"
<<
endl
;
return
;
}
//int devnums = getDevice(oclinfo);
//CV_Assert(devnums>0);
////if you want to use undefault device, set it here
////setDevice(oclinfo[0]);
//cv::ocl::setBinpath("E:\\");
}
};
////////////////////////////////faceDetect/////////////////////////////////////////////////
struct
Haar
:
HaarTestBase
{};
TEST_F
(
Haar
,
FaceDetect
)
TEST_P
(
Haar
,
FaceDetect
)
{
string
imgName
=
workdir
+
"lena.jpg"
;
Mat
img
=
imread
(
imgName
,
1
);
...
...
@@ -105,59 +97,65 @@ TEST_F(Haar, FaceDetect)
return
;
}
//int i = 0;
//double t = 0;
vector
<
Rect
>
faces
,
oclfaces
;
// const static Scalar colors[] = { CV_RGB(0, 0, 255),
// CV_RGB(0, 128, 255),
// CV_RGB(0, 255, 255),
// CV_RGB(0, 255, 0),
// CV_RGB(255, 128, 0),
// CV_RGB(255, 255, 0),
// CV_RGB(255, 0, 0),
// CV_RGB(255, 0, 255)
// } ;
Mat
gray
,
smallImg
(
cvRound
(
img
.
rows
/
scale
),
cvRound
(
img
.
cols
/
scale
),
CV_8UC1
);
MemStorage
storage
(
cvCreateMemStorage
(
0
));
cvtColor
(
img
,
gray
,
CV_BGR2GRAY
);
resize
(
gray
,
smallImg
,
smallImg
.
size
(),
0
,
0
,
INTER_LINEAR
);
equalizeHist
(
smallImg
,
smallImg
);
cv
::
ocl
::
oclMat
image
;
CvSeq
*
_objects
;
image
.
upload
(
smallImg
);
_objects
=
cascade
.
oclHaarDetectObjects
(
image
,
storage
,
1.1
,
3
,
0
|
CV_HAAR_SCALE_IMAGE
,
Size
(
30
,
30
),
Size
(
0
,
0
)
);
3
,
flags
,
Size
(
30
,
30
),
Size
(
0
,
0
)
);
vector
<
CvAvgComp
>
vecAvgComp
;
Seq
<
CvAvgComp
>
(
_objects
).
copyTo
(
vecAvgComp
);
oclfaces
.
resize
(
vecAvgComp
.
size
());
std
::
transform
(
vecAvgComp
.
begin
(),
vecAvgComp
.
end
(),
oclfaces
.
begin
(),
getRect
());
cpucascade
.
detectMultiScale
(
smallImg
,
faces
,
1.1
,
3
,
0
|
CV_HAAR_SCALE_IMAGE
,
Size
(
30
,
30
),
Size
(
0
,
0
)
);
cpucascade
.
detectMultiScale
(
smallImg
,
faces
,
1.1
,
3
,
flags
,
Size
(
30
,
30
),
Size
(
0
,
0
)
);
EXPECT_EQ
(
faces
.
size
(),
oclfaces
.
size
());
/* for( vector<Rect>::const_iterator r = faces.begin(); r != faces.end(); r++, i++ )
}
TEST_P
(
Haar
,
FaceDetectUseBuf
)
{
string
imgName
=
workdir
+
"lena.jpg"
;
Mat
img
=
imread
(
imgName
,
1
);
if
(
img
.
empty
())
{
Mat smallImgROI;
Point center;
Scalar color = colors[i%8];
int radius;
center.x = cvRound((r->x + r->width*0.5)*scale);
center.y = cvRound((r->y + r->height*0.5)*scale);
radius = cvRound((r->width + r->height)*0.25*scale);
circle( img, center, radius, color, 3, 8, 0 );
} */
//namedWindow("result");
//imshow("result",img);
//waitKey(0);
//destroyAllWindows();
std
::
cout
<<
"Couldn't read "
<<
imgName
<<
std
::
endl
;
return
;
}
vector
<
Rect
>
faces
,
oclfaces
;
Mat
gray
,
smallImg
(
cvRound
(
img
.
rows
/
scale
),
cvRound
(
img
.
cols
/
scale
),
CV_8UC1
);
MemStorage
storage
(
cvCreateMemStorage
(
0
));
cvtColor
(
img
,
gray
,
CV_BGR2GRAY
);
resize
(
gray
,
smallImg
,
smallImg
.
size
(),
0
,
0
,
INTER_LINEAR
);
equalizeHist
(
smallImg
,
smallImg
);
cv
::
ocl
::
oclMat
image
;
image
.
upload
(
smallImg
);
cascadebuf
.
detectMultiScale
(
image
,
oclfaces
,
1.1
,
3
,
flags
,
Size
(
30
,
30
),
Size
(
0
,
0
)
);
cascadebuf
.
release
();
cpucascade
.
detectMultiScale
(
smallImg
,
faces
,
1.1
,
3
,
flags
,
Size
(
30
,
30
),
Size
(
0
,
0
)
);
EXPECT_EQ
(
faces
.
size
(),
oclfaces
.
size
());
}
INSTANTIATE_TEST_CASE_P
(
FaceDetect
,
Haar
,
Combine
(
Values
(
1.0
),
Values
(
CV_HAAR_SCALE_IMAGE
,
0
)));
#endif // HAVE_OPENCL
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment