Commit 060d6751 authored by Vadim Pisarevsky's avatar Vadim Pisarevsky

Merge pull request #3518 from wangyan42164:ocl_cascade_detect

parents 199f1aec efa84d82
...@@ -1060,6 +1060,7 @@ bool CascadeClassifierImpl::ocl_detectMultiScaleNoGrouping( const std::vector<fl ...@@ -1060,6 +1060,7 @@ bool CascadeClassifierImpl::ocl_detectMultiScaleNoGrouping( const std::vector<fl
} }
int nstages = (int)data.stages.size(); int nstages = (int)data.stages.size();
int splitstage_ocl = 1;
if( featureType == FeatureEvaluator::HAAR ) if( featureType == FeatureEvaluator::HAAR )
{ {
...@@ -1071,11 +1072,11 @@ bool CascadeClassifierImpl::ocl_detectMultiScaleNoGrouping( const std::vector<fl ...@@ -1071,11 +1072,11 @@ bool CascadeClassifierImpl::ocl_detectMultiScaleNoGrouping( const std::vector<fl
{ {
String opts; String opts;
if (lbufSize.area()) if (lbufSize.area())
opts = format("-D LOCAL_SIZE_X=%d -D LOCAL_SIZE_Y=%d -D SUM_BUF_SIZE=%d -D SUM_BUF_STEP=%d -D NODE_COUNT=%d", opts = format("-D LOCAL_SIZE_X=%d -D LOCAL_SIZE_Y=%d -D SUM_BUF_SIZE=%d -D SUM_BUF_STEP=%d -D NODE_COUNT=%d -D SPLIT_STAGE=%d -D N_STAGES=%d -D MAX_FACES=%d",
localsz.width, localsz.height, lbufSize.area(), lbufSize.width, data.maxNodesPerTree); localsz.width, localsz.height, lbufSize.area(), lbufSize.width, data.maxNodesPerTree, splitstage_ocl, nstages, MAX_FACES);
else else
opts = format("-D LOCAL_SIZE_X=%d -D LOCAL_SIZE_Y=%d -D NODE_COUNT=%d", opts = format("-D LOCAL_SIZE_X=%d -D LOCAL_SIZE_Y=%d -D NODE_COUNT=%d -D SPLIT_STAGE=%d -D N_STAGES=%d -D MAX_FACES=%d",
localsz.width, localsz.height, data.maxNodesPerTree); localsz.width, localsz.height, data.maxNodesPerTree, splitstage_ocl, nstages, MAX_FACES);
haarKernel.create("runHaarClassifier", ocl::objdetect::cascadedetect_oclsrc, opts); haarKernel.create("runHaarClassifier", ocl::objdetect::cascadedetect_oclsrc, opts);
if( haarKernel.empty() ) if( haarKernel.empty() )
return false; return false;
...@@ -1083,7 +1084,6 @@ bool CascadeClassifierImpl::ocl_detectMultiScaleNoGrouping( const std::vector<fl ...@@ -1083,7 +1084,6 @@ bool CascadeClassifierImpl::ocl_detectMultiScaleNoGrouping( const std::vector<fl
Rect normrect = haar->getNormRect(); Rect normrect = haar->getNormRect();
int sqofs = haar->getSquaresOffset(); int sqofs = haar->getSquaresOffset();
int splitstage_ocl = 1;
haarKernel.args((int)scales.size(), haarKernel.args((int)scales.size(),
ocl::KernelArg::PtrReadOnly(bufs[0]), // scaleData ocl::KernelArg::PtrReadOnly(bufs[0]), // scaleData
...@@ -1091,13 +1091,12 @@ bool CascadeClassifierImpl::ocl_detectMultiScaleNoGrouping( const std::vector<fl ...@@ -1091,13 +1091,12 @@ bool CascadeClassifierImpl::ocl_detectMultiScaleNoGrouping( const std::vector<fl
ocl::KernelArg::PtrReadOnly(bufs[2]), // optfeatures ocl::KernelArg::PtrReadOnly(bufs[2]), // optfeatures
// cascade classifier // cascade classifier
splitstage_ocl, nstages,
ocl::KernelArg::PtrReadOnly(ustages), ocl::KernelArg::PtrReadOnly(ustages),
ocl::KernelArg::PtrReadOnly(unodes), ocl::KernelArg::PtrReadOnly(unodes),
ocl::KernelArg::PtrReadOnly(uleaves), ocl::KernelArg::PtrReadOnly(uleaves),
ocl::KernelArg::PtrWriteOnly(ufacepos), // positions ocl::KernelArg::PtrWriteOnly(ufacepos), // positions
normrect, sqofs, data.origWinSize, (int)MAX_FACES); normrect, sqofs, data.origWinSize);
ok = haarKernel.run(2, globalsize, localsize, true); ok = haarKernel.run(2, globalsize, localsize, true);
} }
else if( featureType == FeatureEvaluator::LBP ) else if( featureType == FeatureEvaluator::LBP )
...@@ -1113,16 +1112,16 @@ bool CascadeClassifierImpl::ocl_detectMultiScaleNoGrouping( const std::vector<fl ...@@ -1113,16 +1112,16 @@ bool CascadeClassifierImpl::ocl_detectMultiScaleNoGrouping( const std::vector<fl
{ {
String opts; String opts;
if (lbufSize.area()) if (lbufSize.area())
opts = format("-D LOCAL_SIZE_X=%d -D LOCAL_SIZE_Y=%d -D SUM_BUF_SIZE=%d -D SUM_BUF_STEP=%d", opts = format("-D LOCAL_SIZE_X=%d -D LOCAL_SIZE_Y=%d -D SUM_BUF_SIZE=%d -D SUM_BUF_STEP=%d -D SPLIT_STAGE=%d -D N_STAGES=%d -D MAX_FACES=%d",
localsz.width, localsz.height, lbufSize.area(), lbufSize.width); localsz.width, localsz.height, lbufSize.area(), lbufSize.width, splitstage_ocl, nstages, MAX_FACES);
else else
opts = format("-D LOCAL_SIZE_X=%d -D LOCAL_SIZE_Y=%d", localsz.width, localsz.height); opts = format("-D LOCAL_SIZE_X=%d -D LOCAL_SIZE_Y=%d -D SPLIT_STAGE=%d -D N_STAGES=%d -D MAX_FACES=%d",
localsz.width, localsz.height, splitstage_ocl, nstages, MAX_FACES);
lbpKernel.create("runLBPClassifierStumpSimple", ocl::objdetect::cascadedetect_oclsrc, opts); lbpKernel.create("runLBPClassifierStumpSimple", ocl::objdetect::cascadedetect_oclsrc, opts);
if( lbpKernel.empty() ) if( lbpKernel.empty() )
return false; return false;
} }
int splitstage_ocl = 1;
int subsetSize = (data.ncategories + 31)/32; int subsetSize = (data.ncategories + 31)/32;
lbpKernel.args((int)scales.size(), lbpKernel.args((int)scales.size(),
ocl::KernelArg::PtrReadOnly(bufs[0]), // scaleData ocl::KernelArg::PtrReadOnly(bufs[0]), // scaleData
...@@ -1130,14 +1129,13 @@ bool CascadeClassifierImpl::ocl_detectMultiScaleNoGrouping( const std::vector<fl ...@@ -1130,14 +1129,13 @@ bool CascadeClassifierImpl::ocl_detectMultiScaleNoGrouping( const std::vector<fl
ocl::KernelArg::PtrReadOnly(bufs[2]), // optfeatures ocl::KernelArg::PtrReadOnly(bufs[2]), // optfeatures
// cascade classifier // cascade classifier
splitstage_ocl, nstages,
ocl::KernelArg::PtrReadOnly(ustages), ocl::KernelArg::PtrReadOnly(ustages),
ocl::KernelArg::PtrReadOnly(unodes), ocl::KernelArg::PtrReadOnly(unodes),
ocl::KernelArg::PtrReadOnly(usubsets), ocl::KernelArg::PtrReadOnly(usubsets),
subsetSize, subsetSize,
ocl::KernelArg::PtrWriteOnly(ufacepos), // positions ocl::KernelArg::PtrWriteOnly(ufacepos), // positions
data.origWinSize, (int)MAX_FACES); data.origWinSize);
ok = lbpKernel.run(2, globalsize, localsize, true); ok = lbpKernel.run(2, globalsize, localsize, true);
} }
......
...@@ -70,14 +70,12 @@ void runHaarClassifier( ...@@ -70,14 +70,12 @@ void runHaarClassifier(
__global const int* sum, __global const int* sum,
int _sumstep, int sumoffset, int _sumstep, int sumoffset,
__global const OptHaarFeature* optfeatures, __global const OptHaarFeature* optfeatures,
int splitstage, int nstages,
__global const Stage* stages, __global const Stage* stages,
__global const Node* nodes, __global const Node* nodes,
__global const float* leaves0, __global const float* leaves0,
volatile __global int* facepos, volatile __global int* facepos,
int4 normrect, int sqofs, int2 windowsize, int maxFaces) int4 normrect, int sqofs, int2 windowsize)
{ {
int lx = get_local_id(0); int lx = get_local_id(0);
int ly = get_local_id(1); int ly = get_local_id(1);
...@@ -165,7 +163,7 @@ void runHaarClassifier( ...@@ -165,7 +163,7 @@ void runHaarClassifier(
float nf = (float)normarea * sqrt(max(sqval - sval * sval, 0.f)); float nf = (float)normarea * sqrt(max(sqval - sval * sval, 0.f));
nf = nf > 0 ? nf : 1.f; nf = nf > 0 ? nf : 1.f;
for( stageIdx = 0; stageIdx < splitstage; stageIdx++ ) for( stageIdx = 0; stageIdx < SPLIT_STAGE; stageIdx++ )
{ {
int ntrees = stages[stageIdx].ntrees; int ntrees = stages[stageIdx].ntrees;
float s = 0.f; float s = 0.f;
...@@ -221,7 +219,7 @@ void runHaarClassifier( ...@@ -221,7 +219,7 @@ void runHaarClassifier(
break; break;
} }
if( stageIdx == splitstage && (ystep == 1 || ((ix | iy) & 1) == 0) ) if( stageIdx == SPLIT_STAGE && (ystep == 1 || ((ix | iy) & 1) == 0) )
{ {
int count = atomic_inc(lcount); int count = atomic_inc(lcount);
lbuf[count] = (int)(ix | (iy << 8)); lbuf[count] = (int)(ix | (iy << 8));
...@@ -229,7 +227,7 @@ void runHaarClassifier( ...@@ -229,7 +227,7 @@ void runHaarClassifier(
} }
} }
for( stageIdx = splitstage; stageIdx < nstages; stageIdx++ ) for( stageIdx = SPLIT_STAGE; stageIdx < N_STAGES; stageIdx++ )
{ {
int nrects = lcount[0]; int nrects = lcount[0];
...@@ -335,13 +333,13 @@ void runHaarClassifier( ...@@ -335,13 +333,13 @@ void runHaarClassifier(
} }
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
if( stageIdx == nstages ) if( stageIdx == N_STAGES )
{ {
int nrects = lcount[0]; int nrects = lcount[0];
if( lidx < nrects ) if( lidx < nrects )
{ {
int nfaces = atomic_inc(facepos); int nfaces = atomic_inc(facepos);
if( nfaces < maxFaces ) if( nfaces < MAX_FACES )
{ {
volatile __global int* face = facepos + 1 + nfaces*3; volatile __global int* face = facepos + 1 + nfaces*3;
int val = lbuf[lidx]; int val = lbuf[lidx];
...@@ -364,15 +362,13 @@ __kernel void runLBPClassifierStumpSimple( ...@@ -364,15 +362,13 @@ __kernel void runLBPClassifierStumpSimple(
__global const int* sum, __global const int* sum,
int _sumstep, int sumoffset, int _sumstep, int sumoffset,
__global const OptLBPFeature* optfeatures, __global const OptLBPFeature* optfeatures,
int splitstage, int nstages,
__global const Stage* stages, __global const Stage* stages,
__global const Stump* stumps, __global const Stump* stumps,
__global const int* bitsets, __global const int* bitsets,
int bitsetSize, int bitsetSize,
volatile __global int* facepos, volatile __global int* facepos,
int2 windowsize, int maxFaces) int2 windowsize)
{ {
int lx = get_local_id(0); int lx = get_local_id(0);
int ly = get_local_id(1); int ly = get_local_id(1);
...@@ -381,7 +377,6 @@ __kernel void runLBPClassifierStumpSimple( ...@@ -381,7 +377,6 @@ __kernel void runLBPClassifierStumpSimple(
int groupIdx = get_group_id(1)*get_num_groups(0) + get_group_id(0); int groupIdx = get_group_id(1)*get_num_groups(0) + get_group_id(0);
int ngroups = get_num_groups(0)*get_num_groups(1); int ngroups = get_num_groups(0)*get_num_groups(1);
int scaleIdx, tileIdx, stageIdx; int scaleIdx, tileIdx, stageIdx;
int startStage = 0, endStage = nstages;
int sumstep = (int)(_sumstep/sizeof(int)); int sumstep = (int)(_sumstep/sizeof(int));
for( scaleIdx = nscales-1; scaleIdx >= 0; scaleIdx-- ) for( scaleIdx = nscales-1; scaleIdx >= 0; scaleIdx-- )
...@@ -404,7 +399,7 @@ __kernel void runLBPClassifierStumpSimple( ...@@ -404,7 +399,7 @@ __kernel void runLBPClassifierStumpSimple(
__global const Stump* stump = stumps; __global const Stump* stump = stumps;
__global const int* bitset = bitsets; __global const int* bitset = bitsets;
for( stageIdx = 0; stageIdx < endStage; stageIdx++ ) for( stageIdx = 0; stageIdx < N_STAGES; stageIdx++ )
{ {
int i, ntrees = stages[stageIdx].ntrees; int i, ntrees = stages[stageIdx].ntrees;
float s = 0.f; float s = 0.f;
...@@ -433,10 +428,10 @@ __kernel void runLBPClassifierStumpSimple( ...@@ -433,10 +428,10 @@ __kernel void runLBPClassifierStumpSimple(
break; break;
} }
if( stageIdx == nstages ) if( stageIdx == N_STAGES )
{ {
int nfaces = atomic_inc(facepos); int nfaces = atomic_inc(facepos);
if( nfaces < maxFaces ) if( nfaces < MAX_FACES )
{ {
volatile __global int* face = facepos + 1 + nfaces*3; volatile __global int* face = facepos + 1 + nfaces*3;
face[0] = scaleIdx; face[0] = scaleIdx;
...@@ -455,15 +450,13 @@ void runLBPClassifierStump( ...@@ -455,15 +450,13 @@ void runLBPClassifierStump(
__global const int* sum, __global const int* sum,
int _sumstep, int sumoffset, int _sumstep, int sumoffset,
__global const OptLBPFeature* optfeatures, __global const OptLBPFeature* optfeatures,
int splitstage, int nstages,
__global const Stage* stages, __global const Stage* stages,
__global const Stump* stumps, __global const Stump* stumps,
__global const int* bitsets, __global const int* bitsets,
int bitsetSize, int bitsetSize,
volatile __global int* facepos, volatile __global int* facepos,
int2 windowsize, int maxFaces) int2 windowsize)
{ {
int lx = get_local_id(0); int lx = get_local_id(0);
int ly = get_local_id(1); int ly = get_local_id(1);
...@@ -525,7 +518,7 @@ void runLBPClassifierStump( ...@@ -525,7 +518,7 @@ void runLBPClassifierStump(
__global const int* p = psum0 + mad24(iy, sumstep, ix); __global const int* p = psum0 + mad24(iy, sumstep, ix);
#endif #endif
for( stageIdx = 0; stageIdx < splitstage; stageIdx++ ) for( stageIdx = 0; stageIdx < SPLIT_STAGE; stageIdx++ )
{ {
int ntrees = stages[stageIdx].ntrees; int ntrees = stages[stageIdx].ntrees;
float s = 0.f; float s = 0.f;
...@@ -554,14 +547,14 @@ void runLBPClassifierStump( ...@@ -554,14 +547,14 @@ void runLBPClassifierStump(
break; break;
} }
if( stageIdx == splitstage && (ystep == 1 || ((ix | iy) & 1) == 0) ) if( stageIdx == SPLIT_STAGE && (ystep == 1 || ((ix | iy) & 1) == 0) )
{ {
int count = atomic_inc(lcount); int count = atomic_inc(lcount);
lbuf[count] = (int)(ix | (iy << 8)); lbuf[count] = (int)(ix | (iy << 8));
} }
} }
for( stageIdx = splitstage; stageIdx < nstages; stageIdx++ ) for( stageIdx = SPLIT_STAGE; stageIdx < N_STAGES; stageIdx++ )
{ {
int nrects = lcount[0]; int nrects = lcount[0];
...@@ -639,13 +632,13 @@ void runLBPClassifierStump( ...@@ -639,13 +632,13 @@ void runLBPClassifierStump(
} }
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
if( stageIdx == nstages ) if( stageIdx == N_STAGES )
{ {
int nrects = lcount[0]; int nrects = lcount[0];
if( lidx < nrects ) if( lidx < nrects )
{ {
int nfaces = atomic_inc(facepos); int nfaces = atomic_inc(facepos);
if( nfaces < maxFaces ) if( nfaces < MAX_FACES )
{ {
volatile __global int* face = facepos + 1 + nfaces*3; volatile __global int* face = facepos + 1 + nfaces*3;
int val = lbuf[lidx]; int val = lbuf[lidx];
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment