Commit efa84d82 authored by Yan Wang

Use preprocessor definitions for constant values in the OpenCL kernels instead of

kernel parameter variables.

This can improve the performance of
OCL_Cascade_Image_MinSize_CascadeClassifier.CascadeClassifier/*.
In particular,
OCL_Cascade_Image_MinSize_CascadeClassifier.CascadeClassifier/15 and
OCL_Cascade_Image_MinSize_CascadeClassifier.CascadeClassifier/16
improve by about 2% on the Intel platform.
Signed-off-by: Yan Wang <yan.wang@linux.intel.com>
parent 009aec51
......@@ -1060,6 +1060,7 @@ bool CascadeClassifierImpl::ocl_detectMultiScaleNoGrouping( const std::vector<fl
}
int nstages = (int)data.stages.size();
int splitstage_ocl = 1;
if( featureType == FeatureEvaluator::HAAR )
{
......@@ -1071,11 +1072,11 @@ bool CascadeClassifierImpl::ocl_detectMultiScaleNoGrouping( const std::vector<fl
{
String opts;
if (lbufSize.area())
opts = format("-D LOCAL_SIZE_X=%d -D LOCAL_SIZE_Y=%d -D SUM_BUF_SIZE=%d -D SUM_BUF_STEP=%d -D NODE_COUNT=%d",
localsz.width, localsz.height, lbufSize.area(), lbufSize.width, data.maxNodesPerTree);
opts = format("-D LOCAL_SIZE_X=%d -D LOCAL_SIZE_Y=%d -D SUM_BUF_SIZE=%d -D SUM_BUF_STEP=%d -D NODE_COUNT=%d -D SPLIT_STAGE=%d -D N_STAGES=%d -D MAX_FACES=%d",
localsz.width, localsz.height, lbufSize.area(), lbufSize.width, data.maxNodesPerTree, splitstage_ocl, nstages, MAX_FACES);
else
opts = format("-D LOCAL_SIZE_X=%d -D LOCAL_SIZE_Y=%d -D NODE_COUNT=%d",
localsz.width, localsz.height, data.maxNodesPerTree);
opts = format("-D LOCAL_SIZE_X=%d -D LOCAL_SIZE_Y=%d -D NODE_COUNT=%d -D SPLIT_STAGE=%d -D N_STAGES=%d -D MAX_FACES=%d",
localsz.width, localsz.height, data.maxNodesPerTree, splitstage_ocl, nstages, MAX_FACES);
haarKernel.create("runHaarClassifier", ocl::objdetect::cascadedetect_oclsrc, opts);
if( haarKernel.empty() )
return false;
......@@ -1083,7 +1084,6 @@ bool CascadeClassifierImpl::ocl_detectMultiScaleNoGrouping( const std::vector<fl
Rect normrect = haar->getNormRect();
int sqofs = haar->getSquaresOffset();
int splitstage_ocl = 1;
haarKernel.args((int)scales.size(),
ocl::KernelArg::PtrReadOnly(bufs[0]), // scaleData
......@@ -1091,13 +1091,12 @@ bool CascadeClassifierImpl::ocl_detectMultiScaleNoGrouping( const std::vector<fl
ocl::KernelArg::PtrReadOnly(bufs[2]), // optfeatures
// cascade classifier
splitstage_ocl, nstages,
ocl::KernelArg::PtrReadOnly(ustages),
ocl::KernelArg::PtrReadOnly(unodes),
ocl::KernelArg::PtrReadOnly(uleaves),
ocl::KernelArg::PtrWriteOnly(ufacepos), // positions
normrect, sqofs, data.origWinSize, (int)MAX_FACES);
normrect, sqofs, data.origWinSize);
ok = haarKernel.run(2, globalsize, localsize, true);
}
else if( featureType == FeatureEvaluator::LBP )
......@@ -1113,16 +1112,16 @@ bool CascadeClassifierImpl::ocl_detectMultiScaleNoGrouping( const std::vector<fl
{
String opts;
if (lbufSize.area())
opts = format("-D LOCAL_SIZE_X=%d -D LOCAL_SIZE_Y=%d -D SUM_BUF_SIZE=%d -D SUM_BUF_STEP=%d",
localsz.width, localsz.height, lbufSize.area(), lbufSize.width);
opts = format("-D LOCAL_SIZE_X=%d -D LOCAL_SIZE_Y=%d -D SUM_BUF_SIZE=%d -D SUM_BUF_STEP=%d -D SPLIT_STAGE=%d -D N_STAGES=%d -D MAX_FACES=%d",
localsz.width, localsz.height, lbufSize.area(), lbufSize.width, splitstage_ocl, nstages, MAX_FACES);
else
opts = format("-D LOCAL_SIZE_X=%d -D LOCAL_SIZE_Y=%d", localsz.width, localsz.height);
opts = format("-D LOCAL_SIZE_X=%d -D LOCAL_SIZE_Y=%d -D SPLIT_STAGE=%d -D N_STAGES=%d -D MAX_FACES=%d",
localsz.width, localsz.height, splitstage_ocl, nstages, MAX_FACES);
lbpKernel.create("runLBPClassifierStumpSimple", ocl::objdetect::cascadedetect_oclsrc, opts);
if( lbpKernel.empty() )
return false;
}
int splitstage_ocl = 1;
int subsetSize = (data.ncategories + 31)/32;
lbpKernel.args((int)scales.size(),
ocl::KernelArg::PtrReadOnly(bufs[0]), // scaleData
......@@ -1130,14 +1129,13 @@ bool CascadeClassifierImpl::ocl_detectMultiScaleNoGrouping( const std::vector<fl
ocl::KernelArg::PtrReadOnly(bufs[2]), // optfeatures
// cascade classifier
splitstage_ocl, nstages,
ocl::KernelArg::PtrReadOnly(ustages),
ocl::KernelArg::PtrReadOnly(unodes),
ocl::KernelArg::PtrReadOnly(usubsets),
subsetSize,
ocl::KernelArg::PtrWriteOnly(ufacepos), // positions
data.origWinSize, (int)MAX_FACES);
data.origWinSize);
ok = lbpKernel.run(2, globalsize, localsize, true);
}
......
......@@ -70,14 +70,12 @@ void runHaarClassifier(
__global const int* sum,
int _sumstep, int sumoffset,
__global const OptHaarFeature* optfeatures,
int splitstage, int nstages,
__global const Stage* stages,
__global const Node* nodes,
__global const float* leaves0,
volatile __global int* facepos,
int4 normrect, int sqofs, int2 windowsize, int maxFaces)
int4 normrect, int sqofs, int2 windowsize)
{
int lx = get_local_id(0);
int ly = get_local_id(1);
......@@ -165,7 +163,7 @@ void runHaarClassifier(
float nf = (float)normarea * sqrt(max(sqval - sval * sval, 0.f));
nf = nf > 0 ? nf : 1.f;
for( stageIdx = 0; stageIdx < splitstage; stageIdx++ )
for( stageIdx = 0; stageIdx < SPLIT_STAGE; stageIdx++ )
{
int ntrees = stages[stageIdx].ntrees;
float s = 0.f;
......@@ -221,7 +219,7 @@ void runHaarClassifier(
break;
}
if( stageIdx == splitstage && (ystep == 1 || ((ix | iy) & 1) == 0) )
if( stageIdx == SPLIT_STAGE && (ystep == 1 || ((ix | iy) & 1) == 0) )
{
int count = atomic_inc(lcount);
lbuf[count] = (int)(ix | (iy << 8));
......@@ -229,7 +227,7 @@ void runHaarClassifier(
}
}
for( stageIdx = splitstage; stageIdx < nstages; stageIdx++ )
for( stageIdx = SPLIT_STAGE; stageIdx < N_STAGES; stageIdx++ )
{
int nrects = lcount[0];
......@@ -335,13 +333,13 @@ void runHaarClassifier(
}
barrier(CLK_LOCAL_MEM_FENCE);
if( stageIdx == nstages )
if( stageIdx == N_STAGES )
{
int nrects = lcount[0];
if( lidx < nrects )
{
int nfaces = atomic_inc(facepos);
if( nfaces < maxFaces )
if( nfaces < MAX_FACES )
{
volatile __global int* face = facepos + 1 + nfaces*3;
int val = lbuf[lidx];
......@@ -364,15 +362,13 @@ __kernel void runLBPClassifierStumpSimple(
__global const int* sum,
int _sumstep, int sumoffset,
__global const OptLBPFeature* optfeatures,
int splitstage, int nstages,
__global const Stage* stages,
__global const Stump* stumps,
__global const int* bitsets,
int bitsetSize,
volatile __global int* facepos,
int2 windowsize, int maxFaces)
int2 windowsize)
{
int lx = get_local_id(0);
int ly = get_local_id(1);
......@@ -381,7 +377,6 @@ __kernel void runLBPClassifierStumpSimple(
int groupIdx = get_group_id(1)*get_num_groups(0) + get_group_id(0);
int ngroups = get_num_groups(0)*get_num_groups(1);
int scaleIdx, tileIdx, stageIdx;
int startStage = 0, endStage = nstages;
int sumstep = (int)(_sumstep/sizeof(int));
for( scaleIdx = nscales-1; scaleIdx >= 0; scaleIdx-- )
......@@ -404,7 +399,7 @@ __kernel void runLBPClassifierStumpSimple(
__global const Stump* stump = stumps;
__global const int* bitset = bitsets;
for( stageIdx = 0; stageIdx < endStage; stageIdx++ )
for( stageIdx = 0; stageIdx < N_STAGES; stageIdx++ )
{
int i, ntrees = stages[stageIdx].ntrees;
float s = 0.f;
......@@ -433,10 +428,10 @@ __kernel void runLBPClassifierStumpSimple(
break;
}
if( stageIdx == nstages )
if( stageIdx == N_STAGES )
{
int nfaces = atomic_inc(facepos);
if( nfaces < maxFaces )
if( nfaces < MAX_FACES )
{
volatile __global int* face = facepos + 1 + nfaces*3;
face[0] = scaleIdx;
......@@ -455,15 +450,13 @@ void runLBPClassifierStump(
__global const int* sum,
int _sumstep, int sumoffset,
__global const OptLBPFeature* optfeatures,
int splitstage, int nstages,
__global const Stage* stages,
__global const Stump* stumps,
__global const int* bitsets,
int bitsetSize,
volatile __global int* facepos,
int2 windowsize, int maxFaces)
int2 windowsize)
{
int lx = get_local_id(0);
int ly = get_local_id(1);
......@@ -525,7 +518,7 @@ void runLBPClassifierStump(
__global const int* p = psum0 + mad24(iy, sumstep, ix);
#endif
for( stageIdx = 0; stageIdx < splitstage; stageIdx++ )
for( stageIdx = 0; stageIdx < SPLIT_STAGE; stageIdx++ )
{
int ntrees = stages[stageIdx].ntrees;
float s = 0.f;
......@@ -554,14 +547,14 @@ void runLBPClassifierStump(
break;
}
if( stageIdx == splitstage && (ystep == 1 || ((ix | iy) & 1) == 0) )
if( stageIdx == SPLIT_STAGE && (ystep == 1 || ((ix | iy) & 1) == 0) )
{
int count = atomic_inc(lcount);
lbuf[count] = (int)(ix | (iy << 8));
}
}
for( stageIdx = splitstage; stageIdx < nstages; stageIdx++ )
for( stageIdx = SPLIT_STAGE; stageIdx < N_STAGES; stageIdx++ )
{
int nrects = lcount[0];
......@@ -639,13 +632,13 @@ void runLBPClassifierStump(
}
barrier(CLK_LOCAL_MEM_FENCE);
if( stageIdx == nstages )
if( stageIdx == N_STAGES )
{
int nrects = lcount[0];
if( lidx < nrects )
{
int nfaces = atomic_inc(facepos);
if( nfaces < maxFaces )
if( nfaces < MAX_FACES )
{
volatile __global int* face = facepos + 1 + nfaces*3;
int val = lbuf[lidx];
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment