Commit 86d78562 authored by Marina Kolpakova

LBP: switched to texture implementation

parent b0606b05
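This commit replaces raw global-memory reads of the integral image in the LBP classification kernel with fetches through a bound 2D CUDA texture, so classifyStump and the kernel drop their integral/istep parameters and the host side gains bindIntegral/unbindIntegral entry points. Below is a minimal standalone sketch of the legacy texture-reference pattern the commit adopts; every name in it (tex_int, probe, bindAndProbe) is illustrative, not the commit's.

#include <cuda_runtime.h>

// File-scope texture reference: unnormalized coordinates, point sampling,
// clamp addressing -- the same configuration the commit gives tintegral.
texture<int, cudaTextureType2D, cudaReadModeElementType>
    tex_int(false, cudaFilterModePoint, cudaAddressModeClamp);

__global__ void probe(int* out, int x, int y)
{
    // tex2D reads go through the texture cache; the kernel no longer
    // needs the buffer pointer or its pitch as arguments.
    *out = tex2D(tex_int, x, y);
}

void bindAndProbe(const int* d_buf, int cols, int rows, size_t pitchBytes, int* d_out)
{
    cudaChannelFormatDesc desc = cudaCreateChannelDesc<int>();
    // Bind the pitched 2D buffer to the texture reference ...
    cudaBindTexture2D(0, &tex_int, d_buf, &desc, cols, rows, pitchBytes);
    probe<<<1, 1>>>(d_out, 0, 0);
    // ... and unbind once every kernel that reads it has been launched.
    cudaUnbindTexture(&tex_int);
}

A texture reference is file-scope device state, so only one buffer can be bound to it at a time; this is why the commit brackets the whole multi-scale loop with a single bind/unbind pair.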
@@ -1435,7 +1435,7 @@ public:
bool load(const std::string& filename);
void release();
int detectMultiScale(const GpuMat& image, GpuMat& scaledImageBuffer, GpuMat& objectsBuf, double scaleFactor = 1.1, int minNeighbors = 4,
int detectMultiScale(const GpuMat& image, GpuMat& objectsBuf, double scaleFactor = 1.1, int minNeighbors = 4,
cv::Size maxObjectSize = cv::Size()/*, Size minSize = Size()*/);
void preallocateIntegralBuffer(cv::Size desired);
Size getClassifierSize() const;
@@ -69,16 +69,14 @@ GPU_PERF_TEST_1(LBPClassifier, cv::gpu::DeviceInfo)
cv::gpu::GpuMat img(img_host);
cv::gpu::GpuMat gpu_rects, buffer;
cv::gpu::GpuMat gpu_rects;
cv::gpu::CascadeClassifier_GPU_LBP cascade(img.size());
ASSERT_TRUE(cascade.load(perf::TestBase::getDataPath("gpu/lbpcascade/lbpcascade_frontalface.xml")));
// cascade.detectMultiScale(img, objects_buffer);
cascade.detectMultiScale(img, buffer, gpu_rects);
cascade.detectMultiScale(img, gpu_rects);
TEST_CYCLE()
{
cascade.detectMultiScale(img, buffer, gpu_rects);
cascade.detectMultiScale(img, gpu_rects);
}
}
@@ -70,7 +70,7 @@ Size cv::gpu::CascadeClassifier_GPU_LBP::getClassifierSize() const
void cv::gpu::CascadeClassifier_GPU_LBP::preallocateIntegralBuffer(cv::Size /*desired*/) { throw_nogpu();}
void cv::gpu::CascadeClassifier_GPU_LBP::initializeBuffers(cv::Size /*frame*/) { throw_nogpu();}
int cv::gpu::CascadeClassifier_GPU_LBP::detectMultiScale(const cv::gpu::GpuMat& /*image*/, cv::gpu::GpuMat& /*scaledImageBuffer*/, cv::gpu::GpuMat& /*objectsBuf*/,
int cv::gpu::CascadeClassifier_GPU_LBP::detectMultiScale(const cv::gpu::GpuMat& /*image*/, cv::gpu::GpuMat& /*objectsBuf*/,
double /*scaleFactor*/, int /*minNeighbors*/, cv::Size /*maxObjectSize*/){ throw_nogpu(); return 0;}
#else
@@ -299,28 +299,29 @@ namespace cv { namespace gpu { namespace device
{
namespace lbp
{
void classifyStump(const DevMem2Db mstages,
const int nstages,
const DevMem2Di mnodes,
const DevMem2Df mleaves,
const DevMem2Di msubsets,
const DevMem2Db mfeatures,
const DevMem2Di integral,
const int workWidth,
const int workHeight,
const int clWidth,
const int clHeight,
float scale,
int step,
int subsetSize,
DevMem2D_<int4> objects,
unsigned int* classified);
int connectedConmonents(DevMem2D_<int4> candidates, DevMem2D_<int4> objects, int groupThreshold, float grouping_eps, unsigned int* nclasses);
void classifyStump(const DevMem2Db& mstages,
const int nstages,
const DevMem2Di& mnodes,
const DevMem2Df& mleaves,
const DevMem2Di& msubsets,
const DevMem2Db& mfeatures,
const int workWidth,
const int workHeight,
const int clWidth,
const int clHeight,
float scale,
int step,
int subsetSize,
DevMem2D_<int4> objects,
unsigned int* classified);
int connectedConmonents(DevMem2D_<int4> candidates, DevMem2D_<int4> objects, int groupThreshold, float grouping_eps, unsigned int* nclasses);
void bindIntegral(DevMem2Di integral);
void unbindIntegral();
}
}}}
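With the integral image bound to a texture, classifyStump no longer receives it as an argument; the caller must bracket all per-scale launches with the new bindIntegral/unbindIntegral pair, exactly as detectMultiScale does below. A sketch of the implied call pattern (loop body elided):

cv::gpu::device::lbp::bindIntegral(integral);      // bind once per detection pass
for (double factor = 1; ; factor *= scaleFactor)
{
    // resize the frame, recompute the integral in place inside the bound
    // buffer, then launch classifyStump for this scale ...
    break; // illustrative only
}
cv::gpu::device::lbp::unbindIntegral();            // unbind after the last scale

A single bind suffices because each iteration writes the scaled integral into a ROI of the same bound buffer rather than reallocating it.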
int cv::gpu::CascadeClassifier_GPU_LBP::detectMultiScale(const GpuMat& image, GpuMat& scaledImageBuffer, GpuMat& objects,
int cv::gpu::CascadeClassifier_GPU_LBP::detectMultiScale(const GpuMat& image, GpuMat& objects,
double scaleFactor, int groupThreshold, cv::Size maxObjectSize /*, Size minSize=Size()*/)
{
CV_Assert( scaleFactor > 1 && image.depth() == CV_8U );
@@ -332,10 +333,12 @@ int cv::gpu::CascadeClassifier_GPU_LBP::detectMultiScale(const GpuMat& image, Gp
if( !objects.empty() && objects.depth() == CV_32S)
objects.reshape(4, 1);
else
objects.create(1, defaultObjSearchNum, CV_32SC4);
GpuMat candidates(1, defaultObjSearchNum, CV_32SC4);
// GpuMat candidates(objects);
objects.create(1, image.cols >> 4, CV_32SC4);
GpuMat candidates(1, image.cols >> 1, CV_32SC4);
// GpuMat candidates(1, defaultObjSearchNum, CV_32SC4);
// used for debug
// candidates.setTo(cv::Scalar::all(0));
// objects.setTo(cv::Scalar::all(0));
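Result-buffer sizing now scales with frame width instead of the fixed defaultObjSearchNum: for a hypothetical 640-pixel-wide frame, objects is created with 640 >> 4 = 40 slots and candidates with 640 >> 1 = 320; at 16 bytes per CV_32SC4 rectangle that is 640 and 5120 bytes respectively.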
if (maxObjectSize == cv::Size())
maxObjectSize = image.size();
@@ -347,9 +350,11 @@ int cv::gpu::CascadeClassifier_GPU_LBP::detectMultiScale(const GpuMat& image, Gp
cudaMalloc(&dclassified, sizeof(int));
cudaMemcpy(dclassified, classified, sizeof(int), cudaMemcpyHostToDevice);
int step;
cv::gpu::device::lbp::bindIntegral(integral);
for( double factor = 1; ; factor *= scaleFactor )
{
// if (factor > 2.0) break;
cv::Size windowSize(cvRound(NxM.width * factor), cvRound(NxM.height * factor));
cv::Size scaledImageSize(cvRound( image.cols / factor ), cvRound( image.rows / factor ));
cv::Size processingRectSize( scaledImageSize.width - NxM.width + 1, scaledImageSize.height - NxM.height + 1 );
@@ -365,7 +370,7 @@ int cv::gpu::CascadeClassifier_GPU_LBP::detectMultiScale(const GpuMat& image, Gp
GpuMat scaledImg(resuzeBuffer, cv::Rect(0, 0, scaledImageSize.width, scaledImageSize.height));
GpuMat scaledIntegral(integral, cv::Rect(0, 0, scaledImageSize.width + 1, scaledImageSize.height + 1));
GpuMat currBuff = integralBuffer;//(integralBuffer, cv::Rect(0, 0, integralBuffer.width, integralBuffer.height));
GpuMat currBuff = integralBuffer;
cv::gpu::resize(image, scaledImg, scaledImageSize, 0, 0, CV_INTER_LINEAR);
cv::gpu::integralBuffered(scaledImg, scaledIntegral, currBuff);
@@ -373,8 +378,10 @@ int cv::gpu::CascadeClassifier_GPU_LBP::detectMultiScale(const GpuMat& image, Gp
step = (factor <= 2.) + 1;
cv::gpu::device::lbp::classifyStump(stage_mat, stage_mat.cols / sizeof(Stage), nodes_mat, leaves_mat, subsets_mat, features_mat,
scaledIntegral, processingRectSize.width, processingRectSize.height, windowSize.width, windowSize.height, factor, step, subsetSize, candidates, dclassified);
processingRectSize.width, processingRectSize.height, windowSize.width, windowSize.height, factor, step, subsetSize, candidates, dclassified);
}
cv::gpu::device::lbp::unbindIntegral();
if (groupThreshold <= 0 || objects.empty())
return 0;
cv::gpu::device::lbp::connectedConmonents(candidates, objects, groupThreshold, grouping_eps, dclassified);
@@ -48,8 +48,102 @@ namespace cv { namespace gpu { namespace device
{
namespace lbp
{
texture<int, cudaTextureType2D, cudaReadModeElementType> tintegral(false, cudaFilterModePoint, cudaAddressModeClamp);
struct LBP
{
__device__ __forceinline__ LBP(const LBP& other) {(void)other;}
__device__ __forceinline__ LBP() {}
// feature as uchar: x, y - left top; z, w - right bottom
__device__ __forceinline__ int operator() (int ty, int tx, int fh, int featurez, int& shift) const
{
int anchors[9];
anchors[0] = tex2D(tintegral, tx, ty);
anchors[1] = tex2D(tintegral, tx + featurez, ty);
anchors[0] -= anchors[1];
anchors[2] = tex2D(tintegral, tx + featurez * 2, ty);
anchors[1] -= anchors[2];
anchors[2] -= tex2D(tintegral, tx + featurez * 3, ty);
ty += fh;
anchors[3] = tex2D(tintegral, tx, ty);
anchors[4] = tex2D(tintegral, tx + featurez, ty);
anchors[3] -= anchors[4];
anchors[5] = tex2D(tintegral, tx + featurez * 2, ty);
anchors[4] -= anchors[5];
anchors[5] -= tex2D(tintegral, tx + featurez * 3, ty);
anchors[0] -= anchors[3];
anchors[1] -= anchors[4];
anchors[2] -= anchors[5];
// 0 - 2 contains s0 - s2
ty += fh;
anchors[6] = tex2D(tintegral, tx, ty);
anchors[7] = tex2D(tintegral, tx + featurez, ty);
anchors[6] -= anchors[7];
anchors[8] = tex2D(tintegral, tx + featurez * 2, ty);
anchors[7] -= anchors[8];
anchors[8] -= tex2D(tintegral, tx + featurez * 3, ty);
anchors[3] -= anchors[6];
anchors[4] -= anchors[7];
anchors[5] -= anchors[8];
// 3 - 5 contains s3 - s5
anchors[0] -= anchors[4];
anchors[1] -= anchors[4];
anchors[2] -= anchors[4];
anchors[3] -= anchors[4];
anchors[5] -= anchors[4];
int response = (~(anchors[0] >> 31)) & 4;
response |= (~(anchors[1] >> 31)) & 2;
response |= (~(anchors[2] >> 31)) & 1;
shift = (~(anchors[5] >> 31)) & 16;
shift |= (~(anchors[3] >> 31)) & 1;
ty += fh;
anchors[0] = tex2D(tintegral, tx, ty);
anchors[1] = tex2D(tintegral, tx + featurez, ty);
anchors[0] -= anchors[1];
anchors[2] = tex2D(tintegral, tx + featurez * 2, ty);
anchors[1] -= anchors[2];
anchors[2] -= tex2D(tintegral, tx + featurez * 3, ty);
anchors[6] -= anchors[0];
anchors[7] -= anchors[1];
anchors[8] -= anchors[2];
// 6 - 8 contain s6 - s8
anchors[6] -= anchors[4];
anchors[7] -= anchors[4];
anchors[8] -= anchors[4];
shift |= (~(anchors[6] >> 31)) & 2;
shift |= (~(anchors[7] >> 31)) & 4;
shift |= (~(anchors[8] >> 31)) & 8;
return response;
}
};
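The operator above folds sixteen tex2D fetches (a 4x4 grid of cell corners) into the nine cell sums s0..s8 of the 3x3 LBP grid by running differences, then compares each border cell against the centre sum s4; the sign bits form the 8-bit local binary pattern, split across the returned response (the three high bits) and the out-parameter shift (the five low bits). The identity it exploits is the standard integral-image rectangle sum; for comparison, a naive helper (name illustrative) would cost 36 fetches for the nine cells:

// Sum of the w x h cell with top-left corner (x, y), via integral image I:
//   S = I(x+w, y+h) - I(x, y+h) - I(x+w, y) + I(x, y)
__device__ __forceinline__ int cellSum(int x, int y, int w, int h)
{
    return tex2D(tintegral, x + w, y + h) - tex2D(tintegral, x, y + h)
         - tex2D(tintegral, x + w, y)     + tex2D(tintegral, x, y);
}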
void bindIntegral(DevMem2Di integral)
{
cudaChannelFormatDesc desc = cudaCreateChannelDesc<int>();
cudaSafeCall( cudaBindTexture2D(0, &tintegral, integral.ptr(), &desc, (size_t)integral.cols, (size_t)integral.rows, (size_t)integral.step));
}
void unbindIntegral()
{
cudaSafeCall( cudaUnbindTexture(&tintegral));
}
__global__ void lbp_classify_stump(const Stage* stages, const int nstages, const ClNode* nodes, const float* leaves, const int* subsets, const uchar4* features,
const int* integral, const int istep, const int workWidth,const int workHeight, const int clWidth, const int clHeight, const float scale, const int step,
/* const int* integral,const int istep, const int workWidth,const int workHeight,*/ const int clWidth, const int clHeight, const float scale, const int step,
const int subsetSize, DevMem2D_<int4> objects, unsigned int* n)
{
int x = threadIdx.x * step;
@@ -63,21 +157,18 @@ namespace cv { namespace gpu { namespace device
{
float sum = 0;
Stage stage = stages[s];
for (int t = 0; t < stage.ntrees; t++)
{
ClNode node = nodes[current_node];
uchar4 feature = features[node.featureIdx];
int c = evaluator( (y + feature.y) * istep + x + feature.x , feature.w * istep, feature.z, integral, istep);
const int* subsetIdx = subsets + (current_node * subsetSize);
int idx = (subsetIdx[c >> 5] & ( 1 << (c & 31))) ? current_leave : current_leave + 1;
int shift;
int c = evaluator(y + feature.y, x + feature.x, feature.w, feature.z, shift);
int idx = (subsets[ current_node * subsetSize + c] & ( 1 << shift)) ? current_leave : current_leave + 1;
sum += leaves[idx];
current_node += 1;
current_leave += 2;
}
if (sum < stage.threshold)
return;
}
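The subset test is algebraically unchanged by this rewrite. The old evaluator returned the full 8-bit code c, and the kernel split it on every lookup with c >> 5 and c & 31; the texture-based operator returns the two halves directly, so the split is computed once during feature evaluation:

// old: subsets[current_node * subsetSize + (code >> 5)] & (1 << (code & 31))
// new: subsets[current_node * subsetSize + c] & (1 << shift)
// where the operator returns c == code >> 5 (signs of s0..s2) and fills
// shift == code & 31 (signs of s3 and s5..s8).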
@@ -85,8 +176,8 @@ namespace cv { namespace gpu { namespace device
int4 rect;
rect.x = roundf(x * scale);
rect.y = roundf(y * scale);
rect.z = roundf(clWidth);
rect.w = roundf(clHeight);
rect.z = clWidth;
rect.w = clHeight;
#if defined (__CUDA_ARCH__) && (__CUDA_ARCH__ < 120)
int res = __atomicInc(n, 100U);
#else
@@ -178,8 +269,8 @@ namespace cv { namespace gpu { namespace device
}
}
void classifyStump(const DevMem2Db mstages, const int nstages, const DevMem2Di mnodes, const DevMem2Df mleaves, const DevMem2Di msubsets, const DevMem2Db mfeatures,
const DevMem2Di integral, const int workWidth, const int workHeight, const int clWidth, const int clHeight, float scale, int step, int subsetSize,
void classifyStump(const DevMem2Db& mstages, const int nstages, const DevMem2Di& mnodes, const DevMem2Df& mleaves, const DevMem2Di& msubsets, const DevMem2Db& mfeatures,
/*const DevMem2Di& integral,*/ const int workWidth, const int workHeight, const int clWidth, const int clHeight, float scale, int step, int subsetSize,
DevMem2D_<int4> objects, unsigned int* classified)
{
int blocks = ceilf(workHeight / (float)step);
@@ -190,11 +281,8 @@ namespace cv { namespace gpu { namespace device
const float* leaves = mleaves.ptr();
const int* subsets = msubsets.ptr();
const uchar4* features = (uchar4*)(mfeatures.ptr());
const int* integ = integral.ptr();
int istep = integral.step / sizeof(int);
lbp_classify_stump<<<blocks, threads>>>(stages, nstages, nodes, leaves, subsets, features, integ, istep,
workWidth, workHeight, clWidth, clHeight, scale, step, subsetSize, objects, classified);
lbp_classify_stump<<<blocks, threads>>>(stages, nstages, nodes, leaves, subsets, features, /*integ, istep,
workWidth, workHeight,*/ clWidth, clHeight, scale, step, subsetSize, objects, classified);
}
int connectedConmonents(DevMem2D_<int4> candidates, DevMem2D_<int4> objects, int groupThreshold, float grouping_eps, unsigned int* nclasses)
@@ -153,90 +153,8 @@ __device__ __forceinline__ T __atomicMin(T* address, T val)
__syncthreads();
// printf("tid %d label %d\n", tid, labels[tid]);
}
struct LBP
{
__device__ __forceinline__ LBP(const LBP& other) {(void)other;}
__device__ __forceinline__ LBP() {}
// feature as uchar: x, y - left top; z, w - right bottom
__device__ __forceinline__ int operator() (unsigned int y, int featurew, int featurez, const int* integral, int step) const
{
int x_off = 2 * featurez;
int anchors[9];
anchors[0] = integral[y];
anchors[1] = integral[y + featurez];
anchors[0] -= anchors[1];
anchors[2] = integral[y + x_off];
anchors[1] -= anchors[2];
anchors[2] -= integral[y + featurez + x_off];
y += featurew;
anchors[3] = integral[y];
anchors[4] = integral[y + featurez];
anchors[3] -= anchors[4];
anchors[5] = integral[y + x_off];
anchors[4] -= anchors[5];
anchors[5] -= integral[y + featurez + x_off];
anchors[0] -= anchors[3];
anchors[1] -= anchors[4];
anchors[2] -= anchors[5];
// 0 - 2 contains s0 - s2
y += featurew;
anchors[6] = integral[y];
anchors[7] = integral[y + featurez];
anchors[6] -= anchors[7];
anchors[8] = integral[y + x_off];
anchors[7] -= anchors[8];
anchors[8] -= integral[y + x_off + featurez];
anchors[3] -= anchors[6];
anchors[4] -= anchors[7];
anchors[5] -= anchors[8];
// 3 - 5 contains s3 - s5
anchors[0] -= anchors[4];
anchors[1] -= anchors[4];
anchors[2] -= anchors[4];
anchors[3] -= anchors[4];
anchors[5] -= anchors[4];
int response = (~(anchors[0] >> 31)) & 128;
response |= (~(anchors[1] >> 31)) & 64;
response |= (~(anchors[2] >> 31)) & 32;
response |= (~(anchors[5] >> 31)) & 16;
response |= (~(anchors[3] >> 31)) & 1;
y += featurew;
anchors[0] = integral[y];
anchors[1] = integral[y + featurez];
anchors[0] -= anchors[1];
anchors[2] = integral[y + x_off];
anchors[1] -= anchors[2];
anchors[2] -= integral[y + x_off + featurez];
anchors[6] -= anchors[0];
anchors[7] -= anchors[1];
anchors[8] -= anchors[2];
// 6 - 8 contain s6 - s8
anchors[6] -= anchors[4];
anchors[7] -= anchors[4];
anchors[8] -= anchors[4];
response |= (~(anchors[6] >> 31)) & 2;
response |= (~(anchors[7] >> 31)) & 4;
response |= (~(anchors[8] >> 31)) & 8;
return response;
}
};
} // lbp
} } }// namespaces
#endif
\ No newline at end of file
@@ -343,15 +343,16 @@ TEST_P(LBP_classify, Accuracy)
cv::gpu::CascadeClassifier_GPU_LBP gpuClassifier;
ASSERT_TRUE(gpuClassifier.load(classifierXmlPath));
cv::gpu::GpuMat gpu_rects, buffer;
cv::gpu::GpuMat gpu_rects;
cv::gpu::GpuMat tested(grey);
int count = gpuClassifier.detectMultiScale(tested, buffer, gpu_rects);
int count = gpuClassifier.detectMultiScale(tested, gpu_rects);
cv::Mat gpu_f(gpu_rects);
int* gpu_faces = (int*)gpu_f.ptr();
for (int i = 0; i < count; i++)
{
cv::Rect r(gpu_faces[i * 4], gpu_faces[i * 4 + 1], gpu_faces[i * 4 + 2], gpu_faces[i * 4 + 3]);
std::cout << gpu_faces[i * 4] << " " << gpu_faces[i * 4 + 1] << " " << gpu_faces[i * 4 + 2] << " " << gpu_faces[i * 4 + 3] << std::endl;
cv::rectangle(markedImage, r, cv::Scalar(0, 0, 255, 255));
}
}