Commit e63ab8de authored by Marina Kolpakova

LBP: multiscale approach; refactored atomics usage

parent 5dc7752d
@@ -1464,6 +1464,7 @@ private:
GpuMat resuzeBuffer;
GpuMat candidates;
static const int integralFactor = 4;
};
////////////////////////////////// SURF //////////////////////////////////////////
@@ -67,7 +67,7 @@ cv::gpu::CascadeClassifier_GPU_LBP::~CascadeClassifier_GPU_LBP()
bool cv::gpu::CascadeClassifier_GPU_LBP::empty() const { throw_nogpu(); return true; }
bool cv::gpu::CascadeClassifier_GPU_LBP::load(const string&) { throw_nogpu(); return true; }
Size cv::gpu::CascadeClassifier_GPU_LBP::getClassifierSize() const { throw_nogpu(); return Size(); }
void cv::gpu::CascadeClassifier_GPU_LBP::allocateBuffers(cv::Size /*frame*/) { throw_nogpu();}
int cv::gpu::CascadeClassifier_GPU_LBP::detectMultiScale(const cv::gpu::GpuMat& /*image*/, cv::gpu::GpuMat& /*objectsBuf*/,
double /*scaleFactor*/, int /*minNeighbors*/, cv::Size /*maxObjectSize*/){ throw_nogpu(); return 0;}
@@ -86,7 +86,7 @@ void cv::gpu::CascadeClassifier_GPU_LBP::allocateBuffers(cv::Size frame)
{
resuzeBuffer.create(frame, CV_8UC1);
integral.create(frame.height + 1, frame.width + 1, CV_32SC1);
integral.create(frame.height + 1, integralFactor * (frame.width + 1), CV_32SC1);
NcvSize32u roiSize;
roiSize.width = frame.width;
roiSize.height = frame.height;
@@ -284,14 +284,83 @@ namespace cv { namespace gpu { namespace device
DevMem2D_<int4> objects,
unsigned int* classified);
void classifyPyramid(int frameW,
int frameH,
int windowW,
int windowH,
float initalScale,
float factor,
int total,
const DevMem2Db& mstages,
const int nstages,
const DevMem2Di& mnodes,
const DevMem2Df& mleaves,
const DevMem2Di& msubsets,
const DevMem2Db& mfeatures,
const int subsetSize,
DevMem2D_<int4> objects,
unsigned int* classified,
DevMem2Di integral);
void connectedConmonents(DevMem2D_<int4> candidates, int ncandidates, DevMem2D_<int4> objects,int groupThreshold, float grouping_eps, unsigned int* nclasses);
void bindIntegral(DevMem2Di integral);
void unbindIntegral();
}
}}}
int cv::gpu::CascadeClassifier_GPU_LBP::detectMultiScale(const GpuMat& image, GpuMat& objects,
double scaleFactor, int groupThreshold, cv::Size maxObjectSize /*, Size minSize=Size()*/)
cv::Size operator -(const cv::Size& a, const cv::Size& b)
{
return cv::Size(a.width - b.width, a.height - b.height);
}
cv::Size operator +(const cv::Size& a, const int& i)
{
return cv::Size(a.width + i, a.height + i);
}
cv::Size operator *(const cv::Size& a, const float& f)
{
return cv::Size(cvRound(a.width * f), cvRound(a.height * f));
}
cv::Size operator /(const cv::Size& a, const float& f)
{
return cv::Size(cvRound(a.width / f), cvRound(a.height / f));
}
bool operator <=(const cv::Size& a, const cv::Size& b)
{
return a.width <= b.width && a.height <= b.height;
}
struct PyrLavel
{
PyrLavel(int _order, float _scale, cv::Size frame, cv::Size window) : order(_order)
{
scale = pow(_scale, order);
sFrame = frame / scale;
workArea = sFrame - window + 1;
sWindow = window * scale;
}
bool isFeasible(cv::Size maxObj)
{
return workArea.width > 0 && workArea.height > 0 && sWindow <= maxObj;
}
PyrLavel next(float factor, cv::Size frame, cv::Size window)
{
return PyrLavel(order + 1, factor, frame, window);
}
int order;
float scale;
cv::Size sFrame;
cv::Size workArea;
cv::Size sWindow;
};
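For reference, the following host-side sketch walks the same arithmetic as PyrLavel: scale = factor^order, the frame shrunk by that scale, the work area left after sliding the window, and the window mapped back to original image coordinates. The 640x480 frame, 24x24 window and 1.2 factor are illustrative values only, not taken from this commit.

#include <cstdio>
#include <cmath>
#include <opencv2/core/core.hpp>

int main()
{
    const cv::Size frame(640, 480), window(24, 24);   // hypothetical values
    const float factor = 1.2f;

    for (int order = 0; ; ++order)
    {
        float scale = std::pow(factor, (float)order);
        cv::Size sFrame(cvRound(frame.width / scale), cvRound(frame.height / scale));
        cv::Size workArea(sFrame.width - window.width + 1, sFrame.height - window.height + 1);
        cv::Size sWindow(cvRound(window.width * scale), cvRound(window.height * scale));

        if (workArea.width <= 0 || workArea.height <= 0)   // level no longer feasible (maxObjectSize check omitted)
            break;

        std::printf("order %d: scale %.2f, scaled frame %dx%d, work area %dx%d, window %dx%d\n",
                    order, scale, sFrame.width, sFrame.height,
                    workArea.width, workArea.height, sWindow.width, sWindow.height);
    }
    return 0;
}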
int cv::gpu::CascadeClassifier_GPU_LBP::detectMultiScale(const GpuMat& image, GpuMat& objects, double scaleFactor, int groupThreshold, cv::Size maxObjectSize)
{
CV_Assert(!empty() && scaleFactor > 1 && image.depth() == CV_8U);
@@ -306,6 +375,7 @@ int cv::gpu::CascadeClassifier_GPU_LBP::detectMultiScale(const GpuMat& image, Gp
// used for debug
// candidates.setTo(cv::Scalar::all(0));
// objects.setTo(cv::Scalar::all(0));
if (maxObjectSize == cv::Size())
maxObjectSize = image.size();
@@ -315,52 +385,54 @@ int cv::gpu::CascadeClassifier_GPU_LBP::detectMultiScale(const GpuMat& image, Gp
GpuMat dclassified(1, 1, CV_32S);
cudaSafeCall( cudaMemcpy(dclassified.ptr(), &classified, sizeof(int), cudaMemcpyHostToDevice) );
// cv::gpu::device::lbp::bindIntegral(integral);
PyrLavel level(0, 1.0f, image.size(), NxM);
Size scaledImageSize(image.cols, image.rows);
Size processingRectSize( scaledImageSize.width - NxM.width + 1, scaledImageSize.height - NxM.height + 1 );
Size windowSize(NxM.width, NxM.height);
float factor = 1;
for (;;)
while (level.isFeasible(maxObjectSize))
{
if (processingRectSize.width <= 0 || processingRectSize.height <= 0 )
break;
int acc = level.sFrame.width + 1;
float iniScale = level.scale;
cv::Size area = level.workArea;
float step = (float)(1 + (level.scale <= 2.f));
if( windowSize.width > maxObjectSize.width || windowSize.height > maxObjectSize.height )
break;
int total = 0, prev = 0;
// if( windowSize.width < minObjectSize.width || windowSize.height < minObjectSize.height )
// continue;
while (acc <= integralFactor * (image.cols + 1) && level.isFeasible(maxObjectSize))
{
// create suitable matrix headers
GpuMat src = resuzeBuffer(cv::Rect(0, 0, level.sFrame.width, level.sFrame.height));
GpuMat sint = integral(cv::Rect(prev, 0, level.sFrame.width + 1, level.sFrame.height + 1));
GpuMat buff = integralBuffer;
GpuMat scaledImg = resuzeBuffer(cv::Rect(0, 0, scaledImageSize.width, scaledImageSize.height));
GpuMat scaledIntegral = integral(cv::Rect(0, 0, scaledImageSize.width + 1, scaledImageSize.height + 1));
GpuMat currBuff = integralBuffer;
// generate integral for scale
gpu::resize(image, src, level.sFrame, 0, 0, CV_INTER_LINEAR);
gpu::integralBuffered(src, sint, buff);
gpu::resize(image, scaledImg, scaledImageSize, 0, 0, CV_INTER_LINEAR);
gpu::integralBuffered(scaledImg, scaledIntegral, currBuff);
total += cvCeil(area.width / step) * cvCeil(area.height / step);
// std::cout << "Total for scale: " << total << " this step contribution " << cvCeil(area.width / step) * cvCeil(area.height / step) << " previous width shift " << prev << " acc " << acc << " scales: " << cvCeil(area.width / step) << std::endl;
int step = factor <= 2.f ? 2 : 1;
// increment pyramid level
level = level.next(scaleFactor, image.size(), NxM);
area = level.workArea;
device::lbp::classifyStumpFixed(integral, integral.step1(), stage_mat, stage_mat.cols / sizeof(Stage), nodes_mat, leaves_mat, subsets_mat, features_mat,
processingRectSize.width, processingRectSize.height, windowSize.width, windowSize.height, factor, step, subsetSize, candidates, dclassified.ptr<unsigned int>());
step = (float)(1 + (level.scale <= 2.f));
prev = acc;
acc += level.sFrame.width + 1;
}
factor *= scaleFactor;
windowSize = cv::Size(cvRound(NxM.width * factor), cvRound(NxM.height * factor));
scaledImageSize = cv::Size(cvRound( image.cols / factor ), cvRound( image.rows / factor ));
processingRectSize = cv::Size(scaledImageSize.width - NxM.width + 1, scaledImageSize.height - NxM.height + 1 );
device::lbp::classifyPyramid(image.cols, image.rows, NxM.width, NxM.height, iniScale, scaleFactor, total, stage_mat, stage_mat.cols / sizeof(Stage), nodes_mat,
leaves_mat, subsets_mat, features_mat, subsetSize, candidates, dclassified.ptr<unsigned int>(), integral);
}
// cv::gpu::device::lbp::unbindIntegral();
if (groupThreshold <= 0 || objects.empty())
return 0;
cudaSafeCall( cudaMemcpy(&classified, dclassified.ptr(), sizeof(int), cudaMemcpyDeviceToHost) );
device::lbp::connectedConmonents(candidates, classified, objects, groupThreshold, grouping_eps, dclassified.ptr<unsigned int>());
// candidates.copyTo(objects);
cudaSafeCall( cudaMemcpy(&classified, dclassified.ptr(), sizeof(int), cudaMemcpyDeviceToHost) );
cudaSafeCall( cudaDeviceSynchronize() );
// std::cout << classified << " !!!!!!!!!!" << std::endl;
return classified;
}
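For context, a hedged usage sketch of the public detectMultiScale entry point above. The cascade file name, image name and buffer size are placeholders; the results buffer is pre-allocated by the caller because the grouping step above bails out when objects is empty, and reading each detection back as a cv::Rect is an assumption based on the int4 candidate layout, not something this diff states.

#include <opencv2/gpu/gpu.hpp>
#include <opencv2/highgui/highgui.hpp>

int main()
{
    cv::gpu::CascadeClassifier_GPU_LBP classifier;
    if (!classifier.load("lbpcascade_frontalface.xml"))        // placeholder cascade path
        return -1;

    cv::Mat frame = cv::imread("frame.png", 0);                // detector expects 8-bit grayscale
    cv::gpu::GpuMat d_frame(frame);
    cv::gpu::GpuMat d_objects(1, 1000, CV_32SC4);              // assumption: caller provides the results buffer

    // scaleFactor = 1.2, groupThreshold = 4, no upper bound on object size
    int ndetections = classifier.detectMultiScale(d_frame, d_objects, 1.2, 4, cv::Size());

    if (ndetections > 0)
    {
        cv::Mat h_objects;
        d_objects.colRange(0, ndetections).download(h_objects);
        const cv::Rect* faces = h_objects.ptr<cv::Rect>();     // assumption: each int4 is (x, y, width, height)
        for (int i = 0; i < ndetections; ++i)
            cv::rectangle(frame, faces[i], cv::Scalar(255));
    }
    cv::imwrite("detections.png", frame);
    return 0;
}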
@@ -44,18 +44,19 @@
#define OPENCV_GPU_EMULATION_HPP_
#include "warp_reduce.hpp"
#include <stdio.h>
namespace cv { namespace gpu { namespace device
{
struct Emulation
{
template<int CTA_SIZE>
static __forceinline__ __device__ int Ballot(int predicate)
{
#if (__CUDA_ARCH__ >= 200)
#if defined (__CUDA_ARCH__) && (__CUDA_ARCH__ >= 200)
return __ballot(predicate);
#else
__shared__ volatile int cta_buffer[CTA_SIZE]
__shared__ volatile int cta_buffer[CTA_SIZE];
int tid = threadIdx.x;
cta_buffer[tid] = predicate ? (1 << (tid & 31)) : 0;
@@ -63,41 +64,62 @@ namespace cv { namespace gpu { namespace device
#endif
}
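A small device-side sketch of how the ballot helper above can be consumed; the kernel, its launch with 32-thread blocks, and the use of __popc to count set bits are illustrative assumptions, not something this header prescribes.

#include <opencv2/gpu/device/emulation.hpp>

// Each thread of a 32-thread block votes; Emulation::Ballot gathers the
// warp-wide mask (__ballot on sm_20+, the shared-memory fallback otherwise)
// and lane 0 stores the number of set votes.
__global__ void countVotes(const int* values, int threshold, int* votesPerBlock)
{
    int pred = values[blockIdx.x * 32 + threadIdx.x] > threshold;
    int mask = cv::gpu::device::Emulation::Ballot<32>(pred);

    if (threadIdx.x == 0)
        votesPerBlock[blockIdx.x] = __popc(mask);
}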
struct smem
{
enum { TAG_MASK = (1U << ( (sizeof(unsigned int) << 3) - 5U)) - 1U };
template<typename T>
static __device__ __forceinline__ T atomicInc(T* address, T val)
{
#if (__CUDA_ARCH__ < 120)
#else
#endif
}
template<typename T>
static __device__ __forceinline__ void atomicAdd(T* address, T val)
{
#if (__CUDA_ARCH__ < 120)
#else
#endif
}
template<typename T>
__device__ __forceinline__ T __atomicMin(T* address, T val)
{
#if (__CUDA_ARCH__ < 120)
#else
#endif
}
};
struct smem
{
enum { TAG_MASK = (1U << ( (sizeof(unsigned int) << 3) - 5U)) - 1U };
template<typename T>
static __device__ __forceinline__ T atomicInc(T* address, T val)
{
#if defined (__CUDA_ARCH__) && (__CUDA_ARCH__ < 120)
T count;
unsigned int tag = threadIdx.x << ( (sizeof(unsigned int) << 3) - 5U);
do
{
count = *address & TAG_MASK;
count = tag | (count + 1);
*address = count;
} while (*address != count);
return (count & TAG_MASK) - 1;
#else
return ::atomicInc(address, val);
#endif
}
template<typename T>
static __device__ __forceinline__ void atomicAdd(T* address, T val)
{
#if defined (__CUDA_ARCH__) && (__CUDA_ARCH__ < 120)
T count;
unsigned int tag = threadIdx.x << ( (sizeof(unsigned int) << 3) - 5U);
do
{
count = *address & TAG_MASK;
count = tag | (count + val);
*address = count;
} while (*address != count);
#else
::atomicAdd(address, val);
#endif
}
template<typename T>
static __device__ __forceinline__ T atomicMin(T* address, T val)
{
#if defined (__CUDA_ARCH__) && (__CUDA_ARCH__ < 120)
T count = min(*address, val);
do
{
*address = count;
} while (*address > count);
return count;
#else
return ::atomicMin(address, val);
#endif
}
};
};
}}} // namespace cv { namespace gpu { namespace device
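As an illustration (an assumption about intended use, not code from this commit), the sketch below uses the emulated shared-memory atomicInc to hand out unique slots inside a block. It is launched with 32 threads per block so that the pre-sm_12 tag-based fallback, which only arbitrates between threads of one warp, is sufficient; on sm_12 and newer the calls compile straight to the hardware atomics.

#include <opencv2/gpu/device/emulation.hpp>

using cv::gpu::device::Emulation;

// Hypothetical per-block compaction: every thread whose flag is set reserves
// the next slot with a fetch-and-increment on a shared counter.
__global__ void compactHits(const int* flags, int* slots, unsigned int* counts, int n)
{
    __shared__ unsigned int s_count;
    if (threadIdx.x == 0) s_count = 0;
    __syncthreads();

    int idx = blockIdx.x * 32 + threadIdx.x;
    if (idx < n && flags[idx])
    {
        // returns the previous value of s_count (hardware atomicInc or the tag-based emulation above)
        unsigned int slot = Emulation::smem::atomicInc(&s_count, 0xffffffffu);
        slots[blockIdx.x * 32 + slot] = idx;
    }
    __syncthreads();

    if (threadIdx.x == 0)
        counts[blockIdx.x] = s_count & Emulation::smem::TAG_MASK;   // strip the tag bits the fallback leaves behind
}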
@@ -44,52 +44,11 @@
#define __OPENCV_GPU_DEVICE_LBP_HPP_
#include "internal_shared.hpp"
#include <opencv2/gpu/device/emulation.hpp>
namespace cv { namespace gpu { namespace device {
namespace lbp{
#define TAG_MASK ( (1U << ( (sizeof(unsigned int) << 3) - 5U)) - 1U )
template<typename T>
__device__ __forceinline__ T __atomicInc(T* address, T val)
{
T count;
unsigned int tag = threadIdx.x << ( (sizeof(unsigned int) << 3) - 5U);
do
{
count = *address & TAG_MASK;
count = tag | (count + 1);
*address = count;
} while (*address != count);
return (count & TAG_MASK) - 1;
}
template<typename T>
__device__ __forceinline__ void __atomicAdd(T* address, T val)
{
T count;
unsigned int tag = threadIdx.x << ( (sizeof(unsigned int) << 3) - 5U);
do
{
count = *address & TAG_MASK;
count = tag | (count + val);
*address = count;
} while (*address != count);
}
template<typename T>
__device__ __forceinline__ T __atomicMin(T* address, T val)
{
T count = min(*address, val);
do
{
*address = count;
} while (*address > count);
return count;
}
namespace lbp {
struct Stage
{
@@ -127,27 +86,25 @@ namespace lbp{
unsigned tid = threadIdx.x;
labels[tid] = tid;
__syncthreads();
for (unsigned int id = 0; id < n; id++)
{
if (tid != id && predicate(vec[tid], vec[id]))
{
int p = labels[tid];
int q = labels[id];
if (p != q)
{
int m = min(p, q);
#if (__CUDA_ARCH__ < 120)
__atomicMin(labels + id, m);
#else
atomicMin(labels + id, m);
#endif
}
if (p < q)
{
Emulation::smem::atomicMin(labels + id, p);
}
else if (p > q)
{
Emulation::smem::atomicMin(labels + tid, q);
}
}
}
__syncthreads();
}
} // lbp
} } }// namespaces
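To make the grouping step above easier to follow: it is min-label propagation, where every element starts as its own label and each predicate-linked pair takes the smaller of the two labels until the labels stabilize. The host-side analogue below (an illustrative sketch that iterates to convergence, where the kernel instead relies on Emulation::smem::atomicMin inside a single parallel pass) shows the same idea.

#include <vector>

// Elements i and j end up with the same label iff they are connected through
// a chain of predicate-linked pairs.
template <typename T, typename Pr>
std::vector<int> labelGroups(const std::vector<T>& vec, Pr predicate)
{
    std::vector<int> labels(vec.size());
    for (size_t i = 0; i < vec.size(); ++i)
        labels[i] = static_cast<int>(i);

    bool changed = true;
    while (changed)
    {
        changed = false;
        for (size_t i = 0; i < vec.size(); ++i)
            for (size_t j = 0; j < vec.size(); ++j)
                if (i != j && predicate(vec[i], vec[j]) && labels[i] < labels[j])
                {
                    labels[j] = labels[i];    // the device code does this with atomicMin on shared labels
                    changed = true;
                }
    }
    return labels;
}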