Commit e63ab8de authored by Marina Kolpakova

LBP: multiscale approach; refactored atomics usage

parent 5dc7752d
@@ -1464,6 +1464,7 @@ private:
         GpuMat resuzeBuffer;
         GpuMat candidates;
+        static const int integralFactor = 4;
     };
 ////////////////////////////////// SURF //////////////////////////////////////////
...
@@ -67,7 +67,7 @@ cv::gpu::CascadeClassifier_GPU_LBP::~CascadeClassifier_GPU_LBP()
 bool cv::gpu::CascadeClassifier_GPU_LBP::empty() const { throw_nogpu(); return true; }
 bool cv::gpu::CascadeClassifier_GPU_LBP::load(const string&) { throw_nogpu(); return true; }
 Size cv::gpu::CascadeClassifier_GPU_LBP::getClassifierSize() const { throw_nogpu(); return Size(); }
 void cv::gpu::CascadeClassifier_GPU_LBP::allocateBuffers(cv::Size /*frame*/) { throw_nogpu();}
 int cv::gpu::CascadeClassifier_GPU_LBP::detectMultiScale(const cv::gpu::GpuMat& /*image*/, cv::gpu::GpuMat& /*objectsBuf*/,
                                                           double /*scaleFactor*/, int /*minNeighbors*/, cv::Size /*maxObjectSize*/){ throw_nogpu(); return 0;}
@@ -86,7 +86,7 @@ void cv::gpu::CascadeClassifier_GPU_LBP::allocateBuffers(cv::Size frame)
 {
     resuzeBuffer.create(frame, CV_8UC1);
-    integral.create(frame.height + 1, frame.width + 1, CV_32SC1);
+    integral.create(frame.height + 1, integralFactor * (frame.width + 1), CV_32SC1);
     NcvSize32u roiSize;
     roiSize.width = frame.width;
     roiSize.height = frame.height;
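With integralFactor = 4 the shared integral matrix is now allocated four (frame.width + 1)-column strips wide, so several scaled frames can have their integral images computed side by side and handed to a single classifyPyramid launch. The stand-alone sketch below only illustrates that packing; frameW, frameH, scaleFactor, budget and the rounding are assumed example values, with just the integralFactor constant and the (width + 1) offset bookkeeping taken from the diff.

// Illustrative sketch only: how successive pyramid levels might be packed
// side by side into one integral buffer that is integralFactor * (frameW + 1)
// columns wide. The offset bookkeeping mirrors the prev/acc variables of the
// new host loop; everything else is an example value.
#include <cmath>
#include <cstdio>

int main()
{
    const int   frameW = 640, frameH = 480;       // assumed input size
    const int   integralFactor = 4;               // constant added in gpu.hpp
    const float scaleFactor = 1.2f;               // typical detection setting
    const int   budget = integralFactor * (frameW + 1);

    int order = 0;   // pyramid level index
    int acc   = 0;   // current x-offset inside the packed integral matrix

    for (;;)
    {
        float scale   = std::pow(scaleFactor, (float)order);
        int   scaledW = (int)(frameW / scale + 0.5f);
        int   scaledH = (int)(frameH / scale + 0.5f);

        // each level needs a (scaledW + 1) x (scaledH + 1) sub-rectangle
        if (acc + scaledW + 1 > budget)
            break;    // no room left in this batch (the real code launches the kernel and starts a new batch)

        std::printf("level %d: %d x %d integral at column %d\n",
                    order, scaledW + 1, scaledH + 1, acc);

        acc += scaledW + 1;
        ++order;
    }
    return 0;
}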
@@ -284,14 +284,83 @@ namespace cv { namespace gpu { namespace device
                                  DevMem2D_<int4> objects,
                                  unsigned int* classified);
 
+        void classifyPyramid(int frameW,
+                             int frameH,
+                             int windowW,
+                             int windowH,
+                             float initalScale,
+                             float factor,
+                             int total,
+                             const DevMem2Db& mstages,
+                             const int nstages,
+                             const DevMem2Di& mnodes,
+                             const DevMem2Df& mleaves,
+                             const DevMem2Di& msubsets,
+                             const DevMem2Db& mfeatures,
+                             const int subsetSize,
+                             DevMem2D_<int4> objects,
+                             unsigned int* classified,
+                             DevMem2Di integral);
+
         void connectedConmonents(DevMem2D_<int4> candidates, int ncandidates, DevMem2D_<int4> objects,int groupThreshold, float grouping_eps, unsigned int* nclasses);
         void bindIntegral(DevMem2Di integral);
         void unbindIntegral();
     }
 }}}
 
-int cv::gpu::CascadeClassifier_GPU_LBP::detectMultiScale(const GpuMat& image, GpuMat& objects,
-                                                          double scaleFactor, int groupThreshold, cv::Size maxObjectSize /*, Size minSize=Size()*/)
+cv::Size operator -(const cv::Size& a, const cv::Size& b)
+{
+    return cv::Size(a.width - b.width, a.height - b.height);
+}
+
+cv::Size operator +(const cv::Size& a, const int& i)
+{
+    return cv::Size(a.width + i, a.height + i);
+}
+
+cv::Size operator *(const cv::Size& a, const float& f)
+{
+    return cv::Size(cvRound(a.width * f), cvRound(a.height * f));
+}
+
+cv::Size operator /(const cv::Size& a, const float& f)
+{
+    return cv::Size(cvRound(a.width / f), cvRound(a.height / f));
+}
+
+bool operator <=(const cv::Size& a, const cv::Size& b)
+{
+    return a.width <= b.width && a.height <= b.width;
+}
+
+struct PyrLavel
+{
+    PyrLavel(int _order, float _scale, cv::Size frame, cv::Size window) : order(_order)
+    {
+        scale = pow(_scale, order);
+        sFrame = frame / scale;
+        workArea = sFrame - window + 1;
+        sWindow = window * scale;
+    }
+
+    bool isFeasible(cv::Size maxObj)
+    {
+        return workArea.width > 0 && workArea.height > 0 && sWindow <= maxObj;
+    }
+
+    PyrLavel next(float factor, cv::Size frame, cv::Size window)
+    {
+        return PyrLavel(order + 1, factor, frame, window);
+    }
+
+    int order;
+    float scale;
+    cv::Size sFrame;
+    cv::Size workArea;
+    cv::Size sWindow;
+};
+
+int cv::gpu::CascadeClassifier_GPU_LBP::detectMultiScale(const GpuMat& image, GpuMat& objects, double scaleFactor, int groupThreshold, cv::Size maxObjectSize)
 {
     CV_Assert(!empty() && scaleFactor > 1 && image.depth() == CV_8U);
@@ -306,6 +375,7 @@ int cv::gpu::CascadeClassifier_GPU_LBP::detectMultiScale(const GpuMat& image, Gp
     // used for debug
     // candidates.setTo(cv::Scalar::all(0));
     // objects.setTo(cv::Scalar::all(0));
+
     if (maxObjectSize == cv::Size())
         maxObjectSize = image.size();
@@ -315,52 +385,54 @@ int cv::gpu::CascadeClassifier_GPU_LBP::detectMultiScale(const GpuMat& image, Gp
     GpuMat dclassified(1, 1, CV_32S);
     cudaSafeCall( cudaMemcpy(dclassified.ptr(), &classified, sizeof(int), cudaMemcpyHostToDevice) );
 
-    // cv::gpu::device::lbp::bindIntegral(integral);
-
-    Size scaledImageSize(image.cols, image.rows);
-    Size processingRectSize( scaledImageSize.width - NxM.width + 1, scaledImageSize.height - NxM.height + 1 );
-    Size windowSize(NxM.width, NxM.height);
-    float factor = 1;
-
-    for (;;)
+    PyrLavel level(0, 1.0f, image.size(), NxM);
+
+    while (level.isFeasible(maxObjectSize))
     {
-        if (processingRectSize.width <= 0 || processingRectSize.height <= 0 )
-            break;
-
-        if( windowSize.width > maxObjectSize.width || windowSize.height > maxObjectSize.height )
-            break;
-
-        // if( windowSize.width < minObjectSize.width || windowSize.height < minObjectSize.height )
-        //     continue;
-
-        GpuMat scaledImg = resuzeBuffer(cv::Rect(0, 0, scaledImageSize.width, scaledImageSize.height));
-        GpuMat scaledIntegral = integral(cv::Rect(0, 0, scaledImageSize.width + 1, scaledImageSize.height + 1));
-        GpuMat currBuff = integralBuffer;
-
-        gpu::resize(image, scaledImg, scaledImageSize, 0, 0, CV_INTER_LINEAR);
-        gpu::integralBuffered(scaledImg, scaledIntegral, currBuff);
-
-        int step = factor <= 2.f ? 2 : 1;
-
-        device::lbp::classifyStumpFixed(integral, integral.step1(), stage_mat, stage_mat.cols / sizeof(Stage), nodes_mat, leaves_mat, subsets_mat, features_mat,
-            processingRectSize.width, processingRectSize.height, windowSize.width, windowSize.height, factor, step, subsetSize, candidates, dclassified.ptr<unsigned int>());
-
-        factor *= scaleFactor;
-        windowSize = cv::Size(cvRound(NxM.width * factor), cvRound(NxM.height * factor));
-        scaledImageSize = cv::Size(cvRound( image.cols / factor ), cvRound( image.rows / factor ));
-        processingRectSize = cv::Size(scaledImageSize.width - NxM.width + 1, scaledImageSize.height - NxM.height + 1 );
+        int acc = level.sFrame.width + 1;
+        float iniScale = level.scale;
+        cv::Size area = level.workArea;
+        float step = (float)(1 + (level.scale <= 2.f));
+
+        int total = 0, prev = 0;
+
+        while (acc <= integralFactor * (image.cols + 1) && level.isFeasible(maxObjectSize))
+        {
+            // create sutable matrix headers
+            GpuMat src  = resuzeBuffer(cv::Rect(0, 0, level.sFrame.width, level.sFrame.height));
+            GpuMat sint = integral(cv::Rect(prev, 0, level.sFrame.width + 1, level.sFrame.height + 1));
+            GpuMat buff = integralBuffer;
+
+            // generate integral for scale
+            gpu::resize(image, src, level.sFrame, 0, 0, CV_INTER_LINEAR);
+            gpu::integralBuffered(src, sint, buff);
+
+            total += cvCeil(area.width / step) * cvCeil(area.height / step);
+            // std::cout << "Total for scale: " << total << " this step contribution " << cvCeil(area.width / step) * cvCeil(area.height / step) << " previous width shift " << prev << " acc " << acc << " scales: " << cvCeil(area.width / step) << std::endl;
+
+            // increment pyr lavel
+            level = level.next(scaleFactor, image.size(), NxM);
+            area = level.workArea;
+            step = (float)(1 + (level.scale <= 2.f));
+
+            prev = acc;
+            acc += level.sFrame.width + 1;
+        }
+
+        device::lbp::classifyPyramid(image.cols, image.rows, NxM.width, NxM.height, iniScale, scaleFactor, total, stage_mat, stage_mat.cols / sizeof(Stage), nodes_mat,
+            leaves_mat, subsets_mat, features_mat, subsetSize, candidates, dclassified.ptr<unsigned int>(), integral);
     }
-    // cv::gpu::device::lbp::unbindIntegral();
 
     if (groupThreshold <= 0 || objects.empty())
         return 0;
 
     cudaSafeCall( cudaMemcpy(&classified, dclassified.ptr(), sizeof(int), cudaMemcpyDeviceToHost) );
     device::lbp::connectedConmonents(candidates, classified, objects, groupThreshold, grouping_eps, dclassified.ptr<unsigned int>());
-    // candidates.copyTo(objects);
+
     cudaSafeCall( cudaMemcpy(&classified, dclassified.ptr(), sizeof(int), cudaMemcpyDeviceToHost) );
     cudaSafeCall( cudaDeviceSynchronize() );
-    // std::cout << classified << " !!!!!!!!!!" << std::endl;
+
     return classified;
 }
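Each batch also pre-computes total, the number of candidate windows the single classifyPyramid launch must evaluate: a level contributes cvCeil(area.width / step) * cvCeil(area.height / step) windows, where area is the scaled frame minus the detection window and step is 2 while level.scale <= 2 and 1 afterwards. Below is a stand-alone sketch of that arithmetic; the 24x24 window and 640x480 frame are assumed example values, not taken from the commit.

// Sketch of the candidate-window count per pyramid level, mirroring
// total += cvCeil(area.width / step) * cvCeil(area.height / step) in the diff.
#include <cmath>
#include <cstdio>

static int windowsAtLevel(int frameW, int frameH, int windowW, int windowH, float scale)
{
    int scaledW = (int)(frameW / scale + 0.5f);
    int scaledH = (int)(frameH / scale + 0.5f);
    int areaW   = scaledW - windowW + 1;           // work area, as in PyrLavel
    int areaH   = scaledH - windowH + 1;
    if (areaW <= 0 || areaH <= 0) return 0;        // level not feasible

    float step = (float)(1 + (scale <= 2.f));      // stride 2 for fine scales, 1 for coarse
    return (int)std::ceil(areaW / step) * (int)std::ceil(areaH / step);
}

int main()
{
    // assumed 24x24 LBP window on a 640x480 frame
    for (float scale = 1.f; scale < 4.f; scale *= 1.2f)
        std::printf("scale %.2f -> %d windows\n", scale, windowsAtLevel(640, 480, 24, 24, scale));
    return 0;
}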
...
This diff is collapsed.
@@ -44,18 +44,19 @@
 #define OPENCV_GPU_EMULATION_HPP_
 
 #include "warp_reduce.hpp"
+#include <stdio.h>
 
 namespace cv { namespace gpu { namespace device
 {
     struct Emulation
     {
         template<int CTA_SIZE>
         static __forceinline__ __device__ int Ballot(int predicate)
         {
-#if (__CUDA_ARCH__ >= 200)
+#if defined (__CUDA_ARCH__) && (__CUDA_ARCH__ >= 200)
             return __ballot(predicate);
 #else
-            __shared__ volatile int cta_buffer[CTA_SIZE]
+            __shared__ volatile int cta_buffer[CTA_SIZE];
 
             int tid = threadIdx.x;
             cta_buffer[tid] = predicate ? (1 << (tid & 31)) : 0;
@@ -63,41 +64,62 @@ namespace cv { namespace gpu { namespace device
 #endif
         }
 
         struct smem
         {
            enum { TAG_MASK = (1U << ( (sizeof(unsigned int) << 3) - 5U)) - 1U };
 
            template<typename T>
            static __device__ __forceinline__ T atomicInc(T* address, T val)
            {
-#if (__CUDA_ARCH__ < 120)
-#else
-#endif
-            }
-
-            template<typename T>
-            static __device__ __forceinline__ void atomicAdd(T* address, T val)
-            {
-#if (__CUDA_ARCH__ < 120)
-#else
-#endif
-            }
-
-            template<typename T>
-            __device__ __forceinline__ T __atomicMin(T* address, T val)
-            {
-#if (__CUDA_ARCH__ < 120)
-#else
-#endif
-            }
-        };
+#if defined (__CUDA_ARCH__) && (__CUDA_ARCH__ < 120)
+                T count;
+                unsigned int tag = threadIdx.x << ( (sizeof(unsigned int) << 3) - 5U);
+                do
+                {
+                    count = *address & TAG_MASK;
+                    count = tag | (count + 1);
+                    *address = count;
+                } while (*address != count);
+
+                return (count & TAG_MASK) - 1;
+#else
+                return ::atomicInc(address, val);
+#endif
+            }
+
+            template<typename T>
+            static __device__ __forceinline__ void atomicAdd(T* address, T val)
+            {
+#if defined (__CUDA_ARCH__) && (__CUDA_ARCH__ < 120)
+                T count;
+                unsigned int tag = threadIdx.x << ( (sizeof(unsigned int) << 3) - 5U);
+                do
+                {
+                    count = *address & TAG_MASK;
+                    count = tag | (count + val);
+                    *address = count;
+                } while (*address != count);
+#else
+                ::atomicAdd(address, val);
+#endif
+            }
+
+            template<typename T>
+            static __device__ __forceinline__ T atomicMin(T* address, T val)
+            {
+#if defined (__CUDA_ARCH__) && (__CUDA_ARCH__ < 120)
+                T count = min(*address, val);
+                do
+                {
+                    *address = count;
+                } while (*address > count);
+
+                return count;
+#else
+                return ::atomicMin(address, val);
+#endif
+            }
+        };
     };
 }}} // namespace cv { namespace gpu { namespace device
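For devices without shared-memory atomics (compute capability below 1.2) the Emulation::smem helpers tag each written word with the writing thread's id in the top five bits, while TAG_MASK keeps the payload in the lower 27 bits; after storing, the thread re-reads the word, and if another thread overwrote it the tag no longer matches and the do/while loop retries. The host-side snippet below only illustrates the bit layout (the thread id is simulated); it is not device code from the commit.

// Host-side illustration of the tag scheme used by Emulation::smem:
// value layout = [ 5-bit thread tag | 27-bit payload ], TAG_MASK selects the payload.
#include <cstdio>

int main()
{
    const unsigned int TAG_MASK = (1U << ((sizeof(unsigned int) << 3) - 5U)) - 1U;

    unsigned int tid     = 19;                         // pretend threadIdx.x
    unsigned int tag     = tid << ((sizeof(unsigned int) << 3) - 5U);
    unsigned int shared  = 7;                          // pretend shared-memory counter

    unsigned int count   = shared & TAG_MASK;          // strip any previous tag
    unsigned int written = tag | (count + 1);          // tag our increment

    // A device thread would now store 'written' and re-read the address:
    // if the re-read value still carries our tag, no other thread overwrote it
    // and the increment took effect; otherwise the do/while loop retries.
    std::printf("payload before: %u, written word: 0x%08x, payload after: %u, tag intact: %d\n",
                count, written, written & TAG_MASK, (written >> 27) == tid);
    return 0;
}

Because the stored payload is the incremented value, the emulated atomicInc can return (count & TAG_MASK) - 1, the value before the increment, mirroring the pre-increment result of the hardware intrinsic.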
...
@@ -44,52 +44,11 @@
 #define __OPENCV_GPU_DEVICE_LBP_HPP_
 
 #include "internal_shared.hpp"
+#include <opencv2/gpu/device/emulation.hpp>
 
 namespace cv { namespace gpu { namespace device {
 
-namespace lbp{
+namespace lbp {
 
-#define TAG_MASK ( (1U << ( (sizeof(unsigned int) << 3) - 5U)) - 1U )
-
-template<typename T>
-__device__ __forceinline__ T __atomicInc(T* address, T val)
-{
-    T count;
-    unsigned int tag = threadIdx.x << ( (sizeof(unsigned int) << 3) - 5U);
-    do
-    {
-        count = *address & TAG_MASK;
-        count = tag | (count + 1);
-        *address = count;
-    } while (*address != count);
-
-    return (count & TAG_MASK) - 1;
-}
-
-template<typename T>
-__device__ __forceinline__ void __atomicAdd(T* address, T val)
-{
-    T count;
-    unsigned int tag = threadIdx.x << ( (sizeof(unsigned int) << 3) - 5U);
-    do
-    {
-        count = *address & TAG_MASK;
-        count = tag | (count + val);
-        *address = count;
-    } while (*address != count);
-}
-
-template<typename T>
-__device__ __forceinline__ T __atomicMin(T* address, T val)
-{
-    T count = min(*address, val);
-    do
-    {
-        *address = count;
-    } while (*address > count);
-
-    return count;
-}
-
 struct Stage
 {
@@ -127,27 +86,25 @@ namespace lbp{
         unsigned tid = threadIdx.x;
         labels[tid] = tid;
         __syncthreads();
 
         for (unsigned int id = 0; id < n; id++)
         {
             if (tid != id && predicate(vec[tid], vec[id]))
             {
                 int p = labels[tid];
                 int q = labels[id];
-
-                if (p != q)
-                {
-                    int m = min(p, q);
-#if (__CUDA_ARCH__ < 120)
-                    __atomicMin(labels + id, m);
-#else
-                    atomicMin(labels + id, m);
-#endif
-                }
+                if (p < q)
+                {
+                    Emulation::smem::atomicMin(labels + id, p);
+                }
+                else if (p > q)
+                {
+                    Emulation::smem::atomicMin(labels + tid, q);
+                }
             }
         }
 
         __syncthreads();
     }
 } // lbp
 } } }// namespaces
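The partition loop above assigns every element its own label and then, whenever the predicate accepts a pair, pulls the larger label down toward the smaller one with Emulation::smem::atomicMin, so each group converges toward its minimum index. A sequential host analogue of that idea is sketched below; it is illustrative only, the toy data and predicate are assumed, and the device version instead relies on the parallel sweep plus __syncthreads().

// Sequential sketch of the label-merging idea behind the device partition loop:
// labels start as identity and are repeatedly min-merged for every pair the
// predicate accepts, until nothing changes.
#include <algorithm>
#include <cstdio>
#include <cstdlib>
#include <vector>

template <typename T, typename Pred>
std::vector<int> partitionLabels(const std::vector<T>& vec, Pred predicate)
{
    std::vector<int> labels(vec.size());
    for (size_t i = 0; i < vec.size(); ++i) labels[i] = (int)i;

    bool changed = true;
    while (changed)                        // host fixpoint loop replaces the parallel sweep
    {
        changed = false;
        for (size_t i = 0; i < vec.size(); ++i)
            for (size_t j = 0; j < vec.size(); ++j)
                if (i != j && predicate(vec[i], vec[j]))
                {
                    int m = std::min(labels[i], labels[j]);
                    if (labels[i] != m || labels[j] != m)
                    {
                        labels[i] = labels[j] = m;
                        changed = true;
                    }
                }
    }
    return labels;
}

int main()
{
    // toy data: values closer than 2 belong to the same group
    std::vector<int> v = { 1, 2, 10, 11, 30 };
    std::vector<int> labels = partitionLabels(v, [](int a, int b) { return std::abs(a - b) < 2; });
    for (size_t i = 0; i < v.size(); ++i)
        std::printf("%d -> label %d\n", v[i], labels[i]);
    return 0;
}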
...