Commit 0b0c9be7 authored by Andrey Kamaev's avatar Andrey Kamaev Committed by OpenCV Buildbot

Merge pull request #636 from ilya-lavrenov:SSE2_HOG

parents 3f8d87d8 8b510ad8
...@@ -39,12 +39,12 @@ ...@@ -39,12 +39,12 @@
// the use of this software, even if advised of the possibility of such damage. // the use of this software, even if advised of the possibility of such damage.
// //
//M*/ //M*/
#include <stdio.h>
#include "precomp.hpp" #include "precomp.hpp"
#include <cstdio>
#include <iterator> #include <iterator>
#ifdef HAVE_IPP
#include "ipp.h"
#endif
/****************************************************************************************\ /****************************************************************************************\
The code below is implementation of HOG (Histogram-of-Oriented Gradients) The code below is implementation of HOG (Histogram-of-Oriented Gradients)
descriptor and object detection, introduced by Navneet Dalal and Bill Triggs. descriptor and object detection, introduced by Navneet Dalal and Bill Triggs.
...@@ -63,6 +63,7 @@ size_t HOGDescriptor::getDescriptorSize() const ...@@ -63,6 +63,7 @@ size_t HOGDescriptor::getDescriptorSize() const
blockSize.height % cellSize.height == 0); blockSize.height % cellSize.height == 0);
CV_Assert((winSize.width - blockSize.width) % blockStride.width == 0 && CV_Assert((winSize.width - blockSize.width) % blockStride.width == 0 &&
(winSize.height - blockSize.height) % blockStride.height == 0 ); (winSize.height - blockSize.height) % blockStride.height == 0 );
return (size_t)nbins* return (size_t)nbins*
(blockSize.width/cellSize.width)* (blockSize.width/cellSize.width)*
(blockSize.height/cellSize.height)* (blockSize.height/cellSize.height)*
...@@ -180,6 +181,7 @@ void HOGDescriptor::computeGradient(const Mat& img, Mat& grad, Mat& qangle, ...@@ -180,6 +181,7 @@ void HOGDescriptor::computeGradient(const Mat& img, Mat& grad, Mat& qangle,
img.rows + paddingTL.height + paddingBR.height); img.rows + paddingTL.height + paddingBR.height);
grad.create(gradsize, CV_32FC2); // <magnitude*(1-alpha), magnitude*alpha> grad.create(gradsize, CV_32FC2); // <magnitude*(1-alpha), magnitude*alpha>
qangle.create(gradsize, CV_8UC2); // [0..nbins-1] - quantized gradient orientation qangle.create(gradsize, CV_8UC2); // [0..nbins-1] - quantized gradient orientation
Size wholeSize; Size wholeSize;
Point roiofs; Point roiofs;
img.locateROI(wholeSize, roiofs); img.locateROI(wholeSize, roiofs);
...@@ -188,14 +190,33 @@ void HOGDescriptor::computeGradient(const Mat& img, Mat& grad, Mat& qangle, ...@@ -188,14 +190,33 @@ void HOGDescriptor::computeGradient(const Mat& img, Mat& grad, Mat& qangle,
int cn = img.channels(); int cn = img.channels();
Mat_<float> _lut(1, 256); Mat_<float> _lut(1, 256);
const float* lut = &_lut(0,0); const float* const lut = &_lut(0,0);
#if CV_SSE2
const int indeces[] = { 0, 1, 2, 3 };
__m128i idx = _mm_loadu_si128((const __m128i*)indeces);
__m128i ifour = _mm_set1_epi32(4);
float* const _data = &_lut(0, 0);
if( gammaCorrection )
for( i = 0; i < 256; i += 4 )
{
_mm_storeu_ps(_data + i, _mm_sqrt_ps(_mm_cvtepi32_ps(idx)));
idx = _mm_add_epi32(idx, ifour);
}
else
for( i = 0; i < 256; i += 4 )
{
_mm_storeu_ps(_data + i, _mm_cvtepi32_ps(idx));
idx = _mm_add_epi32(idx, ifour);
}
#else
if( gammaCorrection ) if( gammaCorrection )
for( i = 0; i < 256; i++ ) for( i = 0; i < 256; i++ )
_lut(0,i) = std::sqrt((float)i); _lut(0,i) = std::sqrt((float)i);
else else
for( i = 0; i < 256; i++ ) for( i = 0; i < 256; i++ )
_lut(0,i) = (float)i; _lut(0,i) = (float)i;
#endif
AutoBuffer<int> mapbuf(gradsize.width + gradsize.height + 4); AutoBuffer<int> mapbuf(gradsize.width + gradsize.height + 4);
int* xmap = (int*)mapbuf + 1; int* xmap = (int*)mapbuf + 1;
...@@ -213,47 +234,34 @@ void HOGDescriptor::computeGradient(const Mat& img, Mat& grad, Mat& qangle, ...@@ -213,47 +234,34 @@ void HOGDescriptor::computeGradient(const Mat& img, Mat& grad, Mat& qangle,
// x- & y- derivatives for the whole row // x- & y- derivatives for the whole row
int width = gradsize.width; int width = gradsize.width;
AutoBuffer<float> _dbuf(width*4); AutoBuffer<float> _dbuf(width*4);
float* dbuf = _dbuf; float* const dbuf = _dbuf;
Mat Dx(1, width, CV_32F, dbuf); Mat Dx(1, width, CV_32F, dbuf);
Mat Dy(1, width, CV_32F, dbuf + width); Mat Dy(1, width, CV_32F, dbuf + width);
Mat Mag(1, width, CV_32F, dbuf + width*2); Mat Mag(1, width, CV_32F, dbuf + width*2);
Mat Angle(1, width, CV_32F, dbuf + width*3); Mat Angle(1, width, CV_32F, dbuf + width*3);
int _nbins = nbins; if (cn == 3)
float angleScale = (float)(_nbins/CV_PI);
#ifdef HAVE_IPP
Mat lutimg(img.rows,img.cols,CV_MAKETYPE(CV_32F,cn));
Mat hidxs(1, width, CV_32F);
Ipp32f* pHidxs = (Ipp32f*)hidxs.data;
Ipp32f* pAngles = (Ipp32f*)Angle.data;
IppiSize roiSize;
roiSize.width = img.cols;
roiSize.height = img.rows;
for( y = 0; y < roiSize.height; y++ )
{ {
const uchar* imgPtr = img.data + y*img.step; int end = gradsize.width + 2;
float* imglutPtr = (float*)(lutimg.data + y*lutimg.step); xmap -= 1, x = 0;
#if CV_SSE2
for( x = 0; x < roiSize.width*cn; x++ ) __m128i ithree = _mm_set1_epi32(3);
{ for ( ; x <= end - 4; x += 4)
imglutPtr[x] = lut[imgPtr[x]]; _mm_storeu_si128((__m128i*)(xmap + x), _mm_mullo_epi16(ithree,
} _mm_loadu_si128((const __m128i*)(xmap + x))));
#endif
for ( ; x < end; ++x)
xmap[x] *= 3;
xmap += 1;
} }
#endif float angleScale = (float)(nbins/CV_PI);
for( y = 0; y < gradsize.height; y++ ) for( y = 0; y < gradsize.height; y++ )
{ {
#ifdef HAVE_IPP
const float* imgPtr = (float*)(lutimg.data + lutimg.step*ymap[y]);
const float* prevPtr = (float*)(lutimg.data + lutimg.step*ymap[y-1]);
const float* nextPtr = (float*)(lutimg.data + lutimg.step*ymap[y+1]);
#else
const uchar* imgPtr = img.data + img.step*ymap[y]; const uchar* imgPtr = img.data + img.step*ymap[y];
const uchar* prevPtr = img.data + img.step*ymap[y-1]; const uchar* prevPtr = img.data + img.step*ymap[y-1];
const uchar* nextPtr = img.data + img.step*ymap[y+1]; const uchar* nextPtr = img.data + img.step*ymap[y+1];
#endif
float* gradPtr = (float*)grad.ptr(y); float* gradPtr = (float*)grad.ptr(y);
uchar* qanglePtr = (uchar*)qangle.ptr(y); uchar* qanglePtr = (uchar*)qangle.ptr(y);
...@@ -262,46 +270,59 @@ void HOGDescriptor::computeGradient(const Mat& img, Mat& grad, Mat& qangle, ...@@ -262,46 +270,59 @@ void HOGDescriptor::computeGradient(const Mat& img, Mat& grad, Mat& qangle,
for( x = 0; x < width; x++ ) for( x = 0; x < width; x++ )
{ {
int x1 = xmap[x]; int x1 = xmap[x];
#ifdef HAVE_IPP
dbuf[x] = (float)(imgPtr[xmap[x+1]] - imgPtr[xmap[x-1]]);
dbuf[width + x] = (float)(nextPtr[x1] - prevPtr[x1]);
#else
dbuf[x] = (float)(lut[imgPtr[xmap[x+1]]] - lut[imgPtr[xmap[x-1]]]); dbuf[x] = (float)(lut[imgPtr[xmap[x+1]]] - lut[imgPtr[xmap[x-1]]]);
dbuf[width + x] = (float)(lut[nextPtr[x1]] - lut[prevPtr[x1]]); dbuf[width + x] = (float)(lut[nextPtr[x1]] - lut[prevPtr[x1]]);
#endif
} }
} }
else else
{ {
for( x = 0; x < width; x++ ) x = 0;
#if CV_SSE2
for( ; x <= width - 4; x += 4 )
{ {
int x1 = xmap[x]*3; int x0 = xmap[x], x1 = xmap[x+1], x2 = xmap[x+2], x3 = xmap[x+3];
float dx0, dy0, dx, dy, mag0, mag; typedef const uchar* const T;
#ifdef HAVE_IPP T p02 = imgPtr + xmap[x+1], p00 = imgPtr + xmap[x-1];
const float* p2 = imgPtr + xmap[x+1]*3; T p12 = imgPtr + xmap[x+2], p10 = imgPtr + xmap[x];
const float* p0 = imgPtr + xmap[x-1]*3; T p22 = imgPtr + xmap[x+3], p20 = p02;
T p32 = imgPtr + xmap[x+4], p30 = p12;
dx0 = p2[2] - p0[2];
dy0 = nextPtr[x1+2] - prevPtr[x1+2]; __m128 _dx0 = _mm_sub_ps(_mm_set_ps(lut[p32[0]], lut[p22[0]], lut[p12[0]], lut[p02[0]]),
mag0 = dx0*dx0 + dy0*dy0; _mm_set_ps(lut[p30[0]], lut[p20[0]], lut[p10[0]], lut[p00[0]]));
__m128 _dx1 = _mm_sub_ps(_mm_set_ps(lut[p32[1]], lut[p22[1]], lut[p12[1]], lut[p02[1]]),
dx = p2[1] - p0[1]; _mm_set_ps(lut[p30[1]], lut[p20[1]], lut[p10[1]], lut[p00[1]]));
dy = nextPtr[x1+1] - prevPtr[x1+1]; __m128 _dx2 = _mm_sub_ps(_mm_set_ps(lut[p32[2]], lut[p22[2]], lut[p12[2]], lut[p02[2]]),
mag = dx*dx + dy*dy; _mm_set_ps(lut[p30[2]], lut[p20[2]], lut[p10[2]], lut[p00[2]]));
if( mag0 < mag ) __m128 _dy0 = _mm_sub_ps(_mm_set_ps(lut[nextPtr[x3]], lut[nextPtr[x2]], lut[nextPtr[x1]], lut[nextPtr[x0]]),
{ _mm_set_ps(lut[prevPtr[x3]], lut[prevPtr[x2]], lut[prevPtr[x1]], lut[prevPtr[x0]]));
dx0 = dx; __m128 _dy1 = _mm_sub_ps(_mm_set_ps(lut[nextPtr[x3+1]], lut[nextPtr[x2+1]], lut[nextPtr[x1+1]], lut[nextPtr[x0+1]]),
dy0 = dy; _mm_set_ps(lut[prevPtr[x3+1]], lut[prevPtr[x2+1]], lut[prevPtr[x1+1]], lut[prevPtr[x0+1]]));
mag0 = mag; __m128 _dy2 = _mm_sub_ps(_mm_set_ps(lut[nextPtr[x3+2]], lut[nextPtr[x2+2]], lut[nextPtr[x1+2]], lut[nextPtr[x0+2]]),
_mm_set_ps(lut[prevPtr[x3+2]], lut[prevPtr[x2+2]], lut[prevPtr[x1+2]], lut[prevPtr[x0+2]]));
__m128 _mag0 = _mm_add_ps(_mm_mul_ps(_dx0, _dx0), _mm_mul_ps(_dy0, _dy0));
__m128 _mag1 = _mm_add_ps(_mm_mul_ps(_dx1, _dx1), _mm_mul_ps(_dy1, _dy1));
__m128 _mag2 = _mm_add_ps(_mm_mul_ps(_dx2, _dx2), _mm_mul_ps(_dy2, _dy2));
__m128 mask = _mm_cmpgt_ps(_mag2, _mag1);
_dx2 = _mm_or_ps(_mm_and_ps(_dx2, mask), _mm_andnot_ps(mask, _dx1));
_dy2 = _mm_or_ps(_mm_and_ps(_dy2, mask), _mm_andnot_ps(mask, _dy1));
mask = _mm_cmpgt_ps(_mm_max_ps(_mag2, _mag1), _mag0);
_dx2 = _mm_or_ps(_mm_and_ps(_dx2, mask), _mm_andnot_ps(mask, _dx0));
_dy2 = _mm_or_ps(_mm_and_ps(_dy2, mask), _mm_andnot_ps(mask, _dy0));
_mm_storeu_ps(dbuf + x, _dx2);
_mm_storeu_ps(dbuf + x + width, _dy2);
} }
#endif
dx = p2[0] - p0[0]; for( ; x < width; x++ )
dy = nextPtr[x1] - prevPtr[x1]; {
mag = dx*dx + dy*dy; int x1 = xmap[x];
#else float dx0, dy0, dx, dy, mag0, mag;
const uchar* p2 = imgPtr + xmap[x+1]*3; const uchar* p2 = imgPtr + xmap[x+1];
const uchar* p0 = imgPtr + xmap[x-1]*3; const uchar* p0 = imgPtr + xmap[x-1];
dx0 = lut[p2[2]] - lut[p0[2]]; dx0 = lut[p2[2]] - lut[p0[2]];
dy0 = lut[nextPtr[x1+2]] - lut[prevPtr[x1+2]]; dy0 = lut[nextPtr[x1+2]] - lut[prevPtr[x1+2]];
...@@ -310,7 +331,6 @@ void HOGDescriptor::computeGradient(const Mat& img, Mat& grad, Mat& qangle, ...@@ -310,7 +331,6 @@ void HOGDescriptor::computeGradient(const Mat& img, Mat& grad, Mat& qangle,
dx = lut[p2[1]] - lut[p0[1]]; dx = lut[p2[1]] - lut[p0[1]];
dy = lut[nextPtr[x1+1]] - lut[prevPtr[x1+1]]; dy = lut[nextPtr[x1+1]] - lut[prevPtr[x1+1]];
mag = dx*dx + dy*dy; mag = dx*dx + dy*dy;
if( mag0 < mag ) if( mag0 < mag )
{ {
dx0 = dx; dx0 = dx;
...@@ -321,7 +341,6 @@ void HOGDescriptor::computeGradient(const Mat& img, Mat& grad, Mat& qangle, ...@@ -321,7 +341,6 @@ void HOGDescriptor::computeGradient(const Mat& img, Mat& grad, Mat& qangle,
dx = lut[p2[0]] - lut[p0[0]]; dx = lut[p2[0]] - lut[p0[0]];
dy = lut[nextPtr[x1]] - lut[prevPtr[x1]]; dy = lut[nextPtr[x1]] - lut[prevPtr[x1]];
mag = dx*dx + dy*dy; mag = dx*dx + dy*dy;
#endif
if( mag0 < mag ) if( mag0 < mag )
{ {
dx0 = dx; dx0 = dx;
...@@ -333,55 +352,83 @@ void HOGDescriptor::computeGradient(const Mat& img, Mat& grad, Mat& qangle, ...@@ -333,55 +352,83 @@ void HOGDescriptor::computeGradient(const Mat& img, Mat& grad, Mat& qangle,
dbuf[x+width] = dy0; dbuf[x+width] = dy0;
} }
} }
#ifdef HAVE_IPP
ippsCartToPolar_32f((const Ipp32f*)Dx.data, (const Ipp32f*)Dy.data, (Ipp32f*)Mag.data, pAngles, width);
for( x = 0; x < width; x++ )
{
if(pAngles[x] < 0.f)
pAngles[x] += (Ipp32f)(CV_PI*2.);
}
ippsNormalize_32f(pAngles, pAngles, width, 0.5f/angleScale, 1.f/angleScale);
ippsFloor_32f(pAngles,(Ipp32f*)hidxs.data,width);
ippsSub_32f_I((Ipp32f*)hidxs.data,pAngles,width);
ippsMul_32f_I((Ipp32f*)Mag.data,pAngles,width);
ippsSub_32f_I(pAngles,(Ipp32f*)Mag.data,width); // computing angles and magnidutes
ippsRealToCplx_32f((Ipp32f*)Mag.data,pAngles,(Ipp32fc*)gradPtr,width);
#else
cartToPolar( Dx, Dy, Mag, Angle, false ); cartToPolar( Dx, Dy, Mag, Angle, false );
// filling the result matrix
x = 0;
#if CV_SSE2
__m128 fhalf = _mm_set1_ps(0.5f), fzero = _mm_setzero_ps();
__m128 _angleScale = _mm_set1_ps(angleScale), fone = _mm_set1_ps(1.0f);
__m128i ione = _mm_set1_epi32(1), _nbins = _mm_set1_epi32(nbins), izero = _mm_setzero_si128();
for ( ; x <= width - 4; x += 4)
{
int x2 = x << 1;
__m128 _mag = _mm_loadu_ps(dbuf + x + (width << 1));
__m128 _angle = _mm_loadu_ps(dbuf + x + width * 3);
_angle = _mm_sub_ps(_mm_mul_ps(_angleScale, _angle), fhalf);
__m128 sign = _mm_and_ps(fone, _mm_cmplt_ps(_angle, fzero));
__m128i _hidx = _mm_cvttps_epi32(_angle);
_hidx = _mm_sub_epi32(_hidx, _mm_cvtps_epi32(sign));
_angle = _mm_sub_ps(_angle, _mm_cvtepi32_ps(_hidx));
__m128 ft0 = _mm_mul_ps(_mag, _mm_sub_ps(fone, _angle));
__m128 ft1 = _mm_mul_ps(_mag, _angle);
__m128 ft2 = _mm_unpacklo_ps(ft0, ft1);
__m128 ft3 = _mm_unpackhi_ps(ft0, ft1);
_mm_storeu_ps(gradPtr + x2, ft2);
_mm_storeu_ps(gradPtr + x2 + 4, ft3);
__m128i mask0 = _mm_sub_epi32(izero, _mm_srli_epi32(_hidx, 31));
__m128i it0 = _mm_and_si128(mask0, _nbins);
mask0 = _mm_cmplt_epi32(_hidx, _nbins);
__m128i it1 = _mm_andnot_si128(mask0, _nbins);
_hidx = _mm_add_epi32(_hidx, _mm_sub_epi32(it0, it1));
it0 = _mm_packus_epi16(_mm_packs_epi32(_hidx, izero), izero);
_hidx = _mm_add_epi32(ione, _hidx);
_hidx = _mm_and_si128(_hidx, _mm_cmplt_epi32(_hidx, _nbins));
it1 = _mm_packus_epi16(_mm_packs_epi32(_hidx, izero), izero);
it0 = _mm_unpacklo_epi8(it0, it1);
_mm_storel_epi64((__m128i*)(qanglePtr + x2), it0);
}
#endif #endif
for( x = 0; x < width; x++ ) for( ; x < width; x++ )
{ {
#ifdef HAVE_IPP
int hidx = (int)pHidxs[x];
#else
float mag = dbuf[x+width*2], angle = dbuf[x+width*3]*angleScale - 0.5f; float mag = dbuf[x+width*2], angle = dbuf[x+width*3]*angleScale - 0.5f;
int hidx = cvFloor(angle); int hidx = cvFloor(angle);
angle -= hidx; angle -= hidx;
gradPtr[x*2] = mag*(1.f - angle); gradPtr[x*2] = mag*(1.f - angle);
gradPtr[x*2+1] = mag*angle; gradPtr[x*2+1] = mag*angle;
#endif
if( hidx < 0 ) if( hidx < 0 )
hidx += _nbins; hidx += nbins;
else if( hidx >= _nbins ) else if( hidx >= nbins )
hidx -= _nbins; hidx -= nbins;
assert( (unsigned)hidx < (unsigned)_nbins );
CV_Assert( (unsigned)hidx < (unsigned)nbins );
qanglePtr[x*2] = (uchar)hidx; qanglePtr[x*2] = (uchar)hidx;
hidx++; hidx++;
hidx &= hidx < _nbins ? -1 : 0; hidx &= hidx < nbins ? -1 : 0;
qanglePtr[x*2+1] = (uchar)hidx; qanglePtr[x*2+1] = (uchar)hidx;
} }
} }
} }
struct HOGCache struct HOGCache
{ {
struct BlockData struct BlockData
{ {
BlockData() : histOfs(0), imgOffset() {} BlockData() :
histOfs(0), imgOffset()
{ }
int histOfs; int histOfs;
Point imgOffset; Point imgOffset;
}; };
...@@ -396,15 +443,15 @@ struct HOGCache ...@@ -396,15 +443,15 @@ struct HOGCache
HOGCache(); HOGCache();
HOGCache(const HOGDescriptor* descriptor, HOGCache(const HOGDescriptor* descriptor,
const Mat& img, Size paddingTL, Size paddingBR, const Mat& img, const Size& paddingTL, const Size& paddingBR,
bool useCache, Size cacheStride); bool useCache, const Size& cacheStride);
virtual ~HOGCache() {}; virtual ~HOGCache() { }
virtual void init(const HOGDescriptor* descriptor, virtual void init(const HOGDescriptor* descriptor,
const Mat& img, Size paddingTL, Size paddingBR, const Mat& img, const Size& paddingTL, const Size& paddingBR,
bool useCache, Size cacheStride); bool useCache, const Size& cacheStride);
Size windowsInImage(Size imageSize, Size winStride) const; Size windowsInImage(const Size& imageSize, const Size& winStride) const;
Rect getWindow(Size imageSize, Size winStride, int idx) const; Rect getWindow(const Size& imageSize, const Size& winStride, int idx) const;
const float* getBlock(Point pt, float* buf); const float* getBlock(Point pt, float* buf);
virtual void normalizeBlockHistogram(float* histogram) const; virtual void normalizeBlockHistogram(float* histogram) const;
...@@ -414,7 +461,8 @@ struct HOGCache ...@@ -414,7 +461,8 @@ struct HOGCache
bool useCache; bool useCache;
std::vector<int> ymaxCached; std::vector<int> ymaxCached;
Size winSize, cacheStride; Size winSize;
Size cacheStride;
Size nblocks, ncells; Size nblocks, ncells;
int blockHistogramSize; int blockHistogramSize;
int count1, count2, count4; int count1, count2, count4;
...@@ -426,24 +474,23 @@ struct HOGCache ...@@ -426,24 +474,23 @@ struct HOGCache
const HOGDescriptor* descriptor; const HOGDescriptor* descriptor;
}; };
HOGCache::HOGCache() :
HOGCache::HOGCache() blockHistogramSize(), count1(), count2(), count4()
{ {
useCache = false; useCache = false;
blockHistogramSize = count1 = count2 = count4 = 0;
descriptor = 0; descriptor = 0;
} }
HOGCache::HOGCache(const HOGDescriptor* _descriptor, HOGCache::HOGCache(const HOGDescriptor* _descriptor,
const Mat& _img, Size _paddingTL, Size _paddingBR, const Mat& _img, const Size& _paddingTL, const Size& _paddingBR,
bool _useCache, Size _cacheStride) bool _useCache, const Size& _cacheStride)
{ {
init(_descriptor, _img, _paddingTL, _paddingBR, _useCache, _cacheStride); init(_descriptor, _img, _paddingTL, _paddingBR, _useCache, _cacheStride);
} }
void HOGCache::init(const HOGDescriptor* _descriptor, void HOGCache::init(const HOGDescriptor* _descriptor,
const Mat& _img, Size _paddingTL, Size _paddingBR, const Mat& _img, const Size& _paddingTL, const Size& _paddingBR,
bool _useCache, Size _cacheStride) bool _useCache, const Size& _cacheStride)
{ {
descriptor = _descriptor; descriptor = _descriptor;
cacheStride = _cacheStride; cacheStride = _cacheStride;
...@@ -468,8 +515,10 @@ void HOGCache::init(const HOGDescriptor* _descriptor, ...@@ -468,8 +515,10 @@ void HOGCache::init(const HOGDescriptor* _descriptor,
{ {
Size cacheSize((grad.cols - blockSize.width)/cacheStride.width+1, Size cacheSize((grad.cols - blockSize.width)/cacheStride.width+1,
(winSize.height/cacheStride.height)+1); (winSize.height/cacheStride.height)+1);
blockCache.create(cacheSize.height, cacheSize.width*blockHistogramSize); blockCache.create(cacheSize.height, cacheSize.width*blockHistogramSize);
blockCacheFlags.create(cacheSize); blockCacheFlags.create(cacheSize);
size_t cacheRows = blockCache.rows; size_t cacheRows = blockCache.rows;
ymaxCached.resize(cacheRows); ymaxCached.resize(cacheRows);
for(size_t ii = 0; ii < cacheRows; ii++ ) for(size_t ii = 0; ii < cacheRows; ii++ )
...@@ -480,12 +529,52 @@ void HOGCache::init(const HOGDescriptor* _descriptor, ...@@ -480,12 +529,52 @@ void HOGCache::init(const HOGDescriptor* _descriptor,
float sigma = (float)descriptor->getWinSigma(); float sigma = (float)descriptor->getWinSigma();
float scale = 1.f/(sigma*sigma*2); float scale = 1.f/(sigma*sigma*2);
{
AutoBuffer<float> di(blockSize.height), dj(blockSize.width);
float* _di = (float*)di, *_dj = (float*)dj;
float bh = blockSize.height * 0.5f, bw = blockSize.width * 0.5f;
i = 0;
#if CV_SSE2
const int a[] = { 0, 1, 2, 3 };
__m128i idx = _mm_loadu_si128((__m128i*)a);
__m128 _bw = _mm_set1_ps(bw), _bh = _mm_set1_ps(bh);
__m128i ifour = _mm_set1_epi32(4);
for (; i <= blockSize.height - 4; i += 4)
{
__m128 t = _mm_sub_ps(_mm_cvtepi32_ps(idx), _bh);
t = _mm_mul_ps(t, t);
idx = _mm_add_epi32(idx, ifour);
_mm_storeu_ps(_di + i, t);
}
#endif
for ( ; i < blockSize.height; ++i)
{
_di[i] = i - bh;
_di[i] *= _di[i];
}
j = 0;
#if CV_SSE2
idx = _mm_loadu_si128((__m128i*)a);
for (; j <= blockSize.width - 4; j += 4)
{
__m128 t = _mm_sub_ps(_mm_cvtepi32_ps(idx), _bw);
t = _mm_mul_ps(t, t);
idx = _mm_add_epi32(idx, ifour);
_mm_storeu_ps(_dj + j, t);
}
#endif
for ( ; j < blockSize.width; ++j)
{
_dj[j] = j - bw;
_dj[j] *= _dj[j];
}
for(i = 0; i < blockSize.height; i++) for(i = 0; i < blockSize.height; i++)
for(j = 0; j < blockSize.width; j++) for(j = 0; j < blockSize.width; j++)
{ weights(i,j) = std::exp(-(_di[i] + _dj[j])*scale);
float di = i - blockSize.height*0.5f;
float dj = j - blockSize.width*0.5f;
weights(i,j) = std::exp(-(di*di + dj*dj)*scale);
} }
blockData.resize(nblocks.width*nblocks.height); blockData.resize(nblocks.width*nblocks.height);
...@@ -515,6 +604,7 @@ void HOGCache::init(const HOGDescriptor* _descriptor, ...@@ -515,6 +604,7 @@ void HOGCache::init(const HOGDescriptor* _descriptor,
// and also unrolled. Inside we use PixData[k] to access the gradient values and // and also unrolled. Inside we use PixData[k] to access the gradient values and
// update the histogram // update the histogram
// //
count1 = count2 = count4 = 0; count1 = count2 = count4 = 0;
for( j = 0; j < blockSize.width; j++ ) for( j = 0; j < blockSize.width; j++ )
for( i = 0; i < blockSize.height; i++ ) for( i = 0; i < blockSize.height; i++ )
...@@ -617,17 +707,16 @@ void HOGCache::init(const HOGDescriptor* _descriptor, ...@@ -617,17 +707,16 @@ void HOGCache::init(const HOGDescriptor* _descriptor,
} }
} }
const float* HOGCache::getBlock(Point pt, float* buf) const float* HOGCache::getBlock(Point pt, float* buf)
{ {
float* blockHist = buf; float* blockHist = buf;
assert(descriptor != 0); assert(descriptor != 0);
Size blockSize = descriptor->blockSize; // Size blockSize = descriptor->blockSize;
pt += imgoffset; pt += imgoffset;
CV_Assert( (unsigned)pt.x <= (unsigned)(grad.cols - blockSize.width) && // CV_Assert( (unsigned)pt.x <= (unsigned)(grad.cols - blockSize.width) &&
(unsigned)pt.y <= (unsigned)(grad.rows - blockSize.height) ); // (unsigned)pt.y <= (unsigned)(grad.rows - blockSize.height) );
if( useCache ) if( useCache )
{ {
...@@ -653,35 +742,58 @@ const float* HOGCache::getBlock(Point pt, float* buf) ...@@ -653,35 +742,58 @@ const float* HOGCache::getBlock(Point pt, float* buf)
const float* gradPtr = (const float*)(grad.data + grad.step*pt.y) + pt.x*2; const float* gradPtr = (const float*)(grad.data + grad.step*pt.y) + pt.x*2;
const uchar* qanglePtr = qangle.data + qangle.step*pt.y + pt.x*2; const uchar* qanglePtr = qangle.data + qangle.step*pt.y + pt.x*2;
CV_Assert( blockHist != 0 ); // CV_Assert( blockHist != 0 );
#ifdef HAVE_IPP memset(blockHist, 0, sizeof(float) * blockHistogramSize);
ippsZero_32f(blockHist,blockHistogramSize);
#else
for( k = 0; k < blockHistogramSize; k++ )
blockHist[k] = 0.f;
#endif
const PixData* _pixData = &pixData[0]; const PixData* _pixData = &pixData[0];
for( k = 0; k < C1; k++ ) for( k = 0; k < C1; k++ )
{ {
const PixData& pk = _pixData[k]; const PixData& pk = _pixData[k];
const float* a = gradPtr + pk.gradOfs; const float* const a = gradPtr + pk.gradOfs;
float w = pk.gradWeight*pk.histWeights[0]; float w = pk.gradWeight*pk.histWeights[0];
const uchar* h = qanglePtr + pk.qangleOfs; const uchar* h = qanglePtr + pk.qangleOfs;
int h0 = h[0], h1 = h[1]; int h0 = h[0], h1 = h[1];
float* hist = blockHist + pk.histOfs[0]; float* hist = blockHist + pk.histOfs[0];
float t0 = hist[h0] + a[0]*w; float t0 = hist[h0] + a[0]*w;
float t1 = hist[h1] + a[1]*w; float t1 = hist[h1] + a[1]*w;
hist[h0] = t0; hist[h1] = t1; hist[h0] = t0; hist[h1] = t1;
} }
#if CV_SSE2
float hist0[4], hist1[4];
for( ; k < C2; k++ ) for( ; k < C2; k++ )
{ {
const PixData& pk = _pixData[k]; const PixData& pk = _pixData[k];
const float* a = gradPtr + pk.gradOfs; const float* const a = gradPtr + pk.gradOfs;
const uchar* const h = qanglePtr + pk.qangleOfs;
int h0 = h[0], h1 = h[1];
__m128 _a0 = _mm_set1_ps(a[0]), _a1 = _mm_set1_ps(a[1]);
__m128 _w = _mm_mul_ps(_mm_set1_ps(pk.gradWeight), _mm_loadu_ps(pk.histWeights));
__m128 _t0 = _mm_mul_ps(_a0, _w), _t1 = _mm_mul_ps(_a1, _w);
_mm_storeu_ps(hist0, _t0);
_mm_storeu_ps(hist1, _t1);
float* hist = blockHist + pk.histOfs[0];
float t0 = hist[h0] + hist0[0];
float t1 = hist[h1] + hist1[0];
hist[h0] = t0; hist[h1] = t1;
hist = blockHist + pk.histOfs[1];
t0 = hist[h0] + hist0[1];
t1 = hist[h1] + hist1[1];
hist[h0] = t0; hist[h1] = t1;
}
#else
for( ; k < C2; k++ )
{
const PixData& pk = _pixData[k];
const float* const a = gradPtr + pk.gradOfs;
float w, t0, t1, a0 = a[0], a1 = a[1]; float w, t0, t1, a0 = a[0], a1 = a[1];
const uchar* h = qanglePtr + pk.qangleOfs; const uchar* const h = qanglePtr + pk.qangleOfs;
int h0 = h[0], h1 = h[1]; int h0 = h[0], h1 = h[1];
float* hist = blockHist + pk.histOfs[0]; float* hist = blockHist + pk.histOfs[0];
...@@ -696,7 +808,65 @@ const float* HOGCache::getBlock(Point pt, float* buf) ...@@ -696,7 +808,65 @@ const float* HOGCache::getBlock(Point pt, float* buf)
t1 = hist[h1] + a1*w; t1 = hist[h1] + a1*w;
hist[h0] = t0; hist[h1] = t1; hist[h0] = t0; hist[h1] = t1;
} }
#endif
#if CV_SSE2
for( ; k < C4; k++ )
{
const PixData& pk = _pixData[k];
const float* const a = gradPtr + pk.gradOfs;
const uchar* const h = qanglePtr + pk.qangleOfs;
int h0 = h[0], h1 = h[1];
__m128 _a0 = _mm_set1_ps(a[0]), _a1 = _mm_set1_ps(a[1]);
__m128 _w = _mm_mul_ps(_mm_set1_ps(pk.gradWeight), _mm_loadu_ps(pk.histWeights));
__m128 _t0 = _mm_mul_ps(_a0, _w), _t1 = _mm_mul_ps(_a1, _w);
_mm_storeu_ps(hist0, _t0);
_mm_storeu_ps(hist1, _t1);
float* hist = blockHist + pk.histOfs[0];
float t0 = hist[h0] + hist0[0];
float t1 = hist[h1] + hist1[0];
hist[h0] = t0; hist[h1] = t1;
hist = blockHist + pk.histOfs[1];
t0 = hist[h0] + hist0[1];
t1 = hist[h1] + hist1[1];
hist[h0] = t0; hist[h1] = t1;
hist = blockHist + pk.histOfs[2];
t0 = hist[h0] + hist0[2];
t1 = hist[h1] + hist1[2];
hist[h0] = t0; hist[h1] = t1;
hist = blockHist + pk.histOfs[3];
t0 = hist[h0] + hist0[3];
t1 = hist[h1] + hist1[3];
hist[h0] = t0; hist[h1] = t1;
// __m128 _hist0 = _mm_set_ps((blockHist + pk.histOfs[3])[h0], (blockHist + pk.histOfs[2])[h0],
// (blockHist + pk.histOfs[1])[h0], (blockHist + pk.histOfs[0])[h0]);
// __m128 _hist1 = _mm_set_ps((blockHist + pk.histOfs[3])[h1], (blockHist + pk.histOfs[2])[h1],
// (blockHist + pk.histOfs[1])[h1], (blockHist + pk.histOfs[0])[h1]);
//
// _hist0 = _mm_add_ps(_t0, _hist0);
// _hist1 = _mm_add_ps(_t1, _hist1);
//
// _mm_storeu_ps(hist0, _hist0);
// _mm_storeu_ps(hist1, _hist1);
//
// (pk.histOfs[0] + blockHist)[h0] = hist0[0];
// (pk.histOfs[1] + blockHist)[h0] = hist0[1];
// (pk.histOfs[2] + blockHist)[h0] = hist0[2];
// (pk.histOfs[3] + blockHist)[h0] = hist0[3];
//
// (pk.histOfs[0] + blockHist)[h1] = hist1[0];
// (pk.histOfs[1] + blockHist)[h1] = hist1[1];
// (pk.histOfs[2] + blockHist)[h1] = hist1[2];
// (pk.histOfs[3] + blockHist)[h1] = hist1[3];
}
#else
for( ; k < C4; k++ ) for( ; k < C4; k++ )
{ {
const PixData& pk = _pixData[k]; const PixData& pk = _pixData[k];
...@@ -729,60 +899,115 @@ const float* HOGCache::getBlock(Point pt, float* buf) ...@@ -729,60 +899,115 @@ const float* HOGCache::getBlock(Point pt, float* buf)
t1 = hist[h1] + a1*w; t1 = hist[h1] + a1*w;
hist[h0] = t0; hist[h1] = t1; hist[h0] = t0; hist[h1] = t1;
} }
#endif
normalizeBlockHistogram(blockHist); normalizeBlockHistogram(blockHist);
return blockHist; return blockHist;
} }
void HOGCache::normalizeBlockHistogram(float* _hist) const void HOGCache::normalizeBlockHistogram(float* _hist) const
{ {
float* hist = &_hist[0]; float* hist = &_hist[0], sum = 0.0f, partSum[4];
#ifdef HAVE_IPP size_t i = 0, sz = blockHistogramSize;
size_t sz = blockHistogramSize;
#else #if CV_SSE2
size_t i, sz = blockHistogramSize; __m128 p0 = _mm_loadu_ps(hist);
#endif __m128 s = _mm_mul_ps(p0, p0);
float sum = 0; for (i = 4; i <= sz - 4; i += 4)
#ifdef HAVE_IPP {
ippsDotProd_32f(hist,hist,sz,&sum); p0 = _mm_loadu_ps(hist + i);
s = _mm_add_ps(s, _mm_mul_ps(p0, p0));
}
_mm_storeu_ps(partSum, s);
#else #else
for( i = 0; i < sz; i++ ) partSum[0] = 0.0f;
sum += hist[i]*hist[i]; partSum[1] = 0.0f;
partSum[2] = 0.0f;
partSum[3] = 0.0f;
for ( ; i <= sz - 4; i += 4)
{
partSum[0] += hist[i] * hist[i];
partSum[1] += hist[i+1] * hist[i+1];
partSum[2] += hist[i+2] * hist[i+2];
partSum[3] += hist[i+3] * hist[i+3];
}
#endif #endif
float t0 = partSum[0] + partSum[1];
float t1 = partSum[2] + partSum[3];
sum = t0 + t1;
for ( ; i < sz; ++i)
sum += hist[i]*hist[i];
float scale = 1.f/(std::sqrt(sum)+sz*0.1f), thresh = (float)descriptor->L2HysThreshold; float scale = 1.f/(std::sqrt(sum)+sz*0.1f), thresh = (float)descriptor->L2HysThreshold;
#ifdef HAVE_IPP i = 0, sum = 0.0f;
ippsMulC_32f_I(scale,hist,sz);
ippsThreshold_32f_I( hist, sz, thresh, ippCmpGreater ); #if CV_SSE2
ippsDotProd_32f(hist,hist,sz,&sum); __m128 _scale = _mm_set1_ps(scale);
static __m128 _threshold = _mm_set1_ps(thresh);
__m128 p = _mm_mul_ps(_scale, _mm_loadu_ps(hist));
p = _mm_min_ps(p, _threshold);
s = _mm_mul_ps(p, p);
_mm_storeu_ps(hist, p);
for(i = 4 ; i <= sz - 4; i += 4)
{
p = _mm_loadu_ps(hist + i);
p = _mm_mul_ps(p, _scale);
p = _mm_min_ps(p, _threshold);
s = _mm_add_ps(s, _mm_mul_ps(p, p));
_mm_storeu_ps(hist + i, p);
}
_mm_storeu_ps(partSum, s);
#else #else
for( i = 0, sum = 0; i < sz; i++ ) partSum[0] = 0.0f;
partSum[1] = 0.0f;
partSum[2] = 0.0f;
partSum[3] = 0.0f;
for( ; i <= sz - 4; i += 4)
{ {
hist[i] = std::min(hist[i]*scale, thresh); hist[i] = std::min(hist[i]*scale, thresh);
sum += hist[i]*hist[i]; hist[i+1] = std::min(hist[i+1]*scale, thresh);
hist[i+2] = std::min(hist[i+2]*scale, thresh);
hist[i+3] = std::min(hist[i+3]*scale, thresh);
partSum[0] += hist[i]*hist[i];
partSum[1] += hist[i+1]*hist[i+1];
partSum[2] += hist[i+2]*hist[i+2];
partSum[3] += hist[i+3]*hist[i+3];
} }
#endif #endif
t0 = partSum[0] + partSum[1];
t1 = partSum[2] + partSum[3];
sum = t0 + t1;
for( ; i < sz; ++i)
{
hist[i] = std::min(hist[i]*scale, thresh);
sum += hist[i]*hist[i];
}
scale = 1.f/(std::sqrt(sum)+1e-3f); scale = 1.f/(std::sqrt(sum)+1e-3f), i = 0;
#ifdef HAVE_IPP #if CV_SSE2
ippsMulC_32f_I(scale,hist,sz); __m128 _scale2 = _mm_set1_ps(scale);
#else for ( ; i <= sz - 4; i += 4)
for( i = 0; i < sz; i++ ) {
hist[i] *= scale; __m128 t = _mm_mul_ps(_scale2, _mm_loadu_ps(hist + i));
_mm_storeu_ps(hist + i, t);
}
#endif #endif
for ( ; i < sz; ++i)
hist[i] *= scale;
} }
Size HOGCache::windowsInImage(const Size& imageSize, const Size& winStride) const
Size HOGCache::windowsInImage(Size imageSize, Size winStride) const
{ {
return Size((imageSize.width - winSize.width)/winStride.width + 1, return Size((imageSize.width - winSize.width)/winStride.width + 1,
(imageSize.height - winSize.height)/winStride.height + 1); (imageSize.height - winSize.height)/winStride.height + 1);
} }
Rect HOGCache::getWindow(Size imageSize, Size winStride, int idx) const Rect HOGCache::getWindow(const Size& imageSize, const Size& winStride, int idx) const
{ {
int nwindowsX = (imageSize.width - winSize.width)/winStride.width + 1; int nwindowsX = (imageSize.width - winSize.width)/winStride.width + 1;
int y = idx / nwindowsX; int y = idx / nwindowsX;
...@@ -790,15 +1015,14 @@ Rect HOGCache::getWindow(Size imageSize, Size winStride, int idx) const ...@@ -790,15 +1015,14 @@ Rect HOGCache::getWindow(Size imageSize, Size winStride, int idx) const
return Rect( x*winStride.width, y*winStride.height, winSize.width, winSize.height ); return Rect( x*winStride.width, y*winStride.height, winSize.width, winSize.height );
} }
void HOGDescriptor::compute(const Mat& img, std::vector<float>& descriptors, void HOGDescriptor::compute(const Mat& img, std::vector<float>& descriptors,
Size winStride, Size padding, Size winStride, Size padding, const std::vector<Point>& locations) const
const std::vector<Point>& locations) const
{ {
if( winStride == Size() ) if( winStride == Size() )
winStride = cellSize; winStride = cellSize;
Size cacheStride(gcd(winStride.width, blockStride.width), Size cacheStride(gcd(winStride.width, blockStride.width),
gcd(winStride.height, blockStride.height)); gcd(winStride.height, blockStride.height));
size_t nwindows = locations.size(); size_t nwindows = locations.size();
padding.width = (int)alignSize(std::max(padding.width, 0), cacheStride.width); padding.width = (int)alignSize(std::max(padding.width, 0), cacheStride.width);
padding.height = (int)alignSize(std::max(padding.height, 0), cacheStride.height); padding.height = (int)alignSize(std::max(padding.height, 0), cacheStride.height);
...@@ -816,6 +1040,7 @@ void HOGDescriptor::compute(const Mat& img, std::vector<float>& descriptors, ...@@ -816,6 +1040,7 @@ void HOGDescriptor::compute(const Mat& img, std::vector<float>& descriptors,
size_t dsize = getDescriptorSize(); size_t dsize = getDescriptorSize();
descriptors.resize(dsize*nwindows); descriptors.resize(dsize*nwindows);
// for each window
for( size_t i = 0; i < nwindows; i++ ) for( size_t i = 0; i < nwindows; i++ )
{ {
float* descriptor = &descriptors[i*dsize]; float* descriptor = &descriptors[i*dsize];
...@@ -831,7 +1056,7 @@ void HOGDescriptor::compute(const Mat& img, std::vector<float>& descriptors, ...@@ -831,7 +1056,7 @@ void HOGDescriptor::compute(const Mat& img, std::vector<float>& descriptors,
else else
{ {
pt0 = cache.getWindow(paddedImgSize, winStride, (int)i).tl() - Point(padding); pt0 = cache.getWindow(paddedImgSize, winStride, (int)i).tl() - Point(padding);
CV_Assert(pt0.x % cacheStride.width == 0 && pt0.y % cacheStride.height == 0); // CV_Assert(pt0.x % cacheStride.width == 0 && pt0.y % cacheStride.height == 0);
} }
for( int j = 0; j < nblocks; j++ ) for( int j = 0; j < nblocks; j++ )
...@@ -842,17 +1067,11 @@ void HOGDescriptor::compute(const Mat& img, std::vector<float>& descriptors, ...@@ -842,17 +1067,11 @@ void HOGDescriptor::compute(const Mat& img, std::vector<float>& descriptors,
float* dst = descriptor + bj.histOfs; float* dst = descriptor + bj.histOfs;
const float* src = cache.getBlock(pt, dst); const float* src = cache.getBlock(pt, dst);
if( src != dst ) if( src != dst )
#ifdef HAVE_IPP memcpy(dst, src, blockHistogramSize * sizeof(float));
ippsCopy_32f(src,dst,blockHistogramSize);
#else
for( int k = 0; k < blockHistogramSize; k++ )
dst[k] = src[k];
#endif
} }
} }
} }
void HOGDescriptor::detect(const Mat& img, void HOGDescriptor::detect(const Mat& img,
std::vector<Point>& hits, std::vector<double>& weights, double hitThreshold, std::vector<Point>& hits, std::vector<double>& weights, double hitThreshold,
Size winStride, Size padding, const std::vector<Point>& locations) const Size winStride, Size padding, const std::vector<Point>& locations) const
...@@ -865,6 +1084,7 @@ void HOGDescriptor::detect(const Mat& img, ...@@ -865,6 +1084,7 @@ void HOGDescriptor::detect(const Mat& img,
winStride = cellSize; winStride = cellSize;
Size cacheStride(gcd(winStride.width, blockStride.width), Size cacheStride(gcd(winStride.width, blockStride.width),
gcd(winStride.height, blockStride.height)); gcd(winStride.height, blockStride.height));
size_t nwindows = locations.size(); size_t nwindows = locations.size();
padding.width = (int)alignSize(std::max(padding.width, 0), cacheStride.width); padding.width = (int)alignSize(std::max(padding.width, 0), cacheStride.width);
padding.height = (int)alignSize(std::max(padding.height, 0), cacheStride.height); padding.height = (int)alignSize(std::max(padding.height, 0), cacheStride.height);
...@@ -884,6 +1104,10 @@ void HOGDescriptor::detect(const Mat& img, ...@@ -884,6 +1104,10 @@ void HOGDescriptor::detect(const Mat& img,
double rho = svmDetector.size() > dsize ? svmDetector[dsize] : 0; double rho = svmDetector.size() > dsize ? svmDetector[dsize] : 0;
std::vector<float> blockHist(blockHistogramSize); std::vector<float> blockHist(blockHistogramSize);
#if CV_SSE2
float partSum[4];
#endif
for( size_t i = 0; i < nwindows; i++ ) for( size_t i = 0; i < nwindows; i++ )
{ {
Point pt0; Point pt0;
...@@ -901,28 +1125,38 @@ void HOGDescriptor::detect(const Mat& img, ...@@ -901,28 +1125,38 @@ void HOGDescriptor::detect(const Mat& img,
} }
double s = rho; double s = rho;
const float* svmVec = &svmDetector[0]; const float* svmVec = &svmDetector[0];
#ifdef HAVE_IPP
int j;
#else
int j, k; int j, k;
#endif
for( j = 0; j < nblocks; j++, svmVec += blockHistogramSize ) for( j = 0; j < nblocks; j++, svmVec += blockHistogramSize )
{ {
const HOGCache::BlockData& bj = blockData[j]; const HOGCache::BlockData& bj = blockData[j];
Point pt = pt0 + bj.imgOffset; Point pt = pt0 + bj.imgOffset;
const float* vec = cache.getBlock(pt, &blockHist[0]); const float* vec = cache.getBlock(pt, &blockHist[0]);
#ifdef HAVE_IPP #if CV_SSE2
Ipp32f partSum; __m128 _vec = _mm_loadu_ps(vec);
ippsDotProd_32f(vec,svmVec,blockHistogramSize,&partSum); __m128 _svmVec = _mm_loadu_ps(svmVec);
s += (double)partSum; __m128 sum = _mm_mul_ps(_svmVec, _vec);
for( k = 4; k <= blockHistogramSize - 4; k += 4 )
{
_vec = _mm_loadu_ps(vec + k);
_svmVec = _mm_loadu_ps(svmVec + k);
sum = _mm_add_ps(sum, _mm_mul_ps(_vec, _svmVec));
}
_mm_storeu_ps(partSum, sum);
double t0 = partSum[0] + partSum[1];
double t1 = partSum[2] + partSum[3];
s += t0 + t1;
#else #else
for( k = 0; k <= blockHistogramSize - 4; k += 4 ) for( k = 0; k <= blockHistogramSize - 4; k += 4 )
s += vec[k]*svmVec[k] + vec[k+1]*svmVec[k+1] + s += vec[k]*svmVec[k] + vec[k+1]*svmVec[k+1] +
vec[k+2]*svmVec[k+2] + vec[k+3]*svmVec[k+3]; vec[k+2]*svmVec[k+2] + vec[k+3]*svmVec[k+3];
#endif
for( ; k < blockHistogramSize; k++ ) for( ; k < blockHistogramSize; k++ )
s += vec[k]*svmVec[k]; s += vec[k]*svmVec[k];
#endif
} }
if( s >= hitThreshold ) if( s >= hitThreshold )
{ {
...@@ -939,11 +1173,12 @@ void HOGDescriptor::detect(const Mat& img, std::vector<Point>& hits, double hitT ...@@ -939,11 +1173,12 @@ void HOGDescriptor::detect(const Mat& img, std::vector<Point>& hits, double hitT
detect(img, hits, weightsV, hitThreshold, winStride, padding, locations); detect(img, hits, weightsV, hitThreshold, winStride, padding, locations);
} }
class HOGInvoker : public ParallelLoopBody class HOGInvoker :
public ParallelLoopBody
{ {
public: public:
HOGInvoker( const HOGDescriptor* _hog, const Mat& _img, HOGInvoker( const HOGDescriptor* _hog, const Mat& _img,
double _hitThreshold, Size _winStride, Size _padding, double _hitThreshold, const Size& _winStride, const Size& _padding,
const double* _levelScale, std::vector<Rect> * _vec, Mutex* _mtx, const double* _levelScale, std::vector<Rect> * _vec, Mutex* _mtx,
std::vector<double>* _weights=0, std::vector<double>* _scales=0 ) std::vector<double>* _weights=0, std::vector<double>* _scales=0 )
{ {
...@@ -987,24 +1222,21 @@ public: ...@@ -987,24 +1222,21 @@ public:
cvRound(locations[j].y*scale), cvRound(locations[j].y*scale),
scaledWinSize.width, scaledWinSize.height)); scaledWinSize.width, scaledWinSize.height));
if (scales) if (scales)
{
scales->push_back(scale); scales->push_back(scale);
} }
}
mtx->unlock(); mtx->unlock();
if (weights && (!hitsWeights.empty())) if (weights && (!hitsWeights.empty()))
{ {
mtx->lock(); mtx->lock();
for (size_t j = 0; j < locations.size(); j++) for (size_t j = 0; j < locations.size(); j++)
{
weights->push_back(hitsWeights[j]); weights->push_back(hitsWeights[j]);
}
mtx->unlock(); mtx->unlock();
} }
} }
} }
private:
const HOGDescriptor* hog; const HOGDescriptor* hog;
Mat img; Mat img;
double hitThreshold; double hitThreshold;
...@@ -1017,7 +1249,6 @@ public: ...@@ -1017,7 +1249,6 @@ public:
Mutex* mtx; Mutex* mtx;
}; };
void HOGDescriptor::detectMultiScale( void HOGDescriptor::detectMultiScale(
const Mat& img, std::vector<Rect>& foundLocations, std::vector<double>& foundWeights, const Mat& img, std::vector<Rect>& foundLocations, std::vector<double>& foundWeights,
double hitThreshold, Size winStride, Size padding, double hitThreshold, Size winStride, Size padding,
...@@ -1045,8 +1276,9 @@ void HOGDescriptor::detectMultiScale( ...@@ -1045,8 +1276,9 @@ void HOGDescriptor::detectMultiScale(
std::vector<double> foundScales; std::vector<double> foundScales;
Mutex mtx; Mutex mtx;
parallel_for_(Range(0, (int)levelScale.size()), Range range(0, (int)levelScale.size());
HOGInvoker(this, img, hitThreshold, winStride, padding, &levelScale[0], &allCandidates, &mtx, &tempWeights, &tempScales)); HOGInvoker invoker(this, img, hitThreshold, winStride, padding, &levelScale[0], &allCandidates, &mtx, &tempWeights, &tempScales);
parallel_for_(range, invoker);
std::copy(tempScales.begin(), tempScales.end(), back_inserter(foundScales)); std::copy(tempScales.begin(), tempScales.end(), back_inserter(foundScales));
foundLocations.clear(); foundLocations.clear();
...@@ -1055,13 +1287,9 @@ void HOGDescriptor::detectMultiScale( ...@@ -1055,13 +1287,9 @@ void HOGDescriptor::detectMultiScale(
std::copy(tempWeights.begin(), tempWeights.end(), back_inserter(foundWeights)); std::copy(tempWeights.begin(), tempWeights.end(), back_inserter(foundWeights));
if ( useMeanshiftGrouping ) if ( useMeanshiftGrouping )
{
groupRectangles_meanshift(foundLocations, foundWeights, foundScales, finalThreshold, winSize); groupRectangles_meanshift(foundLocations, foundWeights, foundScales, finalThreshold, winSize);
}
else else
{
groupRectangles(foundLocations, (int)finalThreshold, 0.2); groupRectangles(foundLocations, (int)finalThreshold, 0.2);
}
} }
void HOGDescriptor::detectMultiScale(const Mat& img, std::vector<Rect>& foundLocations, void HOGDescriptor::detectMultiScale(const Mat& img, std::vector<Rect>& foundLocations,
...@@ -1888,8 +2116,9 @@ std::vector<float> HOGDescriptor::getDefaultPeopleDetector() ...@@ -1888,8 +2116,9 @@ std::vector<float> HOGDescriptor::getDefaultPeopleDetector()
-0.02411991f, -0.04229729f, 0.10666174f, -6.66579151f }; -0.02411991f, -0.04229729f, 0.10666174f, -6.66579151f };
return std::vector<float>(detector, detector + sizeof(detector)/sizeof(detector[0])); return std::vector<float>(detector, detector + sizeof(detector)/sizeof(detector[0]));
} }
//This function renurn 1981 SVM coeffs obtained from daimler's base.
//To use these coeffs the detection window size should be (48,96) // This function renurn 1981 SVM coeffs obtained from daimler's base.
// To use these coeffs the detection window size should be (48,96)
std::vector<float> HOGDescriptor::getDaimlerPeopleDetector() std::vector<float> HOGDescriptor::getDaimlerPeopleDetector()
{ {
static const float detector[] = { static const float detector[] = {
...@@ -2392,11 +2621,12 @@ std::vector<float> HOGDescriptor::getDaimlerPeopleDetector() ...@@ -2392,11 +2621,12 @@ std::vector<float> HOGDescriptor::getDaimlerPeopleDetector()
return std::vector<float>(detector, detector + sizeof(detector)/sizeof(detector[0])); return std::vector<float>(detector, detector + sizeof(detector)/sizeof(detector[0]));
} }
class HOGConfInvoker : public ParallelLoopBody class HOGConfInvoker :
public ParallelLoopBody
{ {
public: public:
HOGConfInvoker( const HOGDescriptor* _hog, const Mat& _img, HOGConfInvoker( const HOGDescriptor* _hog, const Mat& _img,
double _hitThreshold, Size _padding, double _hitThreshold, const Size& _padding,
std::vector<DetectionROI>* locs, std::vector<DetectionROI>* locs,
std::vector<Rect>* _vec, Mutex* _mtx ) std::vector<Rect>* _vec, Mutex* _mtx )
{ {
...@@ -2433,11 +2663,9 @@ public: ...@@ -2433,11 +2663,9 @@ public:
Size scaledWinSize = Size(cvRound(hog->winSize.width*scale), cvRound(hog->winSize.height*scale)); Size scaledWinSize = Size(cvRound(hog->winSize.width*scale), cvRound(hog->winSize.height*scale));
mtx->lock(); mtx->lock();
for( size_t j = 0; j < dets.size(); j++ ) for( size_t j = 0; j < dets.size(); j++ )
{
vec->push_back(Rect(cvRound(dets[j].x*scale), vec->push_back(Rect(cvRound(dets[j].x*scale),
cvRound(dets[j].y*scale), cvRound(dets[j].y*scale),
scaledWinSize.width, scaledWinSize.height)); scaledWinSize.width, scaledWinSize.height));
}
mtx->unlock(); mtx->unlock();
} }
} }
...@@ -2453,22 +2681,16 @@ public: ...@@ -2453,22 +2681,16 @@ public:
void HOGDescriptor::detectROI(const cv::Mat& img, const std::vector<cv::Point> &locations, void HOGDescriptor::detectROI(const cv::Mat& img, const std::vector<cv::Point> &locations,
CV_OUT std::vector<cv::Point>& foundLocations, CV_OUT std::vector<double>& confidences, CV_OUT std::vector<cv::Point>& foundLocations, CV_OUT std::vector<double>& confidences,
double hitThreshold, cv::Size winStride, double hitThreshold, cv::Size winStride, cv::Size padding) const
cv::Size padding) const
{ {
foundLocations.clear(); foundLocations.clear();
confidences.clear(); confidences.clear();
if( svmDetector.empty() ) if( svmDetector.empty() || locations.empty())
return;
if( locations.empty() )
return; return;
if( winStride == Size() ) if( winStride == Size() )
winStride = cellSize; winStride = cellSize;
Size cacheStride(gcd(winStride.width, blockStride.width), Size cacheStride(gcd(winStride.width, blockStride.width),
gcd(winStride.height, blockStride.height)); gcd(winStride.height, blockStride.height));
...@@ -2491,6 +2713,10 @@ void HOGDescriptor::detectROI(const cv::Mat& img, const std::vector<cv::Point> & ...@@ -2491,6 +2713,10 @@ void HOGDescriptor::detectROI(const cv::Mat& img, const std::vector<cv::Point> &
double rho = svmDetector.size() > dsize ? svmDetector[dsize] : 0; double rho = svmDetector.size() > dsize ? svmDetector[dsize] : 0;
std::vector<float> blockHist(blockHistogramSize); std::vector<float> blockHist(blockHistogramSize);
#if CV_SSE2
float partSum[4];
#endif
for( size_t i = 0; i < nwindows; i++ ) for( size_t i = 0; i < nwindows; i++ )
{ {
Point pt0; Point pt0;
...@@ -2511,33 +2737,51 @@ void HOGDescriptor::detectROI(const cv::Mat& img, const std::vector<cv::Point> & ...@@ -2511,33 +2737,51 @@ void HOGDescriptor::detectROI(const cv::Mat& img, const std::vector<cv::Point> &
{ {
const HOGCache::BlockData& bj = blockData[j]; const HOGCache::BlockData& bj = blockData[j];
Point pt = pt0 + bj.imgOffset; Point pt = pt0 + bj.imgOffset;
// need to devide this into 4 parts! // need to devide this into 4 parts!
const float* vec = cache.getBlock(pt, &blockHist[0]); const float* vec = cache.getBlock(pt, &blockHist[0]);
#if CV_SSE2
__m128 _vec = _mm_loadu_ps(vec);
__m128 _svmVec = _mm_loadu_ps(svmVec);
__m128 sum = _mm_mul_ps(_svmVec, _vec);
for( k = 4; k <= blockHistogramSize - 4; k += 4 )
{
_vec = _mm_loadu_ps(vec + k);
_svmVec = _mm_loadu_ps(svmVec + k);
sum = _mm_add_ps(sum, _mm_mul_ps(_vec, _svmVec));
}
_mm_storeu_ps(partSum, sum);
double t0 = partSum[0] + partSum[1];
double t1 = partSum[2] + partSum[3];
s += t0 + t1;
#else
for( k = 0; k <= blockHistogramSize - 4; k += 4 ) for( k = 0; k <= blockHistogramSize - 4; k += 4 )
s += vec[k]*svmVec[k] + vec[k+1]*svmVec[k+1] + s += vec[k]*svmVec[k] + vec[k+1]*svmVec[k+1] +
vec[k+2]*svmVec[k+2] + vec[k+3]*svmVec[k+3]; vec[k+2]*svmVec[k+2] + vec[k+3]*svmVec[k+3];
#endif
for( ; k < blockHistogramSize; k++ ) for( ; k < blockHistogramSize; k++ )
s += vec[k]*svmVec[k]; s += vec[k]*svmVec[k];
} }
// cv::waitKey();
confidences.push_back(s); confidences.push_back(s);
if( s >= hitThreshold ) if( s >= hitThreshold )
foundLocations.push_back(pt0); foundLocations.push_back(pt0);
} }
} }
void HOGDescriptor::detectMultiScaleROI(const cv::Mat& img, void HOGDescriptor::detectMultiScaleROI(const cv::Mat& img,
CV_OUT std::vector<cv::Rect>& foundLocations, CV_OUT std::vector<cv::Rect>& foundLocations, std::vector<DetectionROI>& locations,
std::vector<DetectionROI>& locations, double hitThreshold, int groupThreshold) const
double hitThreshold,
int groupThreshold) const
{ {
std::vector<Rect> allCandidates; std::vector<Rect> allCandidates;
Mutex mtx; Mutex mtx;
parallel_for_(Range(0, (int)locations.size()), parallel_for_(Range(0, (int)locations.size()),
HOGConfInvoker(this, img, hitThreshold, Size(8, 8), &locations, &allCandidates, &mtx)); HOGConfInvoker(this, img, hitThreshold, Size(8, 8),
&locations, &allCandidates, &mtx));
foundLocations.resize(allCandidates.size()); foundLocations.resize(allCandidates.size());
std::copy(allCandidates.begin(), allCandidates.end(), foundLocations.begin()); std::copy(allCandidates.begin(), allCandidates.end(), foundLocations.begin());
...@@ -2635,3 +2879,4 @@ void HOGDescriptor::readALTModel(std::string modelfile) ...@@ -2635,3 +2879,4 @@ void HOGDescriptor::readALTModel(std::string modelfile)
} }
} }
...@@ -543,3 +543,815 @@ TEST(Objdetect_HOGDetectorReadWrite, regression) ...@@ -543,3 +543,815 @@ TEST(Objdetect_HOGDetectorReadWrite, regression)
TEST(Objdetect_CascadeDetector, regression) { CV_CascadeDetectorTest test; test.safe_run(); } TEST(Objdetect_CascadeDetector, regression) { CV_CascadeDetectorTest test; test.safe_run(); }
TEST(Objdetect_HOGDetector, regression) { CV_HOGDetectorTest test; test.safe_run(); } TEST(Objdetect_HOGDetector, regression) { CV_HOGDetectorTest test; test.safe_run(); }
//----------------------------------------------- HOG SSE2 compatible test -----------------------------------
class HOGDescriptorTester :
public cv::HOGDescriptor
{
HOGDescriptor* actual_hog;
cvtest::TS* ts;
mutable bool failed;
public:
HOGDescriptorTester(HOGDescriptor& instance) :
cv::HOGDescriptor(instance), actual_hog(&instance),
ts(cvtest::TS::ptr()), failed(false)
{ }
virtual void computeGradient(const Mat& img, Mat& grad, Mat& qangle,
Size paddingTL, Size paddingBR) const;
virtual void detect(const Mat& img,
vector<Point>& hits, vector<double>& weights, double hitThreshold = 0.0,
Size winStride = Size(), Size padding = Size(),
const vector<Point>& locations = vector<Point>()) const;
virtual void detect(const Mat& img, vector<Point>& hits, double hitThreshold = 0.0,
Size winStride = Size(), Size padding = Size(),
const vector<Point>& locations = vector<Point>()) const;
virtual void compute(const Mat& img, vector<float>& descriptors,
Size winStride = Size(), Size padding = Size(),
const vector<Point>& locations = vector<Point>()) const;
bool is_failed() const;
};
struct HOGCacheTester
{
struct BlockData
{
BlockData() : histOfs(0), imgOffset() {}
int histOfs;
Point imgOffset;
};
struct PixData
{
size_t gradOfs, qangleOfs;
int histOfs[4];
float histWeights[4];
float gradWeight;
};
HOGCacheTester();
HOGCacheTester(const HOGDescriptorTester* descriptor,
const Mat& img, Size paddingTL, Size paddingBR,
bool useCache, Size cacheStride);
virtual ~HOGCacheTester() { }
virtual void init(const HOGDescriptorTester* descriptor,
const Mat& img, Size paddingTL, Size paddingBR,
bool useCache, Size cacheStride);
Size windowsInImage(Size imageSize, Size winStride) const;
Rect getWindow(Size imageSize, Size winStride, int idx) const;
const float* getBlock(Point pt, float* buf);
virtual void normalizeBlockHistogram(float* histogram) const;
vector<PixData> pixData;
vector<BlockData> blockData;
bool useCache;
vector<int> ymaxCached;
Size winSize, cacheStride;
Size nblocks, ncells;
int blockHistogramSize;
int count1, count2, count4;
Point imgoffset;
Mat_<float> blockCache;
Mat_<uchar> blockCacheFlags;
Mat grad, qangle;
const HOGDescriptorTester* descriptor;
};
HOGCacheTester::HOGCacheTester()
{
useCache = false;
blockHistogramSize = count1 = count2 = count4 = 0;
descriptor = 0;
}
HOGCacheTester::HOGCacheTester(const HOGDescriptorTester* _descriptor,
const Mat& _img, Size _paddingTL, Size _paddingBR,
bool _useCache, Size _cacheStride)
{
init(_descriptor, _img, _paddingTL, _paddingBR, _useCache, _cacheStride);
}
void HOGCacheTester::init(const HOGDescriptorTester* _descriptor,
const Mat& _img, Size _paddingTL, Size _paddingBR,
bool _useCache, Size _cacheStride)
{
descriptor = _descriptor;
cacheStride = _cacheStride;
useCache = _useCache;
descriptor->computeGradient(_img, grad, qangle, _paddingTL, _paddingBR);
imgoffset = _paddingTL;
winSize = descriptor->winSize;
Size blockSize = descriptor->blockSize;
Size blockStride = descriptor->blockStride;
Size cellSize = descriptor->cellSize;
int i, j, nbins = descriptor->nbins;
int rawBlockSize = blockSize.width*blockSize.height;
nblocks = Size((winSize.width - blockSize.width)/blockStride.width + 1,
(winSize.height - blockSize.height)/blockStride.height + 1);
ncells = Size(blockSize.width/cellSize.width, blockSize.height/cellSize.height);
blockHistogramSize = ncells.width*ncells.height*nbins;
if( useCache )
{
Size cacheSize((grad.cols - blockSize.width)/cacheStride.width+1,
(winSize.height/cacheStride.height)+1);
blockCache.create(cacheSize.height, cacheSize.width*blockHistogramSize);
blockCacheFlags.create(cacheSize);
size_t cacheRows = blockCache.rows;
ymaxCached.resize(cacheRows);
for(size_t ii = 0; ii < cacheRows; ii++ )
ymaxCached[ii] = -1;
}
Mat_<float> weights(blockSize);
float sigma = (float)descriptor->getWinSigma();
float scale = 1.f/(sigma*sigma*2);
for(i = 0; i < blockSize.height; i++)
for(j = 0; j < blockSize.width; j++)
{
float di = i - blockSize.height*0.5f;
float dj = j - blockSize.width*0.5f;
weights(i,j) = std::exp(-(di*di + dj*dj)*scale);
}
blockData.resize(nblocks.width*nblocks.height);
pixData.resize(rawBlockSize*3);
// Initialize 2 lookup tables, pixData & blockData.
// Here is why:
//
// The detection algorithm runs in 4 nested loops (at each pyramid layer):
// loop over the windows within the input image
// loop over the blocks within each window
// loop over the cells within each block
// loop over the pixels in each cell
//
// As each of the loops runs over a 2-dimensional array,
// we could get 8(!) nested loops in total, which is very-very slow.
//
// To speed the things up, we do the following:
// 1. loop over windows is unrolled in the HOGDescriptor::{compute|detect} methods;
// inside we compute the current search window using getWindow() method.
// Yes, it involves some overhead (function call + couple of divisions),
// but it's tiny in fact.
// 2. loop over the blocks is also unrolled. Inside we use pre-computed blockData[j]
// to set up gradient and histogram pointers.
// 3. loops over cells and pixels in each cell are merged
// (since there is no overlap between cells, each pixel in the block is processed once)
// and also unrolled. Inside we use PixData[k] to access the gradient values and
// update the histogram
//
count1 = count2 = count4 = 0;
for( j = 0; j < blockSize.width; j++ )
for( i = 0; i < blockSize.height; i++ )
{
PixData* data = 0;
float cellX = (j+0.5f)/cellSize.width - 0.5f;
float cellY = (i+0.5f)/cellSize.height - 0.5f;
int icellX0 = cvFloor(cellX);
int icellY0 = cvFloor(cellY);
int icellX1 = icellX0 + 1, icellY1 = icellY0 + 1;
cellX -= icellX0;
cellY -= icellY0;
if( (unsigned)icellX0 < (unsigned)ncells.width &&
(unsigned)icellX1 < (unsigned)ncells.width )
{
if( (unsigned)icellY0 < (unsigned)ncells.height &&
(unsigned)icellY1 < (unsigned)ncells.height )
{
data = &pixData[rawBlockSize*2 + (count4++)];
data->histOfs[0] = (icellX0*ncells.height + icellY0)*nbins;
data->histWeights[0] = (1.f - cellX)*(1.f - cellY);
data->histOfs[1] = (icellX1*ncells.height + icellY0)*nbins;
data->histWeights[1] = cellX*(1.f - cellY);
data->histOfs[2] = (icellX0*ncells.height + icellY1)*nbins;
data->histWeights[2] = (1.f - cellX)*cellY;
data->histOfs[3] = (icellX1*ncells.height + icellY1)*nbins;
data->histWeights[3] = cellX*cellY;
}
else
{
data = &pixData[rawBlockSize + (count2++)];
if( (unsigned)icellY0 < (unsigned)ncells.height )
{
icellY1 = icellY0;
cellY = 1.f - cellY;
}
data->histOfs[0] = (icellX0*ncells.height + icellY1)*nbins;
data->histWeights[0] = (1.f - cellX)*cellY;
data->histOfs[1] = (icellX1*ncells.height + icellY1)*nbins;
data->histWeights[1] = cellX*cellY;
data->histOfs[2] = data->histOfs[3] = 0;
data->histWeights[2] = data->histWeights[3] = 0;
}
}
else
{
if( (unsigned)icellX0 < (unsigned)ncells.width )
{
icellX1 = icellX0;
cellX = 1.f - cellX;
}
if( (unsigned)icellY0 < (unsigned)ncells.height &&
(unsigned)icellY1 < (unsigned)ncells.height )
{
data = &pixData[rawBlockSize + (count2++)];
data->histOfs[0] = (icellX1*ncells.height + icellY0)*nbins;
data->histWeights[0] = cellX*(1.f - cellY);
data->histOfs[1] = (icellX1*ncells.height + icellY1)*nbins;
data->histWeights[1] = cellX*cellY;
data->histOfs[2] = data->histOfs[3] = 0;
data->histWeights[2] = data->histWeights[3] = 0;
}
else
{
data = &pixData[count1++];
if( (unsigned)icellY0 < (unsigned)ncells.height )
{
icellY1 = icellY0;
cellY = 1.f - cellY;
}
data->histOfs[0] = (icellX1*ncells.height + icellY1)*nbins;
data->histWeights[0] = cellX*cellY;
data->histOfs[1] = data->histOfs[2] = data->histOfs[3] = 0;
data->histWeights[1] = data->histWeights[2] = data->histWeights[3] = 0;
}
}
data->gradOfs = (grad.cols*i + j)*2;
data->qangleOfs = (qangle.cols*i + j)*2;
data->gradWeight = weights(i,j);
}
assert( count1 + count2 + count4 == rawBlockSize );
// defragment pixData
for( j = 0; j < count2; j++ )
pixData[j + count1] = pixData[j + rawBlockSize];
for( j = 0; j < count4; j++ )
pixData[j + count1 + count2] = pixData[j + rawBlockSize*2];
count2 += count1;
count4 += count2;
// initialize blockData
for( j = 0; j < nblocks.width; j++ )
for( i = 0; i < nblocks.height; i++ )
{
BlockData& data = blockData[j*nblocks.height + i];
data.histOfs = (j*nblocks.height + i)*blockHistogramSize;
data.imgOffset = Point(j*blockStride.width,i*blockStride.height);
}
}
const float* HOGCacheTester::getBlock(Point pt, float* buf)
{
float* blockHist = buf;
assert(descriptor != 0);
Size blockSize = descriptor->blockSize;
pt += imgoffset;
CV_Assert( (unsigned)pt.x <= (unsigned)(grad.cols - blockSize.width) &&
(unsigned)pt.y <= (unsigned)(grad.rows - blockSize.height) );
if( useCache )
{
CV_Assert( pt.x % cacheStride.width == 0 &&
pt.y % cacheStride.height == 0 );
Point cacheIdx(pt.x/cacheStride.width,
(pt.y/cacheStride.height) % blockCache.rows);
if( pt.y != ymaxCached[cacheIdx.y] )
{
Mat_<uchar> cacheRow = blockCacheFlags.row(cacheIdx.y);
cacheRow = (uchar)0;
ymaxCached[cacheIdx.y] = pt.y;
}
blockHist = &blockCache[cacheIdx.y][cacheIdx.x*blockHistogramSize];
uchar& computedFlag = blockCacheFlags(cacheIdx.y, cacheIdx.x);
if( computedFlag != 0 )
return blockHist;
computedFlag = (uchar)1; // set it at once, before actual computing
}
int k, C1 = count1, C2 = count2, C4 = count4;
const float* gradPtr = (const float*)(grad.data + grad.step*pt.y) + pt.x*2;
const uchar* qanglePtr = qangle.data + qangle.step*pt.y + pt.x*2;
CV_Assert( blockHist != 0 );
for( k = 0; k < blockHistogramSize; k++ )
blockHist[k] = 0.f;
const PixData* _pixData = &pixData[0];
for( k = 0; k < C1; k++ )
{
const PixData& pk = _pixData[k];
const float* a = gradPtr + pk.gradOfs;
float w = pk.gradWeight*pk.histWeights[0];
const uchar* h = qanglePtr + pk.qangleOfs;
int h0 = h[0], h1 = h[1];
float* hist = blockHist + pk.histOfs[0];
float t0 = hist[h0] + a[0]*w;
float t1 = hist[h1] + a[1]*w;
hist[h0] = t0; hist[h1] = t1;
}
for( ; k < C2; k++ )
{
const PixData& pk = _pixData[k];
const float* a = gradPtr + pk.gradOfs;
float w, t0, t1, a0 = a[0], a1 = a[1];
const uchar* h = qanglePtr + pk.qangleOfs;
int h0 = h[0], h1 = h[1];
float* hist = blockHist + pk.histOfs[0];
w = pk.gradWeight*pk.histWeights[0];
t0 = hist[h0] + a0*w;
t1 = hist[h1] + a1*w;
hist[h0] = t0; hist[h1] = t1;
hist = blockHist + pk.histOfs[1];
w = pk.gradWeight*pk.histWeights[1];
t0 = hist[h0] + a0*w;
t1 = hist[h1] + a1*w;
hist[h0] = t0; hist[h1] = t1;
}
for( ; k < C4; k++ )
{
const PixData& pk = _pixData[k];
const float* a = gradPtr + pk.gradOfs;
float w, t0, t1, a0 = a[0], a1 = a[1];
const uchar* h = qanglePtr + pk.qangleOfs;
int h0 = h[0], h1 = h[1];
float* hist = blockHist + pk.histOfs[0];
w = pk.gradWeight*pk.histWeights[0];
t0 = hist[h0] + a0*w;
t1 = hist[h1] + a1*w;
hist[h0] = t0; hist[h1] = t1;
hist = blockHist + pk.histOfs[1];
w = pk.gradWeight*pk.histWeights[1];
t0 = hist[h0] + a0*w;
t1 = hist[h1] + a1*w;
hist[h0] = t0; hist[h1] = t1;
hist = blockHist + pk.histOfs[2];
w = pk.gradWeight*pk.histWeights[2];
t0 = hist[h0] + a0*w;
t1 = hist[h1] + a1*w;
hist[h0] = t0; hist[h1] = t1;
hist = blockHist + pk.histOfs[3];
w = pk.gradWeight*pk.histWeights[3];
t0 = hist[h0] + a0*w;
t1 = hist[h1] + a1*w;
hist[h0] = t0; hist[h1] = t1;
}
normalizeBlockHistogram(blockHist);
return blockHist;
}
void HOGCacheTester::normalizeBlockHistogram(float* _hist) const
{
float* hist = &_hist[0], partSum[4] = { 0.0f, 0.0f, 0.0f, 0.0f };
size_t i, sz = blockHistogramSize;
for (i = 0; i <= sz - 4; i += 4)
{
partSum[0] += hist[i] * hist[i];
partSum[1] += hist[i+1] * hist[i+1];
partSum[2] += hist[i+2] * hist[i+2];
partSum[3] += hist[i+3] * hist[i+3];
}
float t0 = partSum[0] + partSum[1];
float t1 = partSum[2] + partSum[3];
float sum = t0 + t1;
for( ; i < sz; i++ )
sum += hist[i]*hist[i];
float scale = 1.f/(std::sqrt(sum)+sz*0.1f), thresh = (float)descriptor->L2HysThreshold;
partSum[0] = partSum[1] = partSum[2] = partSum[3] = 0.0f;
for(i = 0; i <= sz - 4; i += 4)
{
hist[i] = std::min(hist[i]*scale, thresh);
hist[i+1] = std::min(hist[i+1]*scale, thresh);
hist[i+2] = std::min(hist[i+2]*scale, thresh);
hist[i+3] = std::min(hist[i+3]*scale, thresh);
partSum[0] += hist[i]*hist[i];
partSum[1] += hist[i+1]*hist[i+1];
partSum[2] += hist[i+2]*hist[i+2];
partSum[3] += hist[i+3]*hist[i+3];
}
t0 = partSum[0] + partSum[1];
t1 = partSum[2] + partSum[3];
sum = t0 + t1;
for( ; i < sz; i++ )
{
hist[i] = std::min(hist[i]*scale, thresh);
sum += hist[i]*hist[i];
}
scale = 1.f/(std::sqrt(sum)+1e-3f);
for( i = 0; i < sz; i++ )
hist[i] *= scale;
}
Size HOGCacheTester::windowsInImage(Size imageSize, Size winStride) const
{
return Size((imageSize.width - winSize.width)/winStride.width + 1,
(imageSize.height - winSize.height)/winStride.height + 1);
}
Rect HOGCacheTester::getWindow(Size imageSize, Size winStride, int idx) const
{
int nwindowsX = (imageSize.width - winSize.width)/winStride.width + 1;
int y = idx / nwindowsX;
int x = idx - nwindowsX*y;
return Rect( x*winStride.width, y*winStride.height, winSize.width, winSize.height );
}
inline bool HOGDescriptorTester::is_failed() const
{
return failed;
}
void HOGDescriptorTester::detect(const Mat& img,
vector<Point>& hits, vector<double>& weights, double hitThreshold,
Size winStride, Size padding, const vector<Point>& locations) const
{
if (failed)
return;
hits.clear();
if( svmDetector.empty() )
return;
if( winStride == Size() )
winStride = cellSize;
Size cacheStride(gcd(winStride.width, blockStride.width),
gcd(winStride.height, blockStride.height));
size_t nwindows = locations.size();
padding.width = (int)alignSize(std::max(padding.width, 0), cacheStride.width);
padding.height = (int)alignSize(std::max(padding.height, 0), cacheStride.height);
Size paddedImgSize(img.cols + padding.width*2, img.rows + padding.height*2);
HOGCacheTester cache(this, img, padding, padding, nwindows == 0, cacheStride);
if( !nwindows )
nwindows = cache.windowsInImage(paddedImgSize, winStride).area();
const HOGCacheTester::BlockData* blockData = &cache.blockData[0];
int nblocks = cache.nblocks.area();
int blockHistogramSize = cache.blockHistogramSize;
size_t dsize = getDescriptorSize();
double rho = svmDetector.size() > dsize ? svmDetector[dsize] : 0;
vector<float> blockHist(blockHistogramSize);
for( size_t i = 0; i < nwindows; i++ )
{
Point pt0;
if( !locations.empty() )
{
pt0 = locations[i];
if( pt0.x < -padding.width || pt0.x > img.cols + padding.width - winSize.width ||
pt0.y < -padding.height || pt0.y > img.rows + padding.height - winSize.height )
continue;
}
else
{
pt0 = cache.getWindow(paddedImgSize, winStride, (int)i).tl() - Point(padding);
CV_Assert(pt0.x % cacheStride.width == 0 && pt0.y % cacheStride.height == 0);
}
double s = rho;
const float* svmVec = &svmDetector[0];
int j, k;
for( j = 0; j < nblocks; j++, svmVec += blockHistogramSize )
{
const HOGCacheTester::BlockData& bj = blockData[j];
Point pt = pt0 + bj.imgOffset;
const float* vec = cache.getBlock(pt, &blockHist[0]);
for( k = 0; k <= blockHistogramSize - 4; k += 4 )
s += vec[k]*svmVec[k] + vec[k+1]*svmVec[k+1] +
vec[k+2]*svmVec[k+2] + vec[k+3]*svmVec[k+3];
for( ; k < blockHistogramSize; k++ )
s += vec[k]*svmVec[k];
}
if( s >= hitThreshold )
{
hits.push_back(pt0);
weights.push_back(s);
}
}
// validation
std::vector<Point> actual_find_locations;
std::vector<double> actual_weights;
actual_hog->detect(img, actual_find_locations, actual_weights,
hitThreshold, winStride, padding, locations);
if (!std::equal(hits.begin(), hits.end(),
actual_find_locations.begin()))
{
ts->printf(cvtest::TS::SUMMARY, "Found locations are not equal (see detect function)\n");
ts->set_failed_test_info(cvtest::TS::FAIL_BAD_ACCURACY);
ts->set_gtest_status();
failed = true;
return;
}
const double eps = 0.0;
double diff_norm = norm(Mat(actual_weights) - Mat(weights), CV_L2);
if (diff_norm > eps)
{
ts->printf(cvtest::TS::SUMMARY, "Weights for found locations aren't equal.\n"
"Norm of the difference is %lf\n", diff_norm);
ts->printf(cvtest::TS::LOG, "Channels: %d\n", img.channels());
failed = true;
ts->set_failed_test_info(cvtest::TS::FAIL_BAD_ACCURACY);
ts->set_gtest_status();
return;
}
}
void HOGDescriptorTester::detect(const Mat& img, vector<Point>& hits, double hitThreshold,
Size winStride, Size padding, const vector<Point>& locations) const
{
vector<double> weightsV;
detect(img, hits, weightsV, hitThreshold, winStride, padding, locations);
}
void HOGDescriptorTester::compute(const Mat& img, vector<float>& descriptors,
Size winStride, Size padding, const vector<Point>& locations) const
{
if( winStride == Size() )
winStride = cellSize;
Size cacheStride(gcd(winStride.width, blockStride.width),
gcd(winStride.height, blockStride.height));
size_t nwindows = locations.size();
padding.width = (int)alignSize(std::max(padding.width, 0), cacheStride.width);
padding.height = (int)alignSize(std::max(padding.height, 0), cacheStride.height);
Size paddedImgSize(img.cols + padding.width*2, img.rows + padding.height*2);
HOGCacheTester cache(this, img, padding, padding, nwindows == 0, cacheStride);
if( !nwindows )
nwindows = cache.windowsInImage(paddedImgSize, winStride).area();
const HOGCacheTester::BlockData* blockData = &cache.blockData[0];
int nblocks = cache.nblocks.area();
int blockHistogramSize = cache.blockHistogramSize;
size_t dsize = getDescriptorSize();
descriptors.resize(dsize*nwindows);
for( size_t i = 0; i < nwindows; i++ )
{
float* descriptor = &descriptors[i*dsize];
Point pt0;
if( !locations.empty() )
{
pt0 = locations[i];
if( pt0.x < -padding.width || pt0.x > img.cols + padding.width - winSize.width ||
pt0.y < -padding.height || pt0.y > img.rows + padding.height - winSize.height )
continue;
}
else
{
pt0 = cache.getWindow(paddedImgSize, winStride, (int)i).tl() - Point(padding);
CV_Assert(pt0.x % cacheStride.width == 0 && pt0.y % cacheStride.height == 0);
}
for( int j = 0; j < nblocks; j++ )
{
const HOGCacheTester::BlockData& bj = blockData[j];
Point pt = pt0 + bj.imgOffset;
float* dst = descriptor + bj.histOfs;
const float* src = cache.getBlock(pt, dst);
if( src != dst )
for( int k = 0; k < blockHistogramSize; k++ )
dst[k] = src[k];
}
}
// validation
std::vector<float> actual_descriptors;
actual_hog->compute(img, actual_descriptors, winStride, padding, locations);
double diff_norm = cv::norm(Mat(actual_descriptors) - Mat(descriptors), CV_L2);
const double eps = 0.0;
if (diff_norm > eps)
{
ts->printf(cvtest::TS::SUMMARY, "Norm of the difference: %lf\n", diff_norm);
ts->printf(cvtest::TS::SUMMARY, "Found descriptors are not equal (see compute function)\n");
ts->set_failed_test_info(cvtest::TS::FAIL_BAD_ACCURACY);
ts->printf(cvtest::TS::LOG, "Channels: %d\n", img.channels());
ts->set_gtest_status();
failed = true;
return;
}
}
void HOGDescriptorTester::computeGradient(const Mat& img, Mat& grad, Mat& qangle,
Size paddingTL, Size paddingBR) const
{
CV_Assert( img.type() == CV_8U || img.type() == CV_8UC3 );
Size gradsize(img.cols + paddingTL.width + paddingBR.width,
img.rows + paddingTL.height + paddingBR.height);
grad.create(gradsize, CV_32FC2); // <magnitude*(1-alpha), magnitude*alpha>
qangle.create(gradsize, CV_8UC2); // [0..nbins-1] - quantized gradient orientation
Size wholeSize;
Point roiofs;
img.locateROI(wholeSize, roiofs);
int i, x, y;
int cn = img.channels();
Mat_<float> _lut(1, 256);
const float* lut = &_lut(0,0);
if( gammaCorrection )
for( i = 0; i < 256; i++ )
_lut(0,i) = std::sqrt((float)i);
else
for( i = 0; i < 256; i++ )
_lut(0,i) = (float)i;
AutoBuffer<int> mapbuf(gradsize.width + gradsize.height + 4);
int* xmap = (int*)mapbuf + 1;
int* ymap = xmap + gradsize.width + 2;
const int borderType = (int)BORDER_REFLECT_101;
for( x = -1; x < gradsize.width + 1; x++ )
xmap[x] = borderInterpolate(x - paddingTL.width + roiofs.x,
wholeSize.width, borderType) - roiofs.x;
for( y = -1; y < gradsize.height + 1; y++ )
ymap[y] = borderInterpolate(y - paddingTL.height + roiofs.y,
wholeSize.height, borderType) - roiofs.y;
// x- & y- derivatives for the whole row
int width = gradsize.width;
AutoBuffer<float> _dbuf(width*4);
float* dbuf = _dbuf;
Mat Dx(1, width, CV_32F, dbuf);
Mat Dy(1, width, CV_32F, dbuf + width);
Mat Mag(1, width, CV_32F, dbuf + width*2);
Mat Angle(1, width, CV_32F, dbuf + width*3);
int _nbins = nbins;
float angleScale = (float)(_nbins/CV_PI);
for( y = 0; y < gradsize.height; y++ )
{
const uchar* imgPtr = img.data + img.step*ymap[y];
const uchar* prevPtr = img.data + img.step*ymap[y-1];
const uchar* nextPtr = img.data + img.step*ymap[y+1];
float* gradPtr = (float*)grad.ptr(y);
uchar* qanglePtr = (uchar*)qangle.ptr(y);
if( cn == 1 )
{
for( x = 0; x < width; x++ )
{
int x1 = xmap[x];
dbuf[x] = (float)(lut[imgPtr[xmap[x+1]]] - lut[imgPtr[xmap[x-1]]]);
dbuf[width + x] = (float)(lut[nextPtr[x1]] - lut[prevPtr[x1]]);
}
}
else
{
for( x = 0; x < width; x++ )
{
int x1 = xmap[x]*3;
float dx0, dy0, dx, dy, mag0, mag;
const uchar* p2 = imgPtr + xmap[x+1]*3;
const uchar* p0 = imgPtr + xmap[x-1]*3;
dx0 = lut[p2[2]] - lut[p0[2]];
dy0 = lut[nextPtr[x1+2]] - lut[prevPtr[x1+2]];
mag0 = dx0*dx0 + dy0*dy0;
dx = lut[p2[1]] - lut[p0[1]];
dy = lut[nextPtr[x1+1]] - lut[prevPtr[x1+1]];
mag = dx*dx + dy*dy;
if( mag0 < mag )
{
dx0 = dx;
dy0 = dy;
mag0 = mag;
}
dx = lut[p2[0]] - lut[p0[0]];
dy = lut[nextPtr[x1]] - lut[prevPtr[x1]];
mag = dx*dx + dy*dy;
if( mag0 < mag )
{
dx0 = dx;
dy0 = dy;
mag0 = mag;
}
dbuf[x] = dx0;
dbuf[x+width] = dy0;
}
}
cartToPolar( Dx, Dy, Mag, Angle, false );
for( x = 0; x < width; x++ )
{
float mag = dbuf[x+width*2], angle = dbuf[x+width*3]*angleScale - 0.5f;
int hidx = cvFloor(angle);
angle -= hidx;
gradPtr[x*2] = mag*(1.f - angle);
gradPtr[x*2+1] = mag*angle;
if( hidx < 0 )
hidx += _nbins;
else if( hidx >= _nbins )
hidx -= _nbins;
assert( (unsigned)hidx < (unsigned)_nbins );
qanglePtr[x*2] = (uchar)hidx;
hidx++;
hidx &= hidx < _nbins ? -1 : 0;
qanglePtr[x*2+1] = (uchar)hidx;
}
}
// validation
Mat actual_mats[2], reference_mats[2] = { grad, qangle };
const char* args[] = { "Gradient's", "Qangles's" };
actual_hog->computeGradient(img, actual_mats[0], actual_mats[1], paddingTL, paddingBR);
const double eps = 0.0;
for (i = 0; i < 2; ++i)
{
double diff_norm = norm(reference_mats[i] - actual_mats[i], CV_L2);
if (diff_norm > eps)
{
ts->printf(cvtest::TS::LOG, "%s matrices are not equal\n"
"Norm of the difference is %lf\n", args[i], diff_norm);
ts->printf(cvtest::TS::LOG, "Channels: %d\n", img.channels());
ts->set_failed_test_info(cvtest::TS::FAIL_BAD_ACCURACY);
ts->set_gtest_status();
failed = true;
return;
}
}
}
TEST(Objdetect_HOGDetector_Strict, accuracy)
{
cvtest::TS* ts = cvtest::TS::ptr();
RNG& rng = ts->get_rng();
HOGDescriptor actual_hog;
actual_hog.setSVMDetector(HOGDescriptor::getDefaultPeopleDetector());
HOGDescriptorTester reference_hog(actual_hog);
const unsigned int test_case_count = 5;
for (unsigned int i = 0; i < test_case_count && !reference_hog.is_failed(); ++i)
{
// creating a matrix
Size ssize(rng.uniform(1, 10) * actual_hog.winSize.width,
rng.uniform(1, 10) * actual_hog.winSize.height);
int type = rng.uniform(0, 1) > 0 ? CV_8UC1 : CV_8UC3;
Mat image(ssize, type);
rng.fill(image, RNG::UNIFORM, 0, 256, true);
// checking detect
std::vector<Point> hits;
std::vector<double> weights;
reference_hog.detect(image, hits, weights);
// checking compute
std::vector<float> descriptors;
reference_hog.compute(image, descriptors);
}
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment