Commit a71ff501 authored by Chip Kerchner's avatar Chip Kerchner Committed by Alexander Alekhin

Merge pull request #15623 from ChipKerchner:optimizeHOGpipeline

* Use circular lut hustory buffer in computeGradient of HOG

* Initialize prefetch data outside main loop.  Avoid code duplication.
parent 732657cc
...@@ -299,6 +299,11 @@ void HOGDescriptor::computeGradient(const Mat& img, Mat& grad, Mat& qangle, ...@@ -299,6 +299,11 @@ void HOGDescriptor::computeGradient(const Mat& img, Mat& grad, Mat& qangle,
Mat Dy(1, width, CV_32F, dbuf + width); Mat Dy(1, width, CV_32F, dbuf + width);
Mat Mag(1, width, CV_32F, dbuf + width*2); Mat Mag(1, width, CV_32F, dbuf + width*2);
Mat Angle(1, width, CV_32F, dbuf + width*3); Mat Angle(1, width, CV_32F, dbuf + width*3);
#if CV_SIMD128
int widthP2 = width+2;
AutoBuffer<float> _lutBuf(9*widthP2);
float* const lutBuf = _lutBuf.data();
#endif
if (cn == 3) if (cn == 3)
{ {
...@@ -317,6 +322,63 @@ void HOGDescriptor::computeGradient(const Mat& img, Mat& grad, Mat& qangle, ...@@ -317,6 +322,63 @@ void HOGDescriptor::computeGradient(const Mat& img, Mat& grad, Mat& qangle,
xmap += 1; xmap += 1;
} }
#if CV_SIMD128
typedef const uchar* const T;
float *lutPrev, *lutCurr, *lutNext;
{
y = 0;
const uchar* imgPtr = img.ptr(ymap[y]);
const uchar* prevPtr = img.data + img.step*ymap[y-1];
lutPrev = lutBuf+widthP2*0;
lutCurr = lutBuf+widthP2*3;
{
int x0 = xmap[-1], x1 = xmap[0];
T p02 = imgPtr + x0, p12 = imgPtr + x1;
lutPrev[0+widthP2*0] = lut[prevPtr[x0+0]];
lutPrev[0+widthP2*1] = lut[prevPtr[x0+1]];
lutPrev[0+widthP2*2] = lut[prevPtr[x0+2]];
lutCurr[0+widthP2*0] = lut[p02[0]]; lutCurr[1+widthP2*0] = lut[p12[0]];
lutCurr[0+widthP2*1] = lut[p02[1]]; lutCurr[1+widthP2*1] = lut[p12[1]];
lutCurr[0+widthP2*2] = lut[p02[2]]; lutCurr[1+widthP2*2] = lut[p12[2]];
}
for( x = 0; x <= width - 4; x += 4 )
{
int x0 = xmap[x], x1 = xmap[x+1], x2 = xmap[x+2], x3 = xmap[x+3];
T p02 = imgPtr + xmap[x+1];
T p12 = imgPtr + xmap[x+2];
T p22 = imgPtr + xmap[x+3];
T p32 = imgPtr + xmap[x+4];
v_float32x4 _dx00 = v_float32x4(lut[p02[0]], lut[p12[0]], lut[p22[0]], lut[p32[0]]);
v_float32x4 _dx10 = v_float32x4(lut[p02[1]], lut[p12[1]], lut[p22[1]], lut[p32[1]]);
v_float32x4 _dx20 = v_float32x4(lut[p02[2]], lut[p12[2]], lut[p22[2]], lut[p32[2]]);
v_store(lutCurr+x+widthP2*0+2, _dx00);
v_store(lutCurr+x+widthP2*1+2, _dx10);
v_store(lutCurr+x+widthP2*2+2, _dx20);
v_float32x4 _dy00 = v_float32x4(lut[prevPtr[x0+0]], lut[prevPtr[x1+0]], lut[prevPtr[x2+0]], lut[prevPtr[x3+0]]);
v_float32x4 _dy10 = v_float32x4(lut[prevPtr[x0+1]], lut[prevPtr[x1+1]], lut[prevPtr[x2+1]], lut[prevPtr[x3+1]]);
v_float32x4 _dy20 = v_float32x4(lut[prevPtr[x0+2]], lut[prevPtr[x1+2]], lut[prevPtr[x2+2]], lut[prevPtr[x3+2]]);
v_store(lutPrev+x+widthP2*0+1, _dy00);
v_store(lutPrev+x+widthP2*1+1, _dy10);
v_store(lutPrev+x+widthP2*2+1, _dy20);
}
{
int x0 = xmap[x];
lutPrev[x+widthP2*0+1] = lut[prevPtr[x0+0]];
lutPrev[x+widthP2*1+1] = lut[prevPtr[x0+1]];
lutPrev[x+widthP2*2+1] = lut[prevPtr[x0+2]];
}
}
#endif
float angleScale = signedGradient ? (float)(nbins/(2.0*CV_PI)) : (float)(nbins/CV_PI); float angleScale = signedGradient ? (float)(nbins/(2.0*CV_PI)) : (float)(nbins/CV_PI);
for( y = 0; y < gradsize.height; y++ ) for( y = 0; y < gradsize.height; y++ )
{ {
...@@ -342,28 +404,57 @@ void HOGDescriptor::computeGradient(const Mat& img, Mat& grad, Mat& qangle, ...@@ -342,28 +404,57 @@ void HOGDescriptor::computeGradient(const Mat& img, Mat& grad, Mat& qangle,
{ {
x = 0; x = 0;
#if CV_SIMD128 #if CV_SIMD128
int yMod = y%3;
// Circular lut history buffer
if (yMod == 0)
{
lutPrev = lutBuf+widthP2*0;
lutCurr = lutBuf+widthP2*3;
lutNext = lutBuf+widthP2*6;
}
else if (yMod == 1)
{
lutPrev = lutBuf+widthP2*3;
lutCurr = lutBuf+widthP2*6;
lutNext = lutBuf+widthP2*0;
}
else
{
lutPrev = lutBuf+widthP2*6;
lutCurr = lutBuf+widthP2*0;
lutNext = lutBuf+widthP2*3;
}
{
int x0 = xmap[-1];
lutNext[0+widthP2*0] = lut[nextPtr[x0+0]];
lutNext[0+widthP2*1] = lut[nextPtr[x0+1]];
lutNext[0+widthP2*2] = lut[nextPtr[x0+2]];
}
for( ; x <= width - 4; x += 4 ) for( ; x <= width - 4; x += 4 )
{ {
int x0 = xmap[x], x1 = xmap[x+1], x2 = xmap[x+2], x3 = xmap[x+3]; int x0 = xmap[x], x1 = xmap[x+1], x2 = xmap[x+2], x3 = xmap[x+3];
typedef const uchar* const T;
T p02 = imgPtr + xmap[x+1], p00 = imgPtr + xmap[x-1]; v_float32x4 _dx0 = v_load(lutCurr+x+widthP2*0+2) - v_load(lutCurr+x+widthP2*0);
T p12 = imgPtr + xmap[x+2], p10 = imgPtr + xmap[x]; v_float32x4 _dx1 = v_load(lutCurr+x+widthP2*1+2) - v_load(lutCurr+x+widthP2*1);
T p22 = imgPtr + xmap[x+3], p20 = p02; v_float32x4 _dx2 = v_load(lutCurr+x+widthP2*2+2) - v_load(lutCurr+x+widthP2*2);
T p32 = imgPtr + xmap[x+4], p30 = p12;
v_float32x4 _dy00 = v_float32x4(lut[nextPtr[x0+0]], lut[nextPtr[x1+0]], lut[nextPtr[x2+0]], lut[nextPtr[x3+0]]);
v_float32x4 _dx0 = v_float32x4(lut[p02[0]], lut[p12[0]], lut[p22[0]], lut[p32[0]]) - v_float32x4 _dy0 = _dy00 - v_load(lutPrev+x+widthP2*0+1);
v_float32x4(lut[p00[0]], lut[p10[0]], lut[p20[0]], lut[p30[0]]);
v_float32x4 _dx1 = v_float32x4(lut[p02[1]], lut[p12[1]], lut[p22[1]], lut[p32[1]]) - v_store(lutNext+x+widthP2*0+1, _dy00);
v_float32x4(lut[p00[1]], lut[p10[1]], lut[p20[1]], lut[p30[1]]);
v_float32x4 _dx2 = v_float32x4(lut[p02[2]], lut[p12[2]], lut[p22[2]], lut[p32[2]]) - v_float32x4 _dy10 = v_float32x4(lut[nextPtr[x0+1]], lut[nextPtr[x1+1]], lut[nextPtr[x2+1]], lut[nextPtr[x3+1]]);
v_float32x4(lut[p00[2]], lut[p10[2]], lut[p20[2]], lut[p30[2]]); v_float32x4 _dy1 = _dy10 - v_load(lutPrev+x+widthP2*1+1);
v_float32x4 _dy0 = v_float32x4(lut[nextPtr[x0]], lut[nextPtr[x1]], lut[nextPtr[x2]], lut[nextPtr[x3]]) - v_store(lutNext+x+widthP2*1+1, _dy10);
v_float32x4(lut[prevPtr[x0]], lut[prevPtr[x1]], lut[prevPtr[x2]], lut[prevPtr[x3]]);
v_float32x4 _dy1 = v_float32x4(lut[nextPtr[x0+1]], lut[nextPtr[x1+1]], lut[nextPtr[x2+1]], lut[nextPtr[x3+1]]) - v_float32x4 _dy20 = v_float32x4(lut[nextPtr[x0+2]], lut[nextPtr[x1+2]], lut[nextPtr[x2+2]], lut[nextPtr[x3+2]]);
v_float32x4(lut[prevPtr[x0+1]], lut[prevPtr[x1+1]], lut[prevPtr[x2+1]], lut[prevPtr[x3+1]]); v_float32x4 _dy2 = _dy20 - v_load(lutPrev+x+widthP2*2+1);
v_float32x4 _dy2 = v_float32x4(lut[nextPtr[x0+2]], lut[nextPtr[x1+2]], lut[nextPtr[x2+2]], lut[nextPtr[x3+2]]) -
v_float32x4(lut[prevPtr[x0+2]], lut[prevPtr[x1+2]], lut[prevPtr[x2+2]], lut[prevPtr[x3+2]]); v_store(lutNext+x+widthP2*2+1, _dy20);
v_float32x4 _mag0 = (_dx0 * _dx0) + (_dy0 * _dy0); v_float32x4 _mag0 = (_dx0 * _dx0) + (_dy0 * _dy0);
v_float32x4 _mag1 = (_dx1 * _dx1) + (_dy1 * _dy1); v_float32x4 _mag1 = (_dx1 * _dx1) + (_dy1 * _dy1);
...@@ -380,6 +471,13 @@ void HOGDescriptor::computeGradient(const Mat& img, Mat& grad, Mat& qangle, ...@@ -380,6 +471,13 @@ void HOGDescriptor::computeGradient(const Mat& img, Mat& grad, Mat& qangle,
v_store(dbuf + x, _dx2); v_store(dbuf + x, _dx2);
v_store(dbuf + x + width, _dy2); v_store(dbuf + x + width, _dy2);
} }
{
int x0 = xmap[x];
lutNext[x+widthP2*0+1] = lut[nextPtr[x0+0]];
lutNext[x+widthP2*1+1] = lut[nextPtr[x0+1]];
lutNext[x+widthP2*2+1] = lut[nextPtr[x0+2]];
}
#endif #endif
for( ; x < width; x++ ) for( ; x < width; x++ )
{ {
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment