Commit bf96d823 authored by Maksim Shabunin's avatar Maksim Shabunin

Use BufferArea in more places

parent f9bd0257
...@@ -55,7 +55,7 @@ ...@@ -55,7 +55,7 @@
#include <math.h> #include <math.h>
#include <vector> #include <vector>
#include "rho.h" #include "rho.h"
#include "opencv2/core/utils/buffer_area.private.hpp"
...@@ -65,7 +65,6 @@ namespace cv{/* For C support, replace with extern "C" { */ ...@@ -65,7 +65,6 @@ namespace cv{/* For C support, replace with extern "C" { */
/* Constants */ /* Constants */
const int MEM_ALIGN = 32;
const size_t HSIZE = (3*3*sizeof(float)); const size_t HSIZE = (3*3*sizeof(float));
const double MIN_DELTA_CHNG = 0.1; const double MIN_DELTA_CHNG = 0.1;
// const double CHI_STAT = 2.706; // const double CHI_STAT = 2.706;
...@@ -312,16 +311,14 @@ struct RHO_HEST_REFC : RHO_HEST{ ...@@ -312,16 +311,14 @@ struct RHO_HEST_REFC : RHO_HEST{
/* Levenberg-Marquardt Refinement */ /* Levenberg-Marquardt Refinement */
struct{ struct{
float (* JtJ)[8]; /* JtJ matrix */ float* JtJ; /* JtJ matrix */
float (* tmp1)[8]; /* Temporary 1 */ float* tmp1; /* Temporary 1 */
float* Jte; /* Jte vector */ float* Jte; /* Jte vector */
} lm; } lm;
/* Memory Management */ /* Memory Management */
struct{ utils::BufferArea runArea;
cv::Mat perObj; utils::BufferArea objArea;
cv::Mat perRun;
} mem;
/* Initialized? */ /* Initialized? */
int initialized; int initialized;
...@@ -659,16 +656,9 @@ inline int RHO_HEST_REFC::initialize(void){ ...@@ -659,16 +656,9 @@ inline int RHO_HEST_REFC::initialize(void){
fastSeed((uint64_t)~0); fastSeed((uint64_t)~0);
int areAllAllocsSuccessful = !mem.perObj.empty();
if(!areAllAllocsSuccessful){
finalize();
}else{
initialized = 1; initialized = 1;
}
return areAllAllocsSuccessful; return true;
} }
/** /**
...@@ -835,45 +825,14 @@ unsigned RHO_HEST_REFC::rhoHest(const float* src, /* Source points */ ...@@ -835,45 +825,14 @@ unsigned RHO_HEST_REFC::rhoHest(const float* src, /* Source points */
*/ */
inline void RHO_HEST_REFC::allocatePerObj(void){ inline void RHO_HEST_REFC::allocatePerObj(void){
/* We have known sizes */ objArea.allocate(ctrl.smpl, SMPL_SIZE);
size_t ctrl_smpl_sz = SMPL_SIZE*sizeof(*ctrl.smpl); objArea.allocate(curr.pkdPts, SMPL_SIZE*2*2);
size_t curr_pkdPts_sz = SMPL_SIZE*2*2*sizeof(*curr.pkdPts); objArea.allocate(curr.H, HSIZE);
size_t curr_H_sz = HSIZE; objArea.allocate(best.H, HSIZE);
size_t best_H_sz = HSIZE; objArea.allocate(lm.JtJ, 8*8);
size_t lm_JtJ_sz = 8*8*sizeof(float); objArea.allocate(lm.tmp1, 8*8);
size_t lm_tmp1_sz = 8*8*sizeof(float); objArea.allocate(lm.Jte, 1*8);
size_t lm_Jte_sz = 1*8*sizeof(float); objArea.commit();
/* We compute offsets */
size_t total = 0;
#define MK_OFFSET(v) \
size_t v ## _of = total; \
total = alignSize(v ## _of + v ## _sz, MEM_ALIGN)
MK_OFFSET(ctrl_smpl);
MK_OFFSET(curr_pkdPts);
MK_OFFSET(curr_H);
MK_OFFSET(best_H);
MK_OFFSET(lm_JtJ);
MK_OFFSET(lm_tmp1);
MK_OFFSET(lm_Jte);
#undef MK_OFFSET
/* Allocate dynamic memory managed by cv::Mat */
mem.perObj.create(1, (int)(total + MEM_ALIGN), CV_8UC1);
/* Extract aligned pointer */
unsigned char* ptr = alignPtr(mem.perObj.data, MEM_ALIGN);
/* Assign pointers */
ctrl.smpl = (unsigned*) (ptr + ctrl_smpl_of);
curr.pkdPts = (float*) (ptr + curr_pkdPts_of);
curr.H = (float*) (ptr + curr_H_of);
best.H = (float*) (ptr + best_H_of);
lm.JtJ = (float(*)[8])(ptr + lm_JtJ_of);
lm.tmp1 = (float(*)[8])(ptr + lm_tmp1_of);
lm.Jte = (float*) (ptr + lm_Jte_of);
} }
...@@ -885,30 +844,9 @@ inline void RHO_HEST_REFC::allocatePerObj(void){ ...@@ -885,30 +844,9 @@ inline void RHO_HEST_REFC::allocatePerObj(void){
*/ */
inline void RHO_HEST_REFC::allocatePerRun(void){ inline void RHO_HEST_REFC::allocatePerRun(void){
/* We have known sizes */ runArea.allocate(best.inl, arg.N);
size_t best_inl_sz = arg.N; runArea.allocate(curr.inl, arg.N);
size_t curr_inl_sz = arg.N; runArea.commit();
/* We compute offsets */
size_t total = 0;
#define MK_OFFSET(v) \
size_t v ## _of = total; \
total = alignSize(v ## _of + v ## _sz, MEM_ALIGN)
MK_OFFSET(best_inl);
MK_OFFSET(curr_inl);
#undef MK_OFFSET
/* Allocate dynamic memory managed by cv::Mat */
mem.perRun.create(1, (int)(total + MEM_ALIGN), CV_8UC1);
/* Extract aligned pointer */
unsigned char* ptr = alignPtr(mem.perRun.data, MEM_ALIGN);
/* Assign pointers */
best.inl = (char*)(ptr + best_inl_of);
curr.inl = (char*)(ptr + curr_inl_of);
} }
...@@ -919,10 +857,7 @@ inline void RHO_HEST_REFC::allocatePerRun(void){ ...@@ -919,10 +857,7 @@ inline void RHO_HEST_REFC::allocatePerRun(void){
*/ */
inline void RHO_HEST_REFC::deallocatePerRun(void){ inline void RHO_HEST_REFC::deallocatePerRun(void){
best.inl = NULL; runArea.release();
curr.inl = NULL;
mem.perRun.release();
} }
...@@ -933,15 +868,7 @@ inline void RHO_HEST_REFC::deallocatePerRun(void){ ...@@ -933,15 +868,7 @@ inline void RHO_HEST_REFC::deallocatePerRun(void){
*/ */
inline void RHO_HEST_REFC::deallocatePerObj(void){ inline void RHO_HEST_REFC::deallocatePerObj(void){
ctrl.smpl = NULL; objArea.release();
curr.pkdPts = NULL;
curr.H = NULL;
best.H = NULL;
lm.JtJ = NULL;
lm.tmp1 = NULL;
lm.Jte = NULL;
mem.perObj.release();
} }
...@@ -2144,7 +2071,7 @@ inline void RHO_HEST_REFC::refine(void){ ...@@ -2144,7 +2071,7 @@ inline void RHO_HEST_REFC::refine(void){
*/ */
/* Find initial conditions */ /* Find initial conditions */
sacCalcJacobianErrors(best.H, arg.src, arg.dst, best.inl, arg.N, sacCalcJacobianErrors(best.H, arg.src, arg.dst, best.inl, arg.N,
lm.JtJ, lm.Jte, &S); (float(*)[8])lm.JtJ, lm.Jte, &S);
/*Levenberg-Marquardt Loop.*/ /*Levenberg-Marquardt Loop.*/
for(i=0;i<MAXLEVMARQITERS;i++){ for(i=0;i<MAXLEVMARQITERS;i++){
...@@ -2169,11 +2096,11 @@ inline void RHO_HEST_REFC::refine(void){ ...@@ -2169,11 +2096,11 @@ inline void RHO_HEST_REFC::refine(void){
* transpose) then multiply Jte in order to find dH. * transpose) then multiply Jte in order to find dH.
*/ */
while(!sacChol8x8Damped(lm.JtJ, L, lm.tmp1)){ while(!sacChol8x8Damped((float(*)[8])lm.JtJ, L, (float(*)[8])lm.tmp1)){
L *= 2.0f; L *= 2.0f;
} }
sacTRInv8x8 (lm.tmp1, lm.tmp1); sacTRInv8x8 ((float(*)[8])lm.tmp1, (float(*)[8])lm.tmp1);
sacTRISolve8x8(lm.tmp1, lm.Jte, dH); sacTRISolve8x8((float(*)[8])lm.tmp1, lm.Jte, dH);
sacSub8x1 (newH, best.H, dH); sacSub8x1 (newH, best.H, dH);
sacCalcJacobianErrors(newH, arg.src, arg.dst, best.inl, arg.N, sacCalcJacobianErrors(newH, arg.src, arg.dst, best.inl, arg.N,
NULL, NULL, &newS); NULL, NULL, &newS);
...@@ -2204,7 +2131,7 @@ inline void RHO_HEST_REFC::refine(void){ ...@@ -2204,7 +2131,7 @@ inline void RHO_HEST_REFC::refine(void){
S = newS; S = newS;
memcpy(best.H, newH, sizeof(newH)); memcpy(best.H, newH, sizeof(newH));
sacCalcJacobianErrors(best.H, arg.src, arg.dst, best.inl, arg.N, sacCalcJacobianErrors(best.H, arg.src, arg.dst, best.inl, arg.N,
lm.JtJ, lm.Jte, &S); (float(*)[8])lm.JtJ, lm.Jte, &S);
} }
} }
} }
......
...@@ -53,6 +53,7 @@ ...@@ -53,6 +53,7 @@
#include "precomp.hpp" #include "precomp.hpp"
#include <limits.h> #include <limits.h>
#include "opencv2/core/hal/intrin.hpp" #include "opencv2/core/hal/intrin.hpp"
#include "opencv2/core/utils/buffer_area.private.hpp"
namespace cv namespace cv
{ {
...@@ -99,6 +100,16 @@ struct StereoSGBMParams ...@@ -99,6 +100,16 @@ struct StereoSGBMParams
mode = _mode; mode = _mode;
} }
inline bool isFullDP() const
{
return mode == StereoSGBM::MODE_HH || mode == StereoSGBM::MODE_HH4;
}
inline Size calcSADWindowSize() const
{
const int dim = SADWindowSize > 0 ? SADWindowSize : 5;
return Size(dim, dim);
}
int minDisparity; int minDisparity;
int numDisparities; int numDisparities;
int SADWindowSize; int SADWindowSize;
...@@ -148,6 +159,7 @@ static inline void min_pos(const v_int16& val, const v_int16& pos, short &min_va ...@@ -148,6 +159,7 @@ static inline void min_pos(const v_int16& val, const v_int16& pos, short &min_va
#endif #endif
static const int DEFAULT_RIGHT_BORDER = -1; static const int DEFAULT_RIGHT_BORDER = -1;
/* /*
For each pixel row1[x], max(maxD, 0) <= minX <= x < maxX <= width - max(0, -minD), For each pixel row1[x], max(maxD, 0) <= minX <= x < maxX <= width - max(0, -minD),
and for each disparity minD<=d<maxD the function and for each disparity minD<=d<maxD the function
...@@ -161,7 +173,7 @@ static const int DEFAULT_RIGHT_BORDER = -1; ...@@ -161,7 +173,7 @@ static const int DEFAULT_RIGHT_BORDER = -1;
static void calcPixelCostBT( const Mat& img1, const Mat& img2, int y, static void calcPixelCostBT( const Mat& img1, const Mat& img2, int y,
int minD, int maxD, CostType* cost, int minD, int maxD, CostType* cost,
PixType* buffer, const PixType* tab, PixType* buffer, const PixType* tab,
int tabOfs, int , int xrange_min = 0, int xrange_max = DEFAULT_RIGHT_BORDER ) int xrange_min = 0, int xrange_max = DEFAULT_RIGHT_BORDER )
{ {
int x, c, width = img1.cols, cn = img1.channels(); int x, c, width = img1.cols, cn = img1.channels();
int minX1 = std::max(maxD, 0), maxX1 = width + std::min(minD, 0); int minX1 = std::max(maxD, 0), maxX1 = width + std::min(minD, 0);
...@@ -178,8 +190,6 @@ static void calcPixelCostBT( const Mat& img1, const Mat& img2, int y, ...@@ -178,8 +190,6 @@ static void calcPixelCostBT( const Mat& img1, const Mat& img2, int y,
const PixType *row1 = img1.ptr<PixType>(y), *row2 = img2.ptr<PixType>(y); const PixType *row1 = img1.ptr<PixType>(y), *row2 = img2.ptr<PixType>(y);
PixType *prow1 = buffer + width2*2, *prow2 = prow1 + width*cn*2; PixType *prow1 = buffer + width2*2, *prow2 = prow1 + width*cn*2;
tab += tabOfs;
for( c = 0; c < cn*2; c++ ) for( c = 0; c < cn*2; c++ )
{ {
prow1[width*c] = prow1[width*c + width-1] = prow1[width*c] = prow1[width*c + width-1] =
...@@ -297,6 +307,166 @@ static void calcPixelCostBT( const Mat& img1, const Mat& img2, int y, ...@@ -297,6 +307,166 @@ static void calcPixelCostBT( const Mat& img1, const Mat& img2, int y,
} }
class BufferSGBM
{
private:
size_t width1;
size_t Da;
size_t Dlra;
size_t costWidth;
size_t costHeight;
size_t hsumRows;
bool fullDP;
uchar dirs;
uchar dirs2;
static const size_t TAB_OFS = 256*4;
public:
CostType* Cbuf;
CostType* Sbuf;
CostType* hsumBuf;
CostType* pixDiff;
CostType* disp2cost;
DispType* disp2ptr;
PixType* tempBuf;
std::vector<CostType*> Lr;
std::vector<CostType*> minLr;
PixType * clipTab;
private:
utils::BufferArea area;
public:
BufferSGBM(size_t width1_,
size_t Da_,
size_t Dlra_,
size_t cn,
size_t width,
size_t height,
const StereoSGBMParams &params)
: width1(width1_),
Da(Da_),
Dlra(Dlra_),
Cbuf(NULL),
Sbuf(NULL),
hsumBuf(NULL),
pixDiff(NULL),
disp2cost(NULL),
disp2ptr(NULL),
tempBuf(NULL),
Lr(2, (CostType*)NULL),
minLr(2, (CostType*)NULL),
clipTab(NULL)
{
const size_t TAB_SIZE = 256 + TAB_OFS*2;
fullDP = params.isFullDP();
costWidth = width1 * Da;
costHeight = fullDP ? height : 1;
hsumRows = params.calcSADWindowSize().height + 2;
dirs = params.mode == StereoSGBM::MODE_HH4 ? 1 : NR;
dirs2 = params.mode == StereoSGBM::MODE_HH4 ? 1 : NR2;
// for each possible stereo match (img1(x,y) <=> img2(x-d,y))
// we keep pixel difference cost (C) and the summary cost over NR directions (S).
// we also keep all the partial costs for the previous line L_r(x,d) and also min_k L_r(x, k)
area.allocate(Cbuf, costWidth * costHeight, CV_SIMD_WIDTH); // summary cost over different (nDirs) directions
area.allocate(Sbuf, costWidth * costHeight, CV_SIMD_WIDTH);
area.allocate(hsumBuf, costWidth * hsumRows, CV_SIMD_WIDTH);
area.allocate(pixDiff, costWidth, CV_SIMD_WIDTH);
area.allocate(disp2cost, width, CV_SIMD_WIDTH);
area.allocate(disp2ptr, width, CV_SIMD_WIDTH);
area.allocate(tempBuf, width * (4 * cn + 2), CV_SIMD_WIDTH);
// the number of L_r(.,.) and min_k L_r(.,.) lines in the buffer:
// for 8-way dynamic programming we need the current row and
// the previous row, i.e. 2 rows in total
for (size_t i = 0; i < 2; ++i)
{
// 2D: [ NR ][ w1 * NR2 ][ NR ] * [ Dlra ]
area.allocate(Lr[i], calcLrCount() * Dlra, CV_SIMD_WIDTH);
// 1D: [ NR ][ w1 * NR2 ][ NR ]
area.allocate(minLr[i], calcLrCount(), CV_SIMD_WIDTH);
}
area.allocate(clipTab, TAB_SIZE, CV_SIMD_WIDTH);
area.commit();
// init clipTab
const int ftzero = std::max(params.preFilterCap, 15) | 1;
for(int i = 0; i < (int)TAB_SIZE; i++ )
clipTab[i] = (PixType)(std::min(std::max(i - (int)TAB_OFS, -ftzero), ftzero) + ftzero);
}
inline const PixType * getClipTab() const
{
return clipTab + TAB_OFS;
}
inline void initCBuf(CostType val) const
{
for (size_t i = 0; i < costWidth * costHeight; ++i)
Cbuf[i] = val;
}
inline void clearLr(const Range & range = Range::all()) const
{
for (uchar i = 0; i < 2; ++i)
{
if (range == Range::all())
{
memset(Lr[i], 0, calcLrCount() * Dlra * sizeof(CostType));
memset(minLr[i], 0, calcLrCount() * sizeof(CostType));
}
else
{
memset(getLr(i, range.start), 0, range.size() * sizeof(CostType) * Dlra);
memset(getMinLr(i, range.start), 0, range.size() * sizeof(CostType));
}
}
}
inline size_t calcLrCount() const
{
return width1 * dirs2 + 2 * dirs;
}
inline void swapLr()
{
std::swap(Lr[0], Lr[1]);
std::swap(minLr[0], minLr[1]);
}
inline CostType * getHSumBuf(int row) const
{
return hsumBuf + (row % hsumRows) * costWidth;
}
inline CostType * getCBuf(int row) const
{
CV_Assert(row >= 0);
return Cbuf + (!fullDP ? 0 : (row * costWidth));
}
inline CostType * getSBuf(int row) const
{
CV_Assert(row >= 0);
return Sbuf + (!fullDP ? 0 : (row * costWidth));
}
inline void clearSBuf(int row, const Range & range = Range::all()) const
{
if (range == Range::all())
memset(getSBuf(row), 0, costWidth * sizeof(CostType));
else
memset(getSBuf(row) + range.start * Da, 0, range.size() * Da * sizeof(CostType));
}
// shift Lr[k] and minLr[k] pointers, because we allocated them with the borders,
// and will occasionally use negative indices with the arrays
// we need to shift Lr[k] pointers by 1, to give the space for d=-1.
inline CostType * getLr(uchar id, int idx, uchar shift = 0) const
{
CV_Assert(id < 2);
const size_t fixed_offset = dirs * Dlra;
return Lr[id] + fixed_offset + (idx * (int)dirs2 + (int)shift) * (int)Dlra;
}
inline CostType * getMinLr(uchar id, int idx, uchar shift = 0) const
{
CV_Assert(id < 2);
const size_t fixed_offset = dirs;
return minLr[id] + fixed_offset + (idx * dirs2 + shift);
}
};
/* /*
computes disparity for "roi" in img1 w.r.t. img2 and write it to disp1buf. computes disparity for "roi" in img1 w.r.t. img2 and write it to disp1buf.
that is, disp1buf(x, y)=d means that img1(x+roi.x, y+roi.y) ~ img2(x+roi.x-d, y+roi.y). that is, disp1buf(x, y)=d means that img1(x+roi.x, y+roi.y) ~ img2(x+roi.x-d, y+roi.y).
...@@ -318,34 +488,25 @@ static void calcPixelCostBT( const Mat& img1, const Mat& img2, int y, ...@@ -318,34 +488,25 @@ static void calcPixelCostBT( const Mat& img1, const Mat& img2, int y,
It contains the minimum current cost, used to find the best disparity, corresponding to the minimal cost. It contains the minimum current cost, used to find the best disparity, corresponding to the minimal cost.
*/ */
static void computeDisparitySGBM( const Mat& img1, const Mat& img2, static void computeDisparitySGBM( const Mat& img1, const Mat& img2,
Mat& disp1, const StereoSGBMParams& params, Mat& disp1, const StereoSGBMParams& params )
Mat& buffer )
{ {
const int DISP_SHIFT = StereoMatcher::DISP_SHIFT; const int DISP_SHIFT = StereoMatcher::DISP_SHIFT;
const int DISP_SCALE = (1 << DISP_SHIFT); const int DISP_SCALE = (1 << DISP_SHIFT);
const CostType MAX_COST = SHRT_MAX; const CostType MAX_COST = SHRT_MAX;
int minD = params.minDisparity, maxD = minD + params.numDisparities; int minD = params.minDisparity, maxD = minD + params.numDisparities;
Size SADWindowSize;
SADWindowSize.width = SADWindowSize.height = params.SADWindowSize > 0 ? params.SADWindowSize : 5;
int ftzero = std::max(params.preFilterCap, 15) | 1;
int uniquenessRatio = params.uniquenessRatio >= 0 ? params.uniquenessRatio : 10; int uniquenessRatio = params.uniquenessRatio >= 0 ? params.uniquenessRatio : 10;
int disp12MaxDiff = params.disp12MaxDiff > 0 ? params.disp12MaxDiff : 1; int disp12MaxDiff = params.disp12MaxDiff > 0 ? params.disp12MaxDiff : 1;
int P1 = params.P1 > 0 ? params.P1 : 2, P2 = std::max(params.P2 > 0 ? params.P2 : 5, P1+1); int P1 = params.P1 > 0 ? params.P1 : 2, P2 = std::max(params.P2 > 0 ? params.P2 : 5, P1+1);
int k, width = disp1.cols, height = disp1.rows; int k, width = disp1.cols, height = disp1.rows;
int minX1 = std::max(maxD, 0), maxX1 = width + std::min(minD, 0); int minX1 = std::max(maxD, 0), maxX1 = width + std::min(minD, 0);
int D = maxD - minD, width1 = maxX1 - minX1; const int D = params.numDisparities;
int width1 = maxX1 - minX1;
int Da = (int)alignSize(D, v_int16::nlanes); int Da = (int)alignSize(D, v_int16::nlanes);
int Dlra = Da + v_int16::nlanes;//Additional memory is necessary to store disparity values(MAX_COST) for d=-1 and d=D int Dlra = Da + v_int16::nlanes;//Additional memory is necessary to store disparity values(MAX_COST) for d=-1 and d=D
int INVALID_DISP = minD - 1, INVALID_DISP_SCALED = INVALID_DISP*DISP_SCALE; int INVALID_DISP = minD - 1, INVALID_DISP_SCALED = INVALID_DISP*DISP_SCALE;
int SW2 = SADWindowSize.width/2, SH2 = SADWindowSize.height/2; int SW2 = params.calcSADWindowSize().width/2, SH2 = params.calcSADWindowSize().height/2;
bool fullDP = params.mode == StereoSGBM::MODE_HH; int npasses = params.isFullDP() ? 2 : 1;
int npasses = fullDP ? 2 : 1;
const int TAB_OFS = 256*4, TAB_SIZE = 256 + TAB_OFS*2;
PixType clipTab[TAB_SIZE];
for( k = 0; k < TAB_SIZE; k++ )
clipTab[k] = (PixType)(std::min(std::max(k - TAB_OFS, -ftzero), ftzero) + ftzero);
if( minX1 >= maxX1 ) if( minX1 >= maxX1 )
{ {
...@@ -353,39 +514,8 @@ static void computeDisparitySGBM( const Mat& img1, const Mat& img2, ...@@ -353,39 +514,8 @@ static void computeDisparitySGBM( const Mat& img1, const Mat& img2,
return; return;
} }
// for each possible stereo match (img1(x,y) <=> img2(x-d,y)) BufferSGBM mem(width1, Da, Dlra, img1.channels(), width, height, params);
// we keep pixel difference cost (C) and the summary cost over NR directions (S). mem.initCBuf((CostType)P2); // add P2 to every C(x,y). it saves a few operations in the inner loops
// we also keep all the partial costs for the previous line L_r(x,d) and also min_k L_r(x, k)
size_t costBufSize = width1*Da;
size_t CSBufSize = costBufSize*(fullDP ? height : 1);
size_t minLrSize = (width1 + 2)*NR2, LrSize = minLrSize*Dlra;
int hsumBufNRows = SH2*2 + 2;
// the number of L_r(.,.) and min_k L_r(.,.) lines in the buffer:
// for 8-way dynamic programming we need the current row and
// the previous row, i.e. 2 rows in total
size_t totalBufSize = CV_SIMD_WIDTH + CSBufSize * 2 * sizeof(CostType) + // alignment, C, S
costBufSize*(hsumBufNRows + 1)*sizeof(CostType) + // hsumBuf, pixdiff
((LrSize + minLrSize)*2 + v_int16::nlanes) * sizeof(CostType) + // minLr[] and Lr[]
width*(sizeof(CostType) + sizeof(DispType)) + // disp2cost + disp2
width * (4*img1.channels() + 2) * sizeof(PixType); // temp buffer for computing per-pixel cost
if( buffer.empty() || !buffer.isContinuous() ||
buffer.cols*buffer.rows*buffer.elemSize() < totalBufSize )
buffer.reserveBuffer(totalBufSize);
// summary cost over different (nDirs) directions
CostType* Cbuf = (CostType*)alignPtr(buffer.ptr(), CV_SIMD_WIDTH);
CostType* Sbuf = Cbuf + CSBufSize;
CostType* hsumBuf = Sbuf + CSBufSize;
CostType* pixDiff = hsumBuf + costBufSize*hsumBufNRows;
CostType* disp2cost = pixDiff + costBufSize + ((LrSize + minLrSize)*2 + v_int16::nlanes);
DispType* disp2ptr = (DispType*)(disp2cost + width);
PixType* tempBuf = (PixType*)(disp2ptr + width);
// add P2 to every C(x,y). it saves a few operations in the inner loops
for(k = 0; k < (int)CSBufSize; k++ )
Cbuf[k] = (CostType)P2;
for( int pass = 1; pass <= npasses; pass++ ) for( int pass = 1; pass <= npasses; pass++ )
{ {
...@@ -402,27 +532,15 @@ static void computeDisparitySGBM( const Mat& img1, const Mat& img2, ...@@ -402,27 +532,15 @@ static void computeDisparitySGBM( const Mat& img1, const Mat& img2,
x1 = width1-1; x2 = -1; dx = -1; x1 = width1-1; x2 = -1; dx = -1;
} }
CostType *Lr[2]={0}, *minLr[2]={0}; uchar lrID = 0;
mem.clearLr();
for( k = 0; k < 2; k++ )
{
// shift Lr[k] and minLr[k] pointers, because we allocated them with the borders,
// and will occasionally use negative indices with the arrays
// we need to shift Lr[k] pointers by 1, to give the space for d=-1.
// however, then the alignment will be imperfect, i.e. bad for SSE,
// thus we shift the pointers by SIMD vector size
Lr[k] = pixDiff + costBufSize + v_int16::nlanes + LrSize*k + NR2*Dlra;
memset( Lr[k] - NR2*Dlra, 0, LrSize*sizeof(CostType) );
minLr[k] = pixDiff + costBufSize + v_int16::nlanes + LrSize*2 + minLrSize*k + NR2;
memset( minLr[k] - NR2, 0, minLrSize*sizeof(CostType) );
}
for( int y = y1; y != y2; y += dy ) for( int y = y1; y != y2; y += dy )
{ {
int x, d; int x, d;
DispType* disp1ptr = disp1.ptr<DispType>(y); DispType* disp1ptr = disp1.ptr<DispType>(y);
CostType* C = Cbuf + (!fullDP ? 0 : y*costBufSize); CostType* const C = mem.getCBuf(y);
CostType* S = Sbuf + (!fullDP ? 0 : y*costBufSize); CostType* const S = mem.getSBuf(y);
if( pass == 1 ) // compute C on the first pass, and reuse it on the second pass, if any. if( pass == 1 ) // compute C on the first pass, and reuse it on the second pass, if any.
{ {
...@@ -430,35 +548,35 @@ static void computeDisparitySGBM( const Mat& img1, const Mat& img2, ...@@ -430,35 +548,35 @@ static void computeDisparitySGBM( const Mat& img1, const Mat& img2,
for( k = dy1; k <= dy2; k++ ) for( k = dy1; k <= dy2; k++ )
{ {
CostType* hsumAdd = hsumBuf + (std::min(k, height-1) % hsumBufNRows)*costBufSize; CostType* hsumAdd = mem.getHSumBuf(std::min(k, height-1));
if( k < height ) if( k < height )
{ {
calcPixelCostBT( img1, img2, k, minD, maxD, pixDiff, tempBuf, clipTab, TAB_OFS, ftzero ); calcPixelCostBT( img1, img2, k, minD, maxD, mem.pixDiff, mem.tempBuf, mem.getClipTab() );
memset(hsumAdd, 0, Da*sizeof(CostType)); memset(hsumAdd, 0, Da*sizeof(CostType));
#if CV_SIMD #if CV_SIMD
v_int16 h_scale = vx_setall_s16((short)SW2 + 1); v_int16 h_scale = vx_setall_s16((short)SW2 + 1);
for( d = 0; d < Da; d += v_int16::nlanes ) for( d = 0; d < Da; d += v_int16::nlanes )
{ {
v_int16 v_hsumAdd = vx_load_aligned(pixDiff + d) * h_scale; v_int16 v_hsumAdd = vx_load_aligned(mem.pixDiff + d) * h_scale;
for( x = Da; x <= SW2*Da; x += Da ) for( x = Da; x <= SW2*Da; x += Da )
v_hsumAdd += vx_load_aligned(pixDiff + x + d); v_hsumAdd += vx_load_aligned(mem.pixDiff + x + d);
v_store_aligned(hsumAdd + d, v_hsumAdd); v_store_aligned(hsumAdd + d, v_hsumAdd);
} }
#else #else
for (d = 0; d < D; d++) for (d = 0; d < D; d++)
{ {
hsumAdd[d] = (CostType)(pixDiff[d] * (SW2 + 1)); hsumAdd[d] = (CostType)(mem.pixDiff[d] * (SW2 + 1));
for( x = Da; x <= SW2*Da; x += Da ) for( x = Da; x <= SW2*Da; x += Da )
hsumAdd[d] = (CostType)(hsumAdd[d] + pixDiff[x + d]); hsumAdd[d] = (CostType)(hsumAdd[d] + mem.pixDiff[x + d]);
} }
#endif #endif
if( y > 0 ) if( y > 0 )
{ {
const CostType* hsumSub = hsumBuf + (std::max(y - SH2 - 1, 0) % hsumBufNRows)*costBufSize; const CostType* hsumSub = mem.getHSumBuf(std::max(y - SH2 - 1, 0));
const CostType* Cprev = !fullDP || y == 0 ? C : C - costBufSize; const CostType* Cprev = mem.getCBuf(y - 1);
#if CV_SIMD #if CV_SIMD
for (d = 0; d < Da; d += v_int16::nlanes) for (d = 0; d < Da; d += v_int16::nlanes)
...@@ -470,8 +588,8 @@ static void computeDisparitySGBM( const Mat& img1, const Mat& img2, ...@@ -470,8 +588,8 @@ static void computeDisparitySGBM( const Mat& img1, const Mat& img2,
for( x = Da; x < width1*Da; x += Da ) for( x = Da; x < width1*Da; x += Da )
{ {
const CostType* pixAdd = pixDiff + std::min(x + SW2*Da, (width1-1)*Da); const CostType* pixAdd = mem.pixDiff + std::min(x + SW2*Da, (width1-1)*Da);
const CostType* pixSub = pixDiff + std::max(x - (SW2+1)*Da, 0); const CostType* pixSub = mem.pixDiff + std::max(x - (SW2+1)*Da, 0);
#if CV_SIMD #if CV_SIMD
for( d = 0; d < Da; d += v_int16::nlanes ) for( d = 0; d < Da; d += v_int16::nlanes )
{ {
...@@ -501,8 +619,8 @@ static void computeDisparitySGBM( const Mat& img1, const Mat& img2, ...@@ -501,8 +619,8 @@ static void computeDisparitySGBM( const Mat& img1, const Mat& img2,
#endif #endif
for( x = Da; x < width1*Da; x += Da ) for( x = Da; x < width1*Da; x += Da )
{ {
const CostType* pixAdd = pixDiff + std::min(x + SW2*Da, (width1-1)*Da); const CostType* pixAdd = mem.pixDiff + std::min(x + SW2*Da, (width1-1)*Da);
const CostType* pixSub = pixDiff + std::max(x - (SW2+1)*Da, 0); const CostType* pixSub = mem.pixDiff + std::max(x - (SW2+1)*Da, 0);
#if CV_SIMD #if CV_SIMD
for (d = 0; d < Da; d += v_int16::nlanes) for (d = 0; d < Da; d += v_int16::nlanes)
...@@ -526,8 +644,8 @@ static void computeDisparitySGBM( const Mat& img1, const Mat& img2, ...@@ -526,8 +644,8 @@ static void computeDisparitySGBM( const Mat& img1, const Mat& img2,
{ {
if( y > 0 ) if( y > 0 )
{ {
const CostType* hsumSub = hsumBuf + (std::max(y - SH2 - 1, 0) % hsumBufNRows)*costBufSize; const CostType* hsumSub = mem.getHSumBuf(std::max(y - SH2 - 1, 0));
const CostType* Cprev = !fullDP || y == 0 ? C : C - costBufSize; const CostType* Cprev = mem.getCBuf(y - 1);
#if CV_SIMD #if CV_SIMD
for (x = 0; x < width1*Da; x += v_int16::nlanes) for (x = 0; x < width1*Da; x += v_int16::nlanes)
v_store_aligned(C + x, vx_load_aligned(Cprev + x) - vx_load_aligned(hsumSub + x) + vx_load_aligned(hsumAdd + x)); v_store_aligned(C + x, vx_load_aligned(Cprev + x) - vx_load_aligned(hsumSub + x) + vx_load_aligned(hsumAdd + x));
...@@ -551,7 +669,7 @@ static void computeDisparitySGBM( const Mat& img1, const Mat& img2, ...@@ -551,7 +669,7 @@ static void computeDisparitySGBM( const Mat& img1, const Mat& img2,
} }
// also, clear the S buffer // also, clear the S buffer
memset(S, 0, width1*Da * sizeof(CostType)); mem.clearSBuf(y);
} }
/* /*
...@@ -575,24 +693,26 @@ static void computeDisparitySGBM( const Mat& img1, const Mat& img2, ...@@ -575,24 +693,26 @@ static void computeDisparitySGBM( const Mat& img1, const Mat& img2,
for( x = x1; x != x2; x += dx ) for( x = x1; x != x2; x += dx )
{ {
int xm = x*NR2, xd = xm*Dlra; int delta0 = P2 + *mem.getMinLr(lrID, x - dx);
int delta1 = P2 + *mem.getMinLr(1 - lrID, x - 1, 1);
int delta0 = minLr[0][xm - dx*NR2] + P2, delta1 = minLr[1][xm - NR2 + 1] + P2; int delta2 = P2 + *mem.getMinLr(1 - lrID, x, 2);
int delta2 = minLr[1][xm + 2] + P2, delta3 = minLr[1][xm + NR2 + 3] + P2; int delta3 = P2 + *mem.getMinLr(1 - lrID, x + 1, 3);
CostType* Lr_p0 = Lr[0] + xd - dx*NR2*Dlra; CostType* Lr_p0 = mem.getLr(lrID, x - dx);
CostType* Lr_p1 = Lr[1] + xd - NR2*Dlra + Dlra; CostType* Lr_p1 = mem.getLr(1 - lrID, x - 1, 1);
CostType* Lr_p2 = Lr[1] + xd + Dlra*2; CostType* Lr_p2 = mem.getLr(1 - lrID, x, 2);
CostType* Lr_p3 = Lr[1] + xd + NR2*Dlra + Dlra*3; CostType* Lr_p3 = mem.getLr(1 - lrID, x + 1, 3);
Lr_p0[-1] = Lr_p0[D] = Lr_p1[-1] = Lr_p1[D] = Lr_p0[-1] = Lr_p0[D] = MAX_COST;
Lr_p2[-1] = Lr_p2[D] = Lr_p3[-1] = Lr_p3[D] = MAX_COST; Lr_p1[-1] = Lr_p1[D] = MAX_COST;
Lr_p2[-1] = Lr_p2[D] = MAX_COST;
Lr_p3[-1] = Lr_p3[D] = MAX_COST;
CostType* Lr_p = Lr[0] + xd; CostType* Lr_p = mem.getLr(lrID, x);
const CostType* Cp = C + x*Da; const CostType* Cp = C + x*Da;
CostType* Sp = S + x*Da; CostType* Sp = S + x*Da;
CostType* minL = minLr[0] + xm; CostType* minL = mem.getMinLr(lrID, x);
d = 0; d = 0;
#if CV_SIMD #if CV_SIMD
v_int16 _P1 = vx_setall_s16((short)P1); v_int16 _P1 = vx_setall_s16((short)P1);
...@@ -703,14 +823,14 @@ static void computeDisparitySGBM( const Mat& img1, const Mat& img2, ...@@ -703,14 +823,14 @@ static void computeDisparitySGBM( const Mat& img1, const Mat& img2,
for( ; x <= width - v_int16::nlanes; x += v_int16::nlanes ) for( ; x <= width - v_int16::nlanes; x += v_int16::nlanes )
{ {
v_store(disp1ptr + x, v_inv_dist); v_store(disp1ptr + x, v_inv_dist);
v_store(disp2ptr + x, v_inv_dist); v_store(mem.disp2ptr + x, v_inv_dist);
v_store(disp2cost + x, v_max_cost); v_store(mem.disp2cost + x, v_max_cost);
} }
#endif #endif
for( ; x < width; x++ ) for( ; x < width; x++ )
{ {
disp1ptr[x] = disp2ptr[x] = (DispType)INVALID_DISP_SCALED; disp1ptr[x] = mem.disp2ptr[x] = (DispType)INVALID_DISP_SCALED;
disp2cost[x] = MAX_COST; mem.disp2cost[x] = MAX_COST;
} }
for( x = width1 - 1; x >= 0; x-- ) for( x = width1 - 1; x >= 0; x-- )
...@@ -721,16 +841,14 @@ static void computeDisparitySGBM( const Mat& img1, const Mat& img2, ...@@ -721,16 +841,14 @@ static void computeDisparitySGBM( const Mat& img1, const Mat& img2,
if( npasses == 1 ) if( npasses == 1 )
{ {
int xm = x*NR2, xd = xm*Dlra; CostType* Lr_p0 = mem.getLr(lrID, x + 1);
CostType* Lr_p0 = Lr[0] + xd + NR2*Dlra;
Lr_p0[-1] = Lr_p0[D] = MAX_COST; Lr_p0[-1] = Lr_p0[D] = MAX_COST;
CostType* Lr_p = Lr[0] + xd; CostType* Lr_p = mem.getLr(lrID, x);
const CostType* Cp = C + x*Da; const CostType* Cp = C + x*Da;
d = 0; d = 0;
int delta0 = minLr[0][xm + NR2] + P2; int delta0 = P2 + *mem.getMinLr(lrID, x + 1);
int minL0 = MAX_COST; int minL0 = MAX_COST;
#if CV_SIMD #if CV_SIMD
v_int16 _P1 = vx_setall_s16((short)P1); v_int16 _P1 = vx_setall_s16((short)P1);
...@@ -768,7 +886,7 @@ static void computeDisparitySGBM( const Mat& img1, const Mat& img2, ...@@ -768,7 +886,7 @@ static void computeDisparitySGBM( const Mat& img1, const Mat& img2,
bestDisp = (short)d; bestDisp = (short)d;
} }
} }
minLr[0][xm] = (CostType)minL0; *mem.getMinLr(lrID, x) = (CostType)minL0;
} }
else else
{ {
...@@ -803,10 +921,10 @@ static void computeDisparitySGBM( const Mat& img1, const Mat& img2, ...@@ -803,10 +921,10 @@ static void computeDisparitySGBM( const Mat& img1, const Mat& img2,
continue; continue;
d = bestDisp; d = bestDisp;
int _x2 = x + minX1 - d - minD; int _x2 = x + minX1 - d - minD;
if( disp2cost[_x2] > minS ) if( mem.disp2cost[_x2] > minS )
{ {
disp2cost[_x2] = (CostType)minS; mem.disp2cost[_x2] = (CostType)minS;
disp2ptr[_x2] = (DispType)(d + minD); mem.disp2ptr[_x2] = (DispType)(d + minD);
} }
if( 0 < d && d < D-1 ) if( 0 < d && d < D-1 )
...@@ -833,15 +951,13 @@ static void computeDisparitySGBM( const Mat& img1, const Mat& img2, ...@@ -833,15 +951,13 @@ static void computeDisparitySGBM( const Mat& img1, const Mat& img2,
int _d = d1 >> DISP_SHIFT; int _d = d1 >> DISP_SHIFT;
int d_ = (d1 + DISP_SCALE-1) >> DISP_SHIFT; int d_ = (d1 + DISP_SCALE-1) >> DISP_SHIFT;
int _x = x - _d, x_ = x - d_; int _x = x - _d, x_ = x - d_;
if( 0 <= _x && _x < width && disp2ptr[_x] >= minD && std::abs(disp2ptr[_x] - _d) > disp12MaxDiff && if( 0 <= _x && _x < width && mem.disp2ptr[_x] >= minD && std::abs(mem.disp2ptr[_x] - _d) > disp12MaxDiff &&
0 <= x_ && x_ < width && disp2ptr[x_] >= minD && std::abs(disp2ptr[x_] - d_) > disp12MaxDiff ) 0 <= x_ && x_ < width && mem.disp2ptr[x_] >= minD && std::abs(mem.disp2ptr[x_] - d_) > disp12MaxDiff )
disp1ptr[x] = (DispType)INVALID_DISP_SCALED; disp1ptr[x] = (DispType)INVALID_DISP_SCALED;
} }
} }
// now shift the cyclic buffers lrID = 1 - lrID; // now shift the cyclic buffers
std::swap( Lr[0], Lr[1] );
std::swap( minLr[0], minLr[1] );
} }
} }
} }
...@@ -849,13 +965,12 @@ static void computeDisparitySGBM( const Mat& img1, const Mat& img2, ...@@ -849,13 +965,12 @@ static void computeDisparitySGBM( const Mat& img1, const Mat& img2,
//////////////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////////////
struct CalcVerticalSums: public ParallelLoopBody struct CalcVerticalSums: public ParallelLoopBody
{ {
CalcVerticalSums(const Mat& _img1, const Mat& _img2, const StereoSGBMParams& params, CalcVerticalSums(const Mat& _img1, const Mat& _img2, const StereoSGBMParams& params, const BufferSGBM &mem_)
CostType* alignedBuf, PixType* _clipTab): img1(_img1), img2(_img2), clipTab(_clipTab) : img1(_img1), img2(_img2), mem(mem_)
{ {
minD = params.minDisparity; minD = params.minDisparity;
maxD = minD + params.numDisparities; maxD = minD + params.numDisparities;
SW2 = SH2 = (params.SADWindowSize > 0 ? params.SADWindowSize : 5)/2; SW2 = SH2 = params.calcSADWindowSize().height/2;
ftzero = std::max(params.preFilterCap, 15) | 1;
P1 = params.P1 > 0 ? params.P1 : 2; P1 = params.P1 > 0 ? params.P1 : 2;
P2 = std::max(params.P2 > 0 ? params.P2 : 5, P1+1); P2 = std::max(params.P2 > 0 ? params.P2 : 5, P1+1);
height = img1.rows; height = img1.rows;
...@@ -865,32 +980,27 @@ struct CalcVerticalSums: public ParallelLoopBody ...@@ -865,32 +980,27 @@ struct CalcVerticalSums: public ParallelLoopBody
Da = (int)alignSize(D, v_int16::nlanes); Da = (int)alignSize(D, v_int16::nlanes);
Dlra = Da + v_int16::nlanes;//Additional memory is necessary to store disparity values(MAX_COST) for d=-1 and d=D Dlra = Da + v_int16::nlanes;//Additional memory is necessary to store disparity values(MAX_COST) for d=-1 and d=D
width1 = maxX1 - minX1; width1 = maxX1 - minX1;
costBufSize = width1*Da; D = params.numDisparities;
CSBufSize = costBufSize*height; Da = (int)alignSize(D, v_int16::nlanes);
minLrSize = width1;
LrSize = minLrSize*Dlra;
hsumBufNRows = SH2*2 + 2;
Cbuf = alignedBuf;
Sbuf = Cbuf + CSBufSize;
hsumBuf = Sbuf + CSBufSize;
} }
void operator()(const Range& range) const CV_OVERRIDE void operator()(const Range& range) const CV_OVERRIDE
{ {
static const CostType MAX_COST = SHRT_MAX; const CostType MAX_COST = SHRT_MAX;
static const int TAB_OFS = 256*4; const int npasses = 2;
static const int npasses = 2; const int x1 = range.start, x2 = range.end;
int x1 = range.start, x2 = range.end, k; int k;
size_t pixDiffSize = ((x2 - x1) + 2*SW2)*Da;
size_t auxBufsSize = CV_SIMD_WIDTH + pixDiffSize*sizeof(CostType) + //alignment and pixdiff size CostType* pixDiff = 0;
width*(4*img1.channels()+2)*sizeof(PixType); //tempBuf PixType* tempBuf = 0;
Mat auxBuff; utils::BufferArea aux_area;
auxBuff.create(1, (int)auxBufsSize, CV_8U); aux_area.allocate(pixDiff, ((x2 - x1) + 2 * SW2) * Da, CV_SIMD_WIDTH);
CostType* pixDiff = (CostType*)alignPtr(auxBuff.ptr(), CV_SIMD_WIDTH); aux_area.allocate(tempBuf, width * (4 * img1.channels() + 2) * sizeof(PixType), CV_SIMD_WIDTH);
PixType* tempBuf = (PixType*)(pixDiff + pixDiffSize); aux_area.commit();
// Simplification of index calculation // Simplification of index calculation
pixDiff -= (x1>SW2 ? (x1 - SW2): 0)*Da; if (x1 > SW2)
pixDiff -= (x1 - SW2) * Da;
for( int pass = 1; pass <= npasses; pass++ ) for( int pass = 1; pass <= npasses; pass++ )
{ {
...@@ -905,26 +1015,14 @@ struct CalcVerticalSums: public ParallelLoopBody ...@@ -905,26 +1015,14 @@ struct CalcVerticalSums: public ParallelLoopBody
y1 = height-1; y2 = -1; dy = -1; y1 = height-1; y2 = -1; dy = -1;
} }
CostType *Lr[2]={0}, *minLr[2]={0}; uchar lrID = 0;
mem.clearLr(range);
for( k = 0; k < 2; k++ )
{
// shift Lr[k] and minLr[k] pointers, because we allocated them with the borders,
// and will occasionally use negative indices with the arrays
// we need to shift Lr[k] pointers by 1, to give the space for d=-1.
// however, then the alignment will be imperfect, i.e. bad for SSE,
// thus we shift the pointers by SIMD vector size
Lr[k] = hsumBuf + costBufSize*hsumBufNRows + v_int16::nlanes + LrSize*k;
memset( Lr[k] + x1*Dlra, 0, (x2-x1)*Dlra*sizeof(CostType) );
minLr[k] = hsumBuf + costBufSize*hsumBufNRows + v_int16::nlanes + LrSize*2 + minLrSize*k;
memset( minLr[k] + x1, 0, (x2-x1)*sizeof(CostType) );
}
for( int y = y1; y != y2; y += dy ) for( int y = y1; y != y2; y += dy )
{ {
int x, d; int x, d;
CostType* C = Cbuf + y*costBufSize; CostType* C = mem.getCBuf(y);
CostType* S = Sbuf + y*costBufSize; CostType* S = mem.getSBuf(y);
if( pass == 1 ) // compute C on the first pass, and reuse it on the second pass, if any. if( pass == 1 ) // compute C on the first pass, and reuse it on the second pass, if any.
{ {
...@@ -932,11 +1030,11 @@ struct CalcVerticalSums: public ParallelLoopBody ...@@ -932,11 +1030,11 @@ struct CalcVerticalSums: public ParallelLoopBody
for( k = dy1; k <= dy2; k++ ) for( k = dy1; k <= dy2; k++ )
{ {
CostType* hsumAdd = hsumBuf + (std::min(k, height-1) % hsumBufNRows)*costBufSize; CostType* hsumAdd = mem.getHSumBuf(std::min(k, height-1));
if( k < height ) if( k < height )
{ {
calcPixelCostBT( img1, img2, k, minD, maxD, pixDiff, tempBuf, clipTab, TAB_OFS, ftzero, x1 - SW2, x2 + SW2); calcPixelCostBT( img1, img2, k, minD, maxD, pixDiff, tempBuf, mem.getClipTab(), x1 - SW2, x2 + SW2);
memset(hsumAdd + x1*Da, 0, Da*sizeof(CostType)); memset(hsumAdd + x1*Da, 0, Da*sizeof(CostType));
for( x = (x1 - SW2)*Da; x <= (x1 + SW2)*Da; x += Da ) for( x = (x1 - SW2)*Da; x <= (x1 + SW2)*Da; x += Da )
...@@ -953,8 +1051,8 @@ struct CalcVerticalSums: public ParallelLoopBody ...@@ -953,8 +1051,8 @@ struct CalcVerticalSums: public ParallelLoopBody
if( y > 0 ) if( y > 0 )
{ {
const CostType* hsumSub = hsumBuf + (std::max(y - SH2 - 1, 0) % hsumBufNRows)*costBufSize; const CostType* hsumSub = mem.getHSumBuf(std::max(y - SH2 - 1, 0));
const CostType* Cprev = C - costBufSize; const CostType* Cprev = mem.getCBuf(y - 1);
#if CV_SIMD #if CV_SIMD
for( d = 0; d < Da; d += v_int16::nlanes ) for( d = 0; d < Da; d += v_int16::nlanes )
v_store_aligned(C + x1*Da + d, vx_load_aligned(Cprev + x1*Da + d) + vx_load_aligned(hsumAdd + x1*Da + d) - vx_load_aligned(hsumSub + x1*Da + d)); v_store_aligned(C + x1*Da + d, vx_load_aligned(Cprev + x1*Da + d) + vx_load_aligned(hsumAdd + x1*Da + d) - vx_load_aligned(hsumSub + x1*Da + d));
...@@ -1020,8 +1118,8 @@ struct CalcVerticalSums: public ParallelLoopBody ...@@ -1020,8 +1118,8 @@ struct CalcVerticalSums: public ParallelLoopBody
{ {
/* if (y > 0) /* if (y > 0)
{ {
const CostType* hsumSub = hsumBuf + (std::max(y - SH2 - 1, 0) % hsumBufNRows)*costBufSize; const CostType* hsumSub = mem.getHSumBuf(std::max(y - SH2 - 1, 0));
const CostType* Cprev = C - costBufSize; const CostType* Cprev = mem.getCBuf(y - 1);
#if CV_SIMD #if CV_SIMD
for( x = x1*Da; x < x2*Da; x += v_int16::nlanes ) for( x = x1*Da; x < x2*Da; x += v_int16::nlanes )
...@@ -1044,9 +1142,7 @@ struct CalcVerticalSums: public ParallelLoopBody ...@@ -1044,9 +1142,7 @@ struct CalcVerticalSums: public ParallelLoopBody
} }
} }
} }
mem.clearSBuf(y, range);
// also, clear the S buffer
memset(S + x1*Da, 0, (x2-x1)*Da*sizeof(CostType));
} }
// [formula 13 in the paper] // [formula 13 in the paper]
...@@ -1061,19 +1157,16 @@ struct CalcVerticalSums: public ParallelLoopBody ...@@ -1061,19 +1157,16 @@ struct CalcVerticalSums: public ParallelLoopBody
for( x = x1; x != x2; x++ ) for( x = x1; x != x2; x++ )
{ {
int xd = x*Dlra; int delta = P2 + *mem.getMinLr(1 - lrID, x);
CostType* Lr_ppr = mem.getLr(1 - lrID, x);
int delta = minLr[1][x] + P2;
CostType* Lr_ppr = Lr[1] + xd;
Lr_ppr[-1] = Lr_ppr[D] = MAX_COST; Lr_ppr[-1] = Lr_ppr[D] = MAX_COST;
CostType* Lr_p = Lr[0] + xd; CostType* Lr_p = mem.getLr(lrID, x);
const CostType* Cp = C + x*Da; const CostType* Cp = C + x*Da;
CostType* Sp = S + x*Da; CostType* Sp = S + x*Da;
CostType& minL = minLr[0][x]; CostType& minL = *(mem.getMinLr(lrID, x));
d = 0; d = 0;
#if CV_SIMD #if CV_SIMD
v_int16 _P1 = vx_setall_s16((short)P1); v_int16 _P1 = vx_setall_s16((short)P1);
...@@ -1105,19 +1198,13 @@ struct CalcVerticalSums: public ParallelLoopBody ...@@ -1105,19 +1198,13 @@ struct CalcVerticalSums: public ParallelLoopBody
Sp[d] = saturate_cast<CostType>(Sp[d] + L); Sp[d] = saturate_cast<CostType>(Sp[d] + L);
} }
} }
lrID = 1 - lrID; // now shift the cyclic buffers
// now shift the cyclic buffers
std::swap( Lr[0], Lr[1] );
std::swap( minLr[0], minLr[1] );
} }
} }
} }
const Mat& img1; const Mat& img1;
const Mat& img2; const Mat& img2;
CostType* Cbuf; const BufferSGBM & mem;
CostType* Sbuf;
CostType* hsumBuf;
PixType* clipTab;
int minD; int minD;
int maxD; int maxD;
int D, Da, Dlra; int D, Da, Dlra;
...@@ -1128,18 +1215,12 @@ struct CalcVerticalSums: public ParallelLoopBody ...@@ -1128,18 +1215,12 @@ struct CalcVerticalSums: public ParallelLoopBody
int height; int height;
int P1; int P1;
int P2; int P2;
size_t costBufSize;
size_t CSBufSize;
size_t minLrSize;
size_t LrSize;
size_t hsumBufNRows;
int ftzero;
}; };
struct CalcHorizontalSums: public ParallelLoopBody struct CalcHorizontalSums: public ParallelLoopBody
{ {
CalcHorizontalSums(const Mat& _img1, const Mat& _img2, Mat& _disp1, const StereoSGBMParams& params, CalcHorizontalSums(const Mat& _img1, const Mat& _img2, Mat& _disp1, const StereoSGBMParams& params, const BufferSGBM &mem_)
CostType* alignedBuf): img1(_img1), img2(_img2), disp1(_disp1) : img1(_img1), img2(_img2), disp1(_disp1), mem(mem_)
{ {
minD = params.minDisparity; minD = params.minDisparity;
maxD = minD + params.numDisparities; maxD = minD + params.numDisparities;
...@@ -1157,23 +1238,22 @@ struct CalcHorizontalSums: public ParallelLoopBody ...@@ -1157,23 +1238,22 @@ struct CalcHorizontalSums: public ParallelLoopBody
Da = (int)alignSize(D, v_int16::nlanes); Da = (int)alignSize(D, v_int16::nlanes);
Dlra = Da + v_int16::nlanes;//Additional memory is necessary to store disparity values(MAX_COST) for d=-1 and d=D Dlra = Da + v_int16::nlanes;//Additional memory is necessary to store disparity values(MAX_COST) for d=-1 and d=D
width1 = maxX1 - minX1; width1 = maxX1 - minX1;
costBufSize = width1*Da;
CSBufSize = costBufSize*height;
LrSize = 2 * Dlra;
Cbuf = alignedBuf;
Sbuf = Cbuf + CSBufSize;
} }
void operator()(const Range& range) const CV_OVERRIDE void operator()(const Range& range) const CV_OVERRIDE
{ {
int y1 = range.start, y2 = range.end; int y1 = range.start, y2 = range.end;
size_t auxBufsSize = CV_SIMD_WIDTH + (v_int16::nlanes + LrSize) * sizeof(CostType) + width*(sizeof(CostType) + sizeof(DispType));
Mat auxBuff; const size_t LrSize = 2 * (1 + Dlra + 1);
auxBuff.create(1, (int)auxBufsSize, CV_8U);
CostType *Lr = ((CostType*)alignPtr(auxBuff.ptr(), CV_SIMD_WIDTH)) + v_int16::nlanes; CostType * Lr = 0;
CostType* disp2cost = Lr + LrSize; CostType * disp2cost = 0;
DispType* disp2ptr = (DispType*)(disp2cost + width); DispType * disp2ptr = 0;
utils::BufferArea aux_area;
aux_area.allocate(Lr, LrSize);
aux_area.allocate(disp2cost, width, CV_SIMD_WIDTH);
aux_area.allocate(disp2ptr, width, CV_SIMD_WIDTH);
aux_area.commit();
CostType minLr; CostType minLr;
...@@ -1181,8 +1261,8 @@ struct CalcHorizontalSums: public ParallelLoopBody ...@@ -1181,8 +1261,8 @@ struct CalcHorizontalSums: public ParallelLoopBody
{ {
int x, d; int x, d;
DispType* disp1ptr = disp1.ptr<DispType>(y); DispType* disp1ptr = disp1.ptr<DispType>(y);
CostType* C = Cbuf + y*costBufSize; CostType* C = mem.getCBuf(y);
CostType* S = Sbuf + y*costBufSize; CostType* S = mem.getSBuf(y);
x = 0; x = 0;
#if CV_SIMD #if CV_SIMD
...@@ -1202,8 +1282,8 @@ struct CalcHorizontalSums: public ParallelLoopBody ...@@ -1202,8 +1282,8 @@ struct CalcHorizontalSums: public ParallelLoopBody
} }
// clear buffers // clear buffers
memset( Lr, 0, LrSize*sizeof(CostType) ); aux_area.zeroFill(Lr);
Lr[-1] = Lr[D] = Lr[Dlra - 1] = Lr[Dlra + D] = MAX_COST; Lr[0] = Lr[1 + D] = Lr[3 + Dlra - 1] = Lr[3 + Dlra + D] = MAX_COST;
minLr = 0; minLr = 0;
// [formula 13 in the paper] // [formula 13 in the paper]
...@@ -1219,10 +1299,8 @@ struct CalcHorizontalSums: public ParallelLoopBody ...@@ -1219,10 +1299,8 @@ struct CalcHorizontalSums: public ParallelLoopBody
for( x = 0; x != width1; x++) for( x = 0; x != width1; x++)
{ {
int delta = minLr + P2; int delta = minLr + P2;
CostType* Lr_ppr = Lr + ((x&1)? 1 : 3 + Dlra);
CostType* Lr_ppr = Lr + ((x&1)? 0 : Dlra); CostType* Lr_p = Lr + ((x&1)? 3 + Dlra : 1);
CostType* Lr_p = Lr + ((x&1)? Dlra :0);
const CostType* Cp = C + x*Da; const CostType* Cp = C + x*Da;
CostType* Sp = S + x*Da; CostType* Sp = S + x*Da;
...@@ -1236,8 +1314,8 @@ struct CalcHorizontalSums: public ParallelLoopBody ...@@ -1236,8 +1314,8 @@ struct CalcHorizontalSums: public ParallelLoopBody
for( ; d <= D - v_int16::nlanes; d += v_int16::nlanes) for( ; d <= D - v_int16::nlanes; d += v_int16::nlanes)
{ {
v_int16 Cpd = vx_load_aligned(Cp + d); v_int16 Cpd = vx_load_aligned(Cp + d);
v_int16 L = v_min(v_min(v_min(vx_load_aligned(Lr_ppr + d), vx_load(Lr_ppr + d - 1) + _P1), vx_load(Lr_ppr + d + 1) + _P1), _delta) - _delta + Cpd; v_int16 L = v_min(v_min(v_min(vx_load(Lr_ppr + d), vx_load(Lr_ppr + d - 1) + _P1), vx_load(Lr_ppr + d + 1) + _P1), _delta) - _delta + Cpd;
v_store_aligned(Lr_p + d, L); v_store(Lr_p + d, L);
_minL = v_min(_minL, L); _minL = v_min(_minL, L);
v_store_aligned(Sp + d, vx_load_aligned(Sp + d) + L); v_store_aligned(Sp + d, vx_load_aligned(Sp + d) + L);
} }
...@@ -1255,18 +1333,16 @@ struct CalcHorizontalSums: public ParallelLoopBody ...@@ -1255,18 +1333,16 @@ struct CalcHorizontalSums: public ParallelLoopBody
} }
} }
memset( Lr, 0, LrSize*sizeof(CostType) ); aux_area.zeroFill(Lr);
Lr[-1] = Lr[D] = Lr[Dlra - 1] = Lr[Dlra + D] = MAX_COST; Lr[0] = Lr[1 + D] = Lr[3 + Dlra - 1] = Lr[3 + Dlra + D] = MAX_COST;
minLr = 0; minLr = 0;
for( x = width1-1; x != -1; x--) for( x = width1-1; x != -1; x--)
{ {
int delta = minLr + P2; int delta = minLr + P2;
CostType* Lr_ppr = Lr + ((x&1)? 1 : 3 + Dlra);
CostType* Lr_ppr = Lr + ((x&1)? 0 :Dlra); CostType* Lr_p = Lr + ((x&1)? 3 + Dlra : 1);
CostType* Lr_p = Lr + ((x&1)? Dlra :0);
const CostType* Cp = C + x*Da; const CostType* Cp = C + x*Da;
CostType* Sp = S + x*Da; CostType* Sp = S + x*Da;
CostType minS = MAX_COST; CostType minS = MAX_COST;
...@@ -1283,8 +1359,8 @@ struct CalcHorizontalSums: public ParallelLoopBody ...@@ -1283,8 +1359,8 @@ struct CalcHorizontalSums: public ParallelLoopBody
for( ; d <= D - v_int16::nlanes; d += v_int16::nlanes ) for( ; d <= D - v_int16::nlanes; d += v_int16::nlanes )
{ {
v_int16 Cpd = vx_load_aligned(Cp + d); v_int16 Cpd = vx_load_aligned(Cp + d);
v_int16 L = v_min(v_min(v_min(vx_load_aligned(Lr_ppr + d), vx_load(Lr_ppr + d - 1) + _P1), vx_load(Lr_ppr + d + 1) + _P1), _delta) - _delta + Cpd; v_int16 L = v_min(v_min(v_min(vx_load(Lr_ppr + d), vx_load(Lr_ppr + d - 1) + _P1), vx_load(Lr_ppr + d + 1) + _P1), _delta) - _delta + Cpd;
v_store_aligned(Lr_p + d, L); v_store(Lr_p + d, L);
_minL = v_min(_minL, L); _minL = v_min(_minL, L);
L += vx_load_aligned(Sp + d); L += vx_load_aligned(Sp + d);
v_store_aligned(Sp + d, L); v_store_aligned(Sp + d, L);
...@@ -1366,8 +1442,7 @@ struct CalcHorizontalSums: public ParallelLoopBody ...@@ -1366,8 +1442,7 @@ struct CalcHorizontalSums: public ParallelLoopBody
const Mat& img1; const Mat& img1;
const Mat& img2; const Mat& img2;
Mat& disp1; Mat& disp1;
CostType* Cbuf; const BufferSGBM & mem;
CostType* Sbuf;
int minD; int minD;
int maxD; int maxD;
int D, Da, Dlra; int D, Da, Dlra;
...@@ -1378,9 +1453,6 @@ struct CalcHorizontalSums: public ParallelLoopBody ...@@ -1378,9 +1453,6 @@ struct CalcHorizontalSums: public ParallelLoopBody
int P2; int P2;
int minX1; int minX1;
int maxX1; int maxX1;
size_t costBufSize;
size_t CSBufSize;
size_t LrSize;
int INVALID_DISP; int INVALID_DISP;
int INVALID_DISP_SCALED; int INVALID_DISP_SCALED;
int uniquenessRatio; int uniquenessRatio;
...@@ -1401,28 +1473,21 @@ struct CalcHorizontalSums: public ParallelLoopBody ...@@ -1401,28 +1473,21 @@ struct CalcHorizontalSums: public ParallelLoopBody
is written as is, without interpolation. is written as is, without interpolation.
*/ */
static void computeDisparitySGBM_HH4( const Mat& img1, const Mat& img2, static void computeDisparitySGBM_HH4( const Mat& img1, const Mat& img2,
Mat& disp1, const StereoSGBMParams& params, Mat& disp1, const StereoSGBMParams& params)
Mat& buffer )
{ {
const int DISP_SHIFT = StereoMatcher::DISP_SHIFT; const int DISP_SHIFT = StereoMatcher::DISP_SHIFT;
const int DISP_SCALE = (1 << DISP_SHIFT); const int DISP_SCALE = (1 << DISP_SHIFT);
int minD = params.minDisparity, maxD = minD + params.numDisparities; int minD = params.minDisparity, maxD = minD + params.numDisparities;
Size SADWindowSize; Size SADWindowSize;
SADWindowSize.width = SADWindowSize.height = params.SADWindowSize > 0 ? params.SADWindowSize : 5; SADWindowSize.width = SADWindowSize.height = params.SADWindowSize > 0 ? params.SADWindowSize : 5;
int ftzero = std::max(params.preFilterCap, 15) | 1;
int P1 = params.P1 > 0 ? params.P1 : 2, P2 = std::max(params.P2 > 0 ? params.P2 : 5, P1+1); int P1 = params.P1 > 0 ? params.P1 : 2, P2 = std::max(params.P2 > 0 ? params.P2 : 5, P1+1);
int k, width = disp1.cols, height = disp1.rows; int width = disp1.cols, height = disp1.rows;
int minX1 = std::max(maxD, 0), maxX1 = width + std::min(minD, 0); int minX1 = std::max(maxD, 0), maxX1 = width + std::min(minD, 0);
int D = (int)alignSize(maxD - minD, v_int16::nlanes), width1 = maxX1 - minX1; int width1 = maxX1 - minX1;
int Dlra = D + v_int16::nlanes;//Additional memory is necessary to store disparity values(MAX_COST) for d=-1 and d=D int Da = (int)alignSize(params.numDisparities, v_int16::nlanes);
int SH2 = SADWindowSize.height/2; int Dlra = Da + v_int16::nlanes;//Additional memory is necessary to store disparity values(MAX_COST) for d=-1 and d=D
int INVALID_DISP = minD - 1; int INVALID_DISP = minD - 1;
int INVALID_DISP_SCALED = INVALID_DISP*DISP_SCALE; int INVALID_DISP_SCALED = INVALID_DISP*DISP_SCALE;
const int TAB_OFS = 256*4, TAB_SIZE = 256 + TAB_OFS*2;
PixType clipTab[TAB_SIZE];
for( k = 0; k < TAB_SIZE; k++ )
clipTab[k] = (PixType)(std::min(std::max(k - TAB_OFS, -ftzero), ftzero) + ftzero);
if( minX1 >= maxX1 ) if( minX1 >= maxX1 )
{ {
...@@ -1430,54 +1495,79 @@ static void computeDisparitySGBM_HH4( const Mat& img1, const Mat& img2, ...@@ -1430,54 +1495,79 @@ static void computeDisparitySGBM_HH4( const Mat& img1, const Mat& img2,
return; return;
} }
// for each possible stereo match (img1(x,y) <=> img2(x-d,y)) BufferSGBM mem(width1, Da, Dlra, img1.channels(), width, height, params);
// we keep pixel difference cost (C) and the summary cost over 4 directions (S). mem.initCBuf((CostType)P2); // add P2 to every C(x,y). it saves a few operations in the inner loops
// we also keep all the partial costs for the previous line L_r(x,d) and also min_k L_r(x, k)
// the number of L_r(.,.) and min_k L_r(.,.) lines in the buffer:
// for dynamic programming we need the current row and
// the previous row, i.e. 2 rows in total
size_t costBufSize = width1*D;
size_t CSBufSize = costBufSize*height;
size_t minLrSize = width1 , LrSize = minLrSize*Dlra;
int hsumBufNRows = SH2*2 + 2;
size_t totalBufSize = CV_SIMD_WIDTH + CSBufSize * 2 * sizeof(CostType) + // Alignment, C, S
costBufSize*hsumBufNRows * sizeof(CostType) + // hsumBuf
((LrSize + minLrSize)*2 + v_int16::nlanes) * sizeof(CostType); // minLr[] and Lr[]
if( buffer.empty() || !buffer.isContinuous() ||
buffer.cols*buffer.rows*buffer.elemSize() < totalBufSize )
{
buffer.reserveBuffer(totalBufSize);
}
// summary cost over different (nDirs) directions
CostType* Cbuf = (CostType*)alignPtr(buffer.ptr(), CV_SIMD_WIDTH);
// add P2 to every C(x,y). it saves a few operations in the inner loops
for(k = 0; k < (int)CSBufSize; k++ )
Cbuf[k] = (CostType)P2;
parallel_for_(Range(0,width1),CalcVerticalSums(img1, img2, params, Cbuf, clipTab),8);
parallel_for_(Range(0,height),CalcHorizontalSums(img1, img2, disp1, params, Cbuf),8);
parallel_for_(Range(0,width1),CalcVerticalSums(img1, img2, params, mem),8);
parallel_for_(Range(0,height),CalcHorizontalSums(img1, img2, disp1, params, mem),8);
} }
////////////////////////////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////////////////////////////
void getBufferPointers(Mat& buffer, int width, int width1, int Da, int num_ch, int SH2, int P2, class BufferSGBM3Way
CostType*& curCostVolumeLine, CostType*& hsumBuf, CostType*& pixDiff, {
PixType*& tmpBuf, CostType*& horPassCostVolume, private:
CostType*& vertPassCostVolume, CostType*& vertPassMin, CostType*& rightPassBuf, size_t hsumCols;
CostType*& disp2CostBuf, short*& disp2Buf); size_t hsumRows;
public:
CostType *curCostVolumeLine;
CostType *hsumBuf;
CostType *pixDiff;
PixType *tmpBuf;
CostType *horPassCostVolume;
CostType *vertPassCostVolume;
CostType *vertPassMin;
CostType *rightPassBuf;
CostType *disp2CostBuf;
short *disp2Buf;
private:
utils::BufferArea area;
public:
BufferSGBM3Way(int width1, int width, int num_ch, int Da, int SH2, int P2) :
curCostVolumeLine(0),
hsumBuf(0),
pixDiff(0),
tmpBuf(0),
horPassCostVolume(0),
vertPassCostVolume(0),
vertPassMin(0),
rightPassBuf(0),
disp2CostBuf(0),
disp2Buf(0)
{
hsumCols = width1 * Da;
hsumRows = SH2*2 + 2;
area.allocate(curCostVolumeLine, hsumCols, CV_SIMD_WIDTH);
area.allocate(hsumBuf, hsumCols * hsumRows, CV_SIMD_WIDTH);
area.allocate(pixDiff,hsumCols, CV_SIMD_WIDTH);
area.allocate(tmpBuf, width * (4 * num_ch + 2), CV_SIMD_WIDTH);
area.allocate(horPassCostVolume, (width1 + 2) * Da, CV_SIMD_WIDTH);
area.allocate(vertPassCostVolume, (width1 + 2) * Da, CV_SIMD_WIDTH);
area.allocate(vertPassMin, width1 + 2, CV_SIMD_WIDTH);
area.allocate(rightPassBuf, Da, CV_SIMD_WIDTH);
area.allocate(disp2CostBuf, width, CV_SIMD_WIDTH);
area.allocate(disp2Buf, width, CV_SIMD_WIDTH);
area.commit();
area.zeroFill();
for(size_t i = 0; i < hsumCols; i++)
curCostVolumeLine[i] = (CostType)P2;
}
inline void clearRightPassBuf()
{
area.zeroFill(rightPassBuf);
}
CostType *getHSumBuf(int x) const
{
return hsumBuf + (x % hsumRows) * hsumCols;
}
};
struct SGBM3WayMainLoop : public ParallelLoopBody struct SGBM3WayMainLoop : public ParallelLoopBody
{ {
Mat* buffers;
const Mat *img1, *img2; const Mat *img1, *img2;
Mat* dst_disp; Mat* dst_disp;
int nstripes, stripe_sz; int stripe_sz;
int stripe_overlap; int stripe_overlap;
int width,height; int width,height;
...@@ -1488,25 +1578,54 @@ struct SGBM3WayMainLoop : public ParallelLoopBody ...@@ -1488,25 +1578,54 @@ struct SGBM3WayMainLoop : public ParallelLoopBody
int P1, P2; int P1, P2;
int uniquenessRatio, disp12MaxDiff; int uniquenessRatio, disp12MaxDiff;
int costBufSize, hsumBufNRows; int TAB_OFS;
int TAB_OFS, ftzero;
utils::BufferArea aux_area;
PixType* clipTab; PixType* clipTab;
#if CV_SIMD #if CV_SIMD
short idx_row[v_int16::nlanes]; short idx_row[v_int16::nlanes];
#endif #endif
SGBM3WayMainLoop(Mat *_buffers, const Mat& _img1, const Mat& _img2, Mat* _dst_disp, const StereoSGBMParams& params, PixType* _clipTab, int _nstripes, int _stripe_overlap); SGBM3WayMainLoop(const Mat& _img1, const Mat& _img2, Mat* _dst_disp, const StereoSGBMParams& params, int stripe_size, int _stripe_overlap);
void getRawMatchingCost(CostType* C, CostType* hsumBuf, CostType* pixDiff, PixType* tmpBuf, int y, int src_start_idx) const;
void operator () (const Range& range) const CV_OVERRIDE; void operator () (const Range& range) const CV_OVERRIDE;
template<bool x_nlanes> void impl(const Range& range) const; template<bool x_nlanes> void impl(const Range& range) const;
private:
void getRawMatchingCost(const BufferSGBM3Way &mem, int y, int src_start_idx) const;
template<bool x_nlanes>
void accumulateCostsLeftTop(const BufferSGBM3Way &mem,
int x,
CostType &leftMinCost) const;
template<bool x_nlanes>
void accumulateCostsRight(const BufferSGBM3Way &mem,
int x,
CostType &rightMinCost,
short &optimal_disp,
CostType &min_cost) const;
}; };
SGBM3WayMainLoop::SGBM3WayMainLoop(Mat *_buffers, const Mat& _img1, const Mat& _img2, Mat* _dst_disp, const StereoSGBMParams& params, PixType* _clipTab, int _nstripes, int _stripe_overlap): SGBM3WayMainLoop::SGBM3WayMainLoop(const Mat& _img1,
buffers(_buffers), img1(&_img1), img2(&_img2), dst_disp(_dst_disp), clipTab(_clipTab) const Mat& _img2,
Mat* _dst_disp,
const StereoSGBMParams& params,
int _stripe_sz,
int _stripe_overlap)
: img1(&_img1),
img2(&_img2),
dst_disp(_dst_disp),
stripe_sz(_stripe_sz),
stripe_overlap(_stripe_overlap),
clipTab(0)
{ {
nstripes = _nstripes; // precompute a lookup table for the raw matching cost computation:
stripe_overlap = _stripe_overlap; TAB_OFS = 256*4;
stripe_sz = (int)ceil(img1->rows/(double)nstripes); const int TAB_SIZE = 256 + TAB_OFS*2;
aux_area.allocate(clipTab, TAB_SIZE, CV_SIMD_WIDTH);
aux_area.commit();
const int ftzero = std::max(params.preFilterCap, 15) | 1;
for(int k = 0; k < TAB_SIZE; k++ )
clipTab[k] = (PixType)(std::min(std::max(k - TAB_OFS, -ftzero), ftzero) + ftzero);
width = img1->cols; height = img1->rows; width = img1->cols; height = img1->rows;
minD = params.minDisparity; maxD = minD + params.numDisparities; D = maxD - minD; minD = params.minDisparity; maxD = minD + params.numDisparities; D = maxD - minD;
...@@ -1519,100 +1638,27 @@ buffers(_buffers), img1(&_img1), img2(&_img2), dst_disp(_dst_disp), clipTab(_cli ...@@ -1519,100 +1638,27 @@ buffers(_buffers), img1(&_img1), img2(&_img2), dst_disp(_dst_disp), clipTab(_cli
uniquenessRatio = params.uniquenessRatio >= 0 ? params.uniquenessRatio : 10; uniquenessRatio = params.uniquenessRatio >= 0 ? params.uniquenessRatio : 10;
disp12MaxDiff = params.disp12MaxDiff > 0 ? params.disp12MaxDiff : 1; disp12MaxDiff = params.disp12MaxDiff > 0 ? params.disp12MaxDiff : 1;
costBufSize = width1*Da;
hsumBufNRows = SH2*2 + 2;
TAB_OFS = 256*4;
ftzero = std::max(params.preFilterCap, 15) | 1;
#if CV_SIMD #if CV_SIMD
for(short i = 0; i < v_int16::nlanes; ++i) for(short i = 0; i < v_int16::nlanes; ++i)
idx_row[i] = i; idx_row[i] = i;
#endif #endif
} }
void getBufferPointers(Mat& buffer, int width, int width1, int Da, int num_ch, int SH2, int P2,
CostType*& curCostVolumeLine, CostType*& hsumBuf, CostType*& pixDiff,
PixType*& tmpBuf, CostType*& horPassCostVolume,
CostType*& vertPassCostVolume, CostType*& vertPassMin, CostType*& rightPassBuf,
CostType*& disp2CostBuf, short*& disp2Buf)
{
// allocating all the required memory:
int costVolumeLineSize = width1*Da;
int width1_ext = width1+2;
int costVolumeLineSize_ext = width1_ext*Da;
int hsumBufNRows = SH2*2 + 2;
// main buffer to store matching costs for the current line:
int curCostVolumeLineSize = costVolumeLineSize*sizeof(CostType);
// auxiliary buffers for the raw matching cost computation:
int hsumBufSize = costVolumeLineSize*hsumBufNRows*sizeof(CostType);
int pixDiffSize = costVolumeLineSize*sizeof(CostType);
int tmpBufSize = width * (4 * num_ch + 2) * sizeof(PixType);
// auxiliary buffers for the matching cost aggregation:
int horPassCostVolumeSize = costVolumeLineSize_ext*sizeof(CostType); // buffer for the 2-pass horizontal cost aggregation
int vertPassCostVolumeSize = costVolumeLineSize_ext*sizeof(CostType); // buffer for the vertical cost aggregation
int rightPassBufSize = Da * sizeof(CostType); // additional small buffer for the right-to-left pass
int vertPassMinSize = width1_ext*sizeof(CostType); // buffer for storing minimum costs from the previous line
// buffers for the pseudo-LRC check:
int disp2CostBufSize = width*sizeof(CostType);
int disp2BufSize = width*sizeof(short);
// sum up the sizes of all the buffers:
size_t totalBufSize = CV_SIMD_WIDTH + curCostVolumeLineSize +
hsumBufSize +
pixDiffSize +
horPassCostVolumeSize +
vertPassCostVolumeSize +
rightPassBufSize +
vertPassMinSize +
disp2CostBufSize +
disp2BufSize +
tmpBufSize;
if( buffer.empty() || !buffer.isContinuous() || buffer.cols*buffer.rows*buffer.elemSize() < totalBufSize )
buffer.reserveBuffer(totalBufSize);
// set up all the pointers:
curCostVolumeLine = (CostType*)alignPtr(buffer.ptr(), CV_SIMD_WIDTH);
hsumBuf = curCostVolumeLine + costVolumeLineSize;
pixDiff = hsumBuf + costVolumeLineSize*hsumBufNRows;
horPassCostVolume = pixDiff + costVolumeLineSize;
vertPassCostVolume = horPassCostVolume + costVolumeLineSize_ext;
rightPassBuf = vertPassCostVolume + costVolumeLineSize_ext;
vertPassMin = rightPassBuf + Da;
disp2CostBuf = vertPassMin + width1_ext;
disp2Buf = disp2CostBuf + width;
tmpBuf = (PixType*)(disp2Buf + width);
// initialize memory:
memset(buffer.ptr(),0,totalBufSize);
int i = 0;
#if CV_SIMD
v_int16 _P2 = vx_setall_s16((CostType)P2);
for (; i<=costVolumeLineSize-v_int16::nlanes; i+=v_int16::nlanes)
v_store_aligned(curCostVolumeLine + i, _P2);
#endif
for(;i<costVolumeLineSize;i++)
curCostVolumeLine[i] = (CostType)P2; //such initialization simplifies the cost aggregation loops a bit
}
// performing block matching and building raw cost-volume for the current row // performing block matching and building raw cost-volume for the current row
void SGBM3WayMainLoop::getRawMatchingCost(CostType* C, // target cost-volume row void SGBM3WayMainLoop::getRawMatchingCost(const BufferSGBM3Way &mem, int y, int src_start_idx) const
CostType* hsumBuf, CostType* pixDiff, PixType* tmpBuf, //buffers
int y, int src_start_idx) const
{ {
CostType* C = mem.curCostVolumeLine;
CostType* pixDiff = mem.pixDiff;
PixType* tmpBuf = mem.tmpBuf;
int x, d; int x, d;
int dy1 = (y == src_start_idx) ? src_start_idx : y + SH2, dy2 = (y == src_start_idx) ? src_start_idx+SH2 : dy1; int dy1 = (y == src_start_idx) ? src_start_idx : y + SH2, dy2 = (y == src_start_idx) ? src_start_idx+SH2 : dy1;
for(int k = dy1; k <= dy2; k++ ) for(int k = dy1; k <= dy2; k++ )
{ {
CostType* hsumAdd = hsumBuf + (std::min(k, height-1) % hsumBufNRows)*costBufSize; CostType* hsumAdd = mem.getHSumBuf(std::min(k, height-1));
if( k < height ) if( k < height )
{ {
calcPixelCostBT( *img1, *img2, k, minD, maxD, pixDiff, tmpBuf, clipTab, TAB_OFS, ftzero ); calcPixelCostBT( *img1, *img2, k, minD, maxD, pixDiff, tmpBuf, clipTab + TAB_OFS );
#if CV_SIMD #if CV_SIMD
v_int16 sw2_1 = vx_setall_s16((short)SW2 + 1); v_int16 sw2_1 = vx_setall_s16((short)SW2 + 1);
...@@ -1634,7 +1680,7 @@ void SGBM3WayMainLoop::getRawMatchingCost(CostType* C, // target cost-volume row ...@@ -1634,7 +1680,7 @@ void SGBM3WayMainLoop::getRawMatchingCost(CostType* C, // target cost-volume row
#endif #endif
if( y > src_start_idx ) if( y > src_start_idx )
{ {
const CostType* hsumSub = hsumBuf + (std::max(y - SH2 - 1, src_start_idx) % hsumBufNRows)*costBufSize; const CostType* hsumSub = mem.getHSumBuf(std::max(y - SH2 - 1, src_start_idx));
#if CV_SIMD #if CV_SIMD
for (d = 0; d < Da; d += v_int16::nlanes) for (d = 0; d < Da; d += v_int16::nlanes)
...@@ -1702,7 +1748,7 @@ void SGBM3WayMainLoop::getRawMatchingCost(CostType* C, // target cost-volume row ...@@ -1702,7 +1748,7 @@ void SGBM3WayMainLoop::getRawMatchingCost(CostType* C, // target cost-volume row
{ {
if( y > src_start_idx ) if( y > src_start_idx )
{ {
const CostType* hsumSub = hsumBuf + (std::max(y - SH2 - 1, src_start_idx) % hsumBufNRows)*costBufSize; const CostType* hsumSub = mem.getHSumBuf(std::max(y - SH2 - 1, src_start_idx));
#if CV_SIMD #if CV_SIMD
for( x = 0; x < width1*Da; x += v_int16::nlanes) for( x = 0; x < width1*Da; x += v_int16::nlanes)
v_store_aligned(C + x, vx_load_aligned(C + x) + vx_load_aligned(hsumAdd + x) - vx_load_aligned(hsumSub + x)); v_store_aligned(C + x, vx_load_aligned(C + x) + vx_load_aligned(hsumAdd + x) - vx_load_aligned(hsumSub + x));
...@@ -1728,12 +1774,15 @@ void SGBM3WayMainLoop::getRawMatchingCost(CostType* C, // target cost-volume row ...@@ -1728,12 +1774,15 @@ void SGBM3WayMainLoop::getRawMatchingCost(CostType* C, // target cost-volume row
// performing SGM cost accumulation from left to right (result is stored in leftBuf) and // performing SGM cost accumulation from left to right (result is stored in leftBuf) and
// in-place cost accumulation from top to bottom (result is stored in topBuf) // in-place cost accumulation from top to bottom (result is stored in topBuf)
template<bool x_nlanes> template<bool x_nlanes>
inline void accumulateCostsLeftTop(CostType* leftBuf, CostType* leftBuf_prev, CostType* topBuf, CostType* costs, void SGBM3WayMainLoop::accumulateCostsLeftTop(const BufferSGBM3Way &mem, int x, CostType& leftMinCost) const
CostType& leftMinCost, CostType& topMinCost, int D, int P1, int P2)
{ {
CostType *leftBuf = mem.horPassCostVolume + x;
CostType *leftBuf_prev = mem.horPassCostVolume + x - Da;
CostType *topBuf = mem.vertPassCostVolume + x;
CostType *costs = mem.curCostVolumeLine - Da + x;
CostType& topMinCost = mem.vertPassMin[x/Da];
int i = 0; int i = 0;
#if CV_SIMD #if CV_SIMD
int Da = (int)alignSize(D, v_int16::nlanes);
v_int16 P1_reg = vx_setall_s16(cv::saturate_cast<CostType>(P1)); v_int16 P1_reg = vx_setall_s16(cv::saturate_cast<CostType>(P1));
v_int16 leftMinCostP2_reg = vx_setall_s16(cv::saturate_cast<CostType>(leftMinCost+P2)); v_int16 leftMinCostP2_reg = vx_setall_s16(cv::saturate_cast<CostType>(leftMinCost+P2));
...@@ -1847,12 +1896,16 @@ inline void accumulateCostsLeftTop(CostType* leftBuf, CostType* leftBuf_prev, Co ...@@ -1847,12 +1896,16 @@ inline void accumulateCostsLeftTop(CostType* leftBuf, CostType* leftBuf_prev, Co
// summing rightBuf, topBuf, leftBuf together (the result is stored in leftBuf), as well as finding the // summing rightBuf, topBuf, leftBuf together (the result is stored in leftBuf), as well as finding the
// optimal disparity value with minimum accumulated cost // optimal disparity value with minimum accumulated cost
template<bool x_nlanes> template<bool x_nlanes>
inline void accumulateCostsRight(CostType* rightBuf, CostType* topBuf, CostType* leftBuf, CostType* costs, void SGBM3WayMainLoop::accumulateCostsRight(const BufferSGBM3Way &mem, int x,
CostType& rightMinCost, int D, int P1, int P2, short& optimal_disp, CostType& min_cost) CostType& rightMinCost, short& optimal_disp, CostType& min_cost) const
{ {
CostType* costs = mem.curCostVolumeLine - Da + x;
CostType* rightBuf = mem.rightPassBuf;
CostType* topBuf = mem.vertPassCostVolume + x;
CostType* leftBuf = mem.horPassCostVolume + x;
int i = 0; int i = 0;
#if CV_SIMD #if CV_SIMD
int Da = (int)alignSize(D, v_int16::nlanes);
v_int16 P1_reg = vx_setall_s16(cv::saturate_cast<CostType>(P1)); v_int16 P1_reg = vx_setall_s16(cv::saturate_cast<CostType>(P1));
v_int16 rightMinCostP2_reg = vx_setall_s16(cv::saturate_cast<CostType>(rightMinCost+P2)); v_int16 rightMinCostP2_reg = vx_setall_s16(cv::saturate_cast<CostType>(rightMinCost+P2));
...@@ -1955,6 +2008,7 @@ void SGBM3WayMainLoop::operator () (const Range& range) const ...@@ -1955,6 +2008,7 @@ void SGBM3WayMainLoop::operator () (const Range& range) const
if (D == Da) impl<true>(range); if (D == Da) impl<true>(range);
else impl<false>(range); else impl<false>(range);
} }
template<bool x_nlanes> template<bool x_nlanes>
void SGBM3WayMainLoop::impl(const Range& range) const void SGBM3WayMainLoop::impl(const Range& range) const
{ {
...@@ -1979,33 +2033,24 @@ void SGBM3WayMainLoop::impl(const Range& range) const ...@@ -1979,33 +2033,24 @@ void SGBM3WayMainLoop::impl(const Range& range) const
else else
dst_offset=0; dst_offset=0;
Mat cur_buffer = buffers [range.start];
Mat cur_disp = dst_disp[range.start]; Mat cur_disp = dst_disp[range.start];
cur_disp = Scalar(INVALID_DISP_SCALED); cur_disp = Scalar(INVALID_DISP_SCALED);
// prepare buffers: BufferSGBM3Way mem(width1, width, img1->channels(), Da, SH2, P2);
CostType *curCostVolumeLine, *hsumBuf, *pixDiff; CostType *horPassCostVolume = mem.horPassCostVolume;
PixType* tmpBuf;
CostType *horPassCostVolume, *vertPassCostVolume, *vertPassMin, *rightPassBuf, *disp2CostBuf;
short* disp2Buf;
getBufferPointers(cur_buffer,width,width1,Da,img1->channels(),SH2,P2,
curCostVolumeLine,hsumBuf,pixDiff,tmpBuf,horPassCostVolume,
vertPassCostVolume,vertPassMin,rightPassBuf,disp2CostBuf,disp2Buf);
// start real processing: // start real processing:
for(int y=src_start_idx;y<src_end_idx;y++) for(int y=src_start_idx;y<src_end_idx;y++)
{ {
getRawMatchingCost(curCostVolumeLine,hsumBuf,pixDiff,tmpBuf,y,src_start_idx); getRawMatchingCost(mem, y, src_start_idx);
short* disp_row = (short*)cur_disp.ptr(dst_offset+(y-src_start_idx)); short* disp_row = (short*)cur_disp.ptr(dst_offset+(y-src_start_idx));
// initialize the auxiliary buffers for the pseudo left-right consistency check: // initialize the auxiliary buffers for the pseudo left-right consistency check:
for(int x=0;x<width;x++) for(int x=0;x<width;x++)
{ {
disp2Buf[x] = (short)INVALID_DISP_SCALED; mem.disp2Buf[x] = (short)INVALID_DISP_SCALED;
disp2CostBuf[x] = SHRT_MAX; mem.disp2CostBuf[x] = SHRT_MAX;
} }
CostType* C = curCostVolumeLine - Da;
CostType prev_min, min_cost; CostType prev_min, min_cost;
int d; int d;
short best_d; short best_d;
...@@ -2014,14 +2059,14 @@ void SGBM3WayMainLoop::impl(const Range& range) const ...@@ -2014,14 +2059,14 @@ void SGBM3WayMainLoop::impl(const Range& range) const
// forward pass // forward pass
prev_min=0; prev_min=0;
for (int x=Da;x<(1+width1)*Da;x+=Da) for (int x=Da;x<(1+width1)*Da;x+=Da)
accumulateCostsLeftTop<x_nlanes>(horPassCostVolume+x,horPassCostVolume+x-Da,vertPassCostVolume+x,C+x,prev_min,vertPassMin[x/Da],D,P1,P2); accumulateCostsLeftTop<x_nlanes>(mem, x, prev_min);
//backward pass //backward pass
memset(rightPassBuf,0,Da*sizeof(CostType)); mem.clearRightPassBuf();
prev_min=0; prev_min=0;
for (int x=width1*Da;x>=Da;x-=Da) for (int x=width1*Da;x>=Da;x-=Da)
{ {
accumulateCostsRight<x_nlanes>(rightPassBuf,vertPassCostVolume+x,horPassCostVolume+x,C+x,prev_min,D,P1,P2,best_d,min_cost); accumulateCostsRight<x_nlanes>(mem, x, prev_min, best_d, min_cost);
if(uniquenessRatio>0) if(uniquenessRatio>0)
{ {
...@@ -2074,10 +2119,10 @@ void SGBM3WayMainLoop::impl(const Range& range) const ...@@ -2074,10 +2119,10 @@ void SGBM3WayMainLoop::impl(const Range& range) const
d = best_d; d = best_d;
int _x2 = x/Da - 1 + minX1 - d - minD; int _x2 = x/Da - 1 + minX1 - d - minD;
if( _x2>=0 && _x2<width && disp2CostBuf[_x2] > min_cost ) if( _x2>=0 && _x2<width && mem.disp2CostBuf[_x2] > min_cost )
{ {
disp2CostBuf[_x2] = min_cost; mem.disp2CostBuf[_x2] = min_cost;
disp2Buf[_x2] = (short)(d + minD); mem.disp2Buf[_x2] = (short)(d + minD);
} }
if( 0 < d && d < D-1 ) if( 0 < d && d < D-1 )
...@@ -2104,32 +2149,27 @@ void SGBM3WayMainLoop::impl(const Range& range) const ...@@ -2104,32 +2149,27 @@ void SGBM3WayMainLoop::impl(const Range& range) const
int _d = d1 >> StereoMatcher::DISP_SHIFT; int _d = d1 >> StereoMatcher::DISP_SHIFT;
int d_ = (d1 + DISP_SCALE-1) >> StereoMatcher::DISP_SHIFT; int d_ = (d1 + DISP_SCALE-1) >> StereoMatcher::DISP_SHIFT;
int _x = x - _d, x_ = x - d_; int _x = x - _d, x_ = x - d_;
if( 0 <= _x && _x < width && disp2Buf[_x] >= minD && std::abs(disp2Buf[_x] - _d) > disp12MaxDiff && if( 0 <= _x && _x < width && mem.disp2Buf[_x] >= minD && std::abs(mem.disp2Buf[_x] - _d) > disp12MaxDiff &&
0 <= x_ && x_ < width && disp2Buf[x_] >= minD && std::abs(disp2Buf[x_] - d_) > disp12MaxDiff ) 0 <= x_ && x_ < width && mem.disp2Buf[x_] >= minD && std::abs(mem.disp2Buf[x_] - d_) > disp12MaxDiff )
disp_row[x] = (short)INVALID_DISP_SCALED; disp_row[x] = (short)INVALID_DISP_SCALED;
} }
} }
} }
static void computeDisparity3WaySGBM( const Mat& img1, const Mat& img2, template <uchar nstripes>
Mat& disp1, const StereoSGBMParams& params, static void computeDisparity3WaySGBM(const Mat& img1, const Mat& img2, Mat& disp1, const StereoSGBMParams& params)
Mat* buffers, int nstripes )
{ {
// precompute a lookup table for the raw matching cost computation:
const int TAB_OFS = 256*4, TAB_SIZE = 256 + TAB_OFS*2;
PixType* clipTab = new PixType[TAB_SIZE];
int ftzero = std::max(params.preFilterCap, 15) | 1;
for(int k = 0; k < TAB_SIZE; k++ )
clipTab[k] = (PixType)(std::min(std::max(k - TAB_OFS, -ftzero), ftzero) + ftzero);
// allocate separate dst_disp arrays to avoid conflicts due to stripe overlap: // allocate separate dst_disp arrays to avoid conflicts due to stripe overlap:
int stripe_sz = (int)ceil(img1.rows/(double)nstripes); int stripe_sz = (int)ceil(img1.rows/(double)nstripes);
int stripe_overlap = (params.SADWindowSize/2+1) + (int)ceil(0.1*stripe_sz); int stripe_overlap = (params.SADWindowSize/2+1) + (int)ceil(0.1*stripe_sz);
Mat* dst_disp = new Mat[nstripes]; Mat dst_disp[nstripes];
for(int i=0;i<nstripes;i++) for(int i=0;i<nstripes;i++)
dst_disp[i].create(stripe_sz+stripe_overlap,img1.cols,CV_16S); dst_disp[i].create(stripe_sz+stripe_overlap,img1.cols,CV_16S);
parallel_for_(Range(0,nstripes),SGBM3WayMainLoop(buffers,img1,img2,dst_disp,params,clipTab,nstripes,stripe_overlap)); parallel_for_(
Range(0,nstripes),
SGBM3WayMainLoop(img1,img2,dst_disp,params,stripe_sz,stripe_overlap)
);
//assemble disp1 from dst_disp: //assemble disp1 from dst_disp:
short* dst_row; short* dst_row;
...@@ -2140,9 +2180,6 @@ static void computeDisparity3WaySGBM( const Mat& img1, const Mat& img2, ...@@ -2140,9 +2180,6 @@ static void computeDisparity3WaySGBM( const Mat& img1, const Mat& img2,
src_row = (short*)dst_disp[i/stripe_sz].ptr(stripe_overlap+i%stripe_sz); src_row = (short*)dst_disp[i/stripe_sz].ptr(stripe_overlap+i%stripe_sz);
memcpy(dst_row,src_row,disp1.cols*sizeof(short)); memcpy(dst_row,src_row,disp1.cols*sizeof(short));
} }
delete[] clipTab;
delete[] dst_disp;
} }
class StereoSGBMImpl CV_FINAL : public StereoSGBM class StereoSGBMImpl CV_FINAL : public StereoSGBM
...@@ -2176,11 +2213,13 @@ public: ...@@ -2176,11 +2213,13 @@ public:
Mat disp = disparr.getMat(); Mat disp = disparr.getMat();
if(params.mode==MODE_SGBM_3WAY) if(params.mode==MODE_SGBM_3WAY)
computeDisparity3WaySGBM( left, right, disp, params, buffers, num_stripes ); // the number of stripes is fixed, disregarding the number of threads/processors
// to make the results fully reproducible
computeDisparity3WaySGBM<4>( left, right, disp, params );
else if(params.mode==MODE_HH4) else if(params.mode==MODE_HH4)
computeDisparitySGBM_HH4( left, right, disp, params, buffer ); computeDisparitySGBM_HH4( left, right, disp, params );
else else
computeDisparitySGBM( left, right, disp, params, buffer ); computeDisparitySGBM( left, right, disp, params );
medianBlur(disp, disp, 3); medianBlur(disp, disp, 3);
...@@ -2259,11 +2298,6 @@ public: ...@@ -2259,11 +2298,6 @@ public:
StereoSGBMParams params; StereoSGBMParams params;
Mat buffer; Mat buffer;
// the number of stripes is fixed, disregarding the number of threads/processors
// to make the results fully reproducible:
static const int num_stripes = 4;
Mat buffers[num_stripes];
static const char* name_; static const char* name_;
}; };
......
...@@ -74,6 +74,25 @@ public: ...@@ -74,6 +74,25 @@ public:
allocate_((void**)(&ptr), static_cast<ushort>(sizeof(T)), count, alignment); allocate_((void**)(&ptr), static_cast<ushort>(sizeof(T)), count, alignment);
} }
/** @brief Fill one of buffers with zeroes
@param ptr pointer to memory block previously added using BufferArea::allocate
BufferArea::commit must be called before using this method
*/
template <typename T>
void zeroFill(T*&ptr)
{
CV_Assert(ptr);
zeroFill_((void**)&ptr);
}
/** @brief Fill all buffers with zeroes
BufferArea::commit must be called before using this method
*/
void zeroFill();
/** @brief Allocate memory and initialize all bound pointers /** @brief Allocate memory and initialize all bound pointers
Each pointer bound to the area with the BufferArea::allocate will be initialized and will be set Each pointer bound to the area with the BufferArea::allocate will be initialized and will be set
...@@ -83,10 +102,18 @@ public: ...@@ -83,10 +102,18 @@ public:
*/ */
void commit(); void commit();
/** @brief Release all memory and unbind all pointers
All memory will be freed and all pointers will be reset to NULL and untied from the area allowing
to call `allocate` and `commit` again.
*/
void release();
private: private:
BufferArea(const BufferArea &); // = delete BufferArea(const BufferArea &); // = delete
BufferArea &operator=(const BufferArea &); // = delete BufferArea &operator=(const BufferArea &); // = delete
void allocate_(void **ptr, ushort type_size, size_t count, ushort alignment); void allocate_(void **ptr, ushort type_size, size_t count, ushort alignment);
void zeroFill_(void **ptr);
private: private:
class Block; class Block;
......
...@@ -66,6 +66,16 @@ public: ...@@ -66,6 +66,16 @@ public:
*ptr = buf; *ptr = buf;
return static_cast<void*>(static_cast<uchar*>(*ptr) + type_size * count); return static_cast<void*>(static_cast<uchar*>(*ptr) + type_size * count);
} }
bool operator==(void **other) const
{
CV_Assert(ptr && other);
return *ptr == *other;
}
void zeroFill() const
{
CV_Assert(ptr && *ptr);
memset(static_cast<uchar*>(*ptr), 0, count * type_size);
}
private: private:
void **ptr; void **ptr;
void * raw_mem; void * raw_mem;
...@@ -85,10 +95,7 @@ BufferArea::BufferArea(bool safe_) : ...@@ -85,10 +95,7 @@ BufferArea::BufferArea(bool safe_) :
BufferArea::~BufferArea() BufferArea::~BufferArea()
{ {
for(std::vector<Block>::const_iterator i = blocks.begin(); i != blocks.end(); ++i) release();
i->cleanup();
if (oneBuf)
fastFree(oneBuf);
} }
void BufferArea::allocate_(void **ptr, ushort type_size, size_t count, ushort alignment) void BufferArea::allocate_(void **ptr, ushort type_size, size_t count, ushort alignment)
...@@ -100,6 +107,26 @@ void BufferArea::allocate_(void **ptr, ushort type_size, size_t count, ushort al ...@@ -100,6 +107,26 @@ void BufferArea::allocate_(void **ptr, ushort type_size, size_t count, ushort al
totalSize += blocks.back().getByteCount(); totalSize += blocks.back().getByteCount();
} }
void BufferArea::zeroFill_(void **ptr)
{
for(std::vector<Block>::const_iterator i = blocks.begin(); i != blocks.end(); ++i)
{
if (*i == ptr)
{
i->zeroFill();
break;
}
}
}
void BufferArea::zeroFill()
{
for(std::vector<Block>::const_iterator i = blocks.begin(); i != blocks.end(); ++i)
{
i->zeroFill();
}
}
void BufferArea::commit() void BufferArea::commit()
{ {
if (!safe) if (!safe)
...@@ -116,6 +143,20 @@ void BufferArea::commit() ...@@ -116,6 +143,20 @@ void BufferArea::commit()
} }
} }
void BufferArea::release()
{
for(std::vector<Block>::const_iterator i = blocks.begin(); i != blocks.end(); ++i)
{
i->cleanup();
}
blocks.clear();
if (oneBuf)
{
fastFree(oneBuf);
oneBuf = 0;
}
}
//================================================================================================== //==================================================================================================
}} // cv::utils:: }} // cv::utils::
...@@ -337,6 +337,21 @@ TEST_P(BufferArea, basic) ...@@ -337,6 +337,21 @@ TEST_P(BufferArea, basic)
ASSERT_TRUE(dbl_ptr != NULL); ASSERT_TRUE(dbl_ptr != NULL);
EXPECT_EQ((size_t)0, (size_t)int_ptr % sizeof(int)); EXPECT_EQ((size_t)0, (size_t)int_ptr % sizeof(int));
EXPECT_EQ((size_t)0, (size_t)dbl_ptr % sizeof(double)); EXPECT_EQ((size_t)0, (size_t)dbl_ptr % sizeof(double));
for (size_t i = 0; i < SZ; ++i)
{
int_ptr[i] = (int)i + 1;
uchar_ptr[i] = (uchar)i + 1;
dbl_ptr[i] = (double)i + 1;
}
area.zeroFill(int_ptr);
area.zeroFill(uchar_ptr);
area.zeroFill(dbl_ptr);
for (size_t i = 0; i < SZ; ++i)
{
EXPECT_EQ((int)0, int_ptr[i]);
EXPECT_EQ((uchar)0, uchar_ptr[i]);
EXPECT_EQ((double)0, dbl_ptr[i]);
}
} }
EXPECT_TRUE(int_ptr == NULL); EXPECT_TRUE(int_ptr == NULL);
EXPECT_TRUE(uchar_ptr == NULL); EXPECT_TRUE(uchar_ptr == NULL);
......
...@@ -47,6 +47,7 @@ The references are: ...@@ -47,6 +47,7 @@ The references are:
#include "opencl_kernels_features2d.hpp" #include "opencl_kernels_features2d.hpp"
#include "hal_replacement.hpp" #include "hal_replacement.hpp"
#include "opencv2/core/hal/intrin.hpp" #include "opencv2/core/hal/intrin.hpp"
#include "opencv2/core/utils/buffer_area.private.hpp"
#include "opencv2/core/openvx/ovx_defs.hpp" #include "opencv2/core/openvx/ovx_defs.hpp"
...@@ -80,20 +81,26 @@ void FAST_t(InputArray _img, std::vector<KeyPoint>& keypoints, int threshold, bo ...@@ -80,20 +81,26 @@ void FAST_t(InputArray _img, std::vector<KeyPoint>& keypoints, int threshold, bo
for( i = -255; i <= 255; i++ ) for( i = -255; i <= 255; i++ )
threshold_tab[i+255] = (uchar)(i < -threshold ? 1 : i > threshold ? 2 : 0); threshold_tab[i+255] = (uchar)(i < -threshold ? 1 : i > threshold ? 2 : 0);
AutoBuffer<uchar> _buf((img.cols+16)*3*(sizeof(int) + sizeof(uchar)) + 128); uchar* buf[3] = { 0 };
uchar* buf[3]; int* cpbuf[3] = { 0 };
buf[0] = _buf.data(); buf[1] = buf[0] + img.cols; buf[2] = buf[1] + img.cols; utils::BufferArea area;
int* cpbuf[3]; for (unsigned idx = 0; idx < 3; ++idx)
cpbuf[0] = (int*)alignPtr(buf[2] + img.cols, sizeof(int)) + 1; {
cpbuf[1] = cpbuf[0] + img.cols + 1; area.allocate(buf[idx], img.cols);
cpbuf[2] = cpbuf[1] + img.cols + 1; area.allocate(cpbuf[idx], img.cols + 1);
memset(buf[0], 0, img.cols*3); }
area.commit();
for (unsigned idx = 0; idx < 3; ++idx)
{
memset(buf[idx], 0, img.cols);
}
for(i = 3; i < img.rows-2; i++) for(i = 3; i < img.rows-2; i++)
{ {
const uchar* ptr = img.ptr<uchar>(i) + 3; const uchar* ptr = img.ptr<uchar>(i) + 3;
uchar* curr = buf[(i - 3)%3]; uchar* curr = buf[(i - 3)%3];
int* cornerpos = cpbuf[(i - 3)%3]; int* cornerpos = cpbuf[(i - 3)%3] + 1; // cornerpos[-1] is used to store a value
memset(curr, 0, img.cols); memset(curr, 0, img.cols);
int ncorners = 0; int ncorners = 0;
...@@ -266,7 +273,7 @@ void FAST_t(InputArray _img, std::vector<KeyPoint>& keypoints, int threshold, bo ...@@ -266,7 +273,7 @@ void FAST_t(InputArray _img, std::vector<KeyPoint>& keypoints, int threshold, bo
const uchar* prev = buf[(i - 4 + 3)%3]; const uchar* prev = buf[(i - 4 + 3)%3];
const uchar* pprev = buf[(i - 5 + 3)%3]; const uchar* pprev = buf[(i - 5 + 3)%3];
cornerpos = cpbuf[(i - 4 + 3)%3]; cornerpos = cpbuf[(i - 4 + 3)%3] + 1; // cornerpos[-1] is used to store a value
ncorners = cornerpos[-1]; ncorners = cornerpos[-1];
for( k = 0; k < ncorners; k++ ) for( k = 0; k < ncorners; k++ )
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment