Commit bf96d823 authored by Maksim Shabunin

Use BufferArea in more places

parent f9bd0257
......@@ -55,7 +55,7 @@
#include <math.h>
#include <vector>
#include "rho.h"
#include "opencv2/core/utils/buffer_area.private.hpp"
......@@ -65,7 +65,6 @@ namespace cv{/* For C support, replace with extern "C" { */
/* Constants */
const int MEM_ALIGN = 32;
const size_t HSIZE = (3*3*sizeof(float));
const double MIN_DELTA_CHNG = 0.1;
// const double CHI_STAT = 2.706;
......@@ -312,16 +311,14 @@ struct RHO_HEST_REFC : RHO_HEST{
/* Levenberg-Marquardt Refinement */
struct{
float (* JtJ)[8]; /* JtJ matrix */
float (* tmp1)[8]; /* Temporary 1 */
float* JtJ; /* JtJ matrix */
float* tmp1; /* Temporary 1 */
float* Jte; /* Jte vector */
} lm;
/* Memory Management */
struct{
cv::Mat perObj;
cv::Mat perRun;
} mem;
utils::BufferArea runArea;
utils::BufferArea objArea;
/* Initialized? */
int initialized;
......@@ -659,16 +656,9 @@ inline int RHO_HEST_REFC::initialize(void){
fastSeed((uint64_t)~0);
initialized = 1;
int areAllAllocsSuccessful = !mem.perObj.empty();
if(!areAllAllocsSuccessful){
finalize();
}else{
initialized = 1;
}
return areAllAllocsSuccessful;
return true;
}
/**
......@@ -835,45 +825,14 @@ unsigned RHO_HEST_REFC::rhoHest(const float* src, /* Source points */
*/
inline void RHO_HEST_REFC::allocatePerObj(void){
/* We have known sizes */
size_t ctrl_smpl_sz = SMPL_SIZE*sizeof(*ctrl.smpl);
size_t curr_pkdPts_sz = SMPL_SIZE*2*2*sizeof(*curr.pkdPts);
size_t curr_H_sz = HSIZE;
size_t best_H_sz = HSIZE;
size_t lm_JtJ_sz = 8*8*sizeof(float);
size_t lm_tmp1_sz = 8*8*sizeof(float);
size_t lm_Jte_sz = 1*8*sizeof(float);
/* We compute offsets */
size_t total = 0;
#define MK_OFFSET(v) \
size_t v ## _of = total; \
total = alignSize(v ## _of + v ## _sz, MEM_ALIGN)
MK_OFFSET(ctrl_smpl);
MK_OFFSET(curr_pkdPts);
MK_OFFSET(curr_H);
MK_OFFSET(best_H);
MK_OFFSET(lm_JtJ);
MK_OFFSET(lm_tmp1);
MK_OFFSET(lm_Jte);
#undef MK_OFFSET
/* Allocate dynamic memory managed by cv::Mat */
mem.perObj.create(1, (int)(total + MEM_ALIGN), CV_8UC1);
/* Extract aligned pointer */
unsigned char* ptr = alignPtr(mem.perObj.data, MEM_ALIGN);
/* Assign pointers */
ctrl.smpl = (unsigned*) (ptr + ctrl_smpl_of);
curr.pkdPts = (float*) (ptr + curr_pkdPts_of);
curr.H = (float*) (ptr + curr_H_of);
best.H = (float*) (ptr + best_H_of);
lm.JtJ = (float(*)[8])(ptr + lm_JtJ_of);
lm.tmp1 = (float(*)[8])(ptr + lm_tmp1_of);
lm.Jte = (float*) (ptr + lm_Jte_of);
objArea.allocate(ctrl.smpl, SMPL_SIZE);
objArea.allocate(curr.pkdPts, SMPL_SIZE*2*2);
objArea.allocate(curr.H, HSIZE);
objArea.allocate(best.H, HSIZE);
objArea.allocate(lm.JtJ, 8*8);
objArea.allocate(lm.tmp1, 8*8);
objArea.allocate(lm.Jte, 1*8);
objArea.commit();
}
......@@ -885,30 +844,9 @@ inline void RHO_HEST_REFC::allocatePerObj(void){
*/
inline void RHO_HEST_REFC::allocatePerRun(void){
    /*
     * Per-run buffers: arg.N elements each for best.inl and curr.inl.
     * Both pointers are bound to runArea while still unbound and receive
     * their storage in commit(); they must not be pre-assigned from a
     * separate buffer before being registered.
     */
    runArea.allocate(best.inl, arg.N);
    runArea.allocate(curr.inl, arg.N);
    runArea.commit();
}
......@@ -919,10 +857,7 @@ inline void RHO_HEST_REFC::allocatePerRun(void){
*/
inline void RHO_HEST_REFC::deallocatePerRun(void){
    /*
     * BufferArea::release() frees the storage and resets every pointer that
     * was bound through allocate() (best.inl, curr.inl), so the pointers are
     * not cleared by hand and there is no separate cv::Mat to release.
     */
    runArea.release();
}
......@@ -933,15 +868,7 @@ inline void RHO_HEST_REFC::deallocatePerRun(void){
*/
inline void RHO_HEST_REFC::deallocatePerObj(void){
    /*
     * BufferArea::release() frees the storage and resets every pointer that
     * was bound through allocate() (ctrl.smpl, curr.pkdPts, curr.H, best.H,
     * lm.JtJ, lm.tmp1, lm.Jte), so the pointers are not cleared by hand and
     * there is no separate cv::Mat to release.
     */
    objArea.release();
}
......@@ -2144,7 +2071,7 @@ inline void RHO_HEST_REFC::refine(void){
*/
/* Find initial conditions */
sacCalcJacobianErrors(best.H, arg.src, arg.dst, best.inl, arg.N,
lm.JtJ, lm.Jte, &S);
(float(*)[8])lm.JtJ, lm.Jte, &S);
/*Levenberg-Marquardt Loop.*/
for(i=0;i<MAXLEVMARQITERS;i++){
......@@ -2169,11 +2096,11 @@ inline void RHO_HEST_REFC::refine(void){
* transpose) then multiply Jte in order to find dH.
*/
while(!sacChol8x8Damped(lm.JtJ, L, lm.tmp1)){
while(!sacChol8x8Damped((float(*)[8])lm.JtJ, L, (float(*)[8])lm.tmp1)){
L *= 2.0f;
}
sacTRInv8x8 (lm.tmp1, lm.tmp1);
sacTRISolve8x8(lm.tmp1, lm.Jte, dH);
sacTRInv8x8 ((float(*)[8])lm.tmp1, (float(*)[8])lm.tmp1);
sacTRISolve8x8((float(*)[8])lm.tmp1, lm.Jte, dH);
sacSub8x1 (newH, best.H, dH);
sacCalcJacobianErrors(newH, arg.src, arg.dst, best.inl, arg.N,
NULL, NULL, &newS);
......@@ -2204,7 +2131,7 @@ inline void RHO_HEST_REFC::refine(void){
S = newS;
memcpy(best.H, newH, sizeof(newH));
sacCalcJacobianErrors(best.H, arg.src, arg.dst, best.inl, arg.N,
lm.JtJ, lm.Jte, &S);
(float(*)[8])lm.JtJ, lm.Jte, &S);
}
}
}
......
......@@ -53,6 +53,7 @@
#include "precomp.hpp"
#include <limits.h>
#include "opencv2/core/hal/intrin.hpp"
#include "opencv2/core/utils/buffer_area.private.hpp"
namespace cv
{
......@@ -99,6 +100,16 @@ struct StereoSGBMParams
mode = _mode;
}
// Reports whether `mode` is StereoSGBM::MODE_HH or StereoSGBM::MODE_HH4.
inline bool isFullDP() const
{
    if (mode == StereoSGBM::MODE_HH)
        return true;
    return mode == StereoSGBM::MODE_HH4;
}
// Returns the square SAD window: SADWindowSize per side when it is positive,
// otherwise a side of 5.
inline Size calcSADWindowSize() const
{
    int side = 5;
    if (SADWindowSize > 0)
        side = SADWindowSize;
    return Size(side, side);
}
int minDisparity;
int numDisparities;
int SADWindowSize;
......@@ -148,6 +159,7 @@ static inline void min_pos(const v_int16& val, const v_int16& pos, short &min_va
#endif
static const int DEFAULT_RIGHT_BORDER = -1;
/*
For each pixel row1[x], max(maxD, 0) <= minX <= x < maxX <= width - max(0, -minD),
and for each disparity minD<=d<maxD the function
......@@ -161,7 +173,7 @@ static const int DEFAULT_RIGHT_BORDER = -1;
static void calcPixelCostBT( const Mat& img1, const Mat& img2, int y,
int minD, int maxD, CostType* cost,
PixType* buffer, const PixType* tab,
int tabOfs, int , int xrange_min = 0, int xrange_max = DEFAULT_RIGHT_BORDER )
int xrange_min = 0, int xrange_max = DEFAULT_RIGHT_BORDER )
{
int x, c, width = img1.cols, cn = img1.channels();
int minX1 = std::max(maxD, 0), maxX1 = width + std::min(minD, 0);
......@@ -178,8 +190,6 @@ static void calcPixelCostBT( const Mat& img1, const Mat& img2, int y,
const PixType *row1 = img1.ptr<PixType>(y), *row2 = img2.ptr<PixType>(y);
PixType *prow1 = buffer + width2*2, *prow2 = prow1 + width*cn*2;
tab += tabOfs;
for( c = 0; c < cn*2; c++ )
{
prow1[width*c] = prow1[width*c + width-1] =
......@@ -297,6 +307,166 @@ static void calcPixelCostBT( const Mat& img1, const Mat& img2, int y,
}
class BufferSGBM
{
private:
size_t width1;
size_t Da;
size_t Dlra;
size_t costWidth;
size_t costHeight;
size_t hsumRows;
bool fullDP;
uchar dirs;
uchar dirs2;
static const size_t TAB_OFS = 256*4;
public:
CostType* Cbuf;
CostType* Sbuf;
CostType* hsumBuf;
CostType* pixDiff;
CostType* disp2cost;
DispType* disp2ptr;
PixType* tempBuf;
std::vector<CostType*> Lr;
std::vector<CostType*> minLr;
PixType * clipTab;
private:
utils::BufferArea area;
public:
BufferSGBM(size_t width1_,
size_t Da_,
size_t Dlra_,
size_t cn,
size_t width,
size_t height,
const StereoSGBMParams &params)
: width1(width1_),
Da(Da_),
Dlra(Dlra_),
Cbuf(NULL),
Sbuf(NULL),
hsumBuf(NULL),
pixDiff(NULL),
disp2cost(NULL),
disp2ptr(NULL),
tempBuf(NULL),
Lr(2, (CostType*)NULL),
minLr(2, (CostType*)NULL),
clipTab(NULL)
{
const size_t TAB_SIZE = 256 + TAB_OFS*2;
fullDP = params.isFullDP();
costWidth = width1 * Da;
costHeight = fullDP ? height : 1;
hsumRows = params.calcSADWindowSize().height + 2;
dirs = params.mode == StereoSGBM::MODE_HH4 ? 1 : NR;
dirs2 = params.mode == StereoSGBM::MODE_HH4 ? 1 : NR2;
// for each possible stereo match (img1(x,y) <=> img2(x-d,y))
// we keep pixel difference cost (C) and the summary cost over NR directions (S).
// we also keep all the partial costs for the previous line L_r(x,d) and also min_k L_r(x, k)
area.allocate(Cbuf, costWidth * costHeight, CV_SIMD_WIDTH); // summary cost over different (nDirs) directions
area.allocate(Sbuf, costWidth * costHeight, CV_SIMD_WIDTH);
area.allocate(hsumBuf, costWidth * hsumRows, CV_SIMD_WIDTH);
area.allocate(pixDiff, costWidth, CV_SIMD_WIDTH);
area.allocate(disp2cost, width, CV_SIMD_WIDTH);
area.allocate(disp2ptr, width, CV_SIMD_WIDTH);
area.allocate(tempBuf, width * (4 * cn + 2), CV_SIMD_WIDTH);
// the number of L_r(.,.) and min_k L_r(.,.) lines in the buffer:
// for 8-way dynamic programming we need the current row and
// the previous row, i.e. 2 rows in total
for (size_t i = 0; i < 2; ++i)
{
// 2D: [ NR ][ w1 * NR2 ][ NR ] * [ Dlra ]
area.allocate(Lr[i], calcLrCount() * Dlra, CV_SIMD_WIDTH);
// 1D: [ NR ][ w1 * NR2 ][ NR ]
area.allocate(minLr[i], calcLrCount(), CV_SIMD_WIDTH);
}
area.allocate(clipTab, TAB_SIZE, CV_SIMD_WIDTH);
area.commit();
// init clipTab
const int ftzero = std::max(params.preFilterCap, 15) | 1;
for(int i = 0; i < (int)TAB_SIZE; i++ )
clipTab[i] = (PixType)(std::min(std::max(i - (int)TAB_OFS, -ftzero), ftzero) + ftzero);
}
inline const PixType * getClipTab() const
{
return clipTab + TAB_OFS;
}
inline void initCBuf(CostType val) const
{
for (size_t i = 0; i < costWidth * costHeight; ++i)
Cbuf[i] = val;
}
inline void clearLr(const Range & range = Range::all()) const
{
for (uchar i = 0; i < 2; ++i)
{
if (range == Range::all())
{
memset(Lr[i], 0, calcLrCount() * Dlra * sizeof(CostType));
memset(minLr[i], 0, calcLrCount() * sizeof(CostType));
}
else
{
memset(getLr(i, range.start), 0, range.size() * sizeof(CostType) * Dlra);
memset(getMinLr(i, range.start), 0, range.size() * sizeof(CostType));
}
}
}
inline size_t calcLrCount() const
{
return width1 * dirs2 + 2 * dirs;
}
inline void swapLr()
{
std::swap(Lr[0], Lr[1]);
std::swap(minLr[0], minLr[1]);
}
inline CostType * getHSumBuf(int row) const
{
return hsumBuf + (row % hsumRows) * costWidth;
}
inline CostType * getCBuf(int row) const
{
CV_Assert(row >= 0);
return Cbuf + (!fullDP ? 0 : (row * costWidth));
}
inline CostType * getSBuf(int row) const
{
CV_Assert(row >= 0);
return Sbuf + (!fullDP ? 0 : (row * costWidth));
}
inline void clearSBuf(int row, const Range & range = Range::all()) const
{
if (range == Range::all())
memset(getSBuf(row), 0, costWidth * sizeof(CostType));
else
memset(getSBuf(row) + range.start * Da, 0, range.size() * Da * sizeof(CostType));
}
// shift Lr[k] and minLr[k] pointers, because we allocated them with the borders,
// and will occasionally use negative indices with the arrays
// we need to shift Lr[k] pointers by 1, to give the space for d=-1.
inline CostType * getLr(uchar id, int idx, uchar shift = 0) const
{
CV_Assert(id < 2);
const size_t fixed_offset = dirs * Dlra;
return Lr[id] + fixed_offset + (idx * (int)dirs2 + (int)shift) * (int)Dlra;
}
inline CostType * getMinLr(uchar id, int idx, uchar shift = 0) const
{
CV_Assert(id < 2);
const size_t fixed_offset = dirs;
return minLr[id] + fixed_offset + (idx * dirs2 + shift);
}
};
/*
computes disparity for "roi" in img1 w.r.t. img2 and write it to disp1buf.
that is, disp1buf(x, y)=d means that img1(x+roi.x, y+roi.y) ~ img2(x+roi.x-d, y+roi.y).
......@@ -318,34 +488,25 @@ static void calcPixelCostBT( const Mat& img1, const Mat& img2, int y,
It contains the minimum current cost, used to find the best disparity, corresponding to the minimal cost.
*/
static void computeDisparitySGBM( const Mat& img1, const Mat& img2,
Mat& disp1, const StereoSGBMParams& params,
Mat& buffer )
Mat& disp1, const StereoSGBMParams& params )
{
const int DISP_SHIFT = StereoMatcher::DISP_SHIFT;
const int DISP_SCALE = (1 << DISP_SHIFT);
const CostType MAX_COST = SHRT_MAX;
int minD = params.minDisparity, maxD = minD + params.numDisparities;
Size SADWindowSize;
SADWindowSize.width = SADWindowSize.height = params.SADWindowSize > 0 ? params.SADWindowSize : 5;
int ftzero = std::max(params.preFilterCap, 15) | 1;
int uniquenessRatio = params.uniquenessRatio >= 0 ? params.uniquenessRatio : 10;
int disp12MaxDiff = params.disp12MaxDiff > 0 ? params.disp12MaxDiff : 1;
int P1 = params.P1 > 0 ? params.P1 : 2, P2 = std::max(params.P2 > 0 ? params.P2 : 5, P1+1);
int k, width = disp1.cols, height = disp1.rows;
int minX1 = std::max(maxD, 0), maxX1 = width + std::min(minD, 0);
int D = maxD - minD, width1 = maxX1 - minX1;
const int D = params.numDisparities;
int width1 = maxX1 - minX1;
int Da = (int)alignSize(D, v_int16::nlanes);
int Dlra = Da + v_int16::nlanes;//Additional memory is necessary to store disparity values(MAX_COST) for d=-1 and d=D
int INVALID_DISP = minD - 1, INVALID_DISP_SCALED = INVALID_DISP*DISP_SCALE;
int SW2 = SADWindowSize.width/2, SH2 = SADWindowSize.height/2;
bool fullDP = params.mode == StereoSGBM::MODE_HH;
int npasses = fullDP ? 2 : 1;
const int TAB_OFS = 256*4, TAB_SIZE = 256 + TAB_OFS*2;
PixType clipTab[TAB_SIZE];
for( k = 0; k < TAB_SIZE; k++ )
clipTab[k] = (PixType)(std::min(std::max(k - TAB_OFS, -ftzero), ftzero) + ftzero);
int SW2 = params.calcSADWindowSize().width/2, SH2 = params.calcSADWindowSize().height/2;
int npasses = params.isFullDP() ? 2 : 1;
if( minX1 >= maxX1 )
{
......@@ -353,39 +514,8 @@ static void computeDisparitySGBM( const Mat& img1, const Mat& img2,
return;
}
// for each possible stereo match (img1(x,y) <=> img2(x-d,y))
// we keep pixel difference cost (C) and the summary cost over NR directions (S).
// we also keep all the partial costs for the previous line L_r(x,d) and also min_k L_r(x, k)
size_t costBufSize = width1*Da;
size_t CSBufSize = costBufSize*(fullDP ? height : 1);
size_t minLrSize = (width1 + 2)*NR2, LrSize = minLrSize*Dlra;
int hsumBufNRows = SH2*2 + 2;
// the number of L_r(.,.) and min_k L_r(.,.) lines in the buffer:
// for 8-way dynamic programming we need the current row and
// the previous row, i.e. 2 rows in total
size_t totalBufSize = CV_SIMD_WIDTH + CSBufSize * 2 * sizeof(CostType) + // alignment, C, S
costBufSize*(hsumBufNRows + 1)*sizeof(CostType) + // hsumBuf, pixdiff
((LrSize + minLrSize)*2 + v_int16::nlanes) * sizeof(CostType) + // minLr[] and Lr[]
width*(sizeof(CostType) + sizeof(DispType)) + // disp2cost + disp2
width * (4*img1.channels() + 2) * sizeof(PixType); // temp buffer for computing per-pixel cost
if( buffer.empty() || !buffer.isContinuous() ||
buffer.cols*buffer.rows*buffer.elemSize() < totalBufSize )
buffer.reserveBuffer(totalBufSize);
// summary cost over different (nDirs) directions
CostType* Cbuf = (CostType*)alignPtr(buffer.ptr(), CV_SIMD_WIDTH);
CostType* Sbuf = Cbuf + CSBufSize;
CostType* hsumBuf = Sbuf + CSBufSize;
CostType* pixDiff = hsumBuf + costBufSize*hsumBufNRows;
CostType* disp2cost = pixDiff + costBufSize + ((LrSize + minLrSize)*2 + v_int16::nlanes);
DispType* disp2ptr = (DispType*)(disp2cost + width);
PixType* tempBuf = (PixType*)(disp2ptr + width);
// add P2 to every C(x,y). it saves a few operations in the inner loops
for(k = 0; k < (int)CSBufSize; k++ )
Cbuf[k] = (CostType)P2;
BufferSGBM mem(width1, Da, Dlra, img1.channels(), width, height, params);
mem.initCBuf((CostType)P2); // add P2 to every C(x,y). it saves a few operations in the inner loops
for( int pass = 1; pass <= npasses; pass++ )
{
......@@ -402,27 +532,15 @@ static void computeDisparitySGBM( const Mat& img1, const Mat& img2,
x1 = width1-1; x2 = -1; dx = -1;
}
CostType *Lr[2]={0}, *minLr[2]={0};
for( k = 0; k < 2; k++ )
{
// shift Lr[k] and minLr[k] pointers, because we allocated them with the borders,
// and will occasionally use negative indices with the arrays
// we need to shift Lr[k] pointers by 1, to give the space for d=-1.
// however, then the alignment will be imperfect, i.e. bad for SSE,
// thus we shift the pointers by SIMD vector size
Lr[k] = pixDiff + costBufSize + v_int16::nlanes + LrSize*k + NR2*Dlra;
memset( Lr[k] - NR2*Dlra, 0, LrSize*sizeof(CostType) );
minLr[k] = pixDiff + costBufSize + v_int16::nlanes + LrSize*2 + minLrSize*k + NR2;
memset( minLr[k] - NR2, 0, minLrSize*sizeof(CostType) );
}
uchar lrID = 0;
mem.clearLr();
for( int y = y1; y != y2; y += dy )
{
int x, d;
DispType* disp1ptr = disp1.ptr<DispType>(y);
CostType* C = Cbuf + (!fullDP ? 0 : y*costBufSize);
CostType* S = Sbuf + (!fullDP ? 0 : y*costBufSize);
CostType* const C = mem.getCBuf(y);
CostType* const S = mem.getSBuf(y);
if( pass == 1 ) // compute C on the first pass, and reuse it on the second pass, if any.
{
......@@ -430,35 +548,35 @@ static void computeDisparitySGBM( const Mat& img1, const Mat& img2,
for( k = dy1; k <= dy2; k++ )
{
CostType* hsumAdd = hsumBuf + (std::min(k, height-1) % hsumBufNRows)*costBufSize;
CostType* hsumAdd = mem.getHSumBuf(std::min(k, height-1));
if( k < height )
{
calcPixelCostBT( img1, img2, k, minD, maxD, pixDiff, tempBuf, clipTab, TAB_OFS, ftzero );
calcPixelCostBT( img1, img2, k, minD, maxD, mem.pixDiff, mem.tempBuf, mem.getClipTab() );
memset(hsumAdd, 0, Da*sizeof(CostType));
#if CV_SIMD
v_int16 h_scale = vx_setall_s16((short)SW2 + 1);
for( d = 0; d < Da; d += v_int16::nlanes )
{
v_int16 v_hsumAdd = vx_load_aligned(pixDiff + d) * h_scale;
v_int16 v_hsumAdd = vx_load_aligned(mem.pixDiff + d) * h_scale;
for( x = Da; x <= SW2*Da; x += Da )
v_hsumAdd += vx_load_aligned(pixDiff + x + d);
v_hsumAdd += vx_load_aligned(mem.pixDiff + x + d);
v_store_aligned(hsumAdd + d, v_hsumAdd);
}
#else
for (d = 0; d < D; d++)
{
hsumAdd[d] = (CostType)(pixDiff[d] * (SW2 + 1));
hsumAdd[d] = (CostType)(mem.pixDiff[d] * (SW2 + 1));
for( x = Da; x <= SW2*Da; x += Da )
hsumAdd[d] = (CostType)(hsumAdd[d] + pixDiff[x + d]);
hsumAdd[d] = (CostType)(hsumAdd[d] + mem.pixDiff[x + d]);
}
#endif
if( y > 0 )
{
const CostType* hsumSub = hsumBuf + (std::max(y - SH2 - 1, 0) % hsumBufNRows)*costBufSize;
const CostType* Cprev = !fullDP || y == 0 ? C : C - costBufSize;
const CostType* hsumSub = mem.getHSumBuf(std::max(y - SH2 - 1, 0));
const CostType* Cprev = mem.getCBuf(y - 1);
#if CV_SIMD
for (d = 0; d < Da; d += v_int16::nlanes)
......@@ -470,8 +588,8 @@ static void computeDisparitySGBM( const Mat& img1, const Mat& img2,
for( x = Da; x < width1*Da; x += Da )
{
const CostType* pixAdd = pixDiff + std::min(x + SW2*Da, (width1-1)*Da);
const CostType* pixSub = pixDiff + std::max(x - (SW2+1)*Da, 0);
const CostType* pixAdd = mem.pixDiff + std::min(x + SW2*Da, (width1-1)*Da);
const CostType* pixSub = mem.pixDiff + std::max(x - (SW2+1)*Da, 0);
#if CV_SIMD
for( d = 0; d < Da; d += v_int16::nlanes )
{
......@@ -501,8 +619,8 @@ static void computeDisparitySGBM( const Mat& img1, const Mat& img2,
#endif
for( x = Da; x < width1*Da; x += Da )
{
const CostType* pixAdd = pixDiff + std::min(x + SW2*Da, (width1-1)*Da);
const CostType* pixSub = pixDiff + std::max(x - (SW2+1)*Da, 0);
const CostType* pixAdd = mem.pixDiff + std::min(x + SW2*Da, (width1-1)*Da);
const CostType* pixSub = mem.pixDiff + std::max(x - (SW2+1)*Da, 0);
#if CV_SIMD
for (d = 0; d < Da; d += v_int16::nlanes)
......@@ -526,8 +644,8 @@ static void computeDisparitySGBM( const Mat& img1, const Mat& img2,
{
if( y > 0 )
{
const CostType* hsumSub = hsumBuf + (std::max(y - SH2 - 1, 0) % hsumBufNRows)*costBufSize;
const CostType* Cprev = !fullDP || y == 0 ? C : C - costBufSize;
const CostType* hsumSub = mem.getHSumBuf(std::max(y - SH2 - 1, 0));
const CostType* Cprev = mem.getCBuf(y - 1);
#if CV_SIMD
for (x = 0; x < width1*Da; x += v_int16::nlanes)
v_store_aligned(C + x, vx_load_aligned(Cprev + x) - vx_load_aligned(hsumSub + x) + vx_load_aligned(hsumAdd + x));
......@@ -551,7 +669,7 @@ static void computeDisparitySGBM( const Mat& img1, const Mat& img2,
}
// also, clear the S buffer
memset(S, 0, width1*Da * sizeof(CostType));
mem.clearSBuf(y);
}
/*
......@@ -575,24 +693,26 @@ static void computeDisparitySGBM( const Mat& img1, const Mat& img2,
for( x = x1; x != x2; x += dx )
{
int xm = x*NR2, xd = xm*Dlra;
int delta0 = minLr[0][xm - dx*NR2] + P2, delta1 = minLr[1][xm - NR2 + 1] + P2;
int delta2 = minLr[1][xm + 2] + P2, delta3 = minLr[1][xm + NR2 + 3] + P2;
CostType* Lr_p0 = Lr[0] + xd - dx*NR2*Dlra;
CostType* Lr_p1 = Lr[1] + xd - NR2*Dlra + Dlra;
CostType* Lr_p2 = Lr[1] + xd + Dlra*2;
CostType* Lr_p3 = Lr[1] + xd + NR2*Dlra + Dlra*3;
Lr_p0[-1] = Lr_p0[D] = Lr_p1[-1] = Lr_p1[D] =
Lr_p2[-1] = Lr_p2[D] = Lr_p3[-1] = Lr_p3[D] = MAX_COST;
CostType* Lr_p = Lr[0] + xd;
int delta0 = P2 + *mem.getMinLr(lrID, x - dx);
int delta1 = P2 + *mem.getMinLr(1 - lrID, x - 1, 1);
int delta2 = P2 + *mem.getMinLr(1 - lrID, x, 2);
int delta3 = P2 + *mem.getMinLr(1 - lrID, x + 1, 3);
CostType* Lr_p0 = mem.getLr(lrID, x - dx);
CostType* Lr_p1 = mem.getLr(1 - lrID, x - 1, 1);
CostType* Lr_p2 = mem.getLr(1 - lrID, x, 2);
CostType* Lr_p3 = mem.getLr(1 - lrID, x + 1, 3);
Lr_p0[-1] = Lr_p0[D] = MAX_COST;
Lr_p1[-1] = Lr_p1[D] = MAX_COST;
Lr_p2[-1] = Lr_p2[D] = MAX_COST;
Lr_p3[-1] = Lr_p3[D] = MAX_COST;
CostType* Lr_p = mem.getLr(lrID, x);
const CostType* Cp = C + x*Da;
CostType* Sp = S + x*Da;
CostType* minL = minLr[0] + xm;
CostType* minL = mem.getMinLr(lrID, x);
d = 0;
#if CV_SIMD
v_int16 _P1 = vx_setall_s16((short)P1);
......@@ -703,14 +823,14 @@ static void computeDisparitySGBM( const Mat& img1, const Mat& img2,
for( ; x <= width - v_int16::nlanes; x += v_int16::nlanes )
{
v_store(disp1ptr + x, v_inv_dist);
v_store(disp2ptr + x, v_inv_dist);
v_store(disp2cost + x, v_max_cost);
v_store(mem.disp2ptr + x, v_inv_dist);
v_store(mem.disp2cost + x, v_max_cost);
}
#endif
for( ; x < width; x++ )
{
disp1ptr[x] = disp2ptr[x] = (DispType)INVALID_DISP_SCALED;
disp2cost[x] = MAX_COST;
disp1ptr[x] = mem.disp2ptr[x] = (DispType)INVALID_DISP_SCALED;
mem.disp2cost[x] = MAX_COST;
}
for( x = width1 - 1; x >= 0; x-- )
......@@ -721,16 +841,14 @@ static void computeDisparitySGBM( const Mat& img1, const Mat& img2,
if( npasses == 1 )
{
int xm = x*NR2, xd = xm*Dlra;
CostType* Lr_p0 = Lr[0] + xd + NR2*Dlra;
CostType* Lr_p0 = mem.getLr(lrID, x + 1);
Lr_p0[-1] = Lr_p0[D] = MAX_COST;
CostType* Lr_p = Lr[0] + xd;
CostType* Lr_p = mem.getLr(lrID, x);
const CostType* Cp = C + x*Da;
d = 0;
int delta0 = minLr[0][xm + NR2] + P2;
int delta0 = P2 + *mem.getMinLr(lrID, x + 1);
int minL0 = MAX_COST;
#if CV_SIMD
v_int16 _P1 = vx_setall_s16((short)P1);
......@@ -768,7 +886,7 @@ static void computeDisparitySGBM( const Mat& img1, const Mat& img2,
bestDisp = (short)d;
}
}
minLr[0][xm] = (CostType)minL0;
*mem.getMinLr(lrID, x) = (CostType)minL0;
}
else
{
......@@ -803,10 +921,10 @@ static void computeDisparitySGBM( const Mat& img1, const Mat& img2,
continue;
d = bestDisp;
int _x2 = x + minX1 - d - minD;
if( disp2cost[_x2] > minS )
if( mem.disp2cost[_x2] > minS )
{
disp2cost[_x2] = (CostType)minS;
disp2ptr[_x2] = (DispType)(d + minD);
mem.disp2cost[_x2] = (CostType)minS;
mem.disp2ptr[_x2] = (DispType)(d + minD);
}
if( 0 < d && d < D-1 )
......@@ -833,15 +951,13 @@ static void computeDisparitySGBM( const Mat& img1, const Mat& img2,
int _d = d1 >> DISP_SHIFT;
int d_ = (d1 + DISP_SCALE-1) >> DISP_SHIFT;
int _x = x - _d, x_ = x - d_;
if( 0 <= _x && _x < width && disp2ptr[_x] >= minD && std::abs(disp2ptr[_x] - _d) > disp12MaxDiff &&
0 <= x_ && x_ < width && disp2ptr[x_] >= minD && std::abs(disp2ptr[x_] - d_) > disp12MaxDiff )
if( 0 <= _x && _x < width && mem.disp2ptr[_x] >= minD && std::abs(mem.disp2ptr[_x] - _d) > disp12MaxDiff &&
0 <= x_ && x_ < width && mem.disp2ptr[x_] >= minD && std::abs(mem.disp2ptr[x_] - d_) > disp12MaxDiff )
disp1ptr[x] = (DispType)INVALID_DISP_SCALED;
}
}
// now shift the cyclic buffers
std::swap( Lr[0], Lr[1] );
std::swap( minLr[0], minLr[1] );
lrID = 1 - lrID; // now shift the cyclic buffers
}
}
}
......@@ -849,13 +965,12 @@ static void computeDisparitySGBM( const Mat& img1, const Mat& img2,
////////////////////////////////////////////////////////////////////////////////////////////
struct CalcVerticalSums: public ParallelLoopBody
{
CalcVerticalSums(const Mat& _img1, const Mat& _img2, const StereoSGBMParams& params,
CostType* alignedBuf, PixType* _clipTab): img1(_img1), img2(_img2), clipTab(_clipTab)
CalcVerticalSums(const Mat& _img1, const Mat& _img2, const StereoSGBMParams& params, const BufferSGBM &mem_)
: img1(_img1), img2(_img2), mem(mem_)
{
minD = params.minDisparity;
maxD = minD + params.numDisparities;
SW2 = SH2 = (params.SADWindowSize > 0 ? params.SADWindowSize : 5)/2;
ftzero = std::max(params.preFilterCap, 15) | 1;
SW2 = SH2 = params.calcSADWindowSize().height/2;
P1 = params.P1 > 0 ? params.P1 : 2;
P2 = std::max(params.P2 > 0 ? params.P2 : 5, P1+1);
height = img1.rows;
......@@ -865,32 +980,27 @@ struct CalcVerticalSums: public ParallelLoopBody
Da = (int)alignSize(D, v_int16::nlanes);
Dlra = Da + v_int16::nlanes;//Additional memory is necessary to store disparity values(MAX_COST) for d=-1 and d=D
width1 = maxX1 - minX1;
costBufSize = width1*Da;
CSBufSize = costBufSize*height;
minLrSize = width1;
LrSize = minLrSize*Dlra;
hsumBufNRows = SH2*2 + 2;
Cbuf = alignedBuf;
Sbuf = Cbuf + CSBufSize;
hsumBuf = Sbuf + CSBufSize;
D = params.numDisparities;
Da = (int)alignSize(D, v_int16::nlanes);
}
void operator()(const Range& range) const CV_OVERRIDE
{
static const CostType MAX_COST = SHRT_MAX;
static const int TAB_OFS = 256*4;
static const int npasses = 2;
int x1 = range.start, x2 = range.end, k;
size_t pixDiffSize = ((x2 - x1) + 2*SW2)*Da;
size_t auxBufsSize = CV_SIMD_WIDTH + pixDiffSize*sizeof(CostType) + //alignment and pixdiff size
width*(4*img1.channels()+2)*sizeof(PixType); //tempBuf
Mat auxBuff;
auxBuff.create(1, (int)auxBufsSize, CV_8U);
CostType* pixDiff = (CostType*)alignPtr(auxBuff.ptr(), CV_SIMD_WIDTH);
PixType* tempBuf = (PixType*)(pixDiff + pixDiffSize);
const CostType MAX_COST = SHRT_MAX;
const int npasses = 2;
const int x1 = range.start, x2 = range.end;
int k;
CostType* pixDiff = 0;
PixType* tempBuf = 0;
utils::BufferArea aux_area;
aux_area.allocate(pixDiff, ((x2 - x1) + 2 * SW2) * Da, CV_SIMD_WIDTH);
aux_area.allocate(tempBuf, width * (4 * img1.channels() + 2) * sizeof(PixType), CV_SIMD_WIDTH);
aux_area.commit();
// Simplification of index calculation
pixDiff -= (x1>SW2 ? (x1 - SW2): 0)*Da;
if (x1 > SW2)
pixDiff -= (x1 - SW2) * Da;
for( int pass = 1; pass <= npasses; pass++ )
{
......@@ -905,26 +1015,14 @@ struct CalcVerticalSums: public ParallelLoopBody
y1 = height-1; y2 = -1; dy = -1;
}
CostType *Lr[2]={0}, *minLr[2]={0};
for( k = 0; k < 2; k++ )
{
// shift Lr[k] and minLr[k] pointers, because we allocated them with the borders,
// and will occasionally use negative indices with the arrays
// we need to shift Lr[k] pointers by 1, to give the space for d=-1.
// however, then the alignment will be imperfect, i.e. bad for SSE,
// thus we shift the pointers by SIMD vector size
Lr[k] = hsumBuf + costBufSize*hsumBufNRows + v_int16::nlanes + LrSize*k;
memset( Lr[k] + x1*Dlra, 0, (x2-x1)*Dlra*sizeof(CostType) );
minLr[k] = hsumBuf + costBufSize*hsumBufNRows + v_int16::nlanes + LrSize*2 + minLrSize*k;
memset( minLr[k] + x1, 0, (x2-x1)*sizeof(CostType) );
}
uchar lrID = 0;
mem.clearLr(range);
for( int y = y1; y != y2; y += dy )
{
int x, d;
CostType* C = Cbuf + y*costBufSize;
CostType* S = Sbuf + y*costBufSize;
CostType* C = mem.getCBuf(y);
CostType* S = mem.getSBuf(y);
if( pass == 1 ) // compute C on the first pass, and reuse it on the second pass, if any.
{
......@@ -932,11 +1030,11 @@ struct CalcVerticalSums: public ParallelLoopBody
for( k = dy1; k <= dy2; k++ )
{
CostType* hsumAdd = hsumBuf + (std::min(k, height-1) % hsumBufNRows)*costBufSize;
CostType* hsumAdd = mem.getHSumBuf(std::min(k, height-1));
if( k < height )
{
calcPixelCostBT( img1, img2, k, minD, maxD, pixDiff, tempBuf, clipTab, TAB_OFS, ftzero, x1 - SW2, x2 + SW2);
calcPixelCostBT( img1, img2, k, minD, maxD, pixDiff, tempBuf, mem.getClipTab(), x1 - SW2, x2 + SW2);
memset(hsumAdd + x1*Da, 0, Da*sizeof(CostType));
for( x = (x1 - SW2)*Da; x <= (x1 + SW2)*Da; x += Da )
......@@ -953,8 +1051,8 @@ struct CalcVerticalSums: public ParallelLoopBody
if( y > 0 )
{
const CostType* hsumSub = hsumBuf + (std::max(y - SH2 - 1, 0) % hsumBufNRows)*costBufSize;
const CostType* Cprev = C - costBufSize;
const CostType* hsumSub = mem.getHSumBuf(std::max(y - SH2 - 1, 0));
const CostType* Cprev = mem.getCBuf(y - 1);
#if CV_SIMD
for( d = 0; d < Da; d += v_int16::nlanes )
v_store_aligned(C + x1*Da + d, vx_load_aligned(Cprev + x1*Da + d) + vx_load_aligned(hsumAdd + x1*Da + d) - vx_load_aligned(hsumSub + x1*Da + d));
......@@ -1020,8 +1118,8 @@ struct CalcVerticalSums: public ParallelLoopBody
{
/* if (y > 0)
{
const CostType* hsumSub = hsumBuf + (std::max(y - SH2 - 1, 0) % hsumBufNRows)*costBufSize;
const CostType* Cprev = C - costBufSize;
const CostType* hsumSub = mem.getHSumBuf(std::max(y - SH2 - 1, 0));
const CostType* Cprev = mem.getCBuf(y - 1);
#if CV_SIMD
for( x = x1*Da; x < x2*Da; x += v_int16::nlanes )
......@@ -1044,9 +1142,7 @@ struct CalcVerticalSums: public ParallelLoopBody
}
}
}
// also, clear the S buffer
memset(S + x1*Da, 0, (x2-x1)*Da*sizeof(CostType));
mem.clearSBuf(y, range);
}
// [formula 13 in the paper]
......@@ -1061,19 +1157,16 @@ struct CalcVerticalSums: public ParallelLoopBody
for( x = x1; x != x2; x++ )
{
int xd = x*Dlra;
int delta = minLr[1][x] + P2;
CostType* Lr_ppr = Lr[1] + xd;
int delta = P2 + *mem.getMinLr(1 - lrID, x);
CostType* Lr_ppr = mem.getLr(1 - lrID, x);
Lr_ppr[-1] = Lr_ppr[D] = MAX_COST;
CostType* Lr_p = Lr[0] + xd;
CostType* Lr_p = mem.getLr(lrID, x);
const CostType* Cp = C + x*Da;
CostType* Sp = S + x*Da;
CostType& minL = minLr[0][x];
CostType& minL = *(mem.getMinLr(lrID, x));
d = 0;
#if CV_SIMD
v_int16 _P1 = vx_setall_s16((short)P1);
......@@ -1105,19 +1198,13 @@ struct CalcVerticalSums: public ParallelLoopBody
Sp[d] = saturate_cast<CostType>(Sp[d] + L);
}
}
// now shift the cyclic buffers
std::swap( Lr[0], Lr[1] );
std::swap( minLr[0], minLr[1] );
lrID = 1 - lrID; // now shift the cyclic buffers
}
}
}
const Mat& img1;
const Mat& img2;
CostType* Cbuf;
CostType* Sbuf;
CostType* hsumBuf;
PixType* clipTab;
const BufferSGBM & mem;
int minD;
int maxD;
int D, Da, Dlra;
......@@ -1128,18 +1215,12 @@ struct CalcVerticalSums: public ParallelLoopBody
int height;
int P1;
int P2;
size_t costBufSize;
size_t CSBufSize;
size_t minLrSize;
size_t LrSize;
size_t hsumBufNRows;
int ftzero;
};
struct CalcHorizontalSums: public ParallelLoopBody
{
CalcHorizontalSums(const Mat& _img1, const Mat& _img2, Mat& _disp1, const StereoSGBMParams& params,
CostType* alignedBuf): img1(_img1), img2(_img2), disp1(_disp1)
CalcHorizontalSums(const Mat& _img1, const Mat& _img2, Mat& _disp1, const StereoSGBMParams& params, const BufferSGBM &mem_)
: img1(_img1), img2(_img2), disp1(_disp1), mem(mem_)
{
minD = params.minDisparity;
maxD = minD + params.numDisparities;
......@@ -1157,23 +1238,22 @@ struct CalcHorizontalSums: public ParallelLoopBody
Da = (int)alignSize(D, v_int16::nlanes);
Dlra = Da + v_int16::nlanes;//Additional memory is necessary to store disparity values(MAX_COST) for d=-1 and d=D
width1 = maxX1 - minX1;
costBufSize = width1*Da;
CSBufSize = costBufSize*height;
LrSize = 2 * Dlra;
Cbuf = alignedBuf;
Sbuf = Cbuf + CSBufSize;
}
void operator()(const Range& range) const CV_OVERRIDE
{
int y1 = range.start, y2 = range.end;
size_t auxBufsSize = CV_SIMD_WIDTH + (v_int16::nlanes + LrSize) * sizeof(CostType) + width*(sizeof(CostType) + sizeof(DispType));
Mat auxBuff;
auxBuff.create(1, (int)auxBufsSize, CV_8U);
CostType *Lr = ((CostType*)alignPtr(auxBuff.ptr(), CV_SIMD_WIDTH)) + v_int16::nlanes;
CostType* disp2cost = Lr + LrSize;
DispType* disp2ptr = (DispType*)(disp2cost + width);
const size_t LrSize = 2 * (1 + Dlra + 1);
CostType * Lr = 0;
CostType * disp2cost = 0;
DispType * disp2ptr = 0;
utils::BufferArea aux_area;
aux_area.allocate(Lr, LrSize);
aux_area.allocate(disp2cost, width, CV_SIMD_WIDTH);
aux_area.allocate(disp2ptr, width, CV_SIMD_WIDTH);
aux_area.commit();
CostType minLr;
......@@ -1181,8 +1261,8 @@ struct CalcHorizontalSums: public ParallelLoopBody
{
int x, d;
DispType* disp1ptr = disp1.ptr<DispType>(y);
CostType* C = Cbuf + y*costBufSize;
CostType* S = Sbuf + y*costBufSize;
CostType* C = mem.getCBuf(y);
CostType* S = mem.getSBuf(y);
x = 0;
#if CV_SIMD
......@@ -1202,8 +1282,8 @@ struct CalcHorizontalSums: public ParallelLoopBody
}
// clear buffers
memset( Lr, 0, LrSize*sizeof(CostType) );
Lr[-1] = Lr[D] = Lr[Dlra - 1] = Lr[Dlra + D] = MAX_COST;
aux_area.zeroFill(Lr);
Lr[0] = Lr[1 + D] = Lr[3 + Dlra - 1] = Lr[3 + Dlra + D] = MAX_COST;
minLr = 0;
// [formula 13 in the paper]
......@@ -1219,10 +1299,8 @@ struct CalcHorizontalSums: public ParallelLoopBody
for( x = 0; x != width1; x++)
{
int delta = minLr + P2;
CostType* Lr_ppr = Lr + ((x&1)? 0 : Dlra);
CostType* Lr_p = Lr + ((x&1)? Dlra :0);
CostType* Lr_ppr = Lr + ((x&1)? 1 : 3 + Dlra);
CostType* Lr_p = Lr + ((x&1)? 3 + Dlra : 1);
const CostType* Cp = C + x*Da;
CostType* Sp = S + x*Da;
......@@ -1236,8 +1314,8 @@ struct CalcHorizontalSums: public ParallelLoopBody
for( ; d <= D - v_int16::nlanes; d += v_int16::nlanes)
{
v_int16 Cpd = vx_load_aligned(Cp + d);
v_int16 L = v_min(v_min(v_min(vx_load_aligned(Lr_ppr + d), vx_load(Lr_ppr + d - 1) + _P1), vx_load(Lr_ppr + d + 1) + _P1), _delta) - _delta + Cpd;
v_store_aligned(Lr_p + d, L);
v_int16 L = v_min(v_min(v_min(vx_load(Lr_ppr + d), vx_load(Lr_ppr + d - 1) + _P1), vx_load(Lr_ppr + d + 1) + _P1), _delta) - _delta + Cpd;
v_store(Lr_p + d, L);
_minL = v_min(_minL, L);
v_store_aligned(Sp + d, vx_load_aligned(Sp + d) + L);
}
......@@ -1255,18 +1333,16 @@ struct CalcHorizontalSums: public ParallelLoopBody
}
}
memset( Lr, 0, LrSize*sizeof(CostType) );
Lr[-1] = Lr[D] = Lr[Dlra - 1] = Lr[Dlra + D] = MAX_COST;
aux_area.zeroFill(Lr);
Lr[0] = Lr[1 + D] = Lr[3 + Dlra - 1] = Lr[3 + Dlra + D] = MAX_COST;
minLr = 0;
for( x = width1-1; x != -1; x--)
{
int delta = minLr + P2;
CostType* Lr_ppr = Lr + ((x&1)? 0 :Dlra);
CostType* Lr_p = Lr + ((x&1)? Dlra :0);
CostType* Lr_ppr = Lr + ((x&1)? 1 : 3 + Dlra);
CostType* Lr_p = Lr + ((x&1)? 3 + Dlra : 1);
const CostType* Cp = C + x*Da;
CostType* Sp = S + x*Da;
CostType minS = MAX_COST;
......@@ -1283,8 +1359,8 @@ struct CalcHorizontalSums: public ParallelLoopBody
for( ; d <= D - v_int16::nlanes; d += v_int16::nlanes )
{
v_int16 Cpd = vx_load_aligned(Cp + d);
v_int16 L = v_min(v_min(v_min(vx_load_aligned(Lr_ppr + d), vx_load(Lr_ppr + d - 1) + _P1), vx_load(Lr_ppr + d + 1) + _P1), _delta) - _delta + Cpd;
v_store_aligned(Lr_p + d, L);
v_int16 L = v_min(v_min(v_min(vx_load(Lr_ppr + d), vx_load(Lr_ppr + d - 1) + _P1), vx_load(Lr_ppr + d + 1) + _P1), _delta) - _delta + Cpd;
v_store(Lr_p + d, L);
_minL = v_min(_minL, L);
L += vx_load_aligned(Sp + d);
v_store_aligned(Sp + d, L);
......@@ -1366,8 +1442,7 @@ struct CalcHorizontalSums: public ParallelLoopBody
const Mat& img1;
const Mat& img2;
Mat& disp1;
CostType* Cbuf;
CostType* Sbuf;
const BufferSGBM & mem;
int minD;
int maxD;
int D, Da, Dlra;
......@@ -1378,9 +1453,6 @@ struct CalcHorizontalSums: public ParallelLoopBody
int P2;
int minX1;
int maxX1;
size_t costBufSize;
size_t CSBufSize;
size_t LrSize;
int INVALID_DISP;
int INVALID_DISP_SCALED;
int uniquenessRatio;
......@@ -1401,28 +1473,21 @@ struct CalcHorizontalSums: public ParallelLoopBody
is written as is, without interpolation.
*/
static void computeDisparitySGBM_HH4( const Mat& img1, const Mat& img2,
Mat& disp1, const StereoSGBMParams& params,
Mat& buffer )
Mat& disp1, const StereoSGBMParams& params)
{
const int DISP_SHIFT = StereoMatcher::DISP_SHIFT;
const int DISP_SCALE = (1 << DISP_SHIFT);
int minD = params.minDisparity, maxD = minD + params.numDisparities;
Size SADWindowSize;
SADWindowSize.width = SADWindowSize.height = params.SADWindowSize > 0 ? params.SADWindowSize : 5;
int ftzero = std::max(params.preFilterCap, 15) | 1;
int P1 = params.P1 > 0 ? params.P1 : 2, P2 = std::max(params.P2 > 0 ? params.P2 : 5, P1+1);
int k, width = disp1.cols, height = disp1.rows;
int width = disp1.cols, height = disp1.rows;
int minX1 = std::max(maxD, 0), maxX1 = width + std::min(minD, 0);
int D = (int)alignSize(maxD - minD, v_int16::nlanes), width1 = maxX1 - minX1;
int Dlra = D + v_int16::nlanes;//Additional memory is necessary to store disparity values(MAX_COST) for d=-1 and d=D
int SH2 = SADWindowSize.height/2;
int width1 = maxX1 - minX1;
int Da = (int)alignSize(params.numDisparities, v_int16::nlanes);
int Dlra = Da + v_int16::nlanes;//Additional memory is necessary to store disparity values(MAX_COST) for d=-1 and d=D
int INVALID_DISP = minD - 1;
int INVALID_DISP_SCALED = INVALID_DISP*DISP_SCALE;
const int TAB_OFS = 256*4, TAB_SIZE = 256 + TAB_OFS*2;
PixType clipTab[TAB_SIZE];
for( k = 0; k < TAB_SIZE; k++ )
clipTab[k] = (PixType)(std::min(std::max(k - TAB_OFS, -ftzero), ftzero) + ftzero);
if( minX1 >= maxX1 )
{
......@@ -1430,54 +1495,79 @@ static void computeDisparitySGBM_HH4( const Mat& img1, const Mat& img2,
return;
}
// for each possible stereo match (img1(x,y) <=> img2(x-d,y))
// we keep pixel difference cost (C) and the summary cost over 4 directions (S).
// we also keep all the partial costs for the previous line L_r(x,d) and also min_k L_r(x, k)
// the number of L_r(.,.) and min_k L_r(.,.) lines in the buffer:
// for dynamic programming we need the current row and
// the previous row, i.e. 2 rows in total
size_t costBufSize = width1*D;
size_t CSBufSize = costBufSize*height;
size_t minLrSize = width1 , LrSize = minLrSize*Dlra;
int hsumBufNRows = SH2*2 + 2;
size_t totalBufSize = CV_SIMD_WIDTH + CSBufSize * 2 * sizeof(CostType) + // Alignment, C, S
costBufSize*hsumBufNRows * sizeof(CostType) + // hsumBuf
((LrSize + minLrSize)*2 + v_int16::nlanes) * sizeof(CostType); // minLr[] and Lr[]
if( buffer.empty() || !buffer.isContinuous() ||
buffer.cols*buffer.rows*buffer.elemSize() < totalBufSize )
{
buffer.reserveBuffer(totalBufSize);
}
// summary cost over different (nDirs) directions
CostType* Cbuf = (CostType*)alignPtr(buffer.ptr(), CV_SIMD_WIDTH);
// add P2 to every C(x,y). it saves a few operations in the inner loops
for(k = 0; k < (int)CSBufSize; k++ )
Cbuf[k] = (CostType)P2;
parallel_for_(Range(0,width1),CalcVerticalSums(img1, img2, params, Cbuf, clipTab),8);
parallel_for_(Range(0,height),CalcHorizontalSums(img1, img2, disp1, params, Cbuf),8);
BufferSGBM mem(width1, Da, Dlra, img1.channels(), width, height, params);
mem.initCBuf((CostType)P2); // add P2 to every C(x,y). it saves a few operations in the inner loops
parallel_for_(Range(0,width1),CalcVerticalSums(img1, img2, params, mem),8);
parallel_for_(Range(0,height),CalcHorizontalSums(img1, img2, disp1, params, mem),8);
}
//////////////////////////////////////////////////////////////////////////////////////////////////////
void getBufferPointers(Mat& buffer, int width, int width1, int Da, int num_ch, int SH2, int P2,
CostType*& curCostVolumeLine, CostType*& hsumBuf, CostType*& pixDiff,
PixType*& tmpBuf, CostType*& horPassCostVolume,
CostType*& vertPassCostVolume, CostType*& vertPassMin, CostType*& rightPassBuf,
CostType*& disp2CostBuf, short*& disp2Buf);
/**
 * Working memory for one stripe of the 3-way SGBM algorithm.
 *
 * Every buffer is bound to a single utils::BufferArea with CV_SIMD_WIDTH
 * alignment. The constructor commits the area, zero-fills all buffers and then
 * pre-fills curCostVolumeLine with P2. The memory lives as long as the object.
 */
class BufferSGBM3Way
{
private:
    size_t hsumCols; // elements in one row of hsumBuf: width1 * Da
    size_t hsumRows; // number of rows kept in hsumBuf: SH2*2 + 2
public:
    CostType *curCostVolumeLine;  // width1 * Da elements, initialized to P2
    CostType *hsumBuf;            // hsumRows rows of hsumCols elements
    CostType *pixDiff;            // width1 * Da elements
    PixType *tmpBuf;              // width * (4 * num_ch + 2) elements
    CostType *horPassCostVolume;  // (width1 + 2) * Da elements
    CostType *vertPassCostVolume; // (width1 + 2) * Da elements
    CostType *vertPassMin;        // width1 + 2 elements
    CostType *rightPassBuf;       // Da elements
    CostType *disp2CostBuf;       // width elements
    short *disp2Buf;              // width elements
private:
    utils::BufferArea area;
public:
    /**
     * @param width1  number of processed columns (one cost vector of Da elements per column)
     * @param width   full image width
     * @param num_ch  number of image channels
     * @param Da      length of one per-pixel cost vector
     * @param SH2     half of the SAD window height
     * @param P2      value every element of curCostVolumeLine is initialized with
     */
    BufferSGBM3Way(int width1, int width, int num_ch, int Da, int SH2, int P2) :
        curCostVolumeLine(0),
        hsumBuf(0),
        pixDiff(0),
        tmpBuf(0),
        horPassCostVolume(0),
        vertPassCostVolume(0),
        vertPassMin(0),
        rightPassBuf(0),
        disp2CostBuf(0),
        disp2Buf(0)
    {
        // Widen to size_t *before* multiplying: the products used to be formed
        // in int, where overflow is undefined behaviour, and only afterwards
        // converted to the size_t element count expected by the area.
        const size_t da = static_cast<size_t>(Da);
        const size_t extWidth1 = static_cast<size_t>(width1) + 2;
        hsumCols = static_cast<size_t>(width1) * da;
        hsumRows = static_cast<size_t>(SH2) * 2 + 2;
        area.allocate(curCostVolumeLine, hsumCols, CV_SIMD_WIDTH);
        area.allocate(hsumBuf, hsumCols * hsumRows, CV_SIMD_WIDTH);
        area.allocate(pixDiff, hsumCols, CV_SIMD_WIDTH);
        area.allocate(tmpBuf, static_cast<size_t>(width) * (4 * num_ch + 2), CV_SIMD_WIDTH);
        area.allocate(horPassCostVolume, extWidth1 * da, CV_SIMD_WIDTH);
        area.allocate(vertPassCostVolume, extWidth1 * da, CV_SIMD_WIDTH);
        area.allocate(vertPassMin, extWidth1, CV_SIMD_WIDTH);
        area.allocate(rightPassBuf, da, CV_SIMD_WIDTH);
        area.allocate(disp2CostBuf, static_cast<size_t>(width), CV_SIMD_WIDTH);
        area.allocate(disp2Buf, static_cast<size_t>(width), CV_SIMD_WIDTH);
        area.commit();
        area.zeroFill();
        // the zero fill above is overwritten for the current cost line only
        for(size_t i = 0; i < hsumCols; i++)
            curCostVolumeLine[i] = (CostType)P2;
    }
    /** Resets rightPassBuf to zeroes. */
    inline void clearRightPassBuf()
    {
        area.zeroFill(rightPassBuf);
    }
    /**
     * Returns the row of hsumBuf selected by `x % hsumRows`.
     * `x` is converted to an unsigned type for the modulo, so it must not be negative.
     */
    CostType *getHSumBuf(int x) const
    {
        return hsumBuf + (x % hsumRows) * hsumCols;
    }
};
struct SGBM3WayMainLoop : public ParallelLoopBody
{
Mat* buffers;
const Mat *img1, *img2;
Mat* dst_disp;
int nstripes, stripe_sz;
int stripe_sz;
int stripe_overlap;
int width,height;
......@@ -1488,25 +1578,54 @@ struct SGBM3WayMainLoop : public ParallelLoopBody
int P1, P2;
int uniquenessRatio, disp12MaxDiff;
int costBufSize, hsumBufNRows;
int TAB_OFS, ftzero;
int TAB_OFS;
utils::BufferArea aux_area;
PixType* clipTab;
#if CV_SIMD
short idx_row[v_int16::nlanes];
#endif
SGBM3WayMainLoop(Mat *_buffers, const Mat& _img1, const Mat& _img2, Mat* _dst_disp, const StereoSGBMParams& params, PixType* _clipTab, int _nstripes, int _stripe_overlap);
void getRawMatchingCost(CostType* C, CostType* hsumBuf, CostType* pixDiff, PixType* tmpBuf, int y, int src_start_idx) const;
SGBM3WayMainLoop(const Mat& _img1, const Mat& _img2, Mat* _dst_disp, const StereoSGBMParams& params, int stripe_size, int _stripe_overlap);
void operator () (const Range& range) const CV_OVERRIDE;
template<bool x_nlanes> void impl(const Range& range) const;
private:
void getRawMatchingCost(const BufferSGBM3Way &mem, int y, int src_start_idx) const;
template<bool x_nlanes>
void accumulateCostsLeftTop(const BufferSGBM3Way &mem,
int x,
CostType &leftMinCost) const;
template<bool x_nlanes>
void accumulateCostsRight(const BufferSGBM3Way &mem,
int x,
CostType &rightMinCost,
short &optimal_disp,
CostType &min_cost) const;
};
SGBM3WayMainLoop::SGBM3WayMainLoop(Mat *_buffers, const Mat& _img1, const Mat& _img2, Mat* _dst_disp, const StereoSGBMParams& params, PixType* _clipTab, int _nstripes, int _stripe_overlap):
buffers(_buffers), img1(&_img1), img2(&_img2), dst_disp(_dst_disp), clipTab(_clipTab)
SGBM3WayMainLoop::SGBM3WayMainLoop(const Mat& _img1,
const Mat& _img2,
Mat* _dst_disp,
const StereoSGBMParams& params,
int _stripe_sz,
int _stripe_overlap)
: img1(&_img1),
img2(&_img2),
dst_disp(_dst_disp),
stripe_sz(_stripe_sz),
stripe_overlap(_stripe_overlap),
clipTab(0)
{
nstripes = _nstripes;
stripe_overlap = _stripe_overlap;
stripe_sz = (int)ceil(img1->rows/(double)nstripes);
// precompute a lookup table for the raw matching cost computation:
TAB_OFS = 256*4;
const int TAB_SIZE = 256 + TAB_OFS*2;
aux_area.allocate(clipTab, TAB_SIZE, CV_SIMD_WIDTH);
aux_area.commit();
const int ftzero = std::max(params.preFilterCap, 15) | 1;
for(int k = 0; k < TAB_SIZE; k++ )
clipTab[k] = (PixType)(std::min(std::max(k - TAB_OFS, -ftzero), ftzero) + ftzero);
width = img1->cols; height = img1->rows;
minD = params.minDisparity; maxD = minD + params.numDisparities; D = maxD - minD;
......@@ -1519,100 +1638,27 @@ buffers(_buffers), img1(&_img1), img2(&_img2), dst_disp(_dst_disp), clipTab(_cli
uniquenessRatio = params.uniquenessRatio >= 0 ? params.uniquenessRatio : 10;
disp12MaxDiff = params.disp12MaxDiff > 0 ? params.disp12MaxDiff : 1;
costBufSize = width1*Da;
hsumBufNRows = SH2*2 + 2;
TAB_OFS = 256*4;
ftzero = std::max(params.preFilterCap, 15) | 1;
#if CV_SIMD
for(short i = 0; i < v_int16::nlanes; ++i)
idx_row[i] = i;
#endif
}
// Lays out all working buffers of the 3-way SGBM pass inside the single Mat `buffer`
// and returns their start addresses through the pointer-reference parameters.
//
// The function
//  - computes the byte size of every sub-buffer from width, width1, Da, num_ch and SH2,
//  - grows `buffer` with reserveBuffer() when it is empty, non-continuous or too small,
//  - aligns the first sub-buffer to CV_SIMD_WIDTH and places the others back to back after it,
//  - zeroes totalBufSize bytes starting at buffer.ptr(),
//  - fills curCostVolumeLine (width1*Da elements) with P2.
//
// NOTE(review): all individual sizes are computed and stored as int; only the sum is size_t.
void getBufferPointers(Mat& buffer, int width, int width1, int Da, int num_ch, int SH2, int P2,
CostType*& curCostVolumeLine, CostType*& hsumBuf, CostType*& pixDiff,
PixType*& tmpBuf, CostType*& horPassCostVolume,
CostType*& vertPassCostVolume, CostType*& vertPassMin, CostType*& rightPassBuf,
CostType*& disp2CostBuf, short*& disp2Buf)
{
// allocating all the required memory:
int costVolumeLineSize = width1*Da;
int width1_ext = width1+2;
int costVolumeLineSize_ext = width1_ext*Da;
int hsumBufNRows = SH2*2 + 2;
// main buffer to store matching costs for the current line:
int curCostVolumeLineSize = costVolumeLineSize*sizeof(CostType);
// auxiliary buffers for the raw matching cost computation:
int hsumBufSize = costVolumeLineSize*hsumBufNRows*sizeof(CostType);
int pixDiffSize = costVolumeLineSize*sizeof(CostType);
int tmpBufSize = width * (4 * num_ch + 2) * sizeof(PixType);
// auxiliary buffers for the matching cost aggregation:
int horPassCostVolumeSize = costVolumeLineSize_ext*sizeof(CostType); // buffer for the 2-pass horizontal cost aggregation
int vertPassCostVolumeSize = costVolumeLineSize_ext*sizeof(CostType); // buffer for the vertical cost aggregation
int rightPassBufSize = Da * sizeof(CostType); // additional small buffer for the right-to-left pass
int vertPassMinSize = width1_ext*sizeof(CostType); // buffer for storing minimum costs from the previous line
// buffers for the pseudo-LRC check:
int disp2CostBufSize = width*sizeof(CostType);
int disp2BufSize = width*sizeof(short);
// sum up the sizes of all the buffers:
// (the leading CV_SIMD_WIDTH bytes are presumably slack for the alignPtr() call below)
size_t totalBufSize = CV_SIMD_WIDTH + curCostVolumeLineSize +
hsumBufSize +
pixDiffSize +
horPassCostVolumeSize +
vertPassCostVolumeSize +
rightPassBufSize +
vertPassMinSize +
disp2CostBufSize +
disp2BufSize +
tmpBufSize;
// (re)allocate only when the buffer passed in cannot hold totalBufSize bytes
if( buffer.empty() || !buffer.isContinuous() || buffer.cols*buffer.rows*buffer.elemSize() < totalBufSize )
buffer.reserveBuffer(totalBufSize);
// set up all the pointers:
// only the first pointer is explicitly aligned; every following buffer starts
// right where the previous one ends, and tmpBuf (PixType) comes last
curCostVolumeLine = (CostType*)alignPtr(buffer.ptr(), CV_SIMD_WIDTH);
hsumBuf = curCostVolumeLine + costVolumeLineSize;
pixDiff = hsumBuf + costVolumeLineSize*hsumBufNRows;
horPassCostVolume = pixDiff + costVolumeLineSize;
vertPassCostVolume = horPassCostVolume + costVolumeLineSize_ext;
rightPassBuf = vertPassCostVolume + costVolumeLineSize_ext;
vertPassMin = rightPassBuf + Da;
disp2CostBuf = vertPassMin + width1_ext;
disp2Buf = disp2CostBuf + width;
tmpBuf = (PixType*)(disp2Buf + width);
// initialize memory:
memset(buffer.ptr(),0,totalBufSize);
int i = 0;
#if CV_SIMD
// vectorized fill of the current cost line with P2; the scalar loop below handles the tail
v_int16 _P2 = vx_setall_s16((CostType)P2);
for (; i<=costVolumeLineSize-v_int16::nlanes; i+=v_int16::nlanes)
v_store_aligned(curCostVolumeLine + i, _P2);
#endif
for(;i<costVolumeLineSize;i++)
curCostVolumeLine[i] = (CostType)P2; //such initialization simplifies the cost aggregation loops a bit
}
// performing block matching and building raw cost-volume for the current row
void SGBM3WayMainLoop::getRawMatchingCost(CostType* C, // target cost-volume row
CostType* hsumBuf, CostType* pixDiff, PixType* tmpBuf, //buffers
int y, int src_start_idx) const
void SGBM3WayMainLoop::getRawMatchingCost(const BufferSGBM3Way &mem, int y, int src_start_idx) const
{
CostType* C = mem.curCostVolumeLine;
CostType* pixDiff = mem.pixDiff;
PixType* tmpBuf = mem.tmpBuf;
int x, d;
int dy1 = (y == src_start_idx) ? src_start_idx : y + SH2, dy2 = (y == src_start_idx) ? src_start_idx+SH2 : dy1;
for(int k = dy1; k <= dy2; k++ )
{
CostType* hsumAdd = hsumBuf + (std::min(k, height-1) % hsumBufNRows)*costBufSize;
CostType* hsumAdd = mem.getHSumBuf(std::min(k, height-1));
if( k < height )
{
calcPixelCostBT( *img1, *img2, k, minD, maxD, pixDiff, tmpBuf, clipTab, TAB_OFS, ftzero );
calcPixelCostBT( *img1, *img2, k, minD, maxD, pixDiff, tmpBuf, clipTab + TAB_OFS );
#if CV_SIMD
v_int16 sw2_1 = vx_setall_s16((short)SW2 + 1);
......@@ -1634,7 +1680,7 @@ void SGBM3WayMainLoop::getRawMatchingCost(CostType* C, // target cost-volume row
#endif
if( y > src_start_idx )
{
const CostType* hsumSub = hsumBuf + (std::max(y - SH2 - 1, src_start_idx) % hsumBufNRows)*costBufSize;
const CostType* hsumSub = mem.getHSumBuf(std::max(y - SH2 - 1, src_start_idx));
#if CV_SIMD
for (d = 0; d < Da; d += v_int16::nlanes)
......@@ -1702,7 +1748,7 @@ void SGBM3WayMainLoop::getRawMatchingCost(CostType* C, // target cost-volume row
{
if( y > src_start_idx )
{
const CostType* hsumSub = hsumBuf + (std::max(y - SH2 - 1, src_start_idx) % hsumBufNRows)*costBufSize;
const CostType* hsumSub = mem.getHSumBuf(std::max(y - SH2 - 1, src_start_idx));
#if CV_SIMD
for( x = 0; x < width1*Da; x += v_int16::nlanes)
v_store_aligned(C + x, vx_load_aligned(C + x) + vx_load_aligned(hsumAdd + x) - vx_load_aligned(hsumSub + x));
......@@ -1728,12 +1774,15 @@ void SGBM3WayMainLoop::getRawMatchingCost(CostType* C, // target cost-volume row
// performing SGM cost accumulation from left to right (result is stored in leftBuf) and
// in-place cost accumulation from top to bottom (result is stored in topBuf)
template<bool x_nlanes>
inline void accumulateCostsLeftTop(CostType* leftBuf, CostType* leftBuf_prev, CostType* topBuf, CostType* costs,
CostType& leftMinCost, CostType& topMinCost, int D, int P1, int P2)
void SGBM3WayMainLoop::accumulateCostsLeftTop(const BufferSGBM3Way &mem, int x, CostType& leftMinCost) const
{
CostType *leftBuf = mem.horPassCostVolume + x;
CostType *leftBuf_prev = mem.horPassCostVolume + x - Da;
CostType *topBuf = mem.vertPassCostVolume + x;
CostType *costs = mem.curCostVolumeLine - Da + x;
CostType& topMinCost = mem.vertPassMin[x/Da];
int i = 0;
#if CV_SIMD
int Da = (int)alignSize(D, v_int16::nlanes);
v_int16 P1_reg = vx_setall_s16(cv::saturate_cast<CostType>(P1));
v_int16 leftMinCostP2_reg = vx_setall_s16(cv::saturate_cast<CostType>(leftMinCost+P2));
......@@ -1847,12 +1896,16 @@ inline void accumulateCostsLeftTop(CostType* leftBuf, CostType* leftBuf_prev, Co
// summing rightBuf, topBuf, leftBuf together (the result is stored in leftBuf), as well as finding the
// optimal disparity value with minimum accumulated cost
template<bool x_nlanes>
inline void accumulateCostsRight(CostType* rightBuf, CostType* topBuf, CostType* leftBuf, CostType* costs,
CostType& rightMinCost, int D, int P1, int P2, short& optimal_disp, CostType& min_cost)
void SGBM3WayMainLoop::accumulateCostsRight(const BufferSGBM3Way &mem, int x,
CostType& rightMinCost, short& optimal_disp, CostType& min_cost) const
{
CostType* costs = mem.curCostVolumeLine - Da + x;
CostType* rightBuf = mem.rightPassBuf;
CostType* topBuf = mem.vertPassCostVolume + x;
CostType* leftBuf = mem.horPassCostVolume + x;
int i = 0;
#if CV_SIMD
int Da = (int)alignSize(D, v_int16::nlanes);
v_int16 P1_reg = vx_setall_s16(cv::saturate_cast<CostType>(P1));
v_int16 rightMinCostP2_reg = vx_setall_s16(cv::saturate_cast<CostType>(rightMinCost+P2));
......@@ -1955,6 +2008,7 @@ void SGBM3WayMainLoop::operator () (const Range& range) const
if (D == Da) impl<true>(range);
else impl<false>(range);
}
template<bool x_nlanes>
void SGBM3WayMainLoop::impl(const Range& range) const
{
......@@ -1979,33 +2033,24 @@ void SGBM3WayMainLoop::impl(const Range& range) const
else
dst_offset=0;
Mat cur_buffer = buffers [range.start];
Mat cur_disp = dst_disp[range.start];
cur_disp = Scalar(INVALID_DISP_SCALED);
// prepare buffers:
CostType *curCostVolumeLine, *hsumBuf, *pixDiff;
PixType* tmpBuf;
CostType *horPassCostVolume, *vertPassCostVolume, *vertPassMin, *rightPassBuf, *disp2CostBuf;
short* disp2Buf;
getBufferPointers(cur_buffer,width,width1,Da,img1->channels(),SH2,P2,
curCostVolumeLine,hsumBuf,pixDiff,tmpBuf,horPassCostVolume,
vertPassCostVolume,vertPassMin,rightPassBuf,disp2CostBuf,disp2Buf);
BufferSGBM3Way mem(width1, width, img1->channels(), Da, SH2, P2);
CostType *horPassCostVolume = mem.horPassCostVolume;
// start real processing:
for(int y=src_start_idx;y<src_end_idx;y++)
{
getRawMatchingCost(curCostVolumeLine,hsumBuf,pixDiff,tmpBuf,y,src_start_idx);
getRawMatchingCost(mem, y, src_start_idx);
short* disp_row = (short*)cur_disp.ptr(dst_offset+(y-src_start_idx));
// initialize the auxiliary buffers for the pseudo left-right consistency check:
for(int x=0;x<width;x++)
{
disp2Buf[x] = (short)INVALID_DISP_SCALED;
disp2CostBuf[x] = SHRT_MAX;
mem.disp2Buf[x] = (short)INVALID_DISP_SCALED;
mem.disp2CostBuf[x] = SHRT_MAX;
}
CostType* C = curCostVolumeLine - Da;
CostType prev_min, min_cost;
int d;
short best_d;
......@@ -2014,14 +2059,14 @@ void SGBM3WayMainLoop::impl(const Range& range) const
// forward pass
prev_min=0;
for (int x=Da;x<(1+width1)*Da;x+=Da)
accumulateCostsLeftTop<x_nlanes>(horPassCostVolume+x,horPassCostVolume+x-Da,vertPassCostVolume+x,C+x,prev_min,vertPassMin[x/Da],D,P1,P2);
accumulateCostsLeftTop<x_nlanes>(mem, x, prev_min);
//backward pass
memset(rightPassBuf,0,Da*sizeof(CostType));
mem.clearRightPassBuf();
prev_min=0;
for (int x=width1*Da;x>=Da;x-=Da)
{
accumulateCostsRight<x_nlanes>(rightPassBuf,vertPassCostVolume+x,horPassCostVolume+x,C+x,prev_min,D,P1,P2,best_d,min_cost);
accumulateCostsRight<x_nlanes>(mem, x, prev_min, best_d, min_cost);
if(uniquenessRatio>0)
{
......@@ -2074,10 +2119,10 @@ void SGBM3WayMainLoop::impl(const Range& range) const
d = best_d;
int _x2 = x/Da - 1 + minX1 - d - minD;
if( _x2>=0 && _x2<width && disp2CostBuf[_x2] > min_cost )
if( _x2>=0 && _x2<width && mem.disp2CostBuf[_x2] > min_cost )
{
disp2CostBuf[_x2] = min_cost;
disp2Buf[_x2] = (short)(d + minD);
mem.disp2CostBuf[_x2] = min_cost;
mem.disp2Buf[_x2] = (short)(d + minD);
}
if( 0 < d && d < D-1 )
......@@ -2104,32 +2149,27 @@ void SGBM3WayMainLoop::impl(const Range& range) const
int _d = d1 >> StereoMatcher::DISP_SHIFT;
int d_ = (d1 + DISP_SCALE-1) >> StereoMatcher::DISP_SHIFT;
int _x = x - _d, x_ = x - d_;
if( 0 <= _x && _x < width && disp2Buf[_x] >= minD && std::abs(disp2Buf[_x] - _d) > disp12MaxDiff &&
0 <= x_ && x_ < width && disp2Buf[x_] >= minD && std::abs(disp2Buf[x_] - d_) > disp12MaxDiff )
if( 0 <= _x && _x < width && mem.disp2Buf[_x] >= minD && std::abs(mem.disp2Buf[_x] - _d) > disp12MaxDiff &&
0 <= x_ && x_ < width && mem.disp2Buf[x_] >= minD && std::abs(mem.disp2Buf[x_] - d_) > disp12MaxDiff )
disp_row[x] = (short)INVALID_DISP_SCALED;
}
}
}
static void computeDisparity3WaySGBM( const Mat& img1, const Mat& img2,
Mat& disp1, const StereoSGBMParams& params,
Mat* buffers, int nstripes )
template <uchar nstripes>
static void computeDisparity3WaySGBM(const Mat& img1, const Mat& img2, Mat& disp1, const StereoSGBMParams& params)
{
// precompute a lookup table for the raw matching cost computation:
const int TAB_OFS = 256*4, TAB_SIZE = 256 + TAB_OFS*2;
PixType* clipTab = new PixType[TAB_SIZE];
int ftzero = std::max(params.preFilterCap, 15) | 1;
for(int k = 0; k < TAB_SIZE; k++ )
clipTab[k] = (PixType)(std::min(std::max(k - TAB_OFS, -ftzero), ftzero) + ftzero);
// allocate separate dst_disp arrays to avoid conflicts due to stripe overlap:
int stripe_sz = (int)ceil(img1.rows/(double)nstripes);
int stripe_overlap = (params.SADWindowSize/2+1) + (int)ceil(0.1*stripe_sz);
Mat* dst_disp = new Mat[nstripes];
Mat dst_disp[nstripes];
for(int i=0;i<nstripes;i++)
dst_disp[i].create(stripe_sz+stripe_overlap,img1.cols,CV_16S);
parallel_for_(Range(0,nstripes),SGBM3WayMainLoop(buffers,img1,img2,dst_disp,params,clipTab,nstripes,stripe_overlap));
parallel_for_(
Range(0,nstripes),
SGBM3WayMainLoop(img1,img2,dst_disp,params,stripe_sz,stripe_overlap)
);
//assemble disp1 from dst_disp:
short* dst_row;
......@@ -2140,9 +2180,6 @@ static void computeDisparity3WaySGBM( const Mat& img1, const Mat& img2,
src_row = (short*)dst_disp[i/stripe_sz].ptr(stripe_overlap+i%stripe_sz);
memcpy(dst_row,src_row,disp1.cols*sizeof(short));
}
delete[] clipTab;
delete[] dst_disp;
}
class StereoSGBMImpl CV_FINAL : public StereoSGBM
......@@ -2176,11 +2213,13 @@ public:
Mat disp = disparr.getMat();
if(params.mode==MODE_SGBM_3WAY)
computeDisparity3WaySGBM( left, right, disp, params, buffers, num_stripes );
// the number of stripes is fixed, disregarding the number of threads/processors
// to make the results fully reproducible
computeDisparity3WaySGBM<4>( left, right, disp, params );
else if(params.mode==MODE_HH4)
computeDisparitySGBM_HH4( left, right, disp, params, buffer );
computeDisparitySGBM_HH4( left, right, disp, params );
else
computeDisparitySGBM( left, right, disp, params, buffer );
computeDisparitySGBM( left, right, disp, params );
medianBlur(disp, disp, 3);
......@@ -2259,11 +2298,6 @@ public:
StereoSGBMParams params;
Mat buffer;
// the number of stripes is fixed, disregarding the number of threads/processors
// to make the results fully reproducible:
static const int num_stripes = 4;
Mat buffers[num_stripes];
static const char* name_;
};
......
......@@ -74,6 +74,25 @@ public:
allocate_((void**)(&ptr), static_cast<ushort>(sizeof(T)), count, alignment);
}
/** @brief Zero the contents of a single buffer of the area

@param ptr pointer to a memory block previously bound with BufferArea::allocate

The pointer has to be non-NULL, so BufferArea::commit must be called before this method.
*/
template <typename T>
void zeroFill(T*&ptr)
{
    CV_Assert(ptr);
    void ** const slot = (void**)&ptr;
    zeroFill_(slot);
}
/** @brief Fill all buffers with zeroes
BufferArea::commit must be called before using this method
*/
void zeroFill();
/** @brief Allocate memory and initialize all bound pointers
Each pointer bound to the area with the BufferArea::allocate will be initialized and will be set
......@@ -83,10 +102,18 @@ public:
*/
void commit();
/** @brief Release all memory and unbind all pointers
All memory will be freed and all pointers will be reset to NULL and untied from the area allowing
to call `allocate` and `commit` again.
*/
void release();
private:
BufferArea(const BufferArea &); // = delete
BufferArea &operator=(const BufferArea &); // = delete
void allocate_(void **ptr, ushort type_size, size_t count, ushort alignment);
void zeroFill_(void **ptr);
private:
class Block;
......
......@@ -66,6 +66,16 @@ public:
*ptr = buf;
return static_cast<void*>(static_cast<uchar*>(*ptr) + type_size * count);
}
// A block matches `other` when both pointer slots currently hold the same address.
bool operator==(void **other) const
{
    CV_Assert(ptr && other);
    void * const mine = *ptr;
    void * const theirs = *other;
    return mine == theirs;
}
void zeroFill() const
{
CV_Assert(ptr && *ptr);
memset(static_cast<uchar*>(*ptr), 0, count * type_size);
}
private:
void **ptr;
void * raw_mem;
......@@ -85,10 +95,7 @@ BufferArea::BufferArea(bool safe_) :
BufferArea::~BufferArea()
{
for(std::vector<Block>::const_iterator i = blocks.begin(); i != blocks.end(); ++i)
i->cleanup();
if (oneBuf)
fastFree(oneBuf);
release();
}
void BufferArea::allocate_(void **ptr, ushort type_size, size_t count, ushort alignment)
......@@ -100,6 +107,26 @@ void BufferArea::allocate_(void **ptr, ushort type_size, size_t count, ushort al
totalSize += blocks.back().getByteCount();
}
void BufferArea::zeroFill_(void **ptr)
{
for(std::vector<Block>::const_iterator i = blocks.begin(); i != blocks.end(); ++i)
{
if (*i == ptr)
{
i->zeroFill();
break;
}
}
}
void BufferArea::zeroFill()
{
for(std::vector<Block>::const_iterator i = blocks.begin(); i != blocks.end(); ++i)
{
i->zeroFill();
}
}
void BufferArea::commit()
{
if (!safe)
......@@ -116,6 +143,20 @@ void BufferArea::commit()
}
}
void BufferArea::release()
{
for(std::vector<Block>::const_iterator i = blocks.begin(); i != blocks.end(); ++i)
{
i->cleanup();
}
blocks.clear();
if (oneBuf)
{
fastFree(oneBuf);
oneBuf = 0;
}
}
//==================================================================================================
}} // cv::utils::
......@@ -337,6 +337,21 @@ TEST_P(BufferArea, basic)
ASSERT_TRUE(dbl_ptr != NULL);
EXPECT_EQ((size_t)0, (size_t)int_ptr % sizeof(int));
EXPECT_EQ((size_t)0, (size_t)dbl_ptr % sizeof(double));
for (size_t i = 0; i < SZ; ++i)
{
int_ptr[i] = (int)i + 1;
uchar_ptr[i] = (uchar)i + 1;
dbl_ptr[i] = (double)i + 1;
}
area.zeroFill(int_ptr);
area.zeroFill(uchar_ptr);
area.zeroFill(dbl_ptr);
for (size_t i = 0; i < SZ; ++i)
{
EXPECT_EQ((int)0, int_ptr[i]);
EXPECT_EQ((uchar)0, uchar_ptr[i]);
EXPECT_EQ((double)0, dbl_ptr[i]);
}
}
EXPECT_TRUE(int_ptr == NULL);
EXPECT_TRUE(uchar_ptr == NULL);
......
......@@ -47,6 +47,7 @@ The references are:
#include "opencl_kernels_features2d.hpp"
#include "hal_replacement.hpp"
#include "opencv2/core/hal/intrin.hpp"
#include "opencv2/core/utils/buffer_area.private.hpp"
#include "opencv2/core/openvx/ovx_defs.hpp"
......@@ -80,20 +81,26 @@ void FAST_t(InputArray _img, std::vector<KeyPoint>& keypoints, int threshold, bo
for( i = -255; i <= 255; i++ )
threshold_tab[i+255] = (uchar)(i < -threshold ? 1 : i > threshold ? 2 : 0);
AutoBuffer<uchar> _buf((img.cols+16)*3*(sizeof(int) + sizeof(uchar)) + 128);
uchar* buf[3];
buf[0] = _buf.data(); buf[1] = buf[0] + img.cols; buf[2] = buf[1] + img.cols;
int* cpbuf[3];
cpbuf[0] = (int*)alignPtr(buf[2] + img.cols, sizeof(int)) + 1;
cpbuf[1] = cpbuf[0] + img.cols + 1;
cpbuf[2] = cpbuf[1] + img.cols + 1;
memset(buf[0], 0, img.cols*3);
uchar* buf[3] = { 0 };
int* cpbuf[3] = { 0 };
utils::BufferArea area;
for (unsigned idx = 0; idx < 3; ++idx)
{
area.allocate(buf[idx], img.cols);
area.allocate(cpbuf[idx], img.cols + 1);
}
area.commit();
for (unsigned idx = 0; idx < 3; ++idx)
{
memset(buf[idx], 0, img.cols);
}
for(i = 3; i < img.rows-2; i++)
{
const uchar* ptr = img.ptr<uchar>(i) + 3;
uchar* curr = buf[(i - 3)%3];
int* cornerpos = cpbuf[(i - 3)%3];
int* cornerpos = cpbuf[(i - 3)%3] + 1; // cornerpos[-1] is used to store a value
memset(curr, 0, img.cols);
int ncorners = 0;
......@@ -266,7 +273,7 @@ void FAST_t(InputArray _img, std::vector<KeyPoint>& keypoints, int threshold, bo
const uchar* prev = buf[(i - 4 + 3)%3];
const uchar* pprev = buf[(i - 5 + 3)%3];
cornerpos = cpbuf[(i - 4 + 3)%3];
cornerpos = cpbuf[(i - 4 + 3)%3] + 1; // cornerpos[-1] is used to store a value
ncorners = cornerpos[-1];
for( k = 0; k < ncorners; k++ )
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment