Commit 0a1b9573 authored by Vitaly Tuzov

StereoBM algorithm updated to use wide universal intrinsics

parent c2096771
......@@ -216,30 +216,30 @@ prefilterXSobel( const Mat& src, Mat& dst, int ftzero )
dptr0[0] = dptr0[size.width-1] = dptr1[0] = dptr1[size.width-1] = val0;
x = 1;
#if CV_SIMD128
v_int16x8 ftz = v_setall_s16((short) ftzero);
v_int16x8 ftz2 = v_setall_s16((short)(ftzero*2));
v_int16x8 z = v_setzero_s16();
v_int16 ftz = vx_setall_s16((short) ftzero);
v_int16 ftz2 = vx_setall_s16((short)(ftzero*2));
v_int16 z = vx_setzero_s16();
for(; x <= (size.width - 1) - 8; x += 8 )
for(; x <= (size.width - 1) - v_int16::nlanes; x += v_int16::nlanes)
v_int16x8 s00 = v_reinterpret_as_s16(v_load_expand(srow0 + x + 1));
v_int16x8 s01 = v_reinterpret_as_s16(v_load_expand(srow0 + x - 1));
v_int16x8 s10 = v_reinterpret_as_s16(v_load_expand(srow1 + x + 1));
v_int16x8 s11 = v_reinterpret_as_s16(v_load_expand(srow1 + x - 1));
v_int16x8 s20 = v_reinterpret_as_s16(v_load_expand(srow2 + x + 1));
v_int16x8 s21 = v_reinterpret_as_s16(v_load_expand(srow2 + x - 1));
v_int16x8 s30 = v_reinterpret_as_s16(v_load_expand(srow3 + x + 1));
v_int16x8 s31 = v_reinterpret_as_s16(v_load_expand(srow3 + x - 1));
v_int16 s00 = v_reinterpret_as_s16(vx_load_expand(srow0 + x + 1));
v_int16 s01 = v_reinterpret_as_s16(vx_load_expand(srow0 + x - 1));
v_int16 s10 = v_reinterpret_as_s16(vx_load_expand(srow1 + x + 1));
v_int16 s11 = v_reinterpret_as_s16(vx_load_expand(srow1 + x - 1));
v_int16 s20 = v_reinterpret_as_s16(vx_load_expand(srow2 + x + 1));
v_int16 s21 = v_reinterpret_as_s16(vx_load_expand(srow2 + x - 1));
v_int16 s30 = v_reinterpret_as_s16(vx_load_expand(srow3 + x + 1));
v_int16 s31 = v_reinterpret_as_s16(vx_load_expand(srow3 + x - 1));
v_int16x8 d0 = s00 - s01;
v_int16x8 d1 = s10 - s11;
v_int16x8 d2 = s20 - s21;
v_int16x8 d3 = s30 - s31;
v_int16 d0 = s00 - s01;
v_int16 d1 = s10 - s11;
v_int16 d2 = s20 - s21;
v_int16 d3 = s30 - s31;
v_uint16x8 v0 = v_reinterpret_as_u16(v_max(v_min(d0 + d1 + d1 + d2 + ftz, ftz2), z));
v_uint16x8 v1 = v_reinterpret_as_u16(v_max(v_min(d1 + d2 + d2 + d3 + ftz, ftz2), z));
v_uint16 v0 = v_reinterpret_as_u16(v_max(v_min(d0 + d1 + d1 + d2 + ftz, ftz2), z));
v_uint16 v1 = v_reinterpret_as_u16(v_max(v_min(d1 + d2 + d2 + d3 + ftz, ftz2), z));
v_pack_store(dptr0 + x, v0);
v_pack_store(dptr1 + x, v1);
......@@ -262,10 +262,10 @@ prefilterXSobel( const Mat& src, Mat& dst, int ftzero )
uchar* dptr = dst.ptr<uchar>(y);
x = 0;
#if CV_SIMD128
v_uint8x16 val0_16 = v_setall_u8(val0);
for(; x <= size.width-16; x+=16 )
v_uint8 val0_16 = vx_setall_u8(val0);
for(; x <= size.width-v_uint8::nlanes; x+=v_uint8::nlanes)
v_store(dptr + x, val0_16);
......@@ -309,13 +309,13 @@ inline int dispDescale(int v1, int v2, int d)
return (int)(v1*256 + (d != 0 ? v2*256/d : 0)); // no need to add 127, this will be converted to float
#if CV_SIMD128
template <typename dType>
static void findStereoCorrespondenceBM_SIMD( const Mat& left, const Mat& right,
Mat& disp, Mat& cost, StereoBMParams& state,
uchar* buf, int _dy0, int _dy1 )
const int ALIGN = 16;
const int ALIGN = CV_SIMD_WIDTH;
int x, y, d;
int wsz = state.SADWindowSize, wsz2 = wsz/2;
int dy0 = MIN(_dy0, wsz2+1), dy1 = MIN(_dy1, wsz2+1);
......@@ -345,7 +345,9 @@ static void findStereoCorrespondenceBM_SIMD( const Mat& left, const Mat& right,
int coststep = cost.data ? (int)(cost.step/sizeof(costbuf)) : 0;
const int TABSZ = 256;
uchar tab[TABSZ];
const v_int16x8 d0_8 = v_int16x8(0,1,2,3,4,5,6,7), dd_8 = v_setall_s16(8);
short v_seq[v_int16::nlanes];
for (short i = 0; i < v_int16::nlanes; ++i)
v_seq[i] = i;
sad = (ushort*)alignPtr(buf + sizeof(sad[0]), ALIGN);
hsad0 = (ushort*)alignPtr(sad + ndisp + 1 + dy0*ndisp, ALIGN);
......@@ -368,20 +370,26 @@ static void findStereoCorrespondenceBM_SIMD( const Mat& left, const Mat& right,
for( y = -dy0; y < height + dy1; y++, hsad += ndisp, cbuf += ndisp, lptr += sstep, rptr += sstep )
int lval = lptr[0];
v_uint8x16 lv = v_setall_u8((uchar)lval);
for( d = 0; d < ndisp; d += 16 )
v_uint8 lv = vx_setall_u8((uchar)lval);
for( d = 0; d <= ndisp - v_uint8::nlanes; d += v_uint8::nlanes )
v_uint8x16 rv = v_load(rptr + d);
v_uint16x8 hsad_l = v_load(hsad + d);
v_uint16x8 hsad_h = v_load(hsad + d + 8);
v_uint8x16 diff = v_absdiff(lv, rv);
v_uint8 diff = v_absdiff(lv, vx_load(rptr + d));
v_store(cbuf + d, diff);
v_uint16x8 diff0, diff1;
v_expand(diff, diff0, diff1);
hsad_l += diff0;
hsad_h += diff1;
v_store(hsad + d, hsad_l);
v_store(hsad + d + 8, hsad_h);
v_store(hsad + d, vx_load(hsad + d) + v_expand_low(diff));
v_store(hsad + d + v_uint16::nlanes, vx_load(hsad + d + v_uint16::nlanes) + v_expand_high(diff));
if( d <= ndisp - v_uint16::nlanes )
v_uint8 diff = v_absdiff(lv, vx_load_low(rptr + d));
v_store_low(cbuf + d, diff);
v_store(hsad + d, vx_load(hsad + d) + v_expand_low(diff));
d += v_uint16::nlanes;
for( ; d < ndisp; d++ )
int diff = abs(lval - rptr[d]);
cbuf[d] = (uchar)diff;
hsad[d] += (ushort)diff;
htext[y] += tab[lval];
......@@ -412,24 +420,27 @@ static void findStereoCorrespondenceBM_SIMD( const Mat& left, const Mat& right,
hsad += ndisp, lptr += sstep, lptr_sub += sstep, rptr += sstep )
int lval = lptr[0];
v_uint8x16 lv = v_setall_u8((uchar)lval);
for( d = 0; d < ndisp; d += 16 )
v_uint8x16 rv = v_load(rptr + d);
v_uint16x8 hsad_l = v_load(hsad + d);
v_uint16x8 hsad_h = v_load(hsad + d + 8);
v_uint8x16 cbs = v_load(cbuf_sub + d);
v_uint8x16 diff = v_absdiff(lv, rv);
v_int16x8 diff_l, diff_h, cbs_l, cbs_h;
v_uint8 lv = vx_setall_u8((uchar)lval);
for( d = 0; d <= ndisp - v_uint8::nlanes; d += v_uint8::nlanes )
v_uint8 diff = v_absdiff(lv, vx_load(rptr + d));
v_int8 cbs = v_reinterpret_as_s8(vx_load(cbuf_sub + d));
v_store(cbuf + d, diff);
v_expand(v_reinterpret_as_s8(diff), diff_l, diff_h);
v_expand(v_reinterpret_as_s8(cbs), cbs_l, cbs_h);
diff_l -= cbs_l;
diff_h -= cbs_h;
hsad_h = v_reinterpret_as_u16(v_reinterpret_as_s16(hsad_h) + diff_h);
hsad_l = v_reinterpret_as_u16(v_reinterpret_as_s16(hsad_l) + diff_l);
v_store(hsad + d, hsad_l);
v_store(hsad + d + 8, hsad_h);
v_store(hsad + d, v_reinterpret_as_u16(v_reinterpret_as_s16(vx_load(hsad + d) + v_expand_low(diff)) - v_expand_low(cbs)));
v_store(hsad + d + v_uint16::nlanes, v_reinterpret_as_u16(v_reinterpret_as_s16(vx_load(hsad + d + v_uint16::nlanes) + v_expand_high(diff)) - v_expand_high(cbs)));
if( d <= ndisp - v_uint16::nlanes)
v_uint8 diff = v_absdiff(lv, vx_load_low(rptr + d));
v_store_low(cbuf + d, diff);
v_store(hsad + d, v_reinterpret_as_u16(v_reinterpret_as_s16(vx_load(hsad + d) + v_expand_low(diff)) - vx_load_expand((schar*)cbuf_sub + d)));
d += v_uint16::nlanes;
for( ; d < ndisp; d++ )
int diff = abs(lval - rptr[d]);
cbuf[d] = (uchar)diff;
hsad[d] = hsad[d] + (ushort)diff - cbuf_sub[d];
htext[y] += tab[lval] - tab[lptr_sub[0]];
......@@ -446,16 +457,24 @@ static void findStereoCorrespondenceBM_SIMD( const Mat& left, const Mat& right,
hsad = hsad0 + (1 - dy0)*ndisp;
for( y = 1 - dy0; y < wsz2; y++, hsad += ndisp )
for( d = 0; d <= ndisp-16; d += 16 )
v_uint16x8 s0 = v_load(sad + d);
v_uint16x8 s1 = v_load(sad + d + 8);
v_uint16x8 t0 = v_load(hsad + d);
v_uint16x8 t1 = v_load(hsad + d + 8);
s0 = s0 + t0;
s1 = s1 + t1;
v_store(sad + d, s0);
v_store(sad + d + 8, s1);
for( d = 0; d <= ndisp-2*v_uint16::nlanes; d += 2*v_uint16::nlanes )
v_store(sad + d, vx_load(sad + d) + vx_load(hsad + d));
v_store(sad + d + v_uint16::nlanes, vx_load(sad + d + v_uint16::nlanes) + vx_load(hsad + d + v_uint16::nlanes));
if( d <= ndisp-v_uint16::nlanes )
v_store(sad + d, vx_load(sad + d) + vx_load(hsad + d));
d += v_uint16::nlanes;
if( d <= ndisp-v_uint16::nlanes/2 )
v_store_low(sad + d, vx_load_low(sad + d) + vx_load_low(hsad + d));
d += v_uint16::nlanes/2;
for( ; d < ndisp; d++ )
sad[d] = sad[d] + hsad[d];
int tsum = 0;
for( y = -wsz2-1; y < wsz2; y++ )
......@@ -467,38 +486,41 @@ static void findStereoCorrespondenceBM_SIMD( const Mat& left, const Mat& right,
int minsad = INT_MAX, mind = -1;
hsad = hsad0 + MIN(y + wsz2, height+dy1-1)*ndisp;
hsad_sub = hsad0 + MAX(y - wsz2 - 1, -dy0)*ndisp;
v_int16x8 minsad8 = v_setall_s16(SHRT_MAX);
v_int16x8 mind8 = v_setall_s16(0), d8 = d0_8;
v_int16 minsad8 = vx_setall_s16(SHRT_MAX);
v_int16 mind8 = vx_setall_s16(0);
for( d = 0; d < ndisp; d += 16 )
for( d = 0; d <= ndisp - 2*v_int16::nlanes; d += 2*v_int16::nlanes )
v_int16x8 u0 = v_reinterpret_as_s16(v_load(hsad_sub + d));
v_int16x8 u1 = v_reinterpret_as_s16(v_load(hsad + d));
v_int16x8 v0 = v_reinterpret_as_s16(v_load(hsad_sub + d + 8));
v_int16x8 v1 = v_reinterpret_as_s16(v_load(hsad + d + 8));
v_int16 sad8 = v_reinterpret_as_s16(vx_load(hsad + d)) - v_reinterpret_as_s16(vx_load(hsad_sub + d)) + v_reinterpret_as_s16(vx_load(sad + d));
v_store(sad + d, v_reinterpret_as_u16(sad8));
mind8 = v_max(mind8, (minsad8 > sad8) & vx_setall_s16((short)d));
minsad8 = v_min(minsad8, sad8);
v_int16x8 usad8 = v_reinterpret_as_s16(v_load(sad + d));
v_int16x8 vsad8 = v_reinterpret_as_s16(v_load(sad + d + 8));
u1 -= u0;
v1 -= v0;
usad8 += u1;
vsad8 += v1;
v_int16x8 mask = minsad8 > usad8;
minsad8 = v_min(minsad8, usad8);
mind8 = v_max(mind8, (mask& d8));
v_store(sad + d, v_reinterpret_as_u16(usad8));
v_store(sad + d + 8, v_reinterpret_as_u16(vsad8));
mask = minsad8 > vsad8;
minsad8 = v_min(minsad8, vsad8);
d8 = d8 + dd_8;
mind8 = v_max(mind8, (mask & d8));
d8 = d8 + dd_8;
sad8 = v_reinterpret_as_s16(vx_load(hsad + d + v_int16::nlanes)) - v_reinterpret_as_s16(vx_load(hsad_sub + d + v_int16::nlanes)) + v_reinterpret_as_s16(vx_load(sad + d + v_int16::nlanes));
v_store(sad + d + v_int16::nlanes, v_reinterpret_as_u16(sad8));
mind8 = v_max(mind8, (minsad8 > sad8) & vx_setall_s16((short)d+v_int16::nlanes));
minsad8 = v_min(minsad8, sad8);
if( d <= ndisp - v_int16::nlanes )
v_int16 sad8 = v_reinterpret_as_s16(vx_load(hsad + d)) - v_reinterpret_as_s16(vx_load(hsad_sub + d)) + v_reinterpret_as_s16(vx_load(sad + d));
v_store(sad + d, v_reinterpret_as_u16(sad8));
mind8 = v_max(mind8, (minsad8 > sad8) & vx_setall_s16((short)d));
minsad8 = v_min(minsad8, sad8);
d += v_int16::nlanes;
minsad = v_reduce_min(minsad8);
v_int16 v_mask = (vx_setall_s16((short)minsad) == minsad8);
mind = v_reduce_min(((mind8+vx_load(v_seq)) & v_mask) | (vx_setall_s16(SHRT_MAX) & ~v_mask));
for( ; d < ndisp; d++ )
int sad8 = (int)(hsad[d]) - hsad_sub[d] + sad[d];
sad[d] = (ushort)sad8;
if(minsad > sad8)
mind = d;
minsad = sad8;
tsum += htext[y + wsz2] - htext[y - wsz2 - 1];
......@@ -508,41 +530,42 @@ static void findStereoCorrespondenceBM_SIMD( const Mat& left, const Mat& right,
ushort CV_DECL_ALIGNED(16) minsad_buf[8], mind_buf[8];
v_store(minsad_buf, v_reinterpret_as_u16(minsad8));
v_store(mind_buf, v_reinterpret_as_u16(mind8));
for( d = 0; d < 8; d++ )
if(minsad > (int)minsad_buf[d] || (minsad == (int)minsad_buf[d] && mind > mind_buf[d]))
minsad = minsad_buf[d];
mind = mind_buf[d];
if( uniquenessRatio > 0 )
int thresh = minsad + (minsad * uniquenessRatio/100);
v_int32x4 thresh4 = v_setall_s32(thresh + 1);
v_int32x4 d1 = v_setall_s32(mind-1), d2 = v_setall_s32(mind+1);
v_int32x4 dd_4 = v_setall_s32(4);
v_int32x4 d4 = v_int32x4(0,1,2,3);
v_int32x4 mask4;
for( d = 0; d < ndisp; d += 8 )
v_int16x8 sad8 = v_reinterpret_as_s16(v_load(sad + d));
v_int32x4 sad4_l, sad4_h;
v_expand(sad8, sad4_l, sad4_h);
mask4 = thresh4 > sad4_l;
mask4 = mask4 & ((d1 > d4) | (d4 > d2));
if( v_check_any(mask4) )
v_int32 thresh4 = vx_setall_s32(thresh + 1);
v_int32 d1 = vx_setall_s32(mind-1), d2 = vx_setall_s32(mind+1);
v_int32 dd_4 = vx_setall_s32(v_int32::nlanes);
v_int32 d4 = vx_load_expand(v_seq);
for( d = 0; d <= ndisp - v_int16::nlanes; d += v_int16::nlanes )
v_int32 sad4_l, sad4_h;
v_expand(v_reinterpret_as_s16(vx_load(sad + d)), sad4_l, sad4_h);
if( v_check_any((thresh4 > sad4_l) & ((d1 > d4) | (d4 > d2))) )
d4 += dd_4;
mask4 = thresh4 > sad4_h;
mask4 = mask4 & ((d1 > d4) | (d4 > d2));
if( v_check_any(mask4) )
if( v_check_any((thresh4 > sad4_h) & ((d1 > d4) | (d4 > d2))) )
d4 += dd_4;
if( d <= ndisp - v_int16::nlanes )
dptr[y*dstep] = FILTERED;
if( d <= ndisp - v_int32::nlanes )
v_int32 sad4_l = vx_load_expand((short*)sad + d);
if (v_check_any((thresh4 > sad4_l) & ((d1 > d4) | (d4 > d2))))
d += v_int16::nlanes;
for( ; d < ndisp; d++ )
if( (thresh + 1) > sad[d] && ((mind - 1) > d || d > (mind + 1)) )
if( d < ndisp )
dptr[y*dstep] = FILTERED;
......@@ -571,7 +594,7 @@ findStereoCorrespondenceBM( const Mat& left, const Mat& right,
uchar* buf, int _dy0, int _dy1 )
const int ALIGN = 16;
const int ALIGN = CV_SIMD_WIDTH;
int x, y, d;
int wsz = state.SADWindowSize, wsz2 = wsz/2;
int dy0 = MIN(_dy0, wsz2+1), dy1 = MIN(_dy1, wsz2+1);
......@@ -587,12 +610,6 @@ findStereoCorrespondenceBM( const Mat& left, const Mat& right,
const int disp_shift = dispShiftTemplate<mType>::value;
mType FILTERED = (mType)((mindisp - 1) << disp_shift);
#if CV_SIMD128
CV_Assert (ndisp % 8 == 0);
int *sad, *hsad0, *hsad, *hsad_sub, *htext;
uchar *cbuf0, *cbuf;
const uchar* lptr0 = left.ptr() + lofs;
......@@ -607,6 +624,13 @@ findStereoCorrespondenceBM( const Mat& left, const Mat& right,
const int TABSZ = 256;
uchar tab[TABSZ];
int v_seq[v_int32::nlanes];
for (int i = 0; i < v_int32::nlanes; ++i)
v_seq[i] = i;
v_int32 d0_4 = vx_load(v_seq), dd_4 = vx_setall_s32(v_int32::nlanes);
sad = (int*)alignPtr(buf + sizeof(sad[0]), ALIGN);
hsad0 = (int*)alignPtr(sad + ndisp + 1 + dy0*ndisp, ALIGN);
htext = (int*)alignPtr((int*)(hsad0 + (height+dy1)*ndisp) + wsz2 + 2, ALIGN);
......@@ -628,22 +652,22 @@ findStereoCorrespondenceBM( const Mat& left, const Mat& right,
int lval = lptr[0];
d = 0;
#if CV_SIMD128
v_uint8x16 lv = v_setall_u8((uchar)lval);
v_uint8 lv = vx_setall_u8((uchar)lval);
for( ; d <= ndisp - 16; d += 16 )
for( ; d <= ndisp - v_uint8::nlanes; d += v_uint8::nlanes )
v_uint8x16 rv = v_load(rptr + d);
v_int32x4 hsad_0 = v_load(hsad + d);
v_int32x4 hsad_1 = v_load(hsad + d + 4);
v_int32x4 hsad_2 = v_load(hsad + d + 8);
v_int32x4 hsad_3 = v_load(hsad + d + 12);
v_uint8x16 diff = v_absdiff(lv, rv);
v_uint8 rv = vx_load(rptr + d);
v_int32 hsad_0 = vx_load(hsad + d);
v_int32 hsad_1 = vx_load(hsad + d + v_int32::nlanes);
v_int32 hsad_2 = vx_load(hsad + d + 2*v_int32::nlanes);
v_int32 hsad_3 = vx_load(hsad + d + 3*v_int32::nlanes);
v_uint8 diff = v_absdiff(lv, rv);
v_store(cbuf + d, diff);
v_uint16x8 diff0, diff1;
v_uint32x4 diff00, diff01, diff10, diff11;
v_uint16 diff0, diff1;
v_uint32 diff00, diff01, diff10, diff11;
v_expand(diff, diff0, diff1);
v_expand(diff0, diff00, diff01);
v_expand(diff1, diff10, diff11);
......@@ -654,9 +678,9 @@ findStereoCorrespondenceBM( const Mat& left, const Mat& right,
hsad_3 += v_reinterpret_as_s32(diff11);
v_store(hsad + d, hsad_0);
v_store(hsad + d + 4, hsad_1);
v_store(hsad + d + 8, hsad_2);
v_store(hsad + d + 12, hsad_3);
v_store(hsad + d + v_int32::nlanes, hsad_1);
v_store(hsad + d + 2*v_int32::nlanes, hsad_2);
v_store(hsad + d + 3*v_int32::nlanes, hsad_3);
......@@ -696,22 +720,22 @@ findStereoCorrespondenceBM( const Mat& left, const Mat& right,
int lval = lptr[0];
d = 0;
#if CV_SIMD128
v_uint8x16 lv = v_setall_u8((uchar)lval);
for( ; d <= ndisp - 16; d += 16 )
v_uint8x16 rv = v_load(rptr + d);
v_int32x4 hsad_0 = v_load(hsad + d);
v_int32x4 hsad_1 = v_load(hsad + d + 4);
v_int32x4 hsad_2 = v_load(hsad + d + 8);
v_int32x4 hsad_3 = v_load(hsad + d + 12);
v_uint8x16 cbs = v_load(cbuf_sub + d);
v_uint8x16 diff = v_absdiff(lv, rv);
v_uint8 lv = vx_setall_u8((uchar)lval);
for( ; d <= ndisp - v_uint8::nlanes; d += v_uint8::nlanes )
v_uint8 rv = vx_load(rptr + d);
v_int32 hsad_0 = vx_load(hsad + d);
v_int32 hsad_1 = vx_load(hsad + d + v_int32::nlanes);
v_int32 hsad_2 = vx_load(hsad + d + 2*v_int32::nlanes);
v_int32 hsad_3 = vx_load(hsad + d + 3*v_int32::nlanes);
v_uint8 cbs = vx_load(cbuf_sub + d);
v_uint8 diff = v_absdiff(lv, rv);
v_store(cbuf + d, diff);
v_uint16x8 diff0, diff1, cbs0, cbs1;
v_int32x4 diff00, diff01, diff10, diff11, cbs00, cbs01, cbs10, cbs11;
v_uint16 diff0, diff1, cbs0, cbs1;
v_int32 diff00, diff01, diff10, diff11, cbs00, cbs01, cbs10, cbs11;
v_expand(diff, diff0, diff1);
v_expand(cbs, cbs0, cbs1);
v_expand(v_reinterpret_as_s16(diff0), diff00, diff01);
......@@ -719,19 +743,19 @@ findStereoCorrespondenceBM( const Mat& left, const Mat& right,
v_expand(v_reinterpret_as_s16(cbs0), cbs00, cbs01);
v_expand(v_reinterpret_as_s16(cbs1), cbs10, cbs11);
v_int32x4 diff_0 = diff00 - cbs00;
v_int32x4 diff_1 = diff01 - cbs01;
v_int32x4 diff_2 = diff10 - cbs10;
v_int32x4 diff_3 = diff11 - cbs11;
v_int32 diff_0 = diff00 - cbs00;
v_int32 diff_1 = diff01 - cbs01;
v_int32 diff_2 = diff10 - cbs10;
v_int32 diff_3 = diff11 - cbs11;
hsad_0 += diff_0;
hsad_1 += diff_1;
hsad_2 += diff_2;
hsad_3 += diff_3;
v_store(hsad + d, hsad_0);
v_store(hsad + d + 4, hsad_1);
v_store(hsad + d + 8, hsad_2);
v_store(hsad + d + 12, hsad_3);
v_store(hsad + d + v_int32::nlanes, hsad_1);
v_store(hsad + d + 2*v_int32::nlanes, hsad_2);
v_store(hsad + d + 3*v_int32::nlanes, hsad_3);
......@@ -758,18 +782,18 @@ findStereoCorrespondenceBM( const Mat& left, const Mat& right,
for( y = 1 - dy0; y < wsz2; y++, hsad += ndisp )
d = 0;
#if CV_SIMD128
for( d = 0; d <= ndisp-8; d += 8 )
for( d = 0; d <= ndisp-2*v_int32::nlanes; d += 2*v_int32::nlanes )
v_int32x4 s0 = v_load(sad + d);
v_int32x4 s1 = v_load(sad + d + 4);
v_int32x4 t0 = v_load(hsad + d);
v_int32x4 t1 = v_load(hsad + d + 4);
v_int32 s0 = vx_load(sad + d);
v_int32 s1 = vx_load(sad + d + v_int32::nlanes);
v_int32 t0 = vx_load(hsad + d);
v_int32 t1 = vx_load(hsad + d + v_int32::nlanes);
s0 += t0;
s1 += t1;
v_store(sad + d, s0);
v_store(sad + d + 4, s1);
v_store(sad + d + v_int32::nlanes, s1);
......@@ -787,50 +811,31 @@ findStereoCorrespondenceBM( const Mat& left, const Mat& right,
hsad = hsad0 + MIN(y + wsz2, height+dy1-1)*ndisp;
hsad_sub = hsad0 + MAX(y - wsz2 - 1, -dy0)*ndisp;
d = 0;
#if CV_SIMD128
v_int32x4 d0_4 = v_int32x4(0, 1, 2, 3);
v_int32x4 dd_4 = v_setall_s32(4);
v_int32x4 minsad4 = v_setall_s32(INT_MAX);
v_int32x4 mind4 = v_setall_s32(0), d4 = d0_4;
v_int32 minsad4 = vx_setall_s32(INT_MAX);
v_int32 mind4 = vx_setall_s32(0), d4 = d0_4;
for( ; d <= ndisp - 8; d += 8 )
for( ; d <= ndisp - 2*v_int32::nlanes; d += 2*v_int32::nlanes )
v_int32x4 u0 = v_load(hsad_sub + d);
v_int32x4 u1 = v_load(hsad + d);
v_int32x4 v0 = v_load(hsad_sub + d + 4);
v_int32x4 v1 = v_load(hsad + d + 4);
v_int32x4 usad4 = v_load(sad + d);
v_int32x4 vsad4 = v_load(sad + d + 4);
u1 -= u0;
v1 -= v0;
usad4 += u1;
vsad4 += v1;
v_store(sad + d, usad4);
v_store(sad + d + 4, vsad4);
v_int32x4 mask = minsad4 > usad4;
minsad4 = v_min(minsad4, usad4);
mind4 = v_select(mask, d4, mind4);
v_int32 sad4 = vx_load(sad + d) + vx_load(hsad + d) - vx_load(hsad_sub + d);
v_store(sad + d, sad4);
mind4 = v_select(minsad4 > sad4, d4, mind4);
minsad4 = v_min(minsad4, sad4);
d4 += dd_4;
mask = minsad4 > vsad4;
minsad4 = v_min(minsad4, vsad4);
mind4 = v_select(mask, d4, mind4);
sad4 = vx_load(sad + d + v_int32::nlanes) + vx_load(hsad + d + v_int32::nlanes) - vx_load(hsad_sub + d + v_int32::nlanes);
v_store(sad + d + v_int32::nlanes, sad4);
mind4 = v_select(minsad4 > sad4, d4, mind4);
minsad4 = v_min(minsad4, sad4);
d4 += dd_4;
int CV_DECL_ALIGNED(16) minsad_buf[4], mind_buf[4];
int CV_DECL_ALIGNED(CV_SIMD_WIDTH) minsad_buf[v_int32::nlanes], mind_buf[v_int32::nlanes];
v_store(minsad_buf, minsad4);
v_store(mind_buf, mind4);
if(minsad_buf[0] < minsad || (minsad == minsad_buf[0] && mind_buf[0] < mind)) { minsad = minsad_buf[0]; mind = mind_buf[0]; }
if(minsad_buf[1] < minsad || (minsad == minsad_buf[1] && mind_buf[1] < mind)) { minsad = minsad_buf[1]; mind = mind_buf[1]; }
if(minsad_buf[2] < minsad || (minsad == minsad_buf[2] && mind_buf[2] < mind)) { minsad = minsad_buf[2]; mind = mind_buf[2]; }
if(minsad_buf[3] < minsad || (minsad == minsad_buf[3] && mind_buf[3] < mind)) { minsad = minsad_buf[3]; mind = mind_buf[3]; }
for (int i = 0; i < v_int32::nlanes; ++i)
if(minsad_buf[i] < minsad || (minsad == minsad_buf[i] && mind_buf[i] < mind)) { minsad = minsad_buf[i]; mind = mind_buf[i]; }
for( ; d < ndisp; d++ )
......@@ -1027,7 +1032,7 @@ struct FindStereoCorrespInvoker : public ParallelLoopBody
Mat disp_i = disp->rowRange(row0, row1);
Mat cost_i = state->disp12MaxDiff >= 0 ? cost->rowRange(row0, row1) : Mat();
#if CV_SIMD128
if (useShorts)
if( disp_i.type() == CV_16S)
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment