Unverified Commit 4b632022 authored by Tobias Senst, committed by GitHub

Merge pull request #2476 from tsenst:optimize_performance_rlof

Rework RLOF by using HAL universal intrinsics

* * distinguish between SSE2 and SSE 4.1 support
* SSE2 now implements a non-parallelized _blendv function and allows compiling with SSE2 instructions

* * add interface function to enable/disable the M-estimator framework
* fix bug in blendv functions

* * make use of _mm_store and _mm_load functions in the blendv_ps and blendv_epi functions to fix a compiler error

* * implement substitutes for _mm_cvtepi8_epi16 and _mm_cvtepi16_epi32 when compiling with less than SSE 4.1 support

* * implement a substitute for _mm_abs_epi16 when compiling with less than SSE 3 support

* * move _mm_abs_epi16 to the SSE4 requirement

* * add HAL intrinsic functions for RLOF ICA
* first tests are OK

* * HAL intrinsic functions -> RLOF ICA is ready now

* * add HAL functions for RLOF illumination model

* * add HAL for PLK ILL

* * add HAL functions for PLK ICA

* * commit HAL functions for BEPLK ICA

* * add HAL functions for BERLOF ICA

* * add HAL functions for BEPLK ILL

* * remove unused #ifdef

* * remove whitespace and unused variables

* * remove unused variables

* * remove the get4BitMask function, which contained pure SSE instructions

* * remove buffIdx, which was initialized but never used

* * fix replacement of #ifdef CV_SIMD128 with #if CV_SIMD128
* rename useMEstimator to setUseMEstimator

* * remove whitespace

* * remove unused variable abss

* * remove unused value

* * rename W_BITS1 to W_BITS since they contain the same value
parent 8ab145c4
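
For context on the SSE2 fallback mentioned in the first commits above, the snippet below is a minimal, hedged sketch of one common way to emulate the SSE4.1 _mm_blendv_ps on plain SSE2. The helper name blendv_ps_sse2 is hypothetical and the exact implementation in the intermediate commits may differ; the final patch replaces such raw intrinsics with HAL universal intrinsics altogether.

#include <emmintrin.h>  // SSE2 only

// Emulates _mm_blendv_ps(a, b, mask) for masks whose lanes are all-ones or all-zeros
// (as produced by SSE comparison intrinsics): picks b where the mask is set, a elsewhere.
static inline __m128 blendv_ps_sse2(__m128 a, __m128 b, __m128 mask)
{
    return _mm_or_ps(_mm_and_ps(mask, b), _mm_andnot_ps(mask, a));
}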
......@@ -66,12 +66,12 @@ public:
RLOFOpticalFlowParameter()
:solverType(ST_BILINEAR)
,supportRegionType(SR_CROSS)
,normSigma0(3.2f)
,normSigma1(7.f)
,normSigma0(std::numeric_limits<float>::max())
,normSigma1(std::numeric_limits<float>::max())
,smallWinSize(9)
,largeWinSize(21)
,crossSegmentationThreshold(25)
,maxLevel(5)
,maxLevel(4)
,useInitialFlow(false)
,useIlluminationModel(true)
,useGlobalMotionPrior(true)
......@@ -90,13 +90,13 @@ public:
*/
float normSigma0;
/**< &sigma paramter of the shrinked Hampel norm introduced in @cite Senst2012. If
/**< &sigma parameter of the shrinked Hampel norm introduced in @cite Senst2012. If
* &sigma = std::numeric_limits<float>::max() the least-squares estimator will be used
* instead of the M-estimator. Although the M-estimator is more robust against outliers in the support
* region, the least-squares estimator is faster to compute.
*/
float normSigma1;
/**< &sigma paramter of the shrinked Hampel norm introduced in @cite Senst2012. If
/**< &sigma parameter of the shrinked Hampel norm introduced in @cite Senst2012. If
* &sigma = std::numeric_limits<float>::max() the least-squares estimator will be used
* instead of the M-estimator. Although the M-estimator is more robust against outliers in the support
* region, the least-squares estimator is faster to compute.
......@@ -151,6 +151,14 @@ public:
* See @cite Senst2016 for more details.
*/
//! @brief Enable M-estimator or disable and use least-square estimator.
/** Enables the M-estimator by setting the sigma parameters to (3.2, 7.0). Disabling the M-estimator can reduce
* runtime, while enabling it can improve accuracy.
* @param val If true the M-estimator is used, otherwise the least-squares estimator is used.
* @see setNormSigma0, setNormSigma1
*/
CV_WRAP void setUseMEstimator(bool val);
CV_WRAP void setSolverType(SolverType val);
CV_WRAP SolverType getSolverType() const;
......@@ -216,9 +224,8 @@ public:
* For the RLOF configuration see optflow::RLOFOpticalFlowParameter for further details.
* Parameters have been described in @cite Senst2012 @cite Senst2013 @cite Senst2014 and @cite Senst2016.
*
* @note SIMD parallelization is only available when compiling with SSE4.1. If the grid size is set to (1,1) and the
* forward backward threshold <= 0 that the dense optical flow field is purely.
* computed with the RLOF.
* @note If the grid size is set to (1,1) and the forward backward threshold <= 0, then a pixelwise dense optical flow field is
* computed by RLOF without using interpolation.
*
* @see optflow::calcOpticalFlowDenseRLOF(), optflow::RLOFOpticalFlowParameter
*/
......
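
As a quick orientation for the interface changes above, here is a hedged usage sketch based only on the signatures visible in this diff and in the perf test below; parameter values are illustrative and error handling is omitted.

#include <opencv2/core.hpp>
#include <opencv2/optflow.hpp>

void denseRLOFExample(const cv::Mat& frame0, const cv::Mat& frame1) // 8-bit frames of equal size
{
    using namespace cv::optflow;
    cv::Ptr<RLOFOpticalFlowParameter> param = cv::makePtr<RLOFOpticalFlowParameter>();
    param->setUseMEstimator(false); // new setter: fall back to the least-squares estimator
    cv::Mat flow;
    // Grid step (1,1) and a non-positive forward-backward threshold yield the purely
    // pixelwise RLOF field described in the @note above (no interpolation step).
    calcOpticalFlowDenseRLOF(frame0, frame1, flow, param,
                             0.f /*forwardBackwardThreshold*/,
                             cv::Size(1, 1) /*gridStep*/,
                             INTERP_EPIC);
}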
......@@ -47,7 +47,7 @@ typedef tuple<std::string, int> INTERP_GRID_Dense_t;
typedef TestBaseWithParam<INTERP_GRID_Dense_t> INTERP_GRID_Dense;
PERF_TEST_P(INTERP_GRID_Dense, OpticalFlow_DenseRLOF,
testing::Combine(
testing::Values<std::string>("INTERP_EPIC", "INTERP_GEO"),
testing::Values<std::string>("INTERP_EPIC", "INTERP_GEO", "INTERP_RIC"),
testing::Values<int>(4,10))
)
{
......@@ -63,6 +63,8 @@ PERF_TEST_P(INTERP_GRID_Dense, OpticalFlow_DenseRLOF,
interp_type = INTERP_EPIC;
if (get<0>(GetParam()) == "INTERP_GEO")
interp_type = INTERP_GEO;
if (get<0>(GetParam()) == "INTERP_RIC")
interp_type = INTERP_RIC;
PERF_SAMPLE_BEGIN()
calcOpticalFlowDenseRLOF(frame1, frame2,flow, param, 1.0f, Size(get<1>(GetParam()), get<1>(GetParam())), interp_type);
PERF_SAMPLE_END()
......
......@@ -17,7 +17,6 @@ class TrackerInvoker : public cv::ParallelLoopBody
{
public:
TrackerInvoker(
const Mat& _prevImg,
const Mat& _prevDeriv,
const Mat& _nextImg,
......@@ -73,7 +72,7 @@ public:
int winMaskwidth = roundUp(winSize.width, 16);
cv::Mat winMaskMatBuf(winMaskwidth, winMaskwidth, tCVMaskType);
winMaskMatBuf.setTo(1);
const float FLT_SCALE = (1.f/(1 << 16));//(1.f/(1 << 20)); // 20
const float FLT_SCALE = (1.f/(1 << 16));
int cn = I.channels(), cn2 = cn*2;
int winbufwidth = roundUp(winSize.width, 16);
......@@ -137,7 +136,7 @@ public:
float a = prevPt.x - iprevPt.x;
float b = prevPt.y - iprevPt.y;
const int W_BITS = 14, W_BITS1 = 14;
const int W_BITS = 14;
int iw00 = cvRound((1.f - a)*(1.f - b)*(1 << W_BITS));
int iw01 = cvRound(a*(1.f - b)*(1 << W_BITS));
int iw10 = cvRound((1.f - a)*b*(1 << W_BITS));
......@@ -151,93 +150,8 @@ public:
float w1 = 0, w2 = 0; // -IyI
float dI = 0; // I^2
float D = 0;
#ifdef RLOF_SSE
__m128i qw0 = _mm_set1_epi32(iw00 + (iw01 << 16));
__m128i qw1 = _mm_set1_epi32(iw10 + (iw11 << 16));
__m128i z = _mm_setzero_si128();
__m128i qdelta_d = _mm_set1_epi32(1 << (W_BITS1-1));
__m128i qdelta = _mm_set1_epi32(1 << (W_BITS1-5-1));
__m128i mmMask4_epi32;
__m128i mmMaskSet_epi16 = _mm_set1_epi16(std::numeric_limits<unsigned short>::max());
get4BitMask(winSize.width, mmMask4_epi32);
#endif
// extract the patch from the first image, compute covariance matrix of derivatives
int x, y;
for( y = 0; y < winSize.height; y++ )
{
const uchar* src = I.ptr<uchar>(y + iprevPt.y, 0) + iprevPt.x*cn;
const uchar* src1 = I.ptr<uchar>(y + iprevPt.y + 1, 0) + iprevPt.x*cn;
const short* dsrc = derivI.ptr<short>(y + iprevPt.y, 0) + iprevPt.x*cn2;
const short* dsrc1 = derivI.ptr<short>(y + iprevPt.y + 1, 0) + iprevPt.x*cn2;
short* Iptr = IWinBuf.ptr<short>(y, 0);
short* dIptr = derivIWinBuf.ptr<short>(y, 0);
const tMaskType* maskPtr = winMaskMat.ptr<tMaskType>(y, 0);
x = 0;
#ifdef RLOF_SSE
for( ; x <= winBufSize.width*cn - 4; x += 4, dsrc += 4*2, dsrc1 += 8, dIptr += 4*2 )
{
__m128i mask_0_7_epi16 = _mm_mullo_epi16(_mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*)(maskPtr+x))), mmMaskSet_epi16);
__m128i mask_0_3_epi16 = _mm_unpacklo_epi16(mask_0_7_epi16, mask_0_7_epi16);
__m128i v00, v01, v10, v11, t0, t1;
v00 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x)), z);
v01 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x + cn)), z);
v10 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src1 + x)), z);
v11 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src1 + x + cn)), z);
t0 = _mm_add_epi32(_mm_madd_epi16(_mm_unpacklo_epi16(v00, v01), qw0),
_mm_madd_epi16(_mm_unpacklo_epi16(v10, v11), qw1));
t0 = _mm_srai_epi32(_mm_add_epi32(t0, qdelta), W_BITS1-5);
if( x + 4 > winSize.width)
{
t0 = _mm_and_si128(t0, mmMask4_epi32);
}
t0 = _mm_and_si128(t0, mask_0_3_epi16);
_mm_storel_epi64((__m128i*)(Iptr + x), _mm_packs_epi32(t0,t0));
v00 = _mm_loadu_si128((const __m128i*)(dsrc));
v01 = _mm_loadu_si128((const __m128i*)(dsrc + cn2));
v10 = _mm_loadu_si128((const __m128i*)(dsrc1));
v11 = _mm_loadu_si128((const __m128i*)(dsrc1 + cn2));
t0 = _mm_add_epi32(_mm_madd_epi16(_mm_unpacklo_epi16(v00, v01), qw0),
_mm_madd_epi16(_mm_unpacklo_epi16(v10, v11), qw1));
t1 = _mm_add_epi32(_mm_madd_epi16(_mm_unpackhi_epi16(v00, v01), qw0),
_mm_madd_epi16(_mm_unpackhi_epi16(v10, v11), qw1));
t0 = _mm_srai_epi32(_mm_add_epi32(t0, qdelta_d), W_BITS1);
t1 = _mm_srai_epi32(_mm_add_epi32(t1, qdelta_d), W_BITS1);
v00 = _mm_packs_epi32(t0, t1); // Ix0 Iy0 Ix1 Iy1 ...
if( x + 4 > winSize.width)
{
v00 = _mm_and_si128(v00, mmMask4_epi32);
}
v00 = _mm_and_si128(v00, mask_0_3_epi16);
_mm_storeu_si128((__m128i*)dIptr, v00);
}
#else
for( ; x < winSize.width*cn; x++, dsrc += 2, dsrc1 += 2, dIptr += 2 )
{
if( maskPtr[x] == 0)
{
dIptr[0] = 0;
dIptr[1] = 0;
continue;
}
int ival = CV_DESCALE(src[x]*iw00 + src[x+cn]*iw01 +
src1[x]*iw10 + src1[x+cn]*iw11, W_BITS1-5);
int ixval = CV_DESCALE(dsrc[0]*iw00 + dsrc[cn2]*iw01 +
dsrc1[0]*iw10 + dsrc1[cn2]*iw11, W_BITS1);
int iyval = CV_DESCALE(dsrc[1]*iw00 + dsrc[cn2+1]*iw01 + dsrc1[1]*iw10 +
dsrc1[cn2+1]*iw11, W_BITS1);
Iptr[x] = (short)ival;
dIptr[0] = (short)ixval;
dIptr[1] = (short)iyval;
}
#endif
}
copyWinBuffers(iw00, iw01, iw10, iw11, winSize, I, derivI, winMaskMat, IWinBuf, derivIWinBuf, iprevPt);
cv::Point2f backUpNextPt = nextPt;
nextPt += halfWin;
......@@ -286,183 +200,136 @@ public:
A12 = 0;
A22 = 0;
}
#ifdef RLOF_SSE
qw0 = _mm_set1_epi32(iw00 + (iw01 << 16));
qw1 = _mm_set1_epi32(iw10 + (iw11 << 16));
__m128 qb0 = _mm_setzero_ps(), qb1 = _mm_setzero_ps(), qb2 = _mm_setzero_ps();
__m128 qb3 = _mm_setzero_ps();
__m128 mmSumW1 = _mm_setzero_ps(), mmSumW2 = _mm_setzero_ps();
__m128 mmSumI = _mm_setzero_ps(), mmSumW = _mm_setzero_ps(), mmSumDI = _mm_setzero_ps();
__m128 mmSumIy = _mm_setzero_ps(), mmSumIx = _mm_setzero_ps();
__m128 mmAxx = _mm_setzero_ps(), mmAxy = _mm_setzero_ps(), mmAyy = _mm_setzero_ps();
#if CV_SIMD128
v_int16x8 vqw0 = v_int16x8((short)(iw00), (short)(iw01), (short)(iw00), (short)(iw01), (short)(iw00), (short)(iw01), (short)(iw00), (short)(iw01));
v_int16x8 vqw1 = v_int16x8((short)(iw10), (short)(iw11), (short)(iw10), (short)(iw11), (short)(iw10), (short)(iw11), (short)(iw10), (short)(iw11));
v_float32x4 vqb0 = v_setzero_f32(), vqb1 = v_setzero_f32(), vqb2 = v_setzero_f32(), vqb3 = v_setzero_f32();
v_float32x4 vsumW1 = v_setzero_f32(), vsumW2 = v_setzero_f32();
v_float32x4 vsumIy = v_setzero_f32(), vsumIx = v_setzero_f32(), vsumI = v_setzero_f32(), vsumDI = v_setzero_f32();
v_float32x4 vAxx = v_setzero_f32(), vAxy = v_setzero_f32(), vAyy = v_setzero_f32();
v_int32x4 vdelta = v_setall_s32(1 << (W_BITS - 5 - 1));
v_int16x8 vmax_val_16 = v_setall_s16(std::numeric_limits<unsigned short>::max());
float gainVal = gainVec.x > 0 ? gainVec.x : -gainVec.x;
int bitShift = gainVec.x == 0 ? 1 : cvCeil(log(200.f / gainVal) / log(2.f));
__m128i mmGainValue_epi16 = _mm_set1_epi16(static_cast<short>(gainVec.x * (float)(1 << bitShift)));
__m128i mmConstValue_epi16 = _mm_set1_epi16(static_cast<short>(gainVec.y));
v_int16x8 vgain_value = v_setall_s16(static_cast<short>(gainVec.x * (float)(1 << bitShift)));
v_int16x8 vconst_value = v_setall_s16(static_cast<short>(gainVec.y));
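// Editorial note (not part of the patch): bitShift picks a fixed-point scale with headroom,
// e.g. for gainVec.x = 0.5 it gives bitShift = cvCeil(log2(200 / 0.5)) = 9, so the gain is
// stored as 0.5 * (1 << 9) = 256 in a 16-bit lane and products are shifted right by 9 again.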
#endif
for( y = 0; y < winSize.height; y++ )
for(int y = 0; y < winSize.height; y++ )
{
const uchar* Jptr = J.ptr<uchar>(y + inextPt.y, inextPt.x*cn);
const uchar* Jptr1 = J.ptr<uchar>(y + inextPt.y + 1, inextPt.x*cn);
const short* Iptr = IWinBuf.ptr<short>(y, 0);
const short* dIptr = derivIWinBuf.ptr<short>(y, 0);
const tMaskType* maskPtr = winMaskMat.ptr<tMaskType>(y, 0);
x = 0;
#ifdef RLOF_SSE
for( ; x <= winSize.width*cn; x += 8, dIptr += 8*2 )
{
// iw = iw << 14 (16384 short / 2)
// iq0 = iw01 |iw00, iw01 |iw00, iw01 |iw00, iw01 |iw00 16 bit
__m128i mask_0_7_epi16 = _mm_mullo_epi16(_mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*)(maskPtr+x))), mmMaskSet_epi16);
// I [0...8]
__m128i diff0, diff1;
__m128i I_0_7_epi16 = _mm_loadu_si128((const __m128i*)(Iptr + x)); // element 0 to 7
__m128i v00 = _mm_unpacklo_epi8(
_mm_loadl_epi64((const __m128i*)(Jptr + x)) // J0, J1, J2, ..., J7, 64-bit zeros
, z); //J0 , 0, J1, 0, J2, 0 ... J7,0
__m128i v01 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x + cn)), z);
__m128i v10 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr1 + x)), z);
__m128i v11 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr1 + x + cn)), z);
__m128i t0 = _mm_add_epi32
(_mm_madd_epi16(
_mm_unpacklo_epi16(v00, v01), //J0, , J1, J1, J2, J2, ... J3 , J4
qw0),
_mm_madd_epi16(_mm_unpacklo_epi16(v10, v11), qw1));
__m128i t1 = _mm_add_epi32(_mm_madd_epi16(_mm_unpackhi_epi16(v00, v01), qw0),
_mm_madd_epi16(_mm_unpackhi_epi16(v10, v11), qw1));
t0 = _mm_srai_epi32(_mm_add_epi32(t0, qdelta), W_BITS1-5);
t1 = _mm_srai_epi32(_mm_add_epi32(t1, qdelta), W_BITS1-5);
__m128i Ilo = _mm_mullo_epi16(mmGainValue_epi16, I_0_7_epi16);
__m128i Ihi = _mm_mulhi_epi16(mmGainValue_epi16, I_0_7_epi16);
__m128i Igain_0_3_epi32 = _mm_srai_epi32(_mm_unpacklo_epi16(Ilo, Ihi), bitShift);
__m128i Igain_4_7_epi32 = _mm_srai_epi32(_mm_unpackhi_epi16(Ilo, Ihi), bitShift);
__m128i Igain_epi16 = _mm_packs_epi32(Igain_0_3_epi32, Igain_4_7_epi32);
// diff = J - I + I*gain.x + gain.y
__m128i mmDiff_epi16 = _mm_add_epi16(
_mm_subs_epi16(_mm_packs_epi32(t0, t1), I_0_7_epi16), // J - I
_mm_add_epi16(Igain_epi16, mmConstValue_epi16));
mmDiff_epi16 = _mm_and_si128(mmDiff_epi16, mask_0_7_epi16);
__m128i Ixy_0 = _mm_loadu_si128((const __m128i*)(dIptr)); // Ix0 Iy0 Ix1 Iy1 ...
__m128i Ixy_1 = _mm_loadu_si128((const __m128i*)(dIptr + 8));
diff1 = _mm_unpackhi_epi16(mmDiff_epi16, mmDiff_epi16); // It4 It4 It5 It5 It6 It6 It7 It7 | It12 It12 It13 It13...
diff0 = _mm_unpacklo_epi16(mmDiff_epi16, mmDiff_epi16); // It0 It0 It1 It1 It2 It2 It3 It3 | It8 It8 It9 It9...
v10 = _mm_mullo_epi16(Ixy_0, diff0);
v11 = _mm_mulhi_epi16(Ixy_0, diff0);
v00 = _mm_unpacklo_epi16(v10, v11);
v10 = _mm_unpackhi_epi16(v10, v11);
qb0 = _mm_add_ps(qb0, _mm_cvtepi32_ps(v00));
qb1 = _mm_add_ps(qb1, _mm_cvtepi32_ps(v10));
v10 = _mm_mullo_epi16(Ixy_1, diff1);
v11 = _mm_mulhi_epi16(Ixy_1, diff1);
v00 = _mm_unpacklo_epi16(v10, v11);
v10 = _mm_unpackhi_epi16(v10, v11);
qb0 = _mm_add_ps(qb0, _mm_cvtepi32_ps(v00));
qb1 = _mm_add_ps(qb1, _mm_cvtepi32_ps(v10));
// for set 1 sigma 1
// b3 += diff * Iptr[x]
Ilo = _mm_mullo_epi16(mmDiff_epi16, I_0_7_epi16);
Ihi = _mm_mulhi_epi16(mmDiff_epi16, I_0_7_epi16);
v00 = _mm_unpacklo_epi16(Ilo, Ihi);
v10 = _mm_unpackhi_epi16(Ilo, Ihi);
qb2 = _mm_add_ps(qb2, _mm_cvtepi32_ps(v00));
qb2 = _mm_add_ps(qb2, _mm_cvtepi32_ps(v10));
// b4+= diff
__m128 mmDiff_0_3_ps = _mm_cvtepi32_ps(_mm_cvtepi16_epi32(mmDiff_epi16)); // (_mm_srai_epi32(_mm_unpacklo_epi16(mmDiff_epi16, mmDiff_epi16),16));
__m128 mmDiff_4_7_ps = _mm_cvtepi32_ps((_mm_srai_epi32(_mm_unpackhi_epi16(mmDiff_epi16, mmDiff_epi16),16)));
qb3 = _mm_add_ps(qb3, mmDiff_0_3_ps);
qb3 = _mm_add_ps(qb3, mmDiff_4_7_ps);
if( j == 0 )
#if CV_SIMD128
for(int x = 0 ; x <= winSize.width*cn; x += 8, dIptr += 8*2 )
{
__m128 mask_0_4_ps = _mm_cvtepi32_ps(_mm_cvtepi16_epi32(mask_0_7_epi16));
__m128 mask_4_7_ps = _mm_cvtepi32_ps((_mm_srai_epi32(_mm_unpackhi_epi16(mask_0_7_epi16, mask_0_7_epi16), 16)));
v_int16x8 vI = v_reinterpret_as_s16(v_load(Iptr + x)), diff0, diff1, diff2;
v_int16x8 v00 = v_reinterpret_as_s16(v_load_expand(Jptr + x));
v_int16x8 v01 = v_reinterpret_as_s16(v_load_expand(Jptr + x + cn));
v_int16x8 v10 = v_reinterpret_as_s16(v_load_expand(Jptr1 + x));
v_int16x8 v11 = v_reinterpret_as_s16(v_load_expand(Jptr1 + x + cn));
v_int16x8 vmask = v_reinterpret_as_s16(v_load_expand(maskPtr + x)) * vmax_val_16;
t0 = _mm_srai_epi32(Ixy_0, 16); // Iy0 Iy1 Iy2 Iy3
t1 = _mm_srai_epi32(_mm_slli_epi32(Ixy_0, 16), 16); // Ix0 Ix1 Ix2 Ix3
v_int32x4 t0, t1;
v_int16x8 t00, t01, t10, t11;
v_zip(v00, v01, t00, t01);
v_zip(v10, v11, t10, t11);
__m128 fy = _mm_blendv_ps(_mm_set1_ps(0), _mm_cvtepi32_ps(t0), mask_0_4_ps);
__m128 fx = _mm_blendv_ps(_mm_set1_ps(0), _mm_cvtepi32_ps(t1), mask_0_4_ps);
//subpixel interpolation
t0 = v_dotprod(t00, vqw0, vdelta) + v_dotprod(t10, vqw1);
t1 = v_dotprod(t01, vqw0, vdelta) + v_dotprod(t11, vqw1);
t0 = t0 >> (W_BITS - 5);
t1 = t1 >> (W_BITS - 5);
// 0 ... 3
__m128 I_ps = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(I_0_7_epi16, I_0_7_epi16), 16));
// diff = J - I
diff0 = v_pack(t0, t1) - vI;
// I*gain.x + gain.x
v_mul_expand(vI, vgain_value, t0, t1);
diff0 = diff0 + v_pack(t0 >> bitShift, t1 >> bitShift) + vconst_value;
diff0 = diff0 & vmask;
v_zip(diff0, diff0, diff2, diff1);
// A11 - A22
mmAyy = _mm_add_ps(mmAyy, _mm_mul_ps(fy, fy));
mmAxy = _mm_add_ps(mmAxy, _mm_mul_ps(fx, fy));
mmAxx = _mm_add_ps(mmAxx, _mm_mul_ps(fx, fx));
v_int32x4 diff0_0;
v_int32x4 diff0_1;
v_expand(diff0, diff0_0, diff0_1);
// sumIx and sumIy
mmSumIx = _mm_add_ps(mmSumIx, fx);
mmSumIy = _mm_add_ps(mmSumIy, fy);
v_int16x8 vIxy_0 = v_reinterpret_as_s16(v_load(dIptr)); // Ix0 Iy0 Ix1 Iy1 ...
v_int16x8 vIxy_1 = v_reinterpret_as_s16(v_load(dIptr + 8));
v_zip(vIxy_0, vIxy_1, v10, v11);
v_zip(diff2, diff1, v00, v01);
mmSumW1 = _mm_add_ps(mmSumW1, _mm_mul_ps(I_ps, fx));
mmSumW2 = _mm_add_ps(mmSumW2, _mm_mul_ps(I_ps, fy));
vqb0 += v_cvt_f32(v_dotprod(v00, v10));
vqb1 += v_cvt_f32(v_dotprod(v01, v11));
// sumI
I_ps = _mm_blendv_ps(_mm_set1_ps(0), I_ps, mask_0_4_ps);
mmSumI = _mm_add_ps(mmSumI,I_ps);
v_int32x4 vI0, vI1;
v_expand(vI, vI0, vI1);
vqb2 += v_cvt_f32(diff0_0 * vI0);
vqb2 += v_cvt_f32(diff0_1 * vI1);
// sumDI
mmSumDI = _mm_add_ps(mmSumDI, _mm_mul_ps( I_ps, I_ps));
vqb3 += v_cvt_f32(diff0_0);
vqb3 += v_cvt_f32(diff0_1);
t0 = _mm_srai_epi32(Ixy_1, 16); // Iy8 Iy9 Iy10 Iy11
t1 = _mm_srai_epi32(_mm_slli_epi32(Ixy_1, 16), 16); // Ix0 Ix1 Ix2 Ix3
if (j == 0)
{
v00 = v_reinterpret_as_s16(v_interleave_pairs(v_reinterpret_as_s32(v_interleave_pairs(vIxy_0))));
v_expand(v00, t1, t0);
fy = _mm_blendv_ps(_mm_set1_ps(0), _mm_cvtepi32_ps(t0), mask_4_7_ps);
fx = _mm_blendv_ps(_mm_set1_ps(0), _mm_cvtepi32_ps(t1), mask_4_7_ps);
v_float32x4 vI_ps = v_cvt_f32(vI0);
// 4 ... 7
I_ps = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(I_0_7_epi16, I_0_7_epi16), 16));
v_float32x4 fy = v_cvt_f32(t0);
v_float32x4 fx = v_cvt_f32(t1);
vAyy = v_muladd(fy, fy, vAyy);
vAxy = v_muladd(fx, fy, vAxy);
vAxx = v_muladd(fx, fx, vAxx);
// sumIx and sumIy
vsumIx += fx;
vsumIy += fy;
vsumW1 += vI_ps * fx;
vsumW2 += vI_ps * fy;
// sumI
vsumI += vI_ps;
// sumDI
vsumDI += vI_ps * vI_ps;
mmAyy = _mm_add_ps(mmAyy, _mm_mul_ps(fy, fy));
mmAxy = _mm_add_ps(mmAxy, _mm_mul_ps(fx, fy));
mmAxx = _mm_add_ps(mmAxx, _mm_mul_ps(fx, fx));
v01 = v_reinterpret_as_s16(v_interleave_pairs(v_reinterpret_as_s32(v_interleave_pairs(vIxy_1))));
v_expand(v01, t1, t0);
vI_ps = v_cvt_f32(vI1);
fy = v_cvt_f32(t0);
fx = v_cvt_f32(t1);
vAyy = v_muladd(fy, fy, vAyy);
vAxy = v_muladd(fx, fy, vAxy);
vAxx = v_muladd(fx, fx, vAxx);
// sumIx and sumIy
mmSumIx = _mm_add_ps(mmSumIx, fx);
mmSumIy = _mm_add_ps(mmSumIy, fy);
vsumIx += fx;
vsumIy += fy;
mmSumW1 = _mm_add_ps(mmSumW1, _mm_mul_ps(I_ps, fx));
mmSumW2 = _mm_add_ps(mmSumW2, _mm_mul_ps(I_ps, fy));
vsumW1 += vI_ps * fx;
vsumW2 += vI_ps * fy;
// sumI
I_ps = _mm_blendv_ps(_mm_set1_ps(0), I_ps, mask_4_7_ps);
mmSumI = _mm_add_ps(mmSumI, I_ps);
vsumI += vI_ps;
// sumW
// sumDI
mmSumDI = _mm_add_ps(mmSumDI, _mm_mul_ps( I_ps, I_ps));
vsumDI += vI_ps * vI_ps;
}
}
#else
for( ; x < winSize.width*cn; x++, dIptr += 2 )
for(int x = 0 ; x < winSize.width*cn; x++, dIptr += 2 )
{
if( maskPtr[x] == 0)
continue;
int J_val = CV_DESCALE(Jptr[x]*iw00 + Jptr[x+cn]*iw01 +
Jptr1[x]*iw10 + Jptr1[x+cn]*iw11,
W_BITS1-5);
W_BITS-5);
int diff = static_cast<int>(J_val - Iptr[x] + Iptr[x] * gainVec.x + gainVec.y);
b1 += (float)(diff*dIptr[0]) * FLT_RESCALE;
......@@ -492,26 +359,19 @@ public:
}
#endif
}
#ifdef RLOF_SSE
float CV_DECL_ALIGNED(16) wbuf[4];
#endif
if( j == 0 )
{
#ifdef RLOF_SSE
_mm_store_ps(wbuf, mmSumW1);
w1 = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3];
_mm_store_ps(wbuf, mmSumW2);
w2 = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3];
_mm_store_ps(wbuf, mmSumDI);
dI = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3];
_mm_store_ps(wbuf, mmSumI);
sumI = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3];
_mm_store_ps(wbuf, mmSumIx);
sumIx = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3];
_mm_store_ps(wbuf, mmSumIy);
sumIy = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3];
_mm_store_ps(wbuf, mmSumW);
sumW = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3];
#if CV_SIMD128
w1 = v_reduce_sum(vsumW1);
w2 = v_reduce_sum(vsumW2);
dI = v_reduce_sum(vsumDI);
sumI = v_reduce_sum(vsumI);
sumIx = v_reduce_sum(vsumIx);
sumIy = v_reduce_sum(vsumIy);
A11 = v_reduce_sum(vAxx);
A12 = v_reduce_sum(vAxy);
A22 = v_reduce_sum(vAyy);
#endif
sumIx *= -FLT_SCALE;
sumIy *= -FLT_SCALE;
......@@ -520,31 +380,19 @@ public:
w1 *= -FLT_SCALE;
w2 *= -FLT_SCALE;
dI *= FLT_SCALE;
#ifdef RLOF_SSE
float CV_DECL_ALIGNED(32) A11buf[4], A12buf[4], A22buf[4];//
_mm_store_ps(A11buf, mmAxx);
_mm_store_ps(A12buf, mmAxy);
_mm_store_ps(A22buf, mmAyy);
A11 = A11buf[0] + A11buf[1] + A11buf[2] + A11buf[3];
A12 = A12buf[0] + A12buf[1] + A12buf[2] + A12buf[3];
A22 = A22buf[0] + A22buf[1] + A22buf[2] + A22buf[3];
#endif
A11 *= FLT_SCALE;
A12 *= FLT_SCALE;
A22 *= FLT_SCALE;
}
#ifdef RLOF_SSE
#if CV_SIMD128
float CV_DECL_ALIGNED(16) bbuf[4];
_mm_store_ps(bbuf, _mm_add_ps(qb0, qb1));
v_store_aligned(bbuf, vqb0 + vqb1);
b1 = bbuf[0] + bbuf[2];
b2 = bbuf[1] + bbuf[3];
_mm_store_ps(bbuf, qb2);
b3 = bbuf[0] + bbuf[1] + bbuf[2] + bbuf[3];
_mm_store_ps(bbuf, qb3);
b4 = bbuf[0] + bbuf[1] + bbuf[2] + bbuf[3];
b3 = v_reduce_sum(vqb2);
b4 = v_reduce_sum(vqb3);
#endif
mismatchMat(0) = b1 * FLT_SCALE;
......@@ -718,8 +566,6 @@ public:
cv::Mat winMaskMatBuf(winMaskwidth, winMaskwidth, tCVMaskType);
winMaskMatBuf.setTo(1);
const float FLT_SCALE = (1.f/(1 << 20)); // 20
int j, cn = I.channels(), cn2 = cn*2;
......@@ -775,7 +621,7 @@ public:
float a = prevPt.x - iprevPt.x;
float b = prevPt.y - iprevPt.y;
const int W_BITS = 14, W_BITS1 = 14;
const int W_BITS = 14;
int iw00 = cvRound((1.f - a)*(1.f - b)*(1 << W_BITS));
int iw01 = cvRound(a*(1.f - b)*(1 << W_BITS));
......@@ -783,124 +629,9 @@ public:
int iw11 = (1 << W_BITS) - iw00 - iw01 - iw10;
float A11 = 0, A12 = 0, A22 = 0;
#ifdef RLOF_SSE
__m128i qw0 = _mm_set1_epi32(iw00 + (iw01 << 16));
__m128i qw1 = _mm_set1_epi32(iw10 + (iw11 << 16));
__m128i z = _mm_setzero_si128();
__m128i qdelta_d = _mm_set1_epi32(1 << (W_BITS1-1));
__m128i qdelta = _mm_set1_epi32(1 << (W_BITS1-5-1));
__m128 qA11 = _mm_setzero_ps(), qA12 = _mm_setzero_ps(), qA22 = _mm_setzero_ps();
__m128i mmMask4_epi32;
get4BitMask(winSize.width, mmMask4_epi32);
#endif
// extract the patch from the first image, compute covariance matrix of derivatives
int x, y;
for( y = 0; y < winSize.height; y++ )
{
const uchar* src = I.ptr<uchar>(y + iprevPt.y, 0) + iprevPt.x*cn;
const uchar* src1 = I.ptr<uchar>(y + iprevPt.y + 1, 0) + iprevPt.x*cn;
const short* dsrc = derivI.ptr<short>(y + iprevPt.y, 0) + iprevPt.x*cn2;
const short* dsrc1 = derivI.ptr<short>(y + iprevPt.y + 1, 0) + iprevPt.x*cn2;
short* Iptr = IWinBuf.ptr<short>(y, 0);
short* dIptr = derivIWinBuf.ptr<short>(y, 0);
const tMaskType* maskPtr = winMaskMat.ptr<tMaskType>(y, 0);
x = 0;
#ifdef RLOF_SSE
for( ; x <= winSize.width*cn; x += 4, dsrc += 4*2, dsrc1 += 8, dIptr += 4*2 )
{
__m128i wMask = _mm_set_epi32(MaskSet * maskPtr[x+3],
MaskSet * maskPtr[x+2],
MaskSet * maskPtr[x+1],
MaskSet * maskPtr[x]);
__m128i v00, v01, v10, v11, t0, t1;
v00 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x)), z);
v01 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x + cn)), z);
v10 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src1 + x)), z);
v11 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src1 + x + cn)), z);
t0 = _mm_add_epi32(_mm_madd_epi16(_mm_unpacklo_epi16(v00, v01), qw0),
_mm_madd_epi16(_mm_unpacklo_epi16(v10, v11), qw1));
t0 = _mm_srai_epi32(_mm_add_epi32(t0, qdelta), W_BITS1-5);
_mm_storel_epi64((__m128i*)(Iptr + x), _mm_packs_epi32(t0,t0));
v00 = _mm_loadu_si128((const __m128i*)(dsrc));
v01 = _mm_loadu_si128((const __m128i*)(dsrc + cn2));
v10 = _mm_loadu_si128((const __m128i*)(dsrc1));
v11 = _mm_loadu_si128((const __m128i*)(dsrc1 + cn2));
t0 = _mm_add_epi32(_mm_madd_epi16(_mm_unpacklo_epi16(v00, v01), qw0),
_mm_madd_epi16(_mm_unpacklo_epi16(v10, v11), qw1));
t1 = _mm_add_epi32(_mm_madd_epi16(_mm_unpackhi_epi16(v00, v01), qw0),
_mm_madd_epi16(_mm_unpackhi_epi16(v10, v11), qw1));
t0 = _mm_srai_epi32(_mm_add_epi32(t0, qdelta_d), W_BITS1);
t1 = _mm_srai_epi32(_mm_add_epi32(t1, qdelta_d), W_BITS1);
v00 = _mm_packs_epi32(t0, t1); // Ix0 Iy0 Ix1 Iy1 ...
if( x + 4 > winSize.width)
{
v00 = _mm_and_si128(v00, mmMask4_epi32);
}
v00 = _mm_and_si128(v00, wMask);
_mm_storeu_si128((__m128i*)dIptr, v00);
t0 = _mm_srai_epi32(v00, 16); // Iy0 Iy1 Iy2 Iy3
t1 = _mm_srai_epi32(_mm_slli_epi32(v00, 16), 16); // Ix0 Ix1 Ix2 Ix3
__m128 fy = _mm_cvtepi32_ps(t0);
__m128 fx = _mm_cvtepi32_ps(t1);
qA22 = _mm_add_ps(qA22, _mm_mul_ps(fy, fy));
qA12 = _mm_add_ps(qA12, _mm_mul_ps(fx, fy));
qA11 = _mm_add_ps(qA11, _mm_mul_ps(fx, fx));
}
#else
for( ; x < winSize.width*cn; x++, dsrc += 2, dsrc1 += 2, dIptr += 2 )
{
if( maskPtr[x] == 0)
{
dIptr[0] = 0;
dIptr[1] = 0;
continue;
}
int ival = CV_DESCALE(src[x]*iw00 + src[x+cn]*iw01 +
src1[x]*iw10 + src1[x+cn]*iw11, W_BITS1-5);
int ixval = CV_DESCALE(dsrc[0]*iw00 + dsrc[cn2]*iw01 +
dsrc1[0]*iw10 + dsrc1[cn2]*iw11, W_BITS1);
int iyval = CV_DESCALE(dsrc[1]*iw00 + dsrc[cn2+1]*iw01 + dsrc1[1]*iw10 +
dsrc1[cn2+1]*iw11, W_BITS1);
Iptr[x] = (short)ival;
dIptr[0] = (short)ixval;
dIptr[1] = (short)iyval;
A11 += (float)(ixval*ixval);
A12 += (float)(ixval*iyval);
A22 += (float)(iyval*iyval);
}
#endif
}
#ifdef RLOF_SSE
float CV_DECL_ALIGNED(16) A11buf[4], A12buf[4], A22buf[4];
_mm_store_ps(A11buf, qA11);
_mm_store_ps(A12buf, qA12);
_mm_store_ps(A22buf, qA22);
A11 += A11buf[0] + A11buf[1] + A11buf[2] + A11buf[3];
A12 += A12buf[0] + A12buf[1] + A12buf[2] + A12buf[3];
A22 += A22buf[0] + A22buf[1] + A22buf[2] + A22buf[3];
#endif
A11 *= FLT_SCALE;
A12 *= FLT_SCALE;
A22 *= FLT_SCALE;
copyWinBuffers(iw00, iw01, iw10, iw11, winSize, I, derivI, winMaskMat, IWinBuf, derivIWinBuf, A11, A22, A12, iprevPt);
float D = A11*A22 - A12*A12;
float minEig = (A22 + A11 - std::sqrt((A11-A22)*(A11-A22) +
......@@ -920,10 +651,6 @@ public:
nextPt += halfWin;
Point2f prevDelta(0,0); //relates to h(t-1)
#ifdef RLOF_SSE
__m128i mmMask0, mmMask1, mmMask;
getWBitMask(winSize.width, mmMask0, mmMask1, mmMask);
#endif
for( j = 0; j < criteria.maxCount; j++ )
{
status[ptidx] = static_cast<uchar>(j);
......@@ -945,12 +672,12 @@ public:
iw10 = cvRound((1.f - a)*b*(1 << W_BITS));
iw11 = (1 << W_BITS) - iw00 - iw01 - iw10;
float b1 = 0, b2 = 0;
#ifdef RLOF_SSE
qw0 = _mm_set1_epi32(iw00 + (iw01 << 16));
qw1 = _mm_set1_epi32(iw10 + (iw11 << 16));
__m128 qb0 = _mm_setzero_ps(), qb1 = _mm_setzero_ps();
#if CV_SIMD128
v_int16x8 vqw0 = v_int16x8((short)(iw00), (short)(iw01), (short)(iw00), (short)(iw01), (short)(iw00), (short)(iw01), (short)(iw00), (short)(iw01));
v_int16x8 vqw1 = v_int16x8((short)(iw10), (short)(iw11), (short)(iw10), (short)(iw11), (short)(iw10), (short)(iw11), (short)(iw10), (short)(iw11));
v_float32x4 vqb0 = v_setzero_f32(), vqb1 = v_setzero_f32();
v_int16x8 vmax_val_16 = v_setall_s16(std::numeric_limits<unsigned short>::max());
v_int32x4 vdelta = v_setall_s32(1 << (W_BITS - 5 - 1));
#endif
for( y = 0; y < winSize.height; y++ )
{
......@@ -960,58 +687,38 @@ public:
const short* dIptr = derivIWinBuf.ptr<short>(y, 0);
x = 0;
#ifdef RLOF_SSE
#if CV_SIMD128
const tMaskType* maskPtr = winMaskMat.ptr<tMaskType>(y, 0);
for( ; x <= winSize.width*cn; x += 8, dIptr += 8*2 )
{
if( maskPtr[x ] == 0 && maskPtr[x+1] == 0 && maskPtr[x+2] == 0 && maskPtr[x+3] == 0
&& maskPtr[x+4] == 0 && maskPtr[x+5] == 0 && maskPtr[x+6] == 0 && maskPtr[x+7] == 0)
continue;
__m128i diff0 = _mm_loadu_si128((const __m128i*)(Iptr + x)), diff1;
__m128i v00 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x)), z);
__m128i v01 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x + cn)), z);
__m128i v10 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr1 + x)), z);
__m128i v11 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr1 + x + cn)), z);
__m128i t0 = _mm_add_epi32(_mm_madd_epi16(_mm_unpacklo_epi16(v00, v01), qw0),
_mm_madd_epi16(_mm_unpacklo_epi16(v10, v11), qw1));
__m128i t1 = _mm_add_epi32(_mm_madd_epi16(_mm_unpackhi_epi16(v00, v01), qw0),
_mm_madd_epi16(_mm_unpackhi_epi16(v10, v11), qw1));
t0 = _mm_srai_epi32(_mm_add_epi32(t0, qdelta), W_BITS1-5);
t1 = _mm_srai_epi32(_mm_add_epi32(t1, qdelta), W_BITS1-5);
__m128i mmDiff_epi16 = _mm_subs_epi16(_mm_packs_epi32(t0, t1), diff0);
__m128i Ixy_0 = _mm_loadu_si128((const __m128i*)(dIptr)); // Ix0 Iy0 Ix1 Iy1 ...
__m128i Ixy_1 = _mm_loadu_si128((const __m128i*)(dIptr + 8));
if( x > winSize.width*cn - 8)
{
Ixy_0 = _mm_and_si128(Ixy_0, mmMask0);
Ixy_1 = _mm_and_si128(Ixy_1, mmMask1);
}
v_int16x8 diff0 = v_reinterpret_as_s16(v_load(Iptr + x)), diff1, diff2;
v_int16x8 v00 = v_reinterpret_as_s16(v_load_expand(Jptr + x));
v_int16x8 v01 = v_reinterpret_as_s16(v_load_expand(Jptr + x + cn));
v_int16x8 v10 = v_reinterpret_as_s16(v_load_expand(Jptr1 + x));
v_int16x8 v11 = v_reinterpret_as_s16(v_load_expand(Jptr1 + x + cn));
v_int16x8 vmask = v_reinterpret_as_s16(v_load_expand(maskPtr + x)) * vmax_val_16;
v_int32x4 t0, t1;
v_int16x8 t00, t01, t10, t11;
v_zip(v00, v01, t00, t01);
v_zip(v10, v11, t10, t11);
diff1 = _mm_unpackhi_epi16(mmDiff_epi16, mmDiff_epi16); // It4 It4 It5 It5 It6 It6 It7 It7 | It12 It12 It13 It13...
diff0 = _mm_unpacklo_epi16(mmDiff_epi16, mmDiff_epi16); // It0 It0 It1 It1 It2 It2 It3 It3 | It8 It8 It9 It9...
t0 = v_dotprod(t00, vqw0, vdelta) + v_dotprod(t10, vqw1);
t1 = v_dotprod(t01, vqw0, vdelta) + v_dotprod(t11, vqw1);
t0 = t0 >> (W_BITS - 5);
t1 = t1 >> (W_BITS - 5);
diff0 = v_pack(t0, t1) - diff0;
diff0 = diff0 & vmask;
v_zip(diff0, diff0, diff2, diff1); // It0 It0 It1 It1 ...
v10 = _mm_mullo_epi16(Ixy_0, diff0);
v11 = _mm_mulhi_epi16(Ixy_0, diff0);
v00 = _mm_unpacklo_epi16(v10, v11);
v10 = _mm_unpackhi_epi16(v10, v11);
qb0 = _mm_add_ps(qb0, _mm_cvtepi32_ps(v00));
qb1 = _mm_add_ps(qb1, _mm_cvtepi32_ps(v10));
v_int16x8 vIxy_0 = v_reinterpret_as_s16(v_load(dIptr)); // Ix0 Iy0 Ix1 Iy1 ...
v_int16x8 vIxy_1 = v_reinterpret_as_s16(v_load(dIptr + 8));
v_zip(vIxy_0, vIxy_1, v10, v11);
v_zip(diff2, diff1, v00, v01);
// It * Ix It * Iy [4 ... 7]
// for set 1 hi sigma 1
v10 = _mm_mullo_epi16(Ixy_1, diff1);
v11 = _mm_mulhi_epi16(Ixy_1, diff1);
v00 = _mm_unpacklo_epi16(v10, v11);
v10 = _mm_unpackhi_epi16(v10, v11);
qb0 = _mm_add_ps(qb0, _mm_cvtepi32_ps(v00));
qb1 = _mm_add_ps(qb1, _mm_cvtepi32_ps(v10));
vqb0 += v_cvt_f32(v_dotprod(v00, v10));
vqb1 += v_cvt_f32(v_dotprod(v01, v11));
}
#else
for( ; x < winSize.width*cn; x++, dIptr += 2 )
......@@ -1020,7 +727,7 @@ public:
continue;
int diff = CV_DESCALE(Jptr[x]*iw00 + Jptr[x+cn]*iw01 +
Jptr1[x]*iw10 + Jptr1[x+cn]*iw11,
W_BITS1-5) - Iptr[x];
W_BITS-5) - Iptr[x];
b1 += (float)(diff*dIptr[0]) * FLT_RESCALE;
b2 += (float)(diff*dIptr[1]) * FLT_RESCALE;
......@@ -1028,11 +735,11 @@ public:
#endif
}
#ifdef RLOF_SSE
#if CV_SIMD128
float CV_DECL_ALIGNED(16) bbuf[4];
_mm_store_ps(bbuf, _mm_add_ps(qb0, qb1));
b1 += bbuf[0] + bbuf[2];
b2 += bbuf[1] + bbuf[3];
v_store_aligned(bbuf, vqb0 + vqb1);
b1 = bbuf[0] + bbuf[2];
b2 = bbuf[1] + bbuf[3];
#endif
b1 *= FLT_SCALE;
......
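
The same universal-intrinsics pattern (expand the 8-bit pixels, zip horizontal neighbours, fixed-point dot product with the bilinear weights, shift, pack) recurs in every tracker variant in this file. The helper below is a minimal, hedged restatement of that pattern for reference only; the function name interpolateRow8 is hypothetical and it is not part of the patch.

#include <opencv2/core.hpp>
#include <opencv2/core/hal/intrin.hpp>

#if CV_SIMD128
// Bilinear subpixel interpolation of 8 consecutive pixels using 14-bit fixed-point
// weights, as done by the v_dotprod-based loops above and below. vqw0/vqw1 hold the
// interleaved weights (iw00,iw01,...) and (iw10,iw11,...); vdelta is the rounding term.
static inline cv::v_int16x8 interpolateRow8(const uchar* Jptr, const uchar* Jptr1, int cn,
                                            const cv::v_int16x8& vqw0, const cv::v_int16x8& vqw1,
                                            const cv::v_int32x4& vdelta, int W_BITS)
{
    using namespace cv;
    v_int16x8 v00 = v_reinterpret_as_s16(v_load_expand(Jptr));       // row y,   x .. x+7
    v_int16x8 v01 = v_reinterpret_as_s16(v_load_expand(Jptr + cn));  // row y,   x+1 .. x+8
    v_int16x8 v10 = v_reinterpret_as_s16(v_load_expand(Jptr1));      // row y+1, x .. x+7
    v_int16x8 v11 = v_reinterpret_as_s16(v_load_expand(Jptr1 + cn)); // row y+1, x+1 .. x+8
    v_int16x8 t00, t01, t10, t11;
    v_zip(v00, v01, t00, t01);                 // interleave horizontal neighbours
    v_zip(v10, v11, t10, t11);
    v_int32x4 t0 = v_dotprod(t00, vqw0, vdelta) + v_dotprod(t10, vqw1);
    v_int32x4 t1 = v_dotprod(t01, vqw0, vdelta) + v_dotprod(t11, vqw1);
    t0 = t0 >> (W_BITS - 5);                   // back to the 5-bit fractional scale of IWinBuf
    t1 = t1 >> (W_BITS - 5);
    return v_pack(t0, t1);                     // eight interpolated intensities
}
#endif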
......@@ -126,7 +126,7 @@ public:
float a = prevPt.x - iprevPt.x;
float b = prevPt.y - iprevPt.y;
const int W_BITS = 14, W_BITS1 = 14;
const int W_BITS = 14;
int iw00 = cvRound((1.f - a)*(1.f - b)*(1 << W_BITS));
int iw01 = cvRound(a*(1.f - b)*(1 << W_BITS));
......@@ -137,104 +137,13 @@ public:
float D = 0;
float minEig;
#ifdef RLOF_SSE
__m128i qw0 = _mm_set1_epi32(iw00 + (iw01 << 16));
__m128i qw1 = _mm_set1_epi32(iw10 + (iw11 << 16));
__m128i z = _mm_setzero_si128();
__m128i qdelta_d = _mm_set1_epi32(1 << (W_BITS1 - 1));
__m128i qdelta = _mm_set1_epi32(1 << (W_BITS1 - 5 - 1));
__m128i mmMask4_epi32;
__m128i mmMaskSet_epi16 = _mm_set1_epi16(std::numeric_limits<unsigned short>::max());
get4BitMask(winSize.width, mmMask4_epi32);
#endif
// extract the patch from the first image, compute covariance matrix of derivatives
int x, y;
for (y = 0; y < winSize.height; y++)
{
const uchar* src = I.ptr<uchar>(y + iprevPt.y, 0) + iprevPt.x*cn;
const uchar* src1 = I.ptr<uchar>(y + iprevPt.y + 1, 0) + iprevPt.x*cn;
const short* dsrc = derivI.ptr<short>(y + iprevPt.y, 0) + iprevPt.x*cn2;
const short* dsrc1 = derivI.ptr<short>(y + iprevPt.y + 1, 0) + iprevPt.x*cn2;
short* Iptr = IWinBuf.ptr<short>(y, 0);
short* dIptr = derivIWinBuf.ptr<short>(y, 0);
const tMaskType* maskPtr = winMaskMat.ptr<tMaskType>(y, 0);
x = 0;
#ifdef RLOF_SSE
for (; x <= winSize.width*cn; x += 4, dsrc += 4 * 2, dsrc1 += 8, dIptr += 4 * 2)
{
__m128i mask_0_7_epi16 = _mm_mullo_epi16(_mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*)(maskPtr + x))), mmMaskSet_epi16);
__m128i mask_0_3_epi16 = _mm_unpacklo_epi16(mask_0_7_epi16, mask_0_7_epi16);
__m128i v00, v01, v10, v11, t0, t1;
v00 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x)), z);
v01 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x + cn)), z);
v10 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src1 + x)), z);
v11 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src1 + x + cn)), z);
t0 = _mm_add_epi32(_mm_madd_epi16(_mm_unpacklo_epi16(v00, v01), qw0),
_mm_madd_epi16(_mm_unpacklo_epi16(v10, v11), qw1));
t0 = _mm_srai_epi32(_mm_add_epi32(t0, qdelta), W_BITS1 - 5);
if (x + 4 > winSize.width)
{
t0 = _mm_and_si128(t0, mmMask4_epi32);
}
t0 = _mm_and_si128(t0, mask_0_3_epi16);
_mm_storel_epi64((__m128i*)(Iptr + x), _mm_packs_epi32(t0, t0));
v00 = _mm_loadu_si128((const __m128i*)(dsrc));
v01 = _mm_loadu_si128((const __m128i*)(dsrc + cn2));
v10 = _mm_loadu_si128((const __m128i*)(dsrc1));
v11 = _mm_loadu_si128((const __m128i*)(dsrc1 + cn2));
t0 = _mm_add_epi32(_mm_madd_epi16(_mm_unpacklo_epi16(v00, v01), qw0),
_mm_madd_epi16(_mm_unpacklo_epi16(v10, v11), qw1));
t1 = _mm_add_epi32(_mm_madd_epi16(_mm_unpackhi_epi16(v00, v01), qw0),
_mm_madd_epi16(_mm_unpackhi_epi16(v10, v11), qw1));
t0 = _mm_srai_epi32(_mm_add_epi32(t0, qdelta_d), W_BITS1);
t1 = _mm_srai_epi32(_mm_add_epi32(t1, qdelta_d), W_BITS1);
v00 = _mm_packs_epi32(t0, t1); // Ix0 Iy0 Ix1 Iy1 ...
if (x + 4 > winSize.width)
{
v00 = _mm_and_si128(v00, mmMask4_epi32);
}
v00 = _mm_and_si128(v00, mask_0_3_epi16);
_mm_storeu_si128((__m128i*)dIptr, v00);
}
#else
for (; x < winSize.width*cn; x++, dsrc += 2, dsrc1 += 2, dIptr += 2)
{
if (maskPtr[x] == 0)
{
dIptr[0] = 0;
dIptr[1] = 0;
continue;
}
int ival = CV_DESCALE(src[x] * iw00 + src[x + cn] * iw01 +
src1[x] * iw10 + src1[x + cn] * iw11, W_BITS1 - 5);
int ixval = CV_DESCALE(dsrc[0] * iw00 + dsrc[cn2] * iw01 +
dsrc1[0] * iw10 + dsrc1[cn2] * iw11, W_BITS1);
int iyval = CV_DESCALE(dsrc[1] * iw00 + dsrc[cn2 + 1] * iw01 + dsrc1[1] * iw10 +
dsrc1[cn2 + 1] * iw11, W_BITS1);
Iptr[x] = (short)ival;
dIptr[0] = (short)ixval;
dIptr[1] = (short)iyval;
}
#endif
}
copyWinBuffers(iw00, iw01, iw10, iw11, winSize, I, derivI, winMaskMat, IWinBuf, derivIWinBuf, iprevPt);
cv::Mat residualMat = cv::Mat::zeros(winSize.height * (winSize.width + 8) * cn, 1, CV_16SC1);
cv::Point2f backUpNextPt = nextPt;
nextPt += halfWin;
int j;
#ifdef RLOF_SSE
__m128i mmMask0, mmMask1, mmMask;
getWBitMask(winSize.width, mmMask0, mmMask1, mmMask);
#endif
float MEstimatorScale = 1;
int buffIdx = 0;
cv::Point2f prevDelta(0, 0);
......@@ -274,19 +183,18 @@ public:
if (j == 0 )
{
buffIdx = 0;
for (y = 0; y < winSize.height; y++)
for (int y = 0; y < winSize.height; y++)
{
const uchar* Jptr = J.ptr<uchar>(y + inextPt.y, inextPt.x*cn);
const uchar* Jptr1 = J.ptr<uchar>(y + inextPt.y + 1, inextPt.x*cn);
const short* Iptr = IWinBuf.ptr<short>(y, 0);
const short* dIptr = derivIWinBuf.ptr<short>(y, 0);
const tMaskType* maskPtr = winMaskMat.ptr<tMaskType>(y, 0);
x = 0;
for (; x < winSize.width*cn; x++, dIptr += 2)
for (int x = 0; x < winSize.width*cn; x++, dIptr += 2)
{
if (maskPtr[x] == 0)
continue;
int diff = CV_DESCALE(Jptr[x] * iw00 + Jptr[x + cn] * iw01 + Jptr1[x] * iw10 + Jptr1[x + cn] * iw11, W_BITS1 - 5) - Iptr[x];
int diff = CV_DESCALE(Jptr[x] * iw00 + Jptr[x + cn] * iw01 + Jptr1[x] * iw10 + Jptr1[x + cn] * iw11, W_BITS - 5) - Iptr[x];
residualMat.at<short>(buffIdx++) = static_cast<short>(diff);
}
}
......@@ -300,171 +208,144 @@ public:
float fParam1 = normSigma1 * 32.f;
fParam0 = normSigma0 * MEstimatorScale;
fParam1 = normSigma1 * MEstimatorScale;
#ifdef RLOF_SSE
qw0 = _mm_set1_epi32(iw00 + (iw01 << 16));
qw1 = _mm_set1_epi32(iw10 + (iw11 << 16));
__m128 qb0 = _mm_setzero_ps(), qb1 = _mm_setzero_ps();
__m128 mmAxx = _mm_setzero_ps(), mmAxy = _mm_setzero_ps(), mmAyy = _mm_setzero_ps();
__m128i mmParam0 = _mm_set1_epi16(MIN(std::numeric_limits<short>::max() - 1, static_cast<short>(fParam0)));
__m128i mmParam1 = _mm_set1_epi16(MIN(std::numeric_limits<short>::max() - 1, static_cast<short>(fParam1)));
float s2Val = std::fabs(normSigma2);
int s2bitShift = normSigma2 == 0 ? 1 : cvCeil(log(200.f / s2Val) / log(2.f));
__m128i mmParam2_epi16 = _mm_set1_epi16(static_cast<short>(normSigma2 * (float)(1 << s2bitShift)));
__m128i mmOness_epi16 = _mm_set1_epi16(1 << s2bitShift);
__m128 mmParam2s = _mm_set1_ps(0.01f * normSigma2);
__m128 mmParam2s2 = _mm_set1_ps(normSigma2 * normSigma2);
__m128 mmOnes = _mm_set1_ps(1.f);
__m128i mmEta = _mm_setzero_si128();
__m128i mmScale = _mm_set1_epi16(static_cast<short>(MEstimatorScale));
#if CV_SIMD128
v_int16x8 vqw0 = v_int16x8((short)(iw00), (short)(iw01), (short)(iw00), (short)(iw01), (short)(iw00), (short)(iw01), (short)(iw00), (short)(iw01));
v_int16x8 vqw1 = v_int16x8((short)(iw10), (short)(iw11), (short)(iw10), (short)(iw11), (short)(iw10), (short)(iw11), (short)(iw10), (short)(iw11));
v_float32x4 vqb0 = v_setzero_f32(), vqb1 = v_setzero_f32();
v_float32x4 vAxx = v_setzero_f32(), vAxy = v_setzero_f32(), vAyy = v_setzero_f32();
v_int32x4 vdelta = v_setall_s32(1 << (W_BITS - 5 - 1));
v_int16x8 vscale = v_setall_s16(static_cast<short>(MEstimatorScale));
v_int16x8 veta = v_setzero_s16();
v_int16x8 vzero = v_setzero_s16();
v_int16x8 vparam0 = v_setall_s16(MIN(std::numeric_limits<short>::max() - 1, static_cast<short>(fParam0)));
v_int16x8 vparam1 = v_setall_s16(MIN(std::numeric_limits<short>::max() - 1, static_cast<short>(fParam1)));
v_int16x8 vneg_param1 = v_setall_s16(-MIN(std::numeric_limits<short>::max() - 1, static_cast<short>(fParam1)));
int s2bitShift = normSigma2 == 0 ? 1 : cvCeil(log(200.f / std::fabs(normSigma2)) / log(2.f));
v_int16x8 vparam2 = v_setall_s16(static_cast<short>(normSigma2 * (float)(1 << s2bitShift)));
v_int16x8 voness = v_setall_s16(1 << s2bitShift);
v_float32x4 vparam2s = v_setall_f32(0.01f * normSigma2);
v_float32x4 vparam2s2 = v_setall_f32(normSigma2 * normSigma2);
v_float32x4 vones = v_setall_f32(1.f);
v_float32x4 vzeros = v_setzero_f32();
v_int16x8 vmax_val_16 = v_setall_s16(std::numeric_limits<unsigned short>::max());
#endif
buffIdx = 0;
for (y = 0; y < winSize.height; y++)
for (int y = 0; y < winSize.height; y++)
{
const uchar* Jptr = J.ptr<uchar>(y + inextPt.y, inextPt.x*cn);
const uchar* Jptr1 = J.ptr<uchar>(y + inextPt.y + 1, inextPt.x*cn);
const short* Iptr = IWinBuf.ptr<short>(y, 0);
const short* dIptr = derivIWinBuf.ptr<short>(y, 0);
const tMaskType* maskPtr = winMaskMat.ptr<tMaskType>(y, 0);
x = 0;
#ifdef RLOF_SSE
for (; x <= winSize.width*cn; x += 8, dIptr += 8 * 2)
{
__m128i mask_0_7_epi16 = _mm_mullo_epi16(_mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*)(maskPtr + x))), mmMaskSet_epi16);
__m128i diff0, diff1;
__m128i I_0_7_epi16 = _mm_loadu_si128((const __m128i*)(Iptr + x)); // elements 0 to 7
__m128i v00 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x)), z);
__m128i v01 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x + cn)), z);
__m128i v10 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr1 + x)), z);
__m128i v11 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr1 + x + cn)), z);
__m128i t0 = _mm_add_epi32(_mm_madd_epi16(_mm_unpacklo_epi16(v00, v01), qw0),
_mm_madd_epi16(_mm_unpacklo_epi16(v10, v11), qw1));
__m128i t1 = _mm_add_epi32(_mm_madd_epi16(_mm_unpackhi_epi16(v00, v01), qw0),
_mm_madd_epi16(_mm_unpackhi_epi16(v10, v11), qw1));
t0 = _mm_srai_epi32(_mm_add_epi32(t0, qdelta), W_BITS1 - 5);
t1 = _mm_srai_epi32(_mm_add_epi32(t1, qdelta), W_BITS1 - 5);
__m128i mmDiff_epi16 = _mm_subs_epi16(_mm_packs_epi32(t0, t1), I_0_7_epi16);
mmDiff_epi16 = _mm_and_si128(mmDiff_epi16, mask_0_7_epi16);
__m128i scalediffIsPos_epi16 = _mm_cmpgt_epi16(mmDiff_epi16, mmScale);
mmEta = _mm_add_epi16(mmEta, _mm_add_epi16(_mm_and_si128(scalediffIsPos_epi16, _mm_set1_epi16(2)), _mm_set1_epi16(-1)));
__m128i Ixy_0 = _mm_loadu_si128((const __m128i*)(dIptr)); // Ix0 Iy0 Ix1 Iy1 ...
__m128i Ixy_1 = _mm_loadu_si128((const __m128i*)(dIptr + 8));
__m128i abs_epi16 = _mm_abs_epi16(mmDiff_epi16);
__m128i bSet2_epi16, bSet1_epi16;
#if CV_SIMD128
for (int x = 0; x <= winSize.width*cn; x += 8, dIptr += 8 * 2)
{
v_int16x8 diff0 = v_reinterpret_as_s16(v_load(Iptr + x)), diff1, diff2;
v_int16x8 v00 = v_reinterpret_as_s16(v_load_expand(Jptr + x));
v_int16x8 v01 = v_reinterpret_as_s16(v_load_expand(Jptr + x + cn));
v_int16x8 v10 = v_reinterpret_as_s16(v_load_expand(Jptr1 + x));
v_int16x8 v11 = v_reinterpret_as_s16(v_load_expand(Jptr1 + x + cn));
v_int16x8 vmask = v_reinterpret_as_s16(v_load_expand(maskPtr + x)) * vmax_val_16;
v_int32x4 t0, t1;
v_int16x8 t00, t01, t10, t11;
v_zip(v00, v01, t00, t01);
v_zip(v10, v11, t10, t11);
t0 = v_dotprod(t00, vqw0, vdelta) + v_dotprod(t10, vqw1);
t1 = v_dotprod(t01, vqw0, vdelta) + v_dotprod(t11, vqw1);
t0 = t0 >> (W_BITS - 5);
t1 = t1 >> (W_BITS - 5);
diff0 = v_pack(t0, t1) - diff0;
diff0 = diff0 & vmask;
v_int16x8 vscale_diff_is_pos = diff0 > vscale;
veta = veta + (vscale_diff_is_pos & v_setall_s16(2)) + v_setall_s16(-1);
// since there is no abs for int16x8 we have to do this hack
v_int16x8 vabs_diff = v_reinterpret_as_s16(v_abs(diff0));
v_int16x8 vset2, vset1;
// |It| < sigma1 ?
bSet2_epi16 = _mm_cmplt_epi16(abs_epi16, mmParam1);
vset2 = vabs_diff < vparam1;
// It > 0 ?
__m128i diffIsPos_epi16 = _mm_cmpgt_epi16(mmDiff_epi16, _mm_setzero_si128());
v_int16x8 vdiff_is_pos = diff0 > vzero;
// sigma0 < |It| < sigma1 ?
bSet1_epi16 = _mm_and_si128(bSet2_epi16, _mm_cmpgt_epi16(abs_epi16, mmParam0));
vset1 = vset2 & (vabs_diff > vparam0);
// val = |It| -/+ sigma1
__m128i tmpParam1_epi16 = _mm_add_epi16(_mm_and_si128(diffIsPos_epi16, _mm_sub_epi16(mmDiff_epi16, mmParam1)),
_mm_andnot_si128(diffIsPos_epi16, _mm_add_epi16(mmDiff_epi16, mmParam1)));
v_int16x8 vtmp_param1 = diff0 + v_select(vdiff_is_pos, vneg_param1, vparam1);
// It == 0 ? |It| > sigma1
mmDiff_epi16 = _mm_and_si128(bSet2_epi16, mmDiff_epi16);
diff0 = vset2 & diff0;
// It == val ? sigma0 < |It| < sigma1
mmDiff_epi16 = _mm_blendv_epi8(mmDiff_epi16, tmpParam1_epi16, bSet1_epi16);
diff0 = v_select(vset1, vtmp_param1, diff0);
__m128i tale_epi16_ = _mm_blendv_epi8(mmOness_epi16, mmParam2_epi16, bSet1_epi16); // mask for 0 - 3
v_int16x8 tale_ = v_select(vset1, vparam2, voness); // mask for 0 - 3
// diff = diff * sigma2
__m128i lo = _mm_mullo_epi16(tale_epi16_, mmDiff_epi16);
__m128i hi = _mm_mulhi_epi16(tale_epi16_, mmDiff_epi16);
__m128i diff_0_3_epi32 = _mm_srai_epi32(_mm_unpacklo_epi16(lo, hi), s2bitShift); //diff 0_3_epi32
__m128i diff_4_7_epi32 = _mm_srai_epi32(_mm_unpackhi_epi16(lo, hi), s2bitShift); // diff 4_7_epi32
mmDiff_epi16 = _mm_packs_epi32(diff_0_3_epi32, diff_4_7_epi32);
diff1 = _mm_unpackhi_epi16(mmDiff_epi16, mmDiff_epi16); // It4 It4 It5 It5 It6 It6 It7 It7 | It12 It12 It13 It13...
diff0 = _mm_unpacklo_epi16(mmDiff_epi16, mmDiff_epi16); // It0 It0 It1 It1 It2 It2 It3 It3 | It8 It8 It9 It9...
// Ix * diff / Iy * diff
v10 = _mm_mullo_epi16(Ixy_0, diff0);
v11 = _mm_mulhi_epi16(Ixy_0, diff0);
v00 = _mm_unpacklo_epi16(v10, v11);
v10 = _mm_unpackhi_epi16(v10, v11);
qb0 = _mm_add_ps(qb0, _mm_cvtepi32_ps(v00));
qb1 = _mm_add_ps(qb1, _mm_cvtepi32_ps(v10));
// It * Ix It * Iy [4 ... 7]
// for set 1 hi sigma 1
v10 = _mm_mullo_epi16(Ixy_1, diff1);
v11 = _mm_mulhi_epi16(Ixy_1, diff1);
v00 = _mm_unpacklo_epi16(v10, v11);
v10 = _mm_unpackhi_epi16(v10, v11);
qb0 = _mm_add_ps(qb0, _mm_cvtepi32_ps(v00));
qb1 = _mm_add_ps(qb1, _mm_cvtepi32_ps(v10));
v_int32x4 diff_int_0, diff_int_1;
v_mul_expand(diff0, tale_, diff_int_0, diff_int_1);
diff0 = v_pack(diff_int_0 >> s2bitShift, diff_int_1 >> s2bitShift);
v_zip(diff0, diff0, diff2, diff1); // It0 It0 It1 It1 ...
v_int16x8 vIxy_0 = v_reinterpret_as_s16(v_load(dIptr)); // Ix0 Iy0 Ix1 Iy1 ...
v_int16x8 vIxy_1 = v_reinterpret_as_s16(v_load(dIptr + 8));
v_zip(vIxy_0, vIxy_1, v10, v11);
v_zip(diff2, diff1, v00, v01);
vqb0 += v_cvt_f32(v_dotprod(v00, v10));
vqb1 += v_cvt_f32(v_dotprod(v01, v11));
if (j == 0)
{
__m128 bSet1_0_3_ps = _mm_cvtepi32_ps(_mm_cvtepi16_epi32(bSet1_epi16));
__m128 bSet1_4_7_ps = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(bSet1_epi16, bSet1_epi16), 16));
__m128 mask_0_4_ps = _mm_cvtepi32_ps(_mm_cvtepi16_epi32(mask_0_7_epi16));
__m128 mask_4_7_ps = _mm_cvtepi32_ps((_mm_srai_epi32(_mm_unpackhi_epi16(mask_0_7_epi16, mask_0_7_epi16), 16)));
v_int32x4 vset1_0, vset1_1, vset2_0, vset2_1;
v_int32x4 vmask_0, vmask_1;
__m128 bSet2_0_3_ps = _mm_cvtepi32_ps(_mm_cvtepi16_epi32(bSet2_epi16));
__m128 bSet2_4_7_ps = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(bSet2_epi16, bSet2_epi16), 16));
v_expand(vset1, vset1_0, vset1_1);
v_expand(vset2, vset2_0, vset2_1);
v_expand(vmask, vmask_0, vmask_1);
__m128 tale_0_3_ps = _mm_blendv_ps(mmOnes, mmParam2s2, bSet1_0_3_ps);
__m128 tale_4_7_ps = _mm_blendv_ps(mmOnes, mmParam2s2, bSet1_4_7_ps);
tale_0_3_ps = _mm_blendv_ps(mmParam2s, tale_0_3_ps, bSet2_0_3_ps);
tale_4_7_ps = _mm_blendv_ps(mmParam2s, tale_4_7_ps, bSet2_4_7_ps);
v_float32x4 vtale_0 = v_select(v_reinterpret_as_f32(vset1_0), vparam2s2, vones);
v_float32x4 vtale_1 = v_select(v_reinterpret_as_f32(vset1_1), vparam2s2, vones);
vtale_0 = v_select(v_reinterpret_as_f32(vset2_0), vtale_0, vparam2s);
vtale_1 = v_select(v_reinterpret_as_f32(vset2_1), vtale_1, vparam2s);
tale_0_3_ps = _mm_blendv_ps(_mm_set1_ps(0), tale_0_3_ps, mask_0_4_ps);
tale_4_7_ps = _mm_blendv_ps(_mm_set1_ps(0), tale_4_7_ps, mask_4_7_ps);
vtale_0 = v_select(v_reinterpret_as_f32(vmask_0), vtale_0, vzeros);
vtale_1 = v_select(v_reinterpret_as_f32(vmask_1), vtale_1, vzeros);
t0 = _mm_srai_epi32(Ixy_0, 16); // Iy0 Iy1 Iy2 Iy3
t1 = _mm_srai_epi32(_mm_slli_epi32(Ixy_0, 16), 16); // Ix0 Ix1 Ix2 Ix3
v00 = v_reinterpret_as_s16(v_interleave_pairs(v_reinterpret_as_s32(v_interleave_pairs(vIxy_0))));
v_expand(v00, t1, t0);
__m128 fy = _mm_cvtepi32_ps(t0);
__m128 fx = _mm_cvtepi32_ps(t1);
v_float32x4 fy = v_cvt_f32(t0);
v_float32x4 fx = v_cvt_f32(t1);
// A11 - A22
__m128 fxtale = _mm_mul_ps(fx, tale_0_3_ps);
__m128 fytale = _mm_mul_ps(fy, tale_0_3_ps);
v_float32x4 fxtale = fx * vtale_0;
v_float32x4 fytale = fy * vtale_0;
mmAyy = _mm_add_ps(mmAyy, _mm_mul_ps(fy, fytale));
mmAxy = _mm_add_ps(mmAxy, _mm_mul_ps(fx, fytale));
mmAxx = _mm_add_ps(mmAxx, _mm_mul_ps(fx, fxtale));
vAyy = v_muladd(fy, fytale, vAyy);
vAxy = v_muladd(fx, fytale, vAxy);
vAxx = v_muladd(fx, fxtale, vAxx);
t0 = _mm_srai_epi32(Ixy_1, 16); // Iy8 Iy9 Iy10 Iy11
t1 = _mm_srai_epi32(_mm_slli_epi32(Ixy_1, 16), 16); // Ix0 Ix1 Ix2 Ix3
v01 = v_reinterpret_as_s16(v_interleave_pairs(v_reinterpret_as_s32(v_interleave_pairs(vIxy_1))));
v_expand(v01, t1, t0);
fy = _mm_cvtepi32_ps(t0);
fx = _mm_cvtepi32_ps(t1);
fy = v_cvt_f32(t0);
fx = v_cvt_f32(t1);
// A11 - A22
fxtale = _mm_mul_ps(fx, tale_4_7_ps);
fytale = _mm_mul_ps(fy, tale_4_7_ps);
fxtale = fx * vtale_1;
fytale = fy * vtale_1;
mmAyy = _mm_add_ps(mmAyy, _mm_mul_ps(fy, fytale));
mmAxy = _mm_add_ps(mmAxy, _mm_mul_ps(fx, fytale));
mmAxx = _mm_add_ps(mmAxx, _mm_mul_ps(fx, fxtale));
vAyy = v_muladd(fy, fytale, vAyy);
vAxy = v_muladd(fx, fytale, vAxy);
vAxx = v_muladd(fx, fxtale, vAxx);
}
}
#else
for (; x < winSize.width*cn; x++, dIptr += 2)
for (int x = 0; x < winSize.width*cn; x++, dIptr += 2)
{
if (maskPtr[x] == 0)
continue;
int diff = CV_DESCALE(Jptr[x] * iw00 + Jptr[x + cn] * iw01 +
Jptr1[x] * iw10 + Jptr1[x + cn] * iw11,
W_BITS1 - 5) - Iptr[x];
W_BITS - 5) - Iptr[x];
if (diff > MEstimatorScale)
MEstimatorScale += eta;
......@@ -512,39 +393,25 @@ public:
A12 += (float)(ixval*iyval*tale);
A22 += (float)(iyval*iyval*tale);
}
}
#endif
}
#ifdef RLOF_SSE
short etaValues[8];
_mm_storeu_si128((__m128i*)(etaValues), mmEta);
MEstimatorScale += eta * (etaValues[0] + etaValues[1] + etaValues[2] + etaValues[3]
+ etaValues[4] + etaValues[5] + etaValues[6] + etaValues[7]);
#if CV_SIMD128
MEstimatorScale += eta * v_reduce_sum(veta);
#endif
if (j == 0)
{
#ifdef RLOF_SSE
float CV_DECL_ALIGNED(16) A11buf[4], A12buf[4], A22buf[4];//
_mm_store_ps(A11buf, mmAxx);
_mm_store_ps(A12buf, mmAxy);
_mm_store_ps(A22buf, mmAyy);
A11 = A11buf[0] + A11buf[1] + A11buf[2] + A11buf[3];
A12 = A12buf[0] + A12buf[1] + A12buf[2] + A12buf[3];
A22 = A22buf[0] + A22buf[1] + A22buf[2] + A22buf[3];
#if CV_SIMD128
A11 = v_reduce_sum(vAxx);
A12 = v_reduce_sum(vAxy);
A22 = v_reduce_sum(vAyy);
#endif
A11 *= FLT_SCALE;
A12 *= FLT_SCALE;
A22 *= FLT_SCALE;
D = A11 * A22 - A12 * A12;
minEig = (A22 + A11 - std::sqrt((A11 - A22)*(A11 - A22) +
4.f*A12*A12)) / (2 * winArea);
......@@ -562,17 +429,15 @@ public:
}
}
#ifdef RLOF_SSE
#if CV_SIMD128
float CV_DECL_ALIGNED(16) bbuf[4];
_mm_store_ps(bbuf, _mm_add_ps(qb0, qb1));
v_store_aligned(bbuf, vqb0 + vqb1);
b1 += bbuf[0] + bbuf[2];
b2 += bbuf[1] + bbuf[3];
#endif
b1 *= FLT_SCALE;
b2 *= FLT_SCALE;
Point2f delta((float)((A12*b2 - A22 * b1) * D), (float)((A12*b1 - A11 * b2) * D));
delta.x = (delta.x != delta.x) ? 0 : delta.x;
......@@ -587,13 +452,9 @@ public:
nextPts[ptidx] -= delta * 0.5f;
break;
}
prevDelta = delta;
}
}
}
const Mat* prevImg;
......@@ -672,9 +533,6 @@ public:
void operator()(const cv::Range& range) const CV_OVERRIDE
{
#ifdef DEBUG_INVOKER
printf("rlof::radial");fflush(stdout);
#endif
Point2f halfWin;
cv::Size winSize;
const Mat& I = *prevImg;
......@@ -753,7 +611,7 @@ public:
float a = prevPt.x - iprevPt.x;
float b = prevPt.y - iprevPt.y;
const int W_BITS = 14, W_BITS1 = 14;
const int W_BITS = 14;
int iw00 = cvRound((1.f - a)*(1.f - b)*(1 << W_BITS));
int iw01 = cvRound(a*(1.f - b)*(1 << W_BITS));
......@@ -771,95 +629,7 @@ public:
float dI = 0; // I^2
float D = 0;
#ifdef RLOF_SSE
__m128i qw0 = _mm_set1_epi32(iw00 + (iw01 << 16));
__m128i qw1 = _mm_set1_epi32(iw10 + (iw11 << 16));
__m128i z = _mm_setzero_si128();
__m128i qdelta_d = _mm_set1_epi32(1 << (W_BITS1 - 1));
__m128i qdelta = _mm_set1_epi32(1 << (W_BITS1 - 5 - 1));
__m128i mmMask4_epi32;
__m128i mmMaskSet_epi16 = _mm_set1_epi16(std::numeric_limits<unsigned short>::max());
get4BitMask(winSize.width, mmMask4_epi32);
#endif
// extract the patch from the first image, compute covariance matrix of derivatives
int x, y;
for (y = 0; y < winSize.height; y++)
{
const uchar* src = I.ptr<uchar>(y + iprevPt.y, 0) + iprevPt.x*cn;
const uchar* src1 = I.ptr<uchar>(y + iprevPt.y + 1, 0) + iprevPt.x*cn;
const short* dsrc = derivI.ptr<short>(y + iprevPt.y, 0) + iprevPt.x*cn2;
const short* dsrc1 = derivI.ptr<short>(y + iprevPt.y + 1, 0) + iprevPt.x*cn2;
short* Iptr = IWinBuf.ptr<short>(y, 0);
short* dIptr = derivIWinBuf.ptr<short>(y, 0);
x = 0;
#ifdef RLOF_SSE
const tMaskType* maskPtr = winMaskMat.ptr<tMaskType>(y, 0);
for (; x <= winBufSize.width*cn - 4; x += 4, dsrc += 4 * 2, dsrc1 += 8, dIptr += 4 * 2)
{
__m128i mask_0_7_epi16 = _mm_mullo_epi16(_mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*)(maskPtr + x))), mmMaskSet_epi16);
__m128i mask_0_3_epi16 = _mm_unpacklo_epi16(mask_0_7_epi16, mask_0_7_epi16);
__m128i v00, v01, v10, v11, t0, t1;
v00 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x)), z);
v01 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x + cn)), z);
v10 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src1 + x)), z);
v11 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src1 + x + cn)), z);
t0 = _mm_add_epi32(_mm_madd_epi16(_mm_unpacklo_epi16(v00, v01), qw0),
_mm_madd_epi16(_mm_unpacklo_epi16(v10, v11), qw1));
t0 = _mm_srai_epi32(_mm_add_epi32(t0, qdelta), W_BITS1 - 5);
if (x + 4 > winSize.width)
{
t0 = _mm_and_si128(t0, mmMask4_epi32);
}
t0 = _mm_and_si128(t0, mask_0_3_epi16);
_mm_storel_epi64((__m128i*)(Iptr + x), _mm_packs_epi32(t0, t0));
v00 = _mm_loadu_si128((const __m128i*)(dsrc));
v01 = _mm_loadu_si128((const __m128i*)(dsrc + cn2));
v10 = _mm_loadu_si128((const __m128i*)(dsrc1));
v11 = _mm_loadu_si128((const __m128i*)(dsrc1 + cn2));
t0 = _mm_add_epi32(_mm_madd_epi16(_mm_unpacklo_epi16(v00, v01), qw0),
_mm_madd_epi16(_mm_unpacklo_epi16(v10, v11), qw1));
t1 = _mm_add_epi32(_mm_madd_epi16(_mm_unpackhi_epi16(v00, v01), qw0),
_mm_madd_epi16(_mm_unpackhi_epi16(v10, v11), qw1));
t0 = _mm_srai_epi32(_mm_add_epi32(t0, qdelta_d), W_BITS1);
t1 = _mm_srai_epi32(_mm_add_epi32(t1, qdelta_d), W_BITS1);
v00 = _mm_packs_epi32(t0, t1); // Ix0 Iy0 Ix1 Iy1 ...
if (x + 4 > winSize.width)
{
v00 = _mm_and_si128(v00, mmMask4_epi32);
}
v00 = _mm_and_si128(v00, mask_0_3_epi16);
_mm_storeu_si128((__m128i*)dIptr, v00);
}
#else
for (; x < winSize.width*cn; x++, dsrc += 2, dsrc1 += 2, dIptr += 2)
{
if (winMaskMat.at<uchar>(y, x) == 0)
{
dIptr[0] = 0;
dIptr[1] = 0;
continue;
}
int ival = CV_DESCALE(src[x] * iw00 + src[x + cn] * iw01 +
src1[x] * iw10 + src1[x + cn] * iw11, W_BITS1 - 5);
int ixval = CV_DESCALE(dsrc[0] * iw00 + dsrc[cn2] * iw01 +
dsrc1[0] * iw10 + dsrc1[ cn2] * iw11, W_BITS1);
int iyval = CV_DESCALE(dsrc[1] * iw00 + dsrc[cn2 + 1] * iw01 + dsrc1[1] * iw10 +
dsrc1[cn2 + 1] * iw11, W_BITS1);
Iptr[x] = (short)ival;
dIptr[0] = (short)ixval;
dIptr[1] = (short)iyval;
}
#endif
}
copyWinBuffers(iw00, iw01, iw10, iw11, winSize, I, derivI, winMaskMat, IWinBuf, derivIWinBuf, iprevPt);
cv::Mat residualMat = cv::Mat::zeros(winSize.height * (winSize.width + 8) * cn, 1, CV_16SC1);
......@@ -871,11 +641,6 @@ public:
cv::Point2f backUpGain = gainVec;
cv::Size _winSize = winSize;
int j;
#ifdef RLOF_SSE
__m128i mmMask0, mmMask1, mmMask;
getWBitMask(_winSize.width, mmMask0, mmMask1, mmMask);
__m128 mmOnes = _mm_set1_ps(1.f);
#endif
float MEstimatorScale = 1;
int buffIdx = 0;
float minEigValue;
......@@ -929,21 +694,20 @@ public:
if (j == 0 )
{
buffIdx = 0;
for (y = 0; y < winSize.height; y++)
for (int y = 0; y < winSize.height; y++)
{
const uchar* Jptr = J.ptr<uchar>(y + inextPt.y, inextPt.x*cn);
const uchar* Jptr1 = J.ptr<uchar>(y + inextPt.y + 1, inextPt.x*cn);
const short* Iptr = IWinBuf.ptr<short>(y, 0);
const short* dIptr = derivIWinBuf.ptr<short>(y, 0);
x = 0;
for (; x < winSize.width*cn; x++, dIptr += 2)
for (int x = 0; x < winSize.width*cn; x++, dIptr += 2)
{
if (dIptr[0] == 0 && dIptr[1] == 0)
continue;
int diff = static_cast<int>(CV_DESCALE( Jptr[x] * iw00 +
Jptr[x + cn] * iw01 +
Jptr1[x] * iw10 +
Jptr1[x + cn] * iw11, W_BITS1 - 5)
Jptr1[x + cn] * iw11, W_BITS - 5)
- Iptr[x] + Iptr[x] * gainVec.x + gainVec.y);
residualMat.at<short>(buffIdx++) = static_cast<short>(diff);
}
......@@ -959,248 +723,205 @@ public:
fParam0 = normSigma0 * MEstimatorScale;
fParam1 = normSigma1 * MEstimatorScale;
#ifdef RLOF_SSE
qw0 = _mm_set1_epi32(iw00 + (iw01 << 16));
qw1 = _mm_set1_epi32(iw10 + (iw11 << 16));
__m128 qb0 = _mm_setzero_ps(), qb1 = _mm_setzero_ps(), qb2 = _mm_setzero_ps();
__m128 qb3 = _mm_setzero_ps();
__m128 mmSumW1 = _mm_setzero_ps(), mmSumW2 = _mm_setzero_ps();
__m128 mmSumI = _mm_setzero_ps(), mmSumW = _mm_setzero_ps(), mmSumDI = _mm_setzero_ps();
__m128 mmSumIy = _mm_setzero_ps(), mmSumIx = _mm_setzero_ps();
__m128 mmAxx = _mm_setzero_ps(), mmAxy = _mm_setzero_ps(), mmAyy = _mm_setzero_ps();
__m128i mmParam0 = _mm_set1_epi16(MIN(std::numeric_limits<short>::max() - 1, static_cast<short>(fParam0)));
__m128i mmParam1 = _mm_set1_epi16(MIN(std::numeric_limits<short>::max() - 1, static_cast<short>(fParam1)));
float s2Val = std::fabs(normSigma2);
int s2bitShift = normSigma2 == 0 ? 1 : cvCeil(log(200.f / s2Val) / log(2.f));
__m128i mmParam2_epi16 = _mm_set1_epi16(static_cast<short>(normSigma2 * (float)(1 << s2bitShift)));
__m128i mmOness_epi16 = _mm_set1_epi16(1 << s2bitShift);
__m128 mmParam2s = _mm_set1_ps(0.01f * normSigma2);
__m128 mmParam2s2 = _mm_set1_ps(normSigma2 * normSigma2);
#if CV_SIMD128
v_int16x8 vqw0 = v_int16x8((short)(iw00), (short)(iw01), (short)(iw00), (short)(iw01), (short)(iw00), (short)(iw01), (short)(iw00), (short)(iw01));
v_int16x8 vqw1 = v_int16x8((short)(iw10), (short)(iw11), (short)(iw10), (short)(iw11), (short)(iw10), (short)(iw11), (short)(iw10), (short)(iw11));
v_float32x4 vqb0 = v_setzero_f32(), vqb1 = v_setzero_f32(), vqb2 = v_setzero_f32(), vqb3 = v_setzero_f32();
v_float32x4 vsumW1 = v_setzero_f32(), vsumW2 = v_setzero_f32(), vsumW = v_setzero_f32();
v_float32x4 vsumIy = v_setzero_f32(), vsumIx = v_setzero_f32(), vsumI = v_setzero_f32(), vsumDI = v_setzero_f32();
v_float32x4 vAxx = v_setzero_f32(), vAxy = v_setzero_f32(), vAyy = v_setzero_f32();
int s2bitShift = normSigma2 == 0 ? 1 : cvCeil(log(200.f / std::fabs(normSigma2)) / log(2.f));
v_int32x4 vdelta = v_setall_s32(1 << (W_BITS - 5 - 1));
v_int16x8 vzero = v_setzero_s16();
v_int16x8 voness = v_setall_s16(1 << s2bitShift);
v_float32x4 vones = v_setall_f32(1.f);
v_float32x4 vzeros = v_setzero_f32();
v_int16x8 vmax_val_16 = v_setall_s16(std::numeric_limits<unsigned short>::max());
v_int16x8 vscale = v_setall_s16(static_cast<short>(MEstimatorScale));
v_int16x8 veta = v_setzero_s16();
v_int16x8 vparam0 = v_setall_s16(MIN(std::numeric_limits<short>::max() - 1, static_cast<short>(fParam0)));
v_int16x8 vparam1 = v_setall_s16(MIN(std::numeric_limits<short>::max() - 1, static_cast<short>(fParam1)));
v_int16x8 vneg_param1 = v_setall_s16(-MIN(std::numeric_limits<short>::max() - 1, static_cast<short>(fParam1)));
v_int16x8 vparam2 = v_setall_s16(static_cast<short>(normSigma2 * (float)(1 << s2bitShift)));
v_float32x4 vparam2s = v_setall_f32(0.01f * normSigma2);
v_float32x4 vparam2s2 = v_setall_f32(normSigma2 * normSigma2);
float gainVal = gainVec.x > 0 ? gainVec.x : -gainVec.x;
int bitShift = gainVec.x == 0 ? 1 : cvCeil(log(200.f / gainVal) / log(2.f));
__m128i mmGainValue_epi16 = _mm_set1_epi16(static_cast<short>(gainVec.x * (float)(1 << bitShift)));
__m128i mmConstValue_epi16 = _mm_set1_epi16(static_cast<short>(gainVec.y));
__m128i mmEta = _mm_setzero_si128();
__m128i mmScale = _mm_set1_epi16(static_cast<short>(MEstimatorScale));
v_int16x8 vgain_value = v_setall_s16(static_cast<short>(gainVec.x * (float)(1 << bitShift)));
v_int16x8 vconst_value = v_setall_s16(static_cast<short>(gainVec.y));
#endif
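The per-pixel re-weighting in the loop below implements the shrinked Hampel norm configured by fParam0, fParam1 and normSigma2 above: residuals below sigma0 pass through unchanged, residuals between sigma0 and sigma1 are shifted towards zero and damped by normSigma2, and residuals above sigma1 are rejected. A minimal scalar sketch of that re-weighting, in floating point and ignoring the fixed-point shifts used by the vectorized code:
#include <cmath>
// Scalar model of the shrinked-Hampel re-weighting applied to a residual It.
// fParam0 / fParam1 are normSigma0 / normSigma1 scaled by the current MEstimatorScale.
static float shrinkedHampel(float It, float fParam0, float fParam1, float normSigma2)
{
    float absIt = std::fabs(It);
    if (absIt >= fParam1)
        return 0.f;                                    // outlier: rejected
    if (absIt > fParam0)                               // transition zone
        return normSigma2 * (It > 0 ? It - fParam1     // shrink towards zero, damp by sigma2
                                    : It + fParam1);
    return It;                                         // inlier: kept as is
}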
buffIdx = 0;
for (y = 0; y < _winSize.height; y++)
for (int y = 0; y < _winSize.height; y++)
{
const uchar* Jptr = J.ptr<uchar>(y + inextPt.y, inextPt.x*cn);
const uchar* Jptr1 = J.ptr<uchar>(y + inextPt.y + 1, inextPt.x*cn);
const short* Iptr = IWinBuf.ptr<short>(y, 0);
const short* dIptr = derivIWinBuf.ptr<short>(y, 0);
const tMaskType* maskPtr = winMaskMat.ptr<tMaskType>(y, 0);
x = 0;
#ifdef RLOF_SSE
for (; x <= _winSize.width*cn; x += 8, dIptr += 8 * 2)
{
__m128i mask_0_7_epi16 = _mm_mullo_epi16(_mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*)(maskPtr + x))), mmMaskSet_epi16);
__m128i diff0, diff1;
__m128i I_0_7_epi16 = _mm_loadu_si128((const __m128i*)(Iptr + x)); // from element 0 to 7
__m128i v00 = _mm_unpacklo_epi8(
_mm_loadl_epi64((const __m128i*)(Jptr + x)) , z); //J0 , 0, J1, 0, J2, 0 ... J7,0
__m128i v01 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x + cn)), z);
__m128i v10 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr1 + x)), z);
__m128i v11 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr1 + x + cn)), z);
__m128i t0 = _mm_add_epi32
(_mm_madd_epi16(
_mm_unpacklo_epi16(v00, v01), // interleave J[x+i] and J[x+i+cn]: J0 J0', J1 J1', J2 J2', J3 J3'
qw0),
_mm_madd_epi16(_mm_unpacklo_epi16(v10, v11), qw1));
__m128i t1 = _mm_add_epi32(_mm_madd_epi16(_mm_unpackhi_epi16(v00, v01), qw0),
_mm_madd_epi16(_mm_unpackhi_epi16(v10, v11), qw1));
t0 = _mm_srai_epi32(_mm_add_epi32(t0, qdelta), W_BITS1 - 5);
t1 = _mm_srai_epi32(_mm_add_epi32(t1, qdelta), W_BITS1 - 5);
__m128i lo = _mm_mullo_epi16(mmGainValue_epi16, I_0_7_epi16);
__m128i hi = _mm_mulhi_epi16(mmGainValue_epi16, I_0_7_epi16);
__m128i Igain_0_3_epi32 = _mm_srai_epi32(_mm_unpacklo_epi16(lo, hi), bitShift);
__m128i Igain_4_7_epi32 = _mm_srai_epi32(_mm_unpackhi_epi16(lo, hi), bitShift);
__m128i Igain_epi16 = _mm_packs_epi32(Igain_0_3_epi32, Igain_4_7_epi32);
// diff = J - I + I*gain.x + gain.y
__m128i mmDiff_epi16 = _mm_add_epi16(
_mm_subs_epi16(_mm_packs_epi32(t0, t1), I_0_7_epi16), // J - I
_mm_add_epi16(Igain_epi16, mmConstValue_epi16));
mmDiff_epi16 = _mm_and_si128(mmDiff_epi16, mask_0_7_epi16);
__m128i scalediffIsPos_epi16 = _mm_cmpgt_epi16(mmDiff_epi16, mmScale);
mmEta = _mm_add_epi16(mmEta, _mm_add_epi16(_mm_and_si128(scalediffIsPos_epi16, _mm_set1_epi16(2)), _mm_set1_epi16(-1)));
__m128i Ixy_0 = _mm_loadu_si128((const __m128i*)(dIptr)); // Ix0 Iy0 Ix1 Iy1
__m128i Ixy_1 = _mm_loadu_si128((const __m128i*)(dIptr + 8));
__m128i abs_epi16 = _mm_abs_epi16(mmDiff_epi16);
__m128i bSet2_epi16, bSet1_epi16;
#if CV_SIMD128
for (int x = 0; x <= _winSize.width*cn; x += 8, dIptr += 8 * 2)
{
v_int16x8 vI = v_reinterpret_as_s16(v_load(Iptr + x)), diff0, diff1, diff2;
v_int16x8 v00 = v_reinterpret_as_s16(v_load_expand(Jptr + x));
v_int16x8 v01 = v_reinterpret_as_s16(v_load_expand(Jptr + x + cn));
v_int16x8 v10 = v_reinterpret_as_s16(v_load_expand(Jptr1 + x));
v_int16x8 v11 = v_reinterpret_as_s16(v_load_expand(Jptr1 + x + cn));
v_int16x8 vmask = v_reinterpret_as_s16(v_load_expand(maskPtr + x)) * vmax_val_16;
v_int32x4 t0, t1;
v_int16x8 t00, t01, t10, t11;
v_zip(v00, v01, t00, t01);
v_zip(v10, v11, t10, t11);
//subpixel interpolation
t0 = v_dotprod(t00, vqw0, vdelta) + v_dotprod(t10, vqw1);
t1 = v_dotprod(t01, vqw0, vdelta) + v_dotprod(t11, vqw1);
t0 = t0 >> (W_BITS - 5);
t1 = t1 >> (W_BITS - 5);
// diff = J - I
diff0 = v_pack(t0, t1) - vI;
// I*gain.x + gain.x
v_mul_expand(vI, vgain_value, t0, t1);
diff0 = diff0 + v_pack(t0 >> bitShift, t1 >> bitShift) + vconst_value;
diff0 = diff0 & vmask;
v_int16x8 vscale_diff_is_pos = diff0 > vscale;
veta = veta + (vscale_diff_is_pos & v_setall_s16(2)) + v_setall_s16(-1);
// v_abs on a signed vector returns an unsigned one, so reinterpret it back to s16
v_int16x8 vabs_diff = v_reinterpret_as_s16(v_abs(diff0));
v_int16x8 vset2, vset1;
// |It| < sigma1 ?
bSet2_epi16 = _mm_cmplt_epi16(abs_epi16, mmParam1);
vset2 = vabs_diff < vparam1;
// It > 0 ?
__m128i diffIsPos_epi16 = _mm_cmpgt_epi16(mmDiff_epi16, _mm_setzero_si128());
v_int16x8 vdiff_is_pos = diff0 > vzero;
// sigma0 < |It| < sigma1 ?
bSet1_epi16 = _mm_and_si128(bSet2_epi16, _mm_cmpgt_epi16(abs_epi16, mmParam0));
vset1 = vset2 & (vabs_diff > vparam0);
// val = |It| -/+ sigma1
__m128i tmpParam1_epi16 = _mm_add_epi16(_mm_and_si128(diffIsPos_epi16, _mm_sub_epi16(mmDiff_epi16, mmParam1)),
_mm_andnot_si128(diffIsPos_epi16, _mm_add_epi16(mmDiff_epi16, mmParam1)));
v_int16x8 vtmp_param1 = diff0 + v_select(vdiff_is_pos, vneg_param1, vparam1);
// It == 0 ? |It| > sigma1
mmDiff_epi16 = _mm_and_si128(bSet2_epi16, mmDiff_epi16);
diff0 = vset2 & diff0;
// It == val ? sigma0 < |It| < sigma1
mmDiff_epi16 = _mm_blendv_epi8(mmDiff_epi16, tmpParam1_epi16, bSet1_epi16);
diff0 = v_select(vset1, vtmp_param1, diff0);
__m128i tale_epi16_ = _mm_blendv_epi8(mmOness_epi16, mmParam2_epi16, bSet1_epi16); // mask for 0 - 3
v_int16x8 tale_ = v_select(vset1, vparam2, voness); // mask for 0 - 3
// diff = diff * sigma2
lo = _mm_mullo_epi16(tale_epi16_, mmDiff_epi16);
hi = _mm_mulhi_epi16(tale_epi16_, mmDiff_epi16);
__m128i diff_0_3_epi32 = _mm_srai_epi32(_mm_unpacklo_epi16(lo, hi), s2bitShift); //diff 0_3_epi32
__m128i diff_4_7_epi32 = _mm_srai_epi32(_mm_unpackhi_epi16(lo, hi), s2bitShift); // diff 4_7_epi32
mmDiff_epi16 = _mm_packs_epi32(diff_0_3_epi32, diff_4_7_epi32); // pack back, since the result should fit into 16 bit
diff1 = _mm_unpackhi_epi16(mmDiff_epi16, mmDiff_epi16); // It4 It4 It5 It5 It6 It6 It7 It7 | It12 It12 It13 It13...
diff0 = _mm_unpacklo_epi16(mmDiff_epi16, mmDiff_epi16); // It0 It0 It1 It1 It2 It2 It3 It3 | It8 It8 It9 It9...
// Ix * diff / Iy * diff
v10 = _mm_mullo_epi16(Ixy_0, diff0);
v11 = _mm_mulhi_epi16(Ixy_0, diff0);
v00 = _mm_unpacklo_epi16(v10, v11);
v10 = _mm_unpackhi_epi16(v10, v11);
qb0 = _mm_add_ps(qb0, _mm_cvtepi32_ps(v00));
qb1 = _mm_add_ps(qb1, _mm_cvtepi32_ps(v10));
// It * Ix It * Iy [4 ... 7]
// for set 1 hi sigma 1
v10 = _mm_mullo_epi16(Ixy_1, diff1);
v11 = _mm_mulhi_epi16(Ixy_1, diff1);
v00 = _mm_unpacklo_epi16(v10, v11);
v10 = _mm_unpackhi_epi16(v10, v11);
qb0 = _mm_add_ps(qb0, _mm_cvtepi32_ps(v00));
qb1 = _mm_add_ps(qb1, _mm_cvtepi32_ps(v10));
// diff * J [0 ... 7]
// for set 1 sigma 1
// b3 += diff * Iptr[x]
v10 = _mm_mullo_epi16(mmDiff_epi16, I_0_7_epi16);
v11 = _mm_mulhi_epi16(mmDiff_epi16, I_0_7_epi16);
v00 = _mm_unpacklo_epi16(v10, v11);
v10 = _mm_unpackhi_epi16(v10, v11);
qb2 = _mm_add_ps(qb2, _mm_cvtepi32_ps(v00));
qb2 = _mm_add_ps(qb2, _mm_cvtepi32_ps(v10));
qb3 = _mm_add_ps(qb3, _mm_cvtepi32_ps(diff_0_3_epi32));
qb3 = _mm_add_ps(qb3, _mm_cvtepi32_ps(diff_4_7_epi32));
v_int32x4 diff_int_0, diff_int_1;
v_mul_expand(diff0, tale_, diff_int_0, diff_int_1);
v_int32x4 diff0_0 = diff_int_0 >> s2bitShift;
v_int32x4 diff0_1 = diff_int_1 >> s2bitShift;
diff0 = v_pack(diff0_0, diff0_1);
v_zip(diff0, diff0, diff2, diff1); // It0 It0 It1 It1 ...
v_int16x8 vIxy_0 = v_reinterpret_as_s16(v_load(dIptr)); // Ix0 Iy0 Ix1 Iy1 ...
v_int16x8 vIxy_1 = v_reinterpret_as_s16(v_load(dIptr + 8));
v_zip(vIxy_0, vIxy_1, v10, v11);
v_zip(diff2, diff1, v00, v01);
vqb0 += v_cvt_f32(v_dotprod(v00, v10));
vqb1 += v_cvt_f32(v_dotprod(v01, v11));
v_int32x4 vI0, vI1;
v_expand(vI, vI0, vI1);
vqb2 += v_cvt_f32(diff0_0 * vI0);
vqb2 += v_cvt_f32(diff0_1 * vI1);
vqb3 += v_cvt_f32(diff0_0);
vqb3 += v_cvt_f32(diff0_1);
if (j == 0)
{
__m128 bSet1_0_3_ps = _mm_cvtepi32_ps(_mm_cvtepi16_epi32(bSet1_epi16));
__m128 bSet1_4_7_ps = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(bSet1_epi16, bSet1_epi16), 16));
__m128 mask_0_4_ps = _mm_cvtepi32_ps(_mm_cvtepi16_epi32(mask_0_7_epi16));
__m128 mask_4_7_ps = _mm_cvtepi32_ps((_mm_srai_epi32(_mm_unpackhi_epi16(mask_0_7_epi16, mask_0_7_epi16), 16)));
v_int32x4 vset1_0, vset1_1, vset2_0, vset2_1;
v_int32x4 vmask_0, vmask_1;
__m128 bSet2_0_3_ps = _mm_cvtepi32_ps(_mm_cvtepi16_epi32(bSet2_epi16));
__m128 bSet2_4_7_ps = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(bSet2_epi16, bSet2_epi16), 16));
v_expand(vset1, vset1_0, vset1_1);
v_expand(vset2, vset2_0, vset2_1);
v_expand(vmask, vmask_0, vmask_1);
__m128 tale_0_3_ps = _mm_blendv_ps(mmOnes, mmParam2s2, bSet1_0_3_ps);
__m128 tale_4_7_ps = _mm_blendv_ps(mmOnes, mmParam2s2, bSet1_4_7_ps);
tale_0_3_ps = _mm_blendv_ps(mmParam2s, tale_0_3_ps, bSet2_0_3_ps);
tale_4_7_ps = _mm_blendv_ps(mmParam2s, tale_4_7_ps, bSet2_4_7_ps);
v_float32x4 vtale_0 = v_select(v_reinterpret_as_f32(vset1_0), vparam2s2, vones);
v_float32x4 vtale_1 = v_select(v_reinterpret_as_f32(vset1_1), vparam2s2, vones);
vtale_0 = v_select(v_reinterpret_as_f32(vset2_0), vtale_0, vparam2s);
vtale_1 = v_select(v_reinterpret_as_f32(vset2_1), vtale_1, vparam2s);
tale_0_3_ps = _mm_blendv_ps(_mm_set1_ps(0), tale_0_3_ps, mask_0_4_ps);
tale_4_7_ps = _mm_blendv_ps(_mm_set1_ps(0), tale_4_7_ps, mask_4_7_ps);
t0 = _mm_srai_epi32(Ixy_0, 16);
t1 = _mm_srai_epi32(_mm_slli_epi32(Ixy_0, 16), 16);
vtale_0 = v_select(v_reinterpret_as_f32(vmask_0), vtale_0, vzeros);
vtale_1 = v_select(v_reinterpret_as_f32(vmask_1), vtale_1, vzeros);
__m128 fy = _mm_cvtepi32_ps(t0);
__m128 fx = _mm_cvtepi32_ps(t1);
v00 = v_reinterpret_as_s16(v_interleave_pairs(v_reinterpret_as_s32(v_interleave_pairs(vIxy_0))));
v_expand(v00, t1, t0);
// 0 ... 3
__m128 I_ps = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(I_0_7_epi16, I_0_7_epi16), 16));
v_float32x4 vI_ps = v_cvt_f32(vI0);
v_float32x4 fy = v_cvt_f32(t0);
v_float32x4 fx = v_cvt_f32(t1);
// A11 - A22
__m128 fxtale = _mm_mul_ps(fx, tale_0_3_ps);
__m128 fytale = _mm_mul_ps(fy, tale_0_3_ps);
v_float32x4 fxtale = fx * vtale_0;
v_float32x4 fytale = fy * vtale_0;
mmAyy = _mm_add_ps(mmAyy, _mm_mul_ps(fy, fytale));
mmAxy = _mm_add_ps(mmAxy, _mm_mul_ps(fx, fytale));
mmAxx = _mm_add_ps(mmAxx, _mm_mul_ps(fx, fxtale));
vAyy = v_muladd(fy, fytale, vAyy);
vAxy = v_muladd(fx, fytale, vAxy);
vAxx = v_muladd(fx, fxtale, vAxx);
// sumIx and sumIy
mmSumIx = _mm_add_ps(mmSumIx, fxtale);
mmSumIy = _mm_add_ps(mmSumIy, fytale);
vsumIx += fxtale;
vsumIy += fytale;
mmSumW1 = _mm_add_ps(mmSumW1, _mm_mul_ps(I_ps, fxtale));
mmSumW2 = _mm_add_ps(mmSumW2, _mm_mul_ps(I_ps, fytale));
vsumW1 += vI_ps * fxtale;
vsumW2 += vI_ps * fytale;
// sumI
__m128 I_tale_ps = _mm_mul_ps(I_ps, tale_0_3_ps);
mmSumI = _mm_add_ps(mmSumI, I_tale_ps);
v_float32x4 vI_tale = vI_ps * vtale_0;
vsumI += vI_tale;
// sumW
mmSumW = _mm_add_ps(mmSumW, tale_0_3_ps);
vsumW += vtale_0;
// sumDI
mmSumDI = _mm_add_ps(mmSumDI, _mm_mul_ps(I_ps, I_tale_ps));
t0 = _mm_srai_epi32(Ixy_1, 16); // Iy8 Iy9 Iy10 Iy11
t1 = _mm_srai_epi32(_mm_slli_epi32(Ixy_1, 16), 16); // Ix0 Ix1 Ix2 Ix3
vsumDI += vI_ps * vI_tale;
fy = _mm_cvtepi32_ps(t0);
fx = _mm_cvtepi32_ps(t1);
v01 = v_reinterpret_as_s16(v_interleave_pairs(v_reinterpret_as_s32(v_interleave_pairs(vIxy_1))));
v_expand(v01, t1, t0);
vI_ps = v_cvt_f32(vI1);
// 4 ... 7
I_ps = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(I_0_7_epi16, I_0_7_epi16), 16));
fy = v_cvt_f32(t0);
fx = v_cvt_f32(t1);
// A11 - A22
fxtale = _mm_mul_ps(fx, tale_4_7_ps);
fytale = _mm_mul_ps(fy, tale_4_7_ps);
fxtale = fx * vtale_1;
fytale = fy * vtale_1;
mmAyy = _mm_add_ps(mmAyy, _mm_mul_ps(fy, fytale));
mmAxy = _mm_add_ps(mmAxy, _mm_mul_ps(fx, fytale));
mmAxx = _mm_add_ps(mmAxx, _mm_mul_ps(fx, fxtale));
vAyy = v_muladd(fy, fytale, vAyy);
vAxy = v_muladd(fx, fytale, vAxy);
vAxx = v_muladd(fx, fxtale, vAxx);
// sumIx and sumIy
mmSumIx = _mm_add_ps(mmSumIx, fxtale);
mmSumIy = _mm_add_ps(mmSumIy, fytale);
vsumIx += fxtale;
vsumIy += fytale;
mmSumW1 = _mm_add_ps(mmSumW1, _mm_mul_ps(I_ps, fxtale));
mmSumW2 = _mm_add_ps(mmSumW2, _mm_mul_ps(I_ps, fytale));
vsumW1 += vI_ps * fxtale;
vsumW2 += vI_ps * fytale;
// sumI
I_tale_ps = _mm_mul_ps(I_ps, tale_4_7_ps);
mmSumI = _mm_add_ps(mmSumI, I_tale_ps);
vI_tale = vI_ps * vtale_1;
vsumI += vI_tale;
mmSumW = _mm_add_ps(mmSumW, tale_4_7_ps);
// sumW
vsumW += vtale_1;
mmSumDI = _mm_add_ps(mmSumDI, _mm_mul_ps(I_ps, I_tale_ps));
// sumDI
vsumDI += vI_ps * vI_tale;
}
}
#else
for (; x < _winSize.width*cn; x++, dIptr += 2)
for (int x = 0; x < _winSize.width*cn; x++, dIptr += 2)
{
if (maskPtr[x] == 0)
continue;
int J_val = CV_DESCALE(Jptr[x] * iw00 + Jptr[x + cn] * iw01 + Jptr1[x] * iw10 + Jptr1[x + cn] * iw11,
W_BITS1 - 5);
W_BITS - 5);
short ixval = static_cast<short>(dIptr[0]);
short iyval = static_cast<short>(dIptr[1]);
int diff = static_cast<int>(J_val - Iptr[x] + Iptr[x] * gainVec.x + gainVec.y);
......@@ -1264,30 +985,22 @@ public:
}
#endif
}
#ifdef RLOF_SSE
short etaValues[8];
_mm_storeu_si128((__m128i*)(etaValues), mmEta);
MEstimatorScale += eta * (etaValues[0] + etaValues[1] + etaValues[2] + etaValues[3]
+ etaValues[4] + etaValues[5] + etaValues[6] + etaValues[7]);
float CV_DECL_ALIGNED(32) wbuf[4];
#if CV_SIMD128
MEstimatorScale += eta * v_reduce_sum(veta);
#endif
if (j == 0)
{
#ifdef RLOF_SSE
_mm_store_ps(wbuf, mmSumW1);
w1 = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3];
_mm_store_ps(wbuf, mmSumW2);
w2 = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3];
_mm_store_ps(wbuf, mmSumDI);
dI = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3];
_mm_store_ps(wbuf, mmSumI);
sumI = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3];
_mm_store_ps(wbuf, mmSumIx);
sumIx = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3];
_mm_store_ps(wbuf, mmSumIy);
sumIy = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3];
_mm_store_ps(wbuf, mmSumW);
sumW = wbuf[0] + wbuf[1] + wbuf[2] + wbuf[3];
#if CV_SIMD128
w1 = v_reduce_sum(vsumW1);
w2 = v_reduce_sum(vsumW2);
dI = v_reduce_sum(vsumDI);
sumI = v_reduce_sum(vsumI);
sumIx = v_reduce_sum(vsumIx);
sumIy = v_reduce_sum(vsumIy);
sumW = v_reduce_sum(vsumW);
A11 = v_reduce_sum(vAxx);
A12 = v_reduce_sum(vAxy);
A22 = v_reduce_sum(vAyy);
#endif
sumIx *= -FLT_SCALE;
sumIy *= -FLT_SCALE;
......@@ -1296,30 +1009,19 @@ public:
w1 *= -FLT_SCALE;
w2 *= -FLT_SCALE;
dI *= FLT_SCALE;
#ifdef RLOF_SSE
float CV_DECL_ALIGNED(16) A11buf[4], A12buf[4], A22buf[4];
_mm_store_ps(A11buf, mmAxx);
_mm_store_ps(A12buf, mmAxy);
_mm_store_ps(A22buf, mmAyy);
A11 = A11buf[0] + A11buf[1] + A11buf[2] + A11buf[3];
A12 = A12buf[0] + A12buf[1] + A12buf[2] + A12buf[3];
A22 = A22buf[0] + A22buf[1] + A22buf[2] + A22buf[3];
#if CV_SIMD128
#endif
A11 *= FLT_SCALE;
A12 *= FLT_SCALE;
A22 *= FLT_SCALE;
}
#ifdef RLOF_SSE
#if CV_SIMD128
float CV_DECL_ALIGNED(16) bbuf[4];
_mm_store_ps(bbuf, _mm_add_ps(qb0, qb1));
v_store_aligned(bbuf, vqb0 + vqb1);
b1 = bbuf[0] + bbuf[2];
b2 = bbuf[1] + bbuf[3];
_mm_store_ps(bbuf, qb2);
b3 = bbuf[0] + bbuf[1] + bbuf[2] + bbuf[3];
_mm_store_ps(bbuf, qb3);
b4 = bbuf[0] + bbuf[1] + bbuf[2] + bbuf[3];
b3 = v_reduce_sum(vqb2);
b4 = v_reduce_sum(vqb3);
#endif
mismatchMat(0) = b1 * FLT_SCALE;
mismatchMat(1) = b2 * FLT_SCALE;
......
......@@ -4,62 +4,306 @@
#ifndef _RLOF_INVOKERBASE_HPP_
#define _RLOF_INVOKERBASE_HPP_
#if CV_SSE4_1
#define RLOF_SSE
#endif
//#define DEBUG_INVOKER
#ifndef CV_DESCALE
#define CV_DESCALE(x, n) (((x) + (1 << ((n)-1))) >> (n))
#endif
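CV_DESCALE(x, n) is a rounding right shift: half of 2^n is added before shifting, so fixed-point interpolation results are rounded to nearest instead of truncated. A minimal standalone sketch of its behaviour (the weight values are illustrative, not taken from this diff):
#include <cassert>
#define CV_DESCALE(x, n) (((x) + (1 << ((n)-1))) >> (n))
int main()
{
    const int W_BITS = 14;                 // bilinear weights are stored in Q14 fixed point
    int iw00 = 1 << W_BITS;                // weight 1.0 for the top-left sample
    int iw01 = 0, iw10 = 0, iw11 = 0;      // remaining bilinear weights are 0
    int src = 100;                         // 8-bit pixel value
    // interpolated value keeps 5 fractional bits (shift by W_BITS - 5), as in the patch code
    int ival = CV_DESCALE(src * iw00 + src * iw01 + src * iw10 + src * iw11, W_BITS - 5);
    assert(ival == (100 << 5));            // 100 * 32, i.e. the pixel value scaled to Q5
    return 0;
}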
#define FLT_RESCALE 1
#include "rlof_localflow.h"
#include <unordered_map>
#include "opencv2/core/hal/intrin.hpp"
using namespace std;
using namespace cv;
namespace cv {
namespace optflow {
typedef short deriv_type;
#ifdef RLOF_SSE
static inline void get4BitMask(const int & width, __m128i & mask)
{
int noBits = width - static_cast<int>(floor(width / 4.f) * 4.f);
unsigned int val[4];
for (int n = 0; n < 4; n++)
{
val[n] = (noBits > n) ? (std::numeric_limits<unsigned int>::max()) : 0;
}
mask = _mm_set_epi32(val[3], val[2], val[1], val[0]);
#if CV_SIMD128
}
static inline void getWBitMask(const int & width, __m128i & t0, __m128i & t1, __m128i & t2)
static inline void getVBitMask(const int & width, v_int32x4 & mask0, v_int32x4 & mask1)
{
int noBits = width - static_cast<int>(floor(width / 8.f) * 8.f);
unsigned short val[8];
unsigned int val[8];
for (int n = 0; n < 8; n++)
{
val[n] = (noBits > n) ? (0xffff) : 0;
val[n] = (noBits > n) ? (std::numeric_limits<unsigned int>::max()) : 0;
}
t1 = _mm_set_epi16(val[7], val[7], val[6], val[6], val[5], val[5], val[4], val[4]);
t0 = _mm_set_epi16(val[3], val[3], val[2], val[2], val[1], val[1], val[0], val[0]);
t2 = _mm_set_epi16(val[7], val[6], val[5], val[4], val[3], val[2], val[1], val[0]);
mask0 = v_int32x4(val[0], val[1], val[2], val[3]);
mask1 = v_int32x4(val[4], val[5], val[6], val[7]);
}
#endif
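getVBitMask builds two 4-lane masks that keep only the lanes belonging to the last partial 8-element block of a row (width mod 8 valid lanes), so lanes outside the window are zeroed before accumulation. A minimal scalar sketch of the same masking logic, using plain uint32_t lanes instead of v_int32x4 (width % 8 is equivalent to the floor-based computation for positive widths):
#include <cstdint>
#include <cstdio>
// Scalar model of getVBitMask: lane n is kept iff n < (width % 8).
static void getBorderMask(int width, uint32_t mask0[4], uint32_t mask1[4])
{
    int noBits = width % 8;                     // number of valid lanes in the last partial block
    for (int n = 0; n < 8; n++)
    {
        uint32_t v = (noBits > n) ? 0xffffffffu : 0u;
        if (n < 4) mask0[n] = v; else mask1[n - 4] = v;
    }
}
int main()
{
    uint32_t m0[4], m1[4];
    getBorderMask(21, m0, m1);                  // e.g. a 21-pixel-wide support window -> 5 valid tail lanes
    for (int n = 0; n < 4; n++) std::printf("%08x ", (unsigned)m0[n]);
    for (int n = 0; n < 4; n++) std::printf("%08x ", (unsigned)m1[n]);
    std::printf("\n");                          // five times ffffffff followed by three times 00000000
    return 0;
}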
typedef uchar tMaskType;
#define tCVMaskType CV_8UC1
#define MaskSet 0xffffffff
static
void getLocalPatch(
static inline void copyWinBuffers(int iw00, int iw01, int iw10, int iw11,
Size winSize,
const Mat & I, const Mat & derivI, const Mat & winMaskMat,
Mat & IWinBuf, Mat & derivIWinBuf,
Point iprevPt)
{
int cn = I.channels(), cn2 = cn * 2;
const int W_BITS = 14;
#if CV_SIMD128
v_int16x8 vqw0((short)(iw00), (short)(iw01), (short)(iw00), (short)(iw01), (short)(iw00), (short)(iw01), (short)(iw00), (short)(iw01));
v_int16x8 vqw1((short)(iw10), (short)(iw11), (short)(iw10), (short)(iw11), (short)(iw10), (short)(iw11), (short)(iw10), (short)(iw11));
v_int32x4 vdelta_d = v_setall_s32(1 << (W_BITS - 1));
v_int32x4 vdelta = v_setall_s32(1 << (W_BITS - 5 - 1));
v_int32x4 vmax_val_32 = v_setall_s32(std::numeric_limits<unsigned int>::max());
v_int32x4 vmask_border_0, vmask_border_1;
getVBitMask(winSize.width, vmask_border_0, vmask_border_1);
#endif
// extract the patch from the first image
int x, y;
for (y = 0; y < winSize.height; y++)
{
const uchar* src = I.ptr<uchar>(y + iprevPt.y, 0) + iprevPt.x*cn;
const uchar* src1 = I.ptr<uchar>(y + iprevPt.y + 1, 0) + iprevPt.x*cn;
const short* dsrc = derivI.ptr<short>(y + iprevPt.y, 0) + iprevPt.x*cn2;
const short* dsrc1 = derivI.ptr<short>(y + iprevPt.y + 1, 0) + iprevPt.x*cn2;
short* Iptr = IWinBuf.ptr<short>(y, 0);
short* dIptr = derivIWinBuf.ptr<short>(y, 0);
const tMaskType* maskPtr = winMaskMat.ptr<tMaskType>(y, 0);
x = 0;
#if CV_SIMD128
for (; x <= winSize.width*cn; x += 8, dsrc += 8 * 2, dsrc1 += 8 * 2, dIptr += 8 * 2)
{
v_int32x4 vmask0 = v_reinterpret_as_s32(v_load_expand_q(maskPtr + x)) * vmax_val_32;
v_int32x4 vmask1 = v_reinterpret_as_s32(v_load_expand_q(maskPtr + x + 4)) * vmax_val_32;
if (x + 4 > winSize.width)
{
vmask0 = vmask0 & vmask_border_0;
}
if (x + 8 > winSize.width)
{
vmask1 = vmask1 & vmask_border_1;
}
v_int32x4 t0, t1;
v_int16x8 v00, v01, v10, v11, t00, t01, t10, t11;
v00 = v_reinterpret_as_s16(v_load_expand(src + x));
v01 = v_reinterpret_as_s16(v_load_expand(src + x + cn));
v10 = v_reinterpret_as_s16(v_load_expand(src1 + x));
v11 = v_reinterpret_as_s16(v_load_expand(src1 + x + cn));
v_zip(v00, v01, t00, t01);
v_zip(v10, v11, t10, t11);
t0 = v_dotprod(t00, vqw0, vdelta) + v_dotprod(t10, vqw1);
t1 = v_dotprod(t01, vqw0, vdelta) + v_dotprod(t11, vqw1);
t0 = t0 >> (W_BITS - 5) & vmask0;
t1 = t1 >> (W_BITS - 5) & vmask1;
v_store(Iptr + x, v_pack(t0, t1));
v00 = v_reinterpret_as_s16(v_load(dsrc));
v01 = v_reinterpret_as_s16(v_load(dsrc + cn2));
v10 = v_reinterpret_as_s16(v_load(dsrc1));
v11 = v_reinterpret_as_s16(v_load(dsrc1 + cn2));
v_zip(v00, v01, t00, t01);
v_zip(v10, v11, t10, t11);
t0 = v_dotprod(t00, vqw0, vdelta_d) + v_dotprod(t10, vqw1);
t1 = v_dotprod(t01, vqw0, vdelta_d) + v_dotprod(t11, vqw1);
t0 = t0 >> W_BITS;
t1 = t1 >> W_BITS;
v00 = v_pack(t0, t1); // Ix0 Iy0 Ix1 Iy1 ...
v00 = v00 & v_reinterpret_as_s16(vmask0);
v_store(dIptr, v00);
v00 = v_reinterpret_as_s16(v_load(dsrc + 4 * 2));
v01 = v_reinterpret_as_s16(v_load(dsrc + 4 * 2 + cn2));
v10 = v_reinterpret_as_s16(v_load(dsrc1 + 4 * 2));
v11 = v_reinterpret_as_s16(v_load(dsrc1 + 4 * 2 + cn2));
v_zip(v00, v01, t00, t01);
v_zip(v10, v11, t10, t11);
t0 = v_dotprod(t00, vqw0, vdelta_d) + v_dotprod(t10, vqw1);
t1 = v_dotprod(t01, vqw0, vdelta_d) + v_dotprod(t11, vqw1);
t0 = t0 >> W_BITS;
t1 = t1 >> W_BITS;
v00 = v_pack(t0, t1); // Ix0 Iy0 Ix1 Iy1 ...
v00 = v00 & v_reinterpret_as_s16(vmask1);
v_store(dIptr + 4 * 2, v00);
}
#else
for (; x < winSize.width*cn; x++, dsrc += 2, dsrc1 += 2, dIptr += 2)
{
if (maskPtr[x] == 0)
{
dIptr[0] = 0;
dIptr[1] = 0;
continue;
}
int ival = CV_DESCALE(src[x] * iw00 + src[x + cn] * iw01 +
src1[x] * iw10 + src1[x + cn] * iw11, W_BITS - 5);
int ixval = CV_DESCALE(dsrc[0] * iw00 + dsrc[cn2] * iw01 +
dsrc1[0] * iw10 + dsrc1[cn2] * iw11, W_BITS);
int iyval = CV_DESCALE(dsrc[1] * iw00 + dsrc[cn2 + 1] * iw01 + dsrc1[1] * iw10 +
dsrc1[cn2 + 1] * iw11, W_BITS);
Iptr[x] = (short)ival;
dIptr[0] = (short)ixval;
dIptr[1] = (short)iyval;
}
#endif
}
}
static inline void copyWinBuffers(int iw00, int iw01, int iw10, int iw11,
Size winSize,
const Mat & I, const Mat & derivI, const Mat & winMaskMat,
Mat & IWinBuf, Mat & derivIWinBuf,
float & A11, float & A22, float & A12,
Point iprevPt)
{
const float FLT_SCALE = (1.f / (1 << 20));
int cn = I.channels(), cn2 = cn * 2;
const int W_BITS = 14;
#if CV_SIMD128
v_int16x8 vqw0((short)(iw00), (short)(iw01), (short)(iw00), (short)(iw01), (short)(iw00), (short)(iw01), (short)(iw00), (short)(iw01));
v_int16x8 vqw1((short)(iw10), (short)(iw11), (short)(iw10), (short)(iw11), (short)(iw10), (short)(iw11), (short)(iw10), (short)(iw11));
v_int32x4 vdelta_d = v_setall_s32(1 << (W_BITS - 1));
v_int32x4 vdelta = v_setall_s32(1 << (W_BITS - 5 - 1));
v_int32x4 vmax_val_32 = v_setall_s32(std::numeric_limits<unsigned int>::max());
v_int32x4 vmask_border0, vmask_border1;
v_float32x4 vA11 = v_setzero_f32(), vA22 = v_setzero_f32(), vA12 = v_setzero_f32();
getVBitMask(winSize.width, vmask_border0, vmask_border1);
#endif
// extract the patch from the first image and compute the covariance matrix of its derivatives
for (int y = 0; y < winSize.height; y++)
{
const uchar* src = I.ptr<uchar>(y + iprevPt.y, 0) + iprevPt.x*cn;
const uchar* src1 = I.ptr<uchar>(y + iprevPt.y + 1, 0) + iprevPt.x*cn;
const short* dsrc = derivI.ptr<short>(y + iprevPt.y, 0) + iprevPt.x*cn2;
const short* dsrc1 = derivI.ptr<short>(y + iprevPt.y + 1, 0) + iprevPt.x*cn2;
short* Iptr = IWinBuf.ptr<short>(y, 0);
short* dIptr = derivIWinBuf.ptr<short>(y, 0);
const tMaskType* maskPtr = winMaskMat.ptr<tMaskType>(y, 0);
#if CV_SIMD128
for (int x = 0; x <= winSize.width*cn; x += 8, dsrc += 8 * 2, dsrc1 += 8 * 2, dIptr += 8 * 2)
{
v_int32x4 vmask0 = v_reinterpret_as_s32(v_load_expand_q(maskPtr + x)) * vmax_val_32;
v_int32x4 vmask1 = v_reinterpret_as_s32(v_load_expand_q(maskPtr + x + 4)) * vmax_val_32;
if (x + 4 > winSize.width)
{
vmask0 = vmask0 & vmask_border0;
}
if (x + 8 > winSize.width)
{
vmask1 = vmask1 & vmask_border1;
}
v_int32x4 t0, t1;
v_int16x8 v00, v01, v10, v11, t00, t01, t10, t11;
v00 = v_reinterpret_as_s16(v_load_expand(src + x));
v01 = v_reinterpret_as_s16(v_load_expand(src + x + cn));
v10 = v_reinterpret_as_s16(v_load_expand(src1 + x));
v11 = v_reinterpret_as_s16(v_load_expand(src1 + x + cn));
v_zip(v00, v01, t00, t01);
v_zip(v10, v11, t10, t11);
t0 = v_dotprod(t00, vqw0, vdelta) + v_dotprod(t10, vqw1);
t1 = v_dotprod(t01, vqw0, vdelta) + v_dotprod(t11, vqw1);
t0 = t0 >> (W_BITS - 5);
t1 = t1 >> (W_BITS - 5);
t0 = t0 & vmask0;
t1 = t1 & vmask1;
v_store(Iptr + x, v_pack(t0, t1));
v00 = v_reinterpret_as_s16(v_load(dsrc));
v01 = v_reinterpret_as_s16(v_load(dsrc + cn2));
v10 = v_reinterpret_as_s16(v_load(dsrc1));
v11 = v_reinterpret_as_s16(v_load(dsrc1 + cn2));
v_zip(v00, v01, t00, t01);
v_zip(v10, v11, t10, t11);
t0 = v_dotprod(t00, vqw0, vdelta_d) + v_dotprod(t10, vqw1);
t1 = v_dotprod(t01, vqw0, vdelta_d) + v_dotprod(t11, vqw1);
t0 = t0 >> W_BITS;
t1 = t1 >> W_BITS;
v00 = v_pack(t0, t1); // Ix0 Iy0 Ix1 Iy1 ...
v00 = v00 & v_reinterpret_as_s16(vmask0);
v_store(dIptr, v00);
v00 = v_reinterpret_as_s16(v_interleave_pairs(v_reinterpret_as_s32(v_interleave_pairs(v00))));
v_expand(v00, t1, t0);
v_float32x4 fy = v_cvt_f32(t0);
v_float32x4 fx = v_cvt_f32(t1);
vA22 = v_muladd(fy, fy, vA22);
vA12 = v_muladd(fx, fy, vA12);
vA11 = v_muladd(fx, fx, vA11);
v00 = v_reinterpret_as_s16(v_load(dsrc + 4 * 2));
v01 = v_reinterpret_as_s16(v_load(dsrc + 4 * 2 + cn2));
v10 = v_reinterpret_as_s16(v_load(dsrc1 + 4 * 2));
v11 = v_reinterpret_as_s16(v_load(dsrc1 + 4 * 2 + cn2));
v_zip(v00, v01, t00, t01);
v_zip(v10, v11, t10, t11);
t0 = v_dotprod(t00, vqw0, vdelta_d) + v_dotprod(t10, vqw1);
t1 = v_dotprod(t01, vqw0, vdelta_d) + v_dotprod(t11, vqw1);
t0 = t0 >> W_BITS;
t1 = t1 >> W_BITS;
v00 = v_pack(t0, t1); // Ix0 Iy0 Ix1 Iy1 ...
v00 = v00 & v_reinterpret_as_s16(vmask1);
v_store(dIptr + 4 * 2, v00);
v00 = v_reinterpret_as_s16(v_interleave_pairs(v_reinterpret_as_s32(v_interleave_pairs(v00))));
v_expand(v00, t1, t0);
fy = v_cvt_f32(t0);
fx = v_cvt_f32(t1);
vA22 = v_muladd(fy, fy, vA22);
vA12 = v_muladd(fx, fy, vA12);
vA11 = v_muladd(fx, fx, vA11);
}
#else
for (int x = 0; x < winSize.width*cn; x++, dsrc += 2, dsrc1 += 2, dIptr += 2)
{
if (maskPtr[x] == 0)
{
dIptr[0] = 0;
dIptr[1] = 0;
continue;
}
int ival = CV_DESCALE(src[x] * iw00 + src[x + cn] * iw01 +
src1[x] * iw10 + src1[x + cn] * iw11, W_BITS - 5);
int ixval = CV_DESCALE(dsrc[0] * iw00 + dsrc[cn2] * iw01 +
dsrc1[0] * iw10 + dsrc1[cn2] * iw11, W_BITS);
int iyval = CV_DESCALE(dsrc[1] * iw00 + dsrc[cn2 + 1] * iw01 + dsrc1[1] * iw10 +
dsrc1[cn2 + 1] * iw11, W_BITS);
Iptr[x] = (short)ival;
dIptr[0] = (short)ixval;
dIptr[1] = (short)iyval;
A11 += (float)(ixval*ixval);
A12 += (float)(ixval*iyval);
A22 += (float)(iyval*iyval);
}
#endif
}
#if CV_SIMD128
A11 += v_reduce_sum(vA11);
A12 += v_reduce_sum(vA12);
A22 += v_reduce_sum(vA22);
#endif
A11 *= FLT_SCALE;
A12 *= FLT_SCALE;
A22 *= FLT_SCALE;
}
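This overload of copyWinBuffers also returns the FLT_SCALE-normalized Gram matrix A11, A12, A22 of the patch gradients. Such a matrix is commonly checked for conditioning through its smaller eigenvalue before the 2x2 system is inverted; a minimal sketch of that check using the closed-form eigenvalue of a symmetric 2x2 matrix (the sample values, the 9x9 window and the threshold are placeholders, not taken from this diff):
#include <cmath>
#include <cstdio>
// Smaller eigenvalue of the symmetric 2x2 Gram matrix [[A11, A12], [A12, A22]].
static float minEigenValue(float A11, float A12, float A22)
{
    return (A22 + A11 - std::sqrt((A11 - A22) * (A11 - A22) + 4.f * A12 * A12)) / 2.f;
}
int main()
{
    float A11 = 4.f, A12 = 1.f, A22 = 2.f;      // placeholder gradient sums
    float winArea = 9.f * 9.f;                  // e.g. a 9x9 support window
    float minEig = minEigenValue(A11, A12, A22) / winArea;
    std::printf("normalized min eigenvalue: %f\n", minEig);
    // a tracker would typically reject the point if minEig falls below a minimum-eigenvalue threshold
    return 0;
}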
static void getLocalPatch(
const cv::Mat & src,
const cv::Point2i & prevPoint, // feature points
cv::Mat & winPointMask,
......
......@@ -469,6 +469,8 @@ void calcLocalOpticalFlowCore(
if (isrobust(param) == false)
{
if (param.useIlluminationModel)
{
if (param.solverType == SolverType::ST_STANDART)
{
cv::parallel_for_(cv::Range(0, npoints),
plk::radial::TrackerInvoker(
......@@ -482,6 +484,20 @@ void calcLocalOpticalFlowCore(
param.crossSegmentationThreshold));
}
else
{
cv::parallel_for_(cv::Range(0, npoints),
beplk::radial::TrackerInvoker(
prevImage, derivI, currImage, tRGBPrevPyr, tRGBNextPyr,
prevPts, nextPts, &status[0], &err[0], &gainPts[0],
level, maxLevel, winSizes,
param.maxIteration,
param.useInitialFlow,
param.supportRegionType,
param.crossSegmentationThreshold,
param.minEigenValue));
}
}
else
{
if (param.solverType == SolverType::ST_STANDART)
{
......
......@@ -6,6 +6,7 @@
#include "rlof/geo_interpolation.hpp"
#include "opencv2/ximgproc.hpp"
namespace cv {
namespace optflow {
......@@ -14,6 +15,19 @@ Ptr<RLOFOpticalFlowParameter> RLOFOpticalFlowParameter::create()
return Ptr<RLOFOpticalFlowParameter>(new RLOFOpticalFlowParameter);
}
void RLOFOpticalFlowParameter::setUseMEstimator(bool val)
{
if (val)
{
normSigma0 = 3.2f;
normSigma1 = 7.f;
}
else
{
normSigma0 = std::numeric_limits<float>::max();
normSigma1 = std::numeric_limits<float>::max();
}
}
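A brief usage sketch of the new switch, assuming the opencv2/optflow/rlofflow.hpp header location; only create(), setUseMEstimator() and setRLOFOpticalFlowParameter() are taken from this change, the rest of the setup is illustrative:
#include <opencv2/optflow/rlofflow.hpp>   // assumed header location for the RLOF classes
using namespace cv;
using namespace cv::optflow;
int main()
{
    Ptr<RLOFOpticalFlowParameter> param = RLOFOpticalFlowParameter::create();
    param->setUseMEstimator(true);    // robust mode: normSigma0 / normSigma1 = 3.2 / 7.0
    // param->setUseMEstimator(false); // fast mode: both sigmas = FLT_MAX -> least-square estimator
    Ptr<DenseRLOFOpticalFlow> algo = DenseRLOFOpticalFlow::create();
    algo->setRLOFOpticalFlowParameter(param);
    // algo->calc(prevFrame, currFrame, flow); // two equally sized 8-bit RGB frames
    return 0;
}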
void RLOFOpticalFlowParameter::setSolverType(SolverType val){ solverType = val;}
SolverType RLOFOpticalFlowParameter::getSolverType() const { return solverType;}
......@@ -198,7 +212,7 @@ public:
gd->setLambda(lambda);
gd->setFGSLambda(fgs_lambda);
gd->setFGSSigma(fgs_sigma);
gd->setUsePostProcessing(false);
gd->setUsePostProcessing(use_post_proc);
gd->interpolate(prevImage, filtered_prevPoints, currImage, filtered_currPoints, dense_flow);
}
else if (interp_type == InterpolationType::INTERP_RIC)
......@@ -209,7 +223,7 @@ public:
gd->setFGSSigma(fgs_sigma);
gd->setSuperpixelSize(sp_size);
gd->setSuperpixelMode(slic_type);
gd->setUseGlobalSmootherFilter(false);
gd->setUseGlobalSmootherFilter(use_post_proc);
gd->setUseVariationalRefinement(false);
gd->interpolate(prevImage, filtered_prevPoints, currImage, filtered_currPoints, dense_flow);
}
......@@ -225,6 +239,10 @@ public:
cv::bilateralFilter(vecMats[0], vecMats2[0], 5, 2, 20);
cv::bilateralFilter(vecMats[1], vecMats2[1], 5, 2, 20);
cv::merge(vecMats2, dense_flow);
if (use_post_proc)
{
ximgproc::fastGlobalSmootherFilter(prevImage, flow, flow, fgs_lambda, fgs_sigma);
}
}
if (use_variational_refinement)
{
......@@ -235,10 +253,6 @@ public:
variationalrefine->setOmega(1.9f);
variationalrefine->calc(prevGrey, currGrey, flow);
}
if (use_post_proc)
{
ximgproc::fastGlobalSmootherFilter(prevImage, flow, flow, fgs_lambda, fgs_sigma);
}
}
virtual void collectGarbage() CV_OVERRIDE
......
......@@ -196,6 +196,7 @@ TEST(SparseOpticalFlow, ReferenceAccuracy)
param->supportRegionType = SR_CROSS;
param->useIlluminationModel = true;
param->solverType = ST_BILINEAR;
param->setUseMEstimator(true);
algo->setRLOFOpticalFlowParameter(param);
algo->calc(frame1, frame2, prevPts, currPts, status, err);
EXPECT_LE(calcRMSE(prevPts, currPts, GT), 0.3f);
......@@ -216,8 +217,7 @@ TEST(SparseOpticalFlow, ReferenceAccuracy)
algo->calc(frame1, frame2, prevPts, currPts, status, err);
EXPECT_LE(calcRMSE(prevPts, currPts, GT), 0.27f);
param->normSigma0 = numeric_limits<float>::max();
param->normSigma1 = numeric_limits<float>::max();
param->setUseMEstimator(false);
param->useIlluminationModel = true;
param->solverType = ST_BILINEAR;
......@@ -250,6 +250,7 @@ TEST(DenseOpticalFlow_RLOF, ReferenceAccuracy)
Mat flow;
Ptr<DenseRLOFOpticalFlow> algo = DenseRLOFOpticalFlow::create();
Ptr<RLOFOpticalFlowParameter> param = Ptr<RLOFOpticalFlowParameter>(new RLOFOpticalFlowParameter);
param->setUseMEstimator(true);
param->supportRegionType = SR_CROSS;
param->solverType = ST_BILINEAR;
algo->setRLOFOpticalFlowParameter(param);
......