Commit 92795ba4 authored by Ilya Lavrenov's avatar Ilya Lavrenov

parallel version of remap, resize, warpAffine, warpPerspective. Some optimization for 2x decimation in the resize algorithm
parent f2a02fef
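Every function touched by this commit follows the same pattern: the per-row loop body moves into a cv::ParallelLoopBody subclass and is dispatched through cv::parallel_for_, which splits the row range across the available threads. A minimal sketch of that pattern (illustrative only, not part of the commit; assumes the OpenCV 2.4-era core header):

    #include <opencv2/core/core.hpp>

    class RowCopyInvoker : public cv::ParallelLoopBody
    {
    public:
        RowCopyInvoker(const cv::Mat& src, cv::Mat& dst) : src_(src), dst_(dst) { }

        // called by parallel_for_ on disjoint sub-ranges of rows,
        // possibly from several threads concurrently
        virtual void operator() (const cv::Range& range) const
        {
            for( int y = range.start; y < range.end; y++ )
                src_.row(y).copyTo(dst_.row(y));
        }

    private:
        const cv::Mat src_;   // Mat headers are copied; pixel data is shared
        cv::Mat dst_;
    };

    void copyRowsParallel(const cv::Mat& src, cv::Mat& dst)
    {
        dst.create(src.size(), src.type());
        RowCopyInvoker invoker(src, dst);
        cv::parallel_for_(cv::Range(0, src.rows), invoker);
    }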
#include "perf_precomp.hpp"
using namespace std;
using namespace cv;
using namespace perf;
using namespace testing;
using std::tr1::make_tuple;
using std::tr1::get;
CV_ENUM(MatrixType, CV_16UC1, CV_16SC1, CV_32FC1)
CV_ENUM(MapType, CV_16SC2, CV_32FC1, CV_32FC2)
CV_ENUM(InterType, INTER_LINEAR, INTER_CUBIC, INTER_LANCZOS4, INTER_NEAREST)
typedef TestBaseWithParam< tr1::tuple<Size, MatrixType, MapType, InterType> > TestRemap;
PERF_TEST_P( TestRemap, Remap,
Combine(
Values( szVGA, sz1080p ),
ValuesIn( MatrixType::all() ),
ValuesIn( MapType::all() ),
ValuesIn( InterType::all() )
)
)
{
Size sz;
int src_type, map1_type, inter_type;
sz = get<0>(GetParam());
src_type = get<1>(GetParam());
map1_type = get<2>(GetParam());
inter_type = get<3>(GetParam());
Mat src(sz, src_type);
Mat map1(sz, map1_type);
Mat dst(sz, src_type);
Mat map2(map1_type == CV_32FC1 ? sz : Size(), CV_32FC1);
RNG rng;
rng.fill(src, RNG::UNIFORM, 0, 256);
for (int j = 0; j < map1.rows; ++j)
for (int i = 0; i < map1.cols; ++i)
switch (map1_type)
{
case CV_32FC1:
map1.at<float>(j, i) = src.cols - i;
map2.at<float>(j, i) = j;
break;
case CV_32FC2:
map1.at<Vec2f>(j, i)[0] = src.cols - i;
map1.at<Vec2f>(j, i)[1] = j;
break;
case CV_16SC2:
map1.at<Vec2s>(j, i)[0] = src.cols - i;
map1.at<Vec2s>(j, i)[1] = j;
break;
default:
CV_Assert(0);
}
declare.in(src, WARMUP_RNG).out(dst).time(20);
TEST_CYCLE() remap(src, dst, map1, map2, inter_type);
SANITY_CHECK(dst);
}
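Side note: the CV_16SC2 map type exercised above is the packed fixed-point format that cv::convertMaps produces from float maps; a hedged sketch of generating it that way instead of by hand (mapx/mapy are assumed CV_32FC1 maps like the ones built above):

    Mat map1_fixed, map2_fixed;
    // map1_fixed holds integer coordinates (CV_16SC2),
    // map2_fixed the interpolation-table indices (CV_16UC1)
    convertMaps(mapx, mapy, map1_fixed, map2_fixed, CV_16SC2);
    remap(src, dst, map1_fixed, map2_fixed, INTER_LINEAR);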
@@ -59,11 +59,11 @@ PERF_TEST_P(MatInfo_Size_Size, resizeDownLinear,
typedef tr1::tuple<MatType, Size, int> MatInfo_Size_Scale_t;
typedef TestBaseWithParam<MatInfo_Size_Scale_t> MatInfo_Size_Scale;
PERF_TEST_P(MatInfo_Size_Scale, resizeAreaFast,
PERF_TEST_P(MatInfo_Size_Scale, ResizeAreaFast,
testing::Combine(
testing::Values(CV_8UC1, CV_8UC4),
testing::Values(szVGA, szqHD, sz720p, sz1080p),
testing::Values(2, 4)
testing::Values(2)
)
)
{
@@ -84,3 +84,31 @@ PERF_TEST_P(MatInfo_Size_Scale, resizeAreaFast,
//difference equal to 1 is allowed because of different possible rounding modes: round-to-nearest vs bankers' rounding
SANITY_CHECK(dst, 1);
}
typedef TestBaseWithParam<tr1::tuple<MatType, Size, double> > MatInfo_Size_Scale_Area;
PERF_TEST_P(MatInfo_Size_Scale_Area, ResizeArea,
testing::Combine(
testing::Values(CV_8UC1, CV_8UC4),
testing::Values(szVGA, szqHD, sz720p, sz1080p),
testing::Values(2.4, 3.4, 1.3)
)
)
{
int matType = get<0>(GetParam());
Size from = get<1>(GetParam());
double scale = get<2>(GetParam());
cv::Mat src(from, matType);
Size to(cvRound(from.width * scale), cvRound(from.height * scale));
cv::Mat dst(to, matType);
declare.in(src, WARMUP_RNG).out(dst);
TEST_CYCLE() resize(src, dst, dst.size(), 0, 0, INTER_AREA);
//difference equal to 1 is allowed because of different possible rounding modes: round-to-nearest vs bankers' rounding
SANITY_CHECK(dst, 1);
}
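For reference, which INTER_AREA implementation runs depends on the scale: exact integer decimation factors dispatch to the fast box-filter path (resizeAreaFast_), fractional factors to the generic weighted path (resizeArea_). Illustrative calls:

    Mat big(720, 1280, CV_8UC1), shrunk;
    // integer 2x decimation: fast path, now with the 2x2 specialization below
    resize(big, shrunk, Size(), 0.5, 0.5, INTER_AREA);
    // fractional decimation: generic weighted-area path
    resize(big, shrunk, Size(), 1.0/2.4, 1.0/2.4, INTER_AREA);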
@@ -240,24 +240,22 @@ template<typename ST, typename DT, int bits> struct FixedPtCast
* Resize *
\****************************************************************************************/
static void
resizeNN( const Mat& src, Mat& dst, double fx, double fy )
class resizeNNInvoker :
public ParallelLoopBody
{
Size ssize = src.size(), dsize = dst.size();
AutoBuffer<int> _x_ofs(dsize.width);
int* x_ofs = _x_ofs;
int pix_size = (int)src.elemSize();
int pix_size4 = (int)(pix_size / sizeof(int));
double ifx = 1./fx, ify = 1./fy;
int x, y;
for( x = 0; x < dsize.width; x++ )
public:
resizeNNInvoker(const Mat& _src, Mat &_dst, int *_x_ofs, int _pix_size4, double _ify) :
ParallelLoopBody(), src(_src), dst(_dst), x_ofs(_x_ofs), pix_size4(_pix_size4),
ify(_ify)
{
int sx = cvFloor(x*ifx);
x_ofs[x] = std::min(sx, ssize.width-1)*pix_size;
}
for( y = 0; y < dsize.height; y++ )
virtual void operator() (const Range& range) const
{
Size ssize = src.size(), dsize = dst.size();
int y, x, pix_size = (int)src.elemSize();
for( y = range.start; y < range.end; y++ )
{
uchar* D = dst.data + dst.step*y;
int sy = std::min(cvFloor(y*ify), ssize.height-1);
@@ -326,6 +324,35 @@ resizeNN( const Mat& src, Mat& dst, double fx, double fy )
}
}
}
}
private:
const Mat src;
Mat dst;
int* x_ofs, pix_size4;
double ify;
};
static void
resizeNN( const Mat& src, Mat& dst, double fx, double fy )
{
Size ssize = src.size(), dsize = dst.size();
AutoBuffer<int> _x_ofs(dsize.width);
int* x_ofs = _x_ofs;
int pix_size = (int)src.elemSize();
int pix_size4 = (int)(pix_size / sizeof(int));
double ifx = 1./fx, ify = 1./fy;
int x;
for( x = 0; x < dsize.width; x++ )
{
int sx = cvFloor(x*ifx);
x_ofs[x] = std::min(sx, ssize.width-1)*pix_size;
}
Range range(0, dsize.height);
resizeNNInvoker invoker(src, dst, x_ofs, pix_size4, ify);
parallel_for_(range, invoker);
}
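parallel_for_ splits Range(0, dsize.height) across worker threads; the x_ofs table is computed once up front and shared read-only by all of them. For debugging, the worker count can be pinned (illustrative):

    int saved = cv::getNumThreads();
    cv::setNumThreads(1);            // run the invoker on the calling thread only
    // ... call resize()/remap()/warpAffine() here ...
    cv::setNumThreads(saved);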
@@ -1092,33 +1119,35 @@ static inline int clip(int x, int a, int b)
static const int MAX_ESIZE=16;
template<class HResize, class VResize>
static void resizeGeneric_( const Mat& src, Mat& dst,
const int* xofs, const void* _alpha,
const int* yofs, const void* _beta,
int xmin, int xmax, int ksize )
template <typename HResize, typename VResize>
class resizeGeneric_Invoker :
public ParallelLoopBody
{
public:
typedef typename HResize::value_type T;
typedef typename HResize::buf_type WT;
typedef typename HResize::alpha_type AT;
const AT* alpha = (const AT*)_alpha;
const AT* beta = (const AT*)_beta;
Size ssize = src.size(), dsize = dst.size();
int cn = src.channels();
ssize.width *= cn;
dsize.width *= cn;
resizeGeneric_Invoker(const Mat& _src, Mat &_dst, const int *_xofs, const int *_yofs,
const AT* _alpha, const AT* __beta, const Size& _ssize, const Size &_dsize,
int _ksize, int _xmin, int _xmax) :
ParallelLoopBody(), src(_src), dst(_dst), xofs(_xofs), yofs(_yofs),
alpha(_alpha), _beta(__beta), ssize(_ssize), dsize(_dsize),
ksize(_ksize), xmin(_xmin), xmax(_xmax)
{
}
virtual void operator() (const Range& range) const
{
int dy, cn = src.channels();
HResize hresize;
VResize vresize;
int bufstep = (int)alignSize(dsize.width, 16);
AutoBuffer<WT> _buffer(bufstep*ksize);
const T* srows[MAX_ESIZE]={0};
WT* rows[MAX_ESIZE]={0};
int prev_sy[MAX_ESIZE];
int dy;
xmin *= cn;
xmax *= cn;
HResize hresize;
VResize vresize;
for(int k = 0; k < ksize; k++ )
{
@@ -1126,8 +1155,9 @@ static void resizeGeneric_( const Mat& src, Mat& dst,
rows[k] = (WT*)_buffer + bufstep*k;
}
// image resize is a separable operation: each destination row is produced
// by filtering ksize source rows horizontally, then blending them vertically
for( dy = 0; dy < dsize.height; dy++, beta += ksize )
const AT* beta = _beta + ksize * range.start;
for( dy = range.start; dy < range.end; dy++, beta += ksize )
{
int sy0 = yofs[dy], k0=ksize, k1=0, ksize2 = ksize/2;
@@ -1145,35 +1175,144 @@ static void resizeGeneric_( const Mat& src, Mat& dst,
}
if( k1 == ksize )
k0 = std::min(k0, k); // remember the first row that needs to be computed
srows[k] = (const T*)(src.data + src.step*sy);
srows[k] = (T*)(src.data + src.step*sy);
prev_sy[k] = sy;
}
if( k0 < ksize )
hresize( srows + k0, rows + k0, ksize - k0, xofs, alpha,
hresize( (const T**)(srows + k0), (WT**)(rows + k0), ksize - k0, xofs, (const AT*)(alpha),
ssize.width, dsize.width, cn, xmin, xmax );
vresize( (const WT**)rows, (T*)(dst.data + dst.step*dy), beta, dsize.width );
}
}
private:
const Mat src;
Mat dst;
const int* xofs, *yofs;
const AT* alpha, *_beta;
const Size ssize, dsize;
const int ksize, xmin, xmax;
};
template<class HResize, class VResize>
static void resizeGeneric_( const Mat& src, Mat& dst,
const int* xofs, const void* _alpha,
const int* yofs, const void* _beta,
int xmin, int xmax, int ksize )
{
typedef typename HResize::value_type T;
typedef typename HResize::buf_type WT;
typedef typename HResize::alpha_type AT;
const AT* beta = (const AT*)_beta;
Size ssize = src.size(), dsize = dst.size();
int cn = src.channels();
ssize.width *= cn;
dsize.width *= cn;
xmin *= cn;
xmax *= cn;
// image resize is a separable operation: each destination row is produced
// by filtering ksize source rows horizontally, then blending them vertically
Range range(0, dsize.height);
resizeGeneric_Invoker<HResize, VResize> invoker(src, dst, xofs, yofs, (const AT*)_alpha, beta,
ssize, dsize, ksize, xmin, xmax);
parallel_for_(range, invoker);
}
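Written out in this function's terms (notation assumed, not from the source), the separable resize computes

    dst(x, dy) = \sum_{j=0}^{ksize-1} \beta_{dy,j} \Big( \sum_i \alpha_{x,i} \, src(x_i, y_j) \Big)

where the \alpha are the horizontal weights (alpha/xofs), the \beta the vertical ones (_beta/yofs), and the inner sums are cached in rows[] so source rows shared by consecutive dy are filtered only once; prev_sy[] tracks which rows are already cached.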
template <typename T, typename WT>
struct ResizeAreaFastNoVec
{
ResizeAreaFastNoVec(int /*_scale_x*/, int /*_scale_y*/,
int /*_cn*/, int /*_step*//*, const int**/ /*_ofs*/) { }
int operator() (const T* /*S*/, T* /*D*/, int /*w*/) const { return 0; }
};
template<typename T, typename WT>
static void resizeAreaFast_( const Mat& src, Mat& dst, const int* ofs, const int* xofs,
int scale_x, int scale_y )
template <typename T, typename WT>
struct ResizeAreaFast_2x2_8u
{
ResizeAreaFast_2x2_8u(int _scale_x, int _scale_y, int _cn, int _step/*, const int* _ofs*/) :
scale_x(_scale_x), scale_y(_scale_y), cn(_cn), step(_step)/*, ofs(_ofs)*/
{
fast_mode = scale_x == 2 && scale_y == 2 && (cn == 1 || cn == 3 || cn == 4);
}
int operator() (const T* S, T* D, int w) const
{
if( !fast_mode )
return 0;
const T* nextS = S + step;
int dx = 0;
if (cn == 1)
for( ; dx < w; ++dx )
{
int index = dx*2;
D[dx] = (S[index] + S[index+1] + nextS[index] + nextS[index+1] + 2) >> 2;
}
else if (cn == 3)
for( ; dx < w; dx += 3 )
{
int index = dx*2;
D[dx] = (S[index] + S[index+3] + nextS[index] + nextS[index+3] + 2) >> 2;
D[dx+1] = (S[index+1] + S[index+4] + nextS[index+1] + nextS[index+4] + 2) >> 2;
D[dx+2] = (S[index+2] + S[index+5] + nextS[index+2] + nextS[index+5] + 2) >> 2;
}
else
{
assert(cn == 4);
for( ; dx < w; dx += 4 )
{
int index = dx*2;
// for cn == 4 the horizontally adjacent pixel starts 4 channels away
D[dx] = (S[index] + S[index+4] + nextS[index] + nextS[index+4] + 2) >> 2;
D[dx+1] = (S[index+1] + S[index+5] + nextS[index+1] + nextS[index+5] + 2) >> 2;
D[dx+2] = (S[index+2] + S[index+6] + nextS[index+2] + nextS[index+6] + 2) >> 2;
D[dx+3] = (S[index+3] + S[index+7] + nextS[index+3] + nextS[index+7] + 2) >> 2;
}
}
return dx;
}
private:
const int scale_x, scale_y;
const int cn;
bool fast_mode;
const int step;
};
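The +2 added before the >>2 shift rounds the 2x2 box average to nearest instead of truncating. A small arithmetic check (illustrative):

    // pixels 10, 11, 12, 14: true mean is 11.75
    int truncated = (10 + 11 + 12 + 14) >> 2;     // 47 >> 2 == 11 (floor)
    int rounded = (10 + 11 + 12 + 14 + 2) >> 2;   // 49 >> 2 == 12 (nearest)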
template <typename T, typename WT, typename VecOp>
class resizeAreaFast_Invoker :
public ParallelLoopBody
{
public:
resizeAreaFast_Invoker(const Mat &_src, Mat &_dst,
int _scale_x, int _scale_y, const int* _ofs, const int* _xofs) :
ParallelLoopBody(), src(_src), dst(_dst), scale_x(_scale_x),
scale_y(_scale_y), ofs(_ofs), xofs(_xofs)
{
}
virtual void operator() (const Range& range) const
{
Size ssize = src.size(), dsize = dst.size();
int cn = src.channels();
int dy, dx, k = 0;
int area = scale_x*scale_y;
float scale = 1.f/(scale_x*scale_y);
float scale = 1.f/(area);
int dwidth1 = (ssize.width/scale_x)*cn;
dsize.width *= cn;
ssize.width *= cn;
int dy, dx, k = 0;
for( dy = 0; dy < dsize.height; dy++ )
VecOp vop(scale_x, scale_y, src.channels(), src.step/*, area_ofs*/);
for( dy = range.start; dy < range.end; dy++ )
{
T* D = (T*)(dst.data + dst.step*dy);
int sy0 = dy*scale_y, w = sy0 + scale_y <= ssize.height ? dwidth1 : 0;
int sy0 = dy*scale_y;
int w = sy0 + scale_y <= ssize.height ? dwidth1 : 0;
if( sy0 >= ssize.height )
{
for( dx = 0; dx < dsize.width; dx++ )
@@ -1181,11 +1320,12 @@ static void resizeAreaFast_( const Mat& src, Mat& dst, const int* ofs, const int
continue;
}
for( dx = 0; dx < w; dx++ )
dx = vop((const T*)(src.data + src.step * sy0), D, w);
for( ; dx < w; dx++ )
{
const T* S = (const T*)(src.data + src.step*sy0) + xofs[dx];
const T* S = (const T*)(src.data + src.step * sy0) + xofs[dx];
WT sum = 0;
k=0;
k = 0;
#if CV_ENABLE_UNROLLED
for( ; k <= area - 4; k += 4 )
sum += S[ofs[k]] + S[ofs[k+1]] + S[ofs[k+2]] + S[ofs[k+3]];
@@ -1193,7 +1333,7 @@ static void resizeAreaFast_( const Mat& src, Mat& dst, const int* ofs, const int
for( ; k < area; k++ )
sum += S[ofs[k]];
D[dx] = saturate_cast<T>(sum*scale);
D[dx] = saturate_cast<T>(sum * scale);
}
for( ; dx < dsize.width; dx++ )
@@ -1217,9 +1357,26 @@ static void resizeAreaFast_( const Mat& src, Mat& dst, const int* ofs, const int
}
}
D[dx] = saturate_cast<T>((float)sum/count);
}
}
}
private:
const Mat src;
Mat dst;
const int scale_x, scale_y;
const int *ofs, *xofs;
};
template<typename T, typename WT, typename VecOp>
static void resizeAreaFast_( const Mat& src, Mat& dst, const int* ofs, const int* xofs,
int scale_x, int scale_y )
{
Range range(0, dst.rows);
resizeAreaFast_Invoker<T, WT, VecOp> invoker(src, dst, scale_x,
scale_y, ofs, xofs);
parallel_for_(range, invoker);
}
struct DecimateAlpha
@@ -1228,24 +1385,46 @@ struct DecimateAlpha
float alpha;
};
template<typename T, typename WT>
static void resizeArea_( const Mat& src, Mat& dst, const DecimateAlpha* xofs, int xofs_count, double scale_y_)
template <typename T, typename WT>
class resizeArea_Invoker :
public ParallelLoopBody
{
public:
resizeArea_Invoker(const Mat& _src, Mat& _dst, const DecimateAlpha* _xofs,
int _xofs_count, double _scale_y_
#ifdef HAVE_TBB
, const int* _yofs, const int* _cur_dy_ofs
#endif
) :
ParallelLoopBody(), src(_src), dst(_dst), xofs(_xofs),
xofs_count(_xofs_count), scale_y_(_scale_y_)
#ifdef HAVE_TBB
, yofs(_yofs), cur_dy_ofs(_cur_dy_ofs)
#endif
{
}
virtual void operator() (const Range& range) const
{
Size ssize = src.size(), dsize = dst.size();
int cn = src.channels();
dsize.width *= cn;
AutoBuffer<WT> _buffer(dsize.width*2);
WT *buf = _buffer, *sum = buf + dsize.width;
int k, sy, dx, cur_dy = 0;
int k, sy, dx, cur_dy = 0, num = sizeof(WT) * dsize.width;
WT scale_y = (WT)scale_y_;
CV_Assert( cn <= 4 );
for( dx = 0; dx < dsize.width; dx++ )
buf[dx] = sum[dx] = 0;
memset(buf, 0, num * 2);
for( sy = 0; sy < ssize.height; sy++ )
#ifdef HAVE_TBB
sy = yofs[range.start];
cur_dy = cur_dy_ofs[sy];
for( ; sy < range.start; sy++ )
{
const T* S = (const T*)(src.data + src.step*sy);
const T* S = (const T*)(src.data + src.step * sy);
memset(buf, 0, num);
if( cn == 1 )
for( k = 0; k < xofs_count; k++ )
{
@@ -1269,9 +1448,11 @@ static void resizeArea_( const Mat& src, Mat& dst, const DecimateAlpha* xofs, in
int sxn = xofs[k].si;
int dxn = xofs[k].di;
WT alpha = xofs[k].alpha;
WT t0 = buf[dxn] + S[sxn]*alpha;
WT t1 = buf[dxn+1] + S[sxn+1]*alpha;
WT t2 = buf[dxn+2] + S[sxn+2]*alpha;
buf[dxn] = t0; buf[dxn+1] = t1; buf[dxn+2] = t2;
}
else
@@ -1280,35 +1461,30 @@ static void resizeArea_( const Mat& src, Mat& dst, const DecimateAlpha* xofs, in
int sxn = xofs[k].si;
int dxn = xofs[k].di;
WT alpha = xofs[k].alpha;
WT t0 = buf[dxn] + S[sxn]*alpha;
WT t1 = buf[dxn+1] + S[sxn+1]*alpha;
buf[dxn] = t0; buf[dxn+1] = t1;
t0 = buf[dxn+2] + S[sxn+2]*alpha;
t1 = buf[dxn+3] + S[sxn+3]*alpha;
buf[dxn+2] = t0; buf[dxn+3] = t1;
}
if( (cur_dy + 1)*scale_y <= sy + 1 || sy == ssize.height - 1 )
{
WT beta = std::max(sy + 1 - (cur_dy+1)*scale_y, (WT)0);
WT beta1 = 1 - beta;
T* D = (T*)(dst.data + dst.step*cur_dy);
WT beta = std::max(sy + 1 - (cur_dy + 1) * scale_y, (WT)0);
if( fabs(beta) < 1e-3 )
{
if(cur_dy >= dsize.height) return;
for( dx = 0; dx < dsize.width; dx++ )
{
D[dx] = saturate_cast<T>((sum[dx] + buf[dx]) / min(scale_y, src.rows - cur_dy * scale_y));
sum[dx] = buf[dx] = 0;
}
if(cur_dy >= dsize.height)
break;
memset(sum, 0, num);
}
else
for( dx = 0; dx < dsize.width; dx++ )
{
D[dx] = saturate_cast<T>((sum[dx] + buf[dx]* beta1)/ min(scale_y, src.rows - cur_dy*scale_y));
sum[dx] = buf[dx]*beta;
buf[dx] = 0;
}
sum[dx] = buf[dx] * beta;
cur_dy++;
}
else
@@ -1318,132 +1494,151 @@ static void resizeArea_( const Mat& src, Mat& dst, const DecimateAlpha* xofs, in
WT t0 = sum[dx] + buf[dx];
WT t1 = sum[dx+1] + buf[dx+1];
sum[dx] = t0; sum[dx+1] = t1;
buf[dx] = buf[dx+1] = 0;
}
for( ; dx < dsize.width; dx++ )
{
sum[dx] += buf[dx];
buf[dx] = 0;
}
}
}
}
static void resizeAreaFast_8u( const Mat& src, Mat& dst,
const int* ofs, const int* xofs,
int scale_x, int scale_y )
{
#if CV_SSE2
bool haveSSE2 = checkHardwareSupport(CV_CPU_SSE2);
#endif
Size ssize = src.size(), dsize = dst.size();
int cn = src.channels();
int dy, dx, k = 0;
int area = scale_x*scale_y;
float scale = 1.f/(scale_x*scale_y);
int dwidth1 = (ssize.width/scale_x)*cn;
dsize.width *= cn;
ssize.width *= cn;
//avg values
for( dy = 0; dy < dsize.height; dy++ )
{
uchar* D = (uchar*)(dst.data + dst.step*dy);
int sy0 = dy*scale_y, w = sy0 + scale_y <= ssize.height ? dwidth1 : 0;
if( sy0 >= ssize.height )
{
for( dx = 0; dx < dsize.width; dx++ ) //memset(D,0, dsize.width);//warning, never executed -> not tested
D[dx] = 0;
continue;
}
dx = 0;
#if CV_SSE2
if( haveSSE2 )
for( sy = range.start; sy < range.end; sy++ )
{
const __m128 _scale = _mm_set1_ps(scale);
const __m128i _ucMAXs = _mm_set1_epi16(UCHAR_MAX);
const uchar* _S[8];
const T* S = (const T*)(src.data + src.step * sy);
memset(buf, 0, num);
for(; dx < w-8; dx+=8 )
if( cn == 1 )
for( k = 0; k < xofs_count; k++ )
{
__m128i _sum = _mm_setzero_si128();
__m128i _sum1 = _mm_setzero_si128();
_S[0] = (const uchar*)(src.data + src.step*sy0) + xofs[dx];
_S[1] = (const uchar*)(src.data + src.step*sy0) + xofs[dx+1];
_S[2] = (const uchar*)(src.data + src.step*sy0) + xofs[dx+2];
_S[3] = (const uchar*)(src.data + src.step*sy0) + xofs[dx+3];
_S[4] = (const uchar*)(src.data + src.step*sy0) + xofs[dx+4];
_S[5] = (const uchar*)(src.data + src.step*sy0) + xofs[dx+5];
_S[6] = (const uchar*)(src.data + src.step*sy0) + xofs[dx+6];
_S[7] = (const uchar*)(src.data + src.step*sy0) + xofs[dx+7];
for( k = 0; k < area; k++ )
int dxn = xofs[k].di;
WT alpha = xofs[k].alpha;
buf[dxn] += S[xofs[k].si]*alpha;
}
else if( cn == 2 )
for( k = 0; k < xofs_count; k++ )
{
int ofsk = ofs[k];
__m128i _temp = _mm_set_epi32(_S[3][ofsk],_S[2][ofsk],_S[1][ofsk],_S[0][ofsk]);
_sum = _mm_add_epi32(_sum, _temp);
__m128i _temp1 = _mm_set_epi32(_S[7][ofsk],_S[6][ofsk],_S[5][ofsk],_S[4][ofsk]);
_sum1 = _mm_add_epi32(_sum1, _temp1);
int sxn = xofs[k].si;
int dxn = xofs[k].di;
WT alpha = xofs[k].alpha;
WT t0 = buf[dxn] + S[sxn]*alpha;
WT t1 = buf[dxn+1] + S[sxn+1]*alpha;
buf[dxn] = t0; buf[dxn+1] = t1;
}
else if( cn == 3 )
for( k = 0; k < xofs_count; k++ )
{
int sxn = xofs[k].si;
int dxn = xofs[k].di;
WT alpha = xofs[k].alpha;
__m128i _tempSum = _mm_cvtps_epi32(_mm_mul_ps(_mm_cvtepi32_ps(_sum), _scale));
__m128i _tempSum1 = _mm_cvtps_epi32(_mm_mul_ps(_mm_cvtepi32_ps(_sum1), _scale));
WT t0 = buf[dxn] + S[sxn]*alpha;
WT t1 = buf[dxn+1] + S[sxn+1]*alpha;
WT t2 = buf[dxn+2] + S[sxn+2]*alpha;
_tempSum = _mm_packs_epi32(_tempSum, _tempSum1);
_tempSum = _mm_min_epi16(_ucMAXs, _tempSum);
_tempSum = _mm_packus_epi16(_tempSum, _tempSum);
_mm_storel_epi64((__m128i*)(D+dx),_tempSum);
}
buf[dxn] = t0; buf[dxn+1] = t1; buf[dxn+2] = t2;
}
#endif
for(; dx < w; dx++ )
else
for( k = 0; k < xofs_count; k++ )
{
const uchar* S = (const uchar*)(src.data + src.step*sy0) + xofs[dx];
int sum = 0;
k=0;
int sxn = xofs[k].si;
int dxn = xofs[k].di;
WT alpha = xofs[k].alpha;
#if CV_ENABLE_UNROLLED
for( ; k <= area - 4; k += 4 )
sum += S[ofs[k]] + S[ofs[k+1]] + S[ofs[k+2]] + S[ofs[k+3]];
#endif
WT t0 = buf[dxn] + S[sxn]*alpha;
WT t1 = buf[dxn+1] + S[sxn+1]*alpha;
for( ; k < area; k++ )
sum += S[ofs[k]];
buf[dxn] = t0; buf[dxn+1] = t1;
t0 = buf[dxn+2] + S[sxn+2]*alpha;
t1 = buf[dxn+3] + S[sxn+3]*alpha;
D[dx] = saturate_cast<uchar>(sum*scale);
buf[dxn+2] = t0; buf[dxn+3] = t1;
}
for( ; dx < dsize.width; dx++ )
if( (cur_dy + 1)*scale_y <= sy + 1 || sy == ssize.height - 1 )
{
int sum = 0;
int count = 0, sx0 = xofs[dx];
if( sx0 >= ssize.width )
D[dx] = 0;
for( int sy = 0; sy < scale_y; sy++ )
WT beta = std::max(sy + 1 - (cur_dy + 1) * scale_y, (WT)0);
T* D = (T*)(dst.data + dst.step*cur_dy);
if( fabs(beta) < 1e-3 )
{
if( sy0 + sy >= ssize.height )
break;
const uchar* S = (const uchar*)(src.data + src.step*(sy0 + sy)) + sx0;
int sx = 0;
for( ; sx < scale_x*cn; sx += cn )
if(cur_dy >= dsize.height)
return;
for( dx = 0; dx < dsize.width; dx++ )
D[dx] = saturate_cast<T>((sum[dx] + buf[dx]) / min(scale_y, src.rows - cur_dy * scale_y));
memset(sum, 0, num);
}
else
{
if( sx0 + sx >= ssize.width )
break;
sum += S[sx];
count++;
WT beta1 = 1 - beta;
for( dx = 0; dx < dsize.width; dx++ )
{
D[dx] = saturate_cast<T>((sum[dx] + buf[dx] * beta1)/ min(scale_y, src.rows - cur_dy * scale_y));
sum[dx] = buf[dx] * beta;
}
}
D[dx] = saturate_cast<uchar>((float)sum/count);
cur_dy++;
}
else
{
for( dx = 0; dx <= dsize.width - 2; dx += 2 )
{
WT t0 = sum[dx] + buf[dx];
WT t1 = sum[dx+1] + buf[dx+1];
sum[dx] = t0; sum[dx+1] = t1;
}
for( ; dx < dsize.width; dx++ )
sum[dx] += buf[dx];
}
}
}
private:
const Mat src;
Mat dst;
const DecimateAlpha* xofs;
const int xofs_count;
const double scale_y_;
#ifdef HAVE_TBB
const int *yofs, *cur_dy_ofs;
#endif
};
template<typename T, typename WT>
static void resizeArea_( const Mat& src, Mat& dst, const DecimateAlpha* xofs, int xofs_count, double scale_y_)
{
#ifdef HAVE_TBB
Size ssize = src.size(), dsize = dst.size();
AutoBuffer<int> _yofs(2 * ssize.height);
int *yofs = _yofs, *cur_dy_ofs = _yofs + ssize.height;
int index = 0, cur_dy = 0, sy;
for( sy = 0; sy < ssize.height; sy++ )
{
bool reset = false;
cur_dy_ofs[sy] = cur_dy;
if( (cur_dy + 1)*scale_y_ <= sy + 1 || sy == ssize.height - 1 )
{
WT beta = std::max(sy + 1 - (cur_dy+1)*scale_y_, 0.);
if( fabs(beta) < 1e-3 )
{
if(cur_dy >= dsize.height)
break;
reset = true;
}
cur_dy++;
}
yofs[sy] = index;
if (reset)
index = sy + 1;
}
#endif
Range range(0, src.rows);
resizeArea_Invoker<T, WT> invoker(src, dst, xofs, xofs_count, scale_y_
#ifdef HAVE_TBB
, yofs, cur_dy_ofs
#endif
);
parallel_for_(range, invoker);
}
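The HAVE_TBB preamble exists because resizeArea_ carries state from row to row (cur_dy plus the buf/sum accumulators), so a worker starting mid-image must know where to resume: the wrapper replays the row loop serially once, recording for each source row the destination row in flight (cur_dy_ofs) and the first source row whose contribution is not yet flushed (yofs); each parallel range then re-accumulates from yofs[range.start]. A generic sketch of this seed-the-carried-state idea (illustrative, unrelated to the OpenCV API):

    #include <vector>

    // one cheap serial pass records the loop-carried value at every
    // possible split point; a range [a, b) can then start from seeds[a]
    std::vector<long long> prefixSeeds(const std::vector<int>& v)
    {
        std::vector<long long> seeds(v.size());
        long long run = 0;
        for( std::size_t i = 0; i < v.size(); i++ )
        {
            seeds[i] = run;   // state *before* element i is processed
            run += v[i];
        }
        return seeds;
    }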
@@ -1457,10 +1652,12 @@ typedef void (*ResizeAreaFastFunc)( const Mat& src, Mat& dst,
int scale_x, int scale_y );
typedef void (*ResizeAreaFunc)( const Mat& src, Mat& dst,
const DecimateAlpha* xofs, int xofs_count, double scale_y_);
const DecimateAlpha* xofs, int xofs_count,
double scale_y_);
}
//////////////////////////////////////////////////////////////////////////////////////////
void cv::resize( InputArray _src, OutputArray _dst, Size dsize,
@@ -1553,30 +1750,33 @@ void cv::resize( InputArray _src, OutputArray _dst, Size dsize,
static ResizeAreaFastFunc areafast_tab[] =
{
resizeAreaFast_8u, 0,
resizeAreaFast_<ushort, float>,
resizeAreaFast_<short, float>,
resizeAreaFast_<uchar, int, ResizeAreaFast_2x2_8u<uchar, int> >,
0,
resizeAreaFast_<ushort, float, ResizeAreaFastNoVec<ushort, float> >,
resizeAreaFast_<short, float, ResizeAreaFastNoVec<short, float> >,
0,
resizeAreaFast_<float, float>,
resizeAreaFast_<double, double>,
resizeAreaFast_<float, float, ResizeAreaFastNoVec<float, float> >,
resizeAreaFast_<double, double, ResizeAreaFastNoVec<double, double> >,
0
};
static ResizeAreaFunc area_tab[] =
{
resizeArea_<uchar, float>, 0, resizeArea_<ushort, float>, resizeArea_<short, float>,
0, resizeArea_<float, float>, resizeArea_<double, double>, 0
resizeArea_<uchar, float>, 0, resizeArea_<ushort, float>,
resizeArea_<short, float>, 0, resizeArea_<float, float>,
resizeArea_<double, double>, 0
};
Mat src = _src.getMat();
Size ssize = src.size();
CV_Assert( ssize.area() > 0 );
CV_Assert( !(dsize == Size()) || (inv_scale_x > 0 && inv_scale_y > 0) );
if( dsize == Size() )
CV_Assert( dsize.area() || (inv_scale_x > 0 && inv_scale_y > 0) );
if( !dsize.area() )
{
dsize = Size(saturate_cast<int>(src.cols*inv_scale_x),
saturate_cast<int>(src.rows*inv_scale_y));
CV_Assert( dsize.area() );
}
else
{
@@ -1602,15 +1802,24 @@ void cv::resize( InputArray _src, OutputArray _dst, Size dsize,
return;
}
// true "area" interpolation is only implemented for the case (scale_x <= 1 && scale_y <= 1).
// In other cases it is emulated using some variant of bilinear interpolation
if( interpolation == INTER_AREA && scale_x >= 1 && scale_y >= 1 )
{
int iscale_x = saturate_cast<int>(scale_x);
int iscale_y = saturate_cast<int>(scale_y);
if( std::abs(scale_x - iscale_x) < DBL_EPSILON &&
std::abs(scale_y - iscale_y) < DBL_EPSILON )
bool is_area_fast = std::abs(scale_x - iscale_x) < DBL_EPSILON &&
std::abs(scale_y - iscale_y) < DBL_EPSILON;
// when scale_x and scale_y are both equal to 2, the fast INTER_AREA
// box filter gives the same result as INTER_LINEAR, so take the faster path
if( interpolation == INTER_LINEAR && is_area_fast &&
iscale_x == 2 && iscale_y == 2 )
interpolation = INTER_AREA;
// true "area" interpolation is only implemented for the case (scale_x <= 1 && scale_y <= 1).
// In other cases it is emulated using some variant of bilinear interpolation
if( interpolation == INTER_AREA && scale_x >= 1 && scale_y >= 1 )
{
if( is_area_fast )
{
int area = iscale_x*iscale_y;
size_t srcstep = src.step / src.elemSize1();
@@ -1626,9 +1835,10 @@ void cv::resize( InputArray _src, OutputArray _dst, Size dsize,
for( dx = 0; dx < dsize.width; dx++ )
{
sx = dx*iscale_x*cn;
int j = dx * cn;
sx = iscale_x * j;
for( k = 0; k < cn; k++ )
xofs[dx*cn + k] = sx + k;
xofs[j + k] = sx + k;
}
func( src, dst, ofs, xofs, iscale_x, iscale_y );
@@ -1643,7 +1853,8 @@ void cv::resize( InputArray _src, OutputArray _dst, Size dsize,
for( dx = 0, k = 0; dx < dsize.width; dx++ )
{
double fsx1 = dx*scale_x, fsx2 = fsx1 + scale_x;
double fsx1 = dx*scale_x;
double fsx2 = fsx1 + scale_x;
int sx1 = cvCeil(fsx1), sx2 = cvFloor(fsx2);
sx1 = std::min(sx1, ssize.width-1);
sx2 = std::min(sx2, ssize.width-1);
@@ -1672,9 +1883,11 @@ void cv::resize( InputArray _src, OutputArray _dst, Size dsize,
xofs[k++].alpha = (float)(min(fsx2 - sx2, 1.) / min(scale_x, src.cols - fsx1));
}
}
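// Worked example (hypothetical numbers): scale_x = 2.4, dx = 1 gives
// fsx1 = 2.4, fsx2 = 4.8, so sx1 = 3, sx2 = 4. Column 2 contributes the
// left sliver with alpha = (3 - 2.4)/2.4 = 0.25, column 3 a full cell with
// alpha = 1/2.4, and column 4 the right sliver with
// alpha = (4.8 - 4)/2.4 = 1/3; the three weights sum to 1.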
func( src, dst, xofs, k ,scale_y);
func( src, dst, xofs, k, scale_y);
return;
}
}
int xmin = 0, xmax = dsize.width, width = dsize.width*cn;
bool area_mode = interpolation == INTER_AREA;
@@ -2549,134 +2762,48 @@ typedef void (*RemapFunc)(const Mat& _src, Mat& _dst, const Mat& _xy,
const Mat& _fxy, const void* _wtab,
int borderType, const Scalar& _borderValue);
}
void cv::remap( InputArray _src, OutputArray _dst,
InputArray _map1, InputArray _map2,
int interpolation, int borderType, const Scalar& borderValue )
class remapInvoker :
public ParallelLoopBody
{
static RemapNNFunc nn_tab[] =
{
remapNearest<uchar>, remapNearest<schar>, remapNearest<ushort>, remapNearest<short>,
remapNearest<int>, remapNearest<float>, remapNearest<double>, 0
};
static RemapFunc linear_tab[] =
{
remapBilinear<FixedPtCast<int, uchar, INTER_REMAP_COEF_BITS>, RemapVec_8u, short>, 0,
remapBilinear<Cast<float, ushort>, RemapNoVec, float>,
remapBilinear<Cast<float, short>, RemapNoVec, float>, 0,
remapBilinear<Cast<float, float>, RemapNoVec, float>,
remapBilinear<Cast<double, double>, RemapNoVec, float>, 0
};
static RemapFunc cubic_tab[] =
{
remapBicubic<FixedPtCast<int, uchar, INTER_REMAP_COEF_BITS>, short, INTER_REMAP_COEF_SCALE>, 0,
remapBicubic<Cast<float, ushort>, float, 1>,
remapBicubic<Cast<float, short>, float, 1>, 0,
remapBicubic<Cast<float, float>, float, 1>,
remapBicubic<Cast<double, double>, float, 1>, 0
};
static RemapFunc lanczos4_tab[] =
{
remapLanczos4<FixedPtCast<int, uchar, INTER_REMAP_COEF_BITS>, short, INTER_REMAP_COEF_SCALE>, 0,
remapLanczos4<Cast<float, ushort>, float, 1>,
remapLanczos4<Cast<float, short>, float, 1>, 0,
remapLanczos4<Cast<float, float>, float, 1>,
remapLanczos4<Cast<double, double>, float, 1>, 0
};
Mat src = _src.getMat(), map1 = _map1.getMat(), map2 = _map2.getMat();
CV_Assert( (!map2.data || map2.size() == map1.size()));
_dst.create( map1.size(), src.type() );
Mat dst = _dst.getMat();
if( dst.data == src.data )
src = src.clone();
int depth = src.depth(), map_depth = map1.depth();
RemapNNFunc nnfunc = 0;
RemapFunc ifunc = 0;
const void* ctab = 0;
bool fixpt = depth == CV_8U;
bool planar_input = false;
if( interpolation == INTER_NEAREST )
public:
remapInvoker(const Mat& _src, Mat _dst, const Mat& _map1, const Mat& _map2, const Mat *_m1,
const Mat *_m2, int _interpolation, int _borderType, const Scalar &_borderValue,
int _planar_input, RemapNNFunc _nnfunc, RemapFunc _ifunc, const void *_ctab) :
ParallelLoopBody(), src(_src), dst(_dst), map1(_map1), map2(_map2), m1(_m1), m2(_m2),
interpolation(_interpolation), borderType(_borderType), borderValue(_borderValue),
planar_input(_planar_input), nnfunc(_nnfunc), ifunc(_ifunc), ctab(_ctab)
{
nnfunc = nn_tab[depth];
CV_Assert( nnfunc != 0 );
if( map1.type() == CV_16SC2 && !map2.data ) // the data is already in the right format
{
nnfunc( src, dst, map1, borderType, borderValue );
return;
}
}
else
{
if( interpolation == INTER_AREA )
interpolation = INTER_LINEAR;
if( interpolation == INTER_LINEAR )
ifunc = linear_tab[depth];
else if( interpolation == INTER_CUBIC )
ifunc = cubic_tab[depth];
else if( interpolation == INTER_LANCZOS4 )
ifunc = lanczos4_tab[depth];
else
CV_Error( CV_StsBadArg, "Unknown interpolation method" );
CV_Assert( ifunc != 0 );
ctab = initInterTab2D( interpolation, fixpt );
}
const Mat *m1 = &map1, *m2 = &map2;
if( (map1.type() == CV_16SC2 && (map2.type() == CV_16UC1 || map2.type() == CV_16SC1)) ||
(map2.type() == CV_16SC2 && (map1.type() == CV_16UC1 || map1.type() == CV_16SC1)) )
{
if( map1.type() != CV_16SC2 )
std::swap(m1, m2);
if( ifunc )
virtual void operator() (const Range& range) const
{
ifunc( src, dst, *m1, *m2, ctab, borderType, borderValue );
return;
}
}
else
{
CV_Assert( (map1.type() == CV_32FC2 && !map2.data) ||
(map1.type() == CV_32FC1 && map2.type() == CV_32FC1) );
planar_input = map1.channels() == 1;
}
int x, y, x1, y1;
const int buf_size = 1 << 14;
int brows0 = std::min(128, dst.rows);
int brows0 = std::min(128, dst.rows), map_depth = map1.depth();
int bcols0 = std::min(buf_size/brows0, dst.cols);
brows0 = std::min(buf_size/bcols0, dst.rows);
#if CV_SSE2
#if CV_SSE2
bool useSIMD = checkHardwareSupport(CV_CPU_SSE2);
#endif
#endif
Mat _bufxy(brows0, bcols0, CV_16SC2), _bufa;
if( !nnfunc )
_bufa.create(brows0, bcols0, CV_16UC1);
for( y = 0; y < dst.rows; y += brows0 )
for( y = range.start; y < range.end; y += brows0 )
{
for( x = 0; x < dst.cols; x += bcols0 )
{
int brows = std::min(brows0, dst.rows - y);
int brows = std::min(brows0, range.end - y);
int bcols = std::min(bcols0, dst.cols - x);
Mat dpart(dst, Rect(x, y, bcols, brows));
Mat bufxy(_bufxy, Rect(0, 0, bcols, brows));
if( nnfunc )
{
if( map_depth != CV_32F )
if( map1.type() == CV_16SC2 && !map2.data ) // the data is already in the right format
bufxy = map1(Rect(x, y, bcols, brows));
else if( map_depth != CV_32F )
{
for( y1 = 0; y1 < brows; y1++ )
{
@@ -2693,7 +2820,7 @@ void cv::remap( InputArray _src, OutputArray _dst,
}
}
else if( !planar_input )
map1(Rect(0,0,bcols,brows)).convertTo(bufxy, bufxy.depth());
map1(Rect(x, y, bcols, brows)).convertTo(bufxy, bufxy.depth());
else
{
for( y1 = 0; y1 < brows; y1++ )
@@ -2737,13 +2864,19 @@ void cv::remap( InputArray _src, OutputArray _dst,
continue;
}
Mat bufa(_bufa, Rect(0,0,bcols, brows));
Mat bufa(_bufa, Rect(0, 0, bcols, brows));
for( y1 = 0; y1 < brows; y1++ )
{
short* XY = (short*)(bufxy.data + bufxy.step*y1);
ushort* A = (ushort*)(bufa.data + bufa.step*y1);
if( planar_input )
if( (map1.type() == CV_16SC2 && (map2.type() == CV_16UC1 || map2.type() == CV_16SC1)) ||
(map2.type() == CV_16SC2 && (map1.type() == CV_16UC1 || map1.type() == CV_16SC1)) )
{
bufxy = m1->operator()(Rect(x, y, bcols, brows));
bufa = m2->operator()(Rect(x, y, bcols, brows));
}
else if( planar_input )
{
const float* sX = (const float*)(map1.data + map1.step*(y+y1)) + x;
const float* sY = (const float*)(map2.data + map2.step*(y+y1)) + x;
@@ -2815,6 +2948,118 @@ void cv::remap( InputArray _src, OutputArray _dst,
ifunc(src, dpart, bufxy, bufa, ctab, borderType, borderValue);
}
}
}
private:
const Mat src;
Mat dst;
const Mat map1, map2, *m1, *m2;
int interpolation, borderType;
const Scalar borderValue;
int planar_input;
RemapNNFunc nnfunc;
RemapFunc ifunc;
const void *ctab;
};
}
void cv::remap( InputArray _src, OutputArray _dst,
InputArray _map1, InputArray _map2,
int interpolation, int borderType, const Scalar& borderValue )
{
static RemapNNFunc nn_tab[] =
{
remapNearest<uchar>, remapNearest<schar>, remapNearest<ushort>, remapNearest<short>,
remapNearest<int>, remapNearest<float>, remapNearest<double>, 0
};
static RemapFunc linear_tab[] =
{
remapBilinear<FixedPtCast<int, uchar, INTER_REMAP_COEF_BITS>, RemapVec_8u, short>, 0,
remapBilinear<Cast<float, ushort>, RemapNoVec, float>,
remapBilinear<Cast<float, short>, RemapNoVec, float>, 0,
remapBilinear<Cast<float, float>, RemapNoVec, float>,
remapBilinear<Cast<double, double>, RemapNoVec, float>, 0
};
static RemapFunc cubic_tab[] =
{
remapBicubic<FixedPtCast<int, uchar, INTER_REMAP_COEF_BITS>, short, INTER_REMAP_COEF_SCALE>, 0,
remapBicubic<Cast<float, ushort>, float, 1>,
remapBicubic<Cast<float, short>, float, 1>, 0,
remapBicubic<Cast<float, float>, float, 1>,
remapBicubic<Cast<double, double>, float, 1>, 0
};
static RemapFunc lanczos4_tab[] =
{
remapLanczos4<FixedPtCast<int, uchar, INTER_REMAP_COEF_BITS>, short, INTER_REMAP_COEF_SCALE>, 0,
remapLanczos4<Cast<float, ushort>, float, 1>,
remapLanczos4<Cast<float, short>, float, 1>, 0,
remapLanczos4<Cast<float, float>, float, 1>,
remapLanczos4<Cast<double, double>, float, 1>, 0
};
Mat src = _src.getMat(), map1 = _map1.getMat(), map2 = _map2.getMat();
CV_Assert( map1.size().area() > 0 );
CV_Assert( !map2.data || (map2.size() == map1.size()));
_dst.create( map1.size(), src.type() );
Mat dst = _dst.getMat();
if( dst.data == src.data )
src = src.clone();
int depth = src.depth();
RemapNNFunc nnfunc = 0;
RemapFunc ifunc = 0;
const void* ctab = 0;
bool fixpt = depth == CV_8U;
bool planar_input = false;
if( interpolation == INTER_NEAREST )
{
nnfunc = nn_tab[depth];
CV_Assert( nnfunc != 0 );
}
else
{
if( interpolation == INTER_AREA )
interpolation = INTER_LINEAR;
if( interpolation == INTER_LINEAR )
ifunc = linear_tab[depth];
else if( interpolation == INTER_CUBIC )
ifunc = cubic_tab[depth];
else if( interpolation == INTER_LANCZOS4 )
ifunc = lanczos4_tab[depth];
else
CV_Error( CV_StsBadArg, "Unknown interpolation method" );
CV_Assert( ifunc != 0 );
ctab = initInterTab2D( interpolation, fixpt );
}
const Mat *m1 = &map1, *m2 = &map2;
if( (map1.type() == CV_16SC2 && (map2.type() == CV_16UC1 || map2.type() == CV_16SC1)) ||
(map2.type() == CV_16SC2 && (map1.type() == CV_16UC1 || map1.type() == CV_16SC1)) )
{
if( map1.type() != CV_16SC2 )
std::swap(m1, m2);
}
else
{
CV_Assert( ((map1.type() == CV_32FC2 || map1.type() == CV_16SC2) && !map2.data) ||
(map1.type() == CV_32FC1 && map2.type() == CV_32FC1) );
planar_input = map1.channels() == 1;
}
Range range(0, dst.rows);
remapInvoker invoker(src, dst, map1, map2, m1, m2, interpolation,
borderType, borderValue, planar_input, nnfunc, ifunc,
ctab);
parallel_for_(range, invoker);
}
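A minimal call into the new parallel remap, building a float map that mirrors the image horizontally (illustrative; same idea as the perf test above):

    Mat src(480, 640, CV_8UC3), dst;
    Mat mapx(src.size(), CV_32FC1), mapy(src.size(), CV_32FC1);
    for( int y = 0; y < src.rows; y++ )
        for( int x = 0; x < src.cols; x++ )
        {
            mapx.at<float>(y, x) = (float)(src.cols - 1 - x); // horizontal flip
            mapy.at<float>(y, x) = (float)y;                  // keep the row
        }
    remap(src, dst, mapx, mapy, INTER_LINEAR);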
@@ -2957,71 +3202,42 @@ void cv::convertMaps( InputArray _map1, InputArray _map2,
}
void cv::warpAffine( InputArray _src, OutputArray _dst,
InputArray _M0, Size dsize,
int flags, int borderType, const Scalar& borderValue )
namespace cv
{
Mat src = _src.getMat(), M0 = _M0.getMat();
_dst.create( dsize.area() == 0 ? src.size() : dsize, src.type() );
Mat dst = _dst.getMat();
CV_Assert( src.cols > 0 && src.rows > 0 );
if( dst.data == src.data )
src = src.clone();
const int BLOCK_SZ = 64;
short XY[BLOCK_SZ*BLOCK_SZ*2], A[BLOCK_SZ*BLOCK_SZ];
double M[6];
Mat matM(2, 3, CV_64F, M);
int interpolation = flags & INTER_MAX;
if( interpolation == INTER_AREA )
interpolation = INTER_LINEAR;
CV_Assert( (M0.type() == CV_32F || M0.type() == CV_64F) && M0.rows == 2 && M0.cols == 3 );
M0.convertTo(matM, matM.type());
#ifdef HAVE_TEGRA_OPTIMIZATION
if( tegra::warpAffine(src, dst, M, flags, borderType, borderValue) )
return;
#endif
if( !(flags & WARP_INVERSE_MAP) )
class warpAffineInvoker :
public ParallelLoopBody
{
public:
warpAffineInvoker(const Mat &_src, Mat &_dst, int _interpolation, int _borderType,
const Scalar &_borderValue, int *_adelta, int *_bdelta, double *_M) :
ParallelLoopBody(), src(_src), dst(_dst), interpolation(_interpolation),
borderType(_borderType), borderValue(_borderValue), adelta(_adelta), bdelta(_bdelta),
M(_M)
{
double D = M[0]*M[4] - M[1]*M[3];
D = D != 0 ? 1./D : 0;
double A11 = M[4]*D, A22=M[0]*D;
M[0] = A11; M[1] *= -D;
M[3] *= -D; M[4] = A22;
double b1 = -M[0]*M[2] - M[1]*M[5];
double b2 = -M[3]*M[2] - M[4]*M[5];
M[2] = b1; M[5] = b2;
}
int x, y, x1, y1, width = dst.cols, height = dst.rows;
AutoBuffer<int> _abdelta(width*2);
int* adelta = &_abdelta[0], *bdelta = adelta + width;
virtual void operator() (const Range& range) const
{
const int BLOCK_SZ = 64;
short XY[BLOCK_SZ*BLOCK_SZ*2], A[BLOCK_SZ*BLOCK_SZ];
const int AB_BITS = MAX(10, (int)INTER_BITS);
const int AB_SCALE = 1 << AB_BITS;
int round_delta = interpolation == INTER_NEAREST ? AB_SCALE/2 : AB_SCALE/INTER_TAB_SIZE/2;
#if CV_SSE2
int round_delta = interpolation == INTER_NEAREST ? AB_SCALE/2 : AB_SCALE/INTER_TAB_SIZE/2, x, y, x1, y1;
#if CV_SSE2
bool useSIMD = checkHardwareSupport(CV_CPU_SSE2);
#endif
for( x = 0; x < width; x++ )
{
adelta[x] = saturate_cast<int>(M[0]*x*AB_SCALE);
bdelta[x] = saturate_cast<int>(M[3]*x*AB_SCALE);
}
#endif
int bh0 = std::min(BLOCK_SZ/2, height);
int bw0 = std::min(BLOCK_SZ*BLOCK_SZ/bh0, width);
bh0 = std::min(BLOCK_SZ*BLOCK_SZ/bw0, height);
int bh0 = std::min(BLOCK_SZ/2, dst.rows);
int bw0 = std::min(BLOCK_SZ*BLOCK_SZ/bh0, dst.cols);
bh0 = std::min(BLOCK_SZ*BLOCK_SZ/bw0, dst.rows);
for( y = 0; y < height; y += bh0 )
for( y = range.start; y < range.end; y += bh0 )
{
for( x = 0; x < width; x += bw0 )
for( x = 0; x < dst.cols; x += bw0 )
{
int bw = std::min( bw0, width - x);
int bh = std::min( bh0, height - y);
int bw = std::min( bw0, dst.cols - x);
int bh = std::min( bh0, range.end - y);
Mat _XY(bh, bw, CV_16SC2, XY), matA;
Mat dpart(dst, Rect(x, y, bw, bh));
@@ -3099,51 +3315,107 @@ void cv::warpAffine( InputArray _src, OutputArray _dst,
}
}
}
}
private:
const Mat src;
Mat dst;
int interpolation, borderType;
const Scalar borderValue;
int *adelta, *bdelta;
double *M;
};
}
void cv::warpPerspective( InputArray _src, OutputArray _dst, InputArray _M0,
Size dsize, int flags, int borderType, const Scalar& borderValue )
void cv::warpAffine( InputArray _src, OutputArray _dst,
InputArray _M0, Size dsize,
int flags, int borderType, const Scalar& borderValue )
{
Mat src = _src.getMat(), M0 = _M0.getMat();
_dst.create( dsize.area() == 0 ? src.size() : dsize, src.type() );
Mat dst = _dst.getMat();
CV_Assert( src.cols > 0 && src.rows > 0 );
if( dst.data == src.data )
src = src.clone();
const int BLOCK_SZ = 32;
short XY[BLOCK_SZ*BLOCK_SZ*2], A[BLOCK_SZ*BLOCK_SZ];
double M[9];
Mat matM(3, 3, CV_64F, M);
double M[6];
Mat matM(2, 3, CV_64F, M);
int interpolation = flags & INTER_MAX;
if( interpolation == INTER_AREA )
interpolation = INTER_LINEAR;
CV_Assert( (M0.type() == CV_32F || M0.type() == CV_64F) && M0.rows == 3 && M0.cols == 3 );
CV_Assert( (M0.type() == CV_32F || M0.type() == CV_64F) && M0.rows == 2 && M0.cols == 3 );
M0.convertTo(matM, matM.type());
#ifdef HAVE_TEGRA_OPTIMIZATION
if( tegra::warpPerspective(src, dst, M, flags, borderType, borderValue) )
if( tegra::warpAffine(src, dst, M, flags, borderType, borderValue) )
return;
#endif
if( !(flags & WARP_INVERSE_MAP) )
invert(matM, matM);
{
double D = M[0]*M[4] - M[1]*M[3];
D = D != 0 ? 1./D : 0;
double A11 = M[4]*D, A22=M[0]*D;
M[0] = A11; M[1] *= -D;
M[3] *= -D; M[4] = A22;
double b1 = -M[0]*M[2] - M[1]*M[5];
double b2 = -M[3]*M[2] - M[4]*M[5];
M[2] = b1; M[5] = b2;
}
int x;
AutoBuffer<int> _abdelta(dst.cols*2);
int* adelta = &_abdelta[0], *bdelta = adelta + dst.cols;
const int AB_BITS = MAX(10, (int)INTER_BITS);
const int AB_SCALE = 1 << AB_BITS;
for( x = 0; x < dst.cols; x++ )
{
adelta[x] = saturate_cast<int>(M[0]*x*AB_SCALE);
bdelta[x] = saturate_cast<int>(M[3]*x*AB_SCALE);
}
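// Illustrative numbers: with AB_BITS = 10, AB_SCALE = 1024, a coefficient
// M[0] = 0.5 is stored as adelta[x] = round(0.5*x*1024). The invoker adds
// these integer deltas per pixel and shifts back down (by AB_BITS for
// INTER_NEAREST, by AB_BITS - INTER_BITS otherwise), keeping INTER_BITS
// bits of subpixel position for the interpolation tables.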
Range range(0, dst.rows);
warpAffineInvoker invoker(src, dst, interpolation, borderType,
borderValue, adelta, bdelta, M);
parallel_for_(range, invoker);
}
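The open-coded 2x3 inversion above computes [A | b]^-1 = [A^-1 | -A^-1 b], which is what cv::invertAffineTransform returns. A quick hedged check:

    Mat M = getRotationMatrix2D(Point2f(320.f, 240.f), 30.0, 1.0); // 2x3, CV_64F
    Mat iM;
    invertAffineTransform(M, iM);
    // warpAffine(src, dst1, M, size, flags) and
    // warpAffine(src, dst2, iM, size, flags | WARP_INVERSE_MAP)
    // should produce the same result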
namespace cv
{
class warpPerspectiveInvoker :
public ParallelLoopBody
{
public:
warpPerspectiveInvoker(const Mat &_src, Mat &_dst, double *_M, int _interpolation,
int _borderType, const Scalar &_borderValue) :
ParallelLoopBody(), src(_src), dst(_dst), M(_M), interpolation(_interpolation),
borderType(_borderType), borderValue(_borderValue)
{
}
virtual void operator() (const Range& range) const
{
const int BLOCK_SZ = 32;
short XY[BLOCK_SZ*BLOCK_SZ*2], A[BLOCK_SZ*BLOCK_SZ];
int x, y, x1, y1, width = dst.cols, height = dst.rows;
int bh0 = std::min(BLOCK_SZ/2, height);
int bw0 = std::min(BLOCK_SZ*BLOCK_SZ/bh0, width);
bh0 = std::min(BLOCK_SZ*BLOCK_SZ/bw0, height);
for( y = 0; y < height; y += bh0 )
for( y = range.start; y < range.end; y += bh0 )
{
for( x = 0; x < width; x += bw0 )
{
int bw = std::min( bw0, width - x);
int bh = std::min( bh0, height - y);
int bh = std::min( bh0, range.end - y); // height
Mat _XY(bh, bw, CV_16SC2, XY), matA;
Mat dpart(dst, Rect(x, y, bw, bh));
@@ -3197,6 +3469,49 @@ void cv::warpPerspective( InputArray _src, OutputArray _dst, InputArray _M0,
}
}
}
}
private:
const Mat src;
Mat dst;
double* M;
int interpolation, borderType;
const Scalar borderValue;
};
}
void cv::warpPerspective( InputArray _src, OutputArray _dst, InputArray _M0,
Size dsize, int flags, int borderType, const Scalar& borderValue )
{
Mat src = _src.getMat(), M0 = _M0.getMat();
_dst.create( dsize.area() == 0 ? src.size() : dsize, src.type() );
Mat dst = _dst.getMat();
CV_Assert( src.cols > 0 && src.rows > 0 );
if( dst.data == src.data )
src = src.clone();
double M[9];
Mat matM(3, 3, CV_64F, M);
int interpolation = flags & INTER_MAX;
if( interpolation == INTER_AREA )
interpolation = INTER_LINEAR;
CV_Assert( (M0.type() == CV_32F || M0.type() == CV_64F) && M0.rows == 3 && M0.cols == 3 );
M0.convertTo(matM, matM.type());
#ifdef HAVE_TEGRA_OPTIMIZATION
if( tegra::warpPerspective(src, dst, M, flags, borderType, borderValue) )
return;
#endif
if( !(flags & WARP_INVERSE_MAP) )
invert(matM, matM);
Range range(0, dst.rows);
warpPerspectiveInvoker invoker(src, dst, M, interpolation, borderType, borderValue);
parallel_for_(range, invoker);
}
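And a minimal call into the parallel warpPerspective (illustrative):

    Point2f srcQuad[4] = { Point2f(0, 0), Point2f(639, 0),
                           Point2f(639, 479), Point2f(0, 479) };
    Point2f dstQuad[4] = { Point2f(20, 10), Point2f(620, 30),
                           Point2f(600, 460), Point2f(40, 450) };
    Mat H = getPerspectiveTransform(srcQuad, dstQuad); // 3x3, CV_64F
    Mat src(480, 640, CV_8UC3), warped;
    warpPerspective(src, warped, H, src.size(), INTER_LINEAR);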