Commit 92795ba4 authored by Ilya Lavrenov

parallel version of remap, resize, warpaffine, warpPerspective. Some…

Parallel versions of remap, resize, warpAffine and warpPerspective. Some optimization for 2x decimation in the resize algorithm.
parent f2a02fef
#include "perf_precomp.hpp"
using namespace std;
using namespace cv;
using namespace perf;
using namespace testing;
using std::tr1::make_tuple;
using std::tr1::get;
// Source matrix types the remap test is run with (consumed below via MatrixType::all()).
CV_ENUM(MatrixType, CV_16UC1, CV_16SC1, CV_32FC1)
// Types of the first coordinate map: one two-channel map (CV_16SC2, CV_32FC2)
// or a single-channel float map that is paired with a second CV_32FC1 map.
CV_ENUM(MapType, CV_16SC2, CV_32FC1, CV_32FC2)
// Interpolation modes passed to remap().
CV_ENUM(InterType, INTER_LINEAR, INTER_CUBIC, INTER_LANCZOS4, INTER_NEAREST)
// Test fixture parameterised by (image size, source type, map type, interpolation).
typedef TestBaseWithParam< tr1::tuple<Size, MatrixType, MapType, InterType> > TestRemap;
/*
 * Performance test for remap() over every combination of image size, source
 * matrix type, coordinate-map type and interpolation mode.
 *
 * The coordinate maps describe a horizontal mirror: destination pixel (i, j)
 * samples source pixel (src.cols - 1 - i, j). The "- 1" keeps every sampled
 * column inside [0, src.cols - 1]; without it the first destination column
 * would address column src.cols, which is one past the end of the source.
 */
PERF_TEST_P( TestRemap, Remap,
    Combine(
        Values( szVGA, sz1080p ),
        ValuesIn( MatrixType::all() ),
        ValuesIn( MapType::all() ),
        ValuesIn( InterType::all() )
    )
)
{
    const Size sz = get<0>(GetParam());
    const int src_type = get<1>(GetParam());
    const int map1_type = get<2>(GetParam());
    const int inter_type = get<3>(GetParam());

    Mat src(sz, src_type);
    Mat map1(sz, map1_type);
    Mat dst(sz, src_type);
    // A separate y-map is only allocated when map1 is a single-channel float map;
    // otherwise map2 stays empty.
    Mat map2(map1_type == CV_32FC1 ? sz : Size(), CV_32FC1);

    RNG rng;
    rng.fill(src, RNG::UNIFORM, 0, 256);

    for (int j = 0; j < map1.rows; ++j)
    {
        for (int i = 0; i < map1.cols; ++i)
        {
            // Mirrored source column for destination column i.
            const int x = src.cols - i - 1;

            switch (map1_type)
            {
            case CV_32FC1:
                map1.at<float>(j, i) = static_cast<float>(x);
                map2.at<float>(j, i) = static_cast<float>(j);
                break;
            case CV_32FC2:
                map1.at<Vec2f>(j, i)[0] = static_cast<float>(x);
                map1.at<Vec2f>(j, i)[1] = static_cast<float>(j);
                break;
            case CV_16SC2:
                map1.at<Vec2s>(j, i)[0] = static_cast<short>(x);
                map1.at<Vec2s>(j, i)[1] = static_cast<short>(j);
                break;
            default:
                // MapType lists only the three types handled above.
                CV_Assert(0);
            }
        }
    }

    declare.in(src, WARMUP_RNG).out(dst).time(20);

    TEST_CYCLE() remap(src, dst, map1, map2, inter_type);

    SANITY_CHECK(dst);
}
...@@ -59,11 +59,11 @@ PERF_TEST_P(MatInfo_Size_Size, resizeDownLinear, ...@@ -59,11 +59,11 @@ PERF_TEST_P(MatInfo_Size_Size, resizeDownLinear,
typedef tr1::tuple<MatType, Size, int> MatInfo_Size_Scale_t; typedef tr1::tuple<MatType, Size, int> MatInfo_Size_Scale_t;
typedef TestBaseWithParam<MatInfo_Size_Scale_t> MatInfo_Size_Scale; typedef TestBaseWithParam<MatInfo_Size_Scale_t> MatInfo_Size_Scale;
PERF_TEST_P(MatInfo_Size_Scale, resizeAreaFast, PERF_TEST_P(MatInfo_Size_Scale, ResizeAreaFast,
testing::Combine( testing::Combine(
testing::Values(CV_8UC1, CV_8UC4), testing::Values(CV_8UC1, CV_8UC4),
testing::Values(szVGA, szqHD, sz720p, sz1080p), testing::Values(szVGA, szqHD, sz720p, sz1080p),
testing::Values(2, 4) testing::Values(2)
) )
) )
{ {
...@@ -84,3 +84,31 @@ PERF_TEST_P(MatInfo_Size_Scale, resizeAreaFast, ...@@ -84,3 +84,31 @@ PERF_TEST_P(MatInfo_Size_Scale, resizeAreaFast,
//difference equal to 1 is allowed because of different possible rounding modes: round-to-nearest vs bankers' rounding //difference equal to 1 is allowed because of different possible rounding modes: round-to-nearest vs bankers' rounding
SANITY_CHECK(dst, 1); SANITY_CHECK(dst, 1);
} }
// Fixture parameterised by (matrix type, source size, scale factor).
typedef TestBaseWithParam<tr1::tuple<MatType, Size, double> > MatInfo_Size_Scale_Area;

/*
 * Performance test for resize() with INTER_AREA and non-integer scale factors.
 * The destination size is the source size multiplied by the scale factor and
 * rounded with cvRound().
 */
PERF_TEST_P(MatInfo_Size_Scale_Area, ResizeArea,
            testing::Combine(
                testing::Values(CV_8UC1, CV_8UC4),
                testing::Values(szVGA, szqHD, sz720p, sz1080p),
                testing::Values(2.4, 3.4, 1.3)
                )
            )
{
    const int type = get<0>(GetParam());
    const Size srcSize = get<1>(GetParam());
    const double factor = get<2>(GetParam());

    cv::Mat src(srcSize, type);

    const int dstWidth = cvRound(srcSize.width * factor);
    const int dstHeight = cvRound(srcSize.height * factor);
    cv::Mat dst(Size(dstWidth, dstHeight), type);

    declare.in(src, WARMUP_RNG).out(dst);

    TEST_CYCLE() resize(src, dst, dst.size(), 0, 0, INTER_AREA);

    // A difference of 1 is tolerated because rounding may be performed either as
    // round-to-nearest or as bankers' rounding.
    SANITY_CHECK(dst, 1);
}
...@@ -240,24 +240,22 @@ template<typename ST, typename DT, int bits> struct FixedPtCast ...@@ -240,24 +240,22 @@ template<typename ST, typename DT, int bits> struct FixedPtCast
* Resize * * Resize *
\****************************************************************************************/ \****************************************************************************************/
static void class resizeNNInvoker :
resizeNN( const Mat& src, Mat& dst, double fx, double fy ) public ParallelLoopBody
{ {
Size ssize = src.size(), dsize = dst.size(); public:
AutoBuffer<int> _x_ofs(dsize.width); resizeNNInvoker(const Mat& _src, Mat &_dst, int *_x_ofs, int _pix_size4, double _ify) :
int* x_ofs = _x_ofs; ParallelLoopBody(), src(_src), dst(_dst), x_ofs(_x_ofs), pix_size4(_pix_size4),
int pix_size = (int)src.elemSize(); ify(_ify)
int pix_size4 = (int)(pix_size / sizeof(int));
double ifx = 1./fx, ify = 1./fy;
int x, y;
for( x = 0; x < dsize.width; x++ )
{ {
int sx = cvFloor(x*ifx);
x_ofs[x] = std::min(sx, ssize.width-1)*pix_size;
} }
for( y = 0; y < dsize.height; y++ ) virtual void operator() (const Range& range) const
{
Size ssize = src.size(), dsize = dst.size();
int y, x, pix_size = (int)src.elemSize();
for( y = range.start; y < range.end; y++ )
{ {
uchar* D = dst.data + dst.step*y; uchar* D = dst.data + dst.step*y;
int sy = std::min(cvFloor(y*ify), ssize.height-1); int sy = std::min(cvFloor(y*ify), ssize.height-1);
...@@ -326,6 +324,35 @@ resizeNN( const Mat& src, Mat& dst, double fx, double fy ) ...@@ -326,6 +324,35 @@ resizeNN( const Mat& src, Mat& dst, double fx, double fy )
} }
} }
} }
}
private:
const Mat src;
Mat dst;
int* x_ofs, pix_size4;
double ify;
};
static void
resizeNN( const Mat& src, Mat& dst, double fx, double fy )
{
Size ssize = src.size(), dsize = dst.size();
AutoBuffer<int> _x_ofs(dsize.width);
int* x_ofs = _x_ofs;
int pix_size = (int)src.elemSize();
int pix_size4 = (int)(pix_size / sizeof(int));
double ifx = 1./fx, ify = 1./fy;
int x;
for( x = 0; x < dsize.width; x++ )
{
int sx = cvFloor(x*ifx);
x_ofs[x] = std::min(sx, ssize.width-1)*pix_size;
}
Range range(0, dsize.height);
resizeNNInvoker invoker(src, dst, x_ofs, pix_size4, ify);
parallel_for_(range, invoker);
} }
...@@ -1092,33 +1119,35 @@ static inline int clip(int x, int a, int b) ...@@ -1092,33 +1119,35 @@ static inline int clip(int x, int a, int b)
static const int MAX_ESIZE=16; static const int MAX_ESIZE=16;
template<class HResize, class VResize> template <typename HResize, typename VResize>
static void resizeGeneric_( const Mat& src, Mat& dst, class resizeGeneric_Invoker :
const int* xofs, const void* _alpha, public ParallelLoopBody
const int* yofs, const void* _beta,
int xmin, int xmax, int ksize )
{ {
public:
typedef typename HResize::value_type T; typedef typename HResize::value_type T;
typedef typename HResize::buf_type WT; typedef typename HResize::buf_type WT;
typedef typename HResize::alpha_type AT; typedef typename HResize::alpha_type AT;
const AT* alpha = (const AT*)_alpha; resizeGeneric_Invoker(const Mat& _src, Mat &_dst, const int *_xofs, const int *_yofs,
const AT* beta = (const AT*)_beta; const AT* _alpha, const AT* __beta, const Size& _ssize, const Size &_dsize,
Size ssize = src.size(), dsize = dst.size(); int _ksize, int _xmin, int _xmax) :
int cn = src.channels(); ParallelLoopBody(), src(_src), dst(_dst), xofs(_xofs), yofs(_yofs),
ssize.width *= cn; alpha(_alpha), _beta(__beta), ssize(_ssize), dsize(_dsize),
dsize.width *= cn; ksize(_ksize), xmin(_xmin), xmax(_xmax)
{
}
virtual void operator() (const Range& range) const
{
int dy, cn = src.channels();
HResize hresize;
VResize vresize;
int bufstep = (int)alignSize(dsize.width, 16); int bufstep = (int)alignSize(dsize.width, 16);
AutoBuffer<WT> _buffer(bufstep*ksize); AutoBuffer<WT> _buffer(bufstep*ksize);
const T* srows[MAX_ESIZE]={0}; const T* srows[MAX_ESIZE]={0};
WT* rows[MAX_ESIZE]={0}; WT* rows[MAX_ESIZE]={0};
int prev_sy[MAX_ESIZE]; int prev_sy[MAX_ESIZE];
int dy;
xmin *= cn;
xmax *= cn;
HResize hresize;
VResize vresize;
for(int k = 0; k < ksize; k++ ) for(int k = 0; k < ksize; k++ )
{ {
...@@ -1126,8 +1155,9 @@ static void resizeGeneric_( const Mat& src, Mat& dst, ...@@ -1126,8 +1155,9 @@ static void resizeGeneric_( const Mat& src, Mat& dst,
rows[k] = (WT*)_buffer + bufstep*k; rows[k] = (WT*)_buffer + bufstep*k;
} }
// image resize is a separable operation. In case of not too strong const AT* beta = _beta + ksize * range.start;
for( dy = 0; dy < dsize.height; dy++, beta += ksize )
for( dy = range.start; dy < range.end; dy++, beta += ksize )
{ {
int sy0 = yofs[dy], k0=ksize, k1=0, ksize2 = ksize/2; int sy0 = yofs[dy], k0=ksize, k1=0, ksize2 = ksize/2;
...@@ -1145,35 +1175,144 @@ static void resizeGeneric_( const Mat& src, Mat& dst, ...@@ -1145,35 +1175,144 @@ static void resizeGeneric_( const Mat& src, Mat& dst,
} }
if( k1 == ksize ) if( k1 == ksize )
k0 = std::min(k0, k); // remember the first row that needs to be computed k0 = std::min(k0, k); // remember the first row that needs to be computed
srows[k] = (const T*)(src.data + src.step*sy); srows[k] = (T*)(src.data + src.step*sy);
prev_sy[k] = sy; prev_sy[k] = sy;
} }
if( k0 < ksize ) if( k0 < ksize )
hresize( srows + k0, rows + k0, ksize - k0, xofs, alpha, hresize( (const T**)(srows + k0), (WT**)(rows + k0), ksize - k0, xofs, (const AT*)(alpha),
ssize.width, dsize.width, cn, xmin, xmax ); ssize.width, dsize.width, cn, xmin, xmax );
vresize( (const WT**)rows, (T*)(dst.data + dst.step*dy), beta, dsize.width ); vresize( (const WT**)rows, (T*)(dst.data + dst.step*dy), beta, dsize.width );
} }
}
private:
const Mat src;
Mat dst;
const int* xofs, *yofs;
const AT* alpha, *_beta;
const Size ssize, dsize;
const int ksize, xmin, xmax;
};
template<class HResize, class VResize>
static void resizeGeneric_( const Mat& src, Mat& dst,
const int* xofs, const void* _alpha,
const int* yofs, const void* _beta,
int xmin, int xmax, int ksize )
{
typedef typename HResize::value_type T;
typedef typename HResize::buf_type WT;
typedef typename HResize::alpha_type AT;
const AT* beta = (const AT*)_beta;
Size ssize = src.size(), dsize = dst.size();
int cn = src.channels();
ssize.width *= cn;
dsize.width *= cn;
xmin *= cn;
xmax *= cn;
// image resize is a separable operation. In case of not too strong
Range range(0, dsize.height);
resizeGeneric_Invoker<HResize, VResize> invoker(src, dst, xofs, yofs, (const AT*)_alpha, beta,
ssize, dsize, ksize, xmin, xmax);
parallel_for_(range, invoker);
} }
// No-op "vector" stage for the fast area resize.
// It accepts the same constructor and call arguments as an accelerated
// implementation but ignores them all and reports that zero destination
// elements were produced, so the caller's scalar loop handles the whole row.
template <typename T, typename WT>
struct ResizeAreaFastNoVec
{
    // Arguments: scale_x, scale_y, channel count, row step (all unused).
    ResizeAreaFastNoVec(int, int, int, int)
    {
    }

    // Arguments: source row, destination row, width (all unused).
    int operator() (const T*, T*, int) const
    {
        return 0;
    }
};
// Accelerated stage of the fast area resize for exact 2x2 decimation.
//
// Each destination element is the rounded mean of the 2x2 block of source
// elements belonging to the same channel:
//     D = (top-left + top-right + bottom-left + bottom-right + 2) >> 2
//
// The fast path is enabled only when both scale factors are 2 and the image
// has 1, 3 or 4 channels; otherwise operator() processes nothing and returns 0.
//
// _step is the distance between two consecutive source rows. It is added
// directly to a pointer to T, so it is counted in elements of T.
template <typename T, typename WT>
struct ResizeAreaFast_2x2_8u
{
    ResizeAreaFast_2x2_8u(int _scale_x, int _scale_y, int _cn, int _step/*, const int* _ofs*/) :
        scale_x(_scale_x), scale_y(_scale_y), cn(_cn), step(_step)/*, ofs(_ofs)*/
    {
        fast_mode = scale_x == 2 && scale_y == 2 && (cn == 1 || cn == 3 || cn == 4);
    }

    // S - first of the two source rows that are averaged
    // D - destination row
    // w - number of destination elements (pixels * channels) to produce
    // Returns the number of destination elements written, or 0 when the fast
    // path is disabled.
    int operator() (const T* S, T* D, int w) const
    {
        if( !fast_mode )
            return 0;

        const T* nextS = S + step;
        int dx = 0;

        // For destination element dx at the start of a pixel, the matching
        // source element is at 2*dx; the horizontal neighbour of the same
        // channel lies cn elements further.
        if (cn == 1)
            for( ; dx < w; ++dx )
            {
                int index = dx*2;
                D[dx] = static_cast<T>((S[index] + S[index+1] + nextS[index] + nextS[index+1] + 2) >> 2);
            }
        else if (cn == 3)
            for( ; dx < w; dx += 3 )
            {
                int index = dx*2;
                D[dx] = static_cast<T>((S[index] + S[index+3] + nextS[index] + nextS[index+3] + 2) >> 2);
                D[dx+1] = static_cast<T>((S[index+1] + S[index+4] + nextS[index+1] + nextS[index+4] + 2) >> 2);
                D[dx+2] = static_cast<T>((S[index+2] + S[index+5] + nextS[index+2] + nextS[index+5] + 2) >> 2);
            }
        else
        {
            assert(cn == 4);
            for( ; dx < w; dx += 4 )
            {
                int index = dx*2;
                // The neighbouring pixel is 4 elements away; a smaller offset
                // would mix different channels.
                D[dx] = static_cast<T>((S[index] + S[index+4] + nextS[index] + nextS[index+4] + 2) >> 2);
                D[dx+1] = static_cast<T>((S[index+1] + S[index+5] + nextS[index+1] + nextS[index+5] + 2) >> 2);
                D[dx+2] = static_cast<T>((S[index+2] + S[index+6] + nextS[index+2] + nextS[index+6] + 2) >> 2);
                D[dx+3] = static_cast<T>((S[index+3] + S[index+7] + nextS[index+3] + nextS[index+7] + 2) >> 2);
            }
        }

        return dx;
    }

private:
    const int scale_x, scale_y;
    const int cn;
    bool fast_mode;
    const int step;
};
template <typename T, typename WT, typename VecOp>
class resizeAreaFast_Invoker :
public ParallelLoopBody
{ {
public:
resizeAreaFast_Invoker(const Mat &_src, Mat &_dst,
int _scale_x, int _scale_y, const int* _ofs, const int* _xofs) :
ParallelLoopBody(), src(_src), dst(_dst), scale_x(_scale_x),
scale_y(_scale_y), ofs(_ofs), xofs(_xofs)
{
}
virtual void operator() (const Range& range) const
{
Size ssize = src.size(), dsize = dst.size(); Size ssize = src.size(), dsize = dst.size();
int cn = src.channels(); int cn = src.channels();
int dy, dx, k = 0;
int area = scale_x*scale_y; int area = scale_x*scale_y;
float scale = 1.f/(scale_x*scale_y); float scale = 1.f/(area);
int dwidth1 = (ssize.width/scale_x)*cn; int dwidth1 = (ssize.width/scale_x)*cn;
dsize.width *= cn; dsize.width *= cn;
ssize.width *= cn; ssize.width *= cn;
int dy, dx, k = 0;
for( dy = 0; dy < dsize.height; dy++ ) VecOp vop(scale_x, scale_y, src.channels(), src.step/*, area_ofs*/);
for( dy = range.start; dy < range.end; dy++ )
{ {
T* D = (T*)(dst.data + dst.step*dy); T* D = (T*)(dst.data + dst.step*dy);
int sy0 = dy*scale_y, w = sy0 + scale_y <= ssize.height ? dwidth1 : 0; int sy0 = dy*scale_y;
int w = sy0 + scale_y <= ssize.height ? dwidth1 : 0;
if( sy0 >= ssize.height ) if( sy0 >= ssize.height )
{ {
for( dx = 0; dx < dsize.width; dx++ ) for( dx = 0; dx < dsize.width; dx++ )
...@@ -1181,11 +1320,12 @@ static void resizeAreaFast_( const Mat& src, Mat& dst, const int* ofs, const int ...@@ -1181,11 +1320,12 @@ static void resizeAreaFast_( const Mat& src, Mat& dst, const int* ofs, const int
continue; continue;
} }
for( dx = 0; dx < w; dx++ ) dx = vop((const T*)(src.data + src.step * sy0), D, w);
for( ; dx < w; dx++ )
{ {
const T* S = (const T*)(src.data + src.step*sy0) + xofs[dx]; const T* S = (const T*)(src.data + src.step * sy0) + xofs[dx];
WT sum = 0; WT sum = 0;
k=0; k = 0;
#if CV_ENABLE_UNROLLED #if CV_ENABLE_UNROLLED
for( ; k <= area - 4; k += 4 ) for( ; k <= area - 4; k += 4 )
sum += S[ofs[k]] + S[ofs[k+1]] + S[ofs[k+2]] + S[ofs[k+3]]; sum += S[ofs[k]] + S[ofs[k+1]] + S[ofs[k+2]] + S[ofs[k+3]];
...@@ -1193,7 +1333,7 @@ static void resizeAreaFast_( const Mat& src, Mat& dst, const int* ofs, const int ...@@ -1193,7 +1333,7 @@ static void resizeAreaFast_( const Mat& src, Mat& dst, const int* ofs, const int
for( ; k < area; k++ ) for( ; k < area; k++ )
sum += S[ofs[k]]; sum += S[ofs[k]];
D[dx] = saturate_cast<T>(sum*scale); D[dx] = saturate_cast<T>(sum * scale);
} }
for( ; dx < dsize.width; dx++ ) for( ; dx < dsize.width; dx++ )
...@@ -1217,9 +1357,26 @@ static void resizeAreaFast_( const Mat& src, Mat& dst, const int* ofs, const int ...@@ -1217,9 +1357,26 @@ static void resizeAreaFast_( const Mat& src, Mat& dst, const int* ofs, const int
} }
} }
D[dx] = saturate_cast<T>((float)sum/count); D[dx] = saturate_cast<WT>((float)sum/count);
} }
} }
}
private:
const Mat src;
Mat dst;
const int scale_x, scale_y;
const int *ofs, *xofs;
};
template<typename T, typename WT, typename VecOp>
static void resizeAreaFast_( const Mat& src, Mat& dst, const int* ofs, const int* xofs,
int scale_x, int scale_y )
{
Range range(0, dst.rows);
resizeAreaFast_Invoker<T, WT, VecOp> invoker(src, dst, scale_x,
scale_y, ofs, xofs);
parallel_for_(range, invoker);
} }
struct DecimateAlpha struct DecimateAlpha
...@@ -1228,24 +1385,46 @@ struct DecimateAlpha ...@@ -1228,24 +1385,46 @@ struct DecimateAlpha
float alpha; float alpha;
}; };
template<typename T, typename WT> template <typename T, typename WT>
static void resizeArea_( const Mat& src, Mat& dst, const DecimateAlpha* xofs, int xofs_count, double scale_y_) class resizeArea_Invoker :
public ParallelLoopBody
{ {
public:
resizeArea_Invoker(const Mat& _src, Mat& _dst, const DecimateAlpha* _xofs,
int _xofs_count, double _scale_y_
#ifdef HAVE_TBB
, const int* _yofs, const int* _cur_dy_ofs
#endif
) :
ParallelLoopBody(), src(_src), dst(_dst), xofs(_xofs),
xofs_count(_xofs_count), scale_y_(_scale_y_)
#ifdef HAVE_TBB
, yofs(_yofs), cur_dy_ofs(_cur_dy_ofs)
#endif
{
}
virtual void operator() (const Range& range) const
{
Size ssize = src.size(), dsize = dst.size(); Size ssize = src.size(), dsize = dst.size();
int cn = src.channels(); int cn = src.channels();
dsize.width *= cn; dsize.width *= cn;
AutoBuffer<WT> _buffer(dsize.width*2); AutoBuffer<WT> _buffer(dsize.width*2);
WT *buf = _buffer, *sum = buf + dsize.width; WT *buf = _buffer, *sum = buf + dsize.width;
int k, sy, dx, cur_dy = 0; int k, sy, dx, cur_dy = 0, num = sizeof(WT) * dsize.width;
WT scale_y = (WT)scale_y_; WT scale_y = (WT)scale_y_;
CV_Assert( cn <= 4 ); CV_Assert( cn <= 4 );
for( dx = 0; dx < dsize.width; dx++ ) memset(buf, 0, num * 2);
buf[dx] = sum[dx] = 0;
for( sy = 0; sy < ssize.height; sy++ ) #ifdef HAVE_TBB
sy = yofs[range.start];
cur_dy = cur_dy_ofs[sy];
for( ; sy < range.start; sy++ )
{ {
const T* S = (const T*)(src.data + src.step*sy); const T* S = (const T*)(src.data + src.step * sy);
memset(buf, 0, num);
if( cn == 1 ) if( cn == 1 )
for( k = 0; k < xofs_count; k++ ) for( k = 0; k < xofs_count; k++ )
{ {
...@@ -1269,9 +1448,11 @@ static void resizeArea_( const Mat& src, Mat& dst, const DecimateAlpha* xofs, in ...@@ -1269,9 +1448,11 @@ static void resizeArea_( const Mat& src, Mat& dst, const DecimateAlpha* xofs, in
int sxn = xofs[k].si; int sxn = xofs[k].si;
int dxn = xofs[k].di; int dxn = xofs[k].di;
WT alpha = xofs[k].alpha; WT alpha = xofs[k].alpha;
WT t0 = buf[dxn] + S[sxn]*alpha; WT t0 = buf[dxn] + S[sxn]*alpha;
WT t1 = buf[dxn+1] + S[sxn+1]*alpha; WT t1 = buf[dxn+1] + S[sxn+1]*alpha;
WT t2 = buf[dxn+2] + S[sxn+2]*alpha; WT t2 = buf[dxn+2] + S[sxn+2]*alpha;
buf[dxn] = t0; buf[dxn+1] = t1; buf[dxn+2] = t2; buf[dxn] = t0; buf[dxn+1] = t1; buf[dxn+2] = t2;
} }
else else
...@@ -1280,35 +1461,30 @@ static void resizeArea_( const Mat& src, Mat& dst, const DecimateAlpha* xofs, in ...@@ -1280,35 +1461,30 @@ static void resizeArea_( const Mat& src, Mat& dst, const DecimateAlpha* xofs, in
int sxn = xofs[k].si; int sxn = xofs[k].si;
int dxn = xofs[k].di; int dxn = xofs[k].di;
WT alpha = xofs[k].alpha; WT alpha = xofs[k].alpha;
WT t0 = buf[dxn] + S[sxn]*alpha; WT t0 = buf[dxn] + S[sxn]*alpha;
WT t1 = buf[dxn+1] + S[sxn+1]*alpha; WT t1 = buf[dxn+1] + S[sxn+1]*alpha;
buf[dxn] = t0; buf[dxn+1] = t1; buf[dxn] = t0; buf[dxn+1] = t1;
t0 = buf[dxn+2] + S[sxn+2]*alpha; t0 = buf[dxn+2] + S[sxn+2]*alpha;
t1 = buf[dxn+3] + S[sxn+3]*alpha; t1 = buf[dxn+3] + S[sxn+3]*alpha;
buf[dxn+2] = t0; buf[dxn+3] = t1; buf[dxn+2] = t0; buf[dxn+3] = t1;
} }
if( (cur_dy + 1)*scale_y <= sy + 1 || sy == ssize.height - 1 ) if( (cur_dy + 1)*scale_y <= sy + 1 || sy == ssize.height - 1 )
{ {
WT beta = std::max(sy + 1 - (cur_dy+1)*scale_y, (WT)0); WT beta = std::max(sy + 1 - (cur_dy + 1) * scale_y, (WT)0);
WT beta1 = 1 - beta;
T* D = (T*)(dst.data + dst.step*cur_dy);
if( fabs(beta) < 1e-3 ) if( fabs(beta) < 1e-3 )
{ {
if(cur_dy >= dsize.height) return; if(cur_dy >= dsize.height)
for( dx = 0; dx < dsize.width; dx++ ) break;
{ memset(sum, 0, num);
D[dx] = saturate_cast<T>((sum[dx] + buf[dx]) / min(scale_y, src.rows - cur_dy * scale_y));
sum[dx] = buf[dx] = 0;
}
} }
else else
for( dx = 0; dx < dsize.width; dx++ ) for( dx = 0; dx < dsize.width; dx++ )
{ sum[dx] = buf[dx] * beta;
D[dx] = saturate_cast<T>((sum[dx] + buf[dx]* beta1)/ min(scale_y, src.rows - cur_dy*scale_y));
sum[dx] = buf[dx]*beta;
buf[dx] = 0;
}
cur_dy++; cur_dy++;
} }
else else
...@@ -1318,132 +1494,151 @@ static void resizeArea_( const Mat& src, Mat& dst, const DecimateAlpha* xofs, in ...@@ -1318,132 +1494,151 @@ static void resizeArea_( const Mat& src, Mat& dst, const DecimateAlpha* xofs, in
WT t0 = sum[dx] + buf[dx]; WT t0 = sum[dx] + buf[dx];
WT t1 = sum[dx+1] + buf[dx+1]; WT t1 = sum[dx+1] + buf[dx+1];
sum[dx] = t0; sum[dx+1] = t1; sum[dx] = t0; sum[dx+1] = t1;
buf[dx] = buf[dx+1] = 0;
} }
for( ; dx < dsize.width; dx++ ) for( ; dx < dsize.width; dx++ )
{
sum[dx] += buf[dx]; sum[dx] += buf[dx];
buf[dx] = 0;
} }
} }
}
}
static void resizeAreaFast_8u( const Mat& src, Mat& dst,
const int* ofs, const int* xofs,
int scale_x, int scale_y )
{
#if CV_SSE2
bool haveSSE2 = checkHardwareSupport(CV_CPU_SSE2);
#endif #endif
Size ssize = src.size(), dsize = dst.size(); for( sy = range.start; sy < range.end; sy++ )
int cn = src.channels();
int dy, dx, k = 0;
int area = scale_x*scale_y;
float scale = 1.f/(scale_x*scale_y);
int dwidth1 = (ssize.width/scale_x)*cn;
dsize.width *= cn;
ssize.width *= cn;
//avg values
for( dy = 0; dy < dsize.height; dy++ )
{
uchar* D = (uchar*)(dst.data + dst.step*dy);
int sy0 = dy*scale_y, w = sy0 + scale_y <= ssize.height ? dwidth1 : 0;
if( sy0 >= ssize.height )
{
for( dx = 0; dx < dsize.width; dx++ ) //memset(D,0, dsize.width);//warning, never executed -> not tested
D[dx] = 0;
continue;
}
dx = 0;
#if CV_SSE2
if( haveSSE2 )
{ {
const __m128 _scale = _mm_set1_ps(scale); const T* S = (const T*)(src.data + src.step * sy);
const __m128i _ucMAXs = _mm_set1_epi16(UCHAR_MAX); memset(buf, 0, num);
const uchar* _S[8];
for(; dx < w-8; dx+=8 ) if( cn == 1 )
for( k = 0; k < xofs_count; k++ )
{ {
__m128i _sum = _mm_setzero_si128(); int dxn = xofs[k].di;
__m128i _sum1 = _mm_setzero_si128(); WT alpha = xofs[k].alpha;
_S[0] = (const uchar*)(src.data + src.step*sy0) + xofs[dx]; buf[dxn] += S[xofs[k].si]*alpha;
_S[1] = (const uchar*)(src.data + src.step*sy0) + xofs[dx+1]; }
_S[2] = (const uchar*)(src.data + src.step*sy0) + xofs[dx+2]; else if( cn == 2 )
_S[3] = (const uchar*)(src.data + src.step*sy0) + xofs[dx+3]; for( k = 0; k < xofs_count; k++ )
_S[4] = (const uchar*)(src.data + src.step*sy0) + xofs[dx+4];
_S[5] = (const uchar*)(src.data + src.step*sy0) + xofs[dx+5];
_S[6] = (const uchar*)(src.data + src.step*sy0) + xofs[dx+6];
_S[7] = (const uchar*)(src.data + src.step*sy0) + xofs[dx+7];
for( k = 0; k < area; k++ )
{ {
int ofsk = ofs[k]; int sxn = xofs[k].si;
__m128i _temp = _mm_set_epi32(_S[3][ofsk],_S[2][ofsk],_S[1][ofsk],_S[0][ofsk]); int dxn = xofs[k].di;
_sum = _mm_add_epi32(_sum, _temp); WT alpha = xofs[k].alpha;
WT t0 = buf[dxn] + S[sxn]*alpha;
__m128i _temp1 = _mm_set_epi32(_S[7][ofsk],_S[6][ofsk],_S[5][ofsk],_S[4][ofsk]); WT t1 = buf[dxn+1] + S[sxn+1]*alpha;
_sum1 = _mm_add_epi32(_sum1, _temp1); buf[dxn] = t0; buf[dxn+1] = t1;
} }
else if( cn == 3 )
for( k = 0; k < xofs_count; k++ )
{
int sxn = xofs[k].si;
int dxn = xofs[k].di;
WT alpha = xofs[k].alpha;
__m128i _tempSum = _mm_cvtps_epi32(_mm_mul_ps(_mm_cvtepi32_ps(_sum), _scale)); WT t0 = buf[dxn] + S[sxn]*alpha;
__m128i _tempSum1 = _mm_cvtps_epi32(_mm_mul_ps(_mm_cvtepi32_ps(_sum1), _scale)); WT t1 = buf[dxn+1] + S[sxn+1]*alpha;
WT t2 = buf[dxn+2] + S[sxn+2]*alpha;
_tempSum = _mm_packs_epi32(_tempSum, _tempSum1); buf[dxn] = t0; buf[dxn+1] = t1; buf[dxn+2] = t2;
_tempSum = _mm_min_epi16(_ucMAXs, _tempSum);
_tempSum = _mm_packus_epi16(_tempSum, _tempSum);
_mm_storel_epi64((__m128i*)(D+dx),_tempSum);
}
} }
#endif else
for( k = 0; k < xofs_count; k++ )
for(; dx < w; dx++ )
{ {
const uchar* S = (const uchar*)(src.data + src.step*sy0) + xofs[dx]; int sxn = xofs[k].si;
int sum = 0; int dxn = xofs[k].di;
k=0; WT alpha = xofs[k].alpha;
#if CV_ENABLE_UNROLLED WT t0 = buf[dxn] + S[sxn]*alpha;
for( ; k <= area - 4; k += 4 ) WT t1 = buf[dxn+1] + S[sxn+1]*alpha;
sum += S[ofs[k]] + S[ofs[k+1]] + S[ofs[k+2]] + S[ofs[k+3]];
#endif
for( ; k < area; k++ ) buf[dxn] = t0; buf[dxn+1] = t1;
sum += S[ofs[k]];
t0 = buf[dxn+2] + S[sxn+2]*alpha;
t1 = buf[dxn+3] + S[sxn+3]*alpha;
D[dx] = saturate_cast<uchar>(sum*scale); buf[dxn+2] = t0; buf[dxn+3] = t1;
} }
for( ; dx < dsize.width; dx++ ) if( (cur_dy + 1)*scale_y <= sy + 1 || sy == ssize.height - 1 )
{ {
int sum = 0; WT beta = std::max(sy + 1 - (cur_dy + 1) * scale_y, (WT)0);
int count = 0, sx0 = xofs[dx]; T* D = (T*)(dst.data + dst.step*cur_dy);
if( sx0 >= ssize.width ) if( fabs(beta) < 1e-3 )
D[dx] = 0;
for( int sy = 0; sy < scale_y; sy++ )
{ {
if( sy0 + sy >= ssize.height ) if(cur_dy >= dsize.height)
break; return;
const uchar* S = (const uchar*)(src.data + src.step*(sy0 + sy)) + sx0; for( dx = 0; dx < dsize.width; dx++ )
int sx = 0; D[dx] = saturate_cast<T>((sum[dx] + buf[dx]) / min(scale_y, src.rows - cur_dy * scale_y));
for( ; sx < scale_x*cn; sx += cn ) memset(sum, 0, num);
}
else
{ {
if( sx0 + sx >= ssize.width ) WT beta1 = 1 - beta;
break; for( dx = 0; dx < dsize.width; dx++ )
sum += S[sx]; {
count++; D[dx] = saturate_cast<T>((sum[dx] + buf[dx] * beta1)/ min(scale_y, src.rows - cur_dy * scale_y));
sum[dx] = buf[dx] * beta;
} }
} }
cur_dy++;
D[dx] = saturate_cast<uchar>((float)sum/count);
} }
else
{
for( dx = 0; dx <= dsize.width - 2; dx += 2 )
{
WT t0 = sum[dx] + buf[dx];
WT t1 = sum[dx+1] + buf[dx+1];
sum[dx] = t0; sum[dx+1] = t1;
} }
for( ; dx < dsize.width; dx++ )
sum[dx] += buf[dx];
}
}
}
private:
const Mat src;
Mat dst;
const DecimateAlpha* xofs;
const int xofs_count;
const double scale_y_;
#ifdef HAVE_TBB
const int *yofs, *cur_dy_ofs;
#endif
};
// Area resize entry point: builds a resizeArea_Invoker and runs it over all
// source rows through parallel_for_.
//
// When HAVE_TBB is defined, two per-source-row tables are precomputed and
// passed to the invoker, which reads them at the first row of its range:
//   cur_dy_ofs[sy] - value of the destination-row counter when source row sy
//                    is reached;
//   yofs[sy]       - source row recorded as the restart point for row sy
//                    (the row following the most recent row that completed a
//                    destination row with a near-zero beta).
template<typename T, typename WT>
static void resizeArea_( const Mat& src, Mat& dst, const DecimateAlpha* xofs, int xofs_count, double scale_y_)
{
#ifdef HAVE_TBB
Size ssize = src.size(), dsize = dst.size();
// One allocation holds both tables: yofs in the first half, cur_dy_ofs in the second.
AutoBuffer<int> _yofs(2 * ssize.height);
int *yofs = _yofs, *cur_dy_ofs = _yofs + ssize.height;
int index = 0, cur_dy = 0, sy;
// Walk the source rows, advancing cur_dy with the same conditions the
// invoker uses, without touching any pixel data.
for( sy = 0; sy < ssize.height; sy++ )
{
bool reset = false;
cur_dy_ofs[sy] = cur_dy;
// Source row sy completes destination row cur_dy (or it is the last source row).
if( (cur_dy + 1)*scale_y_ <= sy + 1 || sy == ssize.height - 1 )
{
WT beta = std::max(sy + 1 - (cur_dy+1)*scale_y_, 0.);
// beta close to zero: the destination-row boundary coincides with the
// source-row boundary, so the next source row becomes a new restart point.
if( fabs(beta) < 1e-3 )
{
// NOTE(review): this break leaves yofs[sy] and all later entries of both
// tables unwritten - confirm the invoker can never read them.
if(cur_dy >= dsize.height)
break;
reset = true;
}
cur_dy++;
}
yofs[sy] = index;
if (reset)
index = sy + 1;
}
#endif
Range range(0, src.rows);
resizeArea_Invoker<T, WT> invoker(src, dst, xofs, xofs_count, scale_y_
#ifdef HAVE_TBB
, yofs, cur_dy_ofs
#endif
);
parallel_for_(range, invoker);
} }
...@@ -1457,10 +1652,12 @@ typedef void (*ResizeAreaFastFunc)( const Mat& src, Mat& dst, ...@@ -1457,10 +1652,12 @@ typedef void (*ResizeAreaFastFunc)( const Mat& src, Mat& dst,
int scale_x, int scale_y ); int scale_x, int scale_y );
typedef void (*ResizeAreaFunc)( const Mat& src, Mat& dst, typedef void (*ResizeAreaFunc)( const Mat& src, Mat& dst,
const DecimateAlpha* xofs, int xofs_count, double scale_y_); const DecimateAlpha* xofs, int xofs_count,
double scale_y_);
} }
////////////////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////////////////
void cv::resize( InputArray _src, OutputArray _dst, Size dsize, void cv::resize( InputArray _src, OutputArray _dst, Size dsize,
...@@ -1553,30 +1750,33 @@ void cv::resize( InputArray _src, OutputArray _dst, Size dsize, ...@@ -1553,30 +1750,33 @@ void cv::resize( InputArray _src, OutputArray _dst, Size dsize,
static ResizeAreaFastFunc areafast_tab[] = static ResizeAreaFastFunc areafast_tab[] =
{ {
resizeAreaFast_8u, 0, resizeAreaFast_<uchar, int, ResizeAreaFast_2x2_8u<uchar, int> >,
resizeAreaFast_<ushort, float>, 0,
resizeAreaFast_<short, float>, resizeAreaFast_<ushort, float, ResizeAreaFastNoVec<ushort, float> >,
resizeAreaFast_<short, float, ResizeAreaFastNoVec<short, float> >,
0, 0,
resizeAreaFast_<float, float>, resizeAreaFast_<float, float, ResizeAreaFastNoVec<float, float> >,
resizeAreaFast_<double, double>, resizeAreaFast_<double, double, ResizeAreaFastNoVec<double, double> >,
0 0
}; };
static ResizeAreaFunc area_tab[] = static ResizeAreaFunc area_tab[] =
{ {
resizeArea_<uchar, float>, 0, resizeArea_<ushort, float>, resizeArea_<short, float>, resizeArea_<uchar, float>, 0, resizeArea_<ushort, float>,
0, resizeArea_<float, float>, resizeArea_<double, double>, 0 resizeArea_<short, float>, 0, resizeArea_<float, float>,
resizeArea_<double, double>, 0
}; };
Mat src = _src.getMat(); Mat src = _src.getMat();
Size ssize = src.size(); Size ssize = src.size();
CV_Assert( ssize.area() > 0 ); CV_Assert( ssize.area() > 0 );
CV_Assert( !(dsize == Size()) || (inv_scale_x > 0 && inv_scale_y > 0) ); CV_Assert( dsize.area() || (inv_scale_x > 0 && inv_scale_y > 0) );
if( dsize == Size() ) if( !dsize.area() )
{ {
dsize = Size(saturate_cast<int>(src.cols*inv_scale_x), dsize = Size(saturate_cast<int>(src.cols*inv_scale_x),
saturate_cast<int>(src.rows*inv_scale_y)); saturate_cast<int>(src.rows*inv_scale_y));
CV_Assert( dsize.area() );
} }
else else
{ {
...@@ -1602,15 +1802,24 @@ void cv::resize( InputArray _src, OutputArray _dst, Size dsize, ...@@ -1602,15 +1802,24 @@ void cv::resize( InputArray _src, OutputArray _dst, Size dsize,
return; return;
} }
// true "area" interpolation is only implemented for the case (scale_x <= 1 && scale_y <= 1).
// In other cases it is emulated using some variant of bilinear interpolation
if( interpolation == INTER_AREA && scale_x >= 1 && scale_y >= 1 )
{ {
int iscale_x = saturate_cast<int>(scale_x); int iscale_x = saturate_cast<int>(scale_x);
int iscale_y = saturate_cast<int>(scale_y); int iscale_y = saturate_cast<int>(scale_y);
if( std::abs(scale_x - iscale_x) < DBL_EPSILON && bool is_area_fast = std::abs(scale_x - iscale_x) < DBL_EPSILON &&
std::abs(scale_y - iscale_y) < DBL_EPSILON ) std::abs(scale_y - iscale_y) < DBL_EPSILON;
// in case of scale_x && scale_y is equal to 2
// INTER_AREA (fast) also is equal to INTER_LINEAR
if ( interpolation == INTER_LINEAR &&
scale_x >= 1 && scale_y >= 1 && is_area_fast)
interpolation = INTER_AREA;
// true "area" interpolation is only implemented for the case (scale_x <= 1 && scale_y <= 1).
// In other cases it is emulated using some variant of bilinear interpolation
if( interpolation == INTER_AREA && scale_x >= 1 && scale_y >= 1 )
{
if( is_area_fast )
{ {
int area = iscale_x*iscale_y; int area = iscale_x*iscale_y;
size_t srcstep = src.step / src.elemSize1(); size_t srcstep = src.step / src.elemSize1();
...@@ -1626,9 +1835,10 @@ void cv::resize( InputArray _src, OutputArray _dst, Size dsize, ...@@ -1626,9 +1835,10 @@ void cv::resize( InputArray _src, OutputArray _dst, Size dsize,
for( dx = 0; dx < dsize.width; dx++ ) for( dx = 0; dx < dsize.width; dx++ )
{ {
sx = dx*iscale_x*cn; int j = dx * cn;
sx = iscale_x * j;
for( k = 0; k < cn; k++ ) for( k = 0; k < cn; k++ )
xofs[dx*cn + k] = sx + k; xofs[j + k] = sx + k;
} }
func( src, dst, ofs, xofs, iscale_x, iscale_y ); func( src, dst, ofs, xofs, iscale_x, iscale_y );
...@@ -1643,7 +1853,8 @@ void cv::resize( InputArray _src, OutputArray _dst, Size dsize, ...@@ -1643,7 +1853,8 @@ void cv::resize( InputArray _src, OutputArray _dst, Size dsize,
for( dx = 0, k = 0; dx < dsize.width; dx++ ) for( dx = 0, k = 0; dx < dsize.width; dx++ )
{ {
double fsx1 = dx*scale_x, fsx2 = fsx1 + scale_x; double fsx1 = dx*scale_x;
double fsx2 = fsx1 + scale_x;
int sx1 = cvCeil(fsx1), sx2 = cvFloor(fsx2); int sx1 = cvCeil(fsx1), sx2 = cvFloor(fsx2);
sx1 = std::min(sx1, ssize.width-1); sx1 = std::min(sx1, ssize.width-1);
sx2 = std::min(sx2, ssize.width-1); sx2 = std::min(sx2, ssize.width-1);
...@@ -1672,9 +1883,11 @@ void cv::resize( InputArray _src, OutputArray _dst, Size dsize, ...@@ -1672,9 +1883,11 @@ void cv::resize( InputArray _src, OutputArray _dst, Size dsize,
xofs[k++].alpha = (float)(min(fsx2 - sx2, 1.) / min(scale_x, src.cols - fsx1)); xofs[k++].alpha = (float)(min(fsx2 - sx2, 1.) / min(scale_x, src.cols - fsx1));
} }
} }
func( src, dst, xofs, k ,scale_y);
func( src, dst, xofs, k, scale_y);
return; return;
} }
}
int xmin = 0, xmax = dsize.width, width = dsize.width*cn; int xmin = 0, xmax = dsize.width, width = dsize.width*cn;
bool area_mode = interpolation == INTER_AREA; bool area_mode = interpolation == INTER_AREA;
...@@ -2549,134 +2762,48 @@ typedef void (*RemapFunc)(const Mat& _src, Mat& _dst, const Mat& _xy, ...@@ -2549,134 +2762,48 @@ typedef void (*RemapFunc)(const Mat& _src, Mat& _dst, const Mat& _xy,
const Mat& _fxy, const void* _wtab, const Mat& _fxy, const void* _wtab,
int borderType, const Scalar& _borderValue); int borderType, const Scalar& _borderValue);
} class remapInvoker :
public ParallelLoopBody
void cv::remap( InputArray _src, OutputArray _dst,
InputArray _map1, InputArray _map2,
int interpolation, int borderType, const Scalar& borderValue )
{ {
static RemapNNFunc nn_tab[] = public:
{ remapInvoker(const Mat& _src, Mat _dst, const Mat& _map1, const Mat& _map2, const Mat *_m1,
remapNearest<uchar>, remapNearest<schar>, remapNearest<ushort>, remapNearest<short>, const Mat *_m2, int _interpolation, int _borderType, const Scalar &_borderValue,
remapNearest<int>, remapNearest<float>, remapNearest<double>, 0 int _planar_input, RemapNNFunc _nnfunc, RemapFunc _ifunc, const void *_ctab) :
}; ParallelLoopBody(), src(_src), dst(_dst), map1(_map1), map2(_map2), m1(_m1), m2(_m2),
interpolation(_interpolation), borderType(_borderType), borderValue(_borderValue),
static RemapFunc linear_tab[] = planar_input(_planar_input), nnfunc(_nnfunc), ifunc(_ifunc), ctab(_ctab)
{
remapBilinear<FixedPtCast<int, uchar, INTER_REMAP_COEF_BITS>, RemapVec_8u, short>, 0,
remapBilinear<Cast<float, ushort>, RemapNoVec, float>,
remapBilinear<Cast<float, short>, RemapNoVec, float>, 0,
remapBilinear<Cast<float, float>, RemapNoVec, float>,
remapBilinear<Cast<double, double>, RemapNoVec, float>, 0
};
static RemapFunc cubic_tab[] =
{
remapBicubic<FixedPtCast<int, uchar, INTER_REMAP_COEF_BITS>, short, INTER_REMAP_COEF_SCALE>, 0,
remapBicubic<Cast<float, ushort>, float, 1>,
remapBicubic<Cast<float, short>, float, 1>, 0,
remapBicubic<Cast<float, float>, float, 1>,
remapBicubic<Cast<double, double>, float, 1>, 0
};
static RemapFunc lanczos4_tab[] =
{
remapLanczos4<FixedPtCast<int, uchar, INTER_REMAP_COEF_BITS>, short, INTER_REMAP_COEF_SCALE>, 0,
remapLanczos4<Cast<float, ushort>, float, 1>,
remapLanczos4<Cast<float, short>, float, 1>, 0,
remapLanczos4<Cast<float, float>, float, 1>,
remapLanczos4<Cast<double, double>, float, 1>, 0
};
Mat src = _src.getMat(), map1 = _map1.getMat(), map2 = _map2.getMat();
CV_Assert( (!map2.data || map2.size() == map1.size()));
_dst.create( map1.size(), src.type() );
Mat dst = _dst.getMat();
if( dst.data == src.data )
src = src.clone();
int depth = src.depth(), map_depth = map1.depth();
RemapNNFunc nnfunc = 0;
RemapFunc ifunc = 0;
const void* ctab = 0;
bool fixpt = depth == CV_8U;
bool planar_input = false;
if( interpolation == INTER_NEAREST )
{ {
nnfunc = nn_tab[depth];
CV_Assert( nnfunc != 0 );
if( map1.type() == CV_16SC2 && !map2.data ) // the data is already in the right format
{
nnfunc( src, dst, map1, borderType, borderValue );
return;
}
}
else
{
if( interpolation == INTER_AREA )
interpolation = INTER_LINEAR;
if( interpolation == INTER_LINEAR )
ifunc = linear_tab[depth];
else if( interpolation == INTER_CUBIC )
ifunc = cubic_tab[depth];
else if( interpolation == INTER_LANCZOS4 )
ifunc = lanczos4_tab[depth];
else
CV_Error( CV_StsBadArg, "Unknown interpolation method" );
CV_Assert( ifunc != 0 );
ctab = initInterTab2D( interpolation, fixpt );
} }
const Mat *m1 = &map1, *m2 = &map2; virtual void operator() (const Range& range) const
if( (map1.type() == CV_16SC2 && (map2.type() == CV_16UC1 || map2.type() == CV_16SC1)) ||
(map2.type() == CV_16SC2 && (map1.type() == CV_16UC1 || map1.type() == CV_16SC1)) )
{
if( map1.type() != CV_16SC2 )
std::swap(m1, m2);
if( ifunc )
{ {
ifunc( src, dst, *m1, *m2, ctab, borderType, borderValue );
return;
}
}
else
{
CV_Assert( (map1.type() == CV_32FC2 && !map2.data) ||
(map1.type() == CV_32FC1 && map2.type() == CV_32FC1) );
planar_input = map1.channels() == 1;
}
int x, y, x1, y1; int x, y, x1, y1;
const int buf_size = 1 << 14; const int buf_size = 1 << 14;
int brows0 = std::min(128, dst.rows); int brows0 = std::min(128, dst.rows), map_depth = map1.depth();
int bcols0 = std::min(buf_size/brows0, dst.cols); int bcols0 = std::min(buf_size/brows0, dst.cols);
brows0 = std::min(buf_size/bcols0, dst.rows); brows0 = std::min(buf_size/bcols0, dst.rows);
#if CV_SSE2 #if CV_SSE2
bool useSIMD = checkHardwareSupport(CV_CPU_SSE2); bool useSIMD = checkHardwareSupport(CV_CPU_SSE2);
#endif #endif
Mat _bufxy(brows0, bcols0, CV_16SC2), _bufa; Mat _bufxy(brows0, bcols0, CV_16SC2), _bufa;
if( !nnfunc ) if( !nnfunc )
_bufa.create(brows0, bcols0, CV_16UC1); _bufa.create(brows0, bcols0, CV_16UC1);
for( y = 0; y < dst.rows; y += brows0 ) for( y = range.start; y < range.end; y += brows0 )
{ {
for( x = 0; x < dst.cols; x += bcols0 ) for( x = 0; x < dst.cols; x += bcols0 )
{ {
int brows = std::min(brows0, dst.rows - y); int brows = std::min(brows0, range.end - y);
int bcols = std::min(bcols0, dst.cols - x); int bcols = std::min(bcols0, dst.cols - x);
Mat dpart(dst, Rect(x, y, bcols, brows)); Mat dpart(dst, Rect(x, y, bcols, brows));
Mat bufxy(_bufxy, Rect(0, 0, bcols, brows)); Mat bufxy(_bufxy, Rect(0, 0, bcols, brows));
if( nnfunc ) if( nnfunc )
{ {
if( map_depth != CV_32F ) if( map1.type() == CV_16SC2 && !map2.data ) // the data is already in the right format
bufxy = map1(Rect(x, y, bcols, brows));
else if( map_depth != CV_32F )
{ {
for( y1 = 0; y1 < brows; y1++ ) for( y1 = 0; y1 < brows; y1++ )
{ {
...@@ -2693,7 +2820,7 @@ void cv::remap( InputArray _src, OutputArray _dst, ...@@ -2693,7 +2820,7 @@ void cv::remap( InputArray _src, OutputArray _dst,
} }
} }
else if( !planar_input ) else if( !planar_input )
map1(Rect(0,0,bcols,brows)).convertTo(bufxy, bufxy.depth()); map1(Rect(x, y, bcols, brows)).convertTo(bufxy, bufxy.depth());
else else
{ {
for( y1 = 0; y1 < brows; y1++ ) for( y1 = 0; y1 < brows; y1++ )
...@@ -2737,13 +2864,19 @@ void cv::remap( InputArray _src, OutputArray _dst, ...@@ -2737,13 +2864,19 @@ void cv::remap( InputArray _src, OutputArray _dst,
continue; continue;
} }
Mat bufa(_bufa, Rect(0,0,bcols, brows)); Mat bufa(_bufa, Rect(0, 0, bcols, brows));
for( y1 = 0; y1 < brows; y1++ ) for( y1 = 0; y1 < brows; y1++ )
{ {
short* XY = (short*)(bufxy.data + bufxy.step*y1); short* XY = (short*)(bufxy.data + bufxy.step*y1);
ushort* A = (ushort*)(bufa.data + bufa.step*y1); ushort* A = (ushort*)(bufa.data + bufa.step*y1);
if( planar_input ) if( (map1.type() == CV_16SC2 && (map2.type() == CV_16UC1 || map2.type() == CV_16SC1)) ||
(map2.type() == CV_16SC2 && (map1.type() == CV_16UC1 || map1.type() == CV_16SC1)) )
{
bufxy = m1->operator()(Rect(x, y, bcols, brows));
bufa = m2->operator()(Rect(x, y, bcols, brows));
}
else if( planar_input )
{ {
const float* sX = (const float*)(map1.data + map1.step*(y+y1)) + x; const float* sX = (const float*)(map1.data + map1.step*(y+y1)) + x;
const float* sY = (const float*)(map2.data + map2.step*(y+y1)) + x; const float* sY = (const float*)(map2.data + map2.step*(y+y1)) + x;
...@@ -2815,6 +2948,118 @@ void cv::remap( InputArray _src, OutputArray _dst, ...@@ -2815,6 +2948,118 @@ void cv::remap( InputArray _src, OutputArray _dst,
ifunc(src, dpart, bufxy, bufa, ctab, borderType, borderValue); ifunc(src, dpart, bufxy, bufa, ctab, borderType, borderValue);
} }
} }
}
private:
const Mat src;
Mat dst;
const Mat map1, map2, *m1, *m2;
int interpolation, borderType;
const Scalar borderValue;
int planar_input;
RemapNNFunc nnfunc;
RemapFunc ifunc;
const void *ctab;
};
}
// Entry point for remap: validates the coordinate maps, allocates the
// destination (size of map1, type of src), picks the kernel that matches the
// source depth and the requested interpolation, and then hands the row range
// [0, dst.rows) to a remapInvoker through parallel_for_.
//
// _src          - source image.
// _dst          - destination; created here as map1.size() / src.type().
// _map1, _map2  - coordinate maps. Accepted combinations (enforced below):
//                 CV_16SC2 paired with CV_16UC1 or CV_16SC1 (in either order),
//                 CV_32FC2 or CV_16SC2 with no second map, or
//                 CV_32FC1 + CV_32FC1.
// interpolation - INTER_NEAREST, INTER_LINEAR, INTER_CUBIC or INTER_LANCZOS4;
//                 INTER_AREA is treated as INTER_LINEAR. Anything else raises
//                 CV_StsBadArg.
// borderType, borderValue - forwarded unchanged to the invoker.
void cv::remap( InputArray _src, OutputArray _dst,
InputArray _map1, InputArray _map2,
int interpolation, int borderType, const Scalar& borderValue )
{
// Kernel tables, indexed by the source depth (see nn_tab[depth] etc. below).
// A 0 entry marks a depth without an implementation; it is rejected by the
// CV_Assert on the selected function pointer.
static RemapNNFunc nn_tab[] =
{
remapNearest<uchar>, remapNearest<schar>, remapNearest<ushort>, remapNearest<short>,
remapNearest<int>, remapNearest<float>, remapNearest<double>, 0
};
static RemapFunc linear_tab[] =
{
remapBilinear<FixedPtCast<int, uchar, INTER_REMAP_COEF_BITS>, RemapVec_8u, short>, 0,
remapBilinear<Cast<float, ushort>, RemapNoVec, float>,
remapBilinear<Cast<float, short>, RemapNoVec, float>, 0,
remapBilinear<Cast<float, float>, RemapNoVec, float>,
remapBilinear<Cast<double, double>, RemapNoVec, float>, 0
};
static RemapFunc cubic_tab[] =
{
remapBicubic<FixedPtCast<int, uchar, INTER_REMAP_COEF_BITS>, short, INTER_REMAP_COEF_SCALE>, 0,
remapBicubic<Cast<float, ushort>, float, 1>,
remapBicubic<Cast<float, short>, float, 1>, 0,
remapBicubic<Cast<float, float>, float, 1>,
remapBicubic<Cast<double, double>, float, 1>, 0
};
static RemapFunc lanczos4_tab[] =
{
remapLanczos4<FixedPtCast<int, uchar, INTER_REMAP_COEF_BITS>, short, INTER_REMAP_COEF_SCALE>, 0,
remapLanczos4<Cast<float, ushort>, float, 1>,
remapLanczos4<Cast<float, short>, float, 1>, 0,
remapLanczos4<Cast<float, float>, float, 1>,
remapLanczos4<Cast<double, double>, float, 1>, 0
};
Mat src = _src.getMat(), map1 = _map1.getMat(), map2 = _map2.getMat();
// map1 must be non-empty; map2 is optional but, when present, must have the
// same size as map1.
CV_Assert( map1.size().area() > 0 );
CV_Assert( !map2.data || (map2.size() == map1.size()));
_dst.create( map1.size(), src.type() );
Mat dst = _dst.getMat();
// Source and destination share the same buffer: continue with a private copy
// of the source.
if( dst.data == src.data )
src = src.clone();
int depth = src.depth();
// Exactly one of nnfunc / ifunc ends up non-null, depending on the
// interpolation mode; ctab is only set for the interpolating kernels.
RemapNNFunc nnfunc = 0;
RemapFunc ifunc = 0;
const void* ctab = 0;
bool fixpt = depth == CV_8U;
bool planar_input = false;
if( interpolation == INTER_NEAREST )
{
nnfunc = nn_tab[depth];
CV_Assert( nnfunc != 0 );
}
else
{
// INTER_AREA has no dedicated kernel here and falls back to bilinear.
if( interpolation == INTER_AREA )
interpolation = INTER_LINEAR;
if( interpolation == INTER_LINEAR )
ifunc = linear_tab[depth];
else if( interpolation == INTER_CUBIC )
ifunc = cubic_tab[depth];
else if( interpolation == INTER_LANCZOS4 )
ifunc = lanczos4_tab[depth];
else
CV_Error( CV_StsBadArg, "Unknown interpolation method" );
CV_Assert( ifunc != 0 );
ctab = initInterTab2D( interpolation, fixpt );
}
// m1 / m2 are views of the two maps ordered so that, for the 16-bit pair
// case, m1 is the CV_16SC2 map and m2 the CV_16UC1 / CV_16SC1 one.
// NOTE(review): they point at this function's local Mat headers, so the
// invoker may only use them while this call is still running.
const Mat *m1 = &map1, *m2 = &map2;
if( (map1.type() == CV_16SC2 && (map2.type() == CV_16UC1 || map2.type() == CV_16SC1)) ||
(map2.type() == CV_16SC2 && (map1.type() == CV_16UC1 || map1.type() == CV_16SC1)) )
{
if( map1.type() != CV_16SC2 )
std::swap(m1, m2);
}
else
{
// Remaining accepted layouts: one two-channel map (CV_32FC2 or CV_16SC2)
// without a second map, or two single-channel CV_32FC1 maps.
CV_Assert( ((map1.type() == CV_32FC2 || map1.type() == CV_16SC2) && !map2.data) ||
(map1.type() == CV_32FC1 && map2.type() == CV_32FC1) );
// Single-channel map1 means the x and y coordinates come from separate maps.
planar_input = map1.channels() == 1;
}
// Process every destination row; parallel_for_ hands sub-ranges of it to
// remapInvoker::operator().
Range range(0, dst.rows);
remapInvoker invoker(src, dst, map1, map2, m1, m2, interpolation,
borderType, borderValue, planar_input, nnfunc, ifunc,
ctab);
parallel_for_(range, invoker);
} }
...@@ -2957,71 +3202,42 @@ void cv::convertMaps( InputArray _map1, InputArray _map2, ...@@ -2957,71 +3202,42 @@ void cv::convertMaps( InputArray _map1, InputArray _map2,
} }
void cv::warpAffine( InputArray _src, OutputArray _dst, namespace cv
InputArray _M0, Size dsize,
int flags, int borderType, const Scalar& borderValue )
{ {
Mat src = _src.getMat(), M0 = _M0.getMat();
_dst.create( dsize.area() == 0 ? src.size() : dsize, src.type() );
Mat dst = _dst.getMat();
CV_Assert( src.cols > 0 && src.rows > 0 );
if( dst.data == src.data )
src = src.clone();
const int BLOCK_SZ = 64;
short XY[BLOCK_SZ*BLOCK_SZ*2], A[BLOCK_SZ*BLOCK_SZ];
double M[6];
Mat matM(2, 3, CV_64F, M);
int interpolation = flags & INTER_MAX;
if( interpolation == INTER_AREA )
interpolation = INTER_LINEAR;
CV_Assert( (M0.type() == CV_32F || M0.type() == CV_64F) && M0.rows == 2 && M0.cols == 3 ); class warpAffineInvoker :
M0.convertTo(matM, matM.type()); public ParallelLoopBody
{
#ifdef HAVE_TEGRA_OPTIMIZATION public:
if( tegra::warpAffine(src, dst, M, flags, borderType, borderValue) ) warpAffineInvoker(const Mat &_src, Mat &_dst, int _interpolation, int _borderType,
return; const Scalar &_borderValue, int *_adelta, int *_bdelta, double *_M) :
#endif ParallelLoopBody(), src(_src), dst(_dst), interpolation(_interpolation),
borderType(_borderType), borderValue(_borderValue), adelta(_adelta), bdelta(_bdelta),
if( !(flags & WARP_INVERSE_MAP) ) M(_M)
{ {
double D = M[0]*M[4] - M[1]*M[3];
D = D != 0 ? 1./D : 0;
double A11 = M[4]*D, A22=M[0]*D;
M[0] = A11; M[1] *= -D;
M[3] *= -D; M[4] = A22;
double b1 = -M[0]*M[2] - M[1]*M[5];
double b2 = -M[3]*M[2] - M[4]*M[5];
M[2] = b1; M[5] = b2;
} }
int x, y, x1, y1, width = dst.cols, height = dst.rows; virtual void operator() (const Range& range) const
AutoBuffer<int> _abdelta(width*2); {
int* adelta = &_abdelta[0], *bdelta = adelta + width; const int BLOCK_SZ = 64;
short XY[BLOCK_SZ*BLOCK_SZ*2], A[BLOCK_SZ*BLOCK_SZ];
const int AB_BITS = MAX(10, (int)INTER_BITS); const int AB_BITS = MAX(10, (int)INTER_BITS);
const int AB_SCALE = 1 << AB_BITS; const int AB_SCALE = 1 << AB_BITS;
int round_delta = interpolation == INTER_NEAREST ? AB_SCALE/2 : AB_SCALE/INTER_TAB_SIZE/2; int round_delta = interpolation == INTER_NEAREST ? AB_SCALE/2 : AB_SCALE/INTER_TAB_SIZE/2, x, y, x1, y1;
#if CV_SSE2 #if CV_SSE2
bool useSIMD = checkHardwareSupport(CV_CPU_SSE2); bool useSIMD = checkHardwareSupport(CV_CPU_SSE2);
#endif #endif
for( x = 0; x < width; x++ )
{
adelta[x] = saturate_cast<int>(M[0]*x*AB_SCALE);
bdelta[x] = saturate_cast<int>(M[3]*x*AB_SCALE);
}
int bh0 = std::min(BLOCK_SZ/2, height); int bh0 = std::min(BLOCK_SZ/2, dst.rows);
int bw0 = std::min(BLOCK_SZ*BLOCK_SZ/bh0, width); int bw0 = std::min(BLOCK_SZ*BLOCK_SZ/bh0, dst.cols);
bh0 = std::min(BLOCK_SZ*BLOCK_SZ/bw0, height); bh0 = std::min(BLOCK_SZ*BLOCK_SZ/bw0, dst.rows);
for( y = 0; y < height; y += bh0 ) for( y = range.start; y < range.end; y += bh0 )
{ {
for( x = 0; x < width; x += bw0 ) for( x = 0; x < dst.cols; x += bw0 )
{ {
int bw = std::min( bw0, width - x); int bw = std::min( bw0, dst.cols - x);
int bh = std::min( bh0, height - y); int bh = std::min( bh0, range.end - y);
Mat _XY(bh, bw, CV_16SC2, XY), matA; Mat _XY(bh, bw, CV_16SC2, XY), matA;
Mat dpart(dst, Rect(x, y, bw, bh)); Mat dpart(dst, Rect(x, y, bw, bh));
...@@ -3099,51 +3315,107 @@ void cv::warpAffine( InputArray _src, OutputArray _dst, ...@@ -3099,51 +3315,107 @@ void cv::warpAffine( InputArray _src, OutputArray _dst,
} }
} }
} }
}
private:
const Mat src;
Mat dst;
int interpolation, borderType;
const Scalar borderValue;
int *adelta, *bdelta;
double *M;
};
} }
void cv::warpPerspective( InputArray _src, OutputArray _dst, InputArray _M0, void cv::warpAffine( InputArray _src, OutputArray _dst,
Size dsize, int flags, int borderType, const Scalar& borderValue ) InputArray _M0, Size dsize,
int flags, int borderType, const Scalar& borderValue )
{ {
Mat src = _src.getMat(), M0 = _M0.getMat(); Mat src = _src.getMat(), M0 = _M0.getMat();
_dst.create( dsize.area() == 0 ? src.size() : dsize, src.type() ); _dst.create( dsize.area() == 0 ? src.size() : dsize, src.type() );
Mat dst = _dst.getMat(); Mat dst = _dst.getMat();
CV_Assert( src.cols > 0 && src.rows > 0 ); CV_Assert( src.cols > 0 && src.rows > 0 );
if( dst.data == src.data ) if( dst.data == src.data )
src = src.clone(); src = src.clone();
const int BLOCK_SZ = 32; double M[6];
short XY[BLOCK_SZ*BLOCK_SZ*2], A[BLOCK_SZ*BLOCK_SZ]; Mat matM(2, 3, CV_64F, M);
double M[9];
Mat matM(3, 3, CV_64F, M);
int interpolation = flags & INTER_MAX; int interpolation = flags & INTER_MAX;
if( interpolation == INTER_AREA ) if( interpolation == INTER_AREA )
interpolation = INTER_LINEAR; interpolation = INTER_LINEAR;
CV_Assert( (M0.type() == CV_32F || M0.type() == CV_64F) && M0.rows == 3 && M0.cols == 3 ); CV_Assert( (M0.type() == CV_32F || M0.type() == CV_64F) && M0.rows == 2 && M0.cols == 3 );
M0.convertTo(matM, matM.type()); M0.convertTo(matM, matM.type());
#ifdef HAVE_TEGRA_OPTIMIZATION #ifdef HAVE_TEGRA_OPTIMIZATION
if( tegra::warpPerspective(src, dst, M, flags, borderType, borderValue) ) if( tegra::warpAffine(src, dst, M, flags, borderType, borderValue) )
return; return;
#endif #endif
if( !(flags & WARP_INVERSE_MAP) ) if( !(flags & WARP_INVERSE_MAP) )
invert(matM, matM); {
double D = M[0]*M[4] - M[1]*M[3];
D = D != 0 ? 1./D : 0;
double A11 = M[4]*D, A22=M[0]*D;
M[0] = A11; M[1] *= -D;
M[3] *= -D; M[4] = A22;
double b1 = -M[0]*M[2] - M[1]*M[5];
double b2 = -M[3]*M[2] - M[4]*M[5];
M[2] = b1; M[5] = b2;
}
int x;
AutoBuffer<int> _abdelta(dst.cols*2);
int* adelta = &_abdelta[0], *bdelta = adelta + dst.cols;
const int AB_BITS = MAX(10, (int)INTER_BITS);
const int AB_SCALE = 1 << AB_BITS;
for( x = 0; x < dst.cols; x++ )
{
adelta[x] = saturate_cast<int>(M[0]*x*AB_SCALE);
bdelta[x] = saturate_cast<int>(M[3]*x*AB_SCALE);
}
Range range(0, dst.rows);
warpAffineInvoker invoker(src, dst, interpolation, borderType,
borderValue, adelta, bdelta, M);
parallel_for_(range, invoker);
}
namespace cv
{
class warpPerspectiveInvoker :
public ParallelLoopBody
{
public:
warpPerspectiveInvoker(const Mat &_src, Mat &_dst, double *_M, int _interpolation,
int _borderType, const Scalar &_borderValue) :
ParallelLoopBody(), src(_src), dst(_dst), M(_M), interpolation(_interpolation),
borderType(_borderType), borderValue(_borderValue)
{
}
virtual void operator() (const Range& range) const
{
const int BLOCK_SZ = 32;
short XY[BLOCK_SZ*BLOCK_SZ*2], A[BLOCK_SZ*BLOCK_SZ];
int x, y, x1, y1, width = dst.cols, height = dst.rows; int x, y, x1, y1, width = dst.cols, height = dst.rows;
int bh0 = std::min(BLOCK_SZ/2, height); int bh0 = std::min(BLOCK_SZ/2, height);
int bw0 = std::min(BLOCK_SZ*BLOCK_SZ/bh0, width); int bw0 = std::min(BLOCK_SZ*BLOCK_SZ/bh0, width);
bh0 = std::min(BLOCK_SZ*BLOCK_SZ/bw0, height); bh0 = std::min(BLOCK_SZ*BLOCK_SZ/bw0, height);
for( y = 0; y < height; y += bh0 ) for( y = range.start; y < range.end; y += bh0 )
{ {
for( x = 0; x < width; x += bw0 ) for( x = 0; x < width; x += bw0 )
{ {
int bw = std::min( bw0, width - x); int bw = std::min( bw0, width - x);
int bh = std::min( bh0, height - y); int bh = std::min( bh0, range.end - y); // height
Mat _XY(bh, bw, CV_16SC2, XY), matA; Mat _XY(bh, bw, CV_16SC2, XY), matA;
Mat dpart(dst, Rect(x, y, bw, bh)); Mat dpart(dst, Rect(x, y, bw, bh));
...@@ -3197,6 +3469,49 @@ void cv::warpPerspective( InputArray _src, OutputArray _dst, InputArray _M0, ...@@ -3197,6 +3469,49 @@ void cv::warpPerspective( InputArray _src, OutputArray _dst, InputArray _M0,
} }
} }
} }
}
private:
const Mat src;
Mat dst;
double* M;
int interpolation, borderType;
const Scalar borderValue;
};
}
// Entry point for the perspective warp: allocates the destination, converts
// the 3x3 transform to double, inverts it unless the caller already supplied
// the inverse mapping, and then runs warpPerspectiveInvoker over the row range
// [0, dst.rows) through parallel_for_.
//
// _src   - source image; must have at least one row and one column.
// _dst   - destination; created with size dsize, or with the source size when
//          dsize has zero area, and with the source type.
// _M0    - 3x3 transform of type CV_32F or CV_64F.
// flags  - interpolation mode in the bits selected by INTER_MAX, optionally
//          combined with WARP_INVERSE_MAP.
// borderType, borderValue - forwarded unchanged to the invoker.
void cv::warpPerspective( InputArray _src, OutputArray _dst, InputArray _M0,
Size dsize, int flags, int borderType, const Scalar& borderValue )
{
Mat src = _src.getMat(), M0 = _M0.getMat();
_dst.create( dsize.area() == 0 ? src.size() : dsize, src.type() );
Mat dst = _dst.getMat();
CV_Assert( src.cols > 0 && src.rows > 0 );
// Source and destination share the same buffer: continue with a private copy
// of the source.
if( dst.data == src.data )
src = src.clone();
// matM is a Mat header over the local array M, so the matrix operations
// below are meant to leave their result in M, which is what the invoker
// receives.
double M[9];
Mat matM(3, 3, CV_64F, M);
int interpolation = flags & INTER_MAX;
// INTER_AREA is handled as INTER_LINEAR.
if( interpolation == INTER_AREA )
interpolation = INTER_LINEAR;
CV_Assert( (M0.type() == CV_32F || M0.type() == CV_64F) && M0.rows == 3 && M0.cols == 3 );
M0.convertTo(matM, matM.type());
#ifdef HAVE_TEGRA_OPTIMIZATION
// Platform-specific implementation; when it reports success nothing else
// needs to be done.
if( tegra::warpPerspective(src, dst, M, flags, borderType, borderValue) )
return;
#endif
// Without WARP_INVERSE_MAP the supplied matrix is inverted in place before
// it is passed on.
if( !(flags & WARP_INVERSE_MAP) )
invert(matM, matM);
// Process every destination row; parallel_for_ hands sub-ranges of it to
// warpPerspectiveInvoker::operator().
Range range(0, dst.rows);
warpPerspectiveInvoker invoker(src, dst, M, interpolation, borderType, borderValue);
parallel_for_(range, invoker);
} }
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment