Commit 92795ba4 authored by Ilya Lavrenov's avatar Ilya Lavrenov

parallel version of remap, resize, warpAffine, warpPerspective. Some optimization for 2x decimation in the resize algorithm
parent f2a02fef
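Every function touched by this commit follows the same pattern: the per-row loop body moves into a cv::ParallelLoopBody subclass and is dispatched through cv::parallel_for_, which splits the row range across the available threads. A minimal sketch of that pattern (illustrative only, not part of the commit; assumes the OpenCV 2.4-era core header):

    #include <opencv2/core/core.hpp>

    class RowCopyInvoker : public cv::ParallelLoopBody
    {
    public:
        RowCopyInvoker(const cv::Mat& src, cv::Mat& dst) : src_(src), dst_(dst) { }

        // called by parallel_for_ on disjoint sub-ranges of rows,
        // possibly from several threads concurrently
        virtual void operator() (const cv::Range& range) const
        {
            for( int y = range.start; y < range.end; y++ )
                src_.row(y).copyTo(dst_.row(y));
        }

    private:
        const cv::Mat src_;   // Mat headers are copied; pixel data is shared
        cv::Mat dst_;
    };

    void copyRowsParallel(const cv::Mat& src, cv::Mat& dst)
    {
        dst.create(src.size(), src.type());
        RowCopyInvoker invoker(src, dst);
        cv::parallel_for_(cv::Range(0, src.rows), invoker);
    }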
#include "perf_precomp.hpp"
using namespace std;
using namespace cv;
using namespace perf;
using namespace testing;
using std::tr1::make_tuple;
using std::tr1::get;
CV_ENUM(MatrixType, CV_16UC1, CV_16SC1, CV_32FC1)
CV_ENUM(MapType, CV_16SC2, CV_32FC1, CV_32FC2)
CV_ENUM(InterType, INTER_LINEAR, INTER_CUBIC, INTER_LANCZOS4, INTER_NEAREST)
typedef TestBaseWithParam< tr1::tuple<Size, MatrixType, MapType, InterType> > TestRemap;
PERF_TEST_P( TestRemap, Remap,
Combine(
Values( szVGA, sz1080p ),
ValuesIn( MatrixType::all() ),
ValuesIn( MapType::all() ),
ValuesIn( InterType::all() )
)
)
{
Size sz;
int src_type, map1_type, inter_type;
sz = get<0>(GetParam());
src_type = get<1>(GetParam());
map1_type = get<2>(GetParam());
inter_type = get<3>(GetParam());
Mat src(sz, src_type);
Mat map1(sz, map1_type);
Mat dst(sz, src_type);
Mat map2(map1_type == CV_32FC1 ? sz : Size(), CV_32FC1);
RNG rng;
rng.fill(src, RNG::UNIFORM, 0, 256);
for (int j = 0; j < map1.rows; ++j)
for (int i = 0; i < map1.cols; ++i)
switch (map1_type)
{
case CV_32FC1:
map1.at<float>(j, i) = src.cols - i;
map2.at<float>(j, i) = j;
break;
case CV_32FC2:
map1.at<Vec2f>(j, i)[0] = src.cols - i;
map1.at<Vec2f>(j, i)[1] = j;
break;
case CV_16SC2:
map1.at<Vec2s>(j, i)[0] = src.cols - i;
map1.at<Vec2s>(j, i)[1] = j;
break;
default:
CV_Assert(0);
}
declare.in(src, WARMUP_RNG).out(dst).time(20);
TEST_CYCLE() remap(src, dst, map1, map2, inter_type);
SANITY_CHECK(dst);
}
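Side note: the CV_16SC2 map type exercised above is the packed fixed-point format that cv::convertMaps produces from float maps; a hedged sketch of generating it that way instead of by hand (mapx/mapy are assumed CV_32FC1 maps like the ones built above):

    Mat map1_fixed, map2_fixed;
    // map1_fixed holds integer coordinates (CV_16SC2),
    // map2_fixed the interpolation-table indices (CV_16UC1)
    convertMaps(mapx, mapy, map1_fixed, map2_fixed, CV_16SC2);
    remap(src, dst, map1_fixed, map2_fixed, INTER_LINEAR);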
@@ -59,11 +59,11 @@ PERF_TEST_P(MatInfo_Size_Size, resizeDownLinear,
typedef tr1::tuple<MatType, Size, int> MatInfo_Size_Scale_t;
typedef TestBaseWithParam<MatInfo_Size_Scale_t> MatInfo_Size_Scale;
PERF_TEST_P(MatInfo_Size_Scale, resizeAreaFast,
PERF_TEST_P(MatInfo_Size_Scale, ResizeAreaFast,
testing::Combine(
testing::Values(CV_8UC1, CV_8UC4),
testing::Values(szVGA, szqHD, sz720p, sz1080p),
testing::Values(2, 4)
testing::Values(2)
)
)
{
@@ -84,3 +84,31 @@ PERF_TEST_P(MatInfo_Size_Scale, resizeAreaFast,
//difference equal to 1 is allowed because of different possible rounding modes: round-to-nearest vs bankers' rounding
SANITY_CHECK(dst, 1);
}
typedef TestBaseWithParam<tr1::tuple<MatType, Size, double> > MatInfo_Size_Scale_Area;
PERF_TEST_P(MatInfo_Size_Scale_Area, ResizeArea,
testing::Combine(
testing::Values(CV_8UC1, CV_8UC4),
testing::Values(szVGA, szqHD, sz720p, sz1080p),
testing::Values(2.4, 3.4, 1.3)
)
)
{
int matType = get<0>(GetParam());
Size from = get<1>(GetParam());
double scale = get<2>(GetParam());
cv::Mat src(from, matType);
Size to(cvRound(from.width * scale), cvRound(from.height * scale));
cv::Mat dst(to, matType);
declare.in(src, WARMUP_RNG).out(dst);
TEST_CYCLE() resize(src, dst, dst.size(), 0, 0, INTER_AREA);
//difference equal to 1 is allowed because of different possible rounding modes: round-to-nearest vs bankers' rounding
SANITY_CHECK(dst, 1);
}
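For reference, which INTER_AREA implementation runs depends on the scale: exact integer decimation factors dispatch to the fast box-filter path (resizeAreaFast_), fractional factors to the generic weighted path (resizeArea_). Illustrative calls:

    Mat big(720, 1280, CV_8UC1), shrunk;
    // integer 2x decimation: fast path, now with the 2x2 specialization below
    resize(big, shrunk, Size(), 0.5, 0.5, INTER_AREA);
    // fractional decimation: generic weighted-area path
    resize(big, shrunk, Size(), 1.0/2.4, 1.0/2.4, INTER_AREA);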
@@ -240,24 +240,22 @@ template<typename ST, typename DT, int bits> struct FixedPtCast
* Resize *
\****************************************************************************************/
static void
resizeNN( const Mat& src, Mat& dst, double fx, double fy )
class resizeNNInvoker :
public ParallelLoopBody
{
Size ssize = src.size(), dsize = dst.size();
AutoBuffer<int> _x_ofs(dsize.width);
int* x_ofs = _x_ofs;
int pix_size = (int)src.elemSize();
int pix_size4 = (int)(pix_size / sizeof(int));
double ifx = 1./fx, ify = 1./fy;
int x, y;
for( x = 0; x < dsize.width; x++ )
public:
resizeNNInvoker(const Mat& _src, Mat &_dst, int *_x_ofs, int _pix_size4, double _ify) :
ParallelLoopBody(), src(_src), dst(_dst), x_ofs(_x_ofs), pix_size4(_pix_size4),
ify(_ify)
{
int sx = cvFloor(x*ifx);
x_ofs[x] = std::min(sx, ssize.width-1)*pix_size;
}
for( y = 0; y < dsize.height; y++ )
virtual void operator() (const Range& range) const
{
Size ssize = src.size(), dsize = dst.size();
int y, x, pix_size = (int)src.elemSize();
for( y = range.start; y < range.end; y++ )
{
uchar* D = dst.data + dst.step*y;
int sy = std::min(cvFloor(y*ify), ssize.height-1);
@@ -326,6 +324,35 @@ resizeNN( const Mat& src, Mat& dst, double fx, double fy )
}
}
}
}
private:
const Mat src;
Mat dst;
int* x_ofs, pix_size4;
double ify;
};
static void
resizeNN( const Mat& src, Mat& dst, double fx, double fy )
{
Size ssize = src.size(), dsize = dst.size();
AutoBuffer<int> _x_ofs(dsize.width);
int* x_ofs = _x_ofs;
int pix_size = (int)src.elemSize();
int pix_size4 = (int)(pix_size / sizeof(int));
double ifx = 1./fx, ify = 1./fy;
int x;
for( x = 0; x < dsize.width; x++ )
{
int sx = cvFloor(x*ifx);
x_ofs[x] = std::min(sx, ssize.width-1)*pix_size;
}
Range range(0, dsize.height);
resizeNNInvoker invoker(src, dst, x_ofs, pix_size4, ify);
parallel_for_(range, invoker);
}
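parallel_for_ splits Range(0, dsize.height) across worker threads; the x_ofs table is computed once up front and shared read-only by all of them. For debugging, the worker count can be pinned (illustrative):

    int saved = cv::getNumThreads();
    cv::setNumThreads(1);            // run the invoker on the calling thread only
    // ... call resize()/remap()/warpAffine() here ...
    cv::setNumThreads(saved);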
@@ -1092,33 +1119,35 @@ static inline int clip(int x, int a, int b)
static const int MAX_ESIZE=16;
template<class HResize, class VResize>
static void resizeGeneric_( const Mat& src, Mat& dst,
const int* xofs, const void* _alpha,
const int* yofs, const void* _beta,
int xmin, int xmax, int ksize )
template <typename HResize, typename VResize>
class resizeGeneric_Invoker :
public ParallelLoopBody
{
public:
typedef typename HResize::value_type T;
typedef typename HResize::buf_type WT;
typedef typename HResize::alpha_type AT;
const AT* alpha = (const AT*)_alpha;
const AT* beta = (const AT*)_beta;
Size ssize = src.size(), dsize = dst.size();
int cn = src.channels();
ssize.width *= cn;
dsize.width *= cn;
resizeGeneric_Invoker(const Mat& _src, Mat &_dst, const int *_xofs, const int *_yofs,
const AT* _alpha, const AT* __beta, const Size& _ssize, const Size &_dsize,
int _ksize, int _xmin, int _xmax) :
ParallelLoopBody(), src(_src), dst(_dst), xofs(_xofs), yofs(_yofs),
alpha(_alpha), _beta(__beta), ssize(_ssize), dsize(_dsize),
ksize(_ksize), xmin(_xmin), xmax(_xmax)
{
}
virtual void operator() (const Range& range) const
{
int dy, cn = src.channels();
HResize hresize;
VResize vresize;
int bufstep = (int)alignSize(dsize.width, 16);
AutoBuffer<WT> _buffer(bufstep*ksize);
const T* srows[MAX_ESIZE]={0};
WT* rows[MAX_ESIZE]={0};
int prev_sy[MAX_ESIZE];
int dy;
xmin *= cn;
xmax *= cn;
HResize hresize;
VResize vresize;
for(int k = 0; k < ksize; k++ )
{
@@ -1126,8 +1155,9 @@ static void resizeGeneric_( const Mat& src, Mat& dst,
rows[k] = (WT*)_buffer + bufstep*k;
}
// image resize is a separable operation: each destination row is produced
// by filtering ksize source rows horizontally, then blending them vertically
for( dy = 0; dy < dsize.height; dy++, beta += ksize )
const AT* beta = _beta + ksize * range.start;
for( dy = range.start; dy < range.end; dy++, beta += ksize )
{
int sy0 = yofs[dy], k0=ksize, k1=0, ksize2 = ksize/2;
@@ -1145,35 +1175,144 @@ static void resizeGeneric_( const Mat& src, Mat& dst,
}
if( k1 == ksize )
k0 = std::min(k0, k); // remember the first row that needs to be computed
srows[k] = (const T*)(src.data + src.step*sy);
srows[k] = (T*)(src.data + src.step*sy);
prev_sy[k] = sy;
}
if( k0 < ksize )
hresize( srows + k0, rows + k0, ksize - k0, xofs, alpha,
hresize( (const T**)(srows + k0), (WT**)(rows + k0), ksize - k0, xofs, (const AT*)(alpha),
ssize.width, dsize.width, cn, xmin, xmax );
vresize( (const WT**)rows, (T*)(dst.data + dst.step*dy), beta, dsize.width );
}
}
private:
const Mat src;
Mat dst;
const int* xofs, *yofs;
const AT* alpha, *_beta;
const Size ssize, dsize;
const int ksize, xmin, xmax;
};
template<class HResize, class VResize>
static void resizeGeneric_( const Mat& src, Mat& dst,
const int* xofs, const void* _alpha,
const int* yofs, const void* _beta,
int xmin, int xmax, int ksize )
{
typedef typename HResize::value_type T;
typedef typename HResize::buf_type WT;
typedef typename HResize::alpha_type AT;
const AT* beta = (const AT*)_beta;
Size ssize = src.size(), dsize = dst.size();
int cn = src.channels();
ssize.width *= cn;
dsize.width *= cn;
xmin *= cn;
xmax *= cn;
// image resize is a separable operation: each destination row is produced
// by filtering ksize source rows horizontally, then blending them vertically
Range range(0, dsize.height);
resizeGeneric_Invoker<HResize, VResize> invoker(src, dst, xofs, yofs, (const AT*)_alpha, beta,
ssize, dsize, ksize, xmin, xmax);
parallel_for_(range, invoker);
}
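Written out in this function's terms (notation assumed, not from the source), the separable resize computes

    dst(x, dy) = \sum_{j=0}^{ksize-1} \beta_{dy,j} \Big( \sum_i \alpha_{x,i} \, src(x_i, y_j) \Big)

where the \alpha are the horizontal weights (alpha/xofs), the \beta the vertical ones (_beta/yofs), and the inner sums are cached in rows[] so source rows shared by consecutive dy are filtered only once; prev_sy[] tracks which rows are already cached.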
template <typename T, typename WT>
struct ResizeAreaFastNoVec
{
ResizeAreaFastNoVec(int /*_scale_x*/, int /*_scale_y*/,
int /*_cn*/, int /*_step*//*, const int**/ /*_ofs*/) { }
int operator() (const T* /*S*/, T* /*D*/, int /*w*/) const { return 0; }
};
template<typename T, typename WT>
static void resizeAreaFast_( const Mat& src, Mat& dst, const int* ofs, const int* xofs,
int scale_x, int scale_y )
template <typename T, typename WT>
struct ResizeAreaFast_2x2_8u
{
ResizeAreaFast_2x2_8u(int _scale_x, int _scale_y, int _cn, int _step/*, const int* _ofs*/) :
scale_x(_scale_x), scale_y(_scale_y), cn(_cn), step(_step)/*, ofs(_ofs)*/
{
fast_mode = scale_x == 2 && scale_y == 2 && (cn == 1 || cn == 3 || cn == 4);
}
int operator() (const T* S, T* D, int w) const
{
if( !fast_mode )
return 0;
const T* nextS = S + step;
int dx = 0;
if (cn == 1)
for( ; dx < w; ++dx )
{
int index = dx*2;
D[dx] = (S[index] + S[index+1] + nextS[index] + nextS[index+1] + 2) >> 2;
}
else if (cn == 3)
for( ; dx < w; dx += 3 )
{
int index = dx*2;
D[dx] = (S[index] + S[index+3] + nextS[index] + nextS[index+3] + 2) >> 2;
D[dx+1] = (S[index+1] + S[index+4] + nextS[index+1] + nextS[index+4] + 2) >> 2;
D[dx+2] = (S[index+2] + S[index+5] + nextS[index+2] + nextS[index+5] + 2) >> 2;
}
else
{
assert(cn == 4);
for( ; dx < w; dx += 4 )
{
int index = dx*2;
// for cn == 4 the horizontally adjacent pixel starts 4 channels away
D[dx] = (S[index] + S[index+4] + nextS[index] + nextS[index+4] + 2) >> 2;
D[dx+1] = (S[index+1] + S[index+5] + nextS[index+1] + nextS[index+5] + 2) >> 2;
D[dx+2] = (S[index+2] + S[index+6] + nextS[index+2] + nextS[index+6] + 2) >> 2;
D[dx+3] = (S[index+3] + S[index+7] + nextS[index+3] + nextS[index+7] + 2) >> 2;
}
}
return dx;
}
private:
const int scale_x, scale_y;
const int cn;
bool fast_mode;
const int step;
};
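The +2 added before the >>2 shift rounds the 2x2 box average to nearest instead of truncating. A small arithmetic check (illustrative):

    // pixels 10, 11, 12, 14: true mean is 11.75
    int truncated = (10 + 11 + 12 + 14) >> 2;     // 47 >> 2 == 11 (floor)
    int rounded = (10 + 11 + 12 + 14 + 2) >> 2;   // 49 >> 2 == 12 (nearest)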
template <typename T, typename WT, typename VecOp>
class resizeAreaFast_Invoker :
public ParallelLoopBody
{
public:
resizeAreaFast_Invoker(const Mat &_src, Mat &_dst,
int _scale_x, int _scale_y, const int* _ofs, const int* _xofs) :
ParallelLoopBody(), src(_src), dst(_dst), scale_x(_scale_x),
scale_y(_scale_y), ofs(_ofs), xofs(_xofs)
{
}
virtual void operator() (const Range& range) const
{
Size ssize = src.size(), dsize = dst.size();
int cn = src.channels();
int dy, dx, k = 0;
int area = scale_x*scale_y;
float scale = 1.f/(scale_x*scale_y);
float scale = 1.f/(area);
int dwidth1 = (ssize.width/scale_x)*cn;
dsize.width *= cn;
ssize.width *= cn;
int dy, dx, k = 0;
for( dy = 0; dy < dsize.height; dy++ )
VecOp vop(scale_x, scale_y, src.channels(), src.step/*, area_ofs*/);
for( dy = range.start; dy < range.end; dy++ )
{
T* D = (T*)(dst.data + dst.step*dy);
int sy0 = dy*scale_y, w = sy0 + scale_y <= ssize.height ? dwidth1 : 0;
int sy0 = dy*scale_y;
int w = sy0 + scale_y <= ssize.height ? dwidth1 : 0;
if( sy0 >= ssize.height )
{
for( dx = 0; dx < dsize.width; dx++ )
@@ -1181,11 +1320,12 @@ static void resizeAreaFast_( const Mat& src, Mat& dst, const int* ofs, const int
continue;
}
for( dx = 0; dx < w; dx++ )
dx = vop((const T*)(src.data + src.step * sy0), D, w);
for( ; dx < w; dx++ )
{
const T* S = (const T*)(src.data + src.step*sy0) + xofs[dx];
const T* S = (const T*)(src.data + src.step * sy0) + xofs[dx];
WT sum = 0;
k=0;
k = 0;
#if CV_ENABLE_UNROLLED
for( ; k <= area - 4; k += 4 )
sum += S[ofs[k]] + S[ofs[k+1]] + S[ofs[k+2]] + S[ofs[k+3]];
@@ -1193,7 +1333,7 @@ static void resizeAreaFast_( const Mat& src, Mat& dst, const int* ofs, const int
for( ; k < area; k++ )
sum += S[ofs[k]];
D[dx] = saturate_cast<T>(sum*scale);
D[dx] = saturate_cast<T>(sum * scale);
}
for( ; dx < dsize.width; dx++ )
@@ -1217,9 +1357,26 @@ static void resizeAreaFast_( const Mat& src, Mat& dst, const int* ofs, const int
}
}
D[dx] = saturate_cast<T>((float)sum/count);
}
}
}
private:
const Mat src;
Mat dst;
const int scale_x, scale_y;
const int *ofs, *xofs;
};
template<typename T, typename WT, typename VecOp>
static void resizeAreaFast_( const Mat& src, Mat& dst, const int* ofs, const int* xofs,
int scale_x, int scale_y )
{
Range range(0, dst.rows);
resizeAreaFast_Invoker<T, WT, VecOp> invoker(src, dst, scale_x,
scale_y, ofs, xofs);
parallel_for_(range, invoker);
}
struct DecimateAlpha
@@ -1228,24 +1385,46 @@ struct DecimateAlpha
float alpha;
};
template<typename T, typename WT>
static void resizeArea_( const Mat& src, Mat& dst, const DecimateAlpha* xofs, int xofs_count, double scale_y_)
template <typename T, typename WT>
class resizeArea_Invoker :
public ParallelLoopBody
{
public:
resizeArea_Invoker(const Mat& _src, Mat& _dst, const DecimateAlpha* _xofs,
int _xofs_count, double _scale_y_
#ifdef HAVE_TBB
, const int* _yofs, const int* _cur_dy_ofs
#endif
) :
ParallelLoopBody(), src(_src), dst(_dst), xofs(_xofs),
xofs_count(_xofs_count), scale_y_(_scale_y_)
#ifdef HAVE_TBB
, yofs(_yofs), cur_dy_ofs(_cur_dy_ofs)
#endif
{
}
virtual void operator() (const Range& range) const
{
Size ssize = src.size(), dsize = dst.size();
int cn = src.channels();
dsize.width *= cn;
AutoBuffer<WT> _buffer(dsize.width*2);
WT *buf = _buffer, *sum = buf + dsize.width;
int k, sy, dx, cur_dy = 0;
int k, sy, dx, cur_dy = 0, num = sizeof(WT) * dsize.width;
WT scale_y = (WT)scale_y_;
CV_Assert( cn <= 4 );
for( dx = 0; dx < dsize.width; dx++ )
buf[dx] = sum[dx] = 0;
memset(buf, 0, num * 2);
for( sy = 0; sy < ssize.height; sy++ )
#ifdef HAVE_TBB
sy = yofs[range.start];
cur_dy = cur_dy_ofs[sy];
for( ; sy < range.start; sy++ )
{
const T* S = (const T*)(src.data + src.step*sy);
const T* S = (const T*)(src.data + src.step * sy);
memset(buf, 0, num);
if( cn == 1 )
for( k = 0; k < xofs_count; k++ )
{
@@ -1269,9 +1448,11 @@ static void resizeArea_( const Mat& src, Mat& dst, const DecimateAlpha* xofs, in
int sxn = xofs[k].si;
int dxn = xofs[k].di;
WT alpha = xofs[k].alpha;
WT t0 = buf[dxn] + S[sxn]*alpha;
WT t1 = buf[dxn+1] + S[sxn+1]*alpha;
WT t2 = buf[dxn+2] + S[sxn+2]*alpha;
buf[dxn] = t0; buf[dxn+1] = t1; buf[dxn+2] = t2;
}
else
@@ -1280,35 +1461,30 @@ static void resizeArea_( const Mat& src, Mat& dst, const DecimateAlpha* xofs, in
int sxn = xofs[k].si;
int dxn = xofs[k].di;
WT alpha = xofs[k].alpha;
WT t0 = buf[dxn] + S[sxn]*alpha;
WT t1 = buf[dxn+1] + S[sxn+1]*alpha;
buf[dxn] = t0; buf[dxn+1] = t1;
t0 = buf[dxn+2] + S[sxn+2]*alpha;
t1 = buf[dxn+3] + S[sxn+3]*alpha;
buf[dxn+2] = t0; buf[dxn+3] = t1;
}
if( (cur_dy + 1)*scale_y <= sy + 1 || sy == ssize.height - 1 )
{
WT beta = std::max(sy + 1 - (cur_dy+1)*scale_y, (WT)0);
WT beta1 = 1 - beta;
T* D = (T*)(dst.data + dst.step*cur_dy);
WT beta = std::max(sy + 1 - (cur_dy + 1) * scale_y, (WT)0);
if( fabs(beta) < 1e-3 )
{
if(cur_dy >= dsize.height) return;
for( dx = 0; dx < dsize.width; dx++ )
{
D[dx] = saturate_cast<T>((sum[dx] + buf[dx]) / min(scale_y, src.rows - cur_dy * scale_y));
sum[dx] = buf[dx] = 0;
}
if(cur_dy >= dsize.height)
break;
memset(sum, 0, num);
}
else
for( dx = 0; dx < dsize.width; dx++ )
{
D[dx] = saturate_cast<T>((sum[dx] + buf[dx]* beta1)/ min(scale_y, src.rows - cur_dy*scale_y));
sum[dx] = buf[dx]*beta;
buf[dx] = 0;
}
sum[dx] = buf[dx] * beta;
cur_dy++;
}
else
@@ -1318,132 +1494,151 @@ static void resizeArea_( const Mat& src, Mat& dst, const DecimateAlpha* xofs, in
WT t0 = sum[dx] + buf[dx];
WT t1 = sum[dx+1] + buf[dx+1];
sum[dx] = t0; sum[dx+1] = t1;
buf[dx] = buf[dx+1] = 0;
}
for( ; dx < dsize.width; dx++ )
{
sum[dx] += buf[dx];
buf[dx] = 0;
}
}
}
}
static void resizeAreaFast_8u( const Mat& src, Mat& dst,
const int* ofs, const int* xofs,
int scale_x, int scale_y )
{
#if CV_SSE2
bool haveSSE2 = checkHardwareSupport(CV_CPU_SSE2);
#endif
Size ssize = src.size(), dsize = dst.size();
int cn = src.channels();
int dy, dx, k = 0;
int area = scale_x*scale_y;
float scale = 1.f/(scale_x*scale_y);
int dwidth1 = (ssize.width/scale_x)*cn;
dsize.width *= cn;
ssize.width *= cn;
//avg values
for( dy = 0; dy < dsize.height; dy++ )
{
uchar* D = (uchar*)(dst.data + dst.step*dy);
int sy0 = dy*scale_y, w = sy0 + scale_y <= ssize.height ? dwidth1 : 0;
if( sy0 >= ssize.height )
{
for( dx = 0; dx < dsize.width; dx++ ) //memset(D,0, dsize.width);//warning, never executed -> not tested
D[dx] = 0;
continue;
}
dx = 0;
#if CV_SSE2
if( haveSSE2 )
for( sy = range.start; sy < range.end; sy++ )
{
const __m128 _scale = _mm_set1_ps(scale);
const __m128i _ucMAXs = _mm_set1_epi16(UCHAR_MAX);
const uchar* _S[8];
const T* S = (const T*)(src.data + src.step * sy);
memset(buf, 0, num);
for(; dx < w-8; dx+=8 )
if( cn == 1 )
for( k = 0; k < xofs_count; k++ )
{
__m128i _sum = _mm_setzero_si128();
__m128i _sum1 = _mm_setzero_si128();
_S[0] = (const uchar*)(src.data + src.step*sy0) + xofs[dx];
_S[1] = (const uchar*)(src.data + src.step*sy0) + xofs[dx+1];
_S[2] = (const uchar*)(src.data + src.step*sy0) + xofs[dx+2];
_S[3] = (const uchar*)(src.data + src.step*sy0) + xofs[dx+3];
_S[4] = (const uchar*)(src.data + src.step*sy0) + xofs[dx+4];
_S[5] = (const uchar*)(src.data + src.step*sy0) + xofs[dx+5];
_S[6] = (const uchar*)(src.data + src.step*sy0) + xofs[dx+6];
_S[7] = (const uchar*)(src.data + src.step*sy0) + xofs[dx+7];
for( k = 0; k < area; k++ )
int dxn = xofs[k].di;
WT alpha = xofs[k].alpha;
buf[dxn] += S[xofs[k].si]*alpha;
}
else if( cn == 2 )
for( k = 0; k < xofs_count; k++ )
{
int ofsk = ofs[k];
__m128i _temp = _mm_set_epi32(_S[3][ofsk],_S[2][ofsk],_S[1][ofsk],_S[0][ofsk]);
_sum = _mm_add_epi32(_sum, _temp);
__m128i _temp1 = _mm_set_epi32(_S[7][ofsk],_S[6][ofsk],_S[5][ofsk],_S[4][ofsk]);
_sum1 = _mm_add_epi32(_sum1, _temp1);
int sxn = xofs[k].si;
int dxn = xofs[k].di;
WT alpha = xofs[k].alpha;
WT t0 = buf[dxn] + S[sxn]*alpha;
WT t1 = buf[dxn+1] + S[sxn+1]*alpha;
buf[dxn] = t0; buf[dxn+1] = t1;
}
else if( cn == 3 )
for( k = 0; k < xofs_count; k++ )
{
int sxn = xofs[k].si;
int dxn = xofs[k].di;
WT alpha = xofs[k].alpha;
__m128i _tempSum = _mm_cvtps_epi32(_mm_mul_ps(_mm_cvtepi32_ps(_sum), _scale));
__m128i _tempSum1 = _mm_cvtps_epi32(_mm_mul_ps(_mm_cvtepi32_ps(_sum1), _scale));
WT t0 = buf[dxn] + S[sxn]*alpha;
WT t1 = buf[dxn+1] + S[sxn+1]*alpha;
WT t2 = buf[dxn+2] + S[sxn+2]*alpha;
_tempSum = _mm_packs_epi32(_tempSum, _tempSum1);
_tempSum = _mm_min_epi16(_ucMAXs, _tempSum);
_tempSum = _mm_packus_epi16(_tempSum, _tempSum);
_mm_storel_epi64((__m128i*)(D+dx),_tempSum);
}
buf[dxn] = t0; buf[dxn+1] = t1; buf[dxn+2] = t2;
}
#endif
for(; dx < w; dx++ )
else
for( k = 0; k < xofs_count; k++ )
{
const uchar* S = (const uchar*)(src.data + src.step*sy0) + xofs[dx];
int sum = 0;
k=0;
int sxn = xofs[k].si;
int dxn = xofs[k].di;
WT alpha = xofs[k].alpha;
#if CV_ENABLE_UNROLLED
for( ; k <= area - 4; k += 4 )
sum += S[ofs[k]] + S[ofs[k+1]] + S[ofs[k+2]] + S[ofs[k+3]];
#endif
WT t0 = buf[dxn] + S[sxn]*alpha;
WT t1 = buf[dxn+1] + S[sxn+1]*alpha;
for( ; k < area; k++ )
sum += S[ofs[k]];
buf[dxn] = t0; buf[dxn+1] = t1;
t0 = buf[dxn+2] + S[sxn+2]*alpha;
t1 = buf[dxn+3] + S[sxn+3]*alpha;
D[dx] = saturate_cast<uchar>(sum*scale);
buf[dxn+2] = t0; buf[dxn+3] = t1;
}
for( ; dx < dsize.width; dx++ )
if( (cur_dy + 1)*scale_y <= sy + 1 || sy == ssize.height - 1 )
{
int sum = 0;
int count = 0, sx0 = xofs[dx];
if( sx0 >= ssize.width )
D[dx] = 0;
for( int sy = 0; sy < scale_y; sy++ )
WT beta = std::max(sy + 1 - (cur_dy + 1) * scale_y, (WT)0);
T* D = (T*)(dst.data + dst.step*cur_dy);
if( fabs(beta) < 1e-3 )
{
if( sy0 + sy >= ssize.height )
break;
const uchar* S = (const uchar*)(src.data + src.step*(sy0 + sy)) + sx0;
int sx = 0;
for( ; sx < scale_x*cn; sx += cn )
if(cur_dy >= dsize.height)
return;
for( dx = 0; dx < dsize.width; dx++ )
D[dx] = saturate_cast<T>((sum[dx] + buf[dx]) / min(scale_y, src.rows - cur_dy * scale_y));
memset(sum, 0, num);
}
else
{
if( sx0 + sx >= ssize.width )
break;
sum += S[sx];
count++;
WT beta1 = 1 - beta;
for( dx = 0; dx < dsize.width; dx++ )
{
D[dx] = saturate_cast<T>((sum[dx] + buf[dx] * beta1)/ min(scale_y, src.rows - cur_dy * scale_y));
sum[dx] = buf[dx] * beta;
}
}
D[dx] = saturate_cast<uchar>((float)sum/count);
cur_dy++;
}
else
{
for( dx = 0; dx <= dsize.width - 2; dx += 2 )
{
WT t0 = sum[dx] + buf[dx];
WT t1 = sum[dx+1] + buf[dx+1];
sum[dx] = t0; sum[dx+1] = t1;
}
for( ; dx < dsize.width; dx++ )
sum[dx] += buf[dx];
}
}
}
private:
const Mat src;
Mat dst;
const DecimateAlpha* xofs;
const int xofs_count;
const double scale_y_;
#ifdef HAVE_TBB
const int *yofs, *cur_dy_ofs;
#endif
};
template<typename T, typename WT>
static void resizeArea_( const Mat& src, Mat& dst, const DecimateAlpha* xofs, int xofs_count, double scale_y_)
{
#ifdef HAVE_TBB
Size ssize = src.size(), dsize = dst.size();
AutoBuffer<int> _yofs(2 * ssize.height);
int *yofs = _yofs, *cur_dy_ofs = _yofs + ssize.height;
int index = 0, cur_dy = 0, sy;
for( sy = 0; sy < ssize.height; sy++ )
{
bool reset = false;
cur_dy_ofs[sy] = cur_dy;
if( (cur_dy + 1)*scale_y_ <= sy + 1 || sy == ssize.height - 1 )
{
WT beta = std::max(sy + 1 - (cur_dy+1)*scale_y_, 0.);
if( fabs(beta) < 1e-3 )
{
if(cur_dy >= dsize.height)
break;
reset = true;
}
cur_dy++;
}
yofs[sy] = index;
if (reset)
index = sy + 1;
}
#endif
Range range(0, src.rows);
resizeArea_Invoker<T, WT> invoker(src, dst, xofs, xofs_count, scale_y_
#ifdef HAVE_TBB
, yofs, cur_dy_ofs
#endif
);
parallel_for_(range, invoker);
}
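The HAVE_TBB preamble exists because resizeArea_ carries state from row to row (cur_dy plus the buf/sum accumulators), so a worker starting mid-image must know where to resume: the wrapper replays the row loop serially once, recording for each source row the destination row in flight (cur_dy_ofs) and the first source row whose contribution is not yet flushed (yofs); each parallel range then re-accumulates from yofs[range.start]. A generic sketch of this seed-the-carried-state idea (illustrative, unrelated to the OpenCV API):

    #include <vector>

    // one cheap serial pass records the loop-carried value at every
    // possible split point; a range [a, b) can then start from seeds[a]
    std::vector<long long> prefixSeeds(const std::vector<int>& v)
    {
        std::vector<long long> seeds(v.size());
        long long run = 0;
        for( std::size_t i = 0; i < v.size(); i++ )
        {
            seeds[i] = run;   // state *before* element i is processed
            run += v[i];
        }
        return seeds;
    }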
@@ -1457,10 +1652,12 @@ typedef void (*ResizeAreaFastFunc)( const Mat& src, Mat& dst,
int scale_x, int scale_y );
typedef void (*ResizeAreaFunc)( const Mat& src, Mat& dst,
const DecimateAlpha* xofs, int xofs_count, double scale_y_);
const DecimateAlpha* xofs, int xofs_count,
double scale_y_);
}
//////////////////////////////////////////////////////////////////////////////////////////
void cv::resize( InputArray _src, OutputArray _dst, Size dsize,
@@ -1553,30 +1750,33 @@ void cv::resize( InputArray _src, OutputArray _dst, Size dsize,
static ResizeAreaFastFunc areafast_tab[] =
{
resizeAreaFast_8u, 0,
resizeAreaFast_<ushort, float>,
resizeAreaFast_<short, float>,
resizeAreaFast_<uchar, int, ResizeAreaFast_2x2_8u<uchar, int> >,
0,
resizeAreaFast_<ushort, float, ResizeAreaFastNoVec<ushort, float> >,
resizeAreaFast_<short, float, ResizeAreaFastNoVec<short, float> >,
0,
resizeAreaFast_<float, float>,
resizeAreaFast_<double, double>,
resizeAreaFast_<float, float, ResizeAreaFastNoVec<float, float> >,
resizeAreaFast_<double, double, ResizeAreaFastNoVec<double, double> >,
0
};
static ResizeAreaFunc area_tab[] =
{
resizeArea_<uchar, float>, 0, resizeArea_<ushort, float>, resizeArea_<short, float>,
0, resizeArea_<float, float>, resizeArea_<double, double>, 0
resizeArea_<uchar, float>, 0, resizeArea_<ushort, float>,
resizeArea_<short, float>, 0, resizeArea_<float, float>,
resizeArea_<double, double>, 0
};
Mat src = _src.getMat();
Size ssize = src.size();
CV_Assert( ssize.area() > 0 );
CV_Assert( !(dsize == Size()) || (inv_scale_x > 0 && inv_scale_y > 0) );
if( dsize == Size() )
CV_Assert( dsize.area() || (inv_scale_x > 0 && inv_scale_y > 0) );
if( !dsize.area() )
{
dsize = Size(saturate_cast<int>(src.cols*inv_scale_x),
saturate_cast<int>(src.rows*inv_scale_y));
CV_Assert( dsize.area() );
}
else
{
@@ -1602,15 +1802,24 @@ void cv::resize( InputArray _src, OutputArray _dst, Size dsize,
return;
}
// true "area" interpolation is only implemented for the case (scale_x <= 1 && scale_y <= 1).
// In other cases it is emulated using some variant of bilinear interpolation
if( interpolation == INTER_AREA && scale_x >= 1 && scale_y >= 1 )
{
int iscale_x = saturate_cast<int>(scale_x);
int iscale_y = saturate_cast<int>(scale_y);
if( std::abs(scale_x - iscale_x) < DBL_EPSILON &&
std::abs(scale_y - iscale_y) < DBL_EPSILON )
bool is_area_fast = std::abs(scale_x - iscale_x) < DBL_EPSILON &&
std::abs(scale_y - iscale_y) < DBL_EPSILON;
// when scale_x and scale_y are both equal to 2, the fast INTER_AREA
// box filter gives the same result as INTER_LINEAR, so take the faster path
if( interpolation == INTER_LINEAR && is_area_fast &&
iscale_x == 2 && iscale_y == 2 )
interpolation = INTER_AREA;
// true "area" interpolation is only implemented for the case (scale_x <= 1 && scale_y <= 1).
// In other cases it is emulated using some variant of bilinear interpolation
if( interpolation == INTER_AREA && scale_x >= 1 && scale_y >= 1 )
{
if( is_area_fast )
{
int area = iscale_x*iscale_y;
size_t srcstep = src.step / src.elemSize1();
@@ -1626,9 +1835,10 @@ void cv::resize( InputArray _src, OutputArray _dst, Size dsize,
for( dx = 0; dx < dsize.width; dx++ )
{
sx = dx*iscale_x*cn;
int j = dx * cn;
sx = iscale_x * j;
for( k = 0; k < cn; k++ )
xofs[dx*cn + k] = sx + k;
xofs[j + k] = sx + k;
}
func( src, dst, ofs, xofs, iscale_x, iscale_y );
@@ -1643,7 +1853,8 @@ void cv::resize( InputArray _src, OutputArray _dst, Size dsize,
for( dx = 0, k = 0; dx < dsize.width; dx++ )
{
double fsx1 = dx*scale_x, fsx2 = fsx1 + scale_x;
double fsx1 = dx*scale_x;
double fsx2 = fsx1 + scale_x;
int sx1 = cvCeil(fsx1), sx2 = cvFloor(fsx2);
sx1 = std::min(sx1, ssize.width-1);
sx2 = std::min(sx2, ssize.width-1);
@@ -1672,9 +1883,11 @@ void cv::resize( InputArray _src, OutputArray _dst, Size dsize,
xofs[k++].alpha = (float)(min(fsx2 - sx2, 1.) / min(scale_x, src.cols - fsx1));
}
}
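// Worked example (hypothetical numbers): scale_x = 2.4, dx = 1 gives
// fsx1 = 2.4, fsx2 = 4.8, so sx1 = 3, sx2 = 4. Column 2 contributes the
// left sliver with alpha = (3 - 2.4)/2.4 = 0.25, column 3 a full cell with
// alpha = 1/2.4, and column 4 the right sliver with
// alpha = (4.8 - 4)/2.4 = 1/3; the three weights sum to 1.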
func( src, dst, xofs, k ,scale_y);
func( src, dst, xofs, k, scale_y);
return;
}
}
int xmin = 0, xmax = dsize.width, width = dsize.width*cn;
bool area_mode = interpolation == INTER_AREA;
@@ -2549,134 +2762,48 @@ typedef void (*RemapFunc)(const Mat& _src, Mat& _dst, const Mat& _xy,
const Mat& _fxy, const void* _wtab,
int borderType, const Scalar& _borderValue);
}
void cv::remap( InputArray _src, OutputArray _dst,
InputArray _map1, InputArray _map2,
int interpolation, int borderType, const Scalar& borderValue )
class remapInvoker :
public ParallelLoopBody
{
static RemapNNFunc nn_tab[] =
{
remapNearest<uchar>, remapNearest<schar>, remapNearest<ushort>, remapNearest<short>,
remapNearest<int>, remapNearest<float>, remapNearest<double>, 0
};
static RemapFunc linear_tab[] =
{
remapBilinear<FixedPtCast<int, uchar, INTER_REMAP_COEF_BITS>, RemapVec_8u, short>, 0,
remapBilinear<Cast<float, ushort>, RemapNoVec, float>,
remapBilinear<Cast<float, short>, RemapNoVec, float>, 0,
remapBilinear<Cast<float, float>, RemapNoVec, float>,
remapBilinear<Cast<double, double>, RemapNoVec, float>, 0
};
static RemapFunc cubic_tab[] =
{
remapBicubic<FixedPtCast<int, uchar, INTER_REMAP_COEF_BITS>, short, INTER_REMAP_COEF_SCALE>, 0,
remapBicubic<Cast<float, ushort>, float, 1>,
remapBicubic<Cast<float, short>, float, 1>, 0,
remapBicubic<Cast<float, float>, float, 1>,
remapBicubic<Cast<double, double>, float, 1>, 0
};
static RemapFunc lanczos4_tab[] =
{
remapLanczos4<FixedPtCast<int, uchar, INTER_REMAP_COEF_BITS>, short, INTER_REMAP_COEF_SCALE>, 0,
remapLanczos4<Cast<float, ushort>, float, 1>,
remapLanczos4<Cast<float, short>, float, 1>, 0,
remapLanczos4<Cast<float, float>, float, 1>,
remapLanczos4<Cast<double, double>, float, 1>, 0
};
Mat src = _src.getMat(), map1 = _map1.getMat(), map2 = _map2.getMat();
CV_Assert( (!map2.data || map2.size() == map1.size()));
_dst.create( map1.size(), src.type() );
Mat dst = _dst.getMat();
if( dst.data == src.data )
src = src.clone();
int depth = src.depth(), map_depth = map1.depth();
RemapNNFunc nnfunc = 0;
RemapFunc ifunc = 0;
const void* ctab = 0;
bool fixpt = depth == CV_8U;
bool planar_input = false;
if( interpolation == INTER_NEAREST )
public:
remapInvoker(const Mat& _src, Mat _dst, const Mat& _map1, const Mat& _map2, const Mat *_m1,
const Mat *_m2, int _interpolation, int _borderType, const Scalar &_borderValue,
int _planar_input, RemapNNFunc _nnfunc, RemapFunc _ifunc, const void *_ctab) :
ParallelLoopBody(), src(_src), dst(_dst), map1(_map1), map2(_map2), m1(_m1), m2(_m2),
interpolation(_interpolation), borderType(_borderType), borderValue(_borderValue),
planar_input(_planar_input), nnfunc(_nnfunc), ifunc(_ifunc), ctab(_ctab)
{
nnfunc = nn_tab[depth];
CV_Assert( nnfunc != 0 );
if( map1.type() == CV_16SC2 && !map2.data ) // the data is already in the right format
{
nnfunc( src, dst, map1, borderType, borderValue );
return;
}
}
else
{
if( interpolation == INTER_AREA )
interpolation = INTER_LINEAR;
if( interpolation == INTER_LINEAR )
ifunc = linear_tab[depth];
else if( interpolation == INTER_CUBIC )
ifunc = cubic_tab[depth];
else if( interpolation == INTER_LANCZOS4 )
ifunc = lanczos4_tab[depth];
else
CV_Error( CV_StsBadArg, "Unknown interpolation method" );
CV_Assert( ifunc != 0 );
ctab = initInterTab2D( interpolation, fixpt );
}
const Mat *m1 = &map1, *m2 = &map2;
if( (map1.type() == CV_16SC2 && (map2.type() == CV_16UC1 || map2.type() == CV_16SC1)) ||
(map2.type() == CV_16SC2 && (map1.type() == CV_16UC1 || map1.type() == CV_16SC1)) )
{
if( map1.type() != CV_16SC2 )
std::swap(m1, m2);
if( ifunc )
virtual void operator() (const Range& range) const
{
ifunc( src, dst, *m1, *m2, ctab, borderType, borderValue );
return;
}
}
else
{
CV_Assert( (map1.type() == CV_32FC2 && !map2.data) ||
(map1.type() == CV_32FC1 && map2.type() == CV_32FC1) );
planar_input = map1.channels() == 1;
}
int x, y, x1, y1;
const int buf_size = 1 << 14;
int brows0 = std::min(128, dst.rows);
int brows0 = std::min(128, dst.rows), map_depth = map1.depth();
int bcols0 = std::min(buf_size/brows0, dst.cols);
brows0 = std::min(buf_size/bcols0, dst.rows);
#if CV_SSE2
#if CV_SSE2
bool useSIMD = checkHardwareSupport(CV_CPU_SSE2);
#endif
#endif
Mat _bufxy(brows0, bcols0, CV_16SC2), _bufa;
if( !nnfunc )
_bufa.create(brows0, bcols0, CV_16UC1);
for( y = 0; y < dst.rows; y += brows0 )
for( y = range.start; y < range.end; y += brows0 )
{
for( x = 0; x < dst.cols; x += bcols0 )
{
int brows = std::min(brows0, dst.rows - y);
int brows = std::min(brows0, range.end - y);
int bcols = std::min(bcols0, dst.cols - x);
Mat dpart(dst, Rect(x, y, bcols, brows));
Mat bufxy(_bufxy, Rect(0, 0, bcols, brows));
if( nnfunc )
{
if( map_depth != CV_32F )
if( map1.type() == CV_16SC2 && !map2.data ) // the data is already in the right format
bufxy = map1(Rect(x, y, bcols, brows));
else if( map_depth != CV_32F )
{
for( y1 = 0; y1 < brows; y1++ )
{
@@ -2693,7 +2820,7 @@ void cv::remap( InputArray _src, OutputArray _dst,
}
}
else if( !planar_input )
map1(Rect(0,0,bcols,brows)).convertTo(bufxy, bufxy.depth());
map1(Rect(x, y, bcols, brows)).convertTo(bufxy, bufxy.depth());
else
{
for( y1 = 0; y1 < brows; y1++ )
@@ -2737,13 +2864,19 @@ void cv::remap( InputArray _src, OutputArray _dst,
continue;
}
Mat bufa(_bufa, Rect(0,0,bcols, brows));
Mat bufa(_bufa, Rect(0, 0, bcols, brows));
for( y1 = 0; y1 < brows; y1++ )
{
short* XY = (short*)(bufxy.data + bufxy.step*y1);
ushort* A = (ushort*)(bufa.data + bufa.step*y1);
if( planar_input )
if( (map1.type() == CV_16SC2 && (map2.type() == CV_16UC1 || map2.type() == CV_16SC1)) ||
(map2.type() == CV_16SC2 && (map1.type() == CV_16UC1 || map1.type() == CV_16SC1)) )
{
bufxy = m1->operator()(Rect(x, y, bcols, brows));
bufa = m2->operator()(Rect(x, y, bcols, brows));
}
else if( planar_input )
{
const float* sX = (const float*)(map1.data + map1.step*(y+y1)) + x;
const float* sY = (const float*)(map2.data + map2.step*(y+y1)) + x;
@@ -2815,6 +2948,118 @@ void cv::remap( InputArray _src, OutputArray _dst,
ifunc(src, dpart, bufxy, bufa, ctab, borderType, borderValue);
}
}
}
private:
const Mat src;
Mat dst;
const Mat map1, map2, *m1, *m2;
int interpolation, borderType;
const Scalar borderValue;
int planar_input;
RemapNNFunc nnfunc;
RemapFunc ifunc;
const void *ctab;
};
}
void cv::remap( InputArray _src, OutputArray _dst,
InputArray _map1, InputArray _map2,
int interpolation, int borderType, const Scalar& borderValue )
{
static RemapNNFunc nn_tab[] =
{
remapNearest<uchar>, remapNearest<schar>, remapNearest<ushort>, remapNearest<short>,
remapNearest<int>, remapNearest<float>, remapNearest<double>, 0
};
static RemapFunc linear_tab[] =
{
remapBilinear<FixedPtCast<int, uchar, INTER_REMAP_COEF_BITS>, RemapVec_8u, short>, 0,
remapBilinear<Cast<float, ushort>, RemapNoVec, float>,
remapBilinear<Cast<float, short>, RemapNoVec, float>, 0,
remapBilinear<Cast<float, float>, RemapNoVec, float>,
remapBilinear<Cast<double, double>, RemapNoVec, float>, 0
};
static RemapFunc cubic_tab[] =
{
remapBicubic<FixedPtCast<int, uchar, INTER_REMAP_COEF_BITS>, short, INTER_REMAP_COEF_SCALE>, 0,
remapBicubic<Cast<float, ushort>, float, 1>,
remapBicubic<Cast<float, short>, float, 1>, 0,
remapBicubic<Cast<float, float>, float, 1>,
remapBicubic<Cast<double, double>, float, 1>, 0
};
static RemapFunc lanczos4_tab[] =
{
remapLanczos4<FixedPtCast<int, uchar, INTER_REMAP_COEF_BITS>, short, INTER_REMAP_COEF_SCALE>, 0,
remapLanczos4<Cast<float, ushort>, float, 1>,
remapLanczos4<Cast<float, short>, float, 1>, 0,
remapLanczos4<Cast<float, float>, float, 1>,
remapLanczos4<Cast<double, double>, float, 1>, 0
};
Mat src = _src.getMat(), map1 = _map1.getMat(), map2 = _map2.getMat();
CV_Assert( map1.size().area() > 0 );
CV_Assert( !map2.data || (map2.size() == map1.size()));
_dst.create( map1.size(), src.type() );
Mat dst = _dst.getMat();
if( dst.data == src.data )
src = src.clone();
int depth = src.depth();
RemapNNFunc nnfunc = 0;
RemapFunc ifunc = 0;
const void* ctab = 0;
bool fixpt = depth == CV_8U;
bool planar_input = false;
if( interpolation == INTER_NEAREST )
{
nnfunc = nn_tab[depth];
CV_Assert( nnfunc != 0 );
}
else
{
if( interpolation == INTER_AREA )
interpolation = INTER_LINEAR;
if( interpolation == INTER_LINEAR )
ifunc = linear_tab[depth];
else if( interpolation == INTER_CUBIC )
ifunc = cubic_tab[depth];
else if( interpolation == INTER_LANCZOS4 )
ifunc = lanczos4_tab[depth];
else
CV_Error( CV_StsBadArg, "Unknown interpolation method" );
CV_Assert( ifunc != 0 );
ctab = initInterTab2D( interpolation, fixpt );
}
const Mat *m1 = &map1, *m2 = &map2;
if( (map1.type() == CV_16SC2 && (map2.type() == CV_16UC1 || map2.type() == CV_16SC1)) ||
(map2.type() == CV_16SC2 && (map1.type() == CV_16UC1 || map1.type() == CV_16SC1)) )
{
if( map1.type() != CV_16SC2 )
std::swap(m1, m2);
}
else
{
CV_Assert( ((map1.type() == CV_32FC2 || map1.type() == CV_16SC2) && !map2.data) ||
(map1.type() == CV_32FC1 && map2.type() == CV_32FC1) );
planar_input = map1.channels() == 1;
}
Range range(0, dst.rows);
remapInvoker invoker(src, dst, map1, map2, m1, m2, interpolation,
borderType, borderValue, planar_input, nnfunc, ifunc,
ctab);
parallel_for_(range, invoker);
}
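A minimal call into the new parallel remap, building a float map that mirrors the image horizontally (illustrative; same idea as the perf test above):

    Mat src(480, 640, CV_8UC3), dst;
    Mat mapx(src.size(), CV_32FC1), mapy(src.size(), CV_32FC1);
    for( int y = 0; y < src.rows; y++ )
        for( int x = 0; x < src.cols; x++ )
        {
            mapx.at<float>(y, x) = (float)(src.cols - 1 - x); // horizontal flip
            mapy.at<float>(y, x) = (float)y;                  // keep the row
        }
    remap(src, dst, mapx, mapy, INTER_LINEAR);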
@@ -2957,71 +3202,42 @@ void cv::convertMaps( InputArray _map1, InputArray _map2,
}
void cv::warpAffine( InputArray _src, OutputArray _dst,
InputArray _M0, Size dsize,
int flags, int borderType, const Scalar& borderValue )
namespace cv
{
Mat src = _src.getMat(), M0 = _M0.getMat();
_dst.create( dsize.area() == 0 ? src.size() : dsize, src.type() );
Mat dst = _dst.getMat();
CV_Assert( src.cols > 0 && src.rows > 0 );
if( dst.data == src.data )
src = src.clone();
const int BLOCK_SZ = 64;
short XY[BLOCK_SZ*BLOCK_SZ*2], A[BLOCK_SZ*BLOCK_SZ];
double M[6];
Mat matM(2, 3, CV_64F, M);
int interpolation = flags & INTER_MAX;
if( interpolation == INTER_AREA )
interpolation = INTER_LINEAR;
CV_Assert( (M0.type() == CV_32F || M0.type() == CV_64F) && M0.rows == 2 && M0.cols == 3 );
M0.convertTo(matM, matM.type());
#ifdef HAVE_TEGRA_OPTIMIZATION
if( tegra::warpAffine(src, dst, M, flags, borderType, borderValue) )
return;
#endif
if( !(flags & WARP_INVERSE_MAP) )
class warpAffineInvoker :
public ParallelLoopBody
{
public:
warpAffineInvoker(const Mat &_src, Mat &_dst, int _interpolation, int _borderType,
const Scalar &_borderValue, int *_adelta, int *_bdelta, double *_M) :
ParallelLoopBody(), src(_src), dst(_dst), interpolation(_interpolation),
borderType(_borderType), borderValue(_borderValue), adelta(_adelta), bdelta(_bdelta),
M(_M)
{
double D = M[0]*M[4] - M[1]*M[3];
D = D != 0 ? 1./D : 0;
double A11 = M[4]*D, A22=M[0]*D;
M[0] = A11; M[1] *= -D;
M[3] *= -D; M[4] = A22;
double b1 = -M[0]*M[2] - M[1]*M[5];
double b2 = -M[3]*M[2] - M[4]*M[5];
M[2] = b1; M[5] = b2;
}
int x, y, x1, y1, width = dst.cols, height = dst.rows;
AutoBuffer<int> _abdelta(width*2);
int* adelta = &_abdelta[0], *bdelta = adelta + width;
virtual void operator() (const Range& range) const
{
const int BLOCK_SZ = 64;
short XY[BLOCK_SZ*BLOCK_SZ*2], A[BLOCK_SZ*BLOCK_SZ];
const int AB_BITS = MAX(10, (int)INTER_BITS);
const int AB_SCALE = 1 << AB_BITS;
int round_delta = interpolation == INTER_NEAREST ? AB_SCALE/2 : AB_SCALE/INTER_TAB_SIZE/2;
#if CV_SSE2
int round_delta = interpolation == INTER_NEAREST ? AB_SCALE/2 : AB_SCALE/INTER_TAB_SIZE/2, x, y, x1, y1;
#if CV_SSE2
bool useSIMD = checkHardwareSupport(CV_CPU_SSE2);
#endif
for( x = 0; x < width; x++ )
{
adelta[x] = saturate_cast<int>(M[0]*x*AB_SCALE);
bdelta[x] = saturate_cast<int>(M[3]*x*AB_SCALE);
}
#endif
int bh0 = std::min(BLOCK_SZ/2, height);
int bw0 = std::min(BLOCK_SZ*BLOCK_SZ/bh0, width);
bh0 = std::min(BLOCK_SZ*BLOCK_SZ/bw0, height);
int bh0 = std::min(BLOCK_SZ/2, dst.rows);
int bw0 = std::min(BLOCK_SZ*BLOCK_SZ/bh0, dst.cols);
bh0 = std::min(BLOCK_SZ*BLOCK_SZ/bw0, dst.rows);
for( y = 0; y < height; y += bh0 )
for( y = range.start; y < range.end; y += bh0 )
{
for( x = 0; x < width; x += bw0 )
for( x = 0; x < dst.cols; x += bw0 )
{
int bw = std::min( bw0, width - x);
int bh = std::min( bh0, height - y);
int bw = std::min( bw0, dst.cols - x);
int bh = std::min( bh0, range.end - y);
Mat _XY(bh, bw, CV_16SC2, XY), matA;
Mat dpart(dst, Rect(x, y, bw, bh));
@@ -3099,51 +3315,107 @@ void cv::warpAffine( InputArray _src, OutputArray _dst,
}
}
}
}
private:
const Mat src;
Mat dst;
int interpolation, borderType;
const Scalar borderValue;
int *adelta, *bdelta;
double *M;
};
}
void cv::warpPerspective( InputArray _src, OutputArray _dst, InputArray _M0,
Size dsize, int flags, int borderType, const Scalar& borderValue )
void cv::warpAffine( InputArray _src, OutputArray _dst,
InputArray _M0, Size dsize,
int flags, int borderType, const Scalar& borderValue )
{
Mat src = _src.getMat(), M0 = _M0.getMat();
_dst.create( dsize.area() == 0 ? src.size() : dsize, src.type() );
Mat dst = _dst.getMat();
CV_Assert( src.cols > 0 && src.rows > 0 );
if( dst.data == src.data )
src = src.clone();
const int BLOCK_SZ = 32;
short XY[BLOCK_SZ*BLOCK_SZ*2], A[BLOCK_SZ*BLOCK_SZ];
double M[9];
Mat matM(3, 3, CV_64F, M);
double M[6];
Mat matM(2, 3, CV_64F, M);
int interpolation = flags & INTER_MAX;
if( interpolation == INTER_AREA )
interpolation = INTER_LINEAR;
CV_Assert( (M0.type() == CV_32F || M0.type() == CV_64F) && M0.rows == 3 && M0.cols == 3 );
CV_Assert( (M0.type() == CV_32F || M0.type() == CV_64F) && M0.rows == 2 && M0.cols == 3 );
M0.convertTo(matM, matM.type());
#ifdef HAVE_TEGRA_OPTIMIZATION
if( tegra::warpPerspective(src, dst, M, flags, borderType, borderValue) )
if( tegra::warpAffine(src, dst, M, flags, borderType, borderValue) )
return;
#endif
if( !(flags & WARP_INVERSE_MAP) )
invert(matM, matM);
{
double D = M[0]*M[4] - M[1]*M[3];
D = D != 0 ? 1./D : 0;
double A11 = M[4]*D, A22=M[0]*D;
M[0] = A11; M[1] *= -D;
M[3] *= -D; M[4] = A22;
double b1 = -M[0]*M[2] - M[1]*M[5];
double b2 = -M[3]*M[2] - M[4]*M[5];
M[2] = b1; M[5] = b2;
}
int x;
AutoBuffer<int> _abdelta(dst.cols*2);
int* adelta = &_abdelta[0], *bdelta = adelta + dst.cols;
const int AB_BITS = MAX(10, (int)INTER_BITS);
const int AB_SCALE = 1 << AB_BITS;
for( x = 0; x < dst.cols; x++ )
{
adelta[x] = saturate_cast<int>(M[0]*x*AB_SCALE);
bdelta[x] = saturate_cast<int>(M[3]*x*AB_SCALE);
}
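// Illustrative numbers: with AB_BITS = 10, AB_SCALE = 1024, a coefficient
// M[0] = 0.5 is stored as adelta[x] = round(0.5*x*1024). The invoker adds
// these integer deltas per pixel and shifts back down (by AB_BITS for
// INTER_NEAREST, by AB_BITS - INTER_BITS otherwise), keeping INTER_BITS
// bits of subpixel position for the interpolation tables.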
Range range(0, dst.rows);
warpAffineInvoker invoker(src, dst, interpolation, borderType,
borderValue, adelta, bdelta, M);
parallel_for_(range, invoker);
}
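The open-coded 2x3 inversion above computes [A | b]^-1 = [A^-1 | -A^-1 b], which is what cv::invertAffineTransform returns. A quick hedged check:

    Mat M = getRotationMatrix2D(Point2f(320.f, 240.f), 30.0, 1.0); // 2x3, CV_64F
    Mat iM;
    invertAffineTransform(M, iM);
    // warpAffine(src, dst1, M, size, flags) and
    // warpAffine(src, dst2, iM, size, flags | WARP_INVERSE_MAP)
    // should produce the same result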
namespace cv
{
class warpPerspectiveInvoker :
public ParallelLoopBody
{
public:
warpPerspectiveInvoker(const Mat &_src, Mat &_dst, double *_M, int _interpolation,
int _borderType, const Scalar &_borderValue) :
ParallelLoopBody(), src(_src), dst(_dst), M(_M), interpolation(_interpolation),
borderType(_borderType), borderValue(_borderValue)
{
}
virtual void operator() (const Range& range) const
{
const int BLOCK_SZ = 32;
short XY[BLOCK_SZ*BLOCK_SZ*2], A[BLOCK_SZ*BLOCK_SZ];
int x, y, x1, y1, width = dst.cols, height = dst.rows;
int bh0 = std::min(BLOCK_SZ/2, height);
int bw0 = std::min(BLOCK_SZ*BLOCK_SZ/bh0, width);
bh0 = std::min(BLOCK_SZ*BLOCK_SZ/bw0, height);
for( y = 0; y < height; y += bh0 )
for( y = range.start; y < range.end; y += bh0 )
{
for( x = 0; x < width; x += bw0 )
{
int bw = std::min( bw0, width - x);
int bh = std::min( bh0, height - y);
int bh = std::min( bh0, range.end - y); // height
Mat _XY(bh, bw, CV_16SC2, XY), matA;
Mat dpart(dst, Rect(x, y, bw, bh));
@@ -3197,6 +3469,49 @@ void cv::warpPerspective( InputArray _src, OutputArray _dst, InputArray _M0,
}
}
}
}
private:
const Mat src;
Mat dst;
double* M;
int interpolation, borderType;
const Scalar borderValue;
};
}
void cv::warpPerspective( InputArray _src, OutputArray _dst, InputArray _M0,
Size dsize, int flags, int borderType, const Scalar& borderValue )
{
Mat src = _src.getMat(), M0 = _M0.getMat();
_dst.create( dsize.area() == 0 ? src.size() : dsize, src.type() );
Mat dst = _dst.getMat();
CV_Assert( src.cols > 0 && src.rows > 0 );
if( dst.data == src.data )
src = src.clone();
double M[9];
Mat matM(3, 3, CV_64F, M);
int interpolation = flags & INTER_MAX;
if( interpolation == INTER_AREA )
interpolation = INTER_LINEAR;
CV_Assert( (M0.type() == CV_32F || M0.type() == CV_64F) && M0.rows == 3 && M0.cols == 3 );
M0.convertTo(matM, matM.type());
#ifdef HAVE_TEGRA_OPTIMIZATION
if( tegra::warpPerspective(src, dst, M, flags, borderType, borderValue) )
return;
#endif
if( !(flags & WARP_INVERSE_MAP) )
invert(matM, matM);
Range range(0, dst.rows);
warpPerspectiveInvoker invoker(src, dst, M, interpolation, borderType, borderValue);
parallel_for_(range, invoker);
}
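And a minimal call into the parallel warpPerspective (illustrative):

    Point2f srcQuad[4] = { Point2f(0, 0), Point2f(639, 0),
                           Point2f(639, 479), Point2f(0, 479) };
    Point2f dstQuad[4] = { Point2f(20, 10), Point2f(620, 30),
                           Point2f(600, 460), Point2f(40, 450) };
    Mat H = getPerspectiveTransform(srcQuad, dstQuad); // 3x3, CV_64F
    Mat src(480, 640, CV_8UC3), warped;
    warpPerspective(src, warped, H, src.size(), INTER_LINEAR);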