Commit 92795ba4 authored by Ilya Lavrenov

parallel version of remap, resize, warpAffine, warpPerspective. Some optimizations for 2x decimation in the resize algorithm
parent f2a02fef
#include "perf_precomp.hpp"
using namespace std;
using namespace cv;
using namespace perf;
using namespace testing;
using std::tr1::make_tuple;
using std::tr1::get;
CV_ENUM(MatrixType, CV_16UC1, CV_16SC1, CV_32FC1)
CV_ENUM(MapType, CV_16SC2, CV_32FC1, CV_32FC2)
CV_ENUM(InterType, INTER_LINEAR, INTER_CUBIC, INTER_LANCZOS4, INTER_NEAREST)
typedef TestBaseWithParam< tr1::tuple<Size, MatrixType, MapType, InterType> > TestRemap;
PERF_TEST_P( TestRemap, Remap,
Combine(
Values( szVGA, sz1080p ),
ValuesIn( MatrixType::all() ),
ValuesIn( MapType::all() ),
ValuesIn( InterType::all() )
)
)
{
Size sz;
int src_type, map1_type, inter_type;
sz = get<0>(GetParam());
src_type = get<1>(GetParam());
map1_type = get<2>(GetParam());
inter_type = get<3>(GetParam());
Mat src(sz, src_type);
Mat map1(sz, map1_type);
Mat dst(sz, src_type);
Mat map2(map1_type == CV_32FC1 ? sz : Size(), CV_32FC1);
RNG rng;
rng.fill(src, RNG::UNIFORM, 0, 256);
for (int j = 0; j < map1.rows; ++j)
for (int i = 0; i < map1.cols; ++i)
switch (map1_type)
{
case CV_32FC1:
map1.at<float>(j, i) = src.cols - i;
map2.at<float>(j, i) = j;
break;
case CV_32FC2:
map1.at<Vec2f>(j, i)[0] = src.cols - i;
map1.at<Vec2f>(j, i)[1] = j;
break;
case CV_16SC2:
map1.at<Vec2s>(j, i)[0] = src.cols - i;
map1.at<Vec2s>(j, i)[1] = j;
break;
default:
CV_Assert(0);
}
declare.in(src, WARMUP_RNG).out(dst).time(20);
TEST_CYCLE() remap(src, dst, map1, map2, inter_type);
SANITY_CHECK(dst);
}
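For context, a minimal standalone sketch (illustrative, not part of the commit; `input.png` is a hypothetical file name) of the mapping the test builds above: every destination pixel (i, j) samples the source at (src.cols - i, j), an approximate horizontal flip:

```cpp
#include <opencv2/core/core.hpp>
#include <opencv2/imgproc/imgproc.hpp>
#include <opencv2/highgui/highgui.hpp>

int main()
{
    cv::Mat src = cv::imread("input.png");  // hypothetical input image
    cv::Mat map_x(src.size(), CV_32FC1), map_y(src.size(), CV_32FC1), dst;

    for (int j = 0; j < src.rows; ++j)
        for (int i = 0; i < src.cols; ++i)
        {
            map_x.at<float>(j, i) = (float)(src.cols - i); // mirror x
            map_y.at<float>(j, i) = (float)j;              // keep y
        }

    cv::remap(src, dst, map_x, map_y, cv::INTER_LINEAR);
    cv::imwrite("flipped.png", dst);
    return 0;
}
```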
@@ -59,11 +59,11 @@ PERF_TEST_P(MatInfo_Size_Size, resizeDownLinear,
typedef tr1::tuple<MatType, Size, int> MatInfo_Size_Scale_t;
typedef TestBaseWithParam<MatInfo_Size_Scale_t> MatInfo_Size_Scale;
PERF_TEST_P(MatInfo_Size_Scale, resizeAreaFast,
PERF_TEST_P(MatInfo_Size_Scale, ResizeAreaFast,
testing::Combine(
testing::Values(CV_8UC1, CV_8UC4),
testing::Values(szVGA, szqHD, sz720p, sz1080p),
testing::Values(2, 4)
testing::Values(2)
)
)
{
@@ -84,3 +84,31 @@ PERF_TEST_P(MatInfo_Size_Scale, resizeAreaFast,
// a difference of 1 is allowed because of the different possible rounding modes: round-half-up vs. bankers' rounding
SANITY_CHECK(dst, 1);
}
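The tolerance of 1 above exists because the fast 2x2 kernel introduced later in this commit averages with a shift, which rounds halves up, while the scalar path goes through cvRound, which uses bankers' rounding. A minimal sketch of the discrepancy (illustrative, not commit code):

```cpp
#include <opencv2/core/core.hpp>
#include <cstdio>

int main()
{
    int a = 10, b = 11, c = 10, d = 11;            // sum = 42, exact mean 10.5
    int shifted = (a + b + c + d + 2) >> 2;        // 11: halves round up
    int bankers = cvRound((a + b + c + d) / 4.0);  // 10: halves round to even
    std::printf("%d vs %d\n", shifted, bankers);   // off by exactly 1
    return 0;
}
```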
typedef TestBaseWithParam<tr1::tuple<MatType, Size, double> > MatInfo_Size_Scale_Area;
PERF_TEST_P(MatInfo_Size_Scale_Area, ResizeArea,
testing::Combine(
testing::Values(CV_8UC1, CV_8UC4),
testing::Values(szVGA, szqHD, sz720p, sz1080p),
testing::Values(2.4, 3.4, 1.3)
)
)
{
int matType = get<0>(GetParam());
Size from = get<1>(GetParam());
double scale = get<2>(GetParam());
cv::Mat src(from, matType);
Size to(cvRound(from.width * scale), cvRound(from.height * scale));
cv::Mat dst(to, matType);
declare.in(src, WARMUP_RNG).out(dst);
TEST_CYCLE() resize(src, dst, dst.size(), 0, 0, INTER_AREA);
// a difference of 1 is allowed because of the different possible rounding modes: round-half-up vs. bankers' rounding
SANITY_CHECK(dst, 1);
}
@@ -240,6 +240,99 @@ template<typename ST, typename DT, int bits> struct FixedPtCast
* Resize *
\****************************************************************************************/
class resizeNNInvoker :
public ParallelLoopBody
{
public:
resizeNNInvoker(const Mat& _src, Mat &_dst, int *_x_ofs, int _pix_size4, double _ify) :
ParallelLoopBody(), src(_src), dst(_dst), x_ofs(_x_ofs), pix_size4(_pix_size4),
ify(_ify)
{
}
virtual void operator() (const Range& range) const
{
Size ssize = src.size(), dsize = dst.size();
int y, x, pix_size = (int)src.elemSize();
for( y = range.start; y < range.end; y++ )
{
uchar* D = dst.data + dst.step*y;
int sy = std::min(cvFloor(y*ify), ssize.height-1);
const uchar* S = src.data + src.step*sy;
switch( pix_size )
{
case 1:
for( x = 0; x <= dsize.width - 2; x += 2 )
{
uchar t0 = S[x_ofs[x]];
uchar t1 = S[x_ofs[x+1]];
D[x] = t0;
D[x+1] = t1;
}
for( ; x < dsize.width; x++ )
D[x] = S[x_ofs[x]];
break;
case 2:
for( x = 0; x < dsize.width; x++ )
*(ushort*)(D + x*2) = *(ushort*)(S + x_ofs[x]);
break;
case 3:
for( x = 0; x < dsize.width; x++, D += 3 )
{
const uchar* _tS = S + x_ofs[x];
D[0] = _tS[0]; D[1] = _tS[1]; D[2] = _tS[2];
}
break;
case 4:
for( x = 0; x < dsize.width; x++ )
*(int*)(D + x*4) = *(int*)(S + x_ofs[x]);
break;
case 6:
for( x = 0; x < dsize.width; x++, D += 6 )
{
const ushort* _tS = (const ushort*)(S + x_ofs[x]);
ushort* _tD = (ushort*)D;
_tD[0] = _tS[0]; _tD[1] = _tS[1]; _tD[2] = _tS[2];
}
break;
case 8:
for( x = 0; x < dsize.width; x++, D += 8 )
{
const int* _tS = (const int*)(S + x_ofs[x]);
int* _tD = (int*)D;
_tD[0] = _tS[0]; _tD[1] = _tS[1];
}
break;
case 12:
for( x = 0; x < dsize.width; x++, D += 12 )
{
const int* _tS = (const int*)(S + x_ofs[x]);
int* _tD = (int*)D;
_tD[0] = _tS[0]; _tD[1] = _tS[1]; _tD[2] = _tS[2];
}
break;
default:
for( x = 0; x < dsize.width; x++, D += pix_size )
{
const int* _tS = (const int*)(S + x_ofs[x]);
int* _tD = (int*)D;
for( int k = 0; k < pix_size4; k++ )
_tD[k] = _tS[k];
}
}
}
}
private:
const Mat src;
Mat dst;
int* x_ofs;
int pix_size4;
double ify;
};
static void
resizeNN( const Mat& src, Mat& dst, double fx, double fy )
{
@@ -249,83 +342,17 @@ resizeNN( const Mat& src, Mat& dst, double fx, double fy )
int pix_size = (int)src.elemSize();
int pix_size4 = (int)(pix_size / sizeof(int));
double ifx = 1./fx, ify = 1./fy;
int x, y;
int x;
for( x = 0; x < dsize.width; x++ )
{
int sx = cvFloor(x*ifx);
x_ofs[x] = std::min(sx, ssize.width-1)*pix_size;
}
for( y = 0; y < dsize.height; y++ )
{
uchar* D = dst.data + dst.step*y;
int sy = std::min(cvFloor(y*ify), ssize.height-1);
const uchar* S = src.data + src.step*sy;
switch( pix_size )
{
case 1:
for( x = 0; x <= dsize.width - 2; x += 2 )
{
uchar t0 = S[x_ofs[x]];
uchar t1 = S[x_ofs[x+1]];
D[x] = t0;
D[x+1] = t1;
}
for( ; x < dsize.width; x++ )
D[x] = S[x_ofs[x]];
break;
case 2:
for( x = 0; x < dsize.width; x++ )
*(ushort*)(D + x*2) = *(ushort*)(S + x_ofs[x]);
break;
case 3:
for( x = 0; x < dsize.width; x++, D += 3 )
{
const uchar* _tS = S + x_ofs[x];
D[0] = _tS[0]; D[1] = _tS[1]; D[2] = _tS[2];
}
break;
case 4:
for( x = 0; x < dsize.width; x++ )
*(int*)(D + x*4) = *(int*)(S + x_ofs[x]);
break;
case 6:
for( x = 0; x < dsize.width; x++, D += 6 )
{
const ushort* _tS = (const ushort*)(S + x_ofs[x]);
ushort* _tD = (ushort*)D;
_tD[0] = _tS[0]; _tD[1] = _tS[1]; _tD[2] = _tS[2];
}
break;
case 8:
for( x = 0; x < dsize.width; x++, D += 8 )
{
const int* _tS = (const int*)(S + x_ofs[x]);
int* _tD = (int*)D;
_tD[0] = _tS[0]; _tD[1] = _tS[1];
}
break;
case 12:
for( x = 0; x < dsize.width; x++, D += 12 )
{
const int* _tS = (const int*)(S + x_ofs[x]);
int* _tD = (int*)D;
_tD[0] = _tS[0]; _tD[1] = _tS[1]; _tD[2] = _tS[2];
}
break;
default:
for( x = 0; x < dsize.width; x++, D += pix_size )
{
const int* _tS = (const int*)(S + x_ofs[x]);
int* _tD = (int*)D;
for( int k = 0; k < pix_size4; k++ )
_tD[k] = _tS[k];
}
}
}
Range range(0, dsize.height);
resizeNNInvoker invoker(src, dst, x_ofs, pix_size4, ify);
parallel_for_(range, invoker);
}
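This is the pattern the whole commit follows: the former row loop moves into a ParallelLoopBody subclass whose operator() handles a half-open row Range, and parallel_for_ splits the full range across threads. A minimal self-contained sketch of the same idiom (illustrative; assumes the cv::parallel_for_ API from opencv2/core/core.hpp):

```cpp
#include <opencv2/core/core.hpp>

class FillRowsInvoker : public cv::ParallelLoopBody
{
public:
    FillRowsInvoker(cv::Mat& _dst) : dst(_dst) {}

    virtual void operator() (const cv::Range& range) const
    {
        for (int y = range.start; y < range.end; y++)
        {
            uchar* row = dst.ptr<uchar>(y);
            for (int x = 0; x < dst.cols; x++)
                row[x] = (uchar)(y & 0xFF);  // each row is written independently
        }
    }

private:
    cv::Mat& dst;
};

void fillRowsParallel(cv::Mat& dst)  // dst is assumed to be CV_8UC1
{
    cv::parallel_for_(cv::Range(0, dst.rows), FillRowsInvoker(dst));
}
```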
@@ -1092,6 +1119,82 @@ static inline int clip(int x, int a, int b)
static const int MAX_ESIZE=16;
template <typename HResize, typename VResize>
class resizeGeneric_Invoker :
public ParallelLoopBody
{
public:
typedef typename HResize::value_type T;
typedef typename HResize::buf_type WT;
typedef typename HResize::alpha_type AT;
resizeGeneric_Invoker(const Mat& _src, Mat &_dst, const int *_xofs, const int *_yofs,
const AT* _alpha, const AT* __beta, const Size& _ssize, const Size &_dsize,
int _ksize, int _xmin, int _xmax) :
ParallelLoopBody(), src(_src), dst(_dst), xofs(_xofs), yofs(_yofs),
alpha(_alpha), _beta(__beta), ssize(_ssize), dsize(_dsize),
ksize(_ksize), xmin(_xmin), xmax(_xmax)
{
}
virtual void operator() (const Range& range) const
{
int dy, cn = src.channels();
HResize hresize;
VResize vresize;
int bufstep = (int)alignSize(dsize.width, 16);
AutoBuffer<WT> _buffer(bufstep*ksize);
const T* srows[MAX_ESIZE]={0};
WT* rows[MAX_ESIZE]={0};
int prev_sy[MAX_ESIZE];
for(int k = 0; k < ksize; k++ )
{
prev_sy[k] = -1;
rows[k] = (WT*)_buffer + bufstep*k;
}
const AT* beta = _beta + ksize * range.start;
for( dy = range.start; dy < range.end; dy++, beta += ksize )
{
int sy0 = yofs[dy], k0=ksize, k1=0, ksize2 = ksize/2;
for(int k = 0; k < ksize; k++ )
{
int sy = clip(sy0 - ksize2 + 1 + k, 0, ssize.height);
for( k1 = std::max(k1, k); k1 < ksize; k1++ )
{
if( sy == prev_sy[k1] ) // if the sy-th row has been computed already, reuse it.
{
if( k1 > k )
memcpy( rows[k], rows[k1], bufstep*sizeof(rows[0][0]) );
break;
}
}
if( k1 == ksize )
k0 = std::min(k0, k); // remember the first row that needs to be computed
srows[k] = (T*)(src.data + src.step*sy);
prev_sy[k] = sy;
}
if( k0 < ksize )
hresize( (const T**)(srows + k0), (WT**)(rows + k0), ksize - k0, xofs, (const AT*)(alpha),
ssize.width, dsize.width, cn, xmin, xmax );
vresize( (const WT**)rows, (T*)(dst.data + dst.step*dy), beta, dsize.width );
}
}
private:
const Mat src;
Mat dst;
const int* xofs, *yofs;
const AT* alpha, *_beta;
const Size ssize, dsize;
const int ksize, xmin, xmax;
};
template<class HResize, class VResize>
static void resizeGeneric_( const Mat& src, Mat& dst,
const int* xofs, const void* _alpha,
@@ -1102,124 +1205,178 @@ static void resizeGeneric_( const Mat& src, Mat& dst,
typedef typename HResize::buf_type WT;
typedef typename HResize::alpha_type AT;
const AT* alpha = (const AT*)_alpha;
const AT* beta = (const AT*)_beta;
Size ssize = src.size(), dsize = dst.size();
int cn = src.channels();
ssize.width *= cn;
dsize.width *= cn;
int bufstep = (int)alignSize(dsize.width, 16);
AutoBuffer<WT> _buffer(bufstep*ksize);
const T* srows[MAX_ESIZE]={0};
WT* rows[MAX_ESIZE]={0};
int prev_sy[MAX_ESIZE];
int dy;
xmin *= cn;
xmax *= cn;
// image resize is a separable operation. In case of not too strong
Range range(0, dsize.height);
resizeGeneric_Invoker<HResize, VResize> invoker(src, dst, xofs, yofs, (const AT*)_alpha, beta,
ssize, dsize, ksize, xmin, xmax);
parallel_for_(range, invoker);
}
HResize hresize;
VResize vresize;
template <typename T, typename WT>
struct ResizeAreaFastNoVec
{
ResizeAreaFastNoVec(int /*_scale_x*/, int /*_scale_y*/,
int /*_cn*/, int /*_step*//*, const int**/ /*_ofs*/) { }
int operator() (const T* /*S*/, T* /*D*/, int /*w*/) const { return 0; }
};
for(int k = 0; k < ksize; k++ )
{
prev_sy[k] = -1;
rows[k] = (WT*)_buffer + bufstep*k;
template <typename T, typename WT>
struct ResizeAreaFast_2x2_8u
{
ResizeAreaFast_2x2_8u(int _scale_x, int _scale_y, int _cn, int _step/*, const int* _ofs*/) :
scale_x(_scale_x), scale_y(_scale_y), cn(_cn), step(_step)/*, ofs(_ofs)*/
{
fast_mode = scale_x == 2 && scale_y == 2 && (cn == 1 || cn == 3 || cn == 4);
}
// image resize is a separable operation. In case of not too strong
for( dy = 0; dy < dsize.height; dy++, beta += ksize )
int operator() (const T* S, T* D, int w) const
{
int sy0 = yofs[dy], k0=ksize, k1=0, ksize2 = ksize/2;
for(int k = 0; k < ksize; k++ )
if( !fast_mode )
return 0;
const T* nextS = S + step;
int dx = 0;
if (cn == 1)
for( ; dx < w; ++dx )
{
int index = dx*2;
D[dx] = (S[index] + S[index+1] + nextS[index] + nextS[index+1] + 2) >> 2;
}
else if (cn == 3)
for( ; dx < w; dx += 3 )
{
int index = dx*2;
D[dx] = (S[index] + S[index+3] + nextS[index] + nextS[index+3] + 2) >> 2;
D[dx+1] = (S[index+1] + S[index+4] + nextS[index+1] + nextS[index+4] + 2) >> 2;
D[dx+2] = (S[index+2] + S[index+5] + nextS[index+2] + nextS[index+5] + 2) >> 2;
}
else
{
int sy = clip(sy0 - ksize2 + 1 + k, 0, ssize.height);
for( k1 = std::max(k1, k); k1 < ksize; k1++ )
assert(cn == 4);
for( ; dx < w; dx += 4 )
{
if( sy == prev_sy[k1] ) // if the sy-th row has been computed already, reuse it.
{
if( k1 > k )
memcpy( rows[k], rows[k1], bufstep*sizeof(rows[0][0]) );
break;
}
int index = dx*2;
// for cn == 4 the horizontal neighbour is 4 elements away
D[dx] = (S[index] + S[index+4] + nextS[index] + nextS[index+4] + 2) >> 2;
D[dx+1] = (S[index+1] + S[index+5] + nextS[index+1] + nextS[index+5] + 2) >> 2;
D[dx+2] = (S[index+2] + S[index+6] + nextS[index+2] + nextS[index+6] + 2) >> 2;
D[dx+3] = (S[index+3] + S[index+7] + nextS[index+3] + nextS[index+7] + 2) >> 2;
}
if( k1 == ksize )
k0 = std::min(k0, k); // remember the first row that needs to be computed
srows[k] = (const T*)(src.data + src.step*sy);
prev_sy[k] = sy;
}
if( k0 < ksize )
hresize( srows + k0, rows + k0, ksize - k0, xofs, alpha,
ssize.width, dsize.width, cn, xmin, xmax );
vresize( (const WT**)rows, (T*)(dst.data + dst.step*dy), beta, dsize.width );
return dx;
}
}
private:
const int scale_x, scale_y;
const int cn;
bool fast_mode;
const int step;
};
template<typename T, typename WT>
static void resizeAreaFast_( const Mat& src, Mat& dst, const int* ofs, const int* xofs,
int scale_x, int scale_y )
template <typename T, typename WT, typename VecOp>
class resizeAreaFast_Invoker :
public ParallelLoopBody
{
Size ssize = src.size(), dsize = dst.size();
int cn = src.channels();
int dy, dx, k = 0;
int area = scale_x*scale_y;
float scale = 1.f/(scale_x*scale_y);
int dwidth1 = (ssize.width/scale_x)*cn;
dsize.width *= cn;
ssize.width *= cn;
for( dy = 0; dy < dsize.height; dy++ )
public:
resizeAreaFast_Invoker(const Mat &_src, Mat &_dst,
int _scale_x, int _scale_y, const int* _ofs, const int* _xofs) :
ParallelLoopBody(), src(_src), dst(_dst), scale_x(_scale_x),
scale_y(_scale_y), ofs(_ofs), xofs(_xofs)
{
T* D = (T*)(dst.data + dst.step*dy);
int sy0 = dy*scale_y, w = sy0 + scale_y <= ssize.height ? dwidth1 : 0;
if( sy0 >= ssize.height )
}
virtual void operator() (const Range& range) const
{
Size ssize = src.size(), dsize = dst.size();
int cn = src.channels();
int area = scale_x*scale_y;
float scale = 1.f/(area);
int dwidth1 = (ssize.width/scale_x)*cn;
dsize.width *= cn;
ssize.width *= cn;
int dy, dx, k = 0;
VecOp vop(scale_x, scale_y, src.channels(), src.step/*, area_ofs*/);
for( dy = range.start; dy < range.end; dy++ )
{
for( dx = 0; dx < dsize.width; dx++ )
D[dx] = 0;
continue;
}
T* D = (T*)(dst.data + dst.step*dy);
int sy0 = dy*scale_y;
int w = sy0 + scale_y <= ssize.height ? dwidth1 : 0;
if( sy0 >= ssize.height )
{
for( dx = 0; dx < dsize.width; dx++ )
D[dx] = 0;
continue;
}
for( dx = 0; dx < w; dx++ )
{
const T* S = (const T*)(src.data + src.step*sy0) + xofs[dx];
WT sum = 0;
k=0;
#if CV_ENABLE_UNROLLED
for( ; k <= area - 4; k += 4 )
sum += S[ofs[k]] + S[ofs[k+1]] + S[ofs[k+2]] + S[ofs[k+3]];
#endif
for( ; k < area; k++ )
sum += S[ofs[k]];
D[dx] = saturate_cast<T>(sum*scale);
}
dx = vop((const T*)(src.data + src.step * sy0), D, w);
for( ; dx < w; dx++ )
{
const T* S = (const T*)(src.data + src.step * sy0) + xofs[dx];
WT sum = 0;
k = 0;
#if CV_ENABLE_UNROLLED
for( ; k <= area - 4; k += 4 )
sum += S[ofs[k]] + S[ofs[k+1]] + S[ofs[k+2]] + S[ofs[k+3]];
#endif
for( ; k < area; k++ )
sum += S[ofs[k]];
for( ; dx < dsize.width; dx++ )
{
WT sum = 0;
int count = 0, sx0 = xofs[dx];
if( sx0 >= ssize.width )
D[dx] = 0;
D[dx] = saturate_cast<T>(sum * scale);
}
for( int sy = 0; sy < scale_y; sy++ )
for( ; dx < dsize.width; dx++ )
{
if( sy0 + sy >= ssize.height )
break;
const T* S = (const T*)(src.data + src.step*(sy0 + sy)) + sx0;
for( int sx = 0; sx < scale_x*cn; sx += cn )
WT sum = 0;
int count = 0, sx0 = xofs[dx];
if( sx0 >= ssize.width )
D[dx] = 0;
for( int sy = 0; sy < scale_y; sy++ )
{
if( sx0 + sx >= ssize.width )
if( sy0 + sy >= ssize.height )
break;
sum += S[sx];
count++;
const T* S = (const T*)(src.data + src.step*(sy0 + sy)) + sx0;
for( int sx = 0; sx < scale_x*cn; sx += cn )
{
if( sx0 + sx >= ssize.width )
break;
sum += S[sx];
count++;
}
}
}
D[dx] = saturate_cast<T>((float)sum/count); // D holds elements of type T
}
}
}
private:
const Mat src;
Mat dst;
const int scale_x, scale_y;
const int *ofs, *xofs;
};
template<typename T, typename WT, typename VecOp>
static void resizeAreaFast_( const Mat& src, Mat& dst, const int* ofs, const int* xofs,
int scale_x, int scale_y )
{
Range range(0, dst.rows);
resizeAreaFast_Invoker<T, WT, VecOp> invoker(src, dst, scale_x,
scale_y, ofs, xofs);
parallel_for_(range, invoker);
}
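As I read the code above, VecOp is a plug-in contract for optional vectorized kernels: operator() may process any prefix of a destination row and returns how many elements it handled, after which the scalar loop finishes the tail (ResizeAreaFastNoVec simply returns 0, so everything stays scalar). A hypothetical kernel following that contract:

```cpp
// Hypothetical VecOp-style kernel (not from the commit): decimates an
// even-length prefix of a single-channel row, then reports how far it got.
template <typename T>
struct DecimatePrefix
{
    DecimatePrefix(int /*scale_x*/, int /*scale_y*/, int /*cn*/, int /*step*/) {}

    int operator() (const T* S, T* D, int w) const
    {
        int dx = 0;
        for (; dx + 2 <= w; dx += 2)     // handle an even prefix only
        {
            D[dx] = S[dx * 2];
            D[dx + 1] = S[(dx + 1) * 2];
        }
        return dx;                       // the scalar loop resumes from here
    }
};
```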
struct DecimateAlpha
@@ -1228,222 +1385,260 @@ struct DecimateAlpha
float alpha;
};
template<typename T, typename WT>
static void resizeArea_( const Mat& src, Mat& dst, const DecimateAlpha* xofs, int xofs_count, double scale_y_)
template <typename T, typename WT>
class resizeArea_Invoker :
public ParallelLoopBody
{
Size ssize = src.size(), dsize = dst.size();
int cn = src.channels();
dsize.width *= cn;
AutoBuffer<WT> _buffer(dsize.width*2);
WT *buf = _buffer, *sum = buf + dsize.width;
int k, sy, dx, cur_dy = 0;
WT scale_y = (WT)scale_y_;
CV_Assert( cn <= 4 );
for( dx = 0; dx < dsize.width; dx++ )
buf[dx] = sum[dx] = 0;
for( sy = 0; sy < ssize.height; sy++ )
public:
resizeArea_Invoker(const Mat& _src, Mat& _dst, const DecimateAlpha* _xofs,
int _xofs_count, double _scale_y_
#ifdef HAVE_TBB
, const int* _yofs, const int* _cur_dy_ofs
#endif
) :
ParallelLoopBody(), src(_src), dst(_dst), xofs(_xofs),
xofs_count(_xofs_count), scale_y_(_scale_y_)
#ifdef HAVE_TBB
, yofs(_yofs), cur_dy_ofs(_cur_dy_ofs)
#endif
{
const T* S = (const T*)(src.data + src.step*sy);
if( cn == 1 )
for( k = 0; k < xofs_count; k++ )
{
int dxn = xofs[k].di;
WT alpha = xofs[k].alpha;
buf[dxn] += S[xofs[k].si]*alpha;
}
else if( cn == 2 )
for( k = 0; k < xofs_count; k++ )
{
int sxn = xofs[k].si;
int dxn = xofs[k].di;
WT alpha = xofs[k].alpha;
WT t0 = buf[dxn] + S[sxn]*alpha;
WT t1 = buf[dxn+1] + S[sxn+1]*alpha;
buf[dxn] = t0; buf[dxn+1] = t1;
}
else if( cn == 3 )
for( k = 0; k < xofs_count; k++ )
{
int sxn = xofs[k].si;
int dxn = xofs[k].di;
WT alpha = xofs[k].alpha;
WT t0 = buf[dxn] + S[sxn]*alpha;
WT t1 = buf[dxn+1] + S[sxn+1]*alpha;
WT t2 = buf[dxn+2] + S[sxn+2]*alpha;
buf[dxn] = t0; buf[dxn+1] = t1; buf[dxn+2] = t2;
}
else
for( k = 0; k < xofs_count; k++ )
{
int sxn = xofs[k].si;
int dxn = xofs[k].di;
WT alpha = xofs[k].alpha;
WT t0 = buf[dxn] + S[sxn]*alpha;
WT t1 = buf[dxn+1] + S[sxn+1]*alpha;
buf[dxn] = t0; buf[dxn+1] = t1;
t0 = buf[dxn+2] + S[sxn+2]*alpha;
t1 = buf[dxn+3] + S[sxn+3]*alpha;
buf[dxn+2] = t0; buf[dxn+3] = t1;
}
if( (cur_dy + 1)*scale_y <= sy + 1 || sy == ssize.height - 1 )
}
virtual void operator() (const Range& range) const
{
Size ssize = src.size(), dsize = dst.size();
int cn = src.channels();
dsize.width *= cn;
AutoBuffer<WT> _buffer(dsize.width*2);
WT *buf = _buffer, *sum = buf + dsize.width;
int k, sy, dx, cur_dy = 0, num = sizeof(WT) * dsize.width;
WT scale_y = (WT)scale_y_;
CV_Assert( cn <= 4 );
memset(buf, 0, num * 2);
#ifdef HAVE_TBB
sy = yofs[range.start];
cur_dy = cur_dy_ofs[sy];
for( ; sy < range.start; sy++ )
{
WT beta = std::max(sy + 1 - (cur_dy+1)*scale_y, (WT)0);
WT beta1 = 1 - beta;
T* D = (T*)(dst.data + dst.step*cur_dy);
if( fabs(beta) < 1e-3 )
const T* S = (const T*)(src.data + src.step * sy);
memset(buf, 0, num);
if( cn == 1 )
for( k = 0; k < xofs_count; k++ )
{
int dxn = xofs[k].di;
WT alpha = xofs[k].alpha;
buf[dxn] += S[xofs[k].si]*alpha;
}
else if( cn == 2 )
for( k = 0; k < xofs_count; k++ )
{
int sxn = xofs[k].si;
int dxn = xofs[k].di;
WT alpha = xofs[k].alpha;
WT t0 = buf[dxn] + S[sxn]*alpha;
WT t1 = buf[dxn+1] + S[sxn+1]*alpha;
buf[dxn] = t0; buf[dxn+1] = t1;
}
else if( cn == 3 )
for( k = 0; k < xofs_count; k++ )
{
int sxn = xofs[k].si;
int dxn = xofs[k].di;
WT alpha = xofs[k].alpha;
WT t0 = buf[dxn] + S[sxn]*alpha;
WT t1 = buf[dxn+1] + S[sxn+1]*alpha;
WT t2 = buf[dxn+2] + S[sxn+2]*alpha;
buf[dxn] = t0; buf[dxn+1] = t1; buf[dxn+2] = t2;
}
else
for( k = 0; k < xofs_count; k++ )
{
int sxn = xofs[k].si;
int dxn = xofs[k].di;
WT alpha = xofs[k].alpha;
WT t0 = buf[dxn] + S[sxn]*alpha;
WT t1 = buf[dxn+1] + S[sxn+1]*alpha;
buf[dxn] = t0; buf[dxn+1] = t1;
t0 = buf[dxn+2] + S[sxn+2]*alpha;
t1 = buf[dxn+3] + S[sxn+3]*alpha;
buf[dxn+2] = t0; buf[dxn+3] = t1;
}
if( (cur_dy + 1)*scale_y <= sy + 1 || sy == ssize.height - 1 )
{
if(cur_dy >= dsize.height) return;
for( dx = 0; dx < dsize.width; dx++ )
WT beta = std::max(sy + 1 - (cur_dy + 1) * scale_y, (WT)0);
if( fabs(beta) < 1e-3 )
{
D[dx] = saturate_cast<T>((sum[dx] + buf[dx]) / min(scale_y, src.rows - cur_dy * scale_y));
sum[dx] = buf[dx] = 0;
if(cur_dy >= dsize.height)
break;
memset(sum, 0, num);
}
else
for( dx = 0; dx < dsize.width; dx++ )
sum[dx] = buf[dx] * beta;
cur_dy++;
}
else
for( dx = 0; dx < dsize.width; dx++ )
{
for( dx = 0; dx <= dsize.width - 2; dx += 2 )
{
D[dx] = saturate_cast<T>((sum[dx] + buf[dx]* beta1)/ min(scale_y, src.rows - cur_dy*scale_y));
sum[dx] = buf[dx]*beta;
buf[dx] = 0;
WT t0 = sum[dx] + buf[dx];
WT t1 = sum[dx+1] + buf[dx+1];
sum[dx] = t0; sum[dx+1] = t1;
}
cur_dy++;
for( ; dx < dsize.width; dx++ )
sum[dx] += buf[dx];
}
}
else
#endif
for( sy = range.start; sy < range.end; sy++ )
{
for( dx = 0; dx <= dsize.width - 2; dx += 2 )
const T* S = (const T*)(src.data + src.step * sy);
memset(buf, 0, num);
if( cn == 1 )
for( k = 0; k < xofs_count; k++ )
{
int dxn = xofs[k].di;
WT alpha = xofs[k].alpha;
buf[dxn] += S[xofs[k].si]*alpha;
}
else if( cn == 2 )
for( k = 0; k < xofs_count; k++ )
{
int sxn = xofs[k].si;
int dxn = xofs[k].di;
WT alpha = xofs[k].alpha;
WT t0 = buf[dxn] + S[sxn]*alpha;
WT t1 = buf[dxn+1] + S[sxn+1]*alpha;
buf[dxn] = t0; buf[dxn+1] = t1;
}
else if( cn == 3 )
for( k = 0; k < xofs_count; k++ )
{
int sxn = xofs[k].si;
int dxn = xofs[k].di;
WT alpha = xofs[k].alpha;
WT t0 = buf[dxn] + S[sxn]*alpha;
WT t1 = buf[dxn+1] + S[sxn+1]*alpha;
WT t2 = buf[dxn+2] + S[sxn+2]*alpha;
buf[dxn] = t0; buf[dxn+1] = t1; buf[dxn+2] = t2;
}
else
for( k = 0; k < xofs_count; k++ )
{
int sxn = xofs[k].si;
int dxn = xofs[k].di;
WT alpha = xofs[k].alpha;
WT t0 = buf[dxn] + S[sxn]*alpha;
WT t1 = buf[dxn+1] + S[sxn+1]*alpha;
buf[dxn] = t0; buf[dxn+1] = t1;
t0 = buf[dxn+2] + S[sxn+2]*alpha;
t1 = buf[dxn+3] + S[sxn+3]*alpha;
buf[dxn+2] = t0; buf[dxn+3] = t1;
}
if( (cur_dy + 1)*scale_y <= sy + 1 || sy == ssize.height - 1 )
{
WT t0 = sum[dx] + buf[dx];
WT t1 = sum[dx+1] + buf[dx+1];
sum[dx] = t0; sum[dx+1] = t1;
buf[dx] = buf[dx+1] = 0;
WT beta = std::max(sy + 1 - (cur_dy + 1) * scale_y, (WT)0);
T* D = (T*)(dst.data + dst.step*cur_dy);
if( fabs(beta) < 1e-3 )
{
if(cur_dy >= dsize.height)
return;
for( dx = 0; dx < dsize.width; dx++ )
D[dx] = saturate_cast<T>((sum[dx] + buf[dx]) / min(scale_y, src.rows - cur_dy * scale_y));
memset(sum, 0, num);
}
else
{
WT beta1 = 1 - beta;
for( dx = 0; dx < dsize.width; dx++ )
{
D[dx] = saturate_cast<T>((sum[dx] + buf[dx] * beta1)/ min(scale_y, src.rows - cur_dy * scale_y));
sum[dx] = buf[dx] * beta;
}
}
cur_dy++;
}
for( ; dx < dsize.width; dx++ )
else
{
sum[dx] += buf[dx];
buf[dx] = 0;
for( dx = 0; dx <= dsize.width - 2; dx += 2 )
{
WT t0 = sum[dx] + buf[dx];
WT t1 = sum[dx+1] + buf[dx+1];
sum[dx] = t0; sum[dx+1] = t1;
}
for( ; dx < dsize.width; dx++ )
sum[dx] += buf[dx];
}
}
}
}
static void resizeAreaFast_8u( const Mat& src, Mat& dst,
const int* ofs, const int* xofs,
int scale_x, int scale_y )
{
#if CV_SSE2
bool haveSSE2 = checkHardwareSupport(CV_CPU_SSE2);
private:
const Mat src;
Mat dst;
const DecimateAlpha* xofs;
const int xofs_count;
const double scale_y_;
#ifdef HAVE_TBB
const int *yofs, *cur_dy_ofs;
#endif
};
template<typename T, typename WT>
static void resizeArea_( const Mat& src, Mat& dst, const DecimateAlpha* xofs, int xofs_count, double scale_y_)
{
#ifdef HAVE_TBB
Size ssize = src.size(), dsize = dst.size();
int cn = src.channels();
int dy, dx, k = 0;
int area = scale_x*scale_y;
float scale = 1.f/(scale_x*scale_y);
int dwidth1 = (ssize.width/scale_x)*cn;
dsize.width *= cn;
ssize.width *= cn;
//avg values
for( dy = 0; dy < dsize.height; dy++ )
AutoBuffer<int> _yofs(2 * ssize.height);
int *yofs = _yofs, *cur_dy_ofs = _yofs + ssize.height;
int index = 0, cur_dy = 0, sy;
for( sy = 0; sy < ssize.height; sy++ )
{
uchar* D = (uchar*)(dst.data + dst.step*dy);
int sy0 = dy*scale_y, w = sy0 + scale_y <= ssize.height ? dwidth1 : 0;
if( sy0 >= ssize.height )
{
for( dx = 0; dx < dsize.width; dx++ ) //memset(D,0, dsize.width);//warning, never executed -> not tested
D[dx] = 0;
continue;
}
dx = 0;
#if CV_SSE2
if( haveSSE2 )
bool reset = false;
cur_dy_ofs[sy] = cur_dy;
if( (cur_dy + 1)*scale_y_ <= sy + 1 || sy == ssize.height - 1 )
{
const __m128 _scale = _mm_set1_ps(scale);
const __m128i _ucMAXs = _mm_set1_epi16(UCHAR_MAX);
const uchar* _S[8];
for(; dx < w-8; dx+=8 )
WT beta = std::max(sy + 1 - (cur_dy+1)*scale_y_, 0.);
if( fabs(beta) < 1e-3 )
{
__m128i _sum = _mm_setzero_si128();
__m128i _sum1 = _mm_setzero_si128();
_S[0] = (const uchar*)(src.data + src.step*sy0) + xofs[dx];
_S[1] = (const uchar*)(src.data + src.step*sy0) + xofs[dx+1];
_S[2] = (const uchar*)(src.data + src.step*sy0) + xofs[dx+2];
_S[3] = (const uchar*)(src.data + src.step*sy0) + xofs[dx+3];
_S[4] = (const uchar*)(src.data + src.step*sy0) + xofs[dx+4];
_S[5] = (const uchar*)(src.data + src.step*sy0) + xofs[dx+5];
_S[6] = (const uchar*)(src.data + src.step*sy0) + xofs[dx+6];
_S[7] = (const uchar*)(src.data + src.step*sy0) + xofs[dx+7];
for( k = 0; k < area; k++ )
{
int ofsk = ofs[k];
__m128i _temp = _mm_set_epi32(_S[3][ofsk],_S[2][ofsk],_S[1][ofsk],_S[0][ofsk]);
_sum = _mm_add_epi32(_sum, _temp);
__m128i _temp1 = _mm_set_epi32(_S[7][ofsk],_S[6][ofsk],_S[5][ofsk],_S[4][ofsk]);
_sum1 = _mm_add_epi32(_sum1, _temp1);
}
__m128i _tempSum = _mm_cvtps_epi32(_mm_mul_ps(_mm_cvtepi32_ps(_sum), _scale));
__m128i _tempSum1 = _mm_cvtps_epi32(_mm_mul_ps(_mm_cvtepi32_ps(_sum1), _scale));
_tempSum = _mm_packs_epi32(_tempSum, _tempSum1);
_tempSum = _mm_min_epi16(_ucMAXs, _tempSum);
_tempSum = _mm_packus_epi16(_tempSum, _tempSum);
_mm_storel_epi64((__m128i*)(D+dx),_tempSum);
}
}
#endif
for(; dx < w; dx++ )
{
const uchar* S = (const uchar*)(src.data + src.step*sy0) + xofs[dx];
int sum = 0;
k=0;
#if CV_ENABLE_UNROLLED
for( ; k <= area - 4; k += 4 )
sum += S[ofs[k]] + S[ofs[k+1]] + S[ofs[k+2]] + S[ofs[k+3]];
#endif
for( ; k < area; k++ )
sum += S[ofs[k]];
D[dx] = saturate_cast<uchar>(sum*scale);
}
for( ; dx < dsize.width; dx++ )
{
int sum = 0;
int count = 0, sx0 = xofs[dx];
if( sx0 >= ssize.width )
D[dx] = 0;
for( int sy = 0; sy < scale_y; sy++ )
{
if( sy0 + sy >= ssize.height )
if(cur_dy >= dsize.height)
break;
const uchar* S = (const uchar*)(src.data + src.step*(sy0 + sy)) + sx0;
int sx = 0;
for( ; sx < scale_x*cn; sx += cn )
{
if( sx0 + sx >= ssize.width )
break;
sum += S[sx];
count++;
}
reset = true;
}
D[dx] = saturate_cast<uchar>((float)sum/count);
cur_dy++;
}
yofs[sy] = index;
if (reset)
index = sy + 1;
}
#endif
Range range(0, src.rows);
resizeArea_Invoker<T, WT> invoker(src, dst, xofs, xofs_count, scale_y_
#ifdef HAVE_TBB
, yofs, cur_dy_ofs
#endif
);
parallel_for_(range, invoker);
}
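resizeArea_ carries running sums across source rows, so a chunk cannot simply start at an arbitrary row. As I read the HAVE_TBB branch, the dispatcher pre-scans all rows once, recording for each possible start where the accumulator was last reset (yofs) and which output row was current there (cur_dy_ofs); each parallel chunk then replays rows from that reset point to rebuild its state without emitting output. A toy sketch of the replay idea (illustrative only, my paraphrase):

```cpp
#include <opencv2/core/core.hpp>
#include <vector>

// replayFrom[r]: first row whose contribution is still pending at row r.
void accumulateChunk(const cv::Range& range, const std::vector<int>& replayFrom,
                     const std::vector<float>& rowValue, std::vector<float>& out)
{
    float sum = 0.f;
    for (int r = replayFrom[range.start]; r < range.start; r++)
        sum += rowValue[r];        // rebuild accumulator state, write nothing
    for (int r = range.start; r < range.end; r++)
    {
        sum += rowValue[r];
        out[r] = sum;              // emit only inside this chunk's own range
    }
}
```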
@@ -1457,10 +1652,12 @@ typedef void (*ResizeAreaFastFunc)( const Mat& src, Mat& dst,
int scale_x, int scale_y );
typedef void (*ResizeAreaFunc)( const Mat& src, Mat& dst,
const DecimateAlpha* xofs, int xofs_count, double scale_y_);
const DecimateAlpha* xofs, int xofs_count,
double scale_y_);
}
//////////////////////////////////////////////////////////////////////////////////////////
void cv::resize( InputArray _src, OutputArray _dst, Size dsize,
@@ -1553,30 +1750,33 @@ void cv::resize( InputArray _src, OutputArray _dst, Size dsize,
static ResizeAreaFastFunc areafast_tab[] =
{
resizeAreaFast_8u, 0,
resizeAreaFast_<ushort, float>,
resizeAreaFast_<short, float>,
resizeAreaFast_<uchar, int, ResizeAreaFast_2x2_8u<uchar, int> >,
0,
resizeAreaFast_<ushort, float, ResizeAreaFastNoVec<ushort, float> >,
resizeAreaFast_<short, float, ResizeAreaFastNoVec<short, float> >,
0,
resizeAreaFast_<float, float>,
resizeAreaFast_<double, double>,
resizeAreaFast_<float, float, ResizeAreaFastNoVec<float, float> >,
resizeAreaFast_<double, double, ResizeAreaFastNoVec<double, double> >,
0
};
static ResizeAreaFunc area_tab[] =
{
resizeArea_<uchar, float>, 0, resizeArea_<ushort, float>, resizeArea_<short, float>,
0, resizeArea_<float, float>, resizeArea_<double, double>, 0
resizeArea_<uchar, float>, 0, resizeArea_<ushort, float>,
resizeArea_<short, float>, 0, resizeArea_<float, float>,
resizeArea_<double, double>, 0
};
Mat src = _src.getMat();
Size ssize = src.size();
CV_Assert( ssize.area() > 0 );
CV_Assert( !(dsize == Size()) || (inv_scale_x > 0 && inv_scale_y > 0) );
if( dsize == Size() )
CV_Assert( dsize.area() || (inv_scale_x > 0 && inv_scale_y > 0) );
if( !dsize.area() )
{
dsize = Size(saturate_cast<int>(src.cols*inv_scale_x),
saturate_cast<int>(src.rows*inv_scale_y));
CV_Assert( dsize.area() );
}
else
{
@@ -1601,79 +1801,92 @@ void cv::resize( InputArray _src, OutputArray _dst, Size dsize,
resizeNN( src, dst, inv_scale_x, inv_scale_y );
return;
}
// true "area" interpolation is only implemented for the case (scale_x <= 1 && scale_y <= 1).
// In other cases it is emulated using some variant of bilinear interpolation
if( interpolation == INTER_AREA && scale_x >= 1 && scale_y >= 1 )
{
int iscale_x = saturate_cast<int>(scale_x);
int iscale_y = saturate_cast<int>(scale_y);
if( std::abs(scale_x - iscale_x) < DBL_EPSILON &&
std::abs(scale_y - iscale_y) < DBL_EPSILON )
bool is_area_fast = std::abs(scale_x - iscale_x) < DBL_EPSILON &&
std::abs(scale_y - iscale_y) < DBL_EPSILON;
// when scale_x and scale_y are both equal to 2,
// fast INTER_AREA produces the same result as INTER_LINEAR
if( interpolation == INTER_LINEAR && is_area_fast &&
iscale_x == 2 && iscale_y == 2 )
interpolation = INTER_AREA;
// true "area" interpolation is only implemented for the case (scale_x <= 1 && scale_y <= 1).
// In other cases it is emulated using some variant of bilinear interpolation
if( interpolation == INTER_AREA && scale_x >= 1 && scale_y >= 1 )
{
int area = iscale_x*iscale_y;
size_t srcstep = src.step / src.elemSize1();
AutoBuffer<int> _ofs(area + dsize.width*cn);
int* ofs = _ofs;
int* xofs = ofs + area;
ResizeAreaFastFunc func = areafast_tab[depth];
CV_Assert( func != 0 );
for( sy = 0, k = 0; sy < iscale_y; sy++ )
for( sx = 0; sx < iscale_x; sx++ )
ofs[k++] = (int)(sy*srcstep + sx*cn);
for( dx = 0; dx < dsize.width; dx++ )
if( is_area_fast )
{
sx = dx*iscale_x*cn;
for( k = 0; k < cn; k++ )
xofs[dx*cn + k] = sx + k;
}
int area = iscale_x*iscale_y;
size_t srcstep = src.step / src.elemSize1();
AutoBuffer<int> _ofs(area + dsize.width*cn);
int* ofs = _ofs;
int* xofs = ofs + area;
ResizeAreaFastFunc func = areafast_tab[depth];
CV_Assert( func != 0 );
for( sy = 0, k = 0; sy < iscale_y; sy++ )
for( sx = 0; sx < iscale_x; sx++ )
ofs[k++] = (int)(sy*srcstep + sx*cn);
func( src, dst, ofs, xofs, iscale_x, iscale_y );
return;
}
for( dx = 0; dx < dsize.width; dx++ )
{
int j = dx * cn;
sx = iscale_x * j;
for( k = 0; k < cn; k++ )
xofs[j + k] = sx + k;
}
ResizeAreaFunc func = area_tab[depth];
CV_Assert( func != 0 && cn <= 4 );
func( src, dst, ofs, xofs, iscale_x, iscale_y );
return;
}
AutoBuffer<DecimateAlpha> _xofs(ssize.width*2);
DecimateAlpha* xofs = _xofs;
ResizeAreaFunc func = area_tab[depth];
CV_Assert( func != 0 && cn <= 4 );
for( dx = 0, k = 0; dx < dsize.width; dx++ )
{
double fsx1 = dx*scale_x, fsx2 = fsx1 + scale_x;
int sx1 = cvCeil(fsx1), sx2 = cvFloor(fsx2);
sx1 = std::min(sx1, ssize.width-1);
sx2 = std::min(sx2, ssize.width-1);
AutoBuffer<DecimateAlpha> _xofs(ssize.width*2);
DecimateAlpha* xofs = _xofs;
if( sx1 > fsx1 )
for( dx = 0, k = 0; dx < dsize.width; dx++ )
{
assert( k < ssize.width*2 );
xofs[k].di = dx*cn;
xofs[k].si = (sx1-1)*cn;
xofs[k++].alpha = (float)((sx1 - fsx1) / min(scale_x, src.cols - fsx1));
}
double fsx1 = dx*scale_x;
double fsx2 = fsx1 + scale_x;
int sx1 = cvCeil(fsx1), sx2 = cvFloor(fsx2);
sx1 = std::min(sx1, ssize.width-1);
sx2 = std::min(sx2, ssize.width-1);
for( sx = sx1; sx < sx2; sx++ )
{
assert( k < ssize.width*2 );
xofs[k].di = dx*cn;
xofs[k].si = sx*cn;
xofs[k++].alpha = float(1.0 / min(scale_x, src.cols - fsx1));
}
if( sx1 > fsx1 )
{
assert( k < ssize.width*2 );
xofs[k].di = dx*cn;
xofs[k].si = (sx1-1)*cn;
xofs[k++].alpha = (float)((sx1 - fsx1) / min(scale_x, src.cols - fsx1));
}
if( fsx2 - sx2 > 1e-3 )
{
assert( k < ssize.width*2 );
xofs[k].di = dx*cn;
xofs[k].si = sx2*cn;
xofs[k++].alpha = (float)(min(fsx2 - sx2, 1.) / min(scale_x, src.cols - fsx1));
for( sx = sx1; sx < sx2; sx++ )
{
assert( k < ssize.width*2 );
xofs[k].di = dx*cn;
xofs[k].si = sx*cn;
xofs[k++].alpha = float(1.0 / min(scale_x, src.cols - fsx1));
}
if( fsx2 - sx2 > 1e-3 )
{
assert( k < ssize.width*2 );
xofs[k].di = dx*cn;
xofs[k].si = sx2*cn;
xofs[k++].alpha = (float)(min(fsx2 - sx2, 1.) / min(scale_x, src.cols - fsx1));
}
}
func( src, dst, xofs, k, scale_y);
return;
}
func( src, dst, xofs, k ,scale_y);
return;
}
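A worked example of the weight table built above (my arithmetic, for illustration): with scale_x = 2.4, destination column dx = 1 covers the source span [2.4, 4.8), so sx1 = cvCeil(2.4) = 3 and sx2 = cvFloor(4.8) = 4. The loop then emits three DecimateAlpha entries: a left partial column 2 with alpha = (3 - 2.4)/2.4 = 0.25, a full column 3 with alpha = 1/2.4 ≈ 0.4167, and a right partial column 4 with alpha = (4.8 - 4)/2.4 ≈ 0.3333. The alphas sum to 1, so resizeArea_ computes a true box average over the span.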
int xmin = 0, xmax = dsize.width, width = dsize.width*cn;
@@ -2549,6 +2762,206 @@ typedef void (*RemapFunc)(const Mat& _src, Mat& _dst, const Mat& _xy,
const Mat& _fxy, const void* _wtab,
int borderType, const Scalar& _borderValue);
class remapInvoker :
public ParallelLoopBody
{
public:
remapInvoker(const Mat& _src, Mat _dst, const Mat& _map1, const Mat& _map2, const Mat *_m1,
const Mat *_m2, int _interpolation, int _borderType, const Scalar &_borderValue,
int _planar_input, RemapNNFunc _nnfunc, RemapFunc _ifunc, const void *_ctab) :
ParallelLoopBody(), src(_src), dst(_dst), map1(_map1), map2(_map2), m1(_m1), m2(_m2),
interpolation(_interpolation), borderType(_borderType), borderValue(_borderValue),
planar_input(_planar_input), nnfunc(_nnfunc), ifunc(_ifunc), ctab(_ctab)
{
}
virtual void operator() (const Range& range) const
{
int x, y, x1, y1;
const int buf_size = 1 << 14;
int brows0 = std::min(128, dst.rows), map_depth = map1.depth();
int bcols0 = std::min(buf_size/brows0, dst.cols);
brows0 = std::min(buf_size/bcols0, dst.rows);
#if CV_SSE2
bool useSIMD = checkHardwareSupport(CV_CPU_SSE2);
#endif
Mat _bufxy(brows0, bcols0, CV_16SC2), _bufa;
if( !nnfunc )
_bufa.create(brows0, bcols0, CV_16UC1);
for( y = range.start; y < range.end; y += brows0 )
{
for( x = 0; x < dst.cols; x += bcols0 )
{
int brows = std::min(brows0, range.end - y);
int bcols = std::min(bcols0, dst.cols - x);
Mat dpart(dst, Rect(x, y, bcols, brows));
Mat bufxy(_bufxy, Rect(0, 0, bcols, brows));
if( nnfunc )
{
if( map1.type() == CV_16SC2 && !map2.data ) // the data is already in the right format
bufxy = map1(Rect(x, y, bcols, brows));
else if( map_depth != CV_32F )
{
for( y1 = 0; y1 < brows; y1++ )
{
short* XY = (short*)(bufxy.data + bufxy.step*y1);
const short* sXY = (const short*)(m1->data + m1->step*(y+y1)) + x*2;
const ushort* sA = (const ushort*)(m2->data + m2->step*(y+y1)) + x;
for( x1 = 0; x1 < bcols; x1++ )
{
int a = sA[x1] & (INTER_TAB_SIZE2-1);
XY[x1*2] = sXY[x1*2] + NNDeltaTab_i[a][0];
XY[x1*2+1] = sXY[x1*2+1] + NNDeltaTab_i[a][1];
}
}
}
else if( !planar_input )
map1(Rect(x, y, bcols, brows)).convertTo(bufxy, bufxy.depth());
else
{
for( y1 = 0; y1 < brows; y1++ )
{
short* XY = (short*)(bufxy.data + bufxy.step*y1);
const float* sX = (const float*)(map1.data + map1.step*(y+y1)) + x;
const float* sY = (const float*)(map2.data + map2.step*(y+y1)) + x;
x1 = 0;
#if CV_SSE2
if( useSIMD )
{
for( ; x1 <= bcols - 8; x1 += 8 )
{
__m128 fx0 = _mm_loadu_ps(sX + x1);
__m128 fx1 = _mm_loadu_ps(sX + x1 + 4);
__m128 fy0 = _mm_loadu_ps(sY + x1);
__m128 fy1 = _mm_loadu_ps(sY + x1 + 4);
__m128i ix0 = _mm_cvtps_epi32(fx0);
__m128i ix1 = _mm_cvtps_epi32(fx1);
__m128i iy0 = _mm_cvtps_epi32(fy0);
__m128i iy1 = _mm_cvtps_epi32(fy1);
ix0 = _mm_packs_epi32(ix0, ix1);
iy0 = _mm_packs_epi32(iy0, iy1);
ix1 = _mm_unpacklo_epi16(ix0, iy0);
iy1 = _mm_unpackhi_epi16(ix0, iy0);
_mm_storeu_si128((__m128i*)(XY + x1*2), ix1);
_mm_storeu_si128((__m128i*)(XY + x1*2 + 8), iy1);
}
}
#endif
for( ; x1 < bcols; x1++ )
{
XY[x1*2] = saturate_cast<short>(sX[x1]);
XY[x1*2+1] = saturate_cast<short>(sY[x1]);
}
}
}
nnfunc( src, dpart, bufxy, borderType, borderValue );
continue;
}
Mat bufa(_bufa, Rect(0, 0, bcols, brows));
for( y1 = 0; y1 < brows; y1++ )
{
short* XY = (short*)(bufxy.data + bufxy.step*y1);
ushort* A = (ushort*)(bufa.data + bufa.step*y1);
if( (map1.type() == CV_16SC2 && (map2.type() == CV_16UC1 || map2.type() == CV_16SC1)) ||
(map2.type() == CV_16SC2 && (map1.type() == CV_16UC1 || map1.type() == CV_16SC1)) )
{
bufxy = m1->operator()(Rect(x, y, bcols, brows));
bufa = m2->operator()(Rect(x, y, bcols, brows));
}
else if( planar_input )
{
const float* sX = (const float*)(map1.data + map1.step*(y+y1)) + x;
const float* sY = (const float*)(map2.data + map2.step*(y+y1)) + x;
x1 = 0;
#if CV_SSE2
if( useSIMD )
{
__m128 scale = _mm_set1_ps((float)INTER_TAB_SIZE);
__m128i mask = _mm_set1_epi32(INTER_TAB_SIZE-1);
for( ; x1 <= bcols - 8; x1 += 8 )
{
__m128 fx0 = _mm_loadu_ps(sX + x1);
__m128 fx1 = _mm_loadu_ps(sX + x1 + 4);
__m128 fy0 = _mm_loadu_ps(sY + x1);
__m128 fy1 = _mm_loadu_ps(sY + x1 + 4);
__m128i ix0 = _mm_cvtps_epi32(_mm_mul_ps(fx0, scale));
__m128i ix1 = _mm_cvtps_epi32(_mm_mul_ps(fx1, scale));
__m128i iy0 = _mm_cvtps_epi32(_mm_mul_ps(fy0, scale));
__m128i iy1 = _mm_cvtps_epi32(_mm_mul_ps(fy1, scale));
__m128i mx0 = _mm_and_si128(ix0, mask);
__m128i mx1 = _mm_and_si128(ix1, mask);
__m128i my0 = _mm_and_si128(iy0, mask);
__m128i my1 = _mm_and_si128(iy1, mask);
mx0 = _mm_packs_epi32(mx0, mx1);
my0 = _mm_packs_epi32(my0, my1);
my0 = _mm_slli_epi16(my0, INTER_BITS);
mx0 = _mm_or_si128(mx0, my0);
_mm_storeu_si128((__m128i*)(A + x1), mx0);
ix0 = _mm_srai_epi32(ix0, INTER_BITS);
ix1 = _mm_srai_epi32(ix1, INTER_BITS);
iy0 = _mm_srai_epi32(iy0, INTER_BITS);
iy1 = _mm_srai_epi32(iy1, INTER_BITS);
ix0 = _mm_packs_epi32(ix0, ix1);
iy0 = _mm_packs_epi32(iy0, iy1);
ix1 = _mm_unpacklo_epi16(ix0, iy0);
iy1 = _mm_unpackhi_epi16(ix0, iy0);
_mm_storeu_si128((__m128i*)(XY + x1*2), ix1);
_mm_storeu_si128((__m128i*)(XY + x1*2 + 8), iy1);
}
}
#endif
for( ; x1 < bcols; x1++ )
{
int sx = cvRound(sX[x1]*INTER_TAB_SIZE);
int sy = cvRound(sY[x1]*INTER_TAB_SIZE);
int v = (sy & (INTER_TAB_SIZE-1))*INTER_TAB_SIZE + (sx & (INTER_TAB_SIZE-1));
XY[x1*2] = (short)(sx >> INTER_BITS);
XY[x1*2+1] = (short)(sy >> INTER_BITS);
A[x1] = (ushort)v;
}
}
else
{
const float* sXY = (const float*)(map1.data + map1.step*(y+y1)) + x*2;
for( x1 = 0; x1 < bcols; x1++ )
{
int sx = cvRound(sXY[x1*2]*INTER_TAB_SIZE);
int sy = cvRound(sXY[x1*2+1]*INTER_TAB_SIZE);
int v = (sy & (INTER_TAB_SIZE-1))*INTER_TAB_SIZE + (sx & (INTER_TAB_SIZE-1));
XY[x1*2] = (short)(sx >> INTER_BITS);
XY[x1*2+1] = (short)(sy >> INTER_BITS);
A[x1] = (ushort)v;
}
}
}
ifunc(src, dpart, bufxy, bufa, ctab, borderType, borderValue);
}
}
}
private:
const Mat src;
Mat dst;
const Mat map1, map2, *m1, *m2;
int interpolation, borderType;
const Scalar borderValue;
int planar_input;
RemapNNFunc nnfunc;
RemapFunc ifunc;
const void *ctab;
};
}
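remapInvoker processes the destination in tiles rather than whole rows: each block is capped at 1 << 14 = 16384 pixels so the per-block CV_16SC2 coordinate buffer and CV_16UC1 interpolation-tap buffer stay small. A condensed sketch of the tile-size arithmetic from the top of operator() (paraphrased):

```cpp
#include <algorithm>

void remapTileSize(int rows, int cols, int& brows0, int& bcols0)
{
    const int buf_size = 1 << 14;               // at most 16384 pixels per tile
    brows0 = std::min(128, rows);               // start from at most 128 rows
    bcols0 = std::min(buf_size / brows0, cols); // widen to fill the budget
    brows0 = std::min(buf_size / bcols0, rows); // re-balance the row count
}
```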
void cv::remap( InputArray _src, OutputArray _dst,
@@ -2590,14 +3003,15 @@ void cv::remap( InputArray _src, OutputArray _dst,
Mat src = _src.getMat(), map1 = _map1.getMat(), map2 = _map2.getMat();
CV_Assert( (!map2.data || map2.size() == map1.size()));
CV_Assert( map1.size().area() > 0 );
CV_Assert( !map2.data || (map2.size() == map1.size()));
_dst.create( map1.size(), src.type() );
Mat dst = _dst.getMat();
if( dst.data == src.data )
src = src.clone();
int depth = src.depth(), map_depth = map1.depth();
int depth = src.depth();
RemapNNFunc nnfunc = 0;
RemapFunc ifunc = 0;
const void* ctab = 0;
@@ -2608,12 +3022,6 @@ void cv::remap( InputArray _src, OutputArray _dst,
{
nnfunc = nn_tab[depth];
CV_Assert( nnfunc != 0 );
if( map1.type() == CV_16SC2 && !map2.data ) // the data is already in the right format
{
nnfunc( src, dst, map1, borderType, borderValue );
return;
}
}
else
{
@@ -2639,182 +3047,19 @@ void cv::remap( InputArray _src, OutputArray _dst,
{
if( map1.type() != CV_16SC2 )
std::swap(m1, m2);
if( ifunc )
{
ifunc( src, dst, *m1, *m2, ctab, borderType, borderValue );
return;
}
}
else
{
CV_Assert( (map1.type() == CV_32FC2 && !map2.data) ||
CV_Assert( ((map1.type() == CV_32FC2 || map1.type() == CV_16SC2) && !map2.data) ||
(map1.type() == CV_32FC1 && map2.type() == CV_32FC1) );
planar_input = map1.channels() == 1;
}
int x, y, x1, y1;
const int buf_size = 1 << 14;
int brows0 = std::min(128, dst.rows);
int bcols0 = std::min(buf_size/brows0, dst.cols);
brows0 = std::min(buf_size/bcols0, dst.rows);
#if CV_SSE2
bool useSIMD = checkHardwareSupport(CV_CPU_SSE2);
#endif
Mat _bufxy(brows0, bcols0, CV_16SC2), _bufa;
if( !nnfunc )
_bufa.create(brows0, bcols0, CV_16UC1);
for( y = 0; y < dst.rows; y += brows0 )
{
for( x = 0; x < dst.cols; x += bcols0 )
{
int brows = std::min(brows0, dst.rows - y);
int bcols = std::min(bcols0, dst.cols - x);
Mat dpart(dst, Rect(x, y, bcols, brows));
Mat bufxy(_bufxy, Rect(0, 0, bcols, brows));
if( nnfunc )
{
if( map_depth != CV_32F )
{
for( y1 = 0; y1 < brows; y1++ )
{
short* XY = (short*)(bufxy.data + bufxy.step*y1);
const short* sXY = (const short*)(m1->data + m1->step*(y+y1)) + x*2;
const ushort* sA = (const ushort*)(m2->data + m2->step*(y+y1)) + x;
for( x1 = 0; x1 < bcols; x1++ )
{
int a = sA[x1] & (INTER_TAB_SIZE2-1);
XY[x1*2] = sXY[x1*2] + NNDeltaTab_i[a][0];
XY[x1*2+1] = sXY[x1*2+1] + NNDeltaTab_i[a][1];
}
}
}
else if( !planar_input )
map1(Rect(0,0,bcols,brows)).convertTo(bufxy, bufxy.depth());
else
{
for( y1 = 0; y1 < brows; y1++ )
{
short* XY = (short*)(bufxy.data + bufxy.step*y1);
const float* sX = (const float*)(map1.data + map1.step*(y+y1)) + x;
const float* sY = (const float*)(map2.data + map2.step*(y+y1)) + x;
x1 = 0;
#if CV_SSE2
if( useSIMD )
{
for( ; x1 <= bcols - 8; x1 += 8 )
{
__m128 fx0 = _mm_loadu_ps(sX + x1);
__m128 fx1 = _mm_loadu_ps(sX + x1 + 4);
__m128 fy0 = _mm_loadu_ps(sY + x1);
__m128 fy1 = _mm_loadu_ps(sY + x1 + 4);
__m128i ix0 = _mm_cvtps_epi32(fx0);
__m128i ix1 = _mm_cvtps_epi32(fx1);
__m128i iy0 = _mm_cvtps_epi32(fy0);
__m128i iy1 = _mm_cvtps_epi32(fy1);
ix0 = _mm_packs_epi32(ix0, ix1);
iy0 = _mm_packs_epi32(iy0, iy1);
ix1 = _mm_unpacklo_epi16(ix0, iy0);
iy1 = _mm_unpackhi_epi16(ix0, iy0);
_mm_storeu_si128((__m128i*)(XY + x1*2), ix1);
_mm_storeu_si128((__m128i*)(XY + x1*2 + 8), iy1);
}
}
#endif
for( ; x1 < bcols; x1++ )
{
XY[x1*2] = saturate_cast<short>(sX[x1]);
XY[x1*2+1] = saturate_cast<short>(sY[x1]);
}
}
}
nnfunc( src, dpart, bufxy, borderType, borderValue );
continue;
}
Mat bufa(_bufa, Rect(0,0,bcols, brows));
for( y1 = 0; y1 < brows; y1++ )
{
short* XY = (short*)(bufxy.data + bufxy.step*y1);
ushort* A = (ushort*)(bufa.data + bufa.step*y1);
if( planar_input )
{
const float* sX = (const float*)(map1.data + map1.step*(y+y1)) + x;
const float* sY = (const float*)(map2.data + map2.step*(y+y1)) + x;
x1 = 0;
#if CV_SSE2
if( useSIMD )
{
__m128 scale = _mm_set1_ps((float)INTER_TAB_SIZE);
__m128i mask = _mm_set1_epi32(INTER_TAB_SIZE-1);
for( ; x1 <= bcols - 8; x1 += 8 )
{
__m128 fx0 = _mm_loadu_ps(sX + x1);
__m128 fx1 = _mm_loadu_ps(sX + x1 + 4);
__m128 fy0 = _mm_loadu_ps(sY + x1);
__m128 fy1 = _mm_loadu_ps(sY + x1 + 4);
__m128i ix0 = _mm_cvtps_epi32(_mm_mul_ps(fx0, scale));
__m128i ix1 = _mm_cvtps_epi32(_mm_mul_ps(fx1, scale));
__m128i iy0 = _mm_cvtps_epi32(_mm_mul_ps(fy0, scale));
__m128i iy1 = _mm_cvtps_epi32(_mm_mul_ps(fy1, scale));
__m128i mx0 = _mm_and_si128(ix0, mask);
__m128i mx1 = _mm_and_si128(ix1, mask);
__m128i my0 = _mm_and_si128(iy0, mask);
__m128i my1 = _mm_and_si128(iy1, mask);
mx0 = _mm_packs_epi32(mx0, mx1);
my0 = _mm_packs_epi32(my0, my1);
my0 = _mm_slli_epi16(my0, INTER_BITS);
mx0 = _mm_or_si128(mx0, my0);
_mm_storeu_si128((__m128i*)(A + x1), mx0);
ix0 = _mm_srai_epi32(ix0, INTER_BITS);
ix1 = _mm_srai_epi32(ix1, INTER_BITS);
iy0 = _mm_srai_epi32(iy0, INTER_BITS);
iy1 = _mm_srai_epi32(iy1, INTER_BITS);
ix0 = _mm_packs_epi32(ix0, ix1);
iy0 = _mm_packs_epi32(iy0, iy1);
ix1 = _mm_unpacklo_epi16(ix0, iy0);
iy1 = _mm_unpackhi_epi16(ix0, iy0);
_mm_storeu_si128((__m128i*)(XY + x1*2), ix1);
_mm_storeu_si128((__m128i*)(XY + x1*2 + 8), iy1);
}
}
#endif
for( ; x1 < bcols; x1++ )
{
int sx = cvRound(sX[x1]*INTER_TAB_SIZE);
int sy = cvRound(sY[x1]*INTER_TAB_SIZE);
int v = (sy & (INTER_TAB_SIZE-1))*INTER_TAB_SIZE + (sx & (INTER_TAB_SIZE-1));
XY[x1*2] = (short)(sx >> INTER_BITS);
XY[x1*2+1] = (short)(sy >> INTER_BITS);
A[x1] = (ushort)v;
}
}
else
{
const float* sXY = (const float*)(map1.data + map1.step*(y+y1)) + x*2;
for( x1 = 0; x1 < bcols; x1++ )
{
int sx = cvRound(sXY[x1*2]*INTER_TAB_SIZE);
int sy = cvRound(sXY[x1*2+1]*INTER_TAB_SIZE);
int v = (sy & (INTER_TAB_SIZE-1))*INTER_TAB_SIZE + (sx & (INTER_TAB_SIZE-1));
XY[x1*2] = (short)(sx >> INTER_BITS);
XY[x1*2+1] = (short)(sy >> INTER_BITS);
A[x1] = (ushort)v;
}
}
}
ifunc(src, dpart, bufxy, bufa, ctab, borderType, borderValue);
}
}
Range range(0, dst.rows);
remapInvoker invoker(src, dst, map1, map2, m1, m2, interpolation,
borderType, borderValue, planar_input, nnfunc, ifunc,
ctab);
parallel_for_(range, invoker);
}
@@ -2956,7 +3201,134 @@ void cv::convertMaps( InputArray _map1, InputArray _map2,
}
}
namespace cv
{
class warpAffineInvoker :
public ParallelLoopBody
{
public:
warpAffineInvoker(const Mat &_src, Mat &_dst, int _interpolation, int _borderType,
const Scalar &_borderValue, int *_adelta, int *_bdelta, double *_M) :
ParallelLoopBody(), src(_src), dst(_dst), interpolation(_interpolation),
borderType(_borderType), borderValue(_borderValue), adelta(_adelta), bdelta(_bdelta),
M(_M)
{
}
virtual void operator() (const Range& range) const
{
const int BLOCK_SZ = 64;
short XY[BLOCK_SZ*BLOCK_SZ*2], A[BLOCK_SZ*BLOCK_SZ];
const int AB_BITS = MAX(10, (int)INTER_BITS);
const int AB_SCALE = 1 << AB_BITS;
int round_delta = interpolation == INTER_NEAREST ? AB_SCALE/2 : AB_SCALE/INTER_TAB_SIZE/2, x, y, x1, y1;
#if CV_SSE2
bool useSIMD = checkHardwareSupport(CV_CPU_SSE2);
#endif
int bh0 = std::min(BLOCK_SZ/2, dst.rows);
int bw0 = std::min(BLOCK_SZ*BLOCK_SZ/bh0, dst.cols);
bh0 = std::min(BLOCK_SZ*BLOCK_SZ/bw0, dst.rows);
for( y = range.start; y < range.end; y += bh0 )
{
for( x = 0; x < dst.cols; x += bw0 )
{
int bw = std::min( bw0, dst.cols - x);
int bh = std::min( bh0, range.end - y);
Mat _XY(bh, bw, CV_16SC2, XY), matA;
Mat dpart(dst, Rect(x, y, bw, bh));
for( y1 = 0; y1 < bh; y1++ )
{
short* xy = XY + y1*bw*2;
int X0 = saturate_cast<int>((M[1]*(y + y1) + M[2])*AB_SCALE) + round_delta;
int Y0 = saturate_cast<int>((M[4]*(y + y1) + M[5])*AB_SCALE) + round_delta;
if( interpolation == INTER_NEAREST )
for( x1 = 0; x1 < bw; x1++ )
{
int X = (X0 + adelta[x+x1]) >> AB_BITS;
int Y = (Y0 + bdelta[x+x1]) >> AB_BITS;
xy[x1*2] = saturate_cast<short>(X);
xy[x1*2+1] = saturate_cast<short>(Y);
}
else
{
short* alpha = A + y1*bw;
x1 = 0;
#if CV_SSE2
if( useSIMD )
{
__m128i fxy_mask = _mm_set1_epi32(INTER_TAB_SIZE - 1);
__m128i XX = _mm_set1_epi32(X0), YY = _mm_set1_epi32(Y0);
for( ; x1 <= bw - 8; x1 += 8 )
{
__m128i tx0, tx1, ty0, ty1;
tx0 = _mm_add_epi32(_mm_loadu_si128((const __m128i*)(adelta + x + x1)), XX);
ty0 = _mm_add_epi32(_mm_loadu_si128((const __m128i*)(bdelta + x + x1)), YY);
tx1 = _mm_add_epi32(_mm_loadu_si128((const __m128i*)(adelta + x + x1 + 4)), XX);
ty1 = _mm_add_epi32(_mm_loadu_si128((const __m128i*)(bdelta + x + x1 + 4)), YY);
tx0 = _mm_srai_epi32(tx0, AB_BITS - INTER_BITS);
ty0 = _mm_srai_epi32(ty0, AB_BITS - INTER_BITS);
tx1 = _mm_srai_epi32(tx1, AB_BITS - INTER_BITS);
ty1 = _mm_srai_epi32(ty1, AB_BITS - INTER_BITS);
__m128i fx_ = _mm_packs_epi32(_mm_and_si128(tx0, fxy_mask),
_mm_and_si128(tx1, fxy_mask));
__m128i fy_ = _mm_packs_epi32(_mm_and_si128(ty0, fxy_mask),
_mm_and_si128(ty1, fxy_mask));
tx0 = _mm_packs_epi32(_mm_srai_epi32(tx0, INTER_BITS),
_mm_srai_epi32(tx1, INTER_BITS));
ty0 = _mm_packs_epi32(_mm_srai_epi32(ty0, INTER_BITS),
_mm_srai_epi32(ty1, INTER_BITS));
fx_ = _mm_adds_epi16(fx_, _mm_slli_epi16(fy_, INTER_BITS));
_mm_storeu_si128((__m128i*)(xy + x1*2), _mm_unpacklo_epi16(tx0, ty0));
_mm_storeu_si128((__m128i*)(xy + x1*2 + 8), _mm_unpackhi_epi16(tx0, ty0));
_mm_storeu_si128((__m128i*)(alpha + x1), fx_);
}
}
#endif
for( ; x1 < bw; x1++ )
{
int X = (X0 + adelta[x+x1]) >> (AB_BITS - INTER_BITS);
int Y = (Y0 + bdelta[x+x1]) >> (AB_BITS - INTER_BITS);
xy[x1*2] = saturate_cast<short>(X >> INTER_BITS);
xy[x1*2+1] = saturate_cast<short>(Y >> INTER_BITS);
alpha[x1] = (short)((Y & (INTER_TAB_SIZE-1))*INTER_TAB_SIZE +
(X & (INTER_TAB_SIZE-1)));
}
}
}
if( interpolation == INTER_NEAREST )
remap( src, dpart, _XY, Mat(), interpolation, borderType, borderValue );
else
{
Mat _matA(bh, bw, CV_16U, A);
remap( src, dpart, _XY, _matA, interpolation, borderType, borderValue );
}
}
}
}
private:
const Mat src;
Mat dst;
int interpolation, borderType;
const Scalar borderValue;
int *adelta, *bdelta;
double *M;
};
}
void cv::warpAffine( InputArray _src, OutputArray _dst,
InputArray _M0, Size dsize,
int flags, int borderType, const Scalar& borderValue )
@@ -2968,8 +3340,6 @@ void cv::warpAffine( InputArray _src, OutputArray _dst,
if( dst.data == src.data )
src = src.clone();
const int BLOCK_SZ = 64;
short XY[BLOCK_SZ*BLOCK_SZ*2], A[BLOCK_SZ*BLOCK_SZ];
double M[6];
Mat matM(2, 3, CV_64F, M);
int interpolation = flags & INTER_MAX;
@@ -2996,112 +3366,121 @@ void cv::warpAffine( InputArray _src, OutputArray _dst,
M[2] = b1; M[5] = b2;
}
int x, y, x1, y1, width = dst.cols, height = dst.rows;
AutoBuffer<int> _abdelta(width*2);
int* adelta = &_abdelta[0], *bdelta = adelta + width;
int x;
AutoBuffer<int> _abdelta(dst.cols*2);
int* adelta = &_abdelta[0], *bdelta = adelta + dst.cols;
const int AB_BITS = MAX(10, (int)INTER_BITS);
const int AB_SCALE = 1 << AB_BITS;
int round_delta = interpolation == INTER_NEAREST ? AB_SCALE/2 : AB_SCALE/INTER_TAB_SIZE/2;
#if CV_SSE2
bool useSIMD = checkHardwareSupport(CV_CPU_SSE2);
#endif
for( x = 0; x < width; x++ )
for( x = 0; x < dst.cols; x++ )
{
adelta[x] = saturate_cast<int>(M[0]*x*AB_SCALE);
bdelta[x] = saturate_cast<int>(M[3]*x*AB_SCALE);
}
int bh0 = std::min(BLOCK_SZ/2, height);
int bw0 = std::min(BLOCK_SZ*BLOCK_SZ/bh0, width);
bh0 = std::min(BLOCK_SZ*BLOCK_SZ/bw0, height);
Range range(0, dst.rows);
warpAffineInvoker invoker(src, dst, interpolation, borderType,
borderValue, adelta, bdelta, M);
parallel_for_(range, invoker);
}
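The adelta/bdelta tables give the affine warp an integer inner loop: the column-dependent terms M[0]*x and M[3]*x are pre-scaled by AB_SCALE once per image, the row-dependent terms once per row, so each pixel costs one add and one shift. A condensed sketch of the same arithmetic for the INTER_NEAREST case (my paraphrase; AB_BITS = 10 assumed, matching MAX(10, INTER_BITS) with INTER_BITS = 5):

```cpp
#include <cmath>
#include <vector>

void mapRowFixedPoint(const double M[6], int y, int width, std::vector<int>& X)
{
    const int AB_BITS = 10;
    const int AB_SCALE = 1 << AB_BITS;
    const int round_delta = AB_SCALE / 2;  // INTER_NEAREST rounding term

    std::vector<int> adelta(width);
    for (int x = 0; x < width; x++)        // column terms: computed once
        adelta[x] = (int)std::floor(M[0] * x * AB_SCALE + 0.5);

    // row terms: computed once per row
    int X0 = (int)std::floor((M[1] * y + M[2]) * AB_SCALE + 0.5) + round_delta;

    X.resize(width);
    for (int x = 0; x < width; x++)        // hot loop: one add and one shift
        X[x] = (X0 + adelta[x]) >> AB_BITS;
}
```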
for( y = 0; y < height; y += bh0 )
{
for( x = 0; x < width; x += bw0 )
{
int bw = std::min( bw0, width - x);
int bh = std::min( bh0, height - y);
Mat _XY(bh, bw, CV_16SC2, XY), matA;
Mat dpart(dst, Rect(x, y, bw, bh));
namespace cv
{
for( y1 = 0; y1 < bh; y1++ )
class warpPerspectiveInvoker :
public ParallelLoopBody
{
public:
warpPerspectiveInvoker(const Mat &_src, Mat &_dst, double *_M, int _interpolation,
int _borderType, const Scalar &_borderValue) :
ParallelLoopBody(), src(_src), dst(_dst), M(_M), interpolation(_interpolation),
borderType(_borderType), borderValue(_borderValue)
{
}
virtual void operator() (const Range& range) const
{
const int BLOCK_SZ = 32;
short XY[BLOCK_SZ*BLOCK_SZ*2], A[BLOCK_SZ*BLOCK_SZ];
int x, y, x1, y1, width = dst.cols, height = dst.rows;
int bh0 = std::min(BLOCK_SZ/2, height);
int bw0 = std::min(BLOCK_SZ*BLOCK_SZ/bh0, width);
bh0 = std::min(BLOCK_SZ*BLOCK_SZ/bw0, height);
for( y = range.start; y < range.end; y += bh0 )
{
for( x = 0; x < width; x += bw0 )
{
short* xy = XY + y1*bw*2;
int X0 = saturate_cast<int>((M[1]*(y + y1) + M[2])*AB_SCALE) + round_delta;
int Y0 = saturate_cast<int>((M[4]*(y + y1) + M[5])*AB_SCALE) + round_delta;
if( interpolation == INTER_NEAREST )
for( x1 = 0; x1 < bw; x1++ )
{
int X = (X0 + adelta[x+x1]) >> AB_BITS;
int Y = (Y0 + bdelta[x+x1]) >> AB_BITS;
xy[x1*2] = saturate_cast<short>(X);
xy[x1*2+1] = saturate_cast<short>(Y);
}
else
int bw = std::min( bw0, width - x);
int bh = std::min( bh0, range.end - y); // height
Mat _XY(bh, bw, CV_16SC2, XY), matA;
Mat dpart(dst, Rect(x, y, bw, bh));
for( y1 = 0; y1 < bh; y1++ )
{
short* alpha = A + y1*bw;
x1 = 0;
#if CV_SSE2
if( useSIMD )
{
__m128i fxy_mask = _mm_set1_epi32(INTER_TAB_SIZE - 1);
__m128i XX = _mm_set1_epi32(X0), YY = _mm_set1_epi32(Y0);
for( ; x1 <= bw - 8; x1 += 8 )
short* xy = XY + y1*bw*2;
double X0 = M[0]*x + M[1]*(y + y1) + M[2];
double Y0 = M[3]*x + M[4]*(y + y1) + M[5];
double W0 = M[6]*x + M[7]*(y + y1) + M[8];
if( interpolation == INTER_NEAREST )
for( x1 = 0; x1 < bw; x1++ )
{
__m128i tx0, tx1, ty0, ty1;
tx0 = _mm_add_epi32(_mm_loadu_si128((const __m128i*)(adelta + x + x1)), XX);
ty0 = _mm_add_epi32(_mm_loadu_si128((const __m128i*)(bdelta + x + x1)), YY);
tx1 = _mm_add_epi32(_mm_loadu_si128((const __m128i*)(adelta + x + x1 + 4)), XX);
ty1 = _mm_add_epi32(_mm_loadu_si128((const __m128i*)(bdelta + x + x1 + 4)), YY);
tx0 = _mm_srai_epi32(tx0, AB_BITS - INTER_BITS);
ty0 = _mm_srai_epi32(ty0, AB_BITS - INTER_BITS);
tx1 = _mm_srai_epi32(tx1, AB_BITS - INTER_BITS);
ty1 = _mm_srai_epi32(ty1, AB_BITS - INTER_BITS);
__m128i fx_ = _mm_packs_epi32(_mm_and_si128(tx0, fxy_mask),
_mm_and_si128(tx1, fxy_mask));
__m128i fy_ = _mm_packs_epi32(_mm_and_si128(ty0, fxy_mask),
_mm_and_si128(ty1, fxy_mask));
tx0 = _mm_packs_epi32(_mm_srai_epi32(tx0, INTER_BITS),
_mm_srai_epi32(tx1, INTER_BITS));
ty0 = _mm_packs_epi32(_mm_srai_epi32(ty0, INTER_BITS),
_mm_srai_epi32(ty1, INTER_BITS));
fx_ = _mm_adds_epi16(fx_, _mm_slli_epi16(fy_, INTER_BITS));
_mm_storeu_si128((__m128i*)(xy + x1*2), _mm_unpacklo_epi16(tx0, ty0));
_mm_storeu_si128((__m128i*)(xy + x1*2 + 8), _mm_unpackhi_epi16(tx0, ty0));
_mm_storeu_si128((__m128i*)(alpha + x1), fx_);
double W = W0 + M[6]*x1;
W = W ? 1./W : 0;
double fX = std::max((double)INT_MIN, std::min((double)INT_MAX, (X0 + M[0]*x1)*W));
double fY = std::max((double)INT_MIN, std::min((double)INT_MAX, (Y0 + M[3]*x1)*W));
int X = saturate_cast<int>(fX);
int Y = saturate_cast<int>(fY);
xy[x1*2] = saturate_cast<short>(X);
xy[x1*2+1] = saturate_cast<short>(Y);
}
}
#endif
for( ; x1 < bw; x1++ )
else
{
int X = (X0 + adelta[x+x1]) >> (AB_BITS - INTER_BITS);
int Y = (Y0 + bdelta[x+x1]) >> (AB_BITS - INTER_BITS);
xy[x1*2] = saturate_cast<short>(X >> INTER_BITS);
xy[x1*2+1] = saturate_cast<short>(Y >> INTER_BITS);
alpha[x1] = (short)((Y & (INTER_TAB_SIZE-1))*INTER_TAB_SIZE +
(X & (INTER_TAB_SIZE-1)));
short* alpha = A + y1*bw;
for( x1 = 0; x1 < bw; x1++ )
{
double W = W0 + M[6]*x1;
W = W ? INTER_TAB_SIZE/W : 0;
double fX = std::max((double)INT_MIN, std::min((double)INT_MAX, (X0 + M[0]*x1)*W));
double fY = std::max((double)INT_MIN, std::min((double)INT_MAX, (Y0 + M[3]*x1)*W));
int X = saturate_cast<int>(fX);
int Y = saturate_cast<int>(fY);
xy[x1*2] = saturate_cast<short>(X >> INTER_BITS);
xy[x1*2+1] = saturate_cast<short>(Y >> INTER_BITS);
alpha[x1] = (short)((Y & (INTER_TAB_SIZE-1))*INTER_TAB_SIZE +
(X & (INTER_TAB_SIZE-1)));
}
}
}
}
if( interpolation == INTER_NEAREST )
remap( src, dpart, _XY, Mat(), interpolation, borderType, borderValue );
else
{
Mat _matA(bh, bw, CV_16U, A);
remap( src, dpart, _XY, _matA, interpolation, borderType, borderValue );
if( interpolation == INTER_NEAREST )
remap( src, dpart, _XY, Mat(), interpolation, borderType, borderValue );
else
{
Mat _matA(bh, bw, CV_16U, A);
remap( src, dpart, _XY, _matA, interpolation, borderType, borderValue );
}
}
}
}
private:
const Mat src;
Mat dst;
double* M;
int interpolation, borderType;
const Scalar borderValue;
};
}
void cv::warpPerspective( InputArray _src, OutputArray _dst, InputArray _M0,
Size dsize, int flags, int borderType, const Scalar& borderValue )
{
@@ -3113,8 +3492,6 @@ void cv::warpPerspective( InputArray _src, OutputArray _dst, InputArray _M0,
if( dst.data == src.data )
src = src.clone();
const int BLOCK_SZ = 32;
short XY[BLOCK_SZ*BLOCK_SZ*2], A[BLOCK_SZ*BLOCK_SZ];
double M[9];
Mat matM(3, 3, CV_64F, M);
int interpolation = flags & INTER_MAX;
@@ -3132,71 +3509,9 @@ void cv::warpPerspective( InputArray _src, OutputArray _dst, InputArray _M0,
if( !(flags & WARP_INVERSE_MAP) )
invert(matM, matM);
int x, y, x1, y1, width = dst.cols, height = dst.rows;
int bh0 = std::min(BLOCK_SZ/2, height);
int bw0 = std::min(BLOCK_SZ*BLOCK_SZ/bh0, width);
bh0 = std::min(BLOCK_SZ*BLOCK_SZ/bw0, height);
for( y = 0; y < height; y += bh0 )
{
for( x = 0; x < width; x += bw0 )
{
int bw = std::min( bw0, width - x);
int bh = std::min( bh0, height - y);
Mat _XY(bh, bw, CV_16SC2, XY), matA;
Mat dpart(dst, Rect(x, y, bw, bh));
for( y1 = 0; y1 < bh; y1++ )
{
short* xy = XY + y1*bw*2;
double X0 = M[0]*x + M[1]*(y + y1) + M[2];
double Y0 = M[3]*x + M[4]*(y + y1) + M[5];
double W0 = M[6]*x + M[7]*(y + y1) + M[8];
if( interpolation == INTER_NEAREST )
for( x1 = 0; x1 < bw; x1++ )
{
double W = W0 + M[6]*x1;
W = W ? 1./W : 0;
double fX = std::max((double)INT_MIN, std::min((double)INT_MAX, (X0 + M[0]*x1)*W));
double fY = std::max((double)INT_MIN, std::min((double)INT_MAX, (Y0 + M[3]*x1)*W));
int X = saturate_cast<int>(fX);
int Y = saturate_cast<int>(fY);
xy[x1*2] = saturate_cast<short>(X);
xy[x1*2+1] = saturate_cast<short>(Y);
}
else
{
short* alpha = A + y1*bw;
for( x1 = 0; x1 < bw; x1++ )
{
double W = W0 + M[6]*x1;
W = W ? INTER_TAB_SIZE/W : 0;
double fX = std::max((double)INT_MIN, std::min((double)INT_MAX, (X0 + M[0]*x1)*W));
double fY = std::max((double)INT_MIN, std::min((double)INT_MAX, (Y0 + M[3]*x1)*W));
int X = saturate_cast<int>(fX);
int Y = saturate_cast<int>(fY);
xy[x1*2] = saturate_cast<short>(X >> INTER_BITS);
xy[x1*2+1] = saturate_cast<short>(Y >> INTER_BITS);
alpha[x1] = (short)((Y & (INTER_TAB_SIZE-1))*INTER_TAB_SIZE +
(X & (INTER_TAB_SIZE-1)));
}
}
}
if( interpolation == INTER_NEAREST )
remap( src, dpart, _XY, Mat(), interpolation, borderType, borderValue );
else
{
Mat _matA(bh, bw, CV_16U, A);
remap( src, dpart, _XY, _matA, interpolation, borderType, borderValue );
}
}
}
Range range(0, dst.rows);
warpPerspectiveInvoker invoker(src, dst, M, interpolation, borderType, borderValue);
parallel_for_(range, invoker);
}
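Unlike the affine case, the perspective warp must divide by the homogeneous coordinate W for every pixel; for the interpolating path W is folded together with the INTER_TAB_SIZE pre-scale so the low INTER_BITS bits of X and Y select the filter tap. A compact sketch of that per-pixel mapping (my paraphrase; INTER_BITS = 5 and INTER_TAB_SIZE = 32 are OpenCV's values):

```cpp
#include <cmath>

void mapPointPerspective(const double M[9], int x, int y,
                         short& ix, short& iy, unsigned short& tap)
{
    const int INTER_BITS = 5, INTER_TAB_SIZE = 1 << INTER_BITS;
    double W = M[6]*x + M[7]*y + M[8];
    W = W ? INTER_TAB_SIZE / W : 0;        // guard against division by zero
    int X = (int)std::floor((M[0]*x + M[1]*y + M[2]) * W + 0.5);
    int Y = (int)std::floor((M[3]*x + M[4]*y + M[5]) * W + 0.5);
    ix = (short)(X >> INTER_BITS);         // integer source position
    iy = (short)(Y >> INTER_BITS);
    tap = (unsigned short)((Y & (INTER_TAB_SIZE-1)) * INTER_TAB_SIZE +
                           (X & (INTER_TAB_SIZE-1)));  // fractional tap index
}
```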