Commit 2057f2c4 authored by Vladimir Dudnik

Fixed build issues related to changes in the IPP call signatures.

parent 8e776837
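
A minimal sketch (not part of the patch) of the two IPP calling-convention changes this commit works around, assuming IPP 7.x headers; the helper names add8u_ipp and max8u_ipp are illustrative only:

#include <ipp.h>

// The ippi arithmetic/logic functions expect an explicit destination step,
// so the old call form ippiAdd_8u_C1RSfs(src1, step1, src2, step2, dst, roi, 0)
// no longer compiles; the dst step must be passed before the ROI:
static void add8u_ipp(const Ipp8u* src1, int step1,
                      const Ipp8u* src2, int step2,
                      Ipp8u* dst, int dstStep, IppiSize roi)
{
    ippiAdd_8u_C1RSfs(src1, step1, src2, step2, dst, dstStep, roi, /*scaleFactor=*/0);
}

// The min/max "every" operations switch from ippiMaxEvery_8u_C1R to the
// row-wise signal-processing variant, applied one image row at a time:
static void max8u_ipp(const Ipp8u* src1, size_t step1,
                      const Ipp8u* src2, size_t step2,
                      Ipp8u* dst, size_t step, IppiSize roi)
{
    for( int y = 0; y < roi.height; y++ )
    {
        ippsMaxEvery_8u(src1, src2, dst, roi.width);
        src1 += step1;  // steps are in bytes; for 8u data a byte step equals an element step
        src2 += step2;
        dst += step;
    }
}
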
......@@ -56,7 +56,7 @@ struct IPPArithmInitializer
{
IPPArithmInitializer(void)
{
IppStatus status = ippStaticInit();
ippStaticInit();
}
};
......@@ -64,19 +64,19 @@ IPPArithmInitializer ippArithmInitializer;
#endif
struct NOP {};
template<typename T, class Op, class Op8>
void vBinOp8(const T* src1, size_t step1, const T* src2, size_t step2, T* dst, size_t step, Size sz)
{
Op8 op8;
Op op;
for( ; sz.height--; src1 += step1/sizeof(src1[0]),
src2 += step2/sizeof(src2[0]),
dst += step/sizeof(dst[0]) )
{
int x = 0;
#if CV_SSE2
if( USE_SSE2 )
{
......@@ -97,7 +97,7 @@ void vBinOp8(const T* src1, size_t step1, const T* src2, size_t step2, T* dst, s
}
}
#endif
for( ; x <= sz.width - 4; x += 4 )
{
T v0 = op(src1[x], src2[x]);
......@@ -107,7 +107,7 @@ void vBinOp8(const T* src1, size_t step1, const T* src2, size_t step2, T* dst, s
v1 = op(src1[x+3], src2[x+3]);
dst[x+2] = v0; dst[x+3] = v1;
}
for( ; x < sz.width; x++ )
dst[x] = op(src1[x], src2[x]);
}
......@@ -119,13 +119,13 @@ void vBinOp16(const T* src1, size_t step1, const T* src2, size_t step2,
{
Op16 op16;
Op op;
for( ; sz.height--; src1 += step1/sizeof(src1[0]),
src2 += step2/sizeof(src2[0]),
dst += step/sizeof(dst[0]) )
{
int x = 0;
#if CV_SSE2
if( USE_SSE2 )
{
......@@ -147,7 +147,7 @@ void vBinOp16(const T* src1, size_t step1, const T* src2, size_t step2,
}
else
#endif
for( ; x <= sz.width - 4; x += 4 )
{
T v0 = op(src1[x], src2[x]);
......@@ -157,26 +157,26 @@ void vBinOp16(const T* src1, size_t step1, const T* src2, size_t step2,
v1 = op(src1[x+3], src2[x+3]);
dst[x+2] = v0; dst[x+3] = v1;
}
for( ; x < sz.width; x++ )
dst[x] = op(src1[x], src2[x]);
}
}
template<class Op, class Op32>
void vBinOp32s(const int* src1, size_t step1, const int* src2, size_t step2,
int* dst, size_t step, Size sz)
{
Op32 op32;
Op op;
for( ; sz.height--; src1 += step1/sizeof(src1[0]),
src2 += step2/sizeof(src2[0]),
dst += step/sizeof(dst[0]) )
{
int x = 0;
#if CV_SSE2
if( USE_SSE2 )
{
......@@ -202,7 +202,7 @@ void vBinOp32s(const int* src1, size_t step1, const int* src2, size_t step2,
}
}
#endif
for( ; x <= sz.width - 4; x += 4 )
{
int v0 = op(src1[x], src2[x]);
......@@ -212,26 +212,26 @@ void vBinOp32s(const int* src1, size_t step1, const int* src2, size_t step2,
v1 = op(src1[x+3], src2[x+3]);
dst[x+2] = v0; dst[x+3] = v1;
}
for( ; x < sz.width; x++ )
dst[x] = op(src1[x], src2[x]);
}
}
template<class Op, class Op32>
void vBinOp32f(const float* src1, size_t step1, const float* src2, size_t step2,
float* dst, size_t step, Size sz)
{
Op32 op32;
Op op;
for( ; sz.height--; src1 += step1/sizeof(src1[0]),
src2 += step2/sizeof(src2[0]),
dst += step/sizeof(dst[0]) )
{
int x = 0;
#if CV_SSE2
if( USE_SSE2 )
{
......@@ -266,7 +266,7 @@ void vBinOp32f(const float* src1, size_t step1, const float* src2, size_t step2,
v1 = op(src1[x+3], src2[x+3]);
dst[x+2] = v0; dst[x+3] = v1;
}
for( ; x < sz.width; x++ )
dst[x] = op(src1[x], src2[x]);
}
......@@ -278,13 +278,13 @@ void vBinOp64f(const double* src1, size_t step1, const double* src2, size_t step
{
Op64 op64;
Op op;
for( ; sz.height--; src1 += step1/sizeof(src1[0]),
src2 += step2/sizeof(src2[0]),
dst += step/sizeof(dst[0]) )
{
int x = 0;
#if CV_SSE2
if( USE_SSE2 && (((size_t)src1|(size_t)src2|(size_t)dst)&15) == 0 )
for( ; x <= sz.width - 4; x += 4 )
......@@ -307,14 +307,14 @@ void vBinOp64f(const double* src1, size_t step1, const double* src2, size_t step
v1 = op(src1[x+3], src2[x+3]);
dst[x+2] = v0; dst[x+3] = v1;
}
for( ; x < sz.width; x++ )
dst[x] = op(src1[x], src2[x]);
}
}
#if CV_SSE2
struct _VAdd8u { __m128i operator()(const __m128i& a, const __m128i& b) const { return _mm_adds_epu8(a,b); }};
struct _VSub8u { __m128i operator()(const __m128i& a, const __m128i& b) const { return _mm_subs_epu8(a,b); }};
struct _VMin8u { __m128i operator()(const __m128i& a, const __m128i& b) const { return _mm_min_epu8(a,b); }};
......@@ -410,7 +410,7 @@ struct _VAbsDiff32s
__m128i m = _mm_cmpgt_epi32(b, a);
return _mm_sub_epi32(_mm_xor_si128(d, m), m);
}
};
};
struct _VAdd32f { __m128 operator()(const __m128& a, const __m128& b) const { return _mm_add_ps(a,b); }};
struct _VSub32f { __m128 operator()(const __m128& a, const __m128& b) const { return _mm_sub_ps(a,b); }};
......@@ -429,7 +429,7 @@ struct _VAdd64f { __m128d operator()(const __m128d& a, const __m128d& b) const {
struct _VSub64f { __m128d operator()(const __m128d& a, const __m128d& b) const { return _mm_sub_pd(a,b); }};
struct _VMin64f { __m128d operator()(const __m128d& a, const __m128d& b) const { return _mm_min_pd(a,b); }};
struct _VMax64f { __m128d operator()(const __m128d& a, const __m128d& b) const { return _mm_max_pd(a,b); }};
static int CV_DECL_ALIGNED(16) v64f_absmask[] = { 0xffffffff, 0x7fffffff, 0xffffffff, 0x7fffffff };
struct _VAbsDiff64f
{
......@@ -437,13 +437,13 @@ struct _VAbsDiff64f
{
return _mm_and_pd(_mm_sub_pd(a,b), *(const __m128d*)v64f_absmask);
}
};
};
struct _VAnd8u { __m128i operator()(const __m128i& a, const __m128i& b) const { return _mm_and_si128(a,b); }};
struct _VOr8u { __m128i operator()(const __m128i& a, const __m128i& b) const { return _mm_or_si128(a,b); }};
struct _VXor8u { __m128i operator()(const __m128i& a, const __m128i& b) const { return _mm_xor_si128(a,b); }};
struct _VNot8u { __m128i operator()(const __m128i& a, const __m128i&) const { return _mm_andnot_si128(_mm_setzero_si128(),a); }};
#endif
#if CV_SSE2
......@@ -451,12 +451,12 @@ struct _VNot8u { __m128i operator()(const __m128i& a, const __m128i&) const { re
#else
#define IF_SIMD(op) NOP
#endif
template<> inline uchar OpAdd<uchar>::operator ()(uchar a, uchar b) const
{ return CV_FAST_CAST_8U(a + b); }
template<> inline uchar OpSub<uchar>::operator ()(uchar a, uchar b) const
{ return CV_FAST_CAST_8U(a - b); }
template<typename T> struct OpAbsDiff
{
typedef T type1;
......@@ -470,7 +470,7 @@ template<> inline short OpAbsDiff<short>::operator ()(short a, short b) const
template<> inline schar OpAbsDiff<schar>::operator ()(schar a, schar b) const
{ return saturate_cast<schar>(std::abs(a - b)); }
template<typename T, typename WT=T> struct OpAbsDiffS
{
typedef T type1;
......@@ -510,19 +510,19 @@ template<typename T> struct OpNot
typedef T rtype;
T operator()( T a, T ) const { return ~a; }
};
static inline void fixSteps(Size sz, size_t elemSize, size_t& step1, size_t& step2, size_t& step)
{
if( sz.height == 1 )
step1 = step2 = step = sz.width*elemSize;
}
static void add8u( const uchar* src1, size_t step1,
const uchar* src2, size_t step2,
uchar* dst, size_t step, Size sz, void* )
{
IF_IPP(fixSteps(sz, sizeof(dst[0]), step1, step2, step);
ippiAdd_8u_C1RSfs(src1, (int)step1, src2, (int)step2, dst, (IppiSize&)sz, 0),
ippiAdd_8u_C1RSfs(src1, (int)step1, src2, (int)step2, dst, (int)step, (IppiSize&)sz, 0),
(vBinOp8<uchar, OpAdd<uchar>, IF_SIMD(_VAdd8u)>(src1, step1, src2, step2, dst, step, sz)));
}
......@@ -538,7 +538,7 @@ static void add16u( const ushort* src1, size_t step1,
ushort* dst, size_t step, Size sz, void* )
{
IF_IPP(fixSteps(sz, sizeof(dst[0]), step1, step2, step);
ippiAdd_16u_C1RSfs(src1, (int)step1, src2, (int)step2, dst, (IppiSize&)sz, 0),
ippiAdd_16u_C1RSfs(src1, (int)step1, src2, (int)step2, dst, (int)step, (IppiSize&)sz, 0),
(vBinOp16<ushort, OpAdd<ushort>, IF_SIMD(_VAdd16u)>(src1, step1, src2, step2, dst, step, sz)));
}
......@@ -547,7 +547,7 @@ static void add16s( const short* src1, size_t step1,
short* dst, size_t step, Size sz, void* )
{
IF_IPP(fixSteps(sz, sizeof(dst[0]), step1, step2, step);
ippiAdd_16s_C1RSfs(src1, (int)step1, src2, (int)step2, dst, (IppiSize&)sz, 0),
ippiAdd_16s_C1RSfs(src1, (int)step1, src2, (int)step2, dst, (int)step, (IppiSize&)sz, 0),
(vBinOp16<short, OpAdd<short>, IF_SIMD(_VAdd16s)>(src1, step1, src2, step2, dst, step, sz)));
}
......@@ -563,7 +563,7 @@ static void add32f( const float* src1, size_t step1,
float* dst, size_t step, Size sz, void* )
{
IF_IPP(fixSteps(sz, sizeof(dst[0]), step1, step2, step);
ippiAdd_32f_C1RSfs(src1, (int)step1, src2, (int)step2, dst, (IppiSize&)sz, 0),
ippiAdd_32f_C1R(src1, (int)step1, src2, (int)step2, dst, (int)step, (IppiSize&)sz),
(vBinOp32f<OpAdd<float>, IF_SIMD(_VAdd32f)>(src1, step1, src2, step2, dst, step, sz)));
}
......@@ -579,7 +579,7 @@ static void sub8u( const uchar* src1, size_t step1,
uchar* dst, size_t step, Size sz, void* )
{
IF_IPP(fixSteps(sz, sizeof(dst[0]), step1, step2, step);
ippiSub_8u_C1RSfs(src2, (int)step2, src1, (int)step1, dst, (IppiSize&)sz, 0),
ippiSub_8u_C1RSfs(src2, (int)step2, src1, (int)step1, dst, (int)step, (IppiSize&)sz, 0),
(vBinOp8<uchar, OpSub<uchar>, IF_SIMD(_VSub8u)>(src1, step1, src2, step2, dst, step, sz)));
}
......@@ -595,7 +595,7 @@ static void sub16u( const ushort* src1, size_t step1,
ushort* dst, size_t step, Size sz, void* )
{
IF_IPP(fixSteps(sz, sizeof(dst[0]), step1, step2, step);
ippiSub_16u_C1RSfs(src2, (int)step2, src1, (int)step1, dst, (IppiSize&)sz, 0),
ippiSub_16u_C1RSfs(src2, (int)step2, src1, (int)step1, dst, (int)step, (IppiSize&)sz, 0),
(vBinOp16<ushort, OpSub<ushort>, IF_SIMD(_VSub16u)>(src1, step1, src2, step2, dst, step, sz)));
}
......@@ -604,7 +604,7 @@ static void sub16s( const short* src1, size_t step1,
short* dst, size_t step, Size sz, void* )
{
IF_IPP(fixSteps(sz, sizeof(dst[0]), step1, step2, step);
ippiSub_16s_C1RSfs(src2, (int)step2, src1, (int)step1, dst, (IppiSize&)sz, 0),
ippiSub_16s_C1RSfs(src2, (int)step2, src1, (int)step1, dst, (int)step, (IppiSize&)sz, 0),
(vBinOp16<short, OpSub<short>, IF_SIMD(_VSub16s)>(src1, step1, src2, step2, dst, step, sz)));
}
......@@ -620,7 +620,7 @@ static void sub32f( const float* src1, size_t step1,
float* dst, size_t step, Size sz, void* )
{
IF_IPP(fixSteps(sz, sizeof(dst[0]), step1, step2, step);
ippiSub_32f_C1RSfs(src2, (int)step2, src1, (int)step1, dst, (IppiSize&)sz, 0),
ippiSub_32f_C1R(src2, (int)step2, src1, (int)step1, dst, (int)step, (IppiSize&)sz),
(vBinOp32f<OpSub<float>, IF_SIMD(_VSub32f)>(src1, step1, src2, step2, dst, step, sz)));
}
......@@ -629,18 +629,36 @@ static void sub64f( const double* src1, size_t step1,
double* dst, size_t step, Size sz, void* )
{
vBinOp64f<OpSub<double>, IF_SIMD(_VSub64f)>(src1, step1, src2, step2, dst, step, sz);
}
}
template<> inline uchar OpMin<uchar>::operator ()(uchar a, uchar b) const { return CV_MIN_8U(a, b); }
template<> inline uchar OpMax<uchar>::operator ()(uchar a, uchar b) const { return CV_MAX_8U(a, b); }
static void max8u( const uchar* src1, size_t step1,
const uchar* src2, size_t step2,
uchar* dst, size_t step, Size sz, void* )
{
IF_IPP(fixSteps(sz, sizeof(dst[0]), step1, step2, step);
ippiMaxEvery_8u_C1R(src1, (int)step1, src2, (int)step2, dst, (IppiSize&)sz),
(vBinOp8<uchar, OpMax<uchar>, IF_SIMD(_VMax8u)>(src1, step1, src2, step2, dst, step, sz)));
#if (ARITHM_USE_IPP == 1)
{
uchar* s1 = (uchar*)src1;
uchar* s2 = (uchar*)src2;
uchar* d = dst;
fixSteps(sz, sizeof(dst[0]), step1, step2, step);
for(int i = 0; i < sz.height; i++)
{
ippsMaxEvery_8u(s1, s2, d, sz.width);
s1 += step1;
s2 += step2;
d += step;
}
}
#else
vBinOp8<uchar, OpMax<uchar>, IF_SIMD(_VMax8u)>(src1, step1, src2, step2, dst, step, sz);
#endif
// IF_IPP(fixSteps(sz, sizeof(dst[0]), step1, step2, step);
// ippiMaxEvery_8u_C1R(src1, (int)step1, src2, (int)step2, dst, (IppiSize&)sz),
// (vBinOp8<uchar, OpMax<uchar>, IF_SIMD(_VMax8u)>(src1, step1, src2, step2, dst, step, sz)));
}
static void max8s( const schar* src1, size_t step1,
......@@ -654,18 +672,34 @@ static void max16u( const ushort* src1, size_t step1,
const ushort* src2, size_t step2,
ushort* dst, size_t step, Size sz, void* )
{
IF_IPP(fixSteps(sz, sizeof(dst[0]), step1, step2, step);
ippiMaxEvery_16u_C1R(src1, (int)step1, src2, (int)step2, dst, (IppiSize&)sz),
(vBinOp16<ushort, OpMax<ushort>, IF_SIMD(_VMax16u)>(src1, step1, src2, step2, dst, step, sz)));
#if (ARITHM_USE_IPP == 1)
{
ushort* s1 = (ushort*)src1;
ushort* s2 = (ushort*)src2;
ushort* d = dst;
fixSteps(sz, sizeof(dst[0]), step1, step2, step);
for(int i = 0; i < sz.height; i++)
{
ippsMaxEvery_16u(s1, s2, d, sz.width);
s1 = (ushort*)((uchar*)s1 + step1);
s2 = (ushort*)((uchar*)s2 + step2);
d = (ushort*)((uchar*)d + step);
}
}
#else
vBinOp16<ushort, OpMax<ushort>, IF_SIMD(_VMax16u)>(src1, step1, src2, step2, dst, step, sz);
#endif
// IF_IPP(fixSteps(sz, sizeof(dst[0]), step1, step2, step);
// ippiMaxEvery_16u_C1R(src1, (int)step1, src2, (int)step2, dst, (IppiSize&)sz),
// (vBinOp16<ushort, OpMax<ushort>, IF_SIMD(_VMax16u)>(src1, step1, src2, step2, dst, step, sz)));
}
static void max16s( const short* src1, size_t step1,
const short* src2, size_t step2,
short* dst, size_t step, Size sz, void* )
{
IF_IPP(fixSteps(sz, sizeof(dst[0]), step1, step2, step);
ippiMaxEvery_16s_C1R(src1, (int)step1, src2, (int)step2, dst, (IppiSize&)sz),
(vBinOp16<short, OpMax<short>, IF_SIMD(_VMax16s)>(src1, step1, src2, step2, dst, step, sz)));
vBinOp16<short, OpMax<short>, IF_SIMD(_VMax16s)>(src1, step1, src2, step2, dst, step, sz);
}
static void max32s( const int* src1, size_t step1,
......@@ -679,9 +713,26 @@ static void max32f( const float* src1, size_t step1,
const float* src2, size_t step2,
float* dst, size_t step, Size sz, void* )
{
IF_IPP(fixSteps(sz, sizeof(dst[0]), step1, step2, step);
ippiMaxEvery_32f_C1R(src1, (int)step1, src2, (int)step2, dst, (IppiSize&)sz),
(vBinOp32f<OpMax<float>, IF_SIMD(_VMax32f)>(src1, step1, src2, step2, dst, step, sz)));
#if (ARITHM_USE_IPP == 1)
{
float* s1 = (float*)src1;
float* s2 = (float*)src2;
float* d = dst;
fixSteps(sz, sizeof(dst[0]), step1, step2, step);
for(int i = 0; i < sz.height; i++)
{
ippsMaxEvery_32f(s1, s2, d, sz.width);
s1 = (float*)((uchar*)s1 + step1);
s2 = (float*)((uchar*)s2 + step2);
d = (float*)((uchar*)d + step);
}
}
#else
vBinOp32f<OpMax<float>, IF_SIMD(_VMax32f)>(src1, step1, src2, step2, dst, step, sz);
#endif
// IF_IPP(fixSteps(sz, sizeof(dst[0]), step1, step2, step);
// ippiMaxEvery_32f_C1R(src1, (int)step1, src2, (int)step2, dst, (IppiSize&)sz),
// (vBinOp32f<OpMax<float>, IF_SIMD(_VMax32f)>(src1, step1, src2, step2, dst, step, sz)));
}
static void max64f( const double* src1, size_t step1,
......@@ -695,9 +746,27 @@ static void min8u( const uchar* src1, size_t step1,
const uchar* src2, size_t step2,
uchar* dst, size_t step, Size sz, void* )
{
IF_IPP(fixSteps(sz, sizeof(dst[0]), step1, step2, step);
ippiMinEvery_8u_C1R(src1, (int)step1, src2, (int)step2, dst, (IppiSize&)sz),
(vBinOp8<uchar, OpMin<uchar>, IF_SIMD(_VMin8u)>(src1, step1, src2, step2, dst, step, sz)));
#if (ARITHM_USE_IPP == 1)
{
uchar* s1 = (uchar*)src1;
uchar* s2 = (uchar*)src2;
uchar* d = dst;
fixSteps(sz, sizeof(dst[0]), step1, step2, step);
for(int i = 0; i < sz.height; i++)
{
ippsMinEvery_8u(s1, s2, d, sz.width);
s1 += step1;
s2 += step2;
d += step;
}
}
#else
vBinOp8<uchar, OpMin<uchar>, IF_SIMD(_VMin8u)>(src1, step1, src2, step2, dst, step, sz);
#endif
// IF_IPP(fixSteps(sz, sizeof(dst[0]), step1, step2, step);
// ippiMinEvery_8u_C1R(src1, (int)step1, src2, (int)step2, dst, (IppiSize&)sz),
// (vBinOp8<uchar, OpMin<uchar>, IF_SIMD(_VMin8u)>(src1, step1, src2, step2, dst, step, sz)));
}
static void min8s( const schar* src1, size_t step1,
......@@ -711,18 +780,34 @@ static void min16u( const ushort* src1, size_t step1,
const ushort* src2, size_t step2,
ushort* dst, size_t step, Size sz, void* )
{
IF_IPP(fixSteps(sz, sizeof(dst[0]), step1, step2, step);
ippiMinEvery_16u_C1R(src1, (int)step1, src2, (int)step2, dst, (IppiSize&)sz),
(vBinOp16<ushort, OpMin<ushort>, IF_SIMD(_VMin16u)>(src1, step1, src2, step2, dst, step, sz)));
#if (ARITHM_USE_IPP == 1)
{
ushort* s1 = (ushort*)src1;
ushort* s2 = (ushort*)src2;
ushort* d = dst;
fixSteps(sz, sizeof(dst[0]), step1, step2, step);
for(int i = 0; i < sz.height; i++)
{
ippsMinEvery_16u(s1, s2, d, sz.width);
s1 = (ushort*)((uchar*)s1 + step1);
s2 = (ushort*)((uchar*)s2 + step2);
d = (ushort*)((uchar*)d + step);
}
}
#else
vBinOp16<ushort, OpMin<ushort>, IF_SIMD(_VMin16u)>(src1, step1, src2, step2, dst, step, sz);
#endif
// IF_IPP(fixSteps(sz, sizeof(dst[0]), step1, step2, step);
// ippiMinEvery_16u_C1R(src1, (int)step1, src2, (int)step2, dst, (IppiSize&)sz),
// (vBinOp16<ushort, OpMin<ushort>, IF_SIMD(_VMin16u)>(src1, step1, src2, step2, dst, step, sz)));
}
static void min16s( const short* src1, size_t step1,
const short* src2, size_t step2,
short* dst, size_t step, Size sz, void* )
{
IF_IPP(fixSteps(sz, sizeof(dst[0]), step1, step2, step);
ippiMinEvery_16s_C1R(src1, (int)step1, src2, (int)step2, dst, (IppiSize&)sz),
(vBinOp16<short, OpMin<short>, IF_SIMD(_VMin16s)>(src1, step1, src2, step2, dst, step, sz)));
vBinOp16<short, OpMin<short>, IF_SIMD(_VMin16s)>(src1, step1, src2, step2, dst, step, sz);
}
static void min32s( const int* src1, size_t step1,
......@@ -736,9 +821,26 @@ static void min32f( const float* src1, size_t step1,
const float* src2, size_t step2,
float* dst, size_t step, Size sz, void* )
{
IF_IPP(fixSteps(sz, sizeof(dst[0]), step1, step2, step);
ippiMinEvery_32f_C1R(src1, (int)step1, src2, (int)step2, dst, (IppiSize&)sz),
(vBinOp32f<OpMin<float>, IF_SIMD(_VMin32f)>(src1, step1, src2, step2, dst, step, sz)));
#if (ARITHM_USE_IPP == 1)
{
float* s1 = (float*)src1;
float* s2 = (float*)src2;
float* d = dst;
fixSteps(sz, sizeof(dst[0]), step1, step2, step);
for(int i = 0; i < sz.height; i++)
{
ippsMinEvery_32f(s1, s2, d, sz.width);
s1 = (float*)((uchar*)s1 + step1);
s2 = (float*)((uchar*)s2 + step2);
d = (float*)((uchar*)d + step);
}
}
#else
vBinOp32f<OpMin<float>, IF_SIMD(_VMin32f)>(src1, step1, src2, step2, dst, step, sz);
#endif
// IF_IPP(fixSteps(sz, sizeof(dst[0]), step1, step2, step);
// ippiMinEvery_32f_C1R(src1, (int)step1, src2, (int)step2, dst, (IppiSize&)sz),
// (vBinOp32f<OpMin<float>, IF_SIMD(_VMin32f)>(src1, step1, src2, step2, dst, step, sz)));
}
static void min64f( const double* src1, size_t step1,
......@@ -746,14 +848,14 @@ static void min64f( const double* src1, size_t step1,
double* dst, size_t step, Size sz, void* )
{
vBinOp64f<OpMin<double>, IF_SIMD(_VMin64f)>(src1, step1, src2, step2, dst, step, sz);
}
}
static void absdiff8u( const uchar* src1, size_t step1,
const uchar* src2, size_t step2,
uchar* dst, size_t step, Size sz, void* )
{
IF_IPP(fixSteps(sz, sizeof(dst[0]), step1, step2, step);
ippiAbsDiff_8u_C1R(src1, (int)step1, src2, (int)step2, dst, (IppiSize&)sz),
ippiAbsDiff_8u_C1R(src1, (int)step1, src2, (int)step2, dst, (int)step, (IppiSize&)sz),
(vBinOp8<uchar, OpAbsDiff<uchar>, IF_SIMD(_VAbsDiff8u)>(src1, step1, src2, step2, dst, step, sz)));
}
......@@ -769,7 +871,7 @@ static void absdiff16u( const ushort* src1, size_t step1,
ushort* dst, size_t step, Size sz, void* )
{
IF_IPP(fixSteps(sz, sizeof(dst[0]), step1, step2, step);
ippiAbsDiff_16u_C1R(src1, (int)step1, src2, (int)step2, dst, (IppiSize&)sz),
ippiAbsDiff_16u_C1R(src1, (int)step1, src2, (int)step2, dst, (int)step, (IppiSize&)sz),
(vBinOp16<ushort, OpAbsDiff<ushort>, IF_SIMD(_VAbsDiff16u)>(src1, step1, src2, step2, dst, step, sz)));
}
......@@ -777,9 +879,7 @@ static void absdiff16s( const short* src1, size_t step1,
const short* src2, size_t step2,
short* dst, size_t step, Size sz, void* )
{
IF_IPP(fixSteps(sz, sizeof(dst[0]), step1, step2, step);
ippiAbsDiff_16s_C1R(src1, (int)step1, src2, (int)step2, dst, (IppiSize&)sz),
(vBinOp16<short, OpAbsDiff<short>, IF_SIMD(_VAbsDiff16s)>(src1, step1, src2, step2, dst, step, sz)));
vBinOp16<short, OpAbsDiff<short>, IF_SIMD(_VAbsDiff16s)>(src1, step1, src2, step2, dst, step, sz);
}
static void absdiff32s( const int* src1, size_t step1,
......@@ -794,7 +894,7 @@ static void absdiff32f( const float* src1, size_t step1,
float* dst, size_t step, Size sz, void* )
{
IF_IPP(fixSteps(sz, sizeof(dst[0]), step1, step2, step);
ippiAbsDiff_32f_C1R(src1, (int)step1, src2, (int)step2, dst, (IppiSize&)sz),
ippiAbsDiff_32f_C1R(src1, (int)step1, src2, (int)step2, dst, (int)step, (IppiSize&)sz),
(vBinOp32f<OpAbsDiff<float>, IF_SIMD(_VAbsDiff32f)>(src1, step1, src2, step2, dst, step, sz)));
}
......@@ -803,15 +903,15 @@ static void absdiff64f( const double* src1, size_t step1,
double* dst, size_t step, Size sz, void* )
{
vBinOp64f<OpAbsDiff<double>, IF_SIMD(_VAbsDiff64f)>(src1, step1, src2, step2, dst, step, sz);
}
}
static void and8u( const uchar* src1, size_t step1,
const uchar* src2, size_t step2,
uchar* dst, size_t step, Size sz, void* )
{
IF_IPP(fixSteps(sz, sizeof(dst[0]), step1, step2, step);
ippiAnd_8u_C1R(src1, (int)step1, src2, (int)step2, dst, (IppiSize&)sz),
ippiAnd_8u_C1R(src1, (int)step1, src2, (int)step2, dst, (int)step, (IppiSize&)sz),
(vBinOp8<uchar, OpAnd<uchar>, IF_SIMD(_VAnd8u)>(src1, step1, src2, step2, dst, step, sz)));
}
......@@ -820,7 +920,7 @@ static void or8u( const uchar* src1, size_t step1,
uchar* dst, size_t step, Size sz, void* )
{
IF_IPP(fixSteps(sz, sizeof(dst[0]), step1, step2, step);
ippiOr_8u_C1R(src1, (int)step1, src2, (int)step2, dst, (IppiSize&)sz),
ippiOr_8u_C1R(src1, (int)step1, src2, (int)step2, dst, (int)step, (IppiSize&)sz),
(vBinOp8<uchar, OpOr<uchar>, IF_SIMD(_VOr8u)>(src1, step1, src2, step2, dst, step, sz)));
}
......@@ -829,23 +929,23 @@ static void xor8u( const uchar* src1, size_t step1,
uchar* dst, size_t step, Size sz, void* )
{
IF_IPP(fixSteps(sz, sizeof(dst[0]), step1, step2, step);
ippiXor_8u_C1R(src1, (int)step1, src2, (int)step2, dst, (IppiSize&)sz),
ippiXor_8u_C1R(src1, (int)step1, src2, (int)step2, dst, (int)step, (IppiSize&)sz),
(vBinOp8<uchar, OpXor<uchar>, IF_SIMD(_VXor8u)>(src1, step1, src2, step2, dst, step, sz)));
}
}
static void not8u( const uchar* src1, size_t step1,
const uchar* src2, size_t step2,
uchar* dst, size_t step, Size sz, void* )
{
IF_IPP(fixSteps(sz, sizeof(dst[0]), step1, step2, step);
ippiNot_8u_C1R(src1, (int)step1, dst, (IppiSize&)sz),
ippiNot_8u_C1R(src1, (int)step1, dst, (int)step, (IppiSize&)sz),
(vBinOp8<uchar, OpNot<uchar>, IF_SIMD(_VNot8u)>(src1, step1, src2, step2, dst, step, sz)));
}
/****************************************************************************************\
* logical operations *
\****************************************************************************************/
static inline bool checkScalar(const Mat& sc, int atype, int sckind, int akind)
{
if( sc.dims > 2 || (sc.cols != 1 && sc.rows != 1) || !sc.isContinuous() )
......@@ -856,7 +956,7 @@ static inline bool checkScalar(const Mat& sc, int atype, int sckind, int akind)
return sc.size() == Size(1, 1) || sc.size() == Size(1, cn) || sc.size() == Size(cn, 1) ||
(sc.size() == Size(1, 4) && sc.type() == CV_64F && cn <= 4);
}
static void convertAndUnrollScalar( const Mat& sc, int buftype, uchar* scbuf, size_t blocksize )
{
int scn = (int)sc.total(), cn = CV_MAT_CN(buftype);
......@@ -872,9 +972,9 @@ static void convertAndUnrollScalar( const Mat& sc, int buftype, uchar* scbuf, si
}
for( size_t i = esz; i < blocksize*esz; i++ )
scbuf[i] = scbuf[i - esz];
}
void binary_op(const InputArray& _src1, const InputArray& _src2, OutputArray& _dst,
const InputArray& _mask, const BinaryFunc* tab, bool bitwise)
{
......@@ -883,7 +983,7 @@ void binary_op(const InputArray& _src1, const InputArray& _src2, OutputArray& _d
bool haveMask = !_mask.empty(), haveScalar = false;
BinaryFunc func;
int c;
if( src1.dims <= 2 && src2.dims <= 2 && kind1 == kind2 &&
src1.size() == src2.size() && src1.type() == src2.type() && !haveMask )
{
......@@ -899,12 +999,12 @@ void binary_op(const InputArray& _src1, const InputArray& _src2, OutputArray& _d
func = tab[src1.depth()];
c = src1.channels();
}
Size sz = getContinuousSize(src1, src2, dst, c);
func(src1.data, src1.step, src2.data, src2.step, dst.data, dst.step, sz, 0);
return;
}
if( (kind1 == InputArray::MATX) + (kind2 == InputArray::MATX) == 1 ||
src1.size != src2.size || src1.type() != src2.type() )
{
......@@ -917,13 +1017,13 @@ void binary_op(const InputArray& _src1, const InputArray& _src2, OutputArray& _d
"nor 'array op scalar', nor 'scalar op array'" );
haveScalar = true;
}
size_t esz = src1.elemSize();
size_t blocksize0 = (BLOCK_SIZE + esz-1)/esz;
int cn = src1.channels();
BinaryFunc copymask = 0;
Mat mask;
if( haveMask )
{
mask = _mask.getMat();
......@@ -931,13 +1031,13 @@ void binary_op(const InputArray& _src1, const InputArray& _src2, OutputArray& _d
CV_Assert( mask.size == src1.size );
copymask = getCopyMaskFunc(esz);
}
AutoBuffer<uchar> _buf;
uchar *scbuf = 0, *maskbuf = 0;
_dst.create(src1.dims, src1.size, src1.type());
Mat dst = _dst.getMat();
if( bitwise )
{
func = *tab;
......@@ -948,35 +1048,35 @@ void binary_op(const InputArray& _src1, const InputArray& _src2, OutputArray& _d
func = tab[src1.depth()];
c = cn;
}
if( !haveScalar )
{
const Mat* arrays[] = { &src1, &src2, &dst, &mask, 0 };
uchar* ptrs[4];
NAryMatIterator it(arrays, ptrs);
size_t total = it.size, blocksize = total;
if( haveMask )
{
blocksize = std::min(blocksize, blocksize0);
_buf.allocate(blocksize*esz);
maskbuf = _buf;
}
for( size_t i = 0; i < it.nplanes; i++, ++it )
{
for( size_t j = 0; j < total; j += blocksize )
{
int bsz = (int)std::min(total - j, blocksize);
func( ptrs[0], 0, ptrs[1], 0, haveMask ? maskbuf : ptrs[2], 0, Size(bsz*c, 1), 0 );
func( ptrs[0], 0, ptrs[1], 0, haveMask ? maskbuf : ptrs[2], 0, Size(bsz*c, 1), 0 );
if( haveMask )
{
copymask( maskbuf, 0, ptrs[3], 0, ptrs[2], 0, Size(bsz, 1), &esz );
ptrs[3] += bsz;
}
bsz *= (int)esz;
ptrs[0] += bsz; ptrs[1] += bsz; ptrs[2] += bsz;
}
......@@ -986,41 +1086,41 @@ void binary_op(const InputArray& _src1, const InputArray& _src2, OutputArray& _d
{
const Mat* arrays[] = { &src1, &dst, &mask, 0 };
uchar* ptrs[3];
NAryMatIterator it(arrays, ptrs);
size_t total = it.size, blocksize = std::min(total, blocksize0);
_buf.allocate(blocksize*(haveMask ? 2 : 1)*esz + 32);
scbuf = _buf;
maskbuf = alignPtr(scbuf + blocksize*esz, 16);
convertAndUnrollScalar( src2, src1.type(), scbuf, blocksize);
for( size_t i = 0; i < it.nplanes; i++, ++it )
{
for( size_t j = 0; j < total; j += blocksize )
{
int bsz = (int)std::min(total - j, blocksize);
func( ptrs[0], 0, scbuf, 0, haveMask ? maskbuf : ptrs[1], 0, Size(bsz*c, 1), 0 );
if( haveMask )
{
copymask( maskbuf, 0, ptrs[2], 0, ptrs[1], 0, Size(bsz, 1), &esz );
ptrs[2] += bsz;
}
bsz *= (int)esz;
ptrs[0] += bsz; ptrs[1] += bsz;
}
}
}
}
static BinaryFunc maxTab[] =
{
(BinaryFunc)max8u, (BinaryFunc)max8s, (BinaryFunc)max16u, (BinaryFunc)max16s,
(BinaryFunc)max32s, (BinaryFunc)max32f, (BinaryFunc)max64f, 0
};
};
static BinaryFunc minTab[] =
{
......@@ -1029,7 +1129,7 @@ static BinaryFunc minTab[] =
};
}
void cv::bitwise_and(const InputArray& a, const InputArray& b, OutputArray c, const InputArray& mask)
{
BinaryFunc f = and8u;
......@@ -1068,26 +1168,26 @@ void cv::max(const Mat& src1, const Mat& src2, Mat& dst)
{
OutputArray _dst(dst);
binary_op(src1, src2, _dst, InputArray(), maxTab, false );
}
}
void cv::min(const Mat& src1, const Mat& src2, Mat& dst)
{
OutputArray _dst(dst);
binary_op(src1, src2, _dst, InputArray(), minTab, false );
}
void cv::max(const Mat& src1, double src2, Mat& dst)
{
OutputArray _dst(dst);
binary_op(src1, src2, _dst, InputArray(), maxTab, false );
}
}
void cv::min(const Mat& src1, double src2, Mat& dst)
{
OutputArray _dst(dst);
binary_op(src1, src2, _dst, InputArray(), minTab, false );
}
/****************************************************************************************\
* add/subtract *
\****************************************************************************************/
......@@ -1101,7 +1201,7 @@ void arithm_op(const InputArray& _src1, const InputArray& _src2, OutputArray& _d
int kind1 = _src1.kind(), kind2 = _src2.kind();
Mat src1 = _src1.getMat(), src2 = _src2.getMat();
bool haveMask = !_mask.empty();
if( kind1 == kind2 && src1.dims <= 2 && src2.dims <= 2 &&
src1.size() == src2.size() && src1.type() == src2.type() &&
!haveMask && ((!_dst.fixedType() && (dtype < 0 || CV_MAT_DEPTH(dtype) == src1.depth())) ||
......@@ -1113,9 +1213,9 @@ void arithm_op(const InputArray& _src1, const InputArray& _src2, OutputArray& _d
tab[src1.depth()](src1.data, src1.step, src2.data, src2.step, dst.data, dst.step, sz, usrdata);
return;
}
bool haveScalar = false, swapped12 = false;
if( (kind1 == InputArray::MATX) + (kind2 == InputArray::MATX) == 1 ||
src1.size != src2.size || src1.channels() != src2.channels() )
{
......@@ -1131,10 +1231,10 @@ void arithm_op(const InputArray& _src1, const InputArray& _src2, OutputArray& _d
"nor 'array op scalar', nor 'scalar op array'" );
haveScalar = true;
}
int cn = src1.channels(), depth1 = src1.depth(), depth2 = src2.depth(), wtype;
BinaryFunc cvtsrc1 = 0, cvtsrc2 = 0, cvtdst = 0;
if( dtype < 0 )
{
if( _dst.fixedType() )
......@@ -1149,7 +1249,7 @@ void arithm_op(const InputArray& _src1, const InputArray& _src2, OutputArray& _d
}
}
dtype = CV_MAT_DEPTH(dtype);
if( depth1 == depth2 && dtype == depth1 )
wtype = dtype;
else if( !muldiv )
......@@ -1157,7 +1257,7 @@ void arithm_op(const InputArray& _src1, const InputArray& _src2, OutputArray& _d
wtype = depth1 <= CV_8S && depth2 <= CV_8S ? CV_16S :
depth1 <= CV_32S && depth2 <= CV_32S ? CV_32S : std::max(depth1, depth2);
wtype = std::max(wtype, dtype);
// when the result of addition should be converted to an integer type,
// and just one of the input arrays is floating-point, it makes sense to convert that input to integer type before the operation,
// instead of converting the other input to floating-point and then converting the operation result back to integers.
......@@ -1169,20 +1269,20 @@ void arithm_op(const InputArray& _src1, const InputArray& _src2, OutputArray& _d
wtype = std::max(depth1, std::max(depth2, CV_32F));
wtype = std::max(wtype, dtype);
}
cvtsrc1 = depth1 == wtype ? 0 : getConvertFunc(depth1, wtype);
cvtsrc2 = depth2 == depth1 ? cvtsrc1 : depth2 == wtype ? 0 : getConvertFunc(depth2, wtype);
cvtdst = dtype == wtype ? 0 : getConvertFunc(wtype, dtype);
dtype = CV_MAKETYPE(dtype, cn);
wtype = CV_MAKETYPE(wtype, cn);
size_t esz1 = src1.elemSize(), esz2 = src2.elemSize();
size_t dsz = CV_ELEM_SIZE(dtype), wsz = CV_ELEM_SIZE(wtype);
size_t blocksize0 = (size_t)(BLOCK_SIZE + wsz-1)/wsz;
BinaryFunc copymask = 0;
Mat mask;
if( haveMask )
{
mask = _mask.getMat();
......@@ -1190,23 +1290,23 @@ void arithm_op(const InputArray& _src1, const InputArray& _src2, OutputArray& _d
CV_Assert( mask.size == src1.size );
copymask = getCopyMaskFunc(dsz);
}
AutoBuffer<uchar> _buf;
uchar *buf, *maskbuf = 0, *buf1 = 0, *buf2 = 0, *wbuf = 0;
size_t bufesz = (cvtsrc1 ? wsz : 0) + (cvtsrc2 || haveScalar ? wsz : 0) + (cvtdst ? wsz : 0) + (haveMask ? dsz : 0);
_dst.create(src1.dims, src1.size, src1.type());
Mat dst = _dst.getMat();
BinaryFunc func = tab[CV_MAT_DEPTH(wtype)];
if( !haveScalar )
{
const Mat* arrays[] = { &src1, &src2, &dst, &mask, 0 };
uchar* ptrs[4];
NAryMatIterator it(arrays, ptrs);
size_t total = it.size, blocksize = total;
if( haveMask || cvtsrc1 || cvtsrc2 || cvtdst )
blocksize = std::min(blocksize, blocksize0);
......@@ -1221,7 +1321,7 @@ void arithm_op(const InputArray& _src1, const InputArray& _src2, OutputArray& _d
buf = alignPtr(buf + blocksize*wsz, 16);
if( haveMask )
maskbuf = buf;
for( size_t i = 0; i < it.nplanes; i++, ++it )
{
for( size_t j = 0; j < total; j += blocksize )
......@@ -1242,7 +1342,7 @@ void arithm_op(const InputArray& _src1, const InputArray& _src2, OutputArray& _d
cvtsrc2( sptr2, 0, 0, 0, buf2, 0, bszn, 0 );
sptr2 = buf2;
}
if( !haveMask && !cvtdst )
func( sptr1, 0, sptr2, 0, dptr, 0, bszn, usrdata );
else
......@@ -1270,10 +1370,10 @@ void arithm_op(const InputArray& _src1, const InputArray& _src2, OutputArray& _d
{
const Mat* arrays[] = { &src1, &dst, &mask, 0 };
uchar* ptrs[3];
NAryMatIterator it(arrays, ptrs);
size_t total = it.size, blocksize = std::min(total, blocksize0);
_buf.allocate(bufesz*blocksize + 64);
buf = _buf;
if( cvtsrc1 )
......@@ -1284,9 +1384,9 @@ void arithm_op(const InputArray& _src1, const InputArray& _src2, OutputArray& _d
buf = alignPtr(buf + blocksize*wsz, 16);
if( haveMask )
maskbuf = buf;
convertAndUnrollScalar( src2, wtype, buf2, blocksize);
for( size_t i = 0; i < it.nplanes; i++, ++it )
{
for( size_t j = 0; j < total; j += blocksize )
......@@ -1296,16 +1396,16 @@ void arithm_op(const InputArray& _src1, const InputArray& _src2, OutputArray& _d
const uchar *sptr1 = ptrs[0];
const uchar* sptr2 = buf2;
uchar* dptr = ptrs[1];
if( cvtsrc1 )
{
cvtsrc1( sptr1, 0, 0, 0, buf1, 0, bszn, 0 );
sptr1 = buf1;
}
if( swapped12 )
std::swap(sptr1, sptr2);
if( !haveMask && !cvtdst )
func( sptr1, 0, sptr2, 0, dptr, 0, bszn, usrdata );
else
......@@ -1330,13 +1430,13 @@ void arithm_op(const InputArray& _src1, const InputArray& _src2, OutputArray& _d
}
}
}
static BinaryFunc addTab[] =
{
(BinaryFunc)add8u, (BinaryFunc)add8s, (BinaryFunc)add16u, (BinaryFunc)add16s,
(BinaryFunc)add32s, (BinaryFunc)add32f, (BinaryFunc)add64f, 0
};
static BinaryFunc subTab[] =
{
(BinaryFunc)sub8u, (BinaryFunc)sub8s, (BinaryFunc)sub16u, (BinaryFunc)sub16s,
......@@ -1348,10 +1448,10 @@ static BinaryFunc absdiffTab[] =
(BinaryFunc)absdiff8u, (BinaryFunc)absdiff8s, (BinaryFunc)absdiff16u,
(BinaryFunc)absdiff16s, (BinaryFunc)absdiff32s, (BinaryFunc)absdiff32f,
(BinaryFunc)absdiff64f, 0
};
};
}
void cv::add( const InputArray& src1, const InputArray& src2, OutputArray dst,
const InputArray& mask, int dtype )
{
......@@ -1367,7 +1467,7 @@ void cv::subtract( const InputArray& src1, const InputArray& src2, OutputArray d
void cv::absdiff( const InputArray& src1, const InputArray& src2, OutputArray dst )
{
arithm_op(src1, src2, dst, InputArray(), -1, absdiffTab);
}
}
/****************************************************************************************\
* multiply/divide *
......@@ -1437,7 +1537,7 @@ div_( const T* src1, size_t step1, const T* src2, size_t step2,
step1 /= sizeof(src1[0]);
step2 /= sizeof(src2[0]);
step /= sizeof(dst[0]);
for( ; size.height--; src1 += step1, src2 += step2, dst += step )
{
int i = 0;
......@@ -1450,12 +1550,12 @@ div_( const T* src1, size_t step1, const T* src2, size_t step2,
double d = scale/(a * b);
b *= d;
a *= d;
T z0 = saturate_cast<T>(src2[i+1] * ((double)src1[i] * b));
T z1 = saturate_cast<T>(src2[i] * ((double)src1[i+1] * b));
T z2 = saturate_cast<T>(src2[i+3] * ((double)src1[i+2] * a));
T z3 = saturate_cast<T>(src2[i+2] * ((double)src1[i+3] * a));
dst[i] = z0; dst[i+1] = z1;
dst[i+2] = z2; dst[i+3] = z3;
}
......@@ -1465,12 +1565,12 @@ div_( const T* src1, size_t step1, const T* src2, size_t step2,
T z1 = src2[i+1] != 0 ? saturate_cast<T>(src1[i+1]*scale/src2[i+1]) : 0;
T z2 = src2[i+2] != 0 ? saturate_cast<T>(src1[i+2]*scale/src2[i+2]) : 0;
T z3 = src2[i+3] != 0 ? saturate_cast<T>(src1[i+3]*scale/src2[i+3]) : 0;
dst[i] = z0; dst[i+1] = z1;
dst[i+2] = z2; dst[i+3] = z3;
}
}
for( ; i < size.width; i++ )
dst[i] = src2[i] != 0 ? saturate_cast<T>(src1[i]*scale/src2[i]) : 0;
}
......@@ -1482,7 +1582,7 @@ recip_( const T*, size_t, const T* src2, size_t step2,
{
step2 /= sizeof(src2[0]);
step /= sizeof(dst[0]);
for( ; size.height--; src2 += step2, dst += step )
{
int i = 0;
......@@ -1495,12 +1595,12 @@ recip_( const T*, size_t, const T* src2, size_t step2,
double d = scale/(a * b);
b *= d;
a *= d;
T z0 = saturate_cast<T>(src2[i+1] * b);
T z1 = saturate_cast<T>(src2[i] * b);
T z2 = saturate_cast<T>(src2[i+3] * a);
T z3 = saturate_cast<T>(src2[i+2] * a);
dst[i] = z0; dst[i+1] = z1;
dst[i+2] = z2; dst[i+3] = z3;
}
......@@ -1515,13 +1615,13 @@ recip_( const T*, size_t, const T* src2, size_t step2,
dst[i+2] = z2; dst[i+3] = z3;
}
}
for( ; i < size.width; i++ )
dst[i] = src2[i] != 0 ? saturate_cast<T>(scale/src2[i]) : 0;
}
}
static void mul8u( const uchar* src1, size_t step1, const uchar* src2, size_t step2,
uchar* dst, size_t step, Size sz, void* scale)
{
......@@ -1551,7 +1651,7 @@ static void mul32s( const int* src1, size_t step1, const int* src2, size_t step2
{
mul_(src1, step1, src2, step2, dst, step, sz, *(const double*)scale);
}
static void mul32f( const float* src1, size_t step1, const float* src2, size_t step2,
float* dst, size_t step, Size sz, void* scale)
{
......@@ -1563,7 +1663,7 @@ static void mul64f( const double* src1, size_t step1, const double* src2, size_t
{
mul_(src1, step1, src2, step2, dst, step, sz, *(const double*)scale);
}
static void div8u( const uchar* src1, size_t step1, const uchar* src2, size_t step2,
uchar* dst, size_t step, Size sz, void* scale)
{
......@@ -1650,8 +1750,8 @@ static void recip64f( const double* src1, size_t step1, const double* src2, size
{
recip_(src1, step1, src2, step2, dst, step, sz, *(const double*)scale);
}
static BinaryFunc mulTab[] =
{
(BinaryFunc)mul8u, (BinaryFunc)mul8s, (BinaryFunc)mul16u,
......@@ -1673,9 +1773,9 @@ static BinaryFunc recipTab[] =
(BinaryFunc)recip64f, 0
};
}
void cv::multiply(const InputArray& src1, const InputArray& src2,
OutputArray dst, double scale, int dtype)
{
......@@ -1692,8 +1792,8 @@ void cv::divide(double scale, const InputArray& src2,
OutputArray dst, int dtype)
{
arithm_op(src2, src2, dst, InputArray(), dtype, recipTab, true, &scale);
}
}
/****************************************************************************************\
* addWeighted *
\****************************************************************************************/
......@@ -1739,34 +1839,34 @@ addWeighted8u( const uchar* src1, size_t step1,
{
const double* scalars = (const double*)_scalars;
float alpha = (float)scalars[0], beta = (float)scalars[1], gamma = (float)scalars[2];
for( ; size.height--; src1 += step1, src2 += step2, dst += step )
{
int x = 0;
#if CV_SSE2
if( USE_SSE2 )
{
__m128 a4 = _mm_set1_ps(alpha), b4 = _mm_set1_ps(beta), g4 = _mm_set1_ps(gamma);
__m128i z = _mm_setzero_si128();
for( ; x <= size.width - 8; x += 8 )
{
__m128i u = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(src1 + x)), z);
__m128i v = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(src2 + x)), z);
__m128 u0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(u, z));
__m128 u1 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(u, z));
__m128 v0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v, z));
__m128 v1 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v, z));
u0 = _mm_add_ps(_mm_mul_ps(u0, a4), _mm_mul_ps(v0, b4));
u1 = _mm_add_ps(_mm_mul_ps(u1, a4), _mm_mul_ps(v1, b4));
u0 = _mm_add_ps(u0, g4); u1 = _mm_add_ps(u1, g4);
u = _mm_packs_epi32(_mm_cvtps_epi32(u0), _mm_cvtps_epi32(u1));
u = _mm_packus_epi16(u, u);
_mm_storel_epi64((__m128i*)(dst + x), u);
}
}
......@@ -1837,9 +1937,9 @@ static BinaryFunc addWeightedTab[] =
(BinaryFunc)addWeighted16s, (BinaryFunc)addWeighted32s, (BinaryFunc)addWeighted32f,
(BinaryFunc)addWeighted64f, 0
};
}
void cv::addWeighted( const InputArray& src1, double alpha, const InputArray& src2,
double beta, double gamma, OutputArray dst, int dtype )
{
......@@ -1847,7 +1947,7 @@ void cv::addWeighted( const InputArray& src1, double alpha, const InputArray& sr
arithm_op(src1, src2, dst, InputArray(), dtype, addWeightedTab, true, scalars);
}
/****************************************************************************************\
* compare *
\****************************************************************************************/
......@@ -1867,7 +1967,7 @@ cmp_(const T* src1, size_t step1, const T* src2, size_t step2,
std::swap(step1, step2);
code = code == CMP_GE ? CMP_LE : CMP_GT;
}
if( code == CMP_GT || code == CMP_LE )
{
int m = code == CMP_GT ? 0 : 255;
......@@ -1884,7 +1984,7 @@ cmp_(const T* src1, size_t step1, const T* src2, size_t step2,
t1 = -(src1[x+3] > src2[x+3]) ^ m;
dst[x+2] = (uchar)t0; dst[x+3] = (uchar)t1;
}
for( ; x < size.width; x++ )
dst[x] = (uchar)(-(src1[x] > src2[x]) ^ m);
}
......@@ -1905,14 +2005,14 @@ cmp_(const T* src1, size_t step1, const T* src2, size_t step2,
t1 = -(src1[x+3] == src2[x+3]) ^ m;
dst[x+2] = (uchar)t0; dst[x+3] = (uchar)t1;
}
for( ; x < size.width; x++ )
dst[x] = (uchar)(-(src1[x] == src2[x]) ^ m);
}
}
}
static void cmp8u(const uchar* src1, size_t step1, const uchar* src2, size_t step2,
uchar* dst, size_t step, Size size, void* _cmpop)
{
......@@ -1953,8 +2053,8 @@ static void cmp64f(const double* src1, size_t step1, const double* src2, size_t
uchar* dst, size_t step, Size size, void* _cmpop)
{
cmp_(src1, step1, src2, step2, dst, step, size, *(int*)_cmpop);
}
}
static BinaryFunc cmpTab[] =
{
(BinaryFunc)cmp8u, (BinaryFunc)cmp8s, (BinaryFunc)cmp16u,
......@@ -1962,7 +2062,7 @@ static BinaryFunc cmpTab[] =
(BinaryFunc)cmp64f, 0
};
static double getMinVal(int depth)
{
static const double tab[] = {0, -128, 0, -32768, INT_MIN, -FLT_MAX, -DBL_MAX, 0};
......@@ -1973,18 +2073,18 @@ static double getMaxVal(int depth)
{
static const double tab[] = {255, 127, 65535, 32767, INT_MAX, FLT_MAX, DBL_MAX, 0};
return tab[depth];
}
}
}
void cv::compare(const InputArray& _src1, const InputArray& _src2, OutputArray _dst, int op)
{
CV_Assert( op == CMP_LT || op == CMP_LE || op == CMP_EQ ||
op == CMP_NE || op == CMP_GE || op == CMP_GT );
int kind1 = _src1.kind(), kind2 = _src2.kind();
Mat src1 = _src1.getMat(), src2 = _src2.getMat();
if( kind1 == kind2 && src1.dims <= 2 && src2.dims <= 2 && src1.size() == src2.size() && src1.type() == src2.type() )
{
_dst.create(src1.size(), CV_8UC1);
......@@ -1993,9 +2093,9 @@ void cv::compare(const InputArray& _src1, const InputArray& _src2, OutputArray _
cmpTab[src1.depth()](src1.data, src1.step, src2.data, src2.step, dst.data, dst.step, sz, &op);
return;
}
bool haveScalar = false;
if( (kind1 == InputArray::MATX) + (kind2 == InputArray::MATX) == 1 ||
src1.size != src2.size || src1.type() != src2.type() )
{
......@@ -2012,26 +2112,26 @@ void cv::compare(const InputArray& _src1, const InputArray& _src2, OutputArray _
"nor 'array op scalar', nor 'scalar op array'" );
haveScalar = true;
}
int cn = src1.channels(), depth1 = src1.depth(), depth2 = src2.depth();
if( cn != 1 )
CV_Error( CV_StsUnsupportedFormat, "compare() can only process single-channel arrays" );
size_t esz = src1.elemSize();
size_t blocksize0 = (size_t)(BLOCK_SIZE + esz-1)/esz;
_dst.create(src1.dims, src1.size, CV_8U);
Mat dst = _dst.getMat();
BinaryFunc func = cmpTab[depth1];
if( !haveScalar )
{
const Mat* arrays[] = { &src1, &src2, &dst, 0 };
uchar* ptrs[3];
NAryMatIterator it(arrays, ptrs);
size_t total = it.size;
for( size_t i = 0; i < it.nplanes; i++, ++it )
func( ptrs[0], 0, ptrs[1], 0, ptrs[2], 0, Size((int)total, 1), &op );
}
......@@ -2039,10 +2139,10 @@ void cv::compare(const InputArray& _src1, const InputArray& _src2, OutputArray _
{
const Mat* arrays[] = { &src1, &dst, 0 };
uchar* ptrs[2];
NAryMatIterator it(arrays, ptrs);
size_t total = it.size, blocksize = std::min(total, blocksize0);
AutoBuffer<uchar> _buf(blocksize*esz);
uchar *buf = _buf;
......@@ -2057,13 +2157,13 @@ void cv::compare(const InputArray& _src1, const InputArray& _src2, OutputArray _
dst = Scalar::all(op == CMP_GT || op == CMP_GE || op == CMP_NE ? 255 : 0);
return;
}
if( fval > getMaxVal(depth1) )
{
dst = Scalar::all(op == CMP_LT || op == CMP_LE || op == CMP_NE ? 255 : 0);
return;
}
int ival = cvRound(fval);
if( fval != ival )
{
......@@ -2079,7 +2179,7 @@ void cv::compare(const InputArray& _src1, const InputArray& _src2, OutputArray _
}
convertAndUnrollScalar(Mat(1, 1, CV_32S, &ival), depth1, buf, blocksize);
}
for( size_t i = 0; i < it.nplanes; i++, ++it )
{
for( size_t j = 0; j < total; j += blocksize )
......@@ -2092,7 +2192,7 @@ void cv::compare(const InputArray& _src1, const InputArray& _src2, OutputArray _
}
}
}
/****************************************************************************************\
* inRange *
\****************************************************************************************/
......@@ -2108,7 +2208,7 @@ inRange_(const T* src1, size_t step1, const T* src2, size_t step2,
step1 /= sizeof(src1[0]);
step2 /= sizeof(src2[0]);
step3 /= sizeof(src3[0]);
for( ; size.height--; src1 += step1, src2 += step2, src3 += step3, dst += step )
{
int x = 0;
......@@ -2122,13 +2222,13 @@ inRange_(const T* src1, size_t step1, const T* src2, size_t step2,
t1 = src2[x+3] <= src1[x+3] && src1[x+3] <= src3[x+3];
dst[x+2] = (uchar)-t0; dst[x+3] = (uchar)-t1;
}
for( ; x < size.width; x++ )
dst[x] = (uchar)-(src2[x] <= src1[x] && src1[x] <= src3[x]);
}
}
static void inRange8u(const uchar* src1, size_t step1, const uchar* src2, size_t step2,
const uchar* src3, size_t step3, uchar* dst, size_t step, Size size)
{
......@@ -2169,7 +2269,7 @@ static void inRange64f(const double* src1, size_t step1, const double* src2, siz
const double* src3, size_t step3, uchar* dst, size_t step, Size size)
{
inRange_(src1, step1, src2, step2, src3, step3, dst, step, size);
}
}
static void inRangeReduce(const uchar* src, uchar* dst, size_t len, int cn)
{
......@@ -2187,14 +2287,14 @@ static void inRangeReduce(const uchar* src, uchar* dst, size_t len, int cn)
else
for( i = j = 0; i < len; i++, j += cn )
dst[i] = src[j] & src[j+1] & src[j+2] & src[j+3];
for( ; k < cn; k += 4 )
{
for( i = 0, j = k; i < len; i++, j += cn )
dst[i] &= src[j] & src[j+1] & src[j+2] & src[j+3];
}
}
typedef void (*InRangeFunc)( const uchar* src1, size_t step1, const uchar* src2, size_t step2,
const uchar* src3, size_t step3, uchar* dst, size_t step, Size sz );
......@@ -2204,7 +2304,7 @@ static InRangeFunc inRangeTab[] =
(InRangeFunc)inRange16s, (InRangeFunc)inRange32s, (InRangeFunc)inRange32f,
(InRangeFunc)inRange64f, 0
};
}
void cv::inRange(const InputArray& _src, const InputArray& _lowerb,
......@@ -2212,9 +2312,9 @@ void cv::inRange(const InputArray& _src, const InputArray& _lowerb,
{
int skind = _src.kind(), lkind = _lowerb.kind(), ukind = _upperb.kind();
Mat src = _src.getMat(), lb = _lowerb.getMat(), ub = _upperb.getMat();
bool lbScalar = false, ubScalar = false;
if( (lkind == InputArray::MATX && skind != InputArray::MATX) ||
src.size != lb.size || src.type() != lb.type() )
{
......@@ -2223,7 +2323,7 @@ void cv::inRange(const InputArray& _src, const InputArray& _lowerb,
"The lower bounary is neither an array of the same size and same type as src, nor a scalar");
lbScalar = true;
}
if( (ukind == InputArray::MATX && skind != InputArray::MATX) ||
src.size != ub.size || src.type() != ub.type() )
{
......@@ -2232,47 +2332,47 @@ void cv::inRange(const InputArray& _src, const InputArray& _lowerb,
"The upper bounary is neither an array of the same size and same type as src, nor a scalar");
ubScalar = true;
}
CV_Assert( ((int)lbScalar ^ (int)ubScalar) == 0 );
int cn = src.channels(), depth = src.depth();
size_t esz = src.elemSize();
size_t blocksize0 = (size_t)(BLOCK_SIZE + esz-1)/esz;
_dst.create(src.dims, src.size, CV_8U);
Mat dst = _dst.getMat();
InRangeFunc func = inRangeTab[depth];
const Mat* arrays_sc[] = { &src, &dst, 0 };
const Mat* arrays_nosc[] = { &src, &dst, &lb, &ub, 0 };
uchar* ptrs[4];
NAryMatIterator it(lbScalar && ubScalar ? arrays_sc : arrays_nosc, ptrs);
size_t total = it.size, blocksize = std::min(total, blocksize0);
AutoBuffer<uchar> _buf(blocksize*(((int)lbScalar + (int)ubScalar)*esz + cn) + 2*cn*sizeof(int) + 128);
uchar *buf = _buf, *mbuf = buf, *lbuf = 0, *ubuf = 0;
buf = alignPtr(buf + blocksize*cn, 16);
if( lbScalar && ubScalar )
{
lbuf = buf;
ubuf = buf = alignPtr(buf + blocksize*esz, 16);
CV_Assert( lb.type() == ub.type() );
int scdepth = lb.depth();
if( scdepth != depth && depth < CV_32S )
{
int* ilbuf = (int*)alignPtr(buf + blocksize*esz, 16);
int* iubuf = ilbuf + cn;
BinaryFunc sccvtfunc = getConvertFunc(scdepth, CV_32S);
sccvtfunc(lb.data, 0, 0, 0, (uchar*)ilbuf, 0, Size(cn, 1), 0);
sccvtfunc(ub.data, 0, 0, 0, (uchar*)iubuf, 0, Size(cn, 1), 0);
int minval = cvRound(getMinVal(depth)), maxval = cvRound(getMaxVal(depth));
for( int k = 0; k < cn; k++ )
{
if( ilbuf[k] > iubuf[k] || ilbuf[k] > maxval || iubuf[k] < minval )
......@@ -2281,11 +2381,11 @@ void cv::inRange(const InputArray& _src, const InputArray& _lowerb,
lb = Mat(cn, 1, CV_32S, ilbuf);
ub = Mat(cn, 1, CV_32S, iubuf);
}
convertAndUnrollScalar( lb, src.type(), lbuf, blocksize );
convertAndUnrollScalar( ub, src.type(), ubuf, blocksize );
}
for( size_t i = 0; i < it.nplanes; i++, ++it )
{
for( size_t j = 0; j < total; j += blocksize )
......
......@@ -646,8 +646,8 @@ static void GEMMBlockMul_64fc( const Complexd* a_data, size_t a_step,
{
GEMMBlockMul(a_data, a_step, b_data, b_step, d_data, d_step, a_size, d_size, flags);
}
static void GEMMStore_32f( const float* c_data, size_t c_step,
const double* d_buf, size_t d_buf_step,
float* d_data, size_t d_step, Size d_size,
......@@ -664,7 +664,7 @@ static void GEMMStore_64f( const double* c_data, size_t c_step,
{
GEMMStore(c_data, c_step, d_buf, d_buf_step, d_data, d_step, d_size, alpha, beta, flags);
}
static void GEMMStore_32fc( const Complexf* c_data, size_t c_step,
const Complexd* d_buf, size_t d_buf_step,
......@@ -1130,7 +1130,7 @@ void cv::gemm( const InputArray& matA, const InputArray& matB, double alpha,
int dm0, dn0, dk0;
size_t a_step0, a_step1, b_step0, b_step1, c_step0, c_step1;
int work_elem_size = elem_size << (CV_MAT_DEPTH(type) == CV_32F ? 1 : 0);
if( !is_a_t )
a_step0 = A.step, a_step1 = elem_size;
else
......@@ -1273,7 +1273,7 @@ template<typename T, typename WT> static void
transform_( const T* src, T* dst, const WT* m, int len, int scn, int dcn )
{
int x;
if( scn == 2 && dcn == 2 )
{
for( x = 0; x < len*2; x += 2 )
......@@ -1352,7 +1352,7 @@ load4x4Matrix( const float* m, __m128& m0, __m128& m1, __m128& m2, __m128& m3, _
}
#endif
static void
transform_8u( const uchar* src, uchar* dst, const float* m, int len, int scn, int dcn )
{
......@@ -1379,7 +1379,7 @@ transform_8u( const uchar* src, uchar* dst, const float* m, int len, int scn, in
__m128i m2 = _mm_setr_epi16(0, m20, m21, m22, m20, m21, m22, 0);
__m128i m3 = _mm_setr_epi32(m03, m13, m23, 0);
int x = 0;
for( ; x <= (len - 8)*3; x += 8*3 )
{
__m128i z = _mm_setzero_si128(), t0, t1, t2, r0, r1;
......@@ -1470,14 +1470,14 @@ transform_8u( const uchar* src, uchar* dst, const float* m, int len, int scn, in
return;
}
#endif
transform_(src, dst, m, len, scn, dcn);
}
static void
transform_16u( const ushort* src, ushort* dst, const float* m, int len, int scn, int dcn )
{
#if CV_SSE2
#if CV_SSE2
if( USE_SSE2 && scn == 3 && dcn == 3 )
{
__m128 m0, m1, m2, m3;
......@@ -1536,11 +1536,11 @@ transform_16u( const ushort* src, ushort* dst, const float* m, int len, int scn,
return;
}
#endif
transform_(src, dst, m, len, scn, dcn);
}
static void
transform_32f( const float* src, float* dst, const float* m, int len, int scn, int dcn )
{
......@@ -1574,12 +1574,12 @@ transform_32f( const float* src, float* dst, const float* m, int len, int scn, i
}
return;
}
if( scn == 4 && dcn == 4 )
{
__m128 m0, m1, m2, m3, m4;
load4x4Matrix(m, m0, m1, m2, m3, m4);
for( ; x < len*4; x += 4 )
{
__m128 x0 = _mm_loadu_ps(src + x);
......@@ -1616,18 +1616,18 @@ transform_32s(const int* src, int* dst, const double* m, int len, int scn, int d
{
transform_(src, dst, m, len, scn, dcn);
}
static void
transform_64f(const double* src, double* dst, const double* m, int len, int scn, int dcn)
{
transform_(src, dst, m, len, scn, dcn);
}
}
template<typename T, typename WT> static void
diagtransform_( const T* src, T* dst, const WT* m, int len, int cn, int )
{
int x;
if( cn == 2 )
{
for( x = 0; x < len*2; x += 2 )
......@@ -1674,8 +1674,8 @@ static void
diagtransform_8u(const uchar* src, uchar* dst, const float* m, int len, int scn, int dcn)
{
diagtransform_(src, dst, m, len, scn, dcn);
}
}
static void
diagtransform_8s(const schar* src, schar* dst, const float* m, int len, int scn, int dcn)
{
......@@ -1686,8 +1686,8 @@ static void
diagtransform_16u(const ushort* src, ushort* dst, const float* m, int len, int scn, int dcn)
{
diagtransform_(src, dst, m, len, scn, dcn);
}
}
static void
diagtransform_16s(const short* src, short* dst, const float* m, int len, int scn, int dcn)
{
......@@ -1704,17 +1704,17 @@ static void
diagtransform_32f(const float* src, float* dst, const float* m, int len, int scn, int dcn)
{
diagtransform_(src, dst, m, len, scn, dcn);
}
}
static void
diagtransform_64f(const double* src, double* dst, const double* m, int len, int scn, int dcn)
{
diagtransform_(src, dst, m, len, scn, dcn);
}
}
typedef void (*TransformFunc)( const uchar* src, uchar* dst, const uchar* m, int, int, int );
static TransformFunc transformTab[] =
{
(TransformFunc)transform_8u, (TransformFunc)transform_8s, (TransformFunc)transform_16u,
......@@ -1728,23 +1728,23 @@ static TransformFunc diagTransformTab[] =
(TransformFunc)diagtransform_16s, (TransformFunc)diagtransform_32s, (TransformFunc)diagtransform_32f,
(TransformFunc)diagtransform_64f, 0
};
}
void cv::transform( const InputArray& _src, OutputArray _dst, const InputArray& _mtx )
{
Mat src = _src.getMat(), m = _mtx.getMat();
int depth = src.depth(), scn = src.channels(), dcn = m.rows;
CV_Assert( scn == m.cols || scn + 1 == m.cols );
bool isDiag = false;
_dst.create( src.size(), CV_MAKETYPE(depth, dcn) );
Mat dst = _dst.getMat();
int mtype = depth == CV_32S || depth == CV_64F ? CV_64F : CV_32F;
AutoBuffer<double> _mbuf;
double* mbuf = _mbuf;
if( !m.isContinuous() || m.type() != mtype || m.cols != scn + 1 )
{
_mbuf.allocate(dcn*(scn+1));
......@@ -1791,12 +1791,12 @@ void cv::transform( const InputArray& _src, OutputArray _dst, const InputArray&
TransformFunc func = isDiag ? diagTransformTab[depth] : transformTab[depth];
CV_Assert( func != 0 );
const Mat* arrays[] = {&src, &dst, 0};
uchar* ptrs[2];
NAryMatIterator it(arrays, ptrs);
size_t i, total = it.size;
for( i = 0; i < it.nplanes; i++, ++it )
func( ptrs[0], ptrs[1], (uchar*)mbuf, (int)total, scn, dcn );
}
......@@ -1813,7 +1813,7 @@ perspectiveTransform_( const T* src, T* dst, const double* m, int len, int scn,
{
const double eps = FLT_EPSILON;
int i;
if( scn == 2 && dcn == 2 )
{
for( i = 0; i < len*2; i += 2 )
......@@ -1837,7 +1837,7 @@ perspectiveTransform_( const T* src, T* dst, const double* m, int len, int scn,
{
T x = src[i], y = src[i + 1], z = src[i + 2];
double w = x*m[12] + y*m[13] + z*m[14] + m[15];
if( fabs(w) > eps )
{
w = 1./w;
......@@ -1855,7 +1855,7 @@ perspectiveTransform_( const T* src, T* dst, const double* m, int len, int scn,
{
T x = src[0], y = src[1], z = src[2];
double w = x*m[8] + y*m[9] + z*m[10] + m[11];
if( fabs(w) > eps )
{
w = 1./w;
......@@ -1893,7 +1893,7 @@ perspectiveTransform_( const T* src, T* dst, const double* m, int len, int scn,
}
}
static void
perspectiveTransform_32f(const float* src, float* dst, const double* m, int len, int scn, int dcn)
{
......@@ -1905,22 +1905,22 @@ perspectiveTransform_64f(const double* src, double* dst, const double* m, int le
{
perspectiveTransform_(src, dst, m, len, scn, dcn);
}
}
void cv::perspectiveTransform( const InputArray& _src, OutputArray _dst, const InputArray& _mtx )
{
Mat src = _src.getMat(), m = _mtx.getMat();
int depth = src.depth(), scn = src.channels(), dcn = m.rows-1;
CV_Assert( scn + 1 == m.cols && (depth == CV_32F || depth == CV_64F));
_dst.create( src.size(), CV_MAKETYPE(depth, dcn) );
Mat dst = _dst.getMat();
const int mtype = CV_64F;
AutoBuffer<double> _mbuf;
double* mbuf = _mbuf;
if( !m.isContinuous() || m.type() != mtype )
{
_mbuf.allocate((dcn+1)*(scn+1));
......@@ -1930,20 +1930,20 @@ void cv::perspectiveTransform( const InputArray& _src, OutputArray _dst, const I
}
else
mbuf = (double*)m.data;
TransformFunc func = depth == CV_32F ?
(TransformFunc)perspectiveTransform_32f :
(TransformFunc)perspectiveTransform_64f;
CV_Assert( func != 0 );
const Mat* arrays[] = {&src, &dst, 0};
uchar* ptrs[2];
NAryMatIterator it(arrays, ptrs);
size_t i, total = it.size;
for( i = 0; i < it.nplanes; i++, ++it )
func( ptrs[0], ptrs[1], (uchar*)mbuf, (int)total, scn, dcn );
}
}
/****************************************************************************************\
* ScaleAdd *
......@@ -2000,7 +2000,7 @@ static void scaleAdd_32f(const float* src1, const float* src2, float* dst,
dst[i] = src1[i]*alpha + src2[i];
}
static void scaleAdd_64f(const double* src1, const double* src2, double* dst,
int len, double* _alpha)
{
......@@ -2040,39 +2040,39 @@ static void scaleAdd_64f(const double* src1, const double* src2, double* dst,
typedef void (*ScaleAddFunc)(const uchar* src1, const uchar* src2, uchar* dst, int len, const void* alpha);
}
void cv::scaleAdd( const InputArray& _src1, double alpha, const InputArray& _src2, OutputArray _dst )
{
Mat src1 = _src1.getMat(), src2 = _src2.getMat();
int depth = src1.depth(), cn = src1.channels();
CV_Assert( src1.type() == src2.type() );
if( depth < CV_32F )
{
addWeighted(_src1, alpha, _src2, 1, 0, _dst, depth);
return;
}
_dst.create(src1.dims, src1.size, src1.type());
Mat dst = _dst.getMat();
float falpha = (float)alpha;
void* palpha = depth == CV_32F ? (void*)&falpha : (void*)&alpha;
ScaleAddFunc func = depth == CV_32F ? (ScaleAddFunc)scaleAdd_32f : (ScaleAddFunc)scaleAdd_64f;
if( src1.isContinuous() && src2.isContinuous() && dst.isContinuous() )
{
size_t len = src1.total()*cn;
func(src1.data, src2.data, dst.data, (int)len, palpha);
return;
}
const Mat* arrays[] = {&src1, &src2, &dst, 0};
uchar* ptrs[3];
NAryMatIterator it(arrays, ptrs);
size_t i, len = it.size*cn;
for( i = 0; i < it.nplanes; i++, ++it )
func( ptrs[0], ptrs[1], ptrs[2], (int)len, palpha );
}
......@@ -2243,7 +2243,7 @@ double cv::Mahalonobis( const InputArray& _v1, const InputArray& _v2, const Inpu
{
return Mahalanobis(_v1, _v2, _icovar);
}
/****************************************************************************************\
* MulTransposed *
\****************************************************************************************/
......@@ -2445,7 +2445,7 @@ MulTransposedL( const Mat& srcmat, Mat& dstmat, const Mat& deltamat, double scal
typedef void (*MulTransposedFunc)(const Mat& src, Mat& dst, const Mat& delta, double scale);
}
void cv::mulTransposed( const InputArray& _src, OutputArray _dst, bool ata,
const InputArray& _delta, double scale, int dtype )
{
......@@ -2578,7 +2578,7 @@ dotProd_(const T* src1, const T* src2, int len)
(double)src1[i+2]*src2[i+2] + (double)src1[i+3]*src2[i+3];
for( ; i < len; i++ )
result += (double)src1[i]*src2[i];
return result;
}
......@@ -2590,9 +2590,10 @@ static double dotProd_8u(const uchar* src1, const uchar* src2, int len)
ippiDotProd_8u64f_C1R(src1, (int)(len*sizeof(src1[0])),
src2, (int)(len*sizeof(src2[0])),
ippiSize(len, 1), &r);
return r;
#else
int i = 0;
#if CV_SSE2
if( USE_SSE2 )
{
......@@ -2616,7 +2617,7 @@ static double dotProd_8u(const uchar* src1, const uchar* src2, int len)
s = _mm_add_epi32(s, s0);
s = _mm_add_epi32(s, s2);
}
for( ; j < blockSize; j += 4 )
{
__m128i s0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src1 + j)), z);
......@@ -2627,7 +2628,7 @@ static double dotProd_8u(const uchar* src1, const uchar* src2, int len)
CV_DECL_ALIGNED(16) int buf[4];
_mm_store_si128((__m128i*)buf, s);
r += buf[0] + buf[1] + buf[2] + buf[3];
src1 += blockSize;
src2 += blockSize;
i += blockSize;
......@@ -2692,7 +2693,7 @@ static double dotProd_64f(const double* src1, const double* src2, int len)
typedef double (*DotProdFunc)(const uchar* src1, const uchar* src2, int len);
static DotProdFunc dotProdTab[] =
{
(DotProdFunc)dotProd_8u, (DotProdFunc)dotProd_8s, (DotProdFunc)dotProd_16u,
......@@ -2713,16 +2714,16 @@ double Mat::dot(const InputArray& _mat) const
if( len == (size_t)(int)len )
return func(data, mat.data, len);
}
const Mat* arrays[] = {this, &mat, 0};
uchar* ptrs[2];
NAryMatIterator it(arrays, ptrs);
int len = (int)(it.size*cn);
double r = 0;
for( size_t i = 0; i < it.nplanes; i++, ++it )
r += func( ptrs[0], ptrs[1], len );
return r;
}
......@@ -3027,12 +3028,12 @@ cvCalcPCA( const CvArr* data_arr, CvArr* avg_arr, CvArr* eigenvals, CvArr* eigen
evects = pca.eigenvectors;
int ecount0 = evals0.cols + evals0.rows - 1;
int ecount = evals.cols + evals.rows - 1;
CV_Assert( (evals0.cols == 1 || evals0.rows == 1) &&
ecount0 <= ecount &&
evects0.cols == evects.cols &&
evects0.rows == ecount0 );
cv::Mat temp = evals0;
if( evals.rows == 1 )
evals.colRange(0, ecount0).convertTo(temp, evals0.type());
......
......@@ -87,7 +87,7 @@ extern const uchar g_Saturate8u[];
void deleteThreadAllocData();
void deleteThreadRNGData();
#endif
template<typename T1, typename T2=T1, typename T3=T1> struct OpAdd
{
typedef T1 type1;
......@@ -176,24 +176,24 @@ typedef void (*BinaryFunc)(const uchar* src1, size_t step1,
void*);
BinaryFunc getConvertFunc(int sdepth, int ddepth);
BinaryFunc getConvertScaleFunc(int sdepth, int ddepth);
BinaryFunc getConvertScaleFunc(int sdepth, int ddepth);
BinaryFunc getCopyMaskFunc(size_t esz);
enum { BLOCK_SIZE = 1024 };
#ifdef HAVE_IPP
static inline IppiSize ippiSize(int width, int height) { IppiSize sz={width, height}; return sz; }
static inline IppiSize ippiSize(Size _sz) { reIppiSize sz={_sz.width, _sz.height}; return sz; }
static inline IppiSize ippiSize(int width, int height) { IppiSize sz = { width, height}; return sz; }
static inline IppiSize ippiSize(Size _sz) { IppiSize sz = { _sz.width, _sz.height}; return sz; }
#endif
#if defined HAVE_IPP && (IPP_VERSION_MAJOR >= 7)
#define ARITHM_USE_IPP 1
#define IF_IPP(then_call, else_call) then_call
#else
#define ARITHM_USE_IPP 0
#define IF_IPP(then_call, else_call) else_call
#endif
#endif
}
#endif /*_CXCORE_INTERNAL_H_*/
......@@ -170,9 +170,10 @@ struct IPPInitializer
IPPInitializer ippInitializer;
#else
volatile bool useOptimizedFlag = false;
volatile bool USE_SSE2 = false;
#endif
volatile bool USE_SSE2 = false;
void setUseOptimized( bool flag )
{
useOptimizedFlag = flag;
......