Commit ca00c1dc authored by Alexander Alekhin

Merge pull request #13631 from terfendail:thresh_wintr

parents 133eb8d1 a202dc9a
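
Note: this merge replaces the fixed-width SIMD128 code paths in modules/imgproc/src/thresh.cpp with OpenCV's width-agnostic wide universal intrinsics: `v_uint8x16`/`v_load`/`v_setall_u8` become `v_uint8`/`vx_load`/`vx_setall_u8`, hard-coded lane counts (16, 8, 4, 2) become `::nlanes`, and the runtime `checkHardwareSupport()` guard gives way to the compile-time `CV_SIMD`/`CV_SIMD_64F` macros, so the same source compiles to 128-, 256- or 512-bit code depending on the build. A minimal sketch of the pattern, assuming an OpenCV 4.x build where `opencv2/core/hal/intrin.hpp` is available (the function and buffer names here are illustrative, not from the patch):

```cpp
// Sketch of the wide-universal-intrinsics pattern this PR applies.
#include <opencv2/core.hpp>
#include <opencv2/core/hal/intrin.hpp>
using namespace cv;

static void binaryThresholdRow(const uchar* src, uchar* dst, int width,
                               uchar thresh, uchar maxval)
{
    int j = 0;
#if CV_SIMD
    // v_uint8 is as wide as the best instruction set enabled at compile time
    // (16 lanes for SSE2/NEON, 32 for AVX2); ::nlanes replaces the hard-coded
    // 16 of the old v_uint8x16 code.
    v_uint8 vthresh = vx_setall_u8(thresh);
    v_uint8 vmaxval = vx_setall_u8(maxval);
    for (; j <= width - v_uint8::nlanes; j += v_uint8::nlanes)
    {
        v_uint8 v0 = vx_load(src + j);
        v0 = (vthresh < v0) & vmaxval;   // per-lane select of maxval or 0
        v_store(dst + j, v0);
    }
#endif
    for (; j < width; j++)               // scalar tail for the remainder
        dst[j] = src[j] > thresh ? maxval : 0;
}
```

With AVX2 enabled, `v_uint8::nlanes` is 32, so one iteration processes twice as many pixels as the old `v_uint8x16` loop without any source change.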
@@ -195,82 +195,78 @@ thresh_8u( const Mat& _src, Mat& _dst, uchar thresh, uchar maxval, int type )
     int j = 0;
     const uchar* src = _src.ptr();
     uchar* dst = _dst.ptr();
-#if CV_SIMD128
-    bool useSIMD = checkHardwareSupport( CV_CPU_SSE2 ) || checkHardwareSupport( CV_CPU_NEON );
-    if( useSIMD )
-    {
-        v_uint8x16 thresh_u = v_setall_u8( thresh );
-        v_uint8x16 maxval16 = v_setall_u8( maxval );
-        switch( type )
-        {
-        case THRESH_BINARY:
-            for( int i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
-            {
-                for( j = 0; j <= roi.width - 16; j += 16 )
-                {
-                    v_uint8x16 v0;
-                    v0 = v_load( src + j );
-                    v0 = thresh_u < v0;
-                    v0 = v0 & maxval16;
-                    v_store( dst + j, v0 );
-                }
-            }
-            break;
-        case THRESH_BINARY_INV:
-            for( int i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
-            {
-                for( j = 0; j <= roi.width - 16; j += 16 )
-                {
-                    v_uint8x16 v0;
-                    v0 = v_load( src + j );
-                    v0 = v0 <= thresh_u;
-                    v0 = v0 & maxval16;
-                    v_store( dst + j, v0 );
-                }
-            }
-            break;
-        case THRESH_TRUNC:
-            for( int i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
-            {
-                for( j = 0; j <= roi.width - 16; j += 16 )
-                {
-                    v_uint8x16 v0;
-                    v0 = v_load( src + j );
-                    v0 = v0 - ( v0 - thresh_u );
-                    v_store( dst + j, v0 );
-                }
-            }
-            break;
-        case THRESH_TOZERO:
-            for( int i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
-            {
-                for( j = 0; j <= roi.width - 16; j += 16 )
-                {
-                    v_uint8x16 v0;
-                    v0 = v_load( src + j );
-                    v0 = ( thresh_u < v0 ) & v0;
-                    v_store( dst + j, v0 );
-                }
-            }
-            break;
-        case THRESH_TOZERO_INV:
-            for( int i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
-            {
-                for( j = 0; j <= roi.width - 16; j += 16 )
-                {
-                    v_uint8x16 v0;
-                    v0 = v_load( src + j );
-                    v0 = ( v0 <= thresh_u ) & v0;
-                    v_store( dst + j, v0 );
-                }
-            }
-            break;
-        }
-    }
+#if CV_SIMD
+    v_uint8 thresh_u = vx_setall_u8( thresh );
+    v_uint8 maxval16 = vx_setall_u8( maxval );
+    switch( type )
+    {
+    case THRESH_BINARY:
+        for( int i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
+        {
+            for( j = 0; j <= roi.width - v_uint8::nlanes; j += v_uint8::nlanes)
+            {
+                v_uint8 v0;
+                v0 = vx_load( src + j );
+                v0 = thresh_u < v0;
+                v0 = v0 & maxval16;
+                v_store( dst + j, v0 );
+            }
+        }
+        break;
+    case THRESH_BINARY_INV:
+        for( int i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
+        {
+            for( j = 0; j <= roi.width - v_uint8::nlanes; j += v_uint8::nlanes)
+            {
+                v_uint8 v0;
+                v0 = vx_load( src + j );
+                v0 = v0 <= thresh_u;
+                v0 = v0 & maxval16;
+                v_store( dst + j, v0 );
+            }
+        }
+        break;
+    case THRESH_TRUNC:
+        for( int i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
+        {
+            for( j = 0; j <= roi.width - v_uint8::nlanes; j += v_uint8::nlanes)
+            {
+                v_uint8 v0;
+                v0 = vx_load( src + j );
+                v0 = v0 - ( v0 - thresh_u );
+                v_store( dst + j, v0 );
+            }
+        }
+        break;
+    case THRESH_TOZERO:
+        for( int i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
+        {
+            for( j = 0; j <= roi.width - v_uint8::nlanes; j += v_uint8::nlanes)
+            {
+                v_uint8 v0;
+                v0 = vx_load( src + j );
+                v0 = ( thresh_u < v0 ) & v0;
+                v_store( dst + j, v0 );
+            }
+        }
+        break;
+    case THRESH_TOZERO_INV:
+        for( int i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
+        {
+            for( j = 0; j <= roi.width - v_uint8::nlanes; j += v_uint8::nlanes)
+            {
+                v_uint8 v0;
+                v0 = vx_load( src + j );
+                v0 = ( v0 <= thresh_u ) & v0;
+                v_store( dst + j, v0 );
+            }
+        }
+        break;
+    }
 #endif
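
The `CV_SIMD` block above compiles only when universal intrinsics are enabled, and the lane counts are fixed at build time. A small diagnostic program, assuming the same OpenCV 4.x headers, shows what the wide types resolve to in a given build:

```cpp
// Print the vector width the wide universal intrinsics were compiled for.
#include <cstdio>
#include <opencv2/core/hal/intrin.hpp>

int main()
{
#if CV_SIMD
    std::printf("vector width  : %d bytes\n", CV_SIMD_WIDTH);
    std::printf("v_uint8 lanes : %d\n", cv::v_uint8::nlanes);
    std::printf("v_uint16 lanes: %d\n", cv::v_uint16::nlanes);
    std::printf("v_float32 lanes: %d\n", cv::v_float32::nlanes);
#else
    std::printf("universal intrinsics disabled in this build\n");
#endif
    return 0;
}
```

On a plain SSE2 or NEON build this reports 16 bytes, matching the old fixed-width code exactly; the gains come from AVX2/AVX-512 builds, where the same kernels widen automatically.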
@@ -362,125 +358,156 @@ thresh_16u(const Mat& _src, Mat& _dst, ushort thresh, ushort maxval, int type)
     const ushort* src = _src.ptr<ushort>();
     ushort* dst = _dst.ptr<ushort>();
-#if CV_SIMD128
-    bool useSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON);
-    if (useSIMD)
-    {
-        int i, j;
-        v_uint16x8 thresh_u = v_setall_u16(thresh);
-        v_uint16x8 maxval16 = v_setall_u16(maxval);
-        switch (type)
-        {
-        case THRESH_BINARY:
-            for (i = 0; i < roi.height; i++, src += src_step, dst += dst_step)
-            {
-                for (j = 0; j <= roi.width - 16; j += 16)
-                {
-                    v_uint16x8 v0, v1;
-                    v0 = v_load(src + j);
-                    v1 = v_load(src + j + 8);
-                    v0 = thresh_u < v0;
-                    v1 = thresh_u < v1;
-                    v0 = v0 & maxval16;
-                    v1 = v1 & maxval16;
-                    v_store(dst + j, v0);
-                    v_store(dst + j + 8, v1);
-                }
-                for (; j < roi.width; j++)
-                    dst[j] = threshBinary<ushort>(src[j], thresh, maxval);
-            }
-            break;
-        case THRESH_BINARY_INV:
-            for (i = 0; i < roi.height; i++, src += src_step, dst += dst_step)
-            {
-                j = 0;
-                for (; j <= roi.width - 16; j += 16)
-                {
-                    v_uint16x8 v0, v1;
-                    v0 = v_load(src + j);
-                    v1 = v_load(src + j + 8);
-                    v0 = v0 <= thresh_u;
-                    v1 = v1 <= thresh_u;
-                    v0 = v0 & maxval16;
-                    v1 = v1 & maxval16;
-                    v_store(dst + j, v0);
-                    v_store(dst + j + 8, v1);
-                }
-                for (; j < roi.width; j++)
-                    dst[j] = threshBinaryInv<ushort>(src[j], thresh, maxval);
-            }
-            break;
-        case THRESH_TRUNC:
-            for (i = 0; i < roi.height; i++, src += src_step, dst += dst_step)
-            {
-                j = 0;
-                for (; j <= roi.width - 16; j += 16)
-                {
-                    v_uint16x8 v0, v1;
-                    v0 = v_load(src + j);
-                    v1 = v_load(src + j + 8);
-                    v0 = v_min(v0, thresh_u);
-                    v1 = v_min(v1, thresh_u);
-                    v_store(dst + j, v0);
-                    v_store(dst + j + 8, v1);
-                }
-                for (; j < roi.width; j++)
-                    dst[j] = threshTrunc<ushort>(src[j], thresh);
-            }
-            break;
-        case THRESH_TOZERO:
-            for (i = 0; i < roi.height; i++, src += src_step, dst += dst_step)
-            {
-                j = 0;
-                for (; j <= roi.width - 16; j += 16)
-                {
-                    v_uint16x8 v0, v1;
-                    v0 = v_load(src + j);
-                    v1 = v_load(src + j + 8);
-                    v0 = (thresh_u < v0) & v0;
-                    v1 = (thresh_u < v1) & v1;
-                    v_store(dst + j, v0);
-                    v_store(dst + j + 8, v1);
-                }
-                for (; j < roi.width; j++)
-                    dst[j] = threshToZero<ushort>(src[j], thresh);
-            }
-            break;
-        case THRESH_TOZERO_INV:
-            for (i = 0; i < roi.height; i++, src += src_step, dst += dst_step)
-            {
-                j = 0;
-                for (; j <= roi.width - 16; j += 16)
-                {
-                    v_uint16x8 v0, v1;
-                    v0 = v_load(src + j);
-                    v1 = v_load(src + j + 8);
-                    v0 = (v0 <= thresh_u) & v0;
-                    v1 = (v1 <= thresh_u) & v1;
-                    v_store(dst + j, v0);
-                    v_store(dst + j + 8, v1);
-                }
-                for (; j < roi.width; j++)
-                    dst[j] = threshToZeroInv<ushort>(src[j], thresh);
-            }
-            break;
-        }
-    }
-    else
-#endif
-    {
-        threshGeneric<ushort>(roi, src, src_step, dst, dst_step, thresh, maxval, type);
-    }
+#if CV_SIMD
+    int i, j;
+    v_uint16 thresh_u = vx_setall_u16(thresh);
+    v_uint16 maxval16 = vx_setall_u16(maxval);
+    switch (type)
+    {
+    case THRESH_BINARY:
+        for (i = 0; i < roi.height; i++, src += src_step, dst += dst_step)
+        {
+            for (j = 0; j <= roi.width - 2*v_uint16::nlanes; j += 2*v_uint16::nlanes)
+            {
+                v_uint16 v0, v1;
+                v0 = vx_load(src + j);
+                v1 = vx_load(src + j + v_uint16::nlanes);
+                v0 = thresh_u < v0;
+                v1 = thresh_u < v1;
+                v0 = v0 & maxval16;
+                v1 = v1 & maxval16;
+                v_store(dst + j, v0);
+                v_store(dst + j + v_uint16::nlanes, v1);
+            }
+            if (j <= roi.width - v_uint16::nlanes)
+            {
+                v_uint16 v0 = vx_load(src + j);
+                v0 = thresh_u < v0;
+                v0 = v0 & maxval16;
+                v_store(dst + j, v0);
+                j += v_uint16::nlanes;
+            }
+            for (; j < roi.width; j++)
+                dst[j] = threshBinary<ushort>(src[j], thresh, maxval);
+        }
+        break;
+    case THRESH_BINARY_INV:
+        for (i = 0; i < roi.height; i++, src += src_step, dst += dst_step)
+        {
+            j = 0;
+            for (; j <= roi.width - 2*v_uint16::nlanes; j += 2*v_uint16::nlanes)
+            {
+                v_uint16 v0, v1;
+                v0 = vx_load(src + j);
+                v1 = vx_load(src + j + v_uint16::nlanes);
+                v0 = v0 <= thresh_u;
+                v1 = v1 <= thresh_u;
+                v0 = v0 & maxval16;
+                v1 = v1 & maxval16;
+                v_store(dst + j, v0);
+                v_store(dst + j + v_uint16::nlanes, v1);
+            }
+            if (j <= roi.width - v_uint16::nlanes)
+            {
+                v_uint16 v0 = vx_load(src + j);
+                v0 = v0 <= thresh_u;
+                v0 = v0 & maxval16;
+                v_store(dst + j, v0);
+                j += v_uint16::nlanes;
+            }
+            for (; j < roi.width; j++)
+                dst[j] = threshBinaryInv<ushort>(src[j], thresh, maxval);
+        }
+        break;
+    case THRESH_TRUNC:
+        for (i = 0; i < roi.height; i++, src += src_step, dst += dst_step)
+        {
+            j = 0;
+            for (; j <= roi.width - 2*v_uint16::nlanes; j += 2*v_uint16::nlanes)
+            {
+                v_uint16 v0, v1;
+                v0 = vx_load(src + j);
+                v1 = vx_load(src + j + v_uint16::nlanes);
+                v0 = v_min(v0, thresh_u);
+                v1 = v_min(v1, thresh_u);
+                v_store(dst + j, v0);
+                v_store(dst + j + v_uint16::nlanes, v1);
+            }
+            if (j <= roi.width - v_uint16::nlanes)
+            {
+                v_uint16 v0 = vx_load(src + j);
+                v0 = v_min(v0, thresh_u);
+                v_store(dst + j, v0);
+                j += v_uint16::nlanes;
+            }
+            for (; j < roi.width; j++)
+                dst[j] = threshTrunc<ushort>(src[j], thresh);
+        }
+        break;
+    case THRESH_TOZERO:
+        for (i = 0; i < roi.height; i++, src += src_step, dst += dst_step)
+        {
+            j = 0;
+            for (; j <= roi.width - 2*v_uint16::nlanes; j += 2*v_uint16::nlanes)
+            {
+                v_uint16 v0, v1;
+                v0 = vx_load(src + j);
+                v1 = vx_load(src + j + v_uint16::nlanes);
+                v0 = (thresh_u < v0) & v0;
+                v1 = (thresh_u < v1) & v1;
+                v_store(dst + j, v0);
+                v_store(dst + j + v_uint16::nlanes, v1);
+            }
+            if (j <= roi.width - v_uint16::nlanes)
+            {
+                v_uint16 v0 = vx_load(src + j);
+                v0 = (thresh_u < v0) & v0;
+                v_store(dst + j, v0);
+                j += v_uint16::nlanes;
+            }
+            for (; j < roi.width; j++)
+                dst[j] = threshToZero<ushort>(src[j], thresh);
+        }
+        break;
+    case THRESH_TOZERO_INV:
+        for (i = 0; i < roi.height; i++, src += src_step, dst += dst_step)
+        {
+            j = 0;
+            for (; j <= roi.width - 2*v_uint16::nlanes; j += 2*v_uint16::nlanes)
+            {
+                v_uint16 v0, v1;
+                v0 = vx_load(src + j);
+                v1 = vx_load(src + j + v_uint16::nlanes);
+                v0 = (v0 <= thresh_u) & v0;
+                v1 = (v1 <= thresh_u) & v1;
+                v_store(dst + j, v0);
+                v_store(dst + j + v_uint16::nlanes, v1);
+            }
+            if (j <= roi.width - v_uint16::nlanes)
+            {
+                v_uint16 v0 = vx_load(src + j);
+                v0 = (v0 <= thresh_u) & v0;
+                v_store(dst + j, v0);
+                j += v_uint16::nlanes;
+            }
+            for (; j < roi.width; j++)
+                dst[j] = threshToZeroInv<ushort>(src[j], thresh);
+        }
+        break;
+    }
+#else
+    threshGeneric<ushort>(roi, src, src_step, dst, dst_step, thresh, maxval, type);
+#endif
 }
 
 static void
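
Unlike the 8-bit kernel, the 16-bit kernels unroll by two vectors per iteration, and the rewrite adds a new single-vector step before the scalar tail, so fewer pixels fall through to scalar code when the row width is not a multiple of `2*nlanes`. A sketch of that three-stage structure for THRESH_TRUNC, assuming OpenCV 4.x wide intrinsics (`truncRow` is a hypothetical helper, not part of the patch):

```cpp
#include <algorithm>
#include <opencv2/core.hpp>
#include <opencv2/core/hal/intrin.hpp>
using namespace cv;

static void truncRow(const ushort* src, ushort* dst, int width, ushort thresh)
{
    int j = 0;
#if CV_SIMD
    v_uint16 vthresh = vx_setall_u16(thresh);
    // Stage 1: two vectors per iteration to expose more parallelism.
    for (; j <= width - 2*v_uint16::nlanes; j += 2*v_uint16::nlanes)
    {
        v_uint16 v0 = vx_load(src + j);
        v_uint16 v1 = vx_load(src + j + v_uint16::nlanes);
        v_store(dst + j, v_min(v0, vthresh));
        v_store(dst + j + v_uint16::nlanes, v_min(v1, vthresh));
    }
    // Stage 2: one more full vector if at least nlanes pixels remain.
    if (j <= width - v_uint16::nlanes)
    {
        v_store(dst + j, v_min(vx_load(src + j), vthresh));
        j += v_uint16::nlanes;
    }
#endif
    // Stage 3: scalar remainder.
    for (; j < width; j++)
        dst[j] = std::min(src[j], thresh);
}
```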
@@ -556,128 +583,159 @@ thresh_16s( const Mat& _src, Mat& _dst, short thresh, short maxval, int type )
     }
 #endif
-#if CV_SIMD128
-    bool useSIMD = checkHardwareSupport( CV_CPU_SSE2 ) || checkHardwareSupport( CV_CPU_NEON );
-    if( useSIMD )
-    {
-        int i, j;
-        v_int16x8 thresh8 = v_setall_s16( thresh );
-        v_int16x8 maxval8 = v_setall_s16( maxval );
-        switch( type )
-        {
-        case THRESH_BINARY:
-            for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
-            {
-                j = 0;
-                for( ; j <= roi.width - 16; j += 16 )
-                {
-                    v_int16x8 v0, v1;
-                    v0 = v_load( src + j );
-                    v1 = v_load( src + j + 8 );
-                    v0 = thresh8 < v0;
-                    v1 = thresh8 < v1;
-                    v0 = v0 & maxval8;
-                    v1 = v1 & maxval8;
-                    v_store( dst + j, v0 );
-                    v_store( dst + j + 8, v1 );
-                }
-                for( ; j < roi.width; j++ )
-                    dst[j] = threshBinary<short>(src[j], thresh, maxval);
-            }
-            break;
-        case THRESH_BINARY_INV:
-            for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
-            {
-                j = 0;
-                for( ; j <= roi.width - 16; j += 16 )
-                {
-                    v_int16x8 v0, v1;
-                    v0 = v_load( src + j );
-                    v1 = v_load( src + j + 8 );
-                    v0 = v0 <= thresh8;
-                    v1 = v1 <= thresh8;
-                    v0 = v0 & maxval8;
-                    v1 = v1 & maxval8;
-                    v_store( dst + j, v0 );
-                    v_store( dst + j + 8, v1 );
-                }
-                for( ; j < roi.width; j++ )
-                    dst[j] = threshBinaryInv<short>(src[j], thresh, maxval);
-            }
-            break;
-        case THRESH_TRUNC:
-            for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
-            {
-                j = 0;
-                for( ; j <= roi.width - 16; j += 16 )
-                {
-                    v_int16x8 v0, v1;
-                    v0 = v_load( src + j );
-                    v1 = v_load( src + j + 8 );
-                    v0 = v_min( v0, thresh8 );
-                    v1 = v_min( v1, thresh8 );
-                    v_store( dst + j, v0 );
-                    v_store( dst + j + 8, v1 );
-                }
-                for( ; j < roi.width; j++ )
-                    dst[j] = threshTrunc<short>( src[j], thresh );
-            }
-            break;
-        case THRESH_TOZERO:
-            for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
-            {
-                j = 0;
-                for( ; j <= roi.width - 16; j += 16 )
-                {
-                    v_int16x8 v0, v1;
-                    v0 = v_load( src + j );
-                    v1 = v_load( src + j + 8 );
-                    v0 = ( thresh8 < v0 ) & v0;
-                    v1 = ( thresh8 < v1 ) & v1;
-                    v_store( dst + j, v0 );
-                    v_store( dst + j + 8, v1 );
-                }
-                for( ; j < roi.width; j++ )
-                    dst[j] = threshToZero<short>(src[j], thresh);
-            }
-            break;
-        case THRESH_TOZERO_INV:
-            for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
-            {
-                j = 0;
-                for( ; j <= roi.width - 16; j += 16 )
-                {
-                    v_int16x8 v0, v1;
-                    v0 = v_load( src + j );
-                    v1 = v_load( src + j + 8 );
-                    v0 = ( v0 <= thresh8 ) & v0;
-                    v1 = ( v1 <= thresh8 ) & v1;
-                    v_store( dst + j, v0 );
-                    v_store( dst + j + 8, v1 );
-                }
-                for( ; j < roi.width; j++ )
-                    dst[j] = threshToZeroInv<short>(src[j], thresh);
-            }
-            break;
-        default:
-            CV_Error( CV_StsBadArg, "" ); return;
-        }
-    }
-    else
-#endif
-    {
-        threshGeneric<short>(roi, src, src_step, dst, dst_step, thresh, maxval, type);
-    }
+#if CV_SIMD
+    int i, j;
+    v_int16 thresh8 = vx_setall_s16( thresh );
+    v_int16 maxval8 = vx_setall_s16( maxval );
+    switch( type )
+    {
+    case THRESH_BINARY:
+        for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
+        {
+            j = 0;
+            for( ; j <= roi.width - 2*v_int16::nlanes; j += 2*v_int16::nlanes )
+            {
+                v_int16 v0, v1;
+                v0 = vx_load( src + j );
+                v1 = vx_load( src + j + v_int16::nlanes );
+                v0 = thresh8 < v0;
+                v1 = thresh8 < v1;
+                v0 = v0 & maxval8;
+                v1 = v1 & maxval8;
+                v_store( dst + j, v0 );
+                v_store( dst + j + v_int16::nlanes, v1 );
+            }
+            if( j <= roi.width - v_int16::nlanes )
+            {
+                v_int16 v0 = vx_load( src + j );
+                v0 = thresh8 < v0;
+                v0 = v0 & maxval8;
+                v_store( dst + j, v0 );
+                j += v_int16::nlanes;
+            }
+            for( ; j < roi.width; j++ )
+                dst[j] = threshBinary<short>(src[j], thresh, maxval);
+        }
+        break;
+    case THRESH_BINARY_INV:
+        for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
+        {
+            j = 0;
+            for( ; j <= roi.width - 2*v_int16::nlanes; j += 2*v_int16::nlanes )
+            {
+                v_int16 v0, v1;
+                v0 = vx_load( src + j );
+                v1 = vx_load( src + j + v_int16::nlanes );
+                v0 = v0 <= thresh8;
+                v1 = v1 <= thresh8;
+                v0 = v0 & maxval8;
+                v1 = v1 & maxval8;
+                v_store( dst + j, v0 );
+                v_store( dst + j + v_int16::nlanes, v1 );
+            }
+            if( j <= roi.width - v_int16::nlanes )
+            {
+                v_int16 v0 = vx_load( src + j );
+                v0 = v0 <= thresh8;
+                v0 = v0 & maxval8;
+                v_store( dst + j, v0 );
+                j += v_int16::nlanes;
+            }
+            for( ; j < roi.width; j++ )
+                dst[j] = threshBinaryInv<short>(src[j], thresh, maxval);
+        }
+        break;
+    case THRESH_TRUNC:
+        for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
+        {
+            j = 0;
+            for( ; j <= roi.width - 2*v_int16::nlanes; j += 2*v_int16::nlanes )
+            {
+                v_int16 v0, v1;
+                v0 = vx_load( src + j );
+                v1 = vx_load( src + j + v_int16::nlanes );
+                v0 = v_min( v0, thresh8 );
+                v1 = v_min( v1, thresh8 );
+                v_store( dst + j, v0 );
+                v_store( dst + j + v_int16::nlanes, v1 );
+            }
+            if( j <= roi.width - v_int16::nlanes )
+            {
+                v_int16 v0 = vx_load( src + j );
+                v0 = v_min( v0, thresh8 );
+                v_store( dst + j, v0 );
+                j += v_int16::nlanes;
+            }
+            for( ; j < roi.width; j++ )
+                dst[j] = threshTrunc<short>( src[j], thresh );
+        }
+        break;
+    case THRESH_TOZERO:
+        for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
+        {
+            j = 0;
+            for( ; j <= roi.width - 2*v_int16::nlanes; j += 2*v_int16::nlanes )
+            {
+                v_int16 v0, v1;
+                v0 = vx_load( src + j );
+                v1 = vx_load( src + j + v_int16::nlanes );
+                v0 = ( thresh8 < v0 ) & v0;
+                v1 = ( thresh8 < v1 ) & v1;
+                v_store( dst + j, v0 );
+                v_store( dst + j + v_int16::nlanes, v1 );
+            }
+            if( j <= roi.width - v_int16::nlanes )
+            {
+                v_int16 v0 = vx_load( src + j );
+                v0 = ( thresh8 < v0 ) & v0;
+                v_store( dst + j, v0 );
+                j += v_int16::nlanes;
+            }
+            for( ; j < roi.width; j++ )
+                dst[j] = threshToZero<short>(src[j], thresh);
+        }
+        break;
+    case THRESH_TOZERO_INV:
+        for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
+        {
+            j = 0;
+            for( ; j <= roi.width - 2*v_int16::nlanes; j += 2*v_int16::nlanes )
+            {
+                v_int16 v0, v1;
+                v0 = vx_load( src + j );
+                v1 = vx_load( src + j + v_int16::nlanes );
+                v0 = ( v0 <= thresh8 ) & v0;
+                v1 = ( v1 <= thresh8 ) & v1;
+                v_store( dst + j, v0 );
+                v_store( dst + j + v_int16::nlanes, v1 );
+            }
+            if( j <= roi.width - v_int16::nlanes )
+            {
+                v_int16 v0 = vx_load( src + j );
+                v0 = ( v0 <= thresh8 ) & v0;
+                v_store( dst + j, v0 );
+                j += v_int16::nlanes;
+            }
+            for( ; j < roi.width; j++ )
+                dst[j] = threshToZeroInv<short>(src[j], thresh);
+        }
+        break;
+    default:
+        CV_Error( CV_StsBadArg, "" ); return;
+    }
+#else
+    threshGeneric<short>(roi, src, src_step, dst, dst_step, thresh, maxval, type);
+#endif
 }
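
All the TOZERO variants rely on the comparison operators returning per-lane bit masks (all ones where the predicate holds), so `( thresh8 < v0 ) & v0` keeps a pixel or zeroes it without a branch. A scalar model of one lane, just to make the idiom explicit (illustrative, not OpenCV code):

```cpp
#include <cstdint>

// One lane of THRESH_TOZERO in scalar form: build the comparison mask, then
// AND it with the pixel. (int16_t)-1 is the all-ones mask a SIMD compare
// produces when thresh < v; 0 clears the pixel otherwise.
static inline int16_t tozeroLane(int16_t v, int16_t thresh)
{
    int16_t mask = (thresh < v) ? (int16_t)-1 : (int16_t)0;
    return (int16_t)(v & mask);
}
```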
@@ -736,175 +794,40 @@ thresh_32f( const Mat& _src, Mat& _dst, float thresh, float maxval, int type )
     }
 #endif
-#if CV_SIMD128
-    bool useSIMD = checkHardwareSupport( CV_CPU_SSE2 ) || checkHardwareSupport( CV_CPU_NEON );
-    if( useSIMD )
-    {
-        int i, j;
-        v_float32x4 thresh4 = v_setall_f32( thresh );
-        v_float32x4 maxval4 = v_setall_f32( maxval );
-        switch( type )
-        {
-        case THRESH_BINARY:
-            for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
-            {
-                j = 0;
-                for( ; j <= roi.width - 8; j += 8 )
-                {
-                    v_float32x4 v0, v1;
-                    v0 = v_load( src + j );
-                    v1 = v_load( src + j + 4 );
-                    v0 = thresh4 < v0;
-                    v1 = thresh4 < v1;
-                    v0 = v0 & maxval4;
-                    v1 = v1 & maxval4;
-                    v_store( dst + j, v0 );
-                    v_store( dst + j + 4, v1 );
-                }
-                for( ; j < roi.width; j++ )
-                    dst[j] = threshBinary<float>(src[j], thresh, maxval);
-            }
-            break;
-        case THRESH_BINARY_INV:
-            for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
-            {
-                j = 0;
-                for( ; j <= roi.width - 8; j += 8 )
-                {
-                    v_float32x4 v0, v1;
-                    v0 = v_load( src + j );
-                    v1 = v_load( src + j + 4 );
-                    v0 = v0 <= thresh4;
-                    v1 = v1 <= thresh4;
-                    v0 = v0 & maxval4;
-                    v1 = v1 & maxval4;
-                    v_store( dst + j, v0 );
-                    v_store( dst + j + 4, v1 );
-                }
-                for( ; j < roi.width; j++ )
-                    dst[j] = threshBinaryInv<float>(src[j], thresh, maxval);
-            }
-            break;
-        case THRESH_TRUNC:
-            for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
-            {
-                j = 0;
-                for( ; j <= roi.width - 8; j += 8 )
-                {
-                    v_float32x4 v0, v1;
-                    v0 = v_load( src + j );
-                    v1 = v_load( src + j + 4 );
-                    v0 = v_min( v0, thresh4 );
-                    v1 = v_min( v1, thresh4 );
-                    v_store( dst + j, v0 );
-                    v_store( dst + j + 4, v1 );
-                }
-                for( ; j < roi.width; j++ )
-                    dst[j] = threshTrunc<float>(src[j], thresh);
-            }
-            break;
-        case THRESH_TOZERO:
-            for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
-            {
-                j = 0;
-                for( ; j <= roi.width - 8; j += 8 )
-                {
-                    v_float32x4 v0, v1;
-                    v0 = v_load( src + j );
-                    v1 = v_load( src + j + 4 );
-                    v0 = ( thresh4 < v0 ) & v0;
-                    v1 = ( thresh4 < v1 ) & v1;
-                    v_store( dst + j, v0 );
-                    v_store( dst + j + 4, v1 );
-                }
-                for( ; j < roi.width; j++ )
-                    dst[j] = threshToZero<float>(src[j], thresh);
-            }
-            break;
-        case THRESH_TOZERO_INV:
-            for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
-            {
-                j = 0;
-                for( ; j <= roi.width - 8; j += 8 )
-                {
-                    v_float32x4 v0, v1;
-                    v0 = v_load( src + j );
-                    v1 = v_load( src + j + 4 );
-                    v0 = ( v0 <= thresh4 ) & v0;
-                    v1 = ( v1 <= thresh4 ) & v1;
-                    v_store( dst + j, v0 );
-                    v_store( dst + j + 4, v1 );
-                }
-                for( ; j < roi.width; j++ )
-                    dst[j] = threshToZeroInv<float>(src[j], thresh);
-            }
-            break;
-        default:
-            CV_Error( CV_StsBadArg, "" ); return;
-        }
-    }
-    else
-#endif
-    {
-        threshGeneric<float>(roi, src, src_step, dst, dst_step, thresh, maxval, type);
-    }
-}
-
-static void
-thresh_64f(const Mat& _src, Mat& _dst, double thresh, double maxval, int type)
-{
-    Size roi = _src.size();
-    roi.width *= _src.channels();
-    const double* src = _src.ptr<double>();
-    double* dst = _dst.ptr<double>();
-    size_t src_step = _src.step / sizeof(src[0]);
-    size_t dst_step = _dst.step / sizeof(dst[0]);
-    if (_src.isContinuous() && _dst.isContinuous())
-    {
-        roi.width *= roi.height;
-        roi.height = 1;
-    }
-#if CV_SIMD128_64F
-    bool useSIMD = checkHardwareSupport( CV_CPU_SSE2 ) || checkHardwareSupport( CV_CPU_NEON );
-    if( useSIMD )
-    {
-        int i, j;
-        v_float64x2 thresh2 = v_setall_f64( thresh );
-        v_float64x2 maxval2 = v_setall_f64( maxval );
-        switch( type )
-        {
-        case THRESH_BINARY:
-            for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
-            {
-                j = 0;
-                for( ; j <= roi.width - 4; j += 4 )
-                {
-                    v_float64x2 v0, v1;
-                    v0 = v_load( src + j );
-                    v1 = v_load( src + j + 2 );
-                    v0 = thresh2 < v0;
-                    v1 = thresh2 < v1;
-                    v0 = v0 & maxval2;
-                    v1 = v1 & maxval2;
-                    v_store( dst + j, v0 );
-                    v_store( dst + j + 2, v1 );
-                }
-                for( ; j < roi.width; j++ )
-                    dst[j] = threshBinary<double>(src[j], thresh, maxval);
-            }
-            break;
+#if CV_SIMD
+    int i, j;
+    v_float32 thresh4 = vx_setall_f32( thresh );
+    v_float32 maxval4 = vx_setall_f32( maxval );
+    switch( type )
+    {
+    case THRESH_BINARY:
+        for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
+        {
+            j = 0;
+            for( ; j <= roi.width - 2*v_float32::nlanes; j += 2*v_float32::nlanes )
+            {
+                v_float32 v0, v1;
+                v0 = vx_load( src + j );
+                v1 = vx_load( src + j + v_float32::nlanes );
+                v0 = thresh4 < v0;
+                v1 = thresh4 < v1;
+                v0 = v0 & maxval4;
+                v1 = v1 & maxval4;
+                v_store( dst + j, v0 );
+                v_store( dst + j + v_float32::nlanes, v1 );
+            }
+            if( j <= roi.width - v_float32::nlanes )
+            {
+                v_float32 v0 = vx_load( src + j );
+                v0 = thresh4 < v0;
+                v0 = v0 & maxval4;
+                v_store( dst + j, v0 );
+                j += v_float32::nlanes;
+            }
+            for( ; j < roi.width; j++ )
+                dst[j] = threshBinary<float>(src[j], thresh, maxval);
+        }
+        break;
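
The same masking idiom works for floating point because the comparisons produce all-ones or all-zero lane masks over the raw bits; ANDing with `maxval4` reproduces either the full bit pattern of `maxval` or `0.0f`. A scalar bit-level model of one THRESH_BINARY lane (illustrative only):

```cpp
#include <cstdint>
#include <cstring>

// One lane of float THRESH_BINARY, modelled at the bit level: the lane mask
// selects either every bit of maxval or none, giving maxval or 0.0f.
static inline float binaryLaneF32(float v, float thresh, float maxval)
{
    uint32_t mask = (thresh < v) ? 0xFFFFFFFFu : 0u;
    uint32_t bits;
    std::memcpy(&bits, &maxval, sizeof(bits)); // reinterpret maxval's bits
    bits &= mask;                              // keep them, or clear to 0.0f
    float out;
    std::memcpy(&out, &bits, sizeof(out));
    return out;
}
```

Note that THRESH_TRUNC cannot use the mask trick and keeps `v_min` instead, which is also the branch-free formulation in scalar code.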
@@ -912,21 +835,29 @@ thresh_64f(const Mat& _src, Mat& _dst, double thresh, double maxval, int type)
         for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
         {
             j = 0;
-            for( ; j <= roi.width - 4; j += 4 )
+            for( ; j <= roi.width - 2*v_float32::nlanes; j += 2*v_float32::nlanes )
             {
-                v_float64x2 v0, v1;
-                v0 = v_load( src + j );
-                v1 = v_load( src + j + 2 );
-                v0 = v0 <= thresh2;
-                v1 = v1 <= thresh2;
-                v0 = v0 & maxval2;
-                v1 = v1 & maxval2;
+                v_float32 v0, v1;
+                v0 = vx_load( src + j );
+                v1 = vx_load( src + j + v_float32::nlanes );
+                v0 = v0 <= thresh4;
+                v1 = v1 <= thresh4;
+                v0 = v0 & maxval4;
+                v1 = v1 & maxval4;
                 v_store( dst + j, v0 );
-                v_store( dst + j + 2, v1 );
+                v_store( dst + j + v_float32::nlanes, v1 );
             }
+            if( j <= roi.width - v_float32::nlanes )
+            {
+                v_float32 v0 = vx_load( src + j );
+                v0 = v0 <= thresh4;
+                v0 = v0 & maxval4;
+                v_store( dst + j, v0 );
+                j += v_float32::nlanes;
+            }
             for( ; j < roi.width; j++ )
-                dst[j] = threshBinaryInv<double>(src[j], thresh, maxval);
+                dst[j] = threshBinaryInv<float>(src[j], thresh, maxval);
         }
         break;
@@ -934,19 +865,26 @@ thresh_64f(const Mat& _src, Mat& _dst, double thresh, double maxval, int type)
         for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
         {
             j = 0;
-            for( ; j <= roi.width - 4; j += 4 )
+            for( ; j <= roi.width - 2*v_float32::nlanes; j += 2*v_float32::nlanes )
             {
-                v_float64x2 v0, v1;
-                v0 = v_load( src + j );
-                v1 = v_load( src + j + 2 );
-                v0 = v_min( v0, thresh2 );
-                v1 = v_min( v1, thresh2 );
+                v_float32 v0, v1;
+                v0 = vx_load( src + j );
+                v1 = vx_load( src + j + v_float32::nlanes );
+                v0 = v_min( v0, thresh4 );
+                v1 = v_min( v1, thresh4 );
                 v_store( dst + j, v0 );
-                v_store( dst + j + 2, v1 );
+                v_store( dst + j + v_float32::nlanes, v1 );
             }
+            if( j <= roi.width - v_float32::nlanes )
+            {
+                v_float32 v0 = vx_load( src + j );
+                v0 = v_min( v0, thresh4 );
+                v_store( dst + j, v0 );
+                j += v_float32::nlanes;
+            }
             for( ; j < roi.width; j++ )
-                dst[j] = threshTrunc<double>(src[j], thresh);
+                dst[j] = threshTrunc<float>(src[j], thresh);
         }
         break;
@@ -954,19 +892,26 @@ thresh_64f(const Mat& _src, Mat& _dst, double thresh, double maxval, int type)
         for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
        {
             j = 0;
-            for( ; j <= roi.width - 4; j += 4 )
+            for( ; j <= roi.width - 2*v_float32::nlanes; j += 2*v_float32::nlanes )
             {
-                v_float64x2 v0, v1;
-                v0 = v_load( src + j );
-                v1 = v_load( src + j + 2 );
-                v0 = ( thresh2 < v0 ) & v0;
-                v1 = ( thresh2 < v1 ) & v1;
+                v_float32 v0, v1;
+                v0 = vx_load( src + j );
+                v1 = vx_load( src + j + v_float32::nlanes );
+                v0 = ( thresh4 < v0 ) & v0;
+                v1 = ( thresh4 < v1 ) & v1;
                 v_store( dst + j, v0 );
-                v_store( dst + j + 2, v1 );
+                v_store( dst + j + v_float32::nlanes, v1 );
             }
+            if( j <= roi.width - v_float32::nlanes )
+            {
+                v_float32 v0 = vx_load( src + j );
+                v0 = ( thresh4 < v0 ) & v0;
+                v_store( dst + j, v0 );
+                j += v_float32::nlanes;
+            }
             for( ; j < roi.width; j++ )
-                dst[j] = threshToZero<double>(src[j], thresh);
+                dst[j] = threshToZero<float>(src[j], thresh);
         }
         break;
@@ -974,30 +919,205 @@ thresh_64f(const Mat& _src, Mat& _dst, double thresh, double maxval, int type)
         for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
         {
             j = 0;
-            for( ; j <= roi.width - 4; j += 4 )
+            for( ; j <= roi.width - 2*v_float32::nlanes; j += 2*v_float32::nlanes )
             {
-                v_float64x2 v0, v1;
-                v0 = v_load( src + j );
-                v1 = v_load( src + j + 2 );
-                v0 = ( v0 <= thresh2 ) & v0;
-                v1 = ( v1 <= thresh2 ) & v1;
+                v_float32 v0, v1;
+                v0 = vx_load( src + j );
+                v1 = vx_load( src + j + v_float32::nlanes );
+                v0 = ( v0 <= thresh4 ) & v0;
+                v1 = ( v1 <= thresh4 ) & v1;
                 v_store( dst + j, v0 );
-                v_store( dst + j + 2, v1 );
+                v_store( dst + j + v_float32::nlanes, v1 );
             }
+            if( j <= roi.width - v_float32::nlanes )
+            {
+                v_float32 v0 = vx_load( src + j );
+                v0 = ( v0 <= thresh4 ) & v0;
+                v_store( dst + j, v0 );
+                j += v_float32::nlanes;
+            }
             for( ; j < roi.width; j++ )
-                dst[j] = threshToZeroInv<double>(src[j], thresh);
+                dst[j] = threshToZeroInv<float>(src[j], thresh);
         }
         break;
     default:
-        CV_Error(CV_StsBadArg, ""); return;
+        CV_Error( CV_StsBadArg, "" ); return;
     }
-    }
-    else
-#endif
-    {
-        threshGeneric<double>(roi, src, src_step, dst, dst_step, thresh, maxval, type);
-    }
+#else
+    threshGeneric<float>(roi, src, src_step, dst, dst_step, thresh, maxval, type);
+#endif
 }
 
-static void
-thresh_64f(const Mat& _src, Mat& _dst, double thresh, double maxval, int type)
-{
-    Size roi = _src.size();
-    roi.width *= _src.channels();
-    const double* src = _src.ptr<double>();
-    double* dst = _dst.ptr<double>();
-    size_t src_step = _src.step / sizeof(src[0]);
-    size_t dst_step = _dst.step / sizeof(dst[0]);
-    if (_src.isContinuous() && _dst.isContinuous())
-    {
-        roi.width *= roi.height;
-        roi.height = 1;
-    }
+static void
+thresh_64f(const Mat& _src, Mat& _dst, double thresh, double maxval, int type)
+{
+    Size roi = _src.size();
+    roi.width *= _src.channels();
+    const double* src = _src.ptr<double>();
+    double* dst = _dst.ptr<double>();
+    size_t src_step = _src.step / sizeof(src[0]);
+    size_t dst_step = _dst.step / sizeof(dst[0]);
+    if (_src.isContinuous() && _dst.isContinuous())
+    {
+        roi.width *= roi.height;
+        roi.height = 1;
+    }
+#if CV_SIMD_64F
+    int i, j;
+    v_float64 thresh2 = vx_setall_f64( thresh );
+    v_float64 maxval2 = vx_setall_f64( maxval );
+    switch( type )
+    {
+    case THRESH_BINARY:
+        for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
+        {
+            j = 0;
+            for( ; j <= roi.width - 2*v_float64::nlanes; j += 2*v_float64::nlanes )
+            {
+                v_float64 v0, v1;
+                v0 = vx_load( src + j );
+                v1 = vx_load( src + j + v_float64::nlanes );
+                v0 = thresh2 < v0;
+                v1 = thresh2 < v1;
+                v0 = v0 & maxval2;
+                v1 = v1 & maxval2;
+                v_store( dst + j, v0 );
+                v_store( dst + j + v_float64::nlanes, v1 );
+            }
+            if( j <= roi.width - v_float64::nlanes )
+            {
+                v_float64 v0 = vx_load( src + j );
+                v0 = thresh2 < v0;
+                v0 = v0 & maxval2;
+                v_store( dst + j, v0 );
+                j += v_float64::nlanes;
+            }
+            for( ; j < roi.width; j++ )
+                dst[j] = threshBinary<double>(src[j], thresh, maxval);
+        }
+        break;
+    case THRESH_BINARY_INV:
+        for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
+        {
+            j = 0;
+            for( ; j <= roi.width - 2*v_float64::nlanes; j += 2*v_float64::nlanes )
+            {
+                v_float64 v0, v1;
+                v0 = vx_load( src + j );
+                v1 = vx_load( src + j + v_float64::nlanes );
+                v0 = v0 <= thresh2;
+                v1 = v1 <= thresh2;
+                v0 = v0 & maxval2;
+                v1 = v1 & maxval2;
+                v_store( dst + j, v0 );
+                v_store( dst + j + v_float64::nlanes, v1 );
+            }
+            if( j <= roi.width - v_float64::nlanes )
+            {
+                v_float64 v0 = vx_load( src + j );
+                v0 = v0 <= thresh2;
+                v0 = v0 & maxval2;
+                v_store( dst + j, v0 );
+                j += v_float64::nlanes;
+            }
+            for( ; j < roi.width; j++ )
+                dst[j] = threshBinaryInv<double>(src[j], thresh, maxval);
+        }
+        break;
+    case THRESH_TRUNC:
+        for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
+        {
+            j = 0;
+            for( ; j <= roi.width - 2*v_float64::nlanes; j += 2*v_float64::nlanes )
+            {
+                v_float64 v0, v1;
+                v0 = vx_load( src + j );
+                v1 = vx_load( src + j + v_float64::nlanes );
+                v0 = v_min( v0, thresh2 );
+                v1 = v_min( v1, thresh2 );
+                v_store( dst + j, v0 );
+                v_store( dst + j + v_float64::nlanes, v1 );
+            }
+            if( j <= roi.width - v_float64::nlanes )
+            {
+                v_float64 v0 = vx_load( src + j );
+                v0 = v_min( v0, thresh2 );
+                v_store( dst + j, v0 );
+                j += v_float64::nlanes;
+            }
+            for( ; j < roi.width; j++ )
+                dst[j] = threshTrunc<double>(src[j], thresh);
+        }
+        break;
+    case THRESH_TOZERO:
+        for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
+        {
+            j = 0;
+            for( ; j <= roi.width - 2*v_float64::nlanes; j += 2*v_float64::nlanes )
+            {
+                v_float64 v0, v1;
+                v0 = vx_load( src + j );
+                v1 = vx_load( src + j + v_float64::nlanes );
+                v0 = ( thresh2 < v0 ) & v0;
+                v1 = ( thresh2 < v1 ) & v1;
+                v_store( dst + j, v0 );
+                v_store( dst + j + v_float64::nlanes, v1 );
+            }
+            if( j <= roi.width - v_float64::nlanes )
+            {
+                v_float64 v0 = vx_load( src + j );
+                v0 = ( thresh2 < v0 ) & v0;
+                v_store( dst + j, v0 );
+                j += v_float64::nlanes;
+            }
+            for( ; j < roi.width; j++ )
+                dst[j] = threshToZero<double>(src[j], thresh);
+        }
+        break;
+    case THRESH_TOZERO_INV:
+        for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
+        {
+            j = 0;
+            for( ; j <= roi.width - 2*v_float64::nlanes; j += 2*v_float64::nlanes )
+            {
+                v_float64 v0, v1;
+                v0 = vx_load( src + j );
+                v1 = vx_load( src + j + v_float64::nlanes );
+                v0 = ( v0 <= thresh2 ) & v0;
+                v1 = ( v1 <= thresh2 ) & v1;
+                v_store( dst + j, v0 );
+                v_store( dst + j + v_float64::nlanes, v1 );
+            }
+            if( j <= roi.width - v_float64::nlanes )
+            {
+                v_float64 v0 = vx_load( src + j );
+                v0 = ( v0 <= thresh2 ) & v0;
+                v_store( dst + j, v0 );
+                j += v_float64::nlanes;
+            }
+            for( ; j < roi.width; j++ )
+                dst[j] = threshToZeroInv<double>(src[j], thresh);
+        }
+        break;
+    default:
+        CV_Error(CV_StsBadArg, ""); return;
+    }
+#else
+    threshGeneric<double>(roi, src, src_step, dst, dst_step, thresh, maxval, type);
+#endif
 }
 
 #ifdef HAVE_IPP
......
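
The double-precision path now depends on `CV_SIMD_64F` and falls back to `threshGeneric` otherwise. A quick way to exercise the rewritten kernels end to end through the public API, assuming a build of this branch (the image size and values are arbitrary test data):

```cpp
#include <opencv2/core.hpp>
#include <opencv2/imgproc.hpp>

int main()
{
    cv::Mat src(480, 640, CV_64FC1), dst;
    cv::randu(src, 0.0, 255.0);
    // Dispatches to thresh_64f above; takes the CV_SIMD_64F path when the
    // build supports double-precision SIMD, threshGeneric otherwise.
    cv::threshold(src, dst, 127.0, 255.0, cv::THRESH_BINARY);
    return 0;
}
```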