Commit ca00c1dc authored by Alexander Alekhin

Merge pull request #13631 from terfendail:thresh_wintr

parents 133eb8d1 a202dc9a
@@ -195,22 +195,19 @@ thresh_8u( const Mat& _src, Mat& _dst, uchar thresh, uchar maxval, int type )
     int j = 0;
     const uchar* src = _src.ptr();
     uchar* dst = _dst.ptr();
-#if CV_SIMD128
-    bool useSIMD = checkHardwareSupport( CV_CPU_SSE2 ) || checkHardwareSupport( CV_CPU_NEON );
-    if( useSIMD )
-    {
-        v_uint8x16 thresh_u = v_setall_u8( thresh );
-        v_uint8x16 maxval16 = v_setall_u8( maxval );
+#if CV_SIMD
+    v_uint8 thresh_u = vx_setall_u8( thresh );
+    v_uint8 maxval16 = vx_setall_u8( maxval );
     switch( type )
     {
     case THRESH_BINARY:
         for( int i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
         {
-            for( j = 0; j <= roi.width - 16; j += 16 )
+            for( j = 0; j <= roi.width - v_uint8::nlanes; j += v_uint8::nlanes )
             {
-                v_uint8x16 v0;
-                v0 = v_load( src + j );
+                v_uint8 v0;
+                v0 = vx_load( src + j );
                 v0 = thresh_u < v0;
                 v0 = v0 & maxval16;
                 v_store( dst + j, v0 );
@@ -221,10 +218,10 @@ thresh_8u( const Mat& _src, Mat& _dst, uchar thresh, uchar maxval, int type )
     case THRESH_BINARY_INV:
         for( int i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
         {
-            for( j = 0; j <= roi.width - 16; j += 16 )
+            for( j = 0; j <= roi.width - v_uint8::nlanes; j += v_uint8::nlanes )
             {
-                v_uint8x16 v0;
-                v0 = v_load( src + j );
+                v_uint8 v0;
+                v0 = vx_load( src + j );
                 v0 = v0 <= thresh_u;
                 v0 = v0 & maxval16;
                 v_store( dst + j, v0 );
@@ -235,10 +232,10 @@ thresh_8u( const Mat& _src, Mat& _dst, uchar thresh, uchar maxval, int type )
     case THRESH_TRUNC:
         for( int i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
        {
-            for( j = 0; j <= roi.width - 16; j += 16 )
+            for( j = 0; j <= roi.width - v_uint8::nlanes; j += v_uint8::nlanes )
             {
-                v_uint8x16 v0;
-                v0 = v_load( src + j );
+                v_uint8 v0;
+                v0 = vx_load( src + j );
                 v0 = v0 - ( v0 - thresh_u );
                 v_store( dst + j, v0 );
             }
@@ -248,10 +245,10 @@ thresh_8u( const Mat& _src, Mat& _dst, uchar thresh, uchar maxval, int type )
     case THRESH_TOZERO:
         for( int i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
         {
-            for( j = 0; j <= roi.width - 16; j += 16 )
+            for( j = 0; j <= roi.width - v_uint8::nlanes; j += v_uint8::nlanes )
             {
-                v_uint8x16 v0;
-                v0 = v_load( src + j );
+                v_uint8 v0;
+                v0 = vx_load( src + j );
                 v0 = ( thresh_u < v0 ) & v0;
                 v_store( dst + j, v0 );
             }
@@ -261,17 +258,16 @@ thresh_8u( const Mat& _src, Mat& _dst, uchar thresh, uchar maxval, int type )
     case THRESH_TOZERO_INV:
         for( int i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
         {
-            for( j = 0; j <= roi.width - 16; j += 16 )
+            for( j = 0; j <= roi.width - v_uint8::nlanes; j += v_uint8::nlanes )
             {
-                v_uint8x16 v0;
-                v0 = v_load( src + j );
+                v_uint8 v0;
+                v0 = vx_load( src + j );
                 v0 = ( v0 <= thresh_u ) & v0;
                 v_store( dst + j, v0 );
             }
         }
         break;
     }
-    }
 #endif
     int j_scalar = j;
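Note: the point of this hunk is that the kernel no longer hard-codes 128-bit vectors (`v_uint8x16`, 16 lanes) or a runtime `checkHardwareSupport` test; `v_uint8`, `vx_load` and `v_uint8::nlanes` resolve at compile time to whatever width the build targets. A minimal stand-alone sketch of the same THRESH_BINARY pattern (hypothetical helper name, assuming an OpenCV build with CV_SIMD and the operator-based intrinsics API used in this file):

```cpp
#include <opencv2/core.hpp>
#include <opencv2/core/hal/intrin.hpp>

using namespace cv;

// Hypothetical stand-alone version of the u8 THRESH_BINARY row loop.
static void binary_thresh_row_u8(const uchar* src, uchar* dst, int width,
                                 uchar thresh, uchar maxval)
{
    int j = 0;
#if CV_SIMD
    v_uint8 vthresh = vx_setall_u8(thresh);  // threshold broadcast to all lanes
    v_uint8 vmax = vx_setall_u8(maxval);
    // v_uint8::nlanes is 16/32/64 for 128/256/512-bit targets, so the same
    // source builds for SSE2, AVX2 or AVX-512 without edits.
    for (; j <= width - v_uint8::nlanes; j += v_uint8::nlanes)
    {
        v_uint8 v0 = vx_load(src + j);
        v0 = (vthresh < v0) & vmax;  // compare yields an all-ones lane mask
        v_store(dst + j, v0);
    }
#endif
    for (; j < width; j++)  // scalar tail
        dst[j] = (uchar)(src[j] > thresh ? maxval : 0);
}
```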
@@ -362,30 +358,35 @@ thresh_16u(const Mat& _src, Mat& _dst, ushort thresh, ushort maxval, int type)
     const ushort* src = _src.ptr<ushort>();
     ushort* dst = _dst.ptr<ushort>();
-#if CV_SIMD128
-    bool useSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON);
-    if (useSIMD)
-    {
+#if CV_SIMD
     int i, j;
-    v_uint16x8 thresh_u = v_setall_u16(thresh);
-    v_uint16x8 maxval16 = v_setall_u16(maxval);
+    v_uint16 thresh_u = vx_setall_u16(thresh);
+    v_uint16 maxval16 = vx_setall_u16(maxval);
     switch (type)
     {
     case THRESH_BINARY:
         for (i = 0; i < roi.height; i++, src += src_step, dst += dst_step)
         {
-            for (j = 0; j <= roi.width - 16; j += 16)
+            for (j = 0; j <= roi.width - 2*v_uint16::nlanes; j += 2*v_uint16::nlanes)
             {
-                v_uint16x8 v0, v1;
-                v0 = v_load(src + j);
-                v1 = v_load(src + j + 8);
+                v_uint16 v0, v1;
+                v0 = vx_load(src + j);
+                v1 = vx_load(src + j + v_uint16::nlanes);
                 v0 = thresh_u < v0;
                 v1 = thresh_u < v1;
                 v0 = v0 & maxval16;
                 v1 = v1 & maxval16;
                 v_store(dst + j, v0);
-                v_store(dst + j + 8, v1);
+                v_store(dst + j + v_uint16::nlanes, v1);
+            }
+            if (j <= roi.width - v_uint16::nlanes)
+            {
+                v_uint16 v0 = vx_load(src + j);
+                v0 = thresh_u < v0;
+                v0 = v0 & maxval16;
+                v_store(dst + j, v0);
+                j += v_uint16::nlanes;
             }
             for (; j < roi.width; j++)
@@ -397,17 +398,25 @@ thresh_16u(const Mat& _src, Mat& _dst, ushort thresh, ushort maxval, int type)
         for (i = 0; i < roi.height; i++, src += src_step, dst += dst_step)
         {
             j = 0;
-            for (; j <= roi.width - 16; j += 16)
+            for (; j <= roi.width - 2*v_uint16::nlanes; j += 2*v_uint16::nlanes)
             {
-                v_uint16x8 v0, v1;
-                v0 = v_load(src + j);
-                v1 = v_load(src + j + 8);
+                v_uint16 v0, v1;
+                v0 = vx_load(src + j);
+                v1 = vx_load(src + j + v_uint16::nlanes);
                 v0 = v0 <= thresh_u;
                 v1 = v1 <= thresh_u;
                 v0 = v0 & maxval16;
                 v1 = v1 & maxval16;
                 v_store(dst + j, v0);
-                v_store(dst + j + 8, v1);
+                v_store(dst + j + v_uint16::nlanes, v1);
+            }
+            if (j <= roi.width - v_uint16::nlanes)
+            {
+                v_uint16 v0 = vx_load(src + j);
+                v0 = v0 <= thresh_u;
+                v0 = v0 & maxval16;
+                v_store(dst + j, v0);
+                j += v_uint16::nlanes;
             }
             for (; j < roi.width; j++)
@@ -419,15 +428,22 @@ thresh_16u(const Mat& _src, Mat& _dst, ushort thresh, ushort maxval, int type)
         for (i = 0; i < roi.height; i++, src += src_step, dst += dst_step)
         {
             j = 0;
-            for (; j <= roi.width - 16; j += 16)
+            for (; j <= roi.width - 2*v_uint16::nlanes; j += 2*v_uint16::nlanes)
             {
-                v_uint16x8 v0, v1;
-                v0 = v_load(src + j);
-                v1 = v_load(src + j + 8);
+                v_uint16 v0, v1;
+                v0 = vx_load(src + j);
+                v1 = vx_load(src + j + v_uint16::nlanes);
                 v0 = v_min(v0, thresh_u);
                 v1 = v_min(v1, thresh_u);
                 v_store(dst + j, v0);
-                v_store(dst + j + 8, v1);
+                v_store(dst + j + v_uint16::nlanes, v1);
+            }
+            if (j <= roi.width - v_uint16::nlanes)
+            {
+                v_uint16 v0 = vx_load(src + j);
+                v0 = v_min(v0, thresh_u);
+                v_store(dst + j, v0);
+                j += v_uint16::nlanes;
             }
             for (; j < roi.width; j++)
@@ -439,15 +455,22 @@ thresh_16u(const Mat& _src, Mat& _dst, ushort thresh, ushort maxval, int type)
         for (i = 0; i < roi.height; i++, src += src_step, dst += dst_step)
         {
             j = 0;
-            for (; j <= roi.width - 16; j += 16)
+            for (; j <= roi.width - 2*v_uint16::nlanes; j += 2*v_uint16::nlanes)
             {
-                v_uint16x8 v0, v1;
-                v0 = v_load(src + j);
-                v1 = v_load(src + j + 8);
+                v_uint16 v0, v1;
+                v0 = vx_load(src + j);
+                v1 = vx_load(src + j + v_uint16::nlanes);
                 v0 = (thresh_u < v0) & v0;
                 v1 = (thresh_u < v1) & v1;
                 v_store(dst + j, v0);
-                v_store(dst + j + 8, v1);
+                v_store(dst + j + v_uint16::nlanes, v1);
+            }
+            if (j <= roi.width - v_uint16::nlanes)
+            {
+                v_uint16 v0 = vx_load(src + j);
+                v0 = (thresh_u < v0) & v0;
+                v_store(dst + j, v0);
+                j += v_uint16::nlanes;
            }
             for (; j < roi.width; j++)
@@ -459,15 +482,22 @@ thresh_16u(const Mat& _src, Mat& _dst, ushort thresh, ushort maxval, int type)
         for (i = 0; i < roi.height; i++, src += src_step, dst += dst_step)
         {
             j = 0;
-            for (; j <= roi.width - 16; j += 16)
+            for (; j <= roi.width - 2*v_uint16::nlanes; j += 2*v_uint16::nlanes)
             {
-                v_uint16x8 v0, v1;
-                v0 = v_load(src + j);
-                v1 = v_load(src + j + 8);
+                v_uint16 v0, v1;
+                v0 = vx_load(src + j);
+                v1 = vx_load(src + j + v_uint16::nlanes);
                 v0 = (v0 <= thresh_u) & v0;
                 v1 = (v1 <= thresh_u) & v1;
                 v_store(dst + j, v0);
-                v_store(dst + j + 8, v1);
+                v_store(dst + j + v_uint16::nlanes, v1);
+            }
+            if (j <= roi.width - v_uint16::nlanes)
+            {
+                v_uint16 v0 = vx_load(src + j);
+                v0 = (v0 <= thresh_u) & v0;
+                v_store(dst + j, v0);
+                j += v_uint16::nlanes;
             }
             for (; j < roi.width; j++)
@@ -475,12 +505,9 @@ thresh_16u(const Mat& _src, Mat& _dst, ushort thresh, ushort maxval, int type)
         }
         break;
     }
-    }
-    else
-#endif
-    {
+#else
     threshGeneric<ushort>(roi, src, src_step, dst, dst_step, thresh, maxval, type);
-    }
+#endif
 }
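The 16-bit kernels unroll by two vectors per pass, so this patch also adds a one-vector epilogue (`if (j <= roi.width - v_uint16::nlanes)`) between the unrolled loop and the scalar tail; otherwise up to 2*nlanes-1 pixels per row would fall through to scalar code. A sketch of just that three-stage control flow (hypothetical helper, same assumptions as above):

```cpp
#include <opencv2/core.hpp>
#include <opencv2/core/hal/intrin.hpp>

using namespace cv;

// Hypothetical u16 THRESH_BINARY row showing the three-stage tail handling.
static void binary_thresh_row_u16(const ushort* src, ushort* dst, int width,
                                  ushort thresh, ushort maxval)
{
    int j = 0;
#if CV_SIMD
    v_uint16 vthresh = vx_setall_u16(thresh);
    v_uint16 vmax = vx_setall_u16(maxval);
    // Stage 1: two independent vectors per iteration, which exposes more
    // instruction-level parallelism than one vector at a time.
    for (; j <= width - 2*v_uint16::nlanes; j += 2*v_uint16::nlanes)
    {
        v_uint16 v0 = vx_load(src + j);
        v_uint16 v1 = vx_load(src + j + v_uint16::nlanes);
        v_store(dst + j, (vthresh < v0) & vmax);
        v_store(dst + j + v_uint16::nlanes, (vthresh < v1) & vmax);
    }
    // Stage 2: at most one more full vector.
    if (j <= width - v_uint16::nlanes)
    {
        v_store(dst + j, (vthresh < vx_load(src + j)) & vmax);
        j += v_uint16::nlanes;
    }
#endif
    // Stage 3: scalar remainder, fewer than nlanes elements.
    for (; j < width; j++)
        dst[j] = (ushort)(src[j] > thresh ? maxval : 0);
}
```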
 static void
@@ -556,13 +583,10 @@ thresh_16s( const Mat& _src, Mat& _dst, short thresh, short maxval, int type )
     }
 #endif
-#if CV_SIMD128
-    bool useSIMD = checkHardwareSupport( CV_CPU_SSE2 ) || checkHardwareSupport( CV_CPU_NEON );
-    if( useSIMD )
-    {
+#if CV_SIMD
     int i, j;
-    v_int16x8 thresh8 = v_setall_s16( thresh );
-    v_int16x8 maxval8 = v_setall_s16( maxval );
+    v_int16 thresh8 = vx_setall_s16( thresh );
+    v_int16 maxval8 = vx_setall_s16( maxval );
     switch( type )
     {
@@ -570,17 +594,25 @@ thresh_16s( const Mat& _src, Mat& _dst, short thresh, short maxval, int type )
         for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
         {
             j = 0;
-            for( ; j <= roi.width - 16; j += 16 )
+            for( ; j <= roi.width - 2*v_int16::nlanes; j += 2*v_int16::nlanes )
             {
-                v_int16x8 v0, v1;
-                v0 = v_load( src + j );
-                v1 = v_load( src + j + 8 );
+                v_int16 v0, v1;
+                v0 = vx_load( src + j );
+                v1 = vx_load( src + j + v_int16::nlanes );
                 v0 = thresh8 < v0;
                 v1 = thresh8 < v1;
                 v0 = v0 & maxval8;
                 v1 = v1 & maxval8;
                 v_store( dst + j, v0 );
-                v_store( dst + j + 8, v1 );
+                v_store( dst + j + v_int16::nlanes, v1 );
+            }
+            if( j <= roi.width - v_int16::nlanes )
+            {
+                v_int16 v0 = vx_load( src + j );
+                v0 = thresh8 < v0;
+                v0 = v0 & maxval8;
+                v_store( dst + j, v0 );
+                j += v_int16::nlanes;
             }
             for( ; j < roi.width; j++ )
@@ -592,17 +624,25 @@ thresh_16s( const Mat& _src, Mat& _dst, short thresh, short maxval, int type )
         for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
         {
             j = 0;
-            for( ; j <= roi.width - 16; j += 16 )
+            for( ; j <= roi.width - 2*v_int16::nlanes; j += 2*v_int16::nlanes )
             {
-                v_int16x8 v0, v1;
-                v0 = v_load( src + j );
-                v1 = v_load( src + j + 8 );
+                v_int16 v0, v1;
+                v0 = vx_load( src + j );
+                v1 = vx_load( src + j + v_int16::nlanes );
                 v0 = v0 <= thresh8;
                 v1 = v1 <= thresh8;
                 v0 = v0 & maxval8;
                 v1 = v1 & maxval8;
                 v_store( dst + j, v0 );
-                v_store( dst + j + 8, v1 );
+                v_store( dst + j + v_int16::nlanes, v1 );
+            }
+            if( j <= roi.width - v_int16::nlanes )
+            {
+                v_int16 v0 = vx_load( src + j );
+                v0 = v0 <= thresh8;
+                v0 = v0 & maxval8;
+                v_store( dst + j, v0 );
+                j += v_int16::nlanes;
             }
             for( ; j < roi.width; j++ )
@@ -614,15 +654,22 @@ thresh_16s( const Mat& _src, Mat& _dst, short thresh, short maxval, int type )
         for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
         {
             j = 0;
-            for( ; j <= roi.width - 16; j += 16 )
+            for( ; j <= roi.width - 2*v_int16::nlanes; j += 2*v_int16::nlanes )
             {
-                v_int16x8 v0, v1;
-                v0 = v_load( src + j );
-                v1 = v_load( src + j + 8 );
+                v_int16 v0, v1;
+                v0 = vx_load( src + j );
+                v1 = vx_load( src + j + v_int16::nlanes );
                 v0 = v_min( v0, thresh8 );
                 v1 = v_min( v1, thresh8 );
                 v_store( dst + j, v0 );
-                v_store( dst + j + 8, v1 );
+                v_store( dst + j + v_int16::nlanes, v1 );
+            }
+            if( j <= roi.width - v_int16::nlanes )
+            {
+                v_int16 v0 = vx_load( src + j );
+                v0 = v_min( v0, thresh8 );
+                v_store( dst + j, v0 );
+                j += v_int16::nlanes;
             }
             for( ; j < roi.width; j++ )
@@ -634,15 +681,22 @@ thresh_16s( const Mat& _src, Mat& _dst, short thresh, short maxval, int type )
         for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
         {
             j = 0;
-            for( ; j <= roi.width - 16; j += 16 )
+            for( ; j <= roi.width - 2*v_int16::nlanes; j += 2*v_int16::nlanes )
             {
-                v_int16x8 v0, v1;
-                v0 = v_load( src + j );
-                v1 = v_load( src + j + 8 );
+                v_int16 v0, v1;
+                v0 = vx_load( src + j );
+                v1 = vx_load( src + j + v_int16::nlanes );
                 v0 = ( thresh8 < v0 ) & v0;
                 v1 = ( thresh8 < v1 ) & v1;
                 v_store( dst + j, v0 );
-                v_store( dst + j + 8, v1 );
+                v_store( dst + j + v_int16::nlanes, v1 );
+            }
+            if( j <= roi.width - v_int16::nlanes )
+            {
+                v_int16 v0 = vx_load( src + j );
+                v0 = ( thresh8 < v0 ) & v0;
+                v_store( dst + j, v0 );
+                j += v_int16::nlanes;
             }
             for( ; j < roi.width; j++ )
@@ -654,15 +708,22 @@ thresh_16s( const Mat& _src, Mat& _dst, short thresh, short maxval, int type )
         for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
         {
             j = 0;
-            for( ; j <= roi.width - 16; j += 16 )
+            for( ; j <= roi.width - 2*v_int16::nlanes; j += 2*v_int16::nlanes )
             {
-                v_int16x8 v0, v1;
-                v0 = v_load( src + j );
-                v1 = v_load( src + j + 8 );
+                v_int16 v0, v1;
+                v0 = vx_load( src + j );
+                v1 = vx_load( src + j + v_int16::nlanes );
                 v0 = ( v0 <= thresh8 ) & v0;
                 v1 = ( v1 <= thresh8 ) & v1;
                 v_store( dst + j, v0 );
-                v_store( dst + j + 8, v1 );
+                v_store( dst + j + v_int16::nlanes, v1 );
+            }
+            if( j <= roi.width - v_int16::nlanes )
+            {
+                v_int16 v0 = vx_load( src + j );
+                v0 = ( v0 <= thresh8 ) & v0;
+                v_store( dst + j, v0 );
+                j += v_int16::nlanes;
             }
             for( ; j < roi.width; j++ )
@@ -672,12 +733,9 @@ thresh_16s( const Mat& _src, Mat& _dst, short thresh, short maxval, int type )
     default:
         CV_Error( CV_StsBadArg, "" ); return;
     }
-    }
-    else
-#endif
-    {
+#else
     threshGeneric<short>(roi, src, src_step, dst, dst_step, thresh, maxval, type);
-    }
+#endif
 }
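Same mechanical conversion for the signed 16-bit path. One detail worth noting: THRESH_TRUNC needs no compare-and-mask at all, since `v_min(v, thresh)` is the whole operation, which is why the TRUNC cases are two vector instructions shorter per iteration than BINARY. A reduced sketch (hypothetical helper, same caveats as above):

```cpp
#include <algorithm>
#include <opencv2/core.hpp>
#include <opencv2/core/hal/intrin.hpp>

using namespace cv;

// Hypothetical s16 THRESH_TRUNC row: dst[j] = min(src[j], thresh).
static void trunc_thresh_row_s16(const short* src, short* dst, int width,
                                 short thresh)
{
    int j = 0;
#if CV_SIMD
    v_int16 vthresh = vx_setall_s16(thresh);
    for (; j <= width - v_int16::nlanes; j += v_int16::nlanes)
        v_store(dst + j, v_min(vx_load(src + j), vthresh));  // one op per vector
#endif
    for (; j < width; j++)
        dst[j] = std::min(src[j], thresh);
}
```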
@@ -736,13 +794,10 @@ thresh_32f( const Mat& _src, Mat& _dst, float thresh, float maxval, int type )
     }
 #endif
-#if CV_SIMD128
-    bool useSIMD = checkHardwareSupport( CV_CPU_SSE2 ) || checkHardwareSupport( CV_CPU_NEON );
-    if( useSIMD )
-    {
+#if CV_SIMD
     int i, j;
-    v_float32x4 thresh4 = v_setall_f32( thresh );
-    v_float32x4 maxval4 = v_setall_f32( maxval );
+    v_float32 thresh4 = vx_setall_f32( thresh );
+    v_float32 maxval4 = vx_setall_f32( maxval );
     switch( type )
     {
@@ -750,17 +805,25 @@ thresh_32f( const Mat& _src, Mat& _dst, float thresh, float maxval, int type )
         for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
         {
             j = 0;
-            for( ; j <= roi.width - 8; j += 8 )
+            for( ; j <= roi.width - 2*v_float32::nlanes; j += 2*v_float32::nlanes )
             {
-                v_float32x4 v0, v1;
-                v0 = v_load( src + j );
-                v1 = v_load( src + j + 4 );
+                v_float32 v0, v1;
+                v0 = vx_load( src + j );
+                v1 = vx_load( src + j + v_float32::nlanes );
                 v0 = thresh4 < v0;
                 v1 = thresh4 < v1;
                 v0 = v0 & maxval4;
                 v1 = v1 & maxval4;
                 v_store( dst + j, v0 );
-                v_store( dst + j + 4, v1 );
+                v_store( dst + j + v_float32::nlanes, v1 );
+            }
+            if( j <= roi.width - v_float32::nlanes )
+            {
+                v_float32 v0 = vx_load( src + j );
+                v0 = thresh4 < v0;
+                v0 = v0 & maxval4;
+                v_store( dst + j, v0 );
+                j += v_float32::nlanes;
             }
             for( ; j < roi.width; j++ )
@@ -772,17 +835,25 @@ thresh_32f( const Mat& _src, Mat& _dst, float thresh, float maxval, int type )
         for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
         {
             j = 0;
-            for( ; j <= roi.width - 8; j += 8 )
+            for( ; j <= roi.width - 2*v_float32::nlanes; j += 2*v_float32::nlanes )
             {
-                v_float32x4 v0, v1;
-                v0 = v_load( src + j );
-                v1 = v_load( src + j + 4 );
+                v_float32 v0, v1;
+                v0 = vx_load( src + j );
+                v1 = vx_load( src + j + v_float32::nlanes );
                 v0 = v0 <= thresh4;
                 v1 = v1 <= thresh4;
                 v0 = v0 & maxval4;
                 v1 = v1 & maxval4;
                 v_store( dst + j, v0 );
-                v_store( dst + j + 4, v1 );
+                v_store( dst + j + v_float32::nlanes, v1 );
+            }
+            if( j <= roi.width - v_float32::nlanes )
+            {
+                v_float32 v0 = vx_load( src + j );
+                v0 = v0 <= thresh4;
+                v0 = v0 & maxval4;
+                v_store( dst + j, v0 );
+                j += v_float32::nlanes;
             }
             for( ; j < roi.width; j++ )
@@ -794,15 +865,22 @@ thresh_32f( const Mat& _src, Mat& _dst, float thresh, float maxval, int type )
         for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
         {
             j = 0;
-            for( ; j <= roi.width - 8; j += 8 )
+            for( ; j <= roi.width - 2*v_float32::nlanes; j += 2*v_float32::nlanes )
             {
-                v_float32x4 v0, v1;
-                v0 = v_load( src + j );
-                v1 = v_load( src + j + 4 );
+                v_float32 v0, v1;
+                v0 = vx_load( src + j );
+                v1 = vx_load( src + j + v_float32::nlanes );
                 v0 = v_min( v0, thresh4 );
                 v1 = v_min( v1, thresh4 );
                 v_store( dst + j, v0 );
-                v_store( dst + j + 4, v1 );
+                v_store( dst + j + v_float32::nlanes, v1 );
+            }
+            if( j <= roi.width - v_float32::nlanes )
+            {
+                v_float32 v0 = vx_load( src + j );
+                v0 = v_min( v0, thresh4 );
+                v_store( dst + j, v0 );
+                j += v_float32::nlanes;
             }
             for( ; j < roi.width; j++ )
@@ -814,15 +892,22 @@ thresh_32f( const Mat& _src, Mat& _dst, float thresh, float maxval, int type )
         for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
         {
             j = 0;
-            for( ; j <= roi.width - 8; j += 8 )
+            for( ; j <= roi.width - 2*v_float32::nlanes; j += 2*v_float32::nlanes )
             {
-                v_float32x4 v0, v1;
-                v0 = v_load( src + j );
-                v1 = v_load( src + j + 4 );
+                v_float32 v0, v1;
+                v0 = vx_load( src + j );
+                v1 = vx_load( src + j + v_float32::nlanes );
                 v0 = ( thresh4 < v0 ) & v0;
                 v1 = ( thresh4 < v1 ) & v1;
                 v_store( dst + j, v0 );
-                v_store( dst + j + 4, v1 );
+                v_store( dst + j + v_float32::nlanes, v1 );
+            }
+            if( j <= roi.width - v_float32::nlanes )
+            {
+                v_float32 v0 = vx_load( src + j );
+                v0 = ( thresh4 < v0 ) & v0;
+                v_store( dst + j, v0 );
+                j += v_float32::nlanes;
             }
             for( ; j < roi.width; j++ )
@@ -834,15 +919,22 @@ thresh_32f( const Mat& _src, Mat& _dst, float thresh, float maxval, int type )
         for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
         {
             j = 0;
-            for( ; j <= roi.width - 8; j += 8 )
+            for( ; j <= roi.width - 2*v_float32::nlanes; j += 2*v_float32::nlanes )
             {
-                v_float32x4 v0, v1;
-                v0 = v_load( src + j );
-                v1 = v_load( src + j + 4 );
+                v_float32 v0, v1;
+                v0 = vx_load( src + j );
+                v1 = vx_load( src + j + v_float32::nlanes );
                 v0 = ( v0 <= thresh4 ) & v0;
                 v1 = ( v1 <= thresh4 ) & v1;
                 v_store( dst + j, v0 );
-                v_store( dst + j + 4, v1 );
+                v_store( dst + j + v_float32::nlanes, v1 );
+            }
+            if( j <= roi.width - v_float32::nlanes )
+            {
+                v_float32 v0 = vx_load( src + j );
+                v0 = ( v0 <= thresh4 ) & v0;
+                v_store( dst + j, v0 );
+                j += v_float32::nlanes;
             }
             for( ; j < roi.width; j++ )
@@ -852,12 +944,9 @@ thresh_32f( const Mat& _src, Mat& _dst, float thresh, float maxval, int type )
     default:
         CV_Error( CV_StsBadArg, "" ); return;
     }
-    }
-    else
-#endif
-    {
+#else
     threshGeneric<float>(roi, src, src_step, dst, dst_step, thresh, maxval, type);
-    }
+#endif
 }
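The float path uses the same mask trick as the integer paths: a vector compare yields all-ones or all-zeros bits per lane, so `mask & maxval4` produces either maxval's exact bit pattern or +0.0f. A small equivalence check against the scalar definition (hypothetical test code, same assumptions as the sketches above):

```cpp
#include <cassert>
#include <opencv2/core.hpp>
#include <opencv2/core/hal/intrin.hpp>

using namespace cv;

int main()
{
    const int n = 8;
    float src[n] = { -1.f, 0.25f, 0.5f, 0.75f, 2.f, 3.f, -4.f, 5.f };
    const float thresh = 0.5f, maxval = 7.f;
    float dst[n] = { 0 };

    int j = 0;
#if CV_SIMD
    v_float32 vthresh = vx_setall_f32(thresh), vmax = vx_setall_f32(maxval);
    for (; j <= n - v_float32::nlanes; j += v_float32::nlanes)
        v_store(dst + j, (vthresh < vx_load(src + j)) & vmax);  // mask & maxval
#endif
    for (; j < n; j++)  // scalar tail (or the whole array without CV_SIMD)
        dst[j] = src[j] > thresh ? maxval : 0.f;

    for (j = 0; j < n; j++)  // bitwise select matches the scalar definition
        assert(dst[j] == (src[j] > thresh ? maxval : 0.f));
    return 0;
}
```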
 static void
@@ -876,13 +965,10 @@ thresh_64f(const Mat& _src, Mat& _dst, double thresh, double maxval, int type)
         roi.height = 1;
     }
-#if CV_SIMD128_64F
-    bool useSIMD = checkHardwareSupport( CV_CPU_SSE2 ) || checkHardwareSupport( CV_CPU_NEON );
-    if( useSIMD )
-    {
+#if CV_SIMD_64F
     int i, j;
-    v_float64x2 thresh2 = v_setall_f64( thresh );
-    v_float64x2 maxval2 = v_setall_f64( maxval );
+    v_float64 thresh2 = vx_setall_f64( thresh );
+    v_float64 maxval2 = vx_setall_f64( maxval );
     switch( type )
     {
@@ -890,17 +976,25 @@ thresh_64f(const Mat& _src, Mat& _dst, double thresh, double maxval, int type)
         for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
         {
             j = 0;
-            for( ; j <= roi.width - 4; j += 4 )
+            for( ; j <= roi.width - 2*v_float64::nlanes; j += 2*v_float64::nlanes )
             {
-                v_float64x2 v0, v1;
-                v0 = v_load( src + j );
-                v1 = v_load( src + j + 2 );
+                v_float64 v0, v1;
+                v0 = vx_load( src + j );
+                v1 = vx_load( src + j + v_float64::nlanes );
                 v0 = thresh2 < v0;
                 v1 = thresh2 < v1;
                 v0 = v0 & maxval2;
                 v1 = v1 & maxval2;
                 v_store( dst + j, v0 );
-                v_store( dst + j + 2, v1 );
+                v_store( dst + j + v_float64::nlanes, v1 );
+            }
+            if( j <= roi.width - v_float64::nlanes )
+            {
+                v_float64 v0 = vx_load( src + j );
+                v0 = thresh2 < v0;
+                v0 = v0 & maxval2;
+                v_store( dst + j, v0 );
+                j += v_float64::nlanes;
            }
             for( ; j < roi.width; j++ )
@@ -912,17 +1006,25 @@ thresh_64f(const Mat& _src, Mat& _dst, double thresh, double maxval, int type)
         for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
         {
             j = 0;
-            for( ; j <= roi.width - 4; j += 4 )
+            for( ; j <= roi.width - 2*v_float64::nlanes; j += 2*v_float64::nlanes )
             {
-                v_float64x2 v0, v1;
-                v0 = v_load( src + j );
-                v1 = v_load( src + j + 2 );
+                v_float64 v0, v1;
+                v0 = vx_load( src + j );
+                v1 = vx_load( src + j + v_float64::nlanes );
                 v0 = v0 <= thresh2;
                 v1 = v1 <= thresh2;
                 v0 = v0 & maxval2;
                 v1 = v1 & maxval2;
                 v_store( dst + j, v0 );
-                v_store( dst + j + 2, v1 );
+                v_store( dst + j + v_float64::nlanes, v1 );
+            }
+            if( j <= roi.width - v_float64::nlanes )
+            {
+                v_float64 v0 = vx_load( src + j );
+                v0 = v0 <= thresh2;
+                v0 = v0 & maxval2;
+                v_store( dst + j, v0 );
+                j += v_float64::nlanes;
             }
             for( ; j < roi.width; j++ )
@@ -934,15 +1036,22 @@ thresh_64f(const Mat& _src, Mat& _dst, double thresh, double maxval, int type)
         for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
         {
             j = 0;
-            for( ; j <= roi.width - 4; j += 4 )
+            for( ; j <= roi.width - 2*v_float64::nlanes; j += 2*v_float64::nlanes )
             {
-                v_float64x2 v0, v1;
-                v0 = v_load( src + j );
-                v1 = v_load( src + j + 2 );
+                v_float64 v0, v1;
+                v0 = vx_load( src + j );
+                v1 = vx_load( src + j + v_float64::nlanes );
                 v0 = v_min( v0, thresh2 );
                 v1 = v_min( v1, thresh2 );
                 v_store( dst + j, v0 );
-                v_store( dst + j + 2, v1 );
+                v_store( dst + j + v_float64::nlanes, v1 );
+            }
+            if( j <= roi.width - v_float64::nlanes )
+            {
+                v_float64 v0 = vx_load( src + j );
+                v0 = v_min( v0, thresh2 );
+                v_store( dst + j, v0 );
+                j += v_float64::nlanes;
             }
             for( ; j < roi.width; j++ )
@@ -954,15 +1063,22 @@ thresh_64f(const Mat& _src, Mat& _dst, double thresh, double maxval, int type)
         for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
         {
             j = 0;
-            for( ; j <= roi.width - 4; j += 4 )
+            for( ; j <= roi.width - 2*v_float64::nlanes; j += 2*v_float64::nlanes )
             {
-                v_float64x2 v0, v1;
-                v0 = v_load( src + j );
-                v1 = v_load( src + j + 2 );
+                v_float64 v0, v1;
+                v0 = vx_load( src + j );
+                v1 = vx_load( src + j + v_float64::nlanes );
                 v0 = ( thresh2 < v0 ) & v0;
                 v1 = ( thresh2 < v1 ) & v1;
                 v_store( dst + j, v0 );
-                v_store( dst + j + 2, v1 );
+                v_store( dst + j + v_float64::nlanes, v1 );
+            }
+            if( j <= roi.width - v_float64::nlanes )
+            {
+                v_float64 v0 = vx_load( src + j );
+                v0 = ( thresh2 < v0 ) & v0;
+                v_store( dst + j, v0 );
+                j += v_float64::nlanes;
             }
             for( ; j < roi.width; j++ )
@@ -974,15 +1090,22 @@ thresh_64f(const Mat& _src, Mat& _dst, double thresh, double maxval, int type)
         for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
         {
             j = 0;
-            for( ; j <= roi.width - 4; j += 4 )
+            for( ; j <= roi.width - 2*v_float64::nlanes; j += 2*v_float64::nlanes )
             {
-                v_float64x2 v0, v1;
-                v0 = v_load( src + j );
-                v1 = v_load( src + j + 2 );
+                v_float64 v0, v1;
+                v0 = vx_load( src + j );
+                v1 = vx_load( src + j + v_float64::nlanes );
                 v0 = ( v0 <= thresh2 ) & v0;
                 v1 = ( v1 <= thresh2 ) & v1;
                 v_store( dst + j, v0 );
-                v_store( dst + j + 2, v1 );
+                v_store( dst + j + v_float64::nlanes, v1 );
+            }
+            if( j <= roi.width - v_float64::nlanes )
+            {
+                v_float64 v0 = vx_load( src + j );
+                v0 = ( v0 <= thresh2 ) & v0;
+                v_store( dst + j, v0 );
+                j += v_float64::nlanes;
             }
             for( ; j < roi.width; j++ )
@@ -992,12 +1115,9 @@ thresh_64f(const Mat& _src, Mat& _dst, double thresh, double maxval, int type)
     default:
         CV_Error(CV_StsBadArg, ""); return;
     }
-    }
-    else
-#endif
-    {
+#else
     threshGeneric<double>(roi, src, src_step, dst, dst_step, thresh, maxval, type);
-    }
+#endif
 }
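The double path is gated on CV_SIMD_64F rather than CV_SIMD, because not every SIMD target provides 64-bit float lanes; without it the function now compiles straight to the threshGeneric<double> fallback instead of branching at runtime. All of these kernels are reached through the public API, for example:

```cpp
#include <opencv2/core.hpp>
#include <opencv2/imgproc.hpp>

int main()
{
    cv::Mat src(480, 640, CV_64F), dst;
    cv::randu(src, 0.0, 1.0);  // random doubles in [0, 1)
    // Dispatches to thresh_64f above: dst = (src > 0.5) ? 1.0 : 0.0
    cv::threshold(src, dst, 0.5, 1.0, cv::THRESH_BINARY);
    return 0;
}
```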
 #ifdef HAVE_IPP
...