Commit ea6410d1 authored by Tomoaki Teshima

use universal intrinsic in threshold

  * add performance test for 32F and 64F threshold
  * requires update of opencv_extra
parent 0f03f692
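The diff below replaces the hand-written SSE2 and NEON branches in OpenCV's thresh.cpp with the universal intrinsics from opencv2/core/hal/intrin.hpp (v_uint8x16, v_int16x8, v_float32x4, v_float64x2), so a single code path compiles to whichever 128-bit SIMD backend the target provides. As a reading aid, here is a minimal standalone sketch of that pattern for the 8-bit THRESH_BINARY case; the function name and buffer handling are illustrative and not part of the commit:

```cpp
#include "opencv2/core.hpp"
#include "opencv2/core/hal/intrin.hpp"

// dst[j] = src[j] > thresh ? maxval : 0, processing 16 pixels per iteration
// when a 128-bit SIMD backend (SSE2, NEON, ...) is available.
static void thresh_binary_8u_row(const uchar* src, uchar* dst, int width,
                                 uchar thresh, uchar maxval)
{
    int j = 0;
#if CV_SIMD128
    cv::v_uint8x16 vthresh = cv::v_setall_u8(thresh);
    cv::v_uint8x16 vmaxval = cv::v_setall_u8(maxval);
    for( ; j <= width - 16; j += 16 )
    {
        cv::v_uint8x16 v = cv::v_load(src + j);          // load 16 pixels
        cv::v_store(dst + j, (vthresh < v) & vmaxval);   // all-ones mask & maxval
    }
#endif
    for( ; j < width; j++ )                              // scalar tail
        dst[j] = src[j] > thresh ? maxval : 0;
}
```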
@@ -14,7 +14,7 @@ typedef perf::TestBaseWithParam<Size_MatType_ThreshType_t> Size_MatType_ThreshTy
 PERF_TEST_P(Size_MatType_ThreshType, threshold,
             testing::Combine(
                 testing::Values(TYPICAL_MAT_SIZES),
-                testing::Values(CV_8UC1, CV_16SC1),
+                testing::Values(CV_8UC1, CV_16SC1, CV_32FC1, CV_64FC1),
                 ThreshType::all()
                 )
             )
...
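The hunk above only widens the tested type list; the test body itself is unchanged. For context, an OpenCV perf test of this shape typically ends in a SANITY_CHECK whose reference data lives in the opencv_extra repository, which is why the commit message notes that opencv_extra needs an update for the new CV_32FC1/CV_64FC1 combinations. A rough, illustrative sketch of such a body (variable names and threshold values are made up, not the literal file contents):

```cpp
PERF_TEST_P(Size_MatType_ThreshType, threshold_example,
            testing::Combine(testing::Values(TYPICAL_MAT_SIZES),
                             testing::Values(CV_8UC1, CV_16SC1, CV_32FC1, CV_64FC1),
                             ThreshType::all()))
{
    Size sz         = get<0>(GetParam());
    int  matType    = get<1>(GetParam());
    int  threshType = get<2>(GetParam());
    double thresh   = 127.0, maxval = 200.0;   // illustrative values

    Mat src(sz, matType), dst(sz, matType);
    declare.in(src, WARMUP_RNG).out(dst);

    TEST_CYCLE() threshold(src, dst, thresh, maxval, threshType);

    // Compares dst against per-parameter reference data from opencv_extra,
    // so new type combinations need regenerated reference files there.
    SANITY_CHECK(dst);
}
```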
@@ -42,23 +42,7 @@
 #include "precomp.hpp"
 #include "opencl_kernels_imgproc.hpp"
+#include "opencv2/core/hal/intrin.hpp"
-
-#if CV_NEON && defined(__aarch64__)
-#include <arm_neon.h>
-namespace cv {
-// Workaround with missing definitions of vreinterpretq_u64_f64/vreinterpretq_f64_u64
-template <typename T> static inline
-uint64x2_t vreinterpretq_u64_f64(T a)
-{
-    return (uint64x2_t) a;
-}
-template <typename T> static inline
-float64x2_t vreinterpretq_f64_u64(T a)
-{
-    return (float64x2_t) a;
-}
-} // namespace cv
-#endif
 
 namespace cv
 {
@@ -137,38 +121,25 @@ thresh_8u( const Mat& _src, Mat& _dst, uchar thresh, uchar maxval, int type )
     int j = 0;
     const uchar* src = _src.ptr();
     uchar* dst = _dst.ptr();
 
-#if CV_SSE2
-    if( (roi.width >= 8) && checkHardwareSupport(CV_CPU_SSE2) )
+#if CV_SIMD128
+    bool useSIMD = checkHardwareSupport( CV_CPU_SSE2 ) || checkHardwareSupport( CV_CPU_NEON );
+    if( useSIMD )
     {
-        __m128i _x80 = _mm_set1_epi8('\x80');
-        __m128i thresh_u = _mm_set1_epi8(thresh);
-        __m128i thresh_s = _mm_set1_epi8(thresh ^ 0x80);
-        __m128i maxval_ = _mm_set1_epi8(maxval);
+        v_uint8x16 thresh_u = v_setall_u8( thresh );
+        v_uint8x16 maxval16 = v_setall_u8( maxval );
 
         switch( type )
        {
        case THRESH_BINARY:
            for( int i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
            {
-                for( j = 0; j <= roi.width - 32; j += 32 )
-                {
-                    __m128i v0, v1;
-                    v0 = _mm_loadu_si128( (const __m128i*)(src + j) );
-                    v1 = _mm_loadu_si128( (const __m128i*)(src + j + 16) );
-                    v0 = _mm_cmpgt_epi8( _mm_xor_si128(v0, _x80), thresh_s );
-                    v1 = _mm_cmpgt_epi8( _mm_xor_si128(v1, _x80), thresh_s );
-                    v0 = _mm_and_si128( v0, maxval_ );
-                    v1 = _mm_and_si128( v1, maxval_ );
-                    _mm_storeu_si128( (__m128i*)(dst + j), v0 );
-                    _mm_storeu_si128( (__m128i*)(dst + j + 16), v1 );
-                }
-
-                for( ; j <= roi.width - 8; j += 8 )
+                for( j = 0; j <= roi.width - 16; j += 16 )
                {
-                    __m128i v0 = _mm_loadl_epi64( (const __m128i*)(src + j) );
-                    v0 = _mm_cmpgt_epi8( _mm_xor_si128(v0, _x80), thresh_s );
-                    v0 = _mm_and_si128( v0, maxval_ );
-                    _mm_storel_epi64( (__m128i*)(dst + j), v0 );
+                    v_uint8x16 v0;
+                    v0 = v_load( src + j );
+                    v0 = thresh_u < v0;
+                    v0 = v0 & maxval16;
+                    v_store( dst + j, v0 );
                }
            }
            break;
@@ -176,25 +147,13 @@ thresh_8u( const Mat& _src, Mat& _dst, uchar thresh, uchar maxval, int type )
        case THRESH_BINARY_INV:
            for( int i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
            {
-                for( j = 0; j <= roi.width - 32; j += 32 )
-                {
-                    __m128i v0, v1;
-                    v0 = _mm_loadu_si128( (const __m128i*)(src + j) );
-                    v1 = _mm_loadu_si128( (const __m128i*)(src + j + 16) );
-                    v0 = _mm_cmpgt_epi8( _mm_xor_si128(v0, _x80), thresh_s );
-                    v1 = _mm_cmpgt_epi8( _mm_xor_si128(v1, _x80), thresh_s );
-                    v0 = _mm_andnot_si128( v0, maxval_ );
-                    v1 = _mm_andnot_si128( v1, maxval_ );
-                    _mm_storeu_si128( (__m128i*)(dst + j), v0 );
-                    _mm_storeu_si128( (__m128i*)(dst + j + 16), v1 );
-                }
-
-                for( ; j <= roi.width - 8; j += 8 )
+                for( j = 0; j <= roi.width - 16; j += 16 )
                {
-                    __m128i v0 = _mm_loadl_epi64( (const __m128i*)(src + j) );
-                    v0 = _mm_cmpgt_epi8( _mm_xor_si128(v0, _x80), thresh_s );
-                    v0 = _mm_andnot_si128( v0, maxval_ );
-                    _mm_storel_epi64( (__m128i*)(dst + j), v0 );
+                    v_uint8x16 v0;
+                    v0 = v_load( src + j );
+                    v0 = v0 <= thresh_u;
+                    v0 = v0 & maxval16;
+                    v_store( dst + j, v0 );
                }
            }
            break;
@@ -202,22 +161,12 @@ thresh_8u( const Mat& _src, Mat& _dst, uchar thresh, uchar maxval, int type )
        case THRESH_TRUNC:
            for( int i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
            {
-                for( j = 0; j <= roi.width - 32; j += 32 )
-                {
-                    __m128i v0, v1;
-                    v0 = _mm_loadu_si128( (const __m128i*)(src + j) );
-                    v1 = _mm_loadu_si128( (const __m128i*)(src + j + 16) );
-                    v0 = _mm_subs_epu8( v0, _mm_subs_epu8( v0, thresh_u ));
-                    v1 = _mm_subs_epu8( v1, _mm_subs_epu8( v1, thresh_u ));
-                    _mm_storeu_si128( (__m128i*)(dst + j), v0 );
-                    _mm_storeu_si128( (__m128i*)(dst + j + 16), v1 );
-                }
-
-                for( ; j <= roi.width - 8; j += 8 )
+                for( j = 0; j <= roi.width - 16; j += 16 )
                {
-                    __m128i v0 = _mm_loadl_epi64( (const __m128i*)(src + j) );
-                    v0 = _mm_subs_epu8( v0, _mm_subs_epu8( v0, thresh_u ));
-                    _mm_storel_epi64( (__m128i*)(dst + j), v0 );
+                    v_uint8x16 v0;
+                    v0 = v_load( src + j );
+                    v0 = v0 - ( v0 - thresh_u );
+                    v_store( dst + j, v0 );
                }
            }
            break;
@@ -225,22 +174,12 @@ thresh_8u( const Mat& _src, Mat& _dst, uchar thresh, uchar maxval, int type )
        case THRESH_TOZERO:
            for( int i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
            {
-                for( j = 0; j <= roi.width - 32; j += 32 )
-                {
-                    __m128i v0, v1;
-                    v0 = _mm_loadu_si128( (const __m128i*)(src + j) );
-                    v1 = _mm_loadu_si128( (const __m128i*)(src + j + 16) );
-                    v0 = _mm_and_si128( v0, _mm_cmpgt_epi8(_mm_xor_si128(v0, _x80), thresh_s ));
-                    v1 = _mm_and_si128( v1, _mm_cmpgt_epi8(_mm_xor_si128(v1, _x80), thresh_s ));
-                    _mm_storeu_si128( (__m128i*)(dst + j), v0 );
-                    _mm_storeu_si128( (__m128i*)(dst + j + 16), v1 );
-                }
-
-                for( ; j <= roi.width - 8; j += 8 )
+                for( j = 0; j <= roi.width - 16; j += 16 )
                {
-                    __m128i v0 = _mm_loadl_epi64( (const __m128i*)(src + j) );
-                    v0 = _mm_and_si128( v0, _mm_cmpgt_epi8(_mm_xor_si128(v0, _x80), thresh_s ));
-                    _mm_storel_epi64( (__m128i*)(dst + j), v0 );
+                    v_uint8x16 v0;
+                    v0 = v_load( src + j );
+                    v0 = ( thresh_u < v0 ) & v0;
+                    v_store( dst + j, v0 );
                }
            }
            break;
@@ -248,76 +187,12 @@ thresh_8u( const Mat& _src, Mat& _dst, uchar thresh, uchar maxval, int type )
        case THRESH_TOZERO_INV:
            for( int i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
            {
-                for( j = 0; j <= roi.width - 32; j += 32 )
-                {
-                    __m128i v0, v1;
-                    v0 = _mm_loadu_si128( (const __m128i*)(src + j) );
-                    v1 = _mm_loadu_si128( (const __m128i*)(src + j + 16) );
-                    v0 = _mm_andnot_si128( _mm_cmpgt_epi8(_mm_xor_si128(v0, _x80), thresh_s ), v0 );
-                    v1 = _mm_andnot_si128( _mm_cmpgt_epi8(_mm_xor_si128(v1, _x80), thresh_s ), v1 );
-                    _mm_storeu_si128( (__m128i*)(dst + j), v0 );
-                    _mm_storeu_si128( (__m128i*)(dst + j + 16), v1 );
-                }
-
-                for( ; j <= roi.width - 8; j += 8 )
+                for( j = 0; j <= roi.width - 16; j += 16 )
                {
-                    __m128i v0 = _mm_loadl_epi64( (const __m128i*)(src + j) );
-                    v0 = _mm_andnot_si128( _mm_cmpgt_epi8(_mm_xor_si128(v0, _x80), thresh_s ), v0 );
-                    _mm_storel_epi64( (__m128i*)(dst + j), v0 );
+                    v_uint8x16 v0;
+                    v0 = v_load( src + j );
+                    v0 = ( v0 <= thresh_u ) & v0;
+                    v_store( dst + j, v0 );
                }
            }
            break;
        }
    }
-#elif CV_NEON
-    if( roi.width >= 16 )
-    {
-        uint8x16_t v_thresh = vdupq_n_u8(thresh), v_maxval = vdupq_n_u8(maxval);
-        switch( type )
-        {
-        case THRESH_BINARY:
-            for( int i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
-            {
-                for ( j = 0; j <= roi.width - 16; j += 16)
-                    vst1q_u8(dst + j, vandq_u8(vcgtq_u8(vld1q_u8(src + j), v_thresh), v_maxval));
-            }
-            break;
-        case THRESH_BINARY_INV:
-            for( int i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
-            {
-                for ( j = 0; j <= roi.width - 16; j += 16)
-                    vst1q_u8(dst + j, vandq_u8(vcleq_u8(vld1q_u8(src + j), v_thresh), v_maxval));
-            }
-            break;
-        case THRESH_TRUNC:
-            for( int i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
-            {
-                for ( j = 0; j <= roi.width - 16; j += 16)
-                    vst1q_u8(dst + j, vminq_u8(vld1q_u8(src + j), v_thresh));
-            }
-            break;
-        case THRESH_TOZERO:
-            for( int i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
-            {
-                for ( j = 0; j <= roi.width - 16; j += 16)
-                {
-                    uint8x16_t v_src = vld1q_u8(src + j), v_mask = vcgtq_u8(v_src, v_thresh);
-                    vst1q_u8(dst + j, vandq_u8(v_mask, v_src));
-                }
-            }
-            break;
-        case THRESH_TOZERO_INV:
-            for( int i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
-            {
-                for ( j = 0; j <= roi.width - 16; j += 16)
-                {
-                    uint8x16_t v_src = vld1q_u8(src + j), v_mask = vcleq_u8(v_src, v_thresh);
-                    vst1q_u8(dst + j, vandq_u8(v_mask, v_src));
-                }
-            }
-            break;
-        }
-    }
@@ -404,10 +279,6 @@ thresh_16s( const Mat& _src, Mat& _dst, short thresh, short maxval, int type )
     size_t src_step = _src.step/sizeof(src[0]);
     size_t dst_step = _dst.step/sizeof(dst[0]);
 
-#if CV_SSE2
-    volatile bool useSIMD = checkHardwareSupport(CV_CPU_SSE2);
-#endif
-
     if( _src.isContinuous() && _dst.isContinuous() )
     {
         roi.width *= roi.height;
@@ -471,38 +342,31 @@ thresh_16s( const Mat& _src, Mat& _dst, short thresh, short maxval, int type )
     }
 #endif
 
+#if CV_SIMD128
+    bool useSIMD = checkHardwareSupport( CV_CPU_SSE2 ) || checkHardwareSupport( CV_CPU_NEON );
+    if( useSIMD )
+    {
+        v_int16x8 thresh8 = v_setall_s16( thresh );
+        v_int16x8 maxval8 = v_setall_s16( maxval );
+
        switch( type )
        {
        case THRESH_BINARY:
            for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
            {
                j = 0;
-#if CV_SSE2
-                if( useSIMD )
-                {
-                    __m128i thresh8 = _mm_set1_epi16(thresh), maxval8 = _mm_set1_epi16(maxval);
                for( ; j <= roi.width - 16; j += 16 )
                {
-                    __m128i v0, v1;
-                    v0 = _mm_loadu_si128( (const __m128i*)(src + j) );
-                    v1 = _mm_loadu_si128( (const __m128i*)(src + j + 8) );
-                    v0 = _mm_cmpgt_epi16( v0, thresh8 );
-                    v1 = _mm_cmpgt_epi16( v1, thresh8 );
-                    v0 = _mm_and_si128( v0, maxval8 );
-                    v1 = _mm_and_si128( v1, maxval8 );
-                    _mm_storeu_si128((__m128i*)(dst + j), v0 );
-                    _mm_storeu_si128((__m128i*)(dst + j + 8), v1 );
+                    v_int16x8 v0, v1;
+                    v0 = v_load( src + j );
+                    v1 = v_load( src + j + 8 );
+                    v0 = thresh8 < v0;
+                    v1 = thresh8 < v1;
+                    v0 = v0 & maxval8;
+                    v1 = v1 & maxval8;
+                    v_store( dst + j, v0 );
+                    v_store( dst + j + 8, v1 );
                }
-                }
-#elif CV_NEON
-                int16x8_t v_thresh = vdupq_n_s16(thresh), v_maxval = vdupq_n_s16(maxval);
-                for( ; j <= roi.width - 8; j += 8 )
-                {
-                    uint16x8_t v_mask = vcgtq_s16(vld1q_s16(src + j), v_thresh);
-                    vst1q_s16(dst + j, vandq_s16(vreinterpretq_s16_u16(v_mask), v_maxval));
-                }
-#endif
                for( ; j < roi.width; j++ )
                    dst[j] = src[j] > thresh ? maxval : 0;
@@ -513,32 +377,18 @@ thresh_16s( const Mat& _src, Mat& _dst, short thresh, short maxval, int type )
            for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
            {
                j = 0;
-#if CV_SSE2
-                if( useSIMD )
-                {
-                    __m128i thresh8 = _mm_set1_epi16(thresh), maxval8 = _mm_set1_epi16(maxval);
                for( ; j <= roi.width - 16; j += 16 )
                {
-                    __m128i v0, v1;
-                    v0 = _mm_loadu_si128( (const __m128i*)(src + j) );
-                    v1 = _mm_loadu_si128( (const __m128i*)(src + j + 8) );
-                    v0 = _mm_cmpgt_epi16( v0, thresh8 );
-                    v1 = _mm_cmpgt_epi16( v1, thresh8 );
-                    v0 = _mm_andnot_si128( v0, maxval8 );
-                    v1 = _mm_andnot_si128( v1, maxval8 );
-                    _mm_storeu_si128((__m128i*)(dst + j), v0 );
-                    _mm_storeu_si128((__m128i*)(dst + j + 8), v1 );
+                    v_int16x8 v0, v1;
+                    v0 = v_load( src + j );
+                    v1 = v_load( src + j + 8 );
+                    v0 = v0 <= thresh8;
+                    v1 = v1 <= thresh8;
+                    v0 = v0 & maxval8;
+                    v1 = v1 & maxval8;
+                    v_store( dst + j, v0 );
+                    v_store( dst + j + 8, v1 );
                }
-                }
-#elif CV_NEON
-                int16x8_t v_thresh = vdupq_n_s16(thresh), v_maxval = vdupq_n_s16(maxval);
-                for( ; j <= roi.width - 8; j += 8 )
-                {
-                    uint16x8_t v_mask = vcleq_s16(vld1q_s16(src + j), v_thresh);
-                    vst1q_s16(dst + j, vandq_s16(vreinterpretq_s16_u16(v_mask), v_maxval));
-                }
-#endif
                for( ; j < roi.width; j++ )
                    dst[j] = src[j] <= thresh ? maxval : 0;
@@ -549,30 +399,19 @@ thresh_16s( const Mat& _src, Mat& _dst, short thresh, short maxval, int type )
            for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
            {
                j = 0;
-#if CV_SSE2
-                if( useSIMD )
-                {
-                    __m128i thresh8 = _mm_set1_epi16(thresh);
                for( ; j <= roi.width - 16; j += 16 )
                {
-                    __m128i v0, v1;
-                    v0 = _mm_loadu_si128( (const __m128i*)(src + j) );
-                    v1 = _mm_loadu_si128( (const __m128i*)(src + j + 8) );
-                    v0 = _mm_min_epi16( v0, thresh8 );
-                    v1 = _mm_min_epi16( v1, thresh8 );
-                    _mm_storeu_si128((__m128i*)(dst + j), v0 );
-                    _mm_storeu_si128((__m128i*)(dst + j + 8), v1 );
+                    v_int16x8 v0, v1;
+                    v0 = v_load( src + j );
+                    v1 = v_load( src + j + 8 );
+                    v0 = v_min( v0, thresh8 );
+                    v1 = v_min( v1, thresh8 );
+                    v_store( dst + j, v0 );
+                    v_store( dst + j + 8, v1 );
                }
-                }
-#elif CV_NEON
-                int16x8_t v_thresh = vdupq_n_s16(thresh);
-                for( ; j <= roi.width - 8; j += 8 )
-                    vst1q_s16(dst + j, vminq_s16(vld1q_s16(src + j), v_thresh));
-#endif
-                for( ; j < roi.width; j++ )
-                    dst[j] = std::min(src[j], thresh);
+                for( ; j < roi.width; j++ )
+                    dst[j] = std::min( src[j], thresh );
            }
            break;
@@ -580,31 +419,16 @@ thresh_16s( const Mat& _src, Mat& _dst, short thresh, short maxval, int type )
            for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
            {
                j = 0;
-#if CV_SSE2
-                if( useSIMD )
-                {
-                    __m128i thresh8 = _mm_set1_epi16(thresh);
                for( ; j <= roi.width - 16; j += 16 )
                {
-                    __m128i v0, v1;
-                    v0 = _mm_loadu_si128( (const __m128i*)(src + j) );
-                    v1 = _mm_loadu_si128( (const __m128i*)(src + j + 8) );
-                    v0 = _mm_and_si128(v0, _mm_cmpgt_epi16(v0, thresh8));
-                    v1 = _mm_and_si128(v1, _mm_cmpgt_epi16(v1, thresh8));
-                    _mm_storeu_si128((__m128i*)(dst + j), v0 );
-                    _mm_storeu_si128((__m128i*)(dst + j + 8), v1 );
+                    v_int16x8 v0, v1;
+                    v0 = v_load( src + j );
+                    v1 = v_load( src + j + 8 );
+                    v0 = ( thresh8 < v0 ) & v0;
+                    v1 = ( thresh8 < v1 ) & v1;
+                    v_store( dst + j, v0 );
+                    v_store( dst + j + 8, v1 );
                }
-                }
-#elif CV_NEON
-                int16x8_t v_thresh = vdupq_n_s16(thresh);
-                for( ; j <= roi.width - 8; j += 8 )
-                {
-                    int16x8_t v_src = vld1q_s16(src + j);
-                    uint16x8_t v_mask = vcgtq_s16(v_src, v_thresh);
-                    vst1q_s16(dst + j, vandq_s16(vreinterpretq_s16_u16(v_mask), v_src));
-                }
-#endif
                for( ; j < roi.width; j++ )
                {
@@ -618,32 +442,72 @@ thresh_16s( const Mat& _src, Mat& _dst, short thresh, short maxval, int type )
            for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
            {
                j = 0;
-#if CV_SSE2
-                if( useSIMD )
-                {
-                    __m128i thresh8 = _mm_set1_epi16(thresh);
                for( ; j <= roi.width - 16; j += 16 )
                {
-                    __m128i v0, v1;
-                    v0 = _mm_loadu_si128( (const __m128i*)(src + j) );
-                    v1 = _mm_loadu_si128( (const __m128i*)(src + j + 8) );
-                    v0 = _mm_andnot_si128(_mm_cmpgt_epi16(v0, thresh8), v0);
-                    v1 = _mm_andnot_si128(_mm_cmpgt_epi16(v1, thresh8), v1);
-                    _mm_storeu_si128((__m128i*)(dst + j), v0 );
-                    _mm_storeu_si128((__m128i*)(dst + j + 8), v1 );
+                    v_int16x8 v0, v1;
+                    v0 = v_load( src + j );
+                    v1 = v_load( src + j + 8 );
+                    v0 = ( v0 <= thresh8 ) & v0;
+                    v1 = ( v1 <= thresh8 ) & v1;
+                    v_store( dst + j, v0 );
+                    v_store( dst + j + 8, v1 );
                }
-                }
-#elif CV_NEON
-                int16x8_t v_thresh = vdupq_n_s16(thresh);
-                for( ; j <= roi.width - 8; j += 8 )
-                {
-                    int16x8_t v_src = vld1q_s16(src + j);
-                    uint16x8_t v_mask = vcleq_s16(v_src, v_thresh);
-                    vst1q_s16(dst + j, vandq_s16(vreinterpretq_s16_u16(v_mask), v_src));
-                }
-#endif
                for( ; j < roi.width; j++ )
                {
                    short v = src[j];
                    dst[j] = v <= thresh ? v : 0;
                }
            }
            break;
+        default:
+            return CV_Error( CV_StsBadArg, "" );
+        }
+    }
+    else
+#endif
+    {
+        switch( type )
+        {
+        case THRESH_BINARY:
+            for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
+            {
+                for( j = 0; j < roi.width; j++ )
+                    dst[j] = src[j] > thresh ? maxval : 0;
+            }
+            break;
+
+        case THRESH_BINARY_INV:
+            for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
+            {
+                for( j = 0; j < roi.width; j++ )
+                    dst[j] = src[j] <= thresh ? maxval : 0;
+            }
+            break;
+
+        case THRESH_TRUNC:
+            for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
+            {
+                for( j = 0; j < roi.width; j++ )
+                    dst[j] = std::min( src[j], thresh );
+            }
+            break;
+
+        case THRESH_TOZERO:
+            for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
+            {
+                for( j = 0; j < roi.width; j++ )
+                {
+                    short v = src[j];
+                    dst[j] = v > thresh ? v : 0;
+                }
+            }
+            break;
+
+        case THRESH_TOZERO_INV:
+            for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
+            {
+                for( j = 0; j < roi.width; j++ )
+                {
+                    short v = src[j];
+                    dst[j] = v <= thresh ? v : 0;
@@ -653,6 +517,7 @@ thresh_16s( const Mat& _src, Mat& _dst, short thresh, short maxval, int type )
        default:
            return CV_Error( CV_StsBadArg, "" );
        }
+    }
 }
@@ -667,10 +532,6 @@ thresh_32f( const Mat& _src, Mat& _dst, float thresh, float maxval, int type )
     size_t src_step = _src.step/sizeof(src[0]);
     size_t dst_step = _dst.step/sizeof(dst[0]);
 
-#if CV_SSE
-    volatile bool useSIMD = checkHardwareSupport(CV_CPU_SSE);
-#endif
-
     if( _src.isContinuous() && _dst.isContinuous() )
     {
         roi.width *= roi.height;
@@ -716,40 +577,31 @@ thresh_32f( const Mat& _src, Mat& _dst, float thresh, float maxval, int type )
     }
 #endif
 
+#if CV_SIMD128
+    bool useSIMD = checkHardwareSupport( CV_CPU_SSE2 ) || checkHardwareSupport( CV_CPU_NEON );
+    if( useSIMD )
+    {
+        v_float32x4 thresh4 = v_setall_f32( thresh );
+        v_float32x4 maxval4 = v_setall_f32( maxval );
+
        switch( type )
        {
        case THRESH_BINARY:
            for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
            {
                j = 0;
-#if CV_SSE
-                if( useSIMD )
-                {
-                    __m128 thresh4 = _mm_set1_ps(thresh), maxval4 = _mm_set1_ps(maxval);
                for( ; j <= roi.width - 8; j += 8 )
                {
-                    __m128 v0, v1;
-                    v0 = _mm_loadu_ps( src + j );
-                    v1 = _mm_loadu_ps( src + j + 4 );
-                    v0 = _mm_cmpgt_ps( v0, thresh4 );
-                    v1 = _mm_cmpgt_ps( v1, thresh4 );
-                    v0 = _mm_and_ps( v0, maxval4 );
-                    v1 = _mm_and_ps( v1, maxval4 );
-                    _mm_storeu_ps( dst + j, v0 );
-                    _mm_storeu_ps( dst + j + 4, v1 );
+                    v_float32x4 v0, v1;
+                    v0 = v_load( src + j );
+                    v1 = v_load( src + j + 4 );
+                    v0 = thresh4 < v0;
+                    v1 = thresh4 < v1;
+                    v0 = v0 & maxval4;
+                    v1 = v1 & maxval4;
+                    v_store( dst + j, v0 );
+                    v_store( dst + j + 4, v1 );
                }
-                }
-#elif CV_NEON
-                float32x4_t v_thresh = vdupq_n_f32(thresh);
-                uint32x4_t v_maxval = vreinterpretq_u32_f32(vdupq_n_f32(maxval));
-                for( ; j <= roi.width - 4; j += 4 )
-                {
-                    float32x4_t v_src = vld1q_f32(src + j);
-                    uint32x4_t v_dst = vandq_u32(vcgtq_f32(v_src, v_thresh), v_maxval);
-                    vst1q_f32(dst + j, vreinterpretq_f32_u32(v_dst));
-                }
-#endif
                for( ; j < roi.width; j++ )
                    dst[j] = src[j] > thresh ? maxval : 0;
@@ -760,34 +612,18 @@ thresh_32f( const Mat& _src, Mat& _dst, float thresh, float maxval, int type )
            for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
            {
                j = 0;
-#if CV_SSE
-                if( useSIMD )
-                {
-                    __m128 thresh4 = _mm_set1_ps(thresh), maxval4 = _mm_set1_ps(maxval);
                for( ; j <= roi.width - 8; j += 8 )
                {
-                    __m128 v0, v1;
-                    v0 = _mm_loadu_ps( src + j );
-                    v1 = _mm_loadu_ps( src + j + 4 );
-                    v0 = _mm_cmple_ps( v0, thresh4 );
-                    v1 = _mm_cmple_ps( v1, thresh4 );
-                    v0 = _mm_and_ps( v0, maxval4 );
-                    v1 = _mm_and_ps( v1, maxval4 );
-                    _mm_storeu_ps( dst + j, v0 );
-                    _mm_storeu_ps( dst + j + 4, v1 );
+                    v_float32x4 v0, v1;
+                    v0 = v_load( src + j );
+                    v1 = v_load( src + j + 4 );
+                    v0 = v0 <= thresh4;
+                    v1 = v1 <= thresh4;
+                    v0 = v0 & maxval4;
+                    v1 = v1 & maxval4;
+                    v_store( dst + j, v0 );
+                    v_store( dst + j + 4, v1 );
                }
-                }
-#elif CV_NEON
-                float32x4_t v_thresh = vdupq_n_f32(thresh);
-                uint32x4_t v_maxval = vreinterpretq_u32_f32(vdupq_n_f32(maxval));
-                for( ; j <= roi.width - 4; j += 4 )
-                {
-                    float32x4_t v_src = vld1q_f32(src + j);
-                    uint32x4_t v_dst = vandq_u32(vcleq_f32(v_src, v_thresh), v_maxval);
-                    vst1q_f32(dst + j, vreinterpretq_f32_u32(v_dst));
-                }
-#endif
                for( ; j < roi.width; j++ )
                    dst[j] = src[j] <= thresh ? maxval : 0;
@@ -798,30 +634,19 @@ thresh_32f( const Mat& _src, Mat& _dst, float thresh, float maxval, int type )
            for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
            {
                j = 0;
-#if CV_SSE
-                if( useSIMD )
-                {
-                    __m128 thresh4 = _mm_set1_ps(thresh);
                for( ; j <= roi.width - 8; j += 8 )
                {
-                    __m128 v0, v1;
-                    v0 = _mm_loadu_ps( src + j );
-                    v1 = _mm_loadu_ps( src + j + 4 );
-                    v0 = _mm_min_ps( v0, thresh4 );
-                    v1 = _mm_min_ps( v1, thresh4 );
-                    _mm_storeu_ps( dst + j, v0 );
-                    _mm_storeu_ps( dst + j + 4, v1 );
+                    v_float32x4 v0, v1;
+                    v0 = v_load( src + j );
+                    v1 = v_load( src + j + 4 );
+                    v0 = v_min( v0, thresh4 );
+                    v1 = v_min( v1, thresh4 );
+                    v_store( dst + j, v0 );
+                    v_store( dst + j + 4, v1 );
                }
-                }
-#elif CV_NEON
-                float32x4_t v_thresh = vdupq_n_f32(thresh);
-                for( ; j <= roi.width - 4; j += 4 )
-                    vst1q_f32(dst + j, vminq_f32(vld1q_f32(src + j), v_thresh));
-#endif
-                for( ; j < roi.width; j++ )
-                    dst[j] = std::min(src[j], thresh);
+                for( ; j < roi.width; j++ )
+                    dst[j] = std::min( src[j], thresh );
            }
            break;
@@ -829,32 +654,16 @@ thresh_32f( const Mat& _src, Mat& _dst, float thresh, float maxval, int type )
            for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
            {
                j = 0;
-#if CV_SSE
-                if( useSIMD )
-                {
-                    __m128 thresh4 = _mm_set1_ps(thresh);
                for( ; j <= roi.width - 8; j += 8 )
                {
-                    __m128 v0, v1;
-                    v0 = _mm_loadu_ps( src + j );
-                    v1 = _mm_loadu_ps( src + j + 4 );
-                    v0 = _mm_and_ps(v0, _mm_cmpgt_ps(v0, thresh4));
-                    v1 = _mm_and_ps(v1, _mm_cmpgt_ps(v1, thresh4));
-                    _mm_storeu_ps( dst + j, v0 );
-                    _mm_storeu_ps( dst + j + 4, v1 );
+                    v_float32x4 v0, v1;
+                    v0 = v_load( src + j );
+                    v1 = v_load( src + j + 4 );
+                    v0 = ( thresh4 < v0 ) & v0;
+                    v1 = ( thresh4 < v1 ) & v1;
+                    v_store( dst + j, v0 );
+                    v_store( dst + j + 4, v1 );
                }
-                }
-#elif CV_NEON
-                float32x4_t v_thresh = vdupq_n_f32(thresh);
-                for( ; j <= roi.width - 4; j += 4 )
-                {
-                    float32x4_t v_src = vld1q_f32(src + j);
-                    uint32x4_t v_dst = vandq_u32(vcgtq_f32(v_src, v_thresh),
-                                                 vreinterpretq_u32_f32(v_src));
-                    vst1q_f32(dst + j, vreinterpretq_f32_u32(v_dst));
-                }
-#endif
                for( ; j < roi.width; j++ )
                {
@@ -868,33 +677,72 @@ thresh_32f( const Mat& _src, Mat& _dst, float thresh, float maxval, int type )
            for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
            {
                j = 0;
-#if CV_SSE
-                if( useSIMD )
-                {
-                    __m128 thresh4 = _mm_set1_ps(thresh);
                for( ; j <= roi.width - 8; j += 8 )
                {
-                    __m128 v0, v1;
-                    v0 = _mm_loadu_ps( src + j );
-                    v1 = _mm_loadu_ps( src + j + 4 );
-                    v0 = _mm_and_ps(v0, _mm_cmple_ps(v0, thresh4));
-                    v1 = _mm_and_ps(v1, _mm_cmple_ps(v1, thresh4));
-                    _mm_storeu_ps( dst + j, v0 );
-                    _mm_storeu_ps( dst + j + 4, v1 );
+                    v_float32x4 v0, v1;
+                    v0 = v_load( src + j );
+                    v1 = v_load( src + j + 4 );
+                    v0 = ( v0 <= thresh4 ) & v0;
+                    v1 = ( v1 <= thresh4 ) & v1;
+                    v_store( dst + j, v0 );
+                    v_store( dst + j + 4, v1 );
                }
-                }
-#elif CV_NEON
-                float32x4_t v_thresh = vdupq_n_f32(thresh);
-                for( ; j <= roi.width - 4; j += 4 )
-                {
-                    float32x4_t v_src = vld1q_f32(src + j);
-                    uint32x4_t v_dst = vandq_u32(vcleq_f32(v_src, v_thresh),
-                                                 vreinterpretq_u32_f32(v_src));
-                    vst1q_f32(dst + j, vreinterpretq_f32_u32(v_dst));
-                }
-#endif
                for( ; j < roi.width; j++ )
                {
                    float v = src[j];
                    dst[j] = v <= thresh ? v : 0;
                }
            }
            break;
+        default:
+            return CV_Error( CV_StsBadArg, "" );
+        }
+    }
+    else
+#endif
+    {
+        switch( type )
+        {
+        case THRESH_BINARY:
+            for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
+            {
+                for( j = 0; j < roi.width; j++ )
+                    dst[j] = src[j] > thresh ? maxval : 0;
+            }
+            break;
+
+        case THRESH_BINARY_INV:
+            for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
+            {
+                for( j = 0; j < roi.width; j++ )
+                    dst[j] = src[j] <= thresh ? maxval : 0;
+            }
+            break;
+
+        case THRESH_TRUNC:
+            for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
+            {
+                for( j = 0; j < roi.width; j++ )
+                    dst[j] = std::min( src[j], thresh );
+            }
+            break;
+
+        case THRESH_TOZERO:
+            for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
+            {
+                for( j = 0; j < roi.width; j++ )
+                {
+                    float v = src[j];
+                    dst[j] = v > thresh ? v : 0;
+                }
+            }
+            break;
+
+        case THRESH_TOZERO_INV:
+            for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
+            {
+                for( j = 0; j < roi.width; j++ )
+                {
+                    float v = src[j];
+                    dst[j] = v <= thresh ? v : 0;
@@ -904,6 +752,7 @@ thresh_32f( const Mat& _src, Mat& _dst, float thresh, float maxval, int type )
        default:
            return CV_Error( CV_StsBadArg, "" );
        }
+    }
 }
 
 static void
@@ -917,251 +766,183 @@ thresh_64f(const Mat& _src, Mat& _dst, double thresh, double maxval, int type)
     size_t src_step = _src.step / sizeof(src[0]);
     size_t dst_step = _dst.step / sizeof(dst[0]);
 
-#if CV_SSE2
-    volatile bool useSIMD = checkHardwareSupport(CV_CPU_SSE2);
-#endif
-
     if (_src.isContinuous() && _dst.isContinuous())
     {
         roi.width *= roi.height;
         roi.height = 1;
     }
 
-    switch (type)
+#if CV_SIMD128_64F
+    bool useSIMD = checkHardwareSupport( CV_CPU_SSE2 ) || checkHardwareSupport( CV_CPU_NEON );
+    if( useSIMD )
+    {
+        v_float64x2 thresh2 = v_setall_f64( thresh );
+        v_float64x2 maxval2 = v_setall_f64( maxval );
+
+        switch( type )
        {
        case THRESH_BINARY:
            for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
            {
                j = 0;
-#if CV_SSE2
-            if( useSIMD )
-            {
-                __m128d thresh2 = _mm_set1_pd(thresh), maxval2 = _mm_set1_pd(maxval);
-                for( ; j <= roi.width - 8; j += 8 )
-                {
-                    __m128d v0, v1, v2, v3;
-                    v0 = _mm_loadu_pd( src + j );
-                    v1 = _mm_loadu_pd( src + j + 2 );
-                    v2 = _mm_loadu_pd( src + j + 4 );
-                    v3 = _mm_loadu_pd( src + j + 6 );
-                    v0 = _mm_cmpgt_pd( v0, thresh2 );
-                    v1 = _mm_cmpgt_pd( v1, thresh2 );
-                    v2 = _mm_cmpgt_pd( v2, thresh2 );
-                    v3 = _mm_cmpgt_pd( v3, thresh2 );
-                    v0 = _mm_and_pd( v0, maxval2 );
-                    v1 = _mm_and_pd( v1, maxval2 );
-                    v2 = _mm_and_pd( v2, maxval2 );
-                    v3 = _mm_and_pd( v3, maxval2 );
-                    _mm_storeu_pd( dst + j, v0 );
-                    _mm_storeu_pd( dst + j + 2, v1 );
-                    _mm_storeu_pd( dst + j + 4, v2 );
-                    _mm_storeu_pd( dst + j + 6, v3 );
-                }
-            }
-#elif CV_NEON && defined(__aarch64__)
-            float64x2_t v_thresh = vdupq_n_f64(thresh);
-            uint64x2_t v_maxval = vreinterpretq_u64_f64(vdupq_n_f64(maxval));
                for( ; j <= roi.width - 4; j += 4 )
                {
-                float64x2_t v_src0 = vld1q_f64(src + j);
-                float64x2_t v_src1 = vld1q_f64(src + j + 2);
-                uint64x2_t v_dst0 = vandq_u64(vcgtq_f64(v_src0, v_thresh), v_maxval);
-                uint64x2_t v_dst1 = vandq_u64(vcgtq_f64(v_src1, v_thresh), v_maxval);
-                vst1q_f64(dst + j, vreinterpretq_f64_u64(v_dst0));
-                vst1q_f64(dst + j + 2, vreinterpretq_f64_u64(v_dst1));
+                    v_float64x2 v0, v1;
+                    v0 = v_load( src + j );
+                    v1 = v_load( src + j + 2 );
+                    v0 = thresh2 < v0;
+                    v1 = thresh2 < v1;
+                    v0 = v0 & maxval2;
+                    v1 = v1 & maxval2;
+                    v_store( dst + j, v0 );
+                    v_store( dst + j + 2, v1 );
                }
-#endif
                for( ; j < roi.width; j++ )
                    dst[j] = src[j] > thresh ? maxval : 0;
            }
            break;
 
        case THRESH_BINARY_INV:
            for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
            {
                j = 0;
-#if CV_SSE2
-            if( useSIMD )
-            {
-                __m128d thresh2 = _mm_set1_pd(thresh), maxval2 = _mm_set1_pd(maxval);
-                for( ; j <= roi.width - 8; j += 8 )
-                {
-                    __m128d v0, v1, v2, v3;
-                    v0 = _mm_loadu_pd( src + j );
-                    v1 = _mm_loadu_pd( src + j + 2 );
-                    v2 = _mm_loadu_pd( src + j + 4 );
-                    v3 = _mm_loadu_pd( src + j + 6 );
-                    v0 = _mm_cmple_pd( v0, thresh2 );
-                    v1 = _mm_cmple_pd( v1, thresh2 );
-                    v2 = _mm_cmple_pd( v2, thresh2 );
-                    v3 = _mm_cmple_pd( v3, thresh2 );
-                    v0 = _mm_and_pd( v0, maxval2 );
-                    v1 = _mm_and_pd( v1, maxval2 );
-                    v2 = _mm_and_pd( v2, maxval2 );
-                    v3 = _mm_and_pd( v3, maxval2 );
-                    _mm_storeu_pd( dst + j, v0 );
-                    _mm_storeu_pd( dst + j + 2, v1 );
-                    _mm_storeu_pd( dst + j + 4, v2 );
-                    _mm_storeu_pd( dst + j + 6, v3 );
-                }
-            }
-#elif CV_NEON && defined(__aarch64__)
-            float64x2_t v_thresh = vdupq_n_f64(thresh);
-            uint64x2_t v_maxval = vreinterpretq_u64_f64(vdupq_n_f64(maxval));
                for( ; j <= roi.width - 4; j += 4 )
                {
-                float64x2_t v_src0 = vld1q_f64(src + j);
-                float64x2_t v_src1 = vld1q_f64(src + j + 2);
-                uint64x2_t v_dst0 = vandq_u64(vcleq_f64(v_src0, v_thresh), v_maxval);
-                uint64x2_t v_dst1 = vandq_u64(vcleq_f64(v_src1, v_thresh), v_maxval);
-                vst1q_f64(dst + j, vreinterpretq_f64_u64(v_dst0));
-                vst1q_f64(dst + j + 2, vreinterpretq_f64_u64(v_dst1));
+                    v_float64x2 v0, v1;
+                    v0 = v_load( src + j );
+                    v1 = v_load( src + j + 2 );
+                    v0 = v0 <= thresh2;
+                    v1 = v1 <= thresh2;
+                    v0 = v0 & maxval2;
+                    v1 = v1 & maxval2;
+                    v_store( dst + j, v0 );
+                    v_store( dst + j + 2, v1 );
                }
-#endif
                for( ; j < roi.width; j++ )
                    dst[j] = src[j] <= thresh ? maxval : 0;
            }
            break;
 
        case THRESH_TRUNC:
            for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
            {
                j = 0;
-#if CV_SSE2
-            if( useSIMD )
-            {
-                __m128d thresh2 = _mm_set1_pd(thresh);
-                for( ; j <= roi.width - 8; j += 8 )
-                {
-                    __m128d v0, v1, v2, v3;
-                    v0 = _mm_loadu_pd( src + j );
-                    v1 = _mm_loadu_pd( src + j + 2 );
-                    v2 = _mm_loadu_pd( src + j + 4 );
-                    v3 = _mm_loadu_pd( src + j + 6 );
-                    v0 = _mm_min_pd( v0, thresh2 );
-                    v1 = _mm_min_pd( v1, thresh2 );
-                    v2 = _mm_min_pd( v2, thresh2 );
-                    v3 = _mm_min_pd( v3, thresh2 );
-                    _mm_storeu_pd( dst + j, v0 );
-                    _mm_storeu_pd( dst + j + 2, v1 );
-                    _mm_storeu_pd( dst + j + 4, v2 );
-                    _mm_storeu_pd( dst + j + 6, v3 );
-                }
-            }
-#elif CV_NEON && defined(__aarch64__)
-            float64x2_t v_thresh = vdupq_n_f64(thresh);
                for( ; j <= roi.width - 4; j += 4 )
                {
-                float64x2_t v_src0 = vld1q_f64(src + j);
-                float64x2_t v_src1 = vld1q_f64(src + j + 2);
-                float64x2_t v_dst0 = vminq_f64(v_src0, v_thresh);
-                float64x2_t v_dst1 = vminq_f64(v_src1, v_thresh);
-                vst1q_f64(dst + j, v_dst0);
-                vst1q_f64(dst + j + 2, v_dst1);
+                    v_float64x2 v0, v1;
+                    v0 = v_load( src + j );
+                    v1 = v_load( src + j + 2 );
+                    v0 = v_min( v0, thresh2 );
+                    v1 = v_min( v1, thresh2 );
+                    v_store( dst + j, v0 );
+                    v_store( dst + j + 2, v1 );
                }
-#endif
-                for (; j < roi.width; j++)
-                    dst[j] = std::min(src[j], thresh);
+                for( ; j < roi.width; j++ )
+                    dst[j] = std::min( src[j], thresh );
            }
            break;
 
        case THRESH_TOZERO:
            for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
            {
                j = 0;
-#if CV_SSE2
-            if( useSIMD )
-            {
-                __m128d thresh2 = _mm_set1_pd(thresh);
-                for( ; j <= roi.width - 8; j += 8 )
-                {
-                    __m128d v0, v1, v2, v3;
-                    v0 = _mm_loadu_pd( src + j );
-                    v1 = _mm_loadu_pd( src + j + 2 );
-                    v2 = _mm_loadu_pd( src + j + 4 );
-                    v3 = _mm_loadu_pd( src + j + 6 );
-                    v0 = _mm_and_pd( v0, _mm_cmpgt_pd(v0, thresh2));
-                    v1 = _mm_and_pd( v1, _mm_cmpgt_pd(v1, thresh2));
-                    v2 = _mm_and_pd( v2, _mm_cmpgt_pd(v2, thresh2));
-                    v3 = _mm_and_pd( v3, _mm_cmpgt_pd(v3, thresh2));
-                    _mm_storeu_pd( dst + j, v0 );
-                    _mm_storeu_pd( dst + j + 2, v1 );
-                    _mm_storeu_pd( dst + j + 4, v2 );
-                    _mm_storeu_pd( dst + j + 6, v3 );
-                }
-            }
-#elif CV_NEON && defined(__aarch64__)
-            float64x2_t v_thresh = vdupq_n_f64(thresh);
                for( ; j <= roi.width - 4; j += 4 )
                {
-                float64x2_t v_src0 = vld1q_f64(src + j);
-                float64x2_t v_src1 = vld1q_f64(src + j + 2);
-                uint64x2_t v_dst0 = vandq_u64(vcgtq_f64(v_src0, v_thresh),
-                                              vreinterpretq_u64_f64(v_src0));
-                uint64x2_t v_dst1 = vandq_u64(vcgtq_f64(v_src1, v_thresh),
-                                              vreinterpretq_u64_f64(v_src1));
-                vst1q_f64(dst + j, vreinterpretq_f64_u64(v_dst0));
-                vst1q_f64(dst + j + 2, vreinterpretq_f64_u64(v_dst1));
+                    v_float64x2 v0, v1;
+                    v0 = v_load( src + j );
+                    v1 = v_load( src + j + 2 );
+                    v0 = ( thresh2 < v0 ) & v0;
+                    v1 = ( thresh2 < v1 ) & v1;
+                    v_store( dst + j, v0 );
+                    v_store( dst + j + 2, v1 );
                }
-#endif
-                for (; j < roi.width; j++)
+                for( ; j < roi.width; j++ )
                {
                    double v = src[j];
                    dst[j] = v > thresh ? v : 0;
                }
            }
            break;
 
        case THRESH_TOZERO_INV:
            for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
            {
                j = 0;
-#if CV_SSE2
-            if( useSIMD )
-            {
-                __m128d thresh2 = _mm_set1_pd(thresh);
-                for( ; j <= roi.width - 8; j += 8 )
-                {
-                    __m128d v0, v1, v2, v3;
-                    v0 = _mm_loadu_pd( src + j );
-                    v1 = _mm_loadu_pd( src + j + 2 );
-                    v2 = _mm_loadu_pd( src + j + 4 );
-                    v3 = _mm_loadu_pd( src + j + 6 );
-                    v0 = _mm_and_pd( v0, _mm_cmple_pd(v0, thresh2));
-                    v1 = _mm_and_pd( v1, _mm_cmple_pd(v1, thresh2));
-                    v2 = _mm_and_pd( v2, _mm_cmple_pd(v2, thresh2));
-                    v3 = _mm_and_pd( v3, _mm_cmple_pd(v3, thresh2));
-                    _mm_storeu_pd( dst + j, v0 );
-                    _mm_storeu_pd( dst + j + 2, v1 );
-                    _mm_storeu_pd( dst + j + 4, v2 );
-                    _mm_storeu_pd( dst + j + 6, v3 );
-                }
-            }
-#elif CV_NEON && defined(__aarch64__)
-            float64x2_t v_thresh = vdupq_n_f64(thresh);
                for( ; j <= roi.width - 4; j += 4 )
                {
-                float64x2_t v_src0 = vld1q_f64(src + j);
-                float64x2_t v_src1 = vld1q_f64(src + j + 2);
-                uint64x2_t v_dst0 = vandq_u64(vcleq_f64(v_src0, v_thresh),
-                                              vreinterpretq_u64_f64(v_src0));
-                uint64x2_t v_dst1 = vandq_u64(vcleq_f64(v_src1, v_thresh),
-                                              vreinterpretq_u64_f64(v_src1));
-                vst1q_f64(dst + j, vreinterpretq_f64_u64(v_dst0));
-                vst1q_f64(dst + j + 2, vreinterpretq_f64_u64(v_dst1));
+                    v_float64x2 v0, v1;
+                    v0 = v_load( src + j );
+                    v1 = v_load( src + j + 2 );
+                    v0 = ( v0 <= thresh2 ) & v0;
+                    v1 = ( v1 <= thresh2 ) & v1;
+                    v_store( dst + j, v0 );
+                    v_store( dst + j + 2, v1 );
                }
-#endif
-                for (; j < roi.width; j++)
+                for( ; j < roi.width; j++ )
                {
                    double v = src[j];
                    dst[j] = v <= thresh ? v : 0;
                }
            }
            break;
+        default:
+            return CV_Error(CV_StsBadArg, "");
+        }
+    }
+    else
+#endif
+    {
+        switch( type )
+        {
+        case THRESH_BINARY:
+            for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
+            {
+                j = 0;
+                for( ; j < roi.width; j++ )
+                    dst[j] = src[j] > thresh ? maxval : 0;
+            }
+            break;
+
+        case THRESH_BINARY_INV:
+            for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
+            {
+                j = 0;
+                for( ; j < roi.width; j++ )
+                    dst[j] = src[j] <= thresh ? maxval : 0;
+            }
+            break;
+
+        case THRESH_TRUNC:
+            for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
+            {
+                j = 0;
+                for( ; j < roi.width; j++ )
+                    dst[j] = std::min( src[j], thresh );
+            }
+            break;
+
+        case THRESH_TOZERO:
+            for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
+            {
+                j = 0;
+                for( ; j < roi.width; j++ )
+                {
+                    double v = src[j];
+                    dst[j] = v > thresh ? v : 0;
+                }
+            }
+            break;
+
+        case THRESH_TOZERO_INV:
+            for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
+            {
+                j = 0;
+                for( ; j < roi.width; j++ )
+                {
+                    double v = src[j];
+                    dst[j] = v <= thresh ? v : 0;
@@ -1171,6 +952,7 @@ thresh_64f(const Mat& _src, Mat& _dst, double thresh, double maxval, int type)
        default:
            return CV_Error(CV_StsBadArg, "");
        }
+    }
 }
 #ifdef HAVE_IPP
...
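One detail worth calling out in the thresh_64f hunk: the double-precision path is guarded by CV_SIMD128_64F rather than CV_SIMD128, because v_float64x2 only exists where the chosen backend has 64-bit floating-point lanes (for example SSE2, or NEON on AArch64, which is also why the old vreinterpretq workaround at the top of the file could be dropped). A minimal sketch of how such a guard is typically used; the function below is illustrative and not part of the commit:

```cpp
#include "opencv2/core/hal/intrin.hpp"

// Scales a double array by k, two lanes at a time where the backend
// offers 64-bit float vectors; otherwise the scalar loop does all the work.
static void scale_64f_row(const double* src, double* dst, int n, double k)
{
    int j = 0;
#if CV_SIMD128_64F
    cv::v_float64x2 vk = cv::v_setall_f64(k);
    for( ; j <= n - 2; j += 2 )
        cv::v_store(dst + j, cv::v_load(src + j) * vk);
#endif
    for( ; j < n; j++ )   // scalar fallback / tail
        dst[j] = src[j] * k;
}
```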