Commit 9d602f27 authored by Vitaly Tuzov

Replaced SSE2 area resize implementation with wide universal intrinsic implementation

parent ceca139f
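The change below swaps hand-written, fixed-width SSE2 (and NEON) kernels for OpenCV's wide universal intrinsics (CV_SIMD), so one source compiles to 128-, 256- or 512-bit SIMD depending on CV_SIMD_WIDTH and no longer needs runtime checkHardwareSupport() guards. The following is a minimal, illustrative sketch of that pattern only, not code taken from the diff; the function name is hypothetical, and it assumes OpenCV 4.x-era universal intrinsics (v_float32::nlanes, vector operator overloads), the same API the diff uses. The actual commit diff follows after the sketch.

// Illustrative sketch of the wide universal intrinsic pattern (hypothetical
// helper, not part of the commit). Vertically interpolates two float rows
// S0/S1 with weights beta[0]/beta[1] into dst, as the VResizeLinearVec_32f
// kernel in the diff does.
#include <opencv2/core/hal/intrin.hpp>

static int vresize_linear_32f_sketch(const float* S0, const float* S1,
                                     float* dst, const float* beta, int width)
{
    int x = 0;
#if CV_SIMD
    // Lane count adapts to the widest SIMD ISA enabled at build time
    // (4 lanes for SSE/NEON, 8 for AVX2, 16 for AVX-512) instead of a
    // hard-coded __m128 width.
    cv::v_float32 b0 = cv::vx_setall_f32(beta[0]);
    cv::v_float32 b1 = cv::vx_setall_f32(beta[1]);
    for (; x <= width - cv::v_float32::nlanes; x += cv::v_float32::nlanes)
        // dst = S0*b0 + S1*b1, computed as a fused multiply-add per vector
        cv::v_store(dst + x, cv::v_muladd(cv::vx_load(S0 + x), b0,
                                          cv::vx_load(S1 + x) * b1));
#endif
    return x;  // the scalar tail (x..width) is left to the caller
}
// End of illustrative sketch; the commit diff starts below.

The real kernels additionally branch on pointer alignment, using vx_load_aligned when the source rows are CV_SIMD_WIDTH-aligned, which is the same aligned/unaligned split the old SSE2 code made with _mm_load_si128 versus _mm_loadu_si128.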
@@ -1181,760 +1181,247 @@ struct HResizeNoVec
const uchar*, int, int, int, int, int) const { return 0; }
};
#if CV_SSE2
#if CV_SIMD
struct VResizeLinearVec_32s8u
{
int operator()(const uchar** _src, uchar* dst, const uchar* _beta, int width ) const
{
if( !checkHardwareSupport(CV_CPU_SSE2) )
return 0;
const int** src = (const int**)_src;
const short* beta = (const short*)_beta;
const int *S0 = src[0], *S1 = src[1];
int x = 0;
__m128i b0 = _mm_set1_epi16(beta[0]), b1 = _mm_set1_epi16(beta[1]);
__m128i delta = _mm_set1_epi16(2);
if( (((size_t)S0|(size_t)S1)&15) == 0 )
for( ; x <= width - 16; x += 16 )
{
__m128i x0, x1, x2, y0, y1, y2;
x0 = _mm_load_si128((const __m128i*)(S0 + x));
x1 = _mm_load_si128((const __m128i*)(S0 + x + 4));
y0 = _mm_load_si128((const __m128i*)(S1 + x));
y1 = _mm_load_si128((const __m128i*)(S1 + x + 4));
x0 = _mm_packs_epi32(_mm_srai_epi32(x0, 4), _mm_srai_epi32(x1, 4));
y0 = _mm_packs_epi32(_mm_srai_epi32(y0, 4), _mm_srai_epi32(y1, 4));
x1 = _mm_load_si128((const __m128i*)(S0 + x + 8));
x2 = _mm_load_si128((const __m128i*)(S0 + x + 12));
y1 = _mm_load_si128((const __m128i*)(S1 + x + 8));
y2 = _mm_load_si128((const __m128i*)(S1 + x + 12));
x1 = _mm_packs_epi32(_mm_srai_epi32(x1, 4), _mm_srai_epi32(x2, 4));
y1 = _mm_packs_epi32(_mm_srai_epi32(y1, 4), _mm_srai_epi32(y2, 4));
x0 = _mm_adds_epi16(_mm_mulhi_epi16( x0, b0 ), _mm_mulhi_epi16( y0, b1 ));
x1 = _mm_adds_epi16(_mm_mulhi_epi16( x1, b0 ), _mm_mulhi_epi16( y1, b1 ));
x0 = _mm_srai_epi16(_mm_adds_epi16(x0, delta), 2);
x1 = _mm_srai_epi16(_mm_adds_epi16(x1, delta), 2);
_mm_storeu_si128( (__m128i*)(dst + x), _mm_packus_epi16(x0, x1));
}
v_int16 b0 = vx_setall_s16(beta[0]), b1 = vx_setall_s16(beta[1]);
if( (((size_t)S0|(size_t)S1)&(CV_SIMD_WIDTH - 1)) == 0 )
for( ; x <= width - v_uint8::nlanes; x += v_uint8::nlanes)
v_store(dst + x, v_rshr_pack_u<2>(v_mul_hi(v_pack(vx_load_aligned(S0 + x ) >> 4, vx_load_aligned(S0 + x + v_int32::nlanes) >> 4), b0) +
v_mul_hi(v_pack(vx_load_aligned(S1 + x ) >> 4, vx_load_aligned(S1 + x + v_int32::nlanes) >> 4), b1),
v_mul_hi(v_pack(vx_load_aligned(S0 + x + 2 * v_int32::nlanes) >> 4, vx_load_aligned(S0 + x + 3 * v_int32::nlanes) >> 4), b0) +
v_mul_hi(v_pack(vx_load_aligned(S1 + x + 2 * v_int32::nlanes) >> 4, vx_load_aligned(S1 + x + 3 * v_int32::nlanes) >> 4), b1)));
else
for( ; x <= width - 16; x += 16 )
{
__m128i x0, x1, x2, y0, y1, y2;
x0 = _mm_loadu_si128((const __m128i*)(S0 + x));
x1 = _mm_loadu_si128((const __m128i*)(S0 + x + 4));
y0 = _mm_loadu_si128((const __m128i*)(S1 + x));
y1 = _mm_loadu_si128((const __m128i*)(S1 + x + 4));
x0 = _mm_packs_epi32(_mm_srai_epi32(x0, 4), _mm_srai_epi32(x1, 4));
y0 = _mm_packs_epi32(_mm_srai_epi32(y0, 4), _mm_srai_epi32(y1, 4));
x1 = _mm_loadu_si128((const __m128i*)(S0 + x + 8));
x2 = _mm_loadu_si128((const __m128i*)(S0 + x + 12));
y1 = _mm_loadu_si128((const __m128i*)(S1 + x + 8));
y2 = _mm_loadu_si128((const __m128i*)(S1 + x + 12));
x1 = _mm_packs_epi32(_mm_srai_epi32(x1, 4), _mm_srai_epi32(x2, 4));
y1 = _mm_packs_epi32(_mm_srai_epi32(y1, 4), _mm_srai_epi32(y2, 4));
x0 = _mm_adds_epi16(_mm_mulhi_epi16( x0, b0 ), _mm_mulhi_epi16( y0, b1 ));
x1 = _mm_adds_epi16(_mm_mulhi_epi16( x1, b0 ), _mm_mulhi_epi16( y1, b1 ));
x0 = _mm_srai_epi16(_mm_adds_epi16(x0, delta), 2);
x1 = _mm_srai_epi16(_mm_adds_epi16(x1, delta), 2);
_mm_storeu_si128( (__m128i*)(dst + x), _mm_packus_epi16(x0, x1));
}
for( ; x <= width - v_uint8::nlanes; x += v_uint8::nlanes)
v_store(dst + x, v_rshr_pack_u<2>(v_mul_hi(v_pack(vx_load(S0 + x ) >> 4, vx_load(S0 + x + v_int32::nlanes) >> 4), b0) +
v_mul_hi(v_pack(vx_load(S1 + x ) >> 4, vx_load(S1 + x + v_int32::nlanes) >> 4), b1),
v_mul_hi(v_pack(vx_load(S0 + x + 2 * v_int32::nlanes) >> 4, vx_load(S0 + x + 3 * v_int32::nlanes) >> 4), b0) +
v_mul_hi(v_pack(vx_load(S1 + x + 2 * v_int32::nlanes) >> 4, vx_load(S1 + x + 3 * v_int32::nlanes) >> 4), b1)));
for( ; x < width - 4; x += 4 )
{
__m128i x0, y0;
x0 = _mm_srai_epi32(_mm_loadu_si128((const __m128i*)(S0 + x)), 4);
y0 = _mm_srai_epi32(_mm_loadu_si128((const __m128i*)(S1 + x)), 4);
x0 = _mm_packs_epi32(x0, x0);
y0 = _mm_packs_epi32(y0, y0);
x0 = _mm_adds_epi16(_mm_mulhi_epi16(x0, b0), _mm_mulhi_epi16(y0, b1));
x0 = _mm_srai_epi16(_mm_adds_epi16(x0, delta), 2);
x0 = _mm_packus_epi16(x0, x0);
*(int*)(dst + x) = _mm_cvtsi128_si32(x0);
}
for( ; x < width - v_int16::nlanes; x += v_int16::nlanes)
v_rshr_pack_u_store<2>(dst + x, v_mul_hi(v_pack(vx_load(S0 + x) >> 4, vx_load(S0 + x + v_int32::nlanes) >> 4), b0) +
v_mul_hi(v_pack(vx_load(S1 + x) >> 4, vx_load(S1 + x + v_int32::nlanes) >> 4), b1));
return x;
}
};
template<int shiftval> struct VResizeLinearVec_32f16
struct VResizeLinearVec_32f16u
{
int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const
{
if( !checkHardwareSupport(CV_CPU_SSE2) )
return 0;
const float** src = (const float**)_src;
const float* beta = (const float*)_beta;
const float *S0 = src[0], *S1 = src[1];
ushort* dst = (ushort*)_dst;
int x = 0;
__m128 b0 = _mm_set1_ps(beta[0]), b1 = _mm_set1_ps(beta[1]);
__m128i preshift = _mm_set1_epi32(shiftval);
__m128i postshift = _mm_set1_epi16((short)shiftval);
v_float32 b0 = vx_setall_f32(beta[0]), b1 = vx_setall_f32(beta[1]);
if( (((size_t)S0|(size_t)S1)&15) == 0 )
for( ; x <= width - 16; x += 16 )
{
__m128 x0, x1, y0, y1;
__m128i t0, t1, t2;
x0 = _mm_load_ps(S0 + x);
x1 = _mm_load_ps(S0 + x + 4);
y0 = _mm_load_ps(S1 + x);
y1 = _mm_load_ps(S1 + x + 4);
x0 = _mm_add_ps(_mm_mul_ps(x0, b0), _mm_mul_ps(y0, b1));
x1 = _mm_add_ps(_mm_mul_ps(x1, b0), _mm_mul_ps(y1, b1));
t0 = _mm_add_epi32(_mm_cvtps_epi32(x0), preshift);
t2 = _mm_add_epi32(_mm_cvtps_epi32(x1), preshift);
t0 = _mm_add_epi16(_mm_packs_epi32(t0, t2), postshift);
x0 = _mm_load_ps(S0 + x + 8);
x1 = _mm_load_ps(S0 + x + 12);
y0 = _mm_load_ps(S1 + x + 8);
y1 = _mm_load_ps(S1 + x + 12);
x0 = _mm_add_ps(_mm_mul_ps(x0, b0), _mm_mul_ps(y0, b1));
x1 = _mm_add_ps(_mm_mul_ps(x1, b0), _mm_mul_ps(y1, b1));
t1 = _mm_add_epi32(_mm_cvtps_epi32(x0), preshift);
t2 = _mm_add_epi32(_mm_cvtps_epi32(x1), preshift);
t1 = _mm_add_epi16(_mm_packs_epi32(t1, t2), postshift);
_mm_storeu_si128( (__m128i*)(dst + x), t0);
_mm_storeu_si128( (__m128i*)(dst + x + 8), t1);
}
if( (((size_t)S0|(size_t)S1)&(CV_SIMD_WIDTH - 1)) == 0 )
for( ; x <= width - v_uint16::nlanes; x += v_uint16::nlanes)
v_store(dst + x, v_pack_u(v_round(v_muladd(vx_load_aligned(S0 + x ), b0, vx_load_aligned(S1 + x ) * b1)),
v_round(v_muladd(vx_load_aligned(S0 + x + v_float32::nlanes), b0, vx_load_aligned(S1 + x + v_float32::nlanes) * b1))));
else
for( ; x <= width - 16; x += 16 )
{
__m128 x0, x1, y0, y1;
__m128i t0, t1, t2;
x0 = _mm_loadu_ps(S0 + x);
x1 = _mm_loadu_ps(S0 + x + 4);
y0 = _mm_loadu_ps(S1 + x);
y1 = _mm_loadu_ps(S1 + x + 4);
x0 = _mm_add_ps(_mm_mul_ps(x0, b0), _mm_mul_ps(y0, b1));
x1 = _mm_add_ps(_mm_mul_ps(x1, b0), _mm_mul_ps(y1, b1));
t0 = _mm_add_epi32(_mm_cvtps_epi32(x0), preshift);
t2 = _mm_add_epi32(_mm_cvtps_epi32(x1), preshift);
t0 = _mm_add_epi16(_mm_packs_epi32(t0, t2), postshift);
x0 = _mm_loadu_ps(S0 + x + 8);
x1 = _mm_loadu_ps(S0 + x + 12);
y0 = _mm_loadu_ps(S1 + x + 8);
y1 = _mm_loadu_ps(S1 + x + 12);
x0 = _mm_add_ps(_mm_mul_ps(x0, b0), _mm_mul_ps(y0, b1));
x1 = _mm_add_ps(_mm_mul_ps(x1, b0), _mm_mul_ps(y1, b1));
t1 = _mm_add_epi32(_mm_cvtps_epi32(x0), preshift);
t2 = _mm_add_epi32(_mm_cvtps_epi32(x1), preshift);
t1 = _mm_add_epi16(_mm_packs_epi32(t1, t2), postshift);
_mm_storeu_si128( (__m128i*)(dst + x), t0);
_mm_storeu_si128( (__m128i*)(dst + x + 8), t1);
}
for( ; x < width - 4; x += 4 )
for (; x <= width - v_uint16::nlanes; x += v_uint16::nlanes)
v_store(dst + x, v_pack_u(v_round(v_muladd(vx_load(S0 + x ), b0, vx_load(S1 + x ) * b1)),
v_round(v_muladd(vx_load(S0 + x + v_float32::nlanes), b0, vx_load(S1 + x + v_float32::nlanes) * b1))));
for( ; x < width - v_float32::nlanes; x += v_float32::nlanes)
{
__m128 x0, y0;
__m128i t0;
x0 = _mm_loadu_ps(S0 + x);
y0 = _mm_loadu_ps(S1 + x);
x0 = _mm_add_ps(_mm_mul_ps(x0, b0), _mm_mul_ps(y0, b1));
t0 = _mm_add_epi32(_mm_cvtps_epi32(x0), preshift);
t0 = _mm_add_epi16(_mm_packs_epi32(t0, t0), postshift);
_mm_storel_epi64( (__m128i*)(dst + x), t0);
v_int32 t0 = v_round(v_muladd(vx_load(S0 + x), b0, vx_load(S1 + x) * b1));
v_store_low(dst + x, v_pack_u(t0, t0));
}
return x;
}
};
typedef VResizeLinearVec_32f16<SHRT_MIN> VResizeLinearVec_32f16u;
typedef VResizeLinearVec_32f16<0> VResizeLinearVec_32f16s;
struct VResizeLinearVec_32f
struct VResizeLinearVec_32f16s
{
int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const
{
if( !checkHardwareSupport(CV_CPU_SSE) )
return 0;
const float** src = (const float**)_src;
const float* beta = (const float*)_beta;
const float *S0 = src[0], *S1 = src[1];
float* dst = (float*)_dst;
short* dst = (short*)_dst;
int x = 0;
__m128 b0 = _mm_set1_ps(beta[0]), b1 = _mm_set1_ps(beta[1]);
if( (((size_t)S0|(size_t)S1)&15) == 0 )
for( ; x <= width - 8; x += 8 )
{
__m128 x0, x1, y0, y1;
x0 = _mm_load_ps(S0 + x);
x1 = _mm_load_ps(S0 + x + 4);
y0 = _mm_load_ps(S1 + x);
y1 = _mm_load_ps(S1 + x + 4);
x0 = _mm_add_ps(_mm_mul_ps(x0, b0), _mm_mul_ps(y0, b1));
x1 = _mm_add_ps(_mm_mul_ps(x1, b0), _mm_mul_ps(y1, b1));
_mm_storeu_ps( dst + x, x0);
_mm_storeu_ps( dst + x + 4, x1);
}
else
for( ; x <= width - 8; x += 8 )
{
__m128 x0, x1, y0, y1;
x0 = _mm_loadu_ps(S0 + x);
x1 = _mm_loadu_ps(S0 + x + 4);
y0 = _mm_loadu_ps(S1 + x);
y1 = _mm_loadu_ps(S1 + x + 4);
x0 = _mm_add_ps(_mm_mul_ps(x0, b0), _mm_mul_ps(y0, b1));
x1 = _mm_add_ps(_mm_mul_ps(x1, b0), _mm_mul_ps(y1, b1));
_mm_storeu_ps( dst + x, x0);
_mm_storeu_ps( dst + x + 4, x1);
}
return x;
}
};
struct VResizeCubicVec_32s8u
{
int operator()(const uchar** _src, uchar* dst, const uchar* _beta, int width ) const
{
if( !checkHardwareSupport(CV_CPU_SSE2) )
return 0;
const int** src = (const int**)_src;
const short* beta = (const short*)_beta;
const int *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3];
int x = 0;
float scale = 1.f/(INTER_RESIZE_COEF_SCALE*INTER_RESIZE_COEF_SCALE);
__m128 b0 = _mm_set1_ps(beta[0]*scale), b1 = _mm_set1_ps(beta[1]*scale),
b2 = _mm_set1_ps(beta[2]*scale), b3 = _mm_set1_ps(beta[3]*scale);
v_float32 b0 = vx_setall_f32(beta[0]), b1 = vx_setall_f32(beta[1]);
if( (((size_t)S0|(size_t)S1|(size_t)S2|(size_t)S3)&15) == 0 )
for( ; x <= width - 8; x += 8 )
{
__m128i x0, x1, y0, y1;
__m128 s0, s1, f0, f1;
x0 = _mm_load_si128((const __m128i*)(S0 + x));
x1 = _mm_load_si128((const __m128i*)(S0 + x + 4));
y0 = _mm_load_si128((const __m128i*)(S1 + x));
y1 = _mm_load_si128((const __m128i*)(S1 + x + 4));
s0 = _mm_mul_ps(_mm_cvtepi32_ps(x0), b0);
s1 = _mm_mul_ps(_mm_cvtepi32_ps(x1), b0);
f0 = _mm_mul_ps(_mm_cvtepi32_ps(y0), b1);
f1 = _mm_mul_ps(_mm_cvtepi32_ps(y1), b1);
s0 = _mm_add_ps(s0, f0);
s1 = _mm_add_ps(s1, f1);
x0 = _mm_load_si128((const __m128i*)(S2 + x));
x1 = _mm_load_si128((const __m128i*)(S2 + x + 4));
y0 = _mm_load_si128((const __m128i*)(S3 + x));
y1 = _mm_load_si128((const __m128i*)(S3 + x + 4));
f0 = _mm_mul_ps(_mm_cvtepi32_ps(x0), b2);
f1 = _mm_mul_ps(_mm_cvtepi32_ps(x1), b2);
s0 = _mm_add_ps(s0, f0);
s1 = _mm_add_ps(s1, f1);
f0 = _mm_mul_ps(_mm_cvtepi32_ps(y0), b3);
f1 = _mm_mul_ps(_mm_cvtepi32_ps(y1), b3);
s0 = _mm_add_ps(s0, f0);
s1 = _mm_add_ps(s1, f1);
x0 = _mm_cvtps_epi32(s0);
x1 = _mm_cvtps_epi32(s1);
x0 = _mm_packs_epi32(x0, x1);
_mm_storel_epi64( (__m128i*)(dst + x), _mm_packus_epi16(x0, x0));
}
if( (((size_t)S0|(size_t)S1)&(CV_SIMD_WIDTH - 1)) == 0 )
for( ; x <= width - v_int16::nlanes; x += v_int16::nlanes)
v_store(dst + x, v_pack(v_round(v_muladd(vx_load_aligned(S0 + x ), b0, vx_load_aligned(S1 + x ) * b1)),
v_round(v_muladd(vx_load_aligned(S0 + x + v_float32::nlanes), b0, vx_load_aligned(S1 + x + v_float32::nlanes) * b1))));
else
for( ; x <= width - 8; x += 8 )
{
__m128i x0, x1, y0, y1;
__m128 s0, s1, f0, f1;
x0 = _mm_loadu_si128((const __m128i*)(S0 + x));
x1 = _mm_loadu_si128((const __m128i*)(S0 + x + 4));
y0 = _mm_loadu_si128((const __m128i*)(S1 + x));
y1 = _mm_loadu_si128((const __m128i*)(S1 + x + 4));
s0 = _mm_mul_ps(_mm_cvtepi32_ps(x0), b0);
s1 = _mm_mul_ps(_mm_cvtepi32_ps(x1), b0);
f0 = _mm_mul_ps(_mm_cvtepi32_ps(y0), b1);
f1 = _mm_mul_ps(_mm_cvtepi32_ps(y1), b1);
s0 = _mm_add_ps(s0, f0);
s1 = _mm_add_ps(s1, f1);
x0 = _mm_loadu_si128((const __m128i*)(S2 + x));
x1 = _mm_loadu_si128((const __m128i*)(S2 + x + 4));
y0 = _mm_loadu_si128((const __m128i*)(S3 + x));
y1 = _mm_loadu_si128((const __m128i*)(S3 + x + 4));
f0 = _mm_mul_ps(_mm_cvtepi32_ps(x0), b2);
f1 = _mm_mul_ps(_mm_cvtepi32_ps(x1), b2);
s0 = _mm_add_ps(s0, f0);
s1 = _mm_add_ps(s1, f1);
f0 = _mm_mul_ps(_mm_cvtepi32_ps(y0), b3);
f1 = _mm_mul_ps(_mm_cvtepi32_ps(y1), b3);
s0 = _mm_add_ps(s0, f0);
s1 = _mm_add_ps(s1, f1);
x0 = _mm_cvtps_epi32(s0);
x1 = _mm_cvtps_epi32(s1);
x0 = _mm_packs_epi32(x0, x1);
_mm_storel_epi64( (__m128i*)(dst + x), _mm_packus_epi16(x0, x0));
}
return x;
}
};
template<int shiftval> struct VResizeCubicVec_32f16
{
int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const
{
if( !checkHardwareSupport(CV_CPU_SSE2) )
return 0;
const float** src = (const float**)_src;
const float* beta = (const float*)_beta;
const float *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3];
ushort* dst = (ushort*)_dst;
int x = 0;
__m128 b0 = _mm_set1_ps(beta[0]), b1 = _mm_set1_ps(beta[1]),
b2 = _mm_set1_ps(beta[2]), b3 = _mm_set1_ps(beta[3]);
__m128i preshift = _mm_set1_epi32(shiftval);
__m128i postshift = _mm_set1_epi16((short)shiftval);
for( ; x <= width - 8; x += 8 )
for (; x <= width - v_int16::nlanes; x += v_int16::nlanes)
v_store(dst + x, v_pack(v_round(v_muladd(vx_load(S0 + x ), b0, vx_load(S1 + x ) * b1)),
v_round(v_muladd(vx_load(S0 + x + v_float32::nlanes), b0, vx_load(S1 + x + v_float32::nlanes) * b1))));
for( ; x < width - v_float32::nlanes; x += v_float32::nlanes)
{
__m128 x0, x1, y0, y1, s0, s1;
__m128i t0, t1;
x0 = _mm_loadu_ps(S0 + x);
x1 = _mm_loadu_ps(S0 + x + 4);
y0 = _mm_loadu_ps(S1 + x);
y1 = _mm_loadu_ps(S1 + x + 4);
s0 = _mm_mul_ps(x0, b0);
s1 = _mm_mul_ps(x1, b0);
y0 = _mm_mul_ps(y0, b1);
y1 = _mm_mul_ps(y1, b1);
s0 = _mm_add_ps(s0, y0);
s1 = _mm_add_ps(s1, y1);
x0 = _mm_loadu_ps(S2 + x);
x1 = _mm_loadu_ps(S2 + x + 4);
y0 = _mm_loadu_ps(S3 + x);
y1 = _mm_loadu_ps(S3 + x + 4);
x0 = _mm_mul_ps(x0, b2);
x1 = _mm_mul_ps(x1, b2);
y0 = _mm_mul_ps(y0, b3);
y1 = _mm_mul_ps(y1, b3);
s0 = _mm_add_ps(s0, x0);
s1 = _mm_add_ps(s1, x1);
s0 = _mm_add_ps(s0, y0);
s1 = _mm_add_ps(s1, y1);
t0 = _mm_add_epi32(_mm_cvtps_epi32(s0), preshift);
t1 = _mm_add_epi32(_mm_cvtps_epi32(s1), preshift);
t0 = _mm_add_epi16(_mm_packs_epi32(t0, t1), postshift);
_mm_storeu_si128( (__m128i*)(dst + x), t0);
v_int32 t0 = v_round(v_muladd(vx_load(S0 + x), b0, vx_load(S1 + x) * b1));
v_store_low(dst + x, v_pack(t0, t0));
}
return x;
}
};
typedef VResizeCubicVec_32f16<SHRT_MIN> VResizeCubicVec_32f16u;
typedef VResizeCubicVec_32f16<0> VResizeCubicVec_32f16s;
struct VResizeCubicVec_32f
{
int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const
{
if( !checkHardwareSupport(CV_CPU_SSE) )
return 0;
const float** src = (const float**)_src;
const float* beta = (const float*)_beta;
const float *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3];
float* dst = (float*)_dst;
int x = 0;
__m128 b0 = _mm_set1_ps(beta[0]), b1 = _mm_set1_ps(beta[1]),
b2 = _mm_set1_ps(beta[2]), b3 = _mm_set1_ps(beta[3]);
for( ; x <= width - 8; x += 8 )
{
__m128 x0, x1, y0, y1, s0, s1;
x0 = _mm_loadu_ps(S0 + x);
x1 = _mm_loadu_ps(S0 + x + 4);
y0 = _mm_loadu_ps(S1 + x);
y1 = _mm_loadu_ps(S1 + x + 4);
s0 = _mm_mul_ps(x0, b0);
s1 = _mm_mul_ps(x1, b0);
y0 = _mm_mul_ps(y0, b1);
y1 = _mm_mul_ps(y1, b1);
s0 = _mm_add_ps(s0, y0);
s1 = _mm_add_ps(s1, y1);
x0 = _mm_loadu_ps(S2 + x);
x1 = _mm_loadu_ps(S2 + x + 4);
y0 = _mm_loadu_ps(S3 + x);
y1 = _mm_loadu_ps(S3 + x + 4);
x0 = _mm_mul_ps(x0, b2);
x1 = _mm_mul_ps(x1, b2);
y0 = _mm_mul_ps(y0, b3);
y1 = _mm_mul_ps(y1, b3);
s0 = _mm_add_ps(s0, x0);
s1 = _mm_add_ps(s1, x1);
s0 = _mm_add_ps(s0, y0);
s1 = _mm_add_ps(s1, y1);
_mm_storeu_ps( dst + x, s0);
_mm_storeu_ps( dst + x + 4, s1);
}
return x;
}
};
#if CV_TRY_SSE4_1
struct VResizeLanczos4Vec_32f16u
{
int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const
{
if (CV_CPU_HAS_SUPPORT_SSE4_1) return opt_SSE4_1::VResizeLanczos4Vec_32f16u_SSE41(_src, _dst, _beta, width);
else return 0;
}
};
#else
typedef VResizeNoVec VResizeLanczos4Vec_32f16u;
#endif
struct VResizeLanczos4Vec_32f16s
{
int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const
{
const float** src = (const float**)_src;
const float* beta = (const float*)_beta;
const float *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3],
*S4 = src[4], *S5 = src[5], *S6 = src[6], *S7 = src[7];
short * dst = (short*)_dst;
int x = 0;
__m128 v_b0 = _mm_set1_ps(beta[0]), v_b1 = _mm_set1_ps(beta[1]),
v_b2 = _mm_set1_ps(beta[2]), v_b3 = _mm_set1_ps(beta[3]),
v_b4 = _mm_set1_ps(beta[4]), v_b5 = _mm_set1_ps(beta[5]),
v_b6 = _mm_set1_ps(beta[6]), v_b7 = _mm_set1_ps(beta[7]);
for( ; x <= width - 8; x += 8 )
{
__m128 v_dst0 = _mm_mul_ps(v_b0, _mm_loadu_ps(S0 + x));
v_dst0 = _mm_add_ps(v_dst0, _mm_mul_ps(v_b1, _mm_loadu_ps(S1 + x)));
v_dst0 = _mm_add_ps(v_dst0, _mm_mul_ps(v_b2, _mm_loadu_ps(S2 + x)));
v_dst0 = _mm_add_ps(v_dst0, _mm_mul_ps(v_b3, _mm_loadu_ps(S3 + x)));
v_dst0 = _mm_add_ps(v_dst0, _mm_mul_ps(v_b4, _mm_loadu_ps(S4 + x)));
v_dst0 = _mm_add_ps(v_dst0, _mm_mul_ps(v_b5, _mm_loadu_ps(S5 + x)));
v_dst0 = _mm_add_ps(v_dst0, _mm_mul_ps(v_b6, _mm_loadu_ps(S6 + x)));
v_dst0 = _mm_add_ps(v_dst0, _mm_mul_ps(v_b7, _mm_loadu_ps(S7 + x)));
__m128 v_dst1 = _mm_mul_ps(v_b0, _mm_loadu_ps(S0 + x + 4));
v_dst1 = _mm_add_ps(v_dst1, _mm_mul_ps(v_b1, _mm_loadu_ps(S1 + x + 4)));
v_dst1 = _mm_add_ps(v_dst1, _mm_mul_ps(v_b2, _mm_loadu_ps(S2 + x + 4)));
v_dst1 = _mm_add_ps(v_dst1, _mm_mul_ps(v_b3, _mm_loadu_ps(S3 + x + 4)));
v_dst1 = _mm_add_ps(v_dst1, _mm_mul_ps(v_b4, _mm_loadu_ps(S4 + x + 4)));
v_dst1 = _mm_add_ps(v_dst1, _mm_mul_ps(v_b5, _mm_loadu_ps(S5 + x + 4)));
v_dst1 = _mm_add_ps(v_dst1, _mm_mul_ps(v_b6, _mm_loadu_ps(S6 + x + 4)));
v_dst1 = _mm_add_ps(v_dst1, _mm_mul_ps(v_b7, _mm_loadu_ps(S7 + x + 4)));
__m128i v_dsti0 = _mm_cvtps_epi32(v_dst0);
__m128i v_dsti1 = _mm_cvtps_epi32(v_dst1);
_mm_storeu_si128((__m128i *)(dst + x), _mm_packs_epi32(v_dsti0, v_dsti1));
}
return x;
}
};
struct VResizeLanczos4Vec_32f
struct VResizeLinearVec_32f
{
int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const
{
const float** src = (const float**)_src;
const float* beta = (const float*)_beta;
const float *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3],
*S4 = src[4], *S5 = src[5], *S6 = src[6], *S7 = src[7];
const float *S0 = src[0], *S1 = src[1];
float* dst = (float*)_dst;
int x = 0;
__m128 v_b0 = _mm_set1_ps(beta[0]), v_b1 = _mm_set1_ps(beta[1]),
v_b2 = _mm_set1_ps(beta[2]), v_b3 = _mm_set1_ps(beta[3]),
v_b4 = _mm_set1_ps(beta[4]), v_b5 = _mm_set1_ps(beta[5]),
v_b6 = _mm_set1_ps(beta[6]), v_b7 = _mm_set1_ps(beta[7]);
v_float32 b0 = vx_setall_f32(beta[0]), b1 = vx_setall_f32(beta[1]);
for( ; x <= width - 4; x += 4 )
{
__m128 v_dst = _mm_mul_ps(v_b0, _mm_loadu_ps(S0 + x));
v_dst = _mm_add_ps(v_dst, _mm_mul_ps(v_b1, _mm_loadu_ps(S1 + x)));
v_dst = _mm_add_ps(v_dst, _mm_mul_ps(v_b2, _mm_loadu_ps(S2 + x)));
v_dst = _mm_add_ps(v_dst, _mm_mul_ps(v_b3, _mm_loadu_ps(S3 + x)));
v_dst = _mm_add_ps(v_dst, _mm_mul_ps(v_b4, _mm_loadu_ps(S4 + x)));
v_dst = _mm_add_ps(v_dst, _mm_mul_ps(v_b5, _mm_loadu_ps(S5 + x)));
v_dst = _mm_add_ps(v_dst, _mm_mul_ps(v_b6, _mm_loadu_ps(S6 + x)));
v_dst = _mm_add_ps(v_dst, _mm_mul_ps(v_b7, _mm_loadu_ps(S7 + x)));
_mm_storeu_ps(dst + x, v_dst);
}
if( (((size_t)S0|(size_t)S1)&(CV_SIMD_WIDTH - 1)) == 0 )
for( ; x <= width - v_float32::nlanes; x += v_float32::nlanes)
v_store(dst + x, v_muladd(vx_load_aligned(S0 + x), b0, vx_load_aligned(S1 + x) * b1));
else
for( ; x <= width - v_float32::nlanes; x += v_float32::nlanes)
v_store(dst + x, v_muladd(vx_load(S0 + x), b0, vx_load(S1 + x) * b1));
return x;
}
};
#elif CV_NEON
struct VResizeLinearVec_32s8u
struct VResizeCubicVec_32s8u
{
int operator()(const uchar** _src, uchar* dst, const uchar* _beta, int width ) const
{
const int** src = (const int**)_src, *S0 = src[0], *S1 = src[1];
const int** src = (const int**)_src;
const short* beta = (const short*)_beta;
const int *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3];
int x = 0;
int16x8_t v_b0 = vdupq_n_s16(beta[0]), v_b1 = vdupq_n_s16(beta[1]), v_delta = vdupq_n_s16(2);
for( ; x <= width - 16; x += 16)
{
int32x4_t v_src00 = vshrq_n_s32(vld1q_s32(S0 + x), 4), v_src10 = vshrq_n_s32(vld1q_s32(S1 + x), 4);
int32x4_t v_src01 = vshrq_n_s32(vld1q_s32(S0 + x + 4), 4), v_src11 = vshrq_n_s32(vld1q_s32(S1 + x + 4), 4);
int16x8_t v_src0 = vcombine_s16(vmovn_s32(v_src00), vmovn_s32(v_src01));
int16x8_t v_src1 = vcombine_s16(vmovn_s32(v_src10), vmovn_s32(v_src11));
int16x8_t v_dst0 = vaddq_s16(vshrq_n_s16(vqdmulhq_s16(v_src0, v_b0), 1),
vshrq_n_s16(vqdmulhq_s16(v_src1, v_b1), 1));
v_dst0 = vshrq_n_s16(vaddq_s16(v_dst0, v_delta), 2);
v_src00 = vshrq_n_s32(vld1q_s32(S0 + x + 8), 4);
v_src10 = vshrq_n_s32(vld1q_s32(S1 + x + 8), 4);
v_src01 = vshrq_n_s32(vld1q_s32(S0 + x + 12), 4);
v_src11 = vshrq_n_s32(vld1q_s32(S1 + x + 12), 4);
v_src0 = vcombine_s16(vmovn_s32(v_src00), vmovn_s32(v_src01));
v_src1 = vcombine_s16(vmovn_s32(v_src10), vmovn_s32(v_src11));
int16x8_t v_dst1 = vaddq_s16(vshrq_n_s16(vqdmulhq_s16(v_src0, v_b0), 1),
vshrq_n_s16(vqdmulhq_s16(v_src1, v_b1), 1));
v_dst1 = vshrq_n_s16(vaddq_s16(v_dst1, v_delta), 2);
vst1q_u8(dst + x, vcombine_u8(vqmovun_s16(v_dst0), vqmovun_s16(v_dst1)));
}
float scale = 1.f/(INTER_RESIZE_COEF_SCALE*INTER_RESIZE_COEF_SCALE);
v_float32 b0 = vx_setall_f32(beta[0] * scale), b1 = vx_setall_f32(beta[1] * scale),
b2 = vx_setall_f32(beta[2] * scale), b3 = vx_setall_f32(beta[3] * scale);
if( (((size_t)S0|(size_t)S1|(size_t)S2|(size_t)S3)&(CV_SIMD_WIDTH - 1)) == 0 )
for( ; x <= width - v_int16::nlanes; x += v_int16::nlanes)
v_pack_u_store(dst + x, v_pack(v_round(v_muladd(v_cvt_f32(vx_load_aligned(S0 + x )), b0,
v_muladd(v_cvt_f32(vx_load_aligned(S1 + x )), b1,
v_muladd(v_cvt_f32(vx_load_aligned(S2 + x )), b2,
v_cvt_f32(vx_load_aligned(S3 + x )) * b3)))),
v_round(v_muladd(v_cvt_f32(vx_load_aligned(S0 + x + v_float32::nlanes)), b0,
v_muladd(v_cvt_f32(vx_load_aligned(S1 + x + v_float32::nlanes)), b1,
v_muladd(v_cvt_f32(vx_load_aligned(S2 + x + v_float32::nlanes)), b2,
v_cvt_f32(vx_load_aligned(S3 + x + v_float32::nlanes)) * b3))))));
else
for( ; x <= width - v_int16::nlanes; x += v_int16::nlanes)
v_pack_u_store(dst + x, v_pack(v_round(v_muladd(v_cvt_f32(vx_load(S0 + x )), b0,
v_muladd(v_cvt_f32(vx_load(S1 + x )), b1,
v_muladd(v_cvt_f32(vx_load(S2 + x )), b2,
v_cvt_f32(vx_load(S3 + x )) * b3)))),
v_round(v_muladd(v_cvt_f32(vx_load(S0 + x + v_float32::nlanes)), b0,
v_muladd(v_cvt_f32(vx_load(S1 + x + v_float32::nlanes)), b1,
v_muladd(v_cvt_f32(vx_load(S2 + x + v_float32::nlanes)), b2,
v_cvt_f32(vx_load(S3 + x + v_float32::nlanes)) * b3))))));
return x;
}
};
struct VResizeLinearVec_32f16u
struct VResizeCubicVec_32f16u
{
int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const
{
const float** src = (const float**)_src;
const float* beta = (const float*)_beta;
const float *S0 = src[0], *S1 = src[1];
const float *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3];
ushort* dst = (ushort*)_dst;
int x = 0;
float32x4_t v_b0 = vdupq_n_f32(beta[0]), v_b1 = vdupq_n_f32(beta[1]);
for( ; x <= width - 8; x += 8 )
{
float32x4_t v_src00 = vld1q_f32(S0 + x), v_src01 = vld1q_f32(S0 + x + 4);
float32x4_t v_src10 = vld1q_f32(S1 + x), v_src11 = vld1q_f32(S1 + x + 4);
float32x4_t v_dst0 = vmlaq_f32(vmulq_f32(v_src00, v_b0), v_src10, v_b1);
float32x4_t v_dst1 = vmlaq_f32(vmulq_f32(v_src01, v_b0), v_src11, v_b1);
vst1q_u16(dst + x, vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst0)),
vqmovn_u32(cv_vrndq_u32_f32(v_dst1))));
}
v_float32 b0 = vx_setall_f32(beta[0]), b1 = vx_setall_f32(beta[1]),
b2 = vx_setall_f32(beta[2]), b3 = vx_setall_f32(beta[3]);
for (; x <= width - v_uint16::nlanes; x += v_uint16::nlanes)
v_store(dst + x, v_pack_u(v_round(v_muladd(vx_load(S0 + x ), b0,
v_muladd(vx_load(S1 + x ), b1,
v_muladd(vx_load(S2 + x ), b2,
vx_load(S3 + x ) * b3)))),
v_round(v_muladd(vx_load(S0 + x + v_float32::nlanes), b0,
v_muladd(vx_load(S1 + x + v_float32::nlanes), b1,
v_muladd(vx_load(S2 + x + v_float32::nlanes), b2,
vx_load(S3 + x + v_float32::nlanes) * b3))))));
return x;
}
};
struct VResizeLinearVec_32f16s
struct VResizeCubicVec_32f16s
{
int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const
{
const float** src = (const float**)_src;
const float* beta = (const float*)_beta;
const float *S0 = src[0], *S1 = src[1];
const float *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3];
short* dst = (short*)_dst;
int x = 0;
float32x4_t v_b0 = vdupq_n_f32(beta[0]), v_b1 = vdupq_n_f32(beta[1]);
for( ; x <= width - 8; x += 8 )
{
float32x4_t v_src00 = vld1q_f32(S0 + x), v_src01 = vld1q_f32(S0 + x + 4);
float32x4_t v_src10 = vld1q_f32(S1 + x), v_src11 = vld1q_f32(S1 + x + 4);
float32x4_t v_dst0 = vmlaq_f32(vmulq_f32(v_src00, v_b0), v_src10, v_b1);
float32x4_t v_dst1 = vmlaq_f32(vmulq_f32(v_src01, v_b0), v_src11, v_b1);
vst1q_s16(dst + x, vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst0)),
vqmovn_s32(cv_vrndq_s32_f32(v_dst1))));
}
v_float32 b0 = vx_setall_f32(beta[0]), b1 = vx_setall_f32(beta[1]),
b2 = vx_setall_f32(beta[2]), b3 = vx_setall_f32(beta[3]);
for (; x <= width - v_int16::nlanes; x += v_int16::nlanes)
v_store(dst + x, v_pack(v_round(v_muladd(vx_load(S0 + x ), b0,
v_muladd(vx_load(S1 + x ), b1,
v_muladd(vx_load(S2 + x ), b2,
vx_load(S3 + x ) * b3)))),
v_round(v_muladd(vx_load(S0 + x + v_float32::nlanes), b0,
v_muladd(vx_load(S1 + x + v_float32::nlanes), b1,
v_muladd(vx_load(S2 + x + v_float32::nlanes), b2,
vx_load(S3 + x + v_float32::nlanes) * b3))))));
return x;
}
};
struct VResizeLinearVec_32f
struct VResizeCubicVec_32f
{
int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const
{
const float** src = (const float**)_src;
const float* beta = (const float*)_beta;
const float *S0 = src[0], *S1 = src[1];
const float *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3];
float* dst = (float*)_dst;
int x = 0;
v_float32 b0 = vx_setall_f32(beta[0]), b1 = vx_setall_f32(beta[1]),
b2 = vx_setall_f32(beta[2]), b3 = vx_setall_f32(beta[3]);
float32x4_t v_b0 = vdupq_n_f32(beta[0]), v_b1 = vdupq_n_f32(beta[1]);
for( ; x <= width - 8; x += 8 )
{
float32x4_t v_src00 = vld1q_f32(S0 + x), v_src01 = vld1q_f32(S0 + x + 4);
float32x4_t v_src10 = vld1q_f32(S1 + x), v_src11 = vld1q_f32(S1 + x + 4);
vst1q_f32(dst + x, vmlaq_f32(vmulq_f32(v_src00, v_b0), v_src10, v_b1));
vst1q_f32(dst + x + 4, vmlaq_f32(vmulq_f32(v_src01, v_b0), v_src11, v_b1));
}
for( ; x <= width - v_float32::nlanes; x += v_float32::nlanes)
v_store(dst + x, v_muladd(vx_load(S0 + x), b0,
v_muladd(vx_load(S1 + x), b1,
v_muladd(vx_load(S2 + x), b2,
vx_load(S3 + x) * b3))));
return x;
}
};
typedef VResizeNoVec VResizeCubicVec_32s8u;
struct VResizeCubicVec_32f16u
{
int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const
{
const float** src = (const float**)_src;
const float* beta = (const float*)_beta;
const float *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3];
ushort* dst = (ushort*)_dst;
int x = 0;
float32x4_t v_b0 = vdupq_n_f32(beta[0]), v_b1 = vdupq_n_f32(beta[1]),
v_b2 = vdupq_n_f32(beta[2]), v_b3 = vdupq_n_f32(beta[3]);
for( ; x <= width - 8; x += 8 )
{
float32x4_t v_dst0 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b0, vld1q_f32(S0 + x)),
v_b1, vld1q_f32(S1 + x)),
v_b2, vld1q_f32(S2 + x)),
v_b3, vld1q_f32(S3 + x));
float32x4_t v_dst1 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b0, vld1q_f32(S0 + x + 4)),
v_b1, vld1q_f32(S1 + x + 4)),
v_b2, vld1q_f32(S2 + x + 4)),
v_b3, vld1q_f32(S3 + x + 4));
vst1q_u16(dst + x, vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst0)),
vqmovn_u32(cv_vrndq_u32_f32(v_dst1))));
}
return x;
}
};
#if CV_TRY_SSE4_1
struct VResizeCubicVec_32f16s
struct VResizeLanczos4Vec_32f16u
{
int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const
{
const float** src = (const float**)_src;
const float* beta = (const float*)_beta;
const float *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3];
short* dst = (short*)_dst;
int x = 0;
float32x4_t v_b0 = vdupq_n_f32(beta[0]), v_b1 = vdupq_n_f32(beta[1]),
v_b2 = vdupq_n_f32(beta[2]), v_b3 = vdupq_n_f32(beta[3]);
for( ; x <= width - 8; x += 8 )
{
float32x4_t v_dst0 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b0, vld1q_f32(S0 + x)),
v_b1, vld1q_f32(S1 + x)),
v_b2, vld1q_f32(S2 + x)),
v_b3, vld1q_f32(S3 + x));
float32x4_t v_dst1 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b0, vld1q_f32(S0 + x + 4)),
v_b1, vld1q_f32(S1 + x + 4)),
v_b2, vld1q_f32(S2 + x + 4)),
v_b3, vld1q_f32(S3 + x + 4));
vst1q_s16(dst + x, vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst0)),
vqmovn_s32(cv_vrndq_s32_f32(v_dst1))));
}
return x;
if (CV_CPU_HAS_SUPPORT_SSE4_1) return opt_SSE4_1::VResizeLanczos4Vec_32f16u_SSE41(_src, _dst, _beta, width);
else return 0;
}
};
struct VResizeCubicVec_32f
{
int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const
{
const float** src = (const float**)_src;
const float* beta = (const float*)_beta;
const float *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3];
float* dst = (float*)_dst;
int x = 0;
float32x4_t v_b0 = vdupq_n_f32(beta[0]), v_b1 = vdupq_n_f32(beta[1]),
v_b2 = vdupq_n_f32(beta[2]), v_b3 = vdupq_n_f32(beta[3]);
for( ; x <= width - 8; x += 8 )
{
vst1q_f32(dst + x, vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b0, vld1q_f32(S0 + x)),
v_b1, vld1q_f32(S1 + x)),
v_b2, vld1q_f32(S2 + x)),
v_b3, vld1q_f32(S3 + x)));
vst1q_f32(dst + x + 4, vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b0, vld1q_f32(S0 + x + 4)),
v_b1, vld1q_f32(S1 + x + 4)),
v_b2, vld1q_f32(S2 + x + 4)),
v_b3, vld1q_f32(S3 + x + 4)));
}
return x;
}
};
#else
struct VResizeLanczos4Vec_32f16u
{
@@ -1946,41 +1433,35 @@ struct VResizeLanczos4Vec_32f16u
*S4 = src[4], *S5 = src[5], *S6 = src[6], *S7 = src[7];
ushort * dst = (ushort*)_dst;
int x = 0;
float32x4_t v_b0 = vdupq_n_f32(beta[0]), v_b1 = vdupq_n_f32(beta[1]),
v_b2 = vdupq_n_f32(beta[2]), v_b3 = vdupq_n_f32(beta[3]),
v_b4 = vdupq_n_f32(beta[4]), v_b5 = vdupq_n_f32(beta[5]),
v_b6 = vdupq_n_f32(beta[6]), v_b7 = vdupq_n_f32(beta[7]);
for( ; x <= width - 8; x += 8 )
{
float32x4_t v_dst0 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b0, vld1q_f32(S0 + x)),
v_b1, vld1q_f32(S1 + x)),
v_b2, vld1q_f32(S2 + x)),
v_b3, vld1q_f32(S3 + x));
float32x4_t v_dst1 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b4, vld1q_f32(S4 + x)),
v_b5, vld1q_f32(S5 + x)),
v_b6, vld1q_f32(S6 + x)),
v_b7, vld1q_f32(S7 + x));
float32x4_t v_dst = vaddq_f32(v_dst0, v_dst1);
v_dst0 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b0, vld1q_f32(S0 + x + 4)),
v_b1, vld1q_f32(S1 + x + 4)),
v_b2, vld1q_f32(S2 + x + 4)),
v_b3, vld1q_f32(S3 + x + 4));
v_dst1 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b4, vld1q_f32(S4 + x + 4)),
v_b5, vld1q_f32(S5 + x + 4)),
v_b6, vld1q_f32(S6 + x + 4)),
v_b7, vld1q_f32(S7 + x + 4));
v_dst1 = vaddq_f32(v_dst0, v_dst1);
vst1q_u16(dst + x, vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst)),
vqmovn_u32(cv_vrndq_u32_f32(v_dst1))));
}
v_float32 b0 = vx_setall_f32(beta[0]), b1 = vx_setall_f32(beta[1]),
b2 = vx_setall_f32(beta[2]), b3 = vx_setall_f32(beta[3]),
b4 = vx_setall_f32(beta[4]), b5 = vx_setall_f32(beta[5]),
b6 = vx_setall_f32(beta[6]), b7 = vx_setall_f32(beta[7]);
for( ; x <= width - v_uint16::nlanes; x += v_uint16::nlanes)
v_store(dst + x, v_pack_u(v_round(v_muladd(vx_load(S0 + x ), b0,
v_muladd(vx_load(S1 + x ), b1,
v_muladd(vx_load(S2 + x ), b2,
v_muladd(vx_load(S3 + x ), b3,
v_muladd(vx_load(S4 + x ), b4,
v_muladd(vx_load(S5 + x ), b5,
v_muladd(vx_load(S6 + x ), b6,
vx_load(S7 + x ) * b7)))))))),
v_round(v_muladd(vx_load(S0 + x + v_float32::nlanes), b0,
v_muladd(vx_load(S1 + x + v_float32::nlanes), b1,
v_muladd(vx_load(S2 + x + v_float32::nlanes), b2,
v_muladd(vx_load(S3 + x + v_float32::nlanes), b3,
v_muladd(vx_load(S4 + x + v_float32::nlanes), b4,
v_muladd(vx_load(S5 + x + v_float32::nlanes), b5,
v_muladd(vx_load(S6 + x + v_float32::nlanes), b6,
vx_load(S7 + x + v_float32::nlanes) * b7))))))))));
return x;
}
};
#endif
struct VResizeLanczos4Vec_32f16s
{
int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const
@@ -1991,36 +1472,28 @@ struct VResizeLanczos4Vec_32f16s
*S4 = src[4], *S5 = src[5], *S6 = src[6], *S7 = src[7];
short * dst = (short*)_dst;
int x = 0;
float32x4_t v_b0 = vdupq_n_f32(beta[0]), v_b1 = vdupq_n_f32(beta[1]),
v_b2 = vdupq_n_f32(beta[2]), v_b3 = vdupq_n_f32(beta[3]),
v_b4 = vdupq_n_f32(beta[4]), v_b5 = vdupq_n_f32(beta[5]),
v_b6 = vdupq_n_f32(beta[6]), v_b7 = vdupq_n_f32(beta[7]);
for( ; x <= width - 8; x += 8 )
{
float32x4_t v_dst0 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b0, vld1q_f32(S0 + x)),
v_b1, vld1q_f32(S1 + x)),
v_b2, vld1q_f32(S2 + x)),
v_b3, vld1q_f32(S3 + x));
float32x4_t v_dst1 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b4, vld1q_f32(S4 + x)),
v_b5, vld1q_f32(S5 + x)),
v_b6, vld1q_f32(S6 + x)),
v_b7, vld1q_f32(S7 + x));
float32x4_t v_dst = vaddq_f32(v_dst0, v_dst1);
v_dst0 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b0, vld1q_f32(S0 + x + 4)),
v_b1, vld1q_f32(S1 + x + 4)),
v_b2, vld1q_f32(S2 + x + 4)),
v_b3, vld1q_f32(S3 + x + 4));
v_dst1 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b4, vld1q_f32(S4 + x + 4)),
v_b5, vld1q_f32(S5 + x + 4)),
v_b6, vld1q_f32(S6 + x + 4)),
v_b7, vld1q_f32(S7 + x + 4));
v_dst1 = vaddq_f32(v_dst0, v_dst1);
vst1q_s16(dst + x, vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst)),
vqmovn_s32(cv_vrndq_s32_f32(v_dst1))));
}
v_float32 b0 = vx_setall_f32(beta[0]), b1 = vx_setall_f32(beta[1]),
b2 = vx_setall_f32(beta[2]), b3 = vx_setall_f32(beta[3]),
b4 = vx_setall_f32(beta[4]), b5 = vx_setall_f32(beta[5]),
b6 = vx_setall_f32(beta[6]), b7 = vx_setall_f32(beta[7]);
for( ; x <= width - v_int16::nlanes; x += v_int16::nlanes)
v_store(dst + x, v_pack(v_round(v_muladd(vx_load(S0 + x ), b0,
v_muladd(vx_load(S1 + x ), b1,
v_muladd(vx_load(S2 + x ), b2,
v_muladd(vx_load(S3 + x ), b3,
v_muladd(vx_load(S4 + x ), b4,
v_muladd(vx_load(S5 + x ), b5,
v_muladd(vx_load(S6 + x ), b6,
vx_load(S7 + x ) * b7)))))))),
v_round(v_muladd(vx_load(S0 + x + v_float32::nlanes), b0,
v_muladd(vx_load(S1 + x + v_float32::nlanes), b1,
v_muladd(vx_load(S2 + x + v_float32::nlanes), b2,
v_muladd(vx_load(S3 + x + v_float32::nlanes), b3,
v_muladd(vx_load(S4 + x + v_float32::nlanes), b4,
v_muladd(vx_load(S5 + x + v_float32::nlanes), b5,
v_muladd(vx_load(S6 + x + v_float32::nlanes), b6,
vx_load(S7 + x + v_float32::nlanes) * b7))))))))));
return x;
}
@@ -2036,23 +1509,21 @@ struct VResizeLanczos4Vec_32f
*S4 = src[4], *S5 = src[5], *S6 = src[6], *S7 = src[7];
float* dst = (float*)_dst;
int x = 0;
float32x4_t v_b0 = vdupq_n_f32(beta[0]), v_b1 = vdupq_n_f32(beta[1]),
v_b2 = vdupq_n_f32(beta[2]), v_b3 = vdupq_n_f32(beta[3]),
v_b4 = vdupq_n_f32(beta[4]), v_b5 = vdupq_n_f32(beta[5]),
v_b6 = vdupq_n_f32(beta[6]), v_b7 = vdupq_n_f32(beta[7]);
for( ; x <= width - 4; x += 4 )
{
float32x4_t v_dst0 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b0, vld1q_f32(S0 + x)),
v_b1, vld1q_f32(S1 + x)),
v_b2, vld1q_f32(S2 + x)),
v_b3, vld1q_f32(S3 + x));
float32x4_t v_dst1 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b4, vld1q_f32(S4 + x)),
v_b5, vld1q_f32(S5 + x)),
v_b6, vld1q_f32(S6 + x)),
v_b7, vld1q_f32(S7 + x));
vst1q_f32(dst + x, vaddq_f32(v_dst0, v_dst1));
}
v_float32 b0 = vx_setall_f32(beta[0]), b1 = vx_setall_f32(beta[1]),
b2 = vx_setall_f32(beta[2]), b3 = vx_setall_f32(beta[3]),
b4 = vx_setall_f32(beta[4]), b5 = vx_setall_f32(beta[5]),
b6 = vx_setall_f32(beta[6]), b7 = vx_setall_f32(beta[7]);
for( ; x <= width - v_float32::nlanes; x += v_float32::nlanes)
v_store(dst + x, v_muladd(vx_load(S0 + x), b0,
v_muladd(vx_load(S1 + x), b1,
v_muladd(vx_load(S2 + x), b2,
v_muladd(vx_load(S3 + x), b3,
v_muladd(vx_load(S4 + x), b4,
v_muladd(vx_load(S5 + x), b5,
v_muladd(vx_load(S6 + x), b6,
vx_load(S7 + x) * b7))))))));
return x;
}
@@ -2695,95 +2166,94 @@ private:
int step;
};
#elif CV_SSE2
#elif CV_SIMD
class ResizeAreaFastVec_SIMD_8u
{
public:
ResizeAreaFastVec_SIMD_8u(int _cn, int _step) :
cn(_cn), step(_step)
{
use_simd = checkHardwareSupport(CV_CPU_SSE2);
}
cn(_cn), step(_step) {}
int operator() (const uchar* S, uchar* D, int w) const
{
if (!use_simd)
return 0;
int dx = 0;
const uchar* S0 = S;
const uchar* S1 = S0 + step;
__m128i zero = _mm_setzero_si128();
__m128i delta2 = _mm_set1_epi16(2);
if (cn == 1)
{
__m128i masklow = _mm_set1_epi16(0x00ff);
for ( ; dx <= w - 8; dx += 8, S0 += 16, S1 += 16, D += 8)
v_uint16 masklow = vx_setall_u16(0x00ff);
for ( ; dx <= w - v_uint16::nlanes; dx += v_uint16::nlanes, S0 += v_uint8::nlanes, S1 += v_uint8::nlanes, D += v_uint16::nlanes)
{
__m128i r0 = _mm_loadu_si128((const __m128i*)S0);
__m128i r1 = _mm_loadu_si128((const __m128i*)S1);
__m128i s0 = _mm_add_epi16(_mm_srli_epi16(r0, 8), _mm_and_si128(r0, masklow));
__m128i s1 = _mm_add_epi16(_mm_srli_epi16(r1, 8), _mm_and_si128(r1, masklow));
s0 = _mm_add_epi16(_mm_add_epi16(s0, s1), delta2);
s0 = _mm_packus_epi16(_mm_srli_epi16(s0, 2), zero);
_mm_storel_epi64((__m128i*)D, s0);
v_uint16 r0 = v_reinterpret_as_u16(vx_load(S0));
v_uint16 r1 = v_reinterpret_as_u16(vx_load(S1));
v_rshr_pack_store<2>(D, (r0 >> 8) + (r0 & masklow) + (r1 >> 8) + (r1 & masklow));
}
}
else if (cn == 3)
for ( ; dx <= w - 11; dx += 6, S0 += 12, S1 += 12, D += 6)
{
if (CV_SIMD_WIDTH > 64)
return 0;
for ( ; dx <= w - 3*v_uint8::nlanes; dx += 3*v_uint8::nlanes, S0 += 6*v_uint8::nlanes, S1 += 6*v_uint8::nlanes, D += 3*v_uint8::nlanes)
{
__m128i r0 = _mm_loadu_si128((const __m128i*)S0);
__m128i r1 = _mm_loadu_si128((const __m128i*)S1);
__m128i r0_16l = _mm_unpacklo_epi8(r0, zero);
__m128i r0_16h = _mm_unpacklo_epi8(_mm_srli_si128(r0, 6), zero);
__m128i r1_16l = _mm_unpacklo_epi8(r1, zero);
__m128i r1_16h = _mm_unpacklo_epi8(_mm_srli_si128(r1, 6), zero);
__m128i s0 = _mm_add_epi16(r0_16l, _mm_srli_si128(r0_16l, 6));
__m128i s1 = _mm_add_epi16(r1_16l, _mm_srli_si128(r1_16l, 6));
s0 = _mm_add_epi16(s1, _mm_add_epi16(s0, delta2));
s0 = _mm_packus_epi16(_mm_srli_epi16(s0, 2), zero);
_mm_storel_epi64((__m128i*)D, s0);
s0 = _mm_add_epi16(r0_16h, _mm_srli_si128(r0_16h, 6));
s1 = _mm_add_epi16(r1_16h, _mm_srli_si128(r1_16h, 6));
s0 = _mm_add_epi16(s1, _mm_add_epi16(s0, delta2));
s0 = _mm_packus_epi16(_mm_srli_epi16(s0, 2), zero);
_mm_storel_epi64((__m128i*)(D+3), s0);
v_uint16 t0, t1, t2, t3, t4, t5;
v_uint16 s0, s1, s2, s3, s4, s5;
s0 = vx_load_expand(S0 ) + vx_load_expand(S1 );
s1 = vx_load_expand(S0 + v_uint16::nlanes) + vx_load_expand(S1 + v_uint16::nlanes);
s2 = vx_load_expand(S0 + 2*v_uint16::nlanes) + vx_load_expand(S1 + 2*v_uint16::nlanes);
s3 = vx_load_expand(S0 + 3*v_uint16::nlanes) + vx_load_expand(S1 + 3*v_uint16::nlanes);
s4 = vx_load_expand(S0 + 4*v_uint16::nlanes) + vx_load_expand(S1 + 4*v_uint16::nlanes);
s5 = vx_load_expand(S0 + 5*v_uint16::nlanes) + vx_load_expand(S1 + 5*v_uint16::nlanes);
v_zip(s0, s3, t0, t1); v_zip(s1, s4, t2, t3); v_zip(s2, s5, t4, t5);
v_zip(t0, t3, s0, s1); v_zip(t1, t4, s2, s3); v_zip(t2, t5, s4, s5);
v_zip(s0, s3, t0, t1); v_zip(s1, s4, t2, t3); v_zip(s2, s5, t4, t5);
v_uint16 bl, gl, rl;
#if CV_SIMD_WIDTH == 16
bl = t0 + t3; gl = t1 + t4; rl = t2 + t5;
#elif CV_SIMD_WIDTH == 32
v_zip(t0, t3, s0, s1); v_zip(t1, t4, s2, s3); v_zip(t2, t5, s4, s5);
bl = s0 + s3; gl = s1 + s4; rl = s2 + s5;
#elif CV_SIMD_WIDTH == 64
v_zip(s0, s3, t0, t1); v_zip(s1, s4, t2, t3); v_zip(s2, s5, t4, t5);
bl = t0 + t3; gl = t1 + t4; rl = t2 + t5;
#endif
s0 = vx_load_expand(S0 + 6*v_uint16::nlanes) + vx_load_expand(S1 + 6*v_uint16::nlanes);
s1 = vx_load_expand(S0 + 7*v_uint16::nlanes) + vx_load_expand(S1 + 7*v_uint16::nlanes);
s2 = vx_load_expand(S0 + 8*v_uint16::nlanes) + vx_load_expand(S1 + 8*v_uint16::nlanes);
s3 = vx_load_expand(S0 + 9*v_uint16::nlanes) + vx_load_expand(S1 + 9*v_uint16::nlanes);
s4 = vx_load_expand(S0 +10*v_uint16::nlanes) + vx_load_expand(S1 +10*v_uint16::nlanes);
s5 = vx_load_expand(S0 +11*v_uint16::nlanes) + vx_load_expand(S1 +11*v_uint16::nlanes);
v_zip(s0, s3, t0, t1); v_zip(s1, s4, t2, t3); v_zip(s2, s5, t4, t5);
v_zip(t0, t3, s0, s1); v_zip(t1, t4, s2, s3); v_zip(t2, t5, s4, s5);
v_zip(s0, s3, t0, t1); v_zip(s1, s4, t2, t3); v_zip(s2, s5, t4, t5);
v_uint16 bh, gh, rh;
#if CV_SIMD_WIDTH == 16
bh = t0 + t3; gh = t1 + t4; rh = t2 + t5;
#elif CV_SIMD_WIDTH == 32
v_zip(t0, t3, s0, s1); v_zip(t1, t4, s2, s3); v_zip(t2, t5, s4, s5);
bh = s0 + s3; gh = s1 + s4; rh = s2 + s5;
#elif CV_SIMD_WIDTH == 64
v_zip(s0, s3, t0, t1); v_zip(s1, s4, t2, t3); v_zip(s2, s5, t4, t5);
bh = t0 + t3; gh = t1 + t4; rh = t2 + t5;
#endif
v_store_interleave(D, v_rshr_pack<2>(bl, bh), v_rshr_pack<2>(gl, gh), v_rshr_pack<2>(rl, rh));
}
}
else
{
CV_Assert(cn == 4);
int v[] = { 0, 0, -1, -1 };
__m128i mask = _mm_loadu_si128((const __m128i*)v);
for ( ; dx <= w - 8; dx += 8, S0 += 16, S1 += 16, D += 8)
for ( ; dx <= w - v_uint8::nlanes; dx += v_uint8::nlanes, S0 += 2*v_uint8::nlanes, S1 += 2*v_uint8::nlanes, D += v_uint8::nlanes)
{
__m128i r0 = _mm_loadu_si128((const __m128i*)S0);
__m128i r1 = _mm_loadu_si128((const __m128i*)S1);
__m128i r0_16l = _mm_unpacklo_epi8(r0, zero);
__m128i r0_16h = _mm_unpackhi_epi8(r0, zero);
__m128i r1_16l = _mm_unpacklo_epi8(r1, zero);
__m128i r1_16h = _mm_unpackhi_epi8(r1, zero);
__m128i s0 = _mm_add_epi16(r0_16l, _mm_srli_si128(r0_16l, 8));
__m128i s1 = _mm_add_epi16(r1_16l, _mm_srli_si128(r1_16l, 8));
s0 = _mm_add_epi16(s1, _mm_add_epi16(s0, delta2));
__m128i res0 = _mm_srli_epi16(s0, 2);
s0 = _mm_add_epi16(r0_16h, _mm_srli_si128(r0_16h, 8));
s1 = _mm_add_epi16(r1_16h, _mm_srli_si128(r1_16h, 8));
s0 = _mm_add_epi16(s1, _mm_add_epi16(s0, delta2));
__m128i res1 = _mm_srli_epi16(s0, 2);
s0 = _mm_packus_epi16(_mm_or_si128(_mm_andnot_si128(mask, res0),
_mm_and_si128(mask, _mm_slli_si128(res1, 8))), zero);
_mm_storel_epi64((__m128i*)(D), s0);
v_uint32 r00, r01, r10, r11;
v_load_deinterleave((uint32_t*)S0, r00, r01);
v_load_deinterleave((uint32_t*)S1, r10, r11);
v_uint16 r00l, r01l, r10l, r11l, r00h, r01h, r10h, r11h;
v_expand(v_reinterpret_as_u8(r00), r00l, r00h);
v_expand(v_reinterpret_as_u8(r01), r01l, r01h);
v_expand(v_reinterpret_as_u8(r10), r10l, r10h);
v_expand(v_reinterpret_as_u8(r11), r11l, r11h);
v_store(D, v_rshr_pack<2>(r00l + r01l + r10l + r11l, r00h + r01h + r10h + r11h));
}
}
@@ -2792,7 +2262,6 @@ public:
private:
int cn;
bool use_simd;
int step;
};
@@ -2800,164 +2269,258 @@ class ResizeAreaFastVec_SIMD_16u
{
public:
ResizeAreaFastVec_SIMD_16u(int _cn, int _step) :
cn(_cn), step(_step)
{
use_simd = checkHardwareSupport(CV_CPU_SSE2);
}
cn(_cn), step(_step) {}
int operator() (const ushort* S, ushort* D, int w) const
{
if (!use_simd)
return 0;
int dx = 0;
const ushort* S0 = (const ushort*)S;
const ushort* S1 = (const ushort*)((const uchar*)(S) + step);
__m128i masklow = _mm_set1_epi32(0x0000ffff);
__m128i zero = _mm_setzero_si128();
__m128i delta2 = _mm_set1_epi32(2);
#define _mm_packus_epi32(a, zero) _mm_packs_epi32(_mm_srai_epi32(_mm_slli_epi32(a, 16), 16), zero)
if (cn == 1)
{
for ( ; dx <= w - 4; dx += 4, S0 += 8, S1 += 8, D += 4)
v_uint32 masklow = vx_setall_u32(0x0000ffff);
for (; dx <= w - v_uint32::nlanes; dx += v_uint32::nlanes, S0 += v_uint16::nlanes, S1 += v_uint16::nlanes, D += v_uint32::nlanes)
{
__m128i r0 = _mm_loadu_si128((const __m128i*)S0);
__m128i r1 = _mm_loadu_si128((const __m128i*)S1);
__m128i s0 = _mm_add_epi32(_mm_srli_epi32(r0, 16), _mm_and_si128(r0, masklow));
__m128i s1 = _mm_add_epi32(_mm_srli_epi32(r1, 16), _mm_and_si128(r1, masklow));
s0 = _mm_add_epi32(_mm_add_epi32(s0, s1), delta2);
s0 = _mm_srli_epi32(s0, 2);
s0 = _mm_packus_epi32(s0, zero);
_mm_storel_epi64((__m128i*)D, s0);
v_uint32 r0 = v_reinterpret_as_u32(vx_load(S0));
v_uint32 r1 = v_reinterpret_as_u32(vx_load(S1));
v_rshr_pack_store<2>(D, (r0 >> 16) + (r0 & masklow) + (r1 >> 16) + (r1 & masklow));
}
}
else if (cn == 3)
{
#if CV_SIMD_WIDTH == 16
for ( ; dx <= w - 4; dx += 3, S0 += 6, S1 += 6, D += 3)
#if CV_SSE4_1
{
__m128i r0 = _mm_loadu_si128((const __m128i*)S0);
__m128i r1 = _mm_loadu_si128((const __m128i*)S1);
__m128i r0_16l = _mm_unpacklo_epi16(r0, zero);
__m128i r0_16h = _mm_unpacklo_epi16(_mm_srli_si128(r0, 6), zero);
__m128i r1_16l = _mm_unpacklo_epi16(r1, zero);
__m128i r1_16h = _mm_unpacklo_epi16(_mm_srli_si128(r1, 6), zero);
__m128i s0 = _mm_add_epi32(r0_16l, r0_16h);
__m128i s1 = _mm_add_epi32(r1_16l, r1_16h);
s0 = _mm_add_epi32(delta2, _mm_add_epi32(s0, s1));
s0 = _mm_packus_epi32(_mm_srli_epi32(s0, 2), zero);
_mm_storel_epi64((__m128i*)D, s0);
v_uint32 r0, r1, r2, r3;
v_expand(vx_load(S0), r0, r1);
v_expand(vx_load(S1), r2, r3);
r0 += r2; r1 += r3;
v_rshr_pack_store<2>(D, r0 + v_rotate_left<1>(r1, r0));
}
#else
v_rshr_pack_store<2>(D, v_load_expand(S0) + v_load_expand(S0 + 3) + v_load_expand(S1) + v_load_expand(S1 + 3));
#endif
#elif CV_SIMD_WIDTH == 32 || CV_SIMD_WIDTH == 64
for ( ; dx <= w - 3*v_uint16::nlanes; dx += 3*v_uint16::nlanes, S0 += 6*v_uint16::nlanes, S1 += 6*v_uint16::nlanes, D += 3*v_uint16::nlanes)
{
v_uint32 t0, t1, t2, t3, t4, t5;
v_uint32 s0, s1, s2, s3, s4, s5;
s0 = vx_load_expand(S0 ) + vx_load_expand(S1 );
s1 = vx_load_expand(S0 + v_uint32::nlanes) + vx_load_expand(S1 + v_uint32::nlanes);
s2 = vx_load_expand(S0 + 2*v_uint32::nlanes) + vx_load_expand(S1 + 2*v_uint32::nlanes);
s3 = vx_load_expand(S0 + 3*v_uint32::nlanes) + vx_load_expand(S1 + 3*v_uint32::nlanes);
s4 = vx_load_expand(S0 + 4*v_uint32::nlanes) + vx_load_expand(S1 + 4*v_uint32::nlanes);
s5 = vx_load_expand(S0 + 5*v_uint32::nlanes) + vx_load_expand(S1 + 5*v_uint32::nlanes);
v_zip(s0, s3, t0, t1); v_zip(s1, s4, t2, t3); v_zip(s2, s5, t4, t5);
v_zip(t0, t3, s0, s1); v_zip(t1, t4, s2, s3); v_zip(t2, t5, s4, s5);
v_uint32 bl, gl, rl;
v_zip(s0, s3, t0, t1); v_zip(s1, s4, t2, t3); v_zip(s2, s5, t4, t5);
#if CV_SIMD_WIDTH == 32
bl = t0 + t3; gl = t1 + t4; rl = t2 + t5;
#else //CV_SIMD_WIDTH == 64
v_zip(t0, t3, s0, s1); v_zip(t1, t4, s2, s3); v_zip(t2, t5, s4, s5);
bl = s0 + s3; gl = s1 + s4; rl = s2 + s5;
#endif
s0 = vx_load_expand(S0 + 6*v_uint32::nlanes) + vx_load_expand(S1 + 6*v_uint32::nlanes);
s1 = vx_load_expand(S0 + 7*v_uint32::nlanes) + vx_load_expand(S1 + 7*v_uint32::nlanes);
s2 = vx_load_expand(S0 + 8*v_uint32::nlanes) + vx_load_expand(S1 + 8*v_uint32::nlanes);
s3 = vx_load_expand(S0 + 9*v_uint32::nlanes) + vx_load_expand(S1 + 9*v_uint32::nlanes);
s4 = vx_load_expand(S0 +10*v_uint32::nlanes) + vx_load_expand(S1 +10*v_uint32::nlanes);
s5 = vx_load_expand(S0 +11*v_uint32::nlanes) + vx_load_expand(S1 +11*v_uint32::nlanes);
v_zip(s0, s3, t0, t1); v_zip(s1, s4, t2, t3); v_zip(s2, s5, t4, t5);
v_zip(t0, t3, s0, s1); v_zip(t1, t4, s2, s3); v_zip(t2, t5, s4, s5);
v_uint32 bh, gh, rh;
v_zip(s0, s3, t0, t1); v_zip(s1, s4, t2, t3); v_zip(s2, s5, t4, t5);
#if CV_SIMD_WIDTH == 32
bh = t0 + t3; gh = t1 + t4; rh = t2 + t5;
#else //CV_SIMD_WIDTH == 64
v_zip(t0, t3, s0, s1); v_zip(t1, t4, s2, s3); v_zip(t2, t5, s4, s5);
bh = s0 + s3; gh = s1 + s4; rh = s2 + s5;
#endif
v_store_interleave(D, v_rshr_pack<2>(bl, bh), v_rshr_pack<2>(gl, gh), v_rshr_pack<2>(rl, rh));
}
#elif CV_SIMD_WIDTH >= 64
v_uint32 masklow = vx_setall_u32(0x0000ffff);
for ( ; dx <= w - 3*v_uint16::nlanes; dx += 3*v_uint16::nlanes, S0 += 6*v_uint16::nlanes, S1 += 6*v_uint16::nlanes, D += 3*v_uint16::nlanes)
{
v_uint16 b0, g0, r0, b1, g1, r1;
v_load_deinterleave(S0, b0, g0, r0);
v_load_deinterleave(S1, b1, g1, r1);
v_uint32 bl = (v_reinterpret_as_u32(b0) >> 16) + (v_reinterpret_as_u32(b0) & masklow) + (v_reinterpret_as_u32(b1) >> 16) + (v_reinterpret_as_u32(b1) & masklow);
v_uint32 gl = (v_reinterpret_as_u32(g0) >> 16) + (v_reinterpret_as_u32(g0) & masklow) + (v_reinterpret_as_u32(g1) >> 16) + (v_reinterpret_as_u32(g1) & masklow);
v_uint32 rl = (v_reinterpret_as_u32(r0) >> 16) + (v_reinterpret_as_u32(r0) & masklow) + (v_reinterpret_as_u32(r1) >> 16) + (v_reinterpret_as_u32(r1) & masklow);
v_load_deinterleave(S0 + 3*v_uint16::nlanes, b0, g0, r0);
v_load_deinterleave(S1 + 3*v_uint16::nlanes, b1, g1, r1);
v_uint32 bh = (v_reinterpret_as_u32(b0) >> 16) + (v_reinterpret_as_u32(b0) & masklow) + (v_reinterpret_as_u32(b1) >> 16) + (v_reinterpret_as_u32(b1) & masklow);
v_uint32 gh = (v_reinterpret_as_u32(g0) >> 16) + (v_reinterpret_as_u32(g0) & masklow) + (v_reinterpret_as_u32(g1) >> 16) + (v_reinterpret_as_u32(g1) & masklow);
v_uint32 rh = (v_reinterpret_as_u32(r0) >> 16) + (v_reinterpret_as_u32(r0) & masklow) + (v_reinterpret_as_u32(r1) >> 16) + (v_reinterpret_as_u32(r1) & masklow);
v_store_interleave(D, v_rshr_pack<2>(bl, bh), v_rshr_pack<2>(gl, gh), v_rshr_pack<2>(rl, rh));
}
#endif
}
else
{
CV_Assert(cn == 4);
for ( ; dx <= w - 4; dx += 4, S0 += 8, S1 += 8, D += 4)
#if CV_SIMD_WIDTH >= 64
for ( ; dx <= w - v_uint16::nlanes; dx += v_uint16::nlanes, S0 += 2*v_uint16::nlanes, S1 += 2*v_uint16::nlanes, D += v_uint16::nlanes)
{
__m128i r0 = _mm_loadu_si128((const __m128i*)S0);
__m128i r1 = _mm_loadu_si128((const __m128i*)S1);
__m128i r0_32l = _mm_unpacklo_epi16(r0, zero);
__m128i r0_32h = _mm_unpackhi_epi16(r0, zero);
__m128i r1_32l = _mm_unpacklo_epi16(r1, zero);
__m128i r1_32h = _mm_unpackhi_epi16(r1, zero);
__m128i s0 = _mm_add_epi32(r0_32l, r0_32h);
__m128i s1 = _mm_add_epi32(r1_32l, r1_32h);
s0 = _mm_add_epi32(s1, _mm_add_epi32(s0, delta2));
s0 = _mm_packus_epi32(_mm_srli_epi32(s0, 2), zero);
_mm_storel_epi64((__m128i*)D, s0);
v_uint64 r00, r01, r10, r11;
v_load_deinterleave((uint64_t*)S0, r00, r01);
v_load_deinterleave((uint64_t*)S1, r10, r11);
v_uint32 r00l, r01l, r10l, r11l, r00h, r01h, r10h, r11h;
v_expand(v_reinterpret_as_u16(r00), r00l, r00h);
v_expand(v_reinterpret_as_u16(r01), r01l, r01h);
v_expand(v_reinterpret_as_u16(r10), r10l, r10h);
v_expand(v_reinterpret_as_u16(r11), r11l, r11h);
v_store(D, v_rshr_pack<2>(r00l + r01l + r10l + r11l, r00h + r01h + r10h + r11h));
}
#else
for ( ; dx <= w - v_uint32::nlanes; dx += v_uint32::nlanes, S0 += v_uint16::nlanes, S1 += v_uint16::nlanes, D += v_uint32::nlanes)
{
v_uint32 r0, r1, r2, r3;
v_expand(vx_load(S0), r0, r1);
v_expand(vx_load(S1), r2, r3);
r0 += r2; r1 += r3;
v_uint32 v_d;
#if CV_SIMD_WIDTH == 16
v_d = r0 + r1;
#elif CV_SIMD_WIDTH == 32
v_uint32 t0, t1;
v_recombine(r0, r1, t0, t1);
v_d = t0 + t1;
#endif
v_rshr_pack_store<2>(D, v_d);
}
#endif
}
#undef _mm_packus_epi32
return dx;
}
private:
int cn;
int step;
bool use_simd;
};
class ResizeAreaFastVec_SIMD_16s
{
public:
ResizeAreaFastVec_SIMD_16s(int _cn, int _step) :
cn(_cn), step(_step)
{
use_simd = checkHardwareSupport(CV_CPU_SSE2);
}
cn(_cn), step(_step) {}
int operator() (const short* S, short* D, int w) const
{
if (!use_simd)
return 0;
int dx = 0;
const short* S0 = (const short*)S;
const short* S1 = (const short*)((const uchar*)(S) + step);
__m128i masklow = _mm_set1_epi32(0x0000ffff);
__m128i zero = _mm_setzero_si128();
__m128i delta2 = _mm_set1_epi32(2);
if (cn == 1)
{
for ( ; dx <= w - 4; dx += 4, S0 += 8, S1 += 8, D += 4)
v_int32 masklow = vx_setall_s32(0x0000ffff);
for (; dx <= w - v_int32::nlanes; dx += v_int32::nlanes, S0 += v_int16::nlanes, S1 += v_int16::nlanes, D += v_int32::nlanes)
{
__m128i r0 = _mm_loadu_si128((const __m128i*)S0);
__m128i r1 = _mm_loadu_si128((const __m128i*)S1);
__m128i s0 = _mm_add_epi32(_mm_srai_epi32(r0, 16),
_mm_srai_epi32(_mm_slli_epi32(_mm_and_si128(r0, masklow), 16), 16));
__m128i s1 = _mm_add_epi32(_mm_srai_epi32(r1, 16),
_mm_srai_epi32(_mm_slli_epi32(_mm_and_si128(r1, masklow), 16), 16));
s0 = _mm_add_epi32(_mm_add_epi32(s0, s1), delta2);
s0 = _mm_srai_epi32(s0, 2);
s0 = _mm_packs_epi32(s0, zero);
_mm_storel_epi64((__m128i*)D, s0);
v_int32 r0 = v_reinterpret_as_s32(vx_load(S0));
v_int32 r1 = v_reinterpret_as_s32(vx_load(S1));
v_rshr_pack_store<2>(D, (r0 >> 16) + (((r0 & masklow)<<16)>>16) + (r1 >> 16) + (((r1 & masklow)<<16)>>16));
}
}
else if (cn == 3)
{
#if CV_SIMD_WIDTH == 16
for ( ; dx <= w - 4; dx += 3, S0 += 6, S1 += 6, D += 3)
v_rshr_pack_store<2>(D, v_load_expand(S0) + v_load_expand(S0 + 3) + v_load_expand(S1) + v_load_expand(S1 + 3));
#elif CV_SIMD_WIDTH == 32 || CV_SIMD_WIDTH == 64
for ( ; dx <= w - 3*v_int16::nlanes; dx += 3*v_int16::nlanes, S0 += 6*v_int16::nlanes, S1 += 6*v_int16::nlanes, D += 3*v_int16::nlanes)
{
v_int32 t0, t1, t2, t3, t4, t5;
v_int32 s0, s1, s2, s3, s4, s5;
s0 = vx_load_expand(S0 ) + vx_load_expand(S1 );
s1 = vx_load_expand(S0 + v_int32::nlanes) + vx_load_expand(S1 + v_int32::nlanes);
s2 = vx_load_expand(S0 + 2*v_int32::nlanes) + vx_load_expand(S1 + 2*v_int32::nlanes);
s3 = vx_load_expand(S0 + 3*v_int32::nlanes) + vx_load_expand(S1 + 3*v_int32::nlanes);
s4 = vx_load_expand(S0 + 4*v_int32::nlanes) + vx_load_expand(S1 + 4*v_int32::nlanes);
s5 = vx_load_expand(S0 + 5*v_int32::nlanes) + vx_load_expand(S1 + 5*v_int32::nlanes);
v_zip(s0, s3, t0, t1); v_zip(s1, s4, t2, t3); v_zip(s2, s5, t4, t5);
v_zip(t0, t3, s0, s1); v_zip(t1, t4, s2, s3); v_zip(t2, t5, s4, s5);
v_int32 bl, gl, rl;
v_zip(s0, s3, t0, t1); v_zip(s1, s4, t2, t3); v_zip(s2, s5, t4, t5);
#if CV_SIMD_WIDTH == 32
bl = t0 + t3; gl = t1 + t4; rl = t2 + t5;
#else //CV_SIMD_WIDTH == 64
v_zip(t0, t3, s0, s1); v_zip(t1, t4, s2, s3); v_zip(t2, t5, s4, s5);
bl = s0 + s3; gl = s1 + s4; rl = s2 + s5;
#endif
s0 = vx_load_expand(S0 + 6*v_int32::nlanes) + vx_load_expand(S1 + 6*v_int32::nlanes);
s1 = vx_load_expand(S0 + 7*v_int32::nlanes) + vx_load_expand(S1 + 7*v_int32::nlanes);
s2 = vx_load_expand(S0 + 8*v_int32::nlanes) + vx_load_expand(S1 + 8*v_int32::nlanes);
s3 = vx_load_expand(S0 + 9*v_int32::nlanes) + vx_load_expand(S1 + 9*v_int32::nlanes);
s4 = vx_load_expand(S0 +10*v_int32::nlanes) + vx_load_expand(S1 +10*v_int32::nlanes);
s5 = vx_load_expand(S0 +11*v_int32::nlanes) + vx_load_expand(S1 +11*v_int32::nlanes);
v_zip(s0, s3, t0, t1); v_zip(s1, s4, t2, t3); v_zip(s2, s5, t4, t5);
v_zip(t0, t3, s0, s1); v_zip(t1, t4, s2, s3); v_zip(t2, t5, s4, s5);
v_int32 bh, gh, rh;
v_zip(s0, s3, t0, t1); v_zip(s1, s4, t2, t3); v_zip(s2, s5, t4, t5);
#if CV_SIMD_WIDTH == 32
bh = t0 + t3; gh = t1 + t4; rh = t2 + t5;
#else //CV_SIMD_WIDTH == 64
v_zip(t0, t3, s0, s1); v_zip(t1, t4, s2, s3); v_zip(t2, t5, s4, s5);
bh = s0 + s3; gh = s1 + s4; rh = s2 + s5;
#endif
v_store_interleave(D, v_rshr_pack<2>(bl, bh), v_rshr_pack<2>(gl, gh), v_rshr_pack<2>(rl, rh));
}
#elif CV_SIMD_WIDTH >= 64
for ( ; dx <= w - 3*v_int16::nlanes; dx += 3*v_int16::nlanes, S0 += 6*v_int16::nlanes, S1 += 6*v_int16::nlanes, D += 3*v_int16::nlanes)
{
__m128i r0 = _mm_loadu_si128((const __m128i*)S0);
__m128i r1 = _mm_loadu_si128((const __m128i*)S1);
__m128i r0_16l = _mm_srai_epi32(_mm_unpacklo_epi16(zero, r0), 16);
__m128i r0_16h = _mm_srai_epi32(_mm_unpacklo_epi16(zero, _mm_srli_si128(r0, 6)), 16);
__m128i r1_16l = _mm_srai_epi32(_mm_unpacklo_epi16(zero, r1), 16);
__m128i r1_16h = _mm_srai_epi32(_mm_unpacklo_epi16(zero, _mm_srli_si128(r1, 6)), 16);
__m128i s0 = _mm_add_epi32(r0_16l, r0_16h);
__m128i s1 = _mm_add_epi32(r1_16l, r1_16h);
s0 = _mm_add_epi32(delta2, _mm_add_epi32(s0, s1));
s0 = _mm_packs_epi32(_mm_srai_epi32(s0, 2), zero);
_mm_storel_epi64((__m128i*)D, s0);
v_int16 b0, g0, r0, b1, g1, r1;
v_load_deinterleave(S0, b0, g0, r0);
v_load_deinterleave(S1, b1, g1, r1);
v_int32 bl = (v_reinterpret_as_s32(b0) >> 16) + ((v_reinterpret_as_s32(b0) << 16) >> 16) + (v_reinterpret_as_s32(b1) >> 16) + ((v_reinterpret_as_s32(b1) << 16) >> 16);
v_int32 gl = (v_reinterpret_as_s32(g0) >> 16) + ((v_reinterpret_as_s32(g0) << 16) >> 16) + (v_reinterpret_as_s32(g1) >> 16) + ((v_reinterpret_as_s32(g1) << 16) >> 16);
v_int32 rl = (v_reinterpret_as_s32(r0) >> 16) + ((v_reinterpret_as_s32(r0) << 16) >> 16) + (v_reinterpret_as_s32(r1) >> 16) + ((v_reinterpret_as_s32(r1) << 16) >> 16);
v_load_deinterleave(S0 + 3*v_int16::nlanes, b0, g0, r0);
v_load_deinterleave(S1 + 3*v_int16::nlanes, b1, g1, r1);
v_int32 bh = (v_reinterpret_as_s32(b0) >> 16) + ((v_reinterpret_as_s32(b0) << 16) >> 16) + (v_reinterpret_as_s32(b1) >> 16) + ((v_reinterpret_as_s32(b1) << 16) >> 16);
v_int32 gh = (v_reinterpret_as_s32(g0) >> 16) + ((v_reinterpret_as_s32(g0) << 16) >> 16) + (v_reinterpret_as_s32(g1) >> 16) + ((v_reinterpret_as_s32(g1) << 16) >> 16);
v_int32 rh = (v_reinterpret_as_s32(r0) >> 16) + ((v_reinterpret_as_s32(r0) << 16) >> 16) + (v_reinterpret_as_s32(r1) >> 16) + ((v_reinterpret_as_s32(r1) << 16) >> 16);
v_store_interleave(D, v_rshr_pack<2>(bl, bh), v_rshr_pack<2>(gl, gh), v_rshr_pack<2>(rl, rh));
}
#endif
}
else
{
CV_Assert(cn == 4);
for ( ; dx <= w - 4; dx += 4, S0 += 8, S1 += 8, D += 4)
for (; dx <= w - v_int16::nlanes; dx += v_int16::nlanes, S0 += 2 * v_int16::nlanes, S1 += 2 * v_int16::nlanes, D += v_int16::nlanes)
{
__m128i r0 = _mm_loadu_si128((const __m128i*)S0);
__m128i r1 = _mm_loadu_si128((const __m128i*)S1);
__m128i r0_32l = _mm_srai_epi32(_mm_unpacklo_epi16(zero, r0), 16);
__m128i r0_32h = _mm_srai_epi32(_mm_unpackhi_epi16(zero, r0), 16);
__m128i r1_32l = _mm_srai_epi32(_mm_unpacklo_epi16(zero, r1), 16);
__m128i r1_32h = _mm_srai_epi32(_mm_unpackhi_epi16(zero, r1), 16);
__m128i s0 = _mm_add_epi32(r0_32l, r0_32h);
__m128i s1 = _mm_add_epi32(r1_32l, r1_32h);
s0 = _mm_add_epi32(s1, _mm_add_epi32(s0, delta2));
s0 = _mm_packs_epi32(_mm_srai_epi32(s0, 2), zero);
_mm_storel_epi64((__m128i*)D, s0);
#if CV_SIMD_WIDTH >= 64
v_int64 r00, r01, r10, r11;
v_load_deinterleave((int64_t*)S0, r00, r01);
v_load_deinterleave((int64_t*)S1, r10, r11);
v_int32 r00l, r01l, r10l, r11l, r00h, r01h, r10h, r11h;
v_expand(v_reinterpret_as_s16(r00), r00l, r00h);
v_expand(v_reinterpret_as_s16(r01), r01l, r01h);
v_expand(v_reinterpret_as_s16(r10), r10l, r10h);
v_expand(v_reinterpret_as_s16(r11), r11l, r11h);
v_store(D, v_rshr_pack<2>(r00l + r01l + r10l + r11l, r00h + r01h + r10h + r11h));
#else
v_int32 r0, r1, r2, r3;
r0 = vx_load_expand(S0 ) + vx_load_expand(S1 );
r1 = vx_load_expand(S0 + v_int32::nlanes) + vx_load_expand(S1 + v_int32::nlanes);
r2 = vx_load_expand(S0 + 2*v_int32::nlanes) + vx_load_expand(S1 + 2*v_int32::nlanes);
r3 = vx_load_expand(S0 + 3*v_int32::nlanes) + vx_load_expand(S1 + 3*v_int32::nlanes);
v_int32 dl, dh;
#if CV_SIMD_WIDTH == 16
dl = r0 + r1; dh = r2 + r3;
#elif CV_SIMD_WIDTH == 32
v_int32 t0, t1, t2, t3;
v_recombine(r0, r1, t0, t1); v_recombine(r2, r3, t2, t3);
dl = t0 + t1; dh = t2 + t3;
#endif
v_store(D, v_rshr_pack<2>(dl, dh));
#endif
}
}
@@ -2967,7 +2530,6 @@ public:
private:
int cn;
int step;
bool use_simd;
};
struct ResizeAreaFastVec_SIMD_32f
@@ -2976,7 +2538,6 @@ struct ResizeAreaFastVec_SIMD_32f
cn(_cn), step(_step)
{
fast_mode = _scale_x == 2 && _scale_y == 2 && (cn == 1 || cn == 4);
fast_mode = fast_mode && checkHardwareSupport(CV_CPU_SSE2);
}
int operator() (const float * S, float * D, int w) const
@@ -2987,33 +2548,32 @@ struct ResizeAreaFastVec_SIMD_32f
const float * S0 = S, * S1 = (const float *)((const uchar *)(S0) + step);
int dx = 0;
__m128 v_025 = _mm_set1_ps(0.25f);
if (cn == 1)
{
const int shuffle_lo = _MM_SHUFFLE(2, 0, 2, 0), shuffle_hi = _MM_SHUFFLE(3, 1, 3, 1);
for ( ; dx <= w - 4; dx += 4, S0 += 8, S1 += 8, D += 4)
v_float32 v_025 = vx_setall_f32(0.25f);
for ( ; dx <= w - v_float32::nlanes; dx += v_float32::nlanes, S0 += 2*v_float32::nlanes, S1 += 2*v_float32::nlanes, D += v_float32::nlanes)
{
__m128 v_row00 = _mm_loadu_ps(S0), v_row01 = _mm_loadu_ps(S0 + 4),
v_row10 = _mm_loadu_ps(S1), v_row11 = _mm_loadu_ps(S1 + 4);
__m128 v_dst0 = _mm_add_ps(_mm_shuffle_ps(v_row00, v_row01, shuffle_lo),
_mm_shuffle_ps(v_row00, v_row01, shuffle_hi));
__m128 v_dst1 = _mm_add_ps(_mm_shuffle_ps(v_row10, v_row11, shuffle_lo),
_mm_shuffle_ps(v_row10, v_row11, shuffle_hi));
_mm_storeu_ps(D, _mm_mul_ps(_mm_add_ps(v_dst0, v_dst1), v_025));
v_float32 v_row00, v_row01, v_row10, v_row11;
v_load_deinterleave(S0, v_row00, v_row01);
v_load_deinterleave(S1, v_row10, v_row11);
v_store(D, ((v_row00 + v_row01) + (v_row10 + v_row11)) * v_025);
}
}
else if (cn == 4)
{
for ( ; dx <= w - 4; dx += 4, S0 += 8, S1 += 8, D += 4)
#if CV_SIMD_WIDTH == 16
v_float32 v_025 = vx_setall_f32(0.25f);
for (; dx <= w - v_float32::nlanes; dx += v_float32::nlanes, S0 += 2*v_float32::nlanes, S1 += 2*v_float32::nlanes, D += v_float32::nlanes)
v_store(D, ((vx_load(S0) + vx_load(S0 + v_float32::nlanes)) + (vx_load(S1) + vx_load(S1 + v_float32::nlanes))) * v_025);
#elif CV_SIMD256
v_float32x8 v_025 = v256_setall_f32(0.25f);
for (; dx <= w - v_float32x8::nlanes; dx += v_float32x8::nlanes, S0 += 2*v_float32x8::nlanes, S1 += 2*v_float32x8::nlanes, D += v_float32x8::nlanes)
{
__m128 v_dst0 = _mm_add_ps(_mm_loadu_ps(S0), _mm_loadu_ps(S0 + 4));
__m128 v_dst1 = _mm_add_ps(_mm_loadu_ps(S1), _mm_loadu_ps(S1 + 4));
_mm_storeu_ps(D, _mm_mul_ps(_mm_add_ps(v_dst0, v_dst1), v_025));
v_float32x8 dst0, dst1;
v_recombine(v256_load(S0) + v256_load(S1), v256_load(S0 + v_float32x8::nlanes) + v256_load(S1 + v_float32x8::nlanes), dst0, dst1);
v_store(D, (dst0 + dst1) * v_025);
}
#endif
}
return dx;