Commit a82e70cd authored by Tomoaki Teshima

remove raw SSE2/NEON implementation from imgwarp.cpp

  * use universal intrinsics instead of raw intrinsics
  * add 2-channel de-interleave on the x86 platform
  * add a v_int32x4 version of v_muladd
  * add an accumulating version of v_dotprod, based on the commit from seiko2plus on bf1852d
  * remove some verification checks from the performance tests
  * avoid out-of-bounds access while keeping performance
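
A minimal sketch (not part of the commit) showing how the two intrinsics added here compose; the lane values and the sketch() wrapper are illustrative only:

    #include "opencv2/core/hal/intrin.hpp"
    using namespace cv;

    void sketch()
    {
        v_int16x8 a = v_setall_s16(3), b = v_setall_s16(4);
        v_int32x4 c = v_setall_s32(100);
        // accumulating dot product: each 32-bit lane is a0*b0 + a1*b1 + c0
        v_int32x4 d = v_dotprod(a, b, c);              // 3*4 + 3*4 + 100 = 124
        // integer multiply-add, newly available for v_int32x4: a*b + c
        v_int32x4 e = v_muladd(d, v_setall_s32(2), c); // 124*2 + 100 = 348
        (void)e;
    }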
parent fdd83e50
......@@ -795,7 +795,7 @@ inline v_reg<_Tp, n> v_sqr_magnitude(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>
/** @brief Multiply and add
Returns \f$ a*b + c \f$
For floating point types only. */
For floating point types and signed 32-bit int only. */
template<typename _Tp, int n>
inline v_reg<_Tp, n> v_muladd(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b,
const v_reg<_Tp, n>& c)
......@@ -828,6 +828,29 @@ template<typename _Tp, int n> inline v_reg<typename V_TypeTraits<_Tp>::w_type, n
return c;
}
/** @brief Dot product of elements
Same as cv::v_dotprod, but adds a third element to the sum of adjacent pairs.
Scheme:
@code
{A1 A2 ...} // 16-bit
x {B1 B2 ...} // 16-bit
-------------
{A1B1+A2B2+C1 ...} // 32-bit
@endcode
Implemented only for 16-bit signed source type (v_int16x8).
*/
template<typename _Tp, int n> inline v_reg<typename V_TypeTraits<_Tp>::w_type, n/2>
v_dotprod(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b, const v_reg<typename V_TypeTraits<_Tp>::w_type, n / 2>& c)
{
typedef typename V_TypeTraits<_Tp>::w_type w_type;
v_reg<w_type, n/2> s;
for( int i = 0; i < (n/2); i++ )
s.s[i] = (w_type)a.s[i*2]*b.s[i*2] + (w_type)a.s[i*2+1]*b.s[i*2+1] + c.s[i];
return s;
}
/** @brief Multiply and expand
Multiply values of two registers and store the results in two registers with a wider pack type.
......
......@@ -506,6 +506,12 @@ inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b)
return v_int32x4(vaddq_s32(cd.val[0], cd.val[1]));
}
inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b, const v_int32x4& c)
{
v_int32x4 s = v_dotprod(a, b);
return v_int32x4(vaddq_s32(s.val, c.val));
}
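// Note: there is no single NEON instruction for the pairwise 16->32-bit
// multiply-accumulate used here, so the accumulating form reuses the
// two-operand v_dotprod and folds c in with vaddq_s32.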
#define OPENCV_HAL_IMPL_NEON_LOGIC_OP(_Tpvec, suffix) \
OPENCV_HAL_IMPL_NEON_BIN_OP(&, _Tpvec, vandq_##suffix) \
OPENCV_HAL_IMPL_NEON_BIN_OP(|, _Tpvec, vorrq_##suffix) \
......@@ -730,6 +736,11 @@ inline v_float32x4 v_muladd(const v_float32x4& a, const v_float32x4& b, const v_
return v_float32x4(vmlaq_f32(c.val, a.val, b.val));
}
inline v_int32x4 v_muladd(const v_int32x4& a, const v_int32x4& b, const v_int32x4& c)
{
return v_int32x4(vmlaq_s32(c.val, a.val, b.val));
}
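// Note: vmlaq_s32 is a native NEON multiply-accumulate, computing
// c + a * b per 32-bit lane.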
#if CV_SIMD128_64F
inline v_float64x2 v_magnitude(const v_float64x2& a, const v_float64x2& b)
{
......@@ -1095,6 +1106,18 @@ OPENCV_HAL_IMPL_NEON_EXTRACT(float32x4, f32)
OPENCV_HAL_IMPL_NEON_EXTRACT(float64x2, f64)
#endif
#if CV_SIMD128_64F
inline v_int32x4 v_round(const v_float32x4& a)
{
float32x4_t a_ = a.val;
int32x4_t result;
__asm__ ("fcvtns %0.4s, %1.4s"
: "=w"(result)
: "w"(a_)
: /* No clobbers */);
return v_int32x4(result);
}
#else
inline v_int32x4 v_round(const v_float32x4& a)
{
static const int32x4_t v_sign = vdupq_n_s32(1 << 31),
......@@ -1103,7 +1126,7 @@ inline v_int32x4 v_round(const v_float32x4& a)
int32x4_t v_addition = vorrq_s32(v_05, vandq_s32(v_sign, vreinterpretq_s32_f32(a.val)));
return v_int32x4(vcvtq_s32_f32(vaddq_f32(a.val, vreinterpretq_f32_s32(v_addition))));
}
#endif
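// Note: on AArch64 the fcvtns instruction above rounds to nearest with
// ties-to-even. The 32-bit ARM fallback emulates rounding because
// vcvtq_s32_f32 truncates toward zero: it ors the sign of a into 0.5f
// and adds that before converting, i.e. round-half-away-from-zero.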
inline v_int32x4 v_floor(const v_float32x4& a)
{
int32x4_t a1 = vcvtq_s32_f32(a.val);
......
......@@ -710,6 +710,11 @@ inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b)
return v_int32x4(_mm_madd_epi16(a.val, b.val));
}
inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b, const v_int32x4& c)
{
return v_int32x4(_mm_add_epi32(_mm_madd_epi16(a.val, b.val), c.val));
}
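// Note: _mm_madd_epi16 (pmaddwd) already produces the pairwise products,
// so the accumulator costs only one extra _mm_add_epi32.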
#define OPENCV_HAL_IMPL_SSE_LOGIC_OP(_Tpvec, suffix, not_const) \
OPENCV_HAL_IMPL_SSE_BIN_OP(&, _Tpvec, _mm_and_##suffix) \
OPENCV_HAL_IMPL_SSE_BIN_OP(|, _Tpvec, _mm_or_##suffix) \
......@@ -954,6 +959,10 @@ inline v_uint32x4 v_absdiff(const v_int32x4& a, const v_int32x4& b)
__m128i m = _mm_cmpgt_epi32(b.val, a.val);
return v_uint32x4(_mm_sub_epi32(_mm_xor_si128(d, m), m));
}
inline v_int32x4 v_muladd(const v_int32x4& a, const v_int32x4& b, const v_int32x4& c)
{
return a * b + c;
}
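// Note: composed from the existing v_int32x4 operators; SSE2 has no fused
// integer multiply-add (and no 32-bit lane multiply before SSE4.1's pmulld,
// so operator* itself is emulated).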
#define OPENCV_HAL_IMPL_SSE_MISC_FLT_OP(_Tpvec, _Tp, _Tpreg, suffix, absmask_vec) \
inline _Tpvec v_absdiff(const _Tpvec& a, const _Tpvec& b) \
......@@ -1599,7 +1608,7 @@ inline void v_load_deinterleave(const double *ptr, v_float64x2& a, v_float64x2&
c = v_reinterpret_as_f64(t2);
}
// 2-channel, float only
// 2-channel
inline void v_load_deinterleave(const float* ptr, v_float32x4& a, v_float32x4& b)
{
const int mask_lo = _MM_SHUFFLE(2, 0, 2, 0), mask_hi = _MM_SHUFFLE(3, 1, 3, 1);
......@@ -1611,7 +1620,29 @@ inline void v_load_deinterleave(const float* ptr, v_float32x4& a, v_float32x4& b
b.val = _mm_shuffle_ps(u0, u1, mask_hi); // b0 b1 b2 b3
}
inline void v_store_interleave( short* ptr, const v_int16x8& a, const v_int16x8& b )
inline void v_load_deinterleave(const short* ptr, v_int16x8& a, v_int16x8& b)
{
__m128i v0 = _mm_loadu_si128((__m128i*)(ptr)); // a0 b0 a1 b1 a2 b2 a3 b3
__m128i v1 = _mm_loadu_si128((__m128i*)(ptr + 8)); // a4 b4 a5 b5 a6 b6 a7 b7
__m128i v2 = _mm_unpacklo_epi16(v0, v1); // a0 a4 b0 b4 a1 a5 b1 b5
__m128i v3 = _mm_unpackhi_epi16(v0, v1); // a2 a6 b2 b6 a3 a7 b3 b7
__m128i v4 = _mm_unpacklo_epi16(v2, v3); // a0 a2 a4 a6 b0 b2 b4 b6
__m128i v5 = _mm_unpackhi_epi16(v2, v3); // a1 a3 a5 a7 b1 b3 b5 b7
a.val = _mm_unpacklo_epi16(v4, v5); // a0 a1 a2 a3 a4 a5 a6 a7
b.val = _mm_unpackhi_epi16(v4, v5); // b0 b1 b2 b3 b4 b5 b6 b7
}
inline void v_load_deinterleave(const ushort* ptr, v_uint16x8& a, v_uint16x8& b)
{
v_int16x8 sa, sb;
v_load_deinterleave((const short*)ptr, sa, sb);
a = v_reinterpret_as_u16(sa);
b = v_reinterpret_as_u16(sb);
}
inline void v_store_interleave(short* ptr, const v_int16x8& a, const v_int16x8& b)
{
__m128i t0, t1;
t0 = _mm_unpacklo_epi16(a.val, b.val);
......
......@@ -821,6 +821,9 @@ inline _Tpvec v_muladd(const _Tpvec& a, const _Tpvec& b, const _Tpvec& c) \
OPENCV_HAL_IMPL_VSX_MULADD(v_float32x4)
OPENCV_HAL_IMPL_VSX_MULADD(v_float64x2)
inline v_int32x4 v_muladd(const v_int32x4& a, const v_int32x4& b, const v_int32x4& c)
{ return a * b + c; }
// TODO: exp, log, sin, cos
/** Absolute values **/
......@@ -904,6 +907,9 @@ inline v_float64x2 v_cvt_f64_high(const v_float32x4& a)
inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b)
{ return v_int32x4(vec_msum(a.val, b.val, vec_int4_z)); }
inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b, const v_int32x4& c)
{ return v_int32x4(vec_msum(a.val, b.val, c.val)); }
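// Note: vec_msum natively takes an accumulator, so the three-operand form
// is a single instruction; the two-operand overload above simply passes a
// zero accumulator.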
inline v_float32x4 v_matmul(const v_float32x4& v, const v_float32x4& m0,
const v_float32x4& m1, const v_float32x4& m2,
const v_float32x4& m3)
......
......@@ -521,15 +521,25 @@ template<typename R> struct TheTest
TheTest & test_dot_prod()
{
typedef typename V_RegTrait128<LaneType>::w_reg Rx2;
typedef typename Rx2::lane_type w_type;
Data<R> dataA, dataB(2);
R a = dataA, b = dataB;
Data<Rx2> res = v_dotprod(a, b);
Data<Rx2> dataC;
dataC += std::numeric_limits<w_type>::is_signed ?
std::numeric_limits<w_type>::min() :
std::numeric_limits<w_type>::max() - R::nlanes * (dataB[0] + 1);
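// (illustrative note) c is seeded near the numeric limits of the widened
// type, so the accumulating v_dotprod is checked away from trivially
// small values.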
Rx2 c = dataC;
Data<Rx2> resD = v_dotprod(a, b),
resE = v_dotprod(a, b, c);
const int n = R::nlanes / 2;
for (int i = 0; i < n; ++i)
{
EXPECT_EQ(dataA[i*2] * dataB[i*2] + dataA[i*2 + 1] * dataB[i*2 + 1], res[i]);
EXPECT_EQ(dataA[i*2] * dataB[i*2] + dataA[i*2 + 1] * dataB[i*2 + 1], resD[i]);
EXPECT_EQ(dataA[i*2] * dataB[i*2] + dataA[i*2 + 1] * dataB[i*2 + 1] + dataC[i], resE[i]);
}
return *this;
}
......
......@@ -229,7 +229,7 @@ OCL_PERF_TEST_P(RemapFixture, Remap,
OCL_TEST_CYCLE() cv::remap(src, dst, xmap, ymap, interpolation, borderMode);
SANITY_CHECK(dst, eps);
SANITY_CHECK_NOTHING();
}
} } // namespace opencv_test::ocl
......
......@@ -202,8 +202,8 @@ PERF_TEST_P( TestWarpPerspectiveNear_t, WarpPerspectiveNear,
PERF_TEST_P( TestRemap, remap,
Combine(
Values( TYPICAL_MAT_TYPES ),
Values( szVGA, sz720p, sz1080p ),
Values( CV_8UC1, CV_8UC3, CV_8UC4, CV_32FC1 ),
Values( szVGA, sz1080p ),
InterType::all(),
BorderMode::all(),
RemapMode::all()
......@@ -231,7 +231,7 @@ PERF_TEST_P( TestRemap, remap,
remap(source, destination, map_x, map_y, interpolationType, borderMode);
}
SANITY_CHECK(destination, 1);
SANITY_CHECK_NOTHING();
}
void update_map(const Mat& src, Mat& map_x, Mat& map_y, const int remapMode )
......
......@@ -50,7 +50,7 @@
#include "precomp.hpp"
#include "opencl_kernels_imgproc.hpp"
#include "hal_replacement.hpp"
#include "opencv2/core/hal/intrin.hpp"
#include "opencv2/core/openvx/ovx_defs.hpp"
#include "imgwarp.hpp"
......@@ -130,7 +130,7 @@ static uchar NNDeltaTab_i[INTER_TAB_SIZE2][2];
static float BilinearTab_f[INTER_TAB_SIZE2][2][2];
static short BilinearTab_i[INTER_TAB_SIZE2][2][2];
#if CV_SSE2 || CV_NEON
#if CV_SIMD128
static short BilinearTab_iC4_buf[INTER_TAB_SIZE2+2][2][8];
static short (*BilinearTab_iC4)[2][8] = (short (*)[2][8])alignPtr(BilinearTab_iC4_buf, 16);
#endif
......@@ -266,7 +266,7 @@ static const void* initInterTab2D( int method, bool fixpt )
}
tab -= INTER_TAB_SIZE2*ksize*ksize;
itab -= INTER_TAB_SIZE2*ksize*ksize;
#if CV_SSE2 || CV_NEON
#if CV_SIMD128
if( method == INTER_LINEAR )
{
for( i = 0; i < INTER_TAB_SIZE2; i++ )
......@@ -432,7 +432,7 @@ struct RemapNoVec
const void*, int ) const { return 0; }
};
#if CV_SSE2
#if CV_SIMD128
struct RemapVec_8u
{
......@@ -441,190 +441,192 @@ struct RemapVec_8u
{
int cn = _src.channels(), x = 0, sstep = (int)_src.step;
if( (cn != 1 && cn != 3 && cn != 4) || !checkHardwareSupport(CV_CPU_SSE2) ||
if( (cn != 1 && cn != 3 && cn != 4) || !hasSIMD128() ||
sstep > 0x8000 )
return 0;
const uchar *S0 = _src.ptr(), *S1 = _src.ptr(1);
const short* wtab = cn == 1 ? (const short*)_wtab : &BilinearTab_iC4[0][0][0];
uchar* D = (uchar*)_dst;
__m128i delta = _mm_set1_epi32(INTER_REMAP_COEF_SCALE/2);
__m128i xy2ofs = _mm_set1_epi32(cn + (sstep << 16));
__m128i z = _mm_setzero_si128();
v_int32x4 delta = v_setall_s32(INTER_REMAP_COEF_SCALE / 2);
v_int16x8 xy2ofs = v_reinterpret_as_s16(v_setall_s32(cn + (sstep << 16)));
int CV_DECL_ALIGNED(16) iofs0[4], iofs1[4];
const uchar* src_limit_8bytes = _src.datalimit - v_int16x8::nlanes;
#define CV_PICK_AND_PACK_RGB(ptr, offset, result) \
{ \
const uchar* const p = ((const uchar*)ptr) + (offset); \
if (p <= src_limit_8bytes) \
{ \
v_uint8x16 rrggbb, dummy; \
v_uint16x8 rrggbb8, dummy8; \
v_uint8x16 rgb0 = v_reinterpret_as_u8(v_int32x4(*(int*)(p), 0, 0, 0)); \
v_uint8x16 rgb1 = v_reinterpret_as_u8(v_int32x4(*(int*)(p + 3), 0, 0, 0)); \
v_zip(rgb0, rgb1, rrggbb, dummy); \
v_expand(rrggbb, rrggbb8, dummy8); \
result = v_reinterpret_as_s16(rrggbb8); \
} \
else \
{ \
result = v_int16x8((short)p[0], (short)p[3], /* r0r1 */ \
(short)p[1], (short)p[4], /* g0g1 */ \
(short)p[2], (short)p[5], /* b0b1 */ 0, 0); \
} \
}
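// Note: the fast path covers one pixel pair with two overlapping 4-byte
// loads (touching bytes p..p+6); within v_int16x8::nlanes bytes of
// _src.datalimit it falls back to scalar loads so nothing is read past
// the buffer end.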
#define CV_PICK_AND_PACK_RGBA(ptr, offset, result) \
{ \
const uchar* const p = ((const uchar*)ptr) + (offset); \
CV_DbgAssert(p <= src_limit_8bytes); \
v_uint8x16 rrggbbaa, dummy; \
v_uint16x8 rrggbbaa8, dummy8; \
v_uint8x16 rgba0 = v_reinterpret_as_u8(v_int32x4(*(int*)(p), 0, 0, 0)); \
v_uint8x16 rgba1 = v_reinterpret_as_u8(v_int32x4(*(int*)(p + v_int32x4::nlanes), 0, 0, 0)); \
v_zip(rgba0, rgba1, rrggbbaa, dummy); \
v_expand(rrggbbaa, rrggbbaa8, dummy8); \
result = v_reinterpret_as_s16(rrggbbaa8); \
}
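// Note: for RGBA the two 4-byte loads fetch exactly one pixel pair; the
// 8-byte bound is only debug-asserted here, on the assumption that the
// caller's clipping keeps the offsets in range.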
#define CV_PICK_AND_PACK4(base,offset) \
v_uint16x8(*(ushort*)(base + offset[0]), *(ushort*)(base + offset[1]), \
*(ushort*)(base + offset[2]), *(ushort*)(base + offset[3]), \
0, 0, 0, 0)
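// Note: gathers four 16-bit loads (two adjacent 8-bit pixels each) from
// scattered offsets into the low four lanes of one vector.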
if( cn == 1 )
{
for( ; x <= width - 8; x += 8 )
{
__m128i xy0 = _mm_loadu_si128( (const __m128i*)(XY + x*2));
__m128i xy1 = _mm_loadu_si128( (const __m128i*)(XY + x*2 + 8));
__m128i v0, v1, v2, v3, a0, a1, b0, b1;
unsigned i0, i1;
xy0 = _mm_madd_epi16( xy0, xy2ofs );
xy1 = _mm_madd_epi16( xy1, xy2ofs );
_mm_store_si128( (__m128i*)iofs0, xy0 );
_mm_store_si128( (__m128i*)iofs1, xy1 );
i0 = *(ushort*)(S0 + iofs0[0]) + (*(ushort*)(S0 + iofs0[1]) << 16);
i1 = *(ushort*)(S0 + iofs0[2]) + (*(ushort*)(S0 + iofs0[3]) << 16);
v0 = _mm_unpacklo_epi32(_mm_cvtsi32_si128(i0), _mm_cvtsi32_si128(i1));
i0 = *(ushort*)(S1 + iofs0[0]) + (*(ushort*)(S1 + iofs0[1]) << 16);
i1 = *(ushort*)(S1 + iofs0[2]) + (*(ushort*)(S1 + iofs0[3]) << 16);
v1 = _mm_unpacklo_epi32(_mm_cvtsi32_si128(i0), _mm_cvtsi32_si128(i1));
v0 = _mm_unpacklo_epi8(v0, z);
v1 = _mm_unpacklo_epi8(v1, z);
a0 = _mm_unpacklo_epi32(_mm_loadl_epi64((__m128i*)(wtab+FXY[x]*4)),
_mm_loadl_epi64((__m128i*)(wtab+FXY[x+1]*4)));
a1 = _mm_unpacklo_epi32(_mm_loadl_epi64((__m128i*)(wtab+FXY[x+2]*4)),
_mm_loadl_epi64((__m128i*)(wtab+FXY[x+3]*4)));
b0 = _mm_unpacklo_epi64(a0, a1);
b1 = _mm_unpackhi_epi64(a0, a1);
v0 = _mm_madd_epi16(v0, b0);
v1 = _mm_madd_epi16(v1, b1);
v0 = _mm_add_epi32(_mm_add_epi32(v0, v1), delta);
i0 = *(ushort*)(S0 + iofs1[0]) + (*(ushort*)(S0 + iofs1[1]) << 16);
i1 = *(ushort*)(S0 + iofs1[2]) + (*(ushort*)(S0 + iofs1[3]) << 16);
v2 = _mm_unpacklo_epi32(_mm_cvtsi32_si128(i0), _mm_cvtsi32_si128(i1));
i0 = *(ushort*)(S1 + iofs1[0]) + (*(ushort*)(S1 + iofs1[1]) << 16);
i1 = *(ushort*)(S1 + iofs1[2]) + (*(ushort*)(S1 + iofs1[3]) << 16);
v3 = _mm_unpacklo_epi32(_mm_cvtsi32_si128(i0), _mm_cvtsi32_si128(i1));
v2 = _mm_unpacklo_epi8(v2, z);
v3 = _mm_unpacklo_epi8(v3, z);
a0 = _mm_unpacklo_epi32(_mm_loadl_epi64((__m128i*)(wtab+FXY[x+4]*4)),
_mm_loadl_epi64((__m128i*)(wtab+FXY[x+5]*4)));
a1 = _mm_unpacklo_epi32(_mm_loadl_epi64((__m128i*)(wtab+FXY[x+6]*4)),
_mm_loadl_epi64((__m128i*)(wtab+FXY[x+7]*4)));
b0 = _mm_unpacklo_epi64(a0, a1);
b1 = _mm_unpackhi_epi64(a0, a1);
v2 = _mm_madd_epi16(v2, b0);
v3 = _mm_madd_epi16(v3, b1);
v2 = _mm_add_epi32(_mm_add_epi32(v2, v3), delta);
v0 = _mm_srai_epi32(v0, INTER_REMAP_COEF_BITS);
v2 = _mm_srai_epi32(v2, INTER_REMAP_COEF_BITS);
v0 = _mm_packus_epi16(_mm_packs_epi32(v0, v2), z);
_mm_storel_epi64( (__m128i*)(D + x), v0 );
v_int16x8 _xy0 = v_load(XY + x*2);
v_int16x8 _xy1 = v_load(XY + x*2 + 8);
v_int32x4 v0, v1, v2, v3, a0, b0, c0, d0, a1, b1, c1, d1, a2, b2, c2, d2;
v_int32x4 xy0 = v_dotprod( _xy0, xy2ofs );
v_int32x4 xy1 = v_dotprod( _xy1, xy2ofs );
v_store( iofs0, xy0 );
v_store( iofs1, xy1 );
v_uint16x8 stub, dummy;
v_uint16x8 vec16;
vec16 = CV_PICK_AND_PACK4(S0, iofs0);
v_expand(v_reinterpret_as_u8(vec16), stub, dummy);
v0 = v_reinterpret_as_s32(stub);
vec16 = CV_PICK_AND_PACK4(S1, iofs0);
v_expand(v_reinterpret_as_u8(vec16), stub, dummy);
v1 = v_reinterpret_as_s32(stub);
v_zip(v_load_low((int*)(wtab + FXY[x] * 4)), v_load_low((int*)(wtab + FXY[x + 1] * 4)), a0, a1);
v_zip(v_load_low((int*)(wtab + FXY[x + 2] * 4)), v_load_low((int*)(wtab + FXY[x + 3] * 4)), b0, b1);
v_recombine(a0, b0, a2, b2);
v1 = v_dotprod(v_reinterpret_as_s16(v1), v_reinterpret_as_s16(b2), delta);
v0 = v_dotprod(v_reinterpret_as_s16(v0), v_reinterpret_as_s16(a2), v1);
vec16 = CV_PICK_AND_PACK4(S0, iofs1);
v_expand(v_reinterpret_as_u8(vec16), stub, dummy);
v2 = v_reinterpret_as_s32(stub);
vec16 = CV_PICK_AND_PACK4(S1, iofs1);
v_expand(v_reinterpret_as_u8(vec16), stub, dummy);
v3 = v_reinterpret_as_s32(stub);
v_zip(v_load_low((int*)(wtab + FXY[x + 4] * 4)), v_load_low((int*)(wtab + FXY[x + 5] * 4)), c0, c1);
v_zip(v_load_low((int*)(wtab + FXY[x + 6] * 4)), v_load_low((int*)(wtab + FXY[x + 7] * 4)), d0, d1);
v_recombine(c0, d0, c2, d2);
v3 = v_dotprod(v_reinterpret_as_s16(v3), v_reinterpret_as_s16(d2), delta);
v2 = v_dotprod(v_reinterpret_as_s16(v2), v_reinterpret_as_s16(c2), v3);
v0 = v0 >> INTER_REMAP_COEF_BITS;
v2 = v2 >> INTER_REMAP_COEF_BITS;
v_pack_u_store(D + x, v_pack(v0, v2));
}
}
else if( cn == 3 )
{
for( ; x <= width - 5; x += 4, D += 12 )
{
__m128i xy0 = _mm_loadu_si128( (const __m128i*)(XY + x*2));
__m128i u0, v0, u1, v1;
xy0 = _mm_madd_epi16( xy0, xy2ofs );
_mm_store_si128( (__m128i*)iofs0, xy0 );
const __m128i *w0, *w1;
w0 = (const __m128i*)(wtab + FXY[x]*16);
w1 = (const __m128i*)(wtab + FXY[x+1]*16);
u0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int*)(S0 + iofs0[0])),
_mm_cvtsi32_si128(*(int*)(S0 + iofs0[0] + 3)));
v0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int*)(S1 + iofs0[0])),
_mm_cvtsi32_si128(*(int*)(S1 + iofs0[0] + 3)));
u1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int*)(S0 + iofs0[1])),
_mm_cvtsi32_si128(*(int*)(S0 + iofs0[1] + 3)));
v1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int*)(S1 + iofs0[1])),
_mm_cvtsi32_si128(*(int*)(S1 + iofs0[1] + 3)));
u0 = _mm_unpacklo_epi8(u0, z);
v0 = _mm_unpacklo_epi8(v0, z);
u1 = _mm_unpacklo_epi8(u1, z);
v1 = _mm_unpacklo_epi8(v1, z);
u0 = _mm_add_epi32(_mm_madd_epi16(u0, w0[0]), _mm_madd_epi16(v0, w0[1]));
u1 = _mm_add_epi32(_mm_madd_epi16(u1, w1[0]), _mm_madd_epi16(v1, w1[1]));
u0 = _mm_srai_epi32(_mm_add_epi32(u0, delta), INTER_REMAP_COEF_BITS);
u1 = _mm_srai_epi32(_mm_add_epi32(u1, delta), INTER_REMAP_COEF_BITS);
u0 = _mm_slli_si128(u0, 4);
u0 = _mm_packs_epi32(u0, u1);
u0 = _mm_packus_epi16(u0, u0);
_mm_storel_epi64((__m128i*)D, _mm_srli_si128(u0,1));
w0 = (const __m128i*)(wtab + FXY[x+2]*16);
w1 = (const __m128i*)(wtab + FXY[x+3]*16);
u0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int*)(S0 + iofs0[2])),
_mm_cvtsi32_si128(*(int*)(S0 + iofs0[2] + 3)));
v0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int*)(S1 + iofs0[2])),
_mm_cvtsi32_si128(*(int*)(S1 + iofs0[2] + 3)));
u1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int*)(S0 + iofs0[3])),
_mm_cvtsi32_si128(*(int*)(S0 + iofs0[3] + 3)));
v1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int*)(S1 + iofs0[3])),
_mm_cvtsi32_si128(*(int*)(S1 + iofs0[3] + 3)));
u0 = _mm_unpacklo_epi8(u0, z);
v0 = _mm_unpacklo_epi8(v0, z);
u1 = _mm_unpacklo_epi8(u1, z);
v1 = _mm_unpacklo_epi8(v1, z);
u0 = _mm_add_epi32(_mm_madd_epi16(u0, w0[0]), _mm_madd_epi16(v0, w0[1]));
u1 = _mm_add_epi32(_mm_madd_epi16(u1, w1[0]), _mm_madd_epi16(v1, w1[1]));
u0 = _mm_srai_epi32(_mm_add_epi32(u0, delta), INTER_REMAP_COEF_BITS);
u1 = _mm_srai_epi32(_mm_add_epi32(u1, delta), INTER_REMAP_COEF_BITS);
u0 = _mm_slli_si128(u0, 4);
u0 = _mm_packs_epi32(u0, u1);
u0 = _mm_packus_epi16(u0, u0);
_mm_storel_epi64((__m128i*)(D + 6), _mm_srli_si128(u0,1));
v_int16x8 u0, v0, u1, v1;
v_int16x8 _xy0 = v_load(XY + x * 2);
v_int32x4 xy0 = v_dotprod(_xy0, xy2ofs);
v_store(iofs0, xy0);
int offset0 = FXY[x] * 16;
int offset1 = FXY[x + 1] * 16;
int offset2 = FXY[x + 2] * 16;
int offset3 = FXY[x + 3] * 16;
v_int16x8 w00 = v_load(wtab + offset0);
v_int16x8 w01 = v_load(wtab + offset0 + 8);
v_int16x8 w10 = v_load(wtab + offset1);
v_int16x8 w11 = v_load(wtab + offset1 + 8);
CV_PICK_AND_PACK_RGB(S0, iofs0[0], u0);
CV_PICK_AND_PACK_RGB(S1, iofs0[0], v0);
CV_PICK_AND_PACK_RGB(S0, iofs0[1], u1);
CV_PICK_AND_PACK_RGB(S1, iofs0[1], v1);
v_int32x4 result0 = v_dotprod(u0, w00, v_dotprod(v0, w01, delta)) >> INTER_REMAP_COEF_BITS;
v_int32x4 result1 = v_dotprod(u1, w10, v_dotprod(v1, w11, delta)) >> INTER_REMAP_COEF_BITS;
result0 = v_rotate_left<1>(result0);
v_int16x8 result8 = v_pack(result0, result1);
v_uint8x16 result16 = v_pack_u(result8, result8);
v_store_low(D, v_rotate_right<1>(result16));
w00 = v_load(wtab + offset2);
w01 = v_load(wtab + offset2 + 8);
w10 = v_load(wtab + offset3);
w11 = v_load(wtab + offset3 + 8);
CV_PICK_AND_PACK_RGB(S0, iofs0[2], u0);
CV_PICK_AND_PACK_RGB(S1, iofs0[2], v0);
CV_PICK_AND_PACK_RGB(S0, iofs0[3], u1);
CV_PICK_AND_PACK_RGB(S1, iofs0[3], v1);
result0 = v_dotprod(u0, w00, v_dotprod(v0, w01, delta)) >> INTER_REMAP_COEF_BITS;
result1 = v_dotprod(u1, w10, v_dotprod(v1, w11, delta)) >> INTER_REMAP_COEF_BITS;
result0 = v_rotate_left<1>(result0);
result8 = v_pack(result0, result1);
result16 = v_pack_u(result8, result8);
v_store_low(D + 6, v_rotate_right<1>(result16));
}
}
else if( cn == 4 )
{
for( ; x <= width - 4; x += 4, D += 16 )
{
__m128i xy0 = _mm_loadu_si128( (const __m128i*)(XY + x*2));
__m128i u0, v0, u1, v1;
xy0 = _mm_madd_epi16( xy0, xy2ofs );
_mm_store_si128( (__m128i*)iofs0, xy0 );
const __m128i *w0, *w1;
w0 = (const __m128i*)(wtab + FXY[x]*16);
w1 = (const __m128i*)(wtab + FXY[x+1]*16);
u0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int*)(S0 + iofs0[0])),
_mm_cvtsi32_si128(*(int*)(S0 + iofs0[0] + 4)));
v0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int*)(S1 + iofs0[0])),
_mm_cvtsi32_si128(*(int*)(S1 + iofs0[0] + 4)));
u1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int*)(S0 + iofs0[1])),
_mm_cvtsi32_si128(*(int*)(S0 + iofs0[1] + 4)));
v1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int*)(S1 + iofs0[1])),
_mm_cvtsi32_si128(*(int*)(S1 + iofs0[1] + 4)));
u0 = _mm_unpacklo_epi8(u0, z);
v0 = _mm_unpacklo_epi8(v0, z);
u1 = _mm_unpacklo_epi8(u1, z);
v1 = _mm_unpacklo_epi8(v1, z);
u0 = _mm_add_epi32(_mm_madd_epi16(u0, w0[0]), _mm_madd_epi16(v0, w0[1]));
u1 = _mm_add_epi32(_mm_madd_epi16(u1, w1[0]), _mm_madd_epi16(v1, w1[1]));
u0 = _mm_srai_epi32(_mm_add_epi32(u0, delta), INTER_REMAP_COEF_BITS);
u1 = _mm_srai_epi32(_mm_add_epi32(u1, delta), INTER_REMAP_COEF_BITS);
u0 = _mm_packs_epi32(u0, u1);
u0 = _mm_packus_epi16(u0, u0);
_mm_storel_epi64((__m128i*)D, u0);
w0 = (const __m128i*)(wtab + FXY[x+2]*16);
w1 = (const __m128i*)(wtab + FXY[x+3]*16);
u0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int*)(S0 + iofs0[2])),
_mm_cvtsi32_si128(*(int*)(S0 + iofs0[2] + 4)));
v0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int*)(S1 + iofs0[2])),
_mm_cvtsi32_si128(*(int*)(S1 + iofs0[2] + 4)));
u1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int*)(S0 + iofs0[3])),
_mm_cvtsi32_si128(*(int*)(S0 + iofs0[3] + 4)));
v1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int*)(S1 + iofs0[3])),
_mm_cvtsi32_si128(*(int*)(S1 + iofs0[3] + 4)));
u0 = _mm_unpacklo_epi8(u0, z);
v0 = _mm_unpacklo_epi8(v0, z);
u1 = _mm_unpacklo_epi8(u1, z);
v1 = _mm_unpacklo_epi8(v1, z);
u0 = _mm_add_epi32(_mm_madd_epi16(u0, w0[0]), _mm_madd_epi16(v0, w0[1]));
u1 = _mm_add_epi32(_mm_madd_epi16(u1, w1[0]), _mm_madd_epi16(v1, w1[1]));
u0 = _mm_srai_epi32(_mm_add_epi32(u0, delta), INTER_REMAP_COEF_BITS);
u1 = _mm_srai_epi32(_mm_add_epi32(u1, delta), INTER_REMAP_COEF_BITS);
u0 = _mm_packs_epi32(u0, u1);
u0 = _mm_packus_epi16(u0, u0);
_mm_storel_epi64((__m128i*)(D + 8), u0);
v_int16x8 _xy0 = v_load(XY + x * 2);
v_int16x8 u0, v0, u1, v1;
v_int32x4 xy0 = v_dotprod( _xy0, xy2ofs );
v_store(iofs0, xy0);
int offset0 = FXY[x] * 16;
int offset1 = FXY[x + 1] * 16;
int offset2 = FXY[x + 2] * 16;
int offset3 = FXY[x + 3] * 16;
v_int16x8 w00 = v_load(wtab + offset0);
v_int16x8 w01 = v_load(wtab + offset0 + 8);
v_int16x8 w10 = v_load(wtab + offset1);
v_int16x8 w11 = v_load(wtab + offset1 + 8);
CV_PICK_AND_PACK_RGBA(S0, iofs0[0], u0);
CV_PICK_AND_PACK_RGBA(S1, iofs0[0], v0);
CV_PICK_AND_PACK_RGBA(S0, iofs0[1], u1);
CV_PICK_AND_PACK_RGBA(S1, iofs0[1], v1);
v_int32x4 result0 = v_dotprod(u0, w00, v_dotprod(v0, w01, delta)) >> INTER_REMAP_COEF_BITS;
v_int32x4 result1 = v_dotprod(u1, w10, v_dotprod(v1, w11, delta)) >> INTER_REMAP_COEF_BITS;
v_int16x8 result8 = v_pack(result0, result1);
v_pack_u_store(D, result8);
w00 = v_load(wtab + offset2);
w01 = v_load(wtab + offset2 + 8);
w10 = v_load(wtab + offset3);
w11 = v_load(wtab + offset3 + 8);
CV_PICK_AND_PACK_RGBA(S0, iofs0[2], u0);
CV_PICK_AND_PACK_RGBA(S1, iofs0[2], v0);
CV_PICK_AND_PACK_RGBA(S0, iofs0[3], u1);
CV_PICK_AND_PACK_RGBA(S1, iofs0[3], v1);
result0 = v_dotprod(u0, w00, v_dotprod(v0, w01, delta)) >> INTER_REMAP_COEF_BITS;
result1 = v_dotprod(u1, w10, v_dotprod(v1, w11, delta)) >> INTER_REMAP_COEF_BITS;
result8 = v_pack(result0, result1);
v_pack_u_store(D + 8, result8);
}
}
......@@ -660,7 +662,7 @@ static void remapBilinear( const Mat& _src, Mat& _dst, const Mat& _xy,
unsigned width1 = std::max(ssize.width-1, 0), height1 = std::max(ssize.height-1, 0);
CV_Assert( ssize.area() > 0 );
#if CV_SSE2
#if CV_SIMD128
if( _src.type() == CV_8UC3 )
width1 = std::max(ssize.width-2, 0);
#endif
......@@ -1091,9 +1093,9 @@ public:
int brows0 = std::min(128, dst->rows), map_depth = m1->depth();
int bcols0 = std::min(buf_size/brows0, dst->cols);
brows0 = std::min(buf_size/bcols0, dst->rows);
#if CV_SSE2
bool useSIMD = checkHardwareSupport(CV_CPU_SSE2);
#endif
#if CV_SIMD128
bool useSIMD = hasSIMD128();
#endif
Mat _bufxy(brows0, bcols0, CV_16SC2), _bufa;
if( !nnfunc )
......@@ -1139,29 +1141,24 @@ public:
const float* sY = m2->ptr<float>(y+y1) + x;
x1 = 0;
#if CV_SSE2
#if CV_SIMD128
if( useSIMD )
{
for( ; x1 <= bcols - 8; x1 += 8 )
int span = v_float32x4::nlanes;
for( ; x1 <= bcols - span * 2; x1 += span * 2 )
{
__m128 fx0 = _mm_loadu_ps(sX + x1);
__m128 fx1 = _mm_loadu_ps(sX + x1 + 4);
__m128 fy0 = _mm_loadu_ps(sY + x1);
__m128 fy1 = _mm_loadu_ps(sY + x1 + 4);
__m128i ix0 = _mm_cvtps_epi32(fx0);
__m128i ix1 = _mm_cvtps_epi32(fx1);
__m128i iy0 = _mm_cvtps_epi32(fy0);
__m128i iy1 = _mm_cvtps_epi32(fy1);
ix0 = _mm_packs_epi32(ix0, ix1);
iy0 = _mm_packs_epi32(iy0, iy1);
ix1 = _mm_unpacklo_epi16(ix0, iy0);
iy1 = _mm_unpackhi_epi16(ix0, iy0);
_mm_storeu_si128((__m128i*)(XY + x1*2), ix1);
_mm_storeu_si128((__m128i*)(XY + x1*2 + 8), iy1);
v_int32x4 ix0 = v_round(v_load(sX + x1));
v_int32x4 iy0 = v_round(v_load(sY + x1));
v_int32x4 ix1 = v_round(v_load(sX + x1 + span));
v_int32x4 iy1 = v_round(v_load(sY + x1 + span));
v_int16x8 dx, dy;
dx = v_pack(ix0, ix1);
dy = v_pack(iy0, iy1);
v_store_interleave(XY + x1 * 2, dx, dy);
}
}
#endif
for( ; x1 < bcols; x1++ )
{
XY[x1*2] = saturate_cast<short>(sX[x1]);
......@@ -1186,16 +1183,15 @@ public:
const ushort* sA = m2->ptr<ushort>(y+y1) + x;
x1 = 0;
#if CV_NEON
uint16x8_t v_scale = vdupq_n_u16(INTER_TAB_SIZE2-1);
for ( ; x1 <= bcols - 8; x1 += 8)
vst1q_u16(A + x1, vandq_u16(vld1q_u16(sA + x1), v_scale));
#elif CV_SSE2
__m128i v_scale = _mm_set1_epi16(INTER_TAB_SIZE2-1);
for ( ; x1 <= bcols - 8; x1 += 8)
_mm_storeu_si128((__m128i *)(A + x1), _mm_and_si128(_mm_loadu_si128((const __m128i *)(sA + x1)), v_scale));
#if CV_SIMD128
if (useSIMD)
{
v_uint16x8 v_scale = v_setall_u16(INTER_TAB_SIZE2 - 1);
int span = v_uint16x8::nlanes;
for( ; x1 <= bcols - span; x1 += span )
v_store((unsigned short*)(A + x1), v_load(sA + x1) & v_scale);
}
#endif
for( ; x1 < bcols; x1++ )
A[x1] = (ushort)(sA[x1] & (INTER_TAB_SIZE2-1));
}
......@@ -1205,60 +1201,29 @@ public:
const float* sY = m2->ptr<float>(y+y1) + x;
x1 = 0;
#if CV_SSE2
#if CV_SIMD128
if( useSIMD )
{
__m128 scale = _mm_set1_ps((float)INTER_TAB_SIZE);
__m128i mask = _mm_set1_epi32(INTER_TAB_SIZE-1);
for( ; x1 <= bcols - 8; x1 += 8 )
{
__m128 fx0 = _mm_loadu_ps(sX + x1);
__m128 fx1 = _mm_loadu_ps(sX + x1 + 4);
__m128 fy0 = _mm_loadu_ps(sY + x1);
__m128 fy1 = _mm_loadu_ps(sY + x1 + 4);
__m128i ix0 = _mm_cvtps_epi32(_mm_mul_ps(fx0, scale));
__m128i ix1 = _mm_cvtps_epi32(_mm_mul_ps(fx1, scale));
__m128i iy0 = _mm_cvtps_epi32(_mm_mul_ps(fy0, scale));
__m128i iy1 = _mm_cvtps_epi32(_mm_mul_ps(fy1, scale));
__m128i mx0 = _mm_and_si128(ix0, mask);
__m128i mx1 = _mm_and_si128(ix1, mask);
__m128i my0 = _mm_and_si128(iy0, mask);
__m128i my1 = _mm_and_si128(iy1, mask);
mx0 = _mm_packs_epi32(mx0, mx1);
my0 = _mm_packs_epi32(my0, my1);
my0 = _mm_slli_epi16(my0, INTER_BITS);
mx0 = _mm_or_si128(mx0, my0);
_mm_storeu_si128((__m128i*)(A + x1), mx0);
ix0 = _mm_srai_epi32(ix0, INTER_BITS);
ix1 = _mm_srai_epi32(ix1, INTER_BITS);
iy0 = _mm_srai_epi32(iy0, INTER_BITS);
iy1 = _mm_srai_epi32(iy1, INTER_BITS);
ix0 = _mm_packs_epi32(ix0, ix1);
iy0 = _mm_packs_epi32(iy0, iy1);
ix1 = _mm_unpacklo_epi16(ix0, iy0);
iy1 = _mm_unpackhi_epi16(ix0, iy0);
_mm_storeu_si128((__m128i*)(XY + x1*2), ix1);
_mm_storeu_si128((__m128i*)(XY + x1*2 + 8), iy1);
}
}
#elif CV_NEON
float32x4_t v_scale = vdupq_n_f32((float)INTER_TAB_SIZE);
int32x4_t v_scale2 = vdupq_n_s32(INTER_TAB_SIZE - 1), v_scale3 = vdupq_n_s32(INTER_TAB_SIZE);
for( ; x1 <= bcols - 4; x1 += 4 )
{
int32x4_t v_sx = cv_vrndq_s32_f32(vmulq_f32(vld1q_f32(sX + x1), v_scale)),
v_sy = cv_vrndq_s32_f32(vmulq_f32(vld1q_f32(sY + x1), v_scale));
int32x4_t v_v = vmlaq_s32(vandq_s32(v_sx, v_scale2), v_scale3,
vandq_s32(v_sy, v_scale2));
vst1_u16(A + x1, vqmovun_s32(v_v));
int16x4x2_t v_dst = vzip_s16(vqmovn_s32(vshrq_n_s32(v_sx, INTER_BITS)),
vqmovn_s32(vshrq_n_s32(v_sy, INTER_BITS)));
vst1q_s16(XY + (x1 << 1), vcombine_s16(v_dst.val[0], v_dst.val[1]));
v_float32x4 v_scale = v_setall_f32((float)INTER_TAB_SIZE);
v_int32x4 v_scale2 = v_setall_s32(INTER_TAB_SIZE - 1);
int span = v_float32x4::nlanes;
for( ; x1 <= bcols - span * 2; x1 += span * 2 )
{
v_int32x4 v_sx0 = v_round(v_scale * v_load(sX + x1));
v_int32x4 v_sy0 = v_round(v_scale * v_load(sY + x1));
v_int32x4 v_sx1 = v_round(v_scale * v_load(sX + x1 + span));
v_int32x4 v_sy1 = v_round(v_scale * v_load(sY + x1 + span));
v_uint16x8 v_sx8 = v_reinterpret_as_u16(v_pack(v_sx0 & v_scale2, v_sx1 & v_scale2));
v_uint16x8 v_sy8 = v_reinterpret_as_u16(v_pack(v_sy0 & v_scale2, v_sy1 & v_scale2));
v_uint16x8 v_v = v_shl<INTER_BITS>(v_sy8) | (v_sx8);
v_store(A + x1, v_v);
v_int16x8 v_d0 = v_pack(v_shr<INTER_BITS>(v_sx0), v_shr<INTER_BITS>(v_sx1));
v_int16x8 v_d1 = v_pack(v_shr<INTER_BITS>(v_sy0), v_shr<INTER_BITS>(v_sy1));
v_store_interleave(XY + (x1 << 1), v_d0, v_d1);
}
}
#endif
for( ; x1 < bcols; x1++ )
{
int sx = cvRound(sX[x1]*INTER_TAB_SIZE);
......@@ -1274,26 +1239,33 @@ public:
const float* sXY = m1->ptr<float>(y+y1) + x*2;
x1 = 0;
#if CV_NEON
float32x4_t v_scale = vdupq_n_f32(INTER_TAB_SIZE);
int32x4_t v_scale2 = vdupq_n_s32(INTER_TAB_SIZE-1), v_scale3 = vdupq_n_s32(INTER_TAB_SIZE);
for( ; x1 <= bcols - 4; x1 += 4 )
#if CV_SIMD128
if( useSIMD )
{
float32x4x2_t v_src = vld2q_f32(sXY + (x1 << 1));
int32x4_t v_sx = cv_vrndq_s32_f32(vmulq_f32(v_src.val[0], v_scale));
int32x4_t v_sy = cv_vrndq_s32_f32(vmulq_f32(v_src.val[1], v_scale));
int32x4_t v_v = vmlaq_s32(vandq_s32(v_sx, v_scale2), v_scale3,
vandq_s32(v_sy, v_scale2));
vst1_u16(A + x1, vqmovun_s32(v_v));
int16x4x2_t v_dst = vzip_s16(vqmovn_s32(vshrq_n_s32(v_sx, INTER_BITS)),
vqmovn_s32(vshrq_n_s32(v_sy, INTER_BITS)));
vst1q_s16(XY + (x1 << 1), vcombine_s16(v_dst.val[0], v_dst.val[1]));
v_float32x4 v_scale = v_setall_f32((float)INTER_TAB_SIZE);
v_int32x4 v_scale2 = v_setall_s32(INTER_TAB_SIZE - 1), v_scale3 = v_setall_s32(INTER_TAB_SIZE);
int span = v_float32x4::nlanes;
for( ; x1 <= bcols - span * 2; x1 += span * 2 )
{
v_float32x4 v_fx, v_fy;
v_load_deinterleave(sXY + (x1 << 1), v_fx, v_fy);
v_int32x4 v_sx0 = v_round(v_fx * v_scale);
v_int32x4 v_sy0 = v_round(v_fy * v_scale);
v_load_deinterleave(sXY + ((x1 + span) << 1), v_fx, v_fy);
v_int32x4 v_sx1 = v_round(v_fx * v_scale);
v_int32x4 v_sy1 = v_round(v_fy * v_scale);
v_int32x4 v_v0 = v_muladd(v_scale3, (v_sy0 & v_scale2), (v_sx0 & v_scale2));
v_int32x4 v_v1 = v_muladd(v_scale3, (v_sy1 & v_scale2), (v_sx1 & v_scale2));
v_uint16x8 v_v8 = v_reinterpret_as_u16(v_pack(v_v0, v_v1));
v_store(A + x1, v_v8);
v_int16x8 v_dx = v_pack(v_shr<INTER_BITS>(v_sx0), v_shr<INTER_BITS>(v_sx1));
v_int16x8 v_dy = v_pack(v_shr<INTER_BITS>(v_sy0), v_shr<INTER_BITS>(v_sy1));
v_store_interleave(XY + (x1 << 1), v_dx, v_dy);
}
}
#endif
for( x1 = 0; x1 < bcols; x1++ )
for( ; x1 < bcols; x1++ )
{
int sx = cvRound(sXY[x1*2]*INTER_TAB_SIZE);
int sy = cvRound(sXY[x1*2+1]*INTER_TAB_SIZE);
......@@ -1915,8 +1887,8 @@ void cv::convertMaps( InputArray _map1, InputArray _map2,
size.height = 1;
}
#if CV_SSE2
bool useSSE2 = checkHardwareSupport(CV_CPU_SSE2);
#if CV_SIMD128
bool useSIMD = hasSIMD128();
#endif
#if CV_TRY_SSE4_1
bool useSSE4_1 = CV_CPU_HAS_SUPPORT_SSE4_1;
......@@ -1941,59 +1913,66 @@ void cv::convertMaps( InputArray _map1, InputArray _map2,
{
if( nninterpolate )
{
#if CV_NEON
for( ; x <= size.width - 8; x += 8 )
{
int16x8x2_t v_dst;
v_dst.val[0] = vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(vld1q_f32(src1f + x))),
vqmovn_s32(cv_vrndq_s32_f32(vld1q_f32(src1f + x + 4))));
v_dst.val[1] = vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(vld1q_f32(src2f + x))),
vqmovn_s32(cv_vrndq_s32_f32(vld1q_f32(src2f + x + 4))));
vst2q_s16(dst1 + (x << 1), v_dst);
}
#elif CV_TRY_SSE4_1
#if CV_TRY_SSE4_1
if (useSSE4_1)
opt_SSE4_1::convertMaps_nninterpolate32f1c16s_SSE41(src1f, src2f, dst1, size.width);
else
#endif
{
#if CV_SIMD128
if( useSIMD )
{
int span = v_int16x8::nlanes;
for( ; x <= size.width - span; x += span )
{
v_int16x8 v_dst[2];
#define CV_PACK_MAP(X) v_pack(v_round(v_load(X)), v_round(v_load((X)+4)))
v_dst[0] = CV_PACK_MAP(src1f + x);
v_dst[1] = CV_PACK_MAP(src2f + x);
#undef CV_PACK_MAP
v_store_interleave(dst1 + (x << 1), v_dst[0], v_dst[1]);
}
}
#endif
for( ; x < size.width; x++ )
{
dst1[x*2] = saturate_cast<short>(src1f[x]);
dst1[x*2+1] = saturate_cast<short>(src2f[x]);
}
}
}
else
{
#if CV_NEON
float32x4_t v_scale = vdupq_n_f32((float)INTER_TAB_SIZE);
int32x4_t v_mask = vdupq_n_s32(INTER_TAB_SIZE - 1);
for( ; x <= size.width - 8; x += 8 )
#if CV_TRY_SSE4_1
if (useSSE4_1)
opt_SSE4_1::convertMaps_32f1c16s_SSE41(src1f, src2f, dst1, dst2, size.width);
else
#endif
{
int32x4_t v_ix0 = cv_vrndq_s32_f32(vmulq_f32(vld1q_f32(src1f + x), v_scale));
int32x4_t v_ix1 = cv_vrndq_s32_f32(vmulq_f32(vld1q_f32(src1f + x + 4), v_scale));
int32x4_t v_iy0 = cv_vrndq_s32_f32(vmulq_f32(vld1q_f32(src2f + x), v_scale));
int32x4_t v_iy1 = cv_vrndq_s32_f32(vmulq_f32(vld1q_f32(src2f + x + 4), v_scale));
int16x8x2_t v_dst;
v_dst.val[0] = vcombine_s16(vqmovn_s32(vshrq_n_s32(v_ix0, INTER_BITS)),
vqmovn_s32(vshrq_n_s32(v_ix1, INTER_BITS)));
v_dst.val[1] = vcombine_s16(vqmovn_s32(vshrq_n_s32(v_iy0, INTER_BITS)),
vqmovn_s32(vshrq_n_s32(v_iy1, INTER_BITS)));
#if CV_SIMD128
if( useSIMD )
{
v_float32x4 v_scale = v_setall_f32((float)INTER_TAB_SIZE);
v_int32x4 v_mask = v_setall_s32(INTER_TAB_SIZE - 1);
v_int32x4 v_scale3 = v_setall_s32(INTER_TAB_SIZE);
int span = v_float32x4::nlanes;
for( ; x <= size.width - span * 2; x += span * 2 )
{
v_int32x4 v_ix0 = v_round(v_scale * (v_load(src1f + x)));
v_int32x4 v_ix1 = v_round(v_scale * (v_load(src1f + x + span)));
v_int32x4 v_iy0 = v_round(v_scale * (v_load(src2f + x)));
v_int32x4 v_iy1 = v_round(v_scale * (v_load(src2f + x + span)));
vst2q_s16(dst1 + (x << 1), v_dst);
v_int16x8 v_dst[2];
v_dst[0] = v_pack(v_shr<INTER_BITS>(v_ix0), v_shr<INTER_BITS>(v_ix1));
v_dst[1] = v_pack(v_shr<INTER_BITS>(v_iy0), v_shr<INTER_BITS>(v_iy1));
v_store_interleave(dst1 + (x << 1), v_dst[0], v_dst[1]);
uint16x4_t v_dst0 = vqmovun_s32(vaddq_s32(vshlq_n_s32(vandq_s32(v_iy0, v_mask), INTER_BITS),
vandq_s32(v_ix0, v_mask)));
uint16x4_t v_dst1 = vqmovun_s32(vaddq_s32(vshlq_n_s32(vandq_s32(v_iy1, v_mask), INTER_BITS),
vandq_s32(v_ix1, v_mask)));
vst1q_u16(dst2 + x, vcombine_u16(v_dst0, v_dst1));
v_int32x4 v_dst0 = v_muladd(v_scale3, (v_iy0 & v_mask), (v_ix0 & v_mask));
v_int32x4 v_dst1 = v_muladd(v_scale3, (v_iy1 & v_mask), (v_ix1 & v_mask));
v_store(dst2 + x, v_pack_u(v_dst0, v_dst1));
}
}
#elif CV_TRY_SSE4_1
if (useSSE4_1)
opt_SSE4_1::convertMaps_32f1c16s_SSE41(src1f, src2f, dst1, dst2, size.width);
else
#endif
for( ; x < size.width; x++ )
{
......@@ -2005,20 +1984,17 @@ void cv::convertMaps( InputArray _map1, InputArray _map2,
}
}
}
}
else if( m1type == CV_32FC2 && dstm1type == CV_16SC2 )
{
if( nninterpolate )
{
#if CV_NEON
for( ; x <= (size.width << 1) - 8; x += 8 )
vst1q_s16(dst1 + x, vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(vld1q_f32(src1f + x))),
vqmovn_s32(cv_vrndq_s32_f32(vld1q_f32(src1f + x + 4)))));
#elif CV_SSE2
for( ; x <= (size.width << 1) - 8; x += 8 )
{
_mm_storeu_si128((__m128i *)(dst1 + x), _mm_packs_epi32(_mm_cvtps_epi32(_mm_loadu_ps(src1f + x)),
_mm_cvtps_epi32(_mm_loadu_ps(src1f + x + 4))));
}
#if CV_SIMD128
int span = v_float32x4::nlanes;
if( useSIMD )
for( ; x <= (size.width << 1) - span * 2; x += span * 2 )
v_store(dst1 + x, v_pack(v_round(v_load(src1f + x)),
v_round(v_load(src1f + x + span))));
#endif
for( ; x < size.width; x++ )
{
......@@ -2028,36 +2004,39 @@ void cv::convertMaps( InputArray _map1, InputArray _map2,
}
else
{
#if CV_NEON
float32x4_t v_scale = vdupq_n_f32((float)INTER_TAB_SIZE);
int32x4_t v_mask = vdupq_n_s32(INTER_TAB_SIZE - 1);
for( ; x <= size.width - 8; x += 8 )
#if CV_TRY_SSE4_1
if( useSSE4_1 )
opt_SSE4_1::convertMaps_32f2c16s_SSE41(src1f, dst1, dst2, size.width);
else
#endif
{
float32x4x2_t v_src0 = vld2q_f32(src1f + (x << 1)), v_src1 = vld2q_f32(src1f + (x << 1) + 8);
int32x4_t v_ix0 = cv_vrndq_s32_f32(vmulq_f32(v_src0.val[0], v_scale));
int32x4_t v_ix1 = cv_vrndq_s32_f32(vmulq_f32(v_src1.val[0], v_scale));
int32x4_t v_iy0 = cv_vrndq_s32_f32(vmulq_f32(v_src0.val[1], v_scale));
int32x4_t v_iy1 = cv_vrndq_s32_f32(vmulq_f32(v_src1.val[1], v_scale));
int16x8x2_t v_dst;
v_dst.val[0] = vcombine_s16(vqmovn_s32(vshrq_n_s32(v_ix0, INTER_BITS)),
vqmovn_s32(vshrq_n_s32(v_ix1, INTER_BITS)));
v_dst.val[1] = vcombine_s16(vqmovn_s32(vshrq_n_s32(v_iy0, INTER_BITS)),
vqmovn_s32(vshrq_n_s32(v_iy1, INTER_BITS)));
#if CV_SIMD128
if( useSIMD )
{
v_float32x4 v_scale = v_setall_f32((float)INTER_TAB_SIZE);
v_int32x4 v_mask = v_setall_s32(INTER_TAB_SIZE - 1);
v_int32x4 v_scale3 = v_setall_s32(INTER_TAB_SIZE);
int span = v_uint16x8::nlanes;
for (; x <= size.width - span; x += span )
{
v_float32x4 v_src0[2], v_src1[2];
v_load_deinterleave(src1f + (x << 1), v_src0[0], v_src0[1]);
v_load_deinterleave(src1f + (x << 1) + span, v_src1[0], v_src1[1]);
v_int32x4 v_ix0 = v_round(v_src0[0] * v_scale);
v_int32x4 v_ix1 = v_round(v_src1[0] * v_scale);
v_int32x4 v_iy0 = v_round(v_src0[1] * v_scale);
v_int32x4 v_iy1 = v_round(v_src1[1] * v_scale);
vst2q_s16(dst1 + (x << 1), v_dst);
v_int16x8 v_dst[2];
v_dst[0] = v_pack(v_shr<INTER_BITS>(v_ix0), v_shr<INTER_BITS>(v_ix1));
v_dst[1] = v_pack(v_shr<INTER_BITS>(v_iy0), v_shr<INTER_BITS>(v_iy1));
v_store_interleave(dst1 + (x << 1), v_dst[0], v_dst[1]);
uint16x4_t v_dst0 = vqmovun_s32(vaddq_s32(vshlq_n_s32(vandq_s32(v_iy0, v_mask), INTER_BITS),
vandq_s32(v_ix0, v_mask)));
uint16x4_t v_dst1 = vqmovun_s32(vaddq_s32(vshlq_n_s32(vandq_s32(v_iy1, v_mask), INTER_BITS),
vandq_s32(v_ix1, v_mask)));
vst1q_u16(dst2 + x, vcombine_u16(v_dst0, v_dst1));
v_store(dst2 + x, v_pack_u(
v_muladd(v_scale3, (v_iy0 & v_mask), (v_ix0 & v_mask)),
v_muladd(v_scale3, (v_iy1 & v_mask), (v_ix1 & v_mask))));
}
}
#elif CV_TRY_SSE4_1
if (useSSE4_1)
opt_SSE4_1::convertMaps_32f2c16s_SSE41(src1f, dst1, dst2, size.width);
else
#endif
for( ; x < size.width; x++ )
{
......@@ -2069,77 +2048,48 @@ void cv::convertMaps( InputArray _map1, InputArray _map2,
}
}
}
}
else if( m1type == CV_16SC2 && dstm1type == CV_32FC1 )
{
#if CV_NEON
uint16x8_t v_mask2 = vdupq_n_u16(INTER_TAB_SIZE2-1);
uint32x4_t v_zero = vdupq_n_u32(0u), v_mask = vdupq_n_u32(INTER_TAB_SIZE-1);
float32x4_t v_scale = vdupq_n_f32(scale);
for( ; x <= size.width - 8; x += 8)
#if CV_SIMD128
if( useSIMD )
{
uint32x4_t v_fxy1, v_fxy2;
if (src2)
v_uint16x8 v_mask2 = v_setall_u16(INTER_TAB_SIZE2-1);
v_uint32x4 v_zero = v_setzero_u32(), v_mask = v_setall_u32(INTER_TAB_SIZE-1);
v_float32x4 v_scale = v_setall_f32(scale);
int span = v_float32x4::nlanes;
for( ; x <= size.width - span * 2; x += span * 2 )
{
v_uint32x4 v_fxy1, v_fxy2;
if ( src2 )
{
uint16x8_t v_src2 = vandq_u16(vld1q_u16(src2 + x), v_mask2);
v_fxy1 = vmovl_u16(vget_low_u16(v_src2));
v_fxy2 = vmovl_u16(vget_high_u16(v_src2));
v_uint16x8 v_src2 = v_load(src2 + x) & v_mask2;
v_expand(v_src2, v_fxy1, v_fxy2);
}
else
v_fxy1 = v_fxy2 = v_zero;
int16x8x2_t v_src = vld2q_s16(src1 + (x << 1));
float32x4_t v_dst1 = vmlaq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src.val[0]))),
v_scale, vcvtq_f32_u32(vandq_u32(v_fxy1, v_mask)));
float32x4_t v_dst2 = vmlaq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src.val[1]))),
v_scale, vcvtq_f32_u32(vshrq_n_u32(v_fxy1, INTER_BITS)));
vst1q_f32(dst1f + x, v_dst1);
vst1q_f32(dst2f + x, v_dst2);
v_dst1 = vmlaq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src.val[0]))),
v_scale, vcvtq_f32_u32(vandq_u32(v_fxy2, v_mask)));
v_dst2 = vmlaq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src.val[1]))),
v_scale, vcvtq_f32_u32(vshrq_n_u32(v_fxy2, INTER_BITS)));
vst1q_f32(dst1f + x + 4, v_dst1);
vst1q_f32(dst2f + x + 4, v_dst2);
}
#elif CV_SSE2
__m128i v_mask2 = _mm_set1_epi16(INTER_TAB_SIZE2-1);
__m128i v_zero = _mm_setzero_si128(), v_mask = _mm_set1_epi32(INTER_TAB_SIZE-1);
__m128 v_scale = _mm_set1_ps(scale);
for( ; x <= size.width - 16; x += 16)
{
__m128i v_src10 = _mm_loadu_si128((__m128i const *)(src1 + x * 2));
__m128i v_src11 = _mm_loadu_si128((__m128i const *)(src1 + x * 2 + 8));
__m128i v_src20 = _mm_loadu_si128((__m128i const *)(src1 + x * 2 + 16));
__m128i v_src21 = _mm_loadu_si128((__m128i const *)(src1 + x * 2 + 24));
_mm_deinterleave_epi16(v_src10, v_src11, v_src20, v_src21);
__m128i v_fxy = src2 ? _mm_and_si128(_mm_loadu_si128((__m128i const *)(src2 + x)), v_mask2) : v_zero;
__m128i v_fxy_p = _mm_unpacklo_epi16(v_fxy, v_zero);
_mm_storeu_ps(dst1f + x, _mm_add_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src10), 16)),
_mm_mul_ps(v_scale, _mm_cvtepi32_ps(_mm_and_si128(v_fxy_p, v_mask)))));
_mm_storeu_ps(dst2f + x, _mm_add_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src20), 16)),
_mm_mul_ps(v_scale, _mm_cvtepi32_ps(_mm_srli_epi32(v_fxy_p, INTER_BITS)))));
v_fxy_p = _mm_unpackhi_epi16(v_fxy, v_zero);
_mm_storeu_ps(dst1f + x + 4, _mm_add_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src10), 16)),
_mm_mul_ps(v_scale, _mm_cvtepi32_ps(_mm_and_si128(v_fxy_p, v_mask)))));
_mm_storeu_ps(dst2f + x + 4, _mm_add_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src20), 16)),
_mm_mul_ps(v_scale, _mm_cvtepi32_ps(_mm_srli_epi32(v_fxy_p, INTER_BITS)))));
v_fxy = src2 ? _mm_and_si128(_mm_loadu_si128((__m128i const *)(src2 + x + 8)), v_mask2) : v_zero;
v_fxy_p = _mm_unpackhi_epi16(v_fxy, v_zero);
_mm_storeu_ps(dst1f + x + 8, _mm_add_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src11), 16)),
_mm_mul_ps(v_scale, _mm_cvtepi32_ps(_mm_and_si128(v_fxy_p, v_mask)))));
_mm_storeu_ps(dst2f + x + 8, _mm_add_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src21), 16)),
_mm_mul_ps(v_scale, _mm_cvtepi32_ps(_mm_srli_epi32(v_fxy_p, INTER_BITS)))));
v_fxy_p = _mm_unpackhi_epi16(v_fxy, v_zero);
_mm_storeu_ps(dst1f + x + 12, _mm_add_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src11), 16)),
_mm_mul_ps(v_scale, _mm_cvtepi32_ps(_mm_and_si128(v_fxy_p, v_mask)))));
_mm_storeu_ps(dst2f + x + 12, _mm_add_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src21), 16)),
_mm_mul_ps(v_scale, _mm_cvtepi32_ps(_mm_srli_epi32(v_fxy_p, INTER_BITS)))));
v_int16x8 v_src[2];
v_int32x4 v_src0[2], v_src1[2];
v_load_deinterleave(src1 + (x << 1), v_src[0], v_src[1]);
v_expand(v_src[0], v_src0[0], v_src0[1]);
v_expand(v_src[1], v_src1[0], v_src1[1]);
#define CV_COMPUTE_MAP_X(X, FXY) v_muladd(v_scale, v_cvt_f32(v_reinterpret_as_s32((FXY) & v_mask)),\
v_cvt_f32(v_reinterpret_as_s32(X)))
#define CV_COMPUTE_MAP_Y(Y, FXY) v_muladd(v_scale, v_cvt_f32(v_reinterpret_as_s32((FXY) >> INTER_BITS)),\
v_cvt_f32(v_reinterpret_as_s32(Y)))
v_float32x4 v_dst1 = CV_COMPUTE_MAP_X(v_src0[0], v_fxy1);
v_float32x4 v_dst2 = CV_COMPUTE_MAP_Y(v_src1[0], v_fxy1);
v_store(dst1f + x, v_dst1);
v_store(dst2f + x, v_dst2);
v_dst1 = CV_COMPUTE_MAP_X(v_src0[1], v_fxy2);
v_dst2 = CV_COMPUTE_MAP_Y(v_src1[1], v_fxy2);
v_store(dst1f + x + span, v_dst1);
v_store(dst2f + x + span, v_dst2);
#undef CV_COMPUTE_MAP_X
#undef CV_COMPUTE_MAP_Y
}
}
#endif
for( ; x < size.width; x++ )
......@@ -2151,56 +2101,42 @@ void cv::convertMaps( InputArray _map1, InputArray _map2,
}
else if( m1type == CV_16SC2 && dstm1type == CV_32FC2 )
{
#if CV_NEON
int16x8_t v_mask2 = vdupq_n_s16(INTER_TAB_SIZE2-1);
int32x4_t v_zero = vdupq_n_s32(0), v_mask = vdupq_n_s32(INTER_TAB_SIZE-1);
float32x4_t v_scale = vdupq_n_f32(scale);
for( ; x <= size.width - 8; x += 8)
#if CV_SIMD128
if( useSIMD )
{
int32x4_t v_fxy1, v_fxy2;
v_int16x8 v_mask2 = v_setall_s16(INTER_TAB_SIZE2-1);
v_int32x4 v_zero = v_setzero_s32(), v_mask = v_setall_s32(INTER_TAB_SIZE-1);
v_float32x4 v_scale = v_setall_f32(scale);
int span = v_int16x8::nlanes;
for( ; x <= size.width - span; x += span )
{
v_int32x4 v_fxy1, v_fxy2;
if (src2)
{
int16x8_t v_src2 = vandq_s16(vld1q_s16((short *)src2 + x), v_mask2);
v_fxy1 = vmovl_s16(vget_low_s16(v_src2));
v_fxy2 = vmovl_s16(vget_high_s16(v_src2));
v_int16x8 v_src2 = v_load((short *)src2 + x) & v_mask2;
v_expand(v_src2, v_fxy1, v_fxy2);
}
else
v_fxy1 = v_fxy2 = v_zero;
int16x8x2_t v_src = vld2q_s16(src1 + (x << 1));
float32x4x2_t v_dst;
v_dst.val[0] = vmlaq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src.val[0]))),
v_scale, vcvtq_f32_s32(vandq_s32(v_fxy1, v_mask)));
v_dst.val[1] = vmlaq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src.val[1]))),
v_scale, vcvtq_f32_s32(vshrq_n_s32(v_fxy1, INTER_BITS)));
vst2q_f32(dst1f + (x << 1), v_dst);
v_int16x8 v_src[2];
v_int32x4 v_src0[2], v_src1[2];
v_float32x4 v_dst[2];
v_load_deinterleave(src1 + (x << 1), v_src[0], v_src[1]);
v_expand(v_src[0], v_src0[0], v_src0[1]);
v_expand(v_src[1], v_src1[0], v_src1[1]);
v_dst.val[0] = vmlaq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src.val[0]))),
v_scale, vcvtq_f32_s32(vandq_s32(v_fxy2, v_mask)));
v_dst.val[1] = vmlaq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src.val[1]))),
v_scale, vcvtq_f32_s32(vshrq_n_s32(v_fxy2, INTER_BITS)));
vst2q_f32(dst1f + (x << 1) + 8, v_dst);
}
#elif CV_SSE2
if (useSSE2)
{
__m128i v_mask2 = _mm_set1_epi16(INTER_TAB_SIZE2-1);
__m128i v_zero = _mm_set1_epi32(0), v_mask = _mm_set1_epi32(INTER_TAB_SIZE-1);
__m128 v_scale = _mm_set1_ps(scale);
#define CV_COMPUTE_MAP_X(X, FXY) v_muladd(v_scale, v_cvt_f32((FXY) & v_mask), v_cvt_f32(X))
#define CV_COMPUTE_MAP_Y(Y, FXY) v_muladd(v_scale, v_cvt_f32((FXY) >> INTER_BITS), v_cvt_f32(Y))
v_dst[0] = CV_COMPUTE_MAP_X(v_src0[0], v_fxy1);
v_dst[1] = CV_COMPUTE_MAP_Y(v_src1[0], v_fxy1);
v_store_interleave(dst1f + (x << 1), v_dst[0], v_dst[1]);
for ( ; x <= size.width - 8; x += 8)
{
__m128i v_src = _mm_loadu_si128((__m128i const *)(src1 + x * 2));
__m128i v_fxy = src2 ? _mm_and_si128(_mm_loadu_si128((__m128i const *)(src2 + x)), v_mask2) : v_zero;
__m128i v_fxy1 = _mm_and_si128(v_fxy, v_mask);
__m128i v_fxy2 = _mm_srli_epi16(v_fxy, INTER_BITS);
__m128 v_add = _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v_fxy1, v_fxy2)), v_scale);
_mm_storeu_ps(dst1f + x * 2, _mm_add_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src, v_zero)), v_add));
v_add = _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v_fxy1, v_fxy2)), v_scale);
_mm_storeu_ps(dst1f + x * 2, _mm_add_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src, v_zero)), v_add));
v_dst[0] = CV_COMPUTE_MAP_X(v_src0[1], v_fxy2);
v_dst[1] = CV_COMPUTE_MAP_Y(v_src1[1], v_fxy2);
v_store_interleave(dst1f + (x << 1) + span, v_dst[0], v_dst[1]);
#undef CV_COMPUTE_MAP_X
#undef CV_COMPUTE_MAP_Y
}
}
#endif
......@@ -2242,8 +2178,8 @@ public:
#if CV_TRY_AVX2
bool useAVX2 = CV_CPU_HAS_SUPPORT_AVX2;
#endif
#if CV_SSE2
bool useSSE2 = checkHardwareSupport(CV_CPU_SSE2);
#if CV_SIMD128
bool useSIMD = hasSIMD128();
#endif
#if CV_TRY_SSE4_1
bool useSSE4_1 = CV_CPU_HAS_SUPPORT_SSE4_1;
......@@ -2272,23 +2208,29 @@ public:
if( interpolation == INTER_NEAREST )
{
x1 = 0;
#if CV_NEON
int32x4_t v_X0 = vdupq_n_s32(X0), v_Y0 = vdupq_n_s32(Y0);
for( ; x1 <= bw - 8; x1 += 8 )
{
int16x8x2_t v_dst;
v_dst.val[0] = vcombine_s16(vqmovn_s32(vshrq_n_s32(vaddq_s32(v_X0, vld1q_s32(adelta + x + x1)), AB_BITS)),
vqmovn_s32(vshrq_n_s32(vaddq_s32(v_X0, vld1q_s32(adelta + x + x1 + 4)), AB_BITS)));
v_dst.val[1] = vcombine_s16(vqmovn_s32(vshrq_n_s32(vaddq_s32(v_Y0, vld1q_s32(bdelta + x + x1)), AB_BITS)),
vqmovn_s32(vshrq_n_s32(vaddq_s32(v_Y0, vld1q_s32(bdelta + x + x1 + 4)), AB_BITS)));
vst2q_s16(xy + (x1 << 1), v_dst);
}
#elif CV_TRY_SSE4_1
if (useSSE4_1)
#if CV_TRY_SSE4_1
if( useSSE4_1 )
opt_SSE4_1::WarpAffineInvoker_Blockline_SSE41(adelta + x, bdelta + x, xy, X0, Y0, bw);
else
#endif
{
#if CV_SIMD128
if( useSIMD )
{
v_int32x4 v_X0 = v_setall_s32(X0), v_Y0 = v_setall_s32(Y0);
int span = v_uint16x8::nlanes;
for( ; x1 <= bw - span; x1 += span )
{
v_int16x8 v_dst[2];
#define CV_CONVERT_MAP(ptr,offset,shift) v_pack(v_shr<AB_BITS>(shift+v_load(ptr + offset)),\
v_shr<AB_BITS>(shift+v_load(ptr + offset + 4)))
v_dst[0] = CV_CONVERT_MAP(adelta, x+x1, v_X0);
v_dst[1] = CV_CONVERT_MAP(bdelta, x+x1, v_Y0);
#undef CV_CONVERT_MAP
v_store_interleave(xy + (x1 << 1), v_dst[0], v_dst[1]);
}
}
#endif
for( ; x1 < bw; x1++ )
{
int X = (X0 + adelta[x+x1]) >> AB_BITS;
......@@ -2297,6 +2239,7 @@ public:
xy[x1*2+1] = saturate_cast<short>(Y);
}
}
}
else
{
short* alpha = A + y1*bw;
......@@ -2305,59 +2248,28 @@ public:
if ( useAVX2 )
x1 = opt_AVX2::warpAffineBlockline(adelta + x, bdelta + x, xy, alpha, X0, Y0, bw);
#endif
#if CV_SSE2
if( useSSE2 )
{
__m128i fxy_mask = _mm_set1_epi32(INTER_TAB_SIZE - 1);
__m128i XX = _mm_set1_epi32(X0), YY = _mm_set1_epi32(Y0);
for( ; x1 <= bw - 8; x1 += 8 )
{
__m128i tx0, tx1, ty0, ty1;
tx0 = _mm_add_epi32(_mm_loadu_si128((const __m128i*)(adelta + x + x1)), XX);
ty0 = _mm_add_epi32(_mm_loadu_si128((const __m128i*)(bdelta + x + x1)), YY);
tx1 = _mm_add_epi32(_mm_loadu_si128((const __m128i*)(adelta + x + x1 + 4)), XX);
ty1 = _mm_add_epi32(_mm_loadu_si128((const __m128i*)(bdelta + x + x1 + 4)), YY);
tx0 = _mm_srai_epi32(tx0, AB_BITS - INTER_BITS);
ty0 = _mm_srai_epi32(ty0, AB_BITS - INTER_BITS);
tx1 = _mm_srai_epi32(tx1, AB_BITS - INTER_BITS);
ty1 = _mm_srai_epi32(ty1, AB_BITS - INTER_BITS);
__m128i fx_ = _mm_packs_epi32(_mm_and_si128(tx0, fxy_mask),
_mm_and_si128(tx1, fxy_mask));
__m128i fy_ = _mm_packs_epi32(_mm_and_si128(ty0, fxy_mask),
_mm_and_si128(ty1, fxy_mask));
tx0 = _mm_packs_epi32(_mm_srai_epi32(tx0, INTER_BITS),
_mm_srai_epi32(tx1, INTER_BITS));
ty0 = _mm_packs_epi32(_mm_srai_epi32(ty0, INTER_BITS),
_mm_srai_epi32(ty1, INTER_BITS));
fx_ = _mm_adds_epi16(fx_, _mm_slli_epi16(fy_, INTER_BITS));
_mm_storeu_si128((__m128i*)(xy + x1*2), _mm_unpacklo_epi16(tx0, ty0));
_mm_storeu_si128((__m128i*)(xy + x1*2 + 8), _mm_unpackhi_epi16(tx0, ty0));
_mm_storeu_si128((__m128i*)(alpha + x1), fx_);
}
}
#elif CV_NEON
int32x4_t v__X0 = vdupq_n_s32(X0), v__Y0 = vdupq_n_s32(Y0), v_mask = vdupq_n_s32(INTER_TAB_SIZE - 1);
for( ; x1 <= bw - 8; x1 += 8 )
{
int32x4_t v_X0 = vshrq_n_s32(vaddq_s32(v__X0, vld1q_s32(adelta + x + x1)), AB_BITS - INTER_BITS);
int32x4_t v_Y0 = vshrq_n_s32(vaddq_s32(v__Y0, vld1q_s32(bdelta + x + x1)), AB_BITS - INTER_BITS);
int32x4_t v_X1 = vshrq_n_s32(vaddq_s32(v__X0, vld1q_s32(adelta + x + x1 + 4)), AB_BITS - INTER_BITS);
int32x4_t v_Y1 = vshrq_n_s32(vaddq_s32(v__Y0, vld1q_s32(bdelta + x + x1 + 4)), AB_BITS - INTER_BITS);
int16x8x2_t v_xy;
v_xy.val[0] = vcombine_s16(vqmovn_s32(vshrq_n_s32(v_X0, INTER_BITS)), vqmovn_s32(vshrq_n_s32(v_X1, INTER_BITS)));
v_xy.val[1] = vcombine_s16(vqmovn_s32(vshrq_n_s32(v_Y0, INTER_BITS)), vqmovn_s32(vshrq_n_s32(v_Y1, INTER_BITS)));
vst2q_s16(xy + (x1 << 1), v_xy);
int16x4_t v_alpha0 = vmovn_s32(vaddq_s32(vshlq_n_s32(vandq_s32(v_Y0, v_mask), INTER_BITS),
vandq_s32(v_X0, v_mask)));
int16x4_t v_alpha1 = vmovn_s32(vaddq_s32(vshlq_n_s32(vandq_s32(v_Y1, v_mask), INTER_BITS),
vandq_s32(v_X1, v_mask)));
vst1q_s16(alpha + x1, vcombine_s16(v_alpha0, v_alpha1));
#if CV_SIMD128
if( useSIMD )
{
v_int32x4 v__X0 = v_setall_s32(X0), v__Y0 = v_setall_s32(Y0);
v_int32x4 v_mask = v_setall_s32(INTER_TAB_SIZE - 1);
int span = v_float32x4::nlanes;
for( ; x1 <= bw - span * 2; x1 += span * 2 )
{
v_int32x4 v_X0 = v_shr<AB_BITS - INTER_BITS>(v__X0 + v_load(adelta + x + x1));
v_int32x4 v_Y0 = v_shr<AB_BITS - INTER_BITS>(v__Y0 + v_load(bdelta + x + x1));
v_int32x4 v_X1 = v_shr<AB_BITS - INTER_BITS>(v__X0 + v_load(adelta + x + x1 + span));
v_int32x4 v_Y1 = v_shr<AB_BITS - INTER_BITS>(v__Y0 + v_load(bdelta + x + x1 + span));
v_int16x8 v_xy[2];
v_xy[0] = v_pack(v_shr<INTER_BITS>(v_X0), v_shr<INTER_BITS>(v_X1));
v_xy[1] = v_pack(v_shr<INTER_BITS>(v_Y0), v_shr<INTER_BITS>(v_Y1));
v_store_interleave(xy + (x1 << 1), v_xy[0], v_xy[1]);
v_int32x4 v_alpha0 = v_shl<INTER_BITS>(v_Y0 & v_mask) | (v_X0 & v_mask);
v_int32x4 v_alpha1 = v_shl<INTER_BITS>(v_Y1 & v_mask) | (v_X1 & v_mask);
v_store(alpha + x1, v_pack(v_alpha0, v_alpha1));
}
}
#endif
for( ; x1 < bw; x1++ )
......