Commit cfaca432 authored by Alexander Alekhin's avatar Alexander Alekhin

Merge pull request #11169 from tomoaki0705:universalRemap

parents a2d6ee2d a82e70cd
...@@ -795,7 +795,7 @@ inline v_reg<_Tp, n> v_sqr_magnitude(const v_reg<_Tp, n>& a, const v_reg<_Tp, n> ...@@ -795,7 +795,7 @@ inline v_reg<_Tp, n> v_sqr_magnitude(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>
/** @brief Multiply and add /** @brief Multiply and add
Returns \f$ a*b + c \f$ Returns \f$ a*b + c \f$
For floating point types only. */ For floating point types and signed 32bit int only. */
template<typename _Tp, int n> template<typename _Tp, int n>
inline v_reg<_Tp, n> v_muladd(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b, inline v_reg<_Tp, n> v_muladd(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b,
const v_reg<_Tp, n>& c) const v_reg<_Tp, n>& c)
...@@ -828,6 +828,29 @@ template<typename _Tp, int n> inline v_reg<typename V_TypeTraits<_Tp>::w_type, n ...@@ -828,6 +828,29 @@ template<typename _Tp, int n> inline v_reg<typename V_TypeTraits<_Tp>::w_type, n
return c; return c;
} }
/** @brief Dot product of elements with accumulation
Same as the two-argument cv::v_dotprod, except that the matching lane of a third
(wide) register is added to every sum of adjacent products.
Scheme:
@code
  {A1   A2   ...} // 16-bit
x {B1   B2   ...} // 16-bit
+ {C1        ...} // 32-bit
-------------
  {A1B1+A2B2+C1 ...} // 32-bit
@endcode
Implemented only for 16-bit signed source type (v_int16x8).
*/
template<typename _Tp, int n> inline v_reg<typename V_TypeTraits<_Tp>::w_type, n/2>
v_dotprod(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b, const v_reg<typename V_TypeTraits<_Tp>::w_type, n / 2>& c)
{
    typedef typename V_TypeTraits<_Tp>::w_type wide_t;
    v_reg<wide_t, n/2> acc;
    for( int lane = 0; lane < (n/2); ++lane )
    {
        // Each output lane consumes one even/odd pair of input lanes.
        const int even = lane*2, odd = even + 1;
        // The left factor of every product is widened before multiplying.
        acc.s[lane] = static_cast<wide_t>(a.s[even])*b.s[even] + static_cast<wide_t>(a.s[odd])*b.s[odd] + c.s[lane];
    }
    return acc;
}
/** @brief Multiply and expand /** @brief Multiply and expand
Multiply values two registers and store results in two registers with wider pack type. Multiply values two registers and store results in two registers with wider pack type.
......
...@@ -506,6 +506,12 @@ inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b) ...@@ -506,6 +506,12 @@ inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b)
return v_int32x4(vaddq_s32(cd.val[0], cd.val[1])); return v_int32x4(vaddq_s32(cd.val[0], cd.val[1]));
} }
// Dot product of adjacent 16-bit pairs with a 32-bit accumulator added per lane.
// Built on the two-operand v_dotprod overload followed by a lane-wise add.
inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b, const v_int32x4& c)
{
    const v_int32x4 pair_sums = v_dotprod(a, b);
    const int32x4_t total = vaddq_s32(pair_sums.val, c.val);
    return v_int32x4(total);
}
#define OPENCV_HAL_IMPL_NEON_LOGIC_OP(_Tpvec, suffix) \ #define OPENCV_HAL_IMPL_NEON_LOGIC_OP(_Tpvec, suffix) \
OPENCV_HAL_IMPL_NEON_BIN_OP(&, _Tpvec, vandq_##suffix) \ OPENCV_HAL_IMPL_NEON_BIN_OP(&, _Tpvec, vandq_##suffix) \
OPENCV_HAL_IMPL_NEON_BIN_OP(|, _Tpvec, vorrq_##suffix) \ OPENCV_HAL_IMPL_NEON_BIN_OP(|, _Tpvec, vorrq_##suffix) \
...@@ -730,6 +736,11 @@ inline v_float32x4 v_muladd(const v_float32x4& a, const v_float32x4& b, const v_ ...@@ -730,6 +736,11 @@ inline v_float32x4 v_muladd(const v_float32x4& a, const v_float32x4& b, const v_
return v_float32x4(vmlaq_f32(c.val, a.val, b.val)); return v_float32x4(vmlaq_f32(c.val, a.val, b.val));
} }
// Signed 32-bit multiply-add: returns a*b + c for every lane.
inline v_int32x4 v_muladd(const v_int32x4& a, const v_int32x4& b, const v_int32x4& c)
{
    // The accumulator is the first operand of vmlaq_s32; the two factors follow.
    const int32x4_t acc = vmlaq_s32(c.val, a.val, b.val);
    return v_int32x4(acc);
}
#if CV_SIMD128_64F #if CV_SIMD128_64F
inline v_float64x2 v_magnitude(const v_float64x2& a, const v_float64x2& b) inline v_float64x2 v_magnitude(const v_float64x2& a, const v_float64x2& b)
{ {
...@@ -1095,6 +1106,18 @@ OPENCV_HAL_IMPL_NEON_EXTRACT(float32x4, f32) ...@@ -1095,6 +1106,18 @@ OPENCV_HAL_IMPL_NEON_EXTRACT(float32x4, f32)
OPENCV_HAL_IMPL_NEON_EXTRACT(float64x2, f64) OPENCV_HAL_IMPL_NEON_EXTRACT(float64x2, f64)
#endif #endif
#if CV_SIMD128_64F
// Converts the four float32 lanes of `a` to signed 32-bit integers with one
// FCVTNS instruction issued through inline assembly. This variant is compiled
// only when CV_SIMD128_64F is set (see the enclosing #if).
// NOTE(review): Arm documents FCVTNS as "round to nearest, ties to even"; the
// #else fallback appears to add a signed bias and truncate instead, so results
// for exact .5 inputs may differ between the two paths - confirm this is intended.
inline v_int32x4 v_round(const v_float32x4& a)
{
// Copy the raw vector into a local so it can be bound as an asm operand.
float32x4_t a_ = a.val;
int32x4_t result;
// The ".4s" suffix addresses each operand as four 32-bit lanes.
__asm__ ("fcvtns %0.4s, %1.4s"
// Output operand %0 ("=w": write-only; "w" is GCC's AArch64 FP/SIMD register constraint).
: "=w"(result)
// Input operand %1, likewise placed in an FP/SIMD register.
: "w"(a_)
: /* No clobbers */);
return v_int32x4(result);
}
#else
inline v_int32x4 v_round(const v_float32x4& a) inline v_int32x4 v_round(const v_float32x4& a)
{ {
static const int32x4_t v_sign = vdupq_n_s32(1 << 31), static const int32x4_t v_sign = vdupq_n_s32(1 << 31),
...@@ -1103,7 +1126,7 @@ inline v_int32x4 v_round(const v_float32x4& a) ...@@ -1103,7 +1126,7 @@ inline v_int32x4 v_round(const v_float32x4& a)
int32x4_t v_addition = vorrq_s32(v_05, vandq_s32(v_sign, vreinterpretq_s32_f32(a.val))); int32x4_t v_addition = vorrq_s32(v_05, vandq_s32(v_sign, vreinterpretq_s32_f32(a.val)));
return v_int32x4(vcvtq_s32_f32(vaddq_f32(a.val, vreinterpretq_f32_s32(v_addition)))); return v_int32x4(vcvtq_s32_f32(vaddq_f32(a.val, vreinterpretq_f32_s32(v_addition))));
} }
#endif
inline v_int32x4 v_floor(const v_float32x4& a) inline v_int32x4 v_floor(const v_float32x4& a)
{ {
int32x4_t a1 = vcvtq_s32_f32(a.val); int32x4_t a1 = vcvtq_s32_f32(a.val);
......
...@@ -710,6 +710,11 @@ inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b) ...@@ -710,6 +710,11 @@ inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b)
return v_int32x4(_mm_madd_epi16(a.val, b.val)); return v_int32x4(_mm_madd_epi16(a.val, b.val));
} }
// Dot product of adjacent 16-bit pairs with a 32-bit accumulator added per lane.
inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b, const v_int32x4& c)
{
    // Multiply 16-bit lanes and sum neighbouring products into 32-bit lanes ...
    const __m128i pair_sums = _mm_madd_epi16(a.val, b.val);
    // ... then fold in the caller-supplied accumulator.
    const __m128i total = _mm_add_epi32(pair_sums, c.val);
    return v_int32x4(total);
}
#define OPENCV_HAL_IMPL_SSE_LOGIC_OP(_Tpvec, suffix, not_const) \ #define OPENCV_HAL_IMPL_SSE_LOGIC_OP(_Tpvec, suffix, not_const) \
OPENCV_HAL_IMPL_SSE_BIN_OP(&, _Tpvec, _mm_and_##suffix) \ OPENCV_HAL_IMPL_SSE_BIN_OP(&, _Tpvec, _mm_and_##suffix) \
OPENCV_HAL_IMPL_SSE_BIN_OP(|, _Tpvec, _mm_or_##suffix) \ OPENCV_HAL_IMPL_SSE_BIN_OP(|, _Tpvec, _mm_or_##suffix) \
...@@ -954,6 +959,10 @@ inline v_uint32x4 v_absdiff(const v_int32x4& a, const v_int32x4& b) ...@@ -954,6 +959,10 @@ inline v_uint32x4 v_absdiff(const v_int32x4& a, const v_int32x4& b)
__m128i m = _mm_cmpgt_epi32(b.val, a.val); __m128i m = _mm_cmpgt_epi32(b.val, a.val);
return v_uint32x4(_mm_sub_epi32(_mm_xor_si128(d, m), m)); return v_uint32x4(_mm_sub_epi32(_mm_xor_si128(d, m), m));
} }
// Signed 32-bit multiply-add: returns a*b + c for every lane, composed from
// the v_int32x4 multiplication and addition operators.
inline v_int32x4 v_muladd(const v_int32x4& a, const v_int32x4& b, const v_int32x4& c)
{
    const v_int32x4 product = a * b;
    return product + c;
}
#define OPENCV_HAL_IMPL_SSE_MISC_FLT_OP(_Tpvec, _Tp, _Tpreg, suffix, absmask_vec) \ #define OPENCV_HAL_IMPL_SSE_MISC_FLT_OP(_Tpvec, _Tp, _Tpreg, suffix, absmask_vec) \
inline _Tpvec v_absdiff(const _Tpvec& a, const _Tpvec& b) \ inline _Tpvec v_absdiff(const _Tpvec& a, const _Tpvec& b) \
...@@ -1632,7 +1641,7 @@ inline void v_load_deinterleave(const double *ptr, v_float64x2& a, v_float64x2& ...@@ -1632,7 +1641,7 @@ inline void v_load_deinterleave(const double *ptr, v_float64x2& a, v_float64x2&
c = v_reinterpret_as_f64(t2); c = v_reinterpret_as_f64(t2);
} }
// 2-channel, float only // 2-channel
inline void v_load_deinterleave(const float* ptr, v_float32x4& a, v_float32x4& b) inline void v_load_deinterleave(const float* ptr, v_float32x4& a, v_float32x4& b)
{ {
const int mask_lo = _MM_SHUFFLE(2, 0, 2, 0), mask_hi = _MM_SHUFFLE(3, 1, 3, 1); const int mask_lo = _MM_SHUFFLE(2, 0, 2, 0), mask_hi = _MM_SHUFFLE(3, 1, 3, 1);
...@@ -1644,7 +1653,29 @@ inline void v_load_deinterleave(const float* ptr, v_float32x4& a, v_float32x4& b ...@@ -1644,7 +1653,29 @@ inline void v_load_deinterleave(const float* ptr, v_float32x4& a, v_float32x4& b
b.val = _mm_shuffle_ps(u0, u1, mask_hi); // b0 b1 ab b3 b.val = _mm_shuffle_ps(u0, u1, mask_hi); // b0 b1 ab b3
} }
// 2-channel de-interleave for 16-bit signed lanes: reads 16 consecutive shorts
// laid out as a0 b0 a1 b1 ... a7 b7 with unaligned loads and separates them into
// `a` (a0..a7) and `b` (b0..b7) using three rounds of 16-bit unpack operations.
inline void v_store_interleave( short* ptr, const v_int16x8& a, const v_int16x8& b ) inline void v_load_deinterleave(const short* ptr, v_int16x8& a, v_int16x8& b)
{
__m128i v0 = _mm_loadu_si128((__m128i*)(ptr)); // a0 b0 a1 b1 a2 b2 a3 b3
__m128i v1 = _mm_loadu_si128((__m128i*)(ptr + 8)); // a4 b4 a5 b5 a6 b6 a7 b7
__m128i v2 = _mm_unpacklo_epi16(v0, v1); // a0 a4 b0 b4 a1 a5 b1 b5
__m128i v3 = _mm_unpackhi_epi16(v0, v1); // a2 a6 b2 b6 a3 a7 b3 b7
__m128i v4 = _mm_unpacklo_epi16(v2, v3); // a0 a2 a4 a6 b0 b2 b4 b6
__m128i v5 = _mm_unpackhi_epi16(v2, v3); // a1 a3 a5 a7 b1 b3 b5 b7
a.val = _mm_unpacklo_epi16(v4, v5); // a0 a1 a2 a3 a4 a5 a6 a7
b.val = _mm_unpackhi_epi16(v4, v5); // b0 b1 b2 b3 b4 b5 b6 b7
}
inline void v_load_deinterleave(const ushort*ptr, v_uint16x8& a, v_uint16x8& b)
{
v_int16x8 sa, sb;
v_load_deinterleave((const short*)ptr, sa, sb);
a = v_reinterpret_as_u16(sa);
b = v_reinterpret_as_u16(sb);
}
inline void v_store_interleave(short* ptr, const v_int16x8& a, const v_int16x8& b)
{ {
__m128i t0, t1; __m128i t0, t1;
t0 = _mm_unpacklo_epi16(a.val, b.val); t0 = _mm_unpacklo_epi16(a.val, b.val);
......
...@@ -760,6 +760,9 @@ inline _Tpvec v_muladd(const _Tpvec& a, const _Tpvec& b, const _Tpvec& c) \ ...@@ -760,6 +760,9 @@ inline _Tpvec v_muladd(const _Tpvec& a, const _Tpvec& b, const _Tpvec& c) \
OPENCV_HAL_IMPL_VSX_MULADD(v_float32x4) OPENCV_HAL_IMPL_VSX_MULADD(v_float32x4)
OPENCV_HAL_IMPL_VSX_MULADD(v_float64x2) OPENCV_HAL_IMPL_VSX_MULADD(v_float64x2)
// Signed 32-bit multiply-add: returns a*b + c for every lane, composed from
// the v_int32x4 multiplication and addition operators.
inline v_int32x4 v_muladd(const v_int32x4& a, const v_int32x4& b, const v_int32x4& c)
{
    const v_int32x4 product = a * b;
    return product + c;
}
// TODO: exp, log, sin, cos // TODO: exp, log, sin, cos
/** Absolute values **/ /** Absolute values **/
...@@ -843,6 +846,9 @@ inline v_float64x2 v_cvt_f64_high(const v_float32x4& a) ...@@ -843,6 +846,9 @@ inline v_float64x2 v_cvt_f64_high(const v_float32x4& a)
inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b) inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b)
{ return v_int32x4(vec_msum(a.val, b.val, vec_int4_z)); } { return v_int32x4(vec_msum(a.val, b.val, vec_int4_z)); }
// Dot product of adjacent 16-bit pairs with a 32-bit accumulator.
// Unlike the two-operand overload, which passes a zero vector as the third
// vec_msum argument, this one passes the caller's accumulator `c` directly.
inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b, const v_int32x4& c)
{
    return v_int32x4(
        vec_msum(a.val, b.val, c.val)
    );
}
inline v_float32x4 v_matmul(const v_float32x4& v, const v_float32x4& m0, inline v_float32x4 v_matmul(const v_float32x4& v, const v_float32x4& m0,
const v_float32x4& m1, const v_float32x4& m2, const v_float32x4& m1, const v_float32x4& m2,
const v_float32x4& m3) const v_float32x4& m3)
......
...@@ -521,15 +521,25 @@ template<typename R> struct TheTest ...@@ -521,15 +521,25 @@ template<typename R> struct TheTest
TheTest & test_dot_prod() TheTest & test_dot_prod()
{ {
typedef typename V_RegTrait128<LaneType>::w_reg Rx2; typedef typename V_RegTrait128<LaneType>::w_reg Rx2;
typedef typename Rx2::lane_type w_type;
Data<R> dataA, dataB(2); Data<R> dataA, dataB(2);
R a = dataA, b = dataB; R a = dataA, b = dataB;
Data<Rx2> res = v_dotprod(a, b); Data<Rx2> dataC;
dataC += std::numeric_limits<w_type>::is_signed ?
std::numeric_limits<w_type>::min() :
std::numeric_limits<w_type>::max() - R::nlanes * (dataB[0] + 1);
Rx2 c = dataC;
Data<Rx2> resD = v_dotprod(a, b),
resE = v_dotprod(a, b, c);
const int n = R::nlanes / 2; const int n = R::nlanes / 2;
for (int i = 0; i < n; ++i) for (int i = 0; i < n; ++i)
{ {
EXPECT_EQ(dataA[i*2] * dataB[i*2] + dataA[i*2 + 1] * dataB[i*2 + 1], res[i]); EXPECT_EQ(dataA[i*2] * dataB[i*2] + dataA[i*2 + 1] * dataB[i*2 + 1], resD[i]);
EXPECT_EQ(dataA[i*2] * dataB[i*2] + dataA[i*2 + 1] * dataB[i*2 + 1] + dataC[i], resE[i]);
} }
return *this; return *this;
} }
......
...@@ -229,7 +229,7 @@ OCL_PERF_TEST_P(RemapFixture, Remap, ...@@ -229,7 +229,7 @@ OCL_PERF_TEST_P(RemapFixture, Remap,
OCL_TEST_CYCLE() cv::remap(src, dst, xmap, ymap, interpolation, borderMode); OCL_TEST_CYCLE() cv::remap(src, dst, xmap, ymap, interpolation, borderMode);
SANITY_CHECK(dst, eps); SANITY_CHECK_NOTHING();
} }
} } // namespace opencv_test::ocl } } // namespace opencv_test::ocl
......
...@@ -202,8 +202,8 @@ PERF_TEST_P( TestWarpPerspectiveNear_t, WarpPerspectiveNear, ...@@ -202,8 +202,8 @@ PERF_TEST_P( TestWarpPerspectiveNear_t, WarpPerspectiveNear,
PERF_TEST_P( TestRemap, remap, PERF_TEST_P( TestRemap, remap,
Combine( Combine(
Values( TYPICAL_MAT_TYPES ), Values( CV_8UC1, CV_8UC3, CV_8UC4, CV_32FC1 ),
Values( szVGA, sz720p, sz1080p ), Values( szVGA, sz1080p ),
InterType::all(), InterType::all(),
BorderMode::all(), BorderMode::all(),
RemapMode::all() RemapMode::all()
...@@ -231,7 +231,7 @@ PERF_TEST_P( TestRemap, remap, ...@@ -231,7 +231,7 @@ PERF_TEST_P( TestRemap, remap,
remap(source, destination, map_x, map_y, interpolationType, borderMode); remap(source, destination, map_x, map_y, interpolationType, borderMode);
} }
SANITY_CHECK(destination, 1); SANITY_CHECK_NOTHING();
} }
void update_map(const Mat& src, Mat& map_x, Mat& map_y, const int remapMode ) void update_map(const Mat& src, Mat& map_x, Mat& map_y, const int remapMode )
......
This diff is collapsed.
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment