Commit e7d5dbfe authored by Tomoaki Teshima

dispatch accumulate series

 - use universal intrinsic for base
 - dispatch for float/double version using AVX
 - AVX2 optimization not done yet
parent b21b6944
......@@ -899,6 +899,15 @@ inline _Tpvec operator >= (const _Tpvec& a, const _Tpvec& b) \
// Defines operator== and operator!= for a vector type holding two 64-bit
// integer lanes. A lane of the result is all ones when the comparison holds
// and all zeros otherwise.
//
// SSE2 has no 64-bit integer compare, so every lane is compared as two 32-bit
// halves and the two half-results are AND-ed together. Comparing the bits as
// doubles is not valid for integers: a NaN bit pattern never equals itself,
// and +0.0 equals -0.0 although the bit patterns differ.
//
// `cast` is kept only so that existing two-argument invocations still expand;
// it is not used.
#define OPENCV_HAL_IMPL_SSE_64BIT_CMP_OP(_Tpvec, cast) \
inline _Tpvec operator == (const _Tpvec& a, const _Tpvec& b) \
{ \
    __m128i eq32 = _mm_cmpeq_epi32(a.val, b.val); \
    return _Tpvec(_mm_and_si128(eq32, _mm_shuffle_epi32(eq32, _MM_SHUFFLE(2, 3, 0, 1)))); \
} \
inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \
{ \
    __m128i eq32 = _mm_cmpeq_epi32(a.val, b.val); \
    __m128i eq64 = _mm_and_si128(eq32, _mm_shuffle_epi32(eq32, _MM_SHUFFLE(2, 3, 0, 1))); \
    return _Tpvec(_mm_xor_si128(eq64, _mm_set1_epi32(-1))); \
}
// Instantiate operator== / operator!= for the two 64-bit integer vector types.
OPENCV_HAL_IMPL_SSE_64BIT_CMP_OP(v_uint64x2, v_reinterpret_as_u64);
OPENCV_HAL_IMPL_SSE_64BIT_CMP_OP(v_int64x2, v_reinterpret_as_s64);
// v_add_wrap for 8- and 16-bit lanes, bound to the _mm_add_epi8 / _mm_add_epi16 intrinsics.
OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_uint8x16, v_add_wrap, _mm_add_epi8)
OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_int8x16, v_add_wrap, _mm_add_epi8)
OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_uint16x8, v_add_wrap, _mm_add_epi16)
......@@ -1520,6 +1529,35 @@ inline void v_load_deinterleave(const unsigned* ptr, v_uint32x4& a, v_uint32x4&
v_transpose4x4(u0, u1, u2, u3, a, b, c, d);
inline void v_load_deinterleave(const uint64 *ptr, v_uint64x2& a, v_uint64x2& b, v_uint64x2& c)
__m128i t0 = _mm_loadu_si128((const __m128i*)ptr);
__m128i t1 = _mm_loadu_si128((const __m128i*)(ptr + 2));
__m128i t2 = _mm_loadu_si128((const __m128i*)(ptr + 4));
a = v_uint64x2(_mm_unpacklo_epi64(t0, _mm_unpackhi_epi64(t1, t1)));
b = v_uint64x2(_mm_unpacklo_epi64(_mm_unpackhi_epi64(t0, t0), t2));
c = v_uint64x2(_mm_unpacklo_epi64(t1, _mm_unpackhi_epi64(t2, t2)));
inline void v_load_deinterleave(const int64 *ptr, v_int64x2& a, v_int64x2& b, v_int64x2& c)
v_uint64x2 t0, t1, t2;
v_load_deinterleave((const uint64*)ptr, t0, t1, t2);
a = v_reinterpret_as_s64(t0);
b = v_reinterpret_as_s64(t1);
c = v_reinterpret_as_s64(t2);
inline void v_load_deinterleave(const double *ptr, v_float64x2& a, v_float64x2& b, v_float64x2& c)
v_uint64x2 t0, t1, t2;
v_load_deinterleave((const uint64*)ptr, t0, t1, t2);
a = v_reinterpret_as_f64(t0);
b = v_reinterpret_as_f64(t1);
c = v_reinterpret_as_f64(t2);
// 2-channel, float only
inline void v_load_deinterleave(const float* ptr, v_float32x4& a, v_float32x4& b)
......@@ -1717,6 +1755,27 @@ inline void v_store_interleave(float* ptr, const v_float32x4& a, const v_float32
_mm_storeu_ps((ptr + 4), u1);
/**
 * Writes a = {a0, a1}, b = {b0, b1}, c = {c0, c1} to ptr as six consecutive
 * uint64 values in the order a0 b0 c0 a1 b1 c1.
 */
inline void v_store_interleave(uint64 *ptr, const v_uint64x2& a, const v_uint64x2& b, const v_uint64x2& c)
{
    // Build all three output registers before the first store.
    __m128i a1a1 = _mm_unpackhi_epi64(a.val, a.val);
    __m128i a0b0 = _mm_unpacklo_epi64(a.val, b.val);
    __m128i c0a1 = _mm_unpacklo_epi64(c.val, a1a1);
    __m128i b1c1 = _mm_unpackhi_epi64(b.val, c.val);

    __m128i* out = (__m128i*)ptr;
    _mm_storeu_si128(out, a0b0);
    _mm_storeu_si128(out + 1, c0a1);
    _mm_storeu_si128(out + 2, b1c1);
}
/** Signed 64-bit variant: reinterprets the inputs as u64 and forwards to the uint64 overload. */
inline void v_store_interleave(int64 *ptr, const v_int64x2& a, const v_int64x2& b, const v_int64x2& c)
{
    uint64* out = (uint64*)ptr;
    v_store_interleave(out,
                       v_reinterpret_as_u64(a),
                       v_reinterpret_as_u64(b),
                       v_reinterpret_as_u64(c));
}
inline void v_store_interleave(double *ptr, const v_float64x2& a, const v_float64x2& b, const v_float64x2& c)
v_store_interleave((uint64*)ptr, v_reinterpret_as_u64(a), v_reinterpret_as_u64(b), v_reinterpret_as_u64(c));
#define OPENCV_HAL_IMPL_SSE_LOADSTORE_INTERLEAVE(_Tpvec, _Tp, suffix, _Tpuvec, _Tpu, usuffix) \
inline void v_load_deinterleave( const _Tp* ptr, _Tpvec& a0, \
_Tpvec& b0, _Tpvec& c0 ) \
# Human-readable description of this module.
set(the_description "Image Processing")
# Register the "accum" source as a dispatched file with the SSE2, AVX and NEON options.
ocv_add_dispatched_file(accum SSE2 AVX NEON)
# Define the imgproc module; it lists opencv_core and requests java and python wrappers.
ocv_define_module(imgproc opencv_core WRAP java python)
......@@ -44,1718 +44,17 @@
#include "precomp.hpp"
#include "opencl_kernels_imgproc.hpp"
#include "opencv2/core/hal/intrin.hpp"
#include "accum.simd.hpp"
#include "accum.simd_declarations.hpp"
#include "opencv2/core/openvx/ovx_defs.hpp"
namespace cv
/**
 * Generic accumulation functor used when no specialisation exists for the
 * (T, AT) pair. It touches nothing and reports 0 elements processed.
 */
template <typename T, typename AT>
struct Acc_SIMD
{
    int operator() (const T * /*src*/, AT * /*dst*/, const uchar * /*mask*/, int /*len*/, int /*cn*/) const
    {
        return 0;
    }
};
/**
 * Generic squared-accumulation functor used when no specialisation exists
 * for the (T, AT) pair. It touches nothing and reports 0 elements processed.
 */
template <typename T, typename AT>
struct AccSqr_SIMD
{
    int operator() (const T * /*src*/, AT * /*dst*/, const uchar * /*mask*/, int /*len*/, int /*cn*/) const
    {
        return 0;
    }
};
/**
 * Generic product-accumulation functor used when no specialisation exists
 * for the (T, AT) pair. It touches nothing and reports 0 elements processed.
 */
template <typename T, typename AT>
struct AccProd_SIMD
{
    int operator() (const T * /*src1*/, const T * /*src2*/, AT * /*dst*/, const uchar * /*mask*/, int /*len*/, int /*cn*/) const
    {
        return 0;
    }
};
/**
 * Generic weighted-accumulation functor used when no specialisation exists
 * for the (T, AT) pair. It touches nothing and reports 0 elements processed.
 */
template <typename T, typename AT>
struct AccW_SIMD
{
    int operator() (const T * /*src*/, AT * /*dst*/, const uchar * /*mask*/, int /*len*/, int /*cn*/, AT /*alpha*/) const
    {
        return 0;
    }
};
#if CV_AVX
template <>
struct Acc_SIMD<float, float>
int operator() (const float * src, float * dst, const uchar * mask, int len, int cn) const
int x = 0;
if (!mask)
len *= cn;
for ( ; x <= len - 8 ; x += 8)
__m256 v_src = _mm256_loadu_ps(src + x);
__m256 v_dst = _mm256_loadu_ps(dst + x);
v_dst = _mm256_add_ps(v_src, v_dst);
_mm256_storeu_ps(dst + x, v_dst);
return x;
template <>
struct Acc_SIMD<float, double>
int operator() (const float * src, double * dst, const uchar * mask, int len, int cn) const
int x = 0;
if (!mask)
len *= cn;
for ( ; x <= len - 8 ; x += 8)
__m256 v_src = _mm256_loadu_ps(src + x);
__m256d v_src0 = _mm256_cvtps_pd(_mm256_extractf128_ps(v_src,0));
__m256d v_src1 = _mm256_cvtps_pd(_mm256_extractf128_ps(v_src,1));
__m256d v_dst0 = _mm256_loadu_pd(dst + x);
__m256d v_dst1 = _mm256_loadu_pd(dst + x + 4);
v_dst0 = _mm256_add_pd(v_src0, v_dst0);
v_dst1 = _mm256_add_pd(v_src1, v_dst1);
_mm256_storeu_pd(dst + x, v_dst0);
_mm256_storeu_pd(dst + x + 4, v_dst1);
return x;
template <>
struct Acc_SIMD<double, double>
int operator() (const double * src, double * dst, const uchar * mask, int len, int cn) const
int x = 0;
if (!mask)
len *= cn;
for ( ; x <= len - 4; x += 4)
__m256d v_src = _mm256_loadu_pd(src + x);
__m256d v_dst = _mm256_loadu_pd(dst + x);
v_dst = _mm256_add_pd(v_dst, v_src);
_mm256_storeu_pd(dst + x, v_dst);
return x;
template <>
struct AccSqr_SIMD<float, float>
int operator() (const float * src, float * dst, const uchar * mask, int len, int cn) const
int x = 0;
if (!mask)
len *= cn;
for ( ; x <= len - 8 ; x += 8)
__m256 v_src = _mm256_loadu_ps(src + x);
__m256 v_dst = _mm256_loadu_ps(dst + x);
v_src = _mm256_mul_ps(v_src, v_src);
v_dst = _mm256_add_ps(v_src, v_dst);
_mm256_storeu_ps(dst + x, v_dst);
return x;
template <>
struct AccSqr_SIMD<float, double>
int operator() (const float * src, double * dst, const uchar * mask, int len, int cn) const
int x = 0;
if (!mask)
len *= cn;
for ( ; x <= len - 8 ; x += 8)
__m256 v_src = _mm256_loadu_ps(src + x);
__m256d v_src0 = _mm256_cvtps_pd(_mm256_extractf128_ps(v_src,0));
__m256d v_src1 = _mm256_cvtps_pd(_mm256_extractf128_ps(v_src,1));
__m256d v_dst0 = _mm256_loadu_pd(dst + x);
__m256d v_dst1 = _mm256_loadu_pd(dst + x + 4);
v_src0 = _mm256_mul_pd(v_src0, v_src0);
v_src1 = _mm256_mul_pd(v_src1, v_src1);
v_dst0 = _mm256_add_pd(v_src0, v_dst0);
v_dst1 = _mm256_add_pd(v_src1, v_dst1);
_mm256_storeu_pd(dst + x, v_dst0);
_mm256_storeu_pd(dst + x + 4, v_dst1);
return x;
template <>
struct AccSqr_SIMD<double, double>
int operator() (const double * src, double * dst, const uchar * mask, int len, int cn) const
int x = 0;
if (!mask)
len *= cn;
for ( ; x <= len - 4; x += 4)
__m256d v_src = _mm256_loadu_pd(src + x);
__m256d v_dst = _mm256_loadu_pd(dst + x);
v_src = _mm256_mul_pd(v_src, v_src);
v_dst = _mm256_add_pd(v_dst, v_src);
_mm256_storeu_pd(dst + x, v_dst);
return x;
template <>
struct AccProd_SIMD<float, float>
int operator() (const float * src1, const float * src2, float * dst, const uchar * mask, int len, int cn) const
int x = 0;
if (!mask)
len *= cn;
for ( ; x <= len - 8; x += 8)
__m256 v_src0 = _mm256_loadu_ps(src1 + x);
__m256 v_src1 = _mm256_loadu_ps(src2 + x);
__m256 v_dst = _mm256_loadu_ps(dst + x);
__m256 v_src = _mm256_mul_ps(v_src0, v_src1);
v_dst = _mm256_add_ps(v_src, v_dst);
_mm256_storeu_ps(dst + x, v_dst);
return x;
template <>
struct AccProd_SIMD<float, double>
int operator() (const float * src1, const float * src2, double * dst, const uchar * mask, int len, int cn) const
int x = 0;
if (!mask)
len *= cn;
for ( ; x <= len - 8; x += 8)
__m256 v_1src = _mm256_loadu_ps(src1 + x);
__m256 v_2src = _mm256_loadu_ps(src2 + x);
__m256d v_src00 = _mm256_cvtps_pd(_mm256_extractf128_ps(v_1src,0));
__m256d v_src01 = _mm256_cvtps_pd(_mm256_extractf128_ps(v_1src,1));
__m256d v_src10 = _mm256_cvtps_pd(_mm256_extractf128_ps(v_2src,0));
__m256d v_src11 = _mm256_cvtps_pd(_mm256_extractf128_ps(v_2src,1));
__m256d v_dst0 = _mm256_loadu_pd(dst + x);
__m256d v_dst1 = _mm256_loadu_pd(dst + x + 4);
__m256d v_src0 = _mm256_mul_pd(v_src00, v_src10);
__m256d v_src1 = _mm256_mul_pd(v_src01, v_src11);
v_dst0 = _mm256_add_pd(v_src0, v_dst0);
v_dst1 = _mm256_add_pd(v_src1, v_dst1);
_mm256_storeu_pd(dst + x, v_dst0);
_mm256_storeu_pd(dst + x + 4, v_dst1);
return x;
template <>
struct AccProd_SIMD<double, double>
int operator() (const double * src1, const double * src2, double * dst, const uchar * mask, int len, int cn) const
int x = 0;
if (!mask)
len *= cn;
for ( ; x <= len - 4; x += 4)
__m256d v_src0 = _mm256_loadu_pd(src1 + x);
__m256d v_src1 = _mm256_loadu_pd(src2 + x);
__m256d v_dst = _mm256_loadu_pd(dst + x);
v_src0 = _mm256_mul_pd(v_src0, v_src1);
v_dst = _mm256_add_pd(v_dst, v_src0);
_mm256_storeu_pd(dst + x, v_dst);
return x;
template <>
struct AccW_SIMD<float, float>
int operator() (const float * src, float * dst, const uchar * mask, int len, int cn, float alpha) const
int x = 0;
__m256 v_alpha = _mm256_set1_ps(alpha);
__m256 v_beta = _mm256_set1_ps(1.0f - alpha);
if (!mask)
len *= cn;
for ( ; x <= len - 16; x += 16)
_mm256_storeu_ps(dst + x, _mm256_add_ps(_mm256_mul_ps(_mm256_loadu_ps(dst + x), v_beta), _mm256_mul_ps(_mm256_loadu_ps(src + x), v_alpha)));
_mm256_storeu_ps(dst + x + 8, _mm256_add_ps(_mm256_mul_ps(_mm256_loadu_ps(dst + x + 8), v_beta), _mm256_mul_ps(_mm256_loadu_ps(src + x + 8), v_alpha)));
return x;
template <>
struct AccW_SIMD<float, double>
int operator() (const float * src, double * dst, const uchar * mask, int len, int cn, double alpha) const
int x = 0;
__m256d v_alpha = _mm256_set1_pd(alpha);
__m256d v_beta = _mm256_set1_pd(1.0f - alpha);
if (!mask)
len *= cn;
for ( ; x <= len - 16; x += 16)
__m256 v_src0 = _mm256_loadu_ps(src + x);
__m256 v_src1 = _mm256_loadu_ps(src + x + 8);
__m256d v_src00 = _mm256_cvtps_pd(_mm256_extractf128_ps(v_src0,0));
__m256d v_src01 = _mm256_cvtps_pd(_mm256_extractf128_ps(v_src0,1));
__m256d v_src10 = _mm256_cvtps_pd(_mm256_extractf128_ps(v_src1,0));
__m256d v_src11 = _mm256_cvtps_pd(_mm256_extractf128_ps(v_src1,1));
_mm256_storeu_pd(dst + x, _mm256_add_pd(_mm256_mul_pd(_mm256_loadu_pd(dst + x), v_beta), _mm256_mul_pd(v_src00, v_alpha)));
_mm256_storeu_pd(dst + x + 4, _mm256_add_pd(_mm256_mul_pd(_mm256_loadu_pd(dst + x + 4), v_beta), _mm256_mul_pd(v_src01, v_alpha)));
_mm256_storeu_pd(dst + x + 8, _mm256_add_pd(_mm256_mul_pd(_mm256_loadu_pd(dst + x + 8), v_beta), _mm256_mul_pd(v_src10, v_alpha)));
_mm256_storeu_pd(dst + x + 12, _mm256_add_pd(_mm256_mul_pd(_mm256_loadu_pd(dst + x + 12), v_beta), _mm256_mul_pd(v_src11, v_alpha)));
return x;
template <>
struct AccW_SIMD<double, double>
int operator() (const double * src, double * dst, const uchar * mask, int len, int cn, double alpha) const
int x = 0;
__m256d v_alpha = _mm256_set1_pd(alpha);
__m256d v_beta = _mm256_set1_pd(1.0f - alpha);
if (!mask)
len *= cn;
for ( ; x <= len - 8; x += 8)
__m256d v_src0 = _mm256_loadu_pd(src + x);
__m256d v_src1 = _mm256_loadu_pd(src + x + 4);
_mm256_storeu_pd(dst + x, _mm256_add_pd(_mm256_mul_pd(_mm256_loadu_pd(dst + x), v_beta), _mm256_mul_pd(v_src0, v_alpha)));
_mm256_storeu_pd(dst + x + 4, _mm256_add_pd(_mm256_mul_pd(_mm256_loadu_pd(dst + x + 4), v_beta), _mm256_mul_pd(v_src1, v_alpha)));
return x;
#elif CV_SIMD128
template <>
struct Acc_SIMD<float, float>
int operator() (const float * src, float * dst, const uchar * mask, int len, int cn) const
int x = 0;
if (!mask)
len *= cn;
for ( ; x <= len - 8; x += 8)
v_store(dst + x, v_load(dst + x) + v_load(src + x));
v_store(dst + x + 4, v_load(dst + x + 4) + v_load(src + x + 4));
return x;
#if CV_SIMD128_64F
template <>
struct Acc_SIMD<float, double>
int operator() (const float * src, double * dst, const uchar * mask, int len, int cn) const
int x = 0;
if (!mask)
len *= cn;
for ( ; x <= len - 4; x += 4)
v_float32x4 v_src = v_load(src + x);
v_float64x2 v_src0 = v_cvt_f64(v_src);
v_float64x2 v_src1 = v_cvt_f64_high(v_src);
v_store(dst + x, v_load(dst + x) + v_src0);
v_store(dst + x + 2, v_load(dst + x + 2) + v_src1);
return x;
template <>
struct Acc_SIMD<double, double>
int operator() (const double * src, double * dst, const uchar * mask, int len, int cn) const
int x = 0;
if (!mask)
len *= cn;
for ( ; x <= len - 4; x += 4)
v_float64x2 v_src0 = v_load(src + x);
v_float64x2 v_src1 = v_load(src + x + 2);
v_store(dst + x, v_load(dst + x) + v_src0);
v_store(dst + x + 2, v_load(dst + x + 2) + v_src1);
return x;
#endif //CV_SIMD128_64F
template <>
struct AccSqr_SIMD<float, float>
int operator() (const float * src, float * dst, const uchar * mask, int len, int cn) const
int x = 0;
if (!mask)
len *= cn;
for ( ; x <= len - 8; x += 8)
v_float32x4 v_src0 = v_load(src + x);
v_float32x4 v_src1 = v_load(src + x + 4);
v_src0 = v_src0 * v_src0;
v_src1 = v_src1 * v_src1;
v_store(dst + x, v_load(dst + x) + v_src0);
v_store(dst + x + 4, v_load(dst + x + 4) + v_src1);
return x;
#if CV_SIMD128_64F
template <>
struct AccSqr_SIMD<float, double>
int operator() (const float * src, double * dst, const uchar * mask, int len, int cn) const
int x = 0;
if (!mask)
len *= cn;
for ( ; x <= len - 4; x += 4)
v_float32x4 v_src = v_load(src + x);
v_float64x2 v_src0 = v_cvt_f64(v_src);
v_float64x2 v_src1 = v_cvt_f64_high(v_src);
v_src0 = v_src0 * v_src0;
v_src1 = v_src1 * v_src1;
v_store(dst + x, v_load(dst + x) + v_src0);
v_store(dst + x + 2, v_load(dst + x + 2) + v_src1);
return x;
template <>
struct AccSqr_SIMD<double, double>
int operator() (const double * src, double * dst, const uchar * mask, int len, int cn) const
int x = 0;
if (!mask)
len *= cn;
for ( ; x <= len - 4; x += 4)
v_float64x2 v_src0 = v_load(src + x);
v_float64x2 v_src1 = v_load(src + x + 2);
v_src0 = v_src0 * v_src0;
v_src1 = v_src1 * v_src1;
v_store(dst + x, v_load(dst + x) + v_src0);
v_store(dst + x + 2, v_load(dst + x + 2) + v_src1);
return x;
#endif //CV_SIMD128_64F
template <>
struct AccProd_SIMD<float, float>
int operator() (const float * src1, const float * src2, float * dst, const uchar * mask, int len, int cn) const
int x = 0;
if (!mask)
len *= cn;
for ( ; x <= len - 8; x += 8)
v_store(dst + x, v_load(dst + x) + v_load(src1 + x) * v_load(src2 + x));
v_store(dst + x + 4, v_load(dst + x + 4) + v_load(src1 + x + 4) * v_load(src2 + x + 4));
return x;
#if CV_SIMD128_64F
template <>
struct AccProd_SIMD<float, double>
int operator() (const float * src1, const float * src2, double * dst, const uchar * mask, int len, int cn) const
int x = 0;
if (!mask)
len *= cn;
for ( ; x <= len - 4; x += 4)
v_float32x4 v_1src = v_load(src1 + x);
v_float32x4 v_2src = v_load(src2 + x);
v_float64x2 v_1src0 = v_cvt_f64(v_1src);
v_float64x2 v_1src1 = v_cvt_f64_high(v_1src);
v_float64x2 v_2src0 = v_cvt_f64(v_2src);
v_float64x2 v_2src1 = v_cvt_f64_high(v_2src);
v_store(dst + x, v_load(dst + x) + (v_1src0 * v_2src0));
v_store(dst + x + 2, v_load(dst + x + 2) + (v_1src1 * v_2src1));
return x;
template <>
struct AccProd_SIMD<double, double>
int operator() (const double * src1, const double * src2, double * dst, const uchar * mask, int len, int cn) const
int x = 0;
if (!mask)
len *= cn;
for ( ; x <= len - 4; x += 4)
v_float64x2 v_src00 = v_load(src1 + x);
v_float64x2 v_src01 = v_load(src1 + x + 2);
v_float64x2 v_src10 = v_load(src2 + x);
v_float64x2 v_src11 = v_load(src2 + x + 2);
v_store(dst + x, v_load(dst + x) + (v_src00 * v_src10));
v_store(dst + x + 2, v_load(dst + x + 2) + (v_src01 * v_src11));
return x;
#endif //CV_SIMD128_64F
template <>
struct AccW_SIMD<float, float>
int operator() (const float * src, float * dst, const uchar * mask, int len, int cn, float alpha) const
int x = 0;
v_float32x4 v_alpha = v_setall_f32(alpha);
v_float32x4 v_beta = v_setall_f32(1.0f - alpha);
if (!mask)
len *= cn;
for ( ; x <= len - 8; x += 8)
v_store(dst + x, ((v_load(dst + x) * v_beta) + (v_load(src + x) * v_alpha)));
v_store(dst + x + 4, ((v_load(dst + x + 4) * v_beta) + (v_load(src + x + 4) * v_alpha)));
return x;
#if CV_SIMD128_64F
template <>
struct AccW_SIMD<float, double>
int operator() (const float * src, double * dst, const uchar * mask, int len, int cn, double alpha) const
int x = 0;
v_float64x2 v_alpha = v_setall_f64(alpha);
v_float64x2 v_beta = v_setall_f64(1.0f - alpha);
if (!mask)
len *= cn;
for ( ; x <= len - 8; x += 8)
v_float32x4 v_src0 = v_load(src + x);
v_float32x4 v_src1 = v_load(src + x + 4);
v_float64x2 v_src00 = v_cvt_f64(v_src0);
v_float64x2 v_src01 = v_cvt_f64_high(v_src0);
v_float64x2 v_src10 = v_cvt_f64(v_src1);
v_float64x2 v_src11 = v_cvt_f64_high(v_src1);
v_store(dst + x, ((v_load(dst + x) * v_beta) + (v_src00 * v_alpha)));
v_store(dst + x + 2, ((v_load(dst + x + 2) * v_beta) + (v_src01 * v_alpha)));
v_store(dst + x + 4, ((v_load(dst + x + 4) * v_beta) + (v_src10 * v_alpha)));
v_store(dst + x + 6, ((v_load(dst + x + 6) * v_beta) + (v_src11 * v_alpha)));
return x;
template <>
struct AccW_SIMD<double, double>
int operator() (const double * src, double * dst, const uchar * mask, int len, int cn, double alpha) const
int x = 0;
v_float64x2 v_alpha = v_setall_f64(alpha);
v_float64x2 v_beta = v_setall_f64(1.0f - alpha);
if (!mask)
len *= cn;
for ( ; x <= len - 4; x += 4)
v_float64x2 v_src0 = v_load(src + x);
v_float64x2 v_src1 = v_load(src + x + 2);
v_store(dst + x, ((v_load(dst + x) * v_beta) + (v_src0 * v_alpha)));
v_store(dst + x + 2, ((v_load(dst + x + 2) * v_beta) + (v_src1 * v_alpha)));
return x;
#endif //CV_SIMD128_64F
#endif //CV_SIMD128
#if CV_SIMD128
template <>
struct Acc_SIMD<uchar, float>
int operator() (const uchar * src, float * dst, const uchar * mask, int len, int cn) const
int x = 0;
if (!mask)
len *= cn;
for ( ; x <= len - 16; x += 16)
v_uint8x16 v_src = v_load(src + x);
v_uint16x8 v_src0, v_src1;
v_expand(v_src, v_src0, v_src1);
v_uint32x4 v_src00, v_src01, v_src10, v_src11;
v_expand(v_src0, v_src00, v_src01);
v_expand(v_src1, v_src10, v_src11);
v_store(dst + x, v_load(dst + x) + v_cvt_f32(v_reinterpret_as_s32(v_src00)));
v_store(dst + x + 4, v_load(dst + x + 4) + v_cvt_f32(v_reinterpret_as_s32(v_src01)));
v_store(dst + x + 8, v_load(dst + x + 8) + v_cvt_f32(v_reinterpret_as_s32(v_src10)));
v_store(dst + x + 12, v_load(dst + x + 12) + v_cvt_f32(v_reinterpret_as_s32(v_src11)));
else if (cn == 1)
v_uint8x16 v_0 = v_setall_u8(0);
for ( ; x <= len - 16; x += 16)
v_uint8x16 v_mask = v_load(mask + x);
v_mask = ~(v_0 == v_mask);
v_uint8x16 v_src = v_load(src + x);
v_src = v_src & v_mask;
v_uint16x8 v_src0, v_src1;
v_expand(v_src, v_src0, v_src1);
v_uint32x4 v_src00, v_src01, v_src10, v_src11;
v_expand(v_src0, v_src00, v_src01);
v_expand(v_src1, v_src10, v_src11);
v_store(dst + x, v_load(dst + x) + v_cvt_f32(v_reinterpret_as_s32(v_src00)));
v_store(dst + x + 4, v_load(dst + x + 4) + v_cvt_f32(v_reinterpret_as_s32(v_src01)));
v_store(dst + x + 8, v_load(dst + x + 8) + v_cvt_f32(v_reinterpret_as_s32(v_src10)));
v_store(dst + x + 12, v_load(dst + x + 12) + v_cvt_f32(v_reinterpret_as_s32(v_src11)));
return x;
template <>
struct Acc_SIMD<ushort, float>
int operator() (const ushort * src, float * dst, const uchar * mask, int len, int cn) const
int x = 0;
if (!mask)
len *= cn;
for ( ; x <= len - 8; x += 8)
v_uint16x8 v_src = v_load(src + x);
v_uint32x4 v_src0, v_src1;
v_expand(v_src, v_src0, v_src1);
v_store(dst + x, v_load(dst + x) + v_cvt_f32(v_reinterpret_as_s32(v_src0)));
v_store(dst + x + 4, v_load(dst + x + 4) + v_cvt_f32(v_reinterpret_as_s32(v_src1)));
return x;
#if CV_SIMD128_64F
template <>
struct Acc_SIMD<uchar, double>
int operator() (const uchar * src, double * dst, const uchar * mask, int len, int cn) const
int x = 0;
if (!mask)
len *= cn;
for ( ; x <= len - 16; x += 16)
v_uint8x16 v_src = v_load(src + x);
v_uint16x8 v_int0, v_int1;
v_expand(v_src, v_int0, v_int1);
v_uint32x4 v_int00, v_int01, v_int10, v_int11;
v_expand(v_int0, v_int00, v_int01);
v_expand(v_int1, v_int10, v_int11);
v_float64x2 v_src0 = v_cvt_f64(v_reinterpret_as_s32(v_int00));
v_float64x2 v_src1 = v_cvt_f64_high(v_reinterpret_as_s32(v_int00));
v_float64x2 v_src2 = v_cvt_f64(v_reinterpret_as_s32(v_int01));
v_float64x2 v_src3 = v_cvt_f64_high(v_reinterpret_as_s32(v_int01));
v_float64x2 v_src4 = v_cvt_f64(v_reinterpret_as_s32(v_int10));
v_float64x2 v_src5 = v_cvt_f64_high(v_reinterpret_as_s32(v_int10));
v_float64x2 v_src6 = v_cvt_f64(v_reinterpret_as_s32(v_int11));
v_float64x2 v_src7 = v_cvt_f64_high(v_reinterpret_as_s32(v_int11));
v_float64x2 v_dst0 = v_load(dst + x);
v_float64x2 v_dst1 = v_load(dst + x + 2);
v_float64x2 v_dst2 = v_load(dst + x + 4);
v_float64x2 v_dst3 = v_load(dst + x + 6);
v_float64x2 v_dst4 = v_load(dst + x + 8);
v_float64x2 v_dst5 = v_load(dst + x + 10);
v_float64x2 v_dst6 = v_load(dst + x + 12);
v_float64x2 v_dst7 = v_load(dst + x + 14);
v_dst0 = v_dst0 + v_src0;
v_dst1 = v_dst1 + v_src1;
v_dst2 = v_dst2 + v_src2;
v_dst3 = v_dst3 + v_src3;
v_dst4 = v_dst4 + v_src4;
v_dst5 = v_dst5 + v_src5;
v_dst6 = v_dst6 + v_src6;
v_dst7 = v_dst7 + v_src7;
v_store(dst + x, v_dst0);
v_store(dst + x + 2, v_dst1);
v_store(dst + x + 4, v_dst2);
v_store(dst + x + 6, v_dst3);
v_store(dst + x + 8, v_dst4);
v_store(dst + x + 10, v_dst5);
v_store(dst + x + 12, v_dst6);
v_store(dst + x + 14, v_dst7);
return x;
template <>
struct Acc_SIMD<ushort, double>
int operator() (const ushort * src, double * dst, const uchar * mask, int len, int cn) const
int x = 0;
if (!mask)
len *= cn;
for ( ; x <= len - 8; x += 8)
v_uint16x8 v_src = v_load(src + x);
v_uint32x4 v_int0, v_int1;
v_expand(v_src, v_int0, v_int1);
v_float64x2 v_src0 = v_cvt_f64(v_reinterpret_as_s32(v_int0));
v_float64x2 v_src1 = v_cvt_f64_high(v_reinterpret_as_s32(v_int0));
v_float64x2 v_src2 = v_cvt_f64(v_reinterpret_as_s32(v_int1));
v_float64x2 v_src3 = v_cvt_f64_high(v_reinterpret_as_s32(v_int1));
v_float64x2 v_dst0 = v_load(dst + x);
v_float64x2 v_dst1 = v_load(dst + x + 2);
v_float64x2 v_dst2 = v_load(dst + x + 4);
v_float64x2 v_dst3 = v_load(dst + x + 6);
v_dst0 = v_dst0 + v_src0;
v_dst1 = v_dst1 + v_src1;
v_dst2 = v_dst2 + v_src2;
v_dst3 = v_dst3 + v_src3;
v_store(dst + x, v_dst0);
v_store(dst + x + 2, v_dst1);
v_store(dst + x + 4, v_dst2);
v_store(dst + x + 6, v_dst3);
return x;
template <>
struct AccSqr_SIMD<uchar, float>
int operator() (const uchar * src, float * dst, const uchar * mask, int len, int cn) const
int x = 0;
if (!mask)
len *= cn;
for ( ; x <= len - 16; x += 16)
v_uint8x16 v_src = v_load(src + x);
v_uint16x8 v_src0, v_src1;
v_expand(v_src, v_src0, v_src1);
v_src0 = v_src0 * v_src0;
v_src1 = v_src1 * v_src1;
v_uint32x4 v_src00, v_src01, v_src10, v_src11;
v_expand(v_src0, v_src00, v_src01);
v_expand(v_src1, v_src10, v_src11);
v_store(dst + x, v_load(dst + x) + v_cvt_f32(v_reinterpret_as_s32(v_src00)));
v_store(dst + x + 4, v_load(dst + x + 4) + v_cvt_f32(v_reinterpret_as_s32(v_src01)));
v_store(dst + x + 8, v_load(dst + x + 8) + v_cvt_f32(v_reinterpret_as_s32(v_src10)));
v_store(dst + x + 12, v_load(dst + x + 12) + v_cvt_f32(v_reinterpret_as_s32(v_src11)));
else if (cn == 1)
v_uint8x16 v_0 = v_setall_u8(0);
for ( ; x <= len - 16; x += 16)
v_uint8x16 v_mask = v_load(mask + x);
v_mask = ~(v_0 == v_mask);
v_uint8x16 v_src = v_load(src + x);
v_src = v_src & v_mask;
v_uint16x8 v_src0, v_src1;
v_expand(v_src, v_src0, v_src1);
v_src0 = v_src0 * v_src0;
v_src1 = v_src1 * v_src1;
v_uint32x4 v_src00, v_src01, v_src10, v_src11;
v_expand(v_src0, v_src00, v_src01);
v_expand(v_src1, v_src10, v_src11);
v_store(dst + x, v_load(dst + x) + v_cvt_f32(v_reinterpret_as_s32(v_src00)));
v_store(dst + x + 4, v_load(dst + x + 4) + v_cvt_f32(v_reinterpret_as_s32(v_src01)));
v_store(dst + x + 8, v_load(dst + x + 8) + v_cvt_f32(v_reinterpret_as_s32(v_src10)));
v_store(dst + x + 12, v_load(dst + x + 12) + v_cvt_f32(v_reinterpret_as_s32(v_src11)));
return x;
template <>
struct AccSqr_SIMD<ushort, float>
int operator() (const ushort * src, float * dst, const uchar * mask, int len, int cn) const
int x = 0;
if (!mask)
len *= cn;
for ( ; x <= len - 8; x += 8)
v_uint16x8 v_src = v_load(src + x);
v_uint32x4 v_src0, v_src1;
v_expand(v_src, v_src0, v_src1);
v_float32x4 v_float0, v_float1;
v_float0 = v_cvt_f32(v_reinterpret_as_s32(v_src0));
v_float1 = v_cvt_f32(v_reinterpret_as_s32(v_src1));
v_float0 = v_float0 * v_float0;
v_float1 = v_float1 * v_float1;
v_store(dst + x, v_load(dst + x) + v_float0);
v_store(dst + x + 4, v_load(dst + x + 4) + v_float1);
return x;
#if CV_SIMD128_64F
template <>
struct AccSqr_SIMD<uchar, double>
int operator() (const uchar * src, double * dst, const uchar * mask, int len, int cn) const
int x = 0;
if (!mask)
len *= cn;
for ( ; x <= len - 8; x += 8)
v_uint8x16 v_src = v_load(src + x);
v_uint16x8 v_int, dummy;
v_expand(v_src, v_int, dummy);
v_uint32x4 v_int0, v_int1;
v_expand(v_int, v_int0, v_int1);
v_float64x2 v_src0 = v_cvt_f64(v_reinterpret_as_s32(v_int0));
v_float64x2 v_src1 = v_cvt_f64_high(v_reinterpret_as_s32(v_int0));
v_float64x2 v_src2 = v_cvt_f64(v_reinterpret_as_s32(v_int1));
v_float64x2 v_src3 = v_cvt_f64_high(v_reinterpret_as_s32(v_int1));
v_src0 = v_src0 * v_src0;
v_src1 = v_src1 * v_src1;
v_src2 = v_src2 * v_src2;
v_src3 = v_src3 * v_src3;
v_float64x2 v_dst0 = v_load(dst + x);
v_float64x2 v_dst1 = v_load(dst + x + 2);
v_float64x2 v_dst2 = v_load(dst + x + 4);
v_float64x2 v_dst3 = v_load(dst + x + 6);
v_dst0 += v_src0;
v_dst1 += v_src1;
v_dst2 += v_src2;
v_dst3 += v_src3;
v_store(dst + x, v_dst0);
v_store(dst + x + 2, v_dst1);
v_store(dst + x + 4, v_dst2);
v_store(dst + x + 6, v_dst3);
return x;
template <>
struct AccSqr_SIMD<ushort, double>
int operator() (const ushort * src, double * dst, const uchar * mask, int len, int cn) const
int x = 0;
if (!mask)
len *= cn;
for ( ; x <= len - 8; x += 8)
v_uint16x8 v_src = v_load(src + x);
v_uint32x4 v_int_0, v_int_1;
v_expand(v_src, v_int_0, v_int_1);
v_int32x4 v_int0 = v_reinterpret_as_s32(v_int_0);
v_int32x4 v_int1 = v_reinterpret_as_s32(v_int_1);
v_float64x2 v_src0 = v_cvt_f64(v_int0);
v_float64x2 v_src1 = v_cvt_f64_high(v_int0);
v_float64x2 v_src2 = v_cvt_f64(v_int1);
v_float64x2 v_src3 = v_cvt_f64_high(v_int1);
v_src0 = v_src0 * v_src0;
v_src1 = v_src1 * v_src1;
v_src2 = v_src2 * v_src2;
v_src3 = v_src3 * v_src3;
v_float64x2 v_dst0 = v_load(dst + x);
v_float64x2 v_dst1 = v_load(dst + x + 2);
v_float64x2 v_dst2 = v_load(dst + x + 4);
v_float64x2 v_dst3 = v_load(dst + x + 6);
v_dst0 += v_src0;
v_dst1 += v_src1;
v_dst2 += v_src2;
v_dst3 += v_src3;
v_store(dst + x, v_dst0);
v_store(dst + x + 2, v_dst1);
v_store(dst + x + 4, v_dst2);
v_store(dst + x + 6, v_dst3);
return x;
template <>
struct AccProd_SIMD<uchar, float>
int operator() (const uchar * src1, const uchar * src2, float * dst, const uchar * mask, int len, int cn) const
int x = 0;
len *= cn;
if (!mask)
for ( ; x <= len - 16; x += 16)
v_uint8x16 v_1src = v_load(src1 + x);
v_uint8x16 v_2src = v_load(src2 + x);
v_uint16x8 v_1src0, v_1src1, v_2src0, v_2src1;
v_expand(v_1src, v_1src0, v_1src1);
v_expand(v_2src, v_2src0, v_2src1);
v_uint16x8 v_src0, v_src1;
v_src0 = v_1src0 * v_2src0;
v_src1 = v_1src1 * v_2src1;
v_uint32x4 v_src00, v_src01, v_src10, v_src11;
v_expand(v_src0, v_src00, v_src01);
v_expand(v_src1, v_src10, v_src11);
v_store(dst + x, v_load(dst + x) + v_cvt_f32(v_reinterpret_as_s32(v_src00)));
v_store(dst + x + 4, v_load(dst + x + 4) + v_cvt_f32(v_reinterpret_as_s32(v_src01)));
v_store(dst + x + 8, v_load(dst + x + 8) + v_cvt_f32(v_reinterpret_as_s32(v_src10)));
v_store(dst + x + 12, v_load(dst + x + 12) + v_cvt_f32(v_reinterpret_as_s32(v_src11)));
else if (cn == 1)
v_uint8x16 v_0 = v_setzero_u8();
for ( ; x <= len - 16; x += 16)
v_uint8x16 v_mask = v_load(mask + x);
v_mask = ~(v_0 == v_mask);
v_uint8x16 v_1src = v_load(src1 + x) & v_mask;
v_uint8x16 v_2src = v_load(src2 + x) & v_mask;
v_uint16x8 v_1src0, v_1src1, v_2src0, v_2src1;
v_expand(v_1src, v_1src0, v_1src1);
v_expand(v_2src, v_2src0, v_2src1);
v_uint16x8 v_src0, v_src1;
v_src0 = v_1src0 * v_2src0;
v_src1 = v_1src1 * v_2src1;
v_uint32x4 v_src00, v_src01, v_src10, v_src11;
v_expand(v_src0, v_src00, v_src01);
v_expand(v_src1, v_src10, v_src11);
v_store(dst + x, v_load(dst + x) + v_cvt_f32(v_reinterpret_as_s32(v_src00)));
v_store(dst + x + 4, v_load(dst + x + 4) + v_cvt_f32(v_reinterpret_as_s32(v_src01)));
v_store(dst + x + 8, v_load(dst + x + 8) + v_cvt_f32(v_reinterpret_as_s32(v_src10)));
v_store(dst + x + 12, v_load(dst + x + 12) + v_cvt_f32(v_reinterpret_as_s32(v_src11)));
return x;
template <>
struct AccProd_SIMD<ushort, float>
int operator() (const ushort * src1, const ushort * src2, float * dst, const uchar * mask, int len, int cn) const
int x = 0;
if (!mask)
len *= cn;
for ( ; x <= len - 8; x += 8)
v_uint16x8 v_1src = v_load(src1 + x);
v_uint16x8 v_2src = v_load(src2 + x);
v_uint32x4 v_1src0, v_1src1, v_2src0, v_2src1;
v_expand(v_1src, v_1src0, v_1src1);
v_expand(v_2src, v_2src0, v_2src1);
v_float32x4 v_1float0 = v_cvt_f32(v_reinterpret_as_s32(v_1src0));
v_float32x4 v_1float1 = v_cvt_f32(v_reinterpret_as_s32(v_1src1));
v_float32x4 v_2float0 = v_cvt_f32(v_reinterpret_as_s32(v_2src0));
v_float32x4 v_2float1 = v_cvt_f32(v_reinterpret_as_s32(v_2src1));
v_float32x4 v_src0 = v_1float0 * v_2float0;
v_float32x4 v_src1 = v_1float1 * v_2float1;
v_store(dst + x, v_load(dst + x) + v_src0);
v_store(dst + x + 4, v_load(dst + x + 4) + v_src1);
else if (cn == 1)
v_uint16x8 v_0 = v_setzero_u16();
for ( ; x <= len - 8; x += 8)
v_uint8x16 v_mask = v_load_halves(mask + x, mask + x);
v_uint16x8 v_mask0, v_mask1;
v_expand(v_mask, v_mask0, v_mask1);
v_mask0 = ~(v_0 == v_mask0);
v_uint16x8 v_1src = v_load(src1 + x) & v_mask0;
v_uint16x8 v_2src = v_load(src2 + x) & v_mask0;
v_uint32x4 v_1src0, v_1src1, v_2src0, v_2src1;
v_expand(v_1src, v_1src0, v_1src1);
v_expand(v_2src, v_2src0, v_2src1);
v_float32x4 v_1float0 = v_cvt_f32(v_reinterpret_as_s32(v_1src0));
v_float32x4 v_1float1 = v_cvt_f32(v_reinterpret_as_s32(v_1src1));
v_float32x4 v_2float0 = v_cvt_f32(v_reinterpret_as_s32(v_2src0));
v_float32x4 v_2float1 = v_cvt_f32(v_reinterpret_as_s32(v_2src1));
v_float32x4 v_src0 = v_1float0 * v_2float0;
v_float32x4 v_src1 = v_1float1 * v_2float1;
v_store(dst + x, v_load(dst + x) + v_src0);
v_store(dst + x + 4, v_load(dst + x + 4) + v_src1);
return x;
#if CV_SIMD128_64F
template <>
struct AccProd_SIMD<uchar, double>
int operator() (const uchar * src1, const uchar * src2, double * dst, const uchar * mask, int len, int cn) const
int x = 0;
if (!mask)
len *= cn;
for ( ; x <= len - 8; x += 8)
v_uint8x16 v_1src = v_load(src1 + x);
v_uint8x16 v_2src = v_load(src2 + x);
v_uint16x8 v_1int, v_2int, dummy;
v_expand(v_1src, v_1int, dummy);
v_expand(v_2src, v_2int, dummy);
v_uint32x4 v_1int_0, v_1int_1, v_2int_0, v_2int_1;
v_expand(v_1int, v_1int_0, v_1int_1);
v_expand(v_2int, v_2int_0, v_2int_1);
v_int32x4 v_1int0 = v_reinterpret_as_s32(v_1int_0);
v_int32x4 v_1int1 = v_reinterpret_as_s32(v_1int_1);
v_int32x4 v_2int0 = v_reinterpret_as_s32(v_2int_0);
v_int32x4 v_2int1 = v_reinterpret_as_s32(v_2int_1);
v_float64x2 v_src0 = v_cvt_f64(v_1int0) * v_cvt_f64(v_2int0);
v_float64x2 v_src1 = v_cvt_f64_high(v_1int0) * v_cvt_f64_high(v_2int0);
v_float64x2 v_src2 = v_cvt_f64(v_1int1) * v_cvt_f64(v_2int1);
v_float64x2 v_src3 = v_cvt_f64_high(v_1int1) * v_cvt_f64_high(v_2int1);
v_float64x2 v_dst0 = v_load(dst + x);
v_float64x2 v_dst1 = v_load(dst + x + 2);
v_float64x2 v_dst2 = v_load(dst + x + 4);
v_float64x2 v_dst3 = v_load(dst + x + 6);
v_dst0 += v_src0;
v_dst1 += v_src1;
v_dst2 += v_src2;
v_dst3 += v_src3;
v_store(dst + x, v_dst0);
v_store(dst + x + 2, v_dst1);
v_store(dst + x + 4, v_dst2);
v_store(dst + x + 6, v_dst3);
return x;
template <>
struct AccProd_SIMD<ushort, double>
int operator() (const ushort * src1, const ushort * src2, double * dst, const uchar * mask, int len, int cn) const
int x = 0;
if (!mask)
len *= cn;
for ( ; x <= len - 8; x += 8)
v_uint16x8 v_1src = v_load(src1 + x);
v_uint16x8 v_2src = v_load(src2 + x);
v_uint32x4 v_1int_0, v_1int_1, v_2int_0, v_2int_1;
v_expand(v_1src, v_1int_0, v_1int_1);
v_expand(v_2src, v_2int_0, v_2int_1);
v_int32x4 v_1int0 = v_reinterpret_as_s32(v_1int_0);
v_int32x4 v_1int1 = v_reinterpret_as_s32(v_1int_1);
v_int32x4 v_2int0 = v_reinterpret_as_s32(v_2int_0);
v_int32x4 v_2int1 = v_reinterpret_as_s32(v_2int_1);
v_float64x2 v_src0 = v_cvt_f64(v_1int0) * v_cvt_f64(v_2int0);
v_float64x2 v_src1 = v_cvt_f64_high(v_1int0) * v_cvt_f64_high(v_2int0);
v_float64x2 v_src2 = v_cvt_f64(v_1int1) * v_cvt_f64(v_2int1);
v_float64x2 v_src3 = v_cvt_f64_high(v_1int1) * v_cvt_f64_high(v_2int1);
v_float64x2 v_dst0 = v_load(dst + x);
v_float64x2 v_dst1 = v_load(dst + x + 2);
v_float64x2 v_dst2 = v_load(dst + x + 4);
v_float64x2 v_dst3 = v_load(dst + x + 6);
v_dst0 = v_dst0 + v_src0;
v_dst1 = v_dst1 + v_src1;
v_dst2 = v_dst2 + v_src2;
v_dst3 = v_dst3 + v_src3;
v_store(dst + x, v_dst0);
v_store(dst + x + 2, v_dst1);
v_store(dst + x + 4, v_dst2);
v_store(dst + x + 6, v_dst3);
return x;
template <>
struct AccW_SIMD<uchar, float>
int operator() (const uchar * src, float * dst, const uchar * mask, int len, int cn, float alpha) const
int x = 0;
v_float32x4 v_alpha = v_setall_f32(alpha);
v_float32x4 v_beta = v_setall_f32(1.0f - alpha);
if (!mask)
len *= cn;
for ( ; x <= len - 16; x += 16)
v_uint8x16 v_src = v_load(src + x);
v_uint16x8 v_src0, v_src1;
v_expand(v_src, v_src0, v_src1);
v_uint32x4 v_src00, v_src01, v_src10, v_src11;
v_expand(v_src0, v_src00, v_src01);
v_expand(v_src1, v_src10, v_src11);
v_float32x4 v_dst00 = v_load(dst + x);
v_float32x4 v_dst01 = v_load(dst + x + 4);
v_float32x4 v_dst10 = v_load(dst + x + 8);
v_float32x4 v_dst11 = v_load(dst + x + 12);
v_dst00 = (v_dst00 * v_beta) + (v_cvt_f32(v_reinterpret_as_s32(v_src00)) * v_alpha);
v_dst01 = (v_dst01 * v_beta) + (v_cvt_f32(v_reinterpret_as_s32(v_src01)) * v_alpha);
v_dst10 = (v_dst10 * v_beta) + (v_cvt_f32(v_reinterpret_as_s32(v_src10)) * v_alpha);
v_dst11 = (v_dst11 * v_beta) + (v_cvt_f32(v_reinterpret_as_s32(v_src11)) * v_alpha);
v_store(dst + x, v_dst00);
v_store(dst + x + 4, v_dst01);
v_store(dst + x + 8, v_dst10);
v_store(dst + x + 12, v_dst11);
return x;
template <>
struct AccW_SIMD<ushort, float>
int operator() (const ushort * src, float * dst, const uchar * mask, int len, int cn, float alpha) const
int x = 0;
v_float32x4 v_alpha = v_setall_f32(alpha);
v_float32x4 v_beta = v_setall_f32(1.0f - alpha);
if (!mask)
len *= cn;
for ( ; x <= len - 8; x += 8)
v_uint16x8 v_src = v_load(src + x);
v_uint32x4 v_int0, v_int1;
v_expand(v_src, v_int0, v_int1);
v_float32x4 v_src0 = v_cvt_f32(v_reinterpret_as_s32(v_int0));
v_float32x4 v_src1 = v_cvt_f32(v_reinterpret_as_s32(v_int1));
v_src0 = v_src0 * v_alpha;
v_src1 = v_src1 * v_alpha;
v_float32x4 v_dst0 = v_load(dst + x) * v_beta;
v_float32x4 v_dst1 = v_load(dst + x + 4) * v_beta;
v_store(dst + x, v_dst0 + v_src0);
v_store(dst + x + 4, v_dst1 + v_src1);
return x;
#if CV_SIMD128_64F
template <>
struct AccW_SIMD<uchar, double>
int operator() (const uchar * src, double * dst, const uchar * mask, int len, int cn, double alpha) const
int x = 0;
v_float64x2 v_alpha = v_setall_f64(alpha);
v_float64x2 v_beta = v_setall_f64(1.0f - alpha);
if (!mask)
len *= cn;
for ( ; x <= len - 8; x += 8)
v_uint8x16 v_src = v_load(src + x);
v_uint16x8 v_int, dummy;
v_expand(v_src, v_int, dummy);
v_uint32x4 v_int_0, v_int_1;
v_expand(v_int, v_int_0, v_int_1);
v_int32x4 v_int0 = v_reinterpret_as_s32(v_int_0);
v_int32x4 v_int1 = v_reinterpret_as_s32(v_int_1);
v_float64x2 v_src0 = v_cvt_f64(v_int0);
v_float64x2 v_src1 = v_cvt_f64_high(v_int0);
v_float64x2 v_src2 = v_cvt_f64(v_int1);
v_float64x2 v_src3 = v_cvt_f64_high(v_int1);
v_float64x2 v_dst0 = v_load(dst + x);
v_float64x2 v_dst1 = v_load(dst + x + 2);
v_float64x2 v_dst2 = v_load(dst + x + 4);
v_float64x2 v_dst3 = v_load(dst + x + 6);
v_dst0 = (v_dst0 * v_beta) + (v_src0 * v_alpha);
v_dst1 = (v_dst1 * v_beta) + (v_src1 * v_alpha);
v_dst2 = (v_dst2 * v_beta) + (v_src2 * v_alpha);
v_dst3 = (v_dst3 * v_beta) + (v_src3 * v_alpha);
v_store(dst + x, v_dst0);
v_store(dst + x + 2, v_dst1);
v_store(dst + x + 4, v_dst2);
v_store(dst + x + 6, v_dst3);
return x;
template <>
struct AccW_SIMD<ushort, double>
int operator() (const ushort * src, double * dst, const uchar * mask, int len, int cn, double alpha) const
int x = 0;
v_float64x2 v_alpha = v_setall_f64(alpha);
v_float64x2 v_beta = v_setall_f64(1.0f - alpha);
if (!mask)
len *= cn;
for ( ; x <= len - 8; x += 8)
v_uint16x8 v_src = v_load(src + x);
v_uint32x4 v_int_0, v_int_1;
v_expand(v_src, v_int_0, v_int_1);
v_int32x4 v_int0 = v_reinterpret_as_s32(v_int_0);
v_int32x4 v_int1 = v_reinterpret_as_s32(v_int_1);
v_float64x2 v_src00 = v_cvt_f64(v_int0);
v_float64x2 v_src01 = v_cvt_f64_high(v_int0);
v_float64x2 v_src10 = v_cvt_f64(v_int1);
v_float64x2 v_src11 = v_cvt_f64_high(v_int1);
v_float64x2 v_dst00 = v_load(dst + x);
v_float64x2 v_dst01 = v_load(dst + x + 2);
v_float64x2 v_dst10 = v_load(dst + x + 4);
v_float64x2 v_dst11 = v_load(dst + x + 6);
v_dst00 = (v_dst00 * v_beta) + (v_src00 * v_alpha);
v_dst01 = (v_dst01 * v_beta) + (v_src01 * v_alpha);
v_dst10 = (v_dst10 * v_beta) + (v_src10 * v_alpha);
v_dst11 = (v_dst11 * v_beta) + (v_src11 * v_alpha);
v_store(dst + x, v_dst00);
v_store(dst + x + 2, v_dst01);
v_store(dst + x + 4, v_dst10);
v_store(dst + x + 6, v_dst11);
return x;
#endif //CV_SIMD128_64F
#endif //CV_SIMD128
template<typename T, typename AT> void
acc_( const T* src, AT* dst, const uchar* mask, int len, int cn )
int i = Acc_SIMD<T, AT>()(src, dst, mask, len, cn);
if( !mask )
len *= cn;
for( ; i <= len - 4; i += 4 )
AT t0, t1;
t0 = src[i] + dst[i];
t1 = src[i+1] + dst[i+1];
dst[i] = t0; dst[i+1] = t1;
t0 = src[i+2] + dst[i+2];
t1 = src[i+3] + dst[i+3];
dst[i+2] = t0; dst[i+3] = t1;
for( ; i < len; i++ )
dst[i] += src[i];
else if( cn == 1 )
for( ; i < len; i++ )
if( mask[i] )
dst[i] += src[i];
else if( cn == 3 )
for( ; i < len; i++, src += 3, dst += 3 )
if( mask[i] )
AT t0 = src[0] + dst[0];
AT t1 = src[1] + dst[1];
AT t2 = src[2] + dst[2];
dst[0] = t0; dst[1] = t1; dst[2] = t2;
for( ; i < len; i++, src += cn, dst += cn )
if( mask[i] )
for( int k = 0; k < cn; k++ )
dst[k] += src[k];
template<typename T, typename AT> void
accSqr_( const T* src, AT* dst, const uchar* mask, int len, int cn )
int i = AccSqr_SIMD<T, AT>()(src, dst, mask, len, cn);
if( !mask )
len *= cn;
for( ; i <= len - 4; i += 4 )
AT t0, t1;
t0 = (AT)src[i]*src[i] + dst[i];
t1 = (AT)src[i+1]*src[i+1] + dst[i+1];
dst[i] = t0; dst[i+1] = t1;
t0 = (AT)src[i+2]*src[i+2] + dst[i+2];
t1 = (AT)src[i+3]*src[i+3] + dst[i+3];
dst[i+2] = t0; dst[i+3] = t1;
for( ; i < len; i++ )
dst[i] += (AT)src[i]*src[i];
else if( cn == 1 )
for( ; i < len; i++ )
if( mask[i] )
dst[i] += (AT)src[i]*src[i];
else if( cn == 3 )
for( ; i < len; i++, src += 3, dst += 3 )
if( mask[i] )
AT t0 = (AT)src[0]*src[0] + dst[0];
AT t1 = (AT)src[1]*src[1] + dst[1];
AT t2 = (AT)src[2]*src[2] + dst[2];
dst[0] = t0; dst[1] = t1; dst[2] = t2;
for( ; i < len; i++, src += cn, dst += cn )
if( mask[i] )
for( int k = 0; k < cn; k++ )
dst[k] += (AT)src[k]*src[k];
template<typename T, typename AT> void
accProd_( const T* src1, const T* src2, AT* dst, const uchar* mask, int len, int cn )
int i = AccProd_SIMD<T, AT>()(src1, src2, dst, mask, len, cn);
if( !mask )
len *= cn;
for( ; i <= len - 4; i += 4 )
AT t0, t1;
t0 = (AT)src1[i]*src2[i] + dst[i];
t1 = (AT)src1[i+1]*src2[i+1] + dst[i+1];
dst[i] = t0; dst[i+1] = t1;
t0 = (AT)src1[i+2]*src2[i+2] + dst[i+2];
t1 = (AT)src1[i+3]*src2[i+3] + dst[i+3];
dst[i+2] = t0; dst[i+3] = t1;
for( ; i < len; i++ )
dst[i] += (AT)src1[i]*src2[i];
else if( cn == 1 )
for( ; i < len; i++ )
if( mask[i] )
dst[i] += (AT)src1[i]*src2[i];
else if( cn == 3 )
for( ; i < len; i++, src1 += 3, src2 += 3, dst += 3 )
if( mask[i] )
AT t0 = (AT)src1[0]*src2[0] + dst[0];
AT t1 = (AT)src1[1]*src2[1] + dst[1];
AT t2 = (AT)src1[2]*src2[2] + dst[2];
dst[0] = t0; dst[1] = t1; dst[2] = t2;
for( ; i < len; i++, src1 += cn, src2 += cn, dst += cn )
if( mask[i] )
for( int k = 0; k < cn; k++ )
dst[k] += (AT)src1[k]*src2[k];
template<typename T, typename AT> void
accW_( const T* src, AT* dst, const uchar* mask, int len, int cn, double alpha )
AT a = (AT)alpha, b = 1 - a;
int i = AccW_SIMD<T, AT>()(src, dst, mask, len, cn, a);
if( !mask )
len *= cn;
for( ; i <= len - 4; i += 4 )
AT t0, t1;
t0 = src[i]*a + dst[i]*b;
t1 = src[i+1]*a + dst[i+1]*b;
dst[i] = t0; dst[i+1] = t1;
t0 = src[i+2]*a + dst[i+2]*b;
t1 = src[i+3]*a + dst[i+3]*b;
dst[i+2] = t0; dst[i+3] = t1;
for( ; i < len; i++ )
dst[i] = src[i]*a + dst[i]*b;
else if( cn == 1 )
for( ; i < len; i++ )
if( mask[i] )
dst[i] = src[i]*a + dst[i]*b;
else if( cn == 3 )
for( ; i < len; i++, src += 3, dst += 3 )
if( mask[i] )
AT t0 = src[0]*a + dst[0]*b;
AT t1 = src[1]*a + dst[1]*b;
AT t2 = src[2]*a + dst[2]*b;
dst[0] = t0; dst[1] = t1; dst[2] = t2;
for( ; i < len; i++, src += cn, dst += cn )
if( mask[i] )
for( int k = 0; k < cn; k++ )
dst[k] = src[k]*a + dst[k]*b;
// DEF_ACC_FUNCS(suffix, type, acctype) emits four file-local, non-template
// wrappers -- acc_<suffix>, accSqr_<suffix>, accProd_<suffix> and
// accW_<suffix> -- each forwarding to the matching function template
// (acc_, accSqr_, accProd_, accW_) for source element type `type` and
// accumulator element type `acctype`.
#define DEF_ACC_FUNCS(suffix, type, acctype) \
static void acc_##suffix(const type* src, acctype* dst, \
const uchar* mask, int len, int cn) \
{ acc_(src, dst, mask, len, cn); } \
static void accSqr_##suffix(const type* src, acctype* dst, \
const uchar* mask, int len, int cn) \
{ accSqr_(src, dst, mask, len, cn); } \
static void accProd_##suffix(const type* src1, const type* src2, \
acctype* dst, const uchar* mask, int len, int cn) \
{ accProd_(src1, src2, dst, mask, len, cn); } \
static void accW_##suffix(const type* src, acctype* dst, \
const uchar* mask, int len, int cn, double alpha) \
{ accW_(src, dst, mask, len, cn, alpha); }
// One set of wrappers per (source type, accumulator type) pair.
// The suffix spells the pair: source first, then accumulator; a single
// token (32f, 64f) is used when both types are the same.
DEF_ACC_FUNCS(8u32f, uchar, float)
DEF_ACC_FUNCS(8u64f, uchar, double)
DEF_ACC_FUNCS(16u32f, ushort, float)
DEF_ACC_FUNCS(16u64f, ushort, double)
DEF_ACC_FUNCS(32f, float, float)
DEF_ACC_FUNCS(32f64f, float, double)
DEF_ACC_FUNCS(64f, double, double)
// Type-erased function-pointer signatures for the accumulate kernels: every
// buffer is passed as a uchar pointer regardless of its real element type.
// The trailing parameters line up with the wrappers generated by
// DEF_ACC_FUNCS: (mask, len, cn), plus the weight `alpha` for AccWFunc.
// AccFunc is the element type of the accTab table declared right below.
// NOTE(review): AccProdFunc / AccWFunc are presumably used by sibling tables
// that are not visible in this chunk -- confirm.
typedef void (*AccFunc)(const uchar*, uchar*, const uchar*, int, int);
typedef void (*AccProdFunc)(const uchar*, const uchar*, uchar*, const uchar*, int, int);
typedef void (*AccWFunc)(const uchar*, uchar*, const uchar*, int, int, double);
static AccFunc accTab[] =
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
#include "precomp.hpp"
#include "accum.simd.hpp"
#include "accum.simd_declarations.hpp" // defines CV_CPU_DISPATCH_MODES_ALL=AVX2,...,BASELINE based on CMakeLists.txt content
// Instantiates the accumulate entry points for every supported
// (source element type, accumulator element type) pair.
// NOTE(review): DEF_ACC_INT_FUNCS / DEF_ACC_FLT_FUNCS are not defined in this
// file; presumably they come from accum.simd.hpp included above -- confirm.
namespace cv {
// Integer sources (8-bit / 16-bit unsigned) with float or double accumulators.
DEF_ACC_INT_FUNCS(8u32f, uchar, float)
DEF_ACC_INT_FUNCS(8u64f, uchar, double)
DEF_ACC_INT_FUNCS(16u32f, ushort, float)
DEF_ACC_INT_FUNCS(16u64f, ushort, double)
// Floating-point sources.
DEF_ACC_FLT_FUNCS(32f, float, float)
DEF_ACC_FLT_FUNCS(32f64f, float, double)
DEF_ACC_FLT_FUNCS(64f, double, double)
} // namespace cv
\ No newline at end of file
This source diff could not be displayed because it is too large. You can view the blob instead.
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment