Commit 91d80d03 authored by Maksim Shabunin's avatar Maksim Shabunin

Merge pull request #6798 from tomoaki0705:featureSSEvideoAcc

parents 63d82904 3c2f7ecc
......@@ -83,6 +83,1370 @@ struct AccW_SIMD
}
};
#if CV_AVX
// Acc: dst[i] += src[i]. AVX path, float source -> float accumulator.
// Handles 8 floats per iteration; returns the number of elements processed
// so the caller can finish the tail (and any masked pixels) in scalar code.
template <>
struct Acc_SIMD<float, float>
{
int operator() (const float * src, float * dst, const uchar * mask, int len, int cn) const
{
int x = 0;
if (!mask)
{
// No mask: treat the row as a flat array of len*cn floats.
len *= cn;
for ( ; x <= len - 8 ; x += 8)
{
__m256 v_src = _mm256_loadu_ps(src + x);
__m256 v_dst = _mm256_loadu_ps(dst + x);
v_dst = _mm256_add_ps(v_src, v_dst);
_mm256_storeu_ps(dst + x, v_dst);
}
}
return x;
}
};
// Acc: dst[i] += src[i]. AVX path, float source -> double accumulator.
// Widens 8 floats to 2x4 doubles per iteration; returns elements processed.
template <>
struct Acc_SIMD<float, double>
{
int operator() (const float * src, double * dst, const uchar * mask, int len, int cn) const
{
int x = 0;
if (!mask)
{
len *= cn;
for ( ; x <= len - 8 ; x += 8)
{
__m256 v_src = _mm256_loadu_ps(src + x);
// Convert low/high 128-bit halves (4 floats each) to 4 doubles.
__m256d v_src0 = _mm256_cvtps_pd(_mm256_extractf128_ps(v_src,0));
__m256d v_src1 = _mm256_cvtps_pd(_mm256_extractf128_ps(v_src,1));
__m256d v_dst0 = _mm256_loadu_pd(dst + x);
__m256d v_dst1 = _mm256_loadu_pd(dst + x + 4);
v_dst0 = _mm256_add_pd(v_src0, v_dst0);
v_dst1 = _mm256_add_pd(v_src1, v_dst1);
_mm256_storeu_pd(dst + x, v_dst0);
_mm256_storeu_pd(dst + x + 4, v_dst1);
}
}
return x;
}
};
// Acc: dst[i] += src[i]. AVX path, double source -> double accumulator.
// Handles 4 doubles per iteration; returns elements processed.
template <>
struct Acc_SIMD<double, double>
{
int operator() (const double * src, double * dst, const uchar * mask, int len, int cn) const
{
int x = 0;
if (!mask)
{
len *= cn;
for ( ; x <= len - 4; x += 4)
{
__m256d v_src = _mm256_loadu_pd(src + x);
__m256d v_dst = _mm256_loadu_pd(dst + x);
v_dst = _mm256_add_pd(v_dst, v_src);
_mm256_storeu_pd(dst + x, v_dst);
}
}
return x;
}
};
// AccSqr: dst[i] += src[i]*src[i]. AVX path, float -> float.
// Handles 8 floats per iteration; returns elements processed.
template <>
struct AccSqr_SIMD<float, float>
{
int operator() (const float * src, float * dst, const uchar * mask, int len, int cn) const
{
int x = 0;
if (!mask)
{
len *= cn;
for ( ; x <= len - 8 ; x += 8)
{
__m256 v_src = _mm256_loadu_ps(src + x);
__m256 v_dst = _mm256_loadu_ps(dst + x);
v_src = _mm256_mul_ps(v_src, v_src);
v_dst = _mm256_add_ps(v_src, v_dst);
_mm256_storeu_ps(dst + x, v_dst);
}
}
return x;
}
};
// AccSqr: dst[i] += src[i]*src[i]. AVX path, float -> double.
// Squares after widening to double, so no precision is lost in the product.
template <>
struct AccSqr_SIMD<float, double>
{
int operator() (const float * src, double * dst, const uchar * mask, int len, int cn) const
{
int x = 0;
if (!mask)
{
len *= cn;
for ( ; x <= len - 8 ; x += 8)
{
__m256 v_src = _mm256_loadu_ps(src + x);
__m256d v_src0 = _mm256_cvtps_pd(_mm256_extractf128_ps(v_src,0));
__m256d v_src1 = _mm256_cvtps_pd(_mm256_extractf128_ps(v_src,1));
__m256d v_dst0 = _mm256_loadu_pd(dst + x);
__m256d v_dst1 = _mm256_loadu_pd(dst + x + 4);
v_src0 = _mm256_mul_pd(v_src0, v_src0);
v_src1 = _mm256_mul_pd(v_src1, v_src1);
v_dst0 = _mm256_add_pd(v_src0, v_dst0);
v_dst1 = _mm256_add_pd(v_src1, v_dst1);
_mm256_storeu_pd(dst + x, v_dst0);
_mm256_storeu_pd(dst + x + 4, v_dst1);
}
}
return x;
}
};
// AccSqr: dst[i] += src[i]*src[i]. AVX path, double -> double.
template <>
struct AccSqr_SIMD<double, double>
{
int operator() (const double * src, double * dst, const uchar * mask, int len, int cn) const
{
int x = 0;
if (!mask)
{
len *= cn;
for ( ; x <= len - 4; x += 4)
{
__m256d v_src = _mm256_loadu_pd(src + x);
__m256d v_dst = _mm256_loadu_pd(dst + x);
v_src = _mm256_mul_pd(v_src, v_src);
v_dst = _mm256_add_pd(v_dst, v_src);
_mm256_storeu_pd(dst + x, v_dst);
}
}
return x;
}
};
// AccProd: dst[i] += src1[i]*src2[i]. AVX path, float -> float.
template <>
struct AccProd_SIMD<float, float>
{
int operator() (const float * src1, const float * src2, float * dst, const uchar * mask, int len, int cn) const
{
int x = 0;
if (!mask)
{
len *= cn;
for ( ; x <= len - 8; x += 8)
{
__m256 v_src0 = _mm256_loadu_ps(src1 + x);
__m256 v_src1 = _mm256_loadu_ps(src2 + x);
__m256 v_dst = _mm256_loadu_ps(dst + x);
__m256 v_src = _mm256_mul_ps(v_src0, v_src1);
v_dst = _mm256_add_ps(v_src, v_dst);
_mm256_storeu_ps(dst + x, v_dst);
}
}
return x;
}
};
// AccProd: dst[i] += src1[i]*src2[i]. AVX path, float -> double.
// Both operands are widened to double before multiplication.
template <>
struct AccProd_SIMD<float, double>
{
int operator() (const float * src1, const float * src2, double * dst, const uchar * mask, int len, int cn) const
{
int x = 0;
if (!mask)
{
len *= cn;
for ( ; x <= len - 8; x += 8)
{
__m256 v_1src = _mm256_loadu_ps(src1 + x);
__m256 v_2src = _mm256_loadu_ps(src2 + x);
__m256d v_src00 = _mm256_cvtps_pd(_mm256_extractf128_ps(v_1src,0));
__m256d v_src01 = _mm256_cvtps_pd(_mm256_extractf128_ps(v_1src,1));
__m256d v_src10 = _mm256_cvtps_pd(_mm256_extractf128_ps(v_2src,0));
__m256d v_src11 = _mm256_cvtps_pd(_mm256_extractf128_ps(v_2src,1));
__m256d v_dst0 = _mm256_loadu_pd(dst + x);
__m256d v_dst1 = _mm256_loadu_pd(dst + x + 4);
__m256d v_src0 = _mm256_mul_pd(v_src00, v_src10);
__m256d v_src1 = _mm256_mul_pd(v_src01, v_src11);
v_dst0 = _mm256_add_pd(v_src0, v_dst0);
v_dst1 = _mm256_add_pd(v_src1, v_dst1);
_mm256_storeu_pd(dst + x, v_dst0);
_mm256_storeu_pd(dst + x + 4, v_dst1);
}
}
return x;
}
};
// AccProd: dst[i] += src1[i]*src2[i]. AVX path, double -> double.
template <>
struct AccProd_SIMD<double, double>
{
int operator() (const double * src1, const double * src2, double * dst, const uchar * mask, int len, int cn) const
{
int x = 0;
if (!mask)
{
len *= cn;
for ( ; x <= len - 4; x += 4)
{
__m256d v_src0 = _mm256_loadu_pd(src1 + x);
__m256d v_src1 = _mm256_loadu_pd(src2 + x);
__m256d v_dst = _mm256_loadu_pd(dst + x);
v_src0 = _mm256_mul_pd(v_src0, v_src1);
v_dst = _mm256_add_pd(v_dst, v_src0);
_mm256_storeu_pd(dst + x, v_dst);
}
}
return x;
}
};
// AccW (running weighted average): dst = dst*(1-alpha) + src*alpha.
// AVX path, float -> float; handles 16 floats per iteration.
template <>
struct AccW_SIMD<float, float>
{
int operator() (const float * src, float * dst, const uchar * mask, int len, int cn, float alpha) const
{
int x = 0;
__m256 v_alpha = _mm256_set1_ps(alpha);
__m256 v_beta = _mm256_set1_ps(1.0f - alpha);
if (!mask)
{
len *= cn;
for ( ; x <= len - 16; x += 16)
{
_mm256_storeu_ps(dst + x, _mm256_add_ps(_mm256_mul_ps(_mm256_loadu_ps(dst + x), v_beta), _mm256_mul_ps(_mm256_loadu_ps(src + x), v_alpha)));
_mm256_storeu_ps(dst + x + 8, _mm256_add_ps(_mm256_mul_ps(_mm256_loadu_ps(dst + x + 8), v_beta), _mm256_mul_ps(_mm256_loadu_ps(src + x + 8), v_alpha)));
}
}
return x;
}
};
// AccW (running weighted average): dst = dst*(1-alpha) + src*alpha.
// AVX path, float source -> double accumulator; handles 16 elements per
// iteration. Returns the number of elements processed so the caller can
// finish the tail (and any masked case) in scalar code.
template <>
struct AccW_SIMD<float, double>
{
    int operator() (const float * src, double * dst, const uchar * mask, int len, int cn, double alpha) const
    {
        int x = 0;
        __m256d v_alpha = _mm256_set1_pd(alpha);
        // Double literal: alpha is double, so compute the complementary
        // weight in double (was the misleading float literal "1.0f").
        __m256d v_beta = _mm256_set1_pd(1.0 - alpha);
        if (!mask)
        {
            len *= cn;
            for ( ; x <= len - 16; x += 16)
            {
                __m256 v_src0 = _mm256_loadu_ps(src + x);
                __m256 v_src1 = _mm256_loadu_ps(src + x + 8);
                // Widen each 128-bit half (4 floats) to 4 doubles.
                __m256d v_src00 = _mm256_cvtps_pd(_mm256_extractf128_ps(v_src0,0));
                __m256d v_src01 = _mm256_cvtps_pd(_mm256_extractf128_ps(v_src0,1));
                __m256d v_src10 = _mm256_cvtps_pd(_mm256_extractf128_ps(v_src1,0));
                __m256d v_src11 = _mm256_cvtps_pd(_mm256_extractf128_ps(v_src1,1));
                _mm256_storeu_pd(dst + x, _mm256_add_pd(_mm256_mul_pd(_mm256_loadu_pd(dst + x), v_beta), _mm256_mul_pd(v_src00, v_alpha)));
                _mm256_storeu_pd(dst + x + 4, _mm256_add_pd(_mm256_mul_pd(_mm256_loadu_pd(dst + x + 4), v_beta), _mm256_mul_pd(v_src01, v_alpha)));
                _mm256_storeu_pd(dst + x + 8, _mm256_add_pd(_mm256_mul_pd(_mm256_loadu_pd(dst + x + 8), v_beta), _mm256_mul_pd(v_src10, v_alpha)));
                _mm256_storeu_pd(dst + x + 12, _mm256_add_pd(_mm256_mul_pd(_mm256_loadu_pd(dst + x + 12), v_beta), _mm256_mul_pd(v_src11, v_alpha)));
            }
        }
        return x;
    }
};
// AccW (running weighted average): dst = dst*(1-alpha) + src*alpha.
// AVX path, double -> double; handles 8 doubles per iteration. Returns
// the number of elements processed so the caller can finish the tail.
template <>
struct AccW_SIMD<double, double>
{
    int operator() (const double * src, double * dst, const uchar * mask, int len, int cn, double alpha) const
    {
        int x = 0;
        __m256d v_alpha = _mm256_set1_pd(alpha);
        // Double literal: alpha is double, so compute the complementary
        // weight in double (was the misleading float literal "1.0f").
        __m256d v_beta = _mm256_set1_pd(1.0 - alpha);
        if (!mask)
        {
            len *= cn;
            for ( ; x <= len - 8; x += 8)
            {
                __m256d v_src0 = _mm256_loadu_pd(src + x);
                __m256d v_src1 = _mm256_loadu_pd(src + x + 4);
                _mm256_storeu_pd(dst + x, _mm256_add_pd(_mm256_mul_pd(_mm256_loadu_pd(dst + x), v_beta), _mm256_mul_pd(v_src0, v_alpha)));
                _mm256_storeu_pd(dst + x + 4, _mm256_add_pd(_mm256_mul_pd(_mm256_loadu_pd(dst + x + 4), v_beta), _mm256_mul_pd(v_src1, v_alpha)));
            }
        }
        return x;
    }
};
#elif CV_SSE2
// Acc: dst[i] += src[i]. SSE2 path, float -> float; 8 floats per iteration.
template <>
struct Acc_SIMD<float, float>
{
int operator() (const float * src, float * dst, const uchar * mask, int len, int cn) const
{
int x = 0;
if (!mask)
{
len *= cn;
for ( ; x <= len - 8; x += 8)
{
_mm_storeu_ps(dst + x, _mm_add_ps(_mm_loadu_ps(dst + x), _mm_loadu_ps(src + x)));
_mm_storeu_ps(dst + x + 4, _mm_add_ps(_mm_loadu_ps(dst + x + 4), _mm_loadu_ps(src + x + 4)));
}
}
return x;
}
};
// Acc: dst[i] += src[i]. SSE2 path, float -> double; 4 floats per iteration.
template <>
struct Acc_SIMD<float, double>
{
int operator() (const float * src, double * dst, const uchar * mask, int len, int cn) const
{
int x = 0;
if (!mask)
{
len *= cn;
for ( ; x <= len - 4; x += 4)
{
__m128 v_src = _mm_loadu_ps(src + x);
__m128d v_src0 = _mm_cvtps_pd(v_src);
// Shuffle brings the high float pair into the low lanes for cvtps_pd.
__m128d v_src1 = _mm_cvtps_pd(_mm_shuffle_ps(v_src, v_src, _MM_SHUFFLE(1, 0, 3, 2)));
__m128d v_dst0 = _mm_loadu_pd(dst + x);
__m128d v_dst1 = _mm_loadu_pd(dst + x + 2);
v_dst0 = _mm_add_pd(v_dst0, v_src0);
v_dst1 = _mm_add_pd(v_dst1, v_src1);
_mm_storeu_pd(dst + x, v_dst0);
_mm_storeu_pd(dst + x + 2, v_dst1);
}
}
return x;
}
};
// Acc: dst[i] += src[i]. SSE2 path, double -> double; 4 doubles per iteration.
template <>
struct Acc_SIMD<double, double>
{
int operator() (const double * src, double * dst, const uchar * mask, int len, int cn) const
{
int x = 0;
if (!mask)
{
len *= cn;
for ( ; x <= len - 4; x += 4)
{
__m128d v_src0 = _mm_loadu_pd(src + x);
__m128d v_src1 = _mm_loadu_pd(src + x + 2);
__m128d v_dst0 = _mm_loadu_pd(dst + x);
__m128d v_dst1 = _mm_loadu_pd(dst + x + 2);
v_dst0 = _mm_add_pd(v_dst0, v_src0);
v_dst1 = _mm_add_pd(v_dst1, v_src1);
_mm_storeu_pd(dst + x, v_dst0);
_mm_storeu_pd(dst + x + 2, v_dst1);
}
}
return x;
}
};
// AccSqr: dst[i] += src[i]*src[i]. SSE2 path, float -> float.
template <>
struct AccSqr_SIMD<float, float>
{
int operator() (const float * src, float * dst, const uchar * mask, int len, int cn) const
{
int x = 0;
if (!mask)
{
len *= cn;
for ( ; x <= len - 8; x += 8)
{
__m128 v_src0 = _mm_loadu_ps(src + x);
__m128 v_src1 = _mm_loadu_ps(src + x + 4);
v_src0 = _mm_mul_ps(v_src0, v_src0);
v_src1 = _mm_mul_ps(v_src1, v_src1);
_mm_storeu_ps(dst + x, _mm_add_ps(_mm_loadu_ps(dst + x), v_src0));
_mm_storeu_ps(dst + x + 4, _mm_add_ps(_mm_loadu_ps(dst + x + 4), v_src1));
}
}
return x;
}
};
// AccSqr: dst[i] += src[i]*src[i]. SSE2 path, float -> double.
// Squares after widening to double, so the product is exact in double.
template <>
struct AccSqr_SIMD<float, double>
{
int operator() (const float * src, double * dst, const uchar * mask, int len, int cn) const
{
int x = 0;
if (!mask)
{
len *= cn;
for ( ; x <= len - 4; x += 4)
{
__m128 v_src = _mm_loadu_ps(src + x);
__m128d v_src0 = _mm_cvtps_pd(v_src);
// Shuffle brings the high float pair into the low lanes for cvtps_pd.
__m128d v_src1 = _mm_cvtps_pd(_mm_shuffle_ps(v_src, v_src, _MM_SHUFFLE(1, 0, 3, 2)));
v_src0 = _mm_mul_pd(v_src0, v_src0);
v_src1 = _mm_mul_pd(v_src1, v_src1);
__m128d v_dst0 = _mm_loadu_pd(dst + x);
__m128d v_dst1 = _mm_loadu_pd(dst + x + 2);
v_dst0 = _mm_add_pd(v_dst0, v_src0);
v_dst1 = _mm_add_pd(v_dst1, v_src1);
_mm_storeu_pd(dst + x, v_dst0);
_mm_storeu_pd(dst + x + 2, v_dst1);
}
}
return x;
}
};
// AccSqr: dst[i] += src[i]*src[i]. SSE2 path, double -> double.
template <>
struct AccSqr_SIMD<double, double>
{
int operator() (const double * src, double * dst, const uchar * mask, int len, int cn) const
{
int x = 0;
if (!mask)
{
len *= cn;
for ( ; x <= len - 4; x += 4)
{
__m128d v_src0 = _mm_loadu_pd(src + x);
__m128d v_src1 = _mm_loadu_pd(src + x + 2);
v_src0 = _mm_mul_pd(v_src0, v_src0);
v_src1 = _mm_mul_pd(v_src1, v_src1);
__m128d v_dst0 = _mm_loadu_pd(dst + x);
__m128d v_dst1 = _mm_loadu_pd(dst + x + 2);
v_dst0 = _mm_add_pd(v_dst0, v_src0);
v_dst1 = _mm_add_pd(v_dst1, v_src1);
_mm_storeu_pd(dst + x, v_dst0);
_mm_storeu_pd(dst + x + 2, v_dst1);
}
}
return x;
}
};
// AccProd: dst[i] += src1[i]*src2[i]. SSE2 path, float -> float.
template <>
struct AccProd_SIMD<float, float>
{
int operator() (const float * src1, const float * src2, float * dst, const uchar * mask, int len, int cn) const
{
int x = 0;
if (!mask)
{
len *= cn;
for ( ; x <= len - 8; x += 8)
{
_mm_storeu_ps(dst + x, _mm_add_ps(_mm_loadu_ps(dst + x), _mm_mul_ps(_mm_loadu_ps(src1 + x), _mm_loadu_ps(src2 + x))));
_mm_storeu_ps(dst + x + 4, _mm_add_ps(_mm_loadu_ps(dst + x + 4), _mm_mul_ps(_mm_loadu_ps(src1 + x + 4), _mm_loadu_ps(src2 + x + 4))));
}
}
return x;
}
};
// AccProd: dst[i] += src1[i]*src2[i]. SSE2 path, float -> double.
// Both operands widened to double before the multiply.
template <>
struct AccProd_SIMD<float, double>
{
int operator() (const float * src1, const float * src2, double * dst, const uchar * mask, int len, int cn) const
{
int x = 0;
if (!mask)
{
len *= cn;
for ( ; x <= len - 4; x += 4)
{
__m128 v_1src = _mm_loadu_ps(src1 + x);
__m128 v_2src = _mm_loadu_ps(src2 + x);
__m128d v_1src0 = _mm_cvtps_pd(v_1src);
// Shuffle brings the high float pair into the low lanes for cvtps_pd.
__m128d v_1src1 = _mm_cvtps_pd(_mm_shuffle_ps(v_1src, v_1src, _MM_SHUFFLE(0, 0, 3, 2)));
__m128d v_2src0 = _mm_cvtps_pd(v_2src);
__m128d v_2src1 = _mm_cvtps_pd(_mm_shuffle_ps(v_2src, v_2src, _MM_SHUFFLE(0, 0, 3, 2)));
__m128d v_dst0 = _mm_loadu_pd(dst + x);
__m128d v_dst1 = _mm_loadu_pd(dst + x + 2);
v_dst0 = _mm_add_pd(v_dst0, _mm_mul_pd(v_1src0, v_2src0));
v_dst1 = _mm_add_pd(v_dst1, _mm_mul_pd(v_1src1, v_2src1));
_mm_storeu_pd(dst + x, v_dst0);
_mm_storeu_pd(dst + x + 2, v_dst1);
}
}
return x;
}
};
// AccProd: dst[i] += src1[i]*src2[i]. SSE2 path, double -> double.
template <>
struct AccProd_SIMD<double, double>
{
int operator() (const double * src1, const double * src2, double * dst, const uchar * mask, int len, int cn) const
{
int x = 0;
if (!mask)
{
len *= cn;
for ( ; x <= len - 4; x += 4)
{
__m128d v_src00 = _mm_loadu_pd(src1 + x);
__m128d v_src01 = _mm_loadu_pd(src1 + x + 2);
__m128d v_src10 = _mm_loadu_pd(src2 + x);
__m128d v_src11 = _mm_loadu_pd(src2 + x + 2);
__m128d v_src0 = _mm_mul_pd(v_src00, v_src10);
__m128d v_src1 = _mm_mul_pd(v_src01, v_src11);
__m128d v_dst0 = _mm_loadu_pd(dst + x);
__m128d v_dst1 = _mm_loadu_pd(dst + x + 2);
v_dst0 = _mm_add_pd(v_dst0, v_src0);
v_dst1 = _mm_add_pd(v_dst1, v_src1);
_mm_storeu_pd(dst + x, v_dst0);
_mm_storeu_pd(dst + x + 2, v_dst1);
}
}
return x;
}
};
// AccW (running weighted average): dst = dst*(1-alpha) + src*alpha.
// SSE2 path, float -> float; handles 8 floats per iteration.
template <>
struct AccW_SIMD<float, float>
{
int operator() (const float * src, float * dst, const uchar * mask, int len, int cn, float alpha) const
{
int x = 0;
__m128 v_alpha = _mm_set1_ps(alpha);
__m128 v_beta = _mm_set1_ps(1.0f - alpha);
if (!mask)
{
len *= cn;
for ( ; x <= len - 8; x += 8)
{
_mm_storeu_ps(dst + x, _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(dst + x), v_beta), _mm_mul_ps(_mm_loadu_ps(src + x), v_alpha)));
_mm_storeu_ps(dst + x + 4, _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(dst + x + 4), v_beta), _mm_mul_ps(_mm_loadu_ps(src + x + 4), v_alpha)));
}
}
return x;
}
};
// AccW (running weighted average): dst = dst*(1-alpha) + src*alpha.
// SSE2 path, float source -> double accumulator; handles 8 elements per
// iteration. Returns the number of elements processed.
template <>
struct AccW_SIMD<float, double>
{
    int operator() (const float * src, double * dst, const uchar * mask, int len, int cn, double alpha) const
    {
        int x = 0;
        __m128d v_alpha = _mm_set1_pd(alpha);
        // Double literal: alpha is double, so compute the complementary
        // weight in double (was the misleading float literal "1.0f").
        __m128d v_beta = _mm_set1_pd(1.0 - alpha);
        if (!mask)
        {
            len *= cn;
            for ( ; x <= len - 8; x += 8)
            {
                __m128 v_src0 = _mm_loadu_ps(src + x);
                __m128 v_src1 = _mm_loadu_ps(src + x + 4);
                __m128d v_src00 = _mm_cvtps_pd(v_src0);
                // Shuffle moves the high float pair into the low lanes for cvtps_pd.
                __m128d v_src01 = _mm_cvtps_pd(_mm_shuffle_ps(v_src0, v_src0, _MM_SHUFFLE(0, 0, 3, 2)));
                __m128d v_src10 = _mm_cvtps_pd(v_src1);
                __m128d v_src11 = _mm_cvtps_pd(_mm_shuffle_ps(v_src1, v_src1, _MM_SHUFFLE(0, 0, 3, 2)));
                _mm_storeu_pd(dst + x, _mm_add_pd(_mm_mul_pd(_mm_loadu_pd(dst + x), v_beta), _mm_mul_pd(v_src00, v_alpha)));
                _mm_storeu_pd(dst + x + 2, _mm_add_pd(_mm_mul_pd(_mm_loadu_pd(dst + x + 2), v_beta), _mm_mul_pd(v_src01, v_alpha)));
                _mm_storeu_pd(dst + x + 4, _mm_add_pd(_mm_mul_pd(_mm_loadu_pd(dst + x + 4), v_beta), _mm_mul_pd(v_src10, v_alpha)));
                _mm_storeu_pd(dst + x + 6, _mm_add_pd(_mm_mul_pd(_mm_loadu_pd(dst + x + 6), v_beta), _mm_mul_pd(v_src11, v_alpha)));
            }
        }
        return x;
    }
};
// AccW (running weighted average): dst = dst*(1-alpha) + src*alpha.
// SSE2 path, double -> double; handles 4 doubles per iteration. Returns
// the number of elements processed.
template <>
struct AccW_SIMD<double, double>
{
    int operator() (const double * src, double * dst, const uchar * mask, int len, int cn, double alpha) const
    {
        int x = 0;
        __m128d v_alpha = _mm_set1_pd(alpha);
        // Double literal: alpha is double, so compute the complementary
        // weight in double (was the misleading float literal "1.0f").
        __m128d v_beta = _mm_set1_pd(1.0 - alpha);
        if (!mask)
        {
            len *= cn;
            for ( ; x <= len - 4; x += 4)
            {
                __m128d v_src0 = _mm_loadu_pd(src + x);
                __m128d v_src1 = _mm_loadu_pd(src + x + 2);
                _mm_storeu_pd(dst + x, _mm_add_pd(_mm_mul_pd(_mm_loadu_pd(dst + x), v_beta), _mm_mul_pd(v_src0, v_alpha)));
                _mm_storeu_pd(dst + x + 2, _mm_add_pd(_mm_mul_pd(_mm_loadu_pd(dst + x + 2), v_beta), _mm_mul_pd(v_src1, v_alpha)));
            }
        }
        return x;
    }
};
#endif
#if CV_SSE2
// Acc: dst[i] += src[i]. SSE2 path, uchar -> float; 16 pixels per iteration.
// Supports a mask only for single-channel data: masked-out pixels are zeroed
// before the add, which leaves their accumulator values unchanged.
template <>
struct Acc_SIMD<uchar, float>
{
int operator() (const uchar * src, float * dst, const uchar * mask, int len, int cn) const
{
int x = 0;
__m128i v_0 = _mm_setzero_si128();
if (!mask)
{
len *= cn;
for ( ; x <= len - 16; x += 16)
{
__m128i v_src = _mm_loadu_si128((const __m128i*)(src + x));
// Zero-extend bytes -> 16-bit -> 32-bit, then convert to float.
__m128i v_src0 = _mm_unpacklo_epi8(v_src, v_0);
__m128i v_src1 = _mm_unpackhi_epi8(v_src, v_0);
_mm_storeu_ps(dst + x, _mm_add_ps(_mm_loadu_ps(dst + x), _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src0, v_0))));
_mm_storeu_ps(dst + x + 4, _mm_add_ps(_mm_loadu_ps(dst + x + 4), _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src0, v_0))));
_mm_storeu_ps(dst + x + 8, _mm_add_ps(_mm_loadu_ps(dst + x + 8), _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src1, v_0))));
_mm_storeu_ps(dst + x + 12, _mm_add_ps(_mm_loadu_ps(dst + x + 12), _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src1, v_0))));
}
}
else if (cn == 1)
{
__m128i v_255 = _mm_set1_epi8(-1);
for ( ; x <= len - 16; x += 16)
{
// Build 0xFF where mask!=0, AND it with src so masked-out pixels add 0.
__m128i v_src = _mm_and_si128(_mm_loadu_si128((const __m128i*)(src + x)), _mm_xor_si128(v_255, _mm_cmpeq_epi8(_mm_loadu_si128((const __m128i*)(mask + x)), v_0)));
__m128i v_src0 = _mm_unpacklo_epi8(v_src, v_0);
__m128i v_src1 = _mm_unpackhi_epi8(v_src, v_0);
_mm_storeu_ps(dst + x, _mm_add_ps(_mm_loadu_ps(dst + x), _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src0, v_0))));
_mm_storeu_ps(dst + x + 4, _mm_add_ps(_mm_loadu_ps(dst + x + 4), _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src0, v_0))));
_mm_storeu_ps(dst + x + 8, _mm_add_ps(_mm_loadu_ps(dst + x + 8), _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src1, v_0))));
_mm_storeu_ps(dst + x + 12, _mm_add_ps(_mm_loadu_ps(dst + x + 12), _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src1, v_0))));
}
}
return x;
}
};
// Acc: dst[i] += src[i]. SSE2 path, ushort -> float; 8 pixels per iteration.
template <>
struct Acc_SIMD<ushort, float>
{
int operator() (const ushort * src, float * dst, const uchar * mask, int len, int cn) const
{
int x = 0;
__m128i v_0 = _mm_setzero_si128();
if (!mask)
{
len *= cn;
for ( ; x <= len - 8; x += 8)
{
__m128i v_src = _mm_loadu_si128((const __m128i*)(src + x));
// Zero-extend 16-bit -> 32-bit, then convert to float.
__m128i v_src0 = _mm_unpacklo_epi16(v_src, v_0);
__m128i v_src1 = _mm_unpackhi_epi16(v_src, v_0);
_mm_storeu_ps(dst + x, _mm_add_ps(_mm_loadu_ps(dst + x), _mm_cvtepi32_ps(v_src0)));
_mm_storeu_ps(dst + x + 4, _mm_add_ps(_mm_loadu_ps(dst + x + 4), _mm_cvtepi32_ps(v_src1)));
}
}
return x;
}
};
// Acc: dst[i] += src[i]. SSE2 path, uchar -> double; 16 pixels per iteration
// (16 bytes widened to 8 vectors of 2 doubles each).
template <>
struct Acc_SIMD<uchar, double>
{
int operator() (const uchar * src, double * dst, const uchar * mask, int len, int cn) const
{
int x = 0;
if (!mask)
{
__m128i v_0 = _mm_setzero_si128();
len *= cn;
for ( ; x <= len - 16; x += 16)
{
__m128i v_src = _mm_loadu_si128((const __m128i*)(src + x));
// Zero-extend bytes to 32-bit integers in four registers.
__m128i v_int0 = _mm_unpacklo_epi8(v_src, v_0);
__m128i v_int1 = _mm_unpackhi_epi8(v_src, v_0);
__m128i v_int00 = _mm_unpacklo_epi16(v_int0, v_0);
__m128i v_int01 = _mm_unpackhi_epi16(v_int0, v_0);
__m128i v_int10 = _mm_unpacklo_epi16(v_int1, v_0);
__m128i v_int11 = _mm_unpackhi_epi16(v_int1, v_0);
// cvtepi32_pd converts the two low lanes; the shuffle exposes the two high lanes.
__m128d v_src0 = _mm_cvtepi32_pd(v_int00);
__m128d v_src1 = _mm_cvtepi32_pd(_mm_shuffle_epi32(v_int00, _MM_SHUFFLE(0, 0, 3, 2)));
__m128d v_src2 = _mm_cvtepi32_pd(v_int01);
__m128d v_src3 = _mm_cvtepi32_pd(_mm_shuffle_epi32(v_int01, _MM_SHUFFLE(0, 0, 3, 2)));
__m128d v_src4 = _mm_cvtepi32_pd(v_int10);
__m128d v_src5 = _mm_cvtepi32_pd(_mm_shuffle_epi32(v_int10, _MM_SHUFFLE(0, 0, 3, 2)));
__m128d v_src6 = _mm_cvtepi32_pd(v_int11);
__m128d v_src7 = _mm_cvtepi32_pd(_mm_shuffle_epi32(v_int11, _MM_SHUFFLE(0, 0, 3, 2)));
__m128d v_dst0 = _mm_loadu_pd(dst + x);
__m128d v_dst1 = _mm_loadu_pd(dst + x + 2);
__m128d v_dst2 = _mm_loadu_pd(dst + x + 4);
__m128d v_dst3 = _mm_loadu_pd(dst + x + 6);
__m128d v_dst4 = _mm_loadu_pd(dst + x + 8);
__m128d v_dst5 = _mm_loadu_pd(dst + x + 10);
__m128d v_dst6 = _mm_loadu_pd(dst + x + 12);
__m128d v_dst7 = _mm_loadu_pd(dst + x + 14);
v_dst0 = _mm_add_pd(v_dst0, v_src0);
v_dst1 = _mm_add_pd(v_dst1, v_src1);
v_dst2 = _mm_add_pd(v_dst2, v_src2);
v_dst3 = _mm_add_pd(v_dst3, v_src3);
v_dst4 = _mm_add_pd(v_dst4, v_src4);
v_dst5 = _mm_add_pd(v_dst5, v_src5);
v_dst6 = _mm_add_pd(v_dst6, v_src6);
v_dst7 = _mm_add_pd(v_dst7, v_src7);
_mm_storeu_pd(dst + x, v_dst0);
_mm_storeu_pd(dst + x + 2, v_dst1);
_mm_storeu_pd(dst + x + 4, v_dst2);
_mm_storeu_pd(dst + x + 6, v_dst3);
_mm_storeu_pd(dst + x + 8, v_dst4);
_mm_storeu_pd(dst + x + 10, v_dst5);
_mm_storeu_pd(dst + x + 12, v_dst6);
_mm_storeu_pd(dst + x + 14, v_dst7);
}
}
return x;
}
};
// Acc: dst[i] += src[i]. SSE2 path, ushort -> double; 8 pixels per iteration.
template <>
struct Acc_SIMD<ushort, double>
{
int operator() (const ushort * src, double * dst, const uchar * mask, int len, int cn) const
{
int x = 0;
if (!mask)
{
__m128i v_0 = _mm_setzero_si128();
len *= cn;
for ( ; x <= len - 8; x += 8)
{
__m128i v_src = _mm_loadu_si128((const __m128i*)(src + x));
// Zero-extend 16-bit -> 32-bit, then widen to double two lanes at a time.
__m128i v_int0 = _mm_unpacklo_epi16(v_src, v_0);
__m128i v_int1 = _mm_unpackhi_epi16(v_src, v_0);
__m128d v_src0 = _mm_cvtepi32_pd(v_int0);
__m128d v_src1 = _mm_cvtepi32_pd(_mm_shuffle_epi32(v_int0, _MM_SHUFFLE(0, 0, 3, 2)));
__m128d v_src2 = _mm_cvtepi32_pd(v_int1);
__m128d v_src3 = _mm_cvtepi32_pd(_mm_shuffle_epi32(v_int1, _MM_SHUFFLE(0, 0, 3, 2)));
__m128d v_dst0 = _mm_loadu_pd(dst + x);
__m128d v_dst1 = _mm_loadu_pd(dst + x + 2);
__m128d v_dst2 = _mm_loadu_pd(dst + x + 4);
__m128d v_dst3 = _mm_loadu_pd(dst + x + 6);
v_dst0 = _mm_add_pd(v_dst0, v_src0);
v_dst1 = _mm_add_pd(v_dst1, v_src1);
v_dst2 = _mm_add_pd(v_dst2, v_src2);
v_dst3 = _mm_add_pd(v_dst3, v_src3);
_mm_storeu_pd(dst + x, v_dst0);
_mm_storeu_pd(dst + x + 2, v_dst1);
_mm_storeu_pd(dst + x + 4, v_dst2);
_mm_storeu_pd(dst + x + 6, v_dst3);
}
}
return x;
}
};
// AccSqr: dst[i] += src[i]*src[i]. SSE2 path, uchar -> float.
// A 16-bit multiply is safe here: max uchar square is 255*255 = 65025,
// which fits in an unsigned 16-bit lane.
template <>
struct AccSqr_SIMD<uchar, float>
{
int operator() (const uchar * src, float * dst, const uchar * mask, int len, int cn) const
{
int x = 0;
__m128i v_0 = _mm_setzero_si128();
if (!mask)
{
len *= cn;
for ( ; x <= len - 16; x += 16)
{
__m128i v_src = _mm_loadu_si128((const __m128i*)(src + x));
__m128i v_src0 = _mm_unpacklo_epi8(v_src, v_0);
__m128i v_src1 = _mm_unpackhi_epi8(v_src, v_0);
v_src0 = _mm_mullo_epi16(v_src0, v_src0);
v_src1 = _mm_mullo_epi16(v_src1, v_src1);
_mm_storeu_ps(dst + x, _mm_add_ps(_mm_loadu_ps(dst + x), _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src0, v_0))));
_mm_storeu_ps(dst + x + 4, _mm_add_ps(_mm_loadu_ps(dst + x + 4), _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src0, v_0))));
_mm_storeu_ps(dst + x + 8, _mm_add_ps(_mm_loadu_ps(dst + x + 8), _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src1, v_0))));
_mm_storeu_ps(dst + x + 12, _mm_add_ps(_mm_loadu_ps(dst + x + 12), _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src1, v_0))));
}
}
else if (cn == 1)
{
__m128i v_255 = _mm_set1_epi8(-1);
for ( ; x <= len - 16; x += 16)
{
// Zero out masked pixels so their squares contribute nothing.
__m128i v_src = _mm_and_si128(_mm_loadu_si128((const __m128i*)(src + x)), _mm_xor_si128(v_255, _mm_cmpeq_epi8(_mm_loadu_si128((const __m128i*)(mask + x)), v_0)));
__m128i v_src0 = _mm_unpacklo_epi8(v_src, v_0);
__m128i v_src1 = _mm_unpackhi_epi8(v_src, v_0);
v_src0 = _mm_mullo_epi16(v_src0, v_src0);
v_src1 = _mm_mullo_epi16(v_src1, v_src1);
_mm_storeu_ps(dst + x, _mm_add_ps(_mm_loadu_ps(dst + x), _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src0, v_0))));
_mm_storeu_ps(dst + x + 4, _mm_add_ps(_mm_loadu_ps(dst + x + 4), _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src0, v_0))));
_mm_storeu_ps(dst + x + 8, _mm_add_ps(_mm_loadu_ps(dst + x + 8), _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src1, v_0))));
_mm_storeu_ps(dst + x + 12, _mm_add_ps(_mm_loadu_ps(dst + x + 12), _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src1, v_0))));
}
}
return x;
}
};
// AccSqr: dst[i] += src[i]*src[i]. SSE2 path, ushort -> float.
// The square is computed AFTER conversion to float: the previous
// _mm_mullo_epi16-based version kept only the low 16 bits of the product,
// truncating any pixel value >= 256 (e.g. 300*300 = 90000 -> 24464).
// float holds every ushort exactly, and rounding the float product matches
// the scalar path's int-product-to-float rounding.
template <>
struct AccSqr_SIMD<ushort, float>
{
    int operator() (const ushort * src, float * dst, const uchar * mask, int len, int cn) const
    {
        int x = 0;
        __m128i v_0 = _mm_setzero_si128();
        if (!mask)
        {
            len *= cn;
            for ( ; x <= len - 8; x += 8)
            {
                __m128i v_src = _mm_loadu_si128((const __m128i*)(src + x));
                // Zero-extend 16-bit -> 32-bit, convert to float, then square.
                __m128 v_src0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src, v_0));
                __m128 v_src1 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src, v_0));
                v_src0 = _mm_mul_ps(v_src0, v_src0);
                v_src1 = _mm_mul_ps(v_src1, v_src1);
                _mm_storeu_ps(dst + x, _mm_add_ps(_mm_loadu_ps(dst + x), v_src0));
                _mm_storeu_ps(dst + x + 4, _mm_add_ps(_mm_loadu_ps(dst + x + 4), v_src1));
            }
        }
        return x;
    }
};
// AccSqr: dst[i] += src[i]*src[i]. SSE2 path, uchar -> double.
// Squares after widening to double, so the product is exact.
template <>
struct AccSqr_SIMD<uchar, double>
{
int operator() (const uchar * src, double * dst, const uchar * mask, int len, int cn) const
{
int x = 0;
if (!mask)
{
__m128i v_0 = _mm_setzero_si128();
len *= cn;
for ( ; x <= len - 8; x += 8)
{
// Only 8 bytes are consumed per iteration (8 doubles written).
__m128i v_src = _mm_loadl_epi64((const __m128i*)(src + x));
__m128i v_int = _mm_unpacklo_epi8(v_src, v_0);
__m128i v_int0 = _mm_unpacklo_epi16(v_int, v_0);
__m128i v_int1 = _mm_unpackhi_epi16(v_int, v_0);
__m128d v_src0 = _mm_cvtepi32_pd(v_int0);
__m128d v_src1 = _mm_cvtepi32_pd(_mm_shuffle_epi32(v_int0, _MM_SHUFFLE(0, 0, 3, 2)));
__m128d v_src2 = _mm_cvtepi32_pd(v_int1);
__m128d v_src3 = _mm_cvtepi32_pd(_mm_shuffle_epi32(v_int1, _MM_SHUFFLE(0, 0, 3, 2)));
v_src0 = _mm_mul_pd(v_src0, v_src0);
v_src1 = _mm_mul_pd(v_src1, v_src1);
v_src2 = _mm_mul_pd(v_src2, v_src2);
v_src3 = _mm_mul_pd(v_src3, v_src3);
__m128d v_dst0 = _mm_loadu_pd(dst + x);
__m128d v_dst1 = _mm_loadu_pd(dst + x + 2);
__m128d v_dst2 = _mm_loadu_pd(dst + x + 4);
__m128d v_dst3 = _mm_loadu_pd(dst + x + 6);
v_dst0 = _mm_add_pd(v_dst0, v_src0);
v_dst1 = _mm_add_pd(v_dst1, v_src1);
v_dst2 = _mm_add_pd(v_dst2, v_src2);
v_dst3 = _mm_add_pd(v_dst3, v_src3);
_mm_storeu_pd(dst + x, v_dst0);
_mm_storeu_pd(dst + x + 2, v_dst1);
_mm_storeu_pd(dst + x + 4, v_dst2);
_mm_storeu_pd(dst + x + 6, v_dst3);
}
}
return x;
}
};
// AccSqr: dst[i] += src[i]*src[i]. SSE2 path, ushort -> double.
// Squares after widening to double, so the product is exact (no 16-bit overflow).
template <>
struct AccSqr_SIMD<ushort, double>
{
int operator() (const ushort * src, double * dst, const uchar * mask, int len, int cn) const
{
int x = 0;
if (!mask)
{
__m128i v_0 = _mm_setzero_si128();
len *= cn;
for ( ; x <= len - 8; x += 8)
{
__m128i v_src = _mm_loadu_si128((const __m128i*)(src + x));
__m128i v_int0 = _mm_unpacklo_epi16(v_src, v_0);
__m128i v_int1 = _mm_unpackhi_epi16(v_src, v_0);
__m128d v_src0 = _mm_cvtepi32_pd(v_int0);
__m128d v_src1 = _mm_cvtepi32_pd(_mm_shuffle_epi32(v_int0, _MM_SHUFFLE(0, 0, 3, 2)));
__m128d v_src2 = _mm_cvtepi32_pd(v_int1);
__m128d v_src3 = _mm_cvtepi32_pd(_mm_shuffle_epi32(v_int1, _MM_SHUFFLE(0, 0, 3, 2)));
v_src0 = _mm_mul_pd(v_src0, v_src0);
v_src1 = _mm_mul_pd(v_src1, v_src1);
v_src2 = _mm_mul_pd(v_src2, v_src2);
v_src3 = _mm_mul_pd(v_src3, v_src3);
__m128d v_dst0 = _mm_loadu_pd(dst + x);
__m128d v_dst1 = _mm_loadu_pd(dst + x + 2);
__m128d v_dst2 = _mm_loadu_pd(dst + x + 4);
__m128d v_dst3 = _mm_loadu_pd(dst + x + 6);
v_dst0 = _mm_add_pd(v_dst0, v_src0);
v_dst1 = _mm_add_pd(v_dst1, v_src1);
v_dst2 = _mm_add_pd(v_dst2, v_src2);
v_dst3 = _mm_add_pd(v_dst3, v_src3);
_mm_storeu_pd(dst + x, v_dst0);
_mm_storeu_pd(dst + x + 2, v_dst1);
_mm_storeu_pd(dst + x + 4, v_dst2);
_mm_storeu_pd(dst + x + 6, v_dst3);
}
}
return x;
}
};
// AccProd: dst[i] += src1[i]*src2[i]. SSE2 path, uchar -> float.
// 16-bit multiply is safe: max product 255*255 = 65025 fits unsigned 16-bit.
// Mask supported for single-channel data only (masked pixels zeroed).
template <>
struct AccProd_SIMD<uchar, float>
{
int operator() (const uchar * src1, const uchar * src2, float * dst, const uchar * mask, int len, int cn) const
{
int x = 0;
__m128i v_0 = _mm_setzero_si128();
len *= cn;
if (!mask)
{
for ( ; x <= len - 16; x += 16)
{
__m128i v_1src = _mm_loadu_si128((const __m128i*)(src1 + x));
__m128i v_2src = _mm_loadu_si128((const __m128i*)(src2 + x));
__m128i v_1src0 = _mm_unpacklo_epi8(v_1src, v_0);
__m128i v_1src1 = _mm_unpackhi_epi8(v_1src, v_0);
__m128i v_2src0 = _mm_unpacklo_epi8(v_2src, v_0);
__m128i v_2src1 = _mm_unpackhi_epi8(v_2src, v_0);
__m128i v_src0 = _mm_mullo_epi16(v_1src0, v_2src0);
__m128i v_src1 = _mm_mullo_epi16(v_1src1, v_2src1);
__m128 v_src00 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src0, v_0));
__m128 v_src01 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src0, v_0));
__m128 v_src10 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src1, v_0));
__m128 v_src11 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src1, v_0));
_mm_storeu_ps(dst + x, _mm_add_ps(_mm_loadu_ps(dst + x), v_src00));
_mm_storeu_ps(dst + x + 4, _mm_add_ps(_mm_loadu_ps(dst + x + 4), v_src01));
_mm_storeu_ps(dst + x + 8, _mm_add_ps(_mm_loadu_ps(dst + x + 8), v_src10));
_mm_storeu_ps(dst + x + 12, _mm_add_ps(_mm_loadu_ps(dst + x + 12), v_src11));
}
}
else if (cn == 1)
{
__m128i v_255 = _mm_set1_epi8(-1);
for ( ; x <= len - 16; x += 16)
{
// Build 0xFF where mask!=0; zeroing either factor zeroes the product.
__m128i v_mask = _mm_loadu_si128((const __m128i*)(mask + x));
v_mask = _mm_xor_si128(v_255, _mm_cmpeq_epi8(v_mask, v_0));
__m128i v_1src = _mm_and_si128(_mm_loadu_si128((const __m128i*)(src1 + x)), v_mask);
__m128i v_2src = _mm_and_si128(_mm_loadu_si128((const __m128i*)(src2 + x)), v_mask);
__m128i v_1src0 = _mm_unpacklo_epi8(v_1src, v_0);
__m128i v_1src1 = _mm_unpackhi_epi8(v_1src, v_0);
__m128i v_2src0 = _mm_unpacklo_epi8(v_2src, v_0);
__m128i v_2src1 = _mm_unpackhi_epi8(v_2src, v_0);
__m128i v_src0 = _mm_mullo_epi16(v_1src0, v_2src0);
__m128i v_src1 = _mm_mullo_epi16(v_1src1, v_2src1);
__m128 v_src00 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src0, v_0));
__m128 v_src01 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src0, v_0));
__m128 v_src10 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src1, v_0));
__m128 v_src11 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src1, v_0));
_mm_storeu_ps(dst + x, _mm_add_ps(_mm_loadu_ps(dst + x), v_src00));
_mm_storeu_ps(dst + x + 4, _mm_add_ps(_mm_loadu_ps(dst + x + 4), v_src01));
_mm_storeu_ps(dst + x + 8, _mm_add_ps(_mm_loadu_ps(dst + x + 8), v_src10));
_mm_storeu_ps(dst + x + 12, _mm_add_ps(_mm_loadu_ps(dst + x + 12), v_src11));
}
}
return x;
}
};
// AccProd: dst[i] += src1[i]*src2[i]. SSE2 path, ushort -> float.
// Operands are converted to float BEFORE multiplying, so no 16-bit overflow.
// Mask supported for single-channel data only (masked pixels zeroed).
template <>
struct AccProd_SIMD<ushort, float>
{
int operator() (const ushort * src1, const ushort * src2, float * dst, const uchar * mask, int len, int cn) const
{
int x = 0;
__m128i v_0 = _mm_setzero_si128();
if (!mask)
{
len *= cn;
for ( ; x <= len - 8; x += 8)
{
__m128i v_1src = _mm_loadu_si128((const __m128i*)(src1 + x));
__m128i v_2src = _mm_loadu_si128((const __m128i*)(src2 + x));
__m128 v_1src0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_1src, v_0));
__m128 v_1src1 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_1src, v_0));
__m128 v_2src0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_2src, v_0));
__m128 v_2src1 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_2src, v_0));
__m128 v_src0 = _mm_mul_ps(v_1src0, v_2src0);
__m128 v_src1 = _mm_mul_ps(v_1src1, v_2src1);
_mm_storeu_ps(dst + x, _mm_add_ps(_mm_loadu_ps(dst + x), v_src0));
_mm_storeu_ps(dst + x + 4, _mm_add_ps(_mm_loadu_ps(dst + x + 4), v_src1));
}
}
else if (cn == 1)
{
// cn == 1, so len*cn == len and no rescaling is needed here.
__m128i v_65535 = _mm_set1_epi16(-1);
for ( ; x <= len - 8; x += 8)
{
// Widen the 8 mask bytes to 16-bit and build 0xFFFF where mask!=0.
__m128i v_mask = _mm_loadl_epi64((const __m128i*)(mask + x));
__m128i v_mask0 = _mm_unpacklo_epi8(v_mask, v_0);
v_mask0 = _mm_xor_si128(v_65535, _mm_cmpeq_epi16(v_mask0, v_0));
__m128i v_1src = _mm_and_si128(_mm_loadu_si128((const __m128i*)(src1 + x)), v_mask0);
__m128i v_2src = _mm_and_si128(_mm_loadu_si128((const __m128i*)(src2 + x)), v_mask0);
__m128 v_1src0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_1src, v_0));
__m128 v_1src1 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_1src, v_0));
__m128 v_2src0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_2src, v_0));
__m128 v_2src1 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_2src, v_0));
__m128 v_src0 = _mm_mul_ps(v_1src0, v_2src0);
__m128 v_src1 = _mm_mul_ps(v_1src1, v_2src1);
_mm_storeu_ps(dst + x, _mm_add_ps(_mm_loadu_ps(dst + x), v_src0));
_mm_storeu_ps(dst + x + 4, _mm_add_ps(_mm_loadu_ps(dst + x + 4), v_src1));
}
}
return x;
}
};
// AccProd: dst[i] += src1[i]*src2[i]. SSE2 path, uchar -> double.
// Multiplies after widening to double, so the product is exact.
template <>
struct AccProd_SIMD<uchar, double>
{
int operator() (const uchar * src1, const uchar * src2, double * dst, const uchar * mask, int len, int cn) const
{
int x = 0;
if (!mask)
{
__m128i v_0 = _mm_setzero_si128();
len *= cn;
for ( ; x <= len - 8; x += 8)
{
// 8 bytes from each source per iteration (8 doubles written).
__m128i v_1src = _mm_loadl_epi64((const __m128i*)(src1 + x));
__m128i v_2src = _mm_loadl_epi64((const __m128i*)(src2 + x));
__m128i v_1int = _mm_unpacklo_epi8(v_1src, v_0);
__m128i v_2int = _mm_unpacklo_epi8(v_2src, v_0);
__m128i v_1int0 = _mm_unpacklo_epi16(v_1int, v_0);
__m128i v_1int1 = _mm_unpackhi_epi16(v_1int, v_0);
__m128i v_2int0 = _mm_unpacklo_epi16(v_2int, v_0);
__m128i v_2int1 = _mm_unpackhi_epi16(v_2int, v_0);
__m128d v_src0 = _mm_mul_pd(_mm_cvtepi32_pd(v_1int0),_mm_cvtepi32_pd(v_2int0));
__m128d v_src1 = _mm_mul_pd(_mm_cvtepi32_pd(_mm_shuffle_epi32(v_1int0, _MM_SHUFFLE(0, 0, 3, 2))),_mm_cvtepi32_pd(_mm_shuffle_epi32(v_2int0, _MM_SHUFFLE(0, 0, 3, 2))));
__m128d v_src2 = _mm_mul_pd(_mm_cvtepi32_pd(v_1int1),_mm_cvtepi32_pd(v_2int1));
__m128d v_src3 = _mm_mul_pd(_mm_cvtepi32_pd(_mm_shuffle_epi32(v_1int1, _MM_SHUFFLE(0, 0, 3, 2))),_mm_cvtepi32_pd(_mm_shuffle_epi32(v_2int1, _MM_SHUFFLE(0, 0, 3, 2))));
__m128d v_dst0 = _mm_loadu_pd(dst + x);
__m128d v_dst1 = _mm_loadu_pd(dst + x + 2);
__m128d v_dst2 = _mm_loadu_pd(dst + x + 4);
__m128d v_dst3 = _mm_loadu_pd(dst + x + 6);
v_dst0 = _mm_add_pd(v_dst0, v_src0);
v_dst1 = _mm_add_pd(v_dst1, v_src1);
v_dst2 = _mm_add_pd(v_dst2, v_src2);
v_dst3 = _mm_add_pd(v_dst3, v_src3);
_mm_storeu_pd(dst + x, v_dst0);
_mm_storeu_pd(dst + x + 2, v_dst1);
_mm_storeu_pd(dst + x + 4, v_dst2);
_mm_storeu_pd(dst + x + 6, v_dst3);
}
}
return x;
}
};
template <>
struct AccProd_SIMD<ushort, double>
{
    // SSE2 kernel for accumulateProduct(): dst[i] += src1[i] * src2[i]
    // with 16-bit unsigned sources and a double-precision accumulator.
    // Processes 8 elements per iteration and returns the count consumed;
    // only the unmasked case is vectorized.
    int operator() (const ushort * src1, const ushort * src2, double * dst, const uchar * mask, int len, int cn) const
    {
        int x = 0;
        if (mask)
            return x;

        const __m128i z = _mm_setzero_si128();
        len *= cn;  // flat row: channels are interleaved
        for ( ; x <= len - 8; x += 8)
        {
            __m128i a = _mm_loadu_si128((const __m128i*)(src1 + x));
            __m128i b = _mm_loadu_si128((const __m128i*)(src2 + x));
            // widen u16 -> u32
            __m128i a_lo = _mm_unpacklo_epi16(a, z), a_hi = _mm_unpackhi_epi16(a, z);
            __m128i b_lo = _mm_unpacklo_epi16(b, z), b_hi = _mm_unpackhi_epi16(b, z);
            // _mm_cvtepi32_pd reads the two low int32 lanes only; the shuffle
            // drops lanes 2,3 into the low positions for the second pair
            __m128d p0 = _mm_mul_pd(_mm_cvtepi32_pd(a_lo), _mm_cvtepi32_pd(b_lo));
            __m128d p1 = _mm_mul_pd(_mm_cvtepi32_pd(_mm_shuffle_epi32(a_lo, _MM_SHUFFLE(0, 0, 3, 2))),
                                    _mm_cvtepi32_pd(_mm_shuffle_epi32(b_lo, _MM_SHUFFLE(0, 0, 3, 2))));
            __m128d p2 = _mm_mul_pd(_mm_cvtepi32_pd(a_hi), _mm_cvtepi32_pd(b_hi));
            __m128d p3 = _mm_mul_pd(_mm_cvtepi32_pd(_mm_shuffle_epi32(a_hi, _MM_SHUFFLE(0, 0, 3, 2))),
                                    _mm_cvtepi32_pd(_mm_shuffle_epi32(b_hi, _MM_SHUFFLE(0, 0, 3, 2))));
            // accumulate in place, two doubles per store
            _mm_storeu_pd(dst + x,     _mm_add_pd(_mm_loadu_pd(dst + x),     p0));
            _mm_storeu_pd(dst + x + 2, _mm_add_pd(_mm_loadu_pd(dst + x + 2), p1));
            _mm_storeu_pd(dst + x + 4, _mm_add_pd(_mm_loadu_pd(dst + x + 4), p2));
            _mm_storeu_pd(dst + x + 6, _mm_add_pd(_mm_loadu_pd(dst + x + 6), p3));
        }
        return x;
    }
};
template <>
struct AccW_SIMD<uchar, float>
{
    // SSE2 kernel for accumulateWeighted(): dst = (1 - alpha)*dst + alpha*src
    // with an 8-bit source and a float accumulator, 16 pixels per iteration.
    // Returns the number of elements processed; masked input falls through to
    // the scalar path with x == 0.
    int operator() (const uchar * src, float * dst, const uchar * mask, int len, int cn, float alpha) const
    {
        int x = 0;
        if (mask)
            return x;

        const __m128 w_new = _mm_set1_ps(alpha);
        const __m128 w_old = _mm_set1_ps(1.0f - alpha);
        const __m128i z = _mm_setzero_si128();
        len *= cn;  // flat row: channels are interleaved
        for ( ; x <= len - 16; x += 16)
        {
            __m128i pix = _mm_loadu_si128((const __m128i*)(src + x));
            // widen u8 -> u16 -> u32 and convert to float, 4 lanes at a time
            __m128i lo16 = _mm_unpacklo_epi8(pix, z);
            __m128i hi16 = _mm_unpackhi_epi8(pix, z);
            __m128 s0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(lo16, z));
            __m128 s1 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(lo16, z));
            __m128 s2 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(hi16, z));
            __m128 s3 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(hi16, z));
            // running-average blend: dst*w_old + src*w_new
            __m128 d0 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(dst + x),      w_old), _mm_mul_ps(s0, w_new));
            __m128 d1 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(dst + x + 4),  w_old), _mm_mul_ps(s1, w_new));
            __m128 d2 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(dst + x + 8),  w_old), _mm_mul_ps(s2, w_new));
            __m128 d3 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(dst + x + 12), w_old), _mm_mul_ps(s3, w_new));
            _mm_storeu_ps(dst + x,      d0);
            _mm_storeu_ps(dst + x + 4,  d1);
            _mm_storeu_ps(dst + x + 8,  d2);
            _mm_storeu_ps(dst + x + 12, d3);
        }
        return x;
    }
};
template <>
struct AccW_SIMD<ushort, float>
{
    // SSE2 kernel for accumulateWeighted(): dst = (1 - alpha)*dst + alpha*src
    // with a 16-bit source and a float accumulator, 8 elements per iteration.
    // Only the unmasked case is vectorized; returns the count processed.
    int operator() (const ushort * src, float * dst, const uchar * mask, int len, int cn, float alpha) const
    {
        int x = 0;
        if (mask)
            return x;

        const __m128 w_new = _mm_set1_ps(alpha);
        const __m128 w_old = _mm_set1_ps(1.0f - alpha);
        const __m128i z = _mm_setzero_si128();
        len *= cn;  // flat row: channels are interleaved
        for ( ; x <= len - 8; x += 8)
        {
            __m128i pix = _mm_loadu_si128((const __m128i*)(src + x));
            // widen u16 -> u32, convert to float, and pre-scale by alpha
            __m128 s_lo = _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(pix, z)), w_new);
            __m128 s_hi = _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(pix, z)), w_new);
            // running-average blend: dst*w_old + src*w_new
            __m128 d_lo = _mm_mul_ps(_mm_loadu_ps(dst + x), w_old);
            __m128 d_hi = _mm_mul_ps(_mm_loadu_ps(dst + x + 4), w_old);
            _mm_storeu_ps(dst + x,     _mm_add_ps(d_lo, s_lo));
            _mm_storeu_ps(dst + x + 4, _mm_add_ps(d_hi, s_hi));
        }
        return x;
    }
};
template <>
struct AccW_SIMD<uchar, double>
{
    // SSE2 kernel for accumulateWeighted(): dst = (1 - alpha)*dst + alpha*src
    // with an 8-bit source and a double accumulator, 8 pixels per iteration.
    // Only the unmasked case is vectorized; returns the count processed.
    int operator() (const uchar * src, double * dst, const uchar * mask, int len, int cn, double alpha) const
    {
        int x = 0;
        if (mask)
            return x;

        const __m128d w_new = _mm_set1_pd(alpha);
        const __m128d w_old = _mm_set1_pd(1.0 - alpha);
        const __m128i z = _mm_setzero_si128();
        len *= cn;  // flat row: channels are interleaved
        for ( ; x <= len - 8; x += 8)
        {
            // widen u8 -> u32; _mm_cvtepi32_pd converts the two low int32
            // lanes only, so _MM_SHUFFLE(0,0,3,2) brings lanes 2,3 down for
            // the second conversion of each half
            __m128i w16 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(src + x)), z);
            __m128i w32_lo = _mm_unpacklo_epi16(w16, z);
            __m128i w32_hi = _mm_unpackhi_epi16(w16, z);
            __m128d s0 = _mm_cvtepi32_pd(w32_lo);
            __m128d s1 = _mm_cvtepi32_pd(_mm_shuffle_epi32(w32_lo, _MM_SHUFFLE(0, 0, 3, 2)));
            __m128d s2 = _mm_cvtepi32_pd(w32_hi);
            __m128d s3 = _mm_cvtepi32_pd(_mm_shuffle_epi32(w32_hi, _MM_SHUFFLE(0, 0, 3, 2)));
            // running-average blend: dst*w_old + src*w_new
            __m128d d0 = _mm_add_pd(_mm_mul_pd(_mm_loadu_pd(dst + x),     w_old), _mm_mul_pd(s0, w_new));
            __m128d d1 = _mm_add_pd(_mm_mul_pd(_mm_loadu_pd(dst + x + 2), w_old), _mm_mul_pd(s1, w_new));
            __m128d d2 = _mm_add_pd(_mm_mul_pd(_mm_loadu_pd(dst + x + 4), w_old), _mm_mul_pd(s2, w_new));
            __m128d d3 = _mm_add_pd(_mm_mul_pd(_mm_loadu_pd(dst + x + 6), w_old), _mm_mul_pd(s3, w_new));
            _mm_storeu_pd(dst + x,     d0);
            _mm_storeu_pd(dst + x + 2, d1);
            _mm_storeu_pd(dst + x + 4, d2);
            _mm_storeu_pd(dst + x + 6, d3);
        }
        return x;
    }
};
template <>
struct AccW_SIMD<ushort, double>
{
    // SSE2 kernel for accumulateWeighted(): dst = (1 - alpha)*dst + alpha*src
    // with a 16-bit source and a double accumulator, 8 elements per iteration.
    // Only the unmasked case is vectorized; returns the count processed.
    int operator() (const ushort * src, double * dst, const uchar * mask, int len, int cn, double alpha) const
    {
        int x = 0;
        if (mask)
            return x;

        const __m128d w_new = _mm_set1_pd(alpha);
        const __m128d w_old = _mm_set1_pd(1.0 - alpha);
        const __m128i z = _mm_setzero_si128();
        len *= cn;  // flat row: channels are interleaved
        for ( ; x <= len - 8; x += 8)
        {
            __m128i pix = _mm_loadu_si128((const __m128i*)(src + x));
            // widen u16 -> u32; _mm_cvtepi32_pd converts the two low int32
            // lanes only, so the shuffle brings lanes 2,3 down for the second
            // conversion of each half
            __m128i w32_lo = _mm_unpacklo_epi16(pix, z);
            __m128i w32_hi = _mm_unpackhi_epi16(pix, z);
            __m128d s0 = _mm_cvtepi32_pd(w32_lo);
            __m128d s1 = _mm_cvtepi32_pd(_mm_shuffle_epi32(w32_lo, _MM_SHUFFLE(0, 0, 3, 2)));
            __m128d s2 = _mm_cvtepi32_pd(w32_hi);
            __m128d s3 = _mm_cvtepi32_pd(_mm_shuffle_epi32(w32_hi, _MM_SHUFFLE(0, 0, 3, 2)));
            // running-average blend: dst*w_old + src*w_new
            __m128d d0 = _mm_add_pd(_mm_mul_pd(_mm_loadu_pd(dst + x),     w_old), _mm_mul_pd(s0, w_new));
            __m128d d1 = _mm_add_pd(_mm_mul_pd(_mm_loadu_pd(dst + x + 2), w_old), _mm_mul_pd(s1, w_new));
            __m128d d2 = _mm_add_pd(_mm_mul_pd(_mm_loadu_pd(dst + x + 4), w_old), _mm_mul_pd(s2, w_new));
            __m128d d3 = _mm_add_pd(_mm_mul_pd(_mm_loadu_pd(dst + x + 6), w_old), _mm_mul_pd(s3, w_new));
            _mm_storeu_pd(dst + x,     d0);
            _mm_storeu_pd(dst + x + 2, d1);
            _mm_storeu_pd(dst + x + 4, d2);
            _mm_storeu_pd(dst + x + 6, d3);
        }
        return x;
    }
};
#endif //CV_SSE2
#if CV_NEON
template <>
......
......@@ -72,11 +72,11 @@ void CV_AccumBaseTest::get_test_array_types_and_sizes( int test_case_idx,
vector<vector<Size> >& sizes, vector<vector<int> >& types )
{
RNG& rng = ts->get_rng();
int depth = cvtest::randInt(rng) % 3, cn = cvtest::randInt(rng) & 1 ? 3 : 1;
int accdepth = std::max((int)(cvtest::randInt(rng) % 2 + 1), depth);
int depth = cvtest::randInt(rng) % 4, cn = cvtest::randInt(rng) & 1 ? 3 : 1;
int accdepth = (int)(cvtest::randInt(rng) % 2 + 1);
int i, input_count = (int)test_array[INPUT].size();
cvtest::ArrayTest::get_test_array_types_and_sizes( test_case_idx, sizes, types );
depth = depth == 0 ? CV_8U : depth == 1 ? CV_32F : CV_64F;
depth = depth == 0 ? CV_8U : depth == 1 ? CV_16U : depth == 2 ? CV_32F : CV_64F;
accdepth = accdepth == 1 ? CV_32F : CV_64F;
accdepth = MAX(accdepth, depth);
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment