Commit 6b849906 authored by Vitaly Tuzov

integral() implementation updated to utilize wide universal intrinsics

parent 43e66e7f
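For context (not part of the patch): both SIMD specializations in the diff below vectorize the same scalar recurrence, in which each output row of the integral image is the running sum of the current source row added to the already finished previous output row. A minimal sketch of that recurrence, using a hypothetical integral_scalar helper and a simplified dense buffer layout without the strides used in the real code:

#include <vector>

// Scalar reference: sum is (height+1) x (width+1) with the first row and
// column left at zero, matching the layout integral() produces.
static void integral_scalar(const unsigned char* src, int width, int height,
                            std::vector<int>& sum)
{
    sum.assign((size_t)(width + 1) * (height + 1), 0);
    for (int i = 0; i < height; ++i)
    {
        const unsigned char* src_row = src + (size_t)i * width;
        const int* prev_sum_row = &sum[(size_t)i * (width + 1)] + 1;
        int* sum_row = &sum[(size_t)(i + 1) * (width + 1)] + 1;
        sum_row[-1] = 0;
        int v = 0; // running sum of the current source row
        for (int j = 0; j < width; ++j)
            sum_row[j] = (v += src_row[j]) + prev_sum_row[j];
    }
}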
@@ -43,6 +43,8 @@
 #include "precomp.hpp"
 #include "opencl_kernels_imgproc.hpp"
+#include "opencv2/core/hal/intrin.hpp"
 
 namespace cv
 {
@@ -60,15 +62,12 @@ struct Integral_SIMD
     }
 };
 
-#if CV_SSE2
+#if CV_SIMD && CV_SIMD_WIDTH <= 64
 
 template <>
 struct Integral_SIMD<uchar, int, double>
 {
-    Integral_SIMD()
-    {
-        haveSSE2 = checkHardwareSupport(CV_CPU_SSE2);
-    }
+    Integral_SIMD() {}
 
     bool operator()(const uchar * src, size_t _srcstep,
                     int * sum, size_t _sumstep,
@@ -76,15 +75,12 @@ struct Integral_SIMD<uchar, int, double>
                     int * tilted, size_t,
                     int width, int height, int cn) const
     {
-        if (sqsum || tilted || cn != 1 || !haveSSE2)
+        if (sqsum || tilted || cn != 1)
             return false;
 
         // the first iteration
        memset(sum, 0, (width + 1) * sizeof(int));
 
-        __m128i v_zero = _mm_setzero_si128(), prev = v_zero;
-        int j = 0;
-
        // the others
        for (int i = 0; i < height; ++i)
        {
@@ -94,48 +90,113 @@ struct Integral_SIMD<uchar, int, double>
             sum_row[-1] = 0;
 
-            prev = v_zero;
-            j = 0;
-
-            for ( ; j + 7 < width; j += 8)
-            {
-                __m128i vsuml = _mm_loadu_si128((const __m128i *)(prev_sum_row + j));
-                __m128i vsumh = _mm_loadu_si128((const __m128i *)(prev_sum_row + j + 4));
-
-                __m128i el8shr0 = _mm_loadl_epi64((const __m128i *)(src_row + j));
-                __m128i el8shr1 = _mm_slli_si128(el8shr0, 1);
-                __m128i el8shr2 = _mm_slli_si128(el8shr0, 2);
-                __m128i el8shr3 = _mm_slli_si128(el8shr0, 3);
-
-                vsuml = _mm_add_epi32(vsuml, prev);
-                vsumh = _mm_add_epi32(vsumh, prev);
-
-                __m128i el8shr12 = _mm_add_epi16(_mm_unpacklo_epi8(el8shr1, v_zero),
-                                                 _mm_unpacklo_epi8(el8shr2, v_zero));
-                __m128i el8shr03 = _mm_add_epi16(_mm_unpacklo_epi8(el8shr0, v_zero),
-                                                 _mm_unpacklo_epi8(el8shr3, v_zero));
-                __m128i el8 = _mm_add_epi16(el8shr12, el8shr03);
-
-                __m128i el4h = _mm_add_epi16(_mm_unpackhi_epi16(el8, v_zero),
-                                             _mm_unpacklo_epi16(el8, v_zero));
-
-                vsuml = _mm_add_epi32(vsuml, _mm_unpacklo_epi16(el8, v_zero));
-                vsumh = _mm_add_epi32(vsumh, el4h);
-
-                _mm_storeu_si128((__m128i *)(sum_row + j), vsuml);
-                _mm_storeu_si128((__m128i *)(sum_row + j + 4), vsumh);
-
-                prev = _mm_add_epi32(prev, _mm_shuffle_epi32(el4h, _MM_SHUFFLE(3, 3, 3, 3)));
-            }
+            v_int32 prev = vx_setzero_s32();
+            int j = 0;
+            for ( ; j + v_uint16::nlanes <= width; j += v_uint16::nlanes)
+            {
+                v_int16 el8 = v_reinterpret_as_s16(vx_load_expand(src_row + j));
+                v_int32 el4l, el4h;
+#if CV_AVX2
+                __m256i vsum = _mm256_add_epi16(el8.val, _mm256_slli_si256(el8.val, 2));
+                vsum = _mm256_add_epi16(vsum, _mm256_slli_si256(vsum, 4));
+                vsum = _mm256_add_epi16(vsum, _mm256_slli_si256(vsum, 8));
+                __m256i shmask = _mm256_set1_epi32(7);
+                el4l.val = _mm256_add_epi32(_mm256_cvtepi16_epi32(_v256_extract_low(vsum)), prev.val);
+                el4h.val = _mm256_add_epi32(_mm256_cvtepi16_epi32(_v256_extract_high(vsum)), _mm256_permutevar8x32_epi32(el4l.val, shmask));
+                prev.val = _mm256_permutevar8x32_epi32(el4h.val, shmask);
+#else
+                el8 += v_rotate_left<1>(el8);
+                el8 += v_rotate_left<2>(el8);
+#if CV_SIMD_WIDTH == 32
+                el8 += v_rotate_left<4>(el8);
+#if CV_SIMD_WIDTH == 64
+                el8 += v_rotate_left<8>(el8);
+#endif
+#endif
+                v_expand(el8, el4l, el4h);
+                el4l += prev;
+                el4h += el4l;
+                prev = vx_setall_s32(v_rotate_right<v_int32::nlanes - 1>(el4h).get0());
+#endif
+                v_store(sum_row + j                  , el4l + vx_load(prev_sum_row + j                  ));
+                v_store(sum_row + j + v_int32::nlanes, el4h + vx_load(prev_sum_row + j + v_int32::nlanes));
+            }
 
             for (int v = sum_row[j - 1] - prev_sum_row[j - 1]; j < width; ++j)
                 sum_row[j] = (v += src_row[j]) + prev_sum_row[j];
         }
+        vx_cleanup();
 
         return true;
     }
-
-    bool haveSSE2;
 };
 
+template <>
+struct Integral_SIMD<uchar, float, double>
+{
+    Integral_SIMD() {}
+
+    bool operator()(const uchar * src, size_t _srcstep,
+                    float * sum, size_t _sumstep,
+                    double * sqsum, size_t,
+                    float * tilted, size_t,
+                    int width, int height, int cn) const
+    {
+        if (sqsum || tilted || cn != 1)
+            return false;
+
+        // the first iteration
+        memset(sum, 0, (width + 1) * sizeof(int));
+
+        // the others
+        for (int i = 0; i < height; ++i)
+        {
+            const uchar * src_row = src + _srcstep * i;
+            float * prev_sum_row = (float *)((uchar *)sum + _sumstep * i) + 1;
+            float * sum_row = (float *)((uchar *)sum + _sumstep * (i + 1)) + 1;
+
+            sum_row[-1] = 0;
+
+            v_float32 prev = vx_setzero_f32();
+            int j = 0;
+            for (; j + v_uint16::nlanes <= width; j += v_uint16::nlanes)
+            {
+                v_int16 el8 = v_reinterpret_as_s16(vx_load_expand(src_row + j));
+                v_float32 el4l, el4h;
+#if CV_AVX2
+                __m256i vsum = _mm256_add_epi16(el8.val, _mm256_slli_si256(el8.val, 2));
+                vsum = _mm256_add_epi16(vsum, _mm256_slli_si256(vsum, 4));
+                vsum = _mm256_add_epi16(vsum, _mm256_slli_si256(vsum, 8));
+                __m256i shmask = _mm256_set1_epi32(7);
+                el4l.val = _mm256_add_ps(_mm256_cvtepi32_ps(_mm256_cvtepi16_epi32(_v256_extract_low(vsum))), prev.val);
+                el4h.val = _mm256_add_ps(_mm256_cvtepi32_ps(_mm256_cvtepi16_epi32(_v256_extract_high(vsum))), _mm256_permutevar8x32_ps(el4l.val, shmask));
+                prev.val = _mm256_permutevar8x32_ps(el4h.val, shmask);
+#else
+                el8 += v_rotate_left<1>(el8);
+                el8 += v_rotate_left<2>(el8);
+#if CV_SIMD_WIDTH == 32
+                el8 += v_rotate_left<4>(el8);
+#if CV_SIMD_WIDTH == 64
+                el8 += v_rotate_left<8>(el8);
+#endif
+#endif
+                v_int32 el4li, el4hi;
+                v_expand(el8, el4li, el4hi);
+                el4l = v_cvt_f32(el4li) + prev;
+                el4h = v_cvt_f32(el4hi) + el4l;
+                prev = vx_setall_f32(v_rotate_right<v_float32::nlanes - 1>(el4h).get0());
+#endif
+                v_store(sum_row + j                    , el4l + vx_load(prev_sum_row + j                    ));
+                v_store(sum_row + j + v_float32::nlanes, el4h + vx_load(prev_sum_row + j + v_float32::nlanes));
+            }
+
+            for (float v = sum_row[j - 1] - prev_sum_row[j - 1]; j < width; ++j)
+                sum_row[j] = (v += src_row[j]) + prev_sum_row[j];
+        }
+        vx_cleanup();
+
+        return true;
+    }
+};
+
 #endif
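The non-AVX2 branch in both specializations builds a per-register inclusive prefix sum out of v_rotate_left shifts, widens the 16-bit partial sums with v_expand, and carries the last lane forward through prev. A hypothetical stand-alone sketch of that rotate-and-add scan with the universal intrinsics, assuming a build where CV_SIMD is enabled with 128-bit registers and an OpenCV version contemporary with this commit (where the vector operators used in the patch are available):

#include <iostream>
#include "opencv2/core/hal/intrin.hpp"

int main()
{
#if CV_SIMD && CV_SIMD_WIDTH == 16              // keep the demo to 128-bit registers
    short data[cv::v_int16::nlanes] = { 1, 2, 3, 4, 5, 6, 7, 8 };
    cv::v_int16 el8 = cv::vx_load(data);

    // Hillis-Steele scan: v_rotate_left<k> shifts lanes towards higher indices
    // (zero-filling the low lanes), so after log2(nlanes) rotate+add rounds
    // lane i holds the sum of lanes 0..i.
    el8 += cv::v_rotate_left<1>(el8);
    el8 += cv::v_rotate_left<2>(el8);
    el8 += cv::v_rotate_left<4>(el8);

    short out[cv::v_int16::nlanes];
    cv::v_store(out, el8);
    for (int i = 0; i < cv::v_int16::nlanes; ++i)
        std::cout << out[i] << ' ';             // expected: 1 3 6 10 15 21 28 36
    std::cout << std::endl;
#endif
    return 0;
}

Note that the patch itself only scans across half of the 16-bit lanes (two rotates for 128-bit registers): once v_expand has split el8 into el4l and el4h, the element-wise addition el4h += el4l carries the low-half prefix sums into the high half, which completes the scan with fewer rotate steps.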