Unverified commit 43820d89 authored by Vadim Pisarevsky, committed by GitHub

further improvements in split & merge; started using non-temporal store instructions (#12063)

* 1. changed static const __m128/256 to const __m128/256 to avoid the weird instructions and calls the compiler inserts for the guarded one-time initialization of function-local statics.
2. added universal intrinsics that wrap MOVNTPS and other such non-temporal ("no cache") store instructions. v_store_interleave() and v_store() got overloads that take the corresponding StoreMode flag.
3. rewrote split & merge to use the "no cache" store instructions. This resulted in a dramatic performance improvement when processing big arrays.

* hopefully, fixed some test failures where 4-channel v_store_interleave() is used

* added missing implementation of the new universal intrinsics (v_store_aligned_nocache() etc.)

* fixed silly typo in the new intrinsics in intrin_vsx.hpp

* still trying to fix VSX compiler errors

* still trying to fix VSX compiler errors

* still trying to fix VSX compiler errors

* still trying to fix VSX compiler errors
parent 5336b9ad
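For orientation before the diff: a minimal sketch of how the new StoreMode-aware overloads are meant to be called. The function name, buffer layout and the assumption that dst is suitably aligned for the non-temporal path are hypothetical; only v_store_interleave(), v_load(), v_float32x4 and hal::STORE_ALIGNED_NOCACHE come from the patch below.

#include "opencv2/core/hal/intrin.hpp"

// Hypothetical example: interleave two float planes into dst, opting in to the
// non-temporal ("no cache") store path added by this commit.
static void merge2_nocache_sketch(const float* plane0, const float* plane1,
                                  float* dst /* assumed suitably aligned */, int len)
{
    const int VECSZ = cv::v_float32x4::nlanes;
    int i = 0;
    for( ; i <= len - VECSZ; i += VECSZ )
    {
        cv::v_float32x4 a = cv::v_load(plane0 + i), b = cv::v_load(plane1 + i);
        // new 4th argument; omitting it keeps the old unaligned behaviour
        cv::v_store_interleave(dst + i*2, a, b, cv::hal::STORE_ALIGNED_NOCACHE);
    }
    for( ; i < len; i++ )               // scalar tail, no alignment requirements
    {
        dst[i*2]   = plane0[i];
        dst[i*2+1] = plane1[i];
    }
}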
@@ -60,6 +60,17 @@
// access from within opencv code more accessible
namespace cv {
namespace hal {
enum StoreMode
{
STORE_UNALIGNED = 0,
STORE_ALIGNED = 1,
STORE_ALIGNED_NOCACHE = 2
};
}
template<typename _Tp> struct V_TypeTraits
{
};
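The new STORE_ALIGNED_NOCACHE value is what the commit message calls the MOVNTPS path. As a rough illustration (an assumption about the x86 backend, not a copy of intrin_sse.hpp), the three modes presumably map to SSE stores along these lines:

#include <xmmintrin.h>   // SSE: _mm_storeu_ps / _mm_store_ps / _mm_stream_ps

// Hypothetical helper showing what each StoreMode means for a 128-bit float store.
static inline void store_f32x4_sketch(float* ptr, __m128 v, int mode /* StoreMode */)
{
    if( mode == 0 )            // STORE_UNALIGNED
        _mm_storeu_ps(ptr, v);
    else if( mode == 1 )       // STORE_ALIGNED: ptr must be 16-byte aligned
        _mm_store_ps(ptr, v);
    else                       // STORE_ALIGNED_NOCACHE: MOVNTPS bypasses the cache;
        _mm_stream_ps(ptr, v); // ptr must be 16-byte aligned
}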
@@ -1319,7 +1319,8 @@ Scheme:
For all types except 64-bit. */
template<typename _Tp, int n>
inline void v_store_interleave( _Tp* ptr, const v_reg<_Tp, n>& a,
const v_reg<_Tp, n>& b)
const v_reg<_Tp, n>& b,
hal::StoreMode /*mode*/=hal::STORE_UNALIGNED)
{
int i, i2;
for( i = i2 = 0; i < n; i++, i2 += 2 )
@@ -1339,7 +1340,8 @@ Scheme:
For all types except 64-bit. */
template<typename _Tp, int n>
inline void v_store_interleave( _Tp* ptr, const v_reg<_Tp, n>& a,
const v_reg<_Tp, n>& b, const v_reg<_Tp, n>& c)
const v_reg<_Tp, n>& b, const v_reg<_Tp, n>& c,
hal::StoreMode /*mode*/=hal::STORE_UNALIGNED)
{
int i, i3;
for( i = i3 = 0; i < n; i++, i3 += 3 )
@@ -1360,7 +1362,8 @@ Scheme:
For all types except 64-bit. */
template<typename _Tp, int n> inline void v_store_interleave( _Tp* ptr, const v_reg<_Tp, n>& a,
const v_reg<_Tp, n>& b, const v_reg<_Tp, n>& c,
const v_reg<_Tp, n>& d)
const v_reg<_Tp, n>& d,
hal::StoreMode /*mode*/=hal::STORE_UNALIGNED)
{
int i, i4;
for( i = i4 = 0; i < n; i++, i4 += 4 )
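Because the extra parameter is defaulted to hal::STORE_UNALIGNED, existing call sites keep compiling and behaving exactly as before; only callers that explicitly pass a mode opt in to the new behaviour. A tiny hypothetical illustration (the function and buffer are made up):

#include "opencv2/core/hal/intrin.hpp"

// Sketch: pre-existing 3-channel call sites need no changes.
static void store_bgr_row_sketch(uchar* row, const cv::v_uint8x16& b,
                                 const cv::v_uint8x16& g, const cv::v_uint8x16& r)
{
    cv::v_store_interleave(row, b, g, r);  // default mode: STORE_UNALIGNED, as before
    // To use the fast path instead (row must then be suitably aligned):
    // cv::v_store_interleave(row, b, g, r, cv::hal::STORE_ALIGNED_NOCACHE);
}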
@@ -1430,6 +1433,20 @@ inline void v_store_aligned(_Tp* ptr, const v_reg<_Tp, n>& a)
ptr[i] = a.s[i];
}
template<typename _Tp, int n>
inline void v_store_aligned_nocache(_Tp* ptr, const v_reg<_Tp, n>& a)
{
for( int i = 0; i < n; i++ )
ptr[i] = a.s[i];
}
template<typename _Tp, int n>
inline void v_store_aligned(_Tp* ptr, const v_reg<_Tp, n>& a, hal::StoreMode /*mode*/)
{
for( int i = 0; i < n; i++ )
ptr[i] = a.s[i];
}
/** @brief Combine vector from first elements of two vectors
Scheme:
@@ -864,6 +864,10 @@ inline void v_store(_Tp* ptr, const _Tpvec& a) \
{ vst1q_##suffix(ptr, a.val); } \
inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) \
{ vst1q_##suffix(ptr, a.val); } \
inline void v_store_aligned_nocache(_Tp* ptr, const _Tpvec& a) \
{ vst1q_##suffix(ptr, a.val); } \
inline void v_store(_Tp* ptr, const _Tpvec& a, hal::StoreMode /*mode*/) \
{ vst1q_##suffix(ptr, a.val); } \
inline void v_store_low(_Tp* ptr, const _Tpvec& a) \
{ vst1_##suffix(ptr, vget_low_##suffix(a.val)); } \
inline void v_store_high(_Tp* ptr, const _Tpvec& a) \
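For reference, with _Tpvec = v_uint8x16, _Tp = uchar and suffix = u8 the two lines added to this macro presumably expand to the snippet below (an illustration of the expansion, not standalone code). NEON has no MOVNTPS equivalent here, so all store variants fall back to the same vst1q instruction:

inline void v_store_aligned_nocache(uchar* ptr, const v_uint8x16& a)
{ vst1q_u8(ptr, a.val); }
inline void v_store(uchar* ptr, const v_uint8x16& a, hal::StoreMode /*mode*/)
{ vst1q_u8(ptr, a.val); }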
@@ -1292,14 +1296,16 @@ inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec& a, v_##_Tpvec& b, \
c.val = v.val[2]; \
d.val = v.val[3]; \
} \
inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec& a, const v_##_Tpvec& b) \
inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec& a, const v_##_Tpvec& b, \
hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) \
{ \
_Tpvec##x2_t v; \
v.val[0] = a.val; \
v.val[1] = b.val; \
vst2q_##suffix(ptr, v); \
} \
inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec& a, const v_##_Tpvec& b, const v_##_Tpvec& c) \
inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec& a, const v_##_Tpvec& b, \
const v_##_Tpvec& c, hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) \
{ \
_Tpvec##x3_t v; \
v.val[0] = a.val; \
@@ -1308,7 +1314,8 @@ inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec& a, const v_##_Tpvec&
vst3q_##suffix(ptr, v); \
} \
inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec& a, const v_##_Tpvec& b, \
const v_##_Tpvec& c, const v_##_Tpvec& d) \
const v_##_Tpvec& c, const v_##_Tpvec& d, \
hal::StoreMode /*mode*/=hal::STORE_UNALIGNED ) \
{ \
_Tpvec##x4_t v; \
v.val[0] = a.val; \
@@ -1360,7 +1367,8 @@ inline void v_load_deinterleave( const tp* ptr, v_##tp##x2& a, v_##tp##x2& b, \
d = v_##tp##x2(vcombine_##suffix(d0, d1)); \
} \
\
inline void v_store_interleave( tp* ptr, const v_##tp##x2& a, const v_##tp##x2& b ) \
inline void v_store_interleave( tp* ptr, const v_##tp##x2& a, const v_##tp##x2& b, \
hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) \
{ \
vst1_##suffix(ptr, vget_low_##suffix(a.val)); \
vst1_##suffix(ptr + 1, vget_low_##suffix(b.val)); \
@@ -1369,7 +1377,8 @@ inline void v_store_interleave( tp* ptr, const v_##tp##x2& a, const v_##tp##x2&
} \
\
inline void v_store_interleave( tp* ptr, const v_##tp##x2& a, \
const v_##tp##x2& b, const v_##tp##x2& c ) \
const v_##tp##x2& b, const v_##tp##x2& c, \
hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) \
{ \
vst1_##suffix(ptr, vget_low_##suffix(a.val)); \
vst1_##suffix(ptr + 1, vget_low_##suffix(b.val)); \
@@ -1380,7 +1389,8 @@ inline void v_store_interleave( tp* ptr, const v_##tp##x2& a, \
} \
\
inline void v_store_interleave( tp* ptr, const v_##tp##x2& a, const v_##tp##x2& b, \
const v_##tp##x2& c, const v_##tp##x2& d ) \
const v_##tp##x2& c, const v_##tp##x2& d, \
hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) \
{ \
vst1_##suffix(ptr, vget_low_##suffix(a.val)); \
vst1_##suffix(ptr + 1, vget_low_##suffix(b.val)); \
@@ -249,6 +249,10 @@ inline void v_store(_Tp* ptr, const _Tpvec& a) \
{ st(a.val, 0, ptr); } \
inline void v_store_aligned(VSX_UNUSED(_Tp* ptr), const _Tpvec& a) \
{ st_a(a.val, 0, ptr); } \
inline void v_store_aligned_nocache(VSX_UNUSED(_Tp* ptr), const _Tpvec& a) \
{ st_a(a.val, 0, ptr); } \
inline void v_store(_Tp* ptr, const _Tpvec& a, hal::StoreMode mode) \
{ if(mode == hal::STORE_UNALIGNED) st(a.val, 0, ptr); else st_a(a.val, 0, ptr); } \
inline void v_store_low(_Tp* ptr, const _Tpvec& a) \
{ vec_st_l8(a.val, ptr); } \
inline void v_store_high(_Tp* ptr, const _Tpvec& a) \
@@ -281,13 +285,16 @@ inline void v_load_deinterleave(const _Tp* ptr, _Tpvec& a, \
inline void v_load_deinterleave(const _Tp* ptr, _Tpvec& a, _Tpvec& b, \
_Tpvec& c, _Tpvec& d) \
{ vec_ld_deinterleave(ptr, a.val, b.val, c.val, d.val); } \
inline void v_store_interleave(_Tp* ptr, const _Tpvec& a, const _Tpvec& b) \
inline void v_store_interleave(_Tp* ptr, const _Tpvec& a, const _Tpvec& b, \
hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) \
{ vec_st_interleave(a.val, b.val, ptr); } \
inline void v_store_interleave(_Tp* ptr, const _Tpvec& a, \
const _Tpvec& b, const _Tpvec& c) \
const _Tpvec& b, const _Tpvec& c, \
hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) \
{ vec_st_interleave(a.val, b.val, c.val, ptr); } \
inline void v_store_interleave(_Tp* ptr, const _Tpvec& a, const _Tpvec& b, \
const _Tpvec& c, const _Tpvec& d) \
const _Tpvec& c, const _Tpvec& d, \
hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) \
{ vec_st_interleave(a.val, b.val, c.val, d.val, ptr); }
OPENCV_HAL_IMPL_VSX_INTERLEAVE(uchar, v_uint8x16)
@@ -515,17 +515,17 @@ void exp32f( const float *_x, float *y, int n )
#if CV_SIMD
const int VECSZ = v_float32::nlanes;
static const v_float32 vprescale = vx_setall_f32((float)exp_prescale);
static const v_float32 vpostscale = vx_setall_f32((float)exp_postscale);
static const v_float32 vminval = vx_setall_f32(minval);
static const v_float32 vmaxval = vx_setall_f32(maxval);
const v_float32 vprescale = vx_setall_f32((float)exp_prescale);
const v_float32 vpostscale = vx_setall_f32((float)exp_postscale);
const v_float32 vminval = vx_setall_f32(minval);
const v_float32 vmaxval = vx_setall_f32(maxval);
static const v_float32 vA1 = vx_setall_f32((float)A1);
static const v_float32 vA2 = vx_setall_f32((float)A2);
static const v_float32 vA3 = vx_setall_f32((float)A3);
static const v_float32 vA4 = vx_setall_f32((float)A4);
const v_float32 vA1 = vx_setall_f32((float)A1);
const v_float32 vA2 = vx_setall_f32((float)A2);
const v_float32 vA3 = vx_setall_f32((float)A3);
const v_float32 vA4 = vx_setall_f32((float)A4);
static const v_int32 vidxmask = vx_setall_s32(EXPTAB_MASK);
const v_int32 vidxmask = vx_setall_s32(EXPTAB_MASK);
bool y_aligned = (size_t)(void*)y % 32 == 0;
for( ; i < n; i += VECSZ*2 )
@@ -627,18 +627,18 @@ void exp64f( const double *_x, double *y, int n )
#if CV_SIMD_64F
const int VECSZ = v_float64::nlanes;
static const v_float64 vprescale = vx_setall_f64(exp_prescale);
static const v_float64 vpostscale = vx_setall_f64(exp_postscale);
static const v_float64 vminval = vx_setall_f64(minval);
static const v_float64 vmaxval = vx_setall_f64(maxval);
static const v_float64 vA1 = vx_setall_f64(A1);
static const v_float64 vA2 = vx_setall_f64(A2);
static const v_float64 vA3 = vx_setall_f64(A3);
static const v_float64 vA4 = vx_setall_f64(A4);
static const v_float64 vA5 = vx_setall_f64(A5);
static const v_int32 vidxmask = vx_setall_s32(EXPTAB_MASK);
const v_float64 vprescale = vx_setall_f64(exp_prescale);
const v_float64 vpostscale = vx_setall_f64(exp_postscale);
const v_float64 vminval = vx_setall_f64(minval);
const v_float64 vmaxval = vx_setall_f64(maxval);
const v_float64 vA1 = vx_setall_f64(A1);
const v_float64 vA2 = vx_setall_f64(A2);
const v_float64 vA3 = vx_setall_f64(A3);
const v_float64 vA4 = vx_setall_f64(A4);
const v_float64 vA5 = vx_setall_f64(A5);
const v_int32 vidxmask = vx_setall_s32(EXPTAB_MASK);
bool y_aligned = (size_t)(void*)y % 32 == 0;
for( ; i < n; i += VECSZ*2 )
@@ -1024,13 +1024,13 @@ void log32f( const float *_x, float *y, int n )
#if CV_SIMD
const int VECSZ = v_float32::nlanes;
static const v_float32 vln2 = vx_setall_f32((float)ln_2);
static const v_float32 v1 = vx_setall_f32(1.f);
static const v_float32 vshift = vx_setall_f32(-1.f/512);
const v_float32 vln2 = vx_setall_f32((float)ln_2);
const v_float32 v1 = vx_setall_f32(1.f);
const v_float32 vshift = vx_setall_f32(-1.f/512);
static const v_float32 vA0 = vx_setall_f32(A0);
static const v_float32 vA1 = vx_setall_f32(A1);
static const v_float32 vA2 = vx_setall_f32(A2);
const v_float32 vA0 = vx_setall_f32(A0);
const v_float32 vA1 = vx_setall_f32(A1);
const v_float32 vA2 = vx_setall_f32(A2);
for( ; i < n; i += VECSZ )
{
@@ -1097,9 +1097,9 @@ void log64f( const double *x, double *y, int n )
#if CV_SIMD_64F
const int VECSZ = v_float64::nlanes;
static const v_float64 vln2 = vx_setall_f64(ln_2);
const v_float64 vln2 = vx_setall_f64(ln_2);
static const v_float64
const v_float64
vA0 = vx_setall_f64(A0), vA1 = vx_setall_f64(A1),
vA2 = vx_setall_f64(A2), vA3 = vx_setall_f64(A3),
vA4 = vx_setall_f64(A4), vA5 = vx_setall_f64(A5),
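The motivation for the static const → const change above is point 1 of the commit message: a function-local static const of a vector type cannot be initialized at compile time, so the compiler emits a thread-safe one-time-initialization guard and reloads the value from memory, whereas a plain local const is just a broadcast that can stay in a register. A hypothetical before/after sketch (assumes CV_SIMD; the function itself is not from the patch):

#include "opencv2/core/hal/intrin.hpp"

#if CV_SIMD
static float scale_sum_sketch(const float* x, int n)
{
    // Before the patch this would have been written as
    //     static const v_float32 vscale = vx_setall_f32(0.5f);
    // which drags in a guarded one-time initialization. The plain const below
    // is recomputed cheaply on every call and typically stays in a register:
    const cv::v_float32 vscale = cv::vx_setall_f32(0.5f);
    cv::v_float32 s = cv::vx_setall_f32(0.f);
    int i = 0;
    for( ; i <= n - cv::v_float32::nlanes; i += cv::v_float32::nlanes )
        s += vscale * cv::vx_load(x + i);
    float r = cv::v_reduce_sum(s);
    for( ; i < n; i++ )
        r += 0.5f * x[i];               // scalar tail
    return r;
}
#endif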
@@ -9,21 +9,58 @@
namespace cv { namespace hal {
#if CV_SIMD
/*
The trick with STORE_UNALIGNED/STORE_ALIGNED_NOCACHE is the following:
on IA there are instructions movntps and such to which
v_store_interleave(...., STORE_ALIGNED_NOCACHE) is mapped.
Those instructions write directly into memory without touching the cache,
which results in dramatic speed improvements, especially on
large arrays (FullHD, 4K etc.).
Those intrinsics require the destination address to be aligned
by 16/32 bytes (with SSE2 and AVX2, respectively).
So we potentially split the processing into 3 stages:
1) the optional prefix part [0:i0), where we use simple unaligned stores.
2) the optional main part [i0:len - VECSZ], where we use "nocache" mode.
But in some cases we have to use unaligned stores in this part.
3) the optional suffix part (the tail) (len - VECSZ:len), where we switch back to "unaligned" mode
to process the last few (fewer than VECSZ) remaining elements.
In principle the data can be so poorly aligned that there is no main part at all;
in that case we set i0=0 and use unaligned stores for the whole array.
*/
template<typename T, typename VecT> static void
vecmerge_( const T** src, T* dst, int len, int cn )
{
int i;
const int VECSZ = VecT::nlanes;
int i, i0 = 0;
const T* src0 = src[0];
const T* src1 = src[1];
const int VECSZ = VecT::nlanes;
int r = (int)((size_t)(void*)dst % (VECSZ*sizeof(T)));
hal::StoreMode mode = hal::STORE_ALIGNED_NOCACHE;
if( r != 0 )
{
mode = hal::STORE_UNALIGNED;
if( r % cn == 0 && len > VECSZ )
i0 = VECSZ - (r / cn);
}
if( cn == 2 )
{
for( i = 0; i < len; i += VECSZ )
{
i = std::min( len - VECSZ, i );
if( i > len - VECSZ )
{
i = len - VECSZ;
mode = hal::STORE_UNALIGNED;
}
VecT a = vx_load(src0 + i), b = vx_load(src1 + i);
v_store_interleave(dst + i*cn, a, b);
v_store_interleave(dst + i*cn, a, b, mode);
if( i < i0 )
{
i = i0 - VECSZ;
mode = hal::STORE_ALIGNED_NOCACHE;
}
}
}
else if( cn == 3 )
@@ -31,9 +68,18 @@ vecmerge_( const T** src, T* dst, int len, int cn )
const T* src2 = src[2];
for( i = 0; i < len; i += VECSZ )
{
i = std::min( len - VECSZ, i );
if( i > len - VECSZ )
{
i = len - VECSZ;
mode = hal::STORE_UNALIGNED;
}
VecT a = vx_load(src0 + i), b = vx_load(src1 + i), c = vx_load(src2 + i);
v_store_interleave(dst + i*cn, a, b, c);
v_store_interleave(dst + i*cn, a, b, c, mode);
if( i < i0 )
{
i = i0 - VECSZ;
mode = hal::STORE_ALIGNED_NOCACHE;
}
}
}
else
@@ -43,10 +89,19 @@ vecmerge_( const T** src, T* dst, int len, int cn )
const T* src3 = src[3];
for( i = 0; i < len; i += VECSZ )
{
i = std::min( len - VECSZ, i );
if( i > len - VECSZ )
{
i = len - VECSZ;
mode = hal::STORE_UNALIGNED;
}
VecT a = vx_load(src0 + i), b = vx_load(src1 + i);
VecT c = vx_load(src2 + i), d = vx_load(src3 + i);
v_store_interleave(dst + i*cn, a, b, c, d);
v_store_interleave(dst + i*cn, a, b, c, d, mode);
if( i < i0 )
{
i = i0 - VECSZ;
mode = hal::STORE_ALIGNED_NOCACHE;
}
}
}
vx_cleanup();
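To make the prefix/main/tail bookkeeping above concrete, here is a small stand-alone simulation of the index and mode sequence the loop produces for one hypothetical case (T = uchar so byte and element offsets coincide, cn = 2, VECSZ = 16, len = 100, dst misaligned by 6 bytes). The numbers are illustrative; the control flow mirrors the statements shown in the diff:

#include <cstdio>

int main()
{
    const int VECSZ = 16, cn = 2, len = 100;
    const int r = 6;                      // dst % (VECSZ*sizeof(T)), hypothetical
    int i0 = 0;
    bool nocache = (r == 0);              // STORE_ALIGNED_NOCACHE vs STORE_UNALIGNED
    if( r != 0 && r % cn == 0 && len > VECSZ )
        i0 = VECSZ - r / cn;              // = 13: first aligned output index
    for( int i = 0; i < len; i += VECSZ )
    {
        if( i > len - VECSZ )             // clamp the tail and drop alignment
        {
            i = len - VECSZ;
            nocache = false;
        }
        std::printf("store at i=%2d (%s)\n", i, nocache ? "aligned nocache" : "unaligned");
        if( i < i0 )                      // prefix done: jump so the next i is i0
        {
            i = i0 - VECSZ;
            nocache = true;
        }
    }
    return 0;
}
// Prints: i=0 unaligned, i=13,29,45,61,77 aligned nocache, i=84 unaligned.
// The overlapping stores at the boundaries rewrite the same values and are harmless.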
@@ -9,23 +9,46 @@
namespace cv { namespace hal {
#if CV_SIMD
// see the comments for vecmerge_ in merge.cpp
template<typename T, typename VecT> static void
vecsplit_( const T* src, T** dst, int len, int cn )
{
int i;
const int VECSZ = VecT::nlanes;
int i, i0 = 0;
T* dst0 = dst[0];
T* dst1 = dst[1];
const int VECSZ = VecT::nlanes;
int r0 = (int)((size_t)(void*)dst0 % (VECSZ*sizeof(T)));
int r1 = (int)((size_t)(void*)dst1 % (VECSZ*sizeof(T)));
int r2 = cn > 2 ? (int)((size_t)(void*)dst[2] % (VECSZ*sizeof(T))) : r0;
int r3 = cn > 3 ? (int)((size_t)(void*)dst[3] % (VECSZ*sizeof(T))) : r0;
hal::StoreMode mode = hal::STORE_ALIGNED_NOCACHE;
if( (r0|r1|r2|r3) != 0 )
{
mode = hal::STORE_UNALIGNED;
if( r0 == r1 && r0 == r2 && r0 == r3 && r0 % cn == 0 && len > VECSZ )
i0 = VECSZ - (r0 / cn);
}
if( cn == 2 )
{
for( i = 0; i < len; i += VECSZ )
{
i = std::min( len - VECSZ, i );
if( i > len - VECSZ )
{
i = len - VECSZ;
mode = hal::STORE_UNALIGNED;
}
VecT a, b;
v_load_deinterleave(src + i*cn, a, b);
v_store(dst0 + i, a);
v_store(dst1 + i, b);
v_store(dst0 + i, a, mode);
v_store(dst1 + i, b, mode);
if( i < i0 )
{
i = i0 - VECSZ;
mode = hal::STORE_ALIGNED_NOCACHE;
}
}
}
else if( cn == 3 )
@@ -33,12 +56,21 @@ vecsplit_( const T* src, T** dst, int len, int cn )
T* dst2 = dst[2];
for( i = 0; i < len; i += VECSZ )
{
i = std::min( len - VECSZ, i );
if( i > len - VECSZ )
{
i = len - VECSZ;
mode = hal::STORE_UNALIGNED;
}
VecT a, b, c;
v_load_deinterleave(src + i*cn, a, b, c);
v_store(dst0 + i, a);
v_store(dst1 + i, b);
v_store(dst2 + i, c);
v_store(dst0 + i, a, mode);
v_store(dst1 + i, b, mode);
v_store(dst2 + i, c, mode);
if( i < i0 )
{
i = i0 - VECSZ;
mode = hal::STORE_ALIGNED_NOCACHE;
}
}
}
else
@@ -48,13 +80,22 @@ vecsplit_( const T* src, T** dst, int len, int cn )
T* dst3 = dst[3];
for( i = 0; i < len; i += VECSZ )
{
i = std::min( len - VECSZ, i );
if( i > len - VECSZ )
{
i = len - VECSZ;
mode = hal::STORE_UNALIGNED;
}
VecT a, b, c, d;
v_load_deinterleave(src + i*cn, a, b, c, d);
v_store(dst0 + i, a);
v_store(dst1 + i, b);
v_store(dst2 + i, c);
v_store(dst3 + i, d);
v_store(dst0 + i, a, mode);
v_store(dst1 + i, b, mode);
v_store(dst2 + i, c, mode);
v_store(dst3 + i, d, mode);
if( i < i0 )
{
i = i0 - VECSZ;
mode = hal::STORE_ALIGNED_NOCACHE;
}
}
}
vx_cleanup();
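The fourfold remainder check at the top of vecsplit_ relies on a small trick: OR-ing the per-destination remainders is zero exactly when every destination pointer is aligned, and for cn < 4 the unused slots are filled with r0 so they cannot spoil the test. A minimal stand-alone sketch of that check (the helper name and the 16-byte block size are illustrative):

#include <cstddef>

static bool all_dsts_aligned_sketch(void* const dst[], int cn, size_t block = 16)
{
    size_t r0 = (size_t)dst[0] % block;
    size_t r1 = (size_t)dst[1] % block;
    size_t r2 = cn > 2 ? (size_t)dst[2] % block : r0;  // reuse r0 when there is no 3rd plane
    size_t r3 = cn > 3 ? (size_t)dst[3] % block : r0;  // reuse r0 when there is no 4th plane
    return (r0 | r1 | r2 | r3) == 0;                   // zero only if all four are zero
}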