Commit 8d22ac20 authored by Alexander Alekhin

core: workaround flipHoriz() alignment issues

parent 1c4a64f0
@@ -514,6 +514,43 @@ static inline size_t roundUp(size_t a, unsigned int b)
return a + b - 1 - (a + b - 1) % b; return a + b - 1 - (a + b - 1) % b;
} }
/** @brief Alignment check of passed values
Usage: `isAligned<sizeof(int)>(...)`
@note Alignment(N) must be a power of 2 (2**k, 2^k)
@return true if `data`, interpreted as an address/integer, is a multiple of N
*/
template<int N, typename T> static inline
bool isAligned(const T& data)
{
    CV_StaticAssert((N & (N - 1)) == 0, ""); // compile-time check: N is a power of 2
    // For a power-of-2 N, (value & (N-1)) isolates the low log2(N) bits;
    // they are all zero exactly when the value is a multiple of N.
    return (((size_t)data) & (N - 1)) == 0;
}
/** @overload
Pointer variant: true when `p1` is N-byte aligned. */
template<int N> static inline
bool isAligned(const void* p1)
{
    // Delegate to the integral base overload on the pointer's address value.
    return isAligned<N>(reinterpret_cast<size_t>(p1));
}
/** @overload
True only when both pointers are N-byte aligned. */
template<int N> static inline
bool isAligned(const void* p1, const void* p2)
{
    // OR-ing the addresses preserves any set low-order bit, so one mask
    // test covers both pointers at once.
    const size_t combined = reinterpret_cast<size_t>(p1) | reinterpret_cast<size_t>(p2);
    return isAligned<N>(combined);
}
/** @overload
True only when all three pointers are N-byte aligned. */
template<int N> static inline
bool isAligned(const void* p1, const void* p2, const void* p3)
{
    // A misaligned low bit in any address survives the OR and fails the test.
    size_t bits = reinterpret_cast<size_t>(p1);
    bits |= reinterpret_cast<size_t>(p2);
    bits |= reinterpret_cast<size_t>(p3);
    return isAligned<N>(bits);
}
/** @overload
True only when all four pointers are N-byte aligned. */
template<int N> static inline
bool isAligned(const void* p1, const void* p2, const void* p3, const void* p4)
{
    // All four are aligned iff each pair is aligned (no side effects, so
    // short-circuiting cannot change observable behavior).
    return isAligned<N>(p1, p2) && isAligned<N>(p3, p4);
}
/** @brief Enables or disables the optimized code. /** @brief Enables or disables the optimized code.
The function can be used to dynamically turn on and off optimized dispatched code (code that uses SSE4.2, AVX/AVX2, The function can be used to dynamically turn on and off optimized dispatched code (code that uses SSE4.2, AVX/AVX2,
......
...@@ -563,6 +563,12 @@ Mat& Mat::setTo(InputArray _value, InputArray _mask) ...@@ -563,6 +563,12 @@ Mat& Mat::setTo(InputArray _value, InputArray _mask)
return *this; return *this;
} }
// Workaround for flipHoriz() alignment issues: on 32-bit ARM NEON builds
// (CV_NEON set, __aarch64__ not defined) the SIMD paths below first verify
// that the source/destination pointers and steps are suitably aligned.
// On other targets the macro is 0 and the extra checks compile out.
// NOTE(review): presumably unaligned vector access is unsafe/slow on
// 32-bit NEON — confirm against the referenced commit discussion.
#if CV_NEON && !defined(__aarch64__)
#define CV_CHECK_ALIGNMENT 1
#else
#define CV_CHECK_ALIGNMENT 0
#endif
#if CV_SIMD128 #if CV_SIMD128
template<typename V> CV_ALWAYS_INLINE void flipHoriz_single( const uchar* src, size_t sstep, uchar* dst, size_t dstep, Size size, size_t esz ) template<typename V> CV_ALWAYS_INLINE void flipHoriz_single( const uchar* src, size_t sstep, uchar* dst, size_t dstep, Size size, size_t esz )
{ {
...@@ -572,6 +578,10 @@ template<typename V> CV_ALWAYS_INLINE void flipHoriz_single( const uchar* src, s ...@@ -572,6 +578,10 @@ template<typename V> CV_ALWAYS_INLINE void flipHoriz_single( const uchar* src, s
int width_1 = width & -v_uint8x16::nlanes; int width_1 = width & -v_uint8x16::nlanes;
int i, j; int i, j;
#if CV_CHECK_ALIGNMENT
CV_Assert(isAligned<sizeof(T)>(src, dst));
#endif
for( ; size.height--; src += sstep, dst += dstep ) for( ; size.height--; src += sstep, dst += dstep )
{ {
for( i = 0, j = end; i < width_1; i += v_uint8x16::nlanes, j -= v_uint8x16::nlanes ) for( i = 0, j = end; i < width_1; i += v_uint8x16::nlanes, j -= v_uint8x16::nlanes )
...@@ -585,7 +595,7 @@ template<typename V> CV_ALWAYS_INLINE void flipHoriz_single( const uchar* src, s ...@@ -585,7 +595,7 @@ template<typename V> CV_ALWAYS_INLINE void flipHoriz_single( const uchar* src, s
v_store((T*)(dst + j - v_uint8x16::nlanes), t0); v_store((T*)(dst + j - v_uint8x16::nlanes), t0);
v_store((T*)(dst + i), t1); v_store((T*)(dst + i), t1);
} }
if (((size_t)src|(size_t)dst) % sizeof(T) == 0) if (isAligned<sizeof(T)>(src, dst))
{ {
for ( ; i < width; i += sizeof(T), j -= sizeof(T) ) for ( ; i < width; i += sizeof(T), j -= sizeof(T) )
{ {
...@@ -620,6 +630,11 @@ template<typename T1, typename T2> CV_ALWAYS_INLINE void flipHoriz_double( const ...@@ -620,6 +630,11 @@ template<typename T1, typename T2> CV_ALWAYS_INLINE void flipHoriz_double( const
int end = (int)(size.width*esz); int end = (int)(size.width*esz);
int width = (end + 1)/2; int width = (end + 1)/2;
#if CV_CHECK_ALIGNMENT
CV_Assert(isAligned<sizeof(T1)>(src, dst));
CV_Assert(isAligned<sizeof(T2)>(src, dst));
#endif
for( ; size.height--; src += sstep, dst += dstep ) for( ; size.height--; src += sstep, dst += dstep )
{ {
for ( int i = 0, j = end; i < width; i += sizeof(T1) + sizeof(T2), j -= sizeof(T1) + sizeof(T2) ) for ( int i = 0, j = end; i < width; i += sizeof(T1) + sizeof(T2), j -= sizeof(T1) + sizeof(T2) )
...@@ -644,6 +659,9 @@ static void ...@@ -644,6 +659,9 @@ static void
flipHoriz( const uchar* src, size_t sstep, uchar* dst, size_t dstep, Size size, size_t esz ) flipHoriz( const uchar* src, size_t sstep, uchar* dst, size_t dstep, Size size, size_t esz )
{ {
#if CV_SIMD #if CV_SIMD
#if CV_CHECK_ALIGNMENT
size_t alignmentMark = ((size_t)src)|((size_t)dst)|sstep|dstep;
#endif
if (esz == 2 * v_uint8x16::nlanes) if (esz == 2 * v_uint8x16::nlanes)
{ {
int end = (int)(size.width*esz); int end = (int)(size.width*esz);
...@@ -693,15 +711,27 @@ flipHoriz( const uchar* src, size_t sstep, uchar* dst, size_t dstep, Size size, ...@@ -693,15 +711,27 @@ flipHoriz( const uchar* src, size_t sstep, uchar* dst, size_t dstep, Size size,
} }
} }
} }
else if (esz == 8) else if (esz == 8
#if CV_CHECK_ALIGNMENT
&& isAligned<sizeof(uint64)>(alignmentMark)
#endif
)
{ {
flipHoriz_single<v_uint64x2>(src, sstep, dst, dstep, size, esz); flipHoriz_single<v_uint64x2>(src, sstep, dst, dstep, size, esz);
} }
else if (esz == 4) else if (esz == 4
#if CV_CHECK_ALIGNMENT
&& isAligned<sizeof(unsigned)>(alignmentMark)
#endif
)
{ {
flipHoriz_single<v_uint32x4>(src, sstep, dst, dstep, size, esz); flipHoriz_single<v_uint32x4>(src, sstep, dst, dstep, size, esz);
} }
else if (esz == 2) else if (esz == 2
#if CV_CHECK_ALIGNMENT
&& isAligned<sizeof(ushort)>(alignmentMark)
#endif
)
{ {
flipHoriz_single<v_uint16x8>(src, sstep, dst, dstep, size, esz); flipHoriz_single<v_uint16x8>(src, sstep, dst, dstep, size, esz);
} }
...@@ -709,7 +739,11 @@ flipHoriz( const uchar* src, size_t sstep, uchar* dst, size_t dstep, Size size, ...@@ -709,7 +739,11 @@ flipHoriz( const uchar* src, size_t sstep, uchar* dst, size_t dstep, Size size,
{ {
flipHoriz_single<v_uint8x16>(src, sstep, dst, dstep, size, esz); flipHoriz_single<v_uint8x16>(src, sstep, dst, dstep, size, esz);
} }
else if (esz == 24) else if (esz == 24
#if CV_CHECK_ALIGNMENT
&& isAligned<sizeof(uint64_t)>(alignmentMark)
#endif
)
{ {
int end = (int)(size.width*esz); int end = (int)(size.width*esz);
int width = (end + 1)/2; int width = (end + 1)/2;
...@@ -732,6 +766,7 @@ flipHoriz( const uchar* src, size_t sstep, uchar* dst, size_t dstep, Size size, ...@@ -732,6 +766,7 @@ flipHoriz( const uchar* src, size_t sstep, uchar* dst, size_t dstep, Size size,
} }
} }
} }
#if !CV_CHECK_ALIGNMENT
else if (esz == 12) else if (esz == 12)
{ {
flipHoriz_double<uint64_t,uint>(src, sstep, dst, dstep, size, esz); flipHoriz_double<uint64_t,uint>(src, sstep, dst, dstep, size, esz);
...@@ -744,8 +779,9 @@ flipHoriz( const uchar* src, size_t sstep, uchar* dst, size_t dstep, Size size, ...@@ -744,8 +779,9 @@ flipHoriz( const uchar* src, size_t sstep, uchar* dst, size_t dstep, Size size,
{ {
flipHoriz_double<ushort,uchar>(src, sstep, dst, dstep, size, esz); flipHoriz_double<ushort,uchar>(src, sstep, dst, dstep, size, esz);
} }
else
#endif #endif
else
#endif // CV_SIMD
{ {
int i, j, limit = (int)(((size.width + 1)/2)*esz); int i, j, limit = (int)(((size.width + 1)/2)*esz);
AutoBuffer<int> _tab(size.width*esz); AutoBuffer<int> _tab(size.width*esz);
...@@ -779,16 +815,33 @@ flipVert( const uchar* src0, size_t sstep, uchar* dst0, size_t dstep, Size size, ...@@ -779,16 +815,33 @@ flipVert( const uchar* src0, size_t sstep, uchar* dst0, size_t dstep, Size size,
{ {
int i = 0; int i = 0;
#if CV_SIMD #if CV_SIMD
for( ; i <= size.width - (v_int32::nlanes * 4); i += v_int32::nlanes * 4 ) #if CV_CHECK_ALIGNMENT
if (isAligned<sizeof(int)>(src0, src1, dst0, dst1))
#endif
{ {
v_int32 t0 = vx_load((int*)(src0 + i)); for (; i <= size.width - CV_SIMD_WIDTH; i += CV_SIMD_WIDTH)
v_int32 t1 = vx_load((int*)(src1 + i)); {
vx_store((int*)(dst0 + i), t1); v_int32 t0 = vx_load((int*)(src0 + i));
vx_store((int*)(dst1 + i), t0); v_int32 t1 = vx_load((int*)(src1 + i));
vx_store((int*)(dst0 + i), t1);
vx_store((int*)(dst1 + i), t0);
}
} }
#if CV_CHECK_ALIGNMENT
else
{
for (; i <= size.width - CV_SIMD_WIDTH; i += CV_SIMD_WIDTH)
{
v_uint8 t0 = vx_load(src0 + i);
v_uint8 t1 = vx_load(src1 + i);
vx_store(dst0 + i, t1);
vx_store(dst1 + i, t0);
}
}
#endif
#endif #endif
if( ((size_t)src0|(size_t)dst0|(size_t)src1|(size_t)dst1) % sizeof(int) == 0 ) if (isAligned<sizeof(int)>(src0, src1, dst0, dst1))
{ {
for( ; i <= size.width - 16; i += 16 ) for( ; i <= size.width - 16; i += 16 )
{ {
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment