Unverified commit 43820d89 authored by Vadim Pisarevsky, committed by GitHub

further improvements in split & merge; started using non-temporal store instructions (#12063)

* 1. changed static const __m128/256 to const __m128/256 to avoid the weird instructions and calls the compiler inserts for the guarded one-time initialization of function-local statics.
2. added universal intrinsics that wrap MOVNTPS and other such non-temporal ("no cache") store instructions. v_store_interleave() and v_store() got overloads that take the corresponding StoreMode flag.
3. rewrote split & merge to use the "no cache" store instructions. This resulted in a dramatic performance improvement when processing big arrays.

* hopefully, fixed some test failures where 4-channel v_store_interleave() is used

* added missing implementation of the new universal intrinsics (v_store_aligned_nocache() etc.)

* fixed silly typo in the new intrinsics in intrin_vsx.hpp

* still trying to fix VSX compiler errors

* still trying to fix VSX compiler errors

* still trying to fix VSX compiler errors

* still trying to fix VSX compiler errors
parent 5336b9ad
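For orientation before the diff: a minimal sketch of how the new StoreMode-aware overloads are meant to be called. The function name, buffer layout and the assumption that dst is suitably aligned for the non-temporal path are hypothetical; only v_store_interleave(), v_load(), v_float32x4 and hal::STORE_ALIGNED_NOCACHE come from the patch below.

#include "opencv2/core/hal/intrin.hpp"

// Hypothetical example: interleave two float planes into dst, opting in to the
// non-temporal ("no cache") store path added by this commit.
static void merge2_nocache_sketch(const float* plane0, const float* plane1,
                                  float* dst /* assumed suitably aligned */, int len)
{
    const int VECSZ = cv::v_float32x4::nlanes;
    int i = 0;
    for( ; i <= len - VECSZ; i += VECSZ )
    {
        cv::v_float32x4 a = cv::v_load(plane0 + i), b = cv::v_load(plane1 + i);
        // new 4th argument; omitting it keeps the old unaligned behaviour
        cv::v_store_interleave(dst + i*2, a, b, cv::hal::STORE_ALIGNED_NOCACHE);
    }
    for( ; i < len; i++ )               // scalar tail, no alignment requirements
    {
        dst[i*2]   = plane0[i];
        dst[i*2+1] = plane1[i];
    }
}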
@@ -60,6 +60,17 @@
// access from within opencv code more accessible
namespace cv {
namespace hal {
enum StoreMode
{
STORE_UNALIGNED = 0,
STORE_ALIGNED = 1,
STORE_ALIGNED_NOCACHE = 2
};
}
template<typename _Tp> struct V_TypeTraits
{
};
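The new STORE_ALIGNED_NOCACHE value is what the commit message calls the MOVNTPS path. As a rough illustration (an assumption about the x86 backend, not a copy of intrin_sse.hpp), the three modes presumably map to SSE stores along these lines:

#include <xmmintrin.h>   // SSE: _mm_storeu_ps / _mm_store_ps / _mm_stream_ps

// Hypothetical helper showing what each StoreMode means for a 128-bit float store.
static inline void store_f32x4_sketch(float* ptr, __m128 v, int mode /* StoreMode */)
{
    if( mode == 0 )            // STORE_UNALIGNED
        _mm_storeu_ps(ptr, v);
    else if( mode == 1 )       // STORE_ALIGNED: ptr must be 16-byte aligned
        _mm_store_ps(ptr, v);
    else                       // STORE_ALIGNED_NOCACHE: MOVNTPS bypasses the cache;
        _mm_stream_ps(ptr, v); // ptr must be 16-byte aligned
}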
@@ -1319,7 +1319,8 @@ Scheme:
For all types except 64-bit. */
template<typename _Tp, int n>
inline void v_store_interleave( _Tp* ptr, const v_reg<_Tp, n>& a,
const v_reg<_Tp, n>& b)
const v_reg<_Tp, n>& b,
hal::StoreMode /*mode*/=hal::STORE_UNALIGNED)
{
int i, i2;
for( i = i2 = 0; i < n; i++, i2 += 2 )
@@ -1339,7 +1340,8 @@ Scheme:
For all types except 64-bit. */
template<typename _Tp, int n>
inline void v_store_interleave( _Tp* ptr, const v_reg<_Tp, n>& a,
const v_reg<_Tp, n>& b, const v_reg<_Tp, n>& c)
const v_reg<_Tp, n>& b, const v_reg<_Tp, n>& c,
hal::StoreMode /*mode*/=hal::STORE_UNALIGNED)
{
int i, i3;
for( i = i3 = 0; i < n; i++, i3 += 3 )
@@ -1360,7 +1362,8 @@ Scheme:
For all types except 64-bit. */
template<typename _Tp, int n> inline void v_store_interleave( _Tp* ptr, const v_reg<_Tp, n>& a,
const v_reg<_Tp, n>& b, const v_reg<_Tp, n>& c,
const v_reg<_Tp, n>& d)
const v_reg<_Tp, n>& d,
hal::StoreMode /*mode*/=hal::STORE_UNALIGNED)
{
int i, i4;
for( i = i4 = 0; i < n; i++, i4 += 4 )
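Because the extra parameter is defaulted to hal::STORE_UNALIGNED, existing call sites keep compiling and behaving exactly as before; only callers that explicitly pass a mode opt in to the new behaviour. A tiny hypothetical illustration (the function and buffer are made up):

#include "opencv2/core/hal/intrin.hpp"

// Sketch: pre-existing 3-channel call sites need no changes.
static void store_bgr_row_sketch(uchar* row, const cv::v_uint8x16& b,
                                 const cv::v_uint8x16& g, const cv::v_uint8x16& r)
{
    cv::v_store_interleave(row, b, g, r);  // default mode: STORE_UNALIGNED, as before
    // To use the fast path instead (row must then be suitably aligned):
    // cv::v_store_interleave(row, b, g, r, cv::hal::STORE_ALIGNED_NOCACHE);
}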
@@ -1430,6 +1433,20 @@ inline void v_store_aligned(_Tp* ptr, const v_reg<_Tp, n>& a)
ptr[i] = a.s[i];
}
template<typename _Tp, int n>
inline void v_store_aligned_nocache(_Tp* ptr, const v_reg<_Tp, n>& a)
{
for( int i = 0; i < n; i++ )
ptr[i] = a.s[i];
}
template<typename _Tp, int n>
inline void v_store_aligned(_Tp* ptr, const v_reg<_Tp, n>& a, hal::StoreMode /*mode*/)
{
for( int i = 0; i < n; i++ )
ptr[i] = a.s[i];
}
/** @brief Combine vector from first elements of two vectors
Scheme:
@@ -864,6 +864,10 @@ inline void v_store(_Tp* ptr, const _Tpvec& a) \
{ vst1q_##suffix(ptr, a.val); } \
inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) \
{ vst1q_##suffix(ptr, a.val); } \
inline void v_store_aligned_nocache(_Tp* ptr, const _Tpvec& a) \
{ vst1q_##suffix(ptr, a.val); } \
inline void v_store(_Tp* ptr, const _Tpvec& a, hal::StoreMode /*mode*/) \
{ vst1q_##suffix(ptr, a.val); } \
inline void v_store_low(_Tp* ptr, const _Tpvec& a) \
{ vst1_##suffix(ptr, vget_low_##suffix(a.val)); } \
inline void v_store_high(_Tp* ptr, const _Tpvec& a) \
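For reference, with _Tpvec = v_uint8x16, _Tp = uchar and suffix = u8 the two lines added to this macro presumably expand to the snippet below (an illustration of the expansion, not standalone code). NEON has no MOVNTPS equivalent here, so all store variants fall back to the same vst1q instruction:

inline void v_store_aligned_nocache(uchar* ptr, const v_uint8x16& a)
{ vst1q_u8(ptr, a.val); }
inline void v_store(uchar* ptr, const v_uint8x16& a, hal::StoreMode /*mode*/)
{ vst1q_u8(ptr, a.val); }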
@@ -1292,14 +1296,16 @@ inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec& a, v_##_Tpvec& b, \
c.val = v.val[2]; \
d.val = v.val[3]; \
} \
inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec& a, const v_##_Tpvec& b) \
inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec& a, const v_##_Tpvec& b, \
hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) \
{ \
_Tpvec##x2_t v; \
v.val[0] = a.val; \
v.val[1] = b.val; \
vst2q_##suffix(ptr, v); \
} \
inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec& a, const v_##_Tpvec& b, const v_##_Tpvec& c) \
inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec& a, const v_##_Tpvec& b, \
const v_##_Tpvec& c, hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) \
{ \
_Tpvec##x3_t v; \
v.val[0] = a.val; \
@@ -1308,7 +1314,8 @@ inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec& a, const v_##_Tpvec&
vst3q_##suffix(ptr, v); \
} \
inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec& a, const v_##_Tpvec& b, \
const v_##_Tpvec& c, const v_##_Tpvec& d) \
const v_##_Tpvec& c, const v_##_Tpvec& d, \
hal::StoreMode /*mode*/=hal::STORE_UNALIGNED ) \
{ \
_Tpvec##x4_t v; \
v.val[0] = a.val; \
@@ -1360,7 +1367,8 @@ inline void v_load_deinterleave( const tp* ptr, v_##tp##x2& a, v_##tp##x2& b, \
d = v_##tp##x2(vcombine_##suffix(d0, d1)); \
} \
\
inline void v_store_interleave( tp* ptr, const v_##tp##x2& a, const v_##tp##x2& b ) \
inline void v_store_interleave( tp* ptr, const v_##tp##x2& a, const v_##tp##x2& b, \
hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) \
{ \
vst1_##suffix(ptr, vget_low_##suffix(a.val)); \
vst1_##suffix(ptr + 1, vget_low_##suffix(b.val)); \
@@ -1369,7 +1377,8 @@ inline void v_store_interleave( tp* ptr, const v_##tp##x2& a, const v_##tp##x2&
} \
\
inline void v_store_interleave( tp* ptr, const v_##tp##x2& a, \
const v_##tp##x2& b, const v_##tp##x2& c ) \
const v_##tp##x2& b, const v_##tp##x2& c, \
hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) \
{ \
vst1_##suffix(ptr, vget_low_##suffix(a.val)); \
vst1_##suffix(ptr + 1, vget_low_##suffix(b.val)); \
@@ -1380,7 +1389,8 @@ inline void v_store_interleave( tp* ptr, const v_##tp##x2& a, \
} \
\
inline void v_store_interleave( tp* ptr, const v_##tp##x2& a, const v_##tp##x2& b, \
const v_##tp##x2& c, const v_##tp##x2& d ) \
const v_##tp##x2& c, const v_##tp##x2& d, \
hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) \
{ \
vst1_##suffix(ptr, vget_low_##suffix(a.val)); \
vst1_##suffix(ptr + 1, vget_low_##suffix(b.val)); \
@@ -249,6 +249,10 @@ inline void v_store(_Tp* ptr, const _Tpvec& a) \
{ st(a.val, 0, ptr); } \
inline void v_store_aligned(VSX_UNUSED(_Tp* ptr), const _Tpvec& a) \
{ st_a(a.val, 0, ptr); } \
inline void v_store_aligned_nocache(VSX_UNUSED(_Tp* ptr), const _Tpvec& a) \
{ st_a(a.val, 0, ptr); } \
inline void v_store(_Tp* ptr, const _Tpvec& a, hal::StoreMode mode) \
{ if(mode == hal::STORE_UNALIGNED) st(a.val, 0, ptr); else st_a(a.val, 0, ptr); } \
inline void v_store_low(_Tp* ptr, const _Tpvec& a) \
{ vec_st_l8(a.val, ptr); } \
inline void v_store_high(_Tp* ptr, const _Tpvec& a) \
@@ -281,13 +285,16 @@ inline void v_load_deinterleave(const _Tp* ptr, _Tpvec& a, \
inline void v_load_deinterleave(const _Tp* ptr, _Tpvec& a, _Tpvec& b, \
_Tpvec& c, _Tpvec& d) \
{ vec_ld_deinterleave(ptr, a.val, b.val, c.val, d.val); } \
inline void v_store_interleave(_Tp* ptr, const _Tpvec& a, const _Tpvec& b) \
inline void v_store_interleave(_Tp* ptr, const _Tpvec& a, const _Tpvec& b, \
hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) \
{ vec_st_interleave(a.val, b.val, ptr); } \
inline void v_store_interleave(_Tp* ptr, const _Tpvec& a, \
const _Tpvec& b, const _Tpvec& c) \
const _Tpvec& b, const _Tpvec& c, \
hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) \
{ vec_st_interleave(a.val, b.val, c.val, ptr); } \
inline void v_store_interleave(_Tp* ptr, const _Tpvec& a, const _Tpvec& b, \
const _Tpvec& c, const _Tpvec& d) \
const _Tpvec& c, const _Tpvec& d, \
hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) \
{ vec_st_interleave(a.val, b.val, c.val, d.val, ptr); }
OPENCV_HAL_IMPL_VSX_INTERLEAVE(uchar, v_uint8x16)
@@ -515,17 +515,17 @@ void exp32f( const float *_x, float *y, int n )
#if CV_SIMD
const int VECSZ = v_float32::nlanes;
static const v_float32 vprescale = vx_setall_f32((float)exp_prescale);
static const v_float32 vpostscale = vx_setall_f32((float)exp_postscale);
static const v_float32 vminval = vx_setall_f32(minval);
static const v_float32 vmaxval = vx_setall_f32(maxval);
const v_float32 vprescale = vx_setall_f32((float)exp_prescale);
const v_float32 vpostscale = vx_setall_f32((float)exp_postscale);
const v_float32 vminval = vx_setall_f32(minval);
const v_float32 vmaxval = vx_setall_f32(maxval);
static const v_float32 vA1 = vx_setall_f32((float)A1);
static const v_float32 vA2 = vx_setall_f32((float)A2);
static const v_float32 vA3 = vx_setall_f32((float)A3);
static const v_float32 vA4 = vx_setall_f32((float)A4);
const v_float32 vA1 = vx_setall_f32((float)A1);
const v_float32 vA2 = vx_setall_f32((float)A2);
const v_float32 vA3 = vx_setall_f32((float)A3);
const v_float32 vA4 = vx_setall_f32((float)A4);
static const v_int32 vidxmask = vx_setall_s32(EXPTAB_MASK);
const v_int32 vidxmask = vx_setall_s32(EXPTAB_MASK);
bool y_aligned = (size_t)(void*)y % 32 == 0;
for( ; i < n; i += VECSZ*2 )
@@ -627,18 +627,18 @@ void exp64f( const double *_x, double *y, int n )
#if CV_SIMD_64F
const int VECSZ = v_float64::nlanes;
static const v_float64 vprescale = vx_setall_f64(exp_prescale);
static const v_float64 vpostscale = vx_setall_f64(exp_postscale);
static const v_float64 vminval = vx_setall_f64(minval);
static const v_float64 vmaxval = vx_setall_f64(maxval);
static const v_float64 vA1 = vx_setall_f64(A1);
static const v_float64 vA2 = vx_setall_f64(A2);
static const v_float64 vA3 = vx_setall_f64(A3);
static const v_float64 vA4 = vx_setall_f64(A4);
static const v_float64 vA5 = vx_setall_f64(A5);
static const v_int32 vidxmask = vx_setall_s32(EXPTAB_MASK);
const v_float64 vprescale = vx_setall_f64(exp_prescale);
const v_float64 vpostscale = vx_setall_f64(exp_postscale);
const v_float64 vminval = vx_setall_f64(minval);
const v_float64 vmaxval = vx_setall_f64(maxval);
const v_float64 vA1 = vx_setall_f64(A1);
const v_float64 vA2 = vx_setall_f64(A2);
const v_float64 vA3 = vx_setall_f64(A3);
const v_float64 vA4 = vx_setall_f64(A4);
const v_float64 vA5 = vx_setall_f64(A5);
const v_int32 vidxmask = vx_setall_s32(EXPTAB_MASK);
bool y_aligned = (size_t)(void*)y % 32 == 0;
for( ; i < n; i += VECSZ*2 )
@@ -1024,13 +1024,13 @@ void log32f( const float *_x, float *y, int n )
#if CV_SIMD
const int VECSZ = v_float32::nlanes;
static const v_float32 vln2 = vx_setall_f32((float)ln_2);
static const v_float32 v1 = vx_setall_f32(1.f);
static const v_float32 vshift = vx_setall_f32(-1.f/512);
const v_float32 vln2 = vx_setall_f32((float)ln_2);
const v_float32 v1 = vx_setall_f32(1.f);
const v_float32 vshift = vx_setall_f32(-1.f/512);
static const v_float32 vA0 = vx_setall_f32(A0);
static const v_float32 vA1 = vx_setall_f32(A1);
static const v_float32 vA2 = vx_setall_f32(A2);
const v_float32 vA0 = vx_setall_f32(A0);
const v_float32 vA1 = vx_setall_f32(A1);
const v_float32 vA2 = vx_setall_f32(A2);
for( ; i < n; i += VECSZ )
{
@@ -1097,9 +1097,9 @@ void log64f( const double *x, double *y, int n )
#if CV_SIMD_64F
const int VECSZ = v_float64::nlanes;
static const v_float64 vln2 = vx_setall_f64(ln_2);
const v_float64 vln2 = vx_setall_f64(ln_2);
static const v_float64
const v_float64
vA0 = vx_setall_f64(A0), vA1 = vx_setall_f64(A1),
vA2 = vx_setall_f64(A2), vA3 = vx_setall_f64(A3),
vA4 = vx_setall_f64(A4), vA5 = vx_setall_f64(A5),
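The motivation for the static const → const change above is point 1 of the commit message: a function-local static const of a vector type cannot be initialized at compile time, so the compiler emits a thread-safe one-time-initialization guard and reloads the value from memory, whereas a plain local const is just a broadcast that can stay in a register. A hypothetical before/after sketch (assumes CV_SIMD; the function itself is not from the patch):

#include "opencv2/core/hal/intrin.hpp"

#if CV_SIMD
static float scale_sum_sketch(const float* x, int n)
{
    // Before the patch this would have been written as
    //     static const v_float32 vscale = vx_setall_f32(0.5f);
    // which drags in a guarded one-time initialization. The plain const below
    // is recomputed cheaply on every call and typically stays in a register:
    const cv::v_float32 vscale = cv::vx_setall_f32(0.5f);
    cv::v_float32 s = cv::vx_setall_f32(0.f);
    int i = 0;
    for( ; i <= n - cv::v_float32::nlanes; i += cv::v_float32::nlanes )
        s += vscale * cv::vx_load(x + i);
    float r = cv::v_reduce_sum(s);
    for( ; i < n; i++ )
        r += 0.5f * x[i];               // scalar tail
    return r;
}
#endif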
@@ -9,21 +9,58 @@
namespace cv { namespace hal {
#if CV_SIMD
/*
The trick with STORE_UNALIGNED/STORE_ALIGNED_NOCACHE is the following:
on IA there are instructions movntps and such to which
v_store_interleave(...., STORE_ALIGNED_NOCACHE) is mapped.
Those instructions write directly into memory without touching the cache,
which results in dramatic speed improvements, especially on
large arrays (FullHD, 4K etc.).
Those intrinsics require the destination address to be aligned
by 16/32 bytes (with SSE2 and AVX2, respectively).
So we potentially split the processing into 3 stages:
1) the optional prefix part [0:i0), where we use simple unaligned stores.
2) the optional main part [i0:len - VECSZ], where we use "nocache" mode.
But in some cases we have to use unaligned stores in this part.
3) the optional suffix part (the tail) (len - VECSZ:len), where we switch back to "unaligned" mode
to process the last few (fewer than VECSZ) remaining elements.
In principle the data can be so poorly aligned that there is no main part at all;
in that case we set i0=0 and use unaligned stores for the whole array.
*/
template<typename T, typename VecT> static void
vecmerge_( const T** src, T* dst, int len, int cn )
{
int i;
const int VECSZ = VecT::nlanes;
int i, i0 = 0;
const T* src0 = src[0];
const T* src1 = src[1];
const int VECSZ = VecT::nlanes;
int r = (int)((size_t)(void*)dst % (VECSZ*sizeof(T)));
hal::StoreMode mode = hal::STORE_ALIGNED_NOCACHE;
if( r != 0 )
{
mode = hal::STORE_UNALIGNED;
if( r % cn == 0 && len > VECSZ )
i0 = VECSZ - (r / cn);
}
if( cn == 2 )
{
for( i = 0; i < len; i += VECSZ )
{
i = std::min( len - VECSZ, i );
if( i > len - VECSZ )
{
i = len - VECSZ;
mode = hal::STORE_UNALIGNED;
}
VecT a = vx_load(src0 + i), b = vx_load(src1 + i);
v_store_interleave(dst + i*cn, a, b);
v_store_interleave(dst + i*cn, a, b, mode);
if( i < i0 )
{
i = i0 - VECSZ;
mode = hal::STORE_ALIGNED_NOCACHE;
}
}
}
else if( cn == 3 )
@@ -31,9 +68,18 @@ vecmerge_( const T** src, T* dst, int len, int cn )
const T* src2 = src[2];
for( i = 0; i < len; i += VECSZ )
{
i = std::min( len - VECSZ, i );
if( i > len - VECSZ )
{
i = len - VECSZ;
mode = hal::STORE_UNALIGNED;
}
VecT a = vx_load(src0 + i), b = vx_load(src1 + i), c = vx_load(src2 + i);
v_store_interleave(dst + i*cn, a, b, c);
v_store_interleave(dst + i*cn, a, b, c, mode);
if( i < i0 )
{
i = i0 - VECSZ;
mode = hal::STORE_ALIGNED_NOCACHE;
}
}
}
else
@@ -43,10 +89,19 @@ vecmerge_( const T** src, T* dst, int len, int cn )
const T* src3 = src[3];
for( i = 0; i < len; i += VECSZ )
{
i = std::min( len - VECSZ, i );
if( i > len - VECSZ )
{
i = len - VECSZ;
mode = hal::STORE_UNALIGNED;
}
VecT a = vx_load(src0 + i), b = vx_load(src1 + i);
VecT c = vx_load(src2 + i), d = vx_load(src3 + i);
v_store_interleave(dst + i*cn, a, b, c, d);
v_store_interleave(dst + i*cn, a, b, c, d, mode);
if( i < i0 )
{
i = i0 - VECSZ;
mode = hal::STORE_ALIGNED_NOCACHE;
}
}
}
vx_cleanup();
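To make the prefix/main/tail bookkeeping above concrete, here is a small stand-alone simulation of the index and mode sequence the loop produces for one hypothetical case (T = uchar so byte and element offsets coincide, cn = 2, VECSZ = 16, len = 100, dst misaligned by 6 bytes). The numbers are illustrative; the control flow mirrors the statements shown in the diff:

#include <cstdio>

int main()
{
    const int VECSZ = 16, cn = 2, len = 100;
    const int r = 6;                      // dst % (VECSZ*sizeof(T)), hypothetical
    int i0 = 0;
    bool nocache = (r == 0);              // STORE_ALIGNED_NOCACHE vs STORE_UNALIGNED
    if( r != 0 && r % cn == 0 && len > VECSZ )
        i0 = VECSZ - r / cn;              // = 13: first aligned output index
    for( int i = 0; i < len; i += VECSZ )
    {
        if( i > len - VECSZ )             // clamp the tail and drop alignment
        {
            i = len - VECSZ;
            nocache = false;
        }
        std::printf("store at i=%2d (%s)\n", i, nocache ? "aligned nocache" : "unaligned");
        if( i < i0 )                      // prefix done: jump so the next i is i0
        {
            i = i0 - VECSZ;
            nocache = true;
        }
    }
    return 0;
}
// Prints: i=0 unaligned, i=13,29,45,61,77 aligned nocache, i=84 unaligned.
// The overlapping stores at the boundaries rewrite the same values and are harmless.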
@@ -9,23 +9,46 @@
namespace cv { namespace hal {
#if CV_SIMD
// see the comments for vecmerge_ in merge.cpp
template<typename T, typename VecT> static void
vecsplit_( const T* src, T** dst, int len, int cn )
{
int i;
const int VECSZ = VecT::nlanes;
int i, i0 = 0;
T* dst0 = dst[0];
T* dst1 = dst[1];
const int VECSZ = VecT::nlanes;
int r0 = (int)((size_t)(void*)dst0 % (VECSZ*sizeof(T)));
int r1 = (int)((size_t)(void*)dst1 % (VECSZ*sizeof(T)));
int r2 = cn > 2 ? (int)((size_t)(void*)dst[2] % (VECSZ*sizeof(T))) : r0;
int r3 = cn > 3 ? (int)((size_t)(void*)dst[3] % (VECSZ*sizeof(T))) : r0;
hal::StoreMode mode = hal::STORE_ALIGNED_NOCACHE;
if( (r0|r1|r2|r3) != 0 )
{
mode = hal::STORE_UNALIGNED;
if( r0 == r1 && r0 == r2 && r0 == r3 && r0 % cn == 0 && len > VECSZ )
i0 = VECSZ - (r0 / cn);
}
if( cn == 2 )
{
for( i = 0; i < len; i += VECSZ )
{
i = std::min( len - VECSZ, i );
if( i > len - VECSZ )
{
i = len - VECSZ;
mode = hal::STORE_UNALIGNED;
}
VecT a, b;
v_load_deinterleave(src + i*cn, a, b);
v_store(dst0 + i, a);
v_store(dst1 + i, b);
v_store(dst0 + i, a, mode);
v_store(dst1 + i, b, mode);
if( i < i0 )
{
i = i0 - VECSZ;
mode = hal::STORE_ALIGNED_NOCACHE;
}
}
}
else if( cn == 3 )
@@ -33,12 +56,21 @@ vecsplit_( const T* src, T** dst, int len, int cn )
T* dst2 = dst[2];
for( i = 0; i < len; i += VECSZ )
{
i = std::min( len - VECSZ, i );
if( i > len - VECSZ )
{
i = len - VECSZ;
mode = hal::STORE_UNALIGNED;
}
VecT a, b, c;
v_load_deinterleave(src + i*cn, a, b, c);
v_store(dst0 + i, a);
v_store(dst1 + i, b);
v_store(dst2 + i, c);
v_store(dst0 + i, a, mode);
v_store(dst1 + i, b, mode);
v_store(dst2 + i, c, mode);
if( i < i0 )
{
i = i0 - VECSZ;
mode = hal::STORE_ALIGNED_NOCACHE;
}
}
}
else
@@ -48,13 +80,22 @@ vecsplit_( const T* src, T** dst, int len, int cn )
T* dst3 = dst[3];
for( i = 0; i < len; i += VECSZ )
{
i = std::min( len - VECSZ, i );
if( i > len - VECSZ )
{
i = len - VECSZ;
mode = hal::STORE_UNALIGNED;
}
VecT a, b, c, d;
v_load_deinterleave(src + i*cn, a, b, c, d);
v_store(dst0 + i, a);
v_store(dst1 + i, b);
v_store(dst2 + i, c);
v_store(dst3 + i, d);
v_store(dst0 + i, a, mode);
v_store(dst1 + i, b, mode);
v_store(dst2 + i, c, mode);
v_store(dst3 + i, d, mode);
if( i < i0 )
{
i = i0 - VECSZ;
mode = hal::STORE_ALIGNED_NOCACHE;
}
}
}
vx_cleanup();
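The fourfold remainder check at the top of vecsplit_ relies on a small trick: OR-ing the per-destination remainders is zero exactly when every destination pointer is aligned, and for cn < 4 the unused slots are filled with r0 so they cannot spoil the test. A minimal stand-alone sketch of that check (the helper name and the 16-byte block size are illustrative):

#include <cstddef>

static bool all_dsts_aligned_sketch(void* const dst[], int cn, size_t block = 16)
{
    size_t r0 = (size_t)dst[0] % block;
    size_t r1 = (size_t)dst[1] % block;
    size_t r2 = cn > 2 ? (size_t)dst[2] % block : r0;  // reuse r0 when there is no 3rd plane
    size_t r3 = cn > 3 ? (size_t)dst[3] % block : r0;  // reuse r0 when there is no 4th plane
    return (r0 | r1 | r2 | r3) == 0;                   // zero only if all four are zero
}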