imgproc: dispatch box_filter

5a01227a · Alexander Alekhin · ce3c92eb · 5a01227a · 5a01227a · 5a01227a
Commit 5a01227a authored Mar 09, 2019 by Alexander Alekhin
Showing with 77 additions and 1777 deletions

CMakeLists.txt modules/imgproc/CMakeLists.txt +1 -0

box_filter.dispatch.cpp modules/imgproc/src/box_filter.dispatch.cpp +30 -1277

box_filter.simd.hpp modules/imgproc/src/box_filter.simd.hpp +46 -500

No files found.
--- a/modules/imgproc/CMakeLists.txt
+++ b/modules/imgproc/CMakeLists.txt
 set(the_description "Image Processing")
 ocv_add_dispatched_file(accum SSE4_1 AVX AVX2)
 ocv_add_dispatched_file(bilateral_filter SSE2 AVX2)
+ocv_add_dispatched_file(box_filter SSE2 SSE4_1 AVX2)
 ocv_add_dispatched_file(filter SSE2 SSE4_1 AVX2)
 ocv_add_dispatched_file(color_hsv SSE2 SSE4_1 AVX2)
 ocv_add_dispatched_file(color_rgb SSE2 SSE4_1 AVX2)

--- a/modules/imgproc/src/box_filter.dispatch.cpp
+++ b/modules/imgproc/src/box_filter.dispatch.cpp
@@ -50,1119 +50,11 @@
 #include "opencv2/core/openvx/ovx_defs.hpp"
-namespace cv
+#include "box_filter.simd.hpp"
-{
+#include "box_filter.simd_declarations.hpp" // defines CV_CPU_DISPATCH_MODES_ALL=AVX2,...,BASELINE based on CMakeLists.txt content
-/****************************************************************************************\
-                                         Box Filter
-\****************************************************************************************/
-template<typename T, typename ST>
-struct RowSum :
-        public BaseRowFilter
-{
-    RowSum( int _ksize, int _anchor ) :
-        BaseRowFilter()
-    {
-        ksize = _ksize;
-        anchor = _anchor;
-    }
-    virtual void operator()(const uchar* src, uchar* dst, int width, int cn) CV_OVERRIDE
-    {
-        const T* S = (const T*)src;
-        ST* D = (ST*)dst;
-        int i = 0, k, ksz_cn = ksize*cn;
-        width = (width - 1)*cn;
-        if( ksize == 3 )
-        {
-            for( i = 0; i < width + cn; i++ )
-            {
-                D[i] = (ST)S[i] + (ST)S[i+cn] + (ST)S[i+cn*2];
-            }
-        }
-        else if( ksize == 5 )
-        {
-            for( i = 0; i < width + cn; i++ )
-            {
-                D[i] = (ST)S[i] + (ST)S[i+cn] + (ST)S[i+cn*2] + (ST)S[i + cn*3] + (ST)S[i + cn*4];
-            }
-        }
-        else if( cn == 1 )
-        {
-            ST s = 0;
-            for( i = 0; i < ksz_cn; i++ )
-                s += (ST)S[i];
-            D[0] = s;
-            for( i = 0; i < width; i++ )
-            {
-                s += (ST)S[i + ksz_cn] - (ST)S[i];
-                D[i+1] = s;
-            }
-        }
-        else if( cn == 3 )
-        {
-            ST s0 = 0, s1 = 0, s2 = 0;
-            for( i = 0; i < ksz_cn; i += 3 )
-            {
-                s0 += (ST)S[i];
-                s1 += (ST)S[i+1];
-                s2 += (ST)S[i+2];
-            }
-            D[0] = s0;
-            D[1] = s1;
-            D[2] = s2;
-            for( i = 0; i < width; i += 3 )
-            {
-                s0 += (ST)S[i + ksz_cn] - (ST)S[i];
-                s1 += (ST)S[i + ksz_cn + 1] - (ST)S[i + 1];
-                s2 += (ST)S[i + ksz_cn + 2] - (ST)S[i + 2];
-                D[i+3] = s0;
-                D[i+4] = s1;
-                D[i+5] = s2;
-            }
-        }
-        else if( cn == 4 )
-        {
-            ST s0 = 0, s1 = 0, s2 = 0, s3 = 0;
-            for( i = 0; i < ksz_cn; i += 4 )
-            {
-                s0 += (ST)S[i];
-                s1 += (ST)S[i+1];
-                s2 += (ST)S[i+2];
-                s3 += (ST)S[i+3];
-            }
-            D[0] = s0;
-            D[1] = s1;
-            D[2] = s2;
-            D[3] = s3;
-            for( i = 0; i < width; i += 4 )
-            {
-                s0 += (ST)S[i + ksz_cn] - (ST)S[i];
-                s1 += (ST)S[i + ksz_cn + 1] - (ST)S[i + 1];
-                s2 += (ST)S[i + ksz_cn + 2] - (ST)S[i + 2];
-                s3 += (ST)S[i + ksz_cn + 3] - (ST)S[i + 3];
-                D[i+4] = s0;
-                D[i+5] = s1;
-                D[i+6] = s2;
-                D[i+7] = s3;
-            }
-        }
-        else
-            for( k = 0; k < cn; k++, S++, D++ )
-            {
-                ST s = 0;
-                for( i = 0; i < ksz_cn; i += cn )
-                    s += (ST)S[i];
-                D[0] = s;
-                for( i = 0; i < width; i += cn )
-                {
-                    s += (ST)S[i + ksz_cn] - (ST)S[i];
-                    D[i+cn] = s;
-                }
-            }
-    }
-};
-template<typename ST, typename T>
-struct ColumnSum :
-        public BaseColumnFilter
-{
-    ColumnSum( int _ksize, int _anchor, double _scale ) :
-        BaseColumnFilter()
-    {
-        ksize = _ksize;
-        anchor = _anchor;
-        scale = _scale;
-        sumCount = 0;
-    }
-    virtual void reset() CV_OVERRIDE { sumCount = 0; }
-    virtual void operator()(const uchar** src, uchar* dst, int dststep, int count, int width) CV_OVERRIDE
-    {
-        int i;
-        ST* SUM;
-        bool haveScale = scale != 1;
-        double _scale = scale;
-        if( width != (int)sum.size() )
-        {
-            sum.resize(width);
-            sumCount = 0;
-        }
-        SUM = &sum[0];
-        if( sumCount == 0 )
-        {
-            memset((void*)SUM, 0, width*sizeof(ST));
-            for( ; sumCount < ksize - 1; sumCount++, src++ )
-            {
-                const ST* Sp = (const ST*)src[0];
-                for( i = 0; i < width; i++ )
-                    SUM[i] += Sp[i];
-            }
-        }
-        else
-        {
-            CV_Assert( sumCount == ksize-1 );
-            src += ksize-1;
-        }
-        for( ; count--; src++ )
-        {
-            const ST* Sp = (const ST*)src[0];
-            const ST* Sm = (const ST*)src[1-ksize];
-            T* D = (T*)dst;
-            if( haveScale )
-            {
-                for( i = 0; i <= width - 2; i += 2 )
-                {
-                    ST s0 = SUM[i] + Sp[i], s1 = SUM[i+1] + Sp[i+1];
-                    D[i] = saturate_cast<T>(s0*_scale);
-                    D[i+1] = saturate_cast<T>(s1*_scale);
-                    s0 -= Sm[i]; s1 -= Sm[i+1];
-                    SUM[i] = s0; SUM[i+1] = s1;
-                }
-                for( ; i < width; i++ )
-                {
-                    ST s0 = SUM[i] + Sp[i];
-                    D[i] = saturate_cast<T>(s0*_scale);
-                    SUM[i] = s0 - Sm[i];
-                }
-            }
-            else
-            {
-                for( i = 0; i <= width - 2; i += 2 )
-                {
-                    ST s0 = SUM[i] + Sp[i], s1 = SUM[i+1] + Sp[i+1];
-                    D[i] = saturate_cast<T>(s0);
-                    D[i+1] = saturate_cast<T>(s1);
-                    s0 -= Sm[i]; s1 -= Sm[i+1];
-                    SUM[i] = s0; SUM[i+1] = s1;
-                }
-                for( ; i < width; i++ )
-                {
-                    ST s0 = SUM[i] + Sp[i];
-                    D[i] = saturate_cast<T>(s0);
-                    SUM[i] = s0 - Sm[i];
-                }
-            }
-            dst += dststep;
-        }
-    }
-    double scale;
-    int sumCount;
-    std::vector<ST> sum;
-};
-template<>
-struct ColumnSum<int, uchar> :
-        public BaseColumnFilter
-{
-    ColumnSum( int _ksize, int _anchor, double _scale ) :
-        BaseColumnFilter()
-    {
-        ksize = _ksize;
-        anchor = _anchor;
-        scale = _scale;
-        sumCount = 0;
-    }
-    virtual void reset() CV_OVERRIDE { sumCount = 0; }
-    virtual void operator()(const uchar** src, uchar* dst, int dststep, int count, int width) CV_OVERRIDE
-    {
-        int* SUM;
-        bool haveScale = scale != 1;
-        double _scale = scale;
-        if( width != (int)sum.size() )
-        {
-            sum.resize(width);
-            sumCount = 0;
-        }
-        SUM = &sum[0];
-        if( sumCount == 0 )
-        {
-            memset((void*)SUM, 0, width*sizeof(int));
-            for( ; sumCount < ksize - 1; sumCount++, src++ )
-            {
-                const int* Sp = (const int*)src[0];
-                int i = 0;
-#if CV_SIMD
-                for (; i <= width - v_int32::nlanes; i += v_int32::nlanes)
-                {
-                    v_store(SUM + i, vx_load(SUM + i) + vx_load(Sp + i));
-                }
-#if CV_SIMD_WIDTH > 16
-                for (; i <= width - v_int32x4::nlanes; i += v_int32x4::nlanes)
-                {
-                    v_store(SUM + i, v_load(SUM + i) + v_load(Sp + i));
-                }
-#endif
-#endif
-                for( ; i < width; i++ )
-                    SUM[i] += Sp[i];
-            }
-        }
-        else
-        {
-            CV_Assert( sumCount == ksize-1 );
-            src += ksize-1;
-        }
-        for( ; count--; src++ )
-        {
-            const int* Sp = (const int*)src[0];
-            const int* Sm = (const int*)src[1-ksize];
-            uchar* D = (uchar*)dst;
-            if( haveScale )
-            {
-                int i = 0;
-#if CV_SIMD
-                v_float32 _v_scale = vx_setall_f32((float)_scale);
-                for( ; i <= width - v_uint16::nlanes; i += v_uint16::nlanes )
-                {
-                    v_int32 v_s0 = vx_load(SUM + i) + vx_load(Sp + i);
-                    v_int32 v_s01 = vx_load(SUM + i + v_int32::nlanes) + vx_load(Sp + i + v_int32::nlanes);
-                    v_uint32 v_s0d = v_reinterpret_as_u32(v_round(v_cvt_f32(v_s0) * _v_scale));
-                    v_uint32 v_s01d = v_reinterpret_as_u32(v_round(v_cvt_f32(v_s01) * _v_scale));
-                    v_uint16 v_dst = v_pack(v_s0d, v_s01d);
-                    v_pack_store(D + i, v_dst);
-                    v_store(SUM + i, v_s0 - vx_load(Sm + i));
-                    v_store(SUM + i + v_int32::nlanes, v_s01 - vx_load(Sm + i + v_int32::nlanes));
-                }
-#if CV_SIMD_WIDTH > 16
-                v_float32x4 v_scale = v_setall_f32((float)_scale);
-                for( ; i <= width-v_uint16x8::nlanes; i+=v_uint16x8::nlanes )
-                {
-                    v_int32x4 v_s0 = v_load(SUM + i) + v_load(Sp + i);
-                    v_int32x4 v_s01 = v_load(SUM + i + v_int32x4::nlanes) + v_load(Sp + i + v_int32x4::nlanes);
-                    v_uint32x4 v_s0d = v_reinterpret_as_u32(v_round(v_cvt_f32(v_s0) * v_scale));
-                    v_uint32x4 v_s01d = v_reinterpret_as_u32(v_round(v_cvt_f32(v_s01) * v_scale));
-                    v_uint16x8 v_dst = v_pack(v_s0d, v_s01d);
-                    v_pack_store(D + i, v_dst);
-                    v_store(SUM + i, v_s0 - v_load(Sm + i));
-                    v_store(SUM + i + v_int32x4::nlanes, v_s01 - v_load(Sm + i + v_int32x4::nlanes));
-            }
-#endif
-#endif
-                for( ; i < width; i++ )
-                {
-                    int s0 = SUM[i] + Sp[i];
-                    D[i] = saturate_cast<uchar>(s0*_scale);
-                    SUM[i] = s0 - Sm[i];
-                }
-            }
-            else
-            {
-                int i = 0;
-#if CV_SIMD
-                for( ; i <= width-v_uint16::nlanes; i+=v_uint16::nlanes )
-                {
-                    v_int32 v_s0 = vx_load(SUM + i) + vx_load(Sp + i);
-                    v_int32 v_s01 = vx_load(SUM + i + v_int32::nlanes) + vx_load(Sp + i + v_int32::nlanes);
-                    v_uint16 v_dst = v_pack(v_reinterpret_as_u32(v_s0), v_reinterpret_as_u32(v_s01));
-                    v_pack_store(D + i, v_dst);
-                    v_store(SUM + i, v_s0 - vx_load(Sm + i));
-                    v_store(SUM + i + v_int32::nlanes, v_s01 - vx_load(Sm + i + v_int32::nlanes));
-                }
-#if CV_SIMD_WIDTH > 16
-                for( ; i <= width-v_uint16x8::nlanes; i+=v_uint16x8::nlanes )
-                {
-                    v_int32x4 v_s0 = v_load(SUM + i) + v_load(Sp + i);
-                    v_int32x4 v_s01 = v_load(SUM + i + v_int32x4::nlanes) + v_load(Sp + i + v_int32x4::nlanes);
-                    v_uint16x8 v_dst = v_pack(v_reinterpret_as_u32(v_s0), v_reinterpret_as_u32(v_s01));
-                    v_pack_store(D + i, v_dst);
-                    v_store(SUM + i, v_s0 - v_load(Sm + i));
-                    v_store(SUM + i + v_int32x4::nlanes, v_s01 - v_load(Sm + i + v_int32x4::nlanes));
-                }
-#endif
-#endif
-                for( ; i < width; i++ )
-                {
-                    int s0 = SUM[i] + Sp[i];
-                    D[i] = saturate_cast<uchar>(s0);
-                    SUM[i] = s0 - Sm[i];
-                }
-            }
-            dst += dststep;
-        }
-#if CV_SIMD
-        vx_cleanup();
-#endif
-    }
-    double scale;
-    int sumCount;
-    std::vector<int> sum;
-};
-template<>
-struct ColumnSum<ushort, uchar> :
-public BaseColumnFilter
-{
-    enum { SHIFT = 23 };
-    ColumnSum( int _ksize, int _anchor, double _scale ) :
-    BaseColumnFilter()
-    {
-        ksize = _ksize;
-        anchor = _anchor;
-        scale = _scale;
-        sumCount = 0;
-        divDelta = 0;
-        divScale = 1;
-        if( scale != 1 )
-        {
-            int d = cvRound(1./scale);
-            double scalef = ((double)(1 << SHIFT))/d;
-            divScale = cvFloor(scalef);
-            scalef -= divScale;
-            divDelta = d/2;
-            if( scalef < 0.5 )
-                divDelta++;
-            else
-                divScale++;
-        }
-    }
-    virtual void reset() CV_OVERRIDE { sumCount = 0; }
-    virtual void operator()(const uchar** src, uchar* dst, int dststep, int count, int width) CV_OVERRIDE
-    {
-        const int ds = divScale;
-        const int dd = divDelta;
-        ushort* SUM;
-        const bool haveScale = scale != 1;
-        if( width != (int)sum.size() )
-        {
-            sum.resize(width);
-            sumCount = 0;
-        }
-        SUM = &sum[0];
-        if( sumCount == 0 )
-        {
-            memset((void*)SUM, 0, width*sizeof(SUM[0]));
-            for( ; sumCount < ksize - 1; sumCount++, src++ )
-            {
-                const ushort* Sp = (const ushort*)src[0];
-                int i = 0;
-#if CV_SIMD
-                for( ; i <= width - v_uint16::nlanes; i += v_uint16::nlanes )
-                {
-                    v_store(SUM + i, vx_load(SUM + i) + vx_load(Sp + i));
-                }
-#if CV_SIMD_WIDTH > 16
-                for( ; i <= width - v_uint16x8::nlanes; i += v_uint16x8::nlanes )
-                {
-                    v_store(SUM + i, v_load(SUM + i) + v_load(Sp + i));
-                }
-#endif
-#endif
-                for( ; i < width; i++ )
-                    SUM[i] += Sp[i];
-            }
-        }
-        else
-        {
-            CV_Assert( sumCount == ksize-1 );
-            src += ksize-1;
-        }
-        for( ; count--; src++ )
-        {
-            const ushort* Sp = (const ushort*)src[0];
-            const ushort* Sm = (const ushort*)src[1-ksize];
-            uchar* D = (uchar*)dst;
-            if( haveScale )
-            {
-                int i = 0;
-#if CV_SIMD
-                v_uint32 _ds4 = vx_setall_u32((unsigned)ds);
-                v_uint16 _dd8 = vx_setall_u16((ushort)dd);
-                for( ; i <= width-v_uint8::nlanes; i+=v_uint8::nlanes )
-                {
-                    v_uint16 _sm0 = vx_load(Sm + i);
-                    v_uint16 _sm1 = vx_load(Sm + i + v_uint16::nlanes);
-                    v_uint16 _s0 = v_add_wrap(vx_load(SUM + i), vx_load(Sp + i));
-                    v_uint16 _s1 = v_add_wrap(vx_load(SUM + i + v_uint16::nlanes), vx_load(Sp + i + v_uint16::nlanes));
-                    v_uint32 _s00, _s01, _s10, _s11;
-                    v_expand(_s0 + _dd8, _s00, _s01);
-                    v_expand(_s1 + _dd8, _s10, _s11);
-                    _s00 = v_shr<SHIFT>(_s00*_ds4);
-                    _s01 = v_shr<SHIFT>(_s01*_ds4);
-                    _s10 = v_shr<SHIFT>(_s10*_ds4);
-                    _s11 = v_shr<SHIFT>(_s11*_ds4);
-                    v_int16 r0 = v_pack(v_reinterpret_as_s32(_s00), v_reinterpret_as_s32(_s01));
-                    v_int16 r1 = v_pack(v_reinterpret_as_s32(_s10), v_reinterpret_as_s32(_s11));
-                    _s0 = v_sub_wrap(_s0, _sm0);
-                    _s1 = v_sub_wrap(_s1, _sm1);
-                    v_store(D + i, v_pack_u(r0, r1));
-                    v_store(SUM + i, _s0);
-                    v_store(SUM + i + v_uint16::nlanes, _s1);
-                }
-#if CV_SIMD_WIDTH > 16
-                v_uint32x4 ds4 = v_setall_u32((unsigned)ds);
-                v_uint16x8 dd8 = v_setall_u16((ushort)dd);
-                for( ; i <= width-v_uint8x16::nlanes; i+=v_uint8x16::nlanes )
-                {
-                    v_uint16x8 _sm0 = v_load(Sm + i);
-                    v_uint16x8 _sm1 = v_load(Sm + i + v_uint16x8::nlanes);
-                    v_uint16x8 _s0 = v_add_wrap(v_load(SUM + i), v_load(Sp + i));
-                    v_uint16x8 _s1 = v_add_wrap(v_load(SUM + i + v_uint16x8::nlanes), v_load(Sp + i + v_uint16x8::nlanes));
-                    v_uint32x4 _s00, _s01, _s10, _s11;
-                    v_expand(_s0 + dd8, _s00, _s01);
-                    v_expand(_s1 + dd8, _s10, _s11);
-                    _s00 = v_shr<SHIFT>(_s00*ds4);
-                    _s01 = v_shr<SHIFT>(_s01*ds4);
-                    _s10 = v_shr<SHIFT>(_s10*ds4);
-                    _s11 = v_shr<SHIFT>(_s11*ds4);
-                    v_int16x8 r0 = v_pack(v_reinterpret_as_s32(_s00), v_reinterpret_as_s32(_s01));
-                    v_int16x8 r1 = v_pack(v_reinterpret_as_s32(_s10), v_reinterpret_as_s32(_s11));
-                    _s0 = v_sub_wrap(_s0, _sm0);
-                    _s1 = v_sub_wrap(_s1, _sm1);
-                    v_store(D + i, v_pack_u(r0, r1));
-                    v_store(SUM + i, _s0);
-                    v_store(SUM + i + v_uint16x8::nlanes, _s1);
-                }
-#endif
-#endif
-                for( ; i < width; i++ )
-                {
-                    int s0 = SUM[i] + Sp[i];
-                    D[i] = (uchar)((s0 + dd)*ds >> SHIFT);
-                    SUM[i] = (ushort)(s0 - Sm[i]);
-                }
-            }
-            else
-            {
-                int i = 0;
-                for( ; i < width; i++ )
-                {
-                    int s0 = SUM[i] + Sp[i];
-                    D[i] = saturate_cast<uchar>(s0);
-                    SUM[i] = (ushort)(s0 - Sm[i]);
-                }
-            }
-            dst += dststep;
-        }
-#if CV_SIMD
-        vx_cleanup();
-#endif
-    }
-    double scale;
-    int sumCount;
-    int divDelta;
-    int divScale;
-    std::vector<ushort> sum;
-};
-template<>
-struct ColumnSum<int, short> :
-        public BaseColumnFilter
-{
-    ColumnSum( int _ksize, int _anchor, double _scale ) :
-        BaseColumnFilter()
-    {
-        ksize = _ksize;
-        anchor = _anchor;
-        scale = _scale;
-        sumCount = 0;
-    }
-    virtual void reset() CV_OVERRIDE { sumCount = 0; }
-    virtual void operator()(const uchar** src, uchar* dst, int dststep, int count, int width) CV_OVERRIDE
-    {
-        int i;
-        int* SUM;
-        bool haveScale = scale != 1;
-        double _scale = scale;
-        if( width != (int)sum.size() )
-        {
-            sum.resize(width);
-            sumCount = 0;
-        }
-        SUM = &sum[0];
-        if( sumCount == 0 )
-        {
-            memset((void*)SUM, 0, width*sizeof(int));
-            for( ; sumCount < ksize - 1; sumCount++, src++ )
-            {
-                const int* Sp = (const int*)src[0];
-                i = 0;
-#if CV_SIMD
-                for( ; i <= width - v_int32::nlanes; i+=v_int32::nlanes )
-                {
-                    v_store(SUM + i, vx_load(SUM + i) + vx_load(Sp + i));
-                }
-#if CV_SIMD_WIDTH > 16
-                for( ; i <= width - v_int32x4::nlanes; i+=v_int32x4::nlanes )
-                {
-                    v_store(SUM + i, v_load(SUM + i) + v_load(Sp + i));
-                }
-#endif
-#endif
-                for( ; i < width; i++ )
-                    SUM[i] += Sp[i];
-            }
-        }
-        else
-        {
-            CV_Assert( sumCount == ksize-1 );
-            src += ksize-1;
-        }
-        for( ; count--; src++ )
-        {
-            const int* Sp = (const int*)src[0];
-            const int* Sm = (const int*)src[1-ksize];
-            short* D = (short*)dst;
-            if( haveScale )
-            {
-                i = 0;
-#if CV_SIMD
-                v_float32 _v_scale = vx_setall_f32((float)_scale);
-                for( ; i <= width-v_int16::nlanes; i+=v_int16::nlanes )
-                {
-                    v_int32 v_s0 = vx_load(SUM + i) + vx_load(Sp + i);
-                    v_int32 v_s01 = vx_load(SUM + i + v_int32::nlanes) + vx_load(Sp + i + v_int32::nlanes);
-                    v_int32 v_s0d =  v_round(v_cvt_f32(v_s0) * _v_scale);
-                    v_int32 v_s01d = v_round(v_cvt_f32(v_s01) * _v_scale);
-                    v_store(D + i, v_pack(v_s0d, v_s01d));
-                    v_store(SUM + i, v_s0 - vx_load(Sm + i));
-                    v_store(SUM + i + v_int32::nlanes, v_s01 - vx_load(Sm + i + v_int32::nlanes));
-                }
-#if CV_SIMD_WIDTH > 16
-                v_float32x4 v_scale = v_setall_f32((float)_scale);
-                for( ; i <= width-v_int16x8::nlanes; i+=v_int16x8::nlanes )
-                {
-                    v_int32x4 v_s0 = v_load(SUM + i) + v_load(Sp + i);
-                    v_int32x4 v_s01 = v_load(SUM + i + v_int32x4::nlanes) + v_load(Sp + i + v_int32x4::nlanes);
-                    v_int32x4 v_s0d =  v_round(v_cvt_f32(v_s0) * v_scale);
-                    v_int32x4 v_s01d = v_round(v_cvt_f32(v_s01) * v_scale);
-                    v_store(D + i, v_pack(v_s0d, v_s01d));
-                    v_store(SUM + i, v_s0 - v_load(Sm + i));
-                    v_store(SUM + i + v_int32x4::nlanes, v_s01 - v_load(Sm + i + v_int32x4::nlanes));
-                }
-#endif
-#endif
-                for( ; i < width; i++ )
-                {
-                    int s0 = SUM[i] + Sp[i];
-                    D[i] = saturate_cast<short>(s0*_scale);
-                    SUM[i] = s0 - Sm[i];
-                }
-            }
-            else
-            {
-                i = 0;
-#if CV_SIMD
-                for( ; i <= width-v_int16::nlanes; i+=v_int16::nlanes )
-                {
-                    v_int32 v_s0 = vx_load(SUM + i) + vx_load(Sp + i);
-                    v_int32 v_s01 = vx_load(SUM + i + v_int32::nlanes) + vx_load(Sp + i + v_int32::nlanes);
-                    v_store(D + i, v_pack(v_s0, v_s01));
-                    v_store(SUM + i, v_s0 - vx_load(Sm + i));
-                    v_store(SUM + i + v_int32::nlanes, v_s01 - vx_load(Sm + i + v_int32::nlanes));
-                }
-#if CV_SIMD_WIDTH > 16
-                for( ; i <= width-v_int16x8::nlanes; i+=v_int16x8::nlanes )
-                {
-                    v_int32x4 v_s0 = v_load(SUM + i) + v_load(Sp + i);
-                    v_int32x4 v_s01 = v_load(SUM + i + v_int32x4::nlanes) + v_load(Sp + i + v_int32x4::nlanes);
-                    v_store(D + i, v_pack(v_s0, v_s01));
-                    v_store(SUM + i, v_s0 - v_load(Sm + i));
-                    v_store(SUM + i + v_int32x4::nlanes, v_s01 - v_load(Sm + i + v_int32x4::nlanes));
-                }
-#endif
-#endif
-                for( ; i < width; i++ )
-                {
-                    int s0 = SUM[i] + Sp[i];
-                    D[i] = saturate_cast<short>(s0);
-                    SUM[i] = s0 - Sm[i];
-                }
-            }
-            dst += dststep;
-        }
-#if CV_SIMD
-        vx_cleanup();
-#endif
-    }
-    double scale;
-    int sumCount;
-    std::vector<int> sum;
-};
-template<>
-struct ColumnSum<int, ushort> :
-        public BaseColumnFilter
-{
-    ColumnSum( int _ksize, int _anchor, double _scale ) :
-        BaseColumnFilter()
-    {
-        ksize = _ksize;
-        anchor = _anchor;
-        scale = _scale;
-        sumCount = 0;
-    }
-    virtual void reset() CV_OVERRIDE { sumCount = 0; }
-    virtual void operator()(const uchar** src, uchar* dst, int dststep, int count, int width) CV_OVERRIDE
-    {
-        int* SUM;
-        bool haveScale = scale != 1;
-        double _scale = scale;
-        if( width != (int)sum.size() )
-        {
-            sum.resize(width);
-            sumCount = 0;
-        }
-        SUM = &sum[0];
-        if( sumCount == 0 )
-        {
-            memset((void*)SUM, 0, width*sizeof(int));
-            for( ; sumCount < ksize - 1; sumCount++, src++ )
-            {
-                const int* Sp = (const int*)src[0];
-                int i = 0;
-#if CV_SIMD
-                for (; i <= width - v_int32::nlanes; i += v_int32::nlanes)
-                {
-                    v_store(SUM + i, vx_load(SUM + i) + vx_load(Sp + i));
-                }
-#if CV_SIMD_WIDTH > 16
-                for (; i <= width - v_int32x4::nlanes; i += v_int32x4::nlanes)
-                {
-                    v_store(SUM + i, v_load(SUM + i) + v_load(Sp + i));
-                }
-#endif
-#endif
-                for( ; i < width; i++ )
-                    SUM[i] += Sp[i];
-            }
-        }
-        else
-        {
-            CV_Assert( sumCount == ksize-1 );
-            src += ksize-1;
-        }
-        for( ; count--; src++ )
-        {
-            const int* Sp = (const int*)src[0];
-            const int* Sm = (const int*)src[1-ksize];
-            ushort* D = (ushort*)dst;
-            if( haveScale )
-            {
-                int i = 0;
-#if CV_SIMD
-                v_float32 _v_scale = vx_setall_f32((float)_scale);
-                for( ; i <= width-v_uint16::nlanes; i+=v_uint16::nlanes )
-                {
-                    v_int32 v_s0 = vx_load(SUM + i) + vx_load(Sp + i);
-                    v_int32 v_s01 = vx_load(SUM + i + v_int32::nlanes) + vx_load(Sp + i + v_int32::nlanes);
-                    v_uint32 v_s0d = v_reinterpret_as_u32(v_round(v_cvt_f32(v_s0) * _v_scale));
-                    v_uint32 v_s01d = v_reinterpret_as_u32(v_round(v_cvt_f32(v_s01) * _v_scale));
-                    v_store(D + i, v_pack(v_s0d, v_s01d));
-                    v_store(SUM + i, v_s0 - vx_load(Sm + i));
-                    v_store(SUM + i + v_int32::nlanes, v_s01 - vx_load(Sm + i + v_int32::nlanes));
-                }
-#if CV_SIMD_WIDTH > 16
-                v_float32x4 v_scale = v_setall_f32((float)_scale);
-                for( ; i <= width-v_uint16x8::nlanes; i+=v_uint16x8::nlanes )
-                {
-                    v_int32x4 v_s0 = v_load(SUM + i) + v_load(Sp + i);
-                    v_int32x4 v_s01 = v_load(SUM + i + v_int32x4::nlanes) + v_load(Sp + i + v_int32x4::nlanes);
-                    v_uint32x4 v_s0d = v_reinterpret_as_u32(v_round(v_cvt_f32(v_s0) * v_scale));
-                    v_uint32x4 v_s01d = v_reinterpret_as_u32(v_round(v_cvt_f32(v_s01) * v_scale));
-                    v_store(D + i, v_pack(v_s0d, v_s01d));
-                    v_store(SUM + i, v_s0 - v_load(Sm + i));
-                    v_store(SUM + i + v_int32x4::nlanes, v_s01 - v_load(Sm + i + v_int32x4::nlanes));
-                }
-#endif
-#endif
-                for( ; i < width; i++ )
-                {
-                    int s0 = SUM[i] + Sp[i];
-                    D[i] = saturate_cast<ushort>(s0*_scale);
-                    SUM[i] = s0 - Sm[i];
-                }
-            }
-            else
-            {
-                int i = 0;
-#if CV_SIMD
-                for( ; i <= width-v_uint16::nlanes; i+=v_uint16::nlanes )
-                {
-                    v_int32 v_s0 = vx_load(SUM + i) + vx_load(Sp + i);
-                    v_int32 v_s01 = vx_load(SUM + i + v_int32::nlanes) + vx_load(Sp + i + v_int32::nlanes);
-                    v_store(D + i, v_pack(v_reinterpret_as_u32(v_s0), v_reinterpret_as_u32(v_s01)));
-                    v_store(SUM + i, v_s0 - vx_load(Sm + i));
-                    v_store(SUM + i + v_int32::nlanes, v_s01 - vx_load(Sm + i + v_int32::nlanes));
-                }
-#if CV_SIMD_WIDTH > 16
-                for( ; i <= width-v_uint16x8::nlanes; i+=v_uint16x8::nlanes )
-                {
-                    v_int32x4 v_s0 = v_load(SUM + i) + v_load(Sp + i);
-                    v_int32x4 v_s01 = v_load(SUM + i + v_int32x4::nlanes) + v_load(Sp + i + v_int32x4::nlanes);
-                    v_store(D + i, v_pack(v_reinterpret_as_u32(v_s0), v_reinterpret_as_u32(v_s01)));
-                    v_store(SUM + i, v_s0 - v_load(Sm + i));
-                    v_store(SUM + i + v_int32x4::nlanes, v_s01 - v_load(Sm + i + v_int32x4::nlanes));
-                }
-#endif
-#endif
-                for( ; i < width; i++ )
-                {
-                    int s0 = SUM[i] + Sp[i];
-                    D[i] = saturate_cast<ushort>(s0);
-                    SUM[i] = s0 - Sm[i];
-                }
-            }
-            dst += dststep;
-        }
-#if CV_SIMD
-        vx_cleanup();
-#endif
-    }
-    double scale;
-    int sumCount;
-    std::vector<int> sum;
-};
-template<>
-struct ColumnSum<int, int> :
-        public BaseColumnFilter
-{
-    ColumnSum( int _ksize, int _anchor, double _scale ) :
-        BaseColumnFilter()
-    {
-        ksize = _ksize;
-        anchor = _anchor;
-        scale = _scale;
-        sumCount = 0;
-    }
-    virtual void reset() CV_OVERRIDE { sumCount = 0; }
-    virtual void operator()(const uchar** src, uchar* dst, int dststep, int count, int width) CV_OVERRIDE
-    {
-        int* SUM;
-        bool haveScale = scale != 1;
-        double _scale = scale;
-        if( width != (int)sum.size() )
-        {
-            sum.resize(width);
-            sumCount = 0;
-        }
-        SUM = &sum[0];
-        if( sumCount == 0 )
-        {
-            memset((void*)SUM, 0, width*sizeof(int));
-            for( ; sumCount < ksize - 1; sumCount++, src++ )
-            {
-                const int* Sp = (const int*)src[0];
-                int i = 0;
-#if CV_SIMD
-                for( ; i <= width - v_int32::nlanes; i+=v_int32::nlanes )
-                {
-                    v_store(SUM + i, vx_load(SUM + i) + vx_load(Sp + i));
-                }
-#if CV_SIMD_WIDTH > 16
-                for( ; i <= width - v_int32x4::nlanes; i+=v_int32x4::nlanes )
-                {
-                    v_store(SUM + i, v_load(SUM + i) + v_load(Sp + i));
-                }
-#endif
-#endif
-                for( ; i < width; i++ )
-                    SUM[i] += Sp[i];
-            }
-        }
-        else
-        {
-            CV_Assert( sumCount == ksize-1 );
-            src += ksize-1;
-        }
-        for( ; count--; src++ )
-        {
-            const int* Sp = (const int*)src[0];
-            const int* Sm = (const int*)src[1-ksize];
-            int* D = (int*)dst;
-            if( haveScale )
-            {
-                int i = 0;
-#if CV_SIMD
-                v_float32 _v_scale = vx_setall_f32((float)_scale);
-                for( ; i <= width-v_int32::nlanes; i+=v_int32::nlanes )
-                {
-                    v_int32 v_s0 = vx_load(SUM + i) + vx_load(Sp + i);
-                    v_int32 v_s0d = v_round(v_cvt_f32(v_s0) * _v_scale);
-                    v_store(D + i, v_s0d);
-                    v_store(SUM + i, v_s0 - vx_load(Sm + i));
-                }
-#if CV_SIMD_WIDTH > 16
-                v_float32x4 v_scale = v_setall_f32((float)_scale);
-                for( ; i <= width-v_int32x4::nlanes; i+=v_int32x4::nlanes )
-                {
-                    v_int32x4 v_s0 = v_load(SUM + i) + v_load(Sp + i);
-                    v_int32x4 v_s0d = v_round(v_cvt_f32(v_s0) * v_scale);
-                    v_store(D + i, v_s0d);
-                    v_store(SUM + i, v_s0 - v_load(Sm + i));
-                }
-#endif
-#endif
-                for( ; i < width; i++ )
-                {
-                    int s0 = SUM[i] + Sp[i];
-                    D[i] = saturate_cast<int>(s0*_scale);
-                    SUM[i] = s0 - Sm[i];
-                }
-            }
-            else
-            {
-                int i = 0;
-#if CV_SIMD
-                for( ; i <= width-v_int32::nlanes; i+=v_int32::nlanes )
-                {
-                    v_int32 v_s0 = vx_load(SUM + i) + vx_load(Sp + i);
-                    v_store(D + i, v_s0);
-                    v_store(SUM + i, v_s0 - vx_load(Sm + i));
-                }
-#if CV_SIMD_WIDTH > 16
-                for( ; i <= width-v_int32x4::nlanes; i+=v_int32x4::nlanes )
-                {
-                    v_int32x4 v_s0 = v_load(SUM + i) + v_load(Sp + i);
-                    v_store(D + i, v_s0);
-                    v_store(SUM + i, v_s0 - v_load(Sm + i));
-                }
-#endif
-#endif
-                for( ; i < width; i++ )
-                {
-                    int s0 = SUM[i] + Sp[i];
-                    D[i] = s0;
-                    SUM[i] = s0 - Sm[i];
-                }
-            }
-            dst += dststep;
-        }
-#if CV_SIMD
-        vx_cleanup();
-#endif
-    }
-    double scale;
-    int sumCount;
-    std::vector<int> sum;
-};
-template<>
-struct ColumnSum<int, float> :
-        public BaseColumnFilter
-{
-    ColumnSum( int _ksize, int _anchor, double _scale ) :
-        BaseColumnFilter()
-    {
-        ksize = _ksize;
-        anchor = _anchor;
-        scale = _scale;
-        sumCount = 0;
-    }
-    virtual void reset() CV_OVERRIDE { sumCount = 0; }
-    virtual void operator()(const uchar** src, uchar* dst, int dststep, int count, int width) CV_OVERRIDE
-    {
-        int* SUM;
-        bool haveScale = scale != 1;
-        double _scale = scale;
-        if( width != (int)sum.size() )
-        {
-            sum.resize(width);
-            sumCount = 0;
-        }
-        SUM = &sum[0];
-        if( sumCount == 0 )
-        {
-            memset((void*)SUM, 0, width*sizeof(int));
-            for( ; sumCount < ksize - 1; sumCount++, src++ )
-            {
-                const int* Sp = (const int*)src[0];
-                int i = 0;
-#if CV_SIMD
-                for( ; i <= width - v_int32::nlanes; i+=v_int32::nlanes )
-                {
-                    v_store(SUM + i, vx_load(SUM + i) + vx_load(Sp + i));
-                }
-#if CV_SIMD_WIDTH > 16
-                for( ; i <= width - v_int32x4::nlanes; i+=v_int32x4::nlanes )
-                {
-                    v_store(SUM + i, v_load(SUM + i) + v_load(Sp + i));
-                }
-#endif
-#endif
-                for( ; i < width; i++ )
-                    SUM[i] += Sp[i];
-            }
-        }
-        else
-        {
-            CV_Assert( sumCount == ksize-1 );
-            src += ksize-1;
-        }
-        for( ; count--; src++ )
-        {
-            const int * Sp = (const int*)src[0];
-            const int * Sm = (const int*)src[1-ksize];
-            float* D = (float*)dst;
-            if( haveScale )
-            {
-                int i = 0;
-#if CV_SIMD
-                v_float32 _v_scale = vx_setall_f32((float)_scale);
-                for (; i <= width - v_int32::nlanes; i += v_int32::nlanes)
-                {
-                    v_int32 v_s0 = vx_load(SUM + i) + vx_load(Sp + i);
-                    v_store(D + i, v_cvt_f32(v_s0) * _v_scale);
-                    v_store(SUM + i, v_s0 - vx_load(Sm + i));
-                }
-#if CV_SIMD_WIDTH > 16
-                v_float32x4 v_scale = v_setall_f32((float)_scale);
-                for (; i <= width - v_int32x4::nlanes; i += v_int32x4::nlanes)
-                {
-                    v_int32x4 v_s0 = v_load(SUM + i) + v_load(Sp + i);
-                    v_store(D + i, v_cvt_f32(v_s0) * v_scale);
-                    v_store(SUM + i, v_s0 - v_load(Sm + i));
-                }
-#endif
-#endif
-                for( ; i < width; i++ )
-                {
-                    int s0 = SUM[i] + Sp[i];
-                    D[i] = (float)(s0*_scale);
-                    SUM[i] = s0 - Sm[i];
-                }
-            }
-            else
-            {
-                int i = 0;
-#if CV_SIMD
-                for( ; i <= width-v_int32::nlanes; i+=v_int32::nlanes )
-                {
-                    v_int32 v_s0 = vx_load(SUM + i) + vx_load(Sp + i);
-                    v_store(D + i, v_cvt_f32(v_s0));
-                    v_store(SUM + i, v_s0 - vx_load(Sm + i));
-                }
-#if CV_SIMD_WIDTH > 16
-                for( ; i <= width-v_int32x4::nlanes; i+=v_int32x4::nlanes )
-                {
-                    v_int32x4 v_s0 = v_load(SUM + i) + v_load(Sp + i);
-                    v_store(D + i, v_cvt_f32(v_s0));
-                    v_store(SUM + i, v_s0 - v_load(Sm + i));
-                }
-#endif
-#endif
-                for( ; i < width; i++ )
-                {
-                    int s0 = SUM[i] + Sp[i];
-                    D[i] = (float)(s0);
-                    SUM[i] = s0 - Sm[i];
-                }
-            }
-            dst += dststep;
-        }
-#if CV_SIMD
-        vx_cleanup();
-#endif
-    }
-    double scale;
+namespace cv {
-    int sumCount;
-    std::vector<int> sum;
-};
 #ifdef HAVE_OPENCL
@@ -1396,109 +288,34 @@ static bool ocl_boxFilter( InputArray _src, OutputArray _dst, int ddepth,
 #endif
-}
+Ptr<BaseRowFilter> getRowSumFilter(int srcType, int sumType, int ksize, int anchor)
-cv::Ptr<cv::BaseRowFilter> cv::getRowSumFilter(int srcType, int sumType, int ksize, int anchor)
 {
-    int sdepth = CV_MAT_DEPTH(srcType), ddepth = CV_MAT_DEPTH(sumType);
+    CV_INSTRUMENT_REGION();
-    CV_Assert( CV_MAT_CN(sumType) == CV_MAT_CN(srcType) );
+    CV_CPU_DISPATCH(getRowSumFilter, (srcType, sumType, ksize, anchor),
-    if( anchor < 0 )
+        CV_CPU_DISPATCH_MODES_ALL);
-        anchor = ksize/2;
-    if( sdepth == CV_8U && ddepth == CV_32S )
-        return makePtr<RowSum<uchar, int> >(ksize, anchor);
-    if( sdepth == CV_8U && ddepth == CV_16U )
-        return makePtr<RowSum<uchar, ushort> >(ksize, anchor);
-    if( sdepth == CV_8U && ddepth == CV_64F )
-        return makePtr<RowSum<uchar, double> >(ksize, anchor);
-    if( sdepth == CV_16U && ddepth == CV_32S )
-        return makePtr<RowSum<ushort, int> >(ksize, anchor);
-    if( sdepth == CV_16U && ddepth == CV_64F )
-        return makePtr<RowSum<ushort, double> >(ksize, anchor);
-    if( sdepth == CV_16S && ddepth == CV_32S )
-        return makePtr<RowSum<short, int> >(ksize, anchor);
-    if( sdepth == CV_32S && ddepth == CV_32S )
-        return makePtr<RowSum<int, int> >(ksize, anchor);
-    if( sdepth == CV_16S && ddepth == CV_64F )
-        return makePtr<RowSum<short, double> >(ksize, anchor);
-    if( sdepth == CV_32F && ddepth == CV_64F )
-        return makePtr<RowSum<float, double> >(ksize, anchor);
-    if( sdepth == CV_64F && ddepth == CV_64F )
-        return makePtr<RowSum<double, double> >(ksize, anchor);
-    CV_Error_( CV_StsNotImplemented,
-        ("Unsupported combination of source format (=%d), and buffer format (=%d)",
-        srcType, sumType));
 }
-cv::Ptr<cv::BaseColumnFilter> cv::getColumnSumFilter(int sumType, int dstType, int ksize,
+Ptr<BaseColumnFilter> getColumnSumFilter(int sumType, int dstType, int ksize, int anchor, double scale)
-                                                     int anchor, double scale)
 {
-    int sdepth = CV_MAT_DEPTH(sumType), ddepth = CV_MAT_DEPTH(dstType);
+    CV_INSTRUMENT_REGION();
-    CV_Assert( CV_MAT_CN(sumType) == CV_MAT_CN(dstType) );
+    CV_CPU_DISPATCH(getColumnSumFilter, (sumType, dstType, ksize, anchor, scale),
-    if( anchor < 0 )
+        CV_CPU_DISPATCH_MODES_ALL);
-        anchor = ksize/2;
-    if( ddepth == CV_8U && sdepth == CV_32S )
-        return makePtr<ColumnSum<int, uchar> >(ksize, anchor, scale);
-    if( ddepth == CV_8U && sdepth == CV_16U )
-        return makePtr<ColumnSum<ushort, uchar> >(ksize, anchor, scale);
-    if( ddepth == CV_8U && sdepth == CV_64F )
-        return makePtr<ColumnSum<double, uchar> >(ksize, anchor, scale);
-    if( ddepth == CV_16U && sdepth == CV_32S )
-        return makePtr<ColumnSum<int, ushort> >(ksize, anchor, scale);
-    if( ddepth == CV_16U && sdepth == CV_64F )
-        return makePtr<ColumnSum<double, ushort> >(ksize, anchor, scale);
-    if( ddepth == CV_16S && sdepth == CV_32S )
-        return makePtr<ColumnSum<int, short> >(ksize, anchor, scale);
-    if( ddepth == CV_16S && sdepth == CV_64F )
-        return makePtr<ColumnSum<double, short> >(ksize, anchor, scale);
-    if( ddepth == CV_32S && sdepth == CV_32S )
-        return makePtr<ColumnSum<int, int> >(ksize, anchor, scale);
-    if( ddepth == CV_32F && sdepth == CV_32S )
-        return makePtr<ColumnSum<int, float> >(ksize, anchor, scale);
-    if( ddepth == CV_32F && sdepth == CV_64F )
-        return makePtr<ColumnSum<double, float> >(ksize, anchor, scale);
-    if( ddepth == CV_64F && sdepth == CV_32S )
-        return makePtr<ColumnSum<int, double> >(ksize, anchor, scale);
-    if( ddepth == CV_64F && sdepth == CV_64F )
-        return makePtr<ColumnSum<double, double> >(ksize, anchor, scale);
-    CV_Error_( CV_StsNotImplemented,
-        ("Unsupported combination of sum format (=%d), and destination format (=%d)",
-        sumType, dstType));
 }
-cv::Ptr<cv::FilterEngine> cv::createBoxFilter( int srcType, int dstType, Size ksize,
+Ptr<FilterEngine> createBoxFilter(int srcType, int dstType, Size ksize,
-                    Point anchor, bool normalize, int borderType )
+                                  Point anchor, bool normalize, int borderType)
 {
-    int sdepth = CV_MAT_DEPTH(srcType);
+    CV_INSTRUMENT_REGION();
-    int cn = CV_MAT_CN(srcType), sumType = CV_64F;
-    if( sdepth == CV_8U && CV_MAT_DEPTH(dstType) == CV_8U &&
-        ksize.width*ksize.height <= 256 )
-        sumType = CV_16U;
-    else if( sdepth <= CV_32S && (!normalize ||
-        ksize.width*ksize.height <= (sdepth == CV_8U ? (1<<23) :
-            sdepth == CV_16U ? (1 << 15) : (1 << 16))) )
-        sumType = CV_32S;
-    sumType = CV_MAKETYPE( sumType, cn );
-    Ptr<BaseRowFilter> rowFilter = getRowSumFilter(srcType, sumType, ksize.width, anchor.x );
-    Ptr<BaseColumnFilter> columnFilter = getColumnSumFilter(sumType,
-        dstType, ksize.height, anchor.y, normalize ? 1./(ksize.width*ksize.height) : 1);
-    return makePtr<FilterEngine>(Ptr<BaseFilter>(), rowFilter, columnFilter,
+    CV_CPU_DISPATCH(createBoxFilter, (srcType, dstType, ksize, anchor, normalize, borderType),
-           srcType, dstType, sumType, borderType );
+        CV_CPU_DISPATCH_MODES_ALL);
 }
 #ifdef HAVE_OPENVX
-namespace cv
-{
    namespace ovx {
        template <> inline bool skipSmallImages<VX_KERNEL_BOX_3x3>(int w, int h) { return w*h < 640 * 480; }
    }
@@ -1570,12 +387,9 @@ namespace cv
        return true;
    }
-}
 #endif
 #if defined(HAVE_IPP)
-namespace cv
-{
 static bool ipp_boxfilter(Mat &src, Mat &dst, Size ksize, Point anchor, bool normalize, int borderType)
 {
 #ifdef HAVE_IPP_IW
@@ -1620,13 +434,12 @@ static bool ipp_boxfilter(Mat &src, Mat &dst, Size ksize, Point anchor, bool nor
    return false;
 #endif
 }
-}
 #endif
-void cv::boxFilter( InputArray _src, OutputArray _dst, int ddepth,
+void boxFilter(InputArray _src, OutputArray _dst, int ddepth,
-                Size ksize, Point anchor,
+               Size ksize, Point anchor,
-                bool normalize, int borderType )
+               bool normalize, int borderType)
 {
    CV_INSTRUMENT_REGION();
@@ -1674,8 +487,8 @@ void cv::boxFilter( InputArray _src, OutputArray _dst, int ddepth,
 }
-void cv::blur( InputArray src, OutputArray dst,
+void blur(InputArray src, OutputArray dst,
-           Size ksize, Point anchor, int borderType )
+          Size ksize, Point anchor, int borderType)
 {
    CV_INSTRUMENT_REGION();
@@ -1687,77 +500,17 @@ void cv::blur( InputArray src, OutputArray dst,
                                    Squared Box Filter
 \****************************************************************************************/
-namespace cv
-{
-template<typename T, typename ST>
-struct SqrRowSum :
-        public BaseRowFilter
-{
-    SqrRowSum( int _ksize, int _anchor ) :
-        BaseRowFilter()
-    {
-        ksize = _ksize;
-        anchor = _anchor;
-    }
-    virtual void operator()(const uchar* src, uchar* dst, int width, int cn) CV_OVERRIDE
-    {
-        const T* S = (const T*)src;
-        ST* D = (ST*)dst;
-        int i = 0, k, ksz_cn = ksize*cn;
-        width = (width - 1)*cn;
-        for( k = 0; k < cn; k++, S++, D++ )
-        {
-            ST s = 0;
-            for( i = 0; i < ksz_cn; i += cn )
-            {
-                ST val = (ST)S[i];
-                s += val*val;
-            }
-            D[0] = s;
-            for( i = 0; i < width; i += cn )
-            {
-                ST val0 = (ST)S[i], val1 = (ST)S[i + ksz_cn];
-                s += val1*val1 - val0*val0;
-                D[i+cn] = s;
-            }
-        }
-    }
-};
 static Ptr<BaseRowFilter> getSqrRowSumFilter(int srcType, int sumType, int ksize, int anchor)
 {
-    int sdepth = CV_MAT_DEPTH(srcType), ddepth = CV_MAT_DEPTH(sumType);
+    CV_INSTRUMENT_REGION();
-    CV_Assert( CV_MAT_CN(sumType) == CV_MAT_CN(srcType) );
-    if( anchor < 0 )
-        anchor = ksize/2;
-    if( sdepth == CV_8U && ddepth == CV_32S )
-        return makePtr<SqrRowSum<uchar, int> >(ksize, anchor);
-    if( sdepth == CV_8U && ddepth == CV_64F )
-        return makePtr<SqrRowSum<uchar, double> >(ksize, anchor);
-    if( sdepth == CV_16U && ddepth == CV_64F )
-        return makePtr<SqrRowSum<ushort, double> >(ksize, anchor);
-    if( sdepth == CV_16S && ddepth == CV_64F )
-        return makePtr<SqrRowSum<short, double> >(ksize, anchor);
-    if( sdepth == CV_32F && ddepth == CV_64F )
-        return makePtr<SqrRowSum<float, double> >(ksize, anchor);
-    if( sdepth == CV_64F && ddepth == CV_64F )
-        return makePtr<SqrRowSum<double, double> >(ksize, anchor);
-    CV_Error_( CV_StsNotImplemented,
-              ("Unsupported combination of source format (=%d), and buffer format (=%d)",
-               srcType, sumType));
-}
+    CV_CPU_DISPATCH(getSqrRowSumFilter, (srcType, sumType, ksize, anchor),
+        CV_CPU_DISPATCH_MODES_ALL);
 }
-void cv::sqrBoxFilter( InputArray _src, OutputArray _dst, int ddepth,
+void sqrBoxFilter(InputArray _src, OutputArray _dst, int ddepth,
-                       Size ksize, Point anchor,
+                  Size ksize, Point anchor,
-                       bool normalize, int borderType )
+                  bool normalize, int borderType)
 {
    CV_INSTRUMENT_REGION();
@@ -1801,4 +554,4 @@ void cv::sqrBoxFilter( InputArray _src, OutputArray _dst, int ddepth,
    f->apply( src, dst, wsz, ofs );
 }
-/* End of file. */
+} // namespace
--- a/modules/imgproc/src/box_filter.simd.hpp
+++ b/modules/imgproc/src/box_filter.simd.hpp
@@ -42,21 +42,25 @@
 //M*/
 #include "precomp.hpp"
-#include <vector>
 #include "opencv2/core/hal/intrin.hpp"
-#include "opencl_kernels_imgproc.hpp"
-#include "opencv2/core/openvx/ovx_defs.hpp"
+namespace cv {
+CV_CPU_OPTIMIZATION_NAMESPACE_BEGIN
+// forward declarations
+Ptr<BaseRowFilter> getRowSumFilter(int srcType, int sumType, int ksize, int anchor);
+Ptr<BaseColumnFilter> getColumnSumFilter(int sumType, int dstType, int ksize, int anchor, double scale);
+Ptr<FilterEngine> createBoxFilter(int srcType, int dstType, Size ksize,
+                                  Point anchor, bool normalize, int borderType);
-namespace cv
+Ptr<BaseRowFilter> getSqrRowSumFilter(int srcType, int sumType, int ksize, int anchor);
-{
+#ifndef CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY
 /****************************************************************************************\
                                         Box Filter
 \****************************************************************************************/
+namespace {
 template<typename T, typename ST>
 struct RowSum :
        public BaseRowFilter
@@ -70,6 +74,8 @@ struct RowSum :
    virtual void operator()(const uchar* src, uchar* dst, int width, int cn) CV_OVERRIDE
    {
+        CV_INSTRUMENT_REGION();
        const T* S = (const T*)src;
        ST* D = (ST*)dst;
        int i = 0, k, ksz_cn = ksize*cn;
@@ -183,6 +189,8 @@ struct ColumnSum :
    virtual void operator()(const uchar** src, uchar* dst, int dststep, int count, int width) CV_OVERRIDE
    {
+        CV_INSTRUMENT_REGION();
        int i;
        ST* SUM;
        bool haveScale = scale != 1;
@@ -281,6 +289,8 @@ struct ColumnSum<int, uchar> :
    virtual void operator()(const uchar** src, uchar* dst, int dststep, int count, int width) CV_OVERRIDE
    {
+        CV_INSTRUMENT_REGION();
        int* SUM;
        bool haveScale = scale != 1;
        double _scale = scale;
@@ -408,9 +418,6 @@ struct ColumnSum<int, uchar> :
            }
            dst += dststep;
        }
-#if CV_SIMD
-        vx_cleanup();
-#endif
    }
    double scale;
@@ -452,6 +459,8 @@ public BaseColumnFilter
    virtual void operator()(const uchar** src, uchar* dst, int dststep, int count, int width) CV_OVERRIDE
    {
+        CV_INSTRUMENT_REGION();
        const int ds = divScale;
        const int dd = divDelta;
        ushort* SUM;
@@ -586,9 +595,6 @@ public BaseColumnFilter
            }
            dst += dststep;
        }
-#if CV_SIMD
-        vx_cleanup();
-#endif
    }
    double scale;
@@ -616,6 +622,8 @@ struct ColumnSum<int, short> :
    virtual void operator()(const uchar** src, uchar* dst, int dststep, int count, int width) CV_OVERRIDE
    {
+        CV_INSTRUMENT_REGION();
        int i;
        int* SUM;
        bool haveScale = scale != 1;
@@ -739,9 +747,6 @@ struct ColumnSum<int, short> :
            }
            dst += dststep;
        }
-#if CV_SIMD
-        vx_cleanup();
-#endif
    }
    double scale;
@@ -767,6 +772,8 @@ struct ColumnSum<int, ushort> :
    virtual void operator()(const uchar** src, uchar* dst, int dststep, int count, int width) CV_OVERRIDE
    {
+        CV_INSTRUMENT_REGION();
        int* SUM;
        bool haveScale = scale != 1;
        double _scale = scale;
@@ -888,9 +895,6 @@ struct ColumnSum<int, ushort> :
            }
            dst += dststep;
        }
-#if CV_SIMD
-        vx_cleanup();
-#endif
    }
    double scale;
@@ -915,6 +919,8 @@ struct ColumnSum<int, int> :
    virtual void operator()(const uchar** src, uchar* dst, int dststep, int count, int width) CV_OVERRIDE
    {
+        CV_INSTRUMENT_REGION();
        int* SUM;
        bool haveScale = scale != 1;
        double _scale = scale;
@@ -1022,9 +1028,6 @@ struct ColumnSum<int, int> :
            }
            dst += dststep;
        }
-#if CV_SIMD
-        vx_cleanup();
-#endif
    }
    double scale;
@@ -1050,6 +1053,8 @@ struct ColumnSum<int, float> :
    virtual void operator()(const uchar** src, uchar* dst, int dststep, int count, int width) CV_OVERRIDE
    {
+        CV_INSTRUMENT_REGION();
        int* SUM;
        bool haveScale = scale != 1;
        double _scale = scale;
@@ -1154,9 +1159,6 @@ struct ColumnSum<int, float> :
            }
            dst += dststep;
        }
-#if CV_SIMD
-        vx_cleanup();
-#endif
    }
    double scale;
@@ -1164,243 +1166,13 @@ struct ColumnSum<int, float> :
    std::vector<int> sum;
 };
-#ifdef HAVE_OPENCL
+}  // namespace anon
-static bool ocl_boxFilter3x3_8UC1( InputArray _src, OutputArray _dst, int ddepth,
-                                   Size ksize, Point anchor, int borderType, bool normalize )
-{
-    const ocl::Device & dev = ocl::Device::getDefault();
-    int type = _src.type(), sdepth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
-    if (ddepth < 0)
-        ddepth = sdepth;
-    if (anchor.x < 0)
-        anchor.x = ksize.width / 2;
-    if (anchor.y < 0)
-        anchor.y = ksize.height / 2;
-    if ( !(dev.isIntel() && (type == CV_8UC1) &&
-         (_src.offset() == 0) && (_src.step() % 4 == 0) &&
-         (_src.cols() % 16 == 0) && (_src.rows() % 2 == 0) &&
-         (anchor.x == 1) && (anchor.y == 1) &&
-         (ksize.width == 3) && (ksize.height == 3)) )
-        return false;
-    float alpha = 1.0f / (ksize.height * ksize.width);
-    Size size = _src.size();
-    size_t globalsize[2] = { 0, 0 };
-    size_t localsize[2] = { 0, 0 };
-    const char * const borderMap[] = { "BORDER_CONSTANT", "BORDER_REPLICATE", "BORDER_REFLECT", 0, "BORDER_REFLECT_101" };
-    globalsize[0] = size.width / 16;
-    globalsize[1] = size.height / 2;
-    char build_opts[1024];
-    sprintf(build_opts, "-D %s %s", borderMap[borderType], normalize ? "-D NORMALIZE" : "");
-    ocl::Kernel kernel("boxFilter3x3_8UC1_cols16_rows2", cv::ocl::imgproc::boxFilter3x3_oclsrc, build_opts);
-    if (kernel.empty())
-        return false;
-    UMat src = _src.getUMat();
-    _dst.create(size, CV_MAKETYPE(ddepth, cn));
-    if (!(_dst.offset() == 0 && _dst.step() % 4 == 0))
-        return false;
-    UMat dst = _dst.getUMat();
-    int idxArg = kernel.set(0, ocl::KernelArg::PtrReadOnly(src));
-    idxArg = kernel.set(idxArg, (int)src.step);
-    idxArg = kernel.set(idxArg, ocl::KernelArg::PtrWriteOnly(dst));
-    idxArg = kernel.set(idxArg, (int)dst.step);
-    idxArg = kernel.set(idxArg, (int)dst.rows);
-    idxArg = kernel.set(idxArg, (int)dst.cols);
-    if (normalize)
-        idxArg = kernel.set(idxArg, (float)alpha);
-    return kernel.run(2, globalsize, (localsize[0] == 0) ? NULL : localsize, false);
-}
-static bool ocl_boxFilter( InputArray _src, OutputArray _dst, int ddepth,
+Ptr<BaseRowFilter> getRowSumFilter(int srcType, int sumType, int ksize, int anchor)
-                           Size ksize, Point anchor, int borderType, bool normalize, bool sqr = false )
 {
-    const ocl::Device & dev = ocl::Device::getDefault();
+    CV_INSTRUMENT_REGION();
-    int type = _src.type(), sdepth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type), esz = CV_ELEM_SIZE(type);
-    bool doubleSupport = dev.doubleFPConfig() > 0;
-    if (ddepth < 0)
-        ddepth = sdepth;
-    if (cn > 4 || (!doubleSupport && (sdepth == CV_64F || ddepth == CV_64F)) ||
-        _src.offset() % esz != 0 || _src.step() % esz != 0)
-        return false;
-    if (anchor.x < 0)
-        anchor.x = ksize.width / 2;
-    if (anchor.y < 0)
-        anchor.y = ksize.height / 2;
-    int computeUnits = ocl::Device::getDefault().maxComputeUnits();
-    float alpha = 1.0f / (ksize.height * ksize.width);
-    Size size = _src.size(), wholeSize;
-    bool isolated = (borderType & BORDER_ISOLATED) != 0;
-    borderType &= ~BORDER_ISOLATED;
-    int wdepth = std::max(CV_32F, std::max(ddepth, sdepth)),
-        wtype = CV_MAKE_TYPE(wdepth, cn), dtype = CV_MAKE_TYPE(ddepth, cn);
-    const char * const borderMap[] = { "BORDER_CONSTANT", "BORDER_REPLICATE", "BORDER_REFLECT", 0, "BORDER_REFLECT_101" };
-    size_t globalsize[2] = { (size_t)size.width, (size_t)size.height };
-    size_t localsize_general[2] = { 0, 1 }, * localsize = NULL;
-    UMat src = _src.getUMat();
-    if (!isolated)
-    {
-        Point ofs;
-        src.locateROI(wholeSize, ofs);
-    }
-    int h = isolated ? size.height : wholeSize.height;
-    int w = isolated ? size.width : wholeSize.width;
-    size_t maxWorkItemSizes[32];
-    ocl::Device::getDefault().maxWorkItemSizes(maxWorkItemSizes);
-    int tryWorkItems = (int)maxWorkItemSizes[0];
-    ocl::Kernel kernel;
-    if (dev.isIntel() && !(dev.type() & ocl::Device::TYPE_CPU) &&
-        ((ksize.width < 5 && ksize.height < 5 && esz <= 4) ||
-         (ksize.width == 5 && ksize.height == 5 && cn == 1)))
-    {
-        if (w < ksize.width || h < ksize.height)
-            return false;
-        // Figure out what vector size to use for loading the pixels.
-        int pxLoadNumPixels = cn != 1 || size.width % 4 ? 1 : 4;
-        int pxLoadVecSize = cn * pxLoadNumPixels;
-        // Figure out how many pixels per work item to compute in X and Y
-        // directions.  Too many and we run out of registers.
-        int pxPerWorkItemX = 1, pxPerWorkItemY = 1;
-        if (cn <= 2 && ksize.width <= 4 && ksize.height <= 4)
-        {
-            pxPerWorkItemX = size.width % 8 ? size.width % 4 ? size.width % 2 ? 1 : 2 : 4 : 8;
-            pxPerWorkItemY = size.height % 2 ? 1 : 2;
-        }
-        else if (cn < 4 || (ksize.width <= 4 && ksize.height <= 4))
-        {
-            pxPerWorkItemX = size.width % 2 ? 1 : 2;
-            pxPerWorkItemY = size.height % 2 ? 1 : 2;
-        }
-        globalsize[0] = size.width / pxPerWorkItemX;
-        globalsize[1] = size.height / pxPerWorkItemY;
-        // Need some padding in the private array for pixels
-        int privDataWidth = roundUp(pxPerWorkItemX + ksize.width - 1, pxLoadNumPixels);
-        // Make the global size a nice round number so the runtime can pick
-        // from reasonable choices for the workgroup size
-        const int wgRound = 256;
-        globalsize[0] = roundUp(globalsize[0], wgRound);
-        char build_options[1024], cvt[2][40];
-        sprintf(build_options, "-D cn=%d "
-                "-D ANCHOR_X=%d -D ANCHOR_Y=%d -D KERNEL_SIZE_X=%d -D KERNEL_SIZE_Y=%d "
-                "-D PX_LOAD_VEC_SIZE=%d -D PX_LOAD_NUM_PX=%d "
-                "-D PX_PER_WI_X=%d -D PX_PER_WI_Y=%d -D PRIV_DATA_WIDTH=%d -D %s -D %s "
-                "-D PX_LOAD_X_ITERATIONS=%d -D PX_LOAD_Y_ITERATIONS=%d "
-                "-D srcT=%s -D srcT1=%s -D dstT=%s -D dstT1=%s -D WT=%s -D WT1=%s "
-                "-D convertToWT=%s -D convertToDstT=%s%s%s -D PX_LOAD_FLOAT_VEC_CONV=convert_%s -D OP_BOX_FILTER",
-                cn, anchor.x, anchor.y, ksize.width, ksize.height,
-                pxLoadVecSize, pxLoadNumPixels,
-                pxPerWorkItemX, pxPerWorkItemY, privDataWidth, borderMap[borderType],
-                isolated ? "BORDER_ISOLATED" : "NO_BORDER_ISOLATED",
-                privDataWidth / pxLoadNumPixels, pxPerWorkItemY + ksize.height - 1,
-                ocl::typeToStr(type), ocl::typeToStr(sdepth), ocl::typeToStr(dtype),
-                ocl::typeToStr(ddepth), ocl::typeToStr(wtype), ocl::typeToStr(wdepth),
-                ocl::convertTypeStr(sdepth, wdepth, cn, cvt[0]),
-                ocl::convertTypeStr(wdepth, ddepth, cn, cvt[1]),
-                normalize ? " -D NORMALIZE" : "", sqr ? " -D SQR" : "",
-                ocl::typeToStr(CV_MAKE_TYPE(wdepth, pxLoadVecSize)) //PX_LOAD_FLOAT_VEC_CONV
-                );
-        if (!kernel.create("filterSmall", cv::ocl::imgproc::filterSmall_oclsrc, build_options))
-            return false;
-    }
-    else
-    {
-        localsize = localsize_general;
-        for ( ; ; )
-        {
-            int BLOCK_SIZE_X = tryWorkItems, BLOCK_SIZE_Y = std::min(ksize.height * 10, size.height);
-            while (BLOCK_SIZE_X > 32 && BLOCK_SIZE_X >= ksize.width * 2 && BLOCK_SIZE_X > size.width * 2)
-                BLOCK_SIZE_X /= 2;
-            while (BLOCK_SIZE_Y < BLOCK_SIZE_X / 8 && BLOCK_SIZE_Y * computeUnits * 32 < size.height)
-                BLOCK_SIZE_Y *= 2;
-            if (ksize.width > BLOCK_SIZE_X || w < ksize.width || h < ksize.height)
-                return false;
-            char cvt[2][50];
-            String opts = format("-D LOCAL_SIZE_X=%d -D BLOCK_SIZE_Y=%d -D ST=%s -D DT=%s -D WT=%s -D convertToDT=%s -D convertToWT=%s"
-                                 " -D ANCHOR_X=%d -D ANCHOR_Y=%d -D KERNEL_SIZE_X=%d -D KERNEL_SIZE_Y=%d -D %s%s%s%s%s"
-                                 " -D ST1=%s -D DT1=%s -D cn=%d",
-                                 BLOCK_SIZE_X, BLOCK_SIZE_Y, ocl::typeToStr(type), ocl::typeToStr(CV_MAKE_TYPE(ddepth, cn)),
-                                 ocl::typeToStr(CV_MAKE_TYPE(wdepth, cn)),
-                                 ocl::convertTypeStr(wdepth, ddepth, cn, cvt[0]),
-                                 ocl::convertTypeStr(sdepth, wdepth, cn, cvt[1]),
-                                 anchor.x, anchor.y, ksize.width, ksize.height, borderMap[borderType],
-                                 isolated ? " -D BORDER_ISOLATED" : "", doubleSupport ? " -D DOUBLE_SUPPORT" : "",
-                                 normalize ? " -D NORMALIZE" : "", sqr ? " -D SQR" : "",
-                                 ocl::typeToStr(sdepth), ocl::typeToStr(ddepth), cn);
-            localsize[0] = BLOCK_SIZE_X;
-            globalsize[0] = divUp(size.width, BLOCK_SIZE_X - (ksize.width - 1)) * BLOCK_SIZE_X;
-            globalsize[1] = divUp(size.height, BLOCK_SIZE_Y);
-            kernel.create("boxFilter", cv::ocl::imgproc::boxFilter_oclsrc, opts);
-            if (kernel.empty())
-                return false;
-            size_t kernelWorkGroupSize = kernel.workGroupSize();
-            if (localsize[0] <= kernelWorkGroupSize)
-                break;
-            if (BLOCK_SIZE_X < (int)kernelWorkGroupSize)
-                return false;
-            tryWorkItems = (int)kernelWorkGroupSize;
-        }
-    }
-    _dst.create(size, CV_MAKETYPE(ddepth, cn));
-    UMat dst = _dst.getUMat();
-    int idxArg = kernel.set(0, ocl::KernelArg::PtrReadOnly(src));
-    idxArg = kernel.set(idxArg, (int)src.step);
-    int srcOffsetX = (int)((src.offset % src.step) / src.elemSize());
-    int srcOffsetY = (int)(src.offset / src.step);
-    int srcEndX = isolated ? srcOffsetX + size.width : wholeSize.width;
-    int srcEndY = isolated ? srcOffsetY + size.height : wholeSize.height;
-    idxArg = kernel.set(idxArg, srcOffsetX);
-    idxArg = kernel.set(idxArg, srcOffsetY);
-    idxArg = kernel.set(idxArg, srcEndX);
-    idxArg = kernel.set(idxArg, srcEndY);
-    idxArg = kernel.set(idxArg, ocl::KernelArg::WriteOnly(dst));
-    if (normalize)
-        idxArg = kernel.set(idxArg, (float)alpha);
-    return kernel.run(2, globalsize, localsize, false);
-}
-#endif
-}
-cv::Ptr<cv::BaseRowFilter> cv::getRowSumFilter(int srcType, int sumType, int ksize, int anchor)
-{
    int sdepth = CV_MAT_DEPTH(srcType), ddepth = CV_MAT_DEPTH(sumType);
    CV_Assert( CV_MAT_CN(sumType) == CV_MAT_CN(srcType) );
@@ -1434,9 +1206,10 @@ cv::Ptr<cv::BaseRowFilter> cv::getRowSumFilter(int srcType, int sumType, int ksi
 }
-cv::Ptr<cv::BaseColumnFilter> cv::getColumnSumFilter(int sumType, int dstType, int ksize,
+Ptr<BaseColumnFilter> getColumnSumFilter(int sumType, int dstType, int ksize, int anchor, double scale)
-                                                     int anchor, double scale)
 {
+    CV_INSTRUMENT_REGION();
    int sdepth = CV_MAT_DEPTH(sumType), ddepth = CV_MAT_DEPTH(dstType);
    CV_Assert( CV_MAT_CN(sumType) == CV_MAT_CN(dstType) );
@@ -1474,9 +1247,11 @@ cv::Ptr<cv::BaseColumnFilter> cv::getColumnSumFilter(int sumType, int dstType, i
 }
-cv::Ptr<cv::FilterEngine> cv::createBoxFilter( int srcType, int dstType, Size ksize,
+Ptr<FilterEngine> createBoxFilter(int srcType, int dstType, Size ksize,
-                    Point anchor, bool normalize, int borderType )
+                                  Point anchor, bool normalize, int borderType)
 {
+    CV_INSTRUMENT_REGION();
    int sdepth = CV_MAT_DEPTH(srcType);
    int cn = CV_MAT_CN(srcType), sumType = CV_64F;
    if( sdepth == CV_8U && CV_MAT_DEPTH(dstType) == CV_8U &&
@@ -1496,199 +1271,12 @@ cv::Ptr<cv::FilterEngine> cv::createBoxFilter( int srcType, int dstType, Size ks
           srcType, dstType, sumType, borderType );
 }
-#ifdef HAVE_OPENVX
-namespace cv
-{
-    namespace ovx {
-        template <> inline bool skipSmallImages<VX_KERNEL_BOX_3x3>(int w, int h) { return w*h < 640 * 480; }
-    }
-    static bool openvx_boxfilter(InputArray _src, OutputArray _dst, int ddepth,
-                                 Size ksize, Point anchor,
-                                 bool normalize, int borderType)
-    {
-        if (ddepth < 0)
-            ddepth = CV_8UC1;
-        if (_src.type() != CV_8UC1 || ddepth != CV_8U || !normalize ||
-            _src.cols() < 3 || _src.rows() < 3 ||
-            ksize.width != 3 || ksize.height != 3 ||
-            (anchor.x >= 0 && anchor.x != 1) ||
-            (anchor.y >= 0 && anchor.y != 1) ||
-            ovx::skipSmallImages<VX_KERNEL_BOX_3x3>(_src.cols(), _src.rows()))
-            return false;
-        Mat src = _src.getMat();
-        if ((borderType & BORDER_ISOLATED) == 0 && src.isSubmatrix())
-            return false; //Process isolated borders only
-        vx_enum border;
-        switch (borderType & ~BORDER_ISOLATED)
-        {
-        case BORDER_CONSTANT:
-            border = VX_BORDER_CONSTANT;
-            break;
-        case BORDER_REPLICATE:
-            border = VX_BORDER_REPLICATE;
-            break;
-        default:
-            return false;
-        }
-        _dst.create(src.size(), CV_8UC1);
-        Mat dst = _dst.getMat();
-        try
-        {
-            ivx::Context ctx = ovx::getOpenVXContext();
-            Mat a;
-            if (dst.data != src.data)
-                a = src;
-            else
-                src.copyTo(a);
-            ivx::Image
-                ia = ivx::Image::createFromHandle(ctx, VX_DF_IMAGE_U8,
-                                                  ivx::Image::createAddressing(a.cols, a.rows, 1, (vx_int32)(a.step)), a.data),
-                ib = ivx::Image::createFromHandle(ctx, VX_DF_IMAGE_U8,
-                                                  ivx::Image::createAddressing(dst.cols, dst.rows, 1, (vx_int32)(dst.step)), dst.data);
-            //ATTENTION: VX_CONTEXT_IMMEDIATE_BORDER attribute change could lead to strange issues in multi-threaded environments
-            //since OpenVX standard says nothing about thread-safety for now
-            ivx::border_t prevBorder = ctx.immediateBorder();
-            ctx.setImmediateBorder(border, (vx_uint8)(0));
-            ivx::IVX_CHECK_STATUS(vxuBox3x3(ctx, ia, ib));
-            ctx.setImmediateBorder(prevBorder);
-        }
-        catch (const ivx::RuntimeError & e)
-        {
-            VX_DbgThrow(e.what());
-        }
-        catch (const ivx::WrapperError & e)
-        {
-            VX_DbgThrow(e.what());
-        }
-        return true;
-    }
-}
-#endif
-#if defined(HAVE_IPP)
-namespace cv
-{
-static bool ipp_boxfilter(Mat &src, Mat &dst, Size ksize, Point anchor, bool normalize, int borderType)
-{
-#ifdef HAVE_IPP_IW
-    CV_INSTRUMENT_REGION_IPP();
-#if IPP_VERSION_X100 < 201801
-    // Problem with SSE42 optimization for 16s and some 8u modes
-    if(ipp::getIppTopFeatures() == ippCPUID_SSE42 && (((src.depth() == CV_16S || src.depth() == CV_16U) && (src.channels() == 3 || src.channels() == 4)) || (src.depth() == CV_8U && src.channels() == 3 && (ksize.width > 5 || ksize.height > 5))))
-        return false;
-    // Other optimizations has some degradations too
-    if((((src.depth() == CV_16S || src.depth() == CV_16U) && (src.channels() == 4)) || (src.depth() == CV_8U && src.channels() == 1 && (ksize.width > 5 || ksize.height > 5))))
-        return false;
-#endif
-    if(!normalize)
-        return false;
-    if(!ippiCheckAnchor(anchor, ksize))
-        return false;
-    try
-    {
-        ::ipp::IwiImage       iwSrc      = ippiGetImage(src);
-        ::ipp::IwiImage       iwDst      = ippiGetImage(dst);
-        ::ipp::IwiSize        iwKSize    = ippiGetSize(ksize);
-        ::ipp::IwiBorderSize  borderSize(iwKSize);
-        ::ipp::IwiBorderType  ippBorder(ippiGetBorder(iwSrc, borderType, borderSize));
-        if(!ippBorder)
-            return false;
-        CV_INSTRUMENT_FUN_IPP(::ipp::iwiFilterBox, iwSrc, iwDst, iwKSize, ::ipp::IwDefault(), ippBorder);
-    }
-    catch (const ::ipp::IwException &)
-    {
-        return false;
-    }
-    return true;
-#else
-    CV_UNUSED(src); CV_UNUSED(dst); CV_UNUSED(ksize); CV_UNUSED(anchor); CV_UNUSED(normalize); CV_UNUSED(borderType);
-    return false;
-#endif
-}
-}
-#endif
-void cv::boxFilter( InputArray _src, OutputArray _dst, int ddepth,
-                Size ksize, Point anchor,
-                bool normalize, int borderType )
-{
-    CV_INSTRUMENT_REGION();
-    CV_OCL_RUN(_dst.isUMat() &&
-               (borderType == BORDER_REPLICATE || borderType == BORDER_CONSTANT ||
-                borderType == BORDER_REFLECT || borderType == BORDER_REFLECT_101),
-               ocl_boxFilter3x3_8UC1(_src, _dst, ddepth, ksize, anchor, borderType, normalize))
-    CV_OCL_RUN(_dst.isUMat(), ocl_boxFilter(_src, _dst, ddepth, ksize, anchor, borderType, normalize))
-    Mat src = _src.getMat();
-    int stype = src.type(), sdepth = CV_MAT_DEPTH(stype), cn = CV_MAT_CN(stype);
-    if( ddepth < 0 )
-        ddepth = sdepth;
-    _dst.create( src.size(), CV_MAKETYPE(ddepth, cn) );
-    Mat dst = _dst.getMat();
-    if( borderType != BORDER_CONSTANT && normalize && (borderType & BORDER_ISOLATED) != 0 )
-    {
-        if( src.rows == 1 )
-            ksize.height = 1;
-        if( src.cols == 1 )
-            ksize.width = 1;
-    }
-    Point ofs;
-    Size wsz(src.cols, src.rows);
-    if(!(borderType&BORDER_ISOLATED))
-        src.locateROI( wsz, ofs );
-    CALL_HAL(boxFilter, cv_hal_boxFilter, src.ptr(), src.step, dst.ptr(), dst.step, src.cols, src.rows, sdepth, ddepth, cn,
-             ofs.x, ofs.y, wsz.width - src.cols - ofs.x, wsz.height - src.rows - ofs.y, ksize.width, ksize.height,
-             anchor.x, anchor.y, normalize, borderType&~BORDER_ISOLATED);
-    CV_OVX_RUN(true,
-               openvx_boxfilter(src, dst, ddepth, ksize, anchor, normalize, borderType))
-    CV_IPP_RUN_FAST(ipp_boxfilter(src, dst, ksize, anchor, normalize, borderType));
-    borderType = (borderType&~BORDER_ISOLATED);
-    Ptr<FilterEngine> f = createBoxFilter( src.type(), dst.type(),
-                        ksize, anchor, normalize, borderType );
-    f->apply( src, dst, wsz, ofs );
-}
-void cv::blur( InputArray src, OutputArray dst,
-           Size ksize, Point anchor, int borderType )
-{
-    CV_INSTRUMENT_REGION();
-    boxFilter( src, dst, -1, ksize, anchor, true, borderType );
-}
 /****************************************************************************************\
                                    Squared Box Filter
 \****************************************************************************************/
+namespace {
-namespace cv
-{
 template<typename T, typename ST>
 struct SqrRowSum :
@@ -1703,6 +1291,8 @@ struct SqrRowSum :
    virtual void operator()(const uchar* src, uchar* dst, int width, int cn) CV_OVERRIDE
    {
+        CV_INSTRUMENT_REGION();
        const T* S = (const T*)src;
        ST* D = (ST*)dst;
        int i = 0, k, ksz_cn = ksize*cn;
@@ -1727,7 +1317,9 @@ struct SqrRowSum :
    }
 };
-static Ptr<BaseRowFilter> getSqrRowSumFilter(int srcType, int sumType, int ksize, int anchor)
+} // namespace anon
+Ptr<BaseRowFilter> getSqrRowSumFilter(int srcType, int sumType, int ksize, int anchor)
 {
    int sdepth = CV_MAT_DEPTH(srcType), ddepth = CV_MAT_DEPTH(sumType);
    CV_Assert( CV_MAT_CN(sumType) == CV_MAT_CN(srcType) );
@@ -1753,52 +1345,6 @@ static Ptr<BaseRowFilter> getSqrRowSumFilter(int srcType, int sumType, int ksize
               srcType, sumType));
 }
-}
+#endif
+CV_CPU_OPTIMIZATION_NAMESPACE_END
-void cv::sqrBoxFilter( InputArray _src, OutputArray _dst, int ddepth,
+} // namespace
-                       Size ksize, Point anchor,
-                       bool normalize, int borderType )
-{
-    CV_INSTRUMENT_REGION();
-    int srcType = _src.type(), sdepth = CV_MAT_DEPTH(srcType), cn = CV_MAT_CN(srcType);
-    Size size = _src.size();
-    if( ddepth < 0 )
-        ddepth = sdepth < CV_32F ? CV_32F : CV_64F;
-    if( borderType != BORDER_CONSTANT && normalize )
-    {
-        if( size.height == 1 )
-            ksize.height = 1;
-        if( size.width == 1 )
-            ksize.width = 1;
-    }
-    CV_OCL_RUN(_dst.isUMat() && _src.dims() <= 2,
-               ocl_boxFilter(_src, _dst, ddepth, ksize, anchor, borderType, normalize, true))
-    int sumDepth = CV_64F;
-    if( sdepth == CV_8U )
-        sumDepth = CV_32S;
-    int sumType = CV_MAKETYPE( sumDepth, cn ), dstType = CV_MAKETYPE(ddepth, cn);
-    Mat src = _src.getMat();
-    _dst.create( size, dstType );
-    Mat dst = _dst.getMat();
-    Ptr<BaseRowFilter> rowFilter = getSqrRowSumFilter(srcType, sumType, ksize.width, anchor.x );
-    Ptr<BaseColumnFilter> columnFilter = getColumnSumFilter(sumType,
-                                                            dstType, ksize.height, anchor.y,
-                                                            normalize ? 1./(ksize.width*ksize.height) : 1);
-    Ptr<FilterEngine> f = makePtr<FilterEngine>(Ptr<BaseFilter>(), rowFilter, columnFilter,
-                                                srcType, dstType, sumType, borderType );
-    Point ofs;
-    Size wsz(src.cols, src.rows);
-    src.locateROI( wsz, ofs );
-    f->apply( src, dst, wsz, ofs );
-}
-/* End of file. */