Commit 5d15676b authored by Vadim Pisarevsky

Merge pull request #3532 from oresths:filter_neon

parents 8d4d36f8 fffe2464
@@ -132,7 +132,7 @@ protected:
stringstream ss;
ss << "Max distance between valid and computed descriptors " << curMaxDist;
-if( curMaxDist < maxDist )
+if( curMaxDist <= maxDist )
ss << "." << endl;
else
{
@@ -322,7 +322,11 @@ TEST( Features2d_DescriptorExtractor_ORB, regression )
{
// TODO adjust the parameters below
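// Presumably the NEON-vectorized filters yield slightly different descriptors,
// hence the relaxed distance threshold selected under CV_NEON below.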
CV_DescriptorExtractorTest<Hamming> test( "descriptor-orb",
#if CV_NEON
(CV_DescriptorExtractorTest<Hamming>::DistanceType)25.f,
#else
(CV_DescriptorExtractorTest<Hamming>::DistanceType)12.f,
#endif
ORB::create() );
test.safe_run();
}
......
@@ -183,7 +183,7 @@ PERF_TEST_P(Size_MatType_BorderType, gaussianBlur5x5,
TEST_CYCLE() GaussianBlur(src, dst, Size(5,5), 0, 0, btype);
-SANITY_CHECK(dst, 1e-3);
+SANITY_CHECK(dst, 1);
}
PERF_TEST_P(Size_MatType_BorderType, blur5x5,
......
@@ -2207,6 +2207,852 @@ struct FilterVec_32f
};
#elif CV_NEON
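// NEON-vectorized specializations of the separable filter row/column kernels.
// Each functor processes as many pixels as fit into whole 8- or 4-lane vectors
// and returns the number of pixels handled; the scalar code finishes the tail.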
struct SymmRowSmallVec_8u32s
{
SymmRowSmallVec_8u32s() { smallValues = false; }
SymmRowSmallVec_8u32s( const Mat& _kernel, int _symmetryType )
{
kernel = _kernel;
symmetryType = _symmetryType;
smallValues = true;
int k, ksize = kernel.rows + kernel.cols - 1;
for( k = 0; k < ksize; k++ )
{
int v = kernel.ptr<int>()[k];
if( v < SHRT_MIN || v > SHRT_MAX )
{
smallValues = false;
break;
}
}
}
int operator()(const uchar* src, uchar* _dst, int width, int cn) const
{
// Uncomment the following two lines once runtime NEON detection is implemented.
// if( !checkHardwareSupport(CV_CPU_NEON) )
// return 0;
int i = 0, _ksize = kernel.rows + kernel.cols - 1;
int* dst = (int*)_dst;
bool symmetrical = (symmetryType & KERNEL_SYMMETRICAL) != 0;
const int* kx = kernel.ptr<int>() + _ksize/2;
if( !smallValues )
return 0;
src += (_ksize/2)*cn;
width *= cn;
if( symmetrical )
{
if( _ksize == 1 )
return 0;
if( _ksize == 3 )
{
if( kx[0] == 2 && kx[1] == 1 )
{
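// Fast path for the [1 2 1] smoothing kernel: the 16-bit sums are interleaved
// with zeros by vst2q_u16, which widens them to the 32-bit dst on a little-endian target.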
uint16x8_t zq = vdupq_n_u16(0);
for( ; i <= width - 8; i += 8, src += 8 )
{
uint8x8_t x0, x1, x2;
x0 = vld1_u8( (uint8_t *) (src - cn) );
x1 = vld1_u8( (uint8_t *) (src) );
x2 = vld1_u8( (uint8_t *) (src + cn) );
uint16x8_t y0, y1, y2;
y0 = vaddl_u8(x0, x2);
y1 = vshll_n_u8(x1, 1);
y2 = vaddq_u16(y0, y1);
uint16x8x2_t str;
str.val[0] = y2; str.val[1] = zq;
vst2q_u16( (uint16_t *) (dst + i), str );
}
}
else if( kx[0] == -2 && kx[1] == 1 )
return 0;
else
{
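// Generic symmetric 3-tap case: the two distinct taps are narrowed to 16 bits
// (smallValues guarantees they fit) and applied with widening multiply-accumulates.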
int32x4_t k32 = vdupq_n_s32(0);
k32 = vld1q_lane_s32(kx, k32, 0);
k32 = vld1q_lane_s32(kx + 1, k32, 1);
int16x4_t k = vqmovn_s32(k32);
uint8x8_t z = vdup_n_u8(0);
for( ; i <= width - 8; i += 8, src += 8 )
{
uint8x8_t x0, x1, x2;
x0 = vld1_u8( (uint8_t *) (src - cn) );
x1 = vld1_u8( (uint8_t *) (src) );
x2 = vld1_u8( (uint8_t *) (src + cn) );
int16x8_t y0, y1;
int32x4_t y2, y3;
y0 = vreinterpretq_s16_u16(vaddl_u8(x1, z));
y1 = vreinterpretq_s16_u16(vaddl_u8(x0, x2));
y2 = vmull_lane_s16(vget_low_s16(y0), k, 0);
y2 = vmlal_lane_s16(y2, vget_low_s16(y1), k, 1);
y3 = vmull_lane_s16(vget_high_s16(y0), k, 0);
y3 = vmlal_lane_s16(y3, vget_high_s16(y1), k, 1);
vst1q_s32((int32_t *)(dst + i), y2);
vst1q_s32((int32_t *)(dst + i + 4), y3);
}
}
}
else if( _ksize == 5 )
{
if( kx[0] == -2 && kx[1] == 0 && kx[2] == 1 )
return 0;
else
{
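// Generic symmetric 5-tap case: as above, with a third tap applied to src +/- 2*cn.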
int32x4_t k32 = vdupq_n_s32(0);
k32 = vld1q_lane_s32(kx, k32, 0);
k32 = vld1q_lane_s32(kx + 1, k32, 1);
k32 = vld1q_lane_s32(kx + 2, k32, 2);
int16x4_t k = vqmovn_s32(k32);
uint8x8_t z = vdup_n_u8(0);
for( ; i <= width - 8; i += 8, src += 8 )
{
uint8x8_t x0, x1, x2, x3, x4;
x0 = vld1_u8( (uint8_t *) (src - cn) );
x1 = vld1_u8( (uint8_t *) (src) );
x2 = vld1_u8( (uint8_t *) (src + cn) );
int16x8_t y0, y1;
int32x4_t accl, acch;
y0 = vreinterpretq_s16_u16(vaddl_u8(x1, z));
y1 = vreinterpretq_s16_u16(vaddl_u8(x0, x2));
accl = vmull_lane_s16(vget_low_s16(y0), k, 0);
accl = vmlal_lane_s16(accl, vget_low_s16(y1), k, 1);
acch = vmull_lane_s16(vget_high_s16(y0), k, 0);
acch = vmlal_lane_s16(acch, vget_high_s16(y1), k, 1);
int16x8_t y2;
x3 = vld1_u8( (uint8_t *) (src - cn*2) );
x4 = vld1_u8( (uint8_t *) (src + cn*2) );
y2 = vreinterpretq_s16_u16(vaddl_u8(x3, x4));
accl = vmlal_lane_s16(accl, vget_low_s16(y2), k, 2);
acch = vmlal_lane_s16(acch, vget_high_s16(y2), k, 2);
vst1q_s32((int32_t *)(dst + i), accl);
vst1q_s32((int32_t *)(dst + i + 4), acch);
}
}
}
}
else
{
if( _ksize == 3 )
{
if( kx[0] == 0 && kx[1] == 1 )
{
uint8x8_t z = vdup_n_u8(0);
for( ; i <= width - 8; i += 8, src += 8 )
{
uint8x8_t x0, x1;
x0 = vld1_u8( (uint8_t *) (src - cn) );
x1 = vld1_u8( (uint8_t *) (src + cn) );
int16x8_t y0;
y0 = vsubq_s16(vreinterpretq_s16_u16(vaddl_u8(x1, z)),
vreinterpretq_s16_u16(vaddl_u8(x0, z)));
vst1q_s32((int32_t *)(dst + i), vmovl_s16(vget_low_s16(y0)));
vst1q_s32((int32_t *)(dst + i + 4), vmovl_s16(vget_high_s16(y0)));
}
}
else
{
int32x4_t k32 = vdupq_n_s32(0);
k32 = vld1q_lane_s32(kx + 1, k32, 1);
int16x4_t k = vqmovn_s32(k32);
uint8x8_t z = vdup_n_u8(0);
for( ; i <= width - 8; i += 8, src += 8 )
{
uint8x8_t x0, x1;
x0 = vld1_u8( (uint8_t *) (src - cn) );
x1 = vld1_u8( (uint8_t *) (src + cn) );
int16x8_t y0;
int32x4_t y1, y2;
y0 = vsubq_s16(vreinterpretq_s16_u16(vaddl_u8(x1, z)),
vreinterpretq_s16_u16(vaddl_u8(x0, z)));
y1 = vmull_lane_s16(vget_low_s16(y0), k, 1);
y2 = vmull_lane_s16(vget_high_s16(y0), k, 1);
vst1q_s32((int32_t *)(dst + i), y1);
vst1q_s32((int32_t *)(dst + i + 4), y2);
}
}
}
else if( _ksize == 5 )
{
int32x4_t k32 = vdupq_n_s32(0);
k32 = vld1q_lane_s32(kx + 1, k32, 1);
k32 = vld1q_lane_s32(kx + 2, k32, 2);
int16x4_t k = vqmovn_s32(k32);
uint8x8_t z = vdup_n_u8(0);
for( ; i <= width - 8; i += 8, src += 8 )
{
uint8x8_t x0, x1;
x0 = vld1_u8( (uint8_t *) (src - cn) );
x1 = vld1_u8( (uint8_t *) (src + cn) );
int32x4_t accl, acch;
int16x8_t y0;
y0 = vsubq_s16(vreinterpretq_s16_u16(vaddl_u8(x1, z)),
vreinterpretq_s16_u16(vaddl_u8(x0, z)));
accl = vmull_lane_s16(vget_low_s16(y0), k, 1);
acch = vmull_lane_s16(vget_high_s16(y0), k, 1);
uint8x8_t x2, x3;
x2 = vld1_u8( (uint8_t *) (src - cn*2) );
x3 = vld1_u8( (uint8_t *) (src + cn*2) );
int16x8_t y1;
y1 = vsubq_s16(vreinterpretq_s16_u16(vaddl_u8(x3, z)),
vreinterpretq_s16_u16(vaddl_u8(x2, z)));
accl = vmlal_lane_s16(accl, vget_low_s16(y1), k, 2);
acch = vmlal_lane_s16(acch, vget_high_s16(y1), k, 2);
vst1q_s32((int32_t *)(dst + i), accl);
vst1q_s32((int32_t *)(dst + i + 4), acch);
}
}
}
return i;
}
Mat kernel;
int symmetryType;
bool smallValues;
};
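// Column pass producing 8-bit output: the fixed-point row sums (int32) are converted
// to float, weighted by the column kernel (rescaled by 1/2^bits in the constructor),
// offset by delta and saturated back to uchar.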
struct SymmColumnVec_32s8u
{
SymmColumnVec_32s8u() { symmetryType=0; }
SymmColumnVec_32s8u(const Mat& _kernel, int _symmetryType, int _bits, double _delta)
{
symmetryType = _symmetryType;
_kernel.convertTo(kernel, CV_32F, 1./(1 << _bits), 0);
delta = (float)(_delta/(1 << _bits));
CV_Assert( (symmetryType & (KERNEL_SYMMETRICAL | KERNEL_ASYMMETRICAL)) != 0 );
}
int operator()(const uchar** _src, uchar* dst, int width) const
{
// Uncomment the following two lines once runtime NEON detection is implemented.
// if( !checkHardwareSupport(CV_CPU_NEON) )
// return 0;
int _ksize = kernel.rows + kernel.cols - 1;
int ksize2 = _ksize / 2;
const float* ky = kernel.ptr<float>() + ksize2;
int i = 0, k;
bool symmetrical = (symmetryType & KERNEL_SYMMETRICAL) != 0;
const int** src = (const int**)_src;
const int *S, *S2;
float32x4_t d4 = vdupq_n_f32(delta);
if( symmetrical )
{
if( _ksize == 1 )
return 0;
float32x2_t k32;
k32 = vdup_n_f32(0);
k32 = vld1_lane_f32(ky, k32, 0);
k32 = vld1_lane_f32(ky + 1, k32, 1);
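// The center row and the +/-1 pair use the preloaded taps via vmlaq_lane_f32;
// any remaining symmetric pairs are accumulated in the k loop with vmlaq_n_f32.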
for( ; i <= width - 8; i += 8 )
{
float32x4_t accl, acch;
float32x4_t f0l, f0h, f1l, f1h, f2l, f2h;
S = src[0] + i;
f0l = vcvtq_f32_s32( vld1q_s32(S) );
f0h = vcvtq_f32_s32( vld1q_s32(S + 4) );
S = src[1] + i;
S2 = src[-1] + i;
f1l = vcvtq_f32_s32( vld1q_s32(S) );
f1h = vcvtq_f32_s32( vld1q_s32(S + 4) );
f2l = vcvtq_f32_s32( vld1q_s32(S2) );
f2h = vcvtq_f32_s32( vld1q_s32(S2 + 4) );
accl = acch = d4;
accl = vmlaq_lane_f32(accl, f0l, k32, 0);
acch = vmlaq_lane_f32(acch, f0h, k32, 0);
accl = vmlaq_lane_f32(accl, vaddq_f32(f1l, f2l), k32, 1);
acch = vmlaq_lane_f32(acch, vaddq_f32(f1h, f2h), k32, 1);
for( k = 2; k <= ksize2; k++ )
{
S = src[k] + i;
S2 = src[-k] + i;
float32x4_t f3l, f3h, f4l, f4h;
f3l = vcvtq_f32_s32( vld1q_s32(S) );
f3h = vcvtq_f32_s32( vld1q_s32(S + 4) );
f4l = vcvtq_f32_s32( vld1q_s32(S2) );
f4h = vcvtq_f32_s32( vld1q_s32(S2 + 4) );
accl = vmlaq_n_f32(accl, vaddq_f32(f3l, f4l), ky[k]);
acch = vmlaq_n_f32(acch, vaddq_f32(f3h, f4h), ky[k]);
}
int32x4_t s32l, s32h;
s32l = vcvtq_s32_f32(accl);
s32h = vcvtq_s32_f32(acch);
int16x4_t s16l, s16h;
s16l = vqmovn_s32(s32l);
s16h = vqmovn_s32(s32h);
uint8x8_t u8;
u8 = vqmovun_s16(vcombine_s16(s16l, s16h));
vst1_u8((uint8_t *)(dst + i), u8);
}
}
else
{
float32x2_t k32;
k32 = vdup_n_f32(0);
k32 = vld1_lane_f32(ky + 1, k32, 1);
for( ; i <= width - 8; i += 8 )
{
float32x4_t accl, acch;
float32x4_t f1l, f1h, f2l, f2h;
S = src[1] + i;
S2 = src[-1] + i;
f1l = vcvtq_f32_s32( vld1q_s32(S) );
f1h = vcvtq_f32_s32( vld1q_s32(S + 4) );
f2l = vcvtq_f32_s32( vld1q_s32(S2) );
f2h = vcvtq_f32_s32( vld1q_s32(S2 + 4) );
accl = acch = d4;
accl = vmlaq_lane_f32(accl, vsubq_f32(f1l, f2l), k32, 1);
acch = vmlaq_lane_f32(acch, vsubq_f32(f1h, f2h), k32, 1);
for( k = 2; k <= ksize2; k++ )
{
S = src[k] + i;
S2 = src[-k] + i;
float32x4_t f3l, f3h, f4l, f4h;
f3l = vcvtq_f32_s32( vld1q_s32(S) );
f3h = vcvtq_f32_s32( vld1q_s32(S + 4) );
f4l = vcvtq_f32_s32( vld1q_s32(S2) );
f4h = vcvtq_f32_s32( vld1q_s32(S2 + 4) );
accl = vmlaq_n_f32(accl, vsubq_f32(f3l, f4l), ky[k]);
acch = vmlaq_n_f32(acch, vsubq_f32(f3h, f4h), ky[k]);
}
int32x4_t s32l, s32h;
s32l = vcvtq_s32_f32(accl);
s32h = vcvtq_s32_f32(acch);
int16x4_t s16l, s16h;
s16l = vqmovn_s32(s32l);
s16h = vqmovn_s32(s32h);
uint8x8_t u8;
u8 = vqmovun_s16(vcombine_s16(s16l, s16h));
vst1_u8((uint8_t *)(dst + i), u8);
}
}
return i;
}
int symmetryType;
float delta;
Mat kernel;
};
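// 3-tap column pass producing 16-bit output, with integer fast paths for the
// common [1 2 1], [1 -2 1] and [3 10 3] kernels (e.g. Sobel/Scharr derivatives)
// and a float path for arbitrary taps.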
struct SymmColumnSmallVec_32s16s
{
SymmColumnSmallVec_32s16s() { symmetryType=0; }
SymmColumnSmallVec_32s16s(const Mat& _kernel, int _symmetryType, int _bits, double _delta)
{
symmetryType = _symmetryType;
_kernel.convertTo(kernel, CV_32F, 1./(1 << _bits), 0);
delta = (float)(_delta/(1 << _bits));
CV_Assert( (symmetryType & (KERNEL_SYMMETRICAL | KERNEL_ASYMMETRICAL)) != 0 );
}
int operator()(const uchar** _src, uchar* _dst, int width) const
{
// Uncomment the following two lines once runtime NEON detection is implemented.
// if( !checkHardwareSupport(CV_CPU_NEON) )
// return 0;
int ksize2 = (kernel.rows + kernel.cols - 1)/2;
const float* ky = kernel.ptr<float>() + ksize2;
int i = 0;
bool symmetrical = (symmetryType & KERNEL_SYMMETRICAL) != 0;
const int** src = (const int**)_src;
const int *S0 = src[-1], *S1 = src[0], *S2 = src[1];
short* dst = (short*)_dst;
float32x4_t df4 = vdupq_n_f32(delta);
int32x4_t d4 = vcvtq_s32_f32(df4);
if( symmetrical )
{
if( ky[0] == 2 && ky[1] == 1 )
{
for( ; i <= width - 4; i += 4 )
{
int32x4_t x0, x1, x2;
x0 = vld1q_s32((int32_t const *)(S0 + i));
x1 = vld1q_s32((int32_t const *)(S1 + i));
x2 = vld1q_s32((int32_t const *)(S2 + i));
int32x4_t y0, y1, y2, y3;
y0 = vaddq_s32(x0, x2);
y1 = vqshlq_n_s32(x1, 1);
y2 = vaddq_s32(y0, y1);
y3 = vaddq_s32(y2, d4);
int16x4_t t;
t = vqmovn_s32(y3);
vst1_s16((int16_t *)(dst + i), t);
}
}
else if( ky[0] == -2 && ky[1] == 1 )
{
for( ; i <= width - 4; i += 4 )
{
int32x4_t x0, x1, x2;
x0 = vld1q_s32((int32_t const *)(S0 + i));
x1 = vld1q_s32((int32_t const *)(S1 + i));
x2 = vld1q_s32((int32_t const *)(S2 + i));
int32x4_t y0, y1, y2, y3;
y0 = vaddq_s32(x0, x2);
y1 = vqshlq_n_s32(x1, 1);
y2 = vsubq_s32(y0, y1);
y3 = vaddq_s32(y2, d4);
int16x4_t t;
t = vqmovn_s32(y3);
vst1_s16((int16_t *)(dst + i), t);
}
}
else if( ky[0] == 10 && ky[1] == 3 )
{
for( ; i <= width - 4; i += 4 )
{
int32x4_t x0, x1, x2, x3;
x0 = vld1q_s32((int32_t const *)(S0 + i));
x1 = vld1q_s32((int32_t const *)(S1 + i));
x2 = vld1q_s32((int32_t const *)(S2 + i));
x3 = vaddq_s32(x0, x2);
int32x4_t y0;
y0 = vmlaq_n_s32(d4, x1, 10);
y0 = vmlaq_n_s32(y0, x3, 3);
int16x4_t t;
t = vqmovn_s32(y0);
vst1_s16((int16_t *)(dst + i), t);
}
}
else
{
float32x2_t k32 = vdup_n_f32(0);
k32 = vld1_lane_f32(ky, k32, 0);
k32 = vld1_lane_f32(ky + 1, k32, 1);
for( ; i <= width - 4; i += 4 )
{
int32x4_t x0, x1, x2, x3, x4;
x0 = vld1q_s32((int32_t const *)(S0 + i));
x1 = vld1q_s32((int32_t const *)(S1 + i));
x2 = vld1q_s32((int32_t const *)(S2 + i));
x3 = vaddq_s32(x0, x2);
float32x4_t s0, s1, s2;
s0 = vcvtq_f32_s32(x1);
s1 = vcvtq_f32_s32(x3);
s2 = vmlaq_lane_f32(df4, s0, k32, 0);
s2 = vmlaq_lane_f32(s2, s1, k32, 1);
x4 = vcvtq_s32_f32(s2);
int16x4_t x5;
x5 = vqmovn_s32(x4);
vst1_s16((int16_t *)(dst + i), x5);
}
}
}
else
{
if( fabs(ky[1]) == 1 && ky[1] == -ky[-1] )
{
if( ky[1] < 0 )
std::swap(S0, S2);
for( ; i <= width - 4; i += 4 )
{
int32x4_t x0, x1;
x0 = vld1q_s32((int32_t const *)(S0 + i));
x1 = vld1q_s32((int32_t const *)(S2 + i));
int32x4_t y0, y1;
y0 = vsubq_s32(x1, x0);
y1 = vqaddq_s32(y0, d4);
int16x4_t t;
t = vqmovn_s32(y1);
vst1_s16((int16_t *)(dst + i), t);
}
}
else
{
float32x2_t k32 = vdup_n_f32(0);
k32 = vld1_lane_f32(ky + 1, k32, 1);
for( ; i <= width - 4; i += 4 )
{
int32x4_t x0, x1, x2, x3;
x0 = vld1q_s32((int32_t const *)(S0 + i));
x1 = vld1q_s32((int32_t const *)(S2 + i));
x2 = vsubq_s32(x1, x0);
float32x4_t s0, s1;
s0 = vcvtq_f32_s32(x2);
s1 = vmlaq_lane_f32(df4, s0, k32, 1);
x3 = vcvtq_s32_f32(s1);
int16x4_t x4;
x4 = vqmovn_s32(x3);
vst1_s16((int16_t *)(dst + i), x4);
}
}
}
return i;
}
int symmetryType;
float delta;
Mat kernel;
};
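// Column pass over float rows producing 16-bit output; the symmetric and
// antisymmetric branches differ only in adding versus subtracting the mirrored rows.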
struct SymmColumnVec_32f16s
{
SymmColumnVec_32f16s() { symmetryType=0; }
SymmColumnVec_32f16s(const Mat& _kernel, int _symmetryType, int, double _delta)
{
symmetryType = _symmetryType;
kernel = _kernel;
delta = (float)_delta;
CV_Assert( (symmetryType & (KERNEL_SYMMETRICAL | KERNEL_ASYMMETRICAL)) != 0 );
// Uncomment the following line once runtime NEON detection is implemented.
// neon_supported = checkHardwareSupport(CV_CPU_NEON);
}
int operator()(const uchar** _src, uchar* _dst, int width) const
{
// Uncomment the following two lines once runtime NEON detection is implemented.
// if( !neon_supported )
// return 0;
int _ksize = kernel.rows + kernel.cols - 1;
int ksize2 = _ksize / 2;
const float* ky = kernel.ptr<float>() + ksize2;
int i = 0, k;
bool symmetrical = (symmetryType & KERNEL_SYMMETRICAL) != 0;
const float** src = (const float**)_src;
const float *S, *S2;
short* dst = (short*)_dst;
float32x4_t d4 = vdupq_n_f32(delta);
if( symmetrical )
{
if( _ksize == 1 )
return 0;
float32x2_t k32;
k32 = vdup_n_f32(0);
k32 = vld1_lane_f32(ky, k32, 0);
k32 = vld1_lane_f32(ky + 1, k32, 1);
for( ; i <= width - 8; i += 8 )
{
float32x4_t x0l, x0h, x1l, x1h, x2l, x2h;
float32x4_t accl, acch;
S = src[0] + i;
x0l = vld1q_f32(S);
x0h = vld1q_f32(S + 4);
S = src[1] + i;
S2 = src[-1] + i;
x1l = vld1q_f32(S);
x1h = vld1q_f32(S + 4);
x2l = vld1q_f32(S2);
x2h = vld1q_f32(S2 + 4);
accl = acch = d4;
accl = vmlaq_lane_f32(accl, x0l, k32, 0);
acch = vmlaq_lane_f32(acch, x0h, k32, 0);
accl = vmlaq_lane_f32(accl, vaddq_f32(x1l, x2l), k32, 1);
acch = vmlaq_lane_f32(acch, vaddq_f32(x1h, x2h), k32, 1);
for( k = 2; k <= ksize2; k++ )
{
S = src[k] + i;
S2 = src[-k] + i;
float32x4_t x3l, x3h, x4l, x4h;
x3l = vld1q_f32(S);
x3h = vld1q_f32(S + 4);
x4l = vld1q_f32(S2);
x4h = vld1q_f32(S2 + 4);
accl = vmlaq_n_f32(accl, vaddq_f32(x3l, x4l), ky[k]);
acch = vmlaq_n_f32(acch, vaddq_f32(x3h, x4h), ky[k]);
}
int32x4_t s32l, s32h;
s32l = vcvtq_s32_f32(accl);
s32h = vcvtq_s32_f32(acch);
int16x4_t s16l, s16h;
s16l = vqmovn_s32(s32l);
s16h = vqmovn_s32(s32h);
vst1_s16((int16_t *)(dst + i), s16l);
vst1_s16((int16_t *)(dst + i + 4), s16h);
}
}
else
{
float32x2_t k32;
k32 = vdup_n_f32(0);
k32 = vld1_lane_f32(ky + 1, k32, 1);
for( ; i <= width - 8; i += 8 )
{
float32x4_t x1l, x1h, x2l, x2h;
float32x4_t accl, acch;
S = src[1] + i;
S2 = src[-1] + i;
x1l = vld1q_f32(S);
x1h = vld1q_f32(S + 4);
x2l = vld1q_f32(S2);
x2h = vld1q_f32(S2 + 4);
accl = acch = d4;
accl = vmlaq_lane_f32(accl, vsubq_f32(x1l, x2l), k32, 1);
acch = vmlaq_lane_f32(acch, vsubq_f32(x1h, x2h), k32, 1);
for( k = 2; k <= ksize2; k++ )
{
S = src[k] + i;
S2 = src[-k] + i;
float32x4_t x3l, x3h, x4l, x4h;
x3l = vld1q_f32(S);
x3h = vld1q_f32(S + 4);
x4l = vld1q_f32(S2);
x4h = vld1q_f32(S2 + 4);
accl = vmlaq_n_f32(accl, vsubq_f32(x3l, x4l), ky[k]);
acch = vmlaq_n_f32(acch, vsubq_f32(x3h, x4h), ky[k]);
}
int32x4_t s32l, s32h;
s32l = vcvtq_s32_f32(accl);
s32h = vcvtq_s32_f32(acch);
int16x4_t s16l, s16h;
s16l = vqmovn_s32(s32l);
s16h = vqmovn_s32(s32h);
vst1_s16((int16_t *)(dst + i), s16l);
vst1_s16((int16_t *)(dst + i + 4), s16h);
}
}
return i;
}
int symmetryType;
float delta;
Mat kernel;
bool neon_supported;
};
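// Row pass over float data: only the generic 5-tap cases are vectorized here;
// the other branches return 0 so the caller falls back to the scalar row filter.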
struct SymmRowSmallVec_32f
{
SymmRowSmallVec_32f() {}
SymmRowSmallVec_32f( const Mat& _kernel, int _symmetryType )
{
kernel = _kernel;
symmetryType = _symmetryType;
}
int operator()(const uchar* _src, uchar* _dst, int width, int cn) const
{
// Uncomment the following two lines once runtime NEON detection is implemented.
// if( !checkHardwareSupport(CV_CPU_NEON) )
// return 0;
int i = 0, _ksize = kernel.rows + kernel.cols - 1;
float* dst = (float*)_dst;
const float* src = (const float*)_src + (_ksize/2)*cn;
bool symmetrical = (symmetryType & KERNEL_SYMMETRICAL) != 0;
const float* kx = kernel.ptr<float>() + _ksize/2;
width *= cn;
if( symmetrical )
{
if( _ksize == 1 )
return 0;
if( _ksize == 3 )
{
if( kx[0] == 2 && kx[1] == 1 )
return 0;
else if( kx[0] == -2 && kx[1] == 1 )
return 0;
else
{
return 0;
}
}
else if( _ksize == 5 )
{
if( kx[0] == -2 && kx[1] == 0 && kx[2] == 1 )
return 0;
else
{
float32x2_t k0, k1;
k0 = k1 = vdup_n_f32(0);
k0 = vld1_lane_f32(kx + 0, k0, 0);
k0 = vld1_lane_f32(kx + 1, k0, 1);
k1 = vld1_lane_f32(kx + 2, k1, 0);
for( ; i <= width - 4; i += 4, src += 4 )
{
float32x4_t x0, x1, x2, x3, x4;
x0 = vld1q_f32(src);
x1 = vld1q_f32(src - cn);
x2 = vld1q_f32(src + cn);
x3 = vld1q_f32(src - cn*2);
x4 = vld1q_f32(src + cn*2);
float32x4_t y0;
y0 = vmulq_lane_f32(x0, k0, 0);
y0 = vmlaq_lane_f32(y0, vaddq_f32(x1, x2), k0, 1);
y0 = vmlaq_lane_f32(y0, vaddq_f32(x3, x4), k1, 0);
vst1q_f32(dst + i, y0);
}
}
}
}
else
{
if( _ksize == 3 )
{
if( kx[0] == 0 && kx[1] == 1 )
return 0;
else
{
return 0;
}
}
else if( _ksize == 5 )
{
float32x2_t k;
k = vdup_n_f32(0);
k = vld1_lane_f32(kx + 1, k, 0);
k = vld1_lane_f32(kx + 2, k, 1);
for( ; i <= width - 4; i += 4, src += 4 )
{
float32x4_t x0, x1, x2, x3;
x0 = vld1q_f32(src - cn);
x1 = vld1q_f32(src + cn);
x2 = vld1q_f32(src - cn*2);
x3 = vld1q_f32(src + cn*2);
float32x4_t y0;
y0 = vmulq_lane_f32(vsubq_f32(x1, x0), k, 0);
y0 = vmlaq_lane_f32(y0, vsubq_f32(x3, x2), k, 1);
vst1q_f32(dst + i, y0);
}
}
}
return i;
}
Mat kernel;
int symmetryType;
};
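// The remaining kernel types have no NEON implementation yet and reuse the
// scalar (NoVec) fallbacks.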
typedef RowNoVec RowVec_8u32s;
typedef RowNoVec RowVec_16s32f;
typedef RowNoVec RowVec_32f;
typedef ColumnNoVec SymmColumnVec_32f;
typedef SymmColumnSmallNoVec SymmColumnSmallVec_32f;
typedef FilterNoVec FilterVec_8u;
typedef FilterNoVec FilterVec_8u16s;
typedef FilterNoVec FilterVec_32f;
#else
typedef RowNoVec RowVec_8u32s;
......