submodule / opencv · Commits

Commit 9ad1a848, authored Nov 15, 2018 by Vitaly Tuzov
Unrolled bilateral filter neighbor processing loop
parent f5b6bea2
Showing 1 changed file with 584 additions and 4 deletions.

modules/imgproc/src/bilateral_filter.cpp  (+584, -4)
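The change replaces each linear walk over the precomputed neighbor offsets with a loop that consumes four kernel taps per iteration, plus a remainder loop for the leftover taps; the pattern recurs in all four hunks below. As orientation, a minimal scalar sketch of the shape (process() and maxk are hypothetical stand-ins, not the commit's code):

    // Schematic 4x unrolling with a scalar remainder loop.
    int k = 0;
    for (; k <= maxk - 4; k += 4)   // main loop: four kernel taps per iteration
    {
        process(k);
        process(k + 1);
        process(k + 2);
        process(k + 3);
    }
    for (; k < maxk; k++)           // remainder loop: the 0..3 leftover taps
        process(k);

Handling four taps together lets the SIMD inner loops reuse one load of the reference pixels across all four taps, and lets the per-pixel tail pack the four taps into a single 128-bit vector.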
@@ -82,7 +82,84 @@ public:
         memset(buf.data(), 0, buf.size() * sizeof(float));
         float *sum = alignPtr(buf.data(), CV_SIMD_WIDTH);
         float *wsum = sum + alignSize(size.width, CV_SIMD_WIDTH);
-        for( k = 0; k < maxk; k++ )
+        k = 0;
+        for (; k <= maxk - 4; k += 4)
+        {
+            const uchar* ksptr0 = sptr + space_ofs[k];
+            const uchar* ksptr1 = sptr + space_ofs[k + 1];
+            const uchar* ksptr2 = sptr + space_ofs[k + 2];
+            const uchar* ksptr3 = sptr + space_ofs[k + 3];
+            j = 0;
+#if CV_SIMD
+            v_float32 kweight0 = vx_setall_f32(space_weight[k]);
+            v_float32 kweight1 = vx_setall_f32(space_weight[k + 1]);
+            v_float32 kweight2 = vx_setall_f32(space_weight[k + 2]);
+            v_float32 kweight3 = vx_setall_f32(space_weight[k + 3]);
+            for (; j <= size.width - v_float32::nlanes; j += v_float32::nlanes)
+            {
+                v_uint32 rval = vx_load_expand_q(sptr + j);
+
+                v_uint32 val = vx_load_expand_q(ksptr0 + j);
+                v_float32 w = kweight0 * v_lut(color_weight, v_reinterpret_as_s32(v_absdiff(val, rval)));
+                v_float32 v_wsum = vx_load_aligned(wsum + j) + w;
+                v_float32 v_sum = v_muladd(v_cvt_f32(v_reinterpret_as_s32(val)), w, vx_load_aligned(sum + j));
+
+                val = vx_load_expand_q(ksptr1 + j);
+                w = kweight1 * v_lut(color_weight, v_reinterpret_as_s32(v_absdiff(val, rval)));
+                v_wsum += w;
+                v_sum = v_muladd(v_cvt_f32(v_reinterpret_as_s32(val)), w, v_sum);
+
+                val = vx_load_expand_q(ksptr2 + j);
+                w = kweight2 * v_lut(color_weight, v_reinterpret_as_s32(v_absdiff(val, rval)));
+                v_wsum += w;
+                v_sum = v_muladd(v_cvt_f32(v_reinterpret_as_s32(val)), w, v_sum);
+
+                val = vx_load_expand_q(ksptr3 + j);
+                w = kweight3 * v_lut(color_weight, v_reinterpret_as_s32(v_absdiff(val, rval)));
+                v_wsum += w;
+                v_sum = v_muladd(v_cvt_f32(v_reinterpret_as_s32(val)), w, v_sum);
+
+                v_store_aligned(wsum + j, v_wsum);
+                v_store_aligned(sum + j, v_sum);
+            }
+#endif
+#if CV_SIMD128
+            v_float32x4 kweight4 = v_load(space_weight + k);
+#endif
+            for (; j < size.width; j++)
+            {
+#if CV_SIMD128
+                v_uint32x4 rval = v_setall_u32(sptr[j]);
+                v_uint32x4 val(ksptr0[j], ksptr1[j], ksptr2[j], ksptr3[j]);
+                v_float32x4 w = kweight4 * v_lut(color_weight, v_reinterpret_as_s32(v_absdiff(val, rval)));
+                wsum[j] += v_reduce_sum(w);
+                sum[j] += v_reduce_sum(v_cvt_f32(v_reinterpret_as_s32(val)) * w);
+#else
+                int rval = sptr[j];
+
+                int val = ksptr0[j];
+                float w = space_weight[k] * color_weight[std::abs(val - rval)];
+                wsum[j] += w;
+                sum[j] += val * w;
+
+                val = ksptr1[j];
+                w = space_weight[k + 1] * color_weight[std::abs(val - rval)];
+                wsum[j] += w;
+                sum[j] += val * w;
+
+                val = ksptr2[j];
+                w = space_weight[k + 2] * color_weight[std::abs(val - rval)];
+                wsum[j] += w;
+                sum[j] += val * w;
+
+                val = ksptr3[j];
+                w = space_weight[k + 3] * color_weight[std::abs(val - rval)];
+                wsum[j] += w;
+                sum[j] += val * w;
+#endif
+            }
+        }
+        for (; k < maxk; k++)
         {
             const uchar* ksptr = sptr + space_ofs[k];
             j = 0;
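A restatement of what the single-channel accumulation above computes. With $s_k = \texttt{space\_weight}[k]$, $c[\cdot]$ the precomputed color_weight LUT, $I_k(j) = \texttt{sptr}[\texttt{space\_ofs}[k] + j]$ the neighbor and $I_0(j) = \texttt{sptr}[j]$ the reference pixel, each tap contributes

    $$ w_k(j) = s_k \, c[\,|I_k(j) - I_0(j)|\,], \qquad \texttt{wsum}[j] \mathrel{+}= w_k(j), \qquad \texttt{sum}[j] \mathrel{+}= w_k(j)\, I_k(j), $$

and the filtered pixel is formed later as $\texttt{sum}[j]/\texttt{wsum}[j]$ in the row epilogue, which this diff leaves untouched. The unrolled SIMD body computes four such contributions per pass while loading the reference lanes (rval) only once.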
@@ -126,7 +203,232 @@ public:
         float *sum_g = sum_b + alignSize(size.width, CV_SIMD_WIDTH);
         float *sum_r = sum_g + alignSize(size.width, CV_SIMD_WIDTH);
         float *wsum = sum_r + alignSize(size.width, CV_SIMD_WIDTH);
-        for( k = 0; k < maxk; k++ )
+        k = 0;
+        for (; k <= maxk - 4; k += 4)
+        {
+            const uchar* ksptr0 = sptr + space_ofs[k];
+            const uchar* ksptr1 = sptr + space_ofs[k + 1];
+            const uchar* ksptr2 = sptr + space_ofs[k + 2];
+            const uchar* ksptr3 = sptr + space_ofs[k + 3];
+            const uchar* rsptr = sptr;
+            j = 0;
+#if CV_SIMD
+            v_float32 kweight0 = vx_setall_f32(space_weight[k]);
+            v_float32 kweight1 = vx_setall_f32(space_weight[k + 1]);
+            v_float32 kweight2 = vx_setall_f32(space_weight[k + 2]);
+            v_float32 kweight3 = vx_setall_f32(space_weight[k + 3]);
+            for (; j <= size.width - v_uint8::nlanes; j += v_uint8::nlanes, rsptr += 3 * v_uint8::nlanes,
+                    ksptr0 += 3 * v_uint8::nlanes, ksptr1 += 3 * v_uint8::nlanes, ksptr2 += 3 * v_uint8::nlanes, ksptr3 += 3 * v_uint8::nlanes)
+            {
+                v_uint8 kb, kg, kr, rb, rg, rr;
+                v_load_deinterleave(rsptr, rb, rg, rr);
+
+                v_load_deinterleave(ksptr0, kb, kg, kr);
+                v_uint16 val0, val1, val2, val3, val4;
+                v_expand(v_absdiff(kb, rb), val0, val1);
+                v_expand(v_absdiff(kg, rg), val2, val3);
+                val0 += val2;
+                val1 += val3;
+                v_expand(v_absdiff(kr, rr), val2, val3);
+                val0 += val2;
+                val1 += val3;
+
+                v_uint32 vall, valh;
+                v_expand(val0, vall, valh);
+                v_float32 w0 = kweight0 * v_lut(color_weight, v_reinterpret_as_s32(vall));
+                v_float32 w1 = kweight0 * v_lut(color_weight, v_reinterpret_as_s32(valh));
+                v_store_aligned(wsum + j, w0 + vx_load_aligned(wsum + j));
+                v_store_aligned(wsum + j + v_float32::nlanes, w1 + vx_load_aligned(wsum + j + v_float32::nlanes));
+                v_expand(kb, val0, val2);
+                v_expand(val0, vall, valh);
+                v_store_aligned(sum_b + j, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_b + j)));
+                v_store_aligned(sum_b + j + v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_b + j + v_float32::nlanes)));
+                v_expand(kg, val0, val3);
+                v_expand(val0, vall, valh);
+                v_store_aligned(sum_g + j, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_g + j)));
+                v_store_aligned(sum_g + j + v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_g + j + v_float32::nlanes)));
+                v_expand(kr, val0, val4);
+                v_expand(val0, vall, valh);
+                v_store_aligned(sum_r + j, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_r + j)));
+                v_store_aligned(sum_r + j + v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_r + j + v_float32::nlanes)));
+                v_expand(val1, vall, valh);
+                w0 = kweight0 * v_lut(color_weight, v_reinterpret_as_s32(vall));
+                w1 = kweight0 * v_lut(color_weight, v_reinterpret_as_s32(valh));
+                v_store_aligned(wsum + j + 2 * v_float32::nlanes, w0 + vx_load_aligned(wsum + j + 2 * v_float32::nlanes));
+                v_store_aligned(wsum + j + 3 * v_float32::nlanes, w1 + vx_load_aligned(wsum + j + 3 * v_float32::nlanes));
+                v_expand(val2, vall, valh);
+                v_store_aligned(sum_b + j + 2 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_b + j + 2 * v_float32::nlanes)));
+                v_store_aligned(sum_b + j + 3 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_b + j + 3 * v_float32::nlanes)));
+                v_expand(val3, vall, valh);
+                v_store_aligned(sum_g + j + 2 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_g + j + 2 * v_float32::nlanes)));
+                v_store_aligned(sum_g + j + 3 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_g + j + 3 * v_float32::nlanes)));
+                v_expand(val4, vall, valh);
+                v_store_aligned(sum_r + j + 2 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_r + j + 2 * v_float32::nlanes)));
+                v_store_aligned(sum_r + j + 3 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_r + j + 3 * v_float32::nlanes)));
+
+                v_load_deinterleave(ksptr1, kb, kg, kr);
+                v_expand(v_absdiff(kb, rb), val0, val1);
+                v_expand(v_absdiff(kg, rg), val2, val3);
+                val0 += val2;
+                val1 += val3;
+                v_expand(v_absdiff(kr, rr), val2, val3);
+                val0 += val2;
+                val1 += val3;
+
+                v_expand(val0, vall, valh);
+                w0 = kweight1 * v_lut(color_weight, v_reinterpret_as_s32(vall));
+                w1 = kweight1 * v_lut(color_weight, v_reinterpret_as_s32(valh));
+                v_store_aligned(wsum + j, w0 + vx_load_aligned(wsum + j));
+                v_store_aligned(wsum + j + v_float32::nlanes, w1 + vx_load_aligned(wsum + j + v_float32::nlanes));
+                v_expand(kb, val0, val2);
+                v_expand(val0, vall, valh);
+                v_store_aligned(sum_b + j, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_b + j)));
+                v_store_aligned(sum_b + j + v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_b + j + v_float32::nlanes)));
+                v_expand(kg, val0, val3);
+                v_expand(val0, vall, valh);
+                v_store_aligned(sum_g + j, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_g + j)));
+                v_store_aligned(sum_g + j + v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_g + j + v_float32::nlanes)));
+                v_expand(kr, val0, val4);
+                v_expand(val0, vall, valh);
+                v_store_aligned(sum_r + j, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_r + j)));
+                v_store_aligned(sum_r + j + v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_r + j + v_float32::nlanes)));
+                v_expand(val1, vall, valh);
+                w0 = kweight1 * v_lut(color_weight, v_reinterpret_as_s32(vall));
+                w1 = kweight1 * v_lut(color_weight, v_reinterpret_as_s32(valh));
+                v_store_aligned(wsum + j + 2 * v_float32::nlanes, w0 + vx_load_aligned(wsum + j + 2 * v_float32::nlanes));
+                v_store_aligned(wsum + j + 3 * v_float32::nlanes, w1 + vx_load_aligned(wsum + j + 3 * v_float32::nlanes));
+                v_expand(val2, vall, valh);
+                v_store_aligned(sum_b + j + 2 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_b + j + 2 * v_float32::nlanes)));
+                v_store_aligned(sum_b + j + 3 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_b + j + 3 * v_float32::nlanes)));
+                v_expand(val3, vall, valh);
+                v_store_aligned(sum_g + j + 2 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_g + j + 2 * v_float32::nlanes)));
+                v_store_aligned(sum_g + j + 3 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_g + j + 3 * v_float32::nlanes)));
+                v_expand(val4, vall, valh);
+                v_store_aligned(sum_r + j + 2 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_r + j + 2 * v_float32::nlanes)));
+                v_store_aligned(sum_r + j + 3 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_r + j + 3 * v_float32::nlanes)));
+
+                v_load_deinterleave(ksptr2, kb, kg, kr);
+                v_expand(v_absdiff(kb, rb), val0, val1);
+                v_expand(v_absdiff(kg, rg), val2, val3);
+                val0 += val2;
+                val1 += val3;
+                v_expand(v_absdiff(kr, rr), val2, val3);
+                val0 += val2;
+                val1 += val3;
+
+                v_expand(val0, vall, valh);
+                w0 = kweight2 * v_lut(color_weight, v_reinterpret_as_s32(vall));
+                w1 = kweight2 * v_lut(color_weight, v_reinterpret_as_s32(valh));
+                v_store_aligned(wsum + j, w0 + vx_load_aligned(wsum + j));
+                v_store_aligned(wsum + j + v_float32::nlanes, w1 + vx_load_aligned(wsum + j + v_float32::nlanes));
+                v_expand(kb, val0, val2);
+                v_expand(val0, vall, valh);
+                v_store_aligned(sum_b + j, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_b + j)));
+                v_store_aligned(sum_b + j + v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_b + j + v_float32::nlanes)));
+                v_expand(kg, val0, val3);
+                v_expand(val0, vall, valh);
+                v_store_aligned(sum_g + j, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_g + j)));
+                v_store_aligned(sum_g + j + v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_g + j + v_float32::nlanes)));
+                v_expand(kr, val0, val4);
+                v_expand(val0, vall, valh);
+                v_store_aligned(sum_r + j, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_r + j)));
+                v_store_aligned(sum_r + j + v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_r + j + v_float32::nlanes)));
+                v_expand(val1, vall, valh);
+                w0 = kweight2 * v_lut(color_weight, v_reinterpret_as_s32(vall));
+                w1 = kweight2 * v_lut(color_weight, v_reinterpret_as_s32(valh));
+                v_store_aligned(wsum + j + 2 * v_float32::nlanes, w0 + vx_load_aligned(wsum + j + 2 * v_float32::nlanes));
+                v_store_aligned(wsum + j + 3 * v_float32::nlanes, w1 + vx_load_aligned(wsum + j + 3 * v_float32::nlanes));
+                v_expand(val2, vall, valh);
+                v_store_aligned(sum_b + j + 2 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_b + j + 2 * v_float32::nlanes)));
+                v_store_aligned(sum_b + j + 3 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_b + j + 3 * v_float32::nlanes)));
+                v_expand(val3, vall, valh);
+                v_store_aligned(sum_g + j + 2 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_g + j + 2 * v_float32::nlanes)));
+                v_store_aligned(sum_g + j + 3 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_g + j + 3 * v_float32::nlanes)));
+                v_expand(val4, vall, valh);
+                v_store_aligned(sum_r + j + 2 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_r + j + 2 * v_float32::nlanes)));
+                v_store_aligned(sum_r + j + 3 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_r + j + 3 * v_float32::nlanes)));
+
+                v_load_deinterleave(ksptr3, kb, kg, kr);
+                v_expand(v_absdiff(kb, rb), val0, val1);
+                v_expand(v_absdiff(kg, rg), val2, val3);
+                val0 += val2;
+                val1 += val3;
+                v_expand(v_absdiff(kr, rr), val2, val3);
+                val0 += val2;
+                val1 += val3;
+
+                v_expand(val0, vall, valh);
+                w0 = kweight3 * v_lut(color_weight, v_reinterpret_as_s32(vall));
+                w1 = kweight3 * v_lut(color_weight, v_reinterpret_as_s32(valh));
+                v_store_aligned(wsum + j, w0 + vx_load_aligned(wsum + j));
+                v_store_aligned(wsum + j + v_float32::nlanes, w1 + vx_load_aligned(wsum + j + v_float32::nlanes));
+                v_expand(kb, val0, val2);
+                v_expand(val0, vall, valh);
+                v_store_aligned(sum_b + j, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_b + j)));
+                v_store_aligned(sum_b + j + v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_b + j + v_float32::nlanes)));
+                v_expand(kg, val0, val3);
+                v_expand(val0, vall, valh);
+                v_store_aligned(sum_g + j, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_g + j)));
+                v_store_aligned(sum_g + j + v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_g + j + v_float32::nlanes)));
+                v_expand(kr, val0, val4);
+                v_expand(val0, vall, valh);
+                v_store_aligned(sum_r + j, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_r + j)));
+                v_store_aligned(sum_r + j + v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_r + j + v_float32::nlanes)));
+                v_expand(val1, vall, valh);
+                w0 = kweight3 * v_lut(color_weight, v_reinterpret_as_s32(vall));
+                w1 = kweight3 * v_lut(color_weight, v_reinterpret_as_s32(valh));
+                v_store_aligned(wsum + j + 2 * v_float32::nlanes, w0 + vx_load_aligned(wsum + j + 2 * v_float32::nlanes));
+                v_store_aligned(wsum + j + 3 * v_float32::nlanes, w1 + vx_load_aligned(wsum + j + 3 * v_float32::nlanes));
+                v_expand(val2, vall, valh);
+                v_store_aligned(sum_b + j + 2 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_b + j + 2 * v_float32::nlanes)));
+                v_store_aligned(sum_b + j + 3 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_b + j + 3 * v_float32::nlanes)));
+                v_expand(val3, vall, valh);
+                v_store_aligned(sum_g + j + 2 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_g + j + 2 * v_float32::nlanes)));
+                v_store_aligned(sum_g + j + 3 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_g + j + 3 * v_float32::nlanes)));
+                v_expand(val4, vall, valh);
+                v_store_aligned(sum_r + j + 2 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_r + j + 2 * v_float32::nlanes)));
+                v_store_aligned(sum_r + j + 3 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_r + j + 3 * v_float32::nlanes)));
+            }
+#endif
+#if CV_SIMD128
+            v_float32x4 kweight4 = v_load(space_weight + k);
+#endif
+            for (; j < size.width; j++, rsptr += 3, ksptr0 += 3, ksptr1 += 3, ksptr2 += 3, ksptr3 += 3)
+            {
+#if CV_SIMD128
+                v_uint32x4 rb = v_setall_u32(rsptr[0]);
+                v_uint32x4 rg = v_setall_u32(rsptr[1]);
+                v_uint32x4 rr = v_setall_u32(rsptr[2]);
+                v_uint32x4 b(ksptr0[0], ksptr1[0], ksptr2[0], ksptr3[0]);
+                v_uint32x4 g(ksptr0[1], ksptr1[1], ksptr2[1], ksptr3[1]);
+                v_uint32x4 r(ksptr0[2], ksptr1[2], ksptr2[2], ksptr3[2]);
+                v_float32x4 w = kweight4 * v_lut(color_weight, v_reinterpret_as_s32(v_absdiff(b, rb) + v_absdiff(g, rg) + v_absdiff(r, rr)));
+                wsum[j] += v_reduce_sum(w);
+                sum_b[j] += v_reduce_sum(v_cvt_f32(v_reinterpret_as_s32(b)) * w);
+                sum_g[j] += v_reduce_sum(v_cvt_f32(v_reinterpret_as_s32(g)) * w);
+                sum_r[j] += v_reduce_sum(v_cvt_f32(v_reinterpret_as_s32(r)) * w);
+#else
+                int rb = rsptr[0], rg = rsptr[1], rr = rsptr[2];
+
+                int b = ksptr0[0], g = ksptr0[1], r = ksptr0[2];
+                float w = space_weight[k] * color_weight[std::abs(b - rb) + std::abs(g - rg) + std::abs(r - rr)];
+                wsum[j] += w;
+                sum_b[j] += b * w;
+                sum_g[j] += g * w;
+                sum_r[j] += r * w;
+
+                b = ksptr1[0];
+                g = ksptr1[1];
+                r = ksptr1[2];
+                w = space_weight[k + 1] * color_weight[std::abs(b - rb) + std::abs(g - rg) + std::abs(r - rr)];
+                wsum[j] += w;
+                sum_b[j] += b * w;
+                sum_g[j] += g * w;
+                sum_r[j] += r * w;
+
+                b = ksptr2[0];
+                g = ksptr2[1];
+                r = ksptr2[2];
+                w = space_weight[k + 2] * color_weight[std::abs(b - rb) + std::abs(g - rg) + std::abs(r - rr)];
+                wsum[j] += w;
+                sum_b[j] += b * w;
+                sum_g[j] += g * w;
+                sum_r[j] += r * w;
+
+                b = ksptr3[0];
+                g = ksptr3[1];
+                r = ksptr3[2];
+                w = space_weight[k + 3] * color_weight[std::abs(b - rb) + std::abs(g - rg) + std::abs(r - rr)];
+                wsum[j] += w;
+                sum_b[j] += b * w;
+                sum_g[j] += g * w;
+                sum_r[j] += r * w;
+#endif
+            }
+        }
+        for (; k < maxk; k++)
         {
             const uchar* ksptr = sptr + space_ofs[k];
             const uchar* rsptr = sptr;
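A note on the three-channel hunk above, reconstructed from its scalar #else branch: the color distance is the L1 sum of per-channel absolute differences, so the same one-dimensional color_weight LUT serves all of B, G, R:

    $$ d_k(j) = |\Delta b| + |\Delta g| + |\Delta r| \in [0, 3 \cdot 255] = [0, 765], \qquad w_k(j) = s_k \, c[\, d_k(j) \,]. $$

Since $d_k(j) \le 765 < 2^{16}$, the uchar differences can be summed right after one v_expand to 16 bits with no risk of overflow, and are widened to 32 bits only at the end for the v_lut gather; the LUT itself is sized for the full $3 \cdot 255$ range by the filter's setup code (outside this diff).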
@@ -421,7 +723,130 @@ public:
         v_float32 v_one = vx_setall_f32(1.f);
         v_float32 sindex = vx_setall_f32(scale_index);
 #endif
-        for( k = 0; k < maxk; k++ )
+        k = 0;
+        for (; k <= maxk - 4; k += 4)
+        {
+            const float* ksptr0 = sptr + space_ofs[k];
+            const float* ksptr1 = sptr + space_ofs[k + 1];
+            const float* ksptr2 = sptr + space_ofs[k + 2];
+            const float* ksptr3 = sptr + space_ofs[k + 3];
+            j = 0;
+#if CV_SIMD
+            v_float32 kweight0 = vx_setall_f32(space_weight[k]);
+            v_float32 kweight1 = vx_setall_f32(space_weight[k + 1]);
+            v_float32 kweight2 = vx_setall_f32(space_weight[k + 2]);
+            v_float32 kweight3 = vx_setall_f32(space_weight[k + 3]);
+            for (; j <= size.width - v_float32::nlanes; j += v_float32::nlanes)
+            {
+                v_float32 rval = vx_load(sptr + j);
+
+                v_float32 val = vx_load(ksptr0 + j);
+                v_float32 knan = v_not_nan(val);
+                v_float32 alpha = (v_absdiff(val, rval) * sindex) & v_not_nan(rval) & knan;
+                v_int32 idx = v_trunc(alpha);
+                alpha -= v_cvt_f32(idx);
+                v_float32 w = (kweight0 * v_muladd(v_lut(expLUT + 1, idx), alpha, v_lut(expLUT, idx) * (v_one - alpha))) & knan;
+                v_float32 v_wsum = vx_load_aligned(wsum + j) + w;
+                v_float32 v_sum = v_muladd(val & knan, w, vx_load_aligned(sum + j));
+
+                val = vx_load(ksptr1 + j);
+                knan = v_not_nan(val);
+                alpha = (v_absdiff(val, rval) * sindex) & v_not_nan(rval) & knan;
+                idx = v_trunc(alpha);
+                alpha -= v_cvt_f32(idx);
+                w = (kweight1 * v_muladd(v_lut(expLUT + 1, idx), alpha, v_lut(expLUT, idx) * (v_one - alpha))) & knan;
+                v_wsum += w;
+                v_sum = v_muladd(val & knan, w, v_sum);
+
+                val = vx_load(ksptr2 + j);
+                knan = v_not_nan(val);
+                alpha = (v_absdiff(val, rval) * sindex) & v_not_nan(rval) & knan;
+                idx = v_trunc(alpha);
+                alpha -= v_cvt_f32(idx);
+                w = (kweight2 * v_muladd(v_lut(expLUT + 1, idx), alpha, v_lut(expLUT, idx) * (v_one - alpha))) & knan;
+                v_wsum += w;
+                v_sum = v_muladd(val & knan, w, v_sum);
+
+                val = vx_load(ksptr3 + j);
+                knan = v_not_nan(val);
+                alpha = (v_absdiff(val, rval) * sindex) & v_not_nan(rval) & knan;
+                idx = v_trunc(alpha);
+                alpha -= v_cvt_f32(idx);
+                w = (kweight3 * v_muladd(v_lut(expLUT + 1, idx), alpha, v_lut(expLUT, idx) * (v_one - alpha))) & knan;
+                v_wsum += w;
+                v_sum = v_muladd(val & knan, w, v_sum);
+
+                v_store_aligned(wsum + j, v_wsum);
+                v_store_aligned(sum + j, v_sum);
+            }
+#endif
+#if CV_SIMD128
+            v_float32x4 v_one4 = v_setall_f32(1.f);
+            v_float32x4 sindex4 = v_setall_f32(scale_index);
+            v_float32x4 kweight4 = v_load(space_weight + k);
+#endif
+            for (; j < size.width; j++)
+            {
+#if CV_SIMD128
+                v_float32x4 rval = v_setall_f32(sptr[j]);
+                v_float32x4 val(ksptr0[j], ksptr1[j], ksptr2[j], ksptr3[j]);
+                v_float32x4 knan = v_not_nan(val);
+                v_float32x4 alpha = (v_absdiff(val, rval) * sindex4) & v_not_nan(rval) & knan;
+                v_int32x4 idx = v_trunc(alpha);
+                alpha -= v_cvt_f32(idx);
+                v_float32x4 w = (kweight4 * v_muladd(v_lut(expLUT + 1, idx), alpha, v_lut(expLUT, idx) * (v_one4 - alpha))) & knan;
+                wsum[j] += v_reduce_sum(w);
+                sum[j] += v_reduce_sum((val & knan) * w);
+#else
+                float rval = sptr[j];
+
+                float val = ksptr0[j];
+                float alpha = std::abs(val - rval) * scale_index;
+                int idx = cvFloor(alpha);
+                alpha -= idx;
+                if (!cvIsNaN(val))
+                {
+                    float w = space_weight[k] * (cvIsNaN(rval) ? 1.f : (expLUT[idx] + alpha * (expLUT[idx + 1] - expLUT[idx])));
+                    wsum[j] += w;
+                    sum[j] += val * w;
+                }
+
+                val = ksptr1[j];
+                alpha = std::abs(val - rval) * scale_index;
+                idx = cvFloor(alpha);
+                alpha -= idx;
+                if (!cvIsNaN(val))
+                {
+                    float w = space_weight[k + 1] * (cvIsNaN(rval) ? 1.f : (expLUT[idx] + alpha * (expLUT[idx + 1] - expLUT[idx])));
+                    wsum[j] += w;
+                    sum[j] += val * w;
+                }
+
+                val = ksptr2[j];
+                alpha = std::abs(val - rval) * scale_index;
+                idx = cvFloor(alpha);
+                alpha -= idx;
+                if (!cvIsNaN(val))
+                {
+                    float w = space_weight[k + 2] * (cvIsNaN(rval) ? 1.f : (expLUT[idx] + alpha * (expLUT[idx + 1] - expLUT[idx])));
+                    wsum[j] += w;
+                    sum[j] += val * w;
+                }
+
+                val = ksptr3[j];
+                alpha = std::abs(val - rval) * scale_index;
+                idx = cvFloor(alpha);
+                alpha -= idx;
+                if (!cvIsNaN(val))
+                {
+                    float w = space_weight[k + 3] * (cvIsNaN(rval) ? 1.f : (expLUT[idx] + alpha * (expLUT[idx + 1] - expLUT[idx])));
+                    wsum[j] += w;
+                    sum[j] += val * w;
+                }
+#endif
+            }
+        }
+        for (; k < maxk; k++)
         {
             const float* ksptr = sptr + space_ofs[k];
             j = 0;
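The float variants cannot index a weight table by raw intensity difference, so both branches of the hunk above map the difference into LUT coordinates and linearly interpolate between adjacent expLUT entries. Reading off the scalar branch:

    $$ \alpha = |v - r| \cdot \texttt{scale\_index}, \quad i = \lfloor \alpha \rfloor, \quad \alpha \leftarrow \alpha - i, \qquad w_k = s_k \big( (1 - \alpha)\, \texttt{expLUT}[i] + \alpha\, \texttt{expLUT}[i + 1] \big). $$

The vector code expresses the same blend as v_muladd(v_lut(expLUT + 1, idx), alpha, v_lut(expLUT, idx) * (v_one - alpha)): two gathers, at idx and idx + 1, mixed by the fractional part.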
@@ -483,7 +908,162 @@ public:
         v_float32 v_one = vx_setall_f32(1.f);
         v_float32 sindex = vx_setall_f32(scale_index);
 #endif
-        for( k = 0; k < maxk; k++ )
+        k = 0;
+        for (; k <= maxk - 4; k += 4)
+        {
+            const float* ksptr0 = sptr + space_ofs[k];
+            const float* ksptr1 = sptr + space_ofs[k + 1];
+            const float* ksptr2 = sptr + space_ofs[k + 2];
+            const float* ksptr3 = sptr + space_ofs[k + 3];
+            const float* rsptr = sptr;
+            j = 0;
+#if CV_SIMD
+            v_float32 kweight0 = vx_setall_f32(space_weight[k]);
+            v_float32 kweight1 = vx_setall_f32(space_weight[k + 1]);
+            v_float32 kweight2 = vx_setall_f32(space_weight[k + 2]);
+            v_float32 kweight3 = vx_setall_f32(space_weight[k + 3]);
+            for (; j <= size.width - v_float32::nlanes; j += v_float32::nlanes, rsptr += 3 * v_float32::nlanes,
+                    ksptr0 += 3 * v_float32::nlanes, ksptr1 += 3 * v_float32::nlanes, ksptr2 += 3 * v_float32::nlanes, ksptr3 += 3 * v_float32::nlanes)
+            {
+                v_float32 kb, kg, kr, rb, rg, rr;
+                v_load_deinterleave(rsptr, rb, rg, rr);
+
+                v_load_deinterleave(ksptr0, kb, kg, kr);
+                v_float32 knan = v_not_nan(kb) & v_not_nan(kg) & v_not_nan(kr);
+                v_float32 alpha = ((v_absdiff(kb, rb) + v_absdiff(kg, rg) + v_absdiff(kr, rr)) * sindex) & v_not_nan(rb) & v_not_nan(rg) & v_not_nan(rr) & knan;
+                v_int32 idx = v_trunc(alpha);
+                alpha -= v_cvt_f32(idx);
+                v_float32 w = (kweight0 * v_muladd(v_lut(expLUT + 1, idx), alpha, v_lut(expLUT, idx) * (v_one - alpha))) & knan;
+                v_float32 v_wsum = vx_load_aligned(wsum + j) + w;
+                v_float32 v_sum_b = v_muladd(kb & knan, w, vx_load_aligned(sum_b + j));
+                v_float32 v_sum_g = v_muladd(kg & knan, w, vx_load_aligned(sum_g + j));
+                v_float32 v_sum_r = v_muladd(kr & knan, w, vx_load_aligned(sum_r + j));
+
+                v_load_deinterleave(ksptr1, kb, kg, kr);
+                knan = v_not_nan(kb) & v_not_nan(kg) & v_not_nan(kr);
+                alpha = ((v_absdiff(kb, rb) + v_absdiff(kg, rg) + v_absdiff(kr, rr)) * sindex) & v_not_nan(rb) & v_not_nan(rg) & v_not_nan(rr) & knan;
+                idx = v_trunc(alpha);
+                alpha -= v_cvt_f32(idx);
+                w = (kweight1 * v_muladd(v_lut(expLUT + 1, idx), alpha, v_lut(expLUT, idx) * (v_one - alpha))) & knan;
+                v_wsum += w;
+                v_sum_b = v_muladd(kb & knan, w, v_sum_b);
+                v_sum_g = v_muladd(kg & knan, w, v_sum_g);
+                v_sum_r = v_muladd(kr & knan, w, v_sum_r);
+
+                v_load_deinterleave(ksptr2, kb, kg, kr);
+                knan = v_not_nan(kb) & v_not_nan(kg) & v_not_nan(kr);
+                alpha = ((v_absdiff(kb, rb) + v_absdiff(kg, rg) + v_absdiff(kr, rr)) * sindex) & v_not_nan(rb) & v_not_nan(rg) & v_not_nan(rr) & knan;
+                idx = v_trunc(alpha);
+                alpha -= v_cvt_f32(idx);
+                w = (kweight2 * v_muladd(v_lut(expLUT + 1, idx), alpha, v_lut(expLUT, idx) * (v_one - alpha))) & knan;
+                v_wsum += w;
+                v_sum_b = v_muladd(kb & knan, w, v_sum_b);
+                v_sum_g = v_muladd(kg & knan, w, v_sum_g);
+                v_sum_r = v_muladd(kr & knan, w, v_sum_r);
+
+                v_load_deinterleave(ksptr3, kb, kg, kr);
+                knan = v_not_nan(kb) & v_not_nan(kg) & v_not_nan(kr);
+                alpha = ((v_absdiff(kb, rb) + v_absdiff(kg, rg) + v_absdiff(kr, rr)) * sindex) & v_not_nan(rb) & v_not_nan(rg) & v_not_nan(rr) & knan;
+                idx = v_trunc(alpha);
+                alpha -= v_cvt_f32(idx);
+                w = (kweight3 * v_muladd(v_lut(expLUT + 1, idx), alpha, v_lut(expLUT, idx) * (v_one - alpha))) & knan;
+                v_wsum += w;
+                v_sum_b = v_muladd(kb & knan, w, v_sum_b);
+                v_sum_g = v_muladd(kg & knan, w, v_sum_g);
+                v_sum_r = v_muladd(kr & knan, w, v_sum_r);
+
+                v_store_aligned(wsum + j, v_wsum);
+                v_store_aligned(sum_b + j, v_sum_b);
+                v_store_aligned(sum_g + j, v_sum_g);
+                v_store_aligned(sum_r + j, v_sum_r);
+            }
+#endif
+#if CV_SIMD128
+            v_float32x4 v_one4 = v_setall_f32(1.f);
+            v_float32x4 sindex4 = v_setall_f32(scale_index);
+            v_float32x4 kweight4 = v_load(space_weight + k);
+#endif
+            for (; j < size.width; j++, rsptr += 3, ksptr0 += 3, ksptr1 += 3, ksptr2 += 3, ksptr3 += 3)
+            {
+#if CV_SIMD128
+                v_float32x4 rb = v_setall_f32(rsptr[0]);
+                v_float32x4 rg = v_setall_f32(rsptr[1]);
+                v_float32x4 rr = v_setall_f32(rsptr[2]);
+                v_float32x4 kb(ksptr0[0], ksptr1[0], ksptr2[0], ksptr3[0]);
+                v_float32x4 kg(ksptr0[1], ksptr1[1], ksptr2[1], ksptr3[1]);
+                v_float32x4 kr(ksptr0[2], ksptr1[2], ksptr2[2], ksptr3[2]);
+                v_float32x4 knan = v_not_nan(kb) & v_not_nan(kg) & v_not_nan(kr);
+                v_float32x4 alpha = ((v_absdiff(kb, rb) + v_absdiff(kg, rg) + v_absdiff(kr, rr)) * sindex4) & v_not_nan(rb) & v_not_nan(rg) & v_not_nan(rr) & knan;
+                v_int32x4 idx = v_trunc(alpha);
+                alpha -= v_cvt_f32(idx);
+                v_float32x4 w = (kweight4 * v_muladd(v_lut(expLUT + 1, idx), alpha, v_lut(expLUT, idx) * (v_one4 - alpha))) & knan;
+                wsum[j] += v_reduce_sum(w);
+                sum_b[j] += v_reduce_sum((kb & knan) * w);
+                sum_g[j] += v_reduce_sum((kg & knan) * w);
+                sum_r[j] += v_reduce_sum((kr & knan) * w);
+#else
+                float rb = rsptr[0], rg = rsptr[1], rr = rsptr[2];
+                bool r_NAN = cvIsNaN(rb) || cvIsNaN(rg) || cvIsNaN(rr);
+
+                float b = ksptr0[0], g = ksptr0[1], r = ksptr0[2];
+                bool v_NAN = cvIsNaN(b) || cvIsNaN(g) || cvIsNaN(r);
+                float alpha = (std::abs(b - rb) + std::abs(g - rg) + std::abs(r - rr)) * scale_index;
+                int idx = cvFloor(alpha);
+                alpha -= idx;
+                if (!v_NAN)
+                {
+                    float w = space_weight[k] * (r_NAN ? 1.f : (expLUT[idx] + alpha * (expLUT[idx + 1] - expLUT[idx])));
+                    wsum[j] += w;
+                    sum_b[j] += b * w;
+                    sum_g[j] += g * w;
+                    sum_r[j] += r * w;
+                }
+
+                b = ksptr1[0];
+                g = ksptr1[1];
+                r = ksptr1[2];
+                v_NAN = cvIsNaN(b) || cvIsNaN(g) || cvIsNaN(r);
+                alpha = (std::abs(b - rb) + std::abs(g - rg) + std::abs(r - rr)) * scale_index;
+                idx = cvFloor(alpha);
+                alpha -= idx;
+                if (!v_NAN)
+                {
+                    float w = space_weight[k + 1] * (r_NAN ? 1.f : (expLUT[idx] + alpha * (expLUT[idx + 1] - expLUT[idx])));
+                    wsum[j] += w;
+                    sum_b[j] += b * w;
+                    sum_g[j] += g * w;
+                    sum_r[j] += r * w;
+                }
+
+                b = ksptr2[0];
+                g = ksptr2[1];
+                r = ksptr2[2];
+                v_NAN = cvIsNaN(b) || cvIsNaN(g) || cvIsNaN(r);
+                alpha = (std::abs(b - rb) + std::abs(g - rg) + std::abs(r - rr)) * scale_index;
+                idx = cvFloor(alpha);
+                alpha -= idx;
+                if (!v_NAN)
+                {
+                    float w = space_weight[k + 2] * (r_NAN ? 1.f : (expLUT[idx] + alpha * (expLUT[idx + 1] - expLUT[idx])));
+                    wsum[j] += w;
+                    sum_b[j] += b * w;
+                    sum_g[j] += g * w;
+                    sum_r[j] += r * w;
+                }
+
+                b = ksptr3[0];
+                g = ksptr3[1];
+                r = ksptr3[2];
+                v_NAN = cvIsNaN(b) || cvIsNaN(g) || cvIsNaN(r);
+                alpha = (std::abs(b - rb) + std::abs(g - rg) + std::abs(r - rr)) * scale_index;
+                idx = cvFloor(alpha);
+                alpha -= idx;
+                if (!v_NAN)
+                {
+                    float w = space_weight[k + 3] * (r_NAN ? 1.f : (expLUT[idx] + alpha * (expLUT[idx + 1] - expLUT[idx])));
+                    wsum[j] += w;
+                    sum_b[j] += b * w;
+                    sum_g[j] += g * w;
+                    sum_r[j] += r * w;
+                }
+#endif
+            }
+        }
+        for (; k < maxk; k++)
         {
             const float* ksptr = sptr + space_ofs[k];
             const float* rsptr = sptr;
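Finally, how the vector code in the hunk above reproduces the scalar NaN handling; this is a reading of the masks, not new behavior:

    // knan = v_not_nan(kb) & v_not_nan(kg) & v_not_nan(kr) is all-ones exactly in
    // lanes whose neighbor pixel is finite; "w & knan" zeroes the weight elsewhere,
    // so NaN neighbors contribute nothing (the scalar "if (!v_NAN)" guard).
    //
    // "alpha & v_not_nan(rb) & v_not_nan(rg) & v_not_nan(rr)" forces alpha to zero
    // in lanes whose reference pixel is NaN; v_trunc then gives idx == 0 and the
    // interpolation collapses to expLUT[0], which the setup code (outside this
    // diff) initializes to 1.f (the scalar "r_NAN ? 1.f : ..." term).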