Merge remote-tracking branch 'upstream/3.4' into merge-3.4

fcec053d · Alexander Alekhin · d073215f · a63f66c9 · fcec053d · fcec053d
Commit fcec053d authored Feb 05, 2019 by Alexander Alekhin
13 changed files
--- a/modules/core/src/ocl.cpp
+++ b/modules/core/src/ocl.cpp
@@ -2993,7 +2993,11 @@ int Kernel::set(int i, const KernelArg& arg)
    if( !p || !p->handle )
        return -1;
    if (i < 0)
+    {
+        CV_LOG_ERROR(NULL, cv::format("OpenCL: Kernel(%s)::set(arg_index=%d): negative arg_index",
+                p->name.c_str(), (int)i));
        return i;
+    }
    if( i == 0 )
        p->cleanupUMats();
    cl_int status = 0;
@@ -3002,10 +3006,19 @@ int Kernel::set(int i, const KernelArg& arg)
        AccessFlag accessFlags = ((arg.flags & KernelArg::READ_ONLY) ? ACCESS_READ : static_cast<AccessFlag>(0)) |
                                 ((arg.flags & KernelArg::WRITE_ONLY) ? ACCESS_WRITE : static_cast<AccessFlag>(0));
        bool ptronly = (arg.flags & KernelArg::PTR_ONLY) != 0;
+        if (ptronly && arg.m->empty())
+        {
+            cl_mem h_null = (cl_mem)NULL;
+            status = clSetKernelArg(p->handle, (cl_uint)i, sizeof(h_null), &h_null);
+            CV_OCL_DBG_CHECK_RESULT(status, cv::format("clSetKernelArg('%s', arg_index=%d, cl_mem=NULL)", p->name.c_str(), (int)i).c_str());
+            return i + 1;
+        }
        cl_mem h = (cl_mem)arg.m->handle(accessFlags);

        if (!h)
        {
+            CV_LOG_ERROR(NULL, cv::format("OpenCL: Kernel(%s)::set(arg_index=%d, flags=%d): can't create cl_mem handle for passed UMat buffer (addr=%p)",
+                    p->name.c_str(), (int)i, (int)arg.flags, arg.m));
            p->release();
            p = 0;
            return -1;

--- a/modules/dnn/src/layers/mvn_layer.cpp
+++ b/modules/dnn/src/layers/mvn_layer.cpp
@@ -138,9 +138,12 @@ public:
        UMat& bnorm_weight = umat_scale;
        UMat& bnorm_bias = umat_shift;

+        const unsigned LOCAL_SIZE = 128;
        bool use_half = (inputs[0].depth() == CV_16S);
-        String opts = format(" -DT=%s -DT4=%s -Dconvert_T=%s", use_half ? "half" : "float",
-                             use_half ? "half4" : "float4", use_half ? "convert_half4" : "convert_float4");
+        String opts = format(" -DT=%s -DT4=%s -Dconvert_T=%s -DLOCAL_SIZE=%u", use_half ? "half" : "float",
+                             use_half ? "half4" : "float4", use_half ? "convert_half4" : "convert_float4",
+                             LOCAL_SIZE
+        );

        int splitDim = (acrossChannels) ? 1 : 2;
        for (size_t inpIdx = 0; inpIdx < inputs.size(); inpIdx++)
@@ -155,8 +158,8 @@ public:
            float alpha = 1.0f / s[1];

            String buildopt = "-DNUM=4" + opts;
-            ocl::Kernel k("mean_fuse4", ocl::dnn::mvn_oclsrc, buildopt);
-            size_t localsize[] = { 128 };
+            ocl::Kernel k("mean_fuse4", ocl::dnn::mvn_oclsrc, buildopt + " -DKERNEL_MEAN_FUSE");
+            size_t localsize[] = { LOCAL_SIZE };
            size_t globalsize[] = { (size_t)s[0] / 4 * localsize[0] };

            int argId = 0;
@@ -165,7 +168,6 @@ public:
            k.set(argId++, alpha);
            k.set(argId++, ocl::KernelArg::PtrWriteOnly(meanMat));
            k.set(argId++, ocl::KernelArg::PtrWriteOnly(tmpMat));
-            k.set(argId++, NULL, localsize[0] * sizeof(cl_float4));
            bool ret = k.run(1, globalsize, localsize, false);
            if (!ret)
                return false;
@@ -173,7 +175,7 @@ public:
            buildopt += format(" %s %s", (fuse_batch_norm) ? "-DFUSE_BATCH_NORM" : "",
                               (fuse_relu) ? "-DFUSE_RELU" : "");

-            ocl::Kernel k1("mvn_fuse4", ocl::dnn::mvn_oclsrc, buildopt);
+            ocl::Kernel k1("mvn_fuse4", ocl::dnn::mvn_oclsrc, buildopt + " -DKERNEL_MVN_FUSE");
            argId = 0;
            k1.set(argId++, ocl::KernelArg::PtrReadOnly(tmpMat));
            k1.set(argId++, ocl::KernelArg::PtrReadOnly(inpMat));
@@ -185,7 +187,6 @@ public:
            k1.set(argId++, ocl::KernelArg::PtrReadOnly(bnorm_weight));
            k1.set(argId++, ocl::KernelArg::PtrReadOnly(bnorm_bias));
            k1.set(argId++, ocl::KernelArg::PtrWriteOnly(outMat));
-            k1.set(argId++, NULL, localsize[0] * sizeof(cl_float4));
            ret = k1.run(1, globalsize, localsize, false);
            if (!ret)
                return false;
@@ -243,7 +244,7 @@ public:
            if (normVariance)
            {
                String kname = format("calc_mean%d", number);
-                ocl::Kernel kernel(kname.c_str(), ocl::dnn::mvn_oclsrc, buildopt);
+                ocl::Kernel kernel(kname.c_str(), ocl::dnn::mvn_oclsrc, buildopt + " -DKERNEL_MEAN");
                if (kernel.empty())
                    return false;

@@ -263,7 +264,7 @@ public:
            }

            String kname = format("mvn%d", number);
-            buildopt += format("%s%s%s", (normVariance) ? " -DNORM_VARIANCE" : "",
+            buildopt += format("%s%s%s -DKERNEL_MVN", (normVariance) ? " -DNORM_VARIANCE" : "",
                               (fuse_batch_norm) ? " -DFUSE_BATCH_NORM" : "",
                               (fuse_relu) ? " -DFUSE_RELU" : "");
            ocl::Kernel kernel1(kname.c_str(), ocl::dnn::mvn_oclsrc, buildopt);

--- a/modules/dnn/src/opencl/mvn.cl
+++ b/modules/dnn/src/opencl/mvn.cl
@@ -74,6 +74,8 @@
    #define MVN_FUSE mvn_fuse1
 #endif

+#ifdef KERNEL_MEAN
+
 __kernel void CALC_MEAN(__global const Dtype* src,
                        const int rows,
                        const int cols,
@@ -94,6 +96,8 @@ __kernel void CALC_MEAN(__global const Dtype* src,
    store(dst_vec, dst, index);
 }

+#elif defined KERNEL_MVN
+
 __kernel void MVN(__global const Dtype* src,
                  const int rows,
                  const int cols,
@@ -140,12 +144,13 @@ __kernel void MVN(__global const Dtype* src,
    store(dst_vec, dst, index);
 }

+#elif defined KERNEL_MEAN_FUSE
+
 __kernel void MEAN_FUSE(__global const T * A,
                        unsigned int A_col_size,
                        float alpha,
                        __global T4 * mean,
-                        __global Dtype * tmp,
-                        __local Dtype4 * work)
+                        __global Dtype * tmp)
 {
    unsigned int row_gid = get_group_id(0);
    unsigned int lid = get_local_id(0);
@@ -168,15 +173,16 @@ __kernel void MEAN_FUSE(__global const T * A,
        dot2 += convert_float4(a2);
        dot3 += convert_float4(a3);

-        i += get_local_size(0);
+        i += LOCAL_SIZE;
    }

+    __local Dtype4 work[LOCAL_SIZE];
    work[lid].s0 = dot(dot0, b0);
    work[lid].s1 = dot(dot1, b0);
    work[lid].s2 = dot(dot2, b0);
    work[lid].s3 = dot(dot3, b0);

-    for(unsigned int stride=get_local_size(0)/2 ; stride>0 ; stride>>=1)
+    for(unsigned int stride=LOCAL_SIZE/2 ; stride>0 ; stride>>=1)
    {
        barrier(CLK_LOCAL_MEM_FENCE);
        if(lid < stride)
@@ -212,10 +218,12 @@ __kernel void MEAN_FUSE(__global const T * A,
        vstore4(dot2, i, dst0_read + 2 * A_col_size);
        vstore4(dot3, i, dst0_read + 3 * A_col_size);

-        i += get_local_size(0);
+        i += LOCAL_SIZE;
    }
 }

+#elif defined KERNEL_MVN_FUSE
+
 __kernel void MVN_FUSE(__global const Dtype * tmp,
                       __global const T * A,
                       __global const T4 * mean,
@@ -225,8 +233,7 @@ __kernel void MVN_FUSE(__global const Dtype * tmp,
                       const float relu_slope,
                       __global const Dtype4 * bnorm_weight,
                       __global const Dtype4 * bnorm_bias,
-                       __global T * B,
-                       __local Dtype4 * work)
+                       __global T * B)
 {
    unsigned int row_gid = get_group_id(0);
    unsigned int lid = get_local_id(0);
@@ -250,15 +257,16 @@ __kernel void MVN_FUSE(__global const Dtype * tmp,
        dot2 += a2;
        dot3 += a3;

-        i += get_local_size(0);
+        i += LOCAL_SIZE;
    }

+    __local Dtype4 work[LOCAL_SIZE];
    work[lid].s0 = dot(dot0, b0);
    work[lid].s1 = dot(dot1, b0);
    work[lid].s2 = dot(dot2, b0);
    work[lid].s3 = dot(dot3, b0);

-    for(unsigned int stride=get_local_size(0)/2 ; stride>0 ; stride>>=1)
+    for(unsigned int stride=LOCAL_SIZE/2 ; stride>0 ; stride>>=1)
    {
        barrier(CLK_LOCAL_MEM_FENCE);
        if(lid < stride)
@@ -314,6 +322,10 @@ __kernel void MVN_FUSE(__global const Dtype * tmp,
        vstore4(convert_T(dot2), i, dst0_read + 2 * A_col_size);
        vstore4(convert_T(dot3), i, dst0_read + 3 * A_col_size);

-        i += get_local_size(0);
+        i += LOCAL_SIZE;
    }
 }
+
+#else
+#error "Configuration error!"
+#endif
--- a/modules/imgproc/src/color_yuv.cpp
+++ b/modules/imgproc/src/color_yuv.cpp
--- a/modules/js/src/core_bindings.cpp
+++ b/modules/js/src/core_bindings.cpp
@@ -289,13 +289,16 @@ namespace binding_utils
        float radius;
    };

+#ifdef HAVE_OPENCV_IMGPROC
    Circle minEnclosingCircle(const cv::Mat& points)
    {
        Circle circle;
        cv::minEnclosingCircle(points, circle.center, circle.radius);
        return circle;
    }
+#endif

+#ifdef HAVE_OPENCV_VIDEO
    emscripten::val CamShiftWrapper(const cv::Mat& arg1, Rect& arg2, TermCriteria arg3)
    {
        RotatedRect rotatedRect = cv::CamShift(arg1, arg2, arg3);
@@ -313,6 +316,7 @@ namespace binding_utils
        result.call<void>("push", arg2);
        return result;
    }
+#endif  // HAVE_OPENCV_VIDEO

    std::string getExceptionMsg(const cv::Exception& e) {
        return e.msg;
@@ -551,19 +555,25 @@ EMSCRIPTEN_BINDINGS(binding_utils)

    function("exceptionFromPtr", &binding_utils::exceptionFromPtr, allow_raw_pointers());

+#ifdef HAVE_OPENCV_IMGPROC
    function("minEnclosingCircle", select_overload<binding_utils::Circle(const cv::Mat&)>(&binding_utils::minEnclosingCircle));
+#endif

    function("minMaxLoc", select_overload<binding_utils::MinMaxLoc(const cv::Mat&, const cv::Mat&)>(&binding_utils::minMaxLoc));

    function("minMaxLoc", select_overload<binding_utils::MinMaxLoc(const cv::Mat&)>(&binding_utils::minMaxLoc_1));

+#ifdef HAVE_OPENCV_IMGPROC
    function("morphologyDefaultBorderValue", &cv::morphologyDefaultBorderValue);
+#endif

    function("CV_MAT_DEPTH", &binding_utils::cvMatDepth);

+#ifdef HAVE_OPENCV_VIDEO
    function("CamShift", select_overload<emscripten::val(const cv::Mat&, Rect&, TermCriteria)>(&binding_utils::CamShiftWrapper));

    function("meanShift", select_overload<emscripten::val(const cv::Mat&, Rect&, TermCriteria)>(&binding_utils::meanShiftWrapper));
+#endif

    function("getBuildInformation", &binding_utils::getBuildInformation);


--- a/modules/js/src/embindgen.py
+++ b/modules/js/src/embindgen.py
@@ -140,7 +140,7 @@ features2d = {'Feature2D': ['detect', 'compute', 'detectAndCompute', 'descriptor
              'AKAZE': ['create', 'setDescriptorType', 'getDescriptorType', 'setDescriptorSize', 'getDescriptorSize', 'setDescriptorChannels', 'getDescriptorChannels', 'setThreshold', 'getThreshold', 'setNOctaves', 'getNOctaves', 'setNOctaveLayers', 'getNOctaveLayers', 'setDiffusivity', 'getDiffusivity', 'getDefaultName'],
              'DescriptorMatcher': ['add', 'clear', 'empty', 'isMaskSupported', 'train', 'match', 'knnMatch', 'radiusMatch', 'clone', 'create'],
              'BFMatcher': ['isMaskSupported', 'create'],
-              '': ['FAST', 'AGAST', 'drawKeypoints', 'drawMatches']}
+              '': ['drawKeypoints', 'drawMatches']}

 photo = {'': ['createAlignMTB', 'createCalibrateDebevec', 'createCalibrateRobertson', \
              'createMergeDebevec', 'createMergeMertens', 'createMergeRobertson', \

--- a/modules/js/test/test_features2d.js
+++ b/modules/js/test/test_features2d.js
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+if (typeof module !== 'undefined' && module.exports) {
+    // The environment is Node.js
+    var cv = require('./opencv.js'); // eslint-disable-line no-var
+}
+
+function generateTestFrame(width, height) {
+  let w = width || 200;
+  let h = height || 200;
+  let img = new cv.Mat(h, w, cv.CV_8UC1, new cv.Scalar(0, 0, 0, 0));
+  let s = new cv.Scalar(255, 255, 255, 255);
+  let s128 = new cv.Scalar(128, 128, 128, 128);
+  let rect = new cv.Rect(w / 4, h / 4, w / 2, h / 2);
+  img.roi(rect).setTo(s);
+  img.roi(new cv.Rect(w / 2 - w / 8, h / 2 - h / 8, w / 4, h / 4)).setTo(s128);
+  cv.rectangle(img, new cv.Point(w / 8, h / 8), new cv.Point(w - w / 8, h - h / 8), s, 5);
+  cv.rectangle(img, new cv.Point(w / 5, h / 5), new cv.Point(w - w / 5, h - h / 5), s128, 3);
+  cv.line(img, new cv.Point(-w, 0), new cv.Point(w / 2, h / 2), s128, 5);
+  cv.line(img, new cv.Point(2*w, 0), new cv.Point(w / 2, h / 2), s, 5);
+  return img;
+}
+
+QUnit.module('Features2D', {});
+QUnit.test('Detectors', function(assert) {
+  let image = generateTestFrame();
+
+  let kp = new cv.KeyPointVector();
+
+  let orb = new cv.ORB();
+  orb.detect(image, kp);
+  assert.equal(kp.size(), 67, 'ORB');
+
+  let mser = new cv.MSER();
+  mser.detect(image, kp);
+  assert.equal(kp.size(), 7, 'MSER');
+
+  let brisk = new cv.BRISK();
+  brisk.detect(image, kp);
+  assert.equal(kp.size(), 191, 'BRISK');
+
+  let ffd = new cv.FastFeatureDetector();
+  ffd.detect(image, kp);
+  assert.equal(kp.size(), 12, 'FastFeatureDetector');
+
+  let afd = new cv.AgastFeatureDetector();
+  afd.detect(image, kp);
+  assert.equal(kp.size(), 67, 'AgastFeatureDetector');
+
+  let gftt = new cv.GFTTDetector();
+  gftt.detect(image, kp);
+  assert.equal(kp.size(), 168, 'GFTTDetector');
+
+  let kaze = new cv.KAZE();
+  kaze.detect(image, kp);
+  assert.equal(kp.size(), 159, 'KAZE');
+
+  let akaze = new cv.AKAZE();
+  akaze.detect(image, kp);
+  assert.equal(kp.size(), 52, 'AKAZE');
+});
+
+QUnit.test('BFMatcher', function(assert) {
+  // Generate key points.
+  let image = generateTestFrame();
+
+  let kp = new cv.KeyPointVector();
+  let descriptors = new cv.Mat();
+  let orb = new cv.ORB();
+  orb.detectAndCompute(image, new cv.Mat(), kp, descriptors);
+
+  assert.equal(kp.size(), 67);
+
+  // Run a matcher.
+  let dm = new cv.DMatchVector();
+  let matcher = new cv.BFMatcher();
+  matcher.match(descriptors, descriptors, dm);
+
+  assert.equal(dm.size(), 67);
+});
--- a/modules/js/test/tests.html
+++ b/modules/js/test/tests.html
@@ -29,6 +29,7 @@
        <script type="application/javascript" src="test_objdetect.js"></script>
        <script type="application/javascript" src="test_video.js"></script>
        <script type="application/javascript" src="test_photo.js"></script>
+        <script type="application/javascript" src="test_features2d.js"></script>
        <script  type='text/javascript'>
            QUnit.config.autostart = false;

@@ -69,11 +70,5 @@
            };
          };
        </script>
-
-<!--
-    TODO
-    <script type="application/javascript" src="test_features2d.js"></script>
-->
-
    </body>
 </html>
--- a/modules/js/test/tests.js
+++ b/modules/js/test/tests.js
@@ -45,7 +45,8 @@ testrunner.run(
    {
        code: 'opencv.js',
        tests: ['test_mat.js', 'test_utils.js', 'test_imgproc.js',
-                'test_objdetect.js', 'test_video.js', 'test_photo.js'],
+                'test_objdetect.js', 'test_video.js', 'test_features2d.js',
+                'test_photo.js'],
    },
    function(err, report) {
        console.log(report.failed + ' failed, ' + report.passed + ' passed');

--- a/modules/video/src/optflowgf.cpp
+++ b/modules/video/src/optflowgf.cpp
@@ -42,6 +42,7 @@

 #include "precomp.hpp"
 #include "opencl_kernels_video.hpp"
+#include "opencv2/core/hal/intrin.hpp"

 #if defined __APPLE__ || defined __ANDROID__
 #define SMALL_LOCALSIZE
@@ -433,13 +434,11 @@ FarnebackUpdateFlow_GaussianBlur( const Mat& _R0, const Mat& _R1,
    for( i = 0; i <= m; i++ )
        kernel[i] = (float)(kernel[i]*s);

-#if CV_SSE2
+#if CV_SIMD128
    float* simd_kernel = alignPtr(kernel + m+1, 16);
-    volatile bool useSIMD = checkHardwareSupport(CV_CPU_SSE);
-    if( useSIMD )
    {
        for( i = 0; i <= m; i++ )
-            _mm_store_ps(simd_kernel + i*4, _mm_set1_ps(kernel[i]));
+            v_store(simd_kernel + i*4, v_setall_f32(kernel[i]));
    }
 #endif

@@ -457,54 +456,53 @@ FarnebackUpdateFlow_GaussianBlur( const Mat& _R0, const Mat& _R1,
        }

        x = 0;
-#if CV_SSE2
-        if( useSIMD )
+#if CV_SIMD128
        {
            for( ; x <= width*5 - 16; x += 16 )
            {
                const float *sptr0 = srow[m], *sptr1;
-                __m128 g4 = _mm_load_ps(simd_kernel);
-                __m128 s0, s1, s2, s3;
-                s0 = _mm_mul_ps(_mm_loadu_ps(sptr0 + x), g4);
-                s1 = _mm_mul_ps(_mm_loadu_ps(sptr0 + x + 4), g4);
-                s2 = _mm_mul_ps(_mm_loadu_ps(sptr0 + x + 8), g4);
-                s3 = _mm_mul_ps(_mm_loadu_ps(sptr0 + x + 12), g4);
+                v_float32x4 g4 = v_load(simd_kernel);
+                v_float32x4 s0, s1, s2, s3;
+                s0 = v_load(sptr0 + x) * g4;
+                s1 = v_load(sptr0 + x + 4) * g4;
+                s2 = v_load(sptr0 + x + 8) * g4;
+                s3 = v_load(sptr0 + x + 12) * g4;

                for( i = 1; i <= m; i++ )
                {
-                    __m128 x0, x1;
+                    v_float32x4 x0, x1;
                    sptr0 = srow[m+i], sptr1 = srow[m-i];
-                    g4 = _mm_load_ps(simd_kernel + i*4);
-                    x0 = _mm_add_ps(_mm_loadu_ps(sptr0 + x), _mm_loadu_ps(sptr1 + x));
-                    x1 = _mm_add_ps(_mm_loadu_ps(sptr0 + x + 4), _mm_loadu_ps(sptr1 + x + 4));
-                    s0 = _mm_add_ps(s0, _mm_mul_ps(x0, g4));
-                    s1 = _mm_add_ps(s1, _mm_mul_ps(x1, g4));
-                    x0 = _mm_add_ps(_mm_loadu_ps(sptr0 + x + 8), _mm_loadu_ps(sptr1 + x + 8));
-                    x1 = _mm_add_ps(_mm_loadu_ps(sptr0 + x + 12), _mm_loadu_ps(sptr1 + x + 12));
-                    s2 = _mm_add_ps(s2, _mm_mul_ps(x0, g4));
-                    s3 = _mm_add_ps(s3, _mm_mul_ps(x1, g4));
+                    g4 = v_load(simd_kernel + i*4);
+                    x0 = v_load(sptr0 + x) + v_load(sptr1 + x);
+                    x1 = v_load(sptr0 + x + 4) + v_load(sptr1 + x + 4);
+                    s0 = v_muladd(x0, g4, s0);
+                    s1 = v_muladd(x1, g4, s1);
+                    x0 = v_load(sptr0 + x + 8) + v_load(sptr1 + x + 8);
+                    x1 = v_load(sptr0 + x + 12) + v_load(sptr1 + x + 12);
+                    s2 = v_muladd(x0, g4, s2);
+                    s3 = v_muladd(x1, g4, s3);
                }

-                _mm_store_ps(vsum + x, s0);
-                _mm_store_ps(vsum + x + 4, s1);
-                _mm_store_ps(vsum + x + 8, s2);
-                _mm_store_ps(vsum + x + 12, s3);
+                v_store(vsum + x, s0);
+                v_store(vsum + x + 4, s1);
+                v_store(vsum + x + 8, s2);
+                v_store(vsum + x + 12, s3);
            }

            for( ; x <= width*5 - 4; x += 4 )
            {
                const float *sptr0 = srow[m], *sptr1;
-                __m128 g4 = _mm_load_ps(simd_kernel);
-                __m128 s0 = _mm_mul_ps(_mm_loadu_ps(sptr0 + x), g4);
+                v_float32x4 g4 = v_load(simd_kernel);
+                v_float32x4 s0 = v_load(sptr0 + x) * g4;

                for( i = 1; i <= m; i++ )
                {
                    sptr0 = srow[m+i], sptr1 = srow[m-i];
-                    g4 = _mm_load_ps(simd_kernel + i*4);
-                    __m128 x0 = _mm_add_ps(_mm_loadu_ps(sptr0 + x), _mm_loadu_ps(sptr1 + x));
-                    s0 = _mm_add_ps(s0, _mm_mul_ps(x0, g4));
+                    g4 = v_load(simd_kernel + i*4);
+                    v_float32x4 x0 = v_load(sptr0 + x) + v_load(sptr1 + x);
+                    s0 = v_muladd(x0, g4, s0);
                }
-                _mm_store_ps(vsum + x, s0);
+                v_store(vsum + x, s0);
            }
        }
 #endif
@@ -525,28 +523,25 @@ FarnebackUpdateFlow_GaussianBlur( const Mat& _R0, const Mat& _R1,

        // horizontal blur
        x = 0;
-#if CV_SSE2
-        if( useSIMD )
+#if CV_SIMD128
        {
            for( ; x <= width*5 - 8; x += 8 )
            {
-                __m128 g4 = _mm_load_ps(simd_kernel);
-                __m128 s0 = _mm_mul_ps(_mm_loadu_ps(vsum + x), g4);
-                __m128 s1 = _mm_mul_ps(_mm_loadu_ps(vsum + x + 4), g4);
+                v_float32x4 g4 = v_load(simd_kernel);
+                v_float32x4 s0 = v_load(vsum + x) * g4;
+                v_float32x4 s1 = v_load(vsum + x + 4) * g4;

                for( i = 1; i <= m; i++ )
                {
-                    g4 = _mm_load_ps(simd_kernel + i*4);
-                    __m128 x0 = _mm_add_ps(_mm_loadu_ps(vsum + x - i*5),
-                                           _mm_loadu_ps(vsum + x + i*5));
-                    __m128 x1 = _mm_add_ps(_mm_loadu_ps(vsum + x - i*5 + 4),
-                                           _mm_loadu_ps(vsum + x + i*5 + 4));
-                    s0 = _mm_add_ps(s0, _mm_mul_ps(x0, g4));
-                    s1 = _mm_add_ps(s1, _mm_mul_ps(x1, g4));
+                    g4 = v_load(simd_kernel + i*4);
+                    v_float32x4 x0 = v_load(vsum + x - i*5) + v_load(vsum + x+ i*5);
+                    v_float32x4 x1 = v_load(vsum + x - i*5 + 4) + v_load(vsum + x+ i*5 + 4);
+                    s0 = v_muladd(x0, g4, s0);
+                    s1 = v_muladd(x1, g4, s1);
                }

-                _mm_store_ps(hsum + x, s0);
-                _mm_store_ps(hsum + x + 4, s1);
+                v_store(hsum + x, s0);
+                v_store(hsum + x + 4, s1);
            }
        }
 #endif

--- a/platforms/js/build_js.py
+++ b/platforms/js/build_js.py
@@ -113,6 +113,7 @@ class Builder:
               "-DWITH_GPHOTO2=OFF",
               "-DWITH_LAPACK=OFF",
               "-DWITH_ITT=OFF",
+               "-DWITH_QUIRC=OFF",
               "-DBUILD_ZLIB=ON",
               "-DBUILD_opencv_apps=OFF",
               "-DBUILD_opencv_calib3d=ON",  # No bindings provided. This module is used as a dependency for other modules.
@@ -130,9 +131,11 @@ class Builder:
               "-DBUILD_opencv_superres=OFF",
               "-DBUILD_opencv_stitching=OFF",
               "-DBUILD_opencv_java=OFF",
+               "-DBUILD_opencv_java_bindings_generator=OFF",
               "-DBUILD_opencv_js=ON",
               "-DBUILD_opencv_python2=OFF",
               "-DBUILD_opencv_python3=OFF",
+               "-DBUILD_opencv_python_bindings_generator=OFF",
               "-DBUILD_EXAMPLES=OFF",
               "-DBUILD_PACKAGE=OFF",
               "-DBUILD_TESTS=OFF",

--- a/samples/dnn/object_detection.cpp
+++ b/samples/dnn/object_detection.cpp
@@ -153,14 +153,16 @@ void postprocess(Mat& frame, const std::vector<Mat>& outs, Net& net)
    std::vector<int> classIds;
    std::vector<float> confidences;
    std::vector<Rect> boxes;
-    if (net.getLayer(0)->outputNameToIndex("im_info") != -1)  // Faster-RCNN or R-FCN
+    if (outLayerType == "DetectionOutput")
    {
        // Network produces output blob with a shape 1x1xNx7 where N is a number of
        // detections and an every detection is a vector of values
        // [batchId, classId, confidence, left, top, right, bottom]
-        CV_Assert(outs.size() == 1);
-        float* data = (float*)outs[0].data;
-        for (size_t i = 0; i < outs[0].total(); i += 7)
+        CV_Assert(outs.size() > 0);
+        for (size_t k = 0; k < outs.size(); k++)
+        {
+            float* data = (float*)outs[k].data;
+            for (size_t i = 0; i < outs[k].total(); i += 7)
            {
                float confidence = data[i + 2];
                if (confidence > confThreshold)
@@ -171,36 +173,22 @@ void postprocess(Mat& frame, const std::vector<Mat>& outs, Net& net)
                    int bottom = (int)data[i + 6];
                    int width  = right - left + 1;
                    int height = bottom - top + 1;
-                classIds.push_back((int)(data[i + 1]) - 1);  // Skip 0th background class id.
-                boxes.push_back(Rect(left, top, width, height));
-                confidences.push_back(confidence);
-            }
-        }
-    }
-    else if (outLayerType == "DetectionOutput")
+                    if (width * height <= 1)
                    {
-        // Network produces output blob with a shape 1x1xNx7 where N is a number of
-        // detections and an every detection is a vector of values
-        // [batchId, classId, confidence, left, top, right, bottom]
-        CV_Assert(outs.size() == 1);
-        float* data = (float*)outs[0].data;
-        for (size_t i = 0; i < outs[0].total(); i += 7)
-        {
-            float confidence = data[i + 2];
-            if (confidence > confThreshold)
-            {
-                int left = (int)(data[i + 3] * frame.cols);
-                int top = (int)(data[i + 4] * frame.rows);
-                int right = (int)(data[i + 5] * frame.cols);
-                int bottom = (int)(data[i + 6] * frame.rows);
-                int width = right - left + 1;
-                int height = bottom - top + 1;
+                        left   = (int)(data[i + 3] * frame.cols);
+                        top    = (int)(data[i + 4] * frame.rows);
+                        right  = (int)(data[i + 5] * frame.cols);
+                        bottom = (int)(data[i + 6] * frame.rows);
+                        width  = right - left + 1;
+                        height = bottom - top + 1;
+                    }
                    classIds.push_back((int)(data[i + 1]) - 1);  // Skip 0th background class id.
                    boxes.push_back(Rect(left, top, width, height));
                    confidences.push_back(confidence);
                }
            }
        }
+    }
    else if (outLayerType == "Region")
    {
        for (size_t i = 0; i < outs.size(); ++i)

--- a/samples/dnn/object_detection.py
+++ b/samples/dnn/object_detection.py
@@ -102,7 +102,7 @@ def postprocess(frame, outs):
    classIds = []
    confidences = []
    boxes = []
-    if net.getLayer(0).outputNameToIndex('im_info') != -1:  # Faster-RCNN or R-FCN
+    if lastLayer.type == 'DetectionOutput':
        # Network produces output blob with a shape 1x1xNx7 where N is a number of
        # detections and an every detection is a vector of values
        # [batchId, classId, confidence, left, top, right, bottom]
@@ -116,17 +116,7 @@ def postprocess(frame, outs):
                    bottom = int(detection[6])
                    width = right - left + 1
                    height = bottom - top + 1
-                    classIds.append(int(detection[1]) - 1)  # Skip background label
-                    confidences.append(float(confidence))
-                    boxes.append([left, top, width, height])
-    elif lastLayer.type == 'DetectionOutput':
-        # Network produces output blob with a shape 1x1xNx7 where N is a number of
-        # detections and an every detection is a vector of values
-        # [batchId, classId, confidence, left, top, right, bottom]
-        for out in outs:
-            for detection in out[0, 0]:
-                confidence = detection[2]
-                if confidence > confThreshold:
+                    if width * height <= 1:
                        left = int(detection[3] * frameWidth)
                        top = int(detection[4] * frameHeight)
                        right = int(detection[5] * frameWidth)