Merge remote-tracking branch 'upstream/3.4' into merge-3.4

d0032b07 · Alexander Alekhin · Alexander Alekhin · 8120595c · 8eb685de · d0032b07
Commit d0032b07 authored Apr 22, 2019 by Alexander Alekhin Committed by Alexander Alekhin Apr 22, 2019
17 changed files
--- a/cmake/OpenCVPCHSupport.cmake
+++ b/cmake/OpenCVPCHSupport.cmake
@@ -302,7 +302,7 @@ MACRO(ADD_PRECOMPILED_HEADER _targetName _input)
 if [ -n \"$VERBOSE\" ]; then
  tail -n1 \$0
 fi
-${_command} -D$<JOIN:$<TARGET_PROPERTY:${_targetName},COMPILE_DEFINITIONS>, -D>
+${_command} '-D$<JOIN:$<TARGET_PROPERTY:${_targetName},COMPILE_DEFINITIONS>,' '-D>'
 ")
    GET_FILENAME_COMPONENT(_outdir ${_output} PATH)
    ADD_CUSTOM_COMMAND(

--- a/doc/py_tutorials/py_ml/py_svm/py_svm_opencv/py_svm_opencv.markdown
+++ b/doc/py_tutorials/py_ml/py_svm/py_svm_opencv/py_svm_opencv.markdown
@@ -47,7 +47,7 @@ area and try to implement them.
 Additional Resources
 --------------------

-#  [Histograms of Oriented Gradients Video](www.youtube.com/watch?v=0Zib1YEE4LU‎)
+-#  [Histograms of Oriented Gradients Video](https://www.youtube.com/watch?v=0Zib1YEE4LU)

 Exercises
 ---------

--- a/modules/core/include/opencv2/core/hal/intrin_avx.hpp
+++ b/modules/core/include/opencv2/core/hal/intrin_avx.hpp
@@ -431,19 +431,6 @@ inline v_float64x4 v_reinterpret_as_f64(const v_float64x4& a)
 inline v_float64x4 v_reinterpret_as_f64(const v_float32x8& a)
 { return v_float64x4(_mm256_castps_pd(a.val)); }

-#if CV_FP16
-inline v_float32x8 v256_load_fp16_f32(const short* ptr)
-{
-    return v_float32x8(_mm256_cvtph_ps(_mm_loadu_si128((const __m128i*)ptr)));
-}
-
-inline void v_store_fp16(short* ptr, const v_float32x8& a)
-{
-    __m128i fp16_value = _mm256_cvtps_ph(a.val, 0);
-    _mm_store_si128((__m128i*)ptr, fp16_value);
-}
-#endif
-
 /* Recombine */
 /*#define OPENCV_HAL_IMPL_AVX_COMBINE(_Tpvec, perm)                    \
    inline _Tpvec v_combine_low(const _Tpvec& a, const _Tpvec& b)    \
@@ -1400,7 +1387,7 @@ inline v_float32x8 v_cvt_f32(const v_float64x4& a)
 inline v_float32x8 v_cvt_f32(const v_float64x4& a, const v_float64x4& b)
 {
    __m128 af = _mm256_cvtpd_ps(a.val), bf = _mm256_cvtpd_ps(b.val);
-    return v_float32x8(_mm256_insertf128_ps(_mm256_castps128_ps256(af), bf, 1));
+    return v_float32x8(_v256_combine(af, bf));
 }

 inline v_float64x4 v_cvt_f64(const v_int32x8& a)
@@ -1474,7 +1461,7 @@ inline v_int32x8 v256_lut_pairs(const int* tab, const int* idx)
 }
 inline v_int32x8 v256_lut_quads(const int* tab, const int* idx)
 {
-    return v_int32x8(_mm256_insertf128_si256(_mm256_castsi128_si256(_mm_loadu_si128((const __m128i*)(tab + idx[0]))), _mm_loadu_si128((const __m128i*)(tab + idx[1])), 0x1));
+    return v_int32x8(_v256_combine(_mm_loadu_si128((const __m128i*)(tab + idx[0])), _mm_loadu_si128((const __m128i*)(tab + idx[1]))));
 }
 inline v_uint32x8 v256_lut(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v256_lut((const int *)tab, idx)); }
 inline v_uint32x8 v256_lut_pairs(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v256_lut_pairs((const int *)tab, idx)); }
@@ -1490,7 +1477,7 @@ inline v_int64x4 v256_lut(const int64* tab, const int* idx)
 }
 inline v_int64x4 v256_lut_pairs(const int64* tab, const int* idx)
 {
-    return v_int64x4(_mm256_insertf128_si256(_mm256_castsi128_si256(_mm_loadu_si128((const __m128i*)(tab + idx[0]))), _mm_loadu_si128((const __m128i*)(tab + idx[1])), 0x1));
+    return v_int64x4(_v256_combine(_mm_loadu_si128((const __m128i*)(tab + idx[0])), _mm_loadu_si128((const __m128i*)(tab + idx[1]))));
 }
 inline v_uint64x4 v256_lut(const uint64* tab, const int* idx) { return v_reinterpret_as_u64(v256_lut((const int64 *)tab, idx)); }
 inline v_uint64x4 v256_lut_pairs(const uint64* tab, const int* idx) { return v_reinterpret_as_u64(v256_lut_pairs((const int64 *)tab, idx)); }
@@ -1506,7 +1493,7 @@ inline v_float64x4 v256_lut(const double* tab, const int* idx)
 {
    return v_float64x4(_mm256_i32gather_pd(tab, _mm_loadu_si128((const __m128i*)idx), 8));
 }
-inline v_float64x4 v256_lut_pairs(const double* tab, const int* idx) { return v_float64x4(_mm256_insertf128_pd(_mm256_castpd128_pd256(_mm_loadu_pd(tab + idx[0])), _mm_loadu_pd(tab + idx[1]), 0x1)); }
+inline v_float64x4 v256_lut_pairs(const double* tab, const int* idx) { return v_float64x4(_v256_combine(_mm_loadu_pd(tab + idx[0]), _mm_loadu_pd(tab + idx[1]))); }

 inline v_int32x8 v_lut(const int* tab, const v_int32x8& idxvec)
 {

--- a/modules/core/include/opencv2/core/hal/intrin_neon.hpp
+++ b/modules/core/include/opencv2/core/hal/intrin_neon.hpp
@@ -278,48 +278,6 @@ struct v_float64x2
 };
 #endif

-#if CV_FP16
-// Workaround for old compilers
-static inline int16x4_t vreinterpret_s16_f16(float16x4_t a) { return (int16x4_t)a; }
-static inline float16x4_t vreinterpret_f16_s16(int16x4_t a) { return (float16x4_t)a; }
-
-static inline float16x4_t cv_vld1_f16(const void* ptr)
-{
-#ifndef vld1_f16 // APPLE compiler defines vld1_f16 as macro
-    return vreinterpret_f16_s16(vld1_s16((const short*)ptr));
-#else
-    return vld1_f16((const __fp16*)ptr);
-#endif
-}
-static inline void cv_vst1_f16(void* ptr, float16x4_t a)
-{
-#ifndef vst1_f16 // APPLE compiler defines vst1_f16 as macro
-    vst1_s16((short*)ptr, vreinterpret_s16_f16(a));
-#else
-    vst1_f16((__fp16*)ptr, a);
-#endif
-}
-
-#ifndef vdup_n_f16
-    #define vdup_n_f16(v) (float16x4_t){v, v, v, v}
-#endif
-
-#endif // CV_FP16
-
-#if CV_FP16
-inline v_float32x4 v128_load_fp16_f32(const short* ptr)
-{
-    float16x4_t a = cv_vld1_f16((const __fp16*)ptr);
-    return v_float32x4(vcvt_f32_f16(a));
-}
-
-inline void v_store_fp16(short* ptr, const v_float32x4& a)
-{
-    float16x4_t fp16 = vcvt_f16_f32(a.val);
-    cv_vst1_f16((short*)ptr, fp16);
-}
-#endif
-
 #define OPENCV_HAL_IMPL_NEON_INIT(_Tpv, _Tp, suffix) \
 inline v_##_Tpv v_setzero_##suffix() { return v_##_Tpv(vdupq_n_##suffix((_Tp)0)); } \
 inline v_##_Tpv v_setall_##suffix(_Tp v) { return v_##_Tpv(vdupq_n_##suffix(v)); } \

--- a/modules/core/include/opencv2/core/hal/intrin_sse.hpp
+++ b/modules/core/include/opencv2/core/hal/intrin_sse.hpp
@@ -2684,19 +2684,6 @@ inline v_float64x2 v_cvt_f64_high(const v_float32x4& a)
    return v_float64x2(_mm_cvtps_pd(_mm_movehl_ps(a.val, a.val)));
 }

-#if CV_FP16
-inline v_float32x4 v128_load_fp16_f32(const short* ptr)
-{
-    return v_float32x4(_mm_cvtph_ps(_mm_loadu_si128((const __m128i*)ptr)));
-}
-
-inline void v_store_fp16(short* ptr, const v_float32x4& a)
-{
-    __m128i fp16_value = _mm_cvtps_ph(a.val, 0);
-    _mm_storel_epi64((__m128i*)ptr, fp16_value);
-}
-#endif
-
 ////////////// Lookup table access ////////////////////

 inline v_int8x16 v_lut(const schar* tab, const int* idx)
@@ -2956,6 +2943,9 @@ inline v_float32x4 v_pack_triplets(const v_float32x4& vec) { return vec; }

 inline v_float32x4 v_load_expand(const float16_t* ptr)
 {
+#if CV_FP16
+    return v_float32x4(_mm_cvtph_ps(_mm_loadu_si128((const __m128i*)ptr)));
+#else
    const __m128i z = _mm_setzero_si128(), delta = _mm_set1_epi32(0x38000000);
    const __m128i signmask = _mm_set1_epi32(0x80000000), maxexp = _mm_set1_epi32(0x7c000000);
    const __m128 deltaf = _mm_castsi128_ps(_mm_set1_epi32(0x38800000));
@@ -2968,10 +2958,15 @@ inline v_float32x4 v_load_expand(const float16_t* ptr)
    __m128i zmask = _mm_cmpeq_epi32(e, z);
    __m128i ft = v_select_si128(zmask, zt, t);
    return v_float32x4(_mm_castsi128_ps(_mm_or_si128(ft, sign)));
+#endif
 }

 inline void v_pack_store(float16_t* ptr, const v_float32x4& v)
 {
+#if CV_FP16
+    __m128i fp16_value = _mm_cvtps_ph(v.val, 0);
+    _mm_storel_epi64((__m128i*)ptr, fp16_value);
+#else
    const __m128i signmask = _mm_set1_epi32(0x80000000);
    const __m128i rval = _mm_set1_epi32(0x3f000000);

@@ -2993,6 +2988,7 @@ inline void v_pack_store(float16_t* ptr, const v_float32x4& v)
    t = _mm_or_si128(t, sign);
    t = _mm_packs_epi32(t, t);
    _mm_storel_epi64((__m128i*)ptr, t);
+#endif
 }

 inline void v_cleanup() {}

--- a/modules/core/src/directx.cpp
+++ b/modules/core/src/directx.cpp
@@ -256,6 +256,12 @@ Context& initializeContextFromD3D11Device(ID3D11Device* pD3D11Device)
        CV_Error(cv::Error::OpenCLInitError, "OpenCL: No available platforms");

    std::vector<cl_platform_id> platforms(numPlatforms);
+    status = clGetPlatformIDs(numPlatforms, &platforms[0], NULL);
+    if (status != CL_SUCCESS)
+        CV_Error(cv::Error::OpenCLInitError, "OpenCL: Can't get platforms");
+
+    // TODO Filter platforms by name from OPENCV_OPENCL_DEVICE
+
    size_t exts_len;
    cv::AutoBuffer<char> extensions;
    bool is_support_cl_khr_d3d11_sharing = false;
@@ -264,9 +270,6 @@ Context& initializeContextFromD3D11Device(ID3D11Device* pD3D11Device)
 #endif
    for (int i = 0; i < (int)numPlatforms; i++)
    {
-        status = clGetPlatformIDs(numPlatforms, &platforms[i], NULL);
-        if (status != CL_SUCCESS)
-            CV_Error(cv::Error::OpenCLInitError, "OpenCL: Can't get number of platforms");
        status = clGetPlatformInfo(platforms[i], CL_PLATFORM_EXTENSIONS, 0, NULL, &exts_len);
        if (status != CL_SUCCESS)
            CV_Error(cv::Error::OpenCLInitError, "OpenCL: Can't get length of CL_PLATFORM_EXTENSIONS");
@@ -479,7 +482,7 @@ Context& initializeContextFromD3D10Device(ID3D10Device* pD3D10Device)
    std::vector<cl_platform_id> platforms(numPlatforms);
    status = clGetPlatformIDs(numPlatforms, &platforms[0], NULL);
    if (status != CL_SUCCESS)
-        CV_Error(cv::Error::OpenCLInitError, "OpenCL: Can't get number of platforms");
+        CV_Error(cv::Error::OpenCLInitError, "OpenCL: Can't get platforms");

    // TODO Filter platforms by name from OPENCV_OPENCL_DEVICE

@@ -587,7 +590,7 @@ Context& initializeContextFromDirect3DDevice9Ex(IDirect3DDevice9Ex* pDirect3DDev
    std::vector<cl_platform_id> platforms(numPlatforms);
    status = clGetPlatformIDs(numPlatforms, &platforms[0], NULL);
    if (status != CL_SUCCESS)
-        CV_Error(cv::Error::OpenCLInitError, "OpenCL: Can't get number of platforms");
+        CV_Error(cv::Error::OpenCLInitError, "OpenCL: Can't get platforms");

    // TODO Filter platforms by name from OPENCV_OPENCL_DEVICE

@@ -697,7 +700,7 @@ Context& initializeContextFromDirect3DDevice9(IDirect3DDevice9* pDirect3DDevice9
    std::vector<cl_platform_id> platforms(numPlatforms);
    status = clGetPlatformIDs(numPlatforms, &platforms[0], NULL);
    if (status != CL_SUCCESS)
-        CV_Error(cv::Error::OpenCLInitError, "OpenCL: Can't get number of platforms");
+        CV_Error(cv::Error::OpenCLInitError, "OpenCL: Can't get platforms");

    // TODO Filter platforms by name from OPENCV_OPENCL_DEVICE


--- a/modules/dnn/include/opencv2/dnn/dnn.hpp
+++ b/modules/dnn/include/opencv2/dnn/dnn.hpp
@@ -44,6 +44,9 @@

 #include <vector>
 #include <opencv2/core.hpp>
+#ifdef CV_CXX11
+#include <future>
+#endif

 #include "../dnn/version.hpp"

@@ -57,6 +60,18 @@ CV__DNN_INLINE_NS_BEGIN

    typedef std::vector<int> MatShape;

+#if defined(CV_CXX11) || defined(CV_DOXYGEN)
+    typedef std::future<Mat> AsyncMat;
+#else
+    // Just a workaround for bindings.
+    struct AsyncMat
+    {
+        Mat get() { return Mat(); }
+        void wait() const {}
+        size_t wait_for(size_t milliseconds) const { CV_UNUSED(milliseconds); return -1; }
+    };
+#endif
+
    /**
     * @brief Enum of computation backends supported by layers.
     * @see Net::setPreferableBackend
@@ -68,7 +83,7 @@ CV__DNN_INLINE_NS_BEGIN
        //! DNN_BACKEND_OPENCV otherwise.
        DNN_BACKEND_DEFAULT,
        DNN_BACKEND_HALIDE,
-        DNN_BACKEND_INFERENCE_ENGINE,
+        DNN_BACKEND_INFERENCE_ENGINE,  //!< Intel's Inference Engine computational backend.
        DNN_BACKEND_OPENCV,
        DNN_BACKEND_VKCOM
    };
@@ -84,8 +99,7 @@ CV__DNN_INLINE_NS_BEGIN
        DNN_TARGET_OPENCL_FP16,
        DNN_TARGET_MYRIAD,
        DNN_TARGET_VULKAN,
-        //! FPGA device with CPU fallbacks using Inference Engine's Heterogeneous plugin.
-        DNN_TARGET_FPGA
+        DNN_TARGET_FPGA  //!< FPGA device with CPU fallbacks using Inference Engine's Heterogeneous plugin.
    };

    CV_EXPORTS std::vector< std::pair<Backend, Target> > getAvailableBackends();
@@ -458,6 +472,15 @@ CV__DNN_INLINE_NS_BEGIN
         */
        CV_WRAP Mat forward(const String& outputName = String());

+        /** @brief Runs forward pass to compute output of layer with name @p outputName.
+         *  @param outputName name for layer which output is needed to get
+         *  @details By default runs forward pass for the whole network.
+         *
+         *  This is an asynchronous version of forward(const String&).
+         *  dnn::DNN_BACKEND_INFERENCE_ENGINE backend is required.
+         */
+        CV_WRAP AsyncMat forwardAsync(const String& outputName = String());
+
        /** @brief Runs forward pass to compute output of layer with name @p outputName.
         *  @param outputBlobs contains all output blobs for specified layer.
         *  @param outputName name for layer which output is needed to get

--- a/modules/dnn/include/opencv2/dnn/version.hpp
+++ b/modules/dnn/include/opencv2/dnn/version.hpp
@@ -6,7 +6,7 @@
 #define OPENCV_DNN_VERSION_HPP

 /// Use with major OpenCV version only.
-#define OPENCV_DNN_API_VERSION 20190412
+#define OPENCV_DNN_API_VERSION 20190422

 #if !defined CV_DOXYGEN && !defined CV_STATIC_ANALYSIS && !defined CV_DNN_DONT_ADD_INLINE_NS
 #define CV__DNN_INLINE_NS __CV_CAT(dnn4_v, OPENCV_DNN_API_VERSION)

--- a/modules/dnn/misc/python/pyopencv_dnn.hpp
+++ b/modules/dnn/misc/python/pyopencv_dnn.hpp
@@ -2,7 +2,13 @@
 typedef dnn::DictValue LayerId;
 typedef std::vector<dnn::MatShape> vector_MatShape;
 typedef std::vector<std::vector<dnn::MatShape> > vector_vector_MatShape;
-
+#ifdef CV_CXX11
+typedef std::chrono::milliseconds chrono_milliseconds;
+typedef std::future_status AsyncMatStatus;
+#else
+typedef size_t chrono_milliseconds;
+typedef size_t AsyncMatStatus;
+#endif

 template<>
 bool pyopencv_to(PyObject *o, dnn::DictValue &dv, const char *name)
@@ -40,6 +46,46 @@ bool pyopencv_to(PyObject *o, std::vector<Mat> &blobs, const char *name) //requi
  return pyopencvVecConverter<Mat>::to(o, blobs, ArgInfo(name, false));
 }

+#ifdef CV_CXX11
+
+template<>
+PyObject* pyopencv_from(const std::future<Mat>& f_)
+{
+    std::future<Mat>& f = const_cast<std::future<Mat>&>(f_);
+    Ptr<cv::dnn::AsyncMat> p(new std::future<Mat>(std::move(f)));
+    return pyopencv_from(p);
+}
+
+template<>
+PyObject* pyopencv_from(const std::future_status& status)
+{
+    return pyopencv_from((int)status);
+}
+
+template<>
+bool pyopencv_to(PyObject* src, std::chrono::milliseconds& dst, const char* name)
+{
+    size_t millis = 0;
+    if (pyopencv_to(src, millis, name))
+    {
+        dst = std::chrono::milliseconds(millis);
+        return true;
+    }
+    else
+        return false;
+}
+
+#else
+
+template<>
+PyObject* pyopencv_from(const cv::dnn::AsyncMat&)
+{
+    CV_Error(Error::StsNotImplemented, "C++11 is required.");
+    return 0;
+}
+
+#endif  // CV_CXX11
+
 template<typename T>
 PyObject* pyopencv_from(const dnn::DictValue &dv)
 {

--- a/modules/dnn/misc/python/shadow_async_mat.hpp
+++ b/modules/dnn/misc/python/shadow_async_mat.hpp
+#error This is a shadow header file, which is not intended for processing by any compiler. \
+       Only bindings parser should handle this file.
+
+namespace cv { namespace dnn {
+
+class CV_EXPORTS_W AsyncMat
+{
+public:
+    //! Wait for Mat object readiness and return it.
+    CV_WRAP Mat get();
+
+    //! Wait for Mat object readiness.
+    CV_WRAP void wait() const;
+
+    /** @brief Wait for Mat object readiness specific amount of time.
+     *  @param timeout Timeout in milliseconds
+     *  @returns [std::future_status](https://en.cppreference.com/w/cpp/thread/future_status)
+     */
+    CV_WRAP AsyncMatStatus wait_for(std::chrono::milliseconds timeout) const;
+};
+
+}}
--- a/modules/dnn/misc/python/test/test_dnn.py
+++ b/modules/dnn/misc/python/test/test_dnn.py
@@ -5,8 +5,8 @@ import numpy as np

 from tests_common import NewOpenCVTests, unittest

-def normAssert(test, a, b, lInf=1e-5):
-    test.assertLess(np.max(np.abs(a - b)), lInf)
+def normAssert(test, a, b, msg=None, lInf=1e-5):
+    test.assertLess(np.max(np.abs(a - b)), lInf, msg)

 def inter_area(box1, box2):
    x_min, x_max = max(box1[0], box2[0]), min(box1[2], box2[2])
@@ -53,53 +53,6 @@ def normAssertDetections(test, ref, out, confThreshold=0.0, scores_diff=1e-5, bo
    if errMsg:
        test.fail(errMsg)

-
-# Returns a simple one-layer network created from Caffe's format
-def getSimpleNet():
-    prototxt = """
-        name: "simpleNet"
-        input: "data"
-        layer {
-          type: "Identity"
-          name: "testLayer"
-          top: "testLayer"
-          bottom: "data"
-        }
-    """
-    return cv.dnn.readNetFromCaffe(bytearray(prototxt, 'utf8'))
-
-
-def testBackendAndTarget(backend, target):
-    net = getSimpleNet()
-    net.setPreferableBackend(backend)
-    net.setPreferableTarget(target)
-    inp = np.random.standard_normal([1, 2, 3, 4]).astype(np.float32)
-    try:
-        net.setInput(inp)
-        net.forward()
-    except BaseException as e:
-        return False
-    return True
-
-
-haveInfEngine = testBackendAndTarget(cv.dnn.DNN_BACKEND_INFERENCE_ENGINE, cv.dnn.DNN_TARGET_CPU)
-dnnBackendsAndTargets = [
-    [cv.dnn.DNN_BACKEND_OPENCV, cv.dnn.DNN_TARGET_CPU],
-]
-
-if haveInfEngine:
-    dnnBackendsAndTargets.append([cv.dnn.DNN_BACKEND_INFERENCE_ENGINE, cv.dnn.DNN_TARGET_CPU])
-    if testBackendAndTarget(cv.dnn.DNN_BACKEND_INFERENCE_ENGINE, cv.dnn.DNN_TARGET_MYRIAD):
-        dnnBackendsAndTargets.append([cv.dnn.DNN_BACKEND_INFERENCE_ENGINE, cv.dnn.DNN_TARGET_MYRIAD])
-
-if cv.ocl.haveOpenCL() and cv.ocl.useOpenCL():
-    dnnBackendsAndTargets.append([cv.dnn.DNN_BACKEND_OPENCV, cv.dnn.DNN_TARGET_OPENCL])
-    dnnBackendsAndTargets.append([cv.dnn.DNN_BACKEND_OPENCV, cv.dnn.DNN_TARGET_OPENCL_FP16])
-    if haveInfEngine and cv.ocl_Device.getDefault().isIntel():
-        dnnBackendsAndTargets.append([cv.dnn.DNN_BACKEND_INFERENCE_ENGINE, cv.dnn.DNN_TARGET_OPENCL])
-        dnnBackendsAndTargets.append([cv.dnn.DNN_BACKEND_INFERENCE_ENGINE, cv.dnn.DNN_TARGET_OPENCL_FP16])
-
-
 def printParams(backend, target):
    backendNames = {
        cv.dnn.DNN_BACKEND_OPENCV: 'OCV',
@@ -116,8 +69,44 @@ def printParams(backend, target):

 class dnn_test(NewOpenCVTests):

+    def __init__(self, *args, **kwargs):
+        super(dnn_test, self).__init__(*args, **kwargs)
+        self.dnnBackendsAndTargets = [
+            [cv.dnn.DNN_BACKEND_OPENCV, cv.dnn.DNN_TARGET_CPU],
+        ]
+
+        if self.checkIETarget(cv.dnn.DNN_BACKEND_INFERENCE_ENGINE, cv.dnn.DNN_TARGET_CPU):
+            self.dnnBackendsAndTargets.append([cv.dnn.DNN_BACKEND_INFERENCE_ENGINE, cv.dnn.DNN_TARGET_CPU])
+        if self.checkIETarget(cv.dnn.DNN_BACKEND_INFERENCE_ENGINE, cv.dnn.DNN_TARGET_MYRIAD):
+            self.dnnBackendsAndTargets.append([cv.dnn.DNN_BACKEND_INFERENCE_ENGINE, cv.dnn.DNN_TARGET_MYRIAD])
+
+        if cv.ocl.haveOpenCL() and cv.ocl.useOpenCL():
+            self.dnnBackendsAndTargets.append([cv.dnn.DNN_BACKEND_OPENCV, cv.dnn.DNN_TARGET_OPENCL])
+            self.dnnBackendsAndTargets.append([cv.dnn.DNN_BACKEND_OPENCV, cv.dnn.DNN_TARGET_OPENCL_FP16])
+            if cv.ocl_Device.getDefault().isIntel():
+                if self.checkIETarget(cv.dnn.DNN_BACKEND_INFERENCE_ENGINE, cv.dnn.DNN_TARGET_OPENCL):
+                    self.dnnBackendsAndTargets.append([cv.dnn.DNN_BACKEND_INFERENCE_ENGINE, cv.dnn.DNN_TARGET_OPENCL])
+                if self.checkIETarget(cv.dnn.DNN_BACKEND_INFERENCE_ENGINE, cv.dnn.DNN_TARGET_OPENCL_FP16):
+                    self.dnnBackendsAndTargets.append([cv.dnn.DNN_BACKEND_INFERENCE_ENGINE, cv.dnn.DNN_TARGET_OPENCL_FP16])
+
    def find_dnn_file(self, filename, required=True):
-        return self.find_file(filename, [os.environ.get('OPENCV_DNN_TEST_DATA_PATH', os.getcwd())], required=required)
+        return self.find_file(filename, [os.environ.get('OPENCV_DNN_TEST_DATA_PATH', os.getcwd()),
+                                         os.environ['OPENCV_TEST_DATA_PATH']],
+                              required=required)
+
+    def checkIETarget(self, backend, target):
+        proto = self.find_dnn_file('dnn/layers/layer_convolution.prototxt', required=True)
+        model = self.find_dnn_file('dnn/layers/layer_convolution.caffemodel', required=True)
+        net = cv.dnn.readNet(proto, model)
+        net.setPreferableBackend(backend)
+        net.setPreferableTarget(target)
+        inp = np.random.standard_normal([1, 2, 10, 11]).astype(np.float32)
+        try:
+            net.setInput(inp)
+            net.forward()
+        except BaseException as e:
+            return False
+        return True

    def test_blobFromImage(self):
        np.random.seed(324)
@@ -148,7 +137,7 @@ class dnn_test(NewOpenCVTests):

    def test_face_detection(self):
        testdata_required = bool(os.environ.get('OPENCV_DNN_TEST_REQUIRE_TESTDATA', False))
-        proto = self.find_dnn_file('dnn/opencv_face_detector.prototxt2', required=testdata_required)
+        proto = self.find_dnn_file('dnn/opencv_face_detector.prototxt', required=testdata_required)
        model = self.find_dnn_file('dnn/opencv_face_detector.caffemodel', required=testdata_required)
        if proto is None or model is None:
            raise unittest.SkipTest("Missing DNN test files (dnn/opencv_face_detector.{prototxt/caffemodel}). Verify OPENCV_DNN_TEST_DATA_PATH configuration parameter.")
@@ -164,7 +153,7 @@ class dnn_test(NewOpenCVTests):
               [0, 1, 0.95097077, 0.51901293, 0.45863652, 0.5777427,  0.5347801]]

        print('\n')
-        for backend, target in dnnBackendsAndTargets:
+        for backend, target in self.dnnBackendsAndTargets:
            printParams(backend, target)

            net = cv.dnn.readNet(proto, model)
@@ -178,5 +167,52 @@ class dnn_test(NewOpenCVTests):

            normAssertDetections(self, ref, out, 0.5, scoresDiff, iouDiff)

+    def test_async(self):
+        timeout = 5000  # in milliseconds
+        testdata_required = bool(os.environ.get('OPENCV_DNN_TEST_REQUIRE_TESTDATA', False))
+        proto = self.find_dnn_file('dnn/layers/layer_convolution.prototxt', required=testdata_required)
+        model = self.find_dnn_file('dnn/layers/layer_convolution.caffemodel', required=testdata_required)
+        if proto is None or model is None:
+            raise unittest.SkipTest("Missing DNN test files (dnn/layers/layer_convolution.{prototxt/caffemodel}). Verify OPENCV_DNN_TEST_DATA_PATH configuration parameter.")
+
+        print('\n')
+        for backend, target in self.dnnBackendsAndTargets:
+            if backend != cv.dnn.DNN_BACKEND_INFERENCE_ENGINE:
+                continue
+
+            printParams(backend, target)
+
+            netSync = cv.dnn.readNet(proto, model)
+            netSync.setPreferableBackend(backend)
+            netSync.setPreferableTarget(target)
+
+            netAsync = cv.dnn.readNet(proto, model)
+            netAsync.setPreferableBackend(backend)
+            netAsync.setPreferableTarget(target)
+
+            # Generate inputs
+            numInputs = 10
+            inputs = []
+            for _ in range(numInputs):
+                inputs.append(np.random.standard_normal([2, 6, 75, 113]).astype(np.float32))
+
+            # Run synchronously
+            refs = []
+            for i in range(numInputs):
+                netSync.setInput(inputs[i])
+                refs.append(netSync.forward())
+
+            # Run asynchronously. To make test more robust, process inputs in the reversed order.
+            outs = []
+            for i in reversed(range(numInputs)):
+                netAsync.setInput(inputs[i])
+                outs.insert(0, netAsync.forwardAsync())
+
+            for i in reversed(range(numInputs)):
+                if outs[i].wait_for(timeout) == 1:
+                    self.fail("Timeout")
+                normAssert(self, refs[i], outs[i].get(), 'Index: %d' % i, 1e-10)
+
+
 if __name__ == '__main__':
    NewOpenCVTests.bootstrap()
--- a/modules/dnn/src/dnn.cpp
+++ b/modules/dnn/src/dnn.cpp
@@ -1043,6 +1043,7 @@ struct Net::Impl
        lastLayerId = 0;
        netWasAllocated = false;
        fusion = true;
+        isAsync = false;
        preferableBackend = DNN_BACKEND_DEFAULT;
        preferableTarget = DNN_TARGET_CPU;
        skipInfEngineInit = false;
@@ -1064,6 +1065,7 @@ struct Net::Impl

    bool netWasAllocated;
    bool fusion;
+    bool isAsync;
    std::vector<int64> layersTimings;
    Mat output_blob;

@@ -2321,6 +2323,9 @@ struct Net::Impl
            std::map<int, Ptr<BackendNode> >::iterator it = ld.backendNodes.find(preferableBackend);
            if (preferableBackend == DNN_BACKEND_OPENCV || it == ld.backendNodes.end() || it->second.empty())
            {
+                if (isAsync)
+                    CV_Error(Error::StsNotImplemented, "Default implementation fallbacks in asynchronous mode");
+
                if (preferableBackend == DNN_BACKEND_OPENCV && IS_DNN_OPENCL_TARGET(preferableTarget))
                {
                    std::vector<UMat> umat_inputBlobs = OpenCLBackendWrapper::getUMatVector(ld.inputBlobsWrappers);
@@ -2476,7 +2481,7 @@ struct Net::Impl
                }
                else if (preferableBackend == DNN_BACKEND_INFERENCE_ENGINE)
                {
-                    forwardInfEngine(node);
+                    forwardInfEngine(ld.outputBlobsWrappers, node, isAsync);
                }
                else if (preferableBackend == DNN_BACKEND_VKCOM)
                {
@@ -2535,15 +2540,6 @@ struct Net::Impl
        forwardLayer(ld);
    }

-    void forwardAll()
-    {
-        CV_TRACE_FUNCTION();
-
-        MapIdToLayerData::reverse_iterator last_layer = layers.rbegin();
-        CV_Assert(last_layer != layers.rend());
-        forwardToLayer(last_layer->second, true);
-    }
-
    void getLayerShapesRecursively(int id, LayersShapesMap& inOutShapes)
    {
        std::vector<LayerPin>& inputLayerIds = layers[id].inputBlobsId;
@@ -2634,6 +2630,42 @@ struct Net::Impl
    {
        return getBlob(getPinByAlias(outputName));
    }
+
+#ifdef CV_CXX11
+    std::future<Mat> getBlobAsync(const LayerPin& pin)
+    {
+        CV_TRACE_FUNCTION();
+#ifdef HAVE_INF_ENGINE
+        if (!pin.valid())
+            CV_Error(Error::StsObjectNotFound, "Requested blob not found");
+
+        LayerData &ld = layers[pin.lid];
+        if ((size_t)pin.oid >= ld.outputBlobs.size())
+        {
+            CV_Error(Error::StsOutOfRange, format("Layer \"%s\" produce only %d outputs, "
+                                           "the #%d was requested", ld.name.c_str(),
+                                           (int)ld.outputBlobs.size(), (int)pin.oid));
+        }
+        if (preferableTarget != DNN_TARGET_CPU)
+        {
+            CV_Assert(!ld.outputBlobsWrappers.empty() && !ld.outputBlobsWrappers[pin.oid].empty());
+            // Transfer data to CPU if it's require.
+            ld.outputBlobsWrappers[pin.oid]->copyToHost();
+        }
+        CV_Assert(preferableBackend == DNN_BACKEND_INFERENCE_ENGINE);
+
+        Ptr<InfEngineBackendWrapper> wrapper = ld.outputBlobsWrappers[pin.oid].dynamicCast<InfEngineBackendWrapper>();
+        return std::move(wrapper->futureMat);
+#else
+        CV_Error(Error::StsNotImplemented, "DNN_BACKEND_INFERENCE_ENGINE backend is required");
+#endif
+    }
+
+    std::future<Mat> getBlobAsync(String outputName)
+    {
+        return getBlobAsync(getPinByAlias(outputName));
+    }
+#endif  // CV_CXX11
 };

 Net::Net() : impl(new Net::Impl)
@@ -2757,6 +2789,31 @@ Mat Net::forward(const String& outputName)
    return impl->getBlob(layerName);
 }

+AsyncMat Net::forwardAsync(const String& outputName)
+{
+    CV_TRACE_FUNCTION();
+#ifdef CV_CXX11
+    if (impl->preferableBackend != DNN_BACKEND_INFERENCE_ENGINE)
+        CV_Error(Error::StsNotImplemented, "Asynchronous forward for backend which is different from DNN_BACKEND_INFERENCE_ENGINE");
+
+    String layerName = outputName;
+
+    if (layerName.empty())
+        layerName = getLayerNames().back();
+
+    std::vector<LayerPin> pins(1, impl->getPinByAlias(layerName));
+    impl->setUpNet(pins);
+
+    impl->isAsync = true;
+    impl->forwardToLayer(impl->getLayerData(layerName));
+    impl->isAsync = false;
+
+    return impl->getBlobAsync(layerName);
+#else
+    CV_Error(Error::StsNotImplemented, "Asynchronous forward without C++11");
+#endif  // CV_CXX11
+}
+
 void Net::forward(OutputArrayOfArrays outputBlobs, const String& outputName)
 {
    CV_TRACE_FUNCTION();

--- a/modules/dnn/src/onnx/onnx_importer.cpp
+++ b/modules/dnn/src/onnx/onnx_importer.cpp
@@ -140,9 +140,10 @@ Mat getMatFromTensor(opencv_onnx::TensorProto& tensor_proto)
    return blob;
 }

-void runLayer(Ptr<Layer> layer, const std::vector<Mat>& inputs,
+void runLayer(LayerParams& params, const std::vector<Mat>& inputs,
              std::vector<Mat>& outputs)
 {
+    Ptr<Layer> layer = LayerFactory::createLayerInstance(params.type, params);
    std::vector<MatShape> inpShapes(inputs.size());
    int ddepth = CV_32F;
    for (size_t i = 0; i < inputs.size(); ++i)
@@ -669,14 +670,15 @@ void ONNXImporter::populateNet(Net dstNet)
                Mat blob = getBlob(node_proto, constBlobs, 1);
                CV_Assert(blob.type() == CV_32SC1);

+                layerParams.set("dim", DictValue::arrayInt<int*>(
+                            blob.ptr<int>(), blob.total() ));
+
                if (layer_id.find(node_proto.input(0)) == layer_id.end()) {
-                    Mat input = getBlob(node_proto, constBlobs, 0);
-                    Mat out = input.reshape(0, static_cast<std::vector<int> >(blob));
-                    constBlobs.insert(std::make_pair(layerParams.name, out));
+                    std::vector<Mat> inputs(1, getBlob(node_proto, constBlobs, 0)), outputs;
+                    runLayer(layerParams, inputs, outputs);
+                    constBlobs.insert(std::make_pair(layerParams.name, outputs[0]));
                    continue;
                }
-                layerParams.set("dim", DictValue::arrayInt<int*>(
-                            blob.ptr<int>(), blob.total() ));
            }
            else {
                DictValue shape = layerParams.get("shape");
@@ -749,8 +751,7 @@ void ONNXImporter::populateNet(Net dstNet)
                {
                    inputs[i] = getBlob(node_proto, constBlobs, i);
                }
-                Ptr<Layer> concat = ConcatLayer::create(layerParams);
-                runLayer(concat, inputs, concatenated);
+                runLayer(layerParams, inputs, concatenated);

                CV_Assert(concatenated.size() == 1);
                constBlobs.insert(std::make_pair(layerParams.name, concatenated[0]));

--- a/modules/dnn/src/op_inf_engine.cpp
+++ b/modules/dnn/src/op_inf_engine.cpp
@@ -168,7 +168,6 @@ void InfEngineBackendNet::init(int targetId)
        const std::string& name = it.first;
        auto blobIt = allBlobs.find(name);
        CV_Assert(blobIt != allBlobs.end());
-        inpBlobs[name] = blobIt->second;
        it.second->setPrecision(blobIt->second->precision());
    }
    for (const auto& it : cnn.getOutputsInfo())
@@ -176,7 +175,6 @@ void InfEngineBackendNet::init(int targetId)
        const std::string& name = it.first;
        auto blobIt = allBlobs.find(name);
        CV_Assert(blobIt != allBlobs.end());
-        outBlobs[name] = blobIt->second;
        it.second->setPrecision(blobIt->second->precision());  // Should be always FP32
    }

@@ -288,6 +286,24 @@ InferenceEngine::Blob::Ptr wrapToInfEngineBlob(const Mat& m, InferenceEngine::La
    return wrapToInfEngineBlob(m, reversedShape, layout);
 }

+InferenceEngine::Blob::Ptr cloneBlob(const InferenceEngine::Blob::Ptr& blob)  // Builds a new blob with the same precision, layout and dims as `blob`; no data is copied from `blob` here.
+{
+    InferenceEngine::Precision precision = blob->precision();
+    InferenceEngine::Blob::Ptr copy;
+    if (precision == InferenceEngine::Precision::FP32)
+    {
+        copy = InferenceEngine::make_shared_blob<float>(precision, blob->layout(), blob->dims());
+    }
+    else if (precision == InferenceEngine::Precision::U8)
+    {
+        copy = InferenceEngine::make_shared_blob<uint8_t>(precision, blob->layout(), blob->dims());
+    }
+    else
+        CV_Error(Error::StsNotImplemented, "Unsupported blob precision");  // only FP32 and U8 are handled
+    copy->allocate();  // presumably reserves the storage only; forward() fills async input blobs itself
+    return copy;
+}
+
 InferenceEngine::DataPtr infEngineDataNode(const Ptr<BackendWrapper>& ptr)
 {
    CV_Assert(!ptr.empty());
@@ -800,9 +816,6 @@ void InfEngineBackendNet::initPlugin(InferenceEngine::ICNNNetwork& net)
        plugin = InferenceEngine::InferencePlugin(enginePtr);

        netExec = plugin.LoadNetwork(net, {});
-        infRequest = netExec.CreateInferRequest();
-        infRequest.SetInput(inpBlobs);
-        infRequest.SetOutput(outBlobs);
    }
    catch (const std::exception& ex)
    {
@@ -828,9 +841,116 @@ void InfEngineBackendNet::addBlobs(const std::vector<Ptr<BackendWrapper> >& ptrs
    }
 }

-void InfEngineBackendNet::forward()
+void InfEngineBackendNet::InfEngineReqWrapper::makePromises(const std::vector<Ptr<BackendWrapper> >& outsWrappers)  // Re-arms this request: one fresh promise per output wrapper, whose future is stored on the wrapper.
+{
+    auto outs = infEngineWrappers(outsWrappers);
+    outProms.clear();  // drop the promises of the previous run; get_future() may be called only once per promise
+    outProms.resize(outs.size());
+    outsNames.resize(outs.size());
+    for (size_t i = 0; i < outs.size(); ++i)  // size_t matches size() and avoids a signed/unsigned comparison
+    {
+        outs[i]->futureMat = outProms[i].get_future();
+        outsNames[i] = outs[i]->dataPtr->name;  // name used later to fetch the blob that fulfils outProms[i]
+    }
+}
+
+void InfEngineBackendNet::forward(const std::vector<Ptr<BackendWrapper> >& outBlobsWrappers,
+                                  bool isAsync)  // isAsync: start the request with StartAsync() and deliver outputs through futures on outBlobsWrappers; otherwise block in Infer()
 {
-    infRequest.Infer();
+    // Reuse an idle request if there is one (isReady is set back to true by the completion callback).
+    Ptr<InfEngineReqWrapper> reqWrapper;
+    for (auto& wrapper : infRequests)
+    {
+        if (wrapper->isReady)
+        {
+            reqWrapper = wrapper;
+            break;
+        }
+    }
+    if (reqWrapper.empty())
+    {
+        reqWrapper = Ptr<InfEngineReqWrapper>(new InfEngineReqWrapper());
+        try
+        {
+            reqWrapper->req = netExec.CreateInferRequest();
+        }
+        catch (const std::exception& ex)
+        {
+            CV_Error(Error::StsAssert, format("Failed to initialize Inference Engine backend: %s", ex.what()));
+        }
+        infRequests.push_back(reqWrapper);  // keeps the wrapper alive; its raw pointer is handed to the request below
+
+        InferenceEngine::BlobMap inpBlobs, outBlobs;  // async requests get private clones, sync requests share allBlobs
+        for (const auto& it : cnn.getInputsInfo())
+        {
+            const std::string& name = it.first;
+            auto blobIt = allBlobs.find(name);
+            CV_Assert(blobIt != allBlobs.end());
+            inpBlobs[name] = isAsync ? cloneBlob(blobIt->second) : blobIt->second;
+        }
+        for (const auto& it : cnn.getOutputsInfo())
+        {
+            const std::string& name = it.first;
+            auto blobIt = allBlobs.find(name);
+            CV_Assert(blobIt != allBlobs.end());
+            outBlobs[name] = isAsync ? cloneBlob(blobIt->second) : blobIt->second;
+        }
+        reqWrapper->req.SetInput(inpBlobs);
+        reqWrapper->req.SetOutput(outBlobs);
+
+        InferenceEngine::IInferRequest::Ptr infRequestPtr = reqWrapper->req;
+        infRequestPtr->SetUserData(reqWrapper.get(), 0);
+
+        infRequestPtr->SetCompletionCallback({
+            [](InferenceEngine::IInferRequest::Ptr request, InferenceEngine::StatusCode status)
+            {
+                InfEngineReqWrapper* wrapper = NULL;  // stays NULL if GetUserData() does not fill it
+                request->GetUserData((void**)&wrapper, 0);
+                CV_Assert(wrapper);
+
+                for (size_t i = 0; i < wrapper->outProms.size(); ++i)
+                {
+                    // Nothing may escape this callback: every failure is routed into the promise instead.
+                    try {
+                        if (status != InferenceEngine::StatusCode::OK)
+                            throw std::runtime_error("Async request failed");
+                        const std::string& name = wrapper->outsNames[i];
+                        Mat m = infEngineBlobToMat(wrapper->req.GetBlob(name));
+                        wrapper->outProms[i].set_value(m.clone());  // clone: the request's blob is reused by later runs
+                    } catch (...) {
+                        try {
+                            wrapper->outProms[i].set_exception(std::current_exception());
+                        } catch(...) {
+                            CV_LOG_ERROR(NULL, "DNN: Exception occured during async inference exception propagation");
+                        }
+                    }
+                }
+                wrapper->isReady = true;  // always reached now, so the request can be picked up again
+            }
+        });
+    }
+    if (isAsync)
+    {
+        // Copy actual data to infer request's input blobs.
+        for (const auto& it : cnn.getInputsInfo())
+        {
+            const std::string& name = it.first;
+            auto blobIt = allBlobs.find(name);
+            Mat srcMat = infEngineBlobToMat(blobIt->second);
+            Mat dstMat = infEngineBlobToMat(reqWrapper->req.GetBlob(name));
+            srcMat.copyTo(dstMat);
+        }
+
+        // Set promises to output blobs wrappers.
+        reqWrapper->makePromises(outBlobsWrappers);
+
+        reqWrapper->isReady = false;  // must be cleared before StartAsync() so the request is not handed out twice
+        reqWrapper->req.StartAsync();
+    }
+    else
+    {
+        reqWrapper->req.Infer();  // NOTE(review): a reused wrapper keeps the blobs it was created with; one created for an async call has cloned inputs that are not refreshed on this path - confirm sync and async calls are never mixed on one net
+    }
 }

 Mat infEngineBlobToMat(const InferenceEngine::Blob::Ptr& blob)
@@ -920,14 +1040,15 @@ bool haveInfEngine()
 #endif  // HAVE_INF_ENGINE
 }

-void forwardInfEngine(Ptr<BackendNode>& node)
+void forwardInfEngine(const std::vector<Ptr<BackendWrapper> >& outBlobsWrappers,  // output wrappers; in async mode each one receives a future for its result
+                      Ptr<BackendNode>& node, bool isAsync)  // node must be an InfEngineBackendNode; isAsync is passed through to the net's forward()
 {
    CV_Assert(haveInfEngine());
 #ifdef HAVE_INF_ENGINE
    CV_Assert(!node.empty());
    Ptr<InfEngineBackendNode> ieNode = node.dynamicCast<InfEngineBackendNode>();
    CV_Assert(!ieNode.empty());
-    ieNode->net->forward();
+    ieNode->net->forward(outBlobsWrappers, isAsync);  // all of the work happens in InfEngineBackendNet::forward()
 #endif  // HAVE_INF_ENGINE
 }


--- a/modules/dnn/src/op_inf_engine.hpp
+++ b/modules/dnn/src/op_inf_engine.hpp
@@ -185,7 +185,8 @@ public:

    void init(int targetId);

-    void forward();
+    void forward(const std::vector<Ptr<BackendWrapper> >& outBlobsWrappers,  // wrappers that receive futures when isAsync is true
+                 bool isAsync);  // true: StartAsync() and return at once; false: blocking Infer()

    void initPlugin(InferenceEngine::ICNNNetwork& net);

@@ -197,12 +198,23 @@ private:
    InferenceEngine::InferenceEnginePluginPtr enginePtr;
    InferenceEngine::InferencePlugin plugin;
    InferenceEngine::ExecutableNetwork netExec;
-    InferenceEngine::InferRequest infRequest;
    InferenceEngine::BlobMap allBlobs;
-    InferenceEngine::BlobMap inpBlobs;
-    InferenceEngine::BlobMap outBlobs;
    InferenceEngine::TargetDevice targetDevice;

+    struct InfEngineReqWrapper  // One inference request plus the bookkeeping needed to deliver its outputs through futures.
+    {
+        InfEngineReqWrapper() : isReady(true) {}  // a new wrapper is idle until forward() starts it asynchronously
+
+        void makePromises(const std::vector<Ptr<BackendWrapper> >& outs);  // re-creates outProms/outsNames for the given output wrappers
+
+        InferenceEngine::InferRequest req;
+        std::vector<std::promise<Mat> > outProms;  // one promise per output, fulfilled by the request's completion callback
+        std::vector<std::string> outsNames;  // outsNames[i] is the blob name whose data fulfils outProms[i]
+        bool isReady;  // false while an async run is in flight. NOTE(review): written from the completion callback - if that runs on another thread this should be atomic; confirm
+    };
+
+    std::vector<Ptr<InfEngineReqWrapper> > infRequests;
+
    InferenceEngine::CNNNetwork cnn;
    bool hasNetOwner;

@@ -252,6 +264,7 @@ public:

    InferenceEngine::DataPtr dataPtr;
    InferenceEngine::Blob::Ptr blob;
+    std::future<Mat> futureMat;
 };

 InferenceEngine::Blob::Ptr wrapToInfEngineBlob(const Mat& m, InferenceEngine::Layout layout = InferenceEngine::Layout::ANY);
@@ -302,7 +315,8 @@ CV__DNN_INLINE_NS_END

 bool haveInfEngine();

-void forwardInfEngine(Ptr<BackendNode>& node);
+void forwardInfEngine(const std::vector<Ptr<BackendWrapper> >& outBlobsWrappers,  // output wrappers; they receive futures when isAsync is true
+                      Ptr<BackendNode>& node, bool isAsync);  // runs the Inference Engine net behind `node`, blocking unless isAsync

 }}  // namespace dnn, namespace cv


--- a/modules/dnn/test/test_misc.cpp
+++ b/modules/dnn/test/test_misc.cpp
@@ -342,4 +342,106 @@ TEST(Net, forwardAndRetrieve)
    normAssert(outBlobs[0][1], inp.rowRange(2, 4), "second part");
 }

+#ifdef HAVE_INF_ENGINE
+// This test runs network in synchronous mode for different inputs and then
+// runs the same model asynchronously for the same inputs.
+typedef testing::TestWithParam<Target> Async;
+TEST_P(Async, set_and_forward_single)  // Async variant where every request is awaited before the next input is submitted.
+{
+    static const int kTimeout = 5000;  // in milliseconds.
+    const int target = GetParam();
+
+    const std::string suffix = (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD) ? "_fp16" : "";  // these two targets load the *_fp16 model files
+    const std::string& model = findDataFile("dnn/layers/layer_convolution" + suffix + ".bin");
+    const std::string& proto = findDataFile("dnn/layers/layer_convolution" + suffix + ".xml");
+
+    Net netSync = readNet(model, proto);  // separate nets so the sync and async runs share no state
+    netSync.setPreferableTarget(target);
+
+    Net netAsync = readNet(model, proto);
+    netAsync.setPreferableTarget(target);
+
+    // Generate inputs.
+    const int numInputs = 10;
+    std::vector<Mat> inputs(numInputs);
+    int blobSize[] = {2, 6, 75, 113};  // 4-D input shape; presumably what the convolution model expects - confirm
+    for (int i = 0; i < numInputs; ++i)
+    {
+        inputs[i].create(4, &blobSize[0], CV_32FC1);
+        randu(inputs[i], 0.0f, 1.0f);
+    }
+
+    // Run synchronously.
+    std::vector<Mat> refs(numInputs);
+    for (int i = 0; i < numInputs; ++i)
+    {
+        netSync.setInput(inputs[i]);
+        refs[i] = netSync.forward().clone();  // clone so each reference keeps its own data (presumably forward() reuses its output buffer - confirm)
+    }
+
+    // Run asynchronously. To make test more robust, process inputs in the reversed order.
+    for (int i = numInputs - 1; i >= 0; --i)
+    {
+        netAsync.setInput(inputs[i]);
+
+        std::future<Mat> out = netAsync.forwardAsync();
+        if (out.wait_for(std::chrono::milliseconds(kTimeout)) == std::future_status::timeout)
+            CV_Error(Error::StsAssert, "Timeout");  // fail instead of hanging if the result never arrives
+        normAssert(refs[i], out.get(), format("Index: %d", i).c_str(), 0, 0);  // trailing zeros are presumably the tolerances, i.e. exact match required - confirm
+    }
+}
+
+TEST_P(Async, set_and_forward_all)  // Async variant where all requests are submitted first and their results collected afterwards.
+{
+    static const int kTimeout = 5000;  // in milliseconds.
+    const int target = GetParam();
+
+    const std::string suffix = (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD) ? "_fp16" : "";  // these two targets load the *_fp16 model files
+    const std::string& model = findDataFile("dnn/layers/layer_convolution" + suffix + ".bin");
+    const std::string& proto = findDataFile("dnn/layers/layer_convolution" + suffix + ".xml");
+
+
+    Net netSync = readNet(model, proto);  // separate nets so the sync and async runs share no state
+    netSync.setPreferableTarget(target);
+
+    Net netAsync = readNet(model, proto);
+    netAsync.setPreferableTarget(target);
+
+    // Generate inputs.
+    const int numInputs = 10;
+    std::vector<Mat> inputs(numInputs);
+    int blobSize[] = {2, 6, 75, 113};  // 4-D input shape; presumably what the convolution model expects - confirm
+    for (int i = 0; i < numInputs; ++i)
+    {
+        inputs[i].create(4, &blobSize[0], CV_32FC1);
+        randu(inputs[i], 0.0f, 1.0f);
+    }
+
+    // Run synchronously.
+    std::vector<Mat> refs(numInputs);
+    for (int i = 0; i < numInputs; ++i)
+    {
+        netSync.setInput(inputs[i]);
+        refs[i] = netSync.forward().clone();  // clone so each reference keeps its own data (presumably forward() reuses its output buffer - confirm)
+    }
+
+    // Run asynchronously. To make test more robust, process inputs in the reversed order.
+    std::vector<std::future<Mat> > outs(numInputs);
+    for (int i = numInputs - 1; i >= 0; --i)
+    {
+        netAsync.setInput(inputs[i]);
+        outs[i] = netAsync.forwardAsync();  // no waiting here, so several requests may be pending at once
+    }
+
+    for (int i = numInputs - 1; i >= 0; --i)
+    {
+        if (outs[i].wait_for(std::chrono::milliseconds(kTimeout)) == std::future_status::timeout)
+            CV_Error(Error::StsAssert, "Timeout");  // fail instead of hanging if the result never arrives
+        normAssert(refs[i], outs[i].get(), format("Index: %d", i).c_str(), 0, 0);  // trailing zeros are presumably the tolerances, i.e. exact match required - confirm
+    }
+}
+
+INSTANTIATE_TEST_CASE_P(/**/, Async, testing::ValuesIn(getAvailableTargets(DNN_BACKEND_INFERENCE_ENGINE)));
+#endif  // HAVE_INF_ENGINE
+
 }} // namespace
--- a/modules/imgproc/include/opencv2/imgproc.hpp
+++ b/modules/imgproc/include/opencv2/imgproc.hpp
@@ -892,52 +892,52 @@ class CV_EXPORTS_W GeneralizedHoughGuil : public GeneralizedHough
 {
 public:
    //! Angle difference in degrees between two points in feature.
-    virtual void setXi(double xi) = 0;
-    virtual double getXi() const = 0;
+    CV_WRAP virtual void setXi(double xi) = 0;
+    CV_WRAP virtual double getXi() const = 0;

    //! Feature table levels.
-    virtual void setLevels(int levels) = 0;
-    virtual int getLevels() const = 0;
+    CV_WRAP virtual void setLevels(int levels) = 0;
+    CV_WRAP virtual int getLevels() const = 0;

    //! Maximal difference between angles that are treated as equal.
-    virtual void setAngleEpsilon(double angleEpsilon) = 0;
-    virtual double getAngleEpsilon() const = 0;
+    CV_WRAP virtual void setAngleEpsilon(double angleEpsilon) = 0;
+    CV_WRAP virtual double getAngleEpsilon() const = 0;

    //! Minimal rotation angle to detect in degrees.
-    virtual void setMinAngle(double minAngle) = 0;
-    virtual double getMinAngle() const = 0;
+    CV_WRAP virtual void setMinAngle(double minAngle) = 0;
+    CV_WRAP virtual double getMinAngle() const = 0;

    //! Maximal rotation angle to detect in degrees.
-    virtual void setMaxAngle(double maxAngle) = 0;
-    virtual double getMaxAngle() const = 0;
+    CV_WRAP virtual void setMaxAngle(double maxAngle) = 0;
+    CV_WRAP virtual double getMaxAngle() const = 0;

    //! Angle step in degrees.
-    virtual void setAngleStep(double angleStep) = 0;
-    virtual double getAngleStep() const = 0;
+    CV_WRAP virtual void setAngleStep(double angleStep) = 0;
+    CV_WRAP virtual double getAngleStep() const = 0;

    //! Angle votes threshold.
-    virtual void setAngleThresh(int angleThresh) = 0;
-    virtual int getAngleThresh() const = 0;
+    CV_WRAP virtual void setAngleThresh(int angleThresh) = 0;
+    CV_WRAP virtual int getAngleThresh() const = 0;

    //! Minimal scale to detect.
-    virtual void setMinScale(double minScale) = 0;
-    virtual double getMinScale() const = 0;
+    CV_WRAP virtual void setMinScale(double minScale) = 0;
+    CV_WRAP virtual double getMinScale() const = 0;

    //! Maximal scale to detect.
-    virtual void setMaxScale(double maxScale) = 0;
-    virtual double getMaxScale() const = 0;
+    CV_WRAP virtual void setMaxScale(double maxScale) = 0;
+    CV_WRAP virtual double getMaxScale() const = 0;

    //! Scale step.
-    virtual void setScaleStep(double scaleStep) = 0;
-    virtual double getScaleStep() const = 0;
+    CV_WRAP virtual void setScaleStep(double scaleStep) = 0;
+    CV_WRAP virtual double getScaleStep() const = 0;

    //! Scale votes threshold.
-    virtual void setScaleThresh(int scaleThresh) = 0;
-    virtual int getScaleThresh() const = 0;
+    CV_WRAP virtual void setScaleThresh(int scaleThresh) = 0;
+    CV_WRAP virtual int getScaleThresh() const = 0;

    //! Position votes threshold.
-    virtual void setPosThresh(int posThresh) = 0;
-    virtual int getPosThresh() const = 0;
+    CV_WRAP virtual void setPosThresh(int posThresh) = 0;
+    CV_WRAP virtual int getPosThresh() const = 0;
 };

 //! @} imgproc_shape
@@ -4175,11 +4175,11 @@ CV_EXPORTS_W int rotatedRectangleIntersection( const RotatedRect& rect1, const R

 /** @brief Creates a smart pointer to a cv::GeneralizedHoughBallard class and initializes it.
 */
-CV_EXPORTS Ptr<GeneralizedHoughBallard> createGeneralizedHoughBallard();
+CV_EXPORTS_W Ptr<GeneralizedHoughBallard> createGeneralizedHoughBallard();

 /** @brief Creates a smart pointer to a cv::GeneralizedHoughGuil class and initializes it.
 */
-CV_EXPORTS Ptr<GeneralizedHoughGuil> createGeneralizedHoughGuil();
+CV_EXPORTS_W Ptr<GeneralizedHoughGuil> createGeneralizedHoughGuil();

 //! @} imgproc_shape