Merge remote-tracking branch 'upstream/3.4' into merge-3.4

82c477c9 · Alexander Alekhin · 57bead3a · 67f79aab · 82c477c9 · 82c477c9
Commit 82c477c9 authored 6 years ago by Alexander Alekhin
71 changed files
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1402,15 +1402,19 @@ if(WITH_HALIDE OR HAVE_HALIDE)
  status("    Halide:"     HAVE_HALIDE      THEN "YES (${HALIDE_LIBRARIES} ${HALIDE_INCLUDE_DIRS})" ELSE NO)
 endif()

-if(WITH_INF_ENGINE OR HAVE_INF_ENGINE)
-  if(HAVE_INF_ENGINE)
-    set(__msg "YES")
-    if(DEFINED INF_ENGINE_VERSION)
-      set(__msg "YES (ver ${INF_ENGINE_VERSION})")
+if(WITH_INF_ENGINE OR INF_ENGINE_TARGET)
+  if(INF_ENGINE_TARGET)
+    set(__msg "YES (${INF_ENGINE_RELEASE} / ${INF_ENGINE_VERSION})")
+    get_target_property(_lib ${INF_ENGINE_TARGET} IMPORTED_LOCATION)
+    if(NOT _lib)
+      get_target_property(_lib_rel ${INF_ENGINE_TARGET} IMPORTED_IMPLIB_RELEASE)
+      get_target_property(_lib_dbg ${INF_ENGINE_TARGET} IMPORTED_IMPLIB_DEBUG)
+      set(_lib "${_lib_rel} / ${_lib_dbg}")
    endif()
+    get_target_property(_inc ${INF_ENGINE_TARGET} INTERFACE_INCLUDE_DIRECTORIES)
    status("    Inference Engine:" "${__msg}")
-    status("                libs:" "${INF_ENGINE_LIBRARIES}")
-    status("            includes:" "${INF_ENGINE_INCLUDE_DIRS}")
+    status("                libs:" "${_lib}")
+    status("            includes:" "${_inc}")
  else()
    status("    Inference Engine:"     "NO")
  endif()

--- a/cmake/OpenCVCompilerOptimizations.cmake
+++ b/cmake/OpenCVCompilerOptimizations.cmake
@@ -700,12 +700,21 @@ macro(ocv_compiler_optimization_fill_cpu_config)
    list(APPEND __dispatch_modes ${CPU_DISPATCH_${OPT}_FORCE} ${OPT})
  endforeach()
  list(REMOVE_DUPLICATES __dispatch_modes)
-  set(OPENCV_CPU_DISPATCH_DEFINITIONS_CONFIGMAKE "")
  foreach(OPT ${__dispatch_modes})
    set(OPENCV_CPU_DISPATCH_DEFINITIONS_CONFIGMAKE "${OPENCV_CPU_DISPATCH_DEFINITIONS_CONFIGMAKE}
 #define CV_CPU_DISPATCH_COMPILE_${OPT} 1")
  endforeach()

+  set(OPENCV_CPU_DISPATCH_DEFINITIONS_CONFIGMAKE "${OPENCV_CPU_DISPATCH_DEFINITIONS_CONFIGMAKE}
+\n\n#define CV_CPU_DISPATCH_FEATURES 0 \\")
+  foreach(OPT ${__dispatch_modes})
+    if(NOT DEFINED CPU_${OPT}_FEATURE_ALIAS OR NOT "x${CPU_${OPT}_FEATURE_ALIAS}" STREQUAL "x")
+      set(OPENCV_CPU_DISPATCH_DEFINITIONS_CONFIGMAKE "${OPENCV_CPU_DISPATCH_DEFINITIONS_CONFIGMAKE}
+    , CV_CPU_${OPT} \\")
+    endif()
+  endforeach()
+  set(OPENCV_CPU_DISPATCH_DEFINITIONS_CONFIGMAKE "${OPENCV_CPU_DISPATCH_DEFINITIONS_CONFIGMAKE}\n")
+
  set(OPENCV_CPU_CONTROL_DEFINITIONS_CONFIGMAKE "// AUTOGENERATED, DO NOT EDIT\n")
  foreach(OPT ${CPU_ALL_OPTIMIZATIONS})
    if(NOT DEFINED CPU_${OPT}_FEATURE_ALIAS OR NOT "x${CPU_${OPT}_FEATURE_ALIAS}" STREQUAL "x")

--- a/cmake/OpenCVDetectInferenceEngine.cmake
+++ b/cmake/OpenCVDetectInferenceEngine.cmake
 # The script detects Intel(R) Inference Engine installation
 #
-# Parameters:
-# INTEL_CVSDK_DIR - Path to Inference Engine root folder
-# IE_PLUGINS_PATH - Path to folder with Inference Engine plugins
+# Cache variables:
+# INF_ENGINE_OMP_DIR - directory with OpenMP library to link with (needed by some versions of IE)
+# INF_ENGINE_RELEASE - a number reflecting IE source interface (linked with OpenVINO release)
 #
-# On return this will define:
+# Detect parameters:
+# 1. Native cmake IE package:
+#    - environment variable InferenceEngine_DIR is set to location of cmake module
+# 2. Custom location:
+#    - INF_ENGINE_INCLUDE_DIRS - headers search location
+#    - INF_ENGINE_LIB_DIRS     - library search location
+# 3. OpenVINO location:
+#    - environment variable INTEL_CVSDK_DIR is set to location of OpenVINO installation dir
+#    - INF_ENGINE_PLATFORM - part of name of library directory representing its platform (default ubuntu_16.04)
 #
-# HAVE_INF_ENGINE          - True if Intel Inference Engine was found
-# INF_ENGINE_INCLUDE_DIRS  - Inference Engine include folder
-# INF_ENGINE_LIBRARIES     - Inference Engine libraries and it's dependencies
+# Result:
+# INF_ENGINE_TARGET - set to name of imported library target representing InferenceEngine
 #
-macro(ie_fail)
-    set(HAVE_INF_ENGINE FALSE)
-    return()
-endmacro()
-

-find_package(InferenceEngine QUIET)
-if(InferenceEngine_FOUND)
-  set(INF_ENGINE_LIBRARIES "${InferenceEngine_LIBRARIES}")
-  set(INF_ENGINE_INCLUDE_DIRS "${InferenceEngine_INCLUDE_DIRS}")
-  set(INF_ENGINE_VERSION "${InferenceEngine_VERSION}")
-  set(HAVE_INF_ENGINE TRUE)
-  return()
+if(NOT HAVE_CXX11)
+    message(WARNING "DL Inference engine requires C++11. You can turn it on via ENABLE_CXX11=ON CMake flag.")
+    return()
 endif()

-ocv_check_environment_variables(INTEL_CVSDK_DIR INF_ENGINE_ROOT_DIR IE_PLUGINS_PATH)
+# =======================

-if(NOT INF_ENGINE_ROOT_DIR OR NOT EXISTS "${INF_ENGINE_ROOT_DIR}/include/inference_engine.hpp")
-    set(ie_root_paths "${INF_ENGINE_ROOT_DIR}")
-    if(DEFINED INTEL_CVSDK_DIR)
-        list(APPEND ie_root_paths "${INTEL_CVSDK_DIR}/")
-        list(APPEND ie_root_paths "${INTEL_CVSDK_DIR}/deployment_tools/inference_engine")
-    endif()
+function(add_custom_ie_build _inc _lib _lib_rel _lib_dbg _msg)
+  if(NOT _inc OR NOT (_lib OR _lib_rel OR _lib_dbg))
+    return()
+  endif()
+  add_library(inference_engine UNKNOWN IMPORTED)
+  set_target_properties(inference_engine PROPERTIES
+    IMPORTED_LOCATION "${_lib}"
+    IMPORTED_IMPLIB_RELEASE "${_lib_rel}"
+    IMPORTED_IMPLIB_DEBUG "${_lib_dbg}"
+    INTERFACE_INCLUDE_DIRECTORIES "${_inc}"
+  )
+  find_library(omp_lib iomp5 PATHS "${INF_ENGINE_OMP_DIR}" NO_DEFAULT_PATH)
+  if(NOT omp_lib)
+    message(WARNING "OpenMP for IE have not been found. Set INF_ENGINE_OMP_DIR variable if you experience build errors.")
+  else()
+    set_target_properties(inference_engine PROPERTIES IMPORTED_LINK_INTERFACE_LIBRARIES "${omp_lib}")
+  endif()
+  set(INF_ENGINE_VERSION "Unknown" CACHE STRING "")
+  set(INF_ENGINE_TARGET inference_engine PARENT_SCOPE)
+  message(STATUS "Detected InferenceEngine: ${_msg}")
+endfunction()

-    if(NOT ie_root_paths)
-        list(APPEND ie_root_paths "/opt/intel/computer_vision_sdk/deployment_tools/inference_engine/")
-    endif()
+# ======================

-    find_path(INF_ENGINE_ROOT_DIR include/inference_engine.hpp PATHS ${ie_root_paths})
-    if(INF_ENGINE_ROOT_DIR MATCHES "-NOTFOUND$")
-      unset(INF_ENGINE_ROOT_DIR CACHE)
-    endif()
+find_package(InferenceEngine QUIET)
+if(InferenceEngine_FOUND)
+  set(INF_ENGINE_TARGET IE::inference_engine)
+  set(INF_ENGINE_VERSION "${InferenceEngine_VERSION}" CACHE STRING "")
+  message(STATUS "Detected InferenceEngine: cmake package")
 endif()

-set(INF_ENGINE_INCLUDE_DIRS "${INF_ENGINE_ROOT_DIR}/include" CACHE PATH "Path to Inference Engine include directory")
-
-if(NOT INF_ENGINE_ROOT_DIR
-    OR NOT EXISTS "${INF_ENGINE_ROOT_DIR}"
-    OR NOT EXISTS "${INF_ENGINE_ROOT_DIR}/include/inference_engine.hpp"
-)
-    message(WARNING "DL IE: Can't detect INF_ENGINE_ROOT_DIR location.")
-    ie_fail()
+if(NOT INF_ENGINE_TARGET AND INF_ENGINE_LIB_DIRS AND INF_ENGINE_INCLUDE_DIRS)
+  find_path(ie_custom_inc "inference_engine.hpp" PATHS "${INF_ENGINE_INCLUDE_DIRS}" NO_DEFAULT_PATH)
+  find_library(ie_custom_lib "inference_engine" PATHS "${INF_ENGINE_LIB_DIRS}" NO_DEFAULT_PATH)
+  find_library(ie_custom_lib_rel "inference_engine" PATHS "${INF_ENGINE_LIB_DIRS}/Release" NO_DEFAULT_PATH)
+  find_library(ie_custom_lib_dbg "inference_engine" PATHS "${INF_ENGINE_LIB_DIRS}/Debug" NO_DEFAULT_PATH)
+  add_custom_ie_build("${ie_custom_inc}" "${ie_custom_lib}" "${ie_custom_lib_rel}" "${ie_custom_lib_dbg}" "INF_ENGINE_{INCLUDE,LIB}_DIRS")
 endif()

-set(INF_ENGINE_LIBRARIES "")
-
-set(ie_lib_list inference_engine)
-
-if(NOT IS_ABSOLUTE "${IE_PLUGINS_PATH}")
-  set(IE_PLUGINS_PATH "${INF_ENGINE_ROOT_DIR}/${IE_PLUGINS_PATH}")
+set(_loc "$ENV{INTEL_CVSDK_DIR}")
+if(NOT INF_ENGINE_TARGET AND _loc)
+  set(INF_ENGINE_PLATFORM "ubuntu_16.04" CACHE STRING "InferenceEngine platform (library dir)")
+  find_path(ie_custom_env_inc "inference_engine.hpp" PATHS "${_loc}/deployment_tools/inference_engine/include" NO_DEFAULT_PATH)
+  find_library(ie_custom_env_lib "inference_engine" PATHS "${_loc}/deployment_tools/inference_engine/lib/${INF_ENGINE_PLATFORM}/intel64" NO_DEFAULT_PATH)
+  find_library(ie_custom_env_lib_rel "inference_engine" PATHS "${_loc}/deployment_tools/inference_engine/lib/intel64/Release" NO_DEFAULT_PATH)
+  find_library(ie_custom_env_lib_dbg "inference_engine" PATHS "${_loc}/deployment_tools/inference_engine/lib/intel64/Debug" NO_DEFAULT_PATH)
+  add_custom_ie_build("${ie_custom_env_inc}" "${ie_custom_env_lib}" "${ie_custom_env_lib_rel}" "${ie_custom_env_lib_dbg}" "OpenVINO (${_loc})")
 endif()

-link_directories(
-  ${INF_ENGINE_ROOT_DIR}/external/mkltiny_lnx/lib
-  ${INF_ENGINE_ROOT_DIR}/external/cldnn/lib
-)
-
-foreach(lib ${ie_lib_list})
-    find_library(${lib} NAMES ${lib} HINTS ${IE_PLUGINS_PATH})
-    if(NOT ${lib})
-        message(WARNING "DL IE: Can't find library: '${lib}'")
-        ie_fail()
-    endif()
-    list(APPEND INF_ENGINE_LIBRARIES ${${lib}})
-endforeach()
+# Add more features to the target

-set(HAVE_INF_ENGINE TRUE)
+if(INF_ENGINE_TARGET)
+  if(NOT INF_ENGINE_RELEASE)
+    message(WARNING "InferenceEngine version have not been set, 2018R2 will be used by default. Set INF_ENGINE_RELEASE variable if you experience build errors.")
+  endif()
+  set(INF_ENGINE_RELEASE "2018020000" CACHE STRING "Force IE version, should be in form YYYYAABBCC (e.g. 2018R2.0.2 -> 2018020002)")
+  set_target_properties(${INF_ENGINE_TARGET} PROPERTIES
+    INTERFACE_COMPILE_DEFINITIONS "HAVE_INF_ENGINE=1;INF_ENGINE_RELEASE=${INF_ENGINE_RELEASE}"
+  )
+endif()
--- a/cmake/OpenCVModule.cmake
+++ b/cmake/OpenCVModule.cmake
@@ -1132,7 +1132,7 @@ function(ocv_add_perf_tests)
      source_group("Src" FILES "${${the_target}_pch}")
      ocv_add_executable(${the_target} ${OPENCV_PERF_${the_module}_SOURCES} ${${the_target}_pch})
      ocv_target_include_modules(${the_target} ${perf_deps} "${perf_path}")
-      ocv_target_link_libraries(${the_target} LINK_PRIVATE ${perf_deps} ${OPENCV_MODULE_${the_module}_DEPS} ${OPENCV_LINKER_LIBS})
+      ocv_target_link_libraries(${the_target} LINK_PRIVATE ${perf_deps} ${OPENCV_MODULE_${the_module}_DEPS} ${OPENCV_LINKER_LIBS} ${OPENCV_PERF_${the_module}_DEPS})
      add_dependencies(opencv_perf_tests ${the_target})

      set_target_properties(${the_target} PROPERTIES LABELS "${OPENCV_MODULE_${the_module}_LABEL};PerfTest")
@@ -1175,7 +1175,7 @@ function(ocv_add_perf_tests)
 endfunction()

 # this is a command for adding OpenCV accuracy/regression tests to the module
-# ocv_add_accuracy_tests([FILES <source group name> <list of sources>] [DEPENDS_ON] <list of extra dependencies>)
+# ocv_add_accuracy_tests(<list of extra dependencies>)
 function(ocv_add_accuracy_tests)
  ocv_debug_message("ocv_add_accuracy_tests(" ${ARGN} ")")

@@ -1211,7 +1211,7 @@ function(ocv_add_accuracy_tests)
      source_group("Src" FILES "${${the_target}_pch}")
      ocv_add_executable(${the_target} ${OPENCV_TEST_${the_module}_SOURCES} ${${the_target}_pch})
      ocv_target_include_modules(${the_target} ${test_deps} "${test_path}")
-      ocv_target_link_libraries(${the_target} LINK_PRIVATE ${test_deps} ${OPENCV_MODULE_${the_module}_DEPS} ${OPENCV_LINKER_LIBS})
+      ocv_target_link_libraries(${the_target} LINK_PRIVATE ${test_deps} ${OPENCV_MODULE_${the_module}_DEPS} ${OPENCV_LINKER_LIBS} ${OPENCV_TEST_${the_module}_DEPS})
      add_dependencies(opencv_tests ${the_target})

      set_target_properties(${the_target} PROPERTIES LABELS "${OPENCV_MODULE_${the_module}_LABEL};AccuracyTest")

--- a/doc/opencv.bib
+++ b/doc/opencv.bib
@@ -1016,3 +1016,17 @@
  year = {2017},
  organization = {IEEE}
 }
+
+@ARTICLE{gonzalez,
+  title={Digital Image Fundamentals, Digital Imaging Processing},
+  author={Gonzalez, Rafael C and others},
+  year={1987},
+  publisher={Addison Wesley Publishing Company}
+}
+
+@ARTICLE{gruzman,
+  title={Цифровая обработка изображений в информационных системах},
+  author={Грузман, И.С. and Киричук, В.С. and Косых, В.П. and Перетягин, Г.И. and Спектор, А.А.},
+  year={2000},
+  publisher={Изд-во НГТУ Новосибирск}
+}
--- a/doc/tutorials/imgproc/out_of_focus_deblur_filter/images/original.jpg
+++ b/doc/tutorials/imgproc/out_of_focus_deblur_filter/images/original.jpg
--- a/doc/tutorials/imgproc/out_of_focus_deblur_filter/images/psf.png
+++ b/doc/tutorials/imgproc/out_of_focus_deblur_filter/images/psf.png
--- a/doc/tutorials/imgproc/out_of_focus_deblur_filter/images/recovered.jpg
+++ b/doc/tutorials/imgproc/out_of_focus_deblur_filter/images/recovered.jpg
--- a/doc/tutorials/imgproc/out_of_focus_deblur_filter/out_of_focus_deblur_filter.markdown
+++ b/doc/tutorials/imgproc/out_of_focus_deblur_filter/out_of_focus_deblur_filter.markdown
+Out-of-focus Deblur Filter {#tutorial_out_of_focus_deblur_filter}
+==========================
+
+Goal
+----
+
+In this tutorial you will learn:
+
+-   what is a degradation image model
+-   what is PSF of out-of-focus image
+-   how to restore a blurred image
+-   what is Wiener filter
+
+Theory
+------
+
+@note The explanation is based on the books @cite gonzalez and @cite gruzman. Also, you can refer to Matlab's tutorial [Image Deblurring in Matlab] and an article [SmartDeblur].
+@note The out-of-focus image on this page is a real-world image. The defocus was produced manually with the camera optics.
+
+### What is a degradation image model?
+
+A mathematical model of the image degradation in frequency domain representation is:
+
+\f[S = H\cdot U + N\f]
+
+where
+\f$S\f$ is a spectrum of blurred (degraded) image,
+\f$U\f$ is a spectrum of original true (undegraded) image,
+\f$H\f$ is frequency response of point spread function (PSF),
+\f$N\f$ is a spectrum of additive noise.
+
+A circular PSF is a good approximation of out-of-focus distortion. Such a PSF is specified by only one parameter - the radius \f$R\f$. A circular PSF is used in this work.
+
+![Circular point spread function](psf.png)
+
+### How to restore a blurred image?
+
+The objective of restoration (deblurring) is to obtain an estimate of the original image. Restoration formula in frequency domain is:
+
+\f[U' = H_w\cdot S\f]
+
+where
+\f$U'\f$ is spectrum of estimation of original image \f$U\f$,
+\f$H_w\f$ is restoration filter, for example, Wiener filter.
+
+### What is Wiener filter?
+
+The Wiener filter is a way to restore a blurred image. Let's suppose that the PSF is a real and symmetric signal, and that the power spectra of the original true image and of the noise are not known;
+then a simplified Wiener formula is:
+
+\f[H_w = \frac{H}{|H|^2+\frac{1}{SNR}} \f]
+
+where
+\f$SNR\f$ is signal-to-noise ratio.
+
+So, in order to recover an out-of-focus image with the Wiener filter, one needs to know the \f$SNR\f$ and the radius \f$R\f$ of the circular PSF.
+
+
+Source code
+-----------
+
+You can find source code in the `samples/cpp/tutorial_code/ImgProc/out_of_focus_deblur_filter/out_of_focus_deblur_filter.cpp` of the OpenCV source code library.
+
+@include cpp/tutorial_code/ImgProc/out_of_focus_deblur_filter/out_of_focus_deblur_filter.cpp
+
+Explanation
+-----------
+
+An out-of-focus image recovering algorithm consists of PSF generation, Wiener filter generation and filtering a blurred image in frequency domain:
+@snippet samples/cpp/tutorial_code/ImgProc/out_of_focus_deblur_filter/out_of_focus_deblur_filter.cpp main
+
+A function calcPSF() forms a circular PSF according to input parameter radius \f$R\f$:
+@snippet samples/cpp/tutorial_code/ImgProc/out_of_focus_deblur_filter/out_of_focus_deblur_filter.cpp calcPSF
+
+A function calcWnrFilter() synthesizes simplified Wiener filter \f$H_w\f$ according to formula described above:
+@snippet samples/cpp/tutorial_code/ImgProc/out_of_focus_deblur_filter/out_of_focus_deblur_filter.cpp calcWnrFilter
+
+A function fftshift() rearranges PSF. This code was just copied from tutorial @ref tutorial_discrete_fourier_transform "Discrete Fourier Transform":
+@snippet samples/cpp/tutorial_code/ImgProc/out_of_focus_deblur_filter/out_of_focus_deblur_filter.cpp fftshift
+
+A function filter2DFreq() filters a blurred image in frequency domain:
+@snippet samples/cpp/tutorial_code/ImgProc/out_of_focus_deblur_filter/out_of_focus_deblur_filter.cpp filter2DFreq
+
+Result
+------
+
+Below you can see a real out-of-focus image:
+![Out-of-focus image](images/original.jpg)
+
+
+The result below was obtained with \f$R\f$ = 53 and \f$SNR\f$ = 5200:
+![The restored (deblurred) image](images/recovered.jpg)
+
+The Wiener filter was used, values of \f$R\f$ and \f$SNR\f$ were selected manually to give the best possible visual result.
+We can see that the result is not perfect, but it gives us a hint to the image content. With some difficulty, the text is readable.
+
+@note The parameter \f$R\f$ is the most important. So you should adjust \f$R\f$ first, then \f$SNR\f$.
+@note Sometimes you can observe the ringing effect in a restored image. This effect can be reduced by several methods. For example, you can taper input image edges.
+
+You can also find a quick video demonstration of this on
+[YouTube](https://youtu.be/0bEcE4B0XP4).
+@youtube{0bEcE4B0XP4}
+
+References
+------
+- [Image Deblurring in Matlab] - Image Deblurring in Matlab
+- [SmartDeblur] - SmartDeblur site
+
+<!-- invisible references list -->
+[Digital Image Processing]: http://web.ipac.caltech.edu/staff/fmasci/home/astro_refs/Digital_Image_Processing_2ndEd.pdf
+[Image Deblurring in Matlab]: https://www.mathworks.com/help/images/image-deblurring.html
+[SmartDeblur]: http://yuzhikov.com/articles/BlurredImagesRestoration1.htm
--- a/doc/tutorials/imgproc/table_of_content_imgproc.markdown
+++ b/doc/tutorials/imgproc/table_of_content_imgproc.markdown
@@ -292,3 +292,13 @@ In this section you will learn about the image processing (manipulation) functio
    *Author:* Theodore Tsesmelis

    Where we learn to segment objects using Laplacian filtering, the Distance Transformation and the Watershed algorithm.
+
+-   @subpage tutorial_out_of_focus_deblur_filter
+
+    *Languages:* C++
+
+    *Compatibility:* \> OpenCV 2.0
+
+    *Author:* Karpushin Vladislav
+
+    You will learn how to recover an out-of-focus image by Wiener filter.
--- a/modules/core/include/opencv2/core/hal/intrin.hpp
+++ b/modules/core/include/opencv2/core/hal/intrin.hpp
@@ -60,6 +60,17 @@
 // access from within opencv code more accessible
 namespace cv {

+namespace hal {
+
+enum StoreMode
+{
+    STORE_UNALIGNED = 0,
+    STORE_ALIGNED = 1,
+    STORE_ALIGNED_NOCACHE = 2
+};
+
+}
+
 template<typename _Tp> struct V_TypeTraits
 {
 };
@@ -154,7 +165,7 @@ using namespace CV_CPU_OPTIMIZATION_HAL_NAMESPACE;
 // but some of AVX2 intrinsics get v256_ prefix instead of v_, e.g. v256_load() vs v_load().
 // Correspondingly, the wide intrinsics (which are mapped to the "widest"
 // available instruction set) will get vx_ prefix
-// (and will be mapped to v256_ counterparts) (e.g. vx_load() => v245_load())
+// (and will be mapped to v256_ counterparts) (e.g. vx_load() => v256_load())
 #if CV_AVX2

 #include "opencv2/core/hal/intrin_avx.hpp"
@@ -214,14 +225,16 @@ CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN
    inline vtyp vx_setzero_##short_typ() { return prefix##_setzero_##short_typ(); } \
    inline vtyp vx_##loadsfx(const typ* ptr) { return prefix##_##loadsfx(ptr); } \
    inline vtyp vx_##loadsfx##_aligned(const typ* ptr) { return prefix##_##loadsfx##_aligned(ptr); } \
+    inline vtyp vx_##loadsfx##_low(const typ* ptr) { return prefix##_##loadsfx##_low(ptr); } \
+    inline vtyp vx_##loadsfx##_halves(const typ* ptr0, const typ* ptr1) { return prefix##_##loadsfx##_halves(ptr0, ptr1); } \
    inline void vx_store(typ* ptr, const vtyp& v) { return v_store(ptr, v); } \
    inline void vx_store_aligned(typ* ptr, const vtyp& v) { return v_store_aligned(ptr, v); }

 #define CV_INTRIN_DEFINE_WIDE_LOAD_EXPAND(typ, wtyp, prefix) \
-inline wtyp vx_load_expand(const typ* ptr) { return prefix##_load_expand(ptr); }
+    inline wtyp vx_load_expand(const typ* ptr) { return prefix##_load_expand(ptr); }

 #define CV_INTRIN_DEFINE_WIDE_LOAD_EXPAND_Q(typ, qtyp, prefix) \
-inline qtyp vx_load_expand_q(const typ* ptr) { return prefix##_load_expand_q(ptr); }
+    inline qtyp vx_load_expand_q(const typ* ptr) { return prefix##_load_expand_q(ptr); }

 #define CV_INTRIN_DEFINE_WIDE_INTRIN_WITH_EXPAND(typ, vtyp, short_typ, wtyp, qtyp, prefix, loadsfx) \
    CV_INTRIN_DEFINE_WIDE_INTRIN(typ, vtyp, short_typ, prefix, loadsfx) \
@@ -316,7 +329,7 @@ template<typename _Tp> struct V_RegTraits
    CV_INTRIN_DEFINE_WIDE_INTRIN_ALL_TYPES(v256)
    CV_INTRIN_DEFINE_WIDE_INTRIN(double, v_float64, f64, v256, load)
    inline void vx_cleanup() { v256_cleanup(); }
-#elif CV_SIMD128
+#elif CV_SIMD128 || CV_SIMD128_CPP
    typedef v_uint8x16  v_uint8;
    typedef v_int8x16   v_int8;
    typedef v_uint16x8  v_uint16;

--- a/modules/core/include/opencv2/core/hal/intrin_avx.hpp
+++ b/modules/core/include/opencv2/core/hal/intrin_avx.hpp
--- a/modules/core/include/opencv2/core/hal/intrin_cpp.hpp
+++ b/modules/core/include/opencv2/core/hal/intrin_cpp.hpp
@@ -1319,7 +1319,8 @@ Scheme:
 For all types except 64-bit. */
 template<typename _Tp, int n>
 inline void v_store_interleave( _Tp* ptr, const v_reg<_Tp, n>& a,
-                               const v_reg<_Tp, n>& b)
+                               const v_reg<_Tp, n>& b,
+                               hal::StoreMode /*mode*/=hal::STORE_UNALIGNED)
 {
    int i, i2;
    for( i = i2 = 0; i < n; i++, i2 += 2 )
@@ -1339,7 +1340,8 @@ Scheme:
 For all types except 64-bit. */
 template<typename _Tp, int n>
 inline void v_store_interleave( _Tp* ptr, const v_reg<_Tp, n>& a,
-                                const v_reg<_Tp, n>& b, const v_reg<_Tp, n>& c)
+                                const v_reg<_Tp, n>& b, const v_reg<_Tp, n>& c,
+                                hal::StoreMode /*mode*/=hal::STORE_UNALIGNED)
 {
    int i, i3;
    for( i = i3 = 0; i < n; i++, i3 += 3 )
@@ -1360,7 +1362,8 @@ Scheme:
 For all types except 64-bit. */
 template<typename _Tp, int n> inline void v_store_interleave( _Tp* ptr, const v_reg<_Tp, n>& a,
                                                            const v_reg<_Tp, n>& b, const v_reg<_Tp, n>& c,
-                                                            const v_reg<_Tp, n>& d)
+                                                            const v_reg<_Tp, n>& d,
+                                                            hal::StoreMode /*mode*/=hal::STORE_UNALIGNED)
 {
    int i, i4;
    for( i = i4 = 0; i < n; i++, i4 += 4 )
@@ -1430,6 +1433,20 @@ inline void v_store_aligned(_Tp* ptr, const v_reg<_Tp, n>& a)
        ptr[i] = a.s[i];
 }

+template<typename _Tp, int n>
+inline void v_store_aligned_nocache(_Tp* ptr, const v_reg<_Tp, n>& a)
+{
+    for( int i = 0; i < n; i++ )
+        ptr[i] = a.s[i];
+}
+
+template<typename _Tp, int n>
+inline void v_store_aligned(_Tp* ptr, const v_reg<_Tp, n>& a, hal::StoreMode /*mode*/)
+{
+    for( int i = 0; i < n; i++ )
+        ptr[i] = a.s[i];
+}
+
 /** @brief Combine vector from first elements of two vectors

 Scheme:

--- a/modules/core/include/opencv2/core/hal/intrin_neon.hpp
+++ b/modules/core/include/opencv2/core/hal/intrin_neon.hpp
@@ -319,6 +319,9 @@ static inline void cv_vst1_f16(void* ptr, float16x4_t a)
 #endif
 }

+#ifndef vdup_n_f16
+    #define vdup_n_f16(v) (float16x4_t){v, v, v, v}
+#endif

 struct v_float16x8
 {
@@ -864,6 +867,10 @@ inline void v_store(_Tp* ptr, const _Tpvec& a) \
 { vst1q_##suffix(ptr, a.val); } \
 inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) \
 { vst1q_##suffix(ptr, a.val); } \
+inline void v_store_aligned_nocache(_Tp* ptr, const _Tpvec& a) \
+{ vst1q_##suffix(ptr, a.val); } \
+inline void v_store(_Tp* ptr, const _Tpvec& a, hal::StoreMode /*mode*/) \
+{ vst1q_##suffix(ptr, a.val); } \
 inline void v_store_low(_Tp* ptr, const _Tpvec& a) \
 { vst1_##suffix(ptr, vget_low_##suffix(a.val)); } \
 inline void v_store_high(_Tp* ptr, const _Tpvec& a) \
@@ -889,6 +896,11 @@ inline v_float16x8 v_load_f16(const short* ptr)
 inline v_float16x8 v_load_f16_aligned(const short* ptr)
 { return v_float16x8(cv_vld1q_f16(ptr)); }

+inline v_float16x8 v_load_f16_low(const short* ptr)
+{ return v_float16x8(vcombine_f16(cv_vld1_f16(ptr), vdup_n_f16((float16_t)0))); }
+inline v_float16x8 v_load_f16_halves(const short* ptr0, const short* ptr1)
+{ return v_float16x8(vcombine_f16(cv_vld1_f16(ptr0), cv_vld1_f16(ptr1))); }
+
 inline void v_store(short* ptr, const v_float16x8& a)
 { cv_vst1q_f16(ptr, a.val); }
 inline void v_store_aligned(short* ptr, const v_float16x8& a)
@@ -1292,14 +1304,16 @@ inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec& a, v_##_Tpvec& b, \
    c.val = v.val[2]; \
    d.val = v.val[3]; \
 } \
-inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec& a, const v_##_Tpvec& b) \
+inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec& a, const v_##_Tpvec& b, \
+                                hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) \
 { \
    _Tpvec##x2_t v; \
    v.val[0] = a.val; \
    v.val[1] = b.val; \
    vst2q_##suffix(ptr, v); \
 } \
-inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec& a, const v_##_Tpvec& b, const v_##_Tpvec& c) \
+inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec& a, const v_##_Tpvec& b, \
+                                const v_##_Tpvec& c, hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) \
 { \
    _Tpvec##x3_t v; \
    v.val[0] = a.val; \
@@ -1308,7 +1322,8 @@ inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec& a, const v_##_Tpvec&
    vst3q_##suffix(ptr, v); \
 } \
 inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec& a, const v_##_Tpvec& b, \
-                               const v_##_Tpvec& c, const v_##_Tpvec& d) \
+                                const v_##_Tpvec& c, const v_##_Tpvec& d, \
+                                hal::StoreMode /*mode*/=hal::STORE_UNALIGNED ) \
 { \
    _Tpvec##x4_t v; \
    v.val[0] = a.val; \
@@ -1360,7 +1375,8 @@ inline void v_load_deinterleave( const tp* ptr, v_##tp##x2& a, v_##tp##x2& b, \
    d = v_##tp##x2(vcombine_##suffix(d0, d1)); \
 } \
 \
-inline void v_store_interleave( tp* ptr, const v_##tp##x2& a, const v_##tp##x2& b ) \
+inline void v_store_interleave( tp* ptr, const v_##tp##x2& a, const v_##tp##x2& b, \
+                                hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) \
 { \
    vst1_##suffix(ptr, vget_low_##suffix(a.val)); \
    vst1_##suffix(ptr + 1, vget_low_##suffix(b.val)); \
@@ -1369,7 +1385,8 @@ inline void v_store_interleave( tp* ptr, const v_##tp##x2& a, const v_##tp##x2&
 } \
 \
 inline void v_store_interleave( tp* ptr, const v_##tp##x2& a, \
-                                const v_##tp##x2& b, const v_##tp##x2& c ) \
+                                const v_##tp##x2& b, const v_##tp##x2& c, \
+                                hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) \
 { \
    vst1_##suffix(ptr, vget_low_##suffix(a.val)); \
    vst1_##suffix(ptr + 1, vget_low_##suffix(b.val)); \
@@ -1380,7 +1397,8 @@ inline void v_store_interleave( tp* ptr, const v_##tp##x2& a, \
 } \
 \
 inline void v_store_interleave( tp* ptr, const v_##tp##x2& a, const v_##tp##x2& b, \
-                                const v_##tp##x2& c, const v_##tp##x2& d ) \
+                                const v_##tp##x2& c, const v_##tp##x2& d, \
+                                hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) \
 { \
    vst1_##suffix(ptr, vget_low_##suffix(a.val)); \
    vst1_##suffix(ptr + 1, vget_low_##suffix(b.val)); \

--- a/modules/core/include/opencv2/core/hal/intrin_sse.hpp
+++ b/modules/core/include/opencv2/core/hal/intrin_sse.hpp
--- a/modules/core/include/opencv2/core/hal/intrin_vsx.hpp
+++ b/modules/core/include/opencv2/core/hal/intrin_vsx.hpp
@@ -249,6 +249,10 @@ inline void v_store(_Tp* ptr, const _Tpvec& a)                              \
 { st(a.val, 0, ptr); }                                                      \
 inline void v_store_aligned(VSX_UNUSED(_Tp* ptr), const _Tpvec& a)          \
 { st_a(a.val, 0, ptr); }                                                    \
+inline void v_store_aligned_nocache(VSX_UNUSED(_Tp* ptr), const _Tpvec& a)  \
+{ st_a(a.val, 0, ptr); }                                                    \
+inline void v_store(_Tp* ptr, const _Tpvec& a, hal::StoreMode mode)         \
+{ if(mode == hal::STORE_UNALIGNED) st(a.val, 0, ptr); else st_a(a.val, 0, ptr); } \
 inline void v_store_low(_Tp* ptr, const _Tpvec& a)                          \
 { vec_st_l8(a.val, ptr); }                                                  \
 inline void v_store_high(_Tp* ptr, const _Tpvec& a)                         \
@@ -281,13 +285,16 @@ inline void v_load_deinterleave(const _Tp* ptr, _Tpvec& a,                   \
 inline void v_load_deinterleave(const _Tp* ptr, _Tpvec& a, _Tpvec& b,        \
                                                _Tpvec& c, _Tpvec& d)        \
 { vec_ld_deinterleave(ptr, a.val, b.val, c.val, d.val); }                    \
-inline void v_store_interleave(_Tp* ptr, const _Tpvec& a, const _Tpvec& b)   \
+inline void v_store_interleave(_Tp* ptr, const _Tpvec& a, const _Tpvec& b,   \
+                               hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) \
 { vec_st_interleave(a.val, b.val, ptr); }                                    \
 inline void v_store_interleave(_Tp* ptr, const _Tpvec& a,                    \
-                               const _Tpvec& b, const _Tpvec& c)             \
+                               const _Tpvec& b, const _Tpvec& c,             \
+                               hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) \
 { vec_st_interleave(a.val, b.val, c.val, ptr); }                             \
 inline void v_store_interleave(_Tp* ptr, const _Tpvec& a, const _Tpvec& b,   \
-                                         const _Tpvec& c, const _Tpvec& d)   \
+                                         const _Tpvec& c, const _Tpvec& d,   \
+                               hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) \
 { vec_st_interleave(a.val, b.val, c.val, d.val, ptr); }

 OPENCV_HAL_IMPL_VSX_INTERLEAVE(uchar, v_uint8x16)

--- a/modules/core/include/opencv2/core/utility.hpp
+++ b/modules/core/include/opencv2/core/utility.hpp
@@ -457,6 +457,18 @@ Returns empty string if feature is not defined
 */
 CV_EXPORTS_W String getHardwareFeatureName(int feature);

+/** @brief Returns list of CPU features enabled during compilation.
+
+Returned value is a string containing space separated list of CPU features with following markers:
+
+- no markers - baseline features
+- prefix `*` - features enabled in dispatcher
+- suffix `?` - features enabled but not available in HW
+
+Example: `SSE SSE2 SSE3 *SSE4.1 *SSE4.2 *FP16 *AVX *AVX2 *AVX512-SKX?`
+*/
+CV_EXPORTS std::string getCPUFeaturesLine();
+
 /** @brief Returns the number of logical CPUs available for the process.
 */
 CV_EXPORTS_W int getNumberOfCPUs();

--- a/modules/core/src/arithm.cpp
+++ b/modules/core/src/arithm.cpp
@@ -1180,7 +1180,8 @@ void cv::compare(InputArray _src1, InputArray _src2, OutputArray _dst, int op)
    CV_Assert( op == CMP_LT || op == CMP_LE || op == CMP_EQ ||
               op == CMP_NE || op == CMP_GE || op == CMP_GT );

-    if(_src1.empty() || _src2.empty())
+    CV_Assert(_src1.empty() == _src2.empty());
+    if (_src1.empty() && _src2.empty())
    {
        _dst.release();
        return;

--- a/modules/core/src/copy.cpp
+++ b/modules/core/src/copy.cpp
@@ -411,7 +411,8 @@ Mat& Mat::operator = (const Scalar& s)
 {
    CV_INSTRUMENT_REGION()

-    if (empty()) return *this;
+    if (this->empty())
+        return *this;

    const Mat* arrays[] = { this };
    uchar* dptr;

--- a/modules/core/src/mathfuncs_core.simd.hpp
+++ b/modules/core/src/mathfuncs_core.simd.hpp
@@ -515,17 +515,17 @@ void exp32f( const float *_x, float *y, int n )

 #if CV_SIMD
    const int VECSZ = v_float32::nlanes;
-    static const v_float32 vprescale = vx_setall_f32((float)exp_prescale);
-    static const v_float32 vpostscale = vx_setall_f32((float)exp_postscale);
-    static const v_float32 vminval = vx_setall_f32(minval);
-    static const v_float32 vmaxval = vx_setall_f32(maxval);
+    const v_float32 vprescale = vx_setall_f32((float)exp_prescale);
+    const v_float32 vpostscale = vx_setall_f32((float)exp_postscale);
+    const v_float32 vminval = vx_setall_f32(minval);
+    const v_float32 vmaxval = vx_setall_f32(maxval);

-    static const v_float32 vA1 = vx_setall_f32((float)A1);
-    static const v_float32 vA2 = vx_setall_f32((float)A2);
-    static const v_float32 vA3 = vx_setall_f32((float)A3);
-    static const v_float32 vA4 = vx_setall_f32((float)A4);
+    const v_float32 vA1 = vx_setall_f32((float)A1);
+    const v_float32 vA2 = vx_setall_f32((float)A2);
+    const v_float32 vA3 = vx_setall_f32((float)A3);
+    const v_float32 vA4 = vx_setall_f32((float)A4);

-    static const v_int32 vidxmask = vx_setall_s32(EXPTAB_MASK);
+    const v_int32 vidxmask = vx_setall_s32(EXPTAB_MASK);
    bool y_aligned = (size_t)(void*)y % 32 == 0;

    for( ; i < n; i += VECSZ*2 )
@@ -627,18 +627,18 @@ void exp64f( const double *_x, double *y, int n )

 #if CV_SIMD_64F
    const int VECSZ = v_float64::nlanes;
-    static const v_float64 vprescale = vx_setall_f64(exp_prescale);
-    static const v_float64 vpostscale = vx_setall_f64(exp_postscale);
-    static const v_float64 vminval = vx_setall_f64(minval);
-    static const v_float64 vmaxval = vx_setall_f64(maxval);
-
-    static const v_float64 vA1 = vx_setall_f64(A1);
-    static const v_float64 vA2 = vx_setall_f64(A2);
-    static const v_float64 vA3 = vx_setall_f64(A3);
-    static const v_float64 vA4 = vx_setall_f64(A4);
-    static const v_float64 vA5 = vx_setall_f64(A5);
-
-    static const v_int32 vidxmask = vx_setall_s32(EXPTAB_MASK);
+    const v_float64 vprescale = vx_setall_f64(exp_prescale);
+    const v_float64 vpostscale = vx_setall_f64(exp_postscale);
+    const v_float64 vminval = vx_setall_f64(minval);
+    const v_float64 vmaxval = vx_setall_f64(maxval);
+
+    const v_float64 vA1 = vx_setall_f64(A1);
+    const v_float64 vA2 = vx_setall_f64(A2);
+    const v_float64 vA3 = vx_setall_f64(A3);
+    const v_float64 vA4 = vx_setall_f64(A4);
+    const v_float64 vA5 = vx_setall_f64(A5);
+
+    const v_int32 vidxmask = vx_setall_s32(EXPTAB_MASK);
    bool y_aligned = (size_t)(void*)y % 32 == 0;

    for( ; i < n; i += VECSZ*2 )
@@ -1024,13 +1024,13 @@ void log32f( const float *_x, float *y, int n )

 #if CV_SIMD
    const int VECSZ = v_float32::nlanes;
-    static const v_float32 vln2 = vx_setall_f32((float)ln_2);
-    static const v_float32 v1 = vx_setall_f32(1.f);
-    static const v_float32 vshift = vx_setall_f32(-1.f/512);
+    const v_float32 vln2 = vx_setall_f32((float)ln_2);
+    const v_float32 v1 = vx_setall_f32(1.f);
+    const v_float32 vshift = vx_setall_f32(-1.f/512);

-    static const v_float32 vA0 = vx_setall_f32(A0);
-    static const v_float32 vA1 = vx_setall_f32(A1);
-    static const v_float32 vA2 = vx_setall_f32(A2);
+    const v_float32 vA0 = vx_setall_f32(A0);
+    const v_float32 vA1 = vx_setall_f32(A1);
+    const v_float32 vA2 = vx_setall_f32(A2);

    for( ; i < n; i += VECSZ )
    {
@@ -1097,9 +1097,9 @@ void log64f( const double *x, double *y, int n )

 #if CV_SIMD_64F
    const int VECSZ = v_float64::nlanes;
-    static const v_float64 vln2 = vx_setall_f64(ln_2);
+    const v_float64 vln2 = vx_setall_f64(ln_2);

-    static const v_float64
+    const v_float64
        vA0 = vx_setall_f64(A0), vA1 = vx_setall_f64(A1),
        vA2 = vx_setall_f64(A2), vA3 = vx_setall_f64(A3),
        vA4 = vx_setall_f64(A4), vA5 = vx_setall_f64(A5),

--- a/modules/core/src/matrix.cpp
+++ b/modules/core/src/matrix.cpp
@@ -602,13 +602,13 @@ void Mat::pop_back(size_t nelems)

 void Mat::push_back_(const void* elem)
 {
-    int r = size.p[0];
+    size_t r = size.p[0];
    if( isSubmatrix() || dataend + step.p[0] > datalimit )
        reserve( std::max(r + 1, (r*3+1)/2) );

    size_t esz = elemSize();
    memcpy(data + r*step.p[0], elem, esz);
-    size.p[0] = r + 1;
+    size.p[0] = int(r + 1);
    dataend += step.p[0];
    uint64 tsz = size.p[0];
    for( int i = 1; i < dims; i++ )
@@ -709,7 +709,8 @@ void Mat::resize(size_t nelems, const Scalar& s)

 void Mat::push_back(const Mat& elems)
 {
-    int r = size.p[0], delta = elems.size.p[0];
+    size_t r = size.p[0];
+    size_t delta = elems.size.p[0];
    if( delta == 0 )
        return;
    if( this == &elems )
@@ -726,7 +727,7 @@ void Mat::push_back(const Mat& elems)

    size.p[0] = elems.size.p[0];
    bool eq = size == elems.size;
-    size.p[0] = r;
+    size.p[0] = int(r);
    if( !eq )
        CV_Error(CV_StsUnmatchedSizes, "Pushed vector length is not equal to matrix row length");
    if( type() != elems.type() )
@@ -735,7 +736,7 @@ void Mat::push_back(const Mat& elems)
    if( isSubmatrix() || dataend + step.p[0]*delta > datalimit )
        reserve( std::max(r + delta, (r*3+1)/2) );

-    size.p[0] += delta;
+    size.p[0] += int(delta);
    dataend += step.p[0]*delta;

    //updateContinuityFlag(*this);
@@ -744,7 +745,7 @@ void Mat::push_back(const Mat& elems)
        memcpy(data + r*step.p[0], elems.data, elems.total()*elems.elemSize());
    else
    {
-        Mat part = rowRange(r, r + delta);
+        Mat part = rowRange(int(r), int(r + delta));
        elems.copyTo(part);
    }
 }

--- a/modules/core/src/mean.cpp
+++ b/modules/core/src/mean.cpp
@@ -766,11 +766,13 @@ void cv::meanStdDev( InputArray _src, OutputArray _mean, OutputArray _sdv, Input
 {
    CV_INSTRUMENT_REGION()

+    CV_Assert(!_src.empty());
+    CV_Assert( _mask.empty() || _mask.type() == CV_8UC1 );
+
    CV_OCL_RUN(OCL_PERFORMANCE_CHECK(_src.isUMat()) && _src.dims() <= 2,
               ocl_meanStdDev(_src, _mean, _sdv, _mask))

    Mat src = _src.getMat(), mask = _mask.getMat();
-    CV_Assert( mask.empty() || mask.type() == CV_8UC1 );

    CV_OVX_RUN(!ovx::skipSmallImages<VX_KERNEL_MEAN_STDDEV>(src.cols, src.rows),
               openvx_meanStdDev(src, _mean, _sdv, mask))

--- a/modules/core/src/merge.cpp
+++ b/modules/core/src/merge.cpp
@@ -9,21 +9,58 @@
 namespace cv { namespace hal {

 #if CV_SIMD
+/*
+  The trick with STORE_UNALIGNED/STORE_ALIGNED_NOCACHE is the following:
+  on IA there are instructions movntps and such to which
+  v_store_interleave(...., STORE_ALIGNED_NOCACHE) is mapped.
+  Those instructions write directly into memory w/o touching cache
+  that results in dramatic speed improvements, especially on
+  large arrays (FullHD, 4K etc.).
+
+  Those intrinsics require the destination address to be aligned
+  by 16/32 bytes (with SSE2 and AVX2, respectively).
+  So we potentially split the processing into 3 stages:
+  1) the optional prefix part [0:i0), where we use simple unaligned stores.
+  2) the optional main part [i0:len - VECSZ], where we use "nocache" mode.
+     But in some cases we have to use unaligned stores in this part.
+  3) the optional suffix part (the tail) (len - VECSZ:len) where we switch back to "unaligned" mode
+     to process the last VECSZ elements (this final vector may overlap the previous one).
+  In principle there can be very poorly aligned data where there is no main part.
+  For that we set i0=0 and use unaligned stores for the whole array.
+*/
 template<typename T, typename VecT> static void
 vecmerge_( const T** src, T* dst, int len, int cn )
 {
-    int i;
+    const int VECSZ = VecT::nlanes;
+    int i, i0 = 0;
    const T* src0 = src[0];
    const T* src1 = src[1];

-    const int VECSZ = VecT::nlanes;
+    int r = (int)((size_t)(void*)dst % (VECSZ*sizeof(T))); // misalignment of dst in BYTES, relative to one vector
+    hal::StoreMode mode = hal::STORE_ALIGNED_NOCACHE;
+    if( r != 0 )
+    {
+        mode = hal::STORE_UNALIGNED; // prefix (or the whole array when i0 stays 0) uses unaligned stores
+        if( r % (cn*(int)sizeof(T)) == 0 && len > VECSZ*2 ) // whole interleaved pixels only; > 2 vectors so the tail never restarts before i0
+            i0 = VECSZ - (r / (cn*(int)sizeof(T))); // first pixel index where dst + i*cn is vector-aligned
+    }
+
    if( cn == 2 )
    {
        for( i = 0; i < len; i += VECSZ )
        {
-            i = std::min( len - VECSZ, i );
+            if( i > len - VECSZ )
+            {
+                i = len - VECSZ;
+                mode = hal::STORE_UNALIGNED;
+            }
            VecT a = vx_load(src0 + i), b = vx_load(src1 + i);
-            v_store_interleave(dst + i*cn, a, b);
+            v_store_interleave(dst + i*cn, a, b, mode);
+            if( i < i0 )
+            {
+                i = i0 - VECSZ;
+                mode = hal::STORE_ALIGNED_NOCACHE;
+            }
        }
    }
    else if( cn == 3 )
@@ -31,9 +68,18 @@ vecmerge_( const T** src, T* dst, int len, int cn )
        const T* src2 = src[2];
        for( i = 0; i < len; i += VECSZ )
        {
-            i = std::min( len - VECSZ, i );
+            if( i > len - VECSZ )
+            {
+                i = len - VECSZ;
+                mode = hal::STORE_UNALIGNED;
+            }
            VecT a = vx_load(src0 + i), b = vx_load(src1 + i), c = vx_load(src2 + i);
-            v_store_interleave(dst + i*cn, a, b, c);
+            v_store_interleave(dst + i*cn, a, b, c, mode);
+            if( i < i0 )
+            {
+                i = i0 - VECSZ;
+                mode = hal::STORE_ALIGNED_NOCACHE;
+            }
        }
    }
    else
@@ -43,10 +89,19 @@ vecmerge_( const T** src, T* dst, int len, int cn )
        const T* src3 = src[3];
        for( i = 0; i < len; i += VECSZ )
        {
-            i = std::min( len - VECSZ, i );
+            if( i > len - VECSZ )
+            {
+                i = len - VECSZ;
+                mode = hal::STORE_UNALIGNED;
+            }
            VecT a = vx_load(src0 + i), b = vx_load(src1 + i);
            VecT c = vx_load(src2 + i), d = vx_load(src3 + i);
-            v_store_interleave(dst + i*cn, a, b, c, d);
+            v_store_interleave(dst + i*cn, a, b, c, d, mode);
+            if( i < i0 )
+            {
+                i = i0 - VECSZ;
+                mode = hal::STORE_ALIGNED_NOCACHE;
+            }
        }
    }
    vx_cleanup();

--- a/modules/core/src/ocl.cpp
+++ b/modules/core/src/ocl.cpp
@@ -2834,7 +2834,22 @@ extern "C" {

 static void CL_CALLBACK oclCleanupCallback(cl_event e, cl_int, void *p)
 {
-    ((cv::ocl::Kernel::Impl*)p)->finit(e);
+    try
+    {
+        ((cv::ocl::Kernel::Impl*)p)->finit(e);
+    }
+    catch (const cv::Exception& exc)
+    {
+        CV_LOG_ERROR(NULL, "OCL: Unexpected OpenCV exception in OpenCL callback: " << exc.what());
+    }
+    catch (const std::exception& exc)
+    {
+        CV_LOG_ERROR(NULL, "OCL: Unexpected C++ exception in OpenCL callback: " << exc.what());
+    }
+    catch (...)
+    {
+        CV_LOG_ERROR(NULL, "OCL: Unexpected unknown C++ exception in OpenCL callback");
+    }
 }

 }

--- a/modules/core/src/rand.cpp
+++ b/modules/core/src/rand.cpp
@@ -511,8 +511,8 @@ static RandnScaleFunc randnScaleTab[] =
 void RNG::fill( InputOutputArray _mat, int disttype,
                InputArray _param1arg, InputArray _param2arg, bool saturateRange )
 {
-    if (_mat.empty())
-        return;
+    CV_Assert(!_mat.empty());
+
    Mat mat = _mat.getMat(), _param1 = _param1arg.getMat(), _param2 = _param2arg.getMat();
    int depth = mat.depth(), cn = mat.channels();
    AutoBuffer<double> _parambuf;

--- a/modules/core/src/split.cpp
+++ b/modules/core/src/split.cpp
@@ -9,23 +9,46 @@
 namespace cv { namespace hal {

 #if CV_SIMD
+// see the comments for vecmerge_ in merge.cpp
 template<typename T, typename VecT> static void
 vecsplit_( const T* src, T** dst, int len, int cn )
 {
-    int i;
+    const int VECSZ = VecT::nlanes;
+    int i, i0 = 0;
    T* dst0 = dst[0];
    T* dst1 = dst[1];

-    const int VECSZ = VecT::nlanes;
+    int r0 = (int)((size_t)(void*)dst0 % (VECSZ*sizeof(T))); // misalignment of each destination plane in BYTES
+    int r1 = (int)((size_t)(void*)dst1 % (VECSZ*sizeof(T)));
+    int r2 = cn > 2 ? (int)((size_t)(void*)dst[2] % (VECSZ*sizeof(T))) : r0; // unused planes mirror r0 so they do not affect the checks
+    int r3 = cn > 3 ? (int)((size_t)(void*)dst[3] % (VECSZ*sizeof(T))) : r0;
+
+    hal::StoreMode mode = hal::STORE_ALIGNED_NOCACHE;
+    if( (r0|r1|r2|r3) != 0 )
+    {
+        mode = hal::STORE_UNALIGNED; // prefix (or the whole array when i0 stays 0) uses unaligned stores
+        if( r0 == r1 && r0 == r2 && r0 == r3 && r0 % (int)sizeof(T) == 0 && len > VECSZ*2 ) // planes are single-channel: offset is in elements of T, not pixels
+            i0 = VECSZ - (r0 / (int)sizeof(T)); // first element index where every dstK + i is vector-aligned
+    }
+
    if( cn == 2 )
    {
        for( i = 0; i < len; i += VECSZ )
        {
-            i = std::min( len - VECSZ, i );
+            if( i > len - VECSZ )
+            {
+                i = len - VECSZ;
+                mode = hal::STORE_UNALIGNED;
+            }
            VecT a, b;
            v_load_deinterleave(src + i*cn, a, b);
-            v_store(dst0 + i, a);
-            v_store(dst1 + i, b);
+            v_store(dst0 + i, a, mode);
+            v_store(dst1 + i, b, mode);
+            if( i < i0 )
+            {
+                i = i0 - VECSZ;
+                mode = hal::STORE_ALIGNED_NOCACHE;
+            }
        }
    }
    else if( cn == 3 )
@@ -33,12 +56,21 @@ vecsplit_( const T* src, T** dst, int len, int cn )
        T* dst2 = dst[2];
        for( i = 0; i < len; i += VECSZ )
        {
-            i = std::min( len - VECSZ, i );
+            if( i > len - VECSZ )
+            {
+                i = len - VECSZ;
+                mode = hal::STORE_UNALIGNED;
+            }
            VecT a, b, c;
            v_load_deinterleave(src + i*cn, a, b, c);
-            v_store(dst0 + i, a);
-            v_store(dst1 + i, b);
-            v_store(dst2 + i, c);
+            v_store(dst0 + i, a, mode);
+            v_store(dst1 + i, b, mode);
+            v_store(dst2 + i, c, mode);
+            if( i < i0 )
+            {
+                i = i0 - VECSZ;
+                mode = hal::STORE_ALIGNED_NOCACHE;
+            }
        }
    }
    else
@@ -48,13 +80,22 @@ vecsplit_( const T* src, T** dst, int len, int cn )
        T* dst3 = dst[3];
        for( i = 0; i < len; i += VECSZ )
        {
-            i = std::min( len - VECSZ, i );
+            if( i > len - VECSZ )
+            {
+                i = len - VECSZ;
+                mode = hal::STORE_UNALIGNED;
+            }
            VecT a, b, c, d;
            v_load_deinterleave(src + i*cn, a, b, c, d);
-            v_store(dst0 + i, a);
-            v_store(dst1 + i, b);
-            v_store(dst2 + i, c);
-            v_store(dst3 + i, d);
+            v_store(dst0 + i, a, mode);
+            v_store(dst1 + i, b, mode);
+            v_store(dst2 + i, c, mode);
+            v_store(dst3 + i, d, mode);
+            if( i < i0 )
+            {
+                i = i0 - VECSZ;
+                mode = hal::STORE_ALIGNED_NOCACHE;
+            }
        }
    }
    vx_cleanup();

--- a/modules/core/src/system.cpp
+++ b/modules/core/src/system.cpp
@@ -654,6 +654,34 @@ String getHardwareFeatureName(int feature)
    return name ? String(name) : String();
 }

+// Builds a space-separated list of the CPU features this build was compiled with.
+// Values from CV_CPU_BASELINE_FEATURES are listed first without a marker; once a 0
+// entry is met, every following feature gets the "*" prefix. A "?" suffix is added
+// to any feature for which checkHardwareSupport() returns false.
+std::string getCPUFeaturesLine()
+{
+    const int features[] = { CV_CPU_BASELINE_FEATURES, CV_CPU_DISPATCH_FEATURES };
+    const int sz = sizeof(features) / sizeof(features[0]);
+    std::string result;
+    std::string prefix;
+    // NOTE(review): index 0 is skipped - presumably both lists start with a 0 marker; confirm.
+    for (int i = 1; i < sz; ++i)
+    {
+        if (features[i] == 0)
+        {
+            prefix = "*";
+            continue;
+        }
+        // Add the separator only after a previous entry, so that an empty baseline
+        // list does not yield a leading space.
+        if (!result.empty()) result.append(" ");
+        result.append(prefix);
+        result.append(getHWFeatureNameSafe(features[i]));
+        if (!checkHardwareSupport(features[i])) result.append("?");
+    }
+    return result;
+}
+
 volatile bool useOptimizedFlag = true;

 void setUseOptimized( bool flag )

--- a/modules/core/src/umatrix.cpp
+++ b/modules/core/src/umatrix.cpp
@@ -84,14 +84,11 @@ UMatData::~UMatData()
    allocatorFlags_ = 0;
    if (originalUMatData)
    {
-        UMatData* u = originalUMatData;
-        CV_XADD(&(u->urefcount), -1);
-        CV_XADD(&(u->refcount), -1);
        bool showWarn = false;
-        if (u->refcount == 0)
+        UMatData* u = originalUMatData;
+        bool zero_Ref = CV_XADD(&(u->refcount), -1) == 1;
+        if (zero_Ref)
        {
-            if (u->urefcount > 0)
-                showWarn = true;
            // simulate Mat::deallocate
            if (u->mapcount != 0)
            {
@@ -102,7 +99,10 @@ UMatData::~UMatData()
                // we don't do "map", so we can't do "unmap"
            }
        }
-        if (u->refcount == 0 && u->urefcount == 0) // oops, we need to free resources
+        bool zero_URef = CV_XADD(&(u->urefcount), -1) == 1;
+        if (zero_Ref && !zero_URef)
+            showWarn = true;
+        if (zero_Ref && zero_URef) // oops, we need to free resources
        {
            showWarn = true;
            // simulate UMat::deallocate

--- a/modules/core/test/test_arithm.cpp
+++ b/modules/core/test/test_arithm.cpp
@@ -2008,11 +2008,9 @@ TEST(Subtract, scalarc4_matc4)
 TEST(Compare, empty)
 {
    cv::Mat temp, dst1, dst2;
-    cv::compare(temp, temp, dst1, cv::CMP_EQ);
-    dst2 = temp > 5;
-
+    EXPECT_NO_THROW(cv::compare(temp, temp, dst1, cv::CMP_EQ));
    EXPECT_TRUE(dst1.empty());
-    EXPECT_TRUE(dst2.empty());
+    EXPECT_THROW(dst2 = temp > 5, cv::Exception);
 }

 TEST(Compare, regression_8999)
@@ -2020,9 +2018,7 @@ TEST(Compare, regression_8999)
    Mat_<double> A(4,1); A << 1, 3, 2, 4;
    Mat_<double> B(1,1); B << 2;
    Mat C;
-    ASSERT_ANY_THROW({
-        cv::compare(A, B, C, CMP_LT);
-    });
+    EXPECT_THROW(cv::compare(A, B, C, CMP_LT), cv::Exception);
 }



--- a/modules/core/test/test_concatenation.cpp
+++ b/modules/core/test/test_concatenation.cpp
@@ -43,106 +43,35 @@

 namespace opencv_test { namespace {

-class Core_ConcatenationTest : public cvtest::BaseTest
+TEST(Core_Concatenation, empty)
 {
-public:
-    Core_ConcatenationTest(bool horizontal, bool firstEmpty, bool secondEmpty);
-protected:
-    int prepare_test_case( int );
-    void run_func();
-    int validate_test_results( int );
+    const Mat mat0x5(0,5, CV_8U, Scalar::all(1));
+    const Mat mat10x5(10,5, CV_8U, Scalar::all(1));
+    const Mat mat20x5(20,5, CV_8U, Scalar::all(1));

-    Mat mat0x5;
-    Mat mat10x5;
-    Mat mat20x5;
-
-    Mat mat5x0;
-    Mat mat5x10;
-    Mat mat5x20;
+    const Mat mat5x0(5,0, CV_8U, Scalar::all(1));
+    const Mat mat5x10(5,10, CV_8U, Scalar::all(1));
+    const Mat mat5x20(5,20, CV_8U, Scalar::all(1));

    Mat result;

-    bool horizontal;
-    bool firstEmpty;
-    bool secondEmpty;
-
-private:
-    static bool areEqual(const Mat& m1, const Mat& m2);
-
-};
-
-Core_ConcatenationTest::Core_ConcatenationTest(bool horizontal_, bool firstEmpty_, bool secondEmpty_)
-    : horizontal(horizontal_)
-    , firstEmpty(firstEmpty_)
-    , secondEmpty(secondEmpty_)
-{
-    test_case_count = 1;
-
-    mat0x5 = Mat::ones(0,5, CV_8U);
-    mat10x5 = Mat::ones(10,5, CV_8U);
-    mat20x5 = Mat::ones(20,5, CV_8U);
-
-    mat5x0 = Mat::ones(5,0, CV_8U);
-    mat5x10 = Mat::ones(5,10, CV_8U);
-    mat5x20 = Mat::ones(5,20, CV_8U);
-}
-
-int Core_ConcatenationTest::prepare_test_case( int test_case_idx )
-{
-    cvtest::BaseTest::prepare_test_case( test_case_idx );
-    return 1;
-}
-
-void Core_ConcatenationTest::run_func()
-{
-    if (horizontal)
-    {
-        cv::hconcat((firstEmpty ? mat5x0 : mat5x10),
-                    (secondEmpty ? mat5x0 : mat5x10),
-                    result);
-    } else {
-        cv::vconcat((firstEmpty ? mat0x5 : mat10x5),
-                    (secondEmpty ? mat0x5 : mat10x5),
-                    result);
-    }
-}
-
-int Core_ConcatenationTest::validate_test_results( int )
-{
-    Mat expected;
-
-    if (firstEmpty && secondEmpty)
-        expected = (horizontal ? mat5x0 : mat0x5);
-    else if ((firstEmpty && !secondEmpty) || (!firstEmpty && secondEmpty))
-        expected = (horizontal ? mat5x10 : mat10x5);
-    else
-        expected = (horizontal ? mat5x20 : mat20x5);
-
-    if (areEqual(expected, result))
-    {
-        return cvtest::TS::OK;
-    } else
-    {
-        ts->printf( cvtest::TS::LOG, "Concatenation failed");
-        ts->set_failed_test_info( cvtest::TS::FAIL_MISMATCH );
-    }
-
-    return cvtest::TS::OK;
-}
-
-bool Core_ConcatenationTest::areEqual(const Mat &m1, const Mat &m2)
-{
-    return m1.size() == m2.size()
-            && m1.type() == m2.type()
-            && countNonZero(m1 != m2) == 0;
+    cv::hconcat(mat5x0, mat5x0, result);
+    EXPECT_MAT_N_DIFF(result, mat5x0, 0);
+    cv::hconcat(mat5x0, mat5x10, result);
+    EXPECT_MAT_N_DIFF(result, mat5x10, 0);
+    cv::hconcat(mat5x10, mat5x0, result);
+    EXPECT_MAT_N_DIFF(result, mat5x10, 0);
+    cv::hconcat(mat5x10, mat5x10, result);
+    EXPECT_MAT_N_DIFF(result, mat5x20, 0);
+
+    cv::vconcat(mat0x5, mat0x5, result);
+    EXPECT_MAT_N_DIFF(result, mat0x5, 0);
+    cv::vconcat(mat0x5, mat10x5, result);
+    EXPECT_MAT_N_DIFF(result, mat10x5, 0);
+    cv::vconcat(mat10x5, mat0x5, result);
+    EXPECT_MAT_N_DIFF(result, mat10x5, 0);
+    cv::vconcat(mat10x5, mat10x5, result);
+    EXPECT_MAT_N_DIFF(result, mat20x5, 0);
 }

-TEST(Core_Concatenation, hconcat_empty_nonempty) { Core_ConcatenationTest test(true, true, false); test.safe_run(); }
-TEST(Core_Concatenation, hconcat_nonempty_empty) { Core_ConcatenationTest test(true, false, true); test.safe_run(); }
-TEST(Core_Concatenation, hconcat_empty_empty) { Core_ConcatenationTest test(true, true, true); test.safe_run(); }
-
-TEST(Core_Concatenation, vconcat_empty_nonempty) { Core_ConcatenationTest test(false, true, false); test.safe_run(); }
-TEST(Core_Concatenation, vconcat_nonempty_empty) { Core_ConcatenationTest test(false, false, true); test.safe_run(); }
-TEST(Core_Concatenation, vconcat_empty_empty) { Core_ConcatenationTest test(false, true, true); test.safe_run(); }
-
 }} // namespace
--- a/modules/core/test/test_intrin.avx2.cpp
+++ b/modules/core/test/test_intrin.avx2.cpp
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+#include "test_precomp.hpp"
+#include "test_intrin.simd.hpp"
\ No newline at end of file
--- a/modules/core/test/test_intrin.cpp
+++ b/modules/core/test/test_intrin.cpp
@@ -2,249 +2,101 @@
 // It is subject to the license terms in the LICENSE file found in the top-level directory
 // of this distribution and at http://opencv.org/license.html.
 #include "test_precomp.hpp"
+#include "test_intrin.simd.hpp"

-#include "test_intrin_utils.hpp"
-
-#define CV_CPU_SIMD_FILENAME "test_intrin_utils.hpp"
+#define CV_CPU_SIMD_FILENAME "test_intrin.simd.hpp"
 #define CV_CPU_DISPATCH_MODE FP16
 #include "opencv2/core/private/cv_cpu_include_simd_declarations.hpp"

-
-using namespace cv;
+#define CV_CPU_DISPATCH_MODE AVX2
+#include "opencv2/core/private/cv_cpu_include_simd_declarations.hpp"

 namespace opencv_test { namespace hal {
 using namespace CV_CPU_OPTIMIZATION_NAMESPACE;

-//=============  8-bit integer =====================================================================
-
-TEST(hal_intrin, uint8x16) {
-    TheTest<v_uint8x16>()
-        .test_loadstore()
-        .test_interleave()
-        .test_expand()
-        .test_expand_q()
-        .test_addsub()
-        .test_addsub_wrap()
-        .test_cmp()
-        .test_logic()
-        .test_min_max()
-        .test_absdiff()
-        .test_mask()
-        .test_popcount()
-        .test_pack<1>().test_pack<2>().test_pack<3>().test_pack<8>()
-        .test_pack_u<1>().test_pack_u<2>().test_pack_u<3>().test_pack_u<8>()
-        .test_unpack()
-        .test_extract<0>().test_extract<1>().test_extract<8>().test_extract<15>()
-        .test_rotate<0>().test_rotate<1>().test_rotate<8>().test_rotate<15>()
-        ;
-}
+TEST(hal_intrin, uint8x16)
+{ test_hal_intrin_uint8(); }

-TEST(hal_intrin, int8x16) {
-    TheTest<v_int8x16>()
-        .test_loadstore()
-        .test_interleave()
-        .test_expand()
-        .test_expand_q()
-        .test_addsub()
-        .test_addsub_wrap()
-        .test_cmp()
-        .test_logic()
-        .test_min_max()
-        .test_absdiff()
-        .test_abs()
-        .test_mask()
-        .test_popcount()
-        .test_pack<1>().test_pack<2>().test_pack<3>().test_pack<8>()
-        .test_unpack()
-        .test_extract<0>().test_extract<1>().test_extract<8>().test_extract<15>()
-        .test_rotate<0>().test_rotate<1>().test_rotate<8>().test_rotate<15>()
-        ;
-}
+TEST(hal_intrin, int8x16)
+{ test_hal_intrin_int8(); }

-//============= 16-bit integer =====================================================================
-
-TEST(hal_intrin, uint16x8) {
-    TheTest<v_uint16x8>()
-        .test_loadstore()
-        .test_interleave()
-        .test_expand()
-        .test_addsub()
-        .test_addsub_wrap()
-        .test_mul()
-        .test_mul_expand()
-        .test_cmp()
-        .test_shift<1>()
-        .test_shift<8>()
-        .test_logic()
-        .test_min_max()
-        .test_absdiff()
-        .test_reduce()
-        .test_mask()
-        .test_popcount()
-        .test_pack<1>().test_pack<2>().test_pack<7>().test_pack<16>()
-        .test_pack_u<1>().test_pack_u<2>().test_pack_u<7>().test_pack_u<16>()
-        .test_unpack()
-        .test_extract<0>().test_extract<1>().test_extract<4>().test_extract<7>()
-        .test_rotate<0>().test_rotate<1>().test_rotate<4>().test_rotate<7>()
-        ;
-}
+TEST(hal_intrin, uint16x8)
+{ test_hal_intrin_uint16(); }

-TEST(hal_intrin, int16x8) {
-    TheTest<v_int16x8>()
-        .test_loadstore()
-        .test_interleave()
-        .test_expand()
-        .test_addsub()
-        .test_addsub_wrap()
-        .test_mul()
-        .test_mul_expand()
-        .test_cmp()
-        .test_shift<1>()
-        .test_shift<8>()
-        .test_dot_prod()
-        .test_logic()
-        .test_min_max()
-        .test_absdiff()
-        .test_abs()
-        .test_reduce()
-        .test_mask()
-        .test_popcount()
-        .test_pack<1>().test_pack<2>().test_pack<7>().test_pack<16>()
-        .test_unpack()
-        .test_extract<0>().test_extract<1>().test_extract<4>().test_extract<7>()
-        .test_rotate<0>().test_rotate<1>().test_rotate<4>().test_rotate<7>()
-        ;
-}
+TEST(hal_intrin, int16x8)
+{ test_hal_intrin_int16(); }

-//============= 32-bit integer =====================================================================
-
-TEST(hal_intrin, uint32x4) {
-    TheTest<v_uint32x4>()
-        .test_loadstore()
-        .test_interleave()
-        .test_expand()
-        .test_addsub()
-        .test_mul()
-        .test_mul_expand()
-        .test_cmp()
-        .test_shift<1>()
-        .test_shift<8>()
-        .test_logic()
-        .test_min_max()
-        .test_absdiff()
-        .test_reduce()
-        .test_mask()
-        .test_popcount()
-        .test_pack<1>().test_pack<2>().test_pack<15>().test_pack<32>()
-        .test_unpack()
-        .test_extract<0>().test_extract<1>().test_extract<2>().test_extract<3>()
-        .test_rotate<0>().test_rotate<1>().test_rotate<2>().test_rotate<3>()
-        .test_transpose()
-        ;
-}
+TEST(hal_intrin, int32x4)
+{ test_hal_intrin_int32(); }

-TEST(hal_intrin, int32x4) {
-    TheTest<v_int32x4>()
-        .test_loadstore()
-        .test_interleave()
-        .test_expand()
-        .test_addsub()
-        .test_mul()
-        .test_abs()
-        .test_cmp()
-        .test_popcount()
-        .test_shift<1>().test_shift<8>()
-        .test_logic()
-        .test_min_max()
-        .test_absdiff()
-        .test_reduce()
-        .test_mask()
-        .test_pack<1>().test_pack<2>().test_pack<15>().test_pack<32>()
-        .test_unpack()
-        .test_extract<0>().test_extract<1>().test_extract<2>().test_extract<3>()
-        .test_rotate<0>().test_rotate<1>().test_rotate<2>().test_rotate<3>()
-        .test_float_cvt32()
-        .test_float_cvt64()
-        .test_transpose()
-        ;
-}
+TEST(hal_intrin, uint32x4)
+{ test_hal_intrin_uint32(); }

-//============= 64-bit integer =====================================================================
-
-TEST(hal_intrin, uint64x2) {
-    TheTest<v_uint64x2>()
-        .test_loadstore()
-        .test_addsub()
-        .test_shift<1>().test_shift<8>()
-        .test_logic()
-        .test_extract<0>().test_extract<1>()
-        .test_rotate<0>().test_rotate<1>()
-        ;
-}
+TEST(hal_intrin, uint64x2)
+{ test_hal_intrin_uint64(); }

-TEST(hal_intrin, int64x2) {
-    TheTest<v_int64x2>()
-        .test_loadstore()
-        .test_addsub()
-        .test_shift<1>().test_shift<8>()
-        .test_logic()
-        .test_extract<0>().test_extract<1>()
-        .test_rotate<0>().test_rotate<1>()
-        ;
-}
+TEST(hal_intrin, int64x2)
+{ test_hal_intrin_int64(); }

-//============= Floating point =====================================================================
-
-TEST(hal_intrin, float32x4) {
-    TheTest<v_float32x4>()
-        .test_loadstore()
-        .test_interleave()
-        .test_interleave_2channel()
-        .test_addsub()
-        .test_mul()
-        .test_div()
-        .test_cmp()
-        .test_sqrt_abs()
-        .test_min_max()
-        .test_float_absdiff()
-        .test_reduce()
-        .test_mask()
-        .test_unpack()
-        .test_float_math()
-        .test_float_cvt64()
-        .test_matmul()
-        .test_transpose()
-        .test_reduce_sum4()
-        .test_extract<0>().test_extract<1>().test_extract<2>().test_extract<3>()
-        .test_rotate<0>().test_rotate<1>().test_rotate<2>().test_rotate<3>()
-        ;
-}
+TEST(hal_intrin, float32x4)
+{ test_hal_intrin_float32(); }

-#if CV_SIMD128_64F
-TEST(hal_intrin, float64x2) {
-    TheTest<v_float64x2>()
-        .test_loadstore()
-        .test_addsub()
-        .test_mul()
-        .test_div()
-        .test_cmp()
-        .test_sqrt_abs()
-        .test_min_max()
-        .test_float_absdiff()
-        .test_mask()
-        .test_unpack()
-        .test_float_math()
-        .test_float_cvt32()
-        .test_extract<0>().test_extract<1>()
-        .test_rotate<0>().test_rotate<1>()
-        ;
-}
-#endif
+TEST(hal_intrin, float64x2)
+{ test_hal_intrin_float64(); }

-TEST(hal_intrin,float16)
+TEST(hal_intrin, float16x8)
 {
    CV_CPU_CALL_FP16_(test_hal_intrin_float16, ());
    throw SkipTestException("Unsupported hardware: FP16 is not available");
 }

-}}
+#define DISPATCH_SIMD_MODES AVX2
+#define DISPATCH_SIMD_NAME "SIMD256"
+#define DISPATCH_SIMD(fun)                              \
+    do {                                                \
+        CV_CPU_DISPATCH(fun, (), DISPATCH_SIMD_MODES);  \
+        throw SkipTestException(                        \
+            "Unsupported hardware: "                    \
+            DISPATCH_SIMD_NAME                          \
+            " is not available"                         \
+        );                                              \
+    } while(0)
+
+TEST(hal_intrin256, uint8x32)
+{ DISPATCH_SIMD(test_hal_intrin_uint8); }
+
+TEST(hal_intrin256, int8x32)
+{ DISPATCH_SIMD(test_hal_intrin_int8); }
+
+TEST(hal_intrin256, uint16x16)
+{ DISPATCH_SIMD(test_hal_intrin_uint16); }
+
+TEST(hal_intrin256, int16x16)
+{ DISPATCH_SIMD(test_hal_intrin_int16); }
+
+TEST(hal_intrin256, uint32x8)
+{ DISPATCH_SIMD(test_hal_intrin_uint32); }
+
+TEST(hal_intrin256, int32x8)
+{ DISPATCH_SIMD(test_hal_intrin_int32); }
+
+TEST(hal_intrin256, uint64x4)
+{ DISPATCH_SIMD(test_hal_intrin_uint64); }
+
+TEST(hal_intrin256, int64x4)
+{ DISPATCH_SIMD(test_hal_intrin_int64); }
+
+TEST(hal_intrin256, float32x8)
+{ DISPATCH_SIMD(test_hal_intrin_float32); }
+
+TEST(hal_intrin256, float64x4)
+{ DISPATCH_SIMD(test_hal_intrin_float64); }
+
+TEST(hal_intrin256, float16x16)
+{
+    if (!CV_CPU_HAS_SUPPORT_FP16)
+        throw SkipTestException("Unsupported hardware: FP16 is not available");
+    DISPATCH_SIMD(test_hal_intrin_float16);
+}
+
+}} // namespace
\ No newline at end of file
--- a/modules/core/test/test_intrin.fp16.cpp
+++ b/modules/core/test/test_intrin.fp16.cpp
@@ -9,7 +9,7 @@ CV_CPU_OPTIMIZATION_NAMESPACE_BEGIN

 void test_hal_intrin_float16()
 {
-    TheTest<v_float16x8>()
+    TheTest<v_float16>()
        .test_loadstore_fp16()
        .test_float_cvt_fp16()
        ;

--- a/modules/core/test/test_intrin.simd.hpp
+++ b/modules/core/test/test_intrin.simd.hpp
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+#include "test_precomp.hpp"
+#include "test_intrin_utils.hpp"
+
+namespace opencv_test { namespace hal {
+CV_CPU_OPTIMIZATION_NAMESPACE_BEGIN
+
+void test_hal_intrin_uint8();
+void test_hal_intrin_int8();
+void test_hal_intrin_uint16();
+void test_hal_intrin_int16();
+void test_hal_intrin_uint32();
+void test_hal_intrin_int32();
+void test_hal_intrin_uint64();
+void test_hal_intrin_int64();
+void test_hal_intrin_float32();
+void test_hal_intrin_float64();
+
+#ifndef CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY
+
+//=============  8-bit integer =====================================================================
+
+// Runs the TheTest<v_uint8> check chain (load/store, interleave, expand, expand_q,
+// addsub, addsub_wrap, cmp, logic, min_max, absdiff, mask, popcount, pack, pack_u,
+// unpack, extract, rotate).
+// When CV_SIMD256 is set, a second chain repeats pack/pack_u with template
+// arguments 9..15 and extract/rotate with template arguments 16..31.
+void test_hal_intrin_uint8()
+{
+    TheTest<v_uint8>()
+        .test_loadstore()
+        .test_interleave()
+        .test_expand()
+        .test_expand_q()
+        .test_addsub()
+        .test_addsub_wrap()
+        .test_cmp()
+        .test_logic()
+        .test_min_max()
+        .test_absdiff()
+        .test_mask()
+        .test_popcount()
+        .test_pack<1>().test_pack<2>().test_pack<3>().test_pack<8>()
+        .test_pack_u<1>().test_pack_u<2>().test_pack_u<3>().test_pack_u<8>()
+        .test_unpack()
+        .test_extract<0>().test_extract<1>().test_extract<8>().test_extract<15>()
+        .test_rotate<0>().test_rotate<1>().test_rotate<8>().test_rotate<15>()
+        ;
+
+#if CV_SIMD256
+    TheTest<v_uint8>()
+        .test_pack<9>().test_pack<10>().test_pack<13>().test_pack<15>()
+        .test_pack_u<9>().test_pack_u<10>().test_pack_u<13>().test_pack_u<15>()
+        .test_extract<16>().test_extract<17>().test_extract<23>().test_extract<31>()
+        .test_rotate<16>().test_rotate<17>().test_rotate<23>().test_rotate<31>()
+        ;
+#endif
+}
+
+// Runs the TheTest<v_int8> check chain. Compared with test_hal_intrin_uint8 it adds
+// test_abs, has no test_pack_u calls, and has no CV_SIMD256-only second chain.
+void test_hal_intrin_int8()
+{
+    TheTest<v_int8>()
+        .test_loadstore()
+        .test_interleave()
+        .test_expand()
+        .test_expand_q()
+        .test_addsub()
+        .test_addsub_wrap()
+        .test_cmp()
+        .test_logic()
+        .test_min_max()
+        .test_absdiff()
+        .test_abs()
+        .test_mask()
+        .test_popcount()
+        .test_pack<1>().test_pack<2>().test_pack<3>().test_pack<8>()
+        .test_unpack()
+        .test_extract<0>().test_extract<1>().test_extract<8>().test_extract<15>()
+        .test_rotate<0>().test_rotate<1>().test_rotate<8>().test_rotate<15>()
+        ;
+}
+
+//============= 16-bit integer =====================================================================
+
+// Runs the TheTest<v_uint16> check chain, including mul, mul_expand, shift<1>/<8>,
+// reduce, and pack/pack_u with template arguments 1, 2, 7 and 16.
+void test_hal_intrin_uint16()
+{
+    TheTest<v_uint16>()
+        .test_loadstore()
+        .test_interleave()
+        .test_expand()
+        .test_addsub()
+        .test_addsub_wrap()
+        .test_mul()
+        .test_mul_expand()
+        .test_cmp()
+        .test_shift<1>()
+        .test_shift<8>()
+        .test_logic()
+        .test_min_max()
+        .test_absdiff()
+        .test_reduce()
+        .test_mask()
+        .test_popcount()
+        .test_pack<1>().test_pack<2>().test_pack<7>().test_pack<16>()
+        .test_pack_u<1>().test_pack_u<2>().test_pack_u<7>().test_pack_u<16>()
+        .test_unpack()
+        .test_extract<0>().test_extract<1>().test_extract<4>().test_extract<7>()
+        .test_rotate<0>().test_rotate<1>().test_rotate<4>().test_rotate<7>()
+        ;
+}
+
+// Runs the TheTest<v_int16> check chain. Compared with test_hal_intrin_uint16 it adds
+// test_dot_prod and test_abs and has no test_pack_u calls.
+void test_hal_intrin_int16()
+{
+    TheTest<v_int16>()
+        .test_loadstore()
+        .test_interleave()
+        .test_expand()
+        .test_addsub()
+        .test_addsub_wrap()
+        .test_mul()
+        .test_mul_expand()
+        .test_cmp()
+        .test_shift<1>()
+        .test_shift<8>()
+        .test_dot_prod()
+        .test_logic()
+        .test_min_max()
+        .test_absdiff()
+        .test_abs()
+        .test_reduce()
+        .test_mask()
+        .test_popcount()
+        .test_pack<1>().test_pack<2>().test_pack<7>().test_pack<16>()
+        .test_unpack()
+        .test_extract<0>().test_extract<1>().test_extract<4>().test_extract<7>()
+        .test_rotate<0>().test_rotate<1>().test_rotate<4>().test_rotate<7>()
+        ;
+}
+
+//============= 32-bit integer =====================================================================
+
+// Runs the TheTest<v_uint32> check chain; pack uses template arguments 1, 2, 15 and 32,
+// and the chain ends with test_transpose.
+void test_hal_intrin_uint32()
+{
+    TheTest<v_uint32>()
+        .test_loadstore()
+        .test_interleave()
+        .test_expand()
+        .test_addsub()
+        .test_mul()
+        .test_mul_expand()
+        .test_cmp()
+        .test_shift<1>()
+        .test_shift<8>()
+        .test_logic()
+        .test_min_max()
+        .test_absdiff()
+        .test_reduce()
+        .test_mask()
+        .test_popcount()
+        .test_pack<1>().test_pack<2>().test_pack<15>().test_pack<32>()
+        .test_unpack()
+        .test_extract<0>().test_extract<1>().test_extract<2>().test_extract<3>()
+        .test_rotate<0>().test_rotate<1>().test_rotate<2>().test_rotate<3>()
+        .test_transpose()
+        ;
+}
+
+// Runs the TheTest<v_int32> check chain. Unlike test_hal_intrin_uint32 it has no
+// test_mul_expand, and it adds test_abs plus the float conversion checks
+// test_float_cvt32 and test_float_cvt64.
+void test_hal_intrin_int32()
+{
+    TheTest<v_int32>()
+        .test_loadstore()
+        .test_interleave()
+        .test_expand()
+        .test_addsub()
+        .test_mul()
+        .test_abs()
+        .test_cmp()
+        .test_popcount()
+        .test_shift<1>().test_shift<8>()
+        .test_logic()
+        .test_min_max()
+        .test_absdiff()
+        .test_reduce()
+        .test_mask()
+        .test_pack<1>().test_pack<2>().test_pack<15>().test_pack<32>()
+        .test_unpack()
+        .test_extract<0>().test_extract<1>().test_extract<2>().test_extract<3>()
+        .test_rotate<0>().test_rotate<1>().test_rotate<2>().test_rotate<3>()
+        .test_float_cvt32()
+        .test_float_cvt64()
+        .test_transpose()
+        ;
+}
+
+//============= 64-bit integer =====================================================================
+
+// Runs the reduced TheTest<v_uint64> chain: loadstore, addsub, shift<1>/<8>, logic,
+// and extract/rotate with template arguments 0 and 1.
+void test_hal_intrin_uint64()
+{
+    TheTest<v_uint64>()
+        .test_loadstore()
+        .test_addsub()
+        .test_shift<1>().test_shift<8>()
+        .test_logic()
+        .test_extract<0>().test_extract<1>()
+        .test_rotate<0>().test_rotate<1>()
+        ;
+}
+
+// Runs the same reduced chain as test_hal_intrin_uint64, instantiated for v_int64.
+void test_hal_intrin_int64()
+{
+    TheTest<v_int64>()
+        .test_loadstore()
+        .test_addsub()
+        .test_shift<1>().test_shift<8>()
+        .test_logic()
+        .test_extract<0>().test_extract<1>()
+        .test_rotate<0>().test_rotate<1>()
+        ;
+}
+
+//============= Floating point =====================================================================
+// Runs the TheTest<v_float32> check chain (arithmetic, div, sqrt_abs, float_math,
+// float_cvt64, matmul, transpose, reduce_sum4, ...).
+// When CV_SIMD256 is set, a second chain runs extract/rotate with template arguments 4..7.
+void test_hal_intrin_float32()
+{
+    TheTest<v_float32>()
+        .test_loadstore()
+        .test_interleave()
+        .test_interleave_2channel()
+        .test_addsub()
+        .test_mul()
+        .test_div()
+        .test_cmp()
+        .test_sqrt_abs()
+        .test_min_max()
+        .test_float_absdiff()
+        .test_reduce()
+        .test_mask()
+        .test_unpack()
+        .test_float_math()
+        .test_float_cvt64()
+        .test_matmul()
+        .test_transpose()
+        .test_reduce_sum4()
+        .test_extract<0>().test_extract<1>().test_extract<2>().test_extract<3>()
+        .test_rotate<0>().test_rotate<1>().test_rotate<2>().test_rotate<3>()
+        ;
+
+#if CV_SIMD256
+    TheTest<v_float32>()
+        .test_extract<4>().test_extract<5>().test_extract<6>().test_extract<7>()
+        .test_rotate<4>().test_rotate<5>().test_rotate<6>().test_rotate<7>()
+        ;
+#endif
+}
+
+// Runs the TheTest<v_float64> check chain. The whole body is compiled only when
+// CV_SIMD_64F is set; otherwise the function does nothing.
+// When CV_SIMD256 is also set, a second chain runs extract/rotate with template
+// arguments 2 and 3.
+void test_hal_intrin_float64()
+{
+#if CV_SIMD_64F
+    TheTest<v_float64>()
+        .test_loadstore()
+        .test_addsub()
+        .test_mul()
+        .test_div()
+        .test_cmp()
+        .test_sqrt_abs()
+        .test_min_max()
+        .test_float_absdiff()
+        .test_mask()
+        .test_unpack()
+        .test_float_math()
+        .test_float_cvt32()
+        .test_extract<0>().test_extract<1>()
+        .test_rotate<0>().test_rotate<1>()
+        ;
+
+#if CV_SIMD256
+    TheTest<v_float64>()
+        .test_extract<2>().test_extract<3>()
+        .test_rotate<2>().test_rotate<3>()
+        ;
+#endif //CV_SIMD256
+
+#endif
+}
+
+// FP16 load/store and conversion checks on TheTest<v_float16>; defined only when
+// CV_FP16 is set and CV_SIMD_WIDTH is greater than 16.
+// NOTE(review): this function has no forward declaration in the list at the top of
+// this header - confirm it is declared elsewhere for declarations-only includes.
+#if CV_FP16 && CV_SIMD_WIDTH > 16
+void test_hal_intrin_float16()
+{
+    TheTest<v_float16>()
+        .test_loadstore_fp16()
+        .test_float_cvt_fp16()
+        ;
+}
+#endif
+
+#endif //CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY
+
+CV_CPU_OPTIMIZATION_NAMESPACE_END
+
+}} //namespace
\ No newline at end of file
--- a/modules/core/test/test_intrin_utils.hpp
+++ b/modules/core/test/test_intrin_utils.hpp
--- a/modules/core/test/test_rand.cpp
+++ b/modules/core/test/test_rand.cpp
@@ -173,7 +173,6 @@ void Core_RandTest::run( int )
                dsz = slice+1 < maxSlice ? (int)(cvtest::randInt(rng) % (SZ - sz) + 1) : SZ - sz;
                Mat aslice = arr[k].colRange(sz, sz + dsz);
                tested_rng.fill(aslice, dist_type, A, B);
-                //printf("%d - %d\n", sz, sz + dsz);
            }
        }


--- a/modules/dnn/CMakeLists.txt
+++ b/modules/dnn/CMakeLists.txt
@@ -85,12 +85,6 @@ else()
  set(sources_options EXCLUDE_OPENCL)
 endif()

-if(HAVE_INF_ENGINE)
-  add_definitions(-DHAVE_INF_ENGINE=1)
-  list(APPEND include_dirs ${INF_ENGINE_INCLUDE_DIRS})
-  list(APPEND libs ${INF_ENGINE_LIBRARIES})
-endif()
-
 ocv_module_include_directories(${include_dirs})
 if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
  ocv_append_source_files_cxx_compiler_options(fw_srcs "-Wno-suggest-override")  # GCC
@@ -98,9 +92,9 @@ elseif(CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
  ocv_append_source_files_cxx_compiler_options(fw_srcs "-Wno-inconsistent-missing-override")  # Clang
 endif()
 ocv_glob_module_sources(${sources_options} SOURCES ${fw_srcs})
-ocv_create_module(${libs})
+ocv_create_module(${libs} ${INF_ENGINE_TARGET})
 ocv_add_samples()
-ocv_add_accuracy_tests()
+ocv_add_accuracy_tests(${INF_ENGINE_TARGET})
 ocv_add_perf_tests()

 ocv_option(${the_module}_PERF_CAFFE "Add performance tests of Caffe framework" OFF)
@@ -120,9 +114,3 @@ if(BUILD_PERF_TESTS)
    endif()
  endif()
 endif()
-
-# Test Intel's Inference Engine models
-if(HAVE_INF_ENGINE AND TARGET opencv_test_dnn)
-  ocv_target_include_directories(opencv_test_dnn PRIVATE ${INF_ENGINE_INCLUDE_DIRS})
-  ocv_target_link_libraries(opencv_test_dnn LINK_PRIVATE ${INF_ENGINE_LIBRARIES})
-endif()
--- a/modules/dnn/include/opencv2/dnn/dnn.hpp
+++ b/modules/dnn/include/opencv2/dnn/dnn.hpp
@@ -201,7 +201,7 @@ CV__DNN_EXPERIMENTAL_NS_BEGIN
         *  @param[out] outputs allocated output blobs, which will store results of the computation.
         *  @param[out] internals allocated internal blobs
         */
-        virtual void forward(InputArrayOfArrays inputs, OutputArrayOfArrays outputs, OutputArrayOfArrays internals) = 0;
+        virtual void forward(InputArrayOfArrays inputs, OutputArrayOfArrays outputs, OutputArrayOfArrays internals);

        /** @brief Given the @p input blobs, computes the output @p blobs.
         *  @param[in]  inputs  the input blobs.

--- a/modules/dnn/include/opencv2/dnn/shape_utils.hpp
+++ b/modules/dnn/include/opencv2/dnn/shape_utils.hpp
@@ -44,7 +44,9 @@

 #include <opencv2/core.hpp>
 #include <opencv2/core/types_c.h>
+#include <iostream>
 #include <ostream>
+#include <sstream>

 namespace cv {
 namespace dnn {
@@ -178,13 +180,25 @@ static inline MatShape concat(const MatShape& a, const MatShape& b)
    return c;
 }

-inline void print(const MatShape& shape, const String& name = "")
+static inline std::string toString(const MatShape& shape, const String& name = "")  // yields "<name> [ d0 d1 ... ]"
 {
-    printf("%s: [", name.c_str());
-    size_t i, n = shape.size();
-    for( i = 0; i < n; i++ )
-        printf(" %d", shape[i]);
-    printf(" ]\n");
+    std::ostringstream ss;
+    if (!name.empty())
+        ss << name << ' ';  // the "<name> " prefix is written only for a non-empty name
+    ss << '[';
+    for(size_t i = 0, n = shape.size(); i < n; ++i)
+        ss << ' ' << shape[i];  // each dimension is preceded by one space
+    ss << " ]";  // an empty shape therefore renders as "[ ]"
+    return ss.str();
+}
+static inline void print(const MatShape& shape, const String& name = "")
+{
+    std::cout << toString(shape, name) << std::endl;  // same text as toString(), plus a newline, on stdout
+}
+static inline std::ostream& operator<<(std::ostream &out, const MatShape& shape)
+{
+    out << toString(shape);  // no name prefix and no trailing newline
+    return out;
 }

 inline int clamp(int ax, int dims)

--- a/modules/dnn/src/dnn.cpp
+++ b/modules/dnn/src/dnn.cpp
@@ -74,6 +74,10 @@ static int PARAM_DNN_BACKEND_DEFAULT = (int)utils::getConfigurationParameterSize
 #endif
 );

+// Additional checks (slow down execution!)
+static bool DNN_CHECK_NAN_INF = utils::getConfigurationParameterBool("OPENCV_DNN_CHECK_NAN_INF", false);
+static bool DNN_CHECK_NAN_INF_DUMP = utils::getConfigurationParameterBool("OPENCV_DNN_CHECK_NAN_INF_DUMP", false);
+static bool DNN_CHECK_NAN_INF_RAISE_ERROR = utils::getConfigurationParameterBool("OPENCV_DNN_CHECK_NAN_INF_RAISE_ERROR", false);

 using std::vector;
 using std::map;
@@ -2053,10 +2057,75 @@ struct Net::Impl
            {
                if (preferableBackend == DNN_BACKEND_OPENCV && IS_DNN_OPENCL_TARGET(preferableTarget))
                {
+                    std::vector<UMat> umat_inputBlobs = OpenCLBackendWrapper::getUMatVector(ld.inputBlobsWrappers);
                    std::vector<UMat> umat_outputBlobs = OpenCLBackendWrapper::getUMatVector(ld.outputBlobsWrappers);
-                    layer->forward(OpenCLBackendWrapper::getUMatVector(ld.inputBlobsWrappers),
+                    std::vector<UMat> umat_internalBlobs = OpenCLBackendWrapper::getUMatVector(ld.internalBlobsWrappers);
+                    layer->forward(umat_inputBlobs,
                                   umat_outputBlobs,
-                                   OpenCLBackendWrapper::getUMatVector(ld.internalBlobsWrappers));
+                                   umat_internalBlobs);
+                    if (DNN_CHECK_NAN_INF)
+                    {
+                        bool fail = false;
+                        for (size_t i = 0; i < umat_outputBlobs.size(); ++i)
+                        {
+                            UMat& u = umat_outputBlobs[i];
+                            Mat m;
+                            if (u.depth() == CV_16S) // FP16
+                                convertFp16(u, m);
+                            else
+                                m = u.getMat(ACCESS_READ);
+                            if (!checkRange(m))  // NOTE(review): with default bounds cv::checkRange is documented to reject +/-Inf as well as NaN, so this branch may not be NaN-only - confirm
+                            {
+                                std::cerr << "WARNING: NaN detected in layer output: id=" << ld.id << " name=" << layer->name << std::endl;
+                                std::cerr << "output id=" << i << " output shape=" << shape(m) << std::endl;
+                                fail = true;
+                            }
+                            else if (!checkRange(m, true, NULL, -1e6, 1e6))  // NOTE(review): this is a bounds test against -1e6..1e6, so finite large values presumably land here although the message says "Inf"
+                            {
+                                std::cerr << "WARNING: Inf detected in layer output: id=" << ld.id << " name=" << layer->name << std::endl;
+                                std::cerr << "output id=" << i << " output shape=" << shape(m) << std::endl;
+                                fail = true;
+                            }
+                        }
+                        if (fail)
+                        {
+                            for (size_t i = 0; i < umat_inputBlobs.size(); ++i)
+                            {
+                                UMat& u = umat_inputBlobs[i];
+                                Mat m;
+                                if (u.depth() == CV_16S) // FP16
+                                    convertFp16(u, m);
+                                else
+                                    m = u.getMat(ACCESS_READ);
+                                std::cout << "INPUT " << i << " " << cv::typeToString(u.type()) << " " << shape(m) << std::endl;
+                                if (DNN_CHECK_NAN_INF_DUMP) std::cout << m.reshape(1, 1) << std::endl;
+                            }
+                            for (size_t i = 0; i < umat_outputBlobs.size(); ++i)
+                            {
+                                UMat& u = umat_outputBlobs[i];
+                                Mat m;
+                                if (u.depth() == CV_16S) // FP16
+                                    convertFp16(u, m);
+                                else
+                                    m = u.getMat(ACCESS_READ);
+                                std::cout << "OUTPUT " << i << " " << cv::typeToString(u.type()) << " " << shape(m) << std::endl;
+                                if (DNN_CHECK_NAN_INF_DUMP) std::cout << m.reshape(1, 1) << std::endl;
+                            }
+                            for (size_t i = 0; i < umat_internalBlobs.size(); ++i)  // dump internal blobs in the same "<KIND> <index> <type> <shape>" header format as the INPUT/OUTPUT loops
+                            {
+                                UMat& u = umat_internalBlobs[i];
+                                Mat m;
+                                if (u.depth() == CV_16S) // FP16
+                                    convertFp16(u, m);
+                                else
+                                    m = u.getMat(ACCESS_READ);
+                                std::cout << "INTERNAL " << i << " " << cv::typeToString(u.type()) << " " << shape(m) << std::endl;  // type is reported even when the data dump is disabled
+                                if (DNN_CHECK_NAN_INF_DUMP) std::cout << m.reshape(1, 1) << std::endl;
+                            }
+                            if (DNN_CHECK_NAN_INF_RAISE_ERROR)
+                                CV_Assert(!fail);
+                        }
+                    }
                    OpenCLBackendWrapper::update(ld.outputBlobsWrappers, umat_outputBlobs);
                }
                else
@@ -2069,6 +2138,56 @@ struct Net::Impl

                    layer->forward(ld.inputBlobs, ld.outputBlobs, ld.internals);

+                    if (DNN_CHECK_NAN_INF)
+                    {
+                        bool fail = false;
+                        for (size_t i = 0; i < ld.outputBlobs.size(); ++i)
+                        {
+                            const Mat& m = ld.outputBlobs[i];
+                            if (!checkRange(m))  // NOTE(review): with default bounds cv::checkRange is documented to reject +/-Inf as well as NaN, so this branch may not be NaN-only - confirm
+                            {
+                                std::cerr << "WARNING: NaN detected in layer output: id=" << ld.id << " name=" << layer->name << std::endl;
+                                std::cerr << "output id=" << i << " output shape=" << shape(m) << std::endl;
+                                fail = true;
+                            }
+                            else if (!checkRange(m, true, NULL, -1e6, 1e6))  // NOTE(review): this is a bounds test against -1e6..1e6, so finite large values presumably land here although the message says "Inf"
+                            {
+                                std::cerr << "WARNING: Inf detected in layer output: id=" << ld.id << " name=" << layer->name << std::endl;
+                                std::cerr << "output id=" << i << " output shape=" << shape(m) << std::endl;
+                                fail = true;
+                            }
+                        }
+                        if (fail)
+                        {
+                            for (size_t i = 0; i < ld.inputBlobs.size(); ++i)
+                            {
+                                const Mat* pM = ld.inputBlobs[i];
+                                if (!pM)
+                                {
+                                    std::cout << "INPUT " << i << " is NULL" << std::endl;
+                                    continue;
+                                }
+                                const Mat& m = *pM;
+                                std::cout << "INPUT " << i << " " << cv::typeToString(m.type()) << " " << shape(m) << std::endl;
+                                if (DNN_CHECK_NAN_INF_DUMP) std::cout << m.reshape(1, 1) << std::endl;
+                            }
+                            for (size_t i = 0; i < ld.outputBlobs.size(); ++i)
+                            {
+                                const Mat& m = ld.outputBlobs[i];
+                                std::cout << "OUTPUT " << i << " " << cv::typeToString(m.type()) << " " << shape(m) << std::endl;
+                                if (DNN_CHECK_NAN_INF_DUMP) std::cout << m.reshape(1, 1) << std::endl;
+                            }
+                            for (size_t i = 0; i < ld.internals.size(); ++i)
+                            {
+                                const Mat& m = ld.internals[i];
+                                std::cout << "INTERNAL " << i << " " << cv::typeToString(m.type()) << " " << shape(m) << std::endl;
+                                if (DNN_CHECK_NAN_INF_DUMP) std::cout << m.reshape(1, 1) << std::endl;
+                            }
+                            if (DNN_CHECK_NAN_INF_RAISE_ERROR)
+                                CV_Assert(!fail);
+                        }
+                    }
+
                    for (int i = 0, n = ld.outputBlobsWrappers.size(); i < n; ++i)
                    {
                        if (!ld.outputBlobsWrappers[i].empty())
@@ -3071,6 +3190,14 @@ std::vector<Mat> Layer::finalize(const std::vector<Mat> &inputs)
    return outputs;
 }

+void Layer::forward(InputArrayOfArrays inputs, OutputArrayOfArrays outputs, OutputArrayOfArrays internals)
+{
+    CV_TRACE_FUNCTION();
+    CV_TRACE_ARG_VALUE(name, "name", name.c_str());
+
+    Layer::forward_fallback(inputs, outputs, internals);  // base-class implementation: pass all three arrays straight to forward_fallback()
+}
+
 void Layer::forward_fallback(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr)
 {
    CV_TRACE_FUNCTION();

--- a/modules/dnn/src/layers/detection_output_layer.cpp
+++ b/modules/dnn/src/layers/detection_output_layer.cpp
@@ -196,7 +196,7 @@ public:
    virtual bool supportBackend(int backendId) CV_OVERRIDE
    {
        return backendId == DNN_BACKEND_OPENCV ||
-               backendId == DNN_BACKEND_INFERENCE_ENGINE && haveInfEngine() && !_locPredTransposed;
+               backendId == DNN_BACKEND_INFERENCE_ENGINE && !_locPredTransposed && _bboxesNormalized;  // && binds tighter than ||: both flags gate only the Inference Engine backend
    }

    bool getMemoryShapes(const std::vector<MatShape> &inputs,
@@ -411,9 +411,12 @@ public:
        CV_TRACE_FUNCTION();
        CV_TRACE_ARG_VALUE(name, "name", name.c_str());

-        CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget) &&
-                   OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()),
-                   forward_ocl(inputs_arr, outputs_arr, internals_arr))
+        if (_bboxesNormalized)
+        {
+            CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget) &&
+                       OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()),
+                       forward_ocl(inputs_arr, outputs_arr, internals_arr))
+        }

        Layer::forward_fallback(inputs_arr, outputs_arr, internals_arr);
    }
@@ -916,6 +919,7 @@ public:
        ieLayer->params["nms_threshold"] = format("%f", _nmsThreshold);
        ieLayer->params["top_k"] = format("%d", _topK);
        ieLayer->params["keep_top_k"] = format("%d", _keepTopK);
+        ieLayer->params["eta"] = "1.0";
        ieLayer->params["confidence_threshold"] = format("%f", _confidenceThreshold);
        ieLayer->params["variance_encoded_in_target"] = _varianceEncodedInTarget ? "1" : "0";
        ieLayer->params["code_type"] = "caffe.PriorBoxParameter." + _codeType;

--- a/modules/dnn/src/layers/pooling_layer.cpp
+++ b/modules/dnn/src/layers/pooling_layer.cpp
@@ -135,10 +135,17 @@ public:

    virtual bool supportBackend(int backendId) CV_OVERRIDE
    {
-        return backendId == DNN_BACKEND_OPENCV ||
-               backendId == DNN_BACKEND_HALIDE && haveHalide() &&
-               (type == MAX || type == AVE && !pad.width && !pad.height) ||
-               backendId == DNN_BACKEND_INFERENCE_ENGINE && haveInfEngine() && (type == MAX || type == AVE);
+        if (backendId == DNN_BACKEND_INFERENCE_ENGINE)
+        {
+            if (preferableTarget == DNN_TARGET_MYRIAD)
+                return type == MAX || type == AVE;  // Myriad target: only MAX and AVE pooling
+            else
+                return type != STOCHASTIC;  // other Inference Engine targets: every type except STOCHASTIC
+        }
+        else
+            return backendId == DNN_BACKEND_OPENCV ||
+                   backendId == DNN_BACKEND_HALIDE && haveHalide() &&
+                   (type == MAX || type == AVE && !pad.width && !pad.height);  // Halide: MAX always, AVE only when both paddings are zero
    }

 #ifdef HAVE_OPENCL
@@ -192,8 +199,11 @@ public:
        CV_TRACE_FUNCTION();
        CV_TRACE_ARG_VALUE(name, "name", name.c_str());

-        CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget),
-                   forward_ocl(inputs_arr, outputs_arr, internals_arr))
+        if (type == MAX || type == AVE || type == STOCHASTIC)
+        {
+            CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget),
+                       forward_ocl(inputs_arr, outputs_arr, internals_arr))
+        }

        Layer::forward_fallback(inputs_arr, outputs_arr, internals_arr);
    }
@@ -238,22 +248,41 @@ public:
 #ifdef HAVE_INF_ENGINE
        InferenceEngine::LayerParams lp;
        lp.name = name;
-        lp.type = "Pooling";
        lp.precision = InferenceEngine::Precision::FP32;
-        std::shared_ptr<InferenceEngine::PoolingLayer> ieLayer(new InferenceEngine::PoolingLayer(lp));
-
-        ieLayer->_kernel_x = kernel.width;
-        ieLayer->_kernel_y = kernel.height;
-        ieLayer->_stride_x = stride.width;
-        ieLayer->_stride_y = stride.height;
-        ieLayer->_padding_x = pad.width;
-        ieLayer->_padding_y = pad.height;
-        ieLayer->_exclude_pad = type == AVE && padMode == "SAME";
-        ieLayer->params["rounding-type"] = ceilMode ? "ceil" : "floor";
-        if (type == MAX)
-            ieLayer->_type = InferenceEngine::PoolingLayer::PoolType::MAX;
-        else if (type == AVE)
-            ieLayer->_type = InferenceEngine::PoolingLayer::PoolType::AVG;
+
+        std::shared_ptr<InferenceEngine::CNNLayer> ieLayer;
+        if (type == MAX || type == AVE)
+        {
+            lp.type = "Pooling";
+            InferenceEngine::PoolingLayer* poolLayer = new InferenceEngine::PoolingLayer(lp);
+            poolLayer->_kernel_x = kernel.width;
+            poolLayer->_kernel_y = kernel.height;
+            poolLayer->_stride_x = stride.width;
+            poolLayer->_stride_y = stride.height;
+            poolLayer->_padding_x = pad.width;
+            poolLayer->_padding_y = pad.height;
+            poolLayer->_exclude_pad = type == AVE && padMode == "SAME";
+            poolLayer->params["rounding-type"] = ceilMode ? "ceil" : "floor";
+            poolLayer->_type = type == MAX ? InferenceEngine::PoolingLayer::PoolType::MAX :
+                                             InferenceEngine::PoolingLayer::PoolType::AVG;
+            ieLayer = std::shared_ptr<InferenceEngine::CNNLayer>(poolLayer);
+        }
+        else if (type == ROI)
+        {
+            lp.type = "ROIPooling";
+            ieLayer = std::shared_ptr<InferenceEngine::CNNLayer>(new InferenceEngine::CNNLayer(lp));
+            ieLayer->params["pooled_w"] = format("%d", pooledSize.width);
+            ieLayer->params["pooled_h"] = format("%d", pooledSize.height);
+            ieLayer->params["spatial_scale"] = format("%f", spatialScale);
+        }
+        else if (type == PSROI)
+        {
+            lp.type = "PSROIPooling";
+            ieLayer = std::shared_ptr<InferenceEngine::CNNLayer>(new InferenceEngine::CNNLayer(lp));
+            ieLayer->params["output_dim"] = format("%d", psRoiOutChannels);
+            ieLayer->params["group_size"] = format("%d", pooledSize.width);
+            ieLayer->params["spatial_scale"] = format("%f", spatialScale);
+        }
        else
            CV_Error(Error::StsNotImplemented, "Unsupported pooling type");


--- a/modules/dnn/src/layers/proposal_layer.cpp
+++ b/modules/dnn/src/layers/proposal_layer.cpp
@@ -6,6 +6,7 @@
 // Third party copyrights are property of their respective owners.
 #include "../precomp.hpp"
 #include "layers_common.hpp"
+#include "../op_inf_engine.hpp"

 namespace cv { namespace dnn {

@@ -16,14 +17,14 @@ public:
    {
        setParamsFrom(params);

-        uint32_t featStride = params.get<uint32_t>("feat_stride", 16);
-        uint32_t baseSize = params.get<uint32_t>("base_size", 16);
+        featStride = params.get<uint32_t>("feat_stride", 16);
+        baseSize = params.get<uint32_t>("base_size", 16);
        // uint32_t minSize = params.get<uint32_t>("min_size", 16);
-        uint32_t keepTopBeforeNMS = params.get<uint32_t>("pre_nms_topn", 6000);
+        keepTopBeforeNMS = params.get<uint32_t>("pre_nms_topn", 6000);
        keepTopAfterNMS = params.get<uint32_t>("post_nms_topn", 300);
-        float nmsThreshold = params.get<float>("nms_thresh", 0.7);
-        DictValue ratios = params.get("ratio");
-        DictValue scales = params.get("scale");
+        nmsThreshold = params.get<float>("nms_thresh", 0.7);
+        ratios = params.get("ratio");
+        scales = params.get("scale");

        {
            LayerParams lp;
@@ -83,6 +84,12 @@ public:
        }
    }

+    virtual bool supportBackend(int backendId) CV_OVERRIDE
+    {
+        return backendId == DNN_BACKEND_OPENCV ||
+               backendId == DNN_BACKEND_INFERENCE_ENGINE && preferableTarget != DNN_TARGET_MYRIAD;  // && binds tighter than ||: the Myriad exclusion applies only to the Inference Engine backend
+    }
+
    bool getMemoryShapes(const std::vector<MatShape> &inputs,
                         const int requiredOutputs,
                         std::vector<MatShape> &outputs,
@@ -312,6 +319,38 @@ public:
                outputs[i].rowRange(numDets, keepTopAfterNMS).setTo(0);
    }

+    virtual Ptr<BackendNode> initInfEngine(const std::vector<Ptr<BackendWrapper> >&) CV_OVERRIDE
+    {
+#ifdef HAVE_INF_ENGINE
+        InferenceEngine::LayerParams lp;
+        lp.name = name;
+        lp.type = "Proposal";
+        lp.precision = InferenceEngine::Precision::FP32;
+        std::shared_ptr<InferenceEngine::CNNLayer> ieLayer(new InferenceEngine::CNNLayer(lp));  // generic CNNLayer; all settings are passed as string params below
+
+        ieLayer->params["base_size"] = format("%d", baseSize);
+        ieLayer->params["feat_stride"] = format("%d", featStride);
+        ieLayer->params["min_size"] = "16";  // fixed value: the constructor's read of "min_size" is commented out
+        ieLayer->params["nms_thresh"] = format("%f", nmsThreshold);
+        ieLayer->params["post_nms_topn"] = format("%d", keepTopAfterNMS);
+        ieLayer->params["pre_nms_topn"] = format("%d", keepTopBeforeNMS);
+        if (ratios.size())
+        {
+            ieLayer->params["ratio"] = format("%f", ratios.get<float>(0));
+            for (int i = 1; i < ratios.size(); ++i)
+                ieLayer->params["ratio"] += format(",%f", ratios.get<float>(i));  // builds a comma-separated list
+        }
+        if (scales.size())
+        {
+            ieLayer->params["scale"] = format("%f", scales.get<float>(0));
+            for (int i = 1; i < scales.size(); ++i)
+                ieLayer->params["scale"] += format(",%f", scales.get<float>(i));  // same comma-separated form as "ratio"
+        }
+        return Ptr<BackendNode>(new InfEngineBackendNode(ieLayer));
+#endif  // HAVE_INF_ENGINE
+        return Ptr<BackendNode>();  // result when HAVE_INF_ENGINE is not defined: an empty node
+    }
+
 private:
    // A first half of channels are background scores. We need only a second one.
    static Mat getObjectScores(const Mat& m)
@@ -342,8 +381,10 @@ private:

    Ptr<PermuteLayer> deltasPermute;
    Ptr<PermuteLayer> scoresPermute;
-    uint32_t keepTopAfterNMS;
+    uint32_t keepTopBeforeNMS, keepTopAfterNMS, featStride, baseSize;
    Mat fakeImageBlob;
+    float nmsThreshold;
+    DictValue ratios, scales;
 #ifdef HAVE_OPENCL
    UMat umat_fakeImageBlob;
 #endif

--- a/modules/dnn/src/ocl4dnn/src/ocl4dnn_pool.cpp
+++ b/modules/dnn/src/ocl4dnn/src/ocl4dnn_pool.cpp
@@ -183,8 +183,9 @@ bool OCL4DNNPool<Dtype>::Forward(const UMat& bottom,
            ocl::Kernel oclk_sto_pool_forward(
                kname.c_str(),
                ocl::dnn::ocl4dnn_pooling_oclsrc,
-                format("-D KERNEL_STO_POOL=1 -D KERNEL_W=%d -D KERNEL_H=%d"
+                format(" -D Dtype=%s -D KERNEL_STO_POOL=1 -D KERNEL_W=%d -D KERNEL_H=%d"
                       " -D STRIDE_W=%d -D STRIDE_H=%d",
+                       (use_half) ? "half" : "float",
                       kernel_w_, kernel_h_,
                       stride_w_, stride_h_
                ));

--- a/modules/dnn/src/op_inf_engine.cpp
+++ b/modules/dnn/src/op_inf_engine.cpp
@@ -322,12 +322,32 @@ InferenceEngine::StatusCode InfEngineBackendNet::setBatchSize(const size_t) noex
    return InferenceEngine::StatusCode::OK;
 }

+InferenceEngine::StatusCode InfEngineBackendNet::setBatchSize(size_t size, InferenceEngine::ResponseDesc *responseDesc) noexcept  // Stub overload: neither argument is read.
+{
+    CV_Error(Error::StsNotImplemented, "");  // NOTE(review): presumably throws; in a noexcept function that would terminate the process -- confirm this is intended
+    return InferenceEngine::StatusCode::OK;  // presumably unreachable if the error call throws; keeps the non-void function well-formed
+}
+
 size_t InfEngineBackendNet::getBatchSize() const noexcept
 {
    CV_Error(Error::StsNotImplemented, "");
    return 0;
 }

+#if INF_ENGINE_VER_MAJOR_GT(INF_ENGINE_RELEASE_2018R2)  // these two stubs are compiled only for releases newer than 2018R2
+InferenceEngine::StatusCode InfEngineBackendNet::AddExtension(const InferenceEngine::IShapeInferExtensionPtr &extension, InferenceEngine::ResponseDesc *resp) noexcept  // Stub: arguments are not read.
+{
+    CV_Error(Error::StsNotImplemented, "");  // NOTE(review): presumably throws inside a noexcept function -- confirm this is intended
+    return InferenceEngine::StatusCode::OK;
+}
+
+InferenceEngine::StatusCode InfEngineBackendNet::reshape(const InferenceEngine::ICNNNetwork::InputShapes &inputShapes, InferenceEngine::ResponseDesc *resp) noexcept  // Stub: arguments are not read.
+{
+    CV_Error(Error::StsNotImplemented, "");  // NOTE(review): same noexcept concern as in AddExtension above
+    return InferenceEngine::StatusCode::OK;
+}
+#endif
+
 void InfEngineBackendNet::init(int targetId)
 {
    if (inputs.empty())

--- a/modules/dnn/src/op_inf_engine.hpp
+++ b/modules/dnn/src/op_inf_engine.hpp
@@ -9,6 +9,8 @@
 #define __OPENCV_DNN_OP_INF_ENGINE_HPP__

 #include "opencv2/core/cvdef.h"
+#include "opencv2/core/cvstd.hpp"
+#include "opencv2/dnn.hpp"

 #ifdef HAVE_INF_ENGINE
 #if defined(__GNUC__) && __GNUC__ >= 5
@@ -19,6 +21,17 @@
 #if defined(__GNUC__) && __GNUC__ >= 5
 //#pragma GCC diagnostic pop
 #endif
+
+#define INF_ENGINE_RELEASE_2018R1 2018010000  // release codes: appear to be year, two-digit release number, then four zero digits
+#define INF_ENGINE_RELEASE_2018R2 2018020000
+
+#ifndef INF_ENGINE_RELEASE  // no release code supplied by the build: fall back to 2018R2
+#warning("IE version has not been provided via command-line. Using 2018R2 by default")
+#define INF_ENGINE_RELEASE INF_ENGINE_RELEASE_2018R2
+#endif
+
+#define INF_ENGINE_VER_MAJOR_GT(ver) (((INF_ENGINE_RELEASE) / 10000) > ((ver) / 10000))  // true when the configured release, with its last four digits dropped, is greater than `ver` treated the same way
+
 #endif  // HAVE_INF_ENGINE

 namespace cv { namespace dnn {
@@ -86,8 +99,15 @@ public:

    virtual InferenceEngine::StatusCode setBatchSize(const size_t size) noexcept CV_OVERRIDE;

+    virtual InferenceEngine::StatusCode setBatchSize(size_t size, InferenceEngine::ResponseDesc* responseDesc) noexcept;
+
    virtual size_t getBatchSize() const noexcept CV_OVERRIDE;

+#if INF_ENGINE_VER_MAJOR_GT(INF_ENGINE_RELEASE_2018R2)
+    virtual InferenceEngine::StatusCode AddExtension(const InferenceEngine::IShapeInferExtensionPtr& extension, InferenceEngine::ResponseDesc* resp) noexcept;
+    virtual InferenceEngine::StatusCode reshape(const InputShapes& inputShapes, InferenceEngine::ResponseDesc* resp) noexcept;
+#endif
+
    void init(int targetId);

    void addBlobs(const std::vector<Ptr<BackendWrapper> >& wrappers);

--- a/modules/dnn/src/opencl/ocl4dnn_pooling.cl
+++ b/modules/dnn/src/opencl/ocl4dnn_pooling.cl
@@ -104,7 +104,7 @@ __kernel void
 #elif defined KERNEL_AVE_POOL

 __kernel void TEMPLATE(ave_pool_forward, Dtype)(
-    const int nthreads, __global const Dtype* const bottom_data,
+    const int nthreads, __global const Dtype* bottom_data,
    const int channels, const int height, const int width,
    const int pooled_height, const int pooled_width,
    __global Dtype* top_data)
@@ -150,7 +150,7 @@ __kernel void TEMPLATE(ave_pool_forward, Dtype)(
 #elif defined KERNEL_STO_POOL

 __kernel void TEMPLATE(sto_pool_forward_test,Dtype)(
-    const int nthreads, __global const Dtype* const bottom_data,
+    const int nthreads, __global const Dtype* bottom_data,
    const int channels, const int height, const int width,
    const int pooled_height, const int pooled_width,
    __global Dtype* top_data)

--- a/modules/dnn/src/tensorflow/tf_importer.cpp
+++ b/modules/dnn/src/tensorflow/tf_importer.cpp
@@ -1293,7 +1293,13 @@ void TFImporter::populateNet(Net dstNet)
                    if (!next_layers.empty())
                    {
                        int maximumLayerIdx = next_layers[0].second;
-                        ExcludeLayer(net, maximumLayerIdx, 0, false);
+
+                        CV_Assert(net.node(maximumLayerIdx).input_size() == 2);
+
+                        // The input from the Mul layer can also be at index 1.
+                        int mulInputIdx = (net.node(maximumLayerIdx).input(0) == name) ? 0 : 1;
+
+                        ExcludeLayer(net, maximumLayerIdx, mulInputIdx, false);
                        layers_to_ignore.insert(next_layers[0].first);

                        layerParams.set("negative_slope", scaleMat.at<float>(0));

--- a/modules/dnn/src/torch/torch_importer.cpp
+++ b/modules/dnn/src/torch/torch_importer.cpp
@@ -938,6 +938,16 @@ struct TorchImporter
                layerParams.set("end", DictValue::arrayInt<int*>(&ends[0], 4));
                curModule->modules.push_back(newModule);
            }
+            else if (nnName == "SpatialUpSamplingNearest")
+            {
+                readTorchTable(scalarParams, tensorParams);
+                CV_Assert(scalarParams.has("scale_factor"));
+                int scale_factor = scalarParams.get<int>("scale_factor");
+                newModule->apiType = "Resize";
+                layerParams.set("interpolation", "nearest");
+                layerParams.set("zoom_factor", scale_factor);
+                curModule->modules.push_back(newModule);
+            }
            else
            {
                // Importer does not know how to map Torch's layer type to an OpenCV's one.

--- a/modules/dnn/test/test_backends.cpp
+++ b/modules/dnn/test/test_backends.cpp
@@ -175,7 +175,7 @@ TEST_P(DNNTestNetwork, MobileNet_SSD_v2_TensorFlow)
    Mat sample = imread(findDataFile("dnn/street.png", false));
    Mat inp = blobFromImage(sample, 1.0f / 127.5, Size(300, 300), Scalar(127.5, 127.5, 127.5), false);
    float l1 = (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD) ? 0.011 : 0.0;
-    float lInf = (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD) ? 0.06 : 0.0;
+    float lInf = (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD) ? 0.062 : 0.0;
    processNet("dnn/ssd_mobilenet_v2_coco_2018_03_29.pb", "dnn/ssd_mobilenet_v2_coco_2018_03_29.pbtxt",
               inp, "detection_out", "", l1, lInf, 0.25);
 }
@@ -233,11 +233,8 @@ TEST_P(DNNTestNetwork, opencv_face_detector)
 {
    if (backend == DNN_BACKEND_HALIDE)
        throw SkipTestException("");
-    Size inpSize;
-    if (backend == DNN_BACKEND_INFERENCE_ENGINE && target == DNN_TARGET_MYRIAD)
-        inpSize = Size(300, 300);
    Mat img = imread(findDataFile("gpu/lbpcascade/er.png", false));
-    Mat inp = blobFromImage(img, 1.0, inpSize, Scalar(104.0, 177.0, 123.0), false, false);
+    Mat inp = blobFromImage(img, 1.0, Size(), Scalar(104.0, 177.0, 123.0), false, false);
    processNet("dnn/opencv_face_detector.caffemodel", "dnn/opencv_face_detector.prototxt",
               inp, "detection_out");
 }
@@ -249,7 +246,7 @@ TEST_P(DNNTestNetwork, Inception_v2_SSD_TensorFlow)
    Mat sample = imread(findDataFile("dnn/street.png", false));
    Mat inp = blobFromImage(sample, 1.0f / 127.5, Size(300, 300), Scalar(127.5, 127.5, 127.5), false);
    float l1 = (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD) ? 0.008 : 0.0;
-    float lInf = (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD) ? 0.07 : 0.0;
+    float lInf = (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD) ? 0.0731 : 0.0;
    processNet("dnn/ssd_inception_v2_coco_2017_11_17.pb", "dnn/ssd_inception_v2_coco_2017_11_17.pbtxt",
               inp, "detection_out", "", l1, lInf);
 }

--- a/modules/dnn/test/test_caffe_importer.cpp
+++ b/modules/dnn/test/test_caffe_importer.cpp
@@ -51,6 +51,33 @@ static std::string _tf(TString filename)
    return (getOpenCVExtraDir() + "/dnn/") + filename;
 }

+class Test_Caffe_nets : public DNNTestLayer  // Fixture shared by the Caffe network tests; reads `backend` and `target` inherited from DNNTestLayer.
+{
+public:
+    void testFaster(const std::string& proto, const std::string& model, const Mat& ref,  // Runs the net given by proto/model (file names under "dnn/") on a fixed test image and compares its detections with `ref`.
+                    double scoreDiff = 0.0, double iouDiff = 0.0)  // a zero tolerance means "use the fixture default" (default_l1 / default_lInf)
+    {
+        checkBackend();
+        Net net = readNetFromCaffe(findDataFile("dnn/" + proto, false),
+                                   findDataFile("dnn/" + model, false));
+        net.setPreferableBackend(backend);
+        net.setPreferableTarget(target);
+        Mat img = imread(findDataFile("dnn/dog416.png", false));
+        resize(img, img, Size(800, 600));
+        Mat blob = blobFromImage(img, 1.0, Size(), Scalar(102.9801, 115.9465, 122.7717), false, false);
+        Mat imInfo = (Mat_<float>(1, 3) << img.rows, img.cols, 1.6f);  // rows, cols, and a third value that is presumably the image scale factor -- TODO confirm
+
+        net.setInput(blob, "data");
+        net.setInput(imInfo, "im_info");
+        // Output has shape 1x1xNx7, where N is the number of detections.
+        // Each detection is a vector of values [id, classId, confidence, left, top, right, bottom]
+        Mat out = net.forward();
+        scoreDiff = scoreDiff ? scoreDiff : default_l1;
+        iouDiff = iouDiff ? iouDiff : default_lInf;
+        normAssertDetections(ref, out, ("model name: " + model).c_str(), 0.8, scoreDiff, iouDiff);
+    }
+};
+
 TEST(Test_Caffe, memory_read)
 {
    const string proto = findDataFile("dnn/bvlc_googlenet.prototxt", false);
@@ -344,9 +371,15 @@ TEST(Reproducibility_GoogLeNet_fp16, Accuracy)
 }

 // https://github.com/richzhang/colorization
-TEST(Reproducibility_Colorization, Accuracy)
+TEST_P(Test_Caffe_nets, Colorization)
 {
-    const float l1 = 3e-5;
+    checkBackend();
+    if ((backend == DNN_BACKEND_INFERENCE_ENGINE && target == DNN_TARGET_OPENCL_FP16) ||
+        (backend == DNN_BACKEND_INFERENCE_ENGINE && target == DNN_TARGET_MYRIAD) ||
+        (backend == DNN_BACKEND_OPENCV && target == DNN_TARGET_OPENCL_FP16))
+        throw SkipTestException("");
+
+    const float l1 = 4e-4;
    const float lInf = 3e-3;

    Mat inp = blobFromNPY(_tf("colorization_inp.npy"));
@@ -356,7 +389,8 @@ TEST(Reproducibility_Colorization, Accuracy)
    const string proto = findDataFile("dnn/colorization_deploy_v2.prototxt", false);
    const string model = findDataFile("dnn/colorization_release_v2.caffemodel", false);
    Net net = readNetFromCaffe(proto, model);
-    net.setPreferableBackend(DNN_BACKEND_OPENCV);
+    net.setPreferableBackend(backend);
+    net.setPreferableTarget(target);

    net.getLayer(net.getLayerId("class8_ab"))->blobs.push_back(kernel);
    net.getLayer(net.getLayerId("conv8_313_rh"))->blobs.push_back(Mat(1, 313, CV_32F, 2.606));
@@ -447,39 +481,40 @@ INSTANTIATE_TEST_CASE_P(Test_Caffe, opencv_face_detector,
    )
 );

-TEST(Test_Caffe, FasterRCNN_and_RFCN)
+TEST_P(Test_Caffe_nets, FasterRCNN_vgg16)
 {
-    std::string models[] = {"VGG16_faster_rcnn_final.caffemodel", "ZF_faster_rcnn_final.caffemodel",
-                            "resnet50_rfcn_final.caffemodel"};
-    std::string protos[] = {"faster_rcnn_vgg16.prototxt", "faster_rcnn_zf.prototxt",
-                            "rfcn_pascal_voc_resnet50.prototxt"};
-    Mat refs[] = {(Mat_<float>(3, 7) << 0, 2, 0.949398, 99.2454, 210.141, 601.205, 462.849,
-                                        0, 7, 0.997022, 481.841, 92.3218, 722.685, 175.953,
-                                        0, 12, 0.993028, 133.221, 189.377, 350.994, 563.166),
-                  (Mat_<float>(3, 7) << 0, 2, 0.90121, 120.407, 115.83, 570.586, 528.395,
-                                        0, 7, 0.988779, 469.849, 75.1756, 718.64, 186.762,
-                                        0, 12, 0.967198, 138.588, 206.843, 329.766, 553.176),
-                  (Mat_<float>(2, 7) << 0, 7, 0.991359, 491.822, 81.1668, 702.573, 178.234,
-                                        0, 12, 0.94786, 132.093, 223.903, 338.077, 566.16)};
-    for (int i = 0; i < 3; ++i)
-    {
-        std::string proto = findDataFile("dnn/" + protos[i], false);
-        std::string model = findDataFile("dnn/" + models[i], false);
+    if ((backend == DNN_BACKEND_INFERENCE_ENGINE && target == DNN_TARGET_MYRIAD) ||
+        (backend == DNN_BACKEND_OPENCV && target == DNN_TARGET_OPENCL_FP16))
+        throw SkipTestException("");
+    static Mat ref = (Mat_<float>(3, 7) << 0, 2, 0.949398, 99.2454, 210.141, 601.205, 462.849,
+                                           0, 7, 0.997022, 481.841, 92.3218, 722.685, 175.953,
+                                           0, 12, 0.993028, 133.221, 189.377, 350.994, 563.166);
+    testFaster("faster_rcnn_vgg16.prototxt", "VGG16_faster_rcnn_final.caffemodel", ref);
+}

-        Net net = readNetFromCaffe(proto, model);
-        net.setPreferableBackend(DNN_BACKEND_OPENCV);
-        Mat img = imread(findDataFile("dnn/dog416.png", false));
-        resize(img, img, Size(800, 600));
-        Mat blob = blobFromImage(img, 1.0, Size(), Scalar(102.9801, 115.9465, 122.7717), false, false);
-        Mat imInfo = (Mat_<float>(1, 3) << img.rows, img.cols, 1.6f);
+TEST_P(Test_Caffe_nets, FasterRCNN_zf)
+{
+    if ((backend == DNN_BACKEND_INFERENCE_ENGINE && target == DNN_TARGET_OPENCL_FP16) ||
+        (backend == DNN_BACKEND_INFERENCE_ENGINE && target == DNN_TARGET_MYRIAD) ||
+        (backend == DNN_BACKEND_OPENCV && target == DNN_TARGET_OPENCL_FP16))
+        throw SkipTestException("");
+    static Mat ref = (Mat_<float>(3, 7) << 0, 2, 0.90121, 120.407, 115.83, 570.586, 528.395,
+                                           0, 7, 0.988779, 469.849, 75.1756, 718.64, 186.762,
+                                           0, 12, 0.967198, 138.588, 206.843, 329.766, 553.176);
+    testFaster("faster_rcnn_zf.prototxt", "ZF_faster_rcnn_final.caffemodel", ref);
+}

-        net.setInput(blob, "data");
-        net.setInput(imInfo, "im_info");
-        // Output has shape 1x1xNx7 where N - number of detections.
-        // An every detection is a vector of values [id, classId, confidence, left, top, right, bottom]
-        Mat out = net.forward();
-        normAssertDetections(refs[i], out, ("model name: " + models[i]).c_str(), 0.8);
-    }
+TEST_P(Test_Caffe_nets, RFCN)
+{
+    if ((backend == DNN_BACKEND_INFERENCE_ENGINE && target == DNN_TARGET_OPENCL_FP16) ||
+        (backend == DNN_BACKEND_INFERENCE_ENGINE && target == DNN_TARGET_MYRIAD) ||
+        (backend == DNN_BACKEND_OPENCV && target == DNN_TARGET_OPENCL_FP16))
+        throw SkipTestException("");
+    static Mat ref = (Mat_<float>(2, 7) << 0, 7, 0.991359, 491.822, 81.1668, 702.573, 178.234,
+                                           0, 12, 0.94786, 132.093, 223.903, 338.077, 566.16);
+    testFaster("rfcn_pascal_voc_resnet50.prototxt", "resnet50_rfcn_final.caffemodel", ref);
 }

+INSTANTIATE_TEST_CASE_P(/**/, Test_Caffe_nets, dnnBackendsAndTargets());
+
 }} // namespace
--- a/modules/dnn/test/test_halide_layers.cpp
+++ b/modules/dnn/test/test_halide_layers.cpp
@@ -16,7 +16,7 @@ using namespace cv;
 using namespace cv::dnn;
 using namespace testing;

-static void test(Mat& input, Net& net, int backendId, int targetId)
+static void test(Mat& input, Net& net, Backend backendId, Target targetId, bool skipCheck = false)
 {
    DNNTestLayer::checkBackend(backendId, targetId);
    randu(input, -1.0f, 1.0f);
@@ -29,16 +29,19 @@ static void test(Mat& input, Net& net, int backendId, int targetId)
    net.setPreferableTarget(targetId);
    Mat outputHalide = net.forward().clone();

+    if (skipCheck)
+        return;
+
    double l1, lInf;
    DNNTestLayer::getDefaultThresholds(backendId, targetId, &l1, &lInf);
    normAssert(outputDefault, outputHalide, "", l1, lInf);
 }

-static void test(LayerParams& params, Mat& input, int backendId, int targetId)
+static void test(LayerParams& params, Mat& input, Backend backendId, Target targetId, bool skipCheck = false)
 {
    Net net;
    net.addLayerToPrev(params.name, params.type, params);
-    test(input, net, backendId, targetId);
+    test(input, net, backendId, targetId, skipCheck);
 }

 static testing::internal::ParamGenerator<tuple<Backend, Target> > dnnBackendsAndTargetsWithHalide()
@@ -101,16 +104,17 @@ TEST_P(Convolution, Accuracy)
    Size pad = get<4>(GetParam());
    Size dilation = get<5>(GetParam());
    bool hasBias = get<6>(GetParam());
-    int backendId = get<0>(get<7>(GetParam()));
-    int targetId = get<1>(get<7>(GetParam()));
+    Backend backendId = get<0>(get<7>(GetParam()));
+    Target targetId = get<1>(get<7>(GetParam()));

    if (backendId == DNN_BACKEND_INFERENCE_ENGINE && targetId == DNN_TARGET_MYRIAD)
        throw SkipTestException("");

+    bool skipCheck = false;
    if (cvtest::skipUnstableTests && backendId == DNN_BACKEND_OPENCV &&
        (targetId == DNN_TARGET_OPENCL || targetId == DNN_TARGET_OPENCL_FP16) &&
        kernel == Size(3, 1) && stride == Size(1, 1) && pad == Size(0, 1))
-        throw SkipTestException("Skip unstable test");
+        skipCheck = true;

    int sz[] = {outChannels, inChannels / group, kernel.height, kernel.width};
    Mat weights(4, &sz[0], CV_32F);
@@ -139,7 +143,9 @@ TEST_P(Convolution, Accuracy)
    }
    int inpSz[] = {1, inChannels, inSize.height, inSize.width};
    Mat input(4, &inpSz[0], CV_32F);
-    test(lp, input, backendId, targetId);
+    test(lp, input, backendId, targetId, skipCheck);
+    if (skipCheck)
+        throw SkipTestException("Skip checks in unstable test");
 }

 INSTANTIATE_TEST_CASE_P(Layer_Test_Halide, Convolution, Combine(
@@ -171,8 +177,8 @@ TEST_P(Deconvolution, Accuracy)
    Size stride = Size(get<5>(GetParam())[0], get<5>(GetParam())[1]);
    Size adjPad = Size(get<5>(GetParam())[2], get<5>(GetParam())[3]);
    bool hasBias = get<6>(GetParam());
-    int backendId = get<0>(get<7>(GetParam()));
-    int targetId = get<1>(get<7>(GetParam()));
+    Backend backendId = get<0>(get<7>(GetParam()));
+    Target targetId = get<1>(get<7>(GetParam()));
    if (backendId == DNN_BACKEND_INFERENCE_ENGINE && targetId == DNN_TARGET_CPU &&
        dilation.width == 2 && dilation.height == 2)
        throw SkipTestException("");
@@ -235,8 +241,8 @@ TEST_P(LRN, Accuracy)
    float bias = get<2>(GetParam())[2];
    bool normBySize = get<3>(GetParam());
    std::string nrmType = get<4>(GetParam());
-    int backendId = get<0>(get<5>(GetParam()));
-    int targetId = get<1>(get<5>(GetParam()));
+    Backend backendId = get<0>(get<5>(GetParam()));
+    Target targetId = get<1>(get<5>(GetParam()));
    if (backendId == DNN_BACKEND_INFERENCE_ENGINE)
        throw SkipTestException("");

@@ -276,8 +282,8 @@ TEST_P(AvePooling, Accuracy)
    Size outSize = get<1>(GetParam());;  // Input size will be computed from parameters.
    Size kernel = get<2>(GetParam());
    Size stride = get<3>(GetParam());
-    int backendId = get<0>(get<4>(GetParam()));
-    int targetId = get<1>(get<4>(GetParam()));
+    Backend backendId = get<0>(get<4>(GetParam()));
+    Target targetId = get<1>(get<4>(GetParam()));
    if (backendId == DNN_BACKEND_INFERENCE_ENGINE && targetId == DNN_TARGET_MYRIAD)
        throw SkipTestException("");

@@ -317,8 +323,8 @@ TEST_P(MaxPooling, Accuracy)
    Size kernel = get<2>(GetParam());
    Size stride = get<3>(GetParam());
    Size pad = get<4>(GetParam());
-    int backendId = get<0>(get<5>(GetParam()));
-    int targetId = get<1>(get<5>(GetParam()));
+    Backend backendId = get<0>(get<5>(GetParam()));
+    Target targetId = get<1>(get<5>(GetParam()));

    LayerParams lp;
    lp.set("pool", "max");
@@ -355,8 +361,8 @@ TEST_P(FullyConnected, Accuracy)
    Size inSize = get<1>(GetParam());
    int outChannels = get<2>(GetParam());
    bool hasBias = get<3>(GetParam());
-    int backendId = get<0>(get<4>(GetParam()));
-    int targetId = get<1>(get<4>(GetParam()));
+    Backend backendId = get<0>(get<4>(GetParam()));
+    Target targetId = get<1>(get<4>(GetParam()));
    if (backendId == DNN_BACKEND_INFERENCE_ENGINE)
        throw SkipTestException("");

@@ -394,8 +400,8 @@ typedef TestWithParam<tuple<int,  tuple<Backend, Target> > > SoftMax;
 TEST_P(SoftMax, Accuracy)
 {
    int inChannels = get<0>(GetParam());
-    int backendId = get<0>(get<1>(GetParam()));
-    int targetId = get<1>(get<1>(GetParam()));
+    Backend backendId = get<0>(get<1>(GetParam()));
+    Target targetId = get<1>(get<1>(GetParam()));
    LayerParams lp;
    lp.type = "SoftMax";
    lp.name = "testLayer";
@@ -457,7 +463,7 @@ TEST_P(Test_Halide_layers, MaxPoolUnpool)
 ////////////////////////////////////////////////////////////////////////////////
 static const int kNumChannels = 3;

-void testInPlaceActivation(LayerParams& lp, int backendId, int targetId)
+void testInPlaceActivation(LayerParams& lp, Backend backendId, Target targetId)
 {
    EXPECT_FALSE(lp.name.empty());

@@ -485,8 +491,8 @@ TEST_P(BatchNorm, Accuracy)
    bool hasWeights = get<0>(GetParam());
    bool hasBias = get<1>(GetParam());
    float epsilon = get<2>(GetParam());
-    int backendId = get<0>(get<3>(GetParam()));
-    int targetId = get<1>(get<3>(GetParam()));
+    Backend backendId = get<0>(get<3>(GetParam()));
+    Target targetId = get<1>(get<3>(GetParam()));

    LayerParams lp;
    lp.set("has_weight", hasWeights);
@@ -518,8 +524,8 @@ typedef TestWithParam<tuple<float, tuple<Backend, Target> > > ReLU;
 TEST_P(ReLU, Accuracy)
 {
    float negativeSlope = get<0>(GetParam());
-    int backendId = get<0>(get<1>(GetParam()));
-    int targetId = get<1>(get<1>(GetParam()));
+    Backend backendId = get<0>(get<1>(GetParam()));
+    Target targetId = get<1>(get<1>(GetParam()));

    LayerParams lp;
    lp.set("negative_slope", negativeSlope);
@@ -536,8 +542,8 @@ INSTANTIATE_TEST_CASE_P(Layer_Test_Halide, ReLU, Combine(
 typedef TestWithParam<tuple<std::string, tuple<Backend, Target> > > NoParamActivation;
 TEST_P(NoParamActivation, Accuracy)
 {
-    int backendId = get<0>(get<1>(GetParam()));
-    int targetId = get<1>(get<1>(GetParam()));
+    Backend backendId = get<0>(get<1>(GetParam()));
+    Target targetId = get<1>(get<1>(GetParam()));

    LayerParams lp;
    lp.type = get<0>(GetParam());
@@ -555,8 +561,8 @@ TEST_P(Power, Accuracy)
    float power = get<0>(GetParam())[0];
    float scale = get<0>(GetParam())[1];
    float shift = get<0>(GetParam())[2];
-    int backendId = get<0>(get<1>(GetParam()));
-    int targetId = get<1>(get<1>(GetParam()));
+    Backend backendId = get<0>(get<1>(GetParam()));
+    Target targetId = get<1>(get<1>(GetParam()));

    LayerParams lp;
    lp.set("power", power);
@@ -589,8 +595,8 @@ typedef TestWithParam<tuple<bool, tuple<Backend, Target> > > Scale;
 TEST_P(Scale, Accuracy)
 {
    bool hasBias = get<0>(GetParam());
-    int backendId = get<0>(get<1>(GetParam()));
-    int targetId = get<1>(get<1>(GetParam()));
+    Backend backendId = get<0>(get<1>(GetParam()));
+    Target targetId = get<1>(get<1>(GetParam()));

    LayerParams lp;
    lp.set("bias_term", hasBias);
@@ -624,8 +630,8 @@ TEST_P(Concat, Accuracy)
 {
    Vec3i inSize = get<0>(GetParam());
    Vec3i numChannels = get<1>(GetParam());
-    int backendId = get<0>(get<2>(GetParam()));
-    int targetId = get<1>(get<2>(GetParam()));
+    Backend backendId = get<0>(get<2>(GetParam()));
+    Target targetId = get<1>(get<2>(GetParam()));

    Net net;

@@ -692,8 +698,8 @@ TEST_P(Eltwise, Accuracy)
    std::string op = get<1>(GetParam());
    int numConv = get<2>(GetParam());
    bool weighted = get<3>(GetParam());
-    int backendId = get<0>(get<4>(GetParam()));
-    int targetId = get<1>(get<4>(GetParam()));
+    Backend backendId = get<0>(get<4>(GetParam()));
+    Target targetId = get<1>(get<4>(GetParam()));

    Net net;


--- a/modules/dnn/test/test_layers.cpp
+++ b/modules/dnn/test/test_layers.cpp
@@ -1205,14 +1205,6 @@ public:
        }
    }

-    void forward(InputArrayOfArrays inputs, OutputArrayOfArrays outputs, OutputArrayOfArrays internals) CV_OVERRIDE
-    {
-        CV_TRACE_FUNCTION();
-        CV_TRACE_ARG_VALUE(name, "name", name.c_str());
-
-        Layer::forward_fallback(inputs, outputs, internals);
-    }
-
 private:
    int outWidth, outHeight, zoomFactor;
 };
@@ -1225,7 +1217,7 @@ TEST_P(Test_Caffe_layers, DISABLED_Interp)  // requires patched protobuf (availa
 {
    if (backend == DNN_BACKEND_INFERENCE_ENGINE && target == DNN_TARGET_MYRIAD)
        throw SkipTestException("");
-    // Test a cusom layer.
+    // Test a custom layer.
    CV_DNN_REGISTER_LAYER_CLASS(Interp, CustomInterpLayer);
    try
    {

--- a/modules/dnn/test/test_tf_importer.cpp
+++ b/modules/dnn/test/test_tf_importer.cpp
@@ -230,6 +230,13 @@ TEST_P(Test_TensorFlow_layers, flatten)
    runTensorFlowNet("unfused_flatten_unknown_batch");
 }

+TEST_P(Test_TensorFlow_layers, leaky_relu)  // NOTE(review): the three graphs presumably differ only in the operand order of the leaky-ReLU subgraph -- confirm against the test data
+{
+    runTensorFlowNet("leaky_relu_order1");
+    runTensorFlowNet("leaky_relu_order2");
+    runTensorFlowNet("leaky_relu_order3");
+}
+
 TEST_P(Test_TensorFlow_layers, l2_normalize)
 {
    runTensorFlowNet("l2_normalize");

--- a/modules/dnn/test/test_torch_importer.cpp
+++ b/modules/dnn/test/test_torch_importer.cpp
@@ -69,100 +69,119 @@ TEST(Torch_Importer, simple_read)
    ASSERT_FALSE(net.empty());
 }

-static void runTorchNet(String prefix, int targetId = DNN_TARGET_CPU, String outLayerName = "",
-                        bool check2ndBlob = false, bool isBinary = false)
+class Test_Torch_layers : public DNNTestLayer
 {
-    String suffix = (isBinary) ? ".dat" : ".txt";
+public:
+    void runTorchNet(const String& prefix, String outLayerName = "",
+                     bool check2ndBlob = false, bool isBinary = false,
+                     double l1 = 0.0, double lInf = 0.0)
+    {
+        String suffix = (isBinary) ? ".dat" : ".txt";

-    Net net = readNetFromTorch(_tf(prefix + "_net" + suffix), isBinary);
-    ASSERT_FALSE(net.empty());
+        Mat inp, outRef;
+        ASSERT_NO_THROW( inp = readTorchBlob(_tf(prefix + "_input" + suffix), isBinary) );
+        ASSERT_NO_THROW( outRef = readTorchBlob(_tf(prefix + "_output" + suffix), isBinary) );

-    net.setPreferableBackend(DNN_BACKEND_OPENCV);
-    net.setPreferableTarget(targetId);
+        checkBackend(backend, target, &inp, &outRef);

-    Mat inp, outRef;
-    ASSERT_NO_THROW( inp = readTorchBlob(_tf(prefix + "_input" + suffix), isBinary) );
-    ASSERT_NO_THROW( outRef = readTorchBlob(_tf(prefix + "_output" + suffix), isBinary) );
+        Net net = readNetFromTorch(_tf(prefix + "_net" + suffix), isBinary);
+        ASSERT_FALSE(net.empty());

-    if (outLayerName.empty())
-        outLayerName = net.getLayerNames().back();
+        net.setPreferableBackend(backend);
+        net.setPreferableTarget(target);

-    net.setInput(inp);
-    std::vector<Mat> outBlobs;
-    net.forward(outBlobs, outLayerName);
-    normAssert(outRef, outBlobs[0]);
+        if (outLayerName.empty())
+            outLayerName = net.getLayerNames().back();

-    if (check2ndBlob)
-    {
-        Mat out2 = outBlobs[1];
-        Mat ref2 = readTorchBlob(_tf(prefix + "_output_2" + suffix), isBinary);
-        normAssert(out2, ref2);
-    }
-}
+        net.setInput(inp);
+        std::vector<Mat> outBlobs;
+        net.forward(outBlobs, outLayerName);
+        l1 = l1 ? l1 : default_l1;
+        lInf = lInf ? lInf : default_lInf;
+        normAssert(outRef, outBlobs[0], "", l1, lInf);

-typedef testing::TestWithParam<Target> Test_Torch_layers;
+        if (check2ndBlob && backend != DNN_BACKEND_INFERENCE_ENGINE)
+        {
+            Mat out2 = outBlobs[1];
+            Mat ref2 = readTorchBlob(_tf(prefix + "_output_2" + suffix), isBinary);
+            normAssert(out2, ref2, "", l1, lInf);
+        }
+    }
+};

 TEST_P(Test_Torch_layers, run_convolution)
 {
-    runTorchNet("net_conv", GetParam(), "", false, true);
+    if ((backend == DNN_BACKEND_INFERENCE_ENGINE && target != DNN_TARGET_CPU) ||
+        (backend == DNN_BACKEND_OPENCV && target == DNN_TARGET_OPENCL_FP16))
+        throw SkipTestException("");
+    runTorchNet("net_conv", "", false, true);
 }

 TEST_P(Test_Torch_layers, run_pool_max)
 {
-    runTorchNet("net_pool_max", GetParam(), "", true);
+    if (backend == DNN_BACKEND_OPENCV && target == DNN_TARGET_OPENCL_FP16)
+        throw SkipTestException("");
+    runTorchNet("net_pool_max", "", true);
 }

 TEST_P(Test_Torch_layers, run_pool_ave)
 {
-    runTorchNet("net_pool_ave", GetParam());
+    runTorchNet("net_pool_ave");
 }

 TEST_P(Test_Torch_layers, run_reshape)
 {
-    int targetId = GetParam();
-    runTorchNet("net_reshape", targetId);
-    runTorchNet("net_reshape_batch", targetId);
-    runTorchNet("net_reshape_single_sample", targetId);
-    runTorchNet("net_reshape_channels", targetId, "", false, true);
+    runTorchNet("net_reshape");
+    runTorchNet("net_reshape_batch");
+    runTorchNet("net_reshape_channels", "", false, true);
+}
+
+TEST_P(Test_Torch_layers, run_reshape_single_sample)
+{
+    if (backend == DNN_BACKEND_INFERENCE_ENGINE && target == DNN_TARGET_OPENCL_FP16)
+        throw SkipTestException("");
+    runTorchNet("net_reshape_single_sample", "", false, false,
+                (target == DNN_TARGET_MYRIAD || target == DNN_TARGET_OPENCL_FP16) ? 0.0052 : 0.0);
 }

 TEST_P(Test_Torch_layers, run_linear)
 {
-    runTorchNet("net_linear_2d", GetParam());
+    if (backend == DNN_BACKEND_OPENCV && target == DNN_TARGET_OPENCL_FP16)
+        throw SkipTestException("");
+    runTorchNet("net_linear_2d");
 }

 TEST_P(Test_Torch_layers, run_concat)
 {
-    int targetId = GetParam();
-    runTorchNet("net_concat", targetId, "l5_torchMerge");
-    runTorchNet("net_depth_concat", targetId, "", false, true);
+    runTorchNet("net_concat", "l5_torchMerge");
+    runTorchNet("net_depth_concat", "", false, true, 0.0,
+                target == DNN_TARGET_OPENCL_FP16 ? 0.021 : 0.0);
 }

 TEST_P(Test_Torch_layers, run_deconv)
 {
-    runTorchNet("net_deconv", GetParam());
+    runTorchNet("net_deconv");
 }

 TEST_P(Test_Torch_layers, run_batch_norm)
 {
-    runTorchNet("net_batch_norm", GetParam(), "", false, true);
+    runTorchNet("net_batch_norm", "", false, true);
 }

 TEST_P(Test_Torch_layers, net_prelu)
 {
-    runTorchNet("net_prelu", GetParam());
+    runTorchNet("net_prelu");
 }

 TEST_P(Test_Torch_layers, net_cadd_table)
 {
-    runTorchNet("net_cadd_table", GetParam());
+    runTorchNet("net_cadd_table");
 }

 TEST_P(Test_Torch_layers, net_softmax)
 {
-    int targetId = GetParam();
-    runTorchNet("net_softmax", targetId);
-    runTorchNet("net_softmax_spatial", targetId);
+    runTorchNet("net_softmax");
+    runTorchNet("net_softmax_spatial");
 }

 TEST_P(Test_Torch_layers, net_logsoftmax)
@@ -173,40 +192,55 @@ TEST_P(Test_Torch_layers, net_logsoftmax)

 TEST_P(Test_Torch_layers, net_lp_pooling)
 {
-    int targetId = GetParam();
-    runTorchNet("net_lp_pooling_square", targetId, "", false, true);
-    runTorchNet("net_lp_pooling_power", targetId, "", false, true);
+    runTorchNet("net_lp_pooling_square", "", false, true);
+    runTorchNet("net_lp_pooling_power", "", false, true);
 }

 TEST_P(Test_Torch_layers, net_conv_gemm_lrn)
 {
-    runTorchNet("net_conv_gemm_lrn", GetParam(), "", false, true);
+    if (backend == DNN_BACKEND_INFERENCE_ENGINE && target == DNN_TARGET_MYRIAD)
+        throw SkipTestException("");
+    runTorchNet("net_conv_gemm_lrn", "", false, true,
+                target == DNN_TARGET_OPENCL_FP16 ? 0.046 : 0.0,
+                target == DNN_TARGET_OPENCL_FP16 ? 0.023 : 0.0);
 }

 TEST_P(Test_Torch_layers, net_inception_block)
 {
-    runTorchNet("net_inception_block", GetParam(), "", false, true);
+    runTorchNet("net_inception_block", "", false, true);
 }

 TEST_P(Test_Torch_layers, net_normalize)
 {
-    runTorchNet("net_normalize", GetParam(), "", false, true);
+    runTorchNet("net_normalize", "", false, true);
 }

 TEST_P(Test_Torch_layers, net_padding)
 {
-    int targetId = GetParam();
-    runTorchNet("net_padding", targetId, "", false, true);
-    runTorchNet("net_spatial_zero_padding", targetId, "", false, true);
-    runTorchNet("net_spatial_reflection_padding", targetId, "", false, true);
+    runTorchNet("net_padding", "", false, true);
+    runTorchNet("net_spatial_zero_padding", "", false, true);
+    runTorchNet("net_spatial_reflection_padding", "", false, true);
 }

 TEST_P(Test_Torch_layers, net_non_spatial)
 {
-    runTorchNet("net_non_spatial", GetParam(), "", false, true);
+    if (backend == DNN_BACKEND_INFERENCE_ENGINE &&
+        (target == DNN_TARGET_OPENCL || target == DNN_TARGET_OPENCL_FP16))
+        throw SkipTestException("");
+    runTorchNet("net_non_spatial", "", false, true);
+}
+
+TEST_P(Test_Torch_layers, run_paralel)
+{
+    if (backend != DNN_BACKEND_OPENCV || target != DNN_TARGET_CPU)
+        throw SkipTestException("");
+    runTorchNet("net_parallel", "l5_torchMerge");
 }

-INSTANTIATE_TEST_CASE_P(/**/, Test_Torch_layers, availableDnnTargets());
+TEST_P(Test_Torch_layers, net_residual)
+{
+    runTorchNet("net_residual", "", false, true);
+}

 typedef testing::TestWithParam<Target> Test_Torch_nets;

@@ -313,21 +347,6 @@ TEST_P(Test_Torch_nets, FastNeuralStyle_accuracy)

 INSTANTIATE_TEST_CASE_P(/**/, Test_Torch_nets, availableDnnTargets());

-// TODO: fix OpenCL and add to the rest of tests
-TEST(Torch_Importer, run_paralel)
-{
-    runTorchNet("net_parallel", DNN_TARGET_CPU, "l5_torchMerge");
-}
-
-TEST(Torch_Importer, DISABLED_run_paralel)
-{
-    runTorchNet("net_parallel", DNN_TARGET_OPENCL, "l5_torchMerge");
-}
-
-TEST(Torch_Importer, net_residual)
-{
-    runTorchNet("net_residual", DNN_TARGET_CPU, "", false, true);
-}

 // Test a custom layer
 // https://github.com/torch/nn/blob/master/doc/convolution.md#nn.SpatialUpSamplingNearest
@@ -374,17 +393,29 @@ public:
        }
    }

-    virtual void forward(InputArrayOfArrays, OutputArrayOfArrays, OutputArrayOfArrays) CV_OVERRIDE {}
-
 private:
    int scale;
 };

-TEST(Torch_Importer, upsampling_nearest)
+TEST_P(Test_Torch_layers, upsampling_nearest)
 {
+    // Test a custom layer.
    CV_DNN_REGISTER_LAYER_CLASS(SpatialUpSamplingNearest, SpatialUpSamplingNearestLayer);
-    runTorchNet("net_spatial_upsampling_nearest", DNN_TARGET_CPU, "", false, true);
+    try
+    {
+        runTorchNet("net_spatial_upsampling_nearest", "", false, true);
+    }
+    catch (...)
+    {
+        LayerFactory::unregisterLayer("SpatialUpSamplingNearest");
+        throw;
+    }
    LayerFactory::unregisterLayer("SpatialUpSamplingNearest");
+
+    // Test an implemented layer.
+    runTorchNet("net_spatial_upsampling_nearest", "", false, true);
 }

+INSTANTIATE_TEST_CASE_P(/**/, Test_Torch_layers, dnnBackendsAndTargets());
+
 }
--- a/modules/highgui/src/window_w32.cpp
+++ b/modules/highgui/src/window_w32.cpp
@@ -307,8 +307,8 @@ icvLoadWindowPos( const char* name, CvRect& rect )
 {
    HKEY hkey;
    char szKey[1024];
-    strcpy( szKey, icvWindowPosRootKey );
-    strcat( szKey, name );
+    strcpy_s( szKey, 1024, icvWindowPosRootKey );
+    strcat_s( szKey, 1024, name );

    rect.x = rect.y = CW_USEDEFAULT;
    rect.width = rect.height = 320;
@@ -368,8 +368,8 @@ icvSaveWindowPos( const char* name, CvRect rect )
    HKEY hkey;
    char szKey[1024];
    char rootKey[1024];
-    strcpy( szKey, icvWindowPosRootKey );
-    strcat( szKey, name );
+    strcpy_s( szKey, 1024, icvWindowPosRootKey );
+    strcat_s( szKey, 1024, name );

    if( RegOpenKeyEx( HKEY_CURRENT_USER,szKey,0,KEY_READ,&hkey) != ERROR_SUCCESS )
    {
@@ -379,7 +379,7 @@ icvSaveWindowPos( const char* name, CvRect rect )
        char oldestKey[1024];
        char currentKey[1024];

-        strcpy( rootKey, icvWindowPosRootKey );
+        strcpy_s( rootKey, 1024, icvWindowPosRootKey );
        rootKey[strlen(rootKey)-1] = '\0';
        if( RegCreateKeyEx(HKEY_CURRENT_USER, rootKey, 0, NULL, REG_OPTION_NON_VOLATILE, KEY_READ+KEY_WRITE, 0, &hroot, NULL) != ERROR_SUCCESS )
            //RegOpenKeyEx( HKEY_CURRENT_USER,rootKey,0,KEY_READ,&hroot) != ERROR_SUCCESS )
@@ -398,7 +398,7 @@ icvSaveWindowPos( const char* name, CvRect rect )
                oldestTime.dwLowDateTime > accesstime.dwLowDateTime) )
            {
                oldestTime = accesstime;
-                strcpy( oldestKey, currentKey );
+                strcpy_s( oldestKey, 1024, currentKey );
            }
        }

@@ -1500,6 +1500,8 @@ MainWindowProc( HWND hwnd, UINT uMsg, WPARAM wParam, LPARAM lParam )
            rgn = CreateRectRgn(0, 0, wrc.right, wrc.bottom);
            rgn1 = CreateRectRgn(cr.left, cr.top, cr.right, cr.bottom);
            rgn2 = CreateRectRgn(tr.left, tr.top, tr.right, tr.bottom);
+            CV_Assert(rgn != 0, rgn1 != 0, rgn2 != 0);
+
            ret = CombineRgn(rgn, rgn, rgn1, RGN_DIFF);
            ret = CombineRgn(rgn, rgn, rgn2, RGN_DIFF);


--- a/modules/imgproc/include/opencv2/imgproc.hpp
+++ b/modules/imgproc/include/opencv2/imgproc.hpp
@@ -1771,7 +1771,7 @@ Corners in the image can be found as the local maxima of this response map.
 size as src .
 @param blockSize Neighborhood size (see the details on #cornerEigenValsAndVecs ).
 @param ksize Aperture parameter for the Sobel operator.
-@param k Harris detector free parameter. See the formula below.
+@param k Harris detector free parameter. See the formula above.
 @param borderType Pixel extrapolation method. See #BorderTypes.
 */
 CV_EXPORTS_W void cornerHarris( InputArray src, OutputArray dst, int blockSize,

--- a/modules/python/bindings/CMakeLists.txt
+++ b/modules/python/bindings/CMakeLists.txt
@@ -20,8 +20,12 @@ endforeach()
 set(opencv_hdrs "")
 set(opencv_userdef_hdrs "")
 foreach(m ${OPENCV_PYTHON_MODULES})
-  ocv_list_filter(OPENCV_MODULE_${m}_HEADERS "${OPENCV_MODULE_${m}_LOCATION}/include" __hdrs)
-  list(APPEND opencv_hdrs ${__hdrs})
+  foreach (hdr ${OPENCV_MODULE_${m}_HEADERS})
+    ocv_is_subdir(is_sub "${OPENCV_MODULE_${m}_LOCATION}/include" "${hdr}")
+    if(is_sub)
+      list(APPEND opencv_hdrs "${hdr}")
+    endif()
+  endforeach()
  file(GLOB userdef_hdrs ${OPENCV_MODULE_${m}_LOCATION}/misc/python/pyopencv*.hpp)
  list(APPEND opencv_userdef_hdrs ${userdef_hdrs})
 endforeach(m)

--- a/modules/ts/include/opencv2/ts.hpp
+++ b/modules/ts/include/opencv2/ts.hpp
@@ -379,10 +379,9 @@ struct TSParams

 class TS
 {
-public:
-    // constructor(s) and destructor
    TS();
    virtual ~TS();
+public:

    enum
    {
@@ -484,9 +483,6 @@ public:
        SKIPPED=1
    };

-    // get file storage
-    CvFileStorage* get_file_storage();
-
    // get RNG to generate random input data for a test
    RNG& get_rng() { return rng; }

@@ -629,9 +625,6 @@ struct DefaultRngAuto
 void fillGradient(Mat& img, int delta = 5);
 void smoothBorder(Mat& img, const Scalar& color, int delta = 3);

-void printVersionInfo(bool useStdOut = true);
-
-
 // Utility functions

 void addDataSearchPath(const std::string& path);
@@ -660,6 +653,13 @@ std::string findDataFile(const std::string& relative_path, bool required = true)
 */
 std::string findDataDirectory(const std::string& relative_path, bool required = true);

+// Test definitions
+
+class SystemInfoCollector : public testing::EmptyTestEventListener
+{
+private:
+    virtual void OnTestProgramStart(const testing::UnitTest&);
+};

 #ifndef __CV_TEST_EXEC_ARGS
 #if defined(_MSC_VER) && (_MSC_VER <= 1400)
@@ -671,15 +671,6 @@ std::string findDataDirectory(const std::string& relative_path, bool required =
 #endif
 #endif

-#ifdef HAVE_OPENCL
-namespace ocl {
-void dumpOpenCLDevice();
-}
-#define TEST_DUMP_OCL_INFO cvtest::ocl::dumpOpenCLDevice();
-#else
-#define TEST_DUMP_OCL_INFO
-#endif
-
 void parseCustomOptions(int argc, char **argv);

 #define CV_TEST_INIT0_NOOP (void)0
@@ -696,8 +687,7 @@ int main(int argc, char **argv) \
    ts->init(resourcesubdir); \
    __CV_TEST_EXEC_ARGS(CV_TEST_INIT0_ ## INIT0) \
    ::testing::InitGoogleTest(&argc, argv); \
-    cvtest::printVersionInfo(); \
-    TEST_DUMP_OCL_INFO \
+    ::testing::UnitTest::GetInstance()->listeners().Append(new SystemInfoCollector); \
    __CV_TEST_EXEC_ARGS(__VA_ARGS__) \
    parseCustomOptions(argc, argv); \
    } \

--- a/modules/ts/include/opencv2/ts/ts_perf.hpp
+++ b/modules/ts/include/opencv2/ts/ts_perf.hpp
--- a/modules/ts/src/ocl_test.cpp
+++ b/modules/ts/src/ocl_test.cpp
--- a/modules/ts/src/ts.cpp
+++ b/modules/ts/src/ts.cpp
--- a/modules/ts/src/ts_func.cpp
+++ b/modules/ts/src/ts_func.cpp
--- a/modules/videoio/include/opencv2/videoio.hpp
+++ b/modules/videoio/include/opencv2/videoio.hpp
@@ -905,7 +905,7 @@ public:

    /** @brief Writes the next video frame

-    @param image The written frame
+    @param image The written frame. In general, color images are expected in BGR format.

    The function/method writes the specified image to video file. It must have the same size as has
    been specified when opening the video writer.

--- a/modules/videoio/src/cap_dshow.cpp
+++ b/modules/videoio/src/cap_dshow.cpp
--- a/modules/videoio/src/cap_gstreamer.cpp
+++ b/modules/videoio/src/cap_gstreamer.cpp
--- a/modules/videoio/src/cap_msmf.cpp
+++ b/modules/videoio/src/cap_msmf.cpp
--- a/modules/videoio/src/cap_vfw.cpp
+++ b/modules/videoio/src/cap_vfw.cpp
@@ -377,8 +377,8 @@ LRESULT PASCAL CvCaptureCAM_VFW::frameCallback( HWND hWnd, VIDEOHDR* hdr )
    if (!hWnd) return FALSE;

    capture = (CvCaptureCAM_VFW*)capGetUserData(hWnd);
+    if (!capture) return (LRESULT)FALSE;
    capture->hdr = hdr;
-
    return (LRESULT)TRUE;
 }


--- a/samples/cpp/create_mask.cpp
+++ b/samples/cpp/create_mask.cpp
--- a/samples/cpp/tutorial_code/ImgProc/out_of_focus_deblur_filter/out_of_focus_deblur_filter.cpp
+++ b/samples/cpp/tutorial_code/ImgProc/out_of_focus_deblur_filter/out_of_focus_deblur_filter.cpp
--- a/samples/dnn/object_detection.py
+++ b/samples/dnn/object_detection.py