Commit b39cd062 authored by Alexander Alekhin's avatar Alexander Alekhin

Merge remote-tracking branch 'upstream/3.4' into merge-3.4

parents 46def2fd ab8022f7
......@@ -1531,7 +1531,7 @@ class TegraCvtColor_##name##_Invoker : public cv::ParallelLoopBody \
public: \
TegraCvtColor_##name##_Invoker(const uchar * src_data_, size_t src_step_, uchar * dst_data_, size_t dst_step_, int width_, int height_) : \
cv::ParallelLoopBody(), src_data(src_data_), src_step(src_step_), dst_data(dst_data_), dst_step(dst_step_), width(width_), height(height_) {} \
virtual void operator()(const cv::Range& range) const \
virtual void operator()(const cv::Range& range) const CV_OVERRIDE \
{ \
CAROTENE_NS::func(CAROTENE_NS::Size2D(width, range.end-range.start), __VA_ARGS__); \
} \
......
add_definitions(-D__OPENCV_BUILD=1)
add_definitions(-D__OPENCV_APPS=1)
# Unified function for creating OpenCV applications:
# ocv_add_application(tgt [MODULES <m1> [<m2> ...]] SRCS <src1> [<src2> ...])
function(ocv_add_application the_target)
cmake_parse_arguments(APP "" "" "MODULES;SRCS" ${ARGN})
ocv_check_dependencies(${APP_MODULES})
if(NOT OCV_DEPENDENCIES_FOUND)
return()
endif()
project(${the_target})
ocv_target_include_modules_recurse(${the_target} ${APP_MODULES})
ocv_target_include_directories(${the_target} PRIVATE "${OpenCV_SOURCE_DIR}/include/opencv")
ocv_add_executable(${the_target} ${APP_SRCS})
ocv_target_link_libraries(${the_target} ${APP_MODULES})
set_target_properties(${the_target} PROPERTIES
DEBUG_POSTFIX "${OPENCV_DEBUG_POSTFIX}"
ARCHIVE_OUTPUT_DIRECTORY ${LIBRARY_OUTPUT_PATH}
RUNTIME_OUTPUT_DIRECTORY ${EXECUTABLE_OUTPUT_PATH}
OUTPUT_NAME "${the_target}")
if(ENABLE_SOLUTION_FOLDERS)
set_target_properties(${the_target} PROPERTIES FOLDER "applications")
endif()
if(INSTALL_CREATE_DISTRIB)
if(BUILD_SHARED_LIBS)
install(TARGETS ${the_target} RUNTIME DESTINATION ${OPENCV_BIN_INSTALL_PATH} CONFIGURATIONS Release COMPONENT dev)
endif()
else()
install(TARGETS ${the_target} RUNTIME DESTINATION ${OPENCV_BIN_INSTALL_PATH} COMPONENT dev)
endif()
endfunction()
link_libraries(${OPENCV_LINKER_LIBS})
macro(ocv_add_app directory)
......
SET(OPENCV_ANNOTATION_DEPS opencv_core opencv_highgui opencv_imgproc opencv_imgcodecs opencv_videoio)
ocv_check_dependencies(${OPENCV_ANNOTATION_DEPS})
if(NOT OCV_DEPENDENCIES_FOUND)
return()
endif()
project(annotation)
set(the_target opencv_annotation)
ocv_target_include_directories(${the_target} PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}" "${OpenCV_SOURCE_DIR}/include/opencv")
ocv_target_include_modules_recurse(${the_target} ${OPENCV_ANNOTATION_DEPS})
file(GLOB SRCS *.cpp)
set(annotation_files ${SRCS})
ocv_add_executable(${the_target} ${annotation_files})
ocv_target_link_libraries(${the_target} ${OPENCV_ANNOTATION_DEPS})
set_target_properties(${the_target} PROPERTIES
DEBUG_POSTFIX "${OPENCV_DEBUG_POSTFIX}"
ARCHIVE_OUTPUT_DIRECTORY ${LIBRARY_OUTPUT_PATH}
RUNTIME_OUTPUT_DIRECTORY ${EXECUTABLE_OUTPUT_PATH}
OUTPUT_NAME "opencv_annotation")
if(ENABLE_SOLUTION_FOLDERS)
set_target_properties(${the_target} PROPERTIES FOLDER "applications")
endif()
if(INSTALL_CREATE_DISTRIB)
if(BUILD_SHARED_LIBS)
install(TARGETS ${the_target} RUNTIME DESTINATION ${OPENCV_BIN_INSTALL_PATH} CONFIGURATIONS Release COMPONENT dev)
endif()
else()
install(TARGETS ${the_target} RUNTIME DESTINATION ${OPENCV_BIN_INSTALL_PATH} COMPONENT dev)
endif()
ocv_add_application(opencv_annotation
MODULES opencv_core opencv_highgui opencv_imgproc opencv_imgcodecs opencv_videoio
SRCS opencv_annotation.cpp)
set(OPENCV_CREATESAMPLES_DEPS opencv_core opencv_imgproc opencv_objdetect opencv_imgcodecs opencv_highgui opencv_calib3d opencv_features2d opencv_videoio)
ocv_check_dependencies(${OPENCV_CREATESAMPLES_DEPS})
if(NOT OCV_DEPENDENCIES_FOUND)
return()
endif()
project(createsamples)
set(the_target opencv_createsamples)
ocv_target_include_directories(${the_target} PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}" "${OpenCV_SOURCE_DIR}/include/opencv")
ocv_target_include_modules_recurse(${the_target} ${OPENCV_CREATESAMPLES_DEPS})
file(GLOB SRCS *.cpp)
file(GLOB HDRS *.h*)
set(createsamples_files ${SRCS} ${HDRS})
ocv_add_executable(${the_target} ${createsamples_files})
ocv_target_link_libraries(${the_target} ${OPENCV_CREATESAMPLES_DEPS})
set_target_properties(${the_target} PROPERTIES
DEBUG_POSTFIX "${OPENCV_DEBUG_POSTFIX}"
ARCHIVE_OUTPUT_DIRECTORY ${LIBRARY_OUTPUT_PATH}
RUNTIME_OUTPUT_DIRECTORY ${EXECUTABLE_OUTPUT_PATH}
OUTPUT_NAME "opencv_createsamples")
if(ENABLE_SOLUTION_FOLDERS)
set_target_properties(${the_target} PROPERTIES FOLDER "applications")
endif()
if(INSTALL_CREATE_DISTRIB)
if(BUILD_SHARED_LIBS)
install(TARGETS ${the_target} RUNTIME DESTINATION ${OPENCV_BIN_INSTALL_PATH} CONFIGURATIONS Release COMPONENT dev)
endif()
else()
install(TARGETS ${the_target} OPTIONAL RUNTIME DESTINATION ${OPENCV_BIN_INSTALL_PATH} COMPONENT dev)
endif()
ocv_add_application(opencv_createsamples
MODULES opencv_core opencv_imgproc opencv_objdetect opencv_imgcodecs opencv_highgui opencv_calib3d opencv_features2d opencv_videoio
SRCS ${SRCS})
set(OPENCV_INTERACTIVECALIBRATION_DEPS opencv_core opencv_imgproc opencv_features2d opencv_highgui opencv_calib3d opencv_videoio)
set(DEPS opencv_core opencv_imgproc opencv_features2d opencv_highgui opencv_calib3d opencv_videoio)
if(${BUILD_opencv_aruco})
list(APPEND OPENCV_INTERACTIVECALIBRATION_DEPS opencv_aruco)
list(APPEND DEPS opencv_aruco)
endif()
ocv_check_dependencies(${OPENCV_INTERACTIVECALIBRATION_DEPS})
if(NOT OCV_DEPENDENCIES_FOUND)
return()
endif()
project(interactive-calibration)
set(the_target opencv_interactive-calibration)
ocv_target_include_directories(${the_target} PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}" "${OpenCV_SOURCE_DIR}/include/opencv")
ocv_target_include_modules_recurse(${the_target} ${OPENCV_INTERACTIVECALIBRATION_DEPS})
file(GLOB SRCS *.cpp)
file(GLOB HDRS *.h*)
set(interactive-calibration_files ${SRCS} ${HDRS})
ocv_add_executable(${the_target} ${interactive-calibration_files})
ocv_target_link_libraries(${the_target} ${OPENCV_INTERACTIVECALIBRATION_DEPS})
set_target_properties(${the_target} PROPERTIES
DEBUG_POSTFIX "${OPENCV_DEBUG_POSTFIX}"
ARCHIVE_OUTPUT_DIRECTORY ${LIBRARY_OUTPUT_PATH}
RUNTIME_OUTPUT_DIRECTORY ${EXECUTABLE_OUTPUT_PATH}
OUTPUT_NAME "opencv_interactive-calibration")
if(ENABLE_SOLUTION_FOLDERS)
set_target_properties(${the_target} PROPERTIES FOLDER "applications")
endif()
if(INSTALL_CREATE_DISTRIB)
if(BUILD_SHARED_LIBS)
install(TARGETS ${the_target} RUNTIME DESTINATION ${OPENCV_BIN_INSTALL_PATH} CONFIGURATIONS Release COMPONENT dev)
endif()
else()
install(TARGETS ${the_target} OPTIONAL RUNTIME DESTINATION ${OPENCV_BIN_INSTALL_PATH} COMPONENT dev)
endif()
ocv_add_application(opencv_interactive-calibration MODULES ${DEPS} SRCS ${SRCS})
set(OPENCV_TRAINCASCADE_DEPS opencv_core opencv_imgproc opencv_objdetect opencv_imgcodecs opencv_highgui opencv_calib3d opencv_features2d)
ocv_check_dependencies(${OPENCV_TRAINCASCADE_DEPS})
if(NOT OCV_DEPENDENCIES_FOUND)
return()
endif()
project(traincascade)
set(the_target opencv_traincascade)
ocv_warnings_disable(CMAKE_CXX_FLAGS -Woverloaded-virtual
-Winconsistent-missing-override -Wsuggest-override
)
ocv_target_include_directories(${the_target} PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}" "${OpenCV_SOURCE_DIR}/include/opencv")
ocv_target_include_modules_recurse(${the_target} ${OPENCV_TRAINCASCADE_DEPS})
ocv_warnings_disable(CMAKE_CXX_FLAGS -Woverloaded-virtual -Winconsistent-missing-override -Wsuggest-override)
file(GLOB SRCS *.cpp)
file(GLOB HDRS *.h*)
set(traincascade_files ${SRCS} ${HDRS})
ocv_add_executable(${the_target} ${traincascade_files})
ocv_target_link_libraries(${the_target} ${OPENCV_TRAINCASCADE_DEPS})
set_target_properties(${the_target} PROPERTIES
DEBUG_POSTFIX "${OPENCV_DEBUG_POSTFIX}"
ARCHIVE_OUTPUT_DIRECTORY ${LIBRARY_OUTPUT_PATH}
RUNTIME_OUTPUT_DIRECTORY ${EXECUTABLE_OUTPUT_PATH}
OUTPUT_NAME "opencv_traincascade")
if(ENABLE_SOLUTION_FOLDERS)
set_target_properties(${the_target} PROPERTIES FOLDER "applications")
endif()
if(INSTALL_CREATE_DISTRIB)
if(BUILD_SHARED_LIBS)
install(TARGETS ${the_target} RUNTIME DESTINATION ${OPENCV_BIN_INSTALL_PATH} CONFIGURATIONS Release COMPONENT dev)
endif()
else()
install(TARGETS ${the_target} OPTIONAL RUNTIME DESTINATION ${OPENCV_BIN_INSTALL_PATH} COMPONENT dev)
endif()
ocv_add_application(opencv_traincascade
MODULES opencv_core opencv_imgproc opencv_objdetect opencv_imgcodecs opencv_highgui opencv_calib3d opencv_features2d
SRCS ${SRCS})
set(OPENCV_APPLICATION_DEPS opencv_core)
ocv_check_dependencies(${OPENCV_APPLICATION_DEPS})
if(NOT OCV_DEPENDENCIES_FOUND)
return()
endif()
project(opencv_version)
set(the_target opencv_version)
ocv_target_include_modules_recurse(${the_target} ${OPENCV_APPLICATION_DEPS})
ocv_add_executable(${the_target} opencv_version.cpp)
ocv_target_link_libraries(${the_target} ${OPENCV_APPLICATION_DEPS})
set_target_properties(${the_target} PROPERTIES
DEBUG_POSTFIX "${OPENCV_DEBUG_POSTFIX}"
RUNTIME_OUTPUT_DIRECTORY ${EXECUTABLE_OUTPUT_PATH}
OUTPUT_NAME "opencv_version")
set_target_properties(${the_target} PROPERTIES FOLDER "applications")
if(INSTALL_CREATE_DISTRIB)
if(BUILD_SHARED_LIBS)
install(TARGETS ${the_target} RUNTIME DESTINATION ${OPENCV_BIN_INSTALL_PATH} CONFIGURATIONS Release COMPONENT libs)
endif()
else()
install(TARGETS ${the_target} RUNTIME DESTINATION ${OPENCV_BIN_INSTALL_PATH} COMPONENT libs)
endif()
ocv_add_application(opencv_version MODULES opencv_core SRCS opencv_version.cpp)
if(WIN32)
project(opencv_version_win32)
set(the_target opencv_version_win32)
ocv_target_include_modules_recurse(${the_target} ${OPENCV_APPLICATION_DEPS})
ocv_add_executable(${the_target} opencv_version.cpp)
ocv_target_link_libraries(${the_target} ${OPENCV_APPLICATION_DEPS})
target_compile_definitions(${the_target} PRIVATE "OPENCV_WIN32_API=1")
set_target_properties(${the_target} PROPERTIES
DEBUG_POSTFIX "${OPENCV_DEBUG_POSTFIX}"
RUNTIME_OUTPUT_DIRECTORY ${EXECUTABLE_OUTPUT_PATH}
OUTPUT_NAME "opencv_version_win32")
set_target_properties(${the_target} PROPERTIES FOLDER "applications")
if(INSTALL_CREATE_DISTRIB)
if(BUILD_SHARED_LIBS)
install(TARGETS ${the_target} RUNTIME DESTINATION ${OPENCV_BIN_INSTALL_PATH} CONFIGURATIONS Release COMPONENT libs)
endif()
else()
install(TARGETS ${the_target} RUNTIME DESTINATION ${OPENCV_BIN_INSTALL_PATH} COMPONENT libs)
endif()
ocv_add_application(opencv_version_win32 MODULES opencv_core SRCS opencv_version.cpp)
target_compile_definitions(opencv_version_win32 PRIVATE "OPENCV_WIN32_API=1")
endif()
SET(OPENCV_VISUALISATION_DEPS opencv_core opencv_highgui opencv_imgproc opencv_videoio opencv_imgcodecs)
ocv_check_dependencies(${OPENCV_VISUALISATION_DEPS})
if(NOT OCV_DEPENDENCIES_FOUND)
return()
endif()
project(visualisation)
set(the_target opencv_visualisation)
ocv_target_include_directories(${the_target} PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}" "${OpenCV_SOURCE_DIR}/include/opencv")
ocv_target_include_modules_recurse(${the_target} ${OPENCV_VISUALISATION_DEPS})
file(GLOB SRCS *.cpp)
set(visualisation_files ${SRCS})
ocv_add_executable(${the_target} ${visualisation_files})
ocv_target_link_libraries(${the_target} ${OPENCV_VISUALISATION_DEPS})
set_target_properties(${the_target} PROPERTIES
DEBUG_POSTFIX "${OPENCV_DEBUG_POSTFIX}"
ARCHIVE_OUTPUT_DIRECTORY ${LIBRARY_OUTPUT_PATH}
RUNTIME_OUTPUT_DIRECTORY ${EXECUTABLE_OUTPUT_PATH}
OUTPUT_NAME "opencv_visualisation")
if(ENABLE_SOLUTION_FOLDERS)
set_target_properties(${the_target} PROPERTIES FOLDER "applications")
endif()
if(INSTALL_CREATE_DISTRIB)
if(BUILD_SHARED_LIBS)
install(TARGETS ${the_target} RUNTIME DESTINATION ${OPENCV_BIN_INSTALL_PATH} CONFIGURATIONS Release COMPONENT dev)
endif()
else()
install(TARGETS ${the_target} RUNTIME DESTINATION ${OPENCV_BIN_INSTALL_PATH} COMPONENT dev)
endif()
ocv_add_application(opencv_visualisation
MODULES opencv_core opencv_highgui opencv_imgproc opencv_videoio opencv_imgcodecs
SRCS opencv_visualisation.cpp)
......@@ -361,6 +361,23 @@ CV__DNN_EXPERIMENTAL_NS_BEGIN
static Ptr<PermuteLayer> create(const LayerParams& params);
};
/**
* Permute channels of 4-dimensional input blob.
* @param group Number of groups to split input channels and pick in turns
* into output blob.
*
* \f[ groupSize = \frac{number\ of\ channels}{group} \f]
* \f[ output(n, c, h, w) = input(n, groupSize \times (c \% group) + \lfloor \frac{c}{group} \rfloor, h, w) \f]
* Read more at https://arxiv.org/pdf/1707.01083.pdf
*/
class CV_EXPORTS ShuffleChannelLayer : public Layer
{
public:
static Ptr<Layer> create(const LayerParams& params);
int group;
};
/**
* @brief Adds extra values for specific axes.
* @param paddings Vector of paddings in format
......@@ -575,6 +592,17 @@ CV__DNN_EXPERIMENTAL_NS_BEGIN
static Ptr<ResizeLayer> create(const LayerParams& params);
};
/**
* @brief Bilinear resize layer from https://github.com/cdmh/deeplab-public
*
* It differs from @ref ResizeLayer in output shape and resize scales computations.
*/
class CV_EXPORTS InterpLayer : public Layer
{
public:
static Ptr<Layer> create(const LayerParams& params);
};
class CV_EXPORTS ProposalLayer : public Layer
{
public:
......
......@@ -988,52 +988,26 @@ struct Net::Impl
ld.inputBlobsId[inNum] = from;
}
static void splitPin(const String &pinAlias, String &layerName, String &outName)
{
size_t delimPos = pinAlias.find('.');
layerName = pinAlias.substr(0, delimPos);
outName = (delimPos == String::npos) ? String() : pinAlias.substr(delimPos + 1);
}
int resolvePinOutputName(LayerData &ld, const String &outName)
{
if (outName.empty())
return 0;
if (std::isdigit(outName[0]))
{
char *lastChar;
long inum = std::strtol(outName.c_str(), &lastChar, 10);
if (*lastChar == 0)
{
CV_Assert(inum == (int)inum);
return (int)inum;
}
}
return ld.getLayerInstance()->outputNameToIndex(outName);
}
LayerPin getPinByAlias(const String &pinAlias)
LayerPin getPinByAlias(const String &layerName)
{
LayerPin pin;
String layerName, outName;
splitPin(pinAlias, layerName, outName);
pin.lid = (layerName.empty()) ? 0 : getLayerId(layerName);
if (pin.lid >= 0)
pin.oid = resolvePinOutputName(getLayerData(pin.lid), outName);
pin.oid = resolvePinOutputName(getLayerData(pin.lid), layerName);
return pin;
}
std::vector<LayerPin> getLayerOutPins(const String &pinAlias)
std::vector<LayerPin> getLayerOutPins(const String &layerName)
{
String layerName, outName;
splitPin(pinAlias, layerName, outName);
int lid = (layerName.empty()) ? 0 : getLayerId(layerName);
std::vector<LayerPin> pins;
......@@ -2044,12 +2018,6 @@ int Net::addLayer(const String &name, const String &type, LayerParams &params)
{
CV_TRACE_FUNCTION();
if (name.find('.') != String::npos)
{
CV_Error(Error::StsBadArg, "Added layer name \"" + name + "\" must not contain dot symbol");
return -1;
}
if (impl->getLayerId(name) >= 0)
{
CV_Error(Error::StsBadArg, "Layer \"" + name + "\" already into net");
......@@ -2689,7 +2657,7 @@ int Layer::inputNameToIndex(String)
int Layer::outputNameToIndex(const String&)
{
return -1;
return 0;
}
bool Layer::supportBackend(int backendId)
......
......@@ -84,6 +84,7 @@ void initializeLayerFactory()
CV_DNN_REGISTER_LAYER_CLASS(Reshape, ReshapeLayer);
CV_DNN_REGISTER_LAYER_CLASS(Flatten, FlattenLayer);
CV_DNN_REGISTER_LAYER_CLASS(Resize, ResizeLayer);
CV_DNN_REGISTER_LAYER_CLASS(Interp, InterpLayer);
CV_DNN_REGISTER_LAYER_CLASS(CropAndResize, CropAndResizeLayer);
CV_DNN_REGISTER_LAYER_CLASS(Convolution, ConvolutionLayer);
......@@ -115,6 +116,7 @@ void initializeLayerFactory()
CV_DNN_REGISTER_LAYER_CLASS(Crop, CropLayer);
CV_DNN_REGISTER_LAYER_CLASS(Eltwise, EltwiseLayer);
CV_DNN_REGISTER_LAYER_CLASS(Permute, PermuteLayer);
CV_DNN_REGISTER_LAYER_CLASS(ShuffleChannel, ShuffleChannelLayer);
CV_DNN_REGISTER_LAYER_CLASS(PriorBox, PriorBoxLayer);
CV_DNN_REGISTER_LAYER_CLASS(PriorBoxClustered, PriorBoxLayer);
CV_DNN_REGISTER_LAYER_CLASS(Reorg, ReorgLayer);
......
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
// Copyright (C) 2018, Intel Corporation, all rights reserved.
// Third party copyrights are property of their respective owners.
#include "../precomp.hpp"
#include "layers_common.hpp"
......
......@@ -310,7 +310,6 @@ public:
innerProductOp = Ptr<OCL4DNNInnerProduct<float> >(new OCL4DNNInnerProduct<float>(config));
}
UMat biasOnesMat = UMat::ones(outerSize, 1, umat_blobs[0].type());
for (size_t i = 0; i < inputs.size(); i++)
{
MatShape inshape, outshape;
......@@ -320,7 +319,6 @@ public:
UMat srcMat, dstMat;
srcMat = inputs[i].reshape(1, inshape.size(), &inshape[0]);
dstMat = outputs[i].reshape(1, outshape.size(), &outshape[0]);
dstMat.setTo(0.0f);
if (!innerProductOp->Forward(srcMat, (use_half) ? half_blobs[0] : umat_blobs[0],
(bias) ? (use_half ? half_blobs[1] : umat_blobs[1]) : UMat(),
......@@ -332,6 +330,7 @@ public:
if (!use_half && bias && (outerSize > 1))
{
UMat biasOnesMat = UMat::ones(outerSize, 1, umat_blobs[0].type());
UMat& biases = umat_blobs[1];
cv::gemm(biasOnesMat, biases, 1, dstMat, 1, dstMat, 0);
}
......@@ -354,6 +353,7 @@ public:
if (bias)
{
UMat biasOnesMat = UMat::ones(outerSize, 1, umat_blobs[0].type());
UMat& biases = umat_blobs[1];
cv::gemm(biasOnesMat, biases, 1, dstMat, 1, dstMat, 0);
}
......
......@@ -11,7 +11,7 @@
namespace cv { namespace dnn {
class ResizeLayerImpl CV_FINAL : public ResizeLayer
class ResizeLayerImpl : public ResizeLayer
{
public:
ResizeLayerImpl(const LayerParams& params)
......@@ -33,7 +33,7 @@ public:
interpolation = params.get<String>("interpolation");
CV_Assert(interpolation == "nearest" || interpolation == "bilinear");
alignCorners = params.get<bool>("align_corners", false);
bool alignCorners = params.get<bool>("align_corners", false);
if (alignCorners)
CV_Error(Error::StsNotImplemented, "Resize with align_corners=true is not implemented");
}
......@@ -66,6 +66,8 @@ public:
outHeight = outputs[0].size[2];
outWidth = outputs[0].size[3];
}
scaleHeight = static_cast<float>(inputs[0]->size[2]) / outHeight;
scaleWidth = static_cast<float>(inputs[0]->size[3]) / outWidth;
}
void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr) CV_OVERRIDE
......@@ -103,8 +105,6 @@ public:
const int inpWidth = inp.size[3];
const int inpSpatialSize = inpHeight * inpWidth;
const int outSpatialSize = outHeight * outWidth;
const float heightScale = static_cast<float>(inpHeight) / (outHeight);
const float widthScale = static_cast<float>(inpWidth) / (outWidth);
const int numPlanes = inp.size[0] * inp.size[1];
CV_Assert(inp.isContinuous(), out.isContinuous());
......@@ -112,13 +112,13 @@ public:
Mat outPlanes = out.reshape(1, numPlanes * outHeight);
for (int y = 0; y < outHeight; ++y)
{
float input_y = y * heightScale;
float input_y = y * scaleHeight;
int y0 = static_cast<int>(input_y);
const float* inpData_row0 = inpPlanes.ptr<float>(y0);
const float* inpData_row1 = inpPlanes.ptr<float>(std::min(y0 + 1, inpHeight - 1));
for (int x = 0; x < outWidth; ++x)
{
float input_x = x * widthScale;
float input_x = x * scaleWidth;
int x0 = static_cast<int>(input_x);
int x1 = std::min(x0 + 1, inpWidth - 1);
......@@ -162,10 +162,10 @@ public:
return Ptr<BackendNode>();
}
private:
protected:
int outWidth, outHeight, zoomFactorWidth, zoomFactorHeight;
String interpolation;
bool alignCorners;
float scaleWidth, scaleHeight;
};
......@@ -174,5 +174,44 @@ Ptr<ResizeLayer> ResizeLayer::create(const LayerParams& params)
return Ptr<ResizeLayer>(new ResizeLayerImpl(params));
}
class InterpLayerImpl CV_FINAL : public ResizeLayerImpl
{
public:
InterpLayerImpl(const LayerParams& params) : ResizeLayerImpl(params) {}
bool getMemoryShapes(const std::vector<MatShape> &inputs,
const int requiredOutputs,
std::vector<MatShape> &outputs,
std::vector<MatShape> &internals) const CV_OVERRIDE
{
CV_Assert(inputs.size() == 1, inputs[0].size() == 4);
outputs.resize(1, inputs[0]);
outputs[0][2] = outHeight > 0 ? outHeight : (1 + zoomFactorHeight * (outputs[0][2] - 1));
outputs[0][3] = outWidth > 0 ? outWidth : (1 + zoomFactorWidth * (outputs[0][3] - 1));
// We can work in-place (do nothing) if input shape == output shape.
return (outputs[0][2] == inputs[0][2]) && (outputs[0][3] == inputs[0][3]);
}
virtual void finalize(const std::vector<Mat*>& inputs, std::vector<Mat> &outputs) CV_OVERRIDE
{
if (!outWidth && !outHeight)
{
outHeight = outputs[0].size[2];
outWidth = outputs[0].size[3];
}
int inpHeight = inputs[0]->size[2];
int inpWidth = inputs[0]->size[3];
scaleHeight = (outHeight > 1) ? (static_cast<float>(inpHeight - 1) / (outHeight - 1)) : 0.f;
scaleWidth = (outWidth > 1) ? (static_cast<float>(inpWidth - 1) / (outWidth - 1)) : 0.f;
}
};
Ptr<Layer> InterpLayer::create(const LayerParams& params)
{
LayerParams lp(params);
lp.set("interpolation", "bilinear");
return Ptr<Layer>(new InterpLayerImpl(lp));
}
} // namespace dnn
} // namespace cv
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
// Copyright (C) 2018, Intel Corporation, all rights reserved.
// Third party copyrights are property of their respective owners.
#include "../precomp.hpp"
namespace cv { namespace dnn {
class ShuffleChannelLayerImpl CV_FINAL : public ShuffleChannelLayer
{
public:
ShuffleChannelLayerImpl(const LayerParams& params)
{
group = params.get<int>("group", 1);
}
bool getMemoryShapes(const std::vector<MatShape> &inputs,
const int requiredOutputs,
std::vector<MatShape> &outputs,
std::vector<MatShape> &internals) const CV_OVERRIDE
{
CV_Assert(inputs.size() == 1 && inputs[0].size() == 4);
CV_Assert(inputs[0][1] % group == 0);
Layer::getMemoryShapes(inputs, requiredOutputs, outputs, internals);
return group == 1;
}
virtual void finalize(const std::vector<Mat*>& inputs, std::vector<Mat> &outputs) CV_OVERRIDE
{
if (group != 1)
{
LayerParams lp;
float order[] = {0, 2, 1, 3};
lp.set("order", DictValue::arrayInt(&order[0], 4));
permute = PermuteLayer::create(lp);
Mat inp = *inputs[0];
Mat out = outputs[0];
permuteInpShape.resize(4);
permuteInpShape[0] = inp.size[0];
permuteInpShape[1] = group;
permuteInpShape[2] = inp.size[1] / group;
permuteInpShape[3] = inp.size[2]*inp.size[3];
permuteOutShape.resize(4);
permuteOutShape[0] = permuteInpShape[0];
permuteOutShape[1] = permuteInpShape[2];
permuteOutShape[2] = permuteInpShape[1];
permuteOutShape[3] = permuteInpShape[3];
inp = inp.reshape(1, permuteInpShape);
out = out.reshape(1, permuteOutShape);
std::vector<Mat*> permuteInputs(1, &inp);
std::vector<Mat> permuteOutputs(1, out);
permute->finalize(permuteInputs, permuteOutputs);
}
}
void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr) CV_OVERRIDE
{
CV_TRACE_FUNCTION();
CV_TRACE_ARG_VALUE(name, "name", name.c_str());
Layer::forward_fallback(inputs_arr, outputs_arr, internals_arr);
}
void forward(std::vector<Mat*> &inputs, std::vector<Mat> &outputs, std::vector<Mat> &internals) CV_OVERRIDE
{
CV_TRACE_FUNCTION();
CV_TRACE_ARG_VALUE(name, "name", name.c_str());
Mat inp = *inputs[0];
Mat out = outputs[0];
if (inp.data != out.data)
{
if (!permute.empty())
{
inp = inp.reshape(1, permuteInpShape);
out = out.reshape(1, permuteOutShape);
std::vector<Mat*> permuteInputs(1, &inp);
std::vector<Mat> permuteOutputs(1, out);
permute->forward(permuteInputs, permuteOutputs, internals);
}
else
inp.copyTo(out);
}
}
private:
Ptr<PermuteLayer> permute;
std::vector<int> permuteInpShape, permuteOutShape;
};
Ptr<Layer> ShuffleChannelLayer::create(const LayerParams& params)
{
return Ptr<Layer>(new ShuffleChannelLayerImpl(params));
}
} // namespace dnn
} // namespace cv
This source diff could not be displayed because it is too large. You can view the blob instead.
......@@ -306,6 +306,7 @@ class OCL4DNNConvSpatial
std::string kernel_name_;
std::string cache_path_;
bool use_cache_path_; // true if cache_path_ directory exists
bool run_auto_tuning_;
bool force_auto_tuning_;
int32_t kernel_index_;
std::vector< cv::Ptr<kernelConfig> > kernelQueue;
......
......@@ -55,6 +55,7 @@
#include "../include/math_functions.hpp"
#include "../include/default_kernel_config.hpp"
#include "opencv2/dnn/shape_utils.hpp"
#include "opencv2/core/utils/logger.hpp"
#if defined WIN32 || defined _WIN32
#include <windows.h>
......@@ -67,6 +68,69 @@ typedef std::map<std::string, std::string> kernel_hash_t;
static kernel_hash_t kernelConfigMap;
static bool defaultConfigLoaded = false;
static std::string sanitize(const std::string& s)
{
std::string s_ = s;
for (size_t i = 0; i < s_.size(); i++)
{
char c = s_[i];
if (!((c >= '0' && c <= '9') || (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || c == '_'))
{
s_[i] = '_';
}
}
// TODO add hash?
// s_ = s_ + cv::format("_%08llx", crc64((uchar*)s.c_str(), s.size()));
return s_;
}
static void initializeGlobalBuiltinConfigurations(const std::string& cache_path)
{
CV_Assert(defaultConfigLoaded == false);
CV_Assert(kernelConfigMap.empty());
/* fp32 config */
size_t numConfigs = sizeof(default_kernel_config_intel_fp32) /
sizeof(default_kernel_config_intel_fp32[0]) / 2;
for (size_t i = 0; i < numConfigs; i++)
{
std::string key = std::string("Intel(R) Corporation_") + default_kernel_config_intel_fp32[2 * i];
if (!cache_path.empty())
{
std::string cacheFile = cache_path + sanitize(key);
std::ifstream cachedKernel(cacheFile.c_str());
if (cachedKernel)
continue; // external configuration found, skip builtin
}
std::pair<std::string, std::string> entry(
key,
default_kernel_config_intel_fp32[2 * i + 1]);
kernelConfigMap.insert(entry);
}
/* fp16 config */
numConfigs = sizeof(default_kernel_config_intel_fp16) /
sizeof(default_kernel_config_intel_fp16[0]) / 2;
for (size_t i = 0; i < numConfigs; i++)
{
std::string key = std::string("Intel(R) Corporation_") + default_kernel_config_intel_fp16[2 * i];
if (!cache_path.empty())
{
std::string cacheFile = cache_path + sanitize(key);
std::ifstream cachedKernel(cacheFile.c_str());
if (cachedKernel)
continue; // external configuration found, skip builtin
}
std::pair<std::string, std::string> entry(
key,
default_kernel_config_intel_fp16[2 * i + 1]);
kernelConfigMap.insert(entry);
}
defaultConfigLoaded = true;
}
template<typename Dtype>
OCL4DNNConvSpatial<Dtype>::OCL4DNNConvSpatial(OCL4DNNConvConfig config)
{
......@@ -139,9 +203,8 @@ OCL4DNNConvSpatial<Dtype>::OCL4DNNConvSpatial(OCL4DNNConvConfig config)
}
}
force_auto_tuning_ =
(use_cache_path_ && !utils::getConfigurationParameterBool("OPENCV_OCL4DNN_DISABLE_AUTO_TUNING", false))
|| utils::getConfigurationParameterBool("OPENCV_OCL4DNN_FORCE_AUTO_TUNING", false);
run_auto_tuning_ = use_cache_path_ && !utils::getConfigurationParameterBool("OPENCV_OCL4DNN_DISABLE_AUTO_TUNING", false);
force_auto_tuning_ = utils::getConfigurationParameterBool("OPENCV_OCL4DNN_FORCE_AUTO_TUNING", false);
}
template<typename Dtype>
......@@ -272,40 +335,38 @@ void OCL4DNNConvSpatial<Dtype>::setupKernelDetails(int32_t kernelType,
// options
options_ << " -cl-fast-relaxed-math -D KERNEL_IDLF -D convolve_simd=" << kernel_name_;
options_ << " -cl-mad-enable";
if (clOptionSupport("-cl-no-subgroup-ifp"))
options_ << " -cl-no-subgroup-ifp ";
// defs
int32_t output_width = output_w_;
int32_t output_height = output_h_;
int32_t output_block_width = blockM;
int32_t output_block_height = blockK;
const int32_t last_block_width = (output_width % output_block_width == 0) ?
output_block_width : output_width % output_block_width;
const int32_t last_block_height = (output_height % output_block_height == 0) ?
output_block_height : output_height % output_block_height;
int tile_x = alignSize((output_block_width - 1) * stride_w_ + kernel_w_ * dilation_w_, 4);
int tile_y = (output_block_height -1) * stride_h_ + kernel_h_ * dilation_h_;
int tile_y_stride = (4 * simd_size) / tile_x;
int invec_size = divUp(tile_y, tile_y_stride);
int tile_x = (output_block_width - 1) * stride_w_ + kernel_w_ * dilation_w_;
int tile_y = (output_block_height - 1) * stride_h_ + kernel_h_ * dilation_h_;
int invec_size = tile_y;
addDef("SIMD_SIZE", simd_size);
addDef("filter_qualifier", "__global");
addDef("OUT_BLOCK_WIDTH", output_block_width);
addDef("OUT_BLOCK_HEIGHT", output_block_height);
addDef("LAST_BLOCK_WIDTH", last_block_width);
addDef("LAST_BLOCK_HEIGHT", last_block_height);
addDef("INPUT_DEPTH", channels_ / group_);
addDef("TOTAL_INPUT_DEPTH_SIZE", channels_);
addDef("TOTAL_OUTPUT_DEPTH", num_output_);
addDef("NUM_FILTERS", M_);
addDef("TILE_X", tile_x);
addDef("TILE_Y", tile_y);
addDef("TILE_Y_STRIDE", tile_y_stride);
addDef("INVEC_SIZE", invec_size);
addDef("ALIGNED_NUM_FILTERS", (int)alignSize(M_, simd_size));
addDef("OUT_BLOCK_SIZE", (output_block_width*output_block_height));
addDef("APPLY_BIAS", bias_term_);
addDef("WEIGHT_PREF", ((kernel_w_ * kernel_h_) == 1) ? 1 : 8);
addDef("INPUT_PITCH", (width_ * height_));
addDef("OUTPUT_PITCH", (output_w_ * output_h_));
addDef("LEFT_FILTERS", ((int)alignSize(M_, simd_size) - M_));
addDef("INPUT_WIDTH", width_);
addDef("INPUT_HEIGHT", height_);
addDef("FILTERS_IN_GROUP", ((int)alignSize(M_, simd_size) / simd_size));
setFusionDefine(fused_activ_, fused_eltwise_);
src_ = cv::ocl::dnn::conv_layer_spatial_oclsrc;
......@@ -528,13 +589,6 @@ void OCL4DNNConvSpatial<Dtype>::calculateBenchmark(const UMat &bottom, UMat &ver
return;
}
#define dbg
#ifdef dbg
#define dbgPrint(x) (x)
#else
#define dbgPrint(x)
#endif
// For large enough input size, we do not need to tune kernels for different
// size. The reason is with large input size, there will be enough work items
// to feed al the EUs.
......@@ -545,6 +599,7 @@ void OCL4DNNConvSpatial<Dtype>::calculateBenchmark(const UMat &bottom, UMat &ver
template<typename Dtype>
void OCL4DNNConvSpatial<Dtype>::generateKey()
{
std::string precision = (use_half_) ? "FP16" : "FP32";
std::stringstream keyBuilder;
// FIXME: to support fuse?
keyBuilder << "k" << kernel_w_ << "x" << kernel_h_ << "_"
......@@ -558,21 +613,12 @@ void OCL4DNNConvSpatial<Dtype>::generateKey()
<< "num" << num_ << "_"
<< "M" << M_ << "_"
<< "activ" << fused_activ_ << "_"
<< "eltwise" << fused_eltwise_;
<< "eltwise" << fused_eltwise_ << "_"
<< precision;
key_ = ocl::Device::getDefault().vendorName() + "_EU" + cv::format("%d", ocl::Device::getDefault().maxComputeUnits()) + "_" + keyBuilder.str();
key_sanitized_ = key_;
for (size_t i = 0; i < key_sanitized_.size(); i++)
{
char c = key_sanitized_[i];
if (!((c >= '0' && c <= '9') || (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || c == '_'))
{
key_sanitized_[i] = '_';
}
}
// TODO add hash?
// key_sanitized_ = key_sanitized_ + cv::format("_%08llx", crc64((uchar*)key_.c_str(), key_.size()));
key_sanitized_ = sanitize(key_);
short_key_ = keyBuilder.str();
}
......@@ -587,11 +633,6 @@ std::string OCL4DNNConvSpatial<Dtype>::generateSpecificKey(int32_t type, int32_t
<< "_" << blockHeight
<< "_" << blockDepth;
if (!use_half_)
keyBuilder << "_float";
else
keyBuilder << "_half";
return keyBuilder.str();
}
......@@ -1135,7 +1176,7 @@ float OCL4DNNConvSpatial<float>::timedConvolve(const UMat &bottom, UMat &top,
cv::ocl::Timer timer(queue);
timer.start();
bool res = true;;
dbgPrint(std::cout << "Benchmarking kernel: " << config->kernelName << std::endl);
CV_LOG_INFO(NULL, "Benchmarking kernel: " << config->kernelName);
tuned_ = true;
int loop_cnt = 4;
for (int i = 0; i < loop_cnt; i++) {
......@@ -1152,7 +1193,6 @@ float OCL4DNNConvSpatial<float>::timedConvolve(const UMat &bottom, UMat &top,
}
float elapsedTime = timer.durationNS() * 1e-6 / loop_cnt;
#ifdef dbg
double out_w = output_w_;
double out_h = output_h_;
double out_z = M_;
......@@ -1160,16 +1200,8 @@ float OCL4DNNConvSpatial<float>::timedConvolve(const UMat &bottom, UMat &top,
double k_h = kernel_h_;
double k_z = channels_;
double totalFlops = ((k_w*k_h*k_z -1)*2)*(out_w*out_h*out_z)*num_;
std::cout << "\tEstimated Gflops:" << (totalFlops * 1e-9)
<< std::endl;
std::cout << "\tEstimated GFLOPS/S: " << ((totalFlops * 1e-9)*(1000.0/elapsedTime))
<< std::endl;
#if 0
std::cout << "Estimated utilization: " <<
((((totalFlops/1000)/1000)/1000)*(1000.0/elapsedTime))/880.0
<< std::endl;
#endif
#endif
CV_LOG_INFO(NULL, "\tEstimated Gflops:" << (totalFlops * 1e-9));
CV_LOG_INFO(NULL, "\tEstimated GFLOPS/S: " << ((totalFlops * 1e-9)*(1000.0/elapsedTime)));
return elapsedTime;
}
......@@ -1225,18 +1257,18 @@ bool OCL4DNNConvSpatial<float>::verifyResult(const UMat &bottom,
if (use_half_ && error_factor > 0.1 * fabs(verify_data[offset]) &&
error_factor > 0.04 && !(fabs(verify_data[offset]) < 1.e-3 && error_factor < 1.e-4))
{
dbgPrint(printf("test verification failed @ image %d group %d"
"out_ch %d h %d w %d got %G expected %G\n",
n, g, out_ch, h, w, data[offset], verify_data[offset]));
CV_LOG_ERROR(NULL, "test verification failed @ image " << n << " group " << g
<< " out_ch " << out_ch << " h " << h << " w " << w
<< " got " << data[offset] << " expected " << verify_data[offset]);
verificationFail = 1;
goto out;
}
else if (!use_half_ && error_factor > 0.1 * fabs(verify_data[offset]) &&
!(fabs(verify_data[offset]) < 1.e-3 && error_factor < 1.e-4))
{
dbgPrint(printf("test verification failed @ image %d group %d"
"out_ch %d h %d w %d got %G expected %G\n",
n, g, out_ch, h, w, data[offset], verify_data[offset]));
CV_LOG_ERROR(NULL, "test verification failed @ image " << n << " group " << g
<< " out_ch " << out_ch << " h " << h << " w " << w
<< " got " << data[offset] << " expected " << verify_data[offset]);
verificationFail = 1;
goto out;
}
......@@ -1517,17 +1549,11 @@ void OCL4DNNConvSpatial<float>::generate_idlf_tuneritems(std::vector< cv::Ptr<tu
return;
int actual_tile_x = kernel_w_ * dilation_w_ + (blockM - 1) * stride_w_ ;
int tile_x = alignSize(actual_tile_x, 4);
int tile_y = kernel_h_ * dilation_h_ + (blockK - 1) * stride_h_;
if (tile_x > (4 * simd_size))
return;
if ((blockM * blockK + divUp(tile_x * tile_y, simd_size)) > block_size_max)
int tile_x = alignSize(actual_tile_x, simd_size);
if (tile_x > simd_size)
return;
int tile_y_stride = (4 * simd_size) / tile_x;
int invec_size = divUp(tile_y, tile_y_stride);
if (invec_size > 4)
if (blockM * blockK > block_size_max)
return;
tunerItems.push_back(makePtr<tunerParam>(KERNEL_TYPE_INTEL_IDLF, blockM, blockK, simd_size));
......@@ -1570,11 +1596,7 @@ void OCL4DNNConvSpatial<float>::generateTunerItems(std::vector< cv::Ptr<tunerPar
for (uint32_t height = height_max; height > 0; height--)
{
generate_idlf_tuneritems(tunerItems, width, height, simd_size);
if (tunerItems.size() >= 8 && height == 2)
break;
}
if (tunerItems.size() >= 12 && width == 2)
break;
}
}
}
......@@ -1661,35 +1683,31 @@ void OCL4DNNConvSpatial<float>::setupConvolution(const UMat &bottom,
if (kernelQueue[x]->tested == false) {
bool verified = verifyResult(bottom, top, weight, bias, numImages, kernelQueue[x], verifyTop);
if (verified == false) {
dbgPrint(std::cout << "Kernel "
<< kernelQueue[x]->kernelName
<< " failed verification" << std::endl);
dbgPrint(std::cout << "kernelQueue[x]->workItem_output[0]: "
<< kernelQueue[x]->workItem_output[0] << " "
<< "kernelQueue[x]->workItem_output[1]: "
<< kernelQueue[x]->workItem_output[1] << " "
<< "kernelQueue[x]->workItem_output[2]: "
<< kernelQueue[x]->workItem_output[2] << " "
<< "kernelQueue[x]->kernelType: "
<< kernelQueue[x]->kernelType << " "
<< "kernelQueue[x]->global_work_size[0]: "
<< kernelQueue[x]->global_work_size[0] << " "
<< "kernelQueue[x]->global_work_size[1]: "
<< kernelQueue[x]->global_work_size[1] << " "
<< "kernelQueue[x]->global_work_size[2]: "
<< kernelQueue[x]->global_work_size[2] << " "
<< "kernelQueue[x]->local_work_size[0]: "
<< kernelQueue[x]->local_work_size[0] << " "
<< "kernelQueue[x]->local_work_size[1]: "
<< kernelQueue[x]->local_work_size[1] << " "
<< "kernelQueue[x]->local_work_size[2]: "
<< kernelQueue[x]->local_work_size[2] << " "
<< kernelQueue[x]->swizzle_weights << " "
<< kernelQueue[x]->use_null_local << std::endl);
CV_LOG_ERROR(NULL, "Kernel " << kernelQueue[x]->kernelName << " failed verification");
CV_LOG_ERROR(NULL, "kernelQueue[x]->workItem_output[0]: "
<< kernelQueue[x]->workItem_output[0] << " "
<< "kernelQueue[x]->workItem_output[1]: "
<< kernelQueue[x]->workItem_output[1] << " "
<< "kernelQueue[x]->workItem_output[2]: "
<< kernelQueue[x]->workItem_output[2] << " "
<< "kernelQueue[x]->kernelType: "
<< kernelQueue[x]->kernelType << " "
<< "kernelQueue[x]->global_work_size[0]: "
<< kernelQueue[x]->global_work_size[0] << " "
<< "kernelQueue[x]->global_work_size[1]: "
<< kernelQueue[x]->global_work_size[1] << " "
<< "kernelQueue[x]->global_work_size[2]: "
<< kernelQueue[x]->global_work_size[2] << " "
<< "kernelQueue[x]->local_work_size[0]: "
<< kernelQueue[x]->local_work_size[0] << " "
<< "kernelQueue[x]->local_work_size[1]: "
<< kernelQueue[x]->local_work_size[1] << " "
<< "kernelQueue[x]->local_work_size[2]: "
<< kernelQueue[x]->local_work_size[2] << " "
<< kernelQueue[x]->swizzle_weights << " "
<< kernelQueue[x]->use_null_local);
} else {
dbgPrint(std::cout << "Kernel "
<< kernelQueue[x]->kernelName
<< " pass verification" << std::endl);
CV_LOG_INFO(NULL, "Kernel " << kernelQueue[x]->kernelName << " pass verification");
}
}
#endif
......@@ -1718,19 +1736,28 @@ void OCL4DNNConvSpatial<float>::setupConvolution(const UMat &bottom,
break;
} else {
kernelQueue[fastestKernel]->tested = true;
dbgPrint(std::cout << "Kernel " <<
kernelQueue[fastestKernel]->kernelName <<
" failed verification" << std::endl);
CV_LOG_ERROR(NULL, "Kernel " << kernelQueue[fastestKernel]->kernelName <<
" failed verification");
failures++;
}
}
}
if (verification) {
dbgPrint(std::cout << "Kernel <" << kernelQueue[kernel_index_]->kernelName <<
"> passed verification" << std::endl);
dbgPrint(std::cout << "Convolution Time:" << kernelQueue[kernel_index_]->executionTime << std::endl);
CV_LOG_INFO(NULL, "Kernel <" << kernelQueue[kernel_index_]->kernelName <<
"> passed verification");
CV_LOG_INFO(NULL, "Convolution Time:" << kernelQueue[kernel_index_]->executionTime);
double out_w = output_w_;
double out_h = output_h_;
double out_z = M_;
double k_w = kernel_w_;
double k_h = kernel_h_;
double k_z = channels_;
float elapsedTime = kernelQueue[kernel_index_]->executionTime;
double totalFlops = ((k_w*k_h*k_z -1)*2)*(out_w*out_h*out_z)*num_;
CV_LOG_INFO(NULL, "\tEstimated Gflops:" << (totalFlops * 1e-9));
CV_LOG_INFO(NULL, "\tEstimated GFLOPS/S: " << ((totalFlops * 1e-9)*(1000.0/elapsedTime)));
} else {
dbgPrint(std::cout << "fallback to basic kernel" << std::endl);
CV_LOG_INFO(NULL, "fallback to basic kernel");
options_.str(""); options_.clear(); // clear contents and state flags
createBasicKernel(1, 1, 1);
kernel_index_ = kernelQueue.size() - 1;
......@@ -1798,14 +1825,14 @@ void OCL4DNNConvSpatial<Dtype>::prepareKernel(const UMat &bottom, UMat &top,
if (loadCachedConfig()) // check in-memory cache
return;
if (loadTunedConfig()) // check external storage
if (loadTunedConfig()) // check external storage
return;
UMat benchData(1, numImages * top_dim_, (use_half_) ? CV_16SC1 : CV_32FC1);
calculateBenchmark(bottom, benchData, (use_half_) ? weights_half : weight, bias, numImages);
if (force_auto_tuning_)
if (run_auto_tuning_ || force_auto_tuning_)
{
setupConvolution(bottom, top, weight, bias, numImages, benchData);
}
......@@ -1820,18 +1847,8 @@ template<typename Dtype>
bool OCL4DNNConvSpatial<Dtype>::loadCachedConfig()
{
cv::AutoLock lock(kernelConfigMutex);
if (!defaultConfigLoaded)
{
const size_t numConfigs = sizeof(default_kernel_config_intel)/sizeof(default_kernel_config_intel[0])/2;
for (size_t i = 0; i < numConfigs; i++)
{
std::pair<std::string, std::string> entry(
std::string("Intel(R) Corporation_") + default_kernel_config_intel[2 * i],
default_kernel_config_intel[2 * i + 1]);
kernelConfigMap.insert(entry);
}
defaultConfigLoaded = true;
}
if (!defaultConfigLoaded && !force_auto_tuning_)
initializeGlobalBuiltinConfigurations((use_cache_path_ && !cache_path_.empty()) ? (cache_path_ + '/') : std::string());
kernel_hash_t::iterator it = kernelConfigMap.find(key_);
if (it != kernelConfigMap.end())
......@@ -1904,9 +1921,12 @@ bool OCL4DNNConvSpatial<Dtype>::setupKernelByConfig(int x, int y, int z, int typ
template<typename Dtype>
bool OCL4DNNConvSpatial<Dtype>::loadTunedConfig()
{
if (force_auto_tuning_)
return false; // don't load results from external storage
if (!use_cache_path_)
{
if (cache_path_.empty() && !force_auto_tuning_)
if (cache_path_.empty())
{
static int warn_ = 0;
if (!warn_)
......
......@@ -206,8 +206,6 @@ __kernel void ConvolveBasic(
#elif defined KERNEL_IDLF
#define VLOAD4(_v, _p) do { _v = vload4(0, _p); } while(0)
// Each work-item computes a OUT_BLOCK_WIDTH * OUT_BLOCK_HEIGHT region of one output map.
// Each work-group (which will be mapped to 1 SIMD16/SIMD8 EU thread) will compute 16/8 different feature maps, but each feature map is for the same region of the input image.
// NDRange: (output_width+pad)/ OUT_BLOCK_WIDTH, (output_height+pad)/OUT_BLOCK_HEIGHT, NUM_FILTERS/OUT_BLOCK_DEPTH
......@@ -219,190 +217,123 @@ __kernel void
convolve_simd(
ELTWISE_DATA_ARG
FUSED_ARG
__global Dtype* inputs_base,
filter_qualifier Dtype* weights_base,
__global Dtype* inputs,
__global Dtype* weights,
BIAS_KERNEL_ARG
__global Dtype* outputs_base,
__global Dtype* outputs,
const ushort input_width,
const ushort input_height,
const ushort output_width,
const ushort output_height)
{
__global Dtype* outputs = outputs_base;
__global Dtype* inputs = inputs_base;
filter_qualifier Dtype* weights = weights_base;
unsigned int oc = get_global_id(0) * OUT_BLOCK_WIDTH; // oc = Output Column
unsigned int or = get_global_id(1) * OUT_BLOCK_HEIGHT;// or = Output Row
unsigned int fm = get_global_id(2);// fm = Feature Map = od = Output Depth
unsigned int or = get_global_id(1) * OUT_BLOCK_HEIGHT; // or = Output Row
unsigned int fm = get_global_id(2); // fm = Feature Map = od = Output Depth
unsigned int fmg = get_group_id(2);
unsigned int lid = get_local_id(2);
Dtype out[OUT_BLOCK_WIDTH * OUT_BLOCK_HEIGHT];
int in_addr;
Dtype out[OUT_BLOCK_WIDTH * OUT_BLOCK_HEIGHT] = { 0.0f };
// find weights address of given neuron (lid is index)
unsigned int weight_addr = (fmg % (ALIGNED_NUM_FILTERS/SIMD_SIZE)) * INPUT_DEPTH * KERNEL_WIDTH * KERNEL_HEIGHT * SIMD_SIZE + lid;
unsigned int weight_addr = (fmg % FILTERS_IN_GROUP) *
INPUT_DEPTH * KERNEL_WIDTH * KERNEL_HEIGHT * SIMD_SIZE + lid;
for(int i=0;i<OUT_BLOCK_SIZE;i++) {
out[i]=0.0f;
}
unsigned int num_in_batch = fm / ALIGNED_NUM_FILTERS;
unsigned int num_in_batch = ( fm ) / ALIGNED_NUM_FILTERS;
unsigned int input_batch_offset = num_in_batch * INPUT_PITCH * TOTAL_INPUT_DEPTH_SIZE;
unsigned int input_batch_offset = num_in_batch * input_height * input_width * TOTAL_INPUT_DEPTH_SIZE;
int curr_local_y = ( lid / ( TILE_X / 4 ) );
int curr_local_x = ( lid % ( TILE_X / 4 ) ) * 4;
int curr_y = or * STRIDE_Y + curr_local_y;
int curr_x = oc * STRIDE_X + curr_local_x;
int curr_y = or * STRIDE_Y;
int curr_x = oc * STRIDE_X + lid;
#if INPUT_PAD_W != 0 || INPUT_PAD_H != 0 || INPUT_PAD_BOTTOM != 0 || INPUT_PAD_RIGHT != 0
int saved_y = curr_y;
#endif
in_addr = input_batch_offset
+ (curr_y - INPUT_PAD_H) * input_width // y tile offset
+ curr_x - INPUT_PAD_W; // x tile offset
union {
Dtype4 in_vec[INVEC_SIZE];
Dtype in_array[INVEC_SIZE * 4];
} in_buf;
int in_addr = input_batch_offset
+ (curr_y - INPUT_PAD_H) * INPUT_WIDTH // y tile offset
+ curr_x - INPUT_PAD_W; // x tile offset
Dtype in_buf[INVEC_SIZE];
for(int kd = 0; kd < INPUT_DEPTH; kd++)
{
int in_offset = in_addr;
int reg = 0;
LOOP(INVEC_SIZE, reg,
{
if (curr_local_y + reg * TILE_Y_STRIDE < TILE_Y || INVEC_SIZE * TILE_Y_STRIDE <= (TILE_Y + 2) || reg < INVEC_SIZE - 1) {
__attribute__((opencl_unroll_hint(INVEC_SIZE)))
for (int reg = 0; reg < INVEC_SIZE; reg++)
{
in_buf[reg] = inputs[in_offset];
#if INPUT_PAD_W != 0 || INPUT_PAD_H != 0 || INPUT_PAD_BOTTOM != 0 || INPUT_PAD_RIGHT != 0
if (curr_y >= INPUT_PAD_H && curr_y < input_height + INPUT_PAD_H && curr_x + 3 >= INPUT_PAD_W && curr_x < input_width + INPUT_PAD_W) {
if (curr_x < INPUT_PAD_W) {
in_buf.in_vec[reg].s0 = 0;
if (curr_x + 1 >= INPUT_PAD_W && curr_x + 1 < input_width + INPUT_PAD_W)
in_buf.in_vec[reg].s1 = *(inputs + in_offset + 1);
else
in_buf.in_vec[reg].s1 = 0;
if (curr_x + 2 >= INPUT_PAD_W && curr_x + 2 < input_width + INPUT_PAD_W)
in_buf.in_vec[reg].s2 = *(inputs + in_offset + 2);
else
in_buf.in_vec[reg].s2 = 0;
if (curr_x + 3 < input_width + INPUT_PAD_W)
in_buf.in_vec[reg].s3 = *(inputs + in_offset + 3);
else
in_buf.in_vec[reg].s3 = 0;
} else {
VLOAD4(in_buf.in_vec[reg], inputs + in_offset);
if (curr_x + 1 >= input_width + INPUT_PAD_W)
in_buf.in_vec[reg].s1 = 0;
if (curr_x + 2 >= input_width + INPUT_PAD_W)
in_buf.in_vec[reg].s2 = 0;
if (curr_x + 3 >= input_width + INPUT_PAD_W)
in_buf.in_vec[reg].s3 = 0;
}
} else {
in_buf.in_vec[reg] = 0;
if (!(curr_y >= INPUT_PAD_H && curr_y < INPUT_HEIGHT + INPUT_PAD_H &&
curr_x >= INPUT_PAD_W && curr_x < INPUT_WIDTH + INPUT_PAD_W))
{
in_buf[reg] = 0;
}
curr_y += TILE_Y_STRIDE;
#else
VLOAD4(in_buf.in_vec[reg], inputs + in_offset);
#endif
}
in_offset += input_width * TILE_Y_STRIDE;
});
in_addr += input_height * input_width;
curr_y += 1;
in_offset += INPUT_WIDTH;
}
in_addr += INPUT_PITCH;
#if INPUT_PAD_W != 0 || INPUT_PAD_H != 0 || INPUT_PAD_BOTTOM != 0 || INPUT_PAD_RIGHT != 0
curr_y = saved_y;
#endif
#if KERNEL_WIDTH * KERNEL_HEIGHT != 1
#define WEIGHT_PREF 8
#else
#define WEIGHT_PREF 1
#endif
union {
Dtype w[WEIGHT_PREF];
#if KERNEL_WIDTH * KERNEL_HEIGHT != 1
INT_TYPE8 ui8;
#endif
} weight_buf;
Dtype weight_buf[WEIGHT_PREF];
int w_idx=0;
unsigned int orig_weight_addr = weight_addr;
#if KERNEL_WIDTH * KERNEL_HEIGHT != 1
weight_buf.ui8 = SUB_GROUP_BLOCK_READ8((__global INT_TYPE *)&weights[weight_addr]);
weight_addr += SIMD_SIZE * WEIGHT_PREF;
#else
weight_buf.w[0] = as_Dtype(SUB_GROUP_BLOCK_READ((__global INT_TYPE *)&weights[weight_addr]));
weight_addr += SIMD_SIZE * 1;
#endif
for (int i = 0; i < WEIGHT_PREF; i++)
{
weight_buf[i] = weights[weight_addr];
weight_addr += SIMD_SIZE;
}
#define BLOCK_IN(n) sub_group_broadcast( in_buf.in_array[((n)%4) + ((n) / (TILE_Y_STRIDE * TILE_X)) * 4], (((n) % (TILE_Y_STRIDE * TILE_X))/4))
#define BLOCK_IN(n, c) intel_sub_group_shuffle(in_buf[n], (c))
int kr = 0; // kr = Kernel Row
LOOP(KERNEL_HEIGHT, kr,// LOOP is a macro that unrolls the loop.
{
int kc = 0; // kc = Kernel Column
LOOP(KERNEL_WIDTH, kc,
{
int kc = 0; // kc = Kernel Column
LOOP(KERNEL_WIDTH, kc,
{
for(int br=0; br < OUT_BLOCK_HEIGHT; br++) {
for(int bc=0; bc < OUT_BLOCK_WIDTH; bc++) {
Dtype input = BLOCK_IN((br * STRIDE_Y + kr * DILATION_Y) * TILE_X + bc * STRIDE_X + kc * DILATION_X);
out[br * OUT_BLOCK_WIDTH + bc] = mad(weight_buf.w[w_idx % WEIGHT_PREF], input, out[br * OUT_BLOCK_WIDTH + bc]);
}
}
#if KERNEL_WIDTH * KERNEL_HEIGHT > WEIGHT_PREF
// We assume KERNEL_W is equal to KERNEL_H here.
if ((w_idx + 1) % WEIGHT_PREF == 0
#if KERNEL_WIDTH * KERNEL_HEIGHT % 8 != 0
&& ((w_idx + 1) <= (KERNEL_WIDTH * KERNEL_HEIGHT - WEIGHT_PREF))
#endif
) {
weight_buf.ui8 = SUB_GROUP_BLOCK_READ8((__global INT_TYPE *)&weights[weight_addr]);
weight_addr += SIMD_SIZE * WEIGHT_PREF; // weights must be stored in just the right SIMD swizzled format for this to work, see host code for details.
for (int br=0; br < OUT_BLOCK_HEIGHT; br++)
{
for(int bc=0; bc < OUT_BLOCK_WIDTH; bc++)
{
Dtype input = BLOCK_IN((br * STRIDE_Y + kr * DILATION_Y), bc * STRIDE_X + kc * DILATION_X);
out[br * OUT_BLOCK_WIDTH + bc] = mad(weight_buf[w_idx % WEIGHT_PREF], input, out[br * OUT_BLOCK_WIDTH + bc]);
}
#if KERNEL_WIDTH*KERNEL_HEIGHT % 8 == 0
// need to do nothing
#else
else if ((w_idx + 1) % WEIGHT_PREF == 0 && ((w_idx + 1) > (KERNEL_WIDTH * KERNEL_HEIGHT - WEIGHT_PREF)))
#if KERNEL_WIDTH * KERNEL_HEIGHT % 8 == 1
weight_buf.w[0] = weights[weight_addr];
#elif KERNEL_WIDTH * KERNEL_HEIGHT % 8 == 2
weight_buf.ui8.s01 = SUB_GROUP_BLOCK_READ2((__global INT_TYPE *)&weights[weight_addr]);
#elif KERNEL_WIDTH * KERNEL_HEIGHT % 8 <= 4
weight_buf.ui8.s0123 = SUB_GROUP_BLOCK_READ4((__global INT_TYPE *)&weights[weight_addr]);
#else
weight_buf.ui8 = SUB_GROUP_BLOCK_READ8((__global INT_TYPE *)&weights[weight_addr]);
#endif
#endif
#endif
++w_idx;
});
}
weight_buf[w_idx % WEIGHT_PREF] = weights[weight_addr];
weight_addr += SIMD_SIZE;
++w_idx;
});
weight_addr = orig_weight_addr + KERNEL_WIDTH * KERNEL_HEIGHT * SIMD_SIZE;
}
// dead code to work around possible compiler bug.
if (ALIGNED_NUM_FILTERS != NUM_FILTERS && fm > 0xfffffffeul) {
outputs[0] = BLOCK_IN(fm % SIMD_SIZE);
});
weight_addr -= WEIGHT_PREF * SIMD_SIZE;
}
fm = fm % ALIGNED_NUM_FILTERS;
if ((ALIGNED_NUM_FILTERS == NUM_FILTERS || fm < NUM_FILTERS)) {
unsigned int out_addr = ( num_in_batch * TOTAL_OUTPUT_DEPTH + fm ) * output_width * output_height;
out_addr += or * output_width + oc;
// we need this address calculation for biases because we support views and batching
#if LEFT_FILTERS > 0
if (fm < NUM_FILTERS)
#endif
{
unsigned int out_addr = (num_in_batch * TOTAL_OUTPUT_DEPTH + fm) * OUTPUT_PITCH;
out_addr += or * output_width + oc;
// we need this address calculation for biases because we support views and batching
#if APPLY_BIAS
Dtype bias = biases_base[fm];
Dtype bias = biases_base[fm];
#else
Dtype bias = 0;
Dtype bias = 0;
#endif
for(unsigned int r = 0; r < OUT_BLOCK_HEIGHT; r++) {
for(unsigned int r = 0; r < OUT_BLOCK_HEIGHT; r++)
{
if (r + or >= output_height) break;
for(unsigned int c = 0; c < OUT_BLOCK_WIDTH; c++) {
for(unsigned int c = 0; c < OUT_BLOCK_WIDTH; c++)
{
if (c + oc >= output_width) break;
// this does a scattered write to SIMD_SIZE different feature maps, so that data within one map is contiguous, thus ready for input to next layer.
// this does a scattered write to SIMD_SIZE different feature maps,
// so that data within one map is contiguous, thus ready for input to next layer.
ACTIVATION_FUNCTION(outputs, out_addr + r * output_width + c, bias + out[r * OUT_BLOCK_WIDTH + c], fm);
}
}
}
......
......@@ -1137,11 +1137,95 @@ private:
int outWidth, outHeight, zoomFactor;
};
TEST(Layer_Test_Interp, Accuracy)
TEST(Layer_Test_Interp_custom, Accuracy)
{
    // Register a user-supplied InterpLayer implementation, run it against the
    // reference Caffe model, then unregister so the built-in layer is restored
    // for subsequent tests.
    CV_DNN_REGISTER_LAYER_CLASS(Interp, InterpLayer);
    testLayerUsingCaffeModels("layer_interp", DNN_TARGET_CPU, false, false);
    LayerFactory::unregisterLayer("Interp");
}
TEST(Layer_Test_Interp, Accuracy)
{
    // Same model as Layer_Test_Interp_custom, but exercised with the built-in
    // Interp layer (no custom registration).
    testLayerUsingCaffeModels("layer_interp", DNN_TARGET_CPU, false, false);
}
TEST(Layer_Test_PoolingIndices, Accuracy)
{
    // Single 2x2, stride-2, zero-pad max-pooling layer: the first output blob
    // holds the pooled maxima, the second the flat indices (row * 10 + col)
    // of the selected elements.
    Net net;

    LayerParams lp;
    lp.set("pool", "max");
    lp.set("kernel_w", 2);
    lp.set("kernel_h", 2);
    lp.set("stride_w", 2);
    lp.set("stride_h", 2);
    lp.set("pad_w", 0);
    lp.set("pad_h", 0);
    lp.name = "testLayer.name";  // This test also checks that OpenCV accepts layer names containing dots.
    lp.type = "Pooling";
    net.addLayerToPrev(lp.name, lp.type, lp);

    Mat inp(10, 10, CV_8U);
    randu(inp, 0, 255);

    // Compute the reference answer on the CPU: for each 2x2 window keep the
    // first strictly-greater value and remember where it came from.
    Mat maxValues(5, 5, CV_32F, Scalar(-1)), indices(5, 5, CV_32F, Scalar(-1));
    for (int y = 0; y < 10; ++y)
    {
        const int dstY = y / 2;
        for (int x = 0; x < 10; ++x)
        {
            const int dstX = x / 2;
            const float val = (float)inp.at<uint8_t>(y, x);
            if (val > maxValues.at<float>(dstY, dstX))
            {
                maxValues.at<float>(dstY, dstX) = val;
                indices.at<float>(dstY, dstX) = y * 10 + x;
            }
        }
    }

    net.setInput(blobFromImage(inp));
    std::vector<Mat> outputs;
    net.forward(outputs, lp.name);
    normAssert(maxValues, outputs[0].reshape(1, 5));
    normAssert(indices, outputs[1].reshape(1, 5));
}
typedef testing::TestWithParam<tuple<Vec4i, int> > Layer_Test_ShuffleChannel;
TEST_P(Layer_Test_ShuffleChannel, Accuracy)
{
    // Verify that every output channel of a ShuffleChannel layer is an exact
    // copy of input channel groupSize * (c % group) + c / group.
    const Vec4i inpShapeVec = get<0>(GetParam());
    const int group = get<1>(GetParam());
    ASSERT_EQ(inpShapeVec[1] % group, 0);
    const int groupSize = inpShapeVec[1] / group;

    Net net;
    LayerParams lp;
    lp.set("group", group);
    lp.type = "ShuffleChannel";
    lp.name = "testLayer";
    net.addLayerToPrev(lp.name, lp.type, lp);

    const int inpShape[] = {inpShapeVec[0], inpShapeVec[1], inpShapeVec[2], inpShapeVec[3]};
    Mat inp(4, inpShape, CV_32F);
    randu(inp, 0, 255);
    net.setInput(inp);
    Mat out = net.forward();

    for (int n = 0; n < inpShapeVec[0]; ++n)
        for (int c = 0; c < inpShapeVec[1]; ++c)
        {
            Mat outChannel = getPlane(out, n, c);
            Mat inpChannel = getPlane(inp, n, groupSize * (c % group) + c / group);
            normAssert(outChannel, inpChannel);
        }
}
// Parameterize over two 4-D input shapes and group counts 1, 2, 3, 6
// (each divides both tested channel counts, 6 and 12).
INSTANTIATE_TEST_CASE_P(/**/, Layer_Test_ShuffleChannel, Combine(
/*input shape*/ Values(Vec4i(1, 6, 5, 7), Vec4i(3, 12, 1, 4)),
/*group*/ Values(1, 2, 3, 6)
));
}} // namespace
......@@ -87,7 +87,7 @@ static void runTorchNet(String prefix, int targetId = DNN_TARGET_CPU, String out
if (outLayerName.empty())
outLayerName = net.getLayerNames().back();
net.setInput(inp, "0");
net.setInput(inp);
std::vector<Mat> outBlobs;
net.forward(outBlobs, outLayerName);
normAssert(outRef, outBlobs[0]);
......
......@@ -70,6 +70,35 @@
#include "PS1080.h"
///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
static cv::Mutex initOpenNI2Mutex;
// RAII wrapper around the global OpenNI2 runtime: constructed once on first
// use, shut down automatically when the static instance is destroyed.
struct OpenNI2Initializer
{
public:
    // Thread-safe, lazy one-time initialization. The mutex guards the first
    // construction of the function-local static instance; subsequent calls
    // are no-ops.
    static void init()
    {
        cv::AutoLock al(initOpenNI2Mutex);
        static OpenNI2Initializer initializer;
    }

private:
    OpenNI2Initializer()
    {
        // Initialize and configure the context.
        openni::Status status = openni::OpenNI::initialize();
        if (status != openni::STATUS_OK)
        {
            CV_Error(CV_StsError, std::string("Failed to initialize:") + openni::OpenNI::getExtendedError());
        }
    }

    ~OpenNI2Initializer()
    {
        // Releases the OpenNI2 runtime at static-destruction time.
        openni::OpenNI::shutdown();
    }
};
class CvCapture_OpenNI2 : public CvCapture
{
public:
......@@ -107,6 +136,8 @@ protected:
static openni::VideoMode defaultStreamOutputMode(int stream);
CvCapture_OpenNI2(int index, const char * filename);
IplImage* retrieveDepthMap();
IplImage* retrievePointCloudMap();
IplImage* retrieveDisparityMap();
......@@ -116,8 +147,8 @@ protected:
IplImage* retrieveGrayImage();
IplImage* retrieveIrImage();
openni::Status toggleStream(int stream, bool toggle);
bool readCamerasParams();
void toggleStream(int stream, bool toggle);
void readCamerasParams();
double getDepthGeneratorProperty(int propIdx) const;
bool setDepthGeneratorProperty(int propIdx, double propVal);
......@@ -131,12 +162,11 @@ protected:
// OpenNI context
openni::Device device;
bool isContextOpened;
openni::Recorder recorder;
// Data generators with its metadata
openni::VideoStream streams[CV_MAX_NUM_STREAMS];
openni::VideoFrameRef streamFrames[CV_MAX_NUM_STREAMS];
cv::Mat streamImages[CV_MAX_NUM_STREAMS];
std::vector<openni::VideoStream> streams;
std::vector<openni::VideoFrameRef> streamFrames;
std::vector<cv::Mat> streamImages;
int maxBufferSize, maxTimeDuration; // for approx sync
bool isCircleBuffer;
......@@ -191,80 +221,103 @@ openni::VideoMode CvCapture_OpenNI2::defaultStreamOutputMode(int stream)
return mode;
}
CvCapture_OpenNI2::CvCapture_OpenNI2( int index )
{
const char* deviceURI = openni::ANY_DEVICE;
openni::Status status;
int deviceType = DEVICE_DEFAULT;
noSampleValue = shadowValue = 0;
isContextOpened = false;
maxBufferSize = DEFAULT_MAX_BUFFER_SIZE;
isCircleBuffer = DEFAULT_IS_CIRCLE_BUFFER;
maxTimeDuration = DEFAULT_MAX_TIME_DURATION;
if( index >= 10 )
{
deviceType = index / 10;
index %= 10;
}
// Open a physical device selected by enumeration index; delegates to the
// main (index, filename) constructor with no filename.
CvCapture_OpenNI2::CvCapture_OpenNI2(int index) :
    CvCapture_OpenNI2(index, nullptr)
{ }
// Open a recorded .oni file; the negative index tells the main constructor
// to use the filename as the device URI instead of enumerating devices.
CvCapture_OpenNI2::CvCapture_OpenNI2(const char * filename) :
    CvCapture_OpenNI2(-1, filename)
{ }
CvCapture_OpenNI2::CvCapture_OpenNI2(int index, const char * filename) :
device(),
isContextOpened(false),
streams(CV_MAX_NUM_STREAMS),
streamFrames(CV_MAX_NUM_STREAMS),
streamImages(CV_MAX_NUM_STREAMS),
maxBufferSize(DEFAULT_MAX_BUFFER_SIZE),
maxTimeDuration(DEFAULT_MAX_TIME_DURATION),
isCircleBuffer(DEFAULT_IS_CIRCLE_BUFFER),
baseline(0),
depthFocalLength_VGA(0),
shadowValue(0),
noSampleValue(0),
outputMaps(outputMapsTypesCount)
{
// Initialize and configure the context.
status = openni::OpenNI::initialize();
OpenNI2Initializer::init();
if (status != openni::STATUS_OK)
const char* deviceURI = openni::ANY_DEVICE;
bool needColor = true;
bool needIR = true;
if (index >= 0)
{
CV_Error(CV_StsError, cv::format("Failed to initialize:", openni::OpenNI::getExtendedError()));
return;
}
int deviceType = DEVICE_DEFAULT;
if (index >= 10)
{
deviceType = index / 10;
index %= 10;
}
// Asus XTION and Occipital Structure Sensor do not have an image generator
needColor = (deviceType != DEVICE_ASUS_XTION);
// find appropriate device URI
openni::Array<openni::DeviceInfo> ldevs;
if (index > 0)
{
openni::OpenNI::enumerateDevices(&ldevs);
deviceURI = ldevs[index].getUri();
// find appropriate device URI
openni::Array<openni::DeviceInfo> ldevs;
if (index > 0)
{
openni::OpenNI::enumerateDevices(&ldevs);
if (index < ldevs.getSize())
deviceURI = ldevs[index].getUri();
else
{
CV_Error(CV_StsError, "OpenCVKinect2: Device index exceeds the number of available OpenNI devices");
}
}
}
status = device.open(deviceURI);
if( status != openni::STATUS_OK )
else
{
CV_Error(CV_StsError, cv::format("OpenCVKinect: Device open failed see: %s\n", openni::OpenNI::getExtendedError()));
openni::OpenNI::shutdown();
return;
deviceURI = filename;
}
status = toggleStream(CV_DEPTH_STREAM, true);
// Asus XTION and Occipital Structure Sensor do not have an image generator
if (deviceType != DEVICE_ASUS_XTION)
status = openni::Status(status | toggleStream(CV_COLOR_STREAM, true));
openni::Status status;
status = device.open(deviceURI);
if (status != openni::STATUS_OK)
{
openni::OpenNI::shutdown();
return;
CV_Error(CV_StsError, std::string("OpenCVKinect2: Failed to open device: ") + openni::OpenNI::getExtendedError());
}
if (!readCamerasParams())
{
CV_Error(CV_StsError, cv::format("CvCapture_OpenNI2::CvCapture_OpenNI2 : Could not read cameras parameters\n"));
return;
}
toggleStream(CV_DEPTH_STREAM, true);
if (needColor)
toggleStream(CV_COLOR_STREAM, true);
if (needIR)
toggleStream(CV_IR_STREAM, true);
setProperty(CV_CAP_PROP_OPENNI_REGISTRATION, 1.0);
outputMaps.resize( outputMapsTypesCount );
// default for Kinect2 camera
setProperty(CV_CAP_PROP_OPENNI2_MIRROR, 0.0);
isContextOpened = true;
}
setProperty(CV_CAP_PROP_OPENNI_REGISTRATION, 1.0);
CvCapture_OpenNI2::~CvCapture_OpenNI2()
{
for (size_t i = 0; i < streams.size(); ++i)
{
streamFrames[i].release();
streams[i].stop();
streams[i].destroy();
}
device.close();
}
openni::Status CvCapture_OpenNI2::toggleStream(int stream, bool toggle)
void CvCapture_OpenNI2::toggleStream(int stream, bool toggle)
{
openni::Status status;
// for logging
static const char* stream_names[CV_MAX_NUM_STREAMS] = {
static const std::string stream_names[CV_MAX_NUM_STREAMS] = {
"depth",
"color",
"IR"
......@@ -280,140 +333,92 @@ openni::Status CvCapture_OpenNI2::toggleStream(int stream, bool toggle)
{
// already opened
if (streams[stream].isValid())
return openni::STATUS_OK;
return;
// open stream
status = streams[stream].create(device, stream_sensor_types[stream]);
if (status == openni::STATUS_OK)
{
// set video mode
status = streams[stream].setVideoMode(defaultStreamOutputMode(stream)); // xn::DepthGenerator supports VGA only! (Jan 2011)
if (status != openni::STATUS_OK)
// try to set up default stream mode (if available)
const openni::Array<openni::VideoMode>& vm = streams[stream].getSensorInfo().getSupportedVideoModes();
openni::VideoMode dm = defaultStreamOutputMode(stream);
for (int i = 0; i < vm.getSize(); i++)
{
CV_Error(CV_StsError, cv::format("CvCapture_OpenNI2::CvCapture_OpenNI2 : Couldn't set %s stream output mode: %s\n",
stream_names[stream],
openni::OpenNI::getExtendedError()));
streams[stream].destroy();
return status;
if (vm[i].getPixelFormat() == dm.getPixelFormat() &&
vm[i].getResolutionX() == dm.getResolutionX() &&
vm[i].getResolutionY() == dm.getResolutionY() &&
vm[i].getFps() == dm.getFps())
{
status = streams[stream].setVideoMode(defaultStreamOutputMode(stream));
if (status != openni::STATUS_OK)
{
streams[stream].destroy();
CV_Error(CV_StsError, std::string("OpenCVKinect2 : Couldn't set ") +
stream_names[stream] + std::string(" stream output mode: ") +
std::string(openni::OpenNI::getExtendedError()));
}
}
}
// start stream
status = streams[stream].start();
if (status != openni::STATUS_OK)
{
CV_Error(CV_StsError, cv::format("CvCapture_OpenNI2::CvCapture_OpenNI2 : Couldn't start %s stream: %s\n",
stream_names[stream],
openni::OpenNI::getExtendedError()));
streams[stream].destroy();
return status;
CV_Error(CV_StsError, std::string("CvCapture_OpenNI2::CvCapture_OpenNI2 : Couldn't start ") +
stream_names[stream] + std::string(" stream: ") +
std::string(openni::OpenNI::getExtendedError()));
}
}
else
{
CV_Error(CV_StsError, cv::format("CvCapture_OpenNI2::CvCapture_OpenNI2 : Couldn't find %s stream:: %s\n",
stream_names[stream],
openni::OpenNI::getExtendedError()));
return status;
CV_Error(CV_StsError, std::string("CvCapture_OpenNI2::CvCapture_OpenNI2 : Couldn't find ") +
stream_names[stream] + " stream: " +
std::string(openni::OpenNI::getExtendedError()));
}
}
else if (streams[stream].isValid()) // want to close stream
{
streams[stream].stop();
streams[stream].destroy();
}
return openni::STATUS_OK;
}
CvCapture_OpenNI2::CvCapture_OpenNI2(const char * filename)
{
openni::Status status;
isContextOpened = false;
maxBufferSize = DEFAULT_MAX_BUFFER_SIZE;
isCircleBuffer = DEFAULT_IS_CIRCLE_BUFFER;
maxTimeDuration = DEFAULT_MAX_TIME_DURATION;
// Initialize and configure the context.
status = openni::OpenNI::initialize();
if (status != openni::STATUS_OK)
{
CV_Error(CV_StsError, cv::format("Failed to initialize:", openni::OpenNI::getExtendedError()));
return;
}
//FIX for libfreenect2
//which stops the whole device when stopping only one stream
// Open file
status = device.open(filename);
if( status != openni::STATUS_OK )
{
CV_Error(CV_StsError, cv::format("CvCapture_OpenNI2::CvCapture_OpenNI2 : Failed to open input file (%s): %s\n", filename, openni::OpenNI::getExtendedError()));
return;
//streams[stream].stop();
//streams[stream].destroy();
}
status = openni::Status(toggleStream(CV_DEPTH_STREAM, true) | toggleStream(CV_COLOR_STREAM, true));
if (status != openni::STATUS_OK)
{
openni::OpenNI::shutdown();
return;
}
if( !readCamerasParams() )
{
CV_Error(CV_StsError, cv::format("CvCapture_OpenNI2::CvCapture_OpenNI2 : Could not read cameras parameters\n"));
return;
}
outputMaps.resize( outputMapsTypesCount );
isContextOpened = true;
}
CvCapture_OpenNI2::~CvCapture_OpenNI2()
{
for (int i = 0; i < CV_MAX_NUM_STREAMS; ++i)
{
streamFrames[i].release();
streams[i].stop();
streams[i].destroy();
}
device.close();
openni::OpenNI::shutdown();
}
bool CvCapture_OpenNI2::readCamerasParams()
void CvCapture_OpenNI2::readCamerasParams()
{
double pixelSize = 0;
if (streams[CV_DEPTH_STREAM].getProperty<double>(XN_STREAM_PROPERTY_ZERO_PLANE_PIXEL_SIZE, &pixelSize) != openni::STATUS_OK)
{
CV_Error(CV_StsError, cv::format("CvCapture_OpenNI2::readCamerasParams : Could not read pixel size!\n"));
return false;
CV_Error(CV_StsError, "CvCapture_OpenNI2::readCamerasParams : Could not read pixel size!" +
std::string(openni::OpenNI::getExtendedError()));
}
// pixel size @ VGA = pixel size @ SXGA x 2
pixelSize *= 2.0; // in mm
// focal length of IR camera in pixels for VGA resolution
int zeroPlanDistance; // in mm
if (streams[CV_DEPTH_STREAM].getProperty(XN_STREAM_PROPERTY_ZERO_PLANE_DISTANCE, &zeroPlanDistance) != openni::STATUS_OK)
unsigned long long zeroPlaneDistance; // in mm
if (streams[CV_DEPTH_STREAM].getProperty(XN_STREAM_PROPERTY_ZERO_PLANE_DISTANCE, &zeroPlaneDistance) != openni::STATUS_OK)
{
CV_Error(CV_StsError, cv::format("CvCapture_OpenNI2::readCamerasParams : Could not read virtual plane distance!\n"));
return false;
CV_Error(CV_StsError, "CvCapture_OpenNI2::readCamerasParams : Could not read virtual plane distance!" +
std::string(openni::OpenNI::getExtendedError()));
}
if (streams[CV_DEPTH_STREAM].getProperty<double>(XN_STREAM_PROPERTY_EMITTER_DCMOS_DISTANCE, &baseline) != openni::STATUS_OK)
{
CV_Error(CV_StsError, cv::format("CvCapture_OpenNI2::readCamerasParams : Could not read base line!\n"));
return false;
CV_Error(CV_StsError, "CvCapture_OpenNI2::readCamerasParams : Could not read base line!" +
std::string(openni::OpenNI::getExtendedError()));
}
// baseline from cm -> mm
baseline *= 10;
// focal length from mm -> pixels (valid for 640x480)
depthFocalLength_VGA = (int)((double)zeroPlanDistance / (double)pixelSize);
return true;
depthFocalLength_VGA = (int)((double)zeroPlaneDistance / (double)pixelSize);
}
double CvCapture_OpenNI2::getProperty( int propIdx ) const
......@@ -500,7 +505,7 @@ double CvCapture_OpenNI2::getCommonProperty( int propIdx ) const
break;
}
default :
CV_Error( CV_StsBadArg, cv::format("Such parameter (propIdx=%d) isn't supported for getting.\n", propIdx) );
CV_Error( CV_StsBadArg, cv::format("Such parameter (propIdx=%d) isn't supported for getting.", propIdx) );
}
return propValue;
......@@ -525,14 +530,20 @@ bool CvCapture_OpenNI2::setCommonProperty( int propIdx, double propValue )
// There is a set of properties that correspond to depth generator by default
// (is they are pass without particular generator flag).
case CV_CAP_PROP_OPENNI_REGISTRATION:
isSet = setDepthGeneratorProperty( propIdx, propValue );
isSet = setDepthGeneratorProperty(propIdx, propValue);
break;
case CV_CAP_PROP_OPENNI2_SYNC:
isSet = device.setDepthColorSyncEnabled(propValue > 0.0) == openni::STATUS_OK;
break;
case CV_CAP_PROP_FRAME_WIDTH:
case CV_CAP_PROP_FRAME_HEIGHT:
case CV_CAP_PROP_AUTOFOCUS:
isSet = false;
break;
default:
CV_Error( CV_StsBadArg, cv::format("Such parameter (propIdx=%d) isn't supported for setting.\n", propIdx) );
CV_Error(CV_StsBadArg, cv::format("Such parameter (propIdx=%d) isn't supported for setting.", propIdx));
}
return isSet;
......@@ -565,9 +576,13 @@ double CvCapture_OpenNI2::getDepthGeneratorProperty( int propIdx ) const
propValue = streams[CV_DEPTH_STREAM].getMaxPixelValue();
break;
case CV_CAP_PROP_OPENNI_BASELINE :
if(baseline <= 0)
const_cast<CvCapture_OpenNI2*>(this)->readCamerasParams();
propValue = baseline;
break;
case CV_CAP_PROP_OPENNI_FOCAL_LENGTH :
if(depthFocalLength_VGA <= 0)
const_cast<CvCapture_OpenNI2*>(this)->readCamerasParams();
propValue = (double)depthFocalLength_VGA;
break;
case CV_CAP_PROP_OPENNI_REGISTRATION :
......@@ -580,7 +595,7 @@ double CvCapture_OpenNI2::getDepthGeneratorProperty( int propIdx ) const
propValue = streamFrames[CV_DEPTH_STREAM].getFrameIndex();
break;
default :
CV_Error( CV_StsBadArg, cv::format("Depth generator does not support such parameter (propIdx=%d) for getting.\n", propIdx) );
CV_Error( CV_StsBadArg, cv::format("Depth generator does not support such parameter (propIdx=%d) for getting.", propIdx) );
}
return propValue;
......@@ -594,7 +609,10 @@ bool CvCapture_OpenNI2::setDepthGeneratorProperty( int propIdx, double propValue
{
case CV_CAP_PROP_OPENNI_GENERATOR_PRESENT:
if (isContextOpened)
isSet = toggleStream(CV_DEPTH_STREAM, propValue > 0.0) == openni::STATUS_OK;
{
toggleStream(CV_DEPTH_STREAM, propValue > 0.0);
isSet = true;
}
break;
case CV_CAP_PROP_OPENNI_REGISTRATION:
{
......@@ -612,12 +630,13 @@ bool CvCapture_OpenNI2::setDepthGeneratorProperty( int propIdx, double propValue
{
openni::Status status = device.setImageRegistrationMode(mode);
if( status != openni::STATUS_OK )
CV_Error(CV_StsError, cv::format("CvCapture_OpenNI2::setDepthGeneratorProperty : %s\n", openni::OpenNI::getExtendedError()));
CV_Error(CV_StsError, std::string("CvCapture_OpenNI2::setDepthGeneratorProperty: ") +
std::string(openni::OpenNI::getExtendedError()));
else
isSet = true;
}
else
CV_Error(CV_StsError, cv::format("CvCapture_OpenNI2::setDepthGeneratorProperty : Unsupported viewpoint.\n"));
CV_Error(CV_StsError, "CvCapture_OpenNI2::setDepthGeneratorProperty: Unsupported viewpoint.");
}
else
isSet = true;
......@@ -627,14 +646,15 @@ bool CvCapture_OpenNI2::setDepthGeneratorProperty( int propIdx, double propValue
{
openni::Status status = device.setImageRegistrationMode(openni::IMAGE_REGISTRATION_OFF);
if( status != openni::STATUS_OK )
CV_Error(CV_StsError, cv::format("CvCapture_OpenNI2::setDepthGeneratorProperty : %s\n", openni::OpenNI::getExtendedError()));
CV_Error(CV_StsError, std::string("CvCapture_OpenNI2::setDepthGeneratorProperty: ") +
std::string(openni::OpenNI::getExtendedError()));
else
isSet = true;
}
}
break;
default:
CV_Error( CV_StsBadArg, cv::format("Depth generator does not support such parameter (propIdx=%d) for setting.\n", propIdx) );
CV_Error( CV_StsBadArg, cv::format("Depth generator does not support such parameter (propIdx=%d) for setting.", propIdx) );
}
return isSet;
......@@ -668,7 +688,7 @@ double CvCapture_OpenNI2::getImageGeneratorProperty( int propIdx ) const
propValue = (double)streamFrames[CV_COLOR_STREAM].getFrameIndex();
break;
default :
CV_Error( CV_StsBadArg, cv::format("Image generator does not support such parameter (propIdx=%d) for getting.\n", propIdx) );
CV_Error( CV_StsBadArg, cv::format("Image generator does not support such parameter (propIdx=%d) for getting.", propIdx) );
}
return propValue;
......@@ -682,7 +702,10 @@ bool CvCapture_OpenNI2::setImageGeneratorProperty(int propIdx, double propValue)
{
case CV_CAP_PROP_OPENNI_GENERATOR_PRESENT:
if (isContextOpened)
isSet = toggleStream(CV_COLOR_STREAM, propValue > 0.0) == openni::STATUS_OK;
{
toggleStream(CV_COLOR_STREAM, propValue > 0.0);
isSet = true;
}
break;
case CV_CAP_PROP_OPENNI_OUTPUT_MODE :
{
......@@ -713,18 +736,19 @@ bool CvCapture_OpenNI2::setImageGeneratorProperty(int propIdx, double propValue)
mode.setFps(60);
break;
default :
CV_Error( CV_StsBadArg, "Unsupported image generator output mode.\n");
CV_Error( CV_StsBadArg, "Unsupported image generator output mode.");
}
openni::Status status = streams[CV_COLOR_STREAM].setVideoMode( mode );
if( status != openni::STATUS_OK )
CV_Error(CV_StsError, cv::format("CvCapture_OpenNI2::setImageGeneratorProperty : %s\n", openni::OpenNI::getExtendedError()));
CV_Error(CV_StsError, std::string("CvCapture_OpenNI2::setImageGeneratorProperty: ") +
std::string(openni::OpenNI::getExtendedError()));
else
isSet = true;
break;
}
default:
CV_Error( CV_StsBadArg, cv::format("Image generator does not support such parameter (propIdx=%d) for setting.\n", propIdx) );
CV_Error( CV_StsBadArg, cv::format("Image generator does not support such parameter (propIdx=%d) for setting.", propIdx) );
}
return isSet;
......@@ -758,7 +782,7 @@ double CvCapture_OpenNI2::getIrGeneratorProperty(int propIdx) const
propValue = (double)streamFrames[CV_IR_STREAM].getFrameIndex();
break;
default:
CV_Error(CV_StsBadArg, cv::format("Image generator does not support such parameter (propIdx=%d) for getting.\n", propIdx));
CV_Error(CV_StsBadArg, cv::format("Image generator does not support such parameter (propIdx=%d) for getting.", propIdx));
}
return propValue;
......@@ -772,7 +796,10 @@ bool CvCapture_OpenNI2::setIrGeneratorProperty(int propIdx, double propValue)
{
case CV_CAP_PROP_OPENNI_GENERATOR_PRESENT:
if (isContextOpened)
isSet = toggleStream(CV_IR_STREAM, propValue > 0.0) == openni::STATUS_OK;
{
toggleStream(CV_IR_STREAM, propValue > 0.0);
isSet = true;
}
break;
case CV_CAP_PROP_OPENNI_OUTPUT_MODE:
{
......@@ -803,18 +830,19 @@ bool CvCapture_OpenNI2::setIrGeneratorProperty(int propIdx, double propValue)
mode.setFps(60);
break;
default:
CV_Error(CV_StsBadArg, "Unsupported image generator output mode.\n");
CV_Error(CV_StsBadArg, "Unsupported image generator output mode.");
}
openni::Status status = streams[CV_IR_STREAM].setVideoMode(mode);
if (status != openni::STATUS_OK)
CV_Error(CV_StsError, cv::format("CvCapture_OpenNI2::setImageGeneratorProperty : %s\n", openni::OpenNI::getExtendedError()));
CV_Error(CV_StsError, std::string("CvCapture_OpenNI2::setImageGeneratorProperty: ") +
std::string(openni::OpenNI::getExtendedError()));
else
isSet = true;
break;
}
default:
CV_Error(CV_StsBadArg, cv::format("Image generator does not support such parameter (propIdx=%d) for setting.\n", propIdx));
CV_Error(CV_StsBadArg, cv::format("Image generator does not support such parameter (propIdx=%d) for setting.", propIdx));
}
return isSet;
......@@ -931,10 +959,12 @@ IplImage* CvCapture_OpenNI2::retrieveDisparityMap()
if (!streamFrames[CV_DEPTH_STREAM].isValid())
return 0;
readCamerasParams();
cv::Mat disp32;
computeDisparity_32F(streamFrames[CV_DEPTH_STREAM], disp32, baseline, depthFocalLength_VGA, noSampleValue, shadowValue);
disp32.convertTo( outputMaps[CV_CAP_OPENNI_DISPARITY_MAP].mat, CV_8UC1 );
disp32.convertTo(outputMaps[CV_CAP_OPENNI_DISPARITY_MAP].mat, CV_8UC1);
return outputMaps[CV_CAP_OPENNI_DISPARITY_MAP].getIplImagePtr();
}
......@@ -944,6 +974,8 @@ IplImage* CvCapture_OpenNI2::retrieveDisparityMap_32F()
if (!streamFrames[CV_DEPTH_STREAM].isValid())
return 0;
readCamerasParams();
computeDisparity_32F(streamFrames[CV_DEPTH_STREAM], outputMaps[CV_CAP_OPENNI_DISPARITY_MAP_32F].mat, baseline, depthFocalLength_VGA, noSampleValue, shadowValue);
return outputMaps[CV_CAP_OPENNI_DISPARITY_MAP_32F].getIplImagePtr();
......@@ -966,7 +998,7 @@ inline void getBGRImageFromMetaData( const openni::VideoFrameRef& imageMetaData,
{
cv::Mat bufferImage;
if( imageMetaData.getVideoMode().getPixelFormat() != openni::PIXEL_FORMAT_RGB888 )
CV_Error( CV_StsUnsupportedFormat, "Unsupported format of grabbed image\n" );
CV_Error( CV_StsUnsupportedFormat, "Unsupported format of grabbed image." );
bgrImage.create(imageMetaData.getHeight(), imageMetaData.getWidth(), CV_8UC3);
bufferImage.create(imageMetaData.getHeight(), imageMetaData.getWidth(), CV_8UC3);
......@@ -989,7 +1021,7 @@ inline void getGrayImageFromMetaData(const openni::VideoFrameRef& imageMetaData,
}
else
{
CV_Error(CV_StsUnsupportedFormat, "Unsupported format of grabbed image\n");
CV_Error(CV_StsUnsupportedFormat, "Unsupported format of grabbed image.");
}
}
......
......@@ -20,7 +20,7 @@ int main(int, char**)
//--- INITIALIZE VIDEOCAPTURE
VideoCapture cap;
// open the default camera using default API
cap.open(0);
// cap.open(0);
// OR advance usage: select any API backend
int deviceID = 0; // 0 = open default camera
int apiID = cv::CAP_ANY; // 0 = autodetect default API
......
......@@ -9,7 +9,7 @@ truck
boat
traffic light
fire hydrant
street sign
stop sign
parking meter
bench
......@@ -23,11 +23,11 @@ elephant
bear
zebra
giraffe
hat
backpack
umbrella
shoe
eye glasses
handbag
tie
suitcase
......@@ -42,7 +42,7 @@ skateboard
surfboard
tennis racket
bottle
plate
wine glass
cup
fork
......@@ -63,12 +63,12 @@ chair
couch
potted plant
bed
mirror
dining table
window
desk
toilet
door
tv
laptop
mouse
......@@ -80,7 +80,7 @@ oven
toaster
sink
refrigerator
blender
book
clock
vase
......
person
bicycle
car
motorcycle
airplane
bus
train
truck
boat
traffic light
fire hydrant
stop sign
parking meter
bench
bird
cat
dog
horse
sheep
cow
elephant
bear
zebra
giraffe
backpack
umbrella
handbag
tie
suitcase
frisbee
skis
snowboard
sports ball
kite
baseball bat
baseball glove
skateboard
surfboard
tennis racket
bottle
wine glass
cup
fork
knife
spoon
bowl
banana
apple
sandwich
orange
broccoli
carrot
hot dog
pizza
donut
cake
chair
couch
potted plant
bed
dining table
toilet
tv
laptop
mouse
remote
keyboard
cell phone
microwave
oven
toaster
sink
refrigerator
book
clock
vase
scissors
teddy bear
hair drier
toothbrush
......@@ -14,6 +14,8 @@
#include <iomanip>
#include <stdexcept>
#define CL_USE_DEPRECATED_OPENCL_1_1_APIS
#define CL_USE_DEPRECATED_OPENCL_1_2_APIS
#define CL_USE_DEPRECATED_OPENCL_2_0_APIS // eliminate build warning
#ifdef __APPLE__
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment