Merge pull request #707 from ludv1x:dnn

9a342b51 · Vadim Pisarevsky · e7b5c81b · 942e9205 · 9a342b51 · 9a342b51
Commit 9a342b51 authored Aug 04, 2016 by Vadim Pisarevsky
66 changed files
--- a/modules/dnn/CMakeLists.txt
+++ b/modules/dnn/CMakeLists.txt
@@ -17,15 +17,38 @@ ocv_warnings_disable(CMAKE_CXX_FLAGS /wd4701)
 # Resolve libprotobuf dependency
 # ----------------------------------------------------------------------------
 include(cmake/OpenCVFindLibProtobuf.cmake)
-ocv_glob_module_sources(${PROTOBUF_SRCS} ${PROTOBUF_HDRS})
 ocv_source_group("Src\\protobuf" FILES ${PROTOBUF_SRCS} ${PROTOBUF_HDRS})
 ocv_module_include_directories(include ${PROTOBUF_INCLUDE_DIR})
+# ----------------------------------------------------------------------------
+# Try to find BLAS libraries
+# ----------------------------------------------------------------------------
+OCV_OPTION(${the_module}_WITH_BLAS "Use external BLAS library to speedup processing" OFF)
+include(cmake/OpenCVFindCBLAS.cmake)
+ocv_glob_module_sources(${PROTOBUF_SRCS} ${PROTOBUF_HDRS} ${CBLAS_H_PROXY_PATH})
 ocv_create_module(${PROTOBUF_LIBRARIES})
 ocv_add_samples()
 ocv_add_accuracy_tests()
 ocv_add_perf_tests()
+# ----------------------------------------------------------------------------
+# Link BLAS
+# ----------------------------------------------------------------------------
+if(${the_module}_WITH_BLAS AND HAVE_BLAS)
+    add_definitions(-DHAVE_CBLAS=1)
+    ocv_module_include_directories(${${the_module}_BLAS_INCLUDE_DIR})
+    ocv_add_dependencies(${the_module} ${${the_module}_BLAS_LIBRARIES})
+    target_link_libraries(${the_module} ${${the_module}_BLAS_LIBRARIES})
+    if(${the_module}_BLAS_BINARIES)
+        ocv_install_target(${the_module} EXPORT ${the_module}_BLAS_BINARIES
+                           RUNTIME DESTINATION ${OPENCV_BIN_INSTALL_PATH} COMPONENT libs)
+    endif()
+else()
+    add_definitions(-DHAVE_CBLAS=0)
+endif()
 # ----------------------------------------------------------------------------
 # Download pre-trained models for complex testing on GoogLeNet and AlexNet
 # ----------------------------------------------------------------------------

--- a/modules/dnn/cmake/FindAtlas.cmake
+++ b/modules/dnn/cmake/FindAtlas.cmake
+#COPYRIGHT
+#
+#All contributions by the University of California:
+#Copyright (c) 2014, 2015, The Regents of the University of California (Regents)
+#All rights reserved.
+#
+#All other contributions:
+#Copyright (c) 2014, 2015, the respective contributors
+#All rights reserved.
+#
+#Caffe uses a shared copyright model: each contributor holds copyright over
+#their contributions to Caffe. The project versioning records all such
+#contribution and copyright details. If a contributor wants to further mark
+#their specific copyright on a particular contribution, they should indicate
+#their copyright solely in the commit message of the change when it is
+#committed.
+#
+#LICENSE
+#
+#Redistribution and use in source and binary forms, with or without
+#modification, are permitted provided that the following conditions are met:
+#
+#1. Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#2. Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+#THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+#ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+#WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+#DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+#ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+#(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+#LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+#ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+#(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+#SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+#CONTRIBUTION AGREEMENT
+#
+#By contributing to the BVLC/caffe repository through pull-request, comment,
+#or otherwise, the contributor releases their content to the
+#license and copyright terms herein.
+# Find the Atlas (and Lapack) libraries
+#
+# The following variables are optionally searched for defaults
+#  Atlas_ROOT_DIR:            Base directory where all Atlas components are found
+#
+# The following are set after configuration is done:
+#  Atlas_FOUND
+#  Atlas_INCLUDE_DIR
+#  Atlas_LIBRARIES
+#  Atlas_LIBRARY_DIRS
+set(Atlas_INCLUDE_SEARCH_PATHS
+  /usr/include/atlas
+  /usr/include/atlas-base
+  $ENV{Atlas_ROOT_DIR}
+  $ENV{Atlas_ROOT_DIR}/include
+)
+set(Atlas_LIB_SEARCH_PATHS
+  /usr/lib/atlas
+  /usr/lib/atlas-base
+  $ENV{Atlas_ROOT_DIR}
+  $ENV{Atlas_ROOT_DIR}/lib
+)
+find_path(Atlas_CBLAS_INCLUDE_DIR   NAMES cblas.h   PATHS ${Atlas_INCLUDE_SEARCH_PATHS})
+find_path(Atlas_CLAPACK_INCLUDE_DIR NAMES clapack.h PATHS ${Atlas_INCLUDE_SEARCH_PATHS})
+find_library(Atlas_CBLAS_LIBRARY NAMES  ptcblas_r ptcblas cblas_r cblas PATHS ${Atlas_LIB_SEARCH_PATHS})
+find_library(Atlas_BLAS_LIBRARY NAMES   atlas_r   atlas                 PATHS ${Atlas_LIB_SEARCH_PATHS})
+find_library(Atlas_LAPACK_LIBRARY NAMES alapack_r alapack lapack_atlas  PATHS ${Atlas_LIB_SEARCH_PATHS})
+set(LOOKED_FOR
+  Atlas_CBLAS_INCLUDE_DIR
+  Atlas_CLAPACK_INCLUDE_DIR
+  Atlas_CBLAS_LIBRARY
+  Atlas_BLAS_LIBRARY
+  Atlas_LAPACK_LIBRARY
+)
+include(FindPackageHandleStandardArgs)
+find_package_handle_standard_args(Atlas DEFAULT_MSG ${LOOKED_FOR})
+if(ATLAS_FOUND)
+  set(Atlas_INCLUDE_DIR ${Atlas_CBLAS_INCLUDE_DIR} ${Atlas_CLAPACK_INCLUDE_DIR})
+  set(Atlas_LIBRARIES ${Atlas_LAPACK_LIBRARY} ${Atlas_CBLAS_LIBRARY} ${Atlas_BLAS_LIBRARY})
+  mark_as_advanced(${LOOKED_FOR})
+  message(STATUS "Found Atlas (include: ${Atlas_CBLAS_INCLUDE_DIR}, library: ${Atlas_BLAS_LIBRARY})")
+endif(ATLAS_FOUND)
\ No newline at end of file
--- a/modules/dnn/cmake/FindOpenBLAS.cmake
+++ b/modules/dnn/cmake/FindOpenBLAS.cmake
+#COPYRIGHT
+#
+#All contributions by the University of California:
+#Copyright (c) 2014, 2015, The Regents of the University of California (Regents)
+#All rights reserved.
+#
+#All other contributions:
+#Copyright (c) 2014, 2015, the respective contributors
+#All rights reserved.
+#
+#Caffe uses a shared copyright model: each contributor holds copyright over
+#their contributions to Caffe. The project versioning records all such
+#contribution and copyright details. If a contributor wants to further mark
+#their specific copyright on a particular contribution, they should indicate
+#their copyright solely in the commit message of the change when it is
+#committed.
+#
+#LICENSE
+#
+#Redistribution and use in source and binary forms, with or without
+#modification, are permitted provided that the following conditions are met:
+#
+#1. Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#2. Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+#THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+#ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+#WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+#DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+#ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+#(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+#LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+#ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+#(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+#SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+#CONTRIBUTION AGREEMENT
+#
+#By contributing to the BVLC/caffe repository through pull-request, comment,
+#or otherwise, the contributor releases their content to the
+#license and copyright terms herein.
+SET(Open_BLAS_INCLUDE_SEARCH_PATHS
+  /usr/include
+  /usr/include/openblas
+  /usr/include/openblas-base
+  /usr/local/include
+  /usr/local/include/openblas
+  /usr/local/include/openblas-base
+  /opt/OpenBLAS/include
+  $ENV{OpenBLAS_HOME}
+  $ENV{OpenBLAS_HOME}/include
+)
+# Directories probed for the OpenBLAS library. The OpenBLAS and OpenBLAS_HOME
+# environment variables, when set, contribute their root and lib/ folders.
+SET(Open_BLAS_LIB_SEARCH_PATHS
+        /lib/
+        /lib/openblas-base
+        /lib64/
+        /usr/lib
+        /usr/lib/openblas-base
+        /usr/lib64
+        /usr/local/lib
+        /usr/local/lib64
+        /opt/OpenBLAS/lib
+        $ENV{OpenBLAS}
+        $ENV{OpenBLAS}/lib
+        $ENV{OpenBLAS_HOME}
+        $ENV{OpenBLAS_HOME}/lib
+ )
+FIND_PATH(OpenBLAS_INCLUDE_DIR NAMES cblas.h PATHS ${Open_BLAS_INCLUDE_SEARCH_PATHS})
+FIND_LIBRARY(OpenBLAS_LIB NAMES openblas PATHS ${Open_BLAS_LIB_SEARCH_PATHS})
+SET(OpenBLAS_FOUND ON)
+#    Check include files
+IF(NOT OpenBLAS_INCLUDE_DIR)
+    SET(OpenBLAS_FOUND OFF)
+    MESSAGE(STATUS "Could not find OpenBLAS include. Turning OpenBLAS_FOUND off")
+ENDIF()
+#    Check libraries
+IF(NOT OpenBLAS_LIB)
+    SET(OpenBLAS_FOUND OFF)
+    MESSAGE(STATUS "Could not find OpenBLAS lib. Turning OpenBLAS_FOUND off")
+ENDIF()
+IF (OpenBLAS_FOUND)
+  IF (NOT OpenBLAS_FIND_QUIETLY)
+    MESSAGE(STATUS "Found OpenBLAS libraries: ${OpenBLAS_LIB}")
+    MESSAGE(STATUS "Found OpenBLAS include: ${OpenBLAS_INCLUDE_DIR}")
+  ENDIF (NOT OpenBLAS_FIND_QUIETLY)
+ELSE (OpenBLAS_FOUND)
+  IF (OpenBLAS_FIND_REQUIRED)
+    MESSAGE(FATAL_ERROR "Could not find OpenBLAS")
+  ENDIF (OpenBLAS_FIND_REQUIRED)
+ENDIF (OpenBLAS_FOUND)
+MARK_AS_ADVANCED(
+    OpenBLAS_INCLUDE_DIR
+    OpenBLAS_LIB
+    OpenBLAS
+)
\ No newline at end of file
--- a/modules/dnn/cmake/OpenCVFindCBLAS.cmake
+++ b/modules/dnn/cmake/OpenCVFindCBLAS.cmake
+# Looks for file NAME only inside the given directories (system default
+# locations are not searched) and stores the full path of the file in VAR.
+# Directories beyond the first may be passed as extra arguments, which is what
+# happens when the caller expands a multi-directory list.
+# When the file is not found VAR is set to an empty string, so callers can
+# test the result with if(NOT <VAR>).
+macro(_find_file_in_dirs VAR NAME DIRS)
+    find_path(${VAR} ${NAME} PATHS ${DIRS} ${ARGN} NO_DEFAULT_PATH)
+    if(${VAR})
+        set(${VAR} ${${VAR}}/${NAME})
+    else()
+        # "<VAR>-NOTFOUND/<NAME>" would evaluate as true in if(); report failure explicitly
+        set(${VAR} "")
+    endif()
+    # drop the cache entry created by find_path so the lookup is repeated on every configure
+    unset(${VAR} CACHE)
+endmacro()
+if(${the_module}_WITH_BLAS)
+    set(_bp ${the_module}_BLAS) #prefix for blas variables
+    set(BLAS_CBLAS_H "cblas.h")
+    set(HAVE_BLAS "")
+    if(NOT HAVE_BLAS) #check custom BLAS from user input
+        if(${_bp}_INCLUDE_DIR AND ${_bp}_LIBRARIES AND ${_bp}_CBLAS_H)
+            set(HAVE_BLAS "Custom")
+        endif()
+    endif()
+    if(NOT HAVE_BLAS)
+        include(cmake/OpenCVFindMKL.cmake)
+        if(MKL_FOUND)
+            set(BLAS_INCLUDE_DIR    ${MKL_INCLUDE_DIRS})
+            set(BLAS_LIBRARIES      ${MKL_LIBRARIES}   )
+            set(BLAS_CBLAS_H        "mkl_cblas.h"      )
+            set(HAVE_BLAS "MKL")
+        endif()
+    endif()
+    if(NOT HAVE_BLAS)
+        include(cmake/FindOpenBLAS.cmake)
+        if(OpenBLAS_FOUND)
+            set(BLAS_INCLUDE_DIR    ${OpenBLAS_INCLUDE_DIR} )
+            set(BLAS_LIBRARIES      ${OpenBLAS_LIB}         )
+            set(HAVE_BLAS "OpenBLAS")
+        endif()
+    endif()
+    if(NOT HAVE_BLAS AND UNIX)
+        include(cmake/FindAtlas.cmake)
+        if(ATLAS_FOUND)
+            set(BLAS_INCLUDE_DIR    ${Atlas_INCLUDE_DIR})
+            set(BLAS_LIBRARIES      ${Atlas_LIBRARIES}  )
+            set(HAVE_BLAS "Atlas")
+        endif()
+    endif()
+    if(NOT HAVE_BLAS OR NOT (HAVE_BLAS STREQUAL "Custom"))
+        set(${_bp}_INCLUDE_DIR  ${BLAS_INCLUDE_DIR} CACHE PATH      "Path to BLAS include dir" FORCE)
+        set(${_bp}_CBLAS_H      ${BLAS_CBLAS_H}     CACHE STRING    "Alternative name of cblas.h" FORCE)
+        set(${_bp}_LIBRARIES    ${BLAS_LIBRARIES}   CACHE FILEPATH  "Path to BLAS libraries that will be linked with ${the_module} module" FORCE)
+        set(${_bp}_BINARIES     ${BLAS_BINARIES}    CACHE FILEPATH  "Path to BLAS binaries (.so, .dll) that will be installed with ${the_module} module" FORCE)
+    endif()
+    if(HAVE_BLAS) #adding proxy cblas.h header
+        _find_file_in_dirs(CBLAS_H_PATH ${${_bp}_CBLAS_H} ${${_bp}_INCLUDE_DIR})
+        if(NOT CBLAS_H_PATH)
+            message(WARNING "CBLAS header '${${_bp}_CBLAS_H}' not found into '${${_bp}_INCLUDE_DIR}'")
+        endif()
+        set(CBLAS_H_PROXY_PATH ${CMAKE_CURRENT_BINARY_DIR}/opencv_cblas.hpp)
+        set(_include_str "\#include \"${CBLAS_H_PATH}\"")
+        file(WRITE ${CBLAS_H_PROXY_PATH} ${_include_str})
+    endif()
+endif()
\ No newline at end of file
--- a/modules/dnn/cmake/OpenCVFindMKL.cmake
+++ b/modules/dnn/cmake/OpenCVFindMKL.cmake
+#
+# The script to detect Intel(R) Math Kernel Library (MKL)
+# installation/package
+#
+# Parameters:
+# MKL_WITH_TBB
+#
+# On return this will define:
+#
+# HAVE_MKL          - True if Intel MKL found
+# MKL_ROOT_DIR      - root of MKL installation
+# MKL_INCLUDE_DIRS  - MKL include folder
+# MKL_LIBRARIES     - MKL libraries that are used by OpenCV
+#
+macro(mkl_fail)
+    set(HAVE_MKL OFF CACHE BOOL "True if MKL found")
+    set(MKL_ROOT_DIR ${MKL_ROOT_DIR} CACHE PATH "Path to MKL directory")
+    unset(MKL_INCLUDE_DIRS CACHE)
+    unset(MKL_LIBRARIES CACHE)
+endmacro()
+macro(get_mkl_version VERSION_FILE)
+    # read MKL version info from file
+    file(STRINGS ${VERSION_FILE} STR1 REGEX "__INTEL_MKL__")
+    file(STRINGS ${VERSION_FILE} STR2 REGEX "__INTEL_MKL_MINOR__")
+    file(STRINGS ${VERSION_FILE} STR3 REGEX "__INTEL_MKL_UPDATE__")
+    #file(STRINGS ${VERSION_FILE} STR4 REGEX "INTEL_MKL_VERSION")
+    # extract info and assign to variables
+    string(REGEX MATCHALL "[0-9]+" MKL_VERSION_MAJOR ${STR1})
+    string(REGEX MATCHALL "[0-9]+" MKL_VERSION_MINOR ${STR2})
+    string(REGEX MATCHALL "[0-9]+" MKL_VERSION_UPDATE ${STR3})
+    set(MKL_VERSION_STR "${MKL_VERSION_MAJOR}.${MKL_VERSION_MINOR}.${MKL_VERSION_UPDATE}" CACHE STRING "MKL version" FORCE)
+endmacro()
+if(NOT DEFINED MKL_USE_MULTITHREAD)
+    OCV_OPTION(MKL_WITH_TBB "Use MKL with TBB multithreading" OFF)#ON IF WITH_TBB)
+    OCV_OPTION(MKL_WITH_OPENMP "Use MKL with OpenMP multithreading" OFF)#ON IF WITH_OPENMP)
+endif()
+# Re-detect MKL_ROOT_DIR unless it already points to a tree containing include/mkl.h
+if(NOT MKL_ROOT_DIR OR NOT EXISTS "${MKL_ROOT_DIR}/include/mkl.h")
+    set(MKLROOT_PATHS ${MKL_ROOT_DIR})
+    # Test the value of the MKLROOT environment variable itself:
+    # if(DEFINED $ENV{MKLROOT}) would check a CMake variable named after that
+    # value, so the environment hint would never be used.
+    if(NOT "$ENV{MKLROOT}" STREQUAL "")
+        list(APPEND MKLROOT_PATHS $ENV{MKLROOT})
+    endif()
+    if(WIN32)
+        # the variable name contains parentheses, so it is built indirectly
+        set(ProgramFilesx86 "ProgramFiles(x86)")
+        list(APPEND MKLROOT_PATHS $ENV{${ProgramFilesx86}}/IntelSWTools/compilers_and_libraries/windows/mkl)
+    endif()
+    if(UNIX)
+        list(APPEND MKLROOT_PATHS "/opt/intel/mkl")
+    endif()
+    find_path(MKL_ROOT_DIR include/mkl.h PATHS ${MKLROOT_PATHS})
+endif()
+if(NOT MKL_ROOT_DIR)
+    mkl_fail()
+    return()
+endif()
+set(MKL_INCLUDE_DIRS ${MKL_ROOT_DIR}/include)
+set(MKL_INCLUDE_HEADERS ${MKL_INCLUDE_DIRS}/mkl.h ${MKL_INCLUDE_DIRS}/mkl_version.h)
+#determine arch
+if(CMAKE_CXX_SIZEOF_DATA_PTR EQUAL 8)
+    set(MKL_X64 1)
+    set(MKL_ARCH "intel64")
+    include(CheckTypeSize)
+    CHECK_TYPE_SIZE(int _sizeof_int)
+    if (_sizeof_int EQUAL 4)
+        set(MKL_LP64 "lp64")
+    else()
+        set(MKL_LP64 "ilp64")
+    endif()
+else()
+    set(MKL_ARCH "ia32")
+endif()
+if(MSVC)
+    set(MKL_EXT ".lib")
+    set(MKL_PRE "")
+else()
+    set(MKL_EXT ".a")
+    set(MKL_PRE "lib")
+endif()
+set(MKL_LIB_DIR ${MKL_ROOT_DIR}/lib/${MKL_ARCH})
+set(MKL_LIBRARIES ${MKL_LIB_DIR}/${MKL_PRE}mkl_core${MKL_EXT} ${MKL_LIB_DIR}/${MKL_PRE}mkl_intel_${MKL_LP64}${MKL_EXT})
+if(MKL_WITH_TBB)
+    list(APPEND MKL_LIBRARIES ${MKL_LIB_DIR}/${MKL_PRE}mkl_tbb_thread${MKL_EXT})
+    list(APPEND MKL_LIBRARIES ${MKL_ROOT_DIR}/../tbb/lib/${MKL_ARCH}/tbb${MKL_EXT})
+elseif(MKL_WITH_OPENMP)
+    message(FATAL_ERROR "Multithreaded MKL is not supported yet")
+else()
+    list(APPEND MKL_LIBRARIES ${MKL_LIB_DIR}/${MKL_PRE}mkl_sequential${MKL_EXT})
+endif()
+include(FindPackageHandleStandardArgs)
+# In the classic signature the argument right after the package name is the
+# failure message; DEFAULT_MSG is needed so that MKL_INCLUDE_HEADERS is
+# treated as a required variable instead of being used as that message.
+find_package_handle_standard_args(MKL DEFAULT_MSG MKL_INCLUDE_HEADERS MKL_LIBRARIES)
+if(MKL_FOUND)
+    get_mkl_version(${MKL_INCLUDE_DIRS}/mkl_version.h)
+    message(STATUS "Found MKL ${MKL_VERSION_STR} at: ${MKL_ROOT_DIR}")
+    # publish the detection results in the cache
+    set(HAVE_MKL ON CACHE BOOL "True if MKL found")
+    set(MKL_ROOT_DIR ${MKL_ROOT_DIR} CACHE PATH "Path to MKL directory")
+    set(MKL_INCLUDE_DIRS ${MKL_INCLUDE_DIRS} CACHE PATH "Path to MKL include directory")
+    if(NOT UNIX)
+        set(MKL_LIBRARIES ${MKL_LIBRARIES} CACHE FILEPATH "MKL libraries")
+    else()
+        # repeating the library list is ugly but helps to avoid the cyclic library problem
+        set(MKL_LIBRARIES ${MKL_LIBRARIES} ${MKL_LIBRARIES} ${MKL_LIBRARIES} "-lpthread" "-lm" "-ldl")
+        set(MKL_LIBRARIES ${MKL_LIBRARIES} CACHE STRING "MKL libraries")
+    endif()
+endif()
\ No newline at end of file
--- a/modules/dnn/include/opencv2/dnn/all_layers.hpp
+++ b/modules/dnn/include/opencv2/dnn/all_layers.hpp
--- a/modules/dnn/include/opencv2/dnn/blob.hpp
+++ b/modules/dnn/include/opencv2/dnn/blob.hpp
--- a/modules/dnn/include/opencv2/dnn/blob.inl.hpp
+++ b/modules/dnn/include/opencv2/dnn/blob.inl.hpp
@@ -48,20 +48,50 @@ namespace cv
 namespace dnn
 {
-inline BlobShape::BlobShape(int ndims, int fill) : sz( (size_t)std::max(ndims, 0) )
+inline BlobShape::BlobShape()
+{
+    sz.allocate(4);
+    for (size_t i = 0; i < sz.size(); i++)
+        sz[i] = 1;
+}
+inline BlobShape BlobShape::all(int ndims, int fill)
 {
    CV_Assert(ndims >= 0);
+    BlobShape res;
+    res.sz.allocate(ndims);
    for (int i = 0; i < ndims; i++)
-        sz[i] = fill;
+        res.sz[i] = fill;
+    return res;
 }
 inline BlobShape::BlobShape(int ndims, const int *sizes) : sz( (size_t)std::max(ndims, 0) )
 {
    CV_Assert(ndims >= 0);
+    if (!sizes)
+        return;
    for (int i = 0; i < ndims; i++)
        sz[i] = sizes[i];
 }
+inline BlobShape::BlobShape(int s0) : sz(1)
+{
+    sz[0] = s0;
+}
+inline BlobShape::BlobShape(int s0, int s1) : sz(2)
+{
+    sz[0] = s0;
+    sz[1] = s1;
+}
+inline BlobShape::BlobShape(int s0, int s1, int s2) : sz(3)
+{
+    sz[0] = s0;
+    sz[1] = s1;
+    sz[2] = s2;
+}
 inline BlobShape::BlobShape(int num, int cn, int rows, int cols) : sz(4)
 {
    sz[0] = num;
@@ -120,7 +150,13 @@ inline int &BlobShape::operator[] (int axis)
    return sz[(axis < 0) ? axis + dims() : axis];
 }
-inline ptrdiff_t BlobShape::total()
+inline int BlobShape::canonicalAxis(int axis) const
+{
+    CV_Assert(-dims() <= axis && axis < dims());
+    return (axis < 0) ? axis + dims() : axis;
+}
+inline ptrdiff_t BlobShape::total() const
 {
    if (dims() == 0)
        return 0;
@@ -131,11 +167,52 @@ inline ptrdiff_t BlobShape::total()
    return res;
 }
+inline ptrdiff_t BlobShape::total(int startAxis, int endAxis) const
+{
+    if (isEmpty())
+        return 0;
+    if (endAxis == INT_MAX)
+        endAxis = dims();
+    else if (endAxis < 0)
+        endAxis += dims();
+    startAxis = (startAxis < 0) ? startAxis + dims() : startAxis;
+    CV_Assert(0 <= startAxis && startAxis <= endAxis && endAxis <= dims());
+    ptrdiff_t res = 1;
+    for (int i = startAxis; i < endAxis; i++)
+        res *= sz[i];
+    return res;
+}
+inline BlobShape BlobShape::slice(int startAxis, int endAxis) const
+{
+    if (isEmpty())
+        return BlobShape::empty();
+    if (endAxis == INT_MAX)
+        endAxis = dims();
+    else if (endAxis < 0)
+        endAxis += dims();
+    startAxis = (startAxis < 0) ? startAxis + dims() : startAxis;
+    CV_Assert(0 <= startAxis && startAxis <= endAxis && endAxis <= dims());
+    BlobShape res(endAxis - startAxis, (const int*)NULL);
+    for (int i = startAxis; i < endAxis; i++)
+        res[i - startAxis] = sz[i];
+    return res;
+}
 inline const int *BlobShape::ptr() const
 {
    return sz;
 }
+inline int *BlobShape::ptr()
+{
+    return sz;
+}
 inline bool BlobShape::equal(const BlobShape &other) const
 {
    if (this->dims() != other.dims())
@@ -155,19 +232,83 @@ inline bool BlobShape::operator==(const BlobShape &r) const
    return this->equal(r);
 }
+inline BlobShape BlobShape::like(const Mat &m)
+{
+    return BlobShape(m.dims, (const int*)m.size);
+}
+inline BlobShape BlobShape::like(const UMat &m)
+{
+    return BlobShape(m.dims, (const int*)m.size);
+}
+inline BlobShape BlobShape::empty()
+{
+    return BlobShape(0, (const int*)NULL);
+}
+inline bool BlobShape::isEmpty() const
+{
+    return dims() == 0;
+}
+inline BlobShape BlobShape::operator+(const BlobShape &r) const
+{
+    BlobShape newShape(this->dims() + r.dims(), (int*)NULL);
+    for (int i = 0; i < this->dims(); i++)
+        newShape[i] = (*this)[i];
+    for (int i = 0; i < r.dims(); i++)
+        newShape[this->dims() + i] = r[i];
+    return newShape;
+}
 CV_EXPORTS std::ostream &operator<< (std::ostream &stream, const BlobShape &shape);
 /////////////////////////////////////////////////////////////////////
-inline int Blob::canonicalAxis(int axis) const
+#ifndef CV_DNN_UMAT
+#   define CV_DNN_SWITCH_MU(cpu_expr, gpu_expr) (cpu_expr)
+#else
+#   define CV_DNN_SWITCH_MU(cpu_expr, gpu_expr) ((state == HEAD_AT_UMAT) ? (gpu_expr) : (cpu_expr))
+#endif
+inline int Blob::dims() const
 {
-    CV_Assert(-dims() <= axis && axis < dims());
+    return CV_DNN_SWITCH_MU(m.dims, um.dims);
-    return (axis < 0) ? axis + dims() : axis;
 }
-inline int Blob::dims() const
+inline const int * Blob::sizes() const
 {
-    return m.dims;
+    return CV_DNN_SWITCH_MU((const int*)m.size, (const int*)um.size);
+}
+inline int Blob::type() const
+{
+    return CV_DNN_SWITCH_MU(m.type(), um.type());
+}
+/* Returns the offset of position `pos`, obtained by summing step[i] * pos[i]
+ * over the first min(n, dims()) axes and dividing the sum by elemSize().
+ * Axes beyond the n supplied coordinates contribute nothing to the offset.
+ */
+template<int n>
+inline size_t Blob::offset(const Vec<int, n> &pos) const
+{
+    const MatStep &step = CV_DNN_SWITCH_MU(m.step, um.step);
+    size_t ofs = 0;
+    const int usedAxes = std::min(n, dims());
+    for (int i = 0; i < usedAxes; i++)
+    {
+        CV_DbgAssert(pos[i] >= 0 && pos[i] < size(i));
+        ofs += step[i] * pos[i];
+    }
+    // pos holds only n components, so it must not be indexed for axes >= n;
+    // those axes are simply treated as zero coordinates.
+    CV_DbgAssert(ofs % elemSize() == 0);
+    return ofs / elemSize();
+}
+inline int Blob::canonicalAxis(int axis) const
+{
+    CV_Assert(-dims() <= axis && axis < dims());
+    return (axis < 0) ? axis + dims() : axis;
 }
 inline int Blob::xsize(int axis) const
@@ -196,27 +337,11 @@ inline size_t Blob::total(int startAxis, int endAxis) const
    CV_Assert(0 <= startAxis && startAxis <= endAxis && endAxis <= dims());
-    size_t size = 1; //fix: assume that slice isn't empty
+    size_t cnt = 1; //fix: assume that slice isn't empty
    for (int i = startAxis; i < endAxis; i++)
-        size *= (size_t)sizes()[i];
+        cnt *= (size_t)sizes()[i];
-    return size;
+    return cnt;
-}
-template<int n>
-inline size_t Blob::offset(const Vec<int, n> &pos) const
-{
-    size_t ofs = 0;
-    int i;
-    for (i = 0; i < std::min(n, dims()); i++)
-    {
-        CV_DbgAssert(pos[i] >= 0 && pos[i] < size(i));
-        ofs = ofs * (size_t)size(i) + pos[i];
-    }
-    for (; i < dims(); i++)
-        ofs *= (size_t)size(i);
-    return ofs;
 }
 inline size_t Blob::offset(int n, int cn, int row, int col) const
@@ -226,20 +351,20 @@ inline size_t Blob::offset(int n, int cn, int row, int col) const
 inline float *Blob::ptrf(int n, int cn, int row, int col)
 {
-    CV_Assert(type() == CV_32F);
+    return matRef(false).ptr<float>() + offset(n, cn, row, col);
-    return (float*)m.data + offset(n, cn, row, col);
 }
 inline uchar *Blob::ptr(int n, int cn, int row, int col)
 {
-    return m.data + m.elemSize() * offset(n, cn, row, col);
+    Mat &mat = matRef(false);
+    return mat.ptr() + mat.elemSize() * offset(n, cn, row, col);
 }
-template<typename TFloat>
+template<typename Dtype>
-inline TFloat* Blob::ptr(int n, int cn, int row, int col)
+inline Dtype* Blob::ptr(int n, int cn, int row, int col)
 {
-    CV_Assert(type() == cv::DataDepth<TFloat>::value);
+    CV_Assert(type() == cv::DataDepth<Dtype>::value);
-    return (TFloat*) ptr(n, cn, row, col);
+    return (Dtype*) ptr(n, cn, row, col);
 }
 inline BlobShape Blob::shape() const
@@ -260,26 +385,69 @@ inline bool Blob::equalShape(const Blob &other) const
    return true;
 }
-inline Mat& Blob::matRef()
+inline Mat& Blob::matRef(bool writeOnly)
 {
+#ifdef CV_DNN_UMAT
+    updateMat(!writeOnly);
+    state = HEAD_AT_MAT;
+#else
+    (void)writeOnly;
+#endif
    return m;
 }
 inline const Mat& Blob::matRefConst() const
 {
+    CV_DNN_UMAT_ONLY( updateMat() );
    return m;
 }
-inline UMat &Blob::umatRef()
+inline UMat &Blob::umatRef(bool writeOnly)
 {
-    CV_Error(Error::StsNotImplemented, "");
+#ifndef CV_DNN_UMAT
+    CV_Error(Error::GpuNotSupported, "");
+    (void)writeOnly;
    return *(new UMat());
+#else
+    updateUMat(!writeOnly);
+    state = HEAD_AT_UMAT;
+    return um;
+#endif
 }
 inline const UMat &Blob::umatRefConst() const
 {
-    CV_Error(Error::StsNotImplemented, "");
+#ifndef CV_DNN_UMAT
+    CV_Error(Error::GpuNotSupported, "");
    return *(new UMat());
+#else
+    updateUMat();
+    return um;
+#endif
+}
+template<>
+inline Mat &Blob::getRef<Mat>(bool writeOnly)
+{
+    return matRef(writeOnly);
+}
+template<>
+inline UMat &Blob::getRef<UMat>(bool writeOnly)
+{
+    return umatRef(writeOnly);
+}
+template<>
+inline const Mat &Blob::getRefConst<Mat>() const
+{
+    return matRefConst();
+}
+template<>
+inline const UMat &Blob::getRefConst<UMat>() const
+{
+    return umatRefConst();
 }
 inline Mat Blob::getPlane(int n, int cn)
@@ -313,27 +481,44 @@ inline Size Blob::size2() const
    return Size(cols(), rows());
 }
-inline int Blob::type() const
+inline Blob &Blob::shareFrom(const Blob &blob)
 {
-    return m.depth();
+    this->m = blob.m;
+#ifdef CV_DNN_UMAT
+    this->um = blob.um;
+    this->state = blob.state;
+#endif
+    return *this;
 }
-inline const int * Blob::sizes() const
+inline Blob &Blob::reshape(const BlobShape &newShape)
 {
-    return &m.size[0];
+    if (!m.empty()) m = m.reshape(1, newShape.dims(), newShape.ptr());
+#ifdef CV_DNN_UMAT
+    if (!um.empty()) um = um.reshape(1, newShape.dims(), newShape.ptr());
+#endif
+    return *this;
 }
+inline Blob Blob::reshaped(const BlobShape &newShape) const
+{
+    Blob res(*this); //also, res.shareFrom(*this) could be used
+    res.reshape(newShape);
+    return res;
+}
-inline Blob &Blob::shareFrom(const Blob &blob)
+inline int Blob::elemSize() const
 {
-    this->m = blob.m;
+    return CV_ELEM_SIZE(type());
-    return *this;
 }
-inline Blob &Blob::reshape(const BlobShape &shape)
+inline int Blob::getState() const
 {
-    m = m.reshape(1, shape.dims(), shape.ptr());
+#ifdef CV_DNN_UMAT
-    return *this;
+    return this->state;
+#else
+    return m.empty() ? UNINITIALIZED : HEAD_AT_MAT;
+#endif
 }
 }

--- a/modules/dnn/include/opencv2/dnn/dict.hpp
+++ b/modules/dnn/include/opencv2/dnn/dict.hpp
@@ -95,10 +95,10 @@ private:
        AutoBuffer<int64, 1> *pi;
        AutoBuffer<double, 1> *pd;
        AutoBuffer<String, 1> *ps;
-        void *p;
+        void *pv;
    };
-    DictValue(int _type, void *_p) : type(_type), p(_p) {}
+    DictValue(int _type, void *_p) : type(_type), pv(_p) {}
    void release();
 };

--- a/modules/dnn/include/opencv2/dnn/dnn.hpp
+++ b/modules/dnn/include/opencv2/dnn/dnn.hpp
@@ -59,15 +59,17 @@ namespace dnn //! This namespace is used for dnn module functionlaity.
     * This function automatically called on most of OpenCV builds,
     * but you need to call it manually on some specific configurations (iOS for example).
     */
-    CV_EXPORTS void initModule();
+    CV_EXPORTS_W void initModule();
    /** @brief This class provides all data needed to initialize layer.
     *
     * It includes dictionary with scalar params (which can be readed by using Dict interface),
     * blob params #blobs and optional meta information: #name and #type of layer instance.
    */
-    struct CV_EXPORTS LayerParams : public Dict
+    class CV_EXPORTS LayerParams : public Dict
    {
+    public:
+        //TODO: Add ability to name blob params
        std::vector<Blob> blobs; //!< List of learned parameters stored as blobs.
        String name; //!< Name of the layer instance (optional, can be used internal purposes).
@@ -77,10 +79,12 @@ namespace dnn //! This namespace is used for dnn module functionlaity.
    /** @brief This interface class allows to build new Layers - are building blocks of networks.
     *
     * Each class, derived from Layer, must implement allocate() methods to declare own outputs and forward() to compute outputs.
-     * Also before using the new layer into networks you must register your layer by using one of @ref LayerFactoryModule "LayerFactory" macros.
+     * Also before using the new layer into networks you must register your layer by using one of @ref dnnLayerFactory "LayerFactory" macros.
     */
-    struct CV_EXPORTS Layer
+    class CV_EXPORTS_W Layer
    {
+    public:
        //! List of learned parameters must be stored here to allow read them by using Net::getParam().
        std::vector<Blob> blobs;
@@ -116,7 +120,8 @@ namespace dnn //! This namespace is used for dnn module functionlaity.
        String type; //!< Type name which was used for creating layer by layer factory.
        Layer();
-        explicit Layer(const LayerParams &params); //!< Initialize only #name, #type and #blobs fields.
+        explicit Layer(const LayerParams &params);      //!< Initializes only #name, #type and #blobs fields.
+        void setParamsFrom(const LayerParams &params);  //!< Initializes only #name, #type and #blobs fields.
        virtual ~Layer();
    };
@@ -130,7 +135,7 @@ namespace dnn //! This namespace is used for dnn module functionlaity.
     *
     * This class supports reference counting of its instances, i. e. copies point to the same instance.
     */
-    class CV_EXPORTS Net
+    class CV_EXPORTS_W Net
    {
    public:
@@ -174,6 +179,7 @@ namespace dnn //! This namespace is used for dnn module functionlaity.
         *  @see setNetInputs(), Layer::inputNameToIndex(), Layer::outputNameToIndex()
         */
        void connect(String outPin, String inpPin);
        /** @brief Connects #@p outNum output of the first layer to #@p inNum input of the second layer.
         *  @param outLayerId identifier of the first layer
         *  @param inpLayerId identifier of the second layer
@@ -181,6 +187,7 @@ namespace dnn //! This namespace is used for dnn module functionlaity.
         *  @param inpNum number of the second layer input
         */
        void connect(int outLayerId, int outNum, int inpLayerId, int inpNum);
        /** @brief Sets ouputs names of the network input pseudo layer.
         *
         * Each net always has special own the network input pseudo layer with id=0.
@@ -267,10 +274,10 @@ namespace dnn //! This namespace is used for dnn module functionlaity.
     *  @param isBinary specifies whether the network was serialized in ascii mode or binary.
     *  @returns Pointer to the created importer, NULL in failure cases.
     *
-     *  @warning Torch7 importer is experimental now, you need explicitly set CMake opencv_dnn_BUILD_TORCH_IMPORTER flag to compile its.
+     *  @warning Torch7 importer is experimental now, you need to explicitly set the CMake `opencv_dnn_BUILD_TORCH_IMPORTER` flag to compile it.
     *
-     *  @note Ascii mode of Torch serializer is more preferable, because binary mode extensively use long type of C language,
+     *  @note Ascii mode of Torch serializer is more preferable, because binary mode extensively use `long` type of C language,
-     *  which has different bit-length on different systems.
+     *  which has various bit-length on different systems.
     *
     * The loading file must contain serialized <a href="https://github.com/torch/nn/blob/master/doc/module.md">nn.Module</a> object
     * with importing network. Try to eliminate a custom objects from serialazing data to avoid importing errors.

--- a/modules/dnn/include/opencv2/dnn/dnn.inl.hpp
+++ b/modules/dnn/include/opencv2/dnn/dnn.inl.hpp
@@ -86,7 +86,7 @@ inline DictValue DictValue::get<DictValue>(int idx) const
 template<>
 inline int64 DictValue::get<int64>(int idx) const
 {
-    CV_Assert(idx == -1 && size() == 1 || idx >= 0 && idx < size());
+    CV_Assert((idx == -1 && size() == 1) || (idx >= 0 && idx < size()));
    idx = (idx == -1) ? 0 : idx;
    if (type == Param::INT)
@@ -131,7 +131,7 @@ inline bool DictValue::get<bool>(int idx) const
 template<>
 inline double DictValue::get<double>(int idx) const
 {
-    CV_Assert(idx == -1 && size() == 1 || idx >= 0 && idx < size());
+    CV_Assert((idx == -1 && size() == 1) || (idx >= 0 && idx < size()));
    idx = (idx == -1) ? 0 : idx;
    if (type == Param::REAL)
@@ -159,7 +159,7 @@ template<>
 inline String DictValue::get<String>(int idx) const
 {
    CV_Assert(isString());
-    CV_Assert(idx == -1 && ps->size() == 1 || idx >= 0 && idx < (int)ps->size());
+    CV_Assert((idx == -1 && ps->size() == 1) || (idx >= 0 && idx < (int)ps->size()));
    return (*ps)[(idx == -1) ? 0 : idx];
 }

--- a/modules/dnn/include/opencv2/dnn/layer.hpp
+++ b/modules/dnn/include/opencv2/dnn/layer.hpp
@@ -50,7 +50,7 @@ namespace dnn
 //! @addtogroup dnn
 //! @{
 //!
-//! @defgroup LayerFactoryModule Utilities for new layers registration
+//! @defgroup dnnLayerFactory Utilities for New Layers Registration
 //! @{
 /** @brief %Layer factory allows to create instances of registered layers. */
@@ -86,7 +86,7 @@ private:
 *   @details This macros must be placed inside the function code.
 */
 #define REG_RUNTIME_LAYER_FUNC(type, constuctorFunc) \
-    LayerFactory::registerLayer(#type, constuctorFunc);
+    cv::dnn::LayerFactory::registerLayer(#type, constuctorFunc);
 /** @brief Registers layer class in runtime.
 *  @param type string, containing type name of the layer.
@@ -94,7 +94,7 @@ private:
 *  @details This macros must be placed inside the function code.
 */
 #define REG_RUNTIME_LAYER_CLASS(type, class) \
-    LayerFactory::registerLayer(#type, _layerDynamicRegisterer<class>);
+    cv::dnn::LayerFactory::registerLayer(#type, _layerDynamicRegisterer<class>);
 /** @brief Registers layer constructor on module load time.
 *   @param type string, containing type name of the layer.
@@ -102,7 +102,7 @@ private:
 *   @details This macros must be placed outside the function code.
 */
 #define REG_STATIC_LAYER_FUNC(type, constuctorFunc) \
-static _LayerStaticRegisterer __LayerStaticRegisterer_##type(#type, constuctorFunc);
+static cv::dnn::_LayerStaticRegisterer __LayerStaticRegisterer_##type(#type, constuctorFunc);
 /** @brief Registers layer class on module load time.
 *  @param type string, containing type name of the layer.
@@ -126,14 +126,15 @@ Ptr<Layer> _layerDynamicRegisterer(LayerParams &params)
 }
 //allows automatically register created layer on module load time
-struct _LayerStaticRegisterer
+class _LayerStaticRegisterer
 {
    String type;
+public:
-    _LayerStaticRegisterer(const String &type, LayerFactory::Constuctor constuctor)
+    _LayerStaticRegisterer(const String &layerType, LayerFactory::Constuctor layerConstuctor)
    {
-        this->type = type;
+        this->type = layerType;
-        LayerFactory::registerLayer(type, constuctor);
+        LayerFactory::registerLayer(layerType, layerConstuctor);
    }
    ~_LayerStaticRegisterer()

--- a/modules/dnn/include/opencv2/dnn/shape_utils.hpp
+++ b/modules/dnn/include/opencv2/dnn/shape_utils.hpp
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+#ifndef __OPENCV_DNN_DNN_SHAPE_UTILS_HPP__
+#define __OPENCV_DNN_DNN_SHAPE_UTILS_HPP__
+#include <opencv2/core.hpp>
+#include <ostream>
+namespace cv {
+namespace dnn {
+//Useful shortcut
+typedef BlobShape Shape;
+//! Streams a range in half-open interval notation: "[start, end)".
+//! The range is taken by const reference so that const objects and temporaries
+//! (e.g. `s << Range(0, 2)`) can be printed too; non-const lvalues still bind as before.
+inline std::ostream &operator<< (std::ostream &s, const cv::Range &r)
+{
+    return s << "[" << r.start << ", " << r.end << ")";
+}
+//Reshaping
+//TODO: add -1 specifier for automatic size inferring
+//! In-place reshape: re-assigns @p m to the result of m.reshape() called with 1 as the
+//! first argument and the dimension count / size array taken from @p shape.
+template<typename Mat>
+void reshape(Mat &m, const BlobShape &shape)
+{
+    const int numDims = shape.dims();
+    const int *sizes = shape.ptr();
+    m = m.reshape(1, numDims, sizes);
+}
+//! Non-modifying reshape: returns the result of m.reshape() called with 1 as the first
+//! argument and the dimension count / size array taken from @p shape; @p m is left untouched.
+template<typename Mat>
+Mat reshaped(const Mat &m, const BlobShape &shape)
+{
+    const int numDims = shape.dims();
+    const int *sizes = shape.ptr();
+    return m.reshape(1, numDims, sizes);
+}
+//Slicing
+//! Range helper accepted by slice(): constructible either from an existing cv::Range
+//! or from a start index plus an element count (the count defaults to 1).
+struct _Range : public cv::Range
+{
+    //! Wraps an existing range, keeping its start and end.
+    _Range(const Range &r) : cv::Range(r.start, r.end) {}
+    //! Covers [start, start + size).
+    _Range(int start, int size = 1) : cv::Range(start, start + size) {}
+};
+//! Returns m(ranges) where the first dimension is restricted to @p r0 and every
+//! remaining dimension is taken whole (Range::all()).
+template<typename Mat>
+Mat slice(const Mat &m, const _Range &r0)
+{
+    //No dimensionality check is performed here (the "m.dims >= 1" assertion is intentionally disabled).
+    cv::AutoBuffer<cv::Range, 4> ranges(m.dims);
+    ranges[0] = r0;
+    for (int axis = m.dims - 1; axis >= 1; axis--)
+        ranges[axis] = Range::all();
+    return m(&ranges[0]);
+}
+//! Returns m(ranges) where the first two dimensions are restricted to @p r0 and @p r1
+//! and every remaining dimension is taken whole. Requires m.dims >= 2.
+template<typename Mat>
+Mat slice(const Mat &m, const _Range &r0, const _Range &r1)
+{
+    CV_Assert(m.dims >= 2);
+    cv::AutoBuffer<cv::Range, 4> ranges(m.dims);
+    ranges[0] = r0;
+    ranges[1] = r1;
+    for (int axis = m.dims - 1; axis >= 2; axis--)
+        ranges[axis] = Range::all();
+    return m(&ranges[0]);
+}
+//! Returns m(ranges) where the first three dimensions are restricted to @p r0, @p r1
+//! and @p r2 and every remaining dimension is taken whole.
+//! @p m must have at least three dimensions.
+template<typename Mat>
+Mat slice(const Mat &m, const _Range &r0, const _Range &r1, const _Range &r2)
+{
+    //Three explicit ranges need at least three axes (same rule as the two-range overload's
+    //"dims >= 2"); the former "dims <= 3" check rejected 4-D inputs and made the loop below dead code.
+    CV_Assert(m.dims >= 3);
+    cv::AutoBuffer<cv::Range, 4> ranges(m.dims);
+    for (int i = 3; i < m.dims; i++)
+        ranges[i] = Range::all();
+    ranges[0] = r0;
+    ranges[1] = r1;
+    ranges[2] = r2;
+    return m(&ranges[0]);
+}
+//! Returns m(ranges) where the first four dimensions are restricted to @p r0 .. @p r3
+//! and every remaining dimension is taken whole.
+//! @p m must have at least four dimensions.
+template<typename Mat>
+Mat slice(const Mat &m, const _Range &r0, const _Range &r1, const _Range &r2, const _Range &r3)
+{
+    //Four explicit ranges need at least four axes; the former "dims <= 4" check let
+    //lower-dimensional inputs through (silently dropping trailing ranges) and rejected 5-D+ inputs.
+    CV_Assert(m.dims >= 4);
+    cv::AutoBuffer<cv::Range, 4> ranges(m.dims);
+    for (int i = 4; i < m.dims; i++)
+        ranges[i] = Range::all();
+    ranges[0] = r0;
+    ranges[1] = r1;
+    ranges[2] = r2;
+    ranges[3] = r3;
+    return m(&ranges[0]);
+}
+//Declaration only; the definition is not in this header.
+//NOTE(review): presumably returns the shape obtained by applying maskShape to the srcRange
+//axes of srcShape (all axes by default) - confirm against the implementation.
+BlobShape computeShapeByReshapeMask(const BlobShape &srcShape, const BlobShape &maskShape, Range srcRange = Range::all());
+}
+}
+#endif
--- a/modules/dnn/perf/perf_convolution.cpp
+++ b/modules/dnn/perf/perf_convolution.cpp
+#include "perf_precomp.hpp"
+namespace cvtest
+{
+using std::tr1::tuple;
+using std::tr1::get;
+using std::tr1::make_tuple;
+using std::make_pair;
+using namespace perf;
+using namespace testing;
+using namespace cv;
+using namespace cv::dnn;
+//Stride test values; the enumerator value itself is used as the stride (see get<3>(params) in the test body).
+enum {STRIDE_OFF = 1, STRIDE_ON = 2};
+CV_ENUM(StrideSize, STRIDE_OFF, STRIDE_ON);
+//Group test values; the enumerator value itself is passed as the "group" layer parameter.
+enum {GROUP_OFF = 1, GROUP_2 = 2};
+CV_ENUM(GroupSize, GROUP_OFF, GROUP_2);
+//Squared Size
+//NOTE(review): SSZ is not referenced anywhere else in this file.
+#define SSZ(n) cv::Size(n, n)
+//Pair of (input blob shape, number of output channels)
+typedef std::pair<BlobShape, int> InpShapeNumOut;
+typedef tuple<Size, InpShapeNumOut, GroupSize, StrideSize> ConvParam; //kernel_size, inp shape, groups, stride
+typedef TestBaseWithParam<ConvParam> ConvolutionPerfTest;
+//Measures the forward pass of a "Convolution" layer over every combination of
+//kernel size, (input shape, output channels), group count and stride listed below.
+//NOTE(review): the second input shape is 112x122 - possibly a typo for 112x112; confirm before
+//changing, since it alters the benchmarked configuration.
+PERF_TEST_P( ConvolutionPerfTest, perf, Combine(
+    Values(Size(1, 1), Size(3, 3), Size(5, 5), Size(11, 11)),
+    Values(make_pair(BlobShape(1,   4, 224, 224),  64),
+           make_pair(BlobShape(1,  64, 112, 122), 128),
+           make_pair(BlobShape(1, 256,  28,  28), 512)),
+    GroupSize::all(),
+    StrideSize::all())
+)
+{
+    //Fixed seed so that the generated blobs are reproducible between runs
+    RNG rng(0);
+    ConvParam params = GetParam();
+    //Only the kernel width is read; the kernel is used as ksz x ksz
+    int ksz     = get<0>(params).width;
+    BlobShape inpShape = get<1>(params).first;
+    int outCn   = get<1>(params).second;
+    int groups  = get<2>(params);
+    //Kernels of size 11 and above always use stride 4, ignoring the StrideSize parameter
+    int stride  = (ksz >= 11) ? 4 : (int)get<3>(params);
+    //Axis 1 of the input shape is used as the number of input channels
+    int inpCn = inpShape[1];
+    //Weights: (outCn, inpCn/groups, ksz, ksz); bias: one value per output channel
+    Blob wgtBlob(BlobShape(outCn, inpCn/groups, ksz, ksz)), biasBlob(BlobShape(outCn, 1, 1, 1));
+    Blob inpBlob(inpShape);
+    //Fill order matters: all three fills draw from the same RNG stream
+    rng.fill(biasBlob.matRef(), RNG::UNIFORM, -1, +1);
+    rng.fill(wgtBlob.matRef(), RNG::UNIFORM, -1, +1);
+    rng.fill(inpBlob.matRef(), RNG::UNIFORM, -1, +1);
+    LayerParams lp;
+    lp.set("num_output", outCn);
+    lp.set("group", groups);
+    lp.set("stride", stride);
+    lp.set("kernel_size", ksz);
+    //blobs[0] = weights, blobs[1] = bias
+    lp.blobs.reserve(2);
+    lp.blobs.push_back(wgtBlob);
+    lp.blobs.push_back(biasBlob);
+    std::vector<Blob*> inpBlobs(1, &inpBlob);
+    std::vector<Blob> outBlobs;
+    //NOTE(review): this changes the process-wide OpenCV thread count and is not restored afterwards
+    cv::setNumThreads(cv::getNumberOfCPUs());
+    Ptr<Layer> layer = cv::dnn::LayerFactory::createLayerInstance("Convolution", lp);
+    layer->allocate(inpBlobs, outBlobs);
+    declare.in(inpBlob.matRef(), wgtBlob.matRef(), WARMUP_RNG).out(outBlobs[0].matRef()).tbb_threads(cv::getNumThreads());
+    TEST_CYCLE_N(10)
+    {
+        layer->forward(inpBlobs, outBlobs);
+    }
+    //Timing only: the produced output is not compared against reference data
+    SANITY_CHECK_NOTHING();
+}
+}
\ No newline at end of file
--- a/modules/dnn/perf/perf_main.cpp
+++ b/modules/dnn/perf/perf_main.cpp
+#include "perf_precomp.hpp"
+CV_PERF_TEST_MAIN(dnn)
--- a/modules/dnn/perf/perf_precomp.hpp
+++ b/modules/dnn/perf/perf_precomp.hpp
+#ifdef __GNUC__
+#  pragma GCC diagnostic ignored "-Wmissing-declarations"
+#  if defined __clang__ || defined __APPLE__
+#    pragma GCC diagnostic ignored "-Wmissing-prototypes"
+#    pragma GCC diagnostic ignored "-Wextra"
+#  endif
+#endif
+#ifndef __OPENCV_PERF_PRECOMP_HPP__
+#define __OPENCV_PERF_PRECOMP_HPP__
+#include <opencv2/ts.hpp>
+#include <opencv2/imgproc.hpp>
+#include <opencv2/highgui.hpp>
+#include <opencv2/dnn.hpp>
+#endif
--- a/modules/dnn/samples/.gitignore
+++ b/modules/dnn/samples/.gitignore
+*.caffemodel
--- a/modules/dnn/samples/caffe_googlenet.cpp
+++ b/modules/dnn/samples/caffe_googlenet.cpp
@@ -124,8 +124,8 @@ int main(int argc, char **argv)
        exit(-1);
    }
-    resize(img, img, Size(224, 224));       //GoogLeNet accepts only 224x224 RGB-images
+    resize(img, img, Size(224, 224));                   //GoogLeNet accepts only 224x224 RGB-images
-    dnn::Blob inputBlob = dnn::Blob(img);   //Convert Mat to dnn::Blob image batch
+    dnn::Blob inputBlob = dnn::Blob::fromImages(img);   //Convert Mat to dnn::Blob batch of images
    //! [Prepare blob]
    //! [Set input blob]

--- a/modules/dnn/src/blob.cpp
+++ b/modules/dnn/src/blob.cpp
--- a/modules/dnn/src/caffe/caffe_importer.cpp
+++ b/modules/dnn/src/caffe/caffe_importer.cpp
@@ -191,7 +191,7 @@ namespace
            else if (pbBlob.has_shape())
            {
                const caffe::BlobShape &_shape = pbBlob.shape();
-                BlobShape shape(_shape.dim_size());
+                BlobShape shape = BlobShape::all(_shape.dim_size());
                for (int i = 0; i < _shape.dim_size(); i++)
                    shape[i] = (int)_shape.dim(i);
@@ -201,7 +201,7 @@ namespace
            else
            {
                CV_Error(Error::StsError, "Unknown shape of input blob");
-                return BlobShape(-1);
+                return BlobShape();
            }
        }

--- a/modules/dnn/src/caffe/layer_loaders.cpp
+++ b/modules/dnn/src/caffe/layer_loaders.cpp
+#include "../precomp.hpp"
+#include "layer_loaders.hpp"
+#include <opencv2/dnn/shape_utils.hpp>
+#include <climits>
+namespace cv
+{
+namespace dnn
+{
+//Utils
+//Reads the kernel, padding and stride sizes used by Conv, Deconv and Pooling layers.
+// - kernel: "kernel_h" and "kernel_w" when both are present, otherwise "kernel_size";
+//           raises StsBadArg when neither form is given.
+// - pad:    "pad_h" and "pad_w" when both are present, otherwise "pad" (default 0).
+// - stride: "stride_h" and "stride_w" when both are present, otherwise "stride" (default 1).
+//Kernel and stride must be positive and padding non-negative (checked with CV_Assert).
+static void getCaffeConvParams(LayerParams &params, Size &kernel, Size &pad, Size &stride)
+{
+    //Kernel
+    const bool kernelHW = params.has("kernel_h") && params.has("kernel_w");
+    if (!kernelHW && !params.has("kernel_size"))
+        CV_Error(Error::StsBadArg, "kernel_size (or kernel_h and kernel_w) not specified");
+    kernel.height = params.get<int>(kernelHW ? "kernel_h" : "kernel_size");
+    kernel.width = kernelHW ? params.get<int>("kernel_w") : kernel.height;
+    CV_Assert(kernel.height > 0 && kernel.width > 0);
+    //Padding
+    const bool padHW = params.has("pad_h") && params.has("pad_w");
+    pad.height = padHW ? params.get<int>("pad_h") : params.get<int>("pad", 0);
+    pad.width = padHW ? params.get<int>("pad_w") : pad.height;
+    CV_Assert(pad.height >= 0 && pad.width >= 0);
+    //Stride
+    const bool strideHW = params.has("stride_h") && params.has("stride_w");
+    stride.height = strideHW ? params.get<int>("stride_h") : params.get<int>("stride", 1);
+    stride.width = strideHW ? params.get<int>("stride_w") : stride.height;
+    CV_Assert(stride.height > 0 && stride.width > 0);
+}
+//Layers
+//Convolution and Deconvolution
+//Shared initialisation for Convolution and Deconvolution layers created from Caffe params:
+//copies blobs/name/type into the layer, fills kernel/pad/stride and validates
+//"num_output", "group" and the number of learned blobs (2 with bias, 1 without).
+static void initConvDeconvLayerFromCaffe(Ptr<BaseConvolutionLayer> l, LayerParams &params)
+{
+    l->setParamsFrom(params);
+    getCaffeConvParams(params, l->kernel, l->pad, l->stride);
+    bool bias = params.get<bool>("bias_term", true);
+    int numOutput = params.get<int>("num_output");
+    int group = params.get<int>("group", 1);
+    //"group" comes from the model file; reject non-positive values before it is used as a divisor
+    CV_Assert(group > 0);
+    CV_Assert(numOutput % group == 0);
+    CV_Assert((bias && l->blobs.size() == 2) || (!bias && l->blobs.size() == 1));
+}
+//Convolution specialization: creates the layer and initialises it from the Caffe params
+template<>
+Ptr<Layer> createLayerFromCaffe<ConvolutionLayer>(LayerParams &params)
+{
+    Ptr<BaseConvolutionLayer> convLayer = ConvolutionLayer::create();
+    initConvDeconvLayerFromCaffe(convLayer, params);
+    Ptr<Layer> result(convLayer);
+    return result;
+}
+//Deconvolution specialization: creates the layer and initialises it from the Caffe params
+template<>
+Ptr<Layer> createLayerFromCaffe<DeconvolutionLayer>(LayerParams &params)
+{
+    Ptr<BaseConvolutionLayer> deconvLayer = DeconvolutionLayer::create();
+    initConvDeconvLayerFromCaffe(deconvLayer, params);
+    Ptr<Layer> result(deconvLayer);
+    return result;
+}
+//Pooling specialization: "pool" selects max/ave/stochastic (case-insensitive, MAX when absent);
+//kernel, pad and stride are read by getCaffeConvParams().
+template<>
+Ptr<Layer> createLayerFromCaffe<PoolingLayer>(LayerParams &params)
+{
+    //MAX is used both for pool == "max" and when "pool" is not specified at all
+    int type = PoolingLayer::MAX;
+    if (params.has("pool"))
+    {
+        const String pool = params.get<String>("pool").toLowerCase();
+        if (pool == "ave")
+            type = PoolingLayer::AVE;
+        else if (pool == "stochastic")
+            type = PoolingLayer::STOCHASTIC;
+        else if (pool != "max")
+            CV_Error(Error::StsBadArg, "Unknown pooling type \"" + pool + "\"");
+    }
+    Size kernel, stride, pad;
+    getCaffeConvParams(params, kernel, pad, stride);
+    return Ptr<Layer>(PoolingLayer::create(type, kernel, stride, pad));
+}
+//Softmax specialization: "axis" defaults to 1
+template<>
+Ptr<Layer> createLayerFromCaffe<SoftmaxLayer>(LayerParams &params)
+{
+    return Ptr<Layer>(SoftmaxLayer::create(params.get<int>("axis", 1)));
+}
+//InnerProduct specialization.
+//Expects 1 or 2 learned blobs: blobs[0] is reshaped to (num_output x innerSize) and,
+//when "bias_term" is set (default), blobs[1] is reshaped to (1 x num_output).
+template<>
+Ptr<Layer> createLayerFromCaffe<InnerProductLayer>(LayerParams &params)
+{
+    const std::vector<Blob> &blobs = params.blobs;
+    CV_Assert(1 <= blobs.size() && blobs.size() <= 2);
+    int numOutputs = params.get<int>("num_output");
+    //"num_output" comes from the model file; reject non-positive values before dividing by it
+    CV_Assert(numOutputs > 0);
+    int innerSize = (int)blobs[0].total() / numOutputs;
+    bool bias = params.get<bool>("bias_term", true);
+    int axis = params.get<int>("axis", 1);
+    //The first blob must split evenly into numOutputs rows of innerSize elements
+    CV_Assert(blobs[0].dims() >= 2 && (size_t)(innerSize * numOutputs) == blobs[0].total());
+    CV_Assert(!bias || (blobs.size() == 2 && (size_t)numOutputs == blobs[1].total()));
+    Ptr<InnerProductLayer> l = InnerProductLayer::create(axis);
+    l->setParamsFrom(params);
+    l->blobs[0].reshape(Shape(numOutputs, innerSize));
+    if (bias)
+        l->blobs[1].reshape(Shape(1, numOutputs));
+    return Ptr<Layer>(l);
+}
+//LRN specialization: "norm_region" selects ACROSS_CHANNELS (default) or WITHIN_CHANNEL;
+//"local_size" (default 5) must be a positive odd number; "alpha" defaults to 1, "beta" to 0.75.
+template<>
+Ptr<Layer> createLayerFromCaffe<LRNLayer>(LayerParams& params)
+{
+    const String nrmType = params.get<String>("norm_region", "ACROSS_CHANNELS");
+    int type = LRNLayer::CHANNEL_NRM;
+    if (nrmType == "WITHIN_CHANNEL")
+        type = LRNLayer::SPATIAL_NRM;
+    else if (nrmType != "ACROSS_CHANNELS")
+        CV_Error(Error::StsBadArg, "Unknown region type \"" + nrmType + "\"");
+    const int size = params.get<int>("local_size", 5);
+    const bool positiveOdd = (size > 0) && (size % 2 == 1);
+    if (!positiveOdd)
+        CV_Error(Error::StsBadArg, "LRN layer supports only positive odd values for local_size");
+    const double alpha = params.get<double>("alpha", 1);
+    const double beta = params.get<double>("beta", 0.75);
+    return Ptr<Layer>(LRNLayer::create(type, size, alpha, beta));
+}
+//MVN specialization: "normalize_variance" defaults to true, "across_channels" to false, "eps" to 1e-9
+template<>
+Ptr<Layer> createLayerFromCaffe<MVNLayer>(LayerParams &params)
+{
+    const bool normVariance = params.get<bool>("normalize_variance", true);
+    const bool acrossChannels = params.get<bool>("across_channels", false);
+    const double eps = params.get<double>("eps", 1e-9);
+    return Ptr<Layer>(MVNLayer::create(normVariance, acrossChannels, eps));
+}
+/* Reshape layers */
+//Reshape specialization.
+//The affected range starts at "axis" (default 0) and spans "num_axes" axes; num_axes == -1
+//(default) leaves the range open-ended (end = INT_MAX). The new shape is read from "dim";
+//when "dim" is absent, Shape::all(0) is used.
+template<>
+Ptr<Layer> createLayerFromCaffe<ReshapeLayer>(LayerParams &params)
+{
+    const int axis = params.get<int>("axis", 0);
+    const int numAxes = params.get<int>("num_axes", -1);
+    CV_Assert(numAxes >= -1);
+    const int rangeEnd = (numAxes == -1) ? INT_MAX : axis + numAxes;
+    Range applyingRange(axis, rangeEnd);
+    Shape newShape = Shape::all(0);
+    if (params.has("dim"))
+    {
+        const DictValue &paramShape = params.get("dim");
+        const int numDims = paramShape.size();
+        newShape = Shape::all(numDims);
+        for (int d = 0; d < numDims; d++)
+            newShape[d] = paramShape.get<int>(d);
+    }
+    return Ptr<Layer>(ReshapeLayer::create(newShape, applyingRange));
+}
+//Flatten is implemented as a Reshape layer created with the fixed shape mask (0, -1); params are ignored
+Ptr<Layer> createFlattenLayerFromCaffe(LayerParams&)
+{
+    const Shape flattenMask(0, -1);
+    return Ptr<Layer>(ReshapeLayer::create(flattenMask));
+}
+//Concat specialization: "axis" defaults to 1
+template<>
+Ptr<Layer> createLayerFromCaffe<ConcatLayer>(LayerParams& params)
+{
+    const int axis = params.get<int>("axis", 1);
+    return Ptr<Layer>(ConcatLayer::create(axis));
+}
+//Split specialization: passes "top_count" (must be >= 0) as the outputs count, or -1 when it is absent
+template<>
+Ptr<Layer> createLayerFromCaffe<SplitLayer>(LayerParams &params)
+{
+    //TODO: maybe "top_count" param is useless because it can be determined by output connections number
+    int outputsCount = -1;
+    if (params.has("top_count"))
+    {
+        outputsCount = params.get<int>("top_count");
+        CV_Assert(outputsCount >= 0);
+    }
+    return Ptr<Layer>(SplitLayer::create(outputsCount));
+}
+//Slice specialization: "axis" defaults to 1; when "slice_point" is present its values
+//are forwarded as explicit slice indices, otherwise only the axis is passed.
+template<>
+Ptr<Layer> createLayerFromCaffe<SliceLayer>(LayerParams& params)
+{
+    const int axis = params.get<int>("axis", 1);
+    if (params.has("slice_point"))
+    {
+        const DictValue &indicesValue = params.get("slice_point");
+        const int numIndices = indicesValue.size();
+        std::vector<int> sliceIndices;
+        sliceIndices.reserve(numIndices);
+        for (int k = 0; k < numIndices; k++)
+            sliceIndices.push_back(indicesValue.get<int>(k));
+        return Ptr<Layer>(SliceLayer::create(axis, sliceIndices));
+    }
+    return Ptr<Layer>(SliceLayer::create(axis));
+}
+/* Activation layers */
+//Generic loader for activations without parameters: ignores params and calls ActivationLayer::create()
+template <typename ActivationLayer>
+Ptr<Layer> createLayerFromCaffe(LayerParams&)
+{
+    Ptr<Layer> activation(ActivationLayer::create());
+    return activation;
+}
+//ReLU specialization: "negative_slope" defaults to 0
+template<>
+Ptr<Layer> createLayerFromCaffe<ReLULayer>(LayerParams& params)
+{
+    const float negativeSlope = params.get<float>("negative_slope", 0.f);
+    return Ptr<Layer>(ReLULayer::create(negativeSlope));
+}
+//Power specialization: "power" and "scale" default to 1, "shift" defaults to 0
+template<>
+Ptr<Layer> createLayerFromCaffe<PowerLayer>(LayerParams& params)
+{
+    const float power = params.get<float>("power", 1.0f);
+    const float scale = params.get<float>("scale", 1.0f);
+    const float shift = params.get<float>("shift", 0.0f);
+    Ptr<Layer> layer(PowerLayer::create(power, scale, shift));
+    return layer;
+}
+//Explicit instantiation
+//Forces the loaders to be emitted in this translation unit so that other files can use them
+//through the declaration in layer_loaders.hpp. In this file Sigmoid, TanH, Abs and BNLL have
+//no explicit specialization, so they are generated from the generic parameter-free template.
+//NOTE(review): Pooling and Reshape are not listed; they are explicit specializations defined above.
+template Ptr<Layer> createLayerFromCaffe<ConvolutionLayer>(LayerParams&);
+template Ptr<Layer> createLayerFromCaffe<DeconvolutionLayer>(LayerParams&);
+template Ptr<Layer> createLayerFromCaffe<SoftmaxLayer>(LayerParams&);
+template Ptr<Layer> createLayerFromCaffe<InnerProductLayer>(LayerParams&);
+template Ptr<Layer> createLayerFromCaffe<LRNLayer>(LayerParams&);
+template Ptr<Layer> createLayerFromCaffe<MVNLayer>(LayerParams&);
+template Ptr<Layer> createLayerFromCaffe<ConcatLayer>(LayerParams&);
+template Ptr<Layer> createLayerFromCaffe<SliceLayer>(LayerParams&);
+template Ptr<Layer> createLayerFromCaffe<SplitLayer>(LayerParams&);
+template Ptr<Layer> createLayerFromCaffe<ReLULayer>(LayerParams&);
+template Ptr<Layer> createLayerFromCaffe<SigmoidLayer>(LayerParams&);
+template Ptr<Layer> createLayerFromCaffe<TanHLayer>(LayerParams&);
+template Ptr<Layer> createLayerFromCaffe<AbsLayer>(LayerParams&);
+template Ptr<Layer> createLayerFromCaffe<BNLLLayer>(LayerParams&);
+template Ptr<Layer> createLayerFromCaffe<PowerLayer>(LayerParams&);
+}
+}
--- a/modules/dnn/src/caffe/layer_loaders.hpp
+++ b/modules/dnn/src/caffe/layer_loaders.hpp
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+#ifndef __OPENCV_DNN_CAFFE_LAYER_LOADERS_HPP__
+#define __OPENCV_DNN_CAFFE_LAYER_LOADERS_HPP__
+#include <opencv2/dnn/all_layers.hpp>
+namespace cv
+{
+namespace dnn
+{
+//Common template for Caffe layer loaders
+//PublicLayer is the public layer interface type; the function builds a layer of that kind
+//from the given LayerParams and returns it as Ptr<Layer>. Only declared here.
+template <typename PublicLayer>
+Ptr<Layer> createLayerFromCaffe(LayerParams&);
+//Non-template loader for the Caffe "Flatten" layer. Only declared here.
+Ptr<Layer> createFlattenLayerFromCaffe(LayerParams&);
+}
+}
+#endif
\ No newline at end of file
--- a/modules/dnn/src/dnn.cpp
+++ b/modules/dnn/src/dnn.cpp
@@ -543,6 +543,13 @@ Layer::Layer(const LayerParams &params)
 }
+//Copies the name, type and learned blobs from params into this layer
+void Layer::setParamsFrom(const LayerParams &params)
+{
+    this->name = params.name;
+    this->type = params.type;
+    this->blobs = params.blobs;
+}
 int Layer::inputNameToIndex(String)
 {
    return -1;

--- a/modules/dnn/src/init.cpp
+++ b/modules/dnn/src/init.cpp
@@ -40,19 +40,8 @@
 //M*/
 #include "precomp.hpp"
+#include "caffe/layer_loaders.hpp"
-#include "layers/concat_layer.hpp"
-#include "layers/convolution_layer.hpp"
 #include "layers/blank_layer.hpp"
-#include "layers/elementwise_layers.hpp"
-#include "layers/fully_connected_layer.hpp"
-#include "layers/lrn_layer.hpp"
-#include "layers/mvn_layer.hpp"
-#include "layers/pooling_layer.hpp"
-#include "layers/reshape_layer.hpp"
-#include "layers/slice_layer.hpp"
-#include "layers/softmax_layer.hpp"
-#include "layers/split_layer.hpp"
 namespace cv
 {
@@ -76,27 +65,27 @@ void initModule()
    if (init.status)
        return;
-    REG_RUNTIME_LAYER_CLASS(Slice, SliceLayer)
+    REG_RUNTIME_LAYER_FUNC(Slice,           createLayerFromCaffe<SliceLayer>);
-    REG_RUNTIME_LAYER_CLASS(Softmax, SoftMaxLayer)
+    REG_RUNTIME_LAYER_FUNC(Split,           createLayerFromCaffe<SplitLayer>);
-    REG_RUNTIME_LAYER_CLASS(Split, SplitLayer)
+    REG_RUNTIME_LAYER_FUNC(Concat,          createLayerFromCaffe<ConcatLayer>);
-    REG_RUNTIME_LAYER_CLASS(Reshape, ReshapeLayer)
+    REG_RUNTIME_LAYER_FUNC(Reshape,         createLayerFromCaffe<ReshapeLayer>);
-    REG_STATIC_LAYER_FUNC(Flatten, createFlattenLayer)
+    REG_RUNTIME_LAYER_FUNC(Flatten,         createFlattenLayerFromCaffe);
-    REG_RUNTIME_LAYER_CLASS(Pooling, PoolingLayer)
-    REG_RUNTIME_LAYER_CLASS(MVN, MVNLayer)
-    REG_RUNTIME_LAYER_CLASS(LRN, LRNLayer)
-    REG_RUNTIME_LAYER_CLASS(InnerProduct, FullyConnectedLayer)
-    REG_RUNTIME_LAYER_CLASS(ReLU, ElementWiseLayer<ReLUFunctor>)
+    REG_RUNTIME_LAYER_FUNC(Convolution,     createLayerFromCaffe<ConvolutionLayer>);
-    REG_RUNTIME_LAYER_CLASS(TanH, ElementWiseLayer<TanHFunctor>)
+    REG_RUNTIME_LAYER_FUNC(Deconvolution,   createLayerFromCaffe<DeconvolutionLayer>);
-    REG_RUNTIME_LAYER_CLASS(BNLL, ElementWiseLayer<BNLLFunctor>)
+    REG_RUNTIME_LAYER_FUNC(Pooling,         createLayerFromCaffe<PoolingLayer>);
-    REG_RUNTIME_LAYER_CLASS(Power, ElementWiseLayer<PowerFunctor>)
+    REG_RUNTIME_LAYER_FUNC(LRN,             createLayerFromCaffe<LRNLayer>);
-    REG_RUNTIME_LAYER_CLASS(AbsVal, ElementWiseLayer<AbsValFunctor>)
+    REG_RUNTIME_LAYER_FUNC(InnerProduct,    createLayerFromCaffe<InnerProductLayer>);
-    REG_RUNTIME_LAYER_CLASS(Sigmoid, ElementWiseLayer<SigmoidFunctor>)
+    REG_RUNTIME_LAYER_FUNC(Softmax,         createLayerFromCaffe<SoftmaxLayer>);
-    REG_RUNTIME_LAYER_CLASS(Dropout, BlankLayer)
+    REG_RUNTIME_LAYER_FUNC(MVN,             createLayerFromCaffe<MVNLayer>);
-    REG_RUNTIME_LAYER_CLASS(Convolution, ConvolutionLayer)
+    REG_RUNTIME_LAYER_FUNC(ReLU,            createLayerFromCaffe<ReLULayer>);
-    REG_RUNTIME_LAYER_CLASS(Deconvolution, DeConvolutionLayer)
+    REG_RUNTIME_LAYER_FUNC(Sigmoid,         createLayerFromCaffe<SigmoidLayer>);
-    REG_RUNTIME_LAYER_CLASS(Concat, ConcatLayer)
+    REG_RUNTIME_LAYER_FUNC(TanH,            createLayerFromCaffe<TanHLayer>);
+    REG_RUNTIME_LAYER_FUNC(BNLL,            createLayerFromCaffe<BNLLLayer>);
+    REG_RUNTIME_LAYER_FUNC(AbsVal,          createLayerFromCaffe<AbsLayer>);
+    REG_RUNTIME_LAYER_FUNC(Power,           createLayerFromCaffe<PowerLayer>);
+    REG_RUNTIME_LAYER_CLASS(Dropout,        BlankLayer)
    init.status = true;
 }

--- a/modules/dnn/src/layers/concat_layer.cpp
+++ b/modules/dnn/src/layers/concat_layer.cpp
@@ -42,60 +42,80 @@
 #include "../precomp.hpp"
 #include "layers_common.hpp"
 #include "concat_layer.hpp"
+#include <opencv2/core/ocl.hpp>
 namespace cv
 {
 namespace dnn
 {
-    ConcatLayer::ConcatLayer(LayerParams &params) : Layer(params)
-    {
-        axis = params.get<int>("axis", 1);
-        CV_Assert(axis >= 0);
-    }
-    void ConcatLayer::allocate(const std::vector<Blob *> &inputs, std::vector<Blob> &outputs)
+ConcatLayerImpl::ConcatLayerImpl(int axis_ /*= 1*/)
-    {
+{
-        CV_Assert(inputs.size() > 0);
+    axis = axis_;
+}
-        int refType = inputs[0]->type();
+void ConcatLayerImpl::allocate(const std::vector<Blob *> &inputs, std::vector<Blob> &outputs)
-        BlobShape refShape = inputs[0]->shape();
+{
-        CV_Assert(axis < refShape.dims());
+    CV_Assert(inputs.size() > 0);
-        int axisSum = 0;
+    BlobShape refShape = inputs[0]->shape();
-        for (size_t i = 0; i < inputs.size(); i++)
+    axisIdx = inputs[0]->canonicalAxis(axis);
-        {
-            BlobShape curShape = inputs[i]->shape();
-            CV_Assert(curShape.dims() == refShape.dims() && inputs[i]->type() == refType);
+    int axisSum = 0;
-            for (int axisId = 0; axisId < refShape.dims(); axisId++)
+    useOpenCL = false;
-            {
+    for (size_t i = 0; i < inputs.size(); i++)
-                if (axisId != axis && refShape[axisId] != curShape[axisId])
+    {
-                    CV_Error(Error::StsBadSize, "Inconsitent shape for ConcatLayer");
+        BlobShape curShape = inputs[i]->shape();
-            }
-            axisSum += curShape[axis];
+        CV_Assert(curShape.dims() == refShape.dims() && inputs[i]->type() == inputs[0]->type());
+        for (int curAxis = 0; curAxis < refShape.dims(); curAxis++)
+        {
+            if (curAxis != axisIdx && refShape[curAxis] != curShape[curAxis])
+                CV_Error(Error::StsBadSize, "Inconsitent shape for ConcatLayer");
        }
-        refShape[axis] = axisSum;
+        axisSum += curShape[axisIdx];
-        outputs.resize(1);
+        useOpenCL |= inputs[i]->getState() == Blob::HEAD_AT_MAT;
-        outputs[0].create(refShape);
    }
-    void ConcatLayer::forward(std::vector<Blob *> &inputs, std::vector<Blob> &outputs)
+    refShape[axisIdx] = axisSum;
-    {
+    useOpenCL &= ocl::useOpenCL();
-        const Mat& outMat = outputs[0].matRef();
+    int allocFlags = (useOpenCL) ? Blob::ALLOC_UMAT : Blob::ALLOC_MAT;
-        std::vector<Range> ranges(outputs[0].dims(), Range::all());
-        int sizeStart = 0;
-        for (size_t i = 0; i < inputs.size(); i++)
-        {
-            int sizeEnd = sizeStart + inputs[i]->size(axis);
-            ranges[axis] = Range(sizeStart, sizeEnd);
-            Mat outSubMat = outMat(&ranges[0]);
+    outputs.resize(1);
-            inputs[i]->matRef().copyTo(outSubMat);
+    outputs[0].create(refShape, inputs[0]->type(), allocFlags);
+}
-            sizeStart = sizeEnd;
-        }
+//Dispatches to the UMat or Mat implementation of forward_().
+//The UMat path exists only when HAVE_OPENCL is defined and is taken when useOpenCL is set;
+//otherwise (or without HAVE_OPENCL) the Mat path runs.
+void ConcatLayerImpl::forward(std::vector<Blob *> &inputs, std::vector<Blob> &outputs)
+{
+    #ifdef HAVE_OPENCL
+    if (useOpenCL)
+        forward_<UMat>(inputs, outputs);
+    else
+    #endif
+        //Without HAVE_OPENCL the "if/else" above is compiled out and this call is unconditional
+        forward_<Mat>(inputs, outputs);
+}
+//Copies every input into outputs[0], placing the inputs one after another along axisIdx.
+//XMat selects the matrix type obtained from the blobs (instantiated with Mat and UMat by forward()).
+template<typename XMat>
+void ConcatLayerImpl::forward_(std::vector<Blob*> &inputs, std::vector<Blob> &outputs)
+{
+    XMat& outMat = outputs[0].getRef<XMat>();
+    //All axes are taken whole except axisIdx, whose window slides forward with each input
+    std::vector<Range> ranges(outputs[0].dims(), Range::all());
+    ranges[axisIdx].start = 0;
+    for (size_t i = 0; i < inputs.size(); i++)
+    {
+        ranges[axisIdx].end = ranges[axisIdx].start + inputs[i]->size(axisIdx);
+        inputs[i]->getRefConst<XMat>().copyTo(outMat(&ranges[0]));
+        //The next input starts where this one ended
+        ranges[axisIdx].start = ranges[axisIdx].end;
    }
 }
+//Factory: returns a new ConcatLayerImpl constructed with the given axis
+Ptr<ConcatLayer> ConcatLayer::create(int axis)
+{
+    Ptr<ConcatLayer> layer(new ConcatLayerImpl(axis));
+    return layer;
+}
+}
 }
--- a/modules/dnn/src/layers/concat_layer.hpp
+++ b/modules/dnn/src/layers/concat_layer.hpp
@@ -42,20 +42,29 @@
 #ifndef __OPENCV_DNN_LAYERS_CONCAT_LAYER_HPP__
 #define __OPENCV_DNN_LAYERS_CONCAT_LAYER_HPP__
 #include "../precomp.hpp"
+#include <opencv2/dnn/all_layers.hpp>
 namespace cv
 {
 namespace dnn
 {
-    class ConcatLayer : public Layer
-    {
+class ConcatLayerImpl : public ConcatLayer
-        int axis;
+{
+    bool useOpenCL;
-    public:
+    int axisIdx;
-        ConcatLayer(LayerParams& params);
-        void allocate(const std::vector<Blob*> &inputs, std::vector<Blob> &outputs);
+    template<typename XMat>
-        void forward(std::vector<Blob*> &inputs, std::vector<Blob> &outputs);
+    void forward_(std::vector<Blob*> &inputs, std::vector<Blob> &outputs);
-    };
+public:
+    ConcatLayerImpl(int axis_ = 1);
+    void allocate(const std::vector<Blob*> &inputs, std::vector<Blob> &outputs);
+    void forward(std::vector<Blob*> &inputs, std::vector<Blob> &outputs);
+};
 }
 }
 #endif
--- a/modules/dnn/src/layers/convolution_layer.cpp
+++ b/modules/dnn/src/layers/convolution_layer.cpp
--- a/modules/dnn/src/layers/convolution_layer.hpp
+++ b/modules/dnn/src/layers/convolution_layer.hpp
@@ -42,51 +42,65 @@
 #ifndef __OPENCV_DNN_LAYERS_CONVOLUTION_LAYER_HPP__
 #define __OPENCV_DNN_LAYERS_CONVOLUTION_LAYER_HPP__
 #include "../precomp.hpp"
+#include <opencv2/dnn/all_layers.hpp>
 namespace cv
 {
 namespace dnn
 {
-    //TODO: simultaneously convolution and bias addition for cache optimization
-    class ConvolutionLayer : public Layer
-    {
-    protected:
-        bool bias;
-        int numOutput, group;
-        int padH, padW;
-        int kerH, kerW;
-        int strideH, strideW;
-        int inpH, inpW, inpCn;
+//TODO: simultaneously convolution and bias addition for cache optimization
-        int outH, outW, outCn;
+class ConvolutionLayerImpl : public ConvolutionLayer
-        int topH, topW, topCn; //switched between inp/out on deconv/conv
+{
-        int inpGroupCn, outGroupCn;
+public:
-        int ksize;
+    ConvolutionLayerImpl();
+    virtual void allocate(const std::vector<Blob*> &inputs, std::vector<Blob> &outputs);
+    virtual void forward(std::vector<Blob*> &inputs, std::vector<Blob> &outputs);
+    virtual void init();
+protected:
+    int numOutput, group;
+    int inpH, inpW, inpCn;
+    int outH, outW, outCn;
+    int topH, topW, topCn; //switched between inp/out on deconv/conv
+    int inpGroupCn, outGroupCn;
+    int ksize;
+    bool bias;
+    bool tryUseOpenCL, useOpenCL;
+    Blob colBlob, biasOnesBlob;
+    bool is1x1() const;
+    virtual void computeInpOutShape(const Blob &inpBlob);
+    template<typename XMat>
+    void forward_(std::vector<Blob*> &inputs, std::vector<Blob> &outputs);
+    void im2col(const  Mat &srcImg,  Mat &dstCol);
+    void im2col(const UMat &srcImg, UMat &dstCol);
+};
+class DeConvolutionLayerImpl : public ConvolutionLayerImpl
+{
+public:
+    DeConvolutionLayerImpl();
+    virtual void forward(std::vector<Blob*> &inputs, std::vector<Blob> &outputs);
-        bool useOpenCL;
+protected:
-        Mat colMat, biasOnesMat;
-        inline bool is1x1() const;
+    virtual void computeInpOutShape(const Blob &inpBlob);
-        virtual void computeInpOutShape(const Blob &inpBlob);
-        void im2col(Blob &inpBlob, int imNum, int cnGroup);
-    public:
+    template<typename XMat>
-        ConvolutionLayer() {}
+    void forward_(std::vector<Blob*> &inputs, std::vector<Blob> &outputs);
-        ConvolutionLayer(LayerParams &params);
+    void col2im(const  Mat &colMat, Mat  &dstImg);
-        void allocate(const std::vector<Blob*> &inputs, std::vector<Blob> &outputs);
+    void col2im(const UMat &colMat, UMat &dstImg);
-        void forward(std::vector<Blob*> &inputs, std::vector<Blob> &outputs);
+};
-    };
-    class DeConvolutionLayer : public ConvolutionLayer
+//Importers
-    {
+Ptr<Layer> createConvolutionLayerFromCaffe(LayerParams &params);
-    protected:
+Ptr<Layer> createDeconvolutionLayerFromCaffe(LayerParams &params);
-        void computeInpOutShape(const Blob &inpBlob);
-        void col2im(Mat &dstMat);
-    public:
-        DeConvolutionLayer(LayerParams &params);
-        void forward(std::vector<Blob*> &inputs, std::vector<Blob> &outputs);
-    };
 }
 }
 #endif
--- a/modules/dnn/src/layers/elementwise_layers.cpp
+++ b/modules/dnn/src/layers/elementwise_layers.cpp
+#include "../precomp.hpp"
+#include "elementwise_layers.hpp"
+namespace cv
+{
+namespace dnn
+{
+// Expands to the parameterless factory `_Layer::create()`, which wraps a
+// default-constructed `_Functor` in an ElementWiseLayer. The trailing variadic
+// arguments are accepted but not used by the expansion.
+#define ACTIVATION_CREATOR_FOR(_Layer, _Functor, ...) \
+Ptr<_Layer> _Layer::create() { \
+    return Ptr<_Layer>( new ElementWiseLayer<_Functor>(_Functor()) ); }
+Ptr<ReLULayer> ReLULayer::create(double negativeSlope)
+{
+    // Factory: element-wise layer driven by a ReLU functor with the given slope.
+    const ReLUFunctor functor(negativeSlope);
+    return Ptr<ReLULayer>(new ElementWiseLayer<ReLUFunctor>(functor));
+}
+Ptr<TanHLayer> TanHLayer::create()
+{
+    // Factory: element-wise layer with a default-constructed TanH functor.
+    typedef ElementWiseLayer<TanHFunctor> Impl;
+    return Ptr<TanHLayer>(new Impl());
+}
+Ptr<SigmoidLayer> SigmoidLayer::create()
+{
+    // Factory: element-wise layer with a default-constructed sigmoid functor.
+    typedef ElementWiseLayer<SigmoidFunctor> Impl;
+    return Ptr<SigmoidLayer>(new Impl());
+}
+Ptr<AbsLayer> AbsLayer::create()
+{
+    // Factory: element-wise layer with a default-constructed absolute-value functor.
+    typedef ElementWiseLayer<AbsValFunctor> Impl;
+    return Ptr<AbsLayer>(new Impl());
+}
+Ptr<BNLLLayer> BNLLLayer::create()
+{
+    // Factory: element-wise layer with a default-constructed BNLL functor.
+    typedef ElementWiseLayer<BNLLFunctor> Impl;
+    return Ptr<BNLLLayer>(new Impl());
+}
+Ptr<PowerLayer> PowerLayer::create(double power /*= 1*/, double scale /*= 1*/, double shift /*= 0*/)
+{
+    // Factory: element-wise layer applying pow(shift + scale * x, power).
+    return Ptr<PowerLayer>(
+        new ElementWiseLayer<PowerFunctor>(PowerFunctor(power, scale, shift)));
+}
+}
+}
\ No newline at end of file
--- a/modules/dnn/src/layers/elementwise_layers.hpp
+++ b/modules/dnn/src/layers/elementwise_layers.hpp
@@ -44,6 +44,11 @@
 #include "../precomp.hpp"
 #include "layers_common.hpp"
 #include <cmath>
+#include <opencv2/dnn/all_layers.hpp>
+#include <opencv2/core/ocl.hpp>
+#ifdef HAVE_OPENCL
+#include "modules/dnn/opencl_kernels_dnn.hpp"
+#endif
 namespace cv
 {
@@ -55,130 +60,259 @@ using std::exp;
 using std::tanh;
 using std::pow;
-    template<typename Func>
+template<typename Func>
-    class ElementWiseLayer : public Layer
+class ElementWiseLayer : public Func::Layer
+{
+    bool useOpenCL;
+    Func func;
+    template<typename Dtype>
+    class PBody : public cv::ParallelLoopBody
    {
-        Func func;
+        Func &func;
+        Dtype *data;
    public:
-        ElementWiseLayer(LayerParams &_params) : func(_params) {}
+        PBody(Mat &mat, Func &func_) :
+            func(func_), data(mat.ptr<Dtype>())
-        void allocate(const std::vector<Blob*> &inputs, std::vector<Blob> &outputs)
+        {}
-        {
-            outputs.resize(inputs.size());
-            for (size_t i = 0; i < inputs.size(); i++)
-                outputs[i].shareFrom(*inputs[i]); //no data copy
-        }
-        void forward(std::vector<Blob*> &inputs, std::vector<Blob> &outputs)
+        void operator()(const Range &r) const
        {
-            for (size_t i = 0; i < inputs.size(); i++)
+            for (int i = r.start; i < r.end; i++)
-            {
+                data[i] = func(data[i]);
-                CV_Assert(inputs[i]->ptr() == outputs[i].ptr() && inputs[i]->type() == outputs[i].type());
-                size_t size = outputs[i].total();
-                if (outputs[i].type() == CV_32F)
-                {
-                    float *data = outputs[i].ptrf();
-                    for (size_t j = 0; j < size; j++)
-                        data[j] = func(data[j]);
-                }
-                else if (outputs[i].type() == CV_64F)
-                {
-                    double *data = outputs[i].ptr<double>();
-                    for (size_t j = 0; j < size; j++)
-                        data[j] = func(data[j]);
-                }
-                else
-                {
-                    CV_Error(Error::StsNotImplemented, "Only CV_32F and CV_64F blobs are supported");
-                }
-            }
        }
    };
+public:
-    struct ReLUFunctor
+    ElementWiseLayer() {}
+    ElementWiseLayer(const Func &f) : func(f) {}
+    void allocate(const std::vector<Blob*> &inputs, std::vector<Blob> &outputs)
    {
-        float negative_slope;
+        useOpenCL = ocl::useOpenCL();
-        ReLUFunctor(LayerParams &params)
+        outputs.resize(inputs.size());
+        for (size_t i = 0; i < inputs.size(); i++)
        {
-            if (params.has("negative_slope"))
+            outputs[i].shareFrom(*inputs[i]); //no data copy
-                negative_slope = params.get<float>("negative_slope");
+            //hotfix: shareFrom doesn't provide properly Mat/UMat switching
+            if (useOpenCL)
+                outputs[i].umatRef() = inputs[i]->umatRefConst();
            else
-                negative_slope = 0.f;
+                outputs[i].matRef() = inputs[i]->matRefConst();
        }
+    }
-        template<typename TFloat>
+    void forward(std::vector<Blob*> &inputs, std::vector<Blob> &outputs)
-        inline TFloat operator()(TFloat x)
+    {
-        {
+        #ifdef HAVE_OPENCL
-            return (x >= (TFloat)0) ? x : negative_slope * x;
+        if (useOpenCL)
-        }
+            forwardOCL(inputs, outputs);
-    };
+        else
+        #endif
+            forwardCPU(inputs, outputs);
+    }
-    struct TanHFunctor
+    #ifdef HAVE_OPENCL
+    void forwardOCL(std::vector<Blob*> &inputs, std::vector<Blob> &outputs)
    {
-        TanHFunctor(LayerParams&) {}
+        size_t wgSize = ocl::Device::getDefault().maxWorkGroupSize();
-        template<typename TFloat>
+        for (size_t i = 0; i < inputs.size(); i++)
-        inline TFloat operator()(TFloat x)
        {
-            return tanh(x);
+            const UMat &src = inputs[i]->umatRefConst();
+            UMat &dst = outputs[i].umatRef();
+            CV_Assert(src.isContinuous() && dst.isContinuous() && !src.offset && !dst.offset);
+            ocl::Kernel ker;
+            CV_Assert(func.initKernel(ker, src));
+            ker.set(0, (int)src.total());
+            ker.set(1, ocl::KernelArg::PtrReadOnly(src));
+            ker.set(2, ocl::KernelArg::PtrWriteOnly(dst));
+            size_t gSize = src.total();
+            CV_Assert(ker.run(1, &gSize, &wgSize, true));
        }
-    };
+    }
+    #endif
-    struct SigmoidFunctor
+    void forwardCPU(std::vector<Blob*> &inputs, std::vector<Blob> &outputs)
    {
-        SigmoidFunctor(LayerParams&) {}
+        for (size_t i = 0; i < inputs.size(); i++)
-        template<typename TFloat>
-        inline TFloat operator()(TFloat x)
        {
-            return (TFloat)1 / ((TFloat)1 + exp(-x));
+            const Mat &src = inputs[i]->matRefConst();
+            Mat &dst = outputs[i].matRef();
+            CV_Assert(src.ptr() == dst.ptr() && src.isContinuous());
+            Range sizeRange = Range(0, dst.total());
+            if (dst.type() == CV_32F)
+            {
+                cv::parallel_for_(sizeRange, PBody<float>(dst, func));
+            }
+            else if (dst.type() == CV_64F)
+            {
+                cv::parallel_for_(sizeRange, PBody<double>(dst, func));
+            }
+            else
+            {
+                CV_Error(Error::StsNotImplemented, "Only CV_32F and CV_64F blobs are supported");
+            }
        }
-    };
+    }
+};
-    struct AbsValFunctor
+#ifdef HAVE_OPENCL
+// Returns the OpenCL build option "-DT=<type> " (note the trailing space) for the
+// element type of `m`, so further options can be appended directly.
+static String oclGetTMacro(const UMat &m)
+{
+    const String typeName(ocl::typeToStr(m.type()));
+    return String("-DT=") + typeName + String(" ");
+}
+#endif
+struct ReLUFunctor
+{
+    typedef ReLULayer Layer;
+    double slope;
+    ReLUFunctor(double slope_)
+        : slope(slope_) {}
+    template<typename TFloat>
+    inline TFloat operator()(TFloat x) const
    {
-        AbsValFunctor(LayerParams&) {}
+        return (x >= (TFloat)0) ? x : (TFloat)slope * x;
+    }
-        template<typename TFloat>
+    #ifdef HAVE_OPENCL
-        inline TFloat operator()(TFloat x)
+    bool initKernel(ocl::Kernel &ker, const UMat &src) const
-        {
+    {
-            return abs(x);
+        const char *buildoptSlope = (slope == 0) ? "-DRELU_NO_SLOPE" : "";
-        }
+        String buildopt = oclGetTMacro(src) + buildoptSlope;
-    };
-    struct PowerFunctor
+        if (!ker.create("ReLUForward", ocl::dnn::activations_oclsrc, buildopt))
+            return false;
+        if (slope != 0)
+            ker.set(3, (float)slope);
+        return true;
+    }
+    #endif
+};
+struct TanHFunctor
+{
+    typedef TanHLayer Layer;
+    template<typename TFloat>
+    inline TFloat operator()(TFloat x) const
    {
-        float power, scale, shift;
+        return tanh(x);
+    }
-        PowerFunctor(LayerParams &params)
+    #ifdef HAVE_OPENCL
-        {
+    bool initKernel(ocl::Kernel &ker, const UMat &src) const
-            power = params.get<float>("power", 1.0f);
+    {
-            scale = params.get<float>("scale", 1.0f);
+        if (!ker.create("TanHForward", ocl::dnn::activations_oclsrc, oclGetTMacro(src)))
-            shift = params.get<float>("shift", 0.0f);
+            return false;
-        }
+        return true;
+    }
+    #endif
+};
-        template<typename TFloat>
+struct SigmoidFunctor
-        inline TFloat operator()(TFloat x)
+{
-        {
+    typedef SigmoidLayer Layer;
-            return pow((TFloat)shift + (TFloat)scale * x, (TFloat)power);
-        }
-    };
-    struct BNLLFunctor
+    template<typename TFloat>
+    inline TFloat operator()(TFloat x) const
    {
-        BNLLFunctor(LayerParams&) {}
+        return (TFloat)1 / ((TFloat)1 + exp(-x));
+    }
+    #ifdef HAVE_OPENCL
+    bool initKernel(ocl::Kernel &ker, const UMat &src) const
+    {
+        if (!ker.create("SigmoidForward", ocl::dnn::activations_oclsrc, oclGetTMacro(src)))
+            return false;
+        return true;
+    }
+    #endif
+};
+// Element-wise absolute value; `Layer` names the public layer interface that
+// ElementWiseLayer<AbsValFunctor> derives from.
+struct AbsValFunctor
+{
+    typedef AbsLayer Layer;
+    // CPU path: applied to every element of the blob.
+    template<typename TFloat>
+    inline TFloat operator()(TFloat x) const
+    {
+        return abs(x);
+    }
+    #ifdef HAVE_OPENCL
+    // OpenCL path: creates the "AbsValForward" kernel typed for `src`;
+    // returns false when kernel creation fails.
+    bool initKernel(ocl::Kernel &ker, const UMat &src) const
+    {
+        return ker.create("AbsValForward", ocl::dnn::activations_oclsrc, oclGetTMacro(src));
+    }
+    #endif
+};
+// Element-wise BNLL (binomial normal log likelihood): f(x) = log(1 + exp(x)).
+// `Layer` names the public layer interface that ElementWiseLayer<BNLLFunctor> derives from.
+struct BNLLFunctor
+{
+    typedef BNLLLayer Layer;
+    // CPU path. Uses the overflow-safe identity
+    //   log(1 + exp(x)) = max(x, 0) + log(1 + exp(-|x|)),
+    // so exp() is only ever evaluated on a non-positive argument.
+    // The max(x, 0) term was previously missing, giving wrong results for x > 0.
+    template<typename TFloat>
+    inline TFloat operator()(TFloat x) const
+    {
+        const TFloat positivePart = (x > (TFloat)0) ? x : (TFloat)0;
+        return positivePart + log((TFloat)1 + exp(-abs(x)));
+    }
+    #ifdef HAVE_OPENCL
+    // OpenCL path: creates the "BNLLForward" kernel typed for `src`;
+    // returns false when kernel creation fails.
+    bool initKernel(ocl::Kernel &ker, const UMat &src) const
+    {
+        if (!ker.create("BNLLForward", ocl::dnn::activations_oclsrc, oclGetTMacro(src)))
+            return false;
+        return true;
+    }
+    #endif
+};
+// Element-wise power: f(x) = pow(shift + scale * x, power).
+// `Layer` names the public layer interface that ElementWiseLayer<PowerFunctor> derives from.
+struct PowerFunctor
+{
+    typedef PowerLayer Layer;
+    double power, scale, shift;
+    PowerFunctor(double power_, double scale_ = 1, double shift_ = 0)
+        : power(power_), scale(scale_), shift(shift_) {}
+    // CPU path: parameters are narrowed to the element type before use.
+    template<typename TFloat>
+    inline TFloat operator()(TFloat x) const
+    {
+        const TFloat base = (TFloat)shift + (TFloat)scale * x;
+        return pow(base, (TFloat)power);
+    }
+    #ifdef HAVE_OPENCL
+    // OpenCL path: creates the "PowForward" kernel typed for `src` and binds
+    // power/scale/shift (as floats) to arguments 3..5; returns false when
+    // kernel creation fails.
+    bool initKernel(ocl::Kernel &ker, const UMat &src) const
+    {
+        const bool created = ker.create("PowForward", ocl::dnn::activations_oclsrc, oclGetTMacro(src));
+        if (created)
+        {
+            ker.set(3, (float)power);
+            ker.set(4, (float)scale);
+            ker.set(5, (float)shift);
+        }
+        return created;
+    }
+    #endif
+};
-        template<typename TFloat>
-        inline TFloat operator()(TFloat x)
-        {
-            return log((TFloat)1 + exp(-abs(x)));
-        }
-    };
 }
 }
 #endif
--- a/modules/dnn/src/layers/fully_connected_layer.cpp
+++ b/modules/dnn/src/layers/fully_connected_layer.cpp
@@ -42,73 +42,88 @@
 #include "../precomp.hpp"
 #include "layers_common.hpp"
 #include "fully_connected_layer.hpp"
+#include "op_blas.hpp"
+#include <opencv2/dnn/shape_utils.hpp>
+#include <opencv2/core/ocl.hpp>
 namespace cv
 {
 namespace dnn
 {
-    FullyConnectedLayer::FullyConnectedLayer(LayerParams &params) : Layer(params)
-    {
-        numOutputs = params.get<int>("num_output");
-        bias = params.get<bool>("bias_term", true);
-        axis_ = params.get<int>("axis", 1);
-        CV_Assert(blobs.size() == (bias ? 2U : 1U));
+FullyConnectedLayerImpl::FullyConnectedLayerImpl(int axis_)
-        CV_Assert(blobs[0].dims() >= 2 && blobs[0].total() >= (size_t)numOutputs);
+{
-        CV_Assert(!bias || blobs[1].total() == (size_t)numOutputs);
+    axis = axis_;
-    }
+}
-    void FullyConnectedLayer::allocate(const std::vector<Blob*> &input, std::vector<Blob> &output)
+// Validates the learned blobs against the first input, derives the GEMM
+// dimensions (outerSize x innerSize times innerSize x numOutput), and allocates
+// the bias helper column plus one output blob per input.
+// blobs[0] holds the 2-D weights; blobs[1], when present, holds the bias.
+void FullyConnectedLayerImpl::allocate(const std::vector<Blob*> &input, std::vector<Blob> &output)
-    {
+{
-        CV_Assert(input.size() > 0);
+    CV_Assert(input.size() > 0);
+    CV_Assert(1 <= blobs.size() && blobs.size() <= 2);
+    CV_Assert(blobs[0].dims() == 2);
-        axis = input[0]->canonicalAxis(axis_);
+    // A bias term exists only when a second blob was supplied; testing ">= 1"
+    // would always be true here and make the code below read blobs[1] out of range.
+    bias = (blobs.size() > 1);
-        innerSize = (int)input[0]->total(axis);
+    axisCan = input[0]->canonicalAxis(axis);
+    dtype = input[0]->type();
+    numOutput = blobs[0].size(0);
+    innerSize = blobs[0].size(1);
+    outerSize = input[0]->total(0, axisCan);
-        CV_Assert((size_t)innerSize * (size_t)numOutputs == blobs[0].total());
+    CV_Assert((size_t)innerSize == input[0]->total(axisCan));
-        CV_Assert(blobs[0].size(-2) == numOutputs && blobs[0].size(-1) == innerSize);
+    CV_Assert(!bias || (size_t)numOutput == blobs[1].total());
-        output.resize(input.size());
+    useOpenCL = ocl::useOpenCL();
-        for (size_t i = 0; i < input.size(); i++)
+    // Storage must match the path forward() will take: UMat for OpenCL, Mat for CPU.
+    int allocFlags = useOpenCL ? Blob::ALLOC_UMAT : Blob::ALLOC_MAT;
-        {
-            if (i != 0)
-                CV_Assert(input[i]->equalShape(*input[0]));
-            this->reshape(*input[i], output[i]);
+    // Column of ones; forward_ multiplies it with the bias row to add the bias to every output row.
+    biasOnesBlob.create(Shape(outerSize, 1), dtype, allocFlags);
-        }
+    biasOnesBlob.setTo(1);
-    }
-    void FullyConnectedLayer::reshape(const Blob &inp, Blob &out)
+    output.resize(input.size());
+    for (size_t i = 0; i < input.size(); i++)
    {
-        BlobShape inpShape = inp.shape();
+        CV_Assert(i == 0 || (input[i]->equalShape(*input[0]) && input[i]->type() == dtype));
-        BlobShape outShape(axis+1, inpShape.ptr());
+        // Keep the leading axes and replace everything from the canonical axis on with numOutput.
+        Shape outShape = input[i]->shape().slice(0, axisCan) + Shape(numOutput);
-        outShape[axis] = numOutputs;
+        output[i].create(outShape, dtype, allocFlags);
+    }
+}
-        out.create(outShape, inp.type());
+void FullyConnectedLayerImpl::forward(std::vector<Blob*> &input, std::vector<Blob> &output)
+{
+    // Dispatch to the UMat implementation when the build has OpenCL and
+    // allocate() selected it; otherwise fall through to the Mat implementation.
+#ifdef HAVE_OPENCL
+    if (useOpenCL)
+    {
+        forward_<UMat>(input, output);
+        return;
+    }
+#endif
+    forward_<Mat>(input, output);
+}
+template<typename XMat>
+void FullyConnectedLayerImpl::forward_(std::vector<Blob *> &input, std::vector<Blob> &output)
+{
+    const XMat &weight = blobs[0].getRefConst<XMat>();
+    const XMat *biasMat = NULL, *biasOnesMat = NULL;
+    if (bias)
+    {
+        biasOnesMat = &biasOnesBlob.getRefConst<XMat>();
+        biasMat = &blobs[1].getRefConst<XMat>();
    }
-    void FullyConnectedLayer::forward(std::vector<Blob*> &input, std::vector<Blob> &output)
+    for (size_t i = 0; i < input.size(); i++)
    {
-        for (size_t i = 0; i < input.size(); i++)
+        const XMat srcMat = reshaped(input[i]->getRefConst<XMat>(), Shape(outerSize, innerSize));
-        {
+        XMat dstMat = reshaped(output[i].getRef<XMat>(), Shape(outerSize, numOutput));
-            int M = (int)input[i]->total(0, axis);
+        dnn::gemm(srcMat, weight, 1, dstMat, 0, GEMM_2_T);
-            int N = numOutputs;
-            int K = innerSize;
+        if (bias)
+            dnn::gemm(*biasOnesMat, *biasMat, 1, dstMat, 1);
-            Mat srcMat(M, K, input[i]->type(), input[i]->ptrf());
-            Mat weight(N, K, blobs[0].type(), blobs[0].ptrf());
-            Mat dstMat(M, N, output[i].type(), output[i].ptrf());
-            //important: Caffe stores weights as transposed array
-            cv::gemm(srcMat, weight, 1, noArray(), 0, dstMat, GEMM_2_T);
-            if (bias)
-            {
-                Mat biasOnesMat = Mat::ones(M, 1, CV_32F);
-                Mat biasMat(1, N, CV_32F, blobs[1].ptrf());
-                cv::gemm(biasOnesMat, biasMat, 1, dstMat, 1, dstMat);
-            }
-        }
    }
 }
+Ptr<InnerProductLayer> InnerProductLayer::create(int axis)
+{
+    // Factory: hides the concrete FullyConnectedLayerImpl behind the public interface.
+    Ptr<InnerProductLayer> layer(new FullyConnectedLayerImpl(axis));
+    return layer;
+}
+}
 }
--- a/modules/dnn/src/layers/fully_connected_layer.hpp
+++ b/modules/dnn/src/layers/fully_connected_layer.hpp
@@ -42,26 +42,30 @@
 #ifndef __OPENCV_DNN_LAYERS_FULLY_CONNECTED_LAYER_HPP__
 #define __OPENCV_DNN_LAYERS_FULLY_CONNECTED_LAYER_HPP__
 #include "../precomp.hpp"
+#include <opencv2/dnn/all_layers.hpp>
 namespace cv
 {
 namespace dnn
 {
-    class FullyConnectedLayer : public Layer
-    {
-        bool bias;
-        int numOutputs;
-        int axis_, axis;
-        int innerSize;
+class FullyConnectedLayerImpl : public InnerProductLayer
+{
+    int axisCan, dtype;
+    int numOutput, innerSize, outerSize;
+    bool bias, useOpenCL;
+    Blob biasOnesBlob;
+    template<typename XMat>
+    void forward_(std::vector<Blob*> &input, std::vector<Blob> &output);
+public:
-        void reshape(const Blob &inp, Blob &out);
+    FullyConnectedLayerImpl(int axisCan = 1);
+    void allocate(const std::vector<Blob*> &input, std::vector<Blob> &output);
+    void forward(std::vector<Blob*> &inputs, std::vector<Blob> &outputs);
+};
-    public:
-        FullyConnectedLayer(LayerParams &params);
-        void allocate(const std::vector<Blob*> &input, std::vector<Blob> &output);
-        void forward(std::vector<Blob*> &inputs, std::vector<Blob> &outputs);
-    };
 }
 }
 #endif
--- a/modules/dnn/src/layers/layers_common.cpp
+++ b/modules/dnn/src/layers/layers_common.cpp
@@ -46,44 +46,5 @@ namespace cv
 namespace dnn
 {
-void getKernelParams(LayerParams &params, int &kernelH, int &kernelW, int &padH, int &padW, int &strideH, int &strideW)
-{
-    if (params.has("kernel_h") && params.has("kernel_w"))
-    {
-        kernelH = params.get<int>("kernel_h");
-        kernelW = params.get<int>("kernel_w");
-    }
-    else if (params.has("kernel_size"))
-    {
-        kernelH = kernelW = params.get<int>("kernel_size");
-    }
-    else
-    {
-        CV_Error(cv::Error::StsBadArg, "kernel_size (or kernel_h and kernel_w) not specified");
-    }
-    if (params.has("pad_h") && params.has("pad_w"))
-    {
-        padH = params.get<int>("pad_h");
-        padW = params.get<int>("pad_w");
-    }
-    else
-    {
-        padH = padW = params.get<int>("pad", 0);
-    }
-    if (params.has("stride_h") && params.has("stride_w"))
-    {
-        strideH = params.get<int>("stride_h");
-        strideW = params.get<int>("stride_w");
-    }
-    else
-    {
-        strideH = strideW = params.get<int>("stride", 1);
-    }
-    CV_Assert(kernelH > 0 && kernelW > 0 && padH >= 0 && padW >= 0 && strideH > 0 && strideW > 0);
-}
 }
 }
--- a/modules/dnn/src/layers/layers_common.hpp
+++ b/modules/dnn/src/layers/layers_common.hpp
@@ -42,14 +42,14 @@
 #ifndef __OPENCV_DNN_LAYERS_LAYERS_COMMON_HPP__
 #define __OPENCV_DNN_LAYERS_LAYERS_COMMON_HPP__
 #include <opencv2/dnn.hpp>
+#include "op_blas.hpp"
+#include "op_im2col.hpp"
 namespace cv
 {
 namespace dnn
 {
-void getKernelParams(LayerParams &params, int &kernelH, int &kernelW, int &padH, int &padW, int &strideH, int &strideW);
 }
 }

--- a/modules/dnn/src/layers/lrn_layer.cpp
+++ b/modules/dnn/src/layers/lrn_layer.cpp
@@ -42,123 +42,213 @@
 #include "../precomp.hpp"
 #include "layers_common.hpp"
 #include "lrn_layer.hpp"
+#include "modules/dnn/opencl_kernels_dnn.hpp"
 #include <opencv2/imgproc.hpp>
+#include <opencv2/core/ocl.hpp>
+#include <opencv2/dnn/shape_utils.hpp>
 #include <algorithm>
 namespace cv
 {
 namespace dnn
 {
-    LRNLayer::LRNLayer(LayerParams &params) : Layer(params)
-    {
-        String nrmType = params.get<String>("norm_region", "ACROSS_CHANNELS");
-        if (nrmType == "ACROSS_CHANNELS")
-            type = CHANNEL_NRM;
-        else if (nrmType == "WITHIN_CHANNEL")
-            type = SPATIAL_NRM;
-        else
-            CV_Error(Error::StsBadArg, "Unknown region type \"" + nrmType + "\"");
-        size = params.get<int>("local_size", 5);
-        if (size % 2 != 1 || size <= 0)
-            CV_Error(Error::StsBadArg, "LRN layer supports only positive odd values for local_size");
-        alpha = params.get<double>("alpha", 1);
-        beta = params.get<double>("beta", 0.75);
-    }
-    void LRNLayer::allocate(const std::vector<Blob*> &inputs, std::vector<Blob> &outputs)
+LRNLayerImpl::LRNLayerImpl(int type_, int size_, double alpha_, double beta_)
-    {
+{
-        CV_Assert(inputs.size() == 1);
+    type = type_;
-        outputs.resize(1);
+    size = size_;
+    alpha = alpha_;
+    beta = beta_;
+}
-        Vec4i shape = inputs[0]->shape4();
+void LRNLayerImpl::allocate(const std::vector<Blob*> &inputs, std::vector<Blob> &outputs)
-        outputs[0].create(shape);
+{
+    CV_Assert(inputs.size() == 1 && inputs[0]->dims() == 4);
+    CV_Assert(type == CHANNEL_NRM || type == SPATIAL_NRM);
+    useOpenCL = cv::ocl::useOpenCL();
-        shape[0] = 1; //maybe make shape[0] = 1 too
+    if (type == SPATIAL_NRM && !useOpenCL)
-        bufBlob.create(shape);
+        buf.create(inputs[0]->shape().slice(2), inputs[0]->type(), Blob::ALLOC_MAT);
-    }
+    if (type == CHANNEL_NRM && useOpenCL)
+        buf.create(inputs[0]->shape().slice(2), inputs[0]->type(), Blob::ALLOC_UMAT);
-    void LRNLayer::forward(std::vector<Blob*> &inputs, std::vector<Blob> &outputs)
+    outputs.resize(1);
+    outputs[0].create(inputs[0]->shape(), inputs[0]->type());
+}
+void LRNLayerImpl::forward(std::vector<Blob*> &inputs, std::vector<Blob> &outputs)
+{
+    Blob &src = *inputs[0];
+    Blob &dst = outputs[0];
+    switch (type)
    {
-        Blob &src = *inputs[0];
+    case CHANNEL_NRM:
-        Blob &dst = outputs[0];
+        channelNoramlization(src, dst);
+        break;
+    case SPATIAL_NRM:
+        spatialNormalization(src, dst);
+        break;
+    default:
+        CV_Error(Error::StsNotImplemented, "Unimplemented mode of LRN layer");
+        break;
+    }
+}
-        switch (type)
+template<typename XMat>
-        {
+static XMat getPlane(XMat &m, int n, int cn)
-        case CHANNEL_NRM:
+{
-            channelNoramlization(src, dst);
+    return reshaped(slice(m, n, cn), BlobShape::like(m).slice(2));
-            break;
+}
-        case SPATIAL_NRM:
-            spatialNormalization(src, dst);
+void LRNLayerImpl::channelNoramlization(Blob &src, Blob &dst)
-            break;
+{
-        default:
+    if (!useOpenCL)
-            CV_Error(cv::Error::StsNotImplemented, "Unimplemented mode of LRN layer");
+        channelNoramlization_<Mat>(src, dst);
-            break;
+    else
-        }
+    {
+        //channelNoramlization_ocl(src.getRefConst<UMat>(), dst.getRef<UMat>()); //consumes a lot of memory
+        channelNoramlization_<UMat>(src, dst);
    }
+}
-    void LRNLayer::channelNoramlization(Blob &srcBlob, Blob &dstBlob)
+// Across-channel LRN: for every sample and channel, divides the source plane by
+// (1 + alpha/size * sum of squares over a window of neighbouring channels)^beta.
+// The window sum is kept as a running accumulator: the plane entering the window
+// is added and the plane leaving it is subtracted.
+template<typename XMat>
+void LRNLayerImpl::channelNoramlization_(Blob &srcBlob, Blob &dstBlob)
+{
+    int num = srcBlob.num();
+    int channels = srcBlob.channels();
+    int ksize = (size - 1) / 2;
+    XMat srcMat = srcBlob.getRefConst<XMat>();
+    XMat dstMat = dstBlob.getRef<XMat>();
+    // Scratch plane for the squared outgoing channel; declared once so its
+    // storage can be reused across iterations.
+    XMat leftSq;
+    for (int n = 0; n < num; n++)
    {
-        CV_DbgAssert(srcBlob.ptr() != dstBlob.ptr());
+        XMat accum = getPlane(dstMat, n, channels-1); //trick for memory saving
+        accum.setTo(0);
-        int num = srcBlob.num();
+        for (int cn = 0; cn < std::min(ksize, channels); cn++)
-        int channels = srcBlob.channels();
+            cv::accumulateSquare(getPlane(srcMat, n, cn), accum);
-        int ksize = (size - 1) / 2;
-        for (int n = 0; n < num; n++)
+        for (int cn = 0; cn < channels; cn++)
        {
-            Mat accum = dstBlob.getPlane(n, channels-1); //trick for memory saving
+            if (cn + ksize < channels)
-            accum.setTo(0);
+            {
+                cv::accumulateSquare(getPlane(srcMat, n, cn + ksize), accum);
-            for (int cn = 0; cn < std::min(ksize, channels); cn++)
+            }
-                cv::accumulateSquare(srcBlob.getPlane(n, cn), accum);
-            for (int cn = 0; cn < channels; cn++)
+            if (cn - ksize - 1 >= 0)
            {
-                if (cn + ksize < channels)
+                //subtractSquare
-                {
+                XMat left = getPlane(srcMat, n, cn - ksize - 1);
-                    cv::accumulateSquare(srcBlob.getPlane(n, cn + ksize), accum);
+                // `left` is a view into the source blob: square into the scratch
+                // plane so the layer's input is not overwritten.
+                cv::pow(left, 2, leftSq);
-                }
+                cv::subtract(accum, leftSq, accum);
-                if (cn - ksize - 1 >= 0)
-                {
-                    Mat left = srcBlob.getPlane(n, cn - ksize - 1);
-                    cv::subtract(accum, left.mul(left), accum); //subtractSquare
-                }
-                Mat dst = dstBlob.getPlane(n, cn);
-                accum.convertTo(dst, dst.type(), alpha/size, 1);
-                cv::pow(dst, beta, dst);
-                cv::divide(srcBlob.getPlane(n, cn), dst, dst);
            }
+            XMat dst = getPlane(dstMat, n, cn);
+            accum.convertTo(dst, dst.type(), alpha/size, 1);
+            cv::pow(dst, beta, dst);
+            cv::divide(getPlane(srcMat, n, cn), dst, dst);
        }
    }
+}
-    void LRNLayer::spatialNormalization(Blob &srcBlob, Blob &dstBlob)
+bool LRNLayerImpl::channelNoramlization_ocl(const UMat &src, UMat &dst)
-    {
+{
-        int num = srcBlob.num();
+#ifdef HAVE_OPENCL
-        int channels = srcBlob.channels();
+    if (src.offset != 0 || dst.offset != 0) //TODO: add offset
+        return false;
+    String buildOpts = String("-DT=") + ocl::typeToStr(src.type());
+    ocl::Kernel kerScale("LRNFillScale", ocl::dnn::lrn_oclsrc, buildOpts);
+    if (kerScale.empty())
+        return false;
+    ocl::Kernel kerOutput("LRNComputeOutput", ocl::dnn::lrn_oclsrc, buildOpts);
+    if (kerOutput.empty())
+        return false;
+    Shape shape = Shape::like(src);
+    int ksize = (size - 1) / 2;
+    size_t wgSize = ocl::Device::getDefault().maxWorkGroupSize();
+    UMat &scaleBuf = buf.umatRef();
+    size_t nthreads = (size_t)(shape.total() / shape[1]);
+    kerScale.args((int)nthreads,
+                  ocl::KernelArg::PtrReadOnly(src), shape[0], shape[1], shape[2], shape[3],
+                  size, (float)(alpha/size), (float)ksize, ocl::KernelArg::PtrWriteOnly(scaleBuf));
+    if (!kerScale.run(1, &nthreads, &wgSize, true))
+        return false;
+    nthreads = (size_t)shape.total();
+    kerOutput.args((int)nthreads,
+                   ocl::KernelArg::PtrReadOnly(src), ocl::KernelArg::PtrReadOnly(scaleBuf),
+                   -beta, ocl::KernelArg::PtrWriteOnly(dst) );
+    if (!kerOutput.run(1, &nthreads, &wgSize, true))
+        return false;
+    return true;
+#else
+    (void)src;
+    (void)dst;
+    return false;
+#endif
+}
+void LRNLayerImpl::spatialNormalization(Blob &src, Blob &dst)
+{
+    // Pick the UMat or Mat instantiation according to the flag set in allocate().
+    if (useOpenCL)
+        spatialNormalization_<UMat>(src, dst);
+    else
+        spatialNormalization_<Mat>(src, dst);
+}
+//TODO: fix cv::boxFilter with BORDER_ISOLATED flag in CPU mode
+template<>
+void LRNLayerImpl::sqrBoxFilter_<Mat>(const Mat &src, Mat &dst)
+{
+    // CPU variant: the plane is first copied into the layer's scratch buffer and
+    // the filter runs on that copy with a plain constant border.
+    // NOTE(review): presumably the copy stands in for BORDER_ISOLATED, which the
+    // UMat variant passes directly -- confirm.
+    Mat scratch = buf.getRef<Mat>();
+    src.copyTo(scratch);
+    const Size window(size, size);
+    const Point anchor(-1, -1);
+    cv::sqrBoxFilter(scratch, dst, dst.depth(), window, anchor, false, BORDER_CONSTANT);
+}
+template<>
+void LRNLayerImpl::sqrBoxFilter_<UMat>(const UMat &src, UMat &dst)
+{
+    // UMat variant: un-normalized squared box filter over a size x size window,
+    // run directly on the source plane with an isolated constant border.
+    const Size window(size, size);
+    const Point anchor(-1, -1);
+    const int border = BORDER_CONSTANT | BORDER_ISOLATED;
+    cv::sqrBoxFilter(src, dst, dst.depth(), window, anchor, false, border);
+}
-        for (int n = 0; n < num; n++)
+template<typename XMat>
+void LRNLayerImpl::spatialNormalization_(Blob &srcBlob, Blob &dstBlob)
+{
+    int num = srcBlob.num();
+    int channels = srcBlob.channels();
+    XMat srcMat = srcBlob.getRefConst<XMat>();
+    XMat dstMat = dstBlob.getRef<XMat>();
+    for (int n = 0; n < num; n++)
+    {
+        for (int cn = 0; cn < channels; cn++)
        {
-            for (int cn = 0; cn < channels; cn++)
+            XMat src = getPlane(srcMat, n, cn);
-            {
+            XMat dst = getPlane(dstMat, n, cn);
-                Mat src = srcBlob.getPlane(n, cn);
-                Mat dst = dstBlob.getPlane(n, cn);
+            sqrBoxFilter_(src, dst);
-                uchar *dataDst0 = dst.data;
+            dst.convertTo(dst, dst.type(), alpha/(size*size), 1);
-                cv::pow(srcBlob.getPlane(n, cn), 2, dst);
+            cv::pow(dst, beta, dst);
-                //TODO: check border type
+            cv::divide(src, dst, dst);
-                cv::boxFilter(dst, dst, dst.depth(), cv::Size(size, size), cv::Point(-1, -1), false, cv::BORDER_CONSTANT);
-                dst.convertTo(dst, dst.type(), alpha/(size*size), 1);
-                cv::pow(dst, beta, dst);
-                cv::divide(src, dst, dst);
-                CV_Assert(dataDst0 == dst.data); //debug
-            }
        }
    }
+}
+Ptr<LRNLayer> LRNLayer::create(int type, int size, double alpha, double beta)
+{
+    // Factory: hides the concrete LRNLayerImpl behind the public interface.
+    Ptr<LRNLayer> layer(new LRNLayerImpl(type, size, alpha, beta));
+    return layer;
+}
 }
 }
--- a/modules/dnn/src/layers/lrn_layer.hpp
+++ b/modules/dnn/src/layers/lrn_layer.hpp
@@ -42,34 +42,36 @@
 #ifndef __OPENCV_DNN_LAYERS_LRN_LAYER_HPP__
 #define __OPENCV_DNN_LAYERS_LRN_LAYER_HPP__
 #include "../precomp.hpp"
+#include <opencv2/dnn/all_layers.hpp>
 namespace cv
 {
 namespace dnn
 {
-    class LRNLayer : public Layer
-    {
-        enum
-        {
-            CHANNEL_NRM,
-            SPATIAL_NRM,
-            SPATIAL_CONTRAST_NRM //cuda-convnet feature
-        } type;
-        int size;
+class LRNLayerImpl : public LRNLayer // concrete class instantiated by LRNLayer::create()
-        double alpha, beta;
+{
+    bool useOpenCL;
+    Blob buf; // scratch blob; the Mat sqrBoxFilter_ path copies its input here before filtering
+    void channelNoramlization(Blob &src, Blob &dst); // NOTE(review): "Noramlization" looks like a typo of "Normalization"; renaming must also touch definitions outside this view
+    template<typename XMat>
+    void channelNoramlization_(Blob &src, Blob &dst);
+    bool channelNoramlization_ocl(const UMat &src, UMat &dst);
-        Blob bufBlob;
+    void spatialNormalization(Blob &src, Blob &dst);
+    template<typename XMat>
+    void spatialNormalization_(Blob &src, Blob &dst); // per-plane normalization over a size x size window
+    template<typename XMat>
+    void sqrBoxFilter_(const XMat &src, XMat &dst); // un-normalized windowed sum of squares; specialized for Mat and UMat
-        void channelNoramlization(Blob &src, Blob &dst);
+public:
-        void spatialNormalization(Blob &src, Blob &dst);
-    public:
+    LRNLayerImpl(int type = CHANNEL_NRM, int size = 5, double alpha = 1, double beta = 0.75);
+    void allocate(const std::vector<Blob*> &inputs, std::vector<Blob> &outputs);
+    void forward(std::vector<Blob*> &inputs, std::vector<Blob> &outputs);
+};
-        LRNLayer(LayerParams &params);
-        void allocate(const std::vector<Blob*> &inputs, std::vector<Blob> &outputs);
-        void forward(std::vector<Blob*> &inputs, std::vector<Blob> &outputs);
-    };
 }
 }
 #endif
--- a/modules/dnn/src/layers/mvn_layer.cpp
+++ b/modules/dnn/src/layers/mvn_layer.cpp
@@ -42,20 +42,21 @@
 #include "../precomp.hpp"
 #include "layers_common.hpp"
 #include "mvn_layer.hpp"
+#include <opencv2/dnn/shape_utils.hpp>
 namespace cv
 {
 namespace dnn
 {
-MVNLayer::MVNLayer(LayerParams &params) : Layer(params)
+MVNLayerImpl::MVNLayerImpl(bool normVariance_, bool acrossChannels_, double eps_) // stores the three settings that forward() reads
 {
-    eps = params.get<double>("eps", 1e-9);
+    normVariance = normVariance_; // true: forward() also scales each row by 1/(eps + std-dev)
-    acrossChannels = params.get<bool>("across_channels", false);
+    acrossChannels = acrossChannels_; // true: forward() splits the blob at dim 1, otherwise at dim 2
-    normalizeVariance = params.get<bool>("normalize_variance", true);
+    eps = eps_; // added to the std-dev in forward() before taking the reciprocal
 }
-void MVNLayer::allocate(const std::vector<Blob *> &inputs, std::vector<Blob> &outputs)
+void MVNLayerImpl::allocate(const std::vector<Blob *> &inputs, std::vector<Blob> &outputs) // sizes outputs to one blob per input; the per-blob loop body is elided by the hunk boundary below
 {
    outputs.resize(inputs.size());
    for (size_t i = 0; i < inputs.size(); i++)
@@ -65,20 +66,17 @@ void MVNLayer::allocate(const std::vector<Blob *> &inputs, std::vector<Blob> &ou
    }
 }
-void MVNLayer::forward(std::vector<Blob *> &inputs, std::vector<Blob> &outputs)
+void MVNLayerImpl::forward(std::vector<Blob *> &inputs, std::vector<Blob> &outputs) // per input blob: subtracts each row's mean and, if normVariance, scales the row by 1/(eps + std-dev)
 {
    for (size_t inpIdx = 0; inpIdx < inputs.size(); inpIdx++)
    {
        Blob &inpBlob = *inputs[inpIdx];
        Blob &outBlob = outputs[inpIdx];
-        int workSize[2];
        int splitDim = (acrossChannels) ? 1 : 2;
-        workSize[0] = (int)inpBlob.total(0, splitDim);
+        Shape workSize((int)inpBlob.total(0, splitDim), (int)inpBlob.total(splitDim)); // 2-D work shape; presumably rows = elements spanned by dims before splitDim, cols = the rest - confirm Blob::total semantics
-        workSize[1] = (int)inpBlob.total(splitDim);
+        Mat inpMat = reshaped(inpBlob.matRefConst(), workSize); // input viewed with the 2-D work shape
+        Mat outMat = reshaped(outBlob.matRef(), workSize); // output viewed with the same shape
-        Mat inpMat = inpBlob.matRef().reshape(1, 2, workSize);
-        Mat outMat = outBlob.matRef().reshape(1, 2, workSize);
        Scalar mean, dev;
        for (int i = 0; i < workSize[0]; i++)
@@ -86,12 +84,18 @@ void MVNLayer::forward(std::vector<Blob *> &inputs, std::vector<Blob> &outputs)
            Mat inpRow = inpMat.row(i);
            Mat outRow = outMat.row(i);
-            cv::meanStdDev(inpRow, mean, (normalizeVariance) ? dev : noArray());
+            cv::meanStdDev(inpRow, mean, (normVariance) ? dev : noArray()); // std-dev is requested only when it will be used
-            double alpha = (normalizeVariance) ? 1/(eps + dev[0]) : 1;
+            double alpha = (normVariance) ? 1/(eps + dev[0]) : 1; // scale factor; 1 leaves the variance untouched
            inpRow.convertTo(outRow, outRow.type(), alpha, -mean[0] * alpha);
        }
    }
 }
+// Factory for the MVN layer: constructs an MVNLayerImpl from the given settings
+// and returns it through a pointer to the public MVNLayer interface.
+Ptr<MVNLayer> MVNLayer::create(bool normVariance, bool acrossChannels, double eps)
+{
+    Ptr<MVNLayer> layer(new MVNLayerImpl(normVariance, acrossChannels, eps));
+    return layer;
+}
 }
 }
--- a/modules/dnn/src/layers/mvn_layer.hpp
+++ b/modules/dnn/src/layers/mvn_layer.hpp
@@ -42,20 +42,18 @@
 #ifndef __OPENCV_DNN_LAYERS_MVN_LAYER_HPP__
 #define __OPENCV_DNN_LAYERS_MVN_LAYER_HPP__
 #include "../precomp.hpp"
+#include <opencv2/dnn/all_layers.hpp>
 namespace cv
 {
 namespace dnn
 {
-class MVNLayer : public Layer
+class MVNLayerImpl : public MVNLayer // concrete class instantiated by MVNLayer::create()
 {
-    double eps;
-    bool acrossChannels, normalizeVariance;
 public:
-    MVNLayer(LayerParams &params);
+    MVNLayerImpl(bool normVariance_ = true, bool acrossChannels_ = false, double eps_ = 1e-9); // defaults equal the fallbacks of the removed LayerParams constructor (true / false / 1e-9)
    void allocate(const std::vector<Blob*> &inputs, std::vector<Blob> &outputs);
    void forward(std::vector<Blob*> &inputs, std::vector<Blob> &outputs);
 };

--- a/modules/dnn/src/layers/op_blas.cpp
+++ b/modules/dnn/src/layers/op_blas.cpp
+#include "op_blas.hpp"
+#if HAVE_CBLAS
+#include "opencv_cblas.hpp"
+#endif
+#include <iostream>
+namespace cv
+{
+namespace dnn
+{
+// Generalized matrix product, C = alpha * A * B + beta * C, with the operands
+// optionally transposed according to flags. The backend is chosen from the
+// storage of C: a Mat destination goes through gemmCPU(), anything else
+// through cv::gemm().
+void gemm(InputArray A, InputArray B, double alpha, InputOutputArray C, double beta, int flags)
+{
+    if (!C.isMat())
+    {
+        // With beta == 0 the previous contents of C are not passed as an input at all.
+        if (beta == 0)
+            cv::gemm(A, B, alpha, noArray(), beta, C, flags);
+        else
+            cv::gemm(A, B, alpha, C, beta, C, flags);
+        return;
+    }
+
+    gemmCPU(A.getMat(), B.getMat(), alpha, C.getMatRef(), beta, flags);
+}
+// Reports the shape of a 2-D matrix as GEMM will see it: when isTrans is set the
+// operand is used transposed, so its row and column counts are exchanged.
+//   A        - matrix to inspect; must be 2-dimensional
+//   rows     - [out] row count of the (possibly transposed) operand
+//   cols     - [out] column count of the (possibly transposed) operand
+//   isTrans  - true when the operand is flagged as transposed
+inline void SwapRowCols(const Mat &A, int &rows, int &cols, bool isTrans)
+{
+    // Checked in release builds as well: an n-dimensional Mat reports
+    // rows == cols == -1, and such values must never reach a BLAS call
+    // as matrix dimensions.
+    CV_Assert(A.dims == 2);
+    rows = (isTrans) ? A.cols : A.rows;
+    cols = (isTrans) ? A.rows : A.cols;
+}
+// CPU matrix product: C = alpha * op(A) * op(B) + beta * C.
+// When built with HAVE_CBLAS the product is computed in row-major order by
+// cblas_sgemm (CV_32F) or cblas_dgemm (CV_64F); any other type raises
+// Error::BadDepth. Without HAVE_CBLAS the call is forwarded to cv::gemm().
+//   A, B   - input matrices
+//   alpha  - scale applied to the product
+//   C      - accumulator, updated in place
+//   beta   - scale applied to the previous contents of C
+//   flags  - GEMM_1_T / GEMM_2_T request transposed A / B; GEMM_3_T is
+//            rejected on the BLAS path
+// The BLAS path additionally asserts matching shapes and types, continuous
+// storage, and that C does not start at the same address as A or B.
+void gemmCPU(const Mat &A, const Mat &B, double alpha, Mat &C, double beta, int flags /*= 0*/)
+{
+    #if !HAVE_CBLAS
+    cv::gemm(A, B, alpha, C, beta, C, flags);
+    #else
+    const bool trA = (flags & GEMM_1_T) != 0;
+    const bool trB = (flags & GEMM_2_T) != 0;
+    const bool trC = (flags & GEMM_3_T) != 0;
+
+    // Shapes of the three operands after the requested transpositions.
+    int Arows, Acols, Brows, Bcols, Crows, Ccols;
+    SwapRowCols(A, Arows, Acols, trA);
+    SwapRowCols(B, Brows, Bcols, trB);
+    SwapRowCols(C, Crows, Ccols, trC);
+
+    CV_Assert(!(flags & GEMM_3_T));
+    CV_Assert(Acols == Brows && Arows == Crows && Bcols == Ccols);
+    CV_Assert(A.isContinuous() && B.isContinuous() && C.isContinuous());
+    CV_Assert(A.type() == B.type() && B.type() == C.type());
+    CV_Assert(A.data != C.data && B.data != C.data);
+
+    // Leading dimensions are the stored (untransposed) row lengths.
+    const int lda = A.cols, ldb = B.cols, ldc = C.cols;
+
+    switch (C.type())
+    {
+    case CV_32F:
+        cblas_sgemm(CblasRowMajor, trA ? CblasTrans : CblasNoTrans, trB ? CblasTrans : CblasNoTrans,
+                    Arows, Bcols, Acols,
+                    (float)alpha, A.ptr<float>(), lda,
+                    B.ptr<float>(), ldb,
+                    (float)beta, C.ptr<float>(), ldc);
+        break;
+    case CV_64F:
+        //TODO: Should be tested
+        cblas_dgemm(CblasRowMajor, trA ? CblasTrans : CblasNoTrans, trB ? CblasTrans : CblasNoTrans,
+                    Arows, Bcols, Acols,
+                    alpha, A.ptr<double>(), lda,
+                    B.ptr<double>(), ldb,
+                    beta, C.ptr<double>(), ldc);
+        break;
+    default:
+        CV_Error(Error::BadDepth, "Only floating point types are supported");
+    }
+    #endif
+}
+// Returns openblas_get_num_threads() when OPENBLAS_VERSION is defined,
+// and 1 otherwise.
+int getBlasThreads()
+{
+    #ifndef OPENBLAS_VERSION
+    return 1;
+    #else
+    return openblas_get_num_threads();
+    #endif
+}
+// Forwards the requested thread count to OpenBLAS when OPENBLAS_VERSION is
+// defined; otherwise the call does nothing.
+//   numThreads - thread count handed to the OpenBLAS setters
+void setBlasThreads(int numThreads)
+{
+    #ifndef OPENBLAS_VERSION
+    (void)numThreads;   // parameter is unused in this configuration; silences the compiler warning
+    #else
+    openblas_set_num_threads(numThreads);
+    goto_set_num_threads(numThreads);
+    #endif
+}
+}
+}
--- a/modules/dnn/src/layers/im2col.cpp
+++ b/modules/dnn/src/layers/im2col.cpp
@@ -39,47 +39,21 @@
 //
 //M*/
+#ifndef __OPENCV_DNN_LAYERS_OP_BLAS_HPP__
+#define __OPENCV_DNN_LAYERS_OP_BLAS_HPP__
 #include "../precomp.hpp"
-#include <opencv2/core/ocl.hpp>
-#include "im2col.hpp"
-#include "opencl_kernels_dnn.hpp"
 namespace cv
 {
 namespace dnn
 {
+    int getBlasThreads(); // thread count reported by OpenBLAS, or 1 when not built against OpenBLAS
-#ifdef HAVE_OPENCL
+    void setBlasThreads(int numThreads); // forwards the thread count to OpenBLAS; no-op when not built against OpenBLAS
-void im2col_ocl(UMat &img,
-                int channels, int height, int width,
-                int kernel_h, int kernel_w,
-                int pad_h, int pad_w,
-                int stride_h, int stride_w,
-                UMat &col)
-{
-    int h_out = (height + 2 * pad_h - kernel_h) / stride_h + 1;
-    int w_out = (width + 2 * pad_w - kernel_w) / stride_w + 1;
-    CV_Assert(img.isContinuous() && col.isContinuous());
-    CV_Assert(img.total() == (size_t)channels * height * width);
-    CV_Assert(col.total() == (size_t)channels * kernel_h * kernel_w * h_out * w_out);
-    ocl::Kernel im2col_ker("im2col", ocl::dnn::im2col_oclsrc);
-    CV_Assert(!im2col_ker.empty());
-    im2col_ker.args(ocl::KernelArg::PtrReadOnly(img), (int)img.offset,
+    void gemm(InputArray A, InputArray B, double alpha, InputOutputArray C, double beta, int flags = 0); // C = alpha*A*B + beta*C; uses gemmCPU() when C is a Mat, cv::gemm() otherwise
-             channels, height, width,
-             kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w,
-             h_out, w_out,
-             ocl::KernelArg::PtrWriteOnly(col), (int)col.offset
-        );
-    size_t localSize = ocl::Device::getDefault().maxWorkGroupSize();
-    size_t globalSize = (size_t)channels * h_out * w_out;
-    CV_Assert(im2col_ker.run(1, &globalSize, &localSize, true));
-}
-#endif // HAVE_OPENCL
+    void gemmCPU(const Mat &A, const Mat &B, double alpha, Mat &C, double beta, int flags = 0); // Mat-only product; CBLAS when HAVE_CBLAS is set, cv::gemm() otherwise
 }
 }
+#endif
\ No newline at end of file
--- a/modules/dnn/src/layers/im2col.hpp
+++ b/modules/dnn/src/layers/im2col.hpp
@@ -39,88 +39,84 @@
 //
 //M*/
-#ifndef __OPENCV_DNN_LAYERS_IM2COL_HPP__
+#include "../precomp.hpp"
-#define __OPENCV_DNN_LAYERS_IM2COL_HPP__
+#include <opencv2/core/ocl.hpp>
+#include "opencl_kernels_dnn.hpp"
+#include "op_im2col.hpp"
 namespace cv
 {
 namespace dnn
 {
-template <typename Dtype>
+#ifdef HAVE_OPENCL
-void im2col_cpu(const Dtype* data_im,
-                int channels, int height, int width,
+bool im2col_ocl(const UMat &img, // runs the "im2col" OpenCL kernel from img into col; returns false if the kernel is empty or the run fails
-                int kernel_h, int kernel_w,
+                 int channels, int height, int width,
-                int pad_h, int pad_w,
+                 int kernel_h, int kernel_w,
-                int stride_h, int stride_w,
+                 int pad_h, int pad_w,
-                Dtype* data_col)
+                 int stride_h, int stride_w,
+                 UMat &col)
 {
    int height_col = (height + 2 * pad_h - kernel_h) / stride_h + 1;
    int width_col = (width + 2 * pad_w - kernel_w) / stride_w + 1;
    int channels_col = channels * kernel_h * kernel_w;
-    for (int c = 0; c < channels_col; ++c) {
+    int esz = img.elemSize(); // element size of img; used below to scale both UMat offsets
-        int w_offset = c % kernel_w;
-        int h_offset = (c / kernel_w) % kernel_h;
+    CV_Assert(img.isContinuous() && col.isContinuous());
-        int c_im = c / kernel_h / kernel_w;
+    CV_Assert(img.total() == (size_t)channels * height * width);
-        for (int h = 0; h < height_col; ++h) {
+    CV_Assert(col.total() == (size_t)channels_col * height_col * width_col);
-            for (int w = 0; w < width_col; ++w) {
-                int h_pad = h * stride_h - pad_h + h_offset;
+    ocl::Kernel ker("im2col", ocl::dnn::im2col_oclsrc, String("-DT=") + ocl::typeToStr(img.type())); // element type reaches the kernel source as the T macro
-                int w_pad = w * stride_w - pad_w + w_offset;
+    if (ker.empty())
-                if (h_pad >= 0 && h_pad < height && w_pad >= 0 && w_pad < width)
+        return false; // no usable kernel; reported to the caller instead of asserting
-                    data_col[(c * height_col + h) * width_col + w] =
-                    data_im[(c_im * height + h_pad) * width + w_pad];
+    ker.args(ocl::KernelArg::PtrReadOnly(img), (int)img.offset/esz,
-                else
+             channels, height, width,
-                    data_col[(c * height_col + h) * width_col + w] = 0;
+             kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w,
-            }
+             height_col, width_col,
-        }
+             ocl::KernelArg::PtrWriteOnly(col), (int)col.offset/esz
-    }
+             );
+    size_t localSize = ocl::Device::getDefault().maxWorkGroupSize(); // NOTE(review): device-wide maximum; confirm the kernel's own work-group limit is not smaller
+    size_t globalSize = (size_t)channels * height_col * width_col; // work-item count = channels * height_col * width_col
+    return ker.run(1, &globalSize, &localSize, true); // last argument requests a synchronous run
 }
-template <typename Dtype>
+bool col2im_ocl(const UMat &col, // runs the "col2im" OpenCL kernel from col into img; returns false if the kernel is empty or the run fails
-void col2im_cpu(const Dtype* data_col,
                int channels, int height, int width,
-                int patch_h, int patch_w,
+                int kernel_h, int kernel_w,
                int pad_h, int pad_w,
                int stride_h, int stride_w,
-                Dtype* data_im)
+                UMat &img)
 {
-    memset(data_im, 0, height * width * channels * sizeof(Dtype));
+    int height_col = (height + 2 * pad_h - kernel_h) / stride_h + 1;
+    int width_col = (width + 2 * pad_w - kernel_w) / stride_w + 1;
+    int channels_col = channels * kernel_h * kernel_w;
+    int esz = img.elemSize(); // NOTE(review): img's element size also scales col.offset below; assumes col and img share an element type - confirm
-    int height_col = (height + 2 * pad_h - patch_h) / stride_h + 1;
+    CV_Assert(img.isContinuous() && col.isContinuous());
-    int width_col = (width + 2 * pad_w - patch_w) / stride_w + 1;
+    CV_Assert(img.total() == (size_t)channels * height * width);
-    int channels_col = channels * patch_h * patch_w;
+    CV_Assert(col.total() == (size_t)channels_col * height_col * width_col);
-    for (int c = 0; c < channels_col; ++c)
+    ocl::Kernel ker("col2im", ocl::dnn::col2im_oclsrc, String("-DT=") + ocl::typeToStr(col.type())); // element type reaches the kernel source as the T macro
-    {
+    if (ker.empty())
-        int w_offset = c % patch_w;
+        return false; // no usable kernel; reported to the caller instead of asserting
-        int h_offset = (c / patch_w) % patch_h;
-        int c_im = c / patch_h / patch_w;
-        for (int h = 0; h < height_col; ++h)
+    ker.args((int)img.total(),
-        {
+             ocl::KernelArg::PtrReadOnly(col), (int)col.offset/esz,
-            for (int w = 0; w < width_col; ++w)
+             height, width, channels, // argument order differs from im2col_ocl, which passes channels first
-            {
+             kernel_h, kernel_w,
-                int h_pad = h * stride_h - pad_h + h_offset;
+             pad_h, pad_w,
-                int w_pad = w * stride_w - pad_w + w_offset;
+             stride_h, stride_w,
+             height_col, width_col,
+             ocl::KernelArg::PtrWriteOnly(img), (int)img.offset/esz);
-                if (h_pad >= 0 && h_pad < height && w_pad >= 0 && w_pad < width)
+    size_t localSize = ocl::Device::getDefault().maxWorkGroupSize(); // NOTE(review): device-wide maximum; confirm the kernel's own work-group limit is not smaller
-                    data_im[(c_im * height + h_pad) * width + w_pad] +=
+    size_t globalSize = img.total(); // work-item count = number of image elements
-                    data_col[(c * height_col + h) * width_col + w];
+    return ker.run(1, &globalSize, &localSize, true); // last argument requests a synchronous run
-            }
-        }
-    }
 }
-#ifdef HAVE_OPENCL
-void im2col_ocl(UMat &img,
-                int channels, int height, int width,
-                int kernel_h, int kernel_w,
-                int pad_h, int pad_w,
-                int stride_h, int stride_w,
-                UMat &col);
 #endif
 }
 }
-#endif
--- a/modules/dnn/src/layers/op_im2col.hpp
+++ b/modules/dnn/src/layers/op_im2col.hpp
--- a/modules/dnn/src/layers/pooling_layer.cpp
+++ b/modules/dnn/src/layers/pooling_layer.cpp
--- a/modules/dnn/src/layers/pooling_layer.hpp
+++ b/modules/dnn/src/layers/pooling_layer.hpp
 /*M///////////////////////////////////////////////////////////////////////////////////////
 //
 //  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
 //
@@ -42,37 +42,39 @@
 #ifndef __OPENCV_DNN_LAYERS_POOLING_LAYER_HPP__
 #define __OPENCV_DNN_LAYERS_POOLING_LAYER_HPP__
 #include "../precomp.hpp"
+#include <opencv2/dnn/all_layers.hpp>
 namespace cv
 {
 namespace dnn
 {
-    class PoolingLayer : public Layer
-    {
-        enum
-        {
-            MAX,
-            AVE,
-            STOCHASTIC
-        };
-        int type;
+class PoolingLayerImpl : public PoolingLayer // NOTE(review): presumably the concrete class behind a PoolingLayer factory; the .cpp is not visible here
-        int padH, padW;
+{
-        int strideH, strideW;
+    bool useOpenCL;
-        int kernelH, kernelW;
+    Size inp, out; // replace the removed inpH/inpW and outH/outW int pairs
+    void computeOutputShape(Size inpSz); // now takes the input size as one Size instead of two ints
+    bool pooling_ocl(const char *kname, const Blob &src, Blob &dst, Blob *mask = NULL); // NOTE(review): kname presumably selects the OpenCL kernel by name - confirm in pooling_layer.cpp
+    void maxPooling(Blob &src, Blob &dst);
+    void maxPooling_cpu(Blob &src, Blob &dst);
+    bool maxPooling_ocl(Blob &src, Blob &dst);
+    void avePooling(Blob &src, Blob &dst);
+    void avePooling_cpu(Blob &src, Blob &dst);
+    bool avePooling_ocl(Blob &src, Blob &dst);
+public:
-        int inpH, inpW;
+    PoolingLayerImpl();
-        int outH, outW;
+    PoolingLayerImpl(int type, Size kernel, Size stride, Size pad); // explicit settings replace the removed LayerParams constructor
-        void computeOutputShape(int inpH, int inpW);
+    void allocate(const std::vector<Blob*> &inputs, std::vector<Blob> &outputs);
-        void maxPooling(Blob &input, Blob &output);
+    void forward(std::vector<Blob*> &inputs, std::vector<Blob> &outputs);
-        void avePooling(Blob &input, Blob &output);
+};
-    public:
-        PoolingLayer(LayerParams &params);
-        void allocate(const std::vector<Blob*> &inputs, std::vector<Blob> &outputs);
-        void forward(std::vector<Blob*> &inputs, std::vector<Blob> &outputs);
-    };
 }
 }
 #endif
--- a/modules/dnn/src/layers/recurrent_layers.cpp
+++ b/modules/dnn/src/layers/recurrent_layers.cpp
--- a/modules/dnn/src/layers/recurrent_layers.hpp
+++ b/modules/dnn/src/layers/recurrent_layers.hpp
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+#ifndef __OPENCV_DNN_LAYERS_RECURRENT_LAYERS_HPP__
+#define __OPENCV_DNN_LAYERS_RECURRENT_LAYERS_HPP__
+#include "../precomp.hpp"
+#include <opencv2/dnn/all_layers.hpp>
+namespace cv
+{
+namespace dnn
+{
+}
+}
+#endif
\ No newline at end of file
--- a/modules/dnn/src/layers/reshape_layer.cpp
+++ b/modules/dnn/src/layers/reshape_layer.cpp
--- a/modules/dnn/src/layers/reshape_layer.hpp
+++ b/modules/dnn/src/layers/reshape_layer.hpp
--- a/modules/dnn/src/layers/slice_layer.cpp
+++ b/modules/dnn/src/layers/slice_layer.cpp
--- a/modules/dnn/src/layers/slice_layer.hpp
+++ b/modules/dnn/src/layers/slice_layer.hpp
--- a/modules/dnn/src/layers/softmax_layer.cpp
+++ b/modules/dnn/src/layers/softmax_layer.cpp
--- a/modules/dnn/src/layers/softmax_layer.hpp
+++ b/modules/dnn/src/layers/softmax_layer.hpp
--- a/modules/dnn/src/layers/split_layer.cpp
+++ b/modules/dnn/src/layers/split_layer.cpp
--- a/modules/dnn/src/layers/split_layer.hpp
+++ b/modules/dnn/src/layers/split_layer.hpp
--- a/modules/dnn/src/opencl/activations.cl
+++ b/modules/dnn/src/opencl/activations.cl
--- a/modules/dnn/src/opencl/col2im.cl
+++ b/modules/dnn/src/opencl/col2im.cl
--- a/modules/dnn/src/opencl/im2col.cl
+++ b/modules/dnn/src/opencl/im2col.cl
--- a/modules/dnn/src/opencl/lrn.cl
+++ b/modules/dnn/src/opencl/lrn.cl
--- a/modules/dnn/src/opencl/pooling.cl
+++ b/modules/dnn/src/opencl/pooling.cl
--- a/modules/dnn/src/opencl/softmax.cl
+++ b/modules/dnn/src/opencl/softmax.cl
--- a/modules/dnn/src/precomp.hpp
+++ b/modules/dnn/src/precomp.hpp
@@ -40,4 +40,5 @@
 //M*/
 #include <opencv2/core.hpp>
+#include "cvconfig.h"
 #include <opencv2/dnn.hpp>
--- a/modules/dnn/src/torch/torch_importer.cpp
+++ b/modules/dnn/src/torch/torch_importer.cpp
--- a/modules/dnn/test/test_googlenet.cpp
+++ b/modules/dnn/test/test_googlenet.cpp
--- a/modules/dnn/test/test_layers.cpp
+++ b/modules/dnn/test/test_layers.cpp
--- a/modules/dnn/test/test_main.cpp
+++ b/modules/dnn/test/test_main.cpp
--- a/modules/dnn/testdata/dnn/.gitignore
+++ b/modules/dnn/testdata/dnn/.gitignore
+*.caffemodel