opencv_contrib
Commit b1c87936 authored Jun 09, 2016 by Vitaliy Lyudvichenko
Add BLAS and MKL support to the dnn module
parent cd5993c6
Showing 4 changed files with 228 additions and 6 deletions (+228 -6)
modules/dnn/CMakeLists.txt                      +42   -0
modules/dnn/cmake/OpenCVFindMKL.cmake           +106  -0
modules/dnn/src/layers/convolution_layer.cpp    +76   -6
modules/dnn/src/layers/convolution_layer.hpp    +4    -0
modules/dnn/CMakeLists.txt
@@ -24,6 +24,48 @@ ocv_add_samples()
 ocv_add_accuracy_tests()
 ocv_add_perf_tests()
 
+# ----------------------------------------------------------------------------
+#  Find BLAS library
+# ----------------------------------------------------------------------------
+OCV_OPTION(${the_module}_WITH_BLAS "Use external BLAS library to speedup processing" OFF)
+if(${the_module}_WITH_BLAS)
+    set(BLAS_CBLAS_H "cblas.h")
+    include(cmake/OpenCVFindMKL.cmake)
+    if(MKL_FOUND)
+        set(BLAS_INCLUDE_DIR ${MKL_INCLUDE_DIRS})
+        set(BLAS_CBLAS_H "mkl_cblas.h")
+        set(BLAS_LIBRARIES ${MKL_LIBRARIES})
+        set(BLAS_BINARIES "")
+    endif()
+
+    set(BLAS_PREF ${the_module}_BLAS)
+    set(${BLAS_PREF}_INCLUDE_DIR ${BLAS_INCLUDE_DIR} CACHE PATH "Path to BLAS include dir")
+    set(${BLAS_PREF}_CBLAS_H ${BLAS_CBLAS_H} CACHE STRING "Name of cblas.h")
+    set(${BLAS_PREF}_LIBRARIES ${BLAS_LIBRARIES} CACHE FILEPATH "Path to BLAS libraries that will be linked with ${the_module} module")
+    set(${BLAS_PREF}_BINARIES ${BLAS_BINARIES} CACHE FILEPATH "Path to BLAS binaries (.so, .dll) that will be installed with ${the_module} module")
+
+    set(CBLAS_H ${${the_module}_BLAS_INCLUDE_DIR}/${${BLAS_PREF}_CBLAS_H})
+    if(${BLAS_PREF}_INCLUDE_DIR AND NOT EXISTS ${CBLAS_H})
+        message(WARNING "cblas.h at \"${CBLAS_H}\" not found")
+    endif()
+
+    ocv_module_include_directories(${${the_module}_BLAS_INCLUDE_DIR})
+    list(APPEND OPENCV_MODULE_${the_module}_DEPS_EXT ${${the_module}_BLAS_LIBRARIES})
+    target_link_libraries(${the_module} ${${the_module}_BLAS_LIBRARIES})
+    add_definitions(-DHAVE_CBLAS)
+    add_definitions(-DCBLAS_H_INCLUDE=<${${BLAS_PREF}_CBLAS_H}>)
+
+    message(CMAKE_CURRENT_BINARY_DIR=${CMAKE_CURRENT_BINARY_DIR})
+    add_custom_command(TARGET ${the_module} PRE_BUILD
+                       #OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/cblas.h
+                       COMMAND ${CMAKE_COMMAND} ARGS -E echo \"\#include <${${BLAS_PREF}_CBLAS_H}>\" > ${CMAKE_CURRENT_BINARY_DIR}/cblas.h
+                       COMMENT "Adding proxy cblas.h header")
+
+    if(${the_module}_BLAS_BINARIES)
+        ocv_install_target(${the_module} EXPORT ${the_module}_BLAS_BINARIES
+                           RUNTIME DESTINATION ${OPENCV_BIN_INSTALL_PATH} COMPONENT libs)
+    endif()
+endif()
+
 # ----------------------------------------------------------------------------
 #  Download pre-trained models for complex testing on GoogLeNet and AlexNet
 # ----------------------------------------------------------------------------
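The hunk above wires an external BLAS into the build in two ways: it defines HAVE_CBLAS and CBLAS_H_INCLUDE for the compiler, and it generates a proxy cblas.h in the build directory that forwards to the real header (cblas.h or mkl_cblas.h). A minimal sketch of how a source file might consume either mechanism follows; only the proxy-header form is actually used by this commit (see convolution_layer.cpp below), so the CBLAS_H_INCLUDE branch is an illustration, not part of the diff.

// Sketch only, not part of this commit: consuming the flags defined above.
// HAVE_CBLAS and CBLAS_H_INCLUDE come from the add_definitions() calls;
// the proxy header is written to ${CMAKE_CURRENT_BINARY_DIR}/cblas.h.
#if HAVE_CBLAS
#  ifdef CBLAS_H_INCLUDE
#    include CBLAS_H_INCLUDE   // expands to <cblas.h> or <mkl_cblas.h>
#  else
#    include "cblas.h"         // proxy header generated at build time
#  endif
#endif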
modules/dnn/cmake/OpenCVFindMKL.cmake (new file, mode 100644)
#
# The script to detect Intel(R) Math Kernel Library (MKL)
# installation/package
#
# Parameters:
# MKL_WITH_TBB
#
# On return this will define:
#
# HAVE_MKL         - True if Intel MKL found
# MKL_ROOT_DIR     - root of MKL installation
# MKL_INCLUDE_DIRS - MKL include folder
# MKL_LIBRARIES    - MKL libraries that are used by OpenCV
#
if(NOT DEFINED MKL_USE_MULTITHREAD)
    OCV_OPTION(MKL_WITH_TBB "Use MKL with TBB multithreading" OFF)       #ON IF WITH_TBB)
    OCV_OPTION(MKL_WITH_OPENMP "Use MKL with OpenMP multithreading" OFF) #ON IF WITH_OPENMP)
endif()

#check current MKL_ROOT_DIR
if(NOT MKL_ROOT_DIR OR NOT EXISTS ${MKL_ROOT_DIR}/include/mkl.h)
    set(MKLROOT_PATHS ${MKL_ROOT_DIR})
    if(DEFINED $ENV{MKLROOT})
        list(APPEND MKLROOT_PATHS $ENV{MKLROOT})
    endif()
    if(WIN32)
        set(ProgramFilesx86 "ProgramFiles(x86)")
        list(APPEND MKLROOT_PATHS $ENV{${ProgramFilesx86}}/IntelSWTools/compilers_and_libraries/windows/mkl)
    endif()
    if(UNIX)
        list(APPEND MKLROOT_PATHS "opt/intel/mkl")
    endif()
    find_path(MKL_ROOT_DIR include/mkl.h PATHS ${MKLROOT_PATHS})
endif()

set(MKL_INCLUDE_DIRS ${MKL_ROOT_DIR}/include)
set(MKL_INCLUDE_HEADERS ${MKL_INCLUDE_DIRS}/mkl.h ${MKL_INCLUDE_DIRS}/mkl_version.h)

macro(get_mkl_version VERSION_FILE)
    # read MKL version info from file
    file(STRINGS ${VERSION_FILE} STR1 REGEX "__INTEL_MKL__")
    file(STRINGS ${VERSION_FILE} STR2 REGEX "__INTEL_MKL_MINOR__")
    file(STRINGS ${VERSION_FILE} STR3 REGEX "__INTEL_MKL_UPDATE__")
    #file(STRINGS ${VERSION_FILE} STR4 REGEX "INTEL_MKL_VERSION")

    # extract info and assign to variables
    string(REGEX MATCHALL "[0-9]+" MKL_VERSION_MAJOR ${STR1})
    string(REGEX MATCHALL "[0-9]+" MKL_VERSION_MINOR ${STR2})
    string(REGEX MATCHALL "[0-9]+" MKL_VERSION_UPDATE ${STR3})
    set(MKL_VERSION_STR "${MKL_VERSION_MAJOR}.${MKL_VERSION_MINOR}.${MKL_VERSION_UPDATE}" CACHE STRING "MKL version" FORCE)
endmacro()

#determine arch
if(CMAKE_CXX_SIZEOF_DATA_PTR EQUAL 8)
    set(MKL_X64 1)
    set(MKL_ARCH "intel64")

    include(CheckTypeSize)
    CHECK_TYPE_SIZE(int _sizeof_int)
    if(_sizeof_int EQUAL 4)
        set(MKL_LP64 "lp64")
    else()
        set(MKL_LP64 "ilp64")
    endif()
else()
    set(MKL_ARCH "ia32")
endif()

if(MSVC)
    set(MKL_EXT ".lib")
else()
    set(MKL_EXT ".a")
endif()

set(MKL_LIB_DIR ${MKL_ROOT_DIR}/lib/${MKL_ARCH})
set(MKL_LIBRARIES ${MKL_LIB_DIR}/mkl_core${MKL_EXT} ${MKL_LIB_DIR}/mkl_intel_${MKL_LP64}${MKL_EXT})

if(MKL_WITH_TBB)
    list(APPEND MKL_LIBRARIES ${MKL_LIB_DIR}/mkl_tbb_thread${MKL_EXT})
    list(APPEND MKL_LIBRARIES ${MKL_ROOT_DIR}/../tbb/lib/${MKL_ARCH}/tbb${MKL_EXT})
elseif(MKL_WITH_OPENMP)
    message(FATAL_ERROR "Multithreaded MKL is not supported yet")
else()
    list(APPEND MKL_LIBRARIES ${MKL_LIB_DIR}/mkl_sequential${MKL_EXT})
endif()

include(FindPackageHandleStandardArgs)
find_package_handle_standard_args(MKL MKL_INCLUDE_HEADERS MKL_LIBRARIES)

if(MKL_FOUND)
    get_mkl_version(${MKL_INCLUDE_DIRS}/mkl_version.h)
    message(STATUS "Found MKL ${MKL_VERSION_STR} at: ${MKL_ROOT_DIR}")

    set(HAVE_MKL ON CACHE BOOL "True if MKL found")
    set(MKL_ROOT_DIR ${MKL_ROOT_DIR} CACHE PATH "Path to MKL directory")
    set(MKL_INCLUDE_DIRS ${MKL_INCLUDE_DIRS} CACHE PATH "Path to MKL include directory")
    set(MKL_LIBRARIES ${MKL_LIBRARIES} CACHE FILEPATH "MKL libraries")
else()
    set(HAVE_MKL OFF CACHE BOOL "True if MKL found")
    set(MKL_ROOT_DIR ${MKL_ROOT_DIR} CACHE PATH "Path to MKL directory")
    unset(MKL_INCLUDE_DIRS)
    unset(MKL_LIBRARIES)
endif()
\ No newline at end of file
modules/dnn/src/layers/convolution_layer.cpp
@@ -46,6 +46,10 @@
 #include "im2col.hpp"
 #include <iostream>
 
+#if HAVE_CBLAS
+#include "cblas.h"
+#endif
+
 namespace cv
 {
 namespace dnn
@@ -73,6 +77,20 @@ namespace dnn
         //TBD
         useOpenCL = params.has("use_opencl");
+
+        //init BLAS
+        #if HAVE_CBLAS
+        {
+            #ifdef OPENBLAS_VERSION
+            if (openblas_get_num_threads() != cv::getNumThreads())
+            {
+                openblas_set_num_threads(cv::getNumThreads());
+                goto_set_num_threads(cv::getNumThreads());
+            }
+            //std::cout << "OpenBLAS threads " << openblas_get_num_threads() << "/" << openblas_get_num_procs() << "\n";
+            #endif
+        }
+        #endif
     }
 
     void ConvolutionLayer::allocate(const std::vector<Blob*> &inputs, std::vector<Blob> &outputs)
@@ -109,7 +127,7 @@ namespace dnn
     inline bool ConvolutionLayer::is1x1() const
     {
-        return (kerH == 1 && kerW == 1);
+        return (kerH == 1 && kerW == 1) && (strideW == 1 && strideH == 1); //hotfix with stride
     }
 
     void ConvolutionLayer::forward(std::vector<Blob*> &inputs, std::vector<Blob> &outputs)
@@ -130,13 +148,13 @@ namespace dnn
             Mat kerMat(outGroupCn, ksize, wgtBlob.type(), wgtBlob.ptr(g*outGroupCn));
             Mat dstMat(outGroupCn, outH*outW, outBlob.type(), outBlob.ptr(n, g*outGroupCn));
 
-            cv::gemm(kerMat, colMat, 1, noArray(), 0, dstMat);
+            gemmCPU(kerMat, colMat, 1, dstMat, 0);
 
             if (bias)
             {
                 float *biasPtr = blobs[1].ptrf() + g*outGroupCn;
                 Mat biasMat(outGroupCn, 1, CV_32F, biasPtr);
-                cv::gemm(biasMat, biasOnesMat, 1, dstMat, 1, dstMat);
+                gemmCPU(biasMat, biasOnesMat, 1, dstMat, 1); //TODO: gemv
             }
         }
     }
@@ -223,7 +241,7 @@ namespace dnn
             Mat convMat(outGroupCn, outH*outW, convBlob.type(), convBlob.ptr(n, g*outGroupCn));
             Mat wghtMat(outGroupCn, ksize, wghtBlob.type(), wghtBlob.ptr(g*outGroupCn));
 
-            cv::gemm(wghtMat, convMat, 1, noArray(), 0, colMat, GEMM_1_T);
+            gemmCPU(wghtMat, convMat, 1, colMat, 0, GEMM_1_T);
 
             col2im(dstMat);
@@ -231,7 +249,7 @@ namespace dnn
             {
                 float *biasPtr = blobs[1].ptrf() + g*inpGroupCn;
                 Mat biasMat(inpGroupCn, 1, CV_32F, biasPtr);
-                cv::gemm(biasMat, biasOnesMat, 1, dstMat, 1, dstMat);
+                gemmCPU(biasMat, biasOnesMat, 1, dstMat, 1); //TODO: gemv
             }
         }
     }
@@ -247,5 +265,57 @@ namespace dnn
         if (dstMat.type() == CV_64F)
             col2im_cpu((double*)colMat.ptr(), inpGroupCn, inpH, inpW, kerH, kerW, padH, padW, strideH, strideW, (double*)dstMat.ptr());
     }
 
+    void gemm(InputArray A, InputArray B, double alpha, InputOutputArray C, double beta, int flags /*= 0*/)
+    {
+        cv::gemm(A, B, alpha, C, beta, C, flags);
+    }
+
+    inline void SwapRowCols(const Mat &A, int &rows, int &cols, bool transA = false)
+    {
+        rows = (transA) ? A.cols : A.rows;
+        cols = (transA) ? A.rows : A.cols;
+    }
+
+    void gemmCPU(const Mat &A, const Mat &B, double alpha, Mat &C, double beta, int flags /*= 0*/)
+    {
+    #if HAVE_CBLAS
+        bool transA = flags & GEMM_1_T;
+        bool transB = flags & GEMM_2_T;
+        bool transC = flags & GEMM_3_T;
+        int Arows, Acols, Brows, Bcols, Crows, Ccols;
+        SwapRowCols(A, Arows, Acols, transA);
+        SwapRowCols(B, Brows, Bcols, transB);
+        SwapRowCols(C, Crows, Ccols, transC);
+
+        CV_DbgAssert(!(flags & GEMM_3_T));
+        CV_Assert(Acols == Brows && Arows == Crows && Bcols == Ccols);
+        CV_DbgAssert(A.isContinuous() && B.isContinuous() && C.isContinuous());
+        CV_DbgAssert(A.type() == CV_32F || A.type() == CV_64F);
+        CV_DbgAssert(A.type() == B.type() && B.type() == C.type());
+
+        if (C.type() == CV_32F)
+        {
+            cblas_sgemm(CblasRowMajor, transA ? CblasTrans : CblasNoTrans, transB ? CblasTrans : CblasNoTrans,
+                        Arows, Bcols, Acols,
+                        (float)alpha, A.ptr<float>(), A.cols,
+                        B.ptr<float>(), B.cols,
+                        (float)beta, C.ptr<float>(), C.cols);
+        }
+        else if (C.type() == CV_64F)
+        {
+            //TODO: Should be tested
+            cblas_dgemm(CblasRowMajor, transA ? CblasTrans : CblasNoTrans, transB ? CblasTrans : CblasNoTrans,
+                        Arows, Bcols, Acols,
+                        alpha, A.ptr<double>(), A.cols,
+                        B.ptr<double>(), B.cols,
+                        beta, C.ptr<double>(), C.cols);
+        }
+    #else
+        cv::gemm(A, B, alpha, C, beta, C, flags);
+    #endif
+    }
 }
 }
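gemmCPU keeps cv::gemm semantics (C = alpha * op(A) * op(B) + beta * C) but dispatches to row-major cblas_sgemm/cblas_dgemm when HAVE_CBLAS is set. A hypothetical usage sketch follows; it assumes the internal convolution_layer.hpp declarations are visible to the caller and uses continuous CV_32F matrices, as the asserts above require.

// Hypothetical usage sketch, not part of the commit.
#include <opencv2/core.hpp>
#include "convolution_layer.hpp"   // declares cv::dnn::gemmCPU in this commit

static void gemmCPU_demo()
{
    // C = 1.0 * A * B + 0.0 * C, the same pattern ConvolutionLayer::forward uses.
    cv::Mat A = (cv::Mat_<float>(2, 3) << 1, 2, 3,
                                          4, 5, 6);
    cv::Mat B = (cv::Mat_<float>(3, 2) << 7,  8,
                                          9, 10,
                                         11, 12);
    cv::Mat C(2, 2, CV_32F);

    cv::dnn::gemmCPU(A, B, 1.0, C, 0.0);   // C = A * B
    cv::dnn::gemmCPU(A, B, 1.0, C, 1.0);   // C += A * B (beta accumulates)

    // With cv::GEMM_1_T the first operand is taken as transposed, as in the
    // deconvolution path: gemmCPU(wghtMat, convMat, 1, colMat, 0, GEMM_1_T).
}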
modules/dnn/src/layers/convolution_layer.hpp
@@ -87,6 +87,10 @@ namespace dnn
         DeConvolutionLayer(LayerParams &params);
 
         void forward(std::vector<Blob*> &inputs, std::vector<Blob> &outputs);
     };
 
+    void gemm(InputArray A, InputArray B, double alpha, InputOutputArray C, double beta, int flags = 0);
+
+    void gemmCPU(const Mat &A, const Mat &B, double alpha, Mat &C, double beta, int flags = 0);
 }
 }
 
 #endif