Reworked HolisticWordspotter to work with dnn module

3aa88889 · Maksim Shabunin · bad02f37 · 3aa88889 · bad02f37 · bad02f37
Commit 3aa88889 authored Sep 28, 2017 by Maksim Shabunin
9 changed files
--- a/modules/text/CMakeLists.txt
+++ b/modules/text/CMakeLists.txt
 set(the_description "Text Detection and Recognition")
-ocv_define_module(text opencv_ml opencv_imgproc opencv_core opencv_features2d OPTIONAL opencv_highgui WRAP python java)
+ocv_define_module(text opencv_ml opencv_imgproc opencv_core opencv_features2d opencv_dnn OPTIONAL opencv_highgui WRAP python java)
 if(NOT CMAKE_CROSSCOMPILING OR OPENCV_FIND_TESSERACT)
  set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} ${CMAKE_CURRENT_SOURCE_DIR}/cmake)
@@ -22,62 +22,3 @@ ocv_include_directories(${CMAKE_CURRENT_BINARY_DIR})
 ocv_add_testdata(samples/ contrib/text
    FILES_MATCHING PATTERN "*.xml" PATTERN "*.xml.gz" REGEX "scenetext[0-9]+.jpg"
 )
-#Principal source from which adaptation came is the cnn_3dobj module
-find_package(Caffe)
-if(Caffe_FOUND)
-  message(STATUS "Caffe:   YES")
-  set(HAVE_CAFFE 1)
-else()
-  message(STATUS "Caffe:   NO")
-endif()
-find_package(Protobuf)
-if(Protobuf_FOUND)
-  message(STATUS "Protobuf:   YES")
-  set(HAVE_PROTOBUF 1)
-else()
-  message(STATUS "Protobuf:   NO")
-endif()
-find_package(Glog)
-if(Glog_FOUND)
-  message(STATUS "Glog:   YES")
-  set(HAVE_GLOG 1)
-else()
-  message(STATUS "Glog:   NO")
-endif()
-if(HAVE_CAFFE)
-message(STATUS "HAVE CAFFE!!!")
-configure_file(${CMAKE_CURRENT_SOURCE_DIR}/text_config.hpp.in
-               ${CMAKE_CURRENT_SOURCE_DIR}/include/opencv2/text_config.hpp @ONLY)
-include_directories(${CMAKE_CURRENT_BINARY_DIR})
-if(${Caffe_FOUND})
-  include_directories(${Caffe_INCLUDE_DIR})
-  #taken from caffe's cmake
-  find_package(HDF5 COMPONENTS HL REQUIRED)
-  include_directories(SYSTEM ${HDF5_INCLUDE_DIRS} ${HDF5_HL_INCLUDE_DIR})
-  list(APPEND Caffe_LINKER_LIBS ${HDF5_LIBRARIES})
-  find_package(Boost 1.46 REQUIRED COMPONENTS system thread filesystem)
-  include_directories(SYSTEM ${Boost_INCLUDE_DIR})
-  include_directories(SYSTEM /usr/local/cuda-7.5/targets/x86_64-linux/include/)
-  list(APPEND Caffe_LINKER_LIBS ${Boost_LIBRARIES})
-endif()
-if(${Caffe_FOUND})
-  #taken from caffe's cmake
-  target_link_libraries(opencv_text ${Caffe_LIBS} ${Glog_LIBS} ${Protobuf_LIBS} ${HDF5_LIBRARIES} ${Boost_LIBRARIES})
-endif()
-endif()
-configure_file(${CMAKE_CURRENT_SOURCE_DIR}/text_config.hpp.in
-               ${CMAKE_BINARY_DIR}/text_config.hpp @ONLY)
\ No newline at end of file
--- a/modules/text/FindCaffe.cmake
+++ b/modules/text/FindCaffe.cmake
-# Caffe package for CNN Triplet training
-unset(Caffe_FOUND)
-find_path(Caffe_INCLUDE_DIR NAMES caffe/caffe.hpp caffe/common.hpp caffe/net.hpp caffe/proto/caffe.pb.h caffe/util/io.hpp caffe/vision_layers.hpp
-  HINTS
-  /usr/local/include)
-find_library(Caffe_LIBS NAMES caffe
-  HINTS
-  /usr/local/lib)
-if(Caffe_LIBS AND Caffe_INCLUDE_DIR)
-    set(Caffe_FOUND 1)
-endif()
--- a/modules/text/FindGlog.cmake
+++ b/modules/text/FindGlog.cmake
-#Required for Caffe
-unset(Glog_FOUND)
-find_library(Glog_LIBS NAMES glog
-  HINTS
-  /usr/local/lib)
-if(Glog_LIBS)
-    set(Glog_FOUND 1)
-endif()
--- a/modules/text/FindProtobuf.cmake
+++ b/modules/text/FindProtobuf.cmake
-#Protobuf package required for Caffe
-unset(Protobuf_FOUND)
-find_library(Protobuf_LIBS NAMES protobuf
-  HINTS
-  /usr/local/lib)
-if(Protobuf_LIBS)
-    set(Protobuf_FOUND 1)
-endif()
--- a/modules/text/include/opencv2/text/ocr.hpp
+++ b/modules/text/include/opencv2/text/ocr.hpp
@@ -46,10 +46,6 @@
 #include <vector>
 #include <string>
-#include <iostream>
-#include <sstream>
 namespace cv
 {
@@ -540,107 +536,24 @@ at each window location.
 CV_EXPORTS_W Ptr<OCRBeamSearchDecoder::ClassifierCallback> loadOCRBeamSearchClassifierCNN(const String& filename);
-//! @}
-//Classifiers should provide diferent backends
-//For the moment only caffe is implemeted
-enum{
-    OCR_HOLISTIC_BACKEND_NONE,
-    OCR_HOLISTIC_BACKEND_CAFFE
-};
-/** @brief Abstract class that implements the classifcation of text images.
- *
- * The interface is generic enough to describe any image classifier. And allows
- * to take advantage of compouting in batches. While word classifiers are the default
- * networks, any image classifers should work.
- *
- */
-class CV_EXPORTS_W TextImageClassifier
-{
-protected:
-    Size inputSz_;
-    int channelCount_;
-    /** @brief all image preprocessing is handled here including whitening etc.
-     *
-     *  @param input the image to be preprocessed for the classifier. If the depth
-     * is CV_U8 values should be in [0,255] otherwise values are assumed to be in [0,1]
-     *
-     * @param output reference to the image to be fed to the classifier, the preprocessor will
-     * resize the image to the apropriate size and convert it to the apropriate depth\
-     *
-     * The method preprocess should never be used externally, it is up to classify and classifyBatch
-     * methods to employ it.
-     */
-    virtual void preprocess(Mat& input,Mat& output)=0;
-public:
-    virtual ~TextImageClassifier() {}
-    /** @brief produces a class confidence row-vector given an image
-     */
-    CV_WRAP virtual void classify(InputArray image, OutputArray classProbabilities) = 0;
-    /** @brief produces a matrix containing class confidence row-vectors given an collection of images
-     */
-    CV_WRAP virtual void classifyBatch(InputArrayOfArrays image, OutputArray classProbabilities) = 0;
-    /** @brief simple getter method returning the size of the oputput row-vector
-     */
-    CV_WRAP virtual int getOutputSize()=0;
-    /** @brief simple getter method returning the size of the minibatches for this classifier.
-     * If not applicabe this method should return 1
-     */
-    CV_WRAP virtual int getMinibatchSize()=0;
-    /** @brief simple getter method returning a value describing the framework beeing employed to implement the classifier
-     */
-    CV_WRAP virtual int getBackend(){return OCR_HOLISTIC_BACKEND_NONE;}
-};
-class CV_EXPORTS_W DictNet:public TextImageClassifier
-{
-    /** @brief Class that uses a pretrained caffe model for word classification.
-     *
-     * This network is described in detail in:
-     * Max Jaderberg et al.: Reading Text in the Wild with Convolutional Neural Networks, IJCV 2015
-     * http://arxiv.org/abs/1412.1842
-     */
-public:
-    virtual ~DictNet() {};
-    CV_WRAP virtual bool usingGpu()=0;
-    /** @brief Constructs a DictNet object from a caffe pretrained model
-     *
-     * @param archFilename is the path to the prototxt file containing the deployment model architecture description.
-     *
-     * @param weightsFilename is the path to the pretrained weights of the model in binary fdorm. This file can be
-     * very large, up to 2GB.
-     *
-     * @param minibatchSz the maximum number of samples that can processed in parallel. In practice this parameter
-     * has an effect only when computing in the GPU and should be set with respect to the memory available in the GPU.
-     *
-     * @param useGpu boolean flag setting GPU or CPU computation
-     *
-     * @param backEnd integer parameter selecting the coputation framework. For now OCR_HOLISTIC_BACKEND_CAFFE is
-     * the only option
-     */
-    CV_WRAP static Ptr<DictNet> create(String archFilename,String weightsFilename,int minibatchSz=100,bool useGpu=0,int backEnd=OCR_HOLISTIC_BACKEND_CAFFE);
-};
 /** @brief OCRHolisticWordRecognizer class provides the functionallity of segmented wordspotting.
- * Given a predefined vocabulary , a TextImageClassifier is employed to select the most probable
+ * Given a predefined vocabulary, a DictNet is employed to select the most probable
 * word given an input image.
 *
- * This class implements the logic of providing transcriptions given a vocabulary and and an image
+ * DictNet is described in detail in:
- * classifer.
+ * Max Jaderberg et al.: Reading Text in the Wild with Convolutional Neural Networks, IJCV 2015
+ * http://arxiv.org/abs/1412.1842
 */
-class CV_EXPORTS_W OCRHolisticWordRecognizer : public BaseOCR
+class CV_EXPORTS OCRHolisticWordRecognizer : public BaseOCR
 {
 public:
-    virtual void run(Mat& image, std::string& output_text, std::vector<Rect>* component_rects=NULL,
+    virtual void run(Mat& image,
-                     std::vector<std::string>* component_texts=NULL, std::vector<float>* component_confidences=NULL,
+                     std::string& output_text,
-                     int component_level=OCR_LEVEL_WORD)=0;
+                     std::vector<Rect>* component_rects = NULL,
+                     std::vector<std::string>* component_texts = NULL,
+                     std::vector<float>* component_confidences = NULL,
+                     int component_level = OCR_LEVEL_WORD) = 0;
    /** @brief Recognize text using a segmentation based word-spotting/classifier cnn.
@@ -665,68 +578,24 @@ public:
    @param component_level must be OCR_LEVEL_WORD.
     */
+    virtual void run(Mat& image,
-    virtual void run(Mat& image, Mat& mask, std::string& output_text, std::vector<Rect>* component_rects=NULL,
+                     Mat& mask,
-                     std::vector<std::string>* component_texts=NULL, std::vector<float>* component_confidences=NULL,
+                     std::string& output_text,
-                     int component_level=OCR_LEVEL_WORD)=0;
+                     std::vector<Rect>* component_rects = NULL,
+                     std::vector<std::string>* component_texts = NULL,
+                     std::vector<float>* component_confidences = NULL,
-    /**
+                     int component_level = OCR_LEVEL_WORD) = 0;
-    @brief Method that provides a quick and simple interface to a single word image classifcation
-    @param inputImage an image expected to be a CV_U8C1 or CV_U8C3 of any size assumed to contain a single word
-    @param transcription an opencv string that will store the detected word transcription
-    @param confidence a double that will be updated with the confidence the classifier has for the selected word
-    */
-    CV_WRAP virtual void recogniseImage(InputArray inputImage,CV_OUT String& transcription,CV_OUT double& confidence)=0;
-    /**
-    @brief Method that provides a quick and simple interface to a multiple word image classifcation taking advantage
-    the classifiers parallel capabilities.
-    @param inputImageList an list of images expected to be a CV_U8C1 or CV_U8C3 each image can be of any size and is assumed
-    to contain a single word.
-    @param transcriptions a vector of opencv strings that will store the detected word transcriptions, one for each
-    input image
-    @param confidences a vector of double that will be updated with the confidence the classifier has for each of the
-    selected words.
-    */
-    CV_WRAP virtual void recogniseImageBatch(InputArrayOfArrays inputImageList,CV_OUT std::vector<String>& transcriptions,CV_OUT std::vector<double>& confidences)=0;
-    /**
-    @brief simple getted for the vocabulary employed
-    */
-    CV_WRAP virtual const std::vector<String>& getVocabulary()=0;
    /** @brief Creates an instance of the OCRHolisticWordRecognizer class.
-    @param classifierPtr an instance of TextImageClassifier, normaly a DictNet instance
-    @param vocabullaryFilename the relative or absolute path to the file containing all words in the vocabulary. Each text line
-    in the file is assumed to be a single word. The number of words in the vocabulary must be exactly the same as the outputSize
-    of the classifier.
+    @param archFilename path to the file describing the network architecture
+    (the dictnet_demo sample passes "dictnet_vgg_deploy.prototxt").
+    @param weightsFilename path to the file holding the pretrained network weights
+    (the dictnet_demo sample passes "dictnet_vgg.caffemodel").
+    @param wordsFilename path to the vocabulary file
+    (the dictnet_demo sample passes "dictnet_vgg_labels.txt"). Presumably one word per line, with as
+    many words as the network has outputs - TODO confirm against ocr_holistic.cpp.
     */
-    CV_WRAP static Ptr<OCRHolisticWordRecognizer> create(Ptr<TextImageClassifier> classifierPtr,String vocabullaryFilename);
+    static Ptr<OCRHolisticWordRecognizer> create(const std::string &archFilename,
-    /** @brief Creates an instance of the OCRHolisticWordRecognizer class and implicitly also a DictNet classifier.
+                                                 const std::string &weightsFilename,
+                                                 const std::string &wordsFilename);
-    @param modelArchFilename the relative or absolute path to the prototxt file describing the classifiers architecture.
-    @param modelWeightsFilename the relative or absolute path to the file containing the pretrained weights of the model in caffe-binary form.
-    @param vocabullaryFilename the relative or absolute path to the file containing all words in the vocabulary. Each text line
-    in the file is assumed to be a single word. The number of words in the vocabulary must be exactly the same as the outputSize
-    of the classifier.
-    */
-    CV_WRAP static Ptr<OCRHolisticWordRecognizer> create(String modelArchFilename, String modelWeightsFilename, String vocabullaryFilename);
 };
+//! @}
-}
+}} // cv::text::
-}
 #endif // _OPENCV_TEXT_OCR_HPP_
--- a/modules/text/samples/dictnet_demo.cpp
+++ b/modules/text/samples/dictnet_demo.cpp
@@ -12,79 +12,50 @@
 #include  "opencv2/imgproc.hpp"
 #include  <sstream>
-#include  <vector>
 #include  <iostream>
-#include  <iomanip>
-#include  <fstream>
-inline std::string getHelpStr(std::string progFname){
+using namespace std;
-    std::stringstream out;
+using namespace cv;
-    out << "    Demo of wordspotting CNN for text recognition." << std::endl;
+using namespace cv::text;
-    out << "    Max Jaderberg et al.: Reading Text in the Wild with Convolutional Neural Networks, IJCV 2015"<<std::endl<<std::endl;
-    out << "    Usage: " << progFname << " <output_file> <input_image1> <input_image2> ... <input_imageN>" << std::endl;
+// Prints the demo description, the expected command line and the shell commands
+// for downloading the three DictNet model files to standard output.
+inline void printHelp()
-    out << "    Caffe Model files  (dictnet_vgg.caffemodel, dictnet_vgg_deploy.prototxt, dictnet_vgg_labels.txt)"<<std::endl;
+{
-    out << "      must be in the current directory." << std::endl << std::endl;
+    cout << "    Demo of wordspotting CNN for text recognition." << endl;
+    cout << "    Max Jaderberg et al.: Reading Text in the Wild with Convolutional Neural Networks, IJCV 2015" << endl << endl;
-    out << "    Obtaining Caffe Model files in linux shell:"<<std::endl;
+    cout << "    Usage: program <input_image>" << endl;
-    out << "    wget http://nicolaou.homouniversalis.org/assets/vgg_text/dictnet_vgg.caffemodel"<<std::endl;
+    cout << "    Caffe Model files  (dictnet_vgg.caffemodel, dictnet_vgg_deploy.prototxt, dictnet_vgg_labels.txt)"<<endl;
-    out << "    wget http://nicolaou.homouniversalis.org/assets/vgg_text/dictnet_vgg_deploy.prototxt"<<std::endl;
+    cout << "      must be in the current directory." << endl << endl;
-    out << "    wget http://nicolaou.homouniversalis.org/assets/vgg_text/dictnet_vgg_labels.txt"<<std::endl<<std::endl;
-    return out.str();
-}
-inline bool fileExists (std::string filename) {
+    cout << "    Obtaining Caffe Model files in linux shell:"<<endl;
-    std::ifstream f(filename.c_str());
+    cout << "    wget http://nicolaou.homouniversalis.org/assets/vgg_text/dictnet_vgg.caffemodel"<<endl;
-    return f.good();
+    cout << "    wget http://nicolaou.homouniversalis.org/assets/vgg_text/dictnet_vgg_deploy.prototxt"<<endl;
+    cout << "    wget http://nicolaou.homouniversalis.org/assets/vgg_text/dictnet_vgg_labels.txt"<<endl<<endl;
 }
+// Demo entry point: expects exactly one argument, the path of an image containing a
+// single word. Loads it as grayscale, runs the holistic word recognizer built from the
+// DictNet model files in the current directory and prints the recognized word.
+int main(int argc, const char * argv[])
-int main(int argc, const char * argv[]){
+{
-    const int USE_GPU=0;
+    if (argc != 2)
+    {
-    if (argc < 3){
+        printHelp();
-        std::cout<<getHelpStr(argv[0]);
-        std::cout<<"Insufiecient parameters. Aborting!"<<std::endl;
        exit(1);
    }
-    if (!fileExists("dictnet_vgg.caffemodel") ||
+    Mat image = imread(argv[1], IMREAD_GRAYSCALE);
-            !fileExists("dictnet_vgg_deploy.prototxt") ||
-            !fileExists("dictnet_vgg_labels.txt")){
-        std::cout<<getHelpStr(argv[0]);
-        std::cout<<"Model files not found in the current directory. Aborting!"<<std::endl;
-        exit(1);
-    }
-    if (fileExists(argv[1])){
-        std::cout<<getHelpStr(argv[0]);
-        std::cout<<"Output file must not exist. Aborting!"<<std::endl;
-        exit(1);
-    }
-    std::vector<cv::Mat> imageList;
+    // Validate the load before reporting anything about the image, and say why we stop.
+    if (image.empty())
-    for(int imageIdx=2;imageIdx<argc;imageIdx++){
+    {
-        if (fileExists(argv[imageIdx])){
+        cerr << "Could not read image: " << argv[1] << endl;
+        printHelp();
-            imageList.push_back(cv::imread(cv::String(argv[imageIdx])));
+        exit(1);
-        }else{
-            std::cout<<getHelpStr(argv[0]);
-            std::cout<<argv[imageIdx]<<" doesn't exist. Aborting";
-        }
    }
+    cout << "Read image (" << argv[1] << "): " << image.size << ", channels: " << image.channels() << ", depth: " << image.depth() << endl;
-    cv::Ptr<cv::text::DictNet> cnn=cv::text::DictNet::create(
-                "dictnet_vgg_deploy.prototxt","dictnet_vgg.caffemodel",100,USE_GPU);
-    cv::Ptr<cv::text::OCRHolisticWordRecognizer> wordSpotter=
+    Ptr<OCRHolisticWordRecognizer> wordSpotter = OCRHolisticWordRecognizer::create("dictnet_vgg_deploy.prototxt", "dictnet_vgg.caffemodel", "dictnet_vgg_labels.txt");
-            cv::text::OCRHolisticWordRecognizer::create(cnn,"dictnet_vgg_labels.txt");
-    std::vector<cv::String> wordList;
+    std::string word;
-    std::vector<double> outProbabillities;
+    vector<float> confs;
-    wordSpotter->recogniseImageBatch(imageList,wordList,outProbabillities);
+    wordSpotter->run(image, word, 0, 0, &confs);
-    std::ofstream out;
+    // run() is expected to report one confidence for the word; guard the index anyway
+    // so an empty vector cannot turn into an out-of-bounds read.
+    if (confs.empty())
+        cout << "Detected word: '" << word << "' (no confidence reported)" << endl;
+    else
+        cout << "Detected word: '" << word << "', confidence: " << confs[0] << endl;
-    out.open(argv[1]);
-    for(int imgIdx=0;imgIdx<int(imageList.size());imgIdx++){
-        out<<argv[imgIdx+2]<<","<<wordList[imgIdx]<<","<<outProbabillities[imgIdx]<<std::endl;
-    }
-    out.close();
 }
--- a/modules/text/samples/dictnet_demo.py
+++ b/modules/text/samples/dictnet_demo.py
-#!/usr/bin/env python
-import cv2
-import sys
-import os.path
-#Global variable shared between the Mouse callback and main
-refPt = []
-cropping = False
-image=None
-drawImage=None
-dictNet=None
-wordSpotter=None
-def mouseCallback(event, x, y, flags, param):
-    # grab references to the global variables
-    global refPt, cropping,wordSpotter,drawImage,image
-    # if the left mouse button was clicked, record the starting
-    # (x, y) coordinates and indicate that cropping is being
-    # performed
-    if event == cv2.EVENT_LBUTTONDOWN:
-        refPt = [(x, y)]
-        cropping = True
-    # check to see if the left mouse button was released
-    elif event == cv2.EVENT_LBUTTONUP:
-        # record the ending (x, y) coordinates and indicate that
-        # the cropping operation is finished
-        refPt.append((x, y))
-        cropping = False
-        # draw a rectangle around the region of interest
-        roi = image[refPt[0][1]:refPt[1][1], refPt[0][0]:refPt[1][0]]
-        res=wordSpotter.recogniseImage(roi)
-        drawImage = image.copy()
-        cv2.rectangle(drawImage, refPt[0], refPt[1], (0, 255, 0), 2)
-        cv2.putText(drawImage,"%s:%f"%(res[0],res[1]),refPt[0],cv2.FONT_HERSHEY_SIMPLEX,1,(255,255,255),2)
-        cv2.imshow("Select A Region", drawImage)
-if __name__=='__main__':
-    USEGPU=False
-    helpStr="""Usage: """+sys.argv[0]+""" IMAGE_FILENAME
-    Press 'q' or 'Q' exit
-    The modelFiles must be available in the current directory.
-    In linux shell they can be downloaded (~2GB) with the following commands:
-    wget http://nicolaou.homouniversalis.org/assets/vgg_text/dictnet_vgg.caffemodel
-    wget http://nicolaou.homouniversalis.org/assets/vgg_text/dictnet_vgg_deploy.prototxt
-    wget http://nicolaou.homouniversalis.org/assets/vgg_text/dictnet_vgg_labels.txt
-    """
-    if((len(sys.argv)!=2 )or not(os.path.isfile(sys.argv[1]) )):
-        print helpStr
-        print 'No image file given Aborting!'
-        sys.exit(1)
-    if not (os.path.isfile('dictnet_vgg_deploy.prototxt') and
-        os.path.isfile('dictnet_vgg.caffemodel') and
-        os.path.isfile('dictnet_vgg_labels.txt')):
-        print helpStr
-        print 'Model files not present, Aborting!'
-        sys.exit(1)
-    dictNet=cv2.text.DictNet_create('./dictnet_vgg_deploy.prototxt','./dictnet_vgg.caffemodel',100,USEGPU)
-    wordSpotter=cv2.text.OCRHolisticWordRecognizer_create(dictNet,"./dictnet_vgg_labels.txt")
-    image = cv2.imread(sys.argv[1])
-    drawImage = image.copy()
-    cv2.namedWindow("Select A Region")
-    cv2.setMouseCallback("Select A Region", mouseCallback)
-    while True:
-        cv2.imshow("Select A Region", drawImage)
-        key = cv2.waitKey(1) & 0xFF
-        # if the 'q' key is pressed, break from the loop
-        if key == ord("q") or key == ord("Q"):
-            break
-    cv2.destroyAllWindows()
--- a/modules/text/src/ocr_holistic.cpp
+++ b/modules/text/src/ocr_holistic.cpp
--- a/modules/text/text_config.hpp.in
+++ b/modules/text/text_config.hpp.in
 #ifndef __OPENCV_TEXT_CONFIG_HPP__
 #define __OPENCV_TEXT_CONFIG_HPP__
-// HAVE CAFFE
-#cmakedefine HAVE_CAFFE
 // HAVE OCR Tesseract
 #cmakedefine HAVE_TESSERACT
 #endif