Commit 6e4d6bca authored by Vadim Pisarevsky's avatar Vadim Pisarevsky

Merge pull request #321 from lluisgomez:ocr_refactor_even

parents f9d42886 796c26f6
......@@ -69,6 +69,9 @@ public:
virtual void run(Mat& image, std::string& output_text, std::vector<Rect>* component_rects=NULL,
std::vector<std::string>* component_texts=NULL, std::vector<float>* component_confidences=NULL,
int component_level=0) = 0;
virtual void run(Mat& image, Mat& mask, std::string& output_text, std::vector<Rect>* component_rects=NULL,
std::vector<std::string>* component_texts=NULL, std::vector<float>* component_confidences=NULL,
int component_level=0) = 0;
};
/** @brief OCRTesseract class provides an interface with the tesseract-ocr API (v3.02.02) in C++.
......@@ -106,6 +109,10 @@ public:
std::vector<std::string>* component_texts=NULL, std::vector<float>* component_confidences=NULL,
int component_level=0);
virtual void run(Mat& image, Mat& mask, std::string& output_text, std::vector<Rect>* component_rects=NULL,
std::vector<std::string>* component_texts=NULL, std::vector<float>* component_confidences=NULL,
int component_level=0);
/** @brief Creates an instance of the OCRTesseract class. Initializes Tesseract.
@param datapath the name of the parent directory of tessdata ended with "/", or NULL to use the
......@@ -170,11 +177,11 @@ public:
public:
/** @brief Recognize text using HMM.
Takes image on input and returns recognized text in the output_text parameter. Optionally
Takes binary image on input and returns recognized text in the output_text parameter. Optionally
provides also the Rects for individual text elements found (e.g. words), and the list of those
text elements with their confidence values.
@param image Input image CV_8UC1 with a single text line (or word).
@param image Input binary image CV_8UC1 with a single text line (or word).
@param output_text Output text. Most likely character sequence found by the HMM decoder.
......@@ -193,6 +200,33 @@ public:
std::vector<std::string>* component_texts=NULL, std::vector<float>* component_confidences=NULL,
int component_level=0);
/** @brief Recognize text using HMM.
Takes an image and a mask (where each connected component corresponds to a segmented character)
on input and returns recognized text in the output_text parameter. Optionally
provides also the Rects for individual text elements found (e.g. words), and the list of those
text elements with their confidence values.
@param image Input image CV_8UC1 or CV_8UC3 with a single text line (or word).
@param mask Input binary image CV_8UC1 same size as input image. Each connected component in mask corresponds to a segmented character in the input image.
@param output_text Output text. Most likely character sequence found by the HMM decoder.
@param component_rects If provided the method will output a list of Rects for the individual
text elements found (e.g. words).
@param component_texts If provided the method will output a list of text strings for the
recognition of individual text elements found (e.g. words).
@param component_confidences If provided the method will output a list of confidence values
for the recognition of individual text elements found (e.g. words).
@param component_level Only OCR_LEVEL_WORD is supported.
*/
virtual void run(Mat& image, Mat& mask, std::string& output_text, std::vector<Rect>* component_rects=NULL,
std::vector<std::string>* component_texts=NULL, std::vector<float>* component_confidences=NULL,
int component_level=0);
/** @brief Creates an instance of the OCRHMMDecoder class. Initializes HMMDecoder.
@param classifier The character classifier with built in feature extractor.
......@@ -231,7 +265,7 @@ protected:
@param filename The XML or YAML file with the classifier model (e.g. OCRHMM_knn_model_data.xml)
The default classifier is based in the scene text recognition method proposed by Lukás Neumann &
The KNN default classifier is based in the scene text recognition method proposed by Lukás Neumann &
Jiri Matas in [Neumann11b]. Basically, the region (contour) in the input image is normalized to a
fixed size, while retaining the centroid and aspect ratio, in order to extract a feature vector
based on gradient orientations along the chain-code of its perimeter. Then, the region is classified
......@@ -240,19 +274,32 @@ types.
*/
CV_EXPORTS Ptr<OCRHMMDecoder::ClassifierCallback> loadOCRHMMClassifierNM(const std::string& filename);
/** @brief Utility function to create a tailored language model transitions table from a given list of words (lexicon).
@param vocabulary The language vocabulary (chars when ascii english text).
@param lexicon The list of words that are expected to be found in a particular image.
@param transition_probabilities_table Output table with transition probabilities between character pairs. cols == rows == vocabulary.size().
The function calculate frequency statistics of character pairs from the given lexicon and fills the output transition_probabilities_table with them. The transition_probabilities_table can be used as input in the OCRHMMDecoder::create() and OCRBeamSearchDecoder::create() methods.
@note
- (C++) An alternative would be to load the default generic language transition table provided in the text module samples folder (created from ispell 42869 english words list) :
<https://github.com/Itseez/opencv_contrib/blob/master/modules/text/samples/OCRHMM_transitions_table.xml>
*/
/** @brief Allow to implicitly load the default character classifier when creating an OCRHMMDecoder object.
@param filename The XML or YAML file with the classifier model (e.g. OCRBeamSearch_CNN_model_data.xml.gz)
The CNN default classifier is based in the scene text recognition method proposed by Adam Coates &
Andrew NG in [Coates11a]. The character classifier consists in a Single Layer Convolutional Neural Network and
a linear classifier. It is applied to the input image in a sliding window fashion, providing a set of recognitions
at each window location.
*/
CV_EXPORTS Ptr<OCRHMMDecoder::ClassifierCallback> loadOCRHMMClassifierCNN(const std::string& filename);
//! @}
/** @brief Utility function to create a tailored language model transitions table from a given list of words (lexicon).
*
* @param vocabulary The language vocabulary (chars when ascii english text).
*
* @param lexicon The list of words that are expected to be found in a particular image.
*
* @param transition_probabilities_table Output table with transition probabilities between character pairs. cols == rows == vocabulary.size().
*
* The function calculate frequency statistics of character pairs from the given lexicon and fills the output transition_probabilities_table with them. The transition_probabilities_table can be used as input in the OCRHMMDecoder::create() and OCRBeamSearchDecoder::create() methods.
* @note
* - (C++) An alternative would be to load the default generic language transition table provided in the text module samples folder (created from ispell 42869 english words list) :
* <https://github.com/Itseez/opencv_contrib/blob/master/modules/text/samples/OCRHMM_transitions_table.xml>
* */
CV_EXPORTS void createOCRHMMTransitionsTable(std::string& vocabulary, std::vector<std::string>& lexicon, OutputArray transition_probabilities_table);
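A minimal sketch of building and consuming the tailored table (it mirrors the segmented_word_recognition.cpp sample further down; the lexicon words are placeholders):
    std::string voc = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789";
    std::vector<std::string> lexicon;  // words expected in the target image
    lexicon.push_back("abb");
    lexicon.push_back("patata");
    cv::Mat transition_probabilities;
    cv::text::createOCRHMMTransitionsTable(voc, lexicon, transition_probabilities);
    // transition_probabilities can now be passed to OCRHMMDecoder::create() or OCRBeamSearchDecoder::create()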
......@@ -319,6 +366,10 @@ public:
std::vector<std::string>* component_texts=NULL, std::vector<float>* component_confidences=NULL,
int component_level=0);
virtual void run(Mat& image, Mat& mask, std::string& output_text, std::vector<Rect>* component_rects=NULL,
std::vector<std::string>* component_texts=NULL, std::vector<float>* component_confidences=NULL,
int component_level=0);
/** @brief Creates an instance of the OCRBeamSearchDecoder class. Initializes HMMDecoder.
@param classifier The character classifier with built in feature extractor.
......@@ -359,10 +410,10 @@ protected:
/** @brief Allow to implicitly load the default character classifier when creating an OCRBeamSearchDecoder object.
@param filename The XML or YAML file with the classifier model (e.g. OCRHMM_knn_model_data.xml)
@param filename The XML or YAML file with the classifier model (e.g. OCRBeamSearch_CNN_model_data.xml.gz)
The default classifier is based in the scene text recognition method proposed by Adam Coates &
Andrew NG in [Coates11a]. The character classifier sonsists in a Single Layer Convolutional Neural Network and
The CNN default classifier is based in the scene text recognition method proposed by Adam Coates &
Andrew NG in [Coates11a]. The character classifier consists in a Single Layer Convolutional Neural Network and
a linear classifier. It is applied to the input image in a sliding window fashion, providing a set of recognitions
at each window location.
*/
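For context, a usage sketch of the loader documented here (the loader name, loadOCRBeamSearchClassifierCNN, is not visible in this hunk and is assumed from the rest of the module; the input image and language-model tables are placeholders):
    cv::Mat word_image = cv::imread("word.jpg");  // a cropped single-word image, CV_8UC1 or CV_8UC3
    std::string voc = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789";
    cv::Mat transitions, emissions;  // e.g. transitions from OCRHMM_transitions_table.xml, emissions an identity matrix
    cv::Ptr<cv::text::OCRBeamSearchDecoder> ocr = cv::text::OCRBeamSearchDecoder::create(
            cv::text::loadOCRBeamSearchClassifierCNN("OCRBeamSearch_CNN_model_data.xml.gz"),
            voc, transitions, emissions);
    std::string text;
    ocr->run(word_image, text);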
......
......@@ -36,7 +36,7 @@ int main(int argc, char* argv[])
return(0);
}
string vocabulary = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyx0123456789"; // must have the same order as the clasifier output classes
string vocabulary = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789"; // must have the same order as the clasifier output classes
vector<string> lexicon; // a list of words expected to be found on the input image
lexicon.push_back(string("abb"));
lexicon.push_back(string("patata"));
......
/*
* segmented_word_recognition.cpp
*
* A demo program on segmented word recognition.
* Shows the use of the OCRHMMDecoder API with the two provided default character classifiers.
*
* Created on: Jul 31, 2015
* Author: Lluis Gomez i Bigorda <lgomez AT cvc.uab.es>
*/
#include "opencv2/text.hpp"
#include "opencv2/core/utility.hpp"
#include "opencv2/highgui.hpp"
#include "opencv2/imgproc.hpp"
#include <iostream>
using namespace std;
using namespace cv;
using namespace text;
int main(int argc, char* argv[]) {
const String keys =
"{help h usage ? | | print this message.}"
"{@image | | source image for recognition.}"
"{@mask | | binary segmentation mask where each contour is a character.}"
"{lexicon lex l | | (optional) lexicon provided as a list of comma separated words.}"
;
CommandLineParser parser(argc, argv, keys);
parser.about("\nSegmented word recognition.\nA demo program on segmented word recognition. Shows the use of the OCRHMMDecoder API with the two provided default character classifiers.\n");
String filename1 = parser.get<String>(0);
String filename2 = parser.get<String>(1);
parser.printMessage();
cout << endl << endl;
if ((parser.has("help")) || (filename1.size()==0))
{
return 0;
}
if (!parser.check())
{
parser.printErrors();
return 0;
}
Mat image = imread(filename1);
Mat mask;
if (filename2.size() > 0)
mask = imread(filename2);
else
image.copyTo(mask);
// make sure the mask is a binary image
cvtColor(mask, mask, COLOR_BGR2GRAY);
threshold(mask, mask, 128., 255, THRESH_BINARY);
// character recognition vocabulary
string voc = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789";
// Emission probabilities for the HMM language model (identity matrix by default)
Mat emissionProbabilities = Mat::eye((int)voc.size(), (int)voc.size(), CV_64FC1);
// Bigram transition probabilities for the HMM language model
Mat transitionProbabilities;
string lex = parser.get<string>("lex");
if (lex.size()>0)
{
// Build tailored language model for the provided lexicon
vector<string> lexicon;
size_t pos = 0;
string delimiter = ",";
std::string token;
while ((pos = lex.find(delimiter)) != std::string::npos) {
token = lex.substr(0, pos);
lexicon.push_back(token);
lex.erase(0, pos + delimiter.length());
}
lexicon.push_back(lex);
createOCRHMMTransitionsTable(voc,lexicon,transitionProbabilities);
} else {
// Or load the generic language model (from Aspell English dictionary)
FileStorage fs("./OCRHMM_transitions_table.xml", FileStorage::READ);
fs["transition_probabilities"] >> transitionProbabilities;
fs.release();
}
Ptr<OCRTesseract> ocrTes = OCRTesseract::create();
Ptr<OCRHMMDecoder> ocrNM = OCRHMMDecoder::create(
loadOCRHMMClassifierNM("./OCRHMM_knn_model_data.xml.gz"),
voc, transitionProbabilities, emissionProbabilities);
Ptr<OCRHMMDecoder> ocrCNN = OCRHMMDecoder::create(
loadOCRHMMClassifierCNN("OCRBeamSearch_CNN_model_data.xml.gz"),
voc, transitionProbabilities, emissionProbabilities);
std::string output;
double t_r = (double)getTickCount();
ocrTes->run(mask, output);
output.erase(remove(output.begin(), output.end(), '\n'), output.end());
cout << " OCR_Tesseract output \"" << output << "\". Done in "
<< ((double)getTickCount() - t_r)*1000/getTickFrequency() << " ms." << endl;
t_r = (double)getTickCount();
ocrNM->run(mask, output);
cout << " OCR_NM output \"" << output << "\". Done in "
<< ((double)getTickCount() - t_r)*1000/getTickFrequency() << " ms." << endl;
t_r = (double)getTickCount();
ocrCNN->run(image, mask, output);
cout << " OCR_CNN output \"" << output << "\". Done in "
<< ((double)getTickCount() - t_r)*1000/getTickFrequency() << " ms." << endl;
}
......@@ -73,6 +73,22 @@ void OCRBeamSearchDecoder::run(Mat& image, string& output_text, vector<Rect>* co
component_confidences->clear();
}
void OCRBeamSearchDecoder::run(Mat& image, Mat& mask, string& output_text, vector<Rect>* component_rects,
vector<string>* component_texts, vector<float>* component_confidences,
int component_level)
{
CV_Assert( (image.type() == CV_8UC1) || (image.type() == CV_8UC3) );
CV_Assert( mask.type() == CV_8UC1 );
CV_Assert( (component_level == OCR_LEVEL_TEXTLINE) || (component_level == OCR_LEVEL_WORD) );
output_text.clear();
if (component_rects != NULL)
component_rects->clear();
if (component_texts != NULL)
component_texts->clear();
if (component_confidences != NULL)
component_confidences->clear();
}
void OCRBeamSearchDecoder::ClassifierCallback::eval( InputArray image, vector< vector<double> >& recognition_probabilities, vector<int>& oversegmentation)
{
......@@ -136,7 +152,7 @@ public:
if (component_confidences != NULL)
component_confidences->clear();
// TODO split a line into words
// TODO We must split a line into words or specify we only work with words
if(src.type() == CV_8UC3)
{
......@@ -174,14 +190,7 @@ public:
}
//TODO it would be interesting to have a hash table with a vector of booleans
// but this is not possible when we have a large number of possible segmentations.
//vector<bool> visited_nodes(pow(2,oversegmentation.size()),false); // hash table for visited nodes
// options are using std::set<unsigned long long int> to store only the keys of visited nodes
// but will deteriorate the time performance.
set<unsigned long long int> visited_nodes; //TODO make it member of class
// it is also possible to reduce the number of seg. points in some way (e.g. use only seg.points
// for which there is a change on the class prediction)
vector<int> start_segmentation;
start_segmentation.push_back(oversegmentation[0]);
......@@ -221,6 +230,21 @@ public:
return;
}
void run( Mat& src,
Mat& mask,
string& out_sequence,
vector<Rect>* component_rects,
vector<string>* component_texts,
vector<float>* component_confidences,
int component_level)
{
CV_Assert( mask.type() == CV_8UC1 );
// Nothing to do with a mask here. We do a sliding window anyway.
run( src, out_sequence, component_rects, component_texts, component_confidences, component_level );
}
private:
////////////////////////////////////////////////////////////
......@@ -421,22 +445,29 @@ OCRBeamSearchClassifierCNN::OCRBeamSearchClassifierCNN (const string& filename)
fs["feature_min"] >> feature_min;
fs["feature_max"] >> feature_max;
fs.release();
// TODO check all matrix dimensions match correctly and no one is empty
}
else
CV_Error(Error::StsBadArg, "Default classifier data file not found!");
nr_feature = weights.rows;
nr_class = weights.cols;
// TODO some of this can be inferred from the input file (e.g. patch size must be sqrt(filters.cols))
step_size = 4;
// check all matrix dimensions match correctly and no one is empty
CV_Assert( (M.cols > 0) && (M.rows > 0) );
CV_Assert( (P.cols > 0) && (P.rows > 0) );
CV_Assert( (kernels.cols > 0) && (kernels.rows > 0) );
CV_Assert( (weights.cols > 0) && (weights.rows > 0) );
CV_Assert( (feature_min.cols > 0) && (feature_min.rows > 0) );
CV_Assert( (feature_max.cols > 0) && (feature_max.rows > 0) );
nr_feature = weights.rows;
nr_class = weights.cols;
patch_size = (int)sqrt(kernels.cols);
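// e.g. for the default model the convolution kernels are 8x8 patches, so kernels.cols == 64
// and patch_size == 8, matching the value that was hard-coded before this change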
// algorithm internal parameters
window_size = 32;
quad_size = 12;
patch_size = 8;
num_quads = 25;
num_tiles = 25;
alpha = 0.5;
step_size = 4; // TODO should this be a parameter for the user?
}
......@@ -459,7 +490,6 @@ void OCRBeamSearchClassifierCNN::eval( InputArray _src, vector< vector<double> >
cvtColor(src,src,COLOR_RGB2GRAY);
}
// TODO shall we resize the input image or make a copy ?
resize(src,src,Size(window_size*src.cols/src.rows,window_size));
int seg_points = 0;
......@@ -555,8 +585,8 @@ void OCRBeamSearchClassifierCNN::eval( InputArray _src, vector< vector<double> >
double *p = new double[nr_class];
double predict_label = eval_feature(feature,p);
//cout << " Prediction: " << vocabulary[predict_label] << " with probability " << p[0] << endl;
if (predict_label < 0) // TODO use cvError
cout << "OCRBeamSearchClassifierCNN::eval Error: unexpected prediction in eval_feature()" << endl;
if (predict_label < 0)
CV_Error(Error::StsInternal, "OCRBeamSearchClassifierCNN::eval Error: unexpected prediction in eval_feature()");
seg_points++;
......
......@@ -70,6 +70,22 @@ void OCRTesseract::run(Mat& image, string& output_text, vector<Rect>* component_
component_confidences->clear();
}
void OCRTesseract::run(Mat& image, Mat& mask, string& output_text, vector<Rect>* component_rects,
vector<string>* component_texts, vector<float>* component_confidences,
int component_level)
{
CV_Assert( (image.type() == CV_8UC1) || (image.type() == CV_8UC3) );
CV_Assert( mask.type() == CV_8UC1 );
CV_Assert( (component_level == OCR_LEVEL_TEXTLINE) || (component_level == OCR_LEVEL_WORD) );
output_text.clear();
if (component_rects != NULL)
component_rects->clear();
if (component_texts != NULL)
component_texts->clear();
if (component_confidences != NULL)
component_confidences->clear();
}
class OCRTesseractImpl : public OCRTesseract
{
private:
......@@ -189,6 +205,16 @@ public:
#endif
}
void run(Mat& image, Mat& mask, string& output, vector<Rect>* component_rects=NULL,
vector<string>* component_texts=NULL, vector<float>* component_confidences=NULL,
int component_level=0)
{
CV_Assert( mask.type() == CV_8UC1 );
CV_Assert( (image.type() == CV_8UC1) || (image.type() == CV_8UC3) );
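// The mask already isolates the characters, so it is passed to Tesseract as the image to
// recognize; the original image is only type-checked here.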
run( mask, output, component_rects, component_texts, component_confidences, component_level);
}
};
......