Commit 468d3ecc authored by Vadim Pisarevsky's avatar Vadim Pisarevsky

Merge pull request #61 from lluisgomez/master

adds class OCRHMMDecoder API interface and implementation and updates webcam_demo sample to use it
parents 37af0432 0898fb9d
......@@ -7,11 +7,12 @@ OCRTesseract
------------
.. ocv:class:: OCRTesseract : public BaseOCR
OCRTesseract class provides an interface with the tesseract-ocr API (v3.02.02) in C++. Notice that it is compiled only when tesseract-ocr is correctly installed. ::
OCRTesseract class provides an interface with the tesseract-ocr API (v3.02.02) in C++. Notice that it is compiled only when tesseract-ocr is correctly installed.
.. note::
* (C++) An example of OCRTesseract recognition combined with scene text detection can be found at the end_to_end_recognition demo: https://github.com/Itseez/opencv_contrib/blob/master/modules/text/samples/end_to_end_recognition.cpp
* (C++) Another example of OCRTesseract recognition combined with scene text detection can be found at the webcam_demo: https://github.com/Itseez/opencv_contrib/blob/master/modules/text/samples/webcam_demo.cpp
OCRTesseract::create
--------------------
......@@ -37,3 +38,69 @@ Recognize text using the tesseract-ocr API. Takes image on input and returns rec
:param component_texts: If provided the method will output a list of text strings for the recognition of individual text elements found (e.g. words or text lines).
:param component_confidences: If provided the method will output a list of confidence values for the recognition of individual text elements found (e.g. words or text lines).
:param component_level: ``OCR_LEVEL_WORD`` (by default), or ``OCR_LEVEL_TEXT_LINE``.
OCRHMMDecoder
-------------
.. ocv:class:: OCRHMMDecoder : public BaseOCR
OCRHMMDecoder class provides an interface for OCR using Hidden Markov Models.
.. note::
* (C++) An example on using OCRHMMDecoder recognition combined with scene text detection can be found at the webcam_demo sample: https://github.com/Itseez/opencv_contrib/blob/master/modules/text/samples/webcam_demo.cpp
OCRHMMDecoder::ClassifierCallback
---------------------------------
The callback with the character classifier is wrapped in a class. This way it hides the feature extractor and the classifier itself, so developers can plug in their own OCR code.
.. ocv:class:: OCRHMMDecoder::ClassifierCallback
The default character classifier and feature extractor can be loaded using the utility function ``loadOCRHMMClassifierNM`` and the KNN model provided at https://github.com/Itseez/opencv_contrib/blob/master/modules/text/samples/OCRHMM_knn_model_data.xml.gz.
OCRHMMDecoder::ClassifierCallback::eval
---------------------------------------
The character classifier must return the id of the predicted class (or a ranked list of class ids).
.. ocv:function:: void OCRHMMDecoder::ClassifierCallback::eval( InputArray image, std::vector<int>& out_class, std::vector<double>& out_confidence)
:param image: Input image ``CV_8UC1`` or ``CV_8UC3`` with a single letter.
:param out_class: The classifier returns the character class categorical label, or list of class labels, to which the input image corresponds.
:param out_confidence: The classifier returns the probability of the input image corresponding to each of the classes in ``out_class``.
OCRHMMDecoder::create
---------------------
Creates an instance of the OCRHMMDecoder class, initializing the HMM decoder.
.. ocv:function:: Ptr<OCRHMMDecoder> OCRHMMDecoder::create(const Ptr<OCRHMMDecoder::ClassifierCallback> classifier, const std::string& vocabulary, InputArray transition_probabilities_table, InputArray emission_probabilities_table, decoder_mode mode = OCR_DECODER_VITERBI)
:param classifier: The character classifier with built in feature extractor.
:param vocabulary: The language vocabulary (individual characters in the case of ASCII English text). ``vocabulary.size()`` must be equal to the number of classes of the classifier.
:param transition_probabilities_table: Table with transition probabilities between character pairs. cols == rows == vocabulary.size().
:param emission_probabilities_table: Table with observation emission probabilities. cols == rows == vocabulary.size().
:param mode: HMM Decoding algorithm. Only ``OCR_DECODER_VITERBI`` is available for the moment (http://en.wikipedia.org/wiki/Viterbi_algorithm).
OCRHMMDecoder::run
------------------
Recognize text using HMM. Takes an image as input and returns the recognized text in the output_text parameter. Optionally it also provides the Rects for the individual text elements found (e.g. words), and the list of those text elements with their confidence values.
.. ocv:function:: void OCRHMMDecoder::run(Mat& image, string& output_text, vector<Rect>* component_rects=NULL, vector<string>* component_texts=NULL, vector<float>* component_confidences=NULL, int component_level=0)
:param image: Input image ``CV_8UC1`` with a single text line (or word).
:param output_text: Output text. Most likely character sequence found by the HMM decoder.
:param component_rects: If provided the method will output a list of Rects for the individual text elements found (e.g. words).
:param component_texts: If provided the method will output a list of text strings for the recognition of individual text elements found (e.g. words).
:param component_confidences: If provided the method will output a list of confidence values for the recognition of individual text elements found (e.g. words).
:param component_level: Only ``OCR_LEVEL_WORD`` is supported.
loadOCRHMMClassifierNM
----------------------
Allows the default character classifier to be loaded implicitly when creating an OCRHMMDecoder object.
.. ocv:function:: Ptr<OCRHMMDecoder::ClassifierCallback> loadOCRHMMClassifierNM(const std::string& filename)
:param filename: The XML or YAML file with the classifier model (e.g. OCRHMM_knn_model_data.xml)
The default classifier is based on the scene text recognition method proposed by Lukás Neumann & Jiri Matas in [Neumann11b]. Basically, the region (contour) in the input image is normalized to a fixed size, while retaining the centroid and aspect ratio, in order to extract a feature vector based on gradient orientations along the chain-code of its perimeter. Then, the region is classified using a KNN model trained with synthetic data of rendered characters with different standard font types.
.. [Neumann11b] Neumann L., Matas J.: Text Localization in Real-world Images using Efficiently Pruned Exhaustive Search, ICDAR 2011. The paper is available online at http://cmp.felk.cvut.cz/~neumalu1/icdar2011_article.pdf
......@@ -52,7 +52,6 @@ namespace cv
namespace text
{
enum
{
OCR_LEVEL_WORD,
......@@ -69,6 +68,8 @@ public:
int component_level=0) = 0;
};
/* OCR Tesseract */
class CV_EXPORTS OCRTesseract : public BaseOCR
{
public:
......@@ -81,6 +82,52 @@ public:
};
/* OCR HMM Decoder */
enum decoder_mode
{
OCR_DECODER_VITERBI = 0 // Other algorithms may be added
};
class CV_EXPORTS OCRHMMDecoder : public BaseOCR
{
public:
    //! Callback with the character classifier; implemented as a class so we can hide the feature extractor and the classifier itself
class CV_EXPORTS ClassifierCallback
{
public:
virtual ~ClassifierCallback() { }
        //! The classifier must return the id of the predicted class (or a ranked list of class ids)
virtual void eval( InputArray image, std::vector<int>& out_class, std::vector<double>& out_confidence);
};
public:
//! Decode a group of regions and output the most likely sequence of characters
virtual void run(Mat& image, std::string& output_text, std::vector<Rect>* component_rects=NULL,
std::vector<std::string>* component_texts=NULL, std::vector<float>* component_confidences=NULL,
int component_level=0);
static Ptr<OCRHMMDecoder> create(const Ptr<OCRHMMDecoder::ClassifierCallback> classifier,// The character classifier with built in feature extractor
const std::string& vocabulary, // The language vocabulary (chars when ascii english text)
// size() must be equal to the number of classes
InputArray transition_probabilities_table, // Table with transition probabilities between character pairs
// cols == rows == vocabulari.size()
InputArray emission_probabilities_table, // Table with observation emission probabilities
// cols == rows == vocabulary.size()
decoder_mode mode = OCR_DECODER_VITERBI); // HMM Decoding algorithm (only Viterbi for the moment)
protected:
Ptr<OCRHMMDecoder::ClassifierCallback> classifier;
std::string vocabulary;
Mat transition_p;
Mat emission_p;
decoder_mode mode;
};
CV_EXPORTS Ptr<OCRHMMDecoder::ClassifierCallback> loadOCRHMMClassifierNM(const std::string& filename);
}
}
#endif // _OPENCV_TEXT_OCR_HPP_
......@@ -46,6 +46,7 @@ public:
};
//OCR recognition is done in parallel for different detections
template <class T>
class Parallel_OCR: public cv::ParallelLoopBody
{
private:
......@@ -54,12 +55,12 @@ private:
vector< vector<Rect> > &boxes;
vector< vector<string> > &words;
vector< vector<float> > &confidences;
vector< Ptr<OCRTesseract> > &ocrs;
vector< Ptr<T> > &ocrs;
public:
Parallel_OCR(vector<Mat> &_detections, vector<string> &_outputs, vector< vector<Rect> > &_boxes,
vector< vector<string> > &_words, vector< vector<float> > &_confidences,
vector< Ptr<OCRTesseract> > &_ocrs)
vector< Ptr<T> > &_ocrs)
: detections(_detections), outputs(_outputs), boxes(_boxes), words(_words),
confidences(_confidences), ocrs(_ocrs)
{}
......@@ -88,6 +89,7 @@ int main(int argc, char* argv[])
cout << " Usage: " << argv[0] << " [camera_index]" << endl << endl;
cout << " Press 'e' to switch between MSER/CSER regions." << endl;
cout << " Press 'g' to switch between Horizontal and Arbitrary oriented grouping." << endl;
cout << " Press 'o' to switch between OCRTesseract/OCRHMMDecoder recognition." << endl;
cout << " Press 's' to scale down frame size to 320x240." << endl;
cout << " Press 'ESC' to exit." << endl << endl;
......@@ -98,7 +100,7 @@ int main(int argc, char* argv[])
int RECOGNITION = 0;
char *region_types_str[2] = {const_cast<char *>("ERStats"), const_cast<char *>("MSER")};
char *grouping_algorithms_str[2] = {const_cast<char *>("exhaustive_search"), const_cast<char *>("multioriented")};
char *recognitions_str[3] = {const_cast<char *>("Tesseract"), const_cast<char *>("NM_chain_features + KNN"), const_cast<char *>("NM_chain_features + MLP")};
char *recognitions_str[2] = {const_cast<char *>("Tesseract"), const_cast<char *>("NM_chain_features + KNN")};
Mat frame,grey,orig_grey,out_img;
vector<Mat> channels;
......@@ -119,6 +121,7 @@ int main(int argc, char* argv[])
//double t_r = getTickCount();
//Initialize OCR engine (we initialize 10 instances in order to work several recognitions in parallel)
cout << "Initializing OCR engines ..." << endl;
int num_ocrs = 10;
vector< Ptr<OCRTesseract> > ocrs;
for (int o=0; o<num_ocrs; o++)
......@@ -126,6 +129,22 @@ int main(int argc, char* argv[])
ocrs.push_back(OCRTesseract::create());
}
Mat transition_p;
string filename = "OCRHMM_transitions_table.xml";
FileStorage fs(filename, FileStorage::READ);
fs["transition_probabilities"] >> transition_p;
fs.release();
Mat emission_p = Mat::eye(62,62,CV_64FC1);
string voc = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789";
vector< Ptr<OCRHMMDecoder> > decoders;
for (int o=0; o<num_ocrs; o++)
{
decoders.push_back(OCRHMMDecoder::create(loadOCRHMMClassifierNM("OCRHMM_knn_model_data.xml.gz"),
voc, transition_p, emission_p));
}
cout << " Done!" << endl;
//cout << "TIME_OCR_INITIALIZATION_ALT = "<< ((double)getTickCount() - t_r)*1000/getTickFrequency() << endl;
......@@ -216,7 +235,12 @@ int main(int argc, char* argv[])
float scale_img = (float)(600.f/frame.rows);
float scale_font = (float)(2-scale_img)/1.4f;
vector<string> words_detection;
float min_confidence1 = 51.f, min_confidence2 = 60.f;
float min_confidence1 = 0.f, min_confidence2 = 0.f;
if (RECOGNITION == 0)
{
min_confidence1 = 51.f; min_confidence2 = 60.f;
}
vector<Mat> detections;
......@@ -238,25 +262,32 @@ int main(int argc, char* argv[])
vector< vector<string> > words((int)detections.size());
vector< vector<float> > confidences((int)detections.size());
// parallel process detections in batches of ocrs.size()
for (int i=0; i<(int)detections.size(); i=i+(int)ocrs.size())
// parallel process detections in batches of ocrs.size() (== num_ocrs)
for (int i=0; i<(int)detections.size(); i=i+(int)num_ocrs)
{
Range r;
if (i+(int)ocrs.size() <= (int)detections.size())
r = Range(i,i+(int)ocrs.size());
if (i+(int)num_ocrs <= (int)detections.size())
r = Range(i,i+(int)num_ocrs);
else
r = Range(i,(int)detections.size());
parallel_for_(r, Parallel_OCR(detections, outputs, boxes, words, confidences, ocrs));
switch(RECOGNITION)
{
case 0:
parallel_for_(r, Parallel_OCR<OCRTesseract>(detections, outputs, boxes, words, confidences, ocrs));
break;
case 1:
parallel_for_(r, Parallel_OCR<OCRHMMDecoder>(detections, outputs, boxes, words, confidences, decoders));
break;
}
}
for (int i=0; i<(int)detections.size(); i++)
{
outputs[i].erase(remove(outputs[i].begin(), outputs[i].end(), '\n'), outputs[i].end());
//cout << "OCR output = \"" << output << "\" lenght = " << output.size() << endl;
//cout << "OCR output = \"" << outputs[i] << "\" length = " << outputs[i].size() << endl;
if (outputs[i].size() < 3)
continue;
......@@ -308,15 +339,15 @@ int main(int argc, char* argv[])
{
case 103: //g
GROUPING_ALGORITHM = (GROUPING_ALGORITHM+1)%2;
cout << "Grouping switched to " << GROUPING_ALGORITHM << endl;
cout << "Grouping switched to " << grouping_algorithms_str[GROUPING_ALGORITHM] << endl;
break;
case 111: //o
RECOGNITION = (RECOGNITION+1)%2;
cout << "OCR switched to " << recognitions_str[RECOGNITION] << endl;
break;
//case 111: //o
// RECOGNITION = (RECOGNITION+1)%3;
// cout << "OCR switched to " << RECOGNITION << endl;
// break;
case 114: //r
REGION_TYPE = (REGION_TYPE+1)%2;
cout << "Regions switched to " << REGION_TYPE << endl;
cout << "Regions switched to " << region_types_str[REGION_TYPE] << endl;
break;
case 115: //s
downsize = !downsize;
......