Commit 6e4d6bca authored by Vadim Pisarevsky's avatar Vadim Pisarevsky

Merge pull request #321 from lluisgomez:ocr_refactor_even

parents f9d42886 796c26f6
...@@ -69,6 +69,9 @@ public: ...@@ -69,6 +69,9 @@ public:
virtual void run(Mat& image, std::string& output_text, std::vector<Rect>* component_rects=NULL, virtual void run(Mat& image, std::string& output_text, std::vector<Rect>* component_rects=NULL,
std::vector<std::string>* component_texts=NULL, std::vector<float>* component_confidences=NULL, std::vector<std::string>* component_texts=NULL, std::vector<float>* component_confidences=NULL,
int component_level=0) = 0; int component_level=0) = 0;
/** @brief Overload of run() that additionally receives a mask image.

Same output parameters as the overload without mask. How the mask is used is defined by each
derived class.
*/
virtual void run(Mat& image, Mat& mask, std::string& output_text, std::vector<Rect>* component_rects=NULL,
std::vector<std::string>* component_texts=NULL, std::vector<float>* component_confidences=NULL,
int component_level=0) = 0;
}; };
/** @brief OCRTesseract class provides an interface with the tesseract-ocr API (v3.02.02) in C++. /** @brief OCRTesseract class provides an interface with the tesseract-ocr API (v3.02.02) in C++.
...@@ -106,6 +109,10 @@ public: ...@@ -106,6 +109,10 @@ public:
std::vector<std::string>* component_texts=NULL, std::vector<float>* component_confidences=NULL, std::vector<std::string>* component_texts=NULL, std::vector<float>* component_confidences=NULL,
int component_level=0); int component_level=0);
/** @brief Overload of run() that takes an image together with a mask.

The implementation asserts that image is CV_8UC1 or CV_8UC3 and that mask is CV_8UC1. The output
parameters are the same as for the overload without mask.
*/
virtual void run(Mat& image, Mat& mask, std::string& output_text, std::vector<Rect>* component_rects=NULL,
std::vector<std::string>* component_texts=NULL, std::vector<float>* component_confidences=NULL,
int component_level=0);
/** @brief Creates an instance of the OCRTesseract class. Initializes Tesseract. /** @brief Creates an instance of the OCRTesseract class. Initializes Tesseract.
@param datapath the name of the parent directory of tessdata ended with "/", or NULL to use the @param datapath the name of the parent directory of tessdata ended with "/", or NULL to use the
...@@ -170,11 +177,11 @@ public: ...@@ -170,11 +177,11 @@ public:
public: public:
/** @brief Recognize text using HMM. /** @brief Recognize text using HMM.
Takes image on input and returns recognized text in the output_text parameter. Optionally Takes binary image on input and returns recognized text in the output_text parameter. Optionally
provides also the Rects for individual text elements found (e.g. words), and the list of those provides also the Rects for individual text elements found (e.g. words), and the list of those
text elements with their confidence values. text elements with their confidence values.
@param image Input image CV_8UC1 with a single text line (or word). @param image Input binary image CV_8UC1 with a single text line (or word).
@param output_text Output text. Most likely character sequence found by the HMM decoder. @param output_text Output text. Most likely character sequence found by the HMM decoder.
...@@ -193,6 +200,33 @@ public: ...@@ -193,6 +200,33 @@ public:
std::vector<std::string>* component_texts=NULL, std::vector<float>* component_confidences=NULL, std::vector<std::string>* component_texts=NULL, std::vector<float>* component_confidences=NULL,
int component_level=0); int component_level=0);
/** @brief Recognize text using HMM.
Takes an image and a mask (where each connected component corresponds to a segmented character)
on input and returns the recognized text in the output_text parameter. Optionally it also
provides the Rects for the individual text elements found (e.g. words), and the list of those
text elements with their confidence values.
@param image Input image CV_8UC1 or CV_8UC3 with a single text line (or word).
@param mask Input binary image CV_8UC1, same size as the input image. Each connected component
in mask corresponds to a segmented character in the input image.
@param output_text Output text. Most likely character sequence found by the HMM decoder.
@param component_rects If provided the method will output a list of Rects for the individual
text elements found (e.g. words).
@param component_texts If provided the method will output a list of text strings for the
recognition of individual text elements found (e.g. words).
@param component_confidences If provided the method will output a list of confidence values
for the recognition of individual text elements found (e.g. words).
@param component_level Only OCR_LEVEL_WORD is supported.
*/
virtual void run(Mat& image, Mat& mask, std::string& output_text, std::vector<Rect>* component_rects=NULL,
std::vector<std::string>* component_texts=NULL, std::vector<float>* component_confidences=NULL,
int component_level=0);
/** @brief Creates an instance of the OCRHMMDecoder class. Initializes HMMDecoder. /** @brief Creates an instance of the OCRHMMDecoder class. Initializes HMMDecoder.
@param classifier The character classifier with built in feature extractor. @param classifier The character classifier with built in feature extractor.
...@@ -231,7 +265,7 @@ protected: ...@@ -231,7 +265,7 @@ protected:
@param filename The XML or YAML file with the classifier model (e.g. OCRHMM_knn_model_data.xml) @param filename The XML or YAML file with the classifier model (e.g. OCRHMM_knn_model_data.xml)
The default classifier is based in the scene text recognition method proposed by Lukás Neumann & The KNN default classifier is based in the scene text recognition method proposed by Lukás Neumann &
Jiri Matas in [Neumann11b]. Basically, the region (contour) in the input image is normalized to a Jiri Matas in [Neumann11b]. Basically, the region (contour) in the input image is normalized to a
fixed size, while retaining the centroid and aspect ratio, in order to extract a feature vector fixed size, while retaining the centroid and aspect ratio, in order to extract a feature vector
based on gradient orientations along the chain-code of its perimeter. Then, the region is classified based on gradient orientations along the chain-code of its perimeter. Then, the region is classified
...@@ -240,19 +274,32 @@ types. ...@@ -240,19 +274,32 @@ types.
*/ */
CV_EXPORTS Ptr<OCRHMMDecoder::ClassifierCallback> loadOCRHMMClassifierNM(const std::string& filename); CV_EXPORTS Ptr<OCRHMMDecoder::ClassifierCallback> loadOCRHMMClassifierNM(const std::string& filename);
/** @brief Utility function to create a tailored language model transitions table from a given list of words (lexicon). /** @brief Allow to implicitly load the default character classifier when creating an OCRHMMDecoder object.
@param vocabulary The language vocabulary (chars when ascii english text). @param filename The XML or YAML file with the classifier model (e.g. OCRBeamSearch_CNN_model_data.xml.gz)
@param lexicon The list of words that are expected to be found in a particular image. The CNN default classifier is based in the scene text recognition method proposed by Adam Coates &
Andrew NG in [Coates11a]. The character classifier consists in a Single Layer Convolutional Neural Network and
a linear classifier. It is applied to the input image in a sliding window fashion, providing a set of recognitions
at each window location.
*/
CV_EXPORTS Ptr<OCRHMMDecoder::ClassifierCallback> loadOCRHMMClassifierCNN(const std::string& filename);
@param transition_probabilities_table Output table with transition probabilities between character pairs. cols == rows == vocabulary.size(). //! @}
The function calculate frequency statistics of character pairs from the given lexicon and fills the output transition_probabilities_table with them. The transition_probabilities_table can be used as input in the OCRHMMDecoder::create() and OCRBeamSearchDecoder::create() methods. /** @brief Utility function to create a tailored language model transitions table from a given list of words (lexicon).
@note *
- (C++) An alternative would be to load the default generic language transition table provided in the text module samples folder (created from ispell 42869 english words list) : * @param vocabulary The language vocabulary (chars when ascii english text).
<https://github.com/Itseez/opencv_contrib/blob/master/modules/text/samples/OCRHMM_transitions_table.xml> *
*/ * @param lexicon The list of words that are expected to be found in a particular image.
*
* @param transition_probabilities_table Output table with transition probabilities between character pairs. cols == rows == vocabulary.size().
*
* The function calculate frequency statistics of character pairs from the given lexicon and fills the output transition_probabilities_table with them. The transition_probabilities_table can be used as input in the OCRHMMDecoder::create() and OCRBeamSearchDecoder::create() methods.
* @note
* - (C++) An alternative would be to load the default generic language transition table provided in the text module samples folder (created from ispell 42869 english words list) :
* <https://github.com/Itseez/opencv_contrib/blob/master/modules/text/samples/OCRHMM_transitions_table.xml>
* */
CV_EXPORTS void createOCRHMMTransitionsTable(std::string& vocabulary, std::vector<std::string>& lexicon, OutputArray transition_probabilities_table); CV_EXPORTS void createOCRHMMTransitionsTable(std::string& vocabulary, std::vector<std::string>& lexicon, OutputArray transition_probabilities_table);
...@@ -319,6 +366,10 @@ public: ...@@ -319,6 +366,10 @@ public:
std::vector<std::string>* component_texts=NULL, std::vector<float>* component_confidences=NULL, std::vector<std::string>* component_texts=NULL, std::vector<float>* component_confidences=NULL,
int component_level=0); int component_level=0);
/** @brief Overload of run() that takes an image together with a mask.

The implementation asserts that image is CV_8UC1 or CV_8UC3 and that mask is CV_8UC1. The output
parameters are the same as for the overload without mask.
*/
virtual void run(Mat& image, Mat& mask, std::string& output_text, std::vector<Rect>* component_rects=NULL,
std::vector<std::string>* component_texts=NULL, std::vector<float>* component_confidences=NULL,
int component_level=0);
/** @brief Creates an instance of the OCRBeamSearchDecoder class. Initializes HMMDecoder. /** @brief Creates an instance of the OCRBeamSearchDecoder class. Initializes HMMDecoder.
@param classifier The character classifier with built in feature extractor. @param classifier The character classifier with built in feature extractor.
...@@ -359,10 +410,10 @@ protected: ...@@ -359,10 +410,10 @@ protected:
/** @brief Allow to implicitly load the default character classifier when creating an OCRBeamSearchDecoder object. /** @brief Allow to implicitly load the default character classifier when creating an OCRBeamSearchDecoder object.
@param filename The XML or YAML file with the classifier model (e.g. OCRHMM_knn_model_data.xml) @param filename The XML or YAML file with the classifier model (e.g. OCRBeamSearch_CNN_model_data.xml.gz)
The default classifier is based in the scene text recognition method proposed by Adam Coates & The CNN default classifier is based in the scene text recognition method proposed by Adam Coates &
Andrew NG in [Coates11a]. The character classifier sonsists in a Single Layer Convolutional Neural Network and Andrew NG in [Coates11a]. The character classifier consists in a Single Layer Convolutional Neural Network and
a linear classifier. It is applied to the input image in a sliding window fashion, providing a set of recognitions a linear classifier. It is applied to the input image in a sliding window fashion, providing a set of recognitions
at each window location. at each window location.
*/ */
......
...@@ -36,7 +36,7 @@ int main(int argc, char* argv[]) ...@@ -36,7 +36,7 @@ int main(int argc, char* argv[])
return(0); return(0);
} }
string vocabulary = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyx0123456789"; // must have the same order as the clasifier output classes string vocabulary = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789"; // must have the same order as the clasifier output classes
vector<string> lexicon; // a list of words expected to be found on the input image vector<string> lexicon; // a list of words expected to be found on the input image
lexicon.push_back(string("abb")); lexicon.push_back(string("abb"));
lexicon.push_back(string("patata")); lexicon.push_back(string("patata"));
......
/*
* segmented_word_recognition.cpp
*
* A demo program on segmented word recognition.
* Shows the use of the OCRHMMDecoder API with the two provided default character classifiers.
*
* Created on: Jul 31, 2015
* Author: Lluis Gomez i Bigorda <lgomez AT cvc.uab.es>
*/
#include "opencv2/text.hpp"
#include "opencv2/core/utility.hpp"
#include "opencv2/highgui.hpp"
#include "opencv2/imgproc.hpp"
#include <algorithm>
#include <iostream>
#include <string>
#include <vector>
using namespace std;
using namespace cv;
using namespace text;
int main(int argc, char* argv[]) {
const String keys =
"{help h usage ? | | print this message.}"
"{@image | | source image for recognition.}"
"{@mask | | binary segmentation mask where each contour is a character.}"
"{lexicon lex l | | (optional) lexicon provided as a list of comma separated words.}"
;
CommandLineParser parser(argc, argv, keys);
parser.about("\nSegmented word recognition.\nA demo program on segmented word recognition. Shows the use of the OCRHMMDecoder API with the two provided default character classifiers.\n");
String filename1 = parser.get<String>(0);
String filename2 = parser.get<String>(1);
parser.printMessage();
cout << endl << endl;
if ((parser.has("help")) || (filename1.size()==0))
{
return 0;
}
if (!parser.check())
{
parser.printErrors();
return 0;
}
Mat image = imread(filename1);
Mat mask;
if (filename2.size() > 0)
mask = imread(filename2);
else
image.copyTo(mask);
// be sure the mask is a binry image
cvtColor(mask, mask, COLOR_BGR2GRAY);
threshold(mask, mask, 128., 255, THRESH_BINARY);
// character recognition vocabulary
string voc = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789";
// Emission probabilities for the HMM language model (identity matrix by default)
Mat emissionProbabilities = Mat::eye((int)voc.size(), (int)voc.size(), CV_64FC1);
// Bigram transition probabilities for the HMM language model
Mat transitionProbabilities;
string lex = parser.get<string>("lex");
if (lex.size()>0)
{
// Build tailored language model for the provided lexicon
vector<string> lexicon;
size_t pos = 0;
string delimiter = ",";
std::string token;
while ((pos = lex.find(delimiter)) != std::string::npos) {
token = lex.substr(0, pos);
lexicon.push_back(token);
lex.erase(0, pos + delimiter.length());
}
lexicon.push_back(lex);
createOCRHMMTransitionsTable(voc,lexicon,transitionProbabilities);
} else {
// Or load the generic language model (from Aspell English dictionary)
FileStorage fs("./OCRHMM_transitions_table.xml", FileStorage::READ);
fs["transition_probabilities"] >> transitionProbabilities;
fs.release();
}
Ptr<OCRTesseract> ocrTes = OCRTesseract::create();
Ptr<OCRHMMDecoder> ocrNM = OCRHMMDecoder::create(
loadOCRHMMClassifierNM("./OCRHMM_knn_model_data.xml.gz"),
voc, transitionProbabilities, emissionProbabilities);
Ptr<OCRHMMDecoder> ocrCNN = OCRHMMDecoder::create(
loadOCRHMMClassifierCNN("OCRBeamSearch_CNN_model_data.xml.gz"),
voc, transitionProbabilities, emissionProbabilities);
std::string output;
double t_r = (double)getTickCount();
ocrTes->run(mask, output);
output.erase(remove(output.begin(), output.end(), '\n'), output.end());
cout << " OCR_Tesseract output \"" << output << "\". Done in "
<< ((double)getTickCount() - t_r)*1000/getTickFrequency() << " ms." << endl;
t_r = (double)getTickCount();
ocrNM->run(mask, output);
cout << " OCR_NM output \"" << output << "\". Done in "
<< ((double)getTickCount() - t_r)*1000/getTickFrequency() << " ms." << endl;
t_r = (double)getTickCount();
ocrCNN->run(image, mask, output);
cout << " OCR_CNN output \"" << output << "\". Done in "
<< ((double)getTickCount() - t_r)*1000/getTickFrequency() << " ms." << endl;
}
...@@ -73,6 +73,22 @@ void OCRBeamSearchDecoder::run(Mat& image, string& output_text, vector<Rect>* co ...@@ -73,6 +73,22 @@ void OCRBeamSearchDecoder::run(Mat& image, string& output_text, vector<Rect>* co
component_confidences->clear(); component_confidences->clear();
} }
// Base-class implementation of the mask-taking overload. It only checks the
// argument types and empties every output container; no recognition is done here.
void OCRBeamSearchDecoder::run(Mat& image, Mat& mask, string& output_text,
                               vector<Rect>* component_rects,
                               vector<string>* component_texts,
                               vector<float>* component_confidences,
                               int component_level)
{
    // Preconditions: 8-bit gray or BGR image, 8-bit single channel mask,
    // and a supported component level.
    CV_Assert( (image.type() == CV_8UC1) || (image.type() == CV_8UC3) );
    CV_Assert( mask.type() == CV_8UC1 );
    CV_Assert( (component_level == OCR_LEVEL_TEXTLINE) || (component_level == OCR_LEVEL_WORD) );

    // Reset the outputs; the optional ones only when the caller supplied them.
    if (component_confidences)
        component_confidences->clear();
    if (component_texts)
        component_texts->clear();
    if (component_rects)
        component_rects->clear();
    output_text.clear();
}
void OCRBeamSearchDecoder::ClassifierCallback::eval( InputArray image, vector< vector<double> >& recognition_probabilities, vector<int>& oversegmentation) void OCRBeamSearchDecoder::ClassifierCallback::eval( InputArray image, vector< vector<double> >& recognition_probabilities, vector<int>& oversegmentation)
{ {
...@@ -136,7 +152,7 @@ public: ...@@ -136,7 +152,7 @@ public:
if (component_confidences != NULL) if (component_confidences != NULL)
component_confidences->clear(); component_confidences->clear();
// TODO split a line into words // TODO We must split a line into words or specify we only work with words
if(src.type() == CV_8UC3) if(src.type() == CV_8UC3)
{ {
...@@ -174,14 +190,7 @@ public: ...@@ -174,14 +190,7 @@ public:
} }
//TODO it would be interesting to have a hash table with a vector of booleans
// but this is not possible when we have a large number of possible segmentations.
//vector<bool> visited_nodes(pow(2,oversegmentation.size()),false); // hash table for visited nodes
// options are using std::set<unsigned long long int> to store only the keys of visited nodes
// but will deteriorate the time performance.
set<unsigned long long int> visited_nodes; //TODO make it member of class set<unsigned long long int> visited_nodes; //TODO make it member of class
// it is also possible to reduce the number of seg. points in some way (e.g. use only seg.points
// for which there is a change on the class prediction)
vector<int> start_segmentation; vector<int> start_segmentation;
start_segmentation.push_back(oversegmentation[0]); start_segmentation.push_back(oversegmentation[0]);
...@@ -221,6 +230,21 @@ public: ...@@ -221,6 +230,21 @@ public:
return; return;
} }
// Mask-taking overload of run(). The mask is only type-checked (must be CV_8UC1)
// and then ignored: the call is forwarded unchanged to the overload without mask.
void run( Mat& src,
Mat& mask,
string& out_sequence,
vector<Rect>* component_rects,
vector<string>* component_texts,
vector<float>* component_confidences,
int component_level)
{
CV_Assert( mask.type() == CV_8UC1 );
// Nothing to do with a mask here. We do sliding window anyway.
run( src, out_sequence, component_rects, component_texts, component_confidences, component_level );
}
private: private:
//////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////
...@@ -421,22 +445,29 @@ OCRBeamSearchClassifierCNN::OCRBeamSearchClassifierCNN (const string& filename) ...@@ -421,22 +445,29 @@ OCRBeamSearchClassifierCNN::OCRBeamSearchClassifierCNN (const string& filename)
fs["feature_min"] >> feature_min; fs["feature_min"] >> feature_min;
fs["feature_max"] >> feature_max; fs["feature_max"] >> feature_max;
fs.release(); fs.release();
// TODO check all matrix dimensions match correctly and no one is empty
} }
else else
CV_Error(Error::StsBadArg, "Default classifier data file not found!"); CV_Error(Error::StsBadArg, "Default classifier data file not found!");
// check all matrix dimensions match correctly and no one is empty
CV_Assert( (M.cols > 0) && (M.rows > 0) );
CV_Assert( (P.cols > 0) && (P.rows > 0) );
CV_Assert( (kernels.cols > 0) && (kernels.rows > 0) );
CV_Assert( (weights.cols > 0) && (weights.rows > 0) );
CV_Assert( (feature_min.cols > 0) && (feature_min.rows > 0) );
CV_Assert( (feature_max.cols > 0) && (feature_max.rows > 0) );
nr_feature = weights.rows; nr_feature = weights.rows;
nr_class = weights.cols; nr_class = weights.cols;
// TODO some of this can be inferred from the input file (e.g. patch size must be sqrt(filters.cols)) patch_size = (int)sqrt(kernels.cols);
step_size = 4; // algorithm internal parameters
window_size = 32; window_size = 32;
quad_size = 12; quad_size = 12;
patch_size = 8;
num_quads = 25; num_quads = 25;
num_tiles = 25; num_tiles = 25;
alpha = 0.5; alpha = 0.5;
step_size = 4; // TODO showld this be a parameter for the user?
} }
...@@ -459,7 +490,6 @@ void OCRBeamSearchClassifierCNN::eval( InputArray _src, vector< vector<double> > ...@@ -459,7 +490,6 @@ void OCRBeamSearchClassifierCNN::eval( InputArray _src, vector< vector<double> >
cvtColor(src,src,COLOR_RGB2GRAY); cvtColor(src,src,COLOR_RGB2GRAY);
} }
// TODO shall we resize the input image or make a copy ?
resize(src,src,Size(window_size*src.cols/src.rows,window_size)); resize(src,src,Size(window_size*src.cols/src.rows,window_size));
int seg_points = 0; int seg_points = 0;
...@@ -555,8 +585,8 @@ void OCRBeamSearchClassifierCNN::eval( InputArray _src, vector< vector<double> > ...@@ -555,8 +585,8 @@ void OCRBeamSearchClassifierCNN::eval( InputArray _src, vector< vector<double> >
double *p = new double[nr_class]; double *p = new double[nr_class];
double predict_label = eval_feature(feature,p); double predict_label = eval_feature(feature,p);
//cout << " Prediction: " << vocabulary[predict_label] << " with probability " << p[0] << endl; //cout << " Prediction: " << vocabulary[predict_label] << " with probability " << p[0] << endl;
if (predict_label < 0) // TODO use cvError if (predict_label < 0)
cout << "OCRBeamSearchClassifierCNN::eval Error: unexpected prediction in eval_feature()" << endl; CV_Error(Error::StsInternal, "OCRBeamSearchClassifierCNN::eval Error: unexpected prediction in eval_feature()");
seg_points++; seg_points++;
......
This diff is collapsed.
...@@ -70,6 +70,22 @@ void OCRTesseract::run(Mat& image, string& output_text, vector<Rect>* component_ ...@@ -70,6 +70,22 @@ void OCRTesseract::run(Mat& image, string& output_text, vector<Rect>* component_
component_confidences->clear(); component_confidences->clear();
} }
// Base-class implementation of the mask-taking overload. It only checks the
// argument types and empties every output container; no recognition is done here.
void OCRTesseract::run(Mat& image, Mat& mask, string& output_text,
                       vector<Rect>* component_rects,
                       vector<string>* component_texts,
                       vector<float>* component_confidences,
                       int component_level)
{
    // Preconditions: 8-bit gray or BGR image, 8-bit single channel mask,
    // and a supported component level.
    CV_Assert( (image.type() == CV_8UC1) || (image.type() == CV_8UC3) );
    CV_Assert( mask.type() == CV_8UC1 );
    CV_Assert( (component_level == OCR_LEVEL_TEXTLINE) || (component_level == OCR_LEVEL_WORD) );

    // Reset the outputs; the optional ones only when the caller supplied them.
    if (component_confidences)
        component_confidences->clear();
    if (component_texts)
        component_texts->clear();
    if (component_rects)
        component_rects->clear();
    output_text.clear();
}
class OCRTesseractImpl : public OCRTesseract class OCRTesseractImpl : public OCRTesseract
{ {
private: private:
...@@ -189,6 +205,16 @@ public: ...@@ -189,6 +205,16 @@ public:
#endif #endif
} }
// Mask-taking overload of run(). Both inputs are type-checked (mask: CV_8UC1,
// image: CV_8UC1 or CV_8UC3), then the call is forwarded to the overload without
// mask, passing `mask` as the image to recognize; `image` itself is not used further.
// NOTE(review): recognizing on the mask rather than on the image looks intentional
// (the mask is the binarized text) -- confirm.
void run(Mat& image, Mat& mask, string& output, vector<Rect>* component_rects=NULL,
vector<string>* component_texts=NULL, vector<float>* component_confidences=NULL,
int component_level=0)
{
CV_Assert( mask.type() == CV_8UC1 );
CV_Assert( (image.type() == CV_8UC1) || (image.type() == CV_8UC3) );
run( mask, output, component_rects, component_texts, component_confidences, component_level);
}
}; };
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment