Commit 6e4d6bca authored by Vadim Pisarevsky's avatar Vadim Pisarevsky

Merge pull request #321 from lluisgomez:ocr_refactor_even

parents f9d42886 796c26f6
......@@ -69,6 +69,9 @@ public:
virtual void run(Mat& image, std::string& output_text, std::vector<Rect>* component_rects=NULL,
std::vector<std::string>* component_texts=NULL, std::vector<float>* component_confidences=NULL,
int component_level=0) = 0;
virtual void run(Mat& image, Mat& mask, std::string& output_text, std::vector<Rect>* component_rects=NULL,
std::vector<std::string>* component_texts=NULL, std::vector<float>* component_confidences=NULL,
int component_level=0) = 0;
};
/** @brief OCRTesseract class provides an interface with the tesseract-ocr API (v3.02.02) in C++.
......@@ -106,6 +109,10 @@ public:
std::vector<std::string>* component_texts=NULL, std::vector<float>* component_confidences=NULL,
int component_level=0);
virtual void run(Mat& image, Mat& mask, std::string& output_text, std::vector<Rect>* component_rects=NULL,
std::vector<std::string>* component_texts=NULL, std::vector<float>* component_confidences=NULL,
int component_level=0);
/** @brief Creates an instance of the OCRTesseract class. Initializes Tesseract.
@param datapath the name of the parent directory of tessdata ended with "/", or NULL to use the
......@@ -170,11 +177,11 @@ public:
public:
/** @brief Recognize text using HMM.
Takes image on input and returns recognized text in the output_text parameter. Optionally
Takes binary image on input and returns recognized text in the output_text parameter. Optionally
provides also the Rects for individual text elements found (e.g. words), and the list of those
text elements with their confidence values.
@param image Input image CV_8UC1 with a single text line (or word).
@param image Input binary image CV_8UC1 with a single text line (or word).
@param output_text Output text. Most likely character sequence found by the HMM decoder.
......@@ -193,6 +200,33 @@ public:
std::vector<std::string>* component_texts=NULL, std::vector<float>* component_confidences=NULL,
int component_level=0);
/** @brief Recognize text using HMM.
Takes an image and a mask (where each connected component corresponds to a segmented character)
on input and returns recognized text in the output_text parameter. Optionally
provides also the Rects for individual text elements found (e.g. words), and the list of those
text elements with their confidence values.
@param image Input image CV_8UC1 or CV_8UC3 with a single text line (or word).
@param mask Input binary image CV_8UC1 same size as input image. Each connected component in mask corresponds to a segmented character in the input image.
@param output_text Output text. Most likely character sequence found by the HMM decoder.
@param component_rects If provided the method will output a list of Rects for the individual
text elements found (e.g. words).
@param component_texts If provided the method will output a list of text strings for the
recognition of individual text elements found (e.g. words).
@param component_confidences If provided the method will output a list of confidence values
for the recognition of individual text elements found (e.g. words).
@param component_level Only OCR_LEVEL_WORD is supported.
*/
virtual void run(Mat& image, Mat& mask, std::string& output_text, std::vector<Rect>* component_rects=NULL,
std::vector<std::string>* component_texts=NULL, std::vector<float>* component_confidences=NULL,
int component_level=0);
/** @brief Creates an instance of the OCRHMMDecoder class. Initializes HMMDecoder.
@param classifier The character classifier with built in feature extractor.
......@@ -231,7 +265,7 @@ protected:
@param filename The XML or YAML file with the classifier model (e.g. OCRHMM_knn_model_data.xml)
The default classifier is based in the scene text recognition method proposed by Lukás Neumann &
The KNN default classifier is based in the scene text recognition method proposed by Lukás Neumann &
Jiri Matas in [Neumann11b]. Basically, the region (contour) in the input image is normalized to a
fixed size, while retaining the centroid and aspect ratio, in order to extract a feature vector
based on gradient orientations along the chain-code of its perimeter. Then, the region is classified
......@@ -240,19 +274,32 @@ types.
*/
CV_EXPORTS Ptr<OCRHMMDecoder::ClassifierCallback> loadOCRHMMClassifierNM(const std::string& filename);
/** @brief Utility function to create a tailored language model transitions table from a given list of words (lexicon).
@param vocabulary The language vocabulary (chars when ascii english text).
@param lexicon The list of words that are expected to be found in a particular image.
@param transition_probabilities_table Output table with transition probabilities between character pairs. cols == rows == vocabulary.size().
The function calculate frequency statistics of character pairs from the given lexicon and fills the output transition_probabilities_table with them. The transition_probabilities_table can be used as input in the OCRHMMDecoder::create() and OCRBeamSearchDecoder::create() methods.
@note
- (C++) An alternative would be to load the default generic language transition table provided in the text module samples folder (created from ispell 42869 english words list) :
<https://github.com/Itseez/opencv_contrib/blob/master/modules/text/samples/OCRHMM_transitions_table.xml>
*/
/** @brief Allow to implicitly load the default character classifier when creating an OCRHMMDecoder object.
@param filename The XML or YAML file with the classifier model (e.g. OCRBeamSearch_CNN_model_data.xml.gz)
The CNN default classifier is based in the scene text recognition method proposed by Adam Coates &
Andrew NG in [Coates11a]. The character classifier consists in a Single Layer Convolutional Neural Network and
a linear classifier. It is applied to the input image in a sliding window fashion, providing a set of recognitions
at each window location.
*/
CV_EXPORTS Ptr<OCRHMMDecoder::ClassifierCallback> loadOCRHMMClassifierCNN(const std::string& filename);
//! @}
/** @brief Utility function to create a tailored language model transitions table from a given list of words (lexicon).
*
* @param vocabulary The language vocabulary (chars when ascii english text).
*
* @param lexicon The list of words that are expected to be found in a particular image.
*
* @param transition_probabilities_table Output table with transition probabilities between character pairs. cols == rows == vocabulary.size().
*
* The function calculate frequency statistics of character pairs from the given lexicon and fills the output transition_probabilities_table with them. The transition_probabilities_table can be used as input in the OCRHMMDecoder::create() and OCRBeamSearchDecoder::create() methods.
* @note
* - (C++) An alternative would be to load the default generic language transition table provided in the text module samples folder (created from ispell 42869 english words list) :
* <https://github.com/Itseez/opencv_contrib/blob/master/modules/text/samples/OCRHMM_transitions_table.xml>
* */
CV_EXPORTS void createOCRHMMTransitionsTable(std::string& vocabulary, std::vector<std::string>& lexicon, OutputArray transition_probabilities_table);
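A minimal sketch of building and consuming the tailored table (it mirrors the segmented_word_recognition.cpp sample further down; the lexicon words are placeholders):
    std::string voc = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789";
    std::vector<std::string> lexicon;  // words expected in the target image
    lexicon.push_back("abb");
    lexicon.push_back("patata");
    cv::Mat transition_probabilities;
    cv::text::createOCRHMMTransitionsTable(voc, lexicon, transition_probabilities);
    // transition_probabilities can now be passed to OCRHMMDecoder::create() or OCRBeamSearchDecoder::create()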
......@@ -319,6 +366,10 @@ public:
std::vector<std::string>* component_texts=NULL, std::vector<float>* component_confidences=NULL,
int component_level=0);
virtual void run(Mat& image, Mat& mask, std::string& output_text, std::vector<Rect>* component_rects=NULL,
std::vector<std::string>* component_texts=NULL, std::vector<float>* component_confidences=NULL,
int component_level=0);
/** @brief Creates an instance of the OCRBeamSearchDecoder class. Initializes HMMDecoder.
@param classifier The character classifier with built in feature extractor.
......@@ -359,10 +410,10 @@ protected:
/** @brief Allow to implicitly load the default character classifier when creating an OCRBeamSearchDecoder object.
@param filename The XML or YAML file with the classifier model (e.g. OCRHMM_knn_model_data.xml)
@param filename The XML or YAML file with the classifier model (e.g. OCRBeamSearch_CNN_model_data.xml.gz)
The default classifier is based in the scene text recognition method proposed by Adam Coates &
Andrew NG in [Coates11a]. The character classifier sonsists in a Single Layer Convolutional Neural Network and
The CNN default classifier is based in the scene text recognition method proposed by Adam Coates &
Andrew NG in [Coates11a]. The character classifier consists in a Single Layer Convolutional Neural Network and
a linear classifier. It is applied to the input image in a sliding window fashion, providing a set of recognitions
at each window location.
*/
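For context, a usage sketch of the loader documented here (the loader name, loadOCRBeamSearchClassifierCNN, is not visible in this hunk and is assumed from the rest of the module; the input image and language-model tables are placeholders):
    cv::Mat word_image = cv::imread("word.jpg");  // a cropped single-word image, CV_8UC1 or CV_8UC3
    std::string voc = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789";
    cv::Mat transitions, emissions;  // e.g. transitions from OCRHMM_transitions_table.xml, emissions an identity matrix
    cv::Ptr<cv::text::OCRBeamSearchDecoder> ocr = cv::text::OCRBeamSearchDecoder::create(
            cv::text::loadOCRBeamSearchClassifierCNN("OCRBeamSearch_CNN_model_data.xml.gz"),
            voc, transitions, emissions);
    std::string text;
    ocr->run(word_image, text);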
......
......@@ -36,7 +36,7 @@ int main(int argc, char* argv[])
return(0);
}
string vocabulary = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyx0123456789"; // must have the same order as the clasifier output classes
string vocabulary = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789"; // must have the same order as the clasifier output classes
vector<string> lexicon; // a list of words expected to be found on the input image
lexicon.push_back(string("abb"));
lexicon.push_back(string("patata"));
......
/*
* segmented_word_recognition.cpp
*
* A demo program on segmented word recognition.
* Shows the use of the OCRHMMDecoder API with the two provided default character classifiers.
*
* Created on: Jul 31, 2015
* Author: Lluis Gomez i Bigorda <lgomez AT cvc.uab.es>
*/
#include "opencv2/text.hpp"
#include "opencv2/core/utility.hpp"
#include "opencv2/highgui.hpp"
#include "opencv2/imgproc.hpp"
#include <iostream>
using namespace std;
using namespace cv;
using namespace text;
int main(int argc, char* argv[]) {
const String keys =
"{help h usage ? | | print this message.}"
"{@image | | source image for recognition.}"
"{@mask | | binary segmentation mask where each contour is a character.}"
"{lexicon lex l | | (optional) lexicon provided as a list of comma separated words.}"
;
CommandLineParser parser(argc, argv, keys);
parser.about("\nSegmented word recognition.\nA demo program on segmented word recognition. Shows the use of the OCRHMMDecoder API with the two provided default character classifiers.\n");
String filename1 = parser.get<String>(0);
String filename2 = parser.get<String>(1);
parser.printMessage();
cout << endl << endl;
if ((parser.has("help")) || (filename1.size()==0))
{
return 0;
}
if (!parser.check())
{
parser.printErrors();
return 0;
}
Mat image = imread(filename1);
Mat mask;
if (filename2.size() > 0)
mask = imread(filename2);
else
image.copyTo(mask);
// make sure the mask is a binary image
cvtColor(mask, mask, COLOR_BGR2GRAY);
threshold(mask, mask, 128., 255, THRESH_BINARY);
// character recognition vocabulary
string voc = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789";
// Emission probabilities for the HMM language model (identity matrix by default)
Mat emissionProbabilities = Mat::eye((int)voc.size(), (int)voc.size(), CV_64FC1);
// Bigram transition probabilities for the HMM language model
Mat transitionProbabilities;
string lex = parser.get<string>("lex");
if (lex.size()>0)
{
// Build tailored language model for the provided lexicon
vector<string> lexicon;
size_t pos = 0;
string delimiter = ",";
std::string token;
while ((pos = lex.find(delimiter)) != std::string::npos) {
token = lex.substr(0, pos);
lexicon.push_back(token);
lex.erase(0, pos + delimiter.length());
}
lexicon.push_back(lex);
createOCRHMMTransitionsTable(voc,lexicon,transitionProbabilities);
} else {
// Or load the generic language model (from Aspell English dictionary)
FileStorage fs("./OCRHMM_transitions_table.xml", FileStorage::READ);
fs["transition_probabilities"] >> transitionProbabilities;
fs.release();
}
Ptr<OCRTesseract> ocrTes = OCRTesseract::create();
Ptr<OCRHMMDecoder> ocrNM = OCRHMMDecoder::create(
loadOCRHMMClassifierNM("./OCRHMM_knn_model_data.xml.gz"),
voc, transitionProbabilities, emissionProbabilities);
Ptr<OCRHMMDecoder> ocrCNN = OCRHMMDecoder::create(
loadOCRHMMClassifierCNN("OCRBeamSearch_CNN_model_data.xml.gz"),
voc, transitionProbabilities, emissionProbabilities);
std::string output;
double t_r = (double)getTickCount();
ocrTes->run(mask, output);
output.erase(remove(output.begin(), output.end(), '\n'), output.end());
cout << " OCR_Tesseract output \"" << output << "\". Done in "
<< ((double)getTickCount() - t_r)*1000/getTickFrequency() << " ms." << endl;
t_r = (double)getTickCount();
ocrNM->run(mask, output);
cout << " OCR_NM output \"" << output << "\". Done in "
<< ((double)getTickCount() - t_r)*1000/getTickFrequency() << " ms." << endl;
t_r = (double)getTickCount();
ocrCNN->run(image, mask, output);
cout << " OCR_CNN output \"" << output << "\". Done in "
<< ((double)getTickCount() - t_r)*1000/getTickFrequency() << " ms." << endl;
}
......@@ -73,6 +73,22 @@ void OCRBeamSearchDecoder::run(Mat& image, string& output_text, vector<Rect>* co
component_confidences->clear();
}
void OCRBeamSearchDecoder::run(Mat& image, Mat& mask, string& output_text, vector<Rect>* component_rects,
vector<string>* component_texts, vector<float>* component_confidences,
int component_level)
{
CV_Assert( (image.type() == CV_8UC1) || (image.type() == CV_8UC3) );
CV_Assert( mask.type() == CV_8UC1 );
CV_Assert( (component_level == OCR_LEVEL_TEXTLINE) || (component_level == OCR_LEVEL_WORD) );
output_text.clear();
if (component_rects != NULL)
component_rects->clear();
if (component_texts != NULL)
component_texts->clear();
if (component_confidences != NULL)
component_confidences->clear();
}
void OCRBeamSearchDecoder::ClassifierCallback::eval( InputArray image, vector< vector<double> >& recognition_probabilities, vector<int>& oversegmentation)
{
......@@ -136,7 +152,7 @@ public:
if (component_confidences != NULL)
component_confidences->clear();
// TODO split a line into words
// TODO We must split a line into words or specify we only work with words
if(src.type() == CV_8UC3)
{
......@@ -174,14 +190,7 @@ public:
}
//TODO it would be interesting to have a hash table with a vector of booleans
// but this is not possible when we have a large number of possible segmentations.
//vector<bool> visited_nodes(pow(2,oversegmentation.size()),false); // hash table for visited nodes
// options are using std::set<unsigned long long int> to store only the keys of visited nodes
// but will deteriorate the time performance.
set<unsigned long long int> visited_nodes; //TODO make it member of class
// it is also possible to reduce the number of seg. points in some way (e.g. use only seg.points
// for which there is a change on the class prediction)
vector<int> start_segmentation;
start_segmentation.push_back(oversegmentation[0]);
......@@ -221,6 +230,21 @@ public:
return;
}
void run( Mat& src,
Mat& mask,
string& out_sequence,
vector<Rect>* component_rects,
vector<string>* component_texts,
vector<float>* component_confidences,
int component_level)
{
CV_Assert( mask.type() == CV_8UC1 );
// Nothing to do with a mask here. We do a sliding window anyway.
run( src, out_sequence, component_rects, component_texts, component_confidences, component_level );
}
private:
////////////////////////////////////////////////////////////
......@@ -421,22 +445,29 @@ OCRBeamSearchClassifierCNN::OCRBeamSearchClassifierCNN (const string& filename)
fs["feature_min"] >> feature_min;
fs["feature_max"] >> feature_max;
fs.release();
// TODO check all matrix dimensions match correctly and no one is empty
}
else
CV_Error(Error::StsBadArg, "Default classifier data file not found!");
nr_feature = weights.rows;
nr_class = weights.cols;
// TODO some of this can be inferred from the input file (e.g. patch size must be sqrt(filters.cols))
step_size = 4;
// check all matrix dimensions match correctly and no one is empty
CV_Assert( (M.cols > 0) && (M.rows > 0) );
CV_Assert( (P.cols > 0) && (P.rows > 0) );
CV_Assert( (kernels.cols > 0) && (kernels.rows > 0) );
CV_Assert( (weights.cols > 0) && (weights.rows > 0) );
CV_Assert( (feature_min.cols > 0) && (feature_min.rows > 0) );
CV_Assert( (feature_max.cols > 0) && (feature_max.rows > 0) );
nr_feature = weights.rows;
nr_class = weights.cols;
patch_size = (int)sqrt(kernels.cols);
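// e.g. for the default model the convolution kernels are 8x8 patches, so kernels.cols == 64
// and patch_size == 8, matching the value that was hard-coded before this change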
// algorithm internal parameters
window_size = 32;
quad_size = 12;
patch_size = 8;
num_quads = 25;
num_tiles = 25;
alpha = 0.5;
step_size = 4; // TODO should this be a parameter for the user?
}
......@@ -459,7 +490,6 @@ void OCRBeamSearchClassifierCNN::eval( InputArray _src, vector< vector<double> >
cvtColor(src,src,COLOR_RGB2GRAY);
}
// TODO shall we resize the input image or make a copy ?
resize(src,src,Size(window_size*src.cols/src.rows,window_size));
int seg_points = 0;
......@@ -555,8 +585,8 @@ void OCRBeamSearchClassifierCNN::eval( InputArray _src, vector< vector<double> >
double *p = new double[nr_class];
double predict_label = eval_feature(feature,p);
//cout << " Prediction: " << vocabulary[predict_label] << " with probability " << p[0] << endl;
if (predict_label < 0) // TODO use cvError
cout << "OCRBeamSearchClassifierCNN::eval Error: unexpected prediction in eval_feature()" << endl;
if (predict_label < 0)
CV_Error(Error::StsInternal, "OCRBeamSearchClassifierCNN::eval Error: unexpected prediction in eval_feature()");
seg_points++;
......
......@@ -70,6 +70,22 @@ void OCRTesseract::run(Mat& image, string& output_text, vector<Rect>* component_
component_confidences->clear();
}
void OCRTesseract::run(Mat& image, Mat& mask, string& output_text, vector<Rect>* component_rects,
vector<string>* component_texts, vector<float>* component_confidences,
int component_level)
{
CV_Assert( (image.type() == CV_8UC1) || (image.type() == CV_8UC3) );
CV_Assert( mask.type() == CV_8UC1 );
CV_Assert( (component_level == OCR_LEVEL_TEXTLINE) || (component_level == OCR_LEVEL_WORD) );
output_text.clear();
if (component_rects != NULL)
component_rects->clear();
if (component_texts != NULL)
component_texts->clear();
if (component_confidences != NULL)
component_confidences->clear();
}
class OCRTesseractImpl : public OCRTesseract
{
private:
......@@ -189,6 +205,16 @@ public:
#endif
}
void run(Mat& image, Mat& mask, string& output, vector<Rect>* component_rects=NULL,
vector<string>* component_texts=NULL, vector<float>* component_confidences=NULL,
int component_level=0)
{
CV_Assert( mask.type() == CV_8UC1 );
CV_Assert( (image.type() == CV_8UC1) || (image.type() == CV_8UC3) );
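// The mask already isolates the characters, so it is passed to Tesseract as the image to
// recognize; the original image is only type-checked here.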
run( mask, output, component_rects, component_texts, component_confidences, component_level);
}
};
......