Commit 6e4d6bca authored by Vadim Pisarevsky's avatar Vadim Pisarevsky

Merge pull request #321 from lluisgomez:ocr_refactor_even

parents f9d42886 796c26f6
@@ -69,6 +69,9 @@ public:
virtual void run(Mat& image, std::string& output_text, std::vector<Rect>* component_rects=NULL,
std::vector<std::string>* component_texts=NULL, std::vector<float>* component_confidences=NULL,
int component_level=0) = 0;
virtual void run(Mat& image, Mat& mask, std::string& output_text, std::vector<Rect>* component_rects=NULL,
std::vector<std::string>* component_texts=NULL, std::vector<float>* component_confidences=NULL,
int component_level=0) = 0;
};
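The new two-Mat overload accepts a binary segmentation mask alongside the input image. A minimal sketch of calling it through OCRTesseract (a hypothetical snippet: the file names are placeholders, and the mask must be CV_8UC1 with one connected component per character):

#include "opencv2/text.hpp"
#include "opencv2/highgui.hpp"
#include <iostream>

int main()
{
    cv::Mat image = cv::imread("word.jpg");                            // CV_8UC3 input
    cv::Mat mask  = cv::imread("word_mask.png", cv::IMREAD_GRAYSCALE); // CV_8UC1, one blob per character
    cv::Ptr<cv::text::OCRTesseract> ocr = cv::text::OCRTesseract::create();
    std::string text;
    ocr->run(image, mask, text); // mask-guided variant of run()
    std::cout << text << std::endl;
    return 0;
}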
/** @brief OCRTesseract class provides an interface with the tesseract-ocr API (v3.02.02) in C++.
@@ -106,6 +109,10 @@ public:
std::vector<std::string>* component_texts=NULL, std::vector<float>* component_confidences=NULL,
int component_level=0);
virtual void run(Mat& image, Mat& mask, std::string& output_text, std::vector<Rect>* component_rects=NULL,
std::vector<std::string>* component_texts=NULL, std::vector<float>* component_confidences=NULL,
int component_level=0);
/** @brief Creates an instance of the OCRTesseract class. Initializes Tesseract.
@param datapath the name of the parent directory of tessdata ended with "/", or NULL to use the
@@ -170,11 +177,11 @@ public:
public:
/** @brief Recognize text using HMM.
Takes image on input and returns recognized text in the output_text parameter. Optionally
Takes binary image on input and returns recognized text in the output_text parameter. Optionally
provides also the Rects for individual text elements found (e.g. words), and the list of those
text elements with their confidence values.
@param image Input image CV_8UC1 with a single text line (or word).
@param image Input binary image CV_8UC1 with a single text line (or word).
@param output_text Output text. Most likely character sequence found by the HMM decoder.
@@ -193,6 +200,33 @@ public:
std::vector<std::string>* component_texts=NULL, std::vector<float>* component_confidences=NULL,
int component_level=0);
/** @brief Recognize text using HMM.
Takes an image and a mask (where each connected component corresponds to a segmented character)
on input and returns recognized text in the output_text parameter. Optionally
provides also the Rects for individual text elements found (e.g. words), and the list of those
text elements with their confidence values.
@param image Input image CV_8UC1 or CV_8UC3 with a single text line (or word).
@param mask Input binary image CV_8UC1, of the same size as the input image. Each connected component in the mask corresponds to a segmented character in the input image.
@param output_text Output text. Most likely character sequence found by the HMM decoder.
@param component_rects If provided the method will output a list of Rects for the individual
text elements found (e.g. words).
@param component_texts If provided the method will output a list of text strings for the
recognition of individual text elements found (e.g. words).
@param component_confidences If provided the method will output a list of confidence values
for the recognition of individual text elements found (e.g. words).
@param component_level Only OCR_LEVEL_WORD is supported.
*/
virtual void run(Mat& image, Mat& mask, std::string& output_text, std::vector<Rect>* component_rects=NULL,
std::vector<std::string>* component_texts=NULL, std::vector<float>* component_confidences=NULL,
int component_level=0);
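For context, a minimal sketch of driving this mask-based overload end to end, mirroring the segmented_word_recognition.cpp sample added in this PR (the file names and the source of the transitions table are assumptions):

Mat image = imread("word.jpg");                        // CV_8UC1 or CV_8UC3
Mat mask  = imread("word_mask.png", IMREAD_GRAYSCALE); // CV_8UC1, one blob per character
string voc = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789";
Mat transition_p;                                      // e.g. loaded from OCRHMM_transitions_table.xml
Mat emission_p = Mat::eye((int)voc.size(), (int)voc.size(), CV_64FC1);
Ptr<OCRHMMDecoder> ocr = OCRHMMDecoder::create(
        loadOCRHMMClassifierCNN("OCRBeamSearch_CNN_model_data.xml.gz"),
        voc, transition_p, emission_p);
string output;
ocr->run(image, mask, output);                         // recognition guided by the segmentation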
/** @brief Creates an instance of the OCRHMMDecoder class. Initializes HMMDecoder.
@param classifier The character classifier with built in feature extractor.
@@ -231,7 +265,7 @@ protected:
@param filename The XML or YAML file with the classifier model (e.g. OCRHMM_knn_model_data.xml)
The default classifier is based in the scene text recognition method proposed by Lukás Neumann &
The KNN default classifier is based on the scene text recognition method proposed by Lukás Neumann &
Jiri Matas in [Neumann11b]. Basically, the region (contour) in the input image is normalized to a
fixed size, while retaining the centroid and aspect ratio, in order to extract a feature vector
based on gradient orientations along the chain-code of its perimeter. Then, the region is classified
@@ -240,19 +274,32 @@ types.
*/
CV_EXPORTS Ptr<OCRHMMDecoder::ClassifierCallback> loadOCRHMMClassifierNM(const std::string& filename);
/** @brief Allows the implicit loading of the default character classifier when creating an OCRHMMDecoder object.

@param filename The XML or YAML file with the classifier model (e.g. OCRBeamSearch_CNN_model_data.xml.gz)

The CNN default classifier is based on the scene text recognition method proposed by Adam Coates &
Andrew Ng in [Coates11a]. The character classifier consists of a Single Layer Convolutional Neural Network and
a linear classifier. It is applied to the input image in a sliding window fashion, providing a set of recognitions
at each window location.
 */
CV_EXPORTS Ptr<OCRHMMDecoder::ClassifierCallback> loadOCRHMMClassifierCNN(const std::string& filename);

//! @}
/** @brief Utility function to create a tailored language model transitions table from a given list of words (lexicon).
*
* @param vocabulary The language vocabulary (chars when ascii english text).
*
* @param lexicon The list of words that are expected to be found in a particular image.
*
* @param transition_probabilities_table Output table with transition probabilities between character pairs. cols == rows == vocabulary.size().
*
* The function calculates frequency statistics of character pairs from the given lexicon and fills the output transition_probabilities_table with them. The transition_probabilities_table can be used as input in the OCRHMMDecoder::create() and OCRBeamSearchDecoder::create() methods.
* @note
* - (C++) An alternative would be to load the default generic language transition table provided in the text module samples folder (created from ispell 42869 english words list) :
* <https://github.com/Itseez/opencv_contrib/blob/master/modules/text/samples/OCRHMM_transitions_table.xml>
* */
CV_EXPORTS void createOCRHMMTransitionsTable(std::string& vocabulary, std::vector<std::string>& lexicon, OutputArray transition_probabilities_table);
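A short usage sketch for this utility, following the lexicon handling in the samples (the two lexicon words are placeholders):

string voc = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789";
vector<string> lexicon;
lexicon.push_back("abb");
lexicon.push_back("patata");
Mat transition_p;
createOCRHMMTransitionsTable(voc, lexicon, transition_p);
// transition_p is now a voc.size() x voc.size() CV_64FC1 table of bigram probabilities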
@@ -319,6 +366,10 @@ public:
std::vector<std::string>* component_texts=NULL, std::vector<float>* component_confidences=NULL,
int component_level=0);
virtual void run(Mat& image, Mat& mask, std::string& output_text, std::vector<Rect>* component_rects=NULL,
std::vector<std::string>* component_texts=NULL, std::vector<float>* component_confidences=NULL,
int component_level=0);
/** @brief Creates an instance of the OCRBeamSearchDecoder class. Initializes HMMDecoder.
@param classifier The character classifier with built in feature extractor.
@@ -359,10 +410,10 @@ protected:
/** @brief Allows the implicit loading of the default character classifier when creating an OCRBeamSearchDecoder object.
@param filename The XML or YAML file with the classifier model (e.g. OCRHMM_knn_model_data.xml)
@param filename The XML or YAML file with the classifier model (e.g. OCRBeamSearch_CNN_model_data.xml.gz)
The default classifier is based in the scene text recognition method proposed by Adam Coates &
Andrew NG in [Coates11a]. The character classifier sonsists in a Single Layer Convolutional Neural Network and
The CNN default classifier is based on the scene text recognition method proposed by Adam Coates &
Andrew Ng in [Coates11a]. The character classifier consists of a Single Layer Convolutional Neural Network and
a linear classifier. It is applied to the input image in a sliding window fashion, providing a set of recognitions
at each window location.
*/
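A sketch of wiring the CNN classifier into an OCRBeamSearchDecoder (assuming voc, transition_p and emission_p are prepared as above, and an input image is loaded):

Ptr<OCRBeamSearchDecoder> ocr = OCRBeamSearchDecoder::create(
        loadOCRBeamSearchClassifierCNN("OCRBeamSearch_CNN_model_data.xml.gz"),
        voc, transition_p, emission_p);
string output;
ocr->run(image, output); // sliding-window recognition; the new mask overload simply forwards here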
......
@@ -36,7 +36,7 @@ int main(int argc, char* argv[])
return(0);
}
string vocabulary = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyx0123456789"; // must have the same order as the clasifier output classes
string vocabulary = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789"; // must have the same order as the clasifier output classes
vector<string> lexicon; // a list of words expected to be found on the input image
lexicon.push_back(string("abb"));
lexicon.push_back(string("patata"));
......
/*
* segmented_word_recognition.cpp
*
* A demo program on segmented word recognition.
* Shows the use of the OCRHMMDecoder API with the two provided default character classifiers.
*
* Created on: Jul 31, 2015
* Author: Lluis Gomez i Bigorda <lgomez AT cvc.uab.es>
*/
#include "opencv2/text.hpp"
#include "opencv2/core/utility.hpp"
#include "opencv2/highgui.hpp"
#include "opencv2/imgproc.hpp"
#include <iostream>
using namespace std;
using namespace cv;
using namespace text;
int main(int argc, char* argv[]) {
const String keys =
"{help h usage ? | | print this message.}"
"{@image | | source image for recognition.}"
"{@mask | | binary segmentation mask where each contour is a character.}"
"{lexicon lex l | | (optional) lexicon provided as a list of comma separated words.}"
;
CommandLineParser parser(argc, argv, keys);
parser.about("\nSegmented word recognition.\nA demo program on segmented word recognition. Shows the use of the OCRHMMDecoder API with the two provided default character classifiers.\n");
String filename1 = parser.get<String>(0);
String filename2 = parser.get<String>(1);
parser.printMessage();
cout << endl << endl;
if ((parser.has("help")) || (filename1.size()==0))
{
return 0;
}
if (!parser.check())
{
parser.printErrors();
return 0;
}
Mat image = imread(filename1);
Mat mask;
if (filename2.size() > 0)
mask = imread(filename2);
else
image.copyTo(mask);
// make sure the mask is a binary image
cvtColor(mask, mask, COLOR_BGR2GRAY);
threshold(mask, mask, 128., 255, THRESH_BINARY);
// character recognition vocabulary
string voc = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789";
// Emission probabilities for the HMM language model (identity matrix by default)
Mat emissionProbabilities = Mat::eye((int)voc.size(), (int)voc.size(), CV_64FC1);
// Bigram transition probabilities for the HMM language model
Mat transitionProbabilities;
string lex = parser.get<string>("lex");
if (lex.size()>0)
{
// Build tailored language model for the provided lexicon
vector<string> lexicon;
size_t pos = 0;
string delimiter = ",";
std::string token;
while ((pos = lex.find(delimiter)) != std::string::npos) {
token = lex.substr(0, pos);
lexicon.push_back(token);
lex.erase(0, pos + delimiter.length());
}
lexicon.push_back(lex);
createOCRHMMTransitionsTable(voc,lexicon,transitionProbabilities);
} else {
// Or load the generic language model (from Aspell English dictionary)
FileStorage fs("./OCRHMM_transitions_table.xml", FileStorage::READ);
fs["transition_probabilities"] >> transitionProbabilities;
fs.release();
}
Ptr<OCRTesseract> ocrTes = OCRTesseract::create();
Ptr<OCRHMMDecoder> ocrNM = OCRHMMDecoder::create(
loadOCRHMMClassifierNM("./OCRHMM_knn_model_data.xml.gz"),
voc, transitionProbabilities, emissionProbabilities);
Ptr<OCRHMMDecoder> ocrCNN = OCRHMMDecoder::create(
loadOCRHMMClassifierCNN("OCRBeamSearch_CNN_model_data.xml.gz"),
voc, transitionProbabilities, emissionProbabilities);
std::string output;
double t_r = (double)getTickCount();
ocrTes->run(mask, output);
output.erase(remove(output.begin(), output.end(), '\n'), output.end());
cout << " OCR_Tesseract output \"" << output << "\". Done in "
<< ((double)getTickCount() - t_r)*1000/getTickFrequency() << " ms." << endl;
t_r = (double)getTickCount();
ocrNM->run(mask, output);
cout << " OCR_NM output \"" << output << "\". Done in "
<< ((double)getTickCount() - t_r)*1000/getTickFrequency() << " ms." << endl;
t_r = (double)getTickCount();
ocrCNN->run(image, mask, output);
cout << " OCR_CNN output \"" << output << "\". Done in "
<< ((double)getTickCount() - t_r)*1000/getTickFrequency() << " ms." << endl;
}
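Note: the sample assumes OCRHMM_transitions_table.xml, OCRHMM_knn_model_data.xml.gz and OCRBeamSearch_CNN_model_data.xml.gz are available in the working directory; all of them ship in the text module's samples folder.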
@@ -73,6 +73,22 @@ void OCRBeamSearchDecoder::run(Mat& image, string& output_text, vector<Rect>* co
component_confidences->clear();
}
void OCRBeamSearchDecoder::run(Mat& image, Mat& mask, string& output_text, vector<Rect>* component_rects,
vector<string>* component_texts, vector<float>* component_confidences,
int component_level)
{
CV_Assert( (image.type() == CV_8UC1) || (image.type() == CV_8UC3) );
CV_Assert( mask.type() == CV_8UC1 );
CV_Assert( (component_level == OCR_LEVEL_TEXTLINE) || (component_level == OCR_LEVEL_WORD) );
output_text.clear();
if (component_rects != NULL)
component_rects->clear();
if (component_texts != NULL)
component_texts->clear();
if (component_confidences != NULL)
component_confidences->clear();
}
void OCRBeamSearchDecoder::ClassifierCallback::eval( InputArray image, vector< vector<double> >& recognition_probabilities, vector<int>& oversegmentation)
{
@@ -136,7 +152,7 @@ public:
if (component_confidences != NULL)
component_confidences->clear();
// TODO split a line into words
// TODO We must split a line into words or specify we only work with words
if(src.type() == CV_8UC3)
{
@@ -174,14 +190,7 @@ public:
}
//TODO it would be interesting to have a hash table with a vector of booleans
// but this is not possible when we have a large number of possible segmentations.
//vector<bool> visited_nodes(pow(2,oversegmentation.size()),false); // hash table for visited nodes
// options are using std::set<unsigned long long int> to store only the keys of visited nodes
// but will deteriorate the time performance.
set<unsigned long long int> visited_nodes; //TODO make it member of class
// it is also possible to reduce the number of seg. points in some way (e.g. use only seg.points
// for which there is a change on the class prediction)
vector<int> start_segmentation;
start_segmentation.push_back(oversegmentation[0]);
@@ -221,6 +230,21 @@ public:
return;
}
void run( Mat& src,
Mat& mask,
string& out_sequence,
vector<Rect>* component_rects,
vector<string>* component_texts,
vector<float>* component_confidences,
int component_level)
{
CV_Assert( mask.type() == CV_8UC1 );
// Nothing to do with the mask here: we do sliding-window recognition anyway.
run( src, out_sequence, component_rects, component_texts, component_confidences, component_level );
}
private:
////////////////////////////////////////////////////////////
@@ -421,22 +445,29 @@ OCRBeamSearchClassifierCNN::OCRBeamSearchClassifierCNN (const string& filename)
fs["feature_min"] >> feature_min;
fs["feature_max"] >> feature_max;
fs.release();
// TODO check all matrix dimensions match correctly and no one is empty
}
else
CV_Error(Error::StsBadArg, "Default classifier data file not found!");
nr_feature = weights.rows;
nr_class = weights.cols;
// TODO some of this can be inferred from the input file (e.g. patch size must be sqrt(filters.cols))
step_size = 4;
// sanity check: none of the loaded model matrices may be empty
CV_Assert( (M.cols > 0) && (M.rows > 0) );
CV_Assert( (P.cols > 0) && (P.rows > 0) );
CV_Assert( (kernels.cols > 0) && (kernels.rows > 0) );
CV_Assert( (weights.cols > 0) && (weights.rows > 0) );
CV_Assert( (feature_min.cols > 0) && (feature_min.rows > 0) );
CV_Assert( (feature_max.cols > 0) && (feature_max.rows > 0) );
nr_feature = weights.rows;
nr_class = weights.cols;
patch_size = (int)sqrt(kernels.cols);
// algorithm internal parameters
window_size = 32;
quad_size = 12;
patch_size = 8;
num_quads = 25;
num_tiles = 25;
alpha = 0.5;
step_size = 4; // TODO should this be a user-settable parameter?
}
@@ -459,7 +490,6 @@ void OCRBeamSearchClassifierCNN::eval( InputArray _src, vector< vector<double> >
cvtColor(src,src,COLOR_RGB2GRAY);
}
// TODO shall we resize the input image or make a copy ?
resize(src,src,Size(window_size*src.cols/src.rows,window_size));
int seg_points = 0;
@@ -555,8 +585,8 @@ void OCRBeamSearchClassifierCNN::eval( InputArray _src, vector< vector<double> >
double *p = new double[nr_class];
double predict_label = eval_feature(feature,p);
//cout << " Prediction: " << vocabulary[predict_label] << " with probability " << p[0] << endl;
if (predict_label < 0) // TODO use cvError
cout << "OCRBeamSearchClassifierCNN::eval Error: unexpected prediction in eval_feature()" << endl;
if (predict_label < 0)
CV_Error(Error::StsInternal, "OCRBeamSearchClassifierCNN::eval Error: unexpected prediction in eval_feature()");
seg_points++;
......
@@ -74,6 +74,22 @@ void OCRHMMDecoder::run(Mat& image, string& output_text, vector<Rect>* component
component_confidences->clear();
}
void OCRHMMDecoder::run(Mat& image, Mat& mask, string& output_text, vector<Rect>* component_rects,
vector<string>* component_texts, vector<float>* component_confidences,
int component_level)
{
CV_Assert( (image.type() == CV_8UC1) || (image.type() == CV_8UC3) );
CV_Assert( mask.type() == CV_8UC1 );
CV_Assert( (component_level == OCR_LEVEL_TEXTLINE) || (component_level == OCR_LEVEL_WORD) );
output_text.clear();
if (component_rects != NULL)
component_rects->clear();
if (component_texts != NULL)
component_texts->clear();
if (component_confidences != NULL)
component_confidences->clear();
}
void OCRHMMDecoder::ClassifierCallback::eval( InputArray image, vector<int>& out_class, vector<double>& out_confidence)
{
CV_Assert(( image.getMat().type() == CV_8UC3 ) || ( image.getMat().type() == CV_8UC1 ));
@@ -263,6 +279,264 @@ public:
obs.push_back(out_class[0]);
observations.push_back(out_class);
confidences.push_back(out_conf);
//cout << " out class = " << vocabulary[out_class[0]] << endl;
}
//This must be extracted from dictionary, or just assumed to be equal for all characters
vector<double> start_p(vocabulary.size());
for (int i=0; i<(int)vocabulary.size(); i++)
start_p[i] = 1.0/vocabulary.size();
Mat V = Mat::zeros((int)observations.size(),(int)vocabulary.size(),CV_64FC1);
vector<string> path(vocabulary.size());
// Initialize base cases (t == 0)
for (int i=0; i<(int)vocabulary.size(); i++)
{
for (int j=0; j<(int)observations[0].size(); j++)
{
emission_p.at<double>(observations[0][j],obs[0]) = confidences[0][j];
}
V.at<double>(0,i) = start_p[i] * emission_p.at<double>(i,obs[0]);
path[i] = vocabulary.at(i);
}
// Run Viterbi for t > 0
for (int t=1; t<(int)obs.size(); t++)
{
//Note: the emission matrix must be reset at every time step
emission_p = Mat::eye((int)vocabulary.size(),(int)vocabulary.size(),CV_64FC1);
for (int e=0; e<(int)observations[t].size(); e++)
{
emission_p.at<double>(observations[t][e],obs[t]) = confidences[t][e];
}
vector<string> newpath(vocabulary.size());
for (int i=0; i<(int)vocabulary.size(); i++)
{
double max_prob = 0;
int best_idx = 0;
for (int j=0; j<(int)vocabulary.size(); j++)
{
double prob = V.at<double>(t-1,j) * transition_p.at<double>(j,i) * emission_p.at<double>(i,obs[t]);
if ( prob > max_prob)
{
max_prob = prob;
best_idx = j;
}
}
V.at<double>(t,i) = max_prob;
newpath[i] = path[best_idx] + vocabulary.at(i);
}
// Don't need to remember the old paths
path.swap(newpath);
}
double max_prob = 0;
int best_idx = 0;
for (int i=0; i<(int)vocabulary.size(); i++)
{
double prob = V.at<double>((int)obs.size()-1,i);
if ( prob > max_prob)
{
max_prob = prob;
best_idx = i;
}
}
//cout << path[best_idx] << endl;
if (out_sequence.size()>0) out_sequence = out_sequence+" "+path[best_idx];
else out_sequence = path[best_idx];
if (component_rects != NULL)
component_rects->push_back(words_rect[w]);
if (component_texts != NULL)
component_texts->push_back(path[best_idx]);
if (component_confidences != NULL)
component_confidences->push_back((float)max_prob);
}
return;
}
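The loop above is the textbook Viterbi recurrence V_t(i) = max_j V_{t-1}(j) * A(j,i) * B(i, o_t). A self-contained toy version with made-up probabilities, for reference:

#include <iostream>
#include <string>
#include <vector>

int main()
{
    const int N = 2;                          // toy vocabulary size
    const char vocabulary[N] = {'a', 'b'};
    std::vector<int> obs;                     // classifier outputs, one per character
    obs.push_back(0); obs.push_back(1); obs.push_back(1);
    double start_p[N]    = {0.5, 0.5};
    double trans_p[N][N] = {{0.7, 0.3}, {0.4, 0.6}}; // A(j,i)
    double emit_p[N][N]  = {{0.9, 0.1}, {0.2, 0.8}}; // B(i,o)
    std::vector<std::vector<double> > V(obs.size(), std::vector<double>(N, 0.0));
    std::vector<std::string> path(N);
    for (int i = 0; i < N; i++)               // base case t == 0
    {
        V[0][i] = start_p[i] * emit_p[i][obs[0]];
        path[i] = std::string(1, vocabulary[i]);
    }
    for (size_t t = 1; t < obs.size(); t++)   // recurrence for t > 0
    {
        std::vector<std::string> newpath(N);
        for (int i = 0; i < N; i++)
        {
            double max_prob = 0; int best = 0;
            for (int j = 0; j < N; j++)
            {
                double prob = V[t-1][j] * trans_p[j][i] * emit_p[i][obs[t]];
                if (prob > max_prob) { max_prob = prob; best = j; }
            }
            V[t][i] = max_prob;
            newpath[i] = path[best] + vocabulary[i];
        }
        path.swap(newpath);
    }
    int best = 0;
    for (int i = 1; i < N; i++)
        if (V[obs.size()-1][i] > V[obs.size()-1][best]) best = i;
    std::cout << path[best] << std::endl;     // most probable character sequence
    return 0;
}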
void run( Mat& image,
Mat& mask,
string& out_sequence,
vector<Rect>* component_rects,
vector<string>* component_texts,
vector<float>* component_confidences,
int component_level)
{
CV_Assert( (image.type() == CV_8UC1) || (image.type() == CV_8UC3) );
CV_Assert( mask.type() == CV_8UC1 );
CV_Assert( (image.cols > 0) && (image.rows > 0) );
CV_Assert( (image.cols == mask.cols) && (image.rows == mask.rows) );
CV_Assert( component_level == OCR_LEVEL_WORD );
out_sequence.clear();
if (component_rects != NULL)
component_rects->clear();
if (component_texts != NULL)
component_texts->clear();
if (component_confidences != NULL)
component_confidences->clear();
// First we split a line into words
vector<Mat> words_mask;
vector<Rect> words_rect;
/// Find contours
vector<vector<Point> > contours;
vector<Vec4i> hierarchy;
Mat tmp;
mask.copyTo(tmp);
findContours( tmp, contours, hierarchy, RETR_EXTERNAL, CHAIN_APPROX_SIMPLE, Point(0, 0) );
if (contours.size() < 6)
{
//do not split lines with fewer than 6 characters
words_mask.push_back(mask);
words_rect.push_back(Rect(0,0,mask.cols,mask.rows));
}
else
{
Mat_<float> vector_w((int)mask.cols,1);
reduce(mask, vector_w, 0, REDUCE_SUM, -1);
vector<int> spaces;
vector<int> spaces_start;
vector<int> spaces_end;
int space_count=0;
int last_one_idx = 0;
int s_init = 0, s_end=vector_w.cols;
for (int s=0; s<vector_w.cols; s++)
{
if (vector_w.at<float>(0,s) == 0)
s_init = s+1;
else
break;
}
for (int s=vector_w.cols-1; s>=0; s--)
{
if (vector_w.at<float>(0,s) == 0)
s_end = s;
else
break;
}
for (int s=s_init; s<s_end; s++)
{
if (vector_w.at<float>(0,s) == 0)
{
space_count++;
} else {
if (space_count!=0)
{
spaces.push_back(space_count);
spaces_start.push_back(last_one_idx);
spaces_end.push_back(s-1);
}
space_count = 0;
last_one_idx = s;
}
}
Scalar mean_space,std_space;
meanStdDev(Mat(spaces),mean_space,std_space);
int num_word_spaces = 0;
int last_word_space_end = 0;
for (int s=0; s<(int)spaces.size(); s++)
{
if (spaces_end.at(s)-spaces_start.at(s) > mean_space[0]+(mean_space[0]*1.1)) //TODO should this 1.1 factor be a parameter?
{
if (num_word_spaces == 0)
{
//cout << " we have a word from 0 to " << spaces_start.at(s) << endl;
Mat word_mask;
Rect word_rect = Rect(0,0,spaces_start.at(s),mask.rows);
mask(word_rect).copyTo(word_mask);
words_mask.push_back(word_mask);
words_rect.push_back(word_rect);
}
else
{
//cout << " we have a word from " << last_word_space_end << " to " << spaces_start.at(s) << endl;
Mat word_mask;
Rect word_rect = Rect(last_word_space_end,0,spaces_start.at(s)-last_word_space_end,mask.rows);
mask(word_rect).copyTo(word_mask);
words_mask.push_back(word_mask);
words_rect.push_back(word_rect);
}
num_word_spaces++;
last_word_space_end = spaces_end.at(s);
}
}
//cout << " we have a word from " << last_word_space_end << " to " << vector_w.cols << endl << endl << endl;
Mat word_mask;
Rect word_rect = Rect(last_word_space_end,0,vector_w.cols-last_word_space_end,mask.rows);
mask(word_rect).copyTo(word_mask);
words_mask.push_back(word_mask);
words_rect.push_back(word_rect);
}
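The split above relies on reduce() to collapse the mask into a single row of column sums; zero runs noticeably wider than the mean gap (roughly twice, given the 1.1 factor) are taken as inter-word spaces. A toy illustration of the projection step, with made-up values:

// Toy projection: sum each column of a small binary mask
Mat m = (Mat_<uchar>(2,5) << 255,  0,  0, 255, 255,
                             255,  0,  0,   0, 255);
Mat colsum;
reduce(m, colsum, 0, REDUCE_SUM, CV_32F);
// colsum = [510, 0, 0, 255, 510]; the zero run marks a candidate space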
for (int w=0; w<(int)words_mask.size(); w++)
{
vector< vector<int> > observations;
vector< vector<double> > confidences;
vector<int> obs;
// First find contours and sort by x coordinate of bbox
words_mask[w].copyTo(tmp);
if (tmp.empty())
continue;
contours.clear();
hierarchy.clear();
/// Find contours
findContours( tmp, contours, hierarchy, RETR_EXTERNAL, CHAIN_APPROX_SIMPLE, Point(0, 0) );
vector<Rect> contours_rect;
for (int i=0; i<(int)contours.size(); i++)
{
contours_rect.push_back(boundingRect(contours[i]));
}
sort(contours_rect.begin(), contours_rect.end(), sort_rect_horiz);
// Do character recognition foreach contour
for (int i=0; i<(int)contours.size(); i++)
{
vector<int> out_class;
vector<double> out_conf;
//take the center of the char rect and translate it to the real origin
Point char_center = Point(contours_rect.at(i).x+contours_rect.at(i).width/2,
contours_rect.at(i).y+contours_rect.at(i).height/2);
char_center.x += words_rect[w].x;
char_center.y += words_rect[w].y;
int win_size = max(contours_rect.at(i).width,contours_rect.at(i).height);
win_size += (int)(win_size*0.6); // add some border pixels; TODO: should this be a user-tunable parameter?
Rect char_rect = Rect(char_center.x-win_size/2,char_center.y-win_size/2,win_size,win_size);
char_rect &= Rect(0,0,image.cols,image.rows);
Mat tmp_image;
image(char_rect).copyTo(tmp_image);
classifier->eval(tmp_image,out_class,out_conf);
if (!out_class.empty())
obs.push_back(out_class[0]);
//cout << " out class = " << vocabulary[out_class[0]] << "(" << out_conf[0] << ")" << endl;
observations.push_back(out_class);
confidences.push_back(out_conf);
}
@@ -335,7 +609,8 @@ public:
}
//cout << path[best_idx] << endl;
out_sequence = out_sequence+" "+path[best_idx];
if (out_sequence.size()>0) out_sequence = out_sequence+" "+path[best_idx];
else out_sequence = path[best_idx];
if (component_rects != NULL)
component_rects->push_back(words_rect[w]);
@@ -598,6 +873,278 @@ Ptr<OCRHMMDecoder::ClassifierCallback> loadOCRHMMClassifierNM(const std::string&
return makePtr<OCRHMMClassifierKNN>(filename);
}
class CV_EXPORTS OCRHMMClassifierCNN : public OCRHMMDecoder::ClassifierCallback
{
public:
//constructor
OCRHMMClassifierCNN(const std::string& filename);
// Destructor
~OCRHMMClassifierCNN() {}
void eval( InputArray image, vector<int>& out_class, vector<double>& out_confidence );
protected:
void normalizeAndZCA(Mat& patches);
double eval_feature(Mat& feature, double* prob_estimates);
private:
int nr_class; // number of classes
int nr_feature; // number of features
Mat feature_min; // scale range
Mat feature_max;
Mat weights; // Logistic Regression weights
Mat kernels; // CNN kernels
Mat M, P; // ZCA Whitening parameters
int window_size; // window size
int quad_size;
int patch_size;
int num_quads; // extract 25 quads (12x12) from each image
int num_tiles; // extract 25 patches (8x8) from each quad
double alpha; // used in non-linear activation function z = max(0, |D*a| - alpha)
};
OCRHMMClassifierCNN::OCRHMMClassifierCNN (const string& filename)
{
if (ifstream(filename.c_str()))
{
FileStorage fs(filename, FileStorage::READ);
// Load the kernel bank and whitening parameters
fs["kernels"] >> kernels;
fs["M"] >> M;
fs["P"] >> P;
// Load Logistic Regression weights
fs["weights"] >> weights;
// Load feature scaling ranges
fs["feature_min"] >> feature_min;
fs["feature_max"] >> feature_max;
fs.release();
}
else
CV_Error(Error::StsBadArg, "Default classifier data file not found!");
// sanity check: none of the loaded model matrices may be empty
CV_Assert( (M.cols > 0) && (M.rows > 0) );
CV_Assert( (P.cols > 0) && (P.rows > 0) );
CV_Assert( (kernels.cols > 0) && (kernels.rows > 0) );
CV_Assert( (weights.cols > 0) && (weights.rows > 0) );
CV_Assert( (feature_min.cols > 0) && (feature_min.rows > 0) );
CV_Assert( (feature_max.cols > 0) && (feature_max.rows > 0) );
nr_feature = weights.rows;
nr_class = weights.cols;
patch_size = (int)sqrt(kernels.cols);
// algorithm internal parameters
window_size = 32;
num_quads = 25;
num_tiles = 25;
quad_size = 12;
alpha = 0.5;
}
void OCRHMMClassifierCNN::eval( InputArray _src, vector<int>& out_class, vector<double>& out_confidence )
{
CV_Assert(( _src.getMat().type() == CV_8UC3 ) || ( _src.getMat().type() == CV_8UC1 ));
out_class.clear();
out_confidence.clear();
Mat img = _src.getMat();
if(img.type() == CV_8UC3)
{
cvtColor(img,img,COLOR_RGB2GRAY);
}
// shall we resize the input image or make a copy ?
resize(img,img,Size(window_size,window_size));
Mat quad;
Mat tmp;
int patch_count = 0;
vector< vector<double> > data_pool(9);
int quad_id = 1;
for (int q_x=0; q_x<=window_size-quad_size; q_x=q_x+(int)(quad_size/2-1))
{
for (int q_y=0; q_y<=window_size-quad_size; q_y=q_y+(int)(quad_size/2-1))
{
Rect quad_rect = Rect(q_x,q_y,quad_size,quad_size);
quad = img(quad_rect);
//start sliding window (8x8) in each tile and store the patch as row in data_pool
for (int w_x=0; w_x<=quad_size-patch_size; w_x++)
{
for (int w_y=0; w_y<=quad_size-patch_size; w_y++)
{
quad(Rect(w_x,w_y,patch_size,patch_size)).copyTo(tmp);
tmp = tmp.reshape(0,1);
tmp.convertTo(tmp, CV_64F);
normalizeAndZCA(tmp);
vector<double> patch;
tmp.copyTo(patch);
if ((quad_id == 1)||(quad_id == 2)||(quad_id == 6)||(quad_id == 7))
data_pool[0].insert(data_pool[0].end(),patch.begin(),patch.end());
if ((quad_id == 2)||(quad_id == 7)||(quad_id == 3)||(quad_id == 8)||(quad_id == 4)||(quad_id == 9))
data_pool[1].insert(data_pool[1].end(),patch.begin(),patch.end());
if ((quad_id == 4)||(quad_id == 9)||(quad_id == 5)||(quad_id == 10))
data_pool[2].insert(data_pool[2].end(),patch.begin(),patch.end());
if ((quad_id == 6)||(quad_id == 11)||(quad_id == 16)||(quad_id == 7)||(quad_id == 12)||(quad_id == 17))
data_pool[3].insert(data_pool[3].end(),patch.begin(),patch.end());
if ((quad_id == 7)||(quad_id == 12)||(quad_id == 17)||(quad_id == 8)||(quad_id == 13)||(quad_id == 18)||(quad_id == 9)||(quad_id == 14)||(quad_id == 19))
data_pool[4].insert(data_pool[4].end(),patch.begin(),patch.end());
if ((quad_id == 9)||(quad_id == 14)||(quad_id == 19)||(quad_id == 10)||(quad_id == 15)||(quad_id == 20))
data_pool[5].insert(data_pool[5].end(),patch.begin(),patch.end());
if ((quad_id == 16)||(quad_id == 21)||(quad_id == 17)||(quad_id == 22))
data_pool[6].insert(data_pool[6].end(),patch.begin(),patch.end());
if ((quad_id == 17)||(quad_id == 22)||(quad_id == 18)||(quad_id == 23)||(quad_id == 19)||(quad_id == 24))
data_pool[7].insert(data_pool[7].end(),patch.begin(),patch.end());
if ((quad_id == 19)||(quad_id == 24)||(quad_id == 20)||(quad_id == 25))
data_pool[8].insert(data_pool[8].end(),patch.begin(),patch.end());
patch_count++;
}
}
quad_id++;
}
}
//do dot product of each normalized and whitened patch
//each pool is averaged and this yields a representation of 9xD
Mat feature = Mat::zeros(9,kernels.rows,CV_64FC1);
for (int i=0; i<9; i++)
{
Mat pool = Mat(data_pool[i]);
pool = pool.reshape(0,(int)data_pool[i].size()/kernels.cols);
for (int p=0; p<pool.rows; p++)
{
for (int f=0; f<kernels.rows; f++)
{
feature.row(i).at<double>(0,f) = feature.row(i).at<double>(0,f) + max(0.0,std::abs(pool.row(p).dot(kernels.row(f)))-alpha);
}
}
}
feature = feature.reshape(0,1);
// data must be normalized within the range obtained during training
double lower = -1.0;
double upper = 1.0;
for (int k=0; k<feature.cols; k++)
{
feature.at<double>(0,k) = lower + (upper-lower) *
(feature.at<double>(0,k)-feature_min.at<double>(0,k))/
(feature_max.at<double>(0,k)-feature_min.at<double>(0,k));
}
double *p = new double[nr_class];
double predict_label = eval_feature(feature,p);
//cout << " Prediction: " << vocabulary[predict_label] << " with probability " << p[0] << endl;
if (predict_label < 0)
CV_Error(Error::StsInternal, "OCRHMMClassifierCNN::eval Error: unexpected prediction in eval_feature()");
out_class.push_back((int)predict_label);
out_confidence.push_back(p[(int)predict_label]);
for (int i = 0; i<nr_class; i++)
{
if ( (i != (int)predict_label) && (p[i] != 0.) )
{
out_class.push_back(i);
out_confidence.push_back(p[i]);
}
}
delete[] p; // release the probability buffer allocated above
}
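A hypothetical direct use of the callback, for clarity (the crop file name is a placeholder and should contain a single character):

Ptr<OCRHMMDecoder::ClassifierCallback> classifier =
        loadOCRHMMClassifierCNN("OCRBeamSearch_CNN_model_data.xml.gz");
Mat char_img = imread("char_crop.png"); // CV_8UC1 or CV_8UC3
vector<int> out_class;
vector<double> out_conf;
classifier->eval(char_img, out_class, out_conf);
// out_class[0]/out_conf[0] hold the top prediction; the remaining entries
// are the other classes with non-zero probability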
// normalize for contrast and apply ZCA whitening to a set of image patches
void OCRHMMClassifierCNN::normalizeAndZCA(Mat& patches)
{
//Normalize for contrast
for (int i=0; i<patches.rows; i++)
{
Scalar row_mean, row_std;
meanStdDev(patches.row(i),row_mean,row_std);
row_std[0] = sqrt(pow(row_std[0],2)*patches.cols/(patches.cols-1)+10);
patches.row(i) = (patches.row(i) - row_mean[0]) / row_std[0];
}
//ZCA whitening
if ((M.dims == 0) || (P.dims == 0))
{
Mat CC;
calcCovarMatrix(patches,CC,M,COVAR_NORMAL|COVAR_ROWS|COVAR_SCALE);
CC = CC * patches.rows / (patches.rows-1);
Mat e_val,e_vec;
eigen(CC.t(),e_val,e_vec);
e_vec = e_vec.t();
sqrt(1./(e_val + 0.1), e_val);
Mat V = Mat::zeros(e_vec.rows, e_vec.cols, CV_64FC1);
Mat D = Mat::eye(e_vec.rows, e_vec.cols, CV_64FC1);
for (int i=0; i<e_vec.cols; i++)
{
e_vec.col(e_vec.cols-i-1).copyTo(V.col(i));
D.col(i) = D.col(i) * e_val.at<double>(0,e_val.rows-i-1);
}
P = V * D * V.t();
}
for (int i=0; i<patches.rows; i++)
patches.row(i) = patches.row(i) - M;
patches = patches * P;
}
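For reference, the transform computed above is standard ZCA whitening with an eigenvalue regularizer of 0.1: with M the mean row returned by calcCovarMatrix, C the bias-corrected covariance of the patches, and C = V \Lambda V^T its eigendecomposition,

\[
P = V\,(\Lambda + 0.1\,I)^{-1/2}\,V^\top, \qquad X_{\mathrm{white}} = (X - M)\,P .
\]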
double OCRHMMClassifierCNN::eval_feature(Mat& feature, double* prob_estimates)
{
for(int i=0;i<nr_class;i++)
prob_estimates[i] = 0;
for(int idx=0; idx<nr_feature; idx++)
for(int i=0;i<nr_class;i++)
prob_estimates[i] += weights.at<float>(idx,i)*feature.at<double>(0,idx); //TODO use vectorized dot product
int dec_max_idx = 0;
for(int i=1;i<nr_class;i++)
{
if(prob_estimates[i] > prob_estimates[dec_max_idx])
dec_max_idx = i;
}
for(int i=0;i<nr_class;i++)
prob_estimates[i]=1/(1+exp(-prob_estimates[i]));
double sum=0;
for(int i=0; i<nr_class; i++)
sum+=prob_estimates[i];
for(int i=0; i<nr_class; i++)
prob_estimates[i]=prob_estimates[i]/sum;
return dec_max_idx;
}
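In other words, eval_feature computes a linear score per class, squashes each score through a sigmoid and renormalizes so that the probabilities sum to one; the returned label is the arg max of the raw scores, which the monotone sigmoid leaves unchanged:

\[
s_i = w_i^\top x, \qquad \sigma(s) = \frac{1}{1+e^{-s}}, \qquad p_i = \frac{\sigma(s_i)}{\sum_j \sigma(s_j)} .
\]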
Ptr<OCRHMMDecoder::ClassifierCallback> loadOCRHMMClassifierCNN(const std::string& filename)
{
return makePtr<OCRHMMClassifierCNN>(filename);
}
/** @brief Utility function to create a tailored language model transitions table from a given list of words (lexicon).
@param vocabulary The language vocabulary (chars when ascii english text).
......
@@ -70,6 +70,22 @@ void OCRTesseract::run(Mat& image, string& output_text, vector<Rect>* component_
component_confidences->clear();
}
void OCRTesseract::run(Mat& image, Mat& mask, string& output_text, vector<Rect>* component_rects,
vector<string>* component_texts, vector<float>* component_confidences,
int component_level)
{
CV_Assert( (image.type() == CV_8UC1) || (image.type() == CV_8UC3) );
CV_Assert( mask.type() == CV_8UC1 );
CV_Assert( (component_level == OCR_LEVEL_TEXTLINE) || (component_level == OCR_LEVEL_WORD) );
output_text.clear();
if (component_rects != NULL)
component_rects->clear();
if (component_texts != NULL)
component_texts->clear();
if (component_confidences != NULL)
component_confidences->clear();
}
class OCRTesseractImpl : public OCRTesseract
{
private:
@@ -189,6 +205,16 @@ public:
#endif
}
void run(Mat& image, Mat& mask, string& output, vector<Rect>* component_rects=NULL,
vector<string>* component_texts=NULL, vector<float>* component_confidences=NULL,
int component_level=0)
{
CV_Assert( mask.type() == CV_8UC1 );
CV_Assert( (image.type() == CV_8UC1) || (image.type() == CV_8UC3) );
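// Note that Tesseract is run directly on the mask, which already contains the segmented characters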
run( mask, output, component_rects, component_texts, component_confidences, component_level);
}
};
......