Commit 3aa88889 authored by Maksim Shabunin

Reworked HolisticWordspotter to work with dnn module

parent bad02f37
set(the_description "Text Detection and Recognition") set(the_description "Text Detection and Recognition")
ocv_define_module(text opencv_ml opencv_imgproc opencv_core opencv_features2d OPTIONAL opencv_highgui WRAP python java) ocv_define_module(text opencv_ml opencv_imgproc opencv_core opencv_features2d opencv_dnn OPTIONAL opencv_highgui WRAP python java)
if(NOT CMAKE_CROSSCOMPILING OR OPENCV_FIND_TESSERACT) if(NOT CMAKE_CROSSCOMPILING OR OPENCV_FIND_TESSERACT)
set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} ${CMAKE_CURRENT_SOURCE_DIR}/cmake) set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} ${CMAKE_CURRENT_SOURCE_DIR}/cmake)
...@@ -22,62 +22,3 @@ ocv_include_directories(${CMAKE_CURRENT_BINARY_DIR}) ...@@ -22,62 +22,3 @@ ocv_include_directories(${CMAKE_CURRENT_BINARY_DIR})
ocv_add_testdata(samples/ contrib/text ocv_add_testdata(samples/ contrib/text
FILES_MATCHING PATTERN "*.xml" PATTERN "*.xml.gz" REGEX "scenetext[0-9]+.jpg" FILES_MATCHING PATTERN "*.xml" PATTERN "*.xml.gz" REGEX "scenetext[0-9]+.jpg"
) )
# Optional Caffe backend for the text module (adapted from the cnn_3dobj module).
#
# Caffe, Protobuf and Glog are probed with find_package(); each probe prints its
# outcome and sets the matching HAVE_<PKG> variable when the package is found.
find_package(Caffe)
if(Caffe_FOUND)
  message(STATUS "Caffe: YES")
  set(HAVE_CAFFE 1)
else()
  message(STATUS "Caffe: NO")
endif()

find_package(Protobuf)
if(Protobuf_FOUND)
  message(STATUS "Protobuf: YES")
  set(HAVE_PROTOBUF 1)
else()
  message(STATUS "Protobuf: NO")
endif()

find_package(Glog)
if(Glog_FOUND)
  message(STATUS "Glog: YES")
  set(HAVE_GLOG 1)
else()
  message(STATUS "Glog: NO")
endif()

if(HAVE_CAFFE)
  message(STATUS "HAVE CAFFE!!!")
  # NOTE(review): this generates a header inside the source tree; kept as-is because
  # sources may include it from there, but the binary dir would be the safer target.
  configure_file(${CMAKE_CURRENT_SOURCE_DIR}/text_config.hpp.in
                 ${CMAKE_CURRENT_SOURCE_DIR}/include/opencv2/text_config.hpp @ONLY)
  include_directories(${CMAKE_CURRENT_BINARY_DIR})
  include_directories(${Caffe_INCLUDE_DIR})

  # Transitive requirements of Caffe (taken from Caffe's own CMake scripts).
  find_package(HDF5 COMPONENTS HL REQUIRED)
  include_directories(SYSTEM ${HDF5_INCLUDE_DIRS} ${HDF5_HL_INCLUDE_DIR})
  list(APPEND Caffe_LINKER_LIBS ${HDF5_LIBRARIES})

  find_package(Boost 1.46 REQUIRED COMPONENTS system thread filesystem)
  include_directories(SYSTEM ${Boost_INCLUDE_DIRS})
  list(APPEND Caffe_LINKER_LIBS ${Boost_LIBRARIES})

  # CUDA headers: use whatever toolkit is detected on this machine instead of a
  # path that only exists for one CUDA version on one architecture.
  if(NOT CUDA_INCLUDE_DIRS)
    find_package(CUDA QUIET)
  endif()
  if(CUDA_INCLUDE_DIRS)
    include_directories(SYSTEM ${CUDA_INCLUDE_DIRS})
  endif()

  # Collect link libraries, skipping optional ones that were not found so that a
  # "<var>-NOTFOUND" value never reaches target_link_libraries().
  set(text_caffe_libs ${Caffe_LIBS})
  if(HAVE_GLOG)
    list(APPEND text_caffe_libs ${Glog_LIBS})
  endif()
  if(HAVE_PROTOBUF)
    list(APPEND text_caffe_libs ${Protobuf_LIBS})
  endif()
  list(APPEND text_caffe_libs ${HDF5_LIBRARIES} ${Boost_LIBRARIES})

  # Plain (keyword-less) signature on purpose: the target is created elsewhere and
  # keyword and plain signatures must not be mixed on the same target.
  target_link_libraries(opencv_text ${text_caffe_libs})
endif()

# Always generate the config header in the build tree as well.
configure_file(${CMAKE_CURRENT_SOURCE_DIR}/text_config.hpp.in
               ${CMAKE_BINARY_DIR}/text_config.hpp @ONLY)
\ No newline at end of file
# Locates a Caffe installation (used for CNN triplet training).
# Result variables: Caffe_LIBS, Caffe_INCLUDE_DIR and, when both are found, Caffe_FOUND.
unset(Caffe_FOUND)

find_library(Caffe_LIBS
  NAMES caffe
  HINTS /usr/local/lib
)

find_path(Caffe_INCLUDE_DIR
  NAMES
    caffe/caffe.hpp
    caffe/common.hpp
    caffe/net.hpp
    caffe/proto/caffe.pb.h
    caffe/util/io.hpp
    caffe/vision_layers.hpp
  HINTS /usr/local/include
)

# The package counts as found only when both the headers and the library exist.
if(Caffe_INCLUDE_DIR AND Caffe_LIBS)
  set(Caffe_FOUND 1)
endif()
# Locates the glog library, which Caffe depends on.
# Result variables: Glog_LIBS and, when the library is found, Glog_FOUND.
unset(Glog_FOUND)

find_library(Glog_LIBS
  NAMES glog
  HINTS /usr/local/lib
)

if(Glog_LIBS)
  set(Glog_FOUND 1)
endif()
# Locates the protobuf library, which Caffe depends on.
# Result variables: Protobuf_LIBS and, when the library is found, Protobuf_FOUND.
unset(Protobuf_FOUND)

find_library(Protobuf_LIBS
  NAMES protobuf
  HINTS /usr/local/lib
)

if(Protobuf_LIBS)
  set(Protobuf_FOUND 1)
endif()
...@@ -46,10 +46,6 @@ ...@@ -46,10 +46,6 @@
#include <vector> #include <vector>
#include <string> #include <string>
#include <iostream>
#include <sstream>
namespace cv namespace cv
{ {
...@@ -540,107 +536,24 @@ at each window location. ...@@ -540,107 +536,24 @@ at each window location.
CV_EXPORTS_W Ptr<OCRBeamSearchDecoder::ClassifierCallback> loadOCRBeamSearchClassifierCNN(const String& filename); CV_EXPORTS_W Ptr<OCRBeamSearchDecoder::ClassifierCallback> loadOCRBeamSearchClassifierCNN(const String& filename);
//! @}
// Identifiers of the computation backends a classifier can be built on.
// Classifiers should be able to provide different backends;
// for the moment only Caffe is implemented.
enum{
OCR_HOLISTIC_BACKEND_NONE,
OCR_HOLISTIC_BACKEND_CAFFE
};
/** @brief Abstract class that implements the classification of text images.
*
* The interface is generic enough to describe any image classifier, and allows
* taking advantage of computing in batches. While word classifiers are the default
* networks, any image classifier should work.
*
*/
class CV_EXPORTS_W TextImageClassifier
{
protected:
Size inputSz_;
int channelCount_;
/** @brief all image preprocessing is handled here, including whitening etc.
*
* @param input the image to be preprocessed for the classifier. If the depth
* is CV_8U, values should be in [0,255]; otherwise values are assumed to be in [0,1]
*
* @param output reference to the image to be fed to the classifier; the preprocessor will
* resize the image to the appropriate size and convert it to the appropriate depth
*
* The method preprocess should never be used externally; it is up to the classify and
* classifyBatch methods to employ it.
*/
virtual void preprocess(Mat& input,Mat& output)=0;
public:
virtual ~TextImageClassifier() {}
/** @brief produces a class confidence row-vector given an image
*/
CV_WRAP virtual void classify(InputArray image, OutputArray classProbabilities) = 0;
/** @brief produces a matrix containing class confidence row-vectors given a collection of images
*/
CV_WRAP virtual void classifyBatch(InputArrayOfArrays image, OutputArray classProbabilities) = 0;
/** @brief simple getter method returning the size of the output row-vector
*/
CV_WRAP virtual int getOutputSize()=0;
/** @brief simple getter method returning the size of the minibatches for this classifier.
* If not applicable this method should return 1
*/
CV_WRAP virtual int getMinibatchSize()=0;
/** @brief simple getter method returning a value describing the framework being employed to implement the classifier.
* The default implementation returns OCR_HOLISTIC_BACKEND_NONE.
*/
CV_WRAP virtual int getBackend(){return OCR_HOLISTIC_BACKEND_NONE;}
};
/** @brief Class that uses a pretrained caffe model for word classification.
*
* This network is described in detail in:
* Max Jaderberg et al.: Reading Text in the Wild with Convolutional Neural Networks, IJCV 2015
* http://arxiv.org/abs/1412.1842
*/
class CV_EXPORTS_W DictNet:public TextImageClassifier
{
public:
virtual ~DictNet() {};
/** @brief reports whether this classifier computes on the GPU
*/
CV_WRAP virtual bool usingGpu()=0;
/** @brief Constructs a DictNet object from a caffe pretrained model
*
* @param archFilename is the path to the prototxt file containing the deployment model architecture description.
*
* @param weightsFilename is the path to the pretrained weights of the model in binary form. This file can be
* very large, up to 2GB.
*
* @param minibatchSz the maximum number of samples that can be processed in parallel. In practice this parameter
* has an effect only when computing on the GPU and should be set with respect to the memory available on the GPU.
*
* @param useGpu boolean flag setting GPU or CPU computation
*
* @param backEnd integer parameter selecting the computation framework. For now OCR_HOLISTIC_BACKEND_CAFFE is
* the only option
*/
CV_WRAP static Ptr<DictNet> create(String archFilename,String weightsFilename,int minibatchSz=100,bool useGpu=0,int backEnd=OCR_HOLISTIC_BACKEND_CAFFE);
};
/** @brief OCRHolisticWordRecognizer class provides the functionallity of segmented wordspotting. /** @brief OCRHolisticWordRecognizer class provides the functionallity of segmented wordspotting.
* Given a predefined vocabulary , a TextImageClassifier is employed to select the most probable * Given a predefined vocabulary , a DictNet is employed to select the most probable
* word given an input image. * word given an input image.
* *
* This class implements the logic of providing transcriptions given a vocabulary and and an image * DictNet is described in detail in:
* classifer. * Max Jaderberg et al.: Reading Text in the Wild with Convolutional Neural Networks, IJCV 2015
* http://arxiv.org/abs/1412.1842
*/ */
class CV_EXPORTS_W OCRHolisticWordRecognizer : public BaseOCR class CV_EXPORTS OCRHolisticWordRecognizer : public BaseOCR
{ {
public: public:
virtual void run(Mat& image, std::string& output_text, std::vector<Rect>* component_rects=NULL, virtual void run(Mat& image,
std::vector<std::string>* component_texts=NULL, std::vector<float>* component_confidences=NULL, std::string& output_text,
int component_level=OCR_LEVEL_WORD)=0; std::vector<Rect>* component_rects = NULL,
std::vector<std::string>* component_texts = NULL,
std::vector<float>* component_confidences = NULL,
int component_level = OCR_LEVEL_WORD) = 0;
/** @brief Recognize text using a segmentation based word-spotting/classifier cnn. /** @brief Recognize text using a segmentation based word-spotting/classifier cnn.
...@@ -665,68 +578,24 @@ public: ...@@ -665,68 +578,24 @@ public:
@param component_level must be OCR_LEVEL_WORD. @param component_level must be OCR_LEVEL_WORD.
*/ */
virtual void run(Mat& image,
virtual void run(Mat& image, Mat& mask, std::string& output_text, std::vector<Rect>* component_rects=NULL, Mat& mask,
std::vector<std::string>* component_texts=NULL, std::vector<float>* component_confidences=NULL, std::string& output_text,
int component_level=OCR_LEVEL_WORD)=0; std::vector<Rect>* component_rects = NULL,
std::vector<std::string>* component_texts = NULL,
std::vector<float>* component_confidences = NULL,
/** int component_level = OCR_LEVEL_WORD) = 0;
@brief Method that provides a quick and simple interface to a single word image classifcation
@param inputImage an image expected to be a CV_U8C1 or CV_U8C3 of any size assumed to contain a single word
@param transcription an opencv string that will store the detected word transcription
@param confidence a double that will be updated with the confidence the classifier has for the selected word
*/
CV_WRAP virtual void recogniseImage(InputArray inputImage,CV_OUT String& transcription,CV_OUT double& confidence)=0;
/**
@brief Method that provides a quick and simple interface to a multiple word image classifcation taking advantage
the classifiers parallel capabilities.
@param inputImageList an list of images expected to be a CV_U8C1 or CV_U8C3 each image can be of any size and is assumed
to contain a single word.
@param transcriptions a vector of opencv strings that will store the detected word transcriptions, one for each
input image
@param confidences a vector of double that will be updated with the confidence the classifier has for each of the
selected words.
*/
CV_WRAP virtual void recogniseImageBatch(InputArrayOfArrays inputImageList,CV_OUT std::vector<String>& transcriptions,CV_OUT std::vector<double>& confidences)=0;
/**
@brief simple getted for the vocabulary employed
*/
CV_WRAP virtual const std::vector<String>& getVocabulary()=0;
/** @brief Creates an instance of the OCRHolisticWordRecognizer class. /** @brief Creates an instance of the OCRHolisticWordRecognizer class.
@param classifierPtr an instance of TextImageClassifier, normaly a DictNet instance
@param vocabullaryFilename the relative or absolute path to the file containing all words in the vocabulary. Each text line
in the file is assumed to be a single word. The number of words in the vocabulary must be exactly the same as the outputSize
of the classifier.
*/
CV_WRAP static Ptr<OCRHolisticWordRecognizer> create(Ptr<TextImageClassifier> classifierPtr,String vocabullaryFilename);
/** @brief Creates an instance of the OCRHolisticWordRecognizer class and implicitly also a DictNet classifier.
@param modelArchFilename the relative or absolute path to the prototxt file describing the classifiers architecture.
@param modelWeightsFilename the relative or absolute path to the file containing the pretrained weights of the model in caffe-binary form.
@param vocabullaryFilename the relative or absolute path to the file containing all words in the vocabulary. Each text line
in the file is assumed to be a single word. The number of words in the vocabulary must be exactly the same as the outputSize
of the classifier.
*/ */
CV_WRAP static Ptr<OCRHolisticWordRecognizer> create(String modelArchFilename, String modelWeightsFilename, String vocabullaryFilename); static Ptr<OCRHolisticWordRecognizer> create(const std::string &archFilename,
const std::string &weightsFilename,
const std::string &wordsFilename);
}; };
//! @}
} }} // cv::text::
}
#endif // _OPENCV_TEXT_OCR_HPP_ #endif // _OPENCV_TEXT_OCR_HPP_
...@@ -12,79 +12,50 @@ ...@@ -12,79 +12,50 @@
#include "opencv2/imgproc.hpp" #include "opencv2/imgproc.hpp"
#include <sstream> #include <sstream>
#include <vector>
#include <iostream> #include <iostream>
#include <iomanip>
#include <fstream>
inline std::string getHelpStr(std::string progFname){ using namespace std;
std::stringstream out; using namespace cv;
out << " Demo of wordspotting CNN for text recognition." << std::endl; using namespace cv::text;
out << " Max Jaderberg et al.: Reading Text in the Wild with Convolutional Neural Networks, IJCV 2015"<<std::endl<<std::endl;
out << " Usage: " << progFname << " <output_file> <input_image1> <input_image2> ... <input_imageN>" << std::endl; inline void printHelp()
out << " Caffe Model files (dictnet_vgg.caffemodel, dictnet_vgg_deploy.prototxt, dictnet_vgg_labels.txt)"<<std::endl; {
out << " must be in the current directory." << std::endl << std::endl; cout << " Demo of wordspotting CNN for text recognition." << endl;
cout << " Max Jaderberg et al.: Reading Text in the Wild with Convolutional Neural Networks, IJCV 2015"<<std::endl<<std::endl;
out << " Obtaining Caffe Model files in linux shell:"<<std::endl; cout << " Usage: program <input_image>" << endl;
out << " wget http://nicolaou.homouniversalis.org/assets/vgg_text/dictnet_vgg.caffemodel"<<std::endl; cout << " Caffe Model files (dictnet_vgg.caffemodel, dictnet_vgg_deploy.prototxt, dictnet_vgg_labels.txt)"<<endl;
out << " wget http://nicolaou.homouniversalis.org/assets/vgg_text/dictnet_vgg_deploy.prototxt"<<std::endl; cout << " must be in the current directory." << endl << endl;
out << " wget http://nicolaou.homouniversalis.org/assets/vgg_text/dictnet_vgg_labels.txt"<<std::endl<<std::endl;
return out.str();
}
inline bool fileExists (std::string filename) { cout << " Obtaining Caffe Model files in linux shell:"<<endl;
std::ifstream f(filename.c_str()); cout << " wget http://nicolaou.homouniversalis.org/assets/vgg_text/dictnet_vgg.caffemodel"<<endl;
return f.good(); cout << " wget http://nicolaou.homouniversalis.org/assets/vgg_text/dictnet_vgg_deploy.prototxt"<<endl;
cout << " wget http://nicolaou.homouniversalis.org/assets/vgg_text/dictnet_vgg_labels.txt"<<endl<<endl;
} }
int main(int argc, const char * argv[])
int main(int argc, const char * argv[]){ {
const int USE_GPU=0; if (argc != 2)
{
if (argc < 3){ printHelp();
std::cout<<getHelpStr(argv[0]);
std::cout<<"Insufiecient parameters. Aborting!"<<std::endl;
exit(1); exit(1);
} }
if (!fileExists("dictnet_vgg.caffemodel") || Mat image = imread(argv[1], IMREAD_GRAYSCALE);
!fileExists("dictnet_vgg_deploy.prototxt") ||
!fileExists("dictnet_vgg_labels.txt")){
std::cout<<getHelpStr(argv[0]);
std::cout<<"Model files not found in the current directory. Aborting!"<<std::endl;
exit(1);
}
if (fileExists(argv[1])){ cout << "Read image (" << argv[1] << "): " << image.size << ", channels: " << image.channels() << ", depth: " << image.depth() << endl;
std::cout<<getHelpStr(argv[0]);
std::cout<<"Output file must not exist. Aborting!"<<std::endl;
exit(1);
}
std::vector<cv::Mat> imageList; if (image.empty())
for(int imageIdx=2;imageIdx<argc;imageIdx++){ {
if (fileExists(argv[imageIdx])){ printHelp();
imageList.push_back(cv::imread(cv::String(argv[imageIdx]))); exit(1);
}else{
std::cout<<getHelpStr(argv[0]);
std::cout<<argv[imageIdx]<<" doesn't exist. Aborting";
}
} }
cv::Ptr<cv::text::DictNet> cnn=cv::text::DictNet::create(
"dictnet_vgg_deploy.prototxt","dictnet_vgg.caffemodel",100,USE_GPU);
cv::Ptr<cv::text::OCRHolisticWordRecognizer> wordSpotter= Ptr<OCRHolisticWordRecognizer> wordSpotter = OCRHolisticWordRecognizer::create("dictnet_vgg_deploy.prototxt", "dictnet_vgg.caffemodel", "dictnet_vgg_labels.txt");
cv::text::OCRHolisticWordRecognizer::create(cnn,"dictnet_vgg_labels.txt");
std::vector<cv::String> wordList; std::string word;
std::vector<double> outProbabillities; vector<float> confs;
wordSpotter->recogniseImageBatch(imageList,wordList,outProbabillities); wordSpotter->run(image, word, 0, 0, &confs);
std::ofstream out; cout << "Detected word: '" << word << "', confidence: " << confs[0] << endl;
out.open(argv[1]);
for(int imgIdx=0;imgIdx<int(imageList.size());imgIdx++){
out<<argv[imgIdx+2]<<","<<wordList[imgIdx]<<","<<outProbabillities[imgIdx]<<std::endl;
}
out.close();
} }
#!/usr/bin/env python
import cv2
import sys
import os.path
# State shared between the mouse callback and the main script:
#   refPt        - corner points of the current selection
#   cropping     - True while the left button is held down
#   image        - the loaded input image
#   drawImage    - copy of the image carrying the overlay that is displayed
#   dictNet      - the classifier instance
#   wordSpotter  - the word recognizer built on top of dictNet
refPt, cropping = [], False
image = drawImage = dictNet = wordSpotter = None
def mouseCallback(event, x, y, flags, param):
    """Mouse handler: lets the user drag a rectangle and recognises the word inside it.

    On left-button press the start corner is stored. On left-button release the
    selected region of the global ``image`` is passed to
    ``wordSpotter.recogniseImage`` and the result is drawn on a fresh copy of the
    image, which is then shown in the "Select A Region" window.

    :param event: OpenCV mouse event code.
    :param x: x coordinate of the event.
    :param y: y coordinate of the event.
    :param flags: unused.
    :param param: unused.
    """
    # grab references to the global variables
    global refPt, cropping, wordSpotter, drawImage, image

    if event == cv2.EVENT_LBUTTONDOWN:
        # remember where the drag started
        refPt = [(x, y)]
        cropping = True
    elif event == cv2.EVENT_LBUTTONUP:
        # A release without a matching press carries no start corner: ignore it.
        if not cropping or len(refPt) != 1:
            return
        cropping = False

        # Order the corners so the drag direction does not matter, and clamp them
        # to the image so negative coordinates cannot wrap around in the slice.
        height, width = image.shape[:2]
        startX, startY = refPt[0]
        left, right = sorted((startX, x))
        top, bottom = sorted((startY, y))
        left, right = max(left, 0), min(right, width)
        top, bottom = max(top, 0), min(bottom, height)
        refPt = [(left, top), (right, bottom)]

        # A plain click (or a drag entirely outside the image) selects nothing.
        if right <= left or bottom <= top:
            return

        roi = image[top:bottom, left:right]
        res = wordSpotter.recogniseImage(roi)

        # draw the selection and the "<word>:<confidence>" label on a fresh copy
        drawImage = image.copy()
        cv2.rectangle(drawImage, refPt[0], refPt[1], (0, 255, 0), 2)
        cv2.putText(drawImage, "%s:%f" % (res[0], res[1]), refPt[0], cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 255), 2)
        cv2.imshow("Select A Region", drawImage)
if __name__ == '__main__':
    # Interactive demo: load an image, let the user select regions with the mouse
    # and show the recognised word for each selection. Quit with 'q' or 'Q'.
    USEGPU = False
    helpStr = """Usage: """ + sys.argv[0] + """ IMAGE_FILENAME
Press 'q' or 'Q' exit
The modelFiles must be available in the current directory.
In linux shell they can be downloaded (~2GB) with the following commands:
wget http://nicolaou.homouniversalis.org/assets/vgg_text/dictnet_vgg.caffemodel
wget http://nicolaou.homouniversalis.org/assets/vgg_text/dictnet_vgg_deploy.prototxt
wget http://nicolaou.homouniversalis.org/assets/vgg_text/dictnet_vgg_labels.txt
"""
    # print(...) with a single argument behaves the same on Python 2 and Python 3.
    if (len(sys.argv) != 2) or not os.path.isfile(sys.argv[1]):
        print(helpStr)
        print('No image file given Aborting!')
        sys.exit(1)

    modelFiles = ('dictnet_vgg_deploy.prototxt',
                  'dictnet_vgg.caffemodel',
                  'dictnet_vgg_labels.txt')
    if not all(os.path.isfile(name) for name in modelFiles):
        print(helpStr)
        print('Model files not present, Aborting!')
        sys.exit(1)

    dictNet = cv2.text.DictNet_create('./dictnet_vgg_deploy.prototxt', './dictnet_vgg.caffemodel', 100, USEGPU)
    wordSpotter = cv2.text.OCRHolisticWordRecognizer_create(dictNet, "./dictnet_vgg_labels.txt")

    image = cv2.imread(sys.argv[1])
    # cv2.imread returns None for files it cannot decode; fail with a clear message
    # instead of an AttributeError on image.copy().
    if image is None:
        print(helpStr)
        print('Could not read image file, Aborting!')
        sys.exit(1)
    drawImage = image.copy()

    cv2.namedWindow("Select A Region")
    cv2.setMouseCallback("Select A Region", mouseCallback)
    while True:
        # drawImage is replaced by the mouse callback after every selection
        cv2.imshow("Select A Region", drawImage)
        key = cv2.waitKey(1) & 0xFF
        # if the 'q' key is pressed, break from the loop
        if key == ord("q") or key == ord("Q"):
            break
    cv2.destroyAllWindows()
#include "precomp.hpp" #include "precomp.hpp"
#include "opencv2/imgproc.hpp" #include "opencv2/imgproc.hpp"
#include "opencv2/core.hpp" #include "opencv2/core.hpp"
#include "opencv2/dnn.hpp"
#include <iostream>
#include <fstream> #include <fstream>
#include <sstream>
#include <queue>
#include <algorithm>
#include <iosfwd>
#include <memory>
#include <string>
#include <utility>
#include <vector>
//should this be moved elsewhere?
//In precomp.hpp It doesn't work
#ifdef HAVE_CAFFE
#include "caffe/caffe.hpp"
#endif
using namespace std;
namespace cv { namespace text { namespace cv { namespace text {
//Maybe OpenCV has a routine better suited class OCRHolisticWordRecognizerImpl : public OCRHolisticWordRecognizer
inline bool fileExists (String filename) { {
std::ifstream f(filename.c_str());
return f.good();
}
// Caffe-backed implementation of DictNet.
// Every Caffe call is guarded by HAVE_CAFFE; when the module is built without it,
// the public constructor raises an error and classifyMiniBatch does no work.
class DictNetCaffeImpl: public DictNet{
protected:
// Converts `input` into the single-channel float image fed to the network:
// 3-channel images are converted BGR -> gray, CV_8U data is scaled by 1/255,
// the result is resized to inputGeometry_, then mean-centred and divided by
// (standard deviation / 128). Any channel count other than 1 or 3 is an error.
void preprocess(Mat& input,Mat& output){
if(input.channels()==3){
Mat tmpInput;
cvtColor(input,tmpInput,COLOR_BGR2GRAY);
if(input.depth()==CV_8U){
tmpInput.convertTo(output,CV_32FC1,1/255.0);
}else{//Assuming values are at the desired [0,1] range
tmpInput.convertTo(output, CV_32FC1);
}
}else{
if(input.channels()==1){
if(input.depth()==CV_8U){
input.convertTo(output, CV_32FC1,1/255.0);
}else{//Assuming values are at the desired [0,1] range
input.convertTo(output, CV_32FC1);
}
}else{
CV_Error(Error::StsError,"Expecting images with either 1 or 3 channels");
}
}
resize(output,output,this->inputGeometry_);
Scalar dev,mean;
meanStdDev(output,mean,dev);
subtract(output,mean[0],output);
// NOTE(review): dev[0] is 0 for a constant image - confirm the divisor is handled as intended.
divide(output,(dev[0]/128.0),output);
}
// Classifies a list of images containing at most minibatchSz_ images and writes
// one row of class scores per image into outputMat (which must be continuous).
void classifyMiniBatch(std::vector<Mat> inputImageList, Mat outputMat){
CV_Assert(int(inputImageList.size())<=this->minibatchSz_);
CV_Assert(outputMat.isContinuous());
#ifdef HAVE_CAFFE
// Shrink the input blob to the actual number of images in this minibatch.
net_->input_blobs()[0]->Reshape(inputImageList.size(), 1,this->inputGeometry_.height,this->inputGeometry_.width);
net_->Reshape();
float* inputBuffer=net_->input_blobs()[0]->mutable_cpu_data();
float* inputData=inputBuffer;
for(size_t imgNum=0;imgNum<inputImageList.size();imgNum++){
Mat preprocessed;
// Mat header over the blob memory for this image, so copyTo writes straight into the network input.
cv::Mat netInputWraped(this->inputGeometry_.height, this->inputGeometry_.width, CV_32FC1, inputData);
this->preprocess(inputImageList[imgNum],preprocessed);
preprocessed.copyTo(netInputWraped);
// advance to the next image plane inside the blob
inputData+=(this->inputGeometry_.height*this->inputGeometry_.width);
}
this->net_->ForwardPrefilled();
const float* outputNetData=net_->output_blobs()[0]->cpu_data();
float*outputMatData=(float*)(outputMat.data);
// outputSize_ floats per image are copied; relies on the continuity asserted above.
memcpy(outputMatData,outputNetData,sizeof(float)*this->outputSize_*inputImageList.size());
#endif
}
#ifdef HAVE_CAFFE
Ptr<caffe::Net<float> > net_;
#endif
Size inputGeometry_;
int minibatchSz_;//The existence of the assignment operator mandates this to be nonconst
bool gpuBackend_;//The existence of the assignment operator mandates this to be nonconst
int outputSize_;
public:
// Copy constructor: copies the settings and shares the same network pointer.
DictNetCaffeImpl(const DictNetCaffeImpl& dn):inputGeometry_(dn.inputGeometry_),minibatchSz_(dn.minibatchSz_),
gpuBackend_(dn.gpuBackend_),outputSize_(dn.outputSize_){
//Implemented to suppress Visual Studio warning "assignment operator could not be generated"
#ifdef HAVE_CAFFE
this->net_=dn.net_;
#endif
}
// Assignment: copies the settings and shares the same network pointer.
DictNetCaffeImpl& operator=(const DictNetCaffeImpl &dn){
#ifdef HAVE_CAFFE
this->net_=dn.net_;
#endif
this->inputGeometry_=dn.inputGeometry_;
this->minibatchSz_=dn.minibatchSz_;
this->gpuBackend_=dn.gpuBackend_;
this->outputSize_=dn.outputSize_;
return *this;
//Implemented to suppress Visual Studio warning "assignment operator could not be generated"
}
// Loads the architecture and weights files and sizes the input blob for
// maxMinibatchSz images. Requires maxMinibatchSz > 0 and both files to be readable.
// Without HAVE_CAFFE this constructor always raises an error.
DictNetCaffeImpl(String modelArchFilename, String modelWeightsFilename, int maxMinibatchSz, bool useGpu)
:minibatchSz_(maxMinibatchSz), gpuBackend_(useGpu){
CV_Assert(this->minibatchSz_>0);
CV_Assert(fileExists(modelArchFilename));
CV_Assert(fileExists(modelWeightsFilename));
#ifdef HAVE_CAFFE
if(this->gpuBackend_){
caffe::Caffe::set_mode(caffe::Caffe::GPU);
}else{
caffe::Caffe::set_mode(caffe::Caffe::CPU);
}
this->net_.reset(new caffe::Net<float>(modelArchFilename, caffe::TEST));
// Only networks with one single-channel input and one output are supported.
CV_Assert(net_->num_inputs()==1);
CV_Assert(net_->num_outputs()==1);
CV_Assert(this->net_->input_blobs()[0]->channels()==1);
this->net_->CopyTrainedLayersFrom(modelWeightsFilename);
caffe::Blob<float>* inputLayer = this->net_->input_blobs()[0];
// The expected image size is read from the model's input blob.
this->inputGeometry_=Size(inputLayer->width(), inputLayer->height());
inputLayer->Reshape(this->minibatchSz_,1,this->inputGeometry_.height, this->inputGeometry_.width);
net_->Reshape();
this->outputSize_=net_->output_blobs()[0]->channels();
#else
CV_Error(Error::StsError,"Caffe not available during compilation!");
#endif
}
// Classifies a single image by delegating to classifyBatch with a one-element list.
void classify(InputArray image, OutputArray classProbabilities){
std::vector<Mat> inputImageList;
inputImageList.push_back(image.getMat());
classifyBatch(inputImageList,classProbabilities);
}
// Classifies any number of images: allocates a CV_32F matrix with one row per image and
// outputSize_ columns, then processes the images in chunks of at most minibatchSz_.
void classifyBatch(InputArrayOfArrays inputImageList, OutputArray classProbabilities){
std::vector<Mat> allImageVector;
inputImageList.getMatVector(allImageVector);
size_t outputSize=size_t(this->outputSize_);//temporary variable to avoid int to size_t arithmetic
size_t minibatchSize=size_t(this->minibatchSz_);//temporary variable to avoid int to size_t arithmetic
classProbabilities.create(Size(int(outputSize),int(allImageVector.size())),CV_32F);
Mat outputMat = classProbabilities.getMat();
for(size_t imgNum=0;imgNum<allImageVector.size();imgNum+=minibatchSize){
// the last chunk may be shorter than minibatchSize
size_t rangeEnd=imgNum+std::min<size_t>(allImageVector.size()-imgNum,minibatchSize);
std::vector<Mat>::const_iterator from=std::vector<Mat>::const_iterator(allImageVector.begin()+imgNum);
std::vector<Mat>::const_iterator to=std::vector<Mat>::const_iterator(allImageVector.begin()+rangeEnd);
std::vector<Mat> minibatchInput(from,to);
// rowRange yields a header over the matching rows of outputMat, so results land in place
classifyMiniBatch(minibatchInput,outputMat.rowRange(int(imgNum),int(rangeEnd)));
}
}
// Number of values in the row-vector produced per image.
int getOutputSize(){
return this->outputSize_;
}
// Maximum number of images processed in one forward pass.
int getMinibatchSize(){
return this->minibatchSz_;
}
// Whether GPU computation was requested at construction.
bool usingGpu(){
return this->gpuBackend_;
}
// Always OCR_HOLISTIC_BACKEND_CAFFE for this implementation.
int getBackend(){
return OCR_HOLISTIC_BACKEND_CAFFE;
}
};
// Factory for DictNet: builds the Caffe-backed implementation when the Caffe backend is
// requested and raises an error for OCR_HOLISTIC_BACKEND_NONE or any unknown backend.
Ptr<DictNet> DictNet::create(String archFilename,String weightsFilename,int minibatchSz,bool useGpu,int backEnd){
    if(backEnd==OCR_HOLISTIC_BACKEND_CAFFE){
        return Ptr<DictNet>(new DictNetCaffeImpl(archFilename, weightsFilename, minibatchSz, useGpu));
    }
    // Every other backend value is unsupported.
    CV_Error(Error::StsError,"DictNet::create backend not implemented");
    // Kept so that every control path returns a value.
    return Ptr<DictNet>();
}
class OCRHolisticWordRecognizerImpl: public OCRHolisticWordRecognizer{
private: private:
struct NetOutput{ dnn::Net net;
//Auxiliary structure that handles the logic of getting class ids and probabillities from vector<string> words;
//the raw outputs of caffe
int wordIdx;
float probabillity;
static bool sorter(const NetOutput& o1,const NetOutput& o2){//used with std::sort to provide the most probable class
return o1.probabillity>o2.probabillity;
}
static void getOutputs(const float* buffer,int nbOutputs,std::vector<NetOutput>& res){
res.resize(nbOutputs);
for(int k=0;k<nbOutputs;k++){
res[k].wordIdx=k;
res[k].probabillity=buffer[k];
}
std::sort(res.begin(),res.end(),NetOutput::sorter);
}
static void getClassification(const float* buffer,int nbOutputs,int &classNum,double& confidence){
std::vector<NetOutput> tmp;
getOutputs(buffer,nbOutputs,tmp);
classNum=tmp[0].wordIdx;
confidence=tmp[0].probabillity;
}
};
protected:
std::vector<String> labels_;
Ptr<TextImageClassifier> classifier_;
public: public:
OCRHolisticWordRecognizerImpl(Ptr<TextImageClassifier> classifierPtr,String vocabullaryFilename):classifier_(classifierPtr){ OCRHolisticWordRecognizerImpl(const string &archFilename, const string &weightsFilename, const string &wordsFilename)
CV_Assert(fileExists(vocabullaryFilename));//this fails for some rason {
std::ifstream labelsFile(vocabullaryFilename.c_str()); net = dnn::readNetFromCaffe(archFilename, weightsFilename);
if(!labelsFile){ std::ifstream in(wordsFilename.c_str());
CV_Error(Error::StsError,"Could not read Labels from file"); if (!in)
{
CV_Error(Error::StsError, "Could not read Labels from file");
} }
std::string line; std::string line;
while (std::getline(labelsFile, line)){ while (std::getline(in, line))
labels_.push_back(std::string(line)); words.push_back(line);
} CV_Assert(getClassCount() == words.size());
CV_Assert(this->classifier_->getOutputSize()==int(this->labels_.size()));
}
void recogniseImage(InputArray inputImage,CV_OUT String& transcription,CV_OUT double& confidence){
Mat netOutput;
this->classifier_->classify(inputImage,netOutput);
int classNum;
NetOutput::getClassification((float*)(netOutput.data),this->classifier_->getOutputSize(),classNum,confidence);
transcription=this->labels_[classNum];
}
void recogniseImageBatch(InputArrayOfArrays inputImageList,CV_OUT std::vector<String>& transcriptionVec,CV_OUT std::vector<double>& confidenceVec){
Mat netOutput;
this->classifier_->classifyBatch(inputImageList,netOutput);
for(int k=0;k<netOutput.rows;k++){
int classNum;
double confidence;
NetOutput::getClassification((float*)(netOutput.row(k).data),this->classifier_->getOutputSize(),classNum,confidence);
transcriptionVec.push_back(this->labels_[classNum]);
confidenceVec.push_back(confidence);
}
} }
void run(Mat& image, std::string& output_text, std::vector<Rect>* component_rects=NULL, std::vector<std::string>* component_texts=NULL, std::vector<float>* component_confidences=NULL, int component_level=0)
void run(Mat& image, std::string& output_text, std::vector<Rect>* component_rects=NULL, {
std::vector<std::string>* component_texts=NULL, std::vector<float>* component_confidences=NULL, CV_Assert(component_level==OCR_LEVEL_WORD); //Componnents not applicable for word spotting
int component_level=0){
CV_Assert(component_level==OCR_LEVEL_WORD);//Componnents not applicable for word spotting
double confidence; double confidence;
String transcription; output_text = classify(image, confidence);
recogniseImage(image,transcription,confidence);
output_text=transcription.c_str();
if(component_rects!=NULL){ if(component_rects!=NULL){
component_rects->resize(1); component_rects->resize(1);
(*component_rects)[0]=Rect(0,0,image.size().width,image.size().height); (*component_rects)[0]=Rect(0,0,image.size().width,image.size().height);
} }
if(component_texts!=NULL){ if(component_texts!=NULL){
component_texts->resize(1); component_texts->resize(1);
(*component_texts)[0]=transcription.c_str(); (*component_texts)[0] = output_text;
} }
if(component_confidences!=NULL){ if(component_confidences!=NULL){
component_confidences->resize(1); component_confidences->resize(1);
(*component_confidences)[0]=float(confidence); (*component_confidences)[0] = float(confidence);
} }
} }
void run(Mat& image, Mat& mask, std::string& output_text, std::vector<Rect>* component_rects=NULL,
std::vector<std::string>* component_texts=NULL, std::vector<float>* component_confidences=NULL, void run(Mat& image, Mat& mask, std::string& output_text, std::vector<Rect>* component_rects=NULL, std::vector<std::string>* component_texts=NULL, std::vector<float>* component_confidences=NULL, int component_level=0)
int component_level=0){ {
CV_Assert(mask.cols==image.cols && mask.rows== image.rows);//Mask is ignored because the CNN operates on a full image //Mask is ignored because the CNN operates on a full image
this->run(image,output_text,component_rects,component_texts,component_confidences,component_level); CV_Assert(mask.cols == image.cols && mask.rows == image.rows);
this->run(image, output_text, component_rects, component_texts, component_confidences, component_level);
} }
std::vector<String>& getVocabulary(){
return this->labels_; protected:
Size getPerceptiveField() const
{
return Size(100, 32);
}
size_t getClassCount()
{
int id = net.getLayerId("prob");
dnn::MatShape inputShape;
inputShape.push_back(1);
inputShape.push_back(1);
inputShape.push_back(getPerceptiveField().height);
inputShape.push_back(getPerceptiveField().width);
vector<dnn::MatShape> inShapes, outShapes;
net.getLayerShapes(inputShape, id, inShapes, outShapes);
CV_Assert(outShapes.size() == 1 && outShapes[0].size() == 4);
CV_Assert(outShapes[0][0] == 1 && outShapes[0][2] == 1 && outShapes[0][3] == 1);
return outShapes[0][1];
}
string classify(InputArray image, double & conf)
{
CV_Assert(image.channels() == 1 && image.depth() == CV_8U);
Mat resized;
resize(image, resized, getPerceptiveField());
Mat blob = dnn::blobFromImage(resized);
net.setInput(blob, "data");
Mat prob = net.forward("prob");
CV_Assert(prob.dims == 4 && !prob.empty() && prob.size[1] == (int)getClassCount());
int idx[4] = {0};
minMaxIdx(prob, 0, &conf, 0, idx);
CV_Assert(0 <= idx[1] && idx[1] < (int)words.size());
return words[idx[1]];
} }
};
Ptr<OCRHolisticWordRecognizer> OCRHolisticWordRecognizer::create(Ptr<TextImageClassifier> classifierPtr,String vocabullaryFilename ){ };
return Ptr<OCRHolisticWordRecognizer>(new OCRHolisticWordRecognizerImpl(classifierPtr,vocabullaryFilename));
}
Ptr<OCRHolisticWordRecognizer> OCRHolisticWordRecognizer::create(String modelArchFilename, String modelWeightsFilename, String vocabullaryFilename){ Ptr<OCRHolisticWordRecognizer> OCRHolisticWordRecognizer::create(const string &archFilename, const string &weightsFilename, const string &wordsFilename)
Ptr<TextImageClassifier> classifierPtr(new DictNetCaffeImpl(modelArchFilename,modelWeightsFilename, 100,0)); {
return Ptr<OCRHolisticWordRecognizer>(new OCRHolisticWordRecognizerImpl(classifierPtr,vocabullaryFilename)); return makePtr<OCRHolisticWordRecognizerImpl>(archFilename, weightsFilename, wordsFilename);
} }
} } //namespace text namespace cv }} // cv::text::
#ifndef __OPENCV_TEXT_CONFIG_HPP__ #ifndef __OPENCV_TEXT_CONFIG_HPP__
#define __OPENCV_TEXT_CONFIG_HPP__ #define __OPENCV_TEXT_CONFIG_HPP__
// HAVE CAFFE
#cmakedefine HAVE_CAFFE
// HAVE OCR Tesseract // HAVE OCR Tesseract
#cmakedefine HAVE_TESSERACT #cmakedefine HAVE_TESSERACT
#endif #endif
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment