Commit 951e1827 authored by Vladislav Sovrasov's avatar Vladislav Sovrasov

text: cleanup dnn text detection part

parent c33629e0
set(the_description "Text Detection and Recognition") set(the_description "Text Detection and Recognition")
ocv_define_module(text opencv_ml opencv_imgproc opencv_core opencv_features2d opencv_dnn OPTIONAL opencv_highgui WRAP python java)
if(POLICY CMP0023) if(NOT CMAKE_CROSSCOMPILING OR OPENCV_FIND_TESSERACT)
message(STATUS "Explicitly setting policy CMP0023 to OLD") set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} ${CMAKE_CURRENT_SOURCE_DIR}/cmake)
cmake_policy(SET CMP0023 OLD) find_package(Tesseract QUIET)
endif(POLICY CMP0023) if(Tesseract_FOUND)
# Let find_package() also search this directory for Find<Package>.cmake modules.
list(APPEND CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR})

# Dependency list of the module. It is currently only referenced by a disabled
# ocv_define_module(text ${TEXT_DEPS} WRAP python) variant of the call below.
set(TEXT_DEPS
  opencv_ml
  opencv_highgui
  opencv_imgproc
  opencv_core
  opencv_features2d
  opencv_calib3d
)

# Caffe backend: report availability and record it in HAVE_CAFFE.
find_package(Caffe)
if(NOT Caffe_FOUND)
  message(STATUS "Caffe: NO")
else()
  message(STATUS "Caffe: YES")
  set(HAVE_CAFFE 1)
endif()

# Internal dependencies: Protobuf.
find_package(Protobuf)
if(NOT Protobuf_FOUND)
  message(STATUS "Protobuf: NO")
else()
  message(STATUS "Protobuf: YES")
  set(HAVE_PROTOBUF 1)
endif()

# Internal dependencies: Glog.
find_package(Glog)
if(NOT Glog_FOUND)
  message(STATUS "Glog: NO")
else()
  message(STATUS "Glog: YES")
  set(HAVE_GLOG 1)
endif()

# Declare the module; opencv_dnn is an optional dependency.
ocv_define_module(text
  opencv_ml opencv_imgproc opencv_core opencv_features2d opencv_calib3d
  OPTIONAL opencv_dnn
  WRAP python
)

# Tesseract is looked up after the module target has been declared.
find_package(Tesseract)
if(${Tesseract_FOUND})
message(STATUS "Tesseract: YES") message(STATUS "Tesseract: YES")
include_directories(${Tesseract_INCLUDE_DIR}) set(HAVE_TESSERACT 1)
target_link_libraries(opencv_text ${Tesseract_LIBS}) ocv_include_directories(${Tesseract_INCLUDE_DIR})
add_definitions(-DHAVE_TESSERACT) ocv_target_link_libraries(${the_module} ${Tesseract_LIBRARIES})
else() else()
message(STATUS "Tesseract: NO") message(STATUS "Tesseract: NO")
endif() endif()
endif()
# Generate text_config.hpp from the text_config.hpp.in template. @ONLY restricts
# substitution to @VAR@ references, so any ${...} text in the template is kept as-is.
# NOTE(review): the output is written to CMAKE_BINARY_DIR (top of the whole build
# tree), not CMAKE_CURRENT_BINARY_DIR - confirm that directory is on the include
# path of whatever includes text_config.hpp.
configure_file(${CMAKE_CURRENT_SOURCE_DIR}/text_config.hpp.in
${CMAKE_BINARY_DIR}/text_config.hpp @ONLY)
if(HAVE_CAFFE AND HAVE_GLOG AND HAVE_PROTOBUF) ocv_include_directories(${CMAKE_CURRENT_BINARY_DIR})
include_directories(${Caffe_INCLUDE_DIR})
find_package(HDF5 COMPONENTS HL REQUIRED)
include_directories(SYSTEM ${HDF5_INCLUDE_DIRS} ${HDF5_HL_INCLUDE_DIR})
list(APPEND Caffe_LINKER_LIBS ${HDF5_LIBRARIES})
find_package(Boost 1.46 REQUIRED COMPONENTS system thread filesystem)
include_directories(SYSTEM ${Boost_INCLUDE_DIR})
include_directories(SYSTEM ${CUDA_INCLUDE_DIR})
link_directories(SYSTEM ${CUDA_LIBS})
# include_directories(SYSTEM /usr/local/cuda-8.0/targets/x86_64-linux/include/ usr/local/cuda-8.0/include/ /usr/local/cuda-7.5/targets/x86_64-linux/include/ )
#link_directories(SYSTEM /usr/local/cuda-8.0/targets/x86_64-linux/lib/ usr/local/cuda-8.0/lib/ /usr/local/cuda-7.5/targets/x86_64-linux/lib/ /usr/lib/openblas-base/lib /usr/local/cuda-8.0/lib64)
list(APPEND Caffe_LINKER_LIBS ${Boost_LIBRARIES})
target_link_libraries(opencv_text atlas blas ${Caffe_LIBS} ${Glog_LIBS} ${Protobuf_LIBS} ${HDF5_LIBRARIES} ${Boost_LIBRARIES})
add_definitions(-DHAVE_CAFFE)
endif() #HAVE_CAFFE
# Diagnostic output about a possible Caffe conflict.
message(STATUS "TEXT CAFFE SEARCH")
# NOTE(review): the condition below is empty. An if() without arguments is
# expected to evaluate to false, so the else() branch ("TEXT CAFFE CONFLICT")
# would be printed on every configure - confirm which condition was intended.
if()
message(STATUS "TEXT NO CAFFE CONFLICT")
else()
message(STATUS "TEXT CAFFE CONFLICT")
endif()
if(HAVE_opencv_dnn) ocv_add_testdata(samples/ contrib/text
message(STATUS "dnn module found") FILES_MATCHING PATTERN "*.xml" PATTERN "*.xml.gz" REGEX "scenetext[0-9]+.jpg"
add_definitions(-DHAVE_DNN) )
set(HAVE_DNN 1)
else()
message(STATUS "dnn module not found")
endif()
# Caffe package for CNN Triplet training.
# Hand-rolled lookup: a header directory plus the caffe library. Caffe_FOUND is
# set only when both lookups produced a result.
unset(Caffe_FOUND)

find_path(Caffe_INCLUDE_DIR
  NAMES
    caffe/caffe.hpp
    caffe/common.hpp
    caffe/net.hpp
    caffe/proto/caffe.pb.h
    caffe/util/io.hpp
    caffe/vision_layers.hpp
  HINTS /usr/local/include
)

find_library(Caffe_LIBS
  NAMES caffe
  HINTS /usr/local/lib
)

if(Caffe_INCLUDE_DIR AND Caffe_LIBS)
  set(Caffe_FOUND 1)
endif()
# Glog, required for Caffe.
# Only the library is searched for; Glog_FOUND is set when it was located.
unset(Glog_FOUND)

find_library(Glog_LIBS
  NAMES glog
  HINTS /usr/local/lib
)

if(Glog_LIBS)
  set(Glog_FOUND 1)
endif()
# Protobuf, required for Caffe.
# Only the library is searched for; Protobuf_FOUND is set when it was located.
unset(Protobuf_FOUND)

find_library(Protobuf_LIBS
  NAMES protobuf
  HINTS /usr/local/lib
)

if(Protobuf_LIBS)
  set(Protobuf_FOUND 1)
endif()
# Tesseract OCR.
# Looks up the Tesseract header directory, the tesseract library and its
# Leptonica (lept) dependency.
#
# Result variables:
#   Tesseract_INCLUDE_DIR - directory containing tesseract/baseapi.h
#   Tesseract_LIBRARY     - the tesseract library
#   Lept_LIBRARY          - the lept library
#   Tesseract_LIBS        - both libraries, in link order
#   Tesseract_FOUND       - set to 1 only when all three lookups succeeded
unset(Tesseract_FOUND)

find_path(Tesseract_INCLUDE_DIR tesseract/baseapi.h
  HINTS
    /usr/include
    /usr/local/include
)

find_library(Tesseract_LIBRARY
  NAMES tesseract
  HINTS
    /usr/lib
    /usr/local/lib
)

find_library(Lept_LIBRARY
  NAMES lept
  HINTS
    /usr/lib
    /usr/local/lib
)

set(Tesseract_LIBS ${Tesseract_LIBRARY} ${Lept_LIBRARY})

# Test every lookup result on its own. Testing the combined Tesseract_LIBS list
# is unreliable: if() only treats a value as false when the whole string ends in
# "-NOTFOUND", so "Tesseract_LIBRARY-NOTFOUND;/usr/lib/liblept.so" would count as
# true and Tesseract would be reported as found without its main library.
if(Tesseract_INCLUDE_DIR AND Tesseract_LIBRARY AND Lept_LIBRARY)
  set(Tesseract_FOUND 1)
endif()
...@@ -56,74 +56,3 @@ Intro ...@@ -56,74 +56,3 @@ Intro
----- -----
The text module now has text detection and recognition using deep CNNs. The text detector is a deep CNN that takes an image which may contain multiple words and outputs a list of Rects with bounding boxes and the probability of text in each. The text recognizer provides a probability over a given vocabulary for each of these rects. The text module now has text detection and recognition using deep CNNs. The text detector is a deep CNN that takes an image which may contain multiple words and outputs a list of Rects with bounding boxes and the probability of text in each. The text recognizer provides a probability over a given vocabulary for each of these rects.
Two backends are supported: 1) Caffe and 2) opencv-dnn.
Installation of Caffe backend
-----------------------------
* Please note that a custom Caffe based on the SSD branch is required; the link to the custom Caffe is provided below.
The Caffe wrapping backend has the same requirements as Caffe itself.
* Caffe can be built against OpenCV; if the Caffe backend is enabled, a circular dependency arises.
The simplest solution is to build Caffe without support for OpenCV.
* Only the operating systems supported by Caffe are supported by the backend.
The scripts describing the module have been developed on Ubuntu 16.04 and assume such a system.
Other UNIX systems, including OSX, should be easy to adapt to.
Sample script for building Caffe
```bash
#!/bin/bash
# Sample script: fetch the TextBoxes Caffe fork into ~/caffe_inst, configure it
# without OpenCV support, apply a small patch and build it.

# Working directory for the checkout.
SRCROOT="${HOME}/caffe_inst/"
mkdir -p "$SRCROOT"
cd "$SRCROOT"
git clone https://github.com/sghoshcvc/TextBoxes.git
cd TextBoxes
# Start from the example configuration and append the local overrides:
# OpenCV support switched off and the HDF5 "serial" include/library directories.
cat Makefile.config.example > Makefile.config
echo 'USE_OPENCV := 0' >> Makefile.config
echo 'INCLUDE_DIRS += /usr/include/hdf5/serial/' >> Makefile.config
echo 'LIBRARY_DIRS += /usr/lib/x86_64-linux-gnu/hdf5/serial/' >> Makefile.config
# Write a one-hunk patch for net.hpp (adds a virtual destructor to Callback)
# to /tmp/cleanup_caffe.diff.
echo "--- /tmp/caffe/include/caffe/net.hpp 2017-05-28 04:55:47.929623902 +0200
+++ caffe/distribute/include/caffe/net.hpp 2017-05-28 04:51:33.437090768 +0200
@@ -234,6 +234,7 @@
template <typename T>
friend class Net;
+ virtual ~Callback(){}
};
const vector<Callback*>& before_forward() const { return before_forward_; }
void add_before_forward(Callback* value) {
">/tmp/cleanup_caffe.diff
# NOTE(review): patch is run from the TextBoxes root without -p or an explicit
# target file - confirm it actually locates include/caffe/net.hpp.
patch < /tmp/cleanup_caffe.diff
# Build the library, the Python bindings and the "distribute" tree.
make -j 6
make pycaffe
make distribute
```
```bash
#!/bin/bash
# Sample script: configure an OpenCV build against the Caffe built by the
# previous script, with the opencv_contrib modules enabled.
# Expects OPENCV_BUILD_DIR and OPENCV_CONTRIB to be set by the caller.
cd $OPENCV_BUILD_DIR #You must set this
CAFFEROOT="${HOME}/caffe_inst/" #If you used the previous code to compile Caffe in ubuntu 16.04
# NOTE(review): the paths below point at $CAFFEROOT/caffe/distribute, while the
# previous script clones into and builds in $SRCROOT/TextBoxes - confirm the
# directory name before use.
cmake -DCaffe_LIBS:FILEPATH="$CAFFEROOT/caffe/distribute/lib/libcaffe.so" -DBUILD_opencv_ts:BOOL="0" -DBUILD_opencv_dnn:BOOL="0" -DBUILD_opencv_dnn_modern:BOOL="0" -DCaffe_INCLUDE_DIR:PATH="$CAFFEROOT/caffe/distribute/include" -DWITH_MATLAB:BOOL="0" -DBUILD_opencv_cudabgsegm:BOOL="0" -DWITH_QT:BOOL="1" -DBUILD_opencv_cudaoptflow:BOOL="0" -DBUILD_opencv_cudastereo:BOOL="0" -DBUILD_opencv_cudafilters:BOOL="0" -DBUILD_opencv_cudev:BOOL="1" -DOPENCV_EXTRA_MODULES_PATH:PATH="$OPENCV_CONTRIB/modules" ./
```
where `$OPENCV_CONTRIB` is the root directory containing the opencv_contrib modules.
Installation of opencv-dnn backend
----------------------------------
Using opencv-dnn does not require any additional library.
The recent opencv-3.3.0 needs to be built with the extra modules in order to use the text module.
...@@ -5,14 +5,17 @@ endif() ...@@ -5,14 +5,17 @@ endif()
if(NOT Tesseract_FOUND) if(NOT Tesseract_FOUND)
find_path(Tesseract_INCLUDE_DIR tesseract/baseapi.h find_path(Tesseract_INCLUDE_DIR tesseract/baseapi.h
HINTS HINTS
/usr/include
/usr/local/include) /usr/local/include)
find_library(Tesseract_LIBRARY NAMES tesseract find_library(Tesseract_LIBRARY NAMES tesseract
HINTS HINTS
/usr/lib
/usr/local/lib) /usr/local/lib)
find_library(Lept_LIBRARY NAMES lept find_library(Lept_LIBRARY NAMES lept
HINTS HINTS
/usr/lib
/usr/local/lib) /usr/local/lib)
if(Tesseract_INCLUDE_DIR AND Tesseract_LIBRARY AND Lept_LIBRARY) if(Tesseract_INCLUDE_DIR AND Tesseract_LIBRARY AND Lept_LIBRARY)
......
...@@ -93,7 +93,7 @@ grouping horizontally aligned text, and the method proposed by Lluis Gomez and D ...@@ -93,7 +93,7 @@ grouping horizontally aligned text, and the method proposed by Lluis Gomez and D
in @cite Gomez13 @cite Gomez14 for grouping arbitrary oriented text (see erGrouping). in @cite Gomez13 @cite Gomez14 for grouping arbitrary oriented text (see erGrouping).
To see the text detector at work, have a look at the textdetection demo: To see the text detector at work, have a look at the textdetection demo:
<https://github.com/Itseez/opencv_contrib/blob/master/modules/text/samples/textdetection.cpp> <https://github.com/opencv/opencv_contrib/blob/master/modules/text/samples/textdetection.cpp>
@defgroup text_recognize Scene Text Recognition @defgroup text_recognize Scene Text Recognition
@} @}
......
...@@ -65,7 +65,6 @@ component tree of the image. : ...@@ -65,7 +65,6 @@ component tree of the image. :
*/ */
struct CV_EXPORTS ERStat struct CV_EXPORTS ERStat
{ {
public: public:
//! Constructor //! Constructor
explicit ERStat(int level = 256, int pixel = 0, int x = 0, int y = 0); explicit ERStat(int level = 256, int pixel = 0, int x = 0, int y = 0);
......
...@@ -44,12 +44,10 @@ ...@@ -44,12 +44,10 @@
#ifndef __OPENCV_TEXT_OCR_HPP__ #ifndef __OPENCV_TEXT_OCR_HPP__
#define __OPENCV_TEXT_OCR_HPP__ #define __OPENCV_TEXT_OCR_HPP__
#include <opencv2/core.hpp>
#include <vector> #include <vector>
#include <string> #include <string>
#include <iostream>
#include <sstream>
namespace cv namespace cv
{ {
...@@ -91,100 +89,61 @@ enum ocr_engine_mode ...@@ -91,100 +89,61 @@ enum ocr_engine_mode
}; };
//base class BaseOCR declares a common API that would be used in a typical text recognition scenario //base class BaseOCR declares a common API that would be used in a typical text recognition scenario
class CV_EXPORTS_W BaseOCR class CV_EXPORTS_W BaseOCR
{ {
public: public:
virtual ~BaseOCR() {}; virtual ~BaseOCR() {};
virtual void run(Mat& image, std::string& output_text, std::vector<Rect>* component_rects=NULL,
virtual void run(Mat& image, std::string& output_text, std::vector<std::string>* component_texts=NULL, std::vector<float>* component_confidences=NULL,
std::vector<Rect>* component_rects=NULL,
std::vector<std::string>* component_texts=NULL,
std::vector<float>* component_confidences=NULL,
int component_level=0) = 0; int component_level=0) = 0;
virtual void run(Mat& image, Mat& mask, std::string& output_text, std::vector<Rect>* component_rects=NULL,
virtual void run(Mat& image, Mat& mask, std::string& output_text, std::vector<std::string>* component_texts=NULL, std::vector<float>* component_confidences=NULL,
std::vector<Rect>* component_rects=NULL,
std::vector<std::string>* component_texts=NULL,
std::vector<float>* component_confidences=NULL,
int component_level=0) = 0; int component_level=0) = 0;
/** @brief Main functionality of the OCR hierarchy: recognize the text in an image.
 *
 * Convenience overload that forwards to the
 * run(Mat&, std::string&, std::vector<Rect>*, ...) overload with
 * OCR_LEVEL_WORD as the component level. The per-component rectangles, texts
 * and confidences are collected in temporaries and discarded. Subclasses
 * provide default parameters for everything other than the input image.
 *
 * @param image Input image.
 * @return The output_text produced by the forwarded call.
 */
virtual String run(InputArray image)
{
    Mat input = image.getMat();

    std::string recognized;
    std::vector<Rect> rects;
    std::vector<std::string> texts;
    std::vector<float> confidences;

    this->run(input, recognized, &rects, &texts, &confidences, OCR_LEVEL_WORD);
    return recognized;
}
}; };
/** @brief OCRTesseract class provides an interface with the tesseract-ocr API /** @brief OCRTesseract class provides an interface with the tesseract-ocr API (v3.02.02) in C++.
* (v3.02.02) in C++.
Notice that it is compiled only when tesseract-ocr is correctly installed. Notice that it is compiled only when tesseract-ocr is correctly installed.
@note @note
- (C++) An example of OCRTesseract recognition combined with scene text - (C++) An example of OCRTesseract recognition combined with scene text detection can be found
detection can be found at the end_to_end_recognition demo: at the end_to_end_recognition demo:
<https://github.com/Itseez/opencv_contrib/blob/master/modules/text/samples/end_to_end_recognition.cpp> <https://github.com/opencv/opencv_contrib/blob/master/modules/text/samples/end_to_end_recognition.cpp>
- (C++) Another example of OCRTesseract recognition combined with scene - (C++) Another example of OCRTesseract recognition combined with scene text detection can be
text detection can be found at the webcam_demo: found at the webcam_demo:
<https://github.com/Itseez/opencv_contrib/blob/master/modules/text/samples/webcam_demo.cpp> <https://github.com/opencv/opencv_contrib/blob/master/modules/text/samples/webcam_demo.cpp>
*/ */
class CV_EXPORTS_W OCRTesseract : public BaseOCR class CV_EXPORTS_W OCRTesseract : public BaseOCR
{ {
public: public:
/** @brief Recognize text using the tesseract-ocr API. /** @brief Recognize text using the tesseract-ocr API.
Takes image on input and returns recognized text in the output_text Takes image on input and returns recognized text in the output_text parameter. Optionally
parameter. Optionally provides also the Rects for individual text elements provides also the Rects for individual text elements found (e.g. words), and the list of those
found (e.g. words), and the list of those text elements with their text elements with their confidence values.
confidence values.
@param image Input image CV_8UC1 or CV_8UC3 @param image Input image CV_8UC1 or CV_8UC3
@param output_text Output text of the tesseract-ocr. @param output_text Output text of the tesseract-ocr.
@param component_rects If provided the method will output a list of Rects for the individual
@param component_rects If provided the method will output a list of Rects text elements found (e.g. words or text lines).
for the individual text elements found (e.g. words or text lines). @param component_texts If provided the method will output a list of text strings for the
recognition of individual text elements found (e.g. words or text lines).
@param component_texts If provided the method will output a list of text @param component_confidences If provided the method will output a list of confidence values
strings for the recognition of individual text elements found (e.g. words or for the recognition of individual text elements found (e.g. words or text lines).
text lines). @param component_level OCR_LEVEL_WORD (by default), or OCR_LEVEL_TEXTLINE.
@param component_confidences If provided the method will output a list of
confidence values for the recognition of individual text elements found
(e.g. words or text lines).
@param component_level OCR_LEVEL_WORD (by default), or OCR_LEVEL_TEXT_LINE.
*/ */
using BaseOCR::run; virtual void run(Mat& image, std::string& output_text, std::vector<Rect>* component_rects=NULL,
virtual void run (Mat& image, std::string& output_text, std::vector<std::string>* component_texts=NULL, std::vector<float>* component_confidences=NULL,
std::vector<Rect>* component_rects=NULL,
std::vector<std::string>* component_texts=NULL,
std::vector<float>* component_confidences=NULL,
int component_level=0); int component_level=0);
virtual void run (Mat& image, Mat& mask, std::string& output_text, virtual void run(Mat& image, Mat& mask, std::string& output_text, std::vector<Rect>* component_rects=NULL,
std::vector<Rect>* component_rects=NULL, std::vector<std::string>* component_texts=NULL, std::vector<float>* component_confidences=NULL,
std::vector<std::string>* component_texts=NULL,
std::vector<float>* component_confidences=NULL,
int component_level=0); int component_level=0);
// aliases for scripting // aliases for scripting
CV_WRAP String run (InputArray image, int min_confidence, CV_WRAP String run(InputArray image, int min_confidence, int component_level=0);
int component_level=0);
CV_WRAP String run(InputArray image, InputArray mask, CV_WRAP String run(InputArray image, InputArray mask, int min_confidence, int component_level=0);
int min_confidence, int component_level=0);
CV_WRAP virtual void setWhiteList(const String& char_whitelist) = 0; CV_WRAP virtual void setWhiteList(const String& char_whitelist) = 0;
...@@ -205,7 +164,6 @@ public: ...@@ -205,7 +164,6 @@ public:
*/ */
CV_WRAP static Ptr<OCRTesseract> create(const char* datapath=NULL, const char* language=NULL, CV_WRAP static Ptr<OCRTesseract> create(const char* datapath=NULL, const char* language=NULL,
const char* char_whitelist=NULL, int oem=OEM_DEFAULT, int psmode=PSM_AUTO); const char* char_whitelist=NULL, int oem=OEM_DEFAULT, int psmode=PSM_AUTO);
}; };
...@@ -225,19 +183,19 @@ enum classifier_type ...@@ -225,19 +183,19 @@ enum classifier_type
/** @brief OCRHMMDecoder class provides an interface for OCR using Hidden Markov Models. /** @brief OCRHMMDecoder class provides an interface for OCR using Hidden Markov Models.
@note
* @note - (C++) An example on using OCRHMMDecoder recognition combined with scene text detection can
* - (C++) An example on using OCRHMMDecoder recognition combined with scene be found at the webcam_demo sample:
* text detection can be found at the webcam_demo sample: <https://github.com/opencv/opencv_contrib/blob/master/modules/text/samples/webcam_demo.cpp>
* <https://github.com/Itseez/opencv_contrib/blob/master/modules/text/samples/webcam_demo.cpp>
*/ */
class CV_EXPORTS_W OCRHMMDecoder : public BaseOCR { class CV_EXPORTS_W OCRHMMDecoder : public BaseOCR
public: {
public:
/** @brief Callback with the character classifier is made a class. /** @brief Callback with the character classifier is made a class.
* This way it hides the feature extractor and the classifier itself, so This way it hides the feature extractor and the classifier itself, so developers can write
* developers can write their own OCR code. their own OCR code.
The default character classifier and feature extractor can be loaded using the utility function The default character classifier and feature extractor can be loaded using the utility function
loadOCRHMMClassifierNM and KNN model provided in loadOCRHMMClassifierNM and KNN model provided in
...@@ -246,120 +204,92 @@ class CV_EXPORTS_W OCRHMMDecoder : public BaseOCR { ...@@ -246,120 +204,92 @@ class CV_EXPORTS_W OCRHMMDecoder : public BaseOCR {
class CV_EXPORTS_W ClassifierCallback class CV_EXPORTS_W ClassifierCallback
{ {
public: public:
virtual ~ClassifierCallback() { } virtual ~ClassifierCallback() { }
/** @brief The character classifier must return a (ranked list of) /** @brief The character classifier must return a (ranked list of) class(es) id('s)
* class(es) id('s)
* @param image Input image CV_8UC1 or CV_8UC3 with a single letter.
* @param out_class The classifier returns the character class
* categorical label, or list of class labels, to which the input image
* corresponds.
* @param out_confidence The classifier returns the probability of the @param image Input image CV_8UC1 or CV_8UC3 with a single letter.
* input image corresponding to each classes in out_class. @param out_class The classifier returns the character class categorical label, or list of
class labels, to which the input image corresponds.
@param out_confidence The classifier returns the probability of the input image
corresponding to each classes in out_class.
*/ */
virtual void eval (InputArray image, std::vector<int>& out_class, virtual void eval( InputArray image, std::vector<int>& out_class, std::vector<double>& out_confidence);
std::vector<double>& out_confidence);
}; };
public:
/** @brief Recognize text using HMM. /** @brief Recognize text using HMM.
* Takes binary image on input and returns recognized text in the output_text Takes binary image on input and returns recognized text in the output_text parameter. Optionally
* parameter. Optionally provides also the Rects for individual text elements provides also the Rects for individual text elements found (e.g. words), and the list of those
* found (e.g. words), and the list of those text elements with their text elements with their confidence values.
* confidence values.
* @param image Input binary image CV_8UC1 with a single text line (or word). @param image Input binary image CV_8UC1 with a single text line (or word).
* @param output_text Output text. Most likely character sequence found by @param output_text Output text. Most likely character sequence found by the HMM decoder.
* the HMM decoder.
* @param component_rects If provided the method will output a list of Rects @param component_rects If provided the method will output a list of Rects for the individual
* for the individual text elements found (e.g. words). text elements found (e.g. words).
* @param component_texts If provided the method will output a list of text @param component_texts If provided the method will output a list of text strings for the
* strings for the recognition of individual text elements found (e.g. words). recognition of individual text elements found (e.g. words).
* @param component_confidences If provided the method will output a list of @param component_confidences If provided the method will output a list of confidence values
* confidence values for the recognition of individual text elements found for the recognition of individual text elements found (e.g. words).
* (e.g. words).
* @param component_level Only OCR_LEVEL_WORD is supported. @param component_level Only OCR_LEVEL_WORD is supported.
*/ */
using BaseOCR::run; virtual void run(Mat& image, std::string& output_text, std::vector<Rect>* component_rects=NULL,
virtual void run (Mat& image, std::string& output_text, std::vector<std::string>* component_texts=NULL, std::vector<float>* component_confidences=NULL,
std::vector<Rect>* component_rects=NULL,
std::vector<std::string>* component_texts=NULL,
std::vector<float>* component_confidences=NULL,
int component_level=0); int component_level=0);
/** @brief Recognize text using HMM. /** @brief Recognize text using HMM.
* Takes an image and a mask (where each connected component corresponds to a Takes an image and a mask (where each connected component corresponds to a segmented character)
* segmented character) on input and returns recognized text in the on input and returns recognized text in the output_text parameter. Optionally
* output_text parameter. Optionally provides also the Rects for individual provides also the Rects for individual text elements found (e.g. words), and the list of those
* text elements found (e.g. words), and the list of those text elements with text elements with their confidence values.
* their confidence values.
* @param image Input image CV_8UC1 or CV_8UC3 with a single text line
* (or word).
* @param mask Input binary image CV_8UC1 same size as input image. Each @param image Input image CV_8UC1 or CV_8UC3 with a single text line (or word).
* connected component in mask corresponds to a segmented character in the @param mask Input binary image CV_8UC1 same size as input image. Each connected component in mask corresponds to a segmented character in the input image.
* input image.
* @param output_text Output text. Most likely character sequence found by @param output_text Output text. Most likely character sequence found by the HMM decoder.
* the HMM decoder.
* @param component_rects If provided the method will output a list of Rects @param component_rects If provided the method will output a list of Rects for the individual
* for the individual text elements found (e.g. words). text elements found (e.g. words).
* @param component_texts If provided the method will output a list of text @param component_texts If provided the method will output a list of text strings for the
* strings for the recognition of individual text elements found (e.g. words). recognition of individual text elements found (e.g. words).
* @param component_confidences If provided the method will output a list of @param component_confidences If provided the method will output a list of confidence values
* confidence values for the recognition of individual text elements found for the recognition of individual text elements found (e.g. words).
* (e.g. words).
* @param component_level Only OCR_LEVEL_WORD is supported. @param component_level Only OCR_LEVEL_WORD is supported.
*/ */
virtual void run(Mat& image, Mat& mask, std::string& output_text, virtual void run(Mat& image, Mat& mask, std::string& output_text, std::vector<Rect>* component_rects=NULL,
std::vector<Rect>* component_rects=NULL, std::vector<std::string>* component_texts=NULL, std::vector<float>* component_confidences=NULL,
std::vector<std::string>* component_texts=NULL,
std::vector<float>* component_confidences=NULL,
int component_level=0); int component_level=0);
// aliases for scripting // aliases for scripting
CV_WRAP String run(InputArray image, CV_WRAP String run(InputArray image, int min_confidence, int component_level=0);
int min_confidence,
int component_level=0);
CV_WRAP String run(InputArray image, CV_WRAP String run(InputArray image, InputArray mask, int min_confidence, int component_level=0);
InputArray mask,
int min_confidence,
int component_level=0);
/** @brief Creates an instance of the OCRHMMDecoder class. Initializes /** @brief Creates an instance of the OCRHMMDecoder class. Initializes HMMDecoder.
* HMMDecoder.
* @param classifier The character classifier with built in feature @param classifier The character classifier with built in feature extractor.
* extractor.
* @param vocabulary The language vocabulary (chars when ascii english text) @param vocabulary The language vocabulary (chars when ascii english text). vocabulary.size()
* . vocabulary.size() must be equal to the number of classes of the must be equal to the number of classes of the classifier.
* classifier.
* @param transition_probabilities_table Table with transition probabilities @param transition_probabilities_table Table with transition probabilities between character
* between character pairs. cols == rows == vocabulary.size(). pairs. cols == rows == vocabulary.size().
* @param emission_probabilities_table Table with observation emission @param emission_probabilities_table Table with observation emission probabilities. cols ==
* probabilities. cols == rows == vocabulary.size(). rows == vocabulary.size().
* @param mode HMM Decoding algorithm. Only OCR_DECODER_VITERBI is available @param mode HMM Decoding algorithm. Only OCR_DECODER_VITERBI is available for the moment
* for the moment (<http://en.wikipedia.org/wiki/Viterbi_algorithm>). (<http://en.wikipedia.org/wiki/Viterbi_algorithm>).
*/ */
static Ptr<OCRHMMDecoder> create(const Ptr<OCRHMMDecoder::ClassifierCallback> classifier,// The character classifier with built in feature extractor static Ptr<OCRHMMDecoder> create(const Ptr<OCRHMMDecoder::ClassifierCallback> classifier,// The character classifier with built in feature extractor
const std::string& vocabulary, // The language vocabulary (chars when ASCII English text) const std::string& vocabulary, // The language vocabulary (chars when ASCII English text)
// size() must be equal to the number of classes // size() must be equal to the number of classes
...@@ -402,11 +332,9 @@ protected: ...@@ -402,11 +332,9 @@ protected:
decoder_mode mode; decoder_mode mode;
}; };
/** @brief Allow to implicitly load the default character classifier when /** @brief Allow to implicitly load the default character classifier when creating an OCRHMMDecoder object.
* creating an OCRHMMDecoder object.
@param filename The XML or YAML file with the classifier model (e.g.OCRHMM_knn_model_data.xml)
@param filename The XML or YAML file with the classifier model (e.g. OCRHMM_knn_model_data.xml)
The KNN default classifier is based in the scene text recognition method proposed by Lukás Neumann & The KNN default classifier is based in the scene text recognition method proposed by Lukás Neumann &
Jiri Matas in [Neumann11b]. Basically, the region (contour) in the input image is normalized to a Jiri Matas in [Neumann11b]. Basically, the region (contour) in the input image is normalized to a
...@@ -416,16 +344,11 @@ using a KNN model trained with synthetic data of rendered characters with differ ...@@ -416,16 +344,11 @@ using a KNN model trained with synthetic data of rendered characters with differ
types. types.
@deprecated loadOCRHMMClassifier instead @deprecated loadOCRHMMClassifier instead
*/ */
CV_EXPORTS_W Ptr<OCRHMMDecoder::ClassifierCallback> loadOCRHMMClassifierNM (
const String& filename);
/** @brief Allow to implicitly load the default character classifier when CV_EXPORTS_W Ptr<OCRHMMDecoder::ClassifierCallback> loadOCRHMMClassifierNM(const String& filename);
* creating an OCRHMMDecoder object.
@param filename The XML or YAML file with the classifier model (e.g.OCRBeamSearch_CNN_model_data.xml.gz)
/** @brief Allow to implicitly load the default character classifier when creating an OCRHMMDecoder object.
@param filename The XML or YAML file with the classifier model (e.g. OCRBeamSearch_CNN_model_data.xml.gz) @param filename The XML or YAML file with the classifier model (e.g. OCRBeamSearch_CNN_model_data.xml.gz)
...@@ -435,10 +358,8 @@ a linear classifier. It is applied to the input image in a sliding window fashio ...@@ -435,10 +358,8 @@ a linear classifier. It is applied to the input image in a sliding window fashio
at each window location. at each window location.
@deprecated use loadOCRHMMClassifier instead @deprecated use loadOCRHMMClassifier instead
*/ */
CV_EXPORTS_W Ptr<OCRHMMDecoder::ClassifierCallback> loadOCRHMMClassifierCNN ( CV_EXPORTS_W Ptr<OCRHMMDecoder::ClassifierCallback> loadOCRHMMClassifierCNN(const String& filename);
const String& filename);
/** @brief Allow to implicitly load the default character classifier when creating an OCRHMMDecoder object. /** @brief Allow to implicitly load the default character classifier when creating an OCRHMMDecoder object.
...@@ -450,63 +371,48 @@ CV_EXPORTS_W Ptr<OCRHMMDecoder::ClassifierCallback> loadOCRHMMClassifierCNN ( ...@@ -450,63 +371,48 @@ CV_EXPORTS_W Ptr<OCRHMMDecoder::ClassifierCallback> loadOCRHMMClassifierCNN (
CV_EXPORTS_W Ptr<OCRHMMDecoder::ClassifierCallback> loadOCRHMMClassifier(const String& filename, int classifier); CV_EXPORTS_W Ptr<OCRHMMDecoder::ClassifierCallback> loadOCRHMMClassifier(const String& filename, int classifier);
//! @} //! @}
/** @brief Utility function to create a tailored language model transitions table from a given list of words (lexicon). /** @brief Utility function to create a tailored language model transitions table from a given list of words (lexicon).
* *
* @param vocabulary The language vocabulary (chars when ASCII English text). * @param vocabulary The language vocabulary (chars when ASCII English text).
* *
* @param lexicon The list of words that are expected to be found in a particular image. * @param lexicon The list of words that are expected to be found in a particular image.
*
* @param transition_probabilities_table Output table with transition * @param transition_probabilities_table Output table with transition probabilities between character pairs. cols == rows == vocabulary.size().
* probabilities between character pairs. cols == rows == vocabulary.size(). *
* The function calculates frequency statistics of character pairs from the given lexicon and fills the output transition_probabilities_table with them. The transition_probabilities_table can be used as input in the OCRHMMDecoder::create() and OCRBeamSearchDecoder::create() methods.
* The function calculates frequency statistics of character pairs from the given
* lexicon and fills the output transition_probabilities_table with them. The
* transition_probabilities_table can be used as input in the
* OCRHMMDecoder::create() and OCRBeamSearchDecoder::create() methods.
* @note * @note
* - (C++) An alternative would be to load the default generic language * - (C++) An alternative would be to load the default generic language transition table provided in the text module samples folder (created from ispell 42869 english words list) :
* transition table provided in the text module samples folder (created * <https://github.com/opencv/opencv_contrib/blob/master/modules/text/samples/OCRHMM_transitions_table.xml>
* from ispell 42869 english words list) :
* <https://github.com/Itseez/opencv_contrib/blob/master/modules/text/samples/OCRHMM_transitions_table.xml>
**/ **/
CV_EXPORTS void createOCRHMMTransitionsTable ( CV_EXPORTS void createOCRHMMTransitionsTable(std::string& vocabulary, std::vector<std::string>& lexicon, OutputArray transition_probabilities_table);
std::string& vocabulary, std::vector<std::string>& lexicon,
OutputArray transition_probabilities_table); CV_EXPORTS_W Mat createOCRHMMTransitionsTable(const String& vocabulary, std::vector<cv::String>& lexicon);
CV_EXPORTS_W Mat createOCRHMMTransitionsTable (
const String& vocabulary, std::vector<cv::String>& lexicon);
/* OCR BeamSearch Decoder */ /* OCR BeamSearch Decoder */
/** @brief OCRBeamSearchDecoder class provides an interface for OCR using Beam /** @brief OCRBeamSearchDecoder class provides an interface for OCR using Beam Search algorithm.
* Search algorithm.
@note @note
- (C++) An example on using OCRBeamSearchDecoder recognition combined with - (C++) An example on using OCRBeamSearchDecoder recognition combined with scene text detection can
scene text detection can be found at the demo sample: be found at the demo sample:
<https://github.com/Itseez/opencv_contrib/blob/master/modules/text/samples/word_recognition.cpp> <https://github.com/opencv/opencv_contrib/blob/master/modules/text/samples/word_recognition.cpp>
*/ */
class CV_EXPORTS_W OCRBeamSearchDecoder : public BaseOCR
{
/* Forward declaration of class that can be used to generate an OCRBeamSearchDecoder::ClassifierCallbac */ public:
class TextImageClassifier;
class CV_EXPORTS_W OCRBeamSearchDecoder : public BaseOCR{
public:
/** @brief Callback with the character classifier is made a class. /** @brief Callback with the character classifier is made a class.
* This way it hides the feature extractor and the classifier itself, so This way it hides the feature extractor and the classifier itself, so developers can write
* developers can write their own OCR code. their own OCR code.
* The default character classifier and feature extractor can be loaded The default character classifier and feature extractor can be loaded using the utility function
* using the utility function loadOCRBeamSearchClassifierCNN with all its loadOCRBeamSearchClassifierCNN with all its parameters provided in
* parameters provided in <https://github.com/opencv/opencv_contrib/blob/master/modules/text/samples/OCRBeamSearch_CNN_model_data.xml.gz>.
* <https://github.com/Itseez/opencv_contrib/blob/master/modules/text/samples/OCRBeamSearch_CNN_model_data.xml.gz>.
*/ */
class CV_EXPORTS_W ClassifierCallback{ class CV_EXPORTS_W ClassifierCallback
{
public: public:
virtual ~ClassifierCallback() { } virtual ~ClassifierCallback() { }
/** @brief The character classifier must return a (ranked list of) class(es) id('s) /** @brief The character classifier must return a (ranked list of) class(es) id('s)
...@@ -519,8 +425,8 @@ class CV_EXPORTS_W OCRBeamSearchDecoder : public BaseOCR{ ...@@ -519,8 +425,8 @@ class CV_EXPORTS_W OCRBeamSearchDecoder : public BaseOCR{
*/ */
virtual void eval( InputArray image, std::vector< std::vector<double> >& recognition_probabilities, std::vector<int>& oversegmentation ); virtual void eval( InputArray image, std::vector< std::vector<double> >& recognition_probabilities, std::vector<int>& oversegmentation );
virtual int getWindowSize() {return 0;} int getWindowSize() {return 0;}
virtual int getStepSize() {return 0;} int getStepSize() {return 0;}
}; };
public: public:
...@@ -545,7 +451,6 @@ public: ...@@ -545,7 +451,6 @@ public:
@param component_level Only OCR_LEVEL_WORD is supported. @param component_level Only OCR_LEVEL_WORD is supported.
*/ */
using BaseOCR::run;
virtual void run(Mat& image, std::string& output_text, std::vector<Rect>* component_rects=NULL, virtual void run(Mat& image, std::string& output_text, std::vector<Rect>* component_rects=NULL,
std::vector<std::string>* component_texts=NULL, std::vector<float>* component_confidences=NULL, std::vector<std::string>* component_texts=NULL, std::vector<float>* component_confidences=NULL,
int component_level=0); int component_level=0);
...@@ -577,7 +482,6 @@ public: ...@@ -577,7 +482,6 @@ public:
@param beam_size Size of the beam in Beam Search algorithm. @param beam_size Size of the beam in Beam Search algorithm.
*/ */
static Ptr<OCRBeamSearchDecoder> create(const Ptr<OCRBeamSearchDecoder::ClassifierCallback> classifier,// The character classifier with built in feature extractor static Ptr<OCRBeamSearchDecoder> create(const Ptr<OCRBeamSearchDecoder::ClassifierCallback> classifier,// The character classifier with built in feature extractor
const std::string& vocabulary, // The language vocabulary (chars when ASCII English text) const std::string& vocabulary, // The language vocabulary (chars when ASCII English text)
// size() must be equal to the number of classes // size() must be equal to the number of classes
...@@ -598,29 +502,10 @@ public: ...@@ -598,29 +502,10 @@ public:
int mode = OCR_DECODER_VITERBI, // HMM Decoding algorithm (only Viterbi for the moment) int mode = OCR_DECODER_VITERBI, // HMM Decoding algorithm (only Viterbi for the moment)
int beam_size = 500); // Size of the beam in Beam Search algorithm int beam_size = 500); // Size of the beam in Beam Search algorithm
/** @brief Creates an instance of the OCRBeamSearchDecoder class. Initializes HMMDecoder from the specified path. /** @brief Creates an instance of the OCRBeamSearchDecoder class. Initializes HMMDecoder from the specified path.
@overload @overload
@param filename path to a character classifier file
@param vocabulary The language vocabulary (chars when ASCII English text). vocabulary.size()
must be equal to the number of classes of the classifier..
@param transition_probabilities_table Table with transition probabilities between character
pairs. cols == rows == vocabulary.size().
@param emission_probabilities_table Table with observation emission probabilities. cols ==
rows == vocabulary.size().
@param mode HMM Decoding algorithm (only Viterbi for the moment)
@param beam_size Size of the beam in Beam Search algorithm
*/ */
CV_WRAP static Ptr<OCRBeamSearchDecoder> create(const String& filename, // The character classifier file CV_WRAP static Ptr<OCRBeamSearchDecoder> create(const String& filename, // The character classifier file
const String& vocabulary, // The language vocabulary (chars when ASCII English text) const String& vocabulary, // The language vocabulary (chars when ASCII English text)
...@@ -631,7 +516,6 @@ public: ...@@ -631,7 +516,6 @@ public:
// cols == rows == vocabulary.size() // cols == rows == vocabulary.size()
int mode = OCR_DECODER_VITERBI, // HMM Decoding algorithm (only Viterbi for the moment) int mode = OCR_DECODER_VITERBI, // HMM Decoding algorithm (only Viterbi for the moment)
int beam_size = 500); int beam_size = 500);
protected: protected:
Ptr<OCRBeamSearchDecoder::ClassifierCallback> classifier; Ptr<OCRBeamSearchDecoder::ClassifierCallback> classifier;
...@@ -656,402 +540,6 @@ CV_EXPORTS_W Ptr<OCRBeamSearchDecoder::ClassifierCallback> loadOCRBeamSearchClas ...@@ -656,402 +540,6 @@ CV_EXPORTS_W Ptr<OCRBeamSearchDecoder::ClassifierCallback> loadOCRBeamSearchClas
//! @} //! @}
}
//Classifiers should provide diferent backends }
// Backend selectors for the CNN based classifiers declared in this header.
// They are passed as the `backEnd` argument of factories such as DeepCNN::create().
// No explicit values are assigned, so the enumerators are numbered 0..3 in declaration order.
enum{
OCR_HOLISTIC_BACKEND_NONE, // no backend
OCR_HOLISTIC_BACKEND_DNN, // backend based on the opencv_dnn module
OCR_HOLISTIC_BACKEND_CAFFE, // backend based on Caffe
OCR_HOLISTIC_BACKEND_DEFAULT // stands for the default backend, chosen based on the environment
};
// Forward declaration: ImagePreprocessor names TextImageClassifier as a friend below.
class TextImageClassifier;
/**
* @brief Abstract interface for image preprocessing functors.
*
* Concrete preprocessors implement the protected preprocess_() hook. preprocess() and set_mean()
* are the public entry points (their definitions are not part of this header), and the static
* create*() factories return ready-made preprocessors.
*/
class CV_EXPORTS_W ImagePreprocessor{
protected:
// Preprocessing hook every concrete preprocessor has to implement (pure virtual).
virtual void preprocess_(const Mat& input,Mat& output,Size outputSize,int outputChannels)=0;
// Mean-setting hook; this default implementation ignores its argument and does nothing.
virtual void set_mean_(Mat){}
public:
virtual ~ImagePreprocessor(){}
/** @brief Provides public access to the preprocessing with respect to a specific classifier.
*
* This method's main use would be to use the preprocessor without feeding it to a classifier.
* Determining the exact behavior of a preprocessor is the main motivation for this.
*
* @param input an image without any constraints
*
* @param output in most cases an image of fixed depth and size, and whitened
*
* @param sz the size to which the image would be resized if the preprocessor resizes inputs
*
* @param outputChannels the number of channels for the output image
*/
CV_WRAP void preprocess(InputArray input,OutputArray output,Size sz,int outputChannels);
/** @brief Provides public access to set the mean of the input images.
*
* The mean can be a Mat either of the same size as the image or holding one value per color
* channel. A preprocessor can be created without the mean (the preprocessor will calculate the
* mean for every image in that case).
*
* @param mean the mean which will be subtracted from the images
*
*/
CV_WRAP void set_mean(Mat mean);
/** @brief Creates a functor that only resizes and changes the channels of the input
* without further processing.
*
* @return shared pointer to the generated preprocessor
*/
CV_WRAP static Ptr<ImagePreprocessor> createResizer();
/** @brief Creates a preprocessor that, judging by its name, standardizes the input image.
*
* NOTE(review): the implementation is not visible in this header; confirm the exact behavior.
*
* @param sigma presumably the deviation used for the standardization (the DeepCNN::createDictNet
* documentation mentions a preprocessor with a deviation of 113) -- TODO confirm
*
* @return shared pointer to the generated preprocessor
*/
CV_WRAP static Ptr<ImagePreprocessor> createImageStandarizer(double sigma);
/** @brief Creates a preprocessor that, judging by its name, subtracts a mean image from the input.
*
* @param meanImg presumably the mean image to subtract -- TODO confirm against the implementation
*
* @return shared pointer to the generated preprocessor
*/
CV_WRAP static Ptr<ImagePreprocessor> createImageMeanSubtractor(InputArray meanImg);
/** @brief Creates a functor configured with the given parameters; the parameters can be changed
* afterwards by the corresponding set functions.
*
* @param rawval NOTE(review): meaning is not documented in this header (defaults to 1.0) -- confirm
*
* @param channel_order presumably the channel order of the images, defaults to "BGR" -- TODO confirm
*
* @return shared pointer to the generated preprocessor
*/
CV_WRAP static Ptr<ImagePreprocessor>createImageCustomPreprocessor(double rawval=1.0,String channel_order="BGR");
// Grants TextImageClassifier access to the protected hooks declared above.
friend class TextImageClassifier;
};
/** @brief Abstract class that implements the classification of text images.
*
* The interface is generic enough to describe any image classifier, and it allows
* to take advantage of computing in batches. While word classifiers are the default
* networks, any image classifier should work.
*
*/
class CV_EXPORTS_W TextImageClassifier
{
protected:
// Size of a single input sample; returned by getInputSize().
Size inputGeometry_;
// Shape of the classifier output; see getOutputGeometry().
Size outputGeometry_;
// Number of channels of a single input sample; returned by getInputChannelCount().
int channelCount_;
// Preprocessor associated with this classifier; see setPreprocessor() / getPreprocessor().
Ptr<ImagePreprocessor> preprocessor_;
/** @brief All image preprocessing is handled here, including whitening etc.
*
* @param input the image to be preprocessed for the classifier. If the depth
* is CV_8U values should be in [0,255], otherwise values are assumed to be in [0,1]
*
* @param output reference to the image to be fed to the classifier, the preprocessor will
* resize the image to the appropriate size and convert it to the appropriate depth
*
* The method preprocess should never be used externally, it is up to the classify and
* classifyBatch methods to employ it.
*/
virtual void preprocess(const Mat& input,Mat& output);
public:
virtual ~TextImageClassifier() {}
/** @brief Sets the preprocessor of this classifier.
*
* NOTE(review): defined outside this header; presumably stores ptr in preprocessor_ -- confirm.
*
* @param ptr the preprocessor to use
*/
CV_WRAP virtual void setPreprocessor(Ptr<ImagePreprocessor> ptr);
/** @brief Returns the preprocessor of this classifier.
*
* NOTE(review): defined outside this header; presumably returns preprocessor_ -- confirm.
*/
CV_WRAP Ptr<ImagePreprocessor> getPreprocessor();
/** @brief Produces a class confidence row-vector given an image.
*/
CV_WRAP virtual void classify(InputArray image, OutputArray classProbabilities) = 0;
/** @brief Produces a matrix containing class confidence row-vectors given a collection of images.
*/
CV_WRAP virtual void classifyBatch(InputArrayOfArrays image, OutputArray classProbabilities) = 0;
/** @brief Simple getter method returning the number of channels each input sample has.
*/
CV_WRAP virtual int getInputChannelCount(){return this->channelCount_;}
/** @brief Simple getter method returning the size of the input sample.
*/
CV_WRAP virtual Size getInputSize(){return this->inputGeometry_;}
/** @brief Simple getter method returning the size of the output row-vector.
*/
CV_WRAP virtual int getOutputSize()=0;
/** @brief Simple getter method returning the shape of the output from caffe.
*/
CV_WRAP virtual Size getOutputGeometry()=0;
/** @brief Simple getter method returning the size of the minibatches for this classifier.
* If not applicable this method should return 1.
*/
CV_WRAP virtual int getMinibatchSize()=0;
// Grants ImagePreprocessor access to the protected members of this class.
friend class ImagePreprocessor;
};
class CV_EXPORTS_W DeepCNN:public TextImageClassifier
{
/** @brief Class that uses a pretrained caffe model for word classification.
*
* This network is described in detail in:
* Max Jaderberg et al.: Reading Text in the Wild with Convolutional Neural Networks, IJCV 2015
* http://arxiv.org/abs/1412.1842
*
* Instances are obtained through the static create() / createDictNet() factories below.
*/
public:
virtual ~DeepCNN() {};
/** @brief Constructs a DeepCNN object from a caffe pretrained model.
*
* @param archFilename is the path to the prototxt file containing the deployment model architecture description.
*
* @param weightsFilename is the path to the pretrained weights of the model in binary form. This file can be
* very large, up to 2GB.
*
* @param preprocessor is a pointer to the instance of an ImagePreprocessor implementing the preprocess_ protected method.
*
* @param minibatchSz the maximum number of samples that can be processed in parallel. In practice this parameter
* has an effect only when computing on the GPU and should be set with respect to the memory available on the GPU.
*
* @param backEnd integer parameter selecting the computation framework. For now OCR_HOLISTIC_BACKEND_CAFFE is
* the only option.
*/
CV_WRAP static Ptr<DeepCNN> create(String archFilename,String weightsFilename,Ptr<ImagePreprocessor> preprocessor,int minibatchSz=100,int backEnd=OCR_HOLISTIC_BACKEND_DEFAULT);
/** @brief Constructs a DeepCNN intended to be used for word spotting.
*
* This method loads a pretrained classifier and couples it with a preprocessor that standardizes pixels with a
* deviation of 113. The architecture file can be downloaded from:
* <http://nicolaou.homouniversalis.org/assets/vgg_text/dictnet_vgg_deploy.prototxt>
* While the weights can be downloaded from:
* <http://nicolaou.homouniversalis.org/assets/vgg_text/dictnet_vgg.caffemodel>
* The words assigned to the network outputs are available at:
* <http://nicolaou.homouniversalis.org/assets/vgg_text/dictnet_vgg_labels.txt>
*
* @param archFilename is the path to the prototxt file containing the deployment model architecture description.
* When employing OCR_HOLISTIC_BACKEND_CAFFE this is the path to the deploy ".prototxt".
*
* @param weightsFilename is the path to the pretrained weights of the model. When employing
* OCR_HOLISTIC_BACKEND_CAFFE this is the path to the ".caffemodel" file. This file can be very large, the
* pretrained DictNet uses 2GB.
*
* @param backEnd integer parameter selecting the computation framework. For now OCR_HOLISTIC_BACKEND_CAFFE is
* the only option.
*/
CV_WRAP static Ptr<DeepCNN> createDictNet(String archFilename,String weightsFilename,int backEnd=OCR_HOLISTIC_BACKEND_DEFAULT);
};
// Runtime queries and switches for the CNN backends this module can be built with.
namespace cnn_config{
/** @brief Runtime backend information.
*
* This function finds the status of the backends compiled with this module.
*
* @return a list of backends (caffe, opencv-dnn etc.)
* */
CV_EXPORTS_W std::vector<std::string> getAvailableBackends();
namespace caffe_backend{
/** @brief Queries Caffe for the computation device being used.
*
* Whether the GPU or the CPU is used can only be controlled globally in Caffe.
* This function queries the current state of caffe.
* If the module is built without caffe, this method throws an exception.
*
* @return true if caffe is computing on the GPU, false if caffe is computing on the CPU
*/
CV_EXPORTS_W bool getCaffeGpuMode();
/** @brief Sets the computation device being used by Caffe.
*
* Whether the GPU or the CPU is used can only be controlled globally in Caffe.
* This function changes that global state.
* If the module is built without caffe, this method throws an exception.
*
* @param useGpu set to true for caffe to be computing on the GPU, false if caffe is
* computing on the CPU
*/
CV_EXPORTS_W void setCaffeGpuMode(bool useGpu);
/** @brief Provides runtime information on whether Caffe support was compiled in.
*
* The text module API is the same regardless of whether Caffe was available or not
* during compilation. When methods that require Caffe are invoked while Caffe support
* is not compiled in, exceptions are thrown. This method allows to test at runtime whether
* the text module was built with caffe.
*
* @return true if Caffe support for the text module was provided during compilation,
* false if Caffe was unavailable.
*/
CV_EXPORTS_W bool getCaffeAvailable();
}//caffe_backend
namespace dnn_backend {
/** @brief Provides runtime information on whether the DNN module was compiled in.
*
* The text module API is the same regardless of whether the DNN module was available or not
* during compilation. When methods that require a backend are invoked while no backend support
* is compiled in, exceptions are thrown. This method allows to test at runtime whether the
* text module was built with dnn_backend.
*
* @return true if opencv_dnn support for the text module was provided during compilation,
* false if opencv_dnn was unavailable.
*/
CV_EXPORTS_W bool getDNNAvailable();
}//dnn_backend
}//cnn_config
/** @brief OCRHolisticWordRecognizer class provides the functionality of segmented wordspotting.
* Given a predefined vocabulary, a TextImageClassifier is employed to select the most probable
* word given an input image.
*
* This class implements the logic of providing transcriptions given a vocabulary and an image
* classifier. The classifier has to be any TextImageClassifier but the classifier for which this
* class was built is the DictNet. In order to load it the following files should be downloaded:
* <http://nicolaou.homouniversalis.org/assets/vgg_text/dictnet_vgg_deploy.prototxt>
* <http://nicolaou.homouniversalis.org/assets/vgg_text/dictnet_vgg.caffemodel>
* <http://nicolaou.homouniversalis.org/assets/vgg_text/dictnet_vgg_labels.txt>
*/
class CV_EXPORTS_W OCRHolisticWordRecognizer : public BaseOCR
{
public:
/** @brief Overload of run() that takes no mask argument.
See the overload taking a mask below for the description of the remaining parameters.
*/
virtual void run(Mat& image, std::string& output_text, std::vector<Rect>* component_rects=NULL,
std::vector<std::string>* component_texts=NULL, std::vector<float>* component_confidences=NULL,
int component_level=OCR_LEVEL_WORD)=0;
/** @brief Recognize text using a segmentation based word-spotting/classifier cnn.
Takes image on input and returns recognized text in the output_text parameter. Optionally
provides also the Rects for individual text elements found (e.g. words), and the list of those
text elements with their confidence values.
@param image Input image CV_8UC1 or CV_8UC3
@param mask is totally ignored and is only available for compatibility reasons
@param output_text Output text of the word spotting, always one that exists in the dictionary.
@param component_rects Not applicable for word spotting, can be NULL; if not, a single element will
be put in the vector.
@param component_texts Not applicable for word spotting, can be NULL; if not, a single element will
be put in the vector.
@param component_confidences Not applicable for word spotting, can be NULL; if not, a single element will
be put in the vector.
@param component_level must be OCR_LEVEL_WORD.
*/
virtual void run(Mat& image, Mat& mask, std::string& output_text, std::vector<Rect>* component_rects=NULL,
std::vector<std::string>* component_texts=NULL, std::vector<float>* component_confidences=NULL,
int component_level=OCR_LEVEL_WORD)=0;
/**
@brief Method that provides a quick and simple interface to a single word image classification
@param inputImage an image expected to be a CV_8UC1 or CV_8UC3 of any size, assumed to contain a single word
@param transcription an opencv string that will store the detected word transcription
@param confidence a double that will be updated with the confidence the classifier has for the selected word
*/
CV_WRAP virtual void recogniseImage(InputArray inputImage,CV_OUT String& transcription,CV_OUT double& confidence)=0;
/**
@brief Method that provides a quick and simple interface to a multiple word image classification, taking advantage
of the classifier's parallel capabilities.
@param inputImageList a list of images expected to be CV_8UC1 or CV_8UC3; each image can be of any size and is assumed
to contain a single word.
@param transcriptions a vector of opencv strings that will store the detected word transcriptions, one for each
input image
@param confidences a vector of double that will be updated with the confidence the classifier has for each of the
selected words.
*/
CV_WRAP virtual void recogniseImageBatch(InputArrayOfArrays inputImageList,CV_OUT std::vector<String>& transcriptions,CV_OUT std::vector<double>& confidences)=0;
/**
@brief simple getter for the vocabulary employed
*/
CV_WRAP virtual const std::vector<String>& getVocabulary()=0;
/** @brief simple getter for the classifier employed
*/
CV_WRAP virtual Ptr<TextImageClassifier> getClassifier()=0;
/** @brief Creates an instance of the OCRHolisticWordRecognizer class.
@param classifierPtr an instance of TextImageClassifier, normally a DeepCNN instance
@param vocabularyFilename the relative or absolute path to the file containing all words in the vocabulary. Each text line
in the file is assumed to be a single word. The number of words in the vocabulary must be exactly the same as the outputSize
of the classifier.
*/
CV_WRAP static Ptr<OCRHolisticWordRecognizer> create(Ptr<TextImageClassifier> classifierPtr,String vocabularyFilename);
/** @brief Creates an instance of the OCRHolisticWordRecognizer class and implicitly also a DeepCNN classifier.
@param modelArchFilename the relative or absolute path to the prototxt file describing the classifier's architecture.
@param modelWeightsFilename the relative or absolute path to the file containing the pretrained weights of the model in caffe-binary form.
@param vocabularyFilename the relative or absolute path to the file containing all words in the vocabulary. Each text line
in the file is assumed to be a single word. The number of words in the vocabulary must be exactly the same as the outputSize
of the classifier.
*/
CV_WRAP static Ptr<OCRHolisticWordRecognizer> create(String modelArchFilename, String modelWeightsFilename, String vocabularyFilename);
/** @brief Creates an instance of the OCRHolisticWordRecognizer class from a classifier and an in-memory vocabulary.
*
* @param classifierPtr an instance of TextImageClassifier
*
* @param vocabulary the list of words in the vocabulary; presumably its size must match the outputSize
* of the classifier, as for the file based overloads -- TODO confirm
*/
CV_WRAP static Ptr<OCRHolisticWordRecognizer> create(Ptr<TextImageClassifier> classifierPtr,const std::vector<String>& vocabulary);
/** @brief Creates an instance of the OCRHolisticWordRecognizer class from model files and an in-memory vocabulary.
*
* @param modelArchFilename path to the file describing the classifier's architecture
*
* @param modelWeightsFilename path to the file containing the pretrained weights of the model
*
* @param vocabulary the list of words in the vocabulary; presumably its size must match the outputSize
* of the classifier, as for the file based overloads -- TODO confirm
*/
CV_WRAP static Ptr<OCRHolisticWordRecognizer> create (String modelArchFilename, String modelWeightsFilename, const std::vector<String>& vocabulary);
};
}//namespace text
}//namespace cv
#endif // _OPENCV_TEXT_OCR_HPP_ #endif // _OPENCV_TEXT_OCR_HPP_
/*M////////////////////////////////////////////////////////////////////////////////////////// // This file is part of OpenCV project.
// // It is subject to the license terms in the LICENSE file found in the top-level directory
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. // of this distribution and at http://opencv.org/license.html.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#ifndef __OPENCV_TEXT_TEXTDETECTOR_HPP__ #ifndef __OPENCV_TEXT_TEXTDETECTOR_HPP__
#define __OPENCV_TEXT_TEXTDETECTOR_HPP__ #define __OPENCV_TEXT_TEXTDETECTOR_HPP__
#include <vector>
#include <string>
#include <iostream>
#include <sstream>
#include"ocr.hpp" #include"ocr.hpp"
namespace cv namespace cv
{ {
namespace text namespace text
...@@ -59,208 +15,44 @@ namespace text ...@@ -59,208 +15,44 @@ namespace text
//! @addtogroup text_detect //! @addtogroup text_detect
//! @{ //! @{
/** @brief An abstract class providing interface for text detection algorithms
// Base class BaseDetector declares a common API that would be used in a typical text
// detection scenario. Both run() overloads are pure virtual and differ only in the extra
// `mask` argument of the second one.
class CV_EXPORTS_W BaseDetector
{
public:
virtual ~BaseDetector() {};
// Runs the detector on `image`. The two vector pointers are optional outputs (NULL by default).
// NOTE(review): presumably they receive the detected boxes and their confidences -- confirm
// against the implementing classes.
virtual void run(Mat& image,
std::vector<Rect>* component_rects=NULL,
std::vector<float>* component_confidences=NULL,
int component_level=0) = 0;
// Same as above with an additional `mask` image; how the mask is used is up to the
// implementing class.
virtual void run(Mat& image, Mat& mask,
std::vector<Rect>* component_rects=NULL,
std::vector<float>* component_confidences=NULL,
int component_level=0) = 0;
};
/** A virtual class for different models of text detection (including CNN based deep models)
*/ */
class CV_EXPORTS_W TextDetector
class CV_EXPORTS_W TextRegionDetector
{ {
protected:
/** Stores input and output size
*/
//netGeometry inputGeometry_;
//netGeometry outputGeometry_;
Size inputGeometry_;
Size outputGeometry_;
int inputChannelCount_;
int outputChannelCount_;
public: public:
virtual ~TextRegionDetector() {} /**
@brief Method that provides a quick and simple interface to detect text inside an image
/** @brief produces a list of Bounding boxes and an estimate of text-ness confidence of Bounding Boxes
*/
CV_WRAP virtual void detect(InputArray image, OutputArray bboxProb ) = 0;
/** @brief simple getter method returning the size (height, width) of the input sample
*/
CV_WRAP virtual Size getInputGeometry(){return this->inputGeometry_;}
/** @brief simple getter method returning the shape of the oputput @param inputImage an image to process
* Any text detector should output a number of text regions alongwith a score of text-ness @param Bbox a vector of Rect that will store the detected word bounding box
* From the shape it can be inferred the number of text regions and number of returned value @param confidence a vector of float that will be updated with the confidence the classifier has for the selected bounding box
* for each region
*/ */
CV_WRAP virtual Size getOutputGeometry(){return this->outputGeometry_;} virtual void textDetectInImage(InputArray inputImage, CV_OUT std::vector<Rect>& Bbox, CV_OUT std::vector<float>& confidence) = 0;
virtual ~TextDetector() {}
}; };
/** Generic structure of Deep CNN based Text Detectors /** @brief TextDetectorCNN class provides the functionallity of text bounding box detection.
* */ * A TextDetectorCNN is employed to find bounding boxes of text words given an input image.
class CV_EXPORTS_W DeepCNNTextDetector : public TextRegionDetector
{
/** @brief Class that uses a pretrained caffe model for text detection.
* Any text detection should
* This network is described in detail in:
* Minghui Liao et al.: TextBoxes: A Fast Text Detector with a Single Deep Neural Network
* https://arxiv.org/abs/1611.06779
*/
protected:
/** all deep CNN based text detectors have a preprocessor (normally)
*/
Ptr<ImagePreprocessor> preprocessor_;
/** @brief all image preprocessing is handled here including whitening etc.
*
* @param input the image to be preprocessed for the classifier. If the depth
* is CV_U8 values should be in [0,255] otherwise values are assumed to be in [0,1]
*
* @param output reference to the image to be fed to the classifier, the preprocessor will
* resize the image to the apropriate size and convert it to the apropriate depth\
*
* The method preprocess should never be used externally, it is up to classify and classifyBatch
* methods to employ it.
*/
virtual void preprocess(const Mat& input,Mat& output);
public:
virtual ~DeepCNNTextDetector() {};
/** @brief Constructs a DeepCNNTextDetector object from a caffe pretrained model
*
* @param archFilename is the path to the prototxt file containing the deployment model architecture description.
*
* @param weightsFilename is the path to the pretrained weights of the model in binary fdorm.
*
* @param preprocessor is a pointer to the instance of a ImagePreprocessor implementing the preprocess_ protecteed method;
*
* @param minibatchSz the maximum number of samples that can processed in parallel. In practice this parameter
* has an effect only when computing in the GPU and should be set with respect to the memory available in the GPU.
*
* @param backEnd integer parameter selecting the coputation framework. For now OCR_HOLISTIC_BACKEND_CAFFE is
* the only option
*/ */
CV_WRAP static Ptr<DeepCNNTextDetector> create(String archFilename,String weightsFilename,Ptr<ImagePreprocessor> preprocessor,int minibatchSz=100,int backEnd=OCR_HOLISTIC_BACKEND_DEFAULT); class CV_EXPORTS_W TextDetectorCNN : public TextDetector
/** @brief Constructs a DeepCNNTextDetector intended to be used for text area detection.
*
* This method loads a pretrained classifier and couples with a preprocessor that preprocess the image with mean subtraction of ()
* The architecture and models weights can be downloaded from:
* https://github.com/sghoshcvc/TextBox-Models.git (size is around 100 MB)
* @param archFilename is the path to the prototxt file containing the deployment model architecture description.
* When employing OCR_HOLISTIC_BACKEND_CAFFE this is the path to the deploy ".prototxt".
*
* @param weightsFilename is the path to the pretrained weights of the model. When employing
* OCR_HOLISTIC_BACKEND_CAFFE this is the path to the ".caffemodel" file.
*
* @param backEnd integer parameter selecting the coputation framework. For now OCR_HOLISTIC_BACKEND_CAFFE is
* the only option
*/
CV_WRAP static Ptr<DeepCNNTextDetector> createTextBoxNet(String archFilename,String weightsFilename,int backEnd=OCR_HOLISTIC_BACKEND_DEFAULT);
friend class ImagePreprocessor;
};
/** @brief textDetector class provides the functionallity of text bounding box detection.
* A TextRegionDetector is employed to find bounding boxes of text
* words given an input image.
*
* This class implements the logic of providing text bounding boxes in a vector of rects given an TextRegionDetector
* The TextRegionDetector can be any text detector
*
*/
class CV_EXPORTS_W textDetector : public BaseDetector
{ {
public: public:
virtual void run(Mat& image, std::vector<Rect>* component_rects=NULL,
std::vector<float>* component_confidences=NULL,
int component_level=OCR_LEVEL_WORD)=0;
/** @brief Detects text with a CNN; the input is one image with (multiple) occurrences of text.
Takes image on input and returns recognized text in the output_text parameter. Optionally
provides also the Rects for individual text elements found (e.g. words), and the list of those
text elements with their confidence values.
@param image Input image CV_8UC1 or CV_8UC3
@param mask is totally ignored and is only available for compatibility reasons
@param component_rects a vector of Rects, each rect is one text bounding box.
@param component_confidences A vector of float returns confidence of text bounding boxes
@param component_level must be OCR_LEVEL_WORD.
*/
virtual void run(Mat& image, Mat& mask, std::vector<Rect>* component_rects=NULL,
std::vector<float>* component_confidences=NULL,
int component_level=OCR_LEVEL_WORD)=0;
/** /**
@brief Method that provides a quick and simple interface to detect text inside an image @overload
@param inputImage an image expected to be a CV_U8C3 of any size @param inputImage an image expected to be a CV_U8C3 of any size
@param Bbox a vector of Rect that will store the detected word bounding box @param Bbox a vector of Rect that will store the detected word bounding box
@param confidence a vector of float that will be updated with the confidence the classifier has for the selected bounding box @param confidence a vector of float that will be updated with the confidence the classifier has for the selected bounding box
*/ */
CV_WRAP virtual void textDetectInImage(InputArray inputImage,CV_OUT std::vector<Rect>& Bbox,CV_OUT std::vector<float>& confidence)=0; CV_WRAP virtual void textDetectInImage(InputArray inputImage, CV_OUT std::vector<Rect>& Bbox, CV_OUT std::vector<float>& confidence) = 0;
/** @brief simple getter for the preprocessing functor
*/
CV_WRAP virtual Ptr<TextRegionDetector> getClassifier()=0;
/** @brief Creates an instance of the textDetector class.
@param classifierPtr an instance of TextImageClassifier, normally a DeepCNN instance
*/
CV_WRAP static Ptr<textDetector> create(Ptr<TextRegionDetector> classifierPtr);
/** @brief Creates an instance of the textDetector class and implicitly also a DeepCNN classifier. /** @brief Creates an instance of the textDetector class and implicitly also a DeepCNN classifier.
@param modelArchFilename the relative or absolute path to the prototxt file describing the classifiers architecture. @param modelArchFilename the relative or absolute path to the prototxt file describing the classifiers architecture.
@param modelWeightsFilename the relative or absolute path to the file containing the pretrained weights of the model in caffe-binary form. @param modelWeightsFilename the relative or absolute path to the file containing the pretrained weights of the model in caffe-binary form.
@param detectMultiscale if true, multiple scales of the input image will be used as network input
*/ */
CV_WRAP static Ptr<textDetector> create(String modelArchFilename, String modelWeightsFilename); CV_WRAP static Ptr<TextDetectorCNN> create(const String& modelArchFilename, const String& modelWeightsFilename, bool detectMultiscale = false);
}; };
//! @} //! @}
......
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
"""
Created on Wed Jul 19 17:54:00 2017
@author: sgnosh
"""
#!/usr/bin/python #!/usr/bin/python
import sys import sys
import os import os
import cv2 import cv2
import numpy as np import numpy as np
print('\nDeeptextdetection.py') def main():
print(' A demo script of text box alogorithm of the paper:') print('\nDeeptextdetection.py')
print(' * Minghui Liao et al.: TextBoxes: A Fast Text Detector with a Single Deep Neural Network https://arxiv.org/abs/1611.06779\n') print(' A demo script of text box alogorithm of the paper:')
print(' * Minghui Liao et al.: TextBoxes: A Fast Text Detector with a Single Deep Neural Network https://arxiv.org/abs/1611.06779\n')
if (len(sys.argv) < 2):
if (len(sys.argv) < 2):
print(' (ERROR) You must call this script with an argument (path_to_image_to_be_processed)\n') print(' (ERROR) You must call this script with an argument (path_to_image_to_be_processed)\n')
quit() quit()
#if not cv2.text.cnn_config.caffe_backend.getCaffeAvailable():
# print"The text module was compiled without Caffe which is the only available DeepCNN backend.\nAborting!\n" if not os.path.isfile('textbox.caffemodel') or not os.path.isfile('textbox_deploy.prototxt'):
#
# quit()
# check model and architecture file existance
if not os.path.isfile('textbox.caffemodel') or not os.path.isfile('textbox_deploy.prototxt'):
print " Model files not found in current directory. Aborting" print " Model files not found in current directory. Aborting"
print " Model files should be downloaded from https://github.com/sghoshcvc/TextBox-Models" print " Model files should be downloaded from https://github.com/sghoshcvc/TextBox-Models"
quit() quit()
cv2.text.cnn_config.caffe_backend.setCaffeGpuMode(True);
pathname = os.path.dirname(sys.argv[0])
img = cv2.imread(str(sys.argv[1]))
textSpotter=cv2.text.textDetector_create(
"textbox_deploy.prototxt","textbox.caffemodel")
rects,outProbs = textSpotter.textDetectInImage(img);
# for visualization
vis = img.copy()
# Threshold to select rectangles : All rectangles for which outProbs is more than this threshold will be shown
thres = 0.6
img = cv2.imread(str(sys.argv[1]))
textSpotter = cv2.text.TextDetectorCNN_create("textbox_deploy.prototxt","textbox.caffemodel")
rects, outProbs = textSpotter.textDetectInImage(img);
vis = img.copy()
thres = 0.6
#Visualization for r in range(np.shape(rects)[0]):
for r in range(0,np.shape(rects)[0]): if outProbs[r] > thres:
if outProbs[r] >thres:
rect = rects[r] rect = rects[r]
cv2.rectangle(vis, (rect[0],rect[1]), (rect[0]+rect[2],rect[1]+rect[3]), (255, 0, 0), 2) cv2.rectangle(vis, (rect[0],rect[1]), (rect[0] + rect[2], rect[1] + rect[3]), (255, 0, 0), 2)
# cv2.rectangle(vis, (rect[0],rect[1]), (rect[0]+rect[2],rect[1]+rect[3]), (255, 255, 255), 1)
cv2.imshow("Text detection result", vis)
cv2.waitKey()
#Visualization if __name__ == "__main__":
cv2.imshow("Text detection result", vis) main()
cv2.waitKey(0)
\ No newline at end of file
/* #include <opencv2/text.hpp>
* dictnet_demo.cpp #include <opencv2/highgui.hpp>
* #include <opencv2/imgproc.hpp>
* Demonstrates simple use of the holistic word classifier in C++
*
* Created on: June 26, 2016
* Author: Anguelos Nicolaou <anguelos.nicolaou AT gmail.com>
*/
#include "opencv2/text.hpp"
#include "opencv2/highgui.hpp"
#include "opencv2/imgproc.hpp"
#include <sstream> #include <sstream>
#include <vector>
#include <iostream> #include <iostream>
#include <iomanip>
#include <fstream> #include <fstream>
void textbox_draw(cv::Mat &src, std::vector<cv::Rect> &groups,std::vector<float> &probs,std::vector<cv::String> wordList,float thres); using namespace cv;
inline std::string getHelpStr(std::string progFname){
std::stringstream out;
out << " Demo of text detection CNN for text detection." << std::endl;
out << " Max Jaderberg et al.: Reading Text in the Wild with Convolutional Neural Networks, IJCV 2015"<<std::endl<<std::endl;
out << " Usage: " << progFname << " <output_file> <input_image>" << std::endl;
out << " Caffe Model files (textbox.caffemodel, textbox_deploy.prototxt)"<<std::endl;
out << " must be in the current directory." << std::endl << std::endl;
out << " Obtaining Caffe Model files in linux shell:"<<std::endl; namespace
out << " wget http://nicolaou.homouniversalis.org/assets/vgg_text/dictnet_vgg.caffemodel"<<std::endl; {
out << " wget http://nicolaou.homouniversalis.org/assets/vgg_text/dictnet_vgg_deploy.prototxt"<<std::endl; std::string getHelpStr(std::string progFname)
out << " wget http://nicolaou.homouniversalis.org/assets/vgg_text/dictnet_vgg_labels.txt"<<std::endl<<std::endl; {
std::stringstream out;
out << " Demo of text detection CNN for text detection." << std::endl
<< " Max Jaderberg et al.: Reading Text in the Wild with Convolutional Neural Networks, IJCV 2015"<<std::endl<<std::endl
<< " Usage: " << progFname << " <output_file> <input_image>" << std::endl
<< " Caffe Model files (textbox.caffemodel, textbox_deploy.prototxt)"<<std::endl
<< " must be in the current directory." << std::endl
<< " These files can be downloaded from https://github.com/sghoshcvc/TextBox-Models.git" << std::endl;
return out.str(); return out.str();
} }
inline bool fileExists (std::string filename) { bool fileExists (std::string filename)
{
std::ifstream f(filename.c_str()); std::ifstream f(filename.c_str());
return f.good(); return f.good();
} }
void textbox_draw(cv::Mat &src, std::vector<cv::Rect> &groups,std::vector<float> &probs,std::vector<cv::String> wordList,float thres=0.6)
void textbox_draw(Mat src, std::vector<Rect>& groups, std::vector<float>& probs, float thres)
{ {
for (int i=0;i<(int)groups.size(); i++) for (size_t i = 0; i < groups.size(); i++)
{ {
if(probs[i]>thres) if(probs[i] > thres)
{ {
if (src.type() == CV_8UC3) if (src.type() == CV_8UC3)
{ {
cv::rectangle(src,groups.at(i).tl(),groups.at(i).br(),cv::Scalar( 0, 255, 255 ), 3, 8 ); rectangle(src, groups[i], Scalar( 0, 255, 255 ), 2, LINE_AA);
cv::putText(src, wordList[i],groups.at(i).tl() , cv::FONT_HERSHEY_PLAIN, 1, cv::Scalar( 0,0,255 )); String label = format("%.2f", probs[i]);
std::cout << "text box: " << groups[i] << " confidence: " << probs[i] << "\n";
putText(src, label, groups.at(i).tl(), FONT_HERSHEY_PLAIN, 1, Scalar( 0,0,255 ), 1, LINE_AA);
} }
else else
rectangle(src,groups.at(i).tl(),groups.at(i).br(),cv::Scalar( 255 ), 3, 8 ); rectangle(src, groups[i], Scalar( 255 ), 3, 8 );
} }
} }
} }
}
int main(int argc, const char * argv[]){ int main(int argc, const char * argv[])
if(!cv::text::cnn_config::caffe_backend::getCaffeAvailable()){ {
std::cout<<"The text module was compiled without Caffe which is the only available DeepCNN backend.\nAborting!\n"; if (argc < 2)
//exit(1); {
} std::cout << getHelpStr(argv[0]);
std::vector<std::string> backends=cv::text::cnn_config::getAvailableBackends(); std::cout << "Insufiecient parameters. Aborting!" << std::endl;
std::cout << "The Following backends are available" << "\n";
for (int i=0;i<backends.size();i++)
std::cout << backends[i] << "\n";
// printf("%s",x);
//set to true if you have a GPU with more than 3GB
if(cv::text::cnn_config::caffe_backend::getCaffeAvailable())
cv::text::cnn_config::caffe_backend::setCaffeGpuMode(true);
if (argc < 3){
std::cout<<getHelpStr(argv[0]);
std::cout<<"Insufiecient parameters. Aborting!"<<std::endl;
exit(1); exit(1);
} }
if (!fileExists("textbox.caffemodel") || if (!fileExists("textbox.caffemodel") ||
!fileExists("textbox_deploy.prototxt")){ !fileExists("textbox_deploy.prototxt"))
// !fileExists("dictnet_vgg_labels.txt")) {
std::cout<<getHelpStr(argv[0]);
std::cout<<"Model files not found in the current directory. Aborting!"<<std::endl;
exit(1);
}
if (fileExists(argv[1])){
std::cout<<getHelpStr(argv[0]); std::cout<<getHelpStr(argv[0]);
std::cout<<"Output file must not exist. Aborting!"<<std::endl; std::cout << "Model files not found in the current directory. Aborting!" << std::endl;
exit(1); exit(1);
} }
cv::Mat image; Mat image = imread(String(argv[1]), IMREAD_COLOR);
image = cv::imread(cv::String(argv[2]));
std::cout<<"Starting Text Box Demo"<<std::endl; std::cout << "Starting Text Box Demo" << std::endl;
cv::Ptr<cv::text::textDetector> textSpotter=cv::text::textDetector::create( Ptr<text::TextDetectorCNN> textSpotter =
"textbox_deploy.prototxt","textbox.caffemodel"); text::TextDetectorCNN::create("textbox_deploy.prototxt","textbox.caffemodel", false);
//cv::Ptr<cv::text::textDetector> wordSpotter= std::vector<Rect> bbox;
// cv::text::textDetector::create(cnn);
std::cout<<"Created Text Spotter with text Boxes";
std::vector<cv::Rect> bbox;
std::vector<float> outProbabillities; std::vector<float> outProbabillities;
textSpotter->textDetectInImage(image,bbox,outProbabillities); textSpotter->textDetectInImage(image, bbox, outProbabillities);
// textbox_draw(image, bbox,outProbabillities);
float thres =0.6f;
std::vector<cv::Mat> imageList;
for(int imageIdx=0;imageIdx<(int)bbox.size();imageIdx++){
if(outProbabillities[imageIdx]>thres){
imageList.push_back(image(bbox.at(imageIdx)));
}
}
// call dict net here for all detected parts
cv::Ptr<cv::text::DeepCNN> cnn=cv::text::DeepCNN::createDictNet(
"dictnet_vgg_deploy.prototxt","dictnet_vgg.caffemodel",cv::text::OCR_HOLISTIC_BACKEND_DNN);
cv::Ptr<cv::text::OCRHolisticWordRecognizer> wordSpotter=
cv::text::OCRHolisticWordRecognizer::create(cnn,"dictnet_vgg_labels.txt");
std::vector<cv::String> wordList;
std::vector<double> wordProbabillities;
wordSpotter->recogniseImageBatch(imageList,wordList,wordProbabillities);
// write the output in file
std::ofstream out;
out.open(argv[1]);
for (int i=0;i<(int)wordList.size(); i++)
{
cv::Point tl_ = bbox.at(i).tl();
cv::Point br_ = bbox.at(i).br();
out<<argv[2]<<","<<tl_.x<<","<<tl_.y<<","<<","<<br_.x<<","<<br_.y<<","<<wordList[i]<<std::endl;
}
out.close();
textbox_draw(image, bbox,outProbabillities,wordList);
textbox_draw(image, bbox, outProbabillities, 0.5f);
cv::imshow("TextBox Demo",image); imshow("TextBox Demo",image);
std::cout << "Done!" << std::endl << std::endl; std::cout << "Done!" << std::endl << std::endl;
std::cout << "Press any key to exit." << std::endl << std::endl; std::cout << "Press any key to exit." << std::endl << std::endl;
if ((cv::waitKey()&0xff) == ' ') waitKey();
return 0; return 0;
} }
#include "precomp.hpp"
#include "opencv2/imgproc.hpp"
#include "opencv2/highgui.hpp"
#include "opencv2/core.hpp"
#include <iostream>
#include <fstream>
#include <sstream>
#include <queue>
#include <algorithm>
#include <iosfwd>
#include <memory>
#include <string>
#include <utility>
#include <vector>
namespace cv { namespace text {
//************************************************************************************
//****************** ImagePreprocessor *******************************************
//************************************************************************************
/** @brief Public wrapper around the implementation-specific preprocess_().
 *
 * Fetches the input as a Mat, lets preprocess_() write into a temporary Mat,
 * and copies that temporary into @p output.
 *
 * @param input          image to preprocess.
 * @param output         receives a copy of the preprocessed image.
 * @param sz             target size forwarded unchanged to preprocess_().
 * @param outputChannels channel count forwarded unchanged to preprocess_().
 */
void ImagePreprocessor::preprocess(InputArray input, OutputArray output, Size sz, int outputChannels)
{
    Mat source = input.getMat();
    Mat processed;
    preprocess_(source, processed, sz, outputChannels);
    processed.copyTo(output);
}
/** @brief Forwards @p mean to the implementation-specific set_mean_() hook. */
void ImagePreprocessor::set_mean(Mat mean)
{
    set_mean_(mean);
}
class ResizerPreprocessor: public ImagePreprocessor{
protected:
void preprocess_(const Mat& input,Mat& output,Size outputSize,int outputChannels){
//TODO put all the logic of channel and depth conversions in ImageProcessor class
CV_Assert(outputChannels==1 || outputChannels==3);
CV_Assert(input.channels()==1 || input.channels()==3);
if(input.channels()!=outputChannels)
{
Mat tmpInput;
if(outputChannels==1){
cvtColor(input,tmpInput,COLOR_BGR2GRAY);
if(input.depth()==CV_8U)
{
tmpInput.convertTo(output,CV_32FC1,1/255.0);
}else
{//Assuming values are at the desired [0,1] range
tmpInput.convertTo(output, CV_32FC1);
}
}else
{
cvtColor(input,tmpInput,COLOR_GRAY2BGR);
if(input.depth()==CV_8U)
{
tmpInput.convertTo(output,CV_32FC3,1/255.0);
}else
{//Assuming values are at the desired [0,1] range
tmpInput.convertTo(output, CV_32FC3);
}
}
}else
{
if(input.channels()==1)
{
if(input.depth()==CV_8U)
{
input.convertTo(output, CV_32FC1,1/255.0);
}else
{//Assuming values are at the desired [0,1] range
input.convertTo(output, CV_32FC1);
}
}else
{
if(input.depth()==CV_8U){
input.convertTo(output, CV_32FC3,1/255.0);
}else
{//Assuming values are at the desired [0,1] range
input.convertTo(output, CV_32FC3);
}
}
}
if(outputSize.width!=0 && outputSize.height!=0)
{
resize(output,output,outputSize);
}
}
//void set_mean_(Mat m){}
public:
ResizerPreprocessor(){}
~ResizerPreprocessor(){}
};
class StandarizerPreprocessor: public ImagePreprocessor{
protected:
double sigma_;
//void set_mean_(Mat M){}
void preprocess_(const Mat& input,Mat& output,Size outputSize,int outputChannels){
//TODO put all the logic of channel and depth conversions in ImageProcessor class
CV_Assert(outputChannels==1 || outputChannels==3);
CV_Assert(input.channels()==1 || input.channels()==3);
if(input.channels()!=outputChannels)
{
Mat tmpInput;
if(outputChannels==1)
{
cvtColor(input,tmpInput,COLOR_BGR2GRAY);
if(input.depth()==CV_8U)
{
tmpInput.convertTo(output,CV_32FC1,1/255.0);
}else
{//Assuming values are at the desired [0,1] range
tmpInput.convertTo(output, CV_32FC1);
}
}else
{
cvtColor(input,tmpInput,COLOR_GRAY2BGR);
if(input.depth()==CV_8U)
{
tmpInput.convertTo(output,CV_32FC3,1/255.0);
}else
{//Assuming values are at the desired [0,1] range
tmpInput.convertTo(output, CV_32FC3);
}
}
}else
{
if(input.channels()==1)
{
if(input.depth()==CV_8U)
{
input.convertTo(output, CV_32FC1,1/255.0);
}else
{//Assuming values are at the desired [0,1] range
input.convertTo(output, CV_32FC1);
}
}else
{
if(input.depth()==CV_8U)
{
input.convertTo(output, CV_32FC3,1/255.0);
}else
{//Assuming values are at the desired [0,1] range
input.convertTo(output, CV_32FC3);
}
}
}
if(outputSize.width!=0 && outputSize.height!=0)
{
resize(output,output,outputSize);
}
Scalar mean,dev;
meanStdDev(output,mean,dev);
subtract(output,mean[0],output);
divide(output,(dev[0]/sigma_),output);
}
public:
StandarizerPreprocessor(double sigma):sigma_(sigma){}
~StandarizerPreprocessor(){}
};
class customPreprocessor:public ImagePreprocessor{
protected:
double rawval_;
Mat mean_;
String channel_order_;
void set_mean_(Mat imMean_){
imMean_.copyTo(this->mean_);
}
void set_raw_scale(int rawval){
rawval_ = rawval;
}
void set_channels(String channel_order){
channel_order_=channel_order;
}
void preprocess_(const Mat& input,Mat& output,Size outputSize,int outputChannels){
//TODO put all the logic of channel and depth conversions in ImageProcessor class
CV_Assert(outputChannels==1 || outputChannels==3);
CV_Assert(input.channels()==1 || input.channels()==3);
if(input.channels()!=outputChannels)
{
Mat tmpInput;
if(outputChannels==1)
{
cvtColor(input,tmpInput,COLOR_BGR2GRAY);
if(input.depth()==CV_8U)
{
if (rawval_ == 1)
tmpInput.convertTo(output,CV_32FC3,1/255.0);
else
tmpInput.convertTo(output,CV_32FC1);
}else
{//Assuming values are at the desired [0,1] range
if (rawval_ ==1)
tmpInput.convertTo(output, CV_32FC1);
else
tmpInput.convertTo(output, CV_32FC1,rawval_);
}
}else
{
cvtColor(input,tmpInput,COLOR_GRAY2BGR);
if(input.depth()==CV_8U)
{
if (rawval_ == 1)
tmpInput.convertTo(output,CV_32FC3,1/255.0);
else
tmpInput.convertTo(output,CV_32FC1);
}else
{//Assuming values are at the desired [0,1] range
if (rawval_ ==1)
tmpInput.convertTo(output, CV_32FC1);
else
tmpInput.convertTo(output, CV_32FC1,rawval_);
}
}
}else
{
if(input.channels()==1)
{
if(input.depth()==CV_8U)
{
if (rawval_ == 1)
input.convertTo(output,CV_32FC1,1/255.0);
else
input.convertTo(output,CV_32FC1);
}else
{//Assuming values are at the desired [0,1] range
if (rawval_ ==1)
input.convertTo(output, CV_32FC1);
else
input.convertTo(output, CV_32FC1,rawval_);
}
}else
{
if(input.depth()==CV_8U)
{
if (rawval_ == 1)
input.convertTo(output,CV_32FC3,1/255.0);
else
input.convertTo(output,CV_32FC3);
}else
{//Assuming values are at the desired [0,1] range
if (rawval_ ==1)
input.convertTo(output, CV_32FC3);
else
input.convertTo(output, CV_32FC3,rawval_);
}
}
}
if(outputSize.width!=0 && outputSize.height!=0)
{
resize(output,output,outputSize);
}
if (!this->mean_.empty()){
Scalar mean_s(this->mean_.at<uchar>(0,0),this->mean_.at<uchar>(0,1),this->mean_.at<uchar>(0,2));
subtract(output,mean_s,output);
}
else{
Scalar mean_s;
mean_s = mean(output);
subtract(output,mean_s,output);
}
}
public:
customPreprocessor( double rawval,String channel_order):rawval_(rawval),channel_order_(channel_order){}
~customPreprocessor(){}
};
class MeanSubtractorPreprocessor: public ImagePreprocessor{
protected:
Mat mean_;
//void set_mean_(Mat m){}
void preprocess_(const Mat& input,Mat& output,Size outputSize,int outputChannels){
//TODO put all the logic of channel and depth conversions in ImageProcessor class
CV_Assert(this->mean_.cols==outputSize.width && this->mean_.rows ==outputSize.height);
CV_Assert(outputChannels==1 || outputChannels==3);
CV_Assert(input.channels()==1 || input.channels()==3);
if(input.channels()!=outputChannels)
{
Mat tmpInput;
if(outputChannels==1)
{
cvtColor(input,tmpInput,COLOR_BGR2GRAY);
if(input.depth()==CV_8U)
{
tmpInput.convertTo(output,CV_32FC1,1/255.0);
}else
{//Assuming values are at the desired [0,1] range
tmpInput.convertTo(output, CV_32FC1);
}
}else
{
cvtColor(input,tmpInput,COLOR_GRAY2BGR);
if(input.depth()==CV_8U)
{
tmpInput.convertTo(output,CV_32FC3,1/255.0);
}else
{//Assuming values are at the desired [0,1] range
tmpInput.convertTo(output, CV_32FC3);
}
}
}else
{
if(input.channels()==1)
{
if(input.depth()==CV_8U)
{
input.convertTo(output, CV_32FC1,1/255.0);
}else
{//Assuming values are at the desired [0,1] range
input.convertTo(output, CV_32FC1);
}
}else
{
if(input.depth()==CV_8U)
{
input.convertTo(output, CV_32FC3,1/255.0);
}else
{//Assuming values are at the desired [0,1] range
input.convertTo(output, CV_32FC3);
}
}
}
if(outputSize.width!=0 && outputSize.height!=0)
{
resize(output,output,outputSize);
}
subtract(output,this->mean_,output);
}
public:
MeanSubtractorPreprocessor(Mat mean)
{
mean.copyTo(this->mean_);
}
~MeanSubtractorPreprocessor(){}
};
/** @brief Creates a preprocessor backed by ResizerPreprocessor. */
Ptr<ImagePreprocessor> ImagePreprocessor::createResizer()
{
    Ptr<ImagePreprocessor> resizer(new ResizerPreprocessor());
    return resizer;
}
/** @brief Creates a preprocessor backed by StandarizerPreprocessor.
 *  @param sigma value handed to the StandarizerPreprocessor constructor.
 */
Ptr<ImagePreprocessor> ImagePreprocessor::createImageStandarizer(double sigma)
{
    Ptr<ImagePreprocessor> standarizer(new StandarizerPreprocessor(sigma));
    return standarizer;
}
/** @brief Creates a preprocessor backed by customPreprocessor.
 *  @param rawval        raw scale value handed to the customPreprocessor constructor.
 *  @param channel_order channel order string handed to the customPreprocessor constructor.
 */
Ptr<ImagePreprocessor> ImagePreprocessor::createImageCustomPreprocessor(double rawval, String channel_order)
{
    Ptr<ImagePreprocessor> custom(new customPreprocessor(rawval, channel_order));
    return custom;
}
/** @brief Creates a preprocessor backed by MeanSubtractorPreprocessor.
 *  @param meanImg mean image; the preprocessor keeps its own copy of it.
 */
Ptr<ImagePreprocessor> ImagePreprocessor::createImageMeanSubtractor(InputArray meanImg)
{
    Mat meanMat = meanImg.getMat();
    Ptr<ImagePreprocessor> subtractor(new MeanSubtractorPreprocessor(meanMat));
    return subtractor;
}
}
}
#include "precomp.hpp"
#include "opencv2/imgproc.hpp"
#include "opencv2/highgui.hpp"
#include "opencv2/core.hpp"
#include <iostream>
#include <fstream>
#include <sstream>
#include <queue>
#include <algorithm>
#include <iosfwd>
#include <memory>
#include <string>
#include <utility>
#include <vector>
#ifdef HAVE_CAFFE
#include "caffe/caffe.hpp"
#endif
#ifdef HAVE_DNN
#include "opencv2/dnn.hpp"
#endif
using namespace cv;
using namespace cv::dnn;
using namespace std;
namespace cv { namespace text {
//Maybe OpenCV has a routine better suited
/** Returns true when @p filename can be opened for reading (stream is good() right after opening). */
inline bool fileExists(String filename)
{
    return std::ifstream(filename.c_str()).good();
}
//************************************************************************************
//****************** TextImageClassifier *****************************************
//************************************************************************************
void TextImageClassifier::preprocess(const Mat& input,Mat& output)
{
this->preprocessor_->preprocess_(input,output,this->inputGeometry_,this->channelCount_);
}
/** @brief Attaches the preprocessor used by preprocess().
 *  @param ptr preprocessor to attach; must not be empty (asserted).
 */
void TextImageClassifier::setPreprocessor(Ptr<ImagePreprocessor> ptr)
{
    CV_Assert(!ptr.empty());
    this->preprocessor_ = ptr;
}
/** @brief Returns the currently attached preprocessor (may be empty if none was set). */
Ptr<ImagePreprocessor> TextImageClassifier::getPreprocessor()
{
    return this->preprocessor_;
}
class DeepCNNCaffeImpl: public DeepCNN{
protected:
void classifyMiniBatch(std::vector<Mat> inputImageList, Mat outputMat)
{
//Classifies a list of images containing at most minibatchSz_ images
CV_Assert(int(inputImageList.size())<=this->minibatchSz_);
CV_Assert(outputMat.isContinuous());
#ifdef HAVE_CAFFE
net_->input_blobs()[0]->Reshape(inputImageList.size(), this->channelCount_,this->inputGeometry_.height,this->inputGeometry_.width);
net_->Reshape();
float* inputBuffer=net_->input_blobs()[0]->mutable_cpu_data();
float* inputData=inputBuffer;
for(size_t imgNum=0;imgNum<inputImageList.size();imgNum++)
{
std::vector<Mat> input_channels;
Mat preprocessed;
// if the image have multiple color channels the input layer should be populated accordingly
for (int channel=0;channel < this->channelCount_;channel++){
cv::Mat netInputWraped(this->inputGeometry_.height, this->inputGeometry_.width, CV_32FC1, inputData);
input_channels.push_back(netInputWraped);
//input_data += width * height;
inputData+=(this->inputGeometry_.height*this->inputGeometry_.width);
}
this->preprocess(inputImageList[imgNum],preprocessed);
split(preprocessed, input_channels);
}
this->net_->ForwardPrefilled();
const float* outputNetData=net_->output_blobs()[0]->cpu_data();
this->outputGeometry_ = Size(net_->output_blobs()[0]->width(),net_->output_blobs()[0]->height());
int outputSz = this->outputSize_ * this->outputGeometry_.height * this->outputGeometry_.width;
//outputMat.resize(this->outputGeometry_.height * this->outputGeometry_.width);
float*outputMatData=(float*)(outputMat.data);
memcpy(outputMatData,outputNetData,sizeof(float)*outputSz*inputImageList.size());
#endif
}
#ifdef HAVE_CAFFE
Ptr<caffe::Net<float> > net_;
#endif
//Size inputGeometry_;//=Size(100,32);
int minibatchSz_;//The existence of the assignment operator mandates this to be nonconst
int outputSize_;
//Size outputGeometry_;
public:
DeepCNNCaffeImpl(const DeepCNNCaffeImpl& dn):
minibatchSz_(dn.minibatchSz_),outputSize_(dn.outputSize_){
channelCount_=dn.channelCount_;
inputGeometry_=dn.inputGeometry_;
//Implemented to supress Visual Studio warning "assignment operator could not be generated"
#ifdef HAVE_CAFFE
this->net_=dn.net_;
#endif
}
DeepCNNCaffeImpl& operator=(const DeepCNNCaffeImpl &dn)
{
#ifdef HAVE_CAFFE
this->net_=dn.net_;
#endif
this->setPreprocessor(dn.preprocessor_);
this->inputGeometry_=dn.inputGeometry_;
this->channelCount_=dn.channelCount_;
this->minibatchSz_=dn.minibatchSz_;
this->outputSize_=dn.outputSize_;
this->preprocessor_=dn.preprocessor_;
this->outputGeometry_=dn.outputGeometry_;
return *this;
//Implemented to supress Visual Studio warning "assignment operator could not be generated"
}
DeepCNNCaffeImpl(String modelArchFilename, String modelWeightsFilename,Ptr<ImagePreprocessor> preprocessor, int maxMinibatchSz)
:minibatchSz_(maxMinibatchSz)
{
CV_Assert(this->minibatchSz_>0);
CV_Assert(fileExists(modelArchFilename));
CV_Assert(fileExists(modelWeightsFilename));
CV_Assert(!preprocessor.empty());
this->setPreprocessor(preprocessor);
#ifdef HAVE_CAFFE
this->net_.reset(new caffe::Net<float>(modelArchFilename, caffe::TEST));
CV_Assert(net_->num_inputs()==1);
CV_Assert(net_->num_outputs()==1);
CV_Assert(this->net_->input_blobs()[0]->channels()==1
||this->net_->input_blobs()[0]->channels()==3);
this->channelCount_=this->net_->input_blobs()[0]->channels();
this->net_->CopyTrainedLayersFrom(modelWeightsFilename);
caffe::Blob<float>* inputLayer = this->net_->input_blobs()[0];
this->inputGeometry_=Size(inputLayer->width(), inputLayer->height());
this->channelCount_ = inputLayer->channels();
inputLayer->Reshape(this->minibatchSz_,this->channelCount_,this->inputGeometry_.height, this->inputGeometry_.width);
net_->Reshape();
this->outputSize_=net_->output_blobs()[0]->channels();
this->outputGeometry_ = Size(net_->output_blobs()[0]->width(),net_->output_blobs()[0]->height());
#else
CV_Error(Error::StsError,"Caffe not available during compilation!");
#endif
}
void classify(InputArray image, OutputArray classProbabilities)
{
std::vector<Mat> inputImageList;
inputImageList.push_back(image.getMat());
classifyBatch(inputImageList,classProbabilities);
}
void classifyBatch(InputArrayOfArrays inputImageList, OutputArray classProbabilities)
{
std::vector<Mat> allImageVector;
inputImageList.getMatVector(allImageVector);
size_t outputSize=size_t(this->outputSize_);//temporary variable to avoid int to size_t arithmentic
size_t minibatchSize=size_t(this->minibatchSz_);//temporary variable to avoid int to size_t arithmentic
classProbabilities.create(Size(int(outputSize),int(allImageVector.size())),CV_32F);
Mat outputMat = classProbabilities.getMat();
for(size_t imgNum=0;imgNum<allImageVector.size();imgNum+=minibatchSize)
{
size_t rangeEnd=imgNum+std::min<size_t>(allImageVector.size()-imgNum,minibatchSize);
std::vector<Mat>::const_iterator from=std::vector<Mat>::const_iterator(allImageVector.begin()+imgNum);
std::vector<Mat>::const_iterator to=std::vector<Mat>::const_iterator(allImageVector.begin()+rangeEnd);
std::vector<Mat> minibatchInput(from,to);
classifyMiniBatch(minibatchInput,outputMat.rowRange(int(imgNum),int(rangeEnd)));
}
}
int getOutputSize()
{
return this->outputSize_;
}
Size getOutputGeometry()
{
return this->outputGeometry_;
}
int getMinibatchSize()
{
return this->minibatchSz_;
}
int getBackend()
{
return OCR_HOLISTIC_BACKEND_CAFFE;
}
};
class DeepCNNOpenCvDNNImpl: public DeepCNN{
protected:
void classifyMiniBatch(std::vector<Mat> inputImageList, Mat outputMat)
{
//Classifies a list of images containing at most minibatchSz_ images
CV_Assert(int(inputImageList.size())<=this->minibatchSz_);
CV_Assert(outputMat.isContinuous());
#ifdef HAVE_DNN
std::vector<Mat> preProcessedImList; // to store preprocessed images, should it be handled inside preprocessing class?
Mat preprocessed;
// preprocesses each image in the inputImageList and push to preprocessedImList
for(size_t imgNum=0;imgNum<inputImageList.size();imgNum++)
{
this->preprocess(inputImageList[imgNum],preprocessed);
preProcessedImList.push_back(preprocessed);
}
// set input data blob in dnn::net
net_->setInput(blobFromImages(preProcessedImList,1, this->inputGeometry_), "data");
float*outputMatData=(float*)(outputMat.data);
//Mat outputNet(inputImageList.size(),this->outputSize_,CV_32FC1,outputMatData) ;
Mat outputNet = this->net_->forward();
outputNet = outputNet.reshape(1, 1);
float*outputNetData=(float*)(outputNet.data);
memcpy(outputMatData,outputNetData,sizeof(float)*this->outputSize_*inputImageList.size());
#endif
}
#ifdef HAVE_DNN
Ptr<Net> net_;
#endif
// hard coding input image size. anything in DNN library to get that from prototxt??
// Size inputGeometry_;//=Size(100,32);
int minibatchSz_;//The existence of the assignment operator mandates this to be nonconst
int outputSize_;
//Size outputGeometry_;//= Size(1,1);
//int channelCount_;
// int inputChannel_ ;//=1;
// int _inputHeight;
//int _inputWidth ;
//int _inputChannel ;
public:
DeepCNNOpenCvDNNImpl(const DeepCNNOpenCvDNNImpl& dn):
minibatchSz_(dn.minibatchSz_),outputSize_(dn.outputSize_){
channelCount_=dn.channelCount_;
inputGeometry_=dn.inputGeometry_;
//Implemented to supress Visual Studio warning "assignment operator could not be generated"
#ifdef HAVE_DNN
this->net_=dn.net_;
#endif
}
DeepCNNOpenCvDNNImpl& operator=(const DeepCNNOpenCvDNNImpl &dn)
{
#ifdef HAVE_DNN
this->net_=dn.net_;
#endif
this->setPreprocessor(dn.preprocessor_);
this->inputGeometry_=dn.inputGeometry_;
this->channelCount_=dn.channelCount_;
this->minibatchSz_=dn.minibatchSz_;
this->outputSize_=dn.outputSize_;
this->preprocessor_=dn.preprocessor_;
this->outputGeometry_=dn.outputGeometry_;
return *this;
//Implemented to supress Visual Studio warning "assignment operator could not be generated"
}
// Builds a classifier backed by the OpenCV dnn module.
//   modelArchFilename    - Caffe prototxt describing the network; must exist.
//   modelWeightsFilename - Caffe model with the trained weights; must exist.
//   preprocessor         - image preprocessor; must not be empty.
//   maxMinibatchSz       - maximum number of images per forward pass; must be > 0.
//   inputWidth/Height    - size of the network input (stored in inputGeometry_).
//   inputChannel         - number of input channels (stored in channelCount_).
// Raises a cv::Exception (via CV_Assert / CV_Error) on any invalid argument, when the
// network cannot be loaded, or when the module was built without dnn support.
DeepCNNOpenCvDNNImpl(String modelArchFilename, String modelWeightsFilename,Ptr<ImagePreprocessor> preprocessor, int maxMinibatchSz,int inputWidth ,int inputHeight ,int inputChannel )
    :minibatchSz_(maxMinibatchSz),outputSize_(0)
{
    CV_Assert(this->minibatchSz_>0);
    CV_Assert(fileExists(modelArchFilename));
    CV_Assert(fileExists(modelWeightsFilename));
    CV_Assert(!preprocessor.empty());
    this->setPreprocessor(preprocessor);
#ifdef HAVE_DNN
    this->net_ = makePtr<Net>(readNetFromCaffe(modelArchFilename,modelWeightsFilename));
    // makePtr never yields an empty Ptr, so the network itself has to be inspected.
    // Report the failure as an exception instead of terminating the host process.
    if (this->net_.empty() || this->net_->empty())
    {
        CV_Error(Error::StsError,
                 String("Can't load network by using the following files: prototxt: ")
                 + modelArchFilename + ", caffemodel: " + modelWeightsFilename);
    }
    this->inputGeometry_=Size(inputWidth,inputHeight);
    this->channelCount_ = inputChannel;

    // The output size is read from the second-to-last layer: the first dimension
    // of its blob #1.
    const std::vector<String> layerNames = net_->getLayerNames();
    CV_Assert(layerNames.size()>=2);
    Ptr< Layer > outLayer = net_->getLayer(net_->getLayerId(layerNames[layerNames.size()-2]));
    CV_Assert(!outLayer.empty());
    CV_Assert(outLayer->blobs.size()>1);
    this->outputSize_ = (outLayer->blobs)[1].size[0];
#else
    (void)inputWidth;
    (void)inputHeight;
    (void)inputChannel;
    CV_Error(Error::StsError,"DNN module not available during compilation!");
#endif
}
// Classifies one image by treating it as a batch of size one.
void classify(InputArray image, OutputArray classProbabilities)
{
    std::vector<Mat> singleImageBatch(1, image.getMat());
    classifyBatch(singleImageBatch,classProbabilities);
}
// Classifies every image in inputImageList.
// classProbabilities is (re)created as a CV_32F matrix with one row per image and
// outputSize_ columns; images are processed in chunks of at most minibatchSz_.
void classifyBatch(InputArrayOfArrays inputImageList, OutputArray classProbabilities)
{
    std::vector<Mat> images;
    inputImageList.getMatVector(images);

    const size_t imageCount = images.size();
    const size_t chunkCapacity = size_t(this->minibatchSz_);

    classProbabilities.create(Size(this->outputSize_,int(imageCount)),CV_32F);
    Mat probabilities = classProbabilities.getMat();

    size_t first = 0;
    while(first<imageCount)
    {
        // The last chunk may be shorter than chunkCapacity.
        const size_t remaining = imageCount-first;
        const size_t last = first+std::min<size_t>(remaining,chunkCapacity);

        std::vector<Mat> chunk(images.begin()+first,images.begin()+last);
        // Each chunk writes straight into its own rows of the result matrix.
        classifyMiniBatch(chunk,probabilities.rowRange(int(first),int(last)));

        first += chunkCapacity;
    }
}
int getOutputSize()
{
return this->outputSize_;
}
// Returns the stored output geometry.
Size getOutputGeometry()
{
    return outputGeometry_;
}
int getMinibatchSize()
{
return this->minibatchSz_;
}
// Identifies this implementation as the OpenCV dnn backend.
int getBackend()
{
    const int backendId = OCR_HOLISTIC_BACKEND_DNN;
    return backendId;
}
};
// Creates a DeepCNN classifier on the requested backend.
// An empty preprocessor is replaced by ImagePreprocessor::createResizer().
// OCR_HOLISTIC_BACKEND_DEFAULT picks Caffe when compiled in, otherwise the dnn module,
// otherwise raises an error. Any other unknown backend value also raises an error.
Ptr<DeepCNN> DeepCNN::create(String archFilename,String weightsFilename,Ptr<ImagePreprocessor> preprocessor,int minibatchSz,int backEnd)
{
    if(preprocessor.empty())
        preprocessor=ImagePreprocessor::createResizer();

    if(backEnd==OCR_HOLISTIC_BACKEND_DEFAULT)
    {
#ifdef HAVE_CAFFE
        return Ptr<DeepCNN>(new DeepCNNCaffeImpl(archFilename, weightsFilename,preprocessor, minibatchSz));
#elif defined(HAVE_DNN)
        return Ptr<DeepCNN>(new DeepCNNOpenCvDNNImpl(archFilename, weightsFilename,preprocessor, minibatchSz,100,32,1));
#else
        CV_Error(Error::StsError,"DeepCNN::create backend not implemented");
        return Ptr<DeepCNN>();
#endif
    }

    if(backEnd==OCR_HOLISTIC_BACKEND_CAFFE)
        return Ptr<DeepCNN>(new DeepCNNCaffeImpl(archFilename, weightsFilename,preprocessor, minibatchSz));

    if(backEnd==OCR_HOLISTIC_BACKEND_DNN)
        return Ptr<DeepCNN>(new DeepCNNOpenCvDNNImpl(archFilename, weightsFilename,preprocessor, minibatchSz,100,32,1));

    // OCR_HOLISTIC_BACKEND_NONE and every unrecognised value end up here.
    CV_Error(Error::StsError,"DeepCNN::create backend not implemented");
    return Ptr<DeepCNN>();
}
// Creates a DeepCNN with the fixed DictNet configuration used in this file:
// an image standarizer built with 113 and a mini-batch size of 100.
// Backend selection follows the same rules as DeepCNN::create.
Ptr<DeepCNN> DeepCNN::createDictNet(String archFilename,String weightsFilename,int backEnd)
{
    Ptr<ImagePreprocessor> preprocessor=ImagePreprocessor::createImageStandarizer(113);

    if(backEnd==OCR_HOLISTIC_BACKEND_DEFAULT)
    {
#ifdef HAVE_CAFFE
        return Ptr<DeepCNN>(new DeepCNNCaffeImpl(archFilename, weightsFilename,preprocessor, 100));
#elif defined(HAVE_DNN)
        return Ptr<DeepCNN>(new DeepCNNOpenCvDNNImpl(archFilename, weightsFilename,preprocessor, 100,100,32,1));
#else
        CV_Error(Error::StsError,"DeepCNN::create backend not implemented");
        return Ptr<DeepCNN>();
#endif
    }

    if(backEnd==OCR_HOLISTIC_BACKEND_CAFFE)
        return Ptr<DeepCNN>(new DeepCNNCaffeImpl(archFilename, weightsFilename,preprocessor, 100));

    if(backEnd==OCR_HOLISTIC_BACKEND_DNN)
        return Ptr<DeepCNN>(new DeepCNNOpenCvDNNImpl(archFilename, weightsFilename,preprocessor, 100,100,32,1));

    // OCR_HOLISTIC_BACKEND_NONE and every unrecognised value end up here.
    CV_Error(Error::StsError,"DeepCNN::create backend not implemented");
    return Ptr<DeepCNN>();
}
namespace cnn_config{
// Lists a human-readable description for every backend compiled into this build.
// The list is empty when neither Caffe nor the dnn module is available.
std::vector<std::string> getAvailableBackends()
{
    std::vector<std::string> compiledIn;
#ifdef HAVE_CAFFE
    compiledIn.push_back("CAFFE, OCR_HOLISTIC_BACKEND_CAFFE"); // Caffe based backend
#endif
#ifdef HAVE_DNN
    compiledIn.push_back("DNN, OCR_HOLISTIC_BACKEND_DNN"); // opencv_dnn based backend
#endif
    return compiledIn;
}
namespace caffe_backend{
#ifdef HAVE_CAFFE
// True when Caffe is currently set to GPU mode.
bool getCaffeGpuMode()
{
    return caffe::Caffe::mode()==caffe::Caffe::GPU;
}
// Switches Caffe between GPU (useGpu == true) and CPU mode.
void setCaffeGpuMode(bool useGpu)
{
    if(!useGpu)
    {
        caffe::Caffe::set_mode(caffe::Caffe::CPU);
        return;
    }
    caffe::Caffe::set_mode(caffe::Caffe::GPU);
}
// Caffe support was compiled in.
bool getCaffeAvailable()
{
    return true;
}
#else
// Built without Caffe: querying the mode is an error.
bool getCaffeGpuMode()
{
    CV_Error(Error::StsError,"Caffe not available during compilation!");
    return false;
}
// Built without Caffe: setting the mode is an error.
void setCaffeGpuMode(bool useGpu)
{
    CV_Error(Error::StsError,"Caffe not available during compilation!");
    // Only present so that the parameter counts as used.
    CV_Assert(useGpu==1);
}
// Caffe support was not compiled in.
bool getCaffeAvailable()
{
    return false;
}
#endif
}//namespace caffe_backend
namespace dnn_backend{
// Reports whether this build was compiled with the OpenCV dnn module (HAVE_DNN).
bool getDNNAvailable()
{
#ifdef HAVE_DNN
    return true;
#else
    return false;
#endif
}
}//namespace dnn_backend
}//namespace cnn_config
class OCRHolisticWordRecognizerImpl: public OCRHolisticWordRecognizer{
private:
struct NetOutput{
//Auxiliary structure that handles the logic of getting class ids and probabillities from
//the raw outputs of caffe
int wordIdx;
float probabillity;
static bool sorter(const NetOutput& o1,const NetOutput& o2)
{//used with std::sort to provide the most probable class
return o1.probabillity>o2.probabillity;
}
static void getOutputs(const float* buffer,int nbOutputs,std::vector<NetOutput>& res)
{
res.resize(nbOutputs);
for(int k=0;k<nbOutputs;k++)
{
res[k].wordIdx=k;
res[k].probabillity=buffer[k];
}
std::sort(res.begin(),res.end(),NetOutput::sorter);
}
static void getClassification(const float* buffer,int nbOutputs,int &classNum,double& confidence)
{
std::vector<NetOutput> tmp;
getOutputs(buffer,nbOutputs,tmp);
classNum=tmp[0].wordIdx;
confidence=tmp[0].probabillity;
}
};
protected:
std::vector<String> labels_;
Ptr<TextImageClassifier> classifier_;
public:
OCRHolisticWordRecognizerImpl(Ptr<TextImageClassifier> classifierPtr,String vocabularyFilename):classifier_(classifierPtr)
{
CV_Assert(fileExists(vocabularyFilename));//this fails for some rason
std::ifstream labelsFile(vocabularyFilename.c_str());
if(!labelsFile)
{
CV_Error(Error::StsError,"Could not read Labels from file");
}
std::string line;
while (std::getline(labelsFile, line))
{
labels_.push_back(std::string(line));
}
CV_Assert(this->classifier_->getOutputSize()==int(this->labels_.size()));
}
OCRHolisticWordRecognizerImpl(Ptr<TextImageClassifier> classifierPtr,const std::vector<String>& vocabulary):classifier_(classifierPtr)
{
this->labels_=vocabulary;
CV_Assert(this->classifier_->getOutputSize()==int(this->labels_.size()));
}
void recogniseImage(InputArray inputImage,CV_OUT String& transcription,CV_OUT double& confidence)
{
Mat netOutput;
this->classifier_->classify(inputImage,netOutput);
int classNum;
NetOutput::getClassification((float*)(netOutput.data),this->classifier_->getOutputSize(),classNum,confidence);
transcription=this->labels_[classNum];
}
void recogniseImageBatch(InputArrayOfArrays inputImageList,CV_OUT std::vector<String>& transcriptionVec,CV_OUT std::vector<double>& confidenceVec)
{
Mat netOutput;
this->classifier_->classifyBatch(inputImageList,netOutput);
for(int k=0;k<netOutput.rows;k++)
{
int classNum;
double confidence;
NetOutput::getClassification((float*)(netOutput.row(k).data),this->classifier_->getOutputSize(),classNum,confidence);
transcriptionVec.push_back(this->labels_[classNum]);
confidenceVec.push_back(confidence);
}
}
void run(Mat& image, std::string& output_text, std::vector<Rect>* component_rects=NULL,
std::vector<std::string>* component_texts=NULL, std::vector<float>* component_confidences=NULL,
int component_level=0)
{
CV_Assert(component_level==OCR_LEVEL_WORD);//Componnents not applicable for word spotting
double confidence;
String transcription;
recogniseImage(image,transcription,confidence);
output_text=transcription.c_str();
if(component_rects!=NULL)
{
component_rects->resize(1);
(*component_rects)[0]=Rect(0,0,image.size().width,image.size().height);
}
if(component_texts!=NULL)
{
component_texts->resize(1);
(*component_texts)[0]=transcription.c_str();
}
if(component_confidences!=NULL)
{
component_confidences->resize(1);
(*component_confidences)[0]=float(confidence);
}
}
void run(Mat& image, Mat& mask, std::string& output_text, std::vector<Rect>* component_rects=NULL,
std::vector<std::string>* component_texts=NULL, std::vector<float>* component_confidences=NULL,
int component_level=0)
{
CV_Assert(mask.cols==image.cols && mask.rows== image.rows);//Mask is ignored because the CNN operates on a full image
this->run(image,output_text,component_rects,component_texts,component_confidences,component_level);
}
std::vector<String>& getVocabulary()
{
return this->labels_;
}
Ptr<TextImageClassifier> getClassifier()
{
return this->classifier_;
}
};
// Wraps an existing classifier together with a vocabulary loaded from a file.
Ptr<OCRHolisticWordRecognizer> OCRHolisticWordRecognizer::create(Ptr<TextImageClassifier> classifierPtr,String vocabularyFilename )
{
    Ptr<OCRHolisticWordRecognizer> recognizer(new OCRHolisticWordRecognizerImpl(classifierPtr,vocabularyFilename));
    return recognizer;
}
// Builds a DictNet classifier from the model files and pairs it with a vocabulary
// loaded from vocabularyFilename.
// The classifier comes from DeepCNN::createDictNet with the default backend, which
// applies the same settings as before (standarizer 113, mini-batch 100) but no longer
// hard-wires Caffe: builds that only have the dnn module get a working recognizer.
Ptr<OCRHolisticWordRecognizer> OCRHolisticWordRecognizer::create(String modelArchFilename, String modelWeightsFilename, String vocabularyFilename)
{
    Ptr<TextImageClassifier> classifierPtr=DeepCNN::createDictNet(modelArchFilename,modelWeightsFilename,OCR_HOLISTIC_BACKEND_DEFAULT);
    return Ptr<OCRHolisticWordRecognizer>(new OCRHolisticWordRecognizerImpl(classifierPtr,vocabularyFilename));
}
// Wraps an existing classifier together with an in-memory vocabulary.
Ptr<OCRHolisticWordRecognizer> OCRHolisticWordRecognizer::create(Ptr<TextImageClassifier> classifierPtr,const std::vector<String>& vocabulary)
{
    Ptr<OCRHolisticWordRecognizer> recognizer(new OCRHolisticWordRecognizerImpl(classifierPtr,vocabulary));
    return recognizer;
}
// Builds a DictNet classifier from the model files and pairs it with an in-memory
// vocabulary.
// The classifier comes from DeepCNN::createDictNet with the default backend, which
// applies the same settings as before (standarizer 113, mini-batch 100) but no longer
// hard-wires Caffe: builds that only have the dnn module get a working recognizer.
Ptr<OCRHolisticWordRecognizer> OCRHolisticWordRecognizer::create(String modelArchFilename, String modelWeightsFilename,const std::vector<String>& vocabulary)
{
    Ptr<TextImageClassifier> classifierPtr=DeepCNN::createDictNet(modelArchFilename,modelWeightsFilename,OCR_HOLISTIC_BACKEND_DEFAULT);
    return Ptr<OCRHolisticWordRecognizer>(new OCRHolisticWordRecognizerImpl(classifierPtr,vocabulary));
}
} } //namespace text namespace cv
...@@ -45,6 +45,8 @@ ...@@ -45,6 +45,8 @@
#include "opencv2/text.hpp" #include "opencv2/text.hpp"
#include "text_config.hpp"
#ifdef HAVE_TESSERACT #ifdef HAVE_TESSERACT
#if !defined(USE_STD_NAMESPACE) #if !defined(USE_STD_NAMESPACE)
#define USE_STD_NAMESPACE #define USE_STD_NAMESPACE
......
#include "precomp.hpp"
#include "opencv2/imgproc.hpp"
#include "opencv2/core.hpp"
#include <iostream>
#include <fstream>
#include <sstream>
#include <queue>
#include <algorithm>
#include <iosfwd>
#include <memory>
#include <string>
#include <utility>
#include <vector>
//#ifdef HAVE_CAFFE
//#include "caffe/caffe.hpp"
//#endif
namespace cv { namespace text {
class textDetectImpl: public textDetector{
private:
struct NetOutput{
//Auxiliary structure that handles the logic of getting bounding box and confidences of textness from
//the raw outputs of caffe
Rect bbox;
float probability;
static void getOutputs(const float* buffer,int nbrTextBoxes,int nCol,std::vector<NetOutput>& res,Size inputShape)
{
res.resize(nbrTextBoxes);
for(int k=0;k<nbrTextBoxes;k++)
{
float x_min = buffer[k*nCol+3]*inputShape.width;
float y_min = buffer[k*nCol+4]*inputShape.height;
float x_max = buffer[k*nCol+5]*inputShape.width;
float y_max = buffer[k*nCol +6]*inputShape.height;
x_min = x_min<0?0:x_min;
y_min = y_min<0?0:y_min;
x_max = x_max> inputShape.width?inputShape.width-1:x_max;
y_max = y_max > inputShape.height?inputShape.height-1:y_max;
float wd = x_max-x_min+1;
float ht = y_max-y_min+1;
res[k].bbox=Rect(int(x_min),int(y_min),int(wd),int(ht));
res[k].probability=buffer[k*nCol+2];
}
}
};
protected:
Ptr<TextRegionDetector> classifier_;
public:
textDetectImpl(Ptr<TextRegionDetector> classifierPtr):classifier_(classifierPtr)
{
}
void textDetectInImage(InputArray inputImage,CV_OUT std::vector<Rect>& Bbox,CV_OUT std::vector<float>& confidence)
{
Mat netOutput;
// call the detect function of deepTextCNN class
this->classifier_->detect(inputImage,netOutput);
// get the output geometry i.e height and width of output blob from caffe
Size OutputGeometry_ = this->classifier_->getOutputGeometry();
int nbrTextBoxes = OutputGeometry_.height;
int nCol = OutputGeometry_.width;
std::vector<NetOutput> tmp;
// the output bounding box needs to be resized by the input height and width
Size inputImageShape = Size(inputImage.cols(),inputImage.rows());
NetOutput::getOutputs((float*)(netOutput.data),nbrTextBoxes,nCol,tmp,inputImageShape);
// put the output in CV_OUT
for (int k=0;k<nbrTextBoxes;k++)
{
Bbox.push_back(tmp[k].bbox);
confidence.push_back(tmp[k].probability);
}
}
void run(Mat& image, std::vector<Rect>* component_rects=NULL,
std::vector<float>* component_confidences=NULL,
int component_level=0)
{
CV_Assert(component_level==OCR_LEVEL_WORD);//Componnents not applicable for word spotting
std::vector<Rect> bbox;
std::vector<float> score;
textDetectInImage(image,bbox,score);
if(component_rects!=NULL)
{
component_rects->resize(bbox.size()); // should be a user behavior
component_rects = &bbox;
}
if(component_confidences!=NULL)
{
component_confidences->resize(score.size()); // shoub be a user behavior
component_confidences = &score;
}
}
void run(Mat& image, Mat& mask, std::vector<Rect>* component_rects=NULL,
std::vector<float>* component_confidences=NULL,
int component_level=0)
{
CV_Assert(mask.cols==image.cols && mask.rows== image.rows);//Mask is ignored because the CNN operates on a full image
this->run(image,component_rects,component_confidences,component_level);
}
Ptr<TextRegionDetector> getClassifier()
{
return this->classifier_;
}
};
// Wraps an existing region detector in a textDetector.
Ptr<textDetector> textDetector::create(Ptr<TextRegionDetector> classifierPtr)
{
    Ptr<textDetector> detector(new textDetectImpl(classifierPtr));
    return detector;
}
// Builds a textDetector from model files, using a custom preprocessor (created with 255)
// whose mean is set to the three values 104, 117, 123, and a mini-batch size of 1.
Ptr<textDetector> textDetector::create(String modelArchFilename, String modelWeightsFilename)
{
    // per-channel mean handed to the preprocessor
    Mat textbox_mean = (Mat_<uchar>(1,3) << 104, 117, 123);

    // custom preprocessor with rawval
    Ptr<ImagePreprocessor> preprocessor=ImagePreprocessor::createImageCustomPreprocessor(255);
    preprocessor->set_mean(textbox_mean);

    // region detector that the textDetector delegates to
    Ptr<TextRegionDetector> classifierPtr(DeepCNNTextDetector::create(modelArchFilename,modelWeightsFilename,preprocessor,1));
    Ptr<textDetector> detector(new textDetectImpl(classifierPtr));
    return detector;
}
} } //namespace text namespace cv
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
#include "precomp.hpp" #include "precomp.hpp"
#include "opencv2/imgproc.hpp" #include "opencv2/imgproc.hpp"
#include "opencv2/core.hpp" #include "opencv2/core.hpp"
#include <iostream>
#include <fstream> #include <fstream>
#include <sstream>
#include <queue>
#include <algorithm> #include <algorithm>
#include <iosfwd>
#include <memory>
#include <string>
#include <utility>
#include <vector>
#ifdef HAVE_CAFFE
#include "caffe/caffe.hpp"
#endif
#ifdef HAVE_DNN
#include "opencv2/dnn.hpp" #include "opencv2/dnn.hpp"
#endif
using namespace cv::dnn; using namespace cv::dnn;
#define CV_WARN(message) fprintf(stderr, "warning: %s (%s:%d)\n", message, __FILE__, __LINE__) namespace cv
{
namespace cv { namespace text { namespace text
{
inline bool fileExists (String filename) {
std::ifstream f(filename.c_str());
return f.good();
}
class DeepCNNTextDetectorCaffeImpl: public DeepCNNTextDetector{ class TextDetectorCNNImpl : public TextDetectorCNN
{
protected: protected:
Net net_;
std::vector<Size> sizes_;
int inputChannelCount_;
bool detectMultiscale_;
void process_(Mat inputImage, Mat &outputMat) void getOutputs(const float* buffer,int nbrTextBoxes,int nCol,
std::vector<Rect>& Bbox, std::vector<float>& confidence, Size inputShape)
{ {
// do forward pass and stores the output in outputMat for(int k = 0; k < nbrTextBoxes; k++)
CV_Assert(outputMat.isContinuous());
if (inputImage.channels() != this->inputChannelCount_)
CV_WARN("Number of input channel(s) in the model is not same as input");
#ifdef HAVE_CAFFE
net_->input_blobs()[0]->Reshape(1, this->inputChannelCount_,this->inputGeometry_.height,this->inputGeometry_.width);
net_->Reshape();
float* inputBuffer=net_->input_blobs()[0]->mutable_cpu_data();
float* inputData=inputBuffer;
std::vector<Mat> input_channels;
Mat preprocessed;
// if the image have multiple color channels the input layer should be populated accordingly
for (int channel=0;channel < this->inputChannelCount_;channel++){
cv::Mat netInputWraped(this->inputGeometry_.height, this->inputGeometry_.width, CV_32FC1, inputData);
input_channels.push_back(netInputWraped);
//input_data += width * height;
inputData+=(this->inputGeometry_.height*this->inputGeometry_.width);
}
this->preprocess(inputImage,preprocessed);
split(preprocessed, input_channels);
//preprocessed.copyTo(netInputWraped);
this->net_->Forward();
const float* outputNetData=net_->output_blobs()[0]->cpu_data();
// const float* outputNetData1=net_->output_blobs()[1]->cpu_data();
this->outputGeometry_.height = net_->output_blobs()[0]->height();
this->outputGeometry_.width = net_->output_blobs()[0]->width();
this->outputChannelCount_ = net_->output_blobs()[0]->channels();
int outputSz = this->outputChannelCount_ * this->outputGeometry_.height * this->outputGeometry_.width;
outputMat.create(this->outputGeometry_.height , this->outputGeometry_.width,CV_32FC1);
float*outputMatData=(float*)(outputMat.data);
memcpy(outputMatData,outputNetData,sizeof(float)*outputSz);
#endif
}
#ifdef HAVE_CAFFE
Ptr<caffe::Net<float> > net_;
#endif
//Size inputGeometry_;
int minibatchSz_;//The existence of the assignment operator mandates this to be nonconst
//int outputSize_;
public:
DeepCNNTextDetectorCaffeImpl(const DeepCNNTextDetectorCaffeImpl& dn):
minibatchSz_(dn.minibatchSz_){
outputGeometry_=dn.outputGeometry_;
inputGeometry_=dn.inputGeometry_;
//Implemented to supress Visual Studio warning "assignment operator could not be generated"
#ifdef HAVE_CAFFE
this->net_=dn.net_;
#endif
}
DeepCNNTextDetectorCaffeImpl& operator=(const DeepCNNTextDetectorCaffeImpl &dn)
{ {
#ifdef HAVE_CAFFE float x_min = buffer[k*nCol + 3]*inputShape.width;
this->net_=dn.net_; float y_min = buffer[k*nCol + 4]*inputShape.height;
#endif
this->setPreprocessor(dn.preprocessor_);
this->inputGeometry_=dn.inputGeometry_;
this->inputChannelCount_=dn.inputChannelCount_;
this->outputChannelCount_ = dn.outputChannelCount_;
// this->minibatchSz_=dn.minibatchSz_;
//this->outputGeometry_=dn.outputSize_;
this->preprocessor_=dn.preprocessor_;
this->outputGeometry_=dn.outputGeometry_;
return *this;
//Implemented to supress Visual Studio warning "assignment operator could not be generated"
}
DeepCNNTextDetectorCaffeImpl(String modelArchFilename, String modelWeightsFilename,Ptr<ImagePreprocessor> preprocessor, int maxMinibatchSz)
:minibatchSz_(maxMinibatchSz)
{
CV_Assert(this->minibatchSz_>0);
CV_Assert(fileExists(modelArchFilename));
CV_Assert(fileExists(modelWeightsFilename));
CV_Assert(!preprocessor.empty());
this->setPreprocessor(preprocessor);
#ifdef HAVE_CAFFE
this->net_.reset(new caffe::Net<float>(modelArchFilename, caffe::TEST));
CV_Assert(net_->num_inputs()==1);
CV_Assert(net_->num_outputs()==1);
CV_Assert(this->net_->input_blobs()[0]->channels()==1
||this->net_->input_blobs()[0]->channels()==3);
// this->channelCount_=this->net_->input_blobs()[0]->channels();
this->net_->CopyTrainedLayersFrom(modelWeightsFilename);
caffe::Blob<float>* inputLayer = this->net_->input_blobs()[0];
this->inputGeometry_.height = inputLayer->height(); float x_max = buffer[k*nCol + 5]*inputShape.width;
this->inputGeometry_.width = inputLayer->width(); float y_max = buffer[k*nCol + 6]*inputShape.height;
this->inputChannelCount_ = inputLayer->channels();
//this->inputGeometry_.batchSize =1;
inputLayer->Reshape(this->minibatchSz_,this->inputChannelCount_,this->inputGeometry_.height, this->inputGeometry_.width); CV_Assert(x_min < x_max, y_min < y_max);
net_->Reshape();
this->outputChannelCount_ = net_->output_blobs()[0]->channels();
//this->outputGeometry_.batchSize =1;
this->outputGeometry_.height =net_->output_blobs()[0]->height();
this->outputGeometry_.width = net_->output_blobs()[0]->width();
#else x_min = std::max(0.f, x_min);
CV_Error(Error::StsError,"Caffe not available during compilation!"); y_min = std::max(0.f, y_min);
#endif
}
x_max = std::min(inputShape.width - 1.f, x_max);
y_max = std::min(inputShape.height - 1.f, y_max);
void detect(InputArray image, OutputArray Bbox_prob) int wd = cvRound(x_max - x_min);
{ int ht = cvRound(y_max - y_min);
Size outSize = Size(this->outputGeometry_.height,outputGeometry_.width);
Bbox_prob.create(outSize,CV_32F); // dummy initialization is it needed
Mat outputMat = Bbox_prob.getMat();
process_(image.getMat(),outputMat);
//copy back to outputArray
outputMat.copyTo(Bbox_prob);
}
Size getOutputGeometry() Bbox.push_back(Rect(cvRound(x_min), cvRound(y_min), wd, ht));
{ confidence.push_back(buffer[k*nCol + 2]);
return this->outputGeometry_;
} }
Size getinputGeometry()
{
return this->inputGeometry_;
}
int getMinibatchSize()
{
return this->minibatchSz_;
}
int getBackend()
{
return OCR_HOLISTIC_BACKEND_CAFFE;
}
void setPreprocessor(Ptr<ImagePreprocessor> ptr)
{
CV_Assert(!ptr.empty());
preprocessor_=ptr;
}
Ptr<ImagePreprocessor> getPreprocessor()
{
return preprocessor_;
}
};
class DeepCNNTextDetectorDNNImpl: public DeepCNNTextDetector{
protected:
void process_(Mat inputImage, Mat &outputMat)
{
// do forward pass and stores the output in outputMat
CV_Assert(outputMat.isContinuous());
if (inputImage.channels() != this->inputChannelCount_)
CV_WARN("Number of input channel(s) in the model is not same as input");
#ifdef HAVE_DNN
Mat preprocessed;
this->preprocess(inputImage,preprocessed);
net_->setInput(blobFromImage(preprocessed,1, this->inputGeometry_), "data");
Mat outputNet = this->net_->forward( );
this->outputGeometry_.height = outputNet.size[2];
this->outputGeometry_.width = outputNet.size[3];
this->outputChannelCount_ = outputNet.size[1];
outputMat.create(this->outputGeometry_.height , this->outputGeometry_.width,CV_32FC1);
float*outputMatData=(float*)(outputMat.data);
float*outputNetData=(float*)(outputNet.data);
int outputSz = this->outputChannelCount_ * this->outputGeometry_.height * this->outputGeometry_.width;
memcpy(outputMatData,outputNetData,sizeof(float)*outputSz);
#endif
} }
#ifdef HAVE_DNN
Ptr<Net> net_;
#endif
//Size inputGeometry_;
int minibatchSz_;//The existence of the assignment operator mandates this to be nonconst
//int outputSize_;
//int inputHeight_;
//int inputWidth_;
//int inputChannel_;
public: public:
DeepCNNTextDetectorDNNImpl(const DeepCNNTextDetectorDNNImpl& dn): TextDetectorCNNImpl(const String& modelArchFilename, const String& modelWeightsFilename, bool detectMultiscale) :
minibatchSz_(dn.minibatchSz_){ detectMultiscale_(detectMultiscale)
outputGeometry_=dn.outputGeometry_;
inputGeometry_=dn.inputGeometry_;
//Implemented to supress Visual Studio warning "assignment operator could not be generated"
#ifdef HAVE_DNN
this->net_=dn.net_;
#endif
}
DeepCNNTextDetectorDNNImpl& operator=(const DeepCNNTextDetectorDNNImpl &dn)
{ {
#ifdef HAVE_DNN net_ = readNetFromCaffe(modelArchFilename, modelWeightsFilename);
this->net_=dn.net_; CV_Assert(!net_.empty());
#endif inputChannelCount_ = 3;
this->setPreprocessor(dn.preprocessor_); sizes_.push_back(Size(700, 700));
this->inputGeometry_=dn.inputGeometry_;
this->inputChannelCount_=dn.inputChannelCount_;
this->outputChannelCount_ = dn.outputChannelCount_;
// this->minibatchSz_=dn.minibatchSz_;
//this->outputGeometry_=dn.outputSize_;
this->preprocessor_=dn.preprocessor_;
this->outputGeometry_=dn.outputGeometry_;
return *this;
//Implemented to supress Visual Studio warning "assignment operator could not be generated"
}
DeepCNNTextDetectorDNNImpl(String modelArchFilename, String modelWeightsFilename,Ptr<ImagePreprocessor> preprocessor, int maxMinibatchSz,int inputHeight=700,int inputWidth =700,int inputChannel =3)
:minibatchSz_(maxMinibatchSz)
{
CV_Assert(this->minibatchSz_>0);
CV_Assert(fileExists(modelArchFilename));
CV_Assert(fileExists(modelWeightsFilename));
CV_Assert(!preprocessor.empty());
this->setPreprocessor(preprocessor);
#ifdef HAVE_DNN
this->net_ = makePtr<Net>(readNetFromCaffe(modelArchFilename,modelWeightsFilename));
if (this->net_.empty()) if(detectMultiscale_)
{ {
std::cerr << "Can't load network by using the following files: " << std::endl; sizes_.push_back(Size(300, 300));
std::cerr << "prototxt: " << modelArchFilename << std::endl; sizes_.push_back(Size(700,500));
std::cerr << "caffemodel: " << modelWeightsFilename << std::endl; sizes_.push_back(Size(700,300));
//std::cerr << "bvlc_googlenet.caffemodel can be downloaded here:" << std::endl; sizes_.push_back(Size(1600,1600));
//std::cerr << "http://dl.caffe.berkeleyvision.org/bvlc_googlenet.caffemodel" << std::endl;
exit(-1);
} }
this->inputGeometry_.height =inputHeight;
this->inputGeometry_.width = inputWidth ;//inputLayer->width();
this->inputChannelCount_ = inputChannel ;//inputLayer->channels();
#else
CV_Error(Error::StsError,"DNN module not available during compilation!");
#endif
}
void detect(InputArray image, OutputArray Bbox_prob)
{
Size outSize = Size(this->outputGeometry_.height,outputGeometry_.width);
Bbox_prob.create(outSize,CV_32F); // dummy initialization is it needed
Mat outputMat = Bbox_prob.getMat();
process_(image.getMat(),outputMat);
//copy back to outputArray
outputMat.copyTo(Bbox_prob);
} }
Size getOutputGeometry() void textDetectInImage(InputArray inputImage_, std::vector<Rect>& Bbox, std::vector<float>& confidence)
{
return this->outputGeometry_;
}
Size getinputGeometry()
{ {
return this->inputGeometry_; CV_Assert(inputImage_.channels() == inputChannelCount_);
} Mat inputImage = inputImage_.getMat().clone();
Bbox.resize(0);
confidence.resize(0);
int getMinibatchSize() for(size_t i = 0; i < sizes_.size(); i++)
{ {
return this->minibatchSz_; Size inputGeometry = sizes_[i];
net_.setInput(blobFromImage(inputImage, 1, inputGeometry, Scalar(123, 117, 104)), "data");
Mat outputNet = net_.forward();
int nbrTextBoxes = outputNet.size[2];
int nCol = outputNet.size[3];
int outputChannelCount = outputNet.size[1];
CV_Assert(outputChannelCount == 1);
getOutputs((float*)(outputNet.data), nbrTextBoxes, nCol, Bbox, confidence, inputImage.size());
} }
int getBackend()
{
return OCR_HOLISTIC_BACKEND_DNN;
}
void setPreprocessor(Ptr<ImagePreprocessor> ptr)
{
CV_Assert(!ptr.empty());
preprocessor_=ptr;
}
Ptr<ImagePreprocessor> getPreprocessor()
{
return preprocessor_;
} }
}; };
Ptr<DeepCNNTextDetector> DeepCNNTextDetector::create(String archFilename,String weightsFilename,Ptr<ImagePreprocessor> preprocessor,int minibatchSz,int backEnd) Ptr<TextDetectorCNN> TextDetectorCNN::create(const String &modelArchFilename, const String &modelWeightsFilename, bool detectMultiscale)
{ {
if(preprocessor.empty()) return makePtr<TextDetectorCNNImpl>(modelArchFilename, modelWeightsFilename, detectMultiscale);
{
// create a custom preprocessor with rawval
preprocessor=ImagePreprocessor::createImageCustomPreprocessor(255);
// set the mean for the preprocessor
Mat textbox_mean(1,3,CV_8U);
textbox_mean.at<uchar>(0,0)=104;
textbox_mean.at<uchar>(0,1)=117;
textbox_mean.at<uchar>(0,2)=123;
preprocessor->set_mean(textbox_mean);
}
switch(backEnd){
case OCR_HOLISTIC_BACKEND_DEFAULT:
#ifdef HAVE_CAFFE
return Ptr<DeepCNNTextDetector>(new DeepCNNTextDetectorCaffeImpl(archFilename, weightsFilename,preprocessor, minibatchSz));
#elif defined(HAVE_DNN)
return Ptr<DeepCNNTextDetector>(new DeepCNNTextDetectorDNNImpl(archFilename, weightsFilename,preprocessor, minibatchSz,700,700,3));
#else
CV_Error(Error::StsError,"DeepCNNTextDetector::create backend not implemented");
return Ptr<DeepCNNTextDetector>();
#endif
case OCR_HOLISTIC_BACKEND_CAFFE:
return Ptr<DeepCNNTextDetector>(new DeepCNNTextDetectorCaffeImpl(archFilename, weightsFilename,preprocessor, minibatchSz));
break;
case OCR_HOLISTIC_BACKEND_DNN:
return Ptr<DeepCNNTextDetector>(new DeepCNNTextDetectorDNNImpl(archFilename, weightsFilename,preprocessor, minibatchSz,700,700,3));
break;
case OCR_HOLISTIC_BACKEND_NONE:
default:
CV_Error(Error::StsError,"DeepCNNTextDetector::create backend not implemented");
return Ptr<DeepCNNTextDetector>();
break;
}
//return Ptr<DeepCNNTextDetector>();
} }
} //namespace text
} //namespace cv
// Creates a DeepCNNTextDetector with the TextBoxes configuration used in this file:
// a custom preprocessor (created with 255) with mean 104, 117, 123 and a mini-batch size of 1.
// OCR_HOLISTIC_BACKEND_DEFAULT picks Caffe when compiled in, otherwise the dnn module,
// otherwise raises an error. Any other unknown backend value also raises an error.
Ptr<DeepCNNTextDetector> DeepCNNTextDetector::createTextBoxNet(String archFilename,String weightsFilename,int backEnd)
{
    // per-channel mean handed to the preprocessor
    Mat textbox_mean = (Mat_<uchar>(1,3) << 104, 117, 123);

    // custom preprocessor with rawval
    Ptr<ImagePreprocessor> preprocessor=ImagePreprocessor::createImageCustomPreprocessor(255);
    preprocessor->set_mean(textbox_mean);

    if(backEnd==OCR_HOLISTIC_BACKEND_DEFAULT)
    {
#ifdef HAVE_CAFFE
        return Ptr<DeepCNNTextDetector>(new DeepCNNTextDetectorCaffeImpl(archFilename, weightsFilename,preprocessor, 1));
#elif defined(HAVE_DNN)
        return Ptr<DeepCNNTextDetector>(new DeepCNNTextDetectorDNNImpl(archFilename, weightsFilename,preprocessor, 1,700,700,3));
#else
        CV_Error(Error::StsError,"DeepCNNTextDetector::create backend not implemented");
        return Ptr<DeepCNNTextDetector>();
#endif
    }

    if(backEnd==OCR_HOLISTIC_BACKEND_CAFFE)
        return Ptr<DeepCNNTextDetector>(new DeepCNNTextDetectorCaffeImpl(archFilename, weightsFilename,preprocessor, 1));

    if(backEnd==OCR_HOLISTIC_BACKEND_DNN)
        return Ptr<DeepCNNTextDetector>(new DeepCNNTextDetectorDNNImpl(archFilename, weightsFilename,preprocessor, 1,700,700,3));

    // OCR_HOLISTIC_BACKEND_NONE and every unrecognised value end up here.
    CV_Error(Error::StsError,"DeepCNNTextDetector::create backend not implemented");
    return Ptr<DeepCNNTextDetector>();
}
void DeepCNNTextDetector::preprocess(const Mat& input,Mat& output)
{
Size inputHtWd = Size(this->inputGeometry_.height,this->inputGeometry_.width);
this->preprocessor_->preprocess(input,output,inputHtWd,this->inputChannelCount_);
}
} } //namespace text namespace cv
#ifndef __OPENCV_TEXT_CONFIG_HPP__ #ifndef __OPENCV_TEXT_CONFIG_HPP__
#define __OPENCV_TEXT_CONFIG_HPP__ #define __OPENCV_TEXT_CONFIG_HPP__
// HAVE OCR Tesseract
#cmakedefine HAVE_TESSERACT
#endif #endif
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment