Merge pull request #10356 from dkurt:dnn_rfcn

b8a24b36 · Vadim Pisarevsky · d2f13a82 · 0ed2cbc9 · b8a24b36 · b8a24b36
Commit b8a24b36 authored Dec 20, 2017 by Vadim Pisarevsky
7 changed files
--- a/modules/dnn/include/opencv2/dnn/all_layers.hpp
+++ b/modules/dnn/include/opencv2/dnn/all_layers.hpp
@@ -242,6 +242,8 @@ CV__DNN_EXPERIMENTAL_NS_BEGIN
        // ROIPooling parameters.
        Size pooledSize;
        float spatialScale;
+        // PSROIPooling parameters.
+        int psRoiOutChannels;

        static Ptr<PoolingLayer> create(const LayerParams& params);
    };

--- a/modules/dnn/misc/caffe/opencv-caffe.pb.cc
+++ b/modules/dnn/misc/caffe/opencv-caffe.pb.cc
--- a/modules/dnn/misc/caffe/opencv-caffe.pb.h
+++ b/modules/dnn/misc/caffe/opencv-caffe.pb.h
--- a/modules/dnn/src/caffe/opencv-caffe.proto
+++ b/modules/dnn/src/caffe/opencv-caffe.proto
@@ -179,6 +179,8 @@ message DetectionOutputParameter {
  // Only consider detections whose confidences are larger than a threshold.
  // If not provided, consider all boxes.
  optional float confidence_threshold = 9;
+  // If prior boxes are normalized to [0, 1] or not.
+  optional bool normalized_bbox = 10 [default = true];
 }

 message Datum {
@@ -548,6 +550,7 @@ message LayerParameter {
  optional PReLUParameter prelu_param = 131;
  optional PriorBoxParameter prior_box_param = 150;
  optional ProposalParameter proposal_param = 201;
+  optional PSROIPoolingParameter psroi_pooling_param = 10001;  // https://github.com/daijifeng001/caffe-rfcn
  optional PythonParameter python_param = 130;
  optional RecurrentParameter recurrent_param = 146;
  optional ReductionParameter reduction_param = 136;
@@ -1633,3 +1636,10 @@ message ProposalParameter {
  optional uint32 post_nms_topn = 7 [default = 300];
  optional float nms_thresh = 8 [default = 0.7];
 }
+
+// origin: https://github.com/daijifeng001/caffe-rfcn
+message PSROIPoolingParameter {
+  required float spatial_scale = 1;
+  required int32 output_dim = 2; // output channel number
+  required int32 group_size = 3; // equal to pooled_size
+}
--- a/modules/dnn/src/init.cpp
+++ b/modules/dnn/src/init.cpp
@@ -89,6 +89,7 @@ void initializeLayerFactory()
    CV_DNN_REGISTER_LAYER_CLASS(Deconvolution,  DeconvolutionLayer);
    CV_DNN_REGISTER_LAYER_CLASS(Pooling,        PoolingLayer);
    CV_DNN_REGISTER_LAYER_CLASS(ROIPooling,     PoolingLayer);
+    CV_DNN_REGISTER_LAYER_CLASS(PSROIPooling,   PoolingLayer);
    CV_DNN_REGISTER_LAYER_CLASS(LRN,            LRNLayer);
    CV_DNN_REGISTER_LAYER_CLASS(InnerProduct,   InnerProductLayer);
    CV_DNN_REGISTER_LAYER_CLASS(Softmax,        SoftmaxLayer);

--- a/modules/dnn/src/layers/pooling_layer.cpp
+++ b/modules/dnn/src/layers/pooling_layer.cpp
@@ -57,9 +57,9 @@ namespace cv
 {
 namespace dnn
 {
-static inline int scaleAndRoundRoi(float f, float scale)
+static inline int roundRoiSize(float v)
 {
-    return (int)(f * scale + (f >= 0.f ? 0.5f : -0.5f));
+    return (int)(v + (v >= 0.f ? 0.5f : -0.5f));
 }

 class PoolingLayerImpl : public PoolingLayer
@@ -86,17 +86,24 @@ public:
            getPoolingKernelParams(params, kernel.height, kernel.width, globalPooling,
                                   pad.height, pad.width, stride.height, stride.width, padMode);
        }
-        else if (params.has("pooled_w") || params.has("pooled_h") || params.has("spatial_scale"))
+        else if (params.has("pooled_w") || params.has("pooled_h"))
        {
            type = ROI;
            computeMaxIdx = false;
+            pooledSize.width = params.get<uint32_t>("pooled_w", 1);
+            pooledSize.height = params.get<uint32_t>("pooled_h", 1);
+        }
+        else if (params.has("output_dim") && params.has("group_size"))
+        {
+            type = PSROI;
+            pooledSize.width = params.get<int>("group_size");
+            pooledSize.height = pooledSize.width;
+            psRoiOutChannels = params.get<int>("output_dim");
        }
        else
            CV_Error(Error::StsBadArg, "Cannot determine pooling type");
        setParamsFrom(params);
        ceilMode = params.get<bool>("ceil_mode", true);
-        pooledSize.width = params.get<uint32_t>("pooled_w", 1);
-        pooledSize.height = params.get<uint32_t>("pooled_h", 1);
        spatialScale = params.get<float>("spatial_scale", 1);
    }

@@ -195,7 +202,7 @@ public:
                CV_Assert(inputs.size() == 1, outputs.size() == 1);
                avePooling(*inputs[0], outputs[0]);
                break;
-            case ROI:
+            case ROI: case PSROI:
                CV_Assert(inputs.size() == 2, outputs.size() == 1);
                roiPooling(*inputs[0], *inputs[1], outputs[0]);
                break;
@@ -234,11 +241,11 @@ public:
                        Size stride, Size pad, int poolingType, float spatialScale,
                        bool computeMaxIdx, int nstripes)
        {
-            CV_Assert(src.isContinuous() && dst.isContinuous() &&
-                      src.type() == CV_32F && src.type() == dst.type() &&
-                      src.dims == 4 && dst.dims == 4 &&
-                      (poolingType == ROI && dst.size[0] == rois.size[0] ||
-                       src.size[0] == dst.size[0]) && src.size[1] == dst.size[1] &&
+            CV_Assert(src.isContinuous(), dst.isContinuous(),
+                      src.type() == CV_32F, src.type() == dst.type(),
+                      src.dims == 4, dst.dims == 4,
+                      ((poolingType == ROI || poolingType == PSROI) && dst.size[0] ==rois.size[0] || src.size[0] == dst.size[0]),
+                       poolingType == PSROI || src.size[1] == dst.size[1],
                      (mask.empty() || (mask.type() == src.type() && mask.size == dst.size)));

            PoolingInvoker p;
@@ -297,12 +304,12 @@ public:
                int n = (int)(ofs / channels);
                int ystart, yend;

-                const float *srcData;
+                const float *srcData = 0;
                if (poolingType == ROI)
                {
                    const float *roisData = rois->ptr<float>(n);
-                    int ystartROI = scaleAndRoundRoi(roisData[2], spatialScale);
-                    int yendROI = scaleAndRoundRoi(roisData[4], spatialScale);
+                    int ystartROI = roundRoiSize(roisData[2] * spatialScale);
+                    int yendROI = roundRoiSize(roisData[4] * spatialScale);
                    int roiHeight = std::max(yendROI - ystartROI + 1, 1);
                    float roiRatio = (float)roiHeight / height;

@@ -312,6 +319,17 @@ public:
                    CV_Assert(roisData[0] < src->size[0]);
                    srcData = src->ptr<float>(roisData[0], c);
                }
+                else if (poolingType == PSROI)
+                {
+                    const float *roisData = rois->ptr<float>(n);
+                    float ystartROI = roundRoiSize(roisData[2]) * spatialScale;
+                    float yendROI = roundRoiSize(roisData[4] + 1) * spatialScale;
+                    float roiHeight = std::max(yendROI - ystartROI, 0.1f);
+                    float roiRatio = roiHeight / height;
+
+                    ystart = (int)std::floor(ystartROI + y0 * roiRatio);
+                    yend = (int)std::ceil(ystartROI + (y0 + 1) * roiRatio);
+                }
                else
                {
                    ystart = y0 * stride_h - pad_h;
@@ -530,11 +548,11 @@ public:
                        }
                    }
                }
-                else  // ROI
+                else if (poolingType == ROI)
                {
                    const float *roisData = rois->ptr<float>(n);
-                    int xstartROI = scaleAndRoundRoi(roisData[1], spatialScale);
-                    int xendROI = scaleAndRoundRoi(roisData[3], spatialScale);
+                    int xstartROI = roundRoiSize(roisData[1] * spatialScale);
+                    int xendROI = roundRoiSize(roisData[3] * spatialScale);
                    int roiWidth = std::max(xendROI - xstartROI + 1, 1);
                    float roiRatio = (float)roiWidth / width;
                    for( ; x0 < x1; x0++ )
@@ -561,6 +579,38 @@ public:
                        dstData[x0] = max_val;
                    }
                }
+                else  // PSROI
+                {
+                    const float *roisData = rois->ptr<float>(n);
+                    CV_Assert(roisData[0] < src->size[0]);
+                    float xstartROI = roundRoiSize(roisData[1]) * spatialScale;
+                    float xendROI = roundRoiSize(roisData[3] + 1) * spatialScale;
+                    float roiWidth = std::max(xendROI - xstartROI, 0.1f);
+                    float roiRatio = roiWidth / width;
+                    for( ; x0 < x1; x0++ )
+                    {
+                        int xstart = (int)std::floor(xstartROI + x0 * roiRatio);
+                        int xend = (int)std::ceil(xstartROI + (x0 + 1) * roiRatio);
+                        xstart = max(xstart, 0);
+                        xend = min(xend, inp_width);
+                        if (xstart >= xend || ystart >= yend)
+                        {
+                            dstData[x0] = 0;
+                            continue;
+                        }
+
+                        srcData = src->ptr<float>(roisData[0], (c * height + y0) * width + x0);
+                        float sum_val = 0.f;
+                        for (int y = ystart; y < yend; ++y)
+                            for (int x = xstart; x < xend; ++x)
+                            {
+                                const int index = y * inp_width + x;
+                                float val = srcData[index];
+                                sum_val += val;
+                            }
+                        dstData[x0] = sum_val / ((yend - ystart) * (xend - xstart));
+                    }
+                }
            }
        }
    };
@@ -719,7 +769,7 @@ public:
            out.height = 1;
            out.width = 1;
        }
-        else if (type == ROI)
+        else if (type == ROI || type == PSROI)
        {
            out.height = pooledSize.height;
            out.width = pooledSize.width;
@@ -754,6 +804,13 @@ public:
            CV_Assert(inputs.size() == 2);
            dims[0] = inputs[1][0];  // Number of proposals;
        }
+        else if (type == PSROI)
+        {
+            CV_Assert(inputs.size() == 2);
+            CV_Assert(psRoiOutChannels * pooledSize.width * pooledSize.height == inputs[0][1]);
+            dims[0] = inputs[1][0];  // Number of proposals;
+            dims[1] = psRoiOutChannels;
+        }
        outputs.assign(type == MAX ? 2 : 1, shape(dims));
        return false;
    }
@@ -784,7 +841,8 @@ private:
        MAX,
        AVE,
        STOCHASTIC,
-        ROI
+        ROI,   // RoI pooling, https://arxiv.org/pdf/1504.08083.pdf
+        PSROI  // Position-sensitive RoI pooling, https://arxiv.org/pdf/1605.06409.pdf
    };
 };


--- a/samples/dnn/faster_rcnn.cpp
+++ b/samples/dnn/faster_rcnn.cpp
-// Faster-RCNN models use custom layer called 'Proposal' written in Python. To
-// map it into OpenCV's layer replace a layer node with [type: 'Python'] to the
-// following definition:
-// layer {
-//   name: 'proposal'
-//   type: 'Proposal'
-//   bottom: 'rpn_cls_prob_reshape'
-//   bottom: 'rpn_bbox_pred'
-//   bottom: 'im_info'
-//   top: 'rois'
-//   proposal_param {
-//     ratio: 0.5
-//     ratio: 1.0
-//     ratio: 2.0
-//     scale: 8
-//     scale: 16
-//     scale: 32
-//   }
-// }
-#include <iostream>
-
 #include <opencv2/dnn.hpp>
 #include <opencv2/dnn/all_layers.hpp>
 #include <opencv2/imgproc.hpp>
@@ -50,9 +29,11 @@ int main(int argc, char** argv)
 {
    // Parse command line arguments.
    CommandLineParser parser(argc, argv, keys);
-    parser.about( "This sample is used to run Faster-RCNN object detection with OpenCV.\n"
-                  "You can get required models from https://github.com/rbgirshick/py-faster-rcnn" );
-
+    parser.about("This sample is used to run Faster-RCNN and R-FCN object detection "
+                 "models with OpenCV. You can get required models from "
+                 "https://github.com/rbgirshick/py-faster-rcnn (Faster-RCNN) and from "
+                 "https://github.com/YuwenXiong/py-R-FCN (R-FCN). Corresponding .prototxt "
+                 "files may be found at https://github.com/opencv/opencv_extra/tree/master/testdata/dnn.");
    if (argc == 1 || parser.has("help"))
    {
        parser.printMessage();
@@ -68,19 +49,6 @@ int main(int argc, char** argv)
    // Load a model.
    Net net = readNetFromCaffe(protoPath, modelPath);

-    // Create a preprocessing layer that does final bounding boxes applying predicted
-    // deltas to objects locations proposals and doing non-maximum suppression over it.
-    LayerParams lp;
-    lp.set("code_type", "CENTER_SIZE");               // An every bounding box is [xmin, ymin, xmax, ymax]
-    lp.set("num_classes", 21);
-    lp.set("share_location", (int)false);             // Separate predictions for different classes.
-    lp.set("background_label_id", 0);
-    lp.set("variance_encoded_in_target", (int)true);
-    lp.set("keep_top_k", 100);
-    lp.set("nms_threshold", 0.3);
-    lp.set("normalized_bbox", (int)false);
-    Ptr<Layer> detectionOutputLayer = DetectionOutputLayer::create(lp);
-
    Mat img = imread(imagePath);
    resize(img, img, Size(kInpWidth, kInpHeight));
    Mat blob = blobFromImage(img, 1.0, Size(), Scalar(102.9801, 115.9465, 122.7717), false, false);
@@ -89,31 +57,8 @@ int main(int argc, char** argv)
    net.setInput(blob, "data");
    net.setInput(imInfo, "im_info");

-    std::vector<Mat> outs;
-    std::vector<String> outNames(3);
-    outNames[0] = "proposal";
-    outNames[1] = "bbox_pred";
-    outNames[2] = "cls_prob";
-    net.forward(outs, outNames);
-
-    Mat proposals = outs[0].colRange(1, 5).clone();  // Only last 4 columns.
-    Mat& deltas = outs[1];
-    Mat& scores = outs[2];
-
-    // Reshape proposals from Nx4 to 1x1xN*4
-    std::vector<int> shape(3, 1);
-    shape[2] = (int)proposals.total();
-    proposals = proposals.reshape(1, shape);
-
-    // Run postprocessing layer.
-    std::vector<Mat> layerInputs(3), layerOutputs(1), layerInternals;
-    layerInputs[0] = deltas.reshape(1, 1);
-    layerInputs[1] = scores.reshape(1, 1);
-    layerInputs[2] = proposals;
-    detectionOutputLayer->forward(layerInputs, layerOutputs, layerInternals);
-
    // Draw detections.
-    Mat detections = layerOutputs[0];
+    Mat detections = net.forward();
    const float* data = (float*)detections.data;
    for (size_t i = 0; i < detections.total(); i += 7)
    {