Commit b8a24b36 authored by Vadim Pisarevsky's avatar Vadim Pisarevsky

Merge pull request #10356 from dkurt:dnn_rfcn

parents d2f13a82 0ed2cbc9
......@@ -242,6 +242,8 @@ CV__DNN_EXPERIMENTAL_NS_BEGIN
// ROIPooling parameters.
Size pooledSize;
float spatialScale;
// PSROIPooling parameters.
int psRoiOutChannels;
static Ptr<PoolingLayer> create(const LayerParams& params);
};
......
This source diff could not be displayed because it is too large. You can view the blob instead.
This diff is collapsed.
......@@ -179,6 +179,8 @@ message DetectionOutputParameter {
// Only consider detections whose confidences are larger than a threshold.
// If not provided, consider all boxes.
optional float confidence_threshold = 9;
// If prior boxes are normalized to [0, 1] or not.
optional bool normalized_bbox = 10 [default = true];
}
message Datum {
......@@ -548,6 +550,7 @@ message LayerParameter {
optional PReLUParameter prelu_param = 131;
optional PriorBoxParameter prior_box_param = 150;
optional ProposalParameter proposal_param = 201;
optional PSROIPoolingParameter psroi_pooling_param = 10001; // https://github.com/daijifeng001/caffe-rfcn
optional PythonParameter python_param = 130;
optional RecurrentParameter recurrent_param = 146;
optional ReductionParameter reduction_param = 136;
......@@ -1633,3 +1636,10 @@ message ProposalParameter {
optional uint32 post_nms_topn = 7 [default = 300];
optional float nms_thresh = 8 [default = 0.7];
}
// origin: https://github.com/daijifeng001/caffe-rfcn
message PSROIPoolingParameter {
required float spatial_scale = 1;
required int32 output_dim = 2; // output channel number
required int32 group_size = 3; // equal to pooled_size
}
......@@ -89,6 +89,7 @@ void initializeLayerFactory()
CV_DNN_REGISTER_LAYER_CLASS(Deconvolution, DeconvolutionLayer);
CV_DNN_REGISTER_LAYER_CLASS(Pooling, PoolingLayer);
CV_DNN_REGISTER_LAYER_CLASS(ROIPooling, PoolingLayer);
CV_DNN_REGISTER_LAYER_CLASS(PSROIPooling, PoolingLayer);
CV_DNN_REGISTER_LAYER_CLASS(LRN, LRNLayer);
CV_DNN_REGISTER_LAYER_CLASS(InnerProduct, InnerProductLayer);
CV_DNN_REGISTER_LAYER_CLASS(Softmax, SoftmaxLayer);
......
......@@ -57,9 +57,9 @@ namespace cv
{
namespace dnn
{
static inline int scaleAndRoundRoi(float f, float scale)
static inline int roundRoiSize(float v)
{
return (int)(f * scale + (f >= 0.f ? 0.5f : -0.5f));
return (int)(v + (v >= 0.f ? 0.5f : -0.5f));
}
class PoolingLayerImpl : public PoolingLayer
......@@ -86,17 +86,24 @@ public:
getPoolingKernelParams(params, kernel.height, kernel.width, globalPooling,
pad.height, pad.width, stride.height, stride.width, padMode);
}
else if (params.has("pooled_w") || params.has("pooled_h") || params.has("spatial_scale"))
else if (params.has("pooled_w") || params.has("pooled_h"))
{
type = ROI;
computeMaxIdx = false;
pooledSize.width = params.get<uint32_t>("pooled_w", 1);
pooledSize.height = params.get<uint32_t>("pooled_h", 1);
}
else if (params.has("output_dim") && params.has("group_size"))
{
type = PSROI;
pooledSize.width = params.get<int>("group_size");
pooledSize.height = pooledSize.width;
psRoiOutChannels = params.get<int>("output_dim");
}
else
CV_Error(Error::StsBadArg, "Cannot determine pooling type");
setParamsFrom(params);
ceilMode = params.get<bool>("ceil_mode", true);
pooledSize.width = params.get<uint32_t>("pooled_w", 1);
pooledSize.height = params.get<uint32_t>("pooled_h", 1);
spatialScale = params.get<float>("spatial_scale", 1);
}
......@@ -195,7 +202,7 @@ public:
CV_Assert(inputs.size() == 1, outputs.size() == 1);
avePooling(*inputs[0], outputs[0]);
break;
case ROI:
case ROI: case PSROI:
CV_Assert(inputs.size() == 2, outputs.size() == 1);
roiPooling(*inputs[0], *inputs[1], outputs[0]);
break;
......@@ -234,11 +241,11 @@ public:
Size stride, Size pad, int poolingType, float spatialScale,
bool computeMaxIdx, int nstripes)
{
CV_Assert(src.isContinuous() && dst.isContinuous() &&
src.type() == CV_32F && src.type() == dst.type() &&
src.dims == 4 && dst.dims == 4 &&
(poolingType == ROI && dst.size[0] == rois.size[0] ||
src.size[0] == dst.size[0]) && src.size[1] == dst.size[1] &&
CV_Assert(src.isContinuous(), dst.isContinuous(),
src.type() == CV_32F, src.type() == dst.type(),
src.dims == 4, dst.dims == 4,
((poolingType == ROI || poolingType == PSROI) && dst.size[0] ==rois.size[0] || src.size[0] == dst.size[0]),
poolingType == PSROI || src.size[1] == dst.size[1],
(mask.empty() || (mask.type() == src.type() && mask.size == dst.size)));
PoolingInvoker p;
......@@ -297,12 +304,12 @@ public:
int n = (int)(ofs / channels);
int ystart, yend;
const float *srcData;
const float *srcData = 0;
if (poolingType == ROI)
{
const float *roisData = rois->ptr<float>(n);
int ystartROI = scaleAndRoundRoi(roisData[2], spatialScale);
int yendROI = scaleAndRoundRoi(roisData[4], spatialScale);
int ystartROI = roundRoiSize(roisData[2] * spatialScale);
int yendROI = roundRoiSize(roisData[4] * spatialScale);
int roiHeight = std::max(yendROI - ystartROI + 1, 1);
float roiRatio = (float)roiHeight / height;
......@@ -312,6 +319,17 @@ public:
CV_Assert(roisData[0] < src->size[0]);
srcData = src->ptr<float>(roisData[0], c);
}
else if (poolingType == PSROI)
{
const float *roisData = rois->ptr<float>(n);
float ystartROI = roundRoiSize(roisData[2]) * spatialScale;
float yendROI = roundRoiSize(roisData[4] + 1) * spatialScale;
float roiHeight = std::max(yendROI - ystartROI, 0.1f);
float roiRatio = roiHeight / height;
ystart = (int)std::floor(ystartROI + y0 * roiRatio);
yend = (int)std::ceil(ystartROI + (y0 + 1) * roiRatio);
}
else
{
ystart = y0 * stride_h - pad_h;
......@@ -530,11 +548,11 @@ public:
}
}
}
else // ROI
else if (poolingType == ROI)
{
const float *roisData = rois->ptr<float>(n);
int xstartROI = scaleAndRoundRoi(roisData[1], spatialScale);
int xendROI = scaleAndRoundRoi(roisData[3], spatialScale);
int xstartROI = roundRoiSize(roisData[1] * spatialScale);
int xendROI = roundRoiSize(roisData[3] * spatialScale);
int roiWidth = std::max(xendROI - xstartROI + 1, 1);
float roiRatio = (float)roiWidth / width;
for( ; x0 < x1; x0++ )
......@@ -561,6 +579,38 @@ public:
dstData[x0] = max_val;
}
}
else // PSROI
{
const float *roisData = rois->ptr<float>(n);
CV_Assert(roisData[0] < src->size[0]);
float xstartROI = roundRoiSize(roisData[1]) * spatialScale;
float xendROI = roundRoiSize(roisData[3] + 1) * spatialScale;
float roiWidth = std::max(xendROI - xstartROI, 0.1f);
float roiRatio = roiWidth / width;
for( ; x0 < x1; x0++ )
{
int xstart = (int)std::floor(xstartROI + x0 * roiRatio);
int xend = (int)std::ceil(xstartROI + (x0 + 1) * roiRatio);
xstart = max(xstart, 0);
xend = min(xend, inp_width);
if (xstart >= xend || ystart >= yend)
{
dstData[x0] = 0;
continue;
}
srcData = src->ptr<float>(roisData[0], (c * height + y0) * width + x0);
float sum_val = 0.f;
for (int y = ystart; y < yend; ++y)
for (int x = xstart; x < xend; ++x)
{
const int index = y * inp_width + x;
float val = srcData[index];
sum_val += val;
}
dstData[x0] = sum_val / ((yend - ystart) * (xend - xstart));
}
}
}
}
};
......@@ -719,7 +769,7 @@ public:
out.height = 1;
out.width = 1;
}
else if (type == ROI)
else if (type == ROI || type == PSROI)
{
out.height = pooledSize.height;
out.width = pooledSize.width;
......@@ -754,6 +804,13 @@ public:
CV_Assert(inputs.size() == 2);
dims[0] = inputs[1][0]; // Number of proposals;
}
else if (type == PSROI)
{
CV_Assert(inputs.size() == 2);
CV_Assert(psRoiOutChannels * pooledSize.width * pooledSize.height == inputs[0][1]);
dims[0] = inputs[1][0]; // Number of proposals;
dims[1] = psRoiOutChannels;
}
outputs.assign(type == MAX ? 2 : 1, shape(dims));
return false;
}
......@@ -784,7 +841,8 @@ private:
MAX,
AVE,
STOCHASTIC,
ROI
ROI, // RoI pooling, https://arxiv.org/pdf/1504.08083.pdf
PSROI // Position-sensitive RoI pooling, https://arxiv.org/pdf/1605.06409.pdf
};
};
......
// Faster-RCNN models use custom layer called 'Proposal' written in Python. To
// map it into OpenCV's layer replace a layer node with [type: 'Python'] to the
// following definition:
// layer {
// name: 'proposal'
// type: 'Proposal'
// bottom: 'rpn_cls_prob_reshape'
// bottom: 'rpn_bbox_pred'
// bottom: 'im_info'
// top: 'rois'
// proposal_param {
// ratio: 0.5
// ratio: 1.0
// ratio: 2.0
// scale: 8
// scale: 16
// scale: 32
// }
// }
#include <iostream>
#include <opencv2/dnn.hpp>
#include <opencv2/dnn/all_layers.hpp>
#include <opencv2/imgproc.hpp>
......@@ -50,9 +29,11 @@ int main(int argc, char** argv)
{
// Parse command line arguments.
CommandLineParser parser(argc, argv, keys);
parser.about( "This sample is used to run Faster-RCNN object detection with OpenCV.\n"
"You can get required models from https://github.com/rbgirshick/py-faster-rcnn" );
parser.about("This sample is used to run Faster-RCNN and R-FCN object detection "
"models with OpenCV. You can get required models from "
"https://github.com/rbgirshick/py-faster-rcnn (Faster-RCNN) and from "
"https://github.com/YuwenXiong/py-R-FCN (R-FCN). Corresponding .prototxt "
"files may be found at https://github.com/opencv/opencv_extra/tree/master/testdata/dnn.");
if (argc == 1 || parser.has("help"))
{
parser.printMessage();
......@@ -68,19 +49,6 @@ int main(int argc, char** argv)
// Load a model.
Net net = readNetFromCaffe(protoPath, modelPath);
// Create a preprocessing layer that does final bounding boxes applying predicted
// deltas to objects locations proposals and doing non-maximum suppression over it.
LayerParams lp;
lp.set("code_type", "CENTER_SIZE"); // An every bounding box is [xmin, ymin, xmax, ymax]
lp.set("num_classes", 21);
lp.set("share_location", (int)false); // Separate predictions for different classes.
lp.set("background_label_id", 0);
lp.set("variance_encoded_in_target", (int)true);
lp.set("keep_top_k", 100);
lp.set("nms_threshold", 0.3);
lp.set("normalized_bbox", (int)false);
Ptr<Layer> detectionOutputLayer = DetectionOutputLayer::create(lp);
Mat img = imread(imagePath);
resize(img, img, Size(kInpWidth, kInpHeight));
Mat blob = blobFromImage(img, 1.0, Size(), Scalar(102.9801, 115.9465, 122.7717), false, false);
......@@ -89,31 +57,8 @@ int main(int argc, char** argv)
net.setInput(blob, "data");
net.setInput(imInfo, "im_info");
std::vector<Mat> outs;
std::vector<String> outNames(3);
outNames[0] = "proposal";
outNames[1] = "bbox_pred";
outNames[2] = "cls_prob";
net.forward(outs, outNames);
Mat proposals = outs[0].colRange(1, 5).clone(); // Only last 4 columns.
Mat& deltas = outs[1];
Mat& scores = outs[2];
// Reshape proposals from Nx4 to 1x1xN*4
std::vector<int> shape(3, 1);
shape[2] = (int)proposals.total();
proposals = proposals.reshape(1, shape);
// Run postprocessing layer.
std::vector<Mat> layerInputs(3), layerOutputs(1), layerInternals;
layerInputs[0] = deltas.reshape(1, 1);
layerInputs[1] = scores.reshape(1, 1);
layerInputs[2] = proposals;
detectionOutputLayer->forward(layerInputs, layerOutputs, layerInternals);
// Draw detections.
Mat detections = layerOutputs[0];
Mat detections = net.forward();
const float* data = (float*)detections.data;
for (size_t i = 0; i < detections.total(); i += 7)
{
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment