Set output layers names and types for models in DLDT's intermediate representation

346871e2 · Dmitry Kurtaev · e4b51fa8 · 346871e2 · 346871e2 · 346871e2
Commit 346871e2 authored Jun 28, 2018 by Dmitry Kurtaev
4 changed files
--- a/modules/dnn/src/dnn.cpp
+++ b/modules/dnn/src/dnn.cpp
@@ -1993,11 +1993,17 @@ Net Net::readFromModelOptimizer(const String& xml, const String& bin)
    backendNode->net = Ptr<InfEngineBackendNet>(new InfEngineBackendNet(ieNet));
    for (auto& it : ieNet.getOutputsInfo())
    {
+        Ptr<Layer> cvLayer(new InfEngineBackendLayer(it.second));
+        InferenceEngine::CNNLayerPtr ieLayer = ieNet.getLayerByName(it.first.c_str());
+        CV_Assert(ieLayer);
        LayerParams lp;
        int lid = cvNet.addLayer(it.first, "", lp);
        LayerData& ld = cvNet.impl->layers[lid];
-        ld.layerInstance = Ptr<Layer>(new InfEngineBackendLayer(it.second));
+        cvLayer->name = it.first;
+        cvLayer->type = ieLayer->type;
+        ld.layerInstance = cvLayer;
        ld.backendNodes[DNN_BACKEND_INFERENCE_ENGINE] = backendNode;
        for (int i = 0; i < inputsNames.size(); ++i)

--- a/modules/dnn/test/test_layers.cpp
+++ b/modules/dnn/test/test_layers.cpp
@@ -925,6 +925,10 @@ TEST(Layer_Test_Convolution_DLDT, Accuracy)
    Mat out = net.forward();
    normAssert(outDefault, out);
+    std::vector<int> outLayers = net.getUnconnectedOutLayers();
+    ASSERT_EQ(net.getLayer(outLayers[0])->name, "output_merge");
+    ASSERT_EQ(net.getLayer(outLayers[0])->type, "Concat");
 }
 // 1. Create a .prototxt file with the following network:

--- a/samples/dnn/object_detection.cpp
+++ b/samples/dnn/object_detection.cpp
@@ -22,6 +22,7 @@ const char* keys =
    "{ height      | -1 | Preprocess input image by resizing to a specific height. }"
    "{ rgb         |    | Indicate that model works with RGB input images instead BGR ones. }"
    "{ thr         | .5 | Confidence threshold. }"
+    "{ nms         | .4 | Non-maximum suppression threshold. }"
    "{ backend     |  0 | Choose one of computation backends: "
                         "0: automatically (by default), "
                         "1: Halide language (http://halide-lang.org/), "
@@ -37,7 +38,7 @@ const char* keys =
 using namespace cv;
 using namespace dnn;
-float confThreshold;
+float confThreshold, nmsThreshold;
 std::vector<std::string> classes;
 void postprocess(Mat& frame, const std::vector<Mat>& out, Net& net);
@@ -59,6 +60,7 @@ int main(int argc, char** argv)
    }
    confThreshold = parser.get<float>("thr");
+    nmsThreshold = parser.get<float>("nms");
    float scale = parser.get<float>("scale");
    Scalar mean = parser.get<Scalar>("mean");
    bool swapRB = parser.get<bool>("rgb");
@@ -144,6 +146,9 @@ void postprocess(Mat& frame, const std::vector<Mat>& outs, Net& net)
    static std::vector<int> outLayers = net.getUnconnectedOutLayers();
    static std::string outLayerType = net.getLayer(outLayers[0])->type;
+    std::vector<int> classIds;
+    std::vector<float> confidences;
+    std::vector<Rect> boxes;
    if (net.getLayer(0)->outputNameToIndex("im_info") != -1)  // Faster-RCNN or R-FCN
    {
        // Network produces output blob with a shape 1x1xNx7 where N is a number of
@@ -160,8 +165,11 @@ void postprocess(Mat& frame, const std::vector<Mat>& outs, Net& net)
                int top = (int)data[i + 4];
                int right = (int)data[i + 5];
                int bottom = (int)data[i + 6];
-                int classId = (int)(data[i + 1]) - 1;  // Skip 0th background class id.
+                int width = right - left + 1;
-                drawPred(classId, confidence, left, top, right, bottom, frame);
+                int height = bottom - top + 1;
+                classIds.push_back((int)(data[i + 1]) - 1);  // Skip 0th background class id.
+                boxes.push_back(Rect(left, top, width, height));
+                confidences.push_back(confidence);
            }
        }
    }
@@ -181,16 +189,16 @@ void postprocess(Mat& frame, const std::vector<Mat>& outs, Net& net)
                int top = (int)(data[i + 4] * frame.rows);
                int right = (int)(data[i + 5] * frame.cols);
                int bottom = (int)(data[i + 6] * frame.rows);
-                int classId = (int)(data[i + 1]) - 1;  // Skip 0th background class id.
+                int width = right - left + 1;
-                drawPred(classId, confidence, left, top, right, bottom, frame);
+                int height = bottom - top + 1;
+                classIds.push_back((int)(data[i + 1]) - 1);  // Skip 0th background class id.
+                boxes.push_back(Rect(left, top, width, height));
+                confidences.push_back(confidence);
            }
        }
    }
    else if (outLayerType == "Region")
    {
-        std::vector<int> classIds;
-        std::vector<float> confidences;
-        std::vector<Rect> boxes;
        for (size_t i = 0; i < outs.size(); ++i)
        {
            // Network produces output blob with a shape NxC where N is a number of
@@ -218,18 +226,19 @@ void postprocess(Mat& frame, const std::vector<Mat>& outs, Net& net)
                }
            }
        }
-        std::vector<int> indices;
-        NMSBoxes(boxes, confidences, confThreshold, 0.4f, indices);
-        for (size_t i = 0; i < indices.size(); ++i)
-        {
-            int idx = indices[i];
-            Rect box = boxes[idx];
-            drawPred(classIds[idx], confidences[idx], box.x, box.y,
-                     box.x + box.width, box.y + box.height, frame);
-        }
    }
    else
        CV_Error(Error::StsNotImplemented, "Unknown output layer type: " + outLayerType);
+    std::vector<int> indices;
+    NMSBoxes(boxes, confidences, confThreshold, nmsThreshold, indices);
+    for (size_t i = 0; i < indices.size(); ++i)
+    {
+        int idx = indices[i];
+        Rect box = boxes[idx];
+        drawPred(classIds[idx], confidences[idx], box.x, box.y,
+                 box.x + box.width, box.y + box.height, frame);
+    }
 }
 void drawPred(int classId, float conf, int left, int top, int right, int bottom, Mat& frame)

--- a/samples/dnn/object_detection.py
+++ b/samples/dnn/object_detection.py
@@ -31,6 +31,7 @@ parser.add_argument('--height', type=int,
 parser.add_argument('--rgb', action='store_true',
                    help='Indicate that model works with RGB input images instead BGR ones.')
 parser.add_argument('--thr', type=float, default=0.5, help='Confidence threshold')
+parser.add_argument('--nms', type=float, default=0.4, help='Non-maximum suppression threshold')
 parser.add_argument('--backend', choices=backends, default=cv.dnn.DNN_BACKEND_DEFAULT, type=int,
                    help="Choose one of computation backends: "
                         "%d: automatically (by default), "
@@ -57,6 +58,7 @@ net.setPreferableBackend(args.backend)
 net.setPreferableTarget(args.target)
 confThreshold = args.thr
+nmsThreshold = args.nms
 def getOutputsNames(net):
    layersNames = net.getLayerNames()
@@ -86,36 +88,43 @@ def postprocess(frame, outs):
    lastLayerId = net.getLayerId(layerNames[-1])
    lastLayer = net.getLayer(lastLayerId)
+    classIds = []
+    confidences = []
+    boxes = []
    if net.getLayer(0).outputNameToIndex('im_info') != -1:  # Faster-RCNN or R-FCN
        # Network produces output blob with a shape 1x1xNx7 where N is a number of
        # detections and an every detection is a vector of values
        # [batchId, classId, confidence, left, top, right, bottom]
-        assert(len(outs) == 1)
+        for out in outs:
-        out = outs[0]
+            for detection in out[0, 0]:
-        for detection in out[0, 0]:
+                confidence = detection[2]
-            confidence = detection[2]
+                if confidence > confThreshold:
-            if confidence > confThreshold:
+                    left = int(detection[3])
-                left = int(detection[3])
+                    top = int(detection[4])
-                top = int(detection[4])
+                    right = int(detection[5])
-                right = int(detection[5])
+                    bottom = int(detection[6])
-                bottom = int(detection[6])
+                    width = right - left + 1
-                classId = int(detection[1]) - 1  # Skip background label
+                    height = bottom - top + 1
-                drawPred(classId, confidence, left, top, right, bottom)
+                    classIds.append(int(detection[1]) - 1)  # Skip background label
+                    confidences.append(float(confidence))
+                    boxes.append([left, top, width, height])
    elif lastLayer.type == 'DetectionOutput':
        # Network produces output blob with a shape 1x1xNx7 where N is a number of
        # detections and an every detection is a vector of values
        # [batchId, classId, confidence, left, top, right, bottom]
-        assert(len(outs) == 1)
+        for out in outs:
-        out = outs[0]
+            for detection in out[0, 0]:
-        for detection in out[0, 0]:
+                confidence = detection[2]
-            confidence = detection[2]
+                if confidence > confThreshold:
-            if confidence > confThreshold:
+                    left = int(detection[3] * frameWidth)
-                left = int(detection[3] * frameWidth)
+                    top = int(detection[4] * frameHeight)
-                top = int(detection[4] * frameHeight)
+                    right = int(detection[5] * frameWidth)
-                right = int(detection[5] * frameWidth)
+                    bottom = int(detection[6] * frameHeight)
-                bottom = int(detection[6] * frameHeight)
+                    width = right - left + 1
-                classId = int(detection[1]) - 1  # Skip background label
+                    height = bottom - top + 1
-                drawPred(classId, confidence, left, top, right, bottom)
+                    classIds.append(int(detection[1]) - 1)  # Skip background label
+                    confidences.append(float(confidence))
+                    boxes.append([left, top, width, height])
    elif lastLayer.type == 'Region':
        # Network produces output blob with a shape NxC where N is a number of
        # detected objects and C is a number of classes + 4 where the first 4
@@ -138,15 +147,19 @@ def postprocess(frame, outs):
                    classIds.append(classId)
                    confidences.append(float(confidence))
                    boxes.append([left, top, width, height])
-        indices = cv.dnn.NMSBoxes(boxes, confidences, confThreshold, 0.4)
+    else:
-        for i in indices:
+        print('Unknown output layer type: ' + lastLayer.type)
-            i = i[0]
+        exit()
-            box = boxes[i]
-            left = box[0]
+    indices = cv.dnn.NMSBoxes(boxes, confidences, confThreshold, nmsThreshold)
-            top = box[1]
+    for i in indices:
-            width = box[2]
+        i = i[0]
-            height = box[3]
+        box = boxes[i]
-            drawPred(classIds[i], confidences[i], left, top, left + width, top + height)
+        left = box[0]
+        top = box[1]
+        width = box[2]
+        height = box[3]
+        drawPred(classIds[i], confidences[i], left, top, left + width, top + height)
 # Process inputs
 winName = 'Deep learning object detection in OpenCV'