Support YOLOv3 model from Darknet

97fec07d · Dmitry Kurtaev · 2129db6e · 97fec07d · 97fec07d · 97fec07d
Commit 97fec07d authored Apr 13, 2018 by Dmitry Kurtaev
8 changed files
--- a/modules/dnn/perf/perf_net.cpp
+++ b/modules/dnn/perf/perf_net.cpp
@@ -217,6 +217,16 @@ PERF_TEST_P_(DNNTestNetwork, Inception_v2_SSD_TensorFlow)
            Mat(cv::Size(300, 300), CV_32FC3));
 }

+// Performance test that feeds a sample image through the Darknet YOLOv3 model.
+PERF_TEST_P_(DNNTestNetwork, YOLOv3)
+{
+    // The test is skipped for every backend other than the default one.
+    const bool isDefaultBackend = (backend == DNN_BACKEND_DEFAULT);
+    if (!isDefaultBackend)
+    {
+        throw SkipTestException("");
+    }
+
+    // Read the image, convert it to 32-bit float and divide the values by 255
+    // before handing it to the network.
+    const Mat image = imread(findDataFile("dnn/dog416.png", false));
+    Mat floatImage;
+    image.convertTo(floatImage, CV_32FC3);
+    processNet("dnn/yolov3.cfg", "dnn/yolov3.weights", "", floatImage / 255);
+}
+
 const tuple<DNNBackend, DNNTarget> testCases[] = {
 #ifdef HAVE_HALIDE
    tuple<DNNBackend, DNNTarget>(DNN_BACKEND_HALIDE, DNN_TARGET_CPU),

--- a/modules/dnn/src/darknet/darknet_io.cpp
+++ b/modules/dnn/src/darknet/darknet_io.cpp
@@ -89,6 +89,8 @@ namespace cv {
                return init_val;
            }

+            static const std::string kFirstLayerName = "data";
+
            class setLayersParams {

                NetParameter *net;
@@ -97,8 +99,8 @@ namespace cv {
                std::vector<std::string> fused_layer_names;

            public:
-                setLayersParams(NetParameter *_net, std::string _first_layer = "data") :
-                    net(_net), layer_id(0), last_layer(_first_layer)
+                setLayersParams(NetParameter *_net) :
+                    net(_net), layer_id(0), last_layer(kFirstLayerName)
                {}

                void setLayerBlobs(int i, std::vector<cv::Mat> blobs)
@@ -275,7 +277,7 @@ namespace cv {
                    fused_layer_names.push_back(last_layer);
                }

-                void setPermute()
+                void setPermute(bool isDarknetLayer = true)
                {
                    cv::dnn::LayerParams permute_params;
                    permute_params.name = "Permute-name";
@@ -294,9 +296,12 @@ namespace cv {
                    last_layer = layer_name;
                    net->layers.push_back(lp);

+                    if (isDarknetLayer)
+                    {
                        layer_id++;
                        fused_layer_names.push_back(last_layer);
                    }
+                }

                void setRegion(float thresh, int coords, int classes, int anchors, int classfix, int softmax, int softmax_tree, float *biasData)
                {
@@ -327,6 +332,85 @@ namespace cv {
                    layer_id++;
                    fused_layer_names.push_back(last_layer);
                }
+
+                // Appends a "Region" layer (with the "logistic" flag set) for a Darknet [yolo] section.
+                //
+                // classes - number of classes, stored as the "classes" parameter.
+                // mask    - indices into the anchor list selecting the anchors used by this layer.
+                // anchors - flat anchor list, two consecutive values per anchor.
+                //
+                // The layer gets two inputs: the most recently added layer and the
+                // network input layer (kFirstLayerName).
+                void setYolo(int classes, const std::vector<int>& mask, const std::vector<float>& anchors)
+                {
+                    cv::dnn::LayerParams region_param;
+                    region_param.name = "Region-name";
+                    region_param.type = "Region";
+
+                    // Both vectors come from the parsed configuration. Reject an empty mask
+                    // (taking &usedAnchors[0] would be invalid) and mask entries that do not
+                    // address a complete anchor pair (reading them would run past the end).
+                    CV_Assert(!mask.empty());
+                    const int numAnchors = static_cast<int>(mask.size());
+                    const int availableAnchors = static_cast<int>(anchors.size() / 2);
+                    for (int i = 0; i < numAnchors; ++i)
+                        CV_Assert(mask[i] >= 0 && mask[i] < availableAnchors);
+
+                    region_param.set<int>("classes", classes);
+                    region_param.set<int>("anchors", numAnchors);
+                    region_param.set<bool>("logistic", true);
+
+                    // Gather only the anchor pairs selected by the mask, in mask order.
+                    std::vector<float> usedAnchors(numAnchors * 2);
+                    for (int i = 0; i < numAnchors; ++i)
+                    {
+                        usedAnchors[i * 2] = anchors[mask[i] * 2];
+                        usedAnchors[i * 2 + 1] = anchors[mask[i] * 2 + 1];
+                    }
+
+                    // clone() gives the blob its own storage; usedAnchors is destroyed on return.
+                    cv::Mat biasData_mat = cv::Mat(1, numAnchors * 2, CV_32F, &usedAnchors[0]).clone();
+                    region_param.blobs.push_back(biasData_mat);
+
+                    darknet::LayerParameter lp;
+                    std::string layer_name = cv::format("yolo_%d", layer_id);
+                    lp.layer_name = layer_name;
+                    lp.layer_type = region_param.type;
+                    lp.layerParams = region_param;
+                    lp.bottom_indexes.push_back(last_layer);
+                    lp.bottom_indexes.push_back(kFirstLayerName);
+                    last_layer = layer_name;
+                    net->layers.push_back(lp);
+
+                    layer_id++;
+                    fused_layer_names.push_back(last_layer);
+                }
+
+                // Appends an "Eltwise" layer with op "sum" whose inputs are the layer stored
+                // at position `from` in fused_layer_names and the most recently added layer.
+                void setShortcut(int from)
+                {
+                    const std::string name = cv::format("shortcut_%d", layer_id);
+
+                    cv::dnn::LayerParams eltwise;
+                    eltwise.name = "Shortcut-name";
+                    eltwise.type = "Eltwise";
+                    eltwise.set<std::string>("op", "sum");
+
+                    darknet::LayerParameter layer;
+                    layer.layer_name = name;
+                    layer.layer_type = eltwise.type;
+                    layer.layerParams = eltwise;
+                    // Input order: the referenced earlier layer first, then the current last layer.
+                    layer.bottom_indexes.push_back(fused_layer_names.at(from));
+                    layer.bottom_indexes.push_back(last_layer);
+
+                    // The new layer becomes the tail of the network being built.
+                    last_layer = name;
+                    net->layers.push_back(layer);
+
+                    ++layer_id;
+                    fused_layer_names.push_back(last_layer);
+                }
+
+                // Appends a "ResizeNearestNeighbor" layer fed by the most recently added
+                // layer; scaleFactor is passed through as the "zoom_factor" parameter.
+                void setUpsample(int scaleFactor)
+                {
+                    const std::string name = cv::format("upsample_%d", layer_id);
+
+                    cv::dnn::LayerParams resize;
+                    resize.name = "Upsample-name";
+                    resize.type = "ResizeNearestNeighbor";
+                    resize.set<int>("zoom_factor", scaleFactor);
+
+                    darknet::LayerParameter layer;
+                    layer.layer_name = name;
+                    layer.layer_type = resize.type;
+                    layer.layerParams = resize;
+                    layer.bottom_indexes.push_back(last_layer);
+
+                    // The new layer becomes the tail of the network being built.
+                    last_layer = name;
+                    net->layers.push_back(layer);
+
+                    ++layer_id;
+                    fused_layer_names.push_back(last_layer);
+                }
            };

            std::string escapeString(const std::string &src)
@@ -464,7 +548,7 @@ namespace cv {

                        current_channels = 0;
                        for (size_t k = 0; k < layers_vec.size(); ++k) {
-                            layers_vec[k] += layers_counter;
+                            // Negative values are offsets relative to the current layer; zero and
+                            // positive values are absolute layer indices and are used unchanged.
+                            if (layers_vec[k] < 0)
+                                layers_vec[k] += layers_counter;
                            current_channels += net->out_channels_vec[layers_vec[k]];
                        }

@@ -496,9 +580,43 @@ namespace cv {

                        CV_Assert(classes > 0 && num_of_anchors > 0 && (num_of_anchors * 2) == anchors_vec.size());

-                        setParams.setPermute();
+                        setParams.setPermute(false);
                        setParams.setRegion(thresh, coords, classes, num_of_anchors, classfix, softmax, softmax_tree, anchors_vec.data());
                    }
+                    else if (layer_type == "shortcut")
+                    {
+                        std::string bottom_layer = getParam<std::string>(layer_params, "from", "");
+                        CV_Assert(!bottom_layer.empty());
+                        int from = std::atoi(bottom_layer.c_str());
+
+                        // A negative "from" is an offset relative to the current layer; a
+                        // non-negative value is already an absolute layer index.
+                        if (from < 0)
+                            from += layers_counter;
+                        current_channels = net->out_channels_vec[from];
+
+                        setParams.setShortcut(from);
+                    }
+                    else if (layer_type == "upsample")
+                    {
+                        int scaleFactor = getParam<int>(layer_params, "stride", 1);
+                        setParams.setUpsample(scaleFactor);
+                    }
+                    else if (layer_type == "yolo")
+                    {
+                        int classes = getParam<int>(layer_params, "classes", -1);
+                        int num_of_anchors = getParam<int>(layer_params, "num", -1);
+
+                        std::string anchors_values = getParam<std::string>(layer_params, "anchors", std::string());
+                        CV_Assert(!anchors_values.empty());
+                        std::vector<float> anchors_vec = getNumbers<float>(anchors_values);
+
+                        std::string mask_values = getParam<std::string>(layer_params, "mask", std::string());
+                        CV_Assert(!mask_values.empty());
+                        std::vector<int> mask_vec = getNumbers<int>(mask_values);
+
+                        CV_Assert(classes > 0 && num_of_anchors > 0 && (num_of_anchors * 2) == anchors_vec.size());
+
+                        setParams.setPermute(false);
+                        setParams.setYolo(classes, mask_vec, anchors_vec);
+                    }
                    else {
                        CV_Error(cv::Error::StsParseError, "Unknown layer type: " + layer_type);
                    }
@@ -598,6 +716,10 @@ namespace cv {
                        if(activation == "leaky")
                            ++cv_layers_counter;
                    }
+                    if (layer_type == "region" || layer_type == "yolo")
+                    {
+                        ++cv_layers_counter;  // For permute.
+                    }
                    current_channels = net->out_channels_vec[darknet_layers_counter];
                }
                return true;

--- a/modules/dnn/src/dnn.cpp
+++ b/modules/dnn/src/dnn.cpp
@@ -1527,12 +1527,11 @@ struct Net::Impl
                                convLayer = downLayerData->layerInstance.dynamicCast<ConvolutionLayer>();

                            //  first input layer is convolution layer
-                            if( !convLayer.empty() )
+                            if( !convLayer.empty() && eltwiseData->consumers.size() == 1 )
                            {
                                // fuse eltwise + activation layer
                                LayerData *firstConvLayerData = downLayerData;
                                {
-                                    CV_Assert(eltwiseData->consumers.size() == 1);
                                    nextData = &layers[eltwiseData->consumers[0].lid];
                                    lpNext = LayerPin(eltwiseData->consumers[0].lid, 0);
                                    Ptr<ActivationLayer> nextActivLayer;

--- a/modules/dnn/src/layers/region_layer.cpp
+++ b/modules/dnn/src/layers/region_layer.cpp
@@ -59,7 +59,7 @@ class RegionLayerImpl CV_FINAL : public RegionLayer
 public:
    int coords, classes, anchors, classfix;
    float thresh, nmsThreshold;
-    bool useSoftmaxTree, useSoftmax;
+    bool useSoftmax, useLogistic;

    RegionLayerImpl(const LayerParams& params)
    {
@@ -71,15 +71,17 @@ public:
        classes = params.get<int>("classes", 0);
        anchors = params.get<int>("anchors", 5);
        classfix = params.get<int>("classfix", 0);
-        useSoftmaxTree = params.get<bool>("softmax_tree", false);
        useSoftmax = params.get<bool>("softmax", false);
+        useLogistic = params.get<bool>("logistic", false);
        nmsThreshold = params.get<float>("nms_threshold", 0.4);

        CV_Assert(nmsThreshold >= 0.);
        CV_Assert(coords == 4);
        CV_Assert(classes >= 1);
        CV_Assert(anchors >= 1);
-        CV_Assert(useSoftmaxTree || useSoftmax);
+        CV_Assert(useLogistic || useSoftmax);
+        if (params.get<bool>("softmax_tree", false))
+            CV_Error(cv::Error::StsNotImplemented, "Yolo9000 is not implemented");
    }

    bool getMemoryShapes(const std::vector<MatShape> &inputs,
@@ -89,7 +91,7 @@ public:
    {
        CV_Assert(inputs.size() > 0);
        CV_Assert(inputs[0][3] == (1 + coords + classes)*anchors);
-        outputs = std::vector<MatShape>(inputs.size(), shape(inputs[0][1] * inputs[0][2] * anchors, inputs[0][3] / anchors));
+        outputs = std::vector<MatShape>(1, shape(inputs[0][1] * inputs[0][2] * anchors, inputs[0][3] / anchors));
        return false;
    }

@@ -124,14 +126,13 @@ public:
        std::vector<UMat> inputs;
        std::vector<UMat> outputs;

+        // TODO: implement a logistic activation to classification scores.
+        if (useLogistic)
+            return false;
+
        inps.getUMatVector(inputs);
        outs.getUMatVector(outputs);

-        if (useSoftmaxTree) {   // Yolo 9000
-            CV_Error(cv::Error::StsNotImplemented, "Yolo9000 is not implemented");
-            return false;
-        }
-
        CV_Assert(inputs.size() >= 1);
        int const cell_size = classes + coords + 1;
        UMat blob_umat = blobs[0].getUMat(ACCESS_READ);
@@ -203,6 +204,7 @@ public:
        CV_TRACE_ARG_VALUE(name, "name", name.c_str());

        CV_Assert(inputs.size() >= 1);
+        CV_Assert(outputs.size() == 1);
        int const cell_size = classes + coords + 1;

        const float* biasData = blobs[0].ptr<float>();
@@ -214,6 +216,9 @@ public:

            int rows = inpBlob.size[1];
            int cols = inpBlob.size[2];
+            CV_Assert(inputs.size() < 2 || inputs[1]->dims == 4);
+            int hNorm = inputs.size() > 1 ? inputs[1]->size[2] : rows;
+            int wNorm = inputs.size() > 1 ? inputs[1]->size[3] : cols;

            const float *srcData = inpBlob.ptr<float>();
            float *dstData = outBlob.ptr<float>();
@@ -225,16 +230,23 @@ public:
                dstData[index + 4] = logistic_activate(x);	// logistic activation
            }

-            if (useSoftmaxTree) {   // Yolo 9000
-                CV_Error(cv::Error::StsNotImplemented, "Yolo9000 is not implemented");
-            }
-            else if (useSoftmax) {  // Yolo v2
+            if (useSoftmax) {  // Yolo v2
                // softmax activation for Probability, for each grid cell (X x Y x Anchor-index)
                for (int i = 0; i < rows*cols*anchors; ++i) {
                    int index = cell_size*i;
                    softmax_activate(srcData + index + 5, classes, 1, dstData + index + 5);
                }
-
+            }
+            else if (useLogistic) {  // Yolo v3
+                for (int i = 0; i < rows*cols*anchors; ++i)
+                {
+                    int index = cell_size*i;
+                    const float* input = srcData + index + 5;
+                    float* output = dstData + index + 5;
+                    for (int i = 0; i < classes; ++i)
+                        output[i] = logistic_activate(input[i]);
+                }
+            }
            for (int x = 0; x < cols; ++x)
                for(int y = 0; y < rows; ++y)
                    for (int a = 0; a < anchors; ++a) {
@@ -246,28 +258,19 @@ public:

                        dstData[box_index + 0] = (x + logistic_activate(srcData[box_index + 0])) / cols;
                        dstData[box_index + 1] = (y + logistic_activate(srcData[box_index + 1])) / rows;
-                            dstData[box_index + 2] = exp(srcData[box_index + 2]) * biasData[2 * a] / cols;
-                            dstData[box_index + 3] = exp(srcData[box_index + 3]) * biasData[2 * a + 1] / rows;
+                        // Index 2 is the box width and index 3 the box height: the width is
+                        // normalized by the horizontal extent (wNorm, cols by default) and the
+                        // height by the vertical extent (hNorm, rows by default).
+                        dstData[box_index + 2] = exp(srcData[box_index + 2]) * biasData[2 * a] / wNorm;
+                        dstData[box_index + 3] = exp(srcData[box_index + 3]) * biasData[2 * a + 1] / hNorm;

                        int class_index = index * cell_size + 5;

-                            if (useSoftmaxTree) {
-                                CV_Error(cv::Error::StsNotImplemented, "Yolo9000 is not implemented");
-                            }
-                            else {
                        for (int j = 0; j < classes; ++j) {
                            float prob = scale*dstData[class_index + j];  // prob = IoU(box, object) = t0 * class-probability
                            dstData[class_index + j] = (prob > thresh) ? prob : 0;  // if (IoU < threshold) IoU = 0;
                        }
                    }
-                        }
-
-            }
-
            if (nmsThreshold > 0) {
                do_nms_sort(dstData, rows*cols*anchors, thresh, nmsThreshold);
            }
-
        }
    }


--- a/modules/dnn/src/layers/resize_nearest_neighbor_layer.cpp
+++ b/modules/dnn/src/layers/resize_nearest_neighbor_layer.cpp
@@ -16,9 +16,11 @@ public:
    ResizeNearestNeighborLayerImpl(const LayerParams& params)
    {
        setParamsFrom(params);
-        CV_Assert(params.has("width"), params.has("height"));
-        outWidth = params.get<float>("width");
-        outHeight = params.get<float>("height");
+        CV_Assert(params.has("width") && params.has("height") || params.has("zoom_factor"));
+        CV_Assert(!params.has("width") && !params.has("height") || !params.has("zoom_factor"));
+        outWidth = params.get<float>("width", 0);
+        outHeight = params.get<float>("height", 0);
+        zoomFactor = params.get<int>("zoom_factor", 1);
        alignCorners = params.get<bool>("align_corners", false);
        if (alignCorners)
            CV_Error(Error::StsNotImplemented, "Nearest neighborhood resize with align_corners=true is not implemented");
@@ -31,12 +33,21 @@ public:
    {
        CV_Assert(inputs.size() == 1, inputs[0].size() == 4);
        outputs.resize(1, inputs[0]);
-        outputs[0][2] = outHeight;
-        outputs[0][3] = outWidth;
+        outputs[0][2] = outHeight > 0 ? outHeight : (outputs[0][2] * zoomFactor);
+        outputs[0][3] = outWidth > 0 ? outWidth : (outputs[0][3] * zoomFactor);
        // We can work in-place (do nothing) if input shape == output shape.
        return (outputs[0][2] == inputs[0][2]) && (outputs[0][3] == inputs[0][3]);
    }

+    // When no explicit output size is set (outWidth and outHeight are both zero),
+    // takes the size from dimensions 2 and 3 of the first output blob.
+    virtual void finalize(const std::vector<Mat*>& inputs, std::vector<Mat> &outputs) CV_OVERRIDE
+    {
+        const bool hasExplicitSize = outWidth || outHeight;
+        if (hasExplicitSize)
+            return;
+
+        const Mat& firstOutput = outputs[0];
+        outHeight = firstOutput.size[2];
+        outWidth = firstOutput.size[3];
+    }
+
    void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr) CV_OVERRIDE
    {
        CV_TRACE_FUNCTION();
@@ -65,7 +76,7 @@ public:
        }
    }
 private:
-    int outWidth, outHeight;
+    int outWidth, outHeight, zoomFactor;
    bool alignCorners;
 };


--- a/modules/dnn/test/test_darknet_importer.cpp
+++ b/modules/dnn/test/test_darknet_importer.cpp
--- a/samples/dnn/object_detection.cpp
+++ b/samples/dnn/object_detection.cpp
@@ -35,12 +35,14 @@ using namespace dnn;
 float confThreshold;
 std::vector<std::string> classes;

-void postprocess(Mat& frame, const Mat& out, Net& net);
+void postprocess(Mat& frame, const std::vector<Mat>& out, Net& net);

 void drawPred(int classId, float conf, int left, int top, int right, int bottom, Mat& frame);

 void callback(int pos, void* userdata);

+std::vector<String> getOutputsNames(const Net& net);
+
 int main(int argc, char** argv)
 {
    CommandLineParser parser(argc, argv, keys);
@@ -115,9 +117,10 @@ int main(int argc, char** argv)
            Mat imInfo = (Mat_<float>(1, 3) << inpSize.height, inpSize.width, 1.6f);
            net.setInput(imInfo, "im_info");
        }
-        Mat out = net.forward();
+        std::vector<Mat> outs;
+        net.forward(outs, getOutputsNames(net));

-        postprocess(frame, out, net);
+        postprocess(frame, outs, net);

        // Put efficiency information.
        std::vector<double> layersTimes;
@@ -131,18 +134,19 @@ int main(int argc, char** argv)
    return 0;
 }

-void postprocess(Mat& frame, const Mat& out, Net& net)
+void postprocess(Mat& frame, const std::vector<Mat>& outs, Net& net)
 {
    static std::vector<int> outLayers = net.getUnconnectedOutLayers();
    static std::string outLayerType = net.getLayer(outLayers[0])->type;

-    float* data = (float*)out.data;
    if (net.getLayer(0)->outputNameToIndex("im_info") != -1)  // Faster-RCNN or R-FCN
    {
        // Network produces output blob with a shape 1x1xNx7 where N is a number of
        // detections and an every detection is a vector of values
        // [batchId, classId, confidence, left, top, right, bottom]
-        for (size_t i = 0; i < out.total(); i += 7)
+        CV_Assert(outs.size() == 1);
+        float* data = (float*)outs[0].data;
+        for (size_t i = 0; i < outs[0].total(); i += 7)
        {
            float confidence = data[i + 2];
            if (confidence > confThreshold)
@@ -161,7 +165,9 @@ void postprocess(Mat& frame, const Mat& out, Net& net)
        // Network produces output blob with a shape 1x1xNx7 where N is a number of
        // detections and an every detection is a vector of values
        // [batchId, classId, confidence, left, top, right, bottom]
-        for (size_t i = 0; i < out.total(); i += 7)
+        CV_Assert(outs.size() == 1);
+        float* data = (float*)outs[0].data;
+        for (size_t i = 0; i < outs[0].total(); i += 7)
        {
            float confidence = data[i + 2];
            if (confidence > confThreshold)
@@ -176,29 +182,47 @@ void postprocess(Mat& frame, const Mat& out, Net& net)
        }
    }
    else if (outLayerType == "Region")
+    {
+        std::vector<int> classIds;
+        std::vector<float> confidences;
+        std::vector<Rect> boxes;
+        for (size_t i = 0; i < outs.size(); ++i)
        {
            // Network produces output blob with a shape NxC where N is a number of
            // detected objects and C is a number of classes + 5 where the first 4
            // numbers are [center_x, center_y, width, height] and the class scores
            // start at the 6th number
-        for (int i = 0; i < out.rows; ++i, data += out.cols)
+            float* data = (float*)outs[i].data;
+            for (int j = 0; j < outs[i].rows; ++j, data += outs[i].cols)
            {
-            Mat confidences = out.row(i).colRange(5, out.cols);
+                Mat scores = outs[i].row(j).colRange(5, outs[i].cols);
                Point classIdPoint;
                double confidence;
-            minMaxLoc(confidences, 0, &confidence, 0, &classIdPoint);
+                minMaxLoc(scores, 0, &confidence, 0, &classIdPoint);
                if (confidence > confThreshold)
                {
-                int classId = classIdPoint.x;
                    int centerX = (int)(data[0] * frame.cols);
                    int centerY = (int)(data[1] * frame.rows);
                    int width = (int)(data[2] * frame.cols);
                    int height = (int)(data[3] * frame.rows);
                    int left = centerX - width / 2;
                    int top = centerY - height / 2;
-                drawPred(classId, (float)confidence, left, top, left + width, top + height, frame);
+
+                    classIds.push_back(classIdPoint.x);
+                    confidences.push_back((float)confidence);
+                    boxes.push_back(Rect(left, top, width, height));
                }
            }
        }
+        std::vector<int> indices;
+        NMSBoxes(boxes, confidences, confThreshold, 0.4, indices);
+        for (size_t i = 0; i < indices.size(); ++i)
+        {
+            int idx = indices[i];
+            Rect box = boxes[idx];
+            drawPred(classIds[idx], confidences[idx], box.x, box.y,
+                     box.x + box.width, box.y + box.height, frame);
+        }
+    }
    else
        CV_Error(Error::StsNotImplemented, "Unknown output layer type: " + outLayerType);
 }
@@ -227,3 +251,17 @@ void callback(int pos, void*)
 {
    confThreshold = pos * 0.01f;
 }
+
+// Returns the names of the layers reported by net.getUnconnectedOutLayers().
+// The list is built on the first call and kept in a function-local static; once it is
+// non-empty, later calls return the cached list regardless of the net passed in.
+std::vector<String> getOutputsNames(const Net& net)
+{
+    static std::vector<String> cachedNames;
+    if (!cachedNames.empty())
+        return cachedNames;
+
+    const std::vector<int> outputIds = net.getUnconnectedOutLayers();
+    const std::vector<String> allNames = net.getLayerNames();
+    cachedNames.reserve(outputIds.size());
+    for (size_t k = 0; k < outputIds.size(); ++k)
+    {
+        // A layer id minus one is the position of that layer in the name list.
+        cachedNames.push_back(allNames[outputIds[k] - 1]);
+    }
+    return cachedNames;
+}
--- a/samples/dnn/object_detection.py
+++ b/samples/dnn/object_detection.py
@@ -55,7 +55,11 @@ net.setPreferableTarget(args.target)

 confThreshold = args.thr

-def postprocess(frame, out):
+def getOutputsNames(net):
+    layersNames = net.getLayerNames()
+    return [layersNames[i[0] - 1] for i in net.getUnconnectedOutLayers()]
+
+def postprocess(frame, outs):
    frameHeight = frame.shape[0]
    frameWidth = frame.shape[1]

@@ -63,7 +67,7 @@ def postprocess(frame, out):
        # Draw a bounding box.
        cv.rectangle(frame, (left, top), (right, bottom), (0, 255, 0))

-        label = '%.2f' % confidence
+        label = '%.2f' % conf

        # Print a label of class.
        if classes:
@@ -83,6 +87,8 @@ def postprocess(frame, out):
        # Network produces output blob with a shape 1x1xNx7 where N is a number of
        # detections and an every detection is a vector of values
        # [batchId, classId, confidence, left, top, right, bottom]
+        assert(len(outs) == 1)
+        out = outs[0]
        for detection in out[0, 0]:
            confidence = detection[2]
            if confidence > confThreshold:
@@ -96,6 +102,8 @@ def postprocess(frame, out):
        # Network produces output blob with a shape 1x1xNx7 where N is a number of
        # detections and an every detection is a vector of values
        # [batchId, classId, confidence, left, top, right, bottom]
+        assert(len(outs) == 1)
+        out = outs[0]
        for detection in out[0, 0]:
            confidence = detection[2]
            if confidence > confThreshold:
@@ -109,10 +117,14 @@ def postprocess(frame, out):
        # Network produces output blob with a shape NxC where N is a number of
        # detected objects and C is a number of classes + 5 where the first 4
        # numbers are [center_x, center_y, width, height] and the class scores
        # start at the 6th number
+        classIds = []
+        confidences = []
+        boxes = []
+        for out in outs:
            for detection in out:
-            confidences = detection[5:]
-            classId = np.argmax(confidences)
-            confidence = confidences[classId]
+                scores = detection[5:]
+                classId = np.argmax(scores)
+                confidence = scores[classId]
                if confidence > confThreshold:
                    center_x = int(detection[0] * frameWidth)
                    center_y = int(detection[1] * frameHeight)
@@ -120,7 +132,18 @@ def postprocess(frame, out):
                    height = int(detection[3] * frameHeight)
                    # Floor division keeps the corner coordinates integral on Python 3 as
                    # well; with integer operands it matches what `/` gives on Python 2.
                    left = center_x - width // 2
                    top = center_y - height // 2
-                drawPred(classId, confidence, left, top, left + width, top + height)
+                    classIds.append(classId)
+                    confidences.append(float(confidence))
+                    boxes.append([left, top, width, height])
+        indices = cv.dnn.NMSBoxes(boxes, confidences, confThreshold, 0.4)
+        for i in indices:
+            i = i[0]
+            box = boxes[i]
+            left = box[0]
+            top = box[1]
+            width = box[2]
+            height = box[3]
+            drawPred(classIds[i], confidences[i], left, top, left + width, top + height)

 # Process inputs
 winName = 'Deep learning object detection in OpenCV'
@@ -152,9 +175,9 @@ while cv.waitKey(1) < 0:
    if net.getLayer(0).outputNameToIndex('im_info') != -1:  # Faster-RCNN or R-FCN
        frame = cv.resize(frame, (inpWidth, inpHeight))
        net.setInput(np.array([inpHeight, inpWidth, 1.6], dtype=np.float32), 'im_info');
-    out = net.forward()
+    outs = net.forward(getOutputsNames(net))

-    postprocess(frame, out)
+    postprocess(frame, outs)

    # Put efficiency information.
    t, _ = net.getPerfProfile()