Merge pull request #9750 from dkurt:feature_dnn_tf_text_graph

04604523 · Vadim Pisarevsky · 7d55c09a · e4aa39f9 · 04604523 · 04604523
Commit 04604523 authored Oct 10, 2017 by Vadim Pisarevsky
10 changed files
--- a/modules/dnn/include/opencv2/dnn/dnn.hpp
+++ b/modules/dnn/include/opencv2/dnn/dnn.hpp
@@ -629,7 +629,7 @@ CV__DNN_EXPERIMENTAL_NS_BEGIN
    /** @brief Reads a network model stored in Tensorflow model file.
      * @details This is shortcut consisting from createTensorflowImporter and Net::populateNet calls.
      */
-    CV_EXPORTS_W Net readNetFromTensorflow(const String &model);
+    CV_EXPORTS_W Net readNetFromTensorflow(const String &model, const String &config = String());
    /** @brief Reads a network model stored in Torch model file.
      * @details This is shortcut consisting from createTorchImporter and Net::populateNet calls.

--- a/modules/dnn/src/layers/detection_output_layer.cpp
+++ b/modules/dnn/src/layers/detection_output_layer.cpp
@@ -81,6 +81,8 @@ public:
    float _nmsThreshold;
    int _topK;
+    // Whenever predicted bounding boxes are respresented in YXHW instead of XYWH layout.
+    bool _locPredTransposed;
    enum { _numAxes = 4 };
    static const std::string _layerName;
@@ -148,6 +150,7 @@ public:
        _keepTopK = getParameter<int>(params, "keep_top_k");
        _confidenceThreshold = getParameter<float>(params, "confidence_threshold", 0, false, -FLT_MAX);
        _topK = getParameter<int>(params, "top_k", 0, false, -1);
+        _locPredTransposed = getParameter<bool>(params, "loc_pred_transposed", 0, false, false);
        getCodeType(params);
@@ -209,7 +212,7 @@ public:
            // Retrieve all location predictions
            std::vector<LabelBBox> allLocationPredictions;
            GetLocPredictions(locationData, num, numPriors, _numLocClasses,
-                              _shareLocation, allLocationPredictions);
+                              _shareLocation, _locPredTransposed, allLocationPredictions);
            // Retrieve all confidences
            GetConfidenceScores(confidenceData, num, numPriors, _numClasses, allConfidenceScores);
@@ -540,11 +543,14 @@ public:
    //    num_loc_classes: number of location classes. It is 1 if share_location is
    //      true; and is equal to number of classes needed to predict otherwise.
    //    share_location: if true, all classes share the same location prediction.
+    //    loc_pred_transposed: if true, represent four bounding box values as
+    //                         [y,x,height,width] or [x,y,width,height] otherwise.
    //    loc_preds: stores the location prediction, where each item contains
    //      location prediction for an image.
    static void GetLocPredictions(const float* locData, const int num,
                           const int numPredsPerClass, const int numLocClasses,
-                           const bool shareLocation, std::vector<LabelBBox>& locPreds)
+                           const bool shareLocation, const bool locPredTransposed,
+                           std::vector<LabelBBox>& locPreds)
    {
        locPreds.clear();
        if (shareLocation)
@@ -566,10 +572,20 @@ public:
                        labelBBox[label].resize(numPredsPerClass);
                    }
                    caffe::NormalizedBBox& bbox = labelBBox[label][p];
-                    bbox.set_xmin(locData[startIdx + c * 4]);
+                    if (locPredTransposed)
-                    bbox.set_ymin(locData[startIdx + c * 4 + 1]);
+                    {
-                    bbox.set_xmax(locData[startIdx + c * 4 + 2]);
+                        bbox.set_ymin(locData[startIdx + c * 4]);
-                    bbox.set_ymax(locData[startIdx + c * 4 + 3]);
+                        bbox.set_xmin(locData[startIdx + c * 4 + 1]);
+                        bbox.set_ymax(locData[startIdx + c * 4 + 2]);
+                        bbox.set_xmax(locData[startIdx + c * 4 + 3]);
+                    }
+                    else
+                    {
+                        bbox.set_xmin(locData[startIdx + c * 4]);
+                        bbox.set_ymin(locData[startIdx + c * 4 + 1]);
+                        bbox.set_xmax(locData[startIdx + c * 4 + 2]);
+                        bbox.set_ymax(locData[startIdx + c * 4 + 3]);
+                    }
                }
            }
        }

--- a/modules/dnn/src/layers/prior_box_layer.cpp
+++ b/modules/dnn/src/layers/prior_box_layer.cpp
@@ -124,6 +124,20 @@ public:
        }
    }
+    void getScales(const LayerParams &params)
+    {
+        DictValue scalesParameter;
+        bool scalesRetieved = getParameterDict(params, "scales", scalesParameter);
+        if (scalesRetieved)
+        {
+            _scales.resize(scalesParameter.size());
+            for (int i = 0; i < scalesParameter.size(); ++i)
+            {
+                _scales[i] = scalesParameter.get<float>(i);
+            }
+        }
+    }
    void getVariance(const LayerParams &params)
    {
        DictValue varianceParameter;
@@ -169,13 +183,14 @@ public:
        _flip = getParameter<bool>(params, "flip");
        _clip = getParameter<bool>(params, "clip");
+        _scales.clear();
        _aspectRatios.clear();
-        _aspectRatios.push_back(1.);
        getAspectRatios(params);
        getVariance(params);
+        getScales(params);
-        _numPriors = _aspectRatios.size();
+        _numPriors = _aspectRatios.size() + 1;  // + 1 for an aspect ratio 1.0
        _maxSize = -1;
        if (params.has("max_size"))
@@ -231,6 +246,11 @@ public:
        CV_TRACE_FUNCTION();
        CV_TRACE_ARG_VALUE(name, "name", name.c_str());
+        if (_scales.empty())
+            _scales.resize(_numPriors, 1.0f);
+        else
+            CV_Assert(_scales.size() == _numPriors);
        int _layerWidth = inputs[0]->size[3];
        int _layerHeight = inputs[0]->size[2];
@@ -256,7 +276,7 @@ public:
        {
            for (size_t w = 0; w < _layerWidth; ++w)
            {
-                _boxWidth = _boxHeight = _minSize;
+                _boxWidth = _boxHeight = _minSize * _scales[0];
                float center_x = (w + 0.5) * stepX;
                float center_y = (h + 0.5) * stepY;
@@ -272,7 +292,7 @@ public:
                if (_maxSize > 0)
                {
                    // second prior: aspect_ratio = 1, size = sqrt(min_size * max_size)
-                    _boxWidth = _boxHeight = sqrt(_minSize * _maxSize);
+                    _boxWidth = _boxHeight = sqrt(_minSize * _maxSize) * _scales[1];
                    // xmin
                    outputPtr[idx++] = (center_x - _boxWidth / 2.) / _imageWidth;
                    // ymin
@@ -284,15 +304,13 @@ public:
                }
                // rest of priors
+                CV_Assert((_maxSize > 0 ? 2 : 1) + _aspectRatios.size() == _scales.size());
                for (size_t r = 0; r < _aspectRatios.size(); ++r)
                {
                    float ar = _aspectRatios[r];
-                    if (fabs(ar - 1.) < 1e-6)
+                    float scale = _scales[(_maxSize > 0 ? 2 : 1) + r];
-                    {
+                    _boxWidth = _minSize * sqrt(ar) * scale;
-                        continue;
+                    _boxHeight = _minSize / sqrt(ar) * scale;
-                    }
-                    _boxWidth = _minSize * sqrt(ar);
-                    _boxHeight = _minSize / sqrt(ar);
                    // xmin
                    outputPtr[idx++] = (center_x - _boxWidth / 2.) / _imageWidth;
                    // ymin
@@ -363,6 +381,7 @@ public:
    std::vector<float> _aspectRatios;
    std::vector<float> _variance;
+    std::vector<float> _scales;
    bool _flip;
    bool _clip;

--- a/modules/dnn/src/tensorflow/tf_importer.cpp
+++ b/modules/dnn/src/tensorflow/tf_importer.cpp
--- a/modules/dnn/src/tensorflow/tf_io.cpp
+++ b/modules/dnn/src/tensorflow/tf_io.cpp
@@ -52,12 +52,27 @@ bool ReadProtoFromBinaryFileTF(const char* filename, Message* proto) {
    return success;
 }
+bool ReadProtoFromTextFileTF(const char* filename, Message* proto) {
+    std::ifstream fs(filename, std::ifstream::in);
+    CHECK(fs.is_open()) << "Can't open \"" << filename << "\"";
+    IstreamInputStream input(&fs);
+    bool success = google::protobuf::TextFormat::Parse(&input, proto);
+    fs.close();
+    return success;
+}
 void ReadTFNetParamsFromBinaryFileOrDie(const char* param_file,
                                      tensorflow::GraphDef* param) {
  CHECK(ReadProtoFromBinaryFileTF(param_file, param))
      << "Failed to parse GraphDef file: " << param_file;
 }
+void ReadTFNetParamsFromTextFileOrDie(const char* param_file,
+                                      tensorflow::GraphDef* param) {
+  CHECK(ReadProtoFromTextFileTF(param_file, param))
+      << "Failed to parse GraphDef file: " << param_file;
+}
 }
 }
 #endif
--- a/modules/dnn/src/tensorflow/tf_io.hpp
+++ b/modules/dnn/src/tensorflow/tf_io.hpp
@@ -22,6 +22,9 @@ namespace dnn {
 void ReadTFNetParamsFromBinaryFileOrDie(const char* param_file,
                                      tensorflow::GraphDef* param);
+void ReadTFNetParamsFromTextFileOrDie(const char* param_file,
+                                      tensorflow::GraphDef* param);
 }
 }

--- a/modules/dnn/test/test_tf_importer.cpp
+++ b/modules/dnn/test/test_tf_importer.cpp
@@ -74,14 +74,15 @@ static std::string path(const std::string& file)
    return findDataFile("dnn/tensorflow/" + file, false);
 }
-static void runTensorFlowNet(const std::string& prefix,
+static void runTensorFlowNet(const std::string& prefix, bool hasText = false,
                             double l1 = 1e-5, double lInf = 1e-4)
 {
    std::string netPath = path(prefix + "_net.pb");
+    std::string netConfig = (hasText ? path(prefix + "_net.pbtxt") : "");
    std::string inpPath = path(prefix + "_in.npy");
    std::string outPath = path(prefix + "_out.npy");
-    Net net = readNetFromTensorflow(netPath);
+    Net net = readNetFromTensorflow(netPath, netConfig);
    cv::Mat input = blobFromNPY(inpPath);
    cv::Mat target = blobFromNPY(outPath);
@@ -120,6 +121,7 @@ TEST(Test_TensorFlow, batch_norm)
 {
    runTensorFlowNet("batch_norm");
    runTensorFlowNet("fused_batch_norm");
+    runTensorFlowNet("batch_norm_text", true);
 }
 TEST(Test_TensorFlow, pooling)
@@ -148,26 +150,60 @@ TEST(Test_TensorFlow, reshape)
 {
    runTensorFlowNet("shift_reshape_no_reorder");
    runTensorFlowNet("reshape_reduce");
+    runTensorFlowNet("flatten", true);
 }
 TEST(Test_TensorFlow, fp16)
 {
    const float l1 = 1e-3;
    const float lInf = 1e-2;
-    runTensorFlowNet("fp16_single_conv", l1, lInf);
+    runTensorFlowNet("fp16_single_conv", false, l1, lInf);
-    runTensorFlowNet("fp16_deconvolution", l1, lInf);
+    runTensorFlowNet("fp16_deconvolution", false, l1, lInf);
-    runTensorFlowNet("fp16_max_pool_odd_same", l1, lInf);
+    runTensorFlowNet("fp16_max_pool_odd_same", false, l1, lInf);
-    runTensorFlowNet("fp16_padding_valid", l1, lInf);
+    runTensorFlowNet("fp16_padding_valid", false, l1, lInf);
-    runTensorFlowNet("fp16_eltwise_add_mul", l1, lInf);
+    runTensorFlowNet("fp16_eltwise_add_mul", false, l1, lInf);
-    runTensorFlowNet("fp16_max_pool_odd_valid", l1, lInf);
+    runTensorFlowNet("fp16_max_pool_odd_valid", false, l1, lInf);
-    runTensorFlowNet("fp16_pad_and_concat", l1, lInf);
+    runTensorFlowNet("fp16_pad_and_concat", false, l1, lInf);
-    runTensorFlowNet("fp16_max_pool_even", l1, lInf);
+    runTensorFlowNet("fp16_max_pool_even", false, l1, lInf);
-    runTensorFlowNet("fp16_padding_same", l1, lInf);
+    runTensorFlowNet("fp16_padding_same", false, l1, lInf);
+}
+TEST(Test_TensorFlow, MobileNet_SSD)
+{
+    std::string netPath = findDataFile("dnn/ssd_mobilenet_v1_coco.pb", false);
+    std::string netConfig = findDataFile("dnn/ssd_mobilenet_v1_coco.pbtxt", false);
+    std::string imgPath = findDataFile("dnn/street.png", false);
+    Mat inp;
+    resize(imread(imgPath), inp, Size(300, 300));
+    inp = blobFromImage(inp, 1.0f / 127.5, Size(), Scalar(127.5, 127.5, 127.5), true);
+    std::vector<String> outNames(3);
+    outNames[0] = "concat";
+    outNames[1] = "concat_1";
+    outNames[2] = "detection_out";
+    std::vector<Mat> target(outNames.size());
+    for (int i = 0; i < outNames.size(); ++i)
+    {
+        std::string path = findDataFile("dnn/tensorflow/ssd_mobilenet_v1_coco." + outNames[i] + ".npy", false);
+        target[i] = blobFromNPY(path);
+    }
+    Net net = readNetFromTensorflow(netPath, netConfig);
+    net.setInput(inp);
+    std::vector<Mat> output;
+    net.forward(output, outNames);
+    normAssert(target[0].reshape(1, 1), output[0].reshape(1, 1));
+    normAssert(target[1].reshape(1, 1), output[1].reshape(1, 1), "", 1e-5, 2e-4);
+    normAssert(target[2].reshape(1, 1), output[2].reshape(1, 1), "", 4e-5, 1e-2);
 }
 TEST(Test_TensorFlow, lstm)
 {
-    runTensorFlowNet("lstm");
+    runTensorFlowNet("lstm", true);
 }
 TEST(Test_TensorFlow, split)

--- a/samples/dnn/mobilenet_ssd_accuracy.py
+++ b/samples/dnn/mobilenet_ssd_accuracy.py
+# Script to evaluate MobileNet-SSD object detection model trained in TensorFlow
+# using both TensorFlow and OpenCV. Example:
+#
+# python mobilenet_ssd_accuracy.py \
+#   --weights=frozen_inference_graph.pb \
+#   --prototxt=ssd_mobilenet_v1_coco.pbtxt \
+#   --images=val2017 \
+#   --annotations=annotations/instances_val2017.json
+#
+# Tested on COCO 2017 object detection dataset, http://cocodataset.org/#download
+import os
+import cv2 as cv
+import json
+import argparse
+parser = argparse.ArgumentParser(
+    description='Evaluate MobileNet-SSD model using both TensorFlow and OpenCV. '
+                'COCO evaluation framework is required: http://cocodataset.org')
+parser.add_argument('--weights', required=True,
+                    help='Path to frozen_inference_graph.pb of MobileNet-SSD model. '
+                         'Download it at https://github.com/tensorflow/models/tree/master/research/object_detection')
+parser.add_argument('--prototxt', help='Path to ssd_mobilenet_v1_coco.pbtxt from opencv_extra.', required=True)
+parser.add_argument('--images', help='Path to COCO validation images directory.', required=True)
+parser.add_argument('--annotations', help='Path to COCO annotations file.', required=True)
+args = parser.parse_args()
+### Get OpenCV predictions #####################################################
+net = cv.dnn.readNetFromTensorflow(args.weights, args.prototxt)
+detections = []
+for imgName in os.listdir(args.images):
+    inp = cv.imread(os.path.join(args.images, imgName))
+    rows = inp.shape[0]
+    cols = inp.shape[1]
+    inp = cv.resize(inp, (300, 300))
+    net.setInput(cv.dnn.blobFromImage(inp, 1.0/127.5, (300, 300), (127.5, 127.5, 127.5), True))
+    out = net.forward()
+    for i in range(out.shape[2]):
+        score = float(out[0, 0, i, 2])
+        # Confidence threshold is in prototxt.
+        classId = int(out[0, 0, i, 1])
+        x = out[0, 0, i, 3] * cols
+        y = out[0, 0, i, 4] * rows
+        w = out[0, 0, i, 5] * cols - x
+        h = out[0, 0, i, 6] * rows - y
+        detections.append({
+          "image_id": int(imgName.rstrip('0')[:imgName.rfind('.')]),
+          "category_id": classId,
+          "bbox": [x, y, w, h],
+          "score": score
+        })
+with open('cv_result.json', 'wt') as f:
+    json.dump(detections, f)
+### Get TensorFlow predictions #################################################
+import tensorflow as tf
+with tf.gfile.FastGFile(args.weights) as f:
+    # Load the model
+    graph_def = tf.GraphDef()
+    graph_def.ParseFromString(f.read())
+with tf.Session() as sess:
+    # Restore session
+    sess.graph.as_default()
+    tf.import_graph_def(graph_def, name='')
+    detections = []
+    for imgName in os.listdir(args.images):
+        inp = cv.imread(os.path.join(args.images, imgName))
+        rows = inp.shape[0]
+        cols = inp.shape[1]
+        inp = cv.resize(inp, (300, 300))
+        inp = inp[:, :, [2, 1, 0]]  # BGR2RGB
+        out = sess.run([sess.graph.get_tensor_by_name('num_detections:0'),
+                        sess.graph.get_tensor_by_name('detection_scores:0'),
+                        sess.graph.get_tensor_by_name('detection_boxes:0'),
+                        sess.graph.get_tensor_by_name('detection_classes:0')],
+                       feed_dict={'image_tensor:0': inp.reshape(1, inp.shape[0], inp.shape[1], 3)})
+        num_detections = int(out[0][0])
+        for i in range(num_detections):
+            classId = int(out[3][0][i])
+            score = float(out[1][0][i])
+            bbox = [float(v) for v in out[2][0][i]]
+            if score > 0.01:
+                x = bbox[1] * cols
+                y = bbox[0] * rows
+                w = bbox[3] * cols - x
+                h = bbox[2] * rows - y
+                detections.append({
+                  "image_id": int(imgName.rstrip('0')[:imgName.rfind('.')]),
+                  "category_id": classId,
+                  "bbox": [x, y, w, h],
+                  "score": score
+                })
+with open('tf_result.json', 'wt') as f:
+    json.dump(detections, f)
+### Evaluation part ############################################################
+# %matplotlib inline
+import matplotlib.pyplot as plt
+from pycocotools.coco import COCO
+from pycocotools.cocoeval import COCOeval
+import numpy as np
+import skimage.io as io
+import pylab
+pylab.rcParams['figure.figsize'] = (10.0, 8.0)
+annType = ['segm','bbox','keypoints']
+annType = annType[1]      #specify type here
+prefix = 'person_keypoints' if annType=='keypoints' else 'instances'
+print 'Running demo for *%s* results.'%(annType)
+#initialize COCO ground truth api
+cocoGt=COCO(args.annotations)
+#initialize COCO detections api
+for resFile in ['tf_result.json', 'cv_result.json']:
+    print resFile
+    cocoDt=cocoGt.loadRes(resFile)
+    cocoEval = COCOeval(cocoGt,cocoDt,annType)
+    cocoEval.evaluate()
+    cocoEval.accumulate()
+    cocoEval.summarize()
--- a/samples/dnn/mobilenet_ssd_python.py
+++ b/samples/dnn/mobilenet_ssd_python.py
+# This script is used to demonstrate MobileNet-SSD network using OpenCV deep learning module.
+#
+# It works with model taken from https://github.com/chuanqi305/MobileNet-SSD/ that
+# was trained in Caffe-SSD framework, https://github.com/weiliu89/caffe/tree/ssd.
+# Model detects objects from 20 classes.
+#
+# Also TensorFlow model from TensorFlow object detection model zoo may be used to
+# detect objects from 90 classes:
+# https://github.com/tensorflow/models/blob/master/research/object_detection/g3doc/detection_model_zoo.md
+# Text graph definition must be taken from opencv_extra:
+# https://github.com/opencv/opencv_extra/tree/master/testdata/dnn/ssd_mobilenet_v1_coco.pbtxt
 import numpy as np
 import argparse
@@ -13,27 +24,58 @@ WHRatio = inWidth / float(inHeight)
 inScaleFactor = 0.007843
 meanVal = 127.5
-classNames = ('background',
-              'aeroplane', 'bicycle', 'bird', 'boat',
-              'bottle', 'bus', 'car', 'cat', 'chair',
-              'cow', 'diningtable', 'dog', 'horse',
-              'motorbike', 'person', 'pottedplant',
-              'sheep', 'sofa', 'train', 'tvmonitor')
 if __name__ == "__main__":
-    parser = argparse.ArgumentParser()
+    parser = argparse.ArgumentParser(
+        description='Script to run MobileNet-SSD object detection network '
+                    'trained either in Caffe or TensorFlow frameworks.')
    parser.add_argument("--video", help="path to video file. If empty, camera's stream will be used")
    parser.add_argument("--prototxt", default="MobileNetSSD_deploy.prototxt",
-                        help="path to caffe prototxt")
+                                      help='Path to text network file: '
-    parser.add_argument("-c", "--caffemodel", default="MobileNetSSD_deploy.caffemodel",
+                                           'MobileNetSSD_deploy.prototxt for Caffe model or '
-                        help="path to caffemodel file, download it here: "
+                                           'ssd_mobilenet_v1_coco.pbtxt from opencv_extra for TensorFlow model')
-                        "https://github.com/chuanqi305/MobileNet-SSD/")
+    parser.add_argument("--weights", default="MobileNetSSD_deploy.caffemodel",
-    parser.add_argument("--thr", default=0.2, help="confidence threshold to filter out weak detections")
+                                     help='Path to weights: '
+                                          'MobileNetSSD_deploy.caffemodel for Caffe model or '
+                                          'frozen_inference_graph.pb from TensorFlow.')
+    parser.add_argument("--num_classes", default=20, type=int,
+                        help="Number of classes. It's 20 for Caffe model from "
+                             "https://github.com/chuanqi305/MobileNet-SSD/ and 90 for "
+                             "TensorFlow model from https://github.com/tensorflow/models/tree/master/research/object_detection")
+    parser.add_argument("--thr", default=0.2, type=float, help="confidence threshold to filter out weak detections")
    args = parser.parse_args()
-    net = cv.dnn.readNetFromCaffe(args.prototxt, args.caffemodel)
+    if args.num_classes == 20:
+        net = cv.dnn.readNetFromCaffe(args.prototxt, args.weights)
-    if len(args.video):
+        swapRB = False
+        classNames = { 0: 'background',
+            1: 'aeroplane', 2: 'bicycle', 3: 'bird', 4: 'boat',
+            5: 'bottle', 6: 'bus', 7: 'car', 8: 'cat', 9: 'chair',
+            10: 'cow', 11: 'diningtable', 12: 'dog', 13: 'horse',
+            14: 'motorbike', 15: 'person', 16: 'pottedplant',
+            17: 'sheep', 18: 'sofa', 19: 'train', 20: 'tvmonitor' }
+    else:
+        assert(args.num_classes == 90)
+        net = cv.dnn.readNetFromTensorflow(args.weights, args.prototxt)
+        swapRB = True
+        classNames = { 0: 'background',
+            1: 'person', 2: 'bicycle', 3: 'car', 4: 'motorcycle', 5: 'airplane', 6: 'bus',
+            7: 'train', 8: 'truck', 9: 'boat', 10: 'traffic light', 11: 'fire hydrant',
+            13: 'stop sign', 14: 'parking meter', 15: 'bench', 16: 'bird', 17: 'cat',
+            18: 'dog', 19: 'horse', 20: 'sheep', 21: 'cow', 22: 'elephant', 23: 'bear',
+            24: 'zebra', 25: 'giraffe', 27: 'backpack', 28: 'umbrella', 31: 'handbag',
+            32: 'tie', 33: 'suitcase', 34: 'frisbee', 35: 'skis', 36: 'snowboard',
+            37: 'sports ball', 38: 'kite', 39: 'baseball bat', 40: 'baseball glove',
+            41: 'skateboard', 42: 'surfboard', 43: 'tennis racket', 44: 'bottle',
+            46: 'wine glass', 47: 'cup', 48: 'fork', 49: 'knife', 50: 'spoon',
+            51: 'bowl', 52: 'banana', 53: 'apple', 54: 'sandwich', 55: 'orange',
+            56: 'broccoli', 57: 'carrot', 58: 'hot dog', 59: 'pizza', 60: 'donut',
+            61: 'cake', 62: 'chair', 63: 'couch', 64: 'potted plant', 65: 'bed',
+            67: 'dining table', 70: 'toilet', 72: 'tv', 73: 'laptop', 74: 'mouse',
+            75: 'remote', 76: 'keyboard', 77: 'cell phone', 78: 'microwave', 79: 'oven',
+            80: 'toaster', 81: 'sink', 82: 'refrigerator', 84: 'book', 85: 'clock',
+            86: 'vase', 87: 'scissors', 88: 'teddy bear', 89: 'hair drier', 90: 'toothbrush' }
+    if args.video:
        cap = cv.VideoCapture(args.video)
    else:
        cap = cv.VideoCapture(0)
@@ -41,7 +83,7 @@ if __name__ == "__main__":
    while True:
        # Capture frame-by-frame
        ret, frame = cap.read()
-        blob = cv.dnn.blobFromImage(frame, inScaleFactor, (inWidth, inHeight), meanVal, False)
+        blob = cv.dnn.blobFromImage(frame, inScaleFactor, (inWidth, inHeight), (meanVal, meanVal, meanVal), swapRB)
        net.setInput(blob)
        detections = net.forward()
@@ -74,14 +116,16 @@ if __name__ == "__main__":
                cv.rectangle(frame, (xLeftBottom, yLeftBottom), (xRightTop, yRightTop),
                              (0, 255, 0))
-                label = classNames[class_id] + ": " + str(confidence)
+                if class_id in classNames:
-                labelSize, baseLine = cv.getTextSize(label, cv.FONT_HERSHEY_SIMPLEX, 0.5, 1)
+                    label = classNames[class_id] + ": " + str(confidence)
+                    labelSize, baseLine = cv.getTextSize(label, cv.FONT_HERSHEY_SIMPLEX, 0.5, 1)
-                cv.rectangle(frame, (xLeftBottom, yLeftBottom - labelSize[1]),
-                                     (xLeftBottom + labelSize[0], yLeftBottom + baseLine),
+                    yLeftBottom = max(yLeftBottom, labelSize[1])
-                                     (255, 255, 255), cv.FILLED)
+                    cv.rectangle(frame, (xLeftBottom, yLeftBottom - labelSize[1]),
-                cv.putText(frame, label, (xLeftBottom, yLeftBottom),
+                                         (xLeftBottom + labelSize[0], yLeftBottom + baseLine),
-                            cv.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 0))
+                                         (255, 255, 255), cv.FILLED)
+                    cv.putText(frame, label, (xLeftBottom, yLeftBottom),
+                                cv.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 0))
        cv.imshow("detections", frame)
        if cv.waitKey(1) >= 0:

--- a/samples/dnn/shrink_tf_graph_weights.py
+++ b/samples/dnn/shrink_tf_graph_weights.py
+# This file is part of OpenCV project.
+# It is subject to the license terms in the LICENSE file found in the top-level directory
+# of this distribution and at http://opencv.org/license.html.
+#
+# Copyright (C) 2017, Intel Corporation, all rights reserved.
+# Third party copyrights are property of their respective owners.
+import tensorflow as tf
+import struct
+import argparse
+import numpy as np
+parser = argparse.ArgumentParser(description='Convert weights of a frozen TensorFlow graph to fp16.')
+parser.add_argument('--input', required=True, help='Path to frozen graph.')
+parser.add_argument('--output', required=True, help='Path to output graph.')
+parser.add_argument('--ops', default=['Conv2D', 'MatMul'], nargs='+',
+                    help='List of ops which weights are converted.')
+args = parser.parse_args()
+DT_FLOAT = 1
+DT_HALF = 19
+# For the frozen graphs, an every node that uses weights connected to Const nodes
+# through an Identity node. Usually they're called in the same way with '/read' suffix.
+# We'll replace all of them to Cast nodes.
+# Load the model
+with tf.gfile.FastGFile(args.input) as f:
+    graph_def = tf.GraphDef()
+    graph_def.ParseFromString(f.read())
+# Set of all inputs from desired nodes.
+inputs = []
+for node in graph_def.node:
+    if node.op in args.ops:
+        inputs += node.input
+weightsNodes = []
+for node in graph_def.node:
+    # From the whole inputs we need to keep only an Identity nodes.
+    if node.name in inputs and node.op == 'Identity' and node.attr['T'].type == DT_FLOAT:
+        weightsNodes.append(node.input[0])
+        # Replace Identity to Cast.
+        node.op = 'Cast'
+        node.attr['DstT'].type = DT_FLOAT
+        node.attr['SrcT'].type = DT_HALF
+        del node.attr['T']
+        del node.attr['_class']
+# Convert weights to halfs.
+for node in graph_def.node:
+    if node.name in weightsNodes:
+        node.attr['dtype'].type = DT_HALF
+        node.attr['value'].tensor.dtype = DT_HALF
+        floats = node.attr['value'].tensor.tensor_content
+        floats = struct.unpack('f' * (len(floats) / 4), floats)
+        halfs = np.array(floats).astype(np.float16).view(np.uint16)
+        node.attr['value'].tensor.tensor_content = struct.pack('H' * len(halfs), *halfs)
+tf.train.write_graph(graph_def, "", args.output, as_text=False)