/*
// Copyright (c) 2016 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
*/

#include "detection_output_inst.h"
#include "primitive_type_base.h"
#include "network_impl.h"
#include "error_handler.h"
#include "json_object.h"
#include <string>

namespace cldnn {
primitive_type_id detection_output_type_id() {
    static primitive_type_base<detection_output> instance;
    return &instance;
}

layout detection_output_inst::calc_output_layout(detection_output_node const& node) {
    assert(static_cast<bool>(node.get_primitive()->output_data_type) == false &&
           "Output data type forcing is not supported for "
           "detection_output_node!");
    CLDNN_ERROR_NOT_EQUAL(node.id(),
                          "Detection output layer input number",
                          node.get_dependencies().size(),
                          "expected number of inputs",
                          static_cast<size_t>(3),
                          "");

    auto input_layout = node.location().get_output_layout();

    // Batch size and feature size are 1.
    // Number of bounding boxes to be kept is set to keep_top_k * batch size.
    // If the number of detections is lower than top_k, dummy results with image_id=-1 are written at the end.
    // Each row is a 7 dimension vector, which stores:
    // [image_id, label, confidence, xmin, ymin, xmax, ymax]
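    // The location input stores PRIOR_BOX_SIZE coordinates per box prediction (typically 4: xmin, ymin,
    // xmax, ymax), so dividing its linear size by PRIOR_BOX_SIZE gives the number of box predictions.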
    int output_size = static_cast<int>(input_layout.get_linear_size()) / PRIOR_BOX_SIZE;
    int num_classes = node.get_primitive()->num_classes;

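    // When box locations are shared across classes, a single set of boxes covers every class, so the
    // number of candidate detections becomes boxes * classes (the background class is excluded when its
    // label is 0).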
    if (node.get_primitive()->share_location) {
        num_classes = (node.get_primitive()->background_label_id == 0) ? node.get_primitive()->num_classes - 1
                                                                       : node.get_primitive()->num_classes;
        output_size *= num_classes;
    }

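    // top_k == -1 disables the cap; otherwise keep at most top_k candidates per class per image.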
    if (node.get_primitive()->top_k != -1) {
        int top_k = node.get_primitive()->top_k * num_classes * input_layout.size.batch[0];
        if (top_k < output_size) {
            output_size = top_k;
        }
    }

    output_size *= DETECTION_OUTPUT_ROW_SIZE;
    // Add space for number of output results per image - needed in the next detection output step
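    // (one counter per image, with the counter block rounded up to a multiple of 16 elements)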
    output_size += ((input_layout.size.batch[0] + 15) / 16) * 16;

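    // With the detection_output_gpu build option enabled, the primitive emits a flat intermediate buffer
    // that is finalized by the sorting stage below; otherwise it directly emits keep_top_k rows of
    // DETECTION_OUTPUT_ROW_SIZE values per image.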
    if (node.get_program().get_options().get<build_option_type::detection_output_gpu>()->enabled()) {
        return {input_layout.data_type, cldnn::format::bfyx, cldnn::tensor(1, 1, 1, output_size)};
    } else {
        return {input_layout.data_type,
                cldnn::format::bfyx,
                cldnn::tensor(1,
                              1,
                              DETECTION_OUTPUT_ROW_SIZE,
                              node.get_primitive()->keep_top_k * input_layout.size.batch[0])};
    }
}

std::string detection_output_inst::to_string(detection_output_node const& node) {
    auto node_info = node.desc_to_json();
    auto desc = node.get_primitive();
    auto share_location = desc->share_location ? "true" : "false";
    auto variance_encoded = desc->variance_encoded_in_target ? "true" : "false";
    auto prior_is_normalized = desc->prior_is_normalized ? "true" : "false";
    auto decrease_label_id = desc->decrease_label_id ? "true" : "false";
    auto clip_before_nms = desc->clip_before_nms ? "true" : "false";
    auto clip_after_nms = desc->clip_after_nms ? "true" : "false";
    auto& input_location = node.location();
    auto& input_prior_box = node.prior_box();
    auto& input_confidence = node.confidence();

    std::stringstream primitive_description;
    std::string str_code_type;

    switch (desc->code_type) {
        case prior_box_code_type::corner:
            str_code_type = "corner";
            break;
        case prior_box_code_type::center_size:
            str_code_type = "center size";
            break;
        case prior_box_code_type::corner_size:
            str_code_type = "corner size";
            break;
        default:
            str_code_type = "not supported code type";
            break;
    }

    json_composite detec_out_info;
    detec_out_info.add("input location id", input_location.id());
    detec_out_info.add("input confidence id", input_confidence.id());
    detec_out_info.add("input prior box id", input_prior_box.id());
    detec_out_info.add("num_classes:", desc->num_classes);
    detec_out_info.add("keep_top_k", desc->keep_top_k);
    detec_out_info.add("share_location", share_location);
    detec_out_info.add("background_label_id", desc->background_label_id);
    detec_out_info.add("nms_treshold", desc->nms_threshold);
    detec_out_info.add("top_k", desc->top_k);
    detec_out_info.add("eta", desc->eta);
    detec_out_info.add("code_type", str_code_type);
    detec_out_info.add("variance_encoded", variance_encoded);
    detec_out_info.add("confidence_threshold", desc->confidence_threshold);
    detec_out_info.add("prior_info_size", desc->prior_info_size);
    detec_out_info.add("prior_coordinates_offset", desc->prior_coordinates_offset);
    detec_out_info.add("prior_is_normalized", prior_is_normalized);
    detec_out_info.add("input_width", desc->input_width);
    detec_out_info.add("input_height", desc->input_height);
    detec_out_info.add("decrease_label_id", decrease_label_id);
    detec_out_info.add("clip_before_nms", clip_before_nms);
    detec_out_info.add("clip_after_nms", clip_after_nms);
    detec_out_info.dump(primitive_description);

    node_info->add("dection output info", detec_out_info);
    node_info->dump(primitive_description);

    return primitive_description.str();
}

detection_output_inst::typed_primitive_inst(network_impl& network, detection_output_node const& node)
    : parent(network, node) {
    auto location_layout = node.location().get_output_layout();
    auto confidence_layout = node.confidence().get_output_layout();
    auto prior_box_layout = node.prior_box().get_output_layout();
    CLDNN_ERROR_NOT_PROPER_FORMAT(node.id(),
                                  "Location memory format",
                                  location_layout.format.value,
                                  "expected bfyx input format",
                                  format::bfyx);
    CLDNN_ERROR_NOT_PROPER_FORMAT(node.id(),
                                  "Confidence memory format",
                                  confidence_layout.format.value,
                                  "expected bfyx input format",
                                  format::bfyx);
    CLDNN_ERROR_NOT_PROPER_FORMAT(node.id(),
                                  "Prior box memory format",
                                  prior_box_layout.format.value,
                                  "expected bfyx input format",
                                  format::bfyx);

    tensor location_size = location_layout.size;
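    // The box data is expected to be flattened into the feature dimension: feature * batch must equal
    // the total element count, i.e. the spatial dimensions are 1.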
    CLDNN_ERROR_NOT_EQUAL(node.id(),
                          "Location input dimensions",
                          (location_size.feature[0] * location_size.batch[0]),
                          "detection output layer dimensions",
                          static_cast<int>(location_layout.count()),
                          "Location input/ detection output dims mismatch");

    tensor confidence_size = confidence_layout.size;
    CLDNN_ERROR_NOT_EQUAL(node.id(),
                          "Confidence input dimensions",
                          (confidence_size.feature[0] * confidence_size.batch[0]),
                          "detection output layer dimensions",
                          static_cast<int>(confidence_layout.count()),
                          "Confidence input/detection output dims mistmach");

    CLDNN_ERROR_NOT_EQUAL(node.id(),
                          "Confidence batch size",
                          confidence_size.batch[0],
                          "location input batch size",
                          location_size.batch[0],
                          "Batch sizes mismatch.");

    auto desc = node.get_primitive();
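    // The prior box input holds one feature plane with the box coordinates and, unless the variances are
    // already encoded in the target, a second plane with the variances.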
    int prior_feature_size = desc->variance_encoded_in_target ? 1 : 2;
    tensor prior_box_size = prior_box_layout.size;
    CLDNN_ERROR_NOT_EQUAL(node.id(), "Prior box spatial X", prior_box_size.spatial[0], "expected value", 1, "");
    CLDNN_ERROR_NOT_EQUAL(node.id(),
                          "Prior box feature size",
                          prior_box_size.feature[0],
                          "expected value",
                          prior_feature_size,
                          "");

    CLDNN_ERROR_BOOL(node.id(),
                     "Detection output layer padding",
                     node.is_padded(),
                     "Detection output layer doesn't support output padding.");
    CLDNN_ERROR_BOOL(node.id(),
                     "Detection output layer Prior-box input padding",
                     node.get_dependency(2).is_padded(),
                     "Detection output layer doesn't support input padding in Prior-Box input");
}

/************************ Detection Output keep_top_k part ************************/

primitive_type_id detection_output_sort_type_id() {
    static primitive_type_base<detection_output_sort> instance;
    return &instance;
}

layout detection_output_sort_inst::calc_output_layout(detection_output_sort_node const& node) {
    assert(static_cast<bool>(node.get_primitive()->output_data_type) == false &&
           "Output data type forcing is not supported for "
           "detection_output_sort_node!");
    CLDNN_ERROR_NOT_EQUAL(node.id(),
                          "Detection output layer input number",
                          node.get_dependencies().size(),
                          "expected number of inputs",
                          static_cast<size_t>(1),
                          "");

    auto input_layout = node.input().get_output_layout();
    int keep_top_k = node.as<detection_output_sort>().get_primitive()->keep_top_k;
    int num_images = node.as<detection_output_sort>().get_primitive()->num_images;

    // If detection output sort is used as the second part of detection output, get the proper info from the detection output node.
    if (num_images == 0) {
        CLDNN_ERROR_BOOL(node.id(),
                         "node.get_dependency(0).is_type<detection_output>()",
                         !node.get_dependency(0).is_type<detection_output>(),
                         "Cannot calculate output layout.");
        input_layout = node.get_dependency(0).as<detection_output>().location().get_output_layout();
        keep_top_k = node.get_dependency(0).as<detection_output>().get_primitive()->keep_top_k;
        num_images = input_layout.size.batch[0];
    }
    // Batch size and feature size are 1.
    // Number of bounding boxes to be kept is set to keep_top_k*batch size.
    // If the number of detections is lower than keep_top_k, dummy results with image_id=-1 are written at the end.
    // Each row is a 7 dimension vector, which stores:
    // [image_id, label, confidence, xmin, ymin, xmax, ymax]
    return {input_layout.data_type,
            cldnn::format::bfyx,
            cldnn::tensor(1, 1, DETECTION_OUTPUT_ROW_SIZE, keep_top_k * num_images)};
}

std::string detection_output_sort_inst::to_string(detection_output_sort_node const& node) {
    auto node_info = node.desc_to_json();
    auto desc = node.get_primitive();

    auto& input_bboxes = node.input();

    std::stringstream primitive_description;

    json_composite detec_out_info;
    detec_out_info.add("input bboxes id", input_bboxes.id());
    detec_out_info.add("num_classes:", desc->num_images);
    detec_out_info.add("num_classes:", desc->num_classes);
    detec_out_info.add("keep_top_k", desc->keep_top_k);
    detec_out_info.add("share_location", desc->share_location);
    detec_out_info.add("top_k", desc->top_k);
    detec_out_info.dump(primitive_description);

    node_info->add("dection output info", detec_out_info);
    node_info->dump(primitive_description);

    return primitive_description.str();
}

detection_output_sort_inst::typed_primitive_inst(network_impl& network, detection_output_sort_node const& node)
    : parent(network, node) {
    CLDNN_ERROR_NOT_PROPER_FORMAT(node.id(),
                                  "Input memory format",
                                  node.get_dependency(0).get_output_layout().format.value,
                                  "expected bfyx input format",
                                  format::bfyx);

    CLDNN_ERROR_BOOL(node.id(),
                     "Detecion output layer padding",
                     node.is_padded(),
                     "Detection output layer doesn't support output padding.");
}
}  // namespace cldnn