// Copyright (C) 2018-2020 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#include "list.hpp"
#include "base.hpp"

#include <cmath>
#include <limits>
#include <cfloat>
#include <string>
#include <vector>
#include <cassert>
#include <ie_util_internal.hpp>
#include "ie_parallel.hpp"

namespace InferenceEngine {
namespace Extensions {
namespace Cpu {

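// CPU reference implementation of the Reduce* layer family (ReduceSum, ReduceMax,
// ReduceMean, ...). Input REDUCE_DATA is the tensor to reduce; input REDUCE_INDEXES
// is a 1-D I32 vector of axes to reduce over.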
class ReduceImpl: public ExtLayerBase {
public:
    explicit ReduceImpl(const CNNLayer* layer) {
        try {
            if (layer->insData.empty() || layer->outData.empty())
                THROW_IE_EXCEPTION << layer->name << " Incorrect number of input/output edges!";

            if (layer->insData.size() != 2)
                THROW_IE_EXCEPTION << layer->name << " Incorrect number of input edges!";

            idx_dims = layer->insData[REDUCE_INDEXES].lock()->getTensorDesc().getDims();
            if (idx_dims.size() > 1)
                THROW_IE_EXCEPTION << layer->name << " Index vector should be 1 dimension";

            if (layer->insData[REDUCE_DATA].lock()->getTensorDesc().getPrecision() != Precision::FP32 &&
                layer->insData[REDUCE_DATA].lock()->getTensorDesc().getPrecision() != Precision::I32 &&
                layer->insData[REDUCE_DATA].lock()->getTensorDesc().getPrecision() != Precision::U8)
                THROW_IE_EXCEPTION << layer->name << " Incorrect input data tensor precision. Only FP32/I32/U8 are supported!";

            if (layer->insData[REDUCE_INDEXES].lock()->getTensorDesc().getPrecision() != Precision::I32)
                THROW_IE_EXCEPTION << layer->name << " Incorrect 'axes_to_reduction' input precision. Only I32 is supported!";

            data_dims = layer->insData[REDUCE_DATA].lock()->getTensorDesc().getDims();
            SizeVector dst_dims = layer->outData[0]->getTensorDesc().getDims();

            keep_dims = layer->GetParamAsBool("keep_dims", true);
            if (keep_dims) {
                if (data_dims.size() != dst_dims.size())
                    THROW_IE_EXCEPTION << layer->name << " Incorrect number of input/output dimensions!";
            } else {
                if (data_dims.size() <= dst_dims.size())
                    THROW_IE_EXCEPTION << layer->name << " Incorrect number of input/output dimensions!";
            }

            std::string reduce_mode = layer->type;
            if (reduce_mode == "ReduceAnd") reduceMode = Reduce::And;
            else if (reduce_mode == "ReduceL1") reduceMode = Reduce::L1;
            else if (reduce_mode == "ReduceL2") reduceMode = Reduce::L2;
            else if (reduce_mode == "ReduceLogSum") reduceMode = Reduce::LogSum;
            else if (reduce_mode == "ReduceLogSumExp") reduceMode = Reduce::LogSumExp;
            else if (reduce_mode == "ReduceMax") reduceMode = Reduce::Max;
            else if (reduce_mode == "ReduceMean") reduceMode = Reduce::Mean;
            else if (reduce_mode == "ReduceMin") reduceMode = Reduce::Min;
            else if (reduce_mode == "ReduceOr") reduceMode = Reduce::Or;
            else if (reduce_mode == "ReduceProd") reduceMode = Reduce::Prod;
            else if (reduce_mode == "ReduceSum") reduceMode = Reduce::Sum;
            else if (reduce_mode == "ReduceSumSquare") reduceMode = Reduce::SumSquare;
            else
                THROW_IE_EXCEPTION << layer->name << " Incorrect Reduce layer type!";

            src_dims = layer->insData[REDUCE_DATA].lock()->getTensorDesc().getDims();
            srcStrides = layer->insData[REDUCE_DATA].lock()->getTensorDesc().getBlockingDesc().getStrides();

            addConfig(layer, { { ConfLayout::PLN, false }, { ConfLayout::PLN, false } }, { { ConfLayout::PLN, false } });
        } catch (InferenceEngine::details::InferenceEngineException &ex) {
            errorMsg = ex.what();
        }
    }

    StatusCode execute(std::vector<Blob::Ptr>& inputs, std::vector<Blob::Ptr>& outputs, ResponseDesc *resp) noexcept override {
        int32_t *idx_data = inputs[REDUCE_INDEXES]->cbuffer().as<int32_t *>() +
                            inputs[REDUCE_INDEXES]->getTensorDesc().getBlockingDesc().getOffsetPadding();
        SizeVector axes;
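        // Normalize axes: a negative axis counts back from the innermost dimension.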
        const size_t num_axes = idx_dims.empty() ? 0 : idx_dims[0];
        for (size_t i = 0; i < num_axes; i++) {
            int32_t axis = idx_data[i];
            if (axis < 0)
                axis += data_dims.size();

            if (static_cast<size_t>(axis) >= data_dims.size()) {
                if (resp) {
                    std::string errorMsg = "Index to reduce exceeds data tensor dimension";
                    errorMsg.copy(resp->msg, sizeof(resp->msg) - 1);
                }
                return PARAMETER_MISMATCH;
            }
            axes.push_back(static_cast<size_t>(axis));
        }

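        // Classify every source dimension as reduced or kept. reduced_dims_work_amount
        // is the number of source elements that collapse into each output element.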
        size_t reduced_dims_work_amount = 1;
        InferenceEngine::SizeVector our_dims, out_dims, axes_for_reduction;
        for (size_t i = 0; i < src_dims.size(); i++) {
            bool found = false;
            for (size_t axis : axes)
                if (i == axis) found = true;

            if (found) {
                axes_for_reduction.push_back(i);
                reduced_dims_work_amount *= src_dims[i];
                if (keep_dims) out_dims.push_back(1);
                our_dims.push_back(1);
            } else {
                out_dims.push_back(src_dims[i]);
                our_dims.push_back(src_dims[i]);
            }
        }

        if (!our_dims.size())
            our_dims = InferenceEngine::SizeVector(1, 1);

        InferenceEngine::SizeVector dst_dims = outputs[0]->getTensorDesc().getDims();
        for (size_t i = 0; i < (std::min)(out_dims.size(), dst_dims.size()); i++) {
            if (out_dims[i] != dst_dims[i]) {
                if (resp) {
                    std::string errorMsg = "Incorrect number of output dimensions!";
                    errorMsg.copy(resp->msg, sizeof(resp->msg) - 1);
                }
                return PARAMETER_MISMATCH;
            }
        }

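        // work_amount_dst is the total number of output elements (outer stride times outer dim).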
        size_t work_amount_dst;
        if (!dst_dims.size()) {
            work_amount_dst = 1;
        } else {
            size_t stride = !outputs[0]->getTensorDesc().getBlockingDesc().getStrides().empty()
                    ? outputs[0]->getTensorDesc().getBlockingDesc().getStrides()[0]
                    : 1;
            work_amount_dst = stride * dst_dims[0];
        }

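        // Dispatch on the (input, output) precision pair packed into a single mask.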
        auto compare = getPrecisionMask(inputs[REDUCE_DATA]->getTensorDesc().getPrecision(), outputs[0]->getTensorDesc().getPrecision());
        switch (compare) {
            case getPrecisionMask(Precision::FP32, Precision::FP32):
                return reduce_type<float, float>(inputs, outputs, work_amount_dst, reduced_dims_work_amount, axes_for_reduction, our_dims);
            case getPrecisionMask(Precision::I32, Precision::I64):
                return reduce_type<int32_t, int64_t>(inputs, outputs, work_amount_dst, reduced_dims_work_amount, axes_for_reduction, our_dims);
            case getPrecisionMask(Precision::I32, Precision::FP32):
                return reduce_type<int32_t, float>(inputs, outputs, work_amount_dst, reduced_dims_work_amount, axes_for_reduction, our_dims);
            case getPrecisionMask(Precision::I32, Precision::I32):
                return reduce_type<int32_t, int32_t>(inputs, outputs, work_amount_dst, reduced_dims_work_amount, axes_for_reduction, our_dims);
            case getPrecisionMask(Precision::U8, Precision::U8):
                return reduce_type<uint8_t, uint8_t>(inputs, outputs, work_amount_dst, reduced_dims_work_amount, axes_for_reduction, our_dims);
            case getPrecisionMask(Precision::FP32, Precision::U8):
                return reduce_type<float, uint8_t>(inputs, outputs, work_amount_dst, reduced_dims_work_amount, axes_for_reduction, our_dims);
            default:
                if (resp) {
                    std::string errorMsg = "Incorrect Reduce layer type";
                    errorMsg.copy(resp->msg, sizeof(resp->msg) - 1);
                }
                return GENERAL_ERROR;
        }
    }

private:
    template <typename src_d, typename dst_t, typename F1, typename F2>
    void reduce(const src_d *src_data, dst_t* dst_data, size_t work_amount_dst, size_t reduced_dims_work_amount,
        SizeVector axes_for_reduction, SizeVector dst_dims, dst_t init_value, F1 func1, F2 func2);
    template <typename src_d, typename dst_t>
    StatusCode reduce_type(std::vector<Blob::Ptr>& inputs, std::vector<Blob::Ptr>& outputs, size_t work_amount_dst, size_t reduced_dims_work_amount,
                SizeVector axes_for_reduction, SizeVector dst_dims);
    enum class Reduce { And, L1, L2, LogSum, LogSumExp, Max, Mean, Min, Or, Prod, Sum, SumSquare };

    const size_t REDUCE_DATA = 0;
    const size_t REDUCE_INDEXES = 1;
    bool keep_dims = true;
    Reduce reduceMode = Reduce::Sum;
    SizeVector data_dims;
    SizeVector idx_dims;
    SizeVector src_dims;
    SizeVector srcStrides;
};

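// Selects the initial value and the accumulation/merge functors for the active reduce
// mode, then runs the generic reduction. func1 folds a source element into an
// accumulator; func2 merges two partial accumulators.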
template <typename src_d, typename dst_t>
StatusCode ReduceImpl::reduce_type(
        std::vector<Blob::Ptr>& inputs,
        std::vector<Blob::Ptr>& outputs,
        size_t       work_amount_dst,
        size_t       reduced_dims_work_amount,
        SizeVector   axes_for_reduction,
        SizeVector   our_dims
) {
    const src_d *src_data = inputs[REDUCE_DATA]->cbuffer().as<src_d *>() +
                            inputs[REDUCE_DATA]->getTensorDesc().getBlockingDesc().getOffsetPadding();
    dst_t* dst_data = outputs[0]->buffer().as<dst_t *>() +
                      outputs[0]->getTensorDesc().getBlockingDesc().getOffsetPadding();

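    // Modes that are not plain folds (L2, LogSum, LogSumExp, Mean) run an extra
    // element-wise post-pass over the accumulated output below.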
    switch (reduceMode) {
        case Reduce::And:
            reduce<src_d, dst_t>(src_data, dst_data, work_amount_dst, reduced_dims_work_amount, axes_for_reduction, our_dims, static_cast<dst_t>(1),
                   [](dst_t x, src_d y)->dst_t { return x && y; },
                   [](dst_t x, src_d y)->dst_t { return x && y; });
            break;
        case Reduce::L1:
            reduce<src_d, dst_t>(src_data, dst_data, work_amount_dst, reduced_dims_work_amount, axes_for_reduction, our_dims, static_cast<dst_t>(0),
                   [](dst_t old, src_d y)->dst_t { return old + (std::abs)(y); },
                   [](dst_t x, src_d y)->dst_t { return x + y; });
            break;
        case Reduce::L2:
            reduce<src_d, dst_t>(src_data, dst_data, work_amount_dst, reduced_dims_work_amount, axes_for_reduction, our_dims, static_cast<dst_t>(0),
                   [](dst_t old, src_d y)->dst_t { return old + y * y; },
                   [](dst_t x, src_d y)->dst_t { return x + y; });
            parallel_for(work_amount_dst, [&](size_t i) {
                dst_data[i] = sqrt(dst_data[i]);
            });
            break;
        case Reduce::LogSum:
            reduce<src_d, dst_t>(src_data, dst_data, work_amount_dst, reduced_dims_work_amount, axes_for_reduction, our_dims, static_cast<dst_t>(0),
                   [](dst_t x, src_d y)->dst_t { return x + y; },
                   [](dst_t x, src_d y)->dst_t { return x + y; });
            parallel_for(work_amount_dst, [&](size_t i) {
                dst_data[i] = logf(dst_data[i]);
            });
            break;
        case Reduce::LogSumExp:
            reduce<src_d, dst_t>(src_data, dst_data, work_amount_dst, reduced_dims_work_amount, axes_for_reduction, our_dims, static_cast<dst_t>(0),
                   [](dst_t old, src_d y)->dst_t { return old + expf(y); },
                   [](dst_t x, src_d y)->dst_t { return x + y; });
            parallel_for(work_amount_dst, [&](size_t i) {
                dst_data[i] = logf(dst_data[i]);
            });
            break;
        case Reduce::Max:
            reduce<src_d, dst_t>(src_data, dst_data, work_amount_dst, reduced_dims_work_amount, axes_for_reduction, our_dims,
                                 // lowest(), not min(): for floating-point dst_t, min() is the smallest positive value
                                 (std::numeric_limits<dst_t>::lowest)(),
                   [](dst_t x, src_d y)->dst_t { return x > y ? x : y; },
                   [](dst_t x, src_d y)->dst_t { return x > y ? x : y; });
            break;
        case Reduce::Mean:
            reduce<src_d, dst_t>(src_data, dst_data, work_amount_dst, reduced_dims_work_amount, axes_for_reduction, our_dims, static_cast<dst_t>(0),
                   [](dst_t x, src_d y)->dst_t { return x + y; },
                   [](dst_t x, src_d y)->dst_t { return x + y; });
            parallel_for(work_amount_dst, [&](size_t i) {
                dst_data[i] /= static_cast<dst_t>(reduced_dims_work_amount);
            });
            break;
        case Reduce::Min:
            reduce<src_d, dst_t>(src_data, dst_data, work_amount_dst, reduced_dims_work_amount, axes_for_reduction, our_dims,
                                 (std::numeric_limits<dst_t>::max)(),
                   [](dst_t x, src_d y)->dst_t { return x < y ? x : y; },
                   [](dst_t x, src_d y)->dst_t { return x < y ? x : y; });
            break;
        case Reduce::Or:
            reduce<src_d, dst_t>(src_data, dst_data, work_amount_dst, reduced_dims_work_amount, axes_for_reduction, our_dims, static_cast<dst_t>(0),
                   [](dst_t x, src_d y)->dst_t { return x || y; },
                   [](dst_t x, src_d y)->dst_t { return x || y; });
            break;
        case Reduce::Prod:
            reduce<src_d, dst_t>(src_data, dst_data, work_amount_dst, reduced_dims_work_amount, axes_for_reduction, our_dims, static_cast<dst_t>(1),
                   [](dst_t x, src_d y)->dst_t { return x * y; },
                   [](dst_t x, src_d y)->dst_t { return x * y; });
            break;
        case Reduce::Sum:
            reduce<src_d, dst_t>(src_data, dst_data, work_amount_dst, reduced_dims_work_amount, axes_for_reduction, our_dims, static_cast<dst_t>(0),
                   [](dst_t x, src_d y)->dst_t { return x + y; },
                   [](dst_t x, src_d y)->dst_t { return x + y; });
            break;
        case Reduce::SumSquare:
            reduce<src_d, dst_t>(src_data, dst_data, work_amount_dst, reduced_dims_work_amount, axes_for_reduction, our_dims, static_cast<dst_t>(0),
                   [](dst_t old, src_d y)->dst_t { return old + y * y; },
                   [](dst_t x, src_d y)->dst_t { return x + y; });
            break;
        default:
            return GENERAL_ERROR;
    }
    return OK;
}

template <typename src_d, typename dst_t, typename F1, typename F2>
void ReduceImpl::reduce(
    const src_d *src_data,
    dst_t       *dst_data,
    size_t       work_amount_dst,
    size_t       reduced_dims_work_amount,
    SizeVector   axes_for_reduction,
    SizeVector   dst_dims,
    dst_t        init_value,
    F1           func1,
    F2           func2
) {
    unsigned int nthr = parallel_get_max_threads();
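    // Two strategies: with enough output elements, parallelize across outputs and let
    // each thread walk its own slice of reduced coordinates; otherwise split the input
    // across threads into per-thread partial buffers and merge them with func2 at the end.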
    if ((work_amount_dst + 1) >= nthr) {
        parallel_nt(0, [&](const int ithr, const int nthr) {
            int j;
            size_t i, start = 0, end = 0;
            SizeVector dst_counters(dst_dims.size(), 0);
            splitter(work_amount_dst, nthr, ithr, start, end);
            for (j = dst_dims.size() - 1, i = start; j >= 0; j--) {
                dst_counters[j] = i % dst_dims[j];
                i /= dst_dims[j];
            }
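            // Walk the output range [start, end); for each output element iterate all
            // reduced coordinates, tracking the flat source index incrementally.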
            for (size_t src_idx = 0, dst_idx = start; dst_idx < end; ++dst_idx) {
                dst_t reduce_prod = init_value;
                bool update_idx = true;
                SizeVector src_counters = dst_counters;
                for (i = 0; i < reduced_dims_work_amount; ++i) {
                    if (update_idx) {
                        src_idx = 0;
                        for (j = 0; j < static_cast<int>(src_dims.size()); ++j)
                            src_idx += (src_counters[j] % src_dims[j]) * srcStrides[j];
                        update_idx = false;
                    }
                    reduce_prod = func1(reduce_prod, src_data[src_idx]);
                    for (j = axes_for_reduction.size() - 1; j >= 0; j--) {
                        src_counters[axes_for_reduction[j]]++;
                        if (src_counters[axes_for_reduction[j]] < src_dims[axes_for_reduction[j]]) {
                            src_idx += srcStrides[axes_for_reduction[j]];
                            break;
                        } else {
                            src_counters[axes_for_reduction[j]] = 0;
                            update_idx = true;
                        }
                    }
                }
                dst_data[dst_idx] = reduce_prod;
                for (j = dst_dims.size() - 1; j >= 0; j--) {
                    dst_counters[j]++;
                    if (dst_counters[j] < dst_dims[j])
                        break;
                    else
                        dst_counters[j] = 0;
                }
            }
        });
    } else {
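        // Few outputs relative to thread count: each thread accumulates into its own
        // work_amount_dst-sized slice of reduce_prod.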
        std::vector<dst_t> reduce_prod((nthr * work_amount_dst), init_value);
        if (work_amount_dst == 1) {
            parallel_nt(nthr, [&](const int ithr, const int nthr) {
                size_t i, start = 0, end = 0;
                splitter((srcStrides[0] * src_dims[0]), nthr, ithr, start, end);
                for (i = start; i < end; ++i)
                    reduce_prod[ithr] = func1(reduce_prod[ithr], src_data[i]);
            });
        } else {
            SizeVector dstStrides(dst_dims.size(), 1);
            for (int j = dst_dims.size() - 1; j >= 1; --j)
                dstStrides[j - 1] = dstStrides[j] * dst_dims[j];
            parallel_nt(nthr, [&](const int ithr, const int nthr) {
                int j;
                bool update_idx = true;
                size_t i, src_idx, dst_idx = 0, start = 0, end = 0;
                splitter((srcStrides[0] * src_dims[0]), nthr, ithr, start, end);
                SizeVector src_counters(src_dims.size(), 0);
                for (j = src_dims.size() - 1, src_idx = start; j >= 0; j--) {
                    src_counters[j] = src_idx % src_dims[j];
                    src_idx /= src_dims[j];
                }
                for (src_idx = start; src_idx < end; ++src_idx) {
                    if (update_idx) {
                        for (i = 0, dst_idx = 0; i < dst_dims.size(); ++i)
                            dst_idx += (src_counters[i] % dst_dims[i]) * dstStrides[i];
                        update_idx = false;
                    }
                    reduce_prod[ithr * work_amount_dst + dst_idx] = func1(reduce_prod[ithr * work_amount_dst + dst_idx], src_data[src_idx]);
                    for (j = src_dims.size() - 1; j >= 0; j--) {
                        src_counters[j]++;
                        if (src_counters[j] < src_dims[j]) {
                            if (dst_dims[j] > 1) dst_idx += dstStrides[j];
                            break;
                        } else {
                            src_counters[j] = 0;
                            update_idx = true;
                        }
                    }
                }
            });
        }
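        // Merge the per-thread partial buffers into thread 0's slice and write out.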
        for (size_t dst_idx = 0; dst_idx < work_amount_dst; dst_idx++) {
            for (size_t ithr = work_amount_dst; ithr < (nthr * work_amount_dst); ithr += work_amount_dst)
                reduce_prod[dst_idx] = func2(reduce_prod[dst_idx], reduce_prod[dst_idx + ithr]);
            dst_data[dst_idx] = reduce_prod[dst_idx];
        }
    }
}

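// One registration per supported Reduce variant; the layer type string selects the mode.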
REG_FACTORY_FOR(ImplFactory<ReduceImpl>, ReduceAnd);
REG_FACTORY_FOR(ImplFactory<ReduceImpl>, ReduceL1);
REG_FACTORY_FOR(ImplFactory<ReduceImpl>, ReduceL2);
REG_FACTORY_FOR(ImplFactory<ReduceImpl>, ReduceLogSum);
REG_FACTORY_FOR(ImplFactory<ReduceImpl>, ReduceLogSumExp);
REG_FACTORY_FOR(ImplFactory<ReduceImpl>, ReduceMax);
REG_FACTORY_FOR(ImplFactory<ReduceImpl>, ReduceMean);
REG_FACTORY_FOR(ImplFactory<ReduceImpl>, ReduceMin);
REG_FACTORY_FOR(ImplFactory<ReduceImpl>, ReduceOr);
REG_FACTORY_FOR(ImplFactory<ReduceImpl>, ReduceProd);
REG_FACTORY_FOR(ImplFactory<ReduceImpl>, ReduceSum);
REG_FACTORY_FOR(ImplFactory<ReduceImpl>, ReduceSumSquare);

}  // namespace Cpu
}  // namespace Extensions
}  // namespace InferenceEngine