//*****************************************************************************
// Copyright 2017-2019 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//*****************************************************************************

#include <random>
#if defined(__x86_64__) || defined(__amd64__)
#include <xmmintrin.h>
#endif

#include "benchmark.hpp"
#include "ngraph/file_util.hpp"
#include "ngraph/runtime/backend.hpp"
#include "ngraph/runtime/host_tensor.hpp"
#include "ngraph/runtime/tensor.hpp"
#include "ngraph/serializer.hpp"
#include "ngraph/util.hpp"

using namespace std;
using namespace ngraph;

static default_random_engine s_random_engine;

// Configure the FPU to treat denormal (subnormal) floats as zero.
// Random benchmark inputs can produce denormals, which are dramatically
// slower to process on x86, so flushing them keeps timings meaningful.
// On non-x86 targets this function is a no-op.
void set_denormals_flush_to_zero()
{
#if defined(__x86_64__) || defined(__amd64__)
    _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON);
    _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);
#endif
}

44
template <typename T>
Robert Kimball's avatar
Robert Kimball committed
45
void init_int_tensor(shared_ptr<runtime::Tensor> tensor, T min, T max)
46
{
Robert Kimball's avatar
Robert Kimball committed
47
    size_t size = tensor->get_element_count();
48
    uniform_int_distribution<T> dist(min, max);
49
    vector<T> vec(size);
50 51 52 53
    for (T& element : vec)
    {
        element = dist(s_random_engine);
    }
Robert Kimball's avatar
Robert Kimball committed
54
    tensor->write(vec.data(), vec.size() * sizeof(T));
55 56
}

57
template <>
Robert Kimball's avatar
Robert Kimball committed
58
void init_int_tensor<char>(shared_ptr<runtime::Tensor> tensor, char min, char max)
59
{
Robert Kimball's avatar
Robert Kimball committed
60
    size_t size = tensor->get_element_count();
61 62 63 64 65 66
    uniform_int_distribution<int16_t> dist(static_cast<short>(min), static_cast<short>(max));
    vector<char> vec(size);
    for (char& element : vec)
    {
        element = static_cast<char>(dist(s_random_engine));
    }
Robert Kimball's avatar
Robert Kimball committed
67
    tensor->write(vec.data(), vec.size() * sizeof(char));
68 69 70
}

template <>
Robert Kimball's avatar
Robert Kimball committed
71
void init_int_tensor<int8_t>(shared_ptr<runtime::Tensor> tensor, int8_t min, int8_t max)
72
{
Robert Kimball's avatar
Robert Kimball committed
73
    size_t size = tensor->get_element_count();
74 75 76 77 78 79
    uniform_int_distribution<int16_t> dist(static_cast<short>(min), static_cast<short>(max));
    vector<int8_t> vec(size);
    for (int8_t& element : vec)
    {
        element = static_cast<int8_t>(dist(s_random_engine));
    }
Robert Kimball's avatar
Robert Kimball committed
80
    tensor->write(vec.data(), vec.size() * sizeof(int8_t));
81 82 83
}

template <>
Robert Kimball's avatar
Robert Kimball committed
84
void init_int_tensor<uint8_t>(shared_ptr<runtime::Tensor> tensor, uint8_t min, uint8_t max)
85
{
Robert Kimball's avatar
Robert Kimball committed
86
    size_t size = tensor->get_element_count();
87 88 89 90 91 92
    uniform_int_distribution<int16_t> dist(static_cast<short>(min), static_cast<short>(max));
    vector<uint8_t> vec(size);
    for (uint8_t& element : vec)
    {
        element = static_cast<uint8_t>(dist(s_random_engine));
    }
Robert Kimball's avatar
Robert Kimball committed
93
    tensor->write(vec.data(), vec.size() * sizeof(uint8_t));
94 95
}

96
template <typename T>
Robert Kimball's avatar
Robert Kimball committed
97
void init_real_tensor(shared_ptr<runtime::Tensor> tensor, T min, T max)
98
{
Robert Kimball's avatar
Robert Kimball committed
99
    size_t size = tensor->get_element_count();
100
    uniform_real_distribution<T> dist(min, max);
101
    vector<T> vec(size);
102 103 104 105
    for (T& element : vec)
    {
        element = dist(s_random_engine);
    }
Robert Kimball's avatar
Robert Kimball committed
106
    tensor->write(vec.data(), vec.size() * sizeof(T));
107 108
}

Robert Kimball's avatar
Robert Kimball committed
109
static void random_init(shared_ptr<runtime::Tensor> tensor)
110
{
Robert Kimball's avatar
Robert Kimball committed
111 112 113 114 115 116
    element::Type et = tensor->get_element_type();
#if !(defined(__GNUC__) && (__GNUC__ == 4 && __GNUC_MINOR__ == 8))
#pragma GCC diagnostic push
#pragma GCC diagnostic error "-Wswitch"
#pragma GCC diagnostic error "-Wswitch-enum"
#endif
117 118
    switch (et.get_type_enum())
    {
Robert Kimball's avatar
Robert Kimball committed
119 120 121 122 123 124 125 126 127 128 129 130 131 132 133
    case element::Type_t::boolean: init_int_tensor<char>(tensor, 0, 1); break;
    case element::Type_t::f32: init_real_tensor<float>(tensor, -1, 1); break;
    case element::Type_t::f64: init_real_tensor<double>(tensor, -1, 1); break;
    case element::Type_t::i8: init_int_tensor<int8_t>(tensor, -1, 1); break;
    case element::Type_t::i16: init_int_tensor<int16_t>(tensor, -1, 1); break;
    case element::Type_t::i32: init_int_tensor<int32_t>(tensor, 0, 1); break;
    case element::Type_t::i64: init_int_tensor<int64_t>(tensor, 0, 1); break;
    case element::Type_t::u8: init_int_tensor<uint8_t>(tensor, 0, 1); break;
    case element::Type_t::u16: init_int_tensor<uint16_t>(tensor, 0, 1); break;
    case element::Type_t::u32: init_int_tensor<uint32_t>(tensor, 0, 1); break;
    case element::Type_t::u64: init_int_tensor<uint64_t>(tensor, 0, 1); break;
    case element::Type_t::undefined:
    case element::Type_t::dynamic:
    case element::Type_t::bf16:
    case element::Type_t::f16:
134
    default: throw runtime_error("unsupported type");
135
    }
Robert Kimball's avatar
Robert Kimball committed
136 137 138
#if !(defined(__GNUC__) && (__GNUC__ == 4 && __GNUC_MINOR__ == 8))
#pragma GCC diagnostic pop
#endif
139 140
}

141 142 143 144
vector<runtime::PerformanceCounter> run_benchmark(shared_ptr<Function> f,
                                                  const string& backend_name,
                                                  size_t iterations,
                                                  bool timing_detail,
145 146
                                                  int warmup_iterations,
                                                  bool copy_data)
147 148 149
{
    stopwatch timer;
    timer.start();
150
    auto backend = runtime::Backend::create(backend_name);
151
    auto compiled_func = backend->compile(f, timing_detail);
152 153 154
    timer.stop();
    cout.imbue(locale(""));
    cout << "compile time: " << timer.get_milliseconds() << "ms" << endl;
Ashok Emani's avatar
Ashok Emani committed
155

156 157
    vector<shared_ptr<runtime::HostTensor>> arg_data;
    vector<shared_ptr<runtime::Tensor>> args;
158
    vector<bool> args_cacheable;
Ashok Emani's avatar
Ashok Emani committed
159 160
    for (shared_ptr<op::Parameter> param : f->get_parameters())
    {
161
        auto tensor = backend->create_tensor(param->get_element_type(), param->get_shape());
162
        auto tensor_data =
163
            make_shared<runtime::HostTensor>(param->get_element_type(), param->get_shape());
164
        random_init(tensor_data);
165 166
        tensor->write(tensor_data->get_data_ptr(),
                      tensor_data->get_element_count() * tensor_data->get_element_type().size());
Ashok Emani's avatar
Ashok Emani committed
167
        args.push_back(tensor);
168
        arg_data.push_back(tensor_data);
169
        args_cacheable.push_back(param->get_cacheable());
Ashok Emani's avatar
Ashok Emani committed
170
    }
171 172
    set_denormals_flush_to_zero();

173 174
    vector<shared_ptr<runtime::HostTensor>> result_data;
    vector<shared_ptr<runtime::Tensor>> results;
Ashok Emani's avatar
Ashok Emani committed
175 176
    for (shared_ptr<Node> out : f->get_results())
    {
177
        auto result = backend->create_tensor(out->get_element_type(), out->get_shape());
178
        auto tensor_data =
179
            make_shared<runtime::HostTensor>(out->get_element_type(), out->get_shape());
Ashok Emani's avatar
Ashok Emani committed
180
        results.push_back(result);
181
        result_data.push_back(tensor_data);
Ashok Emani's avatar
Ashok Emani committed
182 183
    }

184 185 186 187 188 189 190
    for (size_t i = 0; i < args.size(); i++)
    {
        if (args_cacheable[i])
        {
            args[i]->set_stale(false);
        }
    }
191

192 193
    stopwatch t1;
    for (size_t i = 0; i < iterations + warmup_iterations; i++)
194
    {
195
        if (i == warmup_iterations)
196
        {
197
            t1.start();
198
        }
199 200 201 202
        if (copy_data)
        {
            for (size_t arg_index = 0; arg_index < args.size(); arg_index++)
            {
203
                const shared_ptr<runtime::Tensor>& arg = args[arg_index];
204 205
                if (arg->get_stale())
                {
206
                    const shared_ptr<runtime::HostTensor>& data = arg_data[arg_index];
207
                    arg->write(data->get_data_ptr(),
208
                               data->get_element_count() * data->get_element_type().size());
209 210 211
                }
            }
        }
212
        compiled_func->call(results, args);
213 214 215 216
        if (copy_data)
        {
            for (size_t result_index = 0; result_index < results.size(); result_index++)
            {
217 218
                const shared_ptr<runtime::HostTensor>& data = result_data[result_index];
                const shared_ptr<runtime::Tensor>& result = results[result_index];
219 220
                result->read(data->get_data_ptr(),
                             data->get_element_count() * data->get_element_type().size());
221 222
            }
        }
Ashok Emani's avatar
Ashok Emani committed
223 224 225 226 227
    }
    t1.stop();
    float time = t1.get_milliseconds();
    cout << time / iterations << "ms per iteration" << endl;

228
    vector<runtime::PerformanceCounter> perf_data = compiled_func->get_performance_data();
229
    return perf_data;
Ashok Emani's avatar
Ashok Emani committed
230
}
// Double-buffered variant of run_benchmark: uses the backend's asynchronous
// begin_write / begin_execute / begin_read API with two buffer slots so that
// host<->device transfers for one iteration can overlap execution of the
// previous one.  Returns the compiled function's per-op performance counters.
// NOTE(review): copy_data is accepted for signature parity with
// run_benchmark but is never consulted here — transfers always happen.
vector<runtime::PerformanceCounter> run_benchmark_double_buffered(shared_ptr<Function> f,
                                                                  const string& backend_name,
                                                                  size_t iterations,
                                                                  bool timing_detail,
                                                                  int warmup_iterations,
                                                                  bool copy_data)
{
    // Compile the function; compile time is reported separately from the
    // timed execution loop below.
    stopwatch timer;
    timer.start();
    auto backend = runtime::Backend::create(backend_name);
    auto compiled_func = backend->compile(f, timing_detail);
    timer.stop();
    cout.imbue(locale(""));
    cout << "compile time: " << timer.get_milliseconds() << "ms" << endl;
    set_denormals_flush_to_zero();

    // Two complete sets of device/host tensors, one per buffer slot.
    // Each parameter's host tensor is filled with random data and uploaded
    // once here; outputs get uninitialized host shadows for readback.
    array<vector<shared_ptr<runtime::HostTensor>>, 2> args_data_set;
    array<vector<shared_ptr<runtime::Tensor>>, 2> args_set;
    array<vector<shared_ptr<runtime::HostTensor>>, 2> results_data_set;
    array<vector<shared_ptr<runtime::Tensor>>, 2> results_set;
    for (size_t i = 0; i < 2; i++)
    {
        vector<shared_ptr<runtime::HostTensor>> args_data;
        vector<shared_ptr<runtime::Tensor>> args;
        for (shared_ptr<op::Parameter> param : f->get_parameters())
        {
            auto tensor = backend->create_tensor(param->get_element_type(), param->get_shape());
            auto tensor_data =
                make_shared<runtime::HostTensor>(param->get_element_type(), param->get_shape());
            random_init(tensor_data);
            tensor->write(tensor_data->get_data_ptr(),
                          tensor_data->get_element_count() *
                              tensor_data->get_element_type().size());
            args.push_back(tensor);
            args_data.push_back(tensor_data);
        }
        args_set[i] = args;
        args_data_set[i] = args_data;
        vector<shared_ptr<runtime::Tensor>> results;
        vector<shared_ptr<runtime::HostTensor>> results_data;
        for (shared_ptr<Node> out : f->get_results())
        {
            auto result = backend->create_tensor(out->get_element_type(), out->get_shape());
            auto result_data =
                make_shared<runtime::HostTensor>(out->get_element_type(), out->get_shape());
            results.push_back(result);
            results_data.push_back(result_data);
        }
        results_set[i] = results;
        results_data_set[i] = results_data;
    }

    stopwatch t1;

    // Before we start we write the first iteration's data
    // NOTE(review): `args`/`results` below stay bound to buffer set 0 for
    // the whole benchmark; only the slot index passed to begin_write /
    // begin_read alternates.  Presumably that index selects the tensor's
    // internal staging buffer — confirm against the async Tensor API.
    size_t buffer_number = 0;
    auto args = args_set[buffer_number];
    auto args_data = args_data_set[buffer_number];
    for (size_t arg_index = 0; arg_index < args.size(); arg_index++)
    {
        const shared_ptr<runtime::Tensor>& arg = args[arg_index];
        const shared_ptr<runtime::HostTensor>& data = args_data[arg_index];
        arg->begin_write(data->get_data_ptr(),
                         data->get_element_count() * data->get_element_type().size(),
                         buffer_number);
    }

    const vector<shared_ptr<runtime::Tensor>>& results = results_set[buffer_number];
    const vector<shared_ptr<runtime::HostTensor>>& results_data = results_data_set[buffer_number];
    for (size_t i = 0; i < iterations + warmup_iterations; i++)
    {
        if (i == warmup_iterations)
        {
            // Warmup done; everything from here on is timed.
            t1.start();
        }
        future<void> exec_future = compiled_func->begin_execute(results, args);
        if (i > 0)
        {
            // Read back the previous iteration's results while this one
            // runs.  buffer_number is size_t, so (buffer_number - 1) & 1
            // wraps to slot 1 when buffer_number is 0.
            for (size_t result_index = 0; result_index < results.size(); result_index++)
            {
                const shared_ptr<runtime::HostTensor>& data = results_data[result_index];
                const shared_ptr<runtime::Tensor>& result = results[result_index];
                result->begin_read(data->get_data_ptr(),
                                   data->get_element_count() * data->get_element_type().size(),
                                   (buffer_number - 1) & 1);
            }
        }
        buffer_number = (buffer_number + 1) & 1;
        // Queue the next iteration's input upload into the other slot.
        for (size_t arg_index = 0; arg_index < args.size(); arg_index++)
        {
            const shared_ptr<runtime::Tensor>& arg = args[arg_index];
            const shared_ptr<runtime::HostTensor>& data = args_data[arg_index];
            arg->begin_write(data->get_data_ptr(),
                             data->get_element_count() * data->get_element_type().size(),
                             buffer_number);
        }
        exec_future.get();
    }
    // Drain the final iteration's results before stopping the clock.
    for (size_t result_index = 0; result_index < results.size(); result_index++)
    {
        const shared_ptr<runtime::HostTensor>& data = results_data[result_index];
        const shared_ptr<runtime::Tensor>& result = results[result_index];
        result->begin_read(data->get_data_ptr(),
                           data->get_element_count() * data->get_element_type().size(),
                           (buffer_number - 1) & 1);
    }
    t1.stop();
    float time = t1.get_milliseconds();
    cout << time / iterations << "ms per iteration" << endl;

    vector<runtime::PerformanceCounter> perf_data = compiled_func->get_performance_data();
    return perf_data;
}