graph_partition.cpp 20.2 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
//*****************************************************************************
// Copyright 2017-2018 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//*****************************************************************************
16 17 18 19 20 21 22 23 24 25 26 27 28 29

#include <memory>
#include <sstream>
#include <string>
#include <typeindex>
#include <typeinfo>
#include <vector>

#include "gtest/gtest.h"

#include "ngraph/graph_util.hpp"
#include "ngraph/ngraph.hpp"
#include "ngraph/pass/assign_placement.hpp"
#include "ngraph/pass/manager.hpp"
30
#include "ngraph/runtime/host_tensor.hpp"
31 32 33 34 35 36 37
#include "ngraph/util.hpp"
#include "util/ndarray.hpp"
#include "util/test_tools.hpp"

using namespace std;
using namespace ngraph;

38 39 40 41 42
// Placement policy used throughout this file: nodes whose description is "Multiply" are
// placed on CPU, every other node is placed on INTERPRETER.
static function<Placement(shared_ptr<Node>)> int_with_cpu_mul_policy = [](shared_ptr<Node> node) {
    const bool is_multiply = (node->description() == "Multiply");
    return is_multiply ? Placement::CPU : Placement::INTERPRETER;
};
52

53 54 55 56
// HybridBackend serves 2 purposes:
// 1. HybridBackend's main use case is to test device placement and graph partition routines.
// 2. It also shows how a glued-hybrid runtime can be built by combining different runtimes.
//
// By default, HybridBackend operates on INTERPRETER (for example, the tensor view is an
// INTERPRETER tensor view). It falls back to CPU when requested by placement.
class HybridBackend
60 61
{
public:
62 63
    HybridBackend(const function<Placement(shared_ptr<Node>)>& placement_policy)
        : m_placement_policy(placement_policy)
64 65 66
    {
    }

67
    ~HybridBackend() {}
68
    shared_ptr<runtime::Tensor> create_tensor(const element::Type& element_type, const Shape& shape)
69 70 71 72 73 74
    {
        return get_cached_backend(Placement::INTERPRETER)->create_tensor(element_type, shape);
    }

    bool compile(const shared_ptr<Function>& func)
    {
75
        if (m_function_map.find(func) == m_function_map.end())
76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101
        {
            // Clone function
            FunctionInstance instance;
            instance.m_function = clone_function(*func);

            // Run placement pass
            pass::Manager pass_manager;
            pass_manager.register_pass<pass::AssignPlacement>(int_with_cpu_mul_policy);
            pass_manager.run_passes(instance.m_function);

            // Split function to sub_functions
            tie(instance.m_sub_functions, instance.m_map_parameter_to_result) =
                split_function_by_placement(instance.m_function);
            m_function_map.insert({func, instance});

            // Compile subfunctions in corresponding backends
            for (shared_ptr<Function>& sub_function : instance.m_sub_functions)
            {
                Placement placement = get_colocated_function_placement(sub_function);
                auto backend = get_cached_backend(placement);
                backend->compile(sub_function);
            }
        }
        return true;
    }

102
    bool call_with_validate(const shared_ptr<Function>& func,
103 104
                            const vector<shared_ptr<runtime::Tensor>>& outputs,
                            const vector<shared_ptr<runtime::Tensor>>& inputs)
105
    {
106 107 108 109 110 111 112 113 114 115 116 117 118
        // Get FunctionInstance
        bool rc = true;
        auto it = m_function_map.find(func);
        if (it == m_function_map.end())
        {
            compile(func);
            it = m_function_map.find(func);
        }
        if (it == m_function_map.end())
        {
            throw runtime_error("Error constructing backend.");
        }
        FunctionInstance& instance = it->second;
119

120 121
        // Parameter and result node in sub_function maps to one Tensor
        unordered_map<shared_ptr<Node>, shared_ptr<runtime::Tensor>> map_node_to_tensor_view;
122
        for (size_t i = 0; i < inputs.size(); ++i)
123
        {
124
            map_node_to_tensor_view[instance.m_function->get_parameters()[i]] = inputs[i];
125
        }
126
        for (size_t i = 0; i < outputs.size(); ++i)
127
        {
128
            map_node_to_tensor_view[instance.m_function->get_results()[i]] = outputs[i];
129 130
        }

131 132
        // Call subfunctions
        for (shared_ptr<Function>& sub_function : instance.m_sub_functions)
133
        {
134
            // Init backend
135
            Placement placement = get_colocated_function_placement(sub_function);
136
            auto backend = get_cached_backend(placement);
137

138
            // Prepare parameter TensorViews
139
            vector<shared_ptr<runtime::Tensor>> parameter_tvs;
140
            for (auto parameter_node : sub_function->get_parameters())
141
            {
142
                if (map_node_to_tensor_view.find(parameter_node) != map_node_to_tensor_view.end())
143
                {
144
                    parameter_tvs.push_back(map_node_to_tensor_view.at(parameter_node));
145 146 147
                }
                else
                {
148
                    auto result_node = instance.m_map_parameter_to_result.at(parameter_node);
149
                    auto result_tv = map_node_to_tensor_view.at(result_node);
150 151
                    auto parameter_tv = backend->create_tensor(parameter_node->get_element_type(),
                                                               parameter_node->get_shape());
152 153 154
                    copy_data(parameter_tv, read_vector<float>(result_tv));
                    map_node_to_tensor_view[parameter_node] = parameter_tv;
                    parameter_tvs.push_back(parameter_tv);
155 156 157
                }
            }

158
            // Prepare result TensorViews
159
            vector<shared_ptr<runtime::Tensor>> result_tvs;
160
            for (auto result_node : sub_function->get_results())
161
            {
162
                if (map_node_to_tensor_view.find(result_node) != map_node_to_tensor_view.end())
163
                {
164
                    result_tvs.push_back(map_node_to_tensor_view.at(result_node));
165 166 167
                }
                else
                {
168 169
                    auto result_tv = backend->create_tensor(result_node->get_element_type(),
                                                            result_node->get_shape());
170 171
                    map_node_to_tensor_view[result_node] = result_tv;
                    result_tvs.push_back(result_tv);
172 173 174 175
                }
            }

            // Call
176
            backend->call_with_validate(sub_function, result_tvs, parameter_tvs);
177
        }
178
        return rc;
179 180 181
    }

protected:
182
    class FunctionInstance
183
    {
184 185 186 187 188
    public:
        shared_ptr<Function> m_function;
        vector<shared_ptr<Function>> m_sub_functions;
        unordered_map<shared_ptr<op::Parameter>, shared_ptr<op::Result>> m_map_parameter_to_result;
    };
189

190
    shared_ptr<runtime::Backend> get_cached_backend(Placement placement)
191
    {
192
        if (m_cached_backends.find(placement) == m_cached_backends.end())
193
        {
194
            m_cached_backends[placement] = runtime::Backend::create(placement_to_string(placement));
195
        }
196
        return m_cached_backends.at(placement);
197 198
    }

199 200 201
    map<Placement, shared_ptr<runtime::Backend>> m_cached_backends;
    map<shared_ptr<Function>, FunctionInstance> m_function_map;
    function<Placement(shared_ptr<Node>)> m_placement_policy;
202 203 204 205 206
};

TEST(graph_partition, placement_all_cpu_policy)
{
    // Build f = (A + B) * C on 2x2 f32 parameters.
    const Shape shape{2, 2};
    auto A = make_shared<op::Parameter>(element::f32, shape);
    auto B = make_shared<op::Parameter>(element::f32, shape);
    auto C = make_shared<op::Parameter>(element::f32, shape);
    shared_ptr<Node> sum = A + B;
    shared_ptr<Node> product = sum * C;
    auto f = make_shared<Function>(product, ParameterVector{A, B, C});

    // Before any placement pass every node carries the DEFAULT placement.
    for (const auto& op : f->get_ordered_ops())
    {
        EXPECT_EQ(op->get_placement(), Placement::DEFAULT);
    }

    // Assign CPU to every node, regardless of its kind.
    pass::Manager placement_passes;
    placement_passes.register_pass<pass::AssignPlacement>(
        [](shared_ptr<Node> node) { return Placement::CPU; });
    placement_passes.run_passes(f);

    // After the pass every node must report CPU.
    for (const auto& op : f->get_ordered_ops())
    {
        EXPECT_EQ(op->get_placement(), Placement::CPU);
    }
}

230
#ifdef NGRAPH_CPU_ENABLE
231 232 233
TEST(graph_partition, placement_int_with_cpu_mul_policy)
{
    // Build f = (A + B) * C on 2x2 f32 parameters.
    const Shape shape{2, 2};
    auto A = make_shared<op::Parameter>(element::f32, shape);
    auto B = make_shared<op::Parameter>(element::f32, shape);
    auto C = make_shared<op::Parameter>(element::f32, shape);
    shared_ptr<Node> sum = A + B;
    shared_ptr<Node> product = sum * C;
    auto f = make_shared<Function>(product, ParameterVector{A, B, C});

    // Before any placement pass every node carries the DEFAULT placement.
    for (const auto& op : f->get_ordered_ops())
    {
        EXPECT_EQ(op->get_placement(), Placement::DEFAULT);
    }

    pass::Manager placement_passes;
    placement_passes.register_pass<pass::AssignPlacement>(int_with_cpu_mul_policy);
    placement_passes.run_passes(f);

    // "Multiply" nodes must land on CPU, everything else on INTERPRETER.
    for (const auto& op : f->get_ordered_ops())
    {
        const Placement expected_placement =
            (op->description() == "Multiply") ? Placement::CPU : Placement::INTERPRETER;
        EXPECT_EQ(op->get_placement(), expected_placement);
    }
}

264
TEST(graph_partition, hybrid_abc_manual)
{
    // Manually split (A + B) * C into three functions and run them on two backends.
    //
    // A   B   C    A   B     C
    //  \ /   /      \ /     /
    //   +D  /        +D    /
    //    \ /         |    /
    //     *E         R0  R1  f0(INT)
    //     |       ------------------
    //     R          P0  P1
    //                 \ /
    //                  *E
    //                  |
    //                  R2    f1(CPU)
    //             ------------------
    //                  P2
    //                  |
    //                  R     f2(INT)
    //             ------------------
    const Shape shape{2, 2};
    shared_ptr<op::Parameter> A = make_shared<op::Parameter>(element::f32, shape);
    shared_ptr<op::Parameter> B = make_shared<op::Parameter>(element::f32, shape);
    shared_ptr<op::Parameter> C = make_shared<op::Parameter>(element::f32, shape);
    shared_ptr<Node> D = A + B;
    shared_ptr<Node> E = D * C;
    shared_ptr<op::Result> R = make_shared<op::Result>(E);
    shared_ptr<Function> f = make_shared<Function>(ResultVector{R}, ParameterVector{A, B, C});

    pass::Manager placement_passes;
    placement_passes.register_pass<pass::AssignPlacement>(int_with_cpu_mul_policy);
    placement_passes.run_passes(f);

    // Cut the three edges D->E, C->E and E->R, each with a Result/Parameter pair.
    shared_ptr<op::Result> R0, R1, R2;
    shared_ptr<op::Parameter> P0, P1, P2;
    tie(R0, P0) = insert_result_parameter_split(D, E);
    tie(R1, P1) = insert_result_parameter_split(C, E);
    tie(R2, P2) = insert_result_parameter_split(E, R);

    // One backend per placement.
    auto interpreter = runtime::Backend::create(placement_to_string(Placement::INTERPRETER));
    auto cpu = runtime::Backend::create(placement_to_string(Placement::CPU));

    // Stage f0 (INTERPRETER): inputs a, b, c; outputs r0 and r1.
    auto a = interpreter->create_tensor(element::f32, shape);
    auto b = interpreter->create_tensor(element::f32, shape);
    auto c = interpreter->create_tensor(element::f32, shape);
    auto r0 = interpreter->create_tensor(element::f32, shape);
    auto r1 = interpreter->create_tensor(element::f32, shape);
    copy_data(a, test::NDArray<float, 2>({{1, 2}, {3, 4}}).get_vector());
    copy_data(b, test::NDArray<float, 2>({{5, 6}, {7, 8}}).get_vector());
    copy_data(c, test::NDArray<float, 2>({{9, 10}, {11, 12}}).get_vector());

    auto f0 = make_shared<Function>(ResultVector{R0, R1}, ParameterVector{A, B, C});
    interpreter->compile(f0);
    interpreter->call_with_validate(f0, {r0, r1}, {a, b, c});

    // Stage f1 (CPU): fed with copies of r0 and r1; output r2.
    auto p0 = cpu->create_tensor(element::f32, shape);
    auto p1 = cpu->create_tensor(element::f32, shape);
    auto r2 = cpu->create_tensor(element::f32, shape);
    copy_data(p0, read_vector<float>(r0));
    copy_data(p1, read_vector<float>(r1));

    auto f1 = make_shared<Function>(ResultVector{R2}, ParameterVector{P0, P1});
    cpu->compile(f1);
    cpu->call_with_validate(f1, {r2}, {p0, p1});

    // Stage f2 (INTERPRETER): fed with a copy of r2; output r.
    auto p2 = interpreter->create_tensor(element::f32, shape);
    auto r = interpreter->create_tensor(element::f32, shape);
    copy_data(p2, read_vector<float>(r2));

    auto f2 = make_shared<Function>(ResultVector{R}, ParameterVector{P2});
    interpreter->compile(f2);
    interpreter->call_with_validate(f2, {r}, {p2});

    // The final tensor on INTERPRETER must equal (a + b) * c.
    EXPECT_EQ(read_vector<float>(r),
              (test::NDArray<float, 2>({{54, 80}, {110, 144}})).get_vector());
}

349
TEST(graph_partition, hybrid_abc)
{
    // Same graph as hybrid_abc_manual, but the split and dispatch are done by HybridBackend.
    //
    // A   B   C    A   B     C
    //  \ /   /      \ /     /
    //   +D  /        +D    /
    //    \ /         |    /
    //     *E         R0  R1  f0(INT)
    //     |       ------------------
    //     R          P0  P1
    //                 \ /
    //                  *E
    //                  |
    //                  R2    f1(CPU)
    //             ------------------
    //                  P2
    //                  |
    //                  R     f2(INT)
    //             ------------------
    const Shape shape{2, 2};
    shared_ptr<op::Parameter> A = make_shared<op::Parameter>(element::f32, shape);
    shared_ptr<op::Parameter> B = make_shared<op::Parameter>(element::f32, shape);
    shared_ptr<op::Parameter> C = make_shared<op::Parameter>(element::f32, shape);
    shared_ptr<Node> D = A + B;
    shared_ptr<Node> E = D * C;
    shared_ptr<op::Result> R = make_shared<op::Result>(E);
    shared_ptr<Function> f = make_shared<Function>(ResultVector{R}, ParameterVector{A, B, C});

    // No explicit compile: call_with_validate is invoked directly on the fresh backend.
    auto hybrid = make_shared<HybridBackend>(int_with_cpu_mul_policy);
    auto a = hybrid->create_tensor(element::f32, shape);
    auto b = hybrid->create_tensor(element::f32, shape);
    auto c = hybrid->create_tensor(element::f32, shape);
    auto r = hybrid->create_tensor(element::f32, shape);

    copy_data(a, test::NDArray<float, 2>({{1, 2}, {3, 4}}).get_vector());
    copy_data(b, test::NDArray<float, 2>({{5, 6}, {7, 8}}).get_vector());
    copy_data(c, test::NDArray<float, 2>({{9, 10}, {11, 12}}).get_vector());

    hybrid->call_with_validate(f, {r}, {a, b, c});
    EXPECT_EQ(read_vector<float>(r),
              (test::NDArray<float, 2>({{54, 80}, {110, 144}})).get_vector());
}

393
TEST(graph_partition, hybrid_abcd)
{
    // One Multiply feeding two Adds:
    //
    //   A   B
    //    \ /
    // C  E*   D
    //  \ / \ /
    //  F+  G+
    //    \ /
    //    H+
    const Shape shape{2, 2};
    auto A = make_shared<op::Parameter>(element::f32, shape);
    auto B = make_shared<op::Parameter>(element::f32, shape);
    auto C = make_shared<op::Parameter>(element::f32, shape);
    auto D = make_shared<op::Parameter>(element::f32, shape);
    shared_ptr<Node> E = A * B;
    shared_ptr<Node> F = C + E;
    shared_ptr<Node> G = E + D;
    shared_ptr<Node> H = F + G;
    auto f = make_shared<Function>(H, ParameterVector{A, B, C, D});

    auto hybrid = make_shared<HybridBackend>(int_with_cpu_mul_policy);
    hybrid->compile(f);

    auto a = hybrid->create_tensor(element::f32, shape);
    auto b = hybrid->create_tensor(element::f32, shape);
    auto c = hybrid->create_tensor(element::f32, shape);
    auto d = hybrid->create_tensor(element::f32, shape);
    auto r = hybrid->create_tensor(element::f32, shape);

    copy_data(a, test::NDArray<float, 2>({{1, 2}, {3, 4}}).get_vector());
    copy_data(b, test::NDArray<float, 2>({{5, 6}, {7, 8}}).get_vector());
    copy_data(c, test::NDArray<float, 2>({{9, 10}, {11, 12}}).get_vector());
    copy_data(d, test::NDArray<float, 2>({{13, 14}, {15, 16}}).get_vector());

    // Expected: (c + a * b) + (a * b + d).
    hybrid->call_with_validate(f, {r}, {a, b, c, d});
    EXPECT_EQ(read_vector<float>(r), (test::NDArray<float, 2>({{32, 48}, {68, 92}})).get_vector());
}

431
TEST(graph_partition, hybrid_back_and_forth)
{
    // Multiply -> Add -> Multiply chain, with B used by both the first Multiply and the Add:
    //   D = A * B
    //   E = D + B
    //   F = E * C
    const Shape shape{2, 2};
    auto A = make_shared<op::Parameter>(element::f32, shape);
    auto B = make_shared<op::Parameter>(element::f32, shape);
    auto C = make_shared<op::Parameter>(element::f32, shape);
    shared_ptr<Node> D = A * B;
    shared_ptr<Node> E = D + B;
    shared_ptr<Node> F = E * C;
    auto f = make_shared<Function>(F, ParameterVector{A, B, C});

    auto hybrid = make_shared<HybridBackend>(int_with_cpu_mul_policy);
    hybrid->compile(f);

    auto a = hybrid->create_tensor(element::f32, shape);
    auto b = hybrid->create_tensor(element::f32, shape);
    auto c = hybrid->create_tensor(element::f32, shape);
    auto r = hybrid->create_tensor(element::f32, shape);

    copy_data(a, test::NDArray<float, 2>({{1, 2}, {3, 4}}).get_vector());
    copy_data(b, test::NDArray<float, 2>({{5, 6}, {7, 8}}).get_vector());
    copy_data(c, test::NDArray<float, 2>({{9, 10}, {11, 12}}).get_vector());

    // Expected: (a * b + b) * c.
    hybrid->call_with_validate(f, {r}, {a, b, c});
    EXPECT_EQ(read_vector<float>(r),
              (test::NDArray<float, 2>({{90, 180}, {308, 480}})).get_vector());
}

466
TEST(graph_partition, hybrid_multi_middle_nodes)
{
    // Two Adds feeding two Multiplies, which are summed at the end:
    //   D = A + B
    //   E = B + C
    //   F = D * E
    //   G = E * C
    //   H = F + G
    const Shape shape{2, 2};
    auto A = make_shared<op::Parameter>(element::f32, shape);
    auto B = make_shared<op::Parameter>(element::f32, shape);
    auto C = make_shared<op::Parameter>(element::f32, shape);
    shared_ptr<Node> D = A + B;
    shared_ptr<Node> E = B + C;
    shared_ptr<Node> F = D * E;
    shared_ptr<Node> G = E * C;
    shared_ptr<Node> H = F + G;
    auto f = make_shared<Function>(H, ParameterVector{A, B, C});

    auto hybrid = make_shared<HybridBackend>(int_with_cpu_mul_policy);
    hybrid->compile(f);

    auto a = hybrid->create_tensor(element::f32, shape);
    auto b = hybrid->create_tensor(element::f32, shape);
    auto c = hybrid->create_tensor(element::f32, shape);
    auto r = hybrid->create_tensor(element::f32, shape);

    copy_data(a, test::NDArray<float, 2>({{1, 2}, {3, 4}}).get_vector());
    copy_data(b, test::NDArray<float, 2>({{5, 6}, {7, 8}}).get_vector());
    copy_data(c, test::NDArray<float, 2>({{9, 10}, {11, 12}}).get_vector());

    // Expected: (a + b) * (b + c) + (b + c) * c.
    hybrid->call_with_validate(f, {r}, {a, b, c});
    EXPECT_EQ(read_vector<float>(r),
              (test::NDArray<float, 2>({{210, 288}, {378, 480}})).get_vector());
}

503
TEST(graph_partition, hybrid_no_split)
{
    // A single Add and no Multiply:
    //
    // A   B
    //  \ /
    //   +
    const Shape shape{2, 2};
    auto A = make_shared<op::Parameter>(element::f32, shape);
    auto B = make_shared<op::Parameter>(element::f32, shape);
    shared_ptr<Node> C = A + B;
    auto f = make_shared<Function>(C, ParameterVector{A, B});

    auto hybrid = make_shared<HybridBackend>(int_with_cpu_mul_policy);
    hybrid->compile(f);

    auto a = hybrid->create_tensor(element::f32, shape);
    auto b = hybrid->create_tensor(element::f32, shape);
    auto c = hybrid->create_tensor(element::f32, shape);

    copy_data(a, test::NDArray<float, 2>({{1, 2}, {3, 4}}).get_vector());
    copy_data(b, test::NDArray<float, 2>({{5, 6}, {7, 8}}).get_vector());

    // Expected: a + b.
    hybrid->call_with_validate(f, {c}, {a, b});
    EXPECT_EQ(read_vector<float>(c), (test::NDArray<float, 2>({{6, 8}, {10, 12}})).get_vector());
}
527 528

#endif