/*******************************************************************************
* Copyright 2017-2018 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/

#include <functional>
#include <map>
#include <memory>
#include <sstream>
#include <stdexcept>
#include <string>
#include <tuple>
#include <typeindex>
#include <typeinfo>
#include <unordered_map>
#include <vector>

#include "gtest/gtest.h"

#include "ngraph/graph_util.hpp"
#include "ngraph/ngraph.hpp"
#include "ngraph/pass/assign_placement.hpp"
#include "ngraph/pass/manager.hpp"
#include "ngraph/runtime/host_tensor_view.hpp"
#include "ngraph/util.hpp"
#include "util/ndarray.hpp"
#include "util/test_tools.hpp"

using namespace std;
using namespace ngraph;

// Perform all operations on INTERPRETER and fall back to CPU for Multiply
static function<Placement(shared_ptr<Node>)> int_with_cpu_mul_policy = [](shared_ptr<Node> node) {
    Placement placement;
    string node_op = node->description();
    if (node_op == "Multiply")
    {
        placement = Placement::CPU;
    }
    else
    {
        placement = Placement::INTERPRETER;
    }
    return placement;
};

// HybridBackend serves two purposes:
// 1. Its main use case is to test device placement and graph partition routines.
// 2. It also shows how a glued-hybrid runtime can be built by combining different runtimes.
//
// By default, HybridBackend operates on INTERPRETER (for example, the primary tensor view is
// an INTERPRETER tensor view). It falls back to CPU when requested by the placement policy.
class HybridBackend
{
public:
    HybridBackend(const function<Placement(shared_ptr<Node>)>& placement_policy)
        : m_placement_policy(placement_policy)
    {
    }

    ~HybridBackend() {}
    shared_ptr<runtime::TensorView> create_tensor(const element::Type& element_type,
                                                  const Shape& shape)
    {
        return get_cached_backend(Placement::INTERPRETER)->create_tensor(element_type, shape);
    }

    bool compile(const shared_ptr<Function>& func)
    {
        if (!contains_key(m_function_map, func))
        {
            // Clone function
            FunctionInstance instance;
            instance.m_function = clone_function(*func);

            // Run placement pass
            pass::Manager pass_manager;
            pass_manager.register_pass<pass::AssignPlacement>(m_placement_policy);
            pass_manager.run_passes(instance.m_function);

            // Split the function into per-placement sub-functions
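            // At each split point a Result/Parameter pair is inserted; m_map_parameter_to_result
            // records, for every such Parameter, the Result that produces its value so that
            // call() can forward the intermediate tensor across backends.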
            tie(instance.m_sub_functions, instance.m_map_parameter_to_result) =
                split_function_by_placement(instance.m_function);
            m_function_map.insert({func, instance});

            // Compile subfunctions in corresponding backends
            for (shared_ptr<Function>& sub_function : instance.m_sub_functions)
            {
                Placement placement = get_colocated_function_placement(sub_function);
                auto backend = get_cached_backend(placement);
                backend->compile(sub_function);
            }
        }
        return true;
    }

    bool call(const shared_ptr<Function>& func,
              const vector<shared_ptr<runtime::TensorView>>& outputs,
              const vector<shared_ptr<runtime::TensorView>>& inputs)
    {
        // Get FunctionInstance
        bool rc = true;
        auto it = m_function_map.find(func);
        if (it == m_function_map.end())
        {
            compile(func);
            it = m_function_map.find(func);
        }
        if (it == m_function_map.end())
        {
            throw runtime_error("Error constructing backend.");
        }
        FunctionInstance& instance = it->second;

        // Each parameter and result node of the sub-functions maps to one TensorView
        unordered_map<shared_ptr<Node>, shared_ptr<runtime::TensorView>> map_node_to_tensor_view;
        for (size_t i = 0; i < inputs.size(); ++i)
        {
            map_node_to_tensor_view[instance.m_function->get_parameters()[i]] = inputs[i];
        }
        for (size_t i = 0; i < outputs.size(); ++i)
        {
            map_node_to_tensor_view[instance.m_function->get_results()[i]] = outputs[i];
        }

        // Call subfunctions
        for (shared_ptr<Function>& sub_function : instance.m_sub_functions)
        {
            // Init backend
            Placement placement = get_colocated_function_placement(sub_function);
            auto backend = get_cached_backend(placement);

            // Prepare parameter TensorViews
            vector<shared_ptr<runtime::TensorView>> parameter_tvs;
            for (auto parameter_node : sub_function->get_parameters())
            {
                if (map_node_to_tensor_view.find(parameter_node) != map_node_to_tensor_view.end())
                {
                    parameter_tvs.push_back(map_node_to_tensor_view.at(parameter_node));
                }
                else
                {
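                    // This parameter was created at a split boundary: look up the Result that
                    // feeds it, then copy that tensor's data into a fresh tensor on this
                    // sub-function's backend.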
                    auto result_node = instance.m_map_parameter_to_result.at(parameter_node);
                    auto result_tv = map_node_to_tensor_view.at(result_node);
                    auto parameter_tv = backend->create_tensor(parameter_node->get_element_type(),
                                                               parameter_node->get_shape());
                    copy_data(parameter_tv, read_vector<float>(result_tv));
                    map_node_to_tensor_view[parameter_node] = parameter_tv;
                    parameter_tvs.push_back(parameter_tv);
                }
            }

            // Prepare result TensorViews
            vector<shared_ptr<runtime::TensorView>> result_tvs;
            for (auto result_node : sub_function->get_results())
            {
                if (map_node_to_tensor_view.find(result_node) != map_node_to_tensor_view.end())
                {
                    result_tvs.push_back(map_node_to_tensor_view.at(result_node));
                }
                else
                {
                    auto result_tv = backend->create_tensor(result_node->get_element_type(),
                                                            result_node->get_shape());
                    map_node_to_tensor_view[result_node] = result_tv;
                    result_tvs.push_back(result_tv);
                }
            }

            // Call
            backend->call(sub_function, result_tvs, parameter_tvs);
        }
        return rc;
    }

protected:
    class FunctionInstance
    {
    public:
        shared_ptr<Function> m_function;
        vector<shared_ptr<Function>> m_sub_functions;
        unordered_map<shared_ptr<op::Parameter>, shared_ptr<op::Result>> m_map_parameter_to_result;
    };

    shared_ptr<runtime::Backend> get_cached_backend(Placement placement)
    {
        if (m_cached_backends.find(placement) == m_cached_backends.end())
        {
            m_cached_backends[placement] = runtime::Backend::create(placement_to_string(placement));
        }
        return m_cached_backends.at(placement);
    }

    map<Placement, shared_ptr<runtime::Backend>> m_cached_backends;
    map<shared_ptr<Function>, FunctionInstance> m_function_map;
    function<Placement(shared_ptr<Node>)> m_placement_policy;
};

TEST(graph_partition, placement_all_cpu_policy)
{
    Shape shape = Shape{2, 2};
    shared_ptr<op::Parameter> A = make_shared<op::Parameter>(element::f32, shape);
    shared_ptr<op::Parameter> B = make_shared<op::Parameter>(element::f32, shape);
    shared_ptr<op::Parameter> C = make_shared<op::Parameter>(element::f32, shape);
    shared_ptr<Node> AplusB = A + B;
    shared_ptr<Node> AplusBtimesC = AplusB * C;
    shared_ptr<Function> f = make_shared<Function>(AplusBtimesC, op::ParameterVector{A, B, C});

    for (auto node : f->get_ordered_ops())
    {
        EXPECT_EQ(node->get_placement(), Placement::DEFAULT);
    }

    pass::Manager pass_manager;
    pass_manager.register_pass<pass::AssignPlacement>(
        [](shared_ptr<Node> node) { return Placement::CPU; });
    pass_manager.run_passes(f);

    for (auto node : f->get_ordered_ops())
    {
        EXPECT_EQ(node->get_placement(), Placement::CPU);
    }
}

#ifdef NGRAPH_CPU_ENABLE
TEST(graph_partition, placement_int_with_cpu_mul_policy)
{
    Shape shape = Shape{2, 2};
    shared_ptr<op::Parameter> A = make_shared<op::Parameter>(element::f32, shape);
    shared_ptr<op::Parameter> B = make_shared<op::Parameter>(element::f32, shape);
    shared_ptr<op::Parameter> C = make_shared<op::Parameter>(element::f32, shape);
    shared_ptr<Node> AplusB = A + B;
    shared_ptr<Node> AplusBtimesC = AplusB * C;
    shared_ptr<Function> f = make_shared<Function>(AplusBtimesC, op::ParameterVector{A, B, C});

    for (auto node : f->get_ordered_ops())
    {
        EXPECT_EQ(node->get_placement(), Placement::DEFAULT);
    }

    pass::Manager pass_manager;
    pass_manager.register_pass<pass::AssignPlacement>(int_with_cpu_mul_policy);
    pass_manager.run_passes(f);

    for (auto node : f->get_ordered_ops())
    {
        string node_op = node->description();
        if (node_op == "Multiply")
        {
            EXPECT_EQ(node->get_placement(), Placement::CPU);
        }
        else
        {
            EXPECT_EQ(node->get_placement(), Placement::INTERPRETER);
        }
    }
}

TEST(graph_partition, hybrid_abc_manual)
{
    // A   B   C    A   B     C
    //  \ /   /      \ /     /
    //   +D  /        +D    /
    //    \ /         |    /
    //     *E         R0  R1  f0(INT)
    //     |       ------------------
    //     R          P0  P1
    //                 \ /
    //                  *E
    //                  |
    //                  R2    f1(CPU)
    //             ------------------
    //                  P2
    //                  |
    //                  R     f2(INT)
    //             ------------------
    Shape shape = Shape{2, 2};
    auto A = make_shared<op::Parameter>(element::f32, shape);
    auto B = make_shared<op::Parameter>(element::f32, shape);
    auto C = make_shared<op::Parameter>(element::f32, shape);
    auto D = A + B;
    auto E = D * C;
    auto R = make_shared<op::Result>(E);
    auto f = make_shared<Function>(ResultVector{R}, op::ParameterVector{A, B, C});

    pass::Manager pass_manager;
    pass_manager.register_pass<pass::AssignPlacement>(int_with_cpu_mul_policy);
    pass_manager.run_passes(f);

    // Insert Result/Parameter pairs at the split points
    auto RP0 = insert_result_parameter_split(D, E);
    shared_ptr<op::Result> R0 = RP0.first;
    shared_ptr<op::Parameter> P0 = RP0.second;
    auto RP1 = insert_result_parameter_split(C, E);
    shared_ptr<op::Result> R1 = RP1.first;
    shared_ptr<op::Parameter> P1 = RP1.second;
    auto RP2 = insert_result_parameter_split(E, R);
    shared_ptr<op::Result> R2 = RP2.first;
    shared_ptr<op::Parameter> P2 = RP2.second;

    // Backends
    auto int_backend = runtime::Backend::create(placement_to_string(Placement::INTERPRETER));
    auto cpu_backend = runtime::Backend::create(placement_to_string(Placement::CPU));

    // f0 on INT
    auto a = int_backend->create_tensor(element::f32, shape);
    auto b = int_backend->create_tensor(element::f32, shape);
    auto c = int_backend->create_tensor(element::f32, shape);
    auto r0 = int_backend->create_tensor(element::f32, shape);
    auto r1 = int_backend->create_tensor(element::f32, shape);
    copy_data(a, test::NDArray<float, 2>({{1, 2}, {3, 4}}).get_vector());
    copy_data(b, test::NDArray<float, 2>({{5, 6}, {7, 8}}).get_vector());
    copy_data(c, test::NDArray<float, 2>({{9, 10}, {11, 12}}).get_vector());

    auto f0 = make_shared<Function>(ResultVector{R0, R1}, op::ParameterVector{A, B, C});
    int_backend->compile(f0);
    int_backend->call(f0, {r0, r1}, {a, b, c});

    // f1 on CPU
    auto p0 = cpu_backend->create_tensor(element::f32, shape);
    auto p1 = cpu_backend->create_tensor(element::f32, shape);
    auto r2 = cpu_backend->create_tensor(element::f32, shape);
    copy_data(p0, read_vector<float>(r0));
    copy_data(p1, read_vector<float>(r1));

    auto f1 = make_shared<Function>(ResultVector{R2}, op::ParameterVector{P0, P1});
    cpu_backend->compile(f1);
    cpu_backend->call(f1, {r2}, {p0, p1});

    // f2 on INT
    auto p2 = int_backend->create_tensor(element::f32, shape);
    auto r = int_backend->create_tensor(element::f32, shape);
    copy_data(p2, read_vector<float>(r2));

    auto f2 = make_shared<Function>(ResultVector{R}, op::ParameterVector{P2});
    int_backend->compile(f2);
    int_backend->call(f2, {r}, {p2});

    // Check final result on INT
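    // Elementwise (A + B) * C, e.g. (1 + 5) * 9 = 54 and (4 + 8) * 12 = 144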
    EXPECT_EQ(read_vector<float>(r),
              (test::NDArray<float, 2>({{54, 80}, {110, 144}})).get_vector());
}

TEST(graph_partition, hybrid_abc)
{
    // Same as hybrid_abc_manual, but using the test hybrid transformer
    //
    // A   B   C    A   B     C
    //  \ /   /      \ /     /
    //   +D  /        +D    /
    //    \ /         |    /
    //     *E         R0  R1  f0(INT)
    //     |       ------------------
    //     R          P0  P1
    //                 \ /
    //                  *E
    //                  |
    //                  R2    f1(CPU)
    //             ------------------
    //                  P2
    //                  |
    //                  R     f2(INT)
    //             ------------------
    Shape shape = Shape{2, 2};
    auto A = make_shared<op::Parameter>(element::f32, shape);
    auto B = make_shared<op::Parameter>(element::f32, shape);
    auto C = make_shared<op::Parameter>(element::f32, shape);
    auto D = A + B;
    auto E = D * C;
    auto R = make_shared<op::Result>(E);
    auto f = make_shared<Function>(ResultVector{R}, op::ParameterVector{A, B, C});

    auto backend = make_shared<HybridBackend>(int_with_cpu_mul_policy);
    shared_ptr<runtime::TensorView> a = backend->create_tensor(element::f32, shape);
    shared_ptr<runtime::TensorView> b = backend->create_tensor(element::f32, shape);
    shared_ptr<runtime::TensorView> c = backend->create_tensor(element::f32, shape);
    shared_ptr<runtime::TensorView> r = backend->create_tensor(element::f32, shape);

    copy_data(a, test::NDArray<float, 2>({{1, 2}, {3, 4}}).get_vector());
    copy_data(b, test::NDArray<float, 2>({{5, 6}, {7, 8}}).get_vector());
    copy_data(c, test::NDArray<float, 2>({{9, 10}, {11, 12}}).get_vector());

    backend->call(f, {r}, {a, b, c});
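    // Same computation as hybrid_abc_manual: elementwise (A + B) * C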
    EXPECT_EQ(read_vector<float>(r),
              (test::NDArray<float, 2>({{54, 80}, {110, 144}})).get_vector());
}

TEST(graph_partition, hybrid_abcd)
{
    //   A   B
    //    \ /
    // C  E*   D
    //  \ / \ /
    //  F+  G+
    //    \ /
    //    H+
    Shape shape = Shape{2, 2};
    shared_ptr<op::Parameter> A = make_shared<op::Parameter>(element::f32, shape);
    shared_ptr<op::Parameter> B = make_shared<op::Parameter>(element::f32, shape);
    shared_ptr<op::Parameter> C = make_shared<op::Parameter>(element::f32, shape);
    shared_ptr<op::Parameter> D = make_shared<op::Parameter>(element::f32, shape);
    shared_ptr<Node> E = A * B;
    shared_ptr<Node> F = C + E;
    shared_ptr<Node> G = E + D;
    shared_ptr<Node> H = F + G;
    shared_ptr<Function> f = make_shared<Function>(H, op::ParameterVector{A, B, C, D});

    auto backend = make_shared<HybridBackend>(int_with_cpu_mul_policy);
    backend->compile(f);

    shared_ptr<runtime::TensorView> a = backend->create_tensor(element::f32, shape);
    shared_ptr<runtime::TensorView> b = backend->create_tensor(element::f32, shape);
    shared_ptr<runtime::TensorView> c = backend->create_tensor(element::f32, shape);
    shared_ptr<runtime::TensorView> d = backend->create_tensor(element::f32, shape);
    shared_ptr<runtime::TensorView> r = backend->create_tensor(element::f32, shape);

    copy_data(a, test::NDArray<float, 2>({{1, 2}, {3, 4}}).get_vector());
    copy_data(b, test::NDArray<float, 2>({{5, 6}, {7, 8}}).get_vector());
    copy_data(c, test::NDArray<float, 2>({{9, 10}, {11, 12}}).get_vector());
    copy_data(d, test::NDArray<float, 2>({{13, 14}, {15, 16}}).get_vector());

    backend->call(f, {r}, {a, b, c, d});
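    // H = (C + A * B) + (A * B + D), e.g. (9 + 1 * 5) + (1 * 5 + 13) = 32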
    EXPECT_EQ(read_vector<float>(r), (test::NDArray<float, 2>({{32, 48}, {68, 92}})).get_vector());
}

TEST(graph_partition, hybrid_back_and_forth)
{
    // A   B
    //  \ / \
    //  D*   |
    //    \ /
    //    E+   C
    //      \ /
    //      F*
    Shape shape = Shape{2, 2};
    shared_ptr<op::Parameter> A = make_shared<op::Parameter>(element::f32, shape);
    shared_ptr<op::Parameter> B = make_shared<op::Parameter>(element::f32, shape);
    shared_ptr<op::Parameter> C = make_shared<op::Parameter>(element::f32, shape);
    shared_ptr<Node> D = A * B;
    shared_ptr<Node> E = D + B;
    shared_ptr<Node> F = E * C;
    shared_ptr<Function> f = make_shared<Function>(F, op::ParameterVector{A, B, C});

    auto backend = make_shared<HybridBackend>(int_with_cpu_mul_policy);
    backend->compile(f);

    shared_ptr<runtime::TensorView> a = backend->create_tensor(element::f32, shape);
    shared_ptr<runtime::TensorView> b = backend->create_tensor(element::f32, shape);
    shared_ptr<runtime::TensorView> c = backend->create_tensor(element::f32, shape);
    shared_ptr<runtime::TensorView> r = backend->create_tensor(element::f32, shape);

    copy_data(a, test::NDArray<float, 2>({{1, 2}, {3, 4}}).get_vector());
    copy_data(b, test::NDArray<float, 2>({{5, 6}, {7, 8}}).get_vector());
    copy_data(c, test::NDArray<float, 2>({{9, 10}, {11, 12}}).get_vector());

    backend->call(f, {r}, {a, b, c});
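    // F = (A * B + B) * C, e.g. (1 * 5 + 5) * 9 = 90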
    EXPECT_EQ(read_vector<float>(r),
              (test::NDArray<float, 2>({{90, 180}, {308, 480}})).get_vector());
}

TEST(graph_partition, hybrid_multi_middle_nodes)
{
    // A   B   C
    //  \ / \ / \
    //  D+  E+  |
    //    \ / \ /
    //    F*  G*
    //      \ /
    //      H+
    Shape shape = Shape{2, 2};
    shared_ptr<op::Parameter> A = make_shared<op::Parameter>(element::f32, shape);
    shared_ptr<op::Parameter> B = make_shared<op::Parameter>(element::f32, shape);
    shared_ptr<op::Parameter> C = make_shared<op::Parameter>(element::f32, shape);
    shared_ptr<Node> D = A + B;
    shared_ptr<Node> E = B + C;
    shared_ptr<Node> F = D * E;
    shared_ptr<Node> G = E * C;
    shared_ptr<Node> H = F + G;
    shared_ptr<Function> f = make_shared<Function>(H, op::ParameterVector{A, B, C});

    auto backend = make_shared<HybridBackend>(int_with_cpu_mul_policy);
    backend->compile(f);

    shared_ptr<runtime::TensorView> a = backend->create_tensor(element::f32, shape);
    shared_ptr<runtime::TensorView> b = backend->create_tensor(element::f32, shape);
    shared_ptr<runtime::TensorView> c = backend->create_tensor(element::f32, shape);
    shared_ptr<runtime::TensorView> r = backend->create_tensor(element::f32, shape);

    copy_data(a, test::NDArray<float, 2>({{1, 2}, {3, 4}}).get_vector());
    copy_data(b, test::NDArray<float, 2>({{5, 6}, {7, 8}}).get_vector());
    copy_data(c, test::NDArray<float, 2>({{9, 10}, {11, 12}}).get_vector());

    backend->call(f, {r}, {a, b, c});
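    // H = (A + B) * (B + C) + (B + C) * C, e.g. (1 + 5) * (5 + 9) + (5 + 9) * 9 = 210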
    EXPECT_EQ(read_vector<float>(r),
              (test::NDArray<float, 2>({{210, 288}, {378, 480}})).get_vector());
}

TEST(graph_partition, hybrid_no_split)
{
    // A   B
    //  \ /
    //   +
    Shape shape = Shape{2, 2};
    shared_ptr<op::Parameter> A = make_shared<op::Parameter>(element::f32, shape);
    shared_ptr<op::Parameter> B = make_shared<op::Parameter>(element::f32, shape);
    shared_ptr<Node> C = A + B;
    shared_ptr<Function> f = make_shared<Function>(C, op::ParameterVector{A, B});

    auto backend = make_shared<HybridBackend>(int_with_cpu_mul_policy);
    backend->compile(f);

    shared_ptr<runtime::TensorView> a = backend->create_tensor(element::f32, shape);
    shared_ptr<runtime::TensorView> b = backend->create_tensor(element::f32, shape);
    shared_ptr<runtime::TensorView> c = backend->create_tensor(element::f32, shape);

    copy_data(a, test::NDArray<float, 2>({{1, 2}, {3, 4}}).get_vector());
    copy_data(b, test::NDArray<float, 2>({{5, 6}, {7, 8}}).get_vector());

    backend->call(f, {c}, {a, b});
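    // Single Add keeps the whole graph on INTERPRETER: A + B, e.g. 1 + 5 = 6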
    EXPECT_EQ(read_vector<float>(c), (test::NDArray<float, 2>({{6, 8}, {10, 12}})).get_vector());
}

#endif