Unverified Commit c631b50b authored by Robert Kimball's avatar Robert Kimball Committed by GitHub

nbench buffer copy each iteration (#1578)

* add option to copy input/output data for each iteration

* add support for stale buffers
parent 20c2325c
......@@ -19,6 +19,7 @@
#include "benchmark.hpp"
#include "ngraph/file_util.hpp"
#include "ngraph/runtime/backend.hpp"
#include "ngraph/runtime/host_tensor_view.hpp"
#include "ngraph/runtime/tensor_view.hpp"
#include "ngraph/runtime/tensor_view.hpp"
#include "ngraph/serializer.hpp"
......@@ -151,7 +152,8 @@ vector<runtime::PerformanceCounter> run_benchmark(shared_ptr<Function> f,
const string& backend_name,
size_t iterations,
bool timing_detail,
int warmup_iterations)
int warmup_iterations,
bool copy_data)
{
stopwatch timer;
timer.start();
......@@ -162,20 +164,28 @@ vector<runtime::PerformanceCounter> run_benchmark(shared_ptr<Function> f,
cout.imbue(locale(""));
cout << "compile time: " << timer.get_milliseconds() << "ms" << endl;
vector<shared_ptr<runtime::HostTensorView>> arg_data;
vector<shared_ptr<runtime::TensorView>> args;
vector<bool> args_cacheable;
for (shared_ptr<op::Parameter> param : f->get_parameters())
{
auto tensor = backend->create_tensor(param->get_element_type(), param->get_shape());
auto tensor_data =
make_shared<runtime::HostTensorView>(param->get_element_type(), param->get_shape());
random_init(tensor);
args.push_back(tensor);
arg_data.push_back(tensor_data);
args_cacheable.push_back(param->get_cacheable());
}
vector<shared_ptr<runtime::HostTensorView>> result_data;
vector<shared_ptr<runtime::TensorView>> results;
for (shared_ptr<Node> out : f->get_results())
{
auto result = backend->create_tensor(out->get_element_type(), out->get_shape());
auto tensor_data =
make_shared<runtime::HostTensorView>(out->get_element_type(), out->get_shape());
results.push_back(result);
result_data.push_back(tensor_data);
}
for (size_t i = 0; i < args.size(); i++)
......@@ -196,9 +206,33 @@ vector<runtime::PerformanceCounter> run_benchmark(shared_ptr<Function> f,
stopwatch t1;
t1.start();
for (size_t i = 0; i < static_cast<size_t>(iterations); i++)
for (size_t i = 0; i < iterations; i++)
{
if (copy_data)
{
for (size_t arg_index = 0; arg_index < args.size(); arg_index++)
{
const shared_ptr<runtime::TensorView>& arg = args[arg_index];
if (arg->get_stale())
{
const shared_ptr<runtime::HostTensorView>& data = arg_data[arg_index];
arg->write(data->get_data_ptr(),
0,
data->get_size() * data->get_element_type().size());
}
}
}
backend->call(f, results, args);
if (copy_data)
{
for (size_t result_index = 0; result_index < results.size(); result_index++)
{
const shared_ptr<runtime::HostTensorView>& data = result_data[result_index];
const shared_ptr<runtime::TensorView>& result = results[result_index];
result->read(
data->get_data_ptr(), 0, data->get_size() * data->get_element_type().size());
}
}
}
t1.stop();
float time = t1.get_milliseconds();
......
......@@ -32,4 +32,5 @@ std::vector<ngraph::runtime::PerformanceCounter> run_benchmark(std::shared_ptr<n
const std::string& backend_name,
size_t iterations,
bool timing_detail,
int warmup_iterations);
int warmup_iterations,
bool copy_data);
......@@ -153,7 +153,7 @@ void print_results(vector<PerfShape> perf_data, bool timing_detail)
int main(int argc, char** argv)
{
string model;
string backend = "CPU";
string backend;
string directory;
int iterations = 10;
bool failed = false;
......@@ -161,6 +161,7 @@ int main(int argc, char** argv)
bool timing_detail = false;
bool visualize = false;
int warmup_iterations = 1;
bool copy_data = true;
for (size_t i = 1; i < argc; i++)
{
......@@ -193,6 +194,10 @@ int main(int argc, char** argv)
{
timing_detail = true;
}
else if (arg == "--no_copy_data")
{
copy_data = false;
}
else if (arg == "-v" || arg == "--visualize")
{
visualize = true;
......@@ -234,6 +239,11 @@ int main(int argc, char** argv)
cout << "Either file or directory must be specified\n";
failed = true;
}
else if (backend.empty())
{
cout << "Backend missing\n";
failed = true;
}
if (failed)
{
......@@ -251,8 +261,9 @@ OPTIONS
-i|--iterations Iterations (default: 10)
-s|--statistics Display op statistics
-v|--visualize Visualize a model (WARNING: requires GraphViz installed)
--timing-detail Gather detailed timing
--timing_detail Gather detailed timing
-w|--warmup_iterations Number of warm-up iterations
--no_copy_data Disable copy of input/result data every iteration
)###";
return 1;
}
......@@ -322,10 +333,8 @@ OPTIONS
try
{
shared_ptr<Function> f = deserialize(m);
// cout << "Benchmarking " << m << ", " << backend << " backend, " << iterations
// << " iterations.\n";
auto perf_data =
run_benchmark(f, backend, iterations, timing_detail, warmup_iterations);
auto perf_data = run_benchmark(
f, backend, iterations, timing_detail, warmup_iterations, copy_data);
auto perf_shape = to_perf_shape(f, perf_data);
aggregate_perf_data.insert(
aggregate_perf_data.end(), perf_shape.begin(), perf_shape.end());
......@@ -340,9 +349,13 @@ OPTIONS
else if (iterations > 0)
{
shared_ptr<Function> f = deserialize(model);
cout << "Benchmarking " << model << ", " << backend << " backend, " << iterations
<< " iterations.\n";
auto perf_data = run_benchmark(f, backend, iterations, timing_detail, warmup_iterations);
cout << "Benchmarking " << model << endl;
cout << " Backend: " << backend << endl;
cout << " Iterations: " << iterations << endl;
cout << " Warmup: " << warmup_iterations << endl;
cout << " Copy Data: " << (copy_data ? "true" : "false") << endl;
auto perf_data =
run_benchmark(f, backend, iterations, timing_detail, warmup_iterations, copy_data);
auto perf_shape = to_perf_shape(f, perf_data);
print_results(perf_shape, timing_detail);
}
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment