Unverified Commit aa357702 authored by Robert Kimball's avatar Robert Kimball Committed by GitHub

Merge pull request #300 from NervanaSystems/bob/perf

Performance benchmarking and utils
parents 094f2fa9 75bf66e4
......@@ -56,8 +56,7 @@ nervana_aeon.egg-info/
*.swp
*.swo
# setup.py intermediate files
build/
build*/
# makeenv and test intermediate files
tmp/
......
......@@ -56,6 +56,83 @@ if((NGRAPH_CPU_ENABLE OR USE_CUDA) AND (NOT ${CMAKE_SYSTEM_NAME} MATCHES "Darwin
set(LLVM_INCLUDE_DIR "${source_dir}/include" PARENT_SCOPE)
set(LLVM_LIB_DIR "${source_dir}/lib" PARENT_SCOPE)
set(LLVM_LINK_LIBS clangTooling clangFrontendTool clangFrontend clangDriver clangSerialization clangCodeGen clangParse clangSema clangStaticAnalyzerFrontend clangStaticAnalyzerCheckers clangStaticAnalyzerCore clangAnalysis clangARCMigrate clangRewriteFrontend clangEdit clangAST clangLex clangBasic LLVMLTO LLVMPasses LLVMObjCARCOpts LLVMSymbolize LLVMDebugInfoPDB LLVMDebugInfoDWARF LLVMMIRParser LLVMCoverage LLVMTableGen LLVMDlltoolDriver LLVMOrcJIT LLVMXCoreDisassembler LLVMXCoreCodeGen LLVMXCoreDesc LLVMXCoreInfo LLVMXCoreAsmPrinter LLVMSystemZDisassembler LLVMSystemZCodeGen LLVMSystemZAsmParser LLVMSystemZDesc LLVMSystemZInfo LLVMSystemZAsmPrinter LLVMSparcDisassembler LLVMSparcCodeGen LLVMSparcAsmParser LLVMSparcDesc LLVMSparcInfo LLVMSparcAsmPrinter LLVMPowerPCDisassembler LLVMPowerPCCodeGen LLVMPowerPCAsmParser LLVMPowerPCDesc LLVMPowerPCInfo LLVMPowerPCAsmPrinter LLVMNVPTXCodeGen LLVMNVPTXDesc LLVMNVPTXInfo LLVMNVPTXAsmPrinter LLVMMSP430CodeGen LLVMMSP430Desc LLVMMSP430Info LLVMMSP430AsmPrinter LLVMMipsDisassembler LLVMMipsCodeGen LLVMMipsAsmParser LLVMMipsDesc LLVMMipsInfo LLVMMipsAsmPrinter LLVMLanaiDisassembler LLVMLanaiCodeGen LLVMLanaiAsmParser LLVMLanaiDesc LLVMLanaiAsmPrinter LLVMLanaiInfo LLVMHexagonDisassembler LLVMHexagonCodeGen LLVMHexagonAsmParser LLVMHexagonDesc LLVMHexagonInfo LLVMBPFDisassembler LLVMBPFCodeGen LLVMBPFDesc LLVMBPFInfo LLVMBPFAsmPrinter LLVMARMDisassembler LLVMARMCodeGen LLVMARMAsmParser LLVMARMDesc LLVMARMInfo LLVMARMAsmPrinter LLVMAMDGPUDisassembler LLVMAMDGPUCodeGen LLVMAMDGPUAsmParser LLVMAMDGPUDesc LLVMAMDGPUInfo LLVMAMDGPUAsmPrinter LLVMAMDGPUUtils LLVMAArch64Disassembler LLVMAArch64CodeGen LLVMAArch64AsmParser LLVMAArch64Desc LLVMAArch64Info LLVMAArch64AsmPrinter LLVMAArch64Utils LLVMObjectYAML LLVMLibDriver LLVMOption LLVMX86Disassembler LLVMX86AsmParser LLVMX86CodeGen LLVMGlobalISel LLVMSelectionDAG LLVMAsmPrinter LLVMDebugInfoCodeView LLVMDebugInfoMSF LLVMX86Desc LLVMMCDisassembler LLVMX86Info LLVMX86AsmPrinter 
LLVMX86Utils LLVMMCJIT LLVMLineEditor LLVMInterpreter LLVMExecutionEngine LLVMRuntimeDyld LLVMCodeGen LLVMTarget LLVMCoroutines LLVMipo LLVMInstrumentation LLVMVectorize LLVMScalarOpts LLVMLinker LLVMIRReader LLVMAsmParser LLVMInstCombine LLVMTransformUtils LLVMBitWriter LLVMAnalysis LLVMProfileData LLVMObject LLVMMCParser LLVMMC LLVMBitReader LLVMCore LLVMBinaryFormat LLVMSupport LLVMDemangle tinfo z m PARENT_SCOPE)
set(LLVM_LINK_LIBS
clangTooling
clangFrontendTool
clangFrontend
clangDriver
clangSerialization
clangCodeGen
clangParse
clangSema
clangStaticAnalyzerFrontend
clangStaticAnalyzerCheckers
clangStaticAnalyzerCore
clangAnalysis
clangARCMigrate
clangRewriteFrontend
clangEdit
clangAST
clangLex
clangBasic
LLVMLTO
LLVMPasses
LLVMObjCARCOpts
LLVMSymbolize
LLVMDebugInfoPDB
LLVMDebugInfoDWARF
LLVMMIRParser
LLVMCoverage
LLVMTableGen
LLVMDlltoolDriver
LLVMOrcJIT
LLVMObjectYAML
LLVMLibDriver
LLVMOption
LLVMX86Disassembler
LLVMX86AsmParser
LLVMX86CodeGen
LLVMGlobalISel
LLVMSelectionDAG
LLVMAsmPrinter
LLVMDebugInfoCodeView
LLVMDebugInfoMSF
LLVMX86Desc
LLVMMCDisassembler
LLVMX86Info
LLVMX86AsmPrinter
LLVMX86Utils
LLVMMCJIT
LLVMLineEditor
LLVMInterpreter
LLVMExecutionEngine
LLVMRuntimeDyld
LLVMCodeGen
LLVMTarget
LLVMCoroutines
LLVMipo
LLVMInstrumentation
LLVMVectorize
LLVMScalarOpts
LLVMLinker
LLVMIRReader
LLVMAsmParser
LLVMInstCombine
LLVMTransformUtils
LLVMBitWriter
LLVMAnalysis
LLVMProfileData
LLVMObject
LLVMMCParser
LLVMMC
LLVMBitReader
LLVMCore
LLVMBinaryFormat
LLVMSupport
LLVMDemangle
tinfo
z
m
PARENT_SCOPE)
endif()
......@@ -14,14 +14,20 @@
#include <iostream>
#include <clang/Basic/DiagnosticOptions.h>
#include <clang/Basic/TargetInfo.h>
#include <clang/CodeGen/CodeGenAction.h>
#include <clang/CodeGen/ObjectFilePCHContainerOperations.h>
#include <clang/Driver/DriverDiagnostic.h>
#include <clang/Driver/Options.h>
#include <clang/Frontend/CompilerInstance.h>
#include <clang/Frontend/CompilerInstance.h>
#include <clang/Frontend/CompilerInvocation.h>
#include <clang/Frontend/FrontendActions.h>
#include <clang/Frontend/FrontendDiagnostic.h>
#include <clang/Frontend/TextDiagnosticBuffer.h>
#include <clang/Frontend/TextDiagnosticPrinter.h>
#include <clang/Frontend/TextDiagnosticPrinter.h>
#include <clang/Frontend/Utils.h>
#include <clang/FrontendTool/Utils.h>
#include <clang/Lex/Preprocessor.h>
......@@ -35,17 +41,10 @@
#include <llvm/Support/ManagedStatic.h>
#include <llvm/Support/Signals.h>
#include <llvm/Support/TargetSelect.h>
#include <llvm/Support/TargetSelect.h>
#include <llvm/Support/Timer.h>
#include <llvm/Support/raw_ostream.h>
#include <clang/Basic/DiagnosticOptions.h>
#include <clang/Basic/TargetInfo.h>
#include <clang/CodeGen/CodeGenAction.h>
#include <clang/Frontend/CompilerInstance.h>
#include <clang/Frontend/FrontendActions.h>
#include <clang/Frontend/TextDiagnosticPrinter.h>
#include <llvm/Support/TargetSelect.h>
#include "ngraph/codegen/compiler.hpp"
#include "ngraph/file_util.hpp"
#include "ngraph/log.hpp"
......@@ -107,10 +106,9 @@ StaticCompiler::StaticCompiler()
m_debuginfo_enabled = true;
#endif
llvm::InitializeAllTargets();
llvm::InitializeAllTargetMCs();
llvm::InitializeAllAsmPrinters();
llvm::InitializeAllAsmParsers();
InitializeNativeTarget();
LLVMInitializeNativeAsmPrinter();
LLVMInitializeNativeAsmParser();
// Prepare compilation arguments
vector<const char*> args;
......
......@@ -15,6 +15,7 @@
#include <algorithm>
#include "ngraph/runtime/cpu/cpu_call_frame.hpp"
#include "ngraph/runtime/cpu/cpu_external_function.hpp"
#include "ngraph/runtime/cpu/cpu_tensor_view.hpp"
using namespace std;
......@@ -69,3 +70,40 @@ void runtime::cpu::CPU_CallFrame::call(
tensor_call(inputs, outputs);
}
/// Collects the per-op timing counters exposed by the compiled module.
///
/// Looks up the four extern "C" accessors (get_debug_timer_count, get_debug_timer_name,
/// get_debug_timer_microseconds, get_debug_timer_call_count) in the external function's
/// execution engine and builds one PerformanceCounter per timer index.
///
/// \returns One PerformanceCounter for every index in [0, get_debug_timer_count()).
/// \throws std::runtime_error if the external function or its execution engine is not
///         available, or if any of the accessor functions cannot be found.
vector<runtime::cpu::PerformanceCounter> runtime::cpu::CPU_CallFrame::get_performance_data() const
{
    // Guard the pointer chain so a missing engine is reported as an error
    // instead of being dereferenced.
    if (!m_external_function)
    {
        throw runtime_error("no external function is attached to this call frame");
    }
    auto* engine = m_external_function->m_execution_engine.get();
    if (engine == nullptr)
    {
        throw runtime_error("execution engine is not available");
    }

    auto get_count = engine->find_function<size_t()>("get_debug_timer_count");
    auto get_name = engine->find_function<const char*(size_t)>("get_debug_timer_name");
    auto get_microseconds = engine->find_function<size_t(size_t)>("get_debug_timer_microseconds");
    auto get_call_count = engine->find_function<size_t(size_t)>("get_debug_timer_call_count");

    if (!get_count)
    {
        throw runtime_error("failed to find accessor function 'get_debug_timer_count'");
    }
    if (!get_name)
    {
        throw runtime_error("failed to find accessor function 'get_debug_timer_name'");
    }
    if (!get_microseconds)
    {
        throw runtime_error("failed to find accessor function 'get_debug_timer_microseconds'");
    }
    if (!get_call_count)
    {
        throw runtime_error("failed to find accessor function 'get_debug_timer_call_count'");
    }

    const size_t count = get_count();
    vector<runtime::cpu::PerformanceCounter> rc;
    rc.reserve(count); // the final size is known up front
    for (size_t i = 0; i < count; i++)
    {
        // Query in a fixed order (name, time, calls), matching the original brace-init order.
        const char* timer_name = get_name(i);
        const size_t microseconds = get_microseconds(i);
        const size_t calls = get_call_count(i);
        rc.emplace_back(timer_name, microseconds, calls);
    }
    return rc;
}
......@@ -33,6 +33,24 @@ namespace ngraph
class CPU_CallFrame;
class CPU_ExternalFunction;
/// Immutable timing record for one named timer: its name, the accumulated
/// time in microseconds and the number of calls.
class PerformanceCounter
{
public:
    /// \param n     Name of the timer. A null pointer is stored as an empty name
    ///              (constructing std::string from a null pointer is undefined behaviour).
    /// \param us    Total accumulated time, in microseconds.
    /// \param calls Number of recorded calls.
    PerformanceCounter(const char* n, size_t us, size_t calls)
        : m_name(n != nullptr ? n : "")
        , m_total_microseconds(us)
        , m_call_count(calls)
    {
    }

    /// \returns The timer name.
    const std::string& name() const { return m_name; }
    /// \returns Total accumulated time, in microseconds.
    size_t total_microseconds() const { return m_total_microseconds; }
    /// \returns Number of recorded calls.
    size_t call_count() const { return m_call_count; }
private:
    std::string m_name;
    size_t m_total_microseconds;
    size_t m_call_count;
};
using EntryPoint_t = void(void** inputs, void** outputs);
using EntryPoint = std::function<EntryPoint_t>;
......@@ -55,6 +73,8 @@ namespace ngraph
void tensor_call(const std::vector<std::shared_ptr<TensorView>>& inputs,
const std::vector<std::shared_ptr<TensorView>>& outputs);
std::vector<ngraph::runtime::cpu::PerformanceCounter> get_performance_data() const;
protected:
std::shared_ptr<CPU_ExternalFunction> m_external_function;
EntryPoint m_compiled_function;
......
......@@ -12,6 +12,7 @@
// See the License for the specific language governing permissions and
// ----------------------------------------------------------------------------
#include <cstdlib>
#include <fstream>
#include <memory>
#include <string>
......@@ -99,8 +100,6 @@ public:
static StaticInitializers s_static_initializers;
using ngraph::descriptor::layout::DenseTensorViewLayout;
#define TI(x) type_index(typeid(x))
static const runtime::cpu::OpMap dispatcher{
......@@ -172,6 +171,7 @@ runtime::cpu::CPU_ExternalFunction::CPU_ExternalFunction(
const shared_ptr<ngraph::Function>& function, bool release_function)
: ngraph::runtime::ExternalFunction(function, release_function)
, m_compiled_function(nullptr)
, m_emit_timing(std::getenv("NGRAPH_CPU_EMIT_TIMING") != nullptr)
{
}
......@@ -188,7 +188,7 @@ void runtime::cpu::CPU_ExternalFunction::compile()
pass::Manager pass_manager;
pass_manager.register_pass<pass::TopologicalSort>();
// For now, just make everyone row-major.
pass_manager.register_pass<pass::AssignLayout<DenseTensorViewLayout>>();
pass_manager.register_pass<pass::AssignLayout<descriptor::layout::DenseTensorViewLayout>>();
pass_manager.register_pass<pass::Liveness>();
pass_manager.register_pass<pass::MemoryLayout>(64);
pass_manager.register_pass<pass::DumpSorted>(dump_filename);
......@@ -213,6 +213,7 @@ void runtime::cpu::CPU_ExternalFunction::compile()
#include "ngraph/runtime/kernel/replace_slice.hpp"
#include "ngraph/runtime/kernel/slice.hpp"
#include "ngraph/runtime/kernel/sum.hpp"
#include "ngraph/util.hpp"
using namespace ngraph::runtime::cpu::eigen;
using namespace ngraph::runtime;
......@@ -227,6 +228,75 @@ using namespace ngraph::runtime;
writer << "void *__dso_handle = 0;\n\n";
// When timing is enabled, emit into the generated source:
//   * one global ngraph::stopwatch named timer_<node name> for every op that is
//     not an op::Parameter, and
//   * four extern "C" accessors that expose those timers by index (the index is
//     the position of the op's name in `names`).
// The accessor names emitted here are the ones CPU_CallFrame::get_performance_data()
// looks up by string, so the two must be kept in sync.
if (m_emit_timing)
{
writer << "// Declare debug timers\n";
// Collect the names of all timed ops, across every function known to the pass manager.
vector<string> names;
for (shared_ptr<Function> current_function : pass_manager.get_state().get_functions())
{
for (shared_ptr<Node> node : current_function->get_ordered_ops())
{
if (!dynamic_pointer_cast<op::Parameter>(node))
{
names.push_back(node->get_name());
}
}
}
// The node name is used verbatim as part of a C++ identifier in the generated code.
for (const string& s : names)
{
writer << "ngraph::stopwatch timer_" << s << ";\n";
}
// get_debug_timer_count(): number of timers declared above.
writer << "extern \"C\" size_t get_debug_timer_count() { return " << names.size()
<< "; }\n";
// get_debug_timer_name(index): name of the timer, or "" for an out-of-range index.
writer << "extern \"C\" const char* get_debug_timer_name(size_t index)\n";
writer << "{\n";
writer.indent++;
writer << "const char* rc;\n";
writer << "switch(index)\n";
writer << "{\n";
for (size_t i = 0; i < names.size(); i++)
{
writer << "case " << i << ": rc = \"" << names[i] << "\"; break;\n";
}
writer << "default: rc = \"\";\n";
writer << "}\n";
writer << "return rc;\n";
writer.indent--;
writer << "}\n";
// get_debug_timer_microseconds(index): the timer's total microseconds, or 0 for an
// out-of-range index.
// NOTE(review): the emitted return type is `const size_t`; the top-level const on a
// by-value return is redundant.
writer << "extern \"C\" const size_t get_debug_timer_microseconds(size_t index)\n";
writer << "{\n";
writer.indent++;
writer << "size_t rc;\n";
writer << "switch(index)\n";
writer << "{\n";
for (size_t i = 0; i < names.size(); i++)
{
writer << "case " << i << ": rc = timer_" << names[i]
<< ".get_total_microseconds(); break;\n";
}
writer << "default: rc = 0;\n";
writer << "}\n";
writer << "return rc;\n";
writer.indent--;
writer << "}\n";
// get_debug_timer_call_count(index): the timer's call count, or 0 for an
// out-of-range index.
writer << "extern \"C\" const size_t get_debug_timer_call_count(size_t index)\n";
writer << "{\n";
writer.indent++;
writer << "size_t rc;\n";
writer << "switch(index)\n";
writer << "{\n";
for (size_t i = 0; i < names.size(); i++)
{
writer << "case " << i << ": rc = timer_" << names[i] << ".get_call_count(); break;\n";
}
writer << "default: rc = 0;\n";
writer << "}\n";
writer << "return rc;\n";
writer.indent--;
writer << "}\n";
writer << "\n";
}
writer << "// Declare all functions\n";
for (shared_ptr<Function> f : pass_manager.get_state().get_functions())
{
......@@ -341,7 +411,15 @@ using namespace ngraph::runtime;
auto tv = output.get_tensor_view();
out.push_back(TensorViewWrapper(tv));
}
if (m_emit_timing)
{
emit_debug_function_entry(writer, node.get(), in, out);
}
handler->second(&emitter, node.get(), in, out);
if (m_emit_timing)
{
emit_debug_function_exit(writer, node.get(), in, out);
}
}
writer.indent--;
......@@ -359,20 +437,20 @@ using namespace ngraph::runtime;
out << code;
out.close();
compiler.reset(new codegen::Compiler());
execution_engine.reset(new codegen::ExecutionEngine());
m_compiler.reset(new codegen::Compiler());
m_execution_engine.reset(new codegen::ExecutionEngine());
compiler->set_precompiled_header_source(pch_header_source);
m_compiler->set_precompiled_header_source(pch_header_source);
auto llvm_module = compiler->compile(code);
auto llvm_module = m_compiler->compile(code);
if (llvm_module == nullptr)
{
throw runtime_error("function failed to compile");
}
execution_engine->add_module(llvm_module);
execution_engine->finalize();
m_compiled_function = execution_engine->find_function<EntryPoint_t>(function_name);
m_execution_engine->add_module(llvm_module);
m_execution_engine->finalize();
m_compiled_function = m_execution_engine->find_function<EntryPoint_t>(function_name);
assert(m_compiled_function);
m_is_compiled = true;
......@@ -392,3 +470,27 @@ shared_ptr<ngraph::runtime::CallFrame> runtime::cpu::CPU_ExternalFunction::make_
return make_shared<ngraph::runtime::cpu::CPU_CallFrame>(shared_from_this(),
m_compiled_function);
}
/// Writes the statement that starts the debug timer of \p node into the generated code.
///
/// Nothing is written for op::Parameter nodes. \p in and \p out are accepted but not
/// used here.
void runtime::cpu::CPU_ExternalFunction::emit_debug_function_entry(
    codegen::CodeWriter& writer,
    Node* node,
    const std::vector<TensorViewWrapper>& in,
    const std::vector<TensorViewWrapper>& out)
{
    const bool is_parameter = dynamic_cast<op::Parameter*>(node) != nullptr;
    if (is_parameter)
    {
        return;
    }
    writer << "timer_" << node->get_name() << ".start();\n";
}
/// Writes the statement that stops the debug timer of \p node into the generated code,
/// followed by a blank line.
///
/// Nothing is written for op::Parameter nodes. \p in and \p out are accepted but not
/// used here.
void runtime::cpu::CPU_ExternalFunction::emit_debug_function_exit(
    codegen::CodeWriter& writer,
    Node* node,
    const std::vector<TensorViewWrapper>& in,
    const std::vector<TensorViewWrapper>& out)
{
    const bool is_parameter = dynamic_cast<op::Parameter*>(node) != nullptr;
    if (is_parameter)
    {
        return;
    }
    writer << "timer_" << node->get_name() << ".stop();\n\n";
}
......@@ -20,6 +20,7 @@
#include <typeinfo>
#include <unordered_map>
#include "ngraph/codegen/code_writer.hpp"
#include "ngraph/codegen/compiler.hpp"
#include "ngraph/codegen/execution_engine.hpp"
#include "ngraph/function.hpp"
......@@ -47,6 +48,8 @@ namespace ngraph
class CPU_ExternalFunction : public ngraph::runtime::ExternalFunction,
public std::enable_shared_from_this<CPU_ExternalFunction>
{
friend class CPU_CallFrame;
public:
CPU_ExternalFunction(const std::shared_ptr<ngraph::Function>& function,
bool release_function = true);
......@@ -58,8 +61,18 @@ namespace ngraph
EntryPoint m_compiled_function;
private:
std::unique_ptr<codegen::Compiler> compiler;
std::unique_ptr<codegen::ExecutionEngine> execution_engine;
void emit_debug_function_entry(codegen::CodeWriter& writer,
Node* node,
const std::vector<TensorViewWrapper>& in,
const std::vector<TensorViewWrapper>& out);
void emit_debug_function_exit(codegen::CodeWriter& writer,
Node* node,
const std::vector<TensorViewWrapper>& in,
const std::vector<TensorViewWrapper>& out);
std::unique_ptr<codegen::Compiler> m_compiler;
std::unique_ptr<codegen::ExecutionEngine> m_execution_engine;
bool m_emit_timing;
};
}
}
......
......@@ -26,8 +26,6 @@
using namespace std;
map<string, ngraph::stopwatch*> ngraph::stopwatch_statistics;
void ngraph::dump(ostream& out, const void* _data, size_t _size)
{
auto flags = out.flags();
......
......@@ -30,7 +30,6 @@ namespace ngraph
class Node;
class Function;
class stopwatch;
extern std::map<std::string, stopwatch*> stopwatch_statistics;
template <typename T>
std::string join(const T& v, const std::string& sep = ", ")
......@@ -105,21 +104,6 @@ namespace ngraph
class stopwatch
{
public:
stopwatch() {}
stopwatch(const std::string& name)
: m_name{name}
{
stopwatch_statistics.insert({m_name, this});
}
~stopwatch()
{
if (m_name.size() > 0)
{
stopwatch_statistics.find(m_name);
}
}
void start()
{
if (m_active == false)
......@@ -169,7 +153,6 @@ namespace ngraph
std::chrono::high_resolution_clock::duration::zero();
std::chrono::nanoseconds m_last_time;
size_t m_total_count = 0;
std::string m_name;
};
/// Parses a string containing a literal of the underlying type.
......
......@@ -22,6 +22,7 @@ include_directories(
)
set (SRC
backend_performance.cpp
builder.cpp
builder_autobroadcast.cpp
build_graph.cpp
......@@ -47,6 +48,8 @@ set (SRC
uuid.cpp
)
add_subdirectory(models)
#================================================================================================
# To auto generate a suite of unit tests for a backend add a line like this
# set(BACKEND_NAMES ${BACKEND_NAMES} "BACKEND_NAME_GOES_HERE")
......
// ----------------------------------------------------------------------------
// Copyright 2017 Nervana Systems Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// ----------------------------------------------------------------------------
#include <algorithm>
#include <cstdlib>
#include <sstream>
#include <string>
#include <vector>
#include "gtest/gtest.h"
#include "ngraph/codegen/compiler.hpp"
#include "ngraph/codegen/execution_engine.hpp"
#include "ngraph/file_util.hpp"
#include "ngraph/log.hpp"
#include "ngraph/runtime/backend.hpp"
#include "ngraph/runtime/call_frame.hpp"
#include "ngraph/runtime/cpu/cpu_call_frame.hpp"
#include "ngraph/runtime/manager.hpp"
#include "ngraph/serializer.hpp"
#include "ngraph/util.hpp"
#include "util/random.hpp"
using namespace std;
using namespace ngraph;
// Starting point CPU: 1.2ms/iteration
// Creates a primary tensor view on `backend` with the element type and shape
// described by `value`.
shared_ptr<runtime::TensorView> make_tensor(runtime::Backend& backend, const ValueType& value)
{
    return backend.make_primary_tensor_view(value.get_element_type(), value.get_shape());
}
// Benchmark: deserializes the MXNet MNIST MLP forward model, compiles it for the
// "CPU" manager, runs it 1000 times on uniformly random inputs and prints the
// average wall-clock time per call.
TEST(benchmark, mxnet_mnist_mlp_forward)
{
    const size_t iteration_count = 1000;
    test::Uniform<float> rng{-1, 1, 0};

    // Load the serialized model.
    const string json_path = file_util::path_join(SERIALIZED_ZOO, "mxnet/mnist_mlp_forward.json");
    const string json_string = file_util::read_file_to_string(json_path);
    stringstream model_stream(json_string);
    shared_ptr<Function> f = ngraph::deserialize(model_stream);

    // Compile and build a call frame.
    auto manager = runtime::Manager::get("CPU");
    auto external = manager->compile(f);
    auto backend = manager->allocate_backend();
    auto cf = backend->make_call_frame(external);

    // One randomly initialized tensor per function parameter.
    vector<shared_ptr<runtime::Value>> args;
    for (shared_ptr<op::Parameter> param : f->get_parameters())
    {
        auto tensor = make_tensor(*backend, *(param->get_value_type()));
        rng.initialize(tensor);
        args.push_back(tensor);
    }
    shared_ptr<const ValueType> result_type = f->get_result_type();
    auto result = make_tensor(*backend, *result_type);

    // Time the whole batch of calls.
    stopwatch timer;
    timer.start();
    for (size_t iteration = 0; iteration < iteration_count; ++iteration)
    {
        cf->call(args, {result});
    }
    timer.stop();

    const float elapsed_ms = timer.get_milliseconds();
    cout << elapsed_ms / static_cast<float>(iteration_count) << "ms per iteration\n";
}
// Benchmark: deserializes the MXNet 10-bucket LSTM model, compiles it for the
// "CPU" manager, runs it 10 times on uniformly random inputs and prints the
// average wall-clock time per call. When per-op timing was emitted into the
// generated code, it also logs each op's total time, slowest first.
TEST(benchmark, mxnet_10_bucket_lstm)
{
    const size_t iteration_count = 10;
    test::Uniform<float> rng{-1, 1, 0};

    // Load the serialized model.
    const string json_path = file_util::path_join(SERIALIZED_ZOO, "mxnet/10_bucket_LSTM.json");
    const string json_string = file_util::read_file_to_string(json_path);
    stringstream ss(json_string);
    shared_ptr<Function> f = ngraph::deserialize(ss);

    // Compile and build a call frame.
    auto manager = runtime::Manager::get("CPU");
    auto external = manager->compile(f);
    auto backend = manager->allocate_backend();
    auto cf = backend->make_call_frame(external);

    // Checked downcast: fail the test cleanly if the call frame is not a CPU call frame
    // instead of invoking CPU-only members through a mis-typed pointer.
    auto cpu_cf = dynamic_pointer_cast<runtime::cpu::CPU_CallFrame>(cf);
    ASSERT_TRUE(cpu_cf != nullptr);

    // One randomly initialized tensor per function parameter.
    vector<shared_ptr<runtime::Value>> args;
    for (shared_ptr<op::Parameter> param : f->get_parameters())
    {
        auto arg = make_tensor(*backend, *(param->get_value_type()));
        rng.initialize(arg);
        args.push_back(arg);
    }
    shared_ptr<const ValueType> result_type = f->get_result_type();
    auto result = make_tensor(*backend, *result_type);

    // Time the whole batch of calls.
    stopwatch t1;
    t1.start();
    for (size_t i = 0; i < iteration_count; i++)
    {
        cf->call(args, {result});
    }
    t1.stop();
    const float time = t1.get_milliseconds();
    cout << time / static_cast<float>(iteration_count) << "ms per iteration\n";

    // The timer accessors are only emitted into the generated code when
    // NGRAPH_CPU_EMIT_TIMING is set; without them get_performance_data() throws,
    // so skip the per-op report rather than fail the benchmark.
    if (std::getenv("NGRAPH_CPU_EMIT_TIMING") != nullptr)
    {
        vector<runtime::cpu::PerformanceCounter> perf_data = cpu_cf->get_performance_data();
        sort(perf_data.begin(),
             perf_data.end(),
             [](const runtime::cpu::PerformanceCounter& p1,
                const runtime::cpu::PerformanceCounter& p2) {
                 return p1.total_microseconds() > p2.total_microseconds();
             });
        for (const runtime::cpu::PerformanceCounter& p : perf_data)
        {
            NGRAPH_INFO << p.name() << ", " << p.total_microseconds();
        }
    }
}
# Append -DSERIALIZED_ZOO="<this directory>" to the C++ flags so sources can use
# SERIALIZED_ZOO as a string literal holding this directory's path; PARENT_SCOPE
# sets the variable in the including directory's scope.
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DSERIALIZED_ZOO=\\\"${CMAKE_CURRENT_SOURCE_DIR}\\\"" PARENT_SCOPE)
This diff is collapsed.
This diff is collapsed.
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment