Unverified Commit 7f3dc2d7 authored by Robert Kimball's avatar Robert Kimball Committed by GitHub

Optimizations to reduce compile time (#357)

* much faster compile time
* Remove all variables and just directly access inputs, output, and temps.
* compare layouts when checking if two ops are equal
* make performance counters available to all backends
parent 524d04fc
......@@ -76,5 +76,6 @@ void codegen::ExecutionEngine::finalize()
/// Look up a JIT-compiled symbol by name.
/// \param func_name Mangled name of the function to resolve.
/// \return Address of the compiled function, or nullptr when it is not found.
void* codegen::ExecutionEngine::get_pointer_to_named_function(const std::string& func_name)
{
    // Set AbortOnFailure flag to false so the call fails by returning nullptr
    // instead of terminating the process when the symbol is missing.
    return m_execution_engine->getPointerToNamedFunction(func_name, false);
}
......@@ -18,6 +18,7 @@
#include <typeinfo>
#include "ngraph/autodiff/adjoints.hpp"
#include "ngraph/descriptor/layout/tensor_view_layout.hpp"
#include "ngraph/descriptor/primary_tensor_view.hpp"
#include "ngraph/ops/parameter.hpp"
......@@ -334,3 +335,53 @@ bool Node::has_same_type(std::shared_ptr<const Node> node) const
}
return true;
}
bool Node::is_functionally_identical(const Node& other) const
{
    // Two nodes can only be functionally identical when they are the same kind
    // of op (same description string) with matching input/output signatures.
    if (this->description() != other.description())
    {
        return false;
    }
    const deque<descriptor::Input>& i1 = this->get_inputs();
    const deque<descriptor::Input>& i2 = other.get_inputs();
    const deque<descriptor::Output>& o1 = this->get_outputs();
    const deque<descriptor::Output>& o2 = other.get_outputs();
    if (i1.size() != i2.size() || o1.size() != o2.size())
    {
        return false;
    }
    // Each input must agree in shape and in tensor-view layout; bail out on
    // the first mismatch rather than scanning the remaining tensors.
    for (size_t i = 0; i < i1.size(); i++)
    {
        auto tvl1 = i1[i].get_output().get_tensor_view()->get_tensor_view_layout();
        auto tvl2 = i2[i].get_output().get_tensor_view()->get_tensor_view_layout();
        if (tvl1->get_shape() != tvl2->get_shape() || *tvl1 != *tvl2)
        {
            return false;
        }
    }
    // Same check for each output.
    for (size_t i = 0; i < o1.size(); i++)
    {
        auto tvl1 = o1[i].get_tensor_view()->get_tensor_view_layout();
        auto tvl2 = o2[i].get_tensor_view()->get_tensor_view_layout();
        if (tvl1->get_shape() != tvl2->get_shape() || *tvl1 != *tvl2)
        {
            return false;
        }
    }
    return true;
}
......@@ -160,6 +160,8 @@ namespace ngraph
// True if this and node have one output with same element type and shape
bool has_same_type(std::shared_ptr<const Node> node) const;
virtual bool is_functionally_identical(const Node&) const;
protected:
void add_output(const element::Type& element_type, const Shape& shape);
void assert_argument_list_equivalency(const Nodes& b);
......
......@@ -102,3 +102,18 @@ void op::Concat::generate_adjoints(autodiff::Adjoints& adjoints, const std::shar
pos = next_pos;
}
}
bool op::Concat::is_functionally_identical(const Node& other) const
{
bool rc = true;
if (Node::is_functionally_identical(other))
{
const Concat& concat = dynamic_cast<const Concat&>(other);
rc &= m_concatenation_axis == concat.m_concatenation_axis;
}
else
{
rc = false;
}
return rc;
}
......@@ -74,6 +74,8 @@ namespace ngraph
/// \return The concatenation axis.
size_t get_concatenation_axis() const { return m_concatenation_axis; }
bool is_functionally_identical(const Node&) const override;
protected:
virtual void generate_adjoints(autodiff::Adjoints& adjoints,
const std::shared_ptr<Node>& delta) override;
......
......@@ -43,7 +43,9 @@ std::string to_cpp_string(T value)
}
else
{
rc = to_string(value);
stringstream ss;
ss << value;
rc = ss.str();
}
return rc;
}
......
......@@ -257,6 +257,30 @@ std::shared_ptr<Node>
m_padding_above);
}
bool op::Convolution::is_functionally_identical(const Node& other) const
{
    // Base check covers op kind plus input/output shapes and layouts; the
    // rest compares every convolution attribute.
    if (!Node::is_functionally_identical(other))
    {
        return false;
    }
    const Convolution& rhs = dynamic_cast<const Convolution&>(other);
    return m_window_movement_strides == rhs.m_window_movement_strides &&
           m_window_dilation_strides == rhs.m_window_dilation_strides &&
           m_input_channel_count == rhs.m_input_channel_count &&
           m_output_channel_count == rhs.m_output_channel_count &&
           m_input_image_shape == rhs.m_input_image_shape &&
           m_output_image_shape == rhs.m_output_image_shape &&
           m_window_physical_shape == rhs.m_window_physical_shape &&
           m_window_virtual_shape == rhs.m_window_virtual_shape &&
           m_batch_size == rhs.m_batch_size &&
           m_image_dimension_count == rhs.m_image_dimension_count;
}
/*
void op::Convolution::generate_adjoints(autodiff::Adjoints& adjoints, const std::shared_ptr<Node>& delta)
{
......
......@@ -125,6 +125,8 @@ namespace ngraph
size_t get_batch_size() const { return m_batch_size; }
/// \return The number of image dimensions.
size_t get_image_dimension_count() const { return m_image_dimension_count; }
bool is_functionally_identical(const Node&) const override;
protected:
Strides m_window_movement_strides;
Strides m_window_dilation_strides;
......
......@@ -143,3 +143,18 @@ void op::Dot::generate_adjoints(autodiff::Adjoints& adjoints, const std::shared_
auto x_reshaped_dot_delta = make_shared<Dot>(x_reshaped, delta, I_shape.size()); // JK
adjoints.add_delta(y, x_reshaped_dot_delta);
}
bool op::Dot::is_functionally_identical(const Node& other) const
{
    // Identical only when the base checks pass and both ops reduce the same
    // number of axes.
    if (!Node::is_functionally_identical(other))
    {
        return false;
    }
    const Dot& rhs = dynamic_cast<const Dot&>(other);
    return m_reduction_axes_count == rhs.m_reduction_axes_count;
}
......@@ -85,6 +85,7 @@ namespace ngraph
return std::make_shared<Dot>(
new_args.at(0), new_args.at(1), m_reduction_axes_count);
}
bool is_functionally_identical(const Node&) const override;
protected:
size_t m_reduction_axes_count;
......
......@@ -47,3 +47,8 @@ op::FunctionCall::FunctionCall(std::shared_ptr<Function> function,
add_output(function->get_output_element_type(i), function->get_output_shape(i));
}
}
// Always false: no structural comparison of the called Functions is
// attempted, so two FunctionCall nodes are conservatively treated as
// distinct even when they invoke the same Function.
bool op::FunctionCall::is_functionally_identical(const Node&) const
{
return false;
}
......@@ -55,6 +55,8 @@ namespace ngraph
return std::make_shared<FunctionCall>(m_function, new_args);
}
/// \return The function to be called.
bool is_functionally_identical(const Node&) const override;
/// \return A singleton vector containing the function to be called.
std::vector<std::shared_ptr<Function>> get_functions() const override
{
......
......@@ -147,6 +147,27 @@ op::MaxPool::MaxPool(const std::shared_ptr<Node>& arg, const Shape& window_shape
{
}
bool op::MaxPool::is_functionally_identical(const Node& other) const
{
    // Base check covers op kind plus input/output shapes and layouts; the
    // rest compares every pooling attribute.
    if (!Node::is_functionally_identical(other))
    {
        return false;
    }
    const MaxPool& rhs = dynamic_cast<const MaxPool&>(other);
    return m_window_shape == rhs.m_window_shape &&
           m_window_movement_strides == rhs.m_window_movement_strides &&
           m_channel_count == rhs.m_channel_count &&
           m_input_image_shape == rhs.m_input_image_shape &&
           m_output_image_shape == rhs.m_output_image_shape &&
           m_batch_size == rhs.m_batch_size &&
           m_image_dimension_count == rhs.m_image_dimension_count;
}
/*
void op::MaxPool::generate_adjoints(autodiff::Adjoints& adjoints, const std::shared_ptr<Node>& delta)
{
......
......@@ -77,6 +77,8 @@ namespace ngraph
size_t get_batch_size() const { return m_batch_size; }
/// \return The number of image dimensions.
size_t get_image_dimension_count() const { return m_image_dimension_count; }
bool is_functionally_identical(const Node&) const override;
protected:
Shape m_window_shape;
Strides m_window_movement_strides;
......
......@@ -41,3 +41,19 @@ op::OneHot::OneHot(const std::shared_ptr<Node>& arg, const Shape& shape, size_t
set_value_type_checked(make_shared<TensorViewType>(input_element_type, shape));
}
bool op::OneHot::is_functionally_identical(const Node& other) const
{
    // Identical only when the base checks pass and both the output shape and
    // the one-hot axis match.
    if (!Node::is_functionally_identical(other))
    {
        return false;
    }
    const OneHot& rhs = dynamic_cast<const OneHot&>(other);
    return m_shape == rhs.m_shape && m_one_hot_axis == rhs.m_one_hot_axis;
}
......@@ -60,6 +60,8 @@ namespace ngraph
/// \return The index of the one-hot axis.
size_t get_one_hot_axis() const { return m_one_hot_axis; }
bool is_functionally_identical(const Node&) const override;
protected:
Shape m_shape;
size_t m_one_hot_axis;
......
......@@ -90,3 +90,8 @@ op::Reduce::Reduce(const std::shared_ptr<Node>& arg_reductee,
add_output(input_reductee.get_element_type(), result_shape);
}
// Always false: Reduce embeds a nested reduction Function, and no structural
// comparison of those functions is attempted, so two Reduce nodes are
// conservatively treated as distinct.
bool op::Reduce::is_functionally_identical(const Node& /*other*/) const
{
    return false;
}
......@@ -110,6 +110,8 @@ namespace ngraph
}
/// \return The axis positions (0-based) to be eliminated through reduction.
const AxisSet& get_reduction_axes() const { return m_reduction_axes; }
bool is_functionally_identical(const Node&) const override;
protected:
std::shared_ptr<Function> m_reduction_function;
AxisSet m_reduction_axes;
......
......@@ -132,3 +132,20 @@ void op::ReplaceSlice::generate_adjoints(autodiff::Adjoints& adjoints,
adjoints.add_delta(
y, std::make_shared<op::Slice>(delta, m_lower_bounds, m_upper_bounds, m_strides));
}
bool op::ReplaceSlice::is_functionally_identical(const Node& other) const
{
bool rc = true;
if (Node::is_functionally_identical(other))
{
const ReplaceSlice& slice = dynamic_cast<const ReplaceSlice&>(other);
rc &= m_lower_bounds == slice.m_lower_bounds;
rc &= m_upper_bounds == slice.m_upper_bounds;
rc &= m_strides == slice.m_strides;
}
else
{
rc = false;
}
return rc;
}
......@@ -87,6 +87,8 @@ namespace ngraph
const Coordinate& get_upper_bounds() const { return m_upper_bounds; }
/// \return The slicing strides.
const Strides& get_strides() const { return m_strides; }
bool is_functionally_identical(const Node&) const override;
protected:
virtual void generate_adjoints(autodiff::Adjoints& adjoints,
const std::shared_ptr<Node>& delta) override;
......
......@@ -99,3 +99,19 @@ void op::Reshape::generate_adjoints(autodiff::Adjoints& adjoints,
adjoints.add_delta(get_input_op(0), reshape);
}
bool op::Reshape::is_functionally_identical(const Node& other) const
{
bool rc = true;
if (Node::is_functionally_identical(other))
{
const Reshape& reshape = dynamic_cast<const Reshape&>(other);
rc &= m_input_order == reshape.m_input_order;
rc &= m_output_shape == reshape.m_output_shape;
}
else
{
rc = false;
}
return rc;
}
......@@ -79,6 +79,8 @@ namespace ngraph
const AxisVector& get_input_order() const { return m_input_order; }
/// \return The shape of the output tensor.
const Shape& get_output_shape() const { return m_output_shape; }
bool is_functionally_identical(const Node&) const override;
protected:
virtual void generate_adjoints(autodiff::Adjoints& adjoints,
const std::shared_ptr<Node>& delta) override;
......
......@@ -97,3 +97,20 @@ void op::Slice::generate_adjoints(autodiff::Adjoints& adjoints, const std::share
adjoints.add_delta_to_slice(x, delta, m_lower_bounds, m_upper_bounds, m_strides);
}
bool op::Slice::is_functionally_identical(const Node& other) const
{
bool rc = true;
if (Node::is_functionally_identical(other))
{
const Slice& slice = dynamic_cast<const Slice&>(other);
rc &= m_lower_bounds == slice.m_lower_bounds;
rc &= m_upper_bounds == slice.m_upper_bounds;
rc &= m_strides == slice.m_strides;
}
else
{
rc = false;
}
return rc;
}
......@@ -85,6 +85,8 @@ namespace ngraph
const Coordinate& get_upper_bounds() const { return m_upper_bounds; }
/// \return The slicing strides.
const Strides& get_strides() const { return m_strides; }
bool is_functionally_identical(const Node&) const override;
protected:
virtual void generate_adjoints(autodiff::Adjoints& adjoints,
const std::shared_ptr<Node>& delta) override;
......
......@@ -60,3 +60,18 @@ void op::Sum::generate_adjoints(autodiff::Adjoints& adjoints, const std::shared_
adjoints.add_delta(x, make_shared<op::Broadcast>(delta, x_shape, m_reduction_axes));
}
bool op::Sum::is_functionally_identical(const Node& other) const
{
bool rc = true;
if (Node::is_functionally_identical(other))
{
const Sum& slice = dynamic_cast<const Sum&>(other);
rc &= m_reduction_axes == slice.m_reduction_axes;
}
else
{
rc = false;
}
return rc;
}
......@@ -93,6 +93,8 @@ namespace ngraph
/// \return The axis positions (0-based) to be eliminated through summation.
const AxisSet& get_reduction_axes() const { return m_reduction_axes; }
bool is_functionally_identical(const Node&) const override;
protected:
virtual void generate_adjoints(autodiff::Adjoints& adjoints,
const std::shared_ptr<Node>& delta) override;
......
......@@ -24,6 +24,28 @@ namespace ngraph
{
namespace runtime
{
class PrimaryTensorView;
class Value;
// Per-op timing record made available to all backends: the op's name, the
// accumulated wall-clock time, and how many times the op was executed.
class PerformanceCounter
{
public:
    /// \param n     Name of the timed op (copied into the counter).
    /// \param us    Total accumulated time, in microseconds.
    /// \param calls Number of times the op was invoked.
    PerformanceCounter(const char* n, size_t us, size_t calls)
        : m_name(n)
        , m_total_microseconds(us)
        , m_call_count(calls)
    {
    }
    /// \return Name of the timed op.
    const std::string& name() const { return m_name; }
    /// \return Total accumulated time over all calls, in microseconds.
    size_t total_microseconds() const { return m_total_microseconds; }
    /// \return Average time per call, or 0 when the op was never called
    ///         (guards against division by zero on a zero call count).
    size_t microseconds() const
    {
        return m_call_count == 0 ? 0 : m_total_microseconds / m_call_count;
    }
    /// \return Number of times the op was invoked.
    size_t call_count() const { return m_call_count; }

private:
    std::string m_name;
    size_t m_total_microseconds;
    size_t m_call_count;
};
// A VM for executing lightly-compiled graph functions.
class CallFrame
{
......@@ -38,6 +60,11 @@ namespace ngraph
/// @brief Invoke the function with tuples pre-expanded to their underlying tensor views.
virtual void tensor_call(const TensorViewPtrs& inputs,
const TensorViewPtrs& outputs) = 0;
/// @brief Per-op performance counters for the most recent executions.
///        Backends that do not collect timing inherit this default, which
///        returns an empty vector.
virtual std::vector<PerformanceCounter> get_performance_data() const
{
return std::vector<PerformanceCounter>();
}
};
}
}
......@@ -71,39 +71,26 @@ void runtime::cpu::CPU_CallFrame::call(
tensor_call(inputs, outputs);
}
/// Collect per-op timing data from the JIT-compiled module's debug-timer
/// accessor functions.
/// \return One PerformanceCounter per timer; empty when timing was not
///         emitted (accessors absent) or no execution engine exists.
vector<runtime::PerformanceCounter> runtime::cpu::CPU_CallFrame::get_performance_data() const
{
    vector<runtime::PerformanceCounter> rc;
    auto* engine = m_external_function->m_execution_engine.get();
    if (engine)
    {
        auto get_count = engine->find_function<size_t()>("get_debug_timer_count");
        auto get_name = engine->find_function<const char*(size_t)>("get_debug_timer_name");
        auto get_microseconds =
            engine->find_function<size_t(size_t)>("get_debug_timer_microseconds");
        auto get_call_count = engine->find_function<size_t(size_t)>("get_debug_timer_call_count");

        // The accessors only exist when the function was compiled with timing
        // enabled; quietly return no data instead of throwing when absent.
        if (get_count && get_name && get_microseconds && get_call_count)
        {
            size_t count = get_count();
            for (size_t i = 0; i < count; i++)
            {
                rc.push_back({get_name(i), get_microseconds(i), get_call_count(i)});
            }
        }
    }
    return rc;
}
......@@ -33,25 +33,6 @@ namespace ngraph
class CPU_CallFrame;
class CPU_ExternalFunction;
class PerformanceCounter
{
public:
PerformanceCounter(const char* n, size_t us, size_t calls)
: m_name(n)
, m_total_microseconds(us)
, m_call_count(calls)
{
}
const std::string& name() const { return m_name; }
size_t total_microseconds() const { return m_total_microseconds; }
size_t microseconds() const { return m_total_microseconds / m_call_count; }
size_t call_count() const { return m_call_count; }
private:
std::string m_name;
size_t m_total_microseconds;
size_t m_call_count;
};
using EntryPoint_t = void(void** inputs, void** outputs);
using EntryPoint = std::function<EntryPoint_t>;
......@@ -66,15 +47,17 @@ namespace ngraph
/// @brief Invoke the function with values matching the signature of the function.
///
/// Tuples will be expanded into their tensor views to build the call frame.
void call(const std::vector<std::shared_ptr<runtime::TensorView>>& inputs,
const std::vector<std::shared_ptr<runtime::TensorView>>& outputs);
void
call(const std::vector<std::shared_ptr<runtime::TensorView>>& inputs,
const std::vector<std::shared_ptr<runtime::TensorView>>& outputs) override;
/// @brief Invoke the function with tuples pre-expanded to their underlying
/// tensor views.
void tensor_call(const std::vector<std::shared_ptr<TensorView>>& inputs,
const std::vector<std::shared_ptr<TensorView>>& outputs);
const std::vector<std::shared_ptr<TensorView>>& outputs) override;
std::vector<ngraph::runtime::cpu::PerformanceCounter> get_performance_data() const;
std::vector<ngraph::runtime::PerformanceCounter>
get_performance_data() const override;
protected:
std::shared_ptr<CPU_ExternalFunction> m_external_function;
......
......@@ -78,6 +78,7 @@ namespace ngraph
std::unique_ptr<codegen::ExecutionEngine> m_execution_engine;
bool m_emit_timing;
bool m_use_tbb;
std::unordered_map<std::string, std::string> m_variable_name_map;
};
}
}
......
......@@ -19,8 +19,10 @@
using namespace std;
using namespace ngraph;
// Wrap a tensor view, optionally under an alias name that overrides the
// tensor's own name (see get_name). The alias defaults to empty.
runtime::cpu::TensorViewWrapper::TensorViewWrapper(const shared_ptr<descriptor::TensorView>& tv,
                                                   const string& alias)
    : m_tensor_view(tv)
    , m_alias(alias)
{
}
......@@ -46,7 +48,14 @@ const element::Type& runtime::cpu::TensorViewWrapper::get_element_type() const
/// \return The alias when one was assigned to this wrapper, otherwise the
///         wrapped tensor's own name.
const std::string& runtime::cpu::TensorViewWrapper::get_name() const
{
    if (m_alias.empty())
    {
        return m_tensor_view->get_tensor().get_name();
    }
    else
    {
        return m_alias;
    }
}
const std::string& runtime::cpu::TensorViewWrapper::get_type() const
......
......@@ -33,7 +33,8 @@ namespace ngraph
class ngraph::runtime::cpu::TensorViewWrapper
{
public:
TensorViewWrapper(const std::shared_ptr<descriptor::TensorView>&);
TensorViewWrapper(const std::shared_ptr<descriptor::TensorView>&,
const std::string& alias = "");
size_t get_size() const;
const std::vector<size_t>& get_shape() const;
......@@ -45,4 +46,5 @@ public:
private:
std::shared_ptr<descriptor::TensorView> m_tensor_view;
std::string m_alias;
};
......@@ -13,6 +13,8 @@
// ----------------------------------------------------------------------------
#include <algorithm>
#include <cstdlib>
#include <iomanip>
#include "ngraph/runtime/interpreter/int_call_frame.hpp"
#include "ngraph/runtime/interpreter/int_tensor_view.hpp"
......@@ -24,6 +26,7 @@ runtime::interpreter::INT_CallFrame::INT_CallFrame(shared_ptr<ExternalFunction>
shared_ptr<Function> func)
: m_external_function(external_function)
, m_function(func)
, m_emit_timing(std::getenv("NGRAPH_INTERPRETER_EMIT_TIMING") != nullptr)
{
}
......@@ -136,7 +139,16 @@ void runtime::interpreter::INT_CallFrame::call(
secondary_type = op->get_inputs().at(0).get_tensor().get_element_type();
}
if (m_emit_timing)
{
m_timer_map[op.get()].start();
}
generate_calls(base_type, secondary_type, *op, inputs, outputs);
if (m_emit_timing)
{
stopwatch& timer = m_timer_map[op.get()];
timer.stop();
}
handle_output_alias(*op, output_alias_map, output_tvs);
......@@ -281,3 +293,16 @@ void runtime::interpreter::INT_CallFrame::call(
tensor_call(inputs, outputs);
}
/// Convert the interpreter's per-op stopwatch map into PerformanceCounter
/// records.
/// \return One counter per timed Node; empty when timing was never enabled.
vector<runtime::PerformanceCounter>
    runtime::interpreter::INT_CallFrame::get_performance_data() const
{
    vector<runtime::PerformanceCounter> rc;
    rc.reserve(m_timer_map.size());
    // Iterate by const reference: the previous by-value loop copied a
    // pair<const Node*, stopwatch> on every iteration.
    for (const auto& p : m_timer_map)
    {
        rc.emplace_back(p.first->get_name().c_str(),
                        p.second.get_total_microseconds(),
                        p.second.get_call_count());
    }
    return rc;
}
......@@ -112,13 +112,14 @@ public:
///
/// Tuples will be expanded into their tensor views to build the call frame.
void call(const std::vector<std::shared_ptr<runtime::TensorView>>& inputs,
const std::vector<std::shared_ptr<runtime::TensorView>>& outputs);
const std::vector<std::shared_ptr<runtime::TensorView>>& outputs) override;
std::vector<runtime::PerformanceCounter> get_performance_data() const override;
private:
/// @brief Invoke the function with tuples pre-expanded to their underlying
/// tensor views.
void tensor_call(const std::vector<std::shared_ptr<TensorView>>& inputs,
const std::vector<std::shared_ptr<TensorView>>& outputs);
const std::vector<std::shared_ptr<TensorView>>& outputs) override;
void tensor_call(const std::vector<std::shared_ptr<INT_TensorView>>& inputs,
const std::vector<std::shared_ptr<INT_TensorView>>& outputs);
void call(std::shared_ptr<Function> function,
......@@ -131,6 +132,9 @@ private:
std::shared_ptr<ExternalFunction> m_external_function;
std::shared_ptr<Function> m_function;
bool m_emit_timing;
std::unordered_map<const Node*, stopwatch> m_timer_map;
void generate_calls(const element::Type& base_type,
const element::Type& secondary_type,
ngraph::Node& op,
......
......@@ -42,10 +42,10 @@ static void copy_data(shared_ptr<runtime::TensorView> tv, const vector<T>& data)
}
static multimap<size_t, string>
agregate_timing(const vector<runtime::cpu::PerformanceCounter>& perf_data)
agregate_timing(const vector<runtime::PerformanceCounter>& perf_data)
{
unordered_map<string, size_t> timing;
for (const runtime::cpu::PerformanceCounter& p : perf_data)
for (const runtime::PerformanceCounter& p : perf_data)
{
string op = p.name().substr(0, p.name().find('_'));
timing[op] += p.microseconds();
......@@ -59,12 +59,13 @@ static multimap<size_t, string>
return rc;
}
void run_benchmark(const std::string& json_path, size_t iterations)
void run_benchmark(const string& json_path, const string& backend_name, size_t iterations)
{
bool emit_timing = (std::getenv("NGRAPH_CPU_EMIT_TIMING") != nullptr);
string env_var_name = "NGRAPH_" + backend_name + "_EMIT_TIMING";
bool emit_timing = (std::getenv(env_var_name.c_str()) != nullptr);
if (!emit_timing)
{
cout << "To get per-op timing set the environment variable NGRAPH_CPU_EMIT_TIMING\n";
cout << "To get per-op timing set the environment variable " << env_var_name << "\n";
}
test::Uniform<float> rng{-1, 1, 0};
......@@ -74,11 +75,10 @@ void run_benchmark(const std::string& json_path, size_t iterations)
stopwatch build_time;
build_time.start();
auto manager = runtime::Manager::get("CPU");
auto manager = runtime::Manager::get(backend_name);
auto external = manager->compile(f);
auto backend = manager->allocate_backend();
auto cf = backend->make_call_frame(external);
runtime::cpu::CPU_CallFrame* cpu_cf = static_cast<runtime::cpu::CPU_CallFrame*>(cf.get());
build_time.stop();
cout << "build_time " << build_time.get_milliseconds() << "ms" << endl;
......@@ -107,47 +107,60 @@ void run_benchmark(const std::string& json_path, size_t iterations)
float time = t1.get_milliseconds();
cout << time / iterations << "ms per iteration" << endl;
if (emit_timing)
vector<runtime::PerformanceCounter> perf_data = cf->get_performance_data();
sort(perf_data.begin(),
perf_data.end(),
[](const runtime::PerformanceCounter& p1, const runtime::PerformanceCounter& p2) {
return p1.total_microseconds() > p2.total_microseconds();
});
multimap<size_t, string> timing = agregate_timing(perf_data);
for (auto it = timing.rbegin(); it != timing.rend(); it++)
{
vector<runtime::cpu::PerformanceCounter> perf_data = cpu_cf->get_performance_data();
sort(perf_data.begin(),
perf_data.end(),
[](const runtime::cpu::PerformanceCounter& p1,
const runtime::cpu::PerformanceCounter& p2) {
return p1.total_microseconds() > p2.total_microseconds();
});
multimap<size_t, string> timing = agregate_timing(perf_data);
for (auto it = timing.rbegin(); it != timing.rend(); it++)
{
cout.imbue(locale(""));
cout << setw(15) << left << it->second << " " << setw(10) << right << it->first
<< "us\n";
}
cout.imbue(locale(""));
cout << setw(15) << left << it->second << " " << setw(10) << right << it->first << "us\n";
}
}
// Benchmark entry points. Each test loads a serialized model and runs it
// through run_benchmark(json_path, backend_name, iterations); the stale
// two-argument run_benchmark calls left over from before the backend-name
// parameter was added have been removed.
TEST(benchmark, mxnet_mnist_mlp_forward)
{
    const string json_path = file_util::path_join(SERIALIZED_ZOO, "mxnet/mnist_mlp_forward.json");
    run_benchmark(json_path, "CPU", 1000);
}

TEST(benchmark, mxnet_10_bucket_lstm)
{
    const string json_path = file_util::path_join(SERIALIZED_ZOO, "mxnet/10_bucket_LSTM.json");
    run_benchmark(json_path, "CPU", 10);
}

TEST(benchmark, mxnet_10_bucket_lstm_int)
{
    const string json_path = file_util::path_join(SERIALIZED_ZOO, "mxnet/10_bucket_LSTM.json");
    run_benchmark(json_path, "INTERPRETER", 10);
}

TEST(benchmark, mxnet_lstm_backward)
{
    const string json_path = file_util::path_join(SERIALIZED_ZOO, "mxnet/LSTM_backward.json");
    run_benchmark(json_path, "CPU", 10);
}

TEST(benchmark, mxnet_lstm_backward_int)
{
    const string json_path = file_util::path_join(SERIALIZED_ZOO, "mxnet/LSTM_backward.json");
    run_benchmark(json_path, "INTERPRETER", 1);
}

TEST(benchmark, mxnet_lstm_forward)
{
    const string json_path = file_util::path_join(SERIALIZED_ZOO, "mxnet/LSTM_forward.json");
    run_benchmark(json_path, "CPU", 10);
}

TEST(benchmark, mxnet_lstm_forward_int)
{
    const string json_path = file_util::path_join(SERIALIZED_ZOO, "mxnet/LSTM_forward.json");
    run_benchmark(json_path, "INTERPRETER", 10);
}
//
......
......@@ -1208,7 +1208,7 @@ TEST(${BACKEND_NAME}, log)
auto result = backend->make_primary_tensor_view(element::f32, shape);
cf->call({a}, {result});
EXPECT_EQ(loga, result->get_vector<float>());
EXPECT_TRUE(test::all_close(loga, result->get_vector<float>()));
}
TEST(${BACKEND_NAME}, maximum)
......@@ -2615,7 +2615,7 @@ TEST(${BACKEND_NAME}, cosh)
input.begin(), input.end(), input.begin(), [](float x) -> float { return coshf(x); });
cf->call({a}, {result});
EXPECT_EQ(input, result->get_vector<float>());
EXPECT_TRUE(test::all_close(input, result->get_vector<float>()));
}
TEST(${BACKEND_NAME}, tanh)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment