Unverified Commit b50c17bf authored by Robert Kimball, committed by GitHub

Merge branch 'master' into tsocha/improve-cmake-grama

parents b5549e0d 1eda1350
......@@ -32,7 +32,7 @@ int main()
auto t1 = std::make_shared<op::Multiply>(t0, c);
// Make the function
auto f = std::make_shared<Function>(NodeVector{t1},
auto f = std::make_shared<Function>(OutputVector{t1},
ParameterVector{a, b, c});
// Create the backend
......
......@@ -31,7 +31,7 @@ int main()
auto t1 = (a + b) * c;
// Make the function
auto f = std::make_shared<Function>(NodeVector{t1},
auto f = std::make_shared<Function>(OutputVector{t1},
ParameterVector{a, b, c});
// Get the backend
......
......@@ -175,8 +175,8 @@ int main(int argc, char* argv[])
auto delta = -learning_rate * loss;
// Updates
ngraph::autodiff::Adjoints adjoints(NodeVector{loss},
NodeVector{delta});
ngraph::autodiff::Adjoints adjoints(OutputVector{loss},
OutputVector{delta});
auto grad_W0 = adjoints.backprop_node(W0);
auto grad_b0 = adjoints.backprop_node(b0);
auto grad_W1 = adjoints.backprop_node(W1);
......@@ -231,7 +231,7 @@ int main(int argc, char* argv[])
NodeMap train_node_map;
auto train_function = clone_function(
Function(
NodeVector{loss, softmax, W0_next, b0_next, W1_next, b1_next},
OutputVector{loss, softmax, W0_next, b0_next, W1_next, b1_next},
ParameterVector{X, Y, N, learning_rate, W0, b0, W1, b1}),
train_node_map);
auto train_exec = backend->compile(train_function);
......@@ -240,7 +240,7 @@ int main(int argc, char* argv[])
// X, W0, b0, W1, b1 -> softmax
NodeMap inference_node_map;
auto inference_function = clone_function(
Function(NodeVector{softmax}, ParameterVector{X, W0, b0, W1, b1}),
Function(OutputVector{softmax}, ParameterVector{X, W0, b0, W1, b1}),
inference_node_map);
auto inference_exec = backend->compile(inference_function);
......
......@@ -172,8 +172,8 @@ int main(int argc, const char* argv[])
auto delta = -learning_rate * loss;
// Updates
ngraph::autodiff::Adjoints adjoints(NodeVector{loss},
NodeVector{delta});
ngraph::autodiff::Adjoints adjoints(OutputVector{loss},
OutputVector{delta});
auto W0_next = W0 + adjoints.backprop_node(W0);
auto b0_next = b0 + adjoints.backprop_node(b0);
auto W1_next = W1 + adjoints.backprop_node(W1);
......@@ -218,7 +218,7 @@ int main(int argc, const char* argv[])
NodeMap train_node_map;
auto train_function = clone_function(
Function(
NodeVector{loss, softmax, W0_next, b0_next, W1_next, b1_next},
OutputVector{loss, softmax, W0_next, b0_next, W1_next, b1_next},
ParameterVector{X, Y, N, learning_rate, W0, b0, W1, b1}),
train_node_map);
auto train_exec = backend->compile(train_function);
......@@ -227,7 +227,7 @@ int main(int argc, const char* argv[])
// X, W0, b0, W1, b1 -> softmax
NodeMap inference_node_map;
auto inference_function = clone_function(
Function(NodeVector{softmax}, ParameterVector{X, W0, b0, W1, b1}),
Function(OutputVector{softmax}, ParameterVector{X, W0, b0, W1, b1}),
inference_node_map);
auto inference_exe = backend->compile(inference_function);
......
......@@ -5,7 +5,7 @@ Build and Test
###############
* :ref:`default_ngflags`
* :ref:`ngraph_plaidml_backend`
.. :ref:`ngraph_plaidml_backend`
There are a few common paths to take when manually building the |project|
from source code. Today nGraph supports various developers working on all
......@@ -161,17 +161,17 @@ The process documented here will work on CentOS 7.4.
$ make && sudo make install
.. _ngraph_plaidml_backend:
.. .. _ngraph_plaidml_backend: hide this until announcement is official
Building nGraph-PlaidML from source
===================================
.. Building nGraph-PlaidML from source
.. ===================================
The following instructions will create the ``~/ngraph_plaidml_dist``
locally:
.. The following instructions will create the ``~/ngraph_plaidml_dist``
.. locally:
#. Ensure you have installed the :ref:`prerequisites` for your OS.
.. #. Ensure you have installed the :ref:`prerequisites` for your OS.
#. Install the prerequisites for the backend. Our hybrid ``NGRAPH_PLAIDML``
.. #. Install the prerequisites for the backend. Our hybrid ``NGRAPH_PLAIDML``
backend works best with Python3 versions. We recommend that you use a
virtual environment, due to some of the difficulties that users have
seen when trying to install outside of a venv.
......@@ -182,20 +182,20 @@ locally:
$ pip install plaidml
$ plaidml-setup
#. Clone the source code, create and enter your build directory:
.. #. Clone the source code, create and enter your build directory:
.. code-block:: console
$ git clone https://github.com/NervanaSystems/ngraph.git
$ cd ngraph && mkdir build && cd build
#. Prepare the CMake files as follows:
.. #. Prepare the CMake files as follows:
.. code-block:: console
$ cmake .. -DCMAKE_INSTALL_PREFIX=~/ngraph_plaidml_dist -DNGRAPH_CPU_ENABLE=OFF -DNGRAPH_PLAIDML_ENABLE=ON
#. Run :command:`make` and ``make install``. Note that if you are building
.. #. Run :command:`make` and ``make install``. Note that if you are building
outside a local or user path, you may need to run ``make install`` as the
root user.
......@@ -210,8 +210,8 @@ locally:
tests can be run when PlaidML devices are available at the machine
level.
For more about working with the PlaidML backend from nGraph, see our
API documentation :doc:`backends/plaidml-ng-api/index`.
.. For more about working with the PlaidML backend from nGraph, see our
.. API documentation :doc:`backends/plaidml-ng-api/index`.
macOS\* development
......
......@@ -99,8 +99,8 @@ Once the graph is built, we need to package it in a ``Function``:
:lines: 35-36
The first argument to the constructor specifies the nodes that the function will
return; in this case, the product. A ``NodeVector`` is a vector of shared
pointers of ``op::Node``. The second argument specifies the parameters of the
return; in this case, the product. An ``OutputVector`` is a vector of references to
outputs of ``op::Node``. The second argument specifies the parameters of the
function, in the order they are to be passed to the compiled function. A
``ParameterVector`` is a vector of shared pointers to ``op::Parameter``.
......
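For reference, a minimal sketch of the constructor call this paragraph describes, reusing the ``t1``, ``a``, ``b``, and ``c`` nodes from the tutorial snippets earlier in this diff:

    // Package the product node t1 as the function result, with a, b, c as parameters.
    auto f = std::make_shared<Function>(OutputVector{t1},
                                        ParameterVector{a, b, c});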
......@@ -62,14 +62,14 @@ more detail and describe how nGraph addresses them.
Problem 1: Kernel libraries do not support graph-level optimizations
--------------------------------------------------------------------
The example diagrams below shows how a deep learning framework, when integrated
The example diagrams below show how a deep learning framework, when integrated
with a kernel library, can optimally run each operation in a computational
graph, but the choice of operations in the graph may not be optimal.
.. _figure-A:
.. figure:: ../graphics/kernel-problem-1.png
:width: 555px
:width: 100%
:alt:
......@@ -95,7 +95,7 @@ diagram.
.. _figure-B:
.. figure:: ../graphics/kernel-problem-2.png
:width: 555px
:width: 100%
:alt:
Each framework must be manually integrated with each hardware-specific kernel
......@@ -130,7 +130,7 @@ work for what will ultimately be a fragile setup that is costly to maintain.
.. _figure-C:
.. figure:: ../graphics/kernel-problem-3.png
:width: 555px
:width: 100%
:alt:
......
......@@ -16,14 +16,19 @@ We are pleased to announce the release of version |version|-doc.
Core updates for |version|
~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ Better PlaidML support
+ More ONNX ops
+ Elementwise divide defaults to Python semantics
+ GenerateMask seed optional
+ Graph visualization improvements
+ Preserve control dependencies in more places
+ GetOutputElement has single input
Latest doc updates
~~~~~~~~~~~~~~~~~~
+ Add instructions how to build ``NGRAPH_PLAIDML`` backend.
.. Latest doc updates
.. ~~~~~~~~~~~~~~~~~~
.. + Add instructions how to build ``NGRAPH_PLAIDML`` backend.
.. important:: Pre-releases (``-rc-0.*``) have newer features, and are less stable.
......@@ -35,7 +40,6 @@ Changelog on Previous Releases
0.23
----
+ PlaidML support
+ More ONNX ops
+ Elementwise divide defaults to Python semantics
+ GenerateMask seed optional
......
:orphan:
.. toctree::
:caption: Sitemap
:maxdepth: 1
frameworks/index
python_api/index
inspection/index
core/overview
backends/index
project/index
.. toctree::
:includehidden:
frameworks/index
project/index
python_api/index
inspection/index
core/overview
backends/index
project/extras/index
......@@ -23,6 +23,8 @@
namespace py = pybind11;
static const char* CAPSULE_NAME = "ngraph_function";
void regclass_pyngraph_Function(py::module m)
{
py::class_<ngraph::Function, std::shared_ptr<ngraph::Function>> function(m, "Function");
......@@ -49,4 +51,41 @@ void regclass_pyngraph_Function(py::module m)
py::cast(self.get_output_shape(0)).attr("__str__")().cast<std::string>();
return "<" + class_name + ": '" + self.get_friendly_name() + "' (" + shape + ")>";
});
function.def_static("from_capsule", [](py::object* capsule) {
// get the underlying PyObject* which is a PyCapsule pointer
auto* pybind_capsule_ptr = capsule->ptr();
// extract the pointer stored in the PyCapsule under the name CAPSULE_NAME
auto* capsule_ptr = PyCapsule_GetPointer(pybind_capsule_ptr, CAPSULE_NAME);
auto* ngraph_function = static_cast<std::shared_ptr<ngraph::Function>*>(capsule_ptr);
if (ngraph_function)
{
return *ngraph_function;
}
else
{
throw std::runtime_error("The provided capsule does not contain an ngraph::Function");
}
});
function.def_static("to_capsule", [](std::shared_ptr<ngraph::Function>& ngraph_function) {
// create a shared pointer on the heap before putting it in the capsule
// this secures the lifetime of the object transferred by the capsule
auto* sp_copy = new std::shared_ptr<ngraph::Function>(ngraph_function);
// a destructor callback that will delete the heap allocated shared_ptr
// when the capsule is destructed
auto sp_deleter = [](PyObject* capsule) {
auto* capsule_ptr = PyCapsule_GetPointer(capsule, CAPSULE_NAME);
auto* function_sp = static_cast<std::shared_ptr<ngraph::Function>*>(capsule_ptr);
if (function_sp)
{
delete function_sp;
}
};
// put the shared_ptr in a new capsule under the same name as in "from_capsule"
auto pybind_capsule = py::capsule(sp_copy, CAPSULE_NAME, sp_deleter);
return pybind_capsule;
});
}
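As a hedged illustration of the producing side that ``from_capsule`` expects (not part of this change), another C/C++ extension would heap-allocate a shared_ptr and wrap it in a capsule named "ngraph_function":

    // Hypothetical sketch: my_function is assumed to be a std::shared_ptr<ngraph::Function>.
    auto* sp = new std::shared_ptr<ngraph::Function>(my_function);
    PyObject* capsule = PyCapsule_New(sp, "ngraph_function", /*destructor*/ nullptr);
    // The Python side can then call Function.from_capsule(capsule) to recover the Function.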
......@@ -98,25 +98,21 @@ namespace ngraph
void build_ng_dialect();
template <typename OP>
static mlir::Value* create_op(MLIRCompiler& compiler, const ngraph::Node* ng_node)
template <typename Op>
static mlir::Operation* create_op(MLIRCompiler& compiler,
const ngraph::Node* ng_node)
{
throw std::runtime_error("Unimplemented op '" + ng_node->description() +
"' in MLIR Compiler");
}
template <typename UnaryOp>
mlir::Value* create_unary_op(const ngraph::Node* ng_node);
template <typename BinOp>
mlir::Value* create_binary_op(const ngraph::Node* ng_node);
// TODO(amprocte): Can we have a create_variadic_op that is able to handle the
// attributes?
mlir::Value* create_concat(const ngraph::Node* ng_node);
// Generic op lowerer to ng dialect.
// Simply maps ngraph tensors to values and generate an OP. No op-specific logic.
template <typename Op>
mlir::Operation* create_generic_op(const ngraph::Node* ng_node);
template <typename RedOp>
mlir::Value* create_index_reduction(const ngraph::Node* ng_node);
mlir::Operation* create_index_reduction(const ngraph::Node* ng_node);
void create_return();
......@@ -150,7 +146,7 @@ namespace ngraph
using TensorToInfo = std::pair<descriptor::Tensor*, TensorInfo>;
using TensorToInfoMap = std::unordered_map<descriptor::Tensor*, TensorInfo>;
using MLIRCompOpFunction =
std::function<mlir::Value*(MLIRCompiler& compiler, const ngraph::Node*)>;
std::function<mlir::Operation*(MLIRCompiler& compiler, const ngraph::Node*)>;
using MLIRCompOpMap = std::unordered_map<std::type_index, MLIRCompOpFunction>;
// Maps tensor to the value it represents in the IR
......
......@@ -168,6 +168,39 @@ static mlir::LogicalResult verifyCmpOp(T* op)
return mlir::success();
}
template <>
mlir::LogicalResult verifyOp(NGGatherOp* op)
{
Type ty = op->params()->getType();
NGTensorType inputType = ty.cast<NGTensorType>();
ty = op->indices()->getType();
NGTensorType indicesType = ty.cast<NGTensorType>();
// ensure axis < params rank
if (op->axis().getSExtValue() >= inputType.getRank())
return op->emitOpError("Gather axis is larger than input rank");
ty = indicesType.getElementType();
// ensure indices are I32 or I64
if (!ty.isa<NGIntegerType>())
return op->emitOpError("Indices tensor is not of Integer type");
NGIntegerType indicesEltType = ty.cast<NGIntegerType>();
if (!indicesEltType.isInt32() && !indicesEltType.isInt64())
return op->emitOpError("Indices tensor is not of I32 or I64 type");
mlir::Type r0 = op->res()->getType();
NGTensorType resType = r0.cast<NGTensorType>();
// ensure result is compatible with input
if (resType.getRank() != inputType.getRank() + indicesType.getRank() - 1)
return op->emitOpError("Incompatible result shape and/or type");
return mlir::success();
}
namespace mlir
{
#define GET_OP_CLASSES
......
......@@ -186,8 +186,8 @@ def NGDotOp : NG_Binary_Op<"dot">
// class, but I'm not sure how to add concatenation_axis into the args if we
// do that.
def NGConcatOp :
NG_OneResult_Op<"concat", [NoSideEffect]>,
Arguments<(ins Variadic<NG_TensorType>:$args, I64Attr:$concatenation_axis)>
NG_OneResult_Op<"concat", [NoSideEffect]>,
Arguments<(ins Variadic<NG_TensorType>:$args, I64Attr:$concatenation_axis)>
{
let parser = [{ NGRAPH_CHECK(false, "No parser support"); return mlir::failure(); }];
......@@ -200,7 +200,7 @@ class NG_Axis_Reduction_Op<string mnemonic, list<OpTrait> traits = []> :
{
let summary = "Base class for reduction operations that perform a reduction "
"across the axes of a single tensor.";
let description = "Axes are represented as an array of I64 attributes.";
let description = [{Axes are represented as an array of I64 attributes.}];
let parser = [{ NGRAPH_CHECK(false, "No parser support"); return mlir::failure(); }];
......@@ -257,6 +257,24 @@ def NGAnyRedOp : NG_Axis_Reduction_Op<"any.red">
let verifier = [{ return verifyLogicalReductionOp(this); }];
}
// Gather
def NGGatherOp :
NG_OneResult_Op<"gather", [NoSideEffect]>,
Arguments<(ins NG_TensorType:$params, NG_TensorType:$indices, I64Attr:$axis)>
{
let summary = "Gather slices from params along the specified axis according to indices";
let description = [{
Gather slices from axis of params according to indices
params The tensor from which slices are gathered
indices Index tensor. Data type must be `element::i32` or `element::i64`
axis Axis in params to gather
}];
let parser = [{ NGRAPH_CHECK(false, "No parser support"); return mlir::failure(); }];
let verifier = [{ return verifyOp(this); }];
}
// Terminator Ops
def NGReturnOp : NG_Terminator_Op<"return">;
......
......@@ -199,6 +199,7 @@ namespace mlir
}
Shape getShape() const { return m_shape; }
int64_t getRank() const { return m_shape.size(); }
EltType getElementType() const { return m_eltType; }
private:
NGTensorTypeStorage(EltType eltType, Shape shape)
......
......@@ -646,6 +646,123 @@ namespace
return matchSuccess();
}
REWRITER(NGGatherOp)
{
auto gatherOp = cast<NGGatherOp>(op);
auto loc = gatherOp.getLoc();
ScopedContext scope(rewriter, loc);
// Get operands
Value* result = m_pass.buildOutputDefs(op, rewriter)[0];
NGRAPH_CHECK(result, "Unexpected null result in GatherOp");
auto resultTy = result->getType().cast<MemRefType>();
Value* params = operands[0];
Value* indices = operands[1];
auto axis = gatherOp.axis().getSExtValue();
// Create view to write into result.
MemRefView vRes(result), vParams(params), vIndices(indices);
// Indexed Values
IndexedValue iRes(result), iParams(params), iIndices(indices);
// Construct outer loop for params dims. Exclude the axis dim.
SmallVector<ValueHandle, 4> paramsLbs, paramsUbs;
SmallVector<IndexHandle, 4> paramsIVs;
SmallVector<int64_t, 4> paramsSteps;
SmallVector<ValueHandle*, 4> paramsIVPtrs;
for (auto i = 0; i < vParams.rank(); i++)
{
// skip gather axis
if (i == axis)
continue;
paramsLbs.push_back(IndexHandle(vParams.lb(i)));
paramsUbs.push_back(IndexHandle(vParams.ub(i)));
paramsSteps.push_back(vParams.step(i));
}
NGRAPH_CHECK(paramsLbs.size() == vParams.rank() - 1 &&
paramsUbs.size() == paramsLbs.size() &&
paramsSteps.size() == paramsLbs.size(),
"Incorrect loop nest bounds size for gather params");
paramsIVs = IndexHandle::makeIndexHandles(vParams.rank() - 1);
paramsIVPtrs = IndexHandle::makeIndexHandlePointers(paramsIVs);
auto indicesLbs = vIndices.getLbs();
auto indicesUbs = vIndices.getUbs();
auto indicesSteps = vIndices.getSteps();
auto indicesIVs = IndexHandle::makeIndexHandles(vIndices.rank());
auto indicesIVPtrs = IndexHandle::makeIndexHandlePointers(indicesIVs);
SmallVector<IndexHandle, 8> paramsIndices, resIndices;
// Make sure we are going to create loops
NGRAPH_CHECK(vParams.rank() > 0, "Invalid size for indices steps");
// Let params rank : N
// Let indices rank : M
// Let axis be A
// Generate
// params loops
// for P_0: 0 -> params.dim[0]
// for P_1: 0 -> params.dim[1]
// for P_2: 0 -> params.dim[2]
// ...
// for P_(A-1):0 -> params.dim[A-1]
// for P_(A+1):0 -> params.dim[A+1]
// ...
// for P_(N-1):0 -> params.dim[N-1]
// indices loops
// for I_0:0 -> indices.dim[0]
// ...
// for I_(M-1):0 -> indices.dim[M-1]
// res[P_0, P_1, .. P_(A-1), I_0, .., I_(M-1), P_(A+1), ... P_(N-1)] =
// params[P_0, P_1, .. P_(A-1), indices[I_0, .., I_(M-1)], P_(A+1), ... P_(N-1)];
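// Illustrative example (values not from this diff): with params of shape [3, 4, 5],
// indices of shape [2], and axis = 1, the result has shape [3, 2, 5] and
// res[p0, i0, p2] = params[p0, indices[i0], p2].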
LoopNestBuilder(paramsIVPtrs, paramsLbs, paramsUbs, paramsSteps)([&] {
LoopNestBuilder(indicesIVPtrs, indicesLbs, indicesUbs, indicesSteps)([&] {
// Load axis value from indices array and cast it to Index Type
ValueHandle axisIdx = ValueHandle::create<IndexCastOp>(
(ValueHandle)iIndices(indicesIVs), rewriter.getIndexType());
// construct indices for param
// [P_0, P_1, .. P_axis-1, Indices[I0, I1, .. I_k-1], P_axis+1, P_axis+2, .. P_n-1]
for (auto i = 0, j = 0; i < vParams.rank(); i++)
{
if (i == axis)
{
paramsIndices.push_back(IndexHandle(axisIdx));
}
else
{
paramsIndices.push_back(paramsIVs[j++]);
}
}
// construct indices for result
// [P_0, P_1, .. P_axis-1, I0, I1, .. I_k-1, P_axis+1, P_axis+2, .. P_n-1]
for (auto i = 0, j = 0; i < vParams.rank() + vIndices.rank() - 1;)
{
if (i == axis && indicesIVs.size() > 0)
{
resIndices.append(indicesIVs.begin(), indicesIVs.end());
i += indicesIVs.size();
}
else
{
resIndices.push_back(paramsIVs[j++]);
i++;
}
}
// Store into result
iRes(resIndices) = iParams(paramsIndices);
});
});
rewriter.replaceOp(op, {result});
return matchSuccess();
}
REWRITER(NGReturnOp)
{
rewriter.replaceOpWithNewOp<ReturnOp>(op);
......@@ -653,7 +770,7 @@ namespace
}
#undef REWRITER
/// End of pattern matchers
template <typename OP>
void lower_binary_elementwise(Operation* op,
ArrayRef<Value*> operands,
......
......@@ -29,6 +29,7 @@ MLIR_OP(NGArgMinRedOp)
MLIR_OP(NGConcatOp)
MLIR_OP(NGDivOp)
MLIR_OP(NGDotOp)
MLIR_OP(NGGatherOp)
MLIR_OP(NGGreaterOp)
MLIR_OP(NGLessOp)
MLIR_OP(NGMulOp)
......
......@@ -9,6 +9,7 @@ MLIR_OP(ArgMax)
MLIR_OP(Divide)
MLIR_OP(Dot)
MLIR_OP(Concat)
MLIR_OP(Gather)
MLIR_OP(Greater)
MLIR_OP(Less)
MLIR_OP(Maximum)
......
......@@ -25,6 +25,7 @@
#include "ngraph/op/divide.hpp"
#include "ngraph/op/dot.hpp"
#include "ngraph/op/experimental/compiled_kernel.hpp"
#include "ngraph/op/gather.hpp"
#include "ngraph/op/get_output_element.hpp"
#include "ngraph/op/greater.hpp"
#include "ngraph/op/less.hpp"
......
......@@ -463,6 +463,8 @@ set (SRC
runtime/backend.hpp
runtime/backend_manager.cpp
runtime/backend_manager.hpp
runtime/chrome_trace.cpp
runtime/chrome_trace.hpp
runtime/executable.cpp
runtime/executable.hpp
runtime/host_tensor.cpp
......
......@@ -51,11 +51,6 @@ OutputVector make_zeros(std::shared_ptr<Node> x)
return zeros;
}
autodiff::Adjoints::Adjoints(const NodeVector& ys, const NodeVector& cs)
: Adjoints(OutputVector(ys.begin(), ys.end()), OutputVector(cs.begin(), cs.end()))
{
}
autodiff::Adjoints::Adjoints(const OutputVector& ys, const OutputVector& cs)
{
if (ys.size() != cs.size())
......
......@@ -46,8 +46,6 @@ namespace ngraph
/// \param c An expression for where to evaluate the derivatives
Adjoints(const OutputVector& y, const OutputVector& c);
Adjoints(const NodeVector& y, const NodeVector& c);
Adjoints(const Adjoints& adjoints) = default;
Adjoints& operator=(const Adjoints& adjoints) = default;
Adjoints() = default;
......
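For context, a minimal sketch of how the OutputVector-based constructor is used, mirroring the MNIST training example earlier in this diff (``loss``, ``delta``, and ``W0`` are assumed to be in scope):

    ngraph::autodiff::Adjoints adjoints(OutputVector{loss}, OutputVector{delta});
    auto grad_W0 = adjoints.backprop_node(W0);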
......@@ -41,6 +41,30 @@ Function::Function(const ResultVector& results,
init();
}
Function::Function(const OutputVector& results,
const ParameterVector& parameters,
const std::string& name)
: m_results(results.size())
, m_parameters(parameters)
, m_temporary_pool_size(0)
, m_instance_id(m_next_instance_id.fetch_add(1))
, m_name(name)
, m_unique_name("Function_" + to_string(m_instance_id))
{
if (std::any_of(results.cbegin(), results.cend(), [](Output<Node> n) {
return std::dynamic_pointer_cast<op::Result>(n.get_node_shared_ptr());
}))
{
throw ngraph_error(
" Results already contain op::Results. Use a c-tor that takes a ResultVector");
}
std::transform(results.begin(), results.end(), m_results.begin(), [](Output<Node> n) {
return std::make_shared<op::Result>(n);
});
init();
}
Function::Function(const NodeVector& results,
const ParameterVector& parameters,
const std::string& name)
......@@ -208,6 +232,11 @@ shared_ptr<Node> Function::get_output_op(size_t i) const
return m_results.at(i);
}
Output<Node> Function::output(size_t i) const
{
return m_results.at(i);
}
shared_ptr<Node> Function::get_result() const
{
if (m_results.size() != 1)
......
......@@ -37,6 +37,10 @@ namespace ngraph
const ParameterVector& parameters,
const std::string& name = "");
Function(const OutputVector& results,
const ParameterVector& parameters,
const std::string& name = "");
Function(const std::shared_ptr<Node>& result,
const ParameterVector& parameters,
const std::string& name = "");
......@@ -55,6 +59,8 @@ namespace ngraph
/// Return the op that generates output i
std::shared_ptr<Node> get_output_op(size_t i) const;
Output<Node> output(size_t i) const;
/// Return the element type of output i
const element::Type& get_output_element_type(size_t i) const;
......
//*****************************************************************************
// Copyright 2019 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//*****************************************************************************
#include <iostream>
#include <map>
#include <sstream>
#include <string>
#include "chrome_trace.hpp"
#include "ngraph/log.hpp"
using namespace std;
using namespace ngraph;
static bool read_tracing_env_var()
{
static const bool is_enabled = (getenv("NGRAPH_ENABLE_TRACING") != nullptr);
return is_enabled;
}
mutex runtime::event::Manager::s_file_mutex;
bool runtime::event::Manager::s_tracing_enabled = read_tracing_env_var();
runtime::event::Duration::Duration(const string& name, const string& category, const string& args)
{
if (Manager::is_tracing_enabled())
{
m_start = Manager::get_current_microseconds();
m_stop = 0;
m_name = name;
m_category = category;
m_args = args;
}
}
void runtime::event::Duration::stop()
{
if (Manager::is_tracing_enabled())
{
m_stop = Manager::get_current_microseconds();
}
}
void runtime::event::Duration::write()
{
if (Manager::is_tracing_enabled())
{
size_t stop_time = (m_stop != 0 ? m_stop : Manager::get_current_microseconds());
lock_guard<mutex> lock(Manager::get_mutex());
ofstream& out = runtime::event::Manager::get_output_stream();
if (out.is_open() == false)
{
runtime::event::Manager::open();
}
else
{
Manager::get_output_stream() << ",\n";
}
Manager::get_output_stream() <<
R"({"name":")" << m_name << R"(","cat":")" << m_category << R"(","ph":"X","pid":)"
<< Manager::get_process_id() << R"(,"tid":)"
<< Manager::get_thread_id() <<
R"(,"ts":)" << m_start << R"(,"dur":)" << (stop_time - m_start);
if (!m_args.empty())
{
out <<
R"(,"args":)" << m_args;
}
out << "}";
}
}
runtime::event::Object::Object(const string& name, const string& args)
: m_name{name}
, m_id{static_cast<size_t>(chrono::high_resolution_clock::now().time_since_epoch().count())}
{
if (Manager::is_tracing_enabled())
{
lock_guard<mutex> lock(Manager::get_mutex());
ofstream& out = runtime::event::Manager::get_output_stream();
if (out.is_open() == false)
{
runtime::event::Manager::open();
}
else
{
Manager::get_output_stream() << ",\n";
}
out << R"({"name":")" << m_name << R"(","ph":"N","id":")" << m_id <<
R"(","ts":)" << Manager::get_current_microseconds() <<
R"(,"pid":)" << Manager::get_process_id() << R"(,"tid":)" << Manager::get_thread_id();
if (!args.empty())
{
out <<
R"(,"args":)" << args;
}
out << "}";
write_snapshot(out, args);
}
}
void runtime::event::Object::snapshot(const string& args)
{
if (Manager::is_tracing_enabled())
{
lock_guard<mutex> lock(Manager::get_mutex());
ofstream& out = runtime::event::Manager::get_output_stream();
if (out.is_open() == false)
{
runtime::event::Manager::open();
}
else
{
Manager::get_output_stream() << ",\n";
}
write_snapshot(out, args);
}
}
void runtime::event::Object::write_snapshot(ostream& out, const string& args)
{
out << R"({"name":")" << m_name << R"(","ph":"O","id":")" << m_id <<
R"(","ts":)" << Manager::get_current_microseconds() <<
R"(,"pid":)" << Manager::get_process_id() << R"(,"tid":)" << Manager::get_thread_id();
if (!args.empty())
{
out <<
R"(,"args":)" << args;
}
out << "}";
}
void runtime::event::Object::destroy()
{
if (Manager::is_tracing_enabled())
{
lock_guard<mutex> lock(Manager::get_mutex());
ofstream& out = runtime::event::Manager::get_output_stream();
if (out.is_open() == false)
{
runtime::event::Manager::open();
}
else
{
Manager::get_output_stream() << ",\n";
}
out << R"({"name":")" << m_name << R"(","ph":"D","id":")" << m_id <<
R"(","ts":)" << Manager::get_current_microseconds() <<
R"(,"pid":)" << Manager::get_process_id() << R"(,"tid":)" << Manager::get_thread_id()
<< "}";
}
}
void runtime::event::Manager::open(const string& path)
{
ofstream& out = get_output_stream();
if (out.is_open() == false)
{
out.open(path, ios_base::trunc);
out << "[\n";
}
}
void runtime::event::Manager::close()
{
ofstream& out = get_output_stream();
if (out.is_open())
{
out << "\n]\n";
out.close();
}
}
ofstream& runtime::event::Manager::get_output_stream()
{
static ofstream s_event_log;
return s_event_log;
}
const string& runtime::event::Manager::get_process_id()
{
static const string s_pid = to_string(getpid());
return s_pid;
}
void runtime::event::Manager::enable_event_tracing()
{
s_tracing_enabled = true;
}
void runtime::event::Manager::disable_event_tracing()
{
s_tracing_enabled = false;
}
bool runtime::event::Manager::is_event_tracing_enabled()
{
return s_tracing_enabled;
}
string runtime::event::Manager::get_thread_id()
{
thread::id tid = this_thread::get_id();
static map<thread::id, string> tid_map;
auto it = tid_map.find(tid);
string rc;
if (it == tid_map.end())
{
stringstream ss;
ss << "\"" << tid << "\"";
rc = ss.str();
tid_map.insert({tid, rc});
}
else
{
rc = it->second;
}
return rc;
}
//*****************************************************************************
// Copyright 2019 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//*****************************************************************************
#pragma once
#include <chrono>
#include <fstream>
#include <iostream>
#include <mutex>
#include <string>
#include <thread>
#ifdef _WIN32
#include <windows.h>
// windows.h must be before processthreadsapi.h so we need this comment
#include <processthreadsapi.h>
#define getpid() GetCurrentProcessId()
#else
#include <unistd.h>
#endif
namespace ngraph
{
namespace runtime
{
namespace event
{
class Duration;
class Object;
class Manager;
}
}
}
//
// This class records timestamps for a given user defined event and
// produces output in the chrome tracing format that can be used to view
// the events of a running program
//
// Following is the format of a trace event
//
// {
// "name": "myName",
// "cat": "category,list",
// "ph": "B",
// "ts": 12345,
// "pid": 123,
// "tid": 456,
// "args": {
// "someArg": 1,
// "anotherArg": {
// "value": "my value"
// }
// }
// }
//
// The trace file format is defined here:
// https://docs.google.com/document/d/1CvAClvFfyA5R-PhYUmn5OOQtYMH4h6I0nSsKchNAySU/preview
//
// The trace file can be viewed by Chrome browser using the
// URL: chrome://tracing/
//
// More information about this is at:
// http://dev.chromium.org/developers/how-tos/trace-event-profiling-tool
class ngraph::runtime::event::Manager
{
friend class Duration;
friend class Object;
public:
static void open(const std::string& path = "runtime_event_trace.json");
static void close();
static bool is_tracing_enabled() { return s_tracing_enabled; }
static void enable_event_tracing();
static void disable_event_tracing();
static bool is_event_tracing_enabled();
private:
static std::ofstream& get_output_stream();
static const std::string& get_process_id();
static size_t get_current_microseconds()
{
return std::chrono::high_resolution_clock::now().time_since_epoch().count() / 1000;
}
static std::string get_thread_id();
static std::mutex& get_mutex() { return s_file_mutex; }
static std::ostream s_ostream;
static std::mutex s_file_mutex;
static bool s_tracing_enabled;
};
class ngraph::runtime::event::Duration
{
public:
explicit Duration(const std::string& name,
const std::string& category,
const std::string& args = "");
~Duration() { write(); }
/// \brief stop the timer without writing the data to the log file. To write the data
/// call the `write` method
/// Calls to stop() are optional
void stop();
/// \brief write the log data to the log file for this event
/// This function has an implicit stop() if stop() has not been previously called
void write();
Duration(const Duration&) = delete;
Duration& operator=(Duration const&) = delete;
private:
std::string to_json() const;
size_t m_start;
size_t m_stop;
std::string m_name;
std::string m_category;
std::string m_args;
};
class ngraph::runtime::event::Object
{
public:
Object(const std::string& name, const std::string& args);
void snapshot(const std::string& args);
void destroy();
private:
void write_snapshot(std::ostream& out, const std::string& args);
const std::string m_name;
size_t m_id;
};
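A minimal usage sketch for the tracing classes above, matching the instrumentation pattern added to HostTensor and the interpreter later in this diff. The ``NGRAPH_ENABLE_TRACING`` environment variable must be set for events to be recorded, and "example" is a placeholder category:

    void my_kernel()
    {
        // Writes a complete ("ph":"X") event named "my_kernel" to runtime_event_trace.json
        // when d goes out of scope; view the file with chrome://tracing.
        runtime::event::Duration d("my_kernel", "example");
        // ... work being timed ...
    }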
......@@ -18,6 +18,7 @@
#include <memory>
#include "ngraph/descriptor/layout/dense_tensor_layout.hpp"
#include "ngraph/runtime/chrome_trace.hpp"
#include "ngraph/runtime/host_tensor.hpp"
#include "ngraph/util.hpp"
......@@ -96,6 +97,8 @@ const char* runtime::HostTensor::get_data_ptr() const
void runtime::HostTensor::write(const void* source, size_t n)
{
runtime::event::Duration d1("write", "HostTensor");
if (n > m_buffer_size)
{
throw out_of_range("write access past end of tensor");
......@@ -106,6 +109,7 @@ void runtime::HostTensor::write(const void* source, size_t n)
void runtime::HostTensor::read(void* target, size_t n) const
{
runtime::event::Duration d1("read", "HostTensor");
if (n > m_buffer_size)
{
throw out_of_range("read access past end of tensor");
......
......@@ -30,6 +30,7 @@
#include "ngraph/pass/manager.hpp"
#include "ngraph/pass/memory_layout.hpp"
#include "ngraph/runtime/backend_manager.hpp"
#include "ngraph/runtime/chrome_trace.hpp"
#include "ngraph/serializer.hpp"
#include "ngraph/util.hpp"
......@@ -74,6 +75,8 @@ runtime::interpreter::INTExecutable::INTExecutable(const std::string& model_stri
bool runtime::interpreter::INTExecutable::call(const vector<shared_ptr<runtime::Tensor>>& outputs,
const vector<shared_ptr<runtime::Tensor>>& inputs)
{
runtime::event::Duration d1("call", "Interpreter");
// convert inputs to HostTensor
vector<shared_ptr<HostTensor>> func_inputs;
for (auto tensor : inputs)
......@@ -122,6 +125,7 @@ bool runtime::interpreter::INTExecutable::call(const vector<shared_ptr<runtime::
for (const NodeWrapper& wrapped : m_wrapped_nodes)
{
auto op = wrapped.get_node();
runtime::event::Duration d2(op->description(), "Interpreter");
auto type_id = wrapped.get_typeid();
if (type_id == OP_TYPEID::Parameter)
{
......@@ -304,3 +308,74 @@ void runtime::interpreter::INTExecutable::save(ostream& out)
string model = serialize(m_function, 0);
writer.write("model", model.data(), model.size());
}
shared_ptr<ngraph::op::Parameter>
runtime::interpreter::INTExecutable::get_parameter(size_t index) const
{
const ParameterVector& parameters = get_parameters();
NGRAPH_CHECK(index < parameters.size(), "create_tensor for input out of bounds");
return parameters[index];
}
shared_ptr<ngraph::op::Result> runtime::interpreter::INTExecutable::get_result(size_t index) const
{
const ResultVector& results = get_results();
NGRAPH_CHECK(index < results.size(), "create_tensor for output out of bounds");
return results[index];
}
shared_ptr<runtime::Tensor>
runtime::interpreter::INTExecutable::create_input_tensor(size_t input_index)
{
shared_ptr<op::Parameter> parameter = get_parameter(input_index);
return make_shared<runtime::HostTensor>(parameter->get_element_type(), parameter->get_shape());
}
shared_ptr<runtime::Tensor>
runtime::interpreter::INTExecutable::create_output_tensor(size_t output_index)
{
shared_ptr<op::Result> result = get_result(output_index);
return make_shared<runtime::HostTensor>(result->get_element_type(), result->get_shape());
}
vector<shared_ptr<runtime::Tensor>>
runtime::interpreter::INTExecutable::create_input_tensor(size_t input_index,
size_t pipeline_depth)
{
vector<shared_ptr<runtime::HostTensor>> tensors;
shared_ptr<op::Parameter> parameter = get_parameter(input_index);
for (size_t i = 0; i < pipeline_depth; i++)
{
shared_ptr<runtime::HostTensor> tensor;
auto t =
make_shared<runtime::HostTensor>(parameter->get_element_type(), parameter->get_shape());
tensor = static_pointer_cast<runtime::HostTensor>(t);
tensors.push_back(tensor);
}
vector<shared_ptr<runtime::Tensor>> result_tensors;
for (const shared_ptr<runtime::HostTensor>& tensor : tensors)
{
result_tensors.push_back(tensor);
}
return result_tensors;
}
vector<shared_ptr<runtime::Tensor>>
runtime::interpreter::INTExecutable::create_output_tensor(size_t output_index,
size_t pipeline_depth)
{
vector<shared_ptr<runtime::HostTensor>> tensors;
shared_ptr<op::Result> result = get_result(output_index);
for (size_t i = 0; i < pipeline_depth; i++)
{
shared_ptr<runtime::HostTensor> tensor;
auto t = make_shared<runtime::HostTensor>(result->get_element_type(), result->get_shape());
tensor = static_pointer_cast<runtime::HostTensor>(t);
tensors.push_back(tensor);
}
vector<shared_ptr<runtime::Tensor>> result_tensors;
for (const shared_ptr<runtime::HostTensor>& tensor : tensors)
{
result_tensors.push_back(tensor);
}
return result_tensors;
}
......@@ -186,9 +186,21 @@ public:
std::vector<PerformanceCounter> get_performance_data() const override;
std::shared_ptr<runtime::Tensor> create_input_tensor(size_t input_index) override;
std::shared_ptr<runtime::Tensor> create_output_tensor(size_t output_index) override;
std::vector<std::shared_ptr<runtime::Tensor>>
create_input_tensor(size_t input_index, size_t pipeline_depth) override;
std::vector<std::shared_ptr<runtime::Tensor>>
create_output_tensor(size_t output_index, size_t pipeline_depth) override;
private:
INTExecutable(const std::string& model_string);
std::shared_ptr<ngraph::op::Parameter> get_parameter(size_t index) const;
std::shared_ptr<ngraph::op::Result> get_result(size_t index) const;
int get_alignment() const { return 64; }
bool m_is_compiled = false;
bool m_nan_check_enabled = false;
......
......@@ -17,6 +17,8 @@
set (SRC
nbench.cpp
benchmark.cpp
benchmark_pipelined.cpp
benchmark_utils.cpp
)
add_executable(nbench ${SRC})
......
......@@ -14,12 +14,8 @@
// limitations under the License.
//*****************************************************************************
#include <random>
#if defined(__x86_64__) || defined(__amd64__)
#include <xmmintrin.h>
#endif
#include "benchmark.hpp"
#include "benchmark_utils.hpp"
#include "ngraph/file_util.hpp"
#include "ngraph/runtime/backend.hpp"
#include "ngraph/runtime/host_tensor.hpp"
......@@ -30,106 +26,6 @@
using namespace std;
using namespace ngraph;
static default_random_engine s_random_engine;
void set_denormals_flush_to_zero()
{
#if defined(__x86_64__) || defined(__amd64__)
// Avoids perf impact from denormals while benchmarking with random data
_MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);
_MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON);
#endif
}
template <typename T>
void init_int_tv(shared_ptr<runtime::Tensor> tv, T min, T max)
{
size_t size = tv->get_element_count();
uniform_int_distribution<T> dist(min, max);
vector<T> vec(size);
for (T& element : vec)
{
element = dist(s_random_engine);
}
tv->write(vec.data(), vec.size() * sizeof(T));
}
template <>
void init_int_tv<char>(shared_ptr<runtime::Tensor> tv, char min, char max)
{
size_t size = tv->get_element_count();
uniform_int_distribution<int16_t> dist(static_cast<short>(min), static_cast<short>(max));
vector<char> vec(size);
for (char& element : vec)
{
element = static_cast<char>(dist(s_random_engine));
}
tv->write(vec.data(), vec.size() * sizeof(char));
}
template <>
void init_int_tv<int8_t>(shared_ptr<runtime::Tensor> tv, int8_t min, int8_t max)
{
size_t size = tv->get_element_count();
uniform_int_distribution<int16_t> dist(static_cast<short>(min), static_cast<short>(max));
vector<int8_t> vec(size);
for (int8_t& element : vec)
{
element = static_cast<int8_t>(dist(s_random_engine));
}
tv->write(vec.data(), vec.size() * sizeof(int8_t));
}
template <>
void init_int_tv<uint8_t>(shared_ptr<runtime::Tensor> tv, uint8_t min, uint8_t max)
{
size_t size = tv->get_element_count();
uniform_int_distribution<int16_t> dist(static_cast<short>(min), static_cast<short>(max));
vector<uint8_t> vec(size);
for (uint8_t& element : vec)
{
element = static_cast<uint8_t>(dist(s_random_engine));
}
tv->write(vec.data(), vec.size() * sizeof(uint8_t));
}
template <typename T>
void init_real_tv(shared_ptr<runtime::Tensor> tv, T min, T max)
{
size_t size = tv->get_element_count();
uniform_real_distribution<T> dist(min, max);
vector<T> vec(size);
for (T& element : vec)
{
element = dist(s_random_engine);
}
tv->write(vec.data(), vec.size() * sizeof(T));
}
static void random_init(shared_ptr<runtime::Tensor> tv)
{
element::Type et = tv->get_element_type();
switch (et.get_type_enum())
{
case element::Type_t::boolean: init_int_tv<char>(tv, 0, 1); break;
case element::Type_t::f32: init_real_tv<float>(tv, -1, 1); break;
case element::Type_t::f64: init_real_tv<double>(tv, -1, 1); break;
case element::Type_t::i8: init_int_tv<int8_t>(tv, -1, 1); break;
case element::Type_t::i16: init_int_tv<int16_t>(tv, -1, 1); break;
case element::Type_t::i32: init_int_tv<int32_t>(tv, 0, 1); break;
case element::Type_t::i64: init_int_tv<int64_t>(tv, 0, 1); break;
case element::Type_t::u8: init_int_tv<uint8_t>(tv, 0, 1); break;
case element::Type_t::u16: init_int_tv<uint16_t>(tv, 0, 1); break;
case element::Type_t::u32: init_int_tv<uint32_t>(tv, 0, 1); break;
case element::Type_t::u64: init_int_tv<uint64_t>(tv, 0, 1); break;
case element::Type_t::undefined:
case element::Type_t::dynamic:
case element::Type_t::bf16:
case element::Type_t::f16:
default: throw runtime_error("unsupported type");
}
}
vector<runtime::PerformanceCounter> run_benchmark(shared_ptr<Function> f,
const string& backend_name,
size_t iterations,
......@@ -140,7 +36,7 @@ vector<runtime::PerformanceCounter> run_benchmark(shared_ptr<Function> f,
stopwatch timer;
timer.start();
auto backend = runtime::Backend::create(backend_name);
auto compiled_func = backend->compile(f, timing_detail);
auto exec = backend->compile(f, timing_detail);
timer.stop();
cout.imbue(locale(""));
cout << "compile time: " << timer.get_milliseconds() << "ms" << endl;
......@@ -201,7 +97,7 @@ vector<runtime::PerformanceCounter> run_benchmark(shared_ptr<Function> f,
}
}
}
compiled_func->call(results, args);
exec->call(results, args);
if (copy_data)
{
for (size_t result_index = 0; result_index < results.size(); result_index++)
......@@ -217,6 +113,6 @@ vector<runtime::PerformanceCounter> run_benchmark(shared_ptr<Function> f,
float time = t1.get_milliseconds();
cout << time / iterations << "ms per iteration" << endl;
vector<runtime::PerformanceCounter> perf_data = compiled_func->get_performance_data();
vector<runtime::PerformanceCounter> perf_data = exec->get_performance_data();
return perf_data;
}
......@@ -24,10 +24,6 @@
#include "ngraph/function.hpp"
#include "ngraph/runtime/performance_counter.hpp"
/// performance test utilities
std::multimap<size_t, std::string>
aggregate_timing(const std::vector<ngraph::runtime::PerformanceCounter>& perf_data);
std::vector<ngraph::runtime::PerformanceCounter> run_benchmark(std::shared_ptr<ngraph::Function> f,
const std::string& backend_name,
size_t iterations,
......
//*****************************************************************************
// Copyright 2017-2019 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//*****************************************************************************
#include <array>
#include <condition_variable>
#include <mutex>
#include <thread>
#include "benchmark.hpp"
#include "benchmark_utils.hpp"
#include "ngraph/file_util.hpp"
#include "ngraph/runtime/backend.hpp"
#include "ngraph/runtime/host_tensor.hpp"
#include "ngraph/runtime/tensor.hpp"
#include "ngraph/serializer.hpp"
#include "ngraph/util.hpp"
using namespace std;
using namespace ngraph;
class TensorCollection
{
public:
vector<shared_ptr<runtime::HostTensor>> parameter_data;
vector<shared_ptr<runtime::HostTensor>> result_data;
vector<shared_ptr<runtime::Tensor>> input_tensors;
vector<shared_ptr<runtime::Tensor>> output_tensors;
private:
};
static mutex s_mutex;
static condition_variable s_condition;
static size_t current_iteration = 0;
static size_t s_iterations;
static size_t s_warmup_iterations;
static stopwatch s_timer;
static void
thread_entry(runtime::Executable* exec, const TensorCollection& tensors, size_t pipeline_stage)
{
bool data_written = false;
const vector<shared_ptr<runtime::Tensor>>& args = tensors.input_tensors;
const vector<shared_ptr<runtime::Tensor>>& results = tensors.output_tensors;
while (current_iteration < s_iterations + s_warmup_iterations)
{
if (!data_written)
{
for (size_t arg_index = 0; arg_index < args.size(); arg_index++)
{
const shared_ptr<runtime::Tensor>& arg = args[arg_index];
if (arg->get_stale())
{
const shared_ptr<runtime::HostTensor>& data = tensors.parameter_data[arg_index];
arg->write(data->get_data_ptr(),
data->get_element_count() * data->get_element_type().size());
}
}
data_written = true;
}
unique_lock<mutex> lock(s_mutex);
if ((current_iteration & 1) != pipeline_stage)
{
s_condition.wait(lock);
}
else
{
if (current_iteration == s_warmup_iterations)
{
s_timer.start();
}
// our turn to run
exec->call(results, args);
current_iteration++;
data_written = false;
s_condition.notify_all();
lock.unlock();
for (size_t result_index = 0; result_index < results.size(); result_index++)
{
const shared_ptr<runtime::HostTensor>& data = tensors.result_data[result_index];
const shared_ptr<runtime::Tensor>& result = results[result_index];
result->read(data->get_data_ptr(),
data->get_element_count() * data->get_element_type().size());
}
}
}
}
vector<runtime::PerformanceCounter> run_benchmark_pipelined(shared_ptr<Function> f,
const string& backend_name,
size_t iterations,
bool timing_detail,
int warmup_iterations,
bool copy_data)
{
constexpr size_t pipeline_depth = 2;
s_iterations = iterations;
s_warmup_iterations = warmup_iterations;
array<TensorCollection, pipeline_depth> tensor_collections;
stopwatch timer;
timer.start();
auto backend = runtime::Backend::create(backend_name);
auto exec = backend->compile(f, timing_detail);
timer.stop();
cout.imbue(locale(""));
cout << "compile time: " << timer.get_milliseconds() << "ms" << endl;
set_denormals_flush_to_zero();
// Create random input data for all input tensors
for (size_t i = 0; i < pipeline_depth; i++)
{
for (shared_ptr<op::Parameter> param : f->get_parameters())
{
auto tensor_data =
make_shared<runtime::HostTensor>(param->get_element_type(), param->get_shape());
random_init(tensor_data);
tensor_collections[i].parameter_data.push_back(tensor_data);
}
}
// Create output tensors for all outputs
for (size_t i = 0; i < pipeline_depth; i++)
{
for (shared_ptr<Node> result : f->get_results())
{
auto tensor_data =
make_shared<runtime::HostTensor>(result->get_element_type(), result->get_shape());
tensor_collections[i].result_data.push_back(tensor_data);
}
}
// Create input tensors for all Parameters
array<vector<shared_ptr<runtime::Tensor>>, pipeline_depth> input_tensors_array;
size_t input_index = 0;
for (shared_ptr<op::Parameter> param : f->get_parameters())
{
auto input_tensors = exec->create_input_tensor(input_index++, pipeline_depth);
for (size_t i = 0; i < pipeline_depth; i++)
{
tensor_collections[i].input_tensors.push_back(input_tensors[i]);
}
}
// Create output tensors for all Results
array<vector<shared_ptr<runtime::Tensor>>, pipeline_depth> output_tensors_array;
size_t output_index = 0;
for (shared_ptr<Node> result : f->get_results())
{
auto output_tensors = exec->create_output_tensor(output_index++, pipeline_depth);
for (size_t i = 0; i < pipeline_depth; i++)
{
tensor_collections[i].output_tensors.push_back(output_tensors[i]);
}
}
thread threads[pipeline_depth];
for (size_t i = 0; i < pipeline_depth; i++)
{
threads[i] = thread(thread_entry, exec.get(), tensor_collections[i], i);
}
for (size_t i = 0; i < pipeline_depth; i++)
{
threads[i].join();
}
s_timer.stop();
float time = s_timer.get_milliseconds();
cout << time / iterations << "ms per iteration" << endl;
vector<runtime::PerformanceCounter> perf_data = exec->get_performance_data();
return perf_data;
}
//*****************************************************************************
// Copyright 2017-2019 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//*****************************************************************************
#pragma once
#include <map>
#include <memory>
#include <string>
#include <vector>
#include "ngraph/function.hpp"
#include "ngraph/runtime/performance_counter.hpp"
std::vector<ngraph::runtime::PerformanceCounter>
run_benchmark_pipelined(std::shared_ptr<ngraph::Function> f,
const std::string& backend_name,
size_t iterations,
bool timing_detail,
int warmup_iterations,
bool copy_data);
//*****************************************************************************
// Copyright 2017-2019 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//*****************************************************************************
#if defined(__x86_64__) || defined(__amd64__)
#include <xmmintrin.h>
#endif
#include "benchmark_utils.hpp"
#include "ngraph/file_util.hpp"
#include "ngraph/runtime/backend.hpp"
#include "ngraph/runtime/host_tensor.hpp"
#include "ngraph/runtime/tensor.hpp"
#include "ngraph/serializer.hpp"
#include "ngraph/util.hpp"
using namespace std;
using namespace ngraph;
template <>
void init_int_tensor<char>(shared_ptr<runtime::Tensor> tensor, char min, char max)
{
size_t size = tensor->get_element_count();
uniform_int_distribution<int16_t> dist(static_cast<short>(min), static_cast<short>(max));
vector<char> vec(size);
for (char& element : vec)
{
element = static_cast<char>(dist(get_random_engine()));
}
tensor->write(vec.data(), vec.size() * sizeof(char));
}
template <>
void init_int_tensor<int8_t>(shared_ptr<runtime::Tensor> tensor, int8_t min, int8_t max)
{
size_t size = tensor->get_element_count();
uniform_int_distribution<int16_t> dist(static_cast<short>(min), static_cast<short>(max));
vector<int8_t> vec(size);
for (int8_t& element : vec)
{
element = static_cast<int8_t>(dist(get_random_engine()));
}
tensor->write(vec.data(), vec.size() * sizeof(int8_t));
}
template <>
void init_int_tensor<uint8_t>(shared_ptr<runtime::Tensor> tensor, uint8_t min, uint8_t max)
{
size_t size = tensor->get_element_count();
uniform_int_distribution<int16_t> dist(static_cast<short>(min), static_cast<short>(max));
vector<uint8_t> vec(size);
for (uint8_t& element : vec)
{
element = static_cast<uint8_t>(dist(get_random_engine()));
}
tensor->write(vec.data(), vec.size() * sizeof(uint8_t));
}
void set_denormals_flush_to_zero()
{
#if defined(__x86_64__) || defined(__amd64__)
// Avoids perf impact from denormals while benchmarking with random data
_MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);
_MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON);
#endif
}
void random_init(shared_ptr<runtime::Tensor> tensor)
{
element::Type et = tensor->get_element_type();
#if !(defined(__GNUC__) && (__GNUC__ == 4 && __GNUC_MINOR__ == 8))
#pragma GCC diagnostic push
#pragma GCC diagnostic error "-Wswitch"
#pragma GCC diagnostic error "-Wswitch-enum"
#endif
switch (et.get_type_enum())
{
case element::Type_t::boolean: init_int_tensor<char>(tensor, 0, 1); break;
case element::Type_t::f32: init_real_tensor<float>(tensor, -1, 1); break;
case element::Type_t::f64: init_real_tensor<double>(tensor, -1, 1); break;
case element::Type_t::i8: init_int_tensor<int8_t>(tensor, -1, 1); break;
case element::Type_t::i16: init_int_tensor<int16_t>(tensor, -1, 1); break;
case element::Type_t::i32: init_int_tensor<int32_t>(tensor, 0, 1); break;
case element::Type_t::i64: init_int_tensor<int64_t>(tensor, 0, 1); break;
case element::Type_t::u8: init_int_tensor<uint8_t>(tensor, 0, 1); break;
case element::Type_t::u16: init_int_tensor<uint16_t>(tensor, 0, 1); break;
case element::Type_t::u32: init_int_tensor<uint32_t>(tensor, 0, 1); break;
case element::Type_t::u64: init_int_tensor<uint64_t>(tensor, 0, 1); break;
case element::Type_t::undefined:
case element::Type_t::dynamic:
case element::Type_t::bf16:
case element::Type_t::f16:
default: throw runtime_error("unsupported type");
}
#if !(defined(__GNUC__) && (__GNUC__ == 4 && __GNUC_MINOR__ == 8))
#pragma GCC diagnostic pop
#endif
}
default_random_engine& get_random_engine()
{
static std::default_random_engine s_random_engine;
return s_random_engine;
}
//*****************************************************************************
// Copyright 2017-2019 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//*****************************************************************************
#include <random>
#include "benchmark.hpp"
#include "ngraph/file_util.hpp"
#include "ngraph/runtime/backend.hpp"
#include "ngraph/runtime/host_tensor.hpp"
#include "ngraph/runtime/tensor.hpp"
#include "ngraph/serializer.hpp"
#include "ngraph/util.hpp"
void set_denormals_flush_to_zero();
void random_init(std::shared_ptr<ngraph::runtime::Tensor> tensor);
std::default_random_engine& get_random_engine();
template <typename T>
void init_int_tensor(std::shared_ptr<ngraph::runtime::Tensor> tensor, T min, T max)
{
size_t size = tensor->get_element_count();
std::uniform_int_distribution<T> dist(min, max);
std::vector<T> vec(size);
for (T& element : vec)
{
element = dist(get_random_engine());
}
tensor->write(vec.data(), vec.size() * sizeof(T));
}
template <typename T>
void init_real_tensor(std::shared_ptr<ngraph::runtime::Tensor> tensor, T min, T max)
{
size_t size = tensor->get_element_count();
std::uniform_real_distribution<T> dist(min, max);
std::vector<T> vec(size);
for (T& element : vec)
{
element = dist(get_random_engine());
}
tensor->write(vec.data(), vec.size() * sizeof(T));
}
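A small usage sketch of these helpers (hypothetical element types and shapes, not taken from the diff):

    auto real_tensor = std::make_shared<ngraph::runtime::HostTensor>(ngraph::element::f32, ngraph::Shape{2, 3});
    random_init(real_tensor); // f32 dispatches to init_real_tensor<float>(real_tensor, -1, 1)

    auto int_tensor = std::make_shared<ngraph::runtime::HostTensor>(ngraph::element::i32, ngraph::Shape{2, 3});
    init_int_tensor<int32_t>(int_tensor, 0, 9); // fill with integers drawn uniformly from [0, 9]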
......@@ -24,6 +24,7 @@
#include <iomanip>
#include "benchmark.hpp"
#include "benchmark_pipelined.hpp"
#include "ngraph/distributed.hpp"
#include "ngraph/except.hpp"
#include "ngraph/file_util.hpp"
......@@ -181,6 +182,7 @@ int main(int argc, char** argv)
int warmup_iterations = 1;
bool copy_data = true;
bool dot_file = false;
bool double_buffer = false;
for (size_t i = 1; i < argc; i++)
{
......@@ -229,6 +231,10 @@ int main(int argc, char** argv)
{
directory = argv[++i];
}
else if (arg == "--double_buffer")
{
double_buffer = true;
}
else if (arg == "-w" || arg == "--warmup_iterations")
{
try
......@@ -283,6 +289,7 @@ OPTIONS
-w|--warmup_iterations Number of warm-up iterations
--no_copy_data Disable copy of input/result data every iteration
--dot Generate Graphviz dot file
--double_buffer Double buffer inputs and outputs
)###";
return 1;
}
......@@ -420,8 +427,17 @@ OPTIONS
{
cout << "\n---- Benchmark ----\n";
shared_ptr<Function> f = deserialize(model);
auto perf_data = run_benchmark(
f, backend, iterations, timing_detail, warmup_iterations, copy_data);
vector<runtime::PerformanceCounter> perf_data;
if (double_buffer)
{
perf_data = run_benchmark_pipelined(
f, backend, iterations, timing_detail, warmup_iterations, copy_data);
}
else
{
perf_data = run_benchmark(
f, backend, iterations, timing_detail, warmup_iterations, copy_data);
}
auto perf_shape = to_perf_shape(f, perf_data);
aggregate_perf_data.insert(
aggregate_perf_data.end(), perf_shape.begin(), perf_shape.end());
......
......@@ -733,8 +733,8 @@ NGRAPH_TEST(${BACKEND_NAME}, batch_norm_bprop_n4c3h2w2)
auto C = std::make_shared<op::Parameter>(element::f32, shape_r);
auto zero = ngraph::make_zero(bn_dgamma->get_element_type(), bn_dgamma->get_shape());
ngraph::autodiff::Adjoints adjoints(NodeVector{bn_dx, bn_dgamma, bn_dbeta},
NodeVector{C, zero, zero});
ngraph::autodiff::Adjoints adjoints(OutputVector{bn_dx, bn_dgamma, bn_dbeta},
OutputVector{C, zero, zero});
auto dinput = adjoints.backprop_node(input);
auto dgamma = adjoints.backprop_node(gamma);
......
......@@ -257,10 +257,10 @@ NGRAPH_TEST(${BACKEND_NAME}, divide_adjoint_stability)
auto B = make_shared<op::Parameter>(element::f32, shape);
auto f = make_shared<Function>(make_shared<op::Divide>(A, B), ParameterVector{A, B});
auto Y_out = f->get_output_op(0);
auto Y_out = f->output(0);
auto Xs = f->get_parameters();
auto C = std::make_shared<op::Parameter>(Y_out->get_element_type(), Y_out->get_shape());
ngraph::autodiff::Adjoints adjoints(NodeVector{Y_out}, NodeVector{C});
auto C = std::make_shared<op::Parameter>(Y_out.get_element_type(), Y_out.get_shape());
ngraph::autodiff::Adjoints adjoints(OutputVector{Y_out}, OutputVector{C});
std::vector<std::shared_ptr<Node>> dYdXs(Xs.size());
transform(
Xs.begin(), Xs.end(), dYdXs.begin(), [C, &adjoints](const std::shared_ptr<Node>& X) {
......
......@@ -507,7 +507,8 @@ TEST(cpu_fusion, conv_bias_bprop_n1c1h3w3)
auto f = make_shared<Function>(
convolution_bias, ParameterVector{conv_test.data, conv_test.weights, conv_test.bias});
ngraph::autodiff::Adjoints adjoints(NodeVector{convolution_bias}, NodeVector{conv_test.delta});
ngraph::autodiff::Adjoints adjoints(OutputVector{convolution_bias},
OutputVector{conv_test.delta});
auto d_data = adjoints.backprop_node(conv_test.data);
auto d_weights = adjoints.backprop_node(conv_test.weights);
......@@ -546,7 +547,7 @@ TEST(cpu_fusion, conv_bias_bprop)
pass_manager.register_pass<pass::VisualizeTree>("conv_bias_bprop_fusion.png");
auto f = make_shared<Function>(conv_bias, ParameterVector{data_batch, filters, bias});
ngraph::autodiff::Adjoints adjoints(NodeVector{conv_bias}, NodeVector{delta});
ngraph::autodiff::Adjoints adjoints(OutputVector{conv_bias}, OutputVector{delta});
auto d_data = adjoints.backprop_node(data_batch);
auto d_weights = adjoints.backprop_node(filters);
......@@ -1452,7 +1453,7 @@ TEST(cpu_fusion, max_pool_with_indices)
auto max_pool = std::make_shared<op::MaxPool>(input, window_shape);
auto C = std::make_shared<op::Parameter>(element::f32, max_pool->get_shape());
ngraph::autodiff::Adjoints adjoints(NodeVector{max_pool}, NodeVector{C});
ngraph::autodiff::Adjoints adjoints(ngraph::OutputVector{max_pool}, ngraph::OutputVector{C});
auto dinput = adjoints.backprop_node(input);
......@@ -1789,14 +1790,14 @@ static std::shared_ptr<ngraph::Function> make_forward_function()
return std::make_shared<Function>(NodeVector{max_pool, neg, absn}, ParameterVector{input});
}
static std::pair<std::shared_ptr<ngraph::Function>, std::vector<std::shared_ptr<ngraph::Node>>>
static std::pair<std::shared_ptr<ngraph::Function>, OutputVector>
make_backward_function(std::shared_ptr<ngraph::Function> f)
{
// get parameters
std::vector<std::shared_ptr<ngraph::op::Parameter>> back_parameters = f->get_parameters();
ngraph::NodeVector adjoints;
ngraph::NodeVector outputs;
ngraph::OutputVector adjoints;
ngraph::OutputVector outputs;
for (auto Y : f->get_results())
{
// Get the output
......@@ -1809,7 +1810,7 @@ static std::pair<std::shared_ptr<ngraph::Function>, std::vector<std::shared_ptr<
ngraph::autodiff::Adjoints adjoint{outputs, adjoints};
// Perform autodiff
std::vector<std::shared_ptr<Node>> dYdXs(back_parameters.size());
OutputVector dYdXs(back_parameters.size());
transform(back_parameters.begin(),
back_parameters.end(),
dYdXs.begin(),
......@@ -1818,7 +1819,8 @@ static std::pair<std::shared_ptr<ngraph::Function>, std::vector<std::shared_ptr<
// create the backward function
std::vector<std::shared_ptr<ngraph::op::Parameter>> param_adjoints;
for (auto n : adjoints)
param_adjoints.push_back(std::dynamic_pointer_cast<ngraph::op::Parameter>(n));
param_adjoints.push_back(
std::dynamic_pointer_cast<ngraph::op::Parameter>(n.get_node_shared_ptr()));
back_parameters.insert(back_parameters.begin(), param_adjoints.begin(), param_adjoints.end());
return {std::make_shared<ngraph::Function>(dYdXs, back_parameters), adjoints};
......@@ -2703,7 +2705,7 @@ void sigmoid_multiply_fusion_backward_compute(runtime::Backend* backend,
auto sigmoid_mul =
make_shared<op::SigmoidMultiply>(input_0_alt, input_1_alt, input_0_type, input_1_type);
ngraph::autodiff::Adjoints adjoints(NodeVector{sigmoid_mul}, NodeVector{delta_param});
ngraph::autodiff::Adjoints adjoints(OutputVector{sigmoid_mul}, OutputVector{delta_param});
auto d_input_0 = adjoints.backprop_node(input_0_adjoint);
auto d_input_1 = adjoints.backprop_node(input_1_adjoint);
auto df = make_shared<Function>(NodeVector{d_input_0, d_input_1}, back_params);
......
......@@ -144,7 +144,7 @@ namespace ngraph
// df/dX*
std::vector<std::shared_ptr<Node>> df_output_params;
Adjoints adjoints(NodeVector{f->get_output_op(0)}, NodeVector{c_param});
Adjoints adjoints(OutputVector{f->output(0)}, OutputVector{c_param});
// for each x "of interest"
for (auto x : indep_params)
......
......@@ -32,10 +32,10 @@ using namespace ngraph;
std::shared_ptr<Function> autodiff::backprop_function(const std::shared_ptr<Function>& f)
{
auto Y_out = f->get_output_op(0);
auto Y_out = f->output(0);
auto Xs = f->get_parameters();
auto C = std::make_shared<op::Parameter>(Y_out->get_element_type(), Y_out->get_shape());
Adjoints adjoints(NodeVector{Y_out}, NodeVector{C});
auto C = std::make_shared<op::Parameter>(Y_out.get_element_type(), Y_out.get_shape());
Adjoints adjoints(OutputVector{Y_out}, OutputVector{C});
std::vector<std::shared_ptr<Node>> dYdXs(Xs.size());
transform(Xs.begin(), Xs.end(), dYdXs.begin(), [C, &adjoints](const std::shared_ptr<Node>& X) {
return adjoints.backprop_node(X);
......