Unverified Commit b50c17bf authored by Robert Kimball, committed by GitHub

Merge branch 'master' into tsocha/improve-cmake-grama

parents b5549e0d 1eda1350
...@@ -32,7 +32,7 @@ int main() ...@@ -32,7 +32,7 @@ int main()
auto t1 = std::make_shared<op::Multiply>(t0, c); auto t1 = std::make_shared<op::Multiply>(t0, c);
// Make the function // Make the function
auto f = std::make_shared<Function>(NodeVector{t1}, auto f = std::make_shared<Function>(OutputVector{t1},
ParameterVector{a, b, c}); ParameterVector{a, b, c});
// Create the backend // Create the backend
......
...@@ -31,7 +31,7 @@ int main() ...@@ -31,7 +31,7 @@ int main()
auto t1 = (a + b) * c; auto t1 = (a + b) * c;
// Make the function // Make the function
auto f = std::make_shared<Function>(NodeVector{t1}, auto f = std::make_shared<Function>(OutputVector{t1},
ParameterVector{a, b, c}); ParameterVector{a, b, c});
// Get the backend // Get the backend
......
...@@ -175,8 +175,8 @@ int main(int argc, char* argv[]) ...@@ -175,8 +175,8 @@ int main(int argc, char* argv[])
auto delta = -learning_rate * loss; auto delta = -learning_rate * loss;
// Updates // Updates
ngraph::autodiff::Adjoints adjoints(NodeVector{loss}, ngraph::autodiff::Adjoints adjoints(OutputVector{loss},
NodeVector{delta}); OutputVector{delta});
auto grad_W0 = adjoints.backprop_node(W0); auto grad_W0 = adjoints.backprop_node(W0);
auto grad_b0 = adjoints.backprop_node(b0); auto grad_b0 = adjoints.backprop_node(b0);
auto grad_W1 = adjoints.backprop_node(W1); auto grad_W1 = adjoints.backprop_node(W1);
...@@ -231,7 +231,7 @@ int main(int argc, char* argv[]) ...@@ -231,7 +231,7 @@ int main(int argc, char* argv[])
NodeMap train_node_map; NodeMap train_node_map;
auto train_function = clone_function( auto train_function = clone_function(
Function( Function(
NodeVector{loss, softmax, W0_next, b0_next, W1_next, b1_next}, OutputVector{loss, softmax, W0_next, b0_next, W1_next, b1_next},
ParameterVector{X, Y, N, learning_rate, W0, b0, W1, b1}), ParameterVector{X, Y, N, learning_rate, W0, b0, W1, b1}),
train_node_map); train_node_map);
auto train_exec = backend->compile(train_function); auto train_exec = backend->compile(train_function);
...@@ -240,7 +240,7 @@ int main(int argc, char* argv[]) ...@@ -240,7 +240,7 @@ int main(int argc, char* argv[])
// X, W0, b0, W1, b1 -> softmax // X, W0, b0, W1, b1 -> softmax
NodeMap inference_node_map; NodeMap inference_node_map;
auto inference_function = clone_function( auto inference_function = clone_function(
Function(NodeVector{softmax}, ParameterVector{X, W0, b0, W1, b1}), Function(OutputVector{softmax}, ParameterVector{X, W0, b0, W1, b1}),
inference_node_map); inference_node_map);
auto inference_exec = backend->compile(inference_function); auto inference_exec = backend->compile(inference_function);
......
...@@ -172,8 +172,8 @@ int main(int argc, const char* argv[]) ...@@ -172,8 +172,8 @@ int main(int argc, const char* argv[])
auto delta = -learning_rate * loss; auto delta = -learning_rate * loss;
// Updates // Updates
ngraph::autodiff::Adjoints adjoints(NodeVector{loss}, ngraph::autodiff::Adjoints adjoints(OutputVector{loss},
NodeVector{delta}); OutputVector{delta});
auto W0_next = W0 + adjoints.backprop_node(W0); auto W0_next = W0 + adjoints.backprop_node(W0);
auto b0_next = b0 + adjoints.backprop_node(b0); auto b0_next = b0 + adjoints.backprop_node(b0);
auto W1_next = W1 + adjoints.backprop_node(W1); auto W1_next = W1 + adjoints.backprop_node(W1);
...@@ -218,7 +218,7 @@ int main(int argc, const char* argv[]) ...@@ -218,7 +218,7 @@ int main(int argc, const char* argv[])
NodeMap train_node_map; NodeMap train_node_map;
auto train_function = clone_function( auto train_function = clone_function(
Function( Function(
NodeVector{loss, softmax, W0_next, b0_next, W1_next, b1_next}, OutputVector{loss, softmax, W0_next, b0_next, W1_next, b1_next},
ParameterVector{X, Y, N, learning_rate, W0, b0, W1, b1}), ParameterVector{X, Y, N, learning_rate, W0, b0, W1, b1}),
train_node_map); train_node_map);
auto train_exec = backend->compile(train_function); auto train_exec = backend->compile(train_function);
...@@ -227,7 +227,7 @@ int main(int argc, const char* argv[]) ...@@ -227,7 +227,7 @@ int main(int argc, const char* argv[])
// X, W0, b0, W1, b1 -> softmax // X, W0, b0, W1, b1 -> softmax
NodeMap inference_node_map; NodeMap inference_node_map;
auto inference_function = clone_function( auto inference_function = clone_function(
Function(NodeVector{softmax}, ParameterVector{X, W0, b0, W1, b1}), Function(OutputVector{softmax}, ParameterVector{X, W0, b0, W1, b1}),
inference_node_map); inference_node_map);
auto inference_exe = backend->compile(inference_function); auto inference_exe = backend->compile(inference_function);
......
...@@ -5,7 +5,7 @@ Build and Test ...@@ -5,7 +5,7 @@ Build and Test
############### ###############
* :ref:`default_ngflags` * :ref:`default_ngflags`
* :ref:`ngraph_plaidml_backend` .. :ref:`ngraph_plaidml_backend`
There are a few common paths to take when manually building the |project| There are a few common paths to take when manually building the |project|
from source code. Today nGraph supports various developers working on all from source code. Today nGraph supports various developers working on all
...@@ -161,17 +161,17 @@ The process documented here will work on CentOS 7.4. ...@@ -161,17 +161,17 @@ The process documented here will work on CentOS 7.4.
$ make && sudo make install $ make && sudo make install
.. _ngraph_plaidml_backend: .. .. _ngraph_plaidml_backend: hide this until announcement is official
Building nGraph-PlaidML from source .. Building nGraph-PlaidML from source
=================================== .. ===================================
The following instructions will create the ``~/ngraph_plaidml_dist`` .. The following instructions will create the ``~/ngraph_plaidml_dist``
locally: .. locally:
#. Ensure you have installed the :ref:`prerequisites` for your OS. .. #. Ensure you have installed the :ref:`prerequisites` for your OS.
#. Install the prerequisites for the backend. Our hybrid ``NGRAPH_PLAIDML`` .. #. Install the prerequisites for the backend. Our hybrid ``NGRAPH_PLAIDML``
backend works best with Python3 versions. We recommend that you use a backend works best with Python3 versions. We recommend that you use a
virtual environment, due to some of the difficulties that users have virtual environment, due to some of the difficulties that users have
seen when trying to install outside of a venv. seen when trying to install outside of a venv.
...@@ -182,20 +182,20 @@ locally: ...@@ -182,20 +182,20 @@ locally:
$ pip install plaidml $ pip install plaidml
$ plaidml-setup $ plaidml-setup
#. Clone the source code, create and enter your build directory: .. #. Clone the source code, create and enter your build directory:
.. code-block:: console .. code-block:: console
$ git clone https://github.com/NervanaSystems/ngraph.git $ git clone https://github.com/NervanaSystems/ngraph.git
$ cd ngraph && mkdir build && cd build $ cd ngraph && mkdir build && cd build
#. Prepare the CMake files as follows: .. #. Prepare the CMake files as follows:
.. code-block:: console .. code-block:: console
$ cmake .. -DCMAKE_INSTALL_PREFIX=~/ngraph_plaidml_dist -DNGRAPH_CPU_ENABLE=OFF -DNGRAPH_PLAIDML_ENABLE=ON $ cmake .. -DCMAKE_INSTALL_PREFIX=~/ngraph_plaidml_dist -DNGRAPH_CPU_ENABLE=OFF -DNGRAPH_PLAIDML_ENABLE=ON
#. Run :command:`make` and ``make install``. Note that if you are building .. #. Run :command:`make` and ``make install``. Note that if you are building
outside a local or user path, you may need to run ``make install`` as the outside a local or user path, you may need to run ``make install`` as the
root user. root user.
...@@ -210,8 +210,8 @@ locally: ...@@ -210,8 +210,8 @@ locally:
tests can be run when PlaidML devices are available at the machine tests can be run when PlaidML devices are available at the machine
level. level.
For more about working with the PlaidML backend from nGraph, see our .. For more about working with the PlaidML backend from nGraph, see our
API documentation :doc:`backends/plaidml-ng-api/index`. .. API documentation :doc:`backends/plaidml-ng-api/index`.
macOS\* development macOS\* development
......
...@@ -99,8 +99,8 @@ Once the graph is built, we need to package it in a ``Function``: ...@@ -99,8 +99,8 @@ Once the graph is built, we need to package it in a ``Function``:
:lines: 35-36 :lines: 35-36
The first argument to the constructor specifies the nodes that the function will The first argument to the constructor specifies the nodes that the function will
return; in this case, the product. A ``NodeVector`` is a vector of shared return; in this case, the product. An ``OutputVector`` is a vector of references to
pointers of ``op::Node``. The second argument specifies the parameters of the outputs of ``op::Node``. The second argument specifies the parameters of the
function, in the order they are to be passed to the compiled function. A function, in the order they are to be passed to the compiled function. A
``ParameterVector`` is a vector of shared pointers to ``op::Parameter``. ``ParameterVector`` is a vector of shared pointers to ``op::Parameter``.
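For reference, a minimal sketch of this step (assuming the parameters ``a``, ``b``,
``c`` and the product node ``t1`` were built as shown earlier) would be:

.. code-block:: cpp

   // Package the graph: t1 is the returned output; a, b, c are the inputs.
   auto f = std::make_shared<Function>(OutputVector{t1},
                                       ParameterVector{a, b, c});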
......
...@@ -62,14 +62,14 @@ more detail and describe how nGraph addresses them. ...@@ -62,14 +62,14 @@ more detail and describe how nGraph addresses them.
Problem 1: Kernel libraries do not support graph-level optimizations Problem 1: Kernel libraries do not support graph-level optimizations
-------------------------------------------------------------------- --------------------------------------------------------------------
The example diagrams below shows how a deep learning framework, when integrated The example diagrams below show how a deep learning framework, when integrated
with a kernel library, can optimally run each operation in a computational with a kernel library, can optimally run each operation in a computational
graph, but the choice of operations in the graph may not be optimal. graph, but the choice of operations in the graph may not be optimal.
.. _figure-A: .. _figure-A:
.. figure:: ../graphics/kernel-problem-1.png .. figure:: ../graphics/kernel-problem-1.png
:width: 555px :width: 100%
:alt: :alt:
...@@ -95,7 +95,7 @@ diagram. ...@@ -95,7 +95,7 @@ diagram.
.. _figure-B: .. _figure-B:
.. figure:: ../graphics/kernel-problem-2.png .. figure:: ../graphics/kernel-problem-2.png
:width: 555px :width: 100%
:alt: :alt:
Each framework must be manually integrated with each hardware-specific kernel Each framework must be manually integrated with each hardware-specific kernel
...@@ -130,7 +130,7 @@ work for what will ultimately be a fragile setup that is costly to maintain. ...@@ -130,7 +130,7 @@ work for what will ultimately be a fragile setup that is costly to maintain.
.. _figure-C: .. _figure-C:
.. figure:: ../graphics/kernel-problem-3.png .. figure:: ../graphics/kernel-problem-3.png
:width: 555px :width: 100%
:alt: :alt:
......
...@@ -16,14 +16,19 @@ We are pleased to announce the release of version |version|-doc. ...@@ -16,14 +16,19 @@ We are pleased to announce the release of version |version|-doc.
Core updates for |version| Core updates for |version|
~~~~~~~~~~~~~~~~~~~~~~~~~~~ ~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ Better PlaidML support
+ More ONNX ops
+ Elementwise divide defaults to Python semantics
+ GenerateMask seed optional
+ Graph visualization improvements
+ Preserve control dependencies in more places
+ GetOutputElement has single input
Latest doc updates
~~~~~~~~~~~~~~~~~~
+ Add instructions how to build ``NGRAPH_PLAIDML`` backend. .. Latest doc updates
.. ~~~~~~~~~~~~~~~~~~
.. + Add instructions how to build ``NGRAPH_PLAIDML`` backend.
.. important:: Pre-releases (``-rc-0.*``) have newer features, and are less stable. .. important:: Pre-releases (``-rc-0.*``) have newer features, and are less stable.
...@@ -35,7 +40,6 @@ Changelog on Previous Releases ...@@ -35,7 +40,6 @@ Changelog on Previous Releases
0.23 0.23
---- ----
+ PlaidML support
+ More ONNX ops + More ONNX ops
+ Elementwise divide defaults to Python semantics + Elementwise divide defaults to Python semantics
+ GenerateMask seed optional + GenerateMask seed optional
......
:orphan: :orphan:
.. toctree:: .. toctree::
:caption: Sitemap :includehidden:
:maxdepth: 1
frameworks/index
frameworks/index project/index
python_api/index python_api/index
inspection/index inspection/index
core/overview core/overview
backends/index backends/index
project/index project/extras/index
...@@ -23,6 +23,8 @@ ...@@ -23,6 +23,8 @@
namespace py = pybind11; namespace py = pybind11;
static const char* CAPSULE_NAME = "ngraph_function";
void regclass_pyngraph_Function(py::module m) void regclass_pyngraph_Function(py::module m)
{ {
py::class_<ngraph::Function, std::shared_ptr<ngraph::Function>> function(m, "Function"); py::class_<ngraph::Function, std::shared_ptr<ngraph::Function>> function(m, "Function");
...@@ -49,4 +51,41 @@ void regclass_pyngraph_Function(py::module m) ...@@ -49,4 +51,41 @@ void regclass_pyngraph_Function(py::module m)
py::cast(self.get_output_shape(0)).attr("__str__")().cast<std::string>(); py::cast(self.get_output_shape(0)).attr("__str__")().cast<std::string>();
return "<" + class_name + ": '" + self.get_friendly_name() + "' (" + shape + ")>"; return "<" + class_name + ": '" + self.get_friendly_name() + "' (" + shape + ")>";
}); });
function.def_static("from_capsule", [](py::object* capsule) {
// get the underlying PyObject* which is a PyCapsule pointer
auto* pybind_capsule_ptr = capsule->ptr();
// extract the pointer stored in the PyCapsule under the name CAPSULE_NAME
auto* capsule_ptr = PyCapsule_GetPointer(pybind_capsule_ptr, CAPSULE_NAME);
auto* ngraph_function = static_cast<std::shared_ptr<ngraph::Function>*>(capsule_ptr);
if (ngraph_function)
{
return *ngraph_function;
}
else
{
throw std::runtime_error("The provided capsule does not contain an ngraph::Function");
}
});
function.def_static("to_capsule", [](std::shared_ptr<ngraph::Function>& ngraph_function) {
// create a shared pointer on the heap before putting it in the capsule
// this secures the lifetime of the object transferred by the capsule
auto* sp_copy = new std::shared_ptr<ngraph::Function>(ngraph_function);
// a destructor callback that will delete the heap allocated shared_ptr
// when the capsule is destructed
auto sp_deleter = [](PyObject* capsule) {
auto* capsule_ptr = PyCapsule_GetPointer(capsule, CAPSULE_NAME);
auto* function_sp = static_cast<std::shared_ptr<ngraph::Function>*>(capsule_ptr);
if (function_sp)
{
delete function_sp;
}
};
// put the shared_ptr in a new capsule under the same name as in "from_capsule"
auto pybind_capsule = py::capsule(sp_copy, CAPSULE_NAME, sp_deleter);
return pybind_capsule;
});
} }
...@@ -30,6 +30,7 @@ ...@@ -30,6 +30,7 @@
#include "ngraph/op/divide.hpp" #include "ngraph/op/divide.hpp"
#include "ngraph/op/dot.hpp" #include "ngraph/op/dot.hpp"
#include "ngraph/op/experimental/compiled_kernel.hpp" #include "ngraph/op/experimental/compiled_kernel.hpp"
#include "ngraph/op/gather.hpp"
#include "ngraph/op/greater.hpp" #include "ngraph/op/greater.hpp"
#include "ngraph/op/less.hpp" #include "ngraph/op/less.hpp"
#include "ngraph/op/maximum.hpp" #include "ngraph/op/maximum.hpp"
...@@ -63,6 +64,7 @@ ...@@ -63,6 +64,7 @@
using llvm::SmallVector; using llvm::SmallVector;
using llvm::StringRef; using llvm::StringRef;
using llvm::make_unique; using llvm::make_unique;
using llvm::ArrayRef;
using namespace ngraph::runtime::ngmlir; using namespace ngraph::runtime::ngmlir;
...@@ -282,11 +284,20 @@ void MLIRCompiler::build_ng_dialect() ...@@ -282,11 +284,20 @@ void MLIRCompiler::build_ng_dialect()
throw unsupported_op{std::string{"The MLIR backend doesn't currently implement the '"} + throw unsupported_op{std::string{"The MLIR backend doesn't currently implement the '"} +
np->description() + "' operation"}; np->description() + "' operation"};
} }
mlir::Value* mlir_value = it->second(*this, np.get()); mlir::Operation* op = it->second(*this, np.get());
// builders that have multiple result values will update the value map, and set their ret values to null // This assumes simple 1:1 mapping between output edges and generated MLIR op results
if (mlir_value) // If the mapping is more complex, the create_op helper can return null operation
// and handles populating the value map itself
if (op)
{ {
update_tensor_value(np->get_output_tensor_ptr().get(), mlir_value); for (auto i = 0; i < op->getNumResults(); i++)
{
mlir::Value* result = op->getResult(i);
if (result)
{
update_tensor_value(np->get_output_tensor_ptr(i).get(), result);
}
}
} }
} }
create_return(); create_return();
...@@ -299,133 +310,125 @@ namespace ngraph ...@@ -299,133 +310,125 @@ namespace ngraph
namespace ngmlir namespace ngmlir
{ {
template <> template <>
mlir::Value* MLIRCompiler::COMPILE_OP_DECL(ngraph::op::Add) mlir::Operation* MLIRCompiler::COMPILE_OP_DECL(ngraph::op::Add)
{ {
return compiler.create_binary_op<mlir::NGAddOp>(ng_node); return compiler.create_generic_op<mlir::NGAddOp>(ng_node);
} }
template <> template <>
mlir::Value* MLIRCompiler::COMPILE_OP_DECL(ngraph::op::Subtract) mlir::Operation* MLIRCompiler::COMPILE_OP_DECL(ngraph::op::Subtract)
{ {
return compiler.create_binary_op<mlir::NGSubOp>(ng_node); return compiler.create_generic_op<mlir::NGSubOp>(ng_node);
} }
template <> template <>
mlir::Value* MLIRCompiler::COMPILE_OP_DECL(ngraph::op::Multiply) mlir::Operation* MLIRCompiler::COMPILE_OP_DECL(ngraph::op::Multiply)
{ {
return compiler.create_binary_op<mlir::NGMulOp>(ng_node); return compiler.create_generic_op<mlir::NGMulOp>(ng_node);
} }
template <> template <>
mlir::Value* MLIRCompiler::COMPILE_OP_DECL(ngraph::op::Divide) mlir::Operation* MLIRCompiler::COMPILE_OP_DECL(ngraph::op::Divide)
{ {
return compiler.create_binary_op<mlir::NGDivOp>(ng_node); return compiler.create_generic_op<mlir::NGDivOp>(ng_node);
} }
template <> template <>
mlir::Value* MLIRCompiler::COMPILE_OP_DECL(ngraph::op::Greater) mlir::Operation* MLIRCompiler::COMPILE_OP_DECL(ngraph::op::Greater)
{ {
return compiler.create_binary_op<mlir::NGGreaterOp>(ng_node); return compiler.create_generic_op<mlir::NGGreaterOp>(ng_node);
} }
template <> template <>
mlir::Value* MLIRCompiler::COMPILE_OP_DECL(ngraph::op::Less) mlir::Operation* MLIRCompiler::COMPILE_OP_DECL(ngraph::op::Less)
{ {
return compiler.create_binary_op<mlir::NGLessOp>(ng_node); return compiler.create_generic_op<mlir::NGLessOp>(ng_node);
} }
template <> template <>
mlir::Value* MLIRCompiler::COMPILE_OP_DECL(ngraph::op::Maximum) mlir::Operation* MLIRCompiler::COMPILE_OP_DECL(ngraph::op::Maximum)
{ {
return compiler.create_binary_op<mlir::NGMaxOp>(ng_node); return compiler.create_generic_op<mlir::NGMaxOp>(ng_node);
} }
template <> template <>
mlir::Value* MLIRCompiler::COMPILE_OP_DECL(ngraph::op::Minimum) mlir::Operation* MLIRCompiler::COMPILE_OP_DECL(ngraph::op::Minimum)
{ {
return compiler.create_binary_op<mlir::NGMinOp>(ng_node); return compiler.create_generic_op<mlir::NGMinOp>(ng_node);
} }
template <> template <>
mlir::Value* MLIRCompiler::COMPILE_OP_DECL(ngraph::op::ArgMax) mlir::Operation* MLIRCompiler::COMPILE_OP_DECL(ngraph::op::ArgMax)
{ {
return compiler.create_index_reduction<mlir::NGArgMaxRedOp>(ng_node); return compiler.create_index_reduction<mlir::NGArgMaxRedOp>(ng_node);
} }
template <> template <>
mlir::Value* MLIRCompiler::COMPILE_OP_DECL(ngraph::op::ArgMin) mlir::Operation* MLIRCompiler::COMPILE_OP_DECL(ngraph::op::ArgMin)
{ {
return compiler.create_index_reduction<mlir::NGArgMinRedOp>(ng_node); return compiler.create_index_reduction<mlir::NGArgMinRedOp>(ng_node);
} }
template <> template <>
mlir::Value* MLIRCompiler::COMPILE_OP_DECL(ngraph::op::Dot) mlir::Operation* MLIRCompiler::COMPILE_OP_DECL(ngraph::op::Dot)
{ {
return compiler.create_binary_op<mlir::NGDotOp>(ng_node); return compiler.create_generic_op<mlir::NGDotOp>(ng_node);
} }
template <> template <>
mlir::Value* MLIRCompiler::COMPILE_OP_DECL(ngraph::op::Concat) mlir::Operation* MLIRCompiler::COMPILE_OP_DECL(ngraph::op::Concat)
{ {
return compiler.create_concat(ng_node); auto ng_node_concat = static_cast<const ngraph::op::Concat*>(ng_node);
auto op = compiler.create_generic_op<mlir::NGConcatOp>(ng_node);
op->setAttr("concatenation_axis",
compiler.m_builder->getI64IntegerAttr(
ng_node_concat->get_concatenation_axis()));
return op;
} }
template <> template <>
mlir::Value* MLIRCompiler::COMPILE_OP_DECL(ngraph::op::Relu) mlir::Operation* MLIRCompiler::COMPILE_OP_DECL(ngraph::op::Gather)
{ {
return compiler.create_unary_op<mlir::NGReluOp>(ng_node); auto ng_node_gather = static_cast<const ngraph::op::Gather*>(ng_node);
auto op = compiler.create_generic_op<mlir::NGGatherOp>(ng_node);
op->setAttr("axis",
compiler.m_builder->getI64IntegerAttr(ng_node_gather->get_axis()));
return op;
} }
} }
} }
} }
const MLIRCompiler::MLIRCompOpMap MLIRCompiler::op_dispatcher{ template <typename Op>
#define MLIR_OP(OP) {TI(ngraph::op::OP), &MLIRCompiler::create_op<ngraph::op::OP>}, mlir::Operation* MLIRCompiler::create_generic_op(const ngraph::Node* ng_node)
#include "ops_supported.inc"
};
template <typename UnaryOp>
mlir::Value* MLIRCompiler::create_unary_op(const ngraph::Node* ng_node)
{
auto lhs = ng_node->get_argument(0)->get_output_tensor_ptr();
auto lhs_v = get_tensor_value(lhs.get()).m_value;
auto res_type = get_mlir_type(ng_node->get_output_tensor_ptr().get());
return m_builder->create<UnaryOp>(mlir::UnknownLoc::get(&m_context), res_type, lhs_v)
.getResult();
}
template <typename BinOp>
mlir::Value* MLIRCompiler::create_binary_op(const ngraph::Node* ng_node)
{
auto lhs = ng_node->get_argument(0)->get_output_tensor_ptr();
auto rhs = ng_node->get_argument(1)->get_output_tensor_ptr();
auto lhs_v = get_tensor_value(lhs.get()).m_value;
auto rhs_v = get_tensor_value(rhs.get()).m_value;
auto res_type = get_mlir_type(ng_node->get_output_tensor_ptr().get());
return m_builder->create<BinOp>(mlir::UnknownLoc::get(&m_context), res_type, lhs_v, rhs_v)
.getResult();
}
mlir::Value* MLIRCompiler::create_concat(const ngraph::Node* ng_node)
{ {
std::vector<mlir::Value*> arg_values; std::vector<mlir::Value*> arg_values;
auto ng_node_concat = static_cast<const ngraph::op::Concat*>(ng_node); std::vector<mlir::Type> res_types;
for (auto& arg : ng_node->get_arguments()) for (auto& arg : ng_node->get_arguments())
{ {
auto arg_tensor = arg->get_output_tensor_ptr(); auto arg_tensor = arg->get_output_tensor_ptr();
auto arg_v = get_tensor_value(arg_tensor.get()).m_value; auto arg_v = get_tensor_value(arg_tensor.get()).m_value;
arg_values.push_back(arg_v); arg_values.push_back(arg_v);
} }
auto res_type = get_mlir_type(ng_node->get_output_tensor_ptr().get());
return m_builder for (auto& output : ng_node->outputs())
->create<mlir::NGConcatOp>( {
mlir::UnknownLoc::get(&m_context), res_types.push_back(get_mlir_type(output.get_tensor_ptr().get()));
res_type, }
arg_values,
m_builder->getI64IntegerAttr(ng_node_concat->get_concatenation_axis())) return (m_builder->create<Op,
.getResult(); ArrayRef<mlir::Type>,
ArrayRef<mlir::Value*>,
ArrayRef<mlir::NamedAttribute>>(
mlir::UnknownLoc::get(&m_context), res_types, arg_values, {/* no attrs */}))
.getOperation();
} }
const MLIRCompiler::MLIRCompOpMap MLIRCompiler::op_dispatcher{
#define MLIR_OP(OP) {TI(ngraph::op::OP), &MLIRCompiler::create_op<ngraph::op::OP>},
#include "ops_supported.inc"
};
void MLIRCompiler::create_return() void MLIRCompiler::create_return()
{ {
std::vector<mlir::Value*> value_list; std::vector<mlir::Value*> value_list;
...@@ -437,21 +440,16 @@ void MLIRCompiler::create_return() ...@@ -437,21 +440,16 @@ void MLIRCompiler::create_return()
} }
template <typename RedOp> template <typename RedOp>
mlir::Value* MLIRCompiler::create_index_reduction(const ngraph::Node* ng_node) mlir::Operation* MLIRCompiler::create_index_reduction(const ngraph::Node* ng_node)
{ {
auto* idx_red = static_cast<const ngraph::op::util::IndexReduction*>(ng_node); auto* idx_red = static_cast<const ngraph::op::util::IndexReduction*>(ng_node);
auto op = create_generic_op<RedOp>(ng_node);
auto arg = idx_red->get_argument(0); mlir::ArrayAttr red_axes_attr =
size_t red_axis = idx_red->get_reduction_axis(); m_builder->getI64ArrayAttr({(int64_t)idx_red->get_reduction_axis()});
op->setAttr("axes", red_axes_attr);
mlir::Value* arg_val = get_tensor_value(arg->get_output_tensor_ptr().get()).m_value; return op;
mlir::ArrayAttr red_axes_attr = m_builder->getI64ArrayAttr({(int64_t)red_axis});
return m_builder
->create<RedOp>(
mlir::UnknownLoc::get(&m_context), get_mlir_type(ng_node), arg_val, red_axes_attr)
.getResult();
} }
// Binds MLIR function arguments to the proper values. This includes externally allocated tensors // Binds MLIR function arguments to the proper values. This includes externally allocated tensors
// helpers to be used inside the function. // helpers to be used inside the function.
void MLIRCompiler::bind_arguments() void MLIRCompiler::bind_arguments()
......
...@@ -98,25 +98,21 @@ namespace ngraph ...@@ -98,25 +98,21 @@ namespace ngraph
void build_ng_dialect(); void build_ng_dialect();
template <typename OP> template <typename Op>
static mlir::Value* create_op(MLIRCompiler& compiler, const ngraph::Node* ng_node) static mlir::Operation* create_op(MLIRCompiler& compiler,
const ngraph::Node* ng_node)
{ {
throw std::runtime_error("Unimplemented op '" + ng_node->description() + throw std::runtime_error("Unimplemented op '" + ng_node->description() +
"' in MLIR Compiler"); "' in MLIR Compiler");
} }
template <typename UnaryOp> // Generic op lowerer to ng dialect.
mlir::Value* create_unary_op(const ngraph::Node* ng_node); // Simply maps ngraph tensors to values and generate an OP. No op-specific logic.
template <typename Op>
template <typename BinOp> mlir::Operation* create_generic_op(const ngraph::Node* ng_node);
mlir::Value* create_binary_op(const ngraph::Node* ng_node);
// TODO(amprocte): Can we have a create_variadic_op that is able to handle the
// attributes?
mlir::Value* create_concat(const ngraph::Node* ng_node);
template <typename RedOp> template <typename RedOp>
mlir::Value* create_index_reduction(const ngraph::Node* ng_node); mlir::Operation* create_index_reduction(const ngraph::Node* ng_node);
void create_return(); void create_return();
...@@ -150,7 +146,7 @@ namespace ngraph ...@@ -150,7 +146,7 @@ namespace ngraph
using TensorToInfo = std::pair<descriptor::Tensor*, TensorInfo>; using TensorToInfo = std::pair<descriptor::Tensor*, TensorInfo>;
using TensorToInfoMap = std::unordered_map<descriptor::Tensor*, TensorInfo>; using TensorToInfoMap = std::unordered_map<descriptor::Tensor*, TensorInfo>;
using MLIRCompOpFunction = using MLIRCompOpFunction =
std::function<mlir::Value*(MLIRCompiler& compiler, const ngraph::Node*)>; std::function<mlir::Operation*(MLIRCompiler& compiler, const ngraph::Node*)>;
using MLIRCompOpMap = std::unordered_map<std::type_index, MLIRCompOpFunction>; using MLIRCompOpMap = std::unordered_map<std::type_index, MLIRCompOpFunction>;
// Maps tensor to the value it represents in the IR // Maps tensor to the value it represents in the IR
......
...@@ -168,6 +168,39 @@ static mlir::LogicalResult verifyCmpOp(T* op) ...@@ -168,6 +168,39 @@ static mlir::LogicalResult verifyCmpOp(T* op)
return mlir::success(); return mlir::success();
} }
template <>
mlir::LogicalResult verifyOp(NGGatherOp* op)
{
Type ty = op->params()->getType();
NGTensorType inputType = ty.cast<NGTensorType>();
ty = op->indices()->getType();
NGTensorType indicesType = ty.cast<NGTensorType>();
// ensure axis < params rank
if (op->axis().getSExtValue() >= inputType.getRank())
return op->emitOpError("Gather axis is larger than input rank");
ty = indicesType.getElementType();
// ensure indices are I32 or I64
if (!ty.isa<NGIntegerType>())
return op->emitOpError("Indices tensor is not of Integer type");
NGIntegerType indicesEltType = ty.cast<NGIntegerType>();
if (!indicesEltType.isInt32() && !indicesEltType.isInt64())
return op->emitOpError("Indices tensor is not of I32 or I64 type");
mlir::Type r0 = op->res()->getType();
NGTensorType resType = r0.cast<NGTensorType>();
// ensure result is compatible with input
if (resType.getRank() != inputType.getRank() + indicesType.getRank() - 1)
return op->emitOpError("Incompatible result shape and/or type");
return mlir::success();
}
namespace mlir namespace mlir
{ {
#define GET_OP_CLASSES #define GET_OP_CLASSES
......
...@@ -186,8 +186,8 @@ def NGDotOp : NG_Binary_Op<"dot"> ...@@ -186,8 +186,8 @@ def NGDotOp : NG_Binary_Op<"dot">
// class, but I'm not sure how to add concatenation_axis into the args if we // class, but I'm not sure how to add concatenation_axis into the args if we
// do that. // do that.
def NGConcatOp : def NGConcatOp :
NG_OneResult_Op<"concat", [NoSideEffect]>, NG_OneResult_Op<"concat", [NoSideEffect]>,
Arguments<(ins Variadic<NG_TensorType>:$args, I64Attr:$concatenation_axis)> Arguments<(ins Variadic<NG_TensorType>:$args, I64Attr:$concatenation_axis)>
{ {
let parser = [{ NGRAPH_CHECK(false, "No parser support"); return mlir::failure(); }]; let parser = [{ NGRAPH_CHECK(false, "No parser support"); return mlir::failure(); }];
...@@ -200,7 +200,7 @@ class NG_Axis_Reduction_Op<string mnemonic, list<OpTrait> traits = []> : ...@@ -200,7 +200,7 @@ class NG_Axis_Reduction_Op<string mnemonic, list<OpTrait> traits = []> :
{ {
let summary = "Base class for reduction operations that perform a reduction " let summary = "Base class for reduction operations that perform a reduction "
"across the axes of a single tensor."; "across the axes of a single tensor.";
let description = "Axes are represented as an array of I64 attributes."; let description = [{Axes are represented as an array of I64 attributes.}];
let parser = [{ NGRAPH_CHECK(false, "No parser support"); return mlir::failure(); }]; let parser = [{ NGRAPH_CHECK(false, "No parser support"); return mlir::failure(); }];
...@@ -257,6 +257,24 @@ def NGAnyRedOp : NG_Axis_Reduction_Op<"any.red"> ...@@ -257,6 +257,24 @@ def NGAnyRedOp : NG_Axis_Reduction_Op<"any.red">
let verifier = [{ return verifyLogicalReductionOp(this); }]; let verifier = [{ return verifyLogicalReductionOp(this); }];
} }
// Gather
def NGGatherOp :
NG_OneResult_Op<"gather", [NoSideEffect]>,
Arguments<(ins NG_TensorType:$params, NG_TensorType:$indices, I64Attr:$axis)>
{
let summary = "Gather slices from params along the specified axis according to indices";
let description = [{
Gather slices from axis of params according to indices
params The tensor from which slices are gathered
indices Index tensor. Data type must be `element::i32` or `element::i64`
axis Axis in params to gather
}];
let parser = [{ NGRAPH_CHECK(false, "No parser support"); return mlir::failure(); }];
let verifier = [{ return verifyOp(this); }];
}
// Terminator Ops // Terminator Ops
def NGReturnOp : NG_Terminator_Op<"return">; def NGReturnOp : NG_Terminator_Op<"return">;
......
...@@ -199,6 +199,7 @@ namespace mlir ...@@ -199,6 +199,7 @@ namespace mlir
} }
Shape getShape() const { return m_shape; } Shape getShape() const { return m_shape; }
int64_t getRank() const { return m_shape.size(); }
EltType getElementType() const { return m_eltType; } EltType getElementType() const { return m_eltType; }
private: private:
NGTensorTypeStorage(EltType eltType, Shape shape) NGTensorTypeStorage(EltType eltType, Shape shape)
......
...@@ -646,6 +646,123 @@ namespace ...@@ -646,6 +646,123 @@ namespace
return matchSuccess(); return matchSuccess();
} }
REWRITER(NGGatherOp)
{
auto gatherOp = cast<NGGatherOp>(op);
auto loc = gatherOp.getLoc();
ScopedContext scope(rewriter, loc);
// Get operands
Value* result = m_pass.buildOutputDefs(op, rewriter)[0];
NGRAPH_CHECK(result, "Unexpected null result in GatherOp");
auto resultTy = result->getType().cast<MemRefType>();
Value* params = operands[0];
Value* indices = operands[1];
auto axis = gatherOp.axis().getSExtValue();
// Create view to write into result.
MemRefView vRes(result), vParams(params), vIndices(indices);
// Indexed Values
IndexedValue iRes(result), iParams(params), iIndices(indices);
// Construct outer loop for params dims. Exclude the axis dim.
SmallVector<ValueHandle, 4> paramsLbs, paramsUbs;
SmallVector<IndexHandle, 4> paramsIVs;
SmallVector<int64_t, 4> paramsSteps;
SmallVector<ValueHandle*, 4> paramsIVPtrs;
for (auto i = 0; i < vParams.rank(); i++)
{
// skip gather axis
if (i == axis)
continue;
paramsLbs.push_back(IndexHandle(vParams.lb(i)));
paramsUbs.push_back(IndexHandle(vParams.ub(i)));
paramsSteps.push_back(vParams.step(i));
}
NGRAPH_CHECK(paramsLbs.size() == vParams.rank() - 1 &&
paramsUbs.size() == paramsLbs.size() &&
paramsSteps.size() == paramsLbs.size(),
"Incorrect loop nest bounds size for gather params");
paramsIVs = IndexHandle::makeIndexHandles(vParams.rank() - 1);
paramsIVPtrs = IndexHandle::makeIndexHandlePointers(paramsIVs);
auto indicesLbs = vIndices.getLbs();
auto indicesUbs = vIndices.getUbs();
auto indicesSteps = vIndices.getSteps();
auto indicesIVs = IndexHandle::makeIndexHandles(vIndices.rank());
auto indicesIVPtrs = IndexHandle::makeIndexHandlePointers(indicesIVs);
SmallVector<IndexHandle, 8> paramsIndices, resIndices;
// Make sure we are going to create loops
NGRAPH_CHECK(vParams.rank() > 0, "Expected params rank > 0 for gather loop generation");
// Let params rank : N
// Let indices rank : M
// Let axis be A
// Generate
// params loops
// for P_0: 0 -> params.dim[0]
// for P_1: 0 -> params.dim[1]
// for P_2: 0 -> params.dim[2]
// ...
// for P_(A-1):0 -> params.dim[A-1]
// for P_(A+1):0 -> params.dim[A+1]
// ...
// for P_(N-1):0 -> params.dim[N-1]
// indices loops
// for I_0:0 -> indices.dim[0]
// ...
// for I_(M-1):0 -> indices.dim[M-1]
// res[P_0, P_1, .. P_(A-1), I_0, .., I_(M-1), P_(A+1), ... P_(N-1)] =
// params[P_0, P_1, .. P_(A-1), indices[I_0, .., I_(M-1)], P_(A+1), ... P_(N-1)];
LoopNestBuilder(paramsIVPtrs, paramsLbs, paramsUbs, paramsSteps)([&] {
LoopNestBuilder(indicesIVPtrs, indicesLbs, indicesUbs, indicesSteps)([&] {
// Load axis value from indices array and cast it to Index Type
ValueHandle axisIdx = ValueHandle::create<IndexCastOp>(
(ValueHandle)iIndices(indicesIVs), rewriter.getIndexType());
// construct indices for param
// [P_0, P_1, .. P_axis-1, Indices[I0, I1, .. I_k-1], P_axis+1, P_axis+2, .. P_n-1]
for (auto i = 0, j = 0; i < vParams.rank(); i++)
{
if (i == axis)
{
paramsIndices.push_back(IndexHandle(axisIdx));
}
else
{
paramsIndices.push_back(paramsIVs[j++]);
}
}
// construct indices for result
// [P_0, P_1, .. P_axis-1, I0, I1, .. I_k-1, P_axis+1, P_axis+2, .. P_n-1]
for (auto i = 0, j = 0; i < vParams.rank() + vIndices.rank() - 1;)
{
if (i == axis && indicesIVs.size() > 0)
{
resIndices.append(indicesIVs.begin(), indicesIVs.end());
i += indicesIVs.size();
}
else
{
resIndices.push_back(paramsIVs[j++]);
i++;
}
}
// Store into result
iRes(resIndices) = iParams(paramsIndices);
});
});
rewriter.replaceOp(op, {result});
return matchSuccess();
}
REWRITER(NGReturnOp) REWRITER(NGReturnOp)
{ {
rewriter.replaceOpWithNewOp<ReturnOp>(op); rewriter.replaceOpWithNewOp<ReturnOp>(op);
...@@ -653,7 +770,7 @@ namespace ...@@ -653,7 +770,7 @@ namespace
} }
#undef REWRITER #undef REWRITER
/// End of pattern matchers
template <typename OP> template <typename OP>
void lower_binary_elementwise(Operation* op, void lower_binary_elementwise(Operation* op,
ArrayRef<Value*> operands, ArrayRef<Value*> operands,
......
...@@ -29,6 +29,7 @@ MLIR_OP(NGArgMinRedOp) ...@@ -29,6 +29,7 @@ MLIR_OP(NGArgMinRedOp)
MLIR_OP(NGConcatOp) MLIR_OP(NGConcatOp)
MLIR_OP(NGDivOp) MLIR_OP(NGDivOp)
MLIR_OP(NGDotOp) MLIR_OP(NGDotOp)
MLIR_OP(NGGatherOp)
MLIR_OP(NGGreaterOp) MLIR_OP(NGGreaterOp)
MLIR_OP(NGLessOp) MLIR_OP(NGLessOp)
MLIR_OP(NGMulOp) MLIR_OP(NGMulOp)
......
...@@ -9,6 +9,7 @@ MLIR_OP(ArgMax) ...@@ -9,6 +9,7 @@ MLIR_OP(ArgMax)
MLIR_OP(Divide) MLIR_OP(Divide)
MLIR_OP(Dot) MLIR_OP(Dot)
MLIR_OP(Concat) MLIR_OP(Concat)
MLIR_OP(Gather)
MLIR_OP(Greater) MLIR_OP(Greater)
MLIR_OP(Less) MLIR_OP(Less)
MLIR_OP(Maximum) MLIR_OP(Maximum)
......
...@@ -25,6 +25,7 @@ ...@@ -25,6 +25,7 @@
#include "ngraph/op/divide.hpp" #include "ngraph/op/divide.hpp"
#include "ngraph/op/dot.hpp" #include "ngraph/op/dot.hpp"
#include "ngraph/op/experimental/compiled_kernel.hpp" #include "ngraph/op/experimental/compiled_kernel.hpp"
#include "ngraph/op/gather.hpp"
#include "ngraph/op/get_output_element.hpp" #include "ngraph/op/get_output_element.hpp"
#include "ngraph/op/greater.hpp" #include "ngraph/op/greater.hpp"
#include "ngraph/op/less.hpp" #include "ngraph/op/less.hpp"
......
...@@ -463,6 +463,8 @@ set (SRC ...@@ -463,6 +463,8 @@ set (SRC
runtime/backend.hpp runtime/backend.hpp
runtime/backend_manager.cpp runtime/backend_manager.cpp
runtime/backend_manager.hpp runtime/backend_manager.hpp
runtime/chrome_trace.cpp
runtime/chrome_trace.hpp
runtime/executable.cpp runtime/executable.cpp
runtime/executable.hpp runtime/executable.hpp
runtime/host_tensor.cpp runtime/host_tensor.cpp
......
...@@ -51,11 +51,6 @@ OutputVector make_zeros(std::shared_ptr<Node> x) ...@@ -51,11 +51,6 @@ OutputVector make_zeros(std::shared_ptr<Node> x)
return zeros; return zeros;
} }
autodiff::Adjoints::Adjoints(const NodeVector& ys, const NodeVector& cs)
: Adjoints(OutputVector(ys.begin(), ys.end()), OutputVector(cs.begin(), cs.end()))
{
}
autodiff::Adjoints::Adjoints(const OutputVector& ys, const OutputVector& cs) autodiff::Adjoints::Adjoints(const OutputVector& ys, const OutputVector& cs)
{ {
if (ys.size() != cs.size()) if (ys.size() != cs.size())
......
...@@ -46,8 +46,6 @@ namespace ngraph ...@@ -46,8 +46,6 @@ namespace ngraph
/// \param c An expression for where to evaluate the derivatives /// \param c An expression for where to evaluate the derivatives
Adjoints(const OutputVector& y, const OutputVector& c); Adjoints(const OutputVector& y, const OutputVector& c);
Adjoints(const NodeVector& y, const NodeVector& c);
Adjoints(const Adjoints& adjoints) = default; Adjoints(const Adjoints& adjoints) = default;
Adjoints& operator=(const Adjoints& adjoints) = default; Adjoints& operator=(const Adjoints& adjoints) = default;
Adjoints() = default; Adjoints() = default;
......
...@@ -41,6 +41,30 @@ Function::Function(const ResultVector& results, ...@@ -41,6 +41,30 @@ Function::Function(const ResultVector& results,
init(); init();
} }
Function::Function(const OutputVector& results,
const ParameterVector& parameters,
const std::string& name)
: m_results(results.size())
, m_parameters(parameters)
, m_temporary_pool_size(0)
, m_instance_id(m_next_instance_id.fetch_add(1))
, m_name(name)
, m_unique_name("Function_" + to_string(m_instance_id))
{
if (std::any_of(results.cbegin(), results.cend(), [](Output<Node> n) {
return std::dynamic_pointer_cast<op::Result>(n.get_node_shared_ptr());
}))
{
throw ngraph_error(
" Results already contain op::Results. Use a c-tor that takes a ResultVector");
}
std::transform(results.begin(), results.end(), m_results.begin(), [](Output<Node> n) {
return std::make_shared<op::Result>(n);
});
init();
}
Function::Function(const NodeVector& results, Function::Function(const NodeVector& results,
const ParameterVector& parameters, const ParameterVector& parameters,
const std::string& name) const std::string& name)
...@@ -208,6 +232,11 @@ shared_ptr<Node> Function::get_output_op(size_t i) const ...@@ -208,6 +232,11 @@ shared_ptr<Node> Function::get_output_op(size_t i) const
return m_results.at(i); return m_results.at(i);
} }
Output<Node> Function::output(size_t i) const
{
return m_results.at(i);
}
shared_ptr<Node> Function::get_result() const shared_ptr<Node> Function::get_result() const
{ {
if (m_results.size() != 1) if (m_results.size() != 1)
......
...@@ -37,6 +37,10 @@ namespace ngraph ...@@ -37,6 +37,10 @@ namespace ngraph
const ParameterVector& parameters, const ParameterVector& parameters,
const std::string& name = ""); const std::string& name = "");
Function(const OutputVector& results,
const ParameterVector& parameters,
const std::string& name = "");
Function(const std::shared_ptr<Node>& result, Function(const std::shared_ptr<Node>& result,
const ParameterVector& parameters, const ParameterVector& parameters,
const std::string& name = ""); const std::string& name = "");
...@@ -55,6 +59,8 @@ namespace ngraph ...@@ -55,6 +59,8 @@ namespace ngraph
/// Return the op that generates output i /// Return the op that generates output i
std::shared_ptr<Node> get_output_op(size_t i) const; std::shared_ptr<Node> get_output_op(size_t i) const;
Output<Node> output(size_t i) const;
/// Return the element type of output i /// Return the element type of output i
const element::Type& get_output_element_type(size_t i) const; const element::Type& get_output_element_type(size_t i) const;
......
//*****************************************************************************
// Copyright 2019 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//*****************************************************************************
#include <iostream>
#include <map>
#include <sstream>
#include <string>
#include "chrome_trace.hpp"
#include "ngraph/log.hpp"
using namespace std;
using namespace ngraph;
static bool read_tracing_env_var()
{
static const bool is_enabled = (getenv("NGRAPH_ENABLE_TRACING") != nullptr);
return is_enabled;
}
mutex runtime::event::Manager::s_file_mutex;
bool runtime::event::Manager::s_tracing_enabled = read_tracing_env_var();
runtime::event::Duration::Duration(const string& name, const string& category, const string& args)
{
if (Manager::is_tracing_enabled())
{
m_start = Manager::get_current_microseconds();
m_stop = 0;
m_name = name;
m_category = category;
m_args = args;
}
}
void runtime::event::Duration::stop()
{
if (Manager::is_tracing_enabled())
{
m_stop = Manager::get_current_microseconds();
}
}
void runtime::event::Duration::write()
{
if (Manager::is_tracing_enabled())
{
size_t stop_time = (m_stop != 0 ? m_stop : Manager::get_current_microseconds());
lock_guard<mutex> lock(Manager::get_mutex());
ofstream& out = runtime::event::Manager::get_output_stream();
if (out.is_open() == false)
{
runtime::event::Manager::open();
}
else
{
Manager::get_output_stream() << ",\n";
}
Manager::get_output_stream() <<
R"({"name":")" << m_name << R"(","cat":")" << m_category << R"(","ph":"X","pid":)"
<< Manager::get_process_id() << R"(,"tid":)"
<< Manager::get_thread_id() <<
R"(,"ts":)" << m_start << R"(,"dur":)" << (stop_time - m_start);
if (!m_args.empty())
{
out <<
R"(,"args":)" << m_args;
}
out << "}";
}
}
runtime::event::Object::Object(const string& name, const string& args)
: m_name{name}
, m_id{static_cast<size_t>(chrono::high_resolution_clock::now().time_since_epoch().count())}
{
if (Manager::is_tracing_enabled())
{
lock_guard<mutex> lock(Manager::get_mutex());
ofstream& out = runtime::event::Manager::get_output_stream();
if (out.is_open() == false)
{
runtime::event::Manager::open();
}
else
{
Manager::get_output_stream() << ",\n";
}
out << R"({"name":")" << m_name << R"(","ph":"N","id":")" << m_id <<
R"(","ts":)" << Manager::get_current_microseconds() <<
R"(,"pid":)" << Manager::get_process_id() << R"(,"tid":)" << Manager::get_thread_id();
if (!args.empty())
{
out <<
R"(,"args":)" << args;
}
out << "}";
write_snapshot(out, args);
}
}
void runtime::event::Object::snapshot(const string& args)
{
if (Manager::is_tracing_enabled())
{
lock_guard<mutex> lock(Manager::get_mutex());
ofstream& out = runtime::event::Manager::get_output_stream();
if (out.is_open() == false)
{
runtime::event::Manager::open();
}
else
{
Manager::get_output_stream() << ",\n";
}
write_snapshot(out, args);
}
}
void runtime::event::Object::write_snapshot(ostream& out, const string& args)
{
out << R"({"name":")" << m_name << R"(","ph":"O","id":")" << m_id <<
R"(","ts":)" << Manager::get_current_microseconds() <<
R"(,"pid":)" << Manager::get_process_id() << R"(,"tid":)" << Manager::get_thread_id();
if (!args.empty())
{
out <<
R"(,"args":)" << args;
}
out << "}";
}
void runtime::event::Object::destroy()
{
if (Manager::is_tracing_enabled())
{
lock_guard<mutex> lock(Manager::get_mutex());
ofstream& out = runtime::event::Manager::get_output_stream();
if (out.is_open() == false)
{
runtime::event::Manager::open();
}
else
{
Manager::get_output_stream() << ",\n";
}
out << R"({"name":")" << m_name << R"(","ph":"D","id":")" << m_id <<
R"(","ts":)" << Manager::get_current_microseconds() <<
R"(,"pid":)" << Manager::get_process_id() << R"(,"tid":)" << Manager::get_thread_id()
<< "}";
}
}
void runtime::event::Manager::open(const string& path)
{
ofstream& out = get_output_stream();
if (out.is_open() == false)
{
out.open(path, ios_base::trunc);
out << "[\n";
}
}
void runtime::event::Manager::close()
{
ofstream& out = get_output_stream();
if (out.is_open())
{
out << "\n]\n";
out.close();
}
}
ofstream& runtime::event::Manager::get_output_stream()
{
static ofstream s_event_log;
return s_event_log;
}
const string& runtime::event::Manager::get_process_id()
{
static const string s_pid = to_string(getpid());
return s_pid;
}
void runtime::event::Manager::enable_event_tracing()
{
s_tracing_enabled = true;
}
void runtime::event::Manager::disable_event_tracing()
{
s_tracing_enabled = false;
}
bool runtime::event::Manager::is_event_tracing_enabled()
{
return s_tracing_enabled;
}
string runtime::event::Manager::get_thread_id()
{
thread::id tid = this_thread::get_id();
static map<thread::id, string> tid_map;
auto it = tid_map.find(tid);
string rc;
if (it == tid_map.end())
{
stringstream ss;
ss << "\"" << tid << "\"";
rc = ss.str();
tid_map.insert({tid, rc});
}
else
{
rc = it->second;
}
return rc;
}
//*****************************************************************************
// Copyright 2019 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//*****************************************************************************
#pragma once
#include <chrono>
#include <fstream>
#include <iostream>
#include <mutex>
#include <string>
#include <thread>
#ifdef _WIN32
#include <windows.h>
// windows.h must be before processthreadsapi.h so we need this comment
#include <processthreadsapi.h>
#define getpid() GetCurrentProcessId()
#else
#include <unistd.h>
#endif
namespace ngraph
{
namespace runtime
{
namespace event
{
class Duration;
class Object;
class Manager;
}
}
}
//
// This class records timestamps for a given user defined event and
// produces output in the chrome tracing format that can be used to view
// the events of a running program
//
// Following is the format of a trace event
//
// {
// "name": "myName",
// "cat": "category,list",
// "ph": "B",
// "ts": 12345,
// "pid": 123,
// "tid": 456,
// "args": {
// "someArg": 1,
// "anotherArg": {
// "value": "my value"
// }
// }
// }
//
// The trace file format is defined here:
// https://docs.google.com/document/d/1CvAClvFfyA5R-PhYUmn5OOQtYMH4h6I0nSsKchNAySU/preview
//
// The trace file can be viewed by Chrome browser using the
// URL: chrome://tracing/
//
// More information about this is at:
// http://dev.chromium.org/developers/how-tos/trace-event-profiling-tool
class ngraph::runtime::event::Manager
{
friend class Duration;
friend class Object;
public:
static void open(const std::string& path = "runtime_event_trace.json");
static void close();
static bool is_tracing_enabled() { return s_tracing_enabled; }
static void enable_event_tracing();
static void disable_event_tracing();
static bool is_event_tracing_enabled();
private:
static std::ofstream& get_output_stream();
static const std::string& get_process_id();
static size_t get_current_microseconds()
{
return std::chrono::high_resolution_clock::now().time_since_epoch().count() / 1000;
}
static std::string get_thread_id();
static std::mutex& get_mutex() { return s_file_mutex; }
static std::ostream s_ostream;
static std::mutex s_file_mutex;
static bool s_tracing_enabled;
};
class ngraph::runtime::event::Duration
{
public:
explicit Duration(const std::string& name,
const std::string& category,
const std::string& args = "");
~Duration() { write(); }
/// \brief stop the timer without writing the data to the log file. To write the data
/// call the `write` method
/// Calls to stop() are optional
void stop();
/// \brief write the log data to the log file for this event
/// This function has an implicit stop() if stop() has not been previously called
void write();
Duration(const Duration&) = delete;
Duration& operator=(Duration const&) = delete;
private:
std::string to_json() const;
size_t m_start;
size_t m_stop;
std::string m_name;
std::string m_category;
std::string m_args;
};
class ngraph::runtime::event::Object
{
public:
Object(const std::string& name, const std::string& args);
void snapshot(const std::string& args);
void destroy();
private:
void write_snapshot(std::ostream& out, const std::string& args);
const std::string m_name;
size_t m_id;
};
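A minimal usage sketch for these tracing classes (the wrapper function below is hypothetical and only illustrative): scope a Duration around the region to time. When it goes out of scope, write() emits one complete ("X") event, provided NGRAPH_ENABLE_TRACING was set when the process started; the resulting JSON can then be viewed in Chrome at chrome://tracing/.

#include "ngraph/runtime/chrome_trace.hpp"

// Hypothetical wrapper, not part of this change.
void run_timed_step()
{
    // Records one complete ("X") event spanning construction to destruction.
    ngraph::runtime::event::Duration d("run_timed_step", "example");
    // ... work being measured ...
}   // ~Duration() calls write(), appending the event to the trace file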
...@@ -18,6 +18,7 @@ ...@@ -18,6 +18,7 @@
#include <memory> #include <memory>
#include "ngraph/descriptor/layout/dense_tensor_layout.hpp" #include "ngraph/descriptor/layout/dense_tensor_layout.hpp"
#include "ngraph/runtime/chrome_trace.hpp"
#include "ngraph/runtime/host_tensor.hpp" #include "ngraph/runtime/host_tensor.hpp"
#include "ngraph/util.hpp" #include "ngraph/util.hpp"
...@@ -96,6 +97,8 @@ const char* runtime::HostTensor::get_data_ptr() const ...@@ -96,6 +97,8 @@ const char* runtime::HostTensor::get_data_ptr() const
void runtime::HostTensor::write(const void* source, size_t n) void runtime::HostTensor::write(const void* source, size_t n)
{ {
runtime::event::Duration d1("write", "HostTensor");
if (n > m_buffer_size) if (n > m_buffer_size)
{ {
throw out_of_range("write access past end of tensor"); throw out_of_range("write access past end of tensor");
...@@ -106,6 +109,7 @@ void runtime::HostTensor::write(const void* source, size_t n) ...@@ -106,6 +109,7 @@ void runtime::HostTensor::write(const void* source, size_t n)
void runtime::HostTensor::read(void* target, size_t n) const void runtime::HostTensor::read(void* target, size_t n) const
{ {
runtime::event::Duration d1("read", "HostTensor");
if (n > m_buffer_size) if (n > m_buffer_size)
{ {
throw out_of_range("read access past end of tensor"); throw out_of_range("read access past end of tensor");
......
...@@ -30,6 +30,7 @@ ...@@ -30,6 +30,7 @@
#include "ngraph/pass/manager.hpp" #include "ngraph/pass/manager.hpp"
#include "ngraph/pass/memory_layout.hpp" #include "ngraph/pass/memory_layout.hpp"
#include "ngraph/runtime/backend_manager.hpp" #include "ngraph/runtime/backend_manager.hpp"
#include "ngraph/runtime/chrome_trace.hpp"
#include "ngraph/serializer.hpp" #include "ngraph/serializer.hpp"
#include "ngraph/util.hpp" #include "ngraph/util.hpp"
...@@ -74,6 +75,8 @@ runtime::interpreter::INTExecutable::INTExecutable(const std::string& model_stri ...@@ -74,6 +75,8 @@ runtime::interpreter::INTExecutable::INTExecutable(const std::string& model_stri
bool runtime::interpreter::INTExecutable::call(const vector<shared_ptr<runtime::Tensor>>& outputs, bool runtime::interpreter::INTExecutable::call(const vector<shared_ptr<runtime::Tensor>>& outputs,
const vector<shared_ptr<runtime::Tensor>>& inputs) const vector<shared_ptr<runtime::Tensor>>& inputs)
{ {
runtime::event::Duration d1("call", "Interpreter");
// convert inputs to HostTensor // convert inputs to HostTensor
vector<shared_ptr<HostTensor>> func_inputs; vector<shared_ptr<HostTensor>> func_inputs;
for (auto tensor : inputs) for (auto tensor : inputs)
...@@ -122,6 +125,7 @@ bool runtime::interpreter::INTExecutable::call(const vector<shared_ptr<runtime:: ...@@ -122,6 +125,7 @@ bool runtime::interpreter::INTExecutable::call(const vector<shared_ptr<runtime::
for (const NodeWrapper& wrapped : m_wrapped_nodes) for (const NodeWrapper& wrapped : m_wrapped_nodes)
{ {
auto op = wrapped.get_node(); auto op = wrapped.get_node();
runtime::event::Duration d2(op->description(), "Interpreter");
auto type_id = wrapped.get_typeid(); auto type_id = wrapped.get_typeid();
if (type_id == OP_TYPEID::Parameter) if (type_id == OP_TYPEID::Parameter)
{ {
...@@ -304,3 +308,74 @@ void runtime::interpreter::INTExecutable::save(ostream& out) ...@@ -304,3 +308,74 @@ void runtime::interpreter::INTExecutable::save(ostream& out)
string model = serialize(m_function, 0); string model = serialize(m_function, 0);
writer.write("model", model.data(), model.size()); writer.write("model", model.data(), model.size());
} }
shared_ptr<ngraph::op::Parameter>
runtime::interpreter::INTExecutable::get_parameter(size_t index) const
{
const ParameterVector& parameters = get_parameters();
NGRAPH_CHECK(index < parameters.size(), "create_tensor for input out of bounds");
return parameters[index];
}
shared_ptr<ngraph::op::Result> runtime::interpreter::INTExecutable::get_result(size_t index) const
{
const ResultVector& results = get_results();
NGRAPH_CHECK(index < results.size(), "create_tensor for output out of bounds");
return results[index];
}
shared_ptr<runtime::Tensor>
runtime::interpreter::INTExecutable::create_input_tensor(size_t input_index)
{
shared_ptr<op::Parameter> parameter = get_parameter(input_index);
return make_shared<runtime::HostTensor>(parameter->get_element_type(), parameter->get_shape());
}
shared_ptr<runtime::Tensor>
runtime::interpreter::INTExecutable::create_output_tensor(size_t output_index)
{
shared_ptr<op::Result> result = get_result(output_index);
return make_shared<runtime::HostTensor>(result->get_element_type(), result->get_shape());
}
vector<shared_ptr<runtime::Tensor>>
runtime::interpreter::INTExecutable::create_input_tensor(size_t input_index,
size_t pipeline_depth)
{
vector<shared_ptr<runtime::HostTensor>> tensors;
shared_ptr<op::Parameter> parameter = get_parameter(input_index);
for (size_t i = 0; i < pipeline_depth; i++)
{
shared_ptr<runtime::HostTensor> tensor;
auto t =
make_shared<runtime::HostTensor>(parameter->get_element_type(), parameter->get_shape());
tensor = static_pointer_cast<runtime::HostTensor>(t);
tensors.push_back(tensor);
}
vector<shared_ptr<runtime::Tensor>> result_tensors;
for (const shared_ptr<runtime::HostTensor>& tensor : tensors)
{
result_tensors.push_back(tensor);
}
return result_tensors;
}
vector<shared_ptr<runtime::Tensor>>
runtime::interpreter::INTExecutable::create_output_tensor(size_t output_index,
size_t pipeline_depth)
{
vector<shared_ptr<runtime::HostTensor>> tensors;
shared_ptr<op::Result> result = get_result(output_index);
for (size_t i = 0; i < pipeline_depth; i++)
{
shared_ptr<runtime::HostTensor> tensor;
auto t = make_shared<runtime::HostTensor>(result->get_element_type(), result->get_shape());
tensor = static_pointer_cast<runtime::HostTensor>(t);
tensors.push_back(tensor);
}
vector<shared_ptr<runtime::Tensor>> result_tensors;
for (const shared_ptr<runtime::HostTensor>& tensor : tensors)
{
result_tensors.push_back(tensor);
}
return result_tensors;
}
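A hypothetical caller-side sketch of the new pipelined tensor creation (the helper below and the depth of 2 are illustrative, not part of this change): each call returns pipeline_depth tensors for the given input or output index, so one slot can be filled or read on the host while another slot is used by a call() in flight.

void run_double_buffered(std::shared_ptr<ngraph::runtime::Executable> exec)
{
    auto in_bufs = exec->create_input_tensor(0, 2);   // two tensors for input 0
    auto out_bufs = exec->create_output_tensor(0, 2); // two tensors for output 0
    // ... fill in_bufs[0] on the host ...
    exec->call({out_bufs[0]}, {in_bufs[0]});          // execute slot 0
    // ... fill in_bufs[1] while out_bufs[0] is consumed, then run slot 1,
    //     alternating between the two slots thereafter ...
}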
...@@ -186,9 +186,21 @@ public: ...@@ -186,9 +186,21 @@ public:
std::vector<PerformanceCounter> get_performance_data() const override; std::vector<PerformanceCounter> get_performance_data() const override;
std::shared_ptr<runtime::Tensor> create_input_tensor(size_t input_index) override;
std::shared_ptr<runtime::Tensor> create_output_tensor(size_t output_index) override;
std::vector<std::shared_ptr<runtime::Tensor>>
create_input_tensor(size_t input_index, size_t pipeline_depth) override;
std::vector<std::shared_ptr<runtime::Tensor>>
create_output_tensor(size_t output_index, size_t pipeline_depth) override;
private: private:
INTExecutable(const std::string& model_string); INTExecutable(const std::string& model_string);
std::shared_ptr<ngraph::op::Parameter> get_parameter(size_t index) const;
std::shared_ptr<ngraph::op::Result> get_result(size_t index) const;
int get_alignment() const { return 64; } int get_alignment() const { return 64; }
bool m_is_compiled = false; bool m_is_compiled = false;
bool m_nan_check_enabled = false; bool m_nan_check_enabled = false;
......
...@@ -17,6 +17,8 @@ ...@@ -17,6 +17,8 @@
set (SRC set (SRC
nbench.cpp nbench.cpp
benchmark.cpp benchmark.cpp
benchmark_pipelined.cpp
benchmark_utils.cpp
) )
add_executable(nbench ${SRC}) add_executable(nbench ${SRC})
......
...@@ -14,12 +14,8 @@ ...@@ -14,12 +14,8 @@
// limitations under the License. // limitations under the License.
//***************************************************************************** //*****************************************************************************
#include <random>
#if defined(__x86_64__) || defined(__amd64__)
#include <xmmintrin.h>
#endif
#include "benchmark.hpp" #include "benchmark.hpp"
#include "benchmark_utils.hpp"
#include "ngraph/file_util.hpp" #include "ngraph/file_util.hpp"
#include "ngraph/runtime/backend.hpp" #include "ngraph/runtime/backend.hpp"
#include "ngraph/runtime/host_tensor.hpp" #include "ngraph/runtime/host_tensor.hpp"
...@@ -30,106 +26,6 @@ ...@@ -30,106 +26,6 @@
using namespace std; using namespace std;
using namespace ngraph; using namespace ngraph;
static default_random_engine s_random_engine;
void set_denormals_flush_to_zero()
{
#if defined(__x86_64__) || defined(__amd64__)
// Avoids perf impact from denormals while benchmarking with random data
_MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);
_MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON);
#endif
}
template <typename T>
void init_int_tv(shared_ptr<runtime::Tensor> tv, T min, T max)
{
size_t size = tv->get_element_count();
uniform_int_distribution<T> dist(min, max);
vector<T> vec(size);
for (T& element : vec)
{
element = dist(s_random_engine);
}
tv->write(vec.data(), vec.size() * sizeof(T));
}
template <>
void init_int_tv<char>(shared_ptr<runtime::Tensor> tv, char min, char max)
{
size_t size = tv->get_element_count();
uniform_int_distribution<int16_t> dist(static_cast<short>(min), static_cast<short>(max));
vector<char> vec(size);
for (char& element : vec)
{
element = static_cast<char>(dist(s_random_engine));
}
tv->write(vec.data(), vec.size() * sizeof(char));
}
template <>
void init_int_tv<int8_t>(shared_ptr<runtime::Tensor> tv, int8_t min, int8_t max)
{
size_t size = tv->get_element_count();
uniform_int_distribution<int16_t> dist(static_cast<short>(min), static_cast<short>(max));
vector<int8_t> vec(size);
for (int8_t& element : vec)
{
element = static_cast<int8_t>(dist(s_random_engine));
}
tv->write(vec.data(), vec.size() * sizeof(int8_t));
}
template <>
void init_int_tv<uint8_t>(shared_ptr<runtime::Tensor> tv, uint8_t min, uint8_t max)
{
size_t size = tv->get_element_count();
uniform_int_distribution<int16_t> dist(static_cast<short>(min), static_cast<short>(max));
vector<uint8_t> vec(size);
for (uint8_t& element : vec)
{
element = static_cast<uint8_t>(dist(s_random_engine));
}
tv->write(vec.data(), vec.size() * sizeof(uint8_t));
}
template <typename T>
void init_real_tv(shared_ptr<runtime::Tensor> tv, T min, T max)
{
size_t size = tv->get_element_count();
uniform_real_distribution<T> dist(min, max);
vector<T> vec(size);
for (T& element : vec)
{
element = dist(s_random_engine);
}
tv->write(vec.data(), vec.size() * sizeof(T));
}
static void random_init(shared_ptr<runtime::Tensor> tv)
{
element::Type et = tv->get_element_type();
switch (et.get_type_enum())
{
case element::Type_t::boolean: init_int_tv<char>(tv, 0, 1); break;
case element::Type_t::f32: init_real_tv<float>(tv, -1, 1); break;
case element::Type_t::f64: init_real_tv<double>(tv, -1, 1); break;
case element::Type_t::i8: init_int_tv<int8_t>(tv, -1, 1); break;
case element::Type_t::i16: init_int_tv<int16_t>(tv, -1, 1); break;
case element::Type_t::i32: init_int_tv<int32_t>(tv, 0, 1); break;
case element::Type_t::i64: init_int_tv<int64_t>(tv, 0, 1); break;
case element::Type_t::u8: init_int_tv<uint8_t>(tv, 0, 1); break;
case element::Type_t::u16: init_int_tv<uint16_t>(tv, 0, 1); break;
case element::Type_t::u32: init_int_tv<uint32_t>(tv, 0, 1); break;
case element::Type_t::u64: init_int_tv<uint64_t>(tv, 0, 1); break;
case element::Type_t::undefined:
case element::Type_t::dynamic:
case element::Type_t::bf16:
case element::Type_t::f16:
default: throw runtime_error("unsupported type");
}
}
vector<runtime::PerformanceCounter> run_benchmark(shared_ptr<Function> f, vector<runtime::PerformanceCounter> run_benchmark(shared_ptr<Function> f,
const string& backend_name, const string& backend_name,
size_t iterations, size_t iterations,
...@@ -140,7 +36,7 @@ vector<runtime::PerformanceCounter> run_benchmark(shared_ptr<Function> f, ...@@ -140,7 +36,7 @@ vector<runtime::PerformanceCounter> run_benchmark(shared_ptr<Function> f,
stopwatch timer; stopwatch timer;
timer.start(); timer.start();
auto backend = runtime::Backend::create(backend_name); auto backend = runtime::Backend::create(backend_name);
auto compiled_func = backend->compile(f, timing_detail); auto exec = backend->compile(f, timing_detail);
timer.stop(); timer.stop();
cout.imbue(locale("")); cout.imbue(locale(""));
cout << "compile time: " << timer.get_milliseconds() << "ms" << endl; cout << "compile time: " << timer.get_milliseconds() << "ms" << endl;
...@@ -201,7 +97,7 @@ vector<runtime::PerformanceCounter> run_benchmark(shared_ptr<Function> f, ...@@ -201,7 +97,7 @@ vector<runtime::PerformanceCounter> run_benchmark(shared_ptr<Function> f,
} }
} }
} }
compiled_func->call(results, args); exec->call(results, args);
if (copy_data) if (copy_data)
{ {
for (size_t result_index = 0; result_index < results.size(); result_index++) for (size_t result_index = 0; result_index < results.size(); result_index++)
...@@ -217,6 +113,6 @@ vector<runtime::PerformanceCounter> run_benchmark(shared_ptr<Function> f, ...@@ -217,6 +113,6 @@ vector<runtime::PerformanceCounter> run_benchmark(shared_ptr<Function> f,
float time = t1.get_milliseconds(); float time = t1.get_milliseconds();
cout << time / iterations << "ms per iteration" << endl; cout << time / iterations << "ms per iteration" << endl;
vector<runtime::PerformanceCounter> perf_data = compiled_func->get_performance_data(); vector<runtime::PerformanceCounter> perf_data = exec->get_performance_data();
return perf_data; return perf_data;
} }
...@@ -24,10 +24,6 @@ ...@@ -24,10 +24,6 @@
#include "ngraph/function.hpp" #include "ngraph/function.hpp"
#include "ngraph/runtime/performance_counter.hpp" #include "ngraph/runtime/performance_counter.hpp"
/// performance test utilities
std::multimap<size_t, std::string>
aggregate_timing(const std::vector<ngraph::runtime::PerformanceCounter>& perf_data);
std::vector<ngraph::runtime::PerformanceCounter> run_benchmark(std::shared_ptr<ngraph::Function> f, std::vector<ngraph::runtime::PerformanceCounter> run_benchmark(std::shared_ptr<ngraph::Function> f,
const std::string& backend_name, const std::string& backend_name,
size_t iterations, size_t iterations,
......
//*****************************************************************************
// Copyright 2017-2019 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//*****************************************************************************
#include <array>
#include <condition_variable>
#include <mutex>
#include <thread>
#include "benchmark.hpp"
#include "benchmark_utils.hpp"
#include "ngraph/file_util.hpp"
#include "ngraph/runtime/backend.hpp"
#include "ngraph/runtime/host_tensor.hpp"
#include "ngraph/runtime/tensor.hpp"
#include "ngraph/serializer.hpp"
#include "ngraph/util.hpp"
using namespace std;
using namespace ngraph;
class TensorCollection
{
public:
vector<shared_ptr<runtime::HostTensor>> parameter_data;
vector<shared_ptr<runtime::HostTensor>> result_data;
vector<shared_ptr<runtime::Tensor>> input_tensors;
vector<shared_ptr<runtime::Tensor>> output_tensors;
};
static mutex s_mutex;
static condition_variable s_condition;
static size_t current_iteration = 0;
static size_t s_iterations;
static size_t s_warmup_iterations;
static stopwatch s_timer;
static void
thread_entry(runtime::Executable* exec, const TensorCollection& tensors, size_t pipeline_stage)
{
bool data_written = false;
const vector<shared_ptr<runtime::Tensor>>& args = tensors.input_tensors;
const vector<shared_ptr<runtime::Tensor>>& results = tensors.output_tensors;
while (current_iteration < s_iterations + s_warmup_iterations)
{
if (!data_written)
{
for (size_t arg_index = 0; arg_index < args.size(); arg_index++)
{
const shared_ptr<runtime::Tensor>& arg = args[arg_index];
if (arg->get_stale())
{
const shared_ptr<runtime::HostTensor>& data = tensors.parameter_data[arg_index];
arg->write(data->get_data_ptr(),
data->get_element_count() * data->get_element_type().size());
}
}
data_written = true;
}
unique_lock<mutex> lock(s_mutex);
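// Even iterations belong to pipeline stage 0, odd iterations to stage 1; wait until it is our turn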
if ((current_iteration & 1) != pipeline_stage)
{
s_condition.wait(lock);
}
else
{
if (current_iteration == s_warmup_iterations)
{
s_timer.start();
}
// our turn to run
exec->call(results, args);
current_iteration++;
data_written = false;
s_condition.notify_all();
lock.unlock();
for (size_t result_index = 0; result_index < results.size(); result_index++)
{
const shared_ptr<runtime::HostTensor>& data = tensors.result_data[result_index];
const shared_ptr<runtime::Tensor>& result = results[result_index];
result->read(data->get_data_ptr(),
data->get_element_count() * data->get_element_type().size());
}
}
}
}
vector<runtime::PerformanceCounter> run_benchmark_pipelined(shared_ptr<Function> f,
const string& backend_name,
size_t iterations,
bool timing_detail,
int warmup_iterations,
bool copy_data)
{
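// Double buffering: two tensor sets and two threads alternate iterations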
constexpr size_t pipeline_depth = 2;
s_iterations = iterations;
s_warmup_iterations = warmup_iterations;
array<TensorCollection, pipeline_depth> tensor_collections;
stopwatch timer;
timer.start();
auto backend = runtime::Backend::create(backend_name);
auto exec = backend->compile(f, timing_detail);
timer.stop();
cout.imbue(locale(""));
cout << "compile time: " << timer.get_milliseconds() << "ms" << endl;
set_denormals_flush_to_zero();
// Create random input data for all input tensors
for (size_t i = 0; i < pipeline_depth; i++)
{
for (shared_ptr<op::Parameter> param : f->get_parameters())
{
auto tensor_data =
make_shared<runtime::HostTensor>(param->get_element_type(), param->get_shape());
random_init(tensor_data);
tensor_collections[i].parameter_data.push_back(tensor_data);
}
}
// Create output tensors for all outputs
for (size_t i = 0; i < pipeline_depth; i++)
{
for (shared_ptr<Node> result : f->get_results())
{
auto tensor_data =
make_shared<runtime::HostTensor>(result->get_element_type(), result->get_shape());
tensor_collections[i].result_data.push_back(tensor_data);
}
}
// Create input tensors for all Parameters
array<vector<shared_ptr<runtime::Tensor>>, pipeline_depth> input_tensors_array;
size_t input_index = 0;
for (shared_ptr<op::Parameter> param : f->get_parameters())
{
auto input_tensors = exec->create_input_tensor(input_index++, pipeline_depth);
for (size_t i = 0; i < pipeline_depth; i++)
{
tensor_collections[i].input_tensors.push_back(input_tensors[i]);
}
}
// Create output tensors for all Results
array<vector<shared_ptr<runtime::Tensor>>, pipeline_depth> output_tensors_array;
size_t output_index = 0;
for (shared_ptr<Node> result : f->get_results())
{
auto output_tensors = exec->create_output_tensor(output_index++, pipeline_depth);
for (size_t i = 0; i < pipeline_depth; i++)
{
tensor_collections[i].output_tensors.push_back(output_tensors[i]);
}
}
thread threads[pipeline_depth];
for (size_t i = 0; i < pipeline_depth; i++)
{
threads[i] = thread(thread_entry, exec.get(), tensor_collections[i], i);
}
for (size_t i = 0; i < pipeline_depth; i++)
{
threads[i].join();
}
s_timer.stop();
float time = s_timer.get_milliseconds();
cout << time / iterations << "ms per iteration" << endl;
vector<runtime::PerformanceCounter> perf_data = exec->get_performance_data();
return perf_data;
}
//*****************************************************************************
// Copyright 2017-2019 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//*****************************************************************************
#pragma once
#include <map>
#include <memory>
#include <string>
#include <vector>
#include "ngraph/function.hpp"
#include "ngraph/runtime/performance_counter.hpp"
std::vector<ngraph::runtime::PerformanceCounter>
run_benchmark_pipelined(std::shared_ptr<ngraph::Function> f,
const std::string& backend_name,
size_t iterations,
bool timing_detail,
int warmup_iterations,
bool copy_data);
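A minimal call sketch for the declaration above, assuming model_json already holds a serialized nGraph function (the backend name and iteration counts are placeholders):
// Sketch only
std::shared_ptr<ngraph::Function> f = ngraph::deserialize(model_json);
std::vector<ngraph::runtime::PerformanceCounter> perf =
    run_benchmark_pipelined(f, "INTERPRETER", 10 /*iterations*/, false /*timing_detail*/,
                            1 /*warmup_iterations*/, true /*copy_data*/);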
//*****************************************************************************
// Copyright 2017-2019 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//*****************************************************************************
#if defined(__x86_64__) || defined(__amd64__)
#include <xmmintrin.h>
#endif
#include "benchmark_utils.hpp"
#include "ngraph/file_util.hpp"
#include "ngraph/runtime/backend.hpp"
#include "ngraph/runtime/host_tensor.hpp"
#include "ngraph/runtime/tensor.hpp"
#include "ngraph/serializer.hpp"
#include "ngraph/util.hpp"
using namespace std;
using namespace ngraph;
template <>
void init_int_tensor<char>(shared_ptr<runtime::Tensor> tensor, char min, char max)
{
size_t size = tensor->get_element_count();
uniform_int_distribution<int16_t> dist(static_cast<short>(min), static_cast<short>(max));
vector<char> vec(size);
for (char& element : vec)
{
element = static_cast<char>(dist(get_random_engine()));
}
tensor->write(vec.data(), vec.size() * sizeof(char));
}
template <>
void init_int_tensor<int8_t>(shared_ptr<runtime::Tensor> tensor, int8_t min, int8_t max)
{
size_t size = tensor->get_element_count();
uniform_int_distribution<int16_t> dist(static_cast<short>(min), static_cast<short>(max));
vector<int8_t> vec(size);
for (int8_t& element : vec)
{
element = static_cast<int8_t>(dist(get_random_engine()));
}
tensor->write(vec.data(), vec.size() * sizeof(int8_t));
}
template <>
void init_int_tensor<uint8_t>(shared_ptr<runtime::Tensor> tensor, uint8_t min, uint8_t max)
{
size_t size = tensor->get_element_count();
uniform_int_distribution<int16_t> dist(static_cast<short>(min), static_cast<short>(max));
vector<uint8_t> vec(size);
for (uint8_t& element : vec)
{
element = static_cast<uint8_t>(dist(get_random_engine()));
}
tensor->write(vec.data(), vec.size() * sizeof(uint8_t));
}
void set_denormals_flush_to_zero()
{
#if defined(__x86_64__) || defined(__amd64__)
// Avoids perf impact from denormals while benchmarking with random data
_MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);
_MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON);
#endif
}
void random_init(shared_ptr<runtime::Tensor> tensor)
{
element::Type et = tensor->get_element_type();
#if !(defined(__GNUC__) && (__GNUC__ == 4 && __GNUC_MINOR__ == 8))
#pragma GCC diagnostic push
#pragma GCC diagnostic error "-Wswitch"
#pragma GCC diagnostic error "-Wswitch-enum"
#endif
switch (et.get_type_enum())
{
case element::Type_t::boolean: init_int_tensor<char>(tensor, 0, 1); break;
case element::Type_t::f32: init_real_tensor<float>(tensor, -1, 1); break;
case element::Type_t::f64: init_real_tensor<double>(tensor, -1, 1); break;
case element::Type_t::i8: init_int_tensor<int8_t>(tensor, -1, 1); break;
case element::Type_t::i16: init_int_tensor<int16_t>(tensor, -1, 1); break;
case element::Type_t::i32: init_int_tensor<int32_t>(tensor, 0, 1); break;
case element::Type_t::i64: init_int_tensor<int64_t>(tensor, 0, 1); break;
case element::Type_t::u8: init_int_tensor<uint8_t>(tensor, 0, 1); break;
case element::Type_t::u16: init_int_tensor<uint16_t>(tensor, 0, 1); break;
case element::Type_t::u32: init_int_tensor<uint32_t>(tensor, 0, 1); break;
case element::Type_t::u64: init_int_tensor<uint64_t>(tensor, 0, 1); break;
case element::Type_t::undefined:
case element::Type_t::dynamic:
case element::Type_t::bf16:
case element::Type_t::f16:
default: throw runtime_error("unsupported type");
}
#if !(defined(__GNUC__) && (__GNUC__ == 4 && __GNUC_MINOR__ == 8))
#pragma GCC diagnostic pop
#endif
}
default_random_engine& get_random_engine()
{
static std::default_random_engine s_random_engine;
return s_random_engine;
}
//*****************************************************************************
// Copyright 2017-2019 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//*****************************************************************************
#include <random>
#include "benchmark.hpp"
#include "ngraph/file_util.hpp"
#include "ngraph/runtime/backend.hpp"
#include "ngraph/runtime/host_tensor.hpp"
#include "ngraph/runtime/tensor.hpp"
#include "ngraph/serializer.hpp"
#include "ngraph/util.hpp"
void set_denormals_flush_to_zero();
void random_init(std::shared_ptr<ngraph::runtime::Tensor> tensor);
std::default_random_engine& get_random_engine();
template <typename T>
void init_int_tensor(std::shared_ptr<ngraph::runtime::Tensor> tensor, T min, T max)
{
size_t size = tensor->get_element_count();
std::uniform_int_distribution<T> dist(min, max);
std::vector<T> vec(size);
for (T& element : vec)
{
element = dist(get_random_engine());
}
tensor->write(vec.data(), vec.size() * sizeof(T));
}
template <typename T>
void init_real_tensor(std::shared_ptr<ngraph::runtime::Tensor> tensor, T min, T max)
{
size_t size = tensor->get_element_count();
std::uniform_real_distribution<T> dist(min, max);
std::vector<T> vec(size);
for (T& element : vec)
{
element = dist(get_random_engine());
}
tensor->write(vec.data(), vec.size() * sizeof(T));
}
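As a usage sketch for these helpers (element types and shapes are arbitrary examples):
// Sketch: fill host tensors with random data appropriate to their element types
set_denormals_flush_to_zero();
auto t_f32 =
    std::make_shared<ngraph::runtime::HostTensor>(ngraph::element::f32, ngraph::Shape{2, 3});
random_init(t_f32); // dispatches to init_real_tensor<float>
auto t_i32 =
    std::make_shared<ngraph::runtime::HostTensor>(ngraph::element::i32, ngraph::Shape{2, 3});
init_int_tensor<int32_t>(t_i32, 0, 9); // explicit integer range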
...@@ -24,6 +24,7 @@ ...@@ -24,6 +24,7 @@
#include <iomanip> #include <iomanip>
#include "benchmark.hpp" #include "benchmark.hpp"
#include "benchmark_pipelined.hpp"
#include "ngraph/distributed.hpp" #include "ngraph/distributed.hpp"
#include "ngraph/except.hpp" #include "ngraph/except.hpp"
#include "ngraph/file_util.hpp" #include "ngraph/file_util.hpp"
...@@ -181,6 +182,7 @@ int main(int argc, char** argv) ...@@ -181,6 +182,7 @@ int main(int argc, char** argv)
int warmup_iterations = 1; int warmup_iterations = 1;
bool copy_data = true; bool copy_data = true;
bool dot_file = false; bool dot_file = false;
bool double_buffer = false;
for (size_t i = 1; i < argc; i++) for (size_t i = 1; i < argc; i++)
{ {
...@@ -229,6 +231,10 @@ int main(int argc, char** argv) ...@@ -229,6 +231,10 @@ int main(int argc, char** argv)
{ {
directory = argv[++i]; directory = argv[++i];
} }
else if (arg == "--double_buffer")
{
double_buffer = true;
}
else if (arg == "-w" || arg == "--warmup_iterations") else if (arg == "-w" || arg == "--warmup_iterations")
{ {
try try
...@@ -283,6 +289,7 @@ OPTIONS ...@@ -283,6 +289,7 @@ OPTIONS
-w|--warmup_iterations Number of warm-up iterations -w|--warmup_iterations Number of warm-up iterations
--no_copy_data Disable copy of input/result data every iteration --no_copy_data Disable copy of input/result data every iteration
--dot Generate Graphviz dot file --dot Generate Graphviz dot file
--double_buffer Double buffer inputs and outputs
)###"; )###";
return 1; return 1;
} }
...@@ -420,8 +427,17 @@ OPTIONS ...@@ -420,8 +427,17 @@ OPTIONS
{ {
cout << "\n---- Benchmark ----\n"; cout << "\n---- Benchmark ----\n";
shared_ptr<Function> f = deserialize(model); shared_ptr<Function> f = deserialize(model);
auto perf_data = run_benchmark( vector<runtime::PerformanceCounter> perf_data;
f, backend, iterations, timing_detail, warmup_iterations, copy_data); if (double_buffer)
{
perf_data = run_benchmark_pipelined(
f, backend, iterations, timing_detail, warmup_iterations, copy_data);
}
else
{
perf_data = run_benchmark(
f, backend, iterations, timing_detail, warmup_iterations, copy_data);
}
auto perf_shape = to_perf_shape(f, perf_data); auto perf_shape = to_perf_shape(f, perf_data);
aggregate_perf_data.insert( aggregate_perf_data.insert(
aggregate_perf_data.end(), perf_shape.begin(), perf_shape.end()); aggregate_perf_data.end(), perf_shape.begin(), perf_shape.end());
......
...@@ -733,8 +733,8 @@ NGRAPH_TEST(${BACKEND_NAME}, batch_norm_bprop_n4c3h2w2) ...@@ -733,8 +733,8 @@ NGRAPH_TEST(${BACKEND_NAME}, batch_norm_bprop_n4c3h2w2)
auto C = std::make_shared<op::Parameter>(element::f32, shape_r); auto C = std::make_shared<op::Parameter>(element::f32, shape_r);
auto zero = ngraph::make_zero(bn_dgamma->get_element_type(), bn_dgamma->get_shape()); auto zero = ngraph::make_zero(bn_dgamma->get_element_type(), bn_dgamma->get_shape());
ngraph::autodiff::Adjoints adjoints(NodeVector{bn_dx, bn_dgamma, bn_dbeta}, ngraph::autodiff::Adjoints adjoints(OutputVector{bn_dx, bn_dgamma, bn_dbeta},
NodeVector{C, zero, zero}); OutputVector{C, zero, zero});
auto dinput = adjoints.backprop_node(input); auto dinput = adjoints.backprop_node(input);
auto dgamma = adjoints.backprop_node(gamma); auto dgamma = adjoints.backprop_node(gamma);
......
...@@ -257,10 +257,10 @@ NGRAPH_TEST(${BACKEND_NAME}, divide_adjoint_stability) ...@@ -257,10 +257,10 @@ NGRAPH_TEST(${BACKEND_NAME}, divide_adjoint_stability)
auto B = make_shared<op::Parameter>(element::f32, shape); auto B = make_shared<op::Parameter>(element::f32, shape);
auto f = make_shared<Function>(make_shared<op::Divide>(A, B), ParameterVector{A, B}); auto f = make_shared<Function>(make_shared<op::Divide>(A, B), ParameterVector{A, B});
auto Y_out = f->get_output_op(0); auto Y_out = f->output(0);
auto Xs = f->get_parameters(); auto Xs = f->get_parameters();
auto C = std::make_shared<op::Parameter>(Y_out->get_element_type(), Y_out->get_shape()); auto C = std::make_shared<op::Parameter>(Y_out.get_element_type(), Y_out.get_shape());
ngraph::autodiff::Adjoints adjoints(NodeVector{Y_out}, NodeVector{C}); ngraph::autodiff::Adjoints adjoints(OutputVector{Y_out}, OutputVector{C});
std::vector<std::shared_ptr<Node>> dYdXs(Xs.size()); std::vector<std::shared_ptr<Node>> dYdXs(Xs.size());
transform( transform(
Xs.begin(), Xs.end(), dYdXs.begin(), [C, &adjoints](const std::shared_ptr<Node>& X) { Xs.begin(), Xs.end(), dYdXs.begin(), [C, &adjoints](const std::shared_ptr<Node>& X) {
......
...@@ -507,7 +507,8 @@ TEST(cpu_fusion, conv_bias_bprop_n1c1h3w3) ...@@ -507,7 +507,8 @@ TEST(cpu_fusion, conv_bias_bprop_n1c1h3w3)
auto f = make_shared<Function>( auto f = make_shared<Function>(
convolution_bias, ParameterVector{conv_test.data, conv_test.weights, conv_test.bias}); convolution_bias, ParameterVector{conv_test.data, conv_test.weights, conv_test.bias});
ngraph::autodiff::Adjoints adjoints(NodeVector{convolution_bias}, NodeVector{conv_test.delta}); ngraph::autodiff::Adjoints adjoints(OutputVector{convolution_bias},
OutputVector{conv_test.delta});
auto d_data = adjoints.backprop_node(conv_test.data); auto d_data = adjoints.backprop_node(conv_test.data);
auto d_weights = adjoints.backprop_node(conv_test.weights); auto d_weights = adjoints.backprop_node(conv_test.weights);
...@@ -546,7 +547,7 @@ TEST(cpu_fusion, conv_bias_bprop) ...@@ -546,7 +547,7 @@ TEST(cpu_fusion, conv_bias_bprop)
pass_manager.register_pass<pass::VisualizeTree>("conv_bias_bprop_fusion.png"); pass_manager.register_pass<pass::VisualizeTree>("conv_bias_bprop_fusion.png");
auto f = make_shared<Function>(conv_bias, ParameterVector{data_batch, filters, bias}); auto f = make_shared<Function>(conv_bias, ParameterVector{data_batch, filters, bias});
ngraph::autodiff::Adjoints adjoints(NodeVector{conv_bias}, NodeVector{delta}); ngraph::autodiff::Adjoints adjoints(OutputVector{conv_bias}, OutputVector{delta});
auto d_data = adjoints.backprop_node(data_batch); auto d_data = adjoints.backprop_node(data_batch);
auto d_weights = adjoints.backprop_node(filters); auto d_weights = adjoints.backprop_node(filters);
...@@ -1452,7 +1453,7 @@ TEST(cpu_fusion, max_pool_with_indices) ...@@ -1452,7 +1453,7 @@ TEST(cpu_fusion, max_pool_with_indices)
auto max_pool = std::make_shared<op::MaxPool>(input, window_shape); auto max_pool = std::make_shared<op::MaxPool>(input, window_shape);
auto C = std::make_shared<op::Parameter>(element::f32, max_pool->get_shape()); auto C = std::make_shared<op::Parameter>(element::f32, max_pool->get_shape());
ngraph::autodiff::Adjoints adjoints(NodeVector{max_pool}, NodeVector{C}); ngraph::autodiff::Adjoints adjoints(ngraph::OutputVector{max_pool}, ngraph::OutputVector{C});
auto dinput = adjoints.backprop_node(input); auto dinput = adjoints.backprop_node(input);
...@@ -1789,14 +1790,14 @@ static std::shared_ptr<ngraph::Function> make_forward_function() ...@@ -1789,14 +1790,14 @@ static std::shared_ptr<ngraph::Function> make_forward_function()
return std::make_shared<Function>(NodeVector{max_pool, neg, absn}, ParameterVector{input}); return std::make_shared<Function>(NodeVector{max_pool, neg, absn}, ParameterVector{input});
} }
static std::pair<std::shared_ptr<ngraph::Function>, std::vector<std::shared_ptr<ngraph::Node>>> static std::pair<std::shared_ptr<ngraph::Function>, OutputVector>
make_backward_function(std::shared_ptr<ngraph::Function> f) make_backward_function(std::shared_ptr<ngraph::Function> f)
{ {
// get parameters // get parameters
std::vector<std::shared_ptr<ngraph::op::Parameter>> back_parameters = f->get_parameters(); std::vector<std::shared_ptr<ngraph::op::Parameter>> back_parameters = f->get_parameters();
ngraph::NodeVector adjoints; ngraph::OutputVector adjoints;
ngraph::NodeVector outputs; ngraph::OutputVector outputs;
for (auto Y : f->get_results()) for (auto Y : f->get_results())
{ {
// Get the output // Get the output
...@@ -1809,7 +1810,7 @@ static std::pair<std::shared_ptr<ngraph::Function>, std::vector<std::shared_ptr< ...@@ -1809,7 +1810,7 @@ static std::pair<std::shared_ptr<ngraph::Function>, std::vector<std::shared_ptr<
ngraph::autodiff::Adjoints adjoint{outputs, adjoints}; ngraph::autodiff::Adjoints adjoint{outputs, adjoints};
// Perform autodiff // Perform autodiff
std::vector<std::shared_ptr<Node>> dYdXs(back_parameters.size()); OutputVector dYdXs(back_parameters.size());
transform(back_parameters.begin(), transform(back_parameters.begin(),
back_parameters.end(), back_parameters.end(),
dYdXs.begin(), dYdXs.begin(),
...@@ -1818,7 +1819,8 @@ static std::pair<std::shared_ptr<ngraph::Function>, std::vector<std::shared_ptr< ...@@ -1818,7 +1819,8 @@ static std::pair<std::shared_ptr<ngraph::Function>, std::vector<std::shared_ptr<
// create the backward function // create the backward function
std::vector<std::shared_ptr<ngraph::op::Parameter>> param_adjoints; std::vector<std::shared_ptr<ngraph::op::Parameter>> param_adjoints;
for (auto n : adjoints) for (auto n : adjoints)
param_adjoints.push_back(std::dynamic_pointer_cast<ngraph::op::Parameter>(n)); param_adjoints.push_back(
std::dynamic_pointer_cast<ngraph::op::Parameter>(n.get_node_shared_ptr()));
back_parameters.insert(back_parameters.begin(), param_adjoints.begin(), param_adjoints.end()); back_parameters.insert(back_parameters.begin(), param_adjoints.begin(), param_adjoints.end());
return {std::make_shared<ngraph::Function>(dYdXs, back_parameters), adjoints}; return {std::make_shared<ngraph::Function>(dYdXs, back_parameters), adjoints};
...@@ -2703,7 +2705,7 @@ void sigmoid_multiply_fusion_backward_compute(runtime::Backend* backend, ...@@ -2703,7 +2705,7 @@ void sigmoid_multiply_fusion_backward_compute(runtime::Backend* backend,
auto sigmoid_mul = auto sigmoid_mul =
make_shared<op::SigmoidMultiply>(input_0_alt, input_1_alt, input_0_type, input_1_type); make_shared<op::SigmoidMultiply>(input_0_alt, input_1_alt, input_0_type, input_1_type);
ngraph::autodiff::Adjoints adjoints(NodeVector{sigmoid_mul}, NodeVector{delta_param}); ngraph::autodiff::Adjoints adjoints(OutputVector{sigmoid_mul}, OutputVector{delta_param});
auto d_input_0 = adjoints.backprop_node(input_0_adjoint); auto d_input_0 = adjoints.backprop_node(input_0_adjoint);
auto d_input_1 = adjoints.backprop_node(input_1_adjoint); auto d_input_1 = adjoints.backprop_node(input_1_adjoint);
auto df = make_shared<Function>(NodeVector{d_input_0, d_input_1}, back_params); auto df = make_shared<Function>(NodeVector{d_input_0, d_input_1}, back_params);
......
...@@ -144,7 +144,7 @@ namespace ngraph ...@@ -144,7 +144,7 @@ namespace ngraph
// df/dX* // df/dX*
std::vector<std::shared_ptr<Node>> df_output_params; std::vector<std::shared_ptr<Node>> df_output_params;
Adjoints adjoints(NodeVector{f->get_output_op(0)}, NodeVector{c_param}); Adjoints adjoints(OutputVector{f->output(0)}, OutputVector{c_param});
// for each x "of interest" // for each x "of interest"
for (auto x : indep_params) for (auto x : indep_params)
......
...@@ -32,10 +32,10 @@ using namespace ngraph; ...@@ -32,10 +32,10 @@ using namespace ngraph;
std::shared_ptr<Function> autodiff::backprop_function(const std::shared_ptr<Function>& f) std::shared_ptr<Function> autodiff::backprop_function(const std::shared_ptr<Function>& f)
{ {
auto Y_out = f->get_output_op(0); auto Y_out = f->output(0);
auto Xs = f->get_parameters(); auto Xs = f->get_parameters();
auto C = std::make_shared<op::Parameter>(Y_out->get_element_type(), Y_out->get_shape()); auto C = std::make_shared<op::Parameter>(Y_out.get_element_type(), Y_out.get_shape());
Adjoints adjoints(NodeVector{Y_out}, NodeVector{C}); Adjoints adjoints(OutputVector{Y_out}, OutputVector{C});
std::vector<std::shared_ptr<Node>> dYdXs(Xs.size()); std::vector<std::shared_ptr<Node>> dYdXs(Xs.size());
transform(Xs.begin(), Xs.end(), dYdXs.begin(), [C, &adjoints](const std::shared_ptr<Node>& X) { transform(Xs.begin(), Xs.end(), dYdXs.begin(), [C, &adjoints](const std::shared_ptr<Node>& X) {
return adjoints.backprop_node(X); return adjoints.backprop_node(X);
......
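The test and autodiff changes above all follow the same migration: Adjoints construction and graph outputs take OutputVector (Output<Node> handles) rather than NodeVector. A condensed sketch of the updated call shape, assuming f is an existing ngraph::Function with at least one parameter:
// Sketch of the migrated pattern
ngraph::Output<ngraph::Node> y = f->output(0); // previously f->get_output_op(0)
auto c = std::make_shared<ngraph::op::Parameter>(y.get_element_type(), y.get_shape());
ngraph::autodiff::Adjoints adjoints(ngraph::OutputVector{y}, ngraph::OutputVector{c});
auto dx = adjoints.backprop_node(f->get_parameters().at(0));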