Unverified Commit e165a460 authored by Scott Cyphers's avatar Scott Cyphers Committed by GitHub

Merge branch 'master' into master

parents e985e98f ce7e168a
......@@ -115,3 +115,7 @@ python/pybind11/
CMakeCache.txt
CMakeFiles/
CMakeSettings.json
# don't add dot-save files!
.save
......@@ -96,7 +96,7 @@ to improve it:
[framework integration guides]: http://ngraph.nervanasys.com/docs/latest/framework-integration-guides.html
[release notes]: https://ngraph.nervanasys.com/docs/latest/project/release-notes.html
[Github issues]: https://github.com/NervanaSystems/ngraph/issues
[contrib guide]: https://ngraph.nervanasys.com/docs/latest/project/code-contributor-README.html
[contrib guide]: https://ngraph.nervanasys.com/docs/latest/project/contribution-guide.html
[pull request]: https://github.com/NervanaSystems/ngraph/pulls
[how to import]: https://ngraph.nervanasys.com/docs/latest/howto/import.html
[ngraph_wireframes_with_notice]: doc/sphinx/source/graphics/readme_stack.png "nGraph wireframe"
......
......@@ -15,7 +15,7 @@
PROJECT_NAME = "Intel® nGraph Library and API docs"
PROJECT_NAME = "Intel® nGraph Compiler stack and API docs"
PROJECT_BRIEF = "Code reference for the Intel® nGraph C++ Library"
OUTPUT_DIRECTORY = @CMAKE_CURRENT_BINARY_DIR@
......@@ -240,7 +240,7 @@ EXTENSION_MAPPING =
# case of backward compatibilities issues.
# The default value is: YES.
# MARKDOWN_SUPPORT = NO
MARKDOWN_SUPPORT = NO
# When the TOC_INCLUDE_HEADINGS tag is set to a non-zero value, all headings up
# to that level are automatically included in the table of contents, even if
......@@ -405,7 +405,7 @@ EXTRACT_LOCAL_METHODS = NO
# the file that contains the anonymous namespace. By default anonymous namespace
# are hidden.
EXTRACT_ANON_NSPACES = YES
EXTRACT_ANON_NSPACES = NO
# If the HIDE_UNDOC_MEMBERS tag is set to YES, doxygen will hide all
# undocumented members inside documented classes or files. If set to NO these
......@@ -615,7 +615,7 @@ LAYOUT_FILE =
# LATEX_BIB_STYLE. To use this feature you need bibtex and perl available in the
# search path. See also \cite for info how to create references.
CITE_BIB_FILES =
# CITE_BIB_FILES =
#---------------------------------------------------------------------------
# Configuration options related to warning and progress messages
......@@ -626,7 +626,7 @@ CITE_BIB_FILES =
# messages are off.
# The default value is: NO.
QUIET = NO
QUIET = YES
# The WARNINGS tag can be used to turn on/off the warning messages that are
# generated to standard error (stderr) by doxygen. If WARNINGS is set to YES
......@@ -635,14 +635,14 @@ QUIET = NO
# Tip: Turn warnings on while writing the documentation.
# The default value is: YES.
WARNINGS = YES
WARNINGS = NO
# If the WARN_IF_UNDOCUMENTED tag is set to YES then doxygen will generate
# warnings for undocumented members. If EXTRACT_ALL is set to YES then this flag
# will automatically be disabled.
# The default value is: YES.
WARN_IF_UNDOCUMENTED = YES
WARN_IF_UNDOCUMENTED = NO
# If the WARN_IF_DOC_ERROR tag is set to YES, doxygen will generate warnings for
# potential errors in the documentation, such as not documenting some parameters
......@@ -786,7 +786,8 @@ EXCLUDE_SYMLINKS = NO
# Note that the wildcards are matched against the file with absolute path, so to
# exclude all test directories for example use the pattern */test/*
EXCLUDE_PATTERNS =
EXCLUDE_PATTERNS = */tools/*
*/resource/*
# The EXCLUDE_SYMBOLS tag can be used to specify one or more symbol names
# (namespaces, classes, functions, etc.) that should be excluded from the
......@@ -1176,7 +1177,7 @@ DOCSET_PUBLISHER_ID = org.doxygen.Publisher
# The default value is: Publisher.
# This tag requires that the tag GENERATE_DOCSET is set to YES.
DOCSET_PUBLISHER_NAME = Publisher
DOCSET_PUBLISHER_NAME = AIPG Intel Corporation
# If the GENERATE_HTMLHELP tag is set to YES then doxygen generates three
# additional HTML index files: index.hhp, index.hhc, and index.hhk. The
......@@ -1897,7 +1898,7 @@ HAVE_DOT = YES
# speed.
# This tag requires that the tag HAVE_DOT is set to YES.
DOT_NUM_THREADS = 32
DOT_NUM_THREADS = 0
# When you want a differently looking font in the dot files that doxygen
# generates you can specify the font name using DOT_FONTNAME. You need to make
......@@ -2144,4 +2145,4 @@ GENERATE_LEGEND = YES
# The default value is: YES.
# This tag requires that the tag HAVE_DOT is set to YES.
DOT_CLEANUP = YES
\ No newline at end of file
DOT_CLEANUP = YES
......@@ -77,9 +77,6 @@
{# Keep modernizr in head - http://modernizr.com/docs/#installing #}
<script src="{{ pathto('_static/js/modernizr.min.js', 1) }}"></script>
{# swagger-ui loading for API doc display #}
<link rel="stylesheet" href="{{ pathto('_static/css/swagger.css', 1) }}" type="text/css" />
</head>
<body class="wy-body-for-nav" role="document">
......
This diff is collapsed.
......@@ -1638,9 +1638,9 @@ html {
body {
font-family: "RobotoSlab", Sans, sans-serif;
font-weight: normal;
color: #38403f;
color: #3f3c3d;
min-height: 100%;
overflow-x: hidden;
overflow-x: visible;
background: #fcfcfc;
}
......@@ -1731,11 +1731,13 @@ p {
h1 {
font-size: 153%;
color: #585351;
}
h2, .rst-content .toctree-wrapper p.caption {
font-size: 139%;
font-weight: lighter;
color: #826c62;
}
h3 {
......@@ -1854,7 +1856,7 @@ div[class^='highlight'] td.code {
code, p.caption {
font-family: "NeoSansIntel-Light", sans, monospace;
color: #A79992;
color: #585351;
font-size: 0.99em;
line-height: 1.39em;
}
......@@ -1875,7 +1877,7 @@ caption-text {
padding-top: 0.29em;
padding-left: 0.11em;
padding-bottom: 0.23em;
text-align: rig6ht;
text-align: right;
}
div[class^='highlight'] pre {
......@@ -2771,7 +2773,7 @@ span[id*='MathJax-Span'] {
padding: 0 1.618em;
margin-bottom: 0;
display: block;
font-family: NeoSansIntel, sans;
font-family: "NeoSansIntel", sans;
font-weight: bold;
text-transform: uppercase;
font-size: 80%;
......@@ -2925,11 +2927,12 @@ span[id*='MathJax-Span'] {
padding: 0.809em;
margin-bottom: 0.809em;
z-index: 200;
background-color: #0071c5;
background-color: #fcfcfc;
text-align: center;
padding: 0.809em;
display: block;
color: #fcfcfc;
color: #585351;
font-family: "NeoSansIntel-Regular", sans;
margin-bottom: 0.809em;
}
.wy-side-nav-search input[type=text] {
......@@ -2937,14 +2940,14 @@ span[id*='MathJax-Span'] {
line-height: 1.3em;
border-radius: 50px;
padding: 6px 12px;
border-color: #8eb0af;
border-color: #aba09b;
}
.wy-side-nav-search img {
display: block;
margin: auto auto 0.809em auto;
height: 45px;
width: 45px;
background-color: #8eb0af;
background-color: #aba09b;
padding: 5px;
border-radius: 100%;
}
......@@ -2975,7 +2978,7 @@ span[id*='MathJax-Span'] {
margin-top: -0.4045em;
margin-bottom: 0.809em;
font-weight: normal;
color: rgba(255, 255, 255, 0.3);
color: #3f3c3d;
}
.wy-nav .wy-menu-vertical header {
......@@ -2985,7 +2988,7 @@ span[id*='MathJax-Span'] {
color: #dadada;
}
.wy-nav .wy-menu-vertical a:hover {
background-color: #8eb0af;
background-color: #aba09b;
color: #fff;
}
......@@ -3035,7 +3038,7 @@ span[id*='MathJax-Span'] {
overflow-x: hidden;
overflow-y: hidden;
min-height: 100%;
background: #5f5f5f;
background: #585351;
z-index: 200;
}
......@@ -3049,7 +3052,7 @@ span[id*='MathJax-Span'] {
.wy-nav-top {
display: none;
background: #0071c5;
background: #fcfcfc;
color: #fff;
padding: 0.4045em 0.809em;
position: relative;
......@@ -3066,14 +3069,14 @@ span[id*='MathJax-Span'] {
clear: both;
}
.wy-nav-top a {
color: #fff;
font-weight: bold;
color: #585351;
font-family: "NeoSansIntel-Regular", sans;
}
.wy-nav-top img {
margin-right: 12px;
height: 45px;
width: 45px;
background-color: #0071c5;
background-color: #fcfcfc;
padding: 5px;
border-radius: 100%;
}
......@@ -3092,15 +3095,15 @@ span[id*='MathJax-Span'] {
.wy-nav-content {
padding: 1.618em 3.236em;
height: 100%;
max-width: 850px !important;
max-width: 979px !important;
margin: auto;
}
.wy-body-mask {
position: fixed;
width: 100%;
width: auto;
height: 100%;
background: rgba(0, 0, 0, 0.2);
background: ;
display: none;
z-index: 499;
}
......@@ -3137,11 +3140,11 @@ footer span.commit code, footer span.commit .rst-content tt, .rst-content footer
#search-results .search li {
margin-bottom: 24px;
border-bottom: solid 1px #e1e4e5;
border-bottom: solid 1px #f3cdba;
padding-bottom: 24px;
}
#search-results .search li:first-child {
border-top: solid 1px #e1e4e5;
border-top: solid 1px #f3cdba;
padding-top: 24px;
}
#search-results .search li a {
......
......@@ -2554,9 +2554,10 @@ div[class^='highlight'] pre {
}
.function {
border-right: dashed 0.19em #f4f2f0;
border-bottom: solid 0.23em #f4f2f0;
border-top: solid 0.31em #77a29f;
border-right: dashed 0.19em #f3cdba;
border-bottom: solid 0.23em #826c62;
border-top: solid 0.31em #826c62;
background: #aba09b;
}
......
......@@ -9,4 +9,4 @@ sticky_navigation = True
logo_only =
collapse_navigation = False
display_version = True
use_bower = FALSE
use_bower = False
.. backend-support/cpp-api.rst:
Runtime Backends
################
.. figure:: ../graphics/backend-dgm.png
:width: 650px
Various backends are accessible via nGraph core APIs
Backend
=======
.. doxygenclass:: ngraph::runtime::Backend
:project: ngraph
:members:
Tensor
======
.. doxygenclass:: ngraph::runtime::Tensor
:project: ngraph
:members:
HostTensor
==========
.. doxygenclass:: ngraph::runtime::HostTensor
:project: ngraph
:members:
PlaidML
=======
.. doxygenclass:: ngraph::runtime::plaidml::PlaidML_Backend
:project: ngraph
:members:
\ No newline at end of file
.. index.rst
.. backend-support/index.rst
#######################
Interact with Backends
#######################
Transformer, CPU, GPU, PlaidML
###############################
Backend
========
* :ref:`hybrid_transformer`
* :ref:`cpu_backend`
* :ref:`plaidml_backend`
* :ref:`gpu_backend`
What is a backend?
------------------
Backends are responsible for function execution and value allocation. They
can be used to :doc:`carry out a programmed computation<../howto/execute>`
......@@ -14,32 +22,38 @@ from a framework by using a CPU or GPU; or they can be used with an *Interpreter
mode, which is primarily intended for testing, to analyze a program, or for a
framework developer to develop customizations. Experimental APIs to support
current and future nGraph Backends are also available; see, for example, the
section on :ref:`plaidml_`.
section on :ref:`plaidml_backend`.
.. figure:: ../graphics/backend-dgm.png
:width: 650px
.. _hybrid_transformer:
Hybrid Transformer
==================
.. doxygenclass:: ngraph::runtime::Backend
:project: ngraph
:members:
Lorem ipsum
.. _cpu_backend:
Tensor
=======
CPU Backend
===========
.. doxygenclass:: ngraph::runtime::Tensor
:project: ngraph
:members:
Lorem ipsum
.. _gpu_backend:
.. _plaidml_:
GPU Backend
===========
PlaidML
========
Lorem ipsum
.. _plaidml_backend:
PlaidML Backend
===============
The nGraph ecosystem has recently added initial (experimental) support for `PlaidML`_,
which is an advanced :abbr:`Machine Learning (ML)` library that can further
......@@ -47,10 +61,4 @@ accelerate training models built on GPUs. When you select the ``PlaidML`` option
as a backend, it behaves as an advanced tensor compiler that can further speed up
training with large data sets.
.. doxygenclass:: ngraph::runtime::plaidml::PlaidML_Backend
:project: ngraph
:members:
.. _PlaidML: https://github.com/plaidml
......@@ -220,12 +220,13 @@ paths for what you'll want to do next: either compile a framework to run a DL
training model, or load an import of an "already-trained" model for inference
on an Intel nGraph-enabled backend.
For the former case, this early |version|, :doc:`framework-integration-guides`,
For the former case, this early |version|, :doc:`frameworks/index`,
can help you get started with training a model on a supported framework.
* :doc:`MXNet<framework-integration-guides>` framework,
* :doc:`TensorFlow<framework-integration-guides>` framework, and
* :doc:`neon<framework-integration-guides>` framework,
* :doc:`MXNet<frameworks/mxnet_integ>` framework,
* :doc:`TensorFlow<frameworks/tensorflow_integ>` framework,
* :doc:`ONNX & ONNXIFI<frameworks/onnx_integ>`, and
* :doc:`PaddlePaddle<frameworks/paddle_integ>` framework.
For the latter case, if you've followed a tutorial from `ONNX`_, and you have an
......
......@@ -73,11 +73,11 @@ author = 'Intel Corporation'
# built documents.
#
# The short X.Y version.
version = '0.12'
version = '0.13'
# The Documentation full version, including alpha/beta/rc tags. Some features
# available in the latest code will not necessarily be documented first
release = '0.12.1'
release = '0.13-0'
# The language for content autogenerated by Sphinx. Refer to documentation
# for a list of supported languages.
......
.. core/overview.rst:
Overview
========
.. debug:
Debugging
#########
Error hints
===========
Common mistakes
===============
Wiki, other help
================
.. nbench:
###################
Working with nBench
###################
.. performance-profile:
Performance profiling
#####################
.. FW-specific validation performance comparisons via nbench
\ No newline at end of file
.. visualize:
Visualization
#############
.. framework-integration-guides:
###############################
Integrate Supported Frameworks
###############################
* :ref:`mxnet_intg`
* :ref:`tensorflow_intg`
A framework is "supported" when there is a framework :term:`bridge` that can be
cloned from one of our GitHub repos and built to connect to nGraph device backends,
all the while maintaining the framework's programmatic or user interface. Bridges
currently exist for the TensorFlow\* and MXNet\* frameworks.
.. figure:: graphics/bridge-to-graph-compiler.png
:width: 733px
:alt: JiT compiling of a computation
:abbr:`Just-in-Time (JiT)` Compiling for computation
Once connected via the bridge, the framework can then run and train a deep
learning model with various workloads on various backends using nGraph Compiler
as an optimizing compiler available through the framework.
.. _mxnet_intg:
MXNet\* bridge
===============
* See the README on `nGraph-MXNet`_ Integration.
* **Testing latency for Inference**: See the :doc:`frameworks/testing-latency`
doc for a fully-documented example of how to compile and test latency with an
MXNet-supported model.
* **Training**: For experimental or alternative approaches to distributed training
methodologies, including data parallel training, see the MXNet-relevant sections
of the docs on :doc:`distr/index` and :doc:`How to <howto/index>` topics like
:doc:`howto/distribute-train`.
.. _tensorflow_intg:
TensorFlow\* bridge
===================
See the `ngraph tensorflow bridge README`_ for how to install the `DSO`_ for the
nGraph-TensorFlow bridge.
.. _nGraph-MXNet: https://github.com/NervanaSystems/ngraph-mxnet/blob/master/README.md
.. _MXNet: http://mxnet.incubator.apache.org
.. _DSO: http://csweb.cs.wfu.edu/%7Etorgerse/Kokua/More_SGI/007-2360-010/sgi_html/ch03.html
.. _being the fastest: https://github.com/soumith/convnet-benchmarks
.. _ngraph tensorflow bridge README: https://github.com/NervanaSystems/ngraph-tf/blob/master/README.md
.. frameworks/generic-configs.rst:
Configurations available to any framework
#########################################
Enabling Deep Learning paradigms
================================
Framework architects or engineers who can't quite find what they need among
the existing DL tools may need to build something new off a "stock" framework,
or something entirely from scratch. For this category of developer, we have
:doc:`documented several ways <../howto/index>` you can incorporate built-in
compiler support for users of your framework; this includes out-of-box support
for things like Intel® MKL-DNN and PlaidML when your framework supports nGraph
as a "backend" or engine.
.. important:: nGraph does not provide an interface for "users" of frameworks
(for example, we cannot dictate or control how Tensorflow* or MXNet* presents
interfaces to users). Please keep in mind that designing and documenting
the :abbr:`User Interface (UI)` of step 3 above is entirely in the realm
of the framework owner or developer and beyond the scope of the nGraph
Compiler stack. However, any framework can be designed to make direct use
of nGraph Compiler stack-based features and then expose an accompanying UI,
output message, or other detail to a user.
The nGraph :abbr:`IR (Intermediate Representation)` is a format that can understand
inputs from a framework. Today, there are two primary tasks that can be accomplished
in the “bridge code” space of the nGraph IR:
#. Compiling a dataflow graph
#. Executing a pre-compiled graph.
See the :doc:`../framework-integration-guides` for how we built bridges with our
initially-supported frameworks. For more in-depth help in writing things like
graph optimizations and bridge code, we provide articles on how to
:doc:`../fusion/index`, and programmatically :doc:`../howto/execute` that can
target various compute resources using nGraph when a framework provides some
inputs to be computed.
.. note:: Configuration options can be added manually on the command line or via
scripting. Please keep in mind that fine-tuning of parameters is as much of
an art as it is a science; there are virtually limitless ways to do so and
our documentation provides only a sampling.
Integrating nGraph with new frameworks
======================================
Integrating new frameworks
==========================
This section details some of the *configuration options* and some of the
*environment variables* that can be used to tune for optimal performance when
......@@ -78,7 +33,7 @@ something like:
FMV
===
---
FMV stands for :abbr:`Function Multi-Versioning`, and it can also provide a
number of generic ways to patch or bring architecture-based optimizations to
......@@ -90,9 +45,8 @@ for Intel® Architecture, and it includes at least one older CPU, the
`following article may be helpful`_.
Training Deep Neural Networks
==============================
-----------------------------
Before tweaking various environment variables, be aware that how the computation
gets executed depends upon the ordering of the data format that the model is
......@@ -108,6 +62,7 @@ For CPU (and most cuDNN) backends, the preferred layout is currently ``NCHW``.
* **H** -- Height of the image
* **W** -- Width of the image
Intel® Math Kernel Library for Deep Neural Networks
---------------------------------------------------
......@@ -127,7 +82,7 @@ additional component to be able to use these configuration settings.
nGraph-enabled Intel® Xeon®
============================
---------------------------
The list below includes recommendations on data layout, parameters, and
application configuration to achieve best performance running DNN workloads on
......
.. frameworks/index.rst:
#####################
Connecting Frameworks
#####################
.. TODO update CODEOWNERS for this new structure
Current framework integrations
==============================
.. toctree::
:maxdepth: 1
tensorflow_integ.rst
mxnet_integ.rst
onnx_integ.rst
paddle_integ.rst
validated/testing-latency.rst
A framework is "supported" when there is a framework :term:`bridge` that can be
cloned from one of our GitHub repos and built to connect to nGraph device backends,
all the while maintaining the framework's programmatic or user interface. Bridges
currently exist for the TensorFlow\* and MXNet\* frameworks.
.. figure:: ../graphics/bridge-to-graph-compiler.png
:width: 733px
:alt: JiT compiling of a computation
:abbr:`Just-in-Time (JiT)` Compiling for computation
Once connected via the bridge, the framework can then run and train a deep
learning model with various workloads on various backends using nGraph Compiler
as an optimizing compiler available through the framework.
While a :abbr:`Deep Learning (DL)` :term:`framework` is ultimately meant for
end use by data scientists, or for deployment in cloud container environments,
nGraph Core ops and the nGraph C++ Library are designed for framework builders
themselves. We invite anyone working on new and novel frameworks or neural
network designs to explore our highly-modular stack of components that can
network designs to explore our highly-modularized stack of components that can
be implemented or integrated in virtually limitless ways.
Please read the articles in this section if you are considering incorporating
......@@ -17,61 +45,10 @@ design. Articles here are also useful if you are working on something
built-from-scratch, or on an existing framework that is less widely-supported
than the popular frameworks like TensorFlow and PyTorch.
.. toctree::
:maxdepth: 1
generic-configs.rst
testing-latency.rst
validation.rst
Understanding users of frameworks
=================================
A data scientist or ML engineer may not initially know which framework is the
"best" framework to use to start working on his or her problem set. While there
are several to choose from, it can be daunting and time consuming to scope the
wide array of features and customization options offered by some of the more
popular frameworks:
#. First **find** a tested and working DL model that does something *similar*
to what the data scientist or ML engineer wants to do. To assist with this
stage, we've already provided organized tables of :doc:`validation` examples.
#. Next, **replicate** that result using well-known datasets to confirm that the
model does indeed work. To assist with this stage, we've released several
:doc:`pip installation options <../framework-integration-guides>` that can
be used to test basic examples.
#. Finally, **modify** some aspect: add new datasets, or adjust an algorithm's
parameters to hone in on specifics that can better train, forecast, or predict
scenarios modeling the real-world problem. This is also the stage where it
makes sense to `tune the workload to extract best performance`_.
.. important:: nGraph does not provide an interface for "users" of frameworks
(for example, we cannot dictate or control how Tensorflow* or MXNet* presents
interfaces to users). Please keep in mind that designing and documenting
the :abbr:`User Interface (UI)` is entirely in the realm of the framework owner
or developer and beyond the scope of the nGraph Compiler stack. However, any
framework can be designed to make direct use of nGraph Compiler stack-based
features and then expose an accompanying UI, output message, or other detail
to a user.
Clearly, one challenge of the framework developer is to differentiate from
the pack by providing a means for the data scientist to obtain reproducible
results. The other challenge is to provide sufficient documentation, or to
provide sufficient hints for how to do any "fine-tuning" for specific use cases.
With the nGraph Compiler stack powering your framework, it becomes much easier
to help your users get reproducible results with nothing more complex than the
CPU that powers their operating system.
In general, the larger and more complex a framework is, the harder it becomes
to navigate and extract the best performance; configuration options that are
enabled by "default" from the framework side can sometimes slow down compilation
without the developer being any the wiser. Sometimes only `a few small`_
adjustments can increase performance. Likewise, a minimalistic framework that
is designed around one specific kind of model can sometimes offer significant
performance-improvement opportunities by lowering overhead.
See :doc:`generic-configs` to get started.
.. figure:: ../graphics/translation-flow-to-ng-fofx.png
:width: 725px
:alt: Translation flow to nGraph function graph
.. _tune the workload to extract best performance: https://ai.intel.com/accelerating-deep-learning-training-inference-system-level-optimizations
......
.. mxnet_integ.rst:
MXNet\* bridge
===============
* See the `README`_ on nGraph-MXNet repo.
* **Testing inference latency**: See the :doc:`validated/testing-latency`
doc for a fully-documented example of how to compile and test latency with an
MXNet-supported model.
* **Training**: For experimental or alternative approaches to distributed
training methodologies, including data parallel training, see the
MXNet-relevant sections of the docs on :doc:`../distr/index` and
:doc:`How to <../howto/index>` topics like :doc:`../howto/distribute-train`.
.. _README: https://github.com/NervanaSystems/ngraph-mxnet/blob/master/README.md
\ No newline at end of file
.. onnx_integ.rst:
ONNX & ONNXIFI
==============
\ No newline at end of file
.. paddle_integ.rst:
PaddlePaddle
============
.. tensorflow_integ.rst:
TensorFlow\* bridge
===================
See the `ngraph tensorflow bridge README`_ for how to install the `DSO`_ for the
nGraph-TensorFlow bridge.
.. _DSO: http://csweb.cs.wfu.edu/%7Etorgerse/Kokua/More_SGI/007-2360-010/sgi_html/ch03.html
.. _ngraph tensorflow bridge README: https://github.com/NervanaSystems/ngraph-tf/blob/master/README.md
.. frameworks/validation.rst:
.. frameworks/validated/list.rst:
##############################
Validated Models and Workloads
##############################
#################################
Validated workloads by framework
#################################
We validated performance [#f1]_ for the following TensorFlow\* and MXNet\* workloads:
We validated performance [#f1]_ for the following TensorFlow\* and MXNet\*
workloads:
* :ref:`tensorflow_valid`
* :ref:`mxnet_valid`
* :ref:`onnx_valid`
* :doc:`testing-latency`
.. _tensorflow_valid:
TensorFlow
==========
......@@ -36,6 +46,9 @@ TensorFlow
DRAW, Image generation
A3C, Reinforcement learning
.. _mxnet_valid:
MXNet
=====
......@@ -63,8 +76,11 @@ MXNet
DCGAN, Generative adversarial network
A3C, Reinforcement learning
.. _onnx_valid:
ONNX
=====
====
Additionally, we validated that the following workloads are functional through
`nGraph ONNX importer`_:
......
.. frameworks/testing_latency:
.. frameworks/validated/testing-latency.rst:
Testing latency
......@@ -23,7 +23,7 @@ reasons outlined in our `features`_ documentation.
the experimental backend if you already use the ngraph-mxnet Github repo
.. figure:: ../graphics/ngraph-mxnet-models.png
.. figure:: ../../graphics/ngraph-mxnet-models.png
:width: 533px
:alt: Up to 45X faster
......@@ -85,14 +85,14 @@ into a static graph. Also note that any model with a saved checkpoint can be
considered a "static graph" in nGraph. For this example, we'll presume that the
model is pre-trained.
.. literalinclude:: ../../../examples/subgraph_snippets/mxnet-gluon-example.py
.. literalinclude:: ../../../../examples/subgraph_snippets/mxnet-gluon-example.py
:language: python
:lines: 17-32
To load the model into nGraph, we simply bind the symbol into an Executor.
.. literalinclude:: ../../../examples/subgraph_snippets/mxnet-gluon-example.py
.. literalinclude:: ../../../../examples/subgraph_snippets/mxnet-gluon-example.py
:language: python
:lines: 34-35
......@@ -101,7 +101,7 @@ the graph, and in the case of Resnet, sends the entire graph to nGraph for
compilation. This produces a single call to an NNVM ``NGraphSubgraphOp`` embedded
with the compiled model. At this point, we can test the model's performance.
.. literalinclude:: ../../../examples/subgraph_snippets/mxnet-gluon-example.py
.. literalinclude:: ../../../../examples/subgraph_snippets/mxnet-gluon-example.py
:language: python
:lines: 40-48
......
.. fusion/index.rst:
Optimize Graphs
===============
with nGraph Compiler fusions
----------------------------
The nGraph Compiler is an optimizing compiler. As such, it provides a way to
capture a given :term:`function graph` and perform a series of optimization
passes over that graph. The result is a semantically-equivalent graph that, when
executed using any |InG| :doc:`backend <../programmable/index>`, has optimizations
inherent at the hardware level: superior runtime characteristics to increase
training performance or reduce inference latency.
There are several ways to describe what happens when we capture and translate
the framework's output of ops into an nGraph graph. :term:`Fusion` is the term
we shall use in our documentation; the action also can be described as:
*combining*, *folding*, *squashing*, *collapsing*, or *merging* of graph
functions.
Optimization passes may include algebraic simplifications, domain-specific
simplifications, and fusion. Most passes share the same mode of operation (or
the same operational structure) and consist of various stages (each one a
:term:`step`) where a developer can experiment with the intercepted or dynamic
graph. These steps may be cycled or recycled as needed:
#. Locate a list of potentially-transformable subgraphs in the given graph.
#. Transform the selected candidates into semantically-equivalent subgraphs
that execute faster, or with less memory (or both).
#. Verify that the optimization pass performs correctly, with any or all expected
transformations, with the ``NGRAPH_SERIALIZE_TRACING`` option, which
serializes a graph in the `json` format after a pass.
#. Measure and evaluate your performance improvements with ``NGRAPH_CPU_TRACING``,
which produces timelines compatible with ``chrome://tracing``.
Optimizations can be experimented upon without using any backend by registering
a pass with the pass manager (``Manager``), calling ``run_passes`` on a function, and
then inspecting the transformed graph.
Optimization passes can be programmed ahead of time if you know or can predict
what your graph will look like when it's ready to be executed (in other words:
which `ops` can be automatically translated into :doc:`nGraph Core ops <../ops/index>`).
The ``Interpreter`` is simply a backend providing reference implementations of
ngraph ops in C++, with the focus on simplicity over performance.
Example
-------
.. fusion/index.rst:
Pattern matcher
###############
* :ref:`overview`
* :ref:`passes_list`
* :ref:`more_detail`
* :ref:`passes_examples`
* :doc:`optimize-graphs`
.. _overview:
Generic graph optimizers: Optimization passes
=============================================
The pass manager infrastructure in nGraph makes it easy to reuse and mix the
generic optimization passes. It also permits you to roll your own device-specific
optimizations; that is, the same unified interface and APIs may be used to
cover both things.
Invoking these passes is fairly straightforward:
#. Create a "pass manager" object.
#. Populate it with the desired passes.
#. Pass to it a pointer to your unoptimized graph, and it’ll return a pointer
to an optimized graph.
nGraph Core includes a large library of hardware-agnostic passes -- passes useful
for almost any kind of hardware backend. Some of these passes should be familiar
to people who are comfortable with classical compiler designs. Others, like the
reshape/transpose elimination and sinking passes, are quite specific to deep
learning.
Let’s take a look at some of these passes.
.. _passes_list:
List of Passes
==============
* :ref:`algebraic_simpl`
* :ref:`common_subex_elim`
* :ref:`constant_fold`
* :ref:`reshape_transpose_elim`
* :ref:`reshape_transpose_sink`
.. _algebraic_simpl:
Algebraic Simplification
------------------------
The **Algebraic Simplification** pass implements what amounts to a "grab bag" of
algebraic simplification rules. It does some basic things like rewrite "zero
times x" to simply "zero", or "zero plus x" to plain "x".
It can also do a number of tricks more specific to deep learning. For example,
if we discover that a tensor is being sliced up by adjacent segments, only to
have those slices concatenated back together again, we can skip the slicing and
concatting altogether.
Or, if a tensor is being padded, but the actual width of the padding is zero
all around, we can skip the padding step entirely.
Several other transformations like this are implemented in the algebraic
simplification pass. And while none of these transformations might seem
particularly impressive on their own, when everything comes together the
results of this pass often yield improvement even on the initial graph straight
out of the bridge. This pass is also quite important as a "glue" pass that can
be used to clean up and/or re-simplify after other passes have done their own
tricks.
.. _common_subex_elim:
Common Subexpression Elimination
--------------------------------
.. _constant_fold:
Constant Folding
----------------
.. _core_fusion:
Core Fusion
-----------
.. _reshape_transpose_elim:
Reshape/Transpose Elimination
-----------------------------
The pass called **Reshape/Transpose Elimination** will find and optimize where
we can "push" two ``Transpose`` ops through a matrix multiplication. For example,
if you have two matrices (say, *foo* and *bar*), both of these matrices will be
transposed (to produce *foo.t* and *bar.t*, respectively), after which *foo.t*
and *bar.t* get multiplied together.
Often a more efficient way to implement this is to switch the order of the
arguments *foo* and *bar*, multiply them together, and then transpose the output
of the matmul. Effectively, this cuts two `Transpose` operations down to just
one, where the **Reshape/Transpose** elimination will do that rewrite for you.
Another common pattern that can be optimized via nGraph is the case where two
transpositions cancel each other out. One example of this is taking the
"Transpose" of the transpose of a matrix, though actually a more common case is
when the graph is translating among different batch formats. We can often move
these operations around through a process called **Reshape sinking/swimming**,
and in cases where two transposes wind up canceling each other out, we can cut
them both out of the graph.
.. _reshape_transpose_sink:
``Reshape/Transpose Sinking``
-----------------------------
.. _elementzero_tensor_elim:
``Zero-Element Tensor Elimination``
-----------------------------------
.. _more_detail:
More detail
-----------
Let us first consider a simple example. A user would like to execute a graph
that describes the following arithmetic expression:
......@@ -80,4 +171,35 @@ optimizer.
graph-rewrite.rst
passes-that-use-matcher.rst
optimize-graphs.rst
.. _passes_examples:
Examples of Passes
==================
The effectiveness of these passes is more striking to look at in terms of an
actual input graph, such as one from the framework bridge.
*Figure 0* shows an excerpt from ``MobileNet v1``, a topology which makes heavy
use of group convolution.
.. _figure-mobilenet-gc:
.. figure:: ../graphics/mobilenet-group-conv.png
:width: 700px
:alt:
Figure 0: Each of these grouped convolution complexes -- the
operations within the rectangles on the left -- is very wide; each is too
wide to fit legibly on the illustration.
The group convolution fusion is able to replace each of those giant subgraphs
with a single CPU group convolution node. This ends up being a win in several
ways:
* sheer node count,
* mappability to MKL-DNN (which has an accelerated group convolution implementation),
* elimination of unnecessary temporaries, and so on.
\ No newline at end of file
digraph G {
Label_0 -> Max_2
Constant_1 -> Max_2
Label_0 [shape=ellipse color=black]
Constant_1 [shape=ellipse color=black]
Max_2 [shape=ellipse color=black]
}
digraph G {
Parameter_0 -> Negative_1 -> Negative_2;
Parameter_0 [shape=box color=blue]
Negative_1 [shape=ellipse color=black]
Negative_2 [shape=ellipse color=black]
}
digraph G {
Parameter_0 -> Abs_1 -> Negative_2 -> Negative_3;
Parameter_0 [shape=box color=blue]
Abs_1 [shape=ellipse color=black]
Negative_2 [shape=ellipse color=black]
Negative_3 [shape=ellipse color=black]
}
digraph G {
Parameter_0 -> Add_2
Parameter_1 -> Add_2
Add_2 -> Abs_3 -> Negative_4 -> Negative_5
Parameter_0 [shape=box color=blue]
Parameter_1 [shape=box color=blue]
Add_2 [shape=ellipse color=black]
Abs_3 [shape=ellipse color=black]
Negative_4 [shape=ellipse color=black]
Negative_5 [shape=ellipse color=black]
}
digraph G {
Parameter_0 -> Add_2
Parameter_1 -> Add_2
Add_2 -> Negative_3 -> Negative_4
Parameter_0 [shape=box color=blue]
Parameter_1 [shape=box color=blue]
Add_2 [shape=ellipse color=black]
Negative_3 [shape=ellipse color=black]
Negative_4 [shape=ellipse color=black]
}
digraph G {
Parameter_0 -> Sub_2
Parameter_1 -> Sub_2
Sub_2 -> Negative_3 -> Negative_4
Parameter_0 [shape=box color=blue]
Parameter_1 [shape=box color=blue]
Sub_2 [shape=ellipse color=black]
Negative_3 [shape=ellipse color=black]
Negative_4 [shape=ellipse color=black]
}
digraph G {
Parameter_1 -> Negative_2 -> Negative_3;
Parameter_1 [shape=box color=blue]
Negative_2 [shape=ellipse color=black]
Negative_3 [shape=ellipse color=black]
}
digraph G {
Label_0 -> Negative_1 -> Negative_2;
Label_0 [shape=ellipse color=black]
Negative_1 [shape=ellipse color=black]
Negative_2 [shape=ellipse color=black]
}
digraph G {
Parameter_0 -> Add_2
Constant_1 -> Add_2
Parameter_0 [shape=box color=blue]
Constant_1 [shape=ellipse color=black]
Add_2 [shape=ellipse color=black]
}
digraph G {
Parameter_0 -> Add_3
Constant_1 -> Broadcast_2
Broadcast_2 -> Add_3
Parameter_0 [shape=box color=blue]
Constant_1 [shape=ellipse color=black]
Broadcast_2 [shape=ellipse color=black]
Add_3 [shape=ellipse color=black]
}
digraph G {
Parameter_0 -> Add_2
Constant_1 -> Broadcast_2
Constant_1 -> Add_3
Parameter_0 [shape=box color=blue]
Constant_1 [shape=ellipse color=black]
Broadcast_2 [shape=ellipse color=black]
Add_3 [shape=ellipse color=black]
}
digraph G {
Constant_1 -> Skip_2
Label_3 -> Add_4
Skip_2 -> Add_4
Constant_1 [shape=ellipse color=black]
Skip_2 [shape=ellipse color=black]
Label_3 [shape=ellipse color=black]
Add_4 [shape=ellipse color=black]
}
digraph G {
Parameter_0 -> Add_2
Constant_1 -> Add_2
Add_2 -> Add_3
Constant_2 -> Add_3
Add_3 -> Add_4
Constant_3 -> Add_4
Parameter_0 [shape=box color=blue]
Constant_1 [shape=ellipse color=black]
Constant_2 [shape=ellipse color=black]
Constant_3 [shape=ellipse color=black]
Add_2 [shape=ellipse color=black]
Add_3 [shape=ellipse color=black]
Add_4 [shape=ellipse color=black]
}
digraph G {
Label_0 -> Add_2
Constant_1 -> Add_2
Label_0 [shape=ellipse color=black]
Constant_1 [shape=ellipse color=black]
Add_2 [shape=ellipse color=black]
}
.. fusion/optimize-graphs:
Optimize Graphs
===============
with nGraph Compiler fusions
----------------------------
The nGraph Compiler is an optimizing compiler. As such, it provides a way to
capture a given :term:`function graph` and perform a series of optimization
passes over that graph. The result is a semantically-equivalent graph that, when
executed using any :doc:`backend <../backend-support/index>`, has optimizations
inherent at the hardware level: superior runtime characteristics to increase
training performance or reduce inference latency.
There are several ways to describe what happens when we capture and translate
the framework's output of ops into an nGraph graph. :term:`Fusion` is the term
we shall use in our documentation; the action also can be described as:
*combining*, *folding*, *squashing*, *collapsing*, or *merging* of graph
functions.
Optimization passes may include algebraic simplifications, domain-specific
simplifications, and fusion. Most passes share the same mode of operation (or
the same operational structure) and consist of various stages (each one a
:term:`step`) where a developer can experiment with the intercepted or dynamic
graph. These steps may be cycled or recycled as needed:
#. Locate a list of potentially-transformable subgraphs in the given graph.
#. Transform the selected candidates into semantically-equivalent subgraphs
that execute faster, or with less memory (or both).
#. Verify that the optimization pass performs correctly, with any or all expected
transformations, with the ``NGRAPH_SERIALIZE_TRACING`` option, which
serializes a graph in the `json` format after a pass.
#. Measure and evaluate your performance improvements with ``NGRAPH_CPU_TRACING``,
which produces timelines compatible with ``chrome://tracing``.
Optimizations can be experimented upon without using any backend by registering
a pass with the pass manager (``Manager``), calling ``run_passes`` on a function, and
then inspecting the transformed graph.
Optimization passes can be programmed ahead of time if you know or can predict
what your graph will look like when it's ready to be executed (in other words:
which `ops` can be automatically translated into :doc:`nGraph Core ops <../ops/index>`).
The ``Interpreter`` is simply a backend providing reference implementations of
ngraph ops in C++, with the focus on simplicity over performance.
......@@ -41,6 +41,25 @@ Glossary
of a graph's functional operations (``ops``) into one or more of
nGraph's core ops.
ISA
An acronym for "Instruction Set Architecture," an ISA is machine code that
is compatible with the underlying silicon architecture. A realization of
an ISA is called an *implementation*. An ISA permits multiple
implementations that may vary in performance, physical size, memory use or
reuse, and monetary cost among other things. An ISA defines everything a
machine-language programmer needs to know in order to program a particular
backend device. What an ISA defines will differ among ISAs; in general, it
defines things like:
- supported *data types*;
- physical *states* available, such as the main memory and registers;
- *semantics*, such as the memory consistency and addressing modes;
- *low-level machine instructions* that comprise a machine language;
- and the *input/output model*.
Be careful to not confuse ISAs with microarchitectures.
op
An op represents an operation. Ops are stateless and have zero
......
This diff is collapsed.
......@@ -137,7 +137,7 @@ update computation for ``N`` will be given by the node
The different update nodes will share intermediate computations. So to
get the updated values for the weights as computed with the specified
:doc:`backend <../programmable/index>`,
:doc:`backend <../backend-support/index>`.
.. literalinclude:: ../../../examples/mnist_mlp/mnist_mlp.cpp
:language: cpp
......
......@@ -6,11 +6,11 @@ Train using multiple nGraph CPU backends with data parallel
In the :doc:`previous section <../howto/derive-for-training>`, we described the
steps needed to create a "trainable" nGraph model. Here we demonstrate how to
train a data parallel model by distributing the graph across devices.
train a data parallel model by distributing the graph to more than one device.
As of release version 0.12, the default build is with OpenMPI. To use the
`Intel MLSL`_ library, set the following compilation flag at build time:
To use this mode of training, create an nGraph build with the cmake flag
``-DNGRAPH_DISTRIBUTED_ENABLE=TRUE``.
To deploy data-parallel training on backends supported by nGraph API, the
......@@ -20,14 +20,13 @@ To deploy data-parallel training on backends supported by nGraph API, the
.. literalinclude:: ../../../examples/mnist_mlp/dist_mnist_mlp.cpp
:language: cpp
:lines: 180-196
:emphasize-lines: 9-12
:emphasize-lines: 8-11
We need to initialize and finalize distributed training with ``Distributed`` object;
see the `full raw code`_.
Finally, to run the training using two nGraph devices, invoke :command:`mpirun` which is a distributed with
`Intel MLSL`_ library.
This will launch two nGraph CPU backends.
Finally, to run the training using two nGraph devices, invoke :command:`mpirun` which
is distributed with the `Intel MLSL`_ library. This will launch two nGraph CPU backends.
.. code-block:: console
......
......@@ -18,9 +18,9 @@ directory. We'll be deconstructing the steps that must happen (either programmat
or manually) in order to successfully execute a computation:
* :ref:`define_cmp`
* :ref:`specify_bkd`
* :ref:`specify_backend`
* :ref:`compile_cmp`
* :ref:`allocate_bkd_storage`
* :ref:`allocate_backend_storage`
* :ref:`initialize_inputs`
* :ref:`invoke_cmp`
* :ref:`access_outputs`
......@@ -108,7 +108,7 @@ function, in the order they are to be passed to the compiled function. A
the computation of the results.
.. _specify_bkd:
.. _specify_backend:
Specify the backend upon which to run the computation
=====================================================
......@@ -126,10 +126,10 @@ There are two backends for the CPU: the optimized ``"CPU"`` backend, which uses
the `Intel MKL-DNN`_, and the ``"INTERPRETER"`` backend, which runs reference
versions of kernels that favor implementation clarity over speed. The
``"INTERPRETER"`` backend can be slow, and is primarily intended for testing.
See the documentation on :doc:`runtime options for various backends <../programmable/index>`
See the documentation on :doc:`runtime options for various backends <../backend-support/index>`
for additional details.
To continue with our original example and select the ``"CPU"`` backend:
To continue with our original example and select the ``"CPU_Backend"``:
.. literalinclude:: ../../../examples/abc/abc.cpp
:language: cpp
......@@ -149,7 +149,7 @@ thread needs to execute the function at the same time, create multiple
``CallFrame`` objects from the ``ExternalFunction``.
.. _allocate_bkd_storage:
.. _allocate_backend_storage:
Allocate backend storage for the inputs and outputs
===================================================
......@@ -175,7 +175,8 @@ the three parameters and the return value.
Each tensor is a shared pointer to a :term:`Tensorview`, which is the interface
backends implement for tensor use. When there are no more references to the
tensor view, it will be freed when convenient for the backend. See the
:doc:`../programmable/index` documentation for details on ``Tensor``.
:doc:`../backend-support/cpp-api` documentation for details on how to work
with ``Tensor``.
.. _initialize_inputs:
......
.. howto/index:
How to
======
Constructing Graphs
===================
.. toctree::
:maxdepth: 1
......
......@@ -17,126 +17,75 @@
.. This documentation is available online at
.. https://ngraph.nervanasys.com/docs/latest
######################
nGraph Compiler stack
######################
Welcome
=======
See the latest :doc:`project/release-notes`.
nGraph is an open-source C++ library, compiler stack, and runtime accelerator
for software and neural network engineering within the :abbr:`Deep Learning (DL)`
ecosystem. nGraph simplifies development and makes it possible to design, write,
compile, and deploy :abbr:`Deep Neural Network (DNN)`-based solutions that can
be adapted and deployed across many frameworks and backends. See our project
:doc:`project/about` and `ecosystem`_ for more details.
.. figure:: graphics/ngcompiler-ecosystem.png
:width: 650px
:alt: ecosystem
The Intel nGraph Compiler stack supports a broad ecosystem of frameworks and backends.
.. _quickstart:
Quick Start
===========
We have many documentation pages to help you get started.
* **TensorFlow or MXNet users** can get started with
:doc:`framework-integration-guides`.
* `TensorFlow bridge to nGraph`_
* `Compiling MXNet with nGraph`_
.. note:: Note that the ``pip`` package option works only with Ubuntu 16.04
or greater and Intel® Xeon® CPUs. CPUs without Intel® Advanced Vector Extensions
512 (Intel® AVX-512) will not run these packages; the alternative is to
build from source. Wider support for other CPUs will be offered starting
in early 2019.
* **Data scientists** interested in the `ONNX`_ format will find the
`nGraph ONNX companion tool`_ of interest.
* **Framework authors and architects** will likely want to :doc:`buildlb`
and learn how nGraph can be used to :doc:`howto/execute`. For examples
of generic configurations or optimizations available when designing or
bridging a framework directly with nGraph, see :doc:`frameworks/index`.
* To start learning about nGraph's set of **Core ops** and how they can
be used with Ops from other frameworks, go to :doc:`ops/index`.
* **Optimization pass writers** will find :doc:`fusion/index` useful. Also
look for our upcoming documentation on :term:`quantization`.
* For details about **PlaidML integration** and other nGraph runtime APIs,
see the section :doc:`programmable/index`.
.. csv-table::
:header: "Framework", "Bridge Available?", "ONNX Support?"
:widths: 27, 10, 10
TensorFlow, Yes, Yes
MXNet, Yes, Yes
PaddlePaddle, Coming Soon, Yes
PyTorch, No, Yes
Other, Write your own, Custom
.. toctree::
:maxdepth: 1
.. csv-table::
:header: "Backend", "Current support", "Future nGraph support"
:widths: 35, 10, 10
project/introduction.rst
Intel® Architecture Processors (CPUs), Yes, Yes
Intel® Nervana™ Neural Network Processor (NNPs), Yes, Yes
Intel® Architecture GPUs, Yes, Yes
AMD\* GPUs, via PlaidML, Yes
:abbr:`Field Programmable Gate Arrays (FPGA)` (FPGAs), Coming soon, Yes
NVIDIA\* GPUs, via PlaidML, Some
Intel Movidius™ Myriad™ 2 (VPU), Coming soon, Yes
.. toctree::
:maxdepth: 1
:caption: Framework Support
frameworks/index.rst
frameworks/validated/list.rst
frameworks/generic-configs.rst
.. note:: The code in this repo is under active development as we're continually
adding support for more kinds of DL models and ops, compiler optimizations,
and backend optimizations.
.. toctree::
:maxdepth: 1
:caption: nGraph Core
core/overview.rst
Pattern matcher <fusion/index.rst>
nGraph ops <ops/about.rst>
Graph construction <howto/index.rst>
Using the Python API <python_api/index.rst>
Compiler passes <fusion/graph-rewrite.rst>
buildlb.rst
Using the C++ API <ops/index.rst>
.. toctree::
:maxdepth: 1
:caption: Backend support
=======
backend-support/index.rst
backend-support/cpp-api.rst
Contents
========
.. toctree::
:maxdepth: 1
:caption: Python Ops for ONNX
:caption: Distributed training
distr/index.rst
python_api/index.rst
.. toctree::
:maxdepth: 1
:caption: Core Documentation
:caption: Diagnostics and visualization
diagnostics/nbench.rst
diagnostics/performance-profile.rst
diagnostics/visualize.rst
diagnostics/debug.rst
buildlb.rst
framework-integration-guides.rst
frameworks/validation.rst
frameworks/index.rst
graph-basics.rst
howto/index.rst
ops/about.rst
ops/index.rst
fusion/index.rst
programmable/index.rst
distr/index.rst
.. toctree::
:maxdepth: 2
:maxdepth: 1
:caption: Project Metadata
project/index.rst
project/release-notes.rst
project/contribution-guide.rst
project/index.rst
glossary.rst
Indices and tables
==================
......
.. about:
Architecture, Features, FAQs
############################
......@@ -168,7 +167,7 @@ added with new functions that build sub-graphs from existing core ops.
For a more detailed dive into how custom bridge code can be implemented, see our
documentation on how to :doc:`../howto/execute`. To learn how TensorFlow and
MXNet currently make use of custom bridge code, see the section on
:doc:`../framework-integration-guides`.
:doc:`../frameworks/index`.
.. figure:: ../graphics/bridge-to-graph-compiler.png
:width: 733px
......
.. code-contributor-README:
.. contribution-guide:
######################
Code Contributor Guide
######################
##################
Contribution Guide
##################
License
=======
......@@ -12,6 +12,7 @@ preferably by being contributed under the Apache 2 license. Code
contributed with another license will need the license reviewed by
Intel before it can be accepted.
Code formatting
===============
......@@ -259,5 +260,8 @@ it is automatically enforced and reduces merge conflicts.
int* z;
To contribute documentation for your code, please see the :doc:`doc-contributor-README`.
.. _Apache 2: https://www.apache.org/licenses/LICENSE-2.0
.. _repo wiki: https://github.com/NervanaSystems/ngraph/wiki
\ No newline at end of file
......@@ -99,7 +99,7 @@ and the generated output will show readers of your helpful documentation
:caption: "caption for a block of code that initializes tensors"
Our documentation practices are designed around "write once, reuse" that we can
use to prevent code bloat. See the :doc:`code-contributor-README` for our code
use to prevent code bloat. See the :doc:`contribution-guide` for our code
style guide.
......@@ -127,7 +127,7 @@ To build documentation locally, run:
.. code-block:: console
$ sudo apt-get install python3-sphinx
$ sudo apt-get install python3-sphinx
$ pip3 install [-I] Sphinx==1.7.5 [--user]
$ pip3 install [-I] breathe numpy [--user]
$ cd doc/sphinx/
......@@ -165,3 +165,6 @@ stable reST documentation.
.. _wiki: https://github.com/NervanaSystems/ngraph/wiki/
.. _breathe: https://breathe.readthedocs.io/en/latest/
.. _doxygen: http://www.doxygen.org/index.html
.. 45555555555555555555555555555
\ No newline at end of file
.. governance:
Governance
##########
\ No newline at end of file
.. project/index.rst
.. project/index:
#################
More about nGraph
......@@ -7,9 +7,11 @@ More about nGraph
This section contains documentation about the project and how to contribute.
.. toctree::
:maxdepth: 1
:maxdepth: 2
about.rst
release-notes.rst
code-contributor-README.rst
about.rst
other-efforts.rst
contribution-guide.rst
governance.rst
doc-contributor-README.rst
.. introduction:
############
Introduction
############
The nGraph Compiler stack provides industry-standard reference and implementation
guidelines for working with various :abbr:`Deep Learning (DL)` (DL) models and
optimizing an :abbr:`Artificial Neural Network (ANN)` (often abbreviated :term:`NN`)
to run graph-based computations for training, inference, testing, or validation.
Because today's NNs make use of many custom-purpose devices (FPGAs, GPUs, CPUs,
and custom silicon), having such a standard simplifies what would otherwise be
an enormously complex and difficult-to-scale pipeline (:ref:`Figure 3 <figure-3>`)
from "training with your favorite framework using GPUs" (:ref:`Figure 4 <figure-4>`),
to deploying that (now) pre-trained model in a datacenter or production
environment, where infrastructure owners or software developers renting anything
in a datacenter ought to be mutually concerned with **efficiency per-watt**, to
keep costs in check.
So what exactly are the motivations behind the nGraph Compiler stack?
Motivations
===========
Kernel libraries do not support graph-level optimizations
---------------------------------------------------------
A framework designed for training using GPUs requires integration with a kernel
library unique to that vendor's hardware. For example, after integration, a
kernel library can run operations that it is "familiar" with optimally; however,
the graph itself within any larger :term:`NN` won't be optimal.
.. _figure-0:
.. figure:: ../graphics/framework-to-kernel-lib.png
:width: 555px
:alt:
Figure 0: Lack of graph-level optimization makes framework-to-kernel library
integration enormously inefficient. The computation graph above represents
the computation: "A plus B times C".
.. _figure-1:
.. figure:: ../graphics/framework-to-graph-opt.png
:width: 555px
:alt:
Figure 1: Notice that an operation on the constant B (in this case a ``Broadcast``)
can be done at compile time. This is an example of constant folding, and it
is not available to a device-based kernel library.
.. _figure-2:
.. figure:: ../graphics/ngraph-algebraic-simp.png
:width: 555px
:alt:
Figure 2: Finally, notice that the constant has value "zero"; thus the add is an
*identity* operation and can be eliminated. This is an example of **Algebraic
simplification**, and it is not available to a device-based kernel library.
After the two graph-level optimizations above (**Algebraic Simplification** and
**Constant Folding**), we now have an optimal graph: A times C. Again, kernel
libraries do not support this type of optimization. Although each implementation
can be done individually, it will eventually yield an "exploding" number of
kernels the larger and more complex an :abbr:`NN (Neural Network)` becomes. For
some insight on why this happens, see the next section.
Too Many Kernels to write
-------------------------
A typical network is constructed using some kind of language-based API, which
translates the network or :abbr:`DL (Deep Learning)` model (statically or
dynamically) into serialized graphs. Those graphs can then be passed through a
compilation process (the *Graph optimization or compilation* step in
*Figure 3* below), where various graph-level optimizations, like constant folding
or fusion can happen. These processes require unique vendor-provided libraries
to communicate with a driver (possibly through OpenCL\*, CUDA\*, or SYCL\*), to
compile and execute an implementation (kernel) for a specific
:abbr:`Instruction Set Architecture (ISA)`, or :term:`ISA`.
Illustrated below is a simplified DL stack, showing relative complexity of
each component. Note that optimizing for any one on its own usually requires
engineering expertise that can be highly specialized to that component, and that
the terms have been simplified for illustrative purposes.
.. _figure-3:
.. figure:: ../graphics/components-dl-stack.png
:width: 700px
:alt: A simplified DL stack
Figure 3: Components of a DL stack, simplified for illustrative purposes.
There are many deep learning frameworks, each with its own strengths and user
bases. A setup that is common to many DL practitioners is shown in the
illustration below.
.. _figure-4:
.. figure:: ../graphics/a-common-stack.png
:width: 700px
:alt: A common implementation
Figure 4: A commonly-implemented stack uses TensorFlow\* as the frontend.
The input is either optimized via Grappler, or executed directly via TensorFlow.
In either case, when targeting an Nvidia\* GPU, cuDNN is called to select an
optimal kernel for the operation; cuDNN then relies on CUDA\* or direct access
to run code on the target; in this toy example, the target is a V100.
A natural result of this approach is that the framework-level integration of
kernel libraries does not scale. Rather, each individual framework must be
manually integrated with each hardware-specific kernel library. Each integration
is unique to the framework and its set of deep learning operators, its view on
memory layout, its feature set, etc. Each of these connections, then, represents
significant work for what will ultimately be a brittle setup that is enormously
expensive to maintain.
.. _figure-5:
.. figure:: ../graphics/dl-current-state.png
:width: 700px
:alt: Scalability matters
Figure 5: The number of kernels necessary to achieve optimal performance is
bounded by the product of the number of chip designs one wishes to support,
the number of data types supported, the number of operations, and the
cardinality of each parameter for each operation.
In the past, this upper bound was quite limited; however, since the industry is
shifting toward a more diverse future in terms of deep learning hardware, the
number of distinct kernels is exploding and will continue to explode.
Get the best of both worlds
---------------------------
Integrating a framework on nGraph can be an attractive option for hardware
companies trying to design their own deep learning hardware or network architecture.
Framework integration is a non-trivial amount of work, and nGraph automatically
does much of the heavy lifting. Furthermore, PlaidML can provide a wide range of
hardware coverage and optimization automatically. Any hardware that supports
LLVM, OpenCL, OpenGL, CUDA or Metal can be supported automatically with PlaidML
and nGraph.
.. _figure-6:
.. figure:: ../graphics/graph-compilers-at-a-glance.png
:width: 700px
:alt: Overview of various graph and tensor compilers.
Figure 6: Overview of various graph and tensor compilers.
.. _figure-7:
.. figure:: ../graphics/tensor-compilers-at-a-glance.png
:width: 700px
:alt: A closer look at tensor compilers.
Figure 7: A closer look at tensor compilers.
.. _other-efforts:
Other notable efforts
=====================
......@@ -214,3 +214,13 @@ size_t Function::get_graph_size() const
}
return total_size;
}
size_t Function::get_placement() const
{
return m_placement;
}
void Function::set_placement(size_t placement)
{
m_placement = placement;
}
......@@ -90,6 +90,9 @@ namespace ngraph
/// graphs and should not be considered the actual memory consumption of a graph.
size_t get_graph_size() const;
size_t get_placement() const;
void set_placement(size_t placement);
protected:
ResultVector m_results;
ParameterVector m_parameters;
......@@ -104,5 +107,6 @@ namespace ngraph
size_t m_instance_id;
std::string m_name;
const std::string m_unique_name;
size_t m_placement;
};
}
......@@ -17,7 +17,7 @@
add_library(hybrid_base STATIC
hybrid_backend.cpp
hybrid_util.cpp
pass/assign_placement.cpp
pass/default_placement.cpp
pass/dump.cpp
pass/fix_get_output_element.cpp
pass/liveness.cpp
......
......@@ -20,7 +20,7 @@
#include "ngraph/pass/visualize_tree.hpp"
#include "ngraph/runtime/host_tensor.hpp"
#include "ngraph/runtime/hybrid/hybrid_util.hpp"
#include "ngraph/runtime/hybrid/pass/assign_placement.hpp"
#include "ngraph/runtime/hybrid/pass/default_placement.hpp"
#include "ngraph/runtime/hybrid/pass/dump.hpp"
#include "ngraph/runtime/hybrid/pass/fix_get_output_element.hpp"
#include "ngraph/runtime/hybrid/pass/liveness.hpp"
......@@ -74,7 +74,7 @@ runtime::Handle runtime::hybrid::HybridBackend::compile(shared_ptr<Function> fun
// Run placement pass
ngraph::pass::Manager pass_manager;
pass_manager.register_pass<runtime::hybrid::pass::AssignPlacement>(m_backend_list);
pass_manager.register_pass<runtime::hybrid::pass::DefaultPlacement>(m_backend_list);
pass_manager.register_pass<runtime::hybrid::pass::FixGetOutputElement>();
pass_manager.register_pass<runtime::hybrid::pass::Liveness>();
pass_manager.register_pass<runtime::hybrid::pass::Dump>("graph.dump");
......@@ -94,7 +94,7 @@ runtime::Handle runtime::hybrid::HybridBackend::compile(shared_ptr<Function> fun
size_t subfunction_number = 0;
for (shared_ptr<Function>& sub_function : instance.m_sub_functions)
{
size_t placement = runtime::hybrid::get_colocated_function_placement(sub_function);
size_t placement = sub_function->get_placement();
if (m_debug_enabled)
{
string name = "subfunction_" + to_string(subfunction_number++);
......@@ -149,7 +149,7 @@ bool runtime::hybrid::HybridBackend::call(shared_ptr<Function> func,
for (const shared_ptr<Function>& sub_function : instance.m_sub_functions)
{
// Init backend
size_t placement = runtime::hybrid::get_colocated_function_placement(sub_function);
size_t placement = sub_function->get_placement();
auto backend = m_backend_list[placement];
// Prepare parameter Tensors
......
......@@ -15,6 +15,7 @@
//*****************************************************************************
#include "ngraph/runtime/hybrid/hybrid_util.hpp"
#include "ngraph/log.hpp"
#include "ngraph/pass/manager.hpp"
#include "ngraph/pass/visualize_tree.hpp"
......@@ -25,20 +26,20 @@ static Node* take_independent_node_with_placement_priority(
map<size_t, deque<Node*>>& independent_nodes_by_placement, size_t placement)
{
Node* selected_node = nullptr;
if (independent_nodes_by_placement.find(placement) != independent_nodes_by_placement.end() &&
independent_nodes_by_placement.at(placement).size() != 0)
auto it = independent_nodes_by_placement.find(placement);
if (it != independent_nodes_by_placement.end() && it->second.size() != 0)
{
selected_node = independent_nodes_by_placement.at(placement).front();
independent_nodes_by_placement.at(placement).pop_front();
selected_node = it->second.front();
it->second.pop_front();
}
else
{
for (auto& it : independent_nodes_by_placement)
for (auto& p : independent_nodes_by_placement)
{
if (it.second.size() > 0)
if (p.second.size() > 0)
{
selected_node = it.second.front();
it.second.pop_front();
selected_node = p.second.front();
p.second.pop_front();
break;
}
}
......@@ -238,8 +239,10 @@ pair<vector<shared_ptr<Function>>, unordered_map<shared_ptr<op::Parameter>, shar
{
ParameterVector par_vector;
ResultVector res_vector;
size_t placement = -1;
for (auto node : cluster)
{
placement = node->get_placement_index();
if (auto res_node = dynamic_pointer_cast<op::Result>(node))
{
res_vector.push_back(res_node);
......@@ -250,6 +253,7 @@ pair<vector<shared_ptr<Function>>, unordered_map<shared_ptr<op::Parameter>, shar
}
}
auto sub_function = make_shared<Function>(res_vector, par_vector);
sub_function->set_placement(placement);
sub_functions.push_back(sub_function);
#ifdef HYBRID_DEBUG
ngraph::pass::Manager pass_manager;
......@@ -261,26 +265,3 @@ pair<vector<shared_ptr<Function>>, unordered_map<shared_ptr<op::Parameter>, shar
return make_pair(sub_functions, map_parameter_to_result);
}
// Verify that all nodes in the function are colocated (share one placement index) and
// return that placement.
//
// Throws ngraph_error when the function has no ops, when any op reports
// Node::placement_invalid, or when two ops report different placement indices.
size_t runtime::hybrid::get_colocated_function_placement(shared_ptr<Function> func)
{
    const auto ops = func->get_ops();
    // Calling front() on an empty container is undefined behavior, so reject it explicitly.
    if (ops.empty())
    {
        throw ngraph_error("Function contains no ops; cannot determine its placement");
    }

    // The first op's placement is not validated here; the loop below visits that same op
    // and performs the placement_invalid check on it.
    const size_t function_placement = ops.front()->get_placement_index();

    // Iterate by const reference so each shared_ptr is not copied per iteration.
    for (const auto& op : ops)
    {
        const size_t node_placement = op->get_placement_index();
        if (node_placement == Node::placement_invalid)
        {
            throw ngraph_error("Node " + op->get_name() + " should have a device placement");
        }
        if (function_placement != node_placement)
        {
            throw ngraph_error("Function contains nodes of two different placements");
        }
    }

    return function_placement;
}
......@@ -35,9 +35,6 @@ namespace ngraph
std::vector<std::shared_ptr<Function>>,
std::unordered_map<std::shared_ptr<op::Parameter>, std::shared_ptr<op::Result>>>
split_function_by_placement(const std::shared_ptr<Function>& f);
// Assert that nodes in the function is colocated and return that placement
size_t get_colocated_function_placement(std::shared_ptr<Function> func);
}
}
}
......@@ -14,7 +14,7 @@
// limitations under the License.
//*****************************************************************************
#include "ngraph/runtime/hybrid/pass/assign_placement.hpp"
#include "ngraph/runtime/hybrid/pass/default_placement.hpp"
#include "ngraph/log.hpp"
#include "ngraph/node.hpp"
#include "ngraph/placement.hpp"
......@@ -23,13 +23,13 @@
using namespace ngraph;
using namespace std;
runtime::hybrid::pass::AssignPlacement::AssignPlacement(
runtime::hybrid::pass::DefaultPlacement::DefaultPlacement(
const vector<shared_ptr<runtime::Backend>>& placement_backends)
: m_placement_backends(placement_backends)
{
}
bool runtime::hybrid::pass::AssignPlacement::run_on_node(shared_ptr<Node> node)
bool runtime::hybrid::pass::DefaultPlacement::run_on_node(shared_ptr<Node> node)
{
size_t backend_index = 0;
for (auto backend : m_placement_backends)
......
......@@ -30,16 +30,16 @@ namespace ngraph
{
namespace pass
{
class AssignPlacement;
class DefaultPlacement;
}
}
}
}
class ngraph::runtime::hybrid::pass::AssignPlacement : public ngraph::pass::NodePass
class ngraph::runtime::hybrid::pass::DefaultPlacement : public ngraph::pass::NodePass
{
public:
AssignPlacement(
DefaultPlacement(
const std::vector<std::shared_ptr<ngraph::runtime::Backend>>& placement_backends);
private:
......
......@@ -56,7 +56,7 @@ TEST(HYBRID, abc)
auto t1 = A * B;
auto t2 = t1 * D;
auto C = make_shared<op::Parameter>(element::f32, shape);
auto f = make_shared<Function>((t2 + C) * t1, ParameterVector{A, B, C, D});
auto f = make_shared<Function>(((t2 + C) + A) * t1, ParameterVector{A, B, C, D});
shared_ptr<runtime::Backend> backend = runtime::Backend::create("H1");
static_pointer_cast<runtime::hybrid::HybridBackend>(backend)->set_debug_enabled(true);
......@@ -75,5 +75,5 @@ TEST(HYBRID, abc)
auto handle = backend->compile(f);
backend->call_with_validate(handle, {result}, {a, b, c, d});
EXPECT_EQ(read_vector<float>(result), (vector<float>{145, 552, 1113, 1408}));
EXPECT_EQ(read_vector<float>(result), (vector<float>{150, 576, 1176, 1536}));
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment