Merge branch 'tfl/gpu_fix_constant_bug' of github.com:NervanaSystems/ngraph-cpp…

Merge branch 'tfl/gpu_fix_constant_bug' of github.com:NervanaSystems/ngraph-cpp into tfl/gpu_fix_constant_bug

Merge branch 'tfl/gpu_fix_constant_bug' of github.com:NervanaSystems/ngraph-cpp…
Merge branch 'tfl/gpu_fix_constant_bug' of github.com:NervanaSystems/ngraph-cpp into tfl/gpu_fix_constant_bug
ee463b66 · fenglei.tian · 24b72581 · 20e2a098 · ee463b66 · ee463b66
Commit ee463b66 authored Mar 06, 2018 by fenglei.tian
133 changed files
--- a/contrib/docker/Dockerfile.ngraph_cpp
+++ b/contrib/docker/Dockerfile.ngraph_cpp
-# Environment to build and unit-test private-ngraph-cpp
+# Environment to build and unit-test ngraph-cpp

 FROM ubuntu:16.04


--- a/contrib/docker/Dockerfile.ngraph_cpp.centos74_cmake3
+++ b/contrib/docker/Dockerfile.ngraph_cpp.centos74_cmake3
-# Environment to build and unit-test private-ngraph-cpp on centos74
+# Environment to build and unit-test ngraph-cpp on centos74
 # with gcc 4.8.5
 # with python 2.7
 # with cmake3

--- a/contrib/docker/Dockerfile.ngraph_cpp.ubuntu1604
+++ b/contrib/docker/Dockerfile.ngraph_cpp.ubuntu1604
-# Environment to build and unit-test private-ngraph-cpp
+# Environment to build and unit-test ngraph-cpp

 FROM ubuntu:16.04


--- a/contrib/docker/Dockerfile.ngraph_cpp.ubuntu1604_gcc48
+++ b/contrib/docker/Dockerfile.ngraph_cpp.ubuntu1604_gcc48
-# Environment to build and unit-test private-ngraph-cpp
+# Environment to build and unit-test ngraph-cpp

 FROM ubuntu:16.04


--- a/contrib/docker/Makefile
+++ b/contrib/docker/Makefile
 # Basic Makefile for contrib/docker. This can be expanded later as more targets
 # are added.

-# Default is to build with -j for parallel cmake/make.  Turn off with
-#   make PARALLEL=
-PARALLEL=-j
+# Building LLVM from source has been observed to trigger the oom-killer
+#    on systems with a large number of cores
+#    running with make -j
+#
+# Default is to build with -j 22 for parallel cmake/make.
+# Override with make PARALLEL="-j <num_parallel_processes>" where
+#    <num_parallel_processes> = the number of make processes to run in parallel
+# Turn off with make PARALLEL=
+PARALLEL=-j 22 

 # DIR is an internal variable that serves as an anchor to this cloned git
 # repository.  DIR is mounted into the docker container, so that builds

--- a/doc/CMakeLists.txt
+++ b/doc/CMakeLists.txt
@@ -13,7 +13,9 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ******************************************************************************
- 
+
+add_subdirectory(examples)
+
 if ("${NGRAPH_BUILD_DOCS}" MATCHES "^ON$")
    add_custom_target( docs
        COMMENT "Build all of the documentation types selected during CMake configuration."

--- a/doc/doxygen/xml/combine.xslt
+++ b/doc/doxygen/xml/combine.xslt
-<!-- XSLT script to combine the generated output into a single file. 
-     If you have xsltproc you could use:
-     xsltproc combine.xslt index.xml >all.xml
-->
-<xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform" version="1.0">
-  <xsl:output method="xml" version="1.0" indent="no" standalone="yes" />
-  <xsl:template match="/">
-    <doxygen version="{doxygenindex/@version}">
-      <!-- Load all doxgen generated xml files -->
-      <xsl:for-each select="doxygenindex/compound">
-        <xsl:copy-of select="document( concat( @refid, '.xml' ) )/doxygen/*" />
-      </xsl:for-each>
-    </doxygen>
-  </xsl:template>
-</xsl:stylesheet>
--- a/doc/doxygen/xml/compound.xsd
+++ b/doc/doxygen/xml/compound.xsd
--- a/doc/doxygen/xml/index.xsd
+++ b/doc/doxygen/xml/index.xsd
-<?xml version='1.0' encoding='utf-8' ?>
-<xsd:schema xmlns:xsd="http://www.w3.org/2001/XMLSchema">
-  <xsd:element name="doxygenindex" type="DoxygenType"/>
-
-  <xsd:complexType name="DoxygenType">
-    <xsd:sequence>
-      <xsd:element name="compound" type="CompoundType" minOccurs="0" maxOccurs="unbounded"/>
-    </xsd:sequence>
-    <xsd:attribute name="version" type="xsd:string" use="required"/>
-  </xsd:complexType>
-
-  <xsd:complexType name="CompoundType">
-    <xsd:sequence>
-      <xsd:element name="name" type="xsd:string"/>
-      <xsd:element name="member" type="MemberType" minOccurs="0" maxOccurs="unbounded"/>
-    </xsd:sequence>
-    <xsd:attribute name="refid" type="xsd:string" use="required"/>
-    <xsd:attribute name="kind" type="CompoundKind" use="required"/>
-  </xsd:complexType>
-
-  <xsd:complexType name="MemberType">
-    <xsd:sequence>
-      <xsd:element name="name" type="xsd:string"/>
-    </xsd:sequence>
-    <xsd:attribute name="refid" type="xsd:string" use="required"/>
-    <xsd:attribute name="kind" type="MemberKind" use="required"/>
-  </xsd:complexType>
-  
-  <xsd:simpleType name="CompoundKind">
-    <xsd:restriction base="xsd:string">
-      <xsd:enumeration value="class"/>
-      <xsd:enumeration value="struct"/>
-      <xsd:enumeration value="union"/>
-      <xsd:enumeration value="interface"/>
-      <xsd:enumeration value="protocol"/>
-      <xsd:enumeration value="category"/>
-      <xsd:enumeration value="exception"/>
-      <xsd:enumeration value="file"/>
-      <xsd:enumeration value="namespace"/>
-      <xsd:enumeration value="group"/>
-      <xsd:enumeration value="page"/>
-      <xsd:enumeration value="example"/>
-      <xsd:enumeration value="dir"/>
-    </xsd:restriction>
-  </xsd:simpleType>
-
-  <xsd:simpleType name="MemberKind">
-    <xsd:restriction base="xsd:string">
-      <xsd:enumeration value="define"/>
-      <xsd:enumeration value="property"/>
-      <xsd:enumeration value="event"/>
-      <xsd:enumeration value="variable"/>
-      <xsd:enumeration value="typedef"/>
-      <xsd:enumeration value="enum"/>
-      <xsd:enumeration value="enumvalue"/>
-      <xsd:enumeration value="function"/>
-      <xsd:enumeration value="signal"/>
-      <xsd:enumeration value="prototype"/>
-      <xsd:enumeration value="friend"/>
-      <xsd:enumeration value="dcop"/>
-      <xsd:enumeration value="slot"/>
-    </xsd:restriction>
-  </xsd:simpleType>
-
-</xsd:schema>
-
--- a/doc/examples/CMakeLists.txt
+++ b/doc/examples/CMakeLists.txt
+# ******************************************************************************
+# Copyright 2017-2018 Intel Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ******************************************************************************
+
+# Make the MKL-DNN library directory visible to the linker when MKL-DNN is
+# configured. link_directories() only applies to targets created after it is
+# called, so it must stay ahead of add_executable().
+if(MKLDNN_INCLUDE_DIR)
+    link_directories(${MKLDNN_LIB_DIR})
+endif()
+
+# The abc example requests the "CPU" backend, so it is only built when the
+# CPU backend is enabled.
+if (NGRAPH_CPU_ENABLE)
+    # abc.cpp lives in this directory; list it once (relative to the current
+    # source directory) rather than a second time by absolute path.
+    add_executable(abc abc.cpp)
+    add_dependencies(abc ngraph)
+
+    # Scope link and include requirements to the abc target only.
+    target_link_libraries(abc PRIVATE ngraph)
+    target_include_directories(abc SYSTEM PRIVATE ${JSON_INCLUDE_DIR})
+
+    # Pass the ngraph header location to abc.cpp as NGRAPH_HEADERS_PATH.
+    set(HEADER_SEARCH_DEFINES
+        "NGRAPH_HEADERS_PATH=\"${NGRAPH_INCLUDE_PATH}\""
+    )
+    set_source_files_properties(abc.cpp PROPERTIES COMPILE_DEFINITIONS "${HEADER_SEARCH_DEFINES}")
+
+endif()
--- a/doc/examples/abc.cpp
+++ b/doc/examples/abc.cpp
+/*******************************************************************************
+* Copyright 2017-2018 Intel Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*******************************************************************************/
+
+#include <iostream>
+
+#include <ngraph/ngraph.hpp>
+
+using namespace ngraph;
+
+int main()
+{
+    // Build the graph for (a + b) * c from three f32 parameters of shape s
+    Shape s{2, 3};
+    auto a = std::make_shared<op::Parameter>(element::f32, s);
+    auto b = std::make_shared<op::Parameter>(element::f32, s);
+    auto c = std::make_shared<op::Parameter>(element::f32, s);
+
+    auto t0 = std::make_shared<op::Add>(a, b);
+    auto t1 = std::make_shared<op::Multiply>(t0, c);
+
+    // Make the function: t1 is the result; a, b and c are its parameters
+    auto f = std::make_shared<Function>(NodeVector{t1}, op::ParameterVector{a, b, c});
+
+    // Get the backend from the manager registered under the name "CPU"
+    auto manager = runtime::Manager::get("CPU");
+    auto backend = manager->allocate_backend();
+
+    // Compile the function, then make a call frame from the compiled result
+    auto external = manager->compile(f);
+    auto cf = backend->make_call_frame(external);
+
+    // Allocate tensors: one per parameter plus one for the result
+    auto t_a = backend->make_primary_tensor_view(element::f32, s);
+    auto t_b = backend->make_primary_tensor_view(element::f32, s);
+    auto t_c = backend->make_primary_tensor_view(element::f32, s);
+    auto t_result = backend->make_primary_tensor_view(element::f32, s);
+
+    // Initialize tensors: fill 2x3 host arrays, then copy their bytes into the inputs
+    float v_a[2][3] = {{1, 2, 3}, {4, 5, 6}};
+    float v_b[2][3] = {{7, 8, 9}, {10, 11, 12}};
+    float v_c[2][3] = {{1, 0, -1}, {-1, 1, 2}};
+
+    t_a->write(&v_a, 0, sizeof(v_a));
+    t_b->write(&v_b, 0, sizeof(v_b));
+    t_c->write(&v_c, 0, sizeof(v_c));
+
+    // Invoke the function with the inputs in parameter order and t_result as the output
+    cf->call({t_a, t_b, t_c}, {t_result});
+
+    // Get the result: copy sizeof(r) bytes from t_result into the host array r
+    float r[2][3];
+    t_result->read(&r, 0, sizeof(r));
+
+    std::cout << "[" << std::endl; // print r as a bracketed matrix, one row per line
+    for (size_t i = 0; i < s[0]; ++i)
+    {
+        std::cout << " [";
+        for (size_t j = 0; j < s[1]; ++j)
+        {
+            std::cout << r[i][j] << ' ';
+        }
+        std::cout << ']' << std::endl;
+    }
+    std::cout << ']' << std::endl;
+
+    return 0;
+}
--- a/doc/sphinx/ngraph_theme/static/css/theme.css
+++ b/doc/sphinx/ngraph_theme/static/css/theme.css
@@ -1837,18 +1837,19 @@ div[class^='highlight'] td.code {
 }

 code, p.caption, caption-text { 
-  font-family: RobotoSlab, sans, monospace;
+  font-family: Inconsolata, sans, monospace;
  color: #A79992;
-  font-size: 0.95em;
-  line-height: 1.11em;
+  font-size: 0.99em;
+  line-height: 1.39em;
  }

  .code-block-caption { 
    font-variant: small-caps;
    font-size: 0.88em;
-    background-color: #c3d5d5;
+    background-color: #d0dfdf;
    padding-right: 0.43em;
    padding-top: 0.23em;
+    padding-left: 0.11em;
    padding-bottom: 0.23em;
    text-align: right; 
    }

--- a/doc/sphinx/source/framework-integration-guides.rst
+++ b/doc/sphinx/source/framework-integration-guides.rst
@@ -18,6 +18,7 @@ Compile MXNet\* with ``libngraph``
   If the |nGl| code has not yet been installed to your system, please go back
   and return here to finish compiling MXNet with ``libngraph``.

+
 #. Set the ``LD_LIBRARY_PATH`` path to the location where we built the nGraph 
   libraries:


--- a/doc/sphinx/source/glossary.rst
+++ b/doc/sphinx/source/glossary.rst
@@ -6,36 +6,94 @@ Glossary
 ========

 .. glossary::
+   :sorted:
+
+   backend
+
+      A component that can execute computations.
+
+   bridge
+
+      A component of nGraph that acts as a backend for a framework,
+      allowing the framework to define and execute computations.
+
+   framework
+
+      A machine learning environment, such as TensorFlow, MXNet, or
+      neon.

   function graph
-	     The Intel nGraph library uses a function graph to represent an ``op``'s
-	     parameters and results.
+
+      The Intel nGraph library uses a function graph to represent an
+      ``op``'s parameters and results.

   op
-      An op represents an operation. Ops are stateless and have zero or more 
-      inputs and zero or more outputs. Some ops have additional constant 
-      attributes. Every output of an op corresponds to a tensor and has an 
-      element type and a shape. The element types and shapes of the outputs of 
-      an op are determined by the inputs and attributes of the op.

-   tensors
-     Tensors are maps from *coordinates* to scalar values, all of the same type, 
-     called the *element type* of the tensor.
+      An op represents an operation. Ops are stateless and have zero
+      or more inputs and zero or more outputs. Some ops have
+      additional constant attributes. Every output of an op
+      corresponds to a tensor and has an element type and a shape. The
+      element types and shapes of the outputs of an op are determined
+      by the inputs and attributes of the op.

   parameter
-	    In the context of a function graph, a "parameter" refers to what "stands 
-      in" for an argument in an ``op`` definition.
+
+      In the context of a function graph, a "parameter" refers to what
+      "stands in" for an argument in an ``op`` definition.

   result
-       In the context of a function graph, the term "result" refers to what 
-       stands in for the returned value.
+
+      In the context of a function graph, the term "result" refers to
+      what stands in for the returned value.

   shape
-       The shape of a tensor is a tuple of non-negative integers that represents an  
-       exclusive upper bound for coordinate values.

+      The shape of a tensor is a tuple of non-negative integers that
+      represents an exclusive upper bound for coordinate values.
+
+   shared pointer
+
+      The C++ standard template library has the template
+      ``std::shared_ptr<X>``. A shared pointer is used like an ``X*``
+      pointer, but maintains a reference count to the underlying
+      object. Each new shared pointer to the object increases the
+      count. When a shared pointer goes out of scope, the reference
+      count is decremented, and, when the count reaches 0, the
+      underlying object is deleted. The function template
+      ``std::make_shared<X>(...)`` can be used similarly to ``new
+      X(...)``, except it returns a ``std::shared_ptr<X>`` instead of
+      an ``X*``.
+
+      If there is a chain of shared pointers from an object back to
+      itself, every object in the chain is referenced, so the
+      reference counts will never reach 0 and the objects will never
+      be deleted.
+
+      If ``a`` referenced ``b`` and ``b`` wanted to track all
+      references to itself and shared pointers were used both
+      directions, there would be a chain of pointers from ``a`` to
+      itself. We avoid this by using shared pointers in only one
+      direction, and raw pointers for the inverse
+      direction. ``std::enable_shared_from_this`` is a class template
+      that defines a method ``shared_from_this`` that provides a
+      shared pointer from a raw pointer.
+
+      nGraph makes use of shared pointers for objects whose lifetime
+      is hard to determine when they are allocated.
+   
   step
-       An abstract "action" that produces zero or more tensor outputs from zero or more tensor 
-       inputs. Steps correspond to *ops* that connect *nodes*.
+
+      An abstract "action" that produces zero or more tensor outputs
+      from zero or more tensor inputs. Steps correspond to *ops* that
+      connect *nodes*.
           
+   tensors
+
+      Tensors are maps from *coordinates* to scalar values, all of the
+      same type, called the *element type* of the tensor.
+
+   model description
+
+      A description of a program's fundamental operations that are 
+      used by a framework to generate inputs for computation.   

--- a/doc/sphinx/source/howto/execute.rst
+++ b/doc/sphinx/source/howto/execute.rst
+.. execute-cmp.rst
+
+######################
+Execute a Computation
+######################
+
+This section explains how to manually perform the steps that would normally be 
+performed by a framework :term:`bridge` to execute a computation. Intel® nGraph++ 
+library is targeted toward automatic construction; it is far easier for a 
+processing unit (GPU, CPU, or an `Intel Nervana NNP`_) to run a computation than 
+it is for a user to map out how that computation happens. Unfortunately, things 
+that make by-hand graph construction simpler tend to make automatic construction 
+more difficult, and vice versa.
+
+Here we will do all the bridge steps manually. The :term:`model description` 
+we're explaining is based on the :file:`abc.cpp` file in the ``/doc/examples/`` 
+directory. We'll be deconstructing the steps that an entity (framework or 
+user) must be able to carry out in order to successfully execute a computation:
+
+* :ref:`define_cmp`
+* :ref:`specify_bkd`
+* :ref:`compile_cmp`
+* :ref:`allocate_bkd_storage`
+* :ref:`initialize_inputs`
+* :ref:`invoke_cmp`
+* :ref:`access_outputs`
+
+The final code is at the :ref:`end of this page <all_together>`.
+
+
+.. _define_cmp:
+
+Define the computation
+======================
+
+To a :term:`framework`, a computation is simply a transformation of inputs to 
+outputs. While a *framework bridge* can programmatically construct the graph 
+from a framework's representation of the computation, graph construction can be 
+somewhat more tedious for users. To a user, who is usually interested in 
+specific nodes (vertices) or edges of a computation that reveal "what is 
+happening where", it can be helpful to think of a computation as a zoomed-out 
+and *stateless* dataflow graph where all of the nodes are well-defined tensor 
+operations and all of the edges denote use of an output from one operation as 
+an input for another operation.
+
+.. TODO
+
+.. image for representing nodes and edges of (a+b)*c
+
+
+Most of the public portion of the nGraph API is in the ``ngraph`` namespace, so 
+we will omit the namespace. Any namespace that appears other than ``std`` is 
+nested in ``ngraph``. For example, ``op::Add`` is assumed to refer to 
+``ngraph::op::Add``.
+
+A computation's graph is constructed from ops; each is a member of a subclass of 
+``op::Op``, which, in turn, is a subclass of ``Node``. Not all graphs are 
+computations, but all graphs are composed entirely of instances of ``Node``.  
+Computation graphs contain only ``op::Op`` nodes.
+
+We mostly use :term:`shared pointers<shared pointer>` for nodes, i.e.
+``std::shared_ptr<Node>`` so that they will be automatically
+deallocated when they are no longer needed. A brief summary of shared
+pointers is given in the glossary.
+
+Every node has zero or more *inputs*, zero or more *outputs*, and zero or more 
+*attributes*.  The specifics for each ``type`` permitted on a core ``Op``-specific 
+basis can be discovered in our :doc:`../ops/index` docs. For our 
+purpose to :ref:`define a computation <define_cmp>`, nodes should be thought of 
+as essentially immutable; that is, when constructing a node, we need to supply 
+all of its inputs. We get this process started with ops that have no inputs, 
+since any op that takes inputs needs those inputs to exist first.
+
+``op::Parameter`` specifies the tensors that will be passed to the computation. 
+They receive their values from outside of the graph, so they have no inputs. 
+They have attributes for the element type and the shape of the tensor that will 
+be passed to them.
+
+.. literalinclude:: ../../../examples/abc.cpp
+   :language: cpp
+   :lines: 26-29
+
+Here we have made three parameter nodes, each a 32-bit float of shape ``(2, 3)`` 
+using a row-major element layout.
+
+We can create a graph for ``(a+b)*c`` by creating an ``op::Add`` node with inputs 
+from ``a`` and ``b``, and an ``op::Multiply`` node from the add node and ``c``:
+
+.. literalinclude:: ../../../examples/abc.cpp
+   :language: cpp
+   :lines: 31-32
+
+When the ``op::Add`` op is constructed, it will check that the element types and 
+shapes of its inputs match; to support multiple frameworks, ngraph does not do 
+automatic type conversion or broadcasting. In this case, they match, and the 
+unique output of ``t0`` will be a 32-bit float tensor with shape ``(2, 3)``. 
+Similarly, ``op::Multiply`` checks that its inputs match and sets the element 
+type and shape of its unique output.
+
+Once the graph is built, we need to package it in a ``Function``:
+
+.. literalinclude:: ../../../examples/abc.cpp
+   :language: cpp
+   :lines: 35
+
+The first argument to the constructor specifies the nodes that the function will 
+return; in this case, the product. A ``NodeVector`` is a vector of shared 
+pointers to ``Node``.  The second argument specifies the parameters of the 
+function, in the order they are to be passed to the compiled function. A 
+``ParameterVector`` is a vector of shared pointers to ``op::Parameter``. 
+
+.. important:: The parameter vector must include **every** parameter used in 
+   the computation of the results.
+
+
+.. _specify_bkd:
+
+Specify the backend upon which to run the computation
+=====================================================
+
+For a framework bridge, a *backend* is the environment that can perform the 
+computations; it can be done with a CPU, GPU, or an Intel Nervana NNP. A 
+*transformer* can compile computations for a backend, allocate and deallocate 
+tensors, and invoke computations.
+
+Factory-like managers for classes of backends can compile a ``Function`` 
+and allocate backends. A backend is somewhat analogous to a multi-threaded
+process.
+
+There are two backends for the CPU: the optimized ``"CPU"`` backend, which uses 
+the `Intel MKL-DNN`_, and the ``"INTERPRETER"`` backend, which runs reference 
+versions of kernels that favor implementation clarity over speed. The 
+``"INTERPRETER"`` backend can be slow, and is primarily intended for testing.  
+
+To select the ``"CPU"`` backend,
+
+.. literalinclude:: ../../../examples/abc.cpp
+   :language: cpp
+   :lines: 38-39
+
+
+.. _compile_cmp:
+
+Compile the computation 
+=======================
+
+Compilation produces an ``ExternalFunction``, which serves as a factory for 
+producing a ``CallFrame``: a *function* and its associated *state* that can run 
+in a single thread at a time. A ``CallFrame`` may be reused, but any particular 
+``CallFrame`` must only be running in one thread at any time. If more than one 
+thread needs to execute the function at the same time, create multiple 
+``CallFrame`` objects from the ``ExternalFunction``.
+
+.. literalinclude:: ../../../examples/abc.cpp
+   :language: cpp
+   :lines: 42-43
+
+
+.. _allocate_bkd_storage:
+
+Allocate backend storage for the inputs and outputs
+===================================================
+
+At the graph level, functions are stateless. They do have internal state related 
+to execution, but there is no user-visible state. Variables must be passed as 
+arguments. If the function updates variables, it must return the updated 
+variables.
+
+To invoke a function, tensors must be provided for every input and every output. 
+At this time, a tensor used as an input cannot also be used as an output. If 
+variables are being updated, you should use a double-buffering approach where 
+you switch between odd/even generations of variables on each update.
+
+Backends are responsible for managing storage. If the storage is off-CPU, caches 
+are used to minimize copying between device and CPU. We can allocate storage for 
+the three parameters and return value as follows:
+
+.. literalinclude:: ../../../examples/abc.cpp
+   :language: cpp
+   :lines: 46-49
+
+Each tensor is a shared pointer to a ``runtime::TensorView``, the interface 
+backends implement for tensor use. When there are no more references to the 
+tensor view, it will be freed when convenient for the backend.
+
+.. _initialize_inputs:
+
+Initialize the inputs
+=====================
+
+Next we need to copy some data into the tensors.
+
+.. literalinclude:: ../../../examples/abc.cpp
+   :language: cpp
+   :lines: 52-58
+
+The ``runtime::TensorView`` interface has ``write`` and ``read`` methods for 
+copying data to/from the tensor.
+
+.. _invoke_cmp:
+
+Invoke the computation
+======================
+
+To invoke the function, we simply pass argument and resultant tensors to the 
+call frame:
+
+.. literalinclude:: ../../../examples/abc.cpp
+   :language: cpp
+   :lines: 61
+
+
+.. _access_outputs:
+
+Access the outputs
+==================
+
+We can use the ``read`` method to access the result:
+
+.. literalinclude:: ../../../examples/abc.cpp
+   :language: cpp
+   :lines: 64-65
+
+.. _all_together:
+
+Put it all together
+===================
+
+.. literalinclude:: ../../../examples/abc.cpp
+   :language: cpp
+   :caption: "The (a + b) * c example for executing a computation on nGraph"
+
+
+
+
+.. _Intel MKL-DNN: https://01.org/mkl-dnn
+.. _Intel Nervana NNP: https://ai.intel.com/intel-nervana-neural-network-processors-nnp-redefine-ai-silicon/
\ No newline at end of file
--- a/doc/sphinx/source/howto/index.rst
+++ b/doc/sphinx/source/howto/index.rst
+.. howto/index: 
+
+How to 
+======
+
+.. toctree::
+   :maxdepth: 1
+   :caption: How to 
+
+   execute.rst
+    
+
+The "How to" articles in this section explain how to do specific tasks with the 
+Intel nGraph++ library. The recipes are all framework agnostic; in other words, 
+if an entity (framework or user) wishes to make use of target-based computational 
+resources, it can either:
+
+* Do the tasks programmatically through the framework, or 
+* Provide a clear model definition with documentation for the computational 
+  resources needed. 
+
+.. note:: This section is aimed at intermediate-level developers working with
+   the nGraph++ library. It assumes a developer has understanding of the concepts 
+   in the previous sections. It does not assume knowledge of any particular 
+   frontend framework. 
+  
+Since our primary audience is developers who are pushing the boundaries of deep 
+learning systems, we go beyond the use of deep learning primitives, and include 
+APIs and documentation for developers who want the ability to write programs 
+that use custom backends. For example, we know that GPU resources can be useful 
+backends for *some* kinds of algorithmic operations while they impose inherent 
+limitations and slow down others. We are barely scraping the surface of what is 
+possible for a hybridized approach to many kinds of training and inference-based 
+computational tasks. 
+
+One of our goals with the nGraph project is to enable developers with tools to 
+build programs that quickly access and process data with or from a breadth of 
+edge and network devices.  Furthermore, we want them to be able to make use of 
+the best kind of computational resources for the kind of data they are processing,
+after it has been gathered.
+
+To get started, we've provided a basic example for how to execute a computation 
+that can run on an nGraph backend; this is analogous to a framework bridge.  
+
+This section is under development; it will eventually be populated with more 
+articles geared toward data scientists, algorithm designers, framework developers, 
+backend engineers, and others.  We welcome contributions from the community and
+invite you to experiment with the variety of hybridization and performance 
+extractions available through the nGraph library.    
+
--- a/doc/sphinx/source/index.rst
+++ b/doc/sphinx/source/index.rst
@@ -13,26 +13,29 @@
 .. limitations under the License.
 .. ---------------------------------------------------------------------------

-#############################
-Intel nGraph library project
-#############################
+########################
+Intel nGraph++ library
+########################

-Welcome to Intel nGraph, an open source C++ library for developers of 
-:abbr:`Deep Learning (DL)` (DL) systems. Here you will find a suite 
-of components, APIs, and documentation that can be used to compile 
-and run :abbr:`Deep Neural Network (DNN)` (DNN) models defined in a 
-variety of frameworks.  
+Welcome to Intel® nGraph™, an open source C++ library for developers of 
+:abbr:`Deep Learning (DL)` (DL) systems. Here you will find a suite of 
+components, APIs, and documentation that can be used to compile and run 
+:abbr:`Deep Neural Network (DNN)` (DNN) models defined in a variety of 
+frameworks.  

 .. figure:: graphics/ngraph-hub.png  


-For this early release, we provide :doc:`framework-integration-guides` to compile 
-and run MXNet and TensorFlow-based projects.
+For this early release, we've provided :doc:`framework-integration-guides` to 
+compile and run MXNet\* and TensorFlow\*-based projects.

-The nGraph library translates a framework’s representation of computations into 
-an :abbr:`Intermediate Representation (IR)` designed to promote computational 
+.. note:: The library code is under active development as we're continually 
+   adding support for more ops, more frameworks, and more backends. 
+
+The nGraph++ library translates a framework’s representation of computations 
+into an :abbr:`Intermediate Representation (IR)` that promotes computational 
 efficiency on target hardware. Initially-supported backends include Intel 
-Architecture CPUs (CPU), the Intel® Nervana Neural Network Processor™ (NNP), 
+Architecture CPUs (``CPU``), the Intel® Nervana Neural Network Processor™ (NNP), 
 and NVIDIA\* GPUs. Currently-supported compiler optimizations include efficient 
 memory management and data layout abstraction. 

@@ -54,6 +57,7 @@ Sections
   testing-libngraph.rst
   framework-integration-guides.rst
   graph-basics.rst
+   howto/index.rst
   ops/index.rst
   project/index.rst


--- a/doc/sphinx/source/installation.rst
+++ b/doc/sphinx/source/installation.rst
 .. installation:

-###################################
-Install the Intel® nGraph™ library 
-###################################
+########
+Install 
+########

 Build Environments
 ==================

-The |release| version of |project| supports Linux\*-based systems which 
-have recent updates of the following packages and prerequisites: 
+The |release| version of |project| supports Linux\*-based systems which have 
+recent updates of the following packages and prerequisites: 

 .. csv-table::
   :header: "Operating System", "Compiler", "Build System", "Status", "Additional Packages"
@@ -25,15 +25,15 @@ Other configurations may work, but aren't tested; on Ubuntu 16.04 with
 below. This gets a pre-built tarball of LLVM+Clang from `llvm.org`_, and will
 substantially reduce build time.

-If using ``gcc-4.8``, it may be necessary to add symlinksfrom ``gcc`` to
+If using ``gcc-4.8``, it may be necessary to add symlinks from ``gcc`` to
 ``gcc-4.8``, and from ``g++`` to ``g++-4.8``, in your :envvar:`PATH`, even 
 if you explicitly specify the ``CMAKE_C_COMPILER`` and ``CMAKE_CXX_COMPILER`` 
-flags when building. (You should NOT supply the `-DNGRAPH_USE_PREBUILT_LLVM` 
+flags when building. (You **should NOT** supply the ``-DNGRAPH_USE_PREBUILT_LLVM`` 
 flag in this case, because the prebuilt tarball supplied on llvm.org is not 
 compatible with a gcc-4.8 based build.)

-Support for macOS is limited; see the `macOS development prerequisites`_ 
-section at the end of this page for details.
+Support for macOS is limited; see the `macOS development`_ section at the end of 
+this page for details.


 Installation Steps
@@ -44,11 +44,10 @@ install ``ngraph_dist`` to the installing user's ``$HOME`` directory as
 the default location. See the :file:`CMakeLists.txt` file for more 
 information about how to change or customize this location.

-#.  (Optional) Since most of a developer's interaction with a frontend 
-    framework will take place locally through Pythonic APIs to the C++
-    library, you can set a reference placeholder for the documented source 
-    cloned from the repo. Create something like ``/opt/local`` and (with sudo 
-    permissions), give ownership of that directory to your user.  
+#.  (Optional) Create a directory such as ``/opt/local`` and (with sudo 
+    permissions) give ownership of that directory to your user. Under this 
+    directory, you can add a ``libraries`` subdirectory to hold the documented 
+    source cloned from the repo: 

    .. code-block:: console

@@ -62,62 +61,70 @@ information about how to change or customize this location.
   .. code-block:: console

      $ cd /opt/local/libraries
-      $ git clone git@github.com:NervanaSystems/private-ngraph-cpp.git
-      $ cd private-ngraph-cpp
+      $ git clone git@github.com:NervanaSystems/ngraph-cpp.git
+      $ cd ngraph-cpp

-#. Create a build directory outside of the ``private-ngraph-cpp/src`` directory 
-   tree; somewhere like ``private-ngraph-cpp/build``, for example.
+#. Create a build directory outside of the ``ngraph-cpp/src`` directory 
+   tree; somewhere like ``ngraph-cpp/build``, for example.

   .. code-block:: console

      $ mkdir build   

 #. ``$ cd`` to the build directory and generate the GNUMakefiles in the 
-   customary manner from within your ``build`` directory:
+   customary manner from within your ``build`` directory (remember to append the 
+   prebuilt option to the command, if needed):

   .. code-block:: console

-      $ cd build && cmake ../
+      $ cd build && cmake ../ [-DNGRAPH_USE_PREBUILT_LLVM=TRUE]

-#. Run ``$ make -j8`` and ``make install`` to install ``libngraph.so`` and the 
-   header files to the default location of ``$HOME/ngraph_dist``.
+#. (Optional) Run ``$ make [-jN]`` where ``-jN`` specifies the number of 
+   parallel jobs. The example here uses a configuration of ``-j8``, which is 
+   good for a system install using an Intel® Xeon® processor. This step 
+   is **not recommended** with Docker / VM installs. 

   .. code-block:: console
+      
+      $ make -j8

-      $ make -j8 && make install 
+#. Run ``make install`` to install ``libngraph.so`` and the header files to the 
+   default location of ``$HOME/ngraph_dist``:

+   .. code-block:: console

-#. (Optional, requires `Sphinx`_.)  Run ``make html`` inside the  
-   ``doc/sphinx`` directory to build HTML docs for the nGraph library.    
+      $ make install

-#. (Optional, requires `doxygen`_.)  Run ``$ make htmldocs`` inside
-   the ``doc/sphinx`` directory to build HTML API docs inside the 
-   ``/docs/doxygen/`` directory. 
+#. (Optional, requires `doxygen`_, `Sphinx`_, and `breathe`_). Run ``make html`` 
+   inside the ``doc/sphinx`` directory of the cloned source to build a copy of 
+   the `website docs`_ locally. The low-level API docs with inheritance diagrams 
+   and collaboration diagrams can be found inside the ``/docs/doxygen/`` 
+   directory.    

-.. macos_development_prerequisites: 
+.. macos_development: 

-macOS Development Prerequisites
-------------------------------
+macOS development
+-----------------

-.. note:: If you are developing |nGl| projects on macOS*\, please be 
-   aware that this platform is officially unsupported.
+.. note:: The macOS\* platform is officially unsupported.

 The repository includes two scripts (``maint/check-code-format.sh`` and 
 ``maint/apply-code-format.sh``) that are used respectively to check adherence 
-to `libngraph` code formatting conventions, and to automatically reformat code 
+to ``libngraph`` code formatting conventions, and to automatically reformat code 
 according to those conventions. These scripts require the command 
 ``clang-format-3.9`` to be in your ``PATH``. Run the following commands 
 (you will need to adjust them if you are not using bash):

 .. code-block:: bash

-  $ brew install llvm@3.9
-  $ mkdir -p $HOME/bin
-  $ ln -s /usr/local/opt/llvm@3.9/bin/clang-format $HOME/bin/clang-format-3.9
-  $ echo 'export PATH=$HOME/bin:$PATH' >> $HOME/.bash_profile
+   $ brew install llvm@3.9
+   $ mkdir -p $HOME/bin
+   $ ln -s /usr/local/opt/llvm@3.9/bin/clang-format $HOME/bin/clang-format-3.9
+   $ echo 'export PATH=$HOME/bin:$PATH' >> $HOME/.bash_profile

 .. _doxygen: https://www.stack.nl/~dimitri/doxygen/
 .. _Sphinx:  http://www.sphinx-doc.org/en/stable/
-.. _NervanaSystems: https://github.com/NervanaSystems/private-ngraph-cpp/blob/master/README.md
+.. _breathe: https://breathe.readthedocs.io/en/latest/
 .. _llvm.org: https://www.llvm.org 
-
+.. _NervanaSystems: https://github.com/NervanaSystems/ngraph-cpp/blob/master/README.md
+.. _website docs: http://ngraph.nervanasys.com/index.html/index.html
--- a/doc/sphinx/source/ops/allreduce.rst
+++ b/doc/sphinx/source/ops/allreduce.rst
 .. allreduce.rst:

-###
+##########
 AllReduce
-###
+##########

 .. code-block:: cpp


--- a/doc/sphinx/source/ops/index.rst
+++ b/doc/sphinx/source/ops/index.rst
@@ -82,3 +82,6 @@ Not currently a comprehensive list.
   negative.rst
   not_equal.rst
   not.rst
+   softmax.rst
+
+   
--- a/doc/sphinx/source/ops/softmax.rst
+++ b/doc/sphinx/source/ops/softmax.rst
+.. softmax.rst:
+
+#######
+Softmax
+#######
+
+.. code-block:: cpp
+
+   Softmax  // Softmax operation
+
+
+Description
+===========
+
+Produces a tensor of the same element type and shape as ``arg``,
+where the value at each coordinate of ``output`` is the exponential of the
+value of the corresponding coordinate of ``arg`` divided by the sum
+of the exponentials of all coordinates of ``arg`` in the specified ``axes``.
+
+Inputs
+------
+
++-----------------+-------------------------+--------------------------------+
+| Name            | Element Type            | Shape                          |
++=================+=========================+================================+
+| ``arg``         | Any                     | Any                            |
++-----------------+-------------------------+--------------------------------+
+
+Parameters
+----------
++-----------------+----------------------------------------------------------------+
+| Name            | Description                                                    |
++=================+================================================================+
+| ``axes``        | The axis positions (0-based) on which to calculate the softmax |
++-----------------+----------------------------------------------------------------+
+
+Outputs
+-------
+
++-----------------+-------------------------+--------------------------------+
+| Name            | Element Type            | Shape                          |
++=================+=========================+================================+
+| ``output``      | Same as ``arg``         | Same as ``arg``                |
++-----------------+-------------------------+--------------------------------+
+
+
+Mathematical Definition
+=======================
+
+.. math::
+
+   \texttt{output}_{i} = \frac{\exp(\texttt{arg}_{i})}{\sum_{j} \exp(\texttt{arg}_{j})}
+
+
+C++ Interface
+=============
+
+.. doxygenclass:: ngraph::op::Softmax
+   :project: ngraph
+   :members: m_axes
\ No newline at end of file
--- a/doc/sphinx/source/testing-libngraph.rst
+++ b/doc/sphinx/source/testing-libngraph.rst
@@ -32,7 +32,7 @@ For this early |release| release, we're providing :doc:`framework-integration-gu
 for:

 * :doc:`MXNet<framework-integration-guides>` framework,  
-* :doc:`Tensorflow<framework-integration-guides>` framework, and
+* :doc:`TensorFlow<framework-integration-guides>` framework, and
 * neon™ `frontend framework`_.

 Integration guides for other frameworks are tentatively forthcoming.

--- a/src/ngraph/CMakeLists.txt
+++ b/src/ngraph/CMakeLists.txt
@@ -72,6 +72,7 @@ set (SRC
    ops/sin.cpp
    ops/sinh.cpp
    ops/slice.cpp
+    ops/softmax.cpp
    ops/sqrt.cpp
    ops/subtract.cpp
    ops/sum.cpp
@@ -313,7 +314,7 @@ endif()
 # Nvidia
 if(NGRAPH_GPU_ENABLE AND CUDA_LIBRARIES)
    find_library(CUDA_nvrtc_LIBRARY nvrtc /usr/local/cuda/lib64)
-    find_library(CUDA_cuda_LIBRARY cuda /usr/local/cuda/lib64)
+    find_library(CUDA_cuda_LIBRARY cuda /usr/local/cuda/lib64/stubs)
    target_link_libraries(ngraph PUBLIC ${CUDA_cuda_LIBRARY} ${CUDA_nvrtc_LIBRARY} ${CUDA_LIBRARIES} ${CUDA_CUBLAS_LIBRARIES} ${CUDNN_LIBRARIES})
 endif()


--- a/src/ngraph/builder/numpy_transpose.cpp
+++ b/src/ngraph/builder/numpy_transpose.cpp
@@ -20,6 +20,7 @@
 #include "ngraph/builder/numpy_transpose.hpp"
 #include "ngraph/except.hpp"
 #include "ngraph/ops/reshape.hpp"
+#include "ngraph/util.hpp"

 namespace ngraph
 {

--- a/src/ngraph/builder/numpy_transpose.hpp
+++ b/src/ngraph/builder/numpy_transpose.hpp
@@ -17,12 +17,7 @@
 #pragma once

 #include "ngraph/axis_vector.hpp"
-#include "ngraph/function.hpp"
 #include "ngraph/node.hpp"
-#include "ngraph/ops/constant.hpp"
-#include "ngraph/ops/parameter.hpp"
-#include "ngraph/ops/reduce.hpp"
-#include "ngraph/types/type.hpp"

 namespace ngraph
 {

--- a/src/ngraph/builder/reduce_ops.cpp
+++ b/src/ngraph/builder/reduce_ops.cpp
@@ -19,7 +19,7 @@
 #include "ngraph/axis_set.hpp"
 #include "ngraph/builder/autobroadcast.hpp"
 #include "ngraph/builder/reduce_ops.hpp"
-#include "ngraph/ops/add.hpp"
+#include "ngraph/ops/constant.hpp"
 #include "ngraph/ops/divide.hpp"
 #include "ngraph/ops/multiply.hpp"
 #include "ngraph/ops/power.hpp"

--- a/src/ngraph/builder/reduce_ops.hpp
+++ b/src/ngraph/builder/reduce_ops.hpp
@@ -17,12 +17,7 @@
 #pragma once

 #include "ngraph/axis_set.hpp"
-#include "ngraph/function.hpp"
 #include "ngraph/node.hpp"
-#include "ngraph/ops/constant.hpp"
-#include "ngraph/ops/parameter.hpp"
-#include "ngraph/ops/reduce.hpp"
-#include "ngraph/types/type.hpp"

 namespace ngraph
 {

--- a/src/ngraph/codegen/code_writer.hpp
+++ b/src/ngraph/codegen/code_writer.hpp
@@ -19,8 +19,6 @@
 #include <sstream>
 #include <string>

-#include "ngraph/log.hpp"
-
 namespace ngraph
 {
    namespace codegen

--- a/src/ngraph/codegen/compiler.hpp
+++ b/src/ngraph/codegen/compiler.hpp
@@ -16,7 +16,6 @@

 #pragma once

-#include <functional>
 #include <memory>
 #include <string>
 #include <vector>

--- a/src/ngraph/codegen/execution_engine.hpp
+++ b/src/ngraph/codegen/execution_engine.hpp
@@ -16,6 +16,7 @@

 #pragma once

+#include <functional>
 #include <memory>

 #include "ngraph/codegen/compiler.hpp"

--- a/src/ngraph/coordinate_transform.cpp
+++ b/src/ngraph/coordinate_transform.cpp
@@ -14,7 +14,6 @@
 * limitations under the License.
 *******************************************************************************/

-#include <algorithm>
 #include <cstdio>
 #include <iostream>
 #include <sstream>
@@ -89,8 +88,10 @@ CoordinateTransform::CoordinateTransform(const Shape& source_shape,
    }

    AxisVector all_axes(m_n_axes);
-    size_t n = 0;
-    std::generate(all_axes.begin(), all_axes.end(), [&n]() -> size_t { return n++; });
+    for (size_t i = 0; i < all_axes.size(); i++)
+    {
+        all_axes[i] = i;
+    }

    if (!std::is_permutation(all_axes.begin(), all_axes.end(), source_axis_order.begin()))
    {

--- a/src/ngraph/coordinate_transform.hpp
+++ b/src/ngraph/coordinate_transform.hpp
@@ -16,11 +16,6 @@

 #pragma once

-#include <cassert>
-#include <cstdio>
-#include <iostream>
-#include <vector>
-
 #include "ngraph/axis_vector.hpp"
 #include "ngraph/coordinate.hpp"
 #include "ngraph/coordinate_diff.hpp"

--- a/src/ngraph/descriptor/buffer.hpp
+++ b/src/ngraph/descriptor/buffer.hpp
@@ -16,6 +16,8 @@

 #pragma once

+#include <cstddef>
+
 namespace ngraph
 {
    namespace descriptor

--- a/src/ngraph/descriptor/buffer_pos.hpp
+++ b/src/ngraph/descriptor/buffer_pos.hpp
@@ -17,6 +17,7 @@
 #pragma once

 #include <cassert>
+#include <memory>

 #include "ngraph/descriptor/buffer.hpp"


--- a/src/ngraph/descriptor/layout/dense_tensor_view_layout.cpp
+++ b/src/ngraph/descriptor/layout/dense_tensor_view_layout.cpp
@@ -21,9 +21,6 @@
 #include "ngraph/types/type.hpp"

 using namespace ngraph;
-using ngraph::Shape;
-using ngraph::descriptor::TensorView;
-using ngraph::TensorViewType;

 descriptor::layout::DenseTensorViewLayout::DenseTensorViewLayout(const TensorView& tensor_view)
    : TensorViewLayout(tensor_view)

--- a/src/ngraph/descriptor/layout/tensor_view_layout.hpp
+++ b/src/ngraph/descriptor/layout/tensor_view_layout.hpp
@@ -17,7 +17,6 @@
 #pragma once

 #include <memory>
-#include <tuple>
 #include <vector>

 #include "ngraph/descriptor/buffer_pos.hpp"

--- a/src/ngraph/descriptor/primary_tensor_view.hpp
+++ b/src/ngraph/descriptor/primary_tensor_view.hpp
@@ -20,9 +20,6 @@

 #include "ngraph/descriptor/tensor.hpp"
 #include "ngraph/descriptor/tensor_view.hpp"
-#include "ngraph/log.hpp"
-#include "ngraph/shape.hpp"
-#include "ngraph/types/type.hpp"

 namespace ngraph
 {
@@ -30,9 +27,6 @@ namespace ngraph

    namespace descriptor
    {
-        class Tensor;
-        class TensorViewLayout;
-
        /// @brief A PrimaryTensorView owns the tensor. All other views are the result
        /// of some index operation on the primary view.
        class PrimaryTensorView : public TensorView

--- a/src/ngraph/descriptor/tensor_view.hpp
+++ b/src/ngraph/descriptor/tensor_view.hpp
@@ -17,6 +17,7 @@
 #pragma once

 #include <memory>
+#include <string>

 #include "ngraph/shape.hpp"


--- a/src/ngraph/function.hpp
+++ b/src/ngraph/function.hpp
@@ -23,12 +23,7 @@
 #include <string>
 #include <vector>

-#include "ngraph/descriptor/output.hpp"
-#include "ngraph/descriptor/tensor_view.hpp"
-#include "ngraph/log.hpp"
 #include "ngraph/node.hpp"
-#include "ngraph/ops/op.hpp"
-#include "ngraph/ops/parameter.hpp"
 #include "ngraph/ops/parameter_vector.hpp"
 #include "ngraph/types/type.hpp"


--- a/src/ngraph/graph_util.cpp
+++ b/src/ngraph/graph_util.cpp
@@ -14,13 +14,9 @@
 * limitations under the License.
 *******************************************************************************/

-#include <algorithm>
 #include <cassert>
 #include <deque>
-#include <forward_list>
-#include <iomanip>
-#include <iterator>
-#include <map>
+#include <unordered_map>
 #include <unordered_set>
 #include <vector>


--- a/src/ngraph/graph_util.hpp
+++ b/src/ngraph/graph_util.hpp
@@ -16,27 +16,18 @@

 #pragma once

-#include <algorithm>
-#include <chrono>
-#include <deque>
-#include <functional>
-#include <iostream>
 #include <list>
-#include <map>
 #include <memory>
-#include <sstream>
 #include <string>
 #include <unordered_map>
-#include <unordered_set>
 #include <vector>

+#include "ngraph/function.hpp"
+#include "ngraph/node.hpp"
 #include "ngraph/placement.hpp"

 namespace ngraph
 {
-    class Node;
-    class Function;
-
    namespace descriptor
    {
        class Input;

--- a/src/ngraph/ngraph.hpp
+++ b/src/ngraph/ngraph.hpp
@@ -118,6 +118,7 @@
 #include "ngraph/ops/sin.hpp"
 #include "ngraph/ops/sinh.hpp"
 #include "ngraph/ops/slice.hpp"
+#include "ngraph/ops/softmax.hpp"
 #include "ngraph/ops/sqrt.hpp"
 #include "ngraph/ops/subtract.hpp"
 #include "ngraph/ops/sum.hpp"

--- a/src/ngraph/ops/batch_norm.cpp
+++ b/src/ngraph/ops/batch_norm.cpp
@@ -16,6 +16,7 @@

 #include "ngraph/ops/batch_norm.hpp"
 #include "ngraph/ops/constant.hpp"
+#include "ngraph/ops/get_output_element.hpp"

 ngraph::op::BatchNorm::BatchNorm(double eps,
                                 std::shared_ptr<ngraph::Node> gamma,
@@ -94,3 +95,94 @@ std::shared_ptr<ngraph::Node>
    return std::make_shared<BatchNorm>(
        m_epsilon, new_args.at(0), new_args.at(1), new_args.at(2), new_args.at(3), new_args.at(4));
 }
+
+ngraph::op::BatchNormBackprop::BatchNormBackprop(double eps,
+                                                 std::shared_ptr<ngraph::Node> gamma,
+                                                 std::shared_ptr<ngraph::Node> beta,
+                                                 std::shared_ptr<ngraph::Node> input,
+                                                 std::shared_ptr<ngraph::Node> mean,
+                                                 std::shared_ptr<ngraph::Node> variance,
+                                                 std::shared_ptr<ngraph::Node> delta)
+    : RequiresTensorViewArgs("BatchNormBackprop", {gamma, beta, input, mean, variance, delta})
+    , epsilon(eps)
+
+{
+    if (input->get_shape().size() != 4)
+    {
+        throw ngraph_error("Input expected to be a 4D tensor");
+    }
+
+    auto et = input->get_element_type();
+    const char* input_names[] = {"gamma", "beta", "input", "mean", "variance", "delta"};
+
+    for (size_t i = 0; i < get_input_size(); i++)
+    {
+        if (get_input_op(i)->get_element_type() != et)
+        {
+            auto err_msg = std::string("The element type of ") + input_names[i] +
+                           " isn't equal to input data's type";
+            throw ngraph_error(err_msg.c_str());
+        }
+    }
+
+    Shape channel_shape{input->get_shape().at(1)};
+
+    for (size_t i = 0; i < get_input_size(); i++)
+    {
+        if (i == 2 || i == 5) //don't check input and delta
+        {
+            continue;
+        }
+
+        if (get_input_op(i)->get_shape() != channel_shape)
+        {
+            auto err_msg = std::string("The shape of ") + input_names[i] +
+                           " isn't equal to input channel's shape";
+            throw ngraph_error(err_msg.c_str());
+        }
+    }
+
+    if (delta->get_shape() != input->get_shape())
+    {
+        throw ngraph_error("delta shape is expected to be equal to input shape");
+    }
+
+    add_output(input->get_element_type(), input->get_shape());
+    add_output(gamma->get_element_type(), gamma->get_shape());
+    add_output(beta->get_element_type(), beta->get_shape());
+}
+
+std::shared_ptr<ngraph::Node>
+    ngraph::op::BatchNormBackprop::copy_with_new_args(const NodeVector& new_args) const
+{
+    if (new_args.size() != 6)
+    {
+        throw ngraph_error("Incorrect number of new arguments");
+    }
+    return std::make_shared<op::BatchNormBackprop>(epsilon,
+                                                   new_args.at(0),
+                                                   new_args.at(1),
+                                                   new_args.at(2),
+                                                   new_args.at(3),
+                                                   new_args.at(4),
+                                                   new_args.at(5));
+}
+
+void ngraph::op::BatchNorm::generate_adjoints(autodiff::Adjoints& adjoints,
+                                              const std::shared_ptr<Node>& delta)
+{
+    auto gamma = get_input_op(0);
+    auto beta = get_input_op(1);
+    auto input = get_input_op(2);
+    auto mean = get_input_op(3);
+    auto variance = get_input_op(4);
+    auto bbn = std::make_shared<op::BatchNormBackprop>(
+        get_eps_value(), gamma, beta, input, mean, variance, delta);
+    auto dinput = std::make_shared<op::GetOutputElement>(bbn, 0);
+    auto dgamma = std::make_shared<op::GetOutputElement>(bbn, 1);
+    auto dbeta = std::make_shared<op::GetOutputElement>(bbn, 2);
+
+    adjoints.add_delta(input, dinput);
+    adjoints.add_delta(gamma, dgamma);
+    adjoints.add_delta(beta, dbeta);
+}
--- a/src/ngraph/ops/batch_norm.hpp
+++ b/src/ngraph/ops/batch_norm.hpp
@@ -44,11 +44,34 @@ namespace ngraph
            virtual std::shared_ptr<Node>
                copy_with_new_args(const NodeVector& new_args) const override;

+        protected:
+            virtual void generate_adjoints(autodiff::Adjoints& adjoints,
+                                           const std::shared_ptr<Node>& delta) override;
+
        private:
            Shape m_bn_input_shape;
            Shape m_bn_variance_shape;
            Shape m_bn_mean_shape;
            double m_epsilon;
        };
+
+        class BatchNormBackprop : public util::RequiresTensorViewArgs
+        {
+        public:
+            BatchNormBackprop(double eps,
+                              std::shared_ptr<Node> gamma,
+                              std::shared_ptr<Node> beta,
+                              std::shared_ptr<Node> input,
+                              std::shared_ptr<Node> mean,
+                              std::shared_ptr<Node> variance,
+                              std::shared_ptr<Node> delta);
+
+            double get_eps_value() const { return epsilon; }
+            virtual std::shared_ptr<Node>
+                copy_with_new_args(const NodeVector& new_args) const override;
+
+        private:
+            double epsilon;
+        };
    }
 }
--- a/src/ngraph/ops/get_output_element.hpp
+++ b/src/ngraph/ops/get_output_element.hpp
@@ -62,6 +62,11 @@ namespace ngraph

            /// \return The index of the tuple element to get.
            size_t get_n() const { return m_n; }
+            virtual NodeVector get_input_ops() override
+            {
+                return NodeVector{get_inputs().at(0).get_output().get_node()};
+            }
+
        protected:
            size_t m_n;
        };

--- a/src/ngraph/ops/parameter_vector.hpp
+++ b/src/ngraph/ops/parameter_vector.hpp
@@ -16,14 +16,15 @@

 #pragma once

+#include <memory>
 #include <vector>

+#include "ngraph/ops/parameter.hpp"
+
 namespace ngraph
 {
    namespace op
    {
-        class Parameter;
-
        /// \brief Zero or more nodes.
        class ParameterVector : public std::vector<std::shared_ptr<op::Parameter>>
        {

--- a/src/ngraph/ops/reverse.cpp
+++ b/src/ngraph/ops/reverse.cpp
@@ -14,10 +14,11 @@
 * limitations under the License.
 *******************************************************************************/

-#include "ngraph/ops/reverse.hpp"
-#include "ngraph/function.hpp"
-
 #include <algorithm>
+#include <sstream>
+
+#include "ngraph/function.hpp"
+#include "ngraph/ops/reverse.hpp"

 using namespace std;
 using namespace ngraph;

--- a/src/ngraph/ops/select_and_scatter.cpp
+++ b/src/ngraph/ops/select_and_scatter.cpp
@@ -16,6 +16,7 @@

 #include "ngraph/ops/select_and_scatter.hpp"
 #include "ngraph/function.hpp"
+#include "ngraph/ops/parameter.hpp"
 #include "ngraph/util.hpp"

 using namespace std;

--- a/src/ngraph/ops/xla_get_tuple_element.cpp
+++ b/src/ngraph/ops/xla_get_tuple_element.cpp
@@ -14,44 +14,42 @@
 * limitations under the License.
 *******************************************************************************/

-#include <memory>
-#include <sstream>
+#include "ngraph/ops/softmax.hpp"

-#include "ngraph/ops/xla_get_tuple_element.hpp"
-#include "ngraph/ops/xla_tuple.hpp"
+#include <algorithm>
+#include <numeric>

-using namespace std;
-using namespace ngraph;
+#include "ngraph/builder/autobroadcast.hpp"
+#include "ngraph/ops/multiply.hpp"
+#include "ngraph/ops/reshape.hpp"
+#include "ngraph/ops/subtract.hpp"
+#include "ngraph/ops/sum.hpp"

-op::XLAGetTupleElement::XLAGetTupleElement(const std::shared_ptr<Node>& arg, size_t n)
-    : XLANode("XLAGetTupleElement", {arg})
-    , m_n{n}
+void ngraph::op::Softmax::generate_adjoints(autodiff::Adjoints& adjoints,
+                                            const std::shared_ptr<Node>& delta)
 {
-    m_arg = dynamic_pointer_cast<XLANode>(arg);
-    if (m_arg == nullptr || m_arg->get_tuple_value() == nullptr)
-    {
-        throw ngraph_error("Argument must be a tuple view");
-    }
-
-    const Nodes& elements = m_arg->get_tuple_elements();
+    auto z = delta * shared_from_this();
+    auto zsum = std::make_shared<op::Sum>(z, m_axes);

-    if (m_n >= elements.size())
+    Shape shape;
+    for (size_t i = 0; i < get_shape().size(); ++i)
    {
-        throw ngraph_error("Indexing tuple beyond its size");
+        if (m_axes.find(i) == m_axes.end())
+        {
+            shape.push_back(get_shape()[i]);
+        }
+        else
+        {
+            shape.push_back(1);
+        }
    }
-}
-
-Nodes op::XLAGetTupleElement::get_input_ops() //const
-{
-    return Nodes{m_arg};
-}
+    AxisVector order(zsum->get_shape().size());
+    std::iota(order.begin(), order.end(), 0);
+    auto zreshape = std::make_shared<op::Reshape>(zsum, order, shape);

-shared_ptr<const op::XLATuple> op::XLAGetTupleElement::get_tuple_value() const
-{
-    return dynamic_pointer_cast<const op::XLATuple>(m_arg->get_tuple_elements().at(m_n));
-}
+    auto adjoint =
+        z - builder::make_with_numpy_broadcast<op::Multiply>(shared_from_this(), zreshape);

-const Nodes& op::XLAGetTupleElement::get_tuple_elements() const
-{
-    return get_tuple_value()->get_tuple_elements();
+    auto x = get_input_op(0);
+    adjoints.add_delta(x, adjoint);
 }
--- a/src/ngraph/ops/xla_get_tuple_element.hpp
+++ b/src/ngraph/ops/xla_get_tuple_element.hpp
@@ -16,61 +16,64 @@

 #pragma once

-#include "ngraph/node.hpp"
-#include "ngraph/ops/xla_node.hpp"
+#include "ngraph/ops/util/unary_elementwise_arithmetic.hpp"

 namespace ngraph
 {
    namespace op
    {
-        /// \brief Operation to get an element from a tuple.
+        /// \brief Softmax operation.
        ///
-        /// ## Parameters
-        ///
-        /// |     | Description                                                        |
-        /// | --- | ------------------------------------------------------------------ |
-        /// | `n` | The position of the element (0-based) to get from the input tuple. |
-        ///
-        /// ## Inputs
-        ///
-        /// |        | Type                                                        | Description                                |
-        /// | ------ | ----------------------------------------------------------- | ------------------------------------------ |
-        /// | `arg`  | \f$(T_1,\dots,T_{n-1},T_n,T_{n+1},\dots,T_m)~(m \geq 1)\f$ | An input tuple with at least `n` elements. |
-        ///
-        /// ## Output
-        ///
-        /// | Type      | Description                           |
-        /// | --------- | ------------------------------------- |
-        /// | \f$T_n\f$ | The `n`th element of the input tuple. |
-        class XLAGetTupleElement : public XLANode
+        class Softmax : public util::UnaryElementwiseArithmetic
        {
        public:
-            /// \brief Constructs a get-tuple-element operation.
+            /// \brief Constructs a softmax operation.
            ///
-            /// \param arg The input tuple.
-            /// \param n The index of the tuple element to get.
-            XLAGetTupleElement(const std::shared_ptr<Node>& arg, size_t n);
+            /// \param arg Node that produces the first input tensor.<br>
+            /// `[d0, ...]`
+            /// \param axes The axis positions (0-based) on which to calculate the softmax.
+            ///
+            /// Output `[d0, ...]`
+            ///
+            Softmax(const std::shared_ptr<Node>& arg, const AxisSet& axes)
+                : UnaryElementwiseArithmetic("Softmax", arg)
+                , m_axes(axes)
+            {
+                for (auto axis : m_axes)
+                {
+                    if (axis >= get_shape().size())
+                    {
+                        throw ngraph_error("Axis for softmax reduction operator is out of bounds");
+                    }
+                }
+
+                // empty axes == all axes
+                if (m_axes.size() == 0)
+                {
+                    for (size_t i = 0; i < get_shape().size(); ++i)
+                    {
+                        m_axes.insert(i);
+                    }
+                }
+            }

-            virtual std::shared_ptr<Node> copy_with_new_args(
-                const std::vector<std::shared_ptr<Node>>& new_args) const override
+            virtual std::shared_ptr<Node>
+                copy_with_new_args(const NodeVector& new_args) const override
            {
                if (new_args.size() != 1)
                {
                    throw ngraph_error("Incorrect number of new arguments");
                }
-                return std::make_shared<XLAGetTupleElement>(new_args.at(0), m_n);
+                return std::make_shared<Softmax>(new_args.at(0), m_axes);
            }

-            virtual Nodes get_input_ops() override; //const;
-
-            virtual std::shared_ptr<const XLATuple> get_tuple_value() const override;
-            virtual const Nodes& get_tuple_elements() const override;
-
-            /// \return The index of the tuple element to get.
-            size_t get_n() const { return m_n; }
+            const AxisSet& get_axes() const { return m_axes; }
        protected:
-            std::shared_ptr<XLANode> m_arg;
-            size_t m_n;
+            virtual void generate_adjoints(autodiff::Adjoints& adjoints,
+                                           const std::shared_ptr<Node>& delta) override;
+
+        private:
+            AxisSet m_axes;
        };
    }
 }
--- a/src/ngraph/ops/tanh.cpp
+++ b/src/ngraph/ops/tanh.cpp
@@ -15,16 +15,13 @@
 *******************************************************************************/

 #include "ngraph/ops/tanh.hpp"
-#include "ngraph/ops/cosh.hpp"
-#include "ngraph/ops/divide.hpp"
 #include "ngraph/ops/multiply.hpp"
+#include "ngraph/ops/subtract.hpp"

 void ngraph::op::Tanh::generate_adjoints(autodiff::Adjoints& adjoints,
                                         const std::shared_ptr<Node>& delta)
 {
    auto x = get_input_op(0);

-    auto c = std::make_shared<op::Cosh>(x);
-
-    adjoints.add_delta(x, delta / (c * c));
+    adjoints.add_delta(x, delta - (delta * (shared_from_this() * shared_from_this())));
 }
--- a/src/ngraph/pass/inliner.cpp
+++ b/src/ngraph/pass/inliner.cpp
@@ -16,6 +16,7 @@

 #include "inliner.hpp"
 #include "ngraph/graph_util.hpp"
+#include "ngraph/log.hpp"
 #include "ngraph/ops/function_call.hpp"

 std::vector<std::shared_ptr<ngraph::op::FunctionCall>>

--- a/src/ngraph/pattern/op/any.hpp
+++ b/src/ngraph/pattern/op/any.hpp
@@ -17,7 +17,6 @@
 #pragma once

 #include "ngraph/node.hpp"
-#include "ngraph/pattern/matcher.hpp"
 #include "ngraph/pattern/op/pattern.hpp"

 namespace ngraph

--- a/src/ngraph/pattern/op/label.hpp
+++ b/src/ngraph/pattern/op/label.hpp
@@ -17,7 +17,6 @@
 #pragma once

 #include "ngraph/node.hpp"
-#include "ngraph/pattern/matcher.hpp"
 #include "ngraph/pattern/op/pattern.hpp"

 namespace ngraph

--- a/src/ngraph/pattern/op/pattern.hpp
+++ b/src/ngraph/pattern/op/pattern.hpp
@@ -20,7 +20,6 @@

 #include "ngraph/node.hpp"
 #include "ngraph/pass/graph_rewrite.hpp"
-#include "ngraph/pattern/matcher.hpp"

 namespace ngraph
 {

--- a/src/ngraph/runtime/aligned_buffer.cpp
+++ b/src/ngraph/runtime/aligned_buffer.cpp
@@ -14,6 +14,8 @@
 * limitations under the License.
 *******************************************************************************/

+#include <memory>
+
 #include "ngraph/runtime/aligned_buffer.hpp"

 using namespace ngraph;

--- a/src/ngraph/runtime/aligned_buffer.hpp
+++ b/src/ngraph/runtime/aligned_buffer.hpp
@@ -17,7 +17,6 @@
 #pragma once

 #include <cstddef>
-#include <memory>

 namespace ngraph
 {

--- a/src/ngraph/runtime/backend.hpp
+++ b/src/ngraph/runtime/backend.hpp
@@ -18,7 +18,6 @@

 #include <memory>

-#include "ngraph/log.hpp"
 #include "ngraph/shape.hpp"
 #include "ngraph/types/element_type.hpp"


--- a/src/ngraph/runtime/call_frame.hpp
+++ b/src/ngraph/runtime/call_frame.hpp
@@ -19,7 +19,6 @@
 #include <memory>
 #include <vector>

-#include "ngraph/function.hpp"
 #include "ngraph/runtime/tensor_view.hpp"

 namespace ngraph

--- a/src/ngraph/runtime/cpu/cpu_emitter.cpp
+++ b/src/ngraph/runtime/cpu/cpu_emitter.cpp
--- a/src/ngraph/runtime/cpu/cpu_external_function.cpp
+++ b/src/ngraph/runtime/cpu/cpu_external_function.cpp
@@ -89,6 +89,7 @@
 #include "ngraph/ops/sin.hpp"
 #include "ngraph/ops/sinh.hpp"
 #include "ngraph/ops/slice.hpp"
+#include "ngraph/ops/softmax.hpp"
 #include "ngraph/ops/sqrt.hpp"
 #include "ngraph/ops/subtract.hpp"
 #include "ngraph/ops/sum.hpp"
@@ -179,6 +180,7 @@ static const runtime::cpu::OpMap dispatcher{
    {TI(ngraph::op::Concat), &runtime::cpu::CPU_Emitter::emit<op::Concat>},
    {TI(ngraph::op::Divide), &runtime::cpu::CPU_Emitter::emit<op::Divide>},
    {TI(ngraph::op::Equal), &runtime::cpu::CPU_Emitter::emit<op::Equal>},
+    {TI(ngraph::op::GetOutputElement), &runtime::cpu::CPU_Emitter::emit<op::GetOutputElement>},
    {TI(ngraph::op::Greater), &runtime::cpu::CPU_Emitter::emit<op::Greater>},
    {TI(ngraph::op::GreaterEq), &runtime::cpu::CPU_Emitter::emit<op::GreaterEq>},
    {TI(ngraph::op::Less), &runtime::cpu::CPU_Emitter::emit<op::Less>},
@@ -231,12 +233,14 @@ static const runtime::cpu::OpMap dispatcher{
    {TI(ngraph::op::AvgPoolBackprop), &runtime::cpu::CPU_Emitter::emit<op::AvgPoolBackprop>},
    {TI(ngraph::op::Pad), &runtime::cpu::CPU_Emitter::emit<op::Pad>},
    {TI(ngraph::op::BatchNorm), &runtime::cpu::CPU_Emitter::emit<op::BatchNorm>},
+    {TI(ngraph::op::BatchNormBackprop), &runtime::cpu::CPU_Emitter::emit<op::BatchNormBackprop>},
    {TI(ngraph::op::MaxPoolBackprop), &runtime::cpu::CPU_Emitter::emit<op::MaxPoolBackprop>},
    {TI(ngraph::op::Product), &runtime::cpu::CPU_Emitter::emit<op::Product>},
    {TI(ngraph::op::Max), &runtime::cpu::CPU_Emitter::emit<op::Max>},
    {TI(ngraph::op::Min), &runtime::cpu::CPU_Emitter::emit<op::Min>},
    {TI(ngraph::op::Relu), &runtime::cpu::CPU_Emitter::emit<op::Relu>},
    {TI(ngraph::op::ReluBackprop), &runtime::cpu::CPU_Emitter::emit<op::ReluBackprop>},
+    {TI(ngraph::op::Softmax), &runtime::cpu::CPU_Emitter::emit<op::Softmax>},
 };

 runtime::cpu::CPU_ExternalFunction::CPU_ExternalFunction(

--- a/src/ngraph/runtime/cpu/cpu_tracing.hpp
+++ b/src/ngraph/runtime/cpu/cpu_tracing.hpp
@@ -17,7 +17,6 @@
 #pragma once

 #include <cstdint>
-#include <list>
 #include <string>
 #include <vector>


--- a/src/ngraph/runtime/cpu/mkldnn_utils.cpp
+++ b/src/ngraph/runtime/cpu/mkldnn_utils.cpp
@@ -43,6 +43,7 @@ static const std::unordered_set<std::type_index> s_op_registry{
    TI(ngraph::op::AvgPool),
    TI(ngraph::op::AvgPoolBackprop),
    TI(ngraph::op::BatchNorm),
+    TI(ngraph::op::BatchNormBackprop),
    TI(ngraph::op::Convolution),
    TI(ngraph::op::ConvolutionBackpropData),
    TI(ngraph::op::ConvolutionBackpropFilters),

--- a/src/ngraph/runtime/cpu/ops/matmul_bias.cpp
+++ b/src/ngraph/runtime/cpu/ops/matmul_bias.cpp
@@ -15,6 +15,8 @@
 *******************************************************************************/

 #include "matmul_bias.hpp"
+#include "ngraph/log.hpp"
+#include "ngraph/util.hpp"

 std::shared_ptr<ngraph::Node>
    ngraph::op::MatmulBias::copy_with_new_args(const NodeVector& new_args) const

--- a/src/ngraph/runtime/cpu/ops/matmul_bias.hpp
+++ b/src/ngraph/runtime/cpu/ops/matmul_bias.hpp
@@ -16,11 +16,7 @@

 #pragma once

-#include "ngraph/node.hpp"
 #include "ngraph/ops/util/requires_tensor_view_args.hpp"
-#include "ngraph/util.hpp"
-
-#include <memory>

 namespace ngraph
 {

--- a/src/ngraph/runtime/cpu/pass/cpu_assignment.cpp
+++ b/src/ngraph/runtime/cpu/pass/cpu_assignment.cpp
@@ -198,7 +198,8 @@ namespace ngraph
                    auto arg0_rank = arg0_shape.size();
                    auto result_shape = node->get_output_shape(0);

-                    if (arg0_rank == 4 && node->get_input_element_type(0) == element::f32)
+                    if ((arg0_rank == 4 || arg0_rank == 2) &&
+                        node->get_input_element_type(0) == element::f32)
                    {
                        auto op_annotations =
                            std::make_shared<ngraph::runtime::cpu::CPUOpAnnotations>();

--- a/src/ngraph/runtime/cpu/pass/cpu_layout.cpp
+++ b/src/ngraph/runtime/cpu/pass/cpu_layout.cpp
@@ -25,6 +25,7 @@
 #include "cpu_layout.hpp"
 #include "ngraph/descriptor/output.hpp"
 #include "ngraph/graph_util.hpp"
+#include "ngraph/log.hpp"
 #include "ngraph/ops/add.hpp"
 #include "ngraph/ops/avg_pool.hpp"
 #include "ngraph/ops/convolution.hpp"
@@ -75,7 +76,7 @@ shared_ptr<Node> runtime::cpu::pass::CPULayout::insert_input_conversions(
        }
        else
        {
-            new_args.push_back(node->get_input_op(index));
+            new_args.push_back(output.get_node());
        }
        index++;
    }
@@ -163,7 +164,7 @@ void runtime::cpu::pass::CPULayout::set_default_layouts(
        }
        else
        {
-            new_args.push_back(node->get_input_op(index));
+            new_args.push_back(output.get_node());
        }
        index++;
    }

--- a/src/ngraph/runtime/external_function.hpp
+++ b/src/ngraph/runtime/external_function.hpp
@@ -17,9 +17,6 @@
 #pragma once

 #include <memory>
-#include <typeindex>
-#include <typeinfo>
-#include <unordered_map>

 #include "ngraph/function.hpp"


--- a/src/ngraph/runtime/interpreter/int_call_frame.hpp
+++ b/src/ngraph/runtime/interpreter/int_call_frame.hpp
@@ -42,6 +42,7 @@
 #include "ngraph/ops/reverse.hpp"
 #include "ngraph/ops/select_and_scatter.hpp"
 #include "ngraph/ops/slice.hpp"
+#include "ngraph/ops/softmax.hpp"
 #include "ngraph/ops/sum.hpp"
 #include "ngraph/runtime/call_frame.hpp"
 #include "ngraph/runtime/host_tensor_view.hpp"
@@ -95,6 +96,7 @@
 #include "ngraph/runtime/kernel/sin.hpp"
 #include "ngraph/runtime/kernel/sinh.hpp"
 #include "ngraph/runtime/kernel/slice.hpp"
+#include "ngraph/runtime/kernel/softmax.hpp"
 #include "ngraph/runtime/kernel/sqrt.hpp"
 #include "ngraph/runtime/kernel/subtract.hpp"
 #include "ngraph/runtime/kernel/sum.hpp"
@@ -812,6 +814,14 @@ private:
                             slice->get_strides(),
                             out[0]->get_shape());
        }
+        else if (node_op == "Softmax")
+        {
+            const op::Softmax* softmax = static_cast<const op::Softmax*>(&node);
+            kernel::softmax<T>(reinterpret_cast<T*>(args[0]->get_data_ptr()),
+                               reinterpret_cast<T*>(out[0]->get_data_ptr()),
+                               out[0]->get_shape(),
+                               softmax->get_axes());
+        }
        else if (node_op == "Sqrt")
        {
            kernel::sqrt<T>(reinterpret_cast<T*>(args[0]->get_data_ptr()),

--- a/src/ngraph/runtime/interpreter/int_external_function.hpp
+++ b/src/ngraph/runtime/interpreter/int_external_function.hpp
@@ -16,16 +16,11 @@

 #pragma once

-#include <functional>
 #include <memory>
-#include <typeindex>
-#include <typeinfo>
-#include <unordered_map>

 #include "ngraph/function.hpp"
+#include "ngraph/runtime/call_frame.hpp"
 #include "ngraph/runtime/external_function.hpp"
-#include "ngraph/runtime/interpreter/int_backend.hpp"
-#include "ngraph/runtime/interpreter/int_call_frame.hpp"

 namespace ngraph
 {

--- a/src/ngraph/runtime/kernel/abs.hpp
+++ b/src/ngraph/runtime/kernel/abs.hpp
@@ -16,7 +16,7 @@

 #pragma once

-#include <cmath>
+#include <cstddef>

 namespace ngraph
 {

--- a/src/ngraph/runtime/kernel/acos.hpp
+++ b/src/ngraph/runtime/kernel/acos.hpp
@@ -17,6 +17,7 @@
 #pragma once

 #include <cmath>
+#include <cstddef>

 namespace ngraph
 {

--- a/src/ngraph/runtime/kernel/add.hpp
+++ b/src/ngraph/runtime/kernel/add.hpp
@@ -16,6 +16,8 @@

 #pragma once

+#include <cstddef>
+
 namespace ngraph
 {
    namespace runtime

--- a/src/ngraph/runtime/kernel/asin.hpp
+++ b/src/ngraph/runtime/kernel/asin.hpp
@@ -17,6 +17,7 @@
 #pragma once

 #include <cmath>
+#include <cstddef>

 namespace ngraph
 {

--- a/src/ngraph/runtime/kernel/atan.hpp
+++ b/src/ngraph/runtime/kernel/atan.hpp
@@ -17,6 +17,7 @@
 #pragma once

 #include <cmath>
+#include <cstddef>

 namespace ngraph
 {

--- a/src/ngraph/runtime/kernel/avg_pool.hpp
+++ b/src/ngraph/runtime/kernel/avg_pool.hpp
@@ -16,7 +16,6 @@

 #pragma once

-#include <algorithm>
 #include <cmath>
 #include <numeric>
 #include <vector>

--- a/src/ngraph/runtime/kernel/ceiling.hpp
+++ b/src/ngraph/runtime/kernel/ceiling.hpp
@@ -17,6 +17,7 @@
 #pragma once

 #include <cmath>
+#include <cstddef>

 namespace ngraph
 {

--- a/src/ngraph/runtime/kernel/constant.hpp
+++ b/src/ngraph/runtime/kernel/constant.hpp
@@ -16,6 +16,8 @@

 #pragma once

+#include <cstddef>
+
 namespace ngraph
 {
    namespace runtime

--- a/src/ngraph/runtime/kernel/convert.hpp
+++ b/src/ngraph/runtime/kernel/convert.hpp
@@ -16,6 +16,8 @@

 #pragma once

+#include <cstddef>
+
 namespace ngraph
 {
    namespace runtime

--- a/src/ngraph/runtime/kernel/convolution.hpp
+++ b/src/ngraph/runtime/kernel/convolution.hpp
@@ -119,10 +119,10 @@ namespace ngraph
                    }

                    AxisVector input_batch_transform_axis_order(2 + n_spatial_dimensions);
-                    size_t n = 0;
-                    std::generate(input_batch_transform_axis_order.begin(),
-                                  input_batch_transform_axis_order.end(),
-                                  [&n]() -> size_t { return n++; });
+                    for (size_t i = 0; i < input_batch_transform_axis_order.size(); i++)
+                    {
+                        input_batch_transform_axis_order[i] = i;
+                    }

                    CoordinateTransform input_batch_transform(
                        arg0_shape,

--- a/src/ngraph/runtime/kernel/copy.hpp
+++ b/src/ngraph/runtime/kernel/copy.hpp
@@ -16,6 +16,8 @@

 #pragma once

+#include <cstddef>
+
 namespace ngraph
 {
    namespace runtime

--- a/src/ngraph/runtime/kernel/cos.hpp
+++ b/src/ngraph/runtime/kernel/cos.hpp
@@ -17,6 +17,7 @@
 #pragma once

 #include <cmath>
+#include <cstddef>

 namespace ngraph
 {

--- a/src/ngraph/runtime/kernel/cosh.hpp
+++ b/src/ngraph/runtime/kernel/cosh.hpp
@@ -17,6 +17,7 @@
 #pragma once

 #include <cmath>
+#include <cstddef>

 namespace ngraph
 {

--- a/src/ngraph/runtime/kernel/divide.hpp
+++ b/src/ngraph/runtime/kernel/divide.hpp
@@ -16,6 +16,8 @@

 #pragma once

+#include <cstddef>
+#include <stdexcept>
 #include <type_traits>

 namespace ngraph

--- a/src/ngraph/runtime/kernel/equal.hpp
+++ b/src/ngraph/runtime/kernel/equal.hpp
@@ -19,6 +19,8 @@
 #pragma clang diagnostic push
 #pragma clang diagnostic ignored "-Wfloat-equal"

+#include <cstddef>
+
 namespace ngraph
 {
    namespace runtime

--- a/src/ngraph/runtime/kernel/exp.hpp
+++ b/src/ngraph/runtime/kernel/exp.hpp
@@ -17,6 +17,7 @@
 #pragma once

 #include <cmath>
+#include <cstddef>

 namespace ngraph
 {

--- a/src/ngraph/runtime/kernel/floor.hpp
+++ b/src/ngraph/runtime/kernel/floor.hpp
@@ -17,6 +17,7 @@
 #pragma once

 #include <cmath>
+#include <cstddef>

 namespace ngraph
 {

--- a/src/ngraph/runtime/kernel/greater.hpp
+++ b/src/ngraph/runtime/kernel/greater.hpp
@@ -16,6 +16,8 @@

 #pragma once

+#include <cstddef>
+
 namespace ngraph
 {
    namespace runtime

--- a/src/ngraph/runtime/kernel/greater_eq.hpp
+++ b/src/ngraph/runtime/kernel/greater_eq.hpp
@@ -16,6 +16,8 @@

 #pragma once

+#include <cstddef>
+
 namespace ngraph
 {
    namespace runtime

--- a/src/ngraph/runtime/kernel/less.hpp
+++ b/src/ngraph/runtime/kernel/less.hpp
@@ -16,6 +16,8 @@

 #pragma once

+#include <cstddef>
+
 namespace ngraph
 {
    namespace runtime

--- a/src/ngraph/runtime/kernel/less_eq.hpp
+++ b/src/ngraph/runtime/kernel/less_eq.hpp
@@ -16,6 +16,8 @@

 #pragma once

+#include <cstddef>
+
 namespace ngraph
 {
    namespace runtime

--- a/src/ngraph/runtime/kernel/log.hpp
+++ b/src/ngraph/runtime/kernel/log.hpp
@@ -17,6 +17,7 @@
 #pragma once

 #include <cmath>
+#include <cstddef>

 namespace ngraph
 {

--- a/src/ngraph/runtime/kernel/max_pool.hpp
+++ b/src/ngraph/runtime/kernel/max_pool.hpp
@@ -16,7 +16,6 @@

 #pragma once

-#include <algorithm>
 #include <cmath>
 #include <numeric>


--- a/src/ngraph/runtime/kernel/maximum.hpp
+++ b/src/ngraph/runtime/kernel/maximum.hpp
@@ -16,6 +16,8 @@

 #pragma once

+#include <cstddef>
+
 namespace ngraph
 {
    namespace runtime

--- a/src/ngraph/runtime/kernel/minimum.hpp
+++ b/src/ngraph/runtime/kernel/minimum.hpp
@@ -16,6 +16,8 @@

 #pragma once

+#include <cstddef>
+
 namespace ngraph
 {
    namespace runtime

--- a/src/ngraph/runtime/kernel/multiply.hpp
+++ b/src/ngraph/runtime/kernel/multiply.hpp
@@ -16,6 +16,8 @@

 #pragma once

+#include <cstddef>
+
 namespace ngraph
 {
    namespace runtime

--- a/src/ngraph/runtime/kernel/negate.hpp
+++ b/src/ngraph/runtime/kernel/negate.hpp
@@ -16,6 +16,8 @@

 #pragma once

+#include <cstddef>
+
 namespace ngraph
 {
    namespace runtime

--- a/src/ngraph/runtime/kernel/not.hpp
+++ b/src/ngraph/runtime/kernel/not.hpp
@@ -16,6 +16,8 @@

 #pragma once

+#include <cstddef>
+
 namespace ngraph
 {
    namespace runtime

--- a/src/ngraph/runtime/kernel/not_equal.hpp
+++ b/src/ngraph/runtime/kernel/not_equal.hpp
@@ -19,6 +19,8 @@
 #pragma clang diagnostic push
 #pragma clang diagnostic ignored "-Wfloat-equal"

+#include <cstddef>
+
 namespace ngraph
 {
    namespace runtime

--- a/src/ngraph/runtime/kernel/pad.hpp
+++ b/src/ngraph/runtime/kernel/pad.hpp
@@ -20,7 +20,6 @@

 #include "ngraph/axis_vector.hpp"
 #include "ngraph/coordinate_transform.hpp"
-#include "ngraph/util.hpp"

 namespace ngraph
 {

--- a/src/ngraph/runtime/kernel/power.hpp
+++ b/src/ngraph/runtime/kernel/power.hpp
--- a/src/ngraph/runtime/kernel/reduce.hpp
+++ b/src/ngraph/runtime/kernel/reduce.hpp
--- a/src/ngraph/runtime/kernel/reduce_window.hpp
+++ b/src/ngraph/runtime/kernel/reduce_window.hpp
--- a/src/ngraph/runtime/kernel/relu.hpp
+++ b/src/ngraph/runtime/kernel/relu.hpp
--- a/src/ngraph/runtime/kernel/select.hpp
+++ b/src/ngraph/runtime/kernel/select.hpp
--- a/src/ngraph/runtime/kernel/select_and_scatter.hpp
+++ b/src/ngraph/runtime/kernel/select_and_scatter.hpp
--- a/src/ngraph/runtime/kernel/sign.hpp
+++ b/src/ngraph/runtime/kernel/sign.hpp
--- a/src/ngraph/runtime/kernel/sin.hpp
+++ b/src/ngraph/runtime/kernel/sin.hpp
--- a/src/ngraph/runtime/kernel/sinh.hpp
+++ b/src/ngraph/runtime/kernel/sinh.hpp
--- a/src/ngraph/runtime/kernel/softmax.hpp
+++ b/src/ngraph/runtime/kernel/softmax.hpp
--- a/src/ngraph/runtime/kernel/sqrt.hpp
+++ b/src/ngraph/runtime/kernel/sqrt.hpp
--- a/src/ngraph/runtime/kernel/subtract.hpp
+++ b/src/ngraph/runtime/kernel/subtract.hpp
--- a/src/ngraph/runtime/kernel/tan.hpp
+++ b/src/ngraph/runtime/kernel/tan.hpp
--- a/src/ngraph/runtime/kernel/tanh.hpp
+++ b/src/ngraph/runtime/kernel/tanh.hpp
--- a/src/ngraph/runtime/manager.hpp
+++ b/src/ngraph/runtime/manager.hpp
--- a/src/ngraph/runtime/tensor_view.hpp
+++ b/src/ngraph/runtime/tensor_view.hpp
--- a/src/ngraph/serializer.cpp
+++ b/src/ngraph/serializer.cpp
--- a/src/ngraph/serializer.hpp
+++ b/src/ngraph/serializer.hpp
--- a/src/ngraph/shape.hpp
+++ b/src/ngraph/shape.hpp
--- a/src/ngraph/strides.hpp
+++ b/src/ngraph/strides.hpp
--- a/src/ngraph/types/element_type.cpp
+++ b/src/ngraph/types/element_type.cpp
--- a/src/ngraph/types/element_type.hpp
+++ b/src/ngraph/types/element_type.hpp
--- a/src/ngraph/types/type.cpp
+++ b/src/ngraph/types/type.cpp
--- a/src/ngraph/util.hpp
+++ b/src/ngraph/util.hpp
--- a/src/ngraph/uuid.hpp
+++ b/src/ngraph/uuid.hpp
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
--- a/test/autodiff.in.cpp
+++ b/test/autodiff.in.cpp
--- a/test/backend_test.in.cpp
+++ b/test/backend_test.in.cpp
--- a/test/cpu_fusion.cpp
+++ b/test/cpu_fusion.cpp
--- a/test/includes.cpp
+++ b/test/includes.cpp
--- a/test/util.cpp
+++ b/test/util.cpp
--- a/test/util/benchmark.cpp
+++ b/test/util/benchmark.cpp
--- a/test/util/benchmark.hpp
+++ b/test/util/benchmark.hpp